summaryrefslogtreecommitdiffstats
path: root/media/libstagefright/codecs/m4v_h263/enc/src/fastcodemb.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'media/libstagefright/codecs/m4v_h263/enc/src/fastcodemb.cpp')
-rw-r--r--media/libstagefright/codecs/m4v_h263/enc/src/fastcodemb.cpp622
1 files changed, 622 insertions, 0 deletions
diff --git a/media/libstagefright/codecs/m4v_h263/enc/src/fastcodemb.cpp b/media/libstagefright/codecs/m4v_h263/enc/src/fastcodemb.cpp
new file mode 100644
index 0000000..6fd41c3
--- /dev/null
+++ b/media/libstagefright/codecs/m4v_h263/enc/src/fastcodemb.cpp
@@ -0,0 +1,622 @@
+/* ------------------------------------------------------------------
+ * Copyright (C) 1998-2009 PacketVideo
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ * express or implied.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ * -------------------------------------------------------------------
+ */
+#include "mp4def.h"
+#include "mp4lib_int.h"
+#include "mp4enc_lib.h"
+#include "dct.h"
+#include "m4venc_oscl.h"
+
+/* ======================================================================== */
+/* Function : CodeMB_H263( ) */
+/* Date : 8/15/2001 */
+/* Purpose : Code the six 8x8 blocks of one macroblock with H.263 */
+/* quantization. For each block a DCT size (none, DC-only, */
+/* 2x2, 4x4 or 8x8) is chosen from the block SAD, followed by */
+/* quant/dequant and IDCT + motion compensation. */
+/* Modified from FastCodeMB() */
+/* Input : */
+/* video Video encoder data structure */
+/* function Not used by this routine (OSCL_UNUSED_ARG) */
+/* QP Packed value: the low 5 bits are the quantizer for this */
+/* macroblock, the remaining upper bits (QP >> 5) are the */
+/* offset from the plane origin to the current macroblock. */
+/* Output : */
+/* ncoefblck[k] Set per block to 0, 1, 6, 26 or 64 according to the DCT */
+/* size chosen (kept for speedup in VlcEncode) */
+/* video->outputMB Quantized DCT coefficients. */
+/* video->headerInfo.CBP[mbnum] Coded block pattern; block k maps to */
+/* bit (5 - k). */
+/* currVop->yChan,uChan,vChan Reconstructed pixels */
+/* */
+/* Return : PV_STATUS (always PV_SUCCESS) */
+/* Modified : */
+/* 2/26/01
+ -modified threshold based on correlation coeff 0.75 only for mode H.263
+ -ncoefblck[] as input, to keep position of last non-zero coeff*/
+/* 8/10/01
+ -modified threshold based on correlation coeff 0.5
+ -used column threshold to speedup column DCT.
+ -used bitmap zigzag to speedup RunLevel(). */
+/* ======================================================================== */
+
+PV_STATUS CodeMB_H263(VideoEncData *video, approxDCT *function, Int QP, Int ncoefblck[])
+{
+ Int sad, k, CBP, mbnum = video->mbnum;
+ Short *output, *dataBlock;
+ UChar Mode = video->headerInfo.Mode[mbnum];
+ UChar *bitmapcol, *bitmaprow = video->bitmaprow;
+ UInt *bitmapzz ;
+ UChar shortHeader = video->vol[video->currLayer]->shortVideoHeader;
+ Int dc_scaler = 8;
+ Int intra = (Mode == MODE_INTRA || Mode == MODE_INTRA_Q);
+ struct QPstruct QuantParam;
+ Int dctMode, DctTh1;
+ Int ColTh;
+ /* Quant/dequant and DCT entry points; bound below to the intra or inter variants. */
+ Int(*BlockQuantDequantH263)(Short *, Short *, struct QPstruct *,
+ UChar[], UChar *, UInt *, Int, Int, Int, UChar);
+ Int(*BlockQuantDequantH263DC)(Short *, Short *, struct QPstruct *,
+ UChar *, UInt *, Int, UChar);
+ void (*BlockDCT1x1)(Short *, UChar *, UChar *, Int);
+ void (*BlockDCT2x2)(Short *, UChar *, UChar *, Int);
+ void (*BlockDCT4x4)(Short *, UChar *, UChar *, Int);
+ void (*BlockDCT8x8)(Short *, UChar *, UChar *, Int);
+
+ /* motion comp. related var. */
+ Vop *currVop = video->currVop;
+ VideoEncFrameIO *inputFrame = video->input;
+ Int ind_x = video->outputMB->mb_x;
+ Int ind_y = video->outputMB->mb_y;
+ Int lx = currVop->pitch;
+ Int width = currVop->width;
+ UChar *rec, *input, *pred;
+ Int offset = QP >> 5; /* QP is combined offset and QP */
+ /* NOTE(review): consistent with offset = 16*ind_y*pitch + 16*ind_x, which */
+ /* gives 8*ind_y*(pitch/2) + 8*ind_x for chroma -- confirm against the caller. */
+ Int offsetc = (offset >> 2) + (ind_x << 2); /* offset for chrom */
+ /*****************************/
+
+ OSCL_UNUSED_ARG(function);
+
+ output = video->outputMB->block[0];
+ CBP = 0;
+ QP = QP & 0x1F; /* from here on QP holds only the 5-bit quantizer */
+// M4VENC_MEMSET(output,0,(sizeof(Short)<<6)*6); /* reset quantized coeff. to zero , 7/24/01*/
+
+ /* Precompute the QP-derived constants handed to the quantizer. */
+ QuantParam.QPx2 = QP << 1;
+ QuantParam.QP = QP;
+ QuantParam.QPdiv2 = QP >> 1;
+ QuantParam.QPx2plus = QuantParam.QPx2 + QuantParam.QPdiv2;
+ QuantParam.Addition = QP - 1 + (QP & 0x1); /* QP when QP is odd, QP - 1 when even */
+
+ if (intra)
+ {
+ BlockDCT1x1 = &Block1x1DCTIntra;
+ BlockDCT2x2 = &Block2x2DCT_AANIntra;
+ BlockDCT4x4 = &Block4x4DCT_AANIntra;
+ BlockDCT8x8 = &BlockDCT_AANIntra;
+ BlockQuantDequantH263 = &BlockQuantDequantH263Intra;
+ BlockQuantDequantH263DC = &BlockQuantDequantH263DCIntra;
+ if (shortHeader)
+ {
+ dc_scaler = 8; /* fixed DC scaler in short-header mode */
+ }
+ else
+ {
+ dc_scaler = cal_dc_scalerENC(QP, 1); /* luminance blocks */
+ }
+ DctTh1 = (Int)(dc_scaler * 3);//*1.829
+ ColTh = ColThIntra[QP];
+ }
+ else
+ {
+ BlockDCT1x1 = &Block1x1DCTwSub;
+ BlockDCT2x2 = &Block2x2DCT_AANwSub;
+ BlockDCT4x4 = &Block4x4DCT_AANwSub;
+ BlockDCT8x8 = &BlockDCT_AANwSub;
+
+ BlockQuantDequantH263 = &BlockQuantDequantH263Inter;
+ BlockQuantDequantH263DC = &BlockQuantDequantH263DCInter;
+ ColTh = ColThInter[QP];
+ DctTh1 = (Int)(16 * QP); //9*QP;
+ }
+
+ rec = currVop->yChan + offset;
+ input = inputFrame->yChan + offset;
+ /* NOTE(review): ind_y << 9 == 16 rows * 32 bytes per macroblock row, i.e. this */
+ /* presumably undoes a 32-byte-per-row padding that the input frame lacks -- confirm. */
+ if (lx != width) input -= (ind_y << 9); /* non-padded offset */
+
+ dataBlock = video->dataBlock;
+ pred = video->predictedMB;
+
+ /* k = 0..3: the four luma blocks; k = 4: U block; k = 5: V block. */
+ for (k = 0; k < 6; k++)
+ {
+ CBP <<= 1; /* make room for this block's coded bit */
+ bitmapcol = video->bitmapcol[k];
+ bitmapzz = video->bitmapzz[k]; /* 7/30/01 */
+ if (k < 4)
+ {
+ /* Luma SAD comes from the motion data; entry k+1 is used for block k. */
+ sad = video->mot[mbnum][k+1].sad;
+ if (k&1)
+ {
+ /* odd block: step 8 pixels to the right */
+ rec += 8;
+ input += 8;
+ }
+ else if (k == 2)
+ {
+ /* block 2: down 8 rows and back 8 pixels; dctMode is only scratch here */
+ dctMode = ((width << 3) - 8);
+ input += dctMode;
+ dctMode = ((lx << 3) - 8);
+ rec += dctMode;
+ }
+ }
+ else
+ {
+ if (k == 4)
+ {
+ rec = currVop->uChan + offsetc;
+ input = inputFrame->uChan + offsetc;
+ if (lx != width) input -= (ind_y << 7);
+ /* Switch to chroma strides; they stay halved for k == 5 as well. */
+ lx >>= 1;
+ width >>= 1;
+ if (intra)
+ {
+ sad = getBlockSum(input, width);
+ if (shortHeader)
+ dc_scaler = 8;
+ else
+ {
+ dc_scaler = cal_dc_scalerENC(QP, 2); /* chrominance blocks */
+ }
+ DctTh1 = (Int)(dc_scaler * 3);//*1.829
+ }
+ else
+ sad = Sad8x8(input, pred, width);
+ }
+ else
+ {
+ /* k == 5: lx and width were already halved at k == 4 */
+ rec = currVop->vChan + offsetc;
+ input = inputFrame->vChan + offsetc;
+ if (lx != width) input -= (ind_y << 7);
+ if (intra)
+ {
+ sad = getBlockSum(input, width);
+ }
+ else
+ sad = Sad8x8(input, pred, width);
+ }
+ }
+
+ /* Pick the cheapest transform the SAD allows; thresholds grow with QP. */
+ if (sad < DctTh1 && !(shortHeader && intra)) /* all-zero */
+ { /* For shortHeader intra block, DC value cannot be zero */
+ dctMode = 0;
+ CBP |= 0; /* no-op: this block's CBP bit stays 0 */
+ ncoefblck[k] = 0;
+ }
+ else if (sad < 18*QP/*(QP<<4)*/) /* DC-only */
+ {
+ dctMode = 1;
+ BlockDCT1x1(dataBlock, input, pred, width);
+
+ CBP |= (*BlockQuantDequantH263DC)(dataBlock, output, &QuantParam,
+ bitmaprow + k, bitmapzz, dc_scaler, shortHeader);
+ ncoefblck[k] = 1;
+ }
+ else
+ {
+
+ /* Column threshold is stored in slot 64, one past the 64 coefficients */
+ /* (presumably read by the DCT routines -- confirm in dct.h). */
+ dataBlock[64] = ColTh;
+
+ if (sad < 22*QP/*(QP<<4)+(QP<<1)*/) /* 2x2 DCT */
+ {
+ dctMode = 2;
+ BlockDCT2x2(dataBlock, input, pred, width);
+ ncoefblck[k] = 6;
+ }
+ else if (sad < (QP << 5)) /* 4x4 DCT */
+ {
+ dctMode = 4;
+ BlockDCT4x4(dataBlock, input, pred, width);
+ ncoefblck[k] = 26;
+ }
+ else /* Full-DCT */
+ {
+ dctMode = 8;
+ BlockDCT8x8(dataBlock, input, pred, width);
+ ncoefblck[k] = 64;
+ }
+
+ CBP |= (*BlockQuantDequantH263)(dataBlock, output, &QuantParam,
+ bitmapcol, bitmaprow + k, bitmapzz, dctMode, k, dc_scaler, shortHeader);
+ }
+ /* The last argument packs the reconstruction pitch (lx << 1) with the intra flag in bit 0. */
+ BlockIDCTMotionComp(dataBlock, bitmapcol, bitmaprow[k], dctMode, rec, pred, (lx << 1) | intra);
+ output += 64; /* next block's 64 quantized coefficients */
+ /* pred advances 8 after even blocks and 120 after odd ones: 8 + 120 = 128, */
+ /* i.e. 8 rows of 16 bytes (Sad8x8 steps the prediction 16 bytes per row). */
+ if (!(k&1))
+ {
+ pred += 8;
+ }
+ else
+ {
+ pred += 120;
+ }
+ }
+
+ video->headerInfo.CBP[mbnum] = CBP; /* 5/18/2001 */
+ return PV_SUCCESS;
+}
+
+#ifndef NO_MPEG_QUANT
+/* ======================================================================== */
+/* Function : CodeMB_MPEG( ) */
+/* Date : 8/15/2001 */
+/* Purpose : Code the six 8x8 blocks of one macroblock with MPEG */
+/* quantization (quantization matrices). For each block a DCT */
+/* size is chosen from the block SAD, followed by quant/dequant */
+/* and IDCT + motion compensation. Modified from FastCodeMB() */
+/* Input : */
+/* video Video encoder data structure */
+/* function Not used by this routine (OSCL_UNUSED_ARG) */
+/* QP Packed value: the low 5 bits are the quantizer for this */
+/* macroblock, the remaining upper bits (QP >> 5) are the */
+/* offset from the plane origin to the current macroblock. */
+/* Output : */
+/* ncoefblck[k] Set per block to 0, 1, 6, 26 or 64 according to the DCT */
+/* size chosen (kept for speedup in VlcEncode) */
+/* video->outputMB Quantized DCT coefficients. */
+/* video->headerInfo.CBP[mbnum] Coded block pattern; block k maps to */
+/* bit (5 - k). */
+/* currVop->yChan,uChan,vChan Reconstructed pixels */
+/* */
+/* Return : PV_STATUS (always PV_SUCCESS) */
+/* Modified : */
+/* 2/26/01
+ -modified threshold based on correlation coeff 0.75 only for mode H.263
+ -ncoefblck[] as input, keep position of last non-zero coeff*/
+/* 8/10/01
+ -modified threshold based on correlation coeff 0.5
+ -used column threshold to speedup column DCT.
+ -used bitmap zigzag to speedup RunLevel(). */
+/* ======================================================================== */
+
+PV_STATUS CodeMB_MPEG(VideoEncData *video, approxDCT *function, Int QP, Int ncoefblck[])
+{
+ Int sad, k, CBP, mbnum = video->mbnum;
+ Short *output, *dataBlock;
+ UChar Mode = video->headerInfo.Mode[mbnum];
+ UChar *bitmapcol, *bitmaprow = video->bitmaprow;
+ UInt *bitmapzz ;
+ Int dc_scaler = 8;
+ Vol *currVol = video->vol[video->currLayer];
+ Int intra = (Mode == MODE_INTRA || Mode == MODE_INTRA_Q);
+ Int *qmat;
+ /* DctTh1..DctTh4: SAD limits for all-zero, DC-only, 2x2 and 4x4 respectively. */
+ Int dctMode, DctTh1, DctTh2, DctTh3, DctTh4;
+ Int ColTh;
+
+ /* Quant/dequant and DCT entry points; bound below to the intra or inter variants. */
+ Int(*BlockQuantDequantMPEG)(Short *, Short *, Int, Int *,
+ UChar [], UChar *, UInt *, Int, Int, Int);
+ Int(*BlockQuantDequantMPEGDC)(Short *, Short *, Int, Int *,
+ UChar [], UChar *, UInt *, Int);
+
+ void (*BlockDCT1x1)(Short *, UChar *, UChar *, Int);
+ void (*BlockDCT2x2)(Short *, UChar *, UChar *, Int);
+ void (*BlockDCT4x4)(Short *, UChar *, UChar *, Int);
+ void (*BlockDCT8x8)(Short *, UChar *, UChar *, Int);
+
+ /* motion comp. related var. */
+ Vop *currVop = video->currVop;
+ VideoEncFrameIO *inputFrame = video->input;
+ Int ind_x = video->outputMB->mb_x;
+ Int ind_y = video->outputMB->mb_y;
+ Int lx = currVop->pitch;
+ Int width = currVop->width;
+ UChar *rec, *input, *pred;
+ Int offset = QP >> 5; /* upper bits of QP carry the macroblock offset */
+ Int offsetc = (offset >> 2) + (ind_x << 2); /* offset for chrom */
+ /*****************************/
+
+ OSCL_UNUSED_ARG(function);
+
+ output = video->outputMB->block[0];
+ CBP = 0;
+ QP = QP & 0x1F; /* from here on QP holds only the 5-bit quantizer */
+// M4VENC_MEMSET(output,0,(sizeof(Short)<<6)*6); /* reset quantized coeff. to zero , 7/24/01*/
+
+ if (intra)
+ {
+ BlockDCT1x1 = &Block1x1DCTIntra;
+ BlockDCT2x2 = &Block2x2DCT_AANIntra;
+ BlockDCT4x4 = &Block4x4DCT_AANIntra;
+ BlockDCT8x8 = &BlockDCT_AANIntra;
+
+ BlockQuantDequantMPEG = &BlockQuantDequantMPEGIntra;
+ BlockQuantDequantMPEGDC = &BlockQuantDequantMPEGDCIntra;
+ dc_scaler = cal_dc_scalerENC(QP, 1); /* luminance blocks */
+ qmat = currVol->iqmat;
+ /* Thresholds scale with QP and with entries 1, 2 and 32 of the intra matrix. */
+ DctTh1 = (Int)(3 * dc_scaler);//2*dc_scaler);
+ DctTh2 = (Int)((1.25 * QP - 1) * qmat[1] * 0.45);//0.567);//0.567);
+ DctTh3 = (Int)((1.25 * QP - 1) * qmat[2] * 0.55);//1.162); /* 8/2/2001 */
+ DctTh4 = (Int)((1.25 * QP - 1) * qmat[32] * 0.8);//1.7583);//0.7942);
+ ColTh = ColThIntra[QP];
+ }
+ else
+ {
+ BlockDCT1x1 = &Block1x1DCTwSub;
+ BlockDCT2x2 = &Block2x2DCT_AANwSub;
+ BlockDCT4x4 = &Block4x4DCT_AANwSub;
+ BlockDCT8x8 = &BlockDCT_AANwSub;
+
+ BlockQuantDequantMPEG = &BlockQuantDequantMPEGInter;
+ BlockQuantDequantMPEGDC = &BlockQuantDequantMPEGDCInter;
+ qmat = currVol->niqmat;
+ /* Thresholds scale with QP and with entries 0, 1, 2 and 32 of the inter matrix. */
+ DctTh1 = (Int)(((QP << 1) - 0.5) * qmat[0] * 0.4);//0.2286);//0.3062);
+ DctTh2 = (Int)(((QP << 1) - 0.5) * qmat[1] * 0.45);//0.567);//0.4);
+ DctTh3 = (Int)(((QP << 1) - 0.5) * qmat[2] * 0.55);//1.162); /* 8/2/2001 */
+ DctTh4 = (Int)(((QP << 1) - 0.5) * qmat[32] * 0.8);//1.7583);//0.7942);
+ ColTh = ColThInter[QP];
+ }// get qmat, DctTh1, DctTh2, DctTh3
+
+ rec = currVop->yChan + offset;
+ input = inputFrame->yChan + offset;
+ /* NOTE(review): ind_y << 9 == 16 rows * 32 bytes per macroblock row, i.e. this */
+ /* presumably undoes a 32-byte-per-row padding that the input frame lacks -- confirm. */
+ if (lx != width) input -= (ind_y << 9); /* non-padded offset */
+
+ dataBlock = video->dataBlock;
+ pred = video->predictedMB;
+
+ /* k = 0..3: the four luma blocks; k = 4: U block; k = 5: V block. */
+ for (k = 0; k < 6; k++)
+ {
+ CBP <<= 1; /* make room for this block's coded bit */
+ bitmapcol = video->bitmapcol[k];
+ bitmapzz = video->bitmapzz[k]; /* 8/2/01 */
+ if (k < 4)
+ {//Y block
+ /* Luma SAD comes from the motion data; entry k+1 is used for block k. */
+ sad = video->mot[mbnum][k+1].sad;
+ if (k&1)
+ {
+ /* odd block: step 8 pixels to the right */
+ rec += 8;
+ input += 8;
+ }
+ else if (k == 2)
+ {
+ /* block 2: down 8 rows and back 8 pixels; dctMode is only scratch here */
+ dctMode = ((width << 3) - 8);
+ input += dctMode;
+ dctMode = ((lx << 3) - 8);
+ rec += dctMode;
+ }
+ }
+ else
+ {// U, V block
+ if (k == 4)
+ {
+ rec = currVop->uChan + offsetc;
+ input = inputFrame->uChan + offsetc;
+ if (lx != width) input -= (ind_y << 7);
+ /* Switch to chroma strides; they stay halved for k == 5 as well. */
+ lx >>= 1;
+ width >>= 1;
+ if (intra)
+ {
+ /* Only DctTh1 is recomputed for chroma; DctTh2..DctTh4 keep their earlier values. */
+ dc_scaler = cal_dc_scalerENC(QP, 2); /* chrominance blocks */
+ DctTh1 = dc_scaler * 3;
+ sad = getBlockSum(input, width);
+ }
+ else
+ sad = Sad8x8(input, pred, width);
+ }
+ else
+ {
+ /* k == 5: lx and width were already halved at k == 4 */
+ rec = currVop->vChan + offsetc;
+ input = inputFrame->vChan + offsetc;
+ if (lx != width) input -= (ind_y << 7);
+ if (intra)
+ sad = getBlockSum(input, width);
+ else
+ sad = Sad8x8(input, pred, width);
+ }
+ }
+
+ /* Pick the cheapest transform the SAD allows. */
+ if (sad < DctTh1) /* all-zero */
+ {
+ dctMode = 0;
+ CBP |= 0; /* no-op: this block's CBP bit stays 0 */
+ ncoefblck[k] = 0;
+ }
+ else if (sad < DctTh2) /* DC-only */
+ {
+ dctMode = 1;
+ BlockDCT1x1(dataBlock, input, pred, width);
+
+ CBP |= (*BlockQuantDequantMPEGDC)(dataBlock, output, QP, qmat,
+ bitmapcol, bitmaprow + k, bitmapzz, dc_scaler);
+ ncoefblck[k] = 1;
+ }
+ else
+ {
+ /* Column threshold is stored in slot 64, one past the 64 coefficients */
+ /* (presumably read by the DCT routines -- confirm in dct.h). */
+ dataBlock[64] = ColTh;
+
+ if (sad < DctTh3) /* 2x2-DCT */
+ {
+ dctMode = 2;
+ BlockDCT2x2(dataBlock, input, pred, width);
+ ncoefblck[k] = 6;
+ }
+ else if (sad < DctTh4) /* 4x4 DCT */
+ {
+ dctMode = 4;
+ BlockDCT4x4(dataBlock, input, pred, width);
+ ncoefblck[k] = 26;
+ }
+ else /* full-DCT */
+ {
+ dctMode = 8;
+ BlockDCT8x8(dataBlock, input, pred, width);
+ ncoefblck[k] = 64;
+ }
+
+ CBP |= (*BlockQuantDequantMPEG)(dataBlock, output, QP, qmat,
+ bitmapcol, bitmaprow + k, bitmapzz, dctMode, k, dc_scaler); //
+ }
+ /* Unlike CodeMB_H263, the inverse transform is always invoked with mode 8, */
+ /* whatever forward DCT size was chosen above. */
+ dctMode = 8; /* for mismatch handle */
+ /* The last argument packs the reconstruction pitch (lx << 1) with the intra flag in bit 0. */
+ BlockIDCTMotionComp(dataBlock, bitmapcol, bitmaprow[k], dctMode, rec, pred, (lx << 1) | (intra));
+
+ output += 64; /* next block's 64 quantized coefficients */
+ /* pred advances 8 after even blocks and 120 after odd ones: 8 + 120 = 128, */
+ /* i.e. 8 rows of 16 bytes (Sad8x8 steps the prediction 16 bytes per row). */
+ if (!(k&1))
+ {
+ pred += 8;
+ }
+ else
+ {
+ pred += 120;
+ }
+ }
+
+ video->headerInfo.CBP[mbnum] = CBP; /* 5/18/2001 */
+ return PV_SUCCESS;
+}
+
+#endif
+
+/* ======================================================================== */
+/* Function : getBlockSAV( ) */
+/* Date : 8/10/2000 */
+/* Purpose : Sum of absolute values (SAV) of the 64 entries of a block */
+/* Input : block[64] holds the data of one 8x8 block (read only) */
+/* Return : sum over all 64 entries of |block[i]| */
+/* Modified : */
+/* ======================================================================== */
+/* can be written in MMX or SSE, 2/22/2001 */
+Int getBlockSAV(Short block[])
+{
+    Int total = 0;
+    Int idx;
+
+    for (idx = 0; idx < 64; idx++)
+    {
+        Int coeff = block[idx];
+
+        /* accumulate the magnitude of each coefficient */
+        total += (coeff > 0) ? coeff : -coeff;
+    }
+
+    return total;
+}
+
+/* ======================================================================== */
+/* Function : Sad8x8( ) */
+/* Date : 8/10/2000 */
+/* Purpose : Sum of absolute differences (SAD) between an 8x8 block of */
+/* the current frame and an 8x8 prediction block. */
+/* Input : */
+/* cur first pixel of the current block; rows are `width` bytes apart */
+/* prev first pixel of the prediction; rows are 4 Int words apart */
+/* (16 bytes with a 32-bit Int) */
+/* width row stride of `cur` in bytes */
+/* Return : SAD over the 64 pixel pairs (at most 64 * 255) */
+/* Notes : Pixels are read four at a time through Int pointers, so both */
+/* pointers and `width` are assumed to be suitably aligned for */
+/* Int access and Int is assumed to be 32 bits wide. */
+/* The per-word absolute-difference step is kept as originally */
+/* written; it relies on two's-complement wraparound and on */
+/* arithmetic right shift of negative Int values. */
+/* Modified : */
+/* 8/15/01, - do 4 pixel at a time assuming 32 bit register */
+/* - loop end test compares pointers directly instead of casting */
+/* them to UInt, which truncated addresses on 64-bit targets */
+/* - packed accumulators are unsigned so their wraparound is */
+/* well defined */
+/* ======================================================================== */
+Int Sad8x8(UChar *cur, UChar *prev, Int width)
+{
+    UChar *end = cur + (width << 3);    /* one past the 8th row of the block */
+    Int *curInt = (Int*) cur;
+    Int *prevInt = (Int*) prev;
+    Int cur1, cur2, prev1, prev2;
+    UInt mask = ~(0xFF00);              /* (mask << 8) keeps bytes 1 and 3 of a word */
+    UInt sgn_msk = 0x80808080;
+    UInt sum2 = 0, sum4 = 0;            /* packed sums: bytes 1/3 only, and all bytes */
+    UInt total;
+    Int tmp;
+
+    do
+    {
+        cur1 = curInt[1];               /* load cur[4..7] */
+        cur2 = curInt[0];               /* load cur[0..3] */
+        curInt += (width >> 2);         /* next row of the current frame */
+        prev1 = prevInt[1];
+        prev2 = prevInt[0];
+        prevInt += 4;                   /* next row of the prediction */
+
+        /* byte-wise |prev - cur| for pixels 0..3 */
+        tmp = prev2 ^ cur2;
+        cur2 = prev2 - cur2;
+        tmp = tmp ^ cur2;                       /* (^)^(-) last bit is one if carry */
+        tmp = sgn_msk & ((UInt)tmp >> 1);       /* check the sign of each byte */
+        if (cur2 < 0) tmp = tmp | 0x80000000;   /* correct sign of first byte */
+        tmp = (tmp << 8) - tmp;                 /* carry borrowed bytes are marked with 0x1FE */
+        cur2 = cur2 + (tmp >> 7);               /* negative bytes is added with 0xFF, -1 */
+        cur2 = cur2 ^(tmp >> 7);                /* take absolute by inverting bits (EOR) */
+
+        /* byte-wise |prev - cur| for pixels 4..7 */
+        tmp = prev1 ^ cur1;
+        cur1 = prev1 - cur1;
+        tmp = tmp ^ cur1;                       /* (^)^(-) last bit is one if carry */
+        tmp = sgn_msk & ((UInt)tmp >> 1);       /* check the sign of each byte */
+        if (cur1 < 0) tmp = tmp | 0x80000000;   /* correct sign of first byte */
+        tmp = (tmp << 8) - tmp;                 /* carry borrowed bytes are marked with 0x1FE */
+        cur1 = cur1 + (tmp >> 7);               /* negative bytes is added with 0xFF, -1 */
+        cur1 = cur1 ^(tmp >> 7);                /* take absolute by inverting bits (EOR) */
+
+        /* accumulate the packed absolute differences */
+        sum4 = sum4 + cur1;
+        cur1 = cur1 & (mask << 8);              /* keep bytes 1 and 3 */
+        sum2 = sum2 + ((UInt)cur1 >> 8);
+        sum4 = sum4 + cur2;
+        cur2 = cur2 & (mask << 8);              /* keep bytes 1 and 3 */
+        sum2 = sum2 + ((UInt)cur2 >> 8);
+    }
+    while ((UChar *)curInt < end);      /* plain pointer compare: no address truncation */
+
+    total = sum4 - (sum2 << 8);         /* 16-bit lanes now hold the sums of bytes 0 and 2 */
+    total = total + sum2;               /* add the sums of bytes 1 and 3 to each lane */
+    total = total + (total << 16);      /* add lower lane into upper lane */
+    return (Int)(total >> 16);          /* take upper 16 bit */
+}
+
+/* ======================================================================== */
+/* Function : getBlockSum( ) */
+/* Date : 8/10/2000 */
+/* Purpose : Find summation of the 64 pixel values within an 8x8 block. */
+/* Input : */
+/* cur first pixel of the block; rows are `width` bytes apart */
+/* width row stride of the frame in bytes */
+/* Return : sum of the 64 pixels (at most 64 * 255) */
+/* Notes : Pixels are read four at a time through an Int pointer, so */
+/* `cur` and `width` are assumed to be suitably aligned for Int */
+/* access and Int is assumed to be 32 bits wide. */
+/* Modified : */
+/* 8/15/01, - SIMD 4 pixels at a time */
+/* - loop end test compares pointers directly instead of casting */
+/* them to UInt, which truncated addresses on 64-bit targets */
+/* - packed sums use unsigned arithmetic so wraparound and */
+/* shifts are well defined; results are unchanged */
+/* ======================================================================== */
+
+Int getBlockSum(UChar *cur, Int width)
+{
+    UChar *end = cur + (width << 3);    /* one past the 8th row of the block */
+    Int *curInt = (Int*)cur;
+    UInt mask = ~(0xFF00);              /* (mask << 8) keeps bytes 1 and 3 of a word */
+    UInt sum4 = 0, sum2 = 0;            /* packed sums: all bytes, and bytes 1/3 only */
+    UInt load1, load2, total;
+
+    do
+    {
+        load1 = (UInt)curInt[1];        /* pixels 4..7 of this row */
+        load2 = (UInt)curInt[0];        /* pixels 0..3 of this row */
+        curInt += (width >> 2);         /* next row */
+        sum4 += load1;
+        sum2 += (load1 & (mask << 8)) >> 8;     /* bytes 1 and 3, as two 16-bit lanes */
+        sum4 += load2;
+        sum2 += (load2 & (mask << 8)) >> 8;     /* bytes 1 and 3, as two 16-bit lanes */
+    }
+    while ((UChar *)curInt < end);      /* plain pointer compare: no address truncation */
+
+    total = sum4 - (sum2 << 8);         /* 16-bit lanes now hold the sums of bytes 0 and 2 */
+    total = total + sum2;               /* add the sums of bytes 1 and 3 to each lane */
+    total = total + (total << 16);      /* add lower lane into upper lane */
+
+    return (Int)(total >> 16);          /* take upper 16 bit */
+}
+