Diffstat (limited to 'media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p2/src/omxVCM4P2_MCReconBlock_s.s')
-rw-r--r-- | media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p2/src/omxVCM4P2_MCReconBlock_s.s | 713 |
1 file changed, 713 insertions, 0 deletions
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p2/src/omxVCM4P2_MCReconBlock_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p2/src/omxVCM4P2_MCReconBlock_s.s
new file mode 100644
index 0000000..20965bf
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p2/src/omxVCM4P2_MCReconBlock_s.s
@@ -0,0 +1,713 @@
+;//
+;//
+;// File Name:   omxVCM4P2_MCReconBlock_s.s
+;// OpenMAX DL:  v1.0.2
+;// Revision:    9641
+;// Date:        Thursday, February 7, 2008
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+;// Description:
+;//
+;//
+
+;// Include standard headers
+        INCLUDE omxtypes_s.h
+        INCLUDE armCOMM_s.h
+
+;// Import symbols required from other files
+
+        M_VARIANTS ARM1136JS
+
+;// ***************************************************************************
+;// ARM1136JS implementation
+;// ***************************************************************************
+        IF ARM1136JS
+
+;// ***************************************************************************
+;// MACRO DEFINITIONS
+;// ***************************************************************************
+    ;// Description:
+    ;//
+    ;// dest[j] = (x[j] + y[j] + round) >> 1,   j = 0..3
+    ;//
+    ;// Similar to the UHADD8 instruction, but with a rounding value of 1
+    ;// added to each sum before dividing by two, if round is 1
+    ;//
+    ;// Syntax:
+    ;// M_UHADD8R $dest, $x, $y, $round, $mask
+    ;//
+    ;// Inputs:
+    ;// $x      four packed bytes, x[3] : x[2] : x[1] : x[0]
+    ;// $y      four packed bytes, y[3] : y[2] : y[1] : y[0]
+    ;// $round  0 if no rounding to be added, 1 if rounding to be done
+    ;// $mask   some register set to 0x80808080
+    ;//
+    ;// Outputs:
+    ;// $dest   four packed bytes, z[3] : z[2] : z[1] : z[0]
+
+        MACRO
+        M_UHADD8R $dest, $x, $y, $round, $mask
+        IF $round = 1
+            IF $dest /= $y
+                MVN $dest, $x
+                UHSUB8 $dest, $y, $dest
+                EOR $dest, $dest, $mask
+            ELSE
+                MVN $dest, $y
+                UHSUB8 $dest, $x, $dest
+                EOR $dest, $dest, $mask
+            ENDIF
+        ELSE
+            UHADD8 $dest, $x, $y
+        ENDIF
+        MEND
+;// ***************************************************************************
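+    ;// A rough C model of the $round = 1 path (one byte lane; an
+    ;// illustrative sketch only, assuming arithmetic right shift of a
+    ;// negative int). MVN/UHSUB8 exploit the identity
+    ;// y - (~x) = x + y + 1 - 256: UHSUB8 keeps bits [8:1] of the 9-bit
+    ;// difference (a halving), and the EOR with 0x80 adds the dropped
+    ;// 128 back in:
+    ;//
+    ;//     uint8_t uhadd8r(uint8_t x, uint8_t y)
+    ;//     {
+    ;//         int diff = y - (0xFF ^ x);          /* MVN + UHSUB8 */
+    ;//         return (uint8_t)(((diff >> 1) & 0xFF) ^ 0x80);
+    ;//     }
+    ;//
+    ;// e.g. x = 1, y = 0: diff = -254, ((diff >> 1) & 0xFF) = 0x81, and
+    ;// 0x81 ^ 0x80 = 0x01 = (1 + 0 + 1) >> 1
+;// ***************************************************************************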
+    ;// Description:
+    ;// Load 8 bytes from $pSrc (aligned or unaligned locations)
+    ;//
+    ;// Syntax:
+    ;// M_LOAD_X $pSrc, $srcStep, $out0, $out1, $scratch, $offset
+    ;//
+    ;// Inputs:
+    ;// $pSrc     4-byte-aligned source pointer to an address just less
+    ;//           than or equal to the data location
+    ;// $srcStep  The stride on source
+    ;// $scratch  A scratch register, used internally for temp calculations
+    ;// $offset   Difference of source data location to the source pointer
+    ;//           Use when $offset != 0 (unaligned load)
+    ;//
+    ;// Outputs:
+    ;// $pSrc     Incremented by $srcStep
+    ;// $out0     four packed bytes, z[3] : z[2] : z[1] : z[0]
+    ;// $out1     four packed bytes, z[7] : z[6] : z[5] : z[4]
+    ;//
+    ;// Note: {$out0, $out1, $scratch} should be registers with ascending
+    ;// register numbering. If $offset is 0, $scratch is not modified.
+
+        MACRO
+        M_LOAD_X $pSrc, $srcStep, $out0, $out1, $scratch, $offset
+        IF $offset = 0
+            LDM $pSrc, {$out0, $out1}
+            ADD $pSrc, $pSrc, $srcStep
+        ELSE
+            LDM $pSrc, {$out0, $out1, $scratch}
+            ADD $pSrc, $pSrc, $srcStep
+
+            MOV $out0, $out0, LSR #8 * $offset
+            ORR $out0, $out0, $out1, LSL #(32 - 8 * ($offset))
+            MOV $out1, $out1, LSR #8 * $offset
+            ORR $out1, $out1, $scratch, LSL #(32 - 8 * ($offset))
+        ENDIF
+        MEND
+
+;// ***************************************************************************
+    ;// Description:
+    ;// Loads three words for X interpolation and updates the pointer to the
+    ;// next row. For X interpolation, given a source pointer truncated to
+    ;// 4-byte alignment, three contiguous words are always required from
+    ;// there to cover the nine source bytes needed for filtering.
+    ;//
+    ;// Syntax:
+    ;// M_LOAD_XINT $pSrc, $srcStep, $offset, $word0, $word1, $word2, $word3
+    ;//
+    ;// Inputs:
+    ;// $pSrc     4-byte-aligned source pointer to an address just less
+    ;//           than or equal to the data location
+    ;//
+    ;// $srcStep  The stride on source
+    ;//
+    ;// $offset   Difference of source data location to the source pointer
+    ;//           Use when $offset != 0 (unaligned load)
+    ;//
+    ;// Outputs:
+    ;// $pSrc     Incremented by $srcStep
+    ;//
+    ;// $word0, $word1, $word2, $word3
+    ;//           Three of these are outputs based on the $offset parameter.
+    ;//           The outputs are specifically generated to be processed by
+    ;//           the M_EXT_XINT macro. The following illustrates how the
+    ;//           nine bytes are spanned for different offsets from the
+    ;//           (not truncated for alignment) source pointer:
+    ;//
+    ;//           ------------------------------------------------------
+    ;//           | Offset | Aligned Ptr | word0 | word1 | word2 | word3 |
+    ;//           |------------------------------------------------------|
+    ;//           |   0    |      0      | 0123  | 4567  | 8xxx  |       |
+    ;//           |   1    |     -1      | x012  | 3456  | 78xx  |       |
+    ;//           |   2    |     -2      | xx01  | 2345  | 678x  |       |
+    ;//           |   3    |     -3      | xxx0  |       | 1234  | 5678  |
+    ;//           ------------------------------------------------------
+    ;//
+    ;//           where the numbering (0-8) designates the 9 bytes from the
+    ;//           start of a particular row. The illustration doesn't take
+    ;//           into account the positioning of bytes within the word, and
+    ;//           the combination of this macro with M_EXT_XINT works only
+    ;//           in little-endian environments
+    ;//
+    ;// Note: {$word0, $word1, $word2, $word3} should be registers with
+    ;// ascending register numbering
+
+        MACRO
+        M_LOAD_XINT $pSrc, $srcStep, $offset, $word0, $word1, $word2, $word3
+        IF $offset /= 3
+            LDM $pSrc, {$word0, $word1, $word2}
+        ELSE
+            LDM $pSrc, {$word0, $word2, $word3}
+        ENDIF
+        ADD $pSrc, $pSrc, $srcStep
+        MEND
+
+;// ***************************************************************************
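+    ;// A rough C model of the unaligned ($offset != 0) path of the load
+    ;// macros above (an illustrative sketch only; little-endian, with
+    ;// w0..w2 standing for the words fetched by the LDM):
+    ;//
+    ;//     out0 = (w0 >> (8 * offset)) | (w1 << (32 - 8 * offset));
+    ;//     out1 = (w1 >> (8 * offset)) | (w2 << (32 - 8 * offset));
+    ;//
+    ;// i.e. out0 receives the 4 bytes starting at the unaligned source
+    ;// address and out1 the following 4, reassembled from word-aligned
+    ;// loads (the caller truncates pSrc with BIC before dispatching here).
+;// ***************************************************************************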
+    ;// Description:
+    ;// Extract four registers of four pixels for X interpolation
+    ;//
+    ;// Syntax:
+    ;// M_EXT_XINT $offset, $word0, $word1, $word2, $word3
+    ;//
+    ;// Inputs:
+    ;// $offset   Difference of source data location to the source pointer
+    ;//           Use when $offset != 0 (unaligned load)
+    ;//
+    ;// $word0, $word1, $word2, $word3
+    ;//           Three of these are inputs based on the $offset parameter.
+    ;//           They are the words produced by M_LOAD_XINT for this
+    ;//           purpose:
+    ;//
+    ;//           ------------------------------------------------------
+    ;//           | Offset | Aligned Ptr | word0 | word1 | word2 | word3 |
+    ;//           |------------------------------------------------------|
+    ;//           |   0    |      0      | 0123  | 4567  | 8xxx  | yyyy  |
+    ;//           |   1    |     -1      | x012  | 3456  | 78xx  | yyyy  |
+    ;//           |   2    |     -2      | xx01  | 2345  | 678x  | yyyy  |
+    ;//           |   3    |     -3      | xxx0  | yyyy  | 1234  | 5678  |
+    ;//           ------------------------------------------------------
+    ;//
+    ;// Outputs:
+    ;// $word0, $word1, $word2, $word3
+    ;//           Bytes from the original source pointer (not truncated for
+    ;//           4-byte alignment) as shown in the table:
+    ;//           -------------------------------
+    ;//           | word0 | word1 | word2 | word3 |
+    ;//           |-------------------------------|
+    ;//           | 0123  | 4567  | 1234  | 5678  |
+    ;//           -------------------------------
+    ;//
+    ;// Note: {$word0, $word1, $word2, $word3} should be registers with
+    ;// ascending register numbering
+
+        MACRO
+        M_EXT_XINT $offset, $word0, $word1, $word2, $word3
+        IF $offset = 0
+            ;// $word0 and $word1 are ok
+            ;// $word2, $word3 are just versions shifted by 8 bits
+            MOV $word3, $word1, LSR #8
+            ORR $word3, $word3, $word2, LSL #24
+            MOV $word2, $word0, LSR #8
+            ORR $word2, $word2, $word1, LSL #24
+        ELIF $offset = 3
+            ;// $word2 and $word3 are ok (taken care of while loading)
+            ;// set $word0 & $word1
+            MOV $word0, $word0, LSR #24
+            ORR $word0, $word0, $word2, LSL #8
+            MOV $word1, $word2, LSR #24
+            ORR $word1, $word1, $word3, LSL #8
+        ELSE
+            MOV $word0, $word0, LSR #8 * $offset
+            ORR $word0, $word0, $word1, LSL #(32 - 8 * ($offset))
+            MOV $word1, $word1, LSR #8 * $offset
+            ORR $word1, $word1, $word2, LSL #(32 - 8 * ($offset))
+
+            MOV $word3, $word1, LSR #8
+            ORR $word3, $word3, $word2, LSL #(32 - 8 * (($offset)+1))
+            MOV $word2, $word0, LSR #8
+            ORR $word2, $word2, $word1, LSL #24
+        ENDIF
+        MEND
+
+;// ***************************************************************************
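+    ;// After M_LOAD_XINT + M_EXT_XINT, a row of nine source bytes s[0..8]
+    ;// ends up packed as (illustrative sketch, little-endian):
+    ;//
+    ;//     word0 = s[3]:s[2]:s[1]:s[0]    word1 = s[7]:s[6]:s[5]:s[4]
+    ;//     word2 = s[4]:s[3]:s[2]:s[1]    word3 = s[8]:s[7]:s[6]:s[5]
+    ;//
+    ;// so one packed average of (word0, word2) and one of (word1, word3)
+    ;// computes dst[i] = (s[i] + s[i+1] + round) >> 1 for i = 0..7, the
+    ;// half-pel X filter, in two M_UHADD8R calls.
+;// ***************************************************************************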
+    ;// Description:
+    ;// Computes the half-sum and xor of two inputs and puts them in the
+    ;// input registers in that order
+    ;//
+    ;// Syntax:
+    ;// M_HSUM_XOR $v0, $v1, $tmp
+    ;//
+    ;// Inputs:
+    ;// $v0     a, first input
+    ;// $v1     b, second input
+    ;// $tmp    scratch register
+    ;//
+    ;// Outputs:
+    ;// $v0     (a + b)/2
+    ;// $v1     a ^ b
+
+        MACRO
+        M_HSUM_XOR $v0, $v1, $tmp
+        UHADD8 $tmp, $v0, $v1    ;// s0 = (a + b) >> 1
+        EOR $v1, $v0, $v1        ;// l0 = a ^ b
+        MOV $v0, $tmp            ;// $v0 = s0
+        MEND
+;// ***************************************************************************
+    ;// Description:
+    ;// Calculates the average of 4 values (a, b, c, d) for the HalfPixelXY
+    ;// predict type in the mcReconBlock module. Very specific to the
+    ;// implementation of M_MCRECONBLOCK_HalfPixelXY done here. Uses "tmp"
+    ;// as a scratch register and "yMask" for the mask variable, holding
+    ;// "0x1010101x". The 4 lsbs of yMask are not significant here; they
+    ;// are used by the caller for the row counter (y)
+    ;//
+    ;// Some points to note are:
+    ;// 1. The inputs are a pair of pair-averages and xors
+    ;// 2. $sum1 and $lsb1 are not modified and hence can be reused in
+    ;//    another running average
+    ;// 3. The output is in the first argument
+    ;//
+    ;// Syntax:
+    ;// M_AVG4 $sum0, $lsb0, $sum1, $lsb1, $rndVal
+    ;//
+    ;// Inputs:
+    ;// $sum0    (a + b) >> 1, where a and b are the 1st and 2nd inputs to
+    ;//          be averaged
+    ;// $lsb0    (a ^ b)
+    ;// $sum1    (c + d) >> 1. Not modified
+    ;// $lsb1    (c ^ d)       Not modified
+    ;// $rndVal  Assembler variable. 0 for rounding, 1 for no rounding
+    ;//
+    ;// Outputs:
+    ;// $sum0    (a + b + c + d + 1) / 4 : If no rounding
+    ;//          (a + b + c + d + 2) / 4 : If rounding
+
+        MACRO
+        M_AVG4 $sum0, $lsb0, $sum1, $lsb1, $rndVal
+        LCLS OP1
+        LCLS OP2
+        IF $rndVal = 0 ;// rounding case
+OP1         SETS "AND"
+OP2         SETS "ORR"
+        ELSE           ;// no-rounding case
+OP1         SETS "ORR"
+OP2         SETS "AND"
+        ENDIF
+
+        LCLS lsb2
+        LCLS sum2
+        LCLS dest
+
+lsb2    SETS "tmp"
+sum2    SETS "$lsb0"
+dest    SETS "$sum0"
+
+        $OP1 $lsb0, $lsb0, $lsb1        ;// e0 = l0 & l1 (| if no rounding)
+        EOR $lsb2, $sum0, $sum1         ;// e2 = s0 ^ s1
+        $OP2 $lsb2, $lsb2, $lsb0        ;// e2 = e2 | e0 (& if no rounding)
+        AND $lsb2, $lsb2, yMask, LSR #4 ;// e2 = e2 & mask
+        UHADD8 $sum2, $sum0, $sum1      ;// s2 = (s0 + s1)/2
+        UADD8 $dest, $sum2, $lsb2       ;// dest = s2 + e2
+        MEND
+;// ***************************************************************************
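+    ;// A rough C model of one byte lane of M_AVG4 (an illustrative sketch
+    ;// only), with s0 = (a + b) >> 1, l0 = a ^ b, s1 = (c + d) >> 1 and
+    ;// l1 = c ^ d coming from M_HSUM_XOR:
+    ;//
+    ;//     /* rounding case ($rndVal = 0): (a + b + c + d + 2) >> 2 */
+    ;//     e2  = ((s0 ^ s1) | (l0 & l1)) & 1;
+    ;//     avg = ((s0 + s1) >> 1) + e2;      /* UHADD8 + UADD8 */
+    ;//
+    ;//     /* no-rounding case ($rndVal = 1): (a + b + c + d + 1) >> 2 */
+    ;//     e2  = ((s0 ^ s1) & (l0 | l1)) & 1;
+    ;//     avg = ((s0 + s1) >> 1) + e2;
+    ;//
+    ;// The bit e2 restores the carries dropped by the two halvings, e.g.
+    ;// a = b = 1, c = d = 0, rounding: s0 = 1, s1 = 0, l0 = l1 = 0, so
+    ;// e2 = 1 and avg = 1 = (1 + 1 + 0 + 0 + 2) >> 2.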
+;// ***************************************************************************
+;// Motion compensation handler macros
+;// ***************************************************************************
+    ;// Description:
+    ;// Implement the motion compensation routines using the named registers
+    ;// of the callee function. Each of the following 4 macros implements
+    ;// one of the 4 predict types. Each handles 8 cases, i.e. all
+    ;// combinations of the 4 source alignment offsets and the 2 rounding
+    ;// flags
+    ;//
+    ;// Syntax:
+    ;// M_MCRECONBLOCK_IntegerPixel $rndVal, $offset
+    ;// M_MCRECONBLOCK_HalfPixelX   $rndVal, $offset
+    ;// M_MCRECONBLOCK_HalfPixelY   $rndVal, $offset
+    ;// M_MCRECONBLOCK_HalfPixelXY  $rndVal, $offset
+    ;//
+    ;// Inputs:
+    ;// $rndVal  Assembler variable. 0 for rounding, 1 for no rounding
+    ;// $offset  $pSrc MOD 4 value. Offset from the 4-byte-aligned location.
+    ;//
+    ;// Outputs:
+    ;// Outputs come in the named registers of the callee functions.
+    ;// The macro loads the data from the source pointer, processes it and
+    ;// stores it at the destination pointer, i.e. the whole prediction
+    ;// cycle of the motion compensation routine for a particular predict
+    ;// type. After this, only the residue addition to the predicted values
+    ;// remains
+
+        MACRO
+        M_MCRECONBLOCK_IntegerPixel $rndVal, $offset
+    ;// Algorithmic Description:
+    ;// This handles motion compensation for the IntegerPixel predict type.
+    ;// Both rounding cases are handled by the same code base. It is just a
+    ;// copy from source to destination. Two lines are done per loop
+    ;// iteration to reduce stalls, and the loop has been software pipelined
+    ;// for the same purpose.
+    ;//
+    ;// M_LOAD_X loads a whole row into two registers, which are then stored
+
+CaseIntegerPixelRnd0Offset$offset
+CaseIntegerPixelRnd1Offset$offset
+        M_LOAD_X pSrc, srcStep, tmp1, tmp2, tmp3, $offset
+        M_LOAD_X pSrc, srcStep, tmp3, tmp4, tmp5, $offset
+YloopIntegerPixelOffset$offset
+        SUBS y, y, #2
+        STRD tmp1, tmp2, [pDst], dstStep
+        STRD tmp3, tmp4, [pDst], dstStep
+        M_LOAD_X pSrc, srcStep, tmp1, tmp2, tmp3, $offset
+        M_LOAD_X pSrc, srcStep, tmp3, tmp4, tmp5, $offset
+        BGT YloopIntegerPixelOffset$offset
+
+        B SwitchPredictTypeEnd
+        MEND
+;// ***************************************************************************
+        MACRO
+        M_MCRECONBLOCK_HalfPixelX $rndVal, $offset
+    ;// Algorithmic Description:
+    ;// This handles motion compensation for the HalfPixelX predict type.
+    ;// The two rounding cases are handled by different code bases,
+    ;// generated by different macro calls. The loop has been software
+    ;// pipelined to reduce stalls.
+    ;//
+    ;// Filtering involves averaging a pixel with the next horizontal pixel.
+    ;// The M_LOAD_XINT and M_EXT_XINT combination generates 4 registers:
+    ;// 2 holding a whole row, 4 pixels per register, and another 2 holding
+    ;// the same row shifted by one pixel horizontally. These packed
+    ;// registers are suitable for 4-lane SIMD, and the M_UHADD8R macro
+    ;// then does the averaging, taking care of the rounding as required
+
+CaseHalfPixelXRnd$rndVal.Offset$offset
+        IF $rndVal = 0
+            LDR mask, =0x80808080
+        ENDIF
+
+        M_LOAD_XINT pSrc, srcStep, $offset, tmp1, tmp2, tmp3, tmp4
+YloopHalfPixelXRnd$rndVal.Offset$offset
+        SUBS y, y, #1
+        M_EXT_XINT $offset, tmp1, tmp2, tmp3, tmp4
+        M_UHADD8R tmp5, tmp1, tmp3, (1-$rndVal), mask
+        M_UHADD8R tmp6, tmp2, tmp4, (1-$rndVal), mask
+        STRD tmp5, tmp6, [pDst], dstStep
+        M_LOAD_XINT pSrc, srcStep, $offset, tmp1, tmp2, tmp3, tmp4
+        BGT YloopHalfPixelXRnd$rndVal.Offset$offset
+
+        B SwitchPredictTypeEnd
+        MEND
+;// ***************************************************************************
+        MACRO
+        M_MCRECONBLOCK_HalfPixelY $rndVal, $offset
+    ;// Algorithmic Description:
+    ;// This handles motion compensation for the HalfPixelY predict type.
+    ;// The two rounding cases are handled by different code bases,
+    ;// generated by different macro calls. Preloading is used to avoid
+    ;// reloading the same data.
+    ;//
+    ;// Filtering involves averaging a pixel with the next vertical pixel.
+    ;// M_LOAD_X generates 2 registers holding a whole row, 4 pixels per
+    ;// register. These packed registers are suitable for 4-lane SIMD, and
+    ;// the M_UHADD8R macro then does the averaging, taking care of the
+    ;// rounding as required
+
+CaseHalfPixelYRnd$rndVal.Offset$offset
+        IF $rndVal = 0
+            LDR mask, =0x80808080
+        ENDIF
+
+        M_LOAD_X pSrc, srcStep, tmp1, tmp2, tmp5, $offset ;// Pre-load
+YloopHalfPixelYRnd$rndVal.Offset$offset
+        SUBS y, y, #2
+        ;// Processing one line
+        M_LOAD_X pSrc, srcStep, tmp3, tmp4, tmp5, $offset
+        M_UHADD8R tmp1, tmp1, tmp3, (1-$rndVal), mask
+        M_UHADD8R tmp2, tmp2, tmp4, (1-$rndVal), mask
+        STRD tmp1, tmp2, [pDst], dstStep
+        ;// Processing another line
+        M_LOAD_X pSrc, srcStep, tmp1, tmp2, tmp5, $offset
+        M_UHADD8R tmp3, tmp3, tmp1, (1-$rndVal), mask
+        M_UHADD8R tmp4, tmp4, tmp2, (1-$rndVal), mask
+        STRD tmp3, tmp4, [pDst], dstStep
+
+        BGT YloopHalfPixelYRnd$rndVal.Offset$offset
+
+        B SwitchPredictTypeEnd
+        MEND
+;// ***************************************************************************
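+    ;// Scalar equivalents of the two half-pel handlers above, per the
+    ;// MPEG-4 half-pel predictor definitions (an illustrative sketch only):
+    ;//
+    ;//     /* HalfPixelX */
+    ;//     dst[y][x] = (src[y][x] + src[y][x+1] + 1 - rndVal) >> 1;
+    ;//     /* HalfPixelY */
+    ;//     dst[y][x] = (src[y][x] + src[y+1][x] + 1 - rndVal) >> 1;
+    ;//
+    ;// which is why both handlers pass (1 - $rndVal) as the rounding
+    ;// argument of M_UHADD8R.
+;// ***************************************************************************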
+        MACRO
+        M_MCRECONBLOCK_HalfPixelXY $rndVal, $offset
+    ;// Algorithmic Description:
+    ;// This handles motion compensation for the HalfPixelXY predict type.
+    ;// The two rounding cases are handled by different code bases,
+    ;// generated by different macro calls. Preloading is used to avoid
+    ;// reloading the same data.
+    ;//
+    ;// Filtering involves averaging a pixel with its next vertical,
+    ;// horizontal and right-down diagonal pixels. Just as in the HalfPixelX
+    ;// case, the M_LOAD_XINT and M_EXT_XINT combination generates 4
+    ;// registers holding a row and its 1-pixel right-shifted version, with
+    ;// 4 pixels per register. Another call of that macro combination gets
+    ;// another row. Then M_HSUM_XOR is called to get the mutual half-sum
+    ;// and xor combinations of a row with its shifted version, as these are
+    ;// the inputs to the M_AVG4 macro, which computes the 4-element average
+    ;// with rounding. Note that it is the half-sum/xor values that are
+    ;// preserved for the next row, as they can be reused in the next call
+    ;// to M_AVG4, saving recomputation.
+    ;// Due to a shortage of registers, the row counter and the masking
+    ;// value required in M_AVG4 are packed into a single register, yMask,
+    ;// whose last nibble holds the row counter and whose remaining bits
+    ;// hold the masking variable, left-shifted by 4
+
+CaseHalfPixelXYRnd$rndVal.Offset$offset
+        LDR yMask, =((0x01010101 << 4) + 8)
+
+        M_LOAD_XINT pSrc, srcStep, $offset, t00, t01, t10, t11 ;// Load a, a', b, b'
+        M_EXT_XINT $offset, t00, t01, t10, t11
+        M_HSUM_XOR t00, t10, tmp                               ;// s0, l0
+        M_HSUM_XOR t01, t11, tmp                               ;// s0', l0'
+
+YloopHalfPixelXYRnd$rndVal.Offset$offset
+        ;// Processing one line
+        ;// t00, t01, t10, t11 required from previous loop
+        M_LOAD_XINT pSrc, srcStep, $offset, t20, t21, t30, t31 ;// Load c, c', d, d'
+        SUB yMask, yMask, #2
+        M_EXT_XINT $offset, t20, t21, t30, t31
+        M_HSUM_XOR t20, t30, tmp                               ;// s1, l1
+        M_HSUM_XOR t21, t31, tmp                               ;// s1', l1'
+        M_AVG4 t00, t10, t20, t30, $rndVal                     ;// s0, l0, s1, l1
+        M_AVG4 t01, t11, t21, t31, $rndVal                     ;// s0', l0', s1', l1'
+        STRD t00, t01, [pDst], dstStep                         ;// store the average
+
+        ;// Processing another line
+        ;// t20, t21, t30, t31 required from above
+        M_LOAD_XINT pSrc, srcStep, $offset, t00, t01, t10, t11 ;// Load a, a', b, b'
+        TST yMask, #7
+        M_EXT_XINT $offset, t00, t01, t10, t11
+        M_HSUM_XOR t00, t10, tmp
+        M_HSUM_XOR t01, t11, tmp
+        M_AVG4 t20, t30, t00, t10, $rndVal
+        M_AVG4 t21, t31, t01, t11, $rndVal
+        STRD t20, t21, [pDst], dstStep
+
+        BGT YloopHalfPixelXYRnd$rndVal.Offset$offset
+
+        ;// The last case generated (Rnd1, Offset3) immediately precedes the
+        ;// SwitchPredictTypeEnd label, so it simply falls through to it
+        IF $offset /= 3 :LOR: $rndVal /= 1
+            B SwitchPredictTypeEnd
+        ENDIF
+        MEND
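+;// ***************************************************************************
+    ;// Scalar equivalent of the HalfPixelXY handler above (an illustrative
+    ;// sketch only):
+    ;//
+    ;//     dst[y][x] = (src[y][x]   + src[y][x+1] +
+    ;//                  src[y+1][x] + src[y+1][x+1] + 2 - rndVal) >> 2;
+    ;//
+    ;// Each row's half-sum/xor pair contributes to two output rows (as the
+    ;// second operand pair of one M_AVG4 call and the first of the next),
+    ;// which is what the software pipelining above preserves across
+    ;// iterations.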
+;// ***************************************************************************
+;// Motion compensation handler macros end here
+;// ***************************************************************************
+    ;// Description:
+    ;// Populates the 4 offset "cases" for a given predictType and rndVal
+    ;// combination in the "switch" to the prediction processing code
+    ;// segments
+    ;//
+    ;// Syntax:
+    ;// M_CASE_OFFSET $rnd, $predictType
+    ;//
+    ;// Inputs:
+    ;// $rnd          0 for rounding, 1 for no rounding
+    ;// $predictType  The prediction mode
+    ;//
+    ;// Outputs:
+    ;// Populated list of "M_CASE"s for the "M_SWITCH" macro
+
+        MACRO
+        M_CASE_OFFSET $rnd, $predictType
+        M_CASE Case$predictType.Rnd$rnd.Offset0
+        M_CASE Case$predictType.Rnd$rnd.Offset1
+        M_CASE Case$predictType.Rnd$rnd.Offset2
+        M_CASE Case$predictType.Rnd$rnd.Offset3
+        MEND
+;// ***************************************************************************
+    ;// Description:
+    ;// Populates the 2 rounding "cases" for a given predictType in the
+    ;// "switch" to the prediction processing code segments
+    ;//
+    ;// Syntax:
+    ;// M_CASE_MCRECONBLOCK $predictType
+    ;//
+    ;// Inputs:
+    ;// $predictType  The prediction mode
+    ;//
+    ;// Outputs:
+    ;// Populated list of "M_CASE_OFFSET" macros
+
+        MACRO
+        M_CASE_MCRECONBLOCK $predictType
+        M_CASE_OFFSET 0, $predictType ;// 0 for rounding
+        M_CASE_OFFSET 1, $predictType ;// 1 for no rounding
+        MEND
+;// ***************************************************************************
+    ;// Description:
+    ;// Populates the handling code for all 8 rounding and offset
+    ;// combinations of the specified predictType. In the case of the
+    ;// "IntegerPixel" predictType, rounding is not required, so the same
+    ;// code segment handles both cases
+    ;//
+    ;// Syntax:
+    ;// M_MCRECONBLOCK $predictType
+    ;//
+    ;// Inputs:
+    ;// $predictType  The prediction mode
+    ;//
+    ;// Outputs:
+    ;// Populated list of "M_MCRECONBLOCK_<predictType>" macros for the
+    ;// specified predictType. Each
+    ;//     M_MCRECONBLOCK_<predictType> $rnd, $offset
+    ;// is a code segment (starting with a label indicating the predictType,
+    ;// rounding and offset combination).
+    ;// Four calls of this macro with the 4 prediction modes populate all 32
+    ;// handlers
+
+        MACRO
+        M_MCRECONBLOCK $predictType
+        M_MCRECONBLOCK_$predictType 0, 0
+        M_MCRECONBLOCK_$predictType 0, 1
+        M_MCRECONBLOCK_$predictType 0, 2
+        M_MCRECONBLOCK_$predictType 0, 3
+        IF "$predictType" /= "IntegerPixel" ;// If not IntegerPixel then rounding makes a difference
+            M_MCRECONBLOCK_$predictType 1, 0
+            M_MCRECONBLOCK_$predictType 1, 1
+            M_MCRECONBLOCK_$predictType 1, 2
+            M_MCRECONBLOCK_$predictType 1, 3
+        ENDIF
+        MEND
+;// ***************************************************************************
+;// Input/Output Registers
+pSrc            RN 0
+srcStep         RN 1
+arg_pSrcResidue RN 2
+pSrcResidue     RN 12
+pDst            RN 3
+dstStep         RN 2
+predictType     RN 10
+rndVal          RN 11
+mask            RN 11
+
+;// Local Scratch Registers
+zero            RN 12
+y               RN 14
+
+tmp1            RN 4
+tmp2            RN 5
+tmp3            RN 6
+tmp4            RN 7
+tmp5            RN 8
+tmp6            RN 9
+tmp7            RN 10
+tmp8            RN 11
+tmp9            RN 12
+
+t00             RN 4
+t01             RN 5
+t10             RN 6
+t11             RN 7
+t20             RN 8
+t21             RN 9
+t30             RN 10
+t31             RN 11
+tmp             RN 12
+
+yMask           RN 14
+
+dst             RN 1
+return          RN 0
+
+        ;// Allocate memory on stack
+        M_ALLOC4 Stk_pDst, 4
+        M_ALLOC4 Stk_pSrcResidue, 4
+        ;// Function header
+        M_START omxVCM4P2_MCReconBlock, r11
+        ;// Define stack arguments
+        M_ARG Arg_dstStep, 4
+        M_ARG Arg_predictType, 4
+        M_ARG Arg_rndVal, 4
+        ;// Save on stack
+        M_STR pDst, Stk_pDst
+        M_STR arg_pSrcResidue, Stk_pSrcResidue
+        ;// Load arguments from the stack
+        M_LDR dstStep, Arg_dstStep
+        M_LDR predictType, Arg_predictType
+        M_LDR rndVal, Arg_rndVal
+
+        MOV y, #8
+
+        ;// switch index = (predictType << 3) | (rndVal << 2) | (pSrc & 3)
+        AND tmp1, pSrc, #3
+        ORR predictType, tmp1, predictType, LSL #3
+        ORR predictType, predictType, rndVal, LSL #2
+        ;// Truncate the source pointer to a 4-byte-aligned location
+        BIC pSrc, pSrc, #3
+
+        ;// The implementation handles every combination of predictType,
+        ;// rounding case and source pointer offset from 4-byte alignment
+        ;// in a separate code base, except where one of these parameters
+        ;// makes no difference to the implementation. The
+        ;// M_CASE_MCRECONBLOCK macros below branch into 8 M_CASE entries
+        ;// each, for all combinations of the 2 rounding cases and the 4
+        ;// offsets of pSrc from 4-byte alignment.
+        M_SWITCH predictType
+            M_CASE_MCRECONBLOCK IntegerPixel
+            M_CASE_MCRECONBLOCK HalfPixelX
+            M_CASE_MCRECONBLOCK HalfPixelY
+            M_CASE_MCRECONBLOCK HalfPixelXY
+        M_ENDSWITCH
+
+        ;// The M_MCRECONBLOCK macros populate the code bases by calling all
+        ;// 8 particular macros (4 in the case of IntegerPixel, as rounding
+        ;// makes no difference there) to generate the code for all cases of
+        ;// rounding and offsets. LTORG is used to split the code, as its
+        ;// size grows beyond the 4KB reach of a literal pool load.
+        M_MCRECONBLOCK IntegerPixel
+        M_MCRECONBLOCK HalfPixelX
+        LTORG
+        M_MCRECONBLOCK HalfPixelY
+        M_MCRECONBLOCK HalfPixelXY
+SwitchPredictTypeEnd
+
+        ;// Residue Addition
+        ;// This is done in 2-lane SIMD, though the loads are further
+        ;// optimized: 4 bytes are loaded at a time from the destination
+        ;// buffer. Algorithmic details are in the inlined comments
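+        ;// Scalar equivalent of the loop below (an illustrative sketch
+        ;// only; pSrcResidue holds signed 16-bit residuals in raster
+        ;// order):
+        ;//
+        ;//     for (y = 0; y < 8; y++, pDst += dstStep, pRes += 8)
+        ;//         for (x = 0; x < 8; x++)
+        ;//             pDst[x] = (OMX_U8)armClip(0, 255, pDst[x] + pRes[x]);
+        ;//
+        ;// The SIMD version splits each destination word into its even
+        ;// bytes (UXTB16 dst) and odd bytes (UXTB16 dst, ROR #8), adds the
+        ;// matching residual pair with QADD16, saturates each 16-bit lane
+        ;// to 8 bits with USAT16 and re-interleaves with the final ORR.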
+        M_LDR pSrcResidue, Stk_pSrcResidue
+        CMP pSrcResidue, #0
+        BEQ pSrcResidueConditionEnd
+pSrcResidueNotNull
+        M_LDR pDst, Stk_pDst
+        MOV y, #8
+        SUB dstStep, dstStep, #4
+Yloop_pSrcResidueNotNull
+        SUBS y, y, #1
+        LDR dst, [pDst]                  ;// dst = [dcba]
+        LDMIA pSrcResidue!, {tmp1, tmp2} ;// tmp1 = [B A], tmp2 = [D C]
+        PKHBT tmp3, tmp1, tmp2, LSL #16  ;// DeltaVal1 = [C A]
+        PKHTB tmp4, tmp2, tmp1, ASR #16  ;// DeltaVal2 = [D B]
+        UXTB16 tmp1, dst                 ;// tmp1 = [0c0a]
+        UXTB16 tmp2, dst, ROR #8         ;// tmp2 = [0d0b]
+        QADD16 tmp1, tmp1, tmp3          ;// Add and saturate to 16 bits
+        QADD16 tmp2, tmp2, tmp4
+        USAT16 tmp1, #8, tmp1
+        USAT16 tmp2, #8, tmp2            ;// armClip(0, 255, tmp2)
+        ORR tmp1, tmp1, tmp2, LSL #8     ;// tmp1 = [dcba]
+        STR tmp1, [pDst], #4
+
+        LDR dst, [pDst]
+        LDMIA pSrcResidue!, {tmp1, tmp2}
+        PKHBT tmp3, tmp1, tmp2, LSL #16
+        PKHTB tmp4, tmp2, tmp1, ASR #16
+        UXTB16 tmp1, dst
+        UXTB16 tmp2, dst, ROR #8
+        QADD16 tmp1, tmp1, tmp3
+        QADD16 tmp2, tmp2, tmp4
+        USAT16 tmp1, #8, tmp1
+        USAT16 tmp2, #8, tmp2
+        ORR tmp1, tmp1, tmp2, LSL #8
+        STR tmp1, [pDst], dstStep
+
+        BGT Yloop_pSrcResidueNotNull
+pSrcResidueConditionEnd
+
+        MOV return, #OMX_Sts_NoErr
+
+        M_END
+        ENDIF ;// ARM1136JS
+
+;// ***************************************************************************
+;// CortexA8 implementation
+;// ***************************************************************************
+        END
+;// ***************************************************************************
+;// omxVCM4P2_MCReconBlock ends
+;// ***************************************************************************