Diffstat (limited to 'media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p2/src/omxVCM4P2_MCReconBlock_s.s')
-rw-r--r--  media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p2/src/omxVCM4P2_MCReconBlock_s.s | 713
1 file changed, 713 insertions, 0 deletions
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p2/src/omxVCM4P2_MCReconBlock_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p2/src/omxVCM4P2_MCReconBlock_s.s
new file mode 100644
index 0000000..20965bf
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p2/src/omxVCM4P2_MCReconBlock_s.s
@@ -0,0 +1,713 @@
+;//
+;//
+;// File Name: omxVCM4P2_MCReconBlock_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision: 9641
+;// Date: Thursday, February 7, 2008
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+;// Description:
+;//
+;//
+
+;// Include standard headers
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+
+;// Import symbols required from other files
+
+ M_VARIANTS ARM1136JS
+
+;// ***************************************************************************
+;// ARM1136JS implementation
+;// ***************************************************************************
+ IF ARM1136JS
+
+;// ***************************************************************************
+;// MACRO DEFINITIONS
+;// ***************************************************************************
+ ;// Description:
+ ;//
+ ;// dest[j] = (x[j] + y[j] + round) >> 1, j=0..3
+ ;//
+ ;// Similar to the UHADD8 instruction, but if $round is 1, a rounding
+ ;// value of 1 is added to each sum before it is halved
+ ;//
+ ;// Syntax:
+ ;// M_UHADD8R $dest, $x, $y, $round, $mask
+ ;//
+ ;// Inputs:
+ ;// $x four packed bytes, x[3] : x[2] : x[1] : x[0]
+ ;// $y four packed bytes, y[3] : y[2] : y[1] : y[0]
+ ;// $round 0 if no rounding to be added, 1 if rounding to be done
+ ;// $mask some register set to 0x80808080
+ ;//
+ ;// Outputs:
+ ;// $dest four packed bytes, z[3] : z[2] : z[1] : z[0]
+
+ MACRO
+ M_UHADD8R $dest, $x, $y, $round, $mask
+ IF $round = 1
+ IF $dest /= $y
+ MVN $dest, $x
+ UHSUB8 $dest, $y, $dest
+ EOR $dest, $dest, $mask
+ ELSE
+ MVN $dest, $y
+ UHSUB8 $dest, $x, $dest
+ EOR $dest, $dest, $mask
+ ENDIF
+ ELSE
+ UHADD8 $dest, $x, $y
+ ENDIF
+ MEND
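+
+ ;// Rounding-path rationale: per byte, ~x = 255 - x, so the unsigned
+ ;// halving subtract gives (y - ~x) >> 1 == ((x + y + 1) >> 1) ^ 0x80,
+ ;// and the final EOR with 0x80 removes the bias. As an illustrative
+ ;// sketch (hypothetical register choice), with r4 = x, r5 = y and
+ ;// r11 = 0x80808080:
+ ;// M_UHADD8R r6, r4, r5, 1, r11
+ ;// expands to:
+ ;// MVN r6, r4
+ ;// UHSUB8 r6, r5, r6
+ ;// EOR r6, r6, r11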
+;// ***************************************************************************
+ ;// Description:
+ ;// Load 8 bytes from $pSrc (aligned or unaligned locations)
+ ;//
+ ;// Syntax:
+ ;// M_LOAD_X $pSrc, $srcStep, $out0, $out1, $scratch, $offset
+ ;//
+ ;// Inputs:
+ ;// $pSrc 4 byte aligned source pointer to an address just less than
+ ;// or equal to the data location
+ ;// $srcStep The stride on source
+ ;// $scratch A scratch register, used internally for temp calculations
+ ;// $offset Difference of source data location to the source pointer
+ ;// Use when $offset != 0 (unaligned load)
+ ;//
+ ;// Outputs:
+ ;// $pSrc Incremented by $srcStep to point to the next row
+ ;// $out0 four packed bytes, z[3] : z[2] : z[1] : z[0]
+ ;// $out1 four packed bytes, z[7] : z[6] : z[5] : z[4]
+ ;//
+ ;// Note: {$out0, $out1, $scratch} should be registers with ascending
+ ;// register numbering. In case offset is 0, $scratch is not modified.
+
+ MACRO
+ M_LOAD_X $pSrc, $srcStep, $out0, $out1, $scratch, $offset
+ IF $offset = 0
+ LDM $pSrc, {$out0, $out1}
+ ADD $pSrc, $pSrc, $srcStep
+ ELSE
+ LDM $pSrc, {$out0, $out1, $scratch}
+ ADD $pSrc, $pSrc, $srcStep
+
+ MOV $out0, $out0, LSR #8 * $offset
+ ORR $out0, $out0, $out1, LSL #(32 - 8 * ($offset))
+ MOV $out1, $out1, LSR #8 * $offset
+ ORR $out1, $out1, $scratch, LSL #(32 - 8 * ($offset))
+ ENDIF
+ MEND
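+
+ ;// For the unaligned case, the extraction above amounts to a funnel
+ ;// shift of the three loaded words. A C sketch of the same operation
+ ;// (illustrative; w0..w2 are the loaded words, little endian):
+ ;// out0 = (w0 >> (8 * offset)) | (w1 << (32 - 8 * offset));
+ ;// out1 = (w1 >> (8 * offset)) | (w2 << (32 - 8 * offset));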
+
+;// ***************************************************************************
+ ;// Description:
+ ;// Loads three words for X interpolation and updates the pointer to
+ ;// the next row. For X interpolation, given a truncated, 4-byte-aligned
+ ;// source pointer, three contiguous words are invariably required from
+ ;// there to cover the nine source bytes needed for filtering.
+ ;//
+ ;// Syntax:
+ ;// M_LOAD_XINT $pSrc, $srcStep, $offset, $word0, $word1, $word2, $word3
+ ;//
+ ;// Inputs:
+ ;// $pSrc 4 byte aligned source pointer to an address just less than
+ ;// or equal to the data location
+ ;//
+ ;// $srcStep The stride on source
+ ;//
+ ;// $offset Difference of source data location to the source pointer
+ ;// Use when $offset != 0 (unaligned load)
+ ;//
+ ;// Outputs:
+ ;// $pSrc Incremented by $srcStep
+ ;//
+ ;// $word0, $word1, $word2, $word3
+ ;// Three of these are outputs, depending on the $offset parameter.
+ ;// The outputs are specifically generated to be processed by
+ ;// the M_EXT_XINT macro. The following illustration shows how
+ ;// the nine bytes are spanned for different offsets from the
+ ;// original (not alignment-truncated) source pointer.
+ ;//
+ ;// ------------------------------------------------------
+ ;// | Offset | Aligned Ptr | word0 | word1 | word2 | word3 |
+ ;// |------------------------------------------------------|
+ ;// | 0 | 0 | 0123 | 4567 | 8xxx | |
+ ;// | 1 | -1 | x012 | 3456 | 78xx | |
+ ;// | 2 | -2 | xx01 | 2345 | 678x | |
+ ;// | 3 | -3 | xxx0 | | 1234 | 5678 |
+ ;// ------------------------------------------------------
+ ;//
+ ;// where the numbering (0-8) designates the 9 bytes from the start
+ ;// of a particular row. The illustration does not take into account
+ ;// the positioning of bytes within the word; the combination with
+ ;// the M_EXT_XINT macro works only in little endian
+ ;// environments
+ ;//
+ ;// Note: {$word0, $word1, $word2, $word3} should be registers with ascending
+ ;// register numbering
+
+ MACRO
+ M_LOAD_XINT $pSrc, $srcStep, $offset, $word0, $word1, $word2, $word3
+ IF $offset /= 3
+ LDM $pSrc, {$word0, $word1, $word2}
+ ELSE
+ LDM $pSrc, {$word0, $word2, $word3}
+ ENDIF
+ ADD $pSrc, $pSrc, $srcStep
+ MEND
+
+;// ***************************************************************************
+ ;// Description:
+ ;// Extract four registers of four pixels for X interpolation
+ ;//
+ ;// Syntax:
+ ;// M_EXT_XINT $offset, $word0, $word1, $word2, $word3
+ ;//
+ ;// Inputs:
+ ;// $offset Difference of source data location to the source pointer
+ ;// Use when $offset != 0 (unaligned load)
+ ;//
+ ;// $word0, $word1, $word2, $word3
+ ;// Three of these are inputs, depending on the $offset parameter.
+ ;// The inputs are laid out by the M_LOAD_XINT macro specifically
+ ;// to be processed by this macro.
+ ;//
+ ;// ------------------------------------------------------
+ ;// | Offset | Aligned Ptr | word0 | word1 | word2 | word3 |
+ ;// |------------------------------------------------------|
+ ;// | 0 | 0 | 0123 | 4567 | 8xxx | yyyy |
+ ;// | 1 | -1 | x012 | 3456 | 78xx | yyyy |
+ ;// | 2 | -2 | xx01 | 2345 | 678x | yyyy |
+ ;// | 3 | -3 | xxx0 | yyyy | 1234 | 5678 |
+ ;// ------------------------------------------------------
+ ;//
+ ;// Outputs:
+ ;// $word0, $word1, $word2, $word3
+ ;// Bytes from the original source pointer (not truncated for
+ ;// 4 byte alignment) as shown in the table.
+ ;// -------------------------------
+ ;// | word0 | word1 | word2 | word3 |
+ ;// |-------------------------------|
+ ;// | 0123 | 4567 | 1234 | 5678 |
+ ;// -------------------------------
+ ;//
+ ;// Note: {$word0, $word1, $word2, $word3} should be registers with ascending
+ ;// register numbering
+
+ MACRO
+ M_EXT_XINT $offset, $word0, $word1, $word2, $word3
+ IF $offset = 0
+ ; $word0 and $word1 are ok
+ ; $word2, $word3 are just 8 shifted versions
+ MOV $word3, $word1, LSR #8
+ ORR $word3, $word3, $word2, LSL #24
+ MOV $word2, $word0, LSR #8
+ ORR $word2, $word2, $word1, LSL #24
+ ELIF $offset = 3
+ ; $word2 and $word3 are ok (taken care while loading itself)
+ ; set $word0 & $word1
+ MOV $word0, $word0, LSR #24
+ ORR $word0, $word0, $word2, LSL #8
+ MOV $word1, $word2, LSR #24
+ ORR $word1, $word1, $word3, LSL #8
+ ELSE
+ MOV $word0, $word0, LSR #8 * $offset
+ ORR $word0, $word0, $word1, LSL #(32 - 8 * ($offset))
+ MOV $word1, $word1, LSR #8 * $offset
+ ORR $word1, $word1, $word2, LSL #(32 - 8 * ($offset))
+
+ MOV $word3, $word1, LSR #8
+ ORR $word3, $word3, $word2, LSL #(32 - 8 * (($offset)+1))
+ MOV $word2, $word0, LSR #8
+ ORR $word2, $word2, $word1, LSL #24
+ ENDIF
+ MEND
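+
+ ;// For example, for $offset = 0 the extraction reduces to two funnel
+ ;// shifts; a C sketch (illustrative, little endian; w2's low byte is
+ ;// source byte 8 on entry):
+ ;// w3 = (w1 >> 8) | (w2 << 24); /* bytes 5678 */
+ ;// w2 = (w0 >> 8) | (w1 << 24); /* bytes 1234 */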
+
+;// ***************************************************************************
+ ;// Description:
+ ;// Computes half-sum and xor of two inputs and puts them in the input
+ ;// registers in that order
+ ;//
+ ;// Syntax:
+ ;// M_HSUM_XOR $v0, $v1, $tmp
+ ;//
+ ;// Inputs:
+ ;// $v0 a, first input
+ ;// $v1 b, second input
+ ;// $tmp scratch register
+ ;//
+ ;// Outputs:
+ ;// $v0 (a + b)/2
+ ;// $v1 a ^ b
+
+ MACRO
+ M_HSUM_XOR $v0, $v1, $tmp
+ UHADD8 $tmp, $v0, $v1 ;// s0 = a + b
+ EOR $v1, $v0, $v1 ;// l0 = a ^ b
+ MOV $v0, $tmp ;// s0
+ MEND
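+
+ ;// The half-sum/xor pair loses no information about a + b: per byte,
+ ;// a + b == 2 * ((a + b) >> 1) + ((a ^ b) & 1), the xor lsb being the
+ ;// carry dropped by the halving add. A scalar C sketch (illustrative):
+ ;// unsigned s = (a + b) >> 1; /* UHADD8, per byte */
+ ;// unsigned l = a ^ b; /* EOR */
+ ;// assert(a + b == 2 * s + (l & 1));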
+;// ***************************************************************************
+ ;// Description:
+ ;// Calculates the average of 4 values (a, b, c, d) for the HalfPixelXY
+ ;// predict type in the MCReconBlock module. Very specific to the
+ ;// implementation of M_MCRECONBLOCK_HalfPixelXY done here. Uses "tmp"
+ ;// as a scratch register and "yMask" for the mask variable
+ ;// "0x1010101x" set in it. The 4 lsbs of yMask are not significant
+ ;// and are used by the caller for the row counter (y)
+ ;//
+ ;// Some points to note are:
+ ;// 1. Input is pair of pair-averages and Xors
+ ;// 2. $sum1 and $lsb1 are not modified and hence can be reused in another
+ ;// running average
+ ;// 3. Output is in the first argument
+ ;//
+ ;// Syntax:
+ ;// M_AVG4 $sum0, $lsb0, $sum1, $lsb1, $rndVal
+ ;//
+ ;// Inputs:
+ ;// $sum0 (a + b) >> 1, where a and b are 1st and 2nd inputs to be averaged
+ ;// $lsb0 (a ^ b)
+ ;// $sum1 (c + d) >> 1. Not modified
+ ;// $lsb1 (c ^ d) Not modified
+ ;// $rndVal Assembler Variable. 0 for rounding, 1 for no rounding
+ ;//
+ ;// Outputs:
+ ;// $sum0 (a + b + c + d + 1) / 4 : If no rounding
+ ;// (a + b + c + d + 2) / 4 : If rounding
+
+ MACRO
+ M_AVG4 $sum0, $lsb0, $sum1, $lsb1, $rndVal
+ LCLS OP1
+ LCLS OP2
+ IF $rndVal = 0 ;// rounding case
+OP1 SETS "AND"
+OP2 SETS "ORR"
+ ELSE ;// Not rounding case
+OP1 SETS "ORR"
+OP2 SETS "AND"
+ ENDIF
+
+ LCLS lsb2
+ LCLS sum2
+ LCLS dest
+
+lsb2 SETS "tmp"
+sum2 SETS "$lsb0"
+dest SETS "$sum0"
+
+ $OP1 $lsb0, $lsb0, $lsb1 ;// e0 = e0 & e1
+ EOR $lsb2, $sum0, $sum1 ;// e2 = s0 ^ s1
+ $OP2 $lsb2, $lsb2, $lsb0 ;// e2 = e2 | e0
+ AND $lsb2, $lsb2, yMask, LSR #4 ;// e2 = e2 & mask
+ UHADD8 $sum2, $sum0, $sum1 ;// s2 = (s0 + s1)/2
+ UADD8 $dest, $sum2, $lsb2 ;// dest = s2 + e2
+ MEND
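+
+ ;// Correctness sketch (per byte lane): with s0 = (a+b)>>1, l0 = a^b,
+ ;// s1 = (c+d)>>1, l1 = c^d, we have
+ ;// a+b+c+d == 2*(s0+s1) + (l0 & 1) + (l1 & 1), and one can verify
+ ;// (a+b+c+d+2) >> 2 == ((s0+s1)>>1) + (((s0^s1) | (l0&l1)) & 1)
+ ;// (a+b+c+d+1) >> 2 == ((s0+s1)>>1) + (((s0^s1) & (l0|l1)) & 1)
+ ;// which is exactly the OP1/OP2 selection above, applied on all 4
+ ;// byte lanes with the 0x01010101 mask extracted from yMask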
+;// ***************************************************************************
+;// Motion compensation handler macros
+;// ***************************************************************************
+ ;// Description:
+ ;// Implement the motion compensation routines using the named registers
+ ;// of the calling function. Each of the following 4 macros implements
+ ;// one of the 4 predict types, and each handles 8 cases: all
+ ;// combinations of the 4 source alignment offsets and the 2 values of
+ ;// the rounding flag
+ ;//
+ ;// Syntax:
+ ;// M_MCRECONBLOCK_IntegerPixel $rndVal, $offset
+ ;// M_MCRECONBLOCK_HalfPixelX $rndVal, $offset
+ ;// M_MCRECONBLOCK_HalfPixelY $rndVal, $offset
+ ;// M_MCRECONBLOCK_HalfPixelXY $rndVal, $offset
+ ;//
+ ;// Inputs:
+ ;// $rndVal Assembler Variable. 0 for rounding, 1 for no rounding
+ ;// $offset $pSrc MOD 4 value. Offset from 4 byte aligned location.
+ ;//
+ ;// Outputs:
+ ;// Outputs come in the named registers of the calling function.
+ ;// The macro loads the data from the source pointer, processes it and
+ ;// stores it at the destination pointer, performing the whole
+ ;// prediction cycle of the motion compensation routine for a
+ ;// particular predictType. After this, only residue addition to the
+ ;// predicted values remains
+
+ MACRO
+ M_MCRECONBLOCK_IntegerPixel $rndVal, $offset
+ ;// Algorithmic Description:
+ ;// This handles motion compensation for the IntegerPixel predictType.
+ ;// Both rounding cases are handled by the same code base, as it is
+ ;// just a copy from source to destination. Two lines are done per
+ ;// loop iteration to reduce stalls, and the loop has been software
+ ;// pipelined for the same purpose.
+ ;//
+ ;// M_LOAD_X loads a whole row into two registers, which are then stored
+
+CaseIntegerPixelRnd0Offset$offset
+CaseIntegerPixelRnd1Offset$offset
+ M_LOAD_X pSrc, srcStep, tmp1, tmp2, tmp3, $offset
+ M_LOAD_X pSrc, srcStep, tmp3, tmp4, tmp5, $offset
+YloopIntegerPixelOffset$offset
+ SUBS y, y, #2
+ STRD tmp1, tmp2, [pDst], dstStep
+ STRD tmp3, tmp4, [pDst], dstStep
+ M_LOAD_X pSrc, srcStep, tmp1, tmp2, tmp3, $offset
+ M_LOAD_X pSrc, srcStep, tmp3, tmp4, tmp5, $offset
+ BGT YloopIntegerPixelOffset$offset
+
+ B SwitchPredictTypeEnd
+ MEND
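+
+ ;// Reference behaviour in C (an illustrative sketch; pSrc here is the
+ ;// original, un-truncated source pointer):
+ ;// for (y = 0; y < 8; y++, pSrc += srcStep, pDst += dstStep)
+ ;// for (x = 0; x < 8; x++)
+ ;// pDst[x] = pSrc[x];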
+;// ***************************************************************************
+ MACRO
+ M_MCRECONBLOCK_HalfPixelX $rndVal, $offset
+ ;// Algorithmic Description:
+ ;// This handles motion compensation for the HalfPixelX predictType.
+ ;// The two rounding cases are handled by different code bases,
+ ;// generated by different macro calls. The loop has been software
+ ;// pipelined to reduce stalls.
+ ;//
+ ;// Filtering involves averaging a pixel with the next horizontal
+ ;// pixel. The M_LOAD_XINT and M_EXT_XINT combination generates 4
+ ;// registers: 2 holding all the pixels of a row (4 pixels per
+ ;// register) and another 2 holding the same row shifted right by one
+ ;// pixel. These packed registers are suitable for 4-lane SIMD.
+ ;// After that, the M_UHADD8R macro does the averaging, taking care
+ ;// of the rounding as required
+
+CaseHalfPixelXRnd$rndVal.Offset$offset
+ IF $rndVal = 0
+ LDR mask, =0x80808080
+ ENDIF
+
+ M_LOAD_XINT pSrc, srcStep, $offset, tmp1, tmp2, tmp3, tmp4
+YloopHalfPixelXRnd$rndVal.Offset$offset
+ SUBS y, y, #1
+ M_EXT_XINT $offset, tmp1, tmp2, tmp3, tmp4
+ M_UHADD8R tmp5, tmp1, tmp3, (1-$rndVal), mask
+ M_UHADD8R tmp6, tmp2, tmp4, (1-$rndVal), mask
+ STRD tmp5, tmp6, [pDst], dstStep
+ M_LOAD_XINT pSrc, srcStep, $offset, tmp1, tmp2, tmp3, tmp4
+ BGT YloopHalfPixelXRnd$rndVal.Offset$offset
+
+ B SwitchPredictTypeEnd
+ MEND
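+
+ ;// Reference behaviour in C (an illustrative sketch; pSrc here is the
+ ;// original, un-truncated source pointer, rndVal as passed in):
+ ;// for (y = 0; y < 8; y++, pSrc += srcStep, pDst += dstStep)
+ ;// for (x = 0; x < 8; x++)
+ ;// pDst[x] = (pSrc[x] + pSrc[x+1] + 1 - rndVal) >> 1;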
+;// ***************************************************************************
+ MACRO
+ M_MCRECONBLOCK_HalfPixelY $rndVal, $offset
+ ;// Algorithmic Description:
+ ;// This handles motion compensation for the HalfPixelY predictType.
+ ;// The two rounding cases are handled by different code bases,
+ ;// generated by different macro calls. Preloading is used to avoid
+ ;// reloading the same data.
+ ;//
+ ;// Filtering involves averaging a pixel with the next vertical pixel.
+ ;// M_LOAD_X generates 2 registers holding all the pixels of a row,
+ ;// 4 pixels per register. These packed registers are suitable for
+ ;// 4-lane SIMD. After that, the M_UHADD8R macro does the averaging,
+ ;// taking care of the rounding as required
+
+CaseHalfPixelYRnd$rndVal.Offset$offset
+ IF $rndVal = 0
+ LDR mask, =0x80808080
+ ENDIF
+
+ M_LOAD_X pSrc, srcStep, tmp1, tmp2, tmp5, $offset ;// Pre-load
+YloopHalfPixelYRnd$rndVal.Offset$offset
+ SUBS y, y, #2
+ ;// Processing one line
+ M_LOAD_X pSrc, srcStep, tmp3, tmp4, tmp5, $offset
+ M_UHADD8R tmp1, tmp1, tmp3, (1-$rndVal), mask
+ M_UHADD8R tmp2, tmp2, tmp4, (1-$rndVal), mask
+ STRD tmp1, tmp2, [pDst], dstStep
+ ;// Processing another line
+ M_LOAD_X pSrc, srcStep, tmp1, tmp2, tmp5, $offset
+ M_UHADD8R tmp3, tmp3, tmp1, (1-$rndVal), mask
+ M_UHADD8R tmp4, tmp4, tmp2, (1-$rndVal), mask
+ STRD tmp3, tmp4, [pDst], dstStep
+
+ BGT YloopHalfPixelYRnd$rndVal.Offset$offset
+
+ B SwitchPredictTypeEnd
+ MEND
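+
+ ;// Reference behaviour in C (an illustrative sketch; pSrc here is the
+ ;// original, un-truncated source pointer, rndVal as passed in):
+ ;// for (y = 0; y < 8; y++, pSrc += srcStep, pDst += dstStep)
+ ;// for (x = 0; x < 8; x++)
+ ;// pDst[x] = (pSrc[x] + pSrc[x+srcStep] + 1 - rndVal) >> 1;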
+;// ***************************************************************************
+ MACRO
+ M_MCRECONBLOCK_HalfPixelXY $rndVal, $offset
+ ;// Algorithmic Description:
+ ;// This handles motion compensation for the HalfPixelXY predictType.
+ ;// The two rounding cases are handled by different code bases,
+ ;// generated by different macro calls. Preloading is used to avoid
+ ;// reloading the same data.
+ ;//
+ ;// Filtering involves averaging a pixel with the next vertical,
+ ;// horizontal and right-down diagonal pixels. Just as in the
+ ;// HalfPixelX case, the M_LOAD_XINT and M_EXT_XINT combination
+ ;// generates 4 registers with a row and its 1-pixel-right-shifted
+ ;// version, 4 pixels per register. Another call of that macro
+ ;// combination gets the next row. Then M_HSUM_XOR is called to get
+ ;// the mutual half-sum and xor combinations of a row with its shifted
+ ;// version, as these are the inputs to the M_AVG4 macro, which
+ ;// computes the 4-element average with rounding. Note that it is the
+ ;// half-sum/xor values that are preserved for the next row, as they
+ ;// can be reused in the next call to M_AVG4, saving recomputation.
+ ;// Due to lack of registers, the row counter and the masking value
+ ;// required by M_AVG4 are packed into a single register, yMask, where
+ ;// the last nibble holds the row counter value and the rest holds the
+ ;// masking variable left-shifted by 4
+
+CaseHalfPixelXYRnd$rndVal.Offset$offset
+ LDR yMask, =((0x01010101 << 4) + 8)
+
+ M_LOAD_XINT pSrc, srcStep, $offset, t00, t01, t10, t11 ;// Load a, a', b, b'
+ M_EXT_XINT $offset, t00, t01, t10, t11
+ M_HSUM_XOR t00, t10, tmp ;// s0, l0
+ M_HSUM_XOR t01, t11, tmp ;// s0', l0'
+
+YloopHalfPixelXYRnd$rndVal.Offset$offset
+ ;// Processing one line
+ ;// t00, t01, t10, t11 required from previous loop
+ M_LOAD_XINT pSrc, srcStep, $offset, t20, t21, t30, t31 ;// Load c, c', d, d'
+ SUB yMask, yMask, #2
+ M_EXT_XINT $offset, t20, t21, t30, t31
+ M_HSUM_XOR t20, t30, tmp ;// s1, l1
+ M_HSUM_XOR t21, t31, tmp ;// s1', l1'
+ M_AVG4 t00, t10, t20, t30, $rndVal ;// s0, l0, s1, l1
+ M_AVG4 t01, t11, t21, t31, $rndVal ;// s0', l0', s1', l1'
+ STRD t00, t01, [pDst], dstStep ;// store the average
+
+ ;// Processing another line
+ ;// t20, t21, t30, t31 required from above
+ M_LOAD_XINT pSrc, srcStep, $offset, t00, t01, t10, t11 ;// Load a, a', b, b'
+ TST yMask, #7
+ M_EXT_XINT $offset, t00, t01, t10, t11
+ M_HSUM_XOR t00, t10, tmp
+ M_HSUM_XOR t01, t11, tmp
+ M_AVG4 t20, t30, t00, t10, $rndVal
+ M_AVG4 t21, t31, t01, t11, $rndVal
+ STRD t20, t21, [pDst], dstStep
+
+ BGT YloopHalfPixelXYRnd$rndVal.Offset$offset
+
+ IF $offset/=3 :LOR: $rndVal/=1
+ B SwitchPredictTypeEnd
+ ENDIF
+ MEND
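+
+ ;// Reference behaviour in C (an illustrative sketch; pSrc here is the
+ ;// original, un-truncated source pointer, rndVal as passed in):
+ ;// for (y = 0; y < 8; y++, pSrc += srcStep, pDst += dstStep)
+ ;// for (x = 0; x < 8; x++)
+ ;// pDst[x] = (pSrc[x] + pSrc[x+1] + pSrc[x+srcStep]
+ ;// + pSrc[x+srcStep+1] + 2 - rndVal) >> 2;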
+;// ***************************************************************************
+;// Motion compensation handler macros end here
+;// ***************************************************************************
+ ;// Description:
+ ;// Populates the 4 offset "cases" for a given predictType and rndVal
+ ;// combination in the "switch" to the prediction processing code
+ ;// segment
+ ;//
+ ;// Syntax:
+ ;// M_CASE_OFFSET $rnd, $predictType
+ ;//
+ ;// Inputs:
+ ;// $rnd 0 for rounding, 1 for no rounding
+ ;// $predictType The prediction mode
+ ;//
+ ;// Outputs:
+ ;// Populated list of "M_CASE"s for the "M_SWITCH" macro
+
+ MACRO
+ M_CASE_OFFSET $rnd, $predictType
+ M_CASE Case$predictType.Rnd$rnd.Offset0
+ M_CASE Case$predictType.Rnd$rnd.Offset1
+ M_CASE Case$predictType.Rnd$rnd.Offset2
+ M_CASE Case$predictType.Rnd$rnd.Offset3
+ MEND
+;// ***************************************************************************
+ ;// Description:
+ ;// Populates both rounding "cases" for a given predictType in the
+ ;// "switch" to the prediction processing code segment
+ ;//
+ ;// Syntax:
+ ;// M_CASE_MCRECONBLOCK $predictType
+ ;//
+ ;// Inputs:
+ ;// $predictType The prediction mode
+ ;//
+ ;// Outputs:
+ ;// Populated list of "M_CASE_OFFSET" macros
+
+ MACRO
+ M_CASE_MCRECONBLOCK $predictType
+ M_CASE_OFFSET 0, $predictType ;// 0 for rounding
+ M_CASE_OFFSET 1, $predictType ;// 1 for no rounding
+ MEND
+;// ***************************************************************************
+ ;// Description:
+ ;// Populates the handling macros for all 8 rounding and offset
+ ;// combinations of the specified predictType. In the case of the
+ ;// "IntegerPixel" predictType, rounding is not required, so the same
+ ;// code segment handles both cases
+ ;//
+ ;// Syntax:
+ ;// M_MCRECONBLOCK $predictType
+ ;//
+ ;// Inputs:
+ ;// $predictType The prediction mode
+ ;//
+ ;// Outputs:
+ ;// Populated list of "M_MCRECONBLOCK_<predictType>" macros for the
+ ;// specified predictType. Each
+ ;// M_MCRECONBLOCK_<predictType> $rnd, $offset
+ ;// is a code segment (starting with a label indicating the predictType,
+ ;// rounding and offset combination).
+ ;// Four calls of this macro with the 4 prediction modes populate all
+ ;// 32 handlers
+
+ MACRO
+ M_MCRECONBLOCK $predictType
+ M_MCRECONBLOCK_$predictType 0, 0
+ M_MCRECONBLOCK_$predictType 0, 1
+ M_MCRECONBLOCK_$predictType 0, 2
+ M_MCRECONBLOCK_$predictType 0, 3
+ IF "$predictType" /= "IntegerPixel" ;// If not IntegerPixel then rounding makes a difference
+ M_MCRECONBLOCK_$predictType 1, 0
+ M_MCRECONBLOCK_$predictType 1, 1
+ M_MCRECONBLOCK_$predictType 1, 2
+ M_MCRECONBLOCK_$predictType 1, 3
+ ENDIF
+ MEND
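+
+ ;// As an illustration, "M_MCRECONBLOCK HalfPixelX" generates the eight
+ ;// handlers labelled
+ ;// CaseHalfPixelXRnd0Offset0 .. CaseHalfPixelXRnd0Offset3
+ ;// CaseHalfPixelXRnd1Offset0 .. CaseHalfPixelXRnd1Offset3
+ ;// matching the M_CASE labels emitted by "M_CASE_MCRECONBLOCK HalfPixelX"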
+;// ***************************************************************************
+;// Input/Output Registers
+pSrc RN 0
+srcStep RN 1
+arg_pSrcResidue RN 2
+pSrcResidue RN 12
+pDst RN 3
+dstStep RN 2
+predictType RN 10
+rndVal RN 11
+mask RN 11
+
+;// Local Scratch Registers
+zero RN 12
+y RN 14
+
+tmp1 RN 4
+tmp2 RN 5
+tmp3 RN 6
+tmp4 RN 7
+tmp5 RN 8
+tmp6 RN 9
+tmp7 RN 10
+tmp8 RN 11
+tmp9 RN 12
+
+t00 RN 4
+t01 RN 5
+t10 RN 6
+t11 RN 7
+t20 RN 8
+t21 RN 9
+t30 RN 10
+t31 RN 11
+tmp RN 12
+
+yMask RN 14
+
+dst RN 1
+return RN 0
+
+ ;// Allocate memory on stack
+ M_ALLOC4 Stk_pDst, 4
+ M_ALLOC4 Stk_pSrcResidue, 4
+ ;// Function header
+ M_START omxVCM4P2_MCReconBlock, r11
+ ;// Define stack arguments
+ M_ARG Arg_dstStep, 4
+ M_ARG Arg_predictType, 4
+ M_ARG Arg_rndVal, 4
+ ;// Save on stack
+ M_STR pDst, Stk_pDst
+ M_STR arg_pSrcResidue, Stk_pSrcResidue
+ ;// Load argument from the stack
+ M_LDR dstStep, Arg_dstStep
+ M_LDR predictType, Arg_predictType
+ M_LDR rndVal, Arg_rndVal
+
+ MOV y, #8
+
+ AND tmp1, pSrc, #3
+ ORR predictType, tmp1, predictType, LSL #3
+ ORR predictType, predictType, rndVal, LSL #2
+ ;// Truncating source pointer to align to 4 byte location
+ BIC pSrc, pSrc, #3
+
+ ;// The implementation handles all combinations of the different
+ ;// predictTypes, rounding cases and source pointer offsets from
+ ;// 4-byte alignment in separate code bases, except where one of these
+ ;// parameters makes no difference to the implementation. The
+ ;// M_CASE_MCRECONBLOCK macros below branch into 8 M_CASE macros each,
+ ;// for all combinations of the 2 rounding cases and the 4 offsets of
+ ;// the pSrc pointer from 4-byte alignment.
+ M_SWITCH predictType
+ M_CASE_MCRECONBLOCK IntegerPixel
+ M_CASE_MCRECONBLOCK HalfPixelX
+ M_CASE_MCRECONBLOCK HalfPixelY
+ M_CASE_MCRECONBLOCK HalfPixelXY
+ M_ENDSWITCH
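+
+ ;// The switch index computed above packs all three selectors into one
+ ;// value; in C terms (illustrative sketch):
+ ;// index = (predictType << 3) | (rndVal << 2) | (pSrc & 3);
+ ;// so each predictType owns 8 consecutive cases: 2 rounding values
+ ;// times 4 alignment offsets, in the M_CASE order listed above.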
+
+ ;// The M_MCRECONBLOCK macros populate the code bases by calling all 8
+ ;// case-specific macros (4 in the case of IntegerPixel, as rounding
+ ;// makes no difference there) to generate the code for all cases of
+ ;// rounding and offset. LTORG is used to split the code section, as
+ ;// its size grows beyond 4KB.
+ M_MCRECONBLOCK IntegerPixel
+ M_MCRECONBLOCK HalfPixelX
+ LTORG
+ M_MCRECONBLOCK HalfPixelY
+ M_MCRECONBLOCK HalfPixelXY
+SwitchPredictTypeEnd
+
+ ;// Residue Addition
+ ;// This is done with 2-lane SIMD, though the loads are further
+ ;// optimized and 4 bytes are loaded at a time from the destination
+ ;// buffer. Algorithmic details are in the inlined comments
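+ ;// Reference behaviour in C (an illustrative sketch; the residue is
+ ;// 16-bit and armClip saturates to [0, 255]):
+ ;// for (y = 0; y < 8; y++, pDst += dstStep)
+ ;// for (x = 0; x < 8; x++)
+ ;// pDst[x] = armClip(0, 255, pDst[x] + *pSrcResidue++);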
+ M_LDR pSrcResidue, Stk_pSrcResidue
+ CMP pSrcResidue, #0
+ BEQ pSrcResidueConditionEnd
+pSrcResidueNotNull
+ M_LDR pDst, Stk_pDst
+ MOV y, #8
+ SUB dstStep, dstStep, #4
+Yloop_pSrcResidueNotNull
+ SUBS y, y, #1
+ LDR dst, [pDst] ;// dst = [dcba]
+ LDMIA pSrcResidue!, {tmp1, tmp2} ;// tmp1=[DC] tmp2=[BA]
+ PKHBT tmp3, tmp1, tmp2, LSL #16 ;// DeltaVal1 = [C A]
+ PKHTB tmp4, tmp2, tmp1, ASR #16 ;// DeltaVal2 = [D B]
+ UXTB16 tmp1, dst ;// tmp1 = [0c0a]
+ UXTB16 tmp2, dst, ROR #8 ;// tmp2 = [0d0b]
+ QADD16 tmp1, tmp1, tmp3 ;// Add and saturate to 16 bits
+ QADD16 tmp2, tmp2, tmp4
+ USAT16 tmp1, #8, tmp1
+ USAT16 tmp2, #8, tmp2 ;// armClip(0, 255, tmp2)
+ ORR tmp1, tmp1, tmp2, LSL #8 ;// tmp1 = [dcba]
+ STR tmp1, [pDst], #4
+
+ LDR dst, [pDst]
+ LDMIA pSrcResidue!, {tmp1, tmp2}
+ PKHBT tmp3, tmp1, tmp2, LSL #16
+ PKHTB tmp4, tmp2, tmp1, ASR #16
+ UXTB16 tmp1, dst
+ UXTB16 tmp2, dst, ROR #8
+ QADD16 tmp1, tmp1, tmp3
+ QADD16 tmp2, tmp2, tmp4
+ USAT16 tmp1, #8, tmp1
+ USAT16 tmp2, #8, tmp2
+ ORR tmp1, tmp1, tmp2, LSL #8
+ STR tmp1, [pDst], dstStep
+
+ BGT Yloop_pSrcResidueNotNull
+pSrcResidueConditionEnd
+
+ MOV return, #OMX_Sts_NoErr
+
+ M_END
+ ENDIF ;// ARM1136JS
+
+;// ***************************************************************************
+;// CortexA8 implementation
+;// ***************************************************************************
+ END
+;// ***************************************************************************
+;// omxVCM4P2_MCReconBlock ends
+;// ***************************************************************************