1 files changed, 480 insertions, 0 deletions
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p10/src/omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p10/src/omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.s
new file mode 100644
index 0000000..2b71486
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p10/src/omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.s
@@ -0,0 +1,480 @@
+;//
+;// (c) Copyright 2007 ARM Limited. All Rights Reserved.
+;//
+;// Description:
+;// H.264 inverse quantize and transform module
+;// 
+;// 
+
+        
+
+;// Include standard headers
+
+        INCLUDE omxtypes_s.h
+        INCLUDE armCOMM_s.h
+        
+;// Import symbols required from other files
+;// (For example tables)
+    
+        IMPORT armVCM4P10_UnpackBlock4x4
+        IMPORT armVCM4P10_TransformResidual4x4
+        IMPORT armVCM4P10_QPDivTable
+        IMPORT armVCM4P10_VMatrixU16
+        IMPORT armVCM4P10_QPModuloTable 
+        
+    M_VARIANTS ARM1136JS, ARM1136JS_U
+        
+;// Set debugging level        
+;//DEBUG_ON    SETL {TRUE}
+
+
+;// Static Function: armVCM4P10_DequantLumaAC4x4
+
+;// Guarding implementation by the processor name
+    
+    IF  ARM1136JS
+;// armVCM4P10_DequantLumaAC4x4: scales the 16 halfword coefficients at pSrcDst in place.
+;//Input Registers
+pSrcDst       RN  0                 ;// 16 halfword coefficients (8 words), read and rewritten in place
+QP            RN  1                 ;// Byte index into the QP modulo and QP div tables
+;// Coefficient i becomes the low 16 bits of pSrcDst[i] * (V[k] << Shift), where V is the
+;// armVCM4P10_VMatrixU16 row selected through QP and k (0, 1 or 2) depends on i.
+;//Output Registers
+;// None: results are stored back through pSrcDst and no return value is written.
+;// Note: r1-r6 each carry two names below; the second role starts after the first has ended.
+;//Local Scratch Registers
+pQPdiv          RN  4               ;// Address of armVCM4P10_QPDivTable
+pQPmod          RN  5               ;// Address of armVCM4P10_QPModuloTable
+pVRow           RN  2               ;// VMatrixU16 base, then address of the selected row
+QPmod           RN  6               ;// Byte offset of the selected row inside VMatrixU16
+shift           RN  3               ;// Left-shift count read from the QP div table
+rowLuma01       RN  1               ;// [V1|V0] << Shift (takes over r1 from QP)
+rowLuma23       RN  4               ;// V2 << Shift in the bottom halfword (takes over r4 from pQPdiv)
+;// SrcDst<row><col>: one word holding the coefficients at columns col and col+1 of that row
+SrcDst00        RN  5
+SrcDst02        RN  6
+SrcDst10        RN  7
+SrcDst12        RN  8
+SrcDst20        RN  9
+SrcDst22        RN  10
+SrcDst30        RN  11
+SrcDst32        RN  12
+;// Temporaries for the odd-column products (temp3 also carries V1 while rowLuma01 is built)
+temp1           RN  2
+temp2           RN  3
+temp3           RN  14
+
+
+        ;// Allocate stack memory required by the function (nothing is allocated here)
+
+        ;// Write function header
+        M_START armVCM4P10_DequantLumaAC4x4,r11
+        ;// Addresses of the three lookup tables
+        LDR    pQPmod,=armVCM4P10_QPModuloTable
+        LDR    pQPdiv,=armVCM4P10_QPDivTable
+        LDR    pVRow,=armVCM4P10_VMatrixU16
+
+        LDRSB  QPmod,[pQPmod,QP]                    ;// (QP%6) * 6, used below as the byte offset of the row
+        LDRSB  shift,[pQPdiv,QP]                    ;// Shift = QP / 6
+        ;// The writeback (!) leaves pVRow pointing at the selected row for the next two loads
+        LDRH    rowLuma01,[pVRow,QPmod]!             ;// rowLuma01 = [00|0a]
+        LDRH    temp3,[pVRow,#2]                     ;// temp3     = [00|0b]
+        LDRH    rowLuma23,[pVRow,#4]                 ;// rowLuma23 = [00|0c]
+        ORR     rowLuma01,rowLuma01,temp3,LSL #16    ;// rowLuma01 = [0b|0a]
+
+        ;// Load all the 16 'src' values
+        LDMIA   pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}
+
+
+        ;//*********************************************************************************************
+        ;//
+        ;// 'Shift' ranges between [0,8]
+        ;// So we can shift the packed rowLuma values [0b|0a] with a single LSL operation
+        ;//
+        ;//*********************************************************************************************
+
+        LSL    rowLuma01,rowLuma01,shift
+        LSL    rowLuma23,rowLuma23,shift
+
+
+        ;//**********************************************************************************************
+        ;//
+        ;// The idea is to unroll the loop completely
+        ;// All the 16 src values are loaded at once into 8 registers : SrcDst<y><x> (above)
+        ;// 0<= armVCM4P10_PosToVCol4x4[i] <=2 for any 'i<16'
+        ;// So the only values of pVRow[i] that need to be loaded are for i=0,1,2
+        ;// These 3 values are loaded into rowLuma01 and rowLuma23 (above)
+        ;// We first calculate pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift, which fits into 16 bits (above)
+        ;// Then the product pSrcDst[i] * (pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift) is calculated
+        ;// Here we interleave the PKHBT operations for various rows to avoid pipeline stalls
+        ;//
+        ;// We then pack the two 16 bit multiplication results into a word and store them in one go
+        ;//
+        ;//**********************************************************************************************
+
+
+        ;// Row 1
+        ;// (tempN receives the odd-column product; PKHBT later moves its low half to the top halfword)
+
+        SMULTB  temp1,SrcDst00,rowLuma23                    ;// pSrcDst[1] * (pVRow[2]<<Shift)
+        SMULBB  SrcDst00,SrcDst00,rowLuma01                 ;// pSrcDst[0] * (pVRow[0]<<Shift)
+
+        SMULTB  temp2,SrcDst02,rowLuma23                    ;// pSrcDst[3] * (pVRow[2]<<Shift)
+        SMULBB  SrcDst02,SrcDst02,rowLuma01                 ;// pSrcDst[2] * (pVRow[0]<<Shift)
+
+        PKHBT   SrcDst00,SrcDst00,temp1,LSL #16             ;// Pack the first two product values
+
+
+        ;// Row 2
+        SMULTT  temp1,SrcDst10,rowLuma01                    ;// pSrcDst[5] * (pVRow[1]<<Shift)
+        SMULBB  SrcDst10,SrcDst10,rowLuma23                 ;// pSrcDst[4] * (pVRow[2]<<Shift)
+
+        PKHBT   SrcDst02,SrcDst02,temp2,LSL #16             ;// Pack the next two product values
+        SMULTT  temp2,SrcDst12,rowLuma01                    ;// pSrcDst[7] * (pVRow[1]<<Shift)
+        SMULBB  SrcDst12,SrcDst12,rowLuma23                 ;// pSrcDst[6] * (pVRow[2]<<Shift)
+
+        PKHBT   SrcDst10,SrcDst10,temp1,LSL #16             ;// Pack the next two product values
+
+
+        ;// Row 3
+
+        SMULTB  temp1,SrcDst20,rowLuma23                    ;// pSrcDst[9] * (pVRow[2]<<Shift)
+        SMULBB  SrcDst20,SrcDst20,rowLuma01                 ;// pSrcDst[8] * (pVRow[0]<<Shift)
+
+        PKHBT   SrcDst12,SrcDst12,temp2,LSL #16             ;// Pack the next two product values
+        SMULTB  temp2,SrcDst22,rowLuma23                    ;// pSrcDst[11] * (pVRow[2]<<Shift)
+        SMULBB  SrcDst22,SrcDst22,rowLuma01                 ;// pSrcDst[10] * (pVRow[0]<<Shift)
+
+        PKHBT   SrcDst20,SrcDst20,temp1,LSL #16             ;// Pack the next two product values
+
+
+
+        ;// Row 4
+
+        SMULTT  temp1,SrcDst30,rowLuma01                    ;// pSrcDst[13] * (pVRow[1]<<Shift)
+        SMULBB  SrcDst30,SrcDst30,rowLuma23                 ;// pSrcDst[12] * (pVRow[2]<<Shift)
+
+        SMULTT  temp3,SrcDst32,rowLuma01                    ;// pSrcDst[15] * (pVRow[1]<<Shift)
+        SMULBB  SrcDst32,SrcDst32,rowLuma23                 ;// pSrcDst[14] * (pVRow[2]<<Shift)
+
+        PKHBT   SrcDst22,SrcDst22,temp2,LSL #16             ;// Pack the remaining product values
+        PKHBT   SrcDst30,SrcDst30,temp1,LSL #16
+        PKHBT   SrcDst32,SrcDst32,temp3,LSL #16
+
+        ;// Write the 16 scaled coefficients back over the input block
+        STMIA   pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}
+
+
+        ;// Set return value (none: no instruction in this function writes r0)
+
+
+
+        ;// Write function tail
+        M_END
+
+    ENDIF                                                    ;//ARM1136JS
+ 
+
+;// Guarding implementation by the processor name
+    
+    IF  ARM1136JS_U
+;// armVCM4P10_DequantLumaAC4x4: scales the 16 halfword coefficients at pSrcDst in place.
+;//Input Registers
+pSrcDst       RN  0                 ;// 16 halfword coefficients (8 words), read and rewritten in place
+QP            RN  1                 ;// Byte index into the QP modulo and QP div tables
+;// Coefficient i becomes the low 16 bits of pSrcDst[i] * (V[k] << Shift), where V is the
+;// armVCM4P10_VMatrixU16 row selected through QP and k (0, 1 or 2) depends on i.
+;//Output Registers
+;// None: results are stored back through pSrcDst and no return value is written.
+;// Note: r1-r6 each carry two names below; the second role starts after the first has ended.
+;//Local Scratch Registers
+pQPdiv          RN  4               ;// Address of armVCM4P10_QPDivTable
+pQPmod          RN  5               ;// Address of armVCM4P10_QPModuloTable
+pVRow           RN  2               ;// VMatrixU16 base, then address of the selected row
+QPmod           RN  6               ;// Byte offset of the selected row inside VMatrixU16
+shift           RN  3               ;// Left-shift count read from the QP div table
+rowLuma01       RN  1               ;// [V1|V0] << Shift (takes over r1 from QP)
+rowLuma23       RN  4               ;// V2 << Shift in the bottom halfword (takes over r4 from pQPdiv)
+;// SrcDst<row><col>: one word holding the coefficients at columns col and col+1 of that row
+SrcDst00        RN  5
+SrcDst02        RN  6
+SrcDst10        RN  7
+SrcDst12        RN  8
+SrcDst20        RN  9
+SrcDst22        RN  10
+SrcDst30        RN  11
+SrcDst32        RN  12
+;// Temporaries for the odd-column products
+temp1           RN  2
+temp2           RN  3
+temp3           RN  14
+
+
+        ;// Allocate stack memory required by the function (nothing is allocated here)
+
+        ;// Write function header
+        M_START armVCM4P10_DequantLumaAC4x4,r11
+        ;// Addresses of the three lookup tables
+        LDR    pQPmod,=armVCM4P10_QPModuloTable
+        LDR    pQPdiv,=armVCM4P10_QPDivTable
+        LDR    pVRow,=armVCM4P10_VMatrixU16
+
+        LDRSB  QPmod,[pQPmod,QP]                    ;// (QP%6) * 6, used below as the byte offset of the row
+        LDRSB  shift,[pQPdiv,QP]                    ;// Shift = QP / 6
+        ;// One word load fetches V0 and V1 (address may be unaligned); writeback moves pVRow to the row
+        LDR    rowLuma01,[pVRow,QPmod]!             ;// rowLuma01 = [0b|0a]
+        LDRH   rowLuma23,[pVRow,#4]                 ;// rowLuma23 = [00|0c]
+        ;// Halfword load: only V2 is consumed, and a word load would also read the halfword after the row
+        ;// Load all the 16 'src' values
+        LDMIA   pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}
+
+
+        ;//*********************************************************************************************
+        ;//
+        ;// 'Shift' ranges between [0,8]
+        ;// So we can shift the packed rowLuma values [0b|0a] with a single LSL operation
+        ;//
+        ;//*********************************************************************************************
+
+        LSL    rowLuma01,rowLuma01,shift
+        LSL    rowLuma23,rowLuma23,shift
+
+
+        ;//**********************************************************************************************
+        ;//
+        ;// The idea is to unroll the loop completely
+        ;// All the 16 src values are loaded at once into 8 registers : SrcDst<y><x> (above)
+        ;// 0<= armVCM4P10_PosToVCol4x4[i] <=2 for any 'i<16'
+        ;// So the only values of pVRow[i] that need to be loaded are for i=0,1,2
+        ;// These 3 values are loaded into rowLuma01 and rowLuma23 (above)
+        ;// We first calculate pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift, which fits into 16 bits (above)
+        ;// Then the product pSrcDst[i] * (pVRow[armVCM4P10_PosToVCol4x4[i]] << Shift) is calculated
+        ;// Here we interleave the PKHBT operations for various rows to avoid pipeline stalls
+        ;//
+        ;// We then pack the two 16 bit multiplication results into a word and store them in one go
+        ;//
+        ;//**********************************************************************************************
+
+
+        ;// Row 1
+        ;// (tempN receives the odd-column product; PKHBT later moves its low half to the top halfword)
+
+        SMULTB  temp1,SrcDst00,rowLuma23                    ;// pSrcDst[1] * (pVRow[2]<<Shift)
+        SMULBB  SrcDst00,SrcDst00,rowLuma01                 ;// pSrcDst[0] * (pVRow[0]<<Shift)
+
+        SMULTB  temp2,SrcDst02,rowLuma23                    ;// pSrcDst[3] * (pVRow[2]<<Shift)
+        SMULBB  SrcDst02,SrcDst02,rowLuma01                 ;// pSrcDst[2] * (pVRow[0]<<Shift)
+
+        PKHBT   SrcDst00,SrcDst00,temp1,LSL #16             ;// Pack the first two product values
+
+
+        ;// Row 2
+        SMULTT  temp1,SrcDst10,rowLuma01                    ;// pSrcDst[5] * (pVRow[1]<<Shift)
+        SMULBB  SrcDst10,SrcDst10,rowLuma23                 ;// pSrcDst[4] * (pVRow[2]<<Shift)
+
+        PKHBT   SrcDst02,SrcDst02,temp2,LSL #16             ;// Pack the next two product values
+        SMULTT  temp2,SrcDst12,rowLuma01                    ;// pSrcDst[7] * (pVRow[1]<<Shift)
+        SMULBB  SrcDst12,SrcDst12,rowLuma23                 ;// pSrcDst[6] * (pVRow[2]<<Shift)
+
+        PKHBT   SrcDst10,SrcDst10,temp1,LSL #16             ;// Pack the next two product values
+
+
+        ;// Row 3
+
+        SMULTB  temp1,SrcDst20,rowLuma23                    ;// pSrcDst[9] * (pVRow[2]<<Shift)
+        SMULBB  SrcDst20,SrcDst20,rowLuma01                 ;// pSrcDst[8] * (pVRow[0]<<Shift)
+
+        PKHBT   SrcDst12,SrcDst12,temp2,LSL #16             ;// Pack the next two product values
+        SMULTB  temp2,SrcDst22,rowLuma23                    ;// pSrcDst[11] * (pVRow[2]<<Shift)
+        SMULBB  SrcDst22,SrcDst22,rowLuma01                 ;// pSrcDst[10] * (pVRow[0]<<Shift)
+
+        PKHBT   SrcDst20,SrcDst20,temp1,LSL #16             ;// Pack the next two product values
+
+
+
+        ;// Row 4
+
+        SMULTT  temp1,SrcDst30,rowLuma01                    ;// pSrcDst[13] * (pVRow[1]<<Shift)
+        SMULBB  SrcDst30,SrcDst30,rowLuma23                 ;// pSrcDst[12] * (pVRow[2]<<Shift)
+
+        SMULTT  temp3,SrcDst32,rowLuma01                    ;// pSrcDst[15] * (pVRow[1]<<Shift)
+        SMULBB  SrcDst32,SrcDst32,rowLuma23                 ;// pSrcDst[14] * (pVRow[2]<<Shift)
+
+        PKHBT   SrcDst22,SrcDst22,temp2,LSL #16             ;// Pack the remaining product values
+        PKHBT   SrcDst30,SrcDst30,temp1,LSL #16
+        PKHBT   SrcDst32,SrcDst32,temp3,LSL #16
+
+        ;// Write the 16 scaled coefficients back over the input block
+        STMIA   pSrcDst,{SrcDst00,SrcDst02,SrcDst10,SrcDst12,SrcDst20,SrcDst22,SrcDst30,SrcDst32}
+
+
+        ;// Set return value (none: no instruction in this function writes r0)
+
+
+
+        ;// Write function tail
+        M_END
+
+    ENDIF                                                    ;//ARM1136JS_U
+
+
+
+
+
+;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd            
+    
+;// Guarding implementation by the processor name
+    
+    IF  ARM1136JS
+;// Builds a 4x4 residual block in a stack buffer, adds it to the prediction and stores clipped bytes.
+;//Input Registers
+ppSrc       RN  0                   ;// Passed unchanged in r0 to armVCM4P10_UnpackBlock4x4
+pPred       RN  1                   ;// Prediction: 4 rows of 4 bytes, predStep bytes apart
+pDC         RN  2                   ;// Optional DC halfword; tested against 0 only on the AC path
+pDst        RN  3                   ;// Output: 4 rows of 4 bytes, dstStep bytes apart
+;// Stack arguments (declared with M_ARG below): predStep, dstStep, QP, AC
+
+;//Output Registers
+result      RN  0                   ;// Always set to OMX_Sts_NoErr
+
+;//Local Scratch Registers
+pDelta      RN  4                   ;// Address of the residual scratch block (pBuffer)
+pDeltaTmp   RN  6                   ;// Not referenced in this function
+AC          RN  5                   ;//Load from stack
+pPredTemp   RN  7                   ;// Copy of pPred made before the calls; walks the prediction rows
+pDCTemp     RN  8                   ;// Copy of pDC made before the calls
+pDstTemp    RN  9                   ;// Copy of pDst made before the calls; walks the output rows
+pDeltaArg1  RN  1                   ;// r1 argument of the helper calls
+pDeltaArg0  RN  0                   ;// r0 argument of the helper calls
+QP          RN  1                   ;//Load from stack
+DCval       RN  10                  ;// DC value; first register of the STRD pair
+DCvalCopy   RN  11                  ;// Same value; second register of the STRD pair
+predstep    RN  1                   ;// Prediction row stride, loaded from the stack
+dstStep     RN  10                  ;// Output row stride, loaded from the stack
+ycounter    RN  0                   ;// Rows left in PredPlusDeltaLoop
+PredVal1    RN  3                   ;// Prediction bytes 0 and 2, one per halfword
+PredVal2    RN  5                   ;// Prediction bytes 1 and 3, one per halfword
+DeltaVal1   RN  2                   ;// Residuals of columns 0 and 2
+DeltaVal2   RN  11                  ;// Residuals of columns 2,3 as loaded, then of columns 1 and 3
+PredVal     RN  8                   ;// The four prediction bytes of the current row
+tmpDeltaVal RN  6                   ;// Residuals of columns 0,1 as loaded
+sum1        RN  12                  ;// Clipped sums for columns 0 and 2, then the packed output word
+sum2        RN  14                  ;// Clipped sums for columns 1 and 3
+
+
+
+    ;// Allocate stack memory required by the function
+        M_ALLOC8 pBuffer, 32                                ;// 16 halfwords of residual; presumably 8-byte aligned (STRD below) - confirm in armCOMM_s.h
+
+
+    ;// Write function header
+        M_START omxVCM4P10_DequantTransformResidualFromPairAndAdd,r11
+
+        ;// Define stack arguments
+        M_ARG   predStepOnStack, 4
+        M_ARG   dstStepOnStack,4
+        M_ARG   QPOnStack, 4
+        M_ARG   ACOnStack,4
+
+
+        M_ADR   pDelta,pBuffer
+        M_LDR   AC,ACOnStack
+
+
+        ;// Save registers r1,r2,r3 before function call
+        MOV     pPredTemp,pPred
+        MOV     pDCTemp,pDC
+        MOV     pDstTemp,pDst
+
+        CMP     AC,#0
+        BEQ     DCcase                                      ;// AC == 0: only the DC term contributes
+        MOV     pDeltaArg1,pDelta                           ;// Set up r1 for armVCM4P10_UnpackBlock4x4
+
+        BL      armVCM4P10_UnpackBlock4x4
+
+        M_LDR   QP,QPOnStack                                ;// Set up r1 for DequantLumaAC4x4
+        MOV     pDeltaArg0,pDelta                           ;// Set up r0 for DequantLumaAC4x4
+
+        BL      armVCM4P10_DequantLumaAC4x4
+
+        ;// If pDC is non-zero, *pDC overwrites pDelta[0]; the MOVs in between leave the CMP flags intact
+        CMP     pDCTemp,#0
+        LDRSHNE DCval,[pDCTemp]
+        MOV     pDeltaArg0,pDelta                           ;// Set up r0 for armVCM4P10_TransformResidual4x4
+        MOV     pDeltaArg1,pDelta                           ;// Set up r1 for armVCM4P10_TransformResidual4x4
+        STRHNE  DCval,[pDelta]
+
+        BL      armVCM4P10_TransformResidual4x4
+        B       OutDCcase
+
+        ;// DC-only path. NOTE(review): pDC is dereferenced without a zero check here - confirm callers
+DCcase
+        LDRSH   DCval,[pDCTemp]
+        ADD     DCval,DCval,#32                            ;// DCval = (DC + 32) >> 6, i.e. rounded
+        ASR     DCval,DCval,#6
+        PKHBT   DCval,DCval,DCval,LSL #16                  ;// Duplicating the Lower halfword
+        MOV     DCvalCopy, DCval                           ;// Needed for STRD
+        STRD    DCval, [pDelta, #0]                        ;// pDelta[0]  = pDelta[1]  = pDelta[2]  = pDelta[3] = DCval
+        STRD    DCval, [pDelta, #8]                        ;// pDelta[4]  = pDelta[5]  = pDelta[6]  = pDelta[7] = DCval
+        STRD    DCval, [pDelta, #16]                       ;// pDelta[8]  = pDelta[9]  = pDelta[10] = pDelta[11] = DCval
+        STRD    DCval, [pDelta, #24]                       ;// pDelta[12] = pDelta[13] = pDelta[14] = pDelta[15] = DCval
+
+        ;// Both paths join here with the 16 residuals in the scratch block at pDelta
+OutDCcase
+        M_LDR   predstep,predStepOnStack
+        M_LDR   dstStep,dstStepOnStack
+
+        LDMIA   pDelta!,{tmpDeltaVal,DeltaVal2}             ;// Pre load
+        MOV     ycounter,#4                                 ;// Counter for the PredPlusDeltaLoop
+        LDR     PredVal,[pPredTemp]                         ;// Pre load
+        ;// NOTE(review): word LDR/STR on pPred/pDst - presumably both are 4-byte aligned; confirm
+PredPlusDeltaLoop
+        ;// One row per pass; the SUBS flags drive LDRGT, LDMGTIA and BGT (nothing in between sets NZCV)
+
+        SUBS    ycounter,ycounter,#1
+        ADD     pPredTemp,pPredTemp,predstep                ;// Increment pPred ptr
+
+        PKHBT   DeltaVal1,tmpDeltaVal,DeltaVal2,LSL #16     ;// Deltaval1 = [C A]
+        PKHTB   DeltaVal2,DeltaVal2,tmpDeltaVal,ASR #16     ;// DeltaVal2 = [D B]
+
+        UXTB16  PredVal1,PredVal                            ;// PredVal1 = [0c0a]
+        UXTB16  PredVal2,PredVal,ROR #8                     ;// PredVal2 = [0d0b]
+
+        LDRGT   PredVal,[pPredTemp]                         ;// Pre load
+
+        QADD16  sum2,DeltaVal2,PredVal2                     ;// Add and saturate to 16 bits
+        QADD16  sum1,DeltaVal1,PredVal1
+
+        USAT16  sum2,#8,sum2                                ;// armClip(0,255,sum2)
+        USAT16  sum1,#8,sum1
+
+        LDMGTIA   pDelta!,{tmpDeltaVal,DeltaVal2}           ;// Pre load
+
+        ORR     sum1,sum1,sum2,LSL #8                       ;// sum1 = [dcba]
+        STR     sum1,[pDstTemp]
+
+        ADD     pDstTemp,pDstTemp,dstStep                   ;// Increment pDst ptr
+        BGT     PredPlusDeltaLoop
+
+
+        ;// Set return value
+        MOV     result,#OMX_Sts_NoErr
+
+End
+        ;// (the End label is not branched to anywhere in this function)
+
+        ;// Write function tail
+
+        M_END
+
+    ENDIF                                                    ;//ARM1136JS
+    
+    
+;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd            
+    
+;// Guarding implementation by the processor name
+    
+    
+         
+            
+    END