1 files changed, 396 insertions, 0 deletions
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.s
new file mode 100755
index 0000000..485a488
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.s
@@ -0,0 +1,396 @@
+;//
+;// 
+;// File Name:  omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision:   12290
+;// Date:       Wednesday, April 9, 2008
+;// 
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;// 
+;// 
+;//
+;// Description:
+;// H.264 inverse quantize and transform module
+;// 
+;// 
+
+        
+
+;// Include standard headers
+
+        INCLUDE omxtypes_s.h
+        INCLUDE armCOMM_s.h
+        
+;// Import symbols required from other files
+;// (For example tables)
+    
+        IMPORT armVCM4P10_UnpackBlock4x4
+        IMPORT armVCM4P10_TransformResidual4x4
+        IMPORT armVCM4P10_QPDivTable
+        IMPORT armVCM4P10_VMatrixU16
+        IMPORT armVCM4P10_QPModuloTable 
+        
+        M_VARIANTS CortexA8
+        
+;// Set debugging level        
+;//DEBUG_ON    SETL {TRUE}
+
+
+;// Static Function: armVCM4P10_DequantLumaAC4x4
+
+;// Guarding implementation by the processor name
+    
+ 
+
+;// Guarding implementation by the processor name
+    
+
+
+
+
+
+;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd            
+    
+;// Guarding implementation by the processor name
+    
+    
+    
+;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd            
+    
+;// Guarding implementation by the processor name
+    
+    IF  CortexA8
+    
+
+;// ARM Registers
+
+;//Input Registers
+ppSrc       RN  0
+pPred       RN  1
+pDC         RN  2
+pDst        RN  3
+   
+
+;//Output Registers
+result      RN  0
+
+;//Local Scratch Registers
+
+;//Registers used in armVCM4P10_DequantLumaAC4x4
+pQPdiv      RN  10
+pQPmod      RN  11
+pVRow       RN  2
+QPmod       RN  12
+shift       RN  14
+index0      RN  1 
+index1      RN  10 
+
+;//Registers used in DequantTransformResidualFromPairAndAdd
+pDelta      RN  4
+pDeltaTmp   RN  6
+AC          RN  5                   ;//Load from stack
+pPredTemp   RN  7
+pDCTemp     RN  8
+pDstTemp    RN  9
+pDeltaArg1  RN  1
+pDeltaArg0  RN  0
+QP          RN  1                   ;//Load from stack
+DCval       RN  10  
+predstep    RN  1
+dstStep     RN  10
+PredVal1    RN  3
+PredVal2    RN  5
+
+
+
+
+;// Neon Registers
+
+;// Registers used in armVCM4P10_DequantLumaAC4x4
+
+dVmatrix            DN  D6.8  
+dindexRow0          DN  D7.32 
+dindexRow1          DN  D9.32 
+dByteIndexRow0      DN  D7.8
+dByteIndexRow1      DN  D9.8
+dVRow0              DN  D8.8  
+dVRow1              DN  D4.8
+dVRow0U16           DN  D8.U16
+dVRow1U16           DN  D4.U16
+dVRow2U16           DN  D8.U16
+dVRow3U16           DN  D4.U16
+
+dShift              DN  D5.U16
+dSrcRow0            DN  D0.I16
+dSrcRow1            DN  D1.I16
+dSrcRow2            DN  D2.I16    
+dSrcRow3            DN  D3.I16
+dDqntRow0           DN  D0.I16  
+dDqntRow1           DN  D1.I16 
+dDqntRow2           DN  D2.I16 
+dDqntRow3           DN  D3.I16  
+
+;// Registers used in TransformResidual4x4
+
+;// Packed Input pixels
+dIn0                DN  D0.S16       
+dIn1                DN  D1.S16       
+dIn2                DN  D2.S16       
+dIn3                DN  D3.S16
+qIn01               QN  Q0.32
+qIn23               QN  Q1.32
+
+;// Intermediate calculations       
+dZero               DN  D4.S16
+de0                 DN  D5.S16
+de1                 DN  D6.S16
+de2                 DN  D7.S16
+de3                 DN  D8.S16
+dIn1RS              DN  D7.S16
+dIn3RS              DN  D8.S16
+df0                 DN  D0.S16
+df1                 DN  D1.S16
+df2                 DN  D2.S16
+df3                 DN  D3.S16
+qf01                QN  Q0.32
+qf23                QN  Q1.32
+dg0                 DN  D5.S16
+dg1                 DN  D6.S16
+dg2                 DN  D7.S16
+dg3                 DN  D8.S16
+df1RS               DN  D7.S16
+df3RS               DN  D8.S16
+
+;// Output pixels
+dh0                 DN  D0.S16
+dh1                 DN  D1.S16
+dh2                 DN  D2.S16
+dh3                 DN  D3.S16 
+
+;// Registers used in DequantTransformResidualFromPairAndAdd
+
+dDeltaRow0          DN  D0.S16
+dDeltaRow1          DN  D1.S16
+dDeltaRow2          DN  D2.S16
+dDeltaRow3          DN  D3.S16
+qDeltaRow01         QN  Q0.S16
+qDeltaRow23         QN  Q1.S16
+
+dPredValRow01       DN  D4.U8
+dPredValRow23       DN  D5.U8
+
+qSumRow01           QN  Q3.S16
+qSumRow23           QN  Q4.S16
+dDstRow01           DN  D0.U8
+dDstRow23           DN  D1.U8
+dDstRow0            DN  D0.32[0]
+dDstRow1            DN  D0.32[1]
+dDstRow2            DN  D1.32[0]
+dDstRow3            DN  D1.32[1]
+    
+           
+    ;// Allocate stack memory required by the function
+        M_ALLOC8 pBuffer, 32
+               
+
+    ;// Write function header
+        M_START omxVCM4P10_DequantTransformResidualFromPairAndAdd,r11,d9
+        
+        ;// Define stack arguments
+        M_ARG   predStepOnStack, 4
+        M_ARG   dstStepOnStack,4
+        M_ARG   QPOnStack, 4
+        M_ARG   ACOnStack,4
+  
+        
+        M_ADR   pDelta,pBuffer 
+        M_LDR   AC,ACOnStack 
+        
+         
+        ;// Save registers r1,r2,r3 before function call    
+        MOV     pPredTemp,pPred
+        MOV     pDCTemp,pDC
+        MOV     pDstTemp,pDst
+        
+        CMP     AC,#0
+        BEQ     DCcase
+        MOV     pDeltaArg1,pDelta                           ;// Set up r1 for armVCM4P10_UnpackBlock4x4
+    
+        BL      armVCM4P10_UnpackBlock4x4
+    
+        ;//--------------------------------------------------------
+        ;// armVCM4P10_DequantLumaAC4x4 : static function inlined
+        ;//--------------------------------------------------------
+        
+        ;//BL      armVCM4P10_DequantLumaAC4x4
+        M_LDR   QP,QPOnStack                                ;// Set up r1 for armVCM4P10_DequantLumaAC4x4
+                
+        LDR    pQPmod,=armVCM4P10_QPModuloTable
+        LDR    pQPdiv,=armVCM4P10_QPDivTable        
+        LDR    pVRow,=armVCM4P10_VMatrixU16
+        
+        
+        LDRSB  QPmod,[pQPmod,QP]                    ;// (QP%6) * 6
+        LDRSB  shift,[pQPdiv,QP]                    ;// Shift = QP / 6
+                
+        LDR    index1,=0x03020504 
+        LDR    index0,=0x05040100                   ;// Indexes into dVmatrix
+        ADD    pVRow,pVRow,QPmod
+        VDUP   dindexRow0,index0 
+        VDUP   dindexRow1,index1
+        VDUP   dShift,shift 
+        
+        ;// Load all 4x4 pVRow[] values
+        VLD1   dVmatrix,[pVRow]                     ;// dVmatrix = [0d|0c|0b|0a]
+        
+        
+        VTBL   dVRow0,dVmatrix,dByteIndexRow0       ;// row0 = row2 = [pVRow[2] | pVRow[0] | pVRow[2] | pVRow[0]]
+        VTBL   dVRow1,dVmatrix,dByteIndexRow1       ;// row1 = row3 = [pVRow[1] | pVRow[2] | pVRow[1] | pVRow[2]]
+        CMP     pDCTemp,#0
+        ;// Load all the 4x4 'src' values  
+        VLD1   { dSrcRow0,dSrcRow1,dSrcRow2,dSrcRow3 },[pDelta] 
+        
+        VSHL   dVRow0U16,dVRow0U16,dShift 
+        VSHL   dVRow1U16,dVRow1U16,dShift 
+        LDRSHNE DCval,[pDCTemp]
+        
+        
+        ;// Multiply src[] with pVRow[]
+        VMUL    dDqntRow0,dSrcRow0,dVRow0U16
+        VMUL    dDqntRow1,dSrcRow1,dVRow1U16
+        VMUL    dDqntRow2,dSrcRow2,dVRow2U16
+        VMUL    dDqntRow3,dSrcRow3,dVRow3U16
+        
+        
+        
+        ;//-------------------------------------------------------------
+        ;// TransformResidual4x4 : Inlined to avoid Load/Stores
+        ;//-------------------------------------------------------------
+        
+        
+        ;//BL      armVCM4P10_TransformResidual4x4
+        ;//STRHNE  DCval,[pDelta]
+        VMOVNE    dIn0[0],DCval
+        
+        
+        
+        ;//*****************************************************************
+        ;// Transpose the input pixels : perform Row ops as Col ops
+        ;//*****************************************************************
+        
+        VTRN    dIn0,dIn1
+        VTRN    dIn2,dIn3
+        VTRN    qIn01,qIn23 
+         
+        
+        VMOV    dZero,#0                                    ;// Used to right shift by 1 
+        
+        
+        ;//**************************************** 
+        ;// Row Operations (Performed on columns)
+        ;//**************************************** 
+        
+        
+        VADD        de0,dIn0,dIn2                       ;//  e0 = d0 + d2 
+        VSUB        de1,dIn0,dIn2                        ;//  e1 = d0 - d2 
+        VHADD       dIn1RS,dIn1,dZero                   ;// (f1>>1) constZero is a register holding 0
+        VHADD       dIn3RS,dIn3,dZero
+        VSUB        de2,dIn1RS,dIn3                     ;//  e2 = (d1>>1) - d3 
+        VADD        de3,dIn1,dIn3RS                        ;//  e3 = d1 + (d3>>1) 
+        VADD        df0,de0,de3                         ;//  f0 = e0 + e3
+        VADD        df1,de1,de2                            ;//  f1 = e1 + e2
+        VSUB        df2,de1,de2                            ;//  f2 = e1 - e2
+        VSUB        df3,de0,de3                            ;//  f3 = e0 - e3
+        
+        
+        
+        ;//*****************************************************************
+        ;// Transpose the resultant matrix
+        ;//*****************************************************************
+        
+        VTRN    df0,df1
+        VTRN    df2,df3
+        VTRN    qf01,qf23 
+        
+        
+        ;//******************************* 
+        ;// Coloumn Operations 
+        ;//******************************* 
+        
+        
+        VADD        dg0,df0,df2                         ;//  e0 = d0 + d2 
+        VSUB        dg1,df0,df2                            ;//  e1 = d0 - d2 
+        VHADD       df1RS,df1,dZero                     ;// (f1>>1) constZero is a register holding 0
+        VHADD       df3RS,df3,dZero
+        VSUB        dg2,df1RS,df3                       ;//  e2 = (d1>>1) - d3 
+        VADD        dg3,df1,df3RS                        ;//  e3 = d1 + (d3>>1) 
+        VADD        dh0,dg0,dg3                         ;//  f0 = e0 + e3
+        VADD        dh1,dg1,dg2                            ;//  f1 = e1 + e2
+        VSUB        dh2,dg1,dg2                            ;//  f2 = e1 - e2
+        VSUB        dh3,dg0,dg3                            ;//  f3 = e0 - e3
+        
+             
+        ;//************************************************
+        ;// Calculate final value (colOp[i][j] + 32)>>6
+        ;//************************************************
+        
+        VRSHR       dh0,#6
+        VRSHR       dh1,#6
+        VRSHR       dh2,#6
+        VRSHR       dh3,#6
+        
+               
+        B       OutDCcase 
+        
+
+DCcase
+        ;// Calculate the Transformed DCvalue : (DCval+32)>>6
+        LDRSH   DCval,[pDCTemp] 
+        ADD     DCval,DCval,#32 
+        ASR     DCval,DCval,#6
+        
+        VDUP    dDeltaRow0, DCval                       ;// pDelta[0]  = pDelta[1]  = pDelta[2]  = pDelta[3] = DCval
+        VDUP    dDeltaRow1, DCval                        ;// pDelta[4]  = pDelta[5]  = pDelta[6]  = pDelta[7] = DCval
+        VDUP    dDeltaRow2, DCval                        ;// pDelta[8]  = pDelta[9]  = pDelta[10] = pDelta[11] = DCval
+        VDUP    dDeltaRow3, DCval
+            
+                
+OutDCcase      
+        M_LDR   predstep,predStepOnStack
+        M_LDR   dstStep,dstStepOnStack
+        
+        LDR     PredVal1,[pPredTemp],predstep
+        LDR     PredVal2,[pPredTemp],predstep
+        VMOV    dPredValRow01,PredVal1,PredVal2
+        
+        LDR     PredVal1,[pPredTemp],predstep
+        LDR     PredVal2,[pPredTemp]
+        VMOV    dPredValRow23,PredVal1,PredVal2
+ 
+        
+        VADDW   qSumRow01,qDeltaRow01,dPredValRow01
+        VADDW   qSumRow23,qDeltaRow23,dPredValRow23
+        VQMOVUN dDstRow01,qSumRow01
+        VQMOVUN dDstRow23,qSumRow23
+        
+ 
+        VST1    dDstRow0,[pDstTemp],dstStep
+        VST1    dDstRow1,[pDstTemp],dstStep
+        VST1    dDstRow2,[pDstTemp],dstStep
+        VST1    dDstRow3,[pDstTemp]
+        
+        ;// Set return value
+        MOV     result,#OMX_Sts_NoErr
+        
+End                
+
+        
+        ;// Write function tail
+        
+        M_END
+        
+    ENDIF                                                    ;//CORTEXA8   
+    
+         
+            
+    END