1 files changed, 264 insertions, 0 deletions
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_TransformDequantLumaDCFromPair_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_TransformDequantLumaDCFromPair_s.s
new file mode 100755
index 0000000..2529959
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_TransformDequantLumaDCFromPair_s.s
@@ -0,0 +1,264 @@
+;//
+;// 
+;// File Name:  omxVCM4P10_TransformDequantLumaDCFromPair_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision:   12290
+;// Date:       Wednesday, April 9, 2008
+;// 
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;// 
+;// 
+;//
+;// Description:
+;// H.264 inverse quantize and transform module
+;// 
+;// 
+
+;// Include standard headers
+
+        INCLUDE omxtypes_s.h
+        INCLUDE armCOMM_s.h
+        
+;// Import/Export symbols required from/to other files
+;// (For example tables)
+        
+        IMPORT armVCM4P10_UnpackBlock4x4 
+        IMPORT armVCM4P10_QPDivTable
+        IMPORT armVCM4P10_VMatrixQPModTable
+        
+        M_VARIANTS CortexA8
+
+;// Set debugging level        
+;//DEBUG_ON    SETL {TRUE}
+
+
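+;// Static Function: armVCM4P10_InvTransformDequantLumaDC4x4
+;//
+;// Works in place on the 4x4 block of 16-bit values addressed by r0, with
+;// the quantisation parameter in r1: an add/subtract transform over rows
+;// and columns, then scaling by a QP-dependent factor, rounding and
+;// narrowing back to 16 bits.
+;//
+;// It is called from omxVCM4P10_TransformDequantLumaDCFromPair further
+;// down in this file.
+;// Guarding implementation by the processor name
+    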
+    IF  CortexA8
+    
+;// Input registers (pData: block loaded and stored in place; QP: table index)
+pData               RN  0
+QP                  RN  1    
+
+
+;// Local scratch registers
+
+;// ARM registers: look-up-table base addresses and the values read from them
+
+pQPDivTable         RN  2
+pQPModTable         RN  3
+Shift               RN  4
+Scale               RN  5
+
+;// NEON registers. Several names below alias the same physical register;
+;// each alias names the role that register plays at one stage of the routine.
+;// Packed input pixels
+dIn0                DN  D0.S16
+dIn1                DN  D1.S16
+dIn2                DN  D2.S16
+dIn3                DN  D3.S16   
+
+;// Intermediate calculations (row pass)
+dRowSum1            DN  D4.S16
+dRowSum2            DN  D5.S16
+dRowDiff1           DN  D6.S16
+dRowDiff2           DN  D7.S16
+
+;// Row-operated pixels (the Q names pair D0:D1 and D2:D3 for the transpose)
+dRowOp0             DN  D0.S16
+dRowOp1                DN  D1.S16
+dRowOp2                DN  D2.S16
+dRowOp3                DN  D3.S16
+qRowOp01            QN  Q0.32
+qRowOp23            QN  Q1.32
+
+;// Intermediate calculations (column pass; same registers as the row pass)
+dColSum1            DN  D4.S16
+dColSum2            DN  D5.S16
+dColDiff1           DN  D6.S16
+dColDiff2           DN  D7.S16
+
+;// Column-operated pixels
+dColOp0             DN  D0.S16
+dColOp1                DN  D1.S16
+dColOp2                DN  D2.S16
+dColOp3                DN  D3.S16
+
+;// Temporary scratch variables
+;// dScale (D5) and qRound0 (Q3 = D6:D7) overlap the column intermediates
+dScale              DN  D5.S16
+qRound0             QN  Q3.S32
+qRound1             QN  Q4.S32
+qRound2             QN  Q5.S32
+qRound3             QN  Q6.S32
+
+;// Inverse-transformed and dequantized pixels
+dOut0               DN  D0.S16
+dOut1                DN  D1.S16
+dOut2                DN  D2.S16
+dOut3                DN  D3.S16
+
+       
+    ;// Allocate stack memory required by the function (none is declared here)
+        
+
+    ;// Write function header; r5 and d13 are the highest ARM/NEON registers used below
+    M_START armVCM4P10_InvTransformDequantLumaDC4x4,r5,d13
+    ;// NOTE(review): M_START presumably saves registers up to these - confirm in the macro definition
+    ;******************************************************************
+    ;// The strategy used in implementing the transform is as follows:*
+    ;// Load the 4x4 block into 4 D-registers, transposed (VLD4)      *
+    ;// Perform the row operations (on columns) using SIMD            *
+    ;// Transpose the 4x4 result matrix (VTRN)                        *
+    ;// Perform the column operations                                 *
+    ;// Dequantize (scale, round, narrow) and store the block         *
+    ;******************************************************************
+
+        ;// Load all the 4x4 pixels in transposed form: VLD4 de-interleaves
+        ;// the 16-bit elements four ways, one stream per D register
+        VLD4    {dIn0,dIn1,dIn2,dIn3},[pData]
+        LDR     pQPDivTable, =armVCM4P10_QPDivTable        ;// QP Division look-up-table base pointer
+        LDR     pQPModTable, =armVCM4P10_VMatrixQPModTable ;// QP Modulo look-up-table base pointer
+        
+        ;**************************************** 
+        ;// Row Operations (Performed on columns)
+        ;**************************************** 
+        ;// Scale factor calculation is done using ARM instructions
+        ;// interleaved with NEON instructions in order to dual issue
+        ;// Outputs overwrite D0-D3 only after the sums/differences have been taken
+        VADD    dRowSum1,dIn0,dIn1                     ;// s1 = in0 + in1
+        VADD    dRowSum2,dIn2,dIn3                     ;// s2 = in2 + in3
+        VSUB    dRowDiff1,dIn0,dIn1                    ;// d1 = in0 - in1
+        LDRSB   Shift, [pQPDivTable, QP]               ;// ARM CODE: Shift = pQPDivTable[QP]
+        VSUB    dRowDiff2,dIn2,dIn3                    ;// d2 = in2 - in3
+        LDRSB   Scale, [pQPModTable, QP]               ;// ARM CODE: Scale = pQPModTable[QP] 
+        VADD    dRowOp0,dRowSum1,dRowSum2              ;// op0 = s1 + s2
+        VSUB    dRowOp1,dRowSum1,dRowSum2              ;// op1 = s1 - s2
+        VSUB    dRowOp2,dRowDiff1,dRowDiff2            ;// op2 = d1 - d2
+        LSL     Scale, Scale, Shift                    ;// ARM CODE: Scale = Scale << Shift
+        VADD    dRowOp3,dRowDiff1,dRowDiff2            ;// op3 = d1 + d2
+        
+        ;****************************************
+        ;// Transpose the resultant matrix
+        ;****************************************
+        ;// VTRN on 16-bit elements of the D pairs, then on 32-bit elements of Q0/Q1
+        VTRN    dRowOp0,dRowOp1
+        VTRN    dRowOp2,dRowOp3
+        VTRN    qRowOp01,qRowOp23 
+        
+        ;**************************************** 
+        ;// Column Operations
+        ;**************************************** 
+        ;// Same add/subtract pattern as the row pass, applied to the transposed data
+        VADD    dColSum1,dRowOp0,dRowOp1
+        VADD    dColSum2,dRowOp2,dRowOp3
+        VSUB    dColDiff1,dRowOp0,dRowOp1
+        VSUB    dColDiff2,dRowOp2,dRowOp3
+        VADD    dColOp0,dColSum1,dColSum2
+        VSUB    dColOp1,dColSum1,dColSum2
+        VSUB    dColOp2,dColDiff1,dColDiff2
+        VADD    dColOp3,dColDiff1,dColDiff2
+        
+        ;//----------------------------------------------------------------------
+        ;//
+        ;// <Dequantize> improves on the C reference code.
+        ;// Both cases, i.e. Shift>=0 and Shift<0, are covered together.
+        ;// We do not subtract 2 from Shift as in the C reference; instead we perform
+        ;// Scale << Shift once at the beginning and do a right shift by a
+        ;// constant 2 after the multiplication. The value of Round is then 2.
+        ;//
+        ;// By doing this we avoid the branches otherwise required and also
+        ;// reduce the code size substantially.
+        ;//
+        ;//----------------------------------------------------------------------
+        
+        
+        VDUP    dScale, Scale                            ;// ARM -> NEON  copy 'scale' to vector
+               
+                
+        VMOV    qRound0,#2                               ;// Set the Round Value 
+        VMOV    qRound1,#2
+        VMOV    qRound2,#2
+        VMOV    qRound3,#2
+        ;// Widening multiply-accumulate: S16 x S16 products added onto the S32 round value
+        VMLAL   qRound0,dColOp0,dScale                   ;// pDst[i] * Scale + Round 
+        VMLAL   qRound1,dColOp1,dScale
+        VMLAL   qRound2,dColOp2,dScale
+        VMLAL   qRound3,dColOp3,dScale
+        ;// VSHRN shifts each S32 lane right and narrows it back to 16 bits
+        VSHRN   dOut0,qRound0,#2                          ;// Right shift by 2 & (OMX_S16)Value
+        VSHRN   dOut1,qRound1,#2
+        VSHRN   dOut2,qRound2,#2
+        VSHRN   dOut3,qRound3,#2
+        
+        ;***************************
+        ;// Store all the 4x4 pixels
+        ;***************************
+        ;// Results go back to the same address the block was loaded from
+        VST1  {dOut0,dOut1,dOut2,dOut3}, [pData]
+
+        
+        ;// Set return value (none: no instruction above writes r0)
+        
+        ;// Write function tail
+        M_END        
+        
+    ENDIF                                                           ;//CORTEXA8   
+        
+
+
+;// Function: omxVCM4P10_TransformDequantLumaDCFromPair
+;//
+;// Arguments as received in r0-r2
+ppBitSrc            RN  0
+pCoeffBuf           RN  1
+qpParam             RN  2
+
+;// Status code handed back in r0
+retStatus           RN  0
+
+;// Preserved copies (r4/r5) and the argument registers of the second call
+pCoeffKeep          RN  4
+pBlockArg           RN  0
+qpArg               RN  1
+qpKeep              RN  5
+
+;// Only assembled for the CortexA8 variant
+    
+    IF CortexA8
+       
+    ;// No stack memory is allocated by this function
+        
+
+    ;// Function header
+        M_START omxVCM4P10_TransformDequantLumaDCFromPair,r5
+        
+        MOV     qpKeep,qpParam                      ;// Copy r2 into r5 before the first call
+        MOV     pCoeffKeep,pCoeffBuf                ;// Copy r1 into r4 before the first call
+        BL      armVCM4P10_UnpackBlock4x4           ;// r0 and r1 are passed on as received
+        
+        MOV     qpArg,qpKeep                        ;// r1 = QP for the transform routine
+        MOV     pBlockArg,pCoeffKeep                ;// r0 = block address for the transform routine
+        BL      armVCM4P10_InvTransformDequantLumaDC4x4
+                               
+       
+        ;// Report success
+        MOV     retStatus,#OMX_Sts_NoErr        
+       
+        ;// Function tail
+        M_END
+        
+            
+    ENDIF                                                           ;//CortexA8
+    
+
+    END
+\ No newline at end of file