summaryrefslogtreecommitdiffstats
path: root/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_TransformDequantLumaDCFromPair_s.s
diff options
context:
space:
mode:
Diffstat (limited to 'media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_TransformDequantLumaDCFromPair_s.s')
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_TransformDequantLumaDCFromPair_s.s264
1 files changed, 264 insertions, 0 deletions
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_TransformDequantLumaDCFromPair_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_TransformDequantLumaDCFromPair_s.s
new file mode 100755
index 0000000..2529959
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_TransformDequantLumaDCFromPair_s.s
@@ -0,0 +1,264 @@
+;//
+;//
+;// File Name: omxVCM4P10_TransformDequantLumaDCFromPair_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision: 12290
+;// Date: Wednesday, April 9, 2008
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+;// Description:
+;// H.264 inverse quantize and transform module
+;//
+;//
+
+;// Include standard headers
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+
+;// Import/Export symbols required from/to other files
+;// (For example tables)
+
+ IMPORT armVCM4P10_UnpackBlock4x4
+ IMPORT armVCM4P10_QPDivTable
+ IMPORT armVCM4P10_VMatrixQPModTable
+
+ M_VARIANTS CortexA8
+
+;// Set debugging level
+;//DEBUG_ON SETL {TRUE}
+
+
+;// Static Function: armVCM4P10_InvTransformDequantLumaDC4x4
+
+
+;// Guarding implementation by the processor name
+
+
+
+;// Static Function: armVCM4P10_InvTransformDequantLumaDC4x4
+
+;// Guarding implementation by the processor name
+
+ IF CortexA8
+
+;//Input Registers
+pData RN 0
+QP RN 1
+
+
+;//Local Scratch Registers
+
+;// ARM Registers
+
+pQPDivTable RN 2
+pQPModTable RN 3
+Shift RN 4
+Scale RN 5
+
+;// NEON Registers
+
+;// Packed Input pixels
+dIn0 DN D0.S16
+dIn1 DN D1.S16
+dIn2 DN D2.S16
+dIn3 DN D3.S16
+
+;// Intermediate calculations
+dRowSum1 DN D4.S16
+dRowSum2 DN D5.S16
+dRowDiff1 DN D6.S16
+dRowDiff2 DN D7.S16
+
+;// Row operated pixels
+dRowOp0 DN D0.S16
+dRowOp1 DN D1.S16
+dRowOp2 DN D2.S16
+dRowOp3 DN D3.S16
+qRowOp01 QN Q0.32
+qRowOp23 QN Q1.32
+
+;// Intermediate calculations
+dColSum1 DN D4.S16
+dColSum2 DN D5.S16
+dColDiff1 DN D6.S16
+dColDiff2 DN D7.S16
+
+;// Coloumn operated pixels
+dColOp0 DN D0.S16
+dColOp1 DN D1.S16
+dColOp2 DN D2.S16
+dColOp3 DN D3.S16
+
+;// Temporary scratch varaibles
+
+dScale DN D5.S16
+qRound0 QN Q3.S32
+qRound1 QN Q4.S32
+qRound2 QN Q5.S32
+qRound3 QN Q6.S32
+
+;// InvTransformed and Dequantized pixels
+dOut0 DN D0.S16
+dOut1 DN D1.S16
+dOut2 DN D2.S16
+dOut3 DN D3.S16
+
+
+ ;// Allocate stack memory required by the function
+
+
+ ;// Write function header
+ M_START armVCM4P10_InvTransformDequantLumaDC4x4,r5,d13
+
+ ;******************************************************************
+ ;// The strategy used in implementing the transform is as follows:*
+ ;// Load the 4x4 block into 4 D-registers *
+ ;// Transpose the 4x4 matrix *
+ ;// Perform the row operations (on columns) using SIMD *
+ ;// Transpose the 4x4 result matrix *
+ ;// Perform the coloumn operations *
+ ;******************************************************************
+
+ ;// Load all the 4x4 pixels in Transposed form
+
+ VLD4 {dIn0,dIn1,dIn2,dIn3},[pData]
+ LDR pQPDivTable, =armVCM4P10_QPDivTable ;// QP Division look-up-table base pointer
+ LDR pQPModTable, =armVCM4P10_VMatrixQPModTable ;// QP Modulo look-up-table base pointer
+
+ ;****************************************
+ ;// Row Operations (Performed on columns)
+ ;****************************************
+ ;// Scale factor calculation is done using ARM instructions
+ ;// Interleaved with NEON instructions inorder to Dual issue
+
+ VADD dRowSum1,dIn0,dIn1
+ VADD dRowSum2,dIn2,dIn3
+ VSUB dRowDiff1,dIn0,dIn1
+ LDRSB Shift, [pQPDivTable, QP] ;// ARM CODE: Shift = pQPDivTable[QP]
+ VSUB dRowDiff2,dIn2,dIn3
+ LDRSB Scale, [pQPModTable, QP] ;// ARM CODE: Scale = pQPModTable[QP]
+ VADD dRowOp0,dRowSum1,dRowSum2
+ VSUB dRowOp1,dRowSum1,dRowSum2
+ VSUB dRowOp2,dRowDiff1,dRowDiff2
+ LSL Scale, Scale, Shift ;// ARM CODE: Scale = Scale << Shift
+ VADD dRowOp3,dRowDiff1,dRowDiff2
+
+ ;****************************************
+ ;// Transpose the resultant matrix
+ ;****************************************
+
+ VTRN dRowOp0,dRowOp1
+ VTRN dRowOp2,dRowOp3
+ VTRN qRowOp01,qRowOp23
+
+ ;****************************************
+ ;// Coloumn Operations
+ ;****************************************
+
+ VADD dColSum1,dRowOp0,dRowOp1
+ VADD dColSum2,dRowOp2,dRowOp3
+ VSUB dColDiff1,dRowOp0,dRowOp1
+ VSUB dColDiff2,dRowOp2,dRowOp3
+ VADD dColOp0,dColSum1,dColSum2
+ VSUB dColOp1,dColSum1,dColSum2
+ VSUB dColOp2,dColDiff1,dColDiff2
+ VADD dColOp3,dColDiff1,dColDiff2
+
+ ;//----------------------------------------------------------------------
+ ;//
+ ;// <Dequantize> improves on the c-reference code
+ ;// Both the cases i.e., Shift>=0 and Shift<0 cases are covered together
+ ;// We do not subtract 2 from Shift as in C reference, instead perform a
+ ;// Scale << Shift once in the beginning and do a right shift by a
+ ;// constant 2 after the Multiplication. The value of Round would be 2
+ ;//
+ ;// By doing this we aviod the Branches required and also
+ ;// reduce the code size substantially
+ ;//
+ ;//----------------------------------------------------------------------
+
+
+ VDUP dScale, Scale ;// ARM -> NEON copy 'scale' to vector
+
+
+ VMOV qRound0,#2 ;// Set the Round Value
+ VMOV qRound1,#2
+ VMOV qRound2,#2
+ VMOV qRound3,#2
+
+ VMLAL qRound0,dColOp0,dScale ;// pDst[i] * Scale + Round
+ VMLAL qRound1,dColOp1,dScale
+ VMLAL qRound2,dColOp2,dScale
+ VMLAL qRound3,dColOp3,dScale
+
+ VSHRN dOut0,qRound0,#2 ;// Right shift by 2 & (OMX_S16)Value
+ VSHRN dOut1,qRound1,#2
+ VSHRN dOut2,qRound2,#2
+ VSHRN dOut3,qRound3,#2
+
+ ;***************************
+ ;// Store all the 4x4 pixels
+ ;***************************
+
+ VST1 {dOut0,dOut1,dOut2,dOut3}, [pData]
+
+
+ ;// Set return value
+
+ ;// Write function tail
+ M_END
+
+ ENDIF ;//CORTEXA8
+
+
+
+;// Function: omxVCM4P10_TransformDequantLumaDCFromPair
+
+;//Input Registers
+ppSrc RN 0
+pDst RN 1
+QPR2 RN 2
+
+;//Output Registers
+result RN 0
+
+;//Local Scratch Registers
+pDstR4 RN 4
+pDstR0 RN 0
+QPR1 RN 1
+QPR5 RN 5
+
+;// Guarding implementation by the processor name
+
+ IF CortexA8
+
+ ;// Allocate stack memory required by the function
+
+
+ ;// Write function header
+ M_START omxVCM4P10_TransformDequantLumaDCFromPair,r5
+
+ MOV pDstR4,pDst ;// Saving register r1
+ MOV QPR5,QPR2 ;// Saving register r2
+ BL armVCM4P10_UnpackBlock4x4
+
+ MOV pDstR0,pDstR4 ;// Setting up register r0
+ MOV QPR1,QPR5 ;// Setting up register r1
+ BL armVCM4P10_InvTransformDequantLumaDC4x4
+
+
+ ;// Set return value
+ MOV result,#OMX_Sts_NoErr
+
+ ;// Write function tail
+ M_END
+
+
+ ENDIF ;//ARM1136JS
+
+
+ END \ No newline at end of file