diff options
Diffstat (limited to 'media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_TransformDequantLumaDCFromPair_s.s')
-rwxr-xr-x | media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_TransformDequantLumaDCFromPair_s.s | 264 |
1 files changed, 264 insertions, 0 deletions
;//
;//
;// File Name:  omxVCM4P10_TransformDequantLumaDCFromPair_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;//
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;//
;//
;//
;// Description:
;// H.264 inverse quantize and transform module
;//
;//

;// Include standard headers

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

;// Import/Export symbols required from/to other files
;// (For example tables)

        IMPORT armVCM4P10_UnpackBlock4x4
        IMPORT armVCM4P10_QPDivTable
        IMPORT armVCM4P10_VMatrixQPModTable

        M_VARIANTS CortexA8

;// Set debugging level
;//DEBUG_ON    SETL {TRUE}


;//----------------------------------------------------------------------
;// Static Function: armVCM4P10_InvTransformDequantLumaDC4x4
;//
;// In-place 4x4 transform (add/subtract butterflies only) followed by
;// dequantization of a block of sixteen 16-bit values.
;//
;// In:    r0 = pData  block is loaded from, and stored back to, this
;//                    address (32 bytes)
;//        r1 = QP     used as a byte index into armVCM4P10_QPDivTable
;//                    and armVCM4P10_VMatrixQPModTable
;// Out:   result block written at pData; no return value is set here
;// Uses:  r2-r5, D0-D7, Q3-Q6 (Q3-Q6 are D6-D13)
;//        M_START is passed "r5,d13"; presumably this makes the macro
;//        preserve the callee-saved registers up to r5/d13 - confirm
;//        against armCOMM_s.h
;// NOTE(review): no range check on QP and no alignment qualifier on the
;//        NEON load/store; the caller is responsible for valid inputs.
;//----------------------------------------------------------------------

;// Guarding implementation by the processor name

        IF CortexA8

;//Input Registers
pData               RN  0
QP                  RN  1


;//Local Scratch Registers

;// ARM Registers

pQPDivTable         RN  2
pQPModTable         RN  3
Shift               RN  4
Scale               RN  5

;// NEON Registers
;// D0-D3 carry the block through every stage (input, row result,
;// column result, output); D4-D7 hold the butterfly intermediates.

;// Packed Input pixels
dIn0                DN  D0.S16
dIn1                DN  D1.S16
dIn2                DN  D2.S16
dIn3                DN  D3.S16

;// Intermediate calculations
dRowSum1            DN  D4.S16
dRowSum2            DN  D5.S16
dRowDiff1           DN  D6.S16
dRowDiff2           DN  D7.S16

;// Row operated pixels
dRowOp0             DN  D0.S16
dRowOp1             DN  D1.S16
dRowOp2             DN  D2.S16
dRowOp3             DN  D3.S16
qRowOp01            QN  Q0.32
qRowOp23            QN  Q1.32

;// Intermediate calculations
dColSum1            DN  D4.S16
dColSum2            DN  D5.S16
dColDiff1           DN  D6.S16
dColDiff2           DN  D7.S16

;// Column operated pixels
dColOp0             DN  D0.S16
dColOp1             DN  D1.S16
dColOp2             DN  D2.S16
dColOp3             DN  D3.S16

;// Temporary scratch variables
;// dScale (D5) and qRound0 (Q3 = D6:D7) overlap the column sum/diff
;// aliases above; they are first written only after the column
;// butterflies have produced dColOp0..3.

dScale              DN  D5.S16
qRound0             QN  Q3.S32
qRound1             QN  Q4.S32
qRound2             QN  Q5.S32
qRound3             QN  Q6.S32

;// InvTransformed and Dequantized pixels
dOut0               DN  D0.S16
dOut1               DN  D1.S16
dOut2               DN  D2.S16
dOut3               DN  D3.S16


        ;// Allocate stack memory required by the function


        ;// Write function header
        M_START armVCM4P10_InvTransformDequantLumaDC4x4,r5,d13

        ;******************************************************************
        ;// The strategy used in implementing the transform is as follows:*
        ;// Load the 4x4 block into 4 D-registers                         *
        ;// Transpose the 4x4 matrix                                      *
        ;// Perform the row operations (on columns) using SIMD            *
        ;// Transpose the 4x4 result matrix                               *
        ;// Perform the column operations                                 *
        ;******************************************************************

        ;// Load all the 4x4 pixels in Transposed form
        ;// (VLD4 de-interleaves, so the load itself does the transpose)

        VLD4    {dIn0,dIn1,dIn2,dIn3},[pData]
        LDR     pQPDivTable, =armVCM4P10_QPDivTable        ;// QP Division look-up-table base pointer
        LDR     pQPModTable, =armVCM4P10_VMatrixQPModTable ;// QP Modulo look-up-table base pointer

        ;****************************************
        ;// Row Operations (Performed on columns)
        ;****************************************
        ;// Scale factor calculation is done using ARM instructions
        ;// Interleaved with NEON instructions in order to dual issue.
        ;// Keep the instruction order below as is.

        VADD    dRowSum1,dIn0,dIn1
        VADD    dRowSum2,dIn2,dIn3
        VSUB    dRowDiff1,dIn0,dIn1
        LDRSB   Shift, [pQPDivTable, QP]                   ;// ARM CODE: Shift = pQPDivTable[QP]
        VSUB    dRowDiff2,dIn2,dIn3
        LDRSB   Scale, [pQPModTable, QP]                   ;// ARM CODE: Scale = pQPModTable[QP]
        VADD    dRowOp0,dRowSum1,dRowSum2
        VSUB    dRowOp1,dRowSum1,dRowSum2
        VSUB    dRowOp2,dRowDiff1,dRowDiff2
        LSL     Scale, Scale, Shift                        ;// ARM CODE: Scale = Scale << Shift
        VADD    dRowOp3,dRowDiff1,dRowDiff2

        ;****************************************
        ;// Transpose the resultant matrix
        ;****************************************
        ;// 16-bit transpose within each register pair, then a 32-bit
        ;// transpose across the two Q registers.

        VTRN    dRowOp0,dRowOp1
        VTRN    dRowOp2,dRowOp3
        VTRN    qRowOp01,qRowOp23

        ;****************************************
        ;// Column Operations
        ;****************************************

        VADD    dColSum1,dRowOp0,dRowOp1
        VADD    dColSum2,dRowOp2,dRowOp3
        VSUB    dColDiff1,dRowOp0,dRowOp1
        VSUB    dColDiff2,dRowOp2,dRowOp3
        VADD    dColOp0,dColSum1,dColSum2
        VSUB    dColOp1,dColSum1,dColSum2
        VSUB    dColOp2,dColDiff1,dColDiff2
        VADD    dColOp3,dColDiff1,dColDiff2

        ;//----------------------------------------------------------------------
        ;//
        ;// <Dequantize> improves on the c-reference code
        ;// Both the cases i.e., Shift>=0 and Shift<0 cases are covered together
        ;// We do not subtract 2 from Shift as in C reference, instead perform a
        ;// Scale << Shift once in the beginning and do a right shift by a
        ;// constant 2 after the Multiplication. The value of Round would be 2
        ;//
        ;// By doing this we avoid the Branches required and also
        ;// reduce the code size substantially
        ;//
        ;// Net effect per element:
        ;//     out = (OMX_S16)((in * (Scale << Shift) + 2) >> 2)
        ;//
        ;//----------------------------------------------------------------------


        VDUP    dScale, Scale                              ;// ARM -> NEON copy 'scale' to vector


        VMOV    qRound0,#2                                 ;// Set the Round Value
        VMOV    qRound1,#2
        VMOV    qRound2,#2
        VMOV    qRound3,#2

        VMLAL   qRound0,dColOp0,dScale                     ;// pDst[i] * Scale + Round
        VMLAL   qRound1,dColOp1,dScale
        VMLAL   qRound2,dColOp2,dScale
        VMLAL   qRound3,dColOp3,dScale

        VSHRN   dOut0,qRound0,#2                           ;// Right shift by 2 & (OMX_S16)Value
        VSHRN   dOut1,qRound1,#2
        VSHRN   dOut2,qRound2,#2
        VSHRN   dOut3,qRound3,#2

        ;***************************
        ;// Store all the 4x4 pixels
        ;***************************

        VST1    {dOut0,dOut1,dOut2,dOut3}, [pData]


        ;// Set return value

        ;// Write function tail
        M_END

        ENDIF                                              ;//CortexA8



;//----------------------------------------------------------------------
;// Function: omxVCM4P10_TransformDequantLumaDCFromPair
;//
;// Calls armVCM4P10_UnpackBlock4x4 with the incoming r0/r1 (ppSrc, pDst)
;// untouched, then calls armVCM4P10_InvTransformDequantLumaDC4x4 on
;// pDst with the QP value.
;//
;// In:    r0 = ppSrc
;//        r1 = pDst
;//        r2 = QP
;// Out:   r0 = OMX_Sts_NoErr (unconditionally)
;//
;// pDst and QP are parked in r4/r5 across the first call; this assumes
;// armVCM4P10_UnpackBlock4x4 preserves r4 and r5 - confirm against its
;// source.
;// NOTE(review): no argument validation is performed in this routine.
;//----------------------------------------------------------------------

;//Input Registers
ppSrc               RN  0
pDst                RN  1
QPR2                RN  2

;//Output Registers
result              RN  0

;//Local Scratch Registers
pDstR4              RN  4
pDstR0              RN  0
QPR1                RN  1
QPR5                RN  5

;// Guarding implementation by the processor name

        IF CortexA8

        ;// Allocate stack memory required by the function


        ;// Write function header
        M_START omxVCM4P10_TransformDequantLumaDCFromPair,r5

        MOV     pDstR4,pDst                         ;// Saving register r1
        MOV     QPR5,QPR2                           ;// Saving register r2
        BL      armVCM4P10_UnpackBlock4x4

        MOV     pDstR0,pDstR4                       ;// Setting up register r0
        MOV     QPR1,QPR5                           ;// Setting up register r1
        BL      armVCM4P10_InvTransformDequantLumaDC4x4


        ;// Set return value
        MOV     result,#OMX_Sts_NoErr

        ;// Write function tail
        M_END


        ENDIF                                       ;//CortexA8


        END
\ No newline at end of file |