;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;//      http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name:  omxVCM4P10_TransformDequantLumaDCFromPair_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;//
;//
;//
;//
;// Description:
;// H.264 inverse quantize and transform module
;//
;//

;// Include standard headers

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

;// Import/Export symbols required from/to other files
;// (For example tables)

        IMPORT  armVCM4P10_UnpackBlock4x4
        IMPORT  armVCM4P10_QPDivTable
        IMPORT  armVCM4P10_VMatrixQPModTable

        M_VARIANTS CortexA8

;// Set debugging level
;//DEBUG_ON    SETL {TRUE}


;//--------------------------------------------------------------------------
;// Static Function: armVCM4P10_InvTransformDequantLumaDC4x4
;//
;// Works in place on a 4x4 block of signed 16-bit values:
;//   1. 4-point add/subtract butterfly along one axis
;//   2. 4x4 transpose
;//   3. the same butterfly along the other axis
;//   4. out = (OMX_S16)((value * (Scale << Shift) + 2) >> 2)
;// where Shift = armVCM4P10_QPDivTable[QP] and
;//       Scale = armVCM4P10_VMatrixQPModTable[QP] (signed byte loads).
;//
;// In:   r0 = pointer to the block (16 halfwords, read then overwritten)
;//       r1 = QP, used directly as a byte index into both tables
;// Uses: r2-r5, D0-D13
;//--------------------------------------------------------------------------

;// Guarding implementation by the processor name

    IF  CortexA8

;// Input registers
pBlock          RN  0
qpIndex         RN  1

;// ARM scratch registers
pDivLut         RN  2
pModLut         RN  3
qpShift         RN  4
levelScale      RN  5

;// NEON registers
;//
;// D0-D3 carry the block through every stage (loaded input, first-pass
;// result, second-pass result, final output), so a single set of aliases
;// is used for them.
dVec0           DN  D0.S16
dVec1           DN  D1.S16
dVec2           DN  D2.S16
dVec3           DN  D3.S16

;// 32-bit element views of the D0/D1 and D2/D3 pairs; only needed for the
;// final step of the 4x4 transpose
qVec01          QN  Q0.32
qVec23          QN  Q1.32

;// Butterfly temporaries, shared by both passes
dSumA           DN  D4.S16          ;// vec0 + vec1
dSumB           DN  D5.S16          ;// vec2 + vec3
dDifA           DN  D6.S16          ;// vec0 - vec1
dDifB           DN  D7.S16          ;// vec2 - vec3

;// Dequantization scratch. dMul aliases dSumB (D5) and qAcc0 overlays
;// dDifA/dDifB (D6/D7); those temporaries are dead by the time these are
;// written.
dMul            DN  D5.S16
qAcc0           QN  Q3.S32
qAcc1           QN  Q4.S32
qAcc2           QN  Q5.S32
qAcc3           QN  Q6.S32

        ;// Function header (macro from armCOMM_s.h).
        ;// NOTE(review): the r5/d13 arguments presumably name the highest
        ;// ARM / NEON registers the prologue must preserve - confirm
        ;// against the macro definition.
        M_START armVCM4P10_InvTransformDequantLumaDC4x4,r5,d13

        ;//------------------------------------------------------------------
        ;// Outline:
        ;//   load the 4x4 block into four D registers, already transposed
        ;//   first butterfly pass   (SIMD across the four lanes)
        ;//   transpose the 4x4 intermediate
        ;//   second butterfly pass
        ;//   scale, round, narrow and store
        ;//------------------------------------------------------------------

        ;// VLD4 de-interleaves with a stride of four elements, so dVecN
        ;// receives elements N, N+4, N+8, N+12 of the block: the load
        ;// itself delivers the block in transposed form.
        VLD4    {dVec0,dVec1,dVec2,dVec3},[pBlock]

        LDR     pDivLut, =armVCM4P10_QPDivTable         ;// QP division look-up table base
        LDR     pModLut, =armVCM4P10_VMatrixQPModTable  ;// QP modulo look-up table base

        ;//------------------------------------------------------------------
        ;// First pass (row operations, carried out on the loaded columns).
        ;// The scale-factor computation uses ARM instructions and is
        ;// interleaved with the NEON stream in order to dual issue.
        ;//------------------------------------------------------------------
        VADD    dSumA,dVec0,dVec1
        VADD    dSumB,dVec2,dVec3
        VSUB    dDifA,dVec0,dVec1
        LDRSB   qpShift, [pDivLut, qpIndex]             ;// ARM: qpShift = divLut[QP]
        VSUB    dDifB,dVec2,dVec3
        LDRSB   levelScale, [pModLut, qpIndex]          ;// ARM: levelScale = modLut[QP]

        VADD    dVec0,dSumA,dSumB                       ;//  v0 + v1 + v2 + v3
        VSUB    dVec1,dSumA,dSumB                       ;//  v0 + v1 - v2 - v3
        VSUB    dVec2,dDifA,dDifB                       ;//  v0 - v1 - v2 + v3
        LSL     levelScale, levelScale, qpShift         ;// ARM: levelScale <<= qpShift
        VADD    dVec3,dDifA,dDifB                       ;//  v0 - v1 + v2 - v3

        ;//------------------------------------------------------------------
        ;// Transpose the intermediate: 16-bit swaps inside each register
        ;// pair, then 32-bit swaps between the two Q registers.
        ;//------------------------------------------------------------------
        VTRN    dVec0,dVec1
        VTRN    dVec2,dVec3
        VTRN    qVec01,qVec23

        ;//------------------------------------------------------------------
        ;// Second pass (column operations): same butterfly as above
        ;//------------------------------------------------------------------
        VADD    dSumA,dVec0,dVec1
        VADD    dSumB,dVec2,dVec3
        VSUB    dDifA,dVec0,dVec1
        VSUB    dDifB,dVec2,dVec3

        VADD    dVec0,dSumA,dSumB
        VSUB    dVec1,dSumA,dSumB
        VSUB    dVec2,dDifA,dDifB
        VADD    dVec3,dDifA,dDifB

        ;//------------------------------------------------------------------
        ;// Dequantization - a simplification of the C reference.
        ;//
        ;// The Shift >= 0 and Shift < 0 cases are handled by one code path:
        ;// rather than subtracting 2 from Shift as the C reference does,
        ;// Scale << Shift was formed once (above) and the product is shifted
        ;// right by the constant 2, with a rounding term of 2.
        ;//
        ;// This avoids the branches otherwise required and reduces the code
        ;// size substantially.
        ;//------------------------------------------------------------------
        VDUP    dMul, levelScale                        ;// ARM -> NEON: replicate the scale into every lane

        VMOV    qAcc0,#2                                ;// preload each accumulator with the rounding term
        VMOV    qAcc1,#2
        VMOV    qAcc2,#2
        VMOV    qAcc3,#2

        VMLAL   qAcc0,dVec0,dMul                        ;// acc = value * scale + round (32-bit lanes)
        VMLAL   qAcc1,dVec1,dMul
        VMLAL   qAcc2,dVec2,dMul
        VMLAL   qAcc3,dVec3,dMul

        VSHRN   dVec0,qAcc0,#2                          ;// >> 2, then narrow to 16 bits: the (OMX_S16) cast
        VSHRN   dVec1,qAcc1,#2
        VSHRN   dVec2,qAcc2,#2
        VSHRN   dVec3,qAcc3,#2

        ;//------------------------------------------------------------------
        ;// Write the 4x4 result back over the input, D0..D3 consecutively
        ;//------------------------------------------------------------------
        VST1    {dVec0,dVec1,dVec2,dVec3}, [pBlock]

        ;// Function tail (macro from armCOMM_s.h)
        M_END

    ENDIF                                               ;// CortexA8


;//--------------------------------------------------------------------------
;// Function: omxVCM4P10_TransformDequantLumaDCFromPair
;//
;// Calls armVCM4P10_UnpackBlock4x4 with the incoming r0/r1 untouched, then
;// runs armVCM4P10_InvTransformDequantLumaDC4x4 on (pDst, QP).
;//
;// In:   r0 = ppSrc, r1 = pDst, r2 = QP
;// Out:  r0 = OMX_Sts_NoErr (always)
;//--------------------------------------------------------------------------

;// Input registers
ppPairSrc       RN  0               ;// not referenced by name; passed through in r0
pCoefIn         RN  1
qpIn            RN  2

;// Output register
status          RN  0

;// Local scratch registers
pCoefKeep       RN  4               ;// copy of pDst held across the first call
qpKeep          RN  5               ;// copy of QP held across the first call
pCoefArg        RN  0               ;// argument 0 of the second call
qpArg           RN  1               ;// argument 1 of the second call

;// Guarding implementation by the processor name

    IF  CortexA8

        ;// Function header (macro from armCOMM_s.h).
        ;// NOTE(review): r5 presumably tells the prologue to preserve
        ;// registers up to r5 - confirm against the macro definition.
        M_START omxVCM4P10_TransformDequantLumaDCFromPair,r5

        ;// r1 and r2 are needed again after the first call, so park them
        ;// in r4/r5, which are expected to survive it.
        MOV     pCoefKeep,pCoefIn
        MOV     qpKeep,qpIn

        ;// NOTE(review): external routine; presumably expands the packed
        ;// pairs reached through r0 into the 4x4 block at r1 - verify
        ;// against its source.
        BL      armVCM4P10_UnpackBlock4x4

        ;// Arguments for the transform: r0 = pDst, r1 = QP
        MOV     pCoefArg,pCoefKeep
        MOV     qpArg,qpKeep
        BL      armVCM4P10_InvTransformDequantLumaDC4x4

        ;// Set return value
        MOV     status,#OMX_Sts_NoErr

        ;// Function tail (macro from armCOMM_s.h)
        M_END

    ENDIF                                               ;// CortexA8

        END