;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;//      http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name:  omxVCM4P10_DequantTransformResidualFromPairAndAdd_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;//
;//
;//
;//
;// Description:
;// H.264 inverse quantize and transform module
;//
;//

;// Include standard headers

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

;// Import symbols required from other files
;// (For example tables)

        IMPORT  armVCM4P10_UnpackBlock4x4
        IMPORT  armVCM4P10_TransformResidual4x4
        IMPORT  armVCM4P10_QPDivTable
        IMPORT  armVCM4P10_VMatrixU16
        IMPORT  armVCM4P10_QPModuloTable

        M_VARIANTS CortexA8

;// Set debugging level
;//DEBUG_ON    SETL {TRUE}

;//---------------------------------------------------------------------------
;// Function: omxVCM4P10_DequantTransformResidualFromPairAndAdd
;//
;// Syntax : ARM armasm, NEON. Built only when the CortexA8 variant is enabled.
;//
;// Purpose: Reconstructs one 4x4 block: dequantises the residual
;//          coefficients, applies the 4x4 inverse transform, adds the
;//          predicted pixels and stores the saturated 8-bit result.
;//          The static helpers armVCM4P10_DequantLumaAC4x4 and
;//          armVCM4P10_TransformResidual4x4 are inlined here so that the
;//          coefficients stay in NEON registers between the stages.
;//
;// In     : r0      = ppSrc    passed through unchanged to
;//                            armVCM4P10_UnpackBlock4x4 (AC != 0 only)
;//          r1      = pPred    predicted block, read as four 32-bit words,
;//                            one per row, rows predStep bytes apart
;//          r2      = pDC      pointer to a signed 16-bit DC value, or NULL
;//          r3      = pDst     destination, four 32-bit rows, dstStep apart
;//          [stack] = predStep, dstStep, QP, AC (4 bytes each, in that order)
;//
;// Out    : r0 = OMX_Sts_NoErr on success.
;//          r0 = OMX_Sts_BadArgErr when AC == 0 and pDC == NULL (the DC-only
;//               path has nothing to reconstruct from in that case).
;//          No other argument is validated here.
;//
;// Paths  : AC == 0 : every residual is (*pDC + 32) >> 6.
;//          AC != 0 : unpack -> dequantise -> (optional) overwrite
;//                    coefficient [0] with *pDC -> inverse transform ->
;//                    rounding shift right by 6.
;//
;// Stack  : 32-byte, 8-byte aligned scratch buffer (pBuffer) that receives
;//          the unpacked 4x4 block of 16-bit coefficients.
;// Saved  : M_START is asked to preserve registers up to r11 and d9.
;//          NOTE(review): the exact save list is decided by the M_START
;//          macro in armCOMM_s.h; r12/r14 are also used as scratch below.
;//---------------------------------------------------------------------------

    IF CortexA8

;// ARM Registers

;//Input Registers
ppSrc       RN  0
pPred       RN  1
pDC         RN  2
pDst        RN  3

;//Output Registers
result      RN  0

;//Local Scratch Registers

;//Registers used in armVCM4P10_DequantLumaAC4x4 (inlined)
pQPdiv      RN  10
pQPmod      RN  11
pVRow       RN  2
QPmod       RN  12
shift       RN  14
index0      RN  1
index1      RN  10

;//Registers used in DequantTransformResidualFromPairAndAdd
pDelta      RN  4
AC          RN  5                   ;// Loaded from stack
pPredTemp   RN  7
pDCTemp     RN  8
pDstTemp    RN  9
pDeltaArg1  RN  1
QP          RN  1                   ;// Loaded from stack
DCval       RN  10
predstep    RN  1
dstStep     RN  10
PredVal1    RN  3
PredVal2    RN  5

;// Neon Registers

;// Registers used in armVCM4P10_DequantLumaAC4x4 (inlined)
;// Note: the same D register is deliberately given several typed views
;// (e.g. D8 is dVRow0 / dVRow0U16 / dVRow2U16) so rows 0/2 and rows 1/3
;// share one scale vector each.

dVmatrix            DN  D6.8
dindexRow0          DN  D7.32
dindexRow1          DN  D9.32
dByteIndexRow0      DN  D7.8
dByteIndexRow1      DN  D9.8
dVRow0              DN  D8.8
dVRow1              DN  D4.8
dVRow0U16           DN  D8.U16
dVRow1U16           DN  D4.U16
dVRow2U16           DN  D8.U16
dVRow3U16           DN  D4.U16

dShift              DN  D5.U16
dSrcRow0            DN  D0.I16
dSrcRow1            DN  D1.I16
dSrcRow2            DN  D2.I16
dSrcRow3            DN  D3.I16
dDqntRow0           DN  D0.I16
dDqntRow1           DN  D1.I16
dDqntRow2           DN  D2.I16
dDqntRow3           DN  D3.I16

;// Registers used in TransformResidual4x4 (inlined)

;// Packed Input pixels
dIn0                DN  D0.S16
dIn1                DN  D1.S16
dIn2                DN  D2.S16
dIn3                DN  D3.S16
qIn01               QN  Q0.32
qIn23               QN  Q1.32

;// Intermediate calculations
dZero               DN  D4.S16
de0                 DN  D5.S16
de1                 DN  D6.S16
de2                 DN  D7.S16
de3                 DN  D8.S16
dIn1RS              DN  D7.S16
dIn3RS              DN  D8.S16
df0                 DN  D0.S16
df1                 DN  D1.S16
df2                 DN  D2.S16
df3                 DN  D3.S16
qf01                QN  Q0.32
qf23                QN  Q1.32
dg0                 DN  D5.S16
dg1                 DN  D6.S16
dg2                 DN  D7.S16
dg3                 DN  D8.S16
df1RS               DN  D7.S16
df3RS               DN  D8.S16

;// Output pixels
dh0                 DN  D0.S16
dh1                 DN  D1.S16
dh2                 DN  D2.S16
dh3                 DN  D3.S16

;// Registers used in DequantTransformResidualFromPairAndAdd
dDeltaRow0          DN  D0.S16
dDeltaRow1          DN  D1.S16
dDeltaRow2          DN  D2.S16
dDeltaRow3          DN  D3.S16
qDeltaRow01         QN  Q0.S16
qDeltaRow23         QN  Q1.S16

dPredValRow01       DN  D4.U8
dPredValRow23       DN  D5.U8

qSumRow01           QN  Q3.S16
qSumRow23           QN  Q4.S16
dDstRow01           DN  D0.U8
dDstRow23           DN  D1.U8
dDstRow0            DN  D0.32[0]
dDstRow1            DN  D0.32[1]
dDstRow2            DN  D1.32[0]
dDstRow3            DN  D1.32[1]

        ;// Allocate stack memory required by the function
        M_ALLOC8 pBuffer, 32

        ;// Write function header
        M_START omxVCM4P10_DequantTransformResidualFromPairAndAdd,r11,d9

        ;// Define stack arguments
        M_ARG   predStepOnStack, 4
        M_ARG   dstStepOnStack,4
        M_ARG   QPOnStack, 4
        M_ARG   ACOnStack,4

        M_ADR   pDelta,pBuffer
        M_LDR   AC,ACOnStack

        ;// r1-r3 are reused as scratch (and r1 as a call argument) below,
        ;// so keep the incoming pointers in r7-r9.
        MOV     pPredTemp,pPred
        MOV     pDCTemp,pDC
        MOV     pDstTemp,pDst

        CMP     AC,#0
        BEQ     DCcase                          ;// No AC coefficients: DC-only shortcut

        ;// r0 still holds ppSrc; r1 = scratch buffer for the unpacked block
        MOV     pDeltaArg1,pDelta
        BL      armVCM4P10_UnpackBlock4x4

        ;//--------------------------------------------------------
        ;// armVCM4P10_DequantLumaAC4x4 : static function inlined
        ;//--------------------------------------------------------

        ;//BL      armVCM4P10_DequantLumaAC4x4
        M_LDR   QP,QPOnStack

        LDR     pQPmod,=armVCM4P10_QPModuloTable
        LDR     pQPdiv,=armVCM4P10_QPDivTable
        LDR     pVRow,=armVCM4P10_VMatrixU16

        LDRSB   QPmod,[pQPmod,QP]               ;// (QP%6) * 6 : byte offset into the V matrix
        LDRSB   shift,[pQPdiv,QP]               ;// Shift = QP / 6

        ;// Byte indexes for VTBL; each pair of bytes selects one 16-bit
        ;// entry of dVmatrix (bytes 0-1 = pVRow[0], 2-3 = [1], 4-5 = [2]).
        LDR     index1,=0x03020504
        LDR     index0,=0x05040100
        ADD     pVRow,pVRow,QPmod
        VDUP    dindexRow0,index0
        VDUP    dindexRow1,index1
        VDUP    dShift,shift

        ;// Load the pVRow[] values for this QP
        VLD1    dVmatrix,[pVRow]                ;// dVmatrix = [0d|0c|0b|0a]

        VTBL    dVRow0,dVmatrix,dByteIndexRow0  ;// row0 = row2 = [pVRow[2] | pVRow[0] | pVRow[2] | pVRow[0]]
        VTBL    dVRow1,dVmatrix,dByteIndexRow1  ;// row1 = row3 = [pVRow[1] | pVRow[2] | pVRow[1] | pVRow[2]]

        ;// Flags set here are consumed by LDRSHNE / VMOVNE further down;
        ;// the NEON instructions in between do not modify them.
        CMP     pDCTemp,#0

        ;// Load all the 4x4 'src' values from the unpack buffer
        VLD1    { dSrcRow0,dSrcRow1,dSrcRow2,dSrcRow3 },[pDelta]

        ;// Scale factors: V << (QP / 6)
        VSHL    dVRow0U16,dVRow0U16,dShift
        VSHL    dVRow1U16,dVRow1U16,dShift
        LDRSHNE DCval,[pDCTemp]                 ;// Fetch the DC value only if pDC != NULL

        ;// Multiply src[] with pVRow[] (rows 2/3 reuse the row 0/1 vectors)
        VMUL    dDqntRow0,dSrcRow0,dVRow0U16
        VMUL    dDqntRow1,dSrcRow1,dVRow1U16
        VMUL    dDqntRow2,dSrcRow2,dVRow2U16
        VMUL    dDqntRow3,dSrcRow3,dVRow3U16

        ;//-------------------------------------------------------------
        ;// TransformResidual4x4 : Inlined to avoid Load/Stores
        ;//-------------------------------------------------------------

        ;//BL      armVCM4P10_TransformResidual4x4
        ;//STRHNE  DCval,[pDelta]
        VMOVNE  dIn0[0],DCval                   ;// pDC != NULL: overwrite coefficient [0] with *pDC

        ;//*****************************************************************
        ;// Transpose the input pixels : perform Row ops as Col ops
        ;//*****************************************************************

        VTRN    dIn0,dIn1
        VTRN    dIn2,dIn3
        VTRN    qIn01,qIn23

        VMOV    dZero,#0                        ;// Halving-add with zero gives (x >> 1)

        ;//****************************************
        ;// Row Operations (Performed on columns)
        ;//****************************************

        VADD    de0,dIn0,dIn2                   ;// e0 = d0 + d2
        VSUB    de1,dIn0,dIn2                   ;// e1 = d0 - d2
        VHADD   dIn1RS,dIn1,dZero               ;// (d1 >> 1) : dZero holds 0
        VHADD   dIn3RS,dIn3,dZero               ;// (d3 >> 1)
        VSUB    de2,dIn1RS,dIn3                 ;// e2 = (d1>>1) - d3
        VADD    de3,dIn1,dIn3RS                 ;// e3 = d1 + (d3>>1)
        VADD    df0,de0,de3                     ;// f0 = e0 + e3
        VADD    df1,de1,de2                     ;// f1 = e1 + e2
        VSUB    df2,de1,de2                     ;// f2 = e1 - e2
        VSUB    df3,de0,de3                     ;// f3 = e0 - e3

        ;//*****************************************************************
        ;// Transpose the resultant matrix
        ;//*****************************************************************

        VTRN    df0,df1
        VTRN    df2,df3
        VTRN    qf01,qf23

        ;//*******************************
        ;// Column Operations
        ;//*******************************

        VADD    dg0,df0,df2                     ;// g0 = f0 + f2
        VSUB    dg1,df0,df2                     ;// g1 = f0 - f2
        VHADD   df1RS,df1,dZero                 ;// (f1 >> 1) : dZero holds 0
        VHADD   df3RS,df3,dZero                 ;// (f3 >> 1)
        VSUB    dg2,df1RS,df3                   ;// g2 = (f1>>1) - f3
        VADD    dg3,df1,df3RS                   ;// g3 = f1 + (f3>>1)
        VADD    dh0,dg0,dg3                     ;// h0 = g0 + g3
        VADD    dh1,dg1,dg2                     ;// h1 = g1 + g2
        VSUB    dh2,dg1,dg2                     ;// h2 = g1 - g2
        VSUB    dh3,dg0,dg3                     ;// h3 = g0 - g3

        ;//************************************************
        ;// Calculate final value (colOp[i][j] + 32)>>6
        ;//************************************************

        VRSHR   dh0,#6
        VRSHR   dh1,#6
        VRSHR   dh2,#6
        VRSHR   dh3,#6

        B       OutDCcase

DCcase
        ;// DC-only block. pDC is dereferenced unconditionally on this path,
        ;// so a NULL pointer is rejected instead of being loaded from.
        CMP     pDCTemp,#0
        LDREQ   result,=OMX_Sts_BadArgErr
        BEQ     End

        ;// Calculate the Transformed DCvalue : (DCval+32)>>6
        LDRSH   DCval,[pDCTemp]
        ADD     DCval,DCval,#32
        ASR     DCval,DCval,#6
        VDUP    dDeltaRow0, DCval               ;// pDelta[0]  = pDelta[1]  = pDelta[2]  = pDelta[3]  = DCval
        VDUP    dDeltaRow1, DCval               ;// pDelta[4]  = pDelta[5]  = pDelta[6]  = pDelta[7]  = DCval
        VDUP    dDeltaRow2, DCval               ;// pDelta[8]  = pDelta[9]  = pDelta[10] = pDelta[11] = DCval
        VDUP    dDeltaRow3, DCval               ;// pDelta[12] = pDelta[13] = pDelta[14] = pDelta[15] = DCval

OutDCcase
        ;// Both paths arrive with the 4x4 residual in D0-D3 (S16).
        M_LDR   predstep,predStepOnStack
        M_LDR   dstStep,dstStepOnStack

        ;// Gather the four 4-byte prediction rows, two rows per D register
        LDR     PredVal1,[pPredTemp],predstep
        LDR     PredVal2,[pPredTemp],predstep
        VMOV    dPredValRow01,PredVal1,PredVal2

        LDR     PredVal1,[pPredTemp],predstep
        LDR     PredVal2,[pPredTemp]
        VMOV    dPredValRow23,PredVal1,PredVal2

        ;// residual + prediction (prediction bytes widened to 16 bits)
        VADDW   qSumRow01,qDeltaRow01,dPredValRow01
        VADDW   qSumRow23,qDeltaRow23,dPredValRow23

        ;// Saturate the sums to unsigned 8-bit pixels
        VQMOVUN dDstRow01,qSumRow01
        VQMOVUN dDstRow23,qSumRow23

        ;// Store one 32-bit lane (4 pixels) per destination row
        VST1    dDstRow0,[pDstTemp],dstStep
        VST1    dDstRow1,[pDstTemp],dstStep
        VST1    dDstRow2,[pDstTemp],dstStep
        VST1    dDstRow3,[pDstTemp]

        ;// Set return value
        MOV     result,#OMX_Sts_NoErr

End

        ;// Write function tail
        M_END

    ENDIF                                       ;//CORTEXA8

    END