summaryrefslogtreecommitdiffstats
path: root/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_PredictReconCoefIntra_s.s
diff options
context:
space:
mode:
Diffstat (limited to 'media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_PredictReconCoefIntra_s.s')
-rwxr-xr-xmedia/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_PredictReconCoefIntra_s.s320
1 files changed, 320 insertions, 0 deletions
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_PredictReconCoefIntra_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_PredictReconCoefIntra_s.s
new file mode 100755
index 0000000..a73f64a
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p2/src/omxVCM4P2_PredictReconCoefIntra_s.s
@@ -0,0 +1,320 @@
+; **********
+; *
+; * File Name: omxVCM4P2_PredictReconCoefIntra_s.s
+; * OpenMAX DL: v1.0.2
+; * Revision: 12290
+; * Date: Wednesday, April 9, 2008
+; *
+; * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+; *
+; *
+; *
+; * Description:
+; * Contains module for DC/AC coefficient prediction
+; *
+; *
+; * Function: omxVCM4P2_PredictReconCoefIntra
+; *
+; * Description:
+; * Performs adaptive DC/AC coefficient prediction for an intra block. Prior
+; * to the function call, prediction direction (predDir) should be selected
+; * as specified in subclause 7.4.3.1 of ISO/IEC 14496-2.
+; *
+; * Remarks:
+; *
+; * Parameters:
+; * [in] pSrcDst pointer to the coefficient buffer which contains the
+; * quantized coefficient residuals (PQF) of the current
+; * block; must be aligned on a 4-byte boundary. The
+; * output coefficients are saturated to the range
+; * [-2048, 2047].
+; * [in] pPredBufRow pointer to the coefficient row buffer; must be aligned
+; * on a 4-byte boundary.
+; * [in] pPredBufCol pointer to the coefficient column buffer; must be
+; * aligned on a 4-byte boundary.
+; * [in] curQP quantization parameter of the current block. curQP may
+; * equal to predQP especially when the current block and
+; * the predictor block are in the same macroblock.
+; * [in] predQP quantization parameter of the predictor block
+; * [in] predDir indicates the prediction direction which takes one
+; * of the following values:
+; * OMX_VIDEO_HORIZONTAL predict horizontally
+; * OMX_VIDEO_VERTICAL predict vertically
+; * [in] ACPredFlag a flag indicating if AC prediction should be
+; * performed. It is equal to ac_pred_flag in the bit
+; * stream syntax of MPEG-4
+; * [in] videoComp video component type (luminance, chrominance or
+; * alpha) of the current block
+; * [out] pSrcDst pointer to the coefficient buffer which contains
+; * the quantized coefficients (QF) of the current
+; * block
+; * [out] pPredBufRow pointer to the updated coefficient row buffer
+; * [out] pPredBufCol pointer to the updated coefficient column buffer
+; * Return Value:
+; * OMX_Sts_NoErr - no error
+; * OMX_Sts_BadArgErr - Bad arguments
+; * - At least one of the pointers is NULL: pSrcDst, pPredBufRow, or pPredBufCol.
+; * - At least one the following cases: curQP <= 0, predQP <= 0, curQP >31,
+; * predQP > 31, preDir exceeds [1,2].
+; * - At least one of the pointers pSrcDst, pPredBufRow, or pPredBufCol is not
+; * 4-byte aligned.
+; *
+; *********
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+
+ M_VARIANTS CortexA8
+
+
+
+ IMPORT armVCM4P2_Reciprocal_QP_S32
+ IMPORT armVCM4P2_Reciprocal_QP_S16
+ IMPORT armVCM4P2_DCScaler
+
+ IF CortexA8
+;// Input Arguments
+
+pSrcDst RN 0
+pPredBufRow RN 1
+pPredBufCol RN 2
+curQP RN 3
+QP RN 3
+predQP RN 4
+predDir RN 5
+ACPredFlag RN 6
+videoComp RN 7
+
+;// Local Variables
+
+shortVideoHeader RN 4
+dcScaler RN 4
+index RN 6
+predCoeffTable RN 7
+temp1 RN 6
+temp2 RN 9
+temp RN 14
+Const RN 8
+temppPredColBuf RN 8
+tempPred RN 9
+
+absCoeffDC RN 8
+negdcScaler RN 10
+Rem RN 11
+temp3 RN 12
+
+dcRowbufCoeff RN 10
+dcColBuffCoeff RN 11
+Return RN 0
+
+;//NEON Registers
+
+qPredRowBuf QN Q0.S16
+dPredRowBuf0 DN D0.S16
+dPredRowBuf1 DN D1.S16
+
+
+
+
+qCoeffTab QN Q1.S32
+
+qPredQP QN Q2.S16
+dPredQP0 DN D4.S16
+dPredQP1 DN D5.S16
+
+
+qtemp1 QN Q3.S32
+qtemp QN Q3.S16
+
+dtemp0 DN D6.S16
+dtemp1 DN D7.S16
+
+dtemp2 DN D8.S16
+dtemp3 DN D9.S16
+
+dtemp4 DN D2.S16
+dtemp5 DN D3.S16
+dtemp6 DN D4.S16
+dtemp7 DN D5.S16
+
+qtempPred1 QN Q5.S32
+qtempPred QN Q5.S16
+
+dtempPred0 DN D10.S16
+dtempPred1 DN D11.S16
+
+
+
+ M_START omxVCM4P2_PredictReconCoefIntra,r11,d11
+
+ ;// Assigning pointers to Input arguments on Stack
+
+ M_ARG predQPonStack,4
+ M_ARG predDironStack,4
+ M_ARG ACPredFlagonStack,4
+ M_ARG videoComponStack,4
+
+ ;// DC Prediction
+
+ M_LDR videoComp,videoComponStack ;// Load videoComp From Stack
+
+ M_LDR predDir,predDironStack ;// Load Prediction direction
+ ;// DC Scaler calculation
+ LDR index, =armVCM4P2_DCScaler
+ ADD index,index,videoComp,LSL #5
+ LDRB dcScaler,[index,QP]
+
+
+ LDR predCoeffTable, =armVCM4P2_Reciprocal_QP_S16 ;// Loading the table with entries 32767/(1 to 63)
+ CMP predDir,#2 ;// Check if the Prediction direction is vertical
+
+ ;// Caulucate tempPred
+
+ LDREQSH absCoeffDC,[pPredBufRow] ;// If vetical load the coeff from Row Prediction Buffer
+ LDRNESH absCoeffDC,[pPredBufCol] ;// If horizontal load the coeff from column Prediction Buffer
+
+ RSB negdcScaler,dcScaler,#0 ;// negdcScaler=-dcScaler
+ MOV temp1,absCoeffDC ;// Load the Prediction coeff to temp for comparision
+ CMP temp1,#0
+ RSBLT absCoeffDC,temp1,#0 ;// calculate absolute val of prediction coeff
+
+ ADD temp,dcScaler,dcScaler
+ LDRH temp,[predCoeffTable,temp] ;// Load value from coeff table for performing division using multiplication
+ SMULBB tempPred,temp,absCoeffDC ;// tempped=pPredBufRow(Col)[0]*32767/dcScaler
+ ADD temp3,dcScaler,#1
+ LSR tempPred,tempPred,#15 ;// tempped=pPredBufRow(Col)[0]/dcScaler
+ LSR temp3,temp3,#1 ;// temp3=round(dcScaler/2)
+ MLA Rem,negdcScaler,tempPred,absCoeffDC ;// Remainder Rem=abs(pPredBufRow(Col)[0])-tempPred*dcScaler
+
+ LDRH dcRowbufCoeff,[pPredBufCol]
+
+ CMP Rem,temp3 ;// compare Rem with (dcScaler/2)
+ ADDGE tempPred,#1 ;// tempPred=tempPred+1 if Rem>=(dcScaler/2)
+ CMP temp1,#0
+ RSBLT tempPred,tempPred,#0 ;// tempPred=-tempPred if
+
+ STRH dcRowbufCoeff,[pPredBufRow,#-16]
+
+
+ LDRH temp,[pSrcDst] ;// temp=pSrcDst[0]
+ ADD temp,temp,tempPred ;// temp=pSrcDst[0]+tempPred
+ SSAT16 temp,#12,temp ;// clip temp to [-2048,2047]
+ SMULBB dcColBuffCoeff,temp,dcScaler ;// temp1=clipped(pSrcDst[0])*dcScaler
+ M_LDR ACPredFlag,ACPredFlagonStack
+ STRH dcColBuffCoeff,[pPredBufCol]
+
+
+ ;// AC Prediction
+
+ M_LDR predQP,predQPonStack
+
+ CMP ACPredFlag,#1 ;// Check if the AC prediction flag is set or not
+ BNE Exit ;// If not set Exit
+ CMP predDir,#2 ;// Check the Prediction direction
+ LDR predCoeffTable, =armVCM4P2_Reciprocal_QP_S32 ;// Loading the table with entries 0x1ffff/(1 to 63)
+ MOV Const,#4
+ MUL curQP,curQP,Const ;// curQP=4*curQP
+ VDUP dPredQP0,predQP
+ LDR temp2,[predCoeffTable,curQP] ;// temp=0x1ffff/curQP
+ VDUP qCoeffTab,temp2
+ BNE Horizontal ;// If the Prediction direction is horizontal branch to Horizontal
+
+
+
+ ;// Vertical
+ ;//Calculating tempPred
+
+ VLD1 {dPredRowBuf0,dPredRowBuf1},[pPredBufRow] ;// Loading pPredBufRow[i]:i=0 t0 7
+
+ VMULL qtemp1,dPredRowBuf0,dPredQP0 ;//qtemp1[i]=pPredBufRow[i]*dPredQP[i]: i=0 t0 3
+ VMUL qtempPred1,qtemp1,qCoeffTab ;//qtempPred1[i]=pPredBufRow[i]*dPredQP[i]*0x1ffff/curQP : i=0 t0 3
+
+ VMULL qtemp1,dPredRowBuf1,dPredQP0 ;//qtemp1[i]=pPredBufRow[i]*dPredQP[i] : i=4 t0 7
+
+ VRSHR qtempPred1,qtempPred1,#17 ;//qtempPred1[i]=round(pPredBufRow[i]*dPredQP[i]/curQP) : i=0 t0 3
+ VSHRN dPredQP1,qtempPred1,#0 ;// narrow qtempPred1[i] to 16 bits
+
+
+ VMUL qtempPred1,qtemp1,qCoeffTab ;//qtempPred1[i]=pPredBufRow[i]*dPredQP[i]*0x1ffff/curQP : i=4 t0 7
+ VRSHR qtempPred1,qtempPred1,#17 ;//qtempPred1[i]=round(pPredBufRow[i]*dPredQP[i]/curQP) : i=4 t0 7
+ VLD1 {dtemp0,dtemp1},[pSrcDst] ;//Loading pSrcDst[i] : i=0 to 7
+ VSHRN dtempPred1,qtempPred1,#0 ;// narrow qtempPred1[i] to 16 bits
+ VMOV dtempPred0,dPredQP1
+
+ ;//updating source and row prediction buffer contents
+ VADD qtemp,qtemp,qtempPred ;//pSrcDst[i]=pSrcDst[i]+qtempPred[i]: i=0 to 7
+ VQSHL qtemp,qtemp,#4 ;//Clip to [-2048,2047]
+ LDRH dcRowbufCoeff,[pPredBufRow] ;//Loading Dc Value of Row Prediction buffer
+ VSHR qtemp,qtemp,#4
+
+ VST1 {dtemp0,dtemp1},[pSrcDst] ;//storing back the updated values
+ VST1 {dtemp0,dtemp1},[pPredBufRow] ;//storing back the updated row prediction values
+ STRH dcRowbufCoeff,[pPredBufRow] ;// storing the updated DC Row Prediction coeff
+
+ B Exit
+
+Horizontal
+
+ ;// Calculating Temppred
+
+
+
+ VLD1 {dPredRowBuf0,dPredRowBuf1},[pPredBufCol] ;// Loading pPredBufCol[i]:i=0 t0 7
+ VMULL qtemp1,dPredRowBuf0,dPredQP0 ;//qtemp1[i]=pPredBufCol[i]*dPredQP[i]: i=0 t0 3
+ VMUL qtempPred1,qtemp1,qCoeffTab ;//qtempPred1[i]=pPredBufCol[i]*dPredQP[i]*0x1ffff/curQP : i=0 t0 3
+
+ VMULL qtemp1,dPredRowBuf1,dPredQP0 ;//qtemp1[i]=pPredBufCol[i]*dPredQP[i] : i=4 t0 7
+
+ VRSHR qtempPred1,qtempPred1,#17 ;//qtempPred1[i]=round(pPredBufCol[i]*dPredQP[i]/curQP) : i=0 t0 3
+ VSHRN dPredQP1,qtempPred1,#0 ;// narrow qtempPred1[i] to 16 bits
+
+
+ VMUL qtempPred1,qtemp1,qCoeffTab ;//qtempPred1[i]=pPredBufCol[i]*dPredQP[i]*0x1ffff/curQP : i=4 t0 7
+
+ MOV temppPredColBuf,pPredBufCol
+ VRSHR qtempPred1,qtempPred1,#17 ;//qtempPred1[i]=round(pPredBufCol[i]*dPredQP[i]/curQP) : i=4 t0 7
+ VLD4 {dtemp0,dtemp1,dtemp2,dtemp3},[pSrcDst] ;// Loading coefficients Interleaving by 4
+ VSHRN dtempPred1,qtempPred1,#0 ;// narrow qtempPred1[i] to 16 bits
+ VMOV dtempPred0,dPredQP1
+
+ ;// Updating source and column prediction buffer contents
+ ADD temp2,pSrcDst,#32
+ VLD4 {dtemp4,dtemp5,dtemp6,dtemp7},[temp2] ;// Loading next 16 coefficients Interleaving by 4
+ VUZP dtemp0,dtemp4 ;// Interleaving by 8
+ VADD dtemp0,dtemp0,dtempPred0 ;// Adding tempPred to coeffs
+ VQSHL dtemp0,dtemp0,#4 ;// Clip to [-2048,2047]
+ VSHR dtemp0,dtemp0,#4
+ VST1 {dtemp0},[pPredBufCol]! ;// Updating Pridiction column buffer
+ VZIP dtemp0,dtemp4 ;// deinterleaving
+ VST4 {dtemp0,dtemp1,dtemp2,dtemp3},[pSrcDst] ;// Updating source coeffs
+ VST4 {dtemp4,dtemp5,dtemp6,dtemp7},[temp2]!
+
+ MOV temp1,temp2
+ VLD4 {dtemp0,dtemp1,dtemp2,dtemp3},[temp2]! ;// Loading coefficients Interleaving by 4
+
+ VLD4 {dtemp4,dtemp5,dtemp6,dtemp7},[temp2]
+ VUZP dtemp0,dtemp4 ;// Interleaving by 8
+ VADD dtemp0,dtemp0,dtempPred1
+ VQSHL dtemp0,dtemp0,#4 ;// Clip to [-2048,2047]
+ VSHR dtemp0,dtemp0,#4
+ VST1 {dtemp0},[pPredBufCol]!
+ VZIP dtemp0,dtemp4
+ VST4 {dtemp0,dtemp1,dtemp2,dtemp3},[temp1]
+ STRH dcColBuffCoeff,[temppPredColBuf]
+ VST4 {dtemp4,dtemp5,dtemp6,dtemp7},[temp2]
+
+Exit
+
+ STRH temp,[pSrcDst]
+
+
+ MOV Return,#OMX_Sts_NoErr
+
+ M_END
+ ENDIF
+
+
+ END
+
+
+