summaryrefslogtreecommitdiffstats
path: root/media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p10/src/armVCM4P10_TransformResidual4x4_s.s
diff options
context:
space:
mode:
Diffstat (limited to 'media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p10/src/armVCM4P10_TransformResidual4x4_s.s')
-rw-r--r--media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p10/src/armVCM4P10_TransformResidual4x4_s.s407
1 files changed, 407 insertions, 0 deletions
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p10/src/armVCM4P10_TransformResidual4x4_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p10/src/armVCM4P10_TransformResidual4x4_s.s
new file mode 100644
index 0000000..241d188
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p10/src/armVCM4P10_TransformResidual4x4_s.s
@@ -0,0 +1,407 @@
+;//
+;//
+;// File Name: armVCM4P10_TransformResidual4x4_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision: 9641
+;// Date: Thursday, February 7, 2008
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+;// Description:
+;// Transform Residual 4x4 Coefficients
+;//
+;//
+
+
+;// Include standard headers
+
+ INCLUDE omxtypes_s.h
+ INCLUDE armCOMM_s.h
+
+ M_VARIANTS ARM1136JS
+
+;// Import symbols required from other files
+;// (For example tables)
+
+
+
+
+;// Set debugging level
+;//DEBUG_ON SETL {TRUE}
+
+
+
+;// Guarding implementation by the processor name
+
+ IF ARM1136JS
+
+;//Input Registers
+pDst RN 0
+pSrc RN 1
+
+;//Output Registers
+
+
+;//Local Scratch Registers
+
+;// Packed Input pixels
+in00 RN 2 ;// Src[0] & Src[1]
+in02 RN 3 ;// Src[2] & Src[3]
+in10 RN 4 ;// Src[4] & Src[5]
+in12 RN 5 ;// Src[6] & Src[7]
+in20 RN 6 ;// Src[8] & Src[9]
+in22 RN 7 ;// Src[10] & Src[11]
+in30 RN 8 ;// Src[12] & Src[13]
+in32 RN 9 ;// Src[14] & Src[15]
+
+;// Transpose for Row operations (Rows to cols)
+trRow00 RN 2
+trRow10 RN 10
+trRow02 RN 3
+trRow12 RN 5
+trRow20 RN 11
+trRow30 RN 12
+trRow32 RN 14
+trRow22 RN 7
+
+;// Intermediate calculations
+e0 RN 4
+e1 RN 6
+e2 RN 8
+e3 RN 9
+constZero RN 1
+
+;// Row operated pixels
+rowOp00 RN 2
+rowOp10 RN 10
+rowOp20 RN 11
+rowOp30 RN 12
+rowOp02 RN 3
+rowOp12 RN 5
+rowOp22 RN 7
+rowOp32 RN 14
+
+;// Transpose for colulmn operations
+trCol00 RN 2
+trCol02 RN 3
+trCol10 RN 4
+trCol12 RN 5
+trCol20 RN 6
+trCol22 RN 7
+trCol30 RN 8
+trCol32 RN 9
+
+;// Intermediate calculations
+g0 RN 10
+g1 RN 11
+g2 RN 12
+g3 RN 14
+
+;// Coloumn operated pixels
+colOp00 RN 2
+colOp02 RN 3
+colOp10 RN 4
+colOp12 RN 5
+colOp20 RN 6
+colOp22 RN 7
+colOp30 RN 8
+colOp32 RN 9
+
+
+temp1 RN 10 ;// Temporary scratch varaibles
+const1 RN 11
+const2 RN 12
+mask RN 14
+
+;// Output pixels
+out00 RN 2
+out02 RN 3
+out10 RN 4
+out12 RN 5
+out20 RN 6
+out22 RN 7
+out30 RN 8
+out32 RN 9
+
+
+
+ ;// Allocate stack memory required by the function
+
+
+ ;// Write function header
+ M_START armVCM4P10_TransformResidual4x4,r11
+
+ ;******************************************************************
+ ;// The strategy used in implementing the transform is as follows:*
+ ;// Load the 4x4 block into 8 registers *
+ ;// Transpose the 4x4 matrix *
+ ;// Perform the row operations (on columns) using SIMD *
+ ;// Transpose the 4x4 result matrix *
+ ;// Perform the coloumn operations *
+ ;// Store the 4x4 block at one go *
+ ;******************************************************************
+
+ ;// Load all the 4x4 pixels
+
+ LDMIA pSrc,{in00,in02,in10,in12,in20,in22,in30,in32}
+
+ MOV constZero,#0 ;// Used to right shift by 1
+ ;LDR constZero,=0x00000000
+
+ ;*****************************************************************
+ ;//
+ ;// Transpose the matrix inorder to perform row ops as coloumn ops
+ ;// Input: in[][] = original matrix
+ ;// Output: trRow[][]= transposed matrix
+ ;// Step1: Obtain the LL part of the transposed matrix
+ ;// Step2: Obtain the HL part
+ ;// step3: Obtain the LH part
+ ;// Step4: Obtain the HH part
+ ;//
+ ;*****************************************************************
+
+ ;// LL 2x2 transposed matrix
+ ;// d0 d1 - -
+ ;// d4 d5 - -
+ ;// - - - -
+ ;// - - - -
+
+ PKHTB trRow10,in10,in00,ASR #16 ;// [5 4] = [f5:f1]
+ PKHBT trRow00,in00,in10,LSL #16 ;// [1 0] = [f4:f0]
+
+ ;// HL 2x2 transposed matrix
+ ;// - - - -
+ ;// - - - -
+ ;// d8 d9 - -
+ ;// d12 d13 - -
+
+
+ PKHTB trRow30,in12,in02,ASR #16 ;// [13 12] = [7 3]
+ PKHBT trRow20,in02,in12,LSL #16 ;// [9 8] = [6 2]
+
+ ;// LH 2x2 transposed matrix
+ ;// - - d2 d3
+ ;// - - d6 d7
+ ;// - - - -
+ ;// - - - -
+
+ PKHBT trRow02,in20,in30,LSL #16 ;// [3 2] = [f12:f8]
+ PKHTB trRow12,in30,in20,ASR #16 ;// [7 6] = [f13:f9]
+
+
+
+
+ ;// HH 2x2 transposed matrix
+ ;// - - - -
+ ;// - - - -
+ ;// - - d10 d11
+ ;// - - d14 d15
+
+ PKHTB trRow32,in32,in22,ASR #16 ;// [15 14] = [15 11]
+ PKHBT trRow22,in22,in32,LSL #16 ;// [11 10] = [14 10]
+
+
+ ;****************************************
+ ;// Row Operations (Performed on columns)
+ ;****************************************
+
+
+ ;// SIMD operations on first two columns(two rows of the original matrix)
+
+
+ SADD16 e0, trRow00,trRow20 ;// e0 = d0 + d2
+ SSUB16 e1, trRow00,trRow20 ;// e1 = d0 - d2
+ SHADD16 e2, trRow10,constZero ;// (f1>>1) constZero is a register holding 0
+ SHADD16 e3, trRow30,constZero ;// avoid pipeline stalls for e2 and e3
+ SSUB16 e2, e2, trRow30 ;// e2 = (d1>>1) - d3
+ SADD16 e3, e3, trRow10 ;// e3 = d1 + (d3>>1)
+ SADD16 rowOp00, e0, e3 ;// f0 = e0 + e3
+ SADD16 rowOp10, e1, e2 ;// f1 = e1 + e2
+ SSUB16 rowOp20, e1, e2 ;// f2 = e1 - e2
+ SSUB16 rowOp30, e0, e3 ;// f3 = e0 - e3
+
+ ;// SIMD operations on next two columns(next two rows of the original matrix)
+
+ SADD16 e0, trRow02,trRow22
+ SSUB16 e1, trRow02,trRow22
+ SHADD16 e2, trRow12,constZero ;//(f1>>1) constZero is a register holding 0
+ SHADD16 e3, trRow32,constZero
+ SSUB16 e2, e2, trRow32
+ SADD16 e3, e3, trRow12
+ SADD16 rowOp02, e0, e3
+ SADD16 rowOp12, e1, e2
+ SSUB16 rowOp22, e1, e2
+ SSUB16 rowOp32, e0, e3
+
+
+ ;*****************************************************************
+ ;// Transpose the resultant matrix
+ ;// Input: rowOp[][]
+ ;// Output: trCol[][]
+ ;*****************************************************************
+
+ ;// LL 2x2 transposed matrix
+ ;// d0 d1 - -
+ ;// d4 d5 - -
+ ;// - - - -
+ ;// - - - -
+
+ PKHTB trCol10,rowOp10,rowOp00,ASR #16 ;// [5 4] = [f5:f1]
+ PKHBT trCol00,rowOp00,rowOp10,LSL #16 ;// [1 0] = [f4:f0]
+
+ ;// HL 2x2 transposed matrix
+ ;// - - - -
+ ;// - - - -
+ ;// d8 d9 - -
+ ;// d12 d13 - -
+
+
+ PKHTB trCol30,rowOp12,rowOp02,ASR #16 ;// [13 12] = [7 3]
+ PKHBT trCol20,rowOp02,rowOp12,LSL #16 ;// [9 8] = [6 2]
+
+ ;// LH 2x2 transposed matrix
+ ;// - - d2 d3
+ ;// - - d6 d7
+ ;// - - - -
+ ;// - - - -
+
+ PKHBT trCol02,rowOp20,rowOp30,LSL #16 ;// [3 2] = [f12:f8]
+ PKHTB trCol12,rowOp30,rowOp20,ASR #16 ;// [7 6] = [f13:f9]
+
+
+
+
+ ;// HH 2x2 transposed matrix
+ ;// - - - -
+ ;// - - - -
+ ;// - - d10 d11
+ ;// - - d14 d15
+
+ PKHTB trCol32,rowOp32,rowOp22,ASR #16 ;// [15 14] = [15 11]
+ PKHBT trCol22,rowOp22,rowOp32,LSL #16 ;// [11 10] = [14 10]
+
+
+ ;*******************************
+ ;// Coloumn Operations
+ ;*******************************
+
+
+ ;// SIMD operations on first two columns
+
+
+ SADD16 g0, trCol00,trCol20
+ SSUB16 g1, trCol00,trCol20
+ SHADD16 g2, trCol10,constZero ;// (f1>>1) constZero is a register holding 0
+ SHADD16 g3, trCol30,constZero
+ SSUB16 g2, g2, trCol30
+ SADD16 g3, g3, trCol10
+ SADD16 colOp00, g0, g3
+ SADD16 colOp10, g1, g2
+ SSUB16 colOp20, g1, g2
+ SSUB16 colOp30, g0, g3
+
+ ;// SIMD operations on next two columns
+
+ SADD16 g0, trCol02,trCol22
+ SSUB16 g1, trCol02,trCol22
+ SHADD16 g2, trCol12,constZero ;// (f1>>1) constZero is a register holding 0
+ SHADD16 g3, trCol32,constZero
+ SSUB16 g2, g2, trCol32
+ SADD16 g3, g3, trCol12
+ SADD16 colOp02, g0, g3
+ SADD16 colOp12, g1, g2
+ SSUB16 colOp22, g1, g2
+ SSUB16 colOp32, g0, g3
+
+
+
+
+
+ ;************************************************
+ ;// Calculate final value (colOp[i][j] + 32)>>6
+ ;************************************************
+
+ ;// const1: Serves dual purpose
+ ;// (1) Add #32 to both the lower and higher 16bits of the SIMD result
+ ;// (2) Convert the lower 16 bit value to an unsigned number (Add 32768)
+
+ LDR const1, =0x00208020
+
+ LDR mask, =0xffff03ff ;// Used to mask the down shifted 6 bits
+
+ ;// const2(#512): used to convert the lower 16bit number back to signed value
+
+ MOV const2,#0x200 ;// const2 = 2^9
+
+ ;// First Row
+
+ SADD16 colOp00, colOp00, const1
+ SADD16 colOp02, colOp02, const1
+ AND colOp00, mask, colOp00, ASR #6
+ AND colOp02, mask, colOp02, ASR #6
+ SSUB16 out00,colOp00,const2
+ SSUB16 out02,colOp02,const2
+
+
+ ;// Second Row
+
+ SADD16 colOp10, colOp10, const1
+ SADD16 colOp12, colOp12, const1
+ AND colOp10, mask, colOp10, ASR #6
+ AND colOp12, mask, colOp12, ASR #6
+ SSUB16 out10,colOp10,const2
+ SSUB16 out12,colOp12,const2
+
+
+ ;// Third Row
+
+ SADD16 colOp20, colOp20, const1
+ SADD16 colOp22, colOp22, const1
+ AND colOp20, mask, colOp20, ASR #6
+ AND colOp22, mask, colOp22, ASR #6
+ SSUB16 out20,colOp20,const2
+ SSUB16 out22,colOp22,const2
+
+
+ ;// Fourth Row
+
+ SADD16 colOp30, colOp30, const1
+ SADD16 colOp32, colOp32, const1
+ AND colOp30, mask, colOp30, ASR #6
+ AND colOp32, mask, colOp32, ASR #6
+ SSUB16 out30,colOp30,const2
+ SSUB16 out32,colOp32,const2
+
+
+
+
+ ;***************************
+ ;// Store all the 4x4 pixels
+ ;***************************
+
+ STMIA pDst,{out00,out02,out10,out12,out20,out22,out30,out32}
+
+
+
+ ;// Set return value
+
+End
+
+
+ ;// Write function tail
+ M_END
+
+ ENDIF ;//ARM1136JS
+
+
+
+
+
+
+
+;// Guarding implementation by the processor name
+
+
+ END \ No newline at end of file