diff options
Diffstat (limited to 'media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p10/src/armVCM4P10_TransformResidual4x4_s.s')
-rw-r--r-- | media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p10/src/armVCM4P10_TransformResidual4x4_s.s | 407 |
1 files changed, 407 insertions, 0 deletions
diff --git a/media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p10/src/armVCM4P10_TransformResidual4x4_s.s b/media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p10/src/armVCM4P10_TransformResidual4x4_s.s
new file mode 100644
index 0000000..241d188
--- /dev/null
+++ b/media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p10/src/armVCM4P10_TransformResidual4x4_s.s
@@ -0,0 +1,407 @@
+;//
+;//
+;// File Name:  armVCM4P10_TransformResidual4x4_s.s
+;// OpenMAX DL: v1.0.2
+;// Revision:   9641
+;// Date:       Thursday, February 7, 2008
+;//
+;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+;//
+;//
+;//
+;// Description:
+;// Transform Residual 4x4 Coefficients
+;// Loads a 4x4 block of packed 16-bit values from pSrc, applies the row and
+;// column butterflies, computes (x + 32) >> 6 per element and stores to pDst.
+
+
+;// Include standard headers
+
+        INCLUDE omxtypes_s.h
+        INCLUDE armCOMM_s.h
+
+        M_VARIANTS ARM1136JS
+
+;// Import symbols required from other files
+;// (For example tables)
+
+
+
+
+;// Set debugging level
+;//DEBUG_ON SETL {TRUE}
+
+
+
+;// Guarding implementation by the processor name
+
+    IF ARM1136JS
+
+;//Input Registers
+pDst            RN 0    ;// Base address of the final 8-word store
+pSrc            RN 1    ;// Base address of the initial 8-word load
+
+;//Output Registers
+;// (none declared; results are written to memory through pDst)
+
+;//Local Scratch Registers
+;// NOTE: the aliases below reuse r1-r12 and r14 from stage to stage, so the instruction order in the body is significant.
+;// Packed Input pixels
+in00            RN 2    ;// Src[0]  & Src[1]
+in02            RN 3    ;// Src[2]  & Src[3]
+in10            RN 4    ;// Src[4]  & Src[5]
+in12            RN 5    ;// Src[6]  & Src[7]
+in20            RN 6    ;// Src[8]  & Src[9]
+in22            RN 7    ;// Src[10] & Src[11]
+in30            RN 8    ;// Src[12] & Src[13]
+in32            RN 9    ;// Src[14] & Src[15]
+
+;// Transpose for Row operations (Rows to cols)
+trRow00         RN 2
+trRow10         RN 10
+trRow02         RN 3
+trRow12         RN 5
+trRow20         RN 11
+trRow30         RN 12
+trRow32         RN 14
+trRow22         RN 7
+
+;// Intermediate calculations
+e0              RN 4
+e1              RN 6
+e2              RN 8
+e3              RN 9
+constZero       RN 1    ;// Same register as pSrc; set to 0 only after the input block has been loaded
+
+;// Row operated pixels
+rowOp00         RN 2
+rowOp10         RN 10
+rowOp20         RN 11
+rowOp30         RN 12
+rowOp02         RN 3
+rowOp12         RN 5
+rowOp22         RN 7
+rowOp32         RN 14
+
+;// Transpose for column operations
+trCol00         RN 2
+trCol02         RN 3
+trCol10         RN 4
+trCol12         RN 5
+trCol20         RN 6
+trCol22         RN 7
+trCol30         RN 8
+trCol32         RN 9
+
+;// Intermediate calculations
+g0              RN 10
+g1              RN 11
+g2              RN 12
+g3              RN 14
+
+;// Column operated pixels
+colOp00         RN 2
+colOp02         RN 3
+colOp10         RN 4
+colOp12         RN 5
+colOp20         RN 6
+colOp22         RN 7
+colOp30         RN 8
+colOp32         RN 9
+
+
+temp1           RN 10   ;// Temporary scratch variables (temp1 itself is not referenced in the code below)
+const1          RN 11
+const2          RN 12
+mask            RN 14
+
+;// Output pixels
+out00           RN 2
+out02           RN 3
+out10           RN 4
+out12           RN 5
+out20           RN 6
+out22           RN 7
+out30           RN 8
+out32           RN 9
+;// armVCM4P10_TransformResidual4x4
+;//   In: r0 = pDst, r1 = pSrc. Reads 8 words starting at pSrc, writes 8 words starting at pDst.
+;//   Uses r2-r12 and r14 as working registers; r1 is reused as a zero constant after the load.
+        ;// Allocate stack memory required by the function
+
+        ;// NOTE(review): M_START/M_END are macros from armCOMM_s.h; the "r11" argument presumably makes them save/restore r4-r11 and lr -- confirm there, since r14 is used as scratch below.
+        ;// Write function header
+        M_START armVCM4P10_TransformResidual4x4,r11
+
+        ;******************************************************************
+        ;// The strategy used in implementing the transform is as follows:*
+        ;// Load the 4x4 block into 8 registers                           *
+        ;// Transpose the 4x4 matrix                                      *
+        ;// Perform the row operations (on columns) using SIMD            *
+        ;// Transpose the 4x4 result matrix                               *
+        ;// Perform the column operations                                 *
+        ;// Store the 4x4 block at one go                                 *
+        ;******************************************************************
+
+        ;// Load all the 4x4 pixels
+
+        LDMIA   pSrc,{in00,in02,in10,in12,in20,in22,in30,in32}
+        ;// pSrc (r1) is not needed after this load, so the same register now becomes constZero
+        MOV     constZero,#0                            ;// SHADD16 with this zero halves each halfword (right shift by 1)
+        ;LDR     constZero,=0x00000000
+
+        ;*****************************************************************
+        ;//
+        ;// Transpose the matrix in order to perform row ops as column ops
+        ;// Input:  in[][]    = original matrix
+        ;// Output: trRow[][] = transposed matrix
+        ;// Step1: Obtain the LL part of the transposed matrix
+        ;// Step2: Obtain the HL part
+        ;// Step3: Obtain the LH part
+        ;// Step4: Obtain the HH part
+        ;//
+        ;*****************************************************************
+        ;// With hi()/lo() = bits [31:16]/[15:0]: PKHTB d,a,b,ASR #16 gives d = hi(a):hi(b); PKHBT d,a,b,LSL #16 gives d = lo(b):lo(a)
+        ;// LL 2x2 transposed matrix
+        ;//   d0 d1 - -
+        ;//   d4 d5 - -
+        ;//   -  -  - -
+        ;//   -  -  - -
+
+        PKHTB   trRow10,in10,in00,ASR #16               ;// [5 4] = [f5:f1]
+        PKHBT   trRow00,in00,in10,LSL #16               ;// [1 0] = [f4:f0]; trRow00 is the same register as in00, so this must follow the PKHTB above
+
+        ;// HL 2x2 transposed matrix
+        ;//    -   -   - -
+        ;//    -   -   - -
+        ;//    d8  d9  - -
+        ;//   d12 d13  - -
+
+
+        PKHTB   trRow30,in12,in02,ASR #16               ;// [13 12] = [7 3]
+        PKHBT   trRow20,in02,in12,LSL #16               ;// [9 8] = [6 2]
+
+        ;// LH 2x2 transposed matrix
+        ;//   - - d2 d3
+        ;//   - - d6 d7
+        ;//   - - -  -
+        ;//   - - -  -
+
+        PKHBT   trRow02,in20,in30,LSL #16               ;// [3 2] = [f12:f8]; overwrites r3 (in02), already consumed by the HL step
+        PKHTB   trRow12,in30,in20,ASR #16               ;// [7 6] = [f13:f9]; overwrites r5 (in12), already consumed by the HL step
+
+
+
+
+        ;// HH 2x2 transposed matrix
+        ;//   - -  -   -
+        ;//   - -  -   -
+        ;//   - - d10 d11
+        ;//   - - d14 d15
+
+        PKHTB   trRow32,in32,in22,ASR #16               ;// [15 14] = [15 11]
+        PKHBT   trRow22,in22,in32,LSL #16               ;// [11 10] = [14 10]; trRow22 is the same register as in22, so this must come last
+
+
+        ;****************************************
+        ;// Row Operations (Performed on columns)
+        ;****************************************
+
+
+        ;// SIMD operations on first two columns(two rows of the original matrix)
+
+
+        SADD16      e0, trRow00,trRow20                 ;// e0 = d0 + d2
+        SSUB16      e1, trRow00,trRow20                 ;// e1 = d0 - d2
+        SHADD16     e2, trRow10,constZero               ;// e2 = d1>>1 (halving add with constZero, a register holding 0)
+        SHADD16     e3, trRow30,constZero               ;// e3 = d3>>1 (computed here to avoid pipeline stalls for e2 and e3)
+        SSUB16      e2, e2, trRow30                     ;// e2 = (d1>>1) - d3
+        SADD16      e3, e3, trRow10                     ;// e3 = d1 + (d3>>1)
+        SADD16      rowOp00, e0, e3                     ;// f0 = e0 + e3
+        SADD16      rowOp10, e1, e2                     ;// f1 = e1 + e2
+        SSUB16      rowOp20, e1, e2                     ;// f2 = e1 - e2
+        SSUB16      rowOp30, e0, e3                     ;// f3 = e0 - e3
+
+        ;// SIMD operations on next two columns(next two rows of the original matrix)
+
+        SADD16      e0, trRow02,trRow22                 ;// e0 = d0 + d2
+        SSUB16      e1, trRow02,trRow22                 ;// e1 = d0 - d2
+        SHADD16     e2, trRow12,constZero               ;// e2 = d1>>1 (constZero is a register holding 0)
+        SHADD16     e3, trRow32,constZero               ;// e3 = d3>>1
+        SSUB16      e2, e2, trRow32                     ;// e2 = (d1>>1) - d3
+        SADD16      e3, e3, trRow12                     ;// e3 = d1 + (d3>>1)
+        SADD16      rowOp02, e0, e3                     ;// f0 = e0 + e3
+        SADD16      rowOp12, e1, e2                     ;// f1 = e1 + e2
+        SSUB16      rowOp22, e1, e2                     ;// f2 = e1 - e2
+        SSUB16      rowOp32, e0, e3                     ;// f3 = e0 - e3
+
+
+        ;*****************************************************************
+        ;// Transpose the resultant matrix
+        ;// Input:  rowOp[][]
+        ;// Output: trCol[][]
+        ;*****************************************************************
+        ;// Same PKHTB/PKHBT pattern as the first transpose, applied to the rowOp registers
+        ;// LL 2x2 transposed matrix
+        ;//   d0 d1 - -
+        ;//   d4 d5 - -
+        ;//   -  -  - -
+        ;//   -  -  - -
+
+        PKHTB   trCol10,rowOp10,rowOp00,ASR #16         ;// [5 4] = [f5:f1]
+        PKHBT   trCol00,rowOp00,rowOp10,LSL #16         ;// [1 0] = [f4:f0]; trCol00 is the same register as rowOp00, so this must follow the PKHTB above
+
+        ;// HL 2x2 transposed matrix
+        ;//    -   -   - -
+        ;//    -   -   - -
+        ;//    d8  d9  - -
+        ;//   d12 d13  - -
+
+
+        PKHTB   trCol30,rowOp12,rowOp02,ASR #16         ;// [13 12] = [7 3]
+        PKHBT   trCol20,rowOp02,rowOp12,LSL #16         ;// [9 8] = [6 2]
+
+        ;// LH 2x2 transposed matrix
+        ;//   - - d2 d3
+        ;//   - - d6 d7
+        ;//   - - -  -
+        ;//   - - -  -
+
+        PKHBT   trCol02,rowOp20,rowOp30,LSL #16         ;// [3 2] = [f12:f8]; overwrites r3 (rowOp02), already consumed by the HL step
+        PKHTB   trCol12,rowOp30,rowOp20,ASR #16         ;// [7 6] = [f13:f9]; overwrites r5 (rowOp12), already consumed by the HL step
+
+
+
+
+        ;// HH 2x2 transposed matrix
+        ;//   - -  -   -
+        ;//   - -  -   -
+        ;//   - - d10 d11
+        ;//   - - d14 d15
+
+        PKHTB   trCol32,rowOp32,rowOp22,ASR #16         ;// [15 14] = [15 11]
+        PKHBT   trCol22,rowOp22,rowOp32,LSL #16         ;// [11 10] = [14 10]; trCol22 is the same register as rowOp22, so this must come last
+
+
+        ;*******************************
+        ;// Column Operations
+        ;*******************************
+
+
+        ;// SIMD operations on first two columns
+
+
+        SADD16      g0, trCol00,trCol20                 ;// g0 = f0 + f2
+        SSUB16      g1, trCol00,trCol20                 ;// g1 = f0 - f2
+        SHADD16     g2, trCol10,constZero               ;// g2 = f1>>1 (constZero is a register holding 0)
+        SHADD16     g3, trCol30,constZero               ;// g3 = f3>>1
+        SSUB16      g2, g2, trCol30                     ;// g2 = (f1>>1) - f3
+        SADD16      g3, g3, trCol10                     ;// g3 = f1 + (f3>>1)
+        SADD16      colOp00, g0, g3                     ;// colOp0 = g0 + g3
+        SADD16      colOp10, g1, g2                     ;// colOp1 = g1 + g2
+        SSUB16      colOp20, g1, g2                     ;// colOp2 = g1 - g2
+        SSUB16      colOp30, g0, g3                     ;// colOp3 = g0 - g3
+
+        ;// SIMD operations on next two columns
+
+        SADD16      g0, trCol02,trCol22                 ;// g0 = f0 + f2
+        SSUB16      g1, trCol02,trCol22                 ;// g1 = f0 - f2
+        SHADD16     g2, trCol12,constZero               ;// g2 = f1>>1 (constZero is a register holding 0)
+        SHADD16     g3, trCol32,constZero               ;// g3 = f3>>1
+        SSUB16      g2, g2, trCol32                     ;// g2 = (f1>>1) - f3
+        SADD16      g3, g3, trCol12                     ;// g3 = f1 + (f3>>1)
+        SADD16      colOp02, g0, g3                     ;// colOp0 = g0 + g3
+        SADD16      colOp12, g1, g2                     ;// colOp1 = g1 + g2
+        SSUB16      colOp22, g1, g2                     ;// colOp2 = g1 - g2
+        SSUB16      colOp32, g0, g3                     ;// colOp3 = g0 - g3
+
+
+
+
+
+        ;************************************************
+        ;// Calculate final value (colOp[i][j] + 32)>>6
+        ;************************************************
+
+        ;// const1: Serves dual purpose
+        ;// (1) Add #32 to both the lower and higher 16bits of the SIMD result
+        ;// (2) Convert the lower 16 bit value to an unsigned number (Add 32768)
+        ;// i.e. 0x0020 goes to the upper halfword and 0x8020 (32768 + 32) to the lower halfword
+        LDR     const1, =0x00208020
+
+        LDR     mask, =0xffff03ff                       ;// Clears bits [15:10], where the full-word ASR #6 deposits the low 6 bits of the upper halfword
+
+        ;// const2(#512 = 32768>>6): used to convert the lower 16bit number back to signed value
+
+        MOV     const2,#0x200                           ;// const2 = 2^9
+
+        ;// First Row
+
+        SADD16    colOp00, colOp00, const1              ;// add rounding (both halves) and the bias (lower half)
+        SADD16    colOp02, colOp02, const1
+        AND       colOp00, mask, colOp00, ASR #6        ;// shift the whole word right by 6, then clear the spill-over bits
+        AND       colOp02, mask, colOp02, ASR #6
+        SSUB16    out00,colOp00,const2                  ;// subtract the shifted bias (512) from the lower half only
+        SSUB16    out02,colOp02,const2
+
+
+        ;// Second Row (same sequence as the first row)
+
+        SADD16    colOp10, colOp10, const1
+        SADD16    colOp12, colOp12, const1
+        AND       colOp10, mask, colOp10, ASR #6
+        AND       colOp12, mask, colOp12, ASR #6
+        SSUB16    out10,colOp10,const2
+        SSUB16    out12,colOp12,const2
+
+
+        ;// Third Row (same sequence as the first row)
+
+        SADD16    colOp20, colOp20, const1
+        SADD16    colOp22, colOp22, const1
+        AND       colOp20, mask, colOp20, ASR #6
+        AND       colOp22, mask, colOp22, ASR #6
+        SSUB16    out20,colOp20,const2
+        SSUB16    out22,colOp22,const2
+
+
+        ;// Fourth Row (same sequence as the first row)
+
+        SADD16    colOp30, colOp30, const1
+        SADD16    colOp32, colOp32, const1
+        AND       colOp30, mask, colOp30, ASR #6
+        AND       colOp32, mask, colOp32, ASR #6
+        SSUB16    out30,colOp30,const2
+        SSUB16    out32,colOp32,const2
+
+
+
+
+        ;***************************
+        ;// Store all the 4x4 pixels
+        ;***************************
+
+        STMIA   pDst,{out00,out02,out10,out12,out20,out22,out30,out32}
+
+
+
+        ;// Set return value (no instruction is emitted for this step)
+
+End
+
+
+        ;// Write function tail
+        M_END
+
+    ENDIF                                               ;//ARM1136JS
+
+
+
+
+
+
+
+;// Guarding implementation by the processor name
+
+
+    END
\ No newline at end of file |