media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p10/src/armVCM4P10_TransformResidual4x4_s.s


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407

;//
;// 
;// File Name:  armVCM4P10_TransformResidual4x4_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   9641
;// Date:       Thursday, February 7, 2008
;// 
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;// 
;// 
;//
;// Description:
;// Transform Residual 4x4 Coefficients
;// 
;// 

        
;// Include standard headers

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h
        
        M_VARIANTS ARM1136JS
        
;// Import symbols required from other files
;// (For example tables)
    
        
;// Set debugging level        
;//DEBUG_ON    SETL {TRUE}


;// Guarding implementation by the processor name
    
    IF  ARM1136JS 
    
;//Input Registers
pDst                RN  0
pSrc                RN  1

;//Output Registers


;//Local Scratch Registers

;// Packed Input pixels
in00                RN  2                   ;// Src[0] & Src[1] 
in02                RN  3                   ;// Src[2] & Src[3]
in10                RN  4                   ;// Src[4] & Src[5]
in12                RN  5                   ;// Src[6] & Src[7]
in20                RN  6                   ;// Src[8] & Src[9]
in22                RN  7                   ;// Src[10] & Src[11]
in30                RN  8                   ;// Src[12] & Src[13]
in32                RN  9                   ;// Src[14] & Src[15]

;// Transpose for Row operations (Rows to cols)
trRow00             RN  2
trRow10             RN  10
trRow02             RN  3
trRow12             RN  5
trRow20             RN  11
trRow30             RN  12
trRow32             RN  14
trRow22             RN  7

;// Intermediate calculations
e0                  RN  4                   
e1                  RN  6
e2                  RN  8
e3                  RN  9
constZero           RN  1

;// Row operated pixels
rowOp00             RN  2
rowOp10             RN  10
rowOp20             RN  11
rowOp30             RN  12
rowOp02             RN  3
rowOp12             RN  5
rowOp22             RN  7
rowOp32             RN  14

;// Transpose for colulmn operations
trCol00             RN  2                   
trCol02             RN  3                   
trCol10             RN  4                   
trCol12             RN  5                   
trCol20             RN  6                   
trCol22             RN  7                   
trCol30             RN  8                   
trCol32             RN  9  

;// Intermediate calculations
g0                  RN  10
g1                  RN  11
g2                  RN  12
g3                  RN  14   

;// Coloumn operated pixels
colOp00             RN  2                   
colOp02             RN  3                   
colOp10             RN  4                   
colOp12             RN  5                   
colOp20             RN  6                   
colOp22             RN  7                   
colOp30             RN  8                   
colOp32             RN  9  


temp1               RN  10                  ;// Temporary scratch varaibles
const1              RN  11      
const2              RN  12
mask                RN  14

;// Output pixels
out00               RN  2                   
out02               RN  3                   
out10               RN  4                   
out12               RN  5                   
out20               RN  6                   
out22               RN  7                   
out30               RN  8                   
out32               RN  9  
      
       
    ;// Allocate stack memory required by the function
        

    ;// Write function header
        M_START armVCM4P10_TransformResidual4x4,r11
        
        ;******************************************************************
        ;// The strategy used in implementing the transform is as follows:*
        ;// Load the 4x4 block into 8 registers                           *  
        ;// Transpose the 4x4 matrix                                      *  
        ;// Perform the row operations (on columns) using SIMD            *  
        ;// Transpose the 4x4 result matrix                               *  
        ;// Perform the coloumn operations                                *
        ;// Store the 4x4 block at one go                                 *  
        ;******************************************************************

        ;// Load all the 4x4 pixels
        
        LDMIA   pSrc,{in00,in02,in10,in12,in20,in22,in30,in32}
        
        MOV       constZero,#0                                     ;// Used to right shift by 1 
        ;LDR       constZero,=0x00000000  
        
        ;*****************************************************************
        ;//
        ;// Transpose the matrix inorder to perform row ops as coloumn ops
        ;// Input:   in[][] = original matrix
        ;// Output:  trRow[][]= transposed matrix
        ;// Step1: Obtain the LL part of the transposed matrix
        ;// Step2: Obtain the HL part
        ;// step3: Obtain the LH part
        ;// Step4: Obtain the HH part
        ;//
        ;*****************************************************************
        
        ;// LL 2x2 transposed matrix 
        ;//   d0 d1 - -
        ;//   d4 d5 - -
        ;//   -  -  - -
        ;//   -  -  - -
        
        PKHTB   trRow10,in10,in00,ASR #16               ;// [5 4] = [f5:f1]    
        PKHBT   trRow00,in00,in10,LSL #16               ;// [1 0] = [f4:f0]  
        
        ;// HL 2x2 transposed matrix  
        ;//    -   -   - -
        ;//    -   -   - -
        ;//    d8  d9  - -
        ;//   d12 d13  - -
        
         
         PKHTB   trRow30,in12,in02,ASR #16              ;// [13 12] = [7 3]
         PKHBT   trRow20,in02,in12,LSL #16              ;// [9 8] = [6 2] 
        
        ;// LH 2x2 transposed matrix 
        ;//   - - d2 d3 
        ;//   - - d6 d7 
        ;//   - - -  -
        ;//   - - -  -
        
        PKHBT   trRow02,in20,in30,LSL #16               ;// [3 2] = [f12:f8]  
        PKHTB   trRow12,in30,in20,ASR #16               ;// [7 6] = [f13:f9] 
        
        
        ;// HH 2x2 transposed matrix  
        ;//    - -   -   -
        ;//    - -   -   -
        ;//    - -  d10 d11
        ;//    - -  d14 d15
        
        PKHTB   trRow32,in32,in22,ASR #16               ;// [15 14] = [15 11]
        PKHBT   trRow22,in22,in32,LSL #16               ;// [11 10] = [14 10]
       
        
        ;**************************************** 
        ;// Row Operations (Performed on columns)
        ;**************************************** 
        
        
        ;// SIMD operations on first two columns(two rows of the original matrix)
        
        
        SADD16      e0, trRow00,trRow20                   ;//  e0 = d0 + d2 
        SSUB16    e1, trRow00,trRow20                   ;//  e1 = d0 - d2  
        SHADD16   e2, trRow10,constZero                 ;// (f1>>1) constZero is a register holding 0
        SHADD16   e3, trRow30,constZero                 ;//  avoid pipeline stalls for e2 and e3
        SSUB16    e2, e2, trRow30                       ;//  e2 = (d1>>1) - d3  
        SADD16    e3, e3, trRow10                       ;//  e3 = d1 + (d3>>1)  
        SADD16    rowOp00, e0, e3                       ;//  f0 = e0 + e3  
        SADD16    rowOp10, e1, e2                       ;//  f1 = e1 + e2  
        SSUB16    rowOp20, e1, e2                       ;//  f2 = e1 - e2  
        SSUB16    rowOp30, e0, e3                       ;//  f3 = e0 - e3
        
        ;// SIMD operations on next two columns(next two rows of the original matrix)
        
        SADD16      e0, trRow02,trRow22
        SSUB16    e1, trRow02,trRow22
        SHADD16   e2, trRow12,constZero                 ;//(f1>>1) constZero is a register holding 0
        SHADD16   e3, trRow32,constZero
        SSUB16    e2, e2, trRow32
        SADD16    e3, e3, trRow12
        SADD16    rowOp02, e0, e3
        SADD16    rowOp12, e1, e2
        SSUB16    rowOp22, e1, e2
        SSUB16    rowOp32, e0, e3
        
        
        ;*****************************************************************
        ;// Transpose the resultant matrix
        ;// Input:  rowOp[][]
        ;// Output: trCol[][] 
        ;*****************************************************************
        
        ;// LL 2x2 transposed matrix 
        ;//   d0 d1 - -
        ;//   d4 d5 - -
        ;//   -  -  - -
        ;//   -  -  - -
        
        PKHTB   trCol10,rowOp10,rowOp00,ASR #16           ;// [5 4] = [f5:f1]
        PKHBT   trCol00,rowOp00,rowOp10,LSL #16           ;// [1 0] = [f4:f0]  
        
        ;// HL 2x2 transposed matrix  
        ;//    -   -   - -
        ;//    -   -   - -
        ;//    d8  d9  - -
        ;//   d12 d13  - -
        
         
         PKHTB   trCol30,rowOp12,rowOp02,ASR #16          ;// [13 12] = [7 3]
         PKHBT   trCol20,rowOp02,rowOp12,LSL #16          ;// [9 8] = [6 2] 
        
        ;// LH 2x2 transposed matrix 
        ;//   - - d2 d3 
        ;//   - - d6 d7 
        ;//   - - -  -
        ;//   - - -  -
        
        PKHBT   trCol02,rowOp20,rowOp30,LSL #16           ;// [3 2] = [f12:f8]  
        PKHTB   trCol12,rowOp30,rowOp20,ASR #16           ;// [7 6] = [f13:f9] 
        
        
        ;// HH 2x2 transposed matrix  
        ;//    - -   -   -
        ;//    - -   -   -
        ;//    - -  d10 d11
        ;//    - -  d14 d15
        
        PKHTB   trCol32,rowOp32,rowOp22,ASR #16            ;// [15 14] = [15 11]
        PKHBT   trCol22,rowOp22,rowOp32,LSL #16            ;// [11 10] = [14 10]
       
        
        ;******************************* 
        ;// Coloumn Operations 
        ;******************************* 
        
        
        ;// SIMD operations on first two columns
        
          
        SADD16      g0, trCol00,trCol20
        SSUB16    g1, trCol00,trCol20
        SHADD16   g2, trCol10,constZero                     ;// (f1>>1) constZero is a register holding 0
        SHADD16   g3, trCol30,constZero
        SSUB16    g2, g2, trCol30
        SADD16    g3, g3, trCol10
        SADD16    colOp00, g0, g3
        SADD16    colOp10, g1, g2
        SSUB16    colOp20, g1, g2
        SSUB16    colOp30, g0, g3
        
        ;// SIMD operations on next two columns
        
        SADD16      g0, trCol02,trCol22
        SSUB16    g1, trCol02,trCol22
        SHADD16   g2, trCol12,constZero                     ;// (f1>>1) constZero is a register holding 0
        SHADD16   g3, trCol32,constZero
        SSUB16    g2, g2, trCol32
        SADD16    g3, g3, trCol12
        SADD16    colOp02, g0, g3
        SADD16    colOp12, g1, g2
        SSUB16    colOp22, g1, g2
        SSUB16    colOp32, g0, g3
        
        
        ;************************************************
        ;// Calculate final value (colOp[i][j] + 32)>>6
        ;************************************************
        
        ;// const1: Serves dual purpose 
        ;// (1) Add #32 to both the lower and higher 16bits of the SIMD result 
        ;// (2) Convert the lower 16 bit value to an unsigned number (Add 32768)
        
        LDR     const1, =0x00208020             
        
        LDR     mask, =0xffff03ff                       ;// Used to mask the down shifted 6 bits  
        
        ;// const2(#512): used to convert the lower 16bit number back to signed value 
      
        MOV     const2,#0x200                           ;// const2 = 2^9
        
        ;// First Row 
        
        SADD16    colOp00, colOp00, const1
        SADD16    colOp02, colOp02, const1
        AND     colOp00, mask, colOp00, ASR #6
        AND     colOp02, mask, colOp02, ASR #6
        SSUB16  out00,colOp00,const2
        SSUB16  out02,colOp02,const2    
        

        ;// Second Row
        
        SADD16    colOp10, colOp10, const1
        SADD16    colOp12, colOp12, const1
        AND     colOp10, mask, colOp10, ASR #6
        AND     colOp12, mask, colOp12, ASR #6
        SSUB16  out10,colOp10,const2
        SSUB16  out12,colOp12,const2    
        
        
        ;// Third Row
        
        SADD16    colOp20, colOp20, const1
        SADD16    colOp22, colOp22, const1
        AND     colOp20, mask, colOp20, ASR #6
        AND     colOp22, mask, colOp22, ASR #6
        SSUB16  out20,colOp20,const2
        SSUB16  out22,colOp22,const2
        
        
        ;// Fourth Row   
        
        SADD16    colOp30, colOp30, const1
        SADD16    colOp32, colOp32, const1
        AND     colOp30, mask, colOp30, ASR #6
        AND     colOp32, mask, colOp32, ASR #6
        SSUB16  out30,colOp30,const2
        SSUB16  out32,colOp32,const2
        
        
        ;***************************
        ;// Store all the 4x4 pixels
        ;***************************
        
        STMIA   pDst,{out00,out02,out10,out12,out20,out22,out30,out32}
        
                               
        ;// Set return value
        
End                

        
        ;// Write function tail
        M_END
        
    ENDIF                                                           ;//ARM1136JS    
    
    
;// Guarding implementation by the processor name
    
            
    END