;// media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p10/src/omxVCM4P10_TransformDequantLumaDCFromPair_s.s

;//
;// 
;// File Name:  omxVCM4P10_TransformDequantLumaDCFromPair_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   9641
;// Date:       Thursday, February 7, 2008
;// 
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;// 
;// 
;//
;// Description:
;// H.264 inverse quantize and transform module
;// 
;// 

;// Include standard headers

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h
        
;// Import/Export symbols required from/to other files
;// (For example tables)
        
        IMPORT armVCM4P10_UnpackBlock4x4 
        IMPORT armVCM4P10_QPDivTable
        IMPORT armVCM4P10_VMatrixQPModTable
        
        M_VARIANTS ARM1136JS

;// Set debugging level        
;//DEBUG_ON    SETL {TRUE}


;// Static Function: armVCM4P10_InvTransformDequantLumaDC4x4
    

;// Guarding implementation by the processor name
    
    IF  ARM1136JS 


;//--------------------------------------------------------------------------
;// Register aliases for armVCM4P10_InvTransformDequantLumaDC4x4.
;//
;// The aliases below are grouped by processing phase.  Many of them name
;// the SAME physical register (for example in00, trRow00, rowOp00, trCol00,
;// colOp00 and out00 are all r2), so a register's meaning changes as the
;// function moves from one phase to the next.  An alias may only be written
;// once every alias that shares its register has been consumed.
;//--------------------------------------------------------------------------

;//Input Registers
pData               RN  0
QP                  RN  1

;//Output Registers
;// (none - the function defines no result register alias)


;//Local Scratch Registers

;// Packed input pixels: eight words loaded from pData, two 16-bit values each
in00                RN  2                   ;// Src[0] & Src[1]
in02                RN  3                   ;// Src[2] & Src[3]
in10                RN  4                   ;// Src[4] & Src[5]
in12                RN  5                   ;// Src[6] & Src[7]
in20                RN  6                   ;// Src[8] & Src[9]
in22                RN  7                   ;// Src[10] & Src[11]
in30                RN  8                   ;// Src[12] & Src[13]
in32                RN  9                   ;// Src[14] & Src[15]

;// Transpose for Row operations (Rows to cols)
;// r10, r11, r12 and r14 are the four registers not occupied by in[][]
trRow00             RN  2
trRow10             RN  10
trRow02             RN  3
trRow12             RN  5
trRow20             RN  11
trRow30             RN  12
trRow32             RN  14
trRow22             RN  7

;// Intermediate calculations (row pass); these reuse in10/in20/in30/in32
rowSum1             RN  4
rowSum2             RN  6
rowDiff1            RN  8
rowDiff2            RN  9


;// Row operated pixels (same registers as the matching trRow aliases)
rowOp00             RN  2
rowOp10             RN  10
rowOp20             RN  11
rowOp30             RN  12
rowOp02             RN  3
rowOp12             RN  5
rowOp22             RN  7
rowOp32             RN  14

;// Transpose for column operations
trCol00             RN  2                   
trCol02             RN  3                   
trCol10             RN  4                   
trCol12             RN  5                   
trCol20             RN  6                   
trCol22             RN  7                   
trCol30             RN  8                   
trCol32             RN  9  

;// Intermediate calculations (column pass)
colSum1             RN  10
colSum2             RN  11
colDiff1            RN  12
colDiff2            RN  14


;// Column operated pixels (same registers as the matching trCol aliases)
colOp00             RN  2                   
colOp02             RN  3                   
colOp10             RN  4                   
colOp12             RN  5                   
colOp20             RN  6                   
colOp22             RN  7                   
colOp30             RN  8                   
colOp32             RN  9  

;// Temporary scratch variables
;// pQPDivTable and Round share r0 with pData.
;// pQPModTable, Shift and Scale share r11, r10 and r14 with the column
;// sums/differences.
pQPDivTable         RN  0
pQPModTable         RN  11
Shift               RN  10
Scale               RN  14
Round               RN  0

;// temp1-temp3 share r10-r12 with the column intermediates and
;// Shift/pQPModTable; temp4 shares r1 with QP.
temp1               RN  10
temp2                RN  11
temp3               RN  12
temp4               RN  1


;// InvTransformed and Dequantized pixels (same registers as colOp[][])
out00               RN  2                   
out02               RN  3                   
out10               RN  4                   
out12               RN  5                   
out20               RN  6                   
out22               RN  7                   
out30               RN  8                   
out32               RN  9  
      
        
    ;//--------------------------------------------------------------------------
    ;// armVCM4P10_InvTransformDequantLumaDC4x4
    ;//
    ;// Transforms and dequantizes, in place, the sixteen 16-bit values
    ;// addressed by pData.
    ;//
    ;// In:   r0 (pData) = address of eight words (two 16-bit values per word);
    ;//                    read with LDMIA and overwritten with STMIA
    ;//       r1 (QP)    = byte index into armVCM4P10_QPDivTable and
    ;//                    armVCM4P10_VMatrixQPModTable
    ;// Out:  memory at pData holds the result; no return value is set
    ;// Regs: r0-r12 and r14 are all written inside the body; one word of
    ;//       stack (pDataOnStack) holds pData while r0 is reused
    ;// NOTE(review): QP is not range-checked here; presumably the caller
    ;//               guarantees it is a valid index for both tables - confirm.
    ;// NOTE(review): r14 is used as a data register, so M_START/M_END
    ;//               (armCOMM_s.h, not visible here) presumably save and
    ;//               restore the return address - confirm.
    ;//--------------------------------------------------------------------------

    ;// Allocate stack memory required by the function
        M_ALLOC4    pDataOnStack, 4

    ;// Write function header
        M_START armVCM4P10_InvTransformDequantLumaDC4x4,r11
        
        ;******************************************************************
        ;// The strategy used in implementing the transform is as follows:*
        ;// Load the 4x4 block into 8 registers                           *  
        ;// Transpose the 4x4 matrix                                      *  
        ;// Perform the row operations (on columns) using SIMD            *  
        ;// Transpose the 4x4 result matrix                               *  
        ;// Perform the column operations                                 *
        ;// Scale and round every value, then store the block in one go   *  
        ;******************************************************************

        ;// Load all the 4x4 pixels
        
        LDMIA   pData,{in00,in02,in10,in12,in20,in22,in30,in32}
        
        ;//*****************************************************************
        ;//
        ;// Transpose the matrix in order to perform row ops as column ops
        ;// Input:   in[][] = original matrix
        ;// Output:  trRow[][]= transposed matrix
        ;// Step1: Obtain the LL part of the transposed matrix
        ;// Step2: Obtain the HL part
        ;// Step3: Obtain the LH part
        ;// Step4: Obtain the HH part
        ;//
        ;// Notation below: X.lo = bits [15:0] of X, X.hi = bits [31:16] of X,
        ;// and [A : B] is a word with A in the high half and B in the low half.
        ;// In each pair the instruction that targets a free register comes
        ;// first, because the second one overwrites one of its own sources.
        ;//
        ;//*****************************************************************
        
        ;// LL 2x2 transposed matrix 
        ;//   d0 d1 - -
        ;//   d4 d5 - -
        ;//   -  -  - -
        ;//   -  -  - -
        
        PKHTB   trRow10,in10,in00,ASR #16               ;// trRow10 = [in10.hi : in00.hi]
        PKHBT   trRow00,in00,in10,LSL #16               ;// trRow00 = [in10.lo : in00.lo] (overwrites in00)
        
        ;// HL 2x2 transposed matrix  
        ;//    -   -   - -
        ;//    -   -   - -
        ;//    d8  d9  - -
        ;//   d12 d13  - -
        
         
         PKHTB   trRow30,in12,in02,ASR #16              ;// trRow30 = [in12.hi : in02.hi]
         PKHBT   trRow20,in02,in12,LSL #16              ;// trRow20 = [in12.lo : in02.lo]
        
        ;// LH 2x2 transposed matrix 
        ;//   - - d2 d3 
        ;//   - - d6 d7 
        ;//   - - -  -
        ;//   - - -  -
        
        PKHBT   trRow02,in20,in30,LSL #16               ;// trRow02 = [in30.lo : in20.lo] (reuses r3, in02 already consumed)
        PKHTB   trRow12,in30,in20,ASR #16               ;// trRow12 = [in30.hi : in20.hi] (reuses r5, in12 already consumed)
        
        
        ;// HH 2x2 transposed matrix  
        ;//    - -   -   -
        ;//    - -   -   -
        ;//    - -  d10 d11
        ;//    - -  d14 d15
        
        PKHTB   trRow32,in32,in22,ASR #16               ;// trRow32 = [in32.hi : in22.hi]
        PKHBT   trRow22,in22,in32,LSL #16               ;// trRow22 = [in32.lo : in22.lo] (overwrites in22)
       
        
        ;**************************************** 
        ;// Row Operations (Performed on columns)
        ;// Each SADD16/SSUB16 works on both 16-bit halves of its operands
        ;// independently, so two rows are processed per instruction.
        ;**************************************** 
        
        
        ;// First pair of input words per row: c0..c3 = trRow00, trRow10, trRow20, trRow30
        
        SADD16      rowSum1,trRow00,trRow10                ;// (c0+c1)
        SADD16      rowSum2,trRow20,trRow30                ;// (c2+c3)
        SSUB16      rowDiff1,trRow00,trRow10               ;// (c0-c1)
        SSUB16      rowDiff2,trRow20,trRow30               ;// (c2-c3)
        SADD16      rowOp00,rowSum1,rowSum2                ;// (c0+c1+c2+c3)
        SSUB16      rowOp10,rowSum1,rowSum2                ;// (c0+c1-c2-c3)
        SSUB16      rowOp20,rowDiff1,rowDiff2              ;// (c0-c1-c2+c3)
        SADD16      rowOp30,rowDiff1,rowDiff2              ;// (c0-c1+c2-c3)
        
                
        ;// Second pair: c0..c3 = trRow02, trRow12, trRow22, trRow32
        ;// (rowSum/rowDiff registers are reused from the first pair)
        
        SADD16      rowSum1,trRow02,trRow12                ;// (c0+c1)
        SADD16      rowSum2,trRow22,trRow32                ;// (c2+c3)
        SSUB16      rowDiff1,trRow02,trRow12               ;// (c0-c1)
        SSUB16      rowDiff2,trRow22,trRow32               ;// (c2-c3)
        SADD16      rowOp02,rowSum1,rowSum2                ;// (c0+c1+c2+c3)
        SSUB16      rowOp12,rowSum1,rowSum2                ;// (c0+c1-c2-c3)
        SSUB16      rowOp22,rowDiff1,rowDiff2              ;// (c0-c1-c2+c3)
        SADD16      rowOp32,rowDiff1,rowDiff2              ;// (c0-c1+c2-c3)
        
        
        ;*****************************************************************
        ;// Transpose the resultant matrix
        ;// Input:  rowOp[][]
        ;// Output: trCol[][] 
        ;// Same four-step 2x2 packing as the first transpose.
        ;*****************************************************************
        
        ;// LL 2x2 transposed matrix 
        ;//   d0 d1 - -
        ;//   d4 d5 - -
        ;//   -  -  - -
        ;//   -  -  - -
        
        PKHTB   trCol10,rowOp10,rowOp00,ASR #16           ;// trCol10 = [rowOp10.hi : rowOp00.hi]
        PKHBT   trCol00,rowOp00,rowOp10,LSL #16           ;// trCol00 = [rowOp10.lo : rowOp00.lo]
        
        ;// HL 2x2 transposed matrix  
        ;//    -   -   - -
        ;//    -   -   - -
        ;//    d8  d9  - -
        ;//   d12 d13  - -
        
         
         PKHTB   trCol30,rowOp12,rowOp02,ASR #16          ;// trCol30 = [rowOp12.hi : rowOp02.hi]
         PKHBT   trCol20,rowOp02,rowOp12,LSL #16          ;// trCol20 = [rowOp12.lo : rowOp02.lo]
        
        ;// LH 2x2 transposed matrix 
        ;//   - - d2 d3 
        ;//   - - d6 d7 
        ;//   - - -  -
        ;//   - - -  -
        
        PKHBT   trCol02,rowOp20,rowOp30,LSL #16           ;// trCol02 = [rowOp30.lo : rowOp20.lo]
        PKHTB   trCol12,rowOp30,rowOp20,ASR #16           ;// trCol12 = [rowOp30.hi : rowOp20.hi]
        
        
        ;// HH 2x2 transposed matrix  
        ;//    - -   -   -
        ;//    - -   -   -
        ;//    - -  d10 d11
        ;//    - -  d14 d15
        
        PKHTB   trCol32,rowOp32,rowOp22,ASR #16            ;// trCol32 = [rowOp32.hi : rowOp22.hi]
        PKHBT   trCol22,rowOp22,rowOp32,LSL #16            ;// trCol22 = [rowOp32.lo : rowOp22.lo]
       
        
        ;******************************* 
        ;// Column Operations 
        ;******************************* 
        
        ;//--------------------------------------------------------------------------------------
        ;// Store pData(RN0) on stack and restore it only at the final store back
        ;// This frees up a register (RN0) which is used to reduce number of intermediate stalls 
        ;// (r0 is reused below as pQPDivTable and then as Round)
        ;//--------------------------------------------------------------------------------------
        M_STR       pData,pDataOnStack
        
        
        ;// First pair: c0..c3 = trCol00, trCol10, trCol20, trCol30
                
        SADD16      colSum1,trCol00,trCol10                ;// (c0+c1)
        SADD16      colSum2,trCol20,trCol30                ;// (c2+c3)
        SSUB16      colDiff1,trCol00,trCol10               ;// (c0-c1)
        SSUB16      colDiff2,trCol20,trCol30               ;// (c2-c3)
        SADD16      colOp00,colSum1,colSum2                ;// (c0+c1+c2+c3)
        SSUB16      colOp10,colSum1,colSum2                ;// (c0+c1-c2-c3)
        SSUB16      colOp20,colDiff1,colDiff2              ;// (c0-c1-c2+c3)
        SADD16      colOp30,colDiff1,colDiff2              ;// (c0-c1+c2-c3)
        
                
        ;// Second pair: c0..c3 = trCol02, trCol12, trCol22, trCol32
        ;// The table loads are interleaved with the arithmetic; each one
        ;// targets a register whose previous alias has just had its last use.
        
        LDR         pQPDivTable, =armVCM4P10_QPDivTable    ;// QP Division look-up-table base pointer (r0, pData is on the stack)
        SADD16      colSum1,trCol02,trCol12                ;// (c0+c1)
        SADD16      colSum2,trCol22,trCol32                ;// (c2+c3)
        SSUB16      colDiff1,trCol02,trCol12               ;// (c0-c1)
        SSUB16      colDiff2,trCol22,trCol32               ;// (c2-c3)
        SADD16      colOp02,colSum1,colSum2                ;// (c0+c1+c2+c3)
        SSUB16      colOp12,colSum1,colSum2                ;// (c0+c1-c2-c3); last use of colSum1/colSum2
        LDR         pQPModTable, =armVCM4P10_VMatrixQPModTable ;// QP Modulo look-up-table base pointer (r11, was colSum2)
        LDRSB       Shift, [pQPDivTable, QP]               ;// Shift = pQPDivTable[QP], sign-extended byte (r10, was colSum1)
        SSUB16      colOp22,colDiff1,colDiff2              ;// (c0-c1-c2+c3)
        SADD16      colOp32,colDiff1,colDiff2              ;// (c0-c1+c2-c3); last use of colDiff1/colDiff2
        
               
        LDRSB       Scale, [pQPModTable, QP]               ;// Scale = pQPModTable[QP], sign-extended byte (r14, was colDiff2); last use of QP
        
        ;//----------------------------------------------------------------------
        ;//
        ;// <Dequantize> improves on the c-reference code
        ;// Both the cases, i.e. Shift>=0 and Shift<0, are covered together
        ;// We do not subtract 2 from Shift as in C reference, instead perform a
        ;// Scale << Shift once in the beginning and do a right shift by a 
        ;// constant 2 after the Multiplication. The value of Round would be 2 
        ;// 
        ;// By doing this we avoid the branches required and also 
        ;// reduce the code size substantially
        ;// 
        ;// Per value the code below computes (value * Scale + Round) >> 2 and
        ;// keeps the low 16 bits.  SMLAxB multiplies by the BOTTOM halfword of
        ;// Scale only.  The bottom-half result is shifted with an explicit ASR;
        ;// the top-half result is shifted by the PKHBT operand "LSL #14", which
        ;// places bits [17:2] of the product in the high halfword.
        ;// 
        ;//----------------------------------------------------------------------
        
        MOV         Round, #2                               ;// Round = 2 (r0, pQPDivTable no longer needed)
        LSL         Scale, Scale, Shift                     ;// Scale = Scale << Shift
                
        
        ;// Row 1
        SMLABB  temp1, colOp00, Scale, Round                ;// Temp1 = colOp00.lo * Scale + Round
        SMLABB  temp3, colOp02, Scale, Round                ;// Temp3 = colOp02.lo * Scale + Round
        SMLATB  temp2, colOp00, Scale, Round                ;// Temp2 = colOp00.hi * Scale + Round
        SMLATB  temp4, colOp02, Scale, Round                ;// Temp4 = colOp02.hi * Scale + Round (r1, QP is dead)
        
        ASR     temp1, temp1, #2                            ;// Temp1 = Temp1 >> 2
        ASR     temp3, temp3, #2                            ;// Temp3 = Temp3 >> 2
        PKHBT   out00,  temp1, temp2, LSL #14               ;// out00 = | Temp2>>2 | Temp1 |
        PKHBT   out02,  temp3, temp4, LSL #14               ;// out02 = | Temp4>>2 | Temp3 |
        
        
        ;// Row 2
        SMLABB  temp1, colOp10, Scale, Round                ;// Temp1 = colOp10.lo * Scale + Round
        SMLABB  temp3, colOp12, Scale, Round                ;// Temp3 = colOp12.lo * Scale + Round
        SMLATB  temp2, colOp10, Scale, Round                ;// Temp2 = colOp10.hi * Scale + Round
        SMLATB  temp4, colOp12, Scale, Round                ;// Temp4 = colOp12.hi * Scale + Round
        
        ASR     temp1, temp1, #2                            ;// Temp1 = Temp1 >> 2
        ASR     temp3, temp3, #2                            ;// Temp3 = Temp3 >> 2
        PKHBT   out10,  temp1, temp2, LSL #14               ;// out10 = | Temp2>>2 | Temp1 |
        PKHBT   out12,  temp3, temp4, LSL #14               ;// out12 = | Temp4>>2 | Temp3 |
        
        ;// Row 3
        SMLABB  temp1, colOp20, Scale, Round                ;// Temp1 = colOp20.lo * Scale + Round
        SMLABB  temp3, colOp22, Scale, Round                ;// Temp3 = colOp22.lo * Scale + Round
        SMLATB  temp2, colOp20, Scale, Round                ;// Temp2 = colOp20.hi * Scale + Round
        SMLATB  temp4, colOp22, Scale, Round                ;// Temp4 = colOp22.hi * Scale + Round
        
        ASR     temp1, temp1, #2                            ;// Temp1 = Temp1 >> 2 
        ASR     temp3, temp3, #2                            ;// Temp3 = Temp3 >> 2
        PKHBT   out20,  temp1, temp2, LSL #14               ;// out20 = | Temp2>>2 | Temp1 |
        PKHBT   out22,  temp3, temp4, LSL #14               ;// out22 = | Temp4>>2 | Temp3 |
        
        ;// Row 4
        SMLABB  temp1, colOp30, Scale, Round                ;// Temp1 = colOp30.lo * Scale + Round
        SMLABB  temp3, colOp32, Scale, Round                ;// Temp3 = colOp32.lo * Scale + Round
        SMLATB  temp2, colOp30, Scale, Round                ;// Temp2 = colOp30.hi * Scale + Round
        SMLATB  temp4, colOp32, Scale, Round                ;// Temp4 = colOp32.hi * Scale + Round; last use of Round (r0)
        
        M_LDR   pData,pDataOnStack                          ;// Restore pData pointer from stack (r0 is free again)
        ASR     temp1, temp1, #2                            ;// Temp1 = Temp1 >> 2
        ASR     temp3, temp3, #2                            ;// Temp3 = Temp3 >> 2
        PKHBT   out30,  temp1, temp2, LSL #14               ;// out30 = | Temp2>>2 | Temp1 |
        PKHBT   out32,  temp3, temp4, LSL #14               ;// out32 = | Temp4>>2 | Temp3 |
        
        
        ;***************************
        ;// Store all the 4x4 pixels
        ;// (written back over the input block at pData)
        ;***************************

store_coeff
        
        STMIA   pData,{out00,out02,out10,out12,out20,out22,out30,out32}
        
                               
        ;// Set return value
        ;// (nothing is set here; r0 still holds pData on exit from this code)
        
       
        ;// Write function tail
        M_END        
        
    ENDIF                                                           ;//ARM1136JS        
    

;// End of static function: armVCM4P10_InvTransformDequantLumaDC4x4
    
        
;//--------------------------------------------------------------------------
;// Function: omxVCM4P10_TransformDequantLumaDCFromPair
;//
;// Two-stage wrapper:
;//   1. armVCM4P10_UnpackBlock4x4 is called with r0/r1 exactly as received
;//      (ppSrc, pDst).
;//   2. armVCM4P10_InvTransformDequantLumaDC4x4 is called with r0 = pDst and
;//      r1 = QP.
;// r0 is then set to OMX_Sts_NoErr unconditionally.
;//
;// In:   r0 = ppSrc, r1 = pDst, r2 = QP
;// Out:  r0 = OMX_Sts_NoErr
;// NOTE(review): this routine performs no argument checking of its own.
;// NOTE(review): pDst and QP are parked in r4/r5 across the first call, so
;//               armVCM4P10_UnpackBlock4x4 is relied upon to preserve them.
;//--------------------------------------------------------------------------

;// Arguments as they arrive
ppSrc               RN  0
pDst                RN  1
QPR2                RN  2

;// Return value
result              RN  0

;// Holders that keep pDst / QP alive across the unpack call
pDstKeep            RN  4
qpKeep              RN  5

;// Argument registers for the transform/dequant call
pBlockArg           RN  0
qpArg               RN  1

;// Assembled only for the ARM1136JS variant

    IF ARM1136JS

    ;// This routine needs no stack locals

    ;// Function entry
        M_START omxVCM4P10_TransformDequantLumaDCFromPair,r5

        ;// Stage 1: park QP and pDst, then unpack (r0/r1 untouched so far)
        MOV     qpKeep,QPR2                         ;// r5 <- QP
        MOV     pDstKeep,pDst                       ;// r4 <- pDst
        BL      armVCM4P10_UnpackBlock4x4

        ;// Stage 2: hand the parked values to the transform/dequant routine
        MOV     qpArg,qpKeep                        ;// r1 <- QP
        MOV     pBlockArg,pDstKeep                  ;// r0 <- pDst
        BL      armVCM4P10_InvTransformDequantLumaDC4x4

        ;// Report success
        MOV     result,#OMX_Sts_NoErr

        ;// Function exit
        M_END


    ENDIF                                                           ;// ARM1136JS
    

    END