1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
|
;//
;//
;// File Name: armVCM4P10_TransformResidual4x4_s.s
;// OpenMAX DL: v1.0.2
;// Revision: 9641
;// Date: Thursday, February 7, 2008
;//
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;//
;//
;//
;// Description:
;// Transform Residual 4x4 Coefficients
;//
;//
;// Include standard headers
INCLUDE omxtypes_s.h
INCLUDE armCOMM_s.h
M_VARIANTS ARM1136JS
;// Import symbols required from other files
;// (For example tables)
;// Set debugging level
;//DEBUG_ON SETL {TRUE}
;// Guarding implementation by the processor name
IF ARM1136JS
;// Input registers
;// pDst (r0): base address of the final STMIA, which writes the eight result words.
;// pSrc (r1): base address of the initial LDMIA, which reads the eight input words.
pDst RN 0
pSrc RN 1
;// Output registers: none are declared; the result is stored through pDst.
;// Local scratch registers
;// The aliases below give the same physical registers (r1-r12 and r14) a new
;// name for every processing stage, so a name is only valid during its stage.
;// Every data register carries two 16-bit elements. In a name such as in10 the
;// first digit is the row and the second digit is the column of the first of
;// the two packed elements (rows/columns of the matrix as oriented in that stage).
;// r14 is the link register; using it as scratch relies on the function
;// prologue/epilogue preserving it - presumably done by M_START/M_END, confirm
;// in armCOMM_s.h.
;// Packed input elements, in the order LDMIA fills r2-r9 from pSrc
in00 RN 2 ;// Src[0] & Src[1]
in02 RN 3 ;// Src[2] & Src[3]
in10 RN 4 ;// Src[4] & Src[5]
in12 RN 5 ;// Src[6] & Src[7]
in20 RN 6 ;// Src[8] & Src[9]
in22 RN 7 ;// Src[10] & Src[11]
in30 RN 8 ;// Src[12] & Src[13]
in32 RN 9 ;// Src[14] & Src[15]
;// Transposed input (rows become columns) consumed by the row operations.
;// trRow00 and trRow22 reuse the register of the input they are built from
;// (in00, in22), so the PKHTB that reads that input must run before the PKHBT
;// that overwrites it. trRow02 and trRow12 reuse r3/r5 and are written only
;// after in02/in12 have been consumed.
trRow00 RN 2
trRow10 RN 10
trRow02 RN 3
trRow12 RN 5
trRow20 RN 11
trRow30 RN 12
trRow32 RN 14
trRow22 RN 7
;// Intermediate values of the row operations (r4, r6, r8, r9 are free once
;// the first transpose has consumed in10, in20, in30 and in32)
e0 RN 4
e1 RN 6
e2 RN 8
e3 RN 9
;// Holds 0 so that SHADD16 x,0 yields x>>1 per lane. Shares r1 with pSrc and
;// is written only after the LDMIA has finished using pSrc.
constZero RN 1
;// Results of the row operations; each overwrites the trRow register of the
;// same index
rowOp00 RN 2
rowOp10 RN 10
rowOp20 RN 11
rowOp30 RN 12
rowOp02 RN 3
rowOp12 RN 5
rowOp22 RN 7
rowOp32 RN 14
;// Transpose for column operations (second transpose, back into r2-r9)
trCol00 RN 2
trCol02 RN 3
trCol10 RN 4
trCol12 RN 5
trCol20 RN 6
trCol22 RN 7
trCol30 RN 8
trCol32 RN 9
;// Intermediate values of the column operations (reuse the registers that
;// held rowOp10, rowOp20, rowOp30 and rowOp32 before the second transpose)
g0 RN 10
g1 RN 11
g2 RN 12
g3 RN 14
;// Results of the column operations; each overwrites the trCol register of
;// the same index
colOp00 RN 2
colOp02 RN 3
colOp10 RN 4
colOp12 RN 5
colOp20 RN 6
colOp22 RN 7
colOp30 RN 8
colOp32 RN 9
;// Scratch and constants of the final rounding stage; they reuse g0-g3.
temp1 RN 10 ;// Temporary scratch variable (not referenced by the function body in this file)
const1 RN 11
const2 RN 12
mask RN 14
;// Output elements, stored by STMIA in the order r2-r9; same registers as colOp
out00 RN 2
out02 RN 3
out10 RN 4
out12 RN 5
out20 RN 6
out22 RN 7
out30 RN 8
out32 RN 9
;// ---------------------------------------------------------------------------
;// armVCM4P10_TransformResidual4x4
;//
;// Applies a two-pass 4x4 transform to sixteen 16-bit residual coefficients
;// and rounds every result with (x + 32) >> 6. Going by the file name this is
;// presumably the H.264 (MPEG-4 Part 10) inverse residual transform - confirm
;// against the C reference.
;//
;// In:    pDst (r0)  address that receives the sixteen 16-bit results
;//                   (written as eight words by a single STMIA)
;//        pSrc (r1)  address of the sixteen 16-bit inputs
;//                   (read as eight words by a single LDMIA)
;// Out:   no register result; the output is the block stored at pDst
;// Uses:  r1-r12 and r14 through the RN aliases declared earlier in the file.
;//        pSrc (r1) is overwritten with zero once the input has been loaded.
;//        Saving/restoring of callee-saved registers and lr is left to
;//        M_START/M_END (the r11 argument presumably selects r4-r11; see
;//        armCOMM_s.h).
;//
;// Data layout: every register holds two 16-bit lanes, written below as
;// [high lane : low lane]. All arithmetic is done on both lanes at once.
;//
;// Stages:
;//   1. load the 4x4 block into eight registers
;//   2. transpose it, so that a row operation becomes a lane-wise operation
;//   3. butterfly (row pass)
;//   4. transpose the intermediate result
;//   5. butterfly (column pass)
;//   6. round with (x + 32) >> 6 and store the whole block
;// ---------------------------------------------------------------------------

        ;// Transpose one 2x2 tile of packed 16-bit lanes:
        ;//   hiPair = [ srcB.high : srcA.high ]
        ;//   loPair = [ srcB.low  : srcA.low  ]
        ;// hiPair is produced first, therefore loPair may be the same register
        ;// as srcA or srcB. hiPair itself must differ from both sources.
        MACRO
        M_TR4x4_TILE $hiPair, $loPair, $srcA, $srcB
        PKHTB   $hiPair, $srcB, $srcA, ASR #16
        PKHBT   $loPair, $srcA, $srcB, LSL #16
        MEND

        ;// One butterfly on four packed inputs x0..x3 (two lanes in parallel):
        ;//   t0 = x0 + x2                t1 = x0 - x2
        ;//   t2 = (x1 >> 1) - x3         t3 = x1 + (x3 >> 1)
        ;//   y0 = t0 + t3    y1 = t1 + t2    y2 = t1 - t2    y3 = t0 - t3
        ;// The halving is done with SHADD16 against constZero, which must hold 0.
        ;// Both halvings are issued before either result is used, as in the
        ;// original scheduling. All inputs are read before the first output is
        ;// written, so the outputs may reuse the input registers; the
        ;// temporaries must be distinct from inputs and outputs.
        MACRO
        M_TR4x4_BFLY $y0, $y1, $y2, $y3, $x0, $x1, $x2, $x3, $t0, $t1, $t2, $t3
        SADD16  $t0, $x0, $x2
        SSUB16  $t1, $x0, $x2
        SHADD16 $t2, $x1, constZero
        SHADD16 $t3, $x3, constZero
        SSUB16  $t2, $t2, $x3
        SADD16  $t3, $t3, $x1
        SADD16  $y0, $t0, $t3
        SADD16  $y1, $t1, $t2
        SSUB16  $y2, $t1, $t2
        SSUB16  $y3, $t0, $t3
        MEND

        ;// Round one output row (two registers, four lanes): lane = (lane + 32) >> 6.
        ;// Expects const1, mask and const2 to be loaded (see the call site).
        ;//   SADD16  adds 32 to the high lane and 32 + 32768 to the low lane;
        ;//           the extra 32768 biases the low lane so that it can be
        ;//           shifted as an unsigned value
        ;//   AND/ASR shifts the whole word right by 6: the high lane receives a
        ;//           proper arithmetic shift, and the mask clears the six bits
        ;//           of the high lane that were shifted into the low lane
        ;//   SSUB16  removes the shifted bias (32768 >> 6 = 512) from the low
        ;//           lane and subtracts 0 from the high lane
        ;// NOTE(review): the bias trick assumes lane + 32 still fits a signed
        ;// 16-bit value - confirm the input range with the caller.
        MACRO
        M_TR4x4_ROUND $outL, $outR, $accL, $accR
        SADD16  $accL, $accL, const1
        SADD16  $accR, $accR, const1
        AND     $accL, mask, $accL, ASR #6
        AND     $accR, mask, $accR, ASR #6
        SSUB16  $outL, $accL, const2
        SSUB16  $outR, $accR, const2
        MEND

        M_START armVCM4P10_TransformResidual4x4,r11

        ;// ---- Stage 1: fetch the whole block ------------------------------
        LDMIA   pSrc, {in00,in02,in10,in12,in20,in22,in30,in32}
        MOV     constZero, #0                   ;// pSrc is dead from here on (same register)

        ;// ---- Stage 2: transpose in[][] into trRow[][] --------------------
        ;// Tile order matters: the second tile reads in02/in12 (r3/r5) before
        ;// the third tile overwrites those registers with trRow02/trRow12.
        M_TR4x4_TILE trRow10, trRow00, in00, in10       ;// [5:1]   and [4:0]
        M_TR4x4_TILE trRow30, trRow20, in02, in12       ;// [7:3]   and [6:2]
        M_TR4x4_TILE trRow12, trRow02, in20, in30       ;// [13:9]  and [12:8]
        M_TR4x4_TILE trRow32, trRow22, in22, in32       ;// [15:11] and [14:10]

        ;// ---- Stage 3: row pass (performed lane-wise on the transposed data)
        ;// First two rows of the original matrix
        M_TR4x4_BFLY rowOp00,rowOp10,rowOp20,rowOp30, trRow00,trRow10,trRow20,trRow30, e0,e1,e2,e3
        ;// Last two rows of the original matrix
        M_TR4x4_BFLY rowOp02,rowOp12,rowOp22,rowOp32, trRow02,trRow12,trRow22,trRow32, e0,e1,e2,e3

        ;// ---- Stage 4: transpose rowOp[][] into trCol[][] -----------------
        ;// Same tile ordering constraint as in stage 2 (rowOp02/rowOp12 live
        ;// in r3/r5, which the third tile overwrites).
        M_TR4x4_TILE trCol10, trCol00, rowOp00, rowOp10
        M_TR4x4_TILE trCol30, trCol20, rowOp02, rowOp12
        M_TR4x4_TILE trCol12, trCol02, rowOp20, rowOp30
        M_TR4x4_TILE trCol32, trCol22, rowOp22, rowOp32

        ;// ---- Stage 5: column pass ----------------------------------------
        ;// Columns 0 and 1
        M_TR4x4_BFLY colOp00,colOp10,colOp20,colOp30, trCol00,trCol10,trCol20,trCol30, g0,g1,g2,g3
        ;// Columns 2 and 3
        M_TR4x4_BFLY colOp02,colOp12,colOp22,colOp32, trCol02,trCol12,trCol22,trCol32, g0,g1,g2,g3

        ;// ---- Stage 6: (colOp[i][j] + 32) >> 6 ----------------------------
        ;// The constants reuse g1, g2 and g3, which are dead after stage 5.
        LDR     const1, =0x00208020             ;// +32 on both lanes, +32768 bias on the low lane
        LDR     mask, =0xffff03ff               ;// keeps the high lane and the low 10 bits of the low lane
        MOV     const2, #0x200                  ;// 2^9 = 32768 >> 6, the bias after the shift

        M_TR4x4_ROUND out00, out02, colOp00, colOp02    ;// first row
        M_TR4x4_ROUND out10, out12, colOp10, colOp12    ;// second row
        M_TR4x4_ROUND out20, out22, colOp20, colOp22    ;// third row
        M_TR4x4_ROUND out30, out32, colOp30, colOp32    ;// fourth row

        ;// ---- Store the finished block in one go --------------------------
        STMIA   pDst, {out00,out02,out10,out12,out20,out22,out30,out32}

        ;// No return value is produced by this function.
End
        ;// Function tail
        M_END
ENDIF ;//ARM1136JS
;// Guarding implementation by the processor name
END
|