1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
|
;//
;//
;// File Name: omxVCM4P10_PredictIntraChroma_8x8_s.s
;// OpenMAX DL: v1.0.2
;// Revision: 9641
;// Date: Thursday, February 7, 2008
;//
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;//
;//
;//
INCLUDE omxtypes_s.h
INCLUDE armCOMM_s.h
;// Make the dispatch table symbol visible outside this object file
EXPORT armVCM4P10_pIndexTable8x8
;// Define the processor variants supported by this file
M_VARIANTS ARM1136JS
AREA table, DATA
;//-------------------------------------------------------
;// Dispatch table implementing a C-style switch in assembly.
;// Each 32-bit entry is the address of one prediction-mode
;// code label. The function indexes the table with predMode
;// (scaled by 4) and loads the selected entry straight into
;// pc, so the entry order defines the mode numbering:
;//   0 = DC, 1 = horizontal, 2 = vertical, 3 = plane
;//-------------------------------------------------------
M_TABLE armVCM4P10_pIndexTable8x8
DCD OMX_VC_CHROMA_DC, OMX_VC_CHROMA_HOR
DCD OMX_VC_CHROMA_VERT, OMX_VC_CHROMA_PLANE
;// Table of twelve signed 16-bit multipliers.
;// It is neither exported nor referenced by the ARM1136JS code in this file.
;// NOTE(review): presumably kept for another implementation variant, and the
;// second M_TABLE argument is presumably an alignment - confirm in armCOMM_s.h.
M_TABLE armVCM4P10_MultiplierTableChroma8x8,1
DCW 3, 2, 1,4
DCW -3,-2,-1,0
DCW 1, 2, 3,4
;// Everything from here to the matching ENDIF is assembled only for ARM1136JS
IF ARM1136JS
;//--------------------------------------------
;// Constants
;//--------------------------------------------
BLK_SIZE EQU 0x8 ;// block height: number of rows produced by the plane-mode loop
MUL_CONST0 EQU 0x01010101 ;// (byte value) * this constant copies the byte into all four byte lanes
MASK_CONST EQU 0x00FF00FF ;// keeps bits 0-7 and 16-23, i.e. the low byte of each halfword
MUL_CONST1 EQU 0x80808080 ;// 0x80 in every byte: DC fill used when neither neighbour is flagged available
;//--------------------------------------------
;// Scratch variable
;//
;// All names below are aliases for physical registers. Many names share
;// one register (and some share a register with an input argument), so two
;// aliases of the same register can never be live at the same time; the
;// instruction order in the function relies on this.
;//--------------------------------------------
y RN 12 ;// r12; never read under this name by any instruction in this file
pc RN 15 ;// program counter, written by the mode dispatch load
return RN 0 ;// status code returned to the caller (shares r0 with pSrcLeft)
pSrcLeft2 RN 1 ;// second left-column pointer, one row below pSrcLeft (shares r1 with pSrcAbove)
pDst2 RN 2 ;// second destination pointer, one row below pDst (shares r2 with pSrcAboveLeft)
sum1 RN 6 ;// DC accumulator for samples 0..3 (same register as tVal6)
sum2 RN 7 ;// DC accumulator for samples 4..7 (same register as tVal7)
pTable RN 9 ;// address of the mode dispatch table
dstStepx2 RN 11 ;// 2*dstStep - 4 in horizontal mode
leftStepx2 RN 14 ;// 2*leftStep
outerCount RN 14 ;// row counter of the plane-mode loop
r0x01010101 RN 10 ;// holds MUL_CONST0
r0x00FF00FF RN 11 ;// holds MASK_CONST
;// Generic temporaries: tValN is simply rN
tVal0 RN 0
tVal1 RN 1
tVal2 RN 2
tVal3 RN 3
tVal4 RN 4
tVal5 RN 5
tVal6 RN 6
tVal7 RN 7
tVal8 RN 8
tVal9 RN 9
tVal10 RN 10
tVal11 RN 11
tVal12 RN 12
tVal14 RN 14
;// Plane-mode gradients
b RN 14 ;// horizontal gradient, later packed as {b,b}
c RN 12 ;// vertical gradient, later packed as {c,c}
;// Plane-mode row accumulators: two signed 16-bit samples per register,
;// written {high halfword, low halfword}
p2p0 RN 0
p3p1 RN 1
p6p4 RN 2
p7p5 RN 4
;// Plane-mode samples after shifting and masking: one byte per halfword
pp2pp0 RN 6
pp3pp1 RN 7
pp6pp4 RN 8
pp7pp5 RN 9
;// Plane-mode packed output words (both names use r10, one after the other)
p3210 RN 10 ;// bytes 0..3 of the output row
p7654 RN 10 ;// bytes 4..7 of the output row
;//--------------------------------------------
;// Input Arguments
;//
;// The first four arrive in r0-r3. The last four are declared as stack
;// arguments with M_ARG and are loaded into r4-r7 by the function.
;//--------------------------------------------
pSrcLeft RN 0 ;// input pointer: left neighbour column
pSrcAbove RN 1 ;// input pointer: row above the block
pSrcAboveLeft RN 2 ;// input pointer: sample above-left of the block
pDst RN 3 ;// output pointer: predicted block
leftStep RN 4 ;// input variable: byte distance between left-column samples
dstStep RN 5 ;// input variable: byte distance between destination rows
predMode RN 6 ;// input variable: index into the mode dispatch table
availability RN 7 ;// input variable: neighbour availability flags (tested in DC mode)
;//-----------------------------------------------------------------------------------------------
;// omxVCM4P10_PredictIntraChroma_8x8 starts
;//-----------------------------------------------------------------------------------------------
;//
;// Purpose:
;//   Writes an 8x8 block of predicted chroma samples to pDst. The predictor
;//   (DC, horizontal, vertical or plane) is selected by predMode through the
;//   dispatch table armVCM4P10_pIndexTable8x8.
;//
;// Inputs:
;//   r0     pSrcLeft       left neighbour column, one sample every leftStep bytes
;//   r1     pSrcAbove      the 8 samples of the row above (read as two words in
;//                         DC and vertical mode)
;//   r2     pSrcAboveLeft  sample above-left of the block (plane mode only)
;//   r3     pDst           destination block, rows dstStep bytes apart
;//   stack  leftStep, dstStep, predMode, availability (4 bytes each)
;//
;// Output:
;//   r0 = OMX_Sts_NoErr on every path.
;//
;// Notes:
;//   * availability is examined only in DC mode.
;//   * predMode is used unchecked as a table index.
;//     NOTE(review): a value outside 0..3 branches to whatever word follows
;//     the table - callers must validate predMode.
;//   * Register aliases overlap heavily (see the RN definitions), so the
;//     instruction order below is significant.
;//-----------------------------------------------------------------------------------------------

        ;// Write function header
        ;// NOTE(review): r11 presumably names the highest register M_START
        ;// preserves - confirm in armCOMM_s.h
        M_START  omxVCM4P10_PredictIntraChroma_8x8, r11

        ;// Define stack arguments
        M_ARG    LeftStep,     4
        M_ARG    DstStep,      4
        M_ARG    PredMode,     4
        M_ARG    Availability, 4

        ;// M_STALL ARM1136JS=4

        LDR      pTable, =armVCM4P10_pIndexTable8x8  ;// Load index table for switch case

        ;// Load arguments from the stack
        M_LDR    predMode, PredMode                  ;// Arg predMode loaded from stack to reg
        M_LDR    leftStep, LeftStep                  ;// Arg leftStep loaded from stack to reg
        M_LDR    dstStep,  DstStep                   ;// Arg dstStep loaded from stack to reg
        M_LDR    availability, Availability          ;// Arg availability loaded from stack to reg

        LDR      pc, [pTable, predMode, LSL #2]      ;// Branch to the case selected by predMode

;//--------------------------------------------
;// DC prediction.
;// Four sub-cases depending on which of the upper / left neighbours are
;// flagged available: both, upper only, left only, neither.
;//--------------------------------------------
OMX_VC_CHROMA_DC
        AND      availability, availability, #(OMX_VC_UPPER + OMX_VC_LEFT)
        CMP      availability, #(OMX_VC_UPPER + OMX_VC_LEFT) ;// both upper and left available?
        LDR      r0x01010101, =MUL_CONST0            ;// byte replication constant (flags untouched)
        BNE      TST_UPPER                           ;// Jump to Upper if not both
        LDM      pSrcAbove, {tVal8, tVal9}           ;// tVal 8 to 9 = pSrcAbove[0 to 7]

        ;// pSrcLeft2 shares r1 with pSrcAbove, so it is set only after the LDM
        ADD      leftStepx2, leftStep, leftStep      ;// leftStepx2 = 2 * leftStep
        ADD      pSrcLeft2, pSrcLeft, leftStep       ;// pSrcLeft2 = pSrcLeft + leftStep

        ;// M_STALL ARM1136JS=1

        UXTB16   tVal7, tVal8                        ;// pSrcAbove[0, 2]
        UXTB16   tVal8, tVal8, ROR #8                ;// pSrcAbove[1, 3]
        UADD16   sum1, tVal7, tVal8                  ;// pSrcAbove[0, 2] + pSrcAbove[1, 3]

        UXTB16   tVal7, tVal9                        ;// pSrcAbove[4, 6]
        UXTB16   tVal9, tVal9, ROR #8                ;// pSrcAbove[5, 7]
        UADD16   sum2, tVal7, tVal9                  ;// pSrcAbove[4, 6] + pSrcAbove[5, 7]
        ADD      sum1, sum1, sum1, LSR #16           ;// sum(pSrcAbove[0] to pSrcAbove[3])
        ADD      sum2, sum2, sum2, LSR #16           ;// sum(pSrcAbove[4] to pSrcAbove[7])
        UXTH     sum1, sum1                          ;// upsum1 (Clear the top junk bits)
        UXTH     sum2, sum2                          ;// upsum2 (Clear the top junk bits)

        ;// tVal4 shares r4 with leftStep; leftStepx2 was derived above
        M_LDRB   tVal8, [pSrcLeft],  +leftStepx2     ;// tVal8 = pSrcLeft[0]
        M_LDRB   tVal9, [pSrcLeft2], +leftStepx2     ;// tVal9 = pSrcLeft[1]
        M_LDRB   tVal4, [pSrcLeft],  +leftStepx2     ;// tVal4 = pSrcLeft[2]
        M_LDRB   tVal12,[pSrcLeft2], +leftStepx2     ;// tVal12= pSrcLeft[3]
        ADD      tVal2, tVal8, tVal9                 ;// tVal2 = tVal8 + tVal9

        M_LDRB   tVal8, [pSrcLeft],  +leftStepx2     ;// tVal8 = pSrcLeft[4]
        M_LDRB   tVal9, [pSrcLeft2], +leftStepx2     ;// tVal9 = pSrcLeft[5]
        ADD      tVal14, tVal4, tVal12               ;// tVal14 = tVal4 + tVal12 (leftStepx2 is dead from here)

        LDRB     tVal4, [pSrcLeft]                   ;// tVal4 = pSrcLeft[6]
        LDRB     tVal12,[pSrcLeft2]                  ;// tVal12= pSrcLeft[7]
        ADD      tVal8, tVal8, tVal9                 ;// tVal8 = tVal8 + tVal9
        ADD      tVal2, tVal2, tVal14                ;// leftsum1 = sum(pSrcLeft[0] to pSrcLeft[3])
        ADD      tVal4, tVal4, tVal12                ;// tVal4 = tVal4 + tVal12
        ADD      tVal14, tVal8, tVal4                ;// leftsum2 = sum(pSrcLeft[4] to pSrcLeft[7])
        ADD      tVal8, tVal14, #2                   ;// tVal8 = leftsum2 + 2
        ADD      tVal9, sum2,   #2                   ;// tVal9 = upsum2 + 2
        ADD      sum1,  sum1, tVal2                  ;// sum1 = upsum1 + leftsum1
        ADD      sum2,  sum2, tVal14                 ;// sum2 = upsum2 + leftsum2
        ADD      sum1, sum1, #4                      ;// (sum1 + 4)
        ADD      sum2, sum2, #4                      ;// (sum2 + 4)
        MOV      sum1,  sum1,  LSR #3                ;// top-left DC     = (sum1 + 4)>>3
        MOV      tVal9, tVal9, LSR #2                ;// top-right DC    = (upsum2 + 2)>>2
        MOV      tVal8, tVal8, LSR #2                ;// bottom-left DC  = (leftsum2 + 2)>>2
        MOV      sum2,  sum2,  LSR #3                ;// bottom-right DC = (sum2 + 4)>>3

        MUL      tVal0, sum1, r0x01010101            ;// replicate the val in all the bytes
        MUL      tVal1, tVal9,r0x01010101            ;// replicate the val in all the bytes
        MUL      tVal8, tVal8,r0x01010101            ;// replicate the val in all the bytes
        MUL      tVal9, sum2, r0x01010101            ;// replicate the val in all the bytes

        M_STRD   tVal0, tVal1, [pDst], dstStep       ;// pDst[0 to 7]   = tVal 0 to 1
        M_STRD   tVal0, tVal1, [pDst], dstStep       ;// pDst[8 to 15]  = tVal 0 to 1
        M_STRD   tVal0, tVal1, [pDst], dstStep       ;// pDst[16 to 23] = tVal 0 to 1
        M_STRD   tVal0, tVal1, [pDst], dstStep       ;// pDst[24 to 31] = tVal 0 to 1
        M_STRD   tVal8, tVal9, [pDst], dstStep       ;// pDst[32 to 39] = tVal 8 to 9
        M_STRD   tVal8, tVal9, [pDst], dstStep       ;// pDst[40 to 47] = tVal 8 to 9
        M_STRD   tVal8, tVal9, [pDst], dstStep       ;// pDst[48 to 55] = tVal 8 to 9
        M_STRD   tVal8, tVal9, [pDst], dstStep       ;// pDst[56 to 63] = tVal 8 to 9
        MOV      return, #OMX_Sts_NoErr
        M_EXIT

;// DC, upper neighbour only: every row is {DC(above[0..3]), DC(above[4..7])}
TST_UPPER

        ;// M_STALL ARM1136JS=3

        CMP      availability, #OMX_VC_UPPER         ;// only the upper neighbour available?

        BNE      TST_LEFT                            ;// Jump to Left if not upper
        LDM      pSrcAbove, {tVal8, tVal9}           ;// tVal 8 to 9 = pSrcAbove[0 to 7]

        ;// M_STALL ARM1136JS=3

        UXTB16   tVal7, tVal8                        ;// pSrcAbove[0, 2]
        UXTB16   tVal8, tVal8, ROR #8                ;// pSrcAbove[1, 3]
        UADD16   sum1,  tVal7, tVal8                 ;// pSrcAbove[0, 2] + pSrcAbove[1, 3]

        UXTB16   tVal7, tVal9                        ;// pSrcAbove[4, 6]
        UXTB16   tVal9, tVal9, ROR #8                ;// pSrcAbove[5, 7]
        UADD16   sum2,  tVal7, tVal9                 ;// pSrcAbove[4, 6] + pSrcAbove[5, 7]

        ADD      sum1, sum1, sum1, LSR #16           ;// sum(pSrcAbove[0] to pSrcAbove[3])
        ADD      sum2, sum2, sum2, LSR #16           ;// sum(pSrcAbove[4] to pSrcAbove[7])

        UXTH     sum1, sum1                          ;// upsum1 (Clear the top junk bits)
        UXTH     sum2, sum2                          ;// upsum2 (Clear the top junk bits)

        ADD      sum1, sum1, #2                      ;// sum1 + 2
        ADD      sum2, sum2, #2                      ;// sum2 + 2

        MOV      sum1, sum1, LSR #2                  ;// (sum1 + 2)>>2
        MOV      sum2, sum2, LSR #2                  ;// (sum2 + 2)>>2

        MUL      sum1, sum1,r0x01010101              ;// replicate the val in all the bytes
        MUL      sum2, sum2,r0x01010101              ;// replicate the val in all the bytes

        ;// sum1/sum2 are r6/r7, i.e. the tVal6/tVal7 pair stored below
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[0 to 7]   = tVal 6 to 7
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[8 to 15]  = tVal 6 to 7
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[16 to 23] = tVal 6 to 7
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[24 to 31] = tVal 6 to 7
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[32 to 39] = tVal 6 to 7
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[40 to 47] = tVal 6 to 7
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[48 to 55] = tVal 6 to 7
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[56 to 63] = tVal 6 to 7
        MOV      return, #OMX_Sts_NoErr
        M_EXIT

;// DC, left neighbour only: rows 0-3 use DC(left[0..3]), rows 4-7 use DC(left[4..7])
TST_LEFT

        ;// M_STALL ARM1136JS=3

        CMP      availability, #OMX_VC_LEFT          ;// only the left neighbour available?
        BNE      TST_COUNT0                          ;// Jump if neither neighbour is available
        ADD      leftStepx2, leftStep,leftStep       ;// leftStepx2 = 2 * leftStep
        ADD      pSrcLeft2, pSrcLeft, leftStep       ;// pSrcLeft2 = pSrcLeft + leftStep

        M_LDRB   tVal8, [pSrcLeft],  +leftStepx2     ;// tVal8 = pSrcLeft[0]
        M_LDRB   tVal9, [pSrcLeft2], +leftStepx2     ;// tVal9 = pSrcLeft[1]
        M_LDRB   tVal4, [pSrcLeft],  +leftStepx2     ;// tVal4 = pSrcLeft[2]
        M_LDRB   tVal12,[pSrcLeft2], +leftStepx2     ;// tVal12= pSrcLeft[3]

        ADD      tVal6, tVal8, tVal9                 ;// tVal6 = tVal8 + tVal9

        M_LDRB   tVal8, [pSrcLeft],  +leftStepx2     ;// tVal8 = pSrcLeft[4]
        ADD      tVal7, tVal4, tVal12                ;// tVal7 = tVal4 + tVal12
        M_LDRB   tVal9, [pSrcLeft2], +leftStepx2     ;// tVal9 = pSrcLeft[5]
        M_LDRB   tVal4, [pSrcLeft],  +leftStepx2     ;// tVal4 = pSrcLeft[6]
        M_LDRB   tVal12,[pSrcLeft2], +leftStepx2     ;// tVal12= pSrcLeft[7]

        ADD      tVal8, tVal8, tVal9                 ;// tVal8 = tVal8 + tVal9
        ADD      sum1,  tVal6, tVal7                 ;// sum1 = sum(pSrcLeft[0] to pSrcLeft[3])
        ADD      tVal4, tVal4, tVal12                ;// tVal4 = tVal4 + tVal12
        ADD      sum2,  tVal8, tVal4                 ;// sum2 = sum(pSrcLeft[4] to pSrcLeft[7])

        ADD      sum1, sum1, #2                      ;// sum1 + 2
        ADD      sum2, sum2, #2                      ;// sum2 + 2

        MOV      sum1, sum1, LSR #2                  ;// (sum1 + 2)>>2
        MOV      sum2, sum2, LSR #2                  ;// (sum2 + 2)>>2

        MUL      tVal6, sum1,r0x01010101             ;// replicate the val in all the bytes
        MUL      tVal8, sum2,r0x01010101             ;// replicate the val in all the bytes

        ;// M_STALL ARM1136JS=1

        MOV      tVal7,tVal6                         ;// tVal7 = replicated upper-half DC
        MOV      tVal9,tVal8                         ;// tVal9 = replicated lower-half DC

        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[0 to 7]   = tVal 6 to 7
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[8 to 15]  = tVal 6 to 7
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[16 to 23] = tVal 6 to 7
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[24 to 31] = tVal 6 to 7
        M_STRD   tVal8, tVal9, [pDst], dstStep       ;// pDst[32 to 39] = tVal 8 to 9
        M_STRD   tVal8, tVal9, [pDst], dstStep       ;// pDst[40 to 47] = tVal 8 to 9
        M_STRD   tVal8, tVal9, [pDst], dstStep       ;// pDst[48 to 55] = tVal 8 to 9
        M_STRD   tVal8, tVal9, [pDst], dstStep       ;// pDst[56 to 63] = tVal 8 to 9
        MOV      return, #OMX_Sts_NoErr
        M_EXIT                                       ;// Macro to exit midway - break from case

;// DC, no neighbour available: fill the block with 0x80
TST_COUNT0
        LDR      sum1, =MUL_CONST1                   ;// sum1 (r6 = tVal6) = 0x80808080

        ;// M_STALL ARM1136JS=2

        MOV      tVal7, sum1                         ;// tVal7 = sum1
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[0 to 7]   = tVal 6 to 7
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[8 to 15]  = tVal 6 to 7
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[16 to 23] = tVal 6 to 7
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[24 to 31] = tVal 6 to 7
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[32 to 39] = tVal 6 to 7
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[40 to 47] = tVal 6 to 7
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[48 to 55] = tVal 6 to 7
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[56 to 63] = tVal 6 to 7
        MOV      return, #OMX_Sts_NoErr
        M_EXIT                                       ;// Macro to exit midway - break from case

;//--------------------------------------------
;// Horizontal prediction: row y is filled with pSrcLeft[y].
;// Two pointer pairs walk the even rows (pSrcLeft / pDst) and the odd rows
;// (pSrcLeft2 / pDst2). Each row is written as two words; the second store
;// advances the pointer by 2*dstStep - 4 to reach the row after next.
;//--------------------------------------------
OMX_VC_CHROMA_HOR

        ;// M_STALL ARM1136JS=2

        ADD      pSrcLeft2, pSrcLeft, leftStep       ;// pSrcLeft2 = pSrcLeft + leftStep
        ADD      leftStepx2, leftStep, leftStep      ;// leftStepx2 = leftStep * 2
        ADD      pDst2, pDst, dstStep                ;// pDst2 = pDst + dstStep
        ADD      dstStepx2, dstStep, dstStep         ;// double dstStep
        SUB      dstStepx2, dstStepx2, #4            ;// double dstStep minus 4
        LDR      r0x01010101, =MUL_CONST0            ;// Const to repeat the byte in reg 4 times
        M_LDRB   tVal6, [pSrcLeft], +leftStepx2      ;// tVal6 = pSrcLeft[0]
        M_LDRB   tVal7, [pSrcLeft2],+leftStepx2      ;// tVal7 = pSrcLeft[1]
        M_LDRB   tVal8, [pSrcLeft], +leftStepx2      ;// tVal8 = pSrcLeft[2]
        M_LDRB   tVal9, [pSrcLeft2],+leftStepx2      ;// tVal9 = pSrcLeft[3]
        MUL      tVal6, tVal6, r0x01010101           ;// replicate the val in all the bytes
        MUL      tVal7, tVal7, r0x01010101           ;// replicate the val in all the bytes
        MUL      tVal8, tVal8, r0x01010101           ;// replicate the val in all the bytes
        MUL      tVal9, tVal9, r0x01010101           ;// replicate the val in all the bytes
        STR      tVal6, [pDst],  #+4                 ;// row 0, bytes 0 to 3
        STR      tVal7, [pDst2], #+4                 ;// row 1, bytes 0 to 3
        M_STR    tVal6, [pDst],  dstStepx2           ;// row 0, bytes 4 to 7; pDst  -> row 2
        M_STR    tVal7, [pDst2], dstStepx2           ;// row 1, bytes 4 to 7; pDst2 -> row 3
        STR      tVal8, [pDst],  #+4                 ;// row 2, bytes 0 to 3
        STR      tVal9, [pDst2], #+4                 ;// row 3, bytes 0 to 3
        M_STR    tVal8, [pDst],  dstStepx2           ;// row 2, bytes 4 to 7; pDst  -> row 4
        M_STR    tVal9, [pDst2], dstStepx2           ;// row 3, bytes 4 to 7; pDst2 -> row 5
        M_LDRB   tVal6, [pSrcLeft], +leftStepx2      ;// tVal6 = pSrcLeft[4]
        M_LDRB   tVal7, [pSrcLeft2],+leftStepx2      ;// tVal7 = pSrcLeft[5]
        M_LDRB   tVal8, [pSrcLeft], +leftStepx2      ;// tVal8 = pSrcLeft[6]
        M_LDRB   tVal9, [pSrcLeft2],+leftStepx2      ;// tVal9 = pSrcLeft[7]
        MUL      tVal6, tVal6, r0x01010101           ;// replicate the val in all the bytes
        MUL      tVal7, tVal7, r0x01010101           ;// replicate the val in all the bytes
        MUL      tVal8, tVal8, r0x01010101           ;// replicate the val in all the bytes
        MUL      tVal9, tVal9, r0x01010101           ;// replicate the val in all the bytes
        STR      tVal6, [pDst],  #+4                 ;// row 4, bytes 0 to 3
        STR      tVal7, [pDst2], #+4                 ;// row 5, bytes 0 to 3
        M_STR    tVal6, [pDst],  dstStepx2           ;// row 4, bytes 4 to 7; pDst  -> row 6
        M_STR    tVal7, [pDst2], dstStepx2           ;// row 5, bytes 4 to 7; pDst2 -> row 7
        STR      tVal8, [pDst],  #+4                 ;// row 6, bytes 0 to 3
        STR      tVal9, [pDst2], #+4                 ;// row 7, bytes 0 to 3
        M_STR    tVal8, [pDst],  dstStepx2           ;// row 6, bytes 4 to 7
        M_STR    tVal9, [pDst2], dstStepx2           ;// row 7, bytes 4 to 7
        MOV      return, #OMX_Sts_NoErr
        M_EXIT

;//--------------------------------------------
;// Vertical prediction: every row is a copy of pSrcAbove[0 to 7].
;// return shares r0 with pSrcLeft, which this mode never reads, so the
;// status can be set before the stores.
;//--------------------------------------------
OMX_VC_CHROMA_VERT

        ;// M_STALL ARM1136JS=4

        LDMIA    pSrcAbove, {tVal6,tVal7}            ;// tVal 6 to 7 = pSrcAbove[0 to 7]
        MOV      return, #OMX_Sts_NoErr

        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[0 to 7]   = tVal 6 to 7
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[8 to 15]  = tVal 6 to 7
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[16 to 23] = tVal 6 to 7
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[24 to 31] = tVal 6 to 7
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[32 to 39] = tVal 6 to 7
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[40 to 47] = tVal 6 to 7
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[48 to 55] = tVal 6 to 7
        M_STRD   tVal6, tVal7, [pDst], dstStep       ;// pDst[56 to 63] = tVal 6 to 7

        M_EXIT                                       ;// Macro to exit midway - break from case

;//--------------------------------------------
;// Plane prediction:
;//   pred[y][x] = clip8((a + b*(x-3) + c*(y-3) + 16) >> 5)
;// with
;//   a = 16 * (pSrcAbove[7] + pSrcLeft[7*leftStep])
;//   H = 4*(above[7]-aboveLeft) + 3*(above[6]-above[0])
;//     + 2*(above[5]-above[1]) +   (above[4]-above[2])
;//   V = the same weighted sum taken down the left column
;//   b = (17*H + 16) >> 5,  c = (17*V + 16) >> 5
;// Row 0 is built once as four registers of two signed 16-bit samples each;
;// the loop then emits one row per iteration and adds {c,c} to every
;// register to obtain the next row.
;//--------------------------------------------
OMX_VC_CHROMA_PLANE

        ;// M_STALL ARM1136JS=3

        RSB      tVal14, leftStep, leftStep, LSL #3  ;// 7*leftStep
        LDRB     tVal7, [pSrcAbove, #+7]             ;// pSrcAbove[7]
        LDRB     tVal6, [pSrcLeft, +tVal14]          ;// pSrcLeft[7*leftStep]
        LDRB     tVal8, [pSrcAboveLeft]              ;// pSrcAboveLeft[0]
        LDRB     tVal9, [pSrcAbove, #+6 ]            ;// pSrcAbove[6]
        LDRB     tVal10,[pSrcAbove]                  ;// pSrcAbove[0]
        ADD      tVal2, tVal7, tVal6                 ;// pSrcAbove[7] + pSrcLeft[7*leftStep]
        SUB      tVal6, tVal6, tVal8                 ;// V0 = pSrcLeft[7*leftStep] - pSrcAboveLeft[0]
        SUB      tVal7, tVal7, tVal8                 ;// H0 = pSrcAbove[7] - pSrcAboveLeft[0]
        LSL      tVal2, tVal2, #4                    ;// a = 16 * (pSrcAbove[7] + pSrcLeft[7*leftStep])
        ADD      tVal2, tVal2, #16                   ;// a + 16
        SUB      tVal9, tVal9,tVal10                 ;// pSrcAbove[6] - pSrcAbove[0]
        LDRB     tVal8, [pSrcAbove,#+5]              ;// pSrcAbove[5]
        LDRB     tVal10,[pSrcAbove,#+1]              ;// pSrcAbove[1]
        ADD      tVal9, tVal9, tVal9, LSL #1         ;// H1 = 3 * (pSrcAbove[6] - pSrcAbove[0])
        ADD      tVal7, tVal9, tVal7, LSL #2         ;// H = H1 + 4*H0
        SUB      tVal8, tVal8, tVal10                ;// pSrcAbove[5] - pSrcAbove[1]
        LDRB     tVal9, [pSrcAbove,#+4]              ;// pSrcAbove[4]
        LDRB     tVal10,[pSrcAbove,#+2]              ;// pSrcAbove[2]
        ADD      tVal7, tVal7, tVal8, LSL #1         ;// H = H + 2*(pSrcAbove[5] - pSrcAbove[1])
        SUB      tVal11, tVal14,leftStep             ;// 6*leftStep
        ADD      tVal11, pSrcLeft, tVal11            ;// pSrcLeft + 6*leftStep (walks upwards)
        MOV      tVal12, pSrcLeft                    ;// pSrcLeft (walks downwards)
        SUB      tVal9, tVal9, tVal10                ;// pSrcAbove[4] - pSrcAbove[2]
        ADD      tVal7, tVal7, tVal9                 ;// H = H + (pSrcAbove[4] - pSrcAbove[2])
        M_LDRB   tVal8, [tVal11],-leftStep           ;// pSrcLeft[6*leftStep]
        M_LDRB   tVal10,[tVal12],+leftStep           ;// pSrcLeft[0]
        ADD      tVal7, tVal7, tVal7, LSL #4         ;// 17 * H
        ADD      tVal7, tVal7, #16                   ;// 17 * H + 16
        SUB      tVal8, tVal8, tVal10                ;// pSrcLeft[6*leftStep] - pSrcLeft[0]
        ASR      b, tVal7, #5                        ;// b = (17 * H + 16) >> 5 (r14: 7*leftStep is dead)
        ADD      tVal8, tVal8, tVal8, LSL #1         ;// V1 = 3 * (pSrcLeft[6*leftStep] - pSrcLeft[0])
        ADD      tVal6, tVal8, tVal6, LSL #2         ;// V = V1 + 4*V0
        M_LDRB   tVal8, [tVal11],-leftStep           ;// pSrcLeft[5*leftStep]
        M_LDRB   tVal10,[tVal12],+leftStep           ;// pSrcLeft[leftStep]
        ADD      tVal7, b, b, LSL #1                 ;// 3*b
        SUB      tVal2, tVal2, tVal7                 ;// a + 16 - 3*b
        SUB      tVal7, tVal8, tVal10                ;// pSrcLeft[5*leftStep] - pSrcLeft[leftStep]
        M_LDRB   tVal8, [tVal11],-leftStep           ;// pSrcLeft[4*leftStep]
        M_LDRB   tVal10,[tVal12],+leftStep           ;// pSrcLeft[2*leftStep]
        ADD      tVal6, tVal6, tVal7, LSL #1         ;// V = V + 2*(pSrcLeft[5*lS] - pSrcLeft[lS])
        LDR      r0x00FF00FF, =MASK_CONST            ;// r0x00FF00FF = 0x00FF00FF
        SUB      tVal7, tVal8, tVal10                ;// pSrcLeft[4*leftStep] - pSrcLeft[2*leftStep]
        ADD      tVal6, tVal6, tVal7                 ;// V = V + (pSrcLeft[4*lS] - pSrcLeft[2*lS])
        SUB      dstStep, dstStep, #4                ;// dstStep - 4 (each row advances pDst by 4 first)
        ADD      tVal6, tVal6, tVal6, LSL #4         ;// 17*V
        ADD      tVal6, tVal6, #16                   ;// 17*V + 16

        ;// M_STALL ARM1136JS=1

        ASR      c, tVal6, #5                        ;// c = (17*V + 16)>>5

        ;// M_STALL ARM1136JS=1

        ADD      tVal6, c, c, LSL #1                 ;// 3*c
        UXTH     c, c                                ;// only in half word
        SUB      tVal6, tVal2, tVal6                 ;// p0 = a - 3*b - 3*c + 16 (signed 32-bit)
        ORR      c, c, c, LSL #16                    ;// {c,c}
        ADD      tVal7, b, b                         ;// 2b
        ADD      tVal2, tVal6, tVal7                 ;// p2 = p0 + 2*b
        ADD      tVal7, tVal7, b                     ;// 3b

        ;// p0 may be negative. PKHBT takes only its bottom halfword, so the
        ;// sign-extension bits of p0 cannot overwrite p2 in the top halfword
        ;// (a plain ORR would force p2 to 0xFFFF whenever p0 < 0).
        PKHBT    p2p0,   tVal6,  tVal2,  LSL #16     ;// p2p0   = pack {p2, p0}
        UXTH     b, b
        UXTH     tVal7, tVal7
        ORR      b, b, b, LSL #16                    ;// {b,b}
        ORR      tVal7, tVal7, tVal7, LSL #16        ;// {3b,3b}
        SADD16   p3p1,   p2p0, b                     ;// p3p1   = p2p0 + {b,b}
        SADD16   p6p4,   p3p1, tVal7                 ;// p6p4   = p3p1 + {3b,3b}
        SADD16   p7p5,   p6p4, b                     ;// p7p5   = p6p4 + {b,b}
        MOV      outerCount, #BLK_SIZE               ;// Outer Loop Count

LOOP_PLANE

        ;// Saturate into the scratch registers so that the row accumulators
        ;// keep their unclipped values; the next row must be (p + c), not
        ;// (clip(p) + c).
        USAT16   pp7pp5, #13, p7p5                    ;// clip13(p7) clip13(p5)
        USAT16   pp6pp4, #13, p6p4                    ;// clip13(p6) clip13(p4)
        USAT16   pp3pp1, #13, p3p1                    ;// clip13(p3) clip13(p1)
        USAT16   pp2pp0, #13, p2p0                    ;// clip13(p2) clip13(p0)

        AND      pp7pp5, r0x00FF00FF, pp7pp5, ASR #5  ;// clip8(p7) clip8(p5)
        AND      pp6pp4, r0x00FF00FF, pp6pp4, ASR #5  ;// clip8(p6) clip8(p4)
        AND      pp3pp1, r0x00FF00FF, pp3pp1, ASR #5  ;// clip8(p3) clip8(p1)
        AND      pp2pp0, r0x00FF00FF, pp2pp0, ASR #5  ;// clip8(p2) clip8(p0)

        SUBS     outerCount, outerCount, #1           ;// outerCount-- (flags consumed by BNE below)

        ORR      p3210, pp2pp0, pp3pp1, LSL #8        ;// pack {p3,p2, p1, p0}
        STR      p3210, [pDst], #4                    ;// store {pDst[0] to pDst[3]}

        ORR      p7654, pp6pp4, pp7pp5, LSL #8        ;// pack {p7,p6, p5, p4}
        M_STR    p7654, [pDst], dstStep               ;// store {pDst[4] to pDst[7]}, step to next row

        SADD16   p7p5,  p7p5, c                       ;// {p7 + c}, {p5 + c}
        SADD16   p6p4,  p6p4, c                       ;// {p6 + c}, {p4 + c}
        SADD16   p3p1,  p3p1, c                       ;// {p3 + c}, {p1 + c}
        SADD16   p2p0,  p2p0, c                       ;// {p2 + c}, {p0 + c}

        BNE      LOOP_PLANE                           ;// Loop for 8 times
        MOV      return, #OMX_Sts_NoErr
        M_END

        ENDIF ;// ARM1136JS

        END
;//-----------------------------------------------------------------------------------------------
;// omxVCM4P10_PredictIntraChroma_8x8 ends
;//-----------------------------------------------------------------------------------------------
|