1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
|
;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;// http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name: omxVCM4P10_PredictIntra_4x4_s.s
;// OpenMAX DL: v1.0.2
;// Revision: 12290
;// Date: Wednesday, April 9, 2008
;//
;//
;//
;//
INCLUDE omxtypes_s.h
INCLUDE armCOMM_s.h
;// Define the processor variants supported by this file
M_VARIANTS CortexA8
;//-----------------------------------------------------------------------------
;// Jump table that implements the C-style switch on the prediction mode.
;// Each word is the address of one case label; the function loads pc from
;// [table + predMode*4], so the order of the entries must not be changed.
;//-----------------------------------------------------------------------------
        M_TABLE armVCM4P10_pSwitchTable4x4
        DCD     OMX_VC_4x4_VERT                 ;// entry 0
        DCD     OMX_VC_4x4_HOR                  ;// entry 1
        DCD     OMX_VC_4x4_DC                   ;// entry 2
        DCD     OMX_VC_4x4_DIAG_DL              ;// entry 3
        DCD     OMX_VC_4x4_DIAG_DR              ;// entry 4
        DCD     OMX_VC_4x4_VR                   ;// entry 5
        DCD     OMX_VC_4x4_HD                   ;// entry 6
        DCD     OMX_VC_4x4_VL                   ;// entry 7
        DCD     OMX_VC_4x4_HU                   ;// entry 8
IF CortexA8
;//--------------------------------------------
;// Scratch registers
;//--------------------------------------------
return RN 0 ;// status code written at ExitPredict4x4; same core register as pSrcLeft
pTable RN 8 ;// base address of armVCM4P10_pSwitchTable4x4
pc RN 15 ;// target of the table load that dispatches on predMode
;//--------------------------------------------
;// Declare input registers
;// pSrcLeft..pDst are used in r0-r3 as received; leftStep, dstStep, predMode
;// and availability are loaded from the stack into r4-r7 at function entry.
;//--------------------------------------------
pSrcLeft RN 0 ;// input pointer
pSrcAbove RN 1 ;// input pointer
pSrcAboveLeft RN 2 ;// input pointer
pDst RN 3 ;// output pointer
leftStep RN 4 ;// input variable
dstStep RN 5 ;// input variable
predMode RN 6 ;// input variable
availability RN 7 ;// input variable
;// Extra row pointers used by OMX_VC_4x4_DIAG_DR. They reuse r1, r4 and r6,
;// so pSrcAbove, leftStep and predMode are lost as soon as these are written.
pDst1 RN 1
pDst2 RN 4
pDst3 RN 6
;// Helpers for walking the left column / destination two rows at a time
pSrcTmp RN 9 ;// pSrcLeft + leftStep
srcStep RN 10 ;// 2 * leftStep
pDstTmp RN 11 ;// pDst + dstStep
dstep RN 12 ;// 2 * dstStep
;//-------------------
;// Neon registers
;// Aliases are grouped by the prediction mode that introduces them. The
;// groups share physical D registers; only one mode's code runs per call.
;//-------------------
;// OMX_VC_4x4_VERT (also the row replicated by DCPredict4x4VertStore)
dAboveU32 DN D0.U32
;// OMX_VC_4x4_HOR
dLeftVal0 DN D0.8
dLeftVal1 DN D1.8
dLeftVal2 DN D2.8
dLeftVal3 DN D3.8
dLeftVal0U32 DN D0.U32
dLeftVal1U32 DN D1.U32
dLeftVal2U32 DN D2.U32
dLeftVal3U32 DN D3.U32
;// OMX_VC_4x4_DC
;// D0 holds the edge pixels and later the replicated DC value; D1 holds the
;// running sum, viewed at widening element sizes.
dLeftVal DN D0.U8
dLeftValU32 DN D0.U32
dSumAboveLeftU16 DN D1.U16
dSumAboveLeftU32 DN D1.U32
dSumAboveLeftU64 DN D1.U64
dSumAboveLeftU8 DN D1.U8
dSum DN D0.U8
dSumLeftValU16 DN D1.U16
dSumLeftValU32 DN D1.U32
dSumLeftValU64 DN D1.U64
dSumLeftValU8 DN D1.U8
dAboveVal DN D0.U8
dSumAboveValU16 DN D1.U16
dSumAboveValU32 DN D1.U32
dSumAboveValU64 DN D1.U64
dSumAboveValU8 DN D1.U8
dConst128U8 DN D0.U8
;// OMX_VC_4x4_DIAG_DL
dAbove DN D0.U8
dU7 DN D2.U8 ;// dU7 and dU3 name the same register (D2)
dU3 DN D2.U8
dAbove0 DN D3.U8
dAbove1 DN D4.U8
dAbove2 DN D5.U8
dTmp DN D6.U8
dTmp0 DN D7.U8
dTmp1 DN D8.U8
dTmp2 DN D9.U8
dTmp3 DN D10.U8
dTmpU32 DN D6.U32
;// OMX_VC_4x4_DIAG_DR
dLeft DN D1.U8
dUL DN D2.U8
;// OMX_VC_4x4_VR
dLeft0 DN D1.U8
dLeft1 DN D2.U8
dEven0 DN D3.U8
dEven1 DN D4.U8
dEven2 DN D5.U8
dOdd0 DN D6.U8
dOdd1 DN D11.U8
dOdd2 DN D12.U8
dTmp3U32 DN D10.U32
dTmp2U32 DN D9.U32
;// OMX_VC_4x4_HD
dTmp1U64 DN D8.U64
dTmp0U64 DN D7.U64
dTmpU64 DN D6.U64
;// NOTE(review): dTmpU32 repeats the identical declaration made in the
;// OMX_VC_4x4_DIAG_DL group above; the second one is redundant.
dTmpU32 DN D6.U32
dTmp1U32 DN D8.U32
;// OMX_VC_4x4_HU
dL3 DN D2.U8
dLeftHU0 DN D3.U8
dLeftHU1 DN D4.U8
dLeftHU2 DN D5.U8
dTmp0U32 DN D7.U32
;//-----------------------------------------------------------------------------------------------
;// omxVCM4P10_PredictIntra_4x4 starts
;//
;// Function entry and prediction-mode dispatch.
;//   r0-r3 : pSrcLeft, pSrcAbove, pSrcAboveLeft, pDst
;//   stack : LeftStep, DstStep, PredMode, Availability (4 bytes each)
;//   r0    : OMX_Sts_NoErr on return (written at ExitPredict4x4)
;//
;// NOTE(review): predMode indexes armVCM4P10_pSwitchTable4x4 without a range
;// check. A value beyond the last table entry loads pc from whatever follows
;// the table, so callers must pass a valid mode.
;//-----------------------------------------------------------------------------------------------
        M_START omxVCM4P10_PredictIntra_4x4, r12,d12

        ;// Arguments passed on the stack, declared in call order
        M_ARG   LeftStep, 4
        M_ARG   DstStep, 4
        M_ARG   PredMode, 4
        M_ARG   Availability, 4

        ;// Pull the four stack arguments into r4-r7, one register pair per load
        M_LDRD  leftStep,dstStep,LeftStep               ;// r4 = leftStep, r5 = dstStep
        M_LDRD  predMode,availability,PredMode          ;// r6 = predMode, r7 = availability

        ;// switch (predMode): jump to the case address stored in the table
        LDR     pTable,=armVCM4P10_pSwitchTable4x4
        LDR     pc, [pTable, predMode, LSL #2]
OMX_VC_4x4_HOR
        ;// Horizontal prediction: every pixel of output row y is a copy of the
        ;// left neighbour pSrcLeft[y*leftStep].
        ;//
        ;// Source and destination are each walked as two interleaved streams
        ;// (rows 0,2 and rows 1,3) that advance by twice the row pitch.
        ADD     srcStep, leftStep, leftStep             ;// 2*leftStep
        ADD     dstep, dstStep, dstStep                 ;// 2*dstStep
        ADD     pSrcTmp, pSrcLeft, leftStep             ;// -> left pixel of row 1
        ADD     pDstTmp, pDst, dstStep                  ;// -> output row 1

        ;// Broadcast each left pixel across all byte lanes of its own register
        VLD1    {dLeftVal0[]},[pSrcLeft],srcStep        ;// pSrcLeft[0*leftStep]
        VLD1    {dLeftVal1[]},[pSrcTmp],srcStep         ;// pSrcLeft[1*leftStep]
        VLD1    {dLeftVal2[]},[pSrcLeft]                ;// pSrcLeft[2*leftStep]
        VLD1    {dLeftVal3[]},[pSrcTmp]                 ;// pSrcLeft[3*leftStep]

        ;// The low 32 bits of each register form one 4-pixel output row
        VST1    dLeftVal0U32[0],[pDst],dstep            ;// pDst[0*dstStep+x] : 0 <= x <= 3
        VST1    dLeftVal1U32[0],[pDstTmp],dstep         ;// pDst[1*dstStep+x] : 0 <= x <= 3
        VST1    dLeftVal2U32[0],[pDst]                  ;// pDst[2*dstStep+x] : 0 <= x <= 3
        VST1    dLeftVal3U32[0],[pDstTmp]               ;// pDst[3*dstStep+x] : 0 <= x <= 3

        B       ExitPredict4x4                          ;// done
OMX_VC_4x4_VERT
        ;// Vertical prediction: all four output rows are a copy of
        ;// pSrcAbove[0..3].
        ADD     dstep, dstStep, dstStep                 ;// 2*dstStep
        ADD     pDstTmp, pDst, dstStep                  ;// -> output row 1
        VLD1    dAboveU32[0],[pSrcAbove]                ;// low word of D0 = U0..U3

        ;// Shared store tail, also entered from the OMX_VC_4x4_DC paths.
        ;// Expects: low 32 bits of D0 = the row to replicate,
        ;//          pDstTmp = pDst + dstStep, dstep = 2*dstStep.
DCPredict4x4VertStore
        VST1    dAboveU32[0],[pDst],dstep               ;// row 0, pDst -> row 2
        VST1    dAboveU32[0],[pDstTmp],dstep            ;// row 1, pDstTmp -> row 3
        VST1    dAboveU32[0],[pDst]                     ;// row 2
        VST1    dAboveU32[0],[pDstTmp]                  ;// row 3

        B       ExitPredict4x4                          ;// done
OMX_VC_4x4_DC
        ;// DC prediction: fill the block with the rounded mean of whichever
        ;// edges are available.
        ;//   left and upper : (sum of 8 pixels + 4) >> 3
        ;//   one edge only  : (sum of 4 pixels + 2) >> 2
        ;//   neither        : 0x80
        ;//
        ;// Every path ends by replicating the value across D0 and branching to
        ;// DCPredict4x4VertStore, which needs pDstTmp = pDst + dstStep and
        ;// dstep = 2*dstStep. These are computed once here instead of in each
        ;// path. The status code is not written here because ExitPredict4x4
        ;// sets it for all modes.
        ADD     pDstTmp, pDst, dstStep
        ADD     dstep, dstStep, dstStep

        TST     availability, #OMX_VC_LEFT
        BEQ     DCPredict4x4LeftNotAvailable

        ;// Left edge available: gather L0..L3 into bytes 0..3 of D0
        ADD     pSrcTmp, pSrcLeft, leftStep
        ADD     srcStep, leftStep, leftStep
        VLD1    {dLeftVal[0]},[pSrcLeft],srcStep        ;// pSrcLeft[0*leftStep]
        VLD1    {dLeftVal[1]},[pSrcTmp],srcStep         ;// pSrcLeft[1*leftStep]
        VLD1    {dLeftVal[2]},[pSrcLeft]                ;// pSrcLeft[2*leftStep]
        VLD1    {dLeftVal[3]},[pSrcTmp]                 ;// pSrcLeft[3*leftStep]

        TST     availability, #OMX_VC_UPPER
        BEQ     DCPredict4x4LeftOnlyAvailable

        ;// Both edges: upper pixels go into bytes 4..7 of D0, then sum all 8
        VLD1    dLeftValU32[1],[pSrcAbove]              ;// pSrcAbove[0 to 3]
        VPADDL  dSumAboveLeftU16, dLeftVal              ;// [pSrcAbove[2+3 | 0+1] | pSrcLeft[2+3 | 0+1]]
        VPADDL  dSumAboveLeftU32, dSumAboveLeftU16      ;// [pSrcAbove[2+3+0+1] | pSrcLeft[2+3+0+1]]
        VPADDL  dSumAboveLeftU64, dSumAboveLeftU32      ;// [pSrcAbove[2+3+0+1] + pSrcLeft[2+3+0+1]]
        VRSHR   dSumAboveLeftU64,dSumAboveLeftU64,#3    ;// Sum = (Sum + 4) >> 3
        VDUP    dSum,dSumAboveLeftU8[0]
        B       DCPredict4x4VertStore

DCPredict4x4LeftOnlyAvailable
        ;// Only the low 32-bit lane of the sum is meaningful; the upper lane
        ;// is built from the unloaded bytes 4..7 of D0 and is ignored.
        VPADDL  dSumLeftValU16, dLeftVal                ;// [ XX | pSrcLeft[2+3 | 0+1]]
        VPADDL  dSumLeftValU32, dSumLeftValU16          ;// [ XXXX | pSrcLeft[2+3+0+1]]
        VRSHR   dSumLeftValU32,dSumLeftValU32,#2        ;// Sum = (Sum + 2) >> 2
        VDUP    dSum,dSumLeftValU8[0]
        B       DCPredict4x4VertStore

DCPredict4x4LeftNotAvailable
        TST     availability, #OMX_VC_UPPER
        BEQ     DCPredict4x4NoneAvailable

        ;// Upper edge only
        VLD1    dAboveU32[0],[pSrcAbove]                ;// pSrcAbove[0 to 3]
        VPADDL  dSumAboveValU16, dAboveVal              ;// [ XX | pSrcAbove[2+3 | 0+1]]
        VPADDL  dSumAboveValU32, dSumAboveValU16        ;// [ XXXX | pSrcAbove[2+3+0+1]]
        VRSHR   dSumAboveValU32,dSumAboveValU32,#2      ;// Sum = (Sum + 2) >> 2
        VDUP    dSum,dSumAboveValU8[0]
        B       DCPredict4x4VertStore

DCPredict4x4NoneAvailable
        ;// No neighbours: predict mid-grey
        VMOV    dConst128U8,#0x80                       ;// 0x8080808080808080 if(count == 0)
        B       DCPredict4x4VertStore
OMX_VC_4x4_DIAG_DL
        ;// Diagonal-down-left prediction from the above row.
        ;// Three copies of the above pixels, offset by 0, 1 and 2 positions and
        ;// padded with the last pixel, are filtered as (a + 2*b + c + 2) >> 2.
        ;// Output row y is bytes y..y+3 of the filtered vector.
        ;// Register pictures below list the highest byte lane first.
        TST     availability, #OMX_VC_UPPER_RIGHT
        BEQ     DiagDLUpperRightNotAvailable

        ;// Upper-right available: read all eight above pixels, pad with U7
        VLD1    dAbove0,[pSrcAbove]                     ;// [U7|U6|U5|U4|U3|U2|U1|U0]
        VDUP    dU7, dAbove0[7]                         ;// [U7|U7|U7|U7|U7|U7|U7|U7]
        VEXT    dAbove2, dAbove0, dU7, #2               ;// [U7|U7|U7|U6|U5|U4|U3|U2]
        VEXT    dAbove1, dAbove0, dU7, #1               ;// [U7|U7|U6|U5|U4|U3|U2|U1]
        B       DiagDLPredict4x4Store

DiagDLUpperRightNotAvailable
        ;// Only U0..U3 are read (into the upper word of D0); pad with U3
        VLD1    dAboveU32[1],[pSrcAbove]                ;// [U3|U2|U1|U0|-|-|-|-]
        VDUP    dU3, dAbove[7]                          ;// [U3|U3|U3|U3|U3|U3|U3|U3]
        VEXT    dAbove2, dAbove, dU3, #6                ;// [U3|U3|U3|U3|U3|U3|U3|U2]
        VEXT    dAbove1, dAbove, dU3, #5                ;// [U3|U3|U3|U3|U3|U3|U2|U1]
        VEXT    dAbove0, dAbove, dU3, #4                ;// [U3|U3|U3|U3|U3|U2|U1|U0]

DiagDLPredict4x4Store
        ;// Halving add followed by rounding halving add gives the 3-tap filter
        VHADD   dTmp, dAbove0, dAbove2
        VRHADD  dTmp, dTmp, dAbove1                     ;// (a+2*b+c+2)>>2

        ;// Store the low word, then rotate down one byte for the next row
        VST1    dTmpU32[0],[pDst],dstStep               ;// row 0
        VEXT    dTmp,dTmp,dTmp,#1
        VST1    dTmpU32[0],[pDst],dstStep               ;// row 1
        VEXT    dTmp,dTmp,dTmp,#1
        VST1    dTmpU32[0],[pDst],dstStep               ;// row 2
        VEXT    dTmp,dTmp,dTmp,#1
        VST1    dTmpU32[0],[pDst]                       ;// row 3

        B       ExitPredict4x4                          ;// done
OMX_VC_4x4_DIAG_DR
        ;// Diagonal-down-right prediction from the left column, the corner
        ;// pixel and the above row. The 3-tap filter (a + 2*b + c + 2) >> 2
        ;// runs along the edge L3..L0, UL, U0..U3; output row y is bytes
        ;// (3-y)..(6-y) of the result.
        ;// Register pictures below list the highest byte lane first.

        ;// All edge loads come first. pSrcAbove and leftStep must be consumed
        ;// before pDst1/pDst2 are written, because those aliases share r1/r4.
        VLD1    dAboveU32[0],[pSrcAbove]                ;// [X|X|X|X|U3|U2|U1|U0]
        VLD1    {dLeft[7]},[pSrcAboveLeft]              ;// dLeft = [UL|L0|L1|L2|L3|X|X|X] once filled
        ADD     pSrcTmp, pSrcLeft, leftStep
        ADD     srcStep, leftStep, leftStep
        VLD1    {dLeft[6]},[pSrcLeft],srcStep           ;// pSrcLeft[0*leftStep]
        VLD1    {dLeft[5]},[pSrcTmp],srcStep            ;// pSrcLeft[1*leftStep]
        VLD1    {dLeft[4]},[pSrcLeft]                   ;// pSrcLeft[2*leftStep]
        VLD1    {dLeft[3]},[pSrcTmp]                    ;// pSrcLeft[3*leftStep]

        ;// One pointer per output row
        ADD     pDst1,pDst,dstStep                      ;// row 1 (overwrites pSrcAbove)
        ADD     pDst2,pDst1,dstStep                     ;// row 2 (overwrites leftStep)
        ADD     pDst3,pDst2,dstStep                     ;// row 3 (overwrites predMode)

        ;// Three views of the edge, each shifted by one pixel
        VEXT    dAbove0,dLeft,dAbove,#3                 ;// [U2|U1|U0|UL|L0|L1|L2|L3]
        VEXT    dAbove1,dLeft,dAbove,#4                 ;// [U3|U2|U1|U0|UL|L0|L1|L2]
        VEXT    dAbove2,dLeft,dAbove,#5                 ;// [ X|U3|U2|U1|U0|UL|L0|L1]

        VHADD   dTmp, dAbove0, dAbove2
        VRHADD  dTmp, dTmp, dAbove1                     ;// (a+2*b+c+2)>>2

        ;// Bottom row first; rotate down one byte for each row above it
        VST1    dTmpU32[0],[pDst3]                      ;// pTmp[0],[1],[2],[3] -> row 3
        VEXT    dTmp,dTmp,dTmp,#1
        VST1    dTmpU32[0],[pDst2]                      ;// pTmp[1],[2],[3],[4] -> row 2
        VEXT    dTmp,dTmp,dTmp,#1
        VST1    dTmpU32[0],[pDst1]                      ;// pTmp[2],[3],[4],[5] -> row 1
        VEXT    dTmp,dTmp,dTmp,#1
        VST1    dTmpU32[0],[pDst]                       ;// pTmp[3],[4],[5],[6] -> row 0

        B       ExitPredict4x4                          ;// done
OMX_VC_4x4_VR
        ;// Vertical-right prediction from UL, U0..U3 and L0..L2.
        ;// Two filtered vectors are built with the same VHADD/VRHADD pair:
        ;//   dTmp1 : byte 0 = 3-tap of (UL,L0,L1); bytes 1..4 = 2-tap averages
        ;//           of neighbouring above pixels (feeding the same pixel in
        ;//           twice turns the 3-tap filter into (a + b + 1) >> 1)
        ;//   dTmp0 : 3-tap filter (a + 2*b + c + 2) >> 2 along L2,L1,L0,UL,U0..U3
        ;// Rows 0 and 1 use bytes 1..4 of dTmp1/dTmp0, rows 2 and 3 bytes 0..3.
        ;// Register pictures below list the highest byte lane first.
        ADD     dstep, dstStep, dstStep                 ;// 2*dstStep
        ADD     pDstTmp, pDst, dstStep                  ;// -> output row 1

        ;// Load UL,U0,U1,U2,U3
        VLD1    dAboveU32[0],[pSrcAbove]
        VLD1    dAbove[7],[pSrcAboveLeft]               ;// [UL|X|X|X|U3|U2|U1|U0]

        ;// Load L0,L1,L2                                  dLeft0 = [L0|L2|X|X|X|X|X|X]
        ;//                                                dLeft1 = [L1| X|X|X|X|X|X|X]
        VLD1    {dLeft0[7]},[pSrcLeft],leftStep         ;// pSrcLeft[0*leftStep]
        VLD1    {dLeft1[7]},[pSrcLeft],leftStep         ;// pSrcLeft[1*leftStep]
        VLD1    {dLeft0[6]},[pSrcLeft]                  ;// pSrcLeft[2*leftStep]

        ;// Operands for dTmp1
        VEXT    dOdd2,dAbove,dAbove,#7                  ;// [ x x x U3 U2 U1 U0 UL ]
        VEXT    dOdd0,dLeft1,dAbove,#7                  ;// [ x x x U3 U2 U1 U0 L1 ]
        VEXT    dOdd1,dLeft0,dOdd2,#7                   ;// [ x x x U2 U1 U0 UL L0 ]

        ;// Operands for dTmp0
        VEXT    dEven0,dLeft0,dOdd2,#6                  ;// [ x x x U1 U0 UL L0 L2 ]
        VEXT    dEven1,dLeft1,dOdd2,#7                  ;// [ x x x U2 U1 U0 UL L1 ]
        VEXT    dEven2,dLeft0,dAbove,#7                 ;// [ x x x U3 U2 U1 U0 L0 ]

        VHADD   dTmp1, dOdd0, dOdd2
        VRHADD  dTmp1, dTmp1, dOdd1                     ;// Tmp[ x x x 9 7 5 3 1 ]
        VHADD   dTmp0, dEven0, dEven2
        VRHADD  dTmp0, dTmp0, dEven1                    ;// Tmp[ x x x 8 6 4 2 0 ]

        ;// Rows 0 and 1 start one byte further along than rows 2 and 3
        VEXT    dTmp3,dTmp1,dTmp1,#1                    ;// Tmp[ x x x x 9 7 5 3 ]
        VEXT    dTmp2,dTmp0,dTmp0,#1                    ;// Tmp[ x x x x 8 6 4 2 ]

        VST1    dTmp3U32[0],[pDst],dstep                ;// row 0 : Tmp[9],[7],[5],[3]
        VST1    dTmp2U32[0],[pDstTmp],dstep             ;// row 1 : Tmp[8],[6],[4],[2]
        VST1    dTmp1U32[0],[pDst],dstep                ;// row 2 : Tmp[7],[5],[3],[1]
        VST1    dTmp0U32[0],[pDstTmp]                   ;// row 3 : Tmp[6],[4],[2],[0]

        B       ExitPredict4x4                          ;// done
OMX_VC_4x4_HD
        ;// Horizontal-down prediction from L3..L0, UL and U0..U2.
        ;// Two filtered vectors are produced and byte-interleaved:
        ;//   dTmp0 : 3-tap filter (a + 2*b + c + 2) >> 2
        ;//   dTmp1 : 2-tap filter (a + b + 1) >> 1
        ;// Register pictures below list the highest byte lane first.

        ;// Load U0,U1,U2,U3 into the low word of dAbove.
        ;// Only bytes 0..2 of dAbove are consumed by the VEXTs below, and this
        ;// mode does not test OMX_VC_UPPER_RIGHT, so just the four pixels
        ;// directly above are read (the same load OMX_VC_4x4_DIAG_DR uses)
        ;// rather than eight bytes, which would read past them.
        VLD1    dAboveU32[0],[pSrcAbove]                ;// dAbove = [X|X|X|X|U3|U2|U1|U0]

        ;// Load UL,L0,L1,L2,L3                            dLeft = [UL|L0|L1|L2|L3|X|X|X]
        VLD1    {dLeft[7]},[pSrcAboveLeft]
        ADD     pSrcTmp, pSrcLeft, leftStep
        ADD     srcStep, leftStep, leftStep
        VLD1    {dLeft[6]},[pSrcLeft],srcStep           ;// pSrcLeft[0*leftStep]
        VLD1    {dLeft[5]},[pSrcTmp],srcStep            ;// pSrcLeft[1*leftStep]
        VLD1    {dLeft[4]},[pSrcLeft]                   ;// pSrcLeft[2*leftStep]
        VLD1    {dLeft[3]},[pSrcTmp]                    ;// pSrcLeft[3*leftStep]

        ;// Three views of the edge, each shifted by one pixel
        VEXT    dAbove0,dLeft,dAbove,#3                 ;// [ U2|U1|U0|UL|L0|L1|L2|L3 ]
        VEXT    dAbove1,dLeft,dAbove,#2                 ;// [ U1|U0|UL|L0|L1|L2|L3|X ]
        VEXT    dAbove2,dLeft,dAbove,#1                 ;// [ U0|UL|L0|L1|L2|L3|X|X ]

        VHADD   dTmp0, dAbove0, dAbove2
        VRHADD  dTmp0, dTmp0, dAbove1                   ;// Tmp[ 0 | 1 | 2 | 4 | 6 | 8 | X | X ]
        VRHADD  dTmp1, dAbove1, dAbove0                 ;// (a+b+1)>>1

        ;// Move the wanted bytes of each vector to the top four lanes so that
        ;// VZIP leaves the interleaved result in dTmp
        VSHL    dTmp1U64,dTmp1U64,#24                   ;// Tmp[ 3|5| 7 |9 | X | X | X | X ]
        VSHL    dTmpU64,dTmp0U64,#16                    ;// Tmp[ 2|4|6|8| X | X | X | X ]
        VZIP    dTmp1,dTmp                              ;// dTmp = [ 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 ]

        ;// Prepend the two remaining 3-tap values used only by row 0
        VEXT    dTmp0,dTmp0,dTmp0,#6                    ;// Tmp[ X| X| X| X| X| X| 0 | 1 ]
        VEXT    dTmp1,dTmp,dTmp0,#2                     ;// Tmp[ 0 | 1 | 2 | 3 | 4 | 5 | 6 |7 ]

        ADD     pDstTmp, pDst, dstStep
        ADD     dstep, dstStep, dstStep

        VST1    dTmp1U32[1],[pDst],dstep                ;// row 0 : pTmp[0|1|2|3]
        VST1    dTmpU32[1],[pDstTmp],dstep              ;// row 1 : pTmp[2|3|4|5]
        VST1    dTmp1U32[0],[pDst]                      ;// row 2 : pTmp[4|5|6|7]
        VST1    dTmpU32[0],[pDstTmp]                    ;// row 3 : pTmp[6|7|8|9]

        B       ExitPredict4x4                          ;// Branch to exit code
OMX_VC_4x4_VL
        ;// Vertical-left prediction from the above row.
        ;//   rows 0 and 2 : 2-tap filter (a + b + 1) >> 1
        ;//   rows 1 and 3 : 3-tap filter (a + 2*b + c + 2) >> 2
        ;// Rows 2 and 3 start one pixel further right than rows 0 and 1.
        ;// Register pictures below list the highest byte lane first.
        ADD     pDstTmp, pDst, dstStep                  ;// -> output row 1
        ADD     dstep, dstStep, dstStep                 ;// 2*dstStep

        TST     availability, #OMX_VC_UPPER_RIGHT
        BEQ     DiagVLUpperRightNotAvailable

        ;// Upper-right available: read all eight above pixels. The top lanes
        ;// of the rotated copies are never stored.
        VLD1    dAbove0,[pSrcAbove]                     ;// [U7|U6|U5|U4|U3|U2|U1|U0]
        VEXT    dAbove1,dAbove0,dAbove0,#1              ;// [ X|U7|U6|U5|U4|U3|U2|U1]
        VEXT    dAbove2,dAbove1,dAbove1,#1              ;// [ X| X|U7|U6|U5|U4|U3|U2]
        B       DiagVLPredict4x4Store

DiagVLUpperRightNotAvailable
        ;// Only U0..U3 are read (into the upper word of D0); pad with U3
        VLD1    dAboveU32[1],[pSrcAbove]                ;// [U3|U2|U1|U0|-|-|-|-]
        VDUP    dU3, dAbove[7]                          ;// [U3|U3|U3|U3|U3|U3|U3|U3]
        VEXT    dAbove2, dAbove, dU3, #6                ;// [U3|U3|U3|U3|U3|U3|U3|U2]
        VEXT    dAbove1, dAbove, dU3, #5                ;// [U3|U3|U3|U3|U3|U3|U2|U1]
        VEXT    dAbove0, dAbove, dU3, #4                ;// [U3|U3|U3|U3|U3|U2|U1|U0]

DiagVLPredict4x4Store
        VHADD   dTmp3, dAbove0, dAbove2
        VRHADD  dTmp3, dTmp3, dAbove1                   ;// (a+2*b+c+2)>>2
                                                        ;// Tmp[ X| X| X| 9| 7| 5| 3| 1 ]
        VRHADD  dTmp0, dAbove1, dAbove0                 ;// (a+b+1)>>1
                                                        ;// Tmp[ X| X| X| 8| 6| 4| 2| 0 ]

        ;// One-byte shifted copies for rows 2 and 3; only bytes 0..3 are stored
        VEXT    dTmp1,dTmp0,dTmp0,#1                    ;// Tmp[ X| X| X| X| 8| 6| 4| 2 ]
        VEXT    dTmp2,dTmp3,dTmp1,#1                    ;// Tmp[ X| X| X| X| 9| 7| 5| 3 ]

        VST1    dTmp0U32[0],[pDst],dstep                ;// row 0 : Tmp[6],[4],[2],[0]
        VST1    dTmp3U32[0],[pDstTmp],dstep             ;// row 1 : Tmp[7],[5],[3],[1]
        VST1    dTmp1U32[0],[pDst]                      ;// row 2 : Tmp[8],[6],[4],[2]
        VST1    dTmp2U32[0],[pDstTmp]                   ;// row 3 : Tmp[9],[7],[5],[3]

        B       ExitPredict4x4                          ;// done
OMX_VC_4x4_HU
        ;// Horizontal-up prediction from the left column only.
        ;// 2-tap and 3-tap filtered values of L0..L3 (padded with L3) are
        ;// interleaved byte by byte; each output row starts two bytes further
        ;// along, and the last row is L3 throughout.
        ;// Register pictures below list the highest byte lane first.
        ADD     srcStep, leftStep, leftStep             ;// 2*leftStep
        ADD     pSrcTmp, pSrcLeft, leftStep             ;// -> left pixel of row 1

        ;// Load the left edge into the upper word of dLeft: [L3|L2|L1|L0|X|X|X|X]
        VLD1    {dLeft[4]},[pSrcLeft],srcStep           ;// pSrcLeft[0*leftStep]
        VLD1    {dLeft[5]},[pSrcTmp],srcStep            ;// pSrcLeft[1*leftStep]
        VLD1    {dLeft[6]},[pSrcLeft]                   ;// pSrcLeft[2*leftStep]
        VLD1    {dLeft[7]},[pSrcTmp]                    ;// pSrcLeft[3*leftStep]

        ;// Three views of the column, each shifted by one pixel, padded with L3
        VDUP    dL3,dLeft[7]                            ;// [L3|L3|L3|L3|L3|L3|L3|L3]
        VEXT    dLeftHU0,dLeft,dL3,#4                   ;// [L3|L3|L3|L3|L3|L2|L1|L0]
        VEXT    dLeftHU1,dLeft,dL3,#5                   ;// [L3|L3|L3|L3|L3|L3|L2|L1]
        VEXT    dLeftHU2,dLeft,dL3,#6                   ;// [L3|L3|L3|L3|L3|L3|L3|L2]

        VRHADD  dTmp1, dLeftHU1, dLeftHU0               ;// (a+b+1)>>1
                                                        ;// Tmp[ L3 | L3 | L3 | L3 | L3 | 4 | 2 | 0 ]
        VHADD   dTmp0, dLeftHU0, dLeftHU2
        VRHADD  dTmp0, dTmp0, dLeftHU1                  ;// (a+2*b+c+2)>>2
                                                        ;// Tmp[ L3 | L3 | L3 | L3 | L3 | 5 | 3 | 1 ]

        VZIP    dTmp1,dTmp0                             ;// dTmp1 = Tmp[7| 6| 5| 4| 3| 2| 1| 0]
                                                        ;// dTmp0 = [L3|L3|L3|L3|L3|L3|L3|L3]

        VST1    dTmp1U32[0],[pDst],dstStep              ;// row 0 : [3|2|1|0]
        VEXT    dTmp1,dTmp1,dTmp1,#2
        VST1    dTmp1U32[0],[pDst],dstStep              ;// row 1 : [5|4|3|2]
        VEXT    dTmp1,dTmp1,dTmp1,#2
        VST1    dTmp1U32[0],[pDst],dstStep              ;// row 2 : [7|6|5|4]
        VST1    dTmp0U32[0],[pDst]                      ;// row 3 : [9|8|7|6]

        ;// Common exit for every prediction mode: report success and return
ExitPredict4x4
        MOV     return, #OMX_Sts_NoErr
        M_END
ENDIF ;// CortexA8
END
;//-----------------------------------------------------------------------------------------------
;// omxVCM4P10_PredictIntra_4x4 ends
;//-----------------------------------------------------------------------------------------------
|