summaryrefslogtreecommitdiffstats
path: root/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_PredictIntra_4x4_s.s
blob: b5780efa7b21109e2dc4e02d9ed776757511d404 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;//      http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;// 
;// File Name:  omxVCM4P10_PredictIntra_4x4_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;// 
;// 
;// 
;//

        
        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h
        
;// Define the processor variants supported by this file
         
         M_VARIANTS CortexA8
        
;//-------------------------------------------------------
;// Jump table that implements a C-style switch statement
;// in assembly by the method of two levels of indexing:
;// the function first loads the address of this table and
;// then loads pc from entry [predMode] (4 bytes per entry).
;//
;// Entry N holds the address of the handler label for
;// prediction mode N, in this order:
;//   0 VERT, 1 HOR, 2 DC, 3 DIAG_DL, 4 DIAG_DR,
;//   5 VR,   6 HD,  7 VL, 8 HU
;// The table has exactly 9 entries.
;//-------------------------------------------------------

    M_TABLE armVCM4P10_pSwitchTable4x4
    DCD  OMX_VC_4x4_VERT,     OMX_VC_4x4_HOR 
    DCD  OMX_VC_4x4_DC,       OMX_VC_4x4_DIAG_DL
    DCD  OMX_VC_4x4_DIAG_DR,  OMX_VC_4x4_VR
    DCD  OMX_VC_4x4_HD,       OMX_VC_4x4_VL
    DCD  OMX_VC_4x4_HU   
    
        
        IF CortexA8
        
;//--------------------------------------------
;// Scratch variable
;//--------------------------------------------
;// NOTE: 'return' is r0, the same register as pSrcLeft (declared below).
;// The status code must therefore only be written after the last use of
;// pSrcLeft on the path being executed.
return          RN 0
pTable          RN 8    ;// address of armVCM4P10_pSwitchTable4x4
pc              RN 15

;//--------------------------------------------
;// Declare input registers
;//--------------------------------------------
;// r0-r3 hold the pointer arguments; r4-r7 are filled from the stack
;// arguments by the M_LDRD instructions at function entry.
pSrcLeft        RN 0    ;// input pointer
pSrcAbove       RN 1    ;// input pointer
pSrcAboveLeft   RN 2    ;// input pointer
pDst            RN 3    ;// output pointer
leftStep        RN 4    ;// input variable
dstStep         RN 5    ;// input variable
predMode        RN 6    ;// input variable
availability    RN 7    ;// input variable
;// Row pointers used only by the DIAG_DR case. They reuse r1, r4 and r6,
;// i.e. writing them destroys pSrcAbove, leftStep and predMode.
pDst1           RN 1    ;// pDst + 1*dstStep
pDst2           RN 4    ;// pDst + 2*dstStep
pDst3           RN 6    ;// pDst + 3*dstStep

;// Helpers for walking two rows at a time with interleaved pointers
pSrcTmp         RN 9    ;// pSrcLeft + leftStep
srcStep         RN 10   ;// 2 * leftStep
pDstTmp         RN 11   ;// pDst + dstStep
dstep           RN 12   ;// 2 * dstStep

;//-------------------
;// Neon registers
;//-------------------
;// The names below are per-case views of the same few D registers; several
;// names map to one physical register (for example D0 is dAboveU32, dLeftVal,
;// dSum, dAboveVal, dConst128U8 and dAbove). Only one prediction case runs
;// per call, so the views do not conflict between cases.

;// OMX_VC_4x4_VERT
dAboveU32       DN  D0.U32

;// OMX_VC_4x4_HOR
dLeftVal0       DN  D0.8
dLeftVal1       DN  D1.8
dLeftVal2       DN  D2.8
dLeftVal3       DN  D3.8
dLeftVal0U32    DN  D0.U32
dLeftVal1U32    DN  D1.U32
dLeftVal2U32    DN  D2.U32
dLeftVal3U32    DN  D3.U32

;// OMX_VC_4x4_DC
;// D0 holds the loaded edge pixels and later the replicated DC value;
;// D1 holds the running pairwise sums at increasing element widths.
dLeftVal        DN  D0.U8
dLeftValU32     DN  D0.U32
dSumAboveLeftU16  DN  D1.U16
dSumAboveLeftU32  DN  D1.U32
dSumAboveLeftU64  DN  D1.U64
dSumAboveLeftU8 DN  D1.U8
dSum            DN  D0.U8

dSumLeftValU16  DN  D1.U16
dSumLeftValU32  DN  D1.U32
dSumLeftValU64  DN  D1.U64
dSumLeftValU8   DN  D1.U8

dAboveVal       DN  D0.U8
dSumAboveValU16  DN  D1.U16
dSumAboveValU32  DN  D1.U32
dSumAboveValU64  DN  D1.U64
dSumAboveValU8   DN  D1.U8
dConst128U8     DN  D0.U8


;//OMX_VC_4x4_DIAG_DL

dAbove          DN  D0.U8
dU7             DN  D2.U8       ;// dU7 and dU3 are the same register (D2)
dU3             DN  D2.U8
dAbove0         DN  D3.U8
dAbove1         DN  D4.U8
dAbove2         DN  D5.U8
dTmp            DN  D6.U8
dTmp0           DN  D7.U8
dTmp1           DN  D8.U8
dTmp2            DN  D9.U8
dTmp3            DN  D10.U8
dTmpU32         DN  D6.U32


;//OMX_VC_4x4_DIAG_DR
dLeft           DN  D1.U8
dUL             DN  D2.U8

;//OMX_VC_4x4_VR
dLeft0          DN  D1.U8
dLeft1          DN  D2.U8
dEven0          DN  D3.U8
dEven1          DN  D4.U8
dEven2          DN  D5.U8
dOdd0           DN  D6.U8
dOdd1           DN  D11.U8
dOdd2           DN  D12.U8
dTmp3U32        DN  D10.U32    
dTmp2U32        DN  D9.U32


;//OMX_VC_4x4_HD
dTmp1U64        DN  D8.U64
dTmp0U64        DN  D7.U64
dTmpU64         DN  D6.U64
dTmpU32         DN  D6.U32      ;// same mapping as the dTmpU32 declared in the DIAG_DL group
dTmp1U32        DN  D8.U32

;//OMX_VC_4x4_HU
dL3             DN  D2.U8
dLeftHU0        DN  D3.U8
dLeftHU1        DN  D4.U8
dLeftHU2        DN  D5.U8
dTmp0U32        DN  D7.U32




;//-----------------------------------------------------------------------------------------------
;// omxVCM4P10_PredictIntra_4x4 starts
;//
;// Writes a 4x4 block of predicted pixels to pDst using the neighbouring
;// pixels and the selected prediction mode.
;//
;// In (registers):  r0 = pSrcLeft, r1 = pSrcAbove, r2 = pSrcAboveLeft, r3 = pDst
;// In (stack):      LeftStep, DstStep, PredMode, Availability (4 bytes each)
;// Out:             r0 = OMX_Sts_NoErr (set at ExitPredict4x4)
;// Uses:            r4-r12 and D0-D12 through the aliases declared above
;//
;// NOTE(review): predMode is used directly as an index into the 9-entry
;// switch table; this file performs no range check on it, so a value
;// outside 0..8 would load pc from memory beyond the table.
;//-----------------------------------------------------------------------------------------------
        
        ;// Write function header
        ;// (r12,d12 are passed to M_START from armCOMM_s.h; presumably the highest
        ;//  core/NEON registers the prologue must preserve - confirm in the macro)
        M_START omxVCM4P10_PredictIntra_4x4, r12,d12
        
        ;// Define stack arguments
        M_ARG    LeftStep,     4
        M_ARG    DstStep,      4
        M_ARG    PredMode,     4
        M_ARG    Availability, 4
        
                
        LDR      pTable,=armVCM4P10_pSwitchTable4x4  ;// Load index table for switch case
        
        ;// Load argument from the stack
        M_LDRD   predMode,availability,PredMode     ;// Arg predMode & availability loaded from stack to reg 
        M_LDRD   leftStep,dstStep,LeftStep          ;// Arg leftStep & dstStep loaded from stack to reg 
        
        
        LDR      pc, [pTable, predMode, LSL #2]      ;// Branch to the case based on predMode (pc = table[predMode])


OMX_VC_4x4_HOR
        
        ;// Horizontal prediction: each output row y is filled with the left
        ;// neighbour pSrcLeft[y*leftStep]. Two interleaved pointers, each
        ;// advancing by two rows, cover rows 0/2 and rows 1/3.
        ADD     pSrcTmp, pSrcLeft, leftStep                 ;// pSrcTmp -> left pixel of row 1
        ADD     srcStep, leftStep, leftStep                 ;// srcStep = 2*leftStep
        ;// Load Left Edge (the [] form replicates the loaded byte to every lane)
        VLD1    {dLeftVal0[]},[pSrcLeft],srcStep           ;// pSrcLeft[0*leftStep]
        VLD1    {dLeftVal1[]},[pSrcTmp],srcStep            ;//    pSrcLeft[1*leftStep]
        VLD1    {dLeftVal2[]},[pSrcLeft]                   ;//    pSrcLeft[2*leftStep]
        VLD1    {dLeftVal3[]},[pSrcTmp]                    ;//    pSrcLeft[3*leftStep]
        
        ADD     pDstTmp, pDst, dstStep                      ;// pDstTmp -> row 1
        ADD     dstep, dstStep, dstStep                     ;// dstep = 2*dstStep
        
        ;// Store one 32-bit lane (4 pixels) per row
        VST1    dLeftVal0U32[0],[pDst],dstep                ;// pDst[0*dstStep+x] :0<= x <= 3
        VST1    dLeftVal1U32[0],[pDstTmp],dstep             ;// pDst[1*dstStep+x] :0<= x <= 3
        VST1    dLeftVal2U32[0],[pDst]                      ;// pDst[2*dstStep+x] :0<= x <= 3
        VST1    dLeftVal3U32[0],[pDstTmp]                   ;// pDst[3*dstStep+x] :0<= x <= 3
        
        B        ExitPredict4x4                             ;// Branch to exit code
        
OMX_VC_4x4_VERT
        
        ;// Vertical prediction: every output row is a copy of the four pixels
        ;// at pSrcAbove.
        
        ;// Load Upper Edge
        VLD1     dAboveU32[0],[pSrcAbove]                   ;// low 32 bits of D0 = pSrcAbove[0 to 3]
        ADD     pDstTmp, pDst, dstStep                      ;// pDstTmp -> row 1
        ADD     dstep, dstStep, dstStep                     ;// dstep = 2*dstStep
        
        ;// Shared store tail. The DC paths branch here after placing the DC
        ;// value in every byte of D0 and setting up pDstTmp and dstep themselves.
DCPredict4x4VertStore         
        
        VST1     dAboveU32[0],[pDst],dstep                  ;// row 0
        VST1     dAboveU32[0],[pDstTmp],dstep               ;// row 1
        VST1     dAboveU32[0],[pDst]                        ;// row 2
        VST1     dAboveU32[0],[pDstTmp]                     ;// row 3

        B        ExitPredict4x4                             ;// Branch to exit code

OMX_VC_4x4_DC
        
        ;// DC prediction: the whole block is filled with one value chosen by
        ;// the availability flags:
        ;//   left and upper : (sum of 4 left + 4 above + 4) >> 3
        ;//   left only      : (sum of 4 left  + 2) >> 2
        ;//   upper only     : (sum of 4 above + 2) >> 2
        ;//   neither        : 128
        ;// Every path ends by branching to DCPredict4x4VertStore, which stores
        ;// the low 32 bits of D0 to the four rows.
        ;// 'return' is r0 (= pSrcLeft), so each path writes it only after its
        ;// left-edge loads, if any.
        
        TST     availability, #OMX_VC_LEFT
        BEQ     DCPredict4x4LeftNotAvailable

        ADD     pSrcTmp, pSrcLeft, leftStep
        ADD     srcStep, leftStep, leftStep
        ;// Load Left Edge into byte lanes 0..3 of D0
        VLD1    {dLeftVal[0]},[pSrcLeft],srcStep            ;// pSrcLeft[0*leftStep]
        VLD1    {dLeftVal[1]},[pSrcTmp],srcStep             ;//    pSrcLeft[1*leftStep]
        VLD1    {dLeftVal[2]},[pSrcLeft]                    ;//    pSrcLeft[2*leftStep]
        VLD1    {dLeftVal[3]},[pSrcTmp]                     ;//    pSrcLeft[3*leftStep]
        
        TST     availability, #OMX_VC_UPPER
        BEQ     DCPredict4x4LeftOnlyAvailable

        ;// Load Upper Edge also (into the upper 32 bits of D0)
        VLD1     dLeftValU32[1],[pSrcAbove]                 ;// pSrcAbove[0 to 3]
        MOV      return, #OMX_Sts_NoErr
        
        ;// Three widening pairwise adds reduce the 8 bytes to a single sum
        VPADDL   dSumAboveLeftU16, dLeftVal                 ;// [pSrcAbove[2+3 | 0+1] | pSrcLeft[2+3 | 0+1]]             
        VPADDL   dSumAboveLeftU32, dSumAboveLeftU16         ;// [pSrcAbove[2+3+0+1] | pSrcLeft[2+3+0+1]] 
        VPADDL   dSumAboveLeftU64, dSumAboveLeftU32         ;// [pSrcAbove[2+3+0+1] + pSrcLeft[2+3+0+1]]                          
        VRSHR    dSumAboveLeftU64,dSumAboveLeftU64,#3       ;// Sum = (Sum + 4) >> 3
        ADD     pDstTmp, pDst, dstStep
        ADD     dstep, dstStep, dstStep
        VDUP     dSum,dSumAboveLeftU8[0]                    ;// replicate the DC byte to all lanes of D0
        
        B        DCPredict4x4VertStore  
        
DCPredict4x4LeftOnlyAvailable

        MOV      return, #OMX_Sts_NoErr                     ;// returnNoError
        
        ;// Lanes 4..7 of D0 were not loaded on this path; their sums land in the
        ;// upper half of D1 and are ignored (only byte 0 is replicated below).
        VPADDL   dSumLeftValU16, dLeftVal                   ;// [ XX | pSrcLeft[2+3 | 0+1]]             
        VPADDL   dSumLeftValU32, dSumLeftValU16             ;// [ XXXX | pSrcLeft[2+3+0+1]] 
        
        VRSHR    dSumLeftValU32,dSumLeftValU32,#2           ;// Sum = (Sum + 2) >> 2
        ADD     pDstTmp, pDst, dstStep
        ADD     dstep, dstStep, dstStep
        VDUP     dSum,dSumLeftValU8[0]
        
        B        DCPredict4x4VertStore   
        
DCPredict4x4LeftNotAvailable
                 
        TST     availability, #OMX_VC_UPPER
        BEQ     DCPredict4x4NoneAvailable

        ;// Load Upper Edge 
        VLD1     dAboveU32[0],[pSrcAbove]                   ;// pSrcAbove[0 to 3]  
        MOV      return, #OMX_Sts_NoErr
        
        ;// As in the left-only path, only the low 32-bit sum is meaningful
        VPADDL   dSumAboveValU16, dAboveVal                 ;// [ XX | pSrcAbove[2+3 | 0+1]]             
        VPADDL   dSumAboveValU32, dSumAboveValU16           ;// [ XXXX | pSrcAbove[2+3+0+1]] 
        
        VRSHR    dSumAboveValU32,dSumAboveValU32,#2         ;// Sum = (Sum + 2) >> 2
        ADD     pDstTmp, pDst, dstStep
        ADD     dstep, dstStep, dstStep
        VDUP     dSum,dSumAboveValU8[0]
        
        B        DCPredict4x4VertStore   
        
DCPredict4x4NoneAvailable        
        
        ;// Neither edge available: fill with the constant 128
        VMOV     dConst128U8,#0x80                          ;// 0x8080808080808080 if(count == 0)
        MOV      return, #OMX_Sts_NoErr
        
        ADD     pDstTmp, pDst, dstStep
        ADD     dstep, dstStep, dstStep
        B        DCPredict4x4VertStore   
        
        
        
OMX_VC_4x4_DIAG_DL
        
        ;// Diagonal-Down-Left prediction from the above row U0..U7.
        ;// Tmp[i] = (U[i] + 2*U[i+1] + U[i+2] + 2) >> 2, with the row extended
        ;// by repeating its last pixel; output row y is Tmp[y .. y+3].
        ;// Register diagrams list lane 7 on the left and lane 0 on the right.
        ;// When OMX_VC_UPPER_RIGHT is not set only U0..U3 are read and U3 is
        ;// used in place of U4..U7.
        
        TST     availability, #OMX_VC_UPPER_RIGHT
        BEQ     DiagDLUpperRightNotAvailable
       
        VLD1    dAbove0,[pSrcAbove]                     ;// [U7|U6|U5|U4|U3|U2|U1|U0] 
        VDUP    dU7, dAbove0[7]                         ;// [U7|U7|U7|U7|U7|U7|U7|U7]
        VEXT    dAbove1, dAbove0, dU7, #1               ;// [U7|U7|U6|U5|U4|U3|U2|U1]
        VEXT    dAbove2, dAbove0, dU7, #2               ;// [U7|U7|U7|U6|U5|U4|U3|U2] 
        B       DiagDLPredict4x4Store         
       
DiagDLUpperRightNotAvailable
        ;// Load the 4 above pixels into the upper half of D0 so that U3 ends up
        ;// in lane 7 and the VEXTs below can shift in copies of U3.
        VLD1    dAboveU32[1],[pSrcAbove]                ;// [U3|U2|U1|U0|-|-|-|-] 
        VDUP    dU3, dAbove[7]                          ;// [U3 U3 U3 U3 U3 U3 U3 U3]

        VEXT    dAbove0, dAbove, dU3, #4                ;// [U3 U3 U3 U3 U3 U2 U1 U0]
        VEXT    dAbove1, dAbove, dU3, #5                ;// [U3 U3 U3 U3 U3 U3 U2 U1]
        VEXT    dAbove2, dAbove, dU3, #6                ;// [U3 U3 U3 U3 U3 U3 U3 U2]
       
DiagDLPredict4x4Store  
        
        ;// Halving add followed by rounding halving add gives the 3-tap filter
        VHADD   dTmp, dAbove0, dAbove2
        VRHADD  dTmp, dTmp, dAbove1                     ;// (a+2*b+c+2)>>2
        

        ;// Store 4 bytes per row, rotating the vector down by one byte between rows
        VST1    dTmpU32[0],[pDst],dstStep               ;// row 0 = Tmp[0..3]
        VEXT    dTmp,dTmp,dTmp,#1
        VST1    dTmpU32[0],[pDst],dstStep               ;// row 1 = Tmp[1..4]
        VEXT    dTmp,dTmp,dTmp,#1
        VST1    dTmpU32[0],[pDst],dstStep               ;// row 2 = Tmp[2..5]
        VEXT    dTmp,dTmp,dTmp,#1
        VST1    dTmpU32[0],[pDst]                       ;// row 3 = Tmp[3..6]
        
        B        ExitPredict4x4                         ;// Branch to exit code
        

OMX_VC_4x4_DIAG_DR
        
        ;// Diagonal-Down-Right prediction. The neighbours are assembled into one
        ;// sequence L3,L2,L1,L0,UL,U0,U1,U2,U3 and the 3-tap filter
        ;// (a+2*b+c+2)>>2 is run along it; output row 3 takes filtered lanes
        ;// 0..3 and each row above it starts one lane higher.
        ;// Register diagrams list lane 7 on the left and lane 0 on the right.
        ;// pDst1/pDst2/pDst3 reuse r1/r4/r6, so each is written only after the
        ;// last use of pSrcAbove/leftStep on this path.
        
        ;// Load U0,U1,U2,U3
        
        VLD1    dAboveU32[0],[pSrcAbove]                ;// [X|X|X|X|U3|U2|U1|U0]
                
        ;// Load UL,L0,L1,L2,L3                         ;// dLeft = [UL|L0|L1|L2|L3|X|X|X]    
        VLD1    {dLeft[7]},[pSrcAboveLeft]              
        ADD     pSrcTmp, pSrcLeft, leftStep
        ADD     srcStep, leftStep, leftStep
        ADD     pDst1,pDst,dstStep                      ;// overwrites pSrcAbove (already loaded)
        
        VLD1    {dLeft[6]},[pSrcLeft],srcStep           ;// pSrcLeft[0*leftStep]
        VLD1    {dLeft[5]},[pSrcTmp],srcStep            ;// pSrcLeft[1*leftStep]
        VLD1    {dLeft[4]},[pSrcLeft]                   ;// pSrcLeft[2*leftStep]
        VLD1    {dLeft[3]},[pSrcTmp]                    ;// pSrcLeft[3*leftStep]
        
        
        VEXT    dAbove0,dLeft,dAbove,#3                 ;// [U2|U1|U0|UL|L0|L1|L2|L3]   
        ADD     pDst2,pDst1,dstStep                     ;// overwrites leftStep (no longer needed)
        VEXT    dAbove1,dLeft,dAbove,#4                 ;// [U3|U2|U1|U0|UL|L0|L1|L2]   
        ADD     pDst3,pDst2,dstStep                     ;// overwrites predMode
        VEXT    dAbove2,dLeft,dAbove,#5                 ;// [ X|U3|U2|U1|U0|UL|L0|L1]   
        
        VHADD   dTmp, dAbove0, dAbove2
        VRHADD  dTmp, dTmp, dAbove1                     ;// (a+2*b+c+2)>>2
                                                        ;// lane 7 depends on X and is never stored
        
        
        ;// Rows are written bottom-up, rotating the vector by one byte each time
        VST1    dTmpU32[0],[pDst3]                      ;// Store pTmp[0],[1],[2],[3] @ pDst3
        VEXT    dTmp,dTmp,dTmp,#1
        VST1    dTmpU32[0],[pDst2]                      ;// Store pTmp[1],[2],[3],[4] @ pDst2
        VEXT    dTmp,dTmp,dTmp,#1
        VST1    dTmpU32[0],[pDst1]                      ;// Store pTmp[2],[3],[4],[5] @ pDst1
        VEXT    dTmp,dTmp,dTmp,#1
        VST1    dTmpU32[0],[pDst]                       ;// Store pTmp[3],[4],[5],[6] @ pDst
        
        B        ExitPredict4x4                         ;// Branch to exit code

OMX_VC_4x4_VR

        ;// Vertical-Right prediction from UL, U0..U3 and L0..L2.
        ;// Both result vectors are produced with the same VHADD/VRHADD pair.
        ;// In the "odd" vector, lanes 1..4 receive the same pixel as both outer
        ;// taps, so (a + 2*b + a + 2)>>2 reduces to the 2-tap average (a+b+1)>>1;
        ;// lane 0 and the whole "even" vector are true 3-tap results.
        ;// Register diagrams list lane 7 on the left and lane 0 on the right;
        ;// 'x' marks lanes that are never stored.
        
        ;// Load UL,U0,U1,U2,U3
        VLD1    dAboveU32[0],[pSrcAbove]
        VLD1    dAbove[7],[pSrcAboveLeft]               ;// [UL|X|X|X|U3|U2|U1|U0]
        
        ;// Load L0,L1,L2                               ;// dLeft0 = [L0|L2|X|X|X|X|X|X]
                                                        ;// dLeft1 = [L1| X|X|X|X|X|X|X]    
        VLD1    {dLeft0[7]},[pSrcLeft],leftStep         ;// pSrcLeft[0*leftStep]
        VLD1    {dLeft1[7]},[pSrcLeft],leftStep         ;// pSrcLeft[1*leftStep]
        VLD1    {dLeft0[6]},[pSrcLeft]                  ;// pSrcLeft[2*leftStep]
        
        
        VEXT    dOdd2,dAbove,dAbove,#7                  ;// [ x x x U3 U2 U1 U0 UL ]
        VEXT    dEven0,dLeft0,dOdd2,#6                  ;// [ x x x U1 U0 UL L0 L2 ]
        VEXT    dEven1,dLeft1,dOdd2,#7                  ;// [ x x x U2 U1 U0 UL L1 ]
        VEXT    dEven2,dLeft0,dAbove,#7                 ;// [ x x x U3 U2 U1 U0 L0 ]
        VEXT    dOdd0,dLeft1,dAbove,#7                  ;// [ x x x U3 U2 U1 U0 L1 ]
        VEXT    dOdd1,dLeft0,dOdd2,#7                   ;// [ x x x U2 U1 U0 UL L0 ]
        
        VHADD   dTmp1, dOdd0, dOdd2
        VRHADD  dTmp1, dTmp1, dOdd1                     ;// Tmp[ x x x 9 7 5 3 1 ]
                                                        ;// lane 0 = (L1+2*L0+UL+2)>>2, lanes 1..4 = 2-tap averages
        
        VHADD   dTmp0, dEven0, dEven2
        VRHADD  dTmp0, dTmp0, dEven1                    ;// Tmp[ x x x 8 6 4 2 0 ]
                                                        ;// lanes 0..4 = 3-tap results
        
        
        VEXT    dTmp3,dTmp1,dTmp1,#1                    ;// Tmp[ x x x x 9 7 5 3 ] 
        ADD     pDstTmp, pDst, dstStep
        ADD     dstep, dstStep, dstStep
        VEXT    dTmp2,dTmp0,dTmp0,#1                    ;// Tmp[ x x x x 8 6 4 2 ]
        
        
        ;// pDst walks rows 0 and 2, pDstTmp walks rows 1 and 3
        VST1    dTmp3U32[0],[pDst],dstep                ;// Tmp[9],[7],[5],[3]
        VST1    dTmp2U32[0],[pDstTmp],dstep             ;// Tmp[8],[6],[4],[2]
        VST1    dTmp1U32[0],[pDst],dstep                ;// Tmp[7],[5],[3],[1]
        VST1    dTmp0U32[0],[pDstTmp]                   ;// Tmp[6],[4],[2],[0]
        
        B        ExitPredict4x4                         ;// Branch to exit code
        
OMX_VC_4x4_HD
        
        ;// Horizontal-Down prediction from UL, U0..U2 and L0..L3.
        ;// A 3-tap vector (a+2*b+c+2)>>2 and a 2-tap vector (a+b+1)>>1 are
        ;// computed along the sequence L3,L2,L1,L0,UL,U0,U1,U2, interleaved
        ;// with VZIP, and each output row is one 4-byte window of the
        ;// interleaved data.
        ;// Register diagrams list lane 7 on the left and lane 0 on the right;
        ;// 'X' marks lanes whose contents are never stored.
        
        ;// Load U0,U1,U2,U3.
        ;// Only U0..U2 are consumed below (the VEXTs take at most the low three
        ;// bytes of dAbove), so a 4-byte lane load is sufficient. This avoids
        ;// reading pSrcAbove[4..7], which this case does not need and whose
        ;// availability it never checks.
        VLD1    dAboveU32[0],[pSrcAbove]                ;// dAbove = [X|X|X|X|U3|U2|U1|U0]
        
        ;// Load UL,L0,L1,L2,L3                         ;// dLeft = [UL|L0|L1|L2|L3|X|X|X] 
        VLD1    {dLeft[7]},[pSrcAboveLeft]   
        ADD     pSrcTmp, pSrcLeft, leftStep
        ADD     srcStep, leftStep, leftStep
        
        VLD1    {dLeft[6]},[pSrcLeft],srcStep           ;// pSrcLeft[0*leftStep]
        VLD1    {dLeft[5]},[pSrcTmp],srcStep            ;// pSrcLeft[1*leftStep]
        VLD1    {dLeft[4]},[pSrcLeft]                   ;// pSrcLeft[2*leftStep]
        VLD1    {dLeft[3]},[pSrcTmp]                    ;// pSrcLeft[3*leftStep]
        
        VEXT    dAbove0,dLeft,dAbove,#3                 ;// [ U2|U1|U0|UL|L0|L1|L2|L3 ]  
        VEXT    dAbove1,dLeft,dAbove,#2                 ;// [ U1|U0|UL|L0|L1|L2|L3|X ]   
        VEXT    dAbove2,dLeft,dAbove,#1                 ;// [ U0|UL|L0|L1|L2|L3|X|X ]     
        
        VHADD   dTmp0, dAbove0, dAbove2
        VRHADD  dTmp0, dTmp0, dAbove1                   ;// Tmp[ 0 | 1 | 2 | 4 | 6 | 8 | X | X ]
                                                        ;// 3-tap results (a+2*b+c+2)>>2
        
        
        VRHADD  dTmp1, dAbove1, dAbove0                 ;// (a+b+1)>>1
        VSHL    dTmp1U64,dTmp1U64,#24                   ;// Tmp[ 3|5| 7 |9 | X | X | X | X ]
                                                        ;// (moved up by 3 bytes)
        
        
        VSHL    dTmpU64,dTmp0U64,#16                    ;// Tmp[ 2|4|6|8| X | X | X | X ]
                                                        ;// (3-tap results moved up by 2 bytes)
        VZIP    dTmp1,dTmp                              ;// dTmp = [ 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 ]
        VEXT    dTmp0,dTmp0,dTmp0,#6                    ;// Tmp[  X| X| X| X| X| X| 0 | 1 ]
        VEXT    dTmp1,dTmp,dTmp0,#2                     ;// Tmp[ 0 | 1 | 2 | 3 | 4 | 5 | 6 |7 ]
       
        ADD     pDstTmp, pDst, dstStep
        ADD     dstep, dstStep, dstStep
        
        ;// pDst walks rows 0 and 2, pDstTmp walks rows 1 and 3
        VST1    dTmp1U32[1],[pDst],dstep                ;// Store pTmp[0|1|2|3]
        VST1    dTmpU32[1],[pDstTmp],dstep              ;// Store pTmp[2|3|4|5]
        VST1    dTmp1U32[0],[pDst]                      ;// Store pTmp[4|5|6|7]
        VST1    dTmpU32[0],[pDstTmp]                    ;// Store pTmp[6|7|8|9]
        
        B        ExitPredict4x4                         ;// Branch to exit code
        
OMX_VC_4x4_VL

        ;// Vertical-Left prediction from the above row U0..U7.
        ;//   rows 0 and 2 use the 2-tap average (U[i] + U[i+1] + 1) >> 1
        ;//   rows 1 and 3 use the 3-tap filter  (U[i] + 2*U[i+1] + U[i+2] + 2) >> 2
        ;// Rows 2 and 3 start one pixel further right than rows 0 and 1.
        ;// Register diagrams list lane 7 on the left and lane 0 on the right.
        ;// When OMX_VC_UPPER_RIGHT is not set only U0..U3 are read and U3 is
        ;// used in place of U4..U7.
        
        TST     availability, #OMX_VC_UPPER_RIGHT
        BEQ     DiagVLUpperRightNotAvailable
       
        VLD1    dAbove0,[pSrcAbove]                      ;// [U7|U6|U5|U4|U3|U2|U1|U0] 
        VEXT    dAbove1,dAbove0,dAbove0,#1               ;// [ X|U7|U6|U5|U4|U3|U2|U1]
        VEXT    dAbove2,dAbove1,dAbove1,#1               ;// [ X| X|U7|U6|U5|U4|U3|U2]
        
        B       DiagVLPredict4x4Store         
       
DiagVLUpperRightNotAvailable
        ;// Load the 4 above pixels into the upper half of D0 so that U3 ends up
        ;// in lane 7 and the VEXTs below can shift in copies of U3.
        VLD1    dAboveU32[1],[pSrcAbove]                 ;// [U3|U2|U1|U0|-|-|-|-] 
        VDUP    dU3, dAbove[7]                           ;// [U3 U3 U3 U3 U3 U3 U3 U3]

        VEXT    dAbove0, dAbove, dU3, #4                 ;// [U3 U3 U3 U3 U3 U2 U1 U0]
        VEXT    dAbove1, dAbove, dU3, #5                 ;// [U3 U3 U3 U3 U3 U3 U2 U1]
        VEXT    dAbove2, dAbove, dU3, #6                 ;// [U3 U3 U3 U3 U3 U3 U3 U2]
       
DiagVLPredict4x4Store  
        
        VRHADD  dTmp0, dAbove1, dAbove0                 ;// (a+b+1)>>1
                                                        ;// Tmp[ X| X| X| 8| 6| 4| 2| 0 ]
        
        VHADD   dTmp3, dAbove0, dAbove2
        VRHADD  dTmp3, dTmp3, dAbove1                   ;// (a+2*b+c+2)>>2
                                                        ;// Tmp[ X| X| X| 9| 7| 5| 3| 1 ]
                                                         
        VEXT    dTmp1,dTmp0,dTmp0,#1                    ;// Tmp[ X| X| X| X| 8| 6| 4| 2 ]
        ADD     pDstTmp, pDst, dstStep
        ADD     dstep, dstStep, dstStep
        VEXT    dTmp2,dTmp3,dTmp1,#1                    ;// Tmp[ X| X| X| X| 9| 7| 5| 3 ]
        
        ;// pDst walks rows 0 and 2, pDstTmp walks rows 1 and 3
        VST1    dTmp0U32[0],[pDst],dstep                ;// Tmp[6],[4],[2],[0]
        VST1    dTmp3U32[0],[pDstTmp],dstep             ;// Tmp[7],[5],[3],[1]
        VST1    dTmp1U32[0],[pDst]                      ;// Tmp[8],[6],[4],[2]
        VST1    dTmp2U32[0],[pDstTmp]                   ;// Tmp[9],[7],[5],[3]
        
        B        ExitPredict4x4                         ;// Branch to exit code
        
OMX_VC_4x4_HU
        ;// Horizontal-Up prediction from the left column L0..L3 only.
        ;// 2-tap averages and 3-tap filter results along L0,L1,L2,L3,L3,L3,...
        ;// are interleaved with VZIP; rows 0..2 are successive 4-byte windows
        ;// (two bytes apart) of the interleaved vector and row 3 is all L3.
        ;// Register diagrams list lane 7 on the left and lane 0 on the right.
        ;// This is the last case, so it falls through into ExitPredict4x4.
        ADD     pSrcTmp, pSrcLeft, leftStep
        ADD     srcStep, leftStep, leftStep

        ;// Load Left Edge                              ;// [L3|L2|L1|L0|X|X|X|X]
        VLD1    {dLeft[4]},[pSrcLeft],srcStep           ;// pSrcLeft[0*leftStep]
        VLD1    {dLeft[5]},[pSrcTmp],srcStep            ;// pSrcLeft[1*leftStep]
        VLD1    {dLeft[6]},[pSrcLeft]                   ;// pSrcLeft[2*leftStep]
        VLD1    {dLeft[7]},[pSrcTmp]                    ;// pSrcLeft[3*leftStep]
        
        VDUP    dL3,dLeft[7]                            ;// [L3|L3|L3|L3|L3|L3|L3|L3]
        
        VEXT    dLeftHU0,dLeft,dL3,#4                   ;// [L3|L3|L3|L3|L3|L2|L1|L0]
        VEXT    dLeftHU1,dLeft,dL3,#5                   ;// [L3|L3|L3|L3|L3|L3|L2|L1]
        VEXT    dLeftHU2,dLeft,dL3,#6                   ;// [L3|L3|L3|L3|L3|L3|L3|L2]
        
        VHADD   dTmp0, dLeftHU0, dLeftHU2
        VRHADD  dTmp0, dTmp0, dLeftHU1                  ;// Tmp[ L3 | L3 | L3 | L3 | L3 | 5 | 3 | 1 ]
                                                        ;// (a+2*b+c+2)>>2
        
        VRHADD  dTmp1, dLeftHU1, dLeftHU0               ;// (a+b+1)>>1 
                                                        ;//  Tmp[ L3 | L3 | L3 | L3 | L3 | 4 | 2 | 0 ]
                                                      
        VZIP    dTmp1,dTmp0                             ;// dTmp1 = Tmp[7| 6| 5| 4| 3| 2| 1| 0]  
                                                        ;// dTmp0 = [L3|L3|L3|L3|L3|L3|L3|L3]
                                                                                                                            
        
        ;// Rotate by two bytes between rows 0, 1 and 2; row 3 comes from dTmp0
        VST1    dTmp1U32[0],[pDst],dstStep              ;// [3|2|1|0] 
        VEXT    dTmp1,dTmp1,dTmp1,#2
        VST1    dTmp1U32[0],[pDst],dstStep              ;// [5|4|3|2] 
        VEXT    dTmp1,dTmp1,dTmp1,#2
        VST1    dTmp1U32[0],[pDst],dstStep              ;// [7|6|5|4]  
        VST1    dTmp0U32[0],[pDst]                      ;// [9|8|7|6] 
        
        
        ;// Common exit for every prediction case: return OMX_Sts_NoErr in r0
ExitPredict4x4
        
        MOV      return,  #OMX_Sts_NoErr
        M_END

        ENDIF ;// CortexA8
        
        END
;//-----------------------------------------------------------------------------------------------
;// omxVCM4P10_PredictIntra_4x4 ends
;//-----------------------------------------------------------------------------------------------