summaryrefslogtreecommitdiffstats
path: root/media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p10/src/omxVCM4P10_PredictIntra_16x16_s.s
blob: 1557208158041988ad756d2f3a892f4c14df68b0 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
;//
;// 
;// File Name:  omxVCM4P10_PredictIntra_16x16_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   9641
;// Date:       Thursday, February 7, 2008
;// 
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;// 
;// 
;//

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h
        
        M_VARIANTS ARM1136JS    
  
;//-------------------------------------------------------
;// This table implements a C-style switch statement in asm using
;// two levels of indexing: predMode selects a table entry, and the
;// entry holds the address of the code for that prediction mode.
;//-------------------------------------------------------

    ;// Jump table read by the dispatch "LDR pc, [pTable, predMode, LSL #2]":
    ;// one word-sized code address per prediction mode, in predMode order.
    M_TABLE armVCM4P10_pIndexTable16x16
    DCD  OMX_VC_16X16_VERT                  ;// entry 0
    DCD  OMX_VC_16X16_HOR                   ;// entry 1
    DCD  OMX_VC_16X16_DC                    ;// entry 2
    DCD  OMX_VC_16X16_PLANE                 ;// entry 3
    
    IF ARM1136JS

;//--------------------------------------------
;// Constants 
;//--------------------------------------------  
;// Block height in rows; used as the row counter start value
BLK_SIZE        EQU 16

;// Multiplying a byte value by this copies it into all four byte lanes
MUL_CONST0      EQU 0x01010101

;// NOTE(review): MUL_CONST1..3 do not appear to be referenced by the code
;// in this file; kept so that any external expectation is not broken.
MUL_CONST1      EQU 0x00060004
MUL_CONST2      EQU 0x00070005
MUL_CONST3      EQU 0x00030001

;// Keeps the low byte of each halfword; in the plane loop the register
;// holding it also carries the row counter in its top four bits
MASK_CONST      EQU 0x00FF00FF

;//--------------------------------------------
;// Scratch variable
;//--------------------------------------------
;// Several names intentionally share one physical register; they are
;// live in different prediction modes or at different times.

;// Status value, loop counters and branch target register
return          RN 0
innerCount      RN 0
outerCount      RN 1
y               RN 12
count           RN 12
pc              RN 15

;// Secondary pointers, doubled strides and the dispatch table pointer
pSrcLeft2       RN 1
pDst2           RN 2
pTable          RN 9
dstStepx2       RN 11
leftStepx2      RN 14

;// Accumulator, temporaries and constant holders
sum             RN 6
temp1           RN 10
temp2           RN 12
cMul1           RN 11
cMul2           RN 12
r0x01010101     RN 10
r0x00FF00FF     RN 11

;// Plane-mode gradients
b               RN 12
c               RN 14

;// Generic value registers (tValN is register rN; there is no tVal13)
tVal0           RN 0
tVal1           RN 1
tVal2           RN 2
tVal3           RN 3
tVal4           RN 4
tVal5           RN 5
tVal6           RN 6
tVal7           RN 7
tVal8           RN 8
tVal9           RN 9
tVal10          RN 10
tVal11          RN 11
tVal12          RN 12
tVal14          RN 14

;// Plane-mode packed pixel pairs: pXpY holds pixel X in the upper
;// halfword and pixel Y in the lower halfword
p2p0            RN 0
p3p1            RN 1
p6p4            RN 2
p7p5            RN 4
p10p8           RN 6
p11p9           RN 7
p14p12          RN 8
p15p13          RN 9

;// Packed groups of four pixels (all alias the same register)
p3210           RN 10
p7654           RN 10
p111098         RN 10
p15141312       RN 10

;//--------------------------------------------
;// Declare input registers
;//--------------------------------------------
pSrcLeft        RN 0    ;// input pointer (register argument)
pSrcAbove       RN 1    ;// input pointer (register argument)
pSrcAboveLeft   RN 2    ;// input pointer (register argument)
pDst            RN 3    ;// output pointer (register argument)
leftStep        RN 4    ;// input variable, loaded from the stack
dstStep         RN 5    ;// input variable, loaded from the stack
predMode        RN 6    ;// input variable, loaded from the stack
availability    RN 7    ;// input variable, loaded from the stack

;//-----------------------------------------------------------------------------------------------
;// omxVCM4P10_PredictIntra_16x16 starts
;//-----------------------------------------------------------------------------------------------
        
        ;// Write function header
        M_START omxVCM4P10_PredictIntra_16x16, r11

        ;// The four trailing arguments are passed on the stack; they are
        ;// declared here in the order in which they are passed.
        M_ARG    LeftStep,     4
        M_ARG    DstStep,      4
        M_ARG    PredMode,     4
        M_ARG    Availability, 4

        ;// Base address of the per-mode jump table
        LDR      pTable, =armVCM4P10_pIndexTable16x16

        ;// Bring the stack arguments into their working registers
        M_LDR    leftStep,     LeftStep
        M_LDR    dstStep,      DstStep
        M_LDR    predMode,     PredMode
        M_LDR    availability, Availability

        ;// Row counter start value; the vertical mode relies on it and the
        ;// other modes overwrite the register before using it.
        MOV      y, #BLK_SIZE

        ;// switch (predMode): load the handler address straight into pc.
        ;// NOTE(review): predMode is not range-checked here, so a value
        ;// above 3 would index past the four-entry table - confirm that
        ;// callers guarantee a valid mode.
        LDR      pc, [pTable, predMode, LSL #2]
        
OMX_VC_16X16_VERT
        ;// Vertical prediction: the 16 bytes at pSrcAbove are copied to each
        ;// of the 16 destination rows. Two rows are written per iteration
        ;// through pDst (even rows) and pDst2 (odd rows).
        ;// On entry y holds BLK_SIZE, set by the dispatch code.
        LDM      pSrcAbove, {tVal6,tVal7,tVal8,tVal9};// tVal6..9 = pSrcAbove[0..15]
        ADD      pDst2, pDst, dstStep                ;// pDst2 = start of row 1
        MOV      dstStepx2, dstStep, LSL #1          ;// each pointer skips two rows

LOOP_VERT
        STM      pDst,  {tVal6,tVal7,tVal8,tVal9}    ;// write an even row
        STM      pDst2, {tVal6,tVal7,tVal8,tVal9}    ;// write the following odd row
        ADD      pDst,  pDst,  dstStepx2             ;// next even row
        ADD      pDst2, pDst2, dstStepx2             ;// next odd row
        SUBS     y, y, #2                            ;// two rows done
        BNE      LOOP_VERT                           ;// 8 iterations in total

        MOV      return, #OMX_Sts_NoErr
        M_EXIT

        
OMX_VC_16X16_HOR
        ;// Horizontal prediction: every destination row r is filled with
        ;// the single byte pSrcLeft[r*leftStep].
        ;//
        ;// Each loop pass handles four rows: it loads four left-column
        ;// bytes, replicates each into a word by multiplying with
        ;// 0x01010101, and stores that word four times per row. pDst
        ;// writes rows 4k and 4k+2, pDst2 writes rows 4k+1 and 4k+3.
        ;//
        ;// All four loads are issued at the top of the pass, so exactly 16
        ;// left-column bytes are read. (Pre-loading the next pass's first
        ;// two bytes at the bottom of the loop would read rows 16 and 17 on
        ;// the final pass, which are outside the 16-row column.)

        LDR      r0x01010101, =MUL_CONST0            ;// byte replication multiplier
        MOV      y, #4                               ;// 4 passes of 4 rows each
        ADD      pDst2, pDst, dstStep                ;// pDst2 = start of row 1
        ADD      dstStepx2, dstStep, dstStep         ;// 2*dstStep ...
        SUB      dstStepx2, dstStepx2, #12           ;// ... minus the 12 bytes already stepped

LOOP_HOR
        M_LDRB   tVal6, [pSrcLeft], +leftStep        ;// left byte for row 4k
        M_LDRB   tVal7, [pSrcLeft], +leftStep        ;// left byte for row 4k+1
        M_LDRB   tVal8, [pSrcLeft], +leftStep        ;// left byte for row 4k+2
        MUL      tVal6, tVal6, r0x01010101           ;// replicate into all four bytes
        M_LDRB   tVal9, [pSrcLeft], +leftStep        ;// left byte for row 4k+3
        MUL      tVal7, tVal7, r0x01010101           ;// replicate into all four bytes
        SUBS     y, y, #1                            ;// flags consumed by BNE below; nothing in between sets them
        STR      tVal6, [pDst],  #+4                 ;// row 4k,   bytes 0..3
        STR      tVal7, [pDst2], #+4                 ;// row 4k+1, bytes 0..3
        STR      tVal6, [pDst],  #+4                 ;// row 4k,   bytes 4..7
        STR      tVal7, [pDst2], #+4                 ;// row 4k+1, bytes 4..7
        MUL      tVal8, tVal8, r0x01010101           ;// replicate into all four bytes
        STR      tVal6, [pDst],  #+4                 ;// row 4k,   bytes 8..11
        STR      tVal7, [pDst2], #+4                 ;// row 4k+1, bytes 8..11
        MUL      tVal9, tVal9, r0x01010101           ;// replicate into all four bytes
        M_STR    tVal6, [pDst],  dstStepx2           ;// row 4k,   bytes 12..15; move to row 4k+2
        M_STR    tVal7, [pDst2], dstStepx2           ;// row 4k+1, bytes 12..15; move to row 4k+3
        STR      tVal8, [pDst],  #+4                 ;// row 4k+2, bytes 0..3
        STR      tVal9, [pDst2], #+4                 ;// row 4k+3, bytes 0..3
        STR      tVal8, [pDst],  #+4                 ;// row 4k+2, bytes 4..7
        STR      tVal9, [pDst2], #+4                 ;// row 4k+3, bytes 4..7
        STR      tVal8, [pDst],  #+4                 ;// row 4k+2, bytes 8..11
        STR      tVal9, [pDst2], #+4                 ;// row 4k+3, bytes 8..11
        M_STR    tVal8, [pDst],  dstStepx2           ;// row 4k+2, bytes 12..15; move to row 4k+4
        M_STR    tVal9, [pDst2], dstStepx2           ;// row 4k+3, bytes 12..15; move to row 4k+5
        BNE      LOOP_HOR                            ;// 4 passes in total

        MOV      return, #OMX_Sts_NoErr
        M_EXIT
        
OMX_VC_16X16_DC
        ;// DC prediction: the whole block is filled with one value.
        ;//   upper and left available : (sumAbove + sumLeft + 16) >> 5
        ;//   exactly one available    : (thatSum + 8) >> 4
        ;//   neither available        : 128
        ;// count records how many of the two neighbours were summed.

        MOV      count, #0                           ;// nothing summed yet
        TST      availability, #OMX_VC_UPPER
        BEQ      TST_LEFT                            ;// no row above: skip its sum

        ;// ---- sum of the 16 bytes above ----
        ;// Each word's bytes are zero-extended into two halfword lanes and
        ;// accumulated in tVal2; the two lanes are folded together at the end.
        LDM      pSrcAbove, {tVal8,tVal9,tVal10,tVal11};// tVal8..11 = pSrcAbove[0..15]
        MOV      count, #1                           ;// upper neighbour counted
        UXTB16   tVal2, tVal8                        ;// lanes  = bytes 2, 0
        UXTAB16  tVal2, tVal2, tVal8,  ROR #8        ;// lanes += bytes 3, 1
        UXTAB16  tVal2, tVal2, tVal9                 ;// lanes += bytes 6, 4
        UXTAB16  tVal2, tVal2, tVal9,  ROR #8        ;// lanes += bytes 7, 5
        UXTAB16  tVal2, tVal2, tVal10                ;// lanes += bytes 10, 8
        UXTAB16  tVal2, tVal2, tVal10, ROR #8        ;// lanes += bytes 11, 9
        UXTAB16  tVal2, tVal2, tVal11                ;// lanes += bytes 14, 12
        UXTAB16  tVal2, tVal2, tVal11, ROR #8        ;// lanes += bytes 15, 13
        ADD      tVal2, tVal2, tVal2, LSR #16        ;// low half = total; high half is left over
        UXTH     sum, tVal2                          ;// sum = sum of pSrcAbove[0..15]

TST_LEFT
        TST      availability, #OMX_VC_LEFT
        BEQ      TST_COUNT                           ;// no left column: skip its sum

        ;// ---- sum of the 16 left-column bytes ----
        ;// sum is restarted here; when the row above was also summed its
        ;// total is still held in the low half of tVal2 and is added back
        ;// under TST_COUNT.
        ADD      count, count, #1                    ;// left neighbour counted
        MOV      sum, #0
        MOV      outerCount, #4                      ;// 4 passes of 4 rows

LOOP_DC_LEFT
        M_LDRB   tVal8,  [pSrcLeft], +leftStep       ;// row 4k
        M_LDRB   tVal9,  [pSrcLeft], +leftStep       ;// row 4k+1
        M_LDRB   tVal10, [pSrcLeft], +leftStep       ;// row 4k+2
        M_LDRB   tVal11, [pSrcLeft], +leftStep       ;// row 4k+3
        ADD      tVal8,  tVal8,  tVal9
        ADD      tVal10, tVal10, tVal11
        ADD      sum, sum, tVal8
        ADD      sum, sum, tVal10                    ;// sum += the four bytes just read
        SUBS     outerCount, outerCount, #1
        BNE      LOOP_DC_LEFT

TST_COUNT
        CMP      count, #0
        MOVEQ    sum, #128                           ;// no neighbours: fixed mid-grey value
        BEQ      TST_COUNT0

        CMP      count, #1                           ;// EQ: one neighbour, NE: both
        ADDEQ    sum, sum, #8                        ;// one neighbour : rounding term for >> 4
        ADDNE    sum, sum, tVal2                     ;// both : add the sum of the row above
        ADDNE    sum, sum, #16                       ;// both : rounding term for >> 5
        UXTH     sum, sum                            ;// drop the leftover high half of tVal2 (flags untouched)
        MOVEQ    sum, sum, LSR #4                    ;// one neighbour : divide by 16
        MOVNE    sum, sum, LSR #5                    ;// both : divide by 32

TST_COUNT0
        ;// Replicate the DC byte across a word, then across four registers
        ORR      sum, sum, sum, LSL #8               ;// byte -> halfword
        ORR      tVal6, sum, sum, LSL #16            ;// halfword -> word
        MOV      tVal7, tVal6
        MOV      tVal8, tVal6
        MOV      tVal9, tVal6

        ADD      pDst2, pDst, dstStep                ;// pDst2 = start of row 1
        MOV      dstStepx2, dstStep, LSL #1          ;// each pointer skips two rows
        MOV      y, #BLK_SIZE                        ;// y shares a register with count: reload it

LOOP_DC
        STM      pDst,  {tVal6,tVal7,tVal8,tVal9}    ;// write an even row
        STM      pDst2, {tVal6,tVal7,tVal8,tVal9}    ;// write the following odd row
        ADD      pDst,  pDst,  dstStepx2             ;// next even row
        ADD      pDst2, pDst2, dstStepx2             ;// next odd row
        SUBS     y, y, #2                            ;// two rows done
        BNE      LOOP_DC                             ;// 8 iterations in total

        MOV      return, #OMX_Sts_NoErr
        M_EXIT

OMX_VC_16X16_PLANE
        ;// Plane prediction. With A[x] = pSrcAbove[x], L[y] = pSrcLeft[y*leftStep]
        ;// and AL = pSrcAboveLeft[0], the code computes
        ;//   a = 16 * (A[15] + L[15])
        ;//   H = 8*(A[15]-AL) + 7*(A[14]-A[0]) + 6*(A[13]-A[1]) + ... + 1*(A[8]-A[6])
        ;//   V = 8*(L[15]-AL) + 7*(L[14]-L[0]) + 6*(L[13]-L[1]) + ... + 1*(L[8]-L[6])
        ;//   b = (5*H + 32) >> 6,   c = (5*V + 32) >> 6
        ;//   d = a + 16 - 7*b - 7*c
        ;// and writes pixel (x,y) = clamp((d + b*x + c*y) >> 5) to [0,255].
        ;//
        ;// The 16 values of one row are kept as signed halfword pairs in
        ;// eight registers (pXpY = {pixel X, pixel Y}); after each row is
        ;// stored, {c,c} is added to every pair to advance to the next row.
        ;//
        ;// d may be negative, so the first pair {p2,p0} is packed with PKHBT,
        ;// which takes only the low halfword of d. A plain ORR would let
        ;// d's sign-extension bits overwrite the p2 halfword.

        RSB      tVal14, leftStep, leftStep, LSL #4  ;// tVal14 = 15*leftStep

        LDRB     tVal10, [pSrcLeft,  tVal14]         ;// tVal10 = L[15]
        LDRB     tVal11, [pSrcAboveLeft]             ;// tVal11 = AL
        LDRB     tVal12, [pSrcAbove, #15]            ;// tVal12 = A[15]

        ADD      tVal2,  tVal12,  tVal10             ;// tVal2  = A[15] + L[15]
        SUB      tVal10, tVal10,  tVal11             ;// tVal10 = L[15] - AL (scaled by 8 later)
        SUB      tVal11, tVal12,  tVal11             ;// tVal11 = A[15] - AL
        MOV      tVal2,  tVal2,   LSL #4             ;// tVal2  = a

        ;// ---- horizontal gradient H, accumulated in tVal11 ----
        MOV      tVal11, tVal11,  LSL #3             ;// H  = 8*(A[15] - AL)
        LDRB     tVal6,  [pSrcAbove, #0]
        LDRB     tVal7,  [pSrcAbove, #14]
        SUB      tVal8,  tVal7,   tVal6              ;// A[14] - A[0]
        RSB      tVal8,  tVal8,   tVal8,  LSL #3     ;// times 7
        ADD      tVal11, tVal11,  tVal8              ;// H += 7*(A[14] - A[0])
        LDRB     tVal6,  [pSrcAbove, #1]
        LDRB     tVal7,  [pSrcAbove, #13]
        SUB      tVal8,  tVal7,   tVal6              ;// A[13] - A[1]
        ADD      tVal8,  tVal8,   tVal8              ;// times 2
        ADD      tVal8,  tVal8,   tVal8,  LSL #1     ;// times 3 -> times 6
        ADD      tVal11, tVal11,  tVal8              ;// H += 6*(A[13] - A[1])
        LDRB     tVal6,  [pSrcAbove, #2]
        LDRB     tVal7,  [pSrcAbove, #12]
        SUB      tVal8,  tVal7,   tVal6              ;// A[12] - A[2]
        ADD      tVal8,  tVal8,   tVal8,  LSL #2     ;// times 5
        ADD      tVal11, tVal11,  tVal8              ;// H += 5*(A[12] - A[2])
        LDRB     tVal6,  [pSrcAbove, #3]
        LDRB     tVal7,  [pSrcAbove, #11]
        SUB      tVal8,  tVal7,   tVal6              ;// A[11] - A[3]
        ADD      tVal11, tVal11,  tVal8,  LSL #2     ;// H += 4*(A[11] - A[3])
        LDRB     tVal6,  [pSrcAbove, #4]
        LDRB     tVal7,  [pSrcAbove, #10]
        SUB      tVal8,  tVal7,   tVal6              ;// A[10] - A[4]
        ADD      tVal8,  tVal8,   tVal8,  LSL #1     ;// times 3
        ADD      tVal11, tVal11,  tVal8              ;// H += 3*(A[10] - A[4])
        LDRB     tVal6,  [pSrcAbove, #5]
        LDRB     tVal7,  [pSrcAbove, #9]
        SUB      tVal8,  tVal7,   tVal6              ;// A[9] - A[5]
        ADD      tVal11, tVal11,  tVal8,  LSL #1     ;// H += 2*(A[9] - A[5])
        LDRB     tVal6,  [pSrcAbove, #6]
        LDRB     tVal7,  [pSrcAbove, #8]
        SUB      tVal8,  tVal7,   tVal6              ;// A[8] - A[6]
        ADD      tVal7,  tVal11,  tVal8              ;// tVal7  = H

        ;// ---- vertical gradient V, accumulated in tVal6; b finished alongside ----
        ;// tVal9 walks the left column upwards from row 14 while tVal1 walks
        ;// it downwards from row 0 (pSrcAbove is no longer needed).
        ADD      tVal2,  tVal2,   #16                ;// tVal2  = a + 16
        MOV      tVal1,  pSrcLeft                    ;// tVal1  = &L[0]
        SUB      tVal9,  tVal14,  leftStep           ;// tVal9  = 14*leftStep
        ADD      tVal9,  pSrcLeft, tVal9             ;// tVal9  = &L[14]

        M_LDRB   tVal8,  [tVal9], -leftStep          ;// tVal8  = L[14]
        M_LDRB   tVal11, [tVal1], +leftStep          ;// tVal11 = L[0]
        ADD      tVal7,  tVal7,   tVal7,  LSL #2     ;// tVal7  = 5*H
        ADD      tVal7,  tVal7,   #32                ;// tVal7  = 5*H + 32
        SUB      tVal8,  tVal8,   tVal11             ;// L[14] - L[0]
        ASR      tVal12, tVal7,   #6                 ;// tVal12 = b

        RSB      tVal8,  tVal8,   tVal8,  LSL #3     ;// 7*(L[14] - L[0])
        ADD      tVal6,  tVal8,   tVal10, LSL #3     ;// V  = 8*(L[15] - AL) + 7*(L[14] - L[0])
        M_LDRB   tVal8,  [tVal9], -leftStep          ;// tVal8  = L[13]
        M_LDRB   tVal10, [tVal1], +leftStep          ;// tVal10 = L[1]
        RSB      tVal7,  tVal12,  tVal12, LSL #3     ;// tVal7  = 7*b
        SUB      tVal2,  tVal2,   tVal7              ;// tVal2  = a + 16 - 7*b
        SUB      tVal7,  tVal8,   tVal10             ;// L[13] - L[1]
        M_LDRB   tVal8,  [tVal9], -leftStep          ;// tVal8  = L[12]
        ADD      tVal7,  tVal7,   tVal7              ;// times 2
        M_LDRB   tVal10, [tVal1], +leftStep          ;// tVal10 = L[2]
        ADD      tVal7,  tVal7,   tVal7,  LSL #1     ;// times 3 -> times 6
        ADD      tVal6,  tVal6,   tVal7              ;// V += 6*(L[13] - L[1])
        SUB      tVal7,  tVal8,   tVal10             ;// L[12] - L[2]
        M_LDRB   tVal8,  [tVal9], -leftStep          ;// tVal8  = L[11]
        M_LDRB   tVal10, [tVal1], +leftStep          ;// tVal10 = L[3]
        ADD      tVal7,  tVal7,   tVal7,  LSL #2     ;// times 5
        ADD      tVal6,  tVal6,   tVal7              ;// V += 5*(L[12] - L[2])
        SUB      tVal7,  tVal8,   tVal10             ;// L[11] - L[3]
        M_LDRB   tVal8,  [tVal9], -leftStep          ;// tVal8  = L[10]
        M_LDRB   tVal10, [tVal1], +leftStep          ;// tVal10 = L[4]
        ADD      tVal6,  tVal6,   tVal7,  LSL #2     ;// V += 4*(L[11] - L[3])
        SUB      dstStep, dstStep, #16               ;// row stores advance pDst by 16 already
        SUB      tVal7,  tVal8,   tVal10             ;// L[10] - L[4]
        M_LDRB   tVal8,  [tVal9], -leftStep          ;// tVal8  = L[9]
        M_LDRB   tVal10, [tVal1], +leftStep          ;// tVal10 = L[5]
        ADD      tVal7,  tVal7,   tVal7,  LSL #1     ;// times 3
        ADD      tVal6,  tVal6,   tVal7              ;// V += 3*(L[10] - L[4])
        SUB      tVal7,  tVal8,   tVal10             ;// L[9] - L[5]
        M_LDRB   tVal8,  [tVal9], -leftStep          ;// tVal8  = L[8]
        M_LDRB   tVal10, [tVal1], +leftStep          ;// tVal10 = L[6]
        ADD      tVal6,  tVal6,   tVal7,  LSL #1     ;// V += 2*(L[9] - L[5])

        SUB      tVal7,  tVal8,   tVal10             ;// L[8] - L[6]
        ADD      tVal6,  tVal6,   tVal7              ;// tVal6  = V

        ADD      tVal6,  tVal6,   tVal6,  LSL #2     ;// tVal6  = 5*V
        ADD      tVal6,  tVal6,   #32                ;// tVal6  = 5*V + 32

        ASR      tVal14, tVal6,   #6                 ;// tVal14 = c

        ;// ---- build the first row as packed signed halfword pairs ----
        RSB      tVal6,  tVal14,  tVal14, LSL #3     ;// tVal6  = 7*c
        UXTH     tVal14, tVal14                      ;// keep c as a 16-bit pattern
        ADD      tVal10, tVal12,  tVal12             ;// tVal10 = 2*b
        ORR      tVal14, tVal14,  tVal14, LSL #16    ;// tVal14 = {c, c}
        SUB      tVal6,  tVal2,   tVal6              ;// tVal6  = d = a + 16 - 7*b - 7*c (may be negative)
        ADD      tVal1,  tVal6,   tVal10             ;// tVal1  = d + 2*b
        ADD      tVal10, tVal10,  tVal12             ;// tVal10 = 3*b
        PKHBT    tVal0,  tVal6,   tVal1,  LSL #16    ;// p2p0 = {d + 2*b, d}; only the low 16 bits of d are used
        UXTH     tVal12, tVal12                      ;// keep b as a 16-bit pattern
        UXTH     tVal10, tVal10                      ;// keep 3*b as a 16-bit pattern
        ORR      tVal12, tVal12,  tVal12, LSL #16    ;// tVal12 = {b, b}
        ORR      tVal10, tVal10,  tVal10, LSL #16    ;// tVal10 = {3b, 3b}
        SADD16   tVal1,  tVal0,   tVal12             ;// p3p1   = p2p0   + {b, b}
        SADD16   tVal2,  tVal1,   tVal10             ;// p6p4   = p3p1   + {3b, 3b}
        SADD16   tVal4,  tVal2,   tVal12             ;// p7p5   = p6p4   + {b, b}
        SADD16   tVal6,  tVal4,   tVal10             ;// p10p8  = p7p5   + {3b, 3b}
        SADD16   tVal7,  tVal6,   tVal12             ;// p11p9  = p10p8  + {b, b}
        SADD16   tVal8,  tVal7,   tVal10             ;// p14p12 = p11p9  + {3b, 3b}
        SADD16   tVal9,  tVal8,   tVal12             ;// p15p13 = p14p12 + {b, b}

        ;// The mask register doubles as the row counter: its top four bits
        ;// start at zero and the ADDS below carries out after 16 rows. Those
        ;// bits never reach the output because the saturated, shifted
        ;// values have zeros in the top byte.
        LDR      r0x00FF00FF,     =MASK_CONST        ;// r0x00FF00FF = 0x00FF00FF

LOOP_PLANE

        ;// Pixels 0..3: clamp each halfword to 0..8191, shift right by 5 to
        ;// get 0..255, keep one byte per halfword, then interleave the
        ;// even-pixel and odd-pixel registers into four consecutive bytes.
        USAT16   temp2, #13, p3p1
        USAT16   temp1, #13, p2p0
        SADD16   p3p1,   p3p1,   c                   ;// advance to the next row
        SADD16   p2p0,   p2p0,   c
        AND      temp2, r0x00FF00FF, temp2, ASR #5
        AND      temp1, r0x00FF00FF, temp1, ASR #5
        ORR      temp1, temp1, temp2, LSL #8
        STR      temp1, [pDst], #4

        ;// Pixels 4..7
        USAT16   temp2, #13, p7p5
        USAT16   temp1, #13, p6p4
        SADD16   p7p5,   p7p5,   c
        SADD16   p6p4,   p6p4,   c
        AND      temp2, r0x00FF00FF, temp2, ASR #5
        AND      temp1, r0x00FF00FF, temp1, ASR #5
        ORR      temp1, temp1, temp2, LSL #8
        STR      temp1, [pDst], #4

        ;// Pixels 8..11
        USAT16   temp2, #13, p11p9
        USAT16   temp1, #13, p10p8
        SADD16   p11p9,  p11p9,  c
        SADD16   p10p8,  p10p8,  c
        AND      temp2, r0x00FF00FF, temp2, ASR #5
        AND      temp1, r0x00FF00FF, temp1, ASR #5
        ORR      temp1, temp1, temp2, LSL #8
        STR      temp1, [pDst], #4

        ;// Pixels 12..15
        USAT16   temp2, #13, p15p13
        USAT16   temp1, #13, p14p12
        SADD16   p15p13, p15p13, c
        SADD16   p14p12, p14p12, c
        AND      temp2, r0x00FF00FF, temp2, ASR #5
        AND      temp1, r0x00FF00FF, temp1, ASR #5
        ORR      temp1, temp1, temp2, LSL #8
        STR      temp1, [pDst], #4

        ADDS     r0x00FF00FF, r0x00FF00FF, #1<<28    ;// row counter in the top 4 bits; carry after 16 rows

        ADD      pDst, pDst, dstStep                 ;// dstStep was reduced by 16 above

        BCC      LOOP_PLANE                          ;// 16 rows in total
        MOV      return, #OMX_Sts_NoErr
        M_END
        
        ENDIF ;// ARM1136JS

            
        END
;-----------------------------------------------------------------------------------------------
; omxVCM4P10_PredictIntra_16x16 ends
;-----------------------------------------------------------------------------------------------