summaryrefslogtreecommitdiffstats
path: root/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_PredictIntra_16x16_s.s
blob: e9c0eee49b62fe4797c303bb37df568cc57b9b9a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
;//
;// 
;// File Name:  omxVCM4P10_PredictIntra_16x16_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;// 
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;// 
;// 
;//

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h
        
        M_VARIANTS CortexA8
     
  
;//-------------------------------------------------------
;// This table implements a C-style switch statement in
;// assembly by the method of two levels of indexing:
;// predMode selects a word in the table, and that word is
;// the address of the code label handling that mode.
;// Entry order (index 0..3): VERT, HOR, DC, PLANE.
;//-------------------------------------------------------

    M_TABLE armVCM4P10_pIndexTable16x16
    DCD  OMX_VC_16X16_VERT, OMX_VC_16X16_HOR 
    DCD  OMX_VC_16X16_DC,   OMX_VC_16X16_PLANE
    

    IF CortexA8

    ;// 16-bit multipliers for the plane-mode path, which reads the three rows
    ;// below as three consecutive 8-halfword vectors:
    ;//   row 0: weights applied to the above/left pixel differences (H and V sums)
    ;//   row 1: column offsets 0..7  (multiplied by b)
    ;//   row 2: column offsets 8..15 (multiplied by b)
    ;// NOTE(review): the meaning of the second M_TABLE argument is defined by the
    ;// macro in armCOMM_s.h - presumably an alignment; confirm there.
    M_TABLE armVCM4P10_MultiplierTable16x16,1
    DCW   7,  6,  5,  4,  3,  2,  1,  8 
    DCW   0,  1,  2,  3,  4,  5,  6,  7
    DCW   8,  9, 10, 11, 12, 13, 14, 15
        
;//--------------------------------------------
;// Constants 
;//
;// BLK_SIZE is the initial value of the row counter 'y'.
;// MUL_CONST0..3 and MASK_CONST are not referenced by any
;// instruction in this file.
;//--------------------------------------------  
BLK_SIZE        EQU 0x10
MUL_CONST0      EQU 0x01010101
MUL_CONST1      EQU 0x00060004
MUL_CONST2      EQU 0x00070005
MUL_CONST3      EQU 0x00030001
MASK_CONST      EQU 0x00FF00FF

;//--------------------------------------------
;// Scratch variable
;//
;// Several names share one core register:
;//   r0  : 'return' and the pSrcLeft input pointer
;//   r9  : 'pTable' (mode dispatch) and 'pMultTable' (plane mode)
;//   r11 : 'count' (DC mode) and 'pTmp2' (horizontal mode)
;//--------------------------------------------
y               RN 12   ;// remaining-rows counter, initialised to BLK_SIZE
pc              RN 15   ;// program counter, loaded directly to dispatch on predMode

return          RN 0    ;// status value, written just before each exit
pTable          RN 9    ;// address of armVCM4P10_pIndexTable16x16
count           RN 11   ;// DC mode: number of available neighbours (0, 1 or 2)
pMultTable      RN 9    ;// plane mode: address of armVCM4P10_MultiplierTable16x16
; ----------------------------------------------
; Neon registers
;
; Many of the names below are differently typed aliases of
; the same physical register (for example qAbove, qSum8,
; qSum0 and qTmp are all Q0). A value held under one name is
; therefore lost as soon as any alias of that register is
; written. Qn is the register pair D(2n):D(2n+1).
; ----------------------------------------------
;// Neighbour pixels (Q0 = above row, Q1 = left column) and the typed views
;// of Q0/D0/D1 used by the DC-mode pairwise reduction.
qAbove          QN Q0.U8
qLeft           QN Q1.U8
qSum8           QN Q0.U16
dSum80          DN D0.U16
dSum81          DN D1.U16
dSum4           DN D0.U16
dSum2           DN D0.U32
dSum1           DN D0.U64
;// Output row (Q3). Its halves D6/D7 also hold the DC-mode left/above sums
;// until qOut is written; D8 holds the final DC value.
qOut            QN Q3.U8
dSumLeft        DN D6.U64
dSumAbove       DN D7.U64
dSum            DN D8.U64
dSum0           DN D8.U8[0]

;// Plane mode: 32-bit accumulators for the weighted sums H (Q11) and V (Q12).
;// Q11 is later reused for 'a', Q6 for 'b' and Q7 for 'c' (16-bit lanes).
qH              QN Q11.S32
qV              QN Q12.S32
qA              QN Q11.S16
qB              QN Q6.S16
qC              QN Q7.S16

;// b * column-offset vectors for columns 0..7 (Q5) and 8..15 (Q6, same as qB).
qB0             QN Q5.S16
qB1             QN Q6.S16
dA1             DN D23.S16

dH0             DN D22.S32
dH1             DN D23.S32
dV0             DN D24.S32
dV1             DN D25.S32

;// Views of Q11/Q12 used while reducing H and V and scaling them to b and c.
qHV             QN Q11.S64
qHV0            QN Q11.S32
qHV1            QN Q12.S64

dHV00           DN D22.S32
dHV01           DN D23.S32

dHV0            DN D22.S16[0]
dHV1            DN D23.S16[0]
dHV10           DN D24.S64
dHV11           DN D25.S64

;// Per-row running sums for columns 0..7 (Q0) and 8..15 (Q1); these reuse
;// the registers that held the above row and left column.
qSum0           QN Q0.S16
qSum1           QN Q1.S16

dOut0           DN D6.U8
dOut1           DN D7.U8

dLeft0          DN D2.U8
dLeft1          DN D3.U8
qConst          QN Q13.S16

dAbove0         DN D0.U8
dAbove1         DN D1.U8

;// Byte-reversed upper halves of the left column (D12) and above row (D5),
;// and the widened 16-bit differences computed from them.
dRevLeft64      DN D12.U64
dRevLeft        DN D12.U8
dRevAbove64     DN D5.U64
dRevAbove       DN D5.U8
qLeftDiff       QN Q8.S16
dLeftDiff1      DN D17.S16
dLeftDiff64     DN D17.S64
qDiffLeft       QN Q8.S16
qDiffAbove      QN Q4.S16
dAboveDiff1     DN D9.S16
dAboveDiff64    DN D9.S64
qAboveDiff      QN Q4.S16

dAboveLeft      DN D4.U8

dDiffLeft0      DN D16.S16
dDiffLeft1      DN D17.S16
dDiffAbove0     DN D8.S16
dDiffAbove1     DN D9.S16

;// Only lane 0 of these is meaningful: (pixel 15) - (above-left pixel).
qLeft15minus0   QN Q7.S16
dLeft15minus0   DN D14.S16
qAbove15minus0  QN Q3.S16
dAbove15minus0  DN D6.S16

;// Vectors loaded from armVCM4P10_MultiplierTable16x16. Q10 is loaded twice
;// (difference weights, then column offsets 0..7); Q12 gets offsets 8..15.
qMultiplier     QN Q10.S16
qMultiplier0    QN Q10.S16
qMultiplier1    QN Q12.S16
dMultiplier0    DN D20.S16
dMultiplier1    DN D21.S16

;// 7*(b+c); D1 is also dAbove1, so it is only written after the last read
;// of the above row.
dBPlusCMult7    DN D1.S64
dBPlusCMult7S16 DN D1.S16

;// Second replicated left pixel in horizontal mode (Q0).
qTmp            QN Q0.U8

;//--------------------------------------------
;// Declare input registers
;//
;// pSrcLeft, pSrcAbove, pSrcAboveLeft and pDst are used
;// directly on entry. leftStep, dstStep, predMode and
;// availability are fetched from the stack with M_LDR at
;// the top of the function.
;//--------------------------------------------
pSrcLeft        RN 0    ;// input pointer
pSrcAbove       RN 1    ;// input pointer
pSrcAboveLeft   RN 2    ;// input pointer
pDst            RN 3    ;// output pointer
leftStep        RN 4    ;// input variable
dstStep         RN 5    ;// input variable
predMode        RN 6    ;// input variable
availability    RN 7    ;// input variable

pTmp            RN 8    ;// second row pointer (source or destination, depending on mode)
step            RN 10   ;// twice leftStep or twice dstStep, for the two-pointer row walk
pTmp2           RN 11   ;// horizontal mode: destination pointer for odd rows

;//-----------------------------------------------------------------------------------------------
;// omxVCM4P10_PredictIntra_16x16 starts
;//
;// Fills a block of 16 rows x 16 bytes at pDst (row pitch dstStep) with an
;// intra prediction built from the neighbouring pixels.
;//
;// In:   pSrcLeft      - first pixel of the left column (pitch leftStep)
;//       pSrcAbove     - 16 pixels of the row above
;//       pSrcAboveLeft - the above-left pixel (read in plane mode only)
;//       pDst          - destination block
;//       leftStep, dstStep, predMode, availability - stack arguments
;// Out:  r0 = OMX_Sts_NoErr on every path
;//
;// predMode indexes armVCM4P10_pIndexTable16x16 (0 = VERT, 1 = HOR, 2 = DC,
;// 3 = PLANE). 'availability' is only consulted in DC mode; the other modes
;// read their neighbours unconditionally.
;//
;// NOTE(review): predMode is used as a jump-table index without a range
;// check, and no pointer is validated - callers must pass a valid mode.
;// NOTE(review): the register save/restore performed by M_START/M_EXIT/M_END
;// is defined in armCOMM_s.h - presumably r4-r11 and d8-d15; confirm there.
;//-----------------------------------------------------------------------------------------------
        
        ;// Write function header
        M_START omxVCM4P10_PredictIntra_16x16, r11, d15
        
        ;// Define stack arguments
        M_ARG    LeftStep,     4
        M_ARG    DstStep,      4
        M_ARG    PredMode,     4
        M_ARG    Availability, 4
        
        ;// M_STALL ARM1136JS=4
        
        LDR      pTable,=armVCM4P10_pIndexTable16x16 ;// Load index table for switch case
        
        ;// Load argument from the stack
        M_LDR    predMode, PredMode                  ;// Arg predMode loaded from stack to reg 
        M_LDR    leftStep, LeftStep                  ;// Arg leftStep loaded from stack to reg 
        M_LDR    dstStep,  DstStep                   ;// Arg dstStep loaded from stack to reg         
        M_LDR    availability, Availability          ;// Arg availability loaded from stack to reg
        
        MOV      y, #BLK_SIZE                        ;// Outer Loop Count
        LDR      pc, [pTable, predMode, LSL #2]      ;// Branch to the case based on predMode
        
        ;//--------------------------------------------------------------
        ;// Vertical: every row is a copy of the 16 bytes at pSrcAbove.
        ;// pDst writes the even rows and pTmp the odd rows, each
        ;// advancing by 2*dstStep; the last pair is stored without
        ;// post-increment.
        ;//--------------------------------------------------------------
OMX_VC_16X16_VERT
        VLD1    qAbove,  [pSrcAbove]
        ADD     pTmp, pDst, dstStep
        ADD     step, dstStep, dstStep
        VST1    qAbove, [pDst], step
        VST1    qAbove, [pTmp], step
        VST1    qAbove, [pDst], step
        VST1    qAbove, [pTmp], step
        VST1    qAbove, [pDst], step
        VST1    qAbove, [pTmp], step
        VST1    qAbove, [pDst], step
        VST1    qAbove, [pTmp], step
        VST1    qAbove, [pDst], step
        VST1    qAbove, [pTmp], step
        VST1    qAbove, [pDst], step
        VST1    qAbove, [pTmp], step
        VST1    qAbove, [pDst], step
        VST1    qAbove, [pTmp], step
        VST1    qAbove, [pDst]
        VST1    qAbove, [pTmp]
        MOV     return, #OMX_Sts_NoErr               ;// returnNoError
        M_EXIT
        
        ;//--------------------------------------------------------------
        ;// Horizontal: each row is its left-neighbour pixel replicated
        ;// to all 16 bytes. leftStep and dstStep are doubled in place so
        ;// that pSrcLeft/pDst walk the even rows and pTmp/pTmp2 the odd
        ;// rows. Each loop iteration produces 8 rows.
        ;//--------------------------------------------------------------
OMX_VC_16X16_HOR
        ADD     pTmp, pSrcLeft, leftStep
        ADD     leftStep, leftStep, leftStep
        ADD     pTmp2, pDst, dstStep
        ADD     dstStep, dstStep, dstStep
LoopHor 
        VLD1     {qLeft[]}, [pSrcLeft], leftStep       
        VLD1     {qTmp[]}, [pTmp], leftStep       
        SUBS     y, y, #8                                 ;// flags are consumed by the BNE below
        VST1     qLeft, [pDst], dstStep
        VST1     qTmp, [pTmp2], dstStep
        VLD1     {qLeft[]}, [pSrcLeft], leftStep       
        VLD1     {qTmp[]}, [pTmp], leftStep       
        VST1     qLeft, [pDst], dstStep
        VST1     qTmp, [pTmp2], dstStep
        VLD1     {qLeft[]}, [pSrcLeft], leftStep       
        VLD1     {qTmp[]}, [pTmp], leftStep       
        VST1     qLeft, [pDst], dstStep
        VST1     qTmp, [pTmp2], dstStep
        VLD1     {qLeft[]}, [pSrcLeft], leftStep       
        VLD1     {qTmp[]}, [pTmp], leftStep       
        VST1     qLeft, [pDst], dstStep
        VST1     qTmp, [pTmp2], dstStep
        
        BNE      LoopHor                                  ;// 2 iterations of 8 rows = 16 rows
        MOV      return, #OMX_Sts_NoErr
        M_EXIT
        
        ;//--------------------------------------------------------------
        ;// DC: every output byte is the rounded mean of the available
        ;// neighbours:
        ;//   left and upper : (sumLeft + sumAbove + 16) >> 5
        ;//   only one       : (sum + 8) >> 4
        ;//   none           : 128
        ;// 'count' records how many of the two neighbours were summed.
        ;//--------------------------------------------------------------
OMX_VC_16X16_DC
        MOV      count, #0                                 ;// count = 0
        TST      availability, #OMX_VC_LEFT
        BEQ      UpperOrNoneAvailable                      ;// Jump to Upper if not left

        ADD     pTmp, pSrcLeft, leftStep
        ADD     step, leftStep, leftStep

        ;// Gather the 16 left pixels, one byte lane per row
        VLD1    {qLeft[0]}, [pSrcLeft],step    
        VLD1    {qLeft[1]}, [pTmp],step   
        VLD1    {qLeft[2]}, [pSrcLeft],step   
        VLD1    {qLeft[3]}, [pTmp],step
        VLD1    {qLeft[4]}, [pSrcLeft],step   
        VLD1    {qLeft[5]}, [pTmp],step   
        VLD1    {qLeft[6]}, [pSrcLeft],step    
        VLD1    {qLeft[7]}, [pTmp],step
        VLD1    {qLeft[8]}, [pSrcLeft],step    
        VLD1    {qLeft[9]}, [pTmp],step   
        VLD1    {qLeft[10]},[pSrcLeft],step   
        VLD1    {qLeft[11]},[pTmp],step    
        VLD1    {qLeft[12]},[pSrcLeft],step   
        VLD1    {qLeft[13]},[pTmp],step   
        VLD1    {qLeft[14]},[pSrcLeft],step    
        VLD1    {qLeft[15]},[pTmp] 
        
        ;// Pairwise reduction 16 x u8 -> 8 x u16 -> 4 x u16 -> 2 x u32 -> 1 x u64
        VPADDL   qSum8, qLeft
        ADD     count, count, #1                           ;// left neighbour counted
        VPADD    dSum4, dSum80, dSum81
        VPADDL   dSum2, dSum4
        VPADDL   dSumLeft, dSum2
        VRSHR    dSum, dSumLeft, #4                        ;// (sumLeft + 8) >> 4
        
UpperOrNoneAvailable
        TST      availability,  #OMX_VC_UPPER              ;// if(availability & #OMX_VC_UPPER)
        BEQ      BothOrNoneAvailable                       ;// Skip the upper sum if not upper
        VLD1     qAbove, [pSrcAbove]
        ADD      count, count, #1                          ;// if upper inc count by 1
        VPADDL   qSum8, qAbove
        VPADD    dSum4, dSum80, dSum81
        VPADDL   dSum2, dSum4
        VPADDL   dSumAbove, dSum2
        VRSHR    dSum, dSumAbove, #4                       ;// (sumAbove + 8) >> 4
        
BothOrNoneAvailable
        CMP      count, #2                                  ;// check if both available
        BNE      NoneAvailable
        VADD     dSum, dSumAbove, dSumLeft
        VRSHR    dSum, dSum, #5                             ;// (sumLeft + sumAbove + 16) >> 5
        

        ;// Reached on every DC path. The VDUP must come first because qOut
        ;// (Q3) overlaps dSumLeft/dSumAbove; when count == 0 its result is
        ;// replaced by the constant 128 below.
NoneAvailable
        VDUP     qOut, dSum0        
        CMP      count, #0                                  ;// check if none available
        ADD      pTmp, pDst, dstStep
        ADD      step, dstStep, dstStep
        BNE      LoopDC
        VMOV     qOut, #128
        ;// Not a loop despite the label: 16 straight-line stores, even rows
        ;// through pDst and odd rows through pTmp.
LoopDC        
        VST1    qOut, [pDst], step
        VST1    qOut, [pTmp], step
        VST1    qOut, [pDst], step
        VST1    qOut, [pTmp], step
        VST1    qOut, [pDst], step
        VST1    qOut, [pTmp], step
        VST1    qOut, [pDst], step
        VST1    qOut, [pTmp], step
        VST1    qOut, [pDst], step
        VST1    qOut, [pTmp], step
        VST1    qOut, [pDst], step
        VST1    qOut, [pTmp], step
        VST1    qOut, [pDst], step
        VST1    qOut, [pTmp], step
        VST1    qOut, [pDst], step
        VST1    qOut, [pTmp], step
        MOV     return, #OMX_Sts_NoErr
        M_EXIT

        ;//--------------------------------------------------------------
        ;// Plane: out[y][x] = sat_u8((a + b*(x-7) + c*(y-7) + 16) >> 5)
        ;//   H = sum over i=1..8 of i*(above[7+i] - above[7-i])
        ;//   V = the same sum over the left column
        ;//   (index -1 is the above-left pixel)
        ;//   b = (5*H + 32) >> 6,  c = (5*V + 32) >> 6
        ;//   a = 16*(above[15] + left[15])
        ;// Instruction order matters: several register names alias each
        ;// other (see the Neon register declarations).
        ;//--------------------------------------------------------------
OMX_VC_16X16_PLANE
        LDR     pMultTable, =armVCM4P10_MultiplierTable16x16
        VLD1    qAbove, [pSrcAbove]                         ;// pSrcAbove[x]      :0<= x <= 15   
        VLD1    dAboveLeft[0],[pSrcAboveLeft]                                               
        ADD     pTmp, pSrcLeft, leftStep
        ADD     step, leftStep, leftStep
        ;// Gather the 16 left pixels, one byte lane per row
        VLD1    {qLeft[0]},  [pSrcLeft],step                                             
        VLD1    {qLeft[1]},  [pTmp],step      
        VLD1    {qLeft[2]},  [pSrcLeft],step  
        VLD1    {qLeft[3]},  [pTmp],step       
        VLD1    {qLeft[4]},  [pSrcLeft],step  
        VLD1    {qLeft[5]},  [pTmp],step      
        VLD1    {qLeft[6]},  [pSrcLeft],step   
        VLD1    {qLeft[7]},  [pTmp],step
        VLD1    {qLeft[8]},  [pSrcLeft],step   
        VLD1    {qLeft[9]},  [pTmp],step      
        VLD1    {qLeft[10]}, [pSrcLeft],step  
        VLD1    {qLeft[11]}, [pTmp],step       
        VLD1    {qLeft[12]}, [pSrcLeft],step  
        VLD1    {qLeft[13]}, [pTmp],step      
        VLD1    {qLeft[14]}, [pSrcLeft],step   
        VLD1    {qLeft[15]}, [pTmp]   

        VREV64  dRevAbove, dAbove1                          ;// pSrcAbove[15:14:13:12:11:10:9:8] 
        VSUBL   qAbove15minus0, dRevAbove, dAboveLeft       ;// qAbove15minus0[0] = pSrcAbove[15] - pSrcAboveLeft[0] 
        VSHR    dRevAbove64, dRevAbove64, #8                ;// pSrcAbove[14:13:12:11:10:9:8:X] 
        VSUBL   qAboveDiff, dRevAbove, dAbove0              ;// lane i = above[14-i] - above[i]; lane 7 is unused
        
        ;// Shift the unused lane out of the upper half and append
        ;// (above[15] - aboveLeft) as its last lane
        VSHL    dAboveDiff64, dAboveDiff64, #16 
        VEXT    dDiffAbove1, dAboveDiff1, dAbove15minus0, #1  

        VREV64  dRevLeft,dLeft1                             ;// pSrcLeft[15:14:13:12:11:10:9:8] 
        VSUBL   qLeft15minus0,dRevLeft, dAboveLeft          ;// qLeft15minus0[0] = left[15] - pSrcAboveLeft[0] 
        VSHR    dRevLeft64, dRevLeft64, #8                  ;// pSrcLeft[14:13:12:11:10:9:8:X] 
        VSUBL   qLeftDiff,dRevLeft, dLeft0                  ;// lane i = left[14-i] - left[i]; lane 7 is unused
        
        ;// Multiplier = [7|6|5|4|3|2|1|8]
        VLD1    qMultiplier, [pMultTable]!                  
        
        VSHL    dLeftDiff64, dLeftDiff64, #16
        VEXT    dDiffLeft1, dLeftDiff1, dLeft15minus0, #1     
        
        ;// Four partial sums each for H and V (32-bit lanes)
        VMULL   qH,dDiffAbove0, dMultiplier0                
        VMULL   qV,dDiffLeft0,  dMultiplier0                
        VMLAL   qH,dDiffAbove1, dMultiplier1 
        VMLAL   qV,dDiffLeft1,  dMultiplier1
        
        ;// Reduce to H in D22 and V in D23 (64-bit), then multiply by 5
        VPADD   dHV00,dH1,dH0                                 
        VPADD   dHV01,dV1,dV0                                 
        VPADDL  qHV, qHV0
        VSHL    qHV1,qHV,#2
        VADD    qHV,qHV,qHV1 
        
        ;// HV = [c = ((5*V+32)>>6) | b = ((5*H+32)>>6)]   (b in D22, c in D23)
        VRSHR   qHV,qHV,#6
        
        ;// HV1 = [c*7|b*7]
        VSHL    qHV1,qHV,#3
        VSUB    qHV1,qHV1,qHV                             
        
        ;// Multiplier0 = [0|1|2|...|7]
        VLD1    qMultiplier0, [pMultTable]!    
        VDUP    qB, dHV0                                  
        VDUP    qC, dHV1 
        
        ;// a = 16*(above[15] + left[15]), replicated to all lanes.
        ;// This overwrites Q11, so b and c must already be in qB/qC.
        VADDL   qA,dAbove1,dLeft1
        VSHL    qA,qA, #4
        VDUP    qA,dA1[3]  
        VADD    dBPlusCMult7, dHV10, dHV11                ;// 7*b + 7*c; must precede the load into Q12 below
        
        ;// Multiplier1 = [8|9|10|...|15]
        VLD1    qMultiplier1, [pMultTable]
        ;// Const = a - 7*(b+c)
        VDUP    qConst, dBPlusCMult7S16[0]
        VSUB    qConst, qA, qConst
        
        ;// B0 = [0*b|1*b|2*b|3*b|......|7*b]
        VMUL    qB0,qB,qMultiplier0
        
        ;// B1 = [8*b|9*b|10*b|11*b|....|15*b]
        VMUL    qB1,qB,qMultiplier1
        
        ;// Row 0 values before rounding: Const + b*x for x = 0..7 and 8..15
        VADD    qSum0, qB0, qConst
        VADD    qSum1, qB1, qConst  
        
        ;// Loops for 16 times
LoopPlane       
        ;// (b*x + c*y + C)>>5
        VQRSHRUN dOut0, qSum0,#5
        VQRSHRUN dOut1, qSum1,#5      
        SUBS     y, y, #1
        VST1     qOut,[pDst],dstStep
        VADD     qSum0,qSum0,qC                             ;// next row: add c to every column
        VADD     qSum1,qSum1,qC 
        BNE      LoopPlane
        
        MOV      return, #OMX_Sts_NoErr

        M_END
        
        ENDIF ;// CortexA8
            
        END
;-----------------------------------------------------------------------------------------------
; omxVCM4P10_PredictIntra_16x16 ends
;-----------------------------------------------------------------------------------------------