;// Source path: media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/omxVCM4P10_FilterDeblockingLuma_VerEdge_I_s.s
;// Blob: e6fbb34225800bd48ddba80bbcc005e220d73e98
;//
;// 
;// File Name:  omxVCM4P10_FilterDeblockingLuma_VerEdge_I_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;// 
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;// 
;// 
;//

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h

        M_VARIANTS CortexA8

        IMPORT  armVCM4P10_DeblockingLumabSLT4_unsafe
        IMPORT  armVCM4P10_DeblockingLumabSGE4_unsafe
        
        IF CortexA8

;// Initial value of the XY loop counter. The filter loop doubles XY with
;// ADDS once per edge: the carry-out ends the inner (LoopX) pass and a zero
;// result ends the outer (LoopY) pass. Starting from 0x11000000 the carry is
;// produced on the 4th and 8th doublings and the result becomes zero on the
;// 8th, i.e. two outer passes of four inner iterations each.
LOOP_COUNT  EQU 0x11000000


;// Function arguments

;// r0: pixel pointer used for loads and stores (filtered in place)
pSrcDst     RN 0
;// r1: byte step between consecutive rows
srcdstStep  RN 1
;// r2/r3: alpha and beta byte arrays. pAlpha/pBeta themselves are not
;// referenced by the code below; the _0 aliases name the same registers.
pAlpha      RN 2
pBeta       RN 3

;// r5/r4: loaded from the stack arguments ppThresholds / ppBS
pThresholds RN 5
pBS         RN 4
;// r12: halfword read from pBS, i.e. two bS bytes at once
bS10        RN 12

;// Element 0 of alpha/beta (same registers as pAlpha/pBeta)
pAlpha_0    RN 2
pBeta_0     RN 3

;// Element 1 of alpha/beta (set to pAlpha_0 + 1 / pBeta_0 + 1)
pAlpha_1    RN 7
pBeta_1     RN 8

;// r10: second row pointer (pSrcDst + srcdstStep) for the odd rows
;// r11: 2 * srcdstStep, the post-increment used by both row pointers
pTmp        RN 10
pTmpStep    RN 11

;// Loop

;// r9: combined inner/outer loop counter, initialised from LOOP_COUNT
XY          RN 9

;// Rows input
;// The eight loaded rows are placed directly in the registers that will hold
;// the pixel columns once the 8x8 transpose has been done in place, which is
;// why the numbering below is not sequential.
dRow0       DN D7.U8
dRow1       DN D8.U8  
dRow2       DN D5.U8  
dRow3       DN D10.U8  
dRow4       DN D6.U8  
dRow5       DN D9.U8  
dRow6       DN D4.U8 
dRow7       DN D11.U8 

;// dRow0 - dP_3, dRow1 - dQ_0, dRow2 - dP_1, dRow3 - dQ_2
;// dRow4 - dP_2, dRow5 - dQ_1, dRow6 - dP_0, dRow7 - dQ_3

;// Rows output
;// Used by the bS < 4 path: p1/p0/q0/q1 come from the "n" (new) registers,
;// while p3, p2, q2 and q3 are stored back from their input registers.
dRown0      DN D7.U8
dRown1      DN D24.U8
dRown2      DN D30.U8
dRown3      DN D10.U8
dRown4      DN D6.U8
dRown5      DN D25.U8
dRown6      DN D29.U8
dRown7      DN D11.U8

;// dP_0n       DN D29.U8
;// dP_1n       DN D30.U8
;// dP_2n       DN D31.U8
;// 
;// dQ_0n       DN D24.U8   ;!!;Temp2        
;// dQ_1n       DN D25.U8   ;!!;Temp2        
;// dQ_2n       DN D28.U8   ;!!;dQ_0t        
;// 
;// dRown0 - dP_3,  dRown1 - dQ_0n
;// dRown2 - dP_1n, dRown3 - dQ_2
;// dRown4 - dP_2,  dRown5 - dQ_1n
;// dRown6 - dP_0n, dRown7 - dQ_3

;// Rows output used by the bS >= 4 path: p2 and q2 are also taken from the
;// "n" registers (D31 and D28); only p3 and q3 keep their input registers.
dRow0n      DN D7.U8
dRow1n      DN D24.U8
dRow2n      DN D30.U8
dRow3n      DN D28.U8
dRow4n      DN D31.U8
dRow5n      DN D25.U8
dRow6n      DN D29.U8
dRow7n      DN D11.U8

;// dRow0n - dP_3,  dRow1n - dQ_0n, dRow2n - dP_1n, dRow3n - dQ_2n
;// dRow4n - dP_2n, dRow5n - dQ_1n, dRow6n - dP_0n, dRow7n - dQ_3

;// Pixels
;// After the transpose each register holds one pixel column for eight rows:
;// p3..p0 on the left of the edge (D7..D4), q0..q3 on the right (D8..D11).
dP_0        DN D4.U8
dP_1        DN D5.U8  
dP_2        DN D6.U8  
dP_3        DN D7.U8  
dQ_0        DN D8.U8  
dQ_1        DN D9.U8  
dQ_2        DN D10.U8 
dQ_3        DN D11.U8 


;// Filtering Decision
;// dAlpha/dBeta hold one alpha/beta byte replicated across all eight lanes
;// (they are loaded with the VLD1 {d[]} all-lanes form).
dAlpha      DN D0.U8
dBeta       DN D2.U8

;// dFilt  - per-row mask of rows that are filtered
;// dAqflg - dFilt AND (|q2-q0| < beta)
;// dApflg - dFilt AND (|p2-p0| < beta)
;// Note the register sharing: dAqflg reuses D12 (dAp1p0) and dApflg reuses
;// D17 (dAq2q0), so the order of the final VANDs in the function matters.
dFilt       DN D16.U8
dAqflg      DN D12.U8
dApflg      DN D17.U8 

;// Absolute differences used for the decision (later overwritten by the
;// corresponding compare results).
dAp0q0      DN D13.U8
dAp1p0      DN D12.U8
dAq1q0      DN D18.U8
dAp2p0      DN D19.U8
dAq2q0      DN D17.U8

;// bSLT4
;// The names from here down to dDeltaQ/qQ_0n are not referenced by the code
;// in this file; presumably they mirror the register usage of the imported
;// armVCM4P10_DeblockingLumabSLT4_unsafe helper - confirm against that file.
dTC0        DN D18.U8   
dTC1        DN D19.U8   
dTC01       DN D18.U8   

dTCs        DN D31.S8
dTC         DN D31.U8

;// Constant vectors initialised once at function entry (0 and 1 per byte).
dMask_0     DN D14.U8
dMask_1     DN D15.U8    

;// r6: zero word moved into halves of dFilt to disable groups of four rows
Mask_0      RN 6

dTemp       DN D19.U8

;// Computing P0,Q0
qDq0p0      QN Q10.S16
qDp1q1      QN Q11.S16
qDelta      QN Q10.S16  ; reuse qDq0p0
dDelta      DN D20.S8


;// Computing P1,Q1
dRp0q0      DN D24.U8

dMaxP       DN D23.U8
dMinP       DN D22.U8

dMaxQ       DN D19.U8
dMinQ       DN D21.U8

dDeltaP     DN D26.U8
dDeltaQ     DN D27.U8

qP_0n       QN Q14.S16
qQ_0n       QN Q12.S16

;// Filtered output registers read back by the bS < 4 transpose in this file.
;// (The same four names are defined again, with the same registers, in the
;// bSGE4 group below.)
dQ_0n       DN D24.U8
dQ_1n       DN D25.U8
dP_0n       DN D29.U8
dP_1n       DN D30.U8

;// bSGE4
;// The q*/dHS*/qTemp*/d*_0t names are not referenced by the code in this
;// file; presumably they mirror the register usage of the imported
;// armVCM4P10_DeblockingLumabSGE4_unsafe helper - confirm against that file.

qSp0q0      QN Q10.U16

qSp2q1      QN Q11.U16
qSp0q0p1    QN Q12.U16
qSp3p2      QN Q13.U16
dHSp0q1     DN D28.U8

qSq2p1      QN Q11.U16
qSp0q0q1    QN Q12.U16
qSq3q2      QN Q13.U16  ;!!
dHSq0p1     DN D28.U8   ;!!

qTemp1      QN Q11.U16  ;!!;qSp2q1 
qTemp2      QN Q12.U16  ;!!;qSp0q0p1        

dP_0t       DN D28.U8   ;!!;dHSp0q1        
dQ_0t       DN D22.U8   ;!!;Temp1        

;// Filtered output registers read back by the bS >= 4 transpose in this file
;// (p0..p2 and q0..q2).
dP_0n       DN D29.U8
dP_1n       DN D30.U8
dP_2n       DN D31.U8

dQ_0n       DN D24.U8   ;!!;Temp2        
dQ_1n       DN D25.U8   ;!!;Temp2        
dQ_2n       DN D28.U8   ;!!;dQ_0t        
        
        ;// Function header
        ;//
        ;// omxVCM4P10_FilterDeblockingLuma_VerEdge_I
        ;//
        ;// In-place deblocking of luma pixels across vertical edges. Each
        ;// inner iteration loads an 8-row x 8-byte tile that starts 4 bytes to
        ;// the left of the current edge, transposes it so that every D register
        ;// holds one pixel column (p3..p0, q0..q3), builds the filter masks,
        ;// calls one of the two imported filter kernels, transposes the result
        ;// back and stores the tile. The inner loop (LoopX) walks four edges
        ;// 4 bytes apart; the outer loop (LoopY) repeats this 8 rows further
        ;// down, for two passes in total.
        ;//
        ;// Inputs:
        ;//   r0 (pSrcDst)    - pixel pointer, data is updated in place
        ;//   r1 (srcdstStep) - byte distance between consecutive rows
        ;//   r2 (pAlpha)     - two alpha bytes: [0] is used for the first edge
        ;//                     of each pass, [1] for the remaining edges
        ;//   r3 (pBeta)      - two beta bytes, used like pAlpha
        ;//   stack           - ppThresholds, then ppBS (4 bytes each)
        ;// Output:
        ;//   r0 = OMX_Sts_NoErr
        ;//
        ;// M_START/M_ARG/M_LDR/M_END are macros from armCOMM_s.h (not visible
        ;// here); presumably "r11, d15" requests that r4-r11 and d8-d15 be
        ;// preserved - confirm against the macro definitions.
        M_START omxVCM4P10_FilterDeblockingLuma_VerEdge_I, r11, d15
        
        ;//Arguments on the stack
        M_ARG   ppThresholds, 4
        M_ARG   ppBS, 4
        
        ;// d0-dAlpha_0
        ;// d2-dBeta_0

        ;// Keep pointers to the second alpha/beta byte for the later edges.
        ADD         pAlpha_1, pAlpha_0, #1
        ADD         pBeta_1, pBeta_0, #1
        
        ;// Replicate alpha[0]/beta[0] into every lane for the first edge.
        ;// pSrcDst is moved 4 bytes back so that an 8-byte row load covers
        ;// p3..p0 and q0..q3 around the edge.
        VLD1        {dAlpha[]}, [pAlpha_0]
        SUB         pSrcDst, pSrcDst, #4
        VLD1        {dBeta[]}, [pBeta_0] 
        
        M_LDR       pBS, ppBS
        M_LDR       pThresholds, ppThresholds 

        ;// Zero word used below to clear one half of dFilt.
        MOV         Mask_0,#0

        ;dMask_0-14
        ;dMask_1-15

        ;// Constant vectors; not read in this file, presumably consumed by the
        ;// imported _unsafe helpers - confirm.
        VMOV        dMask_0, #0     
        VMOV        dMask_1, #1     

        ;// XY drives both loops: carry-out of ADDS XY,XY,XY ends LoopX, a zero
        ;// result ends LoopY.
        LDR         XY,=LOOP_COUNT
    
        ;// Even rows are addressed through pSrcDst and odd rows through pTmp,
        ;// both stepping by two rows at a time.
        ADD         pTmpStep, srcdstStep, srcdstStep

        ;// p0-p3 - d4-d7
        ;// q0-q3 - d8-d11
LoopY        
LoopX        
        ;// Read two bS bytes (low byte: rows 0-3, high byte: rows 4-7 of this
        ;// tile) and advance pBS by 4 for the next edge.
        LDRH        bS10, [pBS], #4

        ;// Both bS values zero: nothing to filter for this tile.
        CMP         bS10, #0
        BEQ         NoFilterBS0

        ;// Load 8 rows of data
        ;// The loads are interleaved with the first stages of the transpose.
        ADD         pTmp, pSrcDst, srcdstStep
        VLD1        dRow0, [pSrcDst], pTmpStep
        VLD1        dRow1, [pTmp], pTmpStep
        VLD1        dRow2, [pSrcDst], pTmpStep
        VZIP.8      dRow0, dRow1
        VLD1        dRow3, [pTmp], pTmpStep
        VLD1        dRow4, [pSrcDst], pTmpStep
        VZIP.8      dRow2, dRow3
        VLD1        dRow5, [pTmp], pTmpStep
        VLD1        dRow6, [pSrcDst], pTmpStep
        VLD1        dRow7, [pTmp], pTmpStep
        VZIP.8      dRow4, dRow5
        VZIP.16     dRow1, dRow3
    

        ;// Layout of the rows as loaded from memory (before any VZIP):
        ;// dRow0 = [q3r0 q2r0 q1r0 q0r0 p0r0 p1r0 p2r0 p3r0]
        ;// dRow1 = [q3r1 q2r1 q1r1 q0r1 p0r1 p1r1 p2r1 p3r1]
        ;// dRow2 = [q3r2 q2r2 q1r2 q0r2 p0r2 p1r2 p2r2 p3r2]
        ;// dRow3 = [q3r3 q2r3 q1r3 q0r3 p0r3 p1r3 p2r3 p3r3]
        ;// dRow4 = [q3r4 q2r4 q1r4 q0r4 p0r4 p1r4 p2r4 p3r4]
        ;// dRow5 = [q3r5 q2r5 q1r5 q0r5 p0r5 p1r5 p2r5 p3r5]
        ;// dRow6 = [q3r6 q2r6 q1r6 q0r6 p0r6 p1r6 p2r6 p3r6]
        ;// dRow7 = [q3r7 q2r7 q1r7 q0r7 p0r7 p1r7 p2r7 p3r7]

        ;// 8x8 Transpose

        VZIP.8      dRow6, dRow7

        ;// Four post-increments of 2*srcdstStep moved pSrcDst down 8 rows;
        ;// rewind it to the top of the tile for the store phase.
        SUB         pSrcDst, pSrcDst, srcdstStep, LSL #3
        VZIP.16     dRow0, dRow2
        VZIP.16     dRow5, dRow7
        

        VZIP.16     dRow4, dRow6
        VZIP.32     dRow1, dRow5
        VZIP.32     dRow2, dRow6
        VZIP.32     dRow3, dRow7
        VZIP.32     dRow0, dRow4
        

        ;// dRow0 - dP_3, dRow1 - dQ_0, dRow2 - dP_1, dRow3 - dQ_2
        ;// dRow4 - dP_2, dRow5 - dQ_1, dRow6 - dP_0, dRow7 - dQ_3

        ;// dQ_0 = [q0r7 q0r6 q0r5 q0r4 q0r3 q0r2 q0r1 q0r0]
        ;// dQ_1 = [q1r7 q1r6 q1r5 q1r4 q1r3 q1r2 q1r1 q1r0]
        ;// dQ_2 = [q2r7 q2r6 q2r5 q2r4 q2r3 q2r2 q2r1 q2r0]
        ;// dQ_3 = [q3r7 q3r6 q3r5 q3r4 q3r3 q3r2 q3r1 q3r0]

        ;// dP_0 = [p0r7 p0r6 p0r5 p0r4 p0r3 p0r2 p0r1 p0r0]
        ;// dP_1 = [p1r7 p1r6 p1r5 p1r4 p1r3 p1r2 p1r1 p1r0]
        ;// dP_2 = [p2r7 p2r6 p2r5 p2r4 p2r3 p2r2 p2r1 p2r0]
        ;// dP_3 = [p3r7 p3r6 p3r5 p3r4 p3r3 p3r2 p3r1 p3r0]

        ;// Filter decision, one lane per row:
        ;//   dFilt  = (|p0-q0| < alpha) & (max(|p1-p0|,|q1-q0|) < beta)
        ;//            with rows 0-3 / 4-7 cleared when their bS byte is 0
        ;//   dApflg = dFilt & (|p2-p0| < beta)
        ;//   dAqflg = dFilt & (|q2-q0| < beta)
        VABD        dAp0q0, dP_0, dQ_0
        VABD        dAp1p0, dP_1, dP_0

        VABD        dAq1q0, dQ_1, dQ_0
        VABD        dAp2p0, dP_2, dP_0
        
        ;// Z is set when the low bS byte (rows 0-3) is zero; it is consumed by
        ;// the VMOVEQ below.
        TST         bS10, #0xff
        VCGT        dFilt, dAlpha, dAp0q0

        VMAX        dAp1p0, dAq1q0, dAp1p0
        VABD        dAq2q0, dQ_2, dQ_0

        ;// bS == 0 for rows 0-3: clear lanes 0-3 of the filter mask.
        VMOVEQ.U32  dFilt[0], Mask_0
        ;// Same test for the high bS byte (rows 4-7).
        TST         bS10, #0xff00

        VCGT        dAp2p0, dBeta, dAp2p0
        VCGT        dAp1p0, dBeta, dAp1p0

        ;// bS == 0 for rows 4-7: clear lanes 4-7 of the filter mask.
        VMOVEQ.U32  dFilt[1], Mask_0

        VCGT        dAq2q0, dBeta, dAq2q0
        VAND        dFilt, dFilt, dAp1p0
        ;// Bit 2 of the low bS byte selects the bS >= 4 kernel for the whole
        ;// tile. NOTE(review): the high bS byte is not tested here; this
        ;// assumes both bS values of a pair are on the same side of 4 -
        ;// confirm against the callers.
        TST         bS10, #4 

        ;// dAqflg must be computed first: dApflg is D17, which still holds
        ;// dAq2q0 until it is overwritten by the second VAND.
        VAND        dAqflg, dFilt, dAq2q0
        VAND        dApflg, dFilt, dAp2p0
    
        BNE         bSGE4        
bSLT4
        ;// bS < 4 Filtering

        ;// External kernel. Judging by the registers stored below it is
        ;// expected to return the filtered columns in dP_1n, dP_0n, dQ_0n and
        ;// dQ_1n - confirm against its source.
        BL          armVCM4P10_DeblockingLumabSLT4_unsafe

        ;// Transpose
        ;// Columns back to rows; p3, p2, q2 and q3 are the unfiltered inputs.

        VZIP.8      dP_3,  dP_2  
        VZIP.8      dP_1n, dP_0n
        VZIP.8      dQ_0n, dQ_1n
        VZIP.8      dQ_2,  dQ_3

        
        VZIP.16     dP_3,  dP_1n
        ADD         pTmp, pSrcDst, srcdstStep
        VZIP.16     dQ_0n, dQ_2
        VZIP.16     dQ_1n, dQ_3
        VZIP.16     dP_2,  dP_0n

        VZIP.32     dP_3,  dQ_0n
        VZIP.32     dP_1n, dQ_2
        VZIP.32     dP_2,  dQ_1n
        VZIP.32     dP_0n, dQ_3

        ;// dRown0 - dP_3,  dRown1 - dQ_0n
        ;// dRown2 - dP_1n, dRown3 - dQ_2
        ;// dRown4 - dP_2,  dRown5 - dQ_1n
        ;// dRown6 - dP_0n, dRown7 - dQ_3

        VST1        dRown0, [pSrcDst], pTmpStep
        VST1        dRown1, [pTmp], pTmpStep
        VST1        dRown2, [pSrcDst], pTmpStep
        VST1        dRown3, [pTmp], pTmpStep
        ;1
        VST1        dRown4, [pSrcDst], pTmpStep
        VST1        dRown5, [pTmp], pTmpStep
        ;// Advance the loop counter; the C and Z flags set here are consumed
        ;// by BCC LoopX below and by BNE LoopY at ExitLoopY (nothing in
        ;// between sets flags).
        ADDS        XY, XY, XY
        VST1        dRown6, [pSrcDst], pTmpStep
        ;// This path adds 2 while the other two paths add 4, and ExitLoopY
        ;// rewinds by 14 (= 4*4 - 2); presumably the bSLT4 helper has already
        ;// advanced pThresholds by 2 - confirm against its source.
        ADD         pThresholds, pThresholds, #2
        ;// NOTE(review): the post-increment here is srcdstStep whereas the
        ;// bSGE4 path uses pTmpStep. pTmp is recomputed before its next use
        ;// in this file, so the difference is not observable here.
        VST1        dRown7, [pTmp], srcdstStep

        ;// Back to the top row of the tile, then 4 bytes right to the next
        ;// edge; edges after the first use alpha[1]/beta[1].
        SUB         pSrcDst, pSrcDst, srcdstStep, LSL #3
        VLD1        {dAlpha[]}, [pAlpha_1]
        ADD         pSrcDst, pSrcDst, #4
        VLD1        {dBeta[]}, [pBeta_1]

        BCC         LoopX
        B           ExitLoopY        

NoFilterBS0
        ;// Nothing was loaded or stored: just step to the next edge and keep
        ;// the counter, thresholds pointer and alpha/beta in sync with the
        ;// filtering paths.
        ADD         pSrcDst, pSrcDst, #4
        ADDS        XY, XY, XY
        VLD1        {dAlpha[]}, [pAlpha_1]
        ADD         pThresholds, pThresholds, #4
        VLD1        {dBeta[]}, [pBeta_1]
        BCC         LoopX
        B           ExitLoopY        
bSGE4        
        ;// bS >= 4 Filtering
        
        ;// External kernel. Judging by the registers stored below it is
        ;// expected to return the filtered columns in dP_2n, dP_1n, dP_0n,
        ;// dQ_0n, dQ_1n and dQ_2n - confirm against its source.
        BL          armVCM4P10_DeblockingLumabSGE4_unsafe

        ;// Transpose
        ;// Columns back to rows; only p3 and q3 are the unfiltered inputs.

        VZIP.8      dP_3,  dP_2n   
        VZIP.8      dP_1n, dP_0n
        VZIP.8      dQ_0n, dQ_1n
        VZIP.8      dQ_2n, dQ_3

        VZIP.16     dP_3,  dP_1n
        ADD         pTmp, pSrcDst, srcdstStep
        VZIP.16     dQ_0n, dQ_2n
        VZIP.16     dQ_1n, dQ_3
        VZIP.16     dP_2n, dP_0n

        VZIP.32     dP_3,  dQ_0n
        VZIP.32     dP_1n, dQ_2n
        VZIP.32     dP_2n, dQ_1n
        VZIP.32     dP_0n, dQ_3

        ;// dRow0n - dP_3,  dRow1n - dQ_0n, dRow2n - dP_1n, dRow3n - dQ_2n
        ;// dRow4n - dP_2n, dRow5n - dQ_1n, dRow6n - dP_0n, dRow7n - dQ_3
        
        VST1        dRow0n, [pSrcDst], pTmpStep
        VST1        dRow1n, [pTmp], pTmpStep
        VST1        dRow2n, [pSrcDst], pTmpStep
        VST1        dRow3n, [pTmp], pTmpStep
        VST1        dRow4n, [pSrcDst], pTmpStep
        VST1        dRow5n, [pTmp], pTmpStep
        ;// Advance the loop counter; flags are consumed by BCC LoopX below and
        ;// by BNE LoopY at ExitLoopY.
        ADDS        XY,XY,XY
        VST1        dRow6n, [pSrcDst], pTmpStep
        ADD         pThresholds, pThresholds, #4
        VST1        dRow7n, [pTmp], pTmpStep

        ;// Back to the top row of the tile, then 4 bytes right to the next
        ;// edge; edges after the first use alpha[1]/beta[1].
        SUB         pSrcDst, pSrcDst, srcdstStep, LSL #3
        VLD1        {dAlpha[]}, [pAlpha_1]
        ADD         pSrcDst, pSrcDst, #4
        VLD1        {dBeta[]}, [pBeta_1]

        BCC         LoopX

ExitLoopY        
        ;// End of one pass over four edges. pBS and pThresholds advanced by 16
        ;// in total during the pass; stepping back 14 leaves them 2 bytes past
        ;// where the pass started. pSrcDst goes back 16 bytes and down 8 rows,
        ;// and alpha[0]/beta[0] are reloaded for the first edge of the next
        ;// pass.
        SUB         pBS, pBS, #14
        SUB         pThresholds, pThresholds, #14
        SUB         pSrcDst, pSrcDst, #16
        VLD1        {dAlpha[]}, [pAlpha_0]
        ADD         pSrcDst, pSrcDst, srcdstStep, LSL #3 
        VLD1        {dBeta[]}, [pBeta_0]
        ;// Z still reflects the last ADDS on XY: loop until XY reaches zero.
        BNE         LoopY

        MOV         r0, #OMX_Sts_NoErr

        M_END
        
    ENDIF
    
        
        END