media/libstagefright/codecs/on2/h264dec/omxdl/arm11/vc/m4p10/src/armVCM4P10_DeblockingLuma_unsafe_s.s


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380

;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;//      http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;// 
;// File Name:  armVCM4P10_DeblockingLuma_unsafe_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   9641
;// Date:       Thursday, February 7, 2008
;// 
;// 
;// 
;//

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h
        
        M_VARIANTS ARM1136JS


    IF  ARM1136JS

;// MASK_1 has the value 1 in each of the four byte lanes of a word.
;// It is reloaded into m01 inside the bSLT4 kernel after m01's register
;// has been reused as scratch.
MASK_1  EQU 0x01010101

;// ---------------------------------------------------------------------
;// Register aliases.  Many names below share one physical register, so a
;// write through one alias destroys every other alias of that register.
;// Physical register -> aliases declared in this file:
;//   r0  : pQ0, apflg, p_3b, Q1b, a
;//   r1  : StepArg, ptC0, ap0q0, tC, P0a, P2b, t5
;//   r2  : tC0Arg, p_3, filt, min, Q2b
;//   r3  : p_0, u1, t0
;//   r4  : p_2, Q1a, t4
;//   r5  : p_1, t10
;//   r6  : alpha, aqflg, P0b, t8
;//   r7  : m00, pos, Q0a, P1b, t2
;//   r8  : q_0, P1a
;//   r9  : q_1, q_3b, Q0b, t11
;//   r10 : q_2
;//   r11 : m01, t3
;//   r12 : q_3, neg, max, apqflg, t1
;//   r14 : beta, bS, tC0, t9
;// ---------------------------------------------------------------------

;// Declare input registers

pQ0        RN 0
StepArg    RN 1
tC0Arg     RN 2
alpha      RN 6

beta       RN 14
bS         RN 14
tC0        RN 14
ptC0       RN 1

;// Declare Local/Temporary variables

;// Pixels
p_0     RN 3 
p_1     RN 5  
p_2     RN 4  
p_3     RN 2  
q_0     RN 8  
q_1     RN 9  
q_2     RN 10 
q_3     RN 12 


;// Filtering

ap0q0   RN 1  
filt    RN 2
        
;// m00 / m01: per the routine headers these carry the constants used as
;// "all lanes 0" and "all lanes 1" masks (m01 is reloaded from MASK_1).
m00     RN 7
m01     RN 11

apflg   RN 0 
aqflg   RN 6

tC      RN 1


;//Declarations for bSLT4 kernel

pos     RN 7
neg     RN 12

P0a     RN 1   
P1a     RN 8   
Q0a     RN 7  
Q1a     RN 4   

u1      RN 3   
max     RN 12
min     RN 2   
               
                
;//Declarations for bSGE4 kernel

q_3b    RN 9   
p_3b    RN 0
apqflg  RN 12

P0b     RN 6
P1b     RN 7 
P2b     RN 1

Q0b     RN 9 
Q1b     RN 0 
Q2b     RN 2

;// Miscellaneous scratch registers (t6 and t7 are not declared)

a       RN 0
t0      RN 3 
t1      RN 12
t2      RN 7
t3      RN 11
t4      RN 4   
t5      RN 1   
t8      RN 6   
t9      RN 14  
t10     RN 5   
t11     RN 9   

;// Register usage for - armVCM4P10_DeblockingLumabSLT4_unsafe()
;//
;// bSLT4 kernel: works on packed 8-bit lanes (four pixels per register).
;// P0/Q0 receive the clipped delta, P1/Q1 receive the value clipped to
;// [p1-tC0, p1+tC0] / [q1-tC0, q1+tC0]; every result is selected per
;// lane against the original pixel using filt, apflg and aqflg.
;//
;// Inputs - 3,4,5,8,9,10 - Input Pixels (p0-p2,q0-q2)
;//        - 2 - filt, 0 - apflg, 6 - aqflg
;//        - 11 - m01, 7 - tC0
;//         
;// Outputs - 1,8,7,11 - Output Pixels(P0a,P1a,Q0a,Q1a)
;//
;// Registers Corrupted - 0-3,5-12,14
;// NOTE(review): r4 is also written below (as t4 and as Q1a) although it
;// is missing from the list above - confirm and update the list.
;//
;// The USUB8/SSUB8 instructions whose results are discarded are used as
;// per-lane compares: they set the GE flags consumed by the next SEL.
;// M_START/M_END are macros from armCOMM_s.h (not visible here);
;// presumably they emit the entry/exit sequence preserving lr - confirm.


        M_START armVCM4P10_DeblockingLumabSLT4_unsafe, lr

        ;// Since beta <= 18 and alpha <= 255 we know
        ;// -254 <= p0-q0 <= 254
        ;//  -17 <= q1-q0 <= 17
        ;//  -17 <= p1-p0 <= 17

        ;// delta = Clip3( -tC, tC, ((((q0-p0)<<2) + (p1-q1) + 4)>>3))
        ;// 
        ;//    Calculate A = (((q0-p0)<<2) + (p1-q1) + 4)>>3
        ;//                = (4*q0 - 4*p0 + p1 - q1 + 4)>>3
        ;//                = ((p1-p0) - (q1-q0) - 3*(p0-q0) + 4)>>3
        
        USUB8   t1, p_1, p_0                ;// t1 = p1-p0
        ;// tC0 (r14) = r7 * m01.  Presumably this replicates a single tC0
        ;// byte into all four lanes (needs m01 == MASK_1 and r7 < 256 on
        ;// entry) - confirm against the caller.  r7 is reused as t2 below.
        MUL     tC0, t2, m01
        
        USUB8   t2, q_1, q_0                ;// t2 = q1-q0
        SSUB8   t1, t1, t2                  ;// t1 = X = (p1-p0)-(q1-q0)

        ;// Write d = p0-q0 = 2*h + l with l = d & 1 and h = d>>1.
        ;// The halving steps below subtract 3*l and 6*h in stages, so that
        ;// the final t1 = (X - 3*d + 4)>>3 (assuming no lane overflows,
        ;// see the bounds above).
        USUB8   t2, p_0, q_0                ;// t2 = d (modulo 256)
        AND     t2, t2, m01                 ;// t2 = l
        SHSUB8  t1, t1, t2                  ;// t1 = (X - l)>>1
        UHSUB8  t5, p_0, q_0                ;// t5 = h
        SSUB8   t1, t1, t2                  ;// t1 -= l
        SHSUB8  t1, t1, t5                  ;// t1 = (t1 - h)>>1
        MOV     m00, #0                     ;// r7 becomes the zero mask
        SADD8   t1, t1, m01                 ;// t1 += 1 (rounding term)
        SHSUB8  t1, t1, t5                  ;// t1 = (t1 - h)>>1 = A
        
        ;// tC = tC0
        ;// if (ap < beta) tC++;
        ;// if (aq < beta) tC++;
        USUB8   t5, filt, m01               ;// GE = (filt >= 1); t5 result is discarded
        SEL     tC0, tC0, m00               ;// tC0 = 0 in lanes where filt is 0
        UQADD8  tC, tC0, apflg              ;// tC = tC0 + apflg (saturating)
        SSUB8   t1, t1, m00                 ;// GE = (A >= 0), A itself unchanged
        UQADD8  tC, tC, aqflg               ;// tC += aqflg (saturating)

        ;// Split into positive and negative part and clip 
        SEL     pos, t1, m00                ;// pos = A where A >= 0, else 0 (overwrites m00)
        USUB8   neg, pos, t1                ;// neg = pos - A = |A| where A < 0, else 0
        USUB8   t3, pos, tC                 ;// GE = (pos >= tC); t3 shares r11 with m01
        SEL     pos, tC, pos                ;// pos = min(pos, tC)
        USUB8   t3, neg, tC                 ;// GE = (neg >= tC)
        SEL     neg, tC, neg                ;// neg = min(neg, tC)
        
        ;//Reload m01
        ;// (r11 was overwritten through its t3 alias by the compares above)
        LDR     m01,=MASK_1

        ;// P0 = p0 + delta, Q0 = q0 - delta, applied as separate saturating
        ;// positive and negative parts
        UQADD8  P0a, p_0, pos
        UQSUB8  Q0a, q_0, pos
        UQSUB8  P0a, P0a, neg
        UQADD8  Q0a, Q0a, neg
        
        ;// Choose to store the filtered
        ;// value or the original pixel
        USUB8   t1, filt, m01               ;// GE = (filt >= 1)
        SEL     P0a, P0a, p_0
        SEL     Q0a, Q0a, q_0
    
        ;// delta = (p2 + ((p0+q0+1)>>1) - (p1<<1))>>1;
        ;// u1 = (p0 + q0 + 1)>>1
        ;// u1 = ( (q_0 - p_0')>>1 ) ^ 0x80
        MVN     p_0, p_0                    ;// p_0' = ~p0 = 255 - p0 (original p0 is lost)
        UHSUB8  u1, q_0, p_0                ;// u1 (r3) overwrites p_0'
        UQADD8  max, p_1, tC0               ;// max = p1 + tC0 (saturating), tC0 already masked by filt
        EOR     u1, u1, m01 ,LSL #7         ;// toggle bit 7 of each lane (the ^ 0x80 above)
    
        ;// Calculate A = (p2+u1)>>1 
        ;// Then delta = Clip3( -tC0, tC0, A - p1)

        ;// Clip P1
        UHADD8  P1a, p_2, u1                ;// P1a (r8) overwrites q_0, which is no longer needed
        UQSUB8  min, p_1, tC0               ;// min = p1 - tC0 (saturating); r2 was filt
        USUB8   t4, P1a, max                ;// GE = (P1a >= max); writes r4 (p_2)
        SEL     P1a, max, P1a               ;// P1a = min(P1a, max)
        USUB8   t4, P1a, min                ;// GE = (P1a >= min)
        SEL     P1a, P1a, min               ;// P1a = max(P1a, min)

        ;// Clip Q1
        UHADD8  Q1a, q_2, u1                ;// Q1a = (q2+u1)>>1, held in r4
        UQADD8  max, q_1, tC0               ;// max = q1 + tC0 (saturating)
        UQSUB8  min, q_1, tC0               ;// min = q1 - tC0 (saturating)
        USUB8   t0, Q1a, max                ;// GE = (Q1a >= max); t0 (r3) overwrites u1
        SEL     Q1a, max, Q1a               ;// Q1a = min(Q1a, max)
        USUB8   t0, Q1a, min                ;// GE = (Q1a >= min)
        SEL     Q1a, Q1a, min               ;// Q1a = max(Q1a, min)
        
        ;// Choose to store the filtered
        ;// value or the original pixel
        USUB8   t0, apflg, m01              ;// GE = (apflg >= 1)
        SEL     P1a, P1a, p_1
        USUB8   t0, aqflg, m01              ;// GE = (aqflg >= 1)
        SEL     t3, Q1a, q_1                ;// final Q1 is returned in r11 (t3), overwriting m01
        
        M_END

;// Register usage for - armVCM4P10_DeblockingLumabSGE4_unsafe()
;//
;// bSGE4 kernel: works on packed 8-bit lanes (four pixels per register).
;// Computes the P0..P2 / Q0..Q2 expressions quoted in the body and
;// selects, per lane, between those results, the alternative two-pixel
;// average based P0/Q0 value and the original pixels.
;//
;// Inputs - 3,4,5,8,9,10 - Input Pixels (p0-p2,q0-q2)
;//        - 2 - filt, 0 - apflg,aqflg
;//        - 1 - ap0q0, 6 - alpha
;//        - 7 - m00, 11 - m01
;//        - p3 and q3 are read through the pP_3 / pQ_3 arguments
;//          declared with M_ARG below
;//         
;// Outputs - 6,7,1,9,0,2 - Output Pixels(P0b,P1b,P2b, Q0b,Q1b,Q2b)
;// 
;// Registers Corrupted - 0-3,5-12,14
;// NOTE(review): r4 is also written below (as t4) although it is missing
;// from the list above - confirm and update the list.
;//
;// The USUB8 instructions whose results are discarded are used as
;// per-lane compares: they set the GE flags consumed by the next SEL.
;// M_START/M_END/M_ARG/M_LDR are macros from armCOMM_s.h (not visible
;// here); presumably M_ARG names stack arguments and M_LDR loads one -
;// confirm.

        M_START armVCM4P10_DeblockingLumabSGE4_unsafe, lr
    
        ;// apflg = apflg && |p0-q0|<((alpha>>2)+2) 
        ;// aqflg = aqflg && |p0-q0|<((alpha>>2)+2) 
        ;// Both flags arrive together in r0 (see header) and are gated in
        ;// one step into apqflg.

        M_ARG   pDummy,4
        M_ARG   pQ_3,4
        M_ARG   pP_3,4
        
        UHADD8  alpha, alpha, m00           ;// alpha >>= 1 (m00 is the zero mask per the header)
        USUB8   t9, p_2, p_0    ;//t9 = dp2p0
        UHADD8  alpha, alpha, m00           ;// alpha >>= 1 again, i.e. alpha>>2 overall
        ADD     alpha, alpha, m01, LSL #1   ;// add (m01 << 1), i.e. +2 per lane when m01 == MASK_1
        USUB8   ap0q0, ap0q0, alpha         ;// GE = (ap0q0 >= (alpha>>2)+2)
        SEL     apqflg, m00, apflg          ;// apqflg = 0 where GE is set, else the incoming flags

        ;// P0 = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3 
        ;//    = ((p2-p0) + 2*(p1-p0) + (q1-q0) + 3*(q0-p0) + 8*p0 + 4)>>3
        ;//    = p0 + (((p2-p0) + 2*(p1-p0) + (q1-q0) - 3*(p0-q0) + 4)>>3)

        ;// P1 = (p2 + p1 + q0 + p0 + 2)>>2
        ;//    = p0 + (((p2-p0) + (p1-p0) - (p0-q0) + 2)>>2)
        
        ;// P2 = (2*p3 + 3*p2 + p1 + p0 + q0 + 4)>>3
        ;//    = (2*(p3-p0) + 3*(p2-p0) + (p1-p0) - (p0-q0) + 8*p0 + 4)>>3
        ;//    = p0 + (((p3-p0) + (p2-p0) + t2 + 2)>>2)

        ;// Compute P0b
        USUB8   t2, p_0, q_0                ;// t2 = p0-q0 (r7: m00 is lost from here on)
        SSUB8   t5, t9, t2                  ;// t5 = (p2-p0) - (p0-q0) (r1: ap0q0 is lost)

        USUB8   t8, q_1, q_0                ;// t8 = q1-q0 (r6: alpha is lost)
        SHADD8  t8, t5, t8                  ;// t8 = (t5 + (q1-q0))>>1

        USUB8   t9, p_1, p_0                ;// t9 = p1-p0
        SADD8   t8, t8, t9                  ;// t8 += (p1-p0)
        SHSUB8  t8, t8, t2                  ;// t8 = (t8 - (p0-q0))>>1
        SHADD8  t5, t5, t9                  ;// t5 = ((p2-p0) + (p1-p0) - (p0-q0))>>1
        SHADD8  t8, t8, m01                 ;// t8 = (t8 + 1)>>1 : the P0 offset from p0
        SHADD8  t9, t5, m01                 ;// t9 = (t5 + 1)>>1 : the P1 offset from p0
        SADD8   P0b, p_0, t8         
        ;// P0b ready
        
        ;// Compute P1b
        M_LDR   p_3b, pP_3                  ;// p3 into r0; the incoming flags already live in apqflg
        SADD8   P1b, p_0, t9         
        ;// P1b ready
        
        ;// Compute P2b
        USUB8   t9, p_2, p_0                ;// t9 = p2-p0
        SADD8   t5, t5, t9                  ;// t5 += (p2-p0)
        UHSUB8  t9, p_3b, p_0               ;// t9 = (p3-p0)>>1
        EOR     a, p_3b, p_0                ;// bit 0 of p3^p0 equals bit 0 of p3-p0
        AND     a, a, m01                   ;// a = that low bit, which the halving above dropped
        SHADD8  t5, t5, a                   ;// t5 = (t5 + low bit)>>1
        UHADD8  a, p_0, q_1                 ;// a = (p0+q1)>>1, start of the alternative P0
        SADD8   t5, t5, m01                 ;// t5 += 1 (rounding term)
        SHADD8  t5, t5, t9                  ;// t5 = (t5 + (p3-p0)>>1)>>1 : the P2 offset from p0
        MVN     t9, p_1                     ;// t9 = ~p1 = 255 - p1
        SADD8   P2b, p_0, t5         
        ;// P2b ready
        
        ;// Alternative P0: a = (((p0+q1)>>1) + p1 + 1)>>1, built with the
        ;// same subtract-complement / toggle-bit-7 idiom used for averages.
        UHSUB8  a, a, t9
        ORR     t9, apqflg, m01
        USUB8   t9, apqflg, t9              ;// GE = (apqflg >= (apqflg | m01)), i.e. bit 0 of the lane is set

        EOR     a, a, m01, LSL #7
        SEL     P0b, P0b, a                 ;// flag set: full P0, else the alternative P0
        SEL     P1b, P1b, p_1               ;// flag clear: keep p1
        SEL     P2b, P2b, p_2               ;// flag clear: keep p2

        ;// Only P0b is gated on filt here; P1b/P2b depend on apqflg alone.
        USUB8   t4, filt, m01               ;// GE = (filt >= 1); writes r4, p_2 is lost
        SEL     P0b, P0b, p_0

        
        ;// Q0 = (q2 + 2*q1 + 2*q0 + 2*p0 + p1 + 4)>>3 
        ;//    = ((q2-q0) + 2*(q1-q0) + (p1-p0) + 3*(p0-q0) + 8*q0 + 4)>>3
        ;//    = q0 + (((q2-q0) + 2*(q1-q0) + (p1-p0) + 3*(p0-q0) + 4)>>3)

        ;// Q1 = (q2 + q1 + p0 + q0 + 2)>>2
        ;//    = q0 + (((q2-q0) + (q1-q0) + (p0-q0) + 2)>>2)

        ;// Q2 = (2*q3 + 3*q2 + q1 + q0 + p0 + 4)>>3
        ;//    = (2*(q3-q0) + 3*(q2-q0) + (q1-q0) + (p0-q0) + 8*q0 + 4)>>3
        ;//    = q0 + (((q3-q0) + (q2-q0) + t2 + 2)>>2)


        ;// Compute Q0b Q1b
        USUB8   t4, q_2, q_0                ;// t4 = q2-q0
        USUB8   a, p_0, q_0                 ;// a = p0-q0
        USUB8   t9, p_1, p_0                ;// t9 = p1-p0
        SADD8   t0, t4, a                   ;// t0 = (q2-q0) + (p0-q0) (r3: p_0 is lost)
        SHADD8  t9, t0, t9                  ;// t9 = (t0 + (p1-p0))>>1
        UHADD8  t10, q_0, p_1               ;// t10 = (q0+p1)>>1 (r5: p_1 is lost)
        SADD8   t9, t9, a                   ;// t9 += (p0-q0)
        USUB8   a, q_1, q_0                 ;// a = q1-q0
        SHADD8  t9, t9, a                   ;// t9 = (t9 + (q1-q0))>>1
        SHADD8  t0, t0, a                   ;// t0 = ((q2-q0) + (p0-q0) + (q1-q0))>>1
        SHADD8  t9, t9, m01                 ;// t9 = (t9 + 1)>>1 : the Q0 offset from q0
        SHADD8  a, t0, m01                  ;// a = (t0 + 1)>>1 : the Q1 offset from q0
        SADD8   t9, q_0, t9            
        ;// Q0b ready - t9
        
        ;// Shift each flag lane right by one so that the compare against
        ;// m01 below tests the bits above bit 0 (presumably the aq flag -
        ;// confirm the flag packing against the caller).
        MOV     t4, #0
        UHADD8  apqflg, apqflg, t4
        
        SADD8   Q1b, q_0, a 
        ;// Q1b ready
       
        USUB8   t4, apqflg, m01             ;// GE = (shifted flag >= 1)
        SEL     Q1b, Q1b, q_1               ;// flag clear: keep q1
        MVN     t11, q_1                    ;// t11 = ~q1 (r9: q_1 is lost)
        UHSUB8  t10, t10, t11
        M_LDR   q_3b, pQ_3                  ;// q3 into r9
        EOR     t10, t10, m01, LSL #7       ;// t10 = (((q0+p1)>>1) + q1 + 1)>>1, the alternative Q0
        ;// GE still holds the flag test made before the SEL on Q1b above
        SEL     t9, t9, t10            
        
        ;// Compute Q2b
        USUB8   t4, q_2, q_0                ;// t4 = q2-q0
        SADD8   t4, t0, t4                  ;// t4 = t0 + (q2-q0)
        EOR     t0, q_3b, q_0               ;// bit 0 of q3^q0 equals bit 0 of q3-q0
        AND     t0, t0, m01                 ;// t0 = that low bit
        SHADD8  t4, t4, t0                  ;// t4 = (t4 + low bit)>>1
        UHSUB8  t10, q_3b, q_0              ;// t10 = (q3-q0)>>1
        SADD8   t4, t4, m01                 ;// t4 += 1 (rounding term)
        SHADD8  t4, t4, t10                 ;// t4 = (t4 + (q3-q0)>>1)>>1 : the Q2 offset from q0

        USUB8   t10, filt, m01              ;// GE = (filt >= 1)
        SEL     Q0b, t9, q_0                ;// Q0b (r9) replaces q_3b, which is no longer needed

        SADD8   t4, q_0, t4            
        ;// Q2b ready - t4

        USUB8   t10, apqflg, m01            ;// GE = (shifted flag >= 1)
        SEL     Q2b, t4, q_2                ;// Q2b (r2) overwrites filt

        M_END
    
    ENDIF

        END