1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
|
;//
;// Copyright (C) 2007-2008 ARM Limited
;//
;// Licensed under the Apache License, Version 2.0 (the "License");
;// you may not use this file except in compliance with the License.
;// You may obtain a copy of the License at
;//
;// http://www.apache.org/licenses/LICENSE-2.0
;//
;// Unless required by applicable law or agreed to in writing, software
;// distributed under the License is distributed on an "AS IS" BASIS,
;// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
;// See the License for the specific language governing permissions and
;// limitations under the License.
;//
;//
;//
;// File Name: armVCM4P10_DeblockingLuma_unsafe_s.s
;// OpenMAX DL: v1.0.2
;// Revision: 9641
;// Date: Thursday, February 7, 2008
;//
;//
;//
;//
INCLUDE omxtypes_s.h
INCLUDE armCOMM_s.h
;// Build-variant guard: everything from the IF below to the matching ENDIF
;// is assembled only when the ARM1136JS symbol is true.
;// NOTE(review): M_VARIANTS is a macro from armCOMM_s.h; presumably it
;// declares the supported CPU variants - confirm against that header.
M_VARIANTS ARM1136JS
IF ARM1136JS
;// The value 0x01 replicated into each of the four byte lanes of a word.
MASK_1 EQU 0x01010101
;// ---------------------------------------------------------------------
;// Register aliases (RN).
;// Many names below deliberately map to the SAME physical register; a
;// name is only valid while the value it describes is live. Writing one
;// alias destroys every other alias of that register, so instruction
;// order in the kernels matters.
;// ---------------------------------------------------------------------
;// Declare input registers
;// (pQ0, StepArg, tC0Arg, beta, bS and ptC0 are not referenced by the two
;// kernels in this section.)
pQ0 RN 0
StepArg RN 1
tC0Arg RN 2
alpha RN 6
;// beta, bS and tC0 all alias r14 (as does t9 further down).
beta RN 14
bS RN 14
tC0 RN 14
ptC0 RN 1
;// Declare Local/Temporary variables
;// Pixels
;// Each pixel register is processed with 8-bit SIMD instructions, i.e. as
;// four independent byte lanes.
p_0 RN 3
p_1 RN 5
p_2 RN 4
p_3 RN 2
q_0 RN 8
q_1 RN 9
q_2 RN 10
q_3 RN 12
;// Filtering
;// ap0q0 - compared against the alpha-derived threshold in the bSGE4 kernel
;// filt  - per-lane "apply the filter" flag (tested with USUB8 against m01)
;// m00   - all-zero constant, m01 - 0x01-per-byte constant
;// apflg / aqflg - per-lane flags selecting the extra p-side / q-side updates
ap0q0 RN 1
filt RN 2
m00 RN 7
m01 RN 11
apflg RN 0
aqflg RN 6
tC RN 1
;//Declarations for bSLT4 kernel
;// pos / neg hold the positive and negative parts of the clipped delta.
;// P0a,P1a,Q0a,Q1a are the filtered pixels; max / min are clip bounds.
pos RN 7
neg RN 12
P0a RN 1
P1a RN 8
Q0a RN 7
Q1a RN 4
u1 RN 3
max RN 12
min RN 2
;//Declarations for bSGE4 kernel
;// q_3b / p_3b receive the outermost pixels loaded from stack arguments.
;// apqflg holds the combined ap/aq flags after the alpha threshold test.
q_3b RN 9
p_3b RN 0
apqflg RN 12
P0b RN 6
P1b RN 7
P2b RN 1
Q0b RN 9
Q1b RN 0
Q2b RN 2
;// Miscellaneous
;// Scratch names; every one of them overlaps a pixel, flag or constant
;// alias declared above (e.g. t0 = r3 = p_0, t3 = r11 = m01, t4 = r4 = p_2).
a RN 0
t0 RN 3
t1 RN 12
t2 RN 7
t3 RN 11
t4 RN 4
t5 RN 1
t8 RN 6
t9 RN 14
t10 RN 5
t11 RN 9
;// Register usage for - armVCM4P10_DeblockingLumabSLT4_unsafe()
;//
;// Purpose - luma edge-filter kernel for the bS < 4 case (going by the
;//           routine name). All pixel arithmetic uses 8-bit SIMD
;//           instructions, so each register carries four byte lanes that
;//           are filtered independently.
;//
;// Inputs - 3,4,5,8,9,10 - Input Pixels (p0-p2,q0-q2)
;// - 2 - filt, 0 - apflg, 6 - aqflg
;// - 11 - m01, 7 - tC0
;//
;// Outputs - 1,8,7,11 - Output Pixels(P0a,P1a,Q0a,Q1a)
;//
;// Registers Corrupted - 0-3,5-12,14
;// NOTE(review): r4 is also written by this routine (as t4 and as Q1a),
;// so it should be treated as corrupted too even though the list above
;// omits it.
;//
;// Conventions used below:
;//   USUB8 x, a, b - sets the per-byte GE flags where a >= b (unsigned)
;//   SEL   d, a, b - per byte: d = GE ? a : b
;// A USUB8 whose destination is never read is used purely as a compare.
;// NOTE(review): M_START/M_END are macros from armCOMM_s.h; presumably
;// they emit the prologue/epilogue that saves and restores lr - confirm.
M_START armVCM4P10_DeblockingLumabSLT4_unsafe, lr
;// Since beta <= 18 and alpha <= 255 we know
;// -254 <= p0-q0 <= 254
;// -17 <= q1-q0 <= 17
;// -17 <= p1-p0 <= 17
;// delta = Clip3( -tC, tC, ((((q0-p0)<<2) + (p1-q1) + 4)>>3))
;//
;// Calculate A = (((q0-p0)<<2) + (p1-q1) + 4)>>3
;// = (4*q0 - 4*p0 + p1 - q1 + 4)>>3
;// = ((p1-p0) - (q1-q0) - 3*(p0-q0) + 4)>>3
;// t1 = p1 - p0
USUB8 t1, p_1, p_0
;// tC0 (r14) = r7 * m01. With m01 = 0x01010101 this copies a byte value in
;// r7 into all four lanes - assumes r7 holds a single byte value and m01
;// holds MASK_1 on entry; TODO confirm with the caller.
MUL tC0, t2, m01
;// t1 = (p1-p0) - (q1-q0)
USUB8 t2, q_1, q_0
SSUB8 t1, t1, t2
;// t2 = least-significant bit of each byte of (p0-q0); t5 = (p0-q0)>>1.
;// (p0-q0) needs 9 bits, so it is carried as a halved value plus its low
;// bit and subtracted in three steps while t1 is halved three times (>>3).
USUB8 t2, p_0, q_0
AND t2, t2, m01
SHSUB8 t1, t1, t2
UHSUB8 t5, p_0, q_0
SSUB8 t1, t1, t2
SHSUB8 t1, t1, t5
;// m00 shares r7 with t2; t2 is dead from here on.
MOV m00, #0
;// Add 1 per lane before the final halving (rounding term).
SADD8 t1, t1, m01
SHSUB8 t1, t1, t5
;// tC = tC0
;// if (ap < beta) tC++;
;// if (aq < beta) tC++;
;// GE = (filt >= 1) per lane; lanes that are not filtered get tC0 = 0.
USUB8 t5, filt, m01
SEL tC0, tC0, m00
UQADD8 tC, tC0, apflg
;// Subtracting zero leaves t1 unchanged but sets GE from the sign of each
;// byte of t1, ready for the positive/negative split below.
SSUB8 t1, t1, m00
UQADD8 tC, tC, aqflg
;// Split into positive and negative part and clip
;// pos = max(t1, 0); neg = pos - t1, i.e. |t1| in the negative lanes, 0 elsewhere.
SEL pos, t1, m00
USUB8 neg, pos, t1
;// pos = min(pos, tC). t3 aliases m01 (r11), so m01 is destroyed here.
USUB8 t3, pos, tC
SEL pos, tC, pos
;// neg = min(neg, tC)
USUB8 t3, neg, tC
SEL neg, tC, neg
;//Reload m01
LDR m01,=MASK_1
;// P0 = p0 + pos - neg, Q0 = q0 - pos + neg, using unsigned saturating
;// byte arithmetic so results stay within 0..255.
UQADD8 P0a, p_0, pos
UQSUB8 Q0a, q_0, pos
UQSUB8 P0a, P0a, neg
UQADD8 Q0a, Q0a, neg
;// Choose to store the filtered
;// value or the original pixel
;// GE = (filt >= 1) per lane.
USUB8 t1, filt, m01
SEL P0a, P0a, p_0
SEL Q0a, Q0a, q_0
;// delta = (p2 + ((p0+q0+1)>>1) - (p1<<1))>>1;
;// u1 = (p0 + q0 + 1)>>1
;// u1 = ( (q_0 - p_0')>>1 ) ^ 0x80
;// p_0' is the bitwise complement (255 - p0); the original p0 is no longer
;// needed after this point. u1 overwrites r3 (same register as p_0).
MVN p_0, p_0
UHSUB8 u1, q_0, p_0
;// max = upper clip bound for P1 = p1 + tC0 (saturating)
UQADD8 max, p_1, tC0
;// m01 LSL #7 = 0x80 in every byte lane.
EOR u1, u1, m01 ,LSL #7
;// Calculate A = (p2+u1)>>1
;// Then delta = Clip3( -tC0, tC0, A - p1)
;// Clip P1
;// P1a overwrites r8 (q_0); q_0 is dead from here on.
UHADD8 P1a, p_2, u1
;// min = lower clip bound = p1 - tC0 (saturating); overwrites r2 (filt).
UQSUB8 min, p_1, tC0
;// P1a = min(P1a, max); t4 is a throw-away destination that overwrites
;// r4 (p_2).
USUB8 t4, P1a, max
SEL P1a, max, P1a
;// P1a = max(P1a, min)
USUB8 t4, P1a, min
SEL P1a, P1a, min
;// Clip Q1
;// Same sequence for the q side: Q1a = clip((q2+u1)>>1, q1-tC0, q1+tC0)
UHADD8 Q1a, q_2, u1
UQADD8 max, q_1, tC0
UQSUB8 min, q_1, tC0
USUB8 t0, Q1a, max
SEL Q1a, max, Q1a
USUB8 t0, Q1a, min
SEL Q1a, Q1a, min
;// Choose to store the filtered
;// value or the original pixel
;// P1 is replaced only in lanes where apflg >= 1, Q1 only where aqflg >= 1.
USUB8 t0, apflg, m01
SEL P1a, P1a, p_1
USUB8 t0, aqflg, m01
;// The final Q1 value is returned in t3 (r11), replacing m01.
SEL t3, Q1a, q_1
M_END
;// Register usage for - armVCM4P10_DeblockingLumabSGE4_unsafe()
;//
;// Purpose - luma edge-filter kernel for the bS >= 4 case (going by the
;//           routine name). All pixel arithmetic uses 8-bit SIMD
;//           instructions, so each register carries four byte lanes that
;//           are filtered independently.
;//
;// Inputs - 3,4,5,8,9,10 - Input Pixels (p0-p2,q0-q2)
;// - 2 - filt, 0 - apflg,aqflg
;// - 1 - ap0q0, 6 - alpha
;// - 7 - m00, 11 - m01
;// - p3 and q3 are read from the stack arguments pP_3 and pQ_3
;//
;// Outputs - 6,7,1,9,0,2 - Output Pixels(P0b,P1b,P2b, Q0b,Q1b,Q2b)
;//
;// Registers Corrupted - 0-3,5-12,14
;// NOTE(review): r4 is also written by this routine (as t4), so it should
;// be treated as corrupted too even though the list above omits it.
;//
;// Conventions used below:
;//   USUB8 x, a, b - sets the per-byte GE flags where a >= b (unsigned)
;//   SEL   d, a, b - per byte: d = GE ? a : b
;// A USUB8 whose destination is never read is used purely as a compare.
;// The ap and aq flags arrive packed in r0: the code below tests bit 0 of
;// each byte for the p side and, after halving, bit 1 (and above) for the
;// q side.
;// NOTE(review): M_START/M_END/M_ARG/M_LDR are macros from armCOMM_s.h;
;// presumably M_ARG names a 4-byte stack argument and M_LDR loads it -
;// confirm against that header.
M_START armVCM4P10_DeblockingLumabSGE4_unsafe, lr
;// apflg = apflg && |p0-q0|<((alpha>>2)+2)
;// aqflg = aqflg && |p0-q0|<((alpha>>2)+2)
M_ARG pDummy,4
M_ARG pQ_3,4
M_ARG pP_3,4
;// Two halving adds with m00 give alpha>>2 per lane (m00 is expected to be
;// zero on entry, per its name - TODO confirm with the caller).
UHADD8 alpha, alpha, m00
USUB8 t9, p_2, p_0 ;// t9 = p2 - p0 (dp2p0)
UHADD8 alpha, alpha, m00
;// Add m01<<1 with an ordinary 32-bit ADD: +2 in every lane when m01 is
;// 0x01010101.
ADD alpha, alpha, m01, LSL #1
;// GE = (ap0q0 >= threshold); in those lanes the flags are replaced by m00,
;// otherwise the incoming flags are kept.
USUB8 ap0q0, ap0q0, alpha
SEL apqflg, m00, apflg
;// P0 = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3
;// = ((p2-p0) + 2*(p1-p0) + (q1-q0) + 3*(q0-p0) + 8*p0 + 4)>>3
;// = p0 + (((p2-p0) + 2*(p1-p0) + (q1-q0) - 3*(p0-q0) + 4)>>3)
;// P1 = (p2 + p1 + q0 + p0 + 2)>>2
;// = p0 + (((p2-p0) + (p1-p0) - (p0-q0) + 2)>>2)
;// P2 = (2*p3 + 3*p2 + p1 + p0 + q0 + 4)>>3
;// = (2*(p3-p0) + 3*(p2-p0) + (p1-p0) - (p0-q0) + 8*p0 + 4)>>3
;// = p0 + (((p3-p0) + (p2-p0) + t2 + 2)>>2)
;// Compute P0b
;// t2 = p0 - q0 (overwrites r7, so m00 is dead from here on)
USUB8 t2, p_0, q_0
;// t5 = (p2-p0) - (p0-q0): partial sum shared by P0, P1 and P2
SSUB8 t5, t9, t2
;// t8 = q1 - q0 (overwrites r6, so alpha is dead from here on)
USUB8 t8, q_1, q_0
;// The differences are accumulated with halving adds/subtracts so the
;// running sum stays within a signed byte while the >>3 is applied in
;// three steps.
SHADD8 t8, t5, t8
USUB8 t9, p_1, p_0
SADD8 t8, t8, t9
SHSUB8 t8, t8, t2
;// t5 = (t5 + (p1-p0))>>1: first halving of the P1 sum
SHADD8 t5, t5, t9
;// Add 1 per lane and halve: rounding plus the final shift.
SHADD8 t8, t8, m01
SHADD8 t9, t5, m01
SADD8 P0b, p_0, t8
;// P0b ready
;// Compute P1b
;// p_3b shares r0 with apflg; the flags now live in apqflg (r12).
M_LDR p_3b, pP_3
;// P1b overwrites r7 (t2).
SADD8 P1b, p_0, t9
;// P1b ready
;// Compute P2b
USUB8 t9, p_2, p_0
SADD8 t5, t5, t9
;// t9 = (p3-p0)>>1; the bit dropped by the halving equals the low bit of
;// (p3 ^ p0), which is extracted into 'a' and folded back in below.
UHSUB8 t9, p_3b, p_0
EOR a, p_3b, p_0
AND a, a, m01
SHADD8 t5, t5, a
;// Start of the alternative P0 value used when the ap flag is clear:
;// a = (p0 + q1)>>1
UHADD8 a, p_0, q_1
SADD8 t5, t5, m01
SHADD8 t5, t5, t9
;// t9 = ~p1 = 255 - p1, so the UHSUB8 below effectively adds p1.
MVN t9, p_1
SADD8 P2b, p_0, t5
;// P2b ready
;// a = (((p0+q1)>>1) + p1 - 255)>>1; the EOR with 0x80 below turns this
;// into (2*p1 + p0 + q1 + 2)>>2.
UHSUB8 a, a, t9
;// GE = bit 0 of each apqflg byte: x >= (x | 1) holds only when bit 0 of
;// x is already set.
ORR t9, apqflg, m01
USUB8 t9, apqflg, t9
EOR a, a, m01, LSL #7
;// Lanes with the ap flag set take the P0b/P1b/P2b values computed above;
;// the others take the alternative P0 and keep p1, p2.
SEL P0b, P0b, a
SEL P1b, P1b, p_1
SEL P2b, P2b, p_2
;// GE = (filt >= 1); unfiltered lanes keep the original p0.
;// t4 is a throw-away destination that overwrites r4 (p_2).
USUB8 t4, filt, m01
SEL P0b, P0b, p_0
;// Q0 = (q2 + 2*q1 + 2*q0 + 2*p0 + p1 + 4)>>3
;// = ((q2-q0) + 2*(q1-q0) + (p1-p0) + 3*(p0-q0) + 8*q0 + 4)>>3
;// = q0 + (((q2-q0) + 2*(q1-q0) + (p1-p0) + 3*(p0-q0) + 4)>>3)
;// Q1 = (q2 + q1 + p0 + q0 + 2)>>2
;// = q0 + (((q2-q0) + (q1-q0) + (p0-q0) + 2)>>2)
;// Q2 = (2*q3 + 3*q2 + q1 + q0 + p0 + 4)>>3
;// = (2*(q3-q0) + 3*(q2-q0) + (q1-q0) + (p0-q0) + 8*q0 + 4)>>3
;// = q0 + (((q3-q0) + (q2-q0) + t2 + 2)>>2)
;// Compute Q0b Q1b
USUB8 t4, q_2, q_0
USUB8 a, p_0, q_0
USUB8 t9, p_1, p_0
;// t0 = (q2-q0) + (p0-q0): partial sum shared by Q0, Q1 and Q2.
;// t0 overwrites r3, so p_0 is dead from here on.
SADD8 t0, t4, a
SHADD8 t9, t0, t9
;// t10 = (q0 + p1)>>1, start of the alternative Q0 value.
;// t10 overwrites r5, so p_1 is dead from here on.
UHADD8 t10, q_0, p_1
SADD8 t9, t9, a
USUB8 a, q_1, q_0
SHADD8 t9, t9, a
SHADD8 t0, t0, a
;// Add 1 per lane and halve: rounding plus the final shift.
SHADD8 t9, t9, m01
SHADD8 a, t0, m01
SADD8 t9, q_0, t9
;// Q0b ready - t9
;// Halve the flag bytes (add zero, shift right by one) so that the aq bit
;// moves down to bit 0.
MOV t4, #0
UHADD8 apqflg, apqflg, t4
SADD8 Q1b, q_0, a
;// Q1b ready
;// GE = (shifted flags >= 1), i.e. the aq flag. These GE flags stay valid
;// through the SEL of t9 below: the instructions in between do not set GE.
USUB8 t4, apqflg, m01
SEL Q1b, Q1b, q_1
;// t11 shares r9 with q_1: q1 is complemented in place, so the UHSUB8
;// below effectively adds q1.
MVN t11, q_1
UHSUB8 t10, t10, t11
;// q_3b also lives in r9, replacing the complemented q1.
M_LDR q_3b, pQ_3
;// XOR with 0x80 per lane: t10 = (2*q1 + q0 + p1 + 2)>>2
EOR t10, t10, m01, LSL #7
;// Lanes with the aq flag set take the Q0 computed above, others take t10.
SEL t9, t9, t10
;// Compute Q2b
USUB8 t4, q_2, q_0
SADD8 t4, t0, t4
;// Low bit of (q3 ^ q0) compensates for the bit dropped by the UHSUB8
;// halving of (q3 - q0) below.
EOR t0, q_3b, q_0
AND t0, t0, m01
SHADD8 t4, t4, t0
UHSUB8 t10, q_3b, q_0
SADD8 t4, t4, m01
SHADD8 t4, t4, t10
;// GE = (filt >= 1); unfiltered lanes keep the original q0.
;// Q0b overwrites r9 (q_3b), which is no longer needed.
USUB8 t10, filt, m01
SEL Q0b, t9, q_0
SADD8 t4, q_0, t4
;// Q2b ready - t4
;// GE = aq flag; lanes without it keep q2. Q2b overwrites r2 (filt).
USUB8 t10, apqflg, m01
SEL Q2b, t4, q_2
M_END
ENDIF
END
|