1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
|
; Copyright (C) 2009 The Android Open Source Project
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
; http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.
;-------------------------------------------------------------------------------
;--
;-- Abstract : ARMv6 optimized version of h264bsdInterpolateHorHalf function
;--
;-------------------------------------------------------------------------------
;// Assembler set-up. Unless H264DEC_WINASM is defined, the stack-alignment
;// directives REQUIRE8 and PRESERVE8 are emitted for this file.
IF :DEF: H264DEC_WINASM
;// We don't use REQUIRE8 and PRESERVE8 for winasm
ELSE
REQUIRE8
PRESERVE8
ENDIF
;// Everything that follows is assembled into the |.text| code area.
AREA |.text|, CODE
;// h264bsdInterpolateHorHalf register allocation
;//
;// Several names alias the same register. The aliases are live at different
;// times: the argument/setup names (x0, y0, width, height, partW, partH, buff)
;// are used before the filter loops, the x_n_m / tmp1 / count names inside them.
;// An x_n_m register holds pixels n (high halfword) and m (low halfword) of the
;// current filter window, each zero-extended to 16 bits.
ref RN 0 ;// r0: pointer to the source pixels
mb RN 1 ;// r1: output pointer
buff RN 1 ;// r1: buffer argument for h264bsdFillBlock
count RN 2 ;// r2: packed loop counters and width
x0 RN 2 ;// r2: x coordinate argument
y0 RN 3 ;// r3: y coordinate argument
x_2_0 RN 3 ;// r3: window pixels 2 and 0
width RN 4 ;// r4: source width
x_3_1 RN 4 ;// r4: window pixels 3 and 1
height RN 5 ;// r5: source height
x_6_4 RN 5 ;// r5: window pixels 6 and 4
partW RN 6 ;// r6: partWidth
x_7_5 RN 6 ;// r6: window pixels 7 and 5
partH RN 7 ;// r7: partHeight
tmp1 RN 7 ;// r7: scratch; accumulator for output y_0 in the loops
tmp2 RN 8 ;// r8: scratch; accumulator for output y_1 in the loops
tmp3 RN 9 ;// r9: scratch; accumulator for output y_2 in the loops
tmp4 RN 10 ;// r10: scratch; accumulator for output y_3 in the loops
mult_20_01 RN 11 ;// r11: coefficient pair, loaded with 0x00140001 (20 : 1)
mult_20_m5 RN 12 ;// r12: coefficient pair, loaded with 0x0014FFFB (20 : -5)
plus16 RN 14 ;// r14 (lr, saved on entry): constant 16 added before >>5
;// function exports and imports
IMPORT h264bsdFillBlock
EXPORT h264bsdInterpolateHorHalf
;// Horizontal filter approach
;//
;// Basic idea in horizontal filtering is to adjust coefficients
;// like below. Calculation is done with 16-bit maths.
;//
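;//   Reg    x_2_0     x_3_1     x_6_4     x_7_5     x_2_0
;//        [  2  0 ] [  3  1 ] [  6  4 ] [  7  5 ] [ 10  8 ] ...
;// y_0 =    20  1     20 -5        -5         1
;// y_1 =    -5        20  1      1 20        -5
;// y_2 =     1        -5        -5 20      1 20
;// y_3 =               1        20 -5     -5 20         1
;//
;// Each register holds two pixels as 16-bit halves, [ high low ]; the value
;// under a pixel index is the coefficient applied to that pixel, so e.g.
;// y_0 = p0 - 5*p1 + 20*p2 + 20*p3 - 5*p4 + p5.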
;//-----------------------------------------------------------------------------
;// h264bsdInterpolateHorHalf
;//
;// Horizontal half-sample interpolation. Every output pixel is the 6-tap
;// filter (1, -5, 20, 20, -5, 1) over six consecutive source pixels of one
;// row, plus 16, arithmetically shifted right by 5 and saturated to 0..255.
;// Four output pixels are produced per store.
;//
;// Arguments as read by this code (stack offsets relative to sp on entry):
;//   r0         ref        - source pixel pointer
;//   r1         mb         - output pointer; output rows are 16 bytes apart
;//   r2         x0         - x coordinate of the block in the source (signed)
;//   r3         y0         - y coordinate of the block in the source (signed)
;//   [sp,#0]    width      - source width, also used as the row stride in bytes
;//   [sp,#4]    height     - source height
;//   [sp,#8]    partWidth  - output width
;//   [sp,#12]   partHeight - output height
;//
;// No return value is set up; r0 is left holding the advanced source pointer.
;// r4-r11 are restored on exit.
;//
;// If the block (partWidth+5 columns by partHeight rows at x0,y0) does not lie
;// completely inside the source, h264bsdFillBlock is called to build it in a
;// buffer on the stack and the filter reads from that buffer instead.
;//
;// Stack frame after the prologue (13 pushed registers = 0x34 bytes plus
;// 0x1e4 bytes of locals, 0x218 bytes in total):
;//   sp+0x000..0x013  five outgoing stack arguments for h264bsdFillBlock
;//   sp+0x028..0x1e3  p1, the fill buffer (0x1bc bytes)
;//   sp+0x1e4..0x1f3  saved r0-r3 (ref, mb, x0, y0)
;//   sp+0x1f4..0x217  saved r4-r11, lr
;//   sp+0x218..0x227  caller's width, height, partWidth, partHeight
;//
;// NOTE(review): the packed counter gives partWidth-1 and partHeight-1 four
;// bits each and width sixteen bits, and loop_x emits 4 pixels per step, so
;// partWidth is presumably 4, 8 or 16 and partHeight at most 16 - confirm
;// against the callers.
;// NOTE(review): each row is read as 8+partWidth bytes with word loads from
;// ref, i.e. 3 bytes more than the partWidth+5 pixels the filter uses, and
;// ref can have any byte alignment - confirm that unaligned word loads are
;// acceptable on the target.
;//-----------------------------------------------------------------------------
h264bsdInterpolateHorHalf
;// Save the register arguments and callee-saved registers, reserve locals.
STMFD sp!, {r0-r11, lr}
SUB sp, sp, #0x1e4
;// Decide whether the needed source area lies inside the picture.
CMP x0, #0
BLT do_fill ;// (x0 < 0)
LDR partW, [sp,#0x220] ;// partWidth
ADD tmp4, x0, partW ;// (x0+partWidth)
ADD tmp4, tmp4, #5 ;// (x0+partWidth+5)
LDR width, [sp,#0x218] ;// width
CMP tmp4, width
BHI do_fill ;// (x0+partWidth+5) > width, unsigned compare
CMP y0, #0
BLT do_fill ;// (y0 < 0)
LDR partH, [sp,#0x224] ;// partHeight
ADD tmp2, y0, partH ;// (y0+partHeight)
LDR height, [sp,#0x21c] ;// height
CMP tmp2, height
BLS skip_fill ;// (y0+partHeight) <= height: no overfill needed
;// Build the block in the stack buffer p1. r0 (ref), r2 (x0) and r3 (y0)
;// still hold the incoming arguments here; r1 and the five stack words are
;// set up below.
do_fill
LDR partH, [sp,#0x224] ;// partHeight
LDR height, [sp,#0x21c] ;// height
LDR partW, [sp,#0x220] ;// partWidth
ADD tmp4, partW, #5 ;// tmp4 = partW + 5;
STMIB sp, {height, tmp4} ;// sp+4 = height, sp+8 = partWidth+5
STR partH, [sp,#0xc] ;// sp+c = partHeight
STR tmp4, [sp,#0x10] ;// sp+10 = partWidth+5
LDR width, [sp,#0x218] ;// width
STR width, [sp,#0] ;// sp+0 = width
ADD buff, sp, #0x28 ;// buff = p1[21*21/4+1]
BL h264bsdFillBlock
;// From here on the source is the filled buffer: origin (0,0) and a row
;// stride of partWidth+5. partW, partH and tmp4 are read after the call
;// without being reloaded, which assumes h264bsdFillBlock preserves r4-r11
;// (callee-saved under the ARM procedure call standard).
MOV x0, #0
STR x0,[sp,#0x1ec] ;// x0 = 0
STR x0,[sp,#0x1f0] ;// y0 = 0
ADD ref,sp,#0x28 ;// ref = p1
STR tmp4, [sp,#0x218] ;// width = partWidth+5
;// Common path: point ref 8 bytes past the first pixel of the block so the
;// first two words of a row are read at ref-8 and ref-4.
skip_fill
LDR x0 ,[sp,#0x1ec] ;// x0
LDR y0 ,[sp,#0x1f0] ;// y0
LDR width, [sp,#0x218] ;// width
MLA tmp2, width, y0, x0 ;// y0*width+x0
ADD ref, ref, tmp2 ;// ref += y0*width+x0
ADD ref, ref, #8 ;// ref = ref+8
LDR mb, [sp, #0x1e8] ;// mb (saved r1; r1 was reused as buff above)
;// pack values to count register
;// [31:28] loop_x (partWidth-1)
;// [27:24] loop_y (partHeight-1)
;// [23:20] partWidth-1
;// [19:16] partHeight-1
;// [15:00] width
MOV count, width
;// partW and partH become partWidth-1 and partHeight-1
SUB partW, partW, #1;
SUB partH, partH, #1;
ADD tmp2, partH, partW, LSL #4 ;// (partWidth-1)<<4 | (partHeight-1)
ADD count, count, tmp2, LSL #16 ;// placed in count[23:16]
LDR mult_20_01, = 0x00140001 ;// high half 20, low half 1
LDR mult_20_m5, = 0x0014FFFB ;// high half 20, low half -5
MOV plus16, #16 ;// rounding term, initial accumulator value
AND tmp1, count, #0x000F0000 ;// partHeight-1
AND tmp3, count, #0x00F00000 ;// partWidth-1
ADD count, count, tmp1, LSL #8 ;// loop_y = partHeight-1
;// ---- one output row per pass ----
loop_y
LDR x_3_1, [ref, #-8] ;// raw pixels p0..p3 of the row
ADD count, count, tmp3, LSL #8 ;// loop_x = partWidth-1
LDR x_7_5, [ref, #-4] ;// raw pixels p4..p7
UXTB16 x_2_0, x_3_1 ;// x_2_0 = [p2 p0]
UXTB16 x_3_1, x_3_1, ROR #8 ;// x_3_1 = [p3 p1]
UXTB16 x_6_4, x_7_5 ;// x_6_4 = [p6 p4]
;// ---- eight output pixels per full pass, in two halves of four ----
;// Pixel indices in the comments are relative to the current 8-pixel group.
;// First half: outputs y_0..y_3 in tmp1, tmp2, tmp3, tmp4.
loop_x
UXTB16 x_7_5, x_7_5, ROR #8 ;// x_7_5 = [p7 p5]
SMLAD tmp1, x_2_0, mult_20_01, plus16 ;// y_0 = 16 + p0 + 20*p2
SMLATB tmp3, x_2_0, mult_20_01, plus16 ;// y_2 = 16 + p2
SMLATB tmp2, x_2_0, mult_20_m5, plus16 ;// y_1 = 16 - 5*p2
SMLATB tmp4, x_3_1, mult_20_01, plus16 ;// y_3 = 16 + p3
SMLAD tmp1, x_3_1, mult_20_m5, tmp1 ;// y_0 += -5*p1 + 20*p3
SMLATB tmp3, x_3_1, mult_20_m5, tmp3 ;// y_2 += -5*p3
SMLAD tmp2, x_3_1, mult_20_01, tmp2 ;// y_1 += p1 + 20*p3
LDR x_3_1, [ref], #4 ;// raw pixels p8..p11, ref += 4
SMLAD tmp4, x_6_4, mult_20_m5, tmp4 ;// y_3 += -5*p4 + 20*p6
SMLABB tmp1, x_6_4, mult_20_m5, tmp1 ;// y_0 += -5*p4
SMLADX tmp3, x_6_4, mult_20_m5, tmp3 ;// y_2 += 20*p4 - 5*p6
SMLADX tmp2, x_6_4, mult_20_01, tmp2 ;// y_1 += 20*p4 + p6
SMLADX tmp4, x_7_5, mult_20_m5, tmp4 ;// y_3 += 20*p5 - 5*p7
SMLABB tmp1, x_7_5, mult_20_01, tmp1 ;// y_0 += p5
UXTB16 x_2_0, x_3_1 ;// x_2_0 = [p10 p8]
SMLABB tmp2, x_7_5, mult_20_m5, tmp2 ;// y_1 += -5*p5
SMLADX tmp3, x_7_5, mult_20_01, tmp3 ;// y_2 += 20*p5 + p7
SMLABB tmp4, x_2_0, mult_20_01, tmp4 ;// y_3 += p8
;// Scale, pair up, clamp and store the four results.
MOV tmp2, tmp2, ASR #5 ;// y_1 >> 5
MOV tmp1, tmp1, ASR #5 ;// y_0 >> 5
PKHBT tmp2, tmp2, tmp4, LSL #(16-5) ;// tmp2 = [y_3>>5 : y_1>>5]
PKHBT tmp1, tmp1, tmp3, LSL #(16-5) ;// tmp1 = [y_2>>5 : y_0>>5]
USAT16 tmp2, #8, tmp2 ;// clamp both halves to 0..255
USAT16 tmp1, #8, tmp1 ;// clamp both halves to 0..255
SUBS count, count, #4<<28 ;// loop_x -= 4, carry clear on borrow
ORR tmp1, tmp1, tmp2, LSL #8 ;// bytes y_0,y_1,y_2,y_3, lowest first
STR tmp1, [mb], #4 ;// store 4 pixels, mb += 4
BCC next_y ;// borrow: the row is complete
;// Second half: same filter with the register roles rotated by two;
;// outputs y_4..y_7 in tmp1, tmp2, tmp3, tmp4.
UXTB16 x_3_1, x_3_1, ROR #8 ;// x_3_1 = [p11 p9]
SMLAD tmp1, x_6_4, mult_20_01, plus16 ;// y_4 = 16 + p4 + 20*p6
SMLATB tmp3, x_6_4, mult_20_01, plus16 ;// y_6 = 16 + p6
SMLATB tmp2, x_6_4, mult_20_m5, plus16 ;// y_5 = 16 - 5*p6
SMLATB tmp4, x_7_5, mult_20_01, plus16 ;// y_7 = 16 + p7
SMLAD tmp1, x_7_5, mult_20_m5, tmp1 ;// y_4 += -5*p5 + 20*p7
SMLATB tmp3, x_7_5, mult_20_m5, tmp3 ;// y_6 += -5*p7
SMLAD tmp2, x_7_5, mult_20_01, tmp2 ;// y_5 += p5 + 20*p7
LDR x_7_5, [ref], #4 ;// raw pixels p12..p15, ref += 4
SMLAD tmp4, x_2_0, mult_20_m5, tmp4 ;// y_7 += -5*p8 + 20*p10
SMLABB tmp1, x_2_0, mult_20_m5, tmp1 ;// y_4 += -5*p8
SMLADX tmp3, x_2_0, mult_20_m5, tmp3 ;// y_6 += 20*p8 - 5*p10
SMLADX tmp2, x_2_0, mult_20_01, tmp2 ;// y_5 += 20*p8 + p10
SMLADX tmp4, x_3_1, mult_20_m5, tmp4 ;// y_7 += 20*p9 - 5*p11
SMLABB tmp1, x_3_1, mult_20_01, tmp1 ;// y_4 += p9
UXTB16 x_6_4, x_7_5 ;// x_6_4 = [p14 p12]
SMLABB tmp2, x_3_1, mult_20_m5, tmp2 ;// y_5 += -5*p9
SMLADX tmp3, x_3_1, mult_20_01, tmp3 ;// y_6 += 20*p9 + p11
SMLABB tmp4, x_6_4, mult_20_01, tmp4 ;// y_7 += p12
MOV tmp2, tmp2, ASR #5 ;// y_5 >> 5
MOV tmp1, tmp1, ASR #5 ;// y_4 >> 5
PKHBT tmp2, tmp2, tmp4, LSL #(16-5) ;// tmp2 = [y_7>>5 : y_5>>5]
PKHBT tmp1, tmp1, tmp3, LSL #(16-5) ;// tmp1 = [y_6>>5 : y_4>>5]
USAT16 tmp2, #8, tmp2 ;// clamp both halves to 0..255
USAT16 tmp1, #8, tmp1 ;// clamp both halves to 0..255
SUBS count, count, #4<<28 ;// loop_x -= 4, carry clear on borrow
ORR tmp1, tmp1, tmp2, LSL #8 ;// bytes y_4,y_5,y_6,y_7, lowest first
STR tmp1, [mb], #4 ;// store 4 pixels, mb += 4
BCS loop_x ;// no borrow: more pixels in this row
;// ---- advance to the next row ----
;// loop_x has just borrowed, so count[31:28] is 0xF here. mb and ref have
;// each moved partWidth bytes along the row. The SBCs below rely on the
;// carry left clear by ADDS; SMLABB, AND and SBC do not change it.
next_y
AND tmp3, count, #0x00F00000 ;// partWidth-1
SMLABB ref, count, mult_20_01, ref ;// +width
ADDS mb, mb, #16 ;// +16, Carry=0
SBC mb, mb, tmp3, LSR #20 ;// -(partWidth-1)-1
SBC ref, ref, tmp3, LSR #20 ;// -(partWidth-1)-1
;// Wrap count[31:28] from 0xF to 0 and decrement loop_y; the result turns
;// negative only when loop_y was already 0.
ADDS count, count, #(1<<28)-(1<<24)
BGE loop_y
;// Drop the locals and the saved r0-r3, then restore r4-r11 and return.
ADD sp,sp,#0x1f4
LDMFD sp!, {r4-r11, pc}
END
|