; media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_hor_half.s

; Copyright (C) 2009 The Android Open Source Project
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
;      http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.

;-------------------------------------------------------------------------------
;--
;-- Abstract : ARMv6 optimized version of h264bsdInterpolateHorHalf function
;--
;-------------------------------------------------------------------------------


    ;// Stack alignment attributes. Builds that define H264DEC_WINASM skip
    ;// them; every other build declares that this file requires (REQUIRE8)
    ;// and preserves (PRESERVE8) 8-byte stack alignment.
    IF :DEF: H264DEC_WINASM
        ;// We don't use REQUIRE8 and PRESERVE8 for winasm
    ELSE
        REQUIRE8
        PRESERVE8
    ENDIF

    ;// Everything below is assembled into the |.text| code area.
    AREA    |.text|, CODE

;// h264bsdInterpolateHorHalf register allocation
;//
;// Several names below are aliases for the SAME physical register. The first
;// name of each pair is the role during argument checking / setup, the second
;// is the role once the filter loops are running. Writing one alias destroys
;// the other.

;// r0: source pointer (incoming argument, later the running read pointer)
ref     RN 0

;// r1: destination pointer (mb); also the buffer argument handed to
;//     h264bsdFillBlock (buff)
mb      RN 1
buff    RN 1

;// r2: x0 on entry; afterwards the packed loop counter / width word (count)
count   RN 2
x0      RN 2

;// r3: y0 on entry; in the loops a pair of unpacked source bytes
;//     (bytes 2 and 0 of a loaded word, one per halfword)
y0      RN 3
x_2_0   RN 3

;// r4: picture width; in the loops a loaded source word, then its
;//     unpacked bytes 3 and 1
width   RN 4
x_3_1   RN 4

;// r5: picture height; in the loops unpacked bytes 6 and 4
height  RN 5
x_6_4   RN 5

;// r6: partWidth; in the loops a loaded source word, then its
;//     unpacked bytes 7 and 5
partW   RN 6
x_7_5   RN 6

;// r7: partHeight; in the loops the first filter accumulator
partH   RN 7
tmp1    RN 7

;// r8-r10: scratch registers / remaining filter accumulators
tmp2    RN 8

tmp3    RN 9

tmp4    RN 10

;// r11, r12: packed 16-bit coefficient pairs, loaded with 0x00140001
;//           (top = 20, bottom = 1) and 0x0014FFFB (top = 20, bottom = -5)
mult_20_01  RN 11
mult_20_m5  RN 12

;// r14 (lr): holds the rounding constant 16 inside the loops; the return
;//           address itself is kept on the stack
plus16  RN 14


;// function exports and imports

    IMPORT  h264bsdFillBlock

    EXPORT  h264bsdInterpolateHorHalf

;// Horizontal filter approach
;//
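;// The basic idea of the horizontal filter is to arrange the source pixels
;// and the filter coefficients as 16-bit pairs, as shown below, so that the
;// dual 16-bit multiply-accumulate instructions can apply two taps at once.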
;//
;// Reg     x_2_0     x_3_1     x_6_4     x_7_5     x_2_0
;//       [  2  0 ] [  3  1 ] [  6  4 ] [  7  5 ] [ 10  8 ] ...
;// y_0 =   20  1     20 -5        -5         1
;// y_1 =   -5        20  1      1 20        -5
;// y_2 =    1        -5        -5 20      1 20
;// y_3 =              1        20 -5     -5 20         1


;//-----------------------------------------------------------------------------
;// h264bsdInterpolateHorHalf
;//
;// Writes a partWidth x partHeight block of horizontally filtered pixels.
;// Every output byte is
;//     clip0_255((p[0] - 5*p[1] + 20*p[2] + 20*p[3] - 5*p[4] + p[5] + 16) >> 5)
;// where p[] are six consecutive source bytes; output column j of a row reads
;// the source bytes j .. j+5 counted from column x0 of that row.
;//
;// Register arguments (as used below):
;//   r0 = ref     source picture base pointer
;//   r1 = mb      destination pointer; consecutive output rows are written
;//                16 bytes apart
;//   r2 = x0      column of the first source byte, compared as signed
;//   r3 = y0      row of the first source byte, compared as signed
;// Stack arguments (offsets are valid after the prologue):
;//   [sp,#0x218] = width      [sp,#0x21c] = height
;//   [sp,#0x220] = partWidth  [sp,#0x224] = partHeight
;//
;// Frame layout after the prologue (13 pushed words + 0x1e4 bytes = 0x218):
;//   sp+0x000 .. sp+0x010   outgoing stack arguments for h264bsdFillBlock
;//   sp+0x028 .. sp+0x1e3   p1, buffer handed to h264bsdFillBlock
;//   sp+0x1e4 .. sp+0x1f0   saved r0-r3 (ref, mb, x0, y0)
;//   sp+0x1f4 .. sp+0x217   saved r4-r11, lr
;//
;// NOTE(review): the loop bookkeeping presumes that partWidth is a multiple
;// of 4, that partWidth-1 and partHeight-1 each fit in 4 bits, and that width
;// fits in the low halfword of count - confirm against the callers.
;// NOTE(review): the word loads from ref are not forced to be word aligned,
;// so unaligned LDR support appears to be assumed - confirm for the target.
;//-----------------------------------------------------------------------------
h264bsdInterpolateHorHalf
    ;// Save the register arguments together with r4-r11 and lr, then reserve
    ;// the local area.
    STMFD   sp!, {r0-r11, lr}
    SUB     sp, sp, #0x1e4

    ;// The filter reads (partWidth+5) x partHeight source bytes. Read them
    ;// straight from the picture only if that area lies completely inside it.
    CMP     x0, #0
    BLT     do_fill                 ;// (x0 < 0)
    LDR     partW, [sp,#0x220]      ;// partWidth
    ADD     tmp4, x0, partW         ;// (x0+partWidth)
    ADD     tmp4, tmp4, #5          ;// (x0+partWidth+5)
    LDR     width, [sp,#0x218]      ;// width
    CMP     tmp4, width
    BHI     do_fill                 ;// (x0+partWidth+5) > width, unsigned

    CMP     y0, #0
    BLT     do_fill                 ;// (y0 < 0)
    LDR     partH, [sp,#0x224]      ;// partHeight
    ADD     tmp2, y0, partH         ;// (y0+partHeight)
    LDR     height, [sp,#0x21c]     ;// height
    CMP     tmp2, height
    BLS     skip_fill               ;// no overfill needed


;// Some of the source area is outside the picture. Call h264bsdFillBlock with
;// p1 as its buffer argument; afterwards p1 is used as the source picture,
;// with width partWidth+5 and origin (0,0).
;// r0 (ref), r2 (x0) and r3 (y0) still hold the incoming arguments here.
do_fill
    LDR     partH, [sp,#0x224]      ;// partHeight
    LDR     height, [sp,#0x21c]     ;// height
    LDR     partW, [sp,#0x220]      ;// partWidth
    ADD     tmp4, partW, #5         ;// tmp4 = partW + 5;
    STMIB   sp, {height, tmp4}      ;// sp+4 = height, sp+8 = partWidth+5
    STR     partH, [sp,#0xc]        ;// sp+c = partHeight
    STR     tmp4, [sp,#0x10]        ;// sp+10 = partWidth+5
    LDR     width, [sp,#0x218]      ;// width
    STR     width, [sp,#0]          ;// sp+0 = width
    ADD     buff, sp, #0x28         ;// buff = p1 (0x1bc bytes = 21*21/4+1 words)
    BL      h264bsdFillBlock

    ;// Redirect the saved arguments to the filled buffer. partW (r6),
    ;// partH (r7) and tmp4 (r10) are relied on to survive the call.
    MOV     x0, #0
    STR     x0,[sp,#0x1ec]          ;// x0 = 0
    STR     x0,[sp,#0x1f0]          ;// y0 = 0
    ADD     ref,sp,#0x28            ;// ref = p1
    STR     tmp4, [sp,#0x218]       ;// width = partWidth+5


;// Common path. partW and partH hold partWidth / partHeight on both routes.
skip_fill
    LDR     x0 ,[sp,#0x1ec]         ;// x0 (0 after a fill)
    LDR     y0 ,[sp,#0x1f0]         ;// y0 (0 after a fill)
    LDR     width, [sp,#0x218]      ;// width (partWidth+5 after a fill)
    MLA     tmp2, width, y0, x0     ;// y0*width+x0
    ADD     ref, ref, tmp2          ;// ref += y0*width+x0
    ADD     ref, ref, #8            ;// ref = ref+8; the first two words are
                                    ;// read back with negative offsets below
    LDR     mb, [sp, #0x1e8]        ;// mb (reload: r1 was reused as buff)

    ;// pack values to count register
    ;// [31:28] loop_x (partWidth-1)
    ;// [27:24] loop_y (partHeight-1)
    ;// [23:20] partWidth-1
    ;// [19:16] partHeight-1
    ;// [15:00] width
    ;// Note: count shares r2 with x0, partW/partH share r6/r7 with
    ;// x_7_5/tmp1, so the argument values are gone after this point.
    MOV     count, width
    SUB     partW, partW, #1;
    SUB     partH, partH, #1;
    ADD     tmp2, partH, partW, LSL #4
    ADD     count, count, tmp2, LSL #16


    ;// Coefficient pairs (top:bottom halfword) and rounding constant.
    LDR     mult_20_01, = 0x00140001    ;// 20 : 1
    LDR     mult_20_m5, = 0x0014FFFB    ;// 20 : -5
    MOV     plus16, #16
    AND     tmp1, count, #0x000F0000    ;// partHeight-1
    AND     tmp3, count, #0x00F00000    ;// partWidth-1
    ADD     count, count, tmp1, LSL #8  ;// loop_y = partHeight-1

;// Start of one output row: (re)load loop_x and prime the pixel pipeline
;// with the first eight source bytes of the row.
loop_y
    LDR     x_3_1, [ref, #-8]           ;// source bytes 0..3
    ADD     count, count, tmp3, LSL #8  ;// loop_x = partWidth-1
    LDR     x_7_5, [ref, #-4]           ;// source bytes 4..7
    UXTB16  x_2_0, x_3_1                ;// bytes 2 and 0
    UXTB16  x_3_1, x_3_1, ROR #8        ;// bytes 3 and 1
    UXTB16  x_6_4, x_7_5                ;// bytes 6 and 4

;// Each pass produces four output bytes, then (unless the row is finished)
;// four more with the roles of the register pairs swapped, so no register
;// moves are needed between groups.
loop_x
    UXTB16  x_7_5, x_7_5, ROR #8        ;// bytes 7 and 5

    ;// Accumulators: tmp1 = y_0, tmp2 = y_1, tmp3 = y_2, tmp4 = y_3,
    ;// each one seeded with the rounding constant.
    SMLAD   tmp1, x_2_0, mult_20_01, plus16
    SMLATB  tmp3, x_2_0, mult_20_01, plus16
    SMLATB  tmp2, x_2_0, mult_20_m5, plus16
    SMLATB  tmp4, x_3_1, mult_20_01, plus16

    ;// x_3_1 is overwritten with the next source word (bytes 8..11) as soon
    ;// as its last use as a pixel pair has been issued.
    SMLAD   tmp1, x_3_1, mult_20_m5, tmp1
    SMLATB  tmp3, x_3_1, mult_20_m5, tmp3
    SMLAD   tmp2, x_3_1, mult_20_01, tmp2
    LDR     x_3_1, [ref], #4
    SMLAD   tmp4, x_6_4, mult_20_m5, tmp4

    SMLABB  tmp1, x_6_4, mult_20_m5, tmp1
    SMLADX  tmp3, x_6_4, mult_20_m5, tmp3
    SMLADX  tmp2, x_6_4, mult_20_01, tmp2
    SMLADX  tmp4, x_7_5, mult_20_m5, tmp4

    ;// x_2_0 becomes bytes 10 and 8; y_3 needs byte 8 as its last tap.
    SMLABB  tmp1, x_7_5, mult_20_01, tmp1
    UXTB16  x_2_0, x_3_1
    SMLABB  tmp2, x_7_5, mult_20_m5, tmp2
    SMLADX  tmp3, x_7_5, mult_20_01, tmp3
    SMLABB  tmp4, x_2_0, mult_20_01, tmp4

    ;// Scale by >>5, pair the results as (y_3:y_1) and (y_2:y_0), and
    ;// saturate every halfword to 0..255.
    MOV     tmp2, tmp2, ASR #5
    MOV     tmp1, tmp1, ASR #5
    PKHBT   tmp2, tmp2, tmp4, LSL #(16-5)
    PKHBT   tmp1, tmp1, tmp3, LSL #(16-5)
    USAT16  tmp2, #8, tmp2
    USAT16  tmp1, #8, tmp1

    ;// loop_x -= 4; a borrow (carry clear) means this was the last group of
    ;// the row. Interleave to bytes y_0,y_1,y_2,y_3 and store them.
    SUBS    count, count, #4<<28
    ORR     tmp1, tmp1, tmp2, LSL #8
    STR     tmp1, [mb], #4
    BCC     next_y

    ;// Second group of four: identical arithmetic with x_6_4/x_7_5 as the
    ;// oldest pairs and x_2_0/x_3_1 holding the most recently loaded word.
    UXTB16  x_3_1, x_3_1, ROR #8

    SMLAD   tmp1, x_6_4, mult_20_01, plus16
    SMLATB  tmp3, x_6_4, mult_20_01, plus16
    SMLATB  tmp2, x_6_4, mult_20_m5, plus16
    SMLATB  tmp4, x_7_5, mult_20_01, plus16

    SMLAD   tmp1, x_7_5, mult_20_m5, tmp1
    SMLATB  tmp3, x_7_5, mult_20_m5, tmp3
    SMLAD   tmp2, x_7_5, mult_20_01, tmp2
    LDR     x_7_5, [ref], #4
    SMLAD   tmp4, x_2_0, mult_20_m5, tmp4

    SMLABB  tmp1, x_2_0, mult_20_m5, tmp1
    SMLADX  tmp3, x_2_0, mult_20_m5, tmp3
    SMLADX  tmp2, x_2_0, mult_20_01, tmp2
    SMLADX  tmp4, x_3_1, mult_20_m5, tmp4

    SMLABB  tmp1, x_3_1, mult_20_01, tmp1
    UXTB16  x_6_4, x_7_5
    SMLABB  tmp2, x_3_1, mult_20_m5, tmp2
    SMLADX  tmp3, x_3_1, mult_20_01, tmp3
    SMLABB  tmp4, x_6_4, mult_20_01, tmp4

    MOV     tmp2, tmp2, ASR #5
    MOV     tmp1, tmp1, ASR #5
    PKHBT   tmp2, tmp2, tmp4, LSL #(16-5)
    PKHBT   tmp1, tmp1, tmp3, LSL #(16-5)
    USAT16  tmp2, #8, tmp2
    USAT16  tmp1, #8, tmp1

    ;// loop_x -= 4 again; carry set (no borrow) means more groups remain.
    SUBS    count, count, #4<<28
    ORR     tmp1, tmp1, tmp2, LSL #8
    STR     tmp1, [mb], #4
    BCS     loop_x

;// End of a row. ref and mb have each advanced by partWidth inside loop_x;
;// rewind that and step ref by width and mb by 16.
next_y
    AND     tmp3, count, #0x00F00000    ;// partWidth-1 (also reused by loop_y)
    SMLABB  ref, count, mult_20_01, ref ;// +width (count[15:0] * 1)
    ADDS    mb, mb, #16                 ;// +16, Carry=0
    SBC     mb, mb, tmp3, LSR #20       ;// -(partWidth-1)-1, uses Carry=0
    SBC     ref, ref, tmp3, LSR #20     ;// -(partWidth-1)-1, Carry unchanged
    ;// loop_x is 0xF after the borrow above (given partWidth is a multiple
    ;// of 4): +(1<<28) wraps it to 0 and -(1<<24) decrements loop_y. The
    ;// result only goes negative when loop_y was already 0.
    ADDS    count, count, #(1<<28)-(1<<24)
    BGE     loop_y

    ;// Drop the locals plus the four saved argument words (0x1e4 + 0x10),
    ;// then restore r4-r11 and return by popping the saved lr into pc.
    ADD     sp,sp,#0x1f4
    LDMFD   sp!, {r4-r11, pc}

    END