1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
|
; Copyright (C) 2009 The Android Open Source Project
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
; http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.
;-------------------------------------------------------------------------------
;--
;-- Abstract : ARMv6-optimized version of the horizontal part of the
;-- h264bsdInterpolateMid functions
;--
;-------------------------------------------------------------------------------
;// Assembler setup for this file: stack-alignment attributes, code area,
;// symbolic register names and the exported entry point.
;// REQUIRE8 / PRESERVE8 are emitted only when H264DEC_WINASM is not defined.
    IF :DEF: H264DEC_WINASM
;// We don't use REQUIRE8 and PRESERVE8 for winasm
    ELSE
    REQUIRE8
    PRESERVE8
    ENDIF
    AREA |.text|, CODE
;// Register allocation
;//
;// r0-r2 carry the three arguments of h264bsdInterpolateMidHorPart.
;// r4-r11 are pushed on entry and popped on exit by that routine.
;// r12 is used without being saved; r14 (lr) is used as a scratch register,
;// which is why the routine pushes lr and returns by popping into pc.
ref RN 0 ;// pointer to current position in reference image
mb RN 1 ;// pointer to current position in interpolated mb
count RN 2 ;// bit-packed width and count values
;// Pixel registers. Each one first receives a raw 32-bit load (x_3_1, x_7_5)
;// or is filled by UXTB16, after which it holds two pixels zero-extended to
;// 16-bit lanes. The names give the pixel indices held at the top of loop_x
;// (high lane, low lane); the registers are reloaded with later pixels as
;// the loop advances.
x_2_0 RN 4
x_3_1 RN 5
x_6_4 RN 6
x_7_5 RN 7
;// Accumulators for the four filter outputs produced per group.
;// tmp3 additionally holds the masked (partWidth-1) field of count outside
;// the inner loop.
tmp1 RN 8
tmp2 RN 9
tmp3 RN 10
tmp4 RN 11
;// Packed 16-bit coefficient pairs (high lane, low lane); loaded with
;// 0x00140001 and 0x0014FFFB respectively at the start of the routine.
mult_20_01 RN 12 ;// [20, 1]
mult_20_m5 RN 14 ;// [20, -5]
    EXPORT h264bsdInterpolateMidHorPart
;// Horizontal filter approach
;//
;// Basic idea in horizontal filtering is to adjust coefficients
;// like below. Calculation is done with 16-bit maths.
;//
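;// Each register holds two pixels as 16-bit lanes, written [ high low ];
;// the numbers in brackets are pixel indices. The rows give the coefficient
;// applied to each lane for outputs y_0..y_3 ('.' = lane not used).
;//
;//   Reg       x_2_0      x_3_1      x_6_4      x_7_5      x_2_0
;//           [  2   0 ] [  3   1 ] [  6   4 ] [  7   5 ] [ 10   8 ] ...
;//   y_0 =     20   1     20  -5      .  -5      .   1
;//   y_1 =     -5   .     20   1      1  20      .  -5
;//   y_2 =      1   .     -5   .     -5  20      1  20
;//   y_3 =      .   .      1   .     20  -5     -5  20      .   1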
;//-----------------------------------------------------------------------------
;// h264bsdInterpolateMidHorPart
;//
;// Applies the 6-tap filter (1, -5, 20, 20, -5, 1) along each row of the
;// reference image and stores every raw sum (no rounding, shift or clipping
;// is done here) as a 32-bit word.
;//
;// In:
;//   r0 (ref)    byte pointer into the reference image. The first pixel read
;//               for a row is at ref-8.
;//   r1 (mb)     output pointer; results are written as consecutive 32-bit
;//               words with post-increment, rows following each other
;//               without any gap.
;//   r2 (count)  packed parameters, see the layout comment in the body.
;// Out:
;//   No return value is set up. r0, r1, r2, r12 and the flags are modified;
;//   r4-r11 are restored. lr is not restored (the saved lr is popped
;//   straight into pc).
;//
;// Notes:
;//   - Pixels are fetched with word loads (LDR) from ref-8, ref-4 and ref.
;//     NOTE(review): this presumably needs either a word-aligned ref or
;//     unaligned-access support - confirm against the caller/platform.
;//   - UXTB16 picks register bytes 0 and 2 (or 1 and 3 with ROR #8), so the
;//     pixel numbering used in the comments corresponds to little-endian
;//     loads.
;//   - Each pass through half of loop_x emits four outputs and subtracts 4
;//     from the x counter. The row epilogue (next_y) relies on the x counter
;//     nibble being 0xF after the final borrow and on ref having advanced by
;//     exactly partWidth, i.e. on partWidth being a multiple of 4.
;//-----------------------------------------------------------------------------
h264bsdInterpolateMidHorPart
    ;// Save r4-r11 and lr; lr (r14) is reused below as mult_20_m5.
    STMFD sp!, {r4-r11, lr}
    ;// pack values to count register
    ;// [31:28] loop_x (partWidth-1)
    ;// [27:24] loop_y (partHeight-1)
    ;// [23:20] partWidth-1
    ;// [19:16] partHeight-1
    ;// [15:00] width
    ;//
    ;// NOTE(review): as used in this routine, bits [19:16] are masked and
    ;// treated as partWidth-1, bits [31:28] are the running x counter, and
    ;// the row counter is decremented in units of 1<<20 (one more row is
    ;// processed than the initial value of that field). This differs from
    ;// the [23:20]/[19:16] labels above - confirm the packing against the
    ;// caller. Bits [31:28] are presumably expected to be zero on entry.
    ;//
    ;// Coefficient pairs: 0x0014 = 20, 0x0001 = 1, 0xFFFB = -5 (16-bit).
    LDR mult_20_01, = 0x00140001
    LDR mult_20_m5, = 0x0014FFFB
    AND tmp3, count, #0x000F0000 ;// partWidth-1
loop_y
    ;// Start of a row. tmp3 holds count & 0x000F0000 here (set before the
    ;// first row above, and in next_y for every later row).
    ;// Load pixels 0..3, then 4..7, of the row window.
    LDR x_3_1, [ref, #-8]
    ;// Reload the x counter: (partWidth-1) moves from [19:16] into [31:28].
    ADD count, count, tmp3, LSL #12
    LDR x_7_5, [ref, #-4]
    ;// Split the raw words into 16-bit lanes:
    ;// x_2_0 = [2 0], x_3_1 = [3 1], x_6_4 = [6 4].
    UXTB16 x_2_0, x_3_1
    UXTB16 x_3_1, x_3_1, ROR #8
    UXTB16 x_6_4, x_7_5
loop_x
    ;// x_7_5 still holds a raw word here (from loop_y or from the load in
    ;// the second half below); unpack its odd bytes: x_7_5 = [7 5].
    UXTB16 x_7_5, x_7_5, ROR #8
    ;// First half: outputs y_0..y_3 from pixels 0..8.
    ;// tmp1 = y_0, tmp2 = y_1, tmp3 = y_2, tmp4 = y_3.
    ;// tmp1 = 20*p2 + 1*p0
    SMUAD tmp1, x_2_0, mult_20_01
    ;// tmp2 = -5*p2
    SMULTB tmp2, x_2_0, mult_20_m5
    ;// tmp3 = 1*p2
    SMULTB tmp3, x_2_0, mult_20_01
    ;// tmp4 = 1*p3
    SMULTB tmp4, x_3_1, mult_20_01
    ;// tmp1 += 20*p3 - 5*p1
    SMLAD tmp1, x_3_1, mult_20_m5, tmp1
    ;// tmp2 += 20*p3 + 1*p1
    SMLAD tmp2, x_3_1, mult_20_01, tmp2
    ;// tmp3 += -5*p3
    SMLATB tmp3, x_3_1, mult_20_m5, tmp3
    ;// [3 1] is fully consumed; fetch the next four pixels (8..11) as a raw
    ;// word and advance ref by 4.
    LDR x_3_1, [ref], #4
    ;// tmp4 += 20*p6 - 5*p4
    SMLAD tmp4, x_6_4, mult_20_m5, tmp4
    ;// tmp1 += -5*p4
    SMLABB tmp1, x_6_4, mult_20_m5, tmp1
    ;// tmp2 += 1*p6 + 20*p4   (X form crosses the coefficient lanes)
    SMLADX tmp2, x_6_4, mult_20_01, tmp2
    ;// tmp3 += -5*p6 + 20*p4
    SMLADX tmp3, x_6_4, mult_20_m5, tmp3
    ;// tmp4 += -5*p7 + 20*p5
    SMLADX tmp4, x_7_5, mult_20_m5, tmp4
    ;// tmp1 += 1*p5           (y_0 complete)
    SMLABB tmp1, x_7_5, mult_20_01, tmp1
    ;// tmp2 += -5*p5          (y_1 complete)
    SMLABB tmp2, x_7_5, mult_20_m5, tmp2
    ;// [2 0] is fully consumed; x_2_0 = [10 8] from the new raw word.
    UXTB16 x_2_0, x_3_1
    ;// tmp3 += 1*p7 + 20*p5   (y_2 complete)
    SMLADX tmp3, x_7_5, mult_20_01, tmp3
    ;// tmp4 += 1*p8           (y_3 complete)
    SMLABB tmp4, x_2_0, mult_20_01, tmp4
    ;// Four outputs done: x counter -= 4. Carry is cleared on borrow; the
    ;// stores below do not change the flags, so BCC tests this result.
    SUBS count, count, #4<<28
    STR tmp1, [mb], #4
    STR tmp2, [mb], #4
    STR tmp3, [mb], #4
    STR tmp4, [mb], #4
    ;// Borrow: the row is finished.
    BCC next_y
    ;// Second half: same arithmetic with the register roles rotated.
    ;// x_6_4 / x_7_5 now play the part of the first four pixels,
    ;// x_2_0 / x_3_1 the next four, and the word loaded below supplies the
    ;// pixel for the last tap.
    ;// Unpack the odd bytes of the raw word loaded in the first half.
    UXTB16 x_3_1, x_3_1, ROR #8
    SMUAD tmp1, x_6_4, mult_20_01
    SMULTB tmp2, x_6_4, mult_20_m5
    SMULTB tmp3, x_6_4, mult_20_01
    SMULTB tmp4, x_7_5, mult_20_01
    SMLAD tmp1, x_7_5, mult_20_m5, tmp1
    SMLAD tmp2, x_7_5, mult_20_01, tmp2
    SMLATB tmp3, x_7_5, mult_20_m5, tmp3
    ;// Old x_7_5 lanes are consumed; fetch the next raw word, ref += 4.
    ;// It is unpacked into x_6_4 below and into x_7_5 at the top of loop_x.
    LDR x_7_5, [ref], #4
    SMLAD tmp4, x_2_0, mult_20_m5, tmp4
    SMLABB tmp1, x_2_0, mult_20_m5, tmp1
    SMLADX tmp2, x_2_0, mult_20_01, tmp2
    SMLADX tmp3, x_2_0, mult_20_m5, tmp3
    SMLADX tmp4, x_3_1, mult_20_m5, tmp4
    SMLABB tmp1, x_3_1, mult_20_01, tmp1
    SMLABB tmp2, x_3_1, mult_20_m5, tmp2
    UXTB16 x_6_4, x_7_5
    SMLADX tmp3, x_3_1, mult_20_01, tmp3
    SMLABB tmp4, x_6_4, mult_20_01, tmp4
    ;// Another four outputs done; same counter/flag handling as above.
    SUBS count, count, #4<<28
    STR tmp1, [mb], #4
    STR tmp2, [mb], #4
    STR tmp3, [mb], #4
    STR tmp4, [mb], #4
    ;// No borrow: more pixels remain in this row.
    BCS loop_x
next_y
    ;// Both paths arrive here with the carry flag clear (BCC taken, or BCS
    ;// not taken). AND and SMLABB below leave the carry flag unchanged, so
    ;// SBC subtracts one extra.
    AND tmp3, count, #0x000F0000 ;// partWidth-1
    ;// ref += low halfword of count (signed 16-bit) * 1
    SMLABB ref, count, mult_20_01, ref ;// +width
    SBC ref, ref, tmp3, LSR #16 ;// -(partWidth-1)-1
    ;// Add 1<<28 to the wrapped x counter and take one row off the counter
    ;// at bit 20. The result is non-negative while rows remain and goes
    ;// negative once the row counter has run out.
    ADDS count, count, #(1<<28)-(1<<20)
    BGE loop_y
    ;// Restore r4-r11 and return by popping the saved lr into pc.
    LDMFD sp!, {r4-r11, pc}
    END
|