; Source path: media/libstagefright/codecs/on2/h264dec/source/arm11_asm/h264bsd_interpolate_mid_hor.s
; Source blob: a81aed77b04a008022dbeffeb48d72de09b65203
; Copyright (C) 2009 The Android Open Source Project
;
; Licensed under the Apache License, Version 2.0 (the "License");
; you may not use this file except in compliance with the License.
; You may obtain a copy of the License at
;
;      http://www.apache.org/licenses/LICENSE-2.0
;
; Unless required by applicable law or agreed to in writing, software
; distributed under the License is distributed on an "AS IS" BASIS,
; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
; See the License for the specific language governing permissions and
; limitations under the License.

;-------------------------------------------------------------------------------
;--
;-- Abstract : ARMv6-optimized version of the horizontal part of the
;--            h264bsdInterpolateMid functions
;--
;-------------------------------------------------------------------------------


    ;// Stack-alignment attributes and code section selection.
    ;// REQUIRE8 / PRESERVE8 are armasm directives that mark this file as
    ;// requiring / preserving 8-byte stack alignment; they are omitted when
    ;// H264DEC_WINASM is defined.
    IF :DEF: H264DEC_WINASM
        ;// We don't use REQUIRE8 and PRESERVE8 for winasm
    ELSE
        REQUIRE8
        PRESERVE8
    ENDIF

    ;// Everything below is placed in the .text code area.
    AREA    |.text|, CODE


;// Register allocation
;//
;// r0-r2 are the three values the routine receives from its caller.
;// r4-r11 and r14 (lr) are pushed on entry and popped on exit; r12 is used
;// without being saved.

ref     RN 0    ;// pointer to current position in reference image
mb      RN 1    ;// pointer to current position in interpolated mb
count   RN 2    ;// bit-packed width and count values

;// Source pixels unpacked by UXTB16 into two zero-extended 16-bit lanes.
;// The name x_A_B means: upper halfword = pixel A, lower halfword = pixel B
;// (pixel indices relative to the current group of four outputs).
x_2_0   RN 4    ;// bytes 2 and 0 of a loaded word
x_3_1   RN 5    ;// bytes 3 and 1 of a loaded word (also holds the raw word
                ;// between the LDR and the following UXTB16 ..., ROR #8)
x_6_4   RN 6    ;// bytes 2 and 0 of the following word
x_7_5   RN 7    ;// bytes 3 and 1 of the following word (also holds the raw
                ;// word until it is unpacked)

;// Accumulators for the four outputs produced per half-iteration;
;// tmp3 is also used outside the inner loop to hold the masked
;// bits [19:16] of count.
tmp1    RN 8
tmp2    RN 9
tmp3    RN 10
tmp4    RN 11

;// Packed filter coefficients: upper halfword / lower halfword.
mult_20_01  RN 12   ;// [20,  1]   loaded with 0x00140001
mult_20_m5  RN 14   ;// [20, -5]   loaded with 0x0014FFFB (r14 is lr)


        EXPORT  h264bsdInterpolateMidHorPart

;// Horizontal filter approach
;//
;// The six filter taps (1, -5, 20, 20, -5, 1) are held as packed 16-bit
;// pairs ([20, 1] and [20, -5]) so that the dual 16-bit multiply-accumulate
;// instructions can apply two taps at once. The table below shows, for each
;// of the four outputs of one group, which coefficient multiplies each
;// halfword of each pixel register. All multiplications use 16-bit operands;
;// the sums are accumulated in 32-bit registers.
;//
;// Reg     x_2_0     x_3_1     x_6_4     x_7_5     x_2_0
;//       [  2  0 ] [  3  1 ] [  6  4 ] [  7  5 ] [ 10  8 ] ...
;// y_0 =   20  1     20 -5        -5         1
;// y_1 =   -5        20  1      1 20        -5
;// y_2 =    1        -5        -5 20      1 20
;// y_3 =              1        20 -5     -5 20         1


;//-----------------------------------------------------------------------------
;// h264bsdInterpolateMidHorPart
;//
;// Runs the 6-tap filter y[n] = x[n] - 5*x[n+1] + 20*x[n+2] + 20*x[n+3]
;// - 5*x[n+4] + x[n+5] along rows of bytes and stores each sum, without
;// rounding, shifting or clipping, as one 32-bit word.
;//
;// In:
;//   r0 (ref)   byte pointer into the source. For each row the first word is
;//              read from ref-8, so output 0 of a row is computed from the
;//              six bytes ref[-8] .. ref[-3] (byte 0 of a loaded word is
;//              treated as the lowest-numbered pixel, i.e. little-endian
;//              loads are assumed).
;//   r1 (mb)    pointer to the output; written sequentially, one 32-bit
;//              word per output, with no gap between rows.
;//   r2 (count) packed counters and row stride, see the comment at the top
;//              of the function body.
;// Out:
;//   nothing is returned in a register by this code; results are in memory.
;// Registers:
;//   r4-r11 and lr are saved and restored. r0-r2 and r12 are modified.
;//   Condition flags are modified.
;// NOTE(review): nine registers are pushed (36 bytes), so sp is not 8-byte
;//   aligned between the STMFD and the LDMFD. The body makes no calls.
;//-----------------------------------------------------------------------------
h264bsdInterpolateMidHorPart
    STMFD   sp!, {r4-r11, lr}

    ;// pack values to count register
    ;// [31:28] loop_x (partWidth-1)
    ;// [27:24] loop_y (partHeight-1)
    ;// [23:20] partWidth-1
    ;// [19:16] partHeight-1
    ;// [15:00] width
    ;//
    ;// NOTE(review): the list above does not match how the code below uses
    ;// count - confirm against the caller. What the code actually does:
    ;//   [31:28] column down-counter; at the start of every row bits [19:16]
    ;//           are ADDed in (so these bits are expected to be 0 on entry)
    ;//           and 4 is subtracted per group of four outputs
    ;//   [27:20] row down-counter; 1 is subtracted at bit 20 after each row
    ;//           and the loop ends when the whole register goes negative,
    ;//           so (field value + 1) rows are processed
    ;//   [19:16] column count minus one (never modified)
    ;//   [15:00] row stride in bytes, used as a signed 16-bit value
    ;// The row test relies on bits [31:28] being 0xF when a row ends, which
    ;// holds when the [19:16] field is 3, 7, 11 or 15.


    LDR     mult_20_01, = 0x00140001
    LDR     mult_20_m5, = 0x0014FFFB
    AND     tmp3, count, #0x000F0000    ;// partWidth-1
loop_y
    ;// Row start: reload the column counter and unpack the first eight
    ;// source bytes (ref-8 .. ref-1) into the pixel registers.
    LDR     x_3_1, [ref, #-8]
    ADD     count, count, tmp3, LSL #12
    LDR     x_7_5, [ref, #-4]
    UXTB16  x_2_0, x_3_1
    UXTB16  x_3_1, x_3_1, ROR #8
    UXTB16  x_6_4, x_7_5

loop_x
    ;// x_7_5 still holds a raw word here (from the row start or from the
    ;// LDR in the second half below); extract its odd bytes.
    UXTB16  x_7_5, x_7_5, ROR #8

    ;// First half: outputs y_0..y_3 from pixels x0..x8.
    ;// tmp1 = y_0, tmp2 = y_1, tmp3 = y_2, tmp4 = y_3.
    SMUAD   tmp1, x_2_0, mult_20_01         ;// y_0  =  1*x0 + 20*x2
    SMULTB  tmp2, x_2_0, mult_20_m5         ;// y_1  = -5*x2
    SMULTB  tmp3, x_2_0, mult_20_01         ;// y_2  =  1*x2
    SMULTB  tmp4, x_3_1, mult_20_01         ;// y_3  =  1*x3

    SMLAD   tmp1, x_3_1, mult_20_m5, tmp1   ;// y_0 += -5*x1 + 20*x3
    SMLAD   tmp2, x_3_1, mult_20_01, tmp2   ;// y_1 +=  1*x1 + 20*x3
    SMLATB  tmp3, x_3_1, mult_20_m5, tmp3   ;// y_2 += -5*x3
    LDR     x_3_1, [ref], #4                ;// raw bytes x8..x11; ref += 4
    SMLAD   tmp4, x_6_4, mult_20_m5, tmp4   ;// y_3 += -5*x4 + 20*x6

    SMLABB  tmp1, x_6_4, mult_20_m5, tmp1   ;// y_0 += -5*x4
    SMLADX  tmp2, x_6_4, mult_20_01, tmp2   ;// y_1 += 20*x4 +  1*x6
    SMLADX  tmp3, x_6_4, mult_20_m5, tmp3   ;// y_2 += 20*x4 + -5*x6
    SMLADX  tmp4, x_7_5, mult_20_m5, tmp4   ;// y_3 += 20*x5 + -5*x7

    SMLABB  tmp1, x_7_5, mult_20_01, tmp1   ;// y_0 +=  1*x5
    SMLABB  tmp2, x_7_5, mult_20_m5, tmp2   ;// y_1 += -5*x5
    UXTB16  x_2_0, x_3_1                    ;// x_2_0 = [x10, x8]
    SMLADX  tmp3, x_7_5, mult_20_01, tmp3   ;// y_2 += 20*x5 +  1*x7
    SMLABB  tmp4, x_2_0, mult_20_01, tmp4   ;// y_3 +=  1*x8

    ;// Count down four columns. The flags set here are consumed by the BCC
    ;// after the stores (STR does not change the flags): carry clear means
    ;// the column counter went below zero, i.e. the row is finished.
    SUBS    count, count, #4<<28
    STR     tmp1, [mb], #4
    STR     tmp2, [mb], #4
    STR     tmp3, [mb], #4
    STR     tmp4, [mb], #4
    BCC     next_y

    ;// Second half: the next four outputs, same arithmetic with the
    ;// register roles rotated. x_6_4 / x_7_5 now hold the oldest pixels,
    ;// x_2_0 / x_3_1 the following four, and the newly loaded word is
    ;// unpacked into x_6_4 (here) and x_7_5 (at the top of loop_x).
    UXTB16  x_3_1, x_3_1, ROR #8

    SMUAD   tmp1, x_6_4, mult_20_01
    SMULTB  tmp2, x_6_4, mult_20_m5
    SMULTB  tmp3, x_6_4, mult_20_01
    SMULTB  tmp4, x_7_5, mult_20_01

    SMLAD   tmp1, x_7_5, mult_20_m5, tmp1
    SMLAD   tmp2, x_7_5, mult_20_01, tmp2
    SMLATB  tmp3, x_7_5, mult_20_m5, tmp3
    LDR     x_7_5, [ref], #4                ;// next four raw bytes; ref += 4
    SMLAD   tmp4, x_2_0, mult_20_m5, tmp4

    SMLABB  tmp1, x_2_0, mult_20_m5, tmp1
    SMLADX  tmp2, x_2_0, mult_20_01, tmp2
    SMLADX  tmp3, x_2_0, mult_20_m5, tmp3
    SMLADX  tmp4, x_3_1, mult_20_m5, tmp4

    SMLABB  tmp1, x_3_1, mult_20_01, tmp1
    SMLABB  tmp2, x_3_1, mult_20_m5, tmp2
    UXTB16  x_6_4, x_7_5                    ;// even bytes of the new word
    SMLADX  tmp3, x_3_1, mult_20_01, tmp3
    SMLABB  tmp4, x_6_4, mult_20_01, tmp4

    ;// Same column count-down as above; carry set means more columns remain.
    SUBS    count, count, #4<<28
    STR     tmp1, [mb], #4
    STR     tmp2, [mb], #4
    STR     tmp3, [mb], #4
    STR     tmp4, [mb], #4
    BCS     loop_x

next_y
    ;// Advance ref to the same column of the next row. Carry is always clear
    ;// on arrival here (both paths leave the inner loop on carry clear), so
    ;// the SBC subtracts one extra: ref += width - (tmp3 >> 16) - 1.
    AND     tmp3, count, #0x000F0000    ;// partWidth-1
    SMLABB  ref, count, mult_20_01, ref   ;// +width
    SBC     ref, ref, tmp3, LSR #16   ;// -(partWidth-1)-1
    ;// Adds 1 at bit 28 (wrapping the exhausted column counter from 0xF to 0)
    ;// and subtracts 1 at bit 20 (row counter). The result turns negative
    ;// once the row counter has run out, which ends the loop.
    ADDS    count, count, #(1<<28)-(1<<20)
    BGE     loop_y

    LDMFD   sp!, {r4-r11, pc}

    END