;// path: root/media/libstagefright/codecs/on2/h264dec/omxdl/arm_neon/vc/m4p10/src/armVCM4P10_Average_4x_Align_unsafe_s.s
;// blob: 4f0892d63c2cc9fbb3536de834273ba529cecb56 (plain)
;//
;// 
;// File Name:  armVCM4P10_Average_4x_Align_unsafe_s.s
;// OpenMAX DL: v1.0.2
;// Revision:   12290
;// Date:       Wednesday, April 9, 2008
;// 
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;// 
;// 
;//


;// Functions:
;//     armVCM4P10_Average_4x4_Align<ALIGNMENT>_unsafe  
;//
;// Implements the average of two 4x4 blocks using the equation c = (a+b+1)>>1.
;// The first operand is at offset ALIGNMENT from an aligned address.
;// The second operand is at an aligned location, which is also used as the
;// output: each averaged row is written back over the second operand, so the
;// destination (pDst) is the second operand's buffer.
;// Each row is 4 bytes in the horizontal direction and is handled as one word.
;//
;// Registers used as input for this function
;// r0,r1,r2,r3 where r0 is the first operand pointer, r1 its step size,
;// r2 contains the aligned memory pointer and r3 its step size
;//
;// Registers preserved for top level function
;// r4,r5,r6,r8,r9,r14
;//
;// Registers modified by the function
;// r7,r10,r11,r12
;//
;// Output registers
;// r2 - pointer to the aligned location
;// r3 - step size to this aligned location

        INCLUDE omxtypes_s.h
        INCLUDE armCOMM_s.h
        
        ;// Build variant selection. The ARM1136JS symbol named here is the one
        ;// tested by the IF/ENDIF that wraps all code in this file.
        ;// NOTE(review): M_VARIANTS is not defined in this file (presumably it
        ;// comes from armCOMM_s.h) - confirm its exact semantics there.
        M_VARIANTS ARM1136JS

        ;// Entry points exported from this file, one for each alignment offset
        ;// of the first operand handled here (0, 2 and 3).
        EXPORT armVCM4P10_Average_4x4_Align0_unsafe
        EXPORT armVCM4P10_Average_4x4_Align2_unsafe
        EXPORT armVCM4P10_Average_4x4_Align3_unsafe

;// Assembly-time logical flag; it is not referenced anywhere else in this file.
;// NOTE(review): presumably consumed by the included macro headers - confirm.
DEBUG_ON    SETL {FALSE}

;// Declare input registers.
;// r2 and r3 each carry two names: the second operand is read through
;// pPred1/iPredStep1 and the result is written through pDstPred/iDstStep,
;// i.e. the output goes to the same buffer the second operand is read from.
pPred0          RN 0
iPredStep0      RN 1
pPred1          RN 2
iPredStep1      RN 3
pDstPred        RN 2
iDstStep        RN 3

;// Declare other intermediate registers.
;// iPredA0/iPredA1 hold rows of the first operand, iPredB0/iPredB1 rows of
;// the second operand.
;// Temp1/ResultB share r4 and Temp2/ResultA share r5, so a Temp value has to
;// be consumed before the Result with the same register number is written.
;// r0x80808080 holds the constant 0x80808080 (0x80 in every byte lane).
iPredA0         RN 10
iPredA1         RN 11
iPredB0         RN 12
iPredB1         RN 14
Temp1           RN 4
Temp2           RN 5
ResultA         RN 5
ResultB         RN 4
r0x80808080     RN 7

    IF ARM1136JS
        
        ;// armVCM4P10_Average_4x4_Align0_unsafe
        ;//
        ;// Calculates the average of a 4x4 block, c = (a+b+1)>>1 per byte,
        ;// handling each 4-byte row as one word.
        ;// pPred0 is at alignment offset 0 and pPred1 is at alignment 4.
        ;//
        ;// In:  pPred0 (r0), iPredStep0 (r1) - first operand and its row step
        ;//      pPred1 (r2), iPredStep1 (r3) - second operand and its row step
        ;// Out: the four result rows are stored over the second operand
        ;//      (pDstPred/iDstStep are aliases of r2/r3).
        ;// pPred0 is advanced by four iPredStep0 steps and pDstPred by four
        ;// iDstStep steps by the post-indexed accesses below.

        ;// Function header
        ;// NOTE(review): M_START/M_END are macros defined outside this file; the
        ;// r6 argument presumably requests saving r4-r6 and the return address,
        ;// which would cover the r4/r5/r14 scratch use below - confirm.
        M_START armVCM4P10_Average_4x4_Align0_unsafe, r6

        ;// Code start
        ;// 0x80 in every byte lane; EOR with it adds 128 (mod 256) to each byte.
        LDR         r0x80808080, =0x80808080

        ;// 1st load: rows 0 and 1 of both operands.
        ;// pPred1 is not advanced by these loads (row 1 uses a register offset);
        ;// it moves forward only through the stores to pDstPred below.
        M_LDR       iPredB0, [pPred1]
        M_LDR       iPredA0, [pPred0], iPredStep0        
        M_LDR       iPredB1, [pPred1, iPredStep1]
        M_LDR       iPredA1, [pPred0], iPredStep0

        ;// (a+b+1)/2 = (a+256-(255-b))/2 = (a-(255-b))/2 + 128
        ;// MVN forms 255-b in every byte, UHSUB8 performs the per-byte unsigned
        ;// halving subtract, and the EOR with 0x80808080 supplies the +128.
        MVN         iPredB0, iPredB0
        MVN         iPredB1, iPredB1
        UHSUB8      ResultA, iPredA0, iPredB0
        UHSUB8      ResultB, iPredA1, iPredB1
        EOR         ResultA, ResultA, r0x80808080
        ;// Store rows 0 and 1; each store steps pDstPred (= pPred1) one row on.
        M_STR       ResultA, [pDstPred], iDstStep        
        EOR         ResultB, ResultB, r0x80808080
        M_STR       ResultB, [pDstPred], iDstStep        
        
        ;// 2nd load: rows 2 and 3. pPred1 now points at row 2 because the two
        ;// stores above advanced the shared r2 by two steps.
        M_LDR       iPredA0, [pPred0], iPredStep0        
        M_LDR       iPredB0, [pPred1]
        M_LDR       iPredA1, [pPred0], iPredStep0
        M_LDR       iPredB1, [pPred1, iPredStep1]

        ;// Same complement / halving-subtract / +128 sequence as for rows 0 and 1.
        MVN         iPredB0, iPredB0
        UHSUB8      ResultA, iPredA0, iPredB0
        MVN         iPredB1, iPredB1
        UHSUB8      ResultB, iPredA1, iPredB1
        EOR         ResultA, ResultA, r0x80808080        
        M_STR       ResultA, [pDstPred], iDstStep        
        EOR         ResultB, ResultB, r0x80808080
        M_STR       ResultB, [pDstPred], iDstStep                
End0
        M_END

        ;// armVCM4P10_Average_4x4_Align2_unsafe
        ;//
        ;// Calculates the average of a 4x4 block, c = (a+b+1)>>1 per byte,
        ;// handling each 4-byte row as one word.
        ;// pPred0 is at alignment offset 2 and pPred1 is at alignment 4.
        ;//
        ;// In:  pPred0 (r0), iPredStep0 (r1) - first operand and its row step
        ;//      pPred1 (r2), iPredStep1 (r3) - second operand and its row step
        ;// Out: the four result rows are stored over the second operand
        ;//      (pDstPred/iDstStep are aliases of r2/r3).
        ;// pPred0 is advanced by four iPredStep0 steps and pDstPred by four
        ;// iDstStep steps by the post-indexed accesses below.

        ;// Function header
        ;// NOTE(review): M_START/M_END are macros defined outside this file; the
        ;// r6 argument presumably requests saving r4-r6 and the return address,
        ;// which would cover the r4/r5/r14 scratch use below - confirm.
        M_START armVCM4P10_Average_4x4_Align2_unsafe, r6

        ;// Code start
        ;// 0x80 in every byte lane; EOR with it adds 128 (mod 256) to each byte.
        LDR         r0x80808080, =0x80808080

        ;// 1st load: rows 0 and 1.
        ;// Two words are fetched per first-operand row: the word at pPred0+4 is
        ;// read first (into Temp1/Temp2), then the word at pPred0 is read with a
        ;// post-indexed access that steps pPred0 on to the next row.
        LDR         Temp1, [pPred0, #4]
        M_LDR       iPredA0, [pPred0], iPredStep0        
        M_LDR       iPredB0, [pPred1]
        M_LDR       iPredB1, [pPred1, iPredStep1]
        M_LDR       Temp2, [pPred0, #4]
        M_LDR       iPredA1, [pPred0], iPredStep0
        ;// Complement the second operand (255-b per byte) for the subtract below.
        MVN         iPredB0, iPredB0
        MVN         iPredB1, iPredB1        
        ;// Rebuild each first-operand row from its two words: the upper half of
        ;// the first word goes to bits 15:0 and the lower half of the second
        ;// word to bits 31:16.
        ;// NOTE(review): this yields the 4 bytes at byte offset 2 only for
        ;// little-endian data with pPred0 pointing at the word-aligned base of
        ;// the row - confirm against the caller.
        MOV         iPredA0, iPredA0, LSR #16
        ORR         iPredA0, iPredA0, Temp1, LSL #16        
        MOV         iPredA1, iPredA1, LSR #16
        ORR         iPredA1, iPredA1, Temp2, LSL #16

        ;// (a+b+1)/2 = (a+256-(255-b))/2 = (a-(255-b))/2 + 128
        ;// Temp1/Temp2 are no longer needed here, so ResultB/ResultA may reuse
        ;// r4/r5. UHSUB8 is the per-byte halving subtract; EOR supplies the +128.
        UHSUB8      ResultA, iPredA0, iPredB0
        UHSUB8      ResultB, iPredA1, iPredB1
        EOR         ResultA, ResultA, r0x80808080
        ;// Store rows 0 and 1; each store steps pDstPred (= pPred1) one row on.
        M_STR       ResultA, [pDstPred], iDstStep        
        EOR         ResultB, ResultB, r0x80808080
        M_STR       ResultB, [pDstPred], iDstStep        
        
        ;// 2nd load: rows 2 and 3, same pattern as the 1st load. pPred1 now
        ;// points at row 2 because the stores above advanced the shared r2.
        LDR         Temp1, [pPred0, #4]
        M_LDR         iPredA0, [pPred0], iPredStep0        
        LDR         iPredB0, [pPred1]
        LDR         iPredB1, [pPred1, iPredStep1]
        LDR         Temp2, [pPred0, #4]
        M_LDR         iPredA1, [pPred0], iPredStep0
        ;// Complement the second operand, then rebuild the first-operand rows.
        MVN         iPredB0, iPredB0
        MVN         iPredB1, iPredB1
        MOV         iPredA0, iPredA0, LSR #16
        ORR         iPredA0, iPredA0, Temp1, LSL #16        
        MOV         iPredA1, iPredA1, LSR #16
        ORR         iPredA1, iPredA1, Temp2, LSL #16

        ;// Halving subtract plus 128, then store rows 2 and 3.
        UHSUB8      ResultA, iPredA0, iPredB0
        UHSUB8      ResultB, iPredA1, iPredB1
        EOR         ResultA, ResultA, r0x80808080        
        M_STR       ResultA, [pDstPred], iDstStep        
        EOR         ResultB, ResultB, r0x80808080
        M_STR       ResultB, [pDstPred], iDstStep                
End2
        M_END


        ;// armVCM4P10_Average_4x4_Align3_unsafe
        ;//
        ;// Calculates the average of a 4x4 block, c = (a+b+1)>>1 per byte,
        ;// handling each 4-byte row as one word.
        ;// pPred0 is at alignment offset 3 and pPred1 is at alignment 4.
        ;//
        ;// In:  pPred0 (r0), iPredStep0 (r1) - first operand and its row step
        ;//      pPred1 (r2), iPredStep1 (r3) - second operand and its row step
        ;// Out: the four result rows are stored over the second operand
        ;//      (pDstPred/iDstStep are aliases of r2/r3).
        ;// pPred0 is advanced by four iPredStep0 steps and pDstPred by four
        ;// iDstStep steps by the post-indexed accesses below.

        ;// Function header
        ;// NOTE(review): M_START/M_END are macros defined outside this file; the
        ;// r6 argument presumably requests saving r4-r6 and the return address,
        ;// which would cover the r4/r5/r14 scratch use below - confirm.
        M_START armVCM4P10_Average_4x4_Align3_unsafe, r6

        ;// Code start
        ;// 0x80 in every byte lane; EOR with it adds 128 (mod 256) to each byte.
        LDR         r0x80808080, =0x80808080

        ;// 1st load: rows 0 and 1.
        ;// Two words are fetched per first-operand row: the word at pPred0+4 is
        ;// read first (into Temp1/Temp2), then the word at pPred0 is read with a
        ;// post-indexed access that steps pPred0 on to the next row.
        LDR         Temp1, [pPred0, #4]
        M_LDR       iPredA0, [pPred0], iPredStep0        
        LDR         iPredB0, [pPred1]
        LDR         iPredB1, [pPred1, iPredStep1]
        LDR         Temp2, [pPred0, #4]
        M_LDR       iPredA1, [pPred0], iPredStep0

        ;// Complement the second operand (255-b per byte) for the subtract below.
        MVN         iPredB0, iPredB0
        MVN         iPredB1, iPredB1
        ;// Rebuild each first-operand row from its two words: the top byte of
        ;// the first word goes to bits 7:0 and the low three bytes of the second
        ;// word to bits 31:8.
        ;// NOTE(review): this yields the 4 bytes at byte offset 3 only for
        ;// little-endian data with pPred0 pointing at the word-aligned base of
        ;// the row - confirm against the caller.
        MOV         iPredA0, iPredA0, LSR #24
        ORR         iPredA0, iPredA0, Temp1, LSL #8                
        MOV         iPredA1, iPredA1, LSR #24
        ORR         iPredA1, iPredA1, Temp2, LSL #8
        ;// (a+b+1)/2 = (a-(255-b))/2 + 128: per-byte halving subtract, then
        ;// EOR with 0x80808080 for the +128. Temp1/Temp2 are no longer needed
        ;// here, so ResultB/ResultA may reuse r4/r5.
        UHSUB8      ResultA, iPredA0, iPredB0
        UHSUB8      ResultB, iPredA1, iPredB1
        EOR         ResultA, ResultA, r0x80808080
        ;// Store rows 0 and 1; each store steps pDstPred (= pPred1) one row on.
        M_STR       ResultA, [pDstPred], iDstStep        
        EOR         ResultB, ResultB, r0x80808080
        M_STR       ResultB, [pDstPred], iDstStep        
        
        ;// 2nd load: rows 2 and 3, same pattern as the 1st load. pPred1 now
        ;// points at row 2 because the stores above advanced the shared r2.
        LDR         Temp1, [pPred0, #4]
        M_LDR       iPredA0, [pPred0], iPredStep0        
        LDR         iPredB0, [pPred1]
        LDR         iPredB1, [pPred1, iPredStep1]
        LDR         Temp2, [pPred0, #4]
        M_LDR       iPredA1, [pPred0], iPredStep0

        ;// Complement the second operand, then rebuild the first-operand rows.
        MVN         iPredB0, iPredB0
        MVN         iPredB1, iPredB1
        MOV         iPredA0, iPredA0, LSR #24
        ORR         iPredA0, iPredA0, Temp1, LSL #8        
        MOV         iPredA1, iPredA1, LSR #24
        ORR         iPredA1, iPredA1, Temp2, LSL #8

        ;// Halving subtract plus 128, then store rows 2 and 3.
        UHSUB8      ResultA, iPredA0, iPredB0
        UHSUB8      ResultB, iPredA1, iPredB1
        EOR         ResultA, ResultA, r0x80808080        
        M_STR       ResultA, [pDstPred], iDstStep        
        EOR         ResultB, ResultB, r0x80808080
        M_STR       ResultB, [pDstPred], iDstStep                
End3
        M_END

    ENDIF
    
    END