;//
;//
;// File Name: omxVCM4P10_PredictIntra_16x16_s.s
;// OpenMAX DL: v1.0.2
;// Revision: 12290
;// Date: Wednesday, April 9, 2008
;//
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;//
;//
;//
INCLUDE omxtypes_s.h
INCLUDE armCOMM_s.h
M_VARIANTS CortexA8
;//-----------------------------------------------------------------------
;// Jump table for the prediction-mode switch.
;// Entry N holds the address of the code label that handles predMode == N;
;// the dispatcher loads pc directly from entry predMode, so the order of
;// the entries below must match the numeric predMode values.
;//-----------------------------------------------------------------------
        M_TABLE armVCM4P10_pIndexTable16x16
        DCD     OMX_VC_16X16_VERT               ;// entry 0
        DCD     OMX_VC_16X16_HOR                ;// entry 1
        DCD     OMX_VC_16X16_DC                 ;// entry 2
        DCD     OMX_VC_16X16_PLANE              ;// entry 3
IF CortexA8
;// Halfword constants for the plane predictor, read as three consecutive
;// rows of eight 16-bit lanes (lane 0 first):
;//   row 0 - weights for the eight neighbour differences in the H and V sums
;//   row 1 - column indices 0..7,  multiplied by b for the left half of a row
;//   row 2 - column indices 8..15, multiplied by b for the right half of a row
;// NOTE(review): the second M_TABLE argument (1) is interpreted by the macro
;// in armCOMM_s.h -- confirm its meaning there.
        M_TABLE armVCM4P10_MultiplierTable16x16,1
        DCW     7,  6,  5,  4,  3,  2,  1,  8
        DCW     0,  1,  2,  3,  4,  5,  6,  7
        DCW     8,  9, 10, 11, 12, 13, 14, 15
;//--------------------------------------------
;// Assembly-time constants
;//--------------------------------------------
BLK_SIZE        EQU     16                      ;// initial value of the row counter y
;// The five constants below are not referenced by any instruction in this file.
MUL_CONST0      EQU     0x01010101
MUL_CONST1      EQU     0x00060004
MUL_CONST2      EQU     0x00070005
MUL_CONST3      EQU     0x00030001
MASK_CONST      EQU     0x00FF00FF
;//--------------------------------------------
;// Scratch core registers
;// Several names deliberately share one register; they are never live at
;// the same time in the code below.
;//--------------------------------------------
y               RN      12                      ;// row countdown, preset to BLK_SIZE
pc              RN      15                      ;// program counter, written by the mode dispatch

return          RN      0                       ;// status code left in r0 (same register as pSrcLeft)
pTable          RN      9                       ;// base address of the mode jump table
count           RN      11                      ;// DC mode: number of neighbour sums taken (same register as pTmp2)
pMultTable      RN      9                       ;// plane mode: cursor into the multiplier table (same register as pTable)
; ----------------------------------------------
; NEON register aliases
;
; Each alias binds a name to a register and an element type.  Many names
; refer to the same physical register viewed with a different type, and
; Qn is the pair D(2n):D(2n+1), so writing one alias can overwrite another.
; ----------------------------------------------

;// Neighbour pixels and the DC-mode reduction chain
qAbove          QN      Q0.U8                   ;// the 16 bytes read from pSrcAbove
qLeft           QN      Q1.U8                   ;// left-column bytes
qSum8           QN      Q0.U16                  ;// 8 pairwise sums
dSum80          DN      D0.U16
dSum81          DN      D1.U16
dSum4           DN      D0.U16                  ;// 4 partial sums
dSum2           DN      D0.U32                  ;// 2 partial sums
dSum1           DN      D0.U64                  ;// not referenced in this file
qOut            QN      Q3.U8                   ;// one output row (D6:D7)
dSumLeft        DN      D6.U64                  ;// sum of the left column
dSumAbove       DN      D7.U64                  ;// sum of the above row
dSum            DN      D8.U64                  ;// rounded DC value
dSum0           DN      D8.U8[0]                ;// low byte of dSum, broadcast to qOut

;// Plane mode: H/V accumulators and the a, b, c parameters
qH              QN      Q11.S32
qV              QN      Q12.S32
qA              QN      Q11.S16
qB              QN      Q6.S16
qC              QN      Q7.S16

qB0             QN      Q5.S16
qB1             QN      Q6.S16                  ;// same register as qB
dA1             DN      D23.S16                 ;// high half of qA

dH0             DN      D22.S32
dH1             DN      D23.S32
dV0             DN      D24.S32
dV1             DN      D25.S32

qHV             QN      Q11.S64
qHV0            QN      Q11.S32
qHV1            QN      Q12.S64

dHV00           DN      D22.S32
dHV01           DN      D23.S32

dHV0            DN      D22.S16[0]
dHV1            DN      D23.S16[0]
dHV10           DN      D24.S64
dHV11           DN      D25.S64

;// Plane mode: per-row running values and output halves
qSum0           QN      Q0.S16
qSum1           QN      Q1.S16

dOut0           DN      D6.U8                   ;// low half of qOut
dOut1           DN      D7.U8                   ;// high half of qOut

dLeft0          DN      D2.U8                   ;// low half of qLeft
dLeft1          DN      D3.U8                   ;// high half of qLeft
qConst          QN      Q13.S16

dAbove0         DN      D0.U8                   ;// low half of qAbove
dAbove1         DN      D1.U8                   ;// high half of qAbove

;// Plane mode: reversed neighbours and their differences
dRevLeft64      DN      D12.U64
dRevLeft        DN      D12.U8
dRevAbove64     DN      D5.U64
dRevAbove       DN      D5.U8
qLeftDiff       QN      Q8.S16
dLeftDiff1      DN      D17.S16
dLeftDiff64     DN      D17.S64
qDiffLeft       QN      Q8.S16                  ;// not referenced in this file
qDiffAbove      QN      Q4.S16                  ;// not referenced in this file
dAboveDiff1     DN      D9.S16
dAboveDiff64    DN      D9.S64
qAboveDiff      QN      Q4.S16

dAboveLeft      DN      D4.U8                   ;// only lane 0 is ever loaded

dDiffLeft0      DN      D16.S16
dDiffLeft1      DN      D17.S16
dDiffAbove0     DN      D8.S16
dDiffAbove1     DN      D9.S16

qLeft15minus0   QN      Q7.S16
dLeft15minus0   DN      D14.S16
qAbove15minus0  QN      Q3.S16
dAbove15minus0  DN      D6.S16

;// Plane mode: rows of armVCM4P10_MultiplierTable16x16
qMultiplier     QN      Q10.S16
qMultiplier0    QN      Q10.S16
qMultiplier1    QN      Q12.S16
dMultiplier0    DN      D20.S16
dMultiplier1    DN      D21.S16

dBPlusCMult7    DN      D1.S64
dBPlusCMult7S16 DN      D1.S16

qTmp            QN      Q0.U8                   ;// horizontal mode: second splatted row
;//--------------------------------------------
;// Argument registers
;// r0-r3 arrive in registers; r4-r7 are filled from the stack arguments
;// by the M_LDR sequence at function entry.
;//--------------------------------------------
pSrcLeft        RN      0                       ;// in: pointer to the left-column pixels
pSrcAbove       RN      1                       ;// in: pointer to the above-row pixels
pSrcAboveLeft   RN      2                       ;// in: pointer to the above-left pixel
pDst            RN      3                       ;// out: pointer to the predicted block
leftStep        RN      4                       ;// in: byte stride between left-column pixels
dstStep         RN      5                       ;// in: byte stride between output rows
predMode        RN      6                       ;// in: index into the mode jump table
availability    RN      7                       ;// in: neighbour flags tested in DC mode

;// Working pointers and strides
pTmp            RN      8                       ;// second row pointer (odd rows)
step            RN      10                      ;// doubled stride
pTmp2           RN      11                      ;// horizontal mode: odd-row output pointer
;//-----------------------------------------------------------------------------------------------
;// omxVCM4P10_PredictIntra_16x16 starts
;//-----------------------------------------------------------------------------------------------
;// Entry point and prediction-mode dispatch.
;//
;// In:   r0 = pSrcLeft, r1 = pSrcAbove, r2 = pSrcAboveLeft, r3 = pDst
;//       four 4-byte stack arguments, declared in this order:
;//       LeftStep, DstStep, PredMode, Availability
;// Out:  r0 = OMX_Sts_NoErr (every path in this file returns that value)
;//
;// predMode is used unchecked as an index into armVCM4P10_pIndexTable16x16,
;// so the caller must pass one of the four values the table covers (0..3).
;// NOTE(review): M_START, M_ARG and M_LDR are macros from armCOMM_s.h; the
;// "r11, d15" arguments presumably name the highest core and NEON registers
;// the prologue preserves -- confirm against the macro definitions.
;//-----------------------------------------------------------------------------------------------
        M_START omxVCM4P10_PredictIntra_16x16, r11, d15

        ;// Stack-passed arguments
        M_ARG   LeftStep,     4
        M_ARG   DstStep,      4
        M_ARG   PredMode,     4
        M_ARG   Availability, 4

        LDR     pTable, =armVCM4P10_pIndexTable16x16    ;// base of the mode jump table

        ;// Bring the stack arguments into their working registers
        M_LDR   predMode,     PredMode
        M_LDR   leftStep,     LeftStep
        M_LDR   dstStep,      DstStep
        M_LDR   availability, Availability

        MOV     y, #BLK_SIZE                            ;// row counter for the looping modes
        LDR     pc, [pTable, predMode, LSL #2]          ;// jump to the handler stored at entry predMode
OMX_VC_16X16_VERT
        ;// Vertical prediction: the 16 bytes at pSrcAbove are written to each
        ;// of the 16 output rows.  Even rows go through pDst, odd rows through
        ;// pTmp; both pointers advance by two rows per store.
        VLD1    qAbove, [pSrcAbove]
        ADD     pTmp, pDst, dstStep                     ;// pTmp -> row 1
        ADD     step, dstStep, dstStep                  ;// step = 2 * dstStep

        ;// Rows 0..13.  WHILE/WEND is evaluated by the assembler, so this
        ;// expands to seven copies of the two stores (no run-time loop).
        GBLA    VertRowPair
VertRowPair SETA 0
        WHILE   VertRowPair < 7
        VST1    qAbove, [pDst], step
        VST1    qAbove, [pTmp], step
VertRowPair SETA VertRowPair + 1
        WEND

        ;// Rows 14 and 15: no pointer update needed after the last pair
        VST1    qAbove, [pDst]
        VST1    qAbove, [pTmp]

        MOV     return, #OMX_Sts_NoErr
        M_EXIT
OMX_VC_16X16_HOR
        ;// Horizontal prediction: output row r is 16 copies of the left pixel
        ;// of row r.  Even rows use pSrcLeft/pDst, odd rows use pTmp/pTmp2;
        ;// both strides are doubled so each pointer pair skips a row.
        ADD     pTmp, pSrcLeft, leftStep                ;// pTmp  -> left pixel of row 1
        ADD     leftStep, leftStep, leftStep            ;// leftStep *= 2
        ADD     pTmp2, pDst, dstStep                    ;// pTmp2 -> output row 1
        ADD     dstStep, dstStep, dstStep               ;// dstStep *= 2

HorNext8Rows
        ;// Each pass produces eight rows; y starts at BLK_SIZE, so two passes run.
        VLD1    {qLeft[]}, [pSrcLeft], leftStep         ;// splat one left byte across all 16 lanes
        VLD1    {qTmp[]}, [pTmp], leftStep
        SUBS    y, y, #8                                ;// flags stay valid until the BNE: NEON ops do not touch them
        VST1    qLeft, [pDst], dstStep
        VST1    qTmp, [pTmp2], dstStep

        VLD1    {qLeft[]}, [pSrcLeft], leftStep
        VLD1    {qTmp[]}, [pTmp], leftStep
        VST1    qLeft, [pDst], dstStep
        VST1    qTmp, [pTmp2], dstStep

        VLD1    {qLeft[]}, [pSrcLeft], leftStep
        VLD1    {qTmp[]}, [pTmp], leftStep
        VST1    qLeft, [pDst], dstStep
        VST1    qTmp, [pTmp2], dstStep

        VLD1    {qLeft[]}, [pSrcLeft], leftStep
        VLD1    {qTmp[]}, [pTmp], leftStep
        VST1    qLeft, [pDst], dstStep
        VST1    qTmp, [pTmp2], dstStep

        BNE     HorNext8Rows                            ;// repeat until all 16 rows are written

        MOV     return, #OMX_Sts_NoErr
        M_EXIT
OMX_VC_16X16_DC
        ;// DC prediction: fill the block with one value chosen by which
        ;// neighbours the availability flags report:
        ;//   left and upper : (sumLeft + sumAbove + 16) >> 5
        ;//   only one       : (sum + 8) >> 4
        ;//   neither        : 128
        ;// count records how many neighbour sums were taken.
        MOV     count, #0
        TST     availability, #OMX_VC_LEFT
        BEQ     DcCheckUpper                            ;// left column unavailable: skip its sum

        ;// Gather the 16 left pixels into qLeft one lane at a time.
        ;// pSrcLeft walks the even rows, pTmp the odd rows.
        ADD     pTmp, pSrcLeft, leftStep
        ADD     step, leftStep, leftStep
        VLD1    {qLeft[0]}, [pSrcLeft],step
        VLD1    {qLeft[1]}, [pTmp],step
        VLD1    {qLeft[2]}, [pSrcLeft],step
        VLD1    {qLeft[3]}, [pTmp],step
        VLD1    {qLeft[4]}, [pSrcLeft],step
        VLD1    {qLeft[5]}, [pTmp],step
        VLD1    {qLeft[6]}, [pSrcLeft],step
        VLD1    {qLeft[7]}, [pTmp],step
        VLD1    {qLeft[8]}, [pSrcLeft],step
        VLD1    {qLeft[9]}, [pTmp],step
        VLD1    {qLeft[10]},[pSrcLeft],step
        VLD1    {qLeft[11]},[pTmp],step
        VLD1    {qLeft[12]},[pSrcLeft],step
        VLD1    {qLeft[13]},[pTmp],step
        VLD1    {qLeft[14]},[pSrcLeft],step
        VLD1    {qLeft[15]},[pTmp]

        ;// Reduce 16 bytes to a single sum: 16 -> 8 -> 4 -> 2 -> 1
        VPADDL  qSum8, qLeft
        ADD     count, count, #1
        VPADD   dSum4, dSum80, dSum81
        VPADDL  dSum2, dSum4
        VPADDL  dSumLeft, dSum2
        VRSHR   dSum, dSumLeft, #4                      ;// left-only result: (sumLeft + 8) >> 4

DcCheckUpper
        TST     availability, #OMX_VC_UPPER
        BEQ     DcCombine                               ;// above row unavailable: skip its sum
        VLD1    qAbove, [pSrcAbove]
        ADD     count, count, #1
        VPADDL  qSum8, qAbove                           ;// same 16 -> 1 reduction as for the left column
        VPADD   dSum4, dSum80, dSum81
        VPADDL  dSum2, dSum4
        VPADDL  dSumAbove, dSum2
        VRSHR   dSum, dSumAbove, #4                     ;// upper-only result: (sumAbove + 8) >> 4

DcCombine
        CMP     count, #2
        BNE     DcBroadcast                             ;// fewer than two sums: keep dSum as computed above
        VADD    dSum, dSumAbove, dSumLeft
        VRSHR   dSum, dSum, #5                          ;// both available: (sumAbove + sumLeft + 16) >> 5

DcBroadcast
        VDUP    qOut, dSum0                             ;// replicate the low byte of dSum into all 16 lanes
        CMP     count, #0
        ADD     pTmp, pDst, dstStep                     ;// pTmp -> row 1
        ADD     step, dstStep, dstStep                  ;// step = 2 * dstStep
        BNE     DcStoreRows
        VMOV    qOut, #128                              ;// no neighbour available: use the fixed value 128

DcStoreRows
        ;// Sixteen rows as eight even/odd pairs.  WHILE/WEND is evaluated by
        ;// the assembler, so this expands to 16 straight-line stores.
        GBLA    DcRowPair
DcRowPair SETA  0
        WHILE   DcRowPair < 8
        VST1    qOut, [pDst], step
        VST1    qOut, [pTmp], step
DcRowPair SETA  DcRowPair + 1
        WEND

        MOV     return, #OMX_Sts_NoErr
        M_EXIT
OMX_VC_16X16_PLANE
        ;// Plane prediction.  With above[-1] = left[-1] = the above-left pixel:
        ;//   H = sum(i = 0..7) (i + 1) * (above[8 + i] - above[6 - i])
        ;//   V = sum(i = 0..7) (i + 1) * (left[8 + i]  - left[6 - i])
        ;//   b = (5*H + 32) >> 6,  c = (5*V + 32) >> 6
        ;//   a = 16 * (above[15] + left[15])
        ;//   pred[y][x] = sat_u8((a + b*(x - 7) + c*(y - 7) + 16) >> 5)
        ;// Instruction order matters below: several aliases reuse a register
        ;// as soon as its previous contents are dead.
        LDR     pMultTable, =armVCM4P10_MultiplierTable16x16
        VLD1    qAbove, [pSrcAbove]                     ;// above[0..15]
        VLD1    dAboveLeft[0],[pSrcAboveLeft]           ;// lane 0 = above-left pixel; other lanes are not loaded

        ;// Gather left[0..15] lane by lane (even rows via pSrcLeft, odd via pTmp)
        ADD     pTmp, pSrcLeft, leftStep
        ADD     step, leftStep, leftStep
        VLD1    {qLeft[0]},  [pSrcLeft],step
        VLD1    {qLeft[1]},  [pTmp],step
        VLD1    {qLeft[2]},  [pSrcLeft],step
        VLD1    {qLeft[3]},  [pTmp],step
        VLD1    {qLeft[4]},  [pSrcLeft],step
        VLD1    {qLeft[5]},  [pTmp],step
        VLD1    {qLeft[6]},  [pSrcLeft],step
        VLD1    {qLeft[7]},  [pTmp],step
        VLD1    {qLeft[8]},  [pSrcLeft],step
        VLD1    {qLeft[9]},  [pTmp],step
        VLD1    {qLeft[10]}, [pSrcLeft],step
        VLD1    {qLeft[11]}, [pTmp],step
        VLD1    {qLeft[12]}, [pSrcLeft],step
        VLD1    {qLeft[13]}, [pTmp],step
        VLD1    {qLeft[14]}, [pSrcLeft],step
        VLD1    {qLeft[15]}, [pTmp]

        ;// Above-row differences, lane 0 first:
        ;//   [a14-a0, a13-a1, a12-a2, a11-a3 | a10-a4, a9-a5, a8-a6, a15-aboveLeft]
        VREV64  dRevAbove, dAbove1                      ;// lanes 0..7 = above[15], above[14], ..., above[8]
        VSUBL   qAbove15minus0, dRevAbove, dAboveLeft   ;// lane 0 = above[15] - aboveLeft (only lane 0 is used)
        VSHR    dRevAbove64, dRevAbove64, #8            ;// drop above[15]: lanes 0..6 = above[14..8], lane 7 = 0
        VSUBL   qAboveDiff, dRevAbove, dAbove0          ;// lane i = above[14-i] - above[i]; lane 7 is discarded below
        VSHL    dAboveDiff64, dAboveDiff64, #16         ;// move the high-half lanes up one slot (shifts lane 7 out)
        VEXT    dDiffAbove1, dAboveDiff1, dAbove15minus0, #1    ;// shift back down and append above[15]-aboveLeft

        ;// Left-column differences, same layout
        VREV64  dRevLeft,dLeft1                         ;// lanes 0..7 = left[15], left[14], ..., left[8]
        VSUBL   qLeft15minus0,dRevLeft, dAboveLeft      ;// lane 0 = left[15] - aboveLeft (only lane 0 is used)
        VSHR    dRevLeft64, dRevLeft64, #8              ;// drop left[15]: lanes 0..6 = left[14..8], lane 7 = 0
        VSUBL   qLeftDiff,dRevLeft, dLeft0              ;// lane i = left[14-i] - left[i]; lane 7 is discarded below

        VLD1    qMultiplier, [pMultTable]!              ;// table row 0: weights 7,6,5,4,3,2,1,8 (lane 0 first)

        VSHL    dLeftDiff64, dLeftDiff64, #16
        VEXT    dDiffLeft1, dLeftDiff1, dLeft15minus0, #1

        ;// Weighted sums: four 32-bit partial sums each for H and V
        VMULL   qH,dDiffAbove0, dMultiplier0
        VMULL   qV,dDiffLeft0,  dMultiplier0
        VMLAL   qH,dDiffAbove1, dMultiplier1
        VMLAL   qV,dDiffLeft1,  dMultiplier1

        ;// Fold the partial sums: qHV low half = H, high half = V (64-bit each)
        VPADD   dHV00,dH1,dH0
        VPADD   dHV01,dV1,dV0
        VPADDL  qHV, qHV0

        ;// qHV = [b | c] = [(5*H + 32) >> 6 | (5*V + 32) >> 6], with 5*x formed as (x << 2) + x
        VSHL    qHV1,qHV,#2
        VADD    qHV,qHV,qHV1
        VRSHR   qHV,qHV,#6

        ;// qHV1 = [7*b | 7*c], formed as (x << 3) - x
        VSHL    qHV1,qHV,#3
        VSUB    qHV1,qHV1,qHV

        VLD1    qMultiplier0, [pMultTable]!             ;// table row 1: 0,1,...,7
        VDUP    qB, dHV0                                ;// b in every 16-bit lane
        VDUP    qC, dHV1                                ;// c in every 16-bit lane

        ;// a = 16 * (above[15] + left[15]).  qA shares Q11 with qHV, so b and c
        ;// must already have been copied out (done just above).
        VADDL   qA,dAbove1,dLeft1
        VSHL    qA,qA, #4
        VDUP    qA,dA1[3]
        VADD    dBPlusCMult7, dHV10, dHV11              ;// 7*b + 7*c (reuses D1; dAbove1 is dead by now)

        VLD1    qMultiplier1, [pMultTable]              ;// table row 2: 8,9,...,15 (overwrites qHV1, already consumed)

        ;// qConst = a - 7*(b + c)
        VDUP    qConst, dBPlusCMult7S16[0]
        VSUB    qConst, qA, qConst

        VMUL    qB0,qB,qMultiplier0                     ;// B0 = [0*b, 1*b, ..., 7*b]
        VMUL    qB1,qB,qMultiplier1                     ;// B1 = [8*b, 9*b, ..., 15*b] (written over qB)

        ;// Row 0 before rounding: a + b*(x - 7) - 7*c for x = 0..15
        VADD    qSum0, qB0, qConst
        VADD    qSum1, qB1, qConst

PlaneNextRow
        ;// One row per pass, y counting down from BLK_SIZE
        VQRSHRUN dOut0, qSum0,#5                        ;// (value + 16) >> 5, saturated to unsigned 8 bits
        VQRSHRUN dOut1, qSum1,#5
        SUBS    y, y, #1                                ;// flags stay valid until the BNE: NEON ops do not touch them
        VST1    qOut,[pDst],dstStep                     ;// qOut is dOut0:dOut1
        VADD    qSum0,qSum0,qC                          ;// next row: add c to every column
        VADD    qSum1,qSum1,qC
        BNE     PlaneNextRow

        MOV     return, #OMX_Sts_NoErr

        M_END
ENDIF ;// CortexA8
END
;-----------------------------------------------------------------------------------------------
; omxVCM4P10_PredictIntra_16x16 ends
;-----------------------------------------------------------------------------------------------