1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
|
;//
;//
;// File Name: omxVCM4P10_TransformDequantChromaDCFromPair_s.s
;// OpenMAX DL: v1.0.2
;// Revision: 12290
;// Date: Wednesday, April 9, 2008
;//
;// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
;//
;//
;//
INCLUDE omxtypes_s.h
INCLUDE armCOMM_s.h
IMPORT armVCM4P10_QPDivTable
IMPORT armVCM4P10_VMatrixQPModTable
M_VARIANTS CortexA8
IF CortexA8
;// ARM Registers
;// NOTE: several aliases below name the same physical register (for example
;// pSrc and Shift are both r9, Value and r0w1 are both r4). The function body
;// uses them in separate phases: the last use of one alias comes before the
;// first write through the next alias of the same register.
;//--------------------------------------
;// Declare input registers
;//--------------------------------------
ppSrc RN 0 ;// r0: address of the source pointer; read on entry, updated after unpacking
pDst RN 1 ;// r1: destination buffer, accessed as four 16-bit coefficients
QP RN 2 ;// r2: index used for both QP look-up tables
;//--------------------------------
;// Scratch variable for Unpack2x2
;//--------------------------------
pSrc RN 9 ;// r9: running read pointer into the packed source bytes
Value RN 4 ;// r4: coefficient value being unpacked
Value2 RN 5 ;// r5: sign-extended high byte of a 16-bit coefficient
Flag RN 6 ;// r6: flag byte of the current pair (tested against 0x10 and 0x20, low bits index pDst)
strOffset RN 7 ;// r7: byte offset into pDst for the current coefficient
cstOffset RN 8 ;// r8: constant mask used to derive strOffset
;//--------------------------------
;// Scratch variable
;//--------------------------------
r0w0 RN 3 ;// r3: first 32-bit word of pDst (two packed halfwords)
r0w1 RN 4 ;// r4: second 32-bit word of pDst (shares r4 with Value)
c0w0 RN 5 ;// r5: packed sums of the 2x2 transform (shares r5 with Value2, pQPDivTable)
c1w0 RN 6 ;// r6: packed differences of the 2x2 transform (shares r6 with Flag, pQPModTable)
return RN 0 ;// r0: status code returned to the caller (shares r0 with ppSrc)
pQPDivTable RN 5 ;// r5: base address of armVCM4P10_QPDivTable
pQPModTable RN 6 ;// r6: base address of armVCM4P10_VMatrixQPModTable
Shift RN 9 ;// r9: table value used as left-shift amount (shares r9 with pSrc)
Scale RN 2 ;// r2: table value used as multiplier (shares r2 with QP)
;// Neon Registers
dZero DN D0.U16 ;// D0 viewed as 4 x u16: zero vector used to clear pDst
dInvTrCoeff DN D0.S16 ;// D0 viewed as 4 x s16: inverse-transformed coefficients (reuses D0)
dScale DN D1.S16 ;// D1: Scale duplicated into every 16-bit lane
qDqntCoeff QN Q1.S32 ;// Q1 (architecturally D2:D3): 32-bit products coefficient * Scale
dDqntCoeff DN D2.S16 ;// D2: narrowed 16-bit results (overlaps the low half of Q1)
;//-----------------------------------------------------------------------
;// omxVCM4P10_TransformDequantChromaDCFromPair
;//
;// In:   ppSrc (r0) = address of a pointer to the packed source bytes
;//       pDst  (r1) = destination buffer of four 16-bit coefficients
;//       QP    (r2) = index into armVCM4P10_QPDivTable and
;//                    armVCM4P10_VMatrixQPModTable
;// Out:  r0 = OMX_Sts_NoErr (the only value this routine returns)
;//       *ppSrc advanced past the bytes consumed
;//       pDst[0..3] = transformed and dequantised coefficients
;//
;// Steps:
;//   1. Zero pDst[0..3], then unpack pairs from the source. Each pair is a
;//      flag byte followed by a value:
;//        bit 0x10 set   -> 16-bit value, low byte first
;//        bit 0x10 clear -> signed 8-bit value
;//        bits 0..3      -> destination index
;//        bit 0x20 set   -> this is the last pair
;//   2. 2x2 inverse transform on the four coefficients (ARMv6 SIMD).
;//   3. Multiply by (Scale << Shift), shift right by one, store (NEON).
;//
;// NOTE(review): the index is masked to 4 bits (0..15), but only four
;// halfwords of pDst are cleared and later read back. An index above 3 in
;// the stream would write outside that range; presumably the caller
;// guarantees this cannot happen -- confirm.
;// NOTE(review): M_START/M_END are macros from armCOMM_s.h; the "r9"
;// argument presumably asks them to preserve registers up to r9 -- confirm.
;//-----------------------------------------------------------------------
M_START omxVCM4P10_TransformDequantChromaDCFromPair, r9
LDR pSrc, [ppSrc] ;// pSrc = *ppSrc (current read position)
VMOV dZero, #0
MOV cstOffset, #31 ;// Mask applied to (Flag << 1) in the loop; leaves (Flag & 15) << 1
;//-----------------------------------------------------------------------
;// Firstly, fill all the coefficient values on the <pDst> buffer by zero
;//-----------------------------------------------------------------------
VST1 dZero,[pDst] ;// pDst[0] = pDst[1] = pDst[2] = pDst[3] = 0
LDRB Flag, [pSrc], #1 ;// Preload <Flag> before <unpackLoop>
unpackLoop
TST Flag, #0x10 ;// Z clear (NE) => 16-bit value follows, Z set (EQ) => 8-bit value
LDRSBNE Value2,[pSrc,#1] ;// 16-bit case: Value2 = sign-extended high byte
LDRBNE Value, [pSrc], #2 ;// 16-bit case: Value = low byte, pSrc += 2 (byte loads avoid unaligned access)
AND strOffset, cstOffset, Flag, LSL #1 ;// strOffset = (Flag & 15) << 1, byte offset of the halfword (flags untouched)
LDRSBEQ Value, [pSrc], #1 ;// 8-bit case: Value = sign-extended byte, pSrc += 1
ORRNE Value,Value,Value2, LSL #8 ;// 16-bit case: Value = low byte | (high byte << 8)
TST Flag, #0x20 ;// Computing (Flag & 0x20) to check, if we're done
LDRBEQ Flag, [pSrc], #1 ;// Not done: Flag = *pSrc++, for next iteration (does not change the flags)
STRH Value, [pDst, strOffset] ;// Store low 16 bits of <Value> at byte offset <strOffset>
BEQ unpackLoop ;// Loop while bit 0x20 of the flag just processed was clear
;//--------------------------------------------------
;//InvTransformDC2x2: Inlined (Implemented in ARM V6)
;//--------------------------------------------------
LDMIA pDst, {r0w0, r0w1} ;// r0w0 = |c1|c0| & r0w1 = |c3|c2|
STR pSrc, [ppSrc] ;// Update the bitstream pointer (last use of pSrc; r9 becomes Shift below)
LDR pQPDivTable, =armVCM4P10_QPDivTable ;// QP Division look-up-table base pointer
LDR pQPModTable, =armVCM4P10_VMatrixQPModTable ;// QP Modulo look-up-table base pointer
SADDSUBX r0w0, r0w0, r0w0 ;// [ c00+c01, c00-c01 ] (sum in top halfword, difference in bottom)
SADDSUBX r0w1, r0w1, r0w1 ;// [ c10+c11, c10-c11 ]
LDRSB Shift, [pQPDivTable, QP] ;// Shift = pQPDivTable[QP]
LDRSB Scale, [pQPModTable, QP] ;// Scale = pQPModTable[QP] (overwrites QP, which shares r2)
SADD16 c0w0, r0w0, r0w1 ;// [ d00+d10, d01+d11 ]
SSUB16 c1w0, r0w0, r0w1 ;// [ d00-d10, d01-d11 ]
;//-------------------------------------------------
;//DequantChromaDC2x2: Inlined (Neon Implementation)
;//-------------------------------------------------
LSL Scale, Scale, Shift ;// Scale = Scale << Shift
VMOV dInvTrCoeff, c0w0, c1w0 ;// D0 = c1w0:c0w0 (c0w0 in the low word)
VREV32 dInvTrCoeff,dInvTrCoeff ;// Swap the two halfwords in each word so the top-halfword results come first in memory
VDUP dScale,Scale ;// Broadcast the low 16 bits of Scale to all four lanes
VMULL qDqntCoeff,dInvTrCoeff,dScale ;// Widening multiply: four 32-bit products coeff * Scale
VSHRN dDqntCoeff,qDqntCoeff,#1 ;// Shift each product right by 1 and narrow to 16 bits
VST1 dDqntCoeff,[pDst] ;// Storing all the coefficients at once
MOV return, #OMX_Sts_NoErr ;// No error path exists in this routine
M_END
ENDIF ;// CortexA8
END
|