@ path: root/media/libstagefright/codecs/on2/h264dec/source/arm_neon_asm_gcc/h264bsdWriteMacroblock.S
@ blob: 495d560181299376e17c80bb5d70ec888bc7c3cb
@
@ Copyright (C) 2009 The Android Open Source Project
@
@ Licensed under the Apache License, Version 2.0 (the "License");
@ you may not use this file except in compliance with the License.
@ You may obtain a copy of the License at
@
@      http://www.apache.org/licenses/LICENSE-2.0
@
@ Unless required by applicable law or agreed to in writing, software
@ distributed under the License is distributed on an "AS IS" BASIS,
@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ See the License for the specific language governing permissions and
@ limitations under the License.
@

#include "asm_common.S"

    /* require8 / preserve8 are not defined in this file; presumably they are
       macros from asm_common.S that record the 8-byte stack alignment
       contract for this object (TODO confirm against asm_common.S). */
    require8
    preserve8

    /* Assemble the following code as ARM (A32) instructions, enable the NEON
       instruction set for the assembler, and emit into the .text section. */
    .arm
    .fpu neon
    .text

/* Core register aliases.
   image (r0) and data (r1) hold the two incoming arguments. width, luma, cb,
   cr and cwidth are working registers that h264bsdWriteMacroblock fills in
   itself: width/luma/cb/cr are loaded from the image descriptor and cwidth is
   derived from width. r4-r6 are saved and restored by the function. */
#define image   r0
#define data    r1
#define width   r2
#define luma    r3
#define cb      r4
#define cr      r5
#define cwidth  r6

/* -- NEON registers --
   qRowN names the 128-bit register QN and dRowN names the 64-bit register DN,
   both typed as vectors of unsigned bytes (.U8) so that untyped VLD1/VST1
   mnemonics pick up the element size from the operand.
   Architecturally QN overlaps the register pair D(2N):D(2N+1), so
   dRow0-dRow15 address the low/high halves of qRow0-qRow7. */

/* 128-bit aliases: one qRow holds 16 bytes. */
#define qRow0     Q0.U8
#define qRow1     Q1.U8
#define qRow2     Q2.U8
#define qRow3     Q3.U8
#define qRow4     Q4.U8
#define qRow5     Q5.U8
#define qRow6     Q6.U8
#define qRow7     Q7.U8
#define qRow8     Q8.U8
#define qRow9     Q9.U8
#define qRow10    Q10.U8
#define qRow11    Q11.U8
#define qRow12    Q12.U8
#define qRow13    Q13.U8
#define qRow14    Q14.U8
#define qRow15    Q15.U8

/* 64-bit aliases: one dRow holds 8 bytes (half of a qRow, see above). */
#define dRow0     D0.U8
#define dRow1     D1.U8
#define dRow2     D2.U8
#define dRow3     D3.U8
#define dRow4     D4.U8
#define dRow5     D5.U8
#define dRow6     D6.U8
#define dRow7     D7.U8
#define dRow8     D8.U8
#define dRow9     D9.U8
#define dRow10    D10.U8
#define dRow11    D11.U8
#define dRow12    D12.U8
#define dRow13    D13.U8
#define dRow14    D14.U8
#define dRow15    D15.U8

/*------------------------------------------------------------------------------

    Function: h264bsdWriteMacroblock

        Functional description:
            Write one macroblock into the image. Both luma and chroma
            components will be written at the same time: 16 rows of 16 bytes
            for luma and 8 rows of 8 bytes for each of cb and cr.

        Inputs:
            image   (r0) pointer to the image descriptor; the width and the
                    luma/cb/cr destination pointers are read from it
            data    (r1) pointer to macroblock data to be written, 256 values
                    for luma followed by 64 values for each chroma component
                    (cb first, then cr)

        Outputs:
            The macroblock is stored through the luma, cb and cr pointers
            read from the image descriptor.

        Returns:
            none

        Register usage:
            r1-r3, q0-q3 and q8-q15 are overwritten (r1 is left pointing just
            past the 384 bytes that were read). r4-r6 and q4-q7 are saved on
            entry and restored on exit, using 80 bytes of stack.

        Alignment:
            Every destination store carries an alignment qualifier, so the
            luma pointer must be 16-byte aligned and the cb/cr pointers
            8-byte aligned. The source data is read without an alignment
            qualifier.

------------------------------------------------------------------------------*/

/* Byte offsets of the image descriptor fields read below. They have to match
   the layout of the image structure declared on the C side, which is not
   visible in this file. */
#define IMAGE_WIDTH_OFFSET  0x04
#define IMAGE_LUMA_OFFSET   0x0C
#define IMAGE_CB_OFFSET     0x10
#define IMAGE_CR_OFFSET     0x14

function h264bsdWriteMacroblock, export=1
    /* Four core registers are 16 bytes, so sp keeps its 8-byte alignment;
       the saved lr is popped straight into pc on exit. */
    PUSH    {r4-r6,lr}
    /* q4-q7 (d8-d15) are used as row buffers below and restored on exit. */
    VPUSH   {q4-q7}

    LDR     width, [image, #IMAGE_WIDTH_OFFSET]
    LDR     luma, [image, #IMAGE_LUMA_OFFSET]
    LDR     cb, [image, #IMAGE_CB_OFFSET]
    LDR     cr, [image, #IMAGE_CR_OFFSET]


    /* Write luma.
       Loads of the source (two 16-byte rows per VLD1) are interleaved with
       the stores of rows that were loaded earlier. The instruction order is
       deliberate: each register is stored before a later load reuses it. */
    VLD1    {qRow0, qRow1}, [data]!
    LSL     width, width, #4            /* width * 16 = luma row stride    */
    VLD1    {qRow2, qRow3}, [data]!
    LSR     cwidth, width, #1           /* half of that = chroma row stride */
    VST1    {qRow0}, [luma,:128], width
    VLD1    {qRow4, qRow5}, [data]!
    VST1    {qRow1}, [luma,:128], width
    VLD1    {qRow6, qRow7}, [data]!
    VST1    {qRow2}, [luma,:128], width
    VLD1    {qRow8, qRow9}, [data]!
    VST1    {qRow3}, [luma,:128], width
    VLD1    {qRow10, qRow11}, [data]!
    VST1    {qRow4}, [luma,:128], width
    VLD1    {qRow12, qRow13}, [data]!
    VST1    {qRow5}, [luma,:128], width
    VLD1    {qRow14, qRow15}, [data]!
    VST1    {qRow6}, [luma,:128], width

    /* Luma rows 0-6 are already stored, so q0-q7 can be reloaded with the
       chroma data while the remaining luma rows (q7-q15) are written out.
       After these four loads d0-d7 hold the eight cb rows and d8-d15 the
       eight cr rows. */
    VLD1    {qRow0, qRow1}, [data]!     /* cb rows 0,1,2,3 */
    VST1    {qRow7}, [luma,:128], width
    VLD1    {qRow2, qRow3}, [data]!     /* cb rows 4,5,6,7 */
    VST1    {qRow8}, [luma,:128], width
    VLD1    {qRow4, qRow5}, [data]!     /* cr rows 0,1,2,3 */
    VST1    {qRow9}, [luma,:128], width
    VLD1    {qRow6, qRow7}, [data]!     /* cr rows 4,5,6,7 */
    VST1    {qRow10}, [luma,:128], width
    VST1    {dRow0}, [cb,:64], cwidth
    VST1    {dRow8}, [cr,:64], cwidth
    VST1    {qRow11}, [luma,:128], width
    VST1    {dRow1}, [cb,:64], cwidth
    VST1    {dRow9}, [cr,:64], cwidth
    VST1    {qRow12}, [luma,:128], width
    VST1    {dRow2}, [cb,:64], cwidth
    VST1    {dRow10}, [cr,:64], cwidth
    VST1    {qRow13}, [luma,:128], width
    VST1    {dRow3}, [cb,:64], cwidth
    VST1    {dRow11}, [cr,:64], cwidth
    VST1    {qRow14}, [luma,:128], width
    VST1    {dRow4}, [cb,:64], cwidth
    VST1    {dRow12}, [cr,:64], cwidth
    /* Last luma row: no post-increment needed. The address is the first row
       address plus 15 strides of a multiple of 16 bytes, so it satisfies
       the same 128-bit alignment as the rows above. */
    VST1    {qRow15}, [luma,:128]
    VST1    {dRow5}, [cb,:64], cwidth
    VST1    {dRow13}, [cr,:64], cwidth
    VST1    {dRow6}, [cb,:64], cwidth
    VST1    {dRow14}, [cr,:64], cwidth
    /* Last chroma rows: no post-increment needed. */
    VST1    {dRow7}, [cb,:64]
    VST1    {dRow15}, [cr,:64]

    VPOP    {q4-q7}
    POP     {r4-r6,pc}                  /* restore and return */

    .endfunc