Source/WebCore/platform/graphics/filters/arm/FELightingNEON.cpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464

/*
 * Copyright (C) 2011 University of Szeged
 * Copyright (C) 2011 Zoltan Herczeg
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF SZEGED ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL UNIVERSITY OF SZEGED OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "config.h"
#include "FELightingNEON.h"

#if CPU(ARM_NEON) && COMPILER(GCC)

#include <wtf/Vector.h>

namespace WebCore {

// These constants are copied to the following SIMD registers:
//   ALPHAX_Q ALPHAY_Q REMAPX_D REMAPY_D

WTF_ALIGNED(short, s_FELightingConstantsForNeon[], 16) = {
    // Alpha coefficients.
    -2, 1, 0, -1, 2, 1, 0, -1,
    0, -1, -2, -1, 0, 1, 2, 1,
    // Remapping indicies.
    0x0f0e, 0x0302, 0x0504, 0x0706,
    0x0b0a, 0x1312, 0x1514, 0x1716,
};

short* feLightingConstantsForNeon()
{
    return s_FELightingConstantsForNeon;
}

#define ASSTRING(str) #str
#define TOSTRING(value) ASSTRING(value)

#define PIXELS_OFFSET TOSTRING(0)
#define WIDTH_OFFSET TOSTRING(4)
#define HEIGHT_OFFSET TOSTRING(8)
#define FLAGS_OFFSET TOSTRING(12)
#define SPECULAR_EXPONENT_OFFSET TOSTRING(16)
#define CONE_EXPONENT_OFFSET TOSTRING(20)
#define FLOAT_ARGUMENTS_OFFSET TOSTRING(24)
#define DRAWING_CONSTANTS_OFFSET TOSTRING(28)
#define NL "\n"

// Register allocation
#define PAINTING_DATA_R       "r11"
#define RESET_WIDTH_R         PAINTING_DATA_R
#define PIXELS_R              "r4"
#define WIDTH_R               "r5"
#define HEIGHT_R              "r6"
#define FLAGS_R               "r7"
#define SPECULAR_EXPONENT_R   "r8"
#define CONE_EXPONENT_R       "r10"
#define SCANLINE_R            "r12"

#define TMP1_Q                "q0"
#define TMP1_D0               "d0"
#define TMP1_S0               "s0"
#define TMP1_S1               "s1"
#define TMP1_D1               "d1"
#define TMP1_S2               "s2"
#define TMP1_S3               "s3"
#define TMP2_Q                "q1"
#define TMP2_D0               "d2"
#define TMP2_S0               "s4"
#define TMP2_S1               "s5"
#define TMP2_D1               "d3"
#define TMP2_S2               "s6"
#define TMP2_S3               "s7"
#define TMP3_Q                "q2"
#define TMP3_D0               "d4"
#define TMP3_S0               "s8"
#define TMP3_S1               "s9"
#define TMP3_D1               "d5"
#define TMP3_S2               "s10"
#define TMP3_S3               "s11"

#define COSINE_OF_ANGLE       "s12"
#define POWF_INT_S            "s13"
#define POWF_FRAC_S           "s14"
#define SPOT_COLOR_Q          "q4"

// Because of VMIN and VMAX CONST_ZERO_S and CONST_ONE_S
// must be placed on the same side of the double vector

// Current pixel position
#define POSITION_Q            "q5"
#define POSITION_X_S          "s20"
#define POSITION_Y_S          "s21"
#define POSITION_Z_S          "s22"
#define CONST_ZERO_HI_D       "d11"
#define CONST_ZERO_S          "s23"

// -------------------------------
//     Variable arguments
// Misc arguments
#define READ1_RANGE           "d12-d15"
#define READ2_RANGE           "d16-d19"
#define READ3_RANGE           "d20-d21"

#define SCALE_S               "s24"
#define SCALE_DIV4_S          "s25"
#define DIFFUSE_CONST_S       "s26"

// Light source position
#define CONE_CUT_OFF_S        "s28"
#define CONE_FULL_LIGHT_S     "s29"
#define CONE_CUT_OFF_RANGE_S  "s30"
#define CONST_ONE_HI_D        "d15"
#define CONST_ONE_S           "s31"

#define LIGHT_Q               "q8"
#define DIRECTION_Q           "q9"
#define COLOR_Q               "q10"
// -------------------------------
//    Constant coefficients
#define READ4_RANGE           "d22-d25"
#define READ5_RANGE           "d26-d27"

#define ALPHAX_Q              "q11"
#define ALPHAY_Q              "q12"
#define REMAPX_D              "d26"
#define REMAPY_D              "d27"
// -------------------------------

#define ALL_ROWS_D            "{d28,d29,d30}"
#define TOP_ROW_D             "d28"
#define MIDDLE_ROW_D          "d29"
#define BOTTOM_ROW_D          "d30"

#define GET_LENGTH(source, temp) \
    "vmul.f32 " temp##_Q ", " source##_Q ", " source##_Q NL \
    "vadd.f32 " source##_S3 ", " temp##_S0 ", " temp##_S1 NL \
    "vadd.f32 " source##_S3 ", " source##_S3 ", " temp##_S2 NL \
    "vsqrt.f32 " source##_S3 ", " source##_S3 NL

// destination##_S3 can contain the multiply of length.
#define DOT_PRODUCT(destination, source1, source2) \
    "vmul.f32 " destination##_Q ", " source1##_Q ", " source2##_Q NL \
    "vadd.f32 " destination##_S0 ", " destination##_S0 ", " destination##_S1 NL \
    "vadd.f32 " destination##_S0 ", " destination##_S0 ", " destination##_S2 NL

#define MULTIPLY_BY_DIFFUSE_CONST(normalVectorLength, dotProductLength) \
    "tst " FLAGS_R ", #" TOSTRING(FLAG_DIFFUSE_CONST_IS_1) NL \
    "vmuleq.f32 " TMP2_S1 ", " DIFFUSE_CONST_S ", " normalVectorLength NL \
    "vdiveq.f32 " TMP2_S1 ", " TMP2_S1 ", " dotProductLength NL \
    "vdivne.f32 " TMP2_S1 ", " normalVectorLength ", " dotProductLength NL

#define POWF_SQR(value, exponent, current, remaining) \
    "tst " exponent ", #" ASSTRING(current) NL \
    "vmulne.f32 " value ", " value ", " POWF_INT_S NL \
    "tst " exponent ", #" ASSTRING(remaining) NL \
    "vmulne.f32 " POWF_INT_S ", " POWF_INT_S ", " POWF_INT_S NL

#define POWF_SQRT(value, exponent, current, remaining) \
    "tst " exponent ", #" ASSTRING(remaining) NL \
    "vsqrtne.f32 " POWF_FRAC_S ", " POWF_FRAC_S NL \
    "tst " exponent ", #" ASSTRING(current) NL \
    "vmulne.f32 " value ", " value ", " POWF_FRAC_S NL

// This simplified powf function is sufficiently accurate.
#define POWF(value, exponent) \
    "tst " exponent ", #0xfc0" NL \
    "vmovne.f32 " POWF_INT_S ", " value NL \
    "tst " exponent ", #0x03f" NL \
    "vmovne.f32 " POWF_FRAC_S ", " value NL \
    "vmov.f32 " value ", " CONST_ONE_S NL \
    \
    POWF_SQR(value, exponent, 0x040, 0xf80) \
    POWF_SQR(value, exponent, 0x080, 0xf00) \
    POWF_SQR(value, exponent, 0x100, 0xe00) \
    POWF_SQR(value, exponent, 0x200, 0xc00) \
    POWF_SQR(value, exponent, 0x400, 0x800) \
    "tst " exponent ", #0x800" NL \
    "vmulne.f32 " value ", " value ", " POWF_INT_S NL \
    \
    POWF_SQRT(value, exponent, 0x20, 0x3f) \
    POWF_SQRT(value, exponent, 0x10, 0x1f) \
    POWF_SQRT(value, exponent, 0x08, 0x0f) \
    POWF_SQRT(value, exponent, 0x04, 0x07) \
    POWF_SQRT(value, exponent, 0x02, 0x03) \
    POWF_SQRT(value, exponent, 0x01, 0x01)

// The following algorithm is an ARM-NEON optimized version of
// the main loop found in FELighting.cpp. Since the whole code
// is redesigned to be as effective as possible (ARM specific
// thinking), it is four times faster than its C++ counterpart.

asm ( // NOLINT
".globl " TOSTRING(neonDrawLighting) NL
TOSTRING(neonDrawLighting) ":" NL
    // Because of the clever register allocation, nothing is stored on the stack
    // except the saved registers.
    // Stack must be aligned to 8 bytes.
    "stmdb sp!, {r4-r8, r10, r11, lr}" NL
    "vstmdb sp!, {d8-d15}" NL
    "mov " PAINTING_DATA_R ", r0" NL

    // The following two arguments are loaded to SIMD registers.
    "ldr r0, [" PAINTING_DATA_R ", #" FLOAT_ARGUMENTS_OFFSET "]" NL
    "ldr r1, [" PAINTING_DATA_R ", #" DRAWING_CONSTANTS_OFFSET "]" NL
    "ldr " PIXELS_R ", [" PAINTING_DATA_R ", #" PIXELS_OFFSET "]" NL
    "ldr " WIDTH_R ", [" PAINTING_DATA_R ", #" WIDTH_OFFSET "]" NL
    "ldr " HEIGHT_R ", [" PAINTING_DATA_R ", #" HEIGHT_OFFSET "]" NL
    "ldr " FLAGS_R ", [" PAINTING_DATA_R ", #" FLAGS_OFFSET "]" NL
    "ldr " SPECULAR_EXPONENT_R ", [" PAINTING_DATA_R ", #" SPECULAR_EXPONENT_OFFSET "]" NL
    "ldr " CONE_EXPONENT_R ", [" PAINTING_DATA_R ", #" CONE_EXPONENT_OFFSET "]" NL

    // Load all data to the SIMD registers with the least number of instructions.
    "vld1.f32 { " READ1_RANGE " }, [r0]!" NL
    "vld1.f32 { " READ2_RANGE " }, [r0]!" NL
    "vld1.f32 { " READ3_RANGE " }, [r0]!" NL
    "vld1.s16 {" READ4_RANGE "}, [r1]!" NL
    "vld1.s16 {" READ5_RANGE "}, [r1]!" NL

    // Initializing local variables.
    "mov " SCANLINE_R ", " WIDTH_R ", lsl #2" NL
    "add " SCANLINE_R ", " SCANLINE_R ", #8" NL
    "add " PIXELS_R ", " PIXELS_R ", " SCANLINE_R NL
    "add " PIXELS_R ", " PIXELS_R ", #3" NL
    "mov r0, #0" NL
    "vmov.f32 " CONST_ZERO_S ", r0" NL
    "vmov.f32 " POSITION_Y_S ", " CONST_ONE_S NL
    "tst " FLAGS_R ", #" TOSTRING(FLAG_SPOT_LIGHT) NL
    "vmov.f32 " SPOT_COLOR_Q ", " COLOR_Q NL
    "mov " RESET_WIDTH_R ", " WIDTH_R NL

".mainloop:" NL
    "mov r3, #3" NL
    "vmov.f32 " POSITION_X_S ", " CONST_ONE_S NL

".scanline:" NL
    // The ROW registers are storing the alpha channel of the last three pixels.
    // The alpha channel is stored as signed short (sint16) values. The fourth value
    // is garbage. The following instructions are shifting out the unnecessary alpha
    // values and load the next ones.
    "ldrb r0, [" PIXELS_R ", -" SCANLINE_R "]" NL
    "ldrb r1, [" PIXELS_R ", +" SCANLINE_R "]" NL
    "ldrb r2, [" PIXELS_R "], #4" NL
    "vext.s16 " TOP_ROW_D ", " TOP_ROW_D ", " TOP_ROW_D ", #3" NL
    "vext.s16 " MIDDLE_ROW_D ", " MIDDLE_ROW_D ", " MIDDLE_ROW_D ", #3" NL
    "vext.s16 " BOTTOM_ROW_D ", " BOTTOM_ROW_D ", " BOTTOM_ROW_D ", #3" NL
    "vmov.s16 " TOP_ROW_D "[1], r0" NL
    "vmov.s16 " MIDDLE_ROW_D "[1], r2" NL
    "vmov.s16 " BOTTOM_ROW_D "[1], r1" NL

    // The two border pixels (rightmost and leftmost) are skipped when
    // the next scanline is reached. It also jumps, when the algorithm
    // is started, and the first free alpha values are loaded to each row.
    "subs r3, r3, #1" NL
    "bne .scanline" NL

    // The light vector goes to TMP1_Q. It is constant in case of distant light.
    // The fourth value contains the length of the light vector.
    "tst " FLAGS_R ", #" TOSTRING(FLAG_POINT_LIGHT | FLAG_SPOT_LIGHT) NL
    "beq .distantLight" NL

    "vmov.s16 r3, " MIDDLE_ROW_D "[2]" NL
    "vmov.f32 " POSITION_Z_S ", r3" NL
    "vcvt.f32.s32 " POSITION_Z_S ", " POSITION_Z_S NL
    "vmul.f32 " POSITION_Z_S ", " POSITION_Z_S ", " SCALE_S NL

    "vsub.f32 " TMP1_Q ", " LIGHT_Q ", " POSITION_Q NL
    GET_LENGTH(TMP1, TMP2)

    "tst " FLAGS_R ", #" TOSTRING(FLAG_SPOT_LIGHT) NL
    "bne .cosineOfAngle" NL
".visiblePixel:" NL

    //     | -1  0  1 |      | -1 -2 -1 |
    // X = | -2  0  2 |  Y = |  0  0  0 |
    //     | -1  0  1 |      |  1  2  1 |

    // Multiply the alpha values by the X and Y matrices.

    // Moving the 8 alpha value to TMP3.
    "vtbl.8 " TMP3_D0 ", " ALL_ROWS_D ", " REMAPX_D NL
    "vtbl.8 " TMP3_D1 ", " ALL_ROWS_D ", " REMAPY_D NL

    "vmul.s16 " TMP2_Q ", " TMP3_Q ", " ALPHAX_Q NL
    "vpadd.s16 " TMP2_D0 ", " TMP2_D0 ", " TMP2_D1 NL
    "vpadd.s16 " TMP2_D0 ", " TMP2_D0 ", " TMP2_D0 NL
    "vpadd.s16 " TMP2_D0 ", " TMP2_D0 ", " TMP2_D0 NL
    "vmov.s16 r0, " TMP2_D0 "[0]" NL

    "vmul.s16 " TMP2_Q ", " TMP3_Q ", " ALPHAY_Q NL
    "vpadd.s16 " TMP2_D0 ", " TMP2_D0 ", " TMP2_D1 NL
    "vpadd.s16 " TMP2_D0 ", " TMP2_D0 ", " TMP2_D0 NL
    "vpadd.s16 " TMP2_D0 ", " TMP2_D0 ", " TMP2_D0 NL
    "vmov.s16 r1, " TMP2_D0 "[0]" NL

    // r0 and r1 contains the X and Y coordinates of the
    // normal vector, respectively.

    // Calculating the spot light strength.
    "tst " FLAGS_R ", #" TOSTRING(FLAG_SPOT_LIGHT) NL
    "beq .endLight" NL

    "vneg.f32 " TMP3_S1 ", " COSINE_OF_ANGLE NL
    "tst " FLAGS_R ", #" TOSTRING(FLAG_CONE_EXPONENT_IS_1) NL
    "beq .coneExpPowf" NL
".coneExpPowfFinished:" NL

    // Smoothing the cone edge if necessary.
    "vcmp.f32 " COSINE_OF_ANGLE ", " CONE_FULL_LIGHT_S NL
    "fmstat" NL
    "bhi .cutOff" NL
".cutOffFinished:" NL

    "vmin.f32 " TMP3_D0 ", " TMP3_D0 ", " CONST_ONE_HI_D NL
    "vmul.f32 " COLOR_Q ", " SPOT_COLOR_Q ", " TMP3_D0 "[1]" NL

".endLight:" NL
    // Summarize:
    // r0 and r1 contains the normalVector.
    // TMP1_Q contains the light vector and its length.
    // COLOR_Q contains the color of the light vector.

    // Test whether both r0 and r1 are zero (Normal vector is (0, 0, 1)).
    "orrs r2, r0, r1" NL
    "bne .normalVectorIsNonZero" NL

    "tst " FLAGS_R ", #" TOSTRING(FLAG_SPECULAR_LIGHT) NL
    "bne .specularLight1" NL

    // Calculate diffuse light strength.
    MULTIPLY_BY_DIFFUSE_CONST(TMP1_S2, TMP1_S3)
    "b .lightStrengthCalculated" NL

".specularLight1:" NL
    // Calculating specular light strength.
    "vadd.f32 " TMP1_S2 ", " TMP1_S2 ", " TMP1_S3 NL
    GET_LENGTH(TMP1, TMP2)

    // When the exponent is 1, we don't need to call an expensive powf function.
    "tst " FLAGS_R ", #" TOSTRING(FLAG_SPECULAR_EXPONENT_IS_1) NL
    "vdiveq.f32 " TMP2_S1 ", " TMP1_S2 ", " TMP1_S3 NL
    "beq .specularExpPowf" NL

    MULTIPLY_BY_DIFFUSE_CONST(TMP1_S2, TMP1_S3)
    "b .lightStrengthCalculated" NL

".normalVectorIsNonZero:" NL
    // Normal vector goes to TMP2, and its length is calculated as well.
    "vmov.s32 " TMP2_S0 ", r0" NL
    "vcvt.f32.s32 " TMP2_S0 ", " TMP2_S0 NL
    "vmul.f32 " TMP2_S0 ", " TMP2_S0 ", " SCALE_DIV4_S NL
    "vmov.s32 " TMP2_S1 ", r1" NL
    "vcvt.f32.s32 " TMP2_S1 ", " TMP2_S1 NL
    "vmul.f32 " TMP2_S1 ", " TMP2_S1 ", " SCALE_DIV4_S NL
    "vmov.f32 " TMP2_S2 ", " CONST_ONE_S NL
    GET_LENGTH(TMP2, TMP3)

    "tst " FLAGS_R ", #" TOSTRING(FLAG_SPECULAR_LIGHT) NL
    "bne .specularLight2" NL

    // Calculating diffuse light strength.
    DOT_PRODUCT(TMP3, TMP2, TMP1)
    MULTIPLY_BY_DIFFUSE_CONST(TMP3_S0, TMP3_S3)
    "b .lightStrengthCalculated" NL

".specularLight2:" NL
    // Calculating specular light strength.
    "vadd.f32 " TMP1_S2 ", " TMP1_S2 ", " TMP1_S3 NL
    GET_LENGTH(TMP1, TMP3)
    DOT_PRODUCT(TMP3, TMP2, TMP1)

    // When the exponent is 1, we don't need to call an expensive powf function.
    "tst " FLAGS_R ", #" TOSTRING(FLAG_SPECULAR_EXPONENT_IS_1) NL
    "vdiveq.f32 " TMP2_S1 ", " TMP3_S0 ", " TMP3_S3 NL
    "beq .specularExpPowf" NL
    MULTIPLY_BY_DIFFUSE_CONST(TMP3_S0, TMP3_S3)

".lightStrengthCalculated:" NL
    // TMP2_S1 contains the light strength. Clamp it to [0, 1]
    "vmax.f32 " TMP2_D0 ", " TMP2_D0 ", " CONST_ZERO_HI_D NL
    "vmin.f32 " TMP2_D0 ", " TMP2_D0 ", " CONST_ONE_HI_D NL
    "vmul.f32 " TMP3_Q ", " COLOR_Q ", " TMP2_D0 "[1]" NL
    "vcvt.u32.f32 " TMP3_Q ", " TMP3_Q NL
    "vmov.u32 r2, r3, " TMP3_S0 ", " TMP3_S1 NL
    // The color values are stored in-place.
    "strb r2, [" PIXELS_R ", #-11]" NL
    "strb r3, [" PIXELS_R ", #-10]" NL
    "vmov.u32 r2, " TMP3_S2 NL
    "strb r2, [" PIXELS_R ", #-9]" NL

    // Continue to the next pixel.
".blackPixel:" NL
    "vadd.f32 " POSITION_X_S ", " CONST_ONE_S NL
    "mov r3, #1" NL
    "subs " WIDTH_R ", " WIDTH_R ", #1" NL
    "bne .scanline" NL

    // If the end of the scanline is reached, we continue
    // to the next scanline.
    "vadd.f32 " POSITION_Y_S ", " CONST_ONE_S NL
    "mov " WIDTH_R ", " RESET_WIDTH_R NL
    "subs " HEIGHT_R ", " HEIGHT_R ", #1" NL
    "bne .mainloop" NL

    // Return.
    "vldmia sp!, {d8-d15}" NL
    "ldmia sp!, {r4-r8, r10, r11, pc}" NL

".distantLight:" NL
    // In case of distant light, the light vector is constant,
    // we simply copy it.
    "vmov.f32 " TMP1_Q ", " LIGHT_Q NL
    "b .visiblePixel" NL

".cosineOfAngle:" NL
    // If the pixel is outside of the cone angle, it is simply a black pixel.
    DOT_PRODUCT(TMP3, TMP1, DIRECTION)
    "vdiv.f32 " COSINE_OF_ANGLE ", " TMP3_S0 ", " TMP1_S3 NL
    "vcmp.f32 " COSINE_OF_ANGLE ", " CONE_CUT_OFF_S NL
    "fmstat" NL
    "bls .visiblePixel" NL
    "mov r0, #0" NL
    "strh r0, [" PIXELS_R ", #-11]" NL
    "strb r0, [" PIXELS_R ", #-9]" NL
    "b .blackPixel" NL

".cutOff:" NL
    // Smoothing the light strength on the cone edge.
    "vsub.f32 " TMP3_S0 ", " CONE_CUT_OFF_S ", " COSINE_OF_ANGLE NL
    "vdiv.f32 " TMP3_S0 ", " TMP3_S0 ", " CONE_CUT_OFF_RANGE_S NL
    "vmul.f32 " TMP3_S1 ", " TMP3_S1 ", " TMP3_S0 NL
    "b .cutOffFinished" NL

".coneExpPowf:" NL
    POWF(TMP3_S1, CONE_EXPONENT_R)
    "b .coneExpPowfFinished" NL

".specularExpPowf:" NL
    POWF(TMP2_S1, SPECULAR_EXPONENT_R)
    "tst " FLAGS_R ", #" TOSTRING(FLAG_DIFFUSE_CONST_IS_1) NL
    "vmuleq.f32 " TMP2_S1 ", " TMP2_S1 ", " DIFFUSE_CONST_S NL
    "b .lightStrengthCalculated" NL
); // NOLINT

} // namespace WebCore

#endif // CPU(ARM_NEON) && COMPILER(GCC)