300369801b70a3f0bd1d06cd50c0a983836dd7c7
[WebKit-https.git] / Source / WebCore / platform / graphics / filters / arm / FEGaussianBlurNEON.cpp
1 /*
2  * Copyright (C) 2011 University of Szeged
3  * Copyright (C) 2011 Zoltan Herczeg
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF SZEGED ``AS IS'' AND ANY
15  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL UNIVERSITY OF SZEGED OR
18  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  */
26
27 #include "config.h"
28 #include "FEGaussianBlurNEON.h"
29
30 #if CPU(ARM_NEON) && CPU(ARM_TRADITIONAL) && COMPILER(GCC)
31
32 #include <wtf/Alignment.h>
33
34 namespace WebCore {
35
// Eight-byte index table, declared with 16-byte alignment, for use as the
// index operand of a NEON "vtbl.8" lookup over a 16-byte (two D register)
// table: indices 0, 4, 8 and 12 select the lowest byte of each 32-bit lane,
// and index 16 is out of range for such a table, so vtbl yields 0 for the
// last four result bytes.
// NOTE(review): presumably this is the data reached through the pointer stored
// at PAINTING_CONSTANTS_OFFSET in the parameter block -- confirm in the caller.
36 static WTF_ALIGNED(unsigned char, s_FEGaussianBlurConstantsForNeon[], 16) = {
37     // Mapping from NEON to ARM registers.
38     0, 4,  8,  12, 16, 16, 16, 16
39 };
40
// Accessor for the file-local NEON lookup table: hands back the address of
// the first byte of s_FEGaussianBlurConstantsForNeon.
41 unsigned char* feGaussianBlurConstantsForNeon()
42 {
43     return &s_FEGaussianBlurConstantsForNeon[0];
44 }
45
// Two-level stringification: TOSTRING expands its argument first and then
// ASSTRING turns the result into a string literal, so that the values can be
// pasted into the assembly text below.
46 #define ASSTRING(str) #str
47 #define TOSTRING(value) ASSTRING(value)
48
// Offsets (as string literals) of the 32-bit fields that the assembly routines
// read from the parameter block addressed by r2 on entry.
49 #define STRIDE_OFFSET TOSTRING(0)
50 #define STRIDE_WIDTH_OFFSET TOSTRING(4)
51 #define STRIDE_LINE_OFFSET TOSTRING(8)
52 #define STRIDE_LINE_WIDTH_OFFSET TOSTRING(12)
53 #define REMAINING_STRIDES_OFFSET TOSTRING(16)
54 #define DISTANCE_LEFT_OFFSET TOSTRING(20)
55 #define DISTANCE_RIGHT_OFFSET TOSTRING(24)
56 #define INVERTED_KERNEL_SIZE_OFFSET TOSTRING(28)
57 #define PAINTING_CONSTANTS_OFFSET TOSTRING(32)
// Terminates each assembly statement inside the concatenated asm string.
58 #define NL "\n"
59
// Symbolic names for the ARM core registers used by both assembly routines.
// r0 and r1 are used as the source and destination pointers from the first
// instruction on, and r2 initially addresses the parameter block; once the
// parameters are loaded r2 is reused as LEFT_R. lr is saved by the prologue
// and then reused as a general-purpose register. r12 is used without being
// saved by the prologue.
60 // Register allocation.
61 #define SOURCE_R                "r0"
62 #define DESTINATION_R           "r1"
63 #define LEFT_R                  "r2"
64 #define RIGHT_R                 "r3"
65 #define SOURCE_END_R            "r4"
66 #define DESTINATION_END_R       "r5"
67 #define STRIDE_R                "r6"
68 #define STRIDE_WIDTH_R          "r7"
69 #define STRIDE_LINE_R           "r8"
70 #define SOURCE_LINE_END_R       "r10"
71 #define DISTANCE_LEFT_R         "r11"
72 #define DISTANCE_RIGHT_R        "r12"
73 #define MAX_KERNEL_SIZE_R       "lr"
74
// Second names for registers that serve a different purpose in another phase
// of a routine; each alias shares the physical register of the name it maps to.
75 // Alternate names.
76 #define INIT_INVERTED_KERNEL_SIZE_R SOURCE_END_R
77 #define INIT_PAINTING_CONSTANTS_R DESTINATION_END_R
78 #define INIT_SUM_R LEFT_R
79 #define REMAINING_STRIDES_R SOURCE_LINE_END_R
80
// NEON registers. All PIXEL_* names are views of q2: d4/d5 are its two
// halves, d4[0], d4[1], d5[0], d5[1] are its four 32-bit lanes, and s9 / s10
// are the single-precision names of lanes 1 and 2.
81 #define INVERTED_KERNEL_SIZE_Q  "q0"
82 #define SUM_Q                   "q1"
83 #define PIXEL_Q                 "q2"
84 #define PIXEL_D0                "d4"
85 #define PIXEL_D1                "d5"
86 #define PIXEL_D00               "d4[0]"
87 #define PIXEL_D01               "d4[1]"
88 #define PIXEL_S1                "s9"
89 #define PIXEL_D10               "d5[0]"
90 #define PIXEL_S2                "s10"
91 #define PIXEL_D11               "d5[1]"
// s12 is the lowest lane of q3, i.e. outside q0-q2, so the loops do not
// overwrite it.
92 #define REMAINING_STRIDES_S0    "s12"
93
// Despite the _Q suffix this is a 64-bit D register; it holds the vtbl index
// table.
94 #define REMAP_NEON_ARM_Q        "d16"
95
// neonDrawAllChannelGaussianBlur: hand-written ARM/NEON routine emitted as a
// global assembly symbol.
//
// On entry: r0 = source pointer, r1 = destination pointer, r2 = address of
// the parameter block (fields at the *_OFFSET offsets).
// NOTE(review): the C-level prototype is not visible in this file -- confirm
// in the header.
//
// For every line it keeps a running per-byte sum of the 32-bit pixels inside
// the window [LEFT, RIGHT), scales the sum by the inverted kernel size, and
// stores the packed result; the window then slides forward by one stride.
// All pointer comparisons are unsigned (bcc / bcs).
// NOTE(review): unlike the alpha-channel routine there is no early-out before
// the main loop, so its body always runs at least once.
96 asm ( // NOLINT
97 ".globl " TOSTRING(neonDrawAllChannelGaussianBlur) NL
98 TOSTRING(neonDrawAllChannelGaussianBlur) ":" NL
    // Save the callee registers used below plus the return address, then read
    // the parameters from the block addressed by r2.
99     "stmdb sp!, {r4-r8, r10, r11, lr}" NL
100     "ldr " STRIDE_R ", [r2, #" STRIDE_OFFSET "]" NL
101     "ldr " STRIDE_WIDTH_R ", [r2, #" STRIDE_WIDTH_OFFSET "]" NL
102     "ldr " DISTANCE_LEFT_R ", [r2, #" DISTANCE_LEFT_OFFSET "]" NL
103     "ldr " DISTANCE_RIGHT_R ", [r2, #" DISTANCE_RIGHT_OFFSET "]" NL
104     "ldr " STRIDE_LINE_R ", [r2, #" STRIDE_LINE_OFFSET "]" NL
105     "ldr " SOURCE_LINE_END_R ", [r2, #" STRIDE_LINE_WIDTH_OFFSET "]" NL
106     "ldr " INIT_INVERTED_KERNEL_SIZE_R ", [r2, #" INVERTED_KERNEL_SIZE_OFFSET "]" NL
107     "ldr " INIT_PAINTING_CONSTANTS_R ", [r2, #" PAINTING_CONSTANTS_OFFSET "]" NL
108
    // Scale both distances by the stride so they become address offsets,
    // clamp the initial window to min(distanceRight, strideWidth), turn the
    // line extent into an absolute end address, broadcast the inverted kernel
    // size to all four lanes, and load the 8-byte vtbl index table. The
    // writeback on the table pointer is not used afterwards: that register is
    // overwritten as DESTINATION_END_R.
109     // Initialize locals.
110     "mul " DISTANCE_LEFT_R ", " DISTANCE_LEFT_R ", " STRIDE_R NL
111     "mul " DISTANCE_RIGHT_R ", " DISTANCE_RIGHT_R ", " STRIDE_R NL
112     "mov " MAX_KERNEL_SIZE_R ", " DISTANCE_RIGHT_R NL
113     "cmp " MAX_KERNEL_SIZE_R ", " STRIDE_WIDTH_R NL
114     "movcs " MAX_KERNEL_SIZE_R ", " STRIDE_WIDTH_R NL
115     "add " SOURCE_LINE_END_R ", " SOURCE_LINE_END_R ", " SOURCE_R NL
116     "vdup.f32 " INVERTED_KERNEL_SIZE_Q ", " INIT_INVERTED_KERNEL_SIZE_R NL
117     "vld1.f32 { " REMAP_NEON_ARM_Q " }, [" INIT_PAINTING_CONSTANTS_R "]!" NL
118
    // One iteration per line; SOURCE_R and DESTINATION_R point at its start.
119 ".allChannelMainLoop:" NL
120
    // Sum the pixels from SOURCE_R up to SOURCE_R + MAX_KERNEL_SIZE_R, one
    // stride apart. Each 32-bit pixel is widened u8 -> u16 -> u32 so that its
    // four bytes occupy the four 32-bit lanes of PIXEL_Q.
121     // Initialize the sum variable.
122     "vmov.u32 " SUM_Q ", #0" NL
123     "mov " INIT_SUM_R ", " SOURCE_R NL
124     "add " SOURCE_END_R ", " SOURCE_R ", " MAX_KERNEL_SIZE_R NL
125     "cmp " INIT_SUM_R ", " SOURCE_END_R NL
126     "bcs .allChannelInitSumDone" NL
127 ".allChannelInitSum:" NL
128     "vld1.u32 " PIXEL_D00 ", [" INIT_SUM_R "], " STRIDE_R NL
129     "vmovl.u8 " PIXEL_Q ", " PIXEL_D0 NL
130     "vmovl.u16 " PIXEL_Q ", " PIXEL_D0 NL
131     "vadd.u32 " SUM_Q ", " SUM_Q ", " PIXEL_Q NL
132     "cmp " INIT_SUM_R ", " SOURCE_END_R NL
133     "bcc .allChannelInitSum" NL
134 ".allChannelInitSumDone:" NL
135
    // Set the end addresses for this line and the two window edges: LEFT_R is
    // the pixel that leaves the window, RIGHT_R the pixel that enters it.
136     // Blurring.
137     "add " SOURCE_END_R ", " SOURCE_R ", " STRIDE_WIDTH_R NL
138     "add " DESTINATION_END_R ", " DESTINATION_R ", " STRIDE_WIDTH_R NL
139     "sub " LEFT_R ", " SOURCE_R ", " DISTANCE_LEFT_R NL
140     "add " RIGHT_R ", " SOURCE_R ", " DISTANCE_RIGHT_R NL
141
    // Output = sum * invertedKernelSize, computed in float and converted back
    // to u32; vtbl gathers bytes of the four lanes according to the index
    // table in d16 into one 32-bit word, which is stored with post-increment.
142 ".allChannelBlur:" NL
143     "vcvt.f32.u32 " PIXEL_Q ", " SUM_Q NL
144     "vmul.f32 " PIXEL_Q ", " PIXEL_Q ", " INVERTED_KERNEL_SIZE_Q NL
145     "vcvt.u32.f32 " PIXEL_Q ", " PIXEL_Q NL
146     "vtbl.8 " PIXEL_D0 ", {" PIXEL_D0 "-" PIXEL_D1 "}, " REMAP_NEON_ARM_Q NL
147     "vst1.u32 " PIXEL_D00 ", [" DESTINATION_R "], " STRIDE_R NL
148
    // Remove the leaving pixel only once LEFT_R has reached the line start.
149     "cmp " LEFT_R ", " SOURCE_R NL
150     "bcc .allChannelSkipLeft" NL
151     "vld1.u32 " PIXEL_D00 ", [" LEFT_R "]" NL
152     "vmovl.u8 " PIXEL_Q ", " PIXEL_D0 NL
153     "vmovl.u16 " PIXEL_Q ", " PIXEL_D0 NL
154     "vsub.u32 " SUM_Q ", " SUM_Q ", " PIXEL_Q NL
155 ".allChannelSkipLeft: " NL
156
    // Add the entering pixel only while RIGHT_R is still inside the line.
157     "cmp " RIGHT_R ", " SOURCE_END_R NL
158     "bcs .allChannelSkipRight" NL
159     "vld1.u32 " PIXEL_D00 ", [" RIGHT_R "]" NL
160     "vmovl.u8 " PIXEL_Q ", " PIXEL_D0 NL
161     "vmovl.u16 " PIXEL_Q ", " PIXEL_D0 NL
162     "vadd.u32 " SUM_Q ", " SUM_Q ", " PIXEL_Q NL
163 ".allChannelSkipRight: " NL
164
    // Slide the window; DESTINATION_R was already advanced by the store.
    // After the line is finished, rewind DESTINATION_R to the line start.
165     "add " LEFT_R ", " LEFT_R ", " STRIDE_R NL
166     "add " RIGHT_R ", " RIGHT_R ", " STRIDE_R NL
167     "cmp " DESTINATION_R ", " DESTINATION_END_R NL
168     "bcc .allChannelBlur" NL
169     "sub " DESTINATION_R ", " DESTINATION_R ", " STRIDE_WIDTH_R NL
170
    // Step both pointers to the next line and repeat until the end address.
171     "add " SOURCE_R ", " SOURCE_R ", " STRIDE_LINE_R NL
172     "add " DESTINATION_R ", " DESTINATION_R ", " STRIDE_LINE_R NL
173     "cmp " SOURCE_R ", " SOURCE_LINE_END_R NL
174     "bcc .allChannelMainLoop" NL
175
    // Restore the saved registers; the saved lr is popped straight into pc.
176     "ldmia sp!, {r4-r8, r10, r11, pc}" NL
177 ); // NOLINT
178
// Emits four single-lane transfers ("command" is a load or a store) between
// the four 32-bit lanes of PIXEL_Q and the addresses base, base + strideLine,
// base + 2 * strideLine and base + 3 * strideLine. Each transfer post-increments
// base by STRIDE_LINE_R, and the final sub restores base to its original value.
179 #define DATA_TRANSFER4(command, base) \
180     command " " PIXEL_D00 ", [" base "], " STRIDE_LINE_R NL \
181     command " " PIXEL_D01 ", [" base "], " STRIDE_LINE_R NL \
182     command " " PIXEL_D10 ", [" base "], " STRIDE_LINE_R NL \
183     command " " PIXEL_D11 ", [" base "], " STRIDE_LINE_R NL \
184     "sub " base ", " base ", " STRIDE_LINE_R ", lsl #2" NL
185
// Variant for a partial group of lines: lane 0 is always transferred with
// command1; lanes 1 and 2 are transferred with the conditionally executed
// command2 ("cs" suffix) only when REMAINING_STRIDES_R is >= 2 and >= 3
// respectively; lane 3 is never transferred. base is restored at the end.
// The cmp instructions overwrite the condition flags.
186 // The number of reads depends on REMAINING_STRIDES_R, but it is always >= 1 and <= 3.
187 #define CONDITIONAL_DATA_TRANSFER4(command1, command2, base) \
188     command1 " " PIXEL_D00 ", [" base "], " STRIDE_LINE_R NL \
189     "cmp " REMAINING_STRIDES_R ", #2" NL \
190     command2 "cs " PIXEL_S1 ", [" base "]" NL \
191     "add " base ", " base ", " STRIDE_LINE_R NL \
192     "cmp " REMAINING_STRIDES_R ", #3" NL \
193     command2 "cs " PIXEL_S2 ", [" base "]" NL \
194     "sub " base ", " base ", " STRIDE_LINE_R ", lsl #1" NL
195
// neonDrawAlphaChannelGaussianBlur: hand-written ARM/NEON routine emitted as a
// global assembly symbol.
//
// On entry: r0 = source pointer, r1 = destination pointer, r2 = address of
// the parameter block (fields at the *_OFFSET offsets).
// NOTE(review): the C-level prototype is not visible in this file -- confirm
// in the header.
//
// Only the top byte of each 32-bit pixel takes part in the blur (vshr #24 on
// load, vshl #24 before store), so the three lower bytes of every stored
// pixel are written as zero. The main loop handles four lines at once, one
// per 32-bit lane; a second pass handles the 1-3 lines that are left over.
// All pointer comparisons are unsigned (bcc / bcs).
196 asm ( // NOLINT
197 ".globl " TOSTRING(neonDrawAlphaChannelGaussianBlur) NL
198 TOSTRING(neonDrawAlphaChannelGaussianBlur) ":" NL
    // Save the callee registers used below plus the return address, then read
    // the parameters. The remaining-strides count is parked in s12 because its
    // core register (r10) holds SOURCE_LINE_END_R during the main loop.
199     "stmdb sp!, {r4-r8, r10, r11, lr}" NL
200     "ldr " STRIDE_R ", [r2, #" STRIDE_OFFSET "]" NL
201     "ldr " STRIDE_WIDTH_R ", [r2, #" STRIDE_WIDTH_OFFSET "]" NL
202     "ldr " DISTANCE_LEFT_R ", [r2, #" DISTANCE_LEFT_OFFSET "]" NL
203     "ldr " DISTANCE_RIGHT_R ", [r2, #" DISTANCE_RIGHT_OFFSET "]" NL
204     "ldr " STRIDE_LINE_R ", [r2, #" STRIDE_LINE_OFFSET "]" NL
205     "ldr " SOURCE_LINE_END_R ", [r2, #" STRIDE_LINE_WIDTH_OFFSET "]" NL
206     "ldr " INIT_INVERTED_KERNEL_SIZE_R ", [r2, #" INVERTED_KERNEL_SIZE_OFFSET "]" NL
207     "vldr.u32 " REMAINING_STRIDES_S0 ", [r2, #" REMAINING_STRIDES_OFFSET "]" NL
208
    // Scale both distances by the stride so they become address offsets,
    // clamp the initial window to min(distanceRight, strideWidth), turn the
    // line extent into an absolute end address and broadcast the inverted
    // kernel size. If the end address equals the start there is no full group
    // of four lines, so jump straight to the leftover handling.
209     // Initialize locals.
210     "mul " DISTANCE_LEFT_R ", " DISTANCE_LEFT_R ", " STRIDE_R NL
211     "mul " DISTANCE_RIGHT_R ", " DISTANCE_RIGHT_R ", " STRIDE_R NL
212     "mov " MAX_KERNEL_SIZE_R ", " DISTANCE_RIGHT_R NL
213     "cmp " MAX_KERNEL_SIZE_R ", " STRIDE_WIDTH_R NL
214     "movcs " MAX_KERNEL_SIZE_R ", " STRIDE_WIDTH_R NL
215     "add " SOURCE_LINE_END_R ", " SOURCE_LINE_END_R ", " SOURCE_R NL
216     "vdup.f32 " INVERTED_KERNEL_SIZE_Q ", " INIT_INVERTED_KERNEL_SIZE_R NL
217     "cmp " SOURCE_LINE_END_R ", " SOURCE_R NL
218     "beq .alphaChannelEarlyLeave" NL
219
220     // Processing 4 strides in parallel.
221
222 ".alphaChannelMainLoop:" NL
223
    // Sum the top bytes of the pixels from SOURCE_R up to
    // SOURCE_R + MAX_KERNEL_SIZE_R for four lines at once. DATA_TRANSFER4
    // leaves the pointer unchanged, so it is advanced explicitly.
224     // Initialize the sum variable.
225     "vmov.u32 " SUM_Q ", #0" NL
226     "mov " INIT_SUM_R ", " SOURCE_R NL
227     "add " SOURCE_END_R ", " SOURCE_R ", " MAX_KERNEL_SIZE_R NL
228     "cmp " INIT_SUM_R ", " SOURCE_END_R NL
229     "bcs .alphaChannelInitSumDone" NL
230 ".alphaChannelInitSum:" NL
231     DATA_TRANSFER4("vld1.u32", INIT_SUM_R)
232     "vshr.u32 " PIXEL_Q ", " PIXEL_Q ", #24" NL
233     "vadd.u32 " SUM_Q ", " SUM_Q ", " PIXEL_Q NL
234     "add " INIT_SUM_R ", " INIT_SUM_R ", " STRIDE_R NL
235     "cmp " INIT_SUM_R ", " SOURCE_END_R NL
236     "bcc .alphaChannelInitSum" NL
237 ".alphaChannelInitSumDone:" NL
238
    // Set the end addresses for this group and the two window edges: LEFT_R
    // is the pixel that leaves the window, RIGHT_R the pixel that enters it.
239     // Blurring.
240     "add " SOURCE_END_R ", " SOURCE_R ", " STRIDE_WIDTH_R NL
241     "add " DESTINATION_END_R ", " DESTINATION_R ", " STRIDE_WIDTH_R NL
242     "sub " LEFT_R ", " SOURCE_R ", " DISTANCE_LEFT_R NL
243     "add " RIGHT_R ", " SOURCE_R ", " DISTANCE_RIGHT_R NL
244
    // Output = sum * invertedKernelSize, computed in float, converted back to
    // u32 and shifted into the top byte before the four words are stored.
245 ".alphaChannelBlur:" NL
246     "vcvt.f32.u32 " PIXEL_Q ", " SUM_Q NL
247     "vmul.f32 " PIXEL_Q ", " PIXEL_Q ", " INVERTED_KERNEL_SIZE_Q NL
248     "vcvt.u32.f32 " PIXEL_Q ", " PIXEL_Q NL
249     "vshl.u32 " PIXEL_Q ", " PIXEL_Q ", #24" NL
250     DATA_TRANSFER4("vst1.u32", DESTINATION_R)
251
    // Remove the leaving pixels only once LEFT_R has reached the line start.
252     "cmp " LEFT_R ", " SOURCE_R NL
253     "bcc .alphaChannelSkipLeft" NL
254     DATA_TRANSFER4("vld1.u32", LEFT_R)
255     "vshr.u32 " PIXEL_Q ", " PIXEL_Q ", #24" NL
256     "vsub.u32 " SUM_Q ", " SUM_Q ", " PIXEL_Q NL
257 ".alphaChannelSkipLeft: " NL
258
    // Add the entering pixels only while RIGHT_R is still inside the line.
259     "cmp " RIGHT_R ", " SOURCE_END_R NL
260     "bcs .alphaChannelSkipRight" NL
261     DATA_TRANSFER4("vld1.u32", RIGHT_R)
262     "vshr.u32 " PIXEL_Q ", " PIXEL_Q ", #24" NL
263     "vadd.u32 " SUM_Q ", " SUM_Q ", " PIXEL_Q NL
264 ".alphaChannelSkipRight: " NL
265
    // Slide the window and the destination by one stride; after the group is
    // finished, rewind DESTINATION_R to the start of the line.
266     "add " DESTINATION_R ", " DESTINATION_R ", " STRIDE_R NL
267     "add " LEFT_R ", " LEFT_R ", " STRIDE_R NL
268     "add " RIGHT_R ", " RIGHT_R ", " STRIDE_R NL
269     "cmp " DESTINATION_R ", " DESTINATION_END_R NL
270     "bcc .alphaChannelBlur" NL
271     "sub " DESTINATION_R ", " DESTINATION_R ", " STRIDE_WIDTH_R NL
272
    // Step both pointers forward by four lines and repeat until the end
    // address.
273     "add " SOURCE_R ", " SOURCE_R ", " STRIDE_LINE_R ", lsl #2" NL
274     "add " DESTINATION_R ", " DESTINATION_R ", " STRIDE_LINE_R ", lsl #2" NL
275     "cmp " SOURCE_R ", " SOURCE_LINE_END_R NL
276     "bcc .alphaChannelMainLoop" NL
277
    // From here on r10 no longer holds SOURCE_LINE_END_R: it receives the
    // leftover line count that was kept in s12.
278     // Processing the remaining strides (0 - 3).
279 ".alphaChannelEarlyLeave:" NL
280     "vmov.u32 " REMAINING_STRIDES_R ", " REMAINING_STRIDES_S0 NL
281     // Early return for 0 strides.
282     "cmp " REMAINING_STRIDES_R ", #0" NL
283     "ldmeqia sp!, {r4-r8, r10, r11, pc}" NL
284
    // Same initial summation as above, but using the conditional transfer so
    // that only the lines that exist are read. Lanes that are not loaded keep
    // their previous register contents; those lanes are never stored.
285     // Initialize the sum variable.
286     "vmov.u32 " SUM_Q ", #0" NL
287     "mov " INIT_SUM_R ", " SOURCE_R NL
288     "add " SOURCE_END_R ", " SOURCE_R ", " MAX_KERNEL_SIZE_R NL
289     "cmp " INIT_SUM_R ", " SOURCE_END_R NL
290     "bcs .alphaChannelSecondInitSumDone" NL
291 ".alphaChannelSecondInitSum:" NL
292     CONDITIONAL_DATA_TRANSFER4("vld1.u32", "vldr", INIT_SUM_R)
293     "vshr.u32 " PIXEL_Q ", " PIXEL_Q ", #24" NL
294     "vadd.u32 " SUM_Q ", " SUM_Q ", " PIXEL_Q NL
295     "add " INIT_SUM_R ", " INIT_SUM_R ", " STRIDE_R NL
296     "cmp " INIT_SUM_R ", " SOURCE_END_R NL
297     "bcc .alphaChannelSecondInitSum" NL
298 ".alphaChannelSecondInitSumDone:" NL
299
300     // Blurring.
301     "add " SOURCE_END_R ", " SOURCE_R ", " STRIDE_WIDTH_R NL
302     "add " DESTINATION_END_R ", " DESTINATION_R ", " STRIDE_WIDTH_R NL
303     "sub " LEFT_R ", " SOURCE_R ", " DISTANCE_LEFT_R NL
304     "add " RIGHT_R ", " SOURCE_R ", " DISTANCE_RIGHT_R NL
305
    // Same sliding-window step as the main loop, with conditional transfers.
    // Each conditional transfer changes the flags, so every branch below is
    // preceded by its own cmp.
306 ".alphaChannelSecondBlur:" NL
307     "vcvt.f32.u32 " PIXEL_Q ", " SUM_Q NL
308     "vmul.f32 " PIXEL_Q ", " PIXEL_Q ", " INVERTED_KERNEL_SIZE_Q NL
309     "vcvt.u32.f32 " PIXEL_Q ", " PIXEL_Q NL
310     "vshl.u32 " PIXEL_Q ", " PIXEL_Q ", #24" NL
311     CONDITIONAL_DATA_TRANSFER4("vst1.u32", "vstr", DESTINATION_R)
312
313     "cmp " LEFT_R ", " SOURCE_R NL
314     "bcc .alphaChannelSecondSkipLeft" NL
315     CONDITIONAL_DATA_TRANSFER4("vld1.u32", "vldr", LEFT_R)
316     "vshr.u32 " PIXEL_Q ", " PIXEL_Q ", #24" NL
317     "vsub.u32 " SUM_Q ", " SUM_Q ", " PIXEL_Q NL
318 ".alphaChannelSecondSkipLeft: " NL
319
320     "cmp " RIGHT_R ", " SOURCE_END_R NL
321     "bcs .alphaChannelSecondSkipRight" NL
322     CONDITIONAL_DATA_TRANSFER4("vld1.u32", "vldr", RIGHT_R)
323     "vshr.u32 " PIXEL_Q ", " PIXEL_Q ", #24" NL
324     "vadd.u32 " SUM_Q ", " SUM_Q ", " PIXEL_Q NL
325 ".alphaChannelSecondSkipRight: " NL
326
327     "add " DESTINATION_R ", " DESTINATION_R ", " STRIDE_R NL
328     "add " LEFT_R ", " LEFT_R ", " STRIDE_R NL
329     "add " RIGHT_R ", " RIGHT_R ", " STRIDE_R NL
330     "cmp " DESTINATION_R ", " DESTINATION_END_R NL
331     "bcc .alphaChannelSecondBlur" NL
332
    // Restore the saved registers; the saved lr is popped straight into pc.
333     "ldmia sp!, {r4-r8, r10, r11, pc}" NL
334 ); // NOLINT
335
336 } // namespace WebCore
337
338 #endif // CPU(ARM_NEON) && CPU(ARM_TRADITIONAL) && COMPILER(GCC)