/*
 * Copyright (C) 2010, Google Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1.  Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 * 2.  Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "config.h"

#if ENABLE(WEB_AUDIO)

#include "VectorMath.h"

#if OS(DARWIN)
#include <Accelerate/Accelerate.h>
#endif

#ifdef __SSE2__
#include <emmintrin.h>
#endif

namespace WebCore {

namespace VectorMath {

#if OS(DARWIN)
// On the Mac we use the highly optimized versions in Accelerate.framework.
// In 32-bit mode (__ppc__ or __i386__), <Accelerate/Accelerate.h> includes <vecLib/vDSP_translate.h>, which defines macros
// with the same names as our namespaced functions, so we must handle this case differently. Other architectures
// (64-bit, ARM, etc.) do not include that header.
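
// The operations themselves are simple per-sample loops:
//   vsmul: destP[i] = sourceP[i] * *scale        (vector times scalar)
//   vadd:  destP[i] = source1P[i] + source2P[i]  (element-wise add)
//   vmul:  destP[i] = source1P[i] * source2P[i]  (element-wise multiply)
//   zvmul: element-wise complex multiply of two split-complex vectors
//   vsma:  destP[i] += sourceP[i] * *scale       (multiply-accumulate)
// All of them except zvmul also take explicit strides.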

void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
{
#if defined(__ppc__) || defined(__i386__)
    ::vsmul(sourceP, sourceStride, scale, destP, destStride, framesToProcess);
#else
    vDSP_vsmul(sourceP, sourceStride, scale, destP, destStride, framesToProcess);
#endif
}

void vadd(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
{
#if defined(__ppc__) || defined(__i386__)
    ::vadd(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
#else
    vDSP_vadd(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
#endif
}

void vmul(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
{
#if defined(__ppc__) || defined(__i386__)
    ::vmul(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
#else
    vDSP_vmul(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
#endif
}
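
// zvmul performs an element-wise complex multiply of split-complex (separate real/imaginary array) inputs:
//   (a + bi)(c + di) = (ac - bd) + (ad + bc)i
// The trailing argument of 1 selects the normal (non-conjugating) multiply.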

void zvmul(const float* real1P, const float* imag1P, const float* real2P, const float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess)
{
    DSPSplitComplex sc1;
    DSPSplitComplex sc2;
    DSPSplitComplex dest;
    sc1.realp = const_cast<float*>(real1P);
    sc1.imagp = const_cast<float*>(imag1P);
    sc2.realp = const_cast<float*>(real2P);
    sc2.imagp = const_cast<float*>(imag2P);
    dest.realp = realDestP;
    dest.imagp = imagDestP;
#if defined(__ppc__) || defined(__i386__)
    ::zvmul(&sc1, 1, &sc2, 1, &dest, 1, framesToProcess, 1);
#else
    vDSP_zvmul(&sc1, 1, &sc2, 1, &dest, 1, framesToProcess, 1);
#endif
}

void vsma(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
{
    vDSP_vsma(sourceP, sourceStride, scale, destP, destStride, destP, destStride, framesToProcess);
}

#else
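
// The non-Darwin implementations below share a common structure: when all strides are 1, process frames
// one at a time until the primary source pointer is 16-byte aligned, run an SSE2 loop over groups of four
// floats, then finish the remaining (at most three) frames with scalar code. Unaligned second operands and
// destinations are handled with the unaligned load/store variants. zvmul, which takes no stride arguments,
// only vectorizes when every pointer is aligned. Without SSE2, or when a stride is not 1, the functions
// fall back to plain scalar loops.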

void vsma(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;

#ifdef __SSE2__
    if ((sourceStride == 1) && (destStride == 1)) {
        float k = *scale;

        // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
            *destP += k * *sourceP;
            sourceP++;
            destP++;
            n--;
        }

        // Now the sourceP address is aligned; start applying SSE.
        int tailFrames = n % 4;
        float* endP = destP + n - tailFrames;

        __m128 pSource;
        __m128 dest;
        __m128 temp;
        __m128 mScale = _mm_set_ps1(k);

        bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F);

#define SSE2_MULT_ADD(loadInstr, storeInstr)    \
        while (destP < endP)                    \
        {                                       \
            pSource = _mm_load_ps(sourceP);     \
            temp = _mm_mul_ps(pSource, mScale); \
            dest = _mm_##loadInstr##_ps(destP); \
            dest = _mm_add_ps(dest, temp);      \
            _mm_##storeInstr##_ps(destP, dest); \
            sourceP += 4;                       \
            destP += 4;                         \
        }

        if (destAligned)
            SSE2_MULT_ADD(load, store)
        else
            SSE2_MULT_ADD(loadu, storeu)

        n = tailFrames;
    }
#endif
    while (n) {
        *destP += *sourceP * *scale;
        sourceP += sourceStride;
        destP += destStride;
        n--;
    }
}
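
// vsma above is the mix-with-gain primitive. For illustration only (the caller-side names are hypothetical),
// summing one channel into a mix buffer at half gain could look like:
//     float gain = 0.5f;
//     vsma(sourceChannel, 1, &gain, mixBuffer, 1, framesToProcess);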

void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
{
#ifdef __SSE2__
    if ((sourceStride == 1) && (destStride == 1)) {
        float k = *scale;
        int n = framesToProcess;

        // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<size_t>(sourceP) & 0x0F) && n) {
            *destP = k * *sourceP;
            sourceP++;
            destP++;
            n--;
        }

        // Now the sourceP address is aligned; start applying SSE.
        int group = n / 4;
        __m128 mScale = _mm_set_ps1(k);
        __m128* pSource;
        __m128* pDest;
        __m128 dest;

        if (reinterpret_cast<size_t>(destP) & 0x0F) {
            // The destination is not aligned: use an unaligned store.
            while (group--) {
                pSource = reinterpret_cast<__m128*>(const_cast<float*>(sourceP));
                dest = _mm_mul_ps(*pSource, mScale);
                _mm_storeu_ps(destP, dest);

                sourceP += 4;
                destP += 4;
            }
        } else {
            // Both source and destination are aligned.
            while (group--) {
                pSource = reinterpret_cast<__m128*>(const_cast<float*>(sourceP));
                pDest = reinterpret_cast<__m128*>(destP);
                *pDest = _mm_mul_ps(*pSource, mScale);

                sourceP += 4;
                destP += 4;
            }
        }

        // Non-SSE handling for the remaining frames (fewer than four).
        n %= 4;
        while (n) {
            *destP = k * *sourceP;
            sourceP++;
            destP++;
            n--;
        }
    } else { // If strides are not 1, fall back to the scalar algorithm.
#endif
        int n = framesToProcess;
        float k = *scale;
        while (n--) {
            *destP = k * *sourceP;
            sourceP += sourceStride;
            destP += destStride;
        }
#ifdef __SSE2__
    }
#endif
}
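
// vadd distinguishes all four combinations of source2/destination alignment so that the common
// fully-aligned case uses only aligned loads and stores.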

void vadd(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
{
#ifdef __SSE2__
    if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
        int n = framesToProcess;

        // If the source1P address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<size_t>(source1P) & 0x0F) && n) {
            *destP = *source1P + *source2P;
            source1P++;
            source2P++;
            destP++;
            n--;
        }

        // Now the source1P address is aligned; start applying SSE.
        int group = n / 4;
        __m128* pSource1;
        __m128* pSource2;
        __m128* pDest;
        __m128 source2;
        __m128 dest;

        bool source2Aligned = !(reinterpret_cast<size_t>(source2P) & 0x0F);
        bool destAligned = !(reinterpret_cast<size_t>(destP) & 0x0F);

        if (source2Aligned && destAligned) { // All aligned.
            while (group--) {
                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
                pSource2 = reinterpret_cast<__m128*>(const_cast<float*>(source2P));
                pDest = reinterpret_cast<__m128*>(destP);
                *pDest = _mm_add_ps(*pSource1, *pSource2);

                source1P += 4;
                source2P += 4;
                destP += 4;
            }
        } else if (source2Aligned && !destAligned) { // source2 aligned but dest not aligned.
            while (group--) {
                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
                pSource2 = reinterpret_cast<__m128*>(const_cast<float*>(source2P));
                dest = _mm_add_ps(*pSource1, *pSource2);
                _mm_storeu_ps(destP, dest);

                source1P += 4;
                source2P += 4;
                destP += 4;
            }
        } else if (!source2Aligned && destAligned) { // source2 not aligned but dest aligned.
            while (group--) {
                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
                source2 = _mm_loadu_ps(source2P);
                pDest = reinterpret_cast<__m128*>(destP);
                *pDest = _mm_add_ps(*pSource1, source2);

                source1P += 4;
                source2P += 4;
                destP += 4;
            }
        } else if (!source2Aligned && !destAligned) { // Both source2 and dest not aligned.
            while (group--) {
                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
                source2 = _mm_loadu_ps(source2P);
                dest = _mm_add_ps(*pSource1, source2);
                _mm_storeu_ps(destP, dest);

                source1P += 4;
                source2P += 4;
                destP += 4;
            }
        }

        // Non-SSE handling for the remaining frames (fewer than four).
        n %= 4;
        while (n) {
            *destP = *source1P + *source2P;
            source1P++;
            source2P++;
            destP++;
            n--;
        }
    } else { // If strides are not 1, fall back to the scalar algorithm.
#endif
        int n = framesToProcess;
        while (n--) {
            *destP = *source1P + *source2P;
            source1P += sourceStride1;
            source2P += sourceStride2;
            destP += destStride;
        }
#ifdef __SSE2__
    }
#endif
}
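
// vmul and vsma generate their aligned/unaligned SSE variants with a macro instead of spelling out
// each combination by hand as vadd does above.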

void vmul(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;

#ifdef __SSE2__
    if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
        // If the source1P address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(source1P) & 0x0F) && n) {
            *destP = *source1P * *source2P;
            source1P++;
            source2P++;
            destP++;
            n--;
        }

        // Now the source1P address is aligned; start applying SSE.
        int tailFrames = n % 4;
        float* endP = destP + n - tailFrames;

        __m128 pSource1;
        __m128 pSource2;
        __m128 dest;

        bool source2Aligned = !(reinterpret_cast<uintptr_t>(source2P) & 0x0F);
        bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F);

#define SSE2_MULT(loadInstr, storeInstr)               \
        while (destP < endP)                           \
        {                                              \
            pSource1 = _mm_load_ps(source1P);          \
            pSource2 = _mm_##loadInstr##_ps(source2P); \
            dest = _mm_mul_ps(pSource1, pSource2);     \
            _mm_##storeInstr##_ps(destP, dest);        \
            source1P += 4;                             \
            source2P += 4;                             \
            destP += 4;                                \
        }

        if (source2Aligned && destAligned) // Both aligned.
            SSE2_MULT(load, store)
        else if (source2Aligned && !destAligned) // Source2 is aligned but dest is not.
            SSE2_MULT(load, storeu)
        else if (!source2Aligned && destAligned) // Dest is aligned but source2 is not.
            SSE2_MULT(loadu, store)
        else // Neither is aligned.
            SSE2_MULT(loadu, storeu)

        n = tailFrames;
    }
#endif
    while (n) {
        *destP = *source1P * *source2P;
        source1P += sourceStride1;
        source2P += sourceStride2;
        destP += destStride;
        n--;
    }
}

void zvmul(const float* real1P, const float* imag1P, const float* real2P, const float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess)
{
    unsigned i = 0;
#ifdef __SSE2__
    // Only use the SSE optimization in the very common case that all addresses are 16-byte aligned.
    // Otherwise, fall through to the scalar code below.
    if (!(reinterpret_cast<uintptr_t>(real1P) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(imag1P) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(real2P) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(imag2P) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(realDestP) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(imagDestP) & 0x0F)) {

        unsigned endSize = framesToProcess - framesToProcess % 4;
        while (i < endSize) {
            __m128 real1 = _mm_load_ps(real1P + i);
            __m128 real2 = _mm_load_ps(real2P + i);
            __m128 imag1 = _mm_load_ps(imag1P + i);
            __m128 imag2 = _mm_load_ps(imag2P + i);
            __m128 real = _mm_mul_ps(real1, real2);
            real = _mm_sub_ps(real, _mm_mul_ps(imag1, imag2));
            __m128 imag = _mm_mul_ps(real1, imag2);
            imag = _mm_add_ps(imag, _mm_mul_ps(imag1, real2));
            _mm_store_ps(realDestP + i, real);
            _mm_store_ps(imagDestP + i, imag);
            i += 4;
        }
    }
#endif
    for (; i < framesToProcess; ++i) {
        realDestP[i] = real1P[i] * real2P[i] - imag1P[i] * imag2P[i];
        imagDestP[i] = real1P[i] * imag2P[i] + imag1P[i] * real2P[i];
    }
}

#endif // OS(DARWIN)

} // namespace VectorMath

} // namespace WebCore

#endif // ENABLE(WEB_AUDIO)