Optimizing RGBA16, RGB16, ARGB16, BGRA16 unpacking functions with NEON intrinsics
author rgabor@webkit.org <rgabor@webkit.org@268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Mon, 7 Jan 2013 14:31:32 +0000 (14:31 +0000)
committer rgabor@webkit.org <rgabor@webkit.org@268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Mon, 7 Jan 2013 14:31:32 +0000 (14:31 +0000)
https://bugs.webkit.org/show_bug.cgi?id=103614

Reviewed by Zoltan Herczeg.

Optimizing RGBA16, RGB16, ARGB16, BGRA16 unpacking functions in GraphicsContext3D
with ARM NEON intrinsics. The optimized functions are 2-3x faster than the originals.

* platform/graphics/GraphicsContext3D.cpp:
(WebCore):
* platform/graphics/cpu/arm/GraphicsContext3DNEON.h:
(WebCore::SIMD::unpackOneRowOfRGBA16LittleToRGBA8):
(SIMD):
(WebCore::SIMD::unpackOneRowOfRGB16LittleToRGBA8):
(WebCore::SIMD::unpackOneRowOfARGB16LittleToRGBA8):
(WebCore::SIMD::unpackOneRowOfBGRA16LittleToRGBA8):

git-svn-id: https://svn.webkit.org/repository/webkit/trunk@138936 268f45cc-cd09-0410-ab3c-d52691b4dbfc

Source/WebCore/ChangeLog
Source/WebCore/platform/graphics/GraphicsContext3D.cpp
Source/WebCore/platform/graphics/cpu/arm/GraphicsContext3DNEON.h

index a50c2ac..1835080 100644 (file)
@@ -1,3 +1,22 @@
+2013-01-07  Gabor Rapcsanyi  <rgabor@webkit.org>
+
+        Optimizing RGBA16, RGB16, ARGB16, BGRA16 unpacking functions with NEON intrinsics
+        https://bugs.webkit.org/show_bug.cgi?id=103614
+
+        Reviewed by Zoltan Herczeg.
+
+        Optimizing RGBA16, RGB16, ARGB16, BGRA16 unpacking functions in GraphicsContext3D
+        with ARM NEON intrinsics. The optimized functions are 2-3x faster than the originals.
+
+        * platform/graphics/GraphicsContext3D.cpp:
+        (WebCore):
+        * platform/graphics/cpu/arm/GraphicsContext3DNEON.h:
+        (WebCore::SIMD::unpackOneRowOfRGBA16LittleToRGBA8):
+        (SIMD):
+        (WebCore::SIMD::unpackOneRowOfRGB16LittleToRGBA8):
+        (WebCore::SIMD::unpackOneRowOfARGB16LittleToRGBA8):
+        (WebCore::SIMD::unpackOneRowOfBGRA16LittleToRGBA8):
+
 2013-01-07  Christophe Dumez  <christophe.dumez@intel.com>
 
         Regression(r138786): Causes webaudio tests to crash
index f30255c..7669017 100644 (file)
@@ -392,6 +392,9 @@ namespace {
 
 void unpackOneRowOfRGBA16LittleToRGBA8(const uint16_t* source, uint8_t* destination, unsigned int pixelsPerRow)
 {
+#if HAVE(ARM_NEON_INTRINSICS)
+    SIMD::unpackOneRowOfRGBA16LittleToRGBA8(source, destination, pixelsPerRow);
+#endif
     for (unsigned int i = 0; i < pixelsPerRow; ++i) {
         destination[0] = convertColor16LittleTo8(source[0]);
         destination[1] = convertColor16LittleTo8(source[1]);
@@ -428,6 +431,9 @@ void unpackOneRowOfRGB8ToRGBA8(const uint8_t* source, uint8_t* destination, unsi
 
 void unpackOneRowOfRGB16LittleToRGBA8(const uint16_t* source, uint8_t* destination, unsigned int pixelsPerRow)
 {
+#if HAVE(ARM_NEON_INTRINSICS)
+    SIMD::unpackOneRowOfRGB16LittleToRGBA8(source, destination, pixelsPerRow);
+#endif
     for (unsigned int i = 0; i < pixelsPerRow; ++i) {
         destination[0] = convertColor16LittleTo8(source[0]);
         destination[1] = convertColor16LittleTo8(source[1]);
@@ -476,6 +482,9 @@ void unpackOneRowOfARGB8ToRGBA8(const uint8_t* source, uint8_t* destination, uns
 
 void unpackOneRowOfARGB16LittleToRGBA8(const uint16_t* source, uint8_t* destination, unsigned int pixelsPerRow)
 {
+#if HAVE(ARM_NEON_INTRINSICS)
+    SIMD::unpackOneRowOfARGB16LittleToRGBA8(source, destination, pixelsPerRow);
+#endif
     for (unsigned int i = 0; i < pixelsPerRow; ++i) {
         destination[0] = convertColor16LittleTo8(source[1]);
         destination[1] = convertColor16LittleTo8(source[2]);
@@ -530,6 +539,9 @@ void unpackOneRowOfBGRA8ToRGBA8(const uint8_t* source, uint8_t* destination, uns
 
 void unpackOneRowOfBGRA16LittleToRGBA8(const uint16_t* source, uint8_t* destination, unsigned int pixelsPerRow)
 {
+#if HAVE(ARM_NEON_INTRINSICS)
+    SIMD::unpackOneRowOfBGRA16LittleToRGBA8(source, destination, pixelsPerRow);
+#endif
     for (unsigned int i = 0; i < pixelsPerRow; ++i) {
         destination[0] = convertColor16LittleTo8(source[2]);
         destination[1] = convertColor16LittleTo8(source[1]);
index 9f38724..c043fed 100644 (file)
@@ -34,6 +34,86 @@ namespace WebCore {
 
 namespace SIMD {
 
+// Keeps the high byte of each little-endian 16-bit component, 16 components (4 pixels) per step, then advances the reference parameters past the converted part.
+ALWAYS_INLINE void unpackOneRowOfRGBA16LittleToRGBA8(const uint16_t*& source, uint8_t*& destination, unsigned& pixelsPerRow)
+{
+    const unsigned totalComponents = pixelsPerRow * 4;
+    const unsigned leftoverComponents = totalComponents % 16;
+    const unsigned vectorizedComponents = totalComponents - leftoverComponents;
+    // Each step de-interleaves 32 bytes by two: val[0] receives the even (low) bytes, val[1] the odd (high) bytes.
+    for (unsigned offset = 0; offset < vectorizedComponents; offset += 16) {
+        uint8x16x2_t bytePlanes = vld2q_u8(reinterpret_cast<const uint8_t*>(source + offset));
+        vst1q_u8(destination + offset, bytePlanes.val[1]);
+    }
+
+    pixelsPerRow = leftoverComponents / 4;
+    source += vectorizedComponents;
+    destination += vectorizedComponents;
+}
+
+// Converts eight RGB16 pixels per step to opaque RGBA8 (top eight bits of each component, alpha 0xFF).
+// On return, source, destination and pixelsPerRow describe the unconverted tail of fewer than eight pixels.
+ALWAYS_INLINE void unpackOneRowOfRGB16LittleToRGBA8(const uint16_t*& source, uint8_t*& destination, unsigned& pixelsPerRow)
+{
+    unsigned componentsPerRow = pixelsPerRow * 3;
+    unsigned tailComponents = componentsPerRow % 24;
+    unsigned componentsSize = componentsPerRow - tailComponents;
+
+    uint8x8_t componentA = vdup_n_u8(0xFF);
+    for (unsigned i = 0; i < componentsSize; i += 24) {
+        uint16x8x3_t RGB16 = vld3q_u16(source + i); // val[0..2] hold the R, G and B lanes of eight pixels.
+        // vshrn_n_u16(x, 8) shifts and narrows in one instruction; the shifted value always fits in 8 bits, so no saturating narrow is needed.
+        uint8x8x4_t RGBA8 = {{vshrn_n_u16(RGB16.val[0], 8), vshrn_n_u16(RGB16.val[1], 8), vshrn_n_u16(RGB16.val[2], 8), componentA}};
+        vst4_u8(destination, RGBA8);
+        destination += 32;
+    }
+
+    source += componentsSize;
+    pixelsPerRow = tailComponents / 3;
+}
+
+// Converts eight ARGB16 pixels per step to RGBA8, keeping the top eight bits of each component and moving
+// alpha from the first to the last position; on return the reference parameters describe the unconverted tail.
+ALWAYS_INLINE void unpackOneRowOfARGB16LittleToRGBA8(const uint16_t*& source, uint8_t*& destination, unsigned& pixelsPerRow)
+{
+    unsigned componentsPerRow = pixelsPerRow * 4;
+    unsigned tailComponents = componentsPerRow % 32;
+    unsigned componentsSize = componentsPerRow - tailComponents;
+
+    for (unsigned i = 0; i < componentsSize; i += 32) {
+        uint16x8x4_t ARGB16 = vld4q_u16(source + i); // val[0..3] hold the A, R, G and B lanes of eight pixels.
+        // vshrn_n_u16(x, 8) shifts and narrows in one instruction; the shifted value always fits in 8 bits, so no saturating narrow is needed.
+        uint8x8x4_t RGBA8 = {{vshrn_n_u16(ARGB16.val[1], 8), vshrn_n_u16(ARGB16.val[2], 8),
+            vshrn_n_u16(ARGB16.val[3], 8), vshrn_n_u16(ARGB16.val[0], 8)}};
+        vst4_u8(destination + i, RGBA8);
+    }
+
+    source += componentsSize;
+    destination += componentsSize;
+    pixelsPerRow = tailComponents / 4;
+}
+
+// Converts eight BGRA16 pixels per step to RGBA8, keeping the top eight bits of each component and swapping
+// the blue and red positions; on return the reference parameters describe the unconverted tail.
+ALWAYS_INLINE void unpackOneRowOfBGRA16LittleToRGBA8(const uint16_t*& source, uint8_t*& destination, unsigned& pixelsPerRow)
+{
+    unsigned componentsPerRow = pixelsPerRow * 4;
+    unsigned tailComponents = componentsPerRow % 32;
+    unsigned componentsSize = componentsPerRow - tailComponents;
+
+    for (unsigned i = 0; i < componentsSize; i += 32) {
+        uint16x8x4_t BGRA16 = vld4q_u16(source + i); // val[0..3] hold the B, G, R and A lanes of eight pixels.
+        // vshrn_n_u16(x, 8) shifts and narrows in one instruction; the shifted value always fits in 8 bits, so no saturating narrow is needed.
+        uint8x8x4_t RGBA8 = {{vshrn_n_u16(BGRA16.val[2], 8), vshrn_n_u16(BGRA16.val[1], 8),
+            vshrn_n_u16(BGRA16.val[0], 8), vshrn_n_u16(BGRA16.val[3], 8)}};
+        vst4_u8(destination + i, RGBA8);
+    }
+
+    source += componentsSize;
+    destination += componentsSize;
+    pixelsPerRow = tailComponents / 4;
+}
+
 ALWAYS_INLINE void unpackOneRowOfRGBA4444ToRGBA8(const uint16_t*& source, uint8_t*& destination, unsigned& pixelsPerRow)
 {
     unsigned tailPixels = pixelsPerRow % 8;