Optimize RGBA4444ToRGBA8 packing/unpacking functions with NEON intrinsics in Graphics...
authorrgabor@webkit.org <rgabor@webkit.org@268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Tue, 13 Nov 2012 08:33:08 +0000 (08:33 +0000)
committerrgabor@webkit.org <rgabor@webkit.org@268f45cc-cd09-0410-ab3c-d52691b4dbfc>
Tue, 13 Nov 2012 08:33:08 +0000 (08:33 +0000)
https://bugs.webkit.org/show_bug.cgi?id=101473

Reviewed by Zoltan Herczeg.

With NEON intrinsics the packing/unpacking functions can be optimized well.
This particular function is about 3 times faster with ARM NEON. On top level tests
the speed up was 1.18x.

* CMakeLists.txt:
* GNUmakefile.am:
* GNUmakefile.list.am:
* Target.pri:
* WebCore.gyp/WebCore.gyp:
* WebCore.gypi:
* WebCore.pri:
* WebCore.xcodeproj/project.pbxproj:
* platform/graphics/GraphicsContext3D.cpp:
(WebCore):
* platform/graphics/cpu/arm/GraphicsContext3DNEON.h: Added.
(WebCore):
(ARM):
(WebCore::ARM::unpackOneRowOfRGBA4444ToRGBA8NEON):
(WebCore::ARM::packOneRowOfRGBA8ToUnsignedShort4444NEON):

git-svn-id: https://svn.webkit.org/repository/webkit/trunk@134378 268f45cc-cd09-0410-ab3c-d52691b4dbfc

Source/WebCore/CMakeLists.txt
Source/WebCore/ChangeLog
Source/WebCore/GNUmakefile.am
Source/WebCore/GNUmakefile.list.am
Source/WebCore/Target.pri
Source/WebCore/WebCore.gyp/WebCore.gyp
Source/WebCore/WebCore.gypi
Source/WebCore/WebCore.pri
Source/WebCore/WebCore.xcodeproj/project.pbxproj
Source/WebCore/platform/graphics/GraphicsContext3D.cpp
Source/WebCore/platform/graphics/cpu/arm/GraphicsContext3DNEON.h [new file with mode: 0644]

index 07caae4..db0fa49 100644 (file)
@@ -48,6 +48,7 @@ SET(WebCore_INCLUDE_DIRECTORIES
     "${WEBCORE_DIR}/platform/animation"
     "${WEBCORE_DIR}/platform/audio"
     "${WEBCORE_DIR}/platform/graphics"
+    "${WEBCORE_DIR}/platform/graphics/cpu/arm"
     "${WEBCORE_DIR}/platform/graphics/filters"
     "${WEBCORE_DIR}/platform/graphics/filters/arm"
     "${WEBCORE_DIR}/platform/graphics/harfbuzz"
index 6374b97..c949d8d 100644 (file)
@@ -1,3 +1,30 @@
+2012-11-13  Gabor Rapcsanyi  <rgabor@webkit.org>
+
+        Optimize RGBA4444ToRGBA8 packing/unpacking functions with NEON intrinsics in GraphicsContext3D
+        https://bugs.webkit.org/show_bug.cgi?id=101473
+
+        Reviewed by Zoltan Herczeg.
+
+        With NEON intrinsics the packing/unpacking functions can be optimized well.
+        This particular function is about 3 times faster with ARM NEON. On top level tests
+        the speed up was 1.18x.
+
+        * CMakeLists.txt:
+        * GNUmakefile.am:
+        * GNUmakefile.list.am:
+        * Target.pri:
+        * WebCore.gyp/WebCore.gyp:
+        * WebCore.gypi:
+        * WebCore.pri:
+        * WebCore.xcodeproj/project.pbxproj:
+        * platform/graphics/GraphicsContext3D.cpp:
+        (WebCore):
+        * platform/graphics/cpu/arm/GraphicsContext3DNEON.h: Added.
+        (WebCore):
+        (ARM):
+        (WebCore::ARM::unpackOneRowOfRGBA4444ToRGBA8NEON):
+        (WebCore::ARM::packOneRowOfRGBA8ToUnsignedShort4444NEON):
+
 2012-11-13  Takashi Sakamoto  <tasak@google.com>
 
         Crash when replacing parts of text inputs with content: url(...)
index 0397e32..356dedf 100644 (file)
@@ -54,6 +54,7 @@ webcore_cppflags += \
        -I$(srcdir)/Source/WebCore/platform/animation \
        -I$(srcdir)/Source/WebCore/platform/audio \
        -I$(srcdir)/Source/WebCore/platform/graphics \
+       -I$(srcdir)/Source/WebCore/platform/graphics/cpu/arm \
        -I$(srcdir)/Source/WebCore/platform/graphics/filters \
        -I$(srcdir)/Source/WebCore/platform/graphics/filters/arm \
        -I$(srcdir)/Source/WebCore/platform/graphics/gpu \
index 9300ccc..244c0f7 100644 (file)
@@ -4270,6 +4270,7 @@ webcore_sources += \
        Source/WebCore/platform/HistogramSupport.h \
        Source/WebCore/platform/graphics/ANGLEWebKitBridge.cpp \
        Source/WebCore/platform/graphics/ANGLEWebKitBridge.h \
+       Source/WebCore/platform/graphics/cpu/arm/GraphicsContext3DNEON.h \
        Source/WebCore/platform/graphics/BitmapImage.cpp \
        Source/WebCore/platform/graphics/BitmapImage.h \
        Source/WebCore/platform/graphics/Color.cpp \
index d9ce6ce..5dd81ed 100644 (file)
@@ -3956,6 +3956,7 @@ enable?(WEBGL) {
 
 use?(3D_GRAPHICS) {
     HEADERS += \
+        platform/graphics/cpu/arm/GraphicsContext3DNEON.h \
         platform/graphics/ANGLEWebKitBridge.h \
         platform/graphics/Extensions3D.h \
         platform/graphics/GraphicsContext3D.h \
index 0dfd277..b1b6440 100644 (file)
       '../platform/graphics',
       '../platform/graphics/chromium',
       '../platform/graphics/chromium/cc',
+      '../platform/graphics/cpu/arm',
       '../platform/graphics/filters',
       '../platform/graphics/filters/arm',
       '../platform/graphics/filters/skia',
index 2bb0fc7..c60ad54 100644 (file)
             'platform/graphics/TextRenderingMode.h',
             'platform/graphics/TextRun.h',
             'platform/graphics/TypesettingFeatures.h',
+            'platform/graphics/cpu/arm/GraphicsContext3DNEON.h',
             'platform/graphics/cg/ImageBufferDataCG.h',
             'platform/graphics/mac/ColorMac.h',
             'platform/graphics/mac/MediaPlayerProxy.h',
index 2958bfd..71d276c 100644 (file)
@@ -53,6 +53,7 @@ INCLUDEPATH += \
     $$SOURCE_DIR/platform/animation \
     $$SOURCE_DIR/platform/audio \
     $$SOURCE_DIR/platform/graphics \
+    $$SOURCE_DIR/platform/graphics/cpu/arm \
     $$SOURCE_DIR/platform/graphics/filters \
     $$SOURCE_DIR/platform/graphics/filters/arm \
     $$SOURCE_DIR/platform/graphics/opengl \
index 56c7fa2..b501e9b 100644 (file)
                93309E9F099EB78C0056E581 /* SharedTimerMac.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = SharedTimerMac.mm; sourceTree = "<group>"; };
                93309EA0099EB78C0056E581 /* SharedTimer.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = SharedTimer.h; sourceTree = "<group>"; };
                93309EA1099EB78C0056E581 /* Timer.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Timer.cpp; sourceTree = "<group>"; };
+               9332AB3D16515D7700D827EC /* GraphicsContext3DNEON.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = GraphicsContext3DNEON.h; sourceTree = "<group>"; };
                93354A3B0B24F8C9003F6DEA /* UIEventWithKeyState.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = UIEventWithKeyState.cpp; sourceTree = "<group>"; };
                933A142B0B7D188600A53FFD /* TextEvent.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = TextEvent.cpp; sourceTree = "<group>"; };
                933A142C0B7D188600A53FFD /* TextEvent.idl */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = TextEvent.idl; sourceTree = "<group>"; };
                        tabWidth = 4;
                        usesTabs = 0;
                };
+               9332AB3B16515D7700D827EC /* cpu */ = {
+                       isa = PBXGroup;
+                       children = (
+                               9332AB3C16515D7700D827EC /* arm */,
+                       );
+                       path = cpu;
+                       sourceTree = "<group>";
+               };
+               9332AB3C16515D7700D827EC /* arm */ = {
+                       isa = PBXGroup;
+                       children = (
+                               9332AB3D16515D7700D827EC /* GraphicsContext3DNEON.h */,
+                       );
+                       path = arm;
+                       sourceTree = "<group>";
+               };
                9363B6290F8E8FE000803810 /* cf */ = {
                        isa = PBXGroup;
                        children = (
                B2A015910AF6CD53006BCE0E /* graphics */ = {
                        isa = PBXGroup;
                        children = (
+                               9332AB3B16515D7700D827EC /* cpu */,
                                076F0D0812B8192700C26AA4 /* avfoundation */,
                                499B3EC0128CCC1800E726C2 /* ca */,
                                B27535290B053814002CE64F /* cg */,
index 5f6942b..f7baca2 100644 (file)
@@ -29,6 +29,7 @@
 #if USE(3D_GRAPHICS)
 
 #include "GraphicsContext3D.h"
+#include "GraphicsContext3DNEON.h"
 
 #include "CheckedInt.h"
 #include "DrawingBuffer.h"
@@ -541,6 +542,16 @@ void unpackOneRowOfRGBA5551ToRGBA8(const uint16_t* source, uint8_t* destination,
 
 void unpackOneRowOfRGBA4444ToRGBA8(const uint16_t* source, uint8_t* destination, unsigned int pixelsPerRow)
 {
+#if HAVE(ARM_NEON_INTRINSICS)
+    unsigned tailPixels = pixelsPerRow % 8;
+    unsigned pixelSize = pixelsPerRow - tailPixels;
+
+    ARM::unpackOneRowOfRGBA4444ToRGBA8NEON(source, destination, pixelSize);
+
+    source += pixelSize;
+    destination += pixelSize * 4;
+    pixelsPerRow = tailPixels;
+#endif
     for (unsigned int i = 0; i < pixelsPerRow; ++i) {
         uint16_t packedValue = source[0];
         uint8_t r = packedValue >> 12;
@@ -947,6 +958,17 @@ void packOneRowOfRGBA8ToRGBA8Unmultiply(const uint8_t* source, uint8_t* destinat
 
 void packOneRowOfRGBA8ToUnsignedShort4444(const uint8_t* source, uint16_t* destination, unsigned int pixelsPerRow)
 {
+#if HAVE(ARM_NEON_INTRINSICS)
+    unsigned componentsPerRow = pixelsPerRow * 4;
+    unsigned tailComponents = componentsPerRow % 32;
+    unsigned componentsSize = componentsPerRow - tailComponents;
+
+    ARM::packOneRowOfRGBA8ToUnsignedShort4444NEON(source, destination, componentsSize);
+
+    source += componentsSize;
+    destination += componentsSize / 4;
+    pixelsPerRow = tailComponents / 4;
+#endif
     for (unsigned int i = 0; i < pixelsPerRow; ++i) {
         *destination = (((source[0] & 0xF0) << 8)
                         | ((source[1] & 0xF0) << 4)
diff --git a/Source/WebCore/platform/graphics/cpu/arm/GraphicsContext3DNEON.h b/Source/WebCore/platform/graphics/cpu/arm/GraphicsContext3DNEON.h
new file mode 100644 (file)
index 0000000..d971de3
--- /dev/null
@@ -0,0 +1,85 @@
+/*
+ * Copyright (C) 2012 Gabor Rapcsanyi (rgabor@inf.u-szeged.hu), University of Szeged
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF SZEGED ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL UNIVERSITY OF SZEGED OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef GraphicsContext3DNEON_h
+#define GraphicsContext3DNEON_h
+
+#if HAVE(ARM_NEON_INTRINSICS)
+
+#include <arm_neon.h>
+
+namespace WebCore {
+
+namespace ARM {
+
+ALWAYS_INLINE void unpackOneRowOfRGBA4444ToRGBA8NEON(const uint16_t* source, uint8_t* destination, unsigned pixelSize)
+{
+    uint16x8_t constant = vdupq_n_u16(0x0F);
+    for (unsigned i = 0; i < pixelSize; i += 8) {
+        uint16x8_t eightPixels = vld1q_u16(source + i);
+
+        uint8x8_t componentR = vqmovn_u16(vshrq_n_u16(eightPixels, 12));
+        uint8x8_t componentG = vqmovn_u16(vandq_u16(vshrq_n_u16(eightPixels, 8), constant));
+        uint8x8_t componentB = vqmovn_u16(vandq_u16(vshrq_n_u16(eightPixels, 4), constant));
+        uint8x8_t componentA = vqmovn_u16(vandq_u16(eightPixels, constant));
+
+        componentR = vorr_u8(vshl_n_u8(componentR, 4), componentR);
+        componentG = vorr_u8(vshl_n_u8(componentG, 4), componentG);
+        componentB = vorr_u8(vshl_n_u8(componentB, 4), componentB);
+        componentA = vorr_u8(vshl_n_u8(componentA, 4), componentA);
+
+        uint8x8x4_t destComponents = {componentR, componentG, componentB, componentA};
+        vst4_u8(destination, destComponents);
+        destination += 32;
+    }
+}
+
+ALWAYS_INLINE void packOneRowOfRGBA8ToUnsignedShort4444NEON(const uint8_t* source, uint16_t* destination, unsigned componentsSize)
+{
+    uint8_t* dst = reinterpret_cast<uint8_t*>(destination);
+    uint8x8_t constant = vdup_n_u8(0xF0);
+    for (unsigned i = 0; i < componentsSize; i += 32) {
+        uint8x8x4_t RGBA8 = vld4_u8(source + i);
+
+        uint8x8_t componentR = vand_u8(RGBA8.val[0], constant);
+        uint8x8_t componentG = vshr_n_u8(vand_u8(RGBA8.val[1], constant), 4);
+        uint8x8_t componentB = vand_u8(RGBA8.val[2], constant);
+        uint8x8_t componentA = vshr_n_u8(vand_u8(RGBA8.val[3], constant), 4);
+
+        uint8x8x2_t RGBA4;
+        RGBA4.val[0] = vorr_u8(componentB, componentA);
+        RGBA4.val[1] = vorr_u8(componentR, componentG);
+        vst2_u8(dst, RGBA4);
+        dst += 16;
+    }
+}
+
+} // namespace ARM
+
+} // namespace WebCore
+
+#endif // HAVE(ARM_NEON_INTRINSICS)
+
+#endif // GraphicsContext3DNEON_h