python-pillow · homm · Jul 6, 2024 · Jul 6, 2024 · Jul 6, 2024 · Jul 7, 2024
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -52,15 +52,19 @@ jobs:
         include:
         - { python-version: "3.11", PYTHONOPTIMIZE: 1, REVERSE: "--reverse" }
         - { python-version: "3.10", PYTHONOPTIMIZE: 2 }
+        # SIMD-accelerated builds for x86
+        - { os: "ubuntu-latest", python-version: "3.9", acceleration: "sse4"}
+        - { os: "ubuntu-latest", python-version: "3.12", acceleration: "avx2"}
         # Free-threaded
         - { os: "ubuntu-latest", python-version: "3.13-dev", disable-gil: true }
         # M1 only available for 3.10+
         - { os: "macos-13", python-version: "3.9" }
+        - { os: "macos-13", python-version: "3.9", acceleration: "avx2"}
         exclude:
         - { os: "macos-14", python-version: "3.9" }
 
     runs-on: ${{ matrix.os }}
-    name: ${{ matrix.os }} Python ${{ matrix.python-version }} ${{ matrix.disable-gil && 'free-threaded' || '' }}
+    name: ${{ matrix.os }} Python ${{ matrix.python-version }} ${{ matrix.acceleration }} ${{ matrix.disable-gil && 'free-threaded' || '' }}
 
     steps:
     - uses: actions/checkout@v4
@@ -108,7 +112,7 @@ jobs:
         GHA_LIBIMAGEQUANT_CACHE_HIT: ${{ steps.cache-libimagequant.outputs.cache-hit }}
 
     - name: Install macOS dependencies
-      if: startsWith(matrix.os, 'macOS')
+      if: startsWith(matrix.os, 'macos')
       run: |
         .github/workflows/macos-install.sh
       env:
@@ -118,6 +122,13 @@ jobs:
       if: "matrix.os == 'ubuntu-latest' && matrix.python-version == '3.12'"
       run: echo "::add-matcher::.github/problem-matchers/gcc.json"
 
+    # Runs only for matrix entries that set `acceleration`. The -m<value> flag
+    # is appended to CC and exported through GITHUB_ENV for the later steps.
+    - name: Set compiler options for optimization
+      if: ${{ matrix.acceleration }}
+      run: |
+        echo "CC=cc -m${{ matrix.acceleration }}" >> "$GITHUB_ENV"
+
     - name: Build
       run: |
         .ci/build.sh

diff --git a/Tests/test_features.py b/Tests/test_features.py
@@ -36,7 +36,9 @@ def test(name: str, function: Callable[[str], str | None]) -> None:
             assert version is None
         else:
             assert function(name) == version
-            if name != "PIL":
+            if name == "acceleration":
+                assert version in ("avx2", "sse4", "sse2", "neon", None)
+            elif name != "PIL":
                 if name == "zlib" and version is not None:
                     version = re.sub(".zlib-ng$", "", version)
                 elif name == "libtiff" and version is not None:

diff --git a/src/PIL/features.py b/src/PIL/features.py
@@ -128,6 +128,7 @@ def get_supported_codecs() -> list[str]:
     "libjpeg_turbo": ("PIL._imaging", "HAVE_LIBJPEGTURBO", "libjpeg_turbo_version"),
     "libimagequant": ("PIL._imaging", "HAVE_LIBIMAGEQUANT", "imagequant_version"),
     "xcb": ("PIL._imaging", "HAVE_XCB", None),
+    "acceleration": ("PIL._imaging", "acceleration", "acceleration"),
 }
 
 
@@ -267,6 +268,7 @@ def pilinfo(out: IO[str] | None = None, supported_formats: bool = True) -> None:
 
     for name, feature in [
         ("pil", "PIL CORE"),
+        ("acceleration", "Acceleration"),
         ("tkinter", "TKINTER"),
         ("freetype2", "FREETYPE2"),
         ("littlecms2", "LITTLECMS2"),
@@ -291,7 +293,7 @@ def pilinfo(out: IO[str] | None = None, supported_formats: bool = True) -> None:
             if v is None:
                 v = version(name)
             if v is not None:
-                version_static = name in ("pil", "jpg")
+                version_static = name in ("pil", "jpg", "acceleration")
                 if name == "littlecms2":
                     # this check is also in src/_imagingcms.c:setup_module()
                     version_static = tuple(int(x) for x in v.split(".")) < (2, 7)

diff --git a/src/_imaging.c b/src/_imaging.c
@@ -4407,6 +4407,19 @@
     Py_INCREF(have_xcb);
     PyModule_AddObject(m, "HAVE_XCB", have_xcb);
 
+#ifdef __AVX2__
+    PyModule_AddStringConstant(m, "acceleration", "avx2");
+#elif defined(__SSE4__)
+    PyModule_AddStringConstant(m, "acceleration", "sse4");
+#elif defined(__SSE2__)
+    PyModule_AddStringConstant(m, "acceleration", "sse2");
+#elif defined(__NEON__)
+    PyModule_AddStringConstant(m, "acceleration", "neon");
+#else
+    Py_INCREF(Py_False);
+    PyModule_AddObject(m, "acceleration", Py_False);
+#endif
+
     PyObject *pillow_version = PyUnicode_FromString(version);
     PyDict_SetItemString(
         d, "PILLOW_VERSION", pillow_version ? pillow_version : Py_None

diff --git a/src/libImaging/Bands.c b/src/libImaging/Bands.c
@@ -21,6 +21,9 @@ Imaging
 ImagingGetBand(Imaging imIn, int band) {
     Imaging imOut;
     int x, y;
+#ifdef __SSE4__
+    __m128i shuffle_mask;
+#endif
 
     /* Check arguments */
     if (!imIn || imIn->type != IMAGING_TYPE_UINT8) {
@@ -46,14 +49,41 @@ ImagingGetBand(Imaging imIn, int band) {
         return NULL;
     }
 
+#ifdef __SSE4__
+    shuffle_mask = _mm_set_epi8(
+        -1,
+        -1,
+        -1,
+        -1,
+        -1,
+        -1,
+        -1,
+        -1,
+        -1,
+        -1,
+        -1,
+        -1,
+        12 + band,
+        8 + band,
+        4 + band,
+        0 + band
+    );
+#endif
+
     /* Extract band from image */
     for (y = 0; y < imIn->ysize; y++) {
         UINT8 *in = (UINT8 *)imIn->image[y] + band;
         UINT8 *out = imOut->image8[y];
         x = 0;
         for (; x < imIn->xsize - 3; x += 4) {
+#ifdef __SSE4__
+            __m128i source = _mm_loadu_si128((__m128i *)(in - band));
+            *((UINT32 *)(out + x)) =
+                _mm_cvtsi128_si32(_mm_shuffle_epi8(source, shuffle_mask));
+#else
             UINT32 v = MAKE_UINT32(in[0], in[4], in[8], in[12]);
             memcpy(out + x, &v, sizeof(v));
+#endif
             in += 16;
         }
         for (; x < imIn->xsize; x++) {
@@ -99,10 +129,20 @@ ImagingSplit(Imaging imIn, Imaging bands[4]) {
             UINT8 *out1 = bands[1]->image8[y];
             x = 0;
             for (; x < imIn->xsize - 3; x += 4) {
+#ifdef __SSE4__
+                __m128i source = _mm_loadu_si128((__m128i *)in);
+                source = _mm_shuffle_epi8(
+                    source,
+                    _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0)
+                );
+                *((UINT32 *)(out0 + x)) = _mm_cvtsi128_si32(source);
+                *((UINT32 *)(out1 + x)) = _mm_cvtsi128_si32(_mm_srli_si128(source, 12));
+#else
                 UINT32 v = MAKE_UINT32(in[0], in[4], in[8], in[12]);
                 memcpy(out0 + x, &v, sizeof(v));
                 v = MAKE_UINT32(in[0 + 3], in[4 + 3], in[8 + 3], in[12 + 3]);
                 memcpy(out1 + x, &v, sizeof(v));
+#endif
                 in += 16;
             }
             for (; x < imIn->xsize; x++) {
@@ -119,12 +159,23 @@ ImagingSplit(Imaging imIn, Imaging bands[4]) {
             UINT8 *out2 = bands[2]->image8[y];
             x = 0;
             for (; x < imIn->xsize - 3; x += 4) {
+#ifdef __SSE4__
+                __m128i source = _mm_loadu_si128((__m128i *)in);
+                source = _mm_shuffle_epi8(
+                    source,
+                    _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0)
+                );
+                *((UINT32 *)(out0 + x)) = _mm_cvtsi128_si32(source);
+                *((UINT32 *)(out1 + x)) = _mm_cvtsi128_si32(_mm_srli_si128(source, 4));
+                *((UINT32 *)(out2 + x)) = _mm_cvtsi128_si32(_mm_srli_si128(source, 8));
+#else
                 UINT32 v = MAKE_UINT32(in[0], in[4], in[8], in[12]);
                 memcpy(out0 + x, &v, sizeof(v));
                 v = MAKE_UINT32(in[0 + 1], in[4 + 1], in[8 + 1], in[12 + 1]);
                 memcpy(out1 + x, &v, sizeof(v));
                 v = MAKE_UINT32(in[0 + 2], in[4 + 2], in[8 + 2], in[12 + 2]);
                 memcpy(out2 + x, &v, sizeof(v));
+#endif
                 in += 16;
             }
             for (; x < imIn->xsize; x++) {
@@ -143,6 +194,17 @@ ImagingSplit(Imaging imIn, Imaging bands[4]) {
             UINT8 *out3 = bands[3]->image8[y];
             x = 0;
             for (; x < imIn->xsize - 3; x += 4) {
+#ifdef __SSE4__
+                __m128i source = _mm_loadu_si128((__m128i *)in);
+                source = _mm_shuffle_epi8(
+                    source,
+                    _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0)
+                );
+                *((UINT32 *)(out0 + x)) = _mm_cvtsi128_si32(source);
+                *((UINT32 *)(out1 + x)) = _mm_cvtsi128_si32(_mm_srli_si128(source, 4));
+                *((UINT32 *)(out2 + x)) = _mm_cvtsi128_si32(_mm_srli_si128(source, 8));
+                *((UINT32 *)(out3 + x)) = _mm_cvtsi128_si32(_mm_srli_si128(source, 12));
+#else
                 UINT32 v = MAKE_UINT32(in[0], in[4], in[8], in[12]);
                 memcpy(out0 + x, &v, sizeof(v));
                 v = MAKE_UINT32(in[0 + 1], in[4 + 1], in[8 + 1], in[12 + 1]);
@@ -151,6 +213,7 @@ ImagingSplit(Imaging imIn, Imaging bands[4]) {
                 memcpy(out2 + x, &v, sizeof(v));
                 v = MAKE_UINT32(in[0 + 3], in[4 + 3], in[8 + 3], in[12 + 3]);
                 memcpy(out3 + x, &v, sizeof(v));
+#endif
                 in += 16;
             }
             for (; x < imIn->xsize; x++) {

diff --git a/src/libImaging/Filter.c b/src/libImaging/Filter.c
@@ -26,6 +26,24 @@
 
 #include "Imaging.h"
 
+#if defined(__SSE4__)
+/* 5 is enough bits to account for all kernel coefficients (1<<5 > 25).
+   8 is the number of bits in the result.
+   Any coefficient delta smaller than this precision will have no effect. */
+#define PRECISION_BITS (8 + 5)
+/* 16 is the number of bits required for kernel storage.
+   1 bit is reserved for sign.
+   Largest possible kernel coefficient is 4 (1 << 2). */
+#define LARGEST_KERNEL (1 << (16 - 1 - PRECISION_BITS))
+
+#include "FilterSIMD_3x3f_u8.c"
+#include "FilterSIMD_3x3f_4u8.c"
+#include "FilterSIMD_3x3i_4u8.c"
+#include "FilterSIMD_5x5f_u8.c"
+#include "FilterSIMD_5x5f_4u8.c"
+#include "FilterSIMD_5x5i_4u8.c"
+#endif  // defined(__SSE4__)
+
 static inline UINT8
 clip8(float in) {
     if (in <= 0.0) {
@@ -401,13 +419,74 @@
     }
 
     ImagingSectionEnter(&cookie);
+#if defined(__SSE4__)
+    {
+        /* Fixed-point copies of the kernel and offset, scaled by
+           1 << PRECISION_BITS, for the *i_4u8 filters below.
+           fast_path is cleared when a coefficient does not fit in INT16. */
+        int i, fast_path = 1;
+        FLOAT32 tmp;
+        INT16 norm_kernel[25];
+        INT32 norm_offset;
+
+        for (i = 0; i < xsize * ysize; i++) {
+            tmp = kernel[i] * (1 << PRECISION_BITS);
+            /* Round half away from zero before truncating to INT16. */
+            tmp += (tmp >= 0) ? 0.5 : -0.5;
+            if (tmp >= 32768 || tmp <= -32769) {
+                fast_path = 0;
+                break;
+            }
+            norm_kernel[i] = tmp;
+        }
+        tmp = offset * (1 << PRECISION_BITS);
+        tmp += (tmp >= 0) ? 0.5 : -0.5;
+        /* Clamp the offset to [-(1 << 30), 1 << 30]. Written as -(1 << 30)
+           because left-shifting the negative value -1 is undefined in C. */
+        if (tmp >= 1 << 30) {
+            norm_offset = 1 << 30;
+        } else if (tmp <= -(1 << 30)) {
+            norm_offset = -(1 << 30);
+        } else {
+            norm_offset = tmp;
+        }
+
+        /* im->image8 images always get the *f_u8 filters with the unscaled
+           kernel; other images get the *i_4u8 filters only on fast_path. */
+        if (xsize == 3) {
+            /* 3x3 kernel. */
+            if (im->image8) {
+                ImagingFilter3x3f_u8(imOut, im, kernel, offset);
+            } else {
+                if (fast_path) {
+                    ImagingFilter3x3i_4u8(imOut, im, norm_kernel, norm_offset);
+                } else {
+                    ImagingFilter3x3f_4u8(imOut, im, kernel, offset);
+                }
+            }
+        } else {
+            /* 5x5 kernel. */
+            if (im->image8) {
+                ImagingFilter5x5f_u8(imOut, im, kernel, offset);
+            } else {
+                if (fast_path) {
+                    ImagingFilter5x5i_4u8(imOut, im, norm_kernel, norm_offset);
+                } else {
+                    ImagingFilter5x5f_4u8(imOut, im, kernel, offset);
+                }
+            }
+        }
+    }
+#else
     if (xsize == 3) {
         /* 3x3 kernel. */
         ImagingFilter3x3(imOut, im, kernel, offset);
     } else {
         /* 5x5 kernel. */
         ImagingFilter5x5(imOut, im, kernel, offset);
     }
+#endif
+
     ImagingSectionLeave(&cookie);
     return imOut;
 }