Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SIMD: Filter SSE4 & AVX2 #8301

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,15 +52,19 @@ jobs:
include:
- { python-version: "3.11", PYTHONOPTIMIZE: 1, REVERSE: "--reverse" }
- { python-version: "3.10", PYTHONOPTIMIZE: 2 }
# SIMD-accelerated builds for x86
- { os: "ubuntu-latest", python-version: "3.9", acceleration: "sse4"}
- { os: "ubuntu-latest", python-version: "3.12", acceleration: "avx2"}
# Free-threaded
- { os: "ubuntu-latest", python-version: "3.13-dev", disable-gil: true }
# M1 only available for 3.10+
- { os: "macos-13", python-version: "3.9" }
- { os: "macos-13", python-version: "3.9", acceleration: "avx2"}
exclude:
- { os: "macos-14", python-version: "3.9" }

runs-on: ${{ matrix.os }}
name: ${{ matrix.os }} Python ${{ matrix.python-version }} ${{ matrix.disable-gil && 'free-threaded' || '' }}
name: ${{ matrix.os }} Python ${{ matrix.python-version }} ${{ matrix.acceleration }} ${{ matrix.disable-gil && 'free-threaded' || '' }}

steps:
- uses: actions/checkout@v4
Expand Down Expand Up @@ -108,7 +112,7 @@ jobs:
GHA_LIBIMAGEQUANT_CACHE_HIT: ${{ steps.cache-libimagequant.outputs.cache-hit }}

- name: Install macOS dependencies
if: startsWith(matrix.os, 'macOS')
if: startsWith(matrix.os, 'macos')
run: |
.github/workflows/macos-install.sh
env:
Expand All @@ -118,6 +122,11 @@ jobs:
if: "matrix.os == 'ubuntu-latest' && matrix.python-version == '3.12'"
run: echo "::add-matcher::.github/problem-matchers/gcc.json"

- name: Set compiler options for optimization
if: ${{ matrix.acceleration }}
run: |
echo "CC=cc -m${{ matrix.acceleration }}" >> $GITHUB_ENV

- name: Build
run: |
.ci/build.sh
Expand Down
4 changes: 3 additions & 1 deletion Tests/test_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,9 @@ def test(name: str, function: Callable[[str], str | None]) -> None:
assert version is None
else:
assert function(name) == version
if name != "PIL":
if name == "acceleration":
assert version in ("avx2", "sse4", "sse2", "neon", None)
elif name != "PIL":
if name == "zlib" and version is not None:
version = re.sub(".zlib-ng$", "", version)
elif name == "libtiff" and version is not None:
Expand Down
4 changes: 3 additions & 1 deletion src/PIL/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ def get_supported_codecs() -> list[str]:
"libjpeg_turbo": ("PIL._imaging", "HAVE_LIBJPEGTURBO", "libjpeg_turbo_version"),
"libimagequant": ("PIL._imaging", "HAVE_LIBIMAGEQUANT", "imagequant_version"),
"xcb": ("PIL._imaging", "HAVE_XCB", None),
"acceleration": ("PIL._imaging", "acceleration", "acceleration"),
}


Expand Down Expand Up @@ -267,6 +268,7 @@ def pilinfo(out: IO[str] | None = None, supported_formats: bool = True) -> None:

for name, feature in [
("pil", "PIL CORE"),
("acceleration", "Acceleration"),
("tkinter", "TKINTER"),
("freetype2", "FREETYPE2"),
("littlecms2", "LITTLECMS2"),
Expand All @@ -291,7 +293,7 @@ def pilinfo(out: IO[str] | None = None, supported_formats: bool = True) -> None:
if v is None:
v = version(name)
if v is not None:
version_static = name in ("pil", "jpg")
version_static = name in ("pil", "jpg", "acceleration")
if name == "littlecms2":
# this check is also in src/_imagingcms.c:setup_module()
version_static = tuple(int(x) for x in v.split(".")) < (2, 7)
Expand Down
13 changes: 13 additions & 0 deletions src/_imaging.c
Original file line number Diff line number Diff line change
Expand Up @@ -4407,6 +4407,19 @@
Py_INCREF(have_xcb);
PyModule_AddObject(m, "HAVE_XCB", have_xcb);

#ifdef __AVX2__
PyModule_AddStringConstant(m, "acceleration", "avx2");
#elif defined(__SSE4__)
PyModule_AddStringConstant(m, "acceleration", "sse4");

Check warning on line 4413 in src/_imaging.c

View check run for this annotation

Codecov / codecov/patch

src/_imaging.c#L4413

Added line #L4413 was not covered by tests
#elif defined(__SSE2__)
PyModule_AddStringConstant(m, "acceleration", "sse2");
#elif defined(__NEON__)
PyModule_AddStringConstant(m, "acceleration", "neon");
#else
Py_INCREF(Py_False);
PyModule_AddObject(m, "acceleration", Py_False);
#endif

PyObject *pillow_version = PyUnicode_FromString(version);
PyDict_SetItemString(
d, "PILLOW_VERSION", pillow_version ? pillow_version : Py_None
Expand Down
63 changes: 63 additions & 0 deletions src/libImaging/Bands.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ Imaging
ImagingGetBand(Imaging imIn, int band) {
Imaging imOut;
int x, y;
#ifdef __SSE4__
__m128i shuffle_mask;
#endif

/* Check arguments */
if (!imIn || imIn->type != IMAGING_TYPE_UINT8) {
Expand All @@ -46,14 +49,41 @@ ImagingGetBand(Imaging imIn, int band) {
return NULL;
}

#ifdef __SSE4__
shuffle_mask = _mm_set_epi8(
-1,
-1,
-1,
-1,
-1,
-1,
-1,
-1,
-1,
-1,
-1,
-1,
12 + band,
8 + band,
4 + band,
0 + band
);
#endif

/* Extract band from image */
for (y = 0; y < imIn->ysize; y++) {
UINT8 *in = (UINT8 *)imIn->image[y] + band;
UINT8 *out = imOut->image8[y];
x = 0;
for (; x < imIn->xsize - 3; x += 4) {
#ifdef __SSE4__
__m128i source = _mm_loadu_si128((__m128i *)(in - band));
*((UINT32 *)(out + x)) =
_mm_cvtsi128_si32(_mm_shuffle_epi8(source, shuffle_mask));
#else
UINT32 v = MAKE_UINT32(in[0], in[4], in[8], in[12]);
memcpy(out + x, &v, sizeof(v));
#endif
in += 16;
}
for (; x < imIn->xsize; x++) {
Expand Down Expand Up @@ -99,10 +129,20 @@ ImagingSplit(Imaging imIn, Imaging bands[4]) {
UINT8 *out1 = bands[1]->image8[y];
x = 0;
for (; x < imIn->xsize - 3; x += 4) {
#ifdef __SSE4__
__m128i source = _mm_loadu_si128((__m128i *)in);
source = _mm_shuffle_epi8(
source,
_mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0)
);
*((UINT32 *)(out0 + x)) = _mm_cvtsi128_si32(source);
*((UINT32 *)(out1 + x)) = _mm_cvtsi128_si32(_mm_srli_si128(source, 12));
#else
UINT32 v = MAKE_UINT32(in[0], in[4], in[8], in[12]);
memcpy(out0 + x, &v, sizeof(v));
v = MAKE_UINT32(in[0 + 3], in[4 + 3], in[8 + 3], in[12 + 3]);
memcpy(out1 + x, &v, sizeof(v));
#endif
in += 16;
}
for (; x < imIn->xsize; x++) {
Expand All @@ -119,12 +159,23 @@ ImagingSplit(Imaging imIn, Imaging bands[4]) {
UINT8 *out2 = bands[2]->image8[y];
x = 0;
for (; x < imIn->xsize - 3; x += 4) {
#ifdef __SSE4__
__m128i source = _mm_loadu_si128((__m128i *)in);
source = _mm_shuffle_epi8(
source,
_mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0)
);
*((UINT32 *)(out0 + x)) = _mm_cvtsi128_si32(source);
*((UINT32 *)(out1 + x)) = _mm_cvtsi128_si32(_mm_srli_si128(source, 4));
*((UINT32 *)(out2 + x)) = _mm_cvtsi128_si32(_mm_srli_si128(source, 8));
#else
UINT32 v = MAKE_UINT32(in[0], in[4], in[8], in[12]);
memcpy(out0 + x, &v, sizeof(v));
v = MAKE_UINT32(in[0 + 1], in[4 + 1], in[8 + 1], in[12 + 1]);
memcpy(out1 + x, &v, sizeof(v));
v = MAKE_UINT32(in[0 + 2], in[4 + 2], in[8 + 2], in[12 + 2]);
memcpy(out2 + x, &v, sizeof(v));
#endif
in += 16;
}
for (; x < imIn->xsize; x++) {
Expand All @@ -143,6 +194,17 @@ ImagingSplit(Imaging imIn, Imaging bands[4]) {
UINT8 *out3 = bands[3]->image8[y];
x = 0;
for (; x < imIn->xsize - 3; x += 4) {
#ifdef __SSE4__
__m128i source = _mm_loadu_si128((__m128i *)in);
source = _mm_shuffle_epi8(
source,
_mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0)
);
*((UINT32 *)(out0 + x)) = _mm_cvtsi128_si32(source);
*((UINT32 *)(out1 + x)) = _mm_cvtsi128_si32(_mm_srli_si128(source, 4));
*((UINT32 *)(out2 + x)) = _mm_cvtsi128_si32(_mm_srli_si128(source, 8));
*((UINT32 *)(out3 + x)) = _mm_cvtsi128_si32(_mm_srli_si128(source, 12));
#else
UINT32 v = MAKE_UINT32(in[0], in[4], in[8], in[12]);
memcpy(out0 + x, &v, sizeof(v));
v = MAKE_UINT32(in[0 + 1], in[4 + 1], in[8 + 1], in[12 + 1]);
Expand All @@ -151,6 +213,7 @@ ImagingSplit(Imaging imIn, Imaging bands[4]) {
memcpy(out2 + x, &v, sizeof(v));
v = MAKE_UINT32(in[0 + 3], in[4 + 3], in[8 + 3], in[12 + 3]);
memcpy(out3 + x, &v, sizeof(v));
#endif
in += 16;
}
for (; x < imIn->xsize; x++) {
Expand Down
71 changes: 71 additions & 0 deletions src/libImaging/Filter.c
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,24 @@

#include "Imaging.h"

#if defined(__SSE4__)
/* 5 is enough bits to account for all kernel coefficients (1<<5 > 25).
8 is the number of bits in the result.
Any coefficient delta smaller than this precision will have no effect. */
#define PRECISION_BITS (8 + 5)
/* 16 is the number of bits required for kernel storage.
Comment on lines +30 to +34
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
/* 5 is number of bits enought to account all kernel coefficients (1<<5 > 25).
8 is number of bits in result.
Any coefficients delta smaller than this precision will have no effect. */
#define PRECISION_BITS (8 + 5)
/* 16 is number of bis required for kernel storage.
/* 5 is enough bits to account for all kernel coefficients (1<<5 > 25).
8 is the number of bits in the result.
Any coefficients delta smaller than this precision will have no effect. */
#define PRECISION_BITS (8 + 5)
/* 16 is the number of bits required for kernel storage.

1 bit is reserved for the sign.
The largest possible kernel coefficient is: */
#define LARGEST_KERNEL (1 << (16 - 1 - PRECISION_BITS))

#include "FilterSIMD_3x3f_u8.c"
#include "FilterSIMD_3x3f_4u8.c"
#include "FilterSIMD_3x3i_4u8.c"
#include "FilterSIMD_5x5f_u8.c"
#include "FilterSIMD_5x5f_4u8.c"
#include "FilterSIMD_5x5i_4u8.c"
#endif // defined(__SSE4__)

static inline UINT8
clip8(float in) {
if (in <= 0.0) {
Expand Down Expand Up @@ -401,13 +419,66 @@
}

ImagingSectionEnter(&cookie);
#if defined(__SSE4__)
{
int i, fast_path = 1;
FLOAT32 tmp;
INT16 norm_kernel[25];
INT32 norm_offset;

for (i = 0; i < xsize * ysize; i++) {
tmp = kernel[i] * (1 << PRECISION_BITS);
tmp += (tmp >= 0) ? 0.5 : -0.5;
if (tmp >= 32768 || tmp <= -32769) {
fast_path = 0;
break;
}
norm_kernel[i] = tmp;
}
tmp = offset * (1 << PRECISION_BITS);
tmp += (tmp >= 0) ? 0.5 : -0.5;
if (tmp >= 1 << 30) {
norm_offset = 1 << 30;

Check warning on line 441 in src/libImaging/Filter.c

View check run for this annotation

Codecov / codecov/patch

src/libImaging/Filter.c#L441

Added line #L441 was not covered by tests
} else if (tmp <= -1 << 30) {
norm_offset = -1 << 30;
} else {

Check warning on line 444 in src/libImaging/Filter.c

View check run for this annotation

Codecov / codecov/patch

src/libImaging/Filter.c#L443-L444

Added lines #L443 - L444 were not covered by tests
norm_offset = tmp;
}

if (xsize == 3) {
/* 3x3 kernel. */
if (im->image8) {
ImagingFilter3x3f_u8(imOut, im, kernel, offset);
} else {
if (fast_path) {
ImagingFilter3x3i_4u8(imOut, im, norm_kernel, norm_offset);
} else {
ImagingFilter3x3f_4u8(imOut, im, kernel, offset);
}
}
} else {
/* 5x5 kernel. */
if (im->image8) {
ImagingFilter5x5f_u8(imOut, im, kernel, offset);
} else {
if (fast_path) {
ImagingFilter5x5i_4u8(imOut, im, norm_kernel, norm_offset);
} else {
ImagingFilter5x5f_4u8(imOut, im, kernel, offset);

Check warning on line 467 in src/libImaging/Filter.c

View check run for this annotation

Codecov / codecov/patch

src/libImaging/Filter.c#L467

Added line #L467 was not covered by tests
}
}
}
}
#else
if (xsize == 3) {
/* 3x3 kernel. */
ImagingFilter3x3(imOut, im, kernel, offset);
} else {
/* 5x5 kernel. */
ImagingFilter5x5(imOut, im, kernel, offset);
}
#endif

ImagingSectionLeave(&cookie);
return imOut;
}
Loading
Loading