Skip to content

Commit

Permalink
Use span and improve spelling (#301)
Browse files Browse the repository at this point in the history
  • Loading branch information
vbaderks authored Jan 20, 2024
1 parent cb6a9d7 commit 1ba2294
Show file tree
Hide file tree
Showing 15 changed files with 356 additions and 88 deletions.
3 changes: 3 additions & 0 deletions CharLS.sln.DotSettings
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@
<s:String x:Key="/Default/CodeStyle/Naming/CppNaming/Rules/=Union_0020members/@EntryIndexedValue">&lt;NamingElement Priority="12"&gt;&lt;Descriptor Static="Indeterminate" Constexpr="Indeterminate" Const="Indeterminate" Volatile="Indeterminate" Accessibility="NOT_APPLICABLE"&gt;&lt;type Name="union member" /&gt;&lt;/Descriptor&gt;&lt;Policy Inspect="False" Prefix="" Suffix="" Style="aa_bb" /&gt;&lt;/NamingElement&gt;</s:String>
<s:Boolean x:Key="/Default/Environment/CppEnableCppCli/IsEnabled/@EntryValue">False</s:Boolean>
<s:Boolean x:Key="/Default/Environment/SettingsMigration/IsMigratorApplied/=JetBrains_002EReSharper_002EFeature_002EServices_002EDaemon_002ESettings_002EMigration_002ESwaWarningsModeSettingsMigrate/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/GrammarAndSpelling/GrammarChecking/Exceptions/=G_0020G/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/GrammarAndSpelling/GrammarChecking/Exceptions/=Modified_0020Modified/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/GrammarAndSpelling/GrammarChecking/Exceptions/=than/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=alphatest/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=anymap/@EntryIndexedValue">True</s:Boolean>
Expand Down Expand Up @@ -89,6 +91,7 @@
<s:Boolean x:Key="/Default/UserDictionary/Words/=losslesstraits/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=maxval/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=memchr/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=memset/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=mrfx/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=nightshot/@EntryIndexedValue">True</s:Boolean>
<s:Boolean x:Key="/Default/UserDictionary/Words/=NODISCARD/@EntryIndexedValue">True</s:Boolean>
Expand Down
289 changes: 286 additions & 3 deletions benchmark/benchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -313,16 +313,299 @@ overwrite_buffer allocate_overwrite_buffer(const size_t size)
return buffer;
}


// Benchmarks allocate_buffer and allocate_overwrite_buffer back to back,
// each for a small and a large buffer size.
static void bm_resize_overwrite_buffer(benchmark::State& state)
{
    constexpr size_t small_size{size_t{512} * 512 * 16};
    constexpr size_t large_size{size_t{1024} * 1024 * 8 * 3};

    for (const auto _ : state)
    {
        // Regular buffers first, overwrite buffers second (same order per iteration).
        benchmark::DoNotOptimize(allocate_buffer(small_size));
        benchmark::DoNotOptimize(allocate_buffer(large_size));
        benchmark::DoNotOptimize(allocate_overwrite_buffer(small_size));
        benchmark::DoNotOptimize(allocate_overwrite_buffer(large_size));
    }
}
BENCHMARK(bm_resize_overwrite_buffer);


// Zero-fills the first `size` bytes of `data` using memset.
// Always returns 0; the return value only exists so that callers have a
// result to hand to benchmark::DoNotOptimize.
int memset_buffer(uint8_t* data, const size_t size)
{
    constexpr int fill_value{0};
    constexpr int status{0};

    memset(data, fill_value, size);
    return status;
}
// Benchmarks memset_buffer on a prefix of the buffer and on the full buffer.
static void bm_memset_buffer(benchmark::State& state)
{
    constexpr size_t small_size{size_t{512} * 512 * 16};
    constexpr size_t large_size{size_t{1024} * 1024 * 8 * 3};

    // One allocation, sized for the larger of the two fills, reused by every iteration.
    std::vector<uint8_t> buffer(large_size);

    for (const auto _ : state)
    {
        uint8_t* const destination{buffer.data()};
        benchmark::DoNotOptimize(memset_buffer(destination, small_size));
        benchmark::DoNotOptimize(memset_buffer(destination, large_size));
    }
}
BENCHMARK(bm_memset_buffer);


// Returns true when at least one of the four low-order bytes of `value`
// equals 0xFF. Each byte is shifted down and tested individually, lowest
// byte first.
bool has_ff_byte_classic(const unsigned int value)
{
    constexpr unsigned int byte_mask{0xFF};

    if ((value & byte_mask) == byte_mask)
        return true;

    if (((value >> 8) & byte_mask) == byte_mask)
        return true;

    if (((value >> 16) & byte_mask) == byte_mask)
        return true;

    return ((value >> 24) & byte_mask) == byte_mask;
}
// Benchmarks has_ff_byte_classic with one input that contains no 0xFF byte
// and one that does.
static void bm_has_ff_byte_classic(benchmark::State& state)
{
    // The inputs are kept in variables and passed through DoNotOptimize so the
    // compiler cannot treat them as compile-time constants and fold the calls
    // away; with literal arguments only the stores of pre-computed results
    // would be measured.
    unsigned int value_without_ff{0};
    unsigned int value_with_ff{0xFF};

    for (const auto _ : state)
    {
        benchmark::DoNotOptimize(value_without_ff);
        benchmark::DoNotOptimize(value_with_ff);
        benchmark::DoNotOptimize(has_ff_byte_classic(value_without_ff));
        benchmark::DoNotOptimize(has_ff_byte_classic(value_with_ff));
    }
}
BENCHMARK(bm_has_ff_byte_classic);

// Returns true when at least one byte of `value` equals 0xFF.
// Walks over every byte position of an unsigned int and tests it with a
// shifted mask.
bool has_ff_byte_loop(const unsigned int value)
{
    // The index is unsigned to match sizeof, and the mask is built from an
    // unsigned literal so that shifting 0xFF into the top byte never goes
    // through a signed int.
    for (size_t i{}; i < sizeof(unsigned int); ++i)
    {
        const unsigned int mask{0xFFU << (8 * i)};
        if ((value & mask) == mask)
        {
            return true;
        }
    }
    return false;
}
// Benchmarks has_ff_byte_loop with one input that contains no 0xFF byte
// and one that does.
static void bm_has_ff_byte_loop(benchmark::State& state)
{
    // The inputs are kept in variables and passed through DoNotOptimize so the
    // compiler cannot treat them as compile-time constants and fold the calls
    // away; with literal arguments only the stores of pre-computed results
    // would be measured.
    unsigned int value_without_ff{0};
    unsigned int value_with_ff{0xFF};

    for (const auto _ : state)
    {
        benchmark::DoNotOptimize(value_without_ff);
        benchmark::DoNotOptimize(value_with_ff);
        benchmark::DoNotOptimize(has_ff_byte_loop(value_without_ff));
        benchmark::DoNotOptimize(has_ff_byte_loop(value_with_ff));
    }
}
BENCHMARK(bm_has_ff_byte_loop);

// Returns true when at least one byte of `value` equals 0xFF, using SSE2
// byte-wise comparison.
bool has_ff_byte_simd(const unsigned int value)
{
    // Broadcast the 32-bit value to all four lanes of the register.
    const __m128i xmm_value = _mm_set1_epi32(static_cast<int>(value));

    // Every byte of the reference must be 0xFF (-1 as a signed byte).
    // Broadcasting 0xFF as a 32-bit lane would yield the byte pattern
    // FF 00 00 00, which makes every zero byte in positions 1..3 of the
    // input compare equal and report a false positive (e.g. for value 0).
    const __m128i xmm_ff = _mm_set1_epi8(-1);

    // Compare each byte for equality with 0xFF.
    const __m128i comparison = _mm_cmpeq_epi8(xmm_value, xmm_ff);

    // Collect the per-byte results into a bit mask; any set bit means a match.
    // movemask is SSE2, so no SSE4.1 compiler flag is required.
    return _mm_movemask_epi8(comparison) != 0;
}
// Benchmarks has_ff_byte_simd with one input that contains no 0xFF byte
// and one that does.
static void bm_has_ff_byte_simd(benchmark::State& state)
{
    // The inputs are kept in variables and passed through DoNotOptimize so the
    // compiler cannot treat them as compile-time constants and fold the calls
    // away; with literal arguments only the stores of pre-computed results
    // would be measured.
    unsigned int value_without_ff{0};
    unsigned int value_with_ff{0xFF};

    for (const auto _ : state)
    {
        benchmark::DoNotOptimize(value_without_ff);
        benchmark::DoNotOptimize(value_with_ff);
        benchmark::DoNotOptimize(has_ff_byte_simd(value_without_ff));
        benchmark::DoNotOptimize(has_ff_byte_simd(value_with_ff));
    }
}
BENCHMARK(bm_has_ff_byte_simd);


// Searches [position, end_position) for the first 0xFF byte (the byte that
// starts a JPEG marker). Returns a pointer to it, or nullptr when the range
// contains no such byte.
const std::byte* find_jpeg_marker_start_byte(const std::byte* position, const std::byte* end_position) noexcept
{
    constexpr std::byte jpeg_marker_start_byte{0xFF};

    // memchr is used for the scan because it is optimized on some platforms to search faster.
    const auto byte_count{end_position - position};
    const void* const match{memchr(position, std::to_integer<int>(jpeg_marker_start_byte), byte_count)};

    return static_cast<const std::byte*>(match);
}
// Benchmarks find_jpeg_marker_start_byte over a large value-initialized
// (all zero) buffer, so every call scans the complete buffer without a match.
static void bm_find_jpeg_marker_start_byte(benchmark::State& state)
{
    constexpr size_t buffer_size{size_t{1024} * 1024 * 8 * 3};
    const std::vector<std::byte> buffer(buffer_size);

    for (const auto _ : state)
    {
        const std::byte* const first{buffer.data()};
        const std::byte* const last{buffer.data() + buffer.size()};
        benchmark::DoNotOptimize(find_jpeg_marker_start_byte(first, last));
    }
}
BENCHMARK(bm_find_jpeg_marker_start_byte);

// True when T is an unsigned integral type that occupies exactly BitCount / 8 bytes.
// A trait is used instead of plain uint64_t / uint32_t overloads because those
// do not work on macOS, where size_t is not the same type as uint64_t.
template<int BitCount, typename T>
constexpr bool is_uint_v = std::is_integral_v<T> && std::is_unsigned_v<T> && sizeof(T) == BitCount / 8;

// Reverses the byte order of an unsigned 16, 32 or 64 bit integer.
// The width is selected at compile time through is_uint_v; any other type is
// rejected by the static_assert in the final branch.
// When _MSC_VER is defined the _byteswap_* intrinsics are called, otherwise the
// swap is spelled out with shifts and masks.
// The return type is deduced, so it is whatever the selected branch yields.
template<typename T>
[[nodiscard]]
auto byte_swap(const T value) noexcept
{
if constexpr (is_uint_v<16, T>)
{
#ifdef _MSC_VER
return _byteswap_ushort(value);
#else
// Note: GCC and Clang will optimize this pattern to a built-in intrinsic.
// The cast truncates the promoted int result back to 16 bits.
return static_cast<uint16_t>(value << 8 | value >> 8);
#endif
}
else if constexpr (is_uint_v<32, T>)
{
#ifdef _MSC_VER
return _byteswap_ulong(value);
#else
// Note: GCC and Clang will optimize this pattern to a built-in intrinsic.
return value >> 24 | (value & 0x00FF0000) >> 8 | (value & 0x0000FF00) << 8 | value << 24;
#endif
}
else
{
// Only 64 bit unsigned integers may reach this branch.
static_assert(is_uint_v<64, T>);
#ifdef _MSC_VER
return _byteswap_uint64(value);
#else
// Note: GCC and Clang will optimize this pattern to a built-in intrinsic.
// Each term moves one source byte to its mirrored position.
return (value << 56) | ((value << 40) & 0x00FF'0000'0000'0000) | ((value << 24) & 0x0000'FF00'0000'0000) |
((value << 8) & 0x0000'00FF'0000'0000) | ((value >> 8) & 0x0000'0000'FF00'0000) |
((value >> 24) & 0x0000'0000'00FF'0000) | ((value >> 40) & 0x0000'0000'0000'FF00) | (value >> 56);
#endif
}
}



// Reads a value of type T from `buffer`, which does not have to be aligned for T.
// The bytes are copied in the byte order they have in memory.
template<typename T>
[[nodiscard]]
T read_unaligned(const void* buffer) noexcept
{
    // memcpy keeps the access well-defined for any alignment.
    // Note: MSVC, GCC and clang will replace this with a direct register read if the CPU architecture allows it
    // On x86, x64 and ARM64 this will just be 1 register load.
    T result;
    memcpy(&result, buffer, sizeof result);
    return result;
}

// Reads a T from `buffer` (no alignment requirement) and interprets the stored
// bytes as big endian.
// When LITTLE_ENDIAN_ARCHITECTURE is defined the raw value is byte swapped;
// otherwise it is returned as read.
// NOTE(review): the macro is not defined in the visible part of this file -
// confirm it is set for little endian builds, otherwise no swap takes place.
template<typename T>
[[nodiscard]]
T read_big_endian_unaligned(const void* buffer) noexcept
{
#ifdef LITTLE_ENDIAN_ARCHITECTURE
    return byte_swap(read_unaligned<T>(buffer));
#else
    return read_unaligned<T>(buffer);
#endif
}

// Reads [position, end_position) as consecutive big endian 32 bit words.
// Words for which has_ff_byte_simd reports a 0xFF byte increment the result;
// all other words are OR-ed into it. Returns the accumulated value.
// Only complete words are read: up to 3 trailing bytes are skipped.
uint32_t read_all_bytes_with_ff_check(const std::byte* position, const std::byte* end_position)
{
    constexpr auto word_size{static_cast<std::ptrdiff_t>(sizeof(uint32_t))};
    uint32_t result{};

    // Continue only while a full word is left. Testing position < end_position
    // would read past the end of the range when its length is not a multiple
    // of the word size.
    for (; end_position - position >= word_size; position += word_size)
    {
        if (const uint32_t value{read_big_endian_unaligned<uint32_t>(position)}; has_ff_byte_simd(value))
        {
            result++;
        }
        else
        {
            result |= value;
        }
    }

    return result;
}
// Benchmarks read_all_bytes_with_ff_check over a large value-initialized
// (all zero) buffer.
static void bm_read_all_bytes_with_ff_check(benchmark::State& state)
{
    constexpr size_t buffer_size{size_t{1024} * 1024 * 8 * 3};
    const std::vector<std::byte> buffer(buffer_size);

    for (const auto _ : state)
    {
        const std::byte* const first{buffer.data()};
        const std::byte* const last{buffer.data() + buffer.size()};
        benchmark::DoNotOptimize(read_all_bytes_with_ff_check(first, last));
    }
}
BENCHMARK(bm_read_all_bytes_with_ff_check);


// Returns true when at least one byte of the 64 bit `value` equals 0xFF,
// using SSE2 byte-wise comparison.
bool has_ff_byte_simd64(const uint64_t value)
{
    // Broadcast the 64-bit value to both lanes of the register.
    const __m128i xmm_value = _mm_set1_epi64x(static_cast<long long>(value));

    // Every byte of the reference must be 0xFF (-1 as a signed byte).
    // Broadcasting 0xFF as a 32-bit lane would yield the byte pattern
    // FF 00 00 00, which makes zero bytes of the input compare equal and
    // report a false positive (e.g. for value 0).
    const __m128i xmm_ff = _mm_set1_epi8(-1);

    // Compare each byte for equality with 0xFF.
    const __m128i comparison = _mm_cmpeq_epi8(xmm_value, xmm_ff);

    // Collect the per-byte results into a bit mask; any set bit means a match.
    // movemask is SSE2, so no SSE4.1 compiler flag is required.
    return _mm_movemask_epi8(comparison) != 0;
}

// Reads [position, end_position) as consecutive big endian 64 bit words.
// Words for which has_ff_byte_simd64 reports a 0xFF byte increment the result;
// all other words are OR-ed into it. Returns the accumulated value.
// Only complete words are read: up to 7 trailing bytes are skipped.
uint64_t read_all_bytes_with_ff_check64(const std::byte* position, const std::byte* end_position)
{
    constexpr auto word_size{static_cast<std::ptrdiff_t>(sizeof(uint64_t))};
    uint64_t result{};

    // Continue only while a full word is left. Testing position < end_position
    // would read past the end of the range when its length is not a multiple
    // of the word size.
    for (; end_position - position >= word_size; position += word_size)
    {
        if (const uint64_t value{read_big_endian_unaligned<uint64_t>(position)}; has_ff_byte_simd64(value))
        {
            result++;
        }
        else
        {
            result |= value;
        }
    }

    return result;
}
// Benchmarks read_all_bytes_with_ff_check64 over a large value-initialized
// (all zero) buffer.
static void bm_read_all_bytes_with_ff_check64(benchmark::State& state)
{
    constexpr size_t buffer_size{size_t{1024} * 1024 * 8 * 3};
    const std::vector<std::byte> buffer(buffer_size);

    for (const auto _ : state)
    {
        const std::byte* const first{buffer.data()};
        const std::byte* const last{buffer.data() + buffer.size()};
        benchmark::DoNotOptimize(read_all_bytes_with_ff_check64(first, last));
    }
}
BENCHMARK(bm_read_all_bytes_with_ff_check64);


// Reads [position, end_position) as consecutive big endian 32 bit words and
// returns the bitwise OR of all of them.
// Only complete words are read: up to 3 trailing bytes are skipped.
uint32_t read_all_bytes_no_check(const std::byte* position, const std::byte* end_position)
{
    constexpr auto word_size{static_cast<std::ptrdiff_t>(sizeof(uint32_t))};
    uint32_t result{};

    // Continue only while a full word is left. Testing position < end_position
    // would read past the end of the range when its length is not a multiple
    // of the word size.
    for (; end_position - position >= word_size; position += word_size)
    {
        result |= read_big_endian_unaligned<uint32_t>(position);
    }

    return result;
}
// Benchmarks read_all_bytes_no_check over a large value-initialized
// (all zero) buffer.
static void bm_read_all_bytes_no_check(benchmark::State& state)
{
    constexpr size_t buffer_size{size_t{1024} * 1024 * 8 * 3};
    const std::vector<std::byte> buffer(buffer_size);

    for (const auto _ : state)
    {
        const std::byte* const first{buffer.data()};
        const std::byte* const last{buffer.data() + buffer.size()};
        benchmark::DoNotOptimize(read_all_bytes_no_check(first, last));
    }
}
BENCHMARK(bm_read_all_bytes_no_check);

// Reads [position, end_position) as consecutive big endian 64 bit words and
// returns the bitwise OR of all of them.
// Only complete words are read: up to 7 trailing bytes are skipped.
uint64_t read_all_bytes_no_check64(const std::byte* position, const std::byte* end_position)
{
    constexpr auto word_size{static_cast<std::ptrdiff_t>(sizeof(uint64_t))};
    uint64_t result{};

    // Continue only while a full word is left. Testing position < end_position
    // would read past the end of the range when its length is not a multiple
    // of the word size.
    for (; end_position - position >= word_size; position += word_size)
    {
        result |= read_big_endian_unaligned<uint64_t>(position);
    }

    return result;
}
// Benchmarks read_all_bytes_no_check64 over a large value-initialized
// (all zero) buffer.
static void bm_read_all_bytes_no_check64(benchmark::State& state)
{
    constexpr size_t buffer_size{size_t{1024} * 1024 * 8 * 3};
    const std::vector<std::byte> buffer(buffer_size);

    for (const auto _ : state)
    {
        const std::byte* const first{buffer.data()};
        const std::byte* const last{buffer.data() + buffer.size()};
        benchmark::DoNotOptimize(read_all_bytes_no_check64(first, last));
    }
}
BENCHMARK(bm_read_all_bytes_no_check64);




// Google Benchmark macro that supplies the program entry point and runs the benchmarks registered with BENCHMARK above.
BENCHMARK_MAIN();
12 changes: 0 additions & 12 deletions benchmark/benchmark.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -96,25 +96,13 @@
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<LinkIncremental>true</LinkIncremental>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Checked|Win32'">
<LinkIncremental>true</LinkIncremental>
<EnableClangTidyCodeAnalysis>false</EnableClangTidyCodeAnalysis>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>true</LinkIncremental>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|ARM64'">
<LinkIncremental>true</LinkIncremental>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Checked|x64'">
<LinkIncremental>true</LinkIncremental>
<EnableClangTidyCodeAnalysis>false</EnableClangTidyCodeAnalysis>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Checked|ARM64'">
<LinkIncremental>true</LinkIncremental>
<EnableClangTidyCodeAnalysis>false</EnableClangTidyCodeAnalysis>
</PropertyGroup>
<ItemDefinitionGroup Label="Global">
Expand Down
Loading

0 comments on commit 1ba2294

Please sign in to comment.