Skip to content

Commit

Permalink
Fixed: Regression in Hash Quality, Workaround .NET 8 Strangeness & Re…
Browse files Browse the repository at this point in the history
…vert to using 64-bit FNV1
  • Loading branch information
Sewer56 committed Mar 12, 2024
1 parent 91f0dd9 commit c7f9f88
Show file tree
Hide file tree
Showing 2 changed files with 102 additions and 67 deletions.
82 changes: 55 additions & 27 deletions src/Reloaded.Memory/Internals/Algorithms/UnstableStringHash.cs
Original file line number Diff line number Diff line change
Expand Up @@ -45,14 +45,17 @@ internal static unsafe nuint GetHashCodeUnstable(this ReadOnlySpan<char> text)
return text.UnstableHashVec256();

// Over 4 Vec128 regs (16 * 4 = 64 bytes)
if (Vector256.IsHardwareAccelerated && length >= (sizeof(Vector128<ulong>) / sizeof(char)) * 4)
if (Vector128.IsHardwareAccelerated && length >= (sizeof(Vector128<ulong>) / sizeof(char)) * 4)
return text.UnstableHashVec128();
#endif

return text.UnstableHashNonVector();
}

#if NET7_0_OR_GREATER
#if NET8_0 // Bug in .NET 8 seems to cause this to not re-jit to tier1 till like 200k calls on Linux x64
[MethodImpl(MethodImplOptions.AggressiveOptimization)]
#endif
internal static unsafe UIntPtr UnstableHashVec128(this ReadOnlySpan<char> text)
{
fixed (char* src = &text.GetPinnableReference())
Expand All @@ -62,32 +65,32 @@ internal static unsafe UIntPtr UnstableHashVec128(this ReadOnlySpan<char> text)
var hash2 = hash1;
var ptr = (nuint*)(src);

var prime = Vector128.Create((uint)0x01000193);
var hash1_128 = Vector128.Create(0x811c9dc5);
var hash2_128 = Vector128.Create(0x811c9dc5);
var prime = Vector128.Create((ulong)0x100000001b3);
var hash1_128 = Vector128.Create(0xcbf29ce484222325);
var hash2_128 = Vector128.Create(0xcbf29ce484222325);

while (length >= sizeof(Vector128<ulong>) / sizeof(char) * 4) // 64 byte chunks.
{
length -= (sizeof(Vector128<ulong>) / sizeof(char)) * 4;
hash1_128 = Vector128.Xor(hash1_128, Vector128.Load((ulong*)ptr).AsUInt32());
hash1_128 = Vector128.Multiply(hash1_128, prime);
hash1_128 = Vector128.Xor(hash1_128, Vector128.Load((ulong*)ptr));
hash1_128 = HashMultiply128(hash1_128, prime);

hash2_128 = Vector128.Xor(hash2_128, Vector128.Load((ulong*)ptr + 2).AsUInt32());
hash2_128 = Vector128.Multiply(hash2_128, prime);
hash2_128 = Vector128.Xor(hash2_128, Vector128.Load((ulong*)ptr + 2));
hash2_128 = HashMultiply128(hash2_128, prime);

hash1_128 = Vector128.Xor(hash1_128, Vector128.Load((ulong*)ptr + 4).AsUInt32());
hash1_128 = Vector128.Multiply(hash1_128, prime);
hash1_128 = Vector128.Xor(hash1_128, Vector128.Load((ulong*)ptr + 4));
hash1_128 = HashMultiply128(hash1_128, prime);

hash2_128 = Vector128.Xor(hash2_128, Vector128.Load((ulong*)ptr + 6).AsUInt32());
hash2_128 = Vector128.Multiply(hash2_128, prime);
hash2_128 = Vector128.Xor(hash2_128, Vector128.Load((ulong*)ptr + 6));
hash2_128 = HashMultiply128(hash2_128, prime);
ptr += (sizeof(Vector128<ulong>) / sizeof(nuint)) * 4;
}

while (length >= sizeof(Vector128<ulong>) / sizeof(char)) // 16 byte chunks.
{
length -= sizeof(Vector128<ulong>) / sizeof(char);
hash1_128 = Vector128.Xor(hash1_128, Vector128.Load((ulong*)ptr).AsUInt32());
hash1_128 = Vector128.Multiply(hash1_128, prime);
hash1_128 = Vector128.Xor(hash1_128, Vector128.Load((ulong*)ptr));
hash1_128 = HashMultiply128(hash1_128, prime);
ptr += (sizeof(Vector128<ulong>) / sizeof(nuint));
}

Expand Down Expand Up @@ -117,6 +120,9 @@ internal static unsafe UIntPtr UnstableHashVec128(this ReadOnlySpan<char> text)
}
}

#if NET8_0 // Bug in .NET 8 seems to cause this to not re-jit to tier1 till like 200k calls on Linux x64
[MethodImpl(MethodImplOptions.AggressiveOptimization)]
#endif
internal static unsafe UIntPtr UnstableHashVec256(this ReadOnlySpan<char> text)
{
fixed (char* src = &text.GetPinnableReference())
Expand All @@ -126,32 +132,32 @@ internal static unsafe UIntPtr UnstableHashVec256(this ReadOnlySpan<char> text)
var hash2 = hash1;
var ptr = (nuint*)(src);

var prime = Vector256.Create((uint)0x01000193);
var hash1_256 = Vector256.Create(0x811c9dc5);
var hash2_256 = Vector256.Create(0x811c9dc5);
var prime = Vector256.Create((ulong)0x100000001b3);
var hash1_256 = Vector256.Create(0xcbf29ce484222325);
var hash2_256 = Vector256.Create(0xcbf29ce484222325);

while (length >= sizeof(Vector256<ulong>) / sizeof(char) * 4) // 128 byte chunks.
{
length -= (sizeof(Vector256<ulong>) / sizeof(char)) * 4;
hash1_256 = Vector256.Xor(hash1_256, Vector256.Load((ulong*)ptr).AsUInt32());
hash1_256 = Vector256.Multiply(hash1_256, prime).AsUInt32();
hash1_256 = Vector256.Xor(hash1_256, Vector256.Load((ulong*)ptr));
hash1_256 = HashMultiply256(hash1_256, prime);

hash2_256 = Vector256.Xor(hash2_256, Vector256.Load((ulong*)ptr + 4).AsUInt32());
hash2_256 = Vector256.Multiply(hash2_256, prime).AsUInt32();
hash2_256 = Vector256.Xor(hash2_256, Vector256.Load((ulong*)ptr + 4));
hash2_256 = HashMultiply256(hash2_256, prime);

hash1_256 = Vector256.Xor(hash1_256, Vector256.Load((ulong*)ptr + 8).AsUInt32());
hash1_256 = Vector256.Multiply(hash1_256, prime).AsUInt32();
hash1_256 = Vector256.Xor(hash1_256, Vector256.Load((ulong*)ptr + 8));
hash1_256 = HashMultiply256(hash1_256, prime);

hash2_256 = Vector256.Xor(hash2_256, Vector256.Load((ulong*)ptr + 12).AsUInt32());
hash2_256 = Vector256.Multiply(hash2_256, prime).AsUInt32();
hash2_256 = Vector256.Xor(hash2_256, Vector256.Load((ulong*)ptr + 12));
hash2_256 = HashMultiply256(hash2_256, prime);
ptr += (sizeof(Vector256<ulong>) / sizeof(nuint)) * 4;
}

while (length >= sizeof(Vector256<ulong>) / sizeof(char)) // 32 byte chunks.
{
length -= sizeof(Vector256<ulong>) / sizeof(char);
hash1_256 = Vector256.Xor(hash1_256, Vector256.Load((ulong*)ptr).AsUInt32());
hash1_256 = Vector256.Multiply(hash1_256, prime).AsUInt32();
hash1_256 = Vector256.Xor(hash1_256, Vector256.Load((ulong*)ptr));
hash1_256 = HashMultiply256(hash1_256, prime);
ptr += (sizeof(Vector256<ulong>) / sizeof(nuint));
}

Expand Down Expand Up @@ -184,6 +190,28 @@ internal static unsafe UIntPtr UnstableHashVec256(this ReadOnlySpan<char> text)
return hash1 + (hash2 * 1566083941);
}
}

/// <summary>
///     Multiply step used by the 128-bit vectorised hash loops.
/// </summary>
/// <param name="a">Current hash state, held as 64-bit lanes.</param>
/// <param name="b">Multiplier (the hash prime), held as 64-bit lanes.</param>
/// <returns>The product vector, viewed as 64-bit lanes.</returns>
/// <remarks>
///     Both operands are reinterpreted as 32-bit lanes before multiplying.
///     When SSE2 is available the dedicated <see cref="Sse2.Multiply(Vector128{uint}, Vector128{uint})"/>
///     intrinsic is used (the 128-bit counterpart of the VPMULUDQ path preferred on AVX2);
///     otherwise the portable 32-bit lane multiply is used, which isn't as good for
///     hashing, but still not terrible.
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static Vector128<ulong> HashMultiply128(Vector128<ulong> a, Vector128<ulong> b)
{
    Vector128<uint> state32 = a.AsUInt32();
    Vector128<uint> prime32 = b.AsUInt32();

    // The SSE2 intrinsic already yields 64-bit lanes; only the portable
    // fallback needs reinterpreting back to ulong.
    return Sse2.IsSupported
        ? Sse2.Multiply(state32, prime32)
        : Vector128.Multiply(state32, prime32).AsUInt64();
}

/// <summary>
///     Multiply step used by the 256-bit vectorised hash loops.
/// </summary>
/// <param name="a">Current hash state, held as 64-bit lanes.</param>
/// <param name="b">Multiplier (the hash prime), held as 64-bit lanes.</param>
/// <returns>The product vector, viewed as 64-bit lanes.</returns>
/// <remarks>
///     On AVX2 we want VPMULUDQ, which is what
///     <see cref="Avx2.Multiply(Vector256{uint}, Vector256{uint})"/> is used for here.
///     The generic <see cref="Vector256"/> multiply can't produce that instruction,
///     so without AVX2 we fall back to multiplying 32-bit lanes, which isn't as good,
///     but still not terrible.
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static Vector256<ulong> HashMultiply256(Vector256<ulong> a, Vector256<ulong> b)
{
    Vector256<uint> state32 = a.AsUInt32();
    Vector256<uint> prime32 = b.AsUInt32();

    // The AVX2 intrinsic already yields 64-bit lanes; only the portable
    // fallback needs reinterpreting back to ulong.
    return Avx2.IsSupported
        ? Avx2.Multiply(state32, prime32)
        : Vector256.Multiply(state32, prime32).AsUInt64();
}
#endif

internal static unsafe UIntPtr UnstableHashNonVector(this ReadOnlySpan<char> text)
Expand Down
87 changes: 47 additions & 40 deletions src/Reloaded.Memory/Internals/Algorithms/UnstableStringHashLower.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
using Reloaded.Memory.Utilities;
using static Reloaded.Memory.Internals.Backports.System.Text.Unicode.Utf16Utility;
#if NET7_0_OR_GREATER
using static Reloaded.Memory.Internals.Algorithms.UnstableStringHash;
using Reloaded.Memory.Extensions;
using Reloaded.Memory.Internals.Backports.System.Globalization;
using System.Numerics;
Expand Down Expand Up @@ -48,7 +49,7 @@ internal static unsafe nuint GetHashCodeUnstableLower(this ReadOnlySpan<char> te
return text.UnstableHashVec256Lower();

// Over 4 Vec128 regs (16 * 4 = 64 bytes)
if (Vector256.IsHardwareAccelerated && length >= (sizeof(Vector128<ulong>) / sizeof(char)) * 4)
if (Vector128.IsHardwareAccelerated && length >= (sizeof(Vector128<ulong>) / sizeof(char)) * 4)
return text.UnstableHashVec128Lower();
#endif

Expand Down Expand Up @@ -104,6 +105,9 @@ internal static unsafe nuint GetHashCodeUnstableLowerSlow(this ReadOnlySpan<char
}

#if NET7_0_OR_GREATER
#if NET8_0 // Bug in .NET 8 seems to cause this to not re-jit to tier1 till like 200k calls on Linux x64
[MethodImpl(MethodImplOptions.AggressiveOptimization)]
#endif
internal static unsafe UIntPtr UnstableHashVec128Lower(this ReadOnlySpan<char> text)
{
fixed (char* src = &text.GetPinnableReference())
Expand All @@ -113,59 +117,59 @@ internal static unsafe UIntPtr UnstableHashVec128Lower(this ReadOnlySpan<char> t
var hash2 = hash1;
var ptr = (nuint*)(src);

var prime = Vector128.Create((uint)0x01000193);
var hash1_128 = Vector128.Create(0x811c9dc5);
var hash2_128 = Vector128.Create(0x811c9dc5);
var prime = Vector128.Create((ulong)0x100000001b3);
var hash1_128 = Vector128.Create(0xcbf29ce484222325);
var hash2_128 = Vector128.Create(0xcbf29ce484222325);

// We "normalize to lowercase" every char by ORing with 0x0020. This casts
// a very wide net because it will change, e.g., '^' to '~'. But that should
// be ok because we expect this to be very rare in practice.
var toLower = Vector128.Create<uint>(0x0020_0020);
var toLower = Vector128.Create<short>(0x0020).AsUInt64();

while (length >= sizeof(Vector128<ulong>) / sizeof(char) * 4) // 64 byte chunks.
{
length -= (sizeof(Vector128<ulong>) / sizeof(char)) * 4;

var v0 = Vector128.Load((ulong*)ptr).AsUInt32();
if (!AllCharsInVector128AreAscii(v0))
var v0 = Vector128.Load((ulong*)ptr);
if (!AllCharsInVector128AreAscii(v0.AsUInt16()))
goto NotAscii;

hash1_128 = Vector128.Xor(hash1_128, Vector128.BitwiseOr(v0, toLower));
hash1_128 = Vector128.Multiply(hash1_128, prime);
hash1_128 = HashMultiply128(hash1_128, prime);

v0 = Vector128.Load((ulong*)ptr + 2).AsUInt32();
if (!AllCharsInVector128AreAscii(v0))
v0 = Vector128.Load((ulong*)ptr + 2);
if (!AllCharsInVector128AreAscii(v0.AsUInt16()))
goto NotAscii;

hash2_128 = Vector128.Xor(hash2_128, Vector128.BitwiseOr(v0, toLower));
hash2_128 = Vector128.Multiply(hash2_128, prime);
hash2_128 = HashMultiply128(hash2_128, prime);

v0 = Vector128.Load((ulong*)ptr + 4).AsUInt32();
if (!AllCharsInVector128AreAscii(v0))
v0 = Vector128.Load((ulong*)ptr + 4);
if (!AllCharsInVector128AreAscii(v0.AsUInt16()))
goto NotAscii;

hash1_128 = Vector128.Xor(hash1_128, Vector128.BitwiseOr(v0, toLower));
hash1_128 = Vector128.Multiply(hash1_128, prime);
hash1_128 = HashMultiply128(hash1_128, prime);

v0 = Vector128.Load((ulong*)ptr + 6).AsUInt32();
if (!AllCharsInVector128AreAscii(v0))
v0 = Vector128.Load((ulong*)ptr + 6);
if (!AllCharsInVector128AreAscii(v0.AsUInt16()))
goto NotAscii;

hash2_128 = Vector128.Xor(hash2_128, Vector128.BitwiseOr(v0, toLower));
hash2_128 = Vector128.Multiply(hash2_128, prime);
hash2_128 = HashMultiply128(hash2_128, prime);
ptr += (sizeof(Vector128<ulong>) / sizeof(nuint)) * 4;
}

while (length >= sizeof(Vector128<ulong>) / sizeof(char)) // 16 byte chunks.
{
length -= sizeof(Vector128<ulong>) / sizeof(char);

var v0 = Vector128.Load((ulong*)ptr).AsUInt32();
if (!AllCharsInVector128AreAscii(v0))
var v0 = Vector128.Load((ulong*)ptr);
if (!AllCharsInVector128AreAscii(v0.AsUInt16()))
goto NotAscii;

hash1_128 = Vector128.Xor(hash1_128, Vector128.BitwiseOr(v0, toLower));
hash1_128 = Vector128.Multiply(hash1_128, prime);
hash1_128 = HashMultiply128(hash1_128, prime);
ptr += (sizeof(Vector128<ulong>) / sizeof(nuint));
}

Expand Down Expand Up @@ -224,6 +228,9 @@ internal static unsafe UIntPtr UnstableHashVec128Lower(this ReadOnlySpan<char> t
return GetHashCodeUnstableLowerSlow(text);
}

#if NET8_0 // Bug in .NET 8 seems to cause this to not re-jit to tier1 till like 200k calls on Linux x64
[MethodImpl(MethodImplOptions.AggressiveOptimization)]
#endif
internal static unsafe UIntPtr UnstableHashVec256Lower(this ReadOnlySpan<char> text)
{
fixed (char* src = &text.GetPinnableReference())
Expand All @@ -233,59 +240,59 @@ internal static unsafe UIntPtr UnstableHashVec256Lower(this ReadOnlySpan<char> t
var hash2 = hash1;
var ptr = (nuint*)(src);

var prime = Vector256.Create((uint)0x01000193);
var hash1_256 = Vector256.Create(0x811c9dc5);
var hash2_256 = Vector256.Create(0x811c9dc5);
var prime = Vector256.Create((ulong)0x100000001b3);
var hash1_256 = Vector256.Create(0xcbf29ce484222325);
var hash2_256 = Vector256.Create(0xcbf29ce484222325);

// We "normalize to lowercase" every char by ORing with 0x0020. This casts
// a very wide net because it will change, e.g., '^' to '~'. But that should
// be ok because we expect this to be very rare in practice.
var toLower = Vector256.Create<uint>(0x0020_0020);
var toLower = Vector256.Create<short>(0x0020).AsUInt64();

while (length >= sizeof(Vector256<ulong>) / sizeof(char) * 4) // 128 byte chunks.
{
length -= (sizeof(Vector256<ulong>) / sizeof(char)) * 4;

var v0 = Vector256.Load((ulong*)ptr).AsUInt32();
if (!AllCharsInVector256AreAscii(v0))
var v0 = Vector256.Load((ulong*)ptr);
if (!AllCharsInVector256AreAscii(v0.AsUInt16()))
goto NotAscii;

hash1_256 = Vector256.Xor(hash1_256, Vector256.BitwiseOr(v0, toLower));
hash1_256 = Vector256.Multiply(hash1_256.AsUInt32(), prime.AsUInt32());
hash1_256 = HashMultiply256(hash1_256, prime);

v0 = Vector256.Load((ulong*)ptr + 4).AsUInt32();
if (!AllCharsInVector256AreAscii(v0))
v0 = Vector256.Load((ulong*)ptr + 4);
if (!AllCharsInVector256AreAscii(v0.AsUInt16()))
goto NotAscii;

hash2_256 = Vector256.Xor(hash2_256, Vector256.BitwiseOr(v0, toLower));
hash2_256 = Vector256.Multiply(hash2_256.AsUInt32(), prime.AsUInt32());
hash2_256 = HashMultiply256(hash2_256, prime);

v0 = Vector256.Load((ulong*)ptr + 8).AsUInt32();
if (!AllCharsInVector256AreAscii(v0))
v0 = Vector256.Load((ulong*)ptr + 8);
if (!AllCharsInVector256AreAscii(v0.AsUInt16()))
goto NotAscii;

hash1_256 = Vector256.Xor(hash1_256, Vector256.BitwiseOr(v0, toLower));
hash1_256 = Vector256.Multiply(hash1_256.AsUInt32(), prime.AsUInt32());
hash1_256 = HashMultiply256(hash1_256, prime);

v0 = Vector256.Load((ulong*)ptr + 12).AsUInt32();
if (!AllCharsInVector256AreAscii(v0))
v0 = Vector256.Load((ulong*)ptr + 12);
if (!AllCharsInVector256AreAscii(v0.AsUInt16()))
goto NotAscii;

hash2_256 = Vector256.Xor(hash2_256, Vector256.BitwiseOr(v0, toLower));
hash2_256 = Vector256.Multiply(hash2_256.AsUInt32(), prime.AsUInt32());
hash2_256 = HashMultiply256(hash2_256, prime);
ptr += (sizeof(Vector256<ulong>) / sizeof(nuint)) * 4;
}

while (length >= sizeof(Vector256<ulong>) / sizeof(char)) // 32 byte chunks.
{
length -= sizeof(Vector256<ulong>) / sizeof(char);

var v0 = Vector256.Load((ulong*)ptr).AsUInt32();
if (!AllCharsInVector256AreAscii(v0))
var v0 = Vector256.Load((ulong*)ptr);
if (!AllCharsInVector256AreAscii(v0.AsUInt16()))
goto NotAscii;

hash1_256 = Vector256.Xor(hash1_256, Vector256.BitwiseOr(v0, toLower));
hash1_256 = Vector256.Multiply(hash1_256.AsUInt32(), prime.AsUInt32());
hash1_256 = HashMultiply256(hash1_256, prime);
ptr += (sizeof(Vector256<ulong>) / sizeof(nuint));
}

Expand Down Expand Up @@ -347,7 +354,7 @@ internal static unsafe UIntPtr UnstableHashVec256Lower(this ReadOnlySpan<char> t
NotAscii:
return GetHashCodeUnstableLowerSlow(text);
}
#endif
#endif

internal static unsafe nuint UnstableHashNonVectorLower(this ReadOnlySpan<char> text)
{
Expand Down

0 comments on commit c7f9f88

Please sign in to comment.