Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixes compilation w/ CMake & naive SIMD implementation on macOS/arm64 #2

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
build/
135 changes: 77 additions & 58 deletions PLA-SeedFinder/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,31 +16,31 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
#set(CMAKE_VERBOSE_MAKEFILE ON)

set(CMAKE_AUTOMOC ON)
set(CMAKE_AUTORCC ON)
set(CMAKE_AUTOUIC ON)

add_custom_target(build-time-make-directory ALL
COMMAND ${CMAKE_COMMAND} -E make_directory Assembly/)

#Find threads library
set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads REQUIRED)

# Detect whether the target CPU is an x86-family processor.
# X86 gates the x86 kernel source list, the -msse4.2 / -march options and the
# per-file dispatch flags later in this file.
# Besides "x86", "x86_64" (matched through the "x86" alternative) and "amd64",
# 32-bit hosts commonly report i386/i486/i586/i686, so those are accepted too;
# otherwise they would be routed to the non-x86 source list.
if (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)|(i[3-6]86)")
    set (X86 TRUE)
else ()
    set (X86 FALSE)
endif ()

#add current directory to find tesseractPA.lib
#link_directories(${CMAKE_CURRENT_LIST_DIR})

file(GLOB MAIN_SOURCES
if (X86)
file(GLOB MAIN_SOURCES
Source/Compiler.h
Source/CpuId.cpp
Source/CpuFeatures.cpp
Source/CpuFeatures.h
Source/DynamicParallelizer.cpp
Source/DynamicParallelizer.h
Source/Kernels/SeedScan_Default.cpp
Source/Kernels/SeedScan_x64_AVX2.cpp
Source/Kernels/SeedScan_x64_AVX512.cpp
Source/Kernels/SeedScan_x64_SSE41.cpp
Source/Kernels/SeedScan_x86_AVX2.cpp
Source/Kernels/SeedScan_x86_AVX512.cpp
Source/Kernels/SeedScan_x86_SSE41.cpp
Source/Kernels/XoroShiro1_Default.h
Source/Kernels/XoroShiro2_SSE2.h
Source/Kernels/XoroShiro4_AVX2.h
Expand All @@ -54,8 +54,25 @@ file(GLOB MAIN_SOURCES
Source/SeedScan.h
Source/Tools.cpp
Source/Tools.h
)
add_executable(PLA-SeedFinder WIN32 ${MAIN_SOURCES})
)
else ()
file(GLOB MAIN_SOURCES
Source/DynamicParallelizer.cpp
Source/Kernels/SeedScan_Default.cpp
Source/Kernels/SeedScan_aarch64.cpp
Source/Main.cpp
Source/PLA-SeedFinder.cpp
Source/ReportCandidates.cpp
Source/SeedScan.cpp
Source/Tools.cpp
)
endif (X86)

# Create the executable once, then mark it as a WIN32 executable only when
# targeting Windows (same effect as passing the WIN32 keyword to
# add_executable on that platform).
add_executable(PLA-SeedFinder ${MAIN_SOURCES})
if (WIN32)
    set_target_properties(PLA-SeedFinder PROPERTIES WIN32_EXECUTABLE TRUE)
endif ()
set_target_properties(PLA-SeedFinder PROPERTIES LINKER_LANGUAGE CXX)
target_link_libraries(PLA-SeedFinder)
target_link_libraries(PLA-SeedFinder Threads::Threads)
Expand All @@ -68,6 +85,9 @@ target_include_directories(PLA-SeedFinder PRIVATE Source/)

#enable MP with MSVC (Build with Multiple Processes)
if (MSVC)
add_custom_target(build-time-make-directory ALL
COMMAND ${CMAKE_COMMAND} -E make_directory Assembly/)

target_compile_options(PLA-SeedFinder PRIVATE /FAs /FaAssembly/ /MP /W4)
target_compile_options(PLA-SeedFinder PRIVATE /wd5054) # Deprecated enum arithmetic
target_compile_options(PLA-SeedFinder PRIVATE /wd4505) # unreferenced local function has been removed
Expand All @@ -82,50 +102,49 @@ if (MSVC)
target_compile_definitions(PLA-SeedFinder PRIVATE PA_AutoDispatch_17_Skylake)

else()
target_compile_options(PLA-SeedFinder PRIVATE -msse4.2)

target_compile_options(PLA-SeedFinder PRIVATE -Wall -Wpedantic -DPA_STATIC)

set(ARCH_FLAGS_09_Nehalem -march=nehalem)
set(ARCH_FLAGS_13_Haswell -march=haswell)
set(ARCH_FLAGS_17_Skylake -march=skylake-avx512)

# Run-time ISA dispatching
target_compile_definitions(PLA-SeedFinder PRIVATE PA_AutoDispatch_08_Nehalem)
target_compile_definitions(PLA-SeedFinder PRIVATE PA_AutoDispatch_13_Haswell)
target_compile_definitions(PLA-SeedFinder PRIVATE PA_AutoDispatch_17_Skylake)
endif()



# Run-time CPU dispatching.
if (ARCH_FLAGS_09_Nehalem)
SET_SOURCE_FILES_PROPERTIES(
Source/Kernels/SeedScan_x64_SSE41.cpp
PROPERTIES COMPILE_FLAGS ${ARCH_FLAGS_09_Nehalem}
)
endif()
if (ARCH_FLAGS_13_Haswell)
SET_SOURCE_FILES_PROPERTIES(
Source/Kernels/SeedScan_x64_AVX2.cpp
PROPERTIES COMPILE_FLAGS ${ARCH_FLAGS_13_Haswell}
)
target_compile_options(PLA-SeedFinder PRIVATE -Wall -Wpedantic -O2)
if (X86)
target_compile_options(PLA-SeedFinder PRIVATE -msse4.2)

set(ARCH_FLAGS_09_Nehalem -march=nehalem)
set(ARCH_FLAGS_13_Haswell -march=haswell)
set(ARCH_FLAGS_17_Skylake -march=skylake-avx512)

# Run-time ISA dispatching
target_compile_definitions(PLA-SeedFinder PRIVATE PA_AutoDispatch_08_Nehalem)
target_compile_definitions(PLA-SeedFinder PRIVATE PA_AutoDispatch_13_Haswell)
target_compile_definitions(PLA-SeedFinder PRIVATE PA_AutoDispatch_17_Skylake)
endif (X86)
endif()
if (ARCH_FLAGS_17_Skylake)
SET_SOURCE_FILES_PROPERTIES(
Source/Kernels/SeedScan_x64_AVX512.cpp
PROPERTIES COMPILE_FLAGS ${ARCH_FLAGS_17_Skylake}
)
endif()






#copy needed dlls
#file(COPY *.dll DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
file(GLOB MY_DLLS
"*.dll"
)
file(COPY ${MY_DLLS} DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
# Run-time CPU dispatching (x86 only).
# Each ISA-specific kernel translation unit is compiled with its own
# architecture flags, but only when the matching ARCH_FLAGS_* variable is set.
if (X86)
    if (ARCH_FLAGS_09_Nehalem)
        set_source_files_properties(
            Source/Kernels/SeedScan_x86_SSE41.cpp
            PROPERTIES COMPILE_FLAGS ${ARCH_FLAGS_09_Nehalem}
        )
    endif ()

    if (ARCH_FLAGS_13_Haswell)
        set_source_files_properties(
            Source/Kernels/SeedScan_x86_AVX2.cpp
            PROPERTIES COMPILE_FLAGS ${ARCH_FLAGS_13_Haswell}
        )
    endif ()

    if (ARCH_FLAGS_17_Skylake)
        set_source_files_properties(
            Source/Kernels/SeedScan_x86_AVX512.cpp
            PROPERTIES COMPILE_FLAGS ${ARCH_FLAGS_17_Skylake}
        )
    endif ()
endif ()

# Windows only: copy every file matched by the "*.dll" glob into
# CMAKE_CURRENT_BINARY_DIR at configure time.
if (WIN32)
    file(GLOB MY_DLLS "*.dll")
    file(COPY ${MY_DLLS} DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
endif ()
119 changes: 119 additions & 0 deletions PLA-SeedFinder/Source/Kernels/SeedScan_aarch64.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
#include <stdint.h>
#include <cstddef>
#include "XoroShiro_aarch64.h"

namespace PokemonAutomation{

#if defined __aarch64__ && defined __APPLE__

bool seed_scan_common_unroll4_NEON(size_t rolls, uint32_t desired_pid, uint64_t start_seed, uint64_t iterations){
simd_ulong4 t = simd_make_ulong4((uint64_t)desired_pid, (uint64_t)desired_pid, (uint64_t)desired_pid, (uint64_t)desired_pid);
simd_ulong4 seed = simd_make_ulong4(start_seed, start_seed+0x200000000, start_seed+0x300000000, start_seed+0x400000000);
simd_ulong4 delta = simd_make_ulong4(0x500000000, 0x500000000, 0x500000000, 0x500000000);
simd_ulong4 pid;
iterations /= 4;

do {
XoroShiro4 rng(seed);
rng.next();
size_t lc = rolls;
do {
rng.next();
pid = rng.get_masked();
if (simd_reduce_min(pid ^ t) == 0) {
return true;
}
} while (--lc);
seed += delta;
} while (--iterations);
return false;
}

bool seed_scan_thorough_unroll4_NEON(size_t rolls, uint32_t desired_pid, uint64_t start_seed, uint64_t iterations){
desired_pid &= 0xefffffff;
simd_ulong4 t = simd_make_ulong4((uint64_t)desired_pid, (uint64_t)desired_pid, (uint64_t)desired_pid, (uint64_t)desired_pid);
simd_ulong4 seed = simd_make_ulong4(start_seed, start_seed+0x200000000, start_seed+0x300000000, start_seed+0x400000000);
simd_ulong4 delta = simd_make_ulong4(0x500000000, 0x500000000, 0x500000000, 0x500000000);
simd_ulong4 mask = simd_make_ulong4(0xefffffff, 0xefffffff, 0xefffffff, 0xefffffff);
simd_ulong4 pid;
iterations /= 4;

do {
XoroShiro4 rng(seed);
size_t lc = rolls + 1;
do {
rng.next();
pid = rng.get_masked();
if (simd_reduce_max(pid) == UINT32_MAX) {
return true;
}
pid &= mask;
if (simd_reduce_min(pid ^ t) == 0) {
return true;
}
} while (--lc);
seed += delta;
} while (--iterations);
return false;
}

bool seed_scan_common_unroll8_NEON(size_t rolls, uint32_t desired_pid, uint64_t start_seed, uint64_t iterations){
simd_ulong8 t = simd_make_ulong8(simd_make_ulong4((uint64_t)desired_pid, (uint64_t)desired_pid, (uint64_t)desired_pid, (uint64_t)desired_pid),
simd_make_ulong4((uint64_t)desired_pid, (uint64_t)desired_pid, (uint64_t)desired_pid, (uint64_t)desired_pid));
simd_ulong8 seed = simd_make_ulong8(simd_make_ulong4(start_seed, start_seed+0x200000000, start_seed+0x300000000, start_seed+0x400000000),
simd_make_ulong4(start_seed+0x500000000, start_seed+0x600000000, start_seed+0x700000000, start_seed+0x800000000));
simd_ulong8 delta = simd_make_ulong8(simd_make_ulong4(0x900000000, 0x900000000, 0x900000000, 0x900000000),
simd_make_ulong4(0x900000000, 0x900000000, 0x900000000, 0x900000000));
simd_ulong8 pid;
iterations /= 8;

do {
XoroShiro8 rng(seed);
rng.next();
size_t lc = rolls;
do {
rng.next();
pid = rng.get_masked();
if (simd_reduce_min(pid ^ t) == 0) {
return true;
}
} while (--lc);
seed += delta;
} while (--iterations);
return false;
}

bool seed_scan_thorough_unroll8_NEON(size_t rolls, uint32_t desired_pid, uint64_t start_seed, uint64_t iterations){
desired_pid &= 0xefffffff;
simd_ulong8 t = simd_make_ulong8(simd_make_ulong4((uint64_t)desired_pid, (uint64_t)desired_pid, (uint64_t)desired_pid, (uint64_t)desired_pid),
simd_make_ulong4((uint64_t)desired_pid, (uint64_t)desired_pid, (uint64_t)desired_pid, (uint64_t)desired_pid));
simd_ulong8 seed = simd_make_ulong8(simd_make_ulong4(start_seed, start_seed+0x200000000, start_seed+0x300000000, start_seed+0x400000000),
simd_make_ulong4(start_seed+0x500000000, start_seed+0x600000000, start_seed+0x700000000, start_seed+0x800000000));
simd_ulong8 delta = simd_make_ulong8(simd_make_ulong4(0x900000000, 0x900000000, 0x900000000, 0x900000000),
simd_make_ulong4(0x900000000, 0x900000000, 0x900000000, 0x900000000));
simd_ulong8 u1 = simd_make_ulong8(simd_make_ulong4(0xefffffff, 0xefffffff, 0xefffffff, 0xefffffff),
simd_make_ulong4(0xefffffff, 0xefffffff, 0xefffffff, 0xefffffff));
simd_ulong8 pid;
iterations /= 8;

do {
XoroShiro8 rng(seed);
size_t lc = rolls + 1;
do {
rng.next();
pid = rng.get_masked();
if (simd_reduce_max(pid) == UINT32_MAX) {
return true;
}
pid &= u1;
if (simd_reduce_min(pid ^ t) == 0) {
return true;
}
} while (--lc);
seed += delta;
} while (--iterations);
return false;
}

#endif
}
6 changes: 1 addition & 5 deletions PLA-SeedFinder/Source/Kernels/XoroShiro1_Default.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,7 @@ namespace PokemonAutomation{

class XoroShiroX1_Default{
public:
PA_FORCE_INLINE XoroShiroX1_Default(uint64_t seed){
state[0] = seed;
state[1] = 0x82A2B175229D6A5B;
}

PA_FORCE_INLINE XoroShiroX1_Default(uint64_t seed) : state {seed, 0x82A2B175229D6A5B} {}
PA_FORCE_INLINE uint64_t get_int64(){
return state[0] + state[1];
}
Expand Down
61 changes: 61 additions & 0 deletions PLA-SeedFinder/Source/Kernels/XoroShiro_aarch64.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#pragma once

#if defined __aarch64__ && defined __APPLE__
#include <simd/simd.h>
#include "Compiler.h"

#define MAGIC_NUMBER 0x82A2B175229D6A5B

namespace PokemonAutomation{

// MAGIC_NUMBER in every lane; used as the initial state[1] of the generators below.
static const simd_ulong4 magic_ulong4 = simd_make_ulong4(MAGIC_NUMBER, MAGIC_NUMBER, MAGIC_NUMBER, MAGIC_NUMBER);

// Per-lane masks that keep only the low 32 bits of each 64-bit lane.
static const simd_ulong4 mask_ulong4 = simd_make_ulong4(UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX);
static const simd_ulong8 mask_ulong8 = simd_make_ulong8(mask_ulong4, mask_ulong4);

// Four independent generators advanced in lock-step, one per 64-bit lane.
// state[0] starts as the per-lane seed and state[1] as magic_ulong4.
class XoroShiro4{
public:
    PA_FORCE_INLINE XoroShiro4(simd_ulong4 seed)
        : state{seed, magic_ulong4}
    {}

    // Current output of every lane: state[0] + state[1].
    PA_FORCE_INLINE simd_ulong4 get(){
        return state[0] + state[1];
    }

    // Same as get(), with each lane reduced to its low 32 bits.
    PA_FORCE_INLINE simd_ulong4 get_masked(){
        const simd_ulong4 sum = state[0] + state[1];
        return sum & mask_ulong4;
    }

    // Advances every lane by one step.
    PA_FORCE_INLINE void next(){
        const simd_ulong4 mixed = state[1] ^ state[0];
        const simd_ulong4 rot24 = (state[0] << 24) | (state[0] >> 40);  // rotate left by 24
        state[1] = (mixed << 37) | (mixed >> 27);                       // rotate left by 37
        state[0] = rot24 ^ mixed ^ (mixed << 16);
    }

private:
    simd_ulong4 state[2];
};


// Eight independent generators advanced in lock-step, one per 64-bit lane.
// state[0] starts as the per-lane seed and state[1] as MAGIC_NUMBER in every lane.
class XoroShiro8{
public:
    PA_FORCE_INLINE XoroShiro8(simd_ulong8 seed)
        : state{seed, simd_make_ulong8(magic_ulong4, magic_ulong4)}
    {}

    // Current output of every lane: state[0] + state[1].
    PA_FORCE_INLINE simd_ulong8 get(){
        return state[0] + state[1];
    }

    // Same as get(), with each lane reduced to its low 32 bits.
    PA_FORCE_INLINE simd_ulong8 get_masked(){
        const simd_ulong8 sum = state[0] + state[1];
        return sum & mask_ulong8;
    }

    // Advances every lane by one step.
    PA_FORCE_INLINE void next(){
        const simd_ulong8 mixed = state[1] ^ state[0];
        const simd_ulong8 rot24 = (state[0] << 24) | (state[0] >> 40);  // rotate left by 24
        state[1] = (mixed << 37) | (mixed >> 27);                       // rotate left by 37
        state[0] = rot24 ^ mixed ^ (mixed << 16);
    }

private:
    simd_ulong8 state[2];
};


}

#undef MAGIC_NUMBER
#endif
1 change: 0 additions & 1 deletion PLA-SeedFinder/Source/Main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@ void test(){
}
#endif

#include <immintrin.h>


int main(){
Expand Down
2 changes: 0 additions & 2 deletions PLA-SeedFinder/Source/PLA-SeedFinder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,6 @@ int32_t pa_PLA_find_seeds_threads(
<< stats.ivs[4] << " "
<< stats.ivs[5] << std::endl;

EcPidMatchReporter reporter(stats);

std::cout << std::endl;
print_isa();
std::cout << "Threads: " << threads << std::endl;
Expand Down
Loading