diff --git a/cpp/citor/CMakeLists.txt b/cpp/citor/CMakeLists.txt
new file mode 100644
index 0000000..35c080b
--- /dev/null
+++ b/cpp/citor/CMakeLists.txt
@@ -0,0 +1,73 @@
+# CMake build for the citor ports of the fork-join benchmarks.
+cmake_minimum_required(VERSION 3.16)
+project(runtime_benchmarks_citor)
+
+# Also search ../1CMake for CMake modules.
+set(CMAKE_MODULE_PATH
+    ${runtime_benchmarks_citor_SOURCE_DIR}/../1CMake
+    ${CMAKE_MODULE_PATH})
+
+set(CMAKE_EXPORT_COMPILE_COMMANDS "1")
+set(CMAKE_CXX_STANDARD 20)
+
+# -march=native is a compiler option, not a preprocessor definition, so it is
+# passed through add_compile_options() instead of add_definitions().
+add_compile_options(
+    "-march=native"
+)
+
+include(../1CMake/CPM.cmake)
+
+# Default citor tag; the RUNTIME_BENCHMARKS_LIBRARY_REF environment variable,
+# when defined, replaces it.
+set(CITOR_GIT_TAG v0.4.5)
+if(DEFINED ENV{RUNTIME_BENCHMARKS_LIBRARY_REF})
+    set(CITOR_GIT_TAG "$ENV{RUNTIME_BENCHMARKS_LIBRARY_REF}")
+endif()
+
+CPMAddPackage(
+    NAME citor
+    GIT_REPOSITORY https://github.com/Lallapallooza/citor.git
+    GIT_TAG ${CITOR_GIT_TAG}
+    OPTIONS
+        "CITOR_BUILD_TESTS OFF"
+        "CITOR_BUILD_BENCHMARK OFF"
+)
+
+include_directories("../2common")
+
+# Link the first allocator found, tried in the order tcmalloc, mimalloc,
+# jemalloc; if none is found MALLOC_LIB is never set.
+find_package(libtcmalloc)
+
+if(LIBTCMALLOC_FOUND)
+    set(MALLOC_LIB "${LIBTCMALLOC_LIBRARY}")
+    message(STATUS "Using malloc: ${MALLOC_LIB}")
+else()
+    find_package(libmimalloc)
+
+    if(LIBMIMALLOC_FOUND)
+        set(MALLOC_LIB "${LIBMIMALLOC_LIBRARY}")
+        message(STATUS "Using malloc: ${MALLOC_LIB}")
+    else()
+        find_package(libjemalloc)
+
+        if(LIBJEMALLOC_FOUND)
+            set(MALLOC_LIB "${LIBJEMALLOC_LIBRARY}")
+            message(STATUS "Using malloc: ${MALLOC_LIB}")
+        else()
+            message(STATUS "Using malloc: default")
+        endif()
+    endif()
+endif()
+
+link_libraries(${MALLOC_LIB} citor::citor)
+
+# Larger worker stack for deep recursive fork-join (fib, nqueens).
+add_compile_definitions(CITOR_WORKER_STACK_KIB=65536) + +add_executable(fib fib.cpp) +add_executable(skynet skynet.cpp) +add_executable(nqueens nqueens.cpp) +target_compile_options(nqueens PRIVATE "-falign-loops=64") +add_executable(matmul matmul.cpp) diff --git a/cpp/citor/CMakePresets.json b/cpp/citor/CMakePresets.json new file mode 100644 index 0000000..23d7f58 --- /dev/null +++ b/cpp/citor/CMakePresets.json @@ -0,0 +1,293 @@ +{ + "version": 3, + "configurePresets": [ + { + "name": "clang-linux-debug", + "displayName": "Clang-Linux Debug", + "generator": "Ninja", + "description": "Using compilers: C = clang, CXX = clang++", + "binaryDir": "${sourceDir}/build", + "cacheVariables": { + "CMAKE_BUILD_TYPE": "Debug", + "CMAKE_C_COMPILER": "clang", + "CMAKE_CXX_COMPILER": "clang++", + "CMAKE_EXPORT_COMPILE_COMMANDS": "ON" + }, + "condition": { + "type": "equals", + "lhs": "${hostSystemName}", + "rhs": "Linux" + } + }, + { + "name": "clang-linux-release", + "displayName": "Clang-Linux Release", + "generator": "Ninja", + "description": "Using compilers: C = clang, CXX = clang++", + "binaryDir": "${sourceDir}/build", + "cacheVariables": { + "CMAKE_BUILD_TYPE": "Release", + "CMAKE_C_COMPILER": "clang", + "CMAKE_CXX_COMPILER": "clang++", + "CMAKE_EXPORT_COMPILE_COMMANDS": "ON" + }, + "condition": { + "type": "equals", + "lhs": "${hostSystemName}", + "rhs": "Linux" + } + }, + { + "name": "clang-linux-relwithdebinfo", + "displayName": "Clang-Linux Release with Debug Info", + "generator": "Ninja", + "description": "Using compilers: C = clang, CXX = clang++", + "binaryDir": "${sourceDir}/build", + "cacheVariables": { + "CMAKE_BUILD_TYPE": "RelWithDebInfo", + "CMAKE_C_COMPILER": "clang", + "CMAKE_CXX_COMPILER": "clang++", + "CMAKE_EXPORT_COMPILE_COMMANDS": "ON" + }, + "condition": { + "type": "equals", + "lhs": "${hostSystemName}", + "rhs": "Linux" + } + }, + { + "name": "gcc-linux-debug", + "displayName": "GCC-Linux Debug", + "generator": "Ninja", + "description": "Using 
compilers: C = gcc, CXX = g++", + "binaryDir": "${sourceDir}/build", + "cacheVariables": { + "CMAKE_BUILD_TYPE": "Debug", + "CMAKE_C_COMPILER": "gcc", + "CMAKE_CXX_COMPILER": "g++", + "CMAKE_EXPORT_COMPILE_COMMANDS": "ON" + }, + "condition": { + "type": "equals", + "lhs": "${hostSystemName}", + "rhs": "Linux" + } + }, + { + "name": "gcc-linux-release", + "displayName": "GCC-Linux Release", + "generator": "Ninja", + "description": "Using compilers: C = gcc, CXX = g++", + "binaryDir": "${sourceDir}/build", + "cacheVariables": { + "CMAKE_BUILD_TYPE": "Release", + "CMAKE_C_COMPILER": "gcc", + "CMAKE_CXX_COMPILER": "g++", + "CMAKE_EXPORT_COMPILE_COMMANDS": "ON" + }, + "condition": { + "type": "equals", + "lhs": "${hostSystemName}", + "rhs": "Linux" + } + }, + { + "name": "gcc-linux-relwithdebinfo", + "displayName": "GCC-Linux Release with Debug Info", + "generator": "Ninja", + "description": "Using compilers: C = gcc, CXX = g++", + "binaryDir": "${sourceDir}/build", + "cacheVariables": { + "CMAKE_BUILD_TYPE": "RelWithDebInfo", + "CMAKE_C_COMPILER": "gcc", + "CMAKE_CXX_COMPILER": "g++", + "CMAKE_EXPORT_COMPILE_COMMANDS": "ON" + }, + "condition": { + "type": "equals", + "lhs": "${hostSystemName}", + "rhs": "Linux" + } + }, + { + "name": "clang-win-debug", + "displayName": "Clang-Win Debug", + "generator": "Ninja", + "description": "Using compiler: clang-cl.exe", + "binaryDir": "${sourceDir}/build", + "cacheVariables": { + "CMAKE_BUILD_TYPE": "Debug", + "CMAKE_C_COMPILER": "clang-cl.exe", + "CMAKE_CXX_COMPILER": "clang-cl.exe", + "CMAKE_EXPORT_COMPILE_COMMANDS": "ON" + }, + "architecture": { + "value": "x64", + "strategy": "external" + }, + "toolset": { + "value": "host=x64", + "strategy": "external" + }, + "vendor": { + "microsoft.com/VisualStudioSettings/CMake/1.0": { + "intelliSenseMode": "windows-clang-x64" + } + }, + "condition": { + "type": "equals", + "lhs": "${hostSystemName}", + "rhs": "Windows" + } + }, + { + "name": "clang-win-release", + "displayName": 
"Clang-Win Release", + "generator": "Ninja", + "description": "Using compiler: clang-cl.exe", + "binaryDir": "${sourceDir}/build", + "cacheVariables": { + "CMAKE_BUILD_TYPE": "Release", + "CMAKE_C_COMPILER": "clang-cl.exe", + "CMAKE_CXX_COMPILER": "clang-cl.exe", + "CMAKE_EXPORT_COMPILE_COMMANDS": "ON" + }, + "architecture": { + "value": "x64", + "strategy": "external" + }, + "toolset": { + "value": "host=x64", + "strategy": "external" + }, + "vendor": { + "microsoft.com/VisualStudioSettings/CMake/1.0": { + "intelliSenseMode": "windows-clang-x64" + } + }, + "condition": { + "type": "equals", + "lhs": "${hostSystemName}", + "rhs": "Windows" + } + }, + { + "name": "clang-win-relwithdebinfo", + "displayName": "Clang-Win Release with Debug Info", + "generator": "Ninja", + "description": "Using compiler: clang-cl.exe", + "binaryDir": "${sourceDir}/build", + "cacheVariables": { + "CMAKE_BUILD_TYPE": "RelWithDebInfo", + "CMAKE_C_COMPILER": "clang-cl.exe", + "CMAKE_CXX_COMPILER": "clang-cl.exe", + "CMAKE_EXPORT_COMPILE_COMMANDS": "ON" + }, + "architecture": { + "value": "x64", + "strategy": "external" + }, + "toolset": { + "value": "host=x64", + "strategy": "external" + }, + "vendor": { + "microsoft.com/VisualStudioSettings/CMake/1.0": { + "intelliSenseMode": "windows-clang-x64" + } + }, + "condition": { + "type": "equals", + "lhs": "${hostSystemName}", + "rhs": "Windows" + } + }, + { + "name": "msvc-win-debug", + "displayName": "MSVC-Win Debug", + "description": "Using compiler: cl.exe", + "generator": "Ninja", + "binaryDir": "${sourceDir}/build", + "cacheVariables": { + "CMAKE_BUILD_TYPE": "Debug", + "CMAKE_C_COMPILER": "cl.exe", + "CMAKE_CXX_COMPILER": "cl.exe", + "CMAKE_EXPORT_COMPILE_COMMANDS": "ON" + }, + "architecture": { + "value": "x64", + "strategy": "external" + }, + "toolset": { + "value": "host=x64", + "strategy": "external" + }, + "condition": { + "type": "equals", + "lhs": "${hostSystemName}", + "rhs": "Windows" + } + }, + { + "name": "msvc-win-release", + 
"displayName": "MSVC-Win Release", + "description": "Using compiler: cl.exe", + "generator": "Ninja", + "binaryDir": "${sourceDir}/build", + "cacheVariables": { + "CMAKE_BUILD_TYPE": "Release", + "CMAKE_C_COMPILER": "cl.exe", + "CMAKE_CXX_COMPILER": "cl.exe", + "CMAKE_EXPORT_COMPILE_COMMANDS": "ON", + "CMAKE_CXX_FLAGS": "/DWIN32 /D_WINDOWS /W3 /GR /EHsc /arch:AVX2", + "CMAKE_C_FLAGS": "/DWIN32 /D_WINDOWS /W3 /arch:AVX2" + }, + "architecture": { + "value": "x64", + "strategy": "external" + }, + "toolset": { + "value": "host=x64", + "strategy": "external" + }, + "condition": { + "type": "equals", + "lhs": "${hostSystemName}", + "rhs": "Windows" + } + }, + { + "name": "msvc-win-relwithdebinfo", + "displayName": "MSVC-Win Release with Debug Info", + "description": "Using compiler: cl.exe", + "generator": "Ninja", + "binaryDir": "${sourceDir}/build", + "cacheVariables": { + "CMAKE_BUILD_TYPE": "RelWithDebInfo", + "CMAKE_C_COMPILER": "cl.exe", + "CMAKE_CXX_COMPILER": "cl.exe", + "CMAKE_EXPORT_COMPILE_COMMANDS": "ON", + "CMAKE_CXX_FLAGS": "/DWIN32 /D_WINDOWS /W3 /GR /EHsc /arch:AVX2", + "CMAKE_C_FLAGS": "/DWIN32 /D_WINDOWS /W3 /arch:AVX2", + "CMAKE_CXX_FLAGS_RELWITHDEBINFO": "/MD /Zi /O2 /Ob2 /DNDEBUG", + "CMAKE_C_FLAGS_RELWITHDEBINFO": "/MD /Zi /O2 /Ob2 /DNDEBUG", + "CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO": "/debug /INCREMENTAL:NO", + "CMAKE_MODULE_LINKER_FLAGS_RELWITHDEBINFO": "/debug /INCREMENTAL:NO", + "CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO": "/debug /INCREMENTAL:NO" + }, + "architecture": { + "value": "x64", + "strategy": "external" + }, + "toolset": { + "value": "host=x64", + "strategy": "external" + }, + "condition": { + "type": "equals", + "lhs": "${hostSystemName}", + "rhs": "Windows" + } + } + ] +} \ No newline at end of file diff --git a/cpp/citor/build_all.sh b/cpp/citor/build_all.sh new file mode 100755 index 0000000..f5a6019 --- /dev/null +++ b/cpp/citor/build_all.sh @@ -0,0 +1,3 @@ +PRESET=${1:-"clang-linux-release"} +cmake --preset $PRESET . 
+cmake --build ./build --parallel 16 --target all
diff --git a/cpp/citor/fib.cpp b/cpp/citor/fib.cpp
new file mode 100644
index 0000000..40dc4fb
--- /dev/null
+++ b/cpp/citor/fib.cpp
@@ -0,0 +1,72 @@
+// Port of cpp/libfork/fib.cpp using citor::forkJoin.
+
+#include "memusage.hpp"
+#include "citor/thread_pool.h"
+#include "citor/hints.h"
+
+#include <chrono>
+#include <cinttypes>
+#include <cstdio>
+#include <cstdlib>
+#include <thread>
+
+static size_t thread_count = std::thread::hardware_concurrency() / 2;
+static const size_t iter_count = 1;
+
+// Naive recursive Fibonacci. The two sub-problems of every call are passed to
+// pool.forkJoin as separate tasks; x and y are read right after it returns.
+size_t fibonacci(citor::ThreadPool& pool, size_t n) {
+  if (n < 2) {
+    return n;
+  }
+  size_t x = 0;
+  size_t y = 0;
+  pool.forkJoin(
+    [&] { x = fibonacci(pool, n - 1); },
+    [&] { y = fibonacci(pool, n - 2); }
+  );
+  return x + y;
+}
+
+// Usage: fib n [threads]. Runs fibonacci(30) once as warmup, then times
+// iter_count runs of fibonacci(n) and prints output, duration and max_rss.
+int main(int argc, char* argv[]) {
+  if (argc > 2) {
+    thread_count = static_cast<size_t>(atoi(argv[2]));
+  }
+  if (argc < 2) {
+    printf("Usage: fib \n");
+    exit(0);
+  }
+  size_t n = static_cast<size_t>(atoi(argv[1]));
+
+  std::printf("threads: %zu\n", thread_count);
+  // citor's default PerCpu affinity caps workers at the physical-core
+  // count. When the sweep requests every logical CPU, opt into
+  // SMT-sibling placement so all hardware threads are used.
+  const citor::Affinity affinity =
+    (thread_count == std::thread::hardware_concurrency())
+      ? citor::Affinity::PerCpuSmtPair
+      : citor::Affinity::PerCpu;
+  citor::ThreadPool pool(thread_count, affinity);
+
+  size_t result = fibonacci(pool, 30); // warmup
+  (void)result;
+
+  auto startTime = std::chrono::high_resolution_clock::now();
+
+  for (size_t i = 0; i < iter_count; ++i) {
+    result = fibonacci(pool, n);
+    std::printf("output: %zu\n", result);
+  }
+
+  auto endTime = std::chrono::high_resolution_clock::now();
+  auto totalTimeUs =
+    std::chrono::duration_cast<std::chrono::microseconds>(endTime - startTime);
+  std::printf("runs:\n");
+  std::printf(" - iteration_count: %zu\n", iter_count);
+  std::printf(" duration: %" PRIu64 " us\n",
+              static_cast<uint64_t>(totalTimeUs.count()));
+  std::printf(" max_rss: %ld KiB\n", peak_memory_usage());
+  return 0;
+}
diff --git a/cpp/citor/matmul.cpp b/cpp/citor/matmul.cpp
new file mode 100644
index 0000000..578a1c2
--- /dev/null
+++ b/cpp/citor/matmul.cpp
@@ -0,0 +1,122 @@
+// Port of cpp/libfork/matmul.cpp using citor::forkJoin.
+
+#include "matmul.hpp"
+#include "memusage.hpp"
+#include "citor/thread_pool.h"
+#include "citor/hints.h"
+
+#include <atomic>
+#include <chrono>
+#include <cstdio>
+#include <cstdlib>
+#include <thread>
+#include <vector>
+
+static size_t thread_count = std::thread::hardware_concurrency() / 2;
+
+// Recursive blocked multiply on n x n sub-blocks of row-major matrices with
+// row stride N. Each forkJoin runs four tasks that write the four distinct
+// quadrants of c; every quadrant is visited once per phase, so matmul_small
+// (matmul.hpp) presumably accumulates into c -- TODO confirm.
+static void matmul(citor::ThreadPool& pool, int* a, int* b, int* c, int n,
+                   int N) {
+  if (n <= 32) {
+    matmul_small(a, b, c, n, N);
+    return;
+  }
+  int k = n / 2;
+
+  pool.forkJoin(
+    [&] { matmul(pool, a, b, c, k, N); },
+    [&] { matmul(pool, a, b + k, c + k, k, N); },
+    [&] { matmul(pool, a + k * N, b, c + k * N, k, N); },
+    [&] { matmul(pool, a + k * N, b + k, c + k * N + k, k, N); }
+  );
+
+  pool.forkJoin(
+    [&] { matmul(pool, a + k, b + k * N, c, k, N); },
+    [&] { matmul(pool, a + k, b + k * N + k, c + k, k, N); },
+    [&] { matmul(pool, a + k * N + k, b + k * N, c + k * N, k, N); },
+    [&] { matmul(pool, a + k * N + k, b + k * N + k, c + k * N + k, k, N); }
+  );
+}
+
+// Builds N x N matrices A and B filled with ones and C filled with zeros,
+// runs matmul on them and returns C.
+static std::vector<int> run_matmul(citor::ThreadPool& pool, int N) {
+  std::vector<int> A(N * N, 1);
+  std::vector<int> B(N * N, 1);
+  std::vector<int> C(N * N, 0);
+
+  int* a = A.data();
+  int* b = B.data();
+  int* c = C.data();
+  for (int i = 0; i < N; ++i) {
+    for (int j = 0; j < N; ++j) {
+      a[i * N + j] = 1;
+      b[i * N + j] = 1;
+      c[i * N + j] = 0;
+    }
+  }
+  matmul(pool, a, b, c, N, N);
+  return C;
+}
+
+// Terminates the process unless every entry of C equals N.
+static void validate_result(std::vector<int>& C, int N) {
+  std::atomic_thread_fence(std::memory_order_seq_cst);
+  int* c = C.data();
+  for (int i = 0; i < N; ++i) {
+    for (int j = 0; j < N; ++j) {
+      auto res = c[i * N + j];
+      if (res != N) {
+        std::printf(
+          "Wrong result at (%d,%d) : %d. expected %d\n", i, j, res, N
+        );
+        std::fflush(stdout);
+        std::terminate();
+      }
+    }
+  }
+}
+
+// Times one run_matmul call, validates the result and prints the run record.
+static void run_one(citor::ThreadPool& pool, int N) {
+  auto startTime = std::chrono::high_resolution_clock::now();
+  std::vector<int> result = run_matmul(pool, N);
+  auto endTime = std::chrono::high_resolution_clock::now();
+  validate_result(result, N);
+  auto totalTimeUs =
+    std::chrono::duration_cast<std::chrono::microseconds>(endTime - startTime);
+  std::printf(" - matrix_size: %d\n", N);
+  std::printf(" duration: %zu us\n",
+              static_cast<size_t>(totalTimeUs.count()));
+  std::printf(" max_rss: %ld KiB\n", peak_memory_usage());
+}
+
+// Usage: matmul N [threads]. Does one untimed warmup run, then one timed run.
+int main(int argc, char* argv[]) {
+  if (argc > 2) {
+    thread_count = static_cast<size_t>(atoi(argv[2]));
+  }
+  if (argc < 2) {
+    printf("Usage: matmul \n");
+    exit(0);
+  }
+  int n = atoi(argv[1]);
+  std::printf("threads: %zu\n", thread_count);
+  // citor's default PerCpu affinity caps workers at the physical-core
+  // count. When the sweep requests every logical CPU, opt into
+  // SMT-sibling placement so all hardware threads are used.
+  const citor::Affinity affinity =
+    (thread_count == std::thread::hardware_concurrency())
+      ? citor::Affinity::PerCpuSmtPair
+      : citor::Affinity::PerCpu;
+  citor::ThreadPool pool(thread_count, affinity);
+
+  run_matmul(pool, n); // warmup
+
+  std::printf("runs:\n");
+  run_one(pool, n);
+  return 0;
+}
diff --git a/cpp/citor/nqueens.cpp b/cpp/citor/nqueens.cpp
new file mode 100644
index 0000000..5b2126f
--- /dev/null
+++ b/cpp/citor/nqueens.cpp
@@ -0,0 +1,116 @@
+// Port of cpp/libfork/nqueens.cpp using citor::forkJoinAll.
+
+#include "memusage.hpp"
+#include "citor/thread_pool.h"
+#include "citor/hints.h"
+
+#include <array>
+#include <chrono>
+#include <cinttypes>
+#include <cstdio>
+#include <cstdlib>
+#include <thread>
+
+static size_t thread_count = std::thread::hardware_concurrency() / 2;
+static const size_t iter_count = 1;
+
+inline constexpr int nqueens_work = 14;
+
+// Expected solution counts, indexed by board size (see check_answer).
+inline constexpr std::array answers = {
+  0, 1, 0, 0, 2, 10, 4,
+  40, 92, 352, 724, 2'680, 14'200, 73'712,
+  365'596, 2'279'184, 14'772'512, 95'815'104, 666'090'624,
+};
+
+// Reports a mismatch against answers[nqueens_work]; it does not abort.
+static void check_answer(int result) {
+  if (result != answers[nqueens_work]) {
+    std::printf("error: expected %d, got %d\n", answers[nqueens_work], result);
+  }
+}
+
+// Counts the completions of a size-N board whose first xMax entries of buf
+// hold the queens placed so far. Every legal value for entry xMax becomes one
+// forkJoinAll task that recurses on its own copy of buf.
+template <size_t N>
+int nqueens(citor::ThreadPool& pool, int xMax, std::array<char, N> buf) {
+  if (xMax == static_cast<int>(N)) {
+    return 1;
+  }
+
+  std::array<char, N> ys{};
+  size_t taskCount = 0;
+  for (int y = 0; y < static_cast<int>(N); ++y) {
+    char q = static_cast<char>(y);
+    bool legal = true;
+    for (int x = 0; x < xMax; ++x) {
+      char p = buf[x];
+      if (q == p || q == p - (xMax - x) || q == p + (xMax - x)) {
+        legal = false;
+        break;
+      }
+    }
+    if (legal) {
+      ys[taskCount++] = static_cast<char>(y);
+    }
+  }
+
+  if (taskCount == 0) {
+    return 0;
+  }
+
+  std::array<int, N> values{};
+  pool.forkJoinAll(taskCount, [&](size_t i) {
+    auto childBuf = buf;
+    childBuf[xMax] = ys[i];
+    values[i] = nqueens(pool, xMax + 1, childBuf);
+  });
+
+  int ret = 0;
+  for (size_t i = 0; i < taskCount; ++i) {
+    ret += values[i];
+  }
+  return ret;
+}
+
+// Usage: nqueens [threads]. Solves the nqueens_work-queens problem once as
+// warmup, then iter_count timed times, checking each result.
+int main(int argc, char* argv[]) {
+  if (argc > 1) {
+    thread_count = static_cast<size_t>(atoi(argv[1]));
+  }
+  std::printf("threads: %zu\n", thread_count);
+  // citor's default PerCpu affinity caps workers at the physical-core
+  // count. When the sweep requests every logical CPU, opt into
+  // SMT-sibling placement so all hardware threads are used.
+  const citor::Affinity affinity =
+    (thread_count == std::thread::hardware_concurrency())
+      ? citor::Affinity::PerCpuSmtPair
+      : citor::Affinity::PerCpu;
+  citor::ThreadPool pool(thread_count, affinity);
+  {
+    std::array<char, nqueens_work> buf{};
+    auto result = nqueens(pool, 0, buf); // warmup
+    check_answer(result);
+  }
+
+  auto startTime = std::chrono::high_resolution_clock::now();
+
+  for (size_t i = 0; i < iter_count; ++i) {
+    std::array<char, nqueens_work> buf{};
+    auto result = nqueens(pool, 0, buf);
+    check_answer(result);
+    std::printf("output: %d\n", result);
+  }
+
+  auto endTime = std::chrono::high_resolution_clock::now();
+  auto totalTimeUs =
+    std::chrono::duration_cast<std::chrono::microseconds>(endTime - startTime);
+  std::printf("runs:\n");
+  std::printf(" - iteration_count: %zu\n", iter_count);
+  std::printf(" duration: %" PRIu64 " us\n",
+              static_cast<uint64_t>(totalTimeUs.count()));
+  std::printf(" max_rss: %ld KiB\n", peak_memory_usage());
+  return 0;
+}
diff --git a/cpp/citor/skynet.cpp b/cpp/citor/skynet.cpp
new file mode 100644
index 0000000..8cc45b8
--- /dev/null
+++ b/cpp/citor/skynet.cpp
@@ -0,0 +1,77 @@
+// Port of cpp/libfork/skynet.cpp using citor::forkJoinAll.
+
+#include "memusage.hpp"
+#include "citor/thread_pool.h"
+#include "citor/hints.h"
+
+#include <array>
+#include <chrono>
+#include <cinttypes>
+#include <cstdio>
+#include <cstdlib>
+#include <thread>
+
+static size_t thread_count = std::thread::hardware_concurrency() / 2;
+static const size_t iter_count = 1;
+// Sums the leaf values of a 10-ary tree; a leaf (Depth == DepthMax) is BaseNum.
+template <size_t DepthMax>
+size_t skynet_one(citor::ThreadPool& pool, size_t BaseNum, size_t Depth) {
+  if (Depth == DepthMax) {
+    return BaseNum;
+  }
+  size_t depthOffset = 1;
+  for (size_t i = 0; i < DepthMax - Depth - 1; ++i) {
+    depthOffset *= 10;
+  }
+
+  std::array<size_t, 10> results{};
+  pool.forkJoinAll(10, [&](size_t idx) {
+    results[idx] =
+      skynet_one<DepthMax>(pool, BaseNum + depthOffset * idx, Depth + 1);
+  });
+
+  size_t count = 0;
+  for (size_t idx = 0; idx < 10; ++idx) {
+    count += results[idx];
+  }
+  return count;
+}
+// One full pass; the expected total is hard-coded (main only uses DepthMax 8).
+template <size_t DepthMax>
+void skynet(citor::ThreadPool& pool) {
+  size_t count = skynet_one<DepthMax>(pool, 0, 0);
+  if (count != 4999999950000000ULL) {
+    std::printf("ERROR: wrong result - %zu\n", count);
+  }
+}
+// Usage: skynet [threads]. One warmup pass, then iter_count timed passes.
+int main(int argc, char* argv[]) {
+  if (argc > 1) {
+    thread_count = static_cast<size_t>(atoi(argv[1]));
+  }
+  std::printf("threads: %zu\n", thread_count);
+  // citor's default PerCpu affinity caps workers at the physical-core
+  // count. When the sweep requests every logical CPU, opt into
+  // SMT-sibling placement so all hardware threads are used.
+  const citor::Affinity affinity =
+    (thread_count == std::thread::hardware_concurrency())
+      ? citor::Affinity::PerCpuSmtPair
+      : citor::Affinity::PerCpu;
+  citor::ThreadPool pool(thread_count, affinity);
+
+  skynet<8>(pool); // warmup
+
+  std::printf("runs:\n");
+  auto startTime = std::chrono::high_resolution_clock::now();
+  for (size_t j = 0; j < iter_count; ++j) {
+    skynet<8>(pool);
+  }
+  auto endTime = std::chrono::high_resolution_clock::now();
+  auto totalTimeUs =
+    std::chrono::duration_cast<std::chrono::microseconds>(endTime - startTime);
+  std::printf(" - iteration_count: %zu\n", iter_count);
+  std::printf(" duration: %" PRIu64 " us\n",
+              static_cast<uint64_t>(totalTimeUs.count()));
+  std::printf(" max_rss: %ld KiB\n", peak_memory_usage());
+  return 0;
+}