Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 37 additions & 1 deletion CMake/Target.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ include(GenerateExportHeader)
include(CMakePackageConfigHelpers)

option(BUILD_TEST "Build unit tests" ON)
option(BUILD_BENCHMARK "Build benchmarks" ON)

set(GENERATED_DIR ${CMAKE_BINARY_DIR}/Generated CACHE PATH "" FORCE)
set(GENERATED_API_HEADER_DIR ${GENERATED_DIR}/Api CACHE PATH "" FORCE)
Expand All @@ -18,6 +19,12 @@ else()
add_compile_definitions(BUILD_TEST=0)
endif()

# Mirror the BUILD_BENCHMARK option into the compiled sources as a 0/1 preprocessor definition, so code can test
# `#if BUILD_BENCHMARK` without checking whether the macro is defined. add_compile_definitions applies to the current
# directory scope, i.e. to targets created after this module is included.
if (${BUILD_BENCHMARK})
add_compile_definitions(BUILD_BENCHMARK=1)
else()
add_compile_definitions(BUILD_BENCHMARK=0)
endif()

# SUB_PROJECT_NAME must be set by the including project before this module is processed; configuration is aborted
# when it is empty or undefined (an undefined variable expands to the empty string here).
if ("${SUB_PROJECT_NAME}" STREQUAL "")
message(FATAL_ERROR "SUB_PROJECT_NAME not defined, please set it in your project cmake")
endif ()
Expand Down Expand Up @@ -379,7 +386,7 @@ endfunction()
function(exp_add_library)
set(options NOT_INSTALL)
set(singleValueArgs NAME TYPE)
set(multiValueArgs SRC PRIVATE_INC PUBLIC_INC PRIVATE_LINK PUBLIC_LINK PRIVATE_LIB PUBLIC_LIB REFLECT)
set(multiValueArgs SRC PRIVATE_INC PUBLIC_INC PRIVATE_LINK PUBLIC_LINK PRIVATE_LIB PUBLIC_LIB PRIVATE_COMPILE_OPT PUBLIC_COMPILE_OPT REFLECT)
cmake_parse_arguments(arg "${options}" "${singleValueArgs}" "${multiValueArgs}" ${ARGN})

if ("${arg_TYPE}" STREQUAL "SHARED")
Expand Down Expand Up @@ -448,6 +455,11 @@ function(exp_add_library)
PRIVATE ${arg_PRIVATE_LIB}
PUBLIC ${arg_PUBLIC_LIB}
)
target_compile_options(
${arg_NAME}
PRIVATE ${arg_PRIVATE_COMPILE_OPT}
PUBLIC ${arg_PUBLIC_COMPILE_OPT}
)

if ("${arg_TYPE}" STREQUAL "SHARED")
string(TOUPPER ${arg_NAME}_API api_name)
Expand Down Expand Up @@ -524,6 +536,30 @@ function(exp_add_test)
)
endfunction()

# Declares a benchmark executable. Does nothing when the BUILD_BENCHMARK option is off, so callers may invoke it
# unconditionally.
#
# Arguments:
#   NAME                                      (single value, required) name of the benchmark target.
#   SRC INC LINK DEP_TARGET RES REFLECT       (multi value) forwarded unchanged to exp_add_executable.
#   LIB                                       (multi value) forwarded to exp_add_executable after the `Benchmark`
#                                             library, which is always linked first.
#
# The call to exp_add_executable also passes `FOLDER Benchmark` and `NOT_INSTALL`.
#
# Example:
#   exp_add_benchmark(
#       NAME Common.Benchmark
#       SRC ${benchmark_sources}
#       LIB Common
#   )
function(exp_add_benchmark)
    # Test the variable by name instead of expanding it: with `${BUILD_BENCHMARK}` an empty or undefined value would
    # collapse the condition to `if (NOT)`, which does not take the early return.
    if (NOT BUILD_BENCHMARK)
        return()
    endif ()

    set(options "")
    set(singleValueArgs NAME)
    set(multiValueArgs SRC INC LINK LIB DEP_TARGET RES REFLECT)
    cmake_parse_arguments(arg "${options}" "${singleValueArgs}" "${multiValueArgs}" ${ARGN})

    # Fail here with a clear message rather than forwarding an empty target name.
    if ("${arg_NAME}" STREQUAL "")
        message(FATAL_ERROR "exp_add_benchmark: NAME is required")
    endif ()

    exp_add_executable(
        NAME ${arg_NAME}
        FOLDER Benchmark
        SRC ${arg_SRC}
        INC ${arg_INC}
        LINK ${arg_LINK}
        LIB Benchmark ${arg_LIB}
        DEP_TARGET ${arg_DEP_TARGET}
        RES ${arg_RES}
        REFLECT ${arg_REFLECT}
        NOT_INSTALL
    )
endfunction()

install(
EXPORT ${SUB_PROJECT_NAME}Targets
FILE ${SUB_PROJECT_NAME}Targets.cmake
Expand Down
6 changes: 6 additions & 0 deletions Engine/Source/Benchmark/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Static library shared by the benchmark executables. Its only source is Src/Main.cpp, and benchmark::benchmark is
# passed as PUBLIC_LIB so it is declared as a public dependency of this target.
exp_add_library(
NAME Benchmark
TYPE STATIC
SRC Src/Main.cpp
PUBLIC_LIB benchmark::benchmark
)
7 changes: 7 additions & 0 deletions Engine/Source/Benchmark/Src/Main.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
//
// Created by johnk on 2026/6/19.
//

#include <benchmark/benchmark.h>

// BENCHMARK_MAIN() expands to the program's main(), which initializes Google Benchmark and runs every benchmark
// registered through BENCHMARK(...). This file is the sole source of the `Benchmark` static library, so benchmark
// executables linking that library do not define their own main().
BENCHMARK_MAIN();
4 changes: 4 additions & 0 deletions Engine/Source/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@ if (${BUILD_TEST})
add_subdirectory(Test)
endif()

# The Benchmark subdirectory is only configured when benchmarks are enabled.
if (${BUILD_BENCHMARK})
add_subdirectory(Benchmark)
endif()

add_subdirectory(Common)
add_subdirectory(Core)
add_subdirectory(Mirror)
Expand Down
228 changes: 228 additions & 0 deletions Engine/Source/Common/Benchmark/MathBenchmark.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,228 @@
//
// Created by johnk on 2026/6/19.
//

#include <random>
#include <vector>

#include <benchmark/benchmark.h>

#include <Common/Math/Vector.h>
#include <Common/Math/Matrix.h>
#include <Common/Math/Quaternion.h>

using namespace Common;

// A single 4-wide op (one Vec add, one dot) is latency-bound and, for fixed-size loops, the compiler already
// auto-vectorizes the scalar backend, so an isolated op shows no SIMD delta. Worse, with compile-time-constant inputs
// the whole computation is constant-folded and hoisted out of the loop, so a single-op benchmark would measure only a
// DoNotOptimize store. These benchmarks instead run each op over a runtime-generated pseudo-random batch (inputs the
// optimizer cannot fold, output consumed via DoNotOptimize/ClobberMemory) and report items/s, which is the throughput
// metric where SIMD's lane width actually shows up.
namespace {
constexpr int batchSize = 1024;

std::vector<float> MakeRandomFloats(const size_t count)
{
std::mt19937 rng(0x1234u);
std::uniform_real_distribution<float> dist(0.5f, 1.5f);
std::vector<float> values(count);
for (auto& value : values) {
value = dist(rng);
}
return values;
}

template <MathBackend B>
std::vector<Vec<float, 4, B>> MakeRandomVecs(const size_t count)
{
const auto raw = MakeRandomFloats(count * 4);
std::vector<Vec<float, 4, B>> result(count);
for (size_t i = 0; i < count; i++) {
result[i] = Vec<float, 4, B>(raw[i * 4 + 0], raw[i * 4 + 1], raw[i * 4 + 2], raw[i * 4 + 3]);
}
return result;
}

template <MathBackend B>
std::vector<Mat<float, 4, 4, B>> MakeRandomMats(const size_t count)
{
const auto raw = MakeRandomFloats(count * 16);
std::vector<Mat<float, 4, 4, B>> result(count);
for (size_t i = 0; i < count; i++) {
const float* p = &raw[i * 16];
result[i] = Mat<float, 4, 4, B>(
p[0], p[1], p[2], p[3],
p[4], p[5], p[6], p[7],
p[8], p[9], p[10], p[11],
p[12], p[13], p[14], p[15]);
}
return result;
}

template <MathBackend B>
std::vector<Quaternion<float, B>> MakeRandomQuats(const size_t count)
{
const auto raw = MakeRandomFloats(count * 4);
std::vector<Quaternion<float, B>> result(count);
for (size_t i = 0; i < count; i++) {
result[i] = Quaternion<float, B>(raw[i * 4 + 0], raw[i * 4 + 1], raw[i * 4 + 2], raw[i * 4 + 3]);
}
return result;
}

template <MathBackend B>
std::vector<Vec<float, 3, B>> MakeRandomVec3s(const size_t count)
{
const auto raw = MakeRandomFloats(count * 3);
std::vector<Vec<float, 3, B>> result(count);
for (size_t i = 0; i < count; i++) {
result[i] = Vec<float, 3, B>(raw[i * 3 + 0], raw[i * 3 + 1], raw[i * 3 + 2]);
}
return result;
}

template <MathBackend B>
std::vector<Mat<float, 3, 3, B>> MakeRandomMat3s(const size_t count)
{
const auto raw = MakeRandomFloats(count * 9);
std::vector<Mat<float, 3, 3, B>> result(count);
for (size_t i = 0; i < count; i++) {
const float* p = &raw[i * 9];
result[i] = Mat<float, 3, 3, B>(
p[0], p[1], p[2],
p[3], p[4], p[5],
p[6], p[7], p[8]);
}
return result;
}
}

// Benchmarks element-wise addition of 4-component vectors. Each timed iteration adds batchSize vector pairs into an
// output buffer that is exposed through DoNotOptimize/ClobberMemory so the stores cannot be elided. Reports items/s.
template <MathBackend B>
static void VecAddBatch(benchmark::State& state)
{
    using VecType = Vec<float, 4, B>;

    const std::vector<VecType> lhs = MakeRandomVecs<B>(batchSize);
    const std::vector<VecType> rhs = MakeRandomVecs<B>(batchSize);
    std::vector<VecType> sums(batchSize);

    for (auto _ : state) {
        for (int index = 0; index < batchSize; index++) {
            sums[index] = lhs[index] + rhs[index];
        }
        benchmark::DoNotOptimize(sums.data());
        benchmark::ClobberMemory();
    }

    state.SetItemsProcessed(state.iterations() * batchSize);
}
BENCHMARK(VecAddBatch<MathBackend::scalar>);
BENCHMARK(VecAddBatch<MathBackend::simd>);

// Benchmarks the dot product of 4-component vectors. Each timed iteration accumulates Dot() over batchSize vector
// pairs into a single float, which is handed to DoNotOptimize so the reduction cannot be discarded. Reports items/s.
template <MathBackend B>
static void VecDotBatch(benchmark::State& state)
{
    using VecType = Vec<float, 4, B>;

    const std::vector<VecType> lhs = MakeRandomVecs<B>(batchSize);
    const std::vector<VecType> rhs = MakeRandomVecs<B>(batchSize);

    for (auto _ : state) {
        float accumulated = 0.0f;
        for (int index = 0; index < batchSize; index++) {
            accumulated += lhs[index].Dot(rhs[index]);
        }
        benchmark::DoNotOptimize(accumulated);
    }

    state.SetItemsProcessed(state.iterations() * batchSize);
}
BENCHMARK(VecDotBatch<MathBackend::scalar>);
BENCHMARK(VecDotBatch<MathBackend::simd>);

// Benchmarks the 4x4 matrix product. Each timed iteration multiplies batchSize matrix pairs into an output buffer
// that is exposed through DoNotOptimize/ClobberMemory so the stores cannot be elided. Reports items/s.
template <MathBackend B>
static void MatMulBatch(benchmark::State& state)
{
    using MatType = Mat<float, 4, 4, B>;

    const std::vector<MatType> lhs = MakeRandomMats<B>(batchSize);
    const std::vector<MatType> rhs = MakeRandomMats<B>(batchSize);
    std::vector<MatType> products(batchSize);

    for (auto _ : state) {
        for (int index = 0; index < batchSize; index++) {
            products[index] = lhs[index] * rhs[index];
        }
        benchmark::DoNotOptimize(products.data());
        benchmark::ClobberMemory();
    }

    state.SetItemsProcessed(state.iterations() * batchSize);
}
BENCHMARK(MatMulBatch<MathBackend::scalar>);
BENCHMARK(MatMulBatch<MathBackend::simd>);

// Benchmarks the quaternion product over batchSize pairs per timed iteration and reports items/s.
// QuatOps<float, simd>::Mul evaluates the Hamilton product as four broadcast-and-permute terms, so this measures the
// SIMD quaternion product against the scalar one rather than a tie.
template <MathBackend B>
static void QuatMulBatch(benchmark::State& state)
{
    using QuatType = Quaternion<float, B>;

    const std::vector<QuatType> lhs = MakeRandomQuats<B>(batchSize);
    const std::vector<QuatType> rhs = MakeRandomQuats<B>(batchSize);
    std::vector<QuatType> products(batchSize);

    for (auto _ : state) {
        for (int index = 0; index < batchSize; index++) {
            products[index] = lhs[index] * rhs[index];
        }
        benchmark::DoNotOptimize(products.data());
        benchmark::ClobberMemory();
    }

    state.SetItemsProcessed(state.iterations() * batchSize);
}
BENCHMARK(QuatMulBatch<MathBackend::scalar>);
BENCHMARK(QuatMulBatch<MathBackend::simd>);

// Benchmarks 4x4 matrix inversion. Each timed iteration calls Inverse() on batchSize matrices and writes the results
// into an output buffer that is exposed through DoNotOptimize/ClobberMemory so the stores cannot be elided. Reports
// items/s.
template <MathBackend B>
static void Mat4InverseBatch(benchmark::State& state)
{
    using MatType = Mat<float, 4, 4, B>;

    const std::vector<MatType> inputs = MakeRandomMats<B>(batchSize);
    std::vector<MatType> inverses(batchSize);

    for (auto _ : state) {
        for (int index = 0; index < batchSize; index++) {
            inverses[index] = inputs[index].Inverse();
        }
        benchmark::DoNotOptimize(inverses.data());
        benchmark::ClobberMemory();
    }

    state.SetItemsProcessed(state.iterations() * batchSize);
}
BENCHMARK(Mat4InverseBatch<MathBackend::scalar>);
BENCHMARK(Mat4InverseBatch<MathBackend::simd>);

// Mat3 batches. Mat3 keeps its tight float[9] storage; the simd backend loads it with safe partial loads (two full
// 128-bit loads plus a Load3 tail). These batches show whether that 2b approach beats the scalar 3x3 paths once the
// per-op load cost is amortized across the matrix product / transform.

// Benchmarks the 3x3 matrix product over batchSize pairs per timed iteration and reports items/s.
template <MathBackend B>
static void Mat3MulBatch(benchmark::State& state)
{
    using MatType = Mat<float, 3, 3, B>;

    const std::vector<MatType> lhs = MakeRandomMat3s<B>(batchSize);
    const std::vector<MatType> rhs = MakeRandomMat3s<B>(batchSize);
    std::vector<MatType> products(batchSize);

    for (auto _ : state) {
        for (int index = 0; index < batchSize; index++) {
            products[index] = lhs[index] * rhs[index];
        }
        benchmark::DoNotOptimize(products.data());
        benchmark::ClobberMemory();
    }

    state.SetItemsProcessed(state.iterations() * batchSize);
}
BENCHMARK(Mat3MulBatch<MathBackend::scalar>);
BENCHMARK(Mat3MulBatch<MathBackend::simd>);

// Benchmarks the 3x3 matrix times 3-component vector product over batchSize pairs per timed iteration and reports
// items/s.
template <MathBackend B>
static void Mat3MulVecBatch(benchmark::State& state)
{
    using MatType = Mat<float, 3, 3, B>;
    using VecType = Vec<float, 3, B>;

    const std::vector<MatType> matrices = MakeRandomMat3s<B>(batchSize);
    const std::vector<VecType> vectors = MakeRandomVec3s<B>(batchSize);
    std::vector<VecType> transformed(batchSize);

    for (auto _ : state) {
        for (int index = 0; index < batchSize; index++) {
            transformed[index] = matrices[index] * vectors[index];
        }
        benchmark::DoNotOptimize(transformed.data());
        benchmark::ClobberMemory();
    }

    state.SetItemsProcessed(state.iterations() * batchSize);
}
BENCHMARK(Mat3MulVecBatch<MathBackend::scalar>);
BENCHMARK(Mat3MulVecBatch<MathBackend::simd>);
22 changes: 22 additions & 0 deletions Engine/Source/Common/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,10 +1,24 @@
# Math SIMD baseline. The math types are header-only, so the option is PUBLIC: every consumer instantiates them and must
# share one ISA baseline. The simd backend only relies on the SSE2/NEON baseline (present on every CPU this engine
# targets), so it is always on; this just lifts the compiler's instruction-set baseline where that helps codegen. MSVC
# x64 implies SSE2 and has no /arch:SSE4.2; on gcc/clang lift the baseline to SSE4.2 where supported (the check fails and
# is skipped on non-x86 targets such as aarch64, which already ship NEON).
# Start from an empty option list so that an MSVC build, or a failed flag probe, never forwards a stale
# math_public_compile_opt value inherited from an enclosing directory scope.
set(math_public_compile_opt "")
if (NOT MSVC)
    include(CheckCXXCompilerFlag)
    # Probe the C++ compiler once; check_cxx_compiler_flag caches the outcome in has_msse42.
    check_cxx_compiler_flag("-msse4.2" has_msse42)
    if (has_msse42)
        set(math_public_compile_opt -msse4.2)
    endif ()
endif ()

# Common static library: every .cpp found recursively under Src/ is compiled into it. The glob is evaluated at
# configure time, so newly added source files are picked up on the next CMake configure run.
# PUBLIC_COMPILE_OPT carries the math SIMD baseline flag (empty when no flag was selected) and is passed on by
# exp_add_library as a PUBLIC compile option of the target.
file(GLOB_RECURSE sources Src/*.cpp)
exp_add_library(
NAME Common
TYPE STATIC
SRC ${sources}
PUBLIC_INC Include
PUBLIC_LIB rapidjson debugbreak::debugbreak cityhash::cityhash Taskflow::Taskflow
PUBLIC_COMPILE_OPT ${math_public_compile_opt}
)

file(GLOB test_sources Test/*.cpp)
Expand All @@ -14,3 +28,11 @@ exp_add_test(
SRC ${test_sources}
LIB Common
)

# Benchmark executable for Common, built from every .cpp directly under Benchmark/ (non-recursive glob) and linked
# against Common.
# exp_add_benchmark early-returns when BUILD_BENCHMARK is OFF, so this is safe to declare unconditionally.
file(GLOB benchmark_sources Benchmark/*.cpp)
exp_add_benchmark(
NAME Common.Benchmark
SRC ${benchmark_sources}
LIB Common
)
Loading
Loading