ExplosionEngine · FlyAndNotDown · Jun 19, 2026 · Jun 18, 2026 · Jun 19, 2026 · Jun 18, 2026
diff --git a/CMake/Target.cmake b/CMake/Target.cmake
@@ -2,6 +2,7 @@ include(GenerateExportHeader)
 include(CMakePackageConfigHelpers)
 
 option(BUILD_TEST "Build unit tests" ON)
+# Master switch for benchmark targets: drives the BUILD_BENCHMARK compile definition and makes exp_add_benchmark a no-op when OFF.
+option(BUILD_BENCHMARK "Build benchmarks" ON)
 
 set(GENERATED_DIR ${CMAKE_BINARY_DIR}/Generated CACHE PATH "" FORCE)
 set(GENERATED_API_HEADER_DIR ${GENERATED_DIR}/Api CACHE PATH "" FORCE)
@@ -18,6 +19,12 @@ else()
     add_compile_definitions(BUILD_TEST=0)
 endif()
 
+# Always define BUILD_BENCHMARK for the preprocessor (1 when the option is enabled, 0 otherwise), so sources can test it
+# with `#if BUILD_BENCHMARK` without first checking whether it is defined.
+if (${BUILD_BENCHMARK})
+    add_compile_definitions(BUILD_BENCHMARK=1)
+else()
+    add_compile_definitions(BUILD_BENCHMARK=0)
+endif()
+
 if ("${SUB_PROJECT_NAME}" STREQUAL "")
     message(FATAL_ERROR "SUB_PROJECT_NAME not defined, please set it in your project cmake")
 endif ()
@@ -379,7 +386,7 @@ endfunction()
 function(exp_add_library)
     set(options NOT_INSTALL)
     set(singleValueArgs NAME TYPE)
-    set(multiValueArgs SRC PRIVATE_INC PUBLIC_INC PRIVATE_LINK PUBLIC_LINK PRIVATE_LIB PUBLIC_LIB REFLECT)
+    set(multiValueArgs SRC PRIVATE_INC PUBLIC_INC PRIVATE_LINK PUBLIC_LINK PRIVATE_LIB PUBLIC_LIB PRIVATE_COMPILE_OPT PUBLIC_COMPILE_OPT REFLECT)
     cmake_parse_arguments(arg "${options}" "${singleValueArgs}" "${multiValueArgs}" ${ARGN})
 
     if ("${arg_TYPE}" STREQUAL "SHARED")
@@ -448,6 +455,11 @@ function(exp_add_library)
         PRIVATE ${arg_PRIVATE_LIB}
         PUBLIC ${arg_PUBLIC_LIB}
     )
+    target_compile_options(
+        ${arg_NAME}
+        PRIVATE ${arg_PRIVATE_COMPILE_OPT}
+        PUBLIC ${arg_PUBLIC_COMPILE_OPT}
+    )
 
     if ("${arg_TYPE}" STREQUAL "SHARED")
         string(TOUPPER ${arg_NAME}_API api_name)
@@ -524,6 +536,30 @@ function(exp_add_test)
     )
 endfunction()
 
+# Declares a benchmark executable. Does nothing when BUILD_BENCHMARK is disabled, so callers may invoke it
+# unconditionally.
+#
+# Arguments:
+#   NAME (required)  target name of the benchmark executable.
+#   SRC, INC, LINK, LIB, DEP_TARGET, RES, REFLECT
+#                    forwarded to exp_add_executable under the same keywords.
+#
+# The `Benchmark` library is always placed ahead of the caller's LIB entries, and the executable is declared with
+# `FOLDER Benchmark` and `NOT_INSTALL`.
+#
+# Example:
+#   exp_add_benchmark(
+#       NAME Common.Benchmark
+#       SRC ${benchmark_sources}
+#       LIB Common
+#   )
+function(exp_add_benchmark)
+    if (NOT ${BUILD_BENCHMARK})
+        return()
+    endif()
+
+    set(options "")
+    set(singleValueArgs NAME)
+    set(multiValueArgs SRC INC LINK LIB DEP_TARGET RES REFLECT)
+    cmake_parse_arguments(arg "${options}" "${singleValueArgs}" "${multiValueArgs}" ${ARGN})
+
+    # Without a name the forwarded call below would read `NAME FOLDER Benchmark ...` and fail far from the real
+    # mistake, so report it here instead.
+    if ("${arg_NAME}" STREQUAL "")
+        message(FATAL_ERROR "exp_add_benchmark: NAME is required")
+    endif()
+
+    # cmake_parse_arguments silently drops anything it does not recognise (for example a misspelled keyword);
+    # surface that without failing the configure step.
+    if (NOT "${arg_UNPARSED_ARGUMENTS}" STREQUAL "")
+        message(WARNING "exp_add_benchmark(${arg_NAME}): ignoring unrecognised arguments: ${arg_UNPARSED_ARGUMENTS}")
+    endif()
+
+    exp_add_executable(
+        NAME ${arg_NAME}
+        FOLDER Benchmark
+        SRC ${arg_SRC}
+        INC ${arg_INC}
+        LINK ${arg_LINK}
+        LIB Benchmark ${arg_LIB}
+        DEP_TARGET ${arg_DEP_TARGET}
+        RES ${arg_RES}
+        REFLECT ${arg_REFLECT}
+        NOT_INSTALL
+    )
+endfunction()
+
 install(
     EXPORT ${SUB_PROJECT_NAME}Targets
     FILE ${SUB_PROJECT_NAME}Targets.cmake

diff --git a/Engine/Source/Benchmark/CMakeLists.txt b/Engine/Source/Benchmark/CMakeLists.txt
@@ -0,0 +1,6 @@
+# Static support library holding the shared benchmark entry point (Src/Main.cpp). benchmark::benchmark is listed as a
+# PUBLIC library so that every target linking `Benchmark` also links Google Benchmark.
+exp_add_library(
+    NAME Benchmark
+    TYPE STATIC
+    SRC Src/Main.cpp
+    PUBLIC_LIB benchmark::benchmark
+)
diff --git a/Engine/Source/Benchmark/Src/Main.cpp b/Engine/Source/Benchmark/Src/Main.cpp
@@ -0,0 +1,7 @@
+//
+// Created by johnk on 2026/6/19.
+//
+
+#include <benchmark/benchmark.h>
+
+// Google Benchmark's macro that expands to the program's main(); benchmark executables obtain their entry point by
+// linking the library built from this file.
+BENCHMARK_MAIN();
diff --git a/Engine/Source/CMakeLists.txt b/Engine/Source/CMakeLists.txt
@@ -2,6 +2,10 @@ if (${BUILD_TEST})
     add_subdirectory(Test)
 endif()
 
+# Build the Benchmark support library (the one exp_add_benchmark links into each benchmark executable) only when
+# benchmarks are enabled.
+if (${BUILD_BENCHMARK})
+    add_subdirectory(Benchmark)
+endif()
+
 add_subdirectory(Common)
 add_subdirectory(Core)
 add_subdirectory(Mirror)

diff --git a/Engine/Source/Common/Benchmark/MathBenchmark.cpp b/Engine/Source/Common/Benchmark/MathBenchmark.cpp
@@ -0,0 +1,228 @@
+//
+// Created by johnk on 2026/6/19.
+//
+
+#include <random>
+#include <vector>
+
+#include <benchmark/benchmark.h>
+
+#include <Common/Math/Vector.h>
+#include <Common/Math/Matrix.h>
+#include <Common/Math/Quaternion.h>
+
+using namespace Common;
+
+// A single 4-wide op (one Vec add, one dot) is latency-bound and, for fixed-size loops, the compiler already
+// auto-vectorizes the scalar backend, so an isolated op shows no SIMD delta. Worse, with compile-time-constant inputs
+// the whole computation is constant-folded and hoisted out of the loop, so a single-op benchmark would measure only a
+// DoNotOptimize store. These benchmarks instead run each op over a runtime-randomized batch (inputs the optimizer
+// cannot fold, output consumed via DoNotOptimize/ClobberMemory) and report items/s, which is the throughput metric where
+// SIMD's lane width actually shows up.
+namespace {
+    // Element count of every batch the benchmarks in this file operate on.
+    constexpr int batchSize = 1024;
+
+    // Returns `count` floats drawn uniformly from [0.5, 1.5).
+    // Every call builds a fresh engine from the same fixed seed, so two calls with the same `count` yield the same
+    // sequence of values.
+    std::vector<float> MakeRandomFloats(const size_t count)
+    {
+        std::mt19937 engine(0x1234u);
+        std::uniform_real_distribution<float> range(0.5f, 1.5f);
+        std::vector<float> floats(count);
+        for (size_t index = 0; index < count; index++) {
+            floats[index] = range(engine);
+        }
+        return floats;
+    }
+
+    // Builds `count` 4-component vectors, each constructed from four consecutive random floats.
+    template <MathBackend B>
+    std::vector<Vec<float, 4, B>> MakeRandomVecs(const size_t count)
+    {
+        using VecType = Vec<float, 4, B>;
+
+        const auto floats = MakeRandomFloats(count * 4);
+        std::vector<VecType> vecs(count);
+        for (size_t index = 0; index < count; index++) {
+            const float* lanes = &floats[index * 4];
+            vecs[index] = VecType(lanes[0], lanes[1], lanes[2], lanes[3]);
+        }
+        return vecs;
+    }
+
+    // Builds `count` 4x4 matrices, each constructed from sixteen consecutive random floats passed in sequence order.
+    template <MathBackend B>
+    std::vector<Mat<float, 4, 4, B>> MakeRandomMats(const size_t count)
+    {
+        using MatType = Mat<float, 4, 4, B>;
+
+        const auto floats = MakeRandomFloats(count * 16);
+        std::vector<MatType> mats(count);
+        for (size_t index = 0; index < count; index++) {
+            const size_t base = index * 16;
+            mats[index] = MatType(
+                floats[base + 0], floats[base + 1], floats[base + 2], floats[base + 3],
+                floats[base + 4], floats[base + 5], floats[base + 6], floats[base + 7],
+                floats[base + 8], floats[base + 9], floats[base + 10], floats[base + 11],
+                floats[base + 12], floats[base + 13], floats[base + 14], floats[base + 15]);
+        }
+        return mats;
+    }
+
+    // Builds `count` quaternions, each constructed from four consecutive random floats.
+    // NOTE(review): the components are raw random values; nothing here normalizes them.
+    template <MathBackend B>
+    std::vector<Quaternion<float, B>> MakeRandomQuats(const size_t count)
+    {
+        using QuatType = Quaternion<float, B>;
+
+        const auto floats = MakeRandomFloats(count * 4);
+        std::vector<QuatType> quats(count);
+        for (size_t index = 0; index < count; index++) {
+            const float* parts = &floats[index * 4];
+            quats[index] = QuatType(parts[0], parts[1], parts[2], parts[3]);
+        }
+        return quats;
+    }
+
+    // Builds `count` 3-component vectors, each constructed from three consecutive random floats.
+    template <MathBackend B>
+    std::vector<Vec<float, 3, B>> MakeRandomVec3s(const size_t count)
+    {
+        using VecType = Vec<float, 3, B>;
+
+        const auto floats = MakeRandomFloats(count * 3);
+        std::vector<VecType> vecs(count);
+        for (size_t index = 0; index < count; index++) {
+            const float* lanes = &floats[index * 3];
+            vecs[index] = VecType(lanes[0], lanes[1], lanes[2]);
+        }
+        return vecs;
+    }
+
+    // Builds `count` 3x3 matrices, each constructed from nine consecutive random floats passed in sequence order.
+    template <MathBackend B>
+    std::vector<Mat<float, 3, 3, B>> MakeRandomMat3s(const size_t count)
+    {
+        using MatType = Mat<float, 3, 3, B>;
+
+        const auto floats = MakeRandomFloats(count * 9);
+        std::vector<MatType> mats(count);
+        for (size_t index = 0; index < count; index++) {
+            const size_t base = index * 9;
+            mats[index] = MatType(
+                floats[base + 0], floats[base + 1], floats[base + 2],
+                floats[base + 3], floats[base + 4], floats[base + 5],
+                floats[base + 6], floats[base + 7], floats[base + 8]);
+        }
+        return mats;
+    }
+}
+
+// Measures element-wise addition of two batches of 4-component vectors on backend B and records the number of
+// additions performed as items processed.
+template <MathBackend B>
+static void VecAddBatch(benchmark::State& state)
+{
+    const auto lhs = MakeRandomVecs<B>(batchSize);
+    const auto rhs = MakeRandomVecs<B>(batchSize);
+    std::vector<Vec<float, 4, B>> sums(batchSize);
+
+    for (auto _ : state) {
+        for (int index = 0; index < batchSize; ++index) {
+            sums[index] = lhs[index] + rhs[index];
+        }
+        // Keep the result buffer observable so the stores above are not discarded.
+        benchmark::DoNotOptimize(sums.data());
+        benchmark::ClobberMemory();
+    }
+
+    const auto itemsProcessed = state.iterations() * batchSize;
+    state.SetItemsProcessed(itemsProcessed);
+}
+BENCHMARK(VecAddBatch<MathBackend::scalar>);
+BENCHMARK(VecAddBatch<MathBackend::simd>);
+
+// Measures dot products of two batches of 4-component vectors on backend B, accumulating them into one float, and
+// records the number of dot products performed as items processed.
+template <MathBackend B>
+static void VecDotBatch(benchmark::State& state)
+{
+    const auto lhs = MakeRandomVecs<B>(batchSize);
+    const auto rhs = MakeRandomVecs<B>(batchSize);
+
+    for (auto _ : state) {
+        float accumulated = 0.0f;
+        for (int index = 0; index < batchSize; ++index) {
+            accumulated += lhs[index].Dot(rhs[index]);
+        }
+        // The running total is the only output of an iteration; keep it observable.
+        benchmark::DoNotOptimize(accumulated);
+    }
+
+    const auto itemsProcessed = state.iterations() * batchSize;
+    state.SetItemsProcessed(itemsProcessed);
+}
+BENCHMARK(VecDotBatch<MathBackend::scalar>);
+BENCHMARK(VecDotBatch<MathBackend::simd>);
+
+// Measures pairwise 4x4 matrix products over two batches on backend B and records the number of products performed as
+// items processed.
+template <MathBackend B>
+static void MatMulBatch(benchmark::State& state)
+{
+    const auto lhs = MakeRandomMats<B>(batchSize);
+    const auto rhs = MakeRandomMats<B>(batchSize);
+    std::vector<Mat<float, 4, 4, B>> products(batchSize);
+
+    for (auto _ : state) {
+        for (int index = 0; index < batchSize; ++index) {
+            products[index] = lhs[index] * rhs[index];
+        }
+        // Keep the result buffer observable so the stores above are not discarded.
+        benchmark::DoNotOptimize(products.data());
+        benchmark::ClobberMemory();
+    }
+
+    const auto itemsProcessed = state.iterations() * batchSize;
+    state.SetItemsProcessed(itemsProcessed);
+}
+BENCHMARK(MatMulBatch<MathBackend::scalar>);
+BENCHMARK(MatMulBatch<MathBackend::simd>);
+
+// QuatOps<float, simd>::Mul evaluates the Hamilton product as four broadcast-and-permute terms, so this measures the
+// SIMD quaternion product against the scalar one rather than a tie.
+template <MathBackend B>
+static void QuatMulBatch(benchmark::State& state)
+{
+    // Inputs are built once, outside the timed loop.
+    const auto lhs = MakeRandomQuats<B>(batchSize);
+    const auto rhs = MakeRandomQuats<B>(batchSize);
+    std::vector<Quaternion<float, B>> products(batchSize);
+
+    for (auto _ : state) {
+        for (int index = 0; index < batchSize; ++index) {
+            products[index] = lhs[index] * rhs[index];
+        }
+        // Keep the result buffer observable so the stores above are not discarded.
+        benchmark::DoNotOptimize(products.data());
+        benchmark::ClobberMemory();
+    }
+
+    // One quaternion product per element, per iteration.
+    const auto itemsProcessed = state.iterations() * batchSize;
+    state.SetItemsProcessed(itemsProcessed);
+}
+BENCHMARK(QuatMulBatch<MathBackend::scalar>);
+BENCHMARK(QuatMulBatch<MathBackend::simd>);
+
+// Note for the Mat3 batches defined after Mat4InverseBatch (Mat3MulBatch, Mat3MulVecBatch): Mat3 keeps its tight
+// float[9] storage; the simd backend loads it with safe partial loads (two full 128-bit loads plus a Load3 tail).
+// Those batches show whether that approach (labelled "2b") beats the scalar 3x3 paths once the per-op load cost is
+// amortized across the matrix product / transform.
+// Mat4InverseBatch: measures Inverse() over a batch of 4x4 matrices on backend B and records the number of inversions
+// performed as items processed.
+// NOTE(review): the random inputs are not checked for invertibility — confirm Inverse() copes with near-singular input.
+template <MathBackend B>
+static void Mat4InverseBatch(benchmark::State& state)
+{
+    const auto inputs = MakeRandomMats<B>(batchSize);
+    std::vector<Mat<float, 4, 4, B>> inverses(batchSize);
+
+    for (auto _ : state) {
+        for (int index = 0; index < batchSize; ++index) {
+            inverses[index] = inputs[index].Inverse();
+        }
+        // Keep the result buffer observable so the stores above are not discarded.
+        benchmark::DoNotOptimize(inverses.data());
+        benchmark::ClobberMemory();
+    }
+
+    const auto itemsProcessed = state.iterations() * batchSize;
+    state.SetItemsProcessed(itemsProcessed);
+}
+BENCHMARK(Mat4InverseBatch<MathBackend::scalar>);
+BENCHMARK(Mat4InverseBatch<MathBackend::simd>);
+
+// Measures pairwise 3x3 matrix products over two batches on backend B and records the number of products performed as
+// items processed.
+template <MathBackend B>
+static void Mat3MulBatch(benchmark::State& state)
+{
+    const auto lhs = MakeRandomMat3s<B>(batchSize);
+    const auto rhs = MakeRandomMat3s<B>(batchSize);
+    std::vector<Mat<float, 3, 3, B>> products(batchSize);
+
+    for (auto _ : state) {
+        for (int index = 0; index < batchSize; ++index) {
+            products[index] = lhs[index] * rhs[index];
+        }
+        // Keep the result buffer observable so the stores above are not discarded.
+        benchmark::DoNotOptimize(products.data());
+        benchmark::ClobberMemory();
+    }
+
+    const auto itemsProcessed = state.iterations() * batchSize;
+    state.SetItemsProcessed(itemsProcessed);
+}
+BENCHMARK(Mat3MulBatch<MathBackend::scalar>);
+BENCHMARK(Mat3MulBatch<MathBackend::simd>);
+
+// Measures 3x3 matrix times 3-component vector products over a batch on backend B and records the number of products
+// performed as items processed.
+template <MathBackend B>
+static void Mat3MulVecBatch(benchmark::State& state)
+{
+    const auto matrices = MakeRandomMat3s<B>(batchSize);
+    const auto vectors = MakeRandomVec3s<B>(batchSize);
+    std::vector<Vec<float, 3, B>> transformed(batchSize);
+
+    for (auto _ : state) {
+        for (int index = 0; index < batchSize; ++index) {
+            transformed[index] = matrices[index] * vectors[index];
+        }
+        // Keep the result buffer observable so the stores above are not discarded.
+        benchmark::DoNotOptimize(transformed.data());
+        benchmark::ClobberMemory();
+    }
+
+    const auto itemsProcessed = state.iterations() * batchSize;
+    state.SetItemsProcessed(itemsProcessed);
+}
+BENCHMARK(Mat3MulVecBatch<MathBackend::scalar>);
+BENCHMARK(Mat3MulVecBatch<MathBackend::simd>);
diff --git a/Engine/Source/Common/CMakeLists.txt b/Engine/Source/Common/CMakeLists.txt
@@ -1,10 +1,24 @@
+# Math SIMD baseline. The math types are header-only, so the option is PUBLIC: every consumer instantiates them and must
+# share one ISA baseline. The simd backend only relies on the SSE2/NEON baseline (present on every CPU this engine
+# targets), so it is always on; this just lifts the compiler's instruction-set baseline where that helps codegen. MSVC
+# x64 implies SSE2 and has no /arch:SSE4.2; on gcc/clang lift the baseline to SSE4.2 where supported (the check fails and
+# is skipped on non-x86 targets such as aarch64, which already ship NEON).
+# Start from an empty list: when the branch below is skipped (MSVC) or the flag probe fails, the value forwarded to
+# PUBLIC_COMPILE_OPT must be empty rather than whatever a same-named variable in an enclosing scope happens to hold.
+set(math_public_compile_opt "")
+if (NOT MSVC)
+    include(CheckCXXCompilerFlag)
+    # has_msse42 is true only when the C++ compiler accepts -msse4.2.
+    check_cxx_compiler_flag("-msse4.2" has_msse42)
+    if (has_msse42)
+        set(math_public_compile_opt -msse4.2)
+    endif ()
+endif ()
+
+# Common is a static library built from every .cpp found recursively under Src/. The compile options passed as
+# PUBLIC_COMPILE_OPT are applied with PUBLIC visibility, so they also reach targets that link Common.
 file(GLOB_RECURSE sources Src/*.cpp)
 exp_add_library(
     NAME Common
     TYPE STATIC
     SRC ${sources}
     PUBLIC_INC Include
     PUBLIC_LIB rapidjson debugbreak::debugbreak cityhash::cityhash Taskflow::Taskflow
+    PUBLIC_COMPILE_OPT ${math_public_compile_opt}
 )
 
 file(GLOB test_sources Test/*.cpp)
@@ -14,3 +28,11 @@ exp_add_test(
     SRC ${test_sources}
     LIB Common
 )
+
+# exp_add_benchmark early-returns when BUILD_BENCHMARK is OFF, so this is safe to declare unconditionally.
+# Every .cpp directly under Benchmark/ (non-recursive glob) is compiled into the Common.Benchmark executable, which
+# links Common in addition to the Benchmark library that exp_add_benchmark always adds.
+file(GLOB benchmark_sources Benchmark/*.cpp)
+exp_add_benchmark(
+    NAME Common.Benchmark
+    SRC ${benchmark_sources}
+    LIB Common
+)