diff --git a/CMake/Target.cmake b/CMake/Target.cmake index 4c469956..8c93bd3c 100644 --- a/CMake/Target.cmake +++ b/CMake/Target.cmake @@ -2,6 +2,7 @@ include(GenerateExportHeader) include(CMakePackageConfigHelpers) option(BUILD_TEST "Build unit tests" ON) +option(BUILD_BENCHMARK "Build benchmarks" ON) set(GENERATED_DIR ${CMAKE_BINARY_DIR}/Generated CACHE PATH "" FORCE) set(GENERATED_API_HEADER_DIR ${GENERATED_DIR}/Api CACHE PATH "" FORCE) @@ -18,6 +19,12 @@ else() add_compile_definitions(BUILD_TEST=0) endif() +if (${BUILD_BENCHMARK}) + add_compile_definitions(BUILD_BENCHMARK=1) +else() + add_compile_definitions(BUILD_BENCHMARK=0) +endif() + if ("${SUB_PROJECT_NAME}" STREQUAL "") message(FATAL_ERROR "SUB_PROJECT_NAME not defined, please set it in your project cmake") endif () @@ -379,7 +386,7 @@ endfunction() function(exp_add_library) set(options NOT_INSTALL) set(singleValueArgs NAME TYPE) - set(multiValueArgs SRC PRIVATE_INC PUBLIC_INC PRIVATE_LINK PUBLIC_LINK PRIVATE_LIB PUBLIC_LIB REFLECT) + set(multiValueArgs SRC PRIVATE_INC PUBLIC_INC PRIVATE_LINK PUBLIC_LINK PRIVATE_LIB PUBLIC_LIB PRIVATE_COMPILE_OPT PUBLIC_COMPILE_OPT REFLECT) cmake_parse_arguments(arg "${options}" "${singleValueArgs}" "${multiValueArgs}" ${ARGN}) if ("${arg_TYPE}" STREQUAL "SHARED") @@ -448,6 +455,11 @@ function(exp_add_library) PRIVATE ${arg_PRIVATE_LIB} PUBLIC ${arg_PUBLIC_LIB} ) + target_compile_options( + ${arg_NAME} + PRIVATE ${arg_PRIVATE_COMPILE_OPT} + PUBLIC ${arg_PUBLIC_COMPILE_OPT} + ) if ("${arg_TYPE}" STREQUAL "SHARED") string(TOUPPER ${arg_NAME}_API api_name) @@ -524,6 +536,30 @@ function(exp_add_test) ) endfunction() +function(exp_add_benchmark) + if (NOT ${BUILD_BENCHMARK}) + return() + endif() + + set(options "") + set(singleValueArgs NAME) + set(multiValueArgs SRC INC LINK LIB DEP_TARGET RES REFLECT) + cmake_parse_arguments(arg "${options}" "${singleValueArgs}" "${multiValueArgs}" ${ARGN}) + + exp_add_executable( + NAME ${arg_NAME} + FOLDER Benchmark + SRC ${arg_SRC} + INC 
${arg_INC} + LINK ${arg_LINK} + LIB Benchmark ${arg_LIB} + DEP_TARGET ${arg_DEP_TARGET} + RES ${arg_RES} + REFLECT ${arg_REFLECT} + NOT_INSTALL + ) +endfunction() + install( EXPORT ${SUB_PROJECT_NAME}Targets FILE ${SUB_PROJECT_NAME}Targets.cmake diff --git a/Engine/Source/Benchmark/CMakeLists.txt b/Engine/Source/Benchmark/CMakeLists.txt new file mode 100644 index 00000000..8d1a33c3 --- /dev/null +++ b/Engine/Source/Benchmark/CMakeLists.txt @@ -0,0 +1,6 @@ +exp_add_library( + NAME Benchmark + TYPE STATIC + SRC Src/Main.cpp + PUBLIC_LIB benchmark::benchmark +) diff --git a/Engine/Source/Benchmark/Src/Main.cpp b/Engine/Source/Benchmark/Src/Main.cpp new file mode 100644 index 00000000..c4f2923a --- /dev/null +++ b/Engine/Source/Benchmark/Src/Main.cpp @@ -0,0 +1,7 @@ +// +// Created by johnk on 2026/6/19. +// + +#include + +BENCHMARK_MAIN(); diff --git a/Engine/Source/CMakeLists.txt b/Engine/Source/CMakeLists.txt index 7973c315..6880b60e 100644 --- a/Engine/Source/CMakeLists.txt +++ b/Engine/Source/CMakeLists.txt @@ -2,6 +2,10 @@ if (${BUILD_TEST}) add_subdirectory(Test) endif() +if (${BUILD_BENCHMARK}) + add_subdirectory(Benchmark) +endif() + add_subdirectory(Common) add_subdirectory(Core) add_subdirectory(Mirror) diff --git a/Engine/Source/Common/Benchmark/MathBenchmark.cpp b/Engine/Source/Common/Benchmark/MathBenchmark.cpp new file mode 100644 index 00000000..0e0fabb5 --- /dev/null +++ b/Engine/Source/Common/Benchmark/MathBenchmark.cpp @@ -0,0 +1,228 @@ +// +// Created by johnk on 2026/6/19. +// + +#include +#include + +#include + +#include +#include +#include + +using namespace Common; + +// A single 4-wide op (one Vec add, one dot) is latency-bound and, for fixed-size loops, the compiler already +// auto-vectorizes the scalar backend, so an isolated op shows no SIMD delta. Worse, with compile-time-constant inputs +// the whole computation is constant-folded and hoisted out of the loop, so a single-op benchmark would measure only a +// DoNotOptimize store. 
These benchmarks instead run each op over a runtime-randomized batch (inputs the optimizer can +// not fold, output consumed via DoNotOptimize/ClobberMemory) and report items/s, which is the throughput metric where +// SIMD's lane width actually shows up. +namespace { + constexpr int batchSize = 1024; + + std::vector MakeRandomFloats(const size_t count) + { + std::mt19937 rng(0x1234u); + std::uniform_real_distribution dist(0.5f, 1.5f); + std::vector values(count); + for (auto& value : values) { + value = dist(rng); + } + return values; + } + + template + std::vector> MakeRandomVecs(const size_t count) + { + const auto raw = MakeRandomFloats(count * 4); + std::vector> result(count); + for (size_t i = 0; i < count; i++) { + result[i] = Vec(raw[i * 4 + 0], raw[i * 4 + 1], raw[i * 4 + 2], raw[i * 4 + 3]); + } + return result; + } + + template + std::vector> MakeRandomMats(const size_t count) + { + const auto raw = MakeRandomFloats(count * 16); + std::vector> result(count); + for (size_t i = 0; i < count; i++) { + const float* p = &raw[i * 16]; + result[i] = Mat( + p[0], p[1], p[2], p[3], + p[4], p[5], p[6], p[7], + p[8], p[9], p[10], p[11], + p[12], p[13], p[14], p[15]); + } + return result; + } + + template + std::vector> MakeRandomQuats(const size_t count) + { + const auto raw = MakeRandomFloats(count * 4); + std::vector> result(count); + for (size_t i = 0; i < count; i++) { + result[i] = Quaternion(raw[i * 4 + 0], raw[i * 4 + 1], raw[i * 4 + 2], raw[i * 4 + 3]); + } + return result; + } + + template + std::vector> MakeRandomVec3s(const size_t count) + { + const auto raw = MakeRandomFloats(count * 3); + std::vector> result(count); + for (size_t i = 0; i < count; i++) { + result[i] = Vec(raw[i * 3 + 0], raw[i * 3 + 1], raw[i * 3 + 2]); + } + return result; + } + + template + std::vector> MakeRandomMat3s(const size_t count) + { + const auto raw = MakeRandomFloats(count * 9); + std::vector> result(count); + for (size_t i = 0; i < count; i++) { + const float* p = &raw[i 
* 9]; + result[i] = Mat( + p[0], p[1], p[2], + p[3], p[4], p[5], + p[6], p[7], p[8]); + } + return result; + } +} + +template +static void VecAddBatch(benchmark::State& state) +{ + const auto a = MakeRandomVecs(batchSize); + const auto b = MakeRandomVecs(batchSize); + std::vector> c(batchSize); + for (auto _ : state) { + for (int i = 0; i < batchSize; i++) { + c[i] = a[i] + b[i]; + } + benchmark::DoNotOptimize(c.data()); + benchmark::ClobberMemory(); + } + state.SetItemsProcessed(state.iterations() * batchSize); +} +BENCHMARK(VecAddBatch); +BENCHMARK(VecAddBatch); + +template +static void VecDotBatch(benchmark::State& state) +{ + const auto a = MakeRandomVecs(batchSize); + const auto b = MakeRandomVecs(batchSize); + for (auto _ : state) { + float sum = 0.0f; + for (int i = 0; i < batchSize; i++) { + sum += a[i].Dot(b[i]); + } + benchmark::DoNotOptimize(sum); + } + state.SetItemsProcessed(state.iterations() * batchSize); +} +BENCHMARK(VecDotBatch); +BENCHMARK(VecDotBatch); + +template +static void MatMulBatch(benchmark::State& state) +{ + const auto a = MakeRandomMats(batchSize); + const auto b = MakeRandomMats(batchSize); + std::vector> c(batchSize); + for (auto _ : state) { + for (int i = 0; i < batchSize; i++) { + c[i] = a[i] * b[i]; + } + benchmark::DoNotOptimize(c.data()); + benchmark::ClobberMemory(); + } + state.SetItemsProcessed(state.iterations() * batchSize); +} +BENCHMARK(MatMulBatch); +BENCHMARK(MatMulBatch); + +// QuatOps::Mul evaluates the Hamilton product as four broadcast-and-permute terms, so this measures the +// SIMD quaternion product against the scalar one rather than a tie. 
+template +static void QuatMulBatch(benchmark::State& state) +{ + const auto a = MakeRandomQuats(batchSize); + const auto b = MakeRandomQuats(batchSize); + std::vector> c(batchSize); + for (auto _ : state) { + for (int i = 0; i < batchSize; i++) { + c[i] = a[i] * b[i]; + } + benchmark::DoNotOptimize(c.data()); + benchmark::ClobberMemory(); + } + state.SetItemsProcessed(state.iterations() * batchSize); +} +BENCHMARK(QuatMulBatch); +BENCHMARK(QuatMulBatch); + +// Mat3 keeps its tight float[9] storage; the simd backend loads it with safe partial loads (two full 128-bit loads +// plus a Load3 tail). These batches show whether that 2b approach beats the scalar 3x3 paths once the per-op load cost +// is amortized across the matrix product / transform. +template +static void Mat4InverseBatch(benchmark::State& state) +{ + const auto a = MakeRandomMats(batchSize); + std::vector> c(batchSize); + for (auto _ : state) { + for (int i = 0; i < batchSize; i++) { + c[i] = a[i].Inverse(); + } + benchmark::DoNotOptimize(c.data()); + benchmark::ClobberMemory(); + } + state.SetItemsProcessed(state.iterations() * batchSize); +} +BENCHMARK(Mat4InverseBatch); +BENCHMARK(Mat4InverseBatch); + +template +static void Mat3MulBatch(benchmark::State& state) +{ + const auto a = MakeRandomMat3s(batchSize); + const auto b = MakeRandomMat3s(batchSize); + std::vector> c(batchSize); + for (auto _ : state) { + for (int i = 0; i < batchSize; i++) { + c[i] = a[i] * b[i]; + } + benchmark::DoNotOptimize(c.data()); + benchmark::ClobberMemory(); + } + state.SetItemsProcessed(state.iterations() * batchSize); +} +BENCHMARK(Mat3MulBatch); +BENCHMARK(Mat3MulBatch); + +template +static void Mat3MulVecBatch(benchmark::State& state) +{ + const auto m = MakeRandomMat3s(batchSize); + const auto v = MakeRandomVec3s(batchSize); + std::vector> c(batchSize); + for (auto _ : state) { + for (int i = 0; i < batchSize; i++) { + c[i] = m[i] * v[i]; + } + benchmark::DoNotOptimize(c.data()); + benchmark::ClobberMemory(); + } + 
state.SetItemsProcessed(state.iterations() * batchSize); +} +BENCHMARK(Mat3MulVecBatch); +BENCHMARK(Mat3MulVecBatch); diff --git a/Engine/Source/Common/CMakeLists.txt b/Engine/Source/Common/CMakeLists.txt index 044ee744..5b7f1648 100644 --- a/Engine/Source/Common/CMakeLists.txt +++ b/Engine/Source/Common/CMakeLists.txt @@ -1,3 +1,16 @@ +# Math SIMD baseline. The math types are header-only, so the option is PUBLIC: every consumer instantiates them and must +# share one ISA baseline. The simd backend only relies on the SSE2/NEON baseline (present on every CPU this engine +# targets), so it is always on; this just lifts the compiler's instruction-set baseline where that helps codegen. MSVC +# x64 implies SSE2 and has no /arch:SSE4.2; on gcc/clang lift the baseline to SSE4.2 where supported (the check fails and +# is skipped on non-x86 targets such as aarch64, which already ship NEON). +if (NOT MSVC) + include(CheckCXXCompilerFlag) + check_cxx_compiler_flag("-msse4.2" has_msse42) + if (has_msse42) + set(math_public_compile_opt -msse4.2) + endif () +endif () + file(GLOB_RECURSE sources Src/*.cpp) exp_add_library( NAME Common @@ -5,6 +18,7 @@ exp_add_library( SRC ${sources} PUBLIC_INC Include PUBLIC_LIB rapidjson debugbreak::debugbreak cityhash::cityhash Taskflow::Taskflow + PUBLIC_COMPILE_OPT ${math_public_compile_opt} ) file(GLOB test_sources Test/*.cpp) @@ -14,3 +28,11 @@ exp_add_test( SRC ${test_sources} LIB Common ) + +# exp_add_benchmark early-returns when BUILD_BENCHMARK is OFF, so this is safe to declare unconditionally. 
+file(GLOB benchmark_sources Benchmark/*.cpp) +exp_add_benchmark( + NAME Common.Benchmark + SRC ${benchmark_sources} + LIB Common +) diff --git a/Engine/Source/Common/Include/Common/Debug.h b/Engine/Source/Common/Include/Common/Debug.h index 5fc8a85d..3586972a 100644 --- a/Engine/Source/Common/Include/Common/Debug.h +++ b/Engine/Source/Common/Include/Common/Debug.h @@ -5,10 +5,11 @@ #pragma once #include +#include #include -#define Assert(expression) Common::Debug::AssertImpl(expression, #expression, __FILE__, __LINE__) -#define AssertWithReason(expression, reason) Common::Debug::AssertImpl(expression, #expression, __FILE__, __LINE__, reason) +#define Assert(expression) Common::Debug::AssertImpl((expression), #expression, __FILE__, __LINE__) +#define AssertWithReason(expression, reason) Common::Debug::AssertImpl((expression), #expression, __FILE__, __LINE__, (reason)) #define Unimplement() Assert(false) #define QuickFail() Assert(false) #define QuickFailWithReason(reason) AssertWithReason(false, reason) @@ -32,11 +33,23 @@ namespace Common { class Debug { public: - static void AssertImpl(bool expression, const std::string& name, const std::string& file, uint32_t line, const std::string& reason = ""); + // The passing case lives here and is inlined, so the optimizer can see through it: a compile-time-true + // condition folds away entirely, and a runtime one collapses to a single predicted-not-taken branch with no + // std::string construction and no heap allocation. Only an actual failure reaches the out-of-line cold path. 
+ static void AssertImpl(bool expression, const char* name, const char* file, uint32_t line, + std::string_view reason = {}) + { + if (expression) { + return; + } + AssertFailed(name, file, line, reason); + } ~Debug(); private: + static void AssertFailed(const char* name, const char* file, uint32_t line, std::string_view reason); + Debug(); }; diff --git a/Engine/Source/Common/Include/Common/Math/Matrix.h b/Engine/Source/Common/Include/Common/Math/Matrix.h index efffeed1..9d5b7493 100644 --- a/Engine/Source/Common/Include/Common/Math/Matrix.h +++ b/Engine/Source/Common/Include/Common/Math/Matrix.h @@ -4,6 +4,7 @@ #pragma once +#include #include #include #include @@ -24,35 +25,36 @@ namespace Common { max }; - template + template struct Quaternion; // matrix stored in row-major - template + template requires ValidMatDims struct BaseMat { T data[R * C]; }; - template - struct Mat : BaseMat { + template + struct Mat : BaseMat { using Type = T; static constexpr uint8_t rows = R; static constexpr uint8_t cols = C; + static constexpr MathBackend backend = B; - template ... IT> + template ... IT> requires ArgsNumEqual static Mat FromRowVecs(IT&&... inVectors); - template ... IT> + template ... IT> requires ArgsNumEqual static Mat FromColVecs(IT&&... inVectors); Mat(); Mat(T inValue); // NOLINT - Mat(const Mat& other); - Mat(Mat&& other) noexcept; - Mat& operator=(const Mat& other); + Mat(const Mat& other) = default; + Mat(Mat&& other) noexcept = default; + Mat& operator=(const Mat& other) = default; template requires ArgsNumGreater<1, IT...> @@ -86,10 +88,10 @@ namespace Common { Mat& operator-=(const Mat& rhs); template - Mat operator*(const Mat& rhs) const; + Mat operator*(const Mat& rhs) const; - Vec Row(uint8_t index) const; - Vec Col(uint8_t index) const; + Vec Row(uint8_t index) const; + Vec Col(uint8_t index) const; template void SetValues(IT&&... inValues); @@ -107,34 +109,34 @@ namespace Common { void SetCol(uint8_t index, IT&&... 
inValues); template - Mat CastTo() const; + Mat CastTo() const; - Mat Transpose() const; + Mat Transpose() const; template requires ValidSubMatDims - Mat SubMatrix() const; + Mat SubMatrix() const; bool CanInverse() const; Mat Inverse() const; T Determinant() const; - Vec ExtractTranslation() const; - Vec ExtractScale() const; - Quaternion ExtractRotation() const; + Vec ExtractTranslation() const; + Vec ExtractScale() const; + Quaternion ExtractRotation() const; }; - template + template requires ValidMatDims struct MatConsts { - static const Mat zero; + static const Mat zero; }; - template + template requires ValidVecDim - struct MatConsts { - static const Mat zero; - static const Mat identity; + struct MatConsts { + static const Mat zero; + static const Mat identity; }; using BMat1x1 = Mat; @@ -309,13 +311,13 @@ namespace Common { } namespace Common { // NOLINT - template - struct Serializer> { + template + struct Serializer> { static constexpr size_t typeId = HashUtils::StrCrc32("Common::Matrix") + Serializer::typeId + (R << 8) + C; - static size_t Serialize(BinarySerializeStream& stream, const Mat& value) + static size_t Serialize(BinarySerializeStream& stream, const Mat& value) { auto serialized = 0; for (auto i = 0; i < R * C; i++) { @@ -324,7 +326,7 @@ namespace Common { // NOLINT return serialized; } - static size_t Deserialize(BinaryDeserializeStream& stream, Mat& value) + static size_t Deserialize(BinaryDeserializeStream& stream, Mat& value) { auto deserialized = 0; for (auto i = 0; i < R * C; i++) { @@ -334,9 +336,9 @@ namespace Common { // NOLINT } }; - template - struct StringConverter> { - static std::string ToString(const Mat& inValue) + template + struct StringConverter> { + static std::string ToString(const Mat& inValue) { std::stringstream stream; stream << "("; @@ -353,9 +355,9 @@ namespace Common { // NOLINT } }; - template - struct JsonSerializer> { - static void JsonSerialize(rapidjson::Value& outJsonValue, rapidjson::Document::AllocatorType& 
inAllocator, const Mat& inValue) + template + struct JsonSerializer> { + static void JsonSerialize(rapidjson::Value& outJsonValue, rapidjson::Document::AllocatorType& inAllocator, const Mat& inValue) { outJsonValue.SetArray(); outJsonValue.Reserve(R * C, inAllocator); @@ -368,7 +370,7 @@ namespace Common { // NOLINT } } - static void JsonDeserialize(const rapidjson::Value& inJsonValue, Mat& outValue) + static void JsonDeserialize(const rapidjson::Value& inJsonValue, Mat& outValue) { if (!inJsonValue.IsArray() || inJsonValue.Size() != R * C) { return; @@ -386,8 +388,8 @@ namespace Common { // NOLINT } namespace Common::Internal { - template - static void CopyValuesToMatrix(Mat& matrix, VT&&... inValue, std::index_sequence) + template + static void CopyValuesToMatrix(Mat& matrix, VT&&... inValue, std::index_sequence) { static_assert(R * C == sizeof...(VT) && sizeof...(VT) == sizeof...(VI)); (void) std::initializer_list { ([&]() -> void { @@ -395,8 +397,8 @@ namespace Common::Internal { }(), 0)... }; } - template - static void CopyValuesToMatrixRow(Mat& matrix, uint8_t index, VT&&... inValue, std::index_sequence) + template + static void CopyValuesToMatrixRow(Mat& matrix, uint8_t index, VT&&... inValue, std::index_sequence) { static_assert(C == sizeof...(VT) && sizeof...(VT) == sizeof...(VI)); (void) std::initializer_list { ([&]() -> void { @@ -404,8 +406,8 @@ namespace Common::Internal { }(), 0)... }; } - template - static void CopyValuesToMatrixCol(Mat& matrix, uint8_t index, VT&&... inValue, std::index_sequence) + template + static void CopyValuesToMatrixCol(Mat& matrix, uint8_t index, VT&&... inValue, std::index_sequence) { static_assert(R == sizeof...(VT) && sizeof...(VT) == sizeof...(VI)); (void) std::initializer_list { ([&]() -> void { @@ -413,8 +415,8 @@ namespace Common::Internal { }(), 0)... 
}; } - template - static void CopyVectorToMatrixRow(Mat& matrix, uint8_t index, const Vec& inVector, std::index_sequence) + template + static void CopyVectorToMatrixRow(Mat& matrix, uint8_t index, const Vec& inVector, std::index_sequence) { static_assert(C == sizeof...(VI)); (void) std::initializer_list { ([&]() -> void { @@ -422,8 +424,8 @@ namespace Common::Internal { }(), 0)... }; } - template - static void CopyVectorToMatrixCol(Mat& matrix, uint8_t index, const Vec& inVector, std::index_sequence) + template + static void CopyVectorToMatrixCol(Mat& matrix, uint8_t index, const Vec& inVector, std::index_sequence) { static_assert(R == sizeof...(VI)); (void) std::initializer_list { ([&]() -> void { @@ -431,28 +433,28 @@ namespace Common::Internal { }(), 0)... }; } - template - static void CopyRowVectorsToMatrix(Mat& matrix, VT&&... inVectors, std::index_sequence) + template + static void CopyRowVectorsToMatrix(Mat& matrix, VT&&... inVectors, std::index_sequence) { static_assert(R == sizeof...(VT) && sizeof...(VT) == sizeof...(VI)); (void) std::initializer_list { ([&]() -> void { - static_assert(std::is_same_v>); + static_assert(std::is_same_v>); CopyVectorToMatrixRow(matrix, VI, inVectors, std::make_index_sequence {}); }(), 0)... }; } - template - static void CopyColVectorsToMatrix(Mat& matrix, VT&&... inVectors, std::index_sequence) + template + static void CopyColVectorsToMatrix(Mat& matrix, VT&&... inVectors, std::index_sequence) { static_assert(C == sizeof...(VT) && sizeof...(VT) == sizeof...(VI)); (void) std::initializer_list { ([&]() -> void { - static_assert(std::is_same_v>); + static_assert(std::is_same_v>); CopyVectorToMatrixCol(matrix, VI, inVectors, std::make_index_sequence {}); }(), 0)... 
}; } - template - static void SetMatrixToIdentity(Mat& matrix, std::index_sequence) + template + static void SetMatrixToIdentity(Mat& matrix, std::index_sequence) { static_assert(L == sizeof...(VI)); (void) std::initializer_list { ([&]() -> void { @@ -460,81 +462,461 @@ namespace Common::Internal { }(), 0)... }; } - template - static Mat GetIdentityMatrix() + template + static Mat GetIdentityMatrix() { - Mat result; + Mat result; SetMatrixToIdentity(result, std::make_index_sequence {}); return result; } + } -namespace Common { - template - template ... IT> - requires ArgsNumEqual - Mat Mat::FromRowVecs(IT&&... inVectors) +namespace Common::Internal { + // Single source of truth for the scalar matrix kernels. The primary MatOps and the SIMD specializations both route + // their non-accelerated operations here, so the generic implementation lives exactly once and Mat's members stay + // pure delegations to MatOps. + template + Mat MatTransposeScalar(const Mat& m) { - Mat result; - result.SetRows(std::forward(inVectors)...); + Mat result; + for (auto i = 0; i < R; i++) { + for (auto j = 0; j < C; j++) { + result.At(j, i) = m.At(i, j); + } + } return result; } - template - template ... IT> - requires ArgsNumEqual - Mat Mat::FromColVecs(IT&&... 
inVectors) + template + Vec MatMulVecScalar(const Mat& mat, const Vec& vec) { - Mat result; - result.SetCols(std::forward(inVectors)...); + Vec result; + for (auto i = 0; i < R; i++) { + result[i] = mat.Row(i).Dot(vec); + } return result; } - template - Mat::Mat() + template + T MatDeterminantScalar(const Mat& m) { - for (auto i = 0; i < R * C; i++) { - this->data[i] = 0; + static_assert(R == C && R > 1 && R < 5); + T result = static_cast(0); + if constexpr (R == 2) { + result = m.data[0] * m.data[3] - m.data[1] * m.data[2]; } + + if constexpr (R == 3) { + result = + m.data[0] * (m.data[4] * m.data[8] - m.data[5] * m.data[7]) + - m.data[1] * (m.data[3] * m.data[8] - m.data[5] * m.data[6]) + + m.data[2] * (m.data[3] * m.data[7] - m.data[4] * m.data[6]); + } + + if constexpr (R == 4) { + T subFactor0 = m.data[10] * m.data[15] - m.data[11] * m.data[14]; + T subFactor1 = m.data[6] * m.data[15] - m.data[7] * m.data[14]; + T subFactor2 = m.data[6] * m.data[11] - m.data[7] * m.data[10]; + T subFactor3 = m.data[2] * m.data[15] - m.data[3] * m.data[14]; + T subFactor4 = m.data[2] * m.data[11] - m.data[3] * m.data[10]; + T subFactor5 = m.data[2] * m.data[7] - m.data[3] * m.data[6]; + + T detCoef0 = m.data[5] * subFactor0 - m.data[9] * subFactor1 + m.data[13] * subFactor2; + T detCoef1 = m.data[9] * subFactor3 - m.data[1] * subFactor0 - m.data[13] * subFactor4; + T detCoef2 = m.data[1] * subFactor1 - m.data[5] * subFactor3 + m.data[13] * subFactor5; + T detCoef3 = m.data[5] * subFactor4 - m.data[1] * subFactor2 - m.data[9] * subFactor5; + + result = m.data[0] * detCoef0 + m.data[4] * detCoef1 + m.data[8] * detCoef2 + m.data[12] * detCoef3; + } + + return result; } - template - Mat::Mat(T inValue) + template + Mat MatInverseScalar(const Mat& m) { - for (auto i = 0; i < R * C; i++) { - this->data[i] = inValue; + static_assert(R == C && R > 1 && R < 5); + T oneOverDet = static_cast(1) / MatDeterminantScalar(m); + + Mat result; + if constexpr (R == 2) { + result.At(0, 0) = 
m.data[3] * oneOverDet; + result.At(0, 1) = -m.data[1] * oneOverDet; + result.At(1, 0) = -m.data[2] * oneOverDet; + result.At(1, 1) = m.data[0] * oneOverDet; + } + + if constexpr (R == 3) { + result.At(0, 0) = (m.data[4] * m.data[8] - m.data[5] * m.data[7]) * oneOverDet; + result.At(0, 1) = (m.data[2] * m.data[7] - m.data[1] * m.data[8]) * oneOverDet; + result.At(0, 2) = (m.data[1] * m.data[5] - m.data[2] * m.data[4]) * oneOverDet; + result.At(1, 0) = (m.data[5] * m.data[6] - m.data[3] * m.data[8]) * oneOverDet; + result.At(1, 1) = (m.data[0] * m.data[8] - m.data[2] * m.data[6]) * oneOverDet; + result.At(1, 2) = (m.data[2] * m.data[3] - m.data[0] * m.data[5]) * oneOverDet; + result.At(2, 0) = (m.data[3] * m.data[7] - m.data[4] * m.data[6]) * oneOverDet; + result.At(2, 1) = (m.data[1] * m.data[6] - m.data[7] * m.data[0]) * oneOverDet; + result.At(2, 2) = (m.data[0] * m.data[4] - m.data[1] * m.data[3]) * oneOverDet; } + + if constexpr (R == 4) { + T coef00 = m.data[10] * m.data[15] - m.data[11] * m.data[14]; + T coef02 = m.data[9] * m.data[15] - m.data[11] * m.data[13]; + T coef03 = m.data[9] * m.data[14] - m.data[10] * m.data[13]; + + T coef04 = m.data[6] * m.data[15] - m.data[7] * m.data[14]; + T coef06 = m.data[5] * m.data[15] - m.data[7] * m.data[13]; + T coef07 = m.data[5] * m.data[14] - m.data[6] * m.data[13]; + + T coef08 = m.data[6] * m.data[11] - m.data[7] * m.data[10]; + T coef10 = m.data[5] * m.data[11] - m.data[7] * m.data[9]; + T coef11 = m.data[5] * m.data[10] - m.data[6] * m.data[9]; + + T coef12 = m.data[2] * m.data[15] - m.data[3] * m.data[14]; + T coef14 = m.data[1] * m.data[15] - m.data[3] * m.data[13]; + T coef15 = m.data[1] * m.data[14] - m.data[2] * m.data[13]; + + T coef16 = m.data[2] * m.data[11] - m.data[3] * m.data[10]; + T coef18 = m.data[1] * m.data[11] - m.data[3] * m.data[9]; + T coef19 = m.data[1] * m.data[10] - m.data[2] * m.data[9]; + + T coef20 = m.data[2] * m.data[7] - m.data[3] * m.data[6]; + T coef22 = m.data[1] * m.data[7] - 
m.data[3] * m.data[5]; + T coef23 = m.data[1] * m.data[6] - m.data[2] * m.data[5]; + + Vec fac0(coef00, coef00, coef02, coef03); + Vec fac1(coef04, coef04, coef06, coef07); + Vec fac2(coef08, coef08, coef10, coef11); + Vec fac3(coef12, coef12, coef14, coef15); + Vec fac4(coef16, coef16, coef18, coef19); + Vec fac5(coef20, coef20, coef22, coef23); + + Vec vec0(m.data[1], m.data[0], m.data[0], m.data[0]); + Vec vec1(m.data[5], m.data[4], m.data[4], m.data[4]); + Vec vec2(m.data[9], m.data[8], m.data[8], m.data[8]); + Vec vec3(m.data[13], m.data[12], m.data[12], m.data[12]); + + Vec inv0(vec1 * fac0 - vec2 * fac1 + vec3 * fac2); + Vec inv1(vec0 * fac0 - vec2 * fac3 + vec3 * fac4); + Vec inv2(vec0 * fac1 - vec1 * fac3 + vec3 * fac5); + Vec inv3(vec0 * fac2 - vec1 * fac4 + vec2 * fac5); + + Vec signA(+1, -1, +1, -1); + Vec signB(-1, +1, -1, +1); + + Vec col0 = inv0 * signA; + Vec col1 = inv1 * signB; + Vec col2 = inv2 * signA; + Vec col3 = inv3 * signB; + + result.SetCol(0, col0); + result.SetCol(1, col1); + result.SetCol(2, col2); + result.SetCol(3, col3); + + result = result * oneOverDet; + } + + return result; } - template - Mat::Mat(const Mat& other) - { - for (auto i = 0; i < R * C; i++) { - this->data[i] = other.data[i]; + // Per-backend dispatch for matrix operations. The primary template forwards every operation to the scalar kernels + // above; the SIMD specializations override the ones that benefit and forward the rest, so each Mat member is a + // single MatOps call regardless of backend. 
+ template + struct MatOps { + static Mat Add(const Mat& a, const Mat& b) + { + Mat result; + for (auto i = 0; i < R * C; i++) { result.data[i] = a.data[i] + b.data[i]; } + return result; + } + + static Mat Sub(const Mat& a, const Mat& b) + { + Mat result; + for (auto i = 0; i < R * C; i++) { result.data[i] = a.data[i] - b.data[i]; } + return result; + } + + static Mat AddScalar(const Mat& a, T b) + { + Mat result; + for (auto i = 0; i < R * C; i++) { result.data[i] = a.data[i] + b; } + return result; + } + + static Mat SubScalar(const Mat& a, T b) + { + Mat result; + for (auto i = 0; i < R * C; i++) { result.data[i] = a.data[i] - b; } + return result; + } + + static Mat MulScalar(const Mat& a, T b) + { + Mat result; + for (auto i = 0; i < R * C; i++) { result.data[i] = a.data[i] * b; } + return result; + } + + static Mat DivScalar(const Mat& a, T b) + { + Mat result; + for (auto i = 0; i < R * C; i++) { result.data[i] = a.data[i] / b; } + return result; + } + + static Mat Transpose(const Mat& m) { return MatTransposeScalar(m); } + static Vec MulVec(const Mat& m, const Vec& v) { return MatMulVecScalar(m, v); } + static T Determinant(const Mat& m) { return MatDeterminantScalar(m); } + static Mat Inverse(const Mat& m) { return MatInverseScalar(m); } + }; + + // Row-major 4x4 float matrix, backed by float[16] (four contiguous rows of 16 bytes each), so each row maps to an + // unaligned 128-bit load/store. Besides the element-wise ops shared with the primary template, this also offers a + // SIMD matrix product (Mul) that Mat::operator* dispatches to for the 4x4 float case. 
+ template <> + struct MatOps { + using M = Mat; + using V = Vec; + + static M Add(const M& a, const M& b) { M r; Simd::MapBinary<16>(r.data, a.data, b.data, Simd::AddOp {}); return r; } + static M Sub(const M& a, const M& b) { M r; Simd::MapBinary<16>(r.data, a.data, b.data, Simd::SubOp {}); return r; } + + static M AddScalar(const M& a, float b) { M r; Simd::MapScalar<16>(r.data, a.data, b, Simd::AddOp {}); return r; } + static M SubScalar(const M& a, float b) { M r; Simd::MapScalar<16>(r.data, a.data, b, Simd::SubOp {}); return r; } + static M MulScalar(const M& a, float b) { M r; Simd::MapScalar<16>(r.data, a.data, b, Simd::MulOp {}); return r; } + static M DivScalar(const M& a, float b) { M r; Simd::MapScalar<16>(r.data, a.data, b, Simd::DivOp {}); return r; } + + // C = A * B, row-major. Each output row is a linear combination of B's rows weighted by one row of A: + // C_row_i = A[i][0]*B_row0 + A[i][1]*B_row1 + A[i][2]*B_row2 + A[i][3]*B_row3. + static M Mul(const M& a, const M& b) + { + const Simd::F32x4 bRow0 = Simd::LoadU(&b.data[0]); + const Simd::F32x4 bRow1 = Simd::LoadU(&b.data[4]); + const Simd::F32x4 bRow2 = Simd::LoadU(&b.data[8]); + const Simd::F32x4 bRow3 = Simd::LoadU(&b.data[12]); + + M result; + for (auto i = 0; i < 4; i++) { + const Simd::F32x4 row = Simd::Add( + Simd::Add( + Simd::Mul(Simd::Set1(a.data[i * 4 + 0]), bRow0), + Simd::Mul(Simd::Set1(a.data[i * 4 + 1]), bRow1)), + Simd::Add( + Simd::Mul(Simd::Set1(a.data[i * 4 + 2]), bRow2), + Simd::Mul(Simd::Set1(a.data[i * 4 + 3]), bRow3))); + Simd::StoreU(&result.data[i * 4], row); + } + return result; } + + // result = M * v, with v a column vector: result[i] = dot(row_i, v). Loading v once and reducing each row with + // the SIMD horizontal sum avoids the temporary row Vec the scalar path builds. The reduction order matches + // VecOps::Dot, so the result agrees with the scalar backend within float tolerance. 
+ static V MulVec(const M& m, const V& v) + { + const Simd::F32x4 vv = Simd::LoadU(v.data); + V result; + result.data[0] = Simd::Sum(Simd::Mul(Simd::LoadU(&m.data[0]), vv)); + result.data[1] = Simd::Sum(Simd::Mul(Simd::LoadU(&m.data[4]), vv)); + result.data[2] = Simd::Sum(Simd::Mul(Simd::LoadU(&m.data[8]), vv)); + result.data[3] = Simd::Sum(Simd::Mul(Simd::LoadU(&m.data[12]), vv)); + return result; + } + + static M Transpose(const M& m) + { + Simd::F32x4 r0 = Simd::LoadU(&m.data[0]); + Simd::F32x4 r1 = Simd::LoadU(&m.data[4]); + Simd::F32x4 r2 = Simd::LoadU(&m.data[8]); + Simd::F32x4 r3 = Simd::LoadU(&m.data[12]); + Simd::Transpose4(r0, r1, r2, r3); + + M result; + Simd::StoreU(&result.data[0], r0); + Simd::StoreU(&result.data[4], r1); + Simd::StoreU(&result.data[8], r2); + Simd::StoreU(&result.data[12], r3); + return result; + } + + // Cofactor-expansion inverse, mirroring the scalar Mat4 path but building every 2x2 cofactor vector (fac) and + // the row-broadcast vectors (vec) with single-row shuffles instead of scalar gathers. fac for the row pair + // (p, q) is (p2*q3 - p3*q2, same, p1*q3 - p3*q1, p1*q2 - p2*q1). The cofactor columns are transposed into rows, + // and the determinant is recovered as row0 . (first row of the cofactor matrix), so there is no second + // Determinant pass. 
+        // Returns the inverse of the row-major 4x4 matrix m, computed as adjugate / determinant in F32x4 registers.
+        // There is no singularity guard: when det == 0 the reciprocal 1.0f / det is non-finite and that value is
+        // multiplied into every element of the result.
+        static M Inverse(const M& m)
+        {
+            // One register per matrix row.
+            const Simd::F32x4 r0 = Simd::LoadU(&m.data[0]);
+            const Simd::F32x4 r1 = Simd::LoadU(&m.data[4]);
+            const Simd::F32x4 r2 = Simd::LoadU(&m.data[8]);
+            const Simd::F32x4 r3 = Simd::LoadU(&m.data[12]);
+
+            // Builds one vector of 2x2 minors from the rows p and q.
+            // NOTE(review): assumes Shuffle<a, b, c, d>(v) yields (v[a], v[b], v[c], v[d]) - confirm against the Simd
+            // wrapper. Under that convention the lanes are
+            // (p2*q3 - p3*q2, p2*q3 - p3*q2, p1*q3 - p3*q1, p1*q2 - p2*q1).
+            const auto makeFac = [](const Simd::F32x4 p, const Simd::F32x4 q) {
+                return Simd::Sub(
+                    Simd::Mul(Simd::Shuffle<2, 2, 1, 1>(p), Simd::Shuffle<3, 3, 3, 2>(q)),
+                    Simd::Mul(Simd::Shuffle<3, 3, 3, 2>(p), Simd::Shuffle<2, 2, 1, 1>(q)));
+            };
+
+            // One minor vector per row pair: (2,3), (1,3), (1,2), (0,3), (0,2), (0,1).
+            const Simd::F32x4 fac0 = makeFac(r2, r3);
+            const Simd::F32x4 fac1 = makeFac(r1, r3);
+            const Simd::F32x4 fac2 = makeFac(r1, r2);
+            const Simd::F32x4 fac3 = makeFac(r0, r3);
+            const Simd::F32x4 fac4 = makeFac(r0, r2);
+            const Simd::F32x4 fac5 = makeFac(r0, r1);
+
+            // Per-row broadcast (r[1], r[0], r[0], r[0]), under the same Shuffle convention as above.
+            const Simd::F32x4 vec0 = Simd::Shuffle<1, 0, 0, 0>(r0);
+            const Simd::F32x4 vec1 = Simd::Shuffle<1, 0, 0, 0>(r1);
+            const Simd::F32x4 vec2 = Simd::Shuffle<1, 0, 0, 0>(r2);
+            const Simd::F32x4 vec3 = Simd::Shuffle<1, 0, 0, 0>(r3);
+
+            // Unsigned adjugate columns: each one is vecA * facX - vecB * facY + vecC * facZ.
+            const Simd::F32x4 inv0 = Simd::Add(Simd::Sub(Simd::Mul(vec1, fac0), Simd::Mul(vec2, fac1)), Simd::Mul(vec3, fac2));
+            const Simd::F32x4 inv1 = Simd::Add(Simd::Sub(Simd::Mul(vec0, fac0), Simd::Mul(vec2, fac3)), Simd::Mul(vec3, fac4));
+            const Simd::F32x4 inv2 = Simd::Add(Simd::Sub(Simd::Mul(vec0, fac1), Simd::Mul(vec1, fac3)), Simd::Mul(vec3, fac5));
+            const Simd::F32x4 inv3 = Simd::Add(Simd::Sub(Simd::Mul(vec0, fac2), Simd::Mul(vec1, fac4)), Simd::Mul(vec2, fac5));
+
+            // Checkerboard cofactor signs; the starting sign alternates from one column to the next.
+            const Simd::F32x4 signA = Simd::Set(1.0f, -1.0f, 1.0f, -1.0f);
+            const Simd::F32x4 signB = Simd::Set(-1.0f, 1.0f, -1.0f, 1.0f);
+
+            Simd::F32x4 col0 = Simd::Mul(inv0, signA);
+            Simd::F32x4 col1 = Simd::Mul(inv1, signB);
+            Simd::F32x4 col2 = Simd::Mul(inv2, signA);
+            Simd::F32x4 col3 = Simd::Mul(inv3, signB);
+
+            // det = row0 . col0. At this point col0 is column 0 of the adjugate (equivalently row 0 of the cofactor
+            // matrix), so the dot product is entry (0, 0) of M * adj(M), i.e. the Laplace expansion along row 0.
+            // Compute it before Transpose4 rearranges the registers from adjugate columns into adjugate rows.
+            const float det = Simd::Sum(Simd::Mul(r0, col0));
+            const Simd::F32x4 oneOverDet = Simd::Set1(1.0f / det);
+
+            // Turn the four adjugate columns into rows so they can be stored row-major below.
+            Simd::Transpose4(col0, col1, col2, col3);
+
+            M result;
+            Simd::StoreU(&result.data[0], Simd::Mul(col0, oneOverDet));
+            Simd::StoreU(&result.data[4], Simd::Mul(col1, oneOverDet));
+            Simd::StoreU(&result.data[8], Simd::Mul(col2, oneOverDet));
+            Simd::StoreU(&result.data[12], Simd::Mul(col3, oneOverDet));
+            return result;
+        }
+
+        // No SIMD determinant here: forwards to MatDeterminantScalar, which is defined outside this view.
+        static float Determinant(const M& m) { return MatDeterminantScalar(m); }
+    };
+
+    // Row-major 3x3 float matrix, backed by a tight float[9] (no padding, so the layout stays GPU/serialization
+    // friendly). The first eight elements are covered by two safe 128-bit loads (data[0..3], data[4..7]) with data[8]
+    // handled by a scalar tail; matrix-product rows and the transpose use Load3/Store3 to avoid over-running the
+    // float[9] on the last row. Rows written with Store3 drop the 4th lane, so the garbage it may carry is harmless
+    // there.
+    // NOTE(review): MulVec does not store its row products; it reduces all four lanes with Simd::Sum and relies on a
+    // zero 4th lane in the vector operand to cancel the garbage. Since 0 * Inf and 0 * NaN are NaN, that only holds
+    // when the neighbouring matrix element is finite - confirm whether non-finite elements must be supported.
+    template <>
+    struct MatOps {
+        using M = Mat;
+        using V = Vec;
+
+        // MapBinary/MapScalar<9> cover data[0..7] with two safe 128-bit loads (the second, at index 4, reads data[4..7])
+        // and finish data[8] in the scalar tail, so the float[9] is never over-run.
+        // Element-wise matrix/matrix arithmetic over all nine floats.
+        static M Add(const M& a, const M& b)
+        {
+            M out;
+            Simd::MapBinary<9>(out.data, a.data, b.data, Simd::AddOp {});
+            return out;
+        }
+
+        static M Sub(const M& a, const M& b)
+        {
+            M out;
+            Simd::MapBinary<9>(out.data, a.data, b.data, Simd::SubOp {});
+            return out;
+        }
+
+        // Element-wise matrix/scalar arithmetic: the scalar b is applied to each of the nine floats of a.
+        static M AddScalar(const M& a, float b)
+        {
+            M out;
+            Simd::MapScalar<9>(out.data, a.data, b, Simd::AddOp {});
+            return out;
+        }
+
+        static M SubScalar(const M& a, float b)
+        {
+            M out;
+            Simd::MapScalar<9>(out.data, a.data, b, Simd::SubOp {});
+            return out;
+        }
+
+        static M MulScalar(const M& a, float b)
+        {
+            M out;
+            Simd::MapScalar<9>(out.data, a.data, b, Simd::MulOp {});
+            return out;
+        }
+
+        static M DivScalar(const M& a, float b)
+        {
+            M out;
+            Simd::MapScalar<9>(out.data, a.data, b, Simd::DivOp {});
+            return out;
+        }
+
+        // Matrix product built row by row: row i of the result is A[i][0]*B_row0 + A[i][1]*B_row1 + A[i][2]*B_row2,
+        // accumulated in that order. The first two rows of B are read with full 4-wide loads, whose extra lane is the
+        // first element of the following row and never reaches the output because each result row is written with
+        // Store3. The last row of B is read with Load3 so the load stays inside the float[9].
+        static M Mul(const M& a, const M& b)
+        {
+            const Simd::F32x4 rhsTop = Simd::LoadU(&b.data[0]);
+            const Simd::F32x4 rhsMid = Simd::LoadU(&b.data[3]);
+            const Simd::F32x4 rhsBottom = Simd::Load3(&b.data[6]);
+
+            M out;
+            for (int rowIndex = 0; rowIndex < 3; ++rowIndex) {
+                const int base = rowIndex * 3;
+                Simd::F32x4 acc = Simd::Mul(Simd::Set1(a.data[base + 0]), rhsTop);
+                acc = Simd::Add(acc, Simd::Mul(Simd::Set1(a.data[base + 1]), rhsMid));
+                acc = Simd::Add(acc, Simd::Mul(Simd::Set1(a.data[base + 2]), rhsBottom));
+                Simd::Store3(&out.data[base], acc);
+            }
+            return out;
+        }
+
+        // result[i] = dot(row_i, v). v is loaded with Load3 so its 4th lane is 0, which zeroes the unused 4th lane the
+        // full row loads carry, leaving the 4-wide horizontal sum equal to the 3-component dot.
+        // NOTE(review): that cancellation assumes the 4th lane of each full row load (the first element of the next
+        // row) is finite; 0 * Inf and 0 * NaN are NaN, which would leak into the sum - confirm whether matrices with
+        // non-finite elements need the exact 3-component result.
+ static V MulVec(const M& m, const V& v) + { + const Simd::F32x4 vv = Simd::Load3(v.data); + V result; + result.data[0] = Simd::Sum(Simd::Mul(Simd::LoadU(&m.data[0]), vv)); + result.data[1] = Simd::Sum(Simd::Mul(Simd::LoadU(&m.data[3]), vv)); + result.data[2] = Simd::Sum(Simd::Mul(Simd::Load3(&m.data[6]), vv)); + return result; + } + + // 3x3 transpose via the 4x4 primitive with a zero 4th row: the garbage in the loaded rows' 4th lanes only lands + // in the discarded 4th output row, so the three Store3'd rows are the exact transpose. + static M Transpose(const M& m) + { + Simd::F32x4 r0 = Simd::LoadU(&m.data[0]); + Simd::F32x4 r1 = Simd::LoadU(&m.data[3]); + Simd::F32x4 r2 = Simd::Load3(&m.data[6]); + Simd::F32x4 r3 = Simd::Set1(0.0f); + Simd::Transpose4(r0, r1, r2, r3); + + M result; + Simd::Store3(&result.data[0], r0); + Simd::Store3(&result.data[3], r1); + Simd::Store3(&result.data[6], r2); + return result; + } + + static M Inverse(const M& m) { return MatInverseScalar(m); } + static float Determinant(const M& m) { return MatDeterminantScalar(m); } + }; +} + +namespace Common { + template + template ... IT> + requires ArgsNumEqual + Mat Mat::FromRowVecs(IT&&... inVectors) + { + Mat result; + result.SetRows(std::forward(inVectors)...); + return result; } - template - Mat::Mat(Mat&& other) noexcept + template + template ... IT> + requires ArgsNumEqual + Mat Mat::FromColVecs(IT&&... inVectors) + { + Mat result; + result.SetCols(std::forward(inVectors)...); + return result; + } + + template + Mat::Mat() { for (auto i = 0; i < R * C; i++) { - this->data[i] = std::move(other.data[i]); + this->data[i] = 0; } } - template - Mat& Mat::operator=(const Mat& other) + template + Mat::Mat(T inValue) { for (auto i = 0; i < R * C; i++) { - this->data[i] = other.data[i]; + this->data[i] = inValue; } - return *this; } - template + template template requires ArgsNumGreater<1, IT...> - Mat::Mat(IT&&... inValues) + Mat::Mat(IT&&... 
inValues) { static_assert(sizeof...(IT) == R || sizeof...(IT) == R * C); if constexpr (sizeof...(IT) == R * C) { @@ -544,36 +926,36 @@ namespace Common { } } - template - T& Mat::At(uint8_t row, uint8_t col) + template + T& Mat::At(uint8_t row, uint8_t col) { Assert(row < R && col < C); return this->data[row * C + col]; } - template - const T& Mat::At(uint8_t row, uint8_t col) const + template + const T& Mat::At(uint8_t row, uint8_t col) const { Assert(row < R && col < C); return this->data[row * C + col]; } - template - T& Mat::operator[](uint32_t index) + template + T& Mat::operator[](uint32_t index) { Assert(index < R * C); return this->data[index]; } - template - const T& Mat::operator[](uint32_t index) const + template + const T& Mat::operator[](uint32_t index) const { Assert(index < R * C); return this->data[index]; } - template - bool Mat::operator==(T rhs) const + template + bool Mat::operator==(T rhs) const { bool result = true; for (auto i = 0; i < R * C; i++) { @@ -582,8 +964,8 @@ namespace Common { return result; } - template - bool Mat::operator==(const Mat& rhs) const + template + bool Mat::operator==(const Mat& rhs) const { bool result = true; for (auto i = 0; i < R * C; i++) { @@ -592,250 +974,229 @@ namespace Common { return result; } - template - bool Mat::operator!=(T rhs) const + template + bool Mat::operator!=(T rhs) const { return !this->operator==(rhs); } - template - bool Mat::operator!=(const Mat& rhs) const + template + bool Mat::operator!=(const Mat& rhs) const { return !this->operator==(rhs); } - template - Mat Mat::operator+(T rhs) const + template + Mat Mat::operator+(T rhs) const { - Mat result; - for (auto i = 0; i < R * C; i++) { - result.data[i] = this->data[i] + rhs; - } - return result; + return Internal::MatOps::AddScalar(*this, rhs); } - template - Mat Mat::operator-(T rhs) const + template + Mat Mat::operator-(T rhs) const { - Mat result; - for (auto i = 0; i < R * C; i++) { - result.data[i] = this->data[i] - rhs; - } - return 
result; + return Internal::MatOps::SubScalar(*this, rhs); } - template - Mat Mat::operator*(T rhs) const + template + Mat Mat::operator*(T rhs) const { - Mat result; - for (auto i = 0; i < R * C; i++) { - result.data[i] = this->data[i] * rhs; - } - return result; + return Internal::MatOps::MulScalar(*this, rhs); } - template - Mat Mat::operator/(T rhs) const + template + Mat Mat::operator/(T rhs) const { - Mat result; - for (auto i = 0; i < R * C; i++) { - result.data[i] = this->data[i] / rhs; - } - return result; + return Internal::MatOps::DivScalar(*this, rhs); } - template - Mat Mat::operator+(const Mat& rhs) const + template + Mat Mat::operator+(const Mat& rhs) const { - Mat result; - for (auto i = 0; i < R * C; i++) { - result.data[i] = this->data[i] + rhs.data[i]; - } - return result; + return Internal::MatOps::Add(*this, rhs); } - template - Mat Mat::operator-(const Mat& rhs) const + template + Mat Mat::operator-(const Mat& rhs) const { - Mat result; - for (auto i = 0; i < R * C; i++) { - result.data[i] = this->data[i] - rhs.data[i]; - } - return result; + return Internal::MatOps::Sub(*this, rhs); } - template - Mat& Mat::operator+=(T rhs) + template + Mat& Mat::operator+=(T rhs) { - for (auto i = 0; i < R * C; i++) { - this->data[i] += rhs; - } + *this = Internal::MatOps::AddScalar(*this, rhs); return *this; } - template - Mat& Mat::operator-=(T rhs) + template + Mat& Mat::operator-=(T rhs) { - for (auto i = 0; i < R * C; i++) { - this->data[i] -= rhs; - } + *this = Internal::MatOps::SubScalar(*this, rhs); return *this; } - template - Mat& Mat::operator*=(T rhs) + template + Mat& Mat::operator*=(T rhs) { - for (auto i = 0; i < R * C; i++) { - this->data[i] *= rhs; - } + *this = Internal::MatOps::MulScalar(*this, rhs); return *this; } - template - Mat& Mat::operator/=(T rhs) + template + Mat& Mat::operator/=(T rhs) { - for (auto i = 0; i < R * C; i++) { - this->data[i] /= rhs; - } + *this = Internal::MatOps::DivScalar(*this, rhs); return *this; } - template 
- Mat& Mat::operator+=(const Mat& rhs) + template + Mat& Mat::operator+=(const Mat& rhs) { - for (auto i = 0; i < R * C; i++) { - this->data[i] += rhs.data[i]; - } + *this = Internal::MatOps::Add(*this, rhs); return *this; } - template - Mat& Mat::operator-=(const Mat& rhs) + template + Mat& Mat::operator-=(const Mat& rhs) { - for (auto i = 0; i < R * C; i++) { - this->data[i] -= rhs.data[i]; - } + *this = Internal::MatOps::Sub(*this, rhs); return *this; } - template + template template - Mat Mat::operator*(const Mat& rhs) const + Mat Mat::operator*(const Mat& rhs) const { - Mat result; - for (auto i = 0; i < R; i++) { - for (auto j = 0; j < IC; j++) { - result.At(i, j) = this->Row(i).Dot(rhs.Col(j)); + if constexpr (B == MathBackend::simd && std::is_same_v && R == 4 && C == 4 && IC == 4) { + return Internal::MatOps::Mul(*this, rhs); + } else if constexpr (B == MathBackend::simd && std::is_same_v && R == 3 && C == 3 && IC == 3) { + return Internal::MatOps::Mul(*this, rhs); + } else { + // Row-linear-combination order (ikj): each result row is sum_k A[i][k] * B_row_k. Unlike the textbook + // Row(i).Dot(Col(j)) form it builds no temporary vectors and does not gather B's columns with a stride. The + // row base pointers are hoisted so the inner j loop is plain contiguous accesses over rhs and result with no + // per-element bounds checks, which lets the compiler auto-vectorize it. The k=0 term seeds the row so no + // separate zeroing pass is needed; accumulation stays in ascending k order, so the numerical result matches + // the scalar dot-product and the SIMD paths. 
+ Mat result; + for (auto i = 0; i < R; i++) { + T* resultRow = &result.data[i * IC]; + const T* aRow = &this->data[i * C]; + for (auto j = 0; j < IC; j++) { + resultRow[j] = aRow[0] * rhs.data[j]; + } + for (auto k = 1; k < C; k++) { + const T aik = aRow[k]; + const T* bRow = &rhs.data[k * IC]; + for (auto j = 0; j < IC; j++) { + resultRow[j] += aik * bRow[j]; + } + } } + return result; } - return result; } - template - Vec Mat::Row(uint8_t index) const + template + Vec Mat::Row(uint8_t index) const { Assert(index < R); - Vec result; + Vec result; for (auto i = 0; i < C; i++) { result[i] = At(index, i); } return result; } - template - Vec Mat::Col(uint8_t index) const + template + Vec Mat::Col(uint8_t index) const { Assert(index < C); - Vec result; + Vec result; for (auto i = 0; i < R; i++) { result[i] = At(i, index); } return result; } - template + template template - void Mat::SetValues(IT&&... inValues) + void Mat::SetValues(IT&&... inValues) { - Internal::CopyValuesToMatrix(*this, std::forward(inValues)..., std::make_index_sequence {}); + Internal::CopyValuesToMatrix(*this, std::forward(inValues)..., std::make_index_sequence {}); } - template + template template - void Mat::SetRows(IT&&... inVectors) + void Mat::SetRows(IT&&... inVectors) { - Internal::CopyRowVectorsToMatrix(*this, std::forward(inVectors)..., std::make_index_sequence {}); + Internal::CopyRowVectorsToMatrix(*this, std::forward(inVectors)..., std::make_index_sequence {}); } - template + template template - void Mat::SetCols(IT&&... inVectors) + void Mat::SetCols(IT&&... inVectors) { - Internal::CopyColVectorsToMatrix(*this, std::forward(inVectors)..., std::make_index_sequence {}); + Internal::CopyColVectorsToMatrix(*this, std::forward(inVectors)..., std::make_index_sequence {}); } - template + template template - void Mat::SetRow(uint8_t index, IT&&... inValues) + void Mat::SetRow(uint8_t index, IT&&... 
inValues) { - if constexpr (sizeof...(IT) == 1 && IsAllSame, IT...>::value) { - Internal::CopyVectorToMatrixRow(*this, index, std::forward(inValues)..., std::make_index_sequence {}); + if constexpr (sizeof...(IT) == 1 && IsAllSame, IT...>::value) { + Internal::CopyVectorToMatrixRow(*this, index, std::forward(inValues)..., std::make_index_sequence {}); } else { - Internal::CopyValuesToMatrixRow(*this, index, std::forward(inValues)..., std::make_index_sequence {}); + Internal::CopyValuesToMatrixRow(*this, index, std::forward(inValues)..., std::make_index_sequence {}); } } - template + template template - void Mat::SetCol(uint8_t index, IT&&... inValues) + void Mat::SetCol(uint8_t index, IT&&... inValues) { - if constexpr (sizeof...(IT) == 1 && IsAllSame, IT...>::value) { - Internal::CopyVectorToMatrixCol(*this, index, std::forward(inValues)..., std::make_index_sequence {}); + if constexpr (sizeof...(IT) == 1 && IsAllSame, IT...>::value) { + Internal::CopyVectorToMatrixCol(*this, index, std::forward(inValues)..., std::make_index_sequence {}); } else { - Internal::CopyValuesToMatrixCol(*this, index, std::forward(inValues)..., std::make_index_sequence {}); + Internal::CopyValuesToMatrixCol(*this, index, std::forward(inValues)..., std::make_index_sequence {}); } } - template + template template - Mat Mat::CastTo() const + Mat Mat::CastTo() const { - Mat result; + Mat result; for (auto i = 0; i < R * C; i++) { result.data[i] = static_cast(this->data[i]); } return result; } - template - Mat Mat::Transpose() const + template + Mat Mat::Transpose() const { - Mat result; - for (auto i = 0; i < R; i++) { - for (auto j = 0; j < C; j++) { - result.At(j, i) = At(i, j); - } - } - return result; + return Internal::MatOps::Transpose(*this); } - template - Vec Mat::ExtractTranslation() const + template + Vec Mat::ExtractTranslation() const { static_assert( R == 4 && C == 4); - Vec ret = Vec(this->data[3], this->data[7], this->data[11]); + Vec ret = Vec(this->data[3], this->data[7], 
this->data[11]); return ret; } - template - Quaternion Mat::ExtractRotation() const + template + Quaternion Mat::ExtractRotation() const { static_assert( R == 4 && C == 4); - Quaternion ret = Quaternion(1, 0, 0, 0); + Quaternion ret = Quaternion(1, 0, 0, 0); - T sx = Vec(this->data[0], this->data[4], this->data[8]).Model(); - T sy = Vec(this->data[1], this->data[5], this->data[9]).Model(); - T sz = Vec(this->data[2], this->data[6], this->data[10]).Model(); + T sx = Vec(this->data[0], this->data[4], this->data[8]).Model(); + T sy = Vec(this->data[1], this->data[5], this->data[9]).Model(); + T sz = Vec(this->data[2], this->data[6], this->data[10]).Model(); T det = this->Determinant(); if (det < 0) { sx = -sx; @@ -877,30 +1238,30 @@ namespace Common { return ret; } - template - Vec Mat::ExtractScale() const + template + Vec Mat::ExtractScale() const { static_assert( R == 4 && C == 4); - T sx = Vec(this->data[0], this->data[4], this->data[8]).Model(); - T sy = Vec(this->data[1], this->data[5], this->data[9]).Model(); - T sz = Vec(this->data[2], this->data[6], this->data[10]).Model(); + T sx = Vec(this->data[0], this->data[4], this->data[8]).Model(); + T sy = Vec(this->data[1], this->data[5], this->data[9]).Model(); + T sz = Vec(this->data[2], this->data[6], this->data[10]).Model(); T det = this->Determinant(); if (det < 0) { sx = -sx; } - Vec ret = Vec(sx, sy, sz); + Vec ret = Vec(sx, sy, sz); return ret; } - template + template template requires ValidSubMatDims - Mat Mat::SubMatrix() const + Mat Mat::SubMatrix() const { - Mat result; + Mat result; for (auto i = 0; i < DR; i++) { for (auto j = 0; j < DC; j++) { result.At(i, j) = At(i, j); @@ -910,152 +1271,38 @@ namespace Common { } - template - bool Mat::CanInverse() const + template + bool Mat::CanInverse() const { return this->Determinant() != static_cast(0); } - template - Mat Mat::Inverse() const + template + Mat Mat::Inverse() const { - static_assert( R == C && R > 1 && R < 5); - T oneOverDet = static_cast(1) / 
this->Determinant(); - - Mat result; - if constexpr (R == 2) { - result.At(0, 0) = this->data[3] * oneOverDet; - result.At(0, 1) = -this->data[1] * oneOverDet; - result.At(1, 0) = -this->data[2] * oneOverDet; - result.At(1, 1) = this->data[0] * oneOverDet; - } - - if constexpr (R == 3) { - result.At(0, 0) = (this->data[4] * this->data[8] - this->data[5] * this->data[7]) * oneOverDet; - result.At(0, 1) = (this->data[2] * this->data[7] - this->data[1] * this->data[8]) * oneOverDet; - result.At(0, 2) = (this->data[1] * this->data[5] - this->data[2] * this->data[4]) * oneOverDet; - result.At(1, 0) = (this->data[5] * this->data[6] - this->data[3] * this->data[8]) * oneOverDet; - result.At(1, 1) = (this->data[0] * this->data[8] - this->data[2] * this->data[6]) * oneOverDet; - result.At(1, 2) = (this->data[2] * this->data[3] - this->data[0] * this->data[5]) * oneOverDet; - result.At(2, 0) = (this->data[3] * this->data[7] - this->data[4] * this->data[6]) * oneOverDet; - result.At(2, 1) = (this->data[1] * this->data[6] - this->data[7] * this->data[0]) * oneOverDet; - result.At(2, 2) = (this->data[0] * this->data[4] - this->data[1] * this->data[3]) * oneOverDet; - } - - if constexpr (R == 4) { - T coef00 = this->data[10] * this->data[15] - this->data[11] * this->data[14]; - T coef02 = this->data[9] * this->data[15] - this->data[11] * this->data[13]; - T coef03 = this->data[9] * this->data[14] - this->data[10] * this->data[13]; - - T coef04 = this->data[6] * this->data[15] - this->data[7] * this->data[14]; - T coef06 = this->data[5] * this->data[15] - this->data[7] * this->data[13]; - T coef07 = this->data[5] * this->data[14] - this->data[6] * this->data[13]; - - T coef08 = this->data[6] * this->data[11] - this->data[7] * this->data[10]; - T coef10 = this->data[5] * this->data[11] - this->data[7] * this->data[9]; - T coef11 = this->data[5] * this->data[10] - this->data[6] * this->data[9]; - - T coef12 = this->data[2] * this->data[15] - this->data[3] * this->data[14]; - T 
coef14 = this->data[1] * this->data[15] - this->data[3] * this->data[13]; - T coef15 = this->data[1] * this->data[14] - this->data[2] * this->data[13]; - - T coef16 = this->data[2] * this->data[11] - this->data[3] * this->data[10]; - T coef18 = this->data[1] * this->data[11] - this->data[3] * this->data[9]; - T coef19 = this->data[1] * this->data[10] - this->data[2] * this->data[9]; - - T coef20 = this->data[2] * this->data[7] - this->data[3] * this->data[6]; - T coef22 = this->data[1] * this->data[7] - this->data[3] * this->data[5]; - T coef23 = this->data[1] * this->data[6] - this->data[2] * this->data[5]; - - Vec fac0(coef00, coef00, coef02, coef03); - Vec fac1(coef04, coef04, coef06, coef07); - Vec fac2(coef08, coef08, coef10, coef11); - Vec fac3(coef12, coef12, coef14, coef15); - Vec fac4(coef16, coef16, coef18, coef19); - Vec fac5(coef20, coef20, coef22, coef23); - - Vec vec0(this->data[1], this->data[0], this->data[0], this->data[0]); - Vec vec1(this->data[5], this->data[4], this->data[4], this->data[4]); - Vec vec2(this->data[9], this->data[8], this->data[8], this->data[8]); - Vec vec3(this->data[13], this->data[12], this->data[12], this->data[12]); - - Vec inv0(vec1 * fac0 - vec2 * fac1 + vec3 * fac2); - Vec inv1(vec0 * fac0 - vec2 * fac3 + vec3 * fac4); - Vec inv2(vec0 * fac1 - vec1 * fac3 + vec3 * fac5); - Vec inv3(vec0 * fac2 - vec1 * fac4 + vec2 * fac5); - - Vec signA(+1, -1, +1, -1); - Vec signB(-1, +1, -1, +1); - - Vec col0 = inv0 * signA; - Vec col1 = inv1 * signB; - Vec col2 = inv2 * signA; - Vec col3 = inv3 * signB; - - result.SetCol(0, col0); - result.SetCol(1, col1); - result.SetCol(2, col2); - result.SetCol(3, col3); - - result = result * oneOverDet; - } - - return result; + return Internal::MatOps::Inverse(*this); } - template - T Mat::Determinant() const + template + T Mat::Determinant() const { - static_assert( R == C && R > 1 && R < 5); - T result = static_cast(0); - if constexpr (R == 2) { - result = this->data[0] * this->data[3] - 
this->data[1] * this->data[2]; - } - - if constexpr (R == 3) { - result = - this->data[0] * (this->data[4] * this->data[8] - this->data[5] * this->data[7]) - - this->data[1] * (this->data[3] * this->data[8] - this->data[5] * this->data[6]) - + this->data[2] * (this->data[3] * this->data[7] - this->data[4] * this->data[6]); - } - - if constexpr (R == 4) { - T subFactor0 = this->data[10] * this->data[15] - this->data[11] * this->data[14]; - T subFactor1 = this->data[6] * this->data[15] - this->data[7] * this->data[14]; - T subFactor2 = this->data[6] * this->data[11] - this->data[7] * this->data[10]; - T subFactor3 = this->data[2] * this->data[15] - this->data[3] * this->data[14]; - T subFactor4 = this->data[2] * this->data[11] - this->data[3] * this->data[10]; - T subFactor5 = this->data[2] * this->data[7] - this->data[3] * this->data[6]; - - T detCoef0 = this->data[5] * subFactor0 - this->data[9] * subFactor1 + this->data[13] * subFactor2; - T detCoef1 = this->data[9] * subFactor3 - this->data[1] * subFactor0 - this->data[13] * subFactor4; - T detCoef2 = this->data[1] * subFactor1 - this->data[5] * subFactor3 + this->data[13] * subFactor5; - T detCoef3 = this->data[5] * subFactor4 - this->data[1] * subFactor2 - this->data[9] * subFactor5; - - result = this->data[0] * detCoef0 + this->data[4] * detCoef1 + this->data[8] * detCoef2 + this->data[12] * detCoef3; - } - - return result; + return Internal::MatOps::Determinant(*this); } - template + template requires ValidMatDims - const Mat MatConsts::zero = Mat(0); + const Mat MatConsts::zero = Mat(0); - template + template requires ValidVecDim - const Mat MatConsts::zero = Mat(0); + const Mat MatConsts::zero = Mat(0); - template + template requires ValidVecDim - const Mat MatConsts::identity = Internal::GetIdentityMatrix(); + const Mat MatConsts::identity = Internal::GetIdentityMatrix(); - template - Vec operator*(const Mat& mat, const Vec& vec) { - Vec result; - for (auto i = 0; i < R; i++) { - result[i] = 
mat.Row(i).Dot(vec); - } - return result; + template + Vec operator*(const Mat& mat, const Vec& vec) { + return Internal::MatOps::MulVec(mat, vec); } } diff --git a/Engine/Source/Common/Include/Common/Math/Quaternion.h b/Engine/Source/Common/Include/Common/Math/Quaternion.h index 87852748..99444c57 100644 --- a/Engine/Source/Common/Include/Common/Math/Quaternion.h +++ b/Engine/Source/Common/Include/Common/Math/Quaternion.h @@ -4,6 +4,7 @@ #pragma once +#include #include #include #include @@ -61,18 +62,18 @@ namespace Common { // +x -> from screen outer to inner // +y -> from left to right // +z -> from bttom to up - template + template struct Quaternion : QuaternionBase { static Quaternion FromEulerZYX(T inAngleX, T inAngleY, T inAngleZ); static Quaternion FromEulerZYX(const Radian& inRadianX, const Radian& inRadianY, const Radian& inRadianZ); Quaternion(); Quaternion(T inW, T inX, T inY, T inZ); - Quaternion(const Vec& inAxis, float inAngle); - Quaternion(const Vec& inAxis, const Radian& inRadian); - Quaternion(const Quaternion& inValue); - Quaternion(Quaternion&& inValue) noexcept; - Quaternion& operator=(const Quaternion& inValue); + Quaternion(const Vec& inAxis, float inAngle); + Quaternion(const Vec& inAxis, const Radian& inRadian); + Quaternion(const Quaternion& inValue) = default; + Quaternion(Quaternion&& inValue) noexcept = default; + Quaternion& operator=(const Quaternion& inValue) = default; bool operator==(const Quaternion& rhs) const; bool operator!=(const Quaternion& rhs) const; @@ -88,24 +89,24 @@ namespace Common { Quaternion& operator*=(const Quaternion& rhs); Quaternion& operator/=(T rhs); - Vec ImaginaryPart() const; + Vec ImaginaryPart() const; T Model() const; Quaternion Negatived() const; Quaternion Conjugated() const; Quaternion Normalized() const; T Dot(const Quaternion& rhs) const; // when axis faced to us, ccw as positive direction - Vec RotateVector(const Vec& inVector) const; - Mat GetRotationMatrix() const; + Vec RotateVector(const Vec& 
inVector) const; + Mat GetRotationMatrix() const; template - Quaternion CastTo() const; + Quaternion CastTo() const; }; - template + template struct QuatConsts { - static const Quaternion zero; - static const Quaternion identity; + static const Quaternion zero; + static const Quaternion identity; }; using HAngle = Angle; @@ -160,13 +161,13 @@ namespace Common { } }; - template - struct Serializer> { + template + struct Serializer> { static constexpr size_t typeId = HashUtils::StrCrc32("Common::Quaternion") + Serializer::typeId; - static size_t Serialize(BinarySerializeStream& stream, const Quaternion& value) + static size_t Serialize(BinarySerializeStream& stream, const Quaternion& value) { size_t serialized = 0; serialized += Serializer::Serialize(stream, value.w); @@ -176,7 +177,7 @@ namespace Common { return serialized; } - static size_t Deserialize(BinaryDeserializeStream& stream, Quaternion& value) + static size_t Deserialize(BinaryDeserializeStream& stream, Quaternion& value) { size_t deserialized = 0; deserialized += Serializer::Deserialize(stream, value.w); @@ -203,9 +204,9 @@ namespace Common { } }; - template - struct StringConverter> { - static std::string ToString(const Quaternion& inValue) + template + struct StringConverter> { + static std::string ToString(const Quaternion& inValue) { return std::format( "({}, {}, {}, {})", @@ -242,9 +243,9 @@ namespace Common { } }; - template - struct JsonSerializer> { - static void JsonSerialize(rapidjson::Value& outJsonValue, rapidjson::Document::AllocatorType& inAllocator, const Quaternion& inValue) + template + struct JsonSerializer> { + static void JsonSerialize(rapidjson::Value& outJsonValue, rapidjson::Document::AllocatorType& inAllocator, const Quaternion& inValue) { rapidjson::Value xJson; JsonSerializer::JsonSerialize(xJson, inAllocator, inValue.w); @@ -265,7 +266,7 @@ namespace Common { outJsonValue.PushBack(wJson, inAllocator); } - static void JsonDeserialize(const rapidjson::Value& inJsonValue, 
Quaternion& outValue) + static void JsonDeserialize(const rapidjson::Value& inJsonValue, Quaternion& outValue) { if (!inJsonValue.IsArray() || inJsonValue.Size() != 4) { return; @@ -278,6 +279,112 @@ namespace Common { }; } +namespace Common::Internal { + // Per-backend dispatch for quaternion arithmetic. The primary template is the scalar implementation, so any + // (T, B) without a specialization degrades gracefully to scalar; the SIMD specialization follows immediately after. + template + struct QuatOps { + using Q = Quaternion; + + static Q Add(const Q& a, const Q& b) + { + Q result; + result.w = a.w + b.w; + result.x = a.x + b.x; + result.y = a.y + b.y; + result.z = a.z + b.z; + return result; + } + + static Q Sub(const Q& a, const Q& b) + { + Q result; + result.w = a.w - b.w; + result.x = a.x - b.x; + result.y = a.y - b.y; + result.z = a.z - b.z; + return result; + } + + static Q MulScalar(const Q& a, T b) + { + Q result; + result.w = a.w * b; + result.x = a.x * b; + result.y = a.y * b; + result.z = a.z * b; + return result; + } + + static Q DivScalar(const Q& a, T b) + { + Q result; + result.w = a.w / b; + result.x = a.x / b; + result.y = a.y / b; + result.z = a.z / b; + return result; + } + + static Q Mul(const Q& a, const Q& b) + { + Q result; + result.w = a.w * b.w - a.x * b.x - a.y * b.y - a.z * b.z; + result.x = a.w * b.x + a.x * b.w + a.y * b.z - a.z * b.y; + result.y = a.w * b.y - a.x * b.z + a.y * b.w + a.z * b.x; + result.z = a.w * b.z + a.x * b.y - a.y * b.x + a.z * b.w; + return result; + } + + static T Dot(const Q& a, const Q& b) + { + return a.w * b.w + a.x * b.x + a.y * b.y + a.z * b.z; + } + }; + + // QuaternionBase stores x, y, z, w as four contiguous floats, so &q.x is the base of a 16-byte block that + // maps to an unaligned 128-bit load/store. The element-wise ops and the dot product map onto the F32x4 wrapper + // directly; the Hamilton product is expressed as four broadcast-and-permute terms (see Mul). 
+ template <> + struct QuatOps { + using Q = Quaternion; + + static Q Add(const Q& a, const Q& b) { Q r; Simd::MapBinary<4>(&r.x, &a.x, &b.x, Simd::AddOp {}); return r; } + static Q Sub(const Q& a, const Q& b) { Q r; Simd::MapBinary<4>(&r.x, &a.x, &b.x, Simd::SubOp {}); return r; } + + static Q MulScalar(const Q& a, float b) { Q r; Simd::MapScalar<4>(&r.x, &a.x, b, Simd::MulOp {}); return r; } + static Q DivScalar(const Q& a, float b) { Q r; Simd::MapScalar<4>(&r.x, &a.x, b, Simd::DivOp {}); return r; } + + // Hamilton product, with both quaternions loaded as (x, y, z, w). Each row of the product is one component of + // a broadcast against a sign-flipped permutation of b, summed across the four components of a: + // result = aw*(bx,by,bz,bw) + ax*(bw,-bz,by,-bx) + ay*(bz,bw,-bx,-by) + az*(-by,bx,bw,-bz) + // The accumulation order matches the scalar reference above, so both backends produce identical results. + static Q Mul(const Q& a, const Q& b) + { + const Simd::F32x4 av = Simd::LoadU(&a.x); + const Simd::F32x4 bv = Simd::LoadU(&b.x); + + const Simd::F32x4 sign0 = Simd::Set(1.0f, -1.0f, 1.0f, -1.0f); + const Simd::F32x4 sign1 = Simd::Set(1.0f, 1.0f, -1.0f, -1.0f); + const Simd::F32x4 sign2 = Simd::Set(-1.0f, 1.0f, 1.0f, -1.0f); + + Simd::F32x4 acc = Simd::Mul(Simd::Splat<3>(av), bv); + acc = Simd::Add(acc, Simd::Mul(Simd::Splat<0>(av), Simd::Mul(Simd::Shuffle<3, 2, 1, 0>(bv), sign0))); + acc = Simd::Add(acc, Simd::Mul(Simd::Splat<1>(av), Simd::Mul(Simd::Shuffle<2, 3, 0, 1>(bv), sign1))); + acc = Simd::Add(acc, Simd::Mul(Simd::Splat<2>(av), Simd::Mul(Simd::Shuffle<1, 0, 3, 2>(bv), sign2))); + + Q result; + Simd::StoreU(&result.x, acc); + return result; + } + + static float Dot(const Q& a, const Q& b) + { + return Simd::Sum(Simd::Mul(Simd::LoadU(&a.x), Simd::LoadU(&b.x))); + } + }; +} + namespace Common { template Angle::Angle() @@ -373,28 +480,28 @@ namespace Common { return this->value * 180.0f / pi; } - template - const Quaternion QuatConsts::zero = 
Quaternion(); + template + const Quaternion QuatConsts::zero = Quaternion(); - template - const Quaternion QuatConsts::identity = Quaternion(1, 0, 0, 0); + template + const Quaternion QuatConsts::identity = Quaternion(1, 0, 0, 0); - template - Quaternion Quaternion::FromEulerZYX(T inAngleX, T inAngleY, T inAngleZ) + template + Quaternion Quaternion::FromEulerZYX(T inAngleX, T inAngleY, T inAngleZ) { - return Quaternion(VecConsts::unitZ, inAngleZ) - * Quaternion(VecConsts::unitY, inAngleY) - * Quaternion(VecConsts::unitX, inAngleX); + return Quaternion(VecConsts::unitZ, inAngleZ) + * Quaternion(VecConsts::unitY, inAngleY) + * Quaternion(VecConsts::unitX, inAngleX); } - template - Quaternion Quaternion::FromEulerZYX(const Radian& inRadianX, const Radian& inRadianY, const Radian& inRadianZ) + template + Quaternion Quaternion::FromEulerZYX(const Radian& inRadianX, const Radian& inRadianY, const Radian& inRadianZ) { return FromEulerZYX(inRadianX.ToAngle(), inRadianY.ToAngle(), inRadianZ.ToAngle()); } - template - Quaternion::Quaternion() + template + Quaternion::Quaternion() { this->w = 0; this->x = 0; @@ -402,8 +509,8 @@ namespace Common { this->z = 0; } - template - Quaternion::Quaternion(T inW, T inX, T inY, T inZ) + template + Quaternion::Quaternion(T inW, T inX, T inY, T inZ) { this->w = inW; this->x = inX; @@ -411,10 +518,10 @@ namespace Common { this->z = inZ; } - template - Quaternion::Quaternion(const Vec& inAxis, float inAngle) + template + Quaternion::Quaternion(const Vec& inAxis, float inAngle) { - Vec axis = inAxis.Normalized(); + Vec axis = inAxis.Normalized(); T halfRadian = Angle(inAngle).ToRadian() / 2.0f; T halfRadianSin = std::sin(halfRadian); T halfRadianCos = std::cos(halfRadian); @@ -425,36 +532,14 @@ namespace Common { this->z = axis.z * halfRadianSin; } - template - Quaternion::Quaternion(const Vec& inAxis, const Radian& inRadian) + template + Quaternion::Quaternion(const Vec& inAxis, const Radian& inRadian) : Quaternion(inAxis, 
inRadian.ToAngle()) { } - template - Quaternion::Quaternion(const Quaternion& inValue) - : Quaternion(inValue.w, inValue.x, inValue.y, inValue.z) - { - } - - template - Quaternion::Quaternion(Quaternion&& inValue) noexcept - : Quaternion(inValue.w, inValue.x, inValue.y, inValue.z) - { - } - - template - Quaternion& Quaternion::operator=(const Quaternion& inValue) - { - this->w = inValue.w; - this->x = inValue.x; - this->y = inValue.y; - this->z = inValue.z; - return *this; - } - - template - bool Quaternion::operator==(const Quaternion& rhs) const + template + bool Quaternion::operator==(const Quaternion& rhs) const { return CompareNumber(this->w, rhs.w) && CompareNumber(this->x, rhs.x) @@ -462,130 +547,93 @@ namespace Common { && CompareNumber(this->z, rhs.z); } - template - bool Quaternion::operator!=(const Quaternion& rhs) const + template + bool Quaternion::operator!=(const Quaternion& rhs) const { return !this->operator==(rhs); } - template - Quaternion Quaternion::operator+(const Quaternion& rhs) const + template + Quaternion Quaternion::operator+(const Quaternion& rhs) const { - Quaternion result; - result.w = this->w + rhs.w; - result.x = this->x + rhs.x; - result.y = this->y + rhs.y; - result.z = this->z + rhs.z; - return result; + return Internal::QuatOps::Add(*this, rhs); } - template - Quaternion Quaternion::operator-(const Quaternion& rhs) const + template + Quaternion Quaternion::operator-(const Quaternion& rhs) const { - Quaternion result; - result.w = this->w - rhs.w; - result.x = this->x - rhs.x; - result.y = this->y - rhs.y; - result.z = this->z - rhs.z; - return result; + return Internal::QuatOps::Sub(*this, rhs); } - template - Quaternion Quaternion::operator*(T rhs) const + template + Quaternion Quaternion::operator*(T rhs) const { - Quaternion result; - result.w = this->w * rhs; - result.x = this->x * rhs; - result.y = this->y * rhs; - result.z = this->z * rhs; - return result; + return Internal::QuatOps::MulScalar(*this, rhs); } - template - 
Quaternion Quaternion::operator*(const Quaternion& rhs) const + template + Quaternion Quaternion::operator*(const Quaternion& rhs) const { - Quaternion result; - result.w = this->w * rhs.w - this->x * rhs.x - this->y * rhs.y - this->z * rhs.z; - result.x = this->w * rhs.x + this->x * rhs.w + this->y * rhs.z - this->z * rhs.y; - result.y = this->w * rhs.y - this->x * rhs.z + this->y * rhs.w + this->z * rhs.x; - result.z = this->w * rhs.z + this->x * rhs.y - this->y * rhs.x + this->z * rhs.w; - return result; + return Internal::QuatOps::Mul(*this, rhs); } - template - Quaternion Quaternion::operator/(T rhs) const + template + Quaternion Quaternion::operator/(T rhs) const { - Quaternion result; - result.w = this->w / rhs; - result.x = this->x / rhs; - result.y = this->y / rhs; - result.z = this->z / rhs; - return result; + return Internal::QuatOps::DivScalar(*this, rhs); } - template - Quaternion& Quaternion::operator+=(const Quaternion& rhs) + template + Quaternion& Quaternion::operator+=(const Quaternion& rhs) { - this->w += rhs.w; - this->x += rhs.x; - this->y += rhs.y; - this->z += rhs.z; + *this = Internal::QuatOps::Add(*this, rhs); return *this; } - template - Quaternion& Quaternion::operator-=(const Quaternion& rhs) + template + Quaternion& Quaternion::operator-=(const Quaternion& rhs) { - this->w -= rhs.w; - this->x -= rhs.x; - this->y -= rhs.y; - this->z -= rhs.z; + *this = Internal::QuatOps::Sub(*this, rhs); return *this; } - template - Quaternion& Quaternion::operator*=(T rhs) + template + Quaternion& Quaternion::operator*=(T rhs) { - this->w *= rhs; - this->x *= rhs; - this->y *= rhs; - this->z *= rhs; + *this = Internal::QuatOps::MulScalar(*this, rhs); return *this; } - template - Quaternion& Quaternion::operator*=(const Quaternion& rhs) + template + Quaternion& Quaternion::operator*=(const Quaternion& rhs) { *this = *this * rhs; return *this; } - template - Quaternion& Quaternion::operator/=(T rhs) + template + Quaternion& Quaternion::operator/=(T rhs) { 
- this->w /= rhs; - this->x /= rhs; - this->y /= rhs; - this->z /= rhs; + *this = Internal::QuatOps::DivScalar(*this, rhs); return *this; } - template - Vec Quaternion::ImaginaryPart() const + template + Vec Quaternion::ImaginaryPart() const { - return Vec(this->x, this->y, this->z); + return Vec(this->x, this->y, this->z); } - template - T Quaternion::Model() const + template + T Quaternion::Model() const { - return std::sqrt(this->w * this->w + this->x * this->x + this->y * this->y + this->z * this->z); + return std::sqrt(Internal::QuatOps::Dot(*this, *this)); } - template - Quaternion Quaternion::Negatived() const + template + Quaternion Quaternion::Negatived() const { - Quaternion result; + Quaternion result; result.w = -this->w; result.x = -this->x; result.y = -this->y; @@ -593,10 +641,10 @@ namespace Common { return result; } - template - Quaternion Quaternion::Conjugated() const + template + Quaternion Quaternion::Conjugated() const { - Quaternion result; + Quaternion result; result.w = this->w; result.x = -this->x; result.y = -this->y; @@ -604,28 +652,28 @@ namespace Common { return result; } - template - Quaternion Quaternion::Normalized() const + template + Quaternion Quaternion::Normalized() const { return this->operator/(Model()); } - template - T Quaternion::Dot(const Quaternion& rhs) const + template + T Quaternion::Dot(const Quaternion& rhs) const { - return this->w * rhs.w + this->x * rhs.x + this->y * rhs.y + this->z * rhs.z; + return Internal::QuatOps::Dot(*this, rhs); } - template - Vec Quaternion::RotateVector(const Vec& inVector) const + template + Vec Quaternion::RotateVector(const Vec& inVector) const { Quaternion v = Quaternion(0, inVector.x, inVector.y, inVector.z); Quaternion v2 = Conjugated() * v * (*this); - return Vec(v2.x, v2.y, v2.z); + return Vec(v2.x, v2.y, v2.z); } - template - Mat Quaternion::GetRotationMatrix() const + template + Mat Quaternion::GetRotationMatrix() const { T xx2 = this->x * this->x * 2; T yy2 = this->y * this->y 
* 2; @@ -638,7 +686,7 @@ namespace Common { T xz2 = this->x * this->z * 2; T yz2 = this->y * this->z * 2; - return Mat( + return Mat( 1 - yy2 - zz2, xy2 + wz2, xz2 - wy2, 0, xy2 - wz2, 1 - xx2 - zz2, yz2 + wx2, 0, xz2 + wy2, yz2 - wx2, 1 - xx2 - yy2, 0, @@ -646,11 +694,11 @@ namespace Common { ); } - template + template template - Quaternion Quaternion::CastTo() const + Quaternion Quaternion::CastTo() const { - Quaternion result; + Quaternion result; result.w = static_cast(this->w); result.x = static_cast(this->x); result.y = static_cast(this->y); diff --git a/Engine/Source/Common/Include/Common/Math/Simd.h b/Engine/Source/Common/Include/Common/Math/Simd.h new file mode 100644 index 00000000..2f8d6f92 --- /dev/null +++ b/Engine/Source/Common/Include/Common/Math/Simd.h @@ -0,0 +1,201 @@ +// +// Created by johnk on 2026/6/19. +// + +#pragma once + +#include + +#include + +// MirrorTool parses these headers only to harvest reflection metadata, and its libclang has no usable SSE2/NEON +// intrinsic header (no clang resource headers on Linux, a version-mismatched arm_neon.h on macOS). Reflection needs the +// declarations below but never the vectorized bodies, so under the parser we skip the intrinsic include and stand in a +// scalar F32x4 instead; every real translation unit still takes the intrinsic backend. +#if !defined(MIRROR_TOOL_PARSING) +#if ARCH_X86 +#include +#elif ARCH_ARM +#include +#else +#error "The math SIMD backend supports only x86-64 (SSE2) and arm64 (NEON) targets" +#endif +#endif + +namespace Common { + // The SIMD backend only relies on the SSE2/NEON baseline, available on every CPU this engine targets, so it is the + // default; defaultBackend aliases simd and sits after max so it does not change the enumerator count. The scalar + // backend can still be requested explicitly via Vec/Mat/Quaternion's backend parameter. 
+ enum class MathBackend : uint8_t { + scalar, + simd, + max, + defaultBackend = simd + }; +} + +// A thin, header-only wrapper over a 4-wide float SIMD register, mapping to the SSE2 baseline on x86-64 and to NEON on +// arm64 (both always present, no extra flags). The math layer uses it to back the simd backend. +namespace Common::Simd { +#if defined(MIRROR_TOOL_PARSING) + struct F32x4 { float lanes[4]; }; + + inline F32x4 LoadU(const float* p) { return { p[0], p[1], p[2], p[3] }; } + inline void StoreU(float* p, F32x4 v) { for (int i = 0; i < 4; i++) { p[i] = v.lanes[i]; } } + inline F32x4 Set1(float s) { return { s, s, s, s }; } + inline F32x4 Add(F32x4 a, F32x4 b) { return { a.lanes[0] + b.lanes[0], a.lanes[1] + b.lanes[1], a.lanes[2] + b.lanes[2], a.lanes[3] + b.lanes[3] }; } + inline F32x4 Sub(F32x4 a, F32x4 b) { return { a.lanes[0] - b.lanes[0], a.lanes[1] - b.lanes[1], a.lanes[2] - b.lanes[2], a.lanes[3] - b.lanes[3] }; } + inline F32x4 Mul(F32x4 a, F32x4 b) { return { a.lanes[0] * b.lanes[0], a.lanes[1] * b.lanes[1], a.lanes[2] * b.lanes[2], a.lanes[3] * b.lanes[3] }; } + inline F32x4 Div(F32x4 a, F32x4 b) { return { a.lanes[0] / b.lanes[0], a.lanes[1] / b.lanes[1], a.lanes[2] / b.lanes[2], a.lanes[3] / b.lanes[3] }; } + inline float Sum(F32x4 v) { return v.lanes[0] + v.lanes[1] + v.lanes[2] + v.lanes[3]; } + inline F32x4 Set(float x, float y, float z, float w) { return { x, y, z, w }; } + + template + inline F32x4 Splat(F32x4 v) { return Set1(v.lanes[L]); } + + template + inline F32x4 Shuffle(F32x4 v) { return { v.lanes[I0], v.lanes[I1], v.lanes[I2], v.lanes[I3] }; } + + inline void Transpose4(F32x4& r0, F32x4& r1, F32x4& r2, F32x4& r3) + { + const F32x4 s0 = r0, s1 = r1, s2 = r2, s3 = r3; + r0 = { s0.lanes[0], s1.lanes[0], s2.lanes[0], s3.lanes[0] }; + r1 = { s0.lanes[1], s1.lanes[1], s2.lanes[1], s3.lanes[1] }; + r2 = { s0.lanes[2], s1.lanes[2], s2.lanes[2], s3.lanes[2] }; + r3 = { s0.lanes[3], s1.lanes[3], s2.lanes[3], s3.lanes[3] }; + } +#elif 
ARCH_X86 + using F32x4 = __m128; + + inline F32x4 LoadU(const float* p) { return _mm_loadu_ps(p); } + inline void StoreU(float* p, F32x4 v) { _mm_storeu_ps(p, v); } + inline F32x4 Set1(float s) { return _mm_set1_ps(s); } + inline F32x4 Add(F32x4 a, F32x4 b) { return _mm_add_ps(a, b); } + inline F32x4 Sub(F32x4 a, F32x4 b) { return _mm_sub_ps(a, b); } + inline F32x4 Mul(F32x4 a, F32x4 b) { return _mm_mul_ps(a, b); } + inline F32x4 Div(F32x4 a, F32x4 b) { return _mm_div_ps(a, b); } + + inline float Sum(F32x4 v) + { + F32x4 shuf = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 3, 0, 1)); + F32x4 sums = _mm_add_ps(v, shuf); + shuf = _mm_movehl_ps(shuf, sums); + sums = _mm_add_ss(sums, shuf); + return _mm_cvtss_f32(sums); + } + + inline F32x4 Set(float x, float y, float z, float w) { return _mm_set_ps(w, z, y, x); } + + template + inline F32x4 Splat(F32x4 v) { return _mm_shuffle_ps(v, v, _MM_SHUFFLE(L, L, L, L)); } + + template + inline F32x4 Shuffle(F32x4 v) { return _mm_shuffle_ps(v, v, _MM_SHUFFLE(I3, I2, I1, I0)); } + + // In-place transpose of the 4x4 matrix whose rows are r0..r3. 
+ inline void Transpose4(F32x4& r0, F32x4& r1, F32x4& r2, F32x4& r3) + { + _MM_TRANSPOSE4_PS(r0, r1, r2, r3); + } +#elif ARCH_ARM + using F32x4 = float32x4_t; + + inline F32x4 LoadU(const float* p) { return vld1q_f32(p); } + inline void StoreU(float* p, F32x4 v) { vst1q_f32(p, v); } + inline F32x4 Set1(float s) { return vdupq_n_f32(s); } + inline F32x4 Add(F32x4 a, F32x4 b) { return vaddq_f32(a, b); } + inline F32x4 Sub(F32x4 a, F32x4 b) { return vsubq_f32(a, b); } + inline F32x4 Mul(F32x4 a, F32x4 b) { return vmulq_f32(a, b); } + inline F32x4 Div(F32x4 a, F32x4 b) { return vdivq_f32(a, b); } + inline float Sum(F32x4 v) { return vaddvq_f32(v); } + + inline F32x4 Set(float x, float y, float z, float w) + { + const float values[4] = { x, y, z, w }; + return vld1q_f32(values); + } + + template + inline F32x4 Splat(F32x4 v) { return vdupq_n_f32(vgetq_lane_f32(v, L)); } + + // NEON has no single arbitrary 4-lane float shuffle, so go through memory; for a compile-time permutation the + // compiler routinely folds the store/reload back into register moves. + template + inline F32x4 Shuffle(F32x4 v) + { + float tmp[4]; + vst1q_f32(tmp, v); + const float out[4] = { tmp[I0], tmp[I1], tmp[I2], tmp[I3] }; + return vld1q_f32(out); + } + + // In-place transpose of the 4x4 matrix whose rows are r0..r3. + inline void Transpose4(F32x4& r0, F32x4& r1, F32x4& r2, F32x4& r3) + { + const float32x4x2_t t01 = vtrnq_f32(r0, r1); + const float32x4x2_t t23 = vtrnq_f32(r2, r3); + r0 = vcombine_f32(vget_low_f32(t01.val[0]), vget_low_f32(t23.val[0])); + r1 = vcombine_f32(vget_low_f32(t01.val[1]), vget_low_f32(t23.val[1])); + r2 = vcombine_f32(vget_high_f32(t01.val[0]), vget_high_f32(t23.val[0])); + r3 = vcombine_f32(vget_high_f32(t01.val[1]), vget_high_f32(t23.val[1])); + } +#endif + + // Safe partial load/store for tight 3-float storage (a Vec3, or one row of a Mat3). 
Load3 reads exactly three + // floats and zeroes the 4th lane, so it never over-reads the float[3] / float[9] backing; the zeroed lane also lets + // a 4-wide dot reduce to the 3-component dot. Store3 writes exactly three floats and leaves the 4th element alone. + inline F32x4 Load3(const float* p) { return Set(p[0], p[1], p[2], 0.0f); } + + inline void Store3(float* p, F32x4 v) + { + alignas(16) float tmp[4]; + StoreU(tmp, v); + p[0] = tmp[0]; + p[1] = tmp[1]; + p[2] = tmp[2]; + } + + // Element-wise binary ops as functors so a single Map* template can drive every Vec/Mat/Quaternion kernel. Each + // carries both a 4-wide register overload (the SIMD body) and a scalar overload (the <4 tail), so the same functor + // covers the full chunks and the remainder without the caller spelling out two lambdas. + struct AddOp { + F32x4 operator()(F32x4 a, F32x4 b) const { return Add(a, b); } + float operator()(float a, float b) const { return a + b; } + }; + + struct SubOp { + F32x4 operator()(F32x4 a, F32x4 b) const { return Sub(a, b); } + float operator()(float a, float b) const { return a - b; } + }; + + struct MulOp { + F32x4 operator()(F32x4 a, F32x4 b) const { return Mul(a, b); } + float operator()(float a, float b) const { return a * b; } + }; + + struct DivOp { + F32x4 operator()(F32x4 a, F32x4 b) const { return Div(a, b); } + float operator()(float a, float b) const { return a / b; } + }; + + // Lane-wise binary map over N contiguous floats: full 4-wide chunks via the register overload, any <4 tail via the + // scalar overload. dst may alias a or b since each lane is independent, so this also backs the in-place compound ops. 
+ template + inline void MapBinary(float* dst, const float* a, const float* b, Op op = {}) + { + int i = 0; + for (; i + 4 <= N; i += 4) { StoreU(&dst[i], op(LoadU(&a[i]), LoadU(&b[i]))); } + for (; i < N; i++) { dst[i] = op(a[i], b[i]); } + } + + // As MapBinary, but the right operand is a single scalar broadcast across every lane (and applied directly in the + // scalar tail). + template + inline void MapScalar(float* dst, const float* a, float b, Op op = {}) + { + const F32x4 bv = Set1(b); + int i = 0; + for (; i + 4 <= N; i += 4) { StoreU(&dst[i], op(LoadU(&a[i]), bv)); } + for (; i < N; i++) { dst[i] = op(a[i], b); } + } +} diff --git a/Engine/Source/Common/Include/Common/Math/Vector.h b/Engine/Source/Common/Include/Common/Math/Vector.h index 47ee9648..f3d0cdc2 100644 --- a/Engine/Source/Common/Include/Common/Math/Vector.h +++ b/Engine/Source/Common/Include/Common/Math/Vector.h @@ -7,34 +7,40 @@ #include #include +#include #include #include #include +namespace Common { + template struct Vec; +} + namespace Common::Internal { - template + template struct VecCrossResultTraits { using Type = T; }; } namespace Common { - template struct Vec; - template concept ValidVecDim = L >= 1 && L <= 4; - template concept VecN = std::is_same_v>; + template concept VecN = std::is_same_v>; - template + template requires ValidVecDim struct BaseVec {}; - template - struct Vec : BaseVec { + template + struct Vec : BaseVec { using Type = T; static constexpr uint8_t dims = L; + static constexpr MathBackend backend = B; Vec(); Vec(T inValue); // NOLINT + // Not defaulted: BaseVec aliases data via an anonymous union, and a union's copy/move members are implicitly + // deleted when a variant member (e.g. HFloat) has a non-trivial copy, so these stay element-wise. 
Vec(const Vec& other); Vec(Vec&& other) noexcept; Vec& operator=(const Vec& other); @@ -71,66 +77,66 @@ namespace Common { Vec& operator/=(const Vec& rhs); template - Vec CastTo() const; + Vec CastTo() const; template - Vec SubVec() const; + Vec SubVec() const; T Model() const; Vec Normalized() const; void Normalize(); T Dot(const Vec& rhs) const; - typename Internal::VecCrossResultTraits::Type Cross(const Vec& rhs) const; + typename Internal::VecCrossResultTraits::Type Cross(const Vec& rhs) const; }; - template + template requires ValidVecDim struct VecConsts {}; - template - struct VecConsts { - static const Vec zero; - static const Vec unit; - static const Vec negaUnit; + template + struct VecConsts { + static const Vec zero; + static const Vec unit; + static const Vec negaUnit; }; - template - struct VecConsts { - static const Vec zero; - static const Vec unitX; - static const Vec unitY; - static const Vec unit; - static const Vec negaUnitX; - static const Vec negaUnitY; - static const Vec negaUnit; + template + struct VecConsts { + static const Vec zero; + static const Vec unitX; + static const Vec unitY; + static const Vec unit; + static const Vec negaUnitX; + static const Vec negaUnitY; + static const Vec negaUnit; }; - template - struct VecConsts { - static const Vec zero; - static const Vec unitX; - static const Vec unitY; - static const Vec unitZ; - static const Vec unit; - static const Vec negaUnitX; - static const Vec negaUnitY; - static const Vec negaUnitZ; - static const Vec negaUnit; + template + struct VecConsts { + static const Vec zero; + static const Vec unitX; + static const Vec unitY; + static const Vec unitZ; + static const Vec unit; + static const Vec negaUnitX; + static const Vec negaUnitY; + static const Vec negaUnitZ; + static const Vec negaUnit; }; - template - struct VecConsts { - static const Vec zero; - static const Vec unitX; - static const Vec unitY; - static const Vec unitZ; - static const Vec unitW; - static const Vec unit; - 
static const Vec negaUnitX; - static const Vec negaUnitY; - static const Vec negaUnitZ; - static const Vec negaUnitW; - static const Vec negaUnit; + template + struct VecConsts { + static const Vec zero; + static const Vec unitX; + static const Vec unitY; + static const Vec unitZ; + static const Vec unitW; + static const Vec unit; + static const Vec negaUnitX; + static const Vec negaUnitY; + static const Vec negaUnitZ; + static const Vec negaUnitW; + static const Vec negaUnit; }; using BVec1 = Vec; @@ -202,26 +208,26 @@ namespace Common::Internal { (void) std::initializer_list { ([&]() -> void { subVec.data[SubVecIndex] = vec.data[VecIndex]; }(), 0)... }; } - template - struct VecCrossResultTraits { + template + struct VecCrossResultTraits { using Type = T; }; - template - struct VecCrossResultTraits { - using Type = Vec; + template + struct VecCrossResultTraits { + using Type = Vec; }; } namespace Common { - template - struct Serializer> { + template + struct Serializer> { static constexpr size_t typeId = Common::HashUtils::StrCrc32("Common::Vector") + Serializer::typeId + L; - static size_t Serialize(BinarySerializeStream& stream, const Vec& value) + static size_t Serialize(BinarySerializeStream& stream, const Vec& value) { size_t serialized = 0; for (auto i = 0; i < L; i++) { @@ -230,7 +236,7 @@ namespace Common { return serialized; } - static size_t Deserialize(BinaryDeserializeStream& stream, Vec& value) + static size_t Deserialize(BinaryDeserializeStream& stream, Vec& value) { size_t deserialized = 0; for (auto i = 0; i < L; i++) { @@ -240,9 +246,9 @@ namespace Common { } }; - template - struct StringConverter> { - static std::string ToString(const Vec& inValue) + template + struct StringConverter> { + static std::string ToString(const Vec& inValue) { std::stringstream stream; stream << "("; @@ -257,9 +263,9 @@ namespace Common { } }; - template - struct JsonSerializer> { - static void JsonSerialize(rapidjson::Value& outJsonValue, 
rapidjson::Document::AllocatorType& inAllocator, const Vec& inValue) + template + struct JsonSerializer> { + static void JsonSerialize(rapidjson::Value& outJsonValue, rapidjson::Document::AllocatorType& inAllocator, const Vec& inValue) { outJsonValue.SetArray(); outJsonValue.Reserve(L, inAllocator); @@ -270,7 +276,7 @@ namespace Common { } } - static void JsonDeserialize(const rapidjson::Value& inJsonValue, Vec& outValue) + static void JsonDeserialize(const rapidjson::Value& inJsonValue, Vec& outValue) { if (!inJsonValue.IsArray() || inJsonValue.Size() != L) { return; @@ -283,8 +289,8 @@ namespace Common { } namespace Common { - template - struct BaseVec { + template + struct BaseVec { BaseVec(); BaseVec(T inX); // NOLINT @@ -296,8 +302,8 @@ namespace Common { }; }; - template - struct BaseVec { + template + struct BaseVec { BaseVec(); BaseVec(T inValue); // NOLINT BaseVec(T inX, T inY); @@ -311,8 +317,8 @@ namespace Common { }; }; - template - struct BaseVec { + template + struct BaseVec { BaseVec(); BaseVec(T inValue); // NOLINT BaseVec(T inX, T inY, T inZ); @@ -327,8 +333,8 @@ namespace Common { }; }; - template - struct BaseVec { + template + struct BaseVec { BaseVec(); BaseVec(T inValue); // NOLINT BaseVec(T inX, T inY, T inZ, T inW); @@ -344,190 +350,288 @@ namespace Common { }; }; - template - BaseVec::BaseVec() + template + BaseVec::BaseVec() : x(0) { } - template - BaseVec::BaseVec(T inX) + template + BaseVec::BaseVec(T inX) : x(inX) { } - template - BaseVec::BaseVec() + template + BaseVec::BaseVec() : x(0) { } - template - BaseVec::BaseVec(T inValue) + template + BaseVec::BaseVec(T inValue) : x(inValue), y(inValue) { } - template - BaseVec::BaseVec(T inX, T inY) + template + BaseVec::BaseVec(T inX, T inY) : x(inX), y(inY) { } - template - BaseVec::BaseVec() + template + BaseVec::BaseVec() : x(0) { } - template - BaseVec::BaseVec(T inValue) + template + BaseVec::BaseVec(T inValue) : x(inValue), y(inValue), z(inValue) { } - template - BaseVec::BaseVec(T 
inX, T inY, T inZ) + template + BaseVec::BaseVec(T inX, T inY, T inZ) : x(inX), y(inY), z(inZ) { } - template - BaseVec::BaseVec() + template + BaseVec::BaseVec() : x(0) { } - template - BaseVec::BaseVec(T inValue) + template + BaseVec::BaseVec(T inValue) : x(inValue), y(inValue), z(inValue), w(inValue) { } - template - BaseVec::BaseVec(T inX, T inY, T inZ, T inW) + template + BaseVec::BaseVec(T inX, T inY, T inZ, T inW) : x(inX), y(inY), z(inZ), w(inW) { } - template - const Vec VecConsts::zero = Vec(); + template + const Vec VecConsts::zero = Vec(); + + template + const Vec VecConsts::unit = Vec(1); + + template + const Vec VecConsts::negaUnit = Vec(-1); - template - const Vec VecConsts::unit = Vec(1); + template + const Vec VecConsts::zero = Vec(); - template - const Vec VecConsts::negaUnit = Vec(-1); + template + const Vec VecConsts::unitX = Vec(1, 0); - template - const Vec VecConsts::zero = Vec(); + template + const Vec VecConsts::unitY = Vec(0, 1); - template - const Vec VecConsts::unitX = Vec(1, 0); + template + const Vec VecConsts::unit = Vec(1, 1); - template - const Vec VecConsts::unitY = Vec(0, 1); + template + const Vec VecConsts::negaUnitX = Vec(-1, 0); - template - const Vec VecConsts::unit = Vec(1, 1); + template + const Vec VecConsts::negaUnitY = Vec(0, -1); - template - const Vec VecConsts::negaUnitX = Vec(-1, 0); + template + const Vec VecConsts::negaUnit = Vec(-1, -1); - template - const Vec VecConsts::negaUnitY = Vec(0, -1); + template + const Vec VecConsts::zero = Vec(); - template - const Vec VecConsts::negaUnit = Vec(-1, -1); + template + const Vec VecConsts::unitX = Vec(1, 0, 0); - template - const Vec VecConsts::zero = Vec(); + template + const Vec VecConsts::unitY = Vec(0, 1, 0); - template - const Vec VecConsts::unitX = Vec(1, 0, 0); + template + const Vec VecConsts::unitZ = Vec(0, 0, 1); - template - const Vec VecConsts::unitY = Vec(0, 1, 0); + template + const Vec VecConsts::unit = Vec(1, 1, 1); - template - const Vec VecConsts::unitZ 
= Vec(0, 0, 1); + template + const Vec VecConsts::negaUnitX = Vec(-1, 0, 0); - template - const Vec VecConsts::unit = Vec(1, 1, 1); + template + const Vec VecConsts::negaUnitY = Vec(0, -1, 0); - template - const Vec VecConsts::negaUnitX = Vec(-1, 0, 0); + template + const Vec VecConsts::negaUnitZ = Vec(0, 0, -1); - template - const Vec VecConsts::negaUnitY = Vec(0, -1, 0); + template + const Vec VecConsts::negaUnit = Vec(-1, -1, -1); - template - const Vec VecConsts::negaUnitZ = Vec(0, 0, -1); + template + const Vec VecConsts::zero = Vec(); - template - const Vec VecConsts::negaUnit = Vec(-1, -1, -1); + template + const Vec VecConsts::unitX = Vec(1, 0, 0, 0); - template - const Vec VecConsts::zero = Vec(); + template + const Vec VecConsts::unitY = Vec(0, 1, 0, 0); - template - const Vec VecConsts::unitX = Vec(1, 0, 0, 0); + template + const Vec VecConsts::unitZ = Vec(0, 0, 1, 0); - template - const Vec VecConsts::unitY = Vec(0, 1, 0, 0); + template + const Vec VecConsts::unitW = Vec(0, 0, 0, 1); - template - const Vec VecConsts::unitZ = Vec(0, 0, 1, 0); + template + const Vec VecConsts::unit = Vec(1, 1, 1, 1); - template - const Vec VecConsts::unitW = Vec(0, 0, 0, 1); + template + const Vec VecConsts::negaUnitX = Vec(-1, 0, 0, 0); - template - const Vec VecConsts::unit = Vec(1, 1, 1, 1); + template + const Vec VecConsts::negaUnitY = Vec(0, -1, 0, 0); - template - const Vec VecConsts::negaUnitX = Vec(-1, 0, 0, 0); + template + const Vec VecConsts::negaUnitZ = Vec(0, 0, -1, 0); - template - const Vec VecConsts::negaUnitY = Vec(0, -1, 0, 0); + template + const Vec VecConsts::negaUnitW = Vec(0, 0, 0, -1); - template - const Vec VecConsts::negaUnitZ = Vec(0, 0, -1, 0); + template + const Vec VecConsts::negaUnit = Vec(-1, -1, -1, -1); +} - template - const Vec VecConsts::negaUnitW = Vec(0, 0, 0, -1); +namespace Common::Internal { + // Per-backend dispatch for the hot element-wise arithmetic. 
The primary template is the original scalar loop, so + // any (T, L, B) without a dedicated specialization degrades gracefully to scalar (this includes B == simd for + // types/dims that have no SIMD path). This block sits after the BaseVec specialization because the SIMD + // specialization below is a full (non-template) specialization whose member bodies eagerly need Vec's + // `data`. + template + struct VecOps { + static Vec Add(const Vec& a, const Vec& b) + { + Vec result; + for (auto i = 0; i < L; i++) { result.data[i] = a.data[i] + b.data[i]; } + return result; + } - template - const Vec VecConsts::negaUnit = Vec(-1, -1, -1, -1); + static Vec Sub(const Vec& a, const Vec& b) + { + Vec result; + for (auto i = 0; i < L; i++) { result.data[i] = a.data[i] - b.data[i]; } + return result; + } - template - Vec::Vec() : BaseVec(0) + static Vec Mul(const Vec& a, const Vec& b) + { + Vec result; + for (auto i = 0; i < L; i++) { result.data[i] = a.data[i] * b.data[i]; } + return result; + } + + static Vec Div(const Vec& a, const Vec& b) + { + Vec result; + for (auto i = 0; i < L; i++) { result.data[i] = a.data[i] / b.data[i]; } + return result; + } + + static Vec AddScalar(const Vec& a, T b) + { + Vec result; + for (auto i = 0; i < L; i++) { result.data[i] = a.data[i] + b; } + return result; + } + + static Vec SubScalar(const Vec& a, T b) + { + Vec result; + for (auto i = 0; i < L; i++) { result.data[i] = a.data[i] - b; } + return result; + } + + static Vec MulScalar(const Vec& a, T b) + { + Vec result; + for (auto i = 0; i < L; i++) { result.data[i] = a.data[i] * b; } + return result; + } + + static Vec DivScalar(const Vec& a, T b) + { + Vec result; + for (auto i = 0; i < L; i++) { result.data[i] = a.data[i] / b; } + return result; + } + + static T Dot(const Vec& a, const Vec& b) + { + T result = 0; + for (auto i = 0; i < L; i++) { result += a.data[i] * b.data[i]; } + return result; + } + }; + + // Vec is backed by float[4] (16 bytes), so unaligned 128-bit loads/stores 
stay in bounds. Vec3 is + // intentionally left to the scalar primary template: its float[3] storage cannot be loaded with a 128-bit load + // without reading out of bounds, and a masked load would not beat the scalar loop. + template <> + struct VecOps { + using V = Vec; + + static V Add(const V& a, const V& b) { V r; Simd::MapBinary<4>(r.data, a.data, b.data, Simd::AddOp {}); return r; } + static V Sub(const V& a, const V& b) { V r; Simd::MapBinary<4>(r.data, a.data, b.data, Simd::SubOp {}); return r; } + static V Mul(const V& a, const V& b) { V r; Simd::MapBinary<4>(r.data, a.data, b.data, Simd::MulOp {}); return r; } + static V Div(const V& a, const V& b) { V r; Simd::MapBinary<4>(r.data, a.data, b.data, Simd::DivOp {}); return r; } + + static V AddScalar(const V& a, float b) { V r; Simd::MapScalar<4>(r.data, a.data, b, Simd::AddOp {}); return r; } + static V SubScalar(const V& a, float b) { V r; Simd::MapScalar<4>(r.data, a.data, b, Simd::SubOp {}); return r; } + static V MulScalar(const V& a, float b) { V r; Simd::MapScalar<4>(r.data, a.data, b, Simd::MulOp {}); return r; } + static V DivScalar(const V& a, float b) { V r; Simd::MapScalar<4>(r.data, a.data, b, Simd::DivOp {}); return r; } + + static float Dot(const V& a, const V& b) + { + return Simd::Sum(Simd::Mul(Simd::LoadU(a.data), Simd::LoadU(b.data))); + } + }; +} + +namespace Common { + template + Vec::Vec() : BaseVec(0) { } - template - Vec::Vec(T inValue) : BaseVec(inValue) + template + Vec::Vec(T inValue) : BaseVec(inValue) { } - template - Vec::Vec(const Vec& other) + template + Vec::Vec(const Vec& other) { for (auto i = 0; i < L; i++) { this->data[i] = other.data[i]; } } - template - Vec::Vec(Vec&& other) noexcept + template + Vec::Vec(Vec&& other) noexcept { for (auto i = 0; i < L; i++) { this->data[i] = std::move(other.data[i]); } } - template - Vec& Vec::operator=(const Vec& other) + template + Vec& Vec::operator=(const Vec& other) { for (auto i = 0; i < L; i++) { this->data[i] = other.data[i]; 
@@ -535,26 +639,26 @@ namespace Common { return *this; } - template + template template - Vec::Vec(IT&&... inValues) : BaseVec(std::forward(inValues)...) + Vec::Vec(IT&&... inValues) : BaseVec(std::forward(inValues)...) { } - template - T& Vec::operator[](uint32_t i) + template + T& Vec::operator[](uint32_t i) { return this->data[i]; } - template - T Vec::operator[](uint32_t i) const + template + T Vec::operator[](uint32_t i) const { return this->data[i]; } - template - bool Vec::operator==(T rhs) const + template + bool Vec::operator==(T rhs) const { bool result = true; for (auto i = 0; i < L; i++) { @@ -563,8 +667,8 @@ namespace Common { return result; } - template - bool Vec::operator==(const Vec& rhs) const + template + bool Vec::operator==(const Vec& rhs) const { bool result = true; for (auto i = 0; i < L; i++) { @@ -573,232 +677,174 @@ namespace Common { return result; } - template - bool Vec::operator!=(T rhs) const + template + bool Vec::operator!=(T rhs) const { return !this->operator==(rhs); } - template - bool Vec::operator!=(const Vec& rhs) const + template + bool Vec::operator!=(const Vec& rhs) const { return !this->operator==(rhs); } - template - Vec Vec::operator+(T rhs) const + template + Vec Vec::operator+(T rhs) const { - Vec result; - for (auto i = 0; i < L; i++) { - result.data[i] = this->data[i] + rhs; - } - return result; + return Internal::VecOps::AddScalar(*this, rhs); } - template - Vec Vec::operator-(T rhs) const + template + Vec Vec::operator-(T rhs) const { - Vec result; - for (auto i = 0; i < L; i++) { - result.data[i] = this->data[i] - rhs; - } - return result; + return Internal::VecOps::SubScalar(*this, rhs); } - template - Vec Vec::operator*(T rhs) const + template + Vec Vec::operator*(T rhs) const { - Vec result; - for (auto i = 0; i < L; i++) { - result.data[i] = this->data[i] * rhs; - } - return result; + return Internal::VecOps::MulScalar(*this, rhs); } - template - Vec Vec::operator/(T rhs) const + template + Vec 
Vec::operator/(T rhs) const { - Vec result; - for (auto i = 0; i < L; i++) { - result.data[i] = this->data[i] / rhs; - } - return result; + return Internal::VecOps::DivScalar(*this, rhs); } - template - Vec Vec::operator+(const Vec& rhs) const + template + Vec Vec::operator+(const Vec& rhs) const { - Vec result; - for (auto i = 0; i < L; i++) { - result.data[i] = this->data[i] + rhs.data[i]; - } - return result; + return Internal::VecOps::Add(*this, rhs); } - template - Vec Vec::operator-(const Vec& rhs) const + template + Vec Vec::operator-(const Vec& rhs) const { - Vec result; - for (auto i = 0; i < L; i++) { - result.data[i] = this->data[i] - rhs.data[i]; - } - return result; + return Internal::VecOps::Sub(*this, rhs); } - template - Vec Vec::operator*(const Vec& rhs) const + template + Vec Vec::operator*(const Vec& rhs) const { - Vec result; - for (auto i = 0; i < L; i++) { - result.data[i] = this->data[i] * rhs.data[i]; - } - return result; + return Internal::VecOps::Mul(*this, rhs); } - template - Vec Vec::operator/(const Vec& rhs) const + template + Vec Vec::operator/(const Vec& rhs) const { - Vec result; - for (auto i = 0; i < L; i++) { - result.data[i] = this->data[i] / rhs.data[i]; - } - return result; + return Internal::VecOps::Div(*this, rhs); } - template - Vec& Vec::operator+=(T rhs) + template + Vec& Vec::operator+=(T rhs) { - for (auto i = 0; i < L; i++) { - this->data[i] += rhs; - } + *this = Internal::VecOps::AddScalar(*this, rhs); return *this; } - template - Vec& Vec::operator-=(T rhs) + template + Vec& Vec::operator-=(T rhs) { - for (auto i = 0; i < L; i++) { - this->data[i] -= rhs; - } + *this = Internal::VecOps::SubScalar(*this, rhs); return *this; } - template - Vec& Vec::operator*=(T rhs) + template + Vec& Vec::operator*=(T rhs) { - for (auto i = 0; i < L; i++) { - this->data[i] *= rhs; - } + *this = Internal::VecOps::MulScalar(*this, rhs); return *this; } - template - Vec& Vec::operator/=(T rhs) + template + Vec& Vec::operator/=(T rhs) { - 
for (auto i = 0; i < L; i++) { - this->data[i] /= rhs; - } + *this = Internal::VecOps::DivScalar(*this, rhs); return *this; } - template - Vec& Vec::operator+=(const Vec& rhs) + template + Vec& Vec::operator+=(const Vec& rhs) { - for (auto i = 0; i < L; i++) { - this->data[i] += rhs.data[i]; - } + *this = Internal::VecOps::Add(*this, rhs); return *this; } - template - Vec& Vec::operator-=(const Vec& rhs) + template + Vec& Vec::operator-=(const Vec& rhs) { - for (auto i = 0; i < L; i++) { - this->data[i] -= rhs.data[i]; - } + *this = Internal::VecOps::Sub(*this, rhs); return *this; } - template - Vec& Vec::operator*=(const Vec& rhs) + template + Vec& Vec::operator*=(const Vec& rhs) { - for (auto i = 0; i < L; i++) { - this->data[i] *= rhs.data[i]; - } + *this = Internal::VecOps::Mul(*this, rhs); return *this; } - template - Vec& Vec::operator/=(const Vec& rhs) + template + Vec& Vec::operator/=(const Vec& rhs) { - for (auto i = 0; i < L; i++) { - this->data[i] /= rhs.data[i]; - } + *this = Internal::VecOps::Div(*this, rhs); return *this; } - template + template template - Vec Vec::CastTo() const + Vec Vec::CastTo() const { - Vec result; + Vec result; for (auto i = 0; i < L; i++) { result.data[i] = static_cast(this->data[i]); } return result; } - template + template template - Vec Vec::SubVec() const + Vec Vec::SubVec() const { - Vec result; - Internal::CopyValueToSubVec, Vec, I...>(*this, result, std::make_index_sequence {}); + Vec result; + Internal::CopyValueToSubVec, Vec, I...>(*this, result, std::make_index_sequence {}); return result; } - template - T Vec::Model() const + template + T Vec::Model() const { static_assert(FloatingPoint); - T temp = 0; - for (auto i = 0; i < L; i++) { - temp += this->data[i] * this->data[i]; - } - return std::sqrt(temp); + return std::sqrt(Internal::VecOps::Dot(*this, *this)); } - template - Vec Vec::Normalized() const + template + Vec Vec::Normalized() const { return this->operator/(Model()); } - template - void Vec::Normalize() + 
template + void Vec::Normalize() { T oneOverModel = static_cast(1.0) / Model(); - for (auto i = 0; i < L; i++) { - this->data[i] *= oneOverModel; - } + *this = Internal::VecOps::MulScalar(*this, oneOverModel); } - template - T Vec::Dot(const Vec& rhs) const + template + T Vec::Dot(const Vec& rhs) const { static_assert(FloatingPoint); - T temp = 0; - for (auto i = 0; i < L; i++) { - temp += this->data[i] * rhs.data[i]; - } - return temp; + return Internal::VecOps::Dot(*this, rhs); } - template - typename Internal::VecCrossResultTraits::Type Vec::Cross(const Vec& rhs) const + template + typename Internal::VecCrossResultTraits::Type Vec::Cross(const Vec& rhs) const { static_assert(FloatingPoint && L >= 2 && L <= 3); - typename Internal::VecCrossResultTraits::Type result; + typename Internal::VecCrossResultTraits::Type result; if constexpr (L == 2) { result = this->x * rhs.y - this->y * rhs.x; } else { diff --git a/Engine/Source/Common/Include/Common/Platform.h b/Engine/Source/Common/Include/Common/Platform.h index a98c64bd..e85a819b 100644 --- a/Engine/Source/Common/Include/Common/Platform.h +++ b/Engine/Source/Common/Include/Common/Platform.h @@ -6,38 +6,18 @@ #include -// GCC and Clang in GNU mode predefine the bare macro `linux` as 1, which collides with the linux enumerators below. -// Drop it so the lowercase enumerator stays valid; the canonical `__linux__` is unaffected. 
-#ifdef linux -#undef linux +#define ARCH_X86 0 +#define ARCH_ARM 0 + +#if defined(__x86_64__) || defined(_M_X64) +#undef ARCH_X86 +#define ARCH_X86 1 +#elif defined(__aarch64__) || defined(_M_ARM64) +#undef ARCH_ARM +#define ARCH_ARM 1 #endif namespace Common { - enum class DevelopmentPlatform { - windows, - macos, - linux, - max - }; - - enum class TargetPlatform { - windows, - macos, - linux, - android, - ios, - xbox, - playStation, - nintendoSwitch, - max - }; - - enum class CpuArch { - x86, - x64, - max - }; - class PlatformUtils { public: static void SetEnvVar(const std::string& inKey, const std::string& inValue); diff --git a/Engine/Source/Common/Src/Debug.cpp b/Engine/Source/Common/Src/Debug.cpp index 47fcaa68..57e64f74 100644 --- a/Engine/Source/Common/Src/Debug.cpp +++ b/Engine/Source/Common/Src/Debug.cpp @@ -10,13 +10,10 @@ #include namespace Common { - void Debug::AssertImpl(const bool expression, const std::string& name, const std::string& file, const uint32_t line, const std::string& reason) + void Debug::AssertFailed(const char* name, const char* file, const uint32_t line, const std::string_view reason) { AutoCerrFlush; - if (expression) { - return; - } std::cerr << "Assert failed: " << name << ", " << file << ", " << line << newline; std::cerr << "Reason: " << reason << newline; diff --git a/Engine/Source/Common/Test/MathTest.cpp b/Engine/Source/Common/Test/MathTest.cpp index d757df48..55089f96 100644 --- a/Engine/Source/Common/Test/MathTest.cpp +++ b/Engine/Source/Common/Test/MathTest.cpp @@ -1337,3 +1337,158 @@ TEST(MathTest, JsonSerializationTest) FltToJson(90.0f), FltToJson(512.0f), FltToJson(1024.0f), FltToJson(1.0f), FltToJson(500.0f), "}")); } + +// ==================================== Math backends ==================================== +// The scalar and simd backends must produce identical results (within epsilon) for every operation. These tests are +// the regression guard for the SIMD specializations. 
+ +TEST(MathTest, VectorBackendConsistencyTest) +{ + using VecScalar = Vec; + using VecSimd = Vec; + + const VecScalar as(1.0f, -2.0f, 3.5f, -4.25f); + const VecScalar bs(0.5f, 6.0f, -7.0f, 8.0f); + const VecSimd ai(1.0f, -2.0f, 3.5f, -4.25f); + const VecSimd bi(0.5f, 6.0f, -7.0f, 8.0f); + + for (auto i = 0; i < 4; i++) { + ASSERT_FLOAT_EQ((as + bs)[i], (ai + bi)[i]); + ASSERT_FLOAT_EQ((as - bs)[i], (ai - bi)[i]); + ASSERT_FLOAT_EQ((as * bs)[i], (ai * bi)[i]); + ASSERT_FLOAT_EQ((as / bs)[i], (ai / bi)[i]); + ASSERT_FLOAT_EQ((as * 2.5f)[i], (ai * 2.5f)[i]); + ASSERT_FLOAT_EQ((as + 2.5f)[i], (ai + 2.5f)[i]); + } + ASSERT_FLOAT_EQ(as.Dot(bs), ai.Dot(bi)); + ASSERT_FLOAT_EQ(as.Model(), ai.Model()); + + for (auto i = 0; i < 4; i++) { + ASSERT_FLOAT_EQ(as.Normalized()[i], ai.Normalized()[i]); + } +} + +TEST(MathTest, MatrixBackendConsistencyTest) +{ + using MatScalar = Mat; + using MatSimd = Mat; + + const MatScalar as( + 1.0f, 2.0f, 3.0f, 4.0f, + 5.0f, 6.0f, 7.0f, 8.0f, + 9.0f, 10.0f, 11.0f, 12.0f, + 13.0f, 14.0f, 15.0f, 16.0f); + const MatScalar bs( + 17.0f, 18.0f, 19.0f, 20.0f, + 21.0f, 22.0f, 23.0f, 24.0f, + 25.0f, 26.0f, 27.0f, 28.0f, + 29.0f, 30.0f, 31.0f, 32.0f); + const MatSimd ai( + 1.0f, 2.0f, 3.0f, 4.0f, + 5.0f, 6.0f, 7.0f, 8.0f, + 9.0f, 10.0f, 11.0f, 12.0f, + 13.0f, 14.0f, 15.0f, 16.0f); + const MatSimd bi( + 17.0f, 18.0f, 19.0f, 20.0f, + 21.0f, 22.0f, 23.0f, 24.0f, + 25.0f, 26.0f, 27.0f, 28.0f, + 29.0f, 30.0f, 31.0f, 32.0f); + + const Vec vs(2.0f, -3.0f, 5.0f, -7.0f); + const Vec vi(2.0f, -3.0f, 5.0f, -7.0f); + + const MatScalar mulS = as * bs; + const MatSimd mulI = ai * bi; + const MatScalar addS = as + bs; + const MatSimd addI = ai + bi; + const MatScalar scaleS = as * 3.0f; + const MatSimd scaleI = ai * 3.0f; + const MatScalar transS = as.Transpose(); + const MatSimd transI = ai.Transpose(); + const auto mulVecS = as * vs; + const auto mulVecI = ai * vi; + for (auto i = 0; i < 16; i++) { + ASSERT_FLOAT_EQ(mulS[i], mulI[i]); + ASSERT_FLOAT_EQ(addS[i], 
addI[i]); + ASSERT_FLOAT_EQ(scaleS[i], scaleI[i]); + ASSERT_FLOAT_EQ(transS[i], transI[i]); + } + for (auto i = 0; i < 4; i++) { + ASSERT_FLOAT_EQ(mulVecS[i], mulVecI[i]); + } + + using Mat3Scalar = Mat; + using Mat3Simd = Mat; + + const Mat3Scalar a3s(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 10.0f); + const Mat3Scalar b3s(11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 20.0f); + const Mat3Simd a3i(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 10.0f); + const Mat3Simd b3i(11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 20.0f); + + const Vec v3s(2.0f, -3.0f, 5.0f); + const Vec v3i(2.0f, -3.0f, 5.0f); + + const Mat3Scalar mul3S = a3s * b3s; + const Mat3Simd mul3I = a3i * b3i; + const Mat3Scalar add3S = a3s + b3s; + const Mat3Simd add3I = a3i + b3i; + const Mat3Scalar scale3S = a3s * 3.0f; + const Mat3Simd scale3I = a3i * 3.0f; + const Mat3Scalar trans3S = a3s.Transpose(); + const Mat3Simd trans3I = a3i.Transpose(); + const auto mulVec3S = a3s * v3s; + const auto mulVec3I = a3i * v3i; + for (auto i = 0; i < 9; i++) { + ASSERT_FLOAT_EQ(mul3S[i], mul3I[i]); + ASSERT_FLOAT_EQ(add3S[i], add3I[i]); + ASSERT_FLOAT_EQ(scale3S[i], scale3I[i]); + ASSERT_FLOAT_EQ(trans3S[i], trans3I[i]); + } + for (auto i = 0; i < 3; i++) { + ASSERT_FLOAT_EQ(mulVec3S[i], mulVec3I[i]); + } + + // Deliberately asymmetric: a symmetric source has a symmetric adjugate, which would mask a transpose/column mix-up + // in the SIMD inverse. 
+ const MatScalar invSrcS( + 5.0f, 1.0f, 7.0f, 2.0f, + 4.0f, 2.0f, 6.0f, 5.0f, + 3.0f, 5.0f, 1.0f, 8.0f, + 1.0f, 2.0f, 3.0f, 4.0f); + const MatSimd invSrcI( + 5.0f, 1.0f, 7.0f, 2.0f, + 4.0f, 2.0f, 6.0f, 5.0f, + 3.0f, 5.0f, 1.0f, 8.0f, + 1.0f, 2.0f, 3.0f, 4.0f); + const MatScalar invS = invSrcS.Inverse(); + const MatSimd invI = invSrcI.Inverse(); + for (auto i = 0; i < 16; i++) { + ASSERT_FLOAT_EQ(invS[i], invI[i]); + } +} + +TEST(MathTest, QuaternionBackendConsistencyTest) +{ + using QuatScalar = Quaternion; + using QuatSimd = Quaternion; + + const QuatScalar as(1.0f, 2.0f, 3.0f, 4.0f); + const QuatScalar bs(5.0f, 6.0f, 7.0f, 8.0f); + const QuatSimd ai(1.0f, 2.0f, 3.0f, 4.0f); + const QuatSimd bi(5.0f, 6.0f, 7.0f, 8.0f); + + const QuatScalar mulS = as * bs; + const QuatSimd mulI = ai * bi; + const QuatScalar addS = as + bs; + const QuatSimd addI = ai + bi; + ASSERT_FLOAT_EQ(mulS.w, mulI.w); + ASSERT_FLOAT_EQ(mulS.x, mulI.x); + ASSERT_FLOAT_EQ(mulS.y, mulI.y); + ASSERT_FLOAT_EQ(mulS.z, mulI.z); + ASSERT_FLOAT_EQ(addS.w, addI.w); + ASSERT_FLOAT_EQ(addS.x, addI.x); + ASSERT_FLOAT_EQ(addS.y, addI.y); + ASSERT_FLOAT_EQ(addS.z, addI.z); + ASSERT_FLOAT_EQ(as.Dot(bs), ai.Dot(bi)); + ASSERT_FLOAT_EQ(as.Model(), ai.Model()); +} diff --git a/ThirdParty/Registry.cmake b/ThirdParty/Registry.cmake index 1e50afeb..f8550eba 100644 --- a/ThirdParty/Registry.cmake +++ b/ThirdParty/Registry.cmake @@ -9,6 +9,7 @@ find_package(glfw3 REQUIRED GLOBAL) find_package(stb REQUIRED GLOBAL) find_package(cityhash REQUIRED GLOBAL) find_package(GTest REQUIRED GLOBAL) +find_package(benchmark REQUIRED GLOBAL) find_package(Taskflow REQUIRED GLOBAL) find_package(libclang REQUIRED GLOBAL) find_package(assimp REQUIRED GLOBAL) diff --git a/Tool/MirrorTool/Src/Parser.cpp b/Tool/MirrorTool/Src/Parser.cpp index 5ea6d07e..a1641180 100644 --- a/Tool/MirrorTool/Src/Parser.cpp +++ b/Tool/MirrorTool/Src/Parser.cpp @@ -464,6 +464,7 @@ namespace MirrorTool { std::vector argumentStrs = { "-x", "c++", "-std=c++20", 
+ "-DMIRROR_TOOL_PARSING=1", #if BUILD_CONFIG_DEBUG "-DBUILD_CONFIG_DEBUG=1", #else diff --git a/conanfile.py b/conanfile.py index 0b3df166..a841f60e 100644 --- a/conanfile.py +++ b/conanfile.py @@ -9,6 +9,7 @@ def requirements(self): self.requires("stb/cci.20230920") self.requires("cityhash/1.0.1") self.requires("gtest/1.17.0") + self.requires("benchmark/1.9.5") self.requires("taskflow/3.10.0") self.requires("vulkan-headers/1.4.350.0") self.requires("vulkan-loader/1.4.350.0")