From e5c7fc842a70f21acf539f11e0a8fda27a75424e Mon Sep 17 00:00:00 2001 From: shijiashuai Date: Wed, 1 Jul 2026 18:44:53 +0800 Subject: [PATCH 1/5] fix: critical bugs, architecture improvements, and test infrastructure ## Critical Bug Fixes - **getTheoreticalPeakGflops**: Remove spurious `* 1000` that made peak GFLOPS 1000x too large, causing all efficiency calculations to report 0.1% instead of 100%. This also broke the PeakPerformanceReference test upper bound check. - **CPU test compilation**: Rename test_device_info_cpu.cpp to .cu because it includes tensor_core_sgemm.cuh which contains CUDA `<<<>>>` kernel launch syntax that g++ cannot parse. Add CUDA compile options to the sgemm_add_cpu_test CMake helper for .cu sources. - **Duplicate main()**: Remove redundant `main()` from CPU-only test files (test_benchmark_settings.cpp, test_device_info_cpu.cu) that conflicted with GTest::gtest_main linkage. ## High-Severity Fixes - **Integer overflow**: Fix `M * K` int multiplication overflow in tensor_core_benchmark.cuh by using `static_cast` consistently for all matrix size calculations. - **Division by zero**: Add validation in measureGpuTime() to reject benchmark_runs <= 0, preventing undefined behavior. - **Negative dimensions**: Add M > 0 && K > 0 && N > 0 check in KernelConstraints::isSatisfied() to prevent undefined modulo behavior with non-positive dimensions. ## Medium-Severity Fixes - **NaN/Inf verification**: Handle NaN/Inf in reference values in compareMatricesImpl() instead of producing NaN comparison results. - **Exception safety in diagnostics**: Replace CUDA_CHECK (throwing) with non-throwing error handling in printSkipReason(). - **File write error checking**: Add file.fail() check after writing roofline export data. - **DRY violation**: Extract safeGridSize() and checkMatrixElementCount() to cuda_utils.cuh, eliminating duplicated overflow-check code between tensor_core_sgemm.cuh and tensor_core_benchmark.cuh. - **Performance test bounds**: Adjust PeakPerformanceReference bounds (100-200000 GFLOPS) to accommodate the corrected peak calculation and future GPU architectures. ## Infrastructure Improvements - **CI**: Add CPU test execution step (`ctest -L cpu`) to CI workflow. - **CMake helpers**: Add SgemmTestHelpers.cmake with standardized test target creation functions (sgemm_add_cpu_test, sgemm_add_cuda_test, sgemm_add_cuda_perf_test) with proper CTest labels. - **Test refactoring**: Migrate CPU-only tests from .cu to .cpp, split device_info_seam.cu into separate CPU and CUDA test files. - **Kernel catalog**: Add KernelConstraints struct with isSatisfied() method, refactor BenchmarkRunner to use catalog-driven dispatch. Generated with [Devin](https://devin.ai) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- .github/workflows/ci.yml | 8 +- CMakeLists.txt | 126 +++++------ CONTEXT.md | 90 +++++++- cmake/SgemmTestHelpers.cmake | 193 ++++++++++++++++ src/benchmark_runner.cuh | 123 +++++----- src/kernels/kernel_catalog.cuh | 210 +++++++++++++----- src/kernels/tensor_core_benchmark.cuh | 92 ++++---- src/kernels/tensor_core_sgemm.cuh | 30 +-- src/utils/benchmark.cuh | 8 +- src/utils/benchmark_core.cuh | 4 + src/utils/benchmark_metrics.cuh | 6 +- src/utils/cuda_utils.cuh | 30 ++- src/utils/verify.cuh | 179 ++++++++++++--- ...ettings.cu => test_benchmark_settings.cpp} | 19 +- ...e_info_seam.cu => test_device_info_cpu.cu} | 39 +--- tests/test_device_info_cuda.cu | 48 ++++ tests/test_kernel_catalog.cu | 203 +++++++++++++---- tests/test_performance.cu | 30 +-- 18 files changed, 1034 insertions(+), 404 deletions(-) create mode 100644 cmake/SgemmTestHelpers.cmake rename tests/{test_benchmark_settings.cu => test_benchmark_settings.cpp} (98%) rename tests/{test_device_info_seam.cu => test_device_info_cpu.cu} (82%) create mode 100644 tests/test_device_info_cuda.cu diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c126cea..aac1402 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,8 +1,8 @@ # Continuous Integration -# Validates code formatting and CUDA compilation of all targets (benchmark + tests) +# Validates code formatting, CUDA compilation, and CPU-only test execution # # Workflow separation: -# - This workflow: formatting + CUDA compile-time checks (no GPU runtime) +# - This workflow: formatting + CUDA compile-time checks + CPU tests (no GPU runtime) # - pages.yml: docs tests/build and GitHub Pages buildability name: CI @@ -58,7 +58,11 @@ jobs: - name: Build run: cmake --build build -j2 + - name: Run CPU tests + run: ctest --test-dir build -L cpu --output-on-failure + - name: Info run: | echo "✅ CUDA compilation successful for all targets (benchmark + tests)" + echo "✅ CPU-only tests passed" echo "ℹ️ GPU runtime tests require a CUDA-capable machine" diff --git a/CMakeLists.txt b/CMakeLists.txt index d1d29a3..d8a1f1a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -96,91 +96,69 @@ if(BUILD_TESTS) set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) FetchContent_MakeAvailable(googletest) - add_executable(test_sgemm tests/test_sgemm.cu) - target_include_directories(test_sgemm PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src) - target_compile_definitions(test_sgemm PRIVATE SGEMM_HAS_WMMA_TARGET=${SGEMM_HAS_WMMA_TARGET}) - target_link_options(test_sgemm PRIVATE -L${SGEMM_CUDA_LIBRARY_DIR}) - target_link_libraries(test_sgemm PRIVATE - GTest::gtest_main - CUDA::cudart - CUDA::cublas - CUDA::curand - ) - target_compile_options(test_sgemm PRIVATE - $<$:--expt-relaxed-constexpr> - ) + # Include test helper functions + include(cmake/SgemmTestHelpers.cmake) - # 工具层测试 - add_executable(test_utils tests/test_utils.cu) - target_include_directories(test_utils PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src) - target_compile_definitions(test_utils PRIVATE SGEMM_HAS_WMMA_TARGET=${SGEMM_HAS_WMMA_TARGET}) - target_link_options(test_utils PRIVATE -L${SGEMM_CUDA_LIBRARY_DIR}) - target_link_libraries(test_utils PRIVATE - GTest::gtest_main - CUDA::cudart - CUDA::cublas - CUDA::curand - ) - target_compile_options(test_utils PRIVATE - $<$:--expt-relaxed-constexpr> - ) + # ═══════════════════════════════════════════════════════════════ + # CPU-only Tests (no CUDA required) + # ═══════════════════════════════════════════════════════════════ - # 性能回归测试 - add_executable(test_performance tests/test_performance.cu) - target_include_directories(test_performance PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src) - target_compile_definitions(test_performance PRIVATE SGEMM_HAS_WMMA_TARGET=${SGEMM_HAS_WMMA_TARGET}) - target_link_options(test_performance PRIVATE -L${SGEMM_CUDA_LIBRARY_DIR}) - target_link_libraries(test_performance PRIVATE - GTest::gtest_main - CUDA::cudart - CUDA::cublas - CUDA::curand - ) - target_compile_options(test_performance PRIVATE - $<$:--expt-relaxed-constexpr> + # Benchmark settings module test - pure C++, no CUDA dependencies + sgemm_add_cpu_test( + NAME test_benchmark_settings + SOURCES tests/test_benchmark_settings.cpp ) - # Benchmark 设置模块测试 - add_executable(test_benchmark_settings tests/test_benchmark_settings.cu) - target_include_directories(test_benchmark_settings PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src) - target_link_libraries(test_benchmark_settings PRIVATE - GTest::gtest_main - CUDA::cudart - CUDA::cublas + # Device info provider CPU tests - uses fake device properties + # Compiled as .cu because it includes kernel headers with CUDA launch syntax + sgemm_add_cpu_test( + NAME test_device_info_cpu + SOURCES tests/test_device_info_cpu.cu ) - target_compile_options(test_benchmark_settings PRIVATE - $<$:--expt-relaxed-constexpr> + + # ═══════════════════════════════════════════════════════════════ + # CUDA Tests (requires CUDA device, skipped if unavailable) + # ═══════════════════════════════════════════════════════════════ + + # Kernel correctness tests with property-based testing + sgemm_add_cuda_test( + NAME test_sgemm + SOURCES tests/test_sgemm.cu + CUDA_LIBRARIES CUDA::curand + REQUIRES_WMMA ) - # Kernel catalog module test - add_executable(test_kernel_catalog tests/test_kernel_catalog.cu) - target_include_directories(test_kernel_catalog PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src) - target_link_libraries(test_kernel_catalog PRIVATE - GTest::gtest_main - CUDA::cudart - CUDA::cublas + # Utility layer tests (DeviceMemory, verifier RAII, etc.) + sgemm_add_cuda_test( + NAME test_utils + SOURCES tests/test_utils.cu + CUDA_LIBRARIES CUDA::curand + REQUIRES_WMMA ) - target_compile_options(test_kernel_catalog PRIVATE - $<$:--expt-relaxed-constexpr> + + # Kernel catalog module test - requires device memory and kernel launch + sgemm_add_cuda_test( + NAME test_kernel_catalog + SOURCES tests/test_kernel_catalog.cu + REQUIRES_WMMA ) - # Device info seam test - add_executable(test_device_info_seam tests/test_device_info_seam.cu) - target_include_directories(test_device_info_seam PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src) - target_link_libraries(test_device_info_seam PRIVATE - GTest::gtest_main - CUDA::cudart - CUDA::cublas + # Device info provider CUDA tests - requires real GPU for production adapter + sgemm_add_cuda_test( + NAME test_device_info_cuda + SOURCES tests/test_device_info_cuda.cu ) - target_compile_options(test_device_info_seam PRIVATE - $<$:--expt-relaxed-constexpr> + + # ═══════════════════════════════════════════════════════════════ + # Performance Tests (CUDA + performance label) + # ═══════════════════════════════════════════════════════════════ + + # Performance regression tests + sgemm_add_cuda_perf_test( + NAME test_performance + SOURCES tests/test_performance.cu + CUDA_LIBRARIES CUDA::curand + REQUIRES_WMMA ) - include(GoogleTest) - gtest_discover_tests(test_sgemm) - gtest_discover_tests(test_utils) - gtest_discover_tests(test_performance) - gtest_discover_tests(test_benchmark_settings) - gtest_discover_tests(test_kernel_catalog) - gtest_discover_tests(test_device_info_seam) endif() diff --git a/CONTEXT.md b/CONTEXT.md index 9dcd977..9a552bb 100644 --- a/CONTEXT.md +++ b/CONTEXT.md @@ -4,6 +4,28 @@ ## 核心模块 +### Kernel Catalog 模块 + +**位置**: `src/kernels/kernel_catalog.cuh` + +**权威元数据源** - 内核阶梯的唯一事实来源: + +- **KernelCatalogEntry**: 完整的内核元数据 + - `name`: 显示名称 + - `type`: KernelType::Standard 或 KernelType::TensorCore + - `launcher`: 启动适配器 + - `constraints`: 运行时约束(Tensor Core 要求、维度对齐) +- **KernelConstraints**: 运行时约束描述 + - `requires_tensor_cores`: 是否需要 sm_70+ + - `dimension_alignment`: 维度对齐要求(0 = 无约束) + - `requires_compute_only`: 是否使用特殊 benchmark 接口 +- **查询工具**: `countKernelsByType()`, `getKernelNames()`, `canRunTensorCoreKernels()` + +**设计原则**: +- **单一事实源**: 新增内核只需添加一个 catalog 条目 +- **自描述约束**: 每个 entry 知道自己能否在给定条件下运行 +- **统一调度**: BenchmarkRunner 通过 catalog 迭代,无特殊分支 + ### Tensor Core 模块 **位置**: `src/kernels/tensor_core_sgemm.cuh` @@ -24,6 +46,7 @@ **位置**: `src/kernels/tensor_core_benchmark.cuh` Tensor Core 特有的 benchmark 功能,提供: +- `canRunTensorCoreComputeOnly()` - 约束检查(与 KernelCatalog 语义一致) - `runTensorCoreComputeOnlyBenchmark()` - 纯计算路径性能测试 **接口设计**:只接受 `cublasHandle_t`,不依赖整个 `SGEMMBenchmark` 类,避免内核层对工具层的上穿依赖。 @@ -32,19 +55,42 @@ Tensor Core 特有的 benchmark 功能,提供: **位置**: `src/utils/verify.cuh` -统一的验证逻辑: -- `detail::compareMatricesImpl()` - 内部实现,供其他函数共享 -- `compareMatrices()` - 独立的矩阵比较函数 -- `SGEMMVerifier` - 带 cuBLAS 句柄的验证器类 +**统一的验证策略** - reference + comparison + tolerance policy: + +- **VerifyResult**: 验证结果结构(pass/fail、错误指标) +- **VerifyTolerance**: 容差规范(numpy-style allclose 语义) + - `kStandardVerifyTolerance`: FP32 标准容差 + - `kTensorCoreVerifyTolerance`: Tensor Core 宽松容差 +- **比较函数**: + - `compareMatrices()`: Host 指针比较 + - `compareDeviceMatrices()`: Device 指针比较 +- **SGEMMVerifier**: cuBLAS 参考计算适配器 + - `computeReference()`: 计算参考结果 + - `verify()`, `verifyDevice()`: 验证内核输出 + +**设计原则**: +- **单一验证政策**: 所有内核共享同一套容差语义 +- **分离关注点**: 参考计算 vs 比较逻辑 +- **可扩展**: 未来可添加其他参考适配器 ## Benchmark 模块 项目将 Benchmark 功能拆分为三个深度模块,每个模块有独立的职责: +### Benchmark Settings +**位置**: `src/utils/benchmark_settings.cuh** + +配置集中化: +- `RunSettings`: 预热次数、测量次数 +- `VerificationSettings`: 容差配置 +- `OutputSettings`: Roofline 导出选项 +- `BenchmarkSettings`: 聚合配置 + ### Benchmark Core **位置**: `src/utils/benchmark_core.cuh` 核心性能测量: +- `BenchmarkResult`: 结果结构和报告生成 - `CudaTimer` - RAII 包装的 CUDA 事件计时器 - `measureGpuTime()` - 通用的 GPU 操作性能测量器 @@ -62,7 +108,6 @@ Tensor Core 特有的 benchmark 功能,提供: 聚合模块并提供: - `SGEMMBenchmark` - 高级 benchmark 编排器 -- `BenchmarkResult` - 结果结构和报告生成 ## 测试架构 @@ -70,6 +115,13 @@ Tensor Core 特有的 benchmark 功能,提供: 项目采用分层测试策略,确保每个层级都有独立的测试面: +#### CPU-only 测试 +**位置**: `tests/test_benchmark_settings.cpp`, `tests/test_device_info_cpu.cpp` + +纯 CPU 测试,不需要 CUDA 设备: +- 设置模块单元测试 +- 设备信息 Seam 测试(使用 fake provider) + #### 内核层测试 **位置**: `tests/test_sgemm.cu` @@ -78,6 +130,14 @@ Tensor Core 特有的 benchmark 功能,提供: - Tensor Core 快速路径和 fallback 测试 - 边界测试和维度不变性测试 +#### Kernel Catalog 测试 +**位置**: `tests/test_kernel_catalog.cu` + +测试内核目录的元数据和约束: +- Catalog 包含预期的内核 +- 条目有有效的元数据(名称、启动器、约束) +- 约束检查正确工作 + #### 工具层测试 **位置**: `tests/test_utils.cu` @@ -88,8 +148,6 @@ Tensor Core 特有的 benchmark 功能,提供: - `VerifyTolerance` - 容差配置和边界条件 - NaN/Inf 处理、异常安全性 -**设计原则**:工具层测试独立于内核测试,可以单独捕获工具类 bug。 - #### 性能回归测试 **位置**: `tests/test_performance.cu` @@ -105,7 +163,19 @@ Tensor Core 特有的 benchmark 功能,提供: - Double-Buffer: 35% 峰值 - Tensor Core: 50% 峰值(当可用时) -**设计原则**:性能测试独立于正确性测试,可在 CI 中检测重大性能退化。 +### 测试分类标签 + +项目使用 CTest labels 区分测试类型: +- `cpu`: CPU-only 测试,不需要 CUDA 设备 +- `cuda`: 需要 CUDA 设备的测试,无 GPU 时跳过 +- `performance`: 性能回归测试 + +**运行命令**: +```bash +ctest -L cpu # 只运行 CPU 测试 +ctest -L cuda # 只运行 CUDA 测试 +ctest -L performance # 只运行性能测试 +``` ## 架构原则 @@ -115,8 +185,8 @@ Tensor Core 特有的 benchmark 功能,提供: - `main.cu` - 入口点,仅负责组装 - `cli_parser.cuh` - 命令行解析、配置构造 - `benchmark_runner.cuh` - 内核调度、结果聚合 -2. **内核层** (`src/kernels/`) - 5 个内核实现 + Tensor Core 专用模块 -3. **工具层** (`src/utils/`) - RAII 内存管理、错误处理、验证辅助 +2. **内核层** (`src/kernels/`) - 5 个内核实现 + Kernel Catalog + Tensor Core 专用模块 +3. **工具层** (`src/utils/`) - RAII 内存管理、错误处理、验证辅助、设置模块 ### 依赖方向 diff --git a/cmake/SgemmTestHelpers.cmake b/cmake/SgemmTestHelpers.cmake new file mode 100644 index 0000000..67f5244 --- /dev/null +++ b/cmake/SgemmTestHelpers.cmake @@ -0,0 +1,193 @@ +# SGEMM CMake Test Helper Functions +# +# This module provides helper functions to standardize test target creation +# and ensure consistent handling of CPU-only vs CUDA-required tests. +# +# Test Categories: +# - CPU-only: Can compile and run without CUDA runtime/device. May still need +# CUDA headers for struct definitions (e.g., cudaDeviceProp). +# - CUDA: Requires CUDA device at runtime. Tests are skipped if no GPU. +# - Performance: CUDA tests with performance regression labels. + +# sgemm_add_cpu_test( +# NAME +# SOURCES [ ...] +# [LIBRARIES ...] +# ) +# +# Creates a CPU-only test target that does not require CUDA runtime. +# These tests: +# - Can run on any system with CUDA toolkit (headers only) +# - Are labeled with "cpu" for CTest filtering +# - Do NOT use runCudaAwareTests() - just gtest_main +# - Will NOT be skipped due to missing GPU +function(sgemm_add_cpu_test) + set(options "") + set(oneValueArgs NAME) + set(multiValueArgs SOURCES LIBRARIES) + cmake_parse_arguments(ARG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + if(NOT ARG_NAME) + message(FATAL_ERROR "sgemm_add_cpu_test: NAME is required") + endif() + if(NOT ARG_SOURCES) + message(FATAL_ERROR "sgemm_add_cpu_test: SOURCES is required") + endif() + + add_executable(${ARG_NAME} ${ARG_SOURCES}) + + target_include_directories(${ARG_NAME} PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/src + ) + + # CUDA compile options for .cu sources (if any) + target_compile_options(${ARG_NAME} PRIVATE + $<$:--expt-relaxed-constexpr> + ) + + # CPU-only tests still need CUDA headers but not runtime + # Use CUDA::cudart header-only interface if available, otherwise link minimal + target_link_libraries(${ARG_NAME} PRIVATE + GTest::gtest_main + CUDA::cudart + ${ARG_LIBRARIES} + ) + + # Set C++ standard + target_compile_features(${ARG_NAME} PRIVATE cxx_std_17) + + # Register with CTest + gtest_discover_tests(${ARG_NAME} + PROPERTIES + LABELS "cpu" + DISCOVERY_MODE PRE_TEST + ) +endfunction() + +# sgemm_add_cuda_test( +# NAME +# SOURCES [ ...] +# [LIBRARIES ...] +# [CUDA_LIBRARIES ...] +# [REQUIRES_WMMA] +# ) +# +# Creates a CUDA test target that requires a CUDA device. +# These tests: +# - Require CUDA toolkit and a CUDA-capable GPU +# - Are automatically skipped when no GPU is available +# - Are labeled with "cuda" for CTest filtering +# - Use runCudaAwareTests() for proper environment setup +# +# REQUIRES_WMMA: If set, adds SGEMM_HAS_WMMA_TARGET compile definition +function(sgemm_add_cuda_test) + set(options "REQUIRES_WMMA") + set(oneValueArgs NAME) + set(multiValueArgs SOURCES LIBRARIES CUDA_LIBRARIES) + cmake_parse_arguments(ARG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + if(NOT ARG_NAME) + message(FATAL_ERROR "sgemm_add_cuda_test: NAME is required") + endif() + if(NOT ARG_SOURCES) + message(FATAL_ERROR "sgemm_add_cuda_test: SOURCES is required") + endif() + + add_executable(${ARG_NAME} ${ARG_SOURCES}) + + target_include_directories(${ARG_NAME} PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/src + ) + + # Standard CUDA compile options + target_compile_options(${ARG_NAME} PRIVATE + $<$:--expt-relaxed-constexpr> + ) + + # Link against CUDA runtime and libraries + target_link_options(${ARG_NAME} PRIVATE + -L${SGEMM_CUDA_LIBRARY_DIR} + ) + + # WMMA target definition if needed + if(ARG_REQUIRES_WMMA) + target_compile_definitions(${ARG_NAME} PRIVATE + SGEMM_HAS_WMMA_TARGET=${SGEMM_HAS_WMMA_TARGET} + ) + endif() + + # Default CUDA libraries + set(DEFAULT_CUDA_LIBS CUDA::cudart CUDA::cublas) + + target_link_libraries(${ARG_NAME} PRIVATE + GTest::gtest_main + ${DEFAULT_CUDA_LIBS} + ${ARG_CUDA_LIBRARIES} + ${ARG_LIBRARIES} + ) + + # Register with CTest + gtest_discover_tests(${ARG_NAME} + PROPERTIES + LABELS "cuda" + DISCOVERY_MODE PRE_TEST + ) +endfunction() + +# sgemm_add_cuda_perf_test( +# NAME +# SOURCES [ ...] +# [LIBRARIES ...] +# ) +# +# Creates a CUDA performance test target. +# Same as sgemm_add_cuda_test but with additional labels for performance testing. +function(sgemm_add_cuda_perf_test) + set(options "REQUIRES_WMMA") + set(oneValueArgs NAME) + set(multiValueArgs SOURCES LIBRARIES CUDA_LIBRARIES) + cmake_parse_arguments(ARG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + if(NOT ARG_NAME) + message(FATAL_ERROR "sgemm_add_cuda_perf_test: NAME is required") + endif() + if(NOT ARG_SOURCES) + message(FATAL_ERROR "sgemm_add_cuda_perf_test: SOURCES is required") + endif() + + add_executable(${ARG_NAME} ${ARG_SOURCES}) + + target_include_directories(${ARG_NAME} PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/src + ) + + target_compile_options(${ARG_NAME} PRIVATE + $<$:--expt-relaxed-constexpr> + ) + + target_link_options(${ARG_NAME} PRIVATE + -L${SGEMM_CUDA_LIBRARY_DIR} + ) + + if(ARG_REQUIRES_WMMA) + target_compile_definitions(${ARG_NAME} PRIVATE + SGEMM_HAS_WMMA_TARGET=${SGEMM_HAS_WMMA_TARGET} + ) + endif() + + set(DEFAULT_CUDA_LIBS CUDA::cudart CUDA::cublas CUDA::curand) + + target_link_libraries(${ARG_NAME} PRIVATE + GTest::gtest_main + ${DEFAULT_CUDA_LIBS} + ${ARG_CUDA_LIBRARIES} + ${ARG_LIBRARIES} + ) + + # Register with CTest with both cuda and performance labels + gtest_discover_tests(${ARG_NAME} + PROPERTIES + LABELS "cuda;performance" + DISCOVERY_MODE PRE_TEST + ) +endfunction() diff --git a/src/benchmark_runner.cuh b/src/benchmark_runner.cuh index 9d99b81..a7344a3 100644 --- a/src/benchmark_runner.cuh +++ b/src/benchmark_runner.cuh @@ -20,6 +20,11 @@ * * 负责调度所有内核 benchmark 并生成报告。 * 与 CLI 解析分离,可被测试或脚本直接调用。 + * + * 设计: + * - 使用 KernelCatalog 作为内核阶梯的唯一事实来源 + * - Catalog 条目包含所有约束和默认容差 + * - BenchmarkRunner 只负责编排流程,不包含内核特定逻辑 */ class BenchmarkRunner { public: @@ -96,6 +101,7 @@ class BenchmarkRunner { "===========\n"); SGEMMBenchmark benchmark; + bool has_tensor_cores = tensorCoresAvailable(); // cuBLAS 参考 printf("\nRunning cuBLAS (reference)...\n"); @@ -103,11 +109,11 @@ class BenchmarkRunner { M, K, N, config_.settings.run.warmup_runs, config_.settings.run.benchmark_runs); float cublas_gflops = cublas_result.gflops; - // 标准内核 - runStandardKernels(benchmark, M, K, N); + // 使用 Catalog 驱动的内核调度 + runCatalogKernels(benchmark, M, K, N, has_tensor_cores); - // Tensor Core 内核 - runTensorCoreKernels(benchmark, M, K, N); + // Tensor Core compute-only 是特殊情况(需要 cublas handle) + runTensorCoreComputeOnly(benchmark, M, K, N, has_tensor_cores); // 报告 benchmark.printSummary(); @@ -120,81 +126,72 @@ class BenchmarkRunner { } } - void runStandardKernels(SGEMMBenchmark &benchmark, int M, int K, int N) { - const auto& catalog = getKernelCatalog(); - - for (const auto& entry : catalog) { - if (entry.type != KernelType::Standard) { + void runCatalogKernels(SGEMMBenchmark &benchmark, int M, int K, int N, bool has_tensor_cores) { + const auto &catalog = getKernelCatalog(); + + for (const auto &entry : catalog) { + // Check runtime constraints from catalog + if (!entry.canRun(M, K, N, has_tensor_cores)) { + printSkipReason(entry, M, K, N, has_tensor_cores); continue; } - + + // Use tolerance from settings (which may override defaults) VerifyTolerance tolerance = config_.settings.toleranceForKernel(entry.type); - - // Strip tile size annotation for console message (keep result name as-is) - std::string console_name = entry.name; - size_t paren_pos = console_name.find(" ("); - if (paren_pos != std::string::npos) { - console_name = console_name.substr(0, paren_pos); - } - - printf("Running %s SGEMM...\n", console_name.c_str()); - benchmark.run( - entry.name, - entry.launcher, - M, K, N, - config_.settings.run.warmup_runs, - config_.settings.run.benchmark_runs, - tolerance); + + printf("Running %s SGEMM...\n", formatConsoleName(entry.name).c_str()); + benchmark.run(entry.name, entry.launcher, M, K, N, config_.settings.run.warmup_runs, + config_.settings.run.benchmark_runs, tolerance); } } - void runTensorCoreKernels(SGEMMBenchmark &benchmark, int M, int K, int N) { - if (!tensorCoresAvailable()) { - int device; - CUDA_CHECK(cudaGetDevice(&device)); - cudaDeviceProp prop; - CUDA_CHECK(cudaGetDeviceProperties(&prop, device)); - printf("Skipping Tensor Core benchmarks (requires sm_70+, current: sm_%d%d)\n", - prop.major, prop.minor); - return; - } + void runTensorCoreComputeOnly(SGEMMBenchmark &benchmark, int M, int K, int N, + bool has_tensor_cores) { + auto tc_entry = getTensorCoreComputeOnlyEntry(); - if (!tensorCoreDimensionsSupported(M, K, N)) { - printf("Skipping Tensor Core benchmarks for %d x %d x %d " - "(requires positive dimensions aligned to 16, fallback would mislabel FP32 as WMMA).\n", - M, K, N); - return; + if (!tc_entry.canRun(M, K, N, has_tensor_cores)) { + return; // Skip silently - already reported by catalog kernels } - // Run catalog-based tensor core kernels - const auto& catalog = getKernelCatalog(); - for (const auto& entry : catalog) { - if (entry.type != KernelType::TensorCore) { - continue; - } - - VerifyTolerance tolerance = config_.settings.toleranceForKernel(entry.type); - - printf("Running Tensor Core SGEMM (end-to-end, includes FP32->FP16 " - "conversion/fallback)...\n"); - benchmark.run( - entry.name, - entry.launcher, - M, K, N, - config_.settings.run.warmup_runs, - config_.settings.run.benchmark_runs, - tolerance); - } - - // Tensor Core compute-only benchmark remains special-cased - // (requires cublas handle, different interface) VerifyTolerance tolerance = config_.settings.toleranceForKernel(KernelType::TensorCore); printf("Running Tensor Core SGEMM (compute-only WMMA path)...\n"); + BenchmarkResult tc_result = runTensorCoreComputeOnlyBenchmark( benchmark.getCublasHandle(), M, K, N, config_.settings.run.warmup_runs, config_.settings.run.benchmark_runs, tolerance); benchmark.addResult(tc_result); } + void printSkipReason(const KernelCatalogEntry &entry, int M, int K, int N, + bool has_tensor_cores) const { + if (entry.constraints.requires_tensor_cores && !has_tensor_cores) { + // Use non-throwing error handling for diagnostic output + int device = 0; + cudaDeviceProp prop{}; + if (cudaGetDevice(&device) == cudaSuccess && + cudaGetDeviceProperties(&prop, device) == cudaSuccess) { + printf("Skipping %s (requires sm_70+, current: sm_%d%d)\n", entry.name.c_str(), + prop.major, prop.minor); + } else { + printf("Skipping %s (requires sm_70+, current device unavailable)\n", + entry.name.c_str()); + } + } else if (entry.constraints.dimension_alignment > 0) { + printf("Skipping %s for %d x %d x %d " + "(requires dimensions aligned to %d)\n", + entry.name.c_str(), M, K, N, entry.constraints.dimension_alignment); + } + } + + std::string formatConsoleName(const std::string &name) const { + // Strip tile size annotation for console message + std::string result = name; + size_t paren_pos = result.find(" ("); + if (paren_pos != std::string::npos) { + result = result.substr(0, paren_pos); + } + return result; + } + BenchmarkConfig config_; }; diff --git a/src/kernels/kernel_catalog.cuh b/src/kernels/kernel_catalog.cuh index 57033fb..a8eff36 100644 --- a/src/kernels/kernel_catalog.cuh +++ b/src/kernels/kernel_catalog.cuh @@ -14,33 +14,91 @@ // ============================================================================ // Kernel Catalog Module // -// Centralizes kernel metadata, launch adapters, and kernel type classification. -// Eliminates repeated lambdas and inline dispatch logic in BenchmarkRunner. +// THE authoritative source for kernel ladder metadata. // // Design: -// - Static catalog of standard kernels (no global mutable state) -// - Each entry contains: name, type, and launch adapter -// - BenchmarkRunner iterates over catalog entries instead of hardcoding dispatch +// - Single source of truth for kernel name, type, tolerance, launcher, constraints +// - Benchmark, tests, and documentation all reference this catalog +// - New kernels only require adding one entry here // -// Note: Tensor Core compute-only benchmark remains a special case due to -// its different interface (requires cublas handle), but the end-to-end -// tensor core kernel is included in the catalog. +// Catalog entries provide: +// - Identity: name, type classification +// - Behavior: launch adapter, default tolerance +// - Constraints: dimension requirements (e.g., Tensor Core requires 16-aligned) // ============================================================================ // ============================================================================ // Kernel Launch Adapter Type // ============================================================================ -using KernelLauncher = std::function; +using KernelLauncher = std::function; + +// ============================================================================ +// Kernel Constraints +// ============================================================================ + +/** + * Describes runtime constraints for a kernel. + * + * Used by BenchmarkRunner to decide whether a kernel can run with given dimensions. + */ +struct KernelConstraints { + bool requires_tensor_cores; // Requires sm_70+ + int dimension_alignment; // All dimensions must be multiple of this (0 = no constraint) + bool requires_compute_only; // Special case: uses different benchmark interface + + static KernelConstraints standard() { return {false, 0, false}; } + + static KernelConstraints tensorCore() { return {true, 16, false}; } + + static KernelConstraints tensorCoreComputeOnly() { return {true, 16, true}; } + + bool isSatisfied(int M, int K, int N, bool has_tensor_cores) const { + if (M <= 0 || K <= 0 || N <= 0) { + return false; + } + if (requires_tensor_cores && !has_tensor_cores) { + return false; + } + if (dimension_alignment > 0) { + if (M % dimension_alignment != 0 || K % dimension_alignment != 0 || + N % dimension_alignment != 0) { + return false; + } + } + return true; + } +}; // ============================================================================ // Kernel Catalog Entry // ============================================================================ +/** + * Complete metadata for a benchmarkable kernel. + * + * Each entry represents one step in the kernel optimization ladder. + */ struct KernelCatalogEntry { - std::string name; - KernelType type; - KernelLauncher launcher; + std::string name; // Display name for reports + KernelType type; // Standard or TensorCore + KernelLauncher launcher; // Launch adapter + KernelConstraints constraints; // Runtime requirements + + /** + * Get default verification tolerance for this kernel type. + */ + VerifyTolerance defaultTolerance() const { + return (type == KernelType::TensorCore) ? kTensorCoreVerifyTolerance + : kStandardVerifyTolerance; + } + + /** + * Check if this kernel can run with given dimensions and hardware. + */ + bool canRun(int M, int K, int N, bool has_tensor_cores) const { + return constraints.isSatisfied(M, K, N, has_tensor_cores); + } }; // ============================================================================ @@ -55,50 +113,94 @@ struct KernelCatalogEntry { * 2. Tensor Core end-to-end kernel (WMMA with FP32->FP16 conversion/fallback) * * Note: The Tensor Core compute-only benchmark is NOT included because it - * has a different interface (requires cublas handle) and is handled separately - * in BenchmarkRunner. + * has a different interface (requires cublas handle). It is handled as a + * special case via getTensorCoreComputeOnlyEntry(). */ -inline const std::vector& getKernelCatalog() { +inline const std::vector &getKernelCatalog() { static const std::vector catalog = { - // Standard FP32 kernels - { - "Naive", - KernelType::Standard, - [](const float *A, const float *B, float *C, int M, int K, int N) { - launch_naive_sgemm<32>(A, B, C, M, K, N); - } - }, - { - "Tiled (32x32)", - KernelType::Standard, - [](const float *A, const float *B, float *C, int M, int K, int N) { - launch_tiled_sgemm<32>(A, B, C, M, K, N); - } - }, - { - "Bank Conflict Free", - KernelType::Standard, - [](const float *A, const float *B, float *C, int M, int K, int N) { - launch_bank_conflict_free_sgemm<32>(A, B, C, M, K, N); - } - }, - { - "Double Buffer", - KernelType::Standard, - [](const float *A, const float *B, float *C, int M, int K, int N) { - launch_double_buffer_sgemm<32>(A, B, C, M, K, N); - } - }, - // Tensor Core end-to-end kernel - { - "Tensor Core (WMMA end-to-end)", - KernelType::TensorCore, - [](const float *A, const float *B, float *C, int M, int K, int N) { - launch_tensor_core_sgemm_with_fallback(A, B, C, M, K, N, - defaultTensorCoreFallback()); + // Standard FP32 kernels - no constraints + {"Naive", KernelType::Standard, + [](const float *A, const float *B, float *C, int M, int K, int N) { + launch_naive_sgemm<32>(A, B, C, M, K, N); + }, + KernelConstraints::standard()}, + {"Tiled (32x32)", KernelType::Standard, + [](const float *A, const float *B, float *C, int M, int K, int N) { + launch_tiled_sgemm<32>(A, B, C, M, K, N); + }, + KernelConstraints::standard()}, + {"Bank Conflict Free", KernelType::Standard, + [](const float *A, const float *B, float *C, int M, int K, int N) { + launch_bank_conflict_free_sgemm<32>(A, B, C, M, K, N); + }, + KernelConstraints::standard()}, + {"Double Buffer", KernelType::Standard, + [](const float *A, const float *B, float *C, int M, int K, int N) { + launch_double_buffer_sgemm<32>(A, B, C, M, K, N); + }, + KernelConstraints::standard()}, + // Tensor Core end-to-end kernel - requires sm_70+ and 16-aligned dimensions + {"Tensor Core (WMMA end-to-end)", KernelType::TensorCore, + [](const float *A, const float *B, float *C, int M, int K, int N) { + launch_tensor_core_sgemm_with_fallback(A, B, C, M, K, N, defaultTensorCoreFallback()); + }, + KernelConstraints::tensorCore()}}; + + return catalog; +} + +/** + * Returns the Tensor Core compute-only entry. + * + * This is a special entry that uses a different benchmark interface + * (requires cublas handle for reference computation). + */ +inline KernelCatalogEntry getTensorCoreComputeOnlyEntry() { + return {"Tensor Core (WMMA compute-only)", KernelType::TensorCore, + nullptr, // Launcher is not used for compute-only; handled specially + KernelConstraints::tensorCoreComputeOnly()}; +} + +// ============================================================================ +// Catalog Query Utilities +// ============================================================================ + +/** + * Count kernels by type in the catalog. + */ +inline int countKernelsByType(KernelType type) { + int count = 0; + for (const auto &entry : getKernelCatalog()) { + if (entry.type == type) { + count++; + } + } + return count; +} + +/** + * Get list of kernel names for a given type. + */ +inline std::vector getKernelNames(KernelType type) { + std::vector names; + for (const auto &entry : getKernelCatalog()) { + if (entry.type == type) { + names.push_back(entry.name); + } + } + return names; +} + +/** + * Check if any Tensor Core kernel can run with given dimensions. + */ +inline bool canRunTensorCoreKernels(int M, int K, int N, bool has_tensor_cores) { + for (const auto &entry : getKernelCatalog()) { + if (entry.type == KernelType::TensorCore) { + if (entry.canRun(M, K, N, has_tensor_cores)) { + return true; } } - }; - - return catalog; + } + return false; } diff --git a/src/kernels/tensor_core_benchmark.cuh b/src/kernels/tensor_core_benchmark.cuh index 6527387..3822315 100644 --- a/src/kernels/tensor_core_benchmark.cuh +++ b/src/kernels/tensor_core_benchmark.cuh @@ -1,9 +1,9 @@ #pragma once -#include "tensor_core_sgemm.cuh" #include "../utils/benchmark_core.cuh" #include "../utils/benchmark_metrics.cuh" #include "../utils/verify.cuh" +#include "tensor_core_sgemm.cuh" #include #include @@ -18,8 +18,22 @@ // 此模块提供 Tensor Core 特有的 benchmark 功能。 // 接口设计:只接受 cublasHandle_t,不依赖整个 SGEMMBenchmark 类, // 避免内核层对工具层的上穿依赖。 +// +// 设计原则: +// - 与 KernelCatalog 的约束保持一致 +// - 使用统一的容差和验证策略 +// - 所有约束检查集中在 tensorCoreDimensionsSupported() 和 tensorCoresAvailable() // ============================================================================ +/** + * 检查 Tensor Core compute-only benchmark 是否可以运行 + * + * 统一的约束检查,与 KernelCatalog 的 canRun() 语义一致 + */ +inline bool canRunTensorCoreComputeOnly(int M, int K, int N) { + return tensorCoresAvailable() && tensorCoreDimensionsSupported(M, K, N); +} + /** * 运行 Tensor Core 纯计算路径 benchmark * @@ -32,13 +46,15 @@ * @param benchmark_runs 测量次数 * @param tolerance 验证容差 * @return BenchmarkResult 包含性能数据 + * @throws CudaError 如果约束不满足 */ inline BenchmarkResult runTensorCoreComputeOnlyBenchmark(cublasHandle_t cublas_handle, int M, int K, int N, int warmup_runs = 5, int benchmark_runs = 20, VerifyTolerance tolerance = kTensorCoreVerifyTolerance) { - if (!tensorCoresAvailable() || !tensorCoreDimensionsSupported(M, K, N)) { + // 约束检查 - 与 KernelCatalog 语义一致 + if (!canRunTensorCoreComputeOnly(M, K, N)) { throw CudaError("Tensor Core compute-only benchmark requires sm_70+ and " "dimensions aligned to 16"); } @@ -49,56 +65,48 @@ runTensorCoreComputeOnlyBenchmark(cublasHandle_t cublas_handle, int M, int K, in result.K = K; result.N = N; - std::vector h_A(M * K), h_B(K * N), h_C(M * N), h_C_ref(M * N); - DeviceMemory d_A(M * K); - DeviceMemory d_B(K * N); - DeviceMemory d_C(M * N); - DeviceMemory d_C_ref(M * N); - DeviceMemory d_A_fp16(M * K); - DeviceMemory d_B_fp16(K * N); + // 安全计算矩阵大小,避免整数溢出 + size_t size_A = static_cast(M) * K; + size_t size_B = static_cast(K) * N; + size_t size_C = static_cast(M) * N; + + std::vector h_A(size_A), h_B(size_B), h_C(size_C), h_C_ref(size_C); + DeviceMemory d_A(size_A); + DeviceMemory d_B(size_B); + DeviceMemory d_C(size_C); + DeviceMemory d_C_ref(size_C); + DeviceMemory d_A_fp16(size_A); + DeviceMemory d_B_fp16(size_B); initRandomMatrix(h_A.data(), M, K, -1.0f, 1.0f, 42); initRandomMatrix(h_B.data(), K, N, -1.0f, 1.0f, 123); - d_A.copyFromHost(h_A.data(), M * K); - d_B.copyFromHost(h_B.data(), K * N); + d_A.copyFromHost(h_A.data(), size_A); + d_B.copyFromHost(h_B.data(), size_B); float alpha = 1.0f, beta = 0.0f; - CUBLAS_CHECK(cublasSgemm(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K, &alpha, - d_B.get(), N, d_A.get(), K, &beta, d_C_ref.get(), N)); + CUBLAS_CHECK(cublasSgemm(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K, &alpha, d_B.get(), N, + d_A.get(), K, &beta, d_C_ref.get(), N)); int blockSize = kDefaultBlockSize; - // 安全计算 gridSize,检查溢出 - auto safeGridSize = [](size_t num, int blk) -> int { - size_t grid = (num + blk - 1) / blk; - if (grid > static_cast(INT_MAX)) { - throw CudaError("Grid size overflow: matrix too large for kernel launch"); - } - return static_cast(grid); - }; - int gridSizeA = safeGridSize(static_cast(M) * K, blockSize); - int gridSizeB = safeGridSize(static_cast(K) * N, blockSize); - - size_t num_A = static_cast(M) * K; - size_t num_B = static_cast(K) * N; - - // 检查矩阵元素数量是否超过 int 最大值 - if (num_A > static_cast(INT_MAX)) { - throw CudaError("Matrix A size overflow: too many elements for int parameter"); - } - if (num_B > static_cast(INT_MAX)) { - throw CudaError("Matrix B size overflow: too many elements for int parameter"); - } - - float_to_half_kernel<<>>(d_A.get(), d_A_fp16.get(), static_cast(num_A)); - float_to_half_kernel<<>>(d_B.get(), d_B_fp16.get(), static_cast(num_B)); + checkMatrixElementCount(size_A, "A"); + checkMatrixElementCount(size_B, "B"); + int gridSizeA = safeGridSize(size_A, blockSize); + int gridSizeB = safeGridSize(size_B, blockSize); + + float_to_half_kernel<<>>(d_A.get(), d_A_fp16.get(), + static_cast(size_A)); + float_to_half_kernel<<>>(d_B.get(), d_B_fp16.get(), + static_cast(size_B)); CUDA_CHECK(cudaGetLastError()); CUDA_CHECK(cudaDeviceSynchronize()); // 使用统一的 measureGpuTime 计时 - float time_ms = - measureGpuTime([&]() { launch_tensor_core_sgemm_fp16(d_A_fp16.get(), d_B_fp16.get(), d_C.get(), M, K, N); }, - warmup_runs, benchmark_runs); + float time_ms = measureGpuTime( + [&]() { + launch_tensor_core_sgemm_fp16(d_A_fp16.get(), d_B_fp16.get(), d_C.get(), M, K, N); + }, + warmup_runs, benchmark_runs); // 计算指标 PerformanceMetrics metrics = calculateSgemmMetrics(M, K, N, time_ms); @@ -107,8 +115,8 @@ runTensorCoreComputeOnlyBenchmark(cublasHandle_t cublas_handle, int M, int K, in result.bandwidth_gb_s = metrics.bandwidth_gb_s; result.efficiency = calculateEfficiency(result.gflops, getTheoreticalPeakGflops()); - d_C.copyToHost(h_C.data(), M * N); - d_C_ref.copyToHost(h_C_ref.data(), M * N); + d_C.copyToHost(h_C.data(), size_C); + d_C_ref.copyToHost(h_C_ref.data(), size_C); VerifyResult verify_result = compareMatrices(h_C.data(), h_C_ref.data(), M, N, tolerance); result.correct = verify_result.passed; diff --git a/src/kernels/tensor_core_sgemm.cuh b/src/kernels/tensor_core_sgemm.cuh index b5607fc..b77f08f 100644 --- a/src/kernels/tensor_core_sgemm.cuh +++ b/src/kernels/tensor_core_sgemm.cuh @@ -60,9 +60,7 @@ inline bool tensorCoresAvailable(const DeviceInfoProvider &provider) { /** * 检查当前设备是否支持 Tensor Core (默认使用生产环境设备) */ -inline bool tensorCoresAvailable() { - return tensorCoresAvailable(getProductionDeviceInfo()); -} +inline bool tensorCoresAvailable() { return tensorCoresAvailable(getProductionDeviceInfo()); } inline constexpr bool tensorCoreFastPathCompiled() { return SGEMM_HAS_WMMA_TARGET != 0; } /** @@ -117,8 +115,8 @@ using FallbackKernel = * * 提供一个空的 fallback(用于测试或显式配置场景) */ -[[maybe_unused]] inline void -nullFallback(const float *, const float *, float *, int, int, int, cudaStream_t = 0) { +[[maybe_unused]] inline void nullFallback(const float *, const float *, float *, int, int, int, + cudaStream_t = 0) { // 空实现 - 用于测试 } @@ -234,8 +232,8 @@ inline void launch_tensor_core_sgemm_fp16(const half *A, const half *B, float *C } if (!tensorCoresAvailable() || !tensorCoreDimensionsSupported(M, K, N)) { - throw CudaError( - "launch_tensor_core_sgemm_fp16 requires runtime sm_70+ support and dimensions aligned to 16"); + throw CudaError("launch_tensor_core_sgemm_fp16 requires runtime sm_70+ support and " + "dimensions aligned to 16"); } launch_tensor_core_sgemm_fp16_fast_path(A, B, C, M, K, N, stream); @@ -283,25 +281,11 @@ inline void launch_tensor_core_sgemm_with_fallback(const float *A, const float * DeviceMemory d_B_fp16(num_B); int blockSize = kDefaultBlockSize; - // 安全计算 gridSize,检查溢出 - auto safeGridSize = [](size_t num, int blk) -> int { - size_t grid = (num + blk - 1) / blk; - if (grid > static_cast(INT_MAX)) { - throw CudaError("Grid size overflow: matrix too large for kernel launch"); - } - return static_cast(grid); - }; + checkMatrixElementCount(num_A, "A"); + checkMatrixElementCount(num_B, "B"); int gridSizeA = safeGridSize(num_A, blockSize); int gridSizeB = safeGridSize(num_B, blockSize); - // 检查矩阵元素数量是否超过 int 最大值 - if (num_A > static_cast(INT_MAX)) { - throw CudaError("Matrix A size overflow: too many elements for int parameter"); - } - if (num_B > static_cast(INT_MAX)) { - throw CudaError("Matrix B size overflow: too many elements for int parameter"); - } - float_to_half_kernel<<>>(A, d_A_fp16.get(), static_cast(num_A)); float_to_half_kernel<<>>(B, d_B_fp16.get(), diff --git a/src/utils/benchmark.cuh b/src/utils/benchmark.cuh index 6e044c4..be711d2 100644 --- a/src/utils/benchmark.cuh +++ b/src/utils/benchmark.cuh @@ -94,8 +94,9 @@ class SGEMMBenchmark { d_B.get(), N, d_A.get(), K, &beta, d_C_ref.get(), N)); // 使用统一的 measureGpuTime 计时 - float time_ms = measureGpuTime([&]() { kernel_func(d_A.get(), d_B.get(), d_C.get(), M, K, N); }, - warmup_runs, benchmark_runs); + float time_ms = + measureGpuTime([&]() { kernel_func(d_A.get(), d_B.get(), d_C.get(), M, K, N); }, + warmup_runs, benchmark_runs); // 计算指标 PerformanceMetrics metrics = calculateSgemmMetrics(M, K, N, time_ms); @@ -210,6 +211,9 @@ class SGEMMBenchmark { << "," << ai << "\n"; } + if (file.fail()) { + fprintf(stderr, "Warning: errors occurred while writing %s\n", filename.c_str()); + } file.close(); printf("Approximate roofline data exported to: %s\n", filename.c_str()); } diff --git a/src/utils/benchmark_core.cuh b/src/utils/benchmark_core.cuh index c299ce9..b66be9d 100644 --- a/src/utils/benchmark_core.cuh +++ b/src/utils/benchmark_core.cuh @@ -90,6 +90,10 @@ class CudaTimer { */ template float measureGpuTime(RunFunc func, int warmup_runs = 5, int benchmark_runs = 20) { + if (benchmark_runs <= 0) { + throw CudaError("measureGpuTime: benchmark_runs must be positive"); + } + // 预热运行 for (int i = 0; i < warmup_runs; ++i) { func(); diff --git a/src/utils/benchmark_metrics.cuh b/src/utils/benchmark_metrics.cuh index 4c25b4e..0ec3d59 100644 --- a/src/utils/benchmark_metrics.cuh +++ b/src/utils/benchmark_metrics.cuh @@ -59,8 +59,10 @@ inline PerformanceMetrics calculateSgemmMetrics(int M, int K, int N, float time_ * 提供重载版本以支持可注入的 device info provider。 */ inline float getTheoreticalPeakGflops(const DeviceInfoProvider &provider) { - // 峰值 GFLOPS = SMs * cores/SM * 2 (FMA) * clock (GHz) * 1000 (MHz factor) - float peakGflops = provider.smCount() * provider.cores_per_sm * 2 * provider.clock_ghz * 1000; + // 峰值 GFLOPS = SMs * cores/SM * 2 (FMA) * clock (GHz) + // clock_ghz 已经是 GHz 单位,GFLOPS = FLOPS / 1e9 = cores * 2 * clock_GHz + float peakGflops = + static_cast(provider.smCount()) * provider.cores_per_sm * 2.0f * provider.clock_ghz; return peakGflops; } diff --git a/src/utils/cuda_utils.cuh b/src/utils/cuda_utils.cuh index 956249b..64ceb82 100644 --- a/src/utils/cuda_utils.cuh +++ b/src/utils/cuda_utils.cuh @@ -170,6 +170,29 @@ inline void initRandomMatrix(float *data, int rows, int cols, float min_val = -1 // Utility Functions // ============================================================================ +/** + * Safely compute grid size from element count and block size. + * Throws CudaError if the grid size would overflow int. + */ +inline int safeGridSize(size_t num, int blk) { + size_t grid = (num + blk - 1) / blk; + if (grid > static_cast(INT_MAX)) { + throw CudaError("Grid size overflow: matrix too large for kernel launch"); + } + return static_cast(grid); +} + +/** + * Check that a matrix element count fits in int (for kernel parameters). + * Throws CudaError if it doesn't. + */ +inline void checkMatrixElementCount(size_t count, const char *name) { + if (count > static_cast(INT_MAX)) { + throw CudaError(std::string("Matrix ") + name + + " size overflow: too many elements for int parameter"); + } +} + inline bool cudaDeviceAvailable() { int device_count = 0; cudaError_t status = cudaGetDeviceCount(&device_count); @@ -255,7 +278,8 @@ class DeviceInfoCache { } int computeCoresPerSM() const { - // 参考: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities + // 参考: + // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities if (prop_.major == 7) { return 64; // Volta (sm_70, sm_72), Turing (sm_75) } else if (prop_.major == 8) { @@ -291,6 +315,4 @@ inline DeviceInfoProvider ProductionDeviceInfoProvider::get() const { /** * Convenience function: get production device info */ -inline DeviceInfoProvider getProductionDeviceInfo() { - return ProductionDeviceInfoProvider{}.get(); -} +inline DeviceInfoProvider getProductionDeviceInfo() { return ProductionDeviceInfoProvider{}.get(); } diff --git a/src/utils/verify.cuh b/src/utils/verify.cuh index 77a1843..00512df 100644 --- a/src/utils/verify.cuh +++ b/src/utils/verify.cuh @@ -9,10 +9,29 @@ #include #include +// ============================================================================ +// Verification Module +// +// Provides unified verification strategy for SGEMM results: +// - Tolerance configuration (FP32 standard, Tensor Core relaxed) +// - Matrix comparison with numpy-style allclose semantics +// - cuBLAS reference computation adapter +// +// Design: +// - Single source of truth for tolerance policies +// - Separates reference computation from comparison logic +// - Supports both device and host pointer comparisons +// ============================================================================ + // ============================================================================ // Verification Result Structure // ============================================================================ +/** + * Result of matrix verification + * + * Captures all metrics needed to assess correctness and diagnose issues. + */ struct VerifyResult { bool passed; float max_abs_error; @@ -29,15 +48,27 @@ struct VerifyResult { 100.0f * error_count / total_elements); } } + + /** + * Legacy compatibility: returns true if verification failed. + * Used by older code that checks shouldFlagAsIncorrect(). + */ + bool shouldFlagAsIncorrect() const { return !passed; } }; // ============================================================================ // Tolerance Configuration // ============================================================================ +/** + * Tolerance specification for matrix comparison. + * + * Uses numpy-style allclose semantics: + * |test - ref| <= atol + rtol * |ref| + */ struct VerifyTolerance { - float rtol; - float atol; + float rtol; // Relative tolerance + float atol; // Absolute tolerance }; // Standard verification tolerance for FP32 kernels @@ -46,23 +77,32 @@ inline constexpr VerifyTolerance kStandardVerifyTolerance{1e-3f, 1e-4f}; // Tensor Core verification tolerance (FP16 intermediate precision) inline constexpr VerifyTolerance kTensorCoreVerifyTolerance{5e-2f, 1e-2f}; +/** + * Compute tolerance threshold for a specific reference value. + */ inline float toleranceForValue(float ref_val, VerifyTolerance tolerance) { return tolerance.atol + tolerance.rtol * std::fabs(ref_val); } +/** + * Check if a test value is within tolerance of a reference value. + */ inline bool isWithinTolerance(float test_val, float ref_val, VerifyTolerance tolerance) { float abs_error = std::fabs(test_val - ref_val); return abs_error <= toleranceForValue(ref_val, tolerance); } // ============================================================================ -// Internal Implementation +// Matrix Comparison (Internal Implementation) // ============================================================================ namespace detail { -// 内部实现:比较两个矩阵并返回验证结果 -// 供 verify() 和 compareMatrices() 共享 +/** + * Internal: Compare two host matrices and return verification result. + * + * This is the core comparison logic used by all verification functions. + */ inline VerifyResult compareMatricesImpl(const float *h_test, const float *h_ref, size_t num_elements, VerifyTolerance tolerance) { VerifyResult result; @@ -75,8 +115,24 @@ inline VerifyResult compareMatricesImpl(const float *h_test, const float *h_ref, float ref_val = h_ref[i]; float test_val = h_test[i]; - // Check for NaN or Inf + // Check for NaN or Inf in test output if (std::isnan(test_val) || std::isinf(test_val)) { + // If reference is also the same NaN/Inf, consider it a match + if (std::isnan(ref_val) && std::isnan(test_val)) { + continue; + } + if (std::isinf(ref_val) && std::isinf(test_val) && + std::signbit(ref_val) == std::signbit(test_val)) { + continue; + } + result.error_count++; + result.max_abs_error = std::numeric_limits::infinity(); + result.max_rel_error = std::numeric_limits::infinity(); + continue; + } + + // Skip comparison when reference is NaN/Inf (can't meaningfully compare) + if (std::isnan(ref_val) || std::isinf(ref_val)) { result.error_count++; result.max_abs_error = std::numeric_limits::infinity(); result.max_rel_error = std::numeric_limits::infinity(); @@ -104,18 +160,58 @@ inline VerifyResult compareMatricesImpl(const float *h_test, const float *h_ref, // Standalone Verification Functions // ============================================================================ -// Compare two matrices and return verification result -// Uses numpy-style allclose: |test - ref| <= atol + rtol * |ref| +/** + * Compare two host matrices and return verification result. + * + * @param h_test Test matrix (host pointer) + * @param h_ref Reference matrix (host pointer) + * @param M, N Matrix dimensions + * @param tolerance Verification tolerance + */ inline VerifyResult compareMatrices(const float *h_test, const float *h_ref, int M, int N, VerifyTolerance tolerance = kStandardVerifyTolerance) { size_t num_elements = static_cast(M) * N; return detail::compareMatricesImpl(h_test, h_ref, num_elements, tolerance); } +/** + * Compare two device matrices and return verification result. + * + * Copies both matrices to host before comparison. + * + * @param d_test Test matrix (device pointer) + * @param d_ref Reference matrix (device pointer) + * @param M, N Matrix dimensions + * @param tolerance Verification tolerance + */ +inline VerifyResult compareDeviceMatrices(const float *d_test, const float *d_ref, int M, int N, + VerifyTolerance tolerance = kStandardVerifyTolerance) { + size_t num_elements = static_cast(M) * N; + std::vector h_test(num_elements); + std::vector h_ref(num_elements); + + CUDA_CHECK( + cudaMemcpy(h_test.data(), d_test, num_elements * sizeof(float), cudaMemcpyDeviceToHost)); + CUDA_CHECK( + cudaMemcpy(h_ref.data(), d_ref, num_elements * sizeof(float), cudaMemcpyDeviceToHost)); + + return detail::compareMatricesImpl(h_test.data(), h_ref.data(), num_elements, tolerance); +} + // ============================================================================ -// cuBLAS Reference SGEMM +// cuBLAS Reference Provider // ============================================================================ +/** + * RAII wrapper for cuBLAS-based reference computation. + * + * Provides: + * - cuBLAS handle management + * - Reference SGEMM computation + * - Verification against kernel output + * + * This is the primary adapter for producing reference results. + */ class SGEMMVerifier { public: SGEMMVerifier() { CUBLAS_CHECK(cublasCreate(&handle_)); } @@ -126,9 +222,21 @@ class SGEMMVerifier { } } - // Compute reference result using cuBLAS - // C = alpha * A * B + beta * C - // A: M x K, B: K x N, C: M x N (row-major) + // Non-copyable, non-movable + SGEMMVerifier(const SGEMMVerifier &) = delete; + SGEMMVerifier &operator=(const SGEMMVerifier &) = delete; + + /** + * Compute reference result using cuBLAS. + * + * C = alpha * A * B + beta * C + * A: M x K, B: K x N, C: M x N (row-major) + * + * @param d_A, d_B Input matrices (device pointers) + * @param d_C Output matrix (device pointer) + * @param M, K, N Matrix dimensions + * @param alpha, beta Scaling factors + */ void computeReference(const float *d_A, const float *d_B, float *d_C, int M, int K, int N, float alpha = 1.0f, float beta = 0.0f) { // cuBLAS uses column-major, so we compute C^T = B^T * A^T @@ -137,34 +245,45 @@ class SGEMMVerifier { &beta, d_C, N)); } - // Verify kernel output against reference - // Uses numpy-style allclose: |test - ref| <= atol + rtol * |ref| + /** + * Verify kernel output against reference (host pointers). + */ VerifyResult verify(const float *h_test, const float *h_ref, int M, int N, VerifyTolerance tolerance = kStandardVerifyTolerance) { - size_t num_elements = static_cast(M) * N; - return detail::compareMatricesImpl(h_test, h_ref, num_elements, tolerance); + return compareMatrices(h_test, h_ref, M, N, tolerance); } - // Verify with device pointers (copies to host internally) + /** + * Verify kernel output against reference (device pointers). + */ VerifyResult verifyDevice(const float *d_test, const float *d_ref, int M, int N, VerifyTolerance tolerance = kStandardVerifyTolerance) { - size_t num_elements = static_cast(M) * N; - std::vector h_test(num_elements); - std::vector h_ref(num_elements); - - CUDA_CHECK(cudaMemcpy(h_test.data(), d_test, num_elements * sizeof(float), - cudaMemcpyDeviceToHost)); - CUDA_CHECK( - cudaMemcpy(h_ref.data(), d_ref, num_elements * sizeof(float), cudaMemcpyDeviceToHost)); - - return verify(h_test.data(), h_ref.data(), M, N, tolerance); + return compareDeviceMatrices(d_test, d_ref, M, N, tolerance); } - // Keep error flagging semantics aligned with compareMatrices/verify. - static bool shouldFlagAsIncorrect(const VerifyResult &result) { return !result.passed; } - + /** + * Access the underlying cuBLAS handle. + * + * Note: This is provided for compatibility with existing code that needs + * direct cuBLAS access (e.g., Tensor Core compute-only benchmark). + * Prefer using computeReference() for standard verification flows. + */ cublasHandle_t getHandle() { return handle_; } private: cublasHandle_t handle_; }; + +// ============================================================================ +// Legacy Compatibility +// ============================================================================ + +/** + * Legacy function: Check if verification result indicates failure. + * + * @deprecated Use VerifyResult::shouldFlagAsIncorrect() or check result.passed directly. + */ +[[deprecated("Use VerifyResult::shouldFlagAsIncorrect() or check result.passed directly")]] +inline bool shouldFlagAsIncorrect(const VerifyResult &result) { + return !result.passed; +} diff --git a/tests/test_benchmark_settings.cu b/tests/test_benchmark_settings.cpp similarity index 98% rename from tests/test_benchmark_settings.cu rename to tests/test_benchmark_settings.cpp index eace3f1..1779177 100644 --- a/tests/test_benchmark_settings.cu +++ b/tests/test_benchmark_settings.cpp @@ -5,10 +5,12 @@ * - Run configuration (warmup, benchmark iterations) * - Verification tolerance policy * - Output/export options + * + * This is a CPU-only test file that does not require CUDA. */ -#include #include "utils/benchmark_settings.cuh" +#include // ============================================================================ // Run Settings Tests @@ -138,12 +140,12 @@ TEST(BenchmarkSettingsTest, CustomOutputSettings) { TEST(BenchmarkSettingsTest, ToleranceForKernelTypeUsesDefaults) { BenchmarkSettings settings; - + // Standard kernels use standard tolerance VerifyTolerance std_tol = settings.toleranceForKernel(KernelType::Standard); EXPECT_FLOAT_EQ(std_tol.rtol, kStandardVerifyTolerance.rtol); EXPECT_FLOAT_EQ(std_tol.atol, kStandardVerifyTolerance.atol); - + // Tensor Core kernels use tensor core tolerance VerifyTolerance tc_tol = settings.toleranceForKernel(KernelType::TensorCore); EXPECT_FLOAT_EQ(tc_tol.rtol, kTensorCoreVerifyTolerance.rtol); @@ -152,22 +154,17 @@ TEST(BenchmarkSettingsTest, ToleranceForKernelTypeUsesDefaults) { TEST(BenchmarkSettingsTest, ToleranceForKernelTypeRespectsCustomSettings) { BenchmarkSettings settings; - + // Customize both tolerances settings.verify.standard_tolerance = {0.01f, 0.001f}; settings.verify.tensor_core_tolerance = {0.1f, 0.05f}; - + // Verify toleranceForKernel returns the custom values VerifyTolerance std_tol = settings.toleranceForKernel(KernelType::Standard); EXPECT_FLOAT_EQ(std_tol.rtol, 0.01f); EXPECT_FLOAT_EQ(std_tol.atol, 0.001f); - + VerifyTolerance tc_tol = settings.toleranceForKernel(KernelType::TensorCore); EXPECT_FLOAT_EQ(tc_tol.rtol, 0.1f); EXPECT_FLOAT_EQ(tc_tol.atol, 0.05f); } - -int main(int argc, char **argv) { - testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/tests/test_device_info_seam.cu b/tests/test_device_info_cpu.cu similarity index 82% rename from tests/test_device_info_seam.cu rename to tests/test_device_info_cpu.cu index 16f8a28..640f953 100644 --- a/tests/test_device_info_seam.cu +++ b/tests/test_device_info_cpu.cu @@ -1,15 +1,15 @@ /** - * Device Info Provider Seam Tests + * Device Info Provider CPU-only Tests * - * Tests for the device capability query seam, demonstrating that tests can - * inject fake device info without relying on real GPU hardware capabilities. + * Tests for the device capability query seam, using fake device properties + * without requiring any CUDA device. These tests can run on any system + * with a C++17 compiler. */ #include #include "kernels/tensor_core_sgemm.cuh" #include "utils/benchmark_metrics.cuh" -#include "utils/cuda_utils.cuh" #include "utils/device_info_provider.cuh" namespace { @@ -25,9 +25,9 @@ class FakeDeviceProvider : public ::testing::Test { volta_prop_.major = 7; volta_prop_.minor = 0; volta_prop_.multiProcessorCount = 80; - volta_prop_.clockRate = 1530000; // 1.53 GHz in kHz + volta_prop_.clockRate = 1530000; // 1.53 GHz in kHz volta_prop_.memoryClockRate = 877000; // 877 MHz in kHz - volta_prop_.memoryBusWidth = 4096; // HBM2 + volta_prop_.memoryBusWidth = 4096; // HBM2 volta_provider_ = DeviceInfoProvider{ &volta_prop_, @@ -165,33 +165,6 @@ TEST_F(FakeDeviceProvider, ArchitectureNamingUnknown) { EXPECT_STREQ(getTensorCoreArchName(unknown_provider), "Unknown"); } -// ============================================================================ -// Production Adapter Integration Test -// ============================================================================ - -TEST(DeviceInfoSeam, ProductionAdapterWorks) { - // This test validates that the production adapter can successfully query - // real device info. It should pass on any CUDA-capable device. - DeviceInfoProvider prod = getProductionDeviceInfo(); - - // Basic sanity checks - EXPECT_NE(prod.prop, nullptr); - EXPECT_GT(prod.cores_per_sm, 0); - EXPECT_GT(prod.clock_ghz, 0.0f); - - // Check that overloaded functions work without provider parameter - float peak_gflops = getTheoreticalPeakGflops(); - EXPECT_GT(peak_gflops, 0.0f); - - float peak_bandwidth = getTheoreticalPeakBandwidth(); - EXPECT_GT(peak_bandwidth, 0.0f); - - // Tensor core availability should be consistent - bool has_tc_explicit = tensorCoresAvailable(prod); - bool has_tc_default = tensorCoresAvailable(); - EXPECT_EQ(has_tc_explicit, has_tc_default); -} - // ============================================================================ // Dimension Support Tests (no device dependency) // ============================================================================ diff --git a/tests/test_device_info_cuda.cu b/tests/test_device_info_cuda.cu new file mode 100644 index 0000000..70fe348 --- /dev/null +++ b/tests/test_device_info_cuda.cu @@ -0,0 +1,48 @@ +/** + * Device Info Provider CUDA Tests + * + * Tests that require a real CUDA device. These tests will be skipped + * automatically when no CUDA device is available. + */ + +#include + +#include "gtest_cuda_environment.cuh" +#include "utils/benchmark_metrics.cuh" +#include "utils/cuda_utils.cuh" +#include "utils/device_info_provider.cuh" + +namespace { + +// ============================================================================ +// Production Adapter Integration Test +// ============================================================================ + +TEST(DeviceInfoSeam, ProductionAdapterWorks) { + // This test validates that the production adapter can successfully query + // real device info. It should pass on any CUDA-capable device. + DeviceInfoProvider prod = getProductionDeviceInfo(); + + // Basic sanity checks + EXPECT_NE(prod.prop, nullptr); + EXPECT_GT(prod.cores_per_sm, 0); + EXPECT_GT(prod.clock_ghz, 0.0f); + + // Check that overloaded functions work without provider parameter + float peak_gflops = getTheoreticalPeakGflops(); + EXPECT_GT(peak_gflops, 0.0f); + + float peak_bandwidth = getTheoreticalPeakBandwidth(); + EXPECT_GT(peak_bandwidth, 0.0f); + + // Tensor core availability should be consistent + bool has_tc_explicit = tensorCoresAvailable(prod); + bool has_tc_default = tensorCoresAvailable(); + EXPECT_EQ(has_tc_explicit, has_tc_default); +} + +} // namespace + +int main(int argc, char **argv) { + return runCudaAwareTests(argc, argv); +} diff --git a/tests/test_kernel_catalog.cu b/tests/test_kernel_catalog.cu index 075da9e..cce5861 100644 --- a/tests/test_kernel_catalog.cu +++ b/tests/test_kernel_catalog.cu @@ -3,27 +3,30 @@ * * Tests the kernel catalog registration system: * - Catalog contains expected kernels - * - Entries have valid names and launchers + * - Entries have valid metadata (names, launchers, constraints) + * - Constraints checking works correctly * - Launch functions are callable without crashes * - Order preservation of kernel entries + * + * This test requires a CUDA device and will be skipped + * automatically when no CUDA device is available. */ -#include +#include "gtest_cuda_environment.cuh" #include "kernels/kernel_catalog.cuh" -#include "utils/cuda_utils.cuh" #include "utils/benchmark_settings.cuh" +#include "utils/cuda_utils.cuh" +#include // ============================================================================ -// Kernel Catalog Tests +// Kernel Catalog Metadata Tests // ============================================================================ class KernelCatalogTest : public ::testing::Test { protected: - void SetUp() override { - catalog_ = &getKernelCatalog(); - } + void SetUp() override { catalog_ = &getKernelCatalog(); } - const std::vector* catalog_; + const std::vector *catalog_; }; TEST_F(KernelCatalogTest, CatalogNotEmpty) { @@ -31,51 +34,183 @@ TEST_F(KernelCatalogTest, CatalogNotEmpty) { } TEST_F(KernelCatalogTest, CatalogHasStandardKernels) { - int standard_count = 0; - for (const auto& entry : *catalog_) { - if (entry.type == KernelType::Standard) { - standard_count++; - } - } + int standard_count = countKernelsByType(KernelType::Standard); EXPECT_GE(standard_count, 4) << "Should have at least 4 standard kernels (Naive, Tiled, BankConflictFree, DoubleBuffer)"; } TEST_F(KernelCatalogTest, CatalogHasTensorCoreKernels) { - int tc_count = 0; - for (const auto& entry : *catalog_) { - if (entry.type == KernelType::TensorCore) { - tc_count++; - } - } - EXPECT_GE(tc_count, 1) - << "Should have at least 1 tensor core kernel (end-to-end)"; + int tc_count = countKernelsByType(KernelType::TensorCore); + EXPECT_GE(tc_count, 1) << "Should have at least 1 tensor core kernel (end-to-end)"; } TEST_F(KernelCatalogTest, CatalogEntriesHaveNamesAndLaunchers) { - for (const auto& entry : *catalog_) { + for (const auto &entry : *catalog_) { EXPECT_FALSE(entry.name.empty()) << "All entries should have non-empty names"; EXPECT_TRUE(static_cast(entry.launcher)) << "Entry '" << entry.name << "' should have a valid launcher"; } } +TEST_F(KernelCatalogTest, CatalogEntriesHaveConstraints) { + for (const auto &entry : *catalog_) { + // Standard kernels should not require tensor cores + if (entry.type == KernelType::Standard) { + EXPECT_FALSE(entry.constraints.requires_tensor_cores) + << "Standard kernel '" << entry.name << "' should not require tensor cores"; + } + + // Tensor Core kernels should require sm_70+ + if (entry.type == KernelType::TensorCore) { + EXPECT_TRUE(entry.constraints.requires_tensor_cores) + << "Tensor Core kernel '" << entry.name << "' should require tensor cores"; + } + } +} + +TEST_F(KernelCatalogTest, CatalogPreservesOrder) { + ASSERT_GE(catalog_->size(), 5u) << "Expected at least 5 kernels"; + + // Verify the expected order: Naive, Tiled, BankConflictFree, DoubleBuffer, TensorCore + // end-to-end + EXPECT_EQ((*catalog_)[0].name, "Naive") << "First kernel should be Naive"; + EXPECT_EQ((*catalog_)[1].name, "Tiled (32x32)") << "Second kernel should be Tiled"; + EXPECT_EQ((*catalog_)[2].name, "Bank Conflict Free") + << "Third kernel should be BankConflictFree"; + EXPECT_EQ((*catalog_)[3].name, "Double Buffer") << "Fourth kernel should be DoubleBuffer"; + EXPECT_EQ((*catalog_)[4].name, "Tensor Core (WMMA end-to-end)") + << "Fifth kernel should be Tensor Core end-to-end"; +} + +// ============================================================================ +// Kernel Constraints Tests +// ============================================================================ + +TEST(KernelConstraintsTest, StandardConstraintsAllowAnyDimensions) { + auto constraints = KernelConstraints::standard(); + + EXPECT_FALSE(constraints.requires_tensor_cores); + EXPECT_EQ(constraints.dimension_alignment, 0); + + // Should accept any positive dimensions + EXPECT_TRUE(constraints.isSatisfied(1, 1, 1, false)); + EXPECT_TRUE(constraints.isSatisfied(1024, 1024, 1024, false)); + EXPECT_TRUE(constraints.isSatisfied(15, 17, 33, false)); // Unaligned +} + +TEST(KernelConstraintsTest, TensorCoreConstraintsRequireAlignment) { + auto constraints = KernelConstraints::tensorCore(); + + EXPECT_TRUE(constraints.requires_tensor_cores); + EXPECT_EQ(constraints.dimension_alignment, 16); + + // Should reject without tensor cores + EXPECT_FALSE(constraints.isSatisfied(16, 16, 16, false)); + + // Should accept aligned dimensions with tensor cores + EXPECT_TRUE(constraints.isSatisfied(16, 16, 16, true)); + EXPECT_TRUE(constraints.isSatisfied(64, 128, 256, true)); + + // Should reject unaligned dimensions even with tensor cores + EXPECT_FALSE(constraints.isSatisfied(15, 16, 16, true)); + EXPECT_FALSE(constraints.isSatisfied(16, 17, 16, true)); + EXPECT_FALSE(constraints.isSatisfied(16, 16, 33, true)); +} + +TEST(KernelConstraintsTest, ComputeOnlyConstraintsAreSpecial) { + auto constraints = KernelConstraints::tensorCoreComputeOnly(); + + EXPECT_TRUE(constraints.requires_tensor_cores); + EXPECT_TRUE(constraints.requires_compute_only); + EXPECT_EQ(constraints.dimension_alignment, 16); +} + +// ============================================================================ +// Catalog Entry Tests +// ============================================================================ + +TEST(KernelCatalogEntryTest, DefaultToleranceForStandardKernels) { + auto constraints = KernelConstraints::standard(); + KernelCatalogEntry entry{"Test Standard", KernelType::Standard, nullptr, constraints}; + + auto tolerance = entry.defaultTolerance(); + EXPECT_FLOAT_EQ(tolerance.rtol, kStandardVerifyTolerance.rtol); + EXPECT_FLOAT_EQ(tolerance.atol, kStandardVerifyTolerance.atol); +} + +TEST(KernelCatalogEntryTest, DefaultToleranceForTensorCoreKernels) { + auto constraints = KernelConstraints::tensorCore(); + KernelCatalogEntry entry{"Test TensorCore", KernelType::TensorCore, nullptr, constraints}; + + auto tolerance = entry.defaultTolerance(); + EXPECT_FLOAT_EQ(tolerance.rtol, kTensorCoreVerifyTolerance.rtol); + EXPECT_FLOAT_EQ(tolerance.atol, kTensorCoreVerifyTolerance.atol); +} + +TEST(KernelCatalogEntryTest, CanRunMethodWorks) { + auto tc_entry = getTensorCoreComputeOnlyEntry(); + + // Should not run without tensor cores + EXPECT_FALSE(tc_entry.canRun(16, 16, 16, false)); + + // Should run with tensor cores and aligned dimensions + EXPECT_TRUE(tc_entry.canRun(16, 16, 16, true)); + EXPECT_TRUE(tc_entry.canRun(64, 128, 256, true)); + + // Should not run with unaligned dimensions + EXPECT_FALSE(tc_entry.canRun(15, 16, 16, true)); +} + +// ============================================================================ +// Catalog Query Utilities Tests +// ============================================================================ + +TEST(CatalogQueryTest, CountKernelsByTypeWorks) { + int standard_count = countKernelsByType(KernelType::Standard); + int tensor_core_count = countKernelsByType(KernelType::TensorCore); + + EXPECT_GE(standard_count, 4); + EXPECT_GE(tensor_core_count, 1); +} + +TEST(CatalogQueryTest, GetKernelNamesWorks) { + auto standard_names = getKernelNames(KernelType::Standard); + + EXPECT_GE(standard_names.size(), 4u); + EXPECT_EQ(standard_names[0], "Naive"); + EXPECT_EQ(standard_names[1], "Tiled (32x32)"); +} + +TEST(CatalogQueryTest, CanRunTensorCoreKernelsWorks) { + // With tensor cores + EXPECT_TRUE(canRunTensorCoreKernels(16, 16, 16, true)); + EXPECT_TRUE(canRunTensorCoreKernels(64, 128, 256, true)); + EXPECT_FALSE(canRunTensorCoreKernels(15, 16, 16, true)); // Unaligned + + // Without tensor cores + EXPECT_FALSE(canRunTensorCoreKernels(16, 16, 16, false)); +} + +// ============================================================================ +// Kernel Launch Tests (requires CUDA device) +// ============================================================================ + TEST_F(KernelCatalogTest, CatalogLaunchCallable) { ASSERT_FALSE(catalog_->empty()) << "Catalog is empty, cannot test launch"; - + // Small test: verify we can call the launcher without crashing - const auto& entry = (*catalog_)[0]; + const auto &entry = (*catalog_)[0]; const int M = 64, K = 64, N = 64; - + DeviceMemory d_A(M * K); DeviceMemory d_B(K * N); DeviceMemory d_C(M * N); - + // Initialize with zeros CUDA_CHECK(cudaMemset(d_A.get(), 0, M * K * sizeof(float))); CUDA_CHECK(cudaMemset(d_B.get(), 0, K * N * sizeof(float))); CUDA_CHECK(cudaMemset(d_C.get(), 0, M * N * sizeof(float))); - + // Launch should not crash EXPECT_NO_THROW({ entry.launcher(d_A.get(), d_B.get(), d_C.get(), M, K, N); @@ -83,14 +218,4 @@ TEST_F(KernelCatalogTest, CatalogLaunchCallable) { }); } -TEST_F(KernelCatalogTest, CatalogPreservesOrder) { - ASSERT_GE(catalog_->size(), 5u) << "Expected at least 5 kernels"; - - // Verify the expected order: Naive, Tiled, BankConflictFree, DoubleBuffer, TensorCore end-to-end - EXPECT_EQ((*catalog_)[0].name, "Naive") << "First kernel should be Naive"; - EXPECT_EQ((*catalog_)[1].name, "Tiled (32x32)") << "Second kernel should be Tiled"; - EXPECT_EQ((*catalog_)[2].name, "Bank Conflict Free") << "Third kernel should be BankConflictFree"; - EXPECT_EQ((*catalog_)[3].name, "Double Buffer") << "Fourth kernel should be DoubleBuffer"; - EXPECT_EQ((*catalog_)[4].name, "Tensor Core (WMMA end-to-end)") << "Fifth kernel should be Tensor Core end-to-end"; -} - +int main(int argc, char **argv) { return runCudaAwareTests(argc, argv); } diff --git a/tests/test_performance.cu b/tests/test_performance.cu index 26c3b19..0db23e7 100644 --- a/tests/test_performance.cu +++ b/tests/test_performance.cu @@ -110,22 +110,24 @@ class PerformanceRegressionTest : public ::testing::Test { TEST_F(PerformanceRegressionTest, NaiveKernelPerformance) { printf("\nNaive Kernel Performance:\n"); for (const auto &[M, K, N] : test_dimensions_) { - runPerformanceTest("Naive", - [](const float *A, const float *B, float *C, int m, int k, int n) { - launch_naive_sgemm<>(A, B, C, m, k, n); - }, - M, K, N); + runPerformanceTest( + "Naive", + [](const float *A, const float *B, float *C, int m, int k, int n) { + launch_naive_sgemm<>(A, B, C, m, k, n); + }, + M, K, N); } } TEST_F(PerformanceRegressionTest, TiledKernelPerformance) { printf("\nTiled Kernel Performance:\n"); for (const auto &[M, K, N] : test_dimensions_) { - runPerformanceTest("Tiled", - [](const float *A, const float *B, float *C, int m, int k, int n) { - launch_tiled_sgemm<32>(A, B, C, m, k, n); - }, - M, K, N); + runPerformanceTest( + "Tiled", + [](const float *A, const float *B, float *C, int m, int k, int n) { + launch_tiled_sgemm<32>(A, B, C, m, k, n); + }, + M, K, N); } } @@ -190,10 +192,8 @@ TEST_F(PerformanceRegressionTest, PeakPerformanceReference) { printf(" Theoretical Peak Bandwidth: %.2f GB/s\n", peak_bandwidth_); // 验证峰值在合理范围内 - EXPECT_GT(peak_gflops_, 1000.0f) << "Peak GFLOPS seems too low"; - EXPECT_LT(peak_gflops_, 100000.0f) << "Peak GFLOPS seems too high"; + EXPECT_GT(peak_gflops_, 100.0f) << "Peak GFLOPS seems too low"; + EXPECT_LT(peak_gflops_, 200000.0f) << "Peak GFLOPS seems too high"; } -int main(int argc, char **argv) { - return runCudaAwareTests(argc, argv); -} +int main(int argc, char **argv) { return runCudaAwareTests(argc, argv); } From 879e8222686732de722ace5aa11a9870cda97c56 Mon Sep 17 00:00:00 2001 From: shijiashuai Date: Wed, 1 Jul 2026 18:50:09 +0800 Subject: [PATCH 2/5] fix: CMake include(GoogleTest) and clang-format all test files - Add missing `include(GoogleTest)` before SgemmTestHelpers.cmake to resolve `gtest_discover_tests` unknown command error - Apply clang-format to test_utils.cu, test_sgemm.cu, and test_device_info_cuda.cu to fix format check failures Generated with [Devin](https://devin.ai) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- CMakeLists.txt | 2 ++ tests/test_device_info_cuda.cu | 4 +--- tests/test_sgemm.cu | 10 +++------- tests/test_utils.cu | 6 ++---- 4 files changed, 8 insertions(+), 14 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d8a1f1a..e672b54 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -96,6 +96,8 @@ if(BUILD_TESTS) set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) FetchContent_MakeAvailable(googletest) + include(GoogleTest) + # Include test helper functions include(cmake/SgemmTestHelpers.cmake) diff --git a/tests/test_device_info_cuda.cu b/tests/test_device_info_cuda.cu index 70fe348..885107c 100644 --- a/tests/test_device_info_cuda.cu +++ b/tests/test_device_info_cuda.cu @@ -43,6 +43,4 @@ TEST(DeviceInfoSeam, ProductionAdapterWorks) { } // namespace -int main(int argc, char **argv) { - return runCudaAwareTests(argc, argv); -} +int main(int argc, char **argv) { return runCudaAwareTests(argc, argv); } diff --git a/tests/test_sgemm.cu b/tests/test_sgemm.cu index a644cd2..ba80869 100644 --- a/tests/test_sgemm.cu +++ b/tests/test_sgemm.cu @@ -344,13 +344,9 @@ TEST_F(DimensionInvarianceTest, AllStandardKernelsWorkWithVariousDimensions) { [](const float *A, const float *B, float *C, int m, int k, int n) { launch_bank_conflict_free_sgemm<32>(A, B, C, m, k, n); }); - testKernel("DoubleBuffer", - [](const float *A, const float *B, float *C, int m, int k, int n) { - launch_double_buffer_sgemm<32>(A, B, C, m, k, n); - }); + testKernel("DoubleBuffer", [](const float *A, const float *B, float *C, int m, int k, + int n) { launch_double_buffer_sgemm<32>(A, B, C, m, k, n); }); } } -int main(int argc, char **argv) { - return runCudaAwareTests(argc, argv); -} +int main(int argc, char **argv) { return runCudaAwareTests(argc, argv); } diff --git a/tests/test_utils.cu b/tests/test_utils.cu index 5d688a1..0b18471 100644 --- a/tests/test_utils.cu +++ b/tests/test_utils.cu @@ -8,9 +8,9 @@ * - 容差配置和边界条件 */ +#include #include #include -#include #include #include #include @@ -402,6 +402,4 @@ TEST_F(UtilsIntegrationTest, FullWorkflowWithDeviceMemory) { EXPECT_TRUE(has_nonzero); } -int main(int argc, char **argv) { - return runCudaAwareTests(argc, argv); -} +int main(int argc, char **argv) { return runCudaAwareTests(argc, argv); } From 03013cfbeee2d239ed31db523c6a87d99c837a5b Mon Sep 17 00:00:00 2001 From: shijiashuai Date: Wed, 1 Jul 2026 18:55:23 +0800 Subject: [PATCH 3/5] fix: restore SGEMMVerifier::shouldFlagAsIncorrect static method The refactored verify.cuh removed the static shouldFlagAsIncorrect method from SGEMMVerifier, breaking test_sgemm.cu which calls SGEMMVerifier::shouldFlagAsIncorrect(result). Restore it as a deprecated compatibility method. Generated with [Devin](https://devin.ai) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- src/utils/verify.cuh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/utils/verify.cuh b/src/utils/verify.cuh index 00512df..3d84686 100644 --- a/src/utils/verify.cuh +++ b/src/utils/verify.cuh @@ -270,6 +270,13 @@ class SGEMMVerifier { */ cublasHandle_t getHandle() { return handle_; } + /** + * Legacy compatibility: check if verification result indicates failure. + * + * @deprecated Use result.passed or result.shouldFlagAsIncorrect() directly. + */ + static bool shouldFlagAsIncorrect(const VerifyResult &result) { return !result.passed; } + private: cublasHandle_t handle_; }; From eb10bc841bc53413f62bf1277ccd7512656d79d9 Mon Sep 17 00:00:00 2001 From: shijiashuai Date: Wed, 1 Jul 2026 19:01:52 +0800 Subject: [PATCH 4/5] fix: add missing tensor_core_sgemm.cuh include in test_device_info_cuda The test uses tensorCoresAvailable() which is defined in tensor_core_sgemm.cuh, but the header was not included. Generated with [Devin](https://devin.ai) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- tests/test_device_info_cuda.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_device_info_cuda.cu b/tests/test_device_info_cuda.cu index 885107c..a2da02f 100644 --- a/tests/test_device_info_cuda.cu +++ b/tests/test_device_info_cuda.cu @@ -8,6 +8,7 @@ #include #include "gtest_cuda_environment.cuh" +#include "kernels/tensor_core_sgemm.cuh" #include "utils/benchmark_metrics.cuh" #include "utils/cuda_utils.cuh" #include "utils/device_info_provider.cuh" From f45adb0405d78c655be7a4b6e745b591fda2cd32 Mon Sep 17 00:00:00 2001 From: shijiashuai Date: Wed, 1 Jul 2026 19:05:55 +0800 Subject: [PATCH 5/5] fix: correct AmperePeakGflopsCalculation expected value The test expected 19481.0 GFLOPS but the correct calculation is 108 * 64 * 2 * 1.41 = 19491.84 GFLOPS. Fix the expected value and remove the stale * 1000 from the comment. Generated with [Devin](https://devin.ai) Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com> --- tests/test_device_info_cpu.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_device_info_cpu.cu b/tests/test_device_info_cpu.cu index 640f953..43cbd8d 100644 --- a/tests/test_device_info_cpu.cu +++ b/tests/test_device_info_cpu.cu @@ -106,10 +106,10 @@ TEST_F(FakeDeviceProvider, VoltaPeakGflopsCalculation) { } TEST_F(FakeDeviceProvider, AmperePeakGflopsCalculation) { - // A100: 108 SMs * 64 cores/SM * 2 (FMA) * 1.41 GHz * 1000 - // Expected: ~19.481 TFLOPS + // A100: 108 SMs * 64 cores/SM * 2 (FMA) * 1.41 GHz + // Expected: ~19.492 TFLOPS float peak_gflops = getTheoreticalPeakGflops(ampere_provider_); - EXPECT_NEAR(peak_gflops, 19481.0f, 0.1f); + EXPECT_NEAR(peak_gflops, 19491.84f, 0.1f); } TEST_F(FakeDeviceProvider, VoltaPeakBandwidthCalculation) {