From e5c7fc842a70f21acf539f11e0a8fda27a75424e Mon Sep 17 00:00:00 2001
From: shijiashuai <jiashuai.shi@qq.com>
Date: Wed, 1 Jul 2026 18:44:53 +0800
Subject: [PATCH 1/5] fix: critical bugs, architecture improvements, and test
 infrastructure

## Critical Bug Fixes

- **getTheoreticalPeakGflops**: Remove spurious `* 1000` that made peak
  GFLOPS 1000x too large, causing all efficiency calculations to report
  0.1% instead of 100%. This also broke the PeakPerformanceReference test
  upper bound check.

- **CPU test compilation**: Rename test_device_info_cpu.cpp to .cu because
  it includes tensor_core_sgemm.cuh which contains CUDA `<<<>>>` kernel
  launch syntax that g++ cannot parse. Add CUDA compile options to the
  sgemm_add_cpu_test CMake helper for .cu sources.

- **Duplicate main()**: Remove redundant `main()` from CPU-only test files
  (test_benchmark_settings.cpp, test_device_info_cpu.cu) that conflicted
  with GTest::gtest_main linkage.

## High-Severity Fixes

- **Integer overflow**: Fix `M * K` int multiplication overflow in
  tensor_core_benchmark.cuh by using `static_cast<size_t>` consistently
  for all matrix size calculations.

- **Division by zero**: Add validation in measureGpuTime() to reject
  benchmark_runs <= 0, preventing undefined behavior.

- **Negative dimensions**: Add M > 0 && K > 0 && N > 0 check in
  KernelConstraints::isSatisfied() to prevent undefined modulo behavior
  with non-positive dimensions.

## Medium-Severity Fixes

- **NaN/Inf verification**: Handle NaN/Inf in reference values in
  compareMatricesImpl() instead of producing NaN comparison results.

- **Exception safety in diagnostics**: Replace CUDA_CHECK (throwing)
  with non-throwing error handling in printSkipReason().

- **File write error checking**: Add file.fail() check after writing
  roofline export data.

- **DRY violation**: Extract safeGridSize() and checkMatrixElementCount()
  to cuda_utils.cuh, eliminating duplicated overflow-check code between
  tensor_core_sgemm.cuh and tensor_core_benchmark.cuh.

- **Performance test bounds**: Adjust PeakPerformanceReference bounds
  (100-200000 GFLOPS) to accommodate the corrected peak calculation and
  future GPU architectures.

## Infrastructure Improvements

- **CI**: Add CPU test execution step (`ctest -L cpu`) to CI workflow.
- **CMake helpers**: Add SgemmTestHelpers.cmake with standardized test
  target creation functions (sgemm_add_cpu_test, sgemm_add_cuda_test,
  sgemm_add_cuda_perf_test) with proper CTest labels.
- **Test refactoring**: Migrate CPU-only tests from .cu to .cpp, split
  device_info_seam.cu into separate CPU and CUDA test files.
- **Kernel catalog**: Add KernelConstraints struct with isSatisfied()
  method, refactor BenchmarkRunner to use catalog-driven dispatch.

Generated with [Devin](https://devin.ai)

Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com>
---
 .github/workflows/ci.yml                      |   8 +-
 CMakeLists.txt                                | 126 +++++------
 CONTEXT.md                                    |  90 +++++++-
 cmake/SgemmTestHelpers.cmake                  | 193 ++++++++++++++++
 src/benchmark_runner.cuh                      | 123 +++++-----
 src/kernels/kernel_catalog.cuh                | 210 +++++++++++++-----
 src/kernels/tensor_core_benchmark.cuh         |  92 ++++----
 src/kernels/tensor_core_sgemm.cuh             |  30 +--
 src/utils/benchmark.cuh                       |   8 +-
 src/utils/benchmark_core.cuh                  |   4 +
 src/utils/benchmark_metrics.cuh               |   6 +-
 src/utils/cuda_utils.cuh                      |  30 ++-
 src/utils/verify.cuh                          | 179 ++++++++++++---
 ...ettings.cu => test_benchmark_settings.cpp} |  19 +-
 ...e_info_seam.cu => test_device_info_cpu.cu} |  39 +---
 tests/test_device_info_cuda.cu                |  48 ++++
 tests/test_kernel_catalog.cu                  | 203 +++++++++++++----
 tests/test_performance.cu                     |  30 +--
 18 files changed, 1034 insertions(+), 404 deletions(-)
 create mode 100644 cmake/SgemmTestHelpers.cmake
 rename tests/{test_benchmark_settings.cu => test_benchmark_settings.cpp} (98%)
 rename tests/{test_device_info_seam.cu => test_device_info_cpu.cu} (82%)
 create mode 100644 tests/test_device_info_cuda.cu

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index c126cea..aac1402 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -1,8 +1,8 @@
 # Continuous Integration
-# Validates code formatting and CUDA compilation of all targets (benchmark + tests)
+# Validates code formatting, CUDA compilation, and CPU-only test execution
 #
 # Workflow separation:
-# - This workflow: formatting + CUDA compile-time checks (no GPU runtime)
+# - This workflow: formatting + CUDA compile-time checks + CPU tests (no GPU runtime)
 # - pages.yml: docs tests/build and GitHub Pages buildability
 name: CI
 
@@ -58,7 +58,11 @@ jobs:
       - name: Build
         run: cmake --build build -j2
 
+      - name: Run CPU tests
+        run: ctest --test-dir build -L cpu --output-on-failure
+
       - name: Info
         run: |
           echo "✅ CUDA compilation successful for all targets (benchmark + tests)"
+          echo "✅ CPU-only tests passed"
           echo "ℹ️  GPU runtime tests require a CUDA-capable machine"
diff --git a/CMakeLists.txt b/CMakeLists.txt
index d1d29a3..d8a1f1a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -96,91 +96,69 @@ if(BUILD_TESTS)
     set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
     FetchContent_MakeAvailable(googletest)
 
-    add_executable(test_sgemm tests/test_sgemm.cu)
-    target_include_directories(test_sgemm PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src)
-    target_compile_definitions(test_sgemm PRIVATE SGEMM_HAS_WMMA_TARGET=${SGEMM_HAS_WMMA_TARGET})
-    target_link_options(test_sgemm PRIVATE -L${SGEMM_CUDA_LIBRARY_DIR})
-    target_link_libraries(test_sgemm PRIVATE
-        GTest::gtest_main
-        CUDA::cudart
-        CUDA::cublas
-        CUDA::curand
-    )
-    target_compile_options(test_sgemm PRIVATE
-        $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>
-    )
+    # Include test helper functions
+    include(cmake/SgemmTestHelpers.cmake)
 
-    # 工具层测试
-    add_executable(test_utils tests/test_utils.cu)
-    target_include_directories(test_utils PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src)
-    target_compile_definitions(test_utils PRIVATE SGEMM_HAS_WMMA_TARGET=${SGEMM_HAS_WMMA_TARGET})
-    target_link_options(test_utils PRIVATE -L${SGEMM_CUDA_LIBRARY_DIR})
-    target_link_libraries(test_utils PRIVATE
-        GTest::gtest_main
-        CUDA::cudart
-        CUDA::cublas
-        CUDA::curand
-    )
-    target_compile_options(test_utils PRIVATE
-        $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>
-    )
+    # ═══════════════════════════════════════════════════════════════
+    # CPU-only Tests (no CUDA required)
+    # ═══════════════════════════════════════════════════════════════
 
-    # 性能回归测试
-    add_executable(test_performance tests/test_performance.cu)
-    target_include_directories(test_performance PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src)
-    target_compile_definitions(test_performance PRIVATE SGEMM_HAS_WMMA_TARGET=${SGEMM_HAS_WMMA_TARGET})
-    target_link_options(test_performance PRIVATE -L${SGEMM_CUDA_LIBRARY_DIR})
-    target_link_libraries(test_performance PRIVATE
-        GTest::gtest_main
-        CUDA::cudart
-        CUDA::cublas
-        CUDA::curand
-    )
-    target_compile_options(test_performance PRIVATE
-        $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>
+    # Benchmark settings module test - pure C++, no CUDA dependencies
+    sgemm_add_cpu_test(
+        NAME test_benchmark_settings
+        SOURCES tests/test_benchmark_settings.cpp
     )
 
-    # Benchmark 设置模块测试
-    add_executable(test_benchmark_settings tests/test_benchmark_settings.cu)
-    target_include_directories(test_benchmark_settings PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src)
-    target_link_libraries(test_benchmark_settings PRIVATE
-        GTest::gtest_main
-        CUDA::cudart
-        CUDA::cublas
+    # Device info provider CPU tests - uses fake device properties
+    # Compiled as .cu because it includes kernel headers with CUDA launch syntax
+    sgemm_add_cpu_test(
+        NAME test_device_info_cpu
+        SOURCES tests/test_device_info_cpu.cu
     )
-    target_compile_options(test_benchmark_settings PRIVATE
-        $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>
+
+    # ═══════════════════════════════════════════════════════════════
+    # CUDA Tests (requires CUDA device, skipped if unavailable)
+    # ═══════════════════════════════════════════════════════════════
+
+    # Kernel correctness tests with property-based testing
+    sgemm_add_cuda_test(
+        NAME test_sgemm
+        SOURCES tests/test_sgemm.cu
+        CUDA_LIBRARIES CUDA::curand
+        REQUIRES_WMMA
     )
 
-    # Kernel catalog module test
-    add_executable(test_kernel_catalog tests/test_kernel_catalog.cu)
-    target_include_directories(test_kernel_catalog PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src)
-    target_link_libraries(test_kernel_catalog PRIVATE
-        GTest::gtest_main
-        CUDA::cudart
-        CUDA::cublas
+    # Utility layer tests (DeviceMemory, verifier RAII, etc.)
+    sgemm_add_cuda_test(
+        NAME test_utils
+        SOURCES tests/test_utils.cu
+        CUDA_LIBRARIES CUDA::curand
+        REQUIRES_WMMA
     )
-    target_compile_options(test_kernel_catalog PRIVATE
-        $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>
+
+    # Kernel catalog module test - requires device memory and kernel launch
+    sgemm_add_cuda_test(
+        NAME test_kernel_catalog
+        SOURCES tests/test_kernel_catalog.cu
+        REQUIRES_WMMA
     )
 
-    # Device info seam test
-    add_executable(test_device_info_seam tests/test_device_info_seam.cu)
-    target_include_directories(test_device_info_seam PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src)
-    target_link_libraries(test_device_info_seam PRIVATE
-        GTest::gtest_main
-        CUDA::cudart
-        CUDA::cublas
+    # Device info provider CUDA tests - requires real GPU for production adapter
+    sgemm_add_cuda_test(
+        NAME test_device_info_cuda
+        SOURCES tests/test_device_info_cuda.cu
     )
-    target_compile_options(test_device_info_seam PRIVATE
-        $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>
+
+    # ═══════════════════════════════════════════════════════════════
+    # Performance Tests (CUDA + performance label)
+    # ═══════════════════════════════════════════════════════════════
+
+    # Performance regression tests
+    sgemm_add_cuda_perf_test(
+        NAME test_performance
+        SOURCES tests/test_performance.cu
+        CUDA_LIBRARIES CUDA::curand
+        REQUIRES_WMMA
     )
 
-    include(GoogleTest)
-    gtest_discover_tests(test_sgemm)
-    gtest_discover_tests(test_utils)
-    gtest_discover_tests(test_performance)
-    gtest_discover_tests(test_benchmark_settings)
-    gtest_discover_tests(test_kernel_catalog)
-    gtest_discover_tests(test_device_info_seam)
 endif()
diff --git a/CONTEXT.md b/CONTEXT.md
index 9dcd977..9a552bb 100644
--- a/CONTEXT.md
+++ b/CONTEXT.md
@@ -4,6 +4,28 @@
 
 ## 核心模块
 
+### Kernel Catalog 模块
+
+**位置**: `src/kernels/kernel_catalog.cuh`
+
+**权威元数据源** - 内核阶梯的唯一事实来源：
+
+- **KernelCatalogEntry**: 完整的内核元数据
+  - `name`: 显示名称
+  - `type`: KernelType::Standard 或 KernelType::TensorCore
+  - `launcher`: 启动适配器
+  - `constraints`: 运行时约束（Tensor Core 要求、维度对齐）
+- **KernelConstraints**: 运行时约束描述
+  - `requires_tensor_cores`: 是否需要 sm_70+
+  - `dimension_alignment`: 维度对齐要求（0 = 无约束）
+  - `requires_compute_only`: 是否使用特殊 benchmark 接口
+- **查询工具**: `countKernelsByType()`, `getKernelNames()`, `canRunTensorCoreKernels()`
+
+**设计原则**：
+- **单一事实源**: 新增内核只需添加一个 catalog 条目
+- **自描述约束**: 每个 entry 知道自己能否在给定条件下运行
+- **统一调度**: BenchmarkRunner 通过 catalog 迭代，无特殊分支
+
 ### Tensor Core 模块
 
 **位置**: `src/kernels/tensor_core_sgemm.cuh`
@@ -24,6 +46,7 @@
 **位置**: `src/kernels/tensor_core_benchmark.cuh`
 
 Tensor Core 特有的 benchmark 功能，提供：
+- `canRunTensorCoreComputeOnly()` - 约束检查（与 KernelCatalog 语义一致）
 - `runTensorCoreComputeOnlyBenchmark()` - 纯计算路径性能测试
 
 **接口设计**：只接受 `cublasHandle_t`，不依赖整个 `SGEMMBenchmark` 类，避免内核层对工具层的上穿依赖。
@@ -32,19 +55,42 @@ Tensor Core 特有的 benchmark 功能，提供：
 
 **位置**: `src/utils/verify.cuh`
 
-统一的验证逻辑：
-- `detail::compareMatricesImpl()` - 内部实现，供其他函数共享
-- `compareMatrices()` - 独立的矩阵比较函数
-- `SGEMMVerifier` - 带 cuBLAS 句柄的验证器类
+**统一的验证策略** - reference + comparison + tolerance policy：
+
+- **VerifyResult**: 验证结果结构（pass/fail、错误指标）
+- **VerifyTolerance**: 容差规范（numpy-style allclose 语义）
+  - `kStandardVerifyTolerance`: FP32 标准容差
+  - `kTensorCoreVerifyTolerance`: Tensor Core 宽松容差
+- **比较函数**:
+  - `compareMatrices()`: Host 指针比较
+  - `compareDeviceMatrices()`: Device 指针比较
+- **SGEMMVerifier**: cuBLAS 参考计算适配器
+  - `computeReference()`: 计算参考结果
+  - `verify()`, `verifyDevice()`: 验证内核输出
+
+**设计原则**：
+- **单一验证政策**: 所有内核共享同一套容差语义
+- **分离关注点**: 参考计算 vs 比较逻辑
+- **可扩展**: 未来可添加其他参考适配器
 
 ## Benchmark 模块
 
 项目将 Benchmark 功能拆分为三个深度模块，每个模块有独立的职责：
 
+### Benchmark Settings
+**位置**: `src/utils/benchmark_settings.cuh**
+
+配置集中化：
+- `RunSettings`: 预热次数、测量次数
+- `VerificationSettings`: 容差配置
+- `OutputSettings`: Roofline 导出选项
+- `BenchmarkSettings`: 聚合配置
+
 ### Benchmark Core
 **位置**: `src/utils/benchmark_core.cuh`
 
 核心性能测量：
+- `BenchmarkResult`: 结果结构和报告生成
 - `CudaTimer` - RAII 包装的 CUDA 事件计时器
 - `measureGpuTime()` - 通用的 GPU 操作性能测量器
 
@@ -62,7 +108,6 @@ Tensor Core 特有的 benchmark 功能，提供：
 
 聚合模块并提供：
 - `SGEMMBenchmark` - 高级 benchmark 编排器
-- `BenchmarkResult` - 结果结构和报告生成
 
 ## 测试架构
 
@@ -70,6 +115,13 @@ Tensor Core 特有的 benchmark 功能，提供：
 
 项目采用分层测试策略，确保每个层级都有独立的测试面：
 
+#### CPU-only 测试
+**位置**: `tests/test_benchmark_settings.cpp`, `tests/test_device_info_cpu.cpp`
+
+纯 CPU 测试，不需要 CUDA 设备：
+- 设置模块单元测试
+- 设备信息 Seam 测试（使用 fake provider）
+
 #### 内核层测试
 **位置**: `tests/test_sgemm.cu`
 
@@ -78,6 +130,14 @@ Tensor Core 特有的 benchmark 功能，提供：
 - Tensor Core 快速路径和 fallback 测试
 - 边界测试和维度不变性测试
 
+#### Kernel Catalog 测试
+**位置**: `tests/test_kernel_catalog.cu`
+
+测试内核目录的元数据和约束：
+- Catalog 包含预期的内核
+- 条目有有效的元数据（名称、启动器、约束）
+- 约束检查正确工作
+
 #### 工具层测试
 **位置**: `tests/test_utils.cu`
 
@@ -88,8 +148,6 @@ Tensor Core 特有的 benchmark 功能，提供：
 - `VerifyTolerance` - 容差配置和边界条件
 - NaN/Inf 处理、异常安全性
 
-**设计原则**：工具层测试独立于内核测试，可以单独捕获工具类 bug。
-
 #### 性能回归测试
 **位置**: `tests/test_performance.cu`
 
@@ -105,7 +163,19 @@ Tensor Core 特有的 benchmark 功能，提供：
 - Double-Buffer: 35% 峰值
 - Tensor Core: 50% 峰值（当可用时）
 
-**设计原则**：性能测试独立于正确性测试，可在 CI 中检测重大性能退化。
+### 测试分类标签
+
+项目使用 CTest labels 区分测试类型：
+- `cpu`: CPU-only 测试，不需要 CUDA 设备
+- `cuda`: 需要 CUDA 设备的测试，无 GPU 时跳过
+- `performance`: 性能回归测试
+
+**运行命令**:
+```bash
+ctest -L cpu          # 只运行 CPU 测试
+ctest -L cuda         # 只运行 CUDA 测试
+ctest -L performance  # 只运行性能测试
+```
 
 ## 架构原则
 
@@ -115,8 +185,8 @@ Tensor Core 特有的 benchmark 功能，提供：
    - `main.cu` - 入口点，仅负责组装
    - `cli_parser.cuh` - 命令行解析、配置构造
    - `benchmark_runner.cuh` - 内核调度、结果聚合
-2. **内核层** (`src/kernels/`) - 5 个内核实现 + Tensor Core 专用模块
-3. **工具层** (`src/utils/`) - RAII 内存管理、错误处理、验证辅助
+2. **内核层** (`src/kernels/`) - 5 个内核实现 + Kernel Catalog + Tensor Core 专用模块
+3. **工具层** (`src/utils/`) - RAII 内存管理、错误处理、验证辅助、设置模块
 
 ### 依赖方向
 
diff --git a/cmake/SgemmTestHelpers.cmake b/cmake/SgemmTestHelpers.cmake
new file mode 100644
index 0000000..67f5244
--- /dev/null
+++ b/cmake/SgemmTestHelpers.cmake
@@ -0,0 +1,193 @@
+# SGEMM CMake Test Helper Functions
+#
+# This module provides helper functions to standardize test target creation
+# and ensure consistent handling of CPU-only vs CUDA-required tests.
+#
+# Test Categories:
+# - CPU-only: Can compile and run without CUDA runtime/device. May still need
+#   CUDA headers for struct definitions (e.g., cudaDeviceProp).
+# - CUDA: Requires CUDA device at runtime. Tests are skipped if no GPU.
+# - Performance: CUDA tests with performance regression labels.
+
+# sgemm_add_cpu_test(
+#   NAME <test_name>
+#   SOURCES <source1> [<source2> ...]
+#   [LIBRARIES <lib1> <lib2> ...]
+# )
+#
+# Creates a CPU-only test target that does not require CUDA runtime.
+# These tests:
+# - Can run on any system with CUDA toolkit (headers only)
+# - Are labeled with "cpu" for CTest filtering
+# - Do NOT use runCudaAwareTests() - just gtest_main
+# - Will NOT be skipped due to missing GPU
+function(sgemm_add_cpu_test)
+    set(options "")
+    set(oneValueArgs NAME)
+    set(multiValueArgs SOURCES LIBRARIES)
+    cmake_parse_arguments(ARG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+    if(NOT ARG_NAME)
+        message(FATAL_ERROR "sgemm_add_cpu_test: NAME is required")
+    endif()
+    if(NOT ARG_SOURCES)
+        message(FATAL_ERROR "sgemm_add_cpu_test: SOURCES is required")
+    endif()
+
+    add_executable(${ARG_NAME} ${ARG_SOURCES})
+
+    target_include_directories(${ARG_NAME} PRIVATE
+        ${CMAKE_CURRENT_SOURCE_DIR}/src
+    )
+
+    # CUDA compile options for .cu sources (if any)
+    target_compile_options(${ARG_NAME} PRIVATE
+        $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>
+    )
+
+    # CPU-only tests still need CUDA headers but not runtime
+    # Use CUDA::cudart header-only interface if available, otherwise link minimal
+    target_link_libraries(${ARG_NAME} PRIVATE
+        GTest::gtest_main
+        CUDA::cudart
+        ${ARG_LIBRARIES}
+    )
+
+    # Set C++ standard
+    target_compile_features(${ARG_NAME} PRIVATE cxx_std_17)
+
+    # Register with CTest
+    gtest_discover_tests(${ARG_NAME}
+        PROPERTIES
+            LABELS "cpu"
+        DISCOVERY_MODE PRE_TEST
+    )
+endfunction()
+
+# sgemm_add_cuda_test(
+#   NAME <test_name>
+#   SOURCES <source1> [<source2> ...]
+#   [LIBRARIES <lib1> <lib2> ...]
+#   [CUDA_LIBRARIES <cuda_lib1> <cuda_lib2> ...]
+#   [REQUIRES_WMMA]
+# )
+#
+# Creates a CUDA test target that requires a CUDA device.
+# These tests:
+# - Require CUDA toolkit and a CUDA-capable GPU
+# - Are automatically skipped when no GPU is available
+# - Are labeled with "cuda" for CTest filtering
+# - Use runCudaAwareTests() for proper environment setup
+#
+# REQUIRES_WMMA: If set, adds SGEMM_HAS_WMMA_TARGET compile definition
+function(sgemm_add_cuda_test)
+    set(options "REQUIRES_WMMA")
+    set(oneValueArgs NAME)
+    set(multiValueArgs SOURCES LIBRARIES CUDA_LIBRARIES)
+    cmake_parse_arguments(ARG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+    if(NOT ARG_NAME)
+        message(FATAL_ERROR "sgemm_add_cuda_test: NAME is required")
+    endif()
+    if(NOT ARG_SOURCES)
+        message(FATAL_ERROR "sgemm_add_cuda_test: SOURCES is required")
+    endif()
+
+    add_executable(${ARG_NAME} ${ARG_SOURCES})
+
+    target_include_directories(${ARG_NAME} PRIVATE
+        ${CMAKE_CURRENT_SOURCE_DIR}/src
+    )
+
+    # Standard CUDA compile options
+    target_compile_options(${ARG_NAME} PRIVATE
+        $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>
+    )
+
+    # Link against CUDA runtime and libraries
+    target_link_options(${ARG_NAME} PRIVATE
+        -L${SGEMM_CUDA_LIBRARY_DIR}
+    )
+
+    # WMMA target definition if needed
+    if(ARG_REQUIRES_WMMA)
+        target_compile_definitions(${ARG_NAME} PRIVATE
+            SGEMM_HAS_WMMA_TARGET=${SGEMM_HAS_WMMA_TARGET}
+        )
+    endif()
+
+    # Default CUDA libraries
+    set(DEFAULT_CUDA_LIBS CUDA::cudart CUDA::cublas)
+
+    target_link_libraries(${ARG_NAME} PRIVATE
+        GTest::gtest_main
+        ${DEFAULT_CUDA_LIBS}
+        ${ARG_CUDA_LIBRARIES}
+        ${ARG_LIBRARIES}
+    )
+
+    # Register with CTest
+    gtest_discover_tests(${ARG_NAME}
+        PROPERTIES
+            LABELS "cuda"
+        DISCOVERY_MODE PRE_TEST
+    )
+endfunction()
+
+# sgemm_add_cuda_perf_test(
+#   NAME <test_name>
+#   SOURCES <source1> [<source2> ...]
+#   [LIBRARIES <lib1> <lib2> ...]
+# )
+#
+# Creates a CUDA performance test target.
+# Same as sgemm_add_cuda_test but with additional labels for performance testing.
+function(sgemm_add_cuda_perf_test)
+    set(options "REQUIRES_WMMA")
+    set(oneValueArgs NAME)
+    set(multiValueArgs SOURCES LIBRARIES CUDA_LIBRARIES)
+    cmake_parse_arguments(ARG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+    if(NOT ARG_NAME)
+        message(FATAL_ERROR "sgemm_add_cuda_perf_test: NAME is required")
+    endif()
+    if(NOT ARG_SOURCES)
+        message(FATAL_ERROR "sgemm_add_cuda_perf_test: SOURCES is required")
+    endif()
+
+    add_executable(${ARG_NAME} ${ARG_SOURCES})
+
+    target_include_directories(${ARG_NAME} PRIVATE
+        ${CMAKE_CURRENT_SOURCE_DIR}/src
+    )
+
+    target_compile_options(${ARG_NAME} PRIVATE
+        $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>
+    )
+
+    target_link_options(${ARG_NAME} PRIVATE
+        -L${SGEMM_CUDA_LIBRARY_DIR}
+    )
+
+    if(ARG_REQUIRES_WMMA)
+        target_compile_definitions(${ARG_NAME} PRIVATE
+            SGEMM_HAS_WMMA_TARGET=${SGEMM_HAS_WMMA_TARGET}
+        )
+    endif()
+
+    set(DEFAULT_CUDA_LIBS CUDA::cudart CUDA::cublas CUDA::curand)
+
+    target_link_libraries(${ARG_NAME} PRIVATE
+        GTest::gtest_main
+        ${DEFAULT_CUDA_LIBS}
+        ${ARG_CUDA_LIBRARIES}
+        ${ARG_LIBRARIES}
+    )
+
+    # Register with CTest with both cuda and performance labels
+    gtest_discover_tests(${ARG_NAME}
+        PROPERTIES
+            LABELS "cuda;performance"
+        DISCOVERY_MODE PRE_TEST
+    )
+endfunction()
diff --git a/src/benchmark_runner.cuh b/src/benchmark_runner.cuh
index 9d99b81..a7344a3 100644
--- a/src/benchmark_runner.cuh
+++ b/src/benchmark_runner.cuh
@@ -20,6 +20,11 @@
  *
  * 负责调度所有内核 benchmark 并生成报告。
  * 与 CLI 解析分离，可被测试或脚本直接调用。
+ *
+ * 设计：
+ * - 使用 KernelCatalog 作为内核阶梯的唯一事实来源
+ * - Catalog 条目包含所有约束和默认容差
+ * - BenchmarkRunner 只负责编排流程，不包含内核特定逻辑
  */
 class BenchmarkRunner {
   public:
@@ -96,6 +101,7 @@ class BenchmarkRunner {
                "===========\n");
 
         SGEMMBenchmark benchmark;
+        bool has_tensor_cores = tensorCoresAvailable();
 
         // cuBLAS 参考
         printf("\nRunning cuBLAS (reference)...\n");
@@ -103,11 +109,11 @@ class BenchmarkRunner {
             M, K, N, config_.settings.run.warmup_runs, config_.settings.run.benchmark_runs);
         float cublas_gflops = cublas_result.gflops;
 
-        // 标准内核
-        runStandardKernels(benchmark, M, K, N);
+        // 使用 Catalog 驱动的内核调度
+        runCatalogKernels(benchmark, M, K, N, has_tensor_cores);
 
-        // Tensor Core 内核
-        runTensorCoreKernels(benchmark, M, K, N);
+        // Tensor Core compute-only 是特殊情况（需要 cublas handle）
+        runTensorCoreComputeOnly(benchmark, M, K, N, has_tensor_cores);
 
         // 报告
         benchmark.printSummary();
@@ -120,81 +126,72 @@ class BenchmarkRunner {
         }
     }
 
-    void runStandardKernels(SGEMMBenchmark &benchmark, int M, int K, int N) {
-        const auto& catalog = getKernelCatalog();
-        
-        for (const auto& entry : catalog) {
-            if (entry.type != KernelType::Standard) {
+    void runCatalogKernels(SGEMMBenchmark &benchmark, int M, int K, int N, bool has_tensor_cores) {
+        const auto &catalog = getKernelCatalog();
+
+        for (const auto &entry : catalog) {
+            // Check runtime constraints from catalog
+            if (!entry.canRun(M, K, N, has_tensor_cores)) {
+                printSkipReason(entry, M, K, N, has_tensor_cores);
                 continue;
             }
-            
+
+            // Use tolerance from settings (which may override defaults)
             VerifyTolerance tolerance = config_.settings.toleranceForKernel(entry.type);
-            
-            // Strip tile size annotation for console message (keep result name as-is)
-            std::string console_name = entry.name;
-            size_t paren_pos = console_name.find(" (");
-            if (paren_pos != std::string::npos) {
-                console_name = console_name.substr(0, paren_pos);
-            }
-            
-            printf("Running %s SGEMM...\n", console_name.c_str());
-            benchmark.run(
-                entry.name,
-                entry.launcher,
-                M, K, N,
-                config_.settings.run.warmup_runs,
-                config_.settings.run.benchmark_runs,
-                tolerance);
+
+            printf("Running %s SGEMM...\n", formatConsoleName(entry.name).c_str());
+            benchmark.run(entry.name, entry.launcher, M, K, N, config_.settings.run.warmup_runs,
+                          config_.settings.run.benchmark_runs, tolerance);
         }
     }
 
-    void runTensorCoreKernels(SGEMMBenchmark &benchmark, int M, int K, int N) {
-        if (!tensorCoresAvailable()) {
-            int device;
-            CUDA_CHECK(cudaGetDevice(&device));
-            cudaDeviceProp prop;
-            CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
-            printf("Skipping Tensor Core benchmarks (requires sm_70+, current: sm_%d%d)\n",
-                   prop.major, prop.minor);
-            return;
-        }
+    void runTensorCoreComputeOnly(SGEMMBenchmark &benchmark, int M, int K, int N,
+                                  bool has_tensor_cores) {
+        auto tc_entry = getTensorCoreComputeOnlyEntry();
 
-        if (!tensorCoreDimensionsSupported(M, K, N)) {
-            printf("Skipping Tensor Core benchmarks for %d x %d x %d "
-                   "(requires positive dimensions aligned to 16, fallback would mislabel FP32 as WMMA).\n",
-                   M, K, N);
-            return;
+        if (!tc_entry.canRun(M, K, N, has_tensor_cores)) {
+            return; // Skip silently - already reported by catalog kernels
         }
 
-        // Run catalog-based tensor core kernels
-        const auto& catalog = getKernelCatalog();
-        for (const auto& entry : catalog) {
-            if (entry.type != KernelType::TensorCore) {
-                continue;
-            }
-            
-            VerifyTolerance tolerance = config_.settings.toleranceForKernel(entry.type);
-            
-            printf("Running Tensor Core SGEMM (end-to-end, includes FP32->FP16 "
-                   "conversion/fallback)...\n");
-            benchmark.run(
-                entry.name,
-                entry.launcher,
-                M, K, N,
-                config_.settings.run.warmup_runs,
-                config_.settings.run.benchmark_runs,
-                tolerance);
-        }
-
-        // Tensor Core compute-only benchmark remains special-cased
-        // (requires cublas handle, different interface)
         VerifyTolerance tolerance = config_.settings.toleranceForKernel(KernelType::TensorCore);
         printf("Running Tensor Core SGEMM (compute-only WMMA path)...\n");
+
         BenchmarkResult tc_result = runTensorCoreComputeOnlyBenchmark(
             benchmark.getCublasHandle(), M, K, N, config_.settings.run.warmup_runs,
             config_.settings.run.benchmark_runs, tolerance);
         benchmark.addResult(tc_result);
     }
 
+    void printSkipReason(const KernelCatalogEntry &entry, int M, int K, int N,
+                         bool has_tensor_cores) const {
+        if (entry.constraints.requires_tensor_cores && !has_tensor_cores) {
+            // Use non-throwing error handling for diagnostic output
+            int device = 0;
+            cudaDeviceProp prop{};
+            if (cudaGetDevice(&device) == cudaSuccess &&
+                cudaGetDeviceProperties(&prop, device) == cudaSuccess) {
+                printf("Skipping %s (requires sm_70+, current: sm_%d%d)\n", entry.name.c_str(),
+                       prop.major, prop.minor);
+            } else {
+                printf("Skipping %s (requires sm_70+, current device unavailable)\n",
+                       entry.name.c_str());
+            }
+        } else if (entry.constraints.dimension_alignment > 0) {
+            printf("Skipping %s for %d x %d x %d "
+                   "(requires dimensions aligned to %d)\n",
+                   entry.name.c_str(), M, K, N, entry.constraints.dimension_alignment);
+        }
+    }
+
+    std::string formatConsoleName(const std::string &name) const {
+        // Strip tile size annotation for console message
+        std::string result = name;
+        size_t paren_pos = result.find(" (");
+        if (paren_pos != std::string::npos) {
+            result = result.substr(0, paren_pos);
+        }
+        return result;
+    }
+
     BenchmarkConfig config_;
 };
diff --git a/src/kernels/kernel_catalog.cuh b/src/kernels/kernel_catalog.cuh
index 57033fb..a8eff36 100644
--- a/src/kernels/kernel_catalog.cuh
+++ b/src/kernels/kernel_catalog.cuh
@@ -14,33 +14,91 @@
 // ============================================================================
 // Kernel Catalog Module
 //
-// Centralizes kernel metadata, launch adapters, and kernel type classification.
-// Eliminates repeated lambdas and inline dispatch logic in BenchmarkRunner.
+// THE authoritative source for kernel ladder metadata.
 //
 // Design:
-// - Static catalog of standard kernels (no global mutable state)
-// - Each entry contains: name, type, and launch adapter
-// - BenchmarkRunner iterates over catalog entries instead of hardcoding dispatch
+// - Single source of truth for kernel name, type, tolerance, launcher, constraints
+// - Benchmark, tests, and documentation all reference this catalog
+// - New kernels only require adding one entry here
 //
-// Note: Tensor Core compute-only benchmark remains a special case due to
-// its different interface (requires cublas handle), but the end-to-end
-// tensor core kernel is included in the catalog.
+// Catalog entries provide:
+// - Identity: name, type classification
+// - Behavior: launch adapter, default tolerance
+// - Constraints: dimension requirements (e.g., Tensor Core requires 16-aligned)
 // ============================================================================
 
 // ============================================================================
 // Kernel Launch Adapter Type
 // ============================================================================
 
-using KernelLauncher = std::function<void(const float*, const float*, float*, int, int, int)>;
+using KernelLauncher = std::function<void(const float *, const float *, float *, int, int, int)>;
+
+// ============================================================================
+// Kernel Constraints
+// ============================================================================
+
+/**
+ * Describes runtime constraints for a kernel.
+ *
+ * Used by BenchmarkRunner to decide whether a kernel can run with given dimensions.
+ */
+struct KernelConstraints {
+    bool requires_tensor_cores; // Requires sm_70+
+    int dimension_alignment;    // All dimensions must be multiple of this (0 = no constraint)
+    bool requires_compute_only; // Special case: uses different benchmark interface
+
+    static KernelConstraints standard() { return {false, 0, false}; }
+
+    static KernelConstraints tensorCore() { return {true, 16, false}; }
+
+    static KernelConstraints tensorCoreComputeOnly() { return {true, 16, true}; }
+
+    bool isSatisfied(int M, int K, int N, bool has_tensor_cores) const {
+        if (M <= 0 || K <= 0 || N <= 0) {
+            return false;
+        }
+        if (requires_tensor_cores && !has_tensor_cores) {
+            return false;
+        }
+        if (dimension_alignment > 0) {
+            if (M % dimension_alignment != 0 || K % dimension_alignment != 0 ||
+                N % dimension_alignment != 0) {
+                return false;
+            }
+        }
+        return true;
+    }
+};
 
 // ============================================================================
 // Kernel Catalog Entry
 // ============================================================================
 
+/**
+ * Complete metadata for a benchmarkable kernel.
+ *
+ * Each entry represents one step in the kernel optimization ladder.
+ */
 struct KernelCatalogEntry {
-    std::string name;
-    KernelType type;
-    KernelLauncher launcher;
+    std::string name;              // Display name for reports
+    KernelType type;               // Standard or TensorCore
+    KernelLauncher launcher;       // Launch adapter
+    KernelConstraints constraints; // Runtime requirements
+
+    /**
+     * Get default verification tolerance for this kernel type.
+     */
+    VerifyTolerance defaultTolerance() const {
+        return (type == KernelType::TensorCore) ? kTensorCoreVerifyTolerance
+                                                : kStandardVerifyTolerance;
+    }
+
+    /**
+     * Check if this kernel can run with given dimensions and hardware.
+     */
+    bool canRun(int M, int K, int N, bool has_tensor_cores) const {
+        return constraints.isSatisfied(M, K, N, has_tensor_cores);
+    }
 };
 
 // ============================================================================
@@ -55,50 +113,94 @@ struct KernelCatalogEntry {
  * 2. Tensor Core end-to-end kernel (WMMA with FP32->FP16 conversion/fallback)
  *
  * Note: The Tensor Core compute-only benchmark is NOT included because it
- * has a different interface (requires cublas handle) and is handled separately
- * in BenchmarkRunner.
+ * has a different interface (requires cublas handle). It is handled as a
+ * special case via getTensorCoreComputeOnlyEntry().
  */
-inline const std::vector<KernelCatalogEntry>& getKernelCatalog() {
+inline const std::vector<KernelCatalogEntry> &getKernelCatalog() {
     static const std::vector<KernelCatalogEntry> catalog = {
-        // Standard FP32 kernels
-        {
-            "Naive",
-            KernelType::Standard,
-            [](const float *A, const float *B, float *C, int M, int K, int N) {
-                launch_naive_sgemm<32>(A, B, C, M, K, N);
-            }
-        },
-        {
-            "Tiled (32x32)",
-            KernelType::Standard,
-            [](const float *A, const float *B, float *C, int M, int K, int N) {
-                launch_tiled_sgemm<32>(A, B, C, M, K, N);
-            }
-        },
-        {
-            "Bank Conflict Free",
-            KernelType::Standard,
-            [](const float *A, const float *B, float *C, int M, int K, int N) {
-                launch_bank_conflict_free_sgemm<32>(A, B, C, M, K, N);
-            }
-        },
-        {
-            "Double Buffer",
-            KernelType::Standard,
-            [](const float *A, const float *B, float *C, int M, int K, int N) {
-                launch_double_buffer_sgemm<32>(A, B, C, M, K, N);
-            }
-        },
-        // Tensor Core end-to-end kernel
-        {
-            "Tensor Core (WMMA end-to-end)",
-            KernelType::TensorCore,
-            [](const float *A, const float *B, float *C, int M, int K, int N) {
-                launch_tensor_core_sgemm_with_fallback(A, B, C, M, K, N,
-                                                       defaultTensorCoreFallback());
+        // Standard FP32 kernels - no constraints
+        {"Naive", KernelType::Standard,
+         [](const float *A, const float *B, float *C, int M, int K, int N) {
+             launch_naive_sgemm<32>(A, B, C, M, K, N);
+         },
+         KernelConstraints::standard()},
+        {"Tiled (32x32)", KernelType::Standard,
+         [](const float *A, const float *B, float *C, int M, int K, int N) {
+             launch_tiled_sgemm<32>(A, B, C, M, K, N);
+         },
+         KernelConstraints::standard()},
+        {"Bank Conflict Free", KernelType::Standard,
+         [](const float *A, const float *B, float *C, int M, int K, int N) {
+             launch_bank_conflict_free_sgemm<32>(A, B, C, M, K, N);
+         },
+         KernelConstraints::standard()},
+        {"Double Buffer", KernelType::Standard,
+         [](const float *A, const float *B, float *C, int M, int K, int N) {
+             launch_double_buffer_sgemm<32>(A, B, C, M, K, N);
+         },
+         KernelConstraints::standard()},
+        // Tensor Core end-to-end kernel - requires sm_70+ and 16-aligned dimensions
+        {"Tensor Core (WMMA end-to-end)", KernelType::TensorCore,
+         [](const float *A, const float *B, float *C, int M, int K, int N) {
+             launch_tensor_core_sgemm_with_fallback(A, B, C, M, K, N, defaultTensorCoreFallback());
+         },
+         KernelConstraints::tensorCore()}};
+
+    return catalog;
+}
+
+/**
+ * Returns the Tensor Core compute-only entry.
+ *
+ * This is a special entry that uses a different benchmark interface
+ * (requires cublas handle for reference computation).
+ */
+inline KernelCatalogEntry getTensorCoreComputeOnlyEntry() {
+    return {"Tensor Core (WMMA compute-only)", KernelType::TensorCore,
+            nullptr, // Launcher is not used for compute-only; handled specially
+            KernelConstraints::tensorCoreComputeOnly()};
+}
+
+// ============================================================================
+// Catalog Query Utilities
+// ============================================================================
+
+/**
+ * Count kernels by type in the catalog.
+ */
+inline int countKernelsByType(KernelType type) {
+    int count = 0;
+    for (const auto &entry : getKernelCatalog()) {
+        if (entry.type == type) {
+            count++;
+        }
+    }
+    return count;
+}
+
+/**
+ * Get list of kernel names for a given type.
+ */
+inline std::vector<std::string> getKernelNames(KernelType type) {
+    std::vector<std::string> names;
+    for (const auto &entry : getKernelCatalog()) {
+        if (entry.type == type) {
+            names.push_back(entry.name);
+        }
+    }
+    return names;
+}
+
+/**
+ * Check if any Tensor Core kernel can run with given dimensions.
+ */
+inline bool canRunTensorCoreKernels(int M, int K, int N, bool has_tensor_cores) {
+    for (const auto &entry : getKernelCatalog()) {
+        if (entry.type == KernelType::TensorCore) {
+            if (entry.canRun(M, K, N, has_tensor_cores)) {
+                return true;
             }
         }
-    };
-    
-    return catalog;
+    }
+    return false;
 }
diff --git a/src/kernels/tensor_core_benchmark.cuh b/src/kernels/tensor_core_benchmark.cuh
index 6527387..3822315 100644
--- a/src/kernels/tensor_core_benchmark.cuh
+++ b/src/kernels/tensor_core_benchmark.cuh
@@ -1,9 +1,9 @@
 #pragma once
 
-#include "tensor_core_sgemm.cuh"
 #include "../utils/benchmark_core.cuh"
 #include "../utils/benchmark_metrics.cuh"
 #include "../utils/verify.cuh"
+#include "tensor_core_sgemm.cuh"
 
 #include <climits>
 #include <cublas_v2.h>
@@ -18,8 +18,22 @@
 // 此模块提供 Tensor Core 特有的 benchmark 功能。
 // 接口设计：只接受 cublasHandle_t，不依赖整个 SGEMMBenchmark 类，
 // 避免内核层对工具层的上穿依赖。
+//
+// 设计原则：
+// - 与 KernelCatalog 的约束保持一致
+// - 使用统一的容差和验证策略
+// - 所有约束检查集中在 tensorCoreDimensionsSupported() 和 tensorCoresAvailable()
 // ============================================================================
 
+/**
+ * 检查 Tensor Core compute-only benchmark 是否可以运行
+ *
+ * 统一的约束检查，与 KernelCatalog 的 canRun() 语义一致
+ */
+inline bool canRunTensorCoreComputeOnly(int M, int K, int N) {
+    return tensorCoresAvailable() && tensorCoreDimensionsSupported(M, K, N);
+}
+
 /**
  * 运行 Tensor Core 纯计算路径 benchmark
  *
@@ -32,13 +46,15 @@
  * @param benchmark_runs 测量次数
  * @param tolerance 验证容差
  * @return BenchmarkResult 包含性能数据
+ * @throws CudaError 如果约束不满足
  */
 inline BenchmarkResult
 runTensorCoreComputeOnlyBenchmark(cublasHandle_t cublas_handle, int M, int K, int N,
                                   int warmup_runs = 5, int benchmark_runs = 20,
                                   VerifyTolerance tolerance = kTensorCoreVerifyTolerance) {
 
-    if (!tensorCoresAvailable() || !tensorCoreDimensionsSupported(M, K, N)) {
+    // 约束检查 - 与 KernelCatalog 语义一致
+    if (!canRunTensorCoreComputeOnly(M, K, N)) {
         throw CudaError("Tensor Core compute-only benchmark requires sm_70+ and "
                         "dimensions aligned to 16");
     }
@@ -49,56 +65,48 @@ runTensorCoreComputeOnlyBenchmark(cublasHandle_t cublas_handle, int M, int K, in
     result.K = K;
     result.N = N;
 
-    std::vector<float> h_A(M * K), h_B(K * N), h_C(M * N), h_C_ref(M * N);
-    DeviceMemory<float> d_A(M * K);
-    DeviceMemory<float> d_B(K * N);
-    DeviceMemory<float> d_C(M * N);
-    DeviceMemory<float> d_C_ref(M * N);
-    DeviceMemory<half> d_A_fp16(M * K);
-    DeviceMemory<half> d_B_fp16(K * N);
+    // 安全计算矩阵大小，避免整数溢出
+    size_t size_A = static_cast<size_t>(M) * K;
+    size_t size_B = static_cast<size_t>(K) * N;
+    size_t size_C = static_cast<size_t>(M) * N;
+
+    std::vector<float> h_A(size_A), h_B(size_B), h_C(size_C), h_C_ref(size_C);
+    DeviceMemory<float> d_A(size_A);
+    DeviceMemory<float> d_B(size_B);
+    DeviceMemory<float> d_C(size_C);
+    DeviceMemory<float> d_C_ref(size_C);
+    DeviceMemory<half> d_A_fp16(size_A);
+    DeviceMemory<half> d_B_fp16(size_B);
 
     initRandomMatrix(h_A.data(), M, K, -1.0f, 1.0f, 42);
     initRandomMatrix(h_B.data(), K, N, -1.0f, 1.0f, 123);
 
-    d_A.copyFromHost(h_A.data(), M * K);
-    d_B.copyFromHost(h_B.data(), K * N);
+    d_A.copyFromHost(h_A.data(), size_A);
+    d_B.copyFromHost(h_B.data(), size_B);
 
     float alpha = 1.0f, beta = 0.0f;
-    CUBLAS_CHECK(cublasSgemm(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K, &alpha,
-                             d_B.get(), N, d_A.get(), K, &beta, d_C_ref.get(), N));
+    CUBLAS_CHECK(cublasSgemm(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, N, M, K, &alpha, d_B.get(), N,
+                             d_A.get(), K, &beta, d_C_ref.get(), N));
 
     int blockSize = kDefaultBlockSize;
-    // 安全计算 gridSize，检查溢出
-    auto safeGridSize = [](size_t num, int blk) -> int {
-        size_t grid = (num + blk - 1) / blk;
-        if (grid > static_cast<size_t>(INT_MAX)) {
-            throw CudaError("Grid size overflow: matrix too large for kernel launch");
-        }
-        return static_cast<int>(grid);
-    };
-    int gridSizeA = safeGridSize(static_cast<size_t>(M) * K, blockSize);
-    int gridSizeB = safeGridSize(static_cast<size_t>(K) * N, blockSize);
-
-    size_t num_A = static_cast<size_t>(M) * K;
-    size_t num_B = static_cast<size_t>(K) * N;
-
-    // 检查矩阵元素数量是否超过 int 最大值
-    if (num_A > static_cast<size_t>(INT_MAX)) {
-        throw CudaError("Matrix A size overflow: too many elements for int parameter");
-    }
-    if (num_B > static_cast<size_t>(INT_MAX)) {
-        throw CudaError("Matrix B size overflow: too many elements for int parameter");
-    }
-
-    float_to_half_kernel<<<gridSizeA, blockSize>>>(d_A.get(), d_A_fp16.get(), static_cast<int>(num_A));
-    float_to_half_kernel<<<gridSizeB, blockSize>>>(d_B.get(), d_B_fp16.get(), static_cast<int>(num_B));
+    checkMatrixElementCount(size_A, "A");
+    checkMatrixElementCount(size_B, "B");
+    int gridSizeA = safeGridSize(size_A, blockSize);
+    int gridSizeB = safeGridSize(size_B, blockSize);
+
+    float_to_half_kernel<<<gridSizeA, blockSize>>>(d_A.get(), d_A_fp16.get(),
+                                                   static_cast<int>(size_A));
+    float_to_half_kernel<<<gridSizeB, blockSize>>>(d_B.get(), d_B_fp16.get(),
+                                                   static_cast<int>(size_B));
     CUDA_CHECK(cudaGetLastError());
     CUDA_CHECK(cudaDeviceSynchronize());
 
     // 使用统一的 measureGpuTime 计时
-    float time_ms =
-        measureGpuTime([&]() { launch_tensor_core_sgemm_fp16(d_A_fp16.get(), d_B_fp16.get(), d_C.get(), M, K, N); },
-                       warmup_runs, benchmark_runs);
+    float time_ms = measureGpuTime(
+        [&]() {
+            launch_tensor_core_sgemm_fp16(d_A_fp16.get(), d_B_fp16.get(), d_C.get(), M, K, N);
+        },
+        warmup_runs, benchmark_runs);
 
     // 计算指标
     PerformanceMetrics metrics = calculateSgemmMetrics(M, K, N, time_ms);
@@ -107,8 +115,8 @@ runTensorCoreComputeOnlyBenchmark(cublasHandle_t cublas_handle, int M, int K, in
     result.bandwidth_gb_s = metrics.bandwidth_gb_s;
     result.efficiency = calculateEfficiency(result.gflops, getTheoreticalPeakGflops());
 
-    d_C.copyToHost(h_C.data(), M * N);
-    d_C_ref.copyToHost(h_C_ref.data(), M * N);
+    d_C.copyToHost(h_C.data(), size_C);
+    d_C_ref.copyToHost(h_C_ref.data(), size_C);
 
     VerifyResult verify_result = compareMatrices(h_C.data(), h_C_ref.data(), M, N, tolerance);
     result.correct = verify_result.passed;
diff --git a/src/kernels/tensor_core_sgemm.cuh b/src/kernels/tensor_core_sgemm.cuh
index b5607fc..b77f08f 100644
--- a/src/kernels/tensor_core_sgemm.cuh
+++ b/src/kernels/tensor_core_sgemm.cuh
@@ -60,9 +60,7 @@ inline bool tensorCoresAvailable(const DeviceInfoProvider &provider) {
 /**
  * 检查当前设备是否支持 Tensor Core (默认使用生产环境设备)
  */
-inline bool tensorCoresAvailable() {
-    return tensorCoresAvailable(getProductionDeviceInfo());
-}
+inline bool tensorCoresAvailable() { return tensorCoresAvailable(getProductionDeviceInfo()); }
 inline constexpr bool tensorCoreFastPathCompiled() { return SGEMM_HAS_WMMA_TARGET != 0; }
 
 /**
@@ -117,8 +115,8 @@ using FallbackKernel =
  *
  * 提供一个空的 fallback（用于测试或显式配置场景）
  */
-[[maybe_unused]] inline void
-nullFallback(const float *, const float *, float *, int, int, int, cudaStream_t = 0) {
+[[maybe_unused]] inline void nullFallback(const float *, const float *, float *, int, int, int,
+                                          cudaStream_t = 0) {
     // 空实现 - 用于测试
 }
 
@@ -234,8 +232,8 @@ inline void launch_tensor_core_sgemm_fp16(const half *A, const half *B, float *C
     }
 
     if (!tensorCoresAvailable() || !tensorCoreDimensionsSupported(M, K, N)) {
-        throw CudaError(
-            "launch_tensor_core_sgemm_fp16 requires runtime sm_70+ support and dimensions aligned to 16");
+        throw CudaError("launch_tensor_core_sgemm_fp16 requires runtime sm_70+ support and "
+                        "dimensions aligned to 16");
     }
 
     launch_tensor_core_sgemm_fp16_fast_path(A, B, C, M, K, N, stream);
@@ -283,25 +281,11 @@ inline void launch_tensor_core_sgemm_with_fallback(const float *A, const float *
     DeviceMemory<half> d_B_fp16(num_B);
 
     int blockSize = kDefaultBlockSize;
-    // 安全计算 gridSize，检查溢出
-    auto safeGridSize = [](size_t num, int blk) -> int {
-        size_t grid = (num + blk - 1) / blk;
-        if (grid > static_cast<size_t>(INT_MAX)) {
-            throw CudaError("Grid size overflow: matrix too large for kernel launch");
-        }
-        return static_cast<int>(grid);
-    };
+    checkMatrixElementCount(num_A, "A");
+    checkMatrixElementCount(num_B, "B");
     int gridSizeA = safeGridSize(num_A, blockSize);
     int gridSizeB = safeGridSize(num_B, blockSize);
 
-    // 检查矩阵元素数量是否超过 int 最大值
-    if (num_A > static_cast<size_t>(INT_MAX)) {
-        throw CudaError("Matrix A size overflow: too many elements for int parameter");
-    }
-    if (num_B > static_cast<size_t>(INT_MAX)) {
-        throw CudaError("Matrix B size overflow: too many elements for int parameter");
-    }
-
     float_to_half_kernel<<<gridSizeA, blockSize, 0, stream>>>(A, d_A_fp16.get(),
                                                               static_cast<int>(num_A));
     float_to_half_kernel<<<gridSizeB, blockSize, 0, stream>>>(B, d_B_fp16.get(),
diff --git a/src/utils/benchmark.cuh b/src/utils/benchmark.cuh
index 6e044c4..be711d2 100644
--- a/src/utils/benchmark.cuh
+++ b/src/utils/benchmark.cuh
@@ -94,8 +94,9 @@ class SGEMMBenchmark {
                                  d_B.get(), N, d_A.get(), K, &beta, d_C_ref.get(), N));
 
         // 使用统一的 measureGpuTime 计时
-        float time_ms = measureGpuTime([&]() { kernel_func(d_A.get(), d_B.get(), d_C.get(), M, K, N); },
-                                       warmup_runs, benchmark_runs);
+        float time_ms =
+            measureGpuTime([&]() { kernel_func(d_A.get(), d_B.get(), d_C.get(), M, K, N); },
+                           warmup_runs, benchmark_runs);
 
         // 计算指标
         PerformanceMetrics metrics = calculateSgemmMetrics(M, K, N, time_ms);
@@ -210,6 +211,9 @@ class SGEMMBenchmark {
                  << "," << ai << "\n";
         }
 
+        if (file.fail()) {
+            fprintf(stderr, "Warning: errors occurred while writing %s\n", filename.c_str());
+        }
         file.close();
         printf("Approximate roofline data exported to: %s\n", filename.c_str());
     }
diff --git a/src/utils/benchmark_core.cuh b/src/utils/benchmark_core.cuh
index c299ce9..b66be9d 100644
--- a/src/utils/benchmark_core.cuh
+++ b/src/utils/benchmark_core.cuh
@@ -90,6 +90,10 @@ class CudaTimer {
  */
 template <typename RunFunc>
 float measureGpuTime(RunFunc func, int warmup_runs = 5, int benchmark_runs = 20) {
+    if (benchmark_runs <= 0) {
+        throw CudaError("measureGpuTime: benchmark_runs must be positive");
+    }
+
     // 预热运行
     for (int i = 0; i < warmup_runs; ++i) {
         func();
diff --git a/src/utils/benchmark_metrics.cuh b/src/utils/benchmark_metrics.cuh
index 4c25b4e..0ec3d59 100644
--- a/src/utils/benchmark_metrics.cuh
+++ b/src/utils/benchmark_metrics.cuh
@@ -59,8 +59,10 @@ inline PerformanceMetrics calculateSgemmMetrics(int M, int K, int N, float time_
  * 提供重载版本以支持可注入的 device info provider。
  */
 inline float getTheoreticalPeakGflops(const DeviceInfoProvider &provider) {
-    // 峰值 GFLOPS = SMs * cores/SM * 2 (FMA) * clock (GHz) * 1000 (MHz factor)
-    float peakGflops = provider.smCount() * provider.cores_per_sm * 2 * provider.clock_ghz * 1000;
+    // 峰值 GFLOPS = SMs * cores/SM * 2 (FMA) * clock (GHz)
+    // clock_ghz 已经是 GHz 单位，GFLOPS = FLOPS / 1e9 = cores * 2 * clock_GHz
+    float peakGflops =
+        static_cast<float>(provider.smCount()) * provider.cores_per_sm * 2.0f * provider.clock_ghz;
     return peakGflops;
 }
 
diff --git a/src/utils/cuda_utils.cuh b/src/utils/cuda_utils.cuh
index 956249b..64ceb82 100644
--- a/src/utils/cuda_utils.cuh
+++ b/src/utils/cuda_utils.cuh
@@ -170,6 +170,29 @@ inline void initRandomMatrix(float *data, int rows, int cols, float min_val = -1
 // Utility Functions
 // ============================================================================
 
+/**
+ * Safely compute grid size from element count and block size.
+ * Throws CudaError if the grid size would overflow int.
+ */
+inline int safeGridSize(size_t num, int blk) {
+    size_t grid = (num + blk - 1) / blk;
+    if (grid > static_cast<size_t>(INT_MAX)) {
+        throw CudaError("Grid size overflow: matrix too large for kernel launch");
+    }
+    return static_cast<int>(grid);
+}
+
+/**
+ * Check that a matrix element count fits in int (for kernel parameters).
+ * Throws CudaError if it doesn't.
+ */
+inline void checkMatrixElementCount(size_t count, const char *name) {
+    if (count > static_cast<size_t>(INT_MAX)) {
+        throw CudaError(std::string("Matrix ") + name +
+                        " size overflow: too many elements for int parameter");
+    }
+}
+
 inline bool cudaDeviceAvailable() {
     int device_count = 0;
     cudaError_t status = cudaGetDeviceCount(&device_count);
@@ -255,7 +278,8 @@ class DeviceInfoCache {
     }
 
     int computeCoresPerSM() const {
-        // 参考: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities
+        // 参考:
+        // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#compute-capabilities
         if (prop_.major == 7) {
             return 64; // Volta (sm_70, sm_72), Turing (sm_75)
         } else if (prop_.major == 8) {
@@ -291,6 +315,4 @@ inline DeviceInfoProvider ProductionDeviceInfoProvider::get() const {
 /**
  * Convenience function: get production device info
  */
-inline DeviceInfoProvider getProductionDeviceInfo() {
-    return ProductionDeviceInfoProvider{}.get();
-}
+inline DeviceInfoProvider getProductionDeviceInfo() { return ProductionDeviceInfoProvider{}.get(); }
diff --git a/src/utils/verify.cuh b/src/utils/verify.cuh
index 77a1843..00512df 100644
--- a/src/utils/verify.cuh
+++ b/src/utils/verify.cuh
@@ -9,10 +9,29 @@
 #include <limits>
 #include <vector>
 
+// ============================================================================
+// Verification Module
+//
+// Provides unified verification strategy for SGEMM results:
+// - Tolerance configuration (FP32 standard, Tensor Core relaxed)
+// - Matrix comparison with numpy-style allclose semantics
+// - cuBLAS reference computation adapter
+//
+// Design:
+// - Single source of truth for tolerance policies
+// - Separates reference computation from comparison logic
+// - Supports both device and host pointer comparisons
+// ============================================================================
+
 // ============================================================================
 // Verification Result Structure
 // ============================================================================
 
+/**
+ * Result of matrix verification
+ *
+ * Captures all metrics needed to assess correctness and diagnose issues.
+ */
 struct VerifyResult {
     bool passed;
     float max_abs_error;
@@ -29,15 +48,27 @@ struct VerifyResult {
                    100.0f * error_count / total_elements);
         }
     }
+
+    /**
+     * Legacy compatibility: returns true if verification failed.
+     * Used by older code that checks shouldFlagAsIncorrect().
+     */
+    bool shouldFlagAsIncorrect() const { return !passed; }
 };
 
 // ============================================================================
 // Tolerance Configuration
 // ============================================================================
 
+/**
+ * Tolerance specification for matrix comparison.
+ *
+ * Uses numpy-style allclose semantics:
+ * |test - ref| <= atol + rtol * |ref|
+ */
 struct VerifyTolerance {
-    float rtol;
-    float atol;
+    float rtol; // Relative tolerance
+    float atol; // Absolute tolerance
 };
 
 // Standard verification tolerance for FP32 kernels
@@ -46,23 +77,32 @@ inline constexpr VerifyTolerance kStandardVerifyTolerance{1e-3f, 1e-4f};
 // Tensor Core verification tolerance (FP16 intermediate precision)
 inline constexpr VerifyTolerance kTensorCoreVerifyTolerance{5e-2f, 1e-2f};
 
+/**
+ * Compute tolerance threshold for a specific reference value.
+ */
 inline float toleranceForValue(float ref_val, VerifyTolerance tolerance) {
     return tolerance.atol + tolerance.rtol * std::fabs(ref_val);
 }
 
+/**
+ * Check if a test value is within tolerance of a reference value.
+ */
 inline bool isWithinTolerance(float test_val, float ref_val, VerifyTolerance tolerance) {
     float abs_error = std::fabs(test_val - ref_val);
     return abs_error <= toleranceForValue(ref_val, tolerance);
 }
 
 // ============================================================================
-// Internal Implementation
+// Matrix Comparison (Internal Implementation)
 // ============================================================================
 
 namespace detail {
 
-// 内部实现：比较两个矩阵并返回验证结果
-// 供 verify() 和 compareMatrices() 共享
+/**
+ * Internal: Compare two host matrices and return verification result.
+ *
+ * This is the core comparison logic used by all verification functions.
+ */
 inline VerifyResult compareMatricesImpl(const float *h_test, const float *h_ref,
                                         size_t num_elements, VerifyTolerance tolerance) {
     VerifyResult result;
@@ -75,8 +115,24 @@ inline VerifyResult compareMatricesImpl(const float *h_test, const float *h_ref,
         float ref_val = h_ref[i];
         float test_val = h_test[i];
 
-        // Check for NaN or Inf
+        // Check for NaN or Inf in test output
         if (std::isnan(test_val) || std::isinf(test_val)) {
+            // If reference is also the same NaN/Inf, consider it a match
+            if (std::isnan(ref_val) && std::isnan(test_val)) {
+                continue;
+            }
+            if (std::isinf(ref_val) && std::isinf(test_val) &&
+                std::signbit(ref_val) == std::signbit(test_val)) {
+                continue;
+            }
+            result.error_count++;
+            result.max_abs_error = std::numeric_limits<float>::infinity();
+            result.max_rel_error = std::numeric_limits<float>::infinity();
+            continue;
+        }
+
+        // Skip comparison when reference is NaN/Inf (can't meaningfully compare)
+        if (std::isnan(ref_val) || std::isinf(ref_val)) {
             result.error_count++;
             result.max_abs_error = std::numeric_limits<float>::infinity();
             result.max_rel_error = std::numeric_limits<float>::infinity();
@@ -104,18 +160,58 @@ inline VerifyResult compareMatricesImpl(const float *h_test, const float *h_ref,
 // Standalone Verification Functions
 // ============================================================================
 
-// Compare two matrices and return verification result
-// Uses numpy-style allclose: |test - ref| <= atol + rtol * |ref|
+/**
+ * Compare two host matrices and return verification result.
+ *
+ * @param h_test Test matrix (host pointer)
+ * @param h_ref Reference matrix (host pointer)
+ * @param M, N Matrix dimensions
+ * @param tolerance Verification tolerance
+ */
 inline VerifyResult compareMatrices(const float *h_test, const float *h_ref, int M, int N,
                                     VerifyTolerance tolerance = kStandardVerifyTolerance) {
     size_t num_elements = static_cast<size_t>(M) * N;
     return detail::compareMatricesImpl(h_test, h_ref, num_elements, tolerance);
 }
 
+/**
+ * Compare two device matrices and return verification result.
+ *
+ * Copies both matrices to host before comparison.
+ *
+ * @param d_test Test matrix (device pointer)
+ * @param d_ref Reference matrix (device pointer)
+ * @param M, N Matrix dimensions
+ * @param tolerance Verification tolerance
+ */
+inline VerifyResult compareDeviceMatrices(const float *d_test, const float *d_ref, int M, int N,
+                                          VerifyTolerance tolerance = kStandardVerifyTolerance) {
+    size_t num_elements = static_cast<size_t>(M) * N;
+    std::vector<float> h_test(num_elements);
+    std::vector<float> h_ref(num_elements);
+
+    CUDA_CHECK(
+        cudaMemcpy(h_test.data(), d_test, num_elements * sizeof(float), cudaMemcpyDeviceToHost));
+    CUDA_CHECK(
+        cudaMemcpy(h_ref.data(), d_ref, num_elements * sizeof(float), cudaMemcpyDeviceToHost));
+
+    return detail::compareMatricesImpl(h_test.data(), h_ref.data(), num_elements, tolerance);
+}
+
 // ============================================================================
-// cuBLAS Reference SGEMM
+// cuBLAS Reference Provider
 // ============================================================================
 
+/**
+ * RAII wrapper for cuBLAS-based reference computation.
+ *
+ * Provides:
+ * - cuBLAS handle management
+ * - Reference SGEMM computation
+ * - Verification against kernel output
+ *
+ * This is the primary adapter for producing reference results.
+ */
 class SGEMMVerifier {
   public:
     SGEMMVerifier() { CUBLAS_CHECK(cublasCreate(&handle_)); }
@@ -126,9 +222,21 @@ class SGEMMVerifier {
         }
     }
 
-    // Compute reference result using cuBLAS
-    // C = alpha * A * B + beta * C
-    // A: M x K, B: K x N, C: M x N (row-major)
+    // Non-copyable, non-movable
+    SGEMMVerifier(const SGEMMVerifier &) = delete;
+    SGEMMVerifier &operator=(const SGEMMVerifier &) = delete;
+
+    /**
+     * Compute reference result using cuBLAS.
+     *
+     * C = alpha * A * B + beta * C
+     * A: M x K, B: K x N, C: M x N (row-major)
+     *
+     * @param d_A, d_B Input matrices (device pointers)
+     * @param d_C Output matrix (device pointer)
+     * @param M, K, N Matrix dimensions
+     * @param alpha, beta Scaling factors
+     */
     void computeReference(const float *d_A, const float *d_B, float *d_C, int M, int K, int N,
                           float alpha = 1.0f, float beta = 0.0f) {
         // cuBLAS uses column-major, so we compute C^T = B^T * A^T
@@ -137,34 +245,45 @@ class SGEMMVerifier {
                                  &beta, d_C, N));
     }
 
-    // Verify kernel output against reference
-    // Uses numpy-style allclose: |test - ref| <= atol + rtol * |ref|
+    /**
+     * Verify kernel output against reference (host pointers).
+     */
     VerifyResult verify(const float *h_test, const float *h_ref, int M, int N,
                         VerifyTolerance tolerance = kStandardVerifyTolerance) {
-        size_t num_elements = static_cast<size_t>(M) * N;
-        return detail::compareMatricesImpl(h_test, h_ref, num_elements, tolerance);
+        return compareMatrices(h_test, h_ref, M, N, tolerance);
     }
 
-    // Verify with device pointers (copies to host internally)
+    /**
+     * Verify kernel output against reference (device pointers).
+     */
     VerifyResult verifyDevice(const float *d_test, const float *d_ref, int M, int N,
                               VerifyTolerance tolerance = kStandardVerifyTolerance) {
-        size_t num_elements = static_cast<size_t>(M) * N;
-        std::vector<float> h_test(num_elements);
-        std::vector<float> h_ref(num_elements);
-
-        CUDA_CHECK(cudaMemcpy(h_test.data(), d_test, num_elements * sizeof(float),
-                              cudaMemcpyDeviceToHost));
-        CUDA_CHECK(
-            cudaMemcpy(h_ref.data(), d_ref, num_elements * sizeof(float), cudaMemcpyDeviceToHost));
-
-        return verify(h_test.data(), h_ref.data(), M, N, tolerance);
+        return compareDeviceMatrices(d_test, d_ref, M, N, tolerance);
     }
 
-    // Keep error flagging semantics aligned with compareMatrices/verify.
-    static bool shouldFlagAsIncorrect(const VerifyResult &result) { return !result.passed; }
-
+    /**
+     * Access the underlying cuBLAS handle.
+     *
+     * Note: This is provided for compatibility with existing code that needs
+     * direct cuBLAS access (e.g., Tensor Core compute-only benchmark).
+     * Prefer using computeReference() for standard verification flows.
+     */
     cublasHandle_t getHandle() { return handle_; }
 
   private:
     cublasHandle_t handle_;
 };
+
+// ============================================================================
+// Legacy Compatibility
+// ============================================================================
+
+/**
+ * Legacy function: Check if verification result indicates failure.
+ *
+ * @deprecated Use VerifyResult::shouldFlagAsIncorrect() or check result.passed directly.
+ */
+[[deprecated("Use VerifyResult::shouldFlagAsIncorrect() or check result.passed directly")]]
+inline bool shouldFlagAsIncorrect(const VerifyResult &result) {
+    return !result.passed;
+}
diff --git a/tests/test_benchmark_settings.cu b/tests/test_benchmark_settings.cpp
similarity index 98%
rename from tests/test_benchmark_settings.cu
rename to tests/test_benchmark_settings.cpp
index eace3f1..1779177 100644
--- a/tests/test_benchmark_settings.cu
+++ b/tests/test_benchmark_settings.cpp
@@ -5,10 +5,12 @@
  * - Run configuration (warmup, benchmark iterations)
  * - Verification tolerance policy
  * - Output/export options
+ *
+ * This is a CPU-only test file that does not require CUDA.
  */
 
-#include <gtest/gtest.h>
 #include "utils/benchmark_settings.cuh"
+#include <gtest/gtest.h>
 
 // ============================================================================
 // Run Settings Tests
@@ -138,12 +140,12 @@ TEST(BenchmarkSettingsTest, CustomOutputSettings) {
 
 TEST(BenchmarkSettingsTest, ToleranceForKernelTypeUsesDefaults) {
     BenchmarkSettings settings;
-    
+
     // Standard kernels use standard tolerance
     VerifyTolerance std_tol = settings.toleranceForKernel(KernelType::Standard);
     EXPECT_FLOAT_EQ(std_tol.rtol, kStandardVerifyTolerance.rtol);
     EXPECT_FLOAT_EQ(std_tol.atol, kStandardVerifyTolerance.atol);
-    
+
     // Tensor Core kernels use tensor core tolerance
     VerifyTolerance tc_tol = settings.toleranceForKernel(KernelType::TensorCore);
     EXPECT_FLOAT_EQ(tc_tol.rtol, kTensorCoreVerifyTolerance.rtol);
@@ -152,22 +154,17 @@ TEST(BenchmarkSettingsTest, ToleranceForKernelTypeUsesDefaults) {
 
 TEST(BenchmarkSettingsTest, ToleranceForKernelTypeRespectsCustomSettings) {
     BenchmarkSettings settings;
-    
+
     // Customize both tolerances
     settings.verify.standard_tolerance = {0.01f, 0.001f};
     settings.verify.tensor_core_tolerance = {0.1f, 0.05f};
-    
+
     // Verify toleranceForKernel returns the custom values
     VerifyTolerance std_tol = settings.toleranceForKernel(KernelType::Standard);
     EXPECT_FLOAT_EQ(std_tol.rtol, 0.01f);
     EXPECT_FLOAT_EQ(std_tol.atol, 0.001f);
-    
+
     VerifyTolerance tc_tol = settings.toleranceForKernel(KernelType::TensorCore);
     EXPECT_FLOAT_EQ(tc_tol.rtol, 0.1f);
     EXPECT_FLOAT_EQ(tc_tol.atol, 0.05f);
 }
-
-int main(int argc, char **argv) {
-    testing::InitGoogleTest(&argc, argv);
-    return RUN_ALL_TESTS();
-}
diff --git a/tests/test_device_info_seam.cu b/tests/test_device_info_cpu.cu
similarity index 82%
rename from tests/test_device_info_seam.cu
rename to tests/test_device_info_cpu.cu
index 16f8a28..640f953 100644
--- a/tests/test_device_info_seam.cu
+++ b/tests/test_device_info_cpu.cu
@@ -1,15 +1,15 @@
 /**
- * Device Info Provider Seam Tests
+ * Device Info Provider CPU-only Tests
  *
- * Tests for the device capability query seam, demonstrating that tests can
- * inject fake device info without relying on real GPU hardware capabilities.
+ * Tests for the device capability query seam, using fake device properties
+ * without requiring any CUDA device. These tests can run on any system
+ * with a C++17 compiler.
  */
 
 #include <gtest/gtest.h>
 
 #include "kernels/tensor_core_sgemm.cuh"
 #include "utils/benchmark_metrics.cuh"
-#include "utils/cuda_utils.cuh"
 #include "utils/device_info_provider.cuh"
 
 namespace {
@@ -25,9 +25,9 @@ class FakeDeviceProvider : public ::testing::Test {
         volta_prop_.major = 7;
         volta_prop_.minor = 0;
         volta_prop_.multiProcessorCount = 80;
-        volta_prop_.clockRate = 1530000; // 1.53 GHz in kHz
+        volta_prop_.clockRate = 1530000;      // 1.53 GHz in kHz
         volta_prop_.memoryClockRate = 877000; // 877 MHz in kHz
-        volta_prop_.memoryBusWidth = 4096; // HBM2
+        volta_prop_.memoryBusWidth = 4096;    // HBM2
 
         volta_provider_ = DeviceInfoProvider{
             &volta_prop_,
@@ -165,33 +165,6 @@ TEST_F(FakeDeviceProvider, ArchitectureNamingUnknown) {
     EXPECT_STREQ(getTensorCoreArchName(unknown_provider), "Unknown");
 }
 
-// ============================================================================
-// Production Adapter Integration Test
-// ============================================================================
-
-TEST(DeviceInfoSeam, ProductionAdapterWorks) {
-    // This test validates that the production adapter can successfully query
-    // real device info. It should pass on any CUDA-capable device.
-    DeviceInfoProvider prod = getProductionDeviceInfo();
-
-    // Basic sanity checks
-    EXPECT_NE(prod.prop, nullptr);
-    EXPECT_GT(prod.cores_per_sm, 0);
-    EXPECT_GT(prod.clock_ghz, 0.0f);
-
-    // Check that overloaded functions work without provider parameter
-    float peak_gflops = getTheoreticalPeakGflops();
-    EXPECT_GT(peak_gflops, 0.0f);
-
-    float peak_bandwidth = getTheoreticalPeakBandwidth();
-    EXPECT_GT(peak_bandwidth, 0.0f);
-
-    // Tensor core availability should be consistent
-    bool has_tc_explicit = tensorCoresAvailable(prod);
-    bool has_tc_default = tensorCoresAvailable();
-    EXPECT_EQ(has_tc_explicit, has_tc_default);
-}
-
 // ============================================================================
 // Dimension Support Tests (no device dependency)
 // ============================================================================
diff --git a/tests/test_device_info_cuda.cu b/tests/test_device_info_cuda.cu
new file mode 100644
index 0000000..70fe348
--- /dev/null
+++ b/tests/test_device_info_cuda.cu
@@ -0,0 +1,48 @@
+/**
+ * Device Info Provider CUDA Tests
+ *
+ * Tests that require a real CUDA device. These tests will be skipped
+ * automatically when no CUDA device is available.
+ */
+
+#include <gtest/gtest.h>
+
+#include "gtest_cuda_environment.cuh"
+#include "utils/benchmark_metrics.cuh"
+#include "utils/cuda_utils.cuh"
+#include "utils/device_info_provider.cuh"
+
+namespace {
+
+// ============================================================================
+// Production Adapter Integration Test
+// ============================================================================
+
+TEST(DeviceInfoSeam, ProductionAdapterWorks) {
+    // This test validates that the production adapter can successfully query
+    // real device info. It should pass on any CUDA-capable device.
+    DeviceInfoProvider prod = getProductionDeviceInfo();
+
+    // Basic sanity checks
+    EXPECT_NE(prod.prop, nullptr);
+    EXPECT_GT(prod.cores_per_sm, 0);
+    EXPECT_GT(prod.clock_ghz, 0.0f);
+
+    // Check that overloaded functions work without provider parameter
+    float peak_gflops = getTheoreticalPeakGflops();
+    EXPECT_GT(peak_gflops, 0.0f);
+
+    float peak_bandwidth = getTheoreticalPeakBandwidth();
+    EXPECT_GT(peak_bandwidth, 0.0f);
+
+    // Tensor core availability should be consistent
+    bool has_tc_explicit = tensorCoresAvailable(prod);
+    bool has_tc_default = tensorCoresAvailable();
+    EXPECT_EQ(has_tc_explicit, has_tc_default);
+}
+
+} // namespace
+
+int main(int argc, char **argv) {
+    return runCudaAwareTests(argc, argv);
+}
diff --git a/tests/test_kernel_catalog.cu b/tests/test_kernel_catalog.cu
index 075da9e..cce5861 100644
--- a/tests/test_kernel_catalog.cu
+++ b/tests/test_kernel_catalog.cu
@@ -3,27 +3,30 @@
  *
  * Tests the kernel catalog registration system:
  * - Catalog contains expected kernels
- * - Entries have valid names and launchers
+ * - Entries have valid metadata (names, launchers, constraints)
+ * - Constraints checking works correctly
  * - Launch functions are callable without crashes
  * - Order preservation of kernel entries
+ *
+ * This test requires a CUDA device and will be skipped
+ * automatically when no CUDA device is available.
  */
 
-#include <gtest/gtest.h>
+#include "gtest_cuda_environment.cuh"
 #include "kernels/kernel_catalog.cuh"
-#include "utils/cuda_utils.cuh"
 #include "utils/benchmark_settings.cuh"
+#include "utils/cuda_utils.cuh"
+#include <gtest/gtest.h>
 
 // ============================================================================
-// Kernel Catalog Tests
+// Kernel Catalog Metadata Tests
 // ============================================================================
 
 class KernelCatalogTest : public ::testing::Test {
   protected:
-    void SetUp() override {
-        catalog_ = &getKernelCatalog();
-    }
+    void SetUp() override { catalog_ = &getKernelCatalog(); }
 
-    const std::vector<KernelCatalogEntry>* catalog_;
+    const std::vector<KernelCatalogEntry> *catalog_;
 };
 
 TEST_F(KernelCatalogTest, CatalogNotEmpty) {
@@ -31,51 +34,183 @@ TEST_F(KernelCatalogTest, CatalogNotEmpty) {
 }
 
 TEST_F(KernelCatalogTest, CatalogHasStandardKernels) {
-    int standard_count = 0;
-    for (const auto& entry : *catalog_) {
-        if (entry.type == KernelType::Standard) {
-            standard_count++;
-        }
-    }
+    int standard_count = countKernelsByType(KernelType::Standard);
     EXPECT_GE(standard_count, 4)
         << "Should have at least 4 standard kernels (Naive, Tiled, BankConflictFree, DoubleBuffer)";
 }
 
 TEST_F(KernelCatalogTest, CatalogHasTensorCoreKernels) {
-    int tc_count = 0;
-    for (const auto& entry : *catalog_) {
-        if (entry.type == KernelType::TensorCore) {
-            tc_count++;
-        }
-    }
-    EXPECT_GE(tc_count, 1)
-        << "Should have at least 1 tensor core kernel (end-to-end)";
+    int tc_count = countKernelsByType(KernelType::TensorCore);
+    EXPECT_GE(tc_count, 1) << "Should have at least 1 tensor core kernel (end-to-end)";
 }
 
 TEST_F(KernelCatalogTest, CatalogEntriesHaveNamesAndLaunchers) {
-    for (const auto& entry : *catalog_) {
+    for (const auto &entry : *catalog_) {
         EXPECT_FALSE(entry.name.empty()) << "All entries should have non-empty names";
         EXPECT_TRUE(static_cast<bool>(entry.launcher))
             << "Entry '" << entry.name << "' should have a valid launcher";
     }
 }
 
+TEST_F(KernelCatalogTest, CatalogEntriesHaveConstraints) {
+    for (const auto &entry : *catalog_) {
+        // Standard kernels should not require tensor cores
+        if (entry.type == KernelType::Standard) {
+            EXPECT_FALSE(entry.constraints.requires_tensor_cores)
+                << "Standard kernel '" << entry.name << "' should not require tensor cores";
+        }
+
+        // Tensor Core kernels should require sm_70+
+        if (entry.type == KernelType::TensorCore) {
+            EXPECT_TRUE(entry.constraints.requires_tensor_cores)
+                << "Tensor Core kernel '" << entry.name << "' should require tensor cores";
+        }
+    }
+}
+
+TEST_F(KernelCatalogTest, CatalogPreservesOrder) {
+    ASSERT_GE(catalog_->size(), 5u) << "Expected at least 5 kernels";
+
+    // Verify the expected order: Naive, Tiled, BankConflictFree, DoubleBuffer, TensorCore
+    // end-to-end
+    EXPECT_EQ((*catalog_)[0].name, "Naive") << "First kernel should be Naive";
+    EXPECT_EQ((*catalog_)[1].name, "Tiled (32x32)") << "Second kernel should be Tiled";
+    EXPECT_EQ((*catalog_)[2].name, "Bank Conflict Free")
+        << "Third kernel should be BankConflictFree";
+    EXPECT_EQ((*catalog_)[3].name, "Double Buffer") << "Fourth kernel should be DoubleBuffer";
+    EXPECT_EQ((*catalog_)[4].name, "Tensor Core (WMMA end-to-end)")
+        << "Fifth kernel should be Tensor Core end-to-end";
+}
+
+// ============================================================================
+// Kernel Constraints Tests
+// ============================================================================
+
+TEST(KernelConstraintsTest, StandardConstraintsAllowAnyDimensions) {
+    auto constraints = KernelConstraints::standard();
+
+    EXPECT_FALSE(constraints.requires_tensor_cores);
+    EXPECT_EQ(constraints.dimension_alignment, 0);
+
+    // Should accept any positive dimensions
+    EXPECT_TRUE(constraints.isSatisfied(1, 1, 1, false));
+    EXPECT_TRUE(constraints.isSatisfied(1024, 1024, 1024, false));
+    EXPECT_TRUE(constraints.isSatisfied(15, 17, 33, false)); // Unaligned
+}
+
+TEST(KernelConstraintsTest, TensorCoreConstraintsRequireAlignment) {
+    auto constraints = KernelConstraints::tensorCore();
+
+    EXPECT_TRUE(constraints.requires_tensor_cores);
+    EXPECT_EQ(constraints.dimension_alignment, 16);
+
+    // Should reject without tensor cores
+    EXPECT_FALSE(constraints.isSatisfied(16, 16, 16, false));
+
+    // Should accept aligned dimensions with tensor cores
+    EXPECT_TRUE(constraints.isSatisfied(16, 16, 16, true));
+    EXPECT_TRUE(constraints.isSatisfied(64, 128, 256, true));
+
+    // Should reject unaligned dimensions even with tensor cores
+    EXPECT_FALSE(constraints.isSatisfied(15, 16, 16, true));
+    EXPECT_FALSE(constraints.isSatisfied(16, 17, 16, true));
+    EXPECT_FALSE(constraints.isSatisfied(16, 16, 33, true));
+}
+
+TEST(KernelConstraintsTest, ComputeOnlyConstraintsAreSpecial) {
+    auto constraints = KernelConstraints::tensorCoreComputeOnly();
+
+    EXPECT_TRUE(constraints.requires_tensor_cores);
+    EXPECT_TRUE(constraints.requires_compute_only);
+    EXPECT_EQ(constraints.dimension_alignment, 16);
+}
+
+// ============================================================================
+// Catalog Entry Tests
+// ============================================================================
+
+TEST(KernelCatalogEntryTest, DefaultToleranceForStandardKernels) {
+    auto constraints = KernelConstraints::standard();
+    KernelCatalogEntry entry{"Test Standard", KernelType::Standard, nullptr, constraints};
+
+    auto tolerance = entry.defaultTolerance();
+    EXPECT_FLOAT_EQ(tolerance.rtol, kStandardVerifyTolerance.rtol);
+    EXPECT_FLOAT_EQ(tolerance.atol, kStandardVerifyTolerance.atol);
+}
+
+TEST(KernelCatalogEntryTest, DefaultToleranceForTensorCoreKernels) {
+    auto constraints = KernelConstraints::tensorCore();
+    KernelCatalogEntry entry{"Test TensorCore", KernelType::TensorCore, nullptr, constraints};
+
+    auto tolerance = entry.defaultTolerance();
+    EXPECT_FLOAT_EQ(tolerance.rtol, kTensorCoreVerifyTolerance.rtol);
+    EXPECT_FLOAT_EQ(tolerance.atol, kTensorCoreVerifyTolerance.atol);
+}
+
+TEST(KernelCatalogEntryTest, CanRunMethodWorks) {
+    auto tc_entry = getTensorCoreComputeOnlyEntry();
+
+    // Should not run without tensor cores
+    EXPECT_FALSE(tc_entry.canRun(16, 16, 16, false));
+
+    // Should run with tensor cores and aligned dimensions
+    EXPECT_TRUE(tc_entry.canRun(16, 16, 16, true));
+    EXPECT_TRUE(tc_entry.canRun(64, 128, 256, true));
+
+    // Should not run with unaligned dimensions
+    EXPECT_FALSE(tc_entry.canRun(15, 16, 16, true));
+}
+
+// ============================================================================
+// Catalog Query Utilities Tests
+// ============================================================================
+
+TEST(CatalogQueryTest, CountKernelsByTypeWorks) {
+    int standard_count = countKernelsByType(KernelType::Standard);
+    int tensor_core_count = countKernelsByType(KernelType::TensorCore);
+
+    EXPECT_GE(standard_count, 4);
+    EXPECT_GE(tensor_core_count, 1);
+}
+
+TEST(CatalogQueryTest, GetKernelNamesWorks) {
+    auto standard_names = getKernelNames(KernelType::Standard);
+
+    EXPECT_GE(standard_names.size(), 4u);
+    EXPECT_EQ(standard_names[0], "Naive");
+    EXPECT_EQ(standard_names[1], "Tiled (32x32)");
+}
+
+TEST(CatalogQueryTest, CanRunTensorCoreKernelsWorks) {
+    // With tensor cores
+    EXPECT_TRUE(canRunTensorCoreKernels(16, 16, 16, true));
+    EXPECT_TRUE(canRunTensorCoreKernels(64, 128, 256, true));
+    EXPECT_FALSE(canRunTensorCoreKernels(15, 16, 16, true)); // Unaligned
+
+    // Without tensor cores
+    EXPECT_FALSE(canRunTensorCoreKernels(16, 16, 16, false));
+}
+
+// ============================================================================
+// Kernel Launch Tests (requires CUDA device)
+// ============================================================================
+
 TEST_F(KernelCatalogTest, CatalogLaunchCallable) {
     ASSERT_FALSE(catalog_->empty()) << "Catalog is empty, cannot test launch";
-    
+
     // Small test: verify we can call the launcher without crashing
-    const auto& entry = (*catalog_)[0];
+    const auto &entry = (*catalog_)[0];
     const int M = 64, K = 64, N = 64;
-    
+
     DeviceMemory<float> d_A(M * K);
     DeviceMemory<float> d_B(K * N);
     DeviceMemory<float> d_C(M * N);
-    
+
     // Initialize with zeros
     CUDA_CHECK(cudaMemset(d_A.get(), 0, M * K * sizeof(float)));
     CUDA_CHECK(cudaMemset(d_B.get(), 0, K * N * sizeof(float)));
     CUDA_CHECK(cudaMemset(d_C.get(), 0, M * N * sizeof(float)));
-    
+
     // Launch should not crash
     EXPECT_NO_THROW({
         entry.launcher(d_A.get(), d_B.get(), d_C.get(), M, K, N);
@@ -83,14 +218,4 @@ TEST_F(KernelCatalogTest, CatalogLaunchCallable) {
     });
 }
 
-TEST_F(KernelCatalogTest, CatalogPreservesOrder) {
-    ASSERT_GE(catalog_->size(), 5u) << "Expected at least 5 kernels";
-    
-    // Verify the expected order: Naive, Tiled, BankConflictFree, DoubleBuffer, TensorCore end-to-end
-    EXPECT_EQ((*catalog_)[0].name, "Naive") << "First kernel should be Naive";
-    EXPECT_EQ((*catalog_)[1].name, "Tiled (32x32)") << "Second kernel should be Tiled";
-    EXPECT_EQ((*catalog_)[2].name, "Bank Conflict Free") << "Third kernel should be BankConflictFree";
-    EXPECT_EQ((*catalog_)[3].name, "Double Buffer") << "Fourth kernel should be DoubleBuffer";
-    EXPECT_EQ((*catalog_)[4].name, "Tensor Core (WMMA end-to-end)") << "Fifth kernel should be Tensor Core end-to-end";
-}
-
+int main(int argc, char **argv) { return runCudaAwareTests(argc, argv); }
diff --git a/tests/test_performance.cu b/tests/test_performance.cu
index 26c3b19..0db23e7 100644
--- a/tests/test_performance.cu
+++ b/tests/test_performance.cu
@@ -110,22 +110,24 @@ class PerformanceRegressionTest : public ::testing::Test {
 TEST_F(PerformanceRegressionTest, NaiveKernelPerformance) {
     printf("\nNaive Kernel Performance:\n");
     for (const auto &[M, K, N] : test_dimensions_) {
-        runPerformanceTest("Naive",
-                           [](const float *A, const float *B, float *C, int m, int k, int n) {
-                               launch_naive_sgemm<>(A, B, C, m, k, n);
-                           },
-                           M, K, N);
+        runPerformanceTest(
+            "Naive",
+            [](const float *A, const float *B, float *C, int m, int k, int n) {
+                launch_naive_sgemm<>(A, B, C, m, k, n);
+            },
+            M, K, N);
     }
 }
 
 TEST_F(PerformanceRegressionTest, TiledKernelPerformance) {
     printf("\nTiled Kernel Performance:\n");
     for (const auto &[M, K, N] : test_dimensions_) {
-        runPerformanceTest("Tiled",
-                           [](const float *A, const float *B, float *C, int m, int k, int n) {
-                               launch_tiled_sgemm<32>(A, B, C, m, k, n);
-                           },
-                           M, K, N);
+        runPerformanceTest(
+            "Tiled",
+            [](const float *A, const float *B, float *C, int m, int k, int n) {
+                launch_tiled_sgemm<32>(A, B, C, m, k, n);
+            },
+            M, K, N);
     }
 }
 
@@ -190,10 +192,8 @@ TEST_F(PerformanceRegressionTest, PeakPerformanceReference) {
     printf("  Theoretical Peak Bandwidth: %.2f GB/s\n", peak_bandwidth_);
 
     // 验证峰值在合理范围内
-    EXPECT_GT(peak_gflops_, 1000.0f) << "Peak GFLOPS seems too low";
-    EXPECT_LT(peak_gflops_, 100000.0f) << "Peak GFLOPS seems too high";
+    EXPECT_GT(peak_gflops_, 100.0f) << "Peak GFLOPS seems too low";
+    EXPECT_LT(peak_gflops_, 200000.0f) << "Peak GFLOPS seems too high";
 }
 
-int main(int argc, char **argv) {
-    return runCudaAwareTests(argc, argv);
-}
+int main(int argc, char **argv) { return runCudaAwareTests(argc, argv); }

From 879e8222686732de722ace5aa11a9870cda97c56 Mon Sep 17 00:00:00 2001
From: shijiashuai <jiashuai.shi@qq.com>
Date: Wed, 1 Jul 2026 18:50:09 +0800
Subject: [PATCH 2/5] fix: CMake include(GoogleTest) and clang-format all test
 files

- Add missing `include(GoogleTest)` before SgemmTestHelpers.cmake to
  resolve `gtest_discover_tests` unknown command error
- Apply clang-format to test_utils.cu, test_sgemm.cu, and
  test_device_info_cuda.cu to fix format check failures

Generated with [Devin](https://devin.ai)

Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com>
---
 CMakeLists.txt                 |  2 ++
 tests/test_device_info_cuda.cu |  4 +---
 tests/test_sgemm.cu            | 10 +++-------
 tests/test_utils.cu            |  6 ++----
 4 files changed, 8 insertions(+), 14 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d8a1f1a..e672b54 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -96,6 +96,8 @@ if(BUILD_TESTS)
     set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
     FetchContent_MakeAvailable(googletest)
 
+    include(GoogleTest)
+
     # Include test helper functions
     include(cmake/SgemmTestHelpers.cmake)
 
diff --git a/tests/test_device_info_cuda.cu b/tests/test_device_info_cuda.cu
index 70fe348..885107c 100644
--- a/tests/test_device_info_cuda.cu
+++ b/tests/test_device_info_cuda.cu
@@ -43,6 +43,4 @@ TEST(DeviceInfoSeam, ProductionAdapterWorks) {
 
 } // namespace
 
-int main(int argc, char **argv) {
-    return runCudaAwareTests(argc, argv);
-}
+int main(int argc, char **argv) { return runCudaAwareTests(argc, argv); }
diff --git a/tests/test_sgemm.cu b/tests/test_sgemm.cu
index a644cd2..ba80869 100644
--- a/tests/test_sgemm.cu
+++ b/tests/test_sgemm.cu
@@ -344,13 +344,9 @@ TEST_F(DimensionInvarianceTest, AllStandardKernelsWorkWithVariousDimensions) {
                    [](const float *A, const float *B, float *C, int m, int k, int n) {
                        launch_bank_conflict_free_sgemm<32>(A, B, C, m, k, n);
                    });
-        testKernel("DoubleBuffer",
-                   [](const float *A, const float *B, float *C, int m, int k, int n) {
-                       launch_double_buffer_sgemm<32>(A, B, C, m, k, n);
-                   });
+        testKernel("DoubleBuffer", [](const float *A, const float *B, float *C, int m, int k,
+                                      int n) { launch_double_buffer_sgemm<32>(A, B, C, m, k, n); });
     }
 }
 
-int main(int argc, char **argv) {
-    return runCudaAwareTests(argc, argv);
-}
+int main(int argc, char **argv) { return runCudaAwareTests(argc, argv); }
diff --git a/tests/test_utils.cu b/tests/test_utils.cu
index 5d688a1..0b18471 100644
--- a/tests/test_utils.cu
+++ b/tests/test_utils.cu
@@ -8,9 +8,9 @@
  * - 容差配置和边界条件
  */
 
+#include <cmath>
 #include <cuda_runtime.h>
 #include <gtest/gtest.h>
-#include <cmath>
 #include <limits>
 #include <memory>
 #include <vector>
@@ -402,6 +402,4 @@ TEST_F(UtilsIntegrationTest, FullWorkflowWithDeviceMemory) {
     EXPECT_TRUE(has_nonzero);
 }
 
-int main(int argc, char **argv) {
-    return runCudaAwareTests(argc, argv);
-}
+int main(int argc, char **argv) { return runCudaAwareTests(argc, argv); }

From 03013cfbeee2d239ed31db523c6a87d99c837a5b Mon Sep 17 00:00:00 2001
From: shijiashuai <jiashuai.shi@qq.com>
Date: Wed, 1 Jul 2026 18:55:23 +0800
Subject: [PATCH 3/5] fix: restore SGEMMVerifier::shouldFlagAsIncorrect static
 method

The refactored verify.cuh removed the static shouldFlagAsIncorrect
method from SGEMMVerifier, breaking test_sgemm.cu which calls
SGEMMVerifier::shouldFlagAsIncorrect(result). Restore it as a
deprecated compatibility method.

Generated with [Devin](https://devin.ai)

Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com>
---
 src/utils/verify.cuh | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/utils/verify.cuh b/src/utils/verify.cuh
index 00512df..3d84686 100644
--- a/src/utils/verify.cuh
+++ b/src/utils/verify.cuh
@@ -270,6 +270,13 @@ class SGEMMVerifier {
      */
     cublasHandle_t getHandle() { return handle_; }
 
+    /**
+     * Legacy compatibility: check if verification result indicates failure.
+     *
+     * @deprecated Use result.passed or result.shouldFlagAsIncorrect() directly.
+     */
+    static bool shouldFlagAsIncorrect(const VerifyResult &result) { return !result.passed; }
+
   private:
     cublasHandle_t handle_;
 };

From eb10bc841bc53413f62bf1277ccd7512656d79d9 Mon Sep 17 00:00:00 2001
From: shijiashuai <jiashuai.shi@qq.com>
Date: Wed, 1 Jul 2026 19:01:52 +0800
Subject: [PATCH 4/5] fix: add missing tensor_core_sgemm.cuh include in
 test_device_info_cuda

The test uses tensorCoresAvailable() which is defined in
tensor_core_sgemm.cuh, but the header was not included.

Generated with [Devin](https://devin.ai)

Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com>
---
 tests/test_device_info_cuda.cu | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/test_device_info_cuda.cu b/tests/test_device_info_cuda.cu
index 885107c..a2da02f 100644
--- a/tests/test_device_info_cuda.cu
+++ b/tests/test_device_info_cuda.cu
@@ -8,6 +8,7 @@
 #include <gtest/gtest.h>
 
 #include "gtest_cuda_environment.cuh"
+#include "kernels/tensor_core_sgemm.cuh"
 #include "utils/benchmark_metrics.cuh"
 #include "utils/cuda_utils.cuh"
 #include "utils/device_info_provider.cuh"

From f45adb0405d78c655be7a4b6e745b591fda2cd32 Mon Sep 17 00:00:00 2001
From: shijiashuai <jiashuai.shi@qq.com>
Date: Wed, 1 Jul 2026 19:05:55 +0800
Subject: [PATCH 5/5] fix: correct AmperePeakGflopsCalculation expected value

The test expected 19481.0 GFLOPS but the correct calculation is
108 * 64 * 2 * 1.41 = 19491.84 GFLOPS. Fix the expected value and
remove the stale * 1000 from the comment.

Generated with [Devin](https://devin.ai)

Co-Authored-By: Devin <158243242+devin-ai-integration[bot]@users.noreply.github.com>
---
 tests/test_device_info_cpu.cu | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/test_device_info_cpu.cu b/tests/test_device_info_cpu.cu
index 640f953..43cbd8d 100644
--- a/tests/test_device_info_cpu.cu
+++ b/tests/test_device_info_cpu.cu
@@ -106,10 +106,10 @@ TEST_F(FakeDeviceProvider, VoltaPeakGflopsCalculation) {
 }
 
 TEST_F(FakeDeviceProvider, AmperePeakGflopsCalculation) {
-    // A100: 108 SMs * 64 cores/SM * 2 (FMA) * 1.41 GHz * 1000
-    // Expected: ~19.481 TFLOPS
+    // A100: 108 SMs * 64 cores/SM * 2 (FMA) * 1.41 GHz
+    // Expected: ~19.492 TFLOPS
     float peak_gflops = getTheoreticalPeakGflops(ampere_provider_);
-    EXPECT_NEAR(peak_gflops, 19481.0f, 0.1f);
+    EXPECT_NEAR(peak_gflops, 19491.84f, 0.1f);
 }
 
 TEST_F(FakeDeviceProvider, VoltaPeakBandwidthCalculation) {