Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# Continuous Integration
# Validates code formatting and CUDA compilation of all targets (benchmark + tests)
# Validates code formatting, CUDA compilation, and CPU-only test execution
#
# Workflow separation:
# - This workflow: formatting + CUDA compile-time checks (no GPU runtime)
# - This workflow: formatting + CUDA compile-time checks + CPU tests (no GPU runtime)
# - pages.yml: docs tests/build and GitHub Pages buildability
name: CI

Expand Down Expand Up @@ -58,7 +58,11 @@ jobs:
- name: Build
run: cmake --build build -j2

- name: Run CPU tests
run: ctest --test-dir build -L cpu --output-on-failure

- name: Info
run: |
echo "✅ CUDA compilation successful for all targets (benchmark + tests)"
echo "✅ CPU-only tests passed"
echo "ℹ️ GPU runtime tests require a CUDA-capable machine"
128 changes: 54 additions & 74 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -96,91 +96,71 @@ if(BUILD_TESTS)
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
FetchContent_MakeAvailable(googletest)

add_executable(test_sgemm tests/test_sgemm.cu)
target_include_directories(test_sgemm PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src)
target_compile_definitions(test_sgemm PRIVATE SGEMM_HAS_WMMA_TARGET=${SGEMM_HAS_WMMA_TARGET})
target_link_options(test_sgemm PRIVATE -L${SGEMM_CUDA_LIBRARY_DIR})
target_link_libraries(test_sgemm PRIVATE
GTest::gtest_main
CUDA::cudart
CUDA::cublas
CUDA::curand
)
target_compile_options(test_sgemm PRIVATE
$<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>
)
include(GoogleTest)

# 工具层测试
add_executable(test_utils tests/test_utils.cu)
target_include_directories(test_utils PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src)
target_compile_definitions(test_utils PRIVATE SGEMM_HAS_WMMA_TARGET=${SGEMM_HAS_WMMA_TARGET})
target_link_options(test_utils PRIVATE -L${SGEMM_CUDA_LIBRARY_DIR})
target_link_libraries(test_utils PRIVATE
GTest::gtest_main
CUDA::cudart
CUDA::cublas
CUDA::curand
)
target_compile_options(test_utils PRIVATE
$<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>
)
# Include test helper functions
include(cmake/SgemmTestHelpers.cmake)

# 性能回归测试
add_executable(test_performance tests/test_performance.cu)
target_include_directories(test_performance PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src)
target_compile_definitions(test_performance PRIVATE SGEMM_HAS_WMMA_TARGET=${SGEMM_HAS_WMMA_TARGET})
target_link_options(test_performance PRIVATE -L${SGEMM_CUDA_LIBRARY_DIR})
target_link_libraries(test_performance PRIVATE
GTest::gtest_main
CUDA::cudart
CUDA::cublas
CUDA::curand
)
target_compile_options(test_performance PRIVATE
$<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>
# ═══════════════════════════════════════════════════════════════
# CPU-only Tests (no CUDA device required at runtime)
# ═══════════════════════════════════════════════════════════════

# Benchmark settings module test - pure C++, no CUDA dependencies
sgemm_add_cpu_test(
NAME test_benchmark_settings
SOURCES tests/test_benchmark_settings.cpp
)

# Benchmark 设置模块测试
add_executable(test_benchmark_settings tests/test_benchmark_settings.cu)
target_include_directories(test_benchmark_settings PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src)
target_link_libraries(test_benchmark_settings PRIVATE
GTest::gtest_main
CUDA::cudart
CUDA::cublas
# Device info provider CPU tests - uses fake device properties
# Compiled as .cu because it includes kernel headers with CUDA launch syntax
sgemm_add_cpu_test(
NAME test_device_info_cpu
SOURCES tests/test_device_info_cpu.cu
)
target_compile_options(test_benchmark_settings PRIVATE
$<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>

# ═══════════════════════════════════════════════════════════════
# CUDA Tests (requires CUDA device, skipped if unavailable)
# ═══════════════════════════════════════════════════════════════

# Kernel correctness tests with property-based testing
sgemm_add_cuda_test(
NAME test_sgemm
SOURCES tests/test_sgemm.cu
CUDA_LIBRARIES CUDA::curand
REQUIRES_WMMA
)

# Kernel catalog module test
add_executable(test_kernel_catalog tests/test_kernel_catalog.cu)
target_include_directories(test_kernel_catalog PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src)
target_link_libraries(test_kernel_catalog PRIVATE
GTest::gtest_main
CUDA::cudart
CUDA::cublas
# Utility layer tests (DeviceMemory, verifier RAII, etc.)
sgemm_add_cuda_test(
NAME test_utils
SOURCES tests/test_utils.cu
CUDA_LIBRARIES CUDA::curand
REQUIRES_WMMA
)
target_compile_options(test_kernel_catalog PRIVATE
$<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>

# Kernel catalog module test - requires device memory and kernel launch
sgemm_add_cuda_test(
NAME test_kernel_catalog
SOURCES tests/test_kernel_catalog.cu
REQUIRES_WMMA
)

# Device info seam test
add_executable(test_device_info_seam tests/test_device_info_seam.cu)
target_include_directories(test_device_info_seam PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/src)
target_link_libraries(test_device_info_seam PRIVATE
GTest::gtest_main
CUDA::cudart
CUDA::cublas
# Device info provider CUDA tests - requires real GPU for production adapter
sgemm_add_cuda_test(
NAME test_device_info_cuda
SOURCES tests/test_device_info_cuda.cu
)
target_compile_options(test_device_info_seam PRIVATE
$<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>

# ═══════════════════════════════════════════════════════════════
# Performance Tests (CUDA + performance label)
# ═══════════════════════════════════════════════════════════════

# Performance regression tests
sgemm_add_cuda_perf_test(
NAME test_performance
SOURCES tests/test_performance.cu
CUDA_LIBRARIES CUDA::curand
REQUIRES_WMMA
)

include(GoogleTest)
gtest_discover_tests(test_sgemm)
gtest_discover_tests(test_utils)
gtest_discover_tests(test_performance)
gtest_discover_tests(test_benchmark_settings)
gtest_discover_tests(test_kernel_catalog)
gtest_discover_tests(test_device_info_seam)
endif()
90 changes: 80 additions & 10 deletions CONTEXT.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,28 @@

## 核心模块

### Kernel Catalog 模块

**位置**: `src/kernels/kernel_catalog.cuh`

**权威元数据源** - 内核阶梯的唯一事实来源:

- **KernelCatalogEntry**: 完整的内核元数据
- `name`: 显示名称
- `type`: KernelType::Standard 或 KernelType::TensorCore
- `launcher`: 启动适配器
- `constraints`: 运行时约束(Tensor Core 要求、维度对齐)
- **KernelConstraints**: 运行时约束描述
- `requires_tensor_cores`: 是否需要 sm_70+
- `dimension_alignment`: 维度对齐要求(0 = 无约束)
- `requires_compute_only`: 是否使用特殊 benchmark 接口
- **查询工具**: `countKernelsByType()`, `getKernelNames()`, `canRunTensorCoreKernels()`

**设计原则**:
- **单一事实源**: 新增内核只需添加一个 catalog 条目
- **自描述约束**: 每个 entry 知道自己能否在给定条件下运行
- **统一调度**: BenchmarkRunner 通过 catalog 迭代,无特殊分支

### Tensor Core 模块

**位置**: `src/kernels/tensor_core_sgemm.cuh`
Expand All @@ -24,6 +46,7 @@
**位置**: `src/kernels/tensor_core_benchmark.cuh`

Tensor Core 特有的 benchmark 功能,提供:
- `canRunTensorCoreComputeOnly()` - 约束检查(与 KernelCatalog 语义一致)
- `runTensorCoreComputeOnlyBenchmark()` - 纯计算路径性能测试

**接口设计**:只接受 `cublasHandle_t`,不依赖整个 `SGEMMBenchmark` 类,避免内核层对工具层的上穿依赖。
Expand All @@ -32,19 +55,42 @@ Tensor Core 特有的 benchmark 功能,提供:

**位置**: `src/utils/verify.cuh`

统一的验证逻辑:
- `detail::compareMatricesImpl()` - 内部实现,供其他函数共享
- `compareMatrices()` - 独立的矩阵比较函数
- `SGEMMVerifier` - 带 cuBLAS 句柄的验证器类
**统一的验证策略** - reference + comparison + tolerance policy:

- **VerifyResult**: 验证结果结构(pass/fail、错误指标)
- **VerifyTolerance**: 容差规范(numpy-style allclose 语义)
- `kStandardVerifyTolerance`: FP32 标准容差
- `kTensorCoreVerifyTolerance`: Tensor Core 宽松容差
- **比较函数**:
- `compareMatrices()`: Host 指针比较
- `compareDeviceMatrices()`: Device 指针比较
- **SGEMMVerifier**: cuBLAS 参考计算适配器
- `computeReference()`: 计算参考结果
- `verify()`, `verifyDevice()`: 验证内核输出

**设计原则**:
- **单一验证策略**: 所有内核共享同一套容差语义
- **分离关注点**: 参考计算 vs 比较逻辑
- **可扩展**: 未来可添加其他参考适配器

## Benchmark 模块

项目将 Benchmark 功能拆分为三个深度模块,每个模块有独立的职责:

### Benchmark Settings
**位置**: `src/utils/benchmark_settings.cuh`

配置集中化:
- `RunSettings`: 预热次数、测量次数
- `VerificationSettings`: 容差配置
- `OutputSettings`: Roofline 导出选项
- `BenchmarkSettings`: 聚合配置

### Benchmark Core
**位置**: `src/utils/benchmark_core.cuh`

核心性能测量:
- `BenchmarkResult`: 结果结构和报告生成
- `CudaTimer` - RAII 包装的 CUDA 事件计时器
- `measureGpuTime()` - 通用的 GPU 操作性能测量器

Expand All @@ -62,14 +108,20 @@ Tensor Core 特有的 benchmark 功能,提供:

聚合模块并提供:
- `SGEMMBenchmark` - 高级 benchmark 编排器
- `BenchmarkResult` - 结果结构和报告生成

## 测试架构

### 测试分层

项目采用分层测试策略,确保每个层级都有独立的测试面:

#### CPU-only 测试
**位置**: `tests/test_benchmark_settings.cpp`, `tests/test_device_info_cpu.cu`

纯 CPU 测试,运行时不需要 CUDA 设备(`test_device_info_cpu.cu` 因包含带 CUDA 启动语法的内核头文件而按 `.cu` 编译):
- 设置模块单元测试
- 设备信息 Seam 测试(使用 fake provider)

#### 内核层测试
**位置**: `tests/test_sgemm.cu`

Expand All @@ -78,6 +130,14 @@ Tensor Core 特有的 benchmark 功能,提供:
- Tensor Core 快速路径和 fallback 测试
- 边界测试和维度不变性测试

#### Kernel Catalog 测试
**位置**: `tests/test_kernel_catalog.cu`

测试内核目录的元数据和约束:
- Catalog 包含预期的内核
- 条目有有效的元数据(名称、启动器、约束)
- 约束检查正确工作

#### 工具层测试
**位置**: `tests/test_utils.cu`

Expand All @@ -88,8 +148,6 @@ Tensor Core 特有的 benchmark 功能,提供:
- `VerifyTolerance` - 容差配置和边界条件
- NaN/Inf 处理、异常安全性

**设计原则**:工具层测试独立于内核测试,可以单独捕获工具类 bug。

#### 性能回归测试
**位置**: `tests/test_performance.cu`

Expand All @@ -105,7 +163,19 @@ Tensor Core 特有的 benchmark 功能,提供:
- Double-Buffer: 35% 峰值
- Tensor Core: 50% 峰值(当可用时)

**设计原则**:性能测试独立于正确性测试,可在 CI 中检测重大性能退化。
### 测试分类标签

项目使用 CTest labels 区分测试类型:
- `cpu`: CPU-only 测试,不需要 CUDA 设备
- `cuda`: 需要 CUDA 设备的测试,无 GPU 时跳过
- `performance`: 性能回归测试

**运行命令**:
```bash
ctest -L cpu # 只运行 CPU 测试
ctest -L cuda # 只运行 CUDA 测试
ctest -L performance # 只运行性能测试
```

## 架构原则

Expand All @@ -115,8 +185,8 @@ Tensor Core 特有的 benchmark 功能,提供:
- `main.cu` - 入口点,仅负责组装
- `cli_parser.cuh` - 命令行解析、配置构造
- `benchmark_runner.cuh` - 内核调度、结果聚合
2. **内核层** (`src/kernels/`) - 5 个内核实现 + Tensor Core 专用模块
3. **工具层** (`src/utils/`) - RAII 内存管理、错误处理、验证辅助
2. **内核层** (`src/kernels/`) - 5 个内核实现 + Kernel Catalog + Tensor Core 专用模块
3. **工具层** (`src/utils/`) - RAII 内存管理、错误处理、验证辅助、设置模块

### 依赖方向

Expand Down
Loading
Loading