CUDA 多操作重叠实战笔记 + C++示例代码(第八章重点)
一、核心笔记(整合前八章核心)
1. VS Code + CUDA-GDB 调试指南
核心配置步骤
- 前提:已安装 C/C++ Extension Pack、Nsight Visual Studio Code Edition 扩展,且项目以 Debug 模式编译(cmake -DCMAKE_BUILD_TYPE=Debug ..);若要在核函数内部下断点,还需让 nvcc 带 -G 选项生成设备端调试信息。
- 创建调试配置文件(launch.json):
  - 点击 VS Code 左侧「Run and Debug」→「create a launch.json file」;
  - 选择调试器「CUDA C++ (CUDA-GDB)」;
  - 配置核心参数:

{
  "version": "0.2.0",
  "configurations": [
    {
      "name": "CUDA Debug: Vector Add",
      "type": "cuda-gdb",
      "request": "launch",
      "program": "${workspaceFolder}/build/vector_add", // 可执行文件路径
      "stopAtEntry": true, // 入口处暂停
      "cwd": "${workspaceFolder}/build" // 工作目录
    }
  ]
}
-
2. CUDA 流(Streams):重叠操作核心
核心原理
- GPU 通常具备 3 个异步引擎:1 个核函数执行引擎 + 2 个内存传输(拷贝)引擎(Host→Device、Device→Host);拷贝引擎的实际数量因 GPU 型号而异,可通过 cudaDeviceProp::asyncEngineCount 查询;
-
利用多流可实现「Host→Device 传输」「核函数执行」「Device→Host 传输」三者并行,最大化硬件利用率;
-
关键约束:
-
同一流内操作串行执行,不同流间操作并行执行;
- 异步传输需使用分页锁定内存(Pinned Memory,由 cudaMallocHost 分配);普通可分页内存(malloc)仅支持同步传输;
- PCIe 带宽有限(如 PCIe 3.0 x16 理论带宽约 16 GB/s),需合理分块避免带宽瓶颈。
-
核心步骤(向量矩阵乘法示例)
-
内存分配:
  - 主机端:用 cudaMallocHost 分配 Pinned Memory(适配异步传输);
  - 设备端:分配多个数据块缓冲区(如 2 个矩阵块缓冲区 + 2 个结果块缓冲区),用于交替传输和计算。
-
- 创建流:cudaStreamCreate(&stream1)、cudaStreamCreate(&stream2)(非默认流才支持异步并行)。
- 分块处理:
-
将大矩阵拆分为多个小块(如 100 MB/块,适配 PCIe 带宽峰值);
-
每个流执行「Host→Device 传输 → 核函数执行 →Device→Host 传输」的异步流水线。
-
-
流同步:
cudaStreamSynchronize(stream1)、cudaStreamSynchronize(stream2),确保所有操作完成后再使用结果。
二、C++(CUDA)示例代码
示例 1:VS Code 调试演示 - 带 bug 的向量加法
核心功能
故意引入 bug(乘法代替加法),演示 VS Code 调试 CUDA 代码的完整流程,呼应第八章「调试指南」。
#include <cuda_runtime.h>
#include <iostream>
#include <cmath>
using namespace std;
// Aborts the program when a CUDA runtime call fails.
//
// The argument expression is evaluated exactly once and its status is stored,
// so a failing call is not issued a second time while the message is built.
// The do { ... } while (0) wrapper makes the macro expand to a single
// statement, which keeps it safe inside unbraced if/else bodies.
// On failure it prints the CUDA error string and the call-site line number to
// stderr and exits with status 1.
#define CHECK_CUDA_ERR(err)                                                   \
    do {                                                                      \
        cudaError_t checkCudaErrStatus_ = (err);                              \
        if (checkCudaErrStatus_ != cudaSuccess) {                             \
            std::cerr << "CUDA错误:" << cudaGetErrorString(checkCudaErrStatus_) << "(行号:" << __LINE__ << ")" << std::endl; \
            std::exit(1);                                                     \
        }                                                                     \
    } while (0)
// GPU kernel used by the debugging walkthrough; each thread handles one element.
// NOTE: the element-wise product below is the deliberate bug this example is
// built around (the CPU reference computes a sum). Do not "fix" it here.
//   d_a, d_b : input vectors of N ints
//   d_c      : output vector of N ints
//   N        : element count; threads whose index is >= N do nothing
__global__ void vectorAddBuggyGPU(int* d_a, int* d_b, int* d_c, int N) {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid >= N) {
        return;  // thread lies past the end of the vectors
    }
    d_c[tid] = d_a[tid] * d_b[tid];  // intentional bug: should be d_a[tid] + d_b[tid]
}
// Serial CPU vector addition (the correct reference used for verification).
// Writes h_a[k] + h_b[k] into h_c[k] for every k in [0, N).
void vectorAddCPU(int* h_a, int* h_b, int* h_c, int N) {
    int k = 0;
    while (k < N) {
        const int lhs = h_a[k];
        const int rhs = h_b[k];
        h_c[k] = lhs + rhs;
        ++k;
    }
}
// Verifies that the GPU result matches the CPU reference element by element.
//   cpu_res : reference values, N ints
//   gpu_res : values under test, N ints
//   N       : number of elements to compare
// On the first mismatch it reports the index and both values on stderr and
// terminates the process with exit status 1; otherwise it prints a success
// message to stdout.
void checkResult(int* cpu_res, int* gpu_res, int N) {
    for (int i = 0; i < N; ++i) {
        // Integers are compared exactly. Subtracting them first (as a
        // floating-point style tolerance check would) can overflow for
        // large-magnitude values, which is undefined behavior.
        if (cpu_res[i] != gpu_res[i]) {
            std::cerr << "结果不匹配!索引 " << i << ":CPU=" << cpu_res[i] << ",GPU=" << gpu_res[i] << std::endl;
            std::exit(1);
        }
    }
    std::cout << "结果验证通过!" << std::endl;
}
// Host driver for the debugging walkthrough.
// Builds two small input vectors, computes the reference sum on the CPU, runs
// the (deliberately buggy) GPU kernel on the same inputs and compares the two
// results. The comparison is expected to fail, which is the starting point of
// the debugging session.
int main() {
    const int N = 130;                                         // vector length
    const dim3 BLOCK_DIM(128);                                 // threads per block
    const dim3 GRID_DIM((N + BLOCK_DIM.x - 1) / BLOCK_DIM.x);  // ceil(N / block size) -> 2 blocks
    const size_t bytes = N * sizeof(int);                      // size of one vector in bytes

    // Host buffers: two inputs, the CPU reference and the GPU output.
    int* h_a = new int[N];
    int* h_b = new int[N];
    int* h_c_cpu = new int[N];
    int* h_c_gpu = new int[N];

    // h_a counts up 1..N while h_b counts down N..1.
    for (int k = 0; k < N; ++k) {
        h_a[k] = k + 1;
        h_b[k] = N - k;
    }

    // Reference result computed on the CPU.
    vectorAddCPU(h_a, h_b, h_c_cpu, N);

    // Device buffers.
    int* d_a;
    int* d_b;
    int* d_c;
    CHECK_CUDA_ERR(cudaMalloc((void**)&d_a, bytes));
    CHECK_CUDA_ERR(cudaMalloc((void**)&d_b, bytes));
    CHECK_CUDA_ERR(cudaMalloc((void**)&d_c, bytes));

    // Upload both inputs (host -> device).
    CHECK_CUDA_ERR(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice));
    CHECK_CUDA_ERR(cudaMemcpy(d_b, h_b, bytes, cudaMemcpyHostToDevice));

    // Launch the buggy kernel and pick up any launch-configuration error.
    vectorAddBuggyGPU<<<GRID_DIM, BLOCK_DIM>>>(d_a, d_b, d_c, N);
    CHECK_CUDA_ERR(cudaGetLastError());

    // Download the result (device -> host).
    CHECK_CUDA_ERR(cudaMemcpy(h_c_gpu, d_c, bytes, cudaMemcpyDeviceToHost));

    // Compare against the reference; with the buggy kernel this reports a
    // mismatch and exits, so the cleanup below is not reached in that case.
    checkResult(h_c_cpu, h_c_gpu, N);

    // Release host and device resources.
    delete[] h_a;
    delete[] h_b;
    delete[] h_c_cpu;
    delete[] h_c_gpu;
    CHECK_CUDA_ERR(cudaFree(d_a));
    CHECK_CUDA_ERR(cudaFree(d_b));
    CHECK_CUDA_ERR(cudaFree(d_c));
    return 0;
}
示例 2:CUDA 流重叠操作 - 向量矩阵乘法
核心功能
使用两个 CUDA 流重叠内存传输和核函数执行,提升吞吐量,呼应第八章「流重叠」核心知识点。
#include <cuda_runtime.h>
#include <iostream>
#include <chrono>
#include <cstdlib>
using namespace std;
using namespace chrono;
// Aborts the program when a CUDA runtime call fails.
//
// The argument expression is evaluated exactly once and its status is stored,
// so a failing call is not issued a second time while the message is built.
// The do { ... } while (0) wrapper makes the macro expand to a single
// statement, which keeps it safe inside unbraced if/else bodies.
// On failure it prints the CUDA error string and the call-site line number to
// stderr and exits with status 1.
#define CHECK_CUDA_ERR(err)                                                   \
    do {                                                                      \
        cudaError_t checkCudaErrStatus_ = (err);                              \
        if (checkCudaErrStatus_ != cudaSuccess) {                             \
            std::cerr << "CUDA错误:" << cudaGetErrorString(checkCudaErrStatus_) << "(行号:" << __LINE__ << ")" << std::endl; \
            std::exit(1);                                                     \
        }                                                                     \
    } while (0)
// Compile-time problem dimensions for the stream-overlap example.
constexpr int BLOCK_SIZE  = 256;    // threads per block
constexpr int MATRIX_ROWS = 16380;  // matrix rows (16380 x 16380 floats is roughly 1 GB)
constexpr int MATRIX_COLS = 16380;  // matrix columns
constexpr int CHUNK_SIZE  = 1638;   // rows per transferred chunk (roughly 100 MB of floats)
// GPU kernel: matrix-vector product, one thread per matrix row.
//   d_vec : input vector, cols floats
//   d_mat : input matrix, rows x cols floats, indexed row-major
//   d_res : output vector, rows floats; d_res[r] = dot(d_mat[r, :], d_vec)
//   rows  : number of rows to process; threads with index >= rows do nothing
//   cols  : number of columns (length of each dot product)
// Uses only the x dimension of the grid and block, so it must be launched with
// at least `rows` threads in total along x.
__global__ void vectorMatrixMulKernel(float* d_vec, float* d_mat, float* d_res, int rows, int cols) {
    const int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= rows) {
        return;
    }
    // The row offset is computed in 64 bits: row * cols would overflow a
    // 32-bit int once the matrix holds more than 2^31 elements.
    const float* rowPtr = d_mat + (size_t)row * cols;
    float sum = 0.0f;
    for (int col = 0; col < cols; ++col) {
        sum += rowPtr[col] * d_vec[col];
    }
    d_res[row] = sum;
}
// Serial CPU matrix-vector product (baseline and reference result).
//   h_vec : input vector, cols floats
//   h_mat : input matrix, rows x cols floats, indexed row-major
//   h_res : output vector, rows floats; h_res[r] = dot(h_mat[r, :], h_vec)
// Each dot product is accumulated in float, left to right.
void vectorMatrixMulCPU(float* h_vec, float* h_mat, float* h_res, int rows, int cols) {
    for (int row = 0; row < rows; ++row) {
        // 64-bit row offset: row * cols would overflow a 32-bit int once the
        // matrix holds more than 2^31 elements.
        const float* rowPtr = h_mat + (size_t)row * cols;
        float sum = 0.0f;
        for (int col = 0; col < cols; ++col) {
            sum += rowPtr[col] * h_vec[col];
        }
        h_res[row] = sum;
    }
}
// Stream-overlap example driver.
// Computes a matrix-vector product once on the CPU (baseline) and once on the
// GPU, where the matrix is uploaded in row chunks that alternate between two
// CUDA streams so that copies and kernels from different chunks can overlap.
// Prints both timings, a pass/fail verdict and the speed-up.
int main() {
    // Sizes are computed in size_t so the element/byte products cannot
    // overflow int. The vector has one entry per matrix COLUMN because every
    // row is dotted with it.
    const size_t matElems      = (size_t)MATRIX_ROWS * MATRIX_COLS;
    const size_t vecBytes      = (size_t)MATRIX_COLS * sizeof(float);
    const size_t resBytes      = (size_t)MATRIX_ROWS * sizeof(float);
    const size_t chunkMatBytes = (size_t)CHUNK_SIZE * MATRIX_COLS * sizeof(float);
    const size_t chunkResBytes = (size_t)CHUNK_SIZE * sizeof(float);

    // 1. Host buffers in pinned memory (required for asynchronous copies).
    float *h_vec, *h_mat, *h_res_cpu, *h_res_gpu;
    CHECK_CUDA_ERR(cudaMallocHost(&h_vec, vecBytes));
    CHECK_CUDA_ERR(cudaMallocHost(&h_mat, matElems * sizeof(float)));
    CHECK_CUDA_ERR(cudaMallocHost(&h_res_cpu, resBytes));
    CHECK_CUDA_ERR(cudaMallocHost(&h_res_gpu, resBytes));

    // 2. Fill the inputs with pseudo-random values in [0, 1].
    srand(time(0));
    for (int i = 0; i < MATRIX_COLS; ++i) {
        h_vec[i] = rand() / (float)RAND_MAX;
    }
    for (size_t i = 0; i < matElems; ++i) {
        h_mat[i] = rand() / (float)RAND_MAX;
    }

    // 3. CPU baseline.
    auto cpu_start = std::chrono::high_resolution_clock::now();
    vectorMatrixMulCPU(h_vec, h_mat, h_res_cpu, MATRIX_ROWS, MATRIX_COLS);
    auto cpu_time = std::chrono::duration_cast<std::chrono::milliseconds>(
                        std::chrono::high_resolution_clock::now() - cpu_start).count();
    std::cout << "CPU串行计算耗时:" << cpu_time << " ms" << std::endl;

    // 4. GPU computation with two streams.
    // One matrix-chunk buffer and one result-chunk buffer per stream; the
    // vector is shared by both streams.
    const int NUM_STREAMS = 2;
    float* d_vec;
    float* d_mat[NUM_STREAMS];
    float* d_res[NUM_STREAMS];
    cudaStream_t streams[NUM_STREAMS];
    CHECK_CUDA_ERR(cudaMalloc((void**)&d_vec, vecBytes));
    for (int s = 0; s < NUM_STREAMS; ++s) {
        CHECK_CUDA_ERR(cudaMalloc((void**)&d_mat[s], chunkMatBytes));
        CHECK_CUDA_ERR(cudaMalloc((void**)&d_res[s], chunkResBytes));
        CHECK_CUDA_ERR(cudaStreamCreate(&streams[s]));
    }

    // GPU timing events.
    cudaEvent_t gpu_start, gpu_stop;
    CHECK_CUDA_ERR(cudaEventCreate(&gpu_start));
    CHECK_CUDA_ERR(cudaEventCreate(&gpu_stop));
    CHECK_CUDA_ERR(cudaEventRecord(gpu_start, 0));

    // Upload the vector once with a blocking copy, so it is resident before
    // any kernel is enqueued.
    CHECK_CUDA_ERR(cudaMemcpy(d_vec, h_vec, vecBytes, cudaMemcpyHostToDevice));

    // Walk the matrix chunk by chunk, alternating between the streams.
    // Reusing a stream's buffers for a later chunk is safe because work inside
    // one stream executes in issue order. The last chunk is clamped so that a
    // row count that is not a multiple of CHUNK_SIZE never reads or writes
    // past the end of the host buffers.
    int chunkIndex = 0;
    for (int rowStart = 0; rowStart < MATRIX_ROWS; rowStart += CHUNK_SIZE, ++chunkIndex) {
        const int slot = chunkIndex % NUM_STREAMS;
        const int rowsLeft = MATRIX_ROWS - rowStart;
        const int rowsHere = (rowsLeft < CHUNK_SIZE) ? rowsLeft : CHUNK_SIZE;
        const int blocksHere = (rowsHere + BLOCK_SIZE - 1) / BLOCK_SIZE;

        CHECK_CUDA_ERR(cudaMemcpyAsync(d_mat[slot], h_mat + (size_t)rowStart * MATRIX_COLS,
                                       (size_t)rowsHere * MATRIX_COLS * sizeof(float),
                                       cudaMemcpyHostToDevice, streams[slot]));
        vectorMatrixMulKernel<<<blocksHere, BLOCK_SIZE, 0, streams[slot]>>>(
            d_vec, d_mat[slot], d_res[slot], rowsHere, MATRIX_COLS);
        CHECK_CUDA_ERR(cudaGetLastError());
        CHECK_CUDA_ERR(cudaMemcpyAsync(h_res_gpu + rowStart, d_res[slot],
                                       (size_t)rowsHere * sizeof(float),
                                       cudaMemcpyDeviceToHost, streams[slot]));
    }

    // Wait until every stream has drained before reading the results.
    for (int s = 0; s < NUM_STREAMS; ++s) {
        CHECK_CUDA_ERR(cudaStreamSynchronize(streams[s]));
    }

    // Stop the GPU timer.
    CHECK_CUDA_ERR(cudaEventRecord(gpu_stop, 0));
    CHECK_CUDA_ERR(cudaEventSynchronize(gpu_stop));
    float gpu_time_ms;
    CHECK_CUDA_ERR(cudaEventElapsedTime(&gpu_time_ms, gpu_start, gpu_stop));

    // 5. Validate every row, so all chunks from both streams are covered.
    // A mixed absolute/relative tolerance is used because each result is a
    // float sum of MATRIX_COLS products and CPU/GPU rounding differs slightly.
    // The absolute value is taken by hand so no integer abs() overload can be
    // picked by accident; the negated comparison also rejects NaN.
    bool valid = true;
    for (int i = 0; i < MATRIX_ROWS; ++i) {
        const float ref = h_res_cpu[i];
        float diff = ref - h_res_gpu[i];
        if (diff < 0.0f) {
            diff = -diff;
        }
        const float magnitude = (ref < 0.0f) ? -ref : ref;
        if (!(diff <= 1e-3f + 1e-4f * magnitude)) {
            valid = false;
            break;
        }
    }

    // Report.
    std::cout << "GPU流重叠计算耗时:" << gpu_time_ms << " ms" << std::endl;
    std::cout << "结果验证:" << (valid ? "正确" : "错误") << std::endl;
    std::cout << "GPU加速比(相对CPU):" << (double)cpu_time / gpu_time_ms << "x" << std::endl;

    // 6. Release resources.
    CHECK_CUDA_ERR(cudaFree(d_vec));
    for (int s = 0; s < NUM_STREAMS; ++s) {
        CHECK_CUDA_ERR(cudaFree(d_mat[s]));
        CHECK_CUDA_ERR(cudaFree(d_res[s]));
        CHECK_CUDA_ERR(cudaStreamDestroy(streams[s]));
    }
    CHECK_CUDA_ERR(cudaEventDestroy(gpu_start));
    CHECK_CUDA_ERR(cudaEventDestroy(gpu_stop));
    CHECK_CUDA_ERR(cudaFreeHost(h_vec));
    CHECK_CUDA_ERR(cudaFreeHost(h_mat));
    CHECK_CUDA_ERR(cudaFreeHost(h_res_cpu));
    CHECK_CUDA_ERR(cudaFreeHost(h_res_gpu));
    return 0;
}
示例 3:多 GPU 协同 - 向量矩阵乘法
核心功能
使用两个 GPU 并行处理矩阵分片,演示多 GPU 协同编程,呼应第八章「多 GPU 协同」知识点。
#include <cuda_runtime.h>
#include <iostream>
#include <chrono>
#include <cstdlib>
using namespace std;
using namespace chrono;
// Aborts the program when a CUDA runtime call fails.
//
// The argument expression is evaluated exactly once and its status is stored,
// so a failing call is not issued a second time while the message is built.
// The do { ... } while (0) wrapper makes the macro expand to a single
// statement, which keeps it safe inside unbraced if/else bodies.
// On failure it prints the CUDA error string and the call-site line number to
// stderr and exits with status 1.
#define CHECK_CUDA_ERR(err)                                                   \
    do {                                                                      \
        cudaError_t checkCudaErrStatus_ = (err);                              \
        if (checkCudaErrStatus_ != cudaSuccess) {                             \
            std::cerr << "CUDA错误:" << cudaGetErrorString(checkCudaErrStatus_) << "(行号:" << __LINE__ << ")" << std::endl; \
            std::exit(1);                                                     \
        }                                                                     \
    } while (0)
// Compile-time problem dimensions for the multi-GPU example.
constexpr int BLOCK_SIZE  = 256;              // threads per block
constexpr int MATRIX_ROWS = 16380;            // matrix rows
constexpr int MATRIX_COLS = 16380;            // matrix columns
constexpr int CHUNK_SIZE  = MATRIX_ROWS / 2;  // rows handled by each of the two GPUs
// GPU kernel: matrix-vector product, one thread per matrix row.
//   d_vec : input vector, cols floats
//   d_mat : input matrix, rows x cols floats, indexed row-major
//   d_res : output vector, rows floats; d_res[r] = dot(d_mat[r, :], d_vec)
//   rows  : number of rows to process; threads with index >= rows do nothing
//   cols  : number of columns (length of each dot product)
// Uses only the x dimension of the grid and block, so it must be launched with
// at least `rows` threads in total along x.
__global__ void vectorMatrixMulKernel(float* d_vec, float* d_mat, float* d_res, int rows, int cols) {
    const int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= rows) {
        return;
    }
    // The row offset is computed in 64 bits: row * cols would overflow a
    // 32-bit int once the matrix holds more than 2^31 elements.
    const float* rowPtr = d_mat + (size_t)row * cols;
    float sum = 0.0f;
    for (int col = 0; col < cols; ++col) {
        sum += rowPtr[col] * d_vec[col];
    }
    d_res[row] = sum;
}
// Multi-GPU example driver.
// Splits the matrix by rows across the first two GPUs, gives each GPU a full
// copy of the vector, runs the matrix-vector kernel on both devices and merges
// the partial results back into one host vector. Prints the elapsed time of
// the kernel + download phase and a NaN-based sanity verdict.
int main() {
    const int NUM_GPUS = 2;

    // 1. Make sure enough GPUs are present.
    int deviceCount;
    CHECK_CUDA_ERR(cudaGetDeviceCount(&deviceCount));
    if (deviceCount < NUM_GPUS) {
        std::cerr << "错误:系统GPU数量不足2个,无法运行多GPU示例!" << std::endl;
        std::exit(1);
    }
    std::cout << "系统GPU数量:" << deviceCount << ",使用前2个GPU协同计算" << std::endl;

    // Sizes are computed in size_t so the element/byte products cannot
    // overflow int. The vector has one entry per matrix COLUMN because every
    // row is dotted with it.
    const size_t matElems = (size_t)MATRIX_ROWS * MATRIX_COLS;
    const size_t vecBytes = (size_t)MATRIX_COLS * sizeof(float);
    const size_t resBytes = (size_t)MATRIX_ROWS * sizeof(float);

    // 2. Host buffers in pinned memory.
    float *h_vec, *h_mat, *h_res;
    CHECK_CUDA_ERR(cudaMallocHost(&h_vec, vecBytes));
    CHECK_CUDA_ERR(cudaMallocHost(&h_mat, matElems * sizeof(float)));
    CHECK_CUDA_ERR(cudaMallocHost(&h_res, resBytes));

    // 3. Fill the inputs with pseudo-random values in [0, 1].
    srand(time(0));
    for (int i = 0; i < MATRIX_COLS; ++i) {
        h_vec[i] = rand() / (float)RAND_MAX;
    }
    for (size_t i = 0; i < matElems; ++i) {
        h_mat[i] = rand() / (float)RAND_MAX;
    }

    // 4. Per-device row ranges and buffers.
    // Every device gets CHUNK_SIZE rows; the last device also takes any
    // remainder so an odd MATRIX_ROWS does not silently drop a row.
    float *d_vec[NUM_GPUS], *d_mat[NUM_GPUS], *d_res[NUM_GPUS];
    int rowStart[NUM_GPUS], rowCount[NUM_GPUS];
    for (int dev = 0; dev < NUM_GPUS; ++dev) {
        rowStart[dev] = dev * CHUNK_SIZE;
        rowCount[dev] = (dev == NUM_GPUS - 1) ? (MATRIX_ROWS - rowStart[dev]) : CHUNK_SIZE;
    }

    for (int dev = 0; dev < NUM_GPUS; ++dev) {
        CHECK_CUDA_ERR(cudaSetDevice(dev));  // make device `dev` current
        const size_t sliceBytes = (size_t)rowCount[dev] * MATRIX_COLS * sizeof(float);

        // Allocate this device's buffers.
        CHECK_CUDA_ERR(cudaMalloc((void**)&d_vec[dev], vecBytes));
        CHECK_CUDA_ERR(cudaMalloc((void**)&d_mat[dev], sliceBytes));
        CHECK_CUDA_ERR(cudaMalloc((void**)&d_res[dev], (size_t)rowCount[dev] * sizeof(float)));

        // Upload the whole vector and this device's slice of the matrix.
        CHECK_CUDA_ERR(cudaMemcpy(d_vec[dev], h_vec, vecBytes, cudaMemcpyHostToDevice));
        CHECK_CUDA_ERR(cudaMemcpy(d_mat[dev], h_mat + (size_t)rowStart[dev] * MATRIX_COLS,
                                  sliceBytes, cudaMemcpyHostToDevice));
    }

    // 5. Launch the kernel on every GPU.
    // An event is tied to the device that is current when it is created, and
    // it must be recorded while that same device is current. Device 0 is
    // selected explicitly so the timing code does not depend on which device
    // an earlier loop happened to leave selected.
    cudaEvent_t gpu_start, gpu_stop;
    CHECK_CUDA_ERR(cudaSetDevice(0));
    CHECK_CUDA_ERR(cudaEventCreate(&gpu_start));
    CHECK_CUDA_ERR(cudaEventCreate(&gpu_stop));
    CHECK_CUDA_ERR(cudaEventRecord(gpu_start, 0));

    for (int dev = 0; dev < NUM_GPUS; ++dev) {
        CHECK_CUDA_ERR(cudaSetDevice(dev));
        const int blocks = (rowCount[dev] + BLOCK_SIZE - 1) / BLOCK_SIZE;
        vectorMatrixMulKernel<<<blocks, BLOCK_SIZE>>>(d_vec[dev], d_mat[dev], d_res[dev],
                                                      rowCount[dev], MATRIX_COLS);
        CHECK_CUDA_ERR(cudaGetLastError());
    }

    // 6. Download each partial result into its slot of the merged vector.
    // These blocking copies also wait for the corresponding kernel to finish.
    for (int dev = 0; dev < NUM_GPUS; ++dev) {
        CHECK_CUDA_ERR(cudaSetDevice(dev));
        CHECK_CUDA_ERR(cudaMemcpy(h_res + rowStart[dev], d_res[dev],
                                  (size_t)rowCount[dev] * sizeof(float), cudaMemcpyDeviceToHost));
    }

    // Stop the timer on the device that owns the events.
    CHECK_CUDA_ERR(cudaSetDevice(0));
    CHECK_CUDA_ERR(cudaEventRecord(gpu_stop, 0));
    CHECK_CUDA_ERR(cudaEventSynchronize(gpu_stop));
    float gpu_time_ms;
    CHECK_CUDA_ERR(cudaEventElapsedTime(&gpu_time_ms, gpu_start, gpu_stop));

    // 7. Sanity check (simplified: only looks for NaN). Every row is scanned
    // so that the halves produced by both GPUs are covered.
    bool valid = true;
    for (int i = 0; i < MATRIX_ROWS; ++i) {
        if (isnan(h_res[i])) {
            valid = false;
            break;
        }
    }

    // Report.
    std::cout << "多GPU协同计算耗时:" << gpu_time_ms << " ms" << std::endl;
    std::cout << "结果验证:" << (valid ? "有效" : "无效") << std::endl;

    // 8. Release resources; device buffers are freed with their device current.
    for (int dev = 0; dev < NUM_GPUS; ++dev) {
        CHECK_CUDA_ERR(cudaSetDevice(dev));
        CHECK_CUDA_ERR(cudaFree(d_vec[dev]));
        CHECK_CUDA_ERR(cudaFree(d_mat[dev]));
        CHECK_CUDA_ERR(cudaFree(d_res[dev]));
    }
    CHECK_CUDA_ERR(cudaSetDevice(0));
    CHECK_CUDA_ERR(cudaEventDestroy(gpu_start));
    CHECK_CUDA_ERR(cudaEventDestroy(gpu_stop));
    CHECK_CUDA_ERR(cudaFreeHost(h_vec));
    CHECK_CUDA_ERR(cudaFreeHost(h_mat));
    CHECK_CUDA_ERR(cudaFreeHost(h_res));
    return 0;
}
三、关键说明
1. 代码与知识点关联
-
示例 1(调试演示):核心展示 VS Code + CUDA-GDB 的调试流程,包含断点、变量观察、bug 定位,对应第八章「调试指南」;
-
示例 2(CUDA 流重叠):完整实现流的创建、异步传输、分块处理、流同步,解决第八章强调的「传输与计算重叠」核心需求;
-
示例 3(多 GPU 协同):演示
cudaSetDevice设备切换、数据分片、并行执行、结果合并,对应第八章「多 GPU 编程」核心步骤。