当前位置：首页 > news >正文

CUDA编程：Shared Memory Bank Conflict 与 Padding 优化

news 2026/5/27 23:59:16

文章目录1. 核心原理Shared Memory 为什么也会冲突2. 用数组理解 bank 映射3. 为什么矩阵转置容易出现 bank conflict4. 为什么 tile[32][33] 能解决问题5. 实验矩阵转置 no-padding vs padding6. 答疑问题 1为什么两个 kernel 结果都正确但速度不同问题 2为什么 tile[32][32] 容易冲突问题 3为什么 tile[32][33] 有效7. 总结Global Memory 的 Memory Coalescing让 warp 内线程尽量访问连续 Global 内存地址。但是也需要注意Shared Memory 很快但如果访问方式不合理也会因为 bank conflict 变慢。摘要1. Shared Memory bank 是什么 2. 什么是 bank conflict 3. 为什么矩阵转置容易产生 bank conflict 4. 为什么 tile[32][33] 可以减少 bank conflict 5. 通过实验比较 no-padding 和 padding 的性能差异1. 核心原理Shared Memory 为什么也会冲突Shared Memory 它内部被分成多个bank。在很多 NVIDIA GPU 上可以简化理解为Shared Memory ≈ 32 个 bank Warp 32 个线程理想情况下一个 warp 的 32 个线程同时访问 shared memory 时thread 0 - bank 0 thread 1 - bank 1 thread 2 - bank 2 ... thread 31 - bank 31这时访问可以并行完成速度很快。但是如果多个线程访问同一个 bankthread 0 - bank 0 thread 1 - bank 0 thread 2 - bank 0 ...那么这些访问会被串行化。这就是Bank Conflict共享内存 bank 冲突。2. 用数组理解 bank 映射假设 shared memory 中存的是float一个float 4 bytes。可以粗略理解为bank_id index % 32例如shared[0] - bank 0 shared[1] - bank 1 shared[2] - bank 2 ... shared[31] - bank 31 shared[32] - bank 0 shared[33] - bank 1所以如果 warp 内线程访问shared[threadIdx.x]那么thread 0 - shared[0] - bank 0 thread 1 - shared[1] - bank 1 ... thread31 - shared[31] - bank31没有冲突。但如果访问shared[threadIdx.x*32]那么thread 0 - shared[0] - bank 0 thread 1 - shared[32] - bank 0 thread 2 - shared[64] - bank 0 ... thread31 - shared[992] - bank 0全部撞到 bank 0性能会明显下降。3. 为什么矩阵转置容易出现 bank conflict矩阵转置优化通常会这样做Global Memory 连续读 ↓ Shared Memory 暂存 tile ↓ Shared Memory 中完成局部转置 ↓ Global Memory 连续写经典 shared memory tile__shared__floattile[32][32];写入 shared memory 时tile[threadIdx.y][threadIdx.x]读取 shared memory 时为了转置会变成tile[threadIdx.x][threadIdx.y]问题出在第二种访问。对于tile[32][32]每一行长度是 32。按列访问时很容易出现thread 0 - tile[0][same_col] thread 1 - tile[1][same_col] thread 2 - tile[2][same_col] ...由于每一行跨度刚好是 32 个 float不同线程访问的地址间隔刚好映射到同一个 bank。于是发生严重 bank conflict。4. 为什么tile[32][33]能解决问题经典优化是把__shared__floattile[32][32];改成__shared__floattile[32][33];也就是每一行多加 1 个 float。这样每一行的跨度从32变成33bank 映射就会错开。原来第 0 行起点 - bank 0 第 1 行起点 - bank 0 第 2 行起点 - bank 0加 padding 后第 0 行起点 - bank 0 第 1 行起点 - bank 1 第 2 行起点 - bank 2 ...这样按列访问时不同线程更容易落到不同 bank冲突显著减少。这就是Padding 优化。5. 实验矩阵转置 no-padding vs padding用矩阵转置来观察 bank conflict。我们实现两个优化版 transpose kernel1. transpose_shared_no_padding 使用 tile[32][32] 可能有 shared memory bank conflict 2. transpose_shared_padding 使用 tile[32][33] 通过 padding 减少 bank conflict两个版本 Global Memory 读写模式基本一致主要差别在 shared memory 的列访问是否冲突。代码如下#includecuda_runtime.h#includecmath#includecstdlib#includeiomanip#includeiostream#includevector#defineCUDA_CHECK(call)\do{\cudaError_t errcall;\if(err!cudaSuccess){\std::cerrCUDA Error: cudaGetErrorString(err)\ at __FILE__:__LINE__std::endl;\std::exit(EXIT_FAILURE);\}\}while(0)constexprintTILE_DIM32;constexprintBLOCK_ROWS8;/* * Shared Memory transpose without padding. * * tile[32][32] 在转置读取时容易产生 bank conflict。 */__global__voidtranspose_shared_no_padding(constfloat*in,float*out,intwidth,intheight){__shared__floattile[TILE_DIM][TILE_DIM];intxblockIdx.x*TILE_DIMthreadIdx.x;intyblockIdx.y*TILE_DIMthreadIdx.y;/* * 读取 input。 * * threadIdx.x 连续因此 Global Memory 读取是 coalesced 的。 */for(intj0;jTILE_DIM;jBLOCK_ROWS){intyyyj;if(xwidthyyheight){tile[threadIdx.yj][threadIdx.x]in[yy*widthx];}}__syncthreads();/* * 交换 blockIdx.x 和 blockIdx.y写出转置后的 tile。 */intout_xblockIdx.y*TILE_DIMthreadIdx.x;intout_yblockIdx.x*TILE_DIMthreadIdx.y;/* * 这里读取 tile[threadIdx.x][threadIdx.y j]。 * 对 tile[32][32] 来说按列读 shared memory 容易发生 bank conflict。 */for(intj0;jTILE_DIM;jBLOCK_ROWS){intoyout_yj;if(out_xheightoywidth){out[oy*heightout_x]tile[threadIdx.x][threadIdx.yj];}}}/* * Shared Memory transpose with padding. * * tile[32][33] 通过多加 1 列打散 bank 映射减少 bank conflict。 */__global__voidtranspose_shared_padding(constfloat*in,float*out,intwidth,intheight){__shared__floattile[TILE_DIM][TILE_DIM1];intxblockIdx.x*TILE_DIMthreadIdx.x;intyblockIdx.y*TILE_DIMthreadIdx.y;/* * Global Memory 连续读取。 */for(intj0;jTILE_DIM;jBLOCK_ROWS){intyyyj;if(xwidthyyheight){tile[threadIdx.yj][threadIdx.x]in[yy*widthx];}}__syncthreads();intout_xblockIdx.y*TILE_DIMthreadIdx.x;intout_yblockIdx.x*TILE_DIMthreadIdx.y;/* * 仍然是 tile[threadIdx.x][threadIdx.y j] * 但由于每行长度是 33bank 映射错开冲突减少。 */for(intj0;jTILE_DIM;jBLOCK_ROWS){intoyout_yj;if(out_xheightoywidth){out[oy*heightout_x]tile[threadIdx.x][threadIdx.yj];}}}/* * CPU reference transpose用于结果校验。 */voidtranspose_cpu(conststd::vectorfloatin,std::vectorfloatout,intwidth,intheight){for(inty0;yheight;y){for(intx0;xwidth;x){out[x*heighty]in[y*widthx];}}}/* * 计时函数。 * * 注意 * 这里测的是 kernel time不包含 H2D/D2H。 */templatetypenameLaunchfloattime_kernel(Launch launch,intrepeat){/* * warmup避免第一次 kernel 启动影响正式计时。 */launch();CUDA_CHECK(cudaDeviceSynchronize());cudaEvent_t start,stop;CUDA_CHECK(cudaEventCreate(start));CUDA_CHECK(cudaEventCreate(stop));floattotal_ms0.0f;for(inti0;irepeat;i){CUDA_CHECK(cudaEventRecord(start));launch();CUDA_CHECK(cudaEventRecord(stop));CUDA_CHECK(cudaEventSynchronize(stop));floatms0.0f;CUDA_CHECK(cudaEventElapsedTime(ms,start,stop));total_msms;}CUDA_CHECK(cudaEventDestroy(start));CUDA_CHECK(cudaEventDestroy(stop));returntotal_ms/repeat;}boolcheck_result(conststd::vectorfloatref,conststd::vectorfloatout,floateps1e-5f){if(ref.size()!out.size()){returnfalse;}for(size_t i0;iref.size();i){floatdiffstd::fabs(ref[i]-out[i]);if(diffeps){std::cerrMismatch at i, refref[i], outout[i], diffdiffstd::endl;returnfalse;}}returntrue;}intmain(intargc,char**argv){intwidth4096;intheight4096;intrepeat10;if(argc2){widthstd::atoi(argv[1]);}if(argc3){heightstd::atoi(argv[2]);}if(argc4){repeatstd::atoi(argv[3]);}size_t countstatic_castsize_t(width)*static_castsize_t(height);size_t bytescount*sizeof(float);std::coutShared Memory Bank Conflict\n;std::coutMatrix size : height x width\n;std::coutData size : bytes/1024.0/1024.0 MB\n;std::coutRepeat : repeat\n;std::vectorfloath_in(count);std::vectorfloath_ref(count);std::vectorfloath_no_padding(count);std::vectorfloath_padding(count);for(size_t i0;icount;i){h_in[i]static_castfloat((i*1713)%1000)*0.001f;}/* * CPU reference。 * 大矩阵时会慢一些但 4096x4096 可以接受。 */transpose_cpu(h_in,h_ref,width,height);float*d_innullptr;float*d_out_no_paddingnullptr;float*d_out_paddingnullptr;CUDA_CHECK(cudaMalloc(d_in,bytes));CUDA_CHECK(cudaMalloc(d_out_no_padding,bytes));CUDA_CHECK(cudaMalloc(d_out_padding,bytes));CUDA_CHECK(cudaMemcpy(d_in,h_in.data(),bytes,cudaMemcpyHostToDevice));dim3block(TILE_DIM,BLOCK_ROWS);dim3grid((widthTILE_DIM-1)/TILE_DIM,(heightTILE_DIM-1)/TILE_DIM);std::coutBlock : (block.x, block.y)\n;std::coutGrid : (grid.x, grid.y)\n;autolaunch_no_padding[](){transpose_shared_no_paddinggrid,block(d_in,d_out_no_padding,width,height);CUDA_CHECK(cudaGetLastError());};autolaunch_padding[](){transpose_shared_paddinggrid,block(d_in,d_out_padding,width,height);CUDA_CHECK(cudaGetLastError());};floatno_padding_mstime_kernel(launch_no_padding,repeat);floatpadding_mstime_kernel(launch_padding,repeat);CUDA_CHECK(cudaMemcpy(h_no_padding.data(),d_out_no_padding,bytes,cudaMemcpyDeviceToHost));CUDA_CHECK(cudaMemcpy(h_padding.data(),d_out_padding,bytes,cudaMemcpyDeviceToHost));boolok_no_paddingcheck_result(h_ref,h_no_padding);boolok_paddingcheck_result(h_ref,h_padding);/* * 转置大致是读一次 input 写一次 output。 */doublemoved_bytes2.0*static_castdouble(bytes);doubleno_padding_bwmoved_bytes/(no_padding_ms/1000.0)/1e9;doublepadding_bwmoved_bytes/(padding_ms/1000.0)/1e9;std::coutstd::fixedstd::setprecision(4);std::cout\n[Timing]\n;std::coutNo-padding time : no_padding_ms ms\n;std::coutPadding time : padding_ms ms\n;std::coutSpeedup : no_padding_ms/padding_msx\n;std::cout\n[Effective Bandwidth]\n;std::coutNo-padding BW : no_padding_bw GB/s\n;std::coutPadding BW : padding_bw GB/s\n;std::cout\n[Check]\n;std::coutNo-padding check: (ok_no_padding?PASS:FAIL)\n;std::coutPadding check : (ok_padding?PASS:FAIL)\n;CUDA_CHECK(cudaFree(d_in));CUDA_CHECK(cudaFree(d_out_no_padding));CUDA_CHECK(cudaFree(d_out_padding));return(ok_no_paddingok_padding)?0:1;}看到输出结果CUDA Lesson 8: Shared Memory Bank Conflict Matrix size : 4096 x 4096 Data size : 64 MB Repeat : 10 Block : (32, 8) Grid : (128, 128) [Timing] No-padding time : 1.2885 ms Padding time : 0.6721 ms Speedup : 1.9172x [Effective Bandwidth] No-padding BW : 104.1683 GB/s Padding BW : 199.7107 GB/s [Check] No-padding check: PASS Padding check : PASS6. 答疑问题 1为什么两个 kernel 结果都正确但速度不同因为 no-padding 和 padding 只是 shared memory 布局不同数学计算没有变。所以结果应该一致。但 shared memory 的 bank 映射不同导致访问效率不同。问题 2为什么tile[32][32]容易冲突因为 32 刚好和 bank 数量相同。按列访问时相邻线程访问地址间隔是 32 个 float容易映射到同一个 bank。问题 3为什么tile[32][33]有效因为每行多 1 个元素让下一行起始位置错开一个 bank。这样按列读取时线程不再集中撞同一个 bank。7. 总结核心结论是Shared Memory 很快但也有内部结构Shared Memory 被划分成多个 bank一个 warp 内多个线程访问同一个 bank会发生 bank conflictBank conflict 会导致 shared memory 访问串行化矩阵转置中的列访问非常容易触发 bank conflicttile[32][33] 通过 padding 打散 bank 映射Padding 代价很小但可能明显提升性能一句话总结Shared Memory 优化不仅是“用不用 shared memory”还要看“怎么访问 shared memory”padding 是解决矩阵转置 bank conflict 的经典技巧。

查看全文

http://www.rkmt.cn/news/1408315.html