|  | 
// ====------ asm_cp.cu ----------------------------------- *- CUDA -* ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//
// ===---------------------------------------------------------------------===//

#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
// Runs the boolean test function FN and reports "Test <FN> PASS"/"FAIL".
// On failure, returns 1 from the enclosing function (intended for main()).
// Wrapped in do { } while (0) so the macro expands to a single statement and
// is safe to use unbraced inside if/else (the bare `{ ... }` form breaks
// `if (c) TEST(f); else ...` because of the trailing semicolon).
#define TEST(FN)                                                               \
  do {                                                                         \
    if (FN()) {                                                                \
      printf("Test " #FN " PASS\n");                                           \
    } else {                                                                   \
      printf("Test " #FN " FAIL\n");                                           \
      return 1;                                                                \
    }                                                                          \
  } while (0)
|  | 23 | + | 
// Predicated 16-byte asynchronous copy from global to shared memory.
// Emits cp.async.cg.shared.global guarded by a PTX predicate register, so the
// copy is issued only when `pred` is nonzero. Requires SM80+ (cp.async).
// The copy is asynchronous: callers must drain it with
// cp.async.commit_group / cp.async.wait_all before reading the shared slot.
// NOTE(review): cp.async with size 16 presumably requires both pointers to be
// 16-byte aligned — confirm callers pass int4-aligned addresses.
__device__ inline void cp_async4_pred(void *smem_ptr, const void *glob_ptr,
                                      bool pred = true) {
  const int BYTES = 16; // copy size operand: one int4 per call
  // Convert the generic shared-memory pointer to a 32-bit shared-space
  // address, as required by the [%1] shared operand of cp.async.
  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
  asm volatile("{\n"
               "   .reg .pred p;\n"
               "   setp.ne.b32 p, %0, 0;\n"   // p = (pred != 0)
               "   @p cp.async.cg.shared.global [%1], [%2], %3;\n"
               "}\n"
               :
               : "r"((int)pred), "r"(smem), "l"(glob_ptr), "n"(BYTES));
}
|  | 36 | + | 
// Odd-indexed threads stage one int4 from d_in into dynamic shared memory via
// the predicated async copy, then write it back to d_out; even-indexed threads
// do nothing. Launch as one block with blockDim.x * sizeof(int4) bytes of
// dynamic shared memory. Requires SM80+ (cp.async).
//
// Fix: the original placed __syncthreads() inside the divergent
// `if (tid % 2)` branch — a block-wide barrier reached by only half the
// threads is undefined behavior (deadlock/corruption). The barrier must be
// executed unconditionally by every thread in the block.
__global__ void test_cp_async4_pred(int4 *d_out, int4 *d_in) {
  extern __shared__ int4 smem[];
  int tid = threadIdx.x;

  if (tid % 2) {
    cp_async4_pred(&smem[tid], &d_in[tid], true);

    // Commit and wait for this thread's outstanding async copy.
    asm volatile("cp.async.commit_group;" ::: "memory");
    asm volatile("cp.async.wait_all;" ::: "memory");
  }

  // Barrier outside divergent control flow: all threads reach it.
  __syncthreads();

  if (tid % 2) {
    d_out[tid] = smem[tid];
  }
}
|  | 51 | + | 
// Host driver for test_cp_async4_pred: N threads in one block, odd threads
// round-trip one int4 through shared memory. Returns true when every
// odd-indexed output element matches the input.
//
// Fix: every CUDA call was previously unchecked, so a failed allocation,
// launch, or copy would silently compare stale host buffers and could report
// a false PASS. All calls (and malloc) are now verified.
bool cp_async4_pred_test() {
  const int N = 256;
  size_t size = N * sizeof(int4);

  // Report a CUDA error and fail the test instead of continuing silently.
  auto check = [](cudaError_t err, const char *what) {
    if (err != cudaSuccess) {
      std::cout << what << " failed: " << cudaGetErrorString(err) << "\n";
      return false;
    }
    return true;
  };

  // Allocate host memory
  int4 *h_in = (int4 *)malloc(size);
  int4 *h_out = (int4 *)malloc(size);
  if (h_in == nullptr || h_out == nullptr) {
    free(h_in);
    free(h_out);
    return false;
  }

  for (int i = 0; i < N; i++) {
    h_in[i] = make_int4(i, i + 1, i + 2, i + 3);
  }

  int4 *d_in = nullptr, *d_out = nullptr;
  bool passed = check(cudaMalloc(&d_in, size), "cudaMalloc(d_in)") &&
                check(cudaMalloc(&d_out, size), "cudaMalloc(d_out)") &&
                check(cudaMemcpy(d_in, h_in, size, cudaMemcpyHostToDevice),
                      "cudaMemcpy H2D");

  if (passed) {
    // One block of N threads; dynamic shared memory holds N int4 elements.
    test_cp_async4_pred<<<1, N, size>>>(d_out, d_in);
    passed = check(cudaGetLastError(), "kernel launch") &&
             check(cudaDeviceSynchronize(), "cudaDeviceSynchronize") &&
             check(cudaMemcpy(h_out, d_out, size, cudaMemcpyDeviceToHost),
                   "cudaMemcpy D2H");
  }

  if (passed) {
    // Only odd-indexed threads perform the predicated copy, so even-indexed
    // output slots are intentionally left unchecked.
    for (int i = 0; i < N; i++) {
      if (i % 2 && (h_out[i].x != h_in[i].x || h_out[i].y != h_in[i].y ||
                    h_out[i].z != h_in[i].z || h_out[i].w != h_in[i].w)) {
        passed = false;
        std::cout << "Mismatch at index " << i << "\n";
        break;
      }
    }
  }

  free(h_in);
  free(h_out);
  cudaFree(d_in);
  cudaFree(d_out);
  return passed;
}
|  | 95 | + | 
// Unconditional 16-byte asynchronous copy from global to shared memory using
// cp.async.cg.shared.global. Requires SM80+ (cp.async). The copy is
// asynchronous: callers must drain it with cp.async.commit_group /
// cp.async.wait_all before reading the shared slot.
// NOTE(review): cp.async with size 16 presumably requires both pointers to be
// 16-byte aligned — confirm callers pass int4-aligned addresses.
__device__ inline void cp_async4(void *smem_ptr, const void *glob_ptr) {
  const int BYTES = 16; // copy size operand: one int4 per call
  // Convert the generic shared-memory pointer to a 32-bit shared-space
  // address, as required by the [%0] shared operand of cp.async.
  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
  asm volatile("{\n"
               "   cp.async.cg.shared.global [%0], [%1], %2;\n"
               "}\n" ::"r"(smem),
               "l"(glob_ptr), "n"(BYTES));
}
|  | 104 | + | 
// Each thread asynchronously stages one int4 from d_in into dynamic shared
// memory, drains the async-copy pipeline, and writes its element to d_out so
// the host can verify the round trip. Launch as one block with
// blockDim.x * sizeof(int4) bytes of dynamic shared memory; requires SM80+.
__global__ void test_cp_async4(int4 *d_out, int4 *d_in) {
  extern __shared__ int4 staging[];
  const int idx = threadIdx.x;

  // Issue this thread's 16-byte global->shared async copy.
  cp_async4(&staging[idx], &d_in[idx]);

  // Commit the outstanding copy and wait for it to land, then barrier so the
  // whole block's staged data is visible before any reads.
  asm volatile("cp.async.commit_group;" ::: "memory");
  asm volatile("cp.async.wait_all;" ::: "memory");
  __syncthreads();

  // Publish the staged value for host-side verification.
  d_out[idx] = staging[idx];
}
|  | 120 | + | 
// Host driver for test_cp_async4: N threads in one block, each round-trips
// one int4 through shared memory. Returns true when every output element
// matches the input.
//
// Fix: every CUDA call was previously unchecked, so a failed allocation,
// launch, or copy would silently compare stale host buffers and could report
// a false PASS. All calls (and malloc) are now verified.
bool cp_async4_test() {
  const int N = 256;
  size_t size = N * sizeof(int4);

  // Report a CUDA error and fail the test instead of continuing silently.
  auto check = [](cudaError_t err, const char *what) {
    if (err != cudaSuccess) {
      std::cout << what << " failed: " << cudaGetErrorString(err) << "\n";
      return false;
    }
    return true;
  };

  // Allocate host memory
  int4 *h_in = (int4 *)malloc(size);
  int4 *h_out = (int4 *)malloc(size);
  if (h_in == nullptr || h_out == nullptr) {
    free(h_in);
    free(h_out);
    return false;
  }

  for (int i = 0; i < N; i++) {
    h_in[i] = make_int4(i, i + 1, i + 2, i + 3);
  }

  int4 *d_in = nullptr, *d_out = nullptr;
  bool passed = check(cudaMalloc(&d_in, size), "cudaMalloc(d_in)") &&
                check(cudaMalloc(&d_out, size), "cudaMalloc(d_out)") &&
                check(cudaMemcpy(d_in, h_in, size, cudaMemcpyHostToDevice),
                      "cudaMemcpy H2D");

  if (passed) {
    // One block of N threads; dynamic shared memory holds N int4 elements.
    test_cp_async4<<<1, N, size>>>(d_out, d_in);
    passed = check(cudaGetLastError(), "kernel launch") &&
             check(cudaDeviceSynchronize(), "cudaDeviceSynchronize") &&
             check(cudaMemcpy(h_out, d_out, size, cudaMemcpyDeviceToHost),
                   "cudaMemcpy D2H");
  }

  if (passed) {
    for (int i = 0; i < N; i++) {
      if (h_out[i].x != h_in[i].x || h_out[i].y != h_in[i].y ||
          h_out[i].z != h_in[i].z || h_out[i].w != h_in[i].w) {
        passed = false;
        std::cout << "Mismatch at index " << i << "\n";
        break;
      }
    }
  }

  free(h_in);
  free(h_out);
  cudaFree(d_in);
  cudaFree(d_out);
  return passed;
}
|  | 163 | + | 
// Entry point: runs both cp.async tests. TEST returns 1 from main on the
// first failure; reaching the end means every test passed.
int main() {
  TEST(cp_async4_pred_test);
  TEST(cp_async4_test);
  return 0;
}
0 commit comments