cudaMemcpy()'s performance not improving after using cudaHostAlloc()

49 views Asked by At

I have a program that performs BLAS's gemm() subprogram in cuBLAS as follows:

size_t m = 10000;
size_t k = 4000;
size_t n = 6000;

// h_A is an m-by-k matrix, h_B is a k-by-n matrix and h_C is a m-by-n matrix.

double *h_C;
cudaHostAlloc((void **)(&h_C), m * n * sizeof(double), cudaHostAllocDefault);

uint64_t t0 = get_timestamp_in_microsec();

CUBLAS_CHECK(cublasCreate(&cublasH));

uint64_t t1 = get_timestamp_in_microsec();

CUDA_CHECK(cudaMalloc((void **)(&d_A), sizeof(double) * m * k));
CUDA_CHECK(cudaMalloc((void **)(&d_B), sizeof(double) * k * n));
CUDA_CHECK(cudaMalloc((void **)(&d_C), sizeof(double) * m * n));

uint64_t t2 = get_timestamp_in_microsec();

CUDA_CHECK(cudaMemcpy(d_A, h_A.data(), sizeof(double) * m * k, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(d_B, h_B.data(), sizeof(double) * k * n, cudaMemcpyHostToDevice));

uint64_t t3 = get_timestamp_in_microsec();

CUBLAS_CHECK(cublasDgemm(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, ldc));

uint64_t t4 = get_timestamp_in_microsec();

CUDA_CHECK(cudaMemcpy(h_C, d_C, sizeof(double) * m * n, cudaMemcpyDeviceToHost));

uint64_t t5 = get_timestamp_in_microsec();

The performace is okay except the final cudaMemcpy() that moves d_C back to h_C takes almost 3 seconds:

t1-t0: 12.929ms
t2-t1: 0.949ms
t3-t2: 53.256ms
t4-t3: 0.315ms
t5-t4: 2653.57ms

After searching SO and other sites, I migrated from malloc() to cudaHostAlloc() but the above result seems not changing at all.

Any issue in my code or is this performance optimal already?

1

There are 1 answers

0
D.J. Elkind On

As suggested by @njuffa, I inserted cudaDeviceSynchronize(); right after CUBLAS_CHECK(cublasDgemm(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, ldc));. The numbers are more reasonable now:

t1-t0: 12.826ms
t2-t1: 0.955ms
t3-t2: 52.843ms
t4-t3: 2620.88ms
t5-t4: 36.406ms