I have a program that calls the BLAS gemm() routine through cuBLAS as follows:
size_t m = 10000;
size_t k = 4000;
size_t n = 6000;
// h_A is an m-by-k matrix, h_B is a k-by-n matrix and h_C is an m-by-n matrix.
// h_A and h_B are host buffers (e.g. std::vector<double>) filled in elsewhere;
// cuBLAS interprets them as column-major.
cublasHandle_t cublasH = nullptr;
double *d_A = nullptr, *d_B = nullptr, *d_C = nullptr;
const double alpha = 1.0, beta = 0.0;  // GEMM scalars (values assumed here)
const int lda = m, ldb = k, ldc = m;   // leading dimensions for non-transposed, column-major operands
double *h_C;
CUDA_CHECK(cudaHostAlloc((void **)(&h_C), m * n * sizeof(double), cudaHostAllocDefault));
uint64_t t0 = get_timestamp_in_microsec();
CUBLAS_CHECK(cublasCreate(&cublasH));
uint64_t t1 = get_timestamp_in_microsec();
CUDA_CHECK(cudaMalloc((void **)(&d_A), sizeof(double) * m * k));
CUDA_CHECK(cudaMalloc((void **)(&d_B), sizeof(double) * k * n));
CUDA_CHECK(cudaMalloc((void **)(&d_C), sizeof(double) * m * n));
uint64_t t2 = get_timestamp_in_microsec();
CUDA_CHECK(cudaMemcpy(d_A, h_A.data(), sizeof(double) * m * k, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(d_B, h_B.data(), sizeof(double) * k * n, cudaMemcpyHostToDevice));
uint64_t t3 = get_timestamp_in_microsec();
CUBLAS_CHECK(cublasDgemm(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, ldc));
uint64_t t4 = get_timestamp_in_microsec();
CUDA_CHECK(cudaMemcpy(h_C, d_C, sizeof(double) * m * n, cudaMemcpyDeviceToHost));
uint64_t t5 = get_timestamp_in_microsec();
The performance is okay, except that the final cudaMemcpy() that moves d_C back to h_C takes almost 3 seconds:
t1-t0: 12.929ms
t2-t1: 0.949ms
t3-t2: 53.256ms
t4-t3: 0.315ms
t5-t4: 2653.57ms
After searching SO and other sites, I migrated from malloc() to cudaHostAlloc(), but the result above does not change at all.
Is there an issue in my code, or is this performance already optimal?
As suggested by @njuffa, I inserted a cudaDeviceSynchronize() call right after the cublasDgemm() call, as shown in the snippet below.
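For clarity, this is a minimal sketch of the placement, reusing the same CUDA_CHECK/CUBLAS_CHECK macros and get_timestamp_in_microsec() helper as above. cublasDgemm() only launches the kernel asynchronously, so without the synchronization the host timestamp t4 is taken before the GEMM has finished and its cost shows up in the subsequent blocking cudaMemcpy().
CUBLAS_CHECK(cublasDgemm(cublasH, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, &alpha, d_A, lda, d_B, ldb, &beta, d_C, ldc));
// cublasDgemm() only enqueues work; wait for it to finish so t4 reflects the GEMM itself.
cudaDeviceSynchronize();
uint64_t t4 = get_timestamp_in_microsec();
CUDA_CHECK(cudaMemcpy(h_C, d_C, sizeof(double) * m * n, cudaMemcpyDeviceToHost));
uint64_t t5 = get_timestamp_in_microsec();
With the synchronization in place, the numbers are more reasonable now: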