#include #include #include #include #include #include #include #define A(row, col) A[(col) * 3 + (row)] //3 is lda of matrix A using data_type = cuDoubleComplex; void printMatrix(int m, int n, const data_type* A, const char* name) { int row, col; for (row = 0; row < m; row++) { for (col = 0; col < n; col++) { printf("%s(%d, %d) = % lf% + lfi\n", name, row + 1, col + 1, A(row, col).x, A(row, col).y); } } } void printMatrixS1(int m, int n, const cuDoubleComplex* A, const char* name) { int row, col; for (row = 0; row < m - 2; row++) { for (col = 0; col < n; col++) { printf("%s(%d, %d) = % lf% \n", name, row + 1, col + 1, A(row, col).x); printf("%s(%d, %d) = % lf% \n", name, row + 2, col + 2, A(row, col).y); } printf("%s(%d, %d) = % lf% \n", name, row + 3, col + 2, A(row + 1, col - 1).x); } } void printMatrixS2(int m, int n, const cuDoubleComplex* A, const char* name) { int row, col; int i = 1; for (row = 0; row < m - 2; row++) { for (col = 0; col < n; col++) { printf("%s(%d, %d) = % lf% \n", name, row + i, row + i, A(row, col).x); i++; printf("%s(%d, %d) = % lf% \n", name, row + i, row + i, A(row, col).y); } } } void printMatrixS3(int m, int n, const data_type* A, const char* name) { int row, col; int i = 1; for (row = 0; row < n; row++) { //********** for (col = 0; col < n; col++) { printf("%s(%d, %d) = % lf% \n", name, row + i, row + i, A(row, col).x); i++; printf("%s(%d, %d) = % lf% \n", name, row + i, row + i, A(row, col).y); } } } void printMatrixW(int m, int n, const data_type* A, const char* name) { int row, col; int i = 1; for (row = 0; row < m - 2; row++) { //********** for (col = 0; col < n; col++) { printf("%s(%d, %d) = % lf% \n", name, row + i, row + i, A(row, col).x); i++; printf("%s(%d, %d) = % lf% \n", name, row + i, row + i, A(row, col).y); } } } int main(int argc, char* argv[]) { cusolverDnHandle_t cusolverH = NULL; cublasHandle_t cublasH = NULL; cublasStatus_t cublas_status = CUBLAS_STATUS_SUCCESS; cusolverStatus_t cusolver_status = CUSOLVER_STATUS_SUCCESS; cudaError_t cudaStat1 = cudaSuccess; cudaError_t cudaStat2 = cudaSuccess; cudaError_t cudaStat3 = cudaSuccess; cudaError_t cudaStat4 = cudaSuccess; cudaError_t cudaStat5 = cudaSuccess; cudaError_t cudaStat6 = cudaSuccess; cudaError_t cudaStat7 = cudaSuccess; const int m = 3; //******************* const int n = 2; //******************* const int lda = m; // lda >= m const int ldu = m; // ldu >= m const int ldvt = m; // important, ldvt >= n if jobu = 'A' data_type A[lda * n]; data_type U[ldu * m]; /* ldu-by-m unitary matrix */ data_type VT[ldvt * n]; /* ldvt-by-n unitary matrix */ data_type S[n]; /* singular value */ cuDoubleComplex SR[m * n] = { 0 }; /* | 1 2 | * A = | 4 5 | * | 2 1 | */ A(0, 0) = make_cuDoubleComplex(1.0, 0.0); A(0, 1) = make_cuDoubleComplex(2.0, 0.0); A(1, 0) = make_cuDoubleComplex(4.0, 0.0); A(1, 1) = make_cuDoubleComplex(5.0, 0.0); A(2, 0) = make_cuDoubleComplex(2.0, 0.0); A(2, 1) = make_cuDoubleComplex(1.0, 0.0); data_type* d_A = NULL; data_type* d_S = NULL; // singular values data_type* d_U = NULL; // left singular vectors data_type* d_VT = NULL; // right singular vectors int* d_info = NULL; data_type* d_W = NULL; /* W = S*VT */ /**/ data_type* d_SR = NULL; //singular values void* bufferOnDevice = NULL; size_t workspaceInBytesOnDevice = 0; void* bufferOnHost = NULL; size_t workspaceInBytesOnHost = 0; int info_gpu = 0; printf("A = (matlab base-1)\n"); printMatrix(m, n, A, "A"); printf("=====\n"); /* step 1: create cusolverDn/cublas handle */ cusolver_status = cusolverDnCreate(&cusolverH); assert(CUSOLVER_STATUS_SUCCESS == cusolver_status); cublas_status = cublasCreate(&cublasH); assert(CUBLAS_STATUS_SUCCESS == cublas_status); /* step 2: copy A and B to device */ cudaStat1 = cudaMalloc((void**)&d_A, sizeof(data_type) * lda * n); cudaStat2 = cudaMalloc((void**)&d_S, sizeof(data_type) * n); cudaStat3 = cudaMalloc((void**)&d_U, sizeof(data_type) * ldu * m); cudaStat4 = cudaMalloc((void**)&d_VT, sizeof(data_type) * ldvt * n); cudaStat5 = cudaMalloc((void**)&d_info, sizeof(int)); cudaStat6 = cudaMalloc((void**)&d_W, sizeof(data_type) * lda * n); assert(cudaSuccess == cudaStat1); assert(cudaSuccess == cudaStat2); assert(cudaSuccess == cudaStat3); assert(cudaSuccess == cudaStat4); assert(cudaSuccess == cudaStat5); assert(cudaSuccess == cudaStat6); /* Syntax: cudaMemcpy(Destination Pointer Variable, Source Pointer Variable, size of memory, cudaMemcpyHostToDevice / cudaMemcpyDeviceToHost) */ cudaStat1 = cudaMemcpy(d_A, A, sizeof(data_type) * lda * n, cudaMemcpyHostToDevice); assert(cudaSuccess == cudaStat1); cudaDeviceSynchronize(); /* wait until d_A is ready */ signed char jobu = 'A'; /* all m columns of U */ signed char jobvt = 'A'; /* all n rows of VT */ /* step 3: query working space of SVD */ cusolver_status = cusolverDnXgesvd_bufferSize( cusolverH, NULL, /* params */ jobu, jobvt, (int64_t)m, (int64_t)n, CUDA_C_64F, /* dataTypeA */ d_A, (int64_t)lda, CUDA_R_64F, /* dataTypeS */ d_S, CUDA_C_64F, /* dataTypeU */ d_U, (int64_t)ldu, CUDA_C_64F, /* dataTypeVT */ d_VT, (int64_t)ldvt, CUDA_C_64F, /* computeType */ &workspaceInBytesOnDevice, &workspaceInBytesOnHost); cudaStat1 = cudaMalloc((void**)&bufferOnDevice, workspaceInBytesOnDevice); assert(cudaSuccess == cudaStat1); if (0 < workspaceInBytesOnHost) { bufferOnHost = (void*)malloc(workspaceInBytesOnHost); assert(NULL != bufferOnHost); } /* step 4: compute SVD */ cusolver_status = cusolverDnXgesvd( cusolverH, NULL, /* params */ jobu, jobvt, (int64_t)m, (int64_t)n, CUDA_C_64F, /* dataTypeA */ d_A, (int64_t)lda, CUDA_R_64F, /* dataTypeS */ d_S, /* real array of dimension min(m, n) */ CUDA_C_64F, /* dataTypeU */ d_U, (int64_t)ldu, CUDA_C_64F, /* dataTypeVT */ d_VT, (int64_t)ldvt, CUDA_C_64F, /* computeType */ bufferOnDevice, /* d_work */ workspaceInBytesOnDevice, bufferOnHost, /* h_work */ workspaceInBytesOnHost, d_info); cudaStat1 = cudaDeviceSynchronize(); assert(CUSOLVER_STATUS_SUCCESS == cusolver_status); assert(cudaSuccess == cudaStat1); /* Syntax: cudaMemcpy(Destination Pointer Variable, Source Pointer Variable, size of memory, cudaMemcpyHostToDevice / cudaMemcpyDeviceToHost) */ cudaStat1 = cudaMemcpy(U, d_U, sizeof(data_type) * ldu * m, cudaMemcpyDeviceToHost); cudaStat2 = cudaMemcpy(VT, d_VT, sizeof(data_type) * ldvt * n, cudaMemcpyDeviceToHost); cudaStat3 = cudaMemcpy(S, d_S, sizeof(data_type) * n, cudaMemcpyDeviceToHost); cudaStat4 = cudaMemcpy(&info_gpu, d_info, sizeof(int), cudaMemcpyDeviceToHost); assert(cudaSuccess == cudaStat1); assert(cudaSuccess == cudaStat2); assert(cudaSuccess == cudaStat3); assert(cudaSuccess == cudaStat4); cudaDeviceSynchronize(); /* wait until host data is ready */ printf("after gesvd: info_gpu = %d\n", info_gpu); assert(0 == info_gpu); printf("=====\n"); printf("S = (matlab base-1)\n"); printMatrixS3(n, 1, S, "S"); printf("=====\n"); printf("U = (matlab base-1)\n"); printMatrix(m, m, U, "U"); printf("=====\n"); printf("VT = (matlab base-1)\n"); printMatrix(n, n, VT, "VT"); printf("=====\n"); /*SR is a rectangulatr matrix*/ int i = 0; for (int j = 0; j < m - 2; j++) { SR[4 * i].x = S[j].x; SR[4 * (i + 1)].x = S[j].y; i = i + 2; } /* || 0 || 3 || * SR = || 1 || 4 || * || 2 || 5 || */ printf("SR of S = (matlab base-1)\n"); printMatrix(m, n, SR, "SR"); printf("=====\n"); /*d_SR replaces d_S*/ cudaStat7 = cudaMalloc((void**)&d_SR, sizeof(data_type) * m * n); cudaStat2 = cudaMemcpy(d_SR, SR, sizeof(data_type) * m * n, cudaMemcpyHostToDevice); assert(cudaSuccess == cudaStat7); assert(cudaSuccess == cudaStat2); cudaDeviceSynchronize(); /* wait until host data is ready */ /* step 6: |A - U*S*VT| */ /* W = S*VT */ // C = diag(X) x A if mode == CUBLAS_SIDE_LEFT //d_W = diag(d_SR) x d_VT, matrix-matrix multiplication in device cublas_status = cublasZdgmm( cublasH, CUBLAS_SIDE_LEFT, n, //number of rows of matrix A and C n, //number of columns of matrix A and C d_VT, //A, array of dimensions lda x n with lda >= max(1, m) ldvt, //lda of A d_SR, //one-dimensional array of size |inc| x m if CUBLAS_SIDE_LEFT 4, //stride of one-dimensional array X, incx = ldb + 1 d_W, //array of dimensions ldc x n with ldc >= max(1, m) lda); //leading dimension of a 2-dimensional array used to store C assert(CUBLAS_STATUS_SUCCESS == cublas_status); /* A := -U*W + A */ cudaStat1 = cudaMemcpy(d_A, A, sizeof(data_type) * lda * n, cudaMemcpyHostToDevice); assert(cudaSuccess == cudaStat1); cudaDeviceSynchronize(); /* wait until d_A is ready */ // Cmxn = alpha X Amxk X Bkxn + beta X Cmxn // const data_type h_zero = make_cuDoubleComplex(0, 0); const data_type h_one = make_cuDoubleComplex(1, 0); const data_type h_minus_one = make_cuDoubleComplex(-1, 0); //in device operation cublas_status = cublasZgemm_v2( cublasH, CUBLAS_OP_N, //Amxk, U CUBLAS_OP_N, //Bkxn, W m, /* m, number of rows of A */ n, /* n, number of columns of A */ n, /* k, number of columns of U */ &h_minus_one, /* alpha = -1, host pointer */ d_U, //Amxk, U ldu, d_W, //Bkxn, W lda, &h_one, /* beta = 1, host pointer */ d_A, //Cmxn, A lda); assert(CUBLAS_STATUS_SUCCESS == cublas_status); cudaStat1 = cudaMemcpy(A, d_A, sizeof(data_type) * lda * n, cudaMemcpyDeviceToHost); assert(cudaSuccess == cudaStat1); cudaDeviceSynchronize(); /* wait until host data is ready */ printf("A - U*S*VT = (matlab base-1)\n"); printMatrix(m, n, A, "(A - U*S*VT)"); printf("=====\n"); double dR_fro = 0.0; cublas_status = cublasDznrm2_v2( cublasH, lda * n, //integer, number of elements of d_A d_A, 1, //stride between elements of d_A &dR_fro); //the resulting norm assert(CUBLAS_STATUS_SUCCESS == cublas_status); printf("|A - U*S*VT| = %E \n", dR_fro); /* free resources */ if (d_A) cudaFree(d_A); if (d_S) cudaFree(d_S); if (d_U) cudaFree(d_U); if (d_VT) cudaFree(d_VT); if (d_info) cudaFree(d_info); if (d_W) cudaFree(d_W); if (bufferOnDevice) cudaFree(bufferOnDevice); if (bufferOnHost) cudaFree(bufferOnHost); /**/ if (d_SR) cudaFree(d_SR); if (cublasH) cublasDestroy(cublasH); if (cusolverH) cusolverDnDestroy(cusolverH); cudaDeviceReset(); return 0; }