#include #include #include #include #include //#include #include #include "cusolver_utils.h" /* void static printMatrix(int m, int n, const float* A, const char* name) { for (int row = 0; row < 2 * m * n; row++) { printf("%s(%d) = %lf%\n", name, row + 1, A[row]); } } */ void static printMatrixS(int m, const float* A, const char* name) { for (int row = 0; row < m; row++) { printf("%s(%d) = %lf%\n", name, row + 1, A[row]); } } #define A(row, col) A[(col)*3 + (row)] void static printMatrixA(int m, int n, const cuComplex* A, const char* name) { int row, col; for (row = 0; row < m; row++) { for (col = 0; col < n; col++) { printf("%s(%d, %d) = % lf% + lfi\n", name, row + 1, col + 1, A(row, col).x, A(row, col).y); } } } #define MIN(a,b) ((a) < (b) ? (a) : (b)) /***************************************************************/ /*****************************selfSVD***************************/ /* matrixA is a m-row by n-column input matrix */ /* matrixS, matrixU, and matrixVT are output matrices */ /***************************************************************/ /* matrixA is a float[2*n*m] 1D Hankel matrix matrixS is a float[min(n, m)] 1D singular value matrix matrixR is a float[2*n*n] 1D U matrix matrixT is a float[2*m*m] 1D VT matrix n: number of lines of Hankel matrix m: number of columns of Hankel matrix */ void selfSVD(const int m, const int n, float* matrixA, float* matrixS, float* matrixR, float* matrixT) { // printf("matrixA = (matlab base-1)\n"); // printMatrix(m, n, matrixA, "matrixA"); // printf("=====\n"); cusolverDnHandle_t cusolverH = NULL; // cublasHandle_t cublasH = NULL; // cublasStatus_t cublas_status = CUBLAS_STATUS_SUCCESS; cusolverStatus_t cusolver_status = CUSOLVER_STATUS_SUCCESS; const int lda = m; // lda >= m const int ldu = m; // ldu >= m const int ldvt = n; // ldvt >= n if jobu = 'A' // point to host device cuComplex* A = (cuComplex*)malloc(sizeof(cuComplex) * m * n); cuComplex* U = (cuComplex*)malloc(sizeof(cuComplex) * m * m); cuComplex* VT = (cuComplex*)malloc(sizeof(cuComplex) * n * n); if (!A || !U || !VT) { /* Memory location failed */ free(A); free(U); free(VT); exit(EXIT_FAILURE); } // matrix A for (int i = 0; i < m * n; i++) { A[i] = make_cuComplex((float)matrixA[2 * i], (float)matrixA[2 * i + 1]); } printf("A = (matlab base-1)\n"); printMatrixA(m, n, A, "A"); printf("=====\n"); // point to device device cuComplex* d_A = NULL; float* d_S = NULL; cuComplex* d_U = NULL; cuComplex* d_VT = NULL; int* d_info = NULL; void* bufferOnDevice = NULL; size_t workspaceInBytesOnDevice = 0; void* bufferOnHost = NULL; size_t workspaceInBytesOnHost = 0; /* step 1: create cusolverDn */ CUSOLVER_CHECK(cusolverDnCreate(&cusolverH)); /* step 2: copy A to device */ CUDA_CHECK(cudaMalloc((void**)&d_A, sizeof(cuComplex) * m * n)); CUDA_CHECK(cudaMalloc((void**)&d_S, sizeof(float) * MIN(m, n))); CUDA_CHECK(cudaMalloc((void**)&d_U, sizeof(cuComplex) * m * m)); CUDA_CHECK(cudaMalloc((void**)&d_VT, sizeof(cuComplex) * n * n)); CUDA_CHECK(cudaMalloc((void**)&d_info, sizeof(int))); /* Syntax: cudaMemcpy(Destination Pointer Variable, Source Pointer Variable, size of memory, cudaMemcpyHostToDevice / cudaMemcpyDeviceToHost) */ CUDA_CHECK(cudaMemcpy(d_A, A, sizeof(cuComplex) * m * n, cudaMemcpyHostToDevice)); cudaDeviceSynchronize(); /* wait until d_A is ready */ /* step 3: query working space of SVD */ signed char jobu = 'A'; /* all m columns of U */ signed char jobvt = 'A'; /* all n rows of VT */ CUSOLVER_CHECK(cusolverDnXgesvd_bufferSize( cusolverH, NULL, /* params */ jobu, jobvt, (int32_t)m, (int32_t)n, CUDA_C_32F, /* dataTypeA */ d_A, (int32_t)lda, CUDA_R_32F, /* dataTypeS */ d_S, CUDA_C_32F, /* dataTypeU */ d_U, (int32_t)ldu, CUDA_C_32F, /* dataTypeVT */ d_VT, (int32_t)ldvt, CUDA_C_32F, /* computeType */ &workspaceInBytesOnDevice, &workspaceInBytesOnHost)); CUDA_CHECK(cudaMalloc((void**)&bufferOnDevice, workspaceInBytesOnDevice)); if (0 < workspaceInBytesOnHost) { bufferOnHost = (void*)malloc(workspaceInBytesOnHost); assert(NULL != bufferOnHost); } /* step 4: compute SVD */ CUSOLVER_CHECK(cusolverDnXgesvd( cusolverH, NULL, /* params */ jobu, jobvt, (int32_t)m, (int32_t)n, CUDA_C_32F, /* dataTypeA */ d_A, (int32_t)lda, CUDA_R_32F, /* dataTypeS */ d_S, CUDA_C_32F, /* dataTypeU */ d_U, (int32_t)ldu, CUDA_C_32F, /* dataTypeVT */ d_VT, (int32_t)ldvt, CUDA_C_32F, /* computeType */ bufferOnDevice, /* d_work */ workspaceInBytesOnDevice, bufferOnHost, /* h_work */ workspaceInBytesOnHost, d_info)); cudaDeviceSynchronize(); /* Syntax: cudaMemcpy(Destination Pointer Variable, Source Pointer Variable, size of memory, cudaMemcpyHostToDevice / cudaMemcpyDeviceToHost) */ CUDA_CHECK(cudaMemcpy(U, d_U, sizeof(cuComplex) * m * m, cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaMemcpy(VT, d_VT, sizeof(cuComplex) * n * n, cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaMemcpy(matrixS, d_S, sizeof(float) * MIN(m, n), cudaMemcpyDeviceToHost)); cudaDeviceSynchronize(); /* wait until host data is ready */ //S is a 1D real matrix of MIN(m, n) singular values printf("matrixS = (matlab base-1)\n"); printMatrixS(MIN(m, n), matrixS, "matrixS"); printf("=====\n"); printf("U = (matlab base-1)\n"); printMatrixA(m, m, U, "U"); printf("=====\n"); printf("VT = (matlab base-1)\n"); printMatrixA(n, n, VT, "VT"); printf("=====\n"); // matrixR is a float[2 * n * n] 1D U matrix // matrixT is a float[2 * m * m] 1D VT matrix for (int j = 0; j < m * m; j++) { matrixR[2 * j] = (float)U[j].x; matrixR[2 * j + 1] = (float)U[j].y; } for (int j = 0; j < n * n; j++) { matrixT[2 * j] = (float)VT[j].x; matrixT[2 * j + 1] = (float)VT[j].y; } /* Clean up workspace, input, and output memory allocations */ if (d_A) cudaFree(d_A); if (d_S) cudaFree(d_S); if (d_U) cudaFree(d_U); if (d_VT) cudaFree(d_VT); if (d_info) cudaFree(d_info); if (bufferOnDevice) cudaFree(bufferOnDevice); if (bufferOnHost) free(bufferOnHost); if (A) free(A); if (U) free(U); if (VT) free(VT); // if (cublasH) cublasDestroy(cublasH); if (cusolverH) cusolverDnDestroy(cusolverH); cudaDeviceReset(); } int main(int argc, char* argv[]) { /* * | 1+i 2-i 3 | * Z = | 2-i 4+i 5-i| * | 3 5-i 6+i| */ const int m = 3; //number of rows of complex matrix of Z const int n = 3; //number of colums of complex matrix of Z float Z[2 * m * n] = { 0 }; float Y[MIN(m, n)] = { 0 }; float C[2 * m * m] = { 0 }; float D[2 * n * n] = { 0 }; // Z provides 4X4 complex matrix Z[0] = 1.0; Z[1] = 1.0; Z[2] = 2.0; Z[3] = -1.0; Z[4] = 3.0; Z[5] = 0.0; Z[6] = 2.0; Z[7] = -1.0; Z[8] = 4.0; Z[9] = 1.0; Z[10] = 5.0; Z[11] = -1.0; Z[12] = 3.0; Z[13] = 0.0; Z[14] = 5.0; Z[15] = -1.0; Z[16] = 6.0; Z[17] = 1.0; selfSVD(m, n, Z, Y, C, D); }