// svdComplexDevice2.cpp : Defines the exported functions for the DLL. //#include "pch.h" //in Visual Studio 2019 and later #include "svdComplexDevice2.h" // Standard Library imports #include #include #include #include #include #include #include /***************************************************************** void static printMatrix(int m, int n, const float* A, const char* name) { for (int row = 0; row < 2 * m * n; row++) { printf("%s(%d) = %lf%\n", name, row + 1, A[row]); } } void static printMatrixS(int m, const float* A, const char* name) { for (int row = 0; row < m; row++) { printf("%s(%d) = %lf%\n", name, row + 1, A[row]); } } #define B(row, col) B[(col)*3 + (row)] void static printMatrixB(int m, int n, const cuComplex* B, const char* name) { int row, col; for (row = 0; row < m; row++) { for (col = 0; col < n; col++) { printf("%s(%d, %d) = % lf% + lfi\n", name, row + 1, col + 1, B(row, col).x, B(row, col).y); } } } #define C(row, col) C[(col)*4 + (row)] void static printMatrixC(int m, int n, const cuComplex* C, const char* name) { int row, col; for (row = 0; row < m; row++) { for (col = 0; col < n; col++) { printf("%s(%d, %d) = % lf% + lfi\n", name, row + 1, col + 1, C(row, col).x, C(row, col).y); } } } #define D(row, col) D[(col)*2 + (row)] void static printMatrixD(int m, int n, const cuComplex* D, const char* name) { int row, col; for (row = 0; row < m; row++) { for (col = 0; col < n; col++) { printf("%s(%d, %d) = % lf% + lfi\n", name, row + 1, col + 1, D(row, col).x, D(row, col).y); } } } ****************************************************************/ #define MIN(a,b) ((a) < (b) ? (a) : (b)) extern "C" __declspec(dllexport) void __cdecl selfSVD( const int m, const int n, float* matrixA, float* matrixS, float* matrixR, float* matrixT){ deviceComplexSVDExample(m, n, matrixA, matrixS, matrixR, matrixT);} extern "C" __declspec(dllexport) void __cdecl selfgemm( int m, int k, int n, float* matrixA, float* matrixB, float* matrixC){ deviceComplexgemmExample(m, k, n, matrixA, matrixB, matrixC);} extern "C" __declspec(dllexport) void __cdecl selfdgmm( int m, int n, float* matrixB, float* matrixA, float* matrixC){ deviceComplexdgmmExample(m, n, matrixB, matrixA, matrixC);} /***************************************************************/ /*****************************selfSVD***************************/ /* matrixA is a m-row by n-column input matrix */ /* matrixS, matrixU, and matrixVT are output matrices */ /***************************************************************/ /* matrixA is a float[2*n*m] 1D Hankel matrix matrixS is a float[min(n, m)] 1D singular value matrix matrixR is a float[2*n*n] 1D U matrix matrixT is a float[2*m*m] 1D VT matrix n: number of lines of Hankel matrix m: number of columns of Hankel matrix */ void deviceComplexSVDExample(const int m, const int n, float* matrixA, float* matrixS, float* matrixR, float* matrixT) { // printf("matrixA = (matlab base-1)\n"); // printMatrix(m, n, matrixA, "matrixA"); // printf("=====\n"); cusolverDnHandle_t cusolverH = NULL; cublasHandle_t cublasH = NULL; cublasStatus_t cublas_status = CUBLAS_STATUS_SUCCESS; cusolverStatus_t cusolver_status = CUSOLVER_STATUS_SUCCESS; cudaError_t cudaStat1 = cudaSuccess; cudaError_t cudaStat2 = cudaSuccess; cudaError_t cudaStat3 = cudaSuccess; cudaError_t cudaStat4 = cudaSuccess; cudaError_t cudaStat5 = cudaSuccess; const int lda = m; // lda >= m const int ldu = m; // ldu >= m const int ldvt = n; // ldvt >= n if jobu = 'A' // point to host device cuComplex* A = (cuComplex*)malloc(sizeof(cuComplex) * m * n); cuComplex* U = (cuComplex*)malloc(sizeof(cuComplex) * m * m); cuComplex* VT = (cuComplex*)malloc(sizeof(cuComplex) * n * n); if (!A || !U || !VT) { /* Memory location failed */ free(A); free(U); free(VT); exit(EXIT_FAILURE); } // matrix A for (int i = 0; i < m * n; i++) { A[i] = make_cuComplex((float)matrixA[2*i], (float)matrixA[2*i + 1]); } // printf("A = (matlab base-1)\n"); // printMatrixC(m, n, A, "A"); // printf("=====\n"); // point to device device cuComplex* d_A = NULL; float* d_S = NULL; cuComplex* d_U = NULL; cuComplex* d_VT = NULL; int* d_info = NULL; void* bufferOnDevice = NULL; size_t workspaceInBytesOnDevice = 0; void* bufferOnHost = NULL; size_t workspaceInBytesOnHost = 0; /* step 1: create cusolverDn */ cusolver_status = cusolverDnCreate(&cusolverH); assert(CUSOLVER_STATUS_SUCCESS == cusolver_status); /* step 2: copy A to device */ cudaStat1 = cudaMalloc((void**)&d_A, sizeof(cuComplex) * m * n); cudaStat2 = cudaMalloc((void**)&d_S, sizeof(float) * MIN(m, n)); cudaStat3 = cudaMalloc((void**)&d_U, sizeof(cuComplex) * m * m); cudaStat4 = cudaMalloc((void**)&d_VT, sizeof(cuComplex) * n * n); cudaStat5 = cudaMalloc((void**)&d_info, sizeof(int)); assert(cudaSuccess == cudaStat1); assert(cudaSuccess == cudaStat2); assert(cudaSuccess == cudaStat3); assert(cudaSuccess == cudaStat4); assert(cudaSuccess == cudaStat5); /* Syntax: cudaMemcpy(Destination Pointer Variable, Source Pointer Variable, size of memory, cudaMemcpyHostToDevice / cudaMemcpyDeviceToHost) */ cudaStat1 = cudaMemcpy(d_A, A, sizeof(cuComplex) * m * n, cudaMemcpyHostToDevice); assert(cudaSuccess == cudaStat1); cudaDeviceSynchronize(); /* wait until d_A is ready */ /* step 3: query working space of SVD */ signed char jobu = 'A'; /* all m columns of U */ signed char jobvt = 'A'; /* all n rows of VT */ cusolver_status = cusolverDnXgesvd_bufferSize( cusolverH, NULL, /* params */ jobu, jobvt, (int32_t)m, (int32_t)n, CUDA_C_32F, /* dataTypeA */ d_A, (int32_t)lda, CUDA_R_32F, /* dataTypeS */ d_S, CUDA_C_32F, /* dataTypeU */ d_U, (int32_t)ldu, CUDA_C_32F, /* dataTypeVT */ d_VT, (int32_t)ldvt, CUDA_C_32F, /* computeType */ &workspaceInBytesOnDevice, &workspaceInBytesOnHost); cudaStat1 = cudaMalloc((void**)&bufferOnDevice, workspaceInBytesOnDevice); assert(cudaSuccess == cudaStat1); if (0 < workspaceInBytesOnHost) { bufferOnHost = (void*)malloc(workspaceInBytesOnHost); assert(NULL != bufferOnHost); } /* step 4: compute SVD */ cusolver_status = cusolverDnXgesvd( cusolverH, NULL, /* params */ jobu, jobvt, (int32_t)m, (int32_t)n, CUDA_C_32F, /* dataTypeA */ d_A, (int32_t)lda, CUDA_R_32F, /* dataTypeS */ d_S, CUDA_C_32F, /* dataTypeU */ d_U, (int32_t)ldu, CUDA_C_32F, /* dataTypeVT */ d_VT, (int32_t)ldvt, CUDA_C_32F, /* computeType */ bufferOnDevice, /* d_work */ workspaceInBytesOnDevice, bufferOnHost, /* h_work */ workspaceInBytesOnHost, d_info); cudaStat1 = cudaDeviceSynchronize(); assert(CUSOLVER_STATUS_SUCCESS == cusolver_status); assert(cudaSuccess == cudaStat1); /* Syntax: cudaMemcpy(Destination Pointer Variable, Source Pointer Variable, size of memory, cudaMemcpyHostToDevice / cudaMemcpyDeviceToHost) */ cudaStat1 = cudaMemcpy(U, d_U, sizeof(cuComplex) * m * m, cudaMemcpyDeviceToHost); cudaStat2 = cudaMemcpy(VT, d_VT, sizeof(cuComplex) * n * n, cudaMemcpyDeviceToHost); cudaStat3 = cudaMemcpy(matrixS, d_S, sizeof(float) * MIN(m, n), cudaMemcpyDeviceToHost); assert(cudaSuccess == cudaStat1); assert(cudaSuccess == cudaStat2); assert(cudaSuccess == cudaStat3); cudaDeviceSynchronize(); /* wait until host data is ready */ //S is a 1D real matrix of MIN(m, n) singular values // printf("matrixS = (matlab base-1)\n"); // printMatrixS(MIN(m, n), matrixS, "matrixS"); // printf("=====\n"); // printf("U = (matlab base-1)\n"); // printMatrixC(m, m, U, "U"); // printf("=====\n"); // printf("VT = (matlab base-1)\n"); // printMatrixC(n, n, VT, "VT"); // printf("=====\n"); // matrixR is a float[2 * n * n] 1D U matrix // matrixT is a float[2 * m * m] 1D VT matrix for (int j = 0; j < m * m; j++) { matrixR[2*j] = (float)U[j].x; matrixR[2*j + 1] = (float)U[j].y; } for (int j = 0; j < n * n; j++) { matrixT[2*j] = (float)VT[j].x; matrixT[2*j + 1] = (float)VT[j].y; } /* Clean up workspace, input, and output memory allocations */ if (d_A) cudaFree(d_A); if (d_S) cudaFree(d_S); if (d_U) cudaFree(d_U); if (d_VT) cudaFree(d_VT); if (d_info) cudaFree(d_info); if (bufferOnDevice) cudaFree(bufferOnDevice); if (bufferOnHost) free(bufferOnHost); if (A) free(A); if (U) free(U); if (VT) free(VT); if (cublasH) cublasDestroy(cublasH); if (cusolverH) cusolverDnDestroy(cusolverH); cudaDeviceReset(); } /***************************************************************/ /*****************************selfgemm**************************/ /* Cmxn = alpha X Amxk X Bkxn + beta X Cmxn */ /* matrixA and matrixB are input matrices */ /* matrixC is an output matrix */ /***************************************************************/ void deviceComplexgemmExample (int m, int k, int n, float* matrixA, float* matrixB, float* matrixC) { cublasHandle_t cublasH = NULL; cublasStatus_t cublas_status = CUBLAS_STATUS_SUCCESS; cudaError_t cudaStat1 = cudaSuccess; cudaError_t cudaStat2 = cudaSuccess; cudaError_t cudaStat3 = cudaSuccess; // allocate host memory for matrices A, B, and C cuComplex* A = (cuComplex*)malloc(sizeof(cuComplex) * m * k); cuComplex* B = (cuComplex*)malloc(sizeof(cuComplex) * k * n); cuComplex* C = (cuComplex*)malloc(sizeof(cuComplex) * m * n); if (!A || !B || !C) { /* Memory location failed */ free(A); free(B); free(C); exit(EXIT_FAILURE); } const int lda = m; const int ldb = k; const int ldc = m; // matrix A for (int i = 0; i < m * k; i++) { A[i] = make_cuComplex((float)matrixA[2*i], (float)matrixA[2*i + 1]); } // matrix B for (int i = 0; i < k * n; i++) { B[i] = make_cuComplex((float)matrixB[2*i], (float)matrixB[2*i + 1]); } // printf("A = (matlab base-1)\n"); // printMatrixD(m, k, A, "A"); // printf("=====\n"); // printf("B = (matlab base-1)\n"); // printMatrixB(k, n, B, "B"); // printf("=====\n"); // point to device memory cuComplex* d_A = NULL; cuComplex* d_B = NULL; cuComplex* d_C = NULL; /* step 1: create cublas handle */ cublas_status = cublasCreate(&cublasH); assert(CUBLAS_STATUS_SUCCESS == cublas_status); /* step 2: copy matrix A and matrix B to device */ cudaStat1 = cudaMalloc((void**)&d_A, sizeof(cuComplex) * m * k); cudaStat2 = cudaMalloc((void**)&d_B, sizeof(cuComplex) * k * n); cudaStat3 = cudaMalloc((void**)&d_C, sizeof(cuComplex) * m * n); assert(cudaSuccess == cudaStat1); assert(cudaSuccess == cudaStat2); assert(cudaSuccess == cudaStat3); cudaStat1 = cudaMemcpy(d_A, A, sizeof(cuComplex) * m * k, cudaMemcpyHostToDevice); cudaStat2 = cudaMemcpy(d_B, B, sizeof(cuComplex) * k * n, cudaMemcpyHostToDevice); assert(cudaSuccess == cudaStat1); assert(cudaSuccess == cudaStat2); cudaDeviceSynchronize(); /* wait until device is ready */ const cuComplex h_one = make_cuComplex(1, 0); //alpha const cuComplex h_zero = make_cuComplex(0, 0); //beta // Cmxn = alpha X Amxk X Bkxn + beta X Cmxn cublas_status = cublasCgemm_v2( cublasH, CUBLAS_OP_N, //A CUBLAS_OP_N, //B m, /* number of rows of A */ n, /* number of columns of B */ k, /* number of columns of A */ &h_one, /* host pointer */ d_A, lda, d_B, ldb, &h_zero, /* host pointer */ d_C, ldc); assert(CUBLAS_STATUS_SUCCESS == cublas_status); cudaMemcpy(C, d_C, sizeof(cuComplex) * m * n, cudaMemcpyDeviceToHost); cudaDeviceSynchronize(); /* wait until C is ready */ // printf("C = (matlab base-1)\n"); // printMatrixD(m, n, C, "C"); // printf("=====\n"); for (int j = 0; j < m * n; j++) { matrixC[2*j] = (float)C[j].x; matrixC[2*j + 1] = (float)C[j].y; } // printf("matrixC = (matlab base-1)\n"); // printMatrix(m, n, matrixC, "matrixC"); // printf("=====\n"); /* Clean up workspace, input, and output memory allocations */ cudaFree(d_A); cudaFree(d_B); cudaFree(d_C); free(A); free(B); free(C); if (cublasH) cublasDestroy(cublasH); cudaDeviceReset(); } /***************************************************************/ /****************************selfdgmm***************************/ /* d_C = diag(d_S) x d_VT in device operation */ /* Cmxn Bmxm Amxn */ /***************************************************************/ /* matrixB is a 1D real matrix with m elements */ /* matrixA is a 1D real matrix with 2*m*n elements */ /* matrixC is a 1D real matrix with 2*m*n elements */ /* m is the number of rows of complex matrix C */ /* n is the number of columns of complex matrix C */ /***************************************************************/ void deviceComplexdgmmExample (const int m, const int n, float* matrixB, float* matrixA, float* matrixC) { // printf("matrixB = (matlab base-1)\n"); // printMatrixS(m, matrixB, "matrixB"); // printf("=====\n"); // printf("matrixA = (matlab base-1)\n"); // printMatrix(m, n, matrixA, "matrixA"); // printf("=====\n"); cublasHandle_t cublasH = NULL; cublasStatus_t cublas_status = CUBLAS_STATUS_SUCCESS; cudaError_t cudaStat1 = cudaSuccess; cudaError_t cudaStat2 = cudaSuccess; cudaError_t cudaStat3 = cudaSuccess; // allocate host memory for matrices A, B and C cuComplex* A = (cuComplex*)malloc(sizeof(cuComplex) * m * n); cuComplex* B = (cuComplex*)malloc(sizeof(cuComplex) * m * m); cuComplex* C = (cuComplex*)malloc(sizeof(cuComplex) * m * n); const int ldvt = m; const int lds = m; const int ldc = m; // point to device memory cuComplex* d_VT = NULL; cuComplex* d_S = NULL; cuComplex* d_C = NULL; // matrix A for (int i = 0; i < m * n; i++) { A[i] = make_cuComplex((float)matrixA[2*i], (float)matrixA[2*i + 1]); } // matrix B for (int i = 0; i < m * m; i++) { //RAZ B[i] = make_cuComplex(0, 0); } for (int i = 0; i < m; i++) { B[i * (lds + 1)] = make_cuComplex(matrixB[i], 0); } // printf("A = (matlab base-1)\n"); // printMatrixB(m, n, A, "A"); // printf("=====\n"); // printf("B = (matlab base-1)\n"); // printMatrixB(m, m, B, "B"); // printf("=====\n"); /* step 1: create cublas handle */ cublas_status = cublasCreate(&cublasH); assert(CUBLAS_STATUS_SUCCESS == cublas_status); /* step 2: copy matrixVT and matrixS to device */ cudaStat1 = cudaMalloc((void**)&d_VT, sizeof(cuComplex) * m * n); cudaStat2 = cudaMalloc((void**)&d_S, sizeof(cuComplex) * m * m); cudaStat3 = cudaMalloc((void**)&d_C, sizeof(cuComplex) * m * n); assert(cudaSuccess == cudaStat1); assert(cudaSuccess == cudaStat2); assert(cudaSuccess == cudaStat3); cudaStat1 = cudaMemcpy(d_VT, A, sizeof(cuComplex) * m * n, cudaMemcpyHostToDevice); cudaStat2 = cudaMemcpy(d_S, B, sizeof(cuComplex) * m * m, cudaMemcpyHostToDevice); assert(cudaSuccess == cudaStat1); assert(cudaSuccess == cudaStat2); cudaDeviceSynchronize(); /* wait until device is ready */ // C = diag(X) x A if mode == CUBLAS_SIDE_LEFT //d_C = diag(d_S) x d_VT in device operation cublas_status = cublasCdgmm( cublasH, CUBLAS_SIDE_LEFT, m, //number of rows of matrix A and C n, //number of columns of matrix A and C d_VT, //A, array of dimensions lda x n with lda >= max(1, m) ldvt, /*********/ //lda of A d_S, //one-dimensional array of size |inc| x m if CUBLAS_SIDE_LEFT lds + 1, //stride of one-dimensional array X d_C, //array of dimensions ldc x n with ldc >= max(1, m) ldc); //leading dimension of a 2-dimensional array used to store C assert(CUBLAS_STATUS_SUCCESS == cublas_status); cudaMemcpy(C, d_C, sizeof(cuComplex) * m * n, cudaMemcpyDeviceToHost); cudaDeviceSynchronize(); /* wait until C is ready */ // printf("C = (matlab base-1)\n"); // printMatrixB(m, n, C, "C"); // printf("=====\n"); for (int j = 0; j < m * n; j++) { matrixC[2*j] = (float)C[j].x; matrixC[2*j + 1] = (float)C[j].y; } // printf("matrixC = (matlab base-1)\n"); // printMatrix(m, n, matrixC, "matrixC"); // printf("=====\n"); /* Clean up workspace, input, and output memory allocations */ cudaFree(d_VT); cudaFree(d_S); cudaFree(d_C); free(A); free(B); free(C); if (cublasH) cublasDestroy(cublasH); cudaDeviceReset(); }