#include #include #include #include #include #include "cublas_v2.h" #include "cusolver_utils.h" //3 is the number of rows of any matrix AB #define AB(row, col) AB[(col)*3 + (row)] void static printMatrixB(int m, int n, const cuComplex* AB, const char* name) { int row, col; for (row = 0; row < m; row++) { for (col = 0; col < n; col++) { printf("%s(%d, %d) = % lf% + lfi\n", name, row + 1, col + 1, AB(row, col).x, AB(row, col).y); } } } void static printMatrix(int m, int n, const float* A, const char* name) { for (int row = 0; row < 2 * m * n; row++) { printf("%s(%d) = %lf%\n", name, row + 1, A[row]); } } void static printMatrixS(int m, const float* A, const char* name) { for (int row = 0; row < m; row++) { printf("%s(%d) = %lf%\n", name, row + 1, A[row]); } } /***************************************************************/ /****************************selfdgmm***************************/ /* d_C = diag(d_S) x d_VT in device operation */ /* Cmxn Bmxm Amxn */ /***************************************************************/ /* matrixB is a 1D real matrix with m elements */ /* matrixA is a 1D real matrix with 2*m*n elements */ /* matrixC is a 1D real matrix with 2*m*n elements */ /* m is the number of rows of complex matrix C */ /* n is the number of columns of complex matrix C */ /***************************************************************/ void selfdgmm(const int m, const int n, float* matrixB, float* matrixA, float* matrixC) { printf("matrixB = (matlab base-1)\n"); printMatrixS(m, matrixB, "matrixB"); printf("=====\n"); printf("matrixA = (matlab base-1)\n"); printMatrix(m, n, matrixA, "matrixA"); printf("=====\n"); cublasHandle_t cublasH = NULL; cublasStatus_t cublas_status = CUBLAS_STATUS_SUCCESS; // allocate host memory for matrices A, B and C cuComplex* A = (cuComplex*)malloc(sizeof(cuComplex) * m * n); cuComplex* B = (cuComplex*)malloc(sizeof(cuComplex) * m * m); cuComplex* C = (cuComplex*)malloc(sizeof(cuComplex) * m * n); const int ldvt = m; const int lds = m; const int ldc = m; // point to device memory cuComplex* d_VT = NULL; cuComplex* d_S = NULL; cuComplex* d_C = NULL; // matrix A for (int i = 0; i < m * n; i++) { A[i] = make_cuComplex((float)matrixA[2 * i], (float)matrixA[2 * i + 1]); } // matrix B for (int i = 0; i < m * m; i++) { //RAZ B[i] = make_cuComplex(0, 0); } for (int i = 0; i < m; i++) { B[i * (lds + 1)] = make_cuComplex(matrixB[i], 0); } printf("A = (matlab base-1)\n"); printMatrixB(m, n, A, "A"); printf("=====\n"); printf("B = (matlab base-1)\n"); printMatrixB(m, m, B, "B"); printf("=====\n"); /* step 1: create cublas handle */ CUBLAS_CHECK(cublasCreate(&cublasH)); /* step 2: copy matrixVT and matrixS to device */ CUDA_CHECK(cudaMalloc((void**)&d_VT, sizeof(cuComplex) * m * n)); CUDA_CHECK(cudaMalloc((void**)&d_S, sizeof(cuComplex) * m * m)); CUDA_CHECK(cudaMalloc((void**)&d_C, sizeof(cuComplex) * m * n)); CUDA_CHECK(cudaMemcpy(d_VT, A, sizeof(cuComplex) * m * n, cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(d_S, B, sizeof(cuComplex) * m * m, cudaMemcpyHostToDevice)); cudaDeviceSynchronize(); /* wait until device is ready */ // C = diag(X) x A if mode == CUBLAS_SIDE_LEFT //d_C = diag(d_S) x d_VT in device operation CUBLAS_CHECK(cublasCdgmm( cublasH, CUBLAS_SIDE_LEFT, m, //number of rows of matrix A and C n, //number of columns of matrix A and C d_VT, //A, array of dimensions lda x n with lda >= max(1, m) ldvt, /*********/ //lda of A d_S, //one-dimensional array of size |inc| x m if CUBLAS_SIDE_LEFT lds + 1, //stride of one-dimensional array X d_C, //array of dimensions ldc x n with ldc >= max(1, m) ldc)); //leading dimension of a 2-dimensional array used to store C CUDA_CHECK(cudaMemcpy(C, d_C, sizeof(cuComplex) * m * n, cudaMemcpyDeviceToHost)); cudaDeviceSynchronize(); /* wait until C is ready */ printf("C = (matlab base-1)\n"); printMatrixB(m, n, C, "C"); printf("=====\n"); for (int j = 0; j < m * n; j++) { matrixC[2 * j] = (float)C[j].x; matrixC[2 * j + 1] = (float)C[j].y; } // printf("matrixC = (matlab base-1)\n"); // printMatrix(m, n, matrixC, "matrixC"); // printf("=====\n"); /* Clean up workspace, input, and output memory allocations */ CUDA_CHECK(cudaFree(d_VT)); CUDA_CHECK(cudaFree(d_S)); CUDA_CHECK(cudaFree(d_C)); free(A); free(B); free(C); if (cublasH) cublasDestroy(cublasH); cudaDeviceReset(); } int main(int argc, char* argv[]) { const int m = 3; // number of rows of C const int n = 2; // number of columns of C float A[2 * m * n] = { 0 }; float B[m] = { 0 }; float C[2 * m * n] = { 0 }; //B provides 3X3 complex matrix B[0] = 1.0; B[1] = 1.0; B[2] = 1.0; // A provides 3X2 complex matrix A[0] = 1.0; A[1] = 2.0; A[2] = 2.0; A[3] = 3.0; A[4] = 3.0; A[5] = 4.0; A[6] = 4.0; A[7] = 5.0; A[8] = 5.0; A[9] = 6.0; A[10] = 6.0; A[11] = 7.0; selfdgmm(m, n, B, A, C); }