#include #include #include #include #include #include //2 and 3 are the number of row of matrices B, and C, respectively. #define A(row, col) A[(col)*2 + (row)] #define B(row, col) B[(col)*3 + (row)] void static printMatrix(int m, int n, const float* A, const char* name) { for (int row = 0; row < 2 * m * n; row++) { printf("%s(%d) = %lf%\n", name, row + 1, A[row]); } } void static printMatrixA(int m, int n, const cuComplex* A, const char* name) { int row, col; for (row = 0; row < m; row++) { for (col = 0; col < n; col++) { printf("%s(%d, %d) = % lf% + lfi\n", name, row + 1, col + 1, A(row, col).x, A(row, col).y); } } } void static printMatrixB(int m, int n, const cuComplex* B, const char* name) { int row, col; for (row = 0; row < m; row++) { for (col = 0; col < n; col++) { printf("%s(%d, %d) = % lf% + lfi\n", name, row + 1, col + 1, B(row, col).x, B(row, col).y); } } } /***************************************************************/ /*****************************selfgemm**************************/ /* Cmxn = alpha X Amxk X Bkxn + beta X Cmxn */ /* matrixA and matrixB are input matrices */ /* matrixC is an output matrix */ /***************************************************************/ void selfgemm(int m, int k, int n, float* matrixA, float* matrixB, float* matrixC) { cublasHandle_t cublasH = NULL; cublasStatus_t cublas_status = CUBLAS_STATUS_SUCCESS; cudaError_t cudaStat1 = cudaSuccess; cudaError_t cudaStat2 = cudaSuccess; cudaError_t cudaStat3 = cudaSuccess; // allocate host memory for matrices A, B, and C cuComplex* A = (cuComplex*)malloc(sizeof(cuComplex) * m * k); cuComplex* B = (cuComplex*)malloc(sizeof(cuComplex) * k * n); cuComplex* C = (cuComplex*)malloc(sizeof(cuComplex) * m * n); if (!A || !B || !C) { /* Memory location failed */ free(A); free(B); free(C); exit(EXIT_FAILURE); } const int lda = m; const int ldb = k; const int ldc = m; // matrix A for (int i = 0; i < m * k; i++) { A[i] = make_cuComplex((float)matrixA[2 * i], (float)matrixA[2 * i + 1]); } // matrix B for (int i = 0; i < k * n; i++) { B[i] = make_cuComplex((float)matrixB[2 * i], (float)matrixB[2 * i + 1]); } printf("A = (matlab base-1)\n"); printMatrixA(m, k, A, "A"); printf("=====\n"); printf("B = (matlab base-1)\n"); printMatrixB(k, n, B, "B"); printf("=====\n"); // point to device memory cuComplex* d_A = NULL; cuComplex* d_B = NULL; cuComplex* d_C = NULL; /* step 1: create cublas handle */ cublas_status = cublasCreate(&cublasH); assert(CUBLAS_STATUS_SUCCESS == cublas_status); /* step 2: copy matrix A and matrix B to device */ cudaStat1 = cudaMalloc((void**)&d_A, sizeof(cuComplex) * m * k); cudaStat2 = cudaMalloc((void**)&d_B, sizeof(cuComplex) * k * n); cudaStat3 = cudaMalloc((void**)&d_C, sizeof(cuComplex) * m * n); assert(cudaSuccess == cudaStat1); assert(cudaSuccess == cudaStat2); assert(cudaSuccess == cudaStat3); cudaStat1 = cudaMemcpy(d_A, A, sizeof(cuComplex) * m * k, cudaMemcpyHostToDevice); cudaStat2 = cudaMemcpy(d_B, B, sizeof(cuComplex) * k * n, cudaMemcpyHostToDevice); assert(cudaSuccess == cudaStat1); assert(cudaSuccess == cudaStat2); cudaDeviceSynchronize(); /* wait until device is ready */ const cuComplex h_one = make_cuComplex(1, 0); //alpha const cuComplex h_zero = make_cuComplex(0, 0); //beta // Cmxn = alpha X Amxk X Bkxn + beta X Cmxn cublas_status = cublasCgemm_v2( cublasH, CUBLAS_OP_N, //A CUBLAS_OP_N, //B m, /* number of rows of A */ n, /* number of columns of B */ k, /* number of columns of A and number of rows of B */ &h_one, /* host pointer */ d_A, lda, d_B, ldb, &h_zero, /* host pointer */ d_C, ldc); assert(CUBLAS_STATUS_SUCCESS == cublas_status); cudaMemcpy(C, d_C, sizeof(cuComplex) * m * n, cudaMemcpyDeviceToHost); cudaDeviceSynchronize(); /* wait until C is ready */ printf("C = (matlab base-1)\n"); printMatrixA(m, n, C, "C"); printf("=====\n"); for (int j = 0; j < m * n; j++) { matrixC[2 * j] = (float)C[j].x; matrixC[2 * j + 1] = (float)C[j].y; } // printf("matrixC = (matlab base-1)\n"); // printMatrix(m, n, matrixC, "matrixC"); // printf("=====\n"); /* Clean up workspace, input, and output memory allocations */ cudaFree(d_A); cudaFree(d_B); cudaFree(d_C); free(A); free(B); free(C); if (cublasH) cublasDestroy(cublasH); cudaDeviceReset(); } int main(int argc, char* argv[]) { /* * | 1+2i 2+3i 3+4i | * A = | 4+5i 5+6i 6+7i | */ /* * | 1+2i 2+3i 3+4i 4+5i | * B = | 5+6i 6+7i 7+8i 8+9i | * | 9+10i 10+11i 11+23i 12+13i | */ const int m = 2; //number of rows of complex matrix of A and C const int k = 3; //number of colums of complex matrix of A ///number of rows of complex matrix of B const int n = 4; //number of colums of complex matrices of B and C float A[2 * m * k] = { 0 }; float B[2 * k * n] = { 0 }; float C[2 * m * n] = { 0 }; // A provides 2X3 complex matrix A[0] = 1.0; A[1] = 2.0; A[2] = 4.0; A[3] = 5.0; A[4] = 2.0; A[5] = 3.0; A[6] = 5.0; A[7] = 6.0; A[8] = 3.0; A[9] = 4.0; A[10] = 6.0; A[11] = 7.0; // B provides 3X4 complex matrix B[0] = 1.0; B[1] = 2.0; B[2] = 5.0; B[3] = 6.0; B[4] = 9.0; B[5] = 10.0; B[6] = 2.0; B[7] = 3.0; B[8] = 6.0; B[9] = 7.0; B[10] = 10.0; B[11] = 11.0; B[12] = 3.0; B[13] = 4.0; B[14] = 7.0; B[15] = 8.0; B[16] = 11.0; B[17] = 12.0; B[18] = 4.0; B[19] = 5.0; B[20] = 8.0; B[21] = 9.0; B[22] = 12.0; B[23] = 13.0; selfgemm(m, k, n, A, B, C); }