#include <stdio.h>
#include <stdlib.h>   
#include <string.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>

//2 and 3 are the number of row of matrices B, and C, respectively.
#define A(row, col)  A[(col)*2 + (row)]
#define B(row, col)  B[(col)*3 + (row)]     

void static printMatrix(int m, int n, const float* A, const char* name) {
    for (int row = 0; row < 2 * m * n; row++) {
        printf("%s(%d) = %lf%\n", name, row + 1, A[row]);
    }
}

void static printMatrixA(int m, int n, const cuComplex* A, const char* name) {
    int row, col;
    for (row = 0; row < m; row++) {
        for (col = 0; col < n; col++) {
            printf("%s(%d, %d) = % lf% + lfi\n", name, row + 1, col + 1, A(row, col).x, A(row, col).y);
        }
    }
}

void static printMatrixB(int m, int n, const cuComplex* B, const char* name) {
    int row, col;
    for (row = 0; row < m; row++) {
        for (col = 0; col < n; col++) {
            printf("%s(%d, %d) = % lf% + lfi\n", name, row + 1, col + 1, B(row, col).x, B(row, col).y);
        }
    }
}

/***************************************************************/
/*****************************selfgemm**************************/
/*           Cmxn = alpha X Amxk X Bkxn + beta X Cmxn          */
/*            matrixA and matrixB are input matrices           */
/*                 matrixC is an output matrix                 */
/***************************************************************/

void selfgemm(int m, int k, int n,
    float* matrixA, float* matrixB, float* matrixC) {

    cublasHandle_t cublasH = NULL;
    cublasStatus_t cublas_status = CUBLAS_STATUS_SUCCESS;
    cudaError_t cudaStat1 = cudaSuccess;
    cudaError_t cudaStat2 = cudaSuccess;
    cudaError_t cudaStat3 = cudaSuccess;

    // allocate host memory for matrices A, B, and C
    cuComplex* A = (cuComplex*)malloc(sizeof(cuComplex) * m * k);
    cuComplex* B = (cuComplex*)malloc(sizeof(cuComplex) * k * n);
    cuComplex* C = (cuComplex*)malloc(sizeof(cuComplex) * m * n);

    if (!A || !B || !C) { /* Memory location failed */
        free(A);
        free(B);
        free(C);
        exit(EXIT_FAILURE);
    }

    const int lda = m;
    const int ldb = k;
    const int ldc = m;

    //	matrix A
    for (int i = 0; i < m * k; i++) {
        A[i] = make_cuComplex((float)matrixA[2 * i], (float)matrixA[2 * i + 1]);
    }

    //	matrix B
    for (int i = 0; i < k * n; i++) {
        B[i] = make_cuComplex((float)matrixB[2 * i], (float)matrixB[2 * i + 1]);
    }

    printf("A = (matlab base-1)\n");
    printMatrixA(m, k, A, "A");
    printf("=====\n");

    printf("B = (matlab base-1)\n");
    printMatrixB(k, n, B, "B");
    printf("=====\n");

    // point to device memory
    cuComplex* d_A = NULL;
    cuComplex* d_B = NULL;
    cuComplex* d_C = NULL;

    /* step 1: create cublas handle */

    cublas_status = cublasCreate(&cublasH);
    assert(CUBLAS_STATUS_SUCCESS == cublas_status);

    /* step 2: copy matrix A and matrix B to device */

    cudaStat1 = cudaMalloc((void**)&d_A, sizeof(cuComplex) * m * k);
    cudaStat2 = cudaMalloc((void**)&d_B, sizeof(cuComplex) * k * n);
    cudaStat3 = cudaMalloc((void**)&d_C, sizeof(cuComplex) * m * n);
    assert(cudaSuccess == cudaStat1);
    assert(cudaSuccess == cudaStat2);
    assert(cudaSuccess == cudaStat3);

    cudaStat1 = cudaMemcpy(d_A, A, sizeof(cuComplex) * m * k, cudaMemcpyHostToDevice);
    cudaStat2 = cudaMemcpy(d_B, B, sizeof(cuComplex) * k * n, cudaMemcpyHostToDevice);
    assert(cudaSuccess == cudaStat1);
    assert(cudaSuccess == cudaStat2);
    cudaDeviceSynchronize(); /* wait until device is ready */

    const cuComplex h_one = make_cuComplex(1, 0); //alpha
    const cuComplex h_zero = make_cuComplex(0, 0); //beta

    // Cmxn = alpha X Amxk X Bkxn + beta X Cmxn	
    cublas_status = cublasCgemm_v2(
        cublasH,
        CUBLAS_OP_N,    //A
        CUBLAS_OP_N,    //B
        m, /* number of rows of A    */
        n, /* number of columns of B */
        k, /* number of columns of A and number of rows of B */
        &h_one,    /* host pointer */
        d_A,
        lda,
        d_B,
        ldb,
        &h_zero,   /* host pointer */
        d_C,
        ldc);
    assert(CUBLAS_STATUS_SUCCESS == cublas_status);

    cudaMemcpy(C, d_C, sizeof(cuComplex) * m * n, cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize(); /* wait until C is ready */

    printf("C = (matlab base-1)\n");
    printMatrixA(m, n, C, "C");
    printf("=====\n");

    for (int j = 0; j < m * n; j++) {
        matrixC[2 * j] = (float)C[j].x;
        matrixC[2 * j + 1] = (float)C[j].y;
    }

    //    printf("matrixC = (matlab base-1)\n");
    //    printMatrix(m, n, matrixC, "matrixC");
    //    printf("=====\n");

        /* Clean up workspace, input, and output memory allocations */
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);

    free(A);
    free(B);
    free(C);

    if (cublasH) cublasDestroy(cublasH);
    cudaDeviceReset();
}

int main(int argc, char* argv[])
{
    /*
     *     | 1+2i  2+3i   3+4i |
     * A = | 4+5i  5+6i   6+7i |
     */

     /*
      *     | 1+2i    2+3i    3+4i    4+5i  |
      * B = | 5+6i    6+7i    7+8i    8+9i |
      *     | 9+10i  10+11i  11+23i  12+13i |
      */


    const int m = 2;   //number of rows of complex matrix of A and C
    const int k = 3;   //number of colums of complex matrix of A
    ///number of rows of complex matrix of B
    const int n = 4;   //number of colums of complex matrices of B and C

    float A[2 * m * k] = { 0 };
    float B[2 * k * n] = { 0 };
    float C[2 * m * n] = { 0 };

    //  A provides 2X3 complex matrix
    A[0] = 1.0;
    A[1] = 2.0;
    A[2] = 4.0;
    A[3] = 5.0;

    A[4] = 2.0;
    A[5] = 3.0;
    A[6] = 5.0;
    A[7] = 6.0;

    A[8] = 3.0;
    A[9] = 4.0;
    A[10] = 6.0;
    A[11] = 7.0;

    //  B provides 3X4 complex matrix
    B[0] = 1.0;
    B[1] = 2.0;
    B[2] = 5.0;
    B[3] = 6.0;
    B[4] = 9.0;
    B[5] = 10.0;

    B[6] = 2.0;
    B[7] = 3.0;
    B[8] = 6.0;
    B[9] = 7.0;
    B[10] = 10.0;
    B[11] = 11.0;

    B[12] = 3.0;
    B[13] = 4.0;
    B[14] = 7.0;
    B[15] = 8.0;
    B[16] = 11.0;
    B[17] = 12.0;

    B[18] = 4.0;
    B[19] = 5.0;
    B[20] = 8.0;
    B[21] = 9.0;
    B[22] = 12.0;
    B[23] = 13.0;

    selfgemm(m, k, n, A, B, C);
}