#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"

#include "cusolver_utils.h"

//3 is the number of rows of any matrix AB
#define AB(row, col)  AB[(col)*3 + (row)]

void static printMatrixB(int m, int n,
const cuComplex* AB, const char* name) {
    int row, col;
    for (row = 0; row < m; row++) {
        for (col = 0; col < n; col++) {
            printf("%s(%d, %d) = % lf% + lfi\n", name, row + 1, col + 1,
			AB(row, col).x, AB(row, col).y);
        }
    }
}

void static printMatrix(int m, int n, const float* A, const char* name) {
	for (int row = 0; row < 2 * m * n; row++) {
		printf("%s(%d) = %lf%\n", name, row + 1, A[row]);
	}
}

void static printMatrixS(int m, const float* A, const char* name) {
	for (int row = 0; row < m; row++) {
		printf("%s(%d) = %lf%\n", name, row + 1, A[row]);
	}
}

/***************************************************************/
/****************************selfdgmm***************************/
/*         d_C = diag(d_S) x d_VT in device operation          */
/*         Cmxn    Bmxm      Amxn                              */
/***************************************************************/
/*       matrixB is a 1D real matrix with m     elements       */
/*       matrixA is a 1D real matrix with 2*m*n elements       */
/*       matrixC is a 1D real matrix with 2*m*n elements       */
/*       m is the number of rows of complex matrix C           */
/*       n is the number of columns of complex matrix C        */
/***************************************************************/

void selfdgmm(const int m, const int n,
float* matrixB, float* matrixA, float* matrixC) {

	printf("matrixB = (matlab base-1)\n");
	printMatrixS(m, matrixB, "matrixB");
	printf("=====\n");

	printf("matrixA = (matlab base-1)\n");
	printMatrix(m, n, matrixA, "matrixA");
	printf("=====\n");

	cublasHandle_t cublasH = NULL;
	cublasStatus_t cublas_status = CUBLAS_STATUS_SUCCESS;

	// allocate host memory for matrices A, B and C
	cuComplex* A = (cuComplex*)malloc(sizeof(cuComplex) * m * n);
	cuComplex* B = (cuComplex*)malloc(sizeof(cuComplex) * m * m);
	cuComplex* C = (cuComplex*)malloc(sizeof(cuComplex) * m * n);

	const int ldvt = m;
	const int lds  = m;
	const int ldc  = m;

	// point to device memory
	cuComplex* d_VT = NULL;
	cuComplex* d_S  = NULL;
	cuComplex* d_C  = NULL;

	//	matrix A
	for (int i = 0; i < m * n; i++) {
		A[i] = make_cuComplex((float)matrixA[2 * i], (float)matrixA[2 * i + 1]);
	}
	//	matrix B
	for (int i = 0; i < m * m; i++) {  //RAZ
		B[i] = make_cuComplex(0, 0);
	}
	for (int i = 0; i < m; i++) {
		B[i * (lds + 1)] = make_cuComplex(matrixB[i], 0);
	}

	printf("A = (matlab base-1)\n");
	printMatrixB(m, n, A, "A");
	printf("=====\n");

	printf("B = (matlab base-1)\n");
	printMatrixB(m, m, B, "B");
	printf("=====\n");

	/* step 1: create cublas handle */

	CUBLAS_CHECK(cublasCreate(&cublasH));

	/* step 2: copy matrixVT and matrixS to device */

	CUDA_CHECK(cudaMalloc((void**)&d_VT, sizeof(cuComplex) * m * n));
	CUDA_CHECK(cudaMalloc((void**)&d_S,  sizeof(cuComplex) * m * m));
	CUDA_CHECK(cudaMalloc((void**)&d_C,  sizeof(cuComplex) * m * n));

	CUDA_CHECK(cudaMemcpy(d_VT, A, sizeof(cuComplex) * m * n, cudaMemcpyHostToDevice));
	CUDA_CHECK(cudaMemcpy(d_S, B,  sizeof(cuComplex) * m * m, cudaMemcpyHostToDevice));
	cudaDeviceSynchronize(); /* wait until device is ready */

	//  C =  diag(X)  x A    if mode == CUBLAS_SIDE_LEFT
	//d_C = diag(d_S) x d_VT in device operation
	CUBLAS_CHECK(cublasCdgmm(
		cublasH,
		CUBLAS_SIDE_LEFT,
		m,      //number of rows of matrix A and C
		n,      //number of columns of matrix A and C
		d_VT,   //A, array of dimensions lda x n with lda >= max(1, m)
		ldvt,   /*********/ //lda of A
		d_S,    //one-dimensional array of size |inc| x m if CUBLAS_SIDE_LEFT
		lds + 1, //stride of one-dimensional array X
		d_C,    //array of dimensions ldc x n with ldc >= max(1, m)
		ldc));   //leading dimension of a 2-dimensional array used to store C

	CUDA_CHECK(cudaMemcpy(C, d_C, sizeof(cuComplex) * m * n, cudaMemcpyDeviceToHost));
	cudaDeviceSynchronize(); /* wait until C is ready */

	printf("C = (matlab base-1)\n");
	printMatrixB(m, n, C, "C");
	printf("=====\n");

	for (int j = 0; j < m * n; j++) {
		matrixC[2 * j]     = (float)C[j].x;
		matrixC[2 * j + 1] = (float)C[j].y;
	}

//	printf("matrixC = (matlab base-1)\n");
//	printMatrix(m, n, matrixC, "matrixC");
//	printf("=====\n");

	/* Clean up workspace, input, and output memory allocations */
	CUDA_CHECK(cudaFree(d_VT));
	CUDA_CHECK(cudaFree(d_S));
	CUDA_CHECK(cudaFree(d_C));

	free(A);
	free(B);
	free(C);

	if (cublasH) cublasDestroy(cublasH);
	cudaDeviceReset();
}

int main(int argc, char* argv[]) {
	const int m = 3;   // number of rows of C
	const int n = 2;   // number of columns of C

	float A[2 * m * n] = { 0 };
	float B[m]         = { 0 };
	float C[2 * m * n] = { 0 };

	//B provides 3X3 complex matrix
	B[0] = 1.0;
	B[1] = 1.0;
	B[2] = 1.0;

	// A provides 3X2 complex matrix
	A[0] = 1.0;
	A[1] = 2.0;
	A[2] = 2.0;
	A[3] = 3.0;
	A[4] = 3.0;
	A[5] = 4.0;

	A[6] = 4.0;
	A[7] = 5.0;
	A[8] = 5.0;
	A[9] = 6.0;
	A[10] = 6.0;
	A[11] = 7.0;

	selfdgmm(m, n, B, A, C);
}