// svdComplexDevice1.cpp : Defines the exported functions for the DLL application. // #include "pch.h" //windows11 version #define CULA_USE_CUDA_COMPLEX //win32 #include "svdComplexDevice1.h" #include #include extern "C" __declspec(dllexport) void __cdecl selfSVD(const int M, const int N, float* matrixA, float* matrixS, float* matrixR, float* matrixT) { if (!MeetsMinimumCulaRequirements()) printf("Version check failed\n"); culaDeviceComplexSVDExample(M, N, matrixA, matrixS, matrixR, matrixT); printf("Version check passed\n\n"); } extern "C" __declspec(dllexport) void __cdecl selfgemm (int M, int N, int K, float* matrixA, float* matrixB, float* matrixC) { if (!MeetsMinimumCulaRequirements()) printf("Version check failed\n"); culaDeviceComplexgemmExample(M, N, K, matrixA, matrixB, matrixC); printf("Version check passed\n\n"); } void culaDeviceComplexSVDExample(const int M, const int N, float* matrixA, float* matrixS, float* matrixR, float* matrixT) { int i; char jobu = 'A', jobvt = 'A'; int LDA = M, LDU = M, LDVT = N; time_t begin_time; time_t end_time; double cula_time; culaStatus status; cudaError_t err; // point to host memory culaFloatComplex* Matrix_A = NULL; culaFloat* Matrix_S = NULL; culaFloatComplex* Matrix_R = NULL; culaFloatComplex* Matrix_T = NULL; // point to device memory culaDeviceFloatComplex* Matrix_A_device = NULL; culaDeviceFloat* Matrix_S_device = NULL; culaDeviceFloatComplex* Matrix_R_device = NULL; culaDeviceFloatComplex* Matrix_T_device = NULL; printf("Allocating Matrices\n"); Matrix_A = (culaFloatComplex*)malloc(M*N * sizeof(culaFloatComplex)); Matrix_S = (culaFloat*)malloc(M*N * sizeof(culaFloat)); Matrix_R = (culaFloatComplex*)malloc(M*M * sizeof(culaFloatComplex)); Matrix_T = (culaFloatComplex*)malloc(N*N * sizeof(culaFloatComplex)); if (!Matrix_A || !Matrix_S || !Matrix_R || !Matrix_T) { /* Memory location failed */ free(Matrix_A); free(Matrix_S); free(Matrix_R); free(Matrix_T); exit(EXIT_FAILURE); } err = cudaMalloc((void**)&Matrix_A_device, M*N * sizeof(culaFloatComplex)); checkCudaError(err); err = cudaMalloc((void**)&Matrix_S_device, M*N * sizeof(culaFloat)); checkCudaError(err); err = cudaMalloc((void**)&Matrix_R_device, M*M * sizeof(culaFloatComplex)); checkCudaError(err); err = cudaMalloc((void**)&Matrix_T_device, N*N * sizeof(culaFloatComplex)); checkCudaError(err); printf("Initializing CULA\n"); status = culaInitialize(); checkStatus(status); // matrix A for (i = 0; i