/*
 * Copyright 2020 NVIDIA Corporation. All rights reserved.
 *
 * NOTICE TO LICENSEE:
 *
 * This source code and/or documentation ("Licensed Deliverables") are
 * subject to NVIDIA intellectual property rights under U.S. and
 * international Copyright laws.
 *
 * These Licensed Deliverables contained herein is PROPRIETARY and
 * CONFIDENTIAL to NVIDIA and is being provided under the terms and
 * conditions of a form of NVIDIA software license agreement by and
 * between NVIDIA and Licensee ("License Agreement") or electronically
 * accepted by Licensee. Notwithstanding any terms or conditions to
 * the contrary in the License Agreement, reproduction or disclosure
 * of the Licensed Deliverables to any third party without the express
 * written consent of NVIDIA is prohibited.
 *
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
 * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
 * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
 * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
 * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
 * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
 * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
 * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
 * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
 * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
 * OF THESE LICENSED DELIVERABLES.
 *
 * U.S. Government End Users. These Licensed Deliverables are a
 * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
 * 1995), consisting of "commercial computer software" and "commercial
 * computer software documentation" as such terms are used in 48
 * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
 * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
 * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
 * U.S. Government End Users acquire the Licensed Deliverables with
 * only those rights set forth herein.
 *
 * Any use of the Licensed Deliverables in individual and commercial
 * software must include, in the user documentation and internal
 * comments to the code, the above Disclaimer and U.S. Government End
 * Users Notice.
 */
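/*
 * This example computes the singular value decomposition A = U * S * VT
 * of a 3-by-2 column-major matrix in double precision using the 64-bit
 * cuSOLVER API cusolverDnXgesvd. It then checks the computed singular
 * values against their known exact values and measures the residual
 * |A - U*S*VT| with cuBLAS.
 */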
#include <cstdio>
#include <cstdlib>
#include <vector>

#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <cusolverDn.h>

#include "cusolver_utils.h"

int main(int argc, char *argv[]) {
    cusolverDnHandle_t cusolverH = NULL;
    cublasHandle_t cublasH = NULL;
    cudaStream_t stream = NULL;
    cusolverDnParams_t params = NULL;

    using data_type = double;

    const int64_t m = 3;
    const int64_t n = 2;
    const int64_t lda = m;  // lda >= m
    const int64_t ldu = m;  // ldu >= m
    const int64_t ldvt = n; // ldvt >= n if jobvt = 'A'

    /*
     *       | 1 2 |
     *   A = | 4 5 |
     *       | 2 1 |
     */
    const std::vector<data_type> A = {1.0, 4.0, 2.0, 2.0, 5.0, 1.0};
    std::vector<data_type> U(ldu * m, 0);   // m-by-m matrix of left singular vectors
    std::vector<data_type> VT(ldvt * n, 0); // n-by-n matrix of right singular vectors
    std::vector<data_type> S(n, 0);         // numerical singular values
    std::vector<data_type> S_exact = {7.065283497082729, 1.040081297712078}; // exact singular values

    data_type *d_A = nullptr;
    data_type *d_S = nullptr;  // singular values
    data_type *d_U = nullptr;  // left singular vectors
    data_type *d_VT = nullptr; // right singular vectors
    int *d_info = nullptr;

    data_type *d_work = nullptr;
    data_type *h_work = nullptr;
    data_type *d_rwork = nullptr;
    data_type *d_W = nullptr; // W = S*VT

    size_t workspaceInBytesOnDevice = 0;
    size_t workspaceInBytesOnHost = 0;
    int info = 0;

    const data_type h_one = 1;
    const data_type h_minus_one = -1;

    std::printf("A = (matlab base-1)\n");
    print_matrix(m, n, A.data(), lda);
    std::printf("=====\n");

    /* step 1: create cusolver handle, bind a stream */
    CUSOLVER_CHECK(cusolverDnCreate(&cusolverH));
    CUBLAS_CHECK(cublasCreate(&cublasH));

    CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    CUSOLVER_CHECK(cusolverDnSetStream(cusolverH, stream));
    CUBLAS_CHECK(cublasSetStream(cublasH, stream));

    CUSOLVER_CHECK(cusolverDnCreateParams(&params));

    /* step 2: copy A to device */
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_A), sizeof(data_type) * A.size()));
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_S), sizeof(data_type) * S.size()));
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_U), sizeof(data_type) * U.size()));
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_VT), sizeof(data_type) * VT.size()));
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_info), sizeof(int)));
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_W), sizeof(data_type) * lda * n));

    CUDA_CHECK(cudaMemcpyAsync(d_A, A.data(), sizeof(data_type) * lda * n,
                               cudaMemcpyHostToDevice, stream));

    signed char jobu = 'A';  // all m columns of U
    signed char jobvt = 'A'; // all n rows of VT

    /* step 3: query working space of SVD */
    CUSOLVER_CHECK(cusolverDnXgesvd_bufferSize(
        cusolverH, params, jobu, jobvt, m, n, traits<data_type>::cuda_data_type, d_A, lda,
        traits<data_type>::cuda_data_type, d_S, traits<data_type>::cuda_data_type, d_U, ldu,
        traits<data_type>::cuda_data_type, d_VT, ldvt, traits<data_type>::cuda_data_type,
        &workspaceInBytesOnDevice, &workspaceInBytesOnHost));

    CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_work), workspaceInBytesOnDevice));

    if (0 < workspaceInBytesOnHost) {
        h_work = reinterpret_cast<data_type *>(malloc(workspaceInBytesOnHost));
        if (h_work == nullptr) {
            throw std::runtime_error("Error: h_work not allocated.");
        }
    }

    /* step 4: compute SVD */
    CUSOLVER_CHECK(cusolverDnXgesvd(
        cusolverH, params, jobu, jobvt, m, n, traits<data_type>::cuda_data_type, d_A, lda,
        traits<data_type>::cuda_data_type, d_S, traits<data_type>::cuda_data_type, d_U, ldu,
        traits<data_type>::cuda_data_type, d_VT, ldvt, traits<data_type>::cuda_data_type,
        d_work, workspaceInBytesOnDevice, h_work, workspaceInBytesOnHost, d_info));

    CUDA_CHECK(cudaMemcpyAsync(U.data(), d_U, sizeof(data_type) * U.size(),
                               cudaMemcpyDeviceToHost, stream));
    CUDA_CHECK(cudaMemcpyAsync(VT.data(), d_VT, sizeof(data_type) * VT.size(),
                               cudaMemcpyDeviceToHost, stream));
    CUDA_CHECK(cudaMemcpyAsync(S.data(), d_S, sizeof(data_type) * S.size(),
                               cudaMemcpyDeviceToHost, stream));
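    /* gesvd reports its status through info (LAPACK convention, which
     * cusolverDnXgesvd follows): 0 means success, -i means the i-th
     * argument was invalid, and i > 0 means i superdiagonals of an
     * intermediate bidiagonal form did not converge to zero. */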
    CUDA_CHECK(cudaMemcpyAsync(&info, d_info, sizeof(int), cudaMemcpyDeviceToHost, stream));

    CUDA_CHECK(cudaStreamSynchronize(stream));

    std::printf("after Xgesvd: info = %d\n", info);
    if (0 == info) {
        std::printf("Xgesvd converged\n");
    } else if (0 > info) {
        std::printf("%d-th parameter is wrong\n", -info);
        exit(1);
    } else {
        std::printf("WARNING: info = %d : Xgesvd did not converge\n", info);
    }
    std::printf("=====\n");

    std::printf("S = (matlab base-1)\n");
    print_matrix(n, 1, S.data(), lda);
    std::printf("=====\n");

    std::printf("U = (matlab base-1)\n");
    print_matrix(m, m, U.data(), ldu);
    std::printf("=====\n");

    std::printf("VT = (matlab base-1)\n");
    print_matrix(n, n, VT.data(), ldvt);
    std::printf("=====\n");

    /* step 5: measure the error of the computed singular values */
    double ds_sup = 0;
    for (int j = 0; j < n; j++) {
        double err = fabs(S[j] - S_exact[j]);
        ds_sup = (ds_sup > err) ? ds_sup : err;
    }
    std::printf("|S - S_exact| = %E\n", ds_sup);

    /* step 6: measure the residual |A - U*S*VT| */
    // W = S*VT: scale the rows of VT by the singular values
    CUBLAS_CHECK(cublasDdgmm(cublasH, CUBLAS_SIDE_LEFT, n, n, d_VT, ldvt, d_S, 1, d_W, lda));

    // restore A on the device, then overwrite it with the residual A - U*W
    CUDA_CHECK(cudaMemcpyAsync(d_A, A.data(), sizeof(data_type) * A.size(),
                               cudaMemcpyHostToDevice, stream));
    CUBLAS_CHECK(cublasDgemm_v2(cublasH,
                                CUBLAS_OP_N,  // U
                                CUBLAS_OP_N,  // W
                                m,            // number of rows of A
                                n,            // number of columns of A
                                n,            // number of columns of U
                                &h_minus_one, /* host pointer */
                                d_U,          // U
                                ldu,
                                d_W,          // W
                                lda,
                                &h_one,       /* host pointer */
                                d_A,
                                lda));

    double dR_fro = 0.0;
    CUBLAS_CHECK(cublasDnrm2_v2(cublasH, A.size(), d_A, 1, &dR_fro));

    CUDA_CHECK(cudaStreamSynchronize(stream));
    std::printf("|A - U*S*VT| = %E\n", dR_fro);

    /* free resources */
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_S));
    CUDA_CHECK(cudaFree(d_U));
    CUDA_CHECK(cudaFree(d_VT));
    CUDA_CHECK(cudaFree(d_info));
    CUDA_CHECK(cudaFree(d_work));
    CUDA_CHECK(cudaFree(d_rwork));
    CUDA_CHECK(cudaFree(d_W));
    free(h_work);

    CUSOLVER_CHECK(cusolverDnDestroyParams(params));
    CUSOLVER_CHECK(cusolverDnDestroy(cusolverH));
    CUBLAS_CHECK(cublasDestroy(cublasH));

    CUDA_CHECK(cudaStreamDestroy(stream));

    CUDA_CHECK(cudaDeviceReset());

    return EXIT_SUCCESS;
}
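/*
 * Build sketch (a minimal guess; the file name and sample layout are
 * assumptions, adjust paths for your CUDA toolkit and for wherever
 * cusolver_utils.h lives):
 *
 *   nvcc -o xgesvd_example cusolver_Xgesvd_example.cu -lcusolver -lcublas
 *   ./xgesvd_example
 *
 * cusolver_utils.h supplies print_matrix, traits, and the CUDA_CHECK /
 * CUBLAS_CHECK / CUSOLVER_CHECK macros used above.
 */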