From b19161bce385ca3f9905a012d1ff2c64f2829b26 Mon Sep 17 00:00:00 2001 From: panqingfeng Date: Mon, 20 Jan 2020 15:00:05 +0800 Subject: [PATCH 01/19] Kernel interface change to batch_kernel_function(from kernel_function) Implement a CUDA version of batch_kernel_function --- svm.cpp | 296 +++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 272 insertions(+), 24 deletions(-) diff --git a/svm.cpp b/svm.cpp index fa44818b..83deef90 100644 --- a/svm.cpp +++ b/svm.cpp @@ -8,6 +8,13 @@ #include #include #include "svm.h" +#ifdef SVM_CUDA +#include +#include +#include +#include +#include +#endif int libsvm_version = LIBSVM_VERSION; typedef float Qfloat; typedef signed char schar; @@ -196,28 +203,35 @@ class QMatrix { virtual Qfloat *get_Q(int column, int len) const = 0; virtual double *get_QD() const = 0; virtual void swap_index(int i, int j) const = 0; + virtual void done_shrinking() const = 0; virtual ~QMatrix() {} }; -class Kernel: public QMatrix { +class KernelBase: public QMatrix { public: - Kernel(int l, svm_node * const * x, const svm_parameter& param); - virtual ~Kernel(); + KernelBase(int l, svm_node * const * x, const svm_parameter& param); + virtual ~KernelBase(); static double k_function(const svm_node *x, const svm_node *y, const svm_parameter& param); virtual Qfloat *get_Q(int column, int len) const = 0; virtual double *get_QD() const = 0; - virtual void swap_index(int i, int j) const // no so const... + virtual void swap_index(int i, int j) const // no so const... { swap(x[i],x[j]); if(x_square) swap(x_square[i],x_square[j]); } + virtual void done_shrinking() const {} protected: - double (Kernel::*kernel_function)(int i, int j) const; + double (KernelBase::*kernel_function)(int i, int j) const; + virtual void batch_kernel_function(int i, Qfloat *Q, int start, int end) const + { + for(int j=start;j*kernel_function)(i,j); + } -private: +protected: const svm_node **x; double *x_square; @@ -250,26 +264,26 @@ class Kernel: public QMatrix { } }; -Kernel::Kernel(int l, svm_node * const * x_, const svm_parameter& param) +KernelBase::KernelBase(int l, svm_node * const * x_, const svm_parameter& param) :kernel_type(param.kernel_type), degree(param.degree), gamma(param.gamma), coef0(param.coef0) { switch(kernel_type) { case LINEAR: - kernel_function = &Kernel::kernel_linear; + kernel_function = &KernelBase::kernel_linear; break; case POLY: - kernel_function = &Kernel::kernel_poly; + kernel_function = &KernelBase::kernel_poly; break; case RBF: - kernel_function = &Kernel::kernel_rbf; + kernel_function = &KernelBase::kernel_rbf; break; case SIGMOID: - kernel_function = &Kernel::kernel_sigmoid; + kernel_function = &KernelBase::kernel_sigmoid; break; case PRECOMPUTED: - kernel_function = &Kernel::kernel_precomputed; + kernel_function = &KernelBase::kernel_precomputed; break; } @@ -285,13 +299,13 @@ Kernel::Kernel(int l, svm_node * const * x_, const svm_parameter& param) x_square = 0; } -Kernel::~Kernel() +KernelBase::~KernelBase() { delete[] x; delete[] x_square; } -double Kernel::dot(const svm_node *px, const svm_node *py) +double KernelBase::dot(const svm_node *px, const svm_node *py) { double sum = 0; while(px->index != -1 && py->index != -1) @@ -313,7 +327,7 @@ double Kernel::dot(const svm_node *px, const svm_node *py) return sum; } -double Kernel::k_function(const svm_node *x, const svm_node *y, +double KernelBase::k_function(const svm_node *x, const svm_node *y, const svm_parameter& param) { switch(param.kernel_type) @@ -372,6 +386,238 @@ double Kernel::k_function(const 
svm_node *x, const svm_node *y, } } +#ifdef SVM_CUDA +class CudaKernel : public KernelBase { +public: + CudaKernel(int l, svm_node * const * x, const svm_parameter& param); + virtual ~CudaKernel(); + virtual void done_shrinking() const; +protected: + virtual void batch_kernel_function(int i, Qfloat *Q, int start, int end) const; +private: + void libsvm2csr(int l) const; //convert svm_node** x -> csr format +private: + cublasHandle_t m_cublas; + cusparseHandle_t m_cusparse; + cusparseMatDescr_t m_descr; + int m_max_index; + std::vector m_csrRowPtr; + int* d_csrColIdx; + double* d_csrVal; + double* d_vectorX; + int m_l; + int m_nnz; +}; + +CudaKernel::CudaKernel(int l, svm_node * const * x, const svm_parameter& param) : KernelBase(l, x, param), + m_cublas(NULL), m_cusparse(NULL), m_descr(NULL), m_max_index(0), m_csrRowPtr(l+1), + d_csrColIdx(NULL), d_csrVal(NULL), d_vectorX(NULL), m_l(l), m_nnz(0) { + // Init cublas & cusparse handlers + cublasStatus_t cublasResult = cublasCreate(&m_cublas); + assert(CUBLAS_STATUS_SUCCESS == cublasResult); + cusparseStatus_t cusparseResult = cusparseCreate(&m_cusparse); + assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + cusparseResult = cusparseCreateMatDescr(&m_descr); + assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + cusparseSetMatIndexBase(m_descr,CUSPARSE_INDEX_BASE_ZERO); + cusparseSetMatType(m_descr, CUSPARSE_MATRIX_TYPE_GENERAL); + + // get Num-non-zero and max_index + m_max_index = 0; + const svm_node *node; + for( int i=0; iindex != -1) { + if( node->index > m_max_index ) + m_max_index = node->index; + m_nnz++; + node++; + } + } + + libsvm2csr(l); +} + +void CudaKernel::libsvm2csr(int l) const { + // alloc host memory and fill up + std::vector csrVal(m_nnz); + std::vector csrColIdx(m_nnz); + int curPos = 0; + const svm_node *node; + int* csrRowPtr = (int*)m_csrRowPtr.data(); + for( int i=0; iindex != -1) { + csrVal[curPos] = node->value; + csrColIdx[curPos] = node->index; + curPos++; + node++; + } + } + csrRowPtr[l] = curPos; + + // copy csrVal, csrColIdx to device + cudaError_t cudaResult; + if( !d_vectorX ) { + cudaResult = cudaMalloc((void**)&d_vectorX, sizeof(double)*m_max_index); + assert(cudaSuccess == cudaResult); + } + if( !d_csrColIdx ) { + cudaResult = cudaMalloc((void**)&d_csrColIdx, sizeof(int)*m_nnz); + assert(cudaSuccess == cudaResult); + } + if( !d_csrVal ) { + cudaResult = cudaMalloc((void**)&d_csrVal, sizeof(double)*m_nnz); + assert(cudaSuccess == cudaResult); + } + cudaResult = cudaMemcpy(d_csrColIdx, csrColIdx.data(), sizeof(int)*m_nnz, cudaMemcpyHostToDevice); + assert(cudaSuccess == cudaResult); + cudaResult = cudaMemcpy(d_csrVal, csrVal.data(), sizeof(double)*m_nnz, cudaMemcpyHostToDevice); + assert(cudaSuccess == cudaResult); +} + +CudaKernel::~CudaKernel() { + if(d_csrColIdx) cudaFree(d_csrColIdx); + if(d_csrVal) cudaFree(d_csrVal); + if(d_vectorX) cudaFree(d_vectorX); + if(m_cublas) cublasDestroy(m_cublas); + if(m_cusparse) cusparseDestroy(m_cusparse); + if(m_descr) cusparseDestroyMatDescr(m_descr); +} + +void CudaKernel::done_shrinking() const { + KernelBase::done_shrinking(); + libsvm2csr(m_l); +} + +void CudaKernel::batch_kernel_function(int i, Qfloat *Q, int start, int end) const { + cudaError_t cudaResult; + int len = end-start; + + // get csrValA, csrColIndA, nnz ready for cusparseCsrmvEx + double* csrValA = d_csrVal + m_csrRowPtr[start]; + int* csrColIndA = d_csrColIdx + m_csrRowPtr[start]; + int nnz = m_csrRowPtr[len+start] - m_csrRowPtr[start]; + + // get csrRowPtrA ready for cusparseCsrmvEx + std::vector 
csrRow(len+1); + for( int idx=0; idx<=len; idx++ ) { + csrRow[idx] = m_csrRowPtr[idx+start] - m_csrRowPtr[start]; + } + int* csrRowPtrA = NULL; + cudaResult = cudaMalloc((void**)&csrRowPtrA, sizeof(int)*(len+1)); + assert(cudaSuccess == cudaResult); + cudaResult = cudaMemcpy(csrRowPtrA, csrRow.data(), sizeof(int)*(len+1), cudaMemcpyHostToDevice); + assert(cudaSuccess == cudaResult); + + // get vector x ready for cusparseCsrmvEx + std::vector vectorX(m_max_index, 0.0); + const svm_node *node = x[i]; + while(node->index != -1) { + vectorX[node->index] = node->value; + node++; + } + cudaResult = cudaMemcpy(d_vectorX, vectorX.data(), sizeof(double)*m_max_index, cudaMemcpyHostToDevice); + assert(cudaSuccess == cudaResult); + + // get vector y ready for cusparseCsrmvEx + std::vector vectorY(len, 0.0); + double* d_vectorY = NULL; + cudaResult = cudaMalloc((void**)&d_vectorY, sizeof(double)*len); + assert(cudaSuccess == cudaResult); + cudaResult = cudaMemcpy(d_vectorY, vectorY.data(), sizeof(double)*len, cudaMemcpyHostToDevice); + assert(cudaSuccess == cudaResult); + + // get temp buffer size and get it ready + size_t buffer_size = 0; + double h_one = 1.0; + double h_zero = 0.0; + cusparseStatus_t cusparseResult = cusparseCsrmvEx_bufferSize(m_cusparse, + CUSPARSE_ALG0, + CUSPARSE_OPERATION_NON_TRANSPOSE, + len, + m_max_index, + nnz, + &h_one, CUDA_R_64F, + m_descr, + csrValA, CUDA_R_64F, + csrRowPtrA, + csrColIndA, + d_vectorX, CUDA_R_64F, + &h_zero, CUDA_R_64F, + d_vectorY, CUDA_R_64F, + CUDA_R_64F, + &buffer_size); + assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + void* buffer = NULL; + cudaResult = cudaMalloc ((void**)&buffer, buffer_size); + assert(cudaSuccess == cudaResult); + + // call cusparseCsrmvEx + cusparseResult = cusparseCsrmvEx(m_cusparse, + CUSPARSE_ALG0, + CUSPARSE_OPERATION_NON_TRANSPOSE, + len, + m_max_index, + nnz, + &h_one, CUDA_R_64F, + m_descr, + csrValA, CUDA_R_64F, + csrRowPtrA, + csrColIndA, + d_vectorX, CUDA_R_64F, + &h_zero, CUDA_R_64F, + d_vectorY, CUDA_R_64F, + CUDA_R_64F, + buffer); + assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + + // copy d_vectorY back to host + cudaResult = cudaMemcpy(vectorY.data(), d_vectorY, sizeof(double)*len, cudaMemcpyDeviceToHost); + assert(cudaSuccess == cudaResult); + for( int idx=0; idx Qbase(start+len, 0.0); + KernelBase::batch_kernel_function(i, Qbase.data(), start, end); + for( int idx=0; idx0.0001 ) { + printf("%f <-> %f(%d, %d)\n", Q[idx], Qbase[idx], i, start+idx); + } + }*/ +} +#endif + +#ifdef SVM_CUDA +typedef CudaKernel Kernel; +#else +typedef KernelBase Kernel; +#endif + // An SMO algorithm in Fan et al., JMLR 6(2005), p. 
1889--1918 // Solves: // @@ -568,7 +814,10 @@ void Solver::Solve(int l, const QMatrix& Q, const double *p_, const schar *y_, if(--counter == 0) { counter = min(l,1000); - if(shrinking) do_shrinking(); + if(shrinking) { + do_shrinking(); + Q.done_shrinking(); + } info("."); } @@ -1282,8 +1531,9 @@ class SVC_Q: public Kernel int start, j; if((start = cache->get_data(i,&data,len)) < len) { + batch_kernel_function(i,data,start,len); for(j=start;j*kernel_function)(i,j)); + data[j] *= (Qfloat)(y[i]*y[j]); } return data; } @@ -1328,11 +1578,10 @@ class ONE_CLASS_Q: public Kernel Qfloat *get_Q(int i, int len) const { Qfloat *data; - int start, j; + int start; if((start = cache->get_data(i,&data,len)) < len) { - for(j=start;j*kernel_function)(i,j); + batch_kernel_function(i,data,start,len); } return data; } @@ -1394,18 +1643,17 @@ class SVR_Q: public Kernel Qfloat *get_Q(int i, int len) const { Qfloat *data; - int j, real_i = index[i]; + int real_i = index[i]; if(cache->get_data(real_i,&data,l) < l) { - for(j=0;j*kernel_function)(real_i,j); + batch_kernel_function(real_i,data,0,l); } // reorder and copy Qfloat *buf = buffer[next_buffer]; next_buffer = 1 - next_buffer; schar si = sign[i]; - for(j=0;j Date: Wed, 22 Jan 2020 16:52:09 +0800 Subject: [PATCH 02/19] a CUDA port of libsvm --- Makefile | 9 +- svm.cpp | 433 ++++++++++++++++++++++++++++++++++++++++++++++++------- svm.h | 10 ++ 3 files changed, 395 insertions(+), 57 deletions(-) diff --git a/Makefile b/Makefile index db6ab346..6254268f 100644 --- a/Makefile +++ b/Makefile @@ -22,4 +22,11 @@ svm-scale: svm-scale.c svm.o: svm.cpp svm.h $(CXX) $(CFLAGS) -c svm.cpp clean: - rm -f *~ svm.o svm-train svm-predict svm-scale libsvm.so.$(SHVER) + rm -f *~ svm.o svm-train svm-predict svm-scale libsvm.so.$(SHVER) svm-train-cuda svm-train.o svm-cuda.o + +svm-train.o: svm-train.c + $(CXX) $(CFLAGS) -c svm-train.c +svm-cuda.o: svm.cpp + $(CXX) $(CFLAGS) -DSVM_CUDA -c svm.cpp -o svm-cuda.o +svm-train-cuda: svm-train.o svm-cuda.o + $(CXX) $(CFLAGS) svm-train.o svm-cuda.o -o svm-train-cuda -lm -lcusparse -lcublas -lcudart diff --git a/svm.cpp b/svm.cpp index 83deef90..cddd1722 100644 --- a/svm.cpp +++ b/svm.cpp @@ -9,6 +9,7 @@ #include #include "svm.h" #ifdef SVM_CUDA +#include #include #include #include @@ -212,6 +213,7 @@ class KernelBase: public QMatrix { KernelBase(int l, svm_node * const * x, const svm_parameter& param); virtual ~KernelBase(); + static void batch_k_function(const svm_node *x, const svm_model* model, double* out); static double k_function(const svm_node *x, const svm_node *y, const svm_parameter& param); virtual Qfloat *get_Q(int column, int len) const = 0; @@ -327,6 +329,14 @@ double KernelBase::dot(const svm_node *px, const svm_node *py) return sum; } +void KernelBase::batch_k_function(const svm_node *x, const svm_model* model, double* out) { + svm_node **SV = model->SV; + const svm_parameter& param = model->param; + for(int i=0; il; i++ ) { + out[i] = k_function(x,SV[i],param); + } +} + double KernelBase::k_function(const svm_node *x, const svm_node *y, const svm_parameter& param) { @@ -374,7 +384,6 @@ double KernelBase::k_function(const svm_node *x, const svm_node *y, sum += y->value * y->value; ++y; } - return exp(-param.gamma*sum); } case SIGMOID: @@ -387,34 +396,39 @@ double KernelBase::k_function(const svm_node *x, const svm_node *y, } #ifdef SVM_CUDA +#define CUDA_CHECK(error) { \ + if (error!=cudaSuccess){ \ + std::cerr<<"CUDA ERROR "<< cudaGetErrorName(error) << " in file " << __FILE__ << " line " <<__LINE__<< std::endl; \ + 
exit(0); \ + } \ +} + class CudaKernel : public KernelBase { public: CudaKernel(int l, svm_node * const * x, const svm_parameter& param); virtual ~CudaKernel(); virtual void done_shrinking() const; + static void batch_k_function(const svm_node *x, const svm_model* model, double* out); protected: virtual void batch_kernel_function(int i, Qfloat *Q, int start, int end) const; private: - void libsvm2csr(int l) const; //convert svm_node** x -> csr format + void x2csrMatrix(int sizeX) const; //convert svm_node** x -> csr format matrix private: - cublasHandle_t m_cublas; cusparseHandle_t m_cusparse; cusparseMatDescr_t m_descr; - int m_max_index; - std::vector m_csrRowPtr; - int* d_csrColIdx; - double* d_csrVal; + int m_max_index; // max column of x matrix + std::vector m_csrRowPtr; // csr row ptr of x matrix + int* d_csrColIdx; // csr col index of x matrix + double* d_csrVal; // csr val vector of x matrix double* d_vectorX; - int m_l; - int m_nnz; + int m_sizeX; // size of x + int m_nnz; // number of non-zero value of x matrix }; CudaKernel::CudaKernel(int l, svm_node * const * x, const svm_parameter& param) : KernelBase(l, x, param), - m_cublas(NULL), m_cusparse(NULL), m_descr(NULL), m_max_index(0), m_csrRowPtr(l+1), - d_csrColIdx(NULL), d_csrVal(NULL), d_vectorX(NULL), m_l(l), m_nnz(0) { - // Init cublas & cusparse handlers - cublasStatus_t cublasResult = cublasCreate(&m_cublas); - assert(CUBLAS_STATUS_SUCCESS == cublasResult); + m_cusparse(NULL), m_descr(NULL), m_max_index(0), m_csrRowPtr(l+1), + d_csrColIdx(NULL), d_csrVal(NULL), d_vectorX(NULL), m_sizeX(l), m_nnz(0) { + // Init handlers cusparseStatus_t cusparseResult = cusparseCreate(&m_cusparse); assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); cusparseResult = cusparseCreateMatDescr(&m_descr); @@ -434,18 +448,19 @@ CudaKernel::CudaKernel(int l, svm_node * const * x, const svm_parameter& param) node++; } } + m_max_index++; - libsvm2csr(l); + x2csrMatrix(l); } -void CudaKernel::libsvm2csr(int l) const { +void CudaKernel::x2csrMatrix(int sizeX) const { // alloc host memory and fill up std::vector csrVal(m_nnz); std::vector csrColIdx(m_nnz); int curPos = 0; const svm_node *node; int* csrRowPtr = (int*)m_csrRowPtr.data(); - for( int i=0; iindex != -1) { @@ -455,44 +470,36 @@ void CudaKernel::libsvm2csr(int l) const { node++; } } - csrRowPtr[l] = curPos; + csrRowPtr[sizeX] = curPos; // copy csrVal, csrColIdx to device - cudaError_t cudaResult; if( !d_vectorX ) { - cudaResult = cudaMalloc((void**)&d_vectorX, sizeof(double)*m_max_index); - assert(cudaSuccess == cudaResult); + CUDA_CHECK(cudaMalloc((void**)&d_vectorX, sizeof(double)*m_max_index)); } if( !d_csrColIdx ) { - cudaResult = cudaMalloc((void**)&d_csrColIdx, sizeof(int)*m_nnz); - assert(cudaSuccess == cudaResult); + CUDA_CHECK(cudaMalloc((void**)&d_csrColIdx, sizeof(int)*m_nnz)); } if( !d_csrVal ) { - cudaResult = cudaMalloc((void**)&d_csrVal, sizeof(double)*m_nnz); - assert(cudaSuccess == cudaResult); + CUDA_CHECK(cudaMalloc((void**)&d_csrVal, sizeof(double)*m_nnz)); } - cudaResult = cudaMemcpy(d_csrColIdx, csrColIdx.data(), sizeof(int)*m_nnz, cudaMemcpyHostToDevice); - assert(cudaSuccess == cudaResult); - cudaResult = cudaMemcpy(d_csrVal, csrVal.data(), sizeof(double)*m_nnz, cudaMemcpyHostToDevice); - assert(cudaSuccess == cudaResult); + CUDA_CHECK(cudaMemcpy(d_csrColIdx, csrColIdx.data(), sizeof(int)*m_nnz, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_csrVal, csrVal.data(), sizeof(double)*m_nnz, cudaMemcpyHostToDevice)); } CudaKernel::~CudaKernel() { if(d_csrColIdx) 
cudaFree(d_csrColIdx); if(d_csrVal) cudaFree(d_csrVal); if(d_vectorX) cudaFree(d_vectorX); - if(m_cublas) cublasDestroy(m_cublas); if(m_cusparse) cusparseDestroy(m_cusparse); if(m_descr) cusparseDestroyMatDescr(m_descr); } void CudaKernel::done_shrinking() const { KernelBase::done_shrinking(); - libsvm2csr(m_l); + x2csrMatrix(m_sizeX); } void CudaKernel::batch_kernel_function(int i, Qfloat *Q, int start, int end) const { - cudaError_t cudaResult; int len = end-start; // get csrValA, csrColIndA, nnz ready for cusparseCsrmvEx @@ -506,10 +513,8 @@ void CudaKernel::batch_kernel_function(int i, Qfloat *Q, int start, int end) con csrRow[idx] = m_csrRowPtr[idx+start] - m_csrRowPtr[start]; } int* csrRowPtrA = NULL; - cudaResult = cudaMalloc((void**)&csrRowPtrA, sizeof(int)*(len+1)); - assert(cudaSuccess == cudaResult); - cudaResult = cudaMemcpy(csrRowPtrA, csrRow.data(), sizeof(int)*(len+1), cudaMemcpyHostToDevice); - assert(cudaSuccess == cudaResult); + CUDA_CHECK(cudaMalloc((void**)&csrRowPtrA, sizeof(int)*(len+1))); + CUDA_CHECK(cudaMemcpy(csrRowPtrA, csrRow.data(), sizeof(int)*(len+1), cudaMemcpyHostToDevice)); // get vector x ready for cusparseCsrmvEx std::vector vectorX(m_max_index, 0.0); @@ -518,21 +523,16 @@ void CudaKernel::batch_kernel_function(int i, Qfloat *Q, int start, int end) con vectorX[node->index] = node->value; node++; } - cudaResult = cudaMemcpy(d_vectorX, vectorX.data(), sizeof(double)*m_max_index, cudaMemcpyHostToDevice); - assert(cudaSuccess == cudaResult); + CUDA_CHECK(cudaMemcpy(d_vectorX, vectorX.data(), sizeof(double)*m_max_index, cudaMemcpyHostToDevice)); // get vector y ready for cusparseCsrmvEx - std::vector vectorY(len, 0.0); double* d_vectorY = NULL; - cudaResult = cudaMalloc((void**)&d_vectorY, sizeof(double)*len); - assert(cudaSuccess == cudaResult); - cudaResult = cudaMemcpy(d_vectorY, vectorY.data(), sizeof(double)*len, cudaMemcpyHostToDevice); - assert(cudaSuccess == cudaResult); + CUDA_CHECK(cudaMalloc((void**)&d_vectorY, sizeof(double)*len)); // get temp buffer size and get it ready size_t buffer_size = 0; - double h_one = 1.0; - double h_zero = 0.0; + static double h_one = 1.0; + static double h_zero = 0.0; cusparseStatus_t cusparseResult = cusparseCsrmvEx_bufferSize(m_cusparse, CUSPARSE_ALG0, CUSPARSE_OPERATION_NON_TRANSPOSE, @@ -551,8 +551,7 @@ void CudaKernel::batch_kernel_function(int i, Qfloat *Q, int start, int end) con &buffer_size); assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); void* buffer = NULL; - cudaResult = cudaMalloc ((void**)&buffer, buffer_size); - assert(cudaSuccess == cudaResult); + CUDA_CHECK(cudaMalloc ((void**)&buffer, buffer_size)); // call cusparseCsrmvEx cusparseResult = cusparseCsrmvEx(m_cusparse, @@ -574,8 +573,8 @@ void CudaKernel::batch_kernel_function(int i, Qfloat *Q, int start, int end) con assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); // copy d_vectorY back to host - cudaResult = cudaMemcpy(vectorY.data(), d_vectorY, sizeof(double)*len, cudaMemcpyDeviceToHost); - assert(cudaSuccess == cudaResult); + std::vector vectorY(len); + CUDA_CHECK(cudaMemcpy(vectorY.data(), d_vectorY, sizeof(double)*len, cudaMemcpyDeviceToHost)); for( int idx=0; idx Qbase(start+len, 0.0); KernelBase::batch_kernel_function(i, Qbase.data(), start, end); for( int idx=0; idx0.0001 ) { - printf("%f <-> %f(%d, %d)\n", Q[idx], Qbase[idx], i, start+idx); + if( fabs(Q[start+idx]-Qbase[start+idx])>0.01 ) { + printf("%f <-> %f(%d, %d)\n", Q[start+idx], Qbase[start+idx], i, start+idx); + } + }*/ +} + +void svm_model::cuda_init() { + nMaxIdxSV = 0; + 
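// Illustration of the CSR layout this function builds (example values only, not
// taken from any real data set):
//   SV[0] = {(1, 0.5), (4, 2.0)}   -> row 0 has two non-zeros
//   SV[1] = {(2, 1.0)}             -> row 1 has one non-zero
// gives
//   csrRowPtr = [0, 2, 3]          (csrRowPtr[i]..csrRowPtr[i+1] delimits row i; csrRowPtr[l] = nnz)
//   csrColIdx = [1, 4, 2]          (svm_node::index of each non-zero)
//   csrVal    = [0.5, 2.0, 1.0]    (svm_node::value of each non-zero)
// so row i of the SV matrix occupies csrVal[csrRowPtr[i] .. csrRowPtr[i+1]).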
nnzSV = 0; + const svm_node *node; + for( int i=0; iindex != -1) { + if( node->index > nMaxIdxSV ) + nMaxIdxSV = node->index; + nnzSV++; + node++; + } + } + + // get csrVal csrColIdx ready + std::vector csrRowPtr(l+1); + std::vector csrVal(nnzSV); + std::vector csrColIdx(nnzSV); + int curPos = 0; + for( int i=0; iindex != -1) { + csrVal[curPos] = node->value; + csrColIdx[curPos] = node->index; + curPos++; + node++; + } + } + csrRowPtr[l] = curPos; + + // copy csrVal, csrColIdx to device + csrColIdxSV = NULL; + csrValSV = NULL; + csrRowPtrSV = NULL; + CUDA_CHECK(cudaMalloc((void**)&csrRowPtrSV, sizeof(int)*(l+1))); + CUDA_CHECK(cudaMalloc((void**)&csrColIdxSV, sizeof(int)*nnzSV)); + CUDA_CHECK(cudaMalloc((void**)&csrValSV, sizeof(double)*nnzSV)); + CUDA_CHECK(cudaMemcpy(csrRowPtrSV, csrRowPtr.data(), sizeof(int)*(l+1), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(csrColIdxSV, csrColIdx.data(), sizeof(int)*nnzSV, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(csrValSV, csrVal.data(), sizeof(double)*nnzSV, cudaMemcpyHostToDevice)); +} + +void svm_model::cuda_free() { + CUDA_CHECK(cudaFree(csrRowPtrSV)); + CUDA_CHECK(cudaFree(csrColIdxSV)); + CUDA_CHECK(cudaFree(csrValSV)); +} + +void CudaKernel::batch_k_function(const svm_node *x, const svm_model* model, double* out) { + const svm_parameter& param = model->param; + static const double one = 1.0; + static const double NegOne = -1.0; + static const double zero = 0.0; + cublasHandle_t cublas = NULL; + cusparseHandle_t cusparse = NULL; + cusparseMatDescr_t descrSV = NULL; + cusparseMatDescr_t descrX = NULL; + cusparseMatDescr_t descrC = NULL; + cudaStream_t stream = NULL; + CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + cublasStatus_t cublasResult = cublasCreate(&cublas); + assert(CUBLAS_STATUS_SUCCESS == cublasResult); + cusparseStatus_t cusparseResult = cusparseCreate(&cusparse); + assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + cusparseResult = cusparseCreateMatDescr(&descrSV); + assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + cusparseSetMatIndexBase(descrSV,CUSPARSE_INDEX_BASE_ZERO); + cusparseSetMatType(descrSV, CUSPARSE_MATRIX_TYPE_GENERAL); + cusparseResult = cusparseCreateMatDescr(&descrX); + assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + cusparseSetMatIndexBase(descrX,CUSPARSE_INDEX_BASE_ZERO); + cusparseSetMatType(descrX, CUSPARSE_MATRIX_TYPE_GENERAL); + cusparseResult = cusparseCreateMatDescr(&descrC); + assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + cusparseSetMatIndexBase(descrC,CUSPARSE_INDEX_BASE_ZERO); + cusparseSetMatType(descrC, CUSPARSE_MATRIX_TYPE_GENERAL); + cusparseSetPointerMode(cusparse, CUSPARSE_POINTER_MODE_HOST); + cusparseSetStream(cusparse, stream); + cublasSetStream(cublas, stream); + + // get max_index + int max_index = model->nMaxIdxSV; + const svm_node *node; + node = x; + int x_nnz = 0; + while(node->index != -1) { + if( node->index > max_index ) + max_index = node->index; + x_nnz++; + node++; + } + max_index++; + + // create a sparse matrix X = model->l * [x] + std::vector csrRowPtr(model->l+1); + std::vector csrVal(model->l*x_nnz); + std::vector csrColIdx(model->l*x_nnz); + int curPos = 0; + for( int i=0; il; i++ ) { + csrRowPtr[i] = curPos; + node = x; + while(node->index != -1) { + csrVal[curPos] = node->value; + csrColIdx[curPos] = node->index; + curPos++; + node++; } + } + csrRowPtr[model->l] = curPos; + int* csrColIdxX = NULL; + double* csrValX = NULL; + int* csrRowPtrX = NULL; + CUDA_CHECK(cudaMalloc((void**)&csrRowPtrX, sizeof(int)*(model->l+1))); + 
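// The RBF branch below relies on the identity
//   K(x, SV[i]) = exp(-gamma * ||x - SV[i]||^2):
// with X holding model->l identical copies of x, C = X - SV (cusparseDcsrgeam with
// coefficients +1 and -1), squaring C element-wise and multiplying it by an
// all-ones vector yields the row sums ||x - SV[i]||^2 in a single sparse
// matrix-vector product; exp(-gamma * out[i]) is then applied on the host.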
CUDA_CHECK(cudaMalloc((void**)&csrColIdxX, sizeof(int)*model->l*x_nnz)); + CUDA_CHECK(cudaMalloc((void**)&csrValX, sizeof(double)*model->l*x_nnz)); + CUDA_CHECK(cudaMemcpyAsync(csrRowPtrX, csrRowPtr.data(), sizeof(int)*(model->l+1), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(csrColIdxX, csrColIdx.data(), sizeof(int)*model->l*x_nnz, cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(csrValX, csrVal.data(), sizeof(double)*model->l*x_nnz, cudaMemcpyHostToDevice, stream)); + + int nnzC; + size_t buffer_size; + char* buffer = NULL; + double* csrValC = NULL; + double* temp_csrValC = NULL; + int* csrRowPtrC = NULL; + int* csrColIdxC = NULL; + double* all_one = NULL; + double* d_out = NULL; + double* d_vectorX = NULL, *d_vectorY = NULL; + std::vector h_all_one, vectorX; + switch(param.kernel_type) + { + case LINEAR: + case POLY: + case SIGMOID: + // get dense vector x ready + vectorX.resize(max_index, 0.0); + CUDA_CHECK(cudaMalloc((void**)&d_vectorX, sizeof(double)*max_index)); + node = x; + while(node->index != -1) { + vectorX[node->index] = node->value; + node++; + } + CUDA_CHECK(cudaMemcpyAsync(d_vectorX, vectorX.data(), sizeof(double)*max_index, cudaMemcpyHostToDevice, stream)); + + // get dense vector y ready + CUDA_CHECK(cudaMalloc((void**)&d_vectorY, sizeof(double)*model->l)); + + // get buffer size + cusparseResult = cusparseCsrmvEx_bufferSize(cusparse, + CUSPARSE_ALG0, + CUSPARSE_OPERATION_NON_TRANSPOSE, + model->l, max_index, + model->nnzSV, &one, CUDA_R_64F, + descrSV, + model->csrValSV, CUDA_R_64F, + model->csrRowPtrSV, model->csrColIdxSV, + d_vectorX, CUDA_R_64F, + &zero, CUDA_R_64F, + d_vectorY, CUDA_R_64F, + CUDA_R_64F, &buffer_size); + assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + CUDA_CHECK(cudaMalloc ((void**)&buffer, buffer_size)); + + // y = dot(SV, x) + cusparseResult = cusparseCsrmvEx(cusparse, + CUSPARSE_ALG0, + CUSPARSE_OPERATION_NON_TRANSPOSE, + model->l, max_index, + model->nnzSV, + &one, CUDA_R_64F, + descrSV, + model->csrValSV, CUDA_R_64F, + model->csrRowPtrSV, model->csrColIdxSV, + d_vectorX, CUDA_R_64F, + &zero, CUDA_R_64F, + d_vectorY, CUDA_R_64F, + CUDA_R_64F, buffer); + assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + + // copy vectorY back to host + CUDA_CHECK(cudaMemcpyAsync(out, d_vectorY, sizeof(double)*model->l, cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + // update out base on kernel_type + if( param.kernel_type==POLY ) { + for( int i=0; il; i++ ) + out[i] = powi(param.gamma*out[i]+param.coef0,param.degree); + } else if( param.kernel_type==SIGMOID ) { + for( int i=0; il; i++ ) + out[i] = tanh(param.gamma*out[i]+param.coef0); + } + break; + case RBF: + // create a sparse matrix C = X-SV + CUDA_CHECK(cudaMalloc((void**)&csrRowPtrC, sizeof(int)*(model->l+1))); + cusparseResult = cusparseXcsrgeamNnz(cusparse, model->l, max_index, + descrX, model->l*x_nnz, csrRowPtrX, csrColIdxX, + descrSV, model->nnzSV, model->csrRowPtrSV, model->csrColIdxSV, + descrC, csrRowPtrC, &nnzC); + assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + CUDA_CHECK(cudaMalloc((void**)&csrColIdxC, sizeof(int)*nnzC)); + CUDA_CHECK(cudaMalloc((void**)&csrValC, sizeof(double)*nnzC)); + CUDA_CHECK(cudaMalloc((void**)&temp_csrValC, sizeof(double)*nnzC)); + + cusparseResult = cusparseDcsrgeam(cusparse, model->l, max_index, + &one, + descrX, model->l*x_nnz, + csrValX, csrRowPtrX, csrColIdxX, + &NegOne, + descrSV, model->nnzSV, + model->csrValSV, model->csrRowPtrSV, model->csrColIdxSV, + descrC, + csrValC, csrRowPtrC, 
csrColIdxC); + assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + + // do C = C^2 + // equal to vector csrValC = csrValC^2 + CUDA_CHECK(cudaMemcpyAsync(temp_csrValC, csrValC,sizeof(double)*nnzC, cudaMemcpyDeviceToDevice, stream)); + cublasResult = cublasDsbmv(cublas, CUBLAS_FILL_MODE_LOWER, + nnzC, 0, &one, + temp_csrValC, 1, + temp_csrValC, 1, + &zero, csrValC, 1); + assert(CUBLAS_STATUS_SUCCESS == cublasResult); + + // out = C * all_one_vector_with_size_max_index + CUDA_CHECK(cudaMalloc((void**)&all_one, sizeof(double)*max_index)); + CUDA_CHECK(cudaMalloc((void**)&d_out, sizeof(double)*model->l)); + h_all_one.resize(max_index, 1.0); + CUDA_CHECK(cudaMemcpyAsync(all_one, h_all_one.data(), sizeof(double)*max_index, cudaMemcpyHostToDevice, stream)); + + cusparseResult = cusparseCsrmvEx_bufferSize(cusparse, + CUSPARSE_ALG0, + CUSPARSE_OPERATION_NON_TRANSPOSE, + model->l, max_index, + nnzC, &one, + CUDA_R_64F, descrC, + csrValC, CUDA_R_64F, + csrRowPtrC, csrColIdxC, + all_one, CUDA_R_64F, + &zero, CUDA_R_64F, + d_out, CUDA_R_64F, + CUDA_R_64F, &buffer_size); + assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + CUDA_CHECK(cudaMalloc ((void**)&buffer, buffer_size)); + + cusparseResult = cusparseCsrmvEx(cusparse, + CUSPARSE_ALG0, + CUSPARSE_OPERATION_NON_TRANSPOSE, + model->l, max_index, + nnzC, &one, + CUDA_R_64F, descrC, + csrValC, CUDA_R_64F, + csrRowPtrC, csrColIdxC, + all_one, CUDA_R_64F, + &zero, CUDA_R_64F, + d_out, CUDA_R_64F, + CUDA_R_64F, buffer); + assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + + // copy d_out back to host + CUDA_CHECK(cudaMemcpyAsync(out, d_out, sizeof(double)*model->l, cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + // do out[i] = exp(-param.gamma*out[i]) + for( int i=0; il; i++ ) + out[i] = exp(-param.gamma*out[i]); + break; + default: + printf("unsupport kernel type\n"); + exit(0); + break; + } + + cudaFree(csrColIdxX); + cudaFree(csrValX); + cudaFree(csrRowPtrX); + cudaFree(csrRowPtrC); + cudaFree(buffer); + cudaFree(csrValC); + cudaFree(temp_csrValC); + cudaFree(csrColIdxC); + cudaFree(all_one); + cudaFree(d_out); + cudaFree(d_vectorX); + cudaFree(d_vectorY); + cusparseDestroyMatDescr(descrSV); + cusparseDestroyMatDescr(descrX); + cusparseDestroyMatDescr(descrC); + cublasDestroy(cublas); + cusparseDestroy(cusparse); + cudaStreamDestroy(stream); + + // debug, compare with base class result from cpu + /*std::vector Outbase(model->l, 0.0); + KernelBase::batch_k_function(x, model, Outbase.data()); + for( int idx=0; idxl; idx++ ) { + if( fabs(Outbase[idx]-out[idx])>0.01 ) + printf("batch_k_function %f <-> %f %d\n", Outbase[idx], out[idx], idx); }*/ } + #endif #ifdef SVM_CUDA @@ -2580,6 +2884,9 @@ svm_model *svm_train(const svm_problem *prob, const svm_parameter *param) free(nz_count); free(nz_start); } +#ifdef SVM_CUDA + model->cuda_init(); +#endif return model; } @@ -2755,8 +3062,14 @@ double svm_predict_values(const svm_model *model, const svm_node *x, double* dec { double *sv_coef = model->sv_coef[0]; double sum = 0; + //for(i=0;il;i++) + // sum += sv_coef[i] * Kernel::k_function(x,model->SV[i],model->param); + double *kvalue = Malloc(double,model->l); + Kernel::batch_k_function(x,model,kvalue); for(i=0;il;i++) - sum += sv_coef[i] * Kernel::k_function(x,model->SV[i],model->param); + sum += sv_coef[i] * kvalue[i]; + free(kvalue); + sum -= model->rho[0]; *dec_values = sum; @@ -2771,8 +3084,9 @@ double svm_predict_values(const svm_model *model, const svm_node *x, double* dec int l = model->l; double *kvalue = 
Malloc(double,l); - for(i=0;iSV[i],model->param); + //for(i=0;iSV[i],model->param); + Kernel::batch_k_function(x,model,kvalue); int *start = Malloc(int,nr_class); start[0] = 0; @@ -3237,6 +3551,9 @@ svm_model *svm_load_model(const char *model_file_name) return NULL; model->free_sv = 1; // XXX +#ifdef SVM_CUDA + model->cuda_init(); +#endif return model; } @@ -3273,6 +3590,10 @@ void svm_free_model_content(svm_model* model_ptr) free(model_ptr->nSV); model_ptr->nSV = NULL; + +#ifdef SVM_CUDA + model_ptr->cuda_free(); +#endif } void svm_free_and_destroy_model(svm_model** model_ptr_ptr) diff --git a/svm.h b/svm.h index b0bf6ef0..227ffbd0 100644 --- a/svm.h +++ b/svm.h @@ -69,6 +69,16 @@ struct svm_model /* XXX */ int free_sv; /* 1 if svm_model is created by svm_load_model*/ /* 0 if svm_model is created by svm_train */ +#ifdef SVM_CUDA + void cuda_init(); + void cuda_free(); + + int nMaxIdxSV; /* max index of SV */ + int nnzSV; /* non-zero value count of SV */ + int* csrRowPtrSV; /* row index of matrix SV */ + int* csrColIdxSV; /* column index of matrix SV */ + double* csrValSV; /* values of matrix SV */ +#endif }; struct svm_model *svm_train(const struct svm_problem *prob, const struct svm_parameter *param); From e8523d78803aa75ecc26b116874d738a857ff172 Mon Sep 17 00:00:00 2001 From: Ryan Pan Date: Wed, 22 Jan 2020 16:56:16 +0800 Subject: [PATCH 03/19] Update README --- README | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/README b/README index dce2c1f1..a22756a4 100644 --- a/README +++ b/README @@ -1,3 +1,14 @@ +How to build: +make clean; make svm-train-cuda + +How to use: +run svm-train-cuda instead of svm-train, with same command line arguments of svm-train + + +/////////////////////////////////////////////////////////////////////////////////////// + + + Libsvm is a simple, easy-to-use, and efficient software for SVM classification and regression. 
It solves C-SVM classification, nu-SVM classification, one-class-SVM, epsilon-SVM regression, and nu-SVM From 68d6cd9d5ae4ad94eb19042fa8ca37833891fb4d Mon Sep 17 00:00:00 2001 From: panqingfeng Date: Fri, 24 Jan 2020 01:28:48 +0800 Subject: [PATCH 04/19] a faster RBF_k_function --- Makefile | 2 +- svm.cpp | 350 +++++++++++++++++++++++-------------------------------- 2 files changed, 150 insertions(+), 202 deletions(-) diff --git a/Makefile b/Makefile index 6254268f..22980070 100644 --- a/Makefile +++ b/Makefile @@ -27,6 +27,6 @@ clean: svm-train.o: svm-train.c $(CXX) $(CFLAGS) -c svm-train.c svm-cuda.o: svm.cpp - $(CXX) $(CFLAGS) -DSVM_CUDA -c svm.cpp -o svm-cuda.o + nvcc -O3 -x cu -DSVM_CUDA -c svm.cpp -o svm-cuda.o svm-train-cuda: svm-train.o svm-cuda.o $(CXX) $(CFLAGS) svm-train.o svm-cuda.o -o svm-train-cuda -lm -lcusparse -lcublas -lcudart diff --git a/svm.cpp b/svm.cpp index cddd1722..40a442cf 100644 --- a/svm.cpp +++ b/svm.cpp @@ -331,8 +331,9 @@ double KernelBase::dot(const svm_node *px, const svm_node *py) void KernelBase::batch_k_function(const svm_node *x, const svm_model* model, double* out) { svm_node **SV = model->SV; - const svm_parameter& param = model->param; - for(int i=0; il; i++ ) { + const svm_parameter& param = model->param; + int i; + for(i=0; il; i++ ) { out[i] = k_function(x,SV[i],param); } } @@ -659,37 +660,49 @@ void svm_model::cuda_free() { CUDA_CHECK(cudaFree(csrValSV)); } +__global__ void RBF_k_function(int nnzX, double* csrValX, int* csrColIdxX, + int countSV, + double* csrValSV, int* csrRowPtrSV, int* csrColIdxSV, + double* out) { + int nSVIdx = blockDim.x*blockIdx.x + threadIdx.x; + if( nSVIdx>=countSV ) + return; + int* ColIdxSV = csrColIdxSV + csrRowPtrSV[nSVIdx]; + double* ValSV = csrValSV + csrRowPtrSV[nSVIdx]; + int nnzSV = csrRowPtrSV[nSVIdx+1]-csrRowPtrSV[nSVIdx]; + + int colX = 0, colSV = 0; + double sum = 0.0; + while( colXparam; - static const double one = 1.0; - static const double NegOne = -1.0; - static const double zero = 0.0; - cublasHandle_t cublas = NULL; - cusparseHandle_t cusparse = NULL; - cusparseMatDescr_t descrSV = NULL; - cusparseMatDescr_t descrX = NULL; - cusparseMatDescr_t descrC = NULL; - cudaStream_t stream = NULL; - CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - cublasStatus_t cublasResult = cublasCreate(&cublas); - assert(CUBLAS_STATUS_SUCCESS == cublasResult); - cusparseStatus_t cusparseResult = cusparseCreate(&cusparse); - assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); - cusparseResult = cusparseCreateMatDescr(&descrSV); - assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); - cusparseSetMatIndexBase(descrSV,CUSPARSE_INDEX_BASE_ZERO); - cusparseSetMatType(descrSV, CUSPARSE_MATRIX_TYPE_GENERAL); - cusparseResult = cusparseCreateMatDescr(&descrX); - assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); - cusparseSetMatIndexBase(descrX,CUSPARSE_INDEX_BASE_ZERO); - cusparseSetMatType(descrX, CUSPARSE_MATRIX_TYPE_GENERAL); - cusparseResult = cusparseCreateMatDescr(&descrC); - assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); - cusparseSetMatIndexBase(descrC,CUSPARSE_INDEX_BASE_ZERO); - cusparseSetMatType(descrC, CUSPARSE_MATRIX_TYPE_GENERAL); - cusparseSetPointerMode(cusparse, CUSPARSE_POINTER_MODE_HOST); - cusparseSetStream(cusparse, stream); - cublasSetStream(cublas, stream); // get max_index int max_index = model->nMaxIdxSV; @@ -704,181 +717,125 @@ void CudaKernel::batch_k_function(const svm_node *x, const svm_model* model, dou } max_index++; - // create a sparse matrix X = model->l * [x] - 
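// For reference: the quantity each GPU thread of RBF_k_function accumulates for its
// support vector is the sparse squared distance ||x - SV[i]||^2, using the same
// index-merge that KernelBase::k_function performs on the CPU. A minimal host-side
// equivalent (an illustration over plain arrays, not code from this patch):
static double sparse_dist2(const int* colX, const double* valX, int nnzX,
			   const int* colV, const double* valV, int nnzV)
{
	double sum = 0.0;
	int i = 0, j = 0;
	while (i < nnzX && j < nnzV)
	{
		if (colX[i] == colV[j])		// column present in both vectors
		{
			double d = valX[i] - valV[j];
			sum += d * d;
			i++; j++;
		}
		else if (colX[i] < colV[j])	// column present only in x
		{
			sum += valX[i] * valX[i];
			i++;
		}
		else				// column present only in the SV
		{
			sum += valV[j] * valV[j];
			j++;
		}
	}
	while (i < nnzX) { sum += valX[i] * valX[i]; i++; }	// trailing x entries
	while (j < nnzV) { sum += valV[j] * valV[j]; j++; }	// trailing SV entries
	return sum;	// the caller applies exp(-gamma * sum), as the patch does on the host
}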
std::vector csrRowPtr(model->l+1); - std::vector csrVal(model->l*x_nnz); - std::vector csrColIdx(model->l*x_nnz); - int curPos = 0; - for( int i=0; il; i++ ) { - csrRowPtr[i] = curPos; - node = x; - while(node->index != -1) { - csrVal[curPos] = node->value; - csrColIdx[curPos] = node->index; - curPos++; - node++; - } - } - csrRowPtr[model->l] = curPos; + cusparseHandle_t cusparse = NULL; + cusparseMatDescr_t descrSV = NULL; int* csrColIdxX = NULL; double* csrValX = NULL; - int* csrRowPtrX = NULL; - CUDA_CHECK(cudaMalloc((void**)&csrRowPtrX, sizeof(int)*(model->l+1))); - CUDA_CHECK(cudaMalloc((void**)&csrColIdxX, sizeof(int)*model->l*x_nnz)); - CUDA_CHECK(cudaMalloc((void**)&csrValX, sizeof(double)*model->l*x_nnz)); - CUDA_CHECK(cudaMemcpyAsync(csrRowPtrX, csrRowPtr.data(), sizeof(int)*(model->l+1), cudaMemcpyHostToDevice, stream)); - CUDA_CHECK(cudaMemcpyAsync(csrColIdxX, csrColIdx.data(), sizeof(int)*model->l*x_nnz, cudaMemcpyHostToDevice, stream)); - CUDA_CHECK(cudaMemcpyAsync(csrValX, csrVal.data(), sizeof(double)*model->l*x_nnz, cudaMemcpyHostToDevice, stream)); - - int nnzC; - size_t buffer_size; char* buffer = NULL; - double* csrValC = NULL; - double* temp_csrValC = NULL; - int* csrRowPtrC = NULL; - int* csrColIdxC = NULL; double* all_one = NULL; double* d_out = NULL; - double* d_vectorX = NULL, *d_vectorY = NULL; - std::vector h_all_one, vectorX; + double* d_vectorX = NULL; + double* d_vectorY = NULL; switch(param.kernel_type) { case LINEAR: case POLY: case SIGMOID: - // get dense vector x ready - vectorX.resize(max_index, 0.0); - CUDA_CHECK(cudaMalloc((void**)&d_vectorX, sizeof(double)*max_index)); - node = x; - while(node->index != -1) { - vectorX[node->index] = node->value; - node++; - } - CUDA_CHECK(cudaMemcpyAsync(d_vectorX, vectorX.data(), sizeof(double)*max_index, cudaMemcpyHostToDevice, stream)); - - // get dense vector y ready - CUDA_CHECK(cudaMalloc((void**)&d_vectorY, sizeof(double)*model->l)); - - // get buffer size - cusparseResult = cusparseCsrmvEx_bufferSize(cusparse, - CUSPARSE_ALG0, - CUSPARSE_OPERATION_NON_TRANSPOSE, - model->l, max_index, - model->nnzSV, &one, CUDA_R_64F, - descrSV, - model->csrValSV, CUDA_R_64F, - model->csrRowPtrSV, model->csrColIdxSV, - d_vectorX, CUDA_R_64F, - &zero, CUDA_R_64F, - d_vectorY, CUDA_R_64F, - CUDA_R_64F, &buffer_size); - assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); - CUDA_CHECK(cudaMalloc ((void**)&buffer, buffer_size)); + { + static const double one = 1.0; + static const double zero = 0.0; + std::vector h_all_one, vectorX; + size_t buffer_size; + + cusparseStatus_t cusparseResult = cusparseCreate(&cusparse); + assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + cusparseResult = cusparseCreateMatDescr(&descrSV); + assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + cusparseSetMatIndexBase(descrSV,CUSPARSE_INDEX_BASE_ZERO); + cusparseSetMatType(descrSV, CUSPARSE_MATRIX_TYPE_GENERAL); + cusparseSetPointerMode(cusparse, CUSPARSE_POINTER_MODE_HOST); + + // get dense vector x ready + vectorX.resize(max_index, 0.0); + CUDA_CHECK(cudaMalloc((void**)&d_vectorX, sizeof(double)*max_index)); + node = x; + while(node->index != -1) { + vectorX[node->index] = node->value; + node++; + } + CUDA_CHECK(cudaMemcpy(d_vectorX, vectorX.data(), sizeof(double)*max_index, cudaMemcpyHostToDevice)); + + // get dense vector y ready + CUDA_CHECK(cudaMalloc((void**)&d_vectorY, sizeof(double)*model->l)); + + // get buffer size + cusparseResult = cusparseCsrmvEx_bufferSize(cusparse, + CUSPARSE_ALG0, + CUSPARSE_OPERATION_NON_TRANSPOSE, + model->l, 
max_index, + model->nnzSV, &one, CUDA_R_64F, + descrSV, + model->csrValSV, CUDA_R_64F, + model->csrRowPtrSV, model->csrColIdxSV, + d_vectorX, CUDA_R_64F, + &zero, CUDA_R_64F, + d_vectorY, CUDA_R_64F, + CUDA_R_64F, &buffer_size); + assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + CUDA_CHECK(cudaMalloc ((void**)&buffer, buffer_size)); - // y = dot(SV, x) - cusparseResult = cusparseCsrmvEx(cusparse, - CUSPARSE_ALG0, - CUSPARSE_OPERATION_NON_TRANSPOSE, - model->l, max_index, - model->nnzSV, - &one, CUDA_R_64F, - descrSV, - model->csrValSV, CUDA_R_64F, - model->csrRowPtrSV, model->csrColIdxSV, - d_vectorX, CUDA_R_64F, - &zero, CUDA_R_64F, - d_vectorY, CUDA_R_64F, - CUDA_R_64F, buffer); - assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); - - // copy vectorY back to host - CUDA_CHECK(cudaMemcpyAsync(out, d_vectorY, sizeof(double)*model->l, cudaMemcpyDeviceToHost, stream)); - CUDA_CHECK(cudaStreamSynchronize(stream)); - - // update out base on kernel_type - if( param.kernel_type==POLY ) { - for( int i=0; il; i++ ) - out[i] = powi(param.gamma*out[i]+param.coef0,param.degree); - } else if( param.kernel_type==SIGMOID ) { - for( int i=0; il; i++ ) - out[i] = tanh(param.gamma*out[i]+param.coef0); + // y = dot(SV, x) + cusparseResult = cusparseCsrmvEx(cusparse, + CUSPARSE_ALG0, + CUSPARSE_OPERATION_NON_TRANSPOSE, + model->l, max_index, + model->nnzSV, + &one, CUDA_R_64F, + descrSV, + model->csrValSV, CUDA_R_64F, + model->csrRowPtrSV, model->csrColIdxSV, + d_vectorX, CUDA_R_64F, + &zero, CUDA_R_64F, + d_vectorY, CUDA_R_64F, + CUDA_R_64F, buffer); + assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + + // copy vectorY back to host + CUDA_CHECK(cudaMemcpy(out, d_vectorY, sizeof(double)*model->l, cudaMemcpyDeviceToHost)); + + // update out base on kernel_type + if( param.kernel_type==POLY ) { + for( int i=0; il; i++ ) + out[i] = powi(param.gamma*out[i]+param.coef0,param.degree); + } else if( param.kernel_type==SIGMOID ) { + for( int i=0; il; i++ ) + out[i] = tanh(param.gamma*out[i]+param.coef0); + } } break; case RBF: - // create a sparse matrix C = X-SV - CUDA_CHECK(cudaMalloc((void**)&csrRowPtrC, sizeof(int)*(model->l+1))); - cusparseResult = cusparseXcsrgeamNnz(cusparse, model->l, max_index, - descrX, model->l*x_nnz, csrRowPtrX, csrColIdxX, - descrSV, model->nnzSV, model->csrRowPtrSV, model->csrColIdxSV, - descrC, csrRowPtrC, &nnzC); - assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); - CUDA_CHECK(cudaMalloc((void**)&csrColIdxC, sizeof(int)*nnzC)); - CUDA_CHECK(cudaMalloc((void**)&csrValC, sizeof(double)*nnzC)); - CUDA_CHECK(cudaMalloc((void**)&temp_csrValC, sizeof(double)*nnzC)); - - cusparseResult = cusparseDcsrgeam(cusparse, model->l, max_index, - &one, - descrX, model->l*x_nnz, - csrValX, csrRowPtrX, csrColIdxX, - &NegOne, - descrSV, model->nnzSV, - model->csrValSV, model->csrRowPtrSV, model->csrColIdxSV, - descrC, - csrValC, csrRowPtrC, csrColIdxC); - assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); - - // do C = C^2 - // equal to vector csrValC = csrValC^2 - CUDA_CHECK(cudaMemcpyAsync(temp_csrValC, csrValC,sizeof(double)*nnzC, cudaMemcpyDeviceToDevice, stream)); - cublasResult = cublasDsbmv(cublas, CUBLAS_FILL_MODE_LOWER, - nnzC, 0, &one, - temp_csrValC, 1, - temp_csrValC, 1, - &zero, csrValC, 1); - assert(CUBLAS_STATUS_SUCCESS == cublasResult); - - // out = C * all_one_vector_with_size_max_index - CUDA_CHECK(cudaMalloc((void**)&all_one, sizeof(double)*max_index)); - CUDA_CHECK(cudaMalloc((void**)&d_out, sizeof(double)*model->l)); - h_all_one.resize(max_index, 1.0); - 
CUDA_CHECK(cudaMemcpyAsync(all_one, h_all_one.data(), sizeof(double)*max_index, cudaMemcpyHostToDevice, stream)); - - cusparseResult = cusparseCsrmvEx_bufferSize(cusparse, - CUSPARSE_ALG0, - CUSPARSE_OPERATION_NON_TRANSPOSE, - model->l, max_index, - nnzC, &one, - CUDA_R_64F, descrC, - csrValC, CUDA_R_64F, - csrRowPtrC, csrColIdxC, - all_one, CUDA_R_64F, - &zero, CUDA_R_64F, - d_out, CUDA_R_64F, - CUDA_R_64F, &buffer_size); - assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); - CUDA_CHECK(cudaMalloc ((void**)&buffer, buffer_size)); - - cusparseResult = cusparseCsrmvEx(cusparse, - CUSPARSE_ALG0, - CUSPARSE_OPERATION_NON_TRANSPOSE, - model->l, max_index, - nnzC, &one, - CUDA_R_64F, descrC, - csrValC, CUDA_R_64F, - csrRowPtrC, csrColIdxC, - all_one, CUDA_R_64F, - &zero, CUDA_R_64F, - d_out, CUDA_R_64F, - CUDA_R_64F, buffer); - assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); - - // copy d_out back to host - CUDA_CHECK(cudaMemcpyAsync(out, d_out, sizeof(double)*model->l, cudaMemcpyDeviceToHost, stream)); - CUDA_CHECK(cudaStreamSynchronize(stream)); - - // do out[i] = exp(-param.gamma*out[i]) - for( int i=0; il; i++ ) - out[i] = exp(-param.gamma*out[i]); + { + // get vector x ready + std::vector h_csrValX(x_nnz); + std::vector h_csrColIdxX(x_nnz); + int curPos = 0; + node = x; + while(node->index != -1) { + h_csrValX[curPos] = node->value; + h_csrColIdxX[curPos] = node->index; + curPos++; + node++; + } + CUDA_CHECK(cudaMalloc((void**)&csrColIdxX, sizeof(int)*x_nnz)); + CUDA_CHECK(cudaMalloc((void**)&csrValX, sizeof(double)*x_nnz)); + CUDA_CHECK(cudaMalloc((void**)&d_out, sizeof(double)*model->l)); + CUDA_CHECK(cudaMemcpy(csrColIdxX, h_csrColIdxX.data(), sizeof(int)*x_nnz, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(csrValX, h_csrValX.data(), sizeof(double)*x_nnz, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemset(d_out, 0, sizeof(double)*model->l)); + + const int threadPerBlock=256; + int blockPerGrid=(model->l+threadPerBlock-1)/threadPerBlock; + RBF_k_function<<>>(x_nnz, csrValX, csrColIdxX, + model->l, model->csrValSV, model->csrRowPtrSV, model->csrColIdxSV, + d_out); + + // copy d_out back to host + CUDA_CHECK(cudaMemcpy(out, d_out, sizeof(double)*model->l, cudaMemcpyDeviceToHost)); + + // do out[i] = exp(-param.gamma*out[i]) + for( int i=0; il; i++ ) + out[i] = exp(-param.gamma*out[i]); + } break; default: printf("unsupport kernel type\n"); @@ -888,22 +845,13 @@ void CudaKernel::batch_k_function(const svm_node *x, const svm_model* model, dou cudaFree(csrColIdxX); cudaFree(csrValX); - cudaFree(csrRowPtrX); - cudaFree(csrRowPtrC); cudaFree(buffer); - cudaFree(csrValC); - cudaFree(temp_csrValC); - cudaFree(csrColIdxC); cudaFree(all_one); cudaFree(d_out); cudaFree(d_vectorX); cudaFree(d_vectorY); cusparseDestroyMatDescr(descrSV); - cusparseDestroyMatDescr(descrX); - cusparseDestroyMatDescr(descrC); - cublasDestroy(cublas); cusparseDestroy(cusparse); - cudaStreamDestroy(stream); // debug, compare with base class result from cpu /*std::vector Outbase(model->l, 0.0); From 8c4d66a612ca4d6c7891c0cbded50ec30ca435bc Mon Sep 17 00:00:00 2001 From: Ryan Pan Date: Sun, 2 Feb 2020 08:54:58 +0800 Subject: [PATCH 05/19] a faster RBF_k_function, with less cudaMalloc/cudaFree --- svm.cpp | 242 ++++++++++++++++++++++++++++++++++++++++---------------- svm.h | 14 ++++ 2 files changed, 188 insertions(+), 68 deletions(-) diff --git a/svm.cpp b/svm.cpp index 40a442cf..06fef5f8 100644 --- a/svm.cpp +++ b/svm.cpp @@ -612,6 +612,14 @@ void CudaKernel::batch_kernel_function(int i, Qfloat *Q, int start, int end) 
con } void svm_model::cuda_init() { + problem = NULL; + nPreloadIdx = 0; + csrRowPtrProblem = NULL; + csrColIdxProblem = NULL; + csrValProblem = NULL; + csrValProblemSquare = NULL; + d_out = NULL; + nMaxIdxSV = 0; nnzSV = 0; const svm_node *node; @@ -628,6 +636,7 @@ void svm_model::cuda_init() { // get csrVal csrColIdx ready std::vector csrRowPtr(l+1); std::vector csrVal(nnzSV); + std::vector csrValSquare(nnzSV); std::vector csrColIdx(nnzSV); int curPos = 0; for( int i=0; iindex != -1) { csrVal[curPos] = node->value; + csrValSquare[curPos] = csrVal[curPos]*csrVal[curPos]; csrColIdx[curPos] = node->index; curPos++; node++; @@ -645,30 +655,108 @@ void svm_model::cuda_init() { // copy csrVal, csrColIdx to device csrColIdxSV = NULL; csrValSV = NULL; + csrValSVSquare = NULL; csrRowPtrSV = NULL; CUDA_CHECK(cudaMalloc((void**)&csrRowPtrSV, sizeof(int)*(l+1))); CUDA_CHECK(cudaMalloc((void**)&csrColIdxSV, sizeof(int)*nnzSV)); CUDA_CHECK(cudaMalloc((void**)&csrValSV, sizeof(double)*nnzSV)); + CUDA_CHECK(cudaMalloc((void**)&csrValSVSquare, sizeof(double)*nnzSV)); CUDA_CHECK(cudaMemcpy(csrRowPtrSV, csrRowPtr.data(), sizeof(int)*(l+1), cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(csrColIdxSV, csrColIdx.data(), sizeof(int)*nnzSV, cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(csrValSV, csrVal.data(), sizeof(double)*nnzSV, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(csrValSVSquare, csrValSquare.data(), sizeof(double)*nnzSV, cudaMemcpyHostToDevice)); +} + +void svm_model::cuda_preload_problem(const struct svm_problem *prob) { + free(csrRowPtrProblem); + CUDA_CHECK(cudaFree(csrColIdxProblem)); + CUDA_CHECK(cudaFree(csrValProblem)); + CUDA_CHECK(cudaFree(csrValProblemSquare)); + CUDA_CHECK(cudaFree(d_out)); + csrRowPtrProblem = NULL; + csrColIdxProblem = NULL; + csrValProblem = NULL; + csrValProblemSquare = NULL; + d_out = NULL; + problem = prob; + if( problem!=NULL ) { + nMaxIdxProblem = 0; + nnzProblem = 0; + const svm_node *node; + for( int i=0; il; i++ ) { + node = problem->x[i]; + while(node->index != -1) { + if( node->index > nMaxIdxProblem ) + nMaxIdxProblem = node->index; + nnzProblem++; + node++; + } + } + csrRowPtrProblem = Malloc(int,problem->l+1); + std::vector csrVal(nnzProblem); + std::vector csrValSquare(nnzProblem); + std::vector csrColIdx(nnzProblem); + int curPos = 0; + for( int i=0; il; i++ ) { + csrRowPtrProblem[i] = curPos; + node = problem->x[i]; + while(node->index != -1) { + csrVal[curPos] = node->value; + csrValSquare[curPos] = csrVal[curPos]*csrVal[curPos]; + csrColIdx[curPos] = node->index; + curPos++; + node++; + } + } + csrRowPtrProblem[problem->l] = curPos; + + CUDA_CHECK(cudaMalloc((void**)&d_out, sizeof(double)*l)); + CUDA_CHECK(cudaMalloc((void**)&csrColIdxProblem, sizeof(int)*nnzProblem)); + CUDA_CHECK(cudaMalloc((void**)&csrValProblem, sizeof(double)*nnzProblem)); + CUDA_CHECK(cudaMalloc((void**)&csrValProblemSquare, sizeof(double)*nnzProblem)); + CUDA_CHECK(cudaMemcpy(csrColIdxProblem, csrColIdx.data(), sizeof(int)*nnzProblem, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(csrValProblem, csrVal.data(), sizeof(double)*nnzProblem, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(csrValProblemSquare, csrValSquare.data(), sizeof(double)*nnzProblem, cudaMemcpyHostToDevice)); + } +} + +void svm_model::setPreloadIdx(int nIdx) { + nPreloadIdx = nIdx; +} + +int svm_model::getPreloadX(int* nnz, int** csrColIdx, double** csrVal, double** csrValSquare) const { + if( problem==NULL ) + return 0; + int* csrRowPtr = csrRowPtrProblem + nPreloadIdx; + *nnz = 
*(csrRowPtr+1)-*csrRowPtr; + *csrColIdx = csrColIdxProblem + *csrRowPtr; + *csrVal = csrValProblem + *csrRowPtr; + *csrValSquare = csrValProblemSquare + *csrRowPtr; + return 1; } void svm_model::cuda_free() { CUDA_CHECK(cudaFree(csrRowPtrSV)); CUDA_CHECK(cudaFree(csrColIdxSV)); CUDA_CHECK(cudaFree(csrValSV)); + CUDA_CHECK(cudaFree(csrValSVSquare)); + free(csrRowPtrProblem); + CUDA_CHECK(cudaFree(csrColIdxProblem)); + CUDA_CHECK(cudaFree(csrValProblem)); + CUDA_CHECK(cudaFree(csrValProblemSquare)); + CUDA_CHECK(cudaFree(d_out)); } -__global__ void RBF_k_function(int nnzX, double* csrValX, int* csrColIdxX, +__global__ void RBF_k_function(int nnzX, double* csrValX, double* csrValXSquare, int* csrColIdxX, int countSV, - double* csrValSV, int* csrRowPtrSV, int* csrColIdxSV, + double* csrValSV, double* csrValSVSquare, int* csrRowPtrSV, int* csrColIdxSV, double* out) { int nSVIdx = blockDim.x*blockIdx.x + threadIdx.x; if( nSVIdx>=countSV ) return; int* ColIdxSV = csrColIdxSV + csrRowPtrSV[nSVIdx]; double* ValSV = csrValSV + csrRowPtrSV[nSVIdx]; + double* ValSVSquare = csrValSVSquare + csrRowPtrSV[nSVIdx]; int nnzSV = csrRowPtrSV[nSVIdx+1]-csrRowPtrSV[nSVIdx]; int colX = 0, colSV = 0; @@ -681,60 +769,39 @@ __global__ void RBF_k_function(int nnzX, double* csrValX, int* csrColIdxX, colSV++; } else { if( csrColIdxX[colX] < ColIdxSV[colSV] ) { - sum += csrValX[colX]*csrValX[colX]; + sum += csrValXSquare[colX]; colX++; } else { - sum += ValSV[colSV]*ValSV[colSV]; + sum += ValSVSquare[colSV]; colSV++; } } } while( colXparam; - - // get max_index - int max_index = model->nMaxIdxSV; - const svm_node *node; - node = x; - int x_nnz = 0; - while(node->index != -1) { - if( node->index > max_index ) - max_index = node->index; - x_nnz++; - node++; - } - max_index++; - - cusparseHandle_t cusparse = NULL; - cusparseMatDescr_t descrSV = NULL; - int* csrColIdxX = NULL; - double* csrValX = NULL; - char* buffer = NULL; - double* all_one = NULL; - double* d_out = NULL; - double* d_vectorX = NULL; - double* d_vectorY = NULL; switch(param.kernel_type) { case LINEAR: case POLY: case SIGMOID: { + cusparseHandle_t cusparse = NULL; + cusparseMatDescr_t descrSV = NULL; static const double one = 1.0; static const double zero = 0.0; - std::vector h_all_one, vectorX; + std::vector vectorX; size_t buffer_size; cusparseStatus_t cusparseResult = cusparseCreate(&cusparse); @@ -745,9 +812,22 @@ void CudaKernel::batch_k_function(const svm_node *x, const svm_model* model, dou cusparseSetMatType(descrSV, CUSPARSE_MATRIX_TYPE_GENERAL); cusparseSetPointerMode(cusparse, CUSPARSE_POINTER_MODE_HOST); + int max_index = model->nMaxIdxSV; + const svm_node *node = x; + int x_nnz = 0; + while(node->index != -1) { + if( node->index > max_index ) + max_index = node->index; + x_nnz++; + node++; + } + max_index++; + // get dense vector x ready + double* d_vector = NULL; + CUDA_CHECK(cudaMalloc((void**)&d_vector, sizeof(double)*(max_index+model->l))); + double* d_vectorX = d_vector; vectorX.resize(max_index, 0.0); - CUDA_CHECK(cudaMalloc((void**)&d_vectorX, sizeof(double)*max_index)); node = x; while(node->index != -1) { vectorX[node->index] = node->value; @@ -756,7 +836,7 @@ void CudaKernel::batch_k_function(const svm_node *x, const svm_model* model, dou CUDA_CHECK(cudaMemcpy(d_vectorX, vectorX.data(), sizeof(double)*max_index, cudaMemcpyHostToDevice)); // get dense vector y ready - CUDA_CHECK(cudaMalloc((void**)&d_vectorY, sizeof(double)*model->l)); + double* d_vectorY = d_vector + max_index; // get buffer size cusparseResult = 
cusparseCsrmvEx_bufferSize(cusparse, @@ -772,6 +852,7 @@ void CudaKernel::batch_k_function(const svm_node *x, const svm_model* model, dou d_vectorY, CUDA_R_64F, CUDA_R_64F, &buffer_size); assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + char* buffer = NULL; CUDA_CHECK(cudaMalloc ((void**)&buffer, buffer_size)); // y = dot(SV, x) @@ -801,36 +882,64 @@ void CudaKernel::batch_k_function(const svm_node *x, const svm_model* model, dou for( int i=0; il; i++ ) out[i] = tanh(param.gamma*out[i]+param.coef0); } + + cudaFree(d_vector); + cudaFree(buffer); + cusparseDestroyMatDescr(descrSV); + cusparseDestroy(cusparse); } break; case RBF: { - // get vector x ready - std::vector h_csrValX(x_nnz); - std::vector h_csrColIdxX(x_nnz); - int curPos = 0; - node = x; - while(node->index != -1) { - h_csrValX[curPos] = node->value; - h_csrColIdxX[curPos] = node->index; - curPos++; - node++; + const int threadPerBlock = 256; + int blockPerGrid = (model->l+threadPerBlock-1)/threadPerBlock; + int nnzX = 0; + int* csrColIdxX = NULL; + double* csrValX = NULL; + double* csrValXSquare = NULL; + if( model->getPreloadX(&nnzX, &csrColIdxX, &csrValX, &csrValXSquare) ) { + // vector X is preloaded(training) + RBF_k_function<<>>(nnzX, csrValX, csrValXSquare, csrColIdxX, + model->l, model->csrValSV, model->csrValSVSquare, model->csrRowPtrSV, model->csrColIdxSV, + model->d_out); + CUDA_CHECK(cudaMemcpy(out, model->d_out, sizeof(double)*model->l, cudaMemcpyDeviceToHost)); + } else { + // vector X is not preloaded(predict) + const svm_node *node = x; + int x_nnz = 0; + while(node->index != -1) { + x_nnz++; + node++; + } + + // get vector x ready + std::vector h_csrValX(x_nnz*2); + double* h_csrValXSquare = h_csrValX.data()+x_nnz; + std::vector h_csrColIdxX(x_nnz); + int curPos = 0; + node = x; + while(node->index != -1) { + h_csrValX[curPos] = node->value; + h_csrValXSquare[curPos] = h_csrValX[curPos]*h_csrValX[curPos]; + h_csrColIdxX[curPos] = node->index; + curPos++; + node++; + } + CUDA_CHECK(cudaMalloc((void**)&csrValX, sizeof(double)*(x_nnz*2+model->l)+sizeof(int)*x_nnz)); + csrValXSquare = csrValX + x_nnz; + double* d_out = csrValXSquare + x_nnz; + csrColIdxX = (int*)&d_out[model->l]; + CUDA_CHECK(cudaMemcpy(csrColIdxX, h_csrColIdxX.data(), sizeof(int)*x_nnz, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(csrValX, h_csrValX.data(), sizeof(double)*x_nnz*2, cudaMemcpyHostToDevice)); + + RBF_k_function<<>>(x_nnz, csrValX, csrValXSquare, csrColIdxX, + model->l, model->csrValSV, model->csrValSVSquare, model->csrRowPtrSV, model->csrColIdxSV, + d_out); + + // copy d_out back to host + CUDA_CHECK(cudaMemcpy(out, d_out, sizeof(double)*model->l, cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaFree(csrValX)); } - CUDA_CHECK(cudaMalloc((void**)&csrColIdxX, sizeof(int)*x_nnz)); - CUDA_CHECK(cudaMalloc((void**)&csrValX, sizeof(double)*x_nnz)); - CUDA_CHECK(cudaMalloc((void**)&d_out, sizeof(double)*model->l)); - CUDA_CHECK(cudaMemcpy(csrColIdxX, h_csrColIdxX.data(), sizeof(int)*x_nnz, cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(csrValX, h_csrValX.data(), sizeof(double)*x_nnz, cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemset(d_out, 0, sizeof(double)*model->l)); - - const int threadPerBlock=256; - int blockPerGrid=(model->l+threadPerBlock-1)/threadPerBlock; - RBF_k_function<<>>(x_nnz, csrValX, csrColIdxX, - model->l, model->csrValSV, model->csrRowPtrSV, model->csrColIdxSV, - d_out); - - // copy d_out back to host - CUDA_CHECK(cudaMemcpy(out, d_out, sizeof(double)*model->l, cudaMemcpyDeviceToHost)); // do out[i] = 
exp(-param.gamma*out[i]) for( int i=0; il; i++ ) @@ -843,16 +952,6 @@ void CudaKernel::batch_k_function(const svm_node *x, const svm_model* model, dou break; } - cudaFree(csrColIdxX); - cudaFree(csrValX); - cudaFree(buffer); - cudaFree(all_one); - cudaFree(d_out); - cudaFree(d_vectorX); - cudaFree(d_vectorY); - cusparseDestroyMatDescr(descrSV); - cusparseDestroy(cusparse); - // debug, compare with base class result from cpu /*std::vector Outbase(model->l, 0.0); KernelBase::batch_k_function(x, model, Outbase.data()); @@ -2459,8 +2558,15 @@ static void svm_binary_svc_probability( subparam.weight[0]=Cp; subparam.weight[1]=Cn; struct svm_model *submodel = svm_train(&subprob,&subparam); +#ifdef SVM_CUDA + submodel->cuda_preload_problem(prob); for(j=begin;jsetPreloadIdx(perm[j]); +#else + for(j=begin;jx[perm[j]],&(dec_values[perm[j]])); // ensure +1 -1 order; reason not using CV subroutine dec_values[perm[j]] *= submodel->label[0]; diff --git a/svm.h b/svm.h index 227ffbd0..5b2ca0d9 100644 --- a/svm.h +++ b/svm.h @@ -78,6 +78,20 @@ struct svm_model int* csrRowPtrSV; /* row index of matrix SV */ int* csrColIdxSV; /* column index of matrix SV */ double* csrValSV; /* values of matrix SV */ + double* csrValSVSquare; /* value^2 of matrix SV */ + + void cuda_preload_problem(const struct svm_problem *problem); /* load problem in gpu memory, non-thread-safe, speed up training */ + void setPreloadIdx(int nIdx); /* called before getPreloadX() */ + int getPreloadX(int* nnz, int** csrColIdx, double** csrVal, double** csrValSquare) const; /* get vector x */ + int nPreloadIdx; + const struct svm_problem *problem; + int* csrRowPtrProblem; /* row index of matrix problem */ + int nMaxIdxProblem; /* max index of problem */ + int nnzProblem; /* non-zero value count of problem */ + int* csrColIdxProblem; /* cloumn index of matrix problem */ + double* csrValProblem; /* values of matrix problem */ + double* csrValProblemSquare; /* value^2 of matrix problem */ + double* d_out; /* temp output */ #endif }; From 4dbd835fa0d5235b9a99d825a6b420ec2cb944e6 Mon Sep 17 00:00:00 2001 From: Ryan Pan Date: Wed, 5 Feb 2020 22:48:25 +0800 Subject: [PATCH 06/19] a faster RBF_kernel_function --- svm.cpp | 305 +++++++++++++------------------------------------------- svm.h | 14 +-- 2 files changed, 73 insertions(+), 246 deletions(-) diff --git a/svm.cpp b/svm.cpp index 06fef5f8..d85a6848 100644 --- a/svm.cpp +++ b/svm.cpp @@ -421,14 +421,14 @@ class CudaKernel : public KernelBase { std::vector m_csrRowPtr; // csr row ptr of x matrix int* d_csrColIdx; // csr col index of x matrix double* d_csrVal; // csr val vector of x matrix - double* d_vectorX; + double* d_squareX; // a gpu copy of x_square int m_sizeX; // size of x int m_nnz; // number of non-zero value of x matrix }; CudaKernel::CudaKernel(int l, svm_node * const * x, const svm_parameter& param) : KernelBase(l, x, param), m_cusparse(NULL), m_descr(NULL), m_max_index(0), m_csrRowPtr(l+1), - d_csrColIdx(NULL), d_csrVal(NULL), d_vectorX(NULL), m_sizeX(l), m_nnz(0) { + d_csrColIdx(NULL), d_csrVal(NULL), d_squareX(NULL), m_sizeX(l), m_nnz(0) { // Init handlers cusparseStatus_t cusparseResult = cusparseCreate(&m_cusparse); assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); @@ -474,23 +474,24 @@ void CudaKernel::x2csrMatrix(int sizeX) const { csrRowPtr[sizeX] = curPos; // copy csrVal, csrColIdx to device - if( !d_vectorX ) { - CUDA_CHECK(cudaMalloc((void**)&d_vectorX, sizeof(double)*m_max_index)); - } if( !d_csrColIdx ) { CUDA_CHECK(cudaMalloc((void**)&d_csrColIdx, sizeof(int)*m_nnz)); } 
if( !d_csrVal ) { CUDA_CHECK(cudaMalloc((void**)&d_csrVal, sizeof(double)*m_nnz)); } + if( !d_squareX ) { + CUDA_CHECK(cudaMalloc((void**)&d_squareX, sizeof(double)*sizeX)); + } CUDA_CHECK(cudaMemcpy(d_csrColIdx, csrColIdx.data(), sizeof(int)*m_nnz, cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(d_csrVal, csrVal.data(), sizeof(double)*m_nnz, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_squareX, x_square, sizeof(double)*sizeX, cudaMemcpyHostToDevice)); } CudaKernel::~CudaKernel() { - if(d_csrColIdx) cudaFree(d_csrColIdx); - if(d_csrVal) cudaFree(d_csrVal); - if(d_vectorX) cudaFree(d_vectorX); + CUDA_CHECK(cudaFree(d_csrColIdx)); + CUDA_CHECK(cudaFree(d_csrVal)); + CUDA_CHECK(cudaFree(d_squareX)); if(m_cusparse) cusparseDestroy(m_cusparse); if(m_descr) cusparseDestroyMatDescr(m_descr); } @@ -500,6 +501,28 @@ void CudaKernel::done_shrinking() const { x2csrMatrix(m_sizeX); } +__global__ void dot2kernelValue(int kernel_type, int len, double gamma, double coef0, double degree, double squareX, double* d_squareA, int start, double* Vector) { + int idx = blockDim.x*blockIdx.x + threadIdx.x; + if( idx >= len ) + return; + + // Vector input dot(), output kernel Value + switch(kernel_type) { + default: + case LINEAR: // do nothing + break; + case POLY: + Vector[idx] = pow(gamma*Vector[idx]+coef0,degree); + break; + case RBF: + Vector[idx] = exp(-gamma*(squareX+d_squareA[start+idx]-2*Vector[idx])); + break; + case SIGMOID: + Vector[idx] = tanh(gamma*Vector[idx]+coef0); + break; + } +} + void CudaKernel::batch_kernel_function(int i, Qfloat *Q, int start, int end) const { int len = end-start; @@ -508,27 +531,35 @@ void CudaKernel::batch_kernel_function(int i, Qfloat *Q, int start, int end) con int* csrColIndA = d_csrColIdx + m_csrRowPtr[start]; int nnz = m_csrRowPtr[len+start] - m_csrRowPtr[start]; + const svm_node *node = x[i]; + int max_index = 0; + while(node->index != -1) { + if( max_index < node->index ) + max_index = node->index; + node++; + } + max_index++; + double* d_vectors = NULL; + CUDA_CHECK(cudaMalloc((void**)&d_vectors, sizeof(int)*(len+1)+sizeof(double)*max_index+sizeof(double)*len)); + double* d_vectorX = d_vectors; + double* d_vectorY = &d_vectorX[max_index]; + int* csrRowPtrA = (int*)&d_vectorY[len]; + // get csrRowPtrA ready for cusparseCsrmvEx std::vector csrRow(len+1); for( int idx=0; idx<=len; idx++ ) { csrRow[idx] = m_csrRowPtr[idx+start] - m_csrRowPtr[start]; } - int* csrRowPtrA = NULL; - CUDA_CHECK(cudaMalloc((void**)&csrRowPtrA, sizeof(int)*(len+1))); CUDA_CHECK(cudaMemcpy(csrRowPtrA, csrRow.data(), sizeof(int)*(len+1), cudaMemcpyHostToDevice)); // get vector x ready for cusparseCsrmvEx - std::vector vectorX(m_max_index, 0.0); - const svm_node *node = x[i]; + std::vector vectorX(max_index, 0.0); + node = x[i]; while(node->index != -1) { vectorX[node->index] = node->value; node++; } - CUDA_CHECK(cudaMemcpy(d_vectorX, vectorX.data(), sizeof(double)*m_max_index, cudaMemcpyHostToDevice)); - - // get vector y ready for cusparseCsrmvEx - double* d_vectorY = NULL; - CUDA_CHECK(cudaMalloc((void**)&d_vectorY, sizeof(double)*len)); + CUDA_CHECK(cudaMemcpy(d_vectorX, vectorX.data(), sizeof(double)*max_index, cudaMemcpyHostToDevice)); // get temp buffer size and get it ready size_t buffer_size = 0; @@ -573,32 +604,20 @@ void CudaKernel::batch_kernel_function(int i, Qfloat *Q, int start, int end) con buffer); assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + // update out base on kernel_type + const int threadPerBlock = 256; + int blockPerGrid = 
(len+threadPerBlock-1)/threadPerBlock; + dot2kernelValue<<>>(kernel_type, len, gamma, coef0, + degree, x_square[i], d_squareX, start, d_vectorY); + // copy d_vectorY back to host std::vector vectorY(len); CUDA_CHECK(cudaMemcpy(vectorY.data(), d_vectorY, sizeof(double)*len, cudaMemcpyDeviceToHost)); for( int idx=0; idx csrVal(nnzSV); std::vector csrValSquare(nnzSV); std::vector csrColIdx(nnzSV); + std::vector hSVSquare(l); int curPos = 0; for( int i=0; iindex != -1) { csrVal[curPos] = node->value; csrValSquare[curPos] = csrVal[curPos]*csrVal[curPos]; csrColIdx[curPos] = node->index; + hSVSquare[i] += node->value*node->value; curPos++; node++; } @@ -657,82 +671,17 @@ void svm_model::cuda_init() { csrValSV = NULL; csrValSVSquare = NULL; csrRowPtrSV = NULL; + SVSquare = NULL; CUDA_CHECK(cudaMalloc((void**)&csrRowPtrSV, sizeof(int)*(l+1))); CUDA_CHECK(cudaMalloc((void**)&csrColIdxSV, sizeof(int)*nnzSV)); CUDA_CHECK(cudaMalloc((void**)&csrValSV, sizeof(double)*nnzSV)); CUDA_CHECK(cudaMalloc((void**)&csrValSVSquare, sizeof(double)*nnzSV)); + CUDA_CHECK(cudaMalloc((void**)&SVSquare, sizeof(double)*l)); CUDA_CHECK(cudaMemcpy(csrRowPtrSV, csrRowPtr.data(), sizeof(int)*(l+1), cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(csrColIdxSV, csrColIdx.data(), sizeof(int)*nnzSV, cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(csrValSV, csrVal.data(), sizeof(double)*nnzSV, cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(csrValSVSquare, csrValSquare.data(), sizeof(double)*nnzSV, cudaMemcpyHostToDevice)); -} - -void svm_model::cuda_preload_problem(const struct svm_problem *prob) { - free(csrRowPtrProblem); - CUDA_CHECK(cudaFree(csrColIdxProblem)); - CUDA_CHECK(cudaFree(csrValProblem)); - CUDA_CHECK(cudaFree(csrValProblemSquare)); - CUDA_CHECK(cudaFree(d_out)); - csrRowPtrProblem = NULL; - csrColIdxProblem = NULL; - csrValProblem = NULL; - csrValProblemSquare = NULL; - d_out = NULL; - problem = prob; - if( problem!=NULL ) { - nMaxIdxProblem = 0; - nnzProblem = 0; - const svm_node *node; - for( int i=0; il; i++ ) { - node = problem->x[i]; - while(node->index != -1) { - if( node->index > nMaxIdxProblem ) - nMaxIdxProblem = node->index; - nnzProblem++; - node++; - } - } - csrRowPtrProblem = Malloc(int,problem->l+1); - std::vector csrVal(nnzProblem); - std::vector csrValSquare(nnzProblem); - std::vector csrColIdx(nnzProblem); - int curPos = 0; - for( int i=0; il; i++ ) { - csrRowPtrProblem[i] = curPos; - node = problem->x[i]; - while(node->index != -1) { - csrVal[curPos] = node->value; - csrValSquare[curPos] = csrVal[curPos]*csrVal[curPos]; - csrColIdx[curPos] = node->index; - curPos++; - node++; - } - } - csrRowPtrProblem[problem->l] = curPos; - - CUDA_CHECK(cudaMalloc((void**)&d_out, sizeof(double)*l)); - CUDA_CHECK(cudaMalloc((void**)&csrColIdxProblem, sizeof(int)*nnzProblem)); - CUDA_CHECK(cudaMalloc((void**)&csrValProblem, sizeof(double)*nnzProblem)); - CUDA_CHECK(cudaMalloc((void**)&csrValProblemSquare, sizeof(double)*nnzProblem)); - CUDA_CHECK(cudaMemcpy(csrColIdxProblem, csrColIdx.data(), sizeof(int)*nnzProblem, cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(csrValProblem, csrVal.data(), sizeof(double)*nnzProblem, cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(csrValProblemSquare, csrValSquare.data(), sizeof(double)*nnzProblem, cudaMemcpyHostToDevice)); - } -} - -void svm_model::setPreloadIdx(int nIdx) { - nPreloadIdx = nIdx; -} - -int svm_model::getPreloadX(int* nnz, int** csrColIdx, double** csrVal, double** csrValSquare) const { - if( problem==NULL ) - return 0; - int* csrRowPtr = csrRowPtrProblem 
+ nPreloadIdx; - *nnz = *(csrRowPtr+1)-*csrRowPtr; - *csrColIdx = csrColIdxProblem + *csrRowPtr; - *csrVal = csrValProblem + *csrRowPtr; - *csrValSquare = csrValProblemSquare + *csrRowPtr; - return 1; + CUDA_CHECK(cudaMemcpy(SVSquare, hSVSquare.data(), sizeof(double)*l, cudaMemcpyHostToDevice)); } void svm_model::cuda_free() { @@ -740,53 +689,7 @@ void svm_model::cuda_free() { CUDA_CHECK(cudaFree(csrColIdxSV)); CUDA_CHECK(cudaFree(csrValSV)); CUDA_CHECK(cudaFree(csrValSVSquare)); - free(csrRowPtrProblem); - CUDA_CHECK(cudaFree(csrColIdxProblem)); - CUDA_CHECK(cudaFree(csrValProblem)); - CUDA_CHECK(cudaFree(csrValProblemSquare)); - CUDA_CHECK(cudaFree(d_out)); -} - -__global__ void RBF_k_function(int nnzX, double* csrValX, double* csrValXSquare, int* csrColIdxX, - int countSV, - double* csrValSV, double* csrValSVSquare, int* csrRowPtrSV, int* csrColIdxSV, - double* out) { - int nSVIdx = blockDim.x*blockIdx.x + threadIdx.x; - if( nSVIdx>=countSV ) - return; - int* ColIdxSV = csrColIdxSV + csrRowPtrSV[nSVIdx]; - double* ValSV = csrValSV + csrRowPtrSV[nSVIdx]; - double* ValSVSquare = csrValSVSquare + csrRowPtrSV[nSVIdx]; - int nnzSV = csrRowPtrSV[nSVIdx+1]-csrRowPtrSV[nSVIdx]; - - int colX = 0, colSV = 0; - double sum = 0.0; - while( colXnMaxIdxSV; const svm_node *node = x; int x_nnz = 0; + double squareX = 0.0; while(node->index != -1) { if( node->index > max_index ) max_index = node->index; x_nnz++; + squareX += node->value * node->value; node++; } max_index++; @@ -871,81 +777,21 @@ void CudaKernel::batch_k_function(const svm_node *x, const svm_model* model, dou CUDA_R_64F, buffer); assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + // update out base on kernel_type + const int threadPerBlock = 256; + int blockPerGrid = (model->l+threadPerBlock-1)/threadPerBlock; + dot2kernelValue<<>>(param.kernel_type, model->l, param.gamma, param.coef0, + param.degree, squareX, model->SVSquare, 0, d_vectorY); + // copy vectorY back to host CUDA_CHECK(cudaMemcpy(out, d_vectorY, sizeof(double)*model->l, cudaMemcpyDeviceToHost)); - // update out base on kernel_type - if( param.kernel_type==POLY ) { - for( int i=0; il; i++ ) - out[i] = powi(param.gamma*out[i]+param.coef0,param.degree); - } else if( param.kernel_type==SIGMOID ) { - for( int i=0; il; i++ ) - out[i] = tanh(param.gamma*out[i]+param.coef0); - } - cudaFree(d_vector); cudaFree(buffer); cusparseDestroyMatDescr(descrSV); cusparseDestroy(cusparse); } break; - case RBF: - { - const int threadPerBlock = 256; - int blockPerGrid = (model->l+threadPerBlock-1)/threadPerBlock; - int nnzX = 0; - int* csrColIdxX = NULL; - double* csrValX = NULL; - double* csrValXSquare = NULL; - if( model->getPreloadX(&nnzX, &csrColIdxX, &csrValX, &csrValXSquare) ) { - // vector X is preloaded(training) - RBF_k_function<<>>(nnzX, csrValX, csrValXSquare, csrColIdxX, - model->l, model->csrValSV, model->csrValSVSquare, model->csrRowPtrSV, model->csrColIdxSV, - model->d_out); - CUDA_CHECK(cudaMemcpy(out, model->d_out, sizeof(double)*model->l, cudaMemcpyDeviceToHost)); - } else { - // vector X is not preloaded(predict) - const svm_node *node = x; - int x_nnz = 0; - while(node->index != -1) { - x_nnz++; - node++; - } - - // get vector x ready - std::vector h_csrValX(x_nnz*2); - double* h_csrValXSquare = h_csrValX.data()+x_nnz; - std::vector h_csrColIdxX(x_nnz); - int curPos = 0; - node = x; - while(node->index != -1) { - h_csrValX[curPos] = node->value; - h_csrValXSquare[curPos] = h_csrValX[curPos]*h_csrValX[curPos]; - h_csrColIdxX[curPos] = node->index; - curPos++; - node++; - } - 
CUDA_CHECK(cudaMalloc((void**)&csrValX, sizeof(double)*(x_nnz*2+model->l)+sizeof(int)*x_nnz)); - csrValXSquare = csrValX + x_nnz; - double* d_out = csrValXSquare + x_nnz; - csrColIdxX = (int*)&d_out[model->l]; - CUDA_CHECK(cudaMemcpy(csrColIdxX, h_csrColIdxX.data(), sizeof(int)*x_nnz, cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(csrValX, h_csrValX.data(), sizeof(double)*x_nnz*2, cudaMemcpyHostToDevice)); - - RBF_k_function<<>>(x_nnz, csrValX, csrValXSquare, csrColIdxX, - model->l, model->csrValSV, model->csrValSVSquare, model->csrRowPtrSV, model->csrColIdxSV, - d_out); - - // copy d_out back to host - CUDA_CHECK(cudaMemcpy(out, d_out, sizeof(double)*model->l, cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaFree(csrValX)); - } - - // do out[i] = exp(-param.gamma*out[i]) - for( int i=0; il; i++ ) - out[i] = exp(-param.gamma*out[i]); - } - break; default: printf("unsupport kernel type\n"); exit(0); @@ -2558,15 +2404,8 @@ static void svm_binary_svc_probability( subparam.weight[0]=Cp; subparam.weight[1]=Cn; struct svm_model *submodel = svm_train(&subprob,&subparam); -#ifdef SVM_CUDA - submodel->cuda_preload_problem(prob); for(j=begin;jsetPreloadIdx(perm[j]); -#else - for(j=begin;jx[perm[j]],&(dec_values[perm[j]])); // ensure +1 -1 order; reason not using CV subroutine dec_values[perm[j]] *= submodel->label[0]; diff --git a/svm.h b/svm.h index 5b2ca0d9..0f546ccd 100644 --- a/svm.h +++ b/svm.h @@ -79,19 +79,7 @@ struct svm_model int* csrColIdxSV; /* column index of matrix SV */ double* csrValSV; /* values of matrix SV */ double* csrValSVSquare; /* value^2 of matrix SV */ - - void cuda_preload_problem(const struct svm_problem *problem); /* load problem in gpu memory, non-thread-safe, speed up training */ - void setPreloadIdx(int nIdx); /* called before getPreloadX() */ - int getPreloadX(int* nnz, int** csrColIdx, double** csrVal, double** csrValSquare) const; /* get vector x */ - int nPreloadIdx; - const struct svm_problem *problem; - int* csrRowPtrProblem; /* row index of matrix problem */ - int nMaxIdxProblem; /* max index of problem */ - int nnzProblem; /* non-zero value count of problem */ - int* csrColIdxProblem; /* cloumn index of matrix problem */ - double* csrValProblem; /* values of matrix problem */ - double* csrValProblemSquare; /* value^2 of matrix problem */ - double* d_out; /* temp output */ + double* SVSquare; /* dot(x,x) of matrix SV */ #endif }; From 75952f7417c0230a8c2d4a5687e4c0e823ce24bd Mon Sep 17 00:00:00 2001 From: Ryan Pan Date: Sat, 8 Feb 2020 20:29:10 +0800 Subject: [PATCH 07/19] speed up with OpenMP, 2x speed up than single thread one --- Makefile | 4 +-- svm.cpp | 96 +++++++++++++++++++++++++++----------------------------- 2 files changed, 48 insertions(+), 52 deletions(-) diff --git a/Makefile b/Makefile index 22980070..5d2d3bf0 100644 --- a/Makefile +++ b/Makefile @@ -27,6 +27,6 @@ clean: svm-train.o: svm-train.c $(CXX) $(CFLAGS) -c svm-train.c svm-cuda.o: svm.cpp - nvcc -O3 -x cu -DSVM_CUDA -c svm.cpp -o svm-cuda.o + nvcc -Xcompiler -fopenmp -O3 -x cu -DSVM_CUDA -c svm.cpp -o svm-cuda.o svm-train-cuda: svm-train.o svm-cuda.o - $(CXX) $(CFLAGS) svm-train.o svm-cuda.o -o svm-train-cuda -lm -lcusparse -lcublas -lcudart + $(CXX) $(CFLAGS) svm-train.o svm-cuda.o -o svm-train-cuda -lm -lcusparse -lcublas -lcudart -fopenmp diff --git a/svm.cpp b/svm.cpp index d85a6848..45c37c10 100644 --- a/svm.cpp +++ b/svm.cpp @@ -15,7 +15,9 @@ #include #include #include +#include #endif + int libsvm_version = LIBSVM_VERSION; typedef float Qfloat; typedef signed char schar; @@ -415,8 
+417,6 @@ class CudaKernel : public KernelBase { private: void x2csrMatrix(int sizeX) const; //convert svm_node** x -> csr format matrix private: - cusparseHandle_t m_cusparse; - cusparseMatDescr_t m_descr; int m_max_index; // max column of x matrix std::vector m_csrRowPtr; // csr row ptr of x matrix int* d_csrColIdx; // csr col index of x matrix @@ -427,16 +427,8 @@ class CudaKernel : public KernelBase { }; CudaKernel::CudaKernel(int l, svm_node * const * x, const svm_parameter& param) : KernelBase(l, x, param), - m_cusparse(NULL), m_descr(NULL), m_max_index(0), m_csrRowPtr(l+1), + m_max_index(0), m_csrRowPtr(l+1), d_csrColIdx(NULL), d_csrVal(NULL), d_squareX(NULL), m_sizeX(l), m_nnz(0) { - // Init handlers - cusparseStatus_t cusparseResult = cusparseCreate(&m_cusparse); - assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); - cusparseResult = cusparseCreateMatDescr(&m_descr); - assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); - cusparseSetMatIndexBase(m_descr,CUSPARSE_INDEX_BASE_ZERO); - cusparseSetMatType(m_descr, CUSPARSE_MATRIX_TYPE_GENERAL); - // get Num-non-zero and max_index m_max_index = 0; const svm_node *node; @@ -492,8 +484,6 @@ CudaKernel::~CudaKernel() { CUDA_CHECK(cudaFree(d_csrColIdx)); CUDA_CHECK(cudaFree(d_csrVal)); CUDA_CHECK(cudaFree(d_squareX)); - if(m_cusparse) cusparseDestroy(m_cusparse); - if(m_descr) cusparseDestroyMatDescr(m_descr); } void CudaKernel::done_shrinking() const { @@ -525,24 +515,25 @@ __global__ void dot2kernelValue(int kernel_type, int len, double gamma, double c void CudaKernel::batch_kernel_function(int i, Qfloat *Q, int start, int end) const { int len = end-start; + cusparseHandle_t cusparse = NULL; + cusparseMatDescr_t descr = NULL; + cusparseStatus_t cusparseResult = cusparseCreate(&cusparse); + assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + cusparseResult = cusparseCreateMatDescr(&descr); + assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO); + cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL); + cusparseSetPointerMode(cusparse, CUSPARSE_POINTER_MODE_HOST); // get csrValA, csrColIndA, nnz ready for cusparseCsrmvEx double* csrValA = d_csrVal + m_csrRowPtr[start]; int* csrColIndA = d_csrColIdx + m_csrRowPtr[start]; int nnz = m_csrRowPtr[len+start] - m_csrRowPtr[start]; - const svm_node *node = x[i]; - int max_index = 0; - while(node->index != -1) { - if( max_index < node->index ) - max_index = node->index; - node++; - } - max_index++; double* d_vectors = NULL; - CUDA_CHECK(cudaMalloc((void**)&d_vectors, sizeof(int)*(len+1)+sizeof(double)*max_index+sizeof(double)*len)); + CUDA_CHECK(cudaMalloc((void**)&d_vectors, sizeof(int)*(len+1)+sizeof(double)*m_max_index+sizeof(double)*len)); double* d_vectorX = d_vectors; - double* d_vectorY = &d_vectorX[max_index]; + double* d_vectorY = &d_vectorX[m_max_index]; int* csrRowPtrA = (int*)&d_vectorY[len]; // get csrRowPtrA ready for cusparseCsrmvEx @@ -553,26 +544,27 @@ void CudaKernel::batch_kernel_function(int i, Qfloat *Q, int start, int end) con CUDA_CHECK(cudaMemcpy(csrRowPtrA, csrRow.data(), sizeof(int)*(len+1), cudaMemcpyHostToDevice)); // get vector x ready for cusparseCsrmvEx - std::vector vectorX(max_index, 0.0); - node = x[i]; + std::vector vectorX(m_max_index, 0.0); + const svm_node* node = x[i]; while(node->index != -1) { vectorX[node->index] = node->value; node++; } - CUDA_CHECK(cudaMemcpy(d_vectorX, vectorX.data(), sizeof(double)*max_index, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_vectorX, vectorX.data(), 
sizeof(double)*m_max_index, cudaMemcpyHostToDevice)); // get temp buffer size and get it ready + // https://docs.nvidia.com/cuda/cusparse/index.html#asynchronous-execution size_t buffer_size = 0; static double h_one = 1.0; static double h_zero = 0.0; - cusparseStatus_t cusparseResult = cusparseCsrmvEx_bufferSize(m_cusparse, + cusparseResult = cusparseCsrmvEx_bufferSize(cusparse, CUSPARSE_ALG0, CUSPARSE_OPERATION_NON_TRANSPOSE, len, m_max_index, nnz, &h_one, CUDA_R_64F, - m_descr, + descr, csrValA, CUDA_R_64F, csrRowPtrA, csrColIndA, @@ -586,14 +578,14 @@ void CudaKernel::batch_kernel_function(int i, Qfloat *Q, int start, int end) con CUDA_CHECK(cudaMalloc ((void**)&buffer, buffer_size)); // call cusparseCsrmvEx - cusparseResult = cusparseCsrmvEx(m_cusparse, + cusparseResult = cusparseCsrmvEx(cusparse, CUSPARSE_ALG0, CUSPARSE_OPERATION_NON_TRANSPOSE, len, m_max_index, nnz, &h_one, CUDA_R_64F, - m_descr, + descr, csrValA, CUDA_R_64F, csrRowPtrA, csrColIndA, @@ -617,8 +609,10 @@ void CudaKernel::batch_kernel_function(int i, Qfloat *Q, int start, int end) con Q[start+idx] = (Qfloat)(vectorY[idx]); } - cudaFree(d_vectors); - cudaFree(buffer); + CUDA_CHECK(cudaFree(d_vectors)); + CUDA_CHECK(cudaFree(buffer)); + cusparseDestroyMatDescr(descr); + cusparseDestroy(cusparse); // debug, compare with base class result from cpu /*std::vector Qbase(start+len, 0.0); @@ -643,6 +637,7 @@ void svm_model::cuda_init() { node++; } } + nMaxIdxSV++; // get csrVal csrColIdx ready std::vector csrRowPtr(l+1); @@ -716,39 +711,35 @@ void CudaKernel::batch_k_function(const svm_node *x, const svm_model* model, dou cusparseSetMatType(descrSV, CUSPARSE_MATRIX_TYPE_GENERAL); cusparseSetPointerMode(cusparse, CUSPARSE_POINTER_MODE_HOST); - int max_index = model->nMaxIdxSV; const svm_node *node = x; int x_nnz = 0; double squareX = 0.0; while(node->index != -1) { - if( node->index > max_index ) - max_index = node->index; x_nnz++; squareX += node->value * node->value; node++; } - max_index++; // get dense vector x ready double* d_vector = NULL; - CUDA_CHECK(cudaMalloc((void**)&d_vector, sizeof(double)*(max_index+model->l))); + CUDA_CHECK(cudaMalloc((void**)&d_vector, sizeof(double)*(model->nMaxIdxSV+model->l))); double* d_vectorX = d_vector; - vectorX.resize(max_index, 0.0); + vectorX.resize(model->nMaxIdxSV, 0.0); node = x; while(node->index != -1) { vectorX[node->index] = node->value; node++; } - CUDA_CHECK(cudaMemcpy(d_vectorX, vectorX.data(), sizeof(double)*max_index, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_vectorX, vectorX.data(), sizeof(double)*model->nMaxIdxSV, cudaMemcpyHostToDevice)); // get dense vector y ready - double* d_vectorY = d_vector + max_index; - + double* d_vectorY = d_vector + model->nMaxIdxSV; + // get buffer size cusparseResult = cusparseCsrmvEx_bufferSize(cusparse, CUSPARSE_ALG0, CUSPARSE_OPERATION_NON_TRANSPOSE, - model->l, max_index, + model->l, model->nMaxIdxSV, model->nnzSV, &one, CUDA_R_64F, descrSV, model->csrValSV, CUDA_R_64F, @@ -765,7 +756,7 @@ void CudaKernel::batch_k_function(const svm_node *x, const svm_model* model, dou cusparseResult = cusparseCsrmvEx(cusparse, CUSPARSE_ALG0, CUSPARSE_OPERATION_NON_TRANSPOSE, - model->l, max_index, + model->l, model->nMaxIdxSV, model->nnzSV, &one, CUDA_R_64F, descrSV, @@ -786,8 +777,8 @@ void CudaKernel::batch_k_function(const svm_node *x, const svm_model* model, dou // copy vectorY back to host CUDA_CHECK(cudaMemcpy(out, d_vectorY, sizeof(double)*model->l, cudaMemcpyDeviceToHost)); - cudaFree(d_vector); - cudaFree(buffer); + 
CUDA_CHECK(cudaFree(d_vector)); + CUDA_CHECK(cudaFree(buffer)); cusparseDestroyMatDescr(descrSV); cusparseDestroy(cusparse); } @@ -2634,10 +2625,14 @@ svm_model *svm_train(const svm_problem *prob, const svm_parameter *param) probB=Malloc(double,nr_class*(nr_class-1)/2); } - int p = 0; - for(i=0;iprobability) svm_binary_svc_probability(&sub_prob,param,weighted_C[i],weighted_C[j],probA[p],probB[p]); - f[p] = svm_train_one(&sub_prob,param,weighted_C[i],weighted_C[j]); + for(k=0;k 0) nonzero[si+k] = true; @@ -2668,8 +2663,9 @@ svm_model *svm_train(const svm_problem *prob, const svm_parameter *param) nonzero[sj+k] = true; free(sub_prob.x); free(sub_prob.y); - ++p; } + t += (nr_class-i-1); + } // build output @@ -2720,7 +2716,7 @@ svm_model *svm_train(const svm_problem *prob, const svm_parameter *param) model->l = total_sv; model->SV = Malloc(svm_node *,total_sv); model->sv_indices = Malloc(int,total_sv); - p = 0; + int p = 0; for(i=0;i Date: Sat, 8 Feb 2020 20:42:02 +0800 Subject: [PATCH 08/19] Update README --- README | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README b/README index a22756a4..9cbd0c19 100644 --- a/README +++ b/README @@ -4,6 +4,14 @@ make clean; make svm-train-cuda How to use: run svm-train-cuda instead of svm-train, with same command line arguments of svm-train +multithreading +svm-train-cuda is multithreading by default(with OpenMP), If you are runing a svm classification job with more than 2 tags, it can much faster than single thread one. + +If you run out of memory or GPU memory, you can disable it with environment: +export OMP_NUM_THREADS=1 + +You can try setting OMP_NUM_THREADS to another number, balancing the speed and GPU/CPU usage. + /////////////////////////////////////////////////////////////////////////////////////// From 62243fbaff637437a7b35e7b50c3b0f567c23348 Mon Sep 17 00:00:00 2001 From: Ryan Pan Date: Sat, 8 Feb 2020 20:46:35 +0800 Subject: [PATCH 09/19] Update README --- README | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README b/README index 9cbd0c19..42aa7e94 100644 --- a/README +++ b/README @@ -6,12 +6,13 @@ run svm-train-cuda instead of svm-train, with same command line arguments of svm multithreading svm-train-cuda is multithreading by default(with OpenMP), If you are runing a svm classification job with more than 2 tags, it can much faster than single thread one. - If you run out of memory or GPU memory, you can disable it with environment: export OMP_NUM_THREADS=1 - You can try setting OMP_NUM_THREADS to another number, balancing the speed and GPU/CPU usage. 
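The speedup for jobs with more than two classes comes from the fact that the one-vs-one sub-problems are independent, so OpenMP can train several class pairs at once. A minimal sketch of that idea, assuming a flattened pair index; decode_pair and train_one_pair are hypothetical helpers for illustration, not the patch's exact loop:

#include <omp.h>

/* Map a flat pair index p back to the class pair (i, j), i < j. */
static void decode_pair(int p, int nr_class, int *out_i, int *out_j)
{
	int i = 0, offset = 0;
	while (p >= offset + (nr_class - i - 1)) {	/* skip the pairs owned by class i */
		offset += nr_class - i - 1;
		i++;
	}
	*out_i = i;
	*out_j = i + 1 + (p - offset);
}

void train_all_pairs(int nr_class)
{
	int total = nr_class * (nr_class - 1) / 2;
	#pragma omp parallel for schedule(dynamic)
	for (int p = 0; p < total; p++) {
		int i, j;
		decode_pair(p, nr_class, &i, &j);
		/* train_one_pair(i, j); each (i, j) sub-problem touches only its own data */
	}
}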
+tips +try add -h 0 as arguments of svm-train-cuda, it may faster than default one(-h 1) + /////////////////////////////////////////////////////////////////////////////////////// From d3df56df23a09bc5b5ff4fe335b0987d5d66afb6 Mon Sep 17 00:00:00 2001 From: Ryan Pan Date: Sun, 9 Feb 2020 01:21:59 +0800 Subject: [PATCH 10/19] use OpenMP collapse() for better performance --- svm.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/svm.cpp b/svm.cpp index 45c37c10..60d7ed87 100644 --- a/svm.cpp +++ b/svm.cpp @@ -2625,14 +2625,13 @@ svm_model *svm_train(const svm_problem *prob, const svm_parameter *param) probB=Malloc(double,nr_class*(nr_class-1)/2); } - int t = 0; - for(i=0;i Date: Sun, 9 Feb 2020 10:04:10 +0800 Subject: [PATCH 11/19] default omp_set_num_threads set to 2 --- svm.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/svm.cpp b/svm.cpp index 60d7ed87..44700dd7 100644 --- a/svm.cpp +++ b/svm.cpp @@ -2626,6 +2626,8 @@ svm_model *svm_train(const svm_problem *prob, const svm_parameter *param) } #ifdef SVM_CUDA + if( !getenv("OMP_NUM_THREADS") ) + omp_set_num_threads(2); #pragma omp parallel for if(nr_class!=2) schedule(dynamic) collapse(2) #endif for(i=0;i Date: Mon, 10 Feb 2020 11:51:28 +0800 Subject: [PATCH 12/19] Change loop style to make new version OpenMP happy --- svm.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/svm.cpp b/svm.cpp index 44700dd7..daf514b9 100644 --- a/svm.cpp +++ b/svm.cpp @@ -2627,13 +2627,14 @@ svm_model *svm_train(const svm_problem *prob, const svm_parameter *param) #ifdef SVM_CUDA if( !getenv("OMP_NUM_THREADS") ) - omp_set_num_threads(2); - #pragma omp parallel for if(nr_class!=2) schedule(dynamic) collapse(2) + omp_set_num_threads(1); + #pragma omp parallel for if(nr_class!=2) schedule(dynamic) collapse(1) #endif - for(i=0;i=nr_class ) + continue; svm_problem sub_prob; int si = start[i], sj = start[j]; int ci = count[i], cj = count[j]; From 6b86f45d8daa6cff1bf04249abeec43d722f5d8c Mon Sep 17 00:00:00 2001 From: panqingfeng Date: Tue, 11 Feb 2020 10:14:45 +0800 Subject: [PATCH 13/19] less cudaMalloc/cudaFree, a new -u command line argument --- svm-train.c | 5 ++ svm.cpp | 163 ++++++++++++++++++++++++++++++++-------------------- svm.h | 1 + 3 files changed, 108 insertions(+), 61 deletions(-) diff --git a/svm-train.c b/svm-train.c index b6ce987d..a5a4d5ad 100644 --- a/svm-train.c +++ b/svm-train.c @@ -38,6 +38,7 @@ void exit_with_help() "-wi weight : set the parameter C of class i to weight*C, for C-SVC (default 1)\n" "-v n: n-fold cross validation mode\n" "-q : quiet mode (no outputs)\n" + "-u n : cuda train threads count (default 1)\n" ); exit(1); } @@ -179,6 +180,7 @@ void parse_command_line(int argc, char **argv, char *input_file_name, char *mode param.nr_weight = 0; param.weight_label = NULL; param.weight = NULL; + param.cuda_threads = 1; cross_validation = 0; // parse options @@ -245,6 +247,9 @@ void parse_command_line(int argc, char **argv, char *input_file_name, char *mode param.weight_label[param.nr_weight-1] = atoi(&argv[i-1][2]); param.weight[param.nr_weight-1] = atof(argv[i]); break; + case 'u': + param.cuda_threads = atoi(argv[i]); + break; default: fprintf(stderr,"Unknown option: -%c\n", argv[i-1][1]); exit_with_help(); diff --git a/svm.cpp b/svm.cpp index daf514b9..18eb3b9f 100644 --- a/svm.cpp +++ b/svm.cpp @@ -424,11 +424,27 @@ class CudaKernel : public KernelBase { double* d_squareX; // a gpu copy of x_square int m_sizeX; // size of x int m_nnz; // number of non-zero value of x 
matrix + + cusparseHandle_t m_cusparse; + cusparseMatDescr_t m_descr; + cudaStream_t m_stream; + double* d_vectorX; + double* d_vectorY; + int* d_csrRowPtrA; + double* h_vectorX; + double* h_vectorY; + int* h_csrRowPtrA; + char* d_defaultBuf; + static int defaultBufSize; }; +int CudaKernel::defaultBufSize = 10240; CudaKernel::CudaKernel(int l, svm_node * const * x, const svm_parameter& param) : KernelBase(l, x, param), m_max_index(0), m_csrRowPtr(l+1), - d_csrColIdx(NULL), d_csrVal(NULL), d_squareX(NULL), m_sizeX(l), m_nnz(0) { + d_csrColIdx(NULL), d_csrVal(NULL), d_squareX(NULL), m_sizeX(l), m_nnz(0), + m_cusparse(NULL), m_descr(NULL), m_stream(NULL), d_vectorX(NULL), d_vectorY(NULL), + d_csrRowPtrA(NULL), h_vectorX(NULL), h_vectorY(NULL), h_csrRowPtrA(NULL), + d_defaultBuf(NULL) { // get Num-non-zero and max_index m_max_index = 0; const svm_node *node; @@ -443,6 +459,26 @@ CudaKernel::CudaKernel(int l, svm_node * const * x, const svm_parameter& param) } m_max_index++; + CUDA_CHECK(cudaStreamCreate(&m_stream)); + cusparseStatus_t cusparseResult = cusparseCreate(&m_cusparse); + assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + cusparseResult = cusparseCreateMatDescr(&m_descr); + assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + cusparseSetMatIndexBase(m_descr,CUSPARSE_INDEX_BASE_ZERO); + cusparseSetMatType(m_descr, CUSPARSE_MATRIX_TYPE_GENERAL); + cusparseSetPointerMode(m_cusparse, CUSPARSE_POINTER_MODE_HOST); + + CUDA_CHECK(cudaMalloc((void**)&d_vectorX, sizeof(double)*m_max_index)); + CUDA_CHECK(cudaMalloc((void**)&d_vectorY, sizeof(double)*l)); + CUDA_CHECK(cudaMalloc((void**)&d_csrRowPtrA, sizeof(int)*(l+1))); + CUDA_CHECK(cudaMallocHost((void**)&h_vectorX, sizeof(double)*m_max_index)); + CUDA_CHECK(cudaMallocHost((void**)&h_vectorY, sizeof(double)*l)); + CUDA_CHECK(cudaMallocHost((void**)&h_csrRowPtrA, sizeof(int)*(l+1))); + CUDA_CHECK(cudaMalloc((void**)&d_defaultBuf, defaultBufSize)); + CUDA_CHECK(cudaMalloc((void**)&d_csrColIdx, sizeof(int)*m_nnz)); + CUDA_CHECK(cudaMalloc((void**)&d_csrVal, sizeof(double)*m_nnz)); + CUDA_CHECK(cudaMalloc((void**)&d_squareX, sizeof(double)*l)); + x2csrMatrix(l); } @@ -466,15 +502,6 @@ void CudaKernel::x2csrMatrix(int sizeX) const { csrRowPtr[sizeX] = curPos; // copy csrVal, csrColIdx to device - if( !d_csrColIdx ) { - CUDA_CHECK(cudaMalloc((void**)&d_csrColIdx, sizeof(int)*m_nnz)); - } - if( !d_csrVal ) { - CUDA_CHECK(cudaMalloc((void**)&d_csrVal, sizeof(double)*m_nnz)); - } - if( !d_squareX ) { - CUDA_CHECK(cudaMalloc((void**)&d_squareX, sizeof(double)*sizeX)); - } CUDA_CHECK(cudaMemcpy(d_csrColIdx, csrColIdx.data(), sizeof(int)*m_nnz, cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(d_csrVal, csrVal.data(), sizeof(double)*m_nnz, cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(d_squareX, x_square, sizeof(double)*sizeX, cudaMemcpyHostToDevice)); @@ -484,6 +511,16 @@ CudaKernel::~CudaKernel() { CUDA_CHECK(cudaFree(d_csrColIdx)); CUDA_CHECK(cudaFree(d_csrVal)); CUDA_CHECK(cudaFree(d_squareX)); + CUDA_CHECK(cudaFree(d_vectorX)); + CUDA_CHECK(cudaFree(d_vectorY)); + CUDA_CHECK(cudaFree(d_csrRowPtrA)); + CUDA_CHECK(cudaFree(d_defaultBuf)); + CUDA_CHECK(cudaFreeHost(h_vectorX)); + CUDA_CHECK(cudaFreeHost(h_vectorY)); + CUDA_CHECK(cudaFreeHost(h_csrRowPtrA)); + cusparseDestroyMatDescr(m_descr); + cusparseDestroy(m_cusparse); + CUDA_CHECK(cudaStreamDestroy(m_stream)); } void CudaKernel::done_shrinking() const { @@ -515,58 +552,44 @@ __global__ void dot2kernelValue(int kernel_type, int len, double gamma, double c void 
CudaKernel::batch_kernel_function(int i, Qfloat *Q, int start, int end) const { int len = end-start; - cusparseHandle_t cusparse = NULL; - cusparseMatDescr_t descr = NULL; - cusparseStatus_t cusparseResult = cusparseCreate(&cusparse); - assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); - cusparseResult = cusparseCreateMatDescr(&descr); - assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); - cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO); - cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL); - cusparseSetPointerMode(cusparse, CUSPARSE_POINTER_MODE_HOST); + assert(len<=m_sizeX); // get csrValA, csrColIndA, nnz ready for cusparseCsrmvEx double* csrValA = d_csrVal + m_csrRowPtr[start]; int* csrColIndA = d_csrColIdx + m_csrRowPtr[start]; int nnz = m_csrRowPtr[len+start] - m_csrRowPtr[start]; - double* d_vectors = NULL; - CUDA_CHECK(cudaMalloc((void**)&d_vectors, sizeof(int)*(len+1)+sizeof(double)*m_max_index+sizeof(double)*len)); - double* d_vectorX = d_vectors; - double* d_vectorY = &d_vectorX[m_max_index]; - int* csrRowPtrA = (int*)&d_vectorY[len]; - // get csrRowPtrA ready for cusparseCsrmvEx - std::vector csrRow(len+1); for( int idx=0; idx<=len; idx++ ) { - csrRow[idx] = m_csrRowPtr[idx+start] - m_csrRowPtr[start]; + h_csrRowPtrA[idx] = m_csrRowPtr[idx+start] - m_csrRowPtr[start]; } - CUDA_CHECK(cudaMemcpy(csrRowPtrA, csrRow.data(), sizeof(int)*(len+1), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpyAsync(d_csrRowPtrA, h_csrRowPtrA, sizeof(int)*(len+1), cudaMemcpyHostToDevice, m_stream)); // get vector x ready for cusparseCsrmvEx - std::vector vectorX(m_max_index, 0.0); + memset(h_vectorX, 0, sizeof(double)*m_max_index); const svm_node* node = x[i]; while(node->index != -1) { - vectorX[node->index] = node->value; + h_vectorX[node->index] = node->value; node++; } - CUDA_CHECK(cudaMemcpy(d_vectorX, vectorX.data(), sizeof(double)*m_max_index, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpyAsync(d_vectorX, h_vectorX, sizeof(double)*m_max_index, cudaMemcpyHostToDevice, m_stream)); // get temp buffer size and get it ready // https://docs.nvidia.com/cuda/cusparse/index.html#asynchronous-execution size_t buffer_size = 0; - static double h_one = 1.0; - static double h_zero = 0.0; - cusparseResult = cusparseCsrmvEx_bufferSize(cusparse, + static const double h_zero = 0.0; + static const double h_one = 1.0; + cusparseSetStream(m_cusparse, m_stream); + cusparseStatus_t cusparseResult = cusparseCsrmvEx_bufferSize(m_cusparse, CUSPARSE_ALG0, CUSPARSE_OPERATION_NON_TRANSPOSE, len, m_max_index, nnz, &h_one, CUDA_R_64F, - descr, + m_descr, csrValA, CUDA_R_64F, - csrRowPtrA, + d_csrRowPtrA, csrColIndA, d_vectorX, CUDA_R_64F, &h_zero, CUDA_R_64F, @@ -574,20 +597,26 @@ void CudaKernel::batch_kernel_function(int i, Qfloat *Q, int start, int end) con CUDA_R_64F, &buffer_size); assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + void* alloc_buffer = NULL; void* buffer = NULL; - CUDA_CHECK(cudaMalloc ((void**)&buffer, buffer_size)); + if( buffer_size > defaultBufSize ) { + CUDA_CHECK(cudaMalloc((void**)&alloc_buffer, buffer_size)); + buffer = alloc_buffer; + } else + buffer = d_defaultBuf; // call cusparseCsrmvEx - cusparseResult = cusparseCsrmvEx(cusparse, + cusparseSetStream(m_cusparse, m_stream); + cusparseResult = cusparseCsrmvEx(m_cusparse, CUSPARSE_ALG0, CUSPARSE_OPERATION_NON_TRANSPOSE, len, m_max_index, nnz, &h_one, CUDA_R_64F, - descr, + m_descr, csrValA, CUDA_R_64F, - csrRowPtrA, + d_csrRowPtrA, csrColIndA, d_vectorX, CUDA_R_64F, &h_zero, CUDA_R_64F, @@ -599,20 +628,18 @@ void 
CudaKernel::batch_kernel_function(int i, Qfloat *Q, int start, int end) con // update out base on kernel_type const int threadPerBlock = 256; int blockPerGrid = (len+threadPerBlock-1)/threadPerBlock; - dot2kernelValue<<>>(kernel_type, len, gamma, coef0, + dot2kernelValue<<>>(kernel_type, len, gamma, coef0, degree, x_square[i], d_squareX, start, d_vectorY); // copy d_vectorY back to host - std::vector vectorY(len); - CUDA_CHECK(cudaMemcpy(vectorY.data(), d_vectorY, sizeof(double)*len, cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(h_vectorY, d_vectorY, sizeof(double)*len, cudaMemcpyDeviceToHost, m_stream)); + CUDA_CHECK(cudaStreamSynchronize(m_stream)); for( int idx=0; idx Qbase(start+len, 0.0); @@ -698,11 +725,12 @@ void CudaKernel::batch_k_function(const svm_node *x, const svm_model* model, dou { cusparseHandle_t cusparse = NULL; cusparseMatDescr_t descrSV = NULL; + cudaStream_t stream = NULL; static const double one = 1.0; static const double zero = 0.0; - std::vector vectorX; size_t buffer_size; + CUDA_CHECK(cudaStreamCreate(&stream)); cusparseStatus_t cusparseResult = cusparseCreate(&cusparse); assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); cusparseResult = cusparseCreateMatDescr(&descrSV); @@ -724,18 +752,22 @@ void CudaKernel::batch_k_function(const svm_node *x, const svm_model* model, dou double* d_vector = NULL; CUDA_CHECK(cudaMalloc((void**)&d_vector, sizeof(double)*(model->nMaxIdxSV+model->l))); double* d_vectorX = d_vector; - vectorX.resize(model->nMaxIdxSV, 0.0); + double* d_vectorY = d_vector + model->nMaxIdxSV; + double* h_vectors = NULL; + CUDA_CHECK(cudaMallocHost((void**)&h_vectors, sizeof(double)*(model->nMaxIdxSV+model->l))); + double* vectorX = h_vectors; + double* vectorY = vectorX + model->nMaxIdxSV; + + memset(vectorX, 0, model->nMaxIdxSV); node = x; while(node->index != -1) { vectorX[node->index] = node->value; node++; } - CUDA_CHECK(cudaMemcpy(d_vectorX, vectorX.data(), sizeof(double)*model->nMaxIdxSV, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpyAsync(d_vectorX, vectorX, sizeof(double)*model->nMaxIdxSV, cudaMemcpyHostToDevice, stream)); - // get dense vector y ready - double* d_vectorY = d_vector + model->nMaxIdxSV; - // get buffer size + cusparseSetStream(cusparse, stream); cusparseResult = cusparseCsrmvEx_bufferSize(cusparse, CUSPARSE_ALG0, CUSPARSE_OPERATION_NON_TRANSPOSE, @@ -750,9 +782,10 @@ void CudaKernel::batch_k_function(const svm_node *x, const svm_model* model, dou CUDA_R_64F, &buffer_size); assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); char* buffer = NULL; - CUDA_CHECK(cudaMalloc ((void**)&buffer, buffer_size)); + CUDA_CHECK(cudaMalloc((void**)&buffer, buffer_size)); // y = dot(SV, x) + cusparseSetStream(cusparse, stream); cusparseResult = cusparseCsrmvEx(cusparse, CUSPARSE_ALG0, CUSPARSE_OPERATION_NON_TRANSPOSE, @@ -771,16 +804,20 @@ void CudaKernel::batch_k_function(const svm_node *x, const svm_model* model, dou // update out base on kernel_type const int threadPerBlock = 256; int blockPerGrid = (model->l+threadPerBlock-1)/threadPerBlock; - dot2kernelValue<<>>(param.kernel_type, model->l, param.gamma, param.coef0, + dot2kernelValue<<>>(param.kernel_type, model->l, param.gamma, param.coef0, param.degree, squareX, model->SVSquare, 0, d_vectorY); // copy vectorY back to host - CUDA_CHECK(cudaMemcpy(out, d_vectorY, sizeof(double)*model->l, cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(vectorY, d_vectorY, sizeof(double)*model->l, cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + memcpy(out, 
vectorY, sizeof(double)*model->l); CUDA_CHECK(cudaFree(d_vector)); + CUDA_CHECK(cudaFreeHost(h_vectors)); CUDA_CHECK(cudaFree(buffer)); cusparseDestroyMatDescr(descrSV); cusparseDestroy(cusparse); + CUDA_CHECK(cudaStreamDestroy(stream)); } break; default: @@ -2342,6 +2379,9 @@ static void svm_binary_svc_probability( int j = i+rand()%(prob->l-i); swap(perm[i],perm[j]); } +#ifdef SVM_CUDA + #pragma omp parallel for schedule(dynamic) num_threads(param->cuda_threads) +#endif for(i=0;il/nr_fold; @@ -2626,12 +2666,12 @@ svm_model *svm_train(const svm_problem *prob, const svm_parameter *param) } #ifdef SVM_CUDA - if( !getenv("OMP_NUM_THREADS") ) - omp_set_num_threads(1); - #pragma omp parallel for if(nr_class!=2) schedule(dynamic) collapse(1) + // Another "#pragma omp parallel for" in svm_binary_svc_probability(), if param->probability + #pragma omp parallel for schedule(dynamic) if(nr_class!=2 && !param->probability) num_threads(param->cuda_threads) #endif for(int p=0; p=nr_class ) continue; @@ -3229,6 +3269,7 @@ bool read_model_header(FILE *fp, svm_model* model) param.nr_weight = 0; param.weight_label = NULL; param.weight = NULL; + param.cuda_threads = 1; char cmd[81]; while(1) diff --git a/svm.h b/svm.h index 0f546ccd..99726cc8 100644 --- a/svm.h +++ b/svm.h @@ -44,6 +44,7 @@ struct svm_parameter double p; /* for EPSILON_SVR */ int shrinking; /* use the shrinking heuristics */ int probability; /* do probability estimates */ + int cuda_threads; /* cuda thread count of svm_train() */ }; // From a8b1b89aed36b63c1de3310adc37ac4eb5ca7218 Mon Sep 17 00:00:00 2001 From: Ryan Pan Date: Tue, 11 Feb 2020 20:39:19 +0800 Subject: [PATCH 14/19] Update README --- README | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/README b/README index 42aa7e94..bb48d92e 100644 --- a/README +++ b/README @@ -5,10 +5,14 @@ How to use: run svm-train-cuda instead of svm-train, with same command line arguments of svm-train multithreading -svm-train-cuda is multithreading by default(with OpenMP), If you are runing a svm classification job with more than 2 tags, it can much faster than single thread one. -If you run out of memory or GPU memory, you can disable it with environment: -export OMP_NUM_THREADS=1 -You can try setting OMP_NUM_THREADS to another number, balancing the speed and GPU/CPU usage. +svm-train-cuda is multithreading by default(with OpenMP), If you are runing a svm classification job with more than 2 tags, +it can be faster than single thread one. +You can define cuda thread count with command line argument -u +-u n : cuda train threads count (default 1) +Please note: cache is base on thread, total host memory usage is cuda_thread*cache_size_per_thread +you will have to cut down some cache memory if you are using multithreading cuda, cache is faster than GPU, +if the training size is small enough. +So you may try and get the best -u cuda_thread -m cache_size combination, base on the size of your problem. 
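For example (the numbers are only illustrative), four training threads with a 256 MB kernel cache each use roughly 1 GB of host cache in total:

./svm-train-cuda -u 4 -m 256 training_set_file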
tips try add -h 0 as arguments of svm-train-cuda, it may faster than default one(-h 1) From 497d570de06aaee7b086649f61602844d35bdc5c Mon Sep 17 00:00:00 2001 From: panqingfeng Date: Thu, 13 Feb 2020 00:22:28 +0800 Subject: [PATCH 15/19] speed up host code with OpenMP, to catch up with the speed of GPU --- svm-train.c | 5 --- svm.cpp | 111 ++++++++++++++++++++++++++++++++++++++++------------ svm.h | 1 - 3 files changed, 86 insertions(+), 31 deletions(-) diff --git a/svm-train.c b/svm-train.c index a5a4d5ad..b6ce987d 100644 --- a/svm-train.c +++ b/svm-train.c @@ -38,7 +38,6 @@ void exit_with_help() "-wi weight : set the parameter C of class i to weight*C, for C-SVC (default 1)\n" "-v n: n-fold cross validation mode\n" "-q : quiet mode (no outputs)\n" - "-u n : cuda train threads count (default 1)\n" ); exit(1); } @@ -180,7 +179,6 @@ void parse_command_line(int argc, char **argv, char *input_file_name, char *mode param.nr_weight = 0; param.weight_label = NULL; param.weight = NULL; - param.cuda_threads = 1; cross_validation = 0; // parse options @@ -247,9 +245,6 @@ void parse_command_line(int argc, char **argv, char *input_file_name, char *mode param.weight_label[param.nr_weight-1] = atoi(&argv[i-1][2]); param.weight[param.nr_weight-1] = atof(argv[i]); break; - case 'u': - param.cuda_threads = atoi(argv[i]); - break; default: fprintf(stderr,"Unknown option: -%c\n", argv[i-1][1]); exit_with_help(); diff --git a/svm.cpp b/svm.cpp index 18eb3b9f..d2f9e382 100644 --- a/svm.cpp +++ b/svm.cpp @@ -1264,38 +1264,82 @@ int Solver::select_working_set(int &out_i, int &out_j) // j: minimizes the decrease of obj value // (if quadratic coefficeint <= 0, replace it with tau) // -y_j*grad(f)_j < -y_i*grad(f)_i, j in I_low(\alpha) - double Gmax = -INF; double Gmax2 = -INF; int Gmax_idx = -1; int Gmin_idx = -1; double obj_diff_min = INF; +#ifdef SVM_CUDA + static double tGmax = -INF; + static double tGmax2 = -INF; + static int tGmax_idx = -1; + static int tGmin_idx = -1; + static double tobj_diff_min = INF; +#pragma omp threadprivate(tGmax, tGmax2, tGmax_idx, tGmin_idx, tobj_diff_min) + + tGmax = -INF; + tGmax_idx = -1; +#pragma omp parallel for copyin(tGmax, tGmax_idx) +#else + double& tGmax = Gmax; + double& tGmax2 = Gmax2; + int& tGmax_idx = Gmax_idx; + int& tGmin_idx = Gmin_idx; + double& tobj_diff_min = obj_diff_min; +#endif for(int t=0;t= Gmax) + if(-G[t] >= tGmax) { - Gmax = -G[t]; - Gmax_idx = t; + tGmax = -G[t]; + tGmax_idx = t; } } else { if(!is_lower_bound(t)) - if(G[t] >= Gmax) + if(G[t] >= tGmax) { - Gmax = G[t]; - Gmax_idx = t; + tGmax = G[t]; + tGmax_idx = t; } } +#ifdef SVM_CUDA +#pragma omp parallel + { + #pragma omp critical + { + if( tGmax>=Gmax ) + { + if( tGmax==Gmax ) + { + if( tGmax_idx>Gmax_idx ) + Gmax_idx = tGmax_idx; + } + else + { + Gmax = tGmax; + Gmax_idx = tGmax_idx; + } + } + } + } +#endif int i = Gmax_idx; const Qfloat *Q_i = NULL; if(i != -1) // NULL Q_i not accessed: Gmax=-INF if i=-1 Q_i = Q->get_Q(i,active_size); + tGmax2 = -INF; + tGmin_idx = -1; + tobj_diff_min = INF; +#ifdef SVM_CUDA +#pragma omp parallel for copyin(tGmax2, tGmin_idx, tobj_diff_min) +#endif for(int j=0;j= Gmax2) - Gmax2 = G[j]; + if (G[j] >= tGmax2) + tGmax2 = G[j]; if (grad_diff > 0) { double obj_diff; @@ -1314,10 +1358,10 @@ int Solver::select_working_set(int &out_i, int &out_j) else obj_diff = -(grad_diff*grad_diff)/TAU; - if (obj_diff <= obj_diff_min) + if (obj_diff <= tobj_diff_min) { - Gmin_idx=j; - obj_diff_min = obj_diff; + tGmin_idx=j; + tobj_diff_min = obj_diff; } } } @@ -1327,8 +1371,8 @@ int 
Solver::select_working_set(int &out_i, int &out_j) if (!is_upper_bound(j)) { double grad_diff= Gmax-G[j]; - if (-G[j] >= Gmax2) - Gmax2 = -G[j]; + if (-G[j] >= tGmax2) + tGmax2 = -G[j]; if (grad_diff > 0) { double obj_diff; @@ -1338,15 +1382,37 @@ int Solver::select_working_set(int &out_i, int &out_j) else obj_diff = -(grad_diff*grad_diff)/TAU; - if (obj_diff <= obj_diff_min) + if (obj_diff <= tobj_diff_min) { - Gmin_idx=j; - obj_diff_min = obj_diff; + tGmin_idx=j; + tobj_diff_min = obj_diff; } } } } } +#ifdef SVM_CUDA +#pragma omp parallel + { + #pragma omp critical + { + if(tGmax2>Gmax2 ) + Gmax2 = tGmax2; + if( tobj_diff_min <= obj_diff_min ) + { + if( tobj_diff_min Gmin_idx ) + { + Gmin_idx = tGmin_idx; + } + } + } + } +#endif if(Gmax+Gmax2 < eps || Gmin_idx == -1) return 1; @@ -2379,9 +2445,6 @@ static void svm_binary_svc_probability( int j = i+rand()%(prob->l-i); swap(perm[i],perm[j]); } -#ifdef SVM_CUDA - #pragma omp parallel for schedule(dynamic) num_threads(param->cuda_threads) -#endif for(i=0;il/nr_fold; @@ -2435,6 +2498,9 @@ static void svm_binary_svc_probability( subparam.weight[0]=Cp; subparam.weight[1]=Cn; struct svm_model *submodel = svm_train(&subprob,&subparam); +#ifdef SVM_CUDA +#pragma omp parallel for private(j) +#endif for(j=begin;jx[perm[j]],&(dec_values[perm[j]])); @@ -2665,10 +2731,6 @@ svm_model *svm_train(const svm_problem *prob, const svm_parameter *param) probB=Malloc(double,nr_class*(nr_class-1)/2); } -#ifdef SVM_CUDA - // Another "#pragma omp parallel for" in svm_binary_svc_probability(), if param->probability - #pragma omp parallel for schedule(dynamic) if(nr_class!=2 && !param->probability) num_threads(param->cuda_threads) -#endif for(int p=0; p Date: Thu, 13 Feb 2020 00:30:55 +0800 Subject: [PATCH 16/19] Update README --- README | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/README b/README index bb48d92e..6f2ea3ce 100644 --- a/README +++ b/README @@ -5,14 +5,10 @@ How to use: run svm-train-cuda instead of svm-train, with same command line arguments of svm-train multithreading -svm-train-cuda is multithreading by default(with OpenMP), If you are runing a svm classification job with more than 2 tags, -it can be faster than single thread one. -You can define cuda thread count with command line argument -u --u n : cuda train threads count (default 1) -Please note: cache is base on thread, total host memory usage is cuda_thread*cache_size_per_thread -you will have to cut down some cache memory if you are using multithreading cuda, cache is faster than GPU, -if the training size is small enough. -So you may try and get the best -u cuda_thread -m cache_size combination, base on the size of your problem. 
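The select_working_set() change in PATCH 15 above is, at its core, a parallel argmax over the gradient: each thread keeps a private best candidate and the candidates are merged in a critical section, with ties resolved toward the larger index. A self-contained sketch of that reduction pattern, assuming only an array G of scores (an illustration, not the solver's exact code):

#include <cmath>
#include <omp.h>

int parallel_argmax(const double *G, int n)
{
	double best = -INFINITY;
	int best_idx = -1;
	#pragma omp parallel
	{
		double t_best = -INFINITY;
		int t_idx = -1;
		#pragma omp for nowait
		for (int t = 0; t < n; t++)
			if (G[t] >= t_best) { t_best = G[t]; t_idx = t; }	/* thread-local maximum */
		#pragma omp critical
		{
			/* merge: prefer the larger value, break ties on the larger index */
			if (t_best > best || (t_best == best && t_idx > best_idx)) {
				best = t_best;
				best_idx = t_idx;
			}
		}
	}
	return best_idx;
}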
+svm-train-cuda is multithreading by default(with OpenMP), you can change thread count with +setenv OMP_NUM_THREADS 8 (Windows) +or +export OMP_NUM_THREADS=8 (Linux) tips try add -h 0 as arguments of svm-train-cuda, it may faster than default one(-h 1) From 39d381ade76acf119a637ec46c7f8b6647d69344 Mon Sep 17 00:00:00 2001 From: panqingfeng Date: Fri, 14 Feb 2020 11:27:55 +0800 Subject: [PATCH 17/19] speed up host code with OpenMP, to catch up with the speed of GPU --- svm.cpp | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/svm.cpp b/svm.cpp index d2f9e382..446fb08d 100644 --- a/svm.cpp +++ b/svm.cpp @@ -951,6 +951,9 @@ void Solver::reconstruct_gradient() if (nr_free*l > 2*active_size*(l-active_size)) { +#ifdef SVM_CUDA +#pragma omp parallel for private(i) +#endif for(i=active_size;iget_Q(i,active_size); @@ -961,6 +964,9 @@ void Solver::reconstruct_gradient() } else { +#ifdef SVM_CUDA +#pragma omp parallel for private(i) +#endif for(i=0;i Date: Fri, 28 Aug 2020 09:58:21 +0800 Subject: [PATCH 18/19] speed up host code with OpenMP, to catch up with the speed of GPU --- svm.cpp | 9 --------- 1 file changed, 9 deletions(-) diff --git a/svm.cpp b/svm.cpp index 446fb08d..ba56a47b 100644 --- a/svm.cpp +++ b/svm.cpp @@ -951,9 +951,6 @@ void Solver::reconstruct_gradient() if (nr_free*l > 2*active_size*(l-active_size)) { -#ifdef SVM_CUDA -#pragma omp parallel for private(i) -#endif for(i=active_size;iget_Q(i,active_size); @@ -964,9 +961,6 @@ void Solver::reconstruct_gradient() } else { -#ifdef SVM_CUDA -#pragma omp parallel for private(i) -#endif for(i=0;ix[perm[j]],&(dec_values[perm[j]])); From 78b2b62d5e8f55665348dbf30d81c447b119cf7a Mon Sep 17 00:00:00 2001 From: Ryan Pan Date: Thu, 7 Jan 2021 14:53:17 +0800 Subject: [PATCH 19/19] Update README --- README | 6 ------ 1 file changed, 6 deletions(-) diff --git a/README b/README index 6f2ea3ce..19b4a65b 100644 --- a/README +++ b/README @@ -4,12 +4,6 @@ make clean; make svm-train-cuda How to use: run svm-train-cuda instead of svm-train, with same command line arguments of svm-train -multithreading -svm-train-cuda is multithreading by default(with OpenMP), you can change thread count with -setenv OMP_NUM_THREADS 8 (Windows) -or -export OMP_NUM_THREADS=8 (Linux) - tips try add -h 0 as arguments of svm-train-cuda, it may faster than default one(-h 1)
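The "less cudaMalloc/cudaFree" change in PATCH 13 boils down to one pattern: allocate the device buffers and the pinned host staging buffers once per CudaKernel, move data with cudaMemcpyAsync on a private stream, and synchronize only when the result is needed. A stripped-down sketch of that pattern, with illustrative names rather than the patch's members (check() stands in for the CUDA_CHECK macro used throughout the patches):

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

static void check(cudaError_t e)
{
	if (e != cudaSuccess) {
		std::fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(e));
		std::exit(1);
	}
}

struct StreamedBuffer {
	cudaStream_t stream;
	double *h_buf;	/* pinned host staging buffer */
	double *d_buf;	/* device buffer, reused across calls */
	size_t n;

	explicit StreamedBuffer(size_t count) : stream(NULL), h_buf(NULL), d_buf(NULL), n(count) {
		check(cudaStreamCreate(&stream));
		check(cudaMallocHost((void**)&h_buf, n * sizeof(double)));
		check(cudaMalloc((void**)&d_buf, n * sizeof(double)));
	}
	void upload() {		/* host -> device, asynchronous on this stream */
		check(cudaMemcpyAsync(d_buf, h_buf, n * sizeof(double), cudaMemcpyHostToDevice, stream));
	}
	void download() {	/* device -> host, asynchronous on this stream */
		check(cudaMemcpyAsync(h_buf, d_buf, n * sizeof(double), cudaMemcpyDeviceToHost, stream));
	}
	void sync() { check(cudaStreamSynchronize(stream)); }
	~StreamedBuffer() {
		cudaFree(d_buf);
		cudaFreeHost(h_buf);
		cudaStreamDestroy(stream);
	}
};

One such buffer per CudaKernel (or per OpenMP training thread) keeps allocation and free calls out of the per-iteration hot path, which is the point of the patch.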