From b19161bce385ca3f9905a012d1ff2c64f2829b26 Mon Sep 17 00:00:00 2001 From: panqingfeng Date: Mon, 20 Jan 2020 15:00:05 +0800 Subject: [PATCH 01/19] Kernel interface change to batch_kernel_function(from kernel_function) Implement a CUDA version of batch_kernel_function --- svm.cpp | 296 +++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 272 insertions(+), 24 deletions(-) diff --git a/svm.cpp b/svm.cpp index fa44818b..83deef90 100644 --- a/svm.cpp +++ b/svm.cpp @@ -8,6 +8,13 @@ #include #include #include "svm.h" +#ifdef SVM_CUDA +#include +#include +#include +#include +#include +#endif int libsvm_version = LIBSVM_VERSION; typedef float Qfloat; typedef signed char schar; @@ -196,28 +203,35 @@ class QMatrix { virtual Qfloat *get_Q(int column, int len) const = 0; virtual double *get_QD() const = 0; virtual void swap_index(int i, int j) const = 0; + virtual void done_shrinking() const = 0; virtual ~QMatrix() {} }; -class Kernel: public QMatrix { +class KernelBase: public QMatrix { public: - Kernel(int l, svm_node * const * x, const svm_parameter& param); - virtual ~Kernel(); + KernelBase(int l, svm_node * const * x, const svm_parameter& param); + virtual ~KernelBase(); static double k_function(const svm_node *x, const svm_node *y, const svm_parameter& param); virtual Qfloat *get_Q(int column, int len) const = 0; virtual double *get_QD() const = 0; - virtual void swap_index(int i, int j) const // no so const... + virtual void swap_index(int i, int j) const // no so const... { swap(x[i],x[j]); if(x_square) swap(x_square[i],x_square[j]); } + virtual void done_shrinking() const {} protected: - double (Kernel::*kernel_function)(int i, int j) const; + double (KernelBase::*kernel_function)(int i, int j) const; + virtual void batch_kernel_function(int i, Qfloat *Q, int start, int end) const + { + for(int j=start;j*kernel_function)(i,j); + } -private: +protected: const svm_node **x; double *x_square; @@ -250,26 +264,26 @@ class Kernel: public QMatrix { } }; -Kernel::Kernel(int l, svm_node * const * x_, const svm_parameter& param) +KernelBase::KernelBase(int l, svm_node * const * x_, const svm_parameter& param) :kernel_type(param.kernel_type), degree(param.degree), gamma(param.gamma), coef0(param.coef0) { switch(kernel_type) { case LINEAR: - kernel_function = &Kernel::kernel_linear; + kernel_function = &KernelBase::kernel_linear; break; case POLY: - kernel_function = &Kernel::kernel_poly; + kernel_function = &KernelBase::kernel_poly; break; case RBF: - kernel_function = &Kernel::kernel_rbf; + kernel_function = &KernelBase::kernel_rbf; break; case SIGMOID: - kernel_function = &Kernel::kernel_sigmoid; + kernel_function = &KernelBase::kernel_sigmoid; break; case PRECOMPUTED: - kernel_function = &Kernel::kernel_precomputed; + kernel_function = &KernelBase::kernel_precomputed; break; } @@ -285,13 +299,13 @@ Kernel::Kernel(int l, svm_node * const * x_, const svm_parameter& param) x_square = 0; } -Kernel::~Kernel() +KernelBase::~KernelBase() { delete[] x; delete[] x_square; } -double Kernel::dot(const svm_node *px, const svm_node *py) +double KernelBase::dot(const svm_node *px, const svm_node *py) { double sum = 0; while(px->index != -1 && py->index != -1) @@ -313,7 +327,7 @@ double Kernel::dot(const svm_node *px, const svm_node *py) return sum; } -double Kernel::k_function(const svm_node *x, const svm_node *y, +double KernelBase::k_function(const svm_node *x, const svm_node *y, const svm_parameter& param) { switch(param.kernel_type) @@ -372,6 +386,238 @@ double Kernel::k_function(const 
svm_node *x, const svm_node *y, } } +#ifdef SVM_CUDA +class CudaKernel : public KernelBase { +public: + CudaKernel(int l, svm_node * const * x, const svm_parameter& param); + virtual ~CudaKernel(); + virtual void done_shrinking() const; +protected: + virtual void batch_kernel_function(int i, Qfloat *Q, int start, int end) const; +private: + void libsvm2csr(int l) const; //convert svm_node** x -> csr format +private: + cublasHandle_t m_cublas; + cusparseHandle_t m_cusparse; + cusparseMatDescr_t m_descr; + int m_max_index; + std::vector m_csrRowPtr; + int* d_csrColIdx; + double* d_csrVal; + double* d_vectorX; + int m_l; + int m_nnz; +}; + +CudaKernel::CudaKernel(int l, svm_node * const * x, const svm_parameter& param) : KernelBase(l, x, param), + m_cublas(NULL), m_cusparse(NULL), m_descr(NULL), m_max_index(0), m_csrRowPtr(l+1), + d_csrColIdx(NULL), d_csrVal(NULL), d_vectorX(NULL), m_l(l), m_nnz(0) { + // Init cublas & cusparse handlers + cublasStatus_t cublasResult = cublasCreate(&m_cublas); + assert(CUBLAS_STATUS_SUCCESS == cublasResult); + cusparseStatus_t cusparseResult = cusparseCreate(&m_cusparse); + assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + cusparseResult = cusparseCreateMatDescr(&m_descr); + assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + cusparseSetMatIndexBase(m_descr,CUSPARSE_INDEX_BASE_ZERO); + cusparseSetMatType(m_descr, CUSPARSE_MATRIX_TYPE_GENERAL); + + // get Num-non-zero and max_index + m_max_index = 0; + const svm_node *node; + for( int i=0; iindex != -1) { + if( node->index > m_max_index ) + m_max_index = node->index; + m_nnz++; + node++; + } + } + + libsvm2csr(l); +} + +void CudaKernel::libsvm2csr(int l) const { + // alloc host memory and fill up + std::vector csrVal(m_nnz); + std::vector csrColIdx(m_nnz); + int curPos = 0; + const svm_node *node; + int* csrRowPtr = (int*)m_csrRowPtr.data(); + for( int i=0; iindex != -1) { + csrVal[curPos] = node->value; + csrColIdx[curPos] = node->index; + curPos++; + node++; + } + } + csrRowPtr[l] = curPos; + + // copy csrVal, csrColIdx to device + cudaError_t cudaResult; + if( !d_vectorX ) { + cudaResult = cudaMalloc((void**)&d_vectorX, sizeof(double)*m_max_index); + assert(cudaSuccess == cudaResult); + } + if( !d_csrColIdx ) { + cudaResult = cudaMalloc((void**)&d_csrColIdx, sizeof(int)*m_nnz); + assert(cudaSuccess == cudaResult); + } + if( !d_csrVal ) { + cudaResult = cudaMalloc((void**)&d_csrVal, sizeof(double)*m_nnz); + assert(cudaSuccess == cudaResult); + } + cudaResult = cudaMemcpy(d_csrColIdx, csrColIdx.data(), sizeof(int)*m_nnz, cudaMemcpyHostToDevice); + assert(cudaSuccess == cudaResult); + cudaResult = cudaMemcpy(d_csrVal, csrVal.data(), sizeof(double)*m_nnz, cudaMemcpyHostToDevice); + assert(cudaSuccess == cudaResult); +} + +CudaKernel::~CudaKernel() { + if(d_csrColIdx) cudaFree(d_csrColIdx); + if(d_csrVal) cudaFree(d_csrVal); + if(d_vectorX) cudaFree(d_vectorX); + if(m_cublas) cublasDestroy(m_cublas); + if(m_cusparse) cusparseDestroy(m_cusparse); + if(m_descr) cusparseDestroyMatDescr(m_descr); +} + +void CudaKernel::done_shrinking() const { + KernelBase::done_shrinking(); + libsvm2csr(m_l); +} + +void CudaKernel::batch_kernel_function(int i, Qfloat *Q, int start, int end) const { + cudaError_t cudaResult; + int len = end-start; + + // get csrValA, csrColIndA, nnz ready for cusparseCsrmvEx + double* csrValA = d_csrVal + m_csrRowPtr[start]; + int* csrColIndA = d_csrColIdx + m_csrRowPtr[start]; + int nnz = m_csrRowPtr[len+start] - m_csrRowPtr[start]; + + // get csrRowPtrA ready for cusparseCsrmvEx + std::vector 
csrRow(len+1); + for( int idx=0; idx<=len; idx++ ) { + csrRow[idx] = m_csrRowPtr[idx+start] - m_csrRowPtr[start]; + } + int* csrRowPtrA = NULL; + cudaResult = cudaMalloc((void**)&csrRowPtrA, sizeof(int)*(len+1)); + assert(cudaSuccess == cudaResult); + cudaResult = cudaMemcpy(csrRowPtrA, csrRow.data(), sizeof(int)*(len+1), cudaMemcpyHostToDevice); + assert(cudaSuccess == cudaResult); + + // get vector x ready for cusparseCsrmvEx + std::vector vectorX(m_max_index, 0.0); + const svm_node *node = x[i]; + while(node->index != -1) { + vectorX[node->index] = node->value; + node++; + } + cudaResult = cudaMemcpy(d_vectorX, vectorX.data(), sizeof(double)*m_max_index, cudaMemcpyHostToDevice); + assert(cudaSuccess == cudaResult); + + // get vector y ready for cusparseCsrmvEx + std::vector vectorY(len, 0.0); + double* d_vectorY = NULL; + cudaResult = cudaMalloc((void**)&d_vectorY, sizeof(double)*len); + assert(cudaSuccess == cudaResult); + cudaResult = cudaMemcpy(d_vectorY, vectorY.data(), sizeof(double)*len, cudaMemcpyHostToDevice); + assert(cudaSuccess == cudaResult); + + // get temp buffer size and get it ready + size_t buffer_size = 0; + double h_one = 1.0; + double h_zero = 0.0; + cusparseStatus_t cusparseResult = cusparseCsrmvEx_bufferSize(m_cusparse, + CUSPARSE_ALG0, + CUSPARSE_OPERATION_NON_TRANSPOSE, + len, + m_max_index, + nnz, + &h_one, CUDA_R_64F, + m_descr, + csrValA, CUDA_R_64F, + csrRowPtrA, + csrColIndA, + d_vectorX, CUDA_R_64F, + &h_zero, CUDA_R_64F, + d_vectorY, CUDA_R_64F, + CUDA_R_64F, + &buffer_size); + assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + void* buffer = NULL; + cudaResult = cudaMalloc ((void**)&buffer, buffer_size); + assert(cudaSuccess == cudaResult); + + // call cusparseCsrmvEx + cusparseResult = cusparseCsrmvEx(m_cusparse, + CUSPARSE_ALG0, + CUSPARSE_OPERATION_NON_TRANSPOSE, + len, + m_max_index, + nnz, + &h_one, CUDA_R_64F, + m_descr, + csrValA, CUDA_R_64F, + csrRowPtrA, + csrColIndA, + d_vectorX, CUDA_R_64F, + &h_zero, CUDA_R_64F, + d_vectorY, CUDA_R_64F, + CUDA_R_64F, + buffer); + assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + + // copy d_vectorY back to host + cudaResult = cudaMemcpy(vectorY.data(), d_vectorY, sizeof(double)*len, cudaMemcpyDeviceToHost); + assert(cudaSuccess == cudaResult); + for( int idx=0; idx Qbase(start+len, 0.0); + KernelBase::batch_kernel_function(i, Qbase.data(), start, end); + for( int idx=0; idx0.0001 ) { + printf("%f <-> %f(%d, %d)\n", Q[idx], Qbase[idx], i, start+idx); + } + }*/ +} +#endif + +#ifdef SVM_CUDA +typedef CudaKernel Kernel; +#else +typedef KernelBase Kernel; +#endif + // An SMO algorithm in Fan et al., JMLR 6(2005), p. 
1889--1918 // Solves: // @@ -568,7 +814,10 @@ void Solver::Solve(int l, const QMatrix& Q, const double *p_, const schar *y_, if(--counter == 0) { counter = min(l,1000); - if(shrinking) do_shrinking(); + if(shrinking) { + do_shrinking(); + Q.done_shrinking(); + } info("."); } @@ -1282,8 +1531,9 @@ class SVC_Q: public Kernel int start, j; if((start = cache->get_data(i,&data,len)) < len) { + batch_kernel_function(i,data,start,len); for(j=start;j*kernel_function)(i,j)); + data[j] *= (Qfloat)(y[i]*y[j]); } return data; } @@ -1328,11 +1578,10 @@ class ONE_CLASS_Q: public Kernel Qfloat *get_Q(int i, int len) const { Qfloat *data; - int start, j; + int start; if((start = cache->get_data(i,&data,len)) < len) { - for(j=start;j*kernel_function)(i,j); + batch_kernel_function(i,data,start,len); } return data; } @@ -1394,18 +1643,17 @@ class SVR_Q: public Kernel Qfloat *get_Q(int i, int len) const { Qfloat *data; - int j, real_i = index[i]; + int real_i = index[i]; if(cache->get_data(real_i,&data,l) < l) { - for(j=0;j*kernel_function)(real_i,j); + batch_kernel_function(real_i,data,0,l); } // reorder and copy Qfloat *buf = buffer[next_buffer]; next_buffer = 1 - next_buffer; schar si = sign[i]; - for(j=0;j Date: Wed, 22 Jan 2020 16:52:09 +0800 Subject: [PATCH 02/19] a CUDA port of libsvm --- Makefile | 9 +- svm.cpp | 433 ++++++++++++++++++++++++++++++++++++++++++++++++------- svm.h | 10 ++ 3 files changed, 395 insertions(+), 57 deletions(-) diff --git a/Makefile b/Makefile index db6ab346..6254268f 100644 --- a/Makefile +++ b/Makefile @@ -22,4 +22,11 @@ svm-scale: svm-scale.c svm.o: svm.cpp svm.h $(CXX) $(CFLAGS) -c svm.cpp clean: - rm -f *~ svm.o svm-train svm-predict svm-scale libsvm.so.$(SHVER) + rm -f *~ svm.o svm-train svm-predict svm-scale libsvm.so.$(SHVER) svm-train-cuda svm-train.o svm-cuda.o + +svm-train.o: svm-train.c + $(CXX) $(CFLAGS) -c svm-train.c +svm-cuda.o: svm.cpp + $(CXX) $(CFLAGS) -DSVM_CUDA -c svm.cpp -o svm-cuda.o +svm-train-cuda: svm-train.o svm-cuda.o + $(CXX) $(CFLAGS) svm-train.o svm-cuda.o -o svm-train-cuda -lm -lcusparse -lcublas -lcudart diff --git a/svm.cpp b/svm.cpp index 83deef90..cddd1722 100644 --- a/svm.cpp +++ b/svm.cpp @@ -9,6 +9,7 @@ #include #include "svm.h" #ifdef SVM_CUDA +#include #include #include #include @@ -212,6 +213,7 @@ class KernelBase: public QMatrix { KernelBase(int l, svm_node * const * x, const svm_parameter& param); virtual ~KernelBase(); + static void batch_k_function(const svm_node *x, const svm_model* model, double* out); static double k_function(const svm_node *x, const svm_node *y, const svm_parameter& param); virtual Qfloat *get_Q(int column, int len) const = 0; @@ -327,6 +329,14 @@ double KernelBase::dot(const svm_node *px, const svm_node *py) return sum; } +void KernelBase::batch_k_function(const svm_node *x, const svm_model* model, double* out) { + svm_node **SV = model->SV; + const svm_parameter& param = model->param; + for(int i=0; il; i++ ) { + out[i] = k_function(x,SV[i],param); + } +} + double KernelBase::k_function(const svm_node *x, const svm_node *y, const svm_parameter& param) { @@ -374,7 +384,6 @@ double KernelBase::k_function(const svm_node *x, const svm_node *y, sum += y->value * y->value; ++y; } - return exp(-param.gamma*sum); } case SIGMOID: @@ -387,34 +396,39 @@ double KernelBase::k_function(const svm_node *x, const svm_node *y, } #ifdef SVM_CUDA +#define CUDA_CHECK(error) { \ + if (error!=cudaSuccess){ \ + std::cerr<<"CUDA ERROR "<< cudaGetErrorName(error) << " in file " << __FILE__ << " line " <<__LINE__<< std::endl; \ + 
exit(0); \ + } \ +} + class CudaKernel : public KernelBase { public: CudaKernel(int l, svm_node * const * x, const svm_parameter& param); virtual ~CudaKernel(); virtual void done_shrinking() const; + static void batch_k_function(const svm_node *x, const svm_model* model, double* out); protected: virtual void batch_kernel_function(int i, Qfloat *Q, int start, int end) const; private: - void libsvm2csr(int l) const; //convert svm_node** x -> csr format + void x2csrMatrix(int sizeX) const; //convert svm_node** x -> csr format matrix private: - cublasHandle_t m_cublas; cusparseHandle_t m_cusparse; cusparseMatDescr_t m_descr; - int m_max_index; - std::vector m_csrRowPtr; - int* d_csrColIdx; - double* d_csrVal; + int m_max_index; // max column of x matrix + std::vector m_csrRowPtr; // csr row ptr of x matrix + int* d_csrColIdx; // csr col index of x matrix + double* d_csrVal; // csr val vector of x matrix double* d_vectorX; - int m_l; - int m_nnz; + int m_sizeX; // size of x + int m_nnz; // number of non-zero value of x matrix }; CudaKernel::CudaKernel(int l, svm_node * const * x, const svm_parameter& param) : KernelBase(l, x, param), - m_cublas(NULL), m_cusparse(NULL), m_descr(NULL), m_max_index(0), m_csrRowPtr(l+1), - d_csrColIdx(NULL), d_csrVal(NULL), d_vectorX(NULL), m_l(l), m_nnz(0) { - // Init cublas & cusparse handlers - cublasStatus_t cublasResult = cublasCreate(&m_cublas); - assert(CUBLAS_STATUS_SUCCESS == cublasResult); + m_cusparse(NULL), m_descr(NULL), m_max_index(0), m_csrRowPtr(l+1), + d_csrColIdx(NULL), d_csrVal(NULL), d_vectorX(NULL), m_sizeX(l), m_nnz(0) { + // Init handlers cusparseStatus_t cusparseResult = cusparseCreate(&m_cusparse); assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); cusparseResult = cusparseCreateMatDescr(&m_descr); @@ -434,18 +448,19 @@ CudaKernel::CudaKernel(int l, svm_node * const * x, const svm_parameter& param) node++; } } + m_max_index++; - libsvm2csr(l); + x2csrMatrix(l); } -void CudaKernel::libsvm2csr(int l) const { +void CudaKernel::x2csrMatrix(int sizeX) const { // alloc host memory and fill up std::vector csrVal(m_nnz); std::vector csrColIdx(m_nnz); int curPos = 0; const svm_node *node; int* csrRowPtr = (int*)m_csrRowPtr.data(); - for( int i=0; iindex != -1) { @@ -455,44 +470,36 @@ void CudaKernel::libsvm2csr(int l) const { node++; } } - csrRowPtr[l] = curPos; + csrRowPtr[sizeX] = curPos; // copy csrVal, csrColIdx to device - cudaError_t cudaResult; if( !d_vectorX ) { - cudaResult = cudaMalloc((void**)&d_vectorX, sizeof(double)*m_max_index); - assert(cudaSuccess == cudaResult); + CUDA_CHECK(cudaMalloc((void**)&d_vectorX, sizeof(double)*m_max_index)); } if( !d_csrColIdx ) { - cudaResult = cudaMalloc((void**)&d_csrColIdx, sizeof(int)*m_nnz); - assert(cudaSuccess == cudaResult); + CUDA_CHECK(cudaMalloc((void**)&d_csrColIdx, sizeof(int)*m_nnz)); } if( !d_csrVal ) { - cudaResult = cudaMalloc((void**)&d_csrVal, sizeof(double)*m_nnz); - assert(cudaSuccess == cudaResult); + CUDA_CHECK(cudaMalloc((void**)&d_csrVal, sizeof(double)*m_nnz)); } - cudaResult = cudaMemcpy(d_csrColIdx, csrColIdx.data(), sizeof(int)*m_nnz, cudaMemcpyHostToDevice); - assert(cudaSuccess == cudaResult); - cudaResult = cudaMemcpy(d_csrVal, csrVal.data(), sizeof(double)*m_nnz, cudaMemcpyHostToDevice); - assert(cudaSuccess == cudaResult); + CUDA_CHECK(cudaMemcpy(d_csrColIdx, csrColIdx.data(), sizeof(int)*m_nnz, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_csrVal, csrVal.data(), sizeof(double)*m_nnz, cudaMemcpyHostToDevice)); } CudaKernel::~CudaKernel() { if(d_csrColIdx) 
cudaFree(d_csrColIdx); if(d_csrVal) cudaFree(d_csrVal); if(d_vectorX) cudaFree(d_vectorX); - if(m_cublas) cublasDestroy(m_cublas); if(m_cusparse) cusparseDestroy(m_cusparse); if(m_descr) cusparseDestroyMatDescr(m_descr); } void CudaKernel::done_shrinking() const { KernelBase::done_shrinking(); - libsvm2csr(m_l); + x2csrMatrix(m_sizeX); } void CudaKernel::batch_kernel_function(int i, Qfloat *Q, int start, int end) const { - cudaError_t cudaResult; int len = end-start; // get csrValA, csrColIndA, nnz ready for cusparseCsrmvEx @@ -506,10 +513,8 @@ void CudaKernel::batch_kernel_function(int i, Qfloat *Q, int start, int end) con csrRow[idx] = m_csrRowPtr[idx+start] - m_csrRowPtr[start]; } int* csrRowPtrA = NULL; - cudaResult = cudaMalloc((void**)&csrRowPtrA, sizeof(int)*(len+1)); - assert(cudaSuccess == cudaResult); - cudaResult = cudaMemcpy(csrRowPtrA, csrRow.data(), sizeof(int)*(len+1), cudaMemcpyHostToDevice); - assert(cudaSuccess == cudaResult); + CUDA_CHECK(cudaMalloc((void**)&csrRowPtrA, sizeof(int)*(len+1))); + CUDA_CHECK(cudaMemcpy(csrRowPtrA, csrRow.data(), sizeof(int)*(len+1), cudaMemcpyHostToDevice)); // get vector x ready for cusparseCsrmvEx std::vector vectorX(m_max_index, 0.0); @@ -518,21 +523,16 @@ void CudaKernel::batch_kernel_function(int i, Qfloat *Q, int start, int end) con vectorX[node->index] = node->value; node++; } - cudaResult = cudaMemcpy(d_vectorX, vectorX.data(), sizeof(double)*m_max_index, cudaMemcpyHostToDevice); - assert(cudaSuccess == cudaResult); + CUDA_CHECK(cudaMemcpy(d_vectorX, vectorX.data(), sizeof(double)*m_max_index, cudaMemcpyHostToDevice)); // get vector y ready for cusparseCsrmvEx - std::vector vectorY(len, 0.0); double* d_vectorY = NULL; - cudaResult = cudaMalloc((void**)&d_vectorY, sizeof(double)*len); - assert(cudaSuccess == cudaResult); - cudaResult = cudaMemcpy(d_vectorY, vectorY.data(), sizeof(double)*len, cudaMemcpyHostToDevice); - assert(cudaSuccess == cudaResult); + CUDA_CHECK(cudaMalloc((void**)&d_vectorY, sizeof(double)*len)); // get temp buffer size and get it ready size_t buffer_size = 0; - double h_one = 1.0; - double h_zero = 0.0; + static double h_one = 1.0; + static double h_zero = 0.0; cusparseStatus_t cusparseResult = cusparseCsrmvEx_bufferSize(m_cusparse, CUSPARSE_ALG0, CUSPARSE_OPERATION_NON_TRANSPOSE, @@ -551,8 +551,7 @@ void CudaKernel::batch_kernel_function(int i, Qfloat *Q, int start, int end) con &buffer_size); assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); void* buffer = NULL; - cudaResult = cudaMalloc ((void**)&buffer, buffer_size); - assert(cudaSuccess == cudaResult); + CUDA_CHECK(cudaMalloc ((void**)&buffer, buffer_size)); // call cusparseCsrmvEx cusparseResult = cusparseCsrmvEx(m_cusparse, @@ -574,8 +573,8 @@ void CudaKernel::batch_kernel_function(int i, Qfloat *Q, int start, int end) con assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); // copy d_vectorY back to host - cudaResult = cudaMemcpy(vectorY.data(), d_vectorY, sizeof(double)*len, cudaMemcpyDeviceToHost); - assert(cudaSuccess == cudaResult); + std::vector vectorY(len); + CUDA_CHECK(cudaMemcpy(vectorY.data(), d_vectorY, sizeof(double)*len, cudaMemcpyDeviceToHost)); for( int idx=0; idx Qbase(start+len, 0.0); KernelBase::batch_kernel_function(i, Qbase.data(), start, end); for( int idx=0; idx0.0001 ) { - printf("%f <-> %f(%d, %d)\n", Q[idx], Qbase[idx], i, start+idx); + if( fabs(Q[start+idx]-Qbase[start+idx])>0.01 ) { + printf("%f <-> %f(%d, %d)\n", Q[start+idx], Qbase[start+idx], i, start+idx); + } + }*/ +} + +void svm_model::cuda_init() { + nMaxIdxSV = 0; + 
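// Illustration of the CSR layout this function builds (example values only, not
// taken from any real data set):
//   SV[0] = {(1, 0.5), (4, 2.0)}   -> row 0 has two non-zeros
//   SV[1] = {(2, 1.0)}             -> row 1 has one non-zero
// gives
//   csrRowPtr = [0, 2, 3]          (csrRowPtr[i]..csrRowPtr[i+1] delimits row i; csrRowPtr[l] = nnz)
//   csrColIdx = [1, 4, 2]          (svm_node::index of each non-zero)
//   csrVal    = [0.5, 2.0, 1.0]    (svm_node::value of each non-zero)
// so row i of the SV matrix occupies csrVal[csrRowPtr[i] .. csrRowPtr[i+1]).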
nnzSV = 0; + const svm_node *node; + for( int i=0; iindex != -1) { + if( node->index > nMaxIdxSV ) + nMaxIdxSV = node->index; + nnzSV++; + node++; + } + } + + // get csrVal csrColIdx ready + std::vector csrRowPtr(l+1); + std::vector csrVal(nnzSV); + std::vector csrColIdx(nnzSV); + int curPos = 0; + for( int i=0; iindex != -1) { + csrVal[curPos] = node->value; + csrColIdx[curPos] = node->index; + curPos++; + node++; + } + } + csrRowPtr[l] = curPos; + + // copy csrVal, csrColIdx to device + csrColIdxSV = NULL; + csrValSV = NULL; + csrRowPtrSV = NULL; + CUDA_CHECK(cudaMalloc((void**)&csrRowPtrSV, sizeof(int)*(l+1))); + CUDA_CHECK(cudaMalloc((void**)&csrColIdxSV, sizeof(int)*nnzSV)); + CUDA_CHECK(cudaMalloc((void**)&csrValSV, sizeof(double)*nnzSV)); + CUDA_CHECK(cudaMemcpy(csrRowPtrSV, csrRowPtr.data(), sizeof(int)*(l+1), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(csrColIdxSV, csrColIdx.data(), sizeof(int)*nnzSV, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(csrValSV, csrVal.data(), sizeof(double)*nnzSV, cudaMemcpyHostToDevice)); +} + +void svm_model::cuda_free() { + CUDA_CHECK(cudaFree(csrRowPtrSV)); + CUDA_CHECK(cudaFree(csrColIdxSV)); + CUDA_CHECK(cudaFree(csrValSV)); +} + +void CudaKernel::batch_k_function(const svm_node *x, const svm_model* model, double* out) { + const svm_parameter& param = model->param; + static const double one = 1.0; + static const double NegOne = -1.0; + static const double zero = 0.0; + cublasHandle_t cublas = NULL; + cusparseHandle_t cusparse = NULL; + cusparseMatDescr_t descrSV = NULL; + cusparseMatDescr_t descrX = NULL; + cusparseMatDescr_t descrC = NULL; + cudaStream_t stream = NULL; + CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + cublasStatus_t cublasResult = cublasCreate(&cublas); + assert(CUBLAS_STATUS_SUCCESS == cublasResult); + cusparseStatus_t cusparseResult = cusparseCreate(&cusparse); + assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + cusparseResult = cusparseCreateMatDescr(&descrSV); + assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + cusparseSetMatIndexBase(descrSV,CUSPARSE_INDEX_BASE_ZERO); + cusparseSetMatType(descrSV, CUSPARSE_MATRIX_TYPE_GENERAL); + cusparseResult = cusparseCreateMatDescr(&descrX); + assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + cusparseSetMatIndexBase(descrX,CUSPARSE_INDEX_BASE_ZERO); + cusparseSetMatType(descrX, CUSPARSE_MATRIX_TYPE_GENERAL); + cusparseResult = cusparseCreateMatDescr(&descrC); + assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + cusparseSetMatIndexBase(descrC,CUSPARSE_INDEX_BASE_ZERO); + cusparseSetMatType(descrC, CUSPARSE_MATRIX_TYPE_GENERAL); + cusparseSetPointerMode(cusparse, CUSPARSE_POINTER_MODE_HOST); + cusparseSetStream(cusparse, stream); + cublasSetStream(cublas, stream); + + // get max_index + int max_index = model->nMaxIdxSV; + const svm_node *node; + node = x; + int x_nnz = 0; + while(node->index != -1) { + if( node->index > max_index ) + max_index = node->index; + x_nnz++; + node++; + } + max_index++; + + // create a sparse matrix X = model->l * [x] + std::vector csrRowPtr(model->l+1); + std::vector csrVal(model->l*x_nnz); + std::vector csrColIdx(model->l*x_nnz); + int curPos = 0; + for( int i=0; il; i++ ) { + csrRowPtr[i] = curPos; + node = x; + while(node->index != -1) { + csrVal[curPos] = node->value; + csrColIdx[curPos] = node->index; + curPos++; + node++; } + } + csrRowPtr[model->l] = curPos; + int* csrColIdxX = NULL; + double* csrValX = NULL; + int* csrRowPtrX = NULL; + CUDA_CHECK(cudaMalloc((void**)&csrRowPtrX, sizeof(int)*(model->l+1))); + 
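// The RBF branch below relies on the identity
//   K(x, SV[i]) = exp(-gamma * ||x - SV[i]||^2):
// with X holding model->l identical copies of x, C = X - SV (cusparseDcsrgeam with
// coefficients +1 and -1), squaring C element-wise and multiplying it by an
// all-ones vector yields the row sums ||x - SV[i]||^2 in a single sparse
// matrix-vector product; exp(-gamma * out[i]) is then applied on the host.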
CUDA_CHECK(cudaMalloc((void**)&csrColIdxX, sizeof(int)*model->l*x_nnz)); + CUDA_CHECK(cudaMalloc((void**)&csrValX, sizeof(double)*model->l*x_nnz)); + CUDA_CHECK(cudaMemcpyAsync(csrRowPtrX, csrRowPtr.data(), sizeof(int)*(model->l+1), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(csrColIdxX, csrColIdx.data(), sizeof(int)*model->l*x_nnz, cudaMemcpyHostToDevice, stream)); + CUDA_CHECK(cudaMemcpyAsync(csrValX, csrVal.data(), sizeof(double)*model->l*x_nnz, cudaMemcpyHostToDevice, stream)); + + int nnzC; + size_t buffer_size; + char* buffer = NULL; + double* csrValC = NULL; + double* temp_csrValC = NULL; + int* csrRowPtrC = NULL; + int* csrColIdxC = NULL; + double* all_one = NULL; + double* d_out = NULL; + double* d_vectorX = NULL, *d_vectorY = NULL; + std::vector h_all_one, vectorX; + switch(param.kernel_type) + { + case LINEAR: + case POLY: + case SIGMOID: + // get dense vector x ready + vectorX.resize(max_index, 0.0); + CUDA_CHECK(cudaMalloc((void**)&d_vectorX, sizeof(double)*max_index)); + node = x; + while(node->index != -1) { + vectorX[node->index] = node->value; + node++; + } + CUDA_CHECK(cudaMemcpyAsync(d_vectorX, vectorX.data(), sizeof(double)*max_index, cudaMemcpyHostToDevice, stream)); + + // get dense vector y ready + CUDA_CHECK(cudaMalloc((void**)&d_vectorY, sizeof(double)*model->l)); + + // get buffer size + cusparseResult = cusparseCsrmvEx_bufferSize(cusparse, + CUSPARSE_ALG0, + CUSPARSE_OPERATION_NON_TRANSPOSE, + model->l, max_index, + model->nnzSV, &one, CUDA_R_64F, + descrSV, + model->csrValSV, CUDA_R_64F, + model->csrRowPtrSV, model->csrColIdxSV, + d_vectorX, CUDA_R_64F, + &zero, CUDA_R_64F, + d_vectorY, CUDA_R_64F, + CUDA_R_64F, &buffer_size); + assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + CUDA_CHECK(cudaMalloc ((void**)&buffer, buffer_size)); + + // y = dot(SV, x) + cusparseResult = cusparseCsrmvEx(cusparse, + CUSPARSE_ALG0, + CUSPARSE_OPERATION_NON_TRANSPOSE, + model->l, max_index, + model->nnzSV, + &one, CUDA_R_64F, + descrSV, + model->csrValSV, CUDA_R_64F, + model->csrRowPtrSV, model->csrColIdxSV, + d_vectorX, CUDA_R_64F, + &zero, CUDA_R_64F, + d_vectorY, CUDA_R_64F, + CUDA_R_64F, buffer); + assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + + // copy vectorY back to host + CUDA_CHECK(cudaMemcpyAsync(out, d_vectorY, sizeof(double)*model->l, cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + // update out base on kernel_type + if( param.kernel_type==POLY ) { + for( int i=0; il; i++ ) + out[i] = powi(param.gamma*out[i]+param.coef0,param.degree); + } else if( param.kernel_type==SIGMOID ) { + for( int i=0; il; i++ ) + out[i] = tanh(param.gamma*out[i]+param.coef0); + } + break; + case RBF: + // create a sparse matrix C = X-SV + CUDA_CHECK(cudaMalloc((void**)&csrRowPtrC, sizeof(int)*(model->l+1))); + cusparseResult = cusparseXcsrgeamNnz(cusparse, model->l, max_index, + descrX, model->l*x_nnz, csrRowPtrX, csrColIdxX, + descrSV, model->nnzSV, model->csrRowPtrSV, model->csrColIdxSV, + descrC, csrRowPtrC, &nnzC); + assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + CUDA_CHECK(cudaMalloc((void**)&csrColIdxC, sizeof(int)*nnzC)); + CUDA_CHECK(cudaMalloc((void**)&csrValC, sizeof(double)*nnzC)); + CUDA_CHECK(cudaMalloc((void**)&temp_csrValC, sizeof(double)*nnzC)); + + cusparseResult = cusparseDcsrgeam(cusparse, model->l, max_index, + &one, + descrX, model->l*x_nnz, + csrValX, csrRowPtrX, csrColIdxX, + &NegOne, + descrSV, model->nnzSV, + model->csrValSV, model->csrRowPtrSV, model->csrColIdxSV, + descrC, + csrValC, csrRowPtrC, 
csrColIdxC); + assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + + // do C = C^2 + // equal to vector csrValC = csrValC^2 + CUDA_CHECK(cudaMemcpyAsync(temp_csrValC, csrValC,sizeof(double)*nnzC, cudaMemcpyDeviceToDevice, stream)); + cublasResult = cublasDsbmv(cublas, CUBLAS_FILL_MODE_LOWER, + nnzC, 0, &one, + temp_csrValC, 1, + temp_csrValC, 1, + &zero, csrValC, 1); + assert(CUBLAS_STATUS_SUCCESS == cublasResult); + + // out = C * all_one_vector_with_size_max_index + CUDA_CHECK(cudaMalloc((void**)&all_one, sizeof(double)*max_index)); + CUDA_CHECK(cudaMalloc((void**)&d_out, sizeof(double)*model->l)); + h_all_one.resize(max_index, 1.0); + CUDA_CHECK(cudaMemcpyAsync(all_one, h_all_one.data(), sizeof(double)*max_index, cudaMemcpyHostToDevice, stream)); + + cusparseResult = cusparseCsrmvEx_bufferSize(cusparse, + CUSPARSE_ALG0, + CUSPARSE_OPERATION_NON_TRANSPOSE, + model->l, max_index, + nnzC, &one, + CUDA_R_64F, descrC, + csrValC, CUDA_R_64F, + csrRowPtrC, csrColIdxC, + all_one, CUDA_R_64F, + &zero, CUDA_R_64F, + d_out, CUDA_R_64F, + CUDA_R_64F, &buffer_size); + assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + CUDA_CHECK(cudaMalloc ((void**)&buffer, buffer_size)); + + cusparseResult = cusparseCsrmvEx(cusparse, + CUSPARSE_ALG0, + CUSPARSE_OPERATION_NON_TRANSPOSE, + model->l, max_index, + nnzC, &one, + CUDA_R_64F, descrC, + csrValC, CUDA_R_64F, + csrRowPtrC, csrColIdxC, + all_one, CUDA_R_64F, + &zero, CUDA_R_64F, + d_out, CUDA_R_64F, + CUDA_R_64F, buffer); + assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + + // copy d_out back to host + CUDA_CHECK(cudaMemcpyAsync(out, d_out, sizeof(double)*model->l, cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + + // do out[i] = exp(-param.gamma*out[i]) + for( int i=0; il; i++ ) + out[i] = exp(-param.gamma*out[i]); + break; + default: + printf("unsupport kernel type\n"); + exit(0); + break; + } + + cudaFree(csrColIdxX); + cudaFree(csrValX); + cudaFree(csrRowPtrX); + cudaFree(csrRowPtrC); + cudaFree(buffer); + cudaFree(csrValC); + cudaFree(temp_csrValC); + cudaFree(csrColIdxC); + cudaFree(all_one); + cudaFree(d_out); + cudaFree(d_vectorX); + cudaFree(d_vectorY); + cusparseDestroyMatDescr(descrSV); + cusparseDestroyMatDescr(descrX); + cusparseDestroyMatDescr(descrC); + cublasDestroy(cublas); + cusparseDestroy(cusparse); + cudaStreamDestroy(stream); + + // debug, compare with base class result from cpu + /*std::vector Outbase(model->l, 0.0); + KernelBase::batch_k_function(x, model, Outbase.data()); + for( int idx=0; idxl; idx++ ) { + if( fabs(Outbase[idx]-out[idx])>0.01 ) + printf("batch_k_function %f <-> %f %d\n", Outbase[idx], out[idx], idx); }*/ } + #endif #ifdef SVM_CUDA @@ -2580,6 +2884,9 @@ svm_model *svm_train(const svm_problem *prob, const svm_parameter *param) free(nz_count); free(nz_start); } +#ifdef SVM_CUDA + model->cuda_init(); +#endif return model; } @@ -2755,8 +3062,14 @@ double svm_predict_values(const svm_model *model, const svm_node *x, double* dec { double *sv_coef = model->sv_coef[0]; double sum = 0; + //for(i=0;il;i++) + // sum += sv_coef[i] * Kernel::k_function(x,model->SV[i],model->param); + double *kvalue = Malloc(double,model->l); + Kernel::batch_k_function(x,model,kvalue); for(i=0;il;i++) - sum += sv_coef[i] * Kernel::k_function(x,model->SV[i],model->param); + sum += sv_coef[i] * kvalue[i]; + free(kvalue); + sum -= model->rho[0]; *dec_values = sum; @@ -2771,8 +3084,9 @@ double svm_predict_values(const svm_model *model, const svm_node *x, double* dec int l = model->l; double *kvalue = 
Malloc(double,l); - for(i=0;iSV[i],model->param); + //for(i=0;iSV[i],model->param); + Kernel::batch_k_function(x,model,kvalue); int *start = Malloc(int,nr_class); start[0] = 0; @@ -3237,6 +3551,9 @@ svm_model *svm_load_model(const char *model_file_name) return NULL; model->free_sv = 1; // XXX +#ifdef SVM_CUDA + model->cuda_init(); +#endif return model; } @@ -3273,6 +3590,10 @@ void svm_free_model_content(svm_model* model_ptr) free(model_ptr->nSV); model_ptr->nSV = NULL; + +#ifdef SVM_CUDA + model_ptr->cuda_free(); +#endif } void svm_free_and_destroy_model(svm_model** model_ptr_ptr) diff --git a/svm.h b/svm.h index b0bf6ef0..227ffbd0 100644 --- a/svm.h +++ b/svm.h @@ -69,6 +69,16 @@ struct svm_model /* XXX */ int free_sv; /* 1 if svm_model is created by svm_load_model*/ /* 0 if svm_model is created by svm_train */ +#ifdef SVM_CUDA + void cuda_init(); + void cuda_free(); + + int nMaxIdxSV; /* max index of SV */ + int nnzSV; /* non-zero value count of SV */ + int* csrRowPtrSV; /* row index of matrix SV */ + int* csrColIdxSV; /* column index of matrix SV */ + double* csrValSV; /* values of matrix SV */ +#endif }; struct svm_model *svm_train(const struct svm_problem *prob, const struct svm_parameter *param); From e8523d78803aa75ecc26b116874d738a857ff172 Mon Sep 17 00:00:00 2001 From: Ryan Pan Date: Wed, 22 Jan 2020 16:56:16 +0800 Subject: [PATCH 03/19] Update README --- README | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/README b/README index dce2c1f1..a22756a4 100644 --- a/README +++ b/README @@ -1,3 +1,14 @@ +How to build: +make clean; make svm-train-cuda + +How to use: +run svm-train-cuda instead of svm-train, with same command line arguments of svm-train + + +/////////////////////////////////////////////////////////////////////////////////////// + + + Libsvm is a simple, easy-to-use, and efficient software for SVM classification and regression. 
It solves C-SVM classification, nu-SVM classification, one-class-SVM, epsilon-SVM regression, and nu-SVM From 68d6cd9d5ae4ad94eb19042fa8ca37833891fb4d Mon Sep 17 00:00:00 2001 From: panqingfeng Date: Fri, 24 Jan 2020 01:28:48 +0800 Subject: [PATCH 04/19] a faster RBF_k_function --- Makefile | 2 +- svm.cpp | 350 +++++++++++++++++++++++-------------------------------- 2 files changed, 150 insertions(+), 202 deletions(-) diff --git a/Makefile b/Makefile index 6254268f..22980070 100644 --- a/Makefile +++ b/Makefile @@ -27,6 +27,6 @@ clean: svm-train.o: svm-train.c $(CXX) $(CFLAGS) -c svm-train.c svm-cuda.o: svm.cpp - $(CXX) $(CFLAGS) -DSVM_CUDA -c svm.cpp -o svm-cuda.o + nvcc -O3 -x cu -DSVM_CUDA -c svm.cpp -o svm-cuda.o svm-train-cuda: svm-train.o svm-cuda.o $(CXX) $(CFLAGS) svm-train.o svm-cuda.o -o svm-train-cuda -lm -lcusparse -lcublas -lcudart diff --git a/svm.cpp b/svm.cpp index cddd1722..40a442cf 100644 --- a/svm.cpp +++ b/svm.cpp @@ -331,8 +331,9 @@ double KernelBase::dot(const svm_node *px, const svm_node *py) void KernelBase::batch_k_function(const svm_node *x, const svm_model* model, double* out) { svm_node **SV = model->SV; - const svm_parameter& param = model->param; - for(int i=0; il; i++ ) { + const svm_parameter& param = model->param; + int i; + for(i=0; il; i++ ) { out[i] = k_function(x,SV[i],param); } } @@ -659,37 +660,49 @@ void svm_model::cuda_free() { CUDA_CHECK(cudaFree(csrValSV)); } +__global__ void RBF_k_function(int nnzX, double* csrValX, int* csrColIdxX, + int countSV, + double* csrValSV, int* csrRowPtrSV, int* csrColIdxSV, + double* out) { + int nSVIdx = blockDim.x*blockIdx.x + threadIdx.x; + if( nSVIdx>=countSV ) + return; + int* ColIdxSV = csrColIdxSV + csrRowPtrSV[nSVIdx]; + double* ValSV = csrValSV + csrRowPtrSV[nSVIdx]; + int nnzSV = csrRowPtrSV[nSVIdx+1]-csrRowPtrSV[nSVIdx]; + + int colX = 0, colSV = 0; + double sum = 0.0; + while( colXparam; - static const double one = 1.0; - static const double NegOne = -1.0; - static const double zero = 0.0; - cublasHandle_t cublas = NULL; - cusparseHandle_t cusparse = NULL; - cusparseMatDescr_t descrSV = NULL; - cusparseMatDescr_t descrX = NULL; - cusparseMatDescr_t descrC = NULL; - cudaStream_t stream = NULL; - CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); - cublasStatus_t cublasResult = cublasCreate(&cublas); - assert(CUBLAS_STATUS_SUCCESS == cublasResult); - cusparseStatus_t cusparseResult = cusparseCreate(&cusparse); - assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); - cusparseResult = cusparseCreateMatDescr(&descrSV); - assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); - cusparseSetMatIndexBase(descrSV,CUSPARSE_INDEX_BASE_ZERO); - cusparseSetMatType(descrSV, CUSPARSE_MATRIX_TYPE_GENERAL); - cusparseResult = cusparseCreateMatDescr(&descrX); - assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); - cusparseSetMatIndexBase(descrX,CUSPARSE_INDEX_BASE_ZERO); - cusparseSetMatType(descrX, CUSPARSE_MATRIX_TYPE_GENERAL); - cusparseResult = cusparseCreateMatDescr(&descrC); - assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); - cusparseSetMatIndexBase(descrC,CUSPARSE_INDEX_BASE_ZERO); - cusparseSetMatType(descrC, CUSPARSE_MATRIX_TYPE_GENERAL); - cusparseSetPointerMode(cusparse, CUSPARSE_POINTER_MODE_HOST); - cusparseSetStream(cusparse, stream); - cublasSetStream(cublas, stream); // get max_index int max_index = model->nMaxIdxSV; @@ -704,181 +717,125 @@ void CudaKernel::batch_k_function(const svm_node *x, const svm_model* model, dou } max_index++; - // create a sparse matrix X = model->l * [x] - 
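// For reference: the quantity each GPU thread of RBF_k_function accumulates for its
// support vector is the sparse squared distance ||x - SV[i]||^2, using the same
// index-merge that KernelBase::k_function performs on the CPU. A minimal host-side
// equivalent (an illustration over plain arrays, not code from this patch):
static double sparse_dist2(const int* colX, const double* valX, int nnzX,
			   const int* colV, const double* valV, int nnzV)
{
	double sum = 0.0;
	int i = 0, j = 0;
	while (i < nnzX && j < nnzV)
	{
		if (colX[i] == colV[j])		// column present in both vectors
		{
			double d = valX[i] - valV[j];
			sum += d * d;
			i++; j++;
		}
		else if (colX[i] < colV[j])	// column present only in x
		{
			sum += valX[i] * valX[i];
			i++;
		}
		else				// column present only in the SV
		{
			sum += valV[j] * valV[j];
			j++;
		}
	}
	while (i < nnzX) { sum += valX[i] * valX[i]; i++; }	// trailing x entries
	while (j < nnzV) { sum += valV[j] * valV[j]; j++; }	// trailing SV entries
	return sum;	// the caller applies exp(-gamma * sum), as the patch does on the host
}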
std::vector csrRowPtr(model->l+1); - std::vector csrVal(model->l*x_nnz); - std::vector csrColIdx(model->l*x_nnz); - int curPos = 0; - for( int i=0; il; i++ ) { - csrRowPtr[i] = curPos; - node = x; - while(node->index != -1) { - csrVal[curPos] = node->value; - csrColIdx[curPos] = node->index; - curPos++; - node++; - } - } - csrRowPtr[model->l] = curPos; + cusparseHandle_t cusparse = NULL; + cusparseMatDescr_t descrSV = NULL; int* csrColIdxX = NULL; double* csrValX = NULL; - int* csrRowPtrX = NULL; - CUDA_CHECK(cudaMalloc((void**)&csrRowPtrX, sizeof(int)*(model->l+1))); - CUDA_CHECK(cudaMalloc((void**)&csrColIdxX, sizeof(int)*model->l*x_nnz)); - CUDA_CHECK(cudaMalloc((void**)&csrValX, sizeof(double)*model->l*x_nnz)); - CUDA_CHECK(cudaMemcpyAsync(csrRowPtrX, csrRowPtr.data(), sizeof(int)*(model->l+1), cudaMemcpyHostToDevice, stream)); - CUDA_CHECK(cudaMemcpyAsync(csrColIdxX, csrColIdx.data(), sizeof(int)*model->l*x_nnz, cudaMemcpyHostToDevice, stream)); - CUDA_CHECK(cudaMemcpyAsync(csrValX, csrVal.data(), sizeof(double)*model->l*x_nnz, cudaMemcpyHostToDevice, stream)); - - int nnzC; - size_t buffer_size; char* buffer = NULL; - double* csrValC = NULL; - double* temp_csrValC = NULL; - int* csrRowPtrC = NULL; - int* csrColIdxC = NULL; double* all_one = NULL; double* d_out = NULL; - double* d_vectorX = NULL, *d_vectorY = NULL; - std::vector h_all_one, vectorX; + double* d_vectorX = NULL; + double* d_vectorY = NULL; switch(param.kernel_type) { case LINEAR: case POLY: case SIGMOID: - // get dense vector x ready - vectorX.resize(max_index, 0.0); - CUDA_CHECK(cudaMalloc((void**)&d_vectorX, sizeof(double)*max_index)); - node = x; - while(node->index != -1) { - vectorX[node->index] = node->value; - node++; - } - CUDA_CHECK(cudaMemcpyAsync(d_vectorX, vectorX.data(), sizeof(double)*max_index, cudaMemcpyHostToDevice, stream)); - - // get dense vector y ready - CUDA_CHECK(cudaMalloc((void**)&d_vectorY, sizeof(double)*model->l)); - - // get buffer size - cusparseResult = cusparseCsrmvEx_bufferSize(cusparse, - CUSPARSE_ALG0, - CUSPARSE_OPERATION_NON_TRANSPOSE, - model->l, max_index, - model->nnzSV, &one, CUDA_R_64F, - descrSV, - model->csrValSV, CUDA_R_64F, - model->csrRowPtrSV, model->csrColIdxSV, - d_vectorX, CUDA_R_64F, - &zero, CUDA_R_64F, - d_vectorY, CUDA_R_64F, - CUDA_R_64F, &buffer_size); - assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); - CUDA_CHECK(cudaMalloc ((void**)&buffer, buffer_size)); + { + static const double one = 1.0; + static const double zero = 0.0; + std::vector h_all_one, vectorX; + size_t buffer_size; + + cusparseStatus_t cusparseResult = cusparseCreate(&cusparse); + assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + cusparseResult = cusparseCreateMatDescr(&descrSV); + assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + cusparseSetMatIndexBase(descrSV,CUSPARSE_INDEX_BASE_ZERO); + cusparseSetMatType(descrSV, CUSPARSE_MATRIX_TYPE_GENERAL); + cusparseSetPointerMode(cusparse, CUSPARSE_POINTER_MODE_HOST); + + // get dense vector x ready + vectorX.resize(max_index, 0.0); + CUDA_CHECK(cudaMalloc((void**)&d_vectorX, sizeof(double)*max_index)); + node = x; + while(node->index != -1) { + vectorX[node->index] = node->value; + node++; + } + CUDA_CHECK(cudaMemcpy(d_vectorX, vectorX.data(), sizeof(double)*max_index, cudaMemcpyHostToDevice)); + + // get dense vector y ready + CUDA_CHECK(cudaMalloc((void**)&d_vectorY, sizeof(double)*model->l)); + + // get buffer size + cusparseResult = cusparseCsrmvEx_bufferSize(cusparse, + CUSPARSE_ALG0, + CUSPARSE_OPERATION_NON_TRANSPOSE, + model->l, 
max_index, + model->nnzSV, &one, CUDA_R_64F, + descrSV, + model->csrValSV, CUDA_R_64F, + model->csrRowPtrSV, model->csrColIdxSV, + d_vectorX, CUDA_R_64F, + &zero, CUDA_R_64F, + d_vectorY, CUDA_R_64F, + CUDA_R_64F, &buffer_size); + assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + CUDA_CHECK(cudaMalloc ((void**)&buffer, buffer_size)); - // y = dot(SV, x) - cusparseResult = cusparseCsrmvEx(cusparse, - CUSPARSE_ALG0, - CUSPARSE_OPERATION_NON_TRANSPOSE, - model->l, max_index, - model->nnzSV, - &one, CUDA_R_64F, - descrSV, - model->csrValSV, CUDA_R_64F, - model->csrRowPtrSV, model->csrColIdxSV, - d_vectorX, CUDA_R_64F, - &zero, CUDA_R_64F, - d_vectorY, CUDA_R_64F, - CUDA_R_64F, buffer); - assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); - - // copy vectorY back to host - CUDA_CHECK(cudaMemcpyAsync(out, d_vectorY, sizeof(double)*model->l, cudaMemcpyDeviceToHost, stream)); - CUDA_CHECK(cudaStreamSynchronize(stream)); - - // update out base on kernel_type - if( param.kernel_type==POLY ) { - for( int i=0; il; i++ ) - out[i] = powi(param.gamma*out[i]+param.coef0,param.degree); - } else if( param.kernel_type==SIGMOID ) { - for( int i=0; il; i++ ) - out[i] = tanh(param.gamma*out[i]+param.coef0); + // y = dot(SV, x) + cusparseResult = cusparseCsrmvEx(cusparse, + CUSPARSE_ALG0, + CUSPARSE_OPERATION_NON_TRANSPOSE, + model->l, max_index, + model->nnzSV, + &one, CUDA_R_64F, + descrSV, + model->csrValSV, CUDA_R_64F, + model->csrRowPtrSV, model->csrColIdxSV, + d_vectorX, CUDA_R_64F, + &zero, CUDA_R_64F, + d_vectorY, CUDA_R_64F, + CUDA_R_64F, buffer); + assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + + // copy vectorY back to host + CUDA_CHECK(cudaMemcpy(out, d_vectorY, sizeof(double)*model->l, cudaMemcpyDeviceToHost)); + + // update out base on kernel_type + if( param.kernel_type==POLY ) { + for( int i=0; il; i++ ) + out[i] = powi(param.gamma*out[i]+param.coef0,param.degree); + } else if( param.kernel_type==SIGMOID ) { + for( int i=0; il; i++ ) + out[i] = tanh(param.gamma*out[i]+param.coef0); + } } break; case RBF: - // create a sparse matrix C = X-SV - CUDA_CHECK(cudaMalloc((void**)&csrRowPtrC, sizeof(int)*(model->l+1))); - cusparseResult = cusparseXcsrgeamNnz(cusparse, model->l, max_index, - descrX, model->l*x_nnz, csrRowPtrX, csrColIdxX, - descrSV, model->nnzSV, model->csrRowPtrSV, model->csrColIdxSV, - descrC, csrRowPtrC, &nnzC); - assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); - CUDA_CHECK(cudaMalloc((void**)&csrColIdxC, sizeof(int)*nnzC)); - CUDA_CHECK(cudaMalloc((void**)&csrValC, sizeof(double)*nnzC)); - CUDA_CHECK(cudaMalloc((void**)&temp_csrValC, sizeof(double)*nnzC)); - - cusparseResult = cusparseDcsrgeam(cusparse, model->l, max_index, - &one, - descrX, model->l*x_nnz, - csrValX, csrRowPtrX, csrColIdxX, - &NegOne, - descrSV, model->nnzSV, - model->csrValSV, model->csrRowPtrSV, model->csrColIdxSV, - descrC, - csrValC, csrRowPtrC, csrColIdxC); - assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); - - // do C = C^2 - // equal to vector csrValC = csrValC^2 - CUDA_CHECK(cudaMemcpyAsync(temp_csrValC, csrValC,sizeof(double)*nnzC, cudaMemcpyDeviceToDevice, stream)); - cublasResult = cublasDsbmv(cublas, CUBLAS_FILL_MODE_LOWER, - nnzC, 0, &one, - temp_csrValC, 1, - temp_csrValC, 1, - &zero, csrValC, 1); - assert(CUBLAS_STATUS_SUCCESS == cublasResult); - - // out = C * all_one_vector_with_size_max_index - CUDA_CHECK(cudaMalloc((void**)&all_one, sizeof(double)*max_index)); - CUDA_CHECK(cudaMalloc((void**)&d_out, sizeof(double)*model->l)); - h_all_one.resize(max_index, 1.0); - 
CUDA_CHECK(cudaMemcpyAsync(all_one, h_all_one.data(), sizeof(double)*max_index, cudaMemcpyHostToDevice, stream)); - - cusparseResult = cusparseCsrmvEx_bufferSize(cusparse, - CUSPARSE_ALG0, - CUSPARSE_OPERATION_NON_TRANSPOSE, - model->l, max_index, - nnzC, &one, - CUDA_R_64F, descrC, - csrValC, CUDA_R_64F, - csrRowPtrC, csrColIdxC, - all_one, CUDA_R_64F, - &zero, CUDA_R_64F, - d_out, CUDA_R_64F, - CUDA_R_64F, &buffer_size); - assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); - CUDA_CHECK(cudaMalloc ((void**)&buffer, buffer_size)); - - cusparseResult = cusparseCsrmvEx(cusparse, - CUSPARSE_ALG0, - CUSPARSE_OPERATION_NON_TRANSPOSE, - model->l, max_index, - nnzC, &one, - CUDA_R_64F, descrC, - csrValC, CUDA_R_64F, - csrRowPtrC, csrColIdxC, - all_one, CUDA_R_64F, - &zero, CUDA_R_64F, - d_out, CUDA_R_64F, - CUDA_R_64F, buffer); - assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); - - // copy d_out back to host - CUDA_CHECK(cudaMemcpyAsync(out, d_out, sizeof(double)*model->l, cudaMemcpyDeviceToHost, stream)); - CUDA_CHECK(cudaStreamSynchronize(stream)); - - // do out[i] = exp(-param.gamma*out[i]) - for( int i=0; il; i++ ) - out[i] = exp(-param.gamma*out[i]); + { + // get vector x ready + std::vector h_csrValX(x_nnz); + std::vector h_csrColIdxX(x_nnz); + int curPos = 0; + node = x; + while(node->index != -1) { + h_csrValX[curPos] = node->value; + h_csrColIdxX[curPos] = node->index; + curPos++; + node++; + } + CUDA_CHECK(cudaMalloc((void**)&csrColIdxX, sizeof(int)*x_nnz)); + CUDA_CHECK(cudaMalloc((void**)&csrValX, sizeof(double)*x_nnz)); + CUDA_CHECK(cudaMalloc((void**)&d_out, sizeof(double)*model->l)); + CUDA_CHECK(cudaMemcpy(csrColIdxX, h_csrColIdxX.data(), sizeof(int)*x_nnz, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(csrValX, h_csrValX.data(), sizeof(double)*x_nnz, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemset(d_out, 0, sizeof(double)*model->l)); + + const int threadPerBlock=256; + int blockPerGrid=(model->l+threadPerBlock-1)/threadPerBlock; + RBF_k_function<<>>(x_nnz, csrValX, csrColIdxX, + model->l, model->csrValSV, model->csrRowPtrSV, model->csrColIdxSV, + d_out); + + // copy d_out back to host + CUDA_CHECK(cudaMemcpy(out, d_out, sizeof(double)*model->l, cudaMemcpyDeviceToHost)); + + // do out[i] = exp(-param.gamma*out[i]) + for( int i=0; il; i++ ) + out[i] = exp(-param.gamma*out[i]); + } break; default: printf("unsupport kernel type\n"); @@ -888,22 +845,13 @@ void CudaKernel::batch_k_function(const svm_node *x, const svm_model* model, dou cudaFree(csrColIdxX); cudaFree(csrValX); - cudaFree(csrRowPtrX); - cudaFree(csrRowPtrC); cudaFree(buffer); - cudaFree(csrValC); - cudaFree(temp_csrValC); - cudaFree(csrColIdxC); cudaFree(all_one); cudaFree(d_out); cudaFree(d_vectorX); cudaFree(d_vectorY); cusparseDestroyMatDescr(descrSV); - cusparseDestroyMatDescr(descrX); - cusparseDestroyMatDescr(descrC); - cublasDestroy(cublas); cusparseDestroy(cusparse); - cudaStreamDestroy(stream); // debug, compare with base class result from cpu /*std::vector Outbase(model->l, 0.0); From 8c4d66a612ca4d6c7891c0cbded50ec30ca435bc Mon Sep 17 00:00:00 2001 From: Ryan Pan Date: Sun, 2 Feb 2020 08:54:58 +0800 Subject: [PATCH 05/19] a faster RBF_k_function, with less cudaMalloc/cudaFree --- svm.cpp | 242 ++++++++++++++++++++++++++++++++++++++++---------------- svm.h | 14 ++++ 2 files changed, 188 insertions(+), 68 deletions(-) diff --git a/svm.cpp b/svm.cpp index 40a442cf..06fef5f8 100644 --- a/svm.cpp +++ b/svm.cpp @@ -612,6 +612,14 @@ void CudaKernel::batch_kernel_function(int i, Qfloat *Q, int start, int end) 
con } void svm_model::cuda_init() { + problem = NULL; + nPreloadIdx = 0; + csrRowPtrProblem = NULL; + csrColIdxProblem = NULL; + csrValProblem = NULL; + csrValProblemSquare = NULL; + d_out = NULL; + nMaxIdxSV = 0; nnzSV = 0; const svm_node *node; @@ -628,6 +636,7 @@ void svm_model::cuda_init() { // get csrVal csrColIdx ready std::vector csrRowPtr(l+1); std::vector csrVal(nnzSV); + std::vector csrValSquare(nnzSV); std::vector csrColIdx(nnzSV); int curPos = 0; for( int i=0; iindex != -1) { csrVal[curPos] = node->value; + csrValSquare[curPos] = csrVal[curPos]*csrVal[curPos]; csrColIdx[curPos] = node->index; curPos++; node++; @@ -645,30 +655,108 @@ void svm_model::cuda_init() { // copy csrVal, csrColIdx to device csrColIdxSV = NULL; csrValSV = NULL; + csrValSVSquare = NULL; csrRowPtrSV = NULL; CUDA_CHECK(cudaMalloc((void**)&csrRowPtrSV, sizeof(int)*(l+1))); CUDA_CHECK(cudaMalloc((void**)&csrColIdxSV, sizeof(int)*nnzSV)); CUDA_CHECK(cudaMalloc((void**)&csrValSV, sizeof(double)*nnzSV)); + CUDA_CHECK(cudaMalloc((void**)&csrValSVSquare, sizeof(double)*nnzSV)); CUDA_CHECK(cudaMemcpy(csrRowPtrSV, csrRowPtr.data(), sizeof(int)*(l+1), cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(csrColIdxSV, csrColIdx.data(), sizeof(int)*nnzSV, cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(csrValSV, csrVal.data(), sizeof(double)*nnzSV, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(csrValSVSquare, csrValSquare.data(), sizeof(double)*nnzSV, cudaMemcpyHostToDevice)); +} + +void svm_model::cuda_preload_problem(const struct svm_problem *prob) { + free(csrRowPtrProblem); + CUDA_CHECK(cudaFree(csrColIdxProblem)); + CUDA_CHECK(cudaFree(csrValProblem)); + CUDA_CHECK(cudaFree(csrValProblemSquare)); + CUDA_CHECK(cudaFree(d_out)); + csrRowPtrProblem = NULL; + csrColIdxProblem = NULL; + csrValProblem = NULL; + csrValProblemSquare = NULL; + d_out = NULL; + problem = prob; + if( problem!=NULL ) { + nMaxIdxProblem = 0; + nnzProblem = 0; + const svm_node *node; + for( int i=0; il; i++ ) { + node = problem->x[i]; + while(node->index != -1) { + if( node->index > nMaxIdxProblem ) + nMaxIdxProblem = node->index; + nnzProblem++; + node++; + } + } + csrRowPtrProblem = Malloc(int,problem->l+1); + std::vector csrVal(nnzProblem); + std::vector csrValSquare(nnzProblem); + std::vector csrColIdx(nnzProblem); + int curPos = 0; + for( int i=0; il; i++ ) { + csrRowPtrProblem[i] = curPos; + node = problem->x[i]; + while(node->index != -1) { + csrVal[curPos] = node->value; + csrValSquare[curPos] = csrVal[curPos]*csrVal[curPos]; + csrColIdx[curPos] = node->index; + curPos++; + node++; + } + } + csrRowPtrProblem[problem->l] = curPos; + + CUDA_CHECK(cudaMalloc((void**)&d_out, sizeof(double)*l)); + CUDA_CHECK(cudaMalloc((void**)&csrColIdxProblem, sizeof(int)*nnzProblem)); + CUDA_CHECK(cudaMalloc((void**)&csrValProblem, sizeof(double)*nnzProblem)); + CUDA_CHECK(cudaMalloc((void**)&csrValProblemSquare, sizeof(double)*nnzProblem)); + CUDA_CHECK(cudaMemcpy(csrColIdxProblem, csrColIdx.data(), sizeof(int)*nnzProblem, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(csrValProblem, csrVal.data(), sizeof(double)*nnzProblem, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(csrValProblemSquare, csrValSquare.data(), sizeof(double)*nnzProblem, cudaMemcpyHostToDevice)); + } +} + +void svm_model::setPreloadIdx(int nIdx) { + nPreloadIdx = nIdx; +} + +int svm_model::getPreloadX(int* nnz, int** csrColIdx, double** csrVal, double** csrValSquare) const { + if( problem==NULL ) + return 0; + int* csrRowPtr = csrRowPtrProblem + nPreloadIdx; + *nnz = 
*(csrRowPtr+1)-*csrRowPtr; + *csrColIdx = csrColIdxProblem + *csrRowPtr; + *csrVal = csrValProblem + *csrRowPtr; + *csrValSquare = csrValProblemSquare + *csrRowPtr; + return 1; } void svm_model::cuda_free() { CUDA_CHECK(cudaFree(csrRowPtrSV)); CUDA_CHECK(cudaFree(csrColIdxSV)); CUDA_CHECK(cudaFree(csrValSV)); + CUDA_CHECK(cudaFree(csrValSVSquare)); + free(csrRowPtrProblem); + CUDA_CHECK(cudaFree(csrColIdxProblem)); + CUDA_CHECK(cudaFree(csrValProblem)); + CUDA_CHECK(cudaFree(csrValProblemSquare)); + CUDA_CHECK(cudaFree(d_out)); } -__global__ void RBF_k_function(int nnzX, double* csrValX, int* csrColIdxX, +__global__ void RBF_k_function(int nnzX, double* csrValX, double* csrValXSquare, int* csrColIdxX, int countSV, - double* csrValSV, int* csrRowPtrSV, int* csrColIdxSV, + double* csrValSV, double* csrValSVSquare, int* csrRowPtrSV, int* csrColIdxSV, double* out) { int nSVIdx = blockDim.x*blockIdx.x + threadIdx.x; if( nSVIdx>=countSV ) return; int* ColIdxSV = csrColIdxSV + csrRowPtrSV[nSVIdx]; double* ValSV = csrValSV + csrRowPtrSV[nSVIdx]; + double* ValSVSquare = csrValSVSquare + csrRowPtrSV[nSVIdx]; int nnzSV = csrRowPtrSV[nSVIdx+1]-csrRowPtrSV[nSVIdx]; int colX = 0, colSV = 0; @@ -681,60 +769,39 @@ __global__ void RBF_k_function(int nnzX, double* csrValX, int* csrColIdxX, colSV++; } else { if( csrColIdxX[colX] < ColIdxSV[colSV] ) { - sum += csrValX[colX]*csrValX[colX]; + sum += csrValXSquare[colX]; colX++; } else { - sum += ValSV[colSV]*ValSV[colSV]; + sum += ValSVSquare[colSV]; colSV++; } } } while( colXparam; - - // get max_index - int max_index = model->nMaxIdxSV; - const svm_node *node; - node = x; - int x_nnz = 0; - while(node->index != -1) { - if( node->index > max_index ) - max_index = node->index; - x_nnz++; - node++; - } - max_index++; - - cusparseHandle_t cusparse = NULL; - cusparseMatDescr_t descrSV = NULL; - int* csrColIdxX = NULL; - double* csrValX = NULL; - char* buffer = NULL; - double* all_one = NULL; - double* d_out = NULL; - double* d_vectorX = NULL; - double* d_vectorY = NULL; switch(param.kernel_type) { case LINEAR: case POLY: case SIGMOID: { + cusparseHandle_t cusparse = NULL; + cusparseMatDescr_t descrSV = NULL; static const double one = 1.0; static const double zero = 0.0; - std::vector h_all_one, vectorX; + std::vector vectorX; size_t buffer_size; cusparseStatus_t cusparseResult = cusparseCreate(&cusparse); @@ -745,9 +812,22 @@ void CudaKernel::batch_k_function(const svm_node *x, const svm_model* model, dou cusparseSetMatType(descrSV, CUSPARSE_MATRIX_TYPE_GENERAL); cusparseSetPointerMode(cusparse, CUSPARSE_POINTER_MODE_HOST); + int max_index = model->nMaxIdxSV; + const svm_node *node = x; + int x_nnz = 0; + while(node->index != -1) { + if( node->index > max_index ) + max_index = node->index; + x_nnz++; + node++; + } + max_index++; + // get dense vector x ready + double* d_vector = NULL; + CUDA_CHECK(cudaMalloc((void**)&d_vector, sizeof(double)*(max_index+model->l))); + double* d_vectorX = d_vector; vectorX.resize(max_index, 0.0); - CUDA_CHECK(cudaMalloc((void**)&d_vectorX, sizeof(double)*max_index)); node = x; while(node->index != -1) { vectorX[node->index] = node->value; @@ -756,7 +836,7 @@ void CudaKernel::batch_k_function(const svm_node *x, const svm_model* model, dou CUDA_CHECK(cudaMemcpy(d_vectorX, vectorX.data(), sizeof(double)*max_index, cudaMemcpyHostToDevice)); // get dense vector y ready - CUDA_CHECK(cudaMalloc((void**)&d_vectorY, sizeof(double)*model->l)); + double* d_vectorY = d_vector + max_index; // get buffer size cusparseResult = 
cusparseCsrmvEx_bufferSize(cusparse, @@ -772,6 +852,7 @@ void CudaKernel::batch_k_function(const svm_node *x, const svm_model* model, dou d_vectorY, CUDA_R_64F, CUDA_R_64F, &buffer_size); assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + char* buffer = NULL; CUDA_CHECK(cudaMalloc ((void**)&buffer, buffer_size)); // y = dot(SV, x) @@ -801,36 +882,64 @@ void CudaKernel::batch_k_function(const svm_node *x, const svm_model* model, dou for( int i=0; il; i++ ) out[i] = tanh(param.gamma*out[i]+param.coef0); } + + cudaFree(d_vector); + cudaFree(buffer); + cusparseDestroyMatDescr(descrSV); + cusparseDestroy(cusparse); } break; case RBF: { - // get vector x ready - std::vector h_csrValX(x_nnz); - std::vector h_csrColIdxX(x_nnz); - int curPos = 0; - node = x; - while(node->index != -1) { - h_csrValX[curPos] = node->value; - h_csrColIdxX[curPos] = node->index; - curPos++; - node++; + const int threadPerBlock = 256; + int blockPerGrid = (model->l+threadPerBlock-1)/threadPerBlock; + int nnzX = 0; + int* csrColIdxX = NULL; + double* csrValX = NULL; + double* csrValXSquare = NULL; + if( model->getPreloadX(&nnzX, &csrColIdxX, &csrValX, &csrValXSquare) ) { + // vector X is preloaded(training) + RBF_k_function<<>>(nnzX, csrValX, csrValXSquare, csrColIdxX, + model->l, model->csrValSV, model->csrValSVSquare, model->csrRowPtrSV, model->csrColIdxSV, + model->d_out); + CUDA_CHECK(cudaMemcpy(out, model->d_out, sizeof(double)*model->l, cudaMemcpyDeviceToHost)); + } else { + // vector X is not preloaded(predict) + const svm_node *node = x; + int x_nnz = 0; + while(node->index != -1) { + x_nnz++; + node++; + } + + // get vector x ready + std::vector h_csrValX(x_nnz*2); + double* h_csrValXSquare = h_csrValX.data()+x_nnz; + std::vector h_csrColIdxX(x_nnz); + int curPos = 0; + node = x; + while(node->index != -1) { + h_csrValX[curPos] = node->value; + h_csrValXSquare[curPos] = h_csrValX[curPos]*h_csrValX[curPos]; + h_csrColIdxX[curPos] = node->index; + curPos++; + node++; + } + CUDA_CHECK(cudaMalloc((void**)&csrValX, sizeof(double)*(x_nnz*2+model->l)+sizeof(int)*x_nnz)); + csrValXSquare = csrValX + x_nnz; + double* d_out = csrValXSquare + x_nnz; + csrColIdxX = (int*)&d_out[model->l]; + CUDA_CHECK(cudaMemcpy(csrColIdxX, h_csrColIdxX.data(), sizeof(int)*x_nnz, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(csrValX, h_csrValX.data(), sizeof(double)*x_nnz*2, cudaMemcpyHostToDevice)); + + RBF_k_function<<>>(x_nnz, csrValX, csrValXSquare, csrColIdxX, + model->l, model->csrValSV, model->csrValSVSquare, model->csrRowPtrSV, model->csrColIdxSV, + d_out); + + // copy d_out back to host + CUDA_CHECK(cudaMemcpy(out, d_out, sizeof(double)*model->l, cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaFree(csrValX)); } - CUDA_CHECK(cudaMalloc((void**)&csrColIdxX, sizeof(int)*x_nnz)); - CUDA_CHECK(cudaMalloc((void**)&csrValX, sizeof(double)*x_nnz)); - CUDA_CHECK(cudaMalloc((void**)&d_out, sizeof(double)*model->l)); - CUDA_CHECK(cudaMemcpy(csrColIdxX, h_csrColIdxX.data(), sizeof(int)*x_nnz, cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(csrValX, h_csrValX.data(), sizeof(double)*x_nnz, cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemset(d_out, 0, sizeof(double)*model->l)); - - const int threadPerBlock=256; - int blockPerGrid=(model->l+threadPerBlock-1)/threadPerBlock; - RBF_k_function<<>>(x_nnz, csrValX, csrColIdxX, - model->l, model->csrValSV, model->csrRowPtrSV, model->csrColIdxSV, - d_out); - - // copy d_out back to host - CUDA_CHECK(cudaMemcpy(out, d_out, sizeof(double)*model->l, cudaMemcpyDeviceToHost)); // do out[i] = 
exp(-param.gamma*out[i]) for( int i=0; il; i++ ) @@ -843,16 +952,6 @@ void CudaKernel::batch_k_function(const svm_node *x, const svm_model* model, dou break; } - cudaFree(csrColIdxX); - cudaFree(csrValX); - cudaFree(buffer); - cudaFree(all_one); - cudaFree(d_out); - cudaFree(d_vectorX); - cudaFree(d_vectorY); - cusparseDestroyMatDescr(descrSV); - cusparseDestroy(cusparse); - // debug, compare with base class result from cpu /*std::vector Outbase(model->l, 0.0); KernelBase::batch_k_function(x, model, Outbase.data()); @@ -2459,8 +2558,15 @@ static void svm_binary_svc_probability( subparam.weight[0]=Cp; subparam.weight[1]=Cn; struct svm_model *submodel = svm_train(&subprob,&subparam); +#ifdef SVM_CUDA + submodel->cuda_preload_problem(prob); for(j=begin;jsetPreloadIdx(perm[j]); +#else + for(j=begin;jx[perm[j]],&(dec_values[perm[j]])); // ensure +1 -1 order; reason not using CV subroutine dec_values[perm[j]] *= submodel->label[0]; diff --git a/svm.h b/svm.h index 227ffbd0..5b2ca0d9 100644 --- a/svm.h +++ b/svm.h @@ -78,6 +78,20 @@ struct svm_model int* csrRowPtrSV; /* row index of matrix SV */ int* csrColIdxSV; /* column index of matrix SV */ double* csrValSV; /* values of matrix SV */ + double* csrValSVSquare; /* value^2 of matrix SV */ + + void cuda_preload_problem(const struct svm_problem *problem); /* load problem in gpu memory, non-thread-safe, speed up training */ + void setPreloadIdx(int nIdx); /* called before getPreloadX() */ + int getPreloadX(int* nnz, int** csrColIdx, double** csrVal, double** csrValSquare) const; /* get vector x */ + int nPreloadIdx; + const struct svm_problem *problem; + int* csrRowPtrProblem; /* row index of matrix problem */ + int nMaxIdxProblem; /* max index of problem */ + int nnzProblem; /* non-zero value count of problem */ + int* csrColIdxProblem; /* cloumn index of matrix problem */ + double* csrValProblem; /* values of matrix problem */ + double* csrValProblemSquare; /* value^2 of matrix problem */ + double* d_out; /* temp output */ #endif }; From 4dbd835fa0d5235b9a99d825a6b420ec2cb944e6 Mon Sep 17 00:00:00 2001 From: Ryan Pan Date: Wed, 5 Feb 2020 22:48:25 +0800 Subject: [PATCH 06/19] a faster RBF_kernel_function --- svm.cpp | 305 +++++++++++++------------------------------------------- svm.h | 14 +-- 2 files changed, 73 insertions(+), 246 deletions(-) diff --git a/svm.cpp b/svm.cpp index 06fef5f8..d85a6848 100644 --- a/svm.cpp +++ b/svm.cpp @@ -421,14 +421,14 @@ class CudaKernel : public KernelBase { std::vector m_csrRowPtr; // csr row ptr of x matrix int* d_csrColIdx; // csr col index of x matrix double* d_csrVal; // csr val vector of x matrix - double* d_vectorX; + double* d_squareX; // a gpu copy of x_square int m_sizeX; // size of x int m_nnz; // number of non-zero value of x matrix }; CudaKernel::CudaKernel(int l, svm_node * const * x, const svm_parameter& param) : KernelBase(l, x, param), m_cusparse(NULL), m_descr(NULL), m_max_index(0), m_csrRowPtr(l+1), - d_csrColIdx(NULL), d_csrVal(NULL), d_vectorX(NULL), m_sizeX(l), m_nnz(0) { + d_csrColIdx(NULL), d_csrVal(NULL), d_squareX(NULL), m_sizeX(l), m_nnz(0) { // Init handlers cusparseStatus_t cusparseResult = cusparseCreate(&m_cusparse); assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); @@ -474,23 +474,24 @@ void CudaKernel::x2csrMatrix(int sizeX) const { csrRowPtr[sizeX] = curPos; // copy csrVal, csrColIdx to device - if( !d_vectorX ) { - CUDA_CHECK(cudaMalloc((void**)&d_vectorX, sizeof(double)*m_max_index)); - } if( !d_csrColIdx ) { CUDA_CHECK(cudaMalloc((void**)&d_csrColIdx, sizeof(int)*m_nnz)); } 
if( !d_csrVal ) { CUDA_CHECK(cudaMalloc((void**)&d_csrVal, sizeof(double)*m_nnz)); } + if( !d_squareX ) { + CUDA_CHECK(cudaMalloc((void**)&d_squareX, sizeof(double)*sizeX)); + } CUDA_CHECK(cudaMemcpy(d_csrColIdx, csrColIdx.data(), sizeof(int)*m_nnz, cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(d_csrVal, csrVal.data(), sizeof(double)*m_nnz, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_squareX, x_square, sizeof(double)*sizeX, cudaMemcpyHostToDevice)); } CudaKernel::~CudaKernel() { - if(d_csrColIdx) cudaFree(d_csrColIdx); - if(d_csrVal) cudaFree(d_csrVal); - if(d_vectorX) cudaFree(d_vectorX); + CUDA_CHECK(cudaFree(d_csrColIdx)); + CUDA_CHECK(cudaFree(d_csrVal)); + CUDA_CHECK(cudaFree(d_squareX)); if(m_cusparse) cusparseDestroy(m_cusparse); if(m_descr) cusparseDestroyMatDescr(m_descr); } @@ -500,6 +501,28 @@ void CudaKernel::done_shrinking() const { x2csrMatrix(m_sizeX); } +__global__ void dot2kernelValue(int kernel_type, int len, double gamma, double coef0, double degree, double squareX, double* d_squareA, int start, double* Vector) { + int idx = blockDim.x*blockIdx.x + threadIdx.x; + if( idx >= len ) + return; + + // Vector input dot(), output kernel Value + switch(kernel_type) { + default: + case LINEAR: // do nothing + break; + case POLY: + Vector[idx] = pow(gamma*Vector[idx]+coef0,degree); + break; + case RBF: + Vector[idx] = exp(-gamma*(squareX+d_squareA[start+idx]-2*Vector[idx])); + break; + case SIGMOID: + Vector[idx] = tanh(gamma*Vector[idx]+coef0); + break; + } +} + void CudaKernel::batch_kernel_function(int i, Qfloat *Q, int start, int end) const { int len = end-start; @@ -508,27 +531,35 @@ void CudaKernel::batch_kernel_function(int i, Qfloat *Q, int start, int end) con int* csrColIndA = d_csrColIdx + m_csrRowPtr[start]; int nnz = m_csrRowPtr[len+start] - m_csrRowPtr[start]; + const svm_node *node = x[i]; + int max_index = 0; + while(node->index != -1) { + if( max_index < node->index ) + max_index = node->index; + node++; + } + max_index++; + double* d_vectors = NULL; + CUDA_CHECK(cudaMalloc((void**)&d_vectors, sizeof(int)*(len+1)+sizeof(double)*max_index+sizeof(double)*len)); + double* d_vectorX = d_vectors; + double* d_vectorY = &d_vectorX[max_index]; + int* csrRowPtrA = (int*)&d_vectorY[len]; + // get csrRowPtrA ready for cusparseCsrmvEx std::vector csrRow(len+1); for( int idx=0; idx<=len; idx++ ) { csrRow[idx] = m_csrRowPtr[idx+start] - m_csrRowPtr[start]; } - int* csrRowPtrA = NULL; - CUDA_CHECK(cudaMalloc((void**)&csrRowPtrA, sizeof(int)*(len+1))); CUDA_CHECK(cudaMemcpy(csrRowPtrA, csrRow.data(), sizeof(int)*(len+1), cudaMemcpyHostToDevice)); // get vector x ready for cusparseCsrmvEx - std::vector vectorX(m_max_index, 0.0); - const svm_node *node = x[i]; + std::vector vectorX(max_index, 0.0); + node = x[i]; while(node->index != -1) { vectorX[node->index] = node->value; node++; } - CUDA_CHECK(cudaMemcpy(d_vectorX, vectorX.data(), sizeof(double)*m_max_index, cudaMemcpyHostToDevice)); - - // get vector y ready for cusparseCsrmvEx - double* d_vectorY = NULL; - CUDA_CHECK(cudaMalloc((void**)&d_vectorY, sizeof(double)*len)); + CUDA_CHECK(cudaMemcpy(d_vectorX, vectorX.data(), sizeof(double)*max_index, cudaMemcpyHostToDevice)); // get temp buffer size and get it ready size_t buffer_size = 0; @@ -573,32 +604,20 @@ void CudaKernel::batch_kernel_function(int i, Qfloat *Q, int start, int end) con buffer); assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + // update out base on kernel_type + const int threadPerBlock = 256; + int blockPerGrid = 
(len+threadPerBlock-1)/threadPerBlock; + dot2kernelValue<<>>(kernel_type, len, gamma, coef0, + degree, x_square[i], d_squareX, start, d_vectorY); + // copy d_vectorY back to host std::vector vectorY(len); CUDA_CHECK(cudaMemcpy(vectorY.data(), d_vectorY, sizeof(double)*len, cudaMemcpyDeviceToHost)); for( int idx=0; idx csrVal(nnzSV); std::vector csrValSquare(nnzSV); std::vector csrColIdx(nnzSV); + std::vector hSVSquare(l); int curPos = 0; for( int i=0; iindex != -1) { csrVal[curPos] = node->value; csrValSquare[curPos] = csrVal[curPos]*csrVal[curPos]; csrColIdx[curPos] = node->index; + hSVSquare[i] += node->value*node->value; curPos++; node++; } @@ -657,82 +671,17 @@ void svm_model::cuda_init() { csrValSV = NULL; csrValSVSquare = NULL; csrRowPtrSV = NULL; + SVSquare = NULL; CUDA_CHECK(cudaMalloc((void**)&csrRowPtrSV, sizeof(int)*(l+1))); CUDA_CHECK(cudaMalloc((void**)&csrColIdxSV, sizeof(int)*nnzSV)); CUDA_CHECK(cudaMalloc((void**)&csrValSV, sizeof(double)*nnzSV)); CUDA_CHECK(cudaMalloc((void**)&csrValSVSquare, sizeof(double)*nnzSV)); + CUDA_CHECK(cudaMalloc((void**)&SVSquare, sizeof(double)*l)); CUDA_CHECK(cudaMemcpy(csrRowPtrSV, csrRowPtr.data(), sizeof(int)*(l+1), cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(csrColIdxSV, csrColIdx.data(), sizeof(int)*nnzSV, cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(csrValSV, csrVal.data(), sizeof(double)*nnzSV, cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(csrValSVSquare, csrValSquare.data(), sizeof(double)*nnzSV, cudaMemcpyHostToDevice)); -} - -void svm_model::cuda_preload_problem(const struct svm_problem *prob) { - free(csrRowPtrProblem); - CUDA_CHECK(cudaFree(csrColIdxProblem)); - CUDA_CHECK(cudaFree(csrValProblem)); - CUDA_CHECK(cudaFree(csrValProblemSquare)); - CUDA_CHECK(cudaFree(d_out)); - csrRowPtrProblem = NULL; - csrColIdxProblem = NULL; - csrValProblem = NULL; - csrValProblemSquare = NULL; - d_out = NULL; - problem = prob; - if( problem!=NULL ) { - nMaxIdxProblem = 0; - nnzProblem = 0; - const svm_node *node; - for( int i=0; il; i++ ) { - node = problem->x[i]; - while(node->index != -1) { - if( node->index > nMaxIdxProblem ) - nMaxIdxProblem = node->index; - nnzProblem++; - node++; - } - } - csrRowPtrProblem = Malloc(int,problem->l+1); - std::vector csrVal(nnzProblem); - std::vector csrValSquare(nnzProblem); - std::vector csrColIdx(nnzProblem); - int curPos = 0; - for( int i=0; il; i++ ) { - csrRowPtrProblem[i] = curPos; - node = problem->x[i]; - while(node->index != -1) { - csrVal[curPos] = node->value; - csrValSquare[curPos] = csrVal[curPos]*csrVal[curPos]; - csrColIdx[curPos] = node->index; - curPos++; - node++; - } - } - csrRowPtrProblem[problem->l] = curPos; - - CUDA_CHECK(cudaMalloc((void**)&d_out, sizeof(double)*l)); - CUDA_CHECK(cudaMalloc((void**)&csrColIdxProblem, sizeof(int)*nnzProblem)); - CUDA_CHECK(cudaMalloc((void**)&csrValProblem, sizeof(double)*nnzProblem)); - CUDA_CHECK(cudaMalloc((void**)&csrValProblemSquare, sizeof(double)*nnzProblem)); - CUDA_CHECK(cudaMemcpy(csrColIdxProblem, csrColIdx.data(), sizeof(int)*nnzProblem, cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(csrValProblem, csrVal.data(), sizeof(double)*nnzProblem, cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(csrValProblemSquare, csrValSquare.data(), sizeof(double)*nnzProblem, cudaMemcpyHostToDevice)); - } -} - -void svm_model::setPreloadIdx(int nIdx) { - nPreloadIdx = nIdx; -} - -int svm_model::getPreloadX(int* nnz, int** csrColIdx, double** csrVal, double** csrValSquare) const { - if( problem==NULL ) - return 0; - int* csrRowPtr = csrRowPtrProblem 
+ nPreloadIdx; - *nnz = *(csrRowPtr+1)-*csrRowPtr; - *csrColIdx = csrColIdxProblem + *csrRowPtr; - *csrVal = csrValProblem + *csrRowPtr; - *csrValSquare = csrValProblemSquare + *csrRowPtr; - return 1; + CUDA_CHECK(cudaMemcpy(SVSquare, hSVSquare.data(), sizeof(double)*l, cudaMemcpyHostToDevice)); } void svm_model::cuda_free() { @@ -740,53 +689,7 @@ void svm_model::cuda_free() { CUDA_CHECK(cudaFree(csrColIdxSV)); CUDA_CHECK(cudaFree(csrValSV)); CUDA_CHECK(cudaFree(csrValSVSquare)); - free(csrRowPtrProblem); - CUDA_CHECK(cudaFree(csrColIdxProblem)); - CUDA_CHECK(cudaFree(csrValProblem)); - CUDA_CHECK(cudaFree(csrValProblemSquare)); - CUDA_CHECK(cudaFree(d_out)); -} - -__global__ void RBF_k_function(int nnzX, double* csrValX, double* csrValXSquare, int* csrColIdxX, - int countSV, - double* csrValSV, double* csrValSVSquare, int* csrRowPtrSV, int* csrColIdxSV, - double* out) { - int nSVIdx = blockDim.x*blockIdx.x + threadIdx.x; - if( nSVIdx>=countSV ) - return; - int* ColIdxSV = csrColIdxSV + csrRowPtrSV[nSVIdx]; - double* ValSV = csrValSV + csrRowPtrSV[nSVIdx]; - double* ValSVSquare = csrValSVSquare + csrRowPtrSV[nSVIdx]; - int nnzSV = csrRowPtrSV[nSVIdx+1]-csrRowPtrSV[nSVIdx]; - - int colX = 0, colSV = 0; - double sum = 0.0; - while( colXnMaxIdxSV; const svm_node *node = x; int x_nnz = 0; + double squareX = 0.0; while(node->index != -1) { if( node->index > max_index ) max_index = node->index; x_nnz++; + squareX += node->value * node->value; node++; } max_index++; @@ -871,81 +777,21 @@ void CudaKernel::batch_k_function(const svm_node *x, const svm_model* model, dou CUDA_R_64F, buffer); assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + // update out base on kernel_type + const int threadPerBlock = 256; + int blockPerGrid = (model->l+threadPerBlock-1)/threadPerBlock; + dot2kernelValue<<>>(param.kernel_type, model->l, param.gamma, param.coef0, + param.degree, squareX, model->SVSquare, 0, d_vectorY); + // copy vectorY back to host CUDA_CHECK(cudaMemcpy(out, d_vectorY, sizeof(double)*model->l, cudaMemcpyDeviceToHost)); - // update out base on kernel_type - if( param.kernel_type==POLY ) { - for( int i=0; il; i++ ) - out[i] = powi(param.gamma*out[i]+param.coef0,param.degree); - } else if( param.kernel_type==SIGMOID ) { - for( int i=0; il; i++ ) - out[i] = tanh(param.gamma*out[i]+param.coef0); - } - cudaFree(d_vector); cudaFree(buffer); cusparseDestroyMatDescr(descrSV); cusparseDestroy(cusparse); } break; - case RBF: - { - const int threadPerBlock = 256; - int blockPerGrid = (model->l+threadPerBlock-1)/threadPerBlock; - int nnzX = 0; - int* csrColIdxX = NULL; - double* csrValX = NULL; - double* csrValXSquare = NULL; - if( model->getPreloadX(&nnzX, &csrColIdxX, &csrValX, &csrValXSquare) ) { - // vector X is preloaded(training) - RBF_k_function<<>>(nnzX, csrValX, csrValXSquare, csrColIdxX, - model->l, model->csrValSV, model->csrValSVSquare, model->csrRowPtrSV, model->csrColIdxSV, - model->d_out); - CUDA_CHECK(cudaMemcpy(out, model->d_out, sizeof(double)*model->l, cudaMemcpyDeviceToHost)); - } else { - // vector X is not preloaded(predict) - const svm_node *node = x; - int x_nnz = 0; - while(node->index != -1) { - x_nnz++; - node++; - } - - // get vector x ready - std::vector h_csrValX(x_nnz*2); - double* h_csrValXSquare = h_csrValX.data()+x_nnz; - std::vector h_csrColIdxX(x_nnz); - int curPos = 0; - node = x; - while(node->index != -1) { - h_csrValX[curPos] = node->value; - h_csrValXSquare[curPos] = h_csrValX[curPos]*h_csrValX[curPos]; - h_csrColIdxX[curPos] = node->index; - curPos++; - node++; - } - 
CUDA_CHECK(cudaMalloc((void**)&csrValX, sizeof(double)*(x_nnz*2+model->l)+sizeof(int)*x_nnz)); - csrValXSquare = csrValX + x_nnz; - double* d_out = csrValXSquare + x_nnz; - csrColIdxX = (int*)&d_out[model->l]; - CUDA_CHECK(cudaMemcpy(csrColIdxX, h_csrColIdxX.data(), sizeof(int)*x_nnz, cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(csrValX, h_csrValX.data(), sizeof(double)*x_nnz*2, cudaMemcpyHostToDevice)); - - RBF_k_function<<>>(x_nnz, csrValX, csrValXSquare, csrColIdxX, - model->l, model->csrValSV, model->csrValSVSquare, model->csrRowPtrSV, model->csrColIdxSV, - d_out); - - // copy d_out back to host - CUDA_CHECK(cudaMemcpy(out, d_out, sizeof(double)*model->l, cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaFree(csrValX)); - } - - // do out[i] = exp(-param.gamma*out[i]) - for( int i=0; il; i++ ) - out[i] = exp(-param.gamma*out[i]); - } - break; default: printf("unsupport kernel type\n"); exit(0); @@ -2558,15 +2404,8 @@ static void svm_binary_svc_probability( subparam.weight[0]=Cp; subparam.weight[1]=Cn; struct svm_model *submodel = svm_train(&subprob,&subparam); -#ifdef SVM_CUDA - submodel->cuda_preload_problem(prob); for(j=begin;jsetPreloadIdx(perm[j]); -#else - for(j=begin;jx[perm[j]],&(dec_values[perm[j]])); // ensure +1 -1 order; reason not using CV subroutine dec_values[perm[j]] *= submodel->label[0]; diff --git a/svm.h b/svm.h index 5b2ca0d9..0f546ccd 100644 --- a/svm.h +++ b/svm.h @@ -79,19 +79,7 @@ struct svm_model int* csrColIdxSV; /* column index of matrix SV */ double* csrValSV; /* values of matrix SV */ double* csrValSVSquare; /* value^2 of matrix SV */ - - void cuda_preload_problem(const struct svm_problem *problem); /* load problem in gpu memory, non-thread-safe, speed up training */ - void setPreloadIdx(int nIdx); /* called before getPreloadX() */ - int getPreloadX(int* nnz, int** csrColIdx, double** csrVal, double** csrValSquare) const; /* get vector x */ - int nPreloadIdx; - const struct svm_problem *problem; - int* csrRowPtrProblem; /* row index of matrix problem */ - int nMaxIdxProblem; /* max index of problem */ - int nnzProblem; /* non-zero value count of problem */ - int* csrColIdxProblem; /* cloumn index of matrix problem */ - double* csrValProblem; /* values of matrix problem */ - double* csrValProblemSquare; /* value^2 of matrix problem */ - double* d_out; /* temp output */ + double* SVSquare; /* dot(x,x) of matrix SV */ #endif }; From 75952f7417c0230a8c2d4a5687e4c0e823ce24bd Mon Sep 17 00:00:00 2001 From: Ryan Pan Date: Sat, 8 Feb 2020 20:29:10 +0800 Subject: [PATCH 07/19] speed up with OpenMP, 2x speed up than single thread one --- Makefile | 4 +-- svm.cpp | 96 +++++++++++++++++++++++++++----------------------------- 2 files changed, 48 insertions(+), 52 deletions(-) diff --git a/Makefile b/Makefile index 22980070..5d2d3bf0 100644 --- a/Makefile +++ b/Makefile @@ -27,6 +27,6 @@ clean: svm-train.o: svm-train.c $(CXX) $(CFLAGS) -c svm-train.c svm-cuda.o: svm.cpp - nvcc -O3 -x cu -DSVM_CUDA -c svm.cpp -o svm-cuda.o + nvcc -Xcompiler -fopenmp -O3 -x cu -DSVM_CUDA -c svm.cpp -o svm-cuda.o svm-train-cuda: svm-train.o svm-cuda.o - $(CXX) $(CFLAGS) svm-train.o svm-cuda.o -o svm-train-cuda -lm -lcusparse -lcublas -lcudart + $(CXX) $(CFLAGS) svm-train.o svm-cuda.o -o svm-train-cuda -lm -lcusparse -lcublas -lcudart -fopenmp diff --git a/svm.cpp b/svm.cpp index d85a6848..45c37c10 100644 --- a/svm.cpp +++ b/svm.cpp @@ -15,7 +15,9 @@ #include #include #include +#include #endif + int libsvm_version = LIBSVM_VERSION; typedef float Qfloat; typedef signed char schar; @@ -415,8 
+417,6 @@ class CudaKernel : public KernelBase { private: void x2csrMatrix(int sizeX) const; //convert svm_node** x -> csr format matrix private: - cusparseHandle_t m_cusparse; - cusparseMatDescr_t m_descr; int m_max_index; // max column of x matrix std::vector m_csrRowPtr; // csr row ptr of x matrix int* d_csrColIdx; // csr col index of x matrix @@ -427,16 +427,8 @@ class CudaKernel : public KernelBase { }; CudaKernel::CudaKernel(int l, svm_node * const * x, const svm_parameter& param) : KernelBase(l, x, param), - m_cusparse(NULL), m_descr(NULL), m_max_index(0), m_csrRowPtr(l+1), + m_max_index(0), m_csrRowPtr(l+1), d_csrColIdx(NULL), d_csrVal(NULL), d_squareX(NULL), m_sizeX(l), m_nnz(0) { - // Init handlers - cusparseStatus_t cusparseResult = cusparseCreate(&m_cusparse); - assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); - cusparseResult = cusparseCreateMatDescr(&m_descr); - assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); - cusparseSetMatIndexBase(m_descr,CUSPARSE_INDEX_BASE_ZERO); - cusparseSetMatType(m_descr, CUSPARSE_MATRIX_TYPE_GENERAL); - // get Num-non-zero and max_index m_max_index = 0; const svm_node *node; @@ -492,8 +484,6 @@ CudaKernel::~CudaKernel() { CUDA_CHECK(cudaFree(d_csrColIdx)); CUDA_CHECK(cudaFree(d_csrVal)); CUDA_CHECK(cudaFree(d_squareX)); - if(m_cusparse) cusparseDestroy(m_cusparse); - if(m_descr) cusparseDestroyMatDescr(m_descr); } void CudaKernel::done_shrinking() const { @@ -525,24 +515,25 @@ __global__ void dot2kernelValue(int kernel_type, int len, double gamma, double c void CudaKernel::batch_kernel_function(int i, Qfloat *Q, int start, int end) const { int len = end-start; + cusparseHandle_t cusparse = NULL; + cusparseMatDescr_t descr = NULL; + cusparseStatus_t cusparseResult = cusparseCreate(&cusparse); + assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + cusparseResult = cusparseCreateMatDescr(&descr); + assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO); + cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL); + cusparseSetPointerMode(cusparse, CUSPARSE_POINTER_MODE_HOST); // get csrValA, csrColIndA, nnz ready for cusparseCsrmvEx double* csrValA = d_csrVal + m_csrRowPtr[start]; int* csrColIndA = d_csrColIdx + m_csrRowPtr[start]; int nnz = m_csrRowPtr[len+start] - m_csrRowPtr[start]; - const svm_node *node = x[i]; - int max_index = 0; - while(node->index != -1) { - if( max_index < node->index ) - max_index = node->index; - node++; - } - max_index++; double* d_vectors = NULL; - CUDA_CHECK(cudaMalloc((void**)&d_vectors, sizeof(int)*(len+1)+sizeof(double)*max_index+sizeof(double)*len)); + CUDA_CHECK(cudaMalloc((void**)&d_vectors, sizeof(int)*(len+1)+sizeof(double)*m_max_index+sizeof(double)*len)); double* d_vectorX = d_vectors; - double* d_vectorY = &d_vectorX[max_index]; + double* d_vectorY = &d_vectorX[m_max_index]; int* csrRowPtrA = (int*)&d_vectorY[len]; // get csrRowPtrA ready for cusparseCsrmvEx @@ -553,26 +544,27 @@ void CudaKernel::batch_kernel_function(int i, Qfloat *Q, int start, int end) con CUDA_CHECK(cudaMemcpy(csrRowPtrA, csrRow.data(), sizeof(int)*(len+1), cudaMemcpyHostToDevice)); // get vector x ready for cusparseCsrmvEx - std::vector vectorX(max_index, 0.0); - node = x[i]; + std::vector vectorX(m_max_index, 0.0); + const svm_node* node = x[i]; while(node->index != -1) { vectorX[node->index] = node->value; node++; } - CUDA_CHECK(cudaMemcpy(d_vectorX, vectorX.data(), sizeof(double)*max_index, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_vectorX, vectorX.data(), 
sizeof(double)*m_max_index, cudaMemcpyHostToDevice)); // get temp buffer size and get it ready + // https://docs.nvidia.com/cuda/cusparse/index.html#asynchronous-execution size_t buffer_size = 0; static double h_one = 1.0; static double h_zero = 0.0; - cusparseStatus_t cusparseResult = cusparseCsrmvEx_bufferSize(m_cusparse, + cusparseResult = cusparseCsrmvEx_bufferSize(cusparse, CUSPARSE_ALG0, CUSPARSE_OPERATION_NON_TRANSPOSE, len, m_max_index, nnz, &h_one, CUDA_R_64F, - m_descr, + descr, csrValA, CUDA_R_64F, csrRowPtrA, csrColIndA, @@ -586,14 +578,14 @@ void CudaKernel::batch_kernel_function(int i, Qfloat *Q, int start, int end) con CUDA_CHECK(cudaMalloc ((void**)&buffer, buffer_size)); // call cusparseCsrmvEx - cusparseResult = cusparseCsrmvEx(m_cusparse, + cusparseResult = cusparseCsrmvEx(cusparse, CUSPARSE_ALG0, CUSPARSE_OPERATION_NON_TRANSPOSE, len, m_max_index, nnz, &h_one, CUDA_R_64F, - m_descr, + descr, csrValA, CUDA_R_64F, csrRowPtrA, csrColIndA, @@ -617,8 +609,10 @@ void CudaKernel::batch_kernel_function(int i, Qfloat *Q, int start, int end) con Q[start+idx] = (Qfloat)(vectorY[idx]); } - cudaFree(d_vectors); - cudaFree(buffer); + CUDA_CHECK(cudaFree(d_vectors)); + CUDA_CHECK(cudaFree(buffer)); + cusparseDestroyMatDescr(descr); + cusparseDestroy(cusparse); // debug, compare with base class result from cpu /*std::vector Qbase(start+len, 0.0); @@ -643,6 +637,7 @@ void svm_model::cuda_init() { node++; } } + nMaxIdxSV++; // get csrVal csrColIdx ready std::vector csrRowPtr(l+1); @@ -716,39 +711,35 @@ void CudaKernel::batch_k_function(const svm_node *x, const svm_model* model, dou cusparseSetMatType(descrSV, CUSPARSE_MATRIX_TYPE_GENERAL); cusparseSetPointerMode(cusparse, CUSPARSE_POINTER_MODE_HOST); - int max_index = model->nMaxIdxSV; const svm_node *node = x; int x_nnz = 0; double squareX = 0.0; while(node->index != -1) { - if( node->index > max_index ) - max_index = node->index; x_nnz++; squareX += node->value * node->value; node++; } - max_index++; // get dense vector x ready double* d_vector = NULL; - CUDA_CHECK(cudaMalloc((void**)&d_vector, sizeof(double)*(max_index+model->l))); + CUDA_CHECK(cudaMalloc((void**)&d_vector, sizeof(double)*(model->nMaxIdxSV+model->l))); double* d_vectorX = d_vector; - vectorX.resize(max_index, 0.0); + vectorX.resize(model->nMaxIdxSV, 0.0); node = x; while(node->index != -1) { vectorX[node->index] = node->value; node++; } - CUDA_CHECK(cudaMemcpy(d_vectorX, vectorX.data(), sizeof(double)*max_index, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_vectorX, vectorX.data(), sizeof(double)*model->nMaxIdxSV, cudaMemcpyHostToDevice)); // get dense vector y ready - double* d_vectorY = d_vector + max_index; - + double* d_vectorY = d_vector + model->nMaxIdxSV; + // get buffer size cusparseResult = cusparseCsrmvEx_bufferSize(cusparse, CUSPARSE_ALG0, CUSPARSE_OPERATION_NON_TRANSPOSE, - model->l, max_index, + model->l, model->nMaxIdxSV, model->nnzSV, &one, CUDA_R_64F, descrSV, model->csrValSV, CUDA_R_64F, @@ -765,7 +756,7 @@ void CudaKernel::batch_k_function(const svm_node *x, const svm_model* model, dou cusparseResult = cusparseCsrmvEx(cusparse, CUSPARSE_ALG0, CUSPARSE_OPERATION_NON_TRANSPOSE, - model->l, max_index, + model->l, model->nMaxIdxSV, model->nnzSV, &one, CUDA_R_64F, descrSV, @@ -786,8 +777,8 @@ void CudaKernel::batch_k_function(const svm_node *x, const svm_model* model, dou // copy vectorY back to host CUDA_CHECK(cudaMemcpy(out, d_vectorY, sizeof(double)*model->l, cudaMemcpyDeviceToHost)); - cudaFree(d_vector); - cudaFree(buffer); + 
CUDA_CHECK(cudaFree(d_vector)); + CUDA_CHECK(cudaFree(buffer)); cusparseDestroyMatDescr(descrSV); cusparseDestroy(cusparse); } @@ -2634,10 +2625,14 @@ svm_model *svm_train(const svm_problem *prob, const svm_parameter *param) probB=Malloc(double,nr_class*(nr_class-1)/2); } - int p = 0; - for(i=0;iprobability) svm_binary_svc_probability(&sub_prob,param,weighted_C[i],weighted_C[j],probA[p],probB[p]); - f[p] = svm_train_one(&sub_prob,param,weighted_C[i],weighted_C[j]); + for(k=0;k 0) nonzero[si+k] = true; @@ -2668,8 +2663,9 @@ svm_model *svm_train(const svm_problem *prob, const svm_parameter *param) nonzero[sj+k] = true; free(sub_prob.x); free(sub_prob.y); - ++p; } + t += (nr_class-i-1); + } // build output @@ -2720,7 +2716,7 @@ svm_model *svm_train(const svm_problem *prob, const svm_parameter *param) model->l = total_sv; model->SV = Malloc(svm_node *,total_sv); model->sv_indices = Malloc(int,total_sv); - p = 0; + int p = 0; for(i=0;i Date: Sat, 8 Feb 2020 20:42:02 +0800 Subject: [PATCH 08/19] Update README --- README | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README b/README index a22756a4..9cbd0c19 100644 --- a/README +++ b/README @@ -4,6 +4,14 @@ make clean; make svm-train-cuda How to use: run svm-train-cuda instead of svm-train, with same command line arguments of svm-train +multithreading +svm-train-cuda is multithreading by default(with OpenMP), If you are runing a svm classification job with more than 2 tags, it can much faster than single thread one. + +If you run out of memory or GPU memory, you can disable it with environment: +export OMP_NUM_THREADS=1 + +You can try setting OMP_NUM_THREADS to another number, balancing the speed and GPU/CPU usage. + /////////////////////////////////////////////////////////////////////////////////////// From 62243fbaff637437a7b35e7b50c3b0f567c23348 Mon Sep 17 00:00:00 2001 From: Ryan Pan Date: Sat, 8 Feb 2020 20:46:35 +0800 Subject: [PATCH 09/19] Update README --- README | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README b/README index 9cbd0c19..42aa7e94 100644 --- a/README +++ b/README @@ -6,12 +6,13 @@ run svm-train-cuda instead of svm-train, with same command line arguments of svm multithreading svm-train-cuda is multithreading by default(with OpenMP), If you are runing a svm classification job with more than 2 tags, it can much faster than single thread one. - If you run out of memory or GPU memory, you can disable it with environment: export OMP_NUM_THREADS=1 - You can try setting OMP_NUM_THREADS to another number, balancing the speed and GPU/CPU usage. 
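The speedup for jobs with more than two classes comes from the fact that the one-vs-one sub-problems are independent, so OpenMP can train several class pairs at once. A minimal sketch of that idea, assuming a flattened pair index; decode_pair and train_one_pair are hypothetical helpers for illustration, not the patch's exact loop:

#include <omp.h>

/* Map a flat pair index p back to the class pair (i, j), i < j. */
static void decode_pair(int p, int nr_class, int *out_i, int *out_j)
{
	int i = 0, offset = 0;
	while (p >= offset + (nr_class - i - 1)) {	/* skip the pairs owned by class i */
		offset += nr_class - i - 1;
		i++;
	}
	*out_i = i;
	*out_j = i + 1 + (p - offset);
}

void train_all_pairs(int nr_class)
{
	int total = nr_class * (nr_class - 1) / 2;
	#pragma omp parallel for schedule(dynamic)
	for (int p = 0; p < total; p++) {
		int i, j;
		decode_pair(p, nr_class, &i, &j);
		/* train_one_pair(i, j); each (i, j) sub-problem touches only its own data */
	}
}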
+tips +try add -h 0 as arguments of svm-train-cuda, it may faster than default one(-h 1) + /////////////////////////////////////////////////////////////////////////////////////// From d3df56df23a09bc5b5ff4fe335b0987d5d66afb6 Mon Sep 17 00:00:00 2001 From: Ryan Pan Date: Sun, 9 Feb 2020 01:21:59 +0800 Subject: [PATCH 10/19] use OpenMP collapse() for better performance --- svm.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/svm.cpp b/svm.cpp index 45c37c10..60d7ed87 100644 --- a/svm.cpp +++ b/svm.cpp @@ -2625,14 +2625,13 @@ svm_model *svm_train(const svm_problem *prob, const svm_parameter *param) probB=Malloc(double,nr_class*(nr_class-1)/2); } - int t = 0; - for(i=0;i Date: Sun, 9 Feb 2020 10:04:10 +0800 Subject: [PATCH 11/19] default omp_set_num_threads set to 2 --- svm.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/svm.cpp b/svm.cpp index 60d7ed87..44700dd7 100644 --- a/svm.cpp +++ b/svm.cpp @@ -2626,6 +2626,8 @@ svm_model *svm_train(const svm_problem *prob, const svm_parameter *param) } #ifdef SVM_CUDA + if( !getenv("OMP_NUM_THREADS") ) + omp_set_num_threads(2); #pragma omp parallel for if(nr_class!=2) schedule(dynamic) collapse(2) #endif for(i=0;i Date: Mon, 10 Feb 2020 11:51:28 +0800 Subject: [PATCH 12/19] Change loop style to make new version OpenMP happy --- svm.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/svm.cpp b/svm.cpp index 44700dd7..daf514b9 100644 --- a/svm.cpp +++ b/svm.cpp @@ -2627,13 +2627,14 @@ svm_model *svm_train(const svm_problem *prob, const svm_parameter *param) #ifdef SVM_CUDA if( !getenv("OMP_NUM_THREADS") ) - omp_set_num_threads(2); - #pragma omp parallel for if(nr_class!=2) schedule(dynamic) collapse(2) + omp_set_num_threads(1); + #pragma omp parallel for if(nr_class!=2) schedule(dynamic) collapse(1) #endif - for(i=0;i=nr_class ) + continue; svm_problem sub_prob; int si = start[i], sj = start[j]; int ci = count[i], cj = count[j]; From 6b86f45d8daa6cff1bf04249abeec43d722f5d8c Mon Sep 17 00:00:00 2001 From: panqingfeng Date: Tue, 11 Feb 2020 10:14:45 +0800 Subject: [PATCH 13/19] less cudaMalloc/cudaFree, a new -u command line argument --- svm-train.c | 5 ++ svm.cpp | 163 ++++++++++++++++++++++++++++++++-------------------- svm.h | 1 + 3 files changed, 108 insertions(+), 61 deletions(-) diff --git a/svm-train.c b/svm-train.c index b6ce987d..a5a4d5ad 100644 --- a/svm-train.c +++ b/svm-train.c @@ -38,6 +38,7 @@ void exit_with_help() "-wi weight : set the parameter C of class i to weight*C, for C-SVC (default 1)\n" "-v n: n-fold cross validation mode\n" "-q : quiet mode (no outputs)\n" + "-u n : cuda train threads count (default 1)\n" ); exit(1); } @@ -179,6 +180,7 @@ void parse_command_line(int argc, char **argv, char *input_file_name, char *mode param.nr_weight = 0; param.weight_label = NULL; param.weight = NULL; + param.cuda_threads = 1; cross_validation = 0; // parse options @@ -245,6 +247,9 @@ void parse_command_line(int argc, char **argv, char *input_file_name, char *mode param.weight_label[param.nr_weight-1] = atoi(&argv[i-1][2]); param.weight[param.nr_weight-1] = atof(argv[i]); break; + case 'u': + param.cuda_threads = atoi(argv[i]); + break; default: fprintf(stderr,"Unknown option: -%c\n", argv[i-1][1]); exit_with_help(); diff --git a/svm.cpp b/svm.cpp index daf514b9..18eb3b9f 100644 --- a/svm.cpp +++ b/svm.cpp @@ -424,11 +424,27 @@ class CudaKernel : public KernelBase { double* d_squareX; // a gpu copy of x_square int m_sizeX; // size of x int m_nnz; // number of non-zero value of x 
matrix + + cusparseHandle_t m_cusparse; + cusparseMatDescr_t m_descr; + cudaStream_t m_stream; + double* d_vectorX; + double* d_vectorY; + int* d_csrRowPtrA; + double* h_vectorX; + double* h_vectorY; + int* h_csrRowPtrA; + char* d_defaultBuf; + static int defaultBufSize; }; +int CudaKernel::defaultBufSize = 10240; CudaKernel::CudaKernel(int l, svm_node * const * x, const svm_parameter& param) : KernelBase(l, x, param), m_max_index(0), m_csrRowPtr(l+1), - d_csrColIdx(NULL), d_csrVal(NULL), d_squareX(NULL), m_sizeX(l), m_nnz(0) { + d_csrColIdx(NULL), d_csrVal(NULL), d_squareX(NULL), m_sizeX(l), m_nnz(0), + m_cusparse(NULL), m_descr(NULL), m_stream(NULL), d_vectorX(NULL), d_vectorY(NULL), + d_csrRowPtrA(NULL), h_vectorX(NULL), h_vectorY(NULL), h_csrRowPtrA(NULL), + d_defaultBuf(NULL) { // get Num-non-zero and max_index m_max_index = 0; const svm_node *node; @@ -443,6 +459,26 @@ CudaKernel::CudaKernel(int l, svm_node * const * x, const svm_parameter& param) } m_max_index++; + CUDA_CHECK(cudaStreamCreate(&m_stream)); + cusparseStatus_t cusparseResult = cusparseCreate(&m_cusparse); + assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + cusparseResult = cusparseCreateMatDescr(&m_descr); + assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + cusparseSetMatIndexBase(m_descr,CUSPARSE_INDEX_BASE_ZERO); + cusparseSetMatType(m_descr, CUSPARSE_MATRIX_TYPE_GENERAL); + cusparseSetPointerMode(m_cusparse, CUSPARSE_POINTER_MODE_HOST); + + CUDA_CHECK(cudaMalloc((void**)&d_vectorX, sizeof(double)*m_max_index)); + CUDA_CHECK(cudaMalloc((void**)&d_vectorY, sizeof(double)*l)); + CUDA_CHECK(cudaMalloc((void**)&d_csrRowPtrA, sizeof(int)*(l+1))); + CUDA_CHECK(cudaMallocHost((void**)&h_vectorX, sizeof(double)*m_max_index)); + CUDA_CHECK(cudaMallocHost((void**)&h_vectorY, sizeof(double)*l)); + CUDA_CHECK(cudaMallocHost((void**)&h_csrRowPtrA, sizeof(int)*(l+1))); + CUDA_CHECK(cudaMalloc((void**)&d_defaultBuf, defaultBufSize)); + CUDA_CHECK(cudaMalloc((void**)&d_csrColIdx, sizeof(int)*m_nnz)); + CUDA_CHECK(cudaMalloc((void**)&d_csrVal, sizeof(double)*m_nnz)); + CUDA_CHECK(cudaMalloc((void**)&d_squareX, sizeof(double)*l)); + x2csrMatrix(l); } @@ -466,15 +502,6 @@ void CudaKernel::x2csrMatrix(int sizeX) const { csrRowPtr[sizeX] = curPos; // copy csrVal, csrColIdx to device - if( !d_csrColIdx ) { - CUDA_CHECK(cudaMalloc((void**)&d_csrColIdx, sizeof(int)*m_nnz)); - } - if( !d_csrVal ) { - CUDA_CHECK(cudaMalloc((void**)&d_csrVal, sizeof(double)*m_nnz)); - } - if( !d_squareX ) { - CUDA_CHECK(cudaMalloc((void**)&d_squareX, sizeof(double)*sizeX)); - } CUDA_CHECK(cudaMemcpy(d_csrColIdx, csrColIdx.data(), sizeof(int)*m_nnz, cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(d_csrVal, csrVal.data(), sizeof(double)*m_nnz, cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(d_squareX, x_square, sizeof(double)*sizeX, cudaMemcpyHostToDevice)); @@ -484,6 +511,16 @@ CudaKernel::~CudaKernel() { CUDA_CHECK(cudaFree(d_csrColIdx)); CUDA_CHECK(cudaFree(d_csrVal)); CUDA_CHECK(cudaFree(d_squareX)); + CUDA_CHECK(cudaFree(d_vectorX)); + CUDA_CHECK(cudaFree(d_vectorY)); + CUDA_CHECK(cudaFree(d_csrRowPtrA)); + CUDA_CHECK(cudaFree(d_defaultBuf)); + CUDA_CHECK(cudaFreeHost(h_vectorX)); + CUDA_CHECK(cudaFreeHost(h_vectorY)); + CUDA_CHECK(cudaFreeHost(h_csrRowPtrA)); + cusparseDestroyMatDescr(m_descr); + cusparseDestroy(m_cusparse); + CUDA_CHECK(cudaStreamDestroy(m_stream)); } void CudaKernel::done_shrinking() const { @@ -515,58 +552,44 @@ __global__ void dot2kernelValue(int kernel_type, int len, double gamma, double c void 
CudaKernel::batch_kernel_function(int i, Qfloat *Q, int start, int end) const { int len = end-start; - cusparseHandle_t cusparse = NULL; - cusparseMatDescr_t descr = NULL; - cusparseStatus_t cusparseResult = cusparseCreate(&cusparse); - assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); - cusparseResult = cusparseCreateMatDescr(&descr); - assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); - cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO); - cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL); - cusparseSetPointerMode(cusparse, CUSPARSE_POINTER_MODE_HOST); + assert(len<=m_sizeX); // get csrValA, csrColIndA, nnz ready for cusparseCsrmvEx double* csrValA = d_csrVal + m_csrRowPtr[start]; int* csrColIndA = d_csrColIdx + m_csrRowPtr[start]; int nnz = m_csrRowPtr[len+start] - m_csrRowPtr[start]; - double* d_vectors = NULL; - CUDA_CHECK(cudaMalloc((void**)&d_vectors, sizeof(int)*(len+1)+sizeof(double)*m_max_index+sizeof(double)*len)); - double* d_vectorX = d_vectors; - double* d_vectorY = &d_vectorX[m_max_index]; - int* csrRowPtrA = (int*)&d_vectorY[len]; - // get csrRowPtrA ready for cusparseCsrmvEx - std::vector csrRow(len+1); for( int idx=0; idx<=len; idx++ ) { - csrRow[idx] = m_csrRowPtr[idx+start] - m_csrRowPtr[start]; + h_csrRowPtrA[idx] = m_csrRowPtr[idx+start] - m_csrRowPtr[start]; } - CUDA_CHECK(cudaMemcpy(csrRowPtrA, csrRow.data(), sizeof(int)*(len+1), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpyAsync(d_csrRowPtrA, h_csrRowPtrA, sizeof(int)*(len+1), cudaMemcpyHostToDevice, m_stream)); // get vector x ready for cusparseCsrmvEx - std::vector vectorX(m_max_index, 0.0); + memset(h_vectorX, 0, sizeof(double)*m_max_index); const svm_node* node = x[i]; while(node->index != -1) { - vectorX[node->index] = node->value; + h_vectorX[node->index] = node->value; node++; } - CUDA_CHECK(cudaMemcpy(d_vectorX, vectorX.data(), sizeof(double)*m_max_index, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpyAsync(d_vectorX, h_vectorX, sizeof(double)*m_max_index, cudaMemcpyHostToDevice, m_stream)); // get temp buffer size and get it ready // https://docs.nvidia.com/cuda/cusparse/index.html#asynchronous-execution size_t buffer_size = 0; - static double h_one = 1.0; - static double h_zero = 0.0; - cusparseResult = cusparseCsrmvEx_bufferSize(cusparse, + static const double h_zero = 0.0; + static const double h_one = 1.0; + cusparseSetStream(m_cusparse, m_stream); + cusparseStatus_t cusparseResult = cusparseCsrmvEx_bufferSize(m_cusparse, CUSPARSE_ALG0, CUSPARSE_OPERATION_NON_TRANSPOSE, len, m_max_index, nnz, &h_one, CUDA_R_64F, - descr, + m_descr, csrValA, CUDA_R_64F, - csrRowPtrA, + d_csrRowPtrA, csrColIndA, d_vectorX, CUDA_R_64F, &h_zero, CUDA_R_64F, @@ -574,20 +597,26 @@ void CudaKernel::batch_kernel_function(int i, Qfloat *Q, int start, int end) con CUDA_R_64F, &buffer_size); assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); + void* alloc_buffer = NULL; void* buffer = NULL; - CUDA_CHECK(cudaMalloc ((void**)&buffer, buffer_size)); + if( buffer_size > defaultBufSize ) { + CUDA_CHECK(cudaMalloc((void**)&alloc_buffer, buffer_size)); + buffer = alloc_buffer; + } else + buffer = d_defaultBuf; // call cusparseCsrmvEx - cusparseResult = cusparseCsrmvEx(cusparse, + cusparseSetStream(m_cusparse, m_stream); + cusparseResult = cusparseCsrmvEx(m_cusparse, CUSPARSE_ALG0, CUSPARSE_OPERATION_NON_TRANSPOSE, len, m_max_index, nnz, &h_one, CUDA_R_64F, - descr, + m_descr, csrValA, CUDA_R_64F, - csrRowPtrA, + d_csrRowPtrA, csrColIndA, d_vectorX, CUDA_R_64F, &h_zero, CUDA_R_64F, @@ -599,20 +628,18 @@ void 
CudaKernel::batch_kernel_function(int i, Qfloat *Q, int start, int end) con // update out base on kernel_type const int threadPerBlock = 256; int blockPerGrid = (len+threadPerBlock-1)/threadPerBlock; - dot2kernelValue<<>>(kernel_type, len, gamma, coef0, + dot2kernelValue<<>>(kernel_type, len, gamma, coef0, degree, x_square[i], d_squareX, start, d_vectorY); // copy d_vectorY back to host - std::vector vectorY(len); - CUDA_CHECK(cudaMemcpy(vectorY.data(), d_vectorY, sizeof(double)*len, cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(h_vectorY, d_vectorY, sizeof(double)*len, cudaMemcpyDeviceToHost, m_stream)); + CUDA_CHECK(cudaStreamSynchronize(m_stream)); for( int idx=0; idx Qbase(start+len, 0.0); @@ -698,11 +725,12 @@ void CudaKernel::batch_k_function(const svm_node *x, const svm_model* model, dou { cusparseHandle_t cusparse = NULL; cusparseMatDescr_t descrSV = NULL; + cudaStream_t stream = NULL; static const double one = 1.0; static const double zero = 0.0; - std::vector vectorX; size_t buffer_size; + CUDA_CHECK(cudaStreamCreate(&stream)); cusparseStatus_t cusparseResult = cusparseCreate(&cusparse); assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); cusparseResult = cusparseCreateMatDescr(&descrSV); @@ -724,18 +752,22 @@ void CudaKernel::batch_k_function(const svm_node *x, const svm_model* model, dou double* d_vector = NULL; CUDA_CHECK(cudaMalloc((void**)&d_vector, sizeof(double)*(model->nMaxIdxSV+model->l))); double* d_vectorX = d_vector; - vectorX.resize(model->nMaxIdxSV, 0.0); + double* d_vectorY = d_vector + model->nMaxIdxSV; + double* h_vectors = NULL; + CUDA_CHECK(cudaMallocHost((void**)&h_vectors, sizeof(double)*(model->nMaxIdxSV+model->l))); + double* vectorX = h_vectors; + double* vectorY = vectorX + model->nMaxIdxSV; + + memset(vectorX, 0, model->nMaxIdxSV); node = x; while(node->index != -1) { vectorX[node->index] = node->value; node++; } - CUDA_CHECK(cudaMemcpy(d_vectorX, vectorX.data(), sizeof(double)*model->nMaxIdxSV, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpyAsync(d_vectorX, vectorX, sizeof(double)*model->nMaxIdxSV, cudaMemcpyHostToDevice, stream)); - // get dense vector y ready - double* d_vectorY = d_vector + model->nMaxIdxSV; - // get buffer size + cusparseSetStream(cusparse, stream); cusparseResult = cusparseCsrmvEx_bufferSize(cusparse, CUSPARSE_ALG0, CUSPARSE_OPERATION_NON_TRANSPOSE, @@ -750,9 +782,10 @@ void CudaKernel::batch_k_function(const svm_node *x, const svm_model* model, dou CUDA_R_64F, &buffer_size); assert(CUSPARSE_STATUS_SUCCESS == cusparseResult); char* buffer = NULL; - CUDA_CHECK(cudaMalloc ((void**)&buffer, buffer_size)); + CUDA_CHECK(cudaMalloc((void**)&buffer, buffer_size)); // y = dot(SV, x) + cusparseSetStream(cusparse, stream); cusparseResult = cusparseCsrmvEx(cusparse, CUSPARSE_ALG0, CUSPARSE_OPERATION_NON_TRANSPOSE, @@ -771,16 +804,20 @@ void CudaKernel::batch_k_function(const svm_node *x, const svm_model* model, dou // update out base on kernel_type const int threadPerBlock = 256; int blockPerGrid = (model->l+threadPerBlock-1)/threadPerBlock; - dot2kernelValue<<>>(param.kernel_type, model->l, param.gamma, param.coef0, + dot2kernelValue<<>>(param.kernel_type, model->l, param.gamma, param.coef0, param.degree, squareX, model->SVSquare, 0, d_vectorY); // copy vectorY back to host - CUDA_CHECK(cudaMemcpy(out, d_vectorY, sizeof(double)*model->l, cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpyAsync(vectorY, d_vectorY, sizeof(double)*model->l, cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); + memcpy(out, 
vectorY, sizeof(double)*model->l); CUDA_CHECK(cudaFree(d_vector)); + CUDA_CHECK(cudaFreeHost(h_vectors)); CUDA_CHECK(cudaFree(buffer)); cusparseDestroyMatDescr(descrSV); cusparseDestroy(cusparse); + CUDA_CHECK(cudaStreamDestroy(stream)); } break; default: @@ -2342,6 +2379,9 @@ static void svm_binary_svc_probability( int j = i+rand()%(prob->l-i); swap(perm[i],perm[j]); } +#ifdef SVM_CUDA + #pragma omp parallel for schedule(dynamic) num_threads(param->cuda_threads) +#endif for(i=0;il/nr_fold; @@ -2626,12 +2666,12 @@ svm_model *svm_train(const svm_problem *prob, const svm_parameter *param) } #ifdef SVM_CUDA - if( !getenv("OMP_NUM_THREADS") ) - omp_set_num_threads(1); - #pragma omp parallel for if(nr_class!=2) schedule(dynamic) collapse(1) + // Another "#pragma omp parallel for" in svm_binary_svc_probability(), if param->probability + #pragma omp parallel for schedule(dynamic) if(nr_class!=2 && !param->probability) num_threads(param->cuda_threads) #endif for(int p=0; p=nr_class ) continue; @@ -3229,6 +3269,7 @@ bool read_model_header(FILE *fp, svm_model* model) param.nr_weight = 0; param.weight_label = NULL; param.weight = NULL; + param.cuda_threads = 1; char cmd[81]; while(1) diff --git a/svm.h b/svm.h index 0f546ccd..99726cc8 100644 --- a/svm.h +++ b/svm.h @@ -44,6 +44,7 @@ struct svm_parameter double p; /* for EPSILON_SVR */ int shrinking; /* use the shrinking heuristics */ int probability; /* do probability estimates */ + int cuda_threads; /* cuda thread count of svm_train() */ }; // From a8b1b89aed36b63c1de3310adc37ac4eb5ca7218 Mon Sep 17 00:00:00 2001 From: Ryan Pan Date: Tue, 11 Feb 2020 20:39:19 +0800 Subject: [PATCH 14/19] Update README --- README | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/README b/README index 42aa7e94..bb48d92e 100644 --- a/README +++ b/README @@ -5,10 +5,14 @@ How to use: run svm-train-cuda instead of svm-train, with same command line arguments of svm-train multithreading -svm-train-cuda is multithreading by default(with OpenMP), If you are runing a svm classification job with more than 2 tags, it can much faster than single thread one. -If you run out of memory or GPU memory, you can disable it with environment: -export OMP_NUM_THREADS=1 -You can try setting OMP_NUM_THREADS to another number, balancing the speed and GPU/CPU usage. +svm-train-cuda is multithreading by default(with OpenMP), If you are runing a svm classification job with more than 2 tags, +it can be faster than single thread one. +You can define cuda thread count with command line argument -u +-u n : cuda train threads count (default 1) +Please note: cache is base on thread, total host memory usage is cuda_thread*cache_size_per_thread +you will have to cut down some cache memory if you are using multithreading cuda, cache is faster than GPU, +if the training size is small enough. +So you may try and get the best -u cuda_thread -m cache_size combination, base on the size of your problem. 
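For example (the numbers are only illustrative), four training threads with a 256 MB kernel cache each use roughly 1 GB of host cache in total:

./svm-train-cuda -u 4 -m 256 training_set_file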
tips try add -h 0 as arguments of svm-train-cuda, it may faster than default one(-h 1) From 497d570de06aaee7b086649f61602844d35bdc5c Mon Sep 17 00:00:00 2001 From: panqingfeng Date: Thu, 13 Feb 2020 00:22:28 +0800 Subject: [PATCH 15/19] speed up host code with OpenMP, to catch up with the speed of GPU --- svm-train.c | 5 --- svm.cpp | 111 ++++++++++++++++++++++++++++++++++++++++------------ svm.h | 1 - 3 files changed, 86 insertions(+), 31 deletions(-) diff --git a/svm-train.c b/svm-train.c index a5a4d5ad..b6ce987d 100644 --- a/svm-train.c +++ b/svm-train.c @@ -38,7 +38,6 @@ void exit_with_help() "-wi weight : set the parameter C of class i to weight*C, for C-SVC (default 1)\n" "-v n: n-fold cross validation mode\n" "-q : quiet mode (no outputs)\n" - "-u n : cuda train threads count (default 1)\n" ); exit(1); } @@ -180,7 +179,6 @@ void parse_command_line(int argc, char **argv, char *input_file_name, char *mode param.nr_weight = 0; param.weight_label = NULL; param.weight = NULL; - param.cuda_threads = 1; cross_validation = 0; // parse options @@ -247,9 +245,6 @@ void parse_command_line(int argc, char **argv, char *input_file_name, char *mode param.weight_label[param.nr_weight-1] = atoi(&argv[i-1][2]); param.weight[param.nr_weight-1] = atof(argv[i]); break; - case 'u': - param.cuda_threads = atoi(argv[i]); - break; default: fprintf(stderr,"Unknown option: -%c\n", argv[i-1][1]); exit_with_help(); diff --git a/svm.cpp b/svm.cpp index 18eb3b9f..d2f9e382 100644 --- a/svm.cpp +++ b/svm.cpp @@ -1264,38 +1264,82 @@ int Solver::select_working_set(int &out_i, int &out_j) // j: minimizes the decrease of obj value // (if quadratic coefficeint <= 0, replace it with tau) // -y_j*grad(f)_j < -y_i*grad(f)_i, j in I_low(\alpha) - double Gmax = -INF; double Gmax2 = -INF; int Gmax_idx = -1; int Gmin_idx = -1; double obj_diff_min = INF; +#ifdef SVM_CUDA + static double tGmax = -INF; + static double tGmax2 = -INF; + static int tGmax_idx = -1; + static int tGmin_idx = -1; + static double tobj_diff_min = INF; +#pragma omp threadprivate(tGmax, tGmax2, tGmax_idx, tGmin_idx, tobj_diff_min) + + tGmax = -INF; + tGmax_idx = -1; +#pragma omp parallel for copyin(tGmax, tGmax_idx) +#else + double& tGmax = Gmax; + double& tGmax2 = Gmax2; + int& tGmax_idx = Gmax_idx; + int& tGmin_idx = Gmin_idx; + double& tobj_diff_min = obj_diff_min; +#endif for(int t=0;t= Gmax) + if(-G[t] >= tGmax) { - Gmax = -G[t]; - Gmax_idx = t; + tGmax = -G[t]; + tGmax_idx = t; } } else { if(!is_lower_bound(t)) - if(G[t] >= Gmax) + if(G[t] >= tGmax) { - Gmax = G[t]; - Gmax_idx = t; + tGmax = G[t]; + tGmax_idx = t; } } +#ifdef SVM_CUDA +#pragma omp parallel + { + #pragma omp critical + { + if( tGmax>=Gmax ) + { + if( tGmax==Gmax ) + { + if( tGmax_idx>Gmax_idx ) + Gmax_idx = tGmax_idx; + } + else + { + Gmax = tGmax; + Gmax_idx = tGmax_idx; + } + } + } + } +#endif int i = Gmax_idx; const Qfloat *Q_i = NULL; if(i != -1) // NULL Q_i not accessed: Gmax=-INF if i=-1 Q_i = Q->get_Q(i,active_size); + tGmax2 = -INF; + tGmin_idx = -1; + tobj_diff_min = INF; +#ifdef SVM_CUDA +#pragma omp parallel for copyin(tGmax2, tGmin_idx, tobj_diff_min) +#endif for(int j=0;j= Gmax2) - Gmax2 = G[j]; + if (G[j] >= tGmax2) + tGmax2 = G[j]; if (grad_diff > 0) { double obj_diff; @@ -1314,10 +1358,10 @@ int Solver::select_working_set(int &out_i, int &out_j) else obj_diff = -(grad_diff*grad_diff)/TAU; - if (obj_diff <= obj_diff_min) + if (obj_diff <= tobj_diff_min) { - Gmin_idx=j; - obj_diff_min = obj_diff; + tGmin_idx=j; + tobj_diff_min = obj_diff; } } } @@ -1327,8 +1371,8 @@ int 
Solver::select_working_set(int &out_i, int &out_j) if (!is_upper_bound(j)) { double grad_diff= Gmax-G[j]; - if (-G[j] >= Gmax2) - Gmax2 = -G[j]; + if (-G[j] >= tGmax2) + tGmax2 = -G[j]; if (grad_diff > 0) { double obj_diff; @@ -1338,15 +1382,37 @@ int Solver::select_working_set(int &out_i, int &out_j) else obj_diff = -(grad_diff*grad_diff)/TAU; - if (obj_diff <= obj_diff_min) + if (obj_diff <= tobj_diff_min) { - Gmin_idx=j; - obj_diff_min = obj_diff; + tGmin_idx=j; + tobj_diff_min = obj_diff; } } } } } +#ifdef SVM_CUDA +#pragma omp parallel + { + #pragma omp critical + { + if(tGmax2>Gmax2 ) + Gmax2 = tGmax2; + if( tobj_diff_min <= obj_diff_min ) + { + if( tobj_diff_min Gmin_idx ) + { + Gmin_idx = tGmin_idx; + } + } + } + } +#endif if(Gmax+Gmax2 < eps || Gmin_idx == -1) return 1; @@ -2379,9 +2445,6 @@ static void svm_binary_svc_probability( int j = i+rand()%(prob->l-i); swap(perm[i],perm[j]); } -#ifdef SVM_CUDA - #pragma omp parallel for schedule(dynamic) num_threads(param->cuda_threads) -#endif for(i=0;il/nr_fold; @@ -2435,6 +2498,9 @@ static void svm_binary_svc_probability( subparam.weight[0]=Cp; subparam.weight[1]=Cn; struct svm_model *submodel = svm_train(&subprob,&subparam); +#ifdef SVM_CUDA +#pragma omp parallel for private(j) +#endif for(j=begin;jx[perm[j]],&(dec_values[perm[j]])); @@ -2665,10 +2731,6 @@ svm_model *svm_train(const svm_problem *prob, const svm_parameter *param) probB=Malloc(double,nr_class*(nr_class-1)/2); } -#ifdef SVM_CUDA - // Another "#pragma omp parallel for" in svm_binary_svc_probability(), if param->probability - #pragma omp parallel for schedule(dynamic) if(nr_class!=2 && !param->probability) num_threads(param->cuda_threads) -#endif for(int p=0; p Date: Thu, 13 Feb 2020 00:30:55 +0800 Subject: [PATCH 16/19] Update README --- README | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/README b/README index bb48d92e..6f2ea3ce 100644 --- a/README +++ b/README @@ -5,14 +5,10 @@ How to use: run svm-train-cuda instead of svm-train, with same command line arguments of svm-train multithreading -svm-train-cuda is multithreading by default(with OpenMP), If you are runing a svm classification job with more than 2 tags, -it can be faster than single thread one. -You can define cuda thread count with command line argument -u --u n : cuda train threads count (default 1) -Please note: cache is base on thread, total host memory usage is cuda_thread*cache_size_per_thread -you will have to cut down some cache memory if you are using multithreading cuda, cache is faster than GPU, -if the training size is small enough. -So you may try and get the best -u cuda_thread -m cache_size combination, base on the size of your problem. 
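The select_working_set() change in PATCH 15 above is, at its core, a parallel argmax over the gradient: each thread keeps a private best candidate and the candidates are merged in a critical section, with ties resolved toward the larger index. A self-contained sketch of that reduction pattern, assuming only an array G of scores (an illustration, not the solver's exact code):

#include <cmath>
#include <omp.h>

int parallel_argmax(const double *G, int n)
{
	double best = -INFINITY;
	int best_idx = -1;
	#pragma omp parallel
	{
		double t_best = -INFINITY;
		int t_idx = -1;
		#pragma omp for nowait
		for (int t = 0; t < n; t++)
			if (G[t] >= t_best) { t_best = G[t]; t_idx = t; }	/* thread-local maximum */
		#pragma omp critical
		{
			/* merge: prefer the larger value, break ties on the larger index */
			if (t_best > best || (t_best == best && t_idx > best_idx)) {
				best = t_best;
				best_idx = t_idx;
			}
		}
	}
	return best_idx;
}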
+svm-train-cuda is multithreading by default(with OpenMP), you can change thread count with +setenv OMP_NUM_THREADS 8 (Windows) +or +export OMP_NUM_THREADS=8 (Linux) tips try add -h 0 as arguments of svm-train-cuda, it may faster than default one(-h 1) From 39d381ade76acf119a637ec46c7f8b6647d69344 Mon Sep 17 00:00:00 2001 From: panqingfeng Date: Fri, 14 Feb 2020 11:27:55 +0800 Subject: [PATCH 17/19] speed up host code with OpenMP, to catch up with the speed of GPU --- svm.cpp | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/svm.cpp b/svm.cpp index d2f9e382..446fb08d 100644 --- a/svm.cpp +++ b/svm.cpp @@ -951,6 +951,9 @@ void Solver::reconstruct_gradient() if (nr_free*l > 2*active_size*(l-active_size)) { +#ifdef SVM_CUDA +#pragma omp parallel for private(i) +#endif for(i=active_size;iget_Q(i,active_size); @@ -961,6 +964,9 @@ void Solver::reconstruct_gradient() } else { +#ifdef SVM_CUDA +#pragma omp parallel for private(i) +#endif for(i=0;i Date: Fri, 28 Aug 2020 09:58:21 +0800 Subject: [PATCH 18/19] speed up host code with OpenMP, to catch up with the speed of GPU --- svm.cpp | 9 --------- 1 file changed, 9 deletions(-) diff --git a/svm.cpp b/svm.cpp index 446fb08d..ba56a47b 100644 --- a/svm.cpp +++ b/svm.cpp @@ -951,9 +951,6 @@ void Solver::reconstruct_gradient() if (nr_free*l > 2*active_size*(l-active_size)) { -#ifdef SVM_CUDA -#pragma omp parallel for private(i) -#endif for(i=active_size;iget_Q(i,active_size); @@ -964,9 +961,6 @@ void Solver::reconstruct_gradient() } else { -#ifdef SVM_CUDA -#pragma omp parallel for private(i) -#endif for(i=0;ix[perm[j]],&(dec_values[perm[j]])); From 78b2b62d5e8f55665348dbf30d81c447b119cf7a Mon Sep 17 00:00:00 2001 From: Ryan Pan Date: Thu, 7 Jan 2021 14:53:17 +0800 Subject: [PATCH 19/19] Update README --- README | 6 ------ 1 file changed, 6 deletions(-) diff --git a/README b/README index 6f2ea3ce..19b4a65b 100644 --- a/README +++ b/README @@ -4,12 +4,6 @@ make clean; make svm-train-cuda How to use: run svm-train-cuda instead of svm-train, with same command line arguments of svm-train -multithreading -svm-train-cuda is multithreading by default(with OpenMP), you can change thread count with -setenv OMP_NUM_THREADS 8 (Windows) -or -export OMP_NUM_THREADS=8 (Linux) - tips try add -h 0 as arguments of svm-train-cuda, it may faster than default one(-h 1)
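The "less cudaMalloc/cudaFree" change in PATCH 13 boils down to one pattern: allocate the device buffers and the pinned host staging buffers once per CudaKernel, move data with cudaMemcpyAsync on a private stream, and synchronize only when the result is needed. A stripped-down sketch of that pattern, with illustrative names rather than the patch's members (check() stands in for the CUDA_CHECK macro used throughout the patches):

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

static void check(cudaError_t e)
{
	if (e != cudaSuccess) {
		std::fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(e));
		std::exit(1);
	}
}

struct StreamedBuffer {
	cudaStream_t stream;
	double *h_buf;	/* pinned host staging buffer */
	double *d_buf;	/* device buffer, reused across calls */
	size_t n;

	explicit StreamedBuffer(size_t count) : stream(NULL), h_buf(NULL), d_buf(NULL), n(count) {
		check(cudaStreamCreate(&stream));
		check(cudaMallocHost((void**)&h_buf, n * sizeof(double)));
		check(cudaMalloc((void**)&d_buf, n * sizeof(double)));
	}
	void upload() {		/* host -> device, asynchronous on this stream */
		check(cudaMemcpyAsync(d_buf, h_buf, n * sizeof(double), cudaMemcpyHostToDevice, stream));
	}
	void download() {	/* device -> host, asynchronous on this stream */
		check(cudaMemcpyAsync(h_buf, d_buf, n * sizeof(double), cudaMemcpyDeviceToHost, stream));
	}
	void sync() { check(cudaStreamSynchronize(stream)); }
	~StreamedBuffer() {
		cudaFree(d_buf);
		cudaFreeHost(h_buf);
		cudaStreamDestroy(stream);
	}
};

One such buffer per CudaKernel (or per OpenMP training thread) keeps allocation and free calls out of the per-iteration hot path, which is the point of the patch.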