diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu
index 8044ff699bc..7ffdc541113 100644
--- a/src/cudamatrix/cu-kernels.cu
+++ b/src/cudamatrix/cu-kernels.cu
@@ -953,11 +953,12 @@ static void _trace_mat_mat(const Real* A, const Real* B, MatrixDim dA,
   }
 
   // Warp reduce. Implicitly synchronized within a warp.
-  if (tid < warpSize) {
 # pragma unroll
-    for (int shift = warpSize; shift > 0; shift >>= 1) {
+  for (int shift = warpSize; shift > 0; shift >>= 1) {
+    if (tid < warpSize) {
       smem.sum[tid] += smem.sum[tid + shift];
     }
+    __syncwarp();
   }
 
   // output 1 sum per thread block
@@ -1206,11 +1207,12 @@ static void _add_diag_mat_mat_MNT(const Real alpha, const Real* M,
   }
 
   // Warp reduce to 1 element. Threads implicitly synchronized within a warp.
-  if (tid < warpSize) {
 # pragma unroll
-    for (int shift = warpSize; shift > 0; shift >>= 1) {
-      ssum[tid] += ssum[tid + shift];
-    }
+  for (int shift = warpSize; shift > 0; shift >>= 1) {
+    if (tid < warpSize) {
+      ssum[tid] += ssum[tid + shift];
+    }
+    __syncwarp();
   }
 
   // output 1 sum per thread block
@@ -1257,12 +1259,13 @@ static void _add_diag_mat_mat_MTN(const Real alpha, const Real* M,
 
   // Warp reduce to 1 element per column.
   // Threads implicitly synchronized within a warp.
-  if (tid < warpSize) {
 # pragma unroll
   for (int shift = warpSize; shift >= TileDim; shift >>= 1) {
-    ssum[tid] += ssum[tid + shift];
+    if (tid < warpSize) {
+      ssum[tid] += ssum[tid + shift];
+    }
+    __syncwarp();
   }
-  }
 
   // output TileDim sums per thread block
   if (tid < TileDim) {
@@ -1340,13 +1343,13 @@ static void _add_diag_mat_mat_MN(const Real alpha, const Real* M,
 
   // Warp reduce to 1 element per column.
   // Threads implicitly synchronized within a warp.
-  if (tid < warpSize) {
 # pragma unroll
-  for (int shift = warpSize; shift >= TileDim; shift >>= 1) {
+  for (int shift = warpSize; shift >= TileDim; shift >>= 1) {
+    if (tid < warpSize) {
       smem.sum[tid] += smem.sum[tid + shift];
     }
+    __syncwarp();
   }
-
   // output TileDim sums per thread block
   if (tid < TileDim && j_n < dim_N.cols) {
     v[j_n] = alpha * smem.sum[tid] + beta * v[j_n];
@@ -1793,10 +1796,11 @@ static void _vec_transform_reduce(
   }
 
   // Reduce last warp. Threads implicitly synchronized within a warp.
-  if (tid < warpSize) {
-    for (int shift = warpSize; shift > 0; shift >>= 1) {
+  for (int shift = warpSize; shift > 0; shift >>= 1) {
+    if (tid < warpSize) {
       sdata[tid] = op.Reduce(sdata[tid], sdata[tid + shift]);
     }
+    __syncwarp();
   }
 
   // Output to vector result.
@@ -2006,9 +2010,11 @@ static void _transform_reduce_mat_rows(
   }
 
   // Reduce last warp. Threads implicitly synchronized within a warp.
-  if (tid < warpSize) {
-    for (int shift = warpSize; shift > 0; shift >>= 1)
+  for (int shift = warpSize; shift > 0; shift >>= 1) {
+    if (tid < warpSize) {
       sdata[tid] = op.Reduce(sdata[tid], sdata[tid + shift]);
+    }
+    __syncwarp();
   }
 
   // Output to vector result.
@@ -2045,11 +2051,13 @@ static void _transform_reduce_mat_cols(
   }
 
   // Reduce last warp. Threads implicitly synchronized within a warp.
-  if (tid < warpSize) {
-    for (int shift = warpSize; shift > 0; shift >>= 1)
+  for (int shift = warpSize; shift > 0; shift >>= 1) {
+    if (tid < warpSize) {
       sdata[tid] = op.Reduce(sdata[tid], sdata[tid + shift]);
+    }
+    __syncwarp();
   }
-  
+
   // Output to vector result.
   if (tid == 0) {
     result[i] = op.PostReduce(sdata[0], result[i]);
@@ -2087,13 +2095,12 @@ static void _group_transform_reduce(
     x_idx += threads_per_group;
   }
   sreduction[tid] = treduction;
-  if (threads_per_group > warpSize) {
-    __syncthreads();
-  }
+  __syncthreads();
 
   // tree-reduce to 2x warpSize elements per group
 # pragma unroll
-  for (int shift = threads_per_group / 2; shift > warpSize; shift >>= 1) {
+  int shift = threads_per_group / 2;
+  for (; shift > warpSize; shift >>= 1) {
     if (threadIdx.x < shift) {
       sreduction[tid] = op.Reduce(sreduction[tid], sreduction[tid + shift]);
     }
@@ -2101,14 +2108,12 @@
   }
 
   // Warp-reduce to 1 element per group.
-  // Threads implicitly synchronized within the warp.
-  const int warp_reduce_size =
-      threads_per_group / 2 < warpSize ? threads_per_group / 2 : warpSize;
-  if (threadIdx.x < warp_reduce_size) {
 # pragma unroll
-    for (int shift = warp_reduce_size; shift > 0; shift >>= 1) {
+  for (; shift > 0; shift >>= 1) {
+    if (threadIdx.x < shift) {
       sreduction[tid] = op.Reduce(sreduction[tid], sreduction[tid + shift]);
     }
+    __syncwarp();
   }
 
   // Store the result.
@@ -2967,12 +2972,13 @@ static void _diff_normalize_per_row(Real *id, int id_stride, const Real *iv,
   }
 
   // reduce to 1 element per row
-  if (tid < warpSize) {
 # pragma unroll
-    for (int shift = warpSize; shift > 0; shift >>= 1) {
+  for (int shift = warpSize; shift > 0; shift >>= 1) {
+    if (tid < warpSize) {
       sprod[tid] += sprod[tid + shift];
       snorm[tid] += snorm[tid + shift];
     }
+    __syncwarp();
   }
 
   // broadcast the sum results
@@ -3254,15 +3260,16 @@ static void _find_row_max_id(const Real* mat, Real* vec_val, int32_cuda* vec_id,
   }
 
   // Warp reduce without __syncthreads()
   // (note.: synchronizes implicitly within a warp at the multiprocessor)
-  if (tid < warpSize / 2) {
 #pragma unroll
-    for (int32_cuda num_working_threads = warpSize / 2; num_working_threads > 0;
-        num_working_threads >>= 1) {
+  for (int32_cuda num_working_threads = warpSize / 2; num_working_threads > 0;
+      num_working_threads >>= 1) {
+    if (tid < warpSize / 2) {
       if (smax[tid + num_working_threads] > smax[tid]) {
        smax[tid] = smax[tid + num_working_threads];
        sidx[tid] = sidx[tid + num_working_threads];
      }
    }
+    __syncwarp();
   }
 
   if (tid == 0) {
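Note on the cu-kernels.cu changes: nearly every hunk above is the same fix (the _group_transform_reduce hunks additionally hoist the loop variable shift so the warp-reduce loop continues where the tree-reduce left off, and always call __syncthreads() after the initial store). The deleted comments ("implicitly synchronized within a warp") date from when the lanes of a warp executed in lockstep; with the independent thread scheduling introduced on Volta (CUDA 9), that implicit synchronization is gone, and a reduction wrapped in a single "if (tid < warpSize)" can read a neighbor's shared-memory slot before the neighbor has written it. Each hunk therefore moves the guard inside the loop and ends every round with __syncwarp(), a barrier that every lane reaches. Below is a minimal self-contained sketch of that pattern; the kernel name, block shape, and sizes are illustrative only, not from the patch, and it uses the textbook "tid < shift" guard where the patch keeps Kaldi's original "tid < warpSize".

// warp_reduce_sketch.cu -- requires CUDA 9+ for __syncwarp();
// compile with: nvcc warp_reduce_sketch.cu
#include <cstdio>

// Reduces each 64-element (2 * warpSize) chunk of 'in' to one float in 'out'.
__global__ void block_sum(const float *in, float *out) {
  __shared__ float smem[64];           // one partial value per thread
  const int tid = threadIdx.x;
  smem[tid] = in[blockIdx.x * blockDim.x + tid];
  __syncthreads();                     // warp 0 is about to read warp 1's values

  for (int shift = warpSize; shift > 0; shift >>= 1) {
    if (tid < shift) {
      // writes hit [0, shift), reads hit [shift, 2 * shift): disjoint ranges,
      // so there is no race within one round
      smem[tid] += smem[tid + shift];
    }
    // make this round's stores visible to the next round's loads; every lane
    // of the warp reaches this barrier, active in the reduction or not
    __syncwarp();
  }
  if (tid == 0) out[blockIdx.x] = smem[0];
}

int main() {
  const int n = 128;                   // two blocks of 64 threads
  float h_in[n], h_out[2];
  for (int i = 0; i < n; ++i) h_in[i] = 1.0f;
  float *d_in, *d_out;
  cudaMalloc(&d_in, sizeof(h_in));
  cudaMalloc(&d_out, sizeof(h_out));
  cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);
  block_sum<<<2, 64>>>(d_in, d_out);
  cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
  printf("%g %g\n", h_out[0], h_out[1]);  // expect: 64 64
  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}

The same reductions could also be written with __shfl_down_sync() and no shared memory at all, but the patch keeps the shared-memory form, which leaves the surrounding kernel code untouched.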
diff --git a/src/cudamatrix/cu-sparse-matrix-test.cc b/src/cudamatrix/cu-sparse-matrix-test.cc
index aad34b5dd54..0c2230a8731 100644
--- a/src/cudamatrix/cu-sparse-matrix-test.cc
+++ b/src/cudamatrix/cu-sparse-matrix-test.cc
@@ -125,8 +125,8 @@ static void UnitTestCuSparseMatrixSelectRowsAndTranspose() {
 
 template <typename Real>
 static void UnitTestCuSparseMatrixTraceMatSmat() {
   for (int32 i = 0; i < 2; i++) {
-    MatrixIndexT row = 10 + Rand() % 40;
-    MatrixIndexT col = 10 + Rand() % 50;
+    MatrixIndexT row = 2 + Rand() % 3;
+    MatrixIndexT col = 1 + Rand() % 4;
     CuMatrix<Real> mat1(row, col);
     CuMatrix<Real> mat2(col, row);
@@ -147,11 +147,13 @@
     cu_smat2.CopyToMat(&mat2);
 
     Real trace1 = TraceMatMat(mat3, mat1, kTrans);
+
     Real trace2 = TraceMatSmat(mat3, cu_smat1, kTrans);
     AssertEqual(trace1, trace2, 0.00001);
 
     trace1 = TraceMatMat(mat3, mat2, kNoTrans);
     trace2 = TraceMatSmat(mat3, cu_smat2, kNoTrans);
+
     AssertEqual(trace1, trace2, 0.00001);
   }
 }
diff --git a/src/cudamatrix/cu-sparse-matrix.cc b/src/cudamatrix/cu-sparse-matrix.cc
index 703aa40e735..f24613fa231 100644
--- a/src/cudamatrix/cu-sparse-matrix.cc
+++ b/src/cudamatrix/cu-sparse-matrix.cc
@@ -161,7 +161,7 @@ void CuSparseMatrix<Real>::SelectRows(const CuArray<int32> &row_indexes,
 template <typename Real>
 CuSparseMatrix<Real>::CuSparseMatrix(const CuArray<int32> &indexes, int32 dim,
                                      MatrixTransposeType trans) :
-    num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_col_idx_(NULL), csr_val_(
+    num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_(NULL), csr_col_idx_(NULL), csr_val_(
         NULL) {
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
@@ -194,8 +194,8 @@ template <typename Real>
 CuSparseMatrix<Real>::CuSparseMatrix(const CuArray<int32> &indexes,
                                      const CuVectorBase<Real> &weights,
                                      int32 dim, MatrixTransposeType trans) :
-    num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_col_idx_(NULL), csr_val_(
-        NULL) {
+    num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_(NULL), csr_col_idx_(NULL),
+    csr_val_(NULL) {
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     Resize(indexes.Dim(), dim, indexes.Dim(), kUndefined);
@@ -266,8 +266,9 @@ void CuSparseMatrix<Real>::Resize(const MatrixIndexT num_rows,
     num_rows_ = 0;
     num_cols_ = 0;
     nnz_ = 0;
-    csr_row_ptr_col_idx_ = static_cast<int*>(CuDevice::Instantiate().Malloc(
+    csr_row_ptr_ = static_cast<int*>(CuDevice::Instantiate().Malloc(
         1 * sizeof(int)));
+    csr_col_idx_ = NULL;  // may be freed, but this is allowed.
     csr_val_ = NULL;
   } else {
     KALDI_ASSERT(num_rows > 0);
@@ -277,10 +278,16 @@ void CuSparseMatrix<Real>::Resize(const MatrixIndexT num_rows,
     num_rows_ = num_rows;
     num_cols_ = num_cols;
     nnz_ = nnz;
-    csr_row_ptr_col_idx_ = static_cast<int*>(CuDevice::Instantiate().Malloc(
-        (num_rows + 1 + nnz) * sizeof(int)));
-    csr_val_ = static_cast<Real*>(CuDevice::Instantiate().Malloc(
+    csr_row_ptr_ = static_cast<int*>(CuDevice::Instantiate().Malloc((num_rows + 1) * sizeof(int)));
+    if (nnz > 0) {
+      csr_col_idx_ = static_cast<int*>(CuDevice::Instantiate().Malloc(
+          nnz * sizeof(int)));
+      csr_val_ = static_cast<Real*>(CuDevice::Instantiate().Malloc(
         nnz * sizeof(Real)));
+    } else {
+      csr_col_idx_ = NULL;
+      csr_val_ = NULL;
+    }
     CuSubArray<int> row_ptr(CsrRowPtr(), NumRows() + 1);
     row_ptr.Set(nnz);
     if (resize_type == kSetZero) {
@@ -302,8 +309,11 @@ void CuSparseMatrix<Real>::Destroy() {
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
-    if (csr_row_ptr_col_idx_) {
-      CuDevice::Instantiate().Free(csr_row_ptr_col_idx_);
+    if (csr_row_ptr_) {
+      CuDevice::Instantiate().Free(csr_row_ptr_);
+    }
+    if (csr_col_idx_) {
+      CuDevice::Instantiate().Free(csr_col_idx_);
     }
     if (csr_val_) {
       CuDevice::Instantiate().Free(csr_val_);
@@ -311,7 +321,8 @@
     num_rows_ = 0;
     num_cols_ = 0;
     nnz_ = 0;
-    csr_row_ptr_col_idx_ = NULL;
+    csr_row_ptr_ = NULL;
+    csr_col_idx_ = NULL;
     csr_val_ = NULL;
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
@@ -378,11 +389,17 @@ void CuSparseMatrix<Real>::CopyFromSmat(const CuSparseMatrix<Real>& smat,
     CuSubVector<Real> val_from(smat.CsrVal(), smat.NumElements());
     val_to.CopyFromVec(val_from);
 
-    CuSubArray<int> idx_to(csr_row_ptr_col_idx_,
-                           NumRows() + 1 + NumElements());
-    CuSubArray<int> idx_from(smat.csr_row_ptr_col_idx_,
-                             smat.NumRows() + 1 + smat.NumElements());
-    idx_to.CopyFromArray(idx_from);
+    {
+      CuSubArray<int> idx_to(csr_row_ptr_, NumRows() + 1);
+      CuSubArray<int> idx_from(smat.csr_row_ptr_, NumRows() + 1);
+      idx_to.CopyFromArray(idx_from);
+    }
+
+    {
+      CuSubArray<int> idx_to(csr_col_idx_, NumElements());
+      CuSubArray<int> idx_from(smat.csr_col_idx_, NumElements());
+      idx_to.CopyFromArray(idx_from);
+    }
 
   } else {
     Resize(smat.NumCols(), smat.NumRows(), smat.NumElements(), kUndefined);
@@ -413,9 +430,14 @@ void CuSparseMatrix<Real>::CopyToSmat(SparseMatrix<Real> *smat) const {
       smat->Resize(0, 0);
       return;
     }
-    CuSubArray<int> idx(csr_row_ptr_col_idx_, NumRows() + 1 + NumElements());
-    std::vector<int> idx_cpu;
-    idx.CopyToVec(&idx_cpu);
+    CuSubArray<int> row_ptr(csr_row_ptr_, NumRows() + 1);
+    std::vector<int> row_ptr_cpu;
+    row_ptr.CopyToVec(&row_ptr_cpu);
+
+
+    CuSubArray<int> col_idx(csr_col_idx_, NumElements());
+    std::vector<int> col_idx_cpu;
+    col_idx.CopyToVec(&col_idx_cpu);
 
     CuSubVector<Real> val(CsrVal(), NumElements());
     Vector<Real> val_cpu(NumElements(), kUndefined);
@@ -425,8 +447,8 @@ void CuSparseMatrix<Real>::CopyToSmat(SparseMatrix<Real> *smat) const {
         NumRows());
     int n = 0;
     for (int i = 0; i < NumRows(); ++i) {
-      for (; n < idx_cpu[i + 1]; ++n) {
-        const MatrixIndexT j = idx_cpu[NumRows() + 1 + n];
+      for (; n < row_ptr_cpu[i + 1]; ++n) {
+        const MatrixIndexT j = col_idx_cpu[n];
         pairs[i].push_back( { j, val_cpu(n) });
       }
     }
@@ -484,7 +506,8 @@ void CuSparseMatrix<Real>::Swap(CuSparseMatrix<Real> *smat) {
     std::swap(num_rows_, smat->num_rows_);
     std::swap(num_cols_, smat->num_cols_);
     std::swap(nnz_, smat->nnz_);
-    std::swap(csr_row_ptr_col_idx_, smat->csr_row_ptr_col_idx_);
+    std::swap(csr_row_ptr_, smat->csr_row_ptr_);
+    std::swap(csr_col_idx_, smat->csr_col_idx_);
     std::swap(csr_val_, smat->csr_val_);
   } else
 #endif
diff --git a/src/cudamatrix/cu-sparse-matrix.h b/src/cudamatrix/cu-sparse-matrix.h
index 82b17a0dc71..180beed6183 100644
--- a/src/cudamatrix/cu-sparse-matrix.h
+++ b/src/cudamatrix/cu-sparse-matrix.h
@@ -121,13 +121,13 @@ class CuSparseMatrix {
 
   /// Default constructor
   CuSparseMatrix() :
-      num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_col_idx_(NULL), csr_val_(
+      num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_(NULL), csr_col_idx_(NULL), csr_val_(
           NULL) {
   }
 
   /// Constructor from CPU-based sparse matrix.
   explicit CuSparseMatrix(const SparseMatrix<Real> &smat) :
-      num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_col_idx_(NULL), csr_val_(
+      num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_(NULL), csr_col_idx_(NULL), csr_val_(
           NULL) {
     this->CopyFromSmat(smat);
   }
@@ -135,7 +135,7 @@ class CuSparseMatrix {
   /// Constructor from GPU-based sparse matrix (supports transposition).
   CuSparseMatrix(const CuSparseMatrix<Real> &smat,
                  MatrixTransposeType trans = kNoTrans) :
-      num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_col_idx_(NULL), csr_val_(
+      num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_(NULL), csr_col_idx_(NULL), csr_val_(
           NULL) {
     this->CopyFromSmat(smat, trans);
   }
@@ -200,19 +200,19 @@ class CuSparseMatrix {
   /// indices of the first nonzero element in the i-th row, while the last entry
   /// contains nnz_, as zero-based CSR format is used.
   const int* CsrRowPtr() const {
-    return csr_row_ptr_col_idx_;
+    return csr_row_ptr_;
   }
   int* CsrRowPtr() {
-    return csr_row_ptr_col_idx_;
+    return csr_row_ptr_;
   }
 
   /// Returns pointer to the integer array of length nnz_ that contains
   /// the column indices of the corresponding elements in array CsrVal()
   const int* CsrColIdx() const {
-    return csr_row_ptr_col_idx_ + num_rows_ + 1;
+    return csr_col_idx_;
   }
   int* CsrColIdx() {
-    return csr_row_ptr_col_idx_ + num_rows_ + 1;
+    return csr_col_idx_;
   }
 
  private:
@@ -238,9 +238,10 @@ class CuSparseMatrix {
   // number of non-zeros
   MatrixIndexT nnz_;
 
-  // csr row ptrs and col indices in a single int array
-  // of the length (num_rows_ + 1 + nnz_)
-  int* csr_row_ptr_col_idx_;
+  // length num_rows_ + 1
+  int* csr_row_ptr_;
+  // length nnz_
+  int* csr_col_idx_;
 
   // csr value array of the length nnz_
   Real* csr_val_;
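Note on the sparse-matrix changes: the old csr_row_ptr_col_idx_ member packed the CSR row pointers and column indices back to back in one allocation of num_rows_ + 1 + nnz_ ints, with CsrColIdx() computed as an offset into it. The patch splits them into csr_row_ptr_ (length num_rows_ + 1) and csr_col_idx_ (length nnz_), which lets csr_col_idx_ and csr_val_ stay NULL when nnz_ == 0 while an empty matrix still keeps a valid one-int row-pointer array for callers of CsrRowPtr(). Below is a small host-side sketch of the zero-based CSR layout the three arrays implement; the example matrix and the printout are made up for illustration, not taken from Kaldi.

// csr_sketch.cc -- zero-based CSR for the 2 x 4 matrix
//   [ 0 5 0 7 ]
//   [ 3 0 0 0 ]
#include <cstdio>

int main() {
  const int num_rows = 2;                // nnz == 3
  const int csr_row_ptr[] = {0, 2, 3};   // length num_rows + 1; row i occupies
                                         // entries [csr_row_ptr[i], csr_row_ptr[i+1])
  const int csr_col_idx[] = {1, 3, 0};   // length nnz; column of each stored value
  const float csr_val[]   = {5, 7, 3};   // length nnz; the stored values

  // Walk every stored element; this is essentially the loop in the patched
  // CopyToSmat(), which carries one running index n across all rows.
  for (int i = 0, n = 0; i < num_rows; ++i)
    for (; n < csr_row_ptr[i + 1]; ++n)
      printf("(%d, %d) = %g\n", i, csr_col_idx[n], csr_val[n]);
  return 0;
}

With the packed layout, any consumer that wanted only the column indices had to address past the row pointers too, as the old CopyToSmat() did via idx_cpu[NumRows() + 1 + n]; with separate members, each transfer in CopyFromSmat() and CopyToSmat() now copies exactly one logically distinct array.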