Commit
Merge pull request kaldi-asr#4880 from danpovey/cuda_fixes
Fixes to work with CUDA 12 toolkit
danpovey committed Nov 3, 2023
2 parents 1b07b59 + cdbc05b commit a670447
Showing 4 changed files with 99 additions and 66 deletions.
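All of the kernel hunks below apply the same transformation. The old warp reductions relied on warp-synchronous execution: one `if (tid < warpSize)` guard around the whole shift loop, with no synchronization between steps, on the assumption that the threads of a warp run in lockstep. Under the independent thread scheduling introduced with Volta, that assumption no longer holds, so the guard moves inside the loop — letting every thread reach the barrier — and each step now ends with an explicit `__syncwarp()`. The `# pragma unroll` hints disappear in the same rewrite. A minimal sketch of the before/after shapes (illustrative, not code from the commit; assumes `sdata` holds 2 * warpSize partial sums and the block is a whole number of warps):

```cuda
// Before: relies on implicit lockstep execution within a warp. Under
// independent thread scheduling, the reads and writes of sdata[] by
// different lanes are no longer ordered between steps.
__device__ void warp_reduce_old(volatile float *sdata, int tid) {
  if (tid < warpSize) {
    for (int shift = warpSize; shift > 0; shift >>= 1)
      sdata[tid] += sdata[tid + shift];
  }
}

// After: every thread executes the loop and hits __syncwarp(), which
// orders the shared-memory accesses of a warp between steps.
__device__ void warp_reduce_new(float *sdata, int tid) {
  for (int shift = warpSize; shift > 0; shift >>= 1) {
    if (tid < warpSize) {
      sdata[tid] += sdata[tid + shift];
    }
    __syncwarp();
  }
}
```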
73 changes: 40 additions & 33 deletions src/cudamatrix/cu-kernels.cu
@@ -953,11 +953,12 @@ static void _trace_mat_mat(const Real* A, const Real* B, MatrixDim dA,
   }
 
   // Warp reduce. Implicitly synchronized within a warp.
-  if (tid < warpSize) {
-#   pragma unroll
-    for (int shift = warpSize; shift > 0; shift >>= 1) {
+  for (int shift = warpSize; shift > 0; shift >>= 1) {
+    if (tid < warpSize) {
       smem.sum[tid] += smem.sum[tid + shift];
     }
+    __syncwarp();
   }
 
   // output 1 sum per thread block
@@ -1206,11 +1207,12 @@ static void _add_diag_mat_mat_MNT(const Real alpha, const Real* M,
   }
 
   // Warp reduce to 1 element. Threads implicitly synchronized within a warp.
-  if (tid < warpSize) {
-#   pragma unroll
-    for (int shift = warpSize; shift > 0; shift >>= 1) {
-      ssum[tid] += ssum[tid + shift];
-    }
+  for (int shift = warpSize; shift > 0; shift >>= 1) {
+    if (tid < warpSize) {
+      ssum[tid] += ssum[tid + shift];
+    }
+    __syncwarp();
   }
 
   // output 1 sum per thread block
@@ -1257,12 +1259,13 @@ static void _add_diag_mat_mat_MTN(const Real alpha, const Real* M,
 
   // Warp reduce to 1 element per column.
   // Threads implicitly synchronized within a warp.
-  if (tid < warpSize) {
-#   pragma unroll
-    for (int shift = warpSize; shift >= TileDim; shift >>= 1) {
-      ssum[tid] += ssum[tid + shift];
-    }
+  for (int shift = warpSize; shift >= TileDim; shift >>= 1) {
+    if (tid < warpSize) {
+      ssum[tid] += ssum[tid + shift];
+    }
+    __syncwarp();
   }
 
   // output TileDim sums per thread block
   if (tid < TileDim) {
@@ -1340,13 +1343,13 @@ static void _add_diag_mat_mat_MN(const Real alpha, const Real* M,
 
   // Warp reduce to 1 element per column.
   // Threads implicitly synchronized within a warp.
-  if (tid < warpSize) {
-#   pragma unroll
-    for (int shift = warpSize; shift >= TileDim; shift >>= 1) {
+  for (int shift = warpSize; shift >= TileDim; shift >>= 1) {
+    if (tid < warpSize) {
       smem.sum[tid] += smem.sum[tid + shift];
     }
+    __syncwarp();
   }
 
   // output TileDim sums per thread block
   if (tid < TileDim && j_n < dim_N.cols) {
     v[j_n] = alpha * smem.sum[tid] + beta * v[j_n];
@@ -1793,10 +1796,11 @@ static void _vec_transform_reduce(
   }
 
   // Reduce last warp. Threads implicitly synchronized within a warp.
-  if (tid < warpSize) {
-    for (int shift = warpSize; shift > 0; shift >>= 1) {
+  for (int shift = warpSize; shift > 0; shift >>= 1) {
+    if (tid < warpSize) {
       sdata[tid] = op.Reduce(sdata[tid], sdata[tid + shift]);
     }
+    __syncwarp();
   }
 
   // Output to vector result.
@@ -2006,9 +2010,11 @@ static void _transform_reduce_mat_rows(
   }
 
   // Reduce last warp. Threads implicitly synchronized within a warp.
-  if (tid < warpSize) {
-    for (int shift = warpSize; shift > 0; shift >>= 1)
+  for (int shift = warpSize; shift > 0; shift >>= 1) {
+    if (tid < warpSize) {
       sdata[tid] = op.Reduce(sdata[tid], sdata[tid + shift]);
+    }
+    __syncwarp();
   }
 
   // Output to vector result.
@@ -2045,11 +2051,13 @@ static void _transform_reduce_mat_cols(
   }
 
   // Reduce last warp. Threads implicitly synchronized within a warp.
-  if (tid < warpSize) {
-    for (int shift = warpSize; shift > 0; shift >>= 1)
+  for (int shift = warpSize; shift > 0; shift >>= 1) {
+    if (tid < warpSize) {
       sdata[tid] = op.Reduce(sdata[tid], sdata[tid + shift]);
+    }
+    __syncwarp();
   }
 
   // Output to vector result.
   if (tid == 0) {
     result[i] = op.PostReduce(sdata[0], result[i]);
@@ -2087,28 +2095,25 @@ static void _group_transform_reduce(
     x_idx += threads_per_group;
   }
   sreduction[tid] = treduction;
-  if (threads_per_group > warpSize) {
-    __syncthreads();
-  }
+  __syncthreads();
 
   // tree-reduce to 2x warpSize elements per group
-# pragma unroll
-  for (int shift = threads_per_group / 2; shift > warpSize; shift >>= 1) {
+  int shift = threads_per_group / 2;
+  for (; shift > warpSize; shift >>= 1) {
     if (threadIdx.x < shift) {
       sreduction[tid] = op.Reduce(sreduction[tid], sreduction[tid + shift]);
    }
     __syncthreads();
   }
 
   // Warp-reduce to 1 element per group.
   // Threads implicitly synchronized within the warp.
-  const int warp_reduce_size =
-      threads_per_group / 2 < warpSize ? threads_per_group / 2 : warpSize;
-  if (threadIdx.x < warp_reduce_size) {
-#   pragma unroll
-    for (int shift = warp_reduce_size; shift > 0; shift >>= 1) {
+  for (; shift > 0; shift >>= 1) {
+    if (threadIdx.x < shift) {
       sreduction[tid] = op.Reduce(sreduction[tid], sreduction[tid + shift]);
     }
+    __syncwarp();
   }
 
   // Store the result.
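The `_group_transform_reduce` hunk does slightly more than the others: the `__syncthreads()` becomes unconditional, and `shift` is hoisted out of the tree-reduce loop so the warp-level loop continues exactly where the tree reduction stopped. That carried value is what lets the old `warp_reduce_size` clamp be deleted. Tracing the values (illustrative, not from the commit):

```cuda
// threads_per_group = 128, warpSize = 32:
//   tree-reduce loop:  shift = 64            (runs while shift > warpSize)
//   warp-reduce loop:  shift = 32,16,8,4,2,1
// threads_per_group = 32, warpSize = 32:
//   tree-reduce loop:  (skipped; shift starts at 16)
//   warp-reduce loop:  shift = 16,8,4,2,1
// In both cases the second loop starts exactly where the old clamp
// warp_reduce_size = min(threads_per_group / 2, warpSize) would have
// started it, so the clamp carried no extra information.
```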
@@ -2967,12 +2972,13 @@ static void _diff_normalize_per_row(Real *id, int id_stride, const Real *iv,
   }
 
   // reduce to 1 element per row
-  if (tid < warpSize) {
-#   pragma unroll
-    for (int shift = warpSize; shift > 0; shift >>= 1) {
+  for (int shift = warpSize; shift > 0; shift >>= 1) {
+    if (tid < warpSize) {
       sprod[tid] += sprod[tid + shift];
       snorm[tid] += snorm[tid + shift];
     }
+    __syncwarp();
   }
 
   // broadcast the sum results
@@ -3254,15 +3260,16 @@ static void _find_row_max_id(const Real* mat, Real* vec_val, int32_cuda* vec_id,
   }
   // Warp reduce without __syncthreads()
   // (note.: synchronizes implicitly within a warp at the multiprocessor)
-  if (tid < warpSize / 2) {
-#pragma unroll
-    for (int32_cuda num_working_threads = warpSize / 2; num_working_threads > 0;
-         num_working_threads >>= 1) {
+  for (int32_cuda num_working_threads = warpSize / 2; num_working_threads > 0;
+       num_working_threads >>= 1) {
+    if (tid < warpSize / 2) {
       if (smax[tid + num_working_threads] > smax[tid]) {
         smax[tid] = smax[tid + num_working_threads];
         sidx[tid] = sidx[tid + num_working_threads];
       }
     }
+    __syncwarp();
   }
 
   if (tid == 0) {
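A quick way to exercise the rewritten pattern end to end is a throwaway harness; the one below is illustrative only (nothing like it ships in this commit). It launches one block of 2 * warpSize threads and checks the block sum against the known answer:

```cuda
// nvcc -o warp_reduce_check warp_reduce_check.cu && ./warp_reduce_check
#include <cstdio>
#include <cuda_runtime.h>

__global__ void block_sum(const float *in, float *out, int n) {
  __shared__ float sdata[64];  // 2 * warpSize on current hardware
  int tid = threadIdx.x;
  // Block-stride accumulate into 2 * warpSize partial sums.
  float acc = 0.0f;
  for (int i = tid; i < n; i += blockDim.x) acc += in[i];
  sdata[tid] = acc;
  __syncthreads();
  // The pattern the commit installs: all threads iterate, the update is
  // guarded, and every step ends with an explicit __syncwarp().
  for (int shift = warpSize; shift > 0; shift >>= 1) {
    if (tid < warpSize) sdata[tid] += sdata[tid + shift];
    __syncwarp();
  }
  if (tid == 0) *out = sdata[0];
}

int main() {
  const int n = 1 << 20;
  float *in, *out;
  cudaMallocManaged(&in, n * sizeof(float));
  cudaMallocManaged(&out, sizeof(float));
  for (int i = 0; i < n; ++i) in[i] = 1.0f;
  block_sum<<<1, 64>>>(in, out, n);  // one block of 2 * warpSize threads
  cudaDeviceSynchronize();
  printf("sum = %f (expected %d)\n", *out, n);
  cudaFree(in);
  cudaFree(out);
  return 0;
}
```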
6 changes: 4 additions & 2 deletions src/cudamatrix/cu-sparse-matrix-test.cc
@@ -125,8 +125,8 @@ static void UnitTestCuSparseMatrixSelectRowsAndTranspose() {
 template <typename Real>
 static void UnitTestCuSparseMatrixTraceMatSmat() {
   for (int32 i = 0; i < 2; i++) {
-    MatrixIndexT row = 10 + Rand() % 40;
-    MatrixIndexT col = 10 + Rand() % 50;
+    MatrixIndexT row = 2 + Rand() % 3;
+    MatrixIndexT col = 1 + Rand() % 4;
 
     CuMatrix<Real> mat1(row, col);
     CuMatrix<Real> mat2(col, row);
@@ -147,11 +147,13 @@ static void UnitTestCuSparseMatrixTraceMatSmat() {
     cu_smat2.CopyToMat(&mat2);
 
     Real trace1 = TraceMatMat(mat3, mat1, kTrans);
+
     Real trace2 = TraceMatSmat(mat3, cu_smat1, kTrans);
     AssertEqual(trace1, trace2, 0.00001);
 
     trace1 = TraceMatMat(mat3, mat2, kNoTrans);
     trace2 = TraceMatSmat(mat3, cu_smat2, kNoTrans);
+
     AssertEqual(trace1, trace2, 0.00001);
   }
 }
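The only functional change in this test is the matrix sizes: rows shrink from 10–49 to 2–4 and columns from 10–59 to 1–4, which keeps the same trace checks but makes any failure small enough to inspect by hand and exercises near-degenerate shapes. For reference — assuming Kaldi's usual convention that `TraceMatMat(A, B, kTrans)` computes tr(A Bᵀ) = Σᵢⱼ A(i,j)·B(i,j) — the sparse counterpart being checked reduces to a sum over the stored elements of the CSR matrix. A CPU sketch (name and signature are illustrative, not Kaldi API):

```cuda
// Reference for trace(A * S^T) with S in CSR form, assuming A and S have
// the same num_rows x num_cols shape. Illustrative only.
float TraceDenseSparseT(const float *A, int a_stride,
                        const int *row_ptr, const int *col_idx,
                        const float *val, int num_rows) {
  float sum = 0.0f;
  for (int i = 0; i < num_rows; ++i)
    for (int n = row_ptr[i]; n < row_ptr[i + 1]; ++n)
      sum += val[n] * A[i * a_stride + col_idx[n]];  // S(i,j) * A(i,j)
  return sum;
}
```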
65 changes: 44 additions & 21 deletions src/cudamatrix/cu-sparse-matrix.cc
@@ -161,7 +161,7 @@ void CuSparseMatrix<Real>::SelectRows(const CuArray<int32> &row_indexes,
 template<typename Real>
 CuSparseMatrix<Real>::CuSparseMatrix(const CuArray<int32> &indexes, int32 dim,
                                      MatrixTransposeType trans) :
-    num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_col_idx_(NULL), csr_val_(
+    num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_(NULL), csr_col_idx_(NULL), csr_val_(
         NULL) {
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
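Every hunk in this file follows from one data-structure change: the single `csr_row_ptr_col_idx_` buffer, which packed the CSR row pointers and column indices into one allocation, becomes two members, `csr_row_ptr_` and `csr_col_idx_`. The commit message does not spell out the motivation, but separate arrays match the distinct row-offset and column-index pointers that cuSPARSE's generic API (`cusparseCreateCsr`, the interface CUDA 12 pushes toward after removing many legacy routines) operates on, and they let `csr_col_idx_` stay NULL for an empty matrix. A layout sketch (illustrative, not the class definition from the commit):

```cuda
// Old: one allocation holding both index arrays back to back; the column
// indices lived at a fixed offset past the row pointers.
struct CsrLayoutOld {
  int num_rows_;
  int *csr_row_ptr_col_idx_;  // [ row_ptr: num_rows_+1 ints | col_idx: nnz_ ints ]
  int *RowPtr() { return csr_row_ptr_col_idx_; }
  int *ColIdx() { return csr_row_ptr_col_idx_ + num_rows_ + 1; }
};

// New: two independent allocations, each usable directly as a cuSPARSE
// rowOffsets / colInd pointer.
struct CsrLayoutNew {
  int *csr_row_ptr_;  // num_rows_ + 1 entries
  int *csr_col_idx_;  // nnz_ entries, or NULL when nnz_ == 0
};
```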
@@ -194,8 +194,8 @@ template<typename Real>
 CuSparseMatrix<Real>::CuSparseMatrix(const CuArray<int32> &indexes,
                                      const CuVectorBase<Real> &weights,
                                      int32 dim, MatrixTransposeType trans) :
-    num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_col_idx_(NULL), csr_val_(
-        NULL) {
+    num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_(NULL), csr_col_idx_(NULL),
+    csr_val_(NULL) {
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     Resize(indexes.Dim(), dim, indexes.Dim(), kUndefined);
@@ -266,8 +266,9 @@ void CuSparseMatrix<Real>::Resize(const MatrixIndexT num_rows,
     num_rows_ = 0;
     num_cols_ = 0;
     nnz_ = 0;
-    csr_row_ptr_col_idx_ = static_cast<int*>(CuDevice::Instantiate().Malloc(
+    csr_row_ptr_ = static_cast<int*>(CuDevice::Instantiate().Malloc(
         1 * sizeof(int)));
+    csr_col_idx_ = NULL; // may be freed, but this is allowed.
     csr_val_ = NULL;
   } else {
     KALDI_ASSERT(num_rows > 0);
@@ -277,10 +278,16 @@ void CuSparseMatrix<Real>::Resize(const MatrixIndexT num_rows,
     num_rows_ = num_rows;
     num_cols_ = num_cols;
     nnz_ = nnz;
-    csr_row_ptr_col_idx_ = static_cast<int*>(CuDevice::Instantiate().Malloc(
-        (num_rows + 1 + nnz) * sizeof(int)));
-    csr_val_ = static_cast<Real*>(CuDevice::Instantiate().Malloc(
+    csr_row_ptr_ = static_cast<int*>(CuDevice::Instantiate().Malloc((num_rows + 1) * sizeof(int)));
+    if (nnz > 0) {
+      csr_col_idx_ = static_cast<int*>(CuDevice::Instantiate().Malloc(
+          nnz * sizeof(int)));
+      csr_val_ = static_cast<Real*>(CuDevice::Instantiate().Malloc(
           nnz * sizeof(Real)));
+    } else {
+      csr_col_idx_ = NULL;
+      csr_val_ = NULL;
+    }
     CuSubArray<int> row_ptr(CsrRowPtr(), NumRows() + 1);
     row_ptr.Set(nnz);
     if (resize_type == kSetZero) {
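With the split, `Resize` allocates `csr_col_idx_` and `csr_val_` only when there is something to store, so a matrix with `nnz == 0` never requests a zero-byte allocation, while `csr_row_ptr_` always gets its `num_rows + 1` entries, each set to `nnz` by the unchanged `row_ptr.Set(nnz)` call. For the empty case that initialization is already the final answer (illustrative walk-through, not from the commit):

```cuda
// State left behind by Resize(3, 4, /*nnz=*/0):
//   csr_row_ptr_ -> [0, 0, 0, 0]   // num_rows + 1 entries, Set(nnz) == 0
//   csr_col_idx_ -> NULL           // nothing to index
//   csr_val_     -> NULL           // nothing to store
// Every row i spans [row_ptr[i], row_ptr[i+1]) == [0, 0): a valid CSR
// matrix with no elements, produced without any Malloc(0).
```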
@@ -302,16 +309,20 @@ void CuSparseMatrix<Real>::Destroy() {
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
-    if (csr_row_ptr_col_idx_) {
-      CuDevice::Instantiate().Free(csr_row_ptr_col_idx_);
+    if (csr_row_ptr_) {
+      CuDevice::Instantiate().Free(csr_row_ptr_);
+    }
+    if (csr_col_idx_) {
+      CuDevice::Instantiate().Free(csr_col_idx_);
     }
     if (csr_val_) {
       CuDevice::Instantiate().Free(csr_val_);
     }
     num_rows_ = 0;
     num_cols_ = 0;
     nnz_ = 0;
-    csr_row_ptr_col_idx_ = NULL;
+    csr_row_ptr_ = NULL;
+    csr_col_idx_ = NULL;
     csr_val_ = NULL;
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
@@ -378,11 +389,17 @@ void CuSparseMatrix<Real>::CopyFromSmat(const CuSparseMatrix<Real>& smat,
     CuSubVector<Real> val_from(smat.CsrVal(), smat.NumElements());
     val_to.CopyFromVec(val_from);
 
-    CuSubArray<int> idx_to(csr_row_ptr_col_idx_,
-                           NumRows() + 1 + NumElements());
-    CuSubArray<int> idx_from(smat.csr_row_ptr_col_idx_,
-                             smat.NumRows() + 1 + smat.NumElements());
-    idx_to.CopyFromArray(idx_from);
+    {
+      CuSubArray<int> idx_to(csr_row_ptr_, NumRows() + 1);
+      CuSubArray<int> idx_from(smat.csr_row_ptr_, NumRows() + 1);
+      idx_to.CopyFromArray(idx_from);
+    }
+
+    {
+      CuSubArray<int> idx_to(csr_col_idx_, NumElements());
+      CuSubArray<int> idx_from(smat.csr_col_idx_, NumElements());
+      idx_to.CopyFromArray(idx_from);
+    }
 
   } else {
     Resize(smat.NumCols(), smat.NumRows(), smat.NumElements(), kUndefined);
@@ -413,9 +430,14 @@ void CuSparseMatrix<Real>::CopyToSmat(SparseMatrix<OtherReal> *smat) const {
     smat->Resize(0, 0);
     return;
   }
-  CuSubArray<int> idx(csr_row_ptr_col_idx_, NumRows() + 1 + NumElements());
-  std::vector<int> idx_cpu;
-  idx.CopyToVec(&idx_cpu);
+  CuSubArray<int> row_ptr(csr_row_ptr_, NumRows() + 1);
+  std::vector<int> row_ptr_cpu;
+  row_ptr.CopyToVec(&row_ptr_cpu);
+
+
+  CuSubArray<int> col_idx(csr_col_idx_, NumElements());
+  std::vector<int> col_idx_cpu;
+  col_idx.CopyToVec(&col_idx_cpu);
 
   CuSubVector<Real> val(CsrVal(), NumElements());
   Vector<OtherReal> val_cpu(NumElements(), kUndefined);
@@ -425,8 +447,8 @@ void CuSparseMatrix<Real>::CopyToSmat(SparseMatrix<OtherReal> *smat) const {
                                  NumRows());
   int n = 0;
   for (int i = 0; i < NumRows(); ++i) {
-    for (; n < idx_cpu[i + 1]; ++n) {
-      const MatrixIndexT j = idx_cpu[NumRows() + 1 + n];
+    for (; n < row_ptr_cpu[i + 1]; ++n) {
+      const MatrixIndexT j = col_idx_cpu[n];
       pairs[i].push_back( { j, val_cpu(n) });
     }
   }
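The rewritten copy-out loop now reads the two arrays directly: `row_ptr_cpu[i + 1]` bounds row i's run of elements and `col_idx_cpu[n]` is element n's column, where the old code had to reach past the row-pointer block with `idx_cpu[NumRows() + 1 + n]`. A small walk-through (illustrative):

```cuda
// For the 2 x 3 matrix
//   [ 5 0 7 ]
//   [ 0 2 0 ]
// CSR gives row_ptr_cpu = {0, 2, 3}, col_idx_cpu = {0, 2, 1},
// val_cpu = {5, 7, 2}, and the loop visits:
//   i = 0: n = 0 -> (j=0, 5), n = 1 -> (j=2, 7)   // n < row_ptr_cpu[1] == 2
//   i = 1: n = 2 -> (j=1, 2)                      // n < row_ptr_cpu[2] == 3
// Under the old layout the same column index sat at idx_cpu[NumRows()+1+n],
// i.e. past the row-pointer block inside the combined array.
```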
@@ -484,7 +506,8 @@ void CuSparseMatrix<Real>::Swap(CuSparseMatrix<Real> *smat) {
   std::swap(num_rows_, smat->num_rows_);
   std::swap(num_cols_, smat->num_cols_);
   std::swap(nnz_, smat->nnz_);
-  std::swap(csr_row_ptr_col_idx_, smat->csr_row_ptr_col_idx_);
+  std::swap(csr_row_ptr_, smat->csr_row_ptr_);
+  std::swap(csr_col_idx_, smat->csr_col_idx_);
   std::swap(csr_val_, smat->csr_val_);
 } else
 #endif
