diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu
index 8044ff699bc..7ffdc541113 100644
--- a/src/cudamatrix/cu-kernels.cu
+++ b/src/cudamatrix/cu-kernels.cu
@@ -953,11 +953,12 @@ static void _trace_mat_mat(const Real* A, const Real* B, MatrixDim dA,
   }
 
   // Warp reduce. Implicitly synchronized within a warp.
-  if (tid < warpSize) {
 # pragma unroll
-    for (int shift = warpSize; shift > 0; shift >>= 1) {
+  for (int shift = warpSize; shift > 0; shift >>= 1) {
+    if (tid < warpSize) {
       smem.sum[tid] += smem.sum[tid + shift];
     }
+    __syncwarp();
   }
 
   // output 1 sum per thread block
@@ -1206,11 +1207,12 @@ static void _add_diag_mat_mat_MNT(const Real alpha, const Real* M,
   }
 
   // Warp reduce to 1 element. Threads implicitly synchronized within a warp.
-  if (tid < warpSize) {
 # pragma unroll
-    for (int shift = warpSize; shift > 0; shift >>= 1) {
-      ssum[tid] += ssum[tid + shift];
-    }
+  for (int shift = warpSize; shift > 0; shift >>= 1) {
+    if (tid < warpSize) {
+      ssum[tid] += ssum[tid + shift];
+    }
+    __syncwarp();
   }
 
   // output 1 sum per thread block
@@ -1257,12 +1259,13 @@ static void _add_diag_mat_mat_MTN(const Real alpha, const Real* M,
 
   // Warp reduce to 1 element per column.
   // Threads implicitly synchronized within a warp.
-  if (tid < warpSize) {
 # pragma unroll
   for (int shift = warpSize; shift >= TileDim; shift >>= 1) {
-    ssum[tid] += ssum[tid + shift];
+    if (tid < warpSize) {
+      ssum[tid] += ssum[tid + shift];
+    }
+    __syncwarp();
   }
-  }
 
   // output TileDim sums per thread block
   if (tid < TileDim) {
@@ -1340,13 +1343,13 @@ static void _add_diag_mat_mat_MN(const Real alpha, const Real* M,
 
   // Warp reduce to 1 element per column.
   // Threads implicitly synchronized within a warp.
-  if (tid < warpSize) {
 # pragma unroll
-  for (int shift = warpSize; shift >= TileDim; shift >>= 1) {
+  for (int shift = warpSize; shift >= TileDim; shift >>= 1) {
+    if (tid < warpSize) {
       smem.sum[tid] += smem.sum[tid + shift];
     }
+    __syncwarp();
   }
-
   // output TileDim sums per thread block
   if (tid < TileDim && j_n < dim_N.cols) {
     v[j_n] = alpha * smem.sum[tid] + beta * v[j_n];
@@ -1793,10 +1796,11 @@ static void _vec_transform_reduce(
   }
 
   // Reduce last warp. Threads implicitly synchronized within a warp.
-  if (tid < warpSize) {
-    for (int shift = warpSize; shift > 0; shift >>= 1) {
+  for (int shift = warpSize; shift > 0; shift >>= 1) {
+    if (tid < warpSize) {
       sdata[tid] = op.Reduce(sdata[tid], sdata[tid + shift]);
     }
+    __syncwarp();
   }
 
   // Output to vector result.
@@ -2006,9 +2010,11 @@ static void _transform_reduce_mat_rows(
   }
 
   // Reduce last warp. Threads implicitly synchronized within a warp.
-  if (tid < warpSize) {
-    for (int shift = warpSize; shift > 0; shift >>= 1)
+  for (int shift = warpSize; shift > 0; shift >>= 1) {
+    if (tid < warpSize) {
       sdata[tid] = op.Reduce(sdata[tid], sdata[tid + shift]);
+    }
+    __syncwarp();
   }
 
   // Output to vector result.
@@ -2045,11 +2051,13 @@ static void _transform_reduce_mat_cols(
   }
 
   // Reduce last warp. Threads implicitly synchronized within a warp.
-  if (tid < warpSize) {
-    for (int shift = warpSize; shift > 0; shift >>= 1)
+  for (int shift = warpSize; shift > 0; shift >>= 1) {
+    if (tid < warpSize) {
       sdata[tid] = op.Reduce(sdata[tid], sdata[tid + shift]);
+    }
+    __syncwarp();
   }
-  
+
   // Output to vector result.
   if (tid == 0) {
     result[i] = op.PostReduce(sdata[0], result[i]);
@@ -2087,13 +2095,12 @@ static void _group_transform_reduce(
     x_idx += threads_per_group;
   }
   sreduction[tid] = treduction;
-  if (threads_per_group > warpSize) {
-    __syncthreads();
-  }
+  __syncthreads();
 
   // tree-reduce to 2x warpSize elements per group
 # pragma unroll
-  for (int shift = threads_per_group / 2; shift > warpSize; shift >>= 1) {
+  int shift = threads_per_group / 2;
+  for (; shift > warpSize; shift >>= 1) {
     if (threadIdx.x < shift) {
       sreduction[tid] = op.Reduce(sreduction[tid], sreduction[tid + shift]);
     }
@@ -2101,14 +2108,12 @@
   }
 
   // Warp-reduce to 1 element per group.
-  // Threads implicitly synchronized within the warp.
-  const int warp_reduce_size =
-      threads_per_group / 2 < warpSize ? threads_per_group / 2 : warpSize;
-  if (threadIdx.x < warp_reduce_size) {
 # pragma unroll
-    for (int shift = warp_reduce_size; shift > 0; shift >>= 1) {
+  for (; shift > 0; shift >>= 1) {
+    if (threadIdx.x < shift) {
       sreduction[tid] = op.Reduce(sreduction[tid], sreduction[tid + shift]);
     }
+    __syncwarp();
   }
 
   // Store the result.
@@ -2967,12 +2972,13 @@ static void _diff_normalize_per_row(Real *id, int id_stride, const Real *iv,
   }
 
   // reduce to 1 element per row
-  if (tid < warpSize) {
 # pragma unroll
-    for (int shift = warpSize; shift > 0; shift >>= 1) {
+  for (int shift = warpSize; shift > 0; shift >>= 1) {
+    if (tid < warpSize) {
       sprod[tid] += sprod[tid + shift];
       snorm[tid] += snorm[tid + shift];
     }
+    __syncwarp();
   }
 
   // broadcast the sum results
@@ -3254,15 +3260,16 @@ static void _find_row_max_id(const Real* mat, Real* vec_val, int32_cuda* vec_id,
   }
 
   // Warp reduce without __syncthreads()
   // (note.: synchronizes implicitly within a warp at the multiprocessor)
-  if (tid < warpSize / 2) {
 #pragma unroll
-    for (int32_cuda num_working_threads = warpSize / 2; num_working_threads > 0;
-        num_working_threads >>= 1) {
+  for (int32_cuda num_working_threads = warpSize / 2; num_working_threads > 0;
+      num_working_threads >>= 1) {
+    if (tid < warpSize / 2) {
       if (smax[tid + num_working_threads] > smax[tid]) {
        smax[tid] = smax[tid + num_working_threads];
        sidx[tid] = sidx[tid + num_working_threads];
      }
    }
+    __syncwarp();
   }
 
   if (tid == 0) {
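Note on the cu-kernels.cu changes: nearly every hunk above is the same fix (the _group_transform_reduce hunks additionally hoist the loop variable shift so the warp-reduce loop continues where the tree-reduce left off, and always call __syncthreads() after the initial store). The deleted comments ("implicitly synchronized within a warp") date from when the lanes of a warp executed in lockstep; with the independent thread scheduling introduced on Volta (CUDA 9), that implicit synchronization is gone, and a reduction wrapped in a single "if (tid < warpSize)" can read a neighbor's shared-memory slot before the neighbor has written it. Each hunk therefore moves the guard inside the loop and ends every round with __syncwarp(), a barrier that every lane reaches. Below is a minimal self-contained sketch of that pattern; the kernel name, block shape, and sizes are illustrative only, not from the patch, and it uses the textbook "tid < shift" guard where the patch keeps Kaldi's original "tid < warpSize".

// warp_reduce_sketch.cu -- requires CUDA 9+ for __syncwarp();
// compile with: nvcc warp_reduce_sketch.cu
#include <cstdio>

// Reduces each 64-element (2 * warpSize) chunk of 'in' to one float in 'out'.
__global__ void block_sum(const float *in, float *out) {
  __shared__ float smem[64];           // one partial value per thread
  const int tid = threadIdx.x;
  smem[tid] = in[blockIdx.x * blockDim.x + tid];
  __syncthreads();                     // warp 0 is about to read warp 1's values

  for (int shift = warpSize; shift > 0; shift >>= 1) {
    if (tid < shift) {
      // writes hit [0, shift), reads hit [shift, 2 * shift): disjoint ranges,
      // so there is no race within one round
      smem[tid] += smem[tid + shift];
    }
    // make this round's stores visible to the next round's loads; every lane
    // of the warp reaches this barrier, active in the reduction or not
    __syncwarp();
  }
  if (tid == 0) out[blockIdx.x] = smem[0];
}

int main() {
  const int n = 128;                   // two blocks of 64 threads
  float h_in[n], h_out[2];
  for (int i = 0; i < n; ++i) h_in[i] = 1.0f;
  float *d_in, *d_out;
  cudaMalloc(&d_in, sizeof(h_in));
  cudaMalloc(&d_out, sizeof(h_out));
  cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);
  block_sum<<<2, 64>>>(d_in, d_out);
  cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
  printf("%g %g\n", h_out[0], h_out[1]);  // expect: 64 64
  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}

The same reductions could also be written with __shfl_down_sync() and no shared memory at all, but the patch keeps the shared-memory form, which leaves the surrounding kernel code untouched.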
diff --git a/src/cudamatrix/cu-sparse-matrix-test.cc b/src/cudamatrix/cu-sparse-matrix-test.cc
index aad34b5dd54..0c2230a8731 100644
--- a/src/cudamatrix/cu-sparse-matrix-test.cc
+++ b/src/cudamatrix/cu-sparse-matrix-test.cc
@@ -125,8 +125,8 @@ static void UnitTestCuSparseMatrixSelectRowsAndTranspose() {
 
 template <typename Real>
 static void UnitTestCuSparseMatrixTraceMatSmat() {
   for (int32 i = 0; i < 2; i++) {
-    MatrixIndexT row = 10 + Rand() % 40;
-    MatrixIndexT col = 10 + Rand() % 50;
+    MatrixIndexT row = 2 + Rand() % 3;
+    MatrixIndexT col = 1 + Rand() % 4;
     CuMatrix<Real> mat1(row, col);
     CuMatrix<Real> mat2(col, row);
@@ -147,11 +147,13 @@
     cu_smat2.CopyToMat(&mat2);
 
     Real trace1 = TraceMatMat(mat3, mat1, kTrans);
+
     Real trace2 = TraceMatSmat(mat3, cu_smat1, kTrans);
     AssertEqual(trace1, trace2, 0.00001);
 
     trace1 = TraceMatMat(mat3, mat2, kNoTrans);
     trace2 = TraceMatSmat(mat3, cu_smat2, kNoTrans);
+
     AssertEqual(trace1, trace2, 0.00001);
   }
 }
diff --git a/src/cudamatrix/cu-sparse-matrix.cc b/src/cudamatrix/cu-sparse-matrix.cc
index 703aa40e735..f24613fa231 100644
--- a/src/cudamatrix/cu-sparse-matrix.cc
+++ b/src/cudamatrix/cu-sparse-matrix.cc
@@ -161,7 +161,7 @@ void CuSparseMatrix<Real>::SelectRows(const CuArray<int32> &row_indexes,
 template <typename Real>
 CuSparseMatrix<Real>::CuSparseMatrix(const CuArray<int32> &indexes, int32 dim,
                                      MatrixTransposeType trans) :
-    num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_col_idx_(NULL), csr_val_(
+    num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_(NULL), csr_col_idx_(NULL), csr_val_(
         NULL) {
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
@@ -194,8 +194,8 @@ template <typename Real>
 CuSparseMatrix<Real>::CuSparseMatrix(const CuArray<int32> &indexes,
                                      const CuVectorBase<Real> &weights,
                                      int32 dim, MatrixTransposeType trans) :
-    num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_col_idx_(NULL), csr_val_(
-        NULL) {
+    num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_(NULL), csr_col_idx_(NULL),
+    csr_val_(NULL) {
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     Resize(indexes.Dim(), dim, indexes.Dim(), kUndefined);
@@ -266,8 +266,9 @@ void CuSparseMatrix<Real>::Resize(const MatrixIndexT num_rows,
     num_rows_ = 0;
     num_cols_ = 0;
     nnz_ = 0;
-    csr_row_ptr_col_idx_ = static_cast<int*>(CuDevice::Instantiate().Malloc(
+    csr_row_ptr_ = static_cast<int*>(CuDevice::Instantiate().Malloc(
         1 * sizeof(int)));
+    csr_col_idx_ = NULL;  // may be freed, but this is allowed.
     csr_val_ = NULL;
   } else {
     KALDI_ASSERT(num_rows > 0);
@@ -277,10 +278,16 @@ void CuSparseMatrix<Real>::Resize(const MatrixIndexT num_rows,
     num_rows_ = num_rows;
     num_cols_ = num_cols;
     nnz_ = nnz;
-    csr_row_ptr_col_idx_ = static_cast<int*>(CuDevice::Instantiate().Malloc(
-        (num_rows + 1 + nnz) * sizeof(int)));
-    csr_val_ = static_cast<Real*>(CuDevice::Instantiate().Malloc(
+    csr_row_ptr_ = static_cast<int*>(CuDevice::Instantiate().Malloc((num_rows + 1) * sizeof(int)));
+    if (nnz > 0) {
+      csr_col_idx_ = static_cast<int*>(CuDevice::Instantiate().Malloc(
+          nnz * sizeof(int)));
+      csr_val_ = static_cast<Real*>(CuDevice::Instantiate().Malloc(
         nnz * sizeof(Real)));
+    } else {
+      csr_col_idx_ = NULL;
+      csr_val_ = NULL;
+    }
     CuSubArray<int> row_ptr(CsrRowPtr(), NumRows() + 1);
     row_ptr.Set(nnz);
     if (resize_type == kSetZero) {
@@ -302,8 +309,11 @@ void CuSparseMatrix<Real>::Destroy() {
 #if HAVE_CUDA == 1
   if (CuDevice::Instantiate().Enabled()) {
     CuTimer tim;
-    if (csr_row_ptr_col_idx_) {
-      CuDevice::Instantiate().Free(csr_row_ptr_col_idx_);
+    if (csr_row_ptr_) {
+      CuDevice::Instantiate().Free(csr_row_ptr_);
+    }
+    if (csr_col_idx_) {
+      CuDevice::Instantiate().Free(csr_col_idx_);
     }
     if (csr_val_) {
       CuDevice::Instantiate().Free(csr_val_);
@@ -311,7 +321,8 @@
     num_rows_ = 0;
     num_cols_ = 0;
     nnz_ = 0;
-    csr_row_ptr_col_idx_ = NULL;
+    csr_row_ptr_ = NULL;
+    csr_col_idx_ = NULL;
     csr_val_ = NULL;
     CuDevice::Instantiate().AccuProfile(__func__, tim);
   } else
@@ -378,11 +389,17 @@ void CuSparseMatrix<Real>::CopyFromSmat(const CuSparseMatrix<Real>& smat,
     CuSubVector<Real> val_from(smat.CsrVal(), smat.NumElements());
     val_to.CopyFromVec(val_from);
 
-    CuSubArray<int> idx_to(csr_row_ptr_col_idx_,
-                           NumRows() + 1 + NumElements());
-    CuSubArray<int> idx_from(smat.csr_row_ptr_col_idx_,
-                             smat.NumRows() + 1 + smat.NumElements());
-    idx_to.CopyFromArray(idx_from);
+    {
+      CuSubArray<int> idx_to(csr_row_ptr_, NumRows() + 1);
+      CuSubArray<int> idx_from(smat.csr_row_ptr_, NumRows() + 1);
+      idx_to.CopyFromArray(idx_from);
+    }
+
+    {
+      CuSubArray<int> idx_to(csr_col_idx_, NumElements());
+      CuSubArray<int> idx_from(smat.csr_col_idx_, NumElements());
+      idx_to.CopyFromArray(idx_from);
+    }
 
   } else {
     Resize(smat.NumCols(), smat.NumRows(), smat.NumElements(), kUndefined);
@@ -413,9 +430,14 @@ void CuSparseMatrix<Real>::CopyToSmat(SparseMatrix<Real> *smat) const {
       smat->Resize(0, 0);
       return;
     }
-    CuSubArray<int> idx(csr_row_ptr_col_idx_, NumRows() + 1 + NumElements());
-    std::vector<int> idx_cpu;
-    idx.CopyToVec(&idx_cpu);
+    CuSubArray<int> row_ptr(csr_row_ptr_, NumRows() + 1);
+    std::vector<int> row_ptr_cpu;
+    row_ptr.CopyToVec(&row_ptr_cpu);
+
+
+    CuSubArray<int> col_idx(csr_col_idx_, NumElements());
+    std::vector<int> col_idx_cpu;
+    col_idx.CopyToVec(&col_idx_cpu);
 
     CuSubVector<Real> val(CsrVal(), NumElements());
     Vector<Real> val_cpu(NumElements(), kUndefined);
@@ -425,8 +447,8 @@ void CuSparseMatrix<Real>::CopyToSmat(SparseMatrix<Real> *smat) const {
         NumRows());
     int n = 0;
     for (int i = 0; i < NumRows(); ++i) {
-      for (; n < idx_cpu[i + 1]; ++n) {
-        const MatrixIndexT j = idx_cpu[NumRows() + 1 + n];
+      for (; n < row_ptr_cpu[i + 1]; ++n) {
+        const MatrixIndexT j = col_idx_cpu[n];
         pairs[i].push_back( { j, val_cpu(n) });
       }
     }
@@ -484,7 +506,8 @@ void CuSparseMatrix<Real>::Swap(CuSparseMatrix<Real> *smat) {
     std::swap(num_rows_, smat->num_rows_);
     std::swap(num_cols_, smat->num_cols_);
     std::swap(nnz_, smat->nnz_);
-    std::swap(csr_row_ptr_col_idx_, smat->csr_row_ptr_col_idx_);
+    std::swap(csr_row_ptr_, smat->csr_row_ptr_);
+    std::swap(csr_col_idx_, smat->csr_col_idx_);
     std::swap(csr_val_, smat->csr_val_);
   } else
 #endif
diff --git a/src/cudamatrix/cu-sparse-matrix.h b/src/cudamatrix/cu-sparse-matrix.h
index 82b17a0dc71..180beed6183 100644
--- a/src/cudamatrix/cu-sparse-matrix.h
+++ b/src/cudamatrix/cu-sparse-matrix.h
@@ -121,13 +121,13 @@ class CuSparseMatrix {
 
   /// Default constructor
   CuSparseMatrix() :
-      num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_col_idx_(NULL), csr_val_(
+      num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_(NULL), csr_col_idx_(NULL), csr_val_(
           NULL) {
   }
 
   /// Constructor from CPU-based sparse matrix.
   explicit CuSparseMatrix(const SparseMatrix<Real> &smat) :
-      num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_col_idx_(NULL), csr_val_(
+      num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_(NULL), csr_col_idx_(NULL), csr_val_(
           NULL) {
     this->CopyFromSmat(smat);
   }
@@ -135,7 +135,7 @@ class CuSparseMatrix {
   /// Constructor from GPU-based sparse matrix (supports transposition).
   CuSparseMatrix(const CuSparseMatrix<Real> &smat,
                  MatrixTransposeType trans = kNoTrans) :
-      num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_col_idx_(NULL), csr_val_(
+      num_rows_(0), num_cols_(0), nnz_(0), csr_row_ptr_(NULL), csr_col_idx_(NULL), csr_val_(
           NULL) {
     this->CopyFromSmat(smat, trans);
   }
@@ -200,19 +200,19 @@ class CuSparseMatrix {
   /// indices of the first nonzero element in the i-th row, while the last entry
   /// contains nnz_, as zero-based CSR format is used.
   const int* CsrRowPtr() const {
-    return csr_row_ptr_col_idx_;
+    return csr_row_ptr_;
   }
   int* CsrRowPtr() {
-    return csr_row_ptr_col_idx_;
+    return csr_row_ptr_;
   }
 
   /// Returns pointer to the integer array of length nnz_ that contains
   /// the column indices of the corresponding elements in array CsrVal()
   const int* CsrColIdx() const {
-    return csr_row_ptr_col_idx_ + num_rows_ + 1;
+    return csr_col_idx_;
   }
   int* CsrColIdx() {
-    return csr_row_ptr_col_idx_ + num_rows_ + 1;
+    return csr_col_idx_;
   }
 
  private:
@@ -238,9 +238,10 @@ class CuSparseMatrix {
   // number of non-zeros
   MatrixIndexT nnz_;
 
-  // csr row ptrs and col indices in a single int array
-  // of the length (num_rows_ + 1 + nnz_)
-  int* csr_row_ptr_col_idx_;
+  // length num_rows_ + 1
+  int* csr_row_ptr_;
+  // length nnz_
+  int* csr_col_idx_;
 
   // csr value array of the length nnz_
   Real* csr_val_;
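Note on the sparse-matrix changes: the old csr_row_ptr_col_idx_ member packed the CSR row pointers and column indices back to back in one allocation of num_rows_ + 1 + nnz_ ints, with CsrColIdx() computed as an offset into it. The patch splits them into csr_row_ptr_ (length num_rows_ + 1) and csr_col_idx_ (length nnz_), which lets csr_col_idx_ and csr_val_ stay NULL when nnz_ == 0 while an empty matrix still keeps a valid one-int row-pointer array for callers of CsrRowPtr(). Below is a small host-side sketch of the zero-based CSR layout the three arrays implement; the example matrix and the printout are made up for illustration, not taken from Kaldi.

// csr_sketch.cc -- zero-based CSR for the 2 x 4 matrix
//   [ 0 5 0 7 ]
//   [ 3 0 0 0 ]
#include <cstdio>

int main() {
  const int num_rows = 2;                // nnz == 3
  const int csr_row_ptr[] = {0, 2, 3};   // length num_rows + 1; row i occupies
                                         // entries [csr_row_ptr[i], csr_row_ptr[i+1])
  const int csr_col_idx[] = {1, 3, 0};   // length nnz; column of each stored value
  const float csr_val[]   = {5, 7, 3};   // length nnz; the stored values

  // Walk every stored element; this is essentially the loop in the patched
  // CopyToSmat(), which carries one running index n across all rows.
  for (int i = 0, n = 0; i < num_rows; ++i)
    for (; n < csr_row_ptr[i + 1]; ++n)
      printf("(%d, %d) = %g\n", i, csr_col_idx[n], csr_val[n]);
  return 0;
}

With the packed layout, any consumer that wanted only the column indices had to address past the row pointers too, as the old CopyToSmat() did via idx_cpu[NumRows() + 1 + n]; with separate members, each transfer in CopyFromSmat() and CopyToSmat() now copies exactly one logically distinct array.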