From 870778f2747e41c214b7d1173411a84cb2803e20 Mon Sep 17 00:00:00 2001 From: Haowen Qiu Date: Wed, 5 Feb 2020 09:38:40 +0800 Subject: [PATCH] remove natural gradient --- egs/aishell/s10/progress.70.log | 179 ++++++++++++++++++ .../steps/libs/nnet3/xconfig/basic_layers.py | 8 +- .../libs/nnet3/xconfig/composite_layers.py | 8 +- .../libs/nnet3/xconfig/trivial_layers.py | 2 +- 4 files changed, 188 insertions(+), 9 deletions(-) create mode 100644 egs/aishell/s10/progress.70.log diff --git a/egs/aishell/s10/progress.70.log b/egs/aishell/s10/progress.70.log new file mode 100644 index 00000000000..a99c9890fd5 --- /dev/null +++ b/egs/aishell/s10/progress.70.log @@ -0,0 +1,179 @@ +# Running on c3-asr-train-i2-03.bj +# Started at Wed Feb 5 03:34:34 CST 2020 +# nnet3-am-info exp/chain_cleaned_1c_rd_rmc_rng/tdnn1c_sp/70.mdl && nnet3-show-progress --use-gpu=no exp/chain_cleaned_1c_rd_rmc_rng/tdnn1c_sp/69.mdl exp/chain_cleaned_1c_rd_rmc_rng/tdnn1c_sp/70.mdl +nnet3-am-info exp/chain_cleaned_1c_rd_rmc_rng/tdnn1c_sp/70.mdl +input-dim: 40 +ivector-dim: -1 +num-pdfs: 3456 +prior-dimension: 0 +# Nnet info follows. +left-context: 28 +right-context: 28 +num-parameters: 9254656 +modulus: 1 +input-node name=input dim=40 +component-node name=lda component=lda input=Append(Offset(input, -1), input, Offset(input, 1)) input-dim=120 output-dim=120 +component-node name=tdnn1.affine component=tdnn1.affine input=lda input-dim=120 output-dim=1024 +component-node name=tdnn1.relu component=tdnn1.relu input=tdnn1.affine input-dim=1024 output-dim=1024 +component-node name=tdnn1.batchnorm component=tdnn1.batchnorm input=tdnn1.relu input-dim=1024 output-dim=1024 +component-node name=tdnnf2.linear component=tdnnf2.linear input=tdnn1.batchnorm input-dim=1024 output-dim=128 +component-node name=tdnnf2.affine component=tdnnf2.affine input=tdnnf2.linear input-dim=128 output-dim=1024 +component-node name=tdnnf2.relu component=tdnnf2.relu input=tdnnf2.affine input-dim=1024 output-dim=1024 +component-node name=tdnnf2.batchnorm component=tdnnf2.batchnorm input=tdnnf2.relu input-dim=1024 output-dim=1024 +component-node name=tdnnf2.noop component=tdnnf2.noop input=Sum(Scale(0.66, tdnn1.batchnorm), tdnnf2.batchnorm) input-dim=1024 output-dim=1024 +component-node name=tdnnf3.linear component=tdnnf3.linear input=tdnnf2.noop input-dim=1024 output-dim=128 +component-node name=tdnnf3.affine component=tdnnf3.affine input=tdnnf3.linear input-dim=128 output-dim=1024 +component-node name=tdnnf3.relu component=tdnnf3.relu input=tdnnf3.affine input-dim=1024 output-dim=1024 +component-node name=tdnnf3.batchnorm component=tdnnf3.batchnorm input=tdnnf3.relu input-dim=1024 output-dim=1024 +component-node name=tdnnf3.noop component=tdnnf3.noop input=Sum(Scale(0.66, tdnnf2.noop), tdnnf3.batchnorm) input-dim=1024 output-dim=1024 +component-node name=tdnnf4.linear component=tdnnf4.linear input=tdnnf3.noop input-dim=1024 output-dim=128 +component-node name=tdnnf4.affine component=tdnnf4.affine input=tdnnf4.linear input-dim=128 output-dim=1024 +component-node name=tdnnf4.relu component=tdnnf4.relu input=tdnnf4.affine input-dim=1024 output-dim=1024 +component-node name=tdnnf4.batchnorm component=tdnnf4.batchnorm input=tdnnf4.relu input-dim=1024 output-dim=1024 +component-node name=tdnnf4.noop component=tdnnf4.noop input=Sum(Scale(0.66, tdnnf3.noop), tdnnf4.batchnorm) input-dim=1024 output-dim=1024 +component-node name=tdnnf5.linear component=tdnnf5.linear input=tdnnf4.noop input-dim=1024 output-dim=128 +component-node name=tdnnf5.affine component=tdnnf5.affine 
input=tdnnf5.linear input-dim=128 output-dim=1024 +component-node name=tdnnf5.relu component=tdnnf5.relu input=tdnnf5.affine input-dim=1024 output-dim=1024 +component-node name=tdnnf5.batchnorm component=tdnnf5.batchnorm input=tdnnf5.relu input-dim=1024 output-dim=1024 +component-node name=tdnnf5.noop component=tdnnf5.noop input=Sum(Scale(0.66, tdnnf4.noop), tdnnf5.batchnorm) input-dim=1024 output-dim=1024 +component-node name=tdnnf6.linear component=tdnnf6.linear input=tdnnf5.noop input-dim=1024 output-dim=128 +component-node name=tdnnf6.affine component=tdnnf6.affine input=tdnnf6.linear input-dim=128 output-dim=1024 +component-node name=tdnnf6.relu component=tdnnf6.relu input=tdnnf6.affine input-dim=1024 output-dim=1024 +component-node name=tdnnf6.batchnorm component=tdnnf6.batchnorm input=tdnnf6.relu input-dim=1024 output-dim=1024 +component-node name=tdnnf6.noop component=tdnnf6.noop input=Sum(Scale(0.66, tdnnf5.noop), tdnnf6.batchnorm) input-dim=1024 output-dim=1024 +component-node name=tdnnf7.linear component=tdnnf7.linear input=tdnnf6.noop input-dim=1024 output-dim=128 +component-node name=tdnnf7.affine component=tdnnf7.affine input=tdnnf7.linear input-dim=128 output-dim=1024 +component-node name=tdnnf7.relu component=tdnnf7.relu input=tdnnf7.affine input-dim=1024 output-dim=1024 +component-node name=tdnnf7.batchnorm component=tdnnf7.batchnorm input=tdnnf7.relu input-dim=1024 output-dim=1024 +component-node name=tdnnf7.noop component=tdnnf7.noop input=Sum(Scale(0.66, tdnnf6.noop), tdnnf7.batchnorm) input-dim=1024 output-dim=1024 +component-node name=tdnnf8.linear component=tdnnf8.linear input=tdnnf7.noop input-dim=1024 output-dim=128 +component-node name=tdnnf8.affine component=tdnnf8.affine input=tdnnf8.linear input-dim=128 output-dim=1024 +component-node name=tdnnf8.relu component=tdnnf8.relu input=tdnnf8.affine input-dim=1024 output-dim=1024 +component-node name=tdnnf8.batchnorm component=tdnnf8.batchnorm input=tdnnf8.relu input-dim=1024 output-dim=1024 +component-node name=tdnnf8.noop component=tdnnf8.noop input=Sum(Scale(0.66, tdnnf7.noop), tdnnf8.batchnorm) input-dim=1024 output-dim=1024 +component-node name=tdnnf9.linear component=tdnnf9.linear input=tdnnf8.noop input-dim=1024 output-dim=128 +component-node name=tdnnf9.affine component=tdnnf9.affine input=tdnnf9.linear input-dim=128 output-dim=1024 +component-node name=tdnnf9.relu component=tdnnf9.relu input=tdnnf9.affine input-dim=1024 output-dim=1024 +component-node name=tdnnf9.batchnorm component=tdnnf9.batchnorm input=tdnnf9.relu input-dim=1024 output-dim=1024 +component-node name=tdnnf9.noop component=tdnnf9.noop input=Sum(Scale(0.66, tdnnf8.noop), tdnnf9.batchnorm) input-dim=1024 output-dim=1024 +component-node name=tdnnf10.linear component=tdnnf10.linear input=tdnnf9.noop input-dim=1024 output-dim=128 +component-node name=tdnnf10.affine component=tdnnf10.affine input=tdnnf10.linear input-dim=128 output-dim=1024 +component-node name=tdnnf10.relu component=tdnnf10.relu input=tdnnf10.affine input-dim=1024 output-dim=1024 +component-node name=tdnnf10.batchnorm component=tdnnf10.batchnorm input=tdnnf10.relu input-dim=1024 output-dim=1024 +component-node name=tdnnf10.noop component=tdnnf10.noop input=Sum(Scale(0.66, tdnnf9.noop), tdnnf10.batchnorm) input-dim=1024 output-dim=1024 +component-node name=tdnnf11.linear component=tdnnf11.linear input=tdnnf10.noop input-dim=1024 output-dim=128 +component-node name=tdnnf11.affine component=tdnnf11.affine input=tdnnf11.linear input-dim=128 output-dim=1024 +component-node 
name=tdnnf11.relu component=tdnnf11.relu input=tdnnf11.affine input-dim=1024 output-dim=1024 +component-node name=tdnnf11.batchnorm component=tdnnf11.batchnorm input=tdnnf11.relu input-dim=1024 output-dim=1024 +component-node name=tdnnf11.noop component=tdnnf11.noop input=Sum(Scale(0.66, tdnnf10.noop), tdnnf11.batchnorm) input-dim=1024 output-dim=1024 +component-node name=tdnnf12.linear component=tdnnf12.linear input=tdnnf11.noop input-dim=1024 output-dim=128 +component-node name=tdnnf12.affine component=tdnnf12.affine input=tdnnf12.linear input-dim=128 output-dim=1024 +component-node name=tdnnf12.relu component=tdnnf12.relu input=tdnnf12.affine input-dim=1024 output-dim=1024 +component-node name=tdnnf12.batchnorm component=tdnnf12.batchnorm input=tdnnf12.relu input-dim=1024 output-dim=1024 +component-node name=tdnnf12.noop component=tdnnf12.noop input=Sum(Scale(0.66, tdnnf11.noop), tdnnf12.batchnorm) input-dim=1024 output-dim=1024 +component-node name=tdnnf13.linear component=tdnnf13.linear input=tdnnf12.noop input-dim=1024 output-dim=128 +component-node name=tdnnf13.affine component=tdnnf13.affine input=tdnnf13.linear input-dim=128 output-dim=1024 +component-node name=tdnnf13.relu component=tdnnf13.relu input=tdnnf13.affine input-dim=1024 output-dim=1024 +component-node name=tdnnf13.batchnorm component=tdnnf13.batchnorm input=tdnnf13.relu input-dim=1024 output-dim=1024 +component-node name=tdnnf13.noop component=tdnnf13.noop input=Sum(Scale(0.66, tdnnf12.noop), tdnnf13.batchnorm) input-dim=1024 output-dim=1024 +component-node name=prefinal-l component=prefinal-l input=tdnnf13.noop input-dim=1024 output-dim=256 +component-node name=prefinal-chain.affine component=prefinal-chain.affine input=prefinal-l input-dim=256 output-dim=1024 +component-node name=prefinal-chain.relu component=prefinal-chain.relu input=prefinal-chain.affine input-dim=1024 output-dim=1024 +component-node name=prefinal-chain.batchnorm1 component=prefinal-chain.batchnorm1 input=prefinal-chain.relu input-dim=1024 output-dim=1024 +component-node name=prefinal-chain.linear component=prefinal-chain.linear input=prefinal-chain.batchnorm1 input-dim=1024 output-dim=256 +component-node name=prefinal-chain.batchnorm2 component=prefinal-chain.batchnorm2 input=prefinal-chain.linear input-dim=256 output-dim=256 +component-node name=output.affine component=output.affine input=prefinal-chain.batchnorm2 input-dim=256 output-dim=3456 +output-node name=output input=output.affine dim=3456 objective=linear +component-node name=prefinal-xent.affine component=prefinal-xent.affine input=prefinal-l input-dim=256 output-dim=1024 +component-node name=prefinal-xent.relu component=prefinal-xent.relu input=prefinal-xent.affine input-dim=1024 output-dim=1024 +component-node name=prefinal-xent.batchnorm1 component=prefinal-xent.batchnorm1 input=prefinal-xent.relu input-dim=1024 output-dim=1024 +component-node name=prefinal-xent.linear component=prefinal-xent.linear input=prefinal-xent.batchnorm1 input-dim=1024 output-dim=256 +component-node name=prefinal-xent.batchnorm2 component=prefinal-xent.batchnorm2 input=prefinal-xent.linear input-dim=256 output-dim=256 +component-node name=output-xent.affine component=output-xent.affine input=prefinal-xent.batchnorm2 input-dim=256 output-dim=3456 +component-node name=output-xent.log-softmax component=output-xent.log-softmax input=output-xent.affine input-dim=3456 output-dim=3456 +output-node name=output-xent input=output-xent.log-softmax dim=3456 objective=linear +component name=lda type=FixedAffineComponent, 
input-dim=120, output-dim=120, linear-params-rms=0.003114, bias-{mean,stddev}=-0.0003382,0.002592 +component name=tdnn1.affine type=AffineComponent, input-dim=120, output-dim=1024, learning-rate=0.000210434, l2-regularize=0.008, linear-params-rms=0.03729, linear-params-row-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.20,0.21,0.21,0.22 0.23,0.25,0.38,0.51,0.56 0.60,0.65,0.67,0.98), mean=0.388, stddev=0.128], linear-params-col-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.73,0.78,0.78,0.79 0.79,0.83,0.95,1.6,1.7 1.9,2.0,2.1,2.1), mean=1.13, stddev=0.374], bias-{mean,stddev}=-0.008923,0.0711 +component name=tdnn1.relu type=RectifiedLinearComponent, dim=1024, self-repair-scale=1e-05, count=3.41e+06, self-repaired-proportion=0.0445918, value-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.0003,0.0005,0.0006,0.0008 0.001,0.002,0.02,0.07,0.09 0.11,0.14,0.14,0.19), mean=0.036, stddev=0.0382], deriv-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.05,0.06,0.06,0.07 0.07,0.09,0.37,0.85,0.93 0.94,0.94,0.94,1.0), mean=0.439, stddev=0.338], oderiv-rms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.02,0.03,0.03,0.04 0.04,0.04,0.05,0.07,0.07 0.08,0.08,0.09,0.14), mean=0.0551, stddev=0.0135], oderiv-count=5.15834e+06 +component name=tdnn1.batchnorm type=BatchNormComponent, dim=1024, block-dim=1024, epsilon=0.001, target-rms=1, count=48057.7, test-mode=false, data-mean=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.0003,0.0005,0.0006,0.0008 0.001,0.002,0.03,0.07,0.09 0.11,0.14,0.15,0.19), mean=0.0362, stddev=0.0381], data-stddev=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.002,0.003,0.003,0.004 0.005,0.008,0.04,0.06,0.08 0.09,0.11,0.12,0.18), mean=0.0377, stddev=0.0292] +component name=tdnnf2.linear type=TdnnComponent, input-dim=1024, output-dim=128, learning-rate=0.000210434, l2-regularize=0.008, orthonormal-constraint=-1, time-offsets=-1,0, linear-params-rms=0.01914, linear-params-row-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.87,0.87,0.87,0.87 0.87,0.87,0.87,0.87,0.87 0.87,0.87,0.87,0.87), mean=0.866, stddev=0.000345], linear-params-col-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.11,0.14,0.15,0.15 0.16,0.17,0.21,0.25,0.26 0.28,0.30,0.31,0.38), mean=0.213, stddev=0.0404], has-bias=false, use-natural-gradient=false +component name=tdnnf2.affine type=TdnnComponent, input-dim=128, output-dim=1024, learning-rate=0.000210434, l2-regularize=0.008, time-offsets=0,1, linear-params-rms=0.0218, linear-params-row-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.20,0.27,0.27,0.29 0.29,0.30,0.33,0.37,0.40 0.46,0.55,0.61,0.85), mean=0.343, stddev=0.0633], linear-params-col-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.62,0.62,0.63,0.63 0.64,0.66,0.69,0.73,0.75 0.77,0.79,0.80,0.82), mean=0.696, stddev=0.0405], bias-{mean,stddev}=-0.04018,0.2477, use-natural-gradient=false +component name=tdnnf2.relu type=RectifiedLinearComponent, dim=1024, self-repair-scale=1e-05, count=3.28e+06, self-repaired-proportion=0.00019611, value-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.006,0.02,0.03,0.05 0.07,0.10,0.22,0.35,0.42 0.48,0.56,0.59,0.74), mean=0.233, stddev=0.138], deriv-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.07,0.11,0.13,0.18 0.23,0.29,0.47,0.62,0.68 0.73,0.79,0.83,0.90), mean=0.463, stddev=0.173], oderiv-rms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.002,0.003,0.003,0.003 0.004,0.004,0.006,0.01,0.01 0.02,0.02,0.02,0.04), mean=0.00717, stddev=0.00412], 
oderiv-count=5.20516e+06 +component name=tdnnf2.batchnorm type=BatchNormComponent, dim=1024, block-dim=1024, epsilon=0.001, target-rms=1, count=47545.7, test-mode=false, data-mean=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.006,0.03,0.03,0.05 0.07,0.10,0.22,0.35,0.42 0.48,0.56,0.60,0.74), mean=0.235, stddev=0.137], data-stddev=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.03,0.11,0.13,0.17 0.20,0.24,0.34,0.43,0.48 0.52,0.57,0.62,0.78), mean=0.339, stddev=0.112] +component name=tdnnf2.noop type=NoOpComponent, dim=1024 +component name=tdnnf3.linear type=TdnnComponent, input-dim=1024, output-dim=128, learning-rate=0.000210434, l2-regularize=0.008, orthonormal-constraint=-1, time-offsets=-1,0, linear-params-rms=0.01953, linear-params-row-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.88,0.88,0.88,0.88 0.88,0.88,0.88,0.88,0.88 0.88,0.88,0.88,0.88), mean=0.884, stddev=0.000345], linear-params-col-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.14,0.16,0.17,0.17 0.18,0.19,0.21,0.24,0.26 0.28,0.30,0.31,0.35), mean=0.219, stddev=0.0324], has-bias=false, use-natural-gradient=false +component name=tdnnf3.affine type=TdnnComponent, input-dim=128, output-dim=1024, learning-rate=0.000210434, l2-regularize=0.008, time-offsets=0,1, linear-params-rms=0.02007, linear-params-row-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.17,0.27,0.27,0.28 0.29,0.30,0.32,0.34,0.35 0.37,0.41,0.42,0.49), mean=0.32, stddev=0.0301], linear-params-col-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.56,0.56,0.58,0.58 0.59,0.61,0.64,0.67,0.69 0.70,0.71,0.72,0.74), mean=0.641, stddev=0.036], bias-{mean,stddev}=-0.0286,0.2409, use-natural-gradient=false +component name=tdnnf3.relu type=RectifiedLinearComponent, dim=1024, self-repair-scale=1e-05, count=3.35e+06, self-repaired-proportion=8.02105e-06, value-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.01,0.04,0.05,0.07 0.10,0.14,0.26,0.39,0.46 0.51,0.58,0.62,0.85), mean=0.269, stddev=0.136], deriv-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.08,0.13,0.16,0.20 0.25,0.33,0.49,0.61,0.67 0.71,0.74,0.78,0.85), mean=0.475, stddev=0.155], oderiv-rms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.002,0.003,0.003,0.003 0.003,0.004,0.005,0.006,0.008 0.009,0.01,0.01,0.04), mean=0.00511, stddev=0.00232], oderiv-count=5.26352e+06 +component name=tdnnf3.batchnorm type=BatchNormComponent, dim=1024, block-dim=1024, epsilon=0.001, target-rms=1, count=47033.7, test-mode=false, data-mean=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.01,0.05,0.05,0.07 0.10,0.14,0.26,0.39,0.46 0.51,0.58,0.61,0.85), mean=0.269, stddev=0.135], data-stddev=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.05,0.15,0.19,0.22 0.26,0.30,0.40,0.47,0.52 0.56,0.59,0.64,0.81), mean=0.393, stddev=0.104] +component name=tdnnf3.noop type=NoOpComponent, dim=1024 +component name=tdnnf4.linear type=TdnnComponent, input-dim=1024, output-dim=128, learning-rate=0.000210434, l2-regularize=0.008, orthonormal-constraint=-1, time-offsets=-1,0, linear-params-rms=0.01788, linear-params-row-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.81,0.81,0.81,0.81 0.81,0.81,0.81,0.81,0.81 0.81,0.81,0.81,0.81), mean=0.809, stddev=0.000345], linear-params-col-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.14,0.16,0.16,0.17 0.17,0.18,0.20,0.22,0.23 0.24,0.26,0.27,0.32), mean=0.201, stddev=0.0237], has-bias=false, use-natural-gradient=false +component name=tdnnf4.affine type=TdnnComponent, input-dim=128, output-dim=1024, 
learning-rate=0.000210434, l2-regularize=0.008, time-offsets=0,1, linear-params-rms=0.01928, linear-params-row-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.18,0.26,0.27,0.28 0.28,0.29,0.31,0.32,0.33 0.34,0.35,0.36,0.41), mean=0.308, stddev=0.0213], linear-params-col-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.55,0.56,0.56,0.57 0.57,0.59,0.61,0.65,0.66 0.67,0.68,0.69,0.70), mean=0.616, stddev=0.0328], bias-{mean,stddev}=-0.01463,0.2537, use-natural-gradient=false +component name=tdnnf4.relu type=RectifiedLinearComponent, dim=1024, self-repair-scale=1e-05, count=1.12e+06, self-repaired-proportion=0.00388304, value-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(4e-05,0.05,0.06,0.09 0.12,0.17,0.28,0.42,0.50 0.57,0.63,0.68,0.88), mean=0.301, stddev=0.147], deriv-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.0007,0.13,0.15,0.21 0.27,0.35,0.48,0.62,0.68 0.72,0.77,0.78,0.90), mean=0.48, stddev=0.154], oderiv-rms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.004,0.004,0.005,0.005 0.005,0.006,0.007,0.01,0.01 0.01,0.02,0.02,0.11), mean=0.00834, stddev=0.00599], oderiv-count=1.7128e+06 +component name=tdnnf4.batchnorm type=BatchNormComponent, dim=1024, block-dim=1024, epsilon=0.001, target-rms=1, count=15677.9, test-mode=false, data-mean=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.0001,0.05,0.07,0.09 0.12,0.17,0.29,0.42,0.50 0.57,0.63,0.68,0.88), mean=0.303, stddev=0.146], data-stddev=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.003,0.20,0.22,0.27 0.30,0.34,0.45,0.53,0.57 0.60,0.65,0.67,0.82), mean=0.442, stddev=0.109] +component name=tdnnf4.noop type=NoOpComponent, dim=1024 +component name=tdnnf5.linear type=TdnnComponent, input-dim=1024, output-dim=128, learning-rate=0.000210434, l2-regularize=0.008, orthonormal-constraint=-1, time-offsets=0, linear-params-rms=0.02112, linear-params-row-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.68,0.68,0.68,0.68 0.68,0.68,0.68,0.68,0.68 0.68,0.68,0.68,0.68), mean=0.676, stddev=-nan], linear-params-col-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.18,0.19,0.20,0.21 0.21,0.22,0.24,0.25,0.26 0.27,0.29,0.29,0.32), mean=0.238, stddev=0.0209], has-bias=false, use-natural-gradient=false +component name=tdnnf5.affine type=TdnnComponent, input-dim=128, output-dim=1024, learning-rate=0.000210434, l2-regularize=0.008, time-offsets=0, linear-params-rms=0.02522, linear-params-row-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.20,0.24,0.25,0.26 0.26,0.27,0.28,0.30,0.31 0.32,0.33,0.34,0.40), mean=0.285, stddev=0.0196], linear-params-col-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.75,0.75,0.75,0.77 0.77,0.78,0.81,0.82,0.84 0.85,0.86,0.87,0.92), mean=0.806, stddev=0.0279], bias-{mean,stddev}=-0.01854,0.2426, use-natural-gradient=false +component name=tdnnf5.relu type=RectifiedLinearComponent, dim=1024, self-repair-scale=1e-05, count=1.13e+06, self-repaired-proportion=0.00190954, value-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.0003,0.02,0.03,0.04 0.06,0.09,0.19,0.32,0.39 0.46,0.56,0.61,0.82), mean=0.213, stddev=0.136], deriv-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.007,0.08,0.09,0.14 0.20,0.28,0.47,0.63,0.70 0.76,0.82,0.85,0.93), mean=0.461, stddev=0.189], oderiv-rms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.005,0.006,0.006,0.007 0.007,0.008,0.01,0.02,0.02 0.02,0.03,0.03,0.07), mean=0.0128, stddev=0.00604], oderiv-count=1.69631e+06 +component name=tdnnf5.batchnorm type=BatchNormComponent, dim=1024, block-dim=1024, 
epsilon=0.001, target-rms=1, count=15677.9, test-mode=false, data-mean=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.0005,0.03,0.03,0.04 0.06,0.09,0.19,0.32,0.39 0.46,0.55,0.61,0.82), mean=0.213, stddev=0.135], data-stddev=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.007,0.12,0.13,0.15 0.17,0.21,0.30,0.38,0.42 0.45,0.49,0.52,0.66), mean=0.297, stddev=0.0957] +component name=tdnnf5.noop type=NoOpComponent, dim=1024 +component name=tdnnf6.linear type=TdnnComponent, input-dim=1024, output-dim=128, learning-rate=0.000210434, l2-regularize=0.008, orthonormal-constraint=-1, time-offsets=-3,0, linear-params-rms=0.01767, linear-params-row-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.80,0.80,0.80,0.80 0.80,0.80,0.80,0.80,0.80 0.80,0.80,0.80,0.80), mean=0.8, stddev=0.000345], linear-params-col-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.14,0.16,0.16,0.17 0.17,0.18,0.20,0.22,0.23 0.24,0.25,0.26,0.30), mean=0.199, stddev=0.0217], has-bias=false, use-natural-gradient=false +component name=tdnnf6.affine type=TdnnComponent, input-dim=128, output-dim=1024, learning-rate=0.000210434, l2-regularize=0.008, time-offsets=0,3, linear-params-rms=0.02036, linear-params-row-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.24,0.28,0.28,0.29 0.30,0.30,0.32,0.34,0.35 0.36,0.39,0.45,0.60), mean=0.324, stddev=0.0312], linear-params-col-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.56,0.58,0.58,0.58 0.59,0.61,0.64,0.69,0.71 0.73,0.74,0.75,0.80), mean=0.65, stddev=0.0473], bias-{mean,stddev}=-0.03415,0.2372, use-natural-gradient=false +component name=tdnnf6.relu type=RectifiedLinearComponent, dim=1024, self-repair-scale=1e-05, count=1.03e+06, self-repaired-proportion=0, value-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.03,0.05,0.05,0.08 0.10,0.15,0.25,0.37,0.44 0.50,0.54,0.58,0.90), mean=0.265, stddev=0.13], deriv-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.07,0.11,0.16,0.20 0.27,0.35,0.50,0.61,0.67 0.70,0.75,0.78,0.90), mean=0.48, stddev=0.151], oderiv-rms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.004,0.005,0.005,0.005 0.006,0.006,0.008,0.01,0.01 0.02,0.02,0.02,0.03), mean=0.00914, stddev=0.00344], oderiv-count=1.60991e+06 +component name=tdnnf6.batchnorm type=BatchNormComponent, dim=1024, block-dim=1024, epsilon=0.001, target-rms=1, count=15165.9, test-mode=false, data-mean=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.03,0.05,0.06,0.08 0.11,0.15,0.25,0.37,0.44 0.49,0.54,0.58,0.90), mean=0.267, stddev=0.13], data-stddev=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.16,0.19,0.20,0.23 0.28,0.31,0.39,0.46,0.49 0.52,0.56,0.58,0.67), mean=0.386, stddev=0.0866] +component name=tdnnf6.noop type=NoOpComponent, dim=1024 +component name=tdnnf7.linear type=TdnnComponent, input-dim=1024, output-dim=128, learning-rate=0.000210434, l2-regularize=0.008, orthonormal-constraint=-1, time-offsets=-3,0, linear-params-rms=0.01761, linear-params-row-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.80,0.80,0.80,0.80 0.80,0.80,0.80,0.80,0.80 0.80,0.80,0.80,0.80), mean=0.797, stddev=-nan], linear-params-col-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.14,0.15,0.16,0.17 0.17,0.18,0.20,0.21,0.23 0.24,0.25,0.27,0.30), mean=0.198, stddev=0.0227], has-bias=false, use-natural-gradient=false +component name=tdnnf7.affine type=TdnnComponent, input-dim=128, output-dim=1024, learning-rate=0.000210434, l2-regularize=0.008, time-offsets=0,3, linear-params-rms=0.01984, linear-params-row-norms=[percentiles(0,1,2,5 
10,20,50,80,90 95,98,99,100)=(0.24,0.27,0.28,0.29 0.29,0.30,0.31,0.33,0.34 0.35,0.37,0.40,0.50), mean=0.317, stddev=0.0234], linear-params-col-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.53,0.56,0.56,0.57 0.58,0.59,0.63,0.68,0.70 0.71,0.73,0.74,0.74), mean=0.633, stddev=0.0452], bias-{mean,stddev}=-0.02511,0.2542, use-natural-gradient=false +component name=tdnnf7.relu type=RectifiedLinearComponent, dim=1024, self-repair-scale=1e-05, count=1.07e+06, self-repaired-proportion=0, value-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.02,0.03,0.04,0.06 0.08,0.12,0.22,0.36,0.43 0.51,0.60,0.65,0.86), mean=0.246, stddev=0.14], deriv-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.08,0.12,0.14,0.18 0.22,0.32,0.49,0.64,0.70 0.75,0.80,0.83,0.91), mean=0.479, stddev=0.174], oderiv-rms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.004,0.004,0.005,0.005 0.006,0.006,0.008,0.01,0.02 0.02,0.02,0.02,0.03), mean=0.00961, stddev=0.00422], oderiv-count=1.60017e+06 +component name=tdnnf7.batchnorm type=BatchNormComponent, dim=1024, block-dim=1024, epsilon=0.001, target-rms=1, count=14653.9, test-mode=false, data-mean=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.03,0.04,0.05,0.06 0.08,0.12,0.22,0.36,0.43 0.51,0.60,0.65,0.86), mean=0.248, stddev=0.139], data-stddev=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.13,0.17,0.17,0.20 0.23,0.26,0.35,0.43,0.46 0.50,0.53,0.56,0.67), mean=0.349, stddev=0.0916] +component name=tdnnf7.noop type=NoOpComponent, dim=1024 +component name=tdnnf8.linear type=TdnnComponent, input-dim=1024, output-dim=128, learning-rate=0.000210434, l2-regularize=0.008, orthonormal-constraint=-1, time-offsets=-3,0, linear-params-rms=0.01714, linear-params-row-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.77,0.77,0.78,0.78 0.78,0.78,0.78,0.78,0.78 0.78,0.78,0.78,0.78), mean=0.776, stddev=0.000345], linear-params-col-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.13,0.15,0.15,0.16 0.17,0.17,0.19,0.21,0.22 0.23,0.25,0.26,0.33), mean=0.193, stddev=0.0227], has-bias=false, use-natural-gradient=false +component name=tdnnf8.affine type=TdnnComponent, input-dim=128, output-dim=1024, learning-rate=0.000210434, l2-regularize=0.008, time-offsets=0,3, linear-params-rms=0.01942, linear-params-row-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.24,0.26,0.27,0.28 0.29,0.29,0.31,0.33,0.34 0.34,0.36,0.37,0.45), mean=0.31, stddev=0.0211], linear-params-col-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.54,0.55,0.55,0.56 0.57,0.58,0.62,0.66,0.68 0.69,0.70,0.72,0.74), mean=0.62, stddev=0.0421], bias-{mean,stddev}=-0.03305,0.2552, use-natural-gradient=false +component name=tdnnf8.relu type=RectifiedLinearComponent, dim=1024, self-repair-scale=1e-05, count=9.74e+05, self-repaired-proportion=0.000966924, value-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.01,0.03,0.03,0.05 0.07,0.11,0.21,0.34,0.42 0.49,0.58,0.63,0.79), mean=0.233, stddev=0.139], deriv-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.04,0.08,0.10,0.15 0.20,0.30,0.49,0.63,0.70 0.74,0.79,0.84,0.89), mean=0.469, stddev=0.182], oderiv-rms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.003,0.004,0.004,0.005 0.005,0.006,0.008,0.01,0.02 0.02,0.02,0.02,0.04), mean=0.00946, stddev=0.00448], oderiv-count=1.58216e+06 +component name=tdnnf8.batchnorm type=BatchNormComponent, dim=1024, block-dim=1024, epsilon=0.001, target-rms=1, count=14141.9, test-mode=false, data-mean=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.01,0.03,0.03,0.05 
0.07,0.11,0.21,0.34,0.42 0.49,0.58,0.62,0.79), mean=0.234, stddev=0.138], data-stddev=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.07,0.14,0.15,0.18 0.21,0.25,0.33,0.41,0.45 0.48,0.52,0.55,0.73), mean=0.33, stddev=0.0929] +component name=tdnnf8.noop type=NoOpComponent, dim=1024 +component name=tdnnf9.linear type=TdnnComponent, input-dim=1024, output-dim=128, learning-rate=0.000210434, l2-regularize=0.008, orthonormal-constraint=-1, time-offsets=-3,0, linear-params-rms=0.01687, linear-params-row-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.76,0.76,0.76,0.76 0.76,0.76,0.76,0.76,0.76 0.76,0.76,0.76,0.76), mean=0.763, stddev=0], linear-params-col-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.13,0.15,0.15,0.16 0.16,0.17,0.19,0.21,0.22 0.23,0.25,0.26,0.32), mean=0.19, stddev=0.0225], has-bias=false, use-natural-gradient=false +component name=tdnnf9.affine type=TdnnComponent, input-dim=128, output-dim=1024, learning-rate=0.000210434, l2-regularize=0.008, time-offsets=0,3, linear-params-rms=0.01919, linear-params-row-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.19,0.26,0.27,0.28 0.28,0.29,0.30,0.32,0.33 0.34,0.35,0.37,0.43), mean=0.306, stddev=0.0212], linear-params-col-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.53,0.54,0.54,0.56 0.56,0.58,0.61,0.65,0.67 0.68,0.69,0.71,0.74), mean=0.613, stddev=0.0393], bias-{mean,stddev}=-0.02521,0.2462, use-natural-gradient=false +component name=tdnnf9.relu type=RectifiedLinearComponent, dim=1024, self-repair-scale=1e-05, count=8.94e+05, self-repaired-proportion=0.00241802, value-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.0006,0.02,0.04,0.05 0.07,0.11,0.21,0.34,0.42 0.46,0.54,0.60,0.85), mean=0.23, stddev=0.132], deriv-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.009,0.08,0.11,0.15 0.21,0.30,0.48,0.64,0.69 0.75,0.79,0.82,0.91), mean=0.47, stddev=0.181], oderiv-rms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.003,0.004,0.004,0.004 0.005,0.006,0.008,0.01,0.01 0.02,0.02,0.02,0.07), mean=0.00863, stddev=0.00438], oderiv-count=1.43808e+06 +component name=tdnnf9.batchnorm type=BatchNormComponent, dim=1024, block-dim=1024, epsilon=0.001, target-rms=1, count=13629.9, test-mode=false, data-mean=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.0007,0.02,0.04,0.05 0.08,0.11,0.21,0.34,0.42 0.46,0.54,0.60,0.84), mean=0.231, stddev=0.132], data-stddev=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.01,0.13,0.15,0.18 0.21,0.25,0.33,0.40,0.44 0.47,0.51,0.53,0.65), mean=0.328, stddev=0.0891] +component name=tdnnf9.noop type=NoOpComponent, dim=1024 +component name=tdnnf10.linear type=TdnnComponent, input-dim=1024, output-dim=128, learning-rate=0.000210434, l2-regularize=0.008, orthonormal-constraint=-1, time-offsets=-3,0, linear-params-rms=0.01667, linear-params-row-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.75,0.75,0.75,0.75 0.75,0.75,0.75,0.75,0.75 0.75,0.75,0.75,0.75), mean=0.754, stddev=0.000244], linear-params-col-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.13,0.15,0.15,0.16 0.16,0.17,0.19,0.20,0.22 0.23,0.24,0.25,0.29), mean=0.187, stddev=0.0211], has-bias=false, use-natural-gradient=false +component name=tdnnf10.affine type=TdnnComponent, input-dim=128, output-dim=1024, learning-rate=0.000210434, l2-regularize=0.008, time-offsets=0,3, linear-params-rms=0.019, linear-params-row-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.24,0.26,0.27,0.28 0.28,0.29,0.30,0.32,0.33 0.34,0.35,0.36,0.38), mean=0.303, stddev=0.0197], 
linear-params-col-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.53,0.53,0.55,0.56 0.56,0.57,0.61,0.63,0.65 0.66,0.68,0.69,0.73), mean=0.607, stddev=0.035], bias-{mean,stddev}=-0.003625,0.2464, use-natural-gradient=false +component name=tdnnf10.relu type=RectifiedLinearComponent, dim=1024, self-repair-scale=1e-05, count=9.14e+05, self-repaired-proportion=2.19202e-05, value-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.02,0.03,0.04,0.06 0.08,0.11,0.22,0.36,0.42 0.48,0.56,0.61,0.74), mean=0.238, stddev=0.134], deriv-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.06,0.10,0.13,0.18 0.23,0.30,0.49,0.65,0.72 0.77,0.82,0.87,0.93), mean=0.48, stddev=0.185], oderiv-rms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.003,0.004,0.004,0.004 0.005,0.005,0.007,0.01,0.01 0.01,0.02,0.02,0.03), mean=0.00764, stddev=0.00342], oderiv-count=1.40875e+06 +component name=tdnnf10.batchnorm type=BatchNormComponent, dim=1024, block-dim=1024, epsilon=0.001, target-rms=1, count=13117.9, test-mode=false, data-mean=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.02,0.04,0.04,0.06 0.08,0.12,0.22,0.36,0.42 0.48,0.56,0.61,0.73), mean=0.239, stddev=0.133], data-stddev=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.11,0.14,0.16,0.19 0.22,0.26,0.33,0.40,0.44 0.47,0.51,0.53,0.61), mean=0.333, stddev=0.0859] +component name=tdnnf10.noop type=NoOpComponent, dim=1024 +component name=tdnnf11.linear type=TdnnComponent, input-dim=1024, output-dim=128, learning-rate=0.000210434, l2-regularize=0.008, orthonormal-constraint=-1, time-offsets=-3,0, linear-params-rms=0.01627, linear-params-row-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.74,0.74,0.74,0.74 0.74,0.74,0.74,0.74,0.74 0.74,0.74,0.74,0.74), mean=0.737, stddev=0], linear-params-col-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.12,0.15,0.15,0.15 0.16,0.17,0.18,0.20,0.21 0.22,0.24,0.25,0.30), mean=0.183, stddev=0.0207], has-bias=false, use-natural-gradient=false +component name=tdnnf11.affine type=TdnnComponent, input-dim=128, output-dim=1024, learning-rate=0.000210434, l2-regularize=0.008, time-offsets=0,3, linear-params-rms=0.01892, linear-params-row-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.19,0.26,0.27,0.27 0.28,0.28,0.30,0.32,0.33 0.34,0.35,0.36,0.42), mean=0.302, stddev=0.0211], linear-params-col-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.53,0.54,0.54,0.55 0.56,0.58,0.60,0.63,0.64 0.66,0.68,0.68,0.70), mean=0.605, stddev=0.033], bias-{mean,stddev}=-0.02701,0.2484, use-natural-gradient=false +component name=tdnnf11.relu type=RectifiedLinearComponent, dim=1024, self-repair-scale=1e-05, count=8.76e+05, self-repaired-proportion=0.00264987, value-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.002,0.03,0.04,0.06 0.07,0.10,0.20,0.34,0.41 0.47,0.52,0.58,0.75), mean=0.226, stddev=0.132], deriv-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.02,0.10,0.13,0.16 0.20,0.27,0.47,0.63,0.71 0.75,0.81,0.84,0.94), mean=0.458, stddev=0.187], oderiv-rms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.003,0.004,0.004,0.004 0.005,0.005,0.007,0.009,0.01 0.01,0.02,0.02,0.05), mean=0.00739, stddev=0.00342], oderiv-count=1.34051e+06 +component name=tdnnf11.batchnorm type=BatchNormComponent, dim=1024, block-dim=1024, epsilon=0.001, target-rms=1, count=12605.9, test-mode=false, data-mean=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.003,0.03,0.05,0.06 0.07,0.10,0.21,0.34,0.41 0.47,0.52,0.58,0.75), mean=0.227, stddev=0.131], data-stddev=[percentiles(0,1,2,5 
10,20,50,80,90 95,98,99,100)=(0.02,0.14,0.16,0.18 0.21,0.25,0.33,0.40,0.45 0.48,0.52,0.53,0.65), mean=0.327, stddev=0.0898] +component name=tdnnf11.noop type=NoOpComponent, dim=1024 +component name=tdnnf12.linear type=TdnnComponent, input-dim=1024, output-dim=128, learning-rate=0.000210434, l2-regularize=0.008, orthonormal-constraint=-1, time-offsets=-3,0, linear-params-rms=0.01625, linear-params-row-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.73,0.73,0.73,0.73 0.73,0.74,0.74,0.74,0.74 0.74,0.74,0.74,0.74), mean=0.735, stddev=0.000244], linear-params-col-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.14,0.14,0.15,0.15 0.16,0.17,0.18,0.20,0.21 0.22,0.24,0.25,0.40), mean=0.183, stddev=0.021], has-bias=false, use-natural-gradient=false +component name=tdnnf12.affine type=TdnnComponent, input-dim=128, output-dim=1024, learning-rate=0.000210434, l2-regularize=0.008, time-offsets=0,3, linear-params-rms=0.01889, linear-params-row-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.20,0.26,0.26,0.27 0.28,0.29,0.30,0.32,0.33 0.34,0.35,0.36,0.43), mean=0.302, stddev=0.0208], linear-params-col-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.53,0.54,0.55,0.55 0.56,0.58,0.60,0.63,0.65 0.66,0.68,0.69,0.72), mean=0.604, stddev=0.0331], bias-{mean,stddev}=-0.01954,0.2406, use-natural-gradient=false +component name=tdnnf12.relu type=RectifiedLinearComponent, dim=1024, self-repair-scale=1e-05, count=8.85e+05, self-repaired-proportion=0.0028921, value-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.0003,0.04,0.05,0.06 0.08,0.12,0.20,0.33,0.41 0.47,0.55,0.60,0.73), mean=0.228, stddev=0.128], deriv-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.006,0.11,0.13,0.18 0.22,0.29,0.45,0.62,0.70 0.76,0.82,0.85,0.93), mean=0.457, stddev=0.18], oderiv-rms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.003,0.004,0.004,0.004 0.004,0.005,0.006,0.008,0.01 0.01,0.01,0.02,0.07), mean=0.00702, stddev=0.00353], oderiv-count=1.25371e+06 +component name=tdnnf12.batchnorm type=BatchNormComponent, dim=1024, block-dim=1024, epsilon=0.001, target-rms=1, count=12093.9, test-mode=false, data-mean=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.0005,0.04,0.05,0.06 0.08,0.12,0.20,0.33,0.41 0.47,0.55,0.59,0.72), mean=0.228, stddev=0.127], data-stddev=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.007,0.15,0.17,0.19 0.22,0.26,0.33,0.40,0.44 0.47,0.50,0.51,0.76), mean=0.331, stddev=0.0844] +component name=tdnnf12.noop type=NoOpComponent, dim=1024 +component name=tdnnf13.linear type=TdnnComponent, input-dim=1024, output-dim=128, learning-rate=0.000210434, l2-regularize=0.008, orthonormal-constraint=-1, time-offsets=-3,0, linear-params-rms=0.01613, linear-params-row-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.73,0.73,0.73,0.73 0.73,0.73,0.73,0.73,0.73 0.73,0.73,0.73,0.73), mean=0.73, stddev=-nan], linear-params-col-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.13,0.14,0.15,0.15 0.16,0.16,0.18,0.19,0.21 0.22,0.24,0.26,0.33), mean=0.181, stddev=0.0217], has-bias=false, use-natural-gradient=false +component name=tdnnf13.affine type=TdnnComponent, input-dim=128, output-dim=1024, learning-rate=0.000210434, l2-regularize=0.008, time-offsets=0,3, linear-params-rms=0.01887, linear-params-row-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.20,0.25,0.27,0.27 0.28,0.29,0.30,0.32,0.32 0.33,0.34,0.35,0.47), mean=0.301, stddev=0.0204], linear-params-col-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.52,0.53,0.54,0.55 
0.56,0.57,0.60,0.63,0.65 0.66,0.67,0.68,0.71), mean=0.603, stddev=0.0346], bias-{mean,stddev}=-0.01462,0.2372, use-natural-gradient=false +component name=tdnnf13.relu type=RectifiedLinearComponent, dim=1024, self-repair-scale=1e-05, count=8.38e+05, self-repaired-proportion=0.00195639, value-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(4e-05,0.05,0.06,0.07 0.09,0.12,0.21,0.34,0.42 0.48,0.54,0.58,0.74), mean=0.236, stddev=0.128], deriv-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.0008,0.15,0.16,0.19 0.23,0.28,0.44,0.60,0.69 0.74,0.79,0.82,0.90), mean=0.448, stddev=0.171], oderiv-rms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.003,0.003,0.004,0.004 0.004,0.005,0.006,0.008,0.009 0.01,0.01,0.01,0.12), mean=0.00668, stddev=0.00428], oderiv-count=1.23962e+06 +component name=tdnnf13.batchnorm type=BatchNormComponent, dim=1024, block-dim=1024, epsilon=0.001, target-rms=1, count=11581.9, test-mode=false, data-mean=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(5e-05,0.05,0.06,0.08 0.09,0.12,0.21,0.34,0.42 0.48,0.54,0.58,0.73), mean=0.237, stddev=0.127], data-stddev=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.002,0.18,0.19,0.21 0.24,0.28,0.36,0.43,0.47 0.50,0.54,0.56,0.66), mean=0.355, stddev=0.0899] +component name=tdnnf13.noop type=NoOpComponent, dim=1024 +component name=prefinal-l type=LinearComponent, input-dim=1024, output-dim=256, learning-rate=0.000210434, l2-regularize=0.008, params-rms=0.02743, params-row-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.88,0.88,0.88,0.88 0.88,0.88,0.88,0.88,0.88 0.88,0.88,0.88,0.88), mean=0.878, stddev=0.000244], params-col-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.36,0.38,0.39,0.40 0.41,0.42,0.44,0.46,0.47 0.47,0.48,0.49,0.56), mean=0.438, stddev=0.0236], orthonormal-constraint=-1, use-natural-gradient=false, rank-in=20, rank-out=80, num-samples-history=2000, update-period=4, alpha=4 +component name=prefinal-chain.affine type=AffineComponent, input-dim=256, output-dim=1024, learning-rate=0.000210434, l2-regularize=0.008, linear-params-rms=0.01852, linear-params-row-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.22,0.26,0.26,0.27 0.28,0.28,0.29,0.31,0.32 0.33,0.34,0.34,0.39), mean=0.296, stddev=0.0173], linear-params-col-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.55,0.55,0.55,0.56 0.57,0.58,0.59,0.61,0.62 0.63,0.63,0.65,0.70), mean=0.592, stddev=0.0207], bias-{mean,stddev}=-0.01883,0.2448 +component name=prefinal-chain.relu type=RectifiedLinearComponent, dim=1024, self-repair-scale=1e-05, count=8.2e+05, self-repaired-proportion=0.00206127, value-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.002,0.03,0.04,0.06 0.08,0.12,0.22,0.35,0.43 0.49,0.59,0.64,0.79), mean=0.24, stddev=0.138], deriv-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.01,0.10,0.13,0.16 0.21,0.28,0.43,0.59,0.65 0.71,0.77,0.82,0.86), mean=0.436, stddev=0.168], oderiv-rms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.002,0.003,0.003,0.004 0.004,0.004,0.006,0.008,0.01 0.01,0.01,0.02,0.04), mean=0.00653, stddev=0.00289], oderiv-count=1.2505e+06 +component name=prefinal-chain.batchnorm1 type=BatchNormComponent, dim=1024, block-dim=1024, epsilon=0.001, target-rms=1, count=11581.9, test-mode=false, data-mean=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.002,0.03,0.04,0.06 0.08,0.12,0.22,0.35,0.43 0.49,0.59,0.63,0.79), mean=0.24, stddev=0.138], data-stddev=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.02,0.16,0.17,0.20 0.24,0.28,0.38,0.47,0.50 
0.54,0.57,0.58,0.63), mean=0.374, stddev=0.103] +component name=prefinal-chain.linear type=LinearComponent, input-dim=1024, output-dim=256, learning-rate=0.000210434, l2-regularize=0.008, params-rms=0.02452, params-row-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.78,0.78,0.78,0.78 0.78,0.78,0.78,0.78,0.78 0.78,0.78,0.78,0.78), mean=0.785, stddev=0.000244], params-col-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.27,0.32,0.33,0.34 0.35,0.37,0.39,0.42,0.43 0.44,0.45,0.46,0.49), mean=0.391, stddev=0.0294], orthonormal-constraint=-1, use-natural-gradient=false, rank-in=20, rank-out=80, num-samples-history=2000, update-period=4, alpha=4 +component name=prefinal-chain.batchnorm2 type=BatchNormComponent, dim=256, block-dim=256, epsilon=0.001, target-rms=1, count=11581.9, test-mode=false, data-mean=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(-2e-08,-1e-08,-1e-08,-1e-08 -9e-09,-6e-09,8e-10,7e-09,9e-09 1e-08,1e-08,2e-08,2e-08), mean=4.19e-10, stddev=7.17e-09], data-stddev=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.79,0.88,0.88,0.90 0.92,0.94,0.98,1.0,1.1 1.1,1.1,1.2,1.2), mean=0.989, stddev=0.0607] +component name=output.affine type=AffineComponent, input-dim=256, output-dim=3456, learning-rate=0.000210434, l2-regularize=0.002, linear-params-rms=0.02283, linear-params-row-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.001,0.04,0.08,0.16 0.23,0.27,0.34,0.46,0.51 0.54,0.57,0.59,1.2), mean=0.347, stddev=0.114], linear-params-col-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(1.2,1.2,1.3,1.3 1.3,1.3,1.3,1.4,1.4 1.4,1.4,1.4,1.4), mean=1.34, stddev=0.0375], bias-{mean,stddev}=0.0003586,0.03823 +component name=prefinal-xent.affine type=AffineComponent, input-dim=256, output-dim=1024, learning-rate=0.000210434, l2-regularize=0.008, linear-params-rms=0.01655, linear-params-row-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.17,0.23,0.23,0.24 0.25,0.25,0.26,0.28,0.28 0.29,0.29,0.30,0.32), mean=0.264, stddev=0.0147], linear-params-col-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.48,0.49,0.49,0.50 0.50,0.51,0.53,0.55,0.56 0.57,0.57,0.58,0.59), mean=0.529, stddev=0.021], bias-{mean,stddev}=-0.01395,0.2337 +component name=prefinal-xent.relu type=RectifiedLinearComponent, dim=1024, self-repair-scale=1e-05, count=8.12e+05, self-repaired-proportion=0.00683594, value-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(7e-06,0.03,0.04,0.06 0.08,0.11,0.19,0.33,0.42 0.46,0.54,0.59,0.80), mean=0.222, stddev=0.131], deriv-avg=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.0002,0.09,0.12,0.15 0.21,0.28,0.44,0.62,0.70 0.75,0.82,0.86,0.94), mean=0.451, stddev=0.186], oderiv-rms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.002,0.002,0.002,0.002 0.002,0.003,0.003,0.004,0.005 0.006,0.007,0.009,0.04), mean=0.00369, stddev=0.00251], oderiv-count=1.20978e+06 +component name=prefinal-xent.batchnorm1 type=BatchNormComponent, dim=1024, block-dim=1024, epsilon=0.001, target-rms=1, count=11581.9, test-mode=false, data-mean=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(9e-06,0.03,0.04,0.06 0.08,0.11,0.19,0.33,0.41 0.46,0.54,0.58,0.79), mean=0.222, stddev=0.13], data-stddev=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.0008,0.13,0.16,0.19 0.22,0.25,0.32,0.39,0.42 0.45,0.49,0.51,0.61), mean=0.319, stddev=0.084] +component name=prefinal-xent.linear type=LinearComponent, input-dim=1024, output-dim=256, learning-rate=0.000210434, l2-regularize=0.008, params-rms=0.01933, params-row-norms=[percentiles(0,1,2,5 10,20,50,80,90 
95,98,99,100)=(0.62,0.62,0.62,0.62 0.62,0.62,0.62,0.62,0.62 0.62,0.62,0.62,0.62), mean=0.619, stddev=0], params-col-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.24,0.26,0.27,0.27 0.28,0.29,0.31,0.33,0.34 0.35,0.36,0.36,0.43), mean=0.309, stddev=0.0224], orthonormal-constraint=-1, use-natural-gradient=false, rank-in=20, rank-out=80, num-samples-history=2000, update-period=4, alpha=4 +component name=prefinal-xent.batchnorm2 type=BatchNormComponent, dim=256, block-dim=256, epsilon=0.001, target-rms=1, count=11581.9, test-mode=false, data-mean=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(-2e-08,-2e-08,-1e-08,-1e-08 -8e-09,-6e-09,-2e-10,6e-09,8e-09 1e-08,1e-08,2e-08,2e-08), mean=5.3e-11, stddev=6.52e-09], data-stddev=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.58,0.60,0.60,0.61 0.62,0.64,0.66,0.70,0.72 0.74,0.76,0.78,0.81), mean=0.668, stddev=0.0384] +component name=output-xent.affine type=AffineComponent, input-dim=256, output-dim=3456, learning-rate=0.00105217, l2-regularize=0.002, learning-rate-factor=5, linear-params-rms=0.05335, linear-params-row-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(0.02,0.07,0.13,0.40 0.63,0.70,0.80,0.98,1.1 1.2,1.3,1.4,4.1), mean=0.819, stddev=0.241], linear-params-col-norms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(3.0,3.0,3.1,3.1 3.1,3.1,3.1,3.2,3.2 3.2,3.2,3.2,3.3), mean=3.14, stddev=0.0397], bias-{mean,stddev}=2.349e-08,0.05481 +component name=output-xent.log-softmax type=LogSoftmaxComponent, dim=3456, oderiv-rms=[percentiles(0,1,2,5 10,20,50,80,90 95,98,99,100)=(1e-09,4e-05,6e-05,0.0001 0.0003,0.0005,0.0009,0.001,0.002 0.003,0.004,0.005,0.04), mean=0.00109, stddev=0.00117], oderiv-count=1.28505e+06 +nnet3-show-progress --use-gpu=no exp/chain_cleaned_1c_rd_rmc_rng/tdnn1c_sp/69.mdl exp/chain_cleaned_1c_rd_rmc_rng/tdnn1c_sp/70.mdl +LOG (nnet3-show-progress[5.5.750~2-3c61c]:SelectGpuId():cu-device.cc:202) Manually selected to compute on CPU. 
+LOG (nnet3-show-progress[5.5.750~2-3c61c]:main():nnet3-show-progress.cc:143) Parameter differences per layer are [ tdnn1.affine:0.256339 tdnnf2.linear:0.227557 tdnnf2.affine:0.386311 tdnnf3.linear:0.283733 tdnnf3.affine:0.358061 tdnnf4.linear:0.267213 tdnnf4.affine:0.328628 tdnnf5.linear:0.219517 tdnnf5.affine:0.281164 tdnnf6.linear:0.283759 tdnnf6.affine:0.362356 tdnnf7.linear:0.291193 tdnnf7.affine:0.347106 tdnnf8.linear:0.276181 tdnnf8.affine:0.323737 tdnnf9.linear:0.254516 tdnnf9.affine:0.29721 tdnnf10.linear:0.232191 tdnnf10.affine:0.274365 tdnnf11.linear:0.216651 tdnnf11.affine:0.268776 tdnnf12.linear:0.207308 tdnnf12.affine:0.259874 tdnnf13.linear:0.200426 tdnnf13.affine:0.252007 prefinal-l:0.24496 prefinal-chain.affine:0.269982 prefinal-chain.linear:0.221904 output.affine:0.37835 prefinal-xent.affine:0.191588 prefinal-xent.linear:0.154945 output-xent.affine:0.420465 ] +LOG (nnet3-show-progress[5.5.750~2-3c61c]:main():nnet3-show-progress.cc:153) Norms of parameter matrices from are [ tdnn1.affine:13.2725 tdnnf2.linear:9.79786 tdnnf2.affine:13.7497 tdnnf3.linear:9.9981 tdnnf3.affine:12.8793 tdnnf4.linear:9.1542 tdnnf4.affine:12.7898 tdnnf5.linear:7.64498 tdnnf5.affine:11.9983 tdnnf6.linear:9.04887 tdnnf6.affine:12.9389 tdnnf7.linear:9.01759 tdnnf7.affine:13.0367 tdnnf8.linear:8.77638 tdnnf8.affine:12.91 tdnnf9.linear:8.63701 tdnnf9.affine:12.6207 tdnnf10.linear:8.53404 tdnnf10.affine:12.5218 tdnnf11.linear:8.33278 tdnnf11.affine:12.5607 tdnnf12.linear:8.3187 tdnnf12.affine:12.3768 tdnnf13.linear:8.26108 tdnnf13.affine:12.2942 prefinal-l:14.0424 prefinal-chain.affine:12.3159 prefinal-chain.linear:12.5548 output.affine:21.5923 prefinal-xent.affine:11.3085 prefinal-xent.linear:9.89914 output-xent.affine:50.2817 ] +LOG (nnet3-show-progress[5.5.750~2-3c61c]:main():nnet3-show-progress.cc:157) Relative parameter differences per layer are [ tdnn1.affine:0.0192753 tdnnf2.linear:0.0232278 tdnnf2.affine:0.027956 tdnnf3.linear:0.0283875 tdnnf3.affine:0.0276328 tdnnf4.linear:0.0291937 tdnnf4.affine:0.0255129 tdnnf5.linear:0.0286933 tdnnf5.affine:0.0232522 tdnnf6.linear:0.0313656 tdnnf6.affine:0.0278445 tdnnf7.linear:0.0322901 tdnnf7.affine:0.0264605 tdnnf8.linear:0.0314645 tdnnf8.affine:0.0249126 tdnnf9.linear:0.0294591 tdnnf9.affine:0.0233898 tdnnf10.linear:0.0271971 tdnnf10.affine:0.0217566 tdnnf11.linear:0.0259888 tdnnf11.affine:0.0212416 tdnnf12.linear:0.0249052 tdnnf12.affine:0.0208438 tdnnf13.linear:0.0242414 tdnnf13.affine:0.0203464 prefinal-l:0.0174401 prefinal-chain.affine:0.0217511 prefinal-chain.linear:0.0176476 output.affine:0.0175741 prefinal-xent.affine:0.0167902 prefinal-xent.linear:0.0156657 output-xent.affine:0.00837788 ] +# Accounting: time=1 threads=1 +# Finished at Wed Feb 5 03:34:35 CST 2020 with status 0 diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py index e18c1359b61..fe66b883d9e 100644 --- a/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py +++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/basic_layers.py @@ -624,7 +624,7 @@ def _generate_config(self): # note: by default the LinearComponent uses natural gradient. 
         line = ('component name={0}.linear type=LinearComponent '
                 'orthonormal-constraint={1} param-stddev={2} '
-                'input-dim={3} output-dim={4} max-change=0.75 {5}'
+                'input-dim={3} output-dim={4} max-change=0.75 use-natural-gradient=false {5}'
                 ''.format(self.name, self.config['orthonormal-constraint'],
                           self.config['orthonormal-constraint'] / math.sqrt(input_dim),
                           input_dim, bottleneck_dim, linear_options))
@@ -637,7 +637,7 @@ def _generate_config(self):
         line = ('component name={0}.affine'
-                ' type=NaturalGradientAffineComponent'
+                ' type=AffineComponent'
                 ' input-dim={1} output-dim={2} {3}'
                 ''.format(self.name, cur_dim, output_dim, affine_options))
         configs.append(line)
@@ -843,7 +843,7 @@ def _add_components(self, input_desc, input_dim, nonlinearities):
         bottleneck_dim = self.config['bottleneck-dim']
         # note: by default the LinearComponent uses natural gradient.
         line = ('component name={0}.linear type=LinearComponent '
-                'input-dim={1} orthonormal-constraint=1.0 output-dim={2} {3}'
+                'input-dim={1} orthonormal-constraint=1.0 output-dim={2} {3} use-natural-gradient=false'
                 ''.format(self.name, input_dim, bottleneck_dim, linear_options))
         configs.append(line)
         line = ('component-node name={0}.linear component={0}.linear input={1}'
@@ -853,7 +853,7 @@ def _add_components(self, input_desc, input_dim, nonlinearities):
         cur_dim = bottleneck_dim
-        line = ('component name={0}.affine type=NaturalGradientAffineComponent'
+        line = ('component name={0}.affine type=AffineComponent'
                 ' input-dim={1} output-dim={2} {3}'
                 ''.format(self.name, cur_dim, output_dim, affine_options))
         configs.append(line)
diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/composite_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/composite_layers.py
index 928ca445ccc..663a2dbcbeb 100644
--- a/egs/wsj/s5/steps/libs/nnet3/xconfig/composite_layers.py
+++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/composite_layers.py
@@ -167,7 +167,7 @@ def _generate_config(self):
         # The first linear layer, from input-dim (spliced x2) to bottleneck-dim
         configs.append('component name={0}.linear type=TdnnComponent input-dim={1} '
                        'output-dim={2} l2-regularize={3} max-change={4} use-bias=false '
-                       'time-offsets={5} orthonormal-constraint=-1.0'.format(
+                       'time-offsets={5} orthonormal-constraint=-1.0 use-natural-gradient=false'.format(
                            name, input_dim, bottleneck_dim, l2_regularize,
                            max_change, time_offsets1))
         configs.append('component-node name={0}.linear component={0}.linear '
@@ -176,7 +176,7 @@ def _generate_config(self):
         # The affine layer, from bottleneck-dim (spliced x2) to output-dim
         configs.append('component name={0}.affine type=TdnnComponent '
                        'input-dim={1} output-dim={2} l2-regularize={3} max-change={4} '
-                       'time-offsets={5}'.format(
+                       'time-offsets={5} use-natural-gradient=false'.format(
                            name, bottleneck_dim, output_dim, l2_regularize,
                            max_change, time_offsets2))
         configs.append('component-node name={0}.affine component={0}.affine '
@@ -291,7 +291,7 @@ def _generate_config(self):
         self_repair_scale = self.config['self-repair-scale']
         # The affine layer, from input-dim to big-dim.
-        configs.append('component name={0}.affine type=NaturalGradientAffineComponent '
+        configs.append('component name={0}.affine type=AffineComponent '
                        'input-dim={1} output-dim={2} l2-regularize={3} max-change={4}'.format(
                            name, input_dim, big_dim, l2_regularize, max_change))
         configs.append('component-node name={0}.affine component={0}.affine '
@@ -314,7 +314,7 @@ def _generate_config(self):
         # ("floating" orthonormal constraint).
         configs.append('component name={0}.linear type=LinearComponent '
                        'input-dim={1} output-dim={2} l2-regularize={3} max-change={4} '
-                       'orthonormal-constraint=-1 '.format(
+                       'orthonormal-constraint=-1 use-natural-gradient=false '.format(
                            name, big_dim, small_dim, l2_regularize, max_change))
         configs.append('component-node name={0}.linear component={0}.linear '
diff --git a/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py b/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py
index 4afea78ad3f..5c146860361 100644
--- a/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py
+++ b/egs/wsj/s5/steps/libs/nnet3/xconfig/trivial_layers.py
@@ -328,7 +328,7 @@ def _generate_config(self):
         configs = []
         line = ('component name={0} type=LinearComponent input-dim={1} output-dim={2} '
-                '{3}'.format(self.name, input_dim, output_dim, opts))
+                '{3} use-natural-gradient=false'.format(self.name, input_dim, output_dim, opts))
         configs.append(line)
         line = ('component-node name={0} component={0} input={1}'.format(
             self.name, input_desc))
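
Note (illustration, not part of the patch): the edits above make the xconfig-generated linear, affine and TDNN components use plain SGD updates, either by emitting use-natural-gradient=false in the component line or by switching NaturalGradientAffineComponent to AffineComponent. The following is a minimal Python sketch of the kind of config lines the modified _generate_config methods now produce; the layer name, dimensions and option strings are hypothetical placeholders, and only the use-natural-gradient=false / type=AffineComponent parts mirror the change in this patch.

# Sketch only: mimics the string formatting used by the xconfig layers after
# this change.  The name, dimensions and option strings below are made up.
name = 'tdnnf2'
input_dim, bottleneck_dim, output_dim = 1024, 128, 1024
linear_options = 'l2-regularize=0.008 max-change=0.75'
affine_options = 'l2-regularize=0.008 max-change=0.75'

configs = []
# Bottleneck linear component with natural gradient explicitly disabled.
configs.append('component name={0}.linear type=LinearComponent '
               'input-dim={1} orthonormal-constraint=-1.0 output-dim={2} '
               '{3} use-natural-gradient=false'.format(
                   name, input_dim, bottleneck_dim, linear_options))
# Plain AffineComponent instead of NaturalGradientAffineComponent.
configs.append('component name={0}.affine type=AffineComponent '
               'input-dim={1} output-dim={2} {3}'.format(
                   name, bottleneck_dim, output_dim, affine_options))

for config_line in configs:
    print(config_line)

The progress.70.log added above shows the resulting components reporting use-natural-gradient=false in nnet3-am-info output, which is one way to confirm the option took effect.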