From 2c3c01745f4ccab47e8f879f3d431c0efb7b3271 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Thu, 21 Mar 2019 15:13:32 -0400 Subject: [PATCH 1/2] Exponential CMN --- .../tuning/run_tdnn_lstm_1b.sh | 2 +- .../tuning/run_tdnn_lstm_1c.sh | 475 ++++++++++++++++++ .../nnet3/train/chain_objf/acoustic_model.py | 7 +- egs/wsj/s5/steps/nnet3/chain/train.py | 1 + egs/wsj/s5/steps/nnet3/decode.sh | 10 +- egs/wsj/s5/steps/nnet3/decode_semisup.sh | 10 +- src/feat/feature-functions.cc | 54 +- src/feat/feature-functions.h | 10 +- 8 files changed, 558 insertions(+), 11 deletions(-) create mode 100755 egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_lstm_1c.sh diff --git a/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_lstm_1b.sh b/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_lstm_1b.sh index 1a839b045bd..50b22739aa2 100755 --- a/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_lstm_1b.sh +++ b/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_lstm_1b.sh @@ -292,7 +292,7 @@ if [ $stage -le 16 ]; then fi steps/nnet3/chain/train.py --stage $train_stage \ - --cmd "$decode_cmd" \ + --cmd "$train_cmd --mem 4G" \ --feat.online-ivector-dir $train_ivector_dir \ --feat.cmvn-opts "--norm-means=false --norm-vars=false" \ --chain.xent-regularize $xent_regularize \ diff --git a/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_lstm_1c.sh b/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_lstm_1c.sh new file mode 100755 index 00000000000..eae44f66697 --- /dev/null +++ b/egs/ami/s5b/local/chain/multi_condition/tuning/run_tdnn_lstm_1c.sh @@ -0,0 +1,475 @@ +#!/bin/bash + +# This is a chain-training script with TDNN+LSTM neural networks. +# This script is based on local/chain/tuning/run_tdnn_lstm_1i.sh, but adding +# the reverberated IHM data into the train set. +# This script obtains better results on IHM, SDM and MDM tasks. + +# Please see RESULTS_* for examples of command lines invoking this script. + +# local/chain/multi_condition/run_tdnn_lstm.sh --mic ihm --train-set train_cleaned --gmm tri3_cleaned & +# local/chain/multi_condition/run_tdnn_lstm.sh --mic sdm1 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned & +# local/chain/multi_condition/run_tdnn_lstm.sh --mic mdm8 --use-ihm-ali true --train-set train_cleaned --gmm tri3_cleaned & + + +set -e -o pipefail + +# First the options that are passed through to run_ivector_common.sh +# (some of which are also used in this script directly). +stage=0 +mic=ihm +nj=30 +use_ihm_ali=false +train_set=train_cleaned +gmm=tri3_cleaned # the gmm for the target data +ihm_gmm=tri3_cleaned # the gmm for the IHM system (if --use-ihm-ali true). +num_threads_ubm=32 +num_data_reps=1 +num_epochs=4 +decode_icsi=false +decode_aspire=false +decode_tedlium=false + +chunk_width=160,140,110,80 +chunk_left_context=40 +chunk_right_context=0 +label_delay=5 +dropout_schedule='0,0@0.20,0.3@0.50,0' # dropout schedule controls the dropout + # proportion for each training iteration. +xent_regularize=0.025 + +# The rest are configs specific to this script. Most of the parameters +# are just hardcoded at this level, in the commands below. +train_stage=-10 +tree_affix= # affix for tree directory, e.g. "a" or "b", in case we change the configuration. +tlstm_affix=1c #affix for TDNN-LSTM directory, e.g. "a" or "b", in case we change the configuration. +common_egs_dir= # you can set this to use previously dumped egs. 
+cmvn_opts="--norm-vars=false --cmn-window=300 --center=false" + +# decode options +extra_left_context=50 +frames_per_chunk=160 + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! $use_ihm_ali; then + [ "$mic" != "ihm" ] && \ + echo "$0: you cannot specify --use-ihm-ali false if the microphone is not ihm." && \ + exit 1; +else + [ "$mic" == "ihm" ] && \ + echo "$0: you must specify --use-ihm-ali false if the microphone is ihm." && \ + exit 1; +fi + +if ! cuda-compiled; then + cat <data/lang_chain/topo + fi +fi + +if [ $stage -le 13 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/align_fmllr_lats.sh --nj 100 --cmd "$train_cmd" \ + --generate-ali-from-lats true ${lores_train_data_dir} \ + data/lang $gmm_dir $original_lat_dir + rm $original_lat_dir/fsts.*.gz # save space + + lat_dir_ihmdata=exp/ihm/chain${nnet3_affix}/${gmm}_${train_set}_sp_lats + + original_lat_nj=$(cat $original_lat_dir/num_jobs) + ihm_lat_nj=$(cat $lat_dir_ihmdata/num_jobs) + + $train_cmd --max-jobs-run 10 JOB=1:$original_lat_nj $lat_dir/temp/log/copy_original_lats.JOB.log \ + lattice-copy "ark:gunzip -c $original_lat_dir/lat.JOB.gz |" ark,scp:$lat_dir/temp/lats.JOB.ark,$lat_dir/temp/lats.JOB.scp + + $train_cmd --max-jobs-run 10 JOB=1:$ihm_lat_nj $lat_dir/temp2/log/copy_ihm_lats.JOB.log \ + lattice-copy "ark:gunzip -c $lat_dir_ihmdata/lat.JOB.gz |" ark,scp:$lat_dir/temp2/lats.JOB.ark,$lat_dir/temp2/lats.JOB.scp + + for n in $(seq $original_lat_nj); do + cat $lat_dir/temp/lats.$n.scp + done > $lat_dir/temp/combined_lats.scp + + for i in `seq 1 $num_data_reps`; do + for n in $(seq $ihm_lat_nj); do + cat $lat_dir/temp2/lats.$n.scp + done | sed -e "s/^/rev${i}_/" + done >> $lat_dir/temp/combined_lats.scp + + sort -u $lat_dir/temp/combined_lats.scp > $lat_dir/temp/combined_lats_sorted.scp + + utils/split_data.sh $train_data_dir $nj + + $train_cmd --max-jobs-run 10 JOB=1:$nj $lat_dir/copy_combined_lats.JOB.log \ + lattice-copy --include=$train_data_dir/split$nj/JOB/utt2spk \ + scp:$lat_dir/temp/combined_lats_sorted.scp \ + "ark:|gzip -c >$lat_dir/lat.JOB.gz" || exit 1; + + echo $nj > $lat_dir/num_jobs + + # copy other files from original lattice dir + for f in cmvn_opts final.mdl splice_opts tree; do + cp $original_lat_dir/$f $lat_dir/$f + done +fi + + +if [ $stage -le 14 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
+ exit 1; + fi + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --context-opts "--context-width=2 --central-position=1" \ + --leftmost-questions-truncate -1 \ + --cmd "$train_cmd" 4200 ${lores_train_data_dir} data/lang_chain $original_lat_dir $tree_dir +fi + +if [ $stage -le 15 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree |grep num-pdfs|awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + tdnn_opts="l2-regularize=0.006" + lstm_opts="l2-regularize=0.0025 decay-time=20 dropout-proportion=0.0" + output_opts="l2-regularize=0.001" + + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + # please note that it is important to have input layer with the name=input + # as the layer immediately preceding the fixed-affine-layer to enable + # the use of short notation for the descriptor + fixed-affine-layer name=lda input=Append(-1,0,1) affine-transform-file=$dir/configs/lda.mat + + # the first splicing is moved before the lda layer, so no splicing here + relu-batchnorm-layer name=tdnn1 dim=1024 $tdnn_opts + relu-batchnorm-layer name=tdnn2 input=Append(-1,0,1) dim=1024 $tdnn_opts + relu-batchnorm-layer name=tdnn3 input=Append(-1,0,1) dim=1024 $tdnn_opts + + # check steps/libs/nnet3/xconfig/lstm.py for the other options and defaults + fast-lstmp-layer name=lstm1 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-batchnorm-layer name=tdnn4 input=Append(-3,0,3) dim=1024 $tdnn_opts + relu-batchnorm-layer name=tdnn5 input=Append(-3,0,3) dim=1024 $tdnn_opts + relu-batchnorm-layer name=tdnn6 input=Append(-3,0,3) dim=1024 $tdnn_opts + fast-lstmp-layer name=lstm2 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + relu-batchnorm-layer name=tdnn7 input=Append(-3,0,3) dim=1024 $tdnn_opts + relu-batchnorm-layer name=tdnn8 input=Append(-3,0,3) dim=1024 $tdnn_opts + relu-batchnorm-layer name=tdnn9 input=Append(-3,0,3) dim=1024 $tdnn_opts + fast-lstmp-layer name=lstm3 cell-dim=1024 recurrent-projection-dim=256 non-recurrent-projection-dim=256 delay=-3 $lstm_opts + + ## adding the layers for chain branch + output-layer name=output input=lstm3 output-delay=$label_delay include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' models... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + output-layer name=output-xent input=lstm3 output-delay=$label_delay dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts + +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + +if [ $stage -le 16 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/ami-$(date +'%m_%d_%H_%M')/s5b/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage $train_stage \ + --cmd "$train_cmd --mem 4G" \ + --feat.cmvn-opts "$cmvn_opts" \ + --feat.use-sliding-window-cmvn=true \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.0 \ + --chain.apply-deriv-weights false \ + --chain.lm-opts="--num-extra-lm-states=2000" \ + --egs.dir "$common_egs_dir" \ + --egs.opts "--frames-overlap-per-eg 0" \ + --egs.chunk-width $chunk_width \ + --egs.chunk-left-context $chunk_left_context \ + --egs.chunk-right-context $chunk_right_context \ + --egs.chunk-left-context-initial 0 \ + --egs.chunk-right-context-final 0 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.num-chunk-per-minibatch 64,32 \ + --trainer.frames-per-iter 1500000 \ + --trainer.num-epochs $num_epochs \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 12 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.max-param-change 2.0 \ + --trainer.deriv-truncate-margin 8 \ + --cleanup.remove-egs false \ + --feat-dir $train_data_dir \ + --tree-dir $tree_dir \ + --lat-dir $lat_dir \ + --dir $dir +fi + + +graph_dir=$dir/graph_${LM} +if [ $stage -le 17 ]; then + # Note: it might appear that this data/lang_chain directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_${LM} $dir $graph_dir +fi + +if [ $stage -le 18 ]; then + rm $dir/.error 2>/dev/null || true + + [ -z $extra_left_context ] && extra_left_context=$chunk_left_context; + + for decode_set in dev eval; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$decode_cmd" \ + --extra-left-context $extra_left_context \ + --frames-per-chunk "$frames_per_chunk" \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --scoring-opts "--min-lmwt 5 " \ + $graph_dir data/$mic/${decode_set}_hires $dir/decode_${decode_set} || exit 1; + ) || touch $dir/.error & + done + wait + if [ -f $dir/.error ]; then + echo "$0: something went wrong in decoding" + exit 1 + fi +fi + +if $decode_aspire; then + if [ $stage -le 19 ]; then + for data in dev_aspire; do + steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 --mfcc-config conf/mfcc_hires.conf data/${data}_hires + steps/compute_cmvn_stats.sh data/${data}_hires + done + fi + + test_lang=data/lang_fisher_test + test_graph_affix=_fisher + graph_dir=$dir/graph${test_graph_affix} + if [ $stage -le 21 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. 
+ utils/mkgraph.sh --self-loop-scale 1.0 ${test_lang} $dir $graph_dir + fi + + if [ $stage -le 22 ]; then + rm -f $dir/.error + for dset in dev_aspire; do + ( + decode_dir=$dir/decode${test_graph_affix}_${dset} + + steps/nnet3/decode.sh --nj 30 --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context 0 \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk "$frames_per_chunk" --skip-scoring true \ + $graph_dir data/${dset}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." + exit 1 + fi + fi + + if [ $stage -le 23 ]; then + for dset in dev_aspire; do + decode_dir=$dir/decode${test_graph_affix}_${dset} + local/score_aspire.sh --min-lmwt 8 --max-lmwt 12 \ + --cmd "$decode_cmd" --resolve-overlaps false \ + $graph_dir $decode_dir ${dset} ${dset} $decode_dir/ctm_out + done + fi +fi + +if $decode_icsi; then + if [ $stage -le 19 ]; then + for data in dev_icsi; do + steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 --mfcc-config conf/mfcc_hires.conf data/$mic/${data}_hires + steps/compute_cmvn_stats.sh data/$mic/${data}_hires + done + fi + + if [ $stage -le 22 ]; then + rm -f $dir/.error + for dset in dev_icsi eval_icsi; do + ( + decode_dir=$dir/decode${test_graph_affix}_${dset} + + steps/nnet3/decode.sh --nj 30 --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context 0 \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk "$frames_per_chunk" \ + $graph_dir data/$mic/${dset}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." + exit 1 + fi + fi +fi + +if $decode_tedlium; then + if [ $stage -le 19 ]; then + for data in tedlium_dev tedlium_test; do + steps/make_mfcc.sh --cmd "$train_cmd" --nj 30 --mfcc-config conf/mfcc_hires.conf data/${data}_hires + steps/compute_cmvn_stats.sh data/${data}_hires + utils/fix_data_dir.sh data/${data}_hires + done + fi + + test_lang=data/ted_lang_nosp + test_graph_affix=_ted + graph_dir=$dir/graph${test_graph_affix} + if [ $stage -le 21 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 ${test_lang} $dir $graph_dir + fi + + if [ $stage -le 22 ]; then + rm -f $dir/.error + for dset in tedlium_dev tedlium_test; do + ( + decode_dir=$dir/decode${test_graph_affix}_${dset} + + steps/nnet3/decode.sh --nj 30 --cmd "$decode_cmd" \ + --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $extra_left_context \ + --extra-right-context 0 \ + --extra-left-context-initial 0 --extra-right-context-final 0 \ + --frames-per-chunk "$frames_per_chunk" --skip-scoring true \ + $graph_dir data/${dset}_hires $decode_dir || { echo "Failed decoding in $decode_dir"; touch $dir/.error; } + ) & + done + wait + + if [ -f $dir/.error ]; then + echo "Failed decoding." 
+ exit 1 + fi + fi + + if [ $stage -le 23 ]; then + for dset in tedlium_dev tedlium_test; do + decode_dir=$dir/decode${test_graph_affix}_${dset} + steps/scoring/score_kaldi_wer.sh --min-lmwt 8 --max-lmwt 12 \ + --cmd "$decode_cmd" data/${dset}_hires \ + $graph_dir $decode_dir + done + fi +fi + +exit 0 +exit 0 diff --git a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py index c932a9c54f7..acc6f77523a 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -71,7 +71,7 @@ def generate_chain_egs(dir, data, lat_dir, egs_dir, alignment_subsampling_factor=3, online_ivector_dir=None, frames_per_iter=20000, frames_per_eg_str="20", srand=0, - egs_opts=None, cmvn_opts=None): + egs_opts=None, cmvn_opts=None, use_sliding_window_cmvn="false"): """Wrapper for steps/nnet3/chain/get_egs.sh See options in that script. @@ -80,7 +80,7 @@ def generate_chain_egs(dir, data, lat_dir, egs_dir, common_lib.execute_command( """steps/nnet3/chain/get_egs.sh {egs_opts} \ --cmd "{command}" \ - --cmvn-opts "{cmvn_opts}" \ + --cmvn-opts "{cmvn_opts}" --use-sliding-window-cmvn {sliding_cmvn} \ --online-ivector-dir "{ivector_dir}" \ --left-context {left_context} \ --right-context {right_context} \ @@ -92,11 +92,12 @@ def generate_chain_egs(dir, data, lat_dir, egs_dir, --alignment-subsampling-factor {alignment_subsampling_factor} \ --stage {stage} \ --frames-per-iter {frames_per_iter} \ - --frames-per-eg {frames_per_eg_str} \ + --frames-per-eg "{frames_per_eg_str}" \ --srand {srand} \ {data} {dir} {lat_dir} {egs_dir}""".format( command=run_opts.egs_command, cmvn_opts=cmvn_opts if cmvn_opts is not None else '', + sliding_cmvn=use_sliding_window_cmvn, ivector_dir=(online_ivector_dir if online_ivector_dir is not None else ''), diff --git a/egs/wsj/s5/steps/nnet3/chain/train.py b/egs/wsj/s5/steps/nnet3/chain/train.py index 40b65afe273..b20de51c945 100755 --- a/egs/wsj/s5/steps/nnet3/chain/train.py +++ b/egs/wsj/s5/steps/nnet3/chain/train.py @@ -381,6 +381,7 @@ def train(args, run_opts): frames_per_eg_str=args.chunk_width, srand=args.srand, egs_opts=args.egs_opts, + use_sliding_window_cmvn=args.use_sliding_window_cmvn, cmvn_opts=args.cmvn_opts, online_ivector_dir=args.online_ivector_dir, frames_per_iter=args.frames_per_iter, diff --git a/egs/wsj/s5/steps/nnet3/decode.sh b/egs/wsj/s5/steps/nnet3/decode.sh index 14dda2bd457..ad4be2a8a90 100755 --- a/egs/wsj/s5/steps/nnet3/decode.sh +++ b/egs/wsj/s5/steps/nnet3/decode.sh @@ -80,6 +80,10 @@ done sdata=$data/split$nj; cmvn_opts=`cat $srcdir/cmvn_opts` || exit 1; +use_sliding_window_cmvn=false +if [ -f $srcdir/sliding_window_cmvn ]; then + use_sliding_window_cmvn=`cat $srcdir/sliding_window_cmvn` +fi thread_string= if $use_gpu; then if [ $num_threads -eq 1 ]; then @@ -100,7 +104,11 @@ echo $nj > $dir/num_jobs ## Set up features. echo "$0: feature type is raw" -feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |" +if $use_sliding_window_cmvn; then + feats="ark,s,cs:apply-cmvn-sliding $cmvn_opts scp:$sdata/JOB/feats.scp ark:- |" +else + feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |" +fi if [ ! 
-z "$online_ivector_dir" ]; then ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; diff --git a/egs/wsj/s5/steps/nnet3/decode_semisup.sh b/egs/wsj/s5/steps/nnet3/decode_semisup.sh index 25ce232b2c6..f9c417da070 100755 --- a/egs/wsj/s5/steps/nnet3/decode_semisup.sh +++ b/egs/wsj/s5/steps/nnet3/decode_semisup.sh @@ -86,6 +86,10 @@ done sdata=$data/split$nj; cmvn_opts=`cat $srcdir/cmvn_opts` || exit 1; +use_sliding_window_cmvn=false +if [ -f $srcdir/sliding_window_cmvn ]; then + use_sliding_window_cmvn=`cat $srcdir/sliding_window_cmvn` +fi thread_string= [ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads" @@ -97,7 +101,11 @@ echo $nj > $dir/num_jobs ## Set up features. echo "$0: feature type is raw" -feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |" +if $use_sliding_window_cmvn; then + feats="ark,s,cs:apply-cmvn-sliding $cmvn_opts scp:$sdata/JOB/feats.scp ark:- |" +else + feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |" +fi if [ ! -z "$online_ivector_dir" ]; then ivector_period=$(cat $online_ivector_dir/ivector_period) || exit 1; diff --git a/src/feat/feature-functions.cc b/src/feat/feature-functions.cc index 4ae2550c364..aba4e2e19eb 100644 --- a/src/feat/feature-functions.cc +++ b/src/feat/feature-functions.cc @@ -241,9 +241,50 @@ void ReverseFrames(const MatrixBase &input_features, void SlidingWindowCmnOptions::Check() const { KALDI_ASSERT(cmn_window > 0); - if (center) + if (!center) { KALDI_ASSERT(min_window > 0 && min_window <= cmn_window); + } // else ignored so value doesn't matter. + if (exponential_normalization) + KALDI_ASSERT(!center && cmn_window > 1 && normalize_variance == false); +} + +void ExponentialWindowCmnInternal(const SlidingWindowCmnOptions &opts, + const MatrixBase &input, + MatrixBase *output) { + opts.Check(); + int32 num_frames = input.NumRows(), dim = input.NumCols(), + last_window_start = -1, last_window_end = -1; + Vector cur_avg(dim); + + for (int32 t = 0; t < num_frames; t++) { + // at start of decoding, get at least min_window frames + int32 window_end = std::max(t + 1, opts.min_window); + if (window_end > num_frames) { + // ensure it does not exceed the number of input frames + window_end = num_frames; + } + if (last_window_start == -1) { + // first time -- need to compute the average over a window + SubMatrix input_part(input, 0, window_end, + 0, dim); + cur_avg.AddRowSumMat(1.0 / window_end, input_part , 0.0); + } else if (window_end > last_window_end) { + KALDI_ASSERT(window_end == last_window_end + 1); + SubVector frame_to_add(input, last_window_end); + BaseFloat alpha = 1.0 / opts.cmn_window; + cur_avg.Scale(1.0 - alpha); + cur_avg.AddVec(alpha, frame_to_add); + } + last_window_start = 0; + last_window_end = window_end; + + SubVector input_frame(input, t), + output_frame(*output, t); + output_frame.CopyFromVec(input_frame); + + output_frame.AddVec(-1.0, cur_avg); + } } // Internal version of SlidingWindowCmn with double-precision arguments. 
@@ -251,6 +292,11 @@ void SlidingWindowCmnInternal(const SlidingWindowCmnOptions &opts, const MatrixBase &input, MatrixBase *output) { opts.Check(); + if (opts.exponential_normalization) { + ExponentialWindowCmnInternal(opts, input, output); + return; + } + int32 num_frames = input.NumRows(), dim = input.NumCols(), last_window_start = -1, last_window_end = -1, warning_count = 0; @@ -271,7 +317,7 @@ void SlidingWindowCmnInternal(const SlidingWindowCmnOptions &opts, window_start = 0; // or: window_start -= window_start } if (!opts.center) { - if (window_end > t) + if (window_end > t) // at start of decoding, get at least min_window frames window_end = std::max(t + 1, opts.min_window); } if (window_end > num_frames) { @@ -281,8 +327,8 @@ void SlidingWindowCmnInternal(const SlidingWindowCmnOptions &opts, } if (last_window_start == -1) { SubMatrix input_part(input, - window_start, window_end - window_start, - 0, dim); + window_start, window_end - window_start, + 0, dim); cur_sum.AddRowSumMat(1.0, input_part , 0.0); if (opts.normalize_variance) cur_sumsq.AddDiagMat2(1.0, input_part, kTrans, 0.0); diff --git a/src/feat/feature-functions.h b/src/feat/feature-functions.h index 52454f3048b..4e04cba2012 100644 --- a/src/feat/feature-functions.h +++ b/src/feat/feature-functions.h @@ -161,13 +161,15 @@ struct SlidingWindowCmnOptions { int32 max_warnings; bool normalize_variance; bool center; + bool exponential_normalization; SlidingWindowCmnOptions(): cmn_window(600), min_window(100), max_warnings(5), normalize_variance(false), - center(false) { } + center(false), + exponential_normalization(false) { } void Register(OptionsItf *opts) { opts->Register("cmn-window", &cmn_window, "Window in frames for running " @@ -182,6 +184,12 @@ struct SlidingWindowCmnOptions { opts->Register("center", ¢er, "If true, use a window centered on the " "current frame (to the extent possible, modulo end effects). " "If false, window is to the left."); + opts->Register("exponential-normalization", &exponential_normalization, + "If true, use exponentially decaying weighted window with " + "time-constant = cmn-window instead of the usual " + "rectangular window averaging. 
" + "Only valid if center == false and " + "norm-vars == false."); } void Check() const; }; From 14d4a3443ca7eeb7f2d748c7a24f08c9a458e3d7 Mon Sep 17 00:00:00 2001 From: Vimal Manohar Date: Sat, 23 Mar 2019 14:51:47 -0400 Subject: [PATCH 2/2] Fixing missing options --- egs/wsj/s5/steps/libs/nnet3/train/common.py | 8 +++++++- egs/wsj/s5/steps/nnet3/chain/get_egs.sh | 16 +++++++++++++--- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/egs/wsj/s5/steps/libs/nnet3/train/common.py b/egs/wsj/s5/steps/libs/nnet3/train/common.py index 1a038cc23f2..d1032d5e04e 100644 --- a/egs/wsj/s5/steps/libs/nnet3/train/common.py +++ b/egs/wsj/s5/steps/libs/nnet3/train/common.py @@ -321,7 +321,8 @@ def halve_minibatch_size_str(minibatch_size_str): def copy_egs_properties_to_exp_dir(egs_dir, dir): try: - for file in ['cmvn_opts', 'splice_opts', 'info/final.ie.id', 'final.mat']: + for file in ['cmvn_opts', 'splice_opts', 'info/final.ie.id', 'final.mat', + 'sliding_window_cmvn']: file_name = '{dir}/{file}'.format(dir=egs_dir, file=file) if os.path.isfile(file_name): shutil.copy(file_name, dir) @@ -716,6 +717,11 @@ def __init__(self, action=common_lib.NullstrToNoneAction, help="A string specifying '--norm-means' " "and '--norm-vars' values") + self.parser.add_argument("--feat.use-sliding-window-cmvn", type=str, + dest='use_sliding_window_cmvn', default='false', + choices=["true", "false"], + help="Use sliding window CMVN instead of " + "per-speaker or per-utterance CMVN") # egs extraction options. there is no point adding the chunk context # option for non-RNNs (by which we mean basic TDNN-type topologies), as diff --git a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh index 9996820d6d3..74de39d1f8b 100755 --- a/egs/wsj/s5/steps/nnet3/chain/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/chain/get_egs.sh @@ -65,6 +65,7 @@ online_ivector_dir= # can be used if we are including speaker information as iV cmvn_opts= # can be used for specifying CMVN options, if feature type is not lda (if lda, # it doesn't make sense to use different options than were used as input to the # LDA transform). This is used to turn off CMVN in the online-nnet experiments. +use_sliding_window_cmvn=false lattice_lm_scale= # If supplied, the graph/lm weight of the lattices will be # used (with this scale) in generating supervisions # This is 0 by default for conventional supervised training, @@ -184,9 +185,18 @@ fi ## Set up features. 
echo "$0: feature type is raw" -feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:- ark:- |" -valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |" -train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |" + +if $use_sliding_window_cmvn; then + feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn-sliding $cmvn_opts scp:- ark:- |" + valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn-sliding $cmvn_opts scp:- ark:- |" + train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn-sliding $cmvn_opts scp:- ark:- |" + echo $use_sliding_window_cmvn > $dir/sliding_window_cmvn +else + feats="ark,s,cs:utils/filter_scp.pl --exclude $dir/valid_uttlist $sdata/JOB/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:- ark:- |" + valid_feats="ark,s,cs:utils/filter_scp.pl $dir/valid_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |" + train_subset_feats="ark,s,cs:utils/filter_scp.pl $dir/train_subset_uttlist $data/feats.scp | apply-cmvn $cmvn_opts --utt2spk=ark:$data/utt2spk scp:$data/cmvn.scp scp:- ark:- |" +fi + echo $cmvn_opts >$dir/cmvn_opts # caution: the top-level nnet training script should copy this to its own dir now. tree-info $chaindir/tree | grep num-pdfs | awk '{print $2}' > $dir/info/num_pdfs || exit 1