Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

added modified MFCC features based on DNN-c and fDNN-c features; it i… #2908

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 14 additions & 27 deletions src/feat/feature-mfcc-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -95,8 +95,8 @@ static void UnitTestSimple() {
op.frame_opts.round_to_power_of_two = true;
op.mel_opts.low_freq = 0.0;
op.mel_opts.htk_mode = true;
op.mel_opts.modified = (Rand() % 2 == 0 ? true : false);
op.htk_compat = true;

Mfcc mfcc(op);
// use default parameters

Expand Down Expand Up @@ -613,42 +613,29 @@ static void UnitTestHTKCompare6() {
}

std::cout << "Test passed :)\n\n";

unlink("tmp.test.wav.fea_kaldi.6");
}

void UnitTestVtln() {
// Test the function VtlnWarpFreq.
BaseFloat low_freq = 10, high_freq = 7800,
vtln_low_cutoff = 20, vtln_high_cutoff = 7400;

BaseFloat low_freq = 10, high_freq = 7800;
MelBanksOptions mel_opts;
mel_opts.low_freq = low_freq, mel_opts.high_freq = high_freq;
FrameExtractionOptions frame_opts;
MelBanks melfbank(mel_opts, frame_opts, 0.9);
for (size_t i = 0; i < 100; i++) {
BaseFloat freq = 5000, warp_factor = 0.9 + RandUniform() * 0.2;
AssertEqual(MelBanks::VtlnWarpFreq(vtln_low_cutoff, vtln_high_cutoff,
low_freq, high_freq, warp_factor,
freq),
freq / warp_factor);

AssertEqual(MelBanks::VtlnWarpFreq(vtln_low_cutoff, vtln_high_cutoff,
low_freq, high_freq, warp_factor,
low_freq),
low_freq);
AssertEqual(MelBanks::VtlnWarpFreq(vtln_low_cutoff, vtln_high_cutoff,
low_freq, high_freq, warp_factor,
high_freq),
high_freq);
AssertEqual(melfbank.VtlnWarpFreq(warp_factor, freq), freq / warp_factor);

AssertEqual(melfbank.VtlnWarpFreq(warp_factor, low_freq), low_freq);
AssertEqual(melfbank.VtlnWarpFreq(warp_factor, high_freq), high_freq);
BaseFloat freq2 = low_freq + (high_freq-low_freq) * RandUniform(),
freq3 = freq2 + (high_freq-freq2) * RandUniform(); // freq3>=freq2
BaseFloat w2 = MelBanks::VtlnWarpFreq(vtln_low_cutoff, vtln_high_cutoff,
low_freq, high_freq, warp_factor,
freq2);
BaseFloat w3 = MelBanks::VtlnWarpFreq(vtln_low_cutoff, vtln_high_cutoff,
low_freq, high_freq, warp_factor,
freq3);
BaseFloat w2 = melfbank.VtlnWarpFreq(warp_factor, freq2);
BaseFloat w3 = melfbank.VtlnWarpFreq(warp_factor, freq3);
KALDI_ASSERT(w3 >= w2); // increasing function.
BaseFloat w3dash = MelBanks::VtlnWarpFreq(vtln_low_cutoff, vtln_high_cutoff,
low_freq, high_freq, 1.0,
freq3);
BaseFloat w3dash = melfbank.VtlnWarpFreq(1.0, freq3);
AssertEqual(w3dash, freq3);
}
}
Expand Down
249 changes: 154 additions & 95 deletions src/feat/mel-computations.cc
Original file line number Diff line number Diff line change
Expand Up @@ -34,86 +34,65 @@ MelBanks::MelBanks(const MelBanksOptions &opts,
const FrameExtractionOptions &frame_opts,
BaseFloat vtln_warp_factor):
htk_mode_(opts.htk_mode) {
SetConfigs(opts, frame_opts, vtln_warp_factor);

int32 num_bins = opts.num_bins;
if (num_bins < 3) KALDI_ERR << "Must have at least 3 mel bins";
BaseFloat sample_freq = frame_opts.samp_freq;
int32 window_length_padded = frame_opts.PaddedWindowSize();
KALDI_ASSERT(window_length_padded % 2 == 0);
int32 num_fft_bins = window_length_padded / 2;
BaseFloat nyquist = 0.5 * sample_freq;

BaseFloat low_freq = opts.low_freq, high_freq;
if (opts.high_freq > 0.0)
high_freq = opts.high_freq;
else
high_freq = nyquist + opts.high_freq;

if (low_freq < 0.0 || low_freq >= nyquist
|| high_freq <= 0.0 || high_freq > nyquist
|| high_freq <= low_freq)
KALDI_ERR << "Bad values in options: low-freq " << low_freq
<< " and high-freq " << high_freq << " vs. nyquist "
<< nyquist;

BaseFloat fft_bin_width = sample_freq / window_length_padded;
// fft-bin width [think of it as Nyquist-freq / half-window-length]
BaseFloat mel_low_freq = MelScale(low_freq_);
BaseFloat mel_high_freq = MelScale(high_freq_);

BaseFloat mel_low_freq = MelScale(low_freq);
BaseFloat mel_high_freq = MelScale(high_freq);

debug_ = opts.debug_mel;

// divide by num_bins+1 in next line because of end-effects where the bins
// spread out to the sides.
BaseFloat mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins+1);
bins_.resize(num_bins);
center_freqs_.Resize(num_bins);

BaseFloat vtln_low = opts.vtln_low,
vtln_high = opts.vtln_high;
if (vtln_high < 0.0) {
vtln_high += nyquist;
for (int32 bin = 0; bin < num_bins; bin++) {
BaseFloat mel = mel_low_freq +
(bin + 1) * (mel_high_freq - mel_low_freq) / (num_bins + 1);
if (vtln_warp_factor != 1.0)
mel = VtlnWarpMelFreq(vtln_warp_factor, mel);
center_freqs_(bin) = InverseMelScale(mel);
}

if (vtln_warp_factor != 1.0 &&
(vtln_low < 0.0 || vtln_low <= low_freq
|| vtln_low >= high_freq
|| vtln_high <= 0.0 || vtln_high >= high_freq
|| vtln_high <= vtln_low))
KALDI_ERR << "Bad values in options: vtln-low " << vtln_low
<< " and vtln-high " << vtln_high << ", versus "
<< "low-freq " << low_freq << " and high-freq "
<< high_freq;
if (!opts.modified)
ComputeBins(opts.htk_mode);
else
ComputeModifiedBins();

bins_.resize(num_bins);
center_freqs_.Resize(num_bins);
if (debug_) {
for (size_t i = 0; i < bins_.size(); i++) {
KALDI_LOG << "bin " << i << ", offset = " << bins_[i].first
<< ", vec = " << bins_[i].second;
}
}
}

void MelBanks::ComputeBins(bool htk_mode) {
int32 num_bins = center_freqs_.Dim();
for (int32 bin = 0; bin < num_bins; bin++) {
BaseFloat left_mel = mel_low_freq + bin * mel_freq_delta,
center_mel = mel_low_freq + (bin + 1) * mel_freq_delta,
right_mel = mel_low_freq + (bin + 2) * mel_freq_delta;

if (vtln_warp_factor != 1.0) {
left_mel = VtlnWarpMelFreq(vtln_low, vtln_high, low_freq, high_freq,
vtln_warp_factor, left_mel);
center_mel = VtlnWarpMelFreq(vtln_low, vtln_high, low_freq, high_freq,
vtln_warp_factor, center_mel);
right_mel = VtlnWarpMelFreq(vtln_low, vtln_high, low_freq, high_freq,
vtln_warp_factor, right_mel);
}
center_freqs_(bin) = InverseMelScale(center_mel);
// center_mel is the center frequency (in mel) of this bin, and left_mel and
// right_mel are those of the bins immediately to the left and right.
BaseFloat center_mel = MelScale(center_freqs_(bin)),
left_mel = MelScale(bin == 0 ?
low_freq_ : center_freqs_(bin - 1)),
right_mel = MelScale(bin == num_bins - 1 ?
high_freq_ : center_freqs_(bin + 1));
// this_bin will be a vector of coefficients that is only
// nonzero where this mel bin is active.
Vector<BaseFloat> this_bin(num_fft_bins);
Vector<BaseFloat> this_bin(num_fft_bins_);
int32 first_index = -1, last_index = -1;
for (int32 i = 0; i < num_fft_bins; i++) {
BaseFloat freq = (fft_bin_width * i); // Center frequency of this fft
for (int32 i = 0; i < num_fft_bins_; i++) {
BaseFloat freq = (fft_bin_width_ * i); // Center frequency of this fft
// bin.
BaseFloat mel = MelScale(freq);
if (mel > left_mel && mel < right_mel) {
BaseFloat weight;
if (mel <= center_mel)
weight = (mel - left_mel) / (center_mel - left_mel);
else
weight = (right_mel-mel) / (right_mel-center_mel);
weight = (right_mel - mel) / (right_mel - center_mel);
this_bin(i) = weight;
if (first_index == -1)
first_index = i;
Expand All @@ -129,29 +108,73 @@ MelBanks::MelBanks(const MelBanksOptions &opts,
bins_[bin].second.CopyFromVec(this_bin.Range(first_index, size));

// Replicate a bug in HTK, for testing purposes.
if (opts.htk_mode && bin == 0 && mel_low_freq != 0.0)
if (htk_mode && bin == 0 && low_freq_ != 0.0)
bins_[bin].second(0) = 0.0;

}
if (debug_) {
for (size_t i = 0; i < bins_.size(); i++) {
KALDI_LOG << "bin " << i << ", offset = " << bins_[i].first
<< ", vec = " << bins_[i].second;
}

/*
  Notes on the shape of the modified bins.

  They are shaped like a cosine function from -pi/2 to pi/2 (unlike the standard
  triangular bins).  We define their diameter as the distance between the
  first and last nonzero value (pi for the canonical function).  If there are
  a lot of bins, their diameter is defined by a formula and it's a function of
  the center frequency f of the bin:
     diameter = alpha1 + alpha2 * f / (f + breakpoint_).
  So, it increases from alpha1 Hz to (alpha1 + alpha2) Hz with a knee around breakpoint_ (Hz).
  However (and this matters if the number of bins is relatively small), we never
  let the diameter fall below the point where the crossing-point of this and
  the next bin would be less than 0.2.  By 'crossing-point' we mean the y-value
  where the raised-cosines cross.  This ensures that there won't be too much of
  a 'dip' in the middle of the two bins.
*/
void MelBanks::ComputeModifiedBins() {
int32 num_bins = center_freqs_.Dim();
for (int32 bin = 0; bin < num_bins; bin++) {
BaseFloat center_freq = center_freqs_(bin),
next_center = (bin == num_bins - 1 ?
high_freq_ : center_freqs_(bin + 1));

// note: breakpoint_ is 900 (Hz).
BaseFloat diameter_floor = (next_center - center_freq) * 1.2,
diameter = 80.0 + 100.0 * (center_freq / (center_freq + breakpoint_));

diameter = sqrt(diameter * diameter + diameter_floor * diameter_floor);

// 'freq_scale' is the scaling factor on the frequencies that will ensure
// that the diameter becomes equal to pi, like the canonical bin function
// (the cosine from -pi/2 to pi/2).
BaseFloat freq_scale = M_PI / diameter;

// this_bin will be a vector of coefficients that is only
// nonzero where this mel bin is active.
Vector<BaseFloat> this_bin(num_fft_bins_);
int32 first_index = -1, last_index = -1;

for (int32 i = 0; i < num_fft_bins_; i++) {
BaseFloat freq = (fft_bin_width_ * i); // Center frequency of this fft
// bin.
BaseFloat normalized_freq = freq_scale * (freq - center_freq);
if (normalized_freq > -M_PI_2 && normalized_freq < M_PI_2) {
BaseFloat weight = cos(normalized_freq);
this_bin(i) = weight;
if (first_index == -1)
first_index = i;
last_index = i;
}
}
KALDI_ASSERT(first_index != -1 && last_index >= first_index
&& "You may have set --num-mel-bins too large.");

bins_[bin].first = first_index;
int32 size = last_index + 1 - first_index;
bins_[bin].second.Resize(size);
bins_[bin].second.CopyFromVec(this_bin.Range(first_index, size));
}
}

// Copy constructor.  Copies the filterbank itself (center_freqs_, bins_) and
// also every cached configuration value that SetConfigs() stores on the
// object.  The latter are read by member functions such as VtlnWarpFreq() and
// ComputeBins(), so leaving them out would make those functions read
// uninitialized members when called on a copy.
MelBanks::MelBanks(const MelBanks &other):
    center_freqs_(other.center_freqs_),
    bins_(other.bins_),
    debug_(other.debug_),
    htk_mode_(other.htk_mode_) {
  // These are assigned in the body rather than in the initializer list so
  // that this code does not depend on the declaration order in the header.
  num_fft_bins_ = other.num_fft_bins_;
  fft_bin_width_ = other.fft_bin_width_;
  low_freq_ = other.low_freq_;
  high_freq_ = other.high_freq_;
  vtln_low_ = other.vtln_low_;
  vtln_high_ = other.vtln_high_;
  breakpoint_ = other.breakpoint_;
  second_breakpoint_ = other.second_breakpoint_;
}

BaseFloat MelBanks::VtlnWarpFreq(BaseFloat vtln_low_cutoff, // upper+lower frequency cutoffs for VTLN.
BaseFloat vtln_high_cutoff,
BaseFloat low_freq, // upper+lower frequency cutoffs in mel computation
BaseFloat high_freq,
BaseFloat vtln_warp_factor,
BaseFloat MelBanks::VtlnWarpFreq(BaseFloat vtln_warp_factor,
BaseFloat freq) {
/// This computes a VTLN warping function that is not the same as HTK's one,
/// but has similar inputs (this function has the advantage of never producing
Expand Down Expand Up @@ -180,45 +203,34 @@ BaseFloat MelBanks::VtlnWarpFreq(BaseFloat vtln_low_cutoff, // upper+lower freq
/// = vtln_low_cutoff * max(1, vtln_warp_factor)


if (freq < low_freq || freq > high_freq) return freq; // in case this gets called
if (freq < low_freq_ || freq > high_freq_) return freq; // in case this gets called
// for out-of-range frequencies, just return the freq.

KALDI_ASSERT(vtln_low_cutoff > low_freq &&
"be sure to set the --vtln-low option higher than --low-freq");
KALDI_ASSERT(vtln_high_cutoff < high_freq &&
"be sure to set the --vtln-high option lower than --high-freq [or negative]");
BaseFloat one = 1.0;
BaseFloat l = vtln_low_cutoff * std::max(one, vtln_warp_factor);
BaseFloat h = vtln_high_cutoff * std::min(one, vtln_warp_factor);
BaseFloat l = vtln_low_ * std::max(BaseFloat(1.0), vtln_warp_factor);
BaseFloat h = vtln_high_ * std::min(BaseFloat(1.0), vtln_warp_factor);
BaseFloat scale = 1.0 / vtln_warp_factor;
BaseFloat Fl = scale * l; // F(l);
BaseFloat Fh = scale * h; // F(h);
KALDI_ASSERT(l > low_freq && h < high_freq);
KALDI_ASSERT(l > low_freq_ && h < high_freq_);
// slope of left part of the 3-piece linear function
BaseFloat scale_left = (Fl - low_freq) / (l - low_freq);
BaseFloat scale_left = (Fl - low_freq_) / (l - low_freq_);
// [slope of center part is just "scale"]

// slope of right part of the 3-piece linear function
BaseFloat scale_right = (high_freq - Fh) / (high_freq - h);
BaseFloat scale_right = (high_freq_ - Fh) / (high_freq_ - h);

if (freq < l) {
return low_freq + scale_left * (freq - low_freq);
return low_freq_ + scale_left * (freq - low_freq_);
} else if (freq < h) {
return scale * freq;
} else { // freq >= h
return high_freq + scale_right * (freq - high_freq);
return high_freq_ + scale_right * (freq - high_freq_);
}
}

BaseFloat MelBanks::VtlnWarpMelFreq(BaseFloat vtln_low_cutoff, // upper+lower frequency cutoffs for VTLN.
BaseFloat vtln_high_cutoff,
BaseFloat low_freq, // upper+lower frequency cutoffs in mel computation
BaseFloat high_freq,
BaseFloat vtln_warp_factor,
BaseFloat MelBanks::VtlnWarpMelFreq(BaseFloat vtln_warp_factor,
BaseFloat mel_freq) {
return MelScale(VtlnWarpFreq(vtln_low_cutoff, vtln_high_cutoff,
low_freq, high_freq,
vtln_warp_factor, InverseMelScale(mel_freq)));
return MelScale(VtlnWarpFreq(vtln_warp_factor, InverseMelScale(mel_freq)));
}


Expand Down Expand Up @@ -250,6 +262,53 @@ void MelBanks::Compute(const VectorBase<BaseFloat> &power_spectrum,
}
}

void MelBanks::SetConfigs(const MelBanksOptions &opts,
const FrameExtractionOptions &frame_opts,
BaseFloat vtln_warp_factor) {
BaseFloat sample_freq = frame_opts.samp_freq,
nyquist = 0.5 * sample_freq;
int32 window_length_padded = frame_opts.PaddedWindowSize();
KALDI_ASSERT(window_length_padded % 2 == 0);
num_fft_bins_ = window_length_padded / 2;
// fft-bin width [think of it as Nyquist-freq / half-window-length]
fft_bin_width_ = sample_freq / window_length_padded;

debug_ = opts.debug_mel;


low_freq_ = opts.low_freq;
if (opts.high_freq > 0.0)
high_freq_ = opts.high_freq;
else
high_freq_ = nyquist + opts.high_freq;

if (low_freq_ < 0.0 || low_freq_ >= nyquist
|| high_freq_ <= 0.0 || high_freq_ > nyquist
|| high_freq_ <= low_freq_)
KALDI_ERR << "Bad values in options: low-freq " << low_freq_
<< " and high-freq " << high_freq_ << " vs. nyquist "
<< nyquist;

breakpoint_ = (opts.modified ? 500.0 : 700.0);
second_breakpoint_ = (opts.modified ? 3500 : -1);
vtln_low_ = opts.vtln_low;
if (opts.vtln_high > 0.0)
vtln_high_ = opts.vtln_high;
else
vtln_high_ = opts.vtln_high + nyquist;

if (vtln_warp_factor != 1.0 &&
(vtln_low_ < 0.0 || vtln_low_ <= low_freq_
|| vtln_low_ >= high_freq_
|| vtln_high_ <= 0.0 || vtln_high_ >= high_freq_
|| vtln_high_ <= vtln_low_))
KALDI_ERR << "Bad values in options: vtln-low " << vtln_low_
<< " and vtln-high " << vtln_high_ << ", versus "
<< "low-freq " << low_freq_ << " and high-freq "
<< high_freq_;
}


void ComputeLifterCoeffs(BaseFloat Q, VectorBase<BaseFloat> *coeffs) {
// Compute liftering coefficients (scaling on cepstral coeffs)
// coeffs are numbered slightly differently from HTK: the zeroth
Expand Down
Loading