Edinburgh Speech Tools  2.1-release
EST_sigpr_utt.h
Go to the documentation of this file.
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1995,1996 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 
34 #ifndef __EST_SIGPR_UTT_H__
35 #define __EST_SIGPR_UTT_H__
36 
37 #include "sigpr/EST_sigpr_frame.h"
38 #include "sigpr/EST_Window.h"
39 #include "EST_Track.h"
40 #include "EST_Wave.h"
41 #include "EST_Option.h"
42 
43 #define DEFAULT_WINDOW_NAME "hamming"
44 #define DEFAULT_FRAME_FACTOR 2.0
45 
46 /* Note: some of these functions deliberately don't have
47  doxygen style comments, mainly because they are, or will be
48  superseded soon.
49 */
50 
51 /** @defgroup FunctionsForGeneratingTracks Functions for generating Tracks
52  */
53 
54 /**@defgroup Functionsforusewithframebasedprocessing Functions for use with frame based processing
55  @ingroup FunctionsForGeneratingTracks
56 
57 In the following functions, the input is an EST_Wave waveform,
58 and the output is a (usually multi-channel) EST_Track. The
59 track must be set up appropriately beforehand. This means the track
60 must be resized accordingly with the correct number of frames and
61 channels.
62 
63 The positions of the frames are found by examination of the **time**
64 array in the EST_Track, which must be filled prior to the function
65 call. The usual requirement is for fixed frame analysis, where each
66 analysis frame is, say, 10ms after the previous one.
67 
68 A common alternative is to perform pitch-synchronous
69 analysis where the time shift is related to the local pitch period.
70 
71 */
72 
73 ///@{
74 
75 /** Produce a single set of coefficients from a waveform. The type of
76  coefficient required is given in the argument `type`.
77 
78  \param type { Possible types are:
79  - **lpc**: linear predictive coding
80  - **cep**: cepstrum coding from lpc coefficients
81  - **melcep**: Mel scale cepstrum coding via fbank
82  - **fbank**: Mel scale log filterbank analysis
83  - **lsf**: line spectral frequencies
84  - **ref**: Linear prediction reflection coefficients
85  - **power**:
86  - **f0**: srpd algorithm
87  - **energy**: root mean square energy
88  }
89 
90 The order of the analysis is calculated from the number of
91 channels in `a`. The positions of the analysis
92 windows must be given by filling in the track's time array.
93 
94 This function windows the waveform at the intervals given by the track
95 time array. The length of each window is `factor * the local time shift`.
96 The windowing function is given by `wf`.
97 
98  @param sig input waveform
99  @param a {output coefficients. These have been pre-allocated and the
100  number of channels in `a` indicates the order of the analysis.}
101  @param type the types of coefficients to be produced. "lpc", "cep" etc
102  @param factor {the frame length factor, i.e. the analysis frame length
103  will be this times the local pitch period.}
104 
105  @param wf function for windowing. See Windowing mechanisms
106 */
107 void sig2coef(EST_Wave &sig, EST_Track &a, EST_String type,
108  float factor = 2.0,
109  EST_WindowFunc *wf = EST_Window::creator(DEFAULT_WINDOW_NAME));
110 
111 /** Produce multiple coefficients from a waveform by repeated calls to
112  sig2coef.
113 
114 @param sig: input waveform
115 @param fv: output coefficients. These have been pre-allocated and the
116  number of channels in `fv` indicates the order of the analysis.
117 @param op: Features structure containing options for analysis order,
118  frame shift etc.
119 @param slist: list of types of coefficients required, from the set of
120 possible types that sig2coef can take.
121 */
122 void sigpr_base(EST_Wave &sig, EST_Track &fv, EST_Features &op,
123  const EST_StrList &slist);
124 
125 /** Calculate the power for each frame of the waveform.
126 
127 @param sig: input waveform
128 @param a: output power track
129 @param factor: the frame length factor, i.e. the analysis frame length
130  will be this times the local pitch period.
131 */
132 void power(EST_Wave &sig, EST_Track &a, float factor);
133 
134 /** Calculate the rms energy for each frame of the waveform.
135 
136 This function calls
137 \ref sig2energy
138 
139 
140 @param sig input waveform
141 @param a output coefficients
142 @param factor the frame length factor, i.e. the analysis frame length
143  will be this times the local pitch period.
144 
145 */
146 void energy(EST_Wave &sig, EST_Track &a, float factor);
147 
148 
149 /** Mel scale filter bank analysis. The Mel scale triangular filters
150 are computed via an FFT (see \ref fastFFT). This routine is required
151 for Mel cepstral analysis (see \ref melcep). The analysis of each
152 frame is done by \ref sig2fbank.
153 
154 A typical filter bank analysis for speech recognition might use log
155 energy outputs from 20 filters.
156 
157 @param sig: input waveform
158 @param fbank: the output. The number of filters is determined from the
159  size of this track.
160 @param factor: the frame length factor, i.e. the analysis frame length
161  will be this times the local pitch period
162 @param wf: function for windowing. See \ref {Windowing mechanisms}
163 @param up: whether the filterbank analysis should use
164  power rather than energy.
165 @param take_log: whether to take logs of the filter outputs
166 
167 @see sig2fbank
168 @see melcep
169 */
170 void fbank(EST_Wave &sig,
171  EST_Track &fbank,
172  const float factor,
173  EST_WindowFunc *wf = EST_Window::creator(DEFAULT_WINDOW_NAME),
174  const bool up = false,
175  const bool take_log = true);
176 
177 /** Mel scale cepstral analysis via filter bank analysis. Cepstral
178 parameters are computed for each frame of speech. The analysis
179 requires \ref fbank . The cepstral analysis of the filterbank outputs
180 is performed by \ref fbank2melcep .
181 
182 A typical Mel cepstral coefficient (MFCC) analysis for speech recognition
183 might use 12 cepstral coefficients computed from a 20 channel filterbank.
184 
185 
186 @param sig: input waveform
187 @param mfcc_track: the output
188 @param factor: the frame length factor, i.e. the analysis frame length
189  will be this times the local pitch period
190 @param fbank_order: the number of Mel scale filters used for the analysis
191 @param liftering_parameter: for filtering in the cepstral domain
192  See \ref fbank2melcep
193 @param wf: function for windowing. See \ref Windowing mechanisms
194 @param include_c0: whether the zero'th cepstral coefficient is to be included
195 @param up: whether the filterbank analysis should use
196  power rather than energy.
197 
198 @see fbank
199 @see fbank2melcep
200 */
201 void melcep(EST_Wave &sig,
202  EST_Track &mfcc_track,
203  float factor,
204  int fbank_order,
205  float liftering_parameter,
206  EST_WindowFunc *wf = EST_Window::creator(DEFAULT_WINDOW_NAME),
207  const bool include_c0 = false,
208  const bool up = false);
209 
210 ///@}
211 
212 
213 /**@defgroup PitchF0DetectionAlgorithmfunctions Pitch/F0 Detection Algorithm functions
214  @ingroup FunctionsForGeneratingTracks
215 
216 These functions are used to produce a track of fundamental frequency
217 (F0) against time of a waveform.
218 */
219 
220 ///@{
221 
222 
223 /** Top level pitch (F0) detection algorithm. Returns a track
224 containing evenly spaced frames of speech, each containing an F0 value
225 for that point.
226 
227 At present, only the \ref srpd pitch tracker is implemented, so
228 this is always called regardless of what `method`
229 is set to.
230 
231 @param sig: input waveform
232 @param fz: output f0 contour
233 @param op: parameters for pitch tracker
234 @param method: pda method to be used.
235 */
236 void pda(EST_Wave &sig, EST_Track &fz, EST_Features &op, EST_String method="");
237 
238 
239 /** Top level intonation contour detection algorithm. Returns a track
240 containing evenly spaced frames of speech, each containing an F0 for that
241 point. `icda` differs from \ref pda in that the contour is smoothed,
242 and unvoiced portions have interpolated F0 values.
243 
244 @param sig: input waveform
245 @param fz: output f0 contour
246 @param speech: {Interpolation is controlled by the `speech` track. When
247 a point has a positive value in the speech track, it is a candidate
248 for interpolation. }
249 @param op: parameters for pitch tracker
250 @param method: pda method to be used.
251 */
252 void icda(EST_Wave &sig, EST_Track &fz, EST_Track &speech,
253  EST_Option &op, EST_String method = "");
254 
255 /** Create a set of sensible defaults for use in pda and icda.
256 
257 */
258 void default_pda_options(EST_Features &al);
259 
260 
261 /** Super resolution pitch tracker.
262 
263 srpd is a pitch detection algorithm that produces a fundamental
264 frequency contour from a speech waveform. At present only the super
265 resolution pitch determination algorithm is implemented. See (Medan,
266 Yair, and Chazan, 1991) and (Bagshaw et al., 1993) for a detailed
267 description of the algorithm.
268 
269 Frames of data are read in from `sig` in
270 chronological order such that each frame is shifted in time from its
271 predecessor by `pda_frame_shift`. Each frame is
272 analysed in turn.
273 
274 
275 The maximum and minimum signal amplitudes are initially found over the
276 duration of two segments, each of length N_min samples. If the sum of
277 their absolute values is below two times
278 noise_floor, the frame is classified as
279 representing silence and no coefficients are calculated. Otherwise, a
280 cross correlation coefficient is calculated for all n from a period in
281 samples corresponding to `min_pitch`
282 to a period in samples corresponding to
283 `max_pitch`, in steps
284 of `decimation_factor`. In calculating the
285 coefficient only one in `decimation_factor`
286 samples of the two segments are used. Such down-sampling permits rapid
287 estimates of the coefficients to be calculated over the range
288 N_min <= n <= N_max. This results in a cross-correlation track for the
289 frame being analysed.
290 
291 Local maxima of the track with a coefficient value above a specified
292 threshold form candidates for the fundamental period. The threshold is
293 adaptive and dependent upon the values `v2uv_coef_thresh`,
294 `min_v2uv_coef_thresh`, and `v2uv_coef_thresh_ratio`. If the previously
295 analysed frame was classified as unvoiced or silent (which is the
296 initial state) then the threshold is set to
297 `v2uv_coef_thresh`. Otherwise, the previous
298 frame was classified as being voiced, and the threshold is set equal
299 to `v2uv_coef_thresh_ratio`
300 times the cross-correlation coefficient
301 value at the point of the previous fundamental period in the former
302 coefficients track. This product is not permitted to drop below
303 `v2uv_coef_thresh`.
304 
305 If no candidates for the fundamental period are found, the frame is classified
306 as being unvoiced. Otherwise, the candidates are further processed to identify
307 the most likely true pitch period. During this additional processing, a
308 threshold given by `anti_doubling_thresh` is used.
309 
310 If the `peak_tracking` flag is set to true,
311 biasing is applied to the cross-correlation track as described in
312 (Bagshaw et al., 1993).
313 
314 @param sig: input waveform
315 @param options: options regarding pitch tracking parameters
316 @param options.min_pitch: minimum permitted F0 value
317 @param options.max_pitch: maximum permitted F0 value
318 @param options.pda_frame_shift: analysis frame shift
319 @param options.pda_frame_length: analysis frame length
320 @param options.lpf_cutoff: cut off frequency for low pass filtering
321 @param options.lpf_order: order of low pass filtering (must be odd)
322 @param options.decimation
323 @param options.noise_floor
324 @param options.min_v2uv_coef_thresh
325 @param options.v2uv_coef_thresh_ratio
326 @param options.v2uv_coef_thresh
327 @param options.anti_doubling_thresh
328 @param options.peak_tracking
329 
330 */
331 void srpd(EST_Wave &sig, EST_Track &fz, EST_Features &options);
332 
333 /** Smooth selected parts of an f0 contour. Interpolation is
334 controlled by the <tt>speech</tt> track. When a point has a positive
335 value in the speech track, it is a candidate for interpolation.
336 */
337 void smooth_phrase(EST_Track &c, EST_Track &speech, EST_Features &options,
338  EST_Track &sm);
339 
340 /** Smooth all the points in an F0 contour*/
341 void smooth_portion(EST_Track &c, EST_Option &op);
342 
343 ///@}
344 
345 
346 /**@defgroup DeltaandAccelerationcoefficients Delta and Acceleration coefficients
347  @ingroup FunctionsForGeneratingTracks
348 
349 Produce delta and acceleration coefficients from a set of coefficients
350 or the waveform.
351 */
352 
353 ///@{
354 
355 /** Produce a set of delta coefficients for a track
356 
357 The delta function is used to produce a set of coefficients which
358 estimate the rate of change of a set of parameters. The output track
359 `d` must be set up beforehand, i.e. it must have
360 the same number of frames and channels as `tr`.
361 
362 @param tr: input track of base coefficients
363 @param d: output track of delta coefficients.
364 @param regression_length: number of previous frames on which delta
365  estimation is calculated.
366 */
367 void delta(EST_Track &tr, EST_Track &d, int regression_length = 3);
368 
369 /** Produce multiple sets of delta coefficients from a waveform.
370 
371  Calculate specified types of delta coefficients. This function is
372  used when the base types of coefficients haven't been calculated.
373  This function calls sig2coef to calculate the base types from which
374  the deltas are calculated, and hence the requirements governing the
375  setup of `fv` for sig2coef also hold here.
376 
377 @param sig: input waveform
378 @param fv: output coefficients. These have been pre-allocated and the
379  number of channels in `fv` indicates the order of the analysis.
380 @param op: Features structure containing options for analysis order,
381  frame shift etc.
382 @param slist: list of types of delta coefficients required.
383 */
384 void sigpr_delta(EST_Wave &sig, EST_Track &fv, EST_Features &op,
385  const EST_StrList &slist);
386 
387 /** Produce multiple sets of acceleration coefficients from a waveform
388 
389  Calculate specified types of acceleration coefficients. This function
390  is used when the base types of coefficient haven't been calculated.
391  This function calls sig2coef to calculate the base types from which
392  the deltas are calculated, and hence the requirements governing the
393  setup of `fv` for sig2coef also hold here.
394 
395 @param sig: input waveform
396 @param fv: output coefficients. These have been pre-allocated and the
397  number of channels in `fv` indicates the order of the analysis.
398 @param op: Features structure containing options for analysis order,
399  frame shift etc.
400 @param slist: list of types of acceleration coefficients required.
401 
402 
403 The delta function is used to produce a set of coefficients which
404 estimate the rate of change of a set of parameters.
405 */
406 void sigpr_acc(EST_Wave &sig, EST_Track &fv, EST_Features &op,
407  const EST_StrList &slist);
408 
409 ///@}
410 
411 /* Convert a track containing coefficients of one type to a track
412 containing coefficients of another.
413 
414 @param in_track input set of coefficients
415 @param out_track output set of coefficients
416 @param out_type name of desired output coefficients.
417 @param in_type optional: often it is possible to determine the type of
418 the input coefficients from the channel names. If this is not possible or
419 these names should be ignored, the `in_type` parameter can be used.
420 
421 */
422 void convert_track(EST_Track &in_track, EST_Track &out_track,
423  const EST_String &out_type,
424  const EST_String &in_type = "");
425 
426 
427 
428 #endif /* __EST_SIGPR_UTT_H__ */
429 
A class for storing digital waveforms. The waveform is stored as an array of 16 bit shorts...
Definition: EST_Wave.h:64
void convert_track(EST_Track &in_track, EST_Track &out_track, const EST_String &out_type, const EST_String &in_type="")
Definition: sigpr_utt.cc:473
static Func * creator(const char *name, bool report_error=false)
Return the creation function for the given window type.
Definition: EST_Window.cc:218
void delta(EST_Track &tr, EST_Track &d, int regression_length=3)
Definition: delta.cc:52
void energy(EST_Wave &sig, EST_Track &a, float factor)
Definition: sigpr_utt.cc:445
void power(EST_Wave &sig, EST_Track &a, float factor)
Definition: sigpr_utt.cc:422
void melcep(EST_Wave &sig, EST_Track &mfcc_track, float factor, int fbank_order, float liftering_parameter, EST_WindowFunc *wf=EST_Window::creator(DEFAULT_WINDOW_NAME), const bool include_c0=false, const bool up=false)
Definition: sigpr_utt.cc:540
void sig2coef(EST_Wave &sig, EST_Track &a, EST_String type, float factor=2.0, EST_WindowFunc *wf=EST_Window::creator(DEFAULT_WINDOW_NAME))
Definition: sigpr_utt.cc:399
void icda(EST_Wave &sig, EST_Track &fz, EST_Track &speech, EST_Option &op, EST_String method="")
void srpd(EST_Wave &sig, EST_Track &fz, EST_Features &options)
Definition: pda.cc:85
void default_pda_options(EST_Features &al)
Definition: pda.cc:229
void smooth_phrase(EST_Track &c, EST_Track &speech, EST_Features &options, EST_Track &sm)
Definition: smooth_pda.cc:54
void sigpr_delta(EST_Wave &sig, EST_Track &fv, EST_Features &op, const EST_StrList &slist)
Definition: sigpr_utt.cc:309
void EST_WindowFunc(int size, EST_TBuffer< float > &r_window, int window_centre)
Function which creates a window.
Definition: EST_Window.h:52
void pda(EST_Wave &sig, EST_Track &fz, EST_Features &op, EST_String method="")
Definition: pda.cc:51
void smooth_portion(EST_Track &c, EST_Option &op)
void sigpr_acc(EST_Wave &sig, EST_Track &fv, EST_Features &op, const EST_StrList &slist)
Definition: sigpr_utt.cc:302
#define DEFAULT_WINDOW_NAME
Definition: EST_sigpr_utt.h:43
void fbank(EST_Wave &sig, EST_Track &fbank, const float factor, EST_WindowFunc *wf=EST_Window::creator(DEFAULT_WINDOW_NAME), const bool up=false, const bool take_log=true)
Definition: sigpr_utt.cc:496
void sigpr_base(EST_Wave &sig, EST_Track &fv, EST_Features &op, const EST_StrList &slist)
Definition: sigpr_utt.cc:138