Edinburgh Speech Tools  2.1-release
pda.cc
Go to the documentation of this file.
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1995,1996 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Paul Taylor */
34 /* Date : April 1994 */
35 /*************************************************************************/
36 
37 #include "EST_speech_class.h"
38 #include "sigpr/EST_sigpr_utt.h"
39 #include "sigpr/EST_filter.h"
40 #include "srpd.h"
41 #include "EST_error.h"
42 #include "EST_string_aux.h"
43 
44 int read_next_wave_segment (EST_Wave &sig, struct Srpd_Op *paras,
45  SEGMENT_ *p_seg);
46 
47 static void srpd(EST_Wave &sig, EST_Track &fz, Srpd_Op &srpd_op, int resize);
48 static struct Srpd_Op *default_srpd_op(struct Srpd_Op *srpd);
49 static void parse_srpd_list(EST_Features &a_list, struct Srpd_Op *srpd);
50 
51 void pda(EST_Wave &sig, EST_Track &fz, EST_Features &op, EST_String method)
52 {
53  if (method == "")
54  {
55  if (op.present("pda_method"))
56  method = op.S("pda_method");
57  }
58  if (method == "")
59  srpd(sig, fz, op);
60  else if (method == "srpd")
61  srpd(sig, fz, op);
62  else
63  EST_error("Unknown pda %s\n", (const char *)method);
64 }
65 
66 void icda(EST_Wave &sig, EST_Track &fz, EST_Track &speech, EST_Features &op,
67  EST_String method)
68 { // intonation contour detection algorithm
69  EST_Track raw_fz;
70  if (method == "")
71  {
72  if (op.present("pda_method"))
73  method = op.S("pda_method");
74  }
75  if (method == "")
76  srpd(sig, raw_fz, op);
77  else if (method == "srpd")
78  srpd(sig, raw_fz, op);
79  else
80  EST_error("Unknown pda %s\n", (const char *)method);
81 
82  smooth_phrase(raw_fz, speech, op, fz);
83 }
84 
85 void srpd(EST_Wave &sig, EST_Track &fz, EST_Features &op)
86 {
87  Srpd_Op srpd_op;
88 
89  default_srpd_op(&srpd_op); // default values
90  parse_srpd_list(op, &srpd_op); // override with options
91 
92  if (op.I("do_low_pass",0))
93  FIRlowpass_filter(sig, op.I("lpf_cutoff"),op.I("lpf_order"));
94 
95  srpd(sig, fz, srpd_op, op.I("srpd_resize", 0));
96 }
97 
98 /*void do_srpd_fz(EST_Wave &sig, EST_Track &fz)
99 {
100  Srpd_Op srpd_op;
101  default_srpd_op(&srpd_op);
102  srpd(sig, fz, srpd_op, 1);
103 }
104 */
105 
106 void srpd(EST_Wave &sig, EST_Track &fz, Srpd_Op &srpd_op, int resize)
107 {
108  ssize_t i, rns, tracklen, j = 0;
109  SEGMENT_ segment;
110  CROSS_CORR_ cc;
111  STATUS_ pda_status, held_status;
112  srpd_op.sample_freq = sig.sample_rate();
113  /* Unused variables:
114  float min, max;
115  min = srpd_op.min_pitch; // must store as set up routines corrupt
116  max = srpd_op.max_pitch;
117  */
118 
119  initialise_structures (&srpd_op, &segment, &cc);
120  initialise_status (&srpd_op, &pda_status);
121  initialise_status (&srpd_op, &held_status);
122 
123  tracklen = (sig.num_samples() - segment.length) / segment.shift + 1;
124 
125  if (resize)
126  {
127  fz.set_equal_space(true);
128  fz.resize(tracklen, 1);
129  fz.set_channel_name("F0", 0);
130  fz.fill_time(srpd_op.shift/1000);
131  }
132 
133  if (!fz.equal_space())
134  EST_error("Pitch tracking algorithm must have equal spaced track\n");
135 
136  while ((rns = read_next_wave_segment (sig, &srpd_op, &segment)) != 0)
137  {
138  if (rns == 2)
139  {
140  for (i = 0; i < cc.size; cc.coeff[i++] = 0.0);
141  initialise_status (&srpd_op, &pda_status);
142  }
143  else
144  super_resolution_pda (&srpd_op, segment, &cc, &pda_status);
145  if (pda_status.s_h == HOLD)
146  {
147  held_status.pitch_freq = pda_status.pitch_freq;
148  held_status.v_uv = VOICED;
149  held_status.s_h = HELD;
150  held_status.cc_max = pda_status.cc_max;
151  held_status.threshold = pda_status.threshold;
152  continue;
153  }
154  if (held_status.s_h == HELD)
155  {
156  if (pda_status.pitch_freq == BREAK_NUMBER)
157  {
158  held_status.pitch_freq = BREAK_NUMBER;
159  held_status.v_uv = UNVOICED;
160  }
161  held_status.s_h = SENT;
162  if (held_status.v_uv != VOICED)
163  fz.set_break(j);
164  fz.a(j++) = held_status.pitch_freq;
165  // printf( "track set: %d (of %d) to %f\n", j-1, fz.length(), held_status.pitch_freq );
166  }
167  if (pda_status.v_uv != VOICED)
168  fz.set_break(j);
169  fz.a(j++) = pda_status.pitch_freq;
170  //printf( "track set: %d (of %d) to %f\n", j-1, fz.length(), pda_status.pitch_freq );
171  }
172  if (held_status.s_h == HELD)
173  {
174  held_status.pitch_freq = BREAK_NUMBER;
175  held_status.v_uv = UNVOICED;
176  fz.set_break(j);
177  fz.a(j++) = held_status.pitch_freq;
178  }
179  end_structure_use (&segment, &cc);
180 }
181 
182 static struct Srpd_Op *default_srpd_op(struct Srpd_Op *srpd)
183 {
184  srpd->L = DEFAULT_DECIMATION;
187  srpd->shift = DEFAULT_SHIFT;
188  srpd->length = DEFAULT_LENGTH;
189  srpd->Tsilent = DEFAULT_TSILENT;
190  srpd->Tmin = DEFAULT_TMIN;
192  srpd->Thigh = DEFAULT_THIGH;
193  srpd->Tdh = DEFAULT_TDH;
194  srpd->make_ascii = 0;
195  srpd->peak_tracking = 0;
196  srpd->sample_freq = DEFAULT_SF;
197  /* p_par->Nmax and p_par->Nmin cannot be initialised */
198  return(srpd);
199 }
200 
201 static void parse_srpd_list(EST_Features &al, struct Srpd_Op *srpd)
202 {
203  if (al.present("decimation"))
204  srpd->L = al.I("decimation");
205  if (al.present("min_pitch"))
206  srpd->min_pitch = al.F("min_pitch");
207  if (al.present("max_pitch"))
208  srpd->max_pitch = al.F("max_pitch");
209  if (al.present("pda_frame_shift"))
210  srpd->shift = al.F("pda_frame_shift") * 1000.0;
211  if (al.present("pda_frame_length"))
212  srpd->length = al.F("pda_frame_length") * 1000.0;
213  if (al.present("noise_floor"))
214  srpd->Tsilent = al.I("noise_floor");
215  if (al.present("v2uv_coeff_thresh"))
216  srpd->Thigh = al.F("v2uv_coef_thresh");
217  if (al.present("min_v2uv_coef_thresh"))
218  srpd->Tmin = al.F("min_v2uv_coef_thresh");
219  if (al.present("v2uv_coef_thresh_ratio"))
220  srpd->Tmax_ratio = al.F("v2uv_coef_thresh_ratio");
221  if (al.present("anti_doubling_thresh"))
222  srpd->Tdh = al.F("anti_doubling_thresh");
223  if (al.present("peak_tracking"))
224  srpd->peak_tracking = al.I("peak_tracking");
225  if (al.present("sample_frequency"))
226  srpd->sample_freq = al.I("sample_frequency");
227 }
228 
230 {
231  al.set("min_pitch", "40.0");
232  al.set("max_pitch", "400.0");
233  al.set("pda_frame_shift", "0.005");
234  al.set("pda_frame_length", DEFAULT_LENGTH / 1000.0);
235  al.set("lpf_cutoff", "600");
236  al.set("lpf_order", "49");
237  al.set("f0_file_type", "esps");
238  al.set("decimation", DEFAULT_DECIMATION);
239  al.set("noise_floor", DEFAULT_TSILENT);
240  al.set("min_v2uv_coef_thresh", DEFAULT_TMIN);
241  al.set("v2uv_coef_thresh_ratio", DEFAULT_TMAX_RATIO);
242  al.set("v2uv_coef_thresh", DEFAULT_THIGH);
243  al.set("anti_doubling_thresh", DEFAULT_TDH);
244  al.set("peak_tracking", 0);
245 }
246 
248 {
249  // The standard waveform input options
250  return
251  EST_String("")+
252  "-L Perform low pass filtering on input. This option should always \n"
253  " be used in normal processing as it usually increases \n"
254  " performance considerably\n\n"
255  "-P perform peak tracking\n\n"
256  "-fmin <float> miniumum F0 value. Sets the minimum allowed F0 in \n"
257  " output track. Default is "+ftoString(DEFAULT_MIN_PITCH)+".\n "
258  " Changing this to suit the speaker usually increases \n"
259  " performance. Typical recommended values are 60-90Hz for\n"
260  " males and 120-150Hz for females\n\n"
261  "-fmax <float> maxiumum F0 value. Sets the maximum allowed F0 in \n"
262  " output track. Default is "+ftoString(DEFAULT_MAX_PITCH)+". \n"
263  " Changing this to suit the speaker usually increases \n"
264  " performance. Typical recommended values are 200Hz for \n"
265  " males and 300-400Hz for females\n\n"
266  "-shift <float> frame spacing in seconds for fixed frame analysis. \n"
267  " This doesn't have to be the same as the output file spacing - \n"
268  " the -S option can be used to resample the track before saving \n"
269  " default: "+ftoString(DEFAULT_SHIFT/1000.0) +"\n\n"
270  "-length <float> analysis frame length in seconds.\n"
271  " default: "+ftoString(DEFAULT_LENGTH/1000.0) +"\n\n"
272  "-lpfilter <int> Low pass filter, with cutoff frequency in Hz \n"
273  " Filtering is performed by a FIR filter which is built at run \n"
274  " time. The order of the filter can be given by -forder. The \n"
275  " default value is 199\n\n"
276  "-forder <int> Order of FIR filter used for lpfilter and \n"
277  " hpfilter. This must be ODD. Sensible values range \n"
278  " from 19 (quick but with a shallow rolloff) to 199 \n"
279  " (slow but with a steep rolloff). The default is 199.\n\n";
280 }
281 
283 {
284  // The standard waveform input options
285  return
286  EST_String("")+
287  "-d <float> decimation factor\n"
288  " set down-sampling for quicker computation so that only one in \n"
289  " <parameter>decimation factor</parameter> samples are used in the first instance. \n"
290  " Must be in the range of one to ten inclusive. Default is four. \n"
291  " For data sampled at 10kHz, it is advised that a decimation \n"
292  " factor of two isselected.\n\n"
293 
294  "-n <float> Inoise floor.\n"
295  " Set the maximum absolute signal amplitude that represents \n"
296  " silence to <parameter>Inoise floor</parameter>. If the absolute amplitude of \n"
297  " the first segment in a given frame is below this level at all \n"
298  " times, then the frame is classified as representing silence. \n"
299  " Must be a positive number. Default is 120 ADC units.\n\n"
300 
301  "-H <float> unvoiced to voiced coeff threshold\n"
302  " set the correlation coefficient threshold which must be \n"
303  " exceeded in a transition from an unvoiced classified frame \n"
304  " of speech to a voiced frame as the unvoiced to voiced coeff \n"
305  " threshold. Must be in the range zero to one inclusive. \n"
306  " Default is 0.88.\n\n"
307 
308  "-m <float> min voiced to unvoiced coeff threshold \n"
309  " set the minimum allowed correlation coefficient threshold \n"
310  " which must not be exceeded in a transition from a voiced \n"
311  " classified frame of speech to an unvoiced frame, as \n"
312  " <parameter>min voiced to unvoiced coeff threshold</parameter>. Must be in the \n"
313  " range zero to <parameter>unvoiced to voiced coeff threshold</parameter> \n"
314  " inclusive. Default is 0.75.\n\n"
315 
316  "-R <float> voiced to unvoiced coeff threshold-ratio \n"
317  " set the scaling factor used in determining the correlation\n"
318  " coefficient threshold which must not be exceeded in a voiced \n"
319  " frame to unvoiced frame transition, as <parameter>voiced to unvoiced</parameter> \n"
320  " coeff threshold -ratio. The voiced to unvoiced coefficient \n"
321  " threshold is determined by multiplying this scaling factor \n"
322  " with the maximum cross-correlation coefficient of the \n"
323  " previously voiced frame. If this product is less than \n"
324  " <parameter>min voiced to unvoiced coeff threshold</parameter> then this is used \n"
325  " instead. Must be in the range zero to one inclusive. \n"
326  " Default is 0.85.\n\n"
327 
328  "-t <float> anti pitch doubling/halving threshold\n"
329  " set the threshold used in eliminating (as far as possible) \n"
330  " pitch doubling and pitch halving errors as <parameter>anti pitch \n"
331  " double/halving threshold</parameter>. Must be in the range zero to \n"
332  " one inclusive. Default is 0.77.\n\n";
333 }
334 
335 
A class for storing digital waveforms. The waveform is stored as an array of 16 bit shorts...
Definition: EST_Wave.h:64
void icda(EST_Wave &sig, EST_Track &fz, EST_Track &speech, EST_Features &op, EST_String method)
Definition: pda.cc:66
void end_structure_use(SEGMENT_ *p_seg, CROSS_CORR_ *p_cc)
Definition: srpd1.3.cc:547
#define VOICED
Definition: srpd.h:62
Definition: srpd.h:80
Definition: srpd.h:86
double length
Definition: srpd.h:89
double threshold
Definition: srpd.h:102
char v_uv
Definition: srpd.h:101
void set_channel_name(const EST_String &name, int channel)
set the name of the channel.
Definition: EST_Track.cc:168
void set_break(ssize_t i)
set frame i to be a break
Definition: EST_Track.cc:124
void FIRlowpass_filter(EST_Wave &sigin, int freq, int order=DEFAULT_FILTER_ORDER)
Definition: filter.cc:526
int L
Definition: srpd.h:92
Definition: srpd.h:99
int shift
Definition: srpd.h:81
#define UNVOICED
Definition: srpd.h:61
double Tmax_ratio
Definition: srpd.h:93
int peak_tracking
Definition: srpd.h:96
void initialise_status(struct Srpd_Op *p, STATUS_ *p_status)
Definition: srpd1.3.cc:535
char s_h
Definition: srpd.h:101
#define DEFAULT_TDH
Definition: srpd.h:59
ssize_t num_samples() const
return the number of samples in the waveform
Definition: EST_Wave.h:143
void set(const EST_String &name, int ival)
Definition: EST_Features.h:186
#define HOLD
Definition: srpd.h:65
int ssize_t
void resize(ssize_t num_frames, int num_channels, bool preserve=1)
Definition: EST_Track.cc:214
double cc_max
Definition: srpd.h:102
EST_String options_pda_general(void)
Definition: pda.cc:247
int sample_freq
Definition: srpd.h:87
EST_String options_pda_srpd(void)
Definition: pda.cc:282
EST_String ftoString(float n, int pres=3, int width=0, int l=0)
Make a EST_String object from an float, with variable precision.
Definition: util_io.cc:149
void super_resolution_pda(struct Srpd_Op *paras, SEGMENT_ seg, CROSS_CORR_ *p_cc, STATUS_ *p_status)
Definition: srpd1.3.cc:84
const EST_String S(const EST_String &path) const
Definition: EST_Features.h:158
#define DEFAULT_MAX_PITCH
Definition: srpd.h:50
void srpd(EST_Wave &sig, EST_Track &fz, EST_Features &op)
Definition: pda.cc:85
#define DEFAULT_TSILENT
Definition: srpd.h:55
void default_pda_options(EST_Features &al)
Definition: pda.cc:229
bool equal_space() const
return true if track has equal (i.e. fixed) frame spacing */
Definition: EST_Track.h:670
float & a(ssize_t i, int c=0)
Definition: EST_Track.cc:1025
double shift
Definition: srpd.h:89
double Tdh
Definition: srpd.h:93
int make_ascii
Definition: srpd.h:95
void smooth_phrase(EST_Track &c, EST_Track &speech, EST_Features &options, EST_Track &sm)
Definition: smooth_pda.cc:54
double * coeff
Definition: srpd.h:76
double min_pitch
Definition: srpd.h:90
#define DEFAULT_TMIN
Definition: srpd.h:56
#define DEFAULT_THIGH
Definition: srpd.h:58
#define DEFAULT_MIN_PITCH
Definition: srpd.h:49
#define DEFAULT_LENGTH
Definition: srpd.h:54
int present(const EST_String &name) const
#define EST_error
Definition: EST_error.h:104
#define DEFAULT_TMAX_RATIO
Definition: srpd.h:57
void pda(EST_Wave &sig, EST_Track &fz, EST_Features &op, EST_String method)
Definition: pda.cc:51
double pitch_freq
Definition: srpd.h:100
#define DEFAULT_SHIFT
Definition: srpd.h:53
int sample_rate() const
return the sampling rate (frequency)
Definition: EST_Wave.h:147
int Tsilent
Definition: srpd.h:94
int length
Definition: srpd.h:81
#define DEFAULT_SF
Definition: srpd.h:52
#define DEFAULT_DECIMATION
Definition: srpd.h:48
double Tmin
Definition: srpd.h:93
#define HELD
Definition: srpd.h:66
#define SENT
Definition: srpd.h:68
int read_next_wave_segment(EST_Wave &sig, struct Srpd_Op *paras, SEGMENT_ *p_seg)
Definition: srpd1.3.cc:627
double Thigh
Definition: srpd.h:93
int I(const EST_String &path) const
Definition: EST_Features.h:147
int size
Definition: srpd.h:75
double max_pitch
Definition: srpd.h:91
EST_String
void initialise_structures(struct Srpd_Op *p, SEGMENT_ *p_seg, CROSS_CORR_ *p_cc)
Definition: srpd1.3.cc:515
#define BREAK_NUMBER
Definition: srpd.h:46
void set_equal_space(bool t)
Definition: EST_Track.h:675
void fill_time(float t, int start=1)
Definition: EST_Track.cc:789
float F(const EST_String &path) const
Definition: EST_Features.h:136
Utility EST_String Functions header file.