speech-tools/pda_8cc_source.html

 /*************************************************************************/
 /*                                                                       */
 /*                Centre for Speech Technology Research                  */
 /*                     University of Edinburgh, UK                       */
 /*                      Copyright (c) 1995,1996                          */
 /*                        All Rights Reserved.                           */
 /*                                                                       */
 /*  Permission is hereby granted, free of charge, to use and distribute  */
 /*  this software and its documentation without restriction, including   */
 /*  without limitation the rights to use, copy, modify, merge, publish,  */
 /*  distribute, sublicense, and/or sell copies of this work, and to      */
 /*  permit persons to whom this work is furnished to do so, subject to   */
 /*  the following conditions:                                            */
 /*   1. The code must retain the above copyright notice, this list of    */
 /*      conditions and the following disclaimer.                         */
 /*   2. Any modifications must be clearly marked as such.                */
 /*   3. Original authors' names are not deleted.                         */
 /*   4. The authors' names are not used to endorse or promote products   */
 /*      derived from this software without specific prior written        */
 /*      permission.                                                      */
 /*                                                                       */
 /*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        */
 /*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
 /*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
 /*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     */
 /*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
 /*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
 /*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
 /*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
 /*  THIS SOFTWARE.                                                       */
 /*                                                                       */
 /*************************************************************************/
 /*                   Author :  Paul Taylor                               */
 /*                   Date   :  April 1994                                */
 /*************************************************************************/

 #include "EST_speech_class.h"
 #include "sigpr/EST_sigpr_utt.h"
 #include "sigpr/EST_filter.h"
 #include "srpd.h"
 #include "EST_error.h"
 #include "EST_string_aux.h"

 // Segment reader implemented outside this file.  The main loop in srpd()
 // below calls it once per analysis frame and stops when it returns 0.
 int read_next_wave_segment (EST_Wave &sig, struct Srpd_Op *paras,
                 SEGMENT_ *p_seg);

 // File-local helpers, defined further down in this file.
 // srpd: run the tracker with an already populated Srpd_Op.
 static void srpd(EST_Wave &sig, EST_Track &fz, Srpd_Op &srpd_op, int resize);
 // default_srpd_op: fill *srpd with the DEFAULT_* values and return srpd.
 static struct Srpd_Op *default_srpd_op(struct Srpd_Op *srpd);
 // parse_srpd_list: override fields of *srpd from entries present in a_list.
 static void parse_srpd_list(EST_Features &a_list, struct Srpd_Op *srpd);

 /*
  * Pitch detection entry point.
  *
  *   sig     waveform to analyse
  *   fz      track that receives the pitch values
  *   op      feature list of options; also consulted for "pda_method" when
  *           no method name is passed in
  *   method  name of the algorithm; "" and "srpd" both select srpd(), any
  *           other name is reported through EST_error
  */
 void pda(EST_Wave &sig, EST_Track &fz, EST_Features &op, EST_String method)
 {
     // No explicit method: fall back to the "pda_method" option if it is set.
     if (method == "" && op.present("pda_method"))
         method = op.S("pda_method");

     // srpd is both the default and the only algorithm dispatched from here.
     const bool wants_srpd = (method == "") || (method == "srpd");

     if (wants_srpd)
         srpd(sig, fz, op);
     else
         EST_error("Unknown pda %s\n", (const char *)method);
 }

 /*
  * Intonation contour detection algorithm.
  *
  * Runs the selected pitch tracker into a temporary track and then passes
  * that raw contour, together with `speech` and the options, to
  * smooth_phrase(), which writes its result into `fz`.
  *
  *   sig     waveform to analyse
  *   fz      receives the output of smooth_phrase()
  *   speech  track handed through to smooth_phrase()
  *   op      feature list of options; also consulted for "pda_method" when
  *           no method name is passed in
  *   method  name of the algorithm; "" and "srpd" both select srpd(), any
  *           other name is reported through EST_error
  */
 void icda(EST_Wave &sig, EST_Track &fz, EST_Track &speech, EST_Features &op,
            EST_String method)
 {
     EST_Track unsmoothed;

     // No explicit method: fall back to the "pda_method" option if it is set.
     if (method == "" && op.present("pda_method"))
         method = op.S("pda_method");

     // srpd is both the default and the only algorithm dispatched from here.
     const bool wants_srpd = (method == "") || (method == "srpd");

     if (wants_srpd)
         srpd(sig, unsmoothed, op);
     else
         EST_error("Unknown pda %s\n", (const char *)method);

     smooth_phrase(unsmoothed, speech, op, fz);
 }

 /*
  * Feature-list front end to the SRPD pitch tracker.
  *
  * Builds an Srpd_Op from the built-in defaults overridden by `op`,
  * optionally low-pass filters the waveform, and then runs the tracker.
  *
  *   sig  waveform to analyse; passed by reference to FIRlowpass_filter()
  *        when "do_low_pass" is non-zero
  *   fz   track that receives the pitch values
  *   op   options; besides the tracker parameters, reads "do_low_pass"
  *        (default 0), "lpf_cutoff", "lpf_order" and "srpd_resize" (default 0)
  */
 void srpd(EST_Wave &sig, EST_Track &fz, EST_Features &op)
 {
     Srpd_Op params;

     // default_srpd_op() returns the pointer it was given, so the freshly
     // defaulted structure goes straight into the override pass.
     parse_srpd_list(op, default_srpd_op(&params));

     const bool low_pass_first = (op.I("do_low_pass", 0) != 0);
     if (low_pass_first)
     {
         FIRlowpass_filter(sig, op.I("lpf_cutoff"), op.I("lpf_order"));
     }

     const int resize_track = op.I("srpd_resize", 0);
     srpd(sig, fz, params, resize_track);
 }

 /*void do_srpd_fz(EST_Wave &sig, EST_Track &fz)
 {
     Srpd_Op srpd_op;
     default_srpd_op(&srpd_op);
     srpd(sig, fz, srpd_op, 1);
 }
 */

 /*
  * Run the SRPD pitch tracker over `sig` with an explicit parameter block,
  * writing one value per emitted frame into channel 0 of `fz`.
  *
  *   sig      waveform to analyse; its sample rate overwrites
  *            srpd_op.sample_freq
  *   fz       output track; must be equally spaced (checked below)
  *   srpd_op  tracker parameters; modified (at least sample_freq, and it is
  *            passed by address to the set-up routines)
  *   resize   when non-zero, fz is resized to the frame count computed from
  *            the signal length, given a single channel named "F0", and its
  *            times are filled from srpd_op.shift / 1000
  *
  * Frames whose status is not VOICED are additionally marked as breaks.
  *
  * NOTE(review): the frame index j is never compared with the length of fz
  * in this function; when resize is 0 the caller's track must already be
  * long enough.
  */
 void srpd(EST_Wave &sig, EST_Track &fz, Srpd_Op &srpd_op, int resize)
 {
     // i: scratch index, rns: segment reader return code,
     // tracklen: expected number of frames, j: next frame of fz to write.
     ssize_t i, rns, tracklen, j = 0;
     SEGMENT_ segment;
     CROSS_CORR_ cc;
     // pda_status: result for the current segment.
     // held_status: a result withheld from the previous iteration (see HOLD).
     STATUS_ pda_status, held_status;
     srpd_op.sample_freq = sig.sample_rate();
     /* Unused variables:
     float min, max;
     min = srpd_op.min_pitch; // must store as set up routines corrupt
     max = srpd_op.max_pitch;
     */

     initialise_structures (&srpd_op, &segment, &cc);
     initialise_status (&srpd_op, &pda_status);
     initialise_status (&srpd_op, &held_status);

     // Frame count from the signal length and the segment length/shift that
     // initialise_structures() filled in.
     tracklen = (sig.num_samples() - segment.length) / segment.shift + 1;

     if (resize)
     {
     fz.set_equal_space(true);
     fz.resize(tracklen, 1);
     fz.set_channel_name("F0", 0);
     // parse_srpd_list() stores the shift option multiplied by 1000, so it
     // is divided by 1000 again here for the track's frame spacing.
     fz.fill_time(srpd_op.shift/1000);
     }

     if (!fz.equal_space())
     EST_error("Pitch tracking algorithm must have equal spaced track\n");

     // One iteration per segment; a return code of 0 ends the analysis.
     while ((rns = read_next_wave_segment (sig, &srpd_op, &segment)) != 0)
     {
     if (rns == 2)
     {
         // Return code 2: skip the analysis, zero every cross-correlation
         // coefficient and reset the current status.
         // NOTE(review): the meaning of code 2 is defined by
         // read_next_wave_segment() - confirm there.
         for (i = 0; i < cc.size; cc.coeff[i++] = 0.0);
         initialise_status (&srpd_op, &pda_status);
     }
     else
         super_resolution_pda (&srpd_op, segment, &cc, &pda_status);
     if (pda_status.s_h == HOLD)
     {
         // The tracker asked for this frame to be withheld: keep a copy
         // marked VOICED/HELD and write nothing until the next segment.
         held_status.pitch_freq = pda_status.pitch_freq;
         held_status.v_uv = VOICED;
         held_status.s_h = HELD;
         held_status.cc_max = pda_status.cc_max;
         held_status.threshold = pda_status.threshold;
         continue;
     }
     if (held_status.s_h == HELD)
     {
         // A frame is pending.  If the current result is BREAK_NUMBER the
         // pending frame is downgraded to an unvoiced break; either way it
         // is written out (and marked SENT) before the current frame.
         if (pda_status.pitch_freq == BREAK_NUMBER)
         {
         held_status.pitch_freq = BREAK_NUMBER;
         held_status.v_uv = UNVOICED;
         }
         held_status.s_h = SENT;
         if (held_status.v_uv != VOICED)
         fz.set_break(j);
         fz.a(j++) = held_status.pitch_freq;
         //    printf( "track set:  %d (of %d) to %f\n", j-1, fz.length(), held_status.pitch_freq );
     }
     // Write the current frame, flagging it as a break unless voiced.
     if (pda_status.v_uv != VOICED)
         fz.set_break(j);
     fz.a(j++) = pda_status.pitch_freq;
     //printf( "track set:  %d (of %d) to %f\n", j-1, fz.length(), pda_status.pitch_freq );
     }
     // A frame still withheld when the input runs out is written as an
     // unvoiced break.
     if (held_status.s_h == HELD)
     {
     held_status.pitch_freq = BREAK_NUMBER;
     held_status.v_uv = UNVOICED;
     fz.set_break(j);
     fz.a(j++) = held_status.pitch_freq;
     }
     // Hand segment and cc back to the SRPD code for clean-up.
     end_structure_use (&segment, &cc);
 }

 /*
  * Fill *srpd with the compiled-in DEFAULT_* values and switch the
  * make_ascii and peak_tracking flags off.
  *
  * Returns the pointer it was given, so the call can be nested inside
  * another call that takes the structure.
  */
 static struct Srpd_Op *default_srpd_op(struct Srpd_Op *srpd)
 {
     struct Srpd_Op &op = *srpd;

     // Sampling and framing.
     op.sample_freq = DEFAULT_SF;
     op.L           = DEFAULT_DECIMATION;
     op.shift       = DEFAULT_SHIFT;
     op.length      = DEFAULT_LENGTH;

     // Pitch search range.
     op.min_pitch = DEFAULT_MIN_PITCH;
     op.max_pitch = DEFAULT_MAX_PITCH;

     // Decision thresholds.
     op.Tsilent    = DEFAULT_TSILENT;
     op.Tmin       = DEFAULT_TMIN;
     op.Tmax_ratio = DEFAULT_TMAX_RATIO;
     op.Thigh      = DEFAULT_THIGH;
     op.Tdh        = DEFAULT_TDH;

     // Optional behaviour, off by default.
     op.make_ascii    = 0;
     op.peak_tracking = 0;

     /* Nmax and Nmin cannot be initialised here */
     return srpd;
 }

 /*
  * Override fields of *srpd with the options present in `al`.
  * An option that is absent leaves the corresponding field untouched, so
  * this is intended to run after default_srpd_op().
  *
  *   option name               field           read as
  *   ------------------------  --------------  -------------------------
  *   decimation                L               int
  *   min_pitch                 min_pitch       float
  *   max_pitch                 max_pitch       float
  *   pda_frame_shift           shift           float, multiplied by 1000
  *   pda_frame_length          length          float, multiplied by 1000
  *   noise_floor               Tsilent         int
  *   v2uv_coef_thresh          Thigh           float
  *   min_v2uv_coef_thresh      Tmin            float
  *   v2uv_coef_thresh_ratio    Tmax_ratio      float
  *   anti_doubling_thresh      Tdh             float
  *   peak_tracking             peak_tracking   int
  *   sample_frequency          sample_freq     int
  */
 static void parse_srpd_list(EST_Features &al, struct Srpd_Op *srpd)
 {
     if (al.present("decimation"))
         srpd->L = al.I("decimation");
     if (al.present("min_pitch"))
         srpd->min_pitch = al.F("min_pitch");
     if (al.present("max_pitch"))
         srpd->max_pitch = al.F("max_pitch");

     // The help text in this file documents shift and length in seconds;
     // the Srpd_Op fields hold them multiplied by 1000.
     if (al.present("pda_frame_shift"))
         srpd->shift = al.F("pda_frame_shift") * 1000.0;
     if (al.present("pda_frame_length"))
         srpd->length = al.F("pda_frame_length") * 1000.0;

     if (al.present("noise_floor"))
         srpd->Tsilent = al.I("noise_floor");

     // The presence test must use the same key as the lookup, which is also
     // the key default_pda_options() sets.  It used to test for
     // "v2uv_coeff_thresh" (double f), so a value stored under the real key
     // was silently ignored.
     if (al.present("v2uv_coef_thresh"))
         srpd->Thigh = al.F("v2uv_coef_thresh");
     if (al.present("min_v2uv_coef_thresh"))
         srpd->Tmin = al.F("min_v2uv_coef_thresh");
     if (al.present("v2uv_coef_thresh_ratio"))
         srpd->Tmax_ratio = al.F("v2uv_coef_thresh_ratio");
     if (al.present("anti_doubling_thresh"))
         srpd->Tdh = al.F("anti_doubling_thresh");

     if (al.present("peak_tracking"))
         srpd->peak_tracking = al.I("peak_tracking");
     if (al.present("sample_frequency"))
         srpd->sample_freq = al.I("sample_frequency");
 }

 /*
  * Populate `al` with the default pitch detection options.
  *
  * Some values are stored as text and the rest come from the DEFAULT_*
  * macros.  The entries are set in a fixed order: pitch range and frame
  * shift, frame length, filter and file type settings, then the tracker
  * thresholds.
  */
 void default_pda_options(EST_Features &al)
 {
     struct text_option { const char *key; const char *value; };

     // Text-valued options set before the frame length ...
     static const text_option leading[] = {
         { "min_pitch",       "40.0"  },
         { "max_pitch",       "400.0" },
         { "pda_frame_shift", "0.005" }
     };
     // ... and the ones set after it.
     static const text_option trailing[] = {
         { "lpf_cutoff",   "600"  },
         { "lpf_order",    "49"   },
         { "f0_file_type", "esps" }
     };
     const int n_leading  = (int)(sizeof(leading) / sizeof(leading[0]));
     const int n_trailing = (int)(sizeof(trailing) / sizeof(trailing[0]));
     int k;

     for (k = 0; k < n_leading; ++k)
         al.set(leading[k].key, leading[k].value);

     // Stored as DEFAULT_LENGTH divided by 1000.
     al.set("pda_frame_length", DEFAULT_LENGTH / 1000.0);

     for (k = 0; k < n_trailing; ++k)
         al.set(trailing[k].key, trailing[k].value);

     // Tracker parameters taken straight from the compiled-in defaults.
     al.set("decimation", DEFAULT_DECIMATION);
     al.set("noise_floor", DEFAULT_TSILENT);
     al.set("min_v2uv_coef_thresh", DEFAULT_TMIN);
     al.set("v2uv_coef_thresh_ratio", DEFAULT_TMAX_RATIO);
     al.set("v2uv_coef_thresh", DEFAULT_THIGH);
     al.set("anti_doubling_thresh", DEFAULT_TDH);
     al.set("peak_tracking", 0);
 }

 /*
  * Build the help text for the general pitch detection command line
  * options (-L, -P, -fmin, -fmax, -shift, -length, -lpfilter, -forder).
  *
  * The fixed text is kept in named fragments; the default values quoted in
  * the message are formatted from the DEFAULT_* macros with ftoString().
  */
 EST_String options_pda_general(void)
 {
     const char *const filter_and_peak_flags =
         "-L  Perform low pass filtering on input. This option should always \n"
         "    be used in normal processing as it usually increases \n"
         "    performance considerably\n\n"
         "-P  perform peak tracking\n\n";

     const char *const fmin_intro =
         "-fmin <float> miniumum F0 value. Sets the minimum allowed F0 in \n"
         "    output track. Default is ";
     const char *const fmin_advice =
         ".\n "
         "    Changing this to suit the speaker usually increases  \n"
         "    performance. Typical recommended values are 60-90Hz for\n"
         "    males and 120-150Hz  for females\n\n";

     const char *const fmax_intro =
         "-fmax <float> maxiumum F0 value. Sets the maximum allowed F0 in \n"
         "    output track. Default is ";
     const char *const fmax_advice =
         ". \n"
         "    Changing this to suit the speaker usually increases \n"
         "    performance. Typical recommended values are 200Hz for \n"
         "    males and 300-400Hz for females\n\n";

     const char *const shift_intro =
         "-shift <float> frame spacing in seconds for fixed frame analysis. \n"
         "    This doesn't have to be the same as the output file spacing - \n"
         "    the -S option can be used to resample the track before saving \n"
         "    default: ";

     const char *const length_intro =
         "\n\n"
         "-length <float> analysis frame length in seconds.\n"
         "    default: ";

     const char *const filter_settings =
         "\n\n"
         "-lpfilter <int>   Low pass filter, with cutoff frequency in Hz \n"
         "    Filtering is performed by a FIR filter which is built at run \n"
         "    time. The order of the filter can be given by -forder. The \n"
         "    default value is 199\n\n"
         "-forder <int>  Order of FIR filter used for lpfilter and \n"
         "    hpfilter. This must be ODD. Sensible values range \n"
         "    from 19 (quick but with a shallow rolloff) to 199 \n"
         "    (slow but with a steep rolloff). The default is 199.\n\n";

     // Starting from an EST_String makes every '+' a string concatenation.
     return EST_String("")
         + filter_and_peak_flags
         + fmin_intro + ftoString(DEFAULT_MIN_PITCH) + fmin_advice
         + fmax_intro + ftoString(DEFAULT_MAX_PITCH) + fmax_advice
         + shift_intro + ftoString(DEFAULT_SHIFT/1000.0)
         + length_intro + ftoString(DEFAULT_LENGTH/1000.0)
         + filter_settings;
 }

 /*
  * Build the help text for the SRPD-specific command line options
  * (-d, -n, -H, -m, -R, -t).  The text is entirely fixed; each option's
  * description is held in its own fragment and the fragments are joined in
  * that order.
  */
 EST_String options_pda_srpd(void)
 {
     const char *const decimation_help =
         "-d <float> decimation factor\n"
         "    set down-sampling for quicker computation so that only one in \n"
         "    <parameter>decimation factor</parameter> samples are used in the first instance. \n"
         "    Must be in the range of one to ten inclusive. Default is four. \n"
         "    For data sampled at 10kHz, it is advised that a decimation \n"
         "    factor of two isselected.\n\n";

     const char *const noise_floor_help =
         "-n <float> Inoise floor.\n"
         "    Set the maximum absolute signal amplitude that represents  \n"
         "    silence to <parameter>Inoise floor</parameter>. If the absolute amplitude of \n"
         "    the first segment in a given frame is below this level at all \n"
         "    times, then the frame is classified as representing silence. \n"
         "    Must be a positive number. Default is 120 ADC units.\n\n";

     const char *const uv2v_threshold_help =
         "-H <float> unvoiced to voiced coeff threshold\n"
         "    set the correlation coefficient threshold which must be \n"
         "    exceeded in a transition from an unvoiced classified frame \n"
         "    of speech to a voiced frame as the unvoiced to voiced coeff \n"
         "    threshold. Must be in the range zero to one inclusive. \n"
         "    Default is 0.88.\n\n";

     const char *const min_v2uv_threshold_help =
         "-m <float> min voiced to unvoiced coeff threshold \n"
         "    set the minimum allowed correlation coefficient threshold \n"
         "    which must not be exceeded in a transition from a voiced \n"
         "    classified frame of speech to an unvoiced frame, as \n"
         "    <parameter>min voiced to unvoiced coeff threshold</parameter>. Must be in the \n"
         "    range zero to <parameter>unvoiced to voiced coeff threshold</parameter> \n"
         "    inclusive. Default is 0.75.\n\n";

     const char *const v2uv_ratio_help =
         "-R <float> voiced to unvoiced coeff threshold-ratio  \n"
         "    set the scaling factor used in determining the correlation\n"
         "    coefficient threshold which must not be exceeded in a voiced \n"
         "    frame to unvoiced frame transition, as <parameter>voiced to unvoiced</parameter> \n"
         "    coeff threshold -ratio. The voiced to unvoiced coefficient \n"
         "    threshold is determined by multiplying this scaling factor \n"
         "    with the maximum cross-correlation coefficient of the \n"
         "    previously voiced frame. If this product is less than \n"
         "    <parameter>min voiced to unvoiced coeff threshold</parameter> then this is used \n"
         "    instead. Must be in the range zero to one inclusive. \n"
         "     Default is 0.85.\n\n";

     const char *const anti_doubling_help =
         "-t <float> anti pitch doubling/halving threshold\n"
         "    set the threshold used in eliminating (as far as possible) \n"
         "    pitch doubling and pitch halving errors as <parameter>anti pitch \n"
         "    double/halving threshold</parameter>. Must be in the range zero to \n"
         "    one inclusive. Default is 0.77.\n\n";

     // Starting from an EST_String makes every '+' a string concatenation.
     return EST_String("")
         + decimation_help
         + noise_floor_help
         + uv2v_threshold_help
         + min_v2uv_threshold_help
         + v2uv_ratio_help
         + anti_doubling_help;
 }


EST_Wave
A class for storing digital waveforms. The waveform is stored as an array of 16 bit shorts...
Definition: EST_Wave.h:64

icda
void icda(EST_Wave &sig, EST_Track &fz, EST_Track &speech, EST_Features &op, EST_String method)
Definition: pda.cc:66

end_structure_use
void end_structure_use(SEGMENT_ *p_seg, CROSS_CORR_ *p_cc)
Definition: srpd1.3.cc:547

VOICED
#define VOICED
Definition: srpd.h:62

SEGMENT_
Definition: srpd.h:80

EST_speech_class.h

Srpd_Op
Definition: srpd.h:86

Srpd_Op::length
double length
Definition: srpd.h:89

STATUS_::threshold
double threshold
Definition: srpd.h:102

STATUS_::v_uv
char v_uv
Definition: srpd.h:101

EST_Track::set_channel_name
void set_channel_name(const EST_String &name, int channel)
set the name of the channel.
Definition: EST_Track.cc:168

EST_Track::set_break
void set_break(ssize_t i)
set frame i to be a break
Definition: EST_Track.cc:124

EST_Features
Definition: EST_Features.h:63

FIRlowpass_filter
void FIRlowpass_filter(EST_Wave &sigin, int freq, int order=DEFAULT_FILTER_ORDER)
Definition: filter.cc:526

Srpd_Op::L
int L
Definition: srpd.h:92

STATUS_
Definition: srpd.h:99

SEGMENT_::shift
int shift
Definition: srpd.h:81

UNVOICED
#define UNVOICED
Definition: srpd.h:61

Srpd_Op::Tmax_ratio
double Tmax_ratio
Definition: srpd.h:93

Srpd_Op::peak_tracking
int peak_tracking
Definition: srpd.h:96

initialise_status
void initialise_status(struct Srpd_Op *p, STATUS_ *p_status)
Definition: srpd1.3.cc:535

STATUS_::s_h
char s_h
Definition: srpd.h:101

DEFAULT_TDH
#define DEFAULT_TDH
Definition: srpd.h:59

EST_Wave::num_samples
ssize_t num_samples() const
return the number of samples in the waveform
Definition: EST_Wave.h:143

EST_Features::set
void set(const EST_String &name, int ival)
Definition: EST_Features.h:186

HOLD
#define HOLD
Definition: srpd.h:65

ssize_t
int ssize_t
Definition: EST_socket_win32.h:48

EST_Track::resize
void resize(ssize_t num_frames, int num_channels, bool preserve=1)
Definition: EST_Track.cc:214

STATUS_::cc_max
double cc_max
Definition: srpd.h:102

options_pda_general
EST_String options_pda_general(void)
Definition: pda.cc:247

Srpd_Op::sample_freq
int sample_freq
Definition: srpd.h:87

EST_filter.h

options_pda_srpd
EST_String options_pda_srpd(void)
Definition: pda.cc:282

ftoString
EST_String ftoString(float n, int pres=3, int width=0, int l=0)
Make a EST_String object from an float, with variable precision.
Definition: util_io.cc:149

super_resolution_pda
void super_resolution_pda(struct Srpd_Op *paras, SEGMENT_ seg, CROSS_CORR_ *p_cc, STATUS_ *p_status)
Definition: srpd1.3.cc:84

EST_Features::S
const EST_String S(const EST_String &path) const
Definition: EST_Features.h:158

DEFAULT_MAX_PITCH
#define DEFAULT_MAX_PITCH
Definition: srpd.h:50

srpd
void srpd(EST_Wave &sig, EST_Track &fz, EST_Features &op)
Definition: pda.cc:85

DEFAULT_TSILENT
#define DEFAULT_TSILENT
Definition: srpd.h:55

EST_sigpr_utt.h

default_pda_options
void default_pda_options(EST_Features &al)
Definition: pda.cc:229

EST_Track::equal_space
bool equal_space() const
return true if track has equal (i.e. fixed) frame spacing
Definition: EST_Track.h:670

EST_Track::a
float & a(ssize_t i, int c=0)
Definition: EST_Track.cc:1025

Srpd_Op::shift
double shift
Definition: srpd.h:89

Srpd_Op::Tdh
double Tdh
Definition: srpd.h:93

Srpd_Op::make_ascii
int make_ascii
Definition: srpd.h:95

smooth_phrase
void smooth_phrase(EST_Track &c, EST_Track &speech, EST_Features &options, EST_Track &sm)
Definition: smooth_pda.cc:54

CROSS_CORR_::coeff
double * coeff
Definition: srpd.h:76

srpd.h

Srpd_Op::min_pitch
double min_pitch
Definition: srpd.h:90

DEFAULT_TMIN
#define DEFAULT_TMIN
Definition: srpd.h:56

DEFAULT_THIGH
#define DEFAULT_THIGH
Definition: srpd.h:58

DEFAULT_MIN_PITCH
#define DEFAULT_MIN_PITCH
Definition: srpd.h:49

DEFAULT_LENGTH
#define DEFAULT_LENGTH
Definition: srpd.h:54

EST_Features::present
int present(const EST_String &name) const
Definition: EST_Features.cc:147

EST_error
#define EST_error
Definition: EST_error.h:104

DEFAULT_TMAX_RATIO
#define DEFAULT_TMAX_RATIO
Definition: srpd.h:57

pda
void pda(EST_Wave &sig, EST_Track &fz, EST_Features &op, EST_String method)
Definition: pda.cc:51

STATUS_::pitch_freq
double pitch_freq
Definition: srpd.h:100

DEFAULT_SHIFT
#define DEFAULT_SHIFT
Definition: srpd.h:53

EST_Wave::sample_rate
int sample_rate() const
return the sampling rate (frequency)
Definition: EST_Wave.h:147

CROSS_CORR_
Definition: srpd.h:74

EST_error.h

Srpd_Op::Tsilent
int Tsilent
Definition: srpd.h:94

SEGMENT_::length
int length
Definition: srpd.h:81

DEFAULT_SF
#define DEFAULT_SF
Definition: srpd.h:52

DEFAULT_DECIMATION
#define DEFAULT_DECIMATION
Definition: srpd.h:48

Srpd_Op::Tmin
double Tmin
Definition: srpd.h:93

HELD
#define HELD
Definition: srpd.h:66

SENT
#define SENT
Definition: srpd.h:68

read_next_wave_segment
int read_next_wave_segment(EST_Wave &sig, struct Srpd_Op *paras, SEGMENT_ *p_seg)
Definition: srpd1.3.cc:627

Srpd_Op::Thigh
double Thigh
Definition: srpd.h:93

EST_Features::I
int I(const EST_String &path) const
Definition: EST_Features.h:147

CROSS_CORR_::size
int size
Definition: srpd.h:75

Srpd_Op::max_pitch
double max_pitch
Definition: srpd.h:91

EST_String
EST_String
Definition: EST_features_aux.cc:50

initialise_structures
void initialise_structures(struct Srpd_Op *p, SEGMENT_ *p_seg, CROSS_CORR_ *p_cc)
Definition: srpd1.3.cc:515

BREAK_NUMBER
#define BREAK_NUMBER
Definition: srpd.h:46

EST_Track::set_equal_space
void set_equal_space(bool t)
Definition: EST_Track.h:675

EST_Track
Definition: EST_Track.h:90

EST_Track::fill_time
void fill_time(float t, int start=1)
Definition: EST_Track.cc:789

EST_Features::F
float F(const EST_String &path) const
Definition: EST_Features.h:136

EST_string_aux.h
Utility EST_String Functions header file.

EST_String
Definition: EST_String.h:76