| File: | modules/clustergen/me_mlsa.cc |
| Location: | line 934, column 13 |
| Description: | Value stored to 'postfilter_size' is never read |
| 1 | /** |
| 2 | * The HMM-Based Speech Synthesis System (HTS) |
| 3 | * HTS Working Group |
| 4 | * |
| 5 | * Department of Computer Science |
| 6 | * Nagoya Institute of Technology |
| 7 | * and |
| 8 | * Interdisciplinary Graduate School of Science and Engineering |
| 9 | * Tokyo Institute of Technology |
| 10 | * |
| 11 | * Portions Copyright (c) 2001-2006 |
| 12 | * All Rights Reserved. |
| 13 | * |
| 14 | * Portions Copyright 2000-2007 DFKI GmbH. |
| 15 | * All Rights Reserved. |
| 16 | * |
| 17 | * Permission is hereby granted, free of charge, to use and |
| 18 | * distribute this software and its documentation without |
| 19 | * restriction, including without limitation the rights to use, |
| 20 | * copy, modify, merge, publish, distribute, sublicense, and/or |
| 21 | * sell copies of this work, and to permit persons to whom this |
| 22 | * work is furnished to do so, subject to the following conditions: |
| 23 | * |
| 24 | * 1. The source code must retain the above copyright notice, |
| 25 | * this list of conditions and the following disclaimer. |
| 26 | * |
| 27 | * 2. Any modifications to the source code must be clearly |
| 28 | * marked as such. |
| 29 | * |
| 30 | * 3. Redistributions in binary form must reproduce the above |
| 31 | * copyright notice, this list of conditions and the |
| 32 | * following disclaimer in the documentation and/or other |
| 33 | * materials provided with the distribution. Otherwise, one |
| 34 | * must contact the HTS working group. |
| 35 | * |
| 36 | * NAGOYA INSTITUTE OF TECHNOLOGY, TOKYO INSTITUTE OF TECHNOLOGY, |
| 37 | * HTS WORKING GROUP, AND THE CONTRIBUTORS TO THIS WORK DISCLAIM |
| 38 | * ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL |
| 39 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT |
| 40 | * SHALL NAGOYA INSTITUTE OF TECHNOLOGY, TOKYO INSTITUTE OF |
| 41 | * TECHNOLOGY, HTS WORKING GROUP, NOR THE CONTRIBUTORS BE LIABLE |
| 42 | * FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY |
| 43 | * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, |
| 44 | * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTUOUS |
| 45 | * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR |
| 46 | * PERFORMANCE OF THIS SOFTWARE. |
| 47 | * |
| 48 | * |
| 49 | * This software was translated to C for use within Festival to offer |
| 50 | * multi-excitation MLSA |
| 51 | * Alan W Black (awb@cs.cmu.edu) 3rd April 2009 |
| 52 | * |
| 53 | */ |
| 54 | |
| 55 | #include <stdio.h> |
| 56 | #include <stdlib.h> |
| 57 | #include <string.h> |
| 58 | #include <math.h> |
| 59 | #include <EST_walloc.h> |
| 60 | #include "festival.h" |
| 61 | |
| 62 | #include "mlsa_resynthesis.h" |
| 63 | |
| 64 | /** |
| 65 | * Synthesis of speech out of speech parameters. |
| 66 | * Mixed excitation MLSA vocoder. |
| 67 | * |
| 68 | * Java port and extension of HTS engine version 2.0 |
| 69 | * Extension: mixed excitation |
| 70 | * @author Marcela Charfuelan |
| 71 | * And ported to C by Alan W Black (awb@cs.cmu.edu) |
| 72 | */ |
| 73 | |
/* Integer stand-in for a boolean type, together with its two values. */
#define booleanint int
#define true1 1
#define false0 0

/**
 * HTSData: the settings a caller fills in before running the vocoder.
 * initVocoder() copies stage, LogGain, fperiod and rate out of this struct.
 */
typedef struct HTSData_struct {

    int rate;
    int fperiod;
    double rhos;

    int stage;
    double alpha;
    double beta;
    booleanint useLogGain;
    double uf;

    /* feature switches */
    booleanint algnst;        /* use state level alignment for duration */
    booleanint algnph;        /* use phoneme level alignment for duration */
    booleanint useMixExc;     /* use Mixed Excitation */
    booleanint useFourierMag; /* use Fourier magnitudes for pulse generation */
    booleanint useGV;         /* use global variance in parameter generation */
    booleanint useGmmGV;      /* use global variance as a Gaussian Mixture Model */
    booleanint useUnitDurationContinuousFeature; /* external duration is supplied, so it is not generated from HMMs */
    booleanint useUnitLogF0ContinuousFeature;    /* external f0 is supplied, so it is not generated from HMMs */

    /* Variables controlling speech generation in the vocoder.  They have
     * default values but can be fixed and read from the audio effects
     * component. [Default][min--max] */
    double length;        /* total number of frames for generated speech;
                             length of generated speech (in seconds) [N/A][0.0--30.0] */
    double durationScale; /* less than 1.0 is faster and more than 1.0 is slower, min=0.1 max=3.0 */

    booleanint LogGain;
    char *PdfStrFile;
    char *PdfMagFile;

    /* mixed-excitation filter bank: MixFilters[NumFilters][OrderFilters] */
    int NumFilters;
    int OrderFilters;
    double **MixFilters;
    double F0Std;
    double F0Mean;

} HTSData;
| 114 | |
| 115 | #if 0 /* Disabled, never compiled: Java-style listing of HTSData with default initialisers, kept only as a record of those defaults. */ |
| 116 | typedef struct HTSData_struct { |
| 117 | |
| 118 | int rate = 16000; |
| 119 | int fperiod = 80; |
| 120 | double rhos = 0.0; |
| 121 | |
| 122 | int stage = 0; |
| 123 | double alpha = 0.42; |
| 124 | booleanint useLogGain = false0; |
| 125 | double uf = 0.5; |
| 126 | booleanint algnst = false0; /* use state level alignment for duration */ |
| 127 | booleanint algnph = false0; /* use phoneme level alignment for duration */ |
| 128 | booleanint useMixExc = true1; /* use Mixed Excitation */ |
| 129 | booleanint useFourierMag = false0; /* use Fourier magnitudes for pulse generation */ |
| 130 | booleanint useGV = false0; /* use global variance in parameter generation */ |
| 131 | booleanint useGmmGV = false0; /* use global variance as a Gaussian Mixture Model */ |
| 132 | booleanint useUnitDurationContinuousFeature = false0; /* for using external duration, so it will not be generated from HMMs */ |
| 133 | booleanint useUnitLogF0ContinuousFeature = false0; /* for using external f0, so it will not be generated from HMMs */ |
| 134 | |
| 135 | /** variables for controlling generation of speech in the vocoder |
| 136 | * these variables have default values but can be fixed and read from the |
| 137 | * audio effects component. [Default][min--max] */ |
| 138 | double f0Std = 1.0; /* variable for f0 control, multiply f0 [1.0][0.0--5.0] */ |
| 139 | double f0Mean = 0.0; /* variable for f0 control, add f0 [0.0][0.0--100.0] */ |
| 140 | double length = 0.0; /* total number of frames for generated speech */ |
| 141 | /* length of generated speech (in seconds) [N/A][0.0--30.0] */ |
| 142 | double durationScale = 1.0; /* less than 1.0 is faster and more than 1.0 is slower, min=0.1 max=3.0 */ |
| 143 | |
| 144 | } HTSData; |
| 145 | #endif |
| 146 | |
| 147 | static int IPERIOD = 1; |
| 148 | static booleanint GAUSS = true1; |
| 149 | static int PADEORDER = 5; /* pade order for MLSA filter */ |
| 150 | static int IRLENG = 96; /* length of impulse response */ |
| 151 | |
| 152 | /* for MGLSA filter (mel-generalised log spectrum approximation filter) */ |
| 153 | static booleanint NORMFLG1 = true1; |
| 154 | static booleanint NORMFLG2 = false0; |
| 155 | static booleanint MULGFLG1 = true1; |
| 156 | static booleanint MULGFLG2 = false0; |
| 157 | static booleanint NGAIN = false0; |
| 158 | |
| 159 | static double ZERO = 1.0e-10; /* ~(0) */ |
| 160 | static double LZERO = (-1.0e+10); /* ~log(0) */ |
| 161 | |
| 162 | static int stage; /* Gamma=-1/stage : if stage=0 then Gamma=0 */ |
| 163 | static double xgamma; /* Gamma */ |
| 164 | static booleanint use_log_gain; /* log gain flag (for LSP) */ |
| 165 | static int fprd; /* frame shift */ |
| 166 | static int iprd; /* interpolation period */ |
| 167 | static booleanint gauss; /* flag to use Gaussian noise */ |
| 168 | static double p1; /* used in excitation generation */ |
| 169 | static double pc; /* used in excitation generation */ |
| 170 | static double *pade; /* used in mlsadf */ |
| 171 | static int ppade; /* offset for vector ppade */ |
| 172 | |
| 173 | static double *C; /* used in the MLSA/MGLSA filter */ |
| 174 | static double *CC; /* used in the MLSA/MGLSA filter */ |
| 175 | static double *CINC; /* used in the MLSA/MGLSA filter */ |
| 176 | static double *D1; /* used in the MLSA/MGLSA filter */ |
| 177 | static int CINC_length, CC_length, C_length, D1_length; |
| 178 | |
| 179 | static double rate; |
| 180 | static int pt1; /* used in mlsadf1 */ |
| 181 | static int pt2; /* used in mlsadf2 */ |
| 182 | static int *pt3; /* used in mlsadf2 */ |
| 183 | |
| 184 | /* mixed excitation variables */ |
| 185 | static int numM; /* Number of bandpass filters for mixed excitation */ |
| 186 | static int orderM; /* Order of filters for mixed excitation */ |
| 187 | static double **h; /* filters for mixed excitation */ |
| 188 | static double *xpulseSignal; /* the size of this should be orderM */ |
| 189 | static double *xnoiseSignal; /* the size of this should be orderM */ |
| 190 | static booleanint mixedExcitation = false0; |
| 191 | static booleanint fourierMagnitudes = false0; |
| 192 | |
| 193 | static booleanint lpcVocoder = false0; /* true if lpc vocoder is used, then the input should be lsp parameters */ |
| 194 | |
| 195 | void initVocoder(int mcep_order, int mcep_vsize, HTSData *htsData); |
| 196 | int htsMLSAVocoder(EST_Track *lf0Pst, |
| 197 | EST_Track *mcepPst, |
| 198 | EST_Track *strPst, |
| 199 | EST_Track *magPst, |
| 200 | int *voiced, |
| 201 | HTSData *htsData, |
| 202 | EST_Wave *wave); |
| 203 | |
| 204 | |
| 205 | LISP me_mlsa_resynthesis(LISP ltrack, LISP strack) |
| 206 | { |
| 207 | /* Resynthesizes a wave from given track with mixed excitation*/ |
| 208 | EST_Track *t; |
| 209 | EST_Track *str_track; |
| 210 | EST_Wave *wave = 0; |
| 211 | EST_Track *mcep; |
| 212 | EST_Track *f0v; |
| 213 | EST_Track *str; |
| 214 | EST_Track *mag; |
| 215 | int *voiced; |
| 216 | int sr = 16000; |
| 217 | int i,j; |
| 218 | double shift; |
| 219 | HTSData htsData; |
| 220 | |
| 221 | htsData.alpha = 0.42; |
| 222 | htsData.beta = 0.0; |
| 223 | |
| 224 | if ((ltrack == NULL__null) || |
| 225 | (TYPEP(ltrack,tc_string)( (ltrack != __null) && ((((ltrack) == ((struct obj * ) 0)) ? 0 : ((*(ltrack)).type)) == (13)) ) && |
| 226 | (streq(get_c_string(ltrack),"nil")(strcmp(get_c_string(ltrack),"nil")==0)))) |
| 227 | return siod(new EST_Wave(0,1,sr)); |
| 228 | |
| 229 | t = track(ltrack); |
| 230 | str_track = track(strack); |
| 231 | |
| 232 | f0v = new EST_Track(t->num_frames(),1); |
| 233 | mcep = new EST_Track(t->num_frames(),25); |
| 234 | str = new EST_Track(t->num_frames(),5); |
| 235 | mag = new EST_Track(t->num_frames(),10); |
| 236 | voiced = walloc(int,t->num_frames())((int *)safe_walloc(sizeof(int)*(t->num_frames()))); |
| 237 | |
| 238 | for (i=0; i<t->num_frames(); i++) |
| 239 | { |
| 240 | f0v->a(i) = t->a(i,0); |
| 241 | if (f0v->a(i) > 0) |
| 242 | voiced[i] = 1; |
| 243 | else |
| 244 | voiced[i] = 0; |
| 245 | for (j=1; j<26; j++) |
| 246 | mcep->a(i,j-1) = t->a(i,j); |
| 247 | |
| 248 | for (j=0; j<5; j++) |
| 249 | { |
| 250 | str->a(i,j) = str_track->a(i,j); |
| 251 | } |
| 252 | /* printf("awb_debug str %d 0 %f 1 %f 2 %f 3 %f 4 %f\n", |
| 253 | i,str->a(i,0),str->a(i,1),str->a(i,2),str->a(i,3),str->a(i,4));*/ |
| 254 | #if 0 |
| 255 | for (j=57; j<66; j++) |
| 256 | mag->a(i,j-57) = t->a(i,j); |
| 257 | #endif |
| 258 | } |
| 259 | |
| 260 | if (t->num_frames() > 1) |
| 261 | shift = 1000.0*(t->t(1)-t->t(0)); |
| 262 | else |
| 263 | shift = 5.0; |
| 264 | |
| 265 | htsData.alpha = FLONM(siod_get_lval("mlsa_alpha_param",((*siod_get_lval("mlsa_alpha_param", "mlsa: mlsa_alpha_param not set" )).storage_as.flonum.data) |
| 266 | "mlsa: mlsa_alpha_param not set"))((*siod_get_lval("mlsa_alpha_param", "mlsa: mlsa_alpha_param not set" )).storage_as.flonum.data); |
| 267 | htsData.beta = FLONM(siod_get_lval("mlsa_beta_param",((*siod_get_lval("mlsa_beta_param", "mlsa: mlsa_beta_param not set" )).storage_as.flonum.data) |
| 268 | "mlsa: mlsa_beta_param not set"))((*siod_get_lval("mlsa_beta_param", "mlsa: mlsa_beta_param not set" )).storage_as.flonum.data); |
| 269 | htsData.stage = 0; |
| 270 | htsData.LogGain = false0; |
| 271 | htsData.fperiod = 80; |
| 272 | htsData.rate = 16000; |
| 273 | htsData.rhos = 0.0; |
| 274 | |
| 275 | htsData.uf = 0.5; |
| 276 | htsData.algnst = false0; /* use state level alignment for duration */ |
| 277 | htsData.algnph = false0; /* use phoneme level alignment for duration */ |
| 278 | htsData.useMixExc = true1; /* use Mixed Excitation */ |
| 279 | htsData.useFourierMag = false0; /* use Fourier magnitudes for pulse generation */ |
| 280 | htsData.useGV = false0; /* use global variance in parameter generation */ |
| 281 | htsData.useGmmGV = false0; /* use global variance as a Gaussian Mixture Model */ |
| 282 | htsData.useUnitDurationContinuousFeature = false0; /* for using external duration, so it will not be generated from HMMs*/ |
| 283 | htsData.useUnitLogF0ContinuousFeature = false0; /* for using external f0, so it will not be generated from HMMs*/ |
| 284 | |
| 285 | /** variables for controling generation of speech in the vocoder |
| 286 | * these variables have default values but can be fixed and read from the |
| 287 | * audio effects component. [Default][min--max] */ |
| 288 | htsData.F0Std = 1.0; /* variable for f0 control, multiply f0 [1.0][0.0--5.0] */ |
| 289 | htsData.F0Mean = 0.0; /* variable for f0 control, add f0 [0.0][0.0--100.0] */ |
| 290 | htsData.length = 0.0; /* total number of frame for generated speech */ |
| 291 | /* length of generated speech (in seconds) [N/A][0.0--30.0] */ |
| 292 | htsData.durationScale = 1.0; /* less than 1.0 is faster and more than 1.0 is slower, min=0.1 max=3.0 */ |
| 293 | |
| 294 | LISP filters = siod_get_lval("me_mix_filters", |
| 295 | "mlsa: me_mix_filters not set"); |
| 296 | LISP f; |
| 297 | int fl; |
| 298 | htsData.NumFilters = 5; |
| 299 | for (fl=0,f=filters; f; fl++) |
| 300 | f=cdr(f); |
| 301 | htsData.OrderFilters = fl/htsData.NumFilters; |
| 302 | htsData.MixFilters = walloc(double *,htsData.NumFilters)((double * *)safe_walloc(sizeof(double *)*(htsData.NumFilters ))); |
| 303 | for (i=0; i < htsData.NumFilters; i++) |
| 304 | { |
| 305 | htsData.MixFilters[i] = walloc(double,htsData.OrderFilters)((double *)safe_walloc(sizeof(double)*(htsData.OrderFilters)) ); |
| 306 | for (j=0; j<htsData.OrderFilters; j++) |
| 307 | { |
| 308 | htsData.MixFilters[i][j] = FLONM(car(filters))((*car(filters)).storage_as.flonum.data); |
| 309 | filters = cdr(filters); |
| 310 | } |
| 311 | } |
| 312 | |
| 313 | wave = new EST_Wave(0,1,sr); |
| 314 | |
| 315 | if (mcep->num_frames() > 0) |
| 316 | /* mcep_order and number of deltas */ |
| 317 | htsMLSAVocoder(f0v,mcep,str,mag,voiced,&htsData,wave); |
| 318 | |
| 319 | delete f0v; |
| 320 | delete mcep; |
| 321 | delete str; |
| 322 | delete mag; |
| 323 | delete voiced; |
| 324 | |
| 325 | return siod(wave); |
| 326 | } |
| 327 | |
| 328 | /** The initialisation of VocoderSetup should be done when there is already |
| 329 | * information about the number of feature vectors to be processed, |
| 330 | * size of the mcep vector file, etc. */ |
| 331 | void initVocoder(int mcep_order, int mcep_vsize, HTSData *htsData) |
| 332 | { |
| 333 | int vector_size; |
| 334 | double xrand; |
| 335 | |
| 336 | stage = htsData->stage; |
| 337 | if(stage != 0) |
| 338 | xgamma = -1.0 / stage; |
| 339 | else |
| 340 | xgamma = 0.0; |
| 341 | use_log_gain = htsData->LogGain; |
| 342 | |
| 343 | fprd = htsData->fperiod; |
| 344 | rate = htsData->rate; |
| 345 | iprd = IPERIOD; |
| 346 | gauss = GAUSS; |
| 347 | |
| 348 | /* XXX */ |
| 349 | xrand = rand(); |
| 350 | |
| 351 | if(stage == 0 ){ /* for MCP */ |
| 352 | |
| 353 | /* mcep_order=74 and pd=PADEORDER=5 (if no HTS_EMBEDDED is used) */ |
| 354 | vector_size = (mcep_vsize * ( 3 + PADEORDER) + 5 * PADEORDER + 6) - (3 * (mcep_order+1)); |
| 355 | CINC_length = CC_length = C_length = mcep_order+1; |
| 356 | D1_length = vector_size; |
| 357 | C = walloc(double,C_length)((double *)safe_walloc(sizeof(double)*(C_length))); |
| 358 | CC = walloc(double,CC_length)((double *)safe_walloc(sizeof(double)*(CC_length))); |
| 359 | CINC = walloc(double,CINC_length)((double *)safe_walloc(sizeof(double)*(CINC_length))); |
| 360 | D1 = walloc(double,D1_length)((double *)safe_walloc(sizeof(double)*(D1_length))); |
| 361 | |
| 362 | vector_size=21; |
| 363 | pade = walloc(double,vector_size)((double *)safe_walloc(sizeof(double)*(vector_size))); |
| 364 | /* ppade is a copy of pade in mlsadf() function : ppade = &( pade[pd*(pd+1)/2] ); */ |
| 365 | ppade = PADEORDER*(PADEORDER+1)/2; /* offset for vector pade */ |
| 366 | pade[0] = 1.0; |
| 367 | pade[1] = 1.0; |
| 368 | pade[2] = 0.0; |
| 369 | pade[3] = 1.0; |
| 370 | pade[4] = 0.0; |
| 371 | pade[5] = 0.0; |
| 372 | pade[6] = 1.0; |
| 373 | pade[7] = 0.0; |
| 374 | pade[8] = 0.0; |
| 375 | pade[9] = 0.0; |
| 376 | pade[10] = 1.0; |
| 377 | pade[11] = 0.4999273; |
| 378 | pade[12] = 0.1067005; |
| 379 | pade[13] = 0.01170221; |
| 380 | pade[14] = 0.0005656279; |
| 381 | pade[15] = 1.0; |
| 382 | pade[16] = 0.4999391; |
| 383 | pade[17] = 0.1107098; |
| 384 | pade[18] = 0.01369984; |
| 385 | pade[19] = 0.0009564853; |
| 386 | pade[20] = 0.00003041721; |
| 387 | |
| 388 | pt1 = PADEORDER+1; |
| 389 | pt2 = ( 2 * (PADEORDER+1)) + (PADEORDER * (mcep_order+2)); |
| 390 | pt3 = new int[PADEORDER+1]; |
| 391 | for(int i=PADEORDER; i>=1; i--) |
| 392 | pt3[i] = ( 2 * (PADEORDER+1)) + ((i-1)*(mcep_order+2)); |
| 393 | |
| 394 | } else { /* for LSP */ |
| 395 | vector_size = ((mcep_vsize+1) * (stage+3)) - ( 3 * (mcep_order+1)); |
| 396 | CINC_length = CC_length = C_length = mcep_order+1; |
| 397 | D1_length = vector_size; |
| 398 | C = walloc(double,C_length)((double *)safe_walloc(sizeof(double)*(C_length))); |
| 399 | CC = walloc(double,CC_length)((double *)safe_walloc(sizeof(double)*(CC_length))); |
| 400 | CINC = walloc(double,CINC_length)((double *)safe_walloc(sizeof(double)*(CINC_length))); |
| 401 | D1 = walloc(double,D1_length)((double *)safe_walloc(sizeof(double)*(D1_length))); |
| 402 | } |
| 403 | |
| 404 | /* excitation initialisation */ |
| 405 | p1 = -1; |
| 406 | pc = 0.0; |
| 407 | |
| 408 | } /* method initVocoder */ |
| 409 | |
| 410 | |
| 411 | |
| 412 | /** |
| 413 | * HTS_MLSA_Vocoder: Synthesis of speech out of mel-cepstral coefficients. |
| 414 | * This procedure uses the parameters generated in pdf2par stored in: |
| 415 | * PStream mceppst: Mel-cepstral coefficients |
| 416 | * PStream strpst : Filter bank strengths for mixed excitation |
| 417 | * PStream magpst : Fourier magnitudes (NOTE: this is not used yet) |
| 418 | * PStream lf0pst : Log F0 |
| 419 | */ |
| 420 | #if 0 /* Disabled, never compiled: the Java wrapper this port came from, which rescales the output to a peak of 0.3 and wraps it in an audio stream. */ |
| 421 | AudioInputStream htsMLSAVocoder(HTSParameterGeneration pdf2par, HMMData htsData) |
| 422 | { |
| 423 | float sampleRate = 16000.0F; //8000,11025,16000,22050,44100 |
| 424 | int sampleSizeInBits = 16; //8,16 |
| 425 | int channels = 1; //1,2 |
| 426 | booleanint signed = true1; //true,false |
| 427 | booleanint bigEndian = false0; //true,false |
| 428 | AudioFormat af = new AudioFormat( |
| 429 | sampleRate, |
| 430 | sampleSizeInBits, |
| 431 | channels, |
| 432 | signed, |
| 433 | bigEndian); |
| 434 | double [] audio_double = NULL__null; |
| 435 | |
| 436 | audio_double = htsMLSAVocoder(pdf2par.getlf0Pst(), pdf2par.getMcepPst(), pdf2par.getStrPst(), pdf2par.getMagPst(), |
| 437 | pdf2par.getVoicedArray(), htsData); |
| 438 | |
| 439 | long lengthInSamples = (audio_double.length * 2 ) / (sampleSizeInBits/8); |
| 440 | logger.info("length in samples=" + lengthInSamples ); |
| 441 | |
| 442 | /* Normalise the signal before returning: divide by the absolute maximum and scale by 0.3 */ |
| 443 | double MaxSample = MathUtils.getAbsMax(audio_double); |
| 444 | for (int i=0; i<audio_double.length; i++) |
| 445 | audio_double[i] = 0.3 * ( audio_double[i] / MaxSample ); |
| 446 | |
| 447 | DDSAudioInputStream oais = new DDSAudioInputStream(new BufferedDoubleDataSource(audio_double), af); |
| 448 | return oais; |
| 449 | |
| 450 | |
| 451 | } /* method htsMLSAVocoder() */ |
| 452 | #endif |
| 453 | |
/**
 * mlsafir: one sample through the FIR section of the MLSA filter.
 *
 * x:    input sample
 * b:    filter coefficients; b[2..m] are read
 * m:    filter order
 * a:    weight applied to the delay-line differences
 * aa:   weight applied to the fresh input sample
 * d:    delay-line storage
 * _pt3: offset of this section inside d; d[_pt3 .. _pt3+m+1] is accessed
 *
 * Returns the sum of d[_pt3+i] * b[i] for i = 2..m, taken after the delay
 * line has been updated and before it is shifted by one place.
 */
static double mlsafir(double x, double *b, int m, double a, double aa, double *d, int _pt3 )
{
    double *line = d + _pt3;
    double acc = 0.0;
    int k;

    line[0] = x;
    line[1] = aa * line[0] + ( a * line[1] );

    /* update each tap and accumulate its contribution straight away */
    for (k = 2; k <= m; k++) {
        line[k] += a * ( line[k+1] - line[k-1] );
        acc += line[k] * b[k];
    }

    /* shift the line one place towards higher indices */
    for (k = m+1; k > 1; k--)
        line[k] = line[k-1];

    return acc;
}
| 476 | |
| 477 | /** mlsdaf1: sub functions for MLSA filter */ |
| 478 | static double mlsadf1(double x, double *b, int m, double a, double aa, double *d) |
| 479 | { |
| 480 | double v; |
| 481 | double out = 0.0; |
| 482 | int i; |
| 483 | //pt1 --> pt = &d1[pd+1] |
| 484 | |
| 485 | for(i=PADEORDER; i>=1; i--) { |
| 486 | d[i] = aa * d[pt1+i-1] + a * d[i]; |
| 487 | d[pt1+i] = d[i] * b[1]; |
| 488 | v = d[pt1+i] * pade[ppade+i]; |
| 489 | |
| 490 | //x += (1 & i) ? v : -v; |
| 491 | if(i == 1 || i == 3 || i == 5) |
| 492 | x += v; |
| 493 | else |
| 494 | x += -v; |
| 495 | out += v; |
| 496 | } |
| 497 | d[pt1+0] = x; |
| 498 | out += x; |
| 499 | |
| 500 | return(out); |
| 501 | |
| 502 | } |
| 503 | |
| 504 | /** mlsdaf2: sub functions for MLSA filter */ |
| 505 | static double mlsadf2(double x, double *b, int m, double a, double aa, double *d) |
| 506 | { |
| 507 | double v; |
| 508 | double out = 0.0; |
| 509 | int i; |
| 510 | // pt2 --> pt = &d1[pd * (m+2)] |
| 511 | // pt3 --> pt = &d1[ 2*(pd+1) ] |
| 512 | |
| 513 | for(i=PADEORDER; i>=1; i--) { |
| 514 | d[pt2+i] = mlsafir(d[(pt2+i)-1], b, m, a, aa, d, pt3[i]); |
| 515 | v = d[pt2+i] * pade[ppade+i]; |
| 516 | |
| 517 | if(i == 1 || i == 3 || i == 5) |
| 518 | x += v; |
| 519 | else |
| 520 | x += -v; |
| 521 | out += v; |
| 522 | |
| 523 | } |
| 524 | d[pt2+0] = x; |
| 525 | out += x; |
| 526 | |
| 527 | return out; |
| 528 | } |
| 529 | |
/**
 * mlsadf: HTS Mel Log Spectrum Approximation filter.
 * Feeds x through mlsadf1() with order m, then through mlsadf2() with
 * order m-1, and returns the result.
 */
static double mlsadf(double x, double *b, int m, double a, double aa, double *d)
{
    const double stage1 = mlsadf1(x, b, m, a, aa, d);

    return mlsadf2(stage1, b, m-1, a, aa, d);
}
| 539 | |
| 540 | |
/**
 * uniformRand: draw one value from rand() and return a random sign.
 * Returns 1.0 when the draw is at least RAND_MAX/2, otherwise -1.0.
 */
static double uniformRand(void)
{
    /* rand() yields an integer in [0, RAND_MAX]; compare it as a double */
    const double draw = rand();

    return (draw >= RAND_MAX/2.0) ? 1.0 : -1.0;
}
| 552 | |
/**
 * mc2b: transform mel-cepstrum to MLSA digital filter coefficients.
 * mc[0..m] is the input and b[0..m] the output:
 * b[m] = mc[m], then b[k] = mc[k] - a * b[k+1] for k = m-1 .. 0.
 */
static void mc2b(double *mc, double *b, int m, double a )
{
    int k;

    b[m] = mc[m];
    for (k = m - 1; k >= 0; k--)
        b[k] = mc[k] - a * b[k+1];
}
| 562 | |
/**
 * b2mc: transform MLSA digital filter coefficients to mel-cepstrum.
 * This is the inverse of mc2b(): mc[m] = b[m], then
 * mc[i] = b[i] + a * b[i+1] for i = m-1 .. 0.
 *
 * b and mc may be the same array; each b[i] is read before mc[i] is written.
 */
static void b2mc(double *b, double *mc, int m, double a)
{
    double next = b[m];   /* holds b[i+1] while mc[i] is computed */
    int i;

    mc[m] = next;
    /* Start at m-1.  Starting at m would overwrite mc[m] with b[m] + a*b[m]. */
    for (i = m - 1; i >= 0; i--) {
        const double cur = b[i];

        mc[i] = cur + (a * next);
        next = cur;
    }
}
| 575 | |
| 576 | |
| 577 | /** freqt: frequency transformation */ |
| 578 | //private void freqt(double c1[], int m1, int cepIndex, int m2, double a){ |
| 579 | static void freqt(double *c1, int m1, double *c2, int m2, double a) |
| 580 | { |
| 581 | double *freqt_buff=NULL__null; /* used in freqt */ |
| 582 | int freqt_size=0; /* buffer size for freqt */ |
| 583 | int i, j; |
| 584 | double b = 1 - a * a; |
| 585 | int g; /* offset of freqt_buff */ |
| 586 | |
| 587 | if(m2 > freqt_size) { |
| 588 | freqt_buff = walloc(double,m2 + m2 + 2)((double *)safe_walloc(sizeof(double)*(m2 + m2 + 2))); |
| 589 | freqt_size = m2; |
| 590 | } |
| 591 | g = freqt_size +1; |
| 592 | |
| 593 | for(i = 0; i < m2+1; i++) |
| 594 | freqt_buff[g+i] = 0.0; |
| 595 | |
| 596 | for(i = -m1; i <= 0; i++){ |
| 597 | if(0 <= m2 ) |
| 598 | freqt_buff[g+0] = c1[-i] + a * (freqt_buff[0] = freqt_buff[g+0]); |
| 599 | if(1 <= m2) |
| 600 | freqt_buff[g+1] = b * freqt_buff[0] + a * (freqt_buff[1] = freqt_buff[g+1]); |
| 601 | |
| 602 | for(j=2; j<=m2; j++) |
| 603 | freqt_buff[g+j] = freqt_buff[j-1] + a * ( (freqt_buff[j] = freqt_buff[g+j]) - freqt_buff[g+j-1]); |
| 604 | |
| 605 | } |
| 606 | |
| 607 | /* move memory */ |
| 608 | for(i=0; i<m2+1; i++) |
| 609 | c2[i] = freqt_buff[g+i]; |
| 610 | |
| 611 | if (freqt_buff) |
| 612 | wfree(freqt_buff); |
| 613 | |
| 614 | } |
| 615 | |
| 616 | /** c2ir: The minimum phase impulse response is evaluated from the minimum phase cepstrum */ |
| 617 | static void c2ir(double *c, int nc, double *hh, int leng ) |
| 618 | { |
| 619 | int n, k, upl; |
| 620 | double d; |
| 621 | |
| 622 | hh[0] = exp(c[0]); |
| 623 | for(n = 1; n < leng; n++) { |
| 624 | d = 0; |
| 625 | upl = (n >= nc) ? nc - 1 : n; |
| 626 | for(k = 1; k <= upl; k++ ) |
| 627 | d += k * c[k] * hh[n - k]; |
| 628 | hh[n] = d / n; |
| 629 | } |
| 630 | } |
| 631 | |
| 632 | /** b2en: functions for postfiltering */ |
| 633 | static double b2en(double *b, int m, double a) |
| 634 | { |
| 635 | double *spectrum2en_buff=NULL__null; /* used in spectrum2en */ |
| 636 | int spectrum2en_size=0; /* buffer size for spectrum2en */ |
| 637 | double en = 0.0; |
| 638 | int i; |
| 639 | double *cep, *ir; |
| 640 | |
| 641 | if(spectrum2en_size < m) { |
| 642 | spectrum2en_buff = walloc(double,(m+1) + 2 * IRLENG)((double *)safe_walloc(sizeof(double)*((m+1) + 2 * IRLENG))); |
| 643 | spectrum2en_size = m; |
| 644 | } |
| 645 | cep = walloc(double,(m+1) + 2 * IRLENG)((double *)safe_walloc(sizeof(double)*((m+1) + 2 * IRLENG))); /* CHECK! these sizes!!! */ |
| 646 | ir = walloc(double,(m+1) + 2 * IRLENG)((double *)safe_walloc(sizeof(double)*((m+1) + 2 * IRLENG))); |
| 647 | |
| 648 | b2mc(b, spectrum2en_buff, m, a); |
| 649 | /* freqt(vs->mc, m, vs->cep, vs->irleng - 1, -a);*/ |
| 650 | freqt(spectrum2en_buff, m, cep, IRLENG-1, -a); |
| 651 | /* HTS_c2ir(vs->cep, vs->irleng, vs->ir, vs->irleng); */ |
| 652 | c2ir(cep, IRLENG, ir, IRLENG); |
| 653 | en = 0.0; |
| 654 | |
| 655 | for(i = 0; i < IRLENG; i++) |
| 656 | en += ir[i] * ir[i]; |
| 657 | |
| 658 | if (spectrum2en_buff) |
| 659 | wfree(spectrum2en_buff); |
| 660 | wfree(cep); |
| 661 | wfree(ir); |
| 662 | |
| 663 | return(en); |
| 664 | } |
| 665 | |
| 666 | /** ignorm: inverse gain normalization */ |
| 667 | static void ignorm(double *c1, double *c2, int m, double ng) |
| 668 | { |
| 669 | double k; |
| 670 | int i; |
| 671 | if(ng != 0.0 ) { |
| 672 | k = pow(c1[0], ng); |
| 673 | for(i=m; i>=1; i--) |
| 674 | c2[i] = k * c1[i]; |
| 675 | c2[0] = (k - 1.0) / ng; |
| 676 | } else { |
| 677 | /* movem */ |
| 678 | for(i=1; i<m; i++) |
| 679 | c2[i] = c1[i]; |
| 680 | c2[0] = log(c1[0]); |
| 681 | } |
| 682 | } |
| 683 | |
| 684 | /** ignorm: gain normalization */ |
| 685 | static void gnorm(double *c1, double *c2, int m, double g) |
| 686 | { |
| 687 | double k; |
| 688 | int i; |
| 689 | if(g != 0.0) { |
| 690 | k = 1.0 + g * c1[0]; |
| 691 | for(; m>=1; m--) |
| 692 | c2[m] = c1[m] / k; |
| 693 | c2[0] = pow(k, 1.0 / g); |
| 694 | } else { |
| 695 | /* movem */ |
| 696 | for(i=1; i<=m; i++) |
| 697 | c2[i] = c1[i]; |
| 698 | c2[0] = exp(c1[0]); |
| 699 | } |
| 700 | |
| 701 | } |
| 702 | |
| 703 | /** lsp2lpc: transform LSP to LPC. lsp[1..m] --> a=lpc[0..m] a[0]=1.0 */ |
| 704 | static void lsp2lpc(double *lsp, double *a, int m) |
| 705 | { |
| 706 | double *lsp2lpc_buff=NULL__null; /* used in lsp2lpc */ |
| 707 | int lsp2lpc_size=0; /* buffer size of lsp2lpc */ |
| 708 | int i, k, mh1, mh2, flag_odd; |
| 709 | double xx, xf, xff; |
| 710 | int p, q; /* offsets of lsp2lpc_buff */ |
| 711 | int a0, a1, a2, b0, b1, b2; /* offsets of lsp2lpc_buff */ |
| 712 | |
| 713 | flag_odd = 0; |
| 714 | if(m % 2 == 0) |
| 715 | mh1 = mh2 = m / 2; |
| 716 | else { |
| 717 | mh1 = (m+1) / 2; |
| 718 | mh2 = (m-1) / 2; |
| 719 | flag_odd = 1; |
| 720 | } |
| 721 | |
| 722 | if(m > lsp2lpc_size){ |
| 723 | lsp2lpc_buff = walloc(double,5 * m + 6)((double *)safe_walloc(sizeof(double)*(5 * m + 6))); |
| 724 | lsp2lpc_size = m; |
| 725 | } |
| 726 | |
| 727 | /* offsets of lsp2lpcbuff */ |
| 728 | p = m; |
| 729 | q = p + mh1; |
| 730 | a0 = q + mh2; |
| 731 | a1 = a0 + (mh1 +1); |
| 732 | a2 = a1 + (mh1 +1); |
| 733 | b0 = a2 + (mh1 +1); |
| 734 | b1 = b0 + (mh2 +1); |
| 735 | b2 = b1 + (mh2 +1); |
| 736 | |
| 737 | /* move lsp -> lsp2lpc_buff */ |
| 738 | for(i=0; i<m; i++) |
| 739 | lsp2lpc_buff[i] = lsp[i+1]; |
| 740 | |
| 741 | for (i = 0; i < mh1 + 1; i++) |
| 742 | lsp2lpc_buff[a0 + i] = 0.0; |
| 743 | for (i = 0; i < mh1 + 1; i++) |
| 744 | lsp2lpc_buff[a1 + i] = 0.0; |
| 745 | for (i = 0; i < mh1 + 1; i++) |
| 746 | lsp2lpc_buff[a2 + i] = 0.0; |
| 747 | for (i = 0; i < mh2 + 1; i++) |
| 748 | lsp2lpc_buff[b0 + i] = 0.0; |
| 749 | for (i = 0; i < mh2 + 1; i++) |
| 750 | lsp2lpc_buff[b1 + i] = 0.0; |
| 751 | for (i = 0; i < mh2 + 1; i++) |
| 752 | lsp2lpc_buff[b2 + i] = 0.0; |
| 753 | |
| 754 | /* lsp filter parameters */ |
| 755 | for (i = k = 0; i < mh1; i++, k += 2) |
| 756 | lsp2lpc_buff[p + i] = -2.0 * cos(lsp2lpc_buff[k]); |
| 757 | for (i = k = 0; i < mh2; i++, k += 2) |
| 758 | lsp2lpc_buff[q + i] = -2.0 * cos(lsp2lpc_buff[k + 1]); |
| 759 | |
| 760 | /* impulse response of analysis filter */ |
| 761 | xx = 1.0; |
| 762 | xf = xff = 0.0; |
| 763 | |
| 764 | for (k = 0; k <= m; k++) { |
| 765 | if (flag_odd == 1) { |
| 766 | lsp2lpc_buff[a0 + 0] = xx; |
| 767 | lsp2lpc_buff[b0 + 0] = xx - xff; |
| 768 | xff = xf; |
| 769 | xf = xx; |
| 770 | } else { |
| 771 | lsp2lpc_buff[a0 + 0] = xx + xf; |
| 772 | lsp2lpc_buff[b0 + 0] = xx - xf; |
| 773 | xf = xx; |
| 774 | } |
| 775 | |
| 776 | for (i = 0; i < mh1; i++) { |
| 777 | lsp2lpc_buff[a0 + i + 1] = lsp2lpc_buff[a0 + i] + lsp2lpc_buff[p + i] * lsp2lpc_buff[a1 + i] + lsp2lpc_buff[a2 + i]; |
| 778 | lsp2lpc_buff[a2 + i] = lsp2lpc_buff[a1 + i]; |
| 779 | lsp2lpc_buff[a1 + i] = lsp2lpc_buff[a0 + i]; |
| 780 | } |
| 781 | |
| 782 | for (i = 0; i < mh2; i++) { |
| 783 | lsp2lpc_buff[b0 + i + 1] = lsp2lpc_buff[b0 + i] + lsp2lpc_buff[q + i] * lsp2lpc_buff[b1 + i] + lsp2lpc_buff[b2 + i]; |
| 784 | lsp2lpc_buff[b2 + i] = lsp2lpc_buff[b1 + i]; |
| 785 | lsp2lpc_buff[b1 + i] = lsp2lpc_buff[b0 + i]; |
| 786 | } |
| 787 | |
| 788 | if (k != 0) |
| 789 | a[k - 1] = -0.5 * (lsp2lpc_buff[a0 + mh1] + lsp2lpc_buff[b0 + mh2]); |
| 790 | xx = 0.0; |
| 791 | } |
| 792 | |
| 793 | for (i = m - 1; i >= 0; i--) |
| 794 | a[i + 1] = -a[i]; |
| 795 | a[0] = 1.0; |
| 796 | |
| 797 | if (lsp2lpc_buff) |
| 798 | wfree(lsp2lpc_buff); |
| 799 | } |
| 800 | |
| 801 | /** gc2gc: generalized cepstral transformation */ |
| 802 | static void gc2gc(double *c1, int m1, double g1, double *c2, int m2, double g2) |
| 803 | { |
| 804 | double *gc2gc_buff=NULL__null; /* used in gc2gc */ |
| 805 | int gc2gc_size=0; /* buffer size for gc2gc */ |
| 806 | int i, min, k, mk; |
| 807 | double ss1, ss2, cc; |
| 808 | |
| 809 | if( m1 > gc2gc_size ) { |
| 810 | gc2gc_buff = walloc(double,m1 + 1)((double *)safe_walloc(sizeof(double)*(m1 + 1))); /* check if these buffers should be created all the time */ |
| 811 | gc2gc_size = m1; |
| 812 | } |
| 813 | |
| 814 | /* movem*/ |
| 815 | for(i=0; i<(m1+1); i++) |
| 816 | gc2gc_buff[i] = c1[i]; |
| 817 | |
| 818 | c2[0] = gc2gc_buff[0]; |
| 819 | |
| 820 | for( i=1; i<=m2; i++){ |
| 821 | ss1 = ss2 = 0.0; |
| 822 | min = m1 < i ? m1 : i - 1; |
| 823 | for(k=1; k<=min; k++){ |
| 824 | mk = i - k; |
| 825 | cc = gc2gc_buff[k] * c2[mk]; |
| 826 | ss2 += k * cc; |
| 827 | ss1 += mk * cc; |
| 828 | } |
| 829 | |
| 830 | if(i <= m1) |
| 831 | c2[i] = gc2gc_buff[i] + (g2 * ss2 - g1 * ss1) / i; |
| 832 | else |
| 833 | c2[i] = (g2 * ss2 - g1 * ss1) / i; |
| 834 | } |
| 835 | |
| 836 | if (gc2gc_buff) |
| 837 | wfree(gc2gc_buff); |
| 838 | } |
| 839 | |
| 840 | /** mgc2mgc: frequency and generalized cepstral transformation */ |
| 841 | static void mgc2mgc(double *c1, int m1, double a1, double g1, double *c2, int m2, double a2, double g2) |
| 842 | { |
| 843 | double a; |
| 844 | |
| 845 | if(a1 == a2){ |
| 846 | gnorm(c1, c1, m1, g1); |
| 847 | gc2gc(c1, m1, g1, c2, m2, g2); |
| 848 | ignorm(c2, c2, m2, g2); |
| 849 | } else { |
| 850 | a = (a2 -a1) / (1 - a1 * a2); |
| 851 | freqt(c1, m1, c2, m2, a); |
| 852 | gnorm(c2, c2, m2, g1); |
| 853 | gc2gc(c2, m2, g1, c2, m2, g2); |
| 854 | ignorm(c2, c2, m2, g2); |
| 855 | |
| 856 | } |
| 857 | } |
| 858 | |
| 859 | /** lsp2mgc: transform LSP to MGC. lsp=C[0..m] mgc=C[0..m] */ |
| 860 | static void lsp2mgc(double *lsp, double *mgc, int m, double alpha) |
| 861 | { |
| 862 | int i; |
| 863 | /* lsp2lpc */ |
| 864 | lsp2lpc(lsp, mgc, m); /* lsp starts in 1! lsp[1..m] --> mgc[0..m] */ |
| 865 | if(use_log_gain) |
| 866 | mgc[0] = exp(lsp[0]); |
| 867 | else |
| 868 | mgc[0] = lsp[0]; |
| 869 | |
| 870 | /* mgc2mgc*/ |
| 871 | if(NORMFLG1) |
| 872 | ignorm(mgc, mgc, m, xgamma); |
| 873 | else if(MULGFLG1) |
| 874 | mgc[0] = (1.0 - mgc[0]) * stage; |
| 875 | |
| 876 | if(MULGFLG1) |
| 877 | for(i=m; i>=1; i--) |
| 878 | mgc[i] *= -stage; |
| 879 | |
| 880 | mgc2mgc(mgc, m, alpha, xgamma, mgc, m, alpha, xgamma); /* input and output is in mgc=C */ |
| 881 | |
| 882 | if(NORMFLG2) |
| 883 | gnorm(mgc, mgc, m, xgamma); |
| 884 | else if(MULGFLG2) |
| 885 | mgc[0] = mgc[0] * xgamma + 1.0; |
| 886 | |
| 887 | if(MULGFLG2) |
| 888 | for(i=m; i>=1; i--) |
| 889 | mgc[i] *= xgamma; |
| 890 | |
| 891 | } |
| 892 | |
/** mglsadff: one section of the MGLSA filter (helper of mglsadf).
 *
 * Works on the m + 1 delay values starting at d[d_offset]:
 * the delays 1..m-1 are updated with the warping factor a, their
 * weighted sum (weights b[1..m]) is subtracted from x, and the delay
 * line is then shifted by one position with a new first element.
 *
 * @param x         input sample
 * @param b         filter coefficients, b[1..m] are read
 * @param m         filter order
 * @param a         warping factor
 * @param d         delay memory shared by all sections
 * @param d_offset  index in d where this section's m + 1 delays start
 * @return the filtered sample
 */
static double mglsadff(double x, double *b, int m, double a, double *d, int d_offset)
{
    double *state = d + d_offset;      /* this section's delay line */
    double acc = state[0] * b[1];      /* feedback accumulated over the taps */
    int tap;

    for (tap = 1; tap < m; tap++) {
        state[tap] += a * (state[tap + 1] - state[tap - 1]);
        acc += state[tap] * b[tap + 1];
    }
    x -= acc;

    /* Shift the delay line and insert the new head value. */
    for (tap = m; tap > 0; tap--)
        state[tap] = state[tap - 1];
    state[0] = a * state[0] + (1 - a * a) * x;

    return x;
}
| 912 | |
| 913 | static double mglsadf(double x, double *b, int m, double a, int n, double *d) |
| 914 | { |
| 915 | int i; |
| 916 | for(i=0; i<n; i++) |
| 917 | x = mglsadff(x, b, m, a, d, (i*(m+1))); |
| 918 | |
| 919 | return x; |
| 920 | } |
| 921 | |
| 922 | /** posfilter: postfilter for mel-cepstrum. It uses alpha and beta defined in HMMData */ |
| 923 | static void postfilter_mcp(double *mcp, int m, double alpha, double beta) |
| 924 | { |
| 925 | double *postfilter_buff=NULL__null; /* used in postfiltering */ |
| 926 | int postfilter_size = 0; /* buffer size for postfiltering */ |
| 927 | |
| 928 | double e1, e2; |
| 929 | int k; |
| 930 | |
| 931 | if(beta > 0.0 && m > 1){ |
| 932 | if(postfilter_size < m){ |
| 933 | postfilter_buff = walloc(double,m+1)((double *)safe_walloc(sizeof(double)*(m+1))); |
| 934 | postfilter_size = m; |
Value stored to 'postfilter_size' is never read | |
| 935 | } |
| 936 | mc2b(mcp, postfilter_buff, m, alpha); |
| 937 | e1 = b2en(postfilter_buff, m, alpha); |
| 938 | |
| 939 | postfilter_buff[1] -= beta * alpha * mcp[2]; |
| 940 | for(k = 2; k < m; k++) |
| 941 | postfilter_buff[k] *= (1.0 +beta); |
| 942 | e2 = b2en(postfilter_buff, m, alpha); |
| 943 | postfilter_buff[0] += log(e1/e2) / 2; |
| 944 | b2mc(postfilter_buff, mcp, m, alpha); |
| 945 | |
| 946 | } |
| 947 | |
| 948 | if (postfilter_buff) |
| 949 | wfree(postfilter_buff); |
| 950 | |
| 951 | } |
| 952 | |
/** modShift: wrap index n into the range [0, N).
 *
 * For N > 0 the result is the non-negative remainder of n modulo N,
 * e.g. modShift(-3, 8) == 5 and modShift(17, 8) == 1.
 * For N <= 0 no wrapping is possible and n is returned unchanged
 * (the former add/subtract loops did not terminate in that case).
 */
static int modShift(int n, int N)
{
    int r;

    if (N <= 0)
        return n;

    /* '%' truncates toward zero, so a negative remainder is moved up. */
    r = n % N;
    if (r < 0)
        r += N;
    return r;
}
| 963 | |
| 964 | /** Generate one pitch period from Fourier magnitudes */ |
| 965 | static double *genPulseFromFourierMag(EST_Track *mag, int n, double f0, booleanint aperiodicFlag) |
| 966 | { |
| 967 | |
| 968 | int numHarm = mag->num_channels(); |
| 969 | int i; |
| 970 | int currentF0 = (int)round(f0); |
| 971 | int T, T2; |
| 972 | double *pulse = NULL__null; |
| 973 | |
| 974 | if(currentF0 < 512) |
| 975 | T = 512; |
| 976 | else |
| 977 | T = 1024; |
| 978 | T2 = 2*T; |
| 979 | |
| 980 | /* since is FFT2 no aperiodicFlag or jitter of 25% is applied */ |
| 981 | |
| 982 | /* get the pulse */ |
| 983 | pulse = walloc(double,T)((double *)safe_walloc(sizeof(double)*(T))); |
| 984 | EST_FVector real(T2); |
| 985 | EST_FVector imag(T2); |
| 986 | |
| 987 | /* copy Fourier magnitudes (Wai C. Chu "Speech Coding algorithms foundation and evolution of standardized coders" pg. 460) */ |
| 988 | real[0] = real[T] = 0.0; /* DC component set to zero */ |
| 989 | for(i=1; i<=numHarm; i++){ |
| 990 | real[i] = real[T-i] = real[T+i] = real[T2-i] = mag->a(n, i-1); /* Symetric extension */ |
| 991 | imag[i] = imag[T-i] = imag[T+i] = imag[T2-i] = 0.0; |
| 992 | } |
| 993 | for(i=(numHarm+1); i<(T-numHarm); i++){ /* Default components set to 1.0 */ |
| 994 | real[i] = real[T-i] = real[T+i] = real[T2-i] = 1.0; |
| 995 | imag[i] = imag[T-i] = imag[T+i] = imag[T2-i] = 0.0; |
| 996 | } |
| 997 | |
| 998 | /* Calculate inverse Fourier transform */ |
| 999 | IFFT(real, imag); |
| 1000 | |
| 1001 | /* circular shift and normalise multiplying by sqrt(F0) */ |
| 1002 | double sqrt_f0 = sqrt((float)currentF0); |
| 1003 | for(i=0; i<T; i++) |
| 1004 | pulse[i] = real[modShift(i-numHarm,T)] * sqrt_f0; |
| 1005 | |
| 1006 | return pulse; |
| 1007 | |
| 1008 | } |
| 1009 | |
| 1010 | int htsMLSAVocoder(EST_Track *lf0Pst, |
| 1011 | EST_Track *mcepPst, |
| 1012 | EST_Track *strPst, |
| 1013 | EST_Track *magPst, |
| 1014 | int *voiced, |
| 1015 | HTSData *htsData, |
| 1016 | EST_Wave *wave) |
| 1017 | { |
| 1018 | |
| 1019 | double inc, x; |
| 1020 | double xp=0.0,xn=0.0,fxp,fxn,mix; /* samples for pulse and for noise and the filtered ones */ |
| 1021 | int i, j, k, m, s, mcepframe, lf0frame, s_double; |
| 1022 | double alpha = htsData->alpha; |
| 1023 | double beta = htsData->beta; |
| 1024 | double aa = 1-alpha*alpha; |
| 1025 | int audio_size; /* audio size in samples, calculated as num frames * frame period */ |
| 1026 | double *audio_double = NULL__null; |
| 1027 | double *magPulse = NULL__null; /* pulse generated from Fourier magnitudes */ |
| 1028 | int magSample, magPulseSize; |
| 1029 | booleanint aperiodicFlag = false0; |
| 1030 | |
| 1031 | double *d; /* used in the lpc vocoder */ |
| 1032 | |
| 1033 | double f0, f0Std, f0Shift, f0MeanOri; |
| 1034 | double *mc = NULL__null; /* feature vector for a particular frame */ |
| 1035 | double *hp = NULL__null; /* pulse shaping filter, initialised once it is known orderM */ |
| 1036 | double *hn = NULL__null; /* noise shaping filter, initialised once it is known orderM */ |
| 1037 | |
| 1038 | /* Initialise vocoder and mixed excitation, once initialised it is known the order |
| 1039 | * of the filters so the shaping filters hp and hn can be initialised. */ |
| 1040 | m = mcepPst->num_channels(); |
| 1041 | mc = walloc(double,m)((double *)safe_walloc(sizeof(double)*(m))); |
| 1042 | |
| 1043 | initVocoder(m-1, mcepPst->num_frames(), htsData); |
| 1044 | |
| 1045 | d = walloc(double,m)((double *)safe_walloc(sizeof(double)*(m))); |
| 1046 | if (lpcVocoder) |
| 1047 | { |
| 1048 | /* printf("Using LPC vocoder\n"); */ |
| 1049 | for(i=0; i<m; i++) |
| 1050 | d[i] = 0.0; |
| 1051 | } |
| 1052 | mixedExcitation = htsData->useMixExc; |
| 1053 | fourierMagnitudes = htsData->useFourierMag; |
| 1054 | |
| 1055 | if ( mixedExcitation ) |
| 1056 | { |
| 1057 | numM = htsData->NumFilters; |
| 1058 | orderM = htsData->OrderFilters; |
| 1059 | |
| 1060 | xpulseSignal = walloc(double,orderM)((double *)safe_walloc(sizeof(double)*(orderM))); |
| 1061 | xnoiseSignal = walloc(double,orderM)((double *)safe_walloc(sizeof(double)*(orderM))); |
| 1062 | /* initialise xp_sig and xn_sig */ |
| 1063 | for(i=0; i<orderM; i++) |
| 1064 | xpulseSignal[i] = xnoiseSignal[i] = 0; |
| 1065 | |
| 1066 | h = htsData->MixFilters; |
| 1067 | hp = walloc(double,orderM)((double *)safe_walloc(sizeof(double)*(orderM))); |
| 1068 | hn = walloc(double,orderM)((double *)safe_walloc(sizeof(double)*(orderM))); |
| 1069 | |
| 1070 | //Check if the number of filters is equal to the order of strpst |
| 1071 | //i.e. the number of filters is equal to the number of generated strengths per frame. |
| 1072 | #if 0 |
| 1073 | if(numM != strPst->num_channels()) { |
| 1074 | printf("htsMLSAVocoder: error num mix-excitation filters = %d " |
| 1075 | " in configuration file is different from generated str order= %d\n", |
| 1076 | numM, strPst->num_channels()); |
| 1077 | } |
| 1078 | printf("HMM speech generation with mixed-excitation.\n"); |
| 1079 | #endif |
| 1080 | } |
| 1081 | #if 0 |
| 1082 | else |
| 1083 | printf("HMM speech generation without mixed-excitation.\n"); |
| 1084 | |
| 1085 | if( fourierMagnitudes && htsData->PdfMagFile != NULL__null) |
| 1086 | printf("Pulse generated with Fourier Magnitudes.\n"); |
| 1087 | else |
| 1088 | printf("Pulse generated as a unit pulse.\n"); |
| 1089 | |
| 1090 | if(beta != 0.0) |
| 1091 | printf("Postfiltering applied with beta=%f",(float)beta); |
| 1092 | else |
| 1093 | printf("No postfiltering applied.\n"); |
| 1094 | #endif |
| 1095 | |
| 1096 | /* Clear content of c, should be done if this function is |
| 1097 | called more than once with a new set of generated parameters. */ |
| 1098 | for(i=0; i< C_length; i++) |
| 1099 | C[i] = CC[i] = CINC[i] = 0.0; |
| 1100 | for(i=0; i< D1_length; i++) |
| 1101 | D1[i]=0.0; |
| 1102 | |
| 1103 | f0Std = htsData->F0Std; |
| 1104 | f0Shift = htsData->F0Mean; |
| 1105 | f0MeanOri = 0.0; |
| 1106 | |
| 1107 | /* XXX */ |
| 1108 | for (mcepframe=0,lf0frame=0; mcepframe<mcepPst->num_frames(); mcepframe++) |
| 1109 | { |
| 1110 | if(voiced[mcepframe]) |
| 1111 | { /* WAS WRONG */ |
| 1112 | f0MeanOri = f0MeanOri + lf0Pst->a(mcepframe, 0); |
| 1113 | lf0frame++; |
| 1114 | } |
| 1115 | } |
| 1116 | f0MeanOri = f0MeanOri/lf0frame; |
| 1117 | |
| 1118 | /* ____________________Synthesize speech waveforms_____________________ */ |
| 1119 | /* generate Nperiod samples per mcepframe */ |
| 1120 | s = 0; /* number of samples */ |
| 1121 | s_double = 0; |
| 1122 | audio_size = mcepPst->num_frames() * (fprd); |
| 1123 | audio_double = walloc(double,audio_size)((double *)safe_walloc(sizeof(double)*(audio_size))); /* initialise buffer for audio */ |
| 1124 | magSample = 1; |
| 1125 | magPulseSize = 0; |
| 1126 | |
| 1127 | for(mcepframe=0,lf0frame=0; mcepframe<mcepPst->num_frames(); mcepframe++) |
| 1128 | { |
| 1129 | /* get current feature vector mcp */ |
| 1130 | for(i=0; i<m; i++) |
| 1131 | mc[i] = mcepPst->a(mcepframe, i); |
| 1132 | |
| 1133 | /* f0 modification through the MARY audio effects */ |
| 1134 | if(voiced[mcepframe]){ |
| 1135 | f0 = f0Std * lf0Pst->a(mcepframe, 0) + (1-f0Std) * f0MeanOri + f0Shift; |
| 1136 | lf0frame++; |
| 1137 | if(f0 < 0.0) |
| 1138 | f0 = 0.0; |
| 1139 | } |
| 1140 | else{ |
| 1141 | f0 = 0.0; |
| 1142 | } |
| 1143 | |
| 1144 | /* if mixed excitation get shaping filters for this frame */ |
| 1145 | if (mixedExcitation) |
| 1146 | { |
| 1147 | for(j=0; j<orderM; j++) |
| 1148 | { |
| 1149 | hp[j] = hn[j] = 0.0; |
| 1150 | for(i=0; i<numM; i++) |
| 1151 | { |
| 1152 | hp[j] += strPst->a(mcepframe, i) * h[i][j]; |
| 1153 | hn[j] += ( 1 - strPst->a(mcepframe, i) ) * h[i][j]; |
| 1154 | } |
| 1155 | } |
| 1156 | } |
| 1157 | |
| 1158 | /* f0->pitch, in original code here it is used p, so f0=p in the c code */ |
| 1159 | if(f0 != 0.0) |
| 1160 | f0 = rate/f0; |
| 1161 | |
| 1162 | /* p1 is initialised in -1, so this will be done just for the first frame */ |
| 1163 | if( p1 < 0 ) { |
| 1164 | p1 = f0; |
| 1165 | pc = p1; |
| 1166 | /* for LSP */ |
| 1167 | if(stage != 0){ |
| 1168 | if( use_log_gain) |
| 1169 | C[0] = LZERO; |
| 1170 | else |
| 1171 | C[0] = ZERO; |
| 1172 | for(i=0; i<m; i++ ) |
| 1173 | C[i] = i * PI3.14159265358979323846 / m; |
| 1174 | /* LSP -> MGC */ |
| 1175 | lsp2mgc(C, C, (m-1), alpha); |
| 1176 | mc2b(C, C, (m-1), alpha); |
| 1177 | gnorm(C, C, (m-1), xgamma); |
| 1178 | for(i=1; i<m; i++) |
| 1179 | C[i] *= xgamma; |
| 1180 | } |
| 1181 | |
| 1182 | } |
| 1183 | |
| 1184 | if(stage == 0){ |
| 1185 | /* postfiltering, this is done if beta>0.0 */ |
| 1186 | postfilter_mcp(mc, (m-1), alpha, beta); |
| 1187 | /* mc2b: transform mel-cepstrum to MLSA digital filter coefficients */ |
| 1188 | mc2b(mc, CC, (m-1), alpha); |
| 1189 | for(i=0; i<m; i++) |
| 1190 | CINC[i] = (CC[i] - C[i]) * iprd / fprd; |
| 1191 | } else { |
| 1192 | |
| 1193 | lsp2mgc(mc, CC, (m-1), alpha ); |
| 1194 | |
| 1195 | mc2b(CC, CC, (m-1), alpha); |
| 1196 | |
| 1197 | gnorm(CC, CC, (m-1), xgamma); |
| 1198 | |
| 1199 | for(i=1; i<m; i++) |
| 1200 | CC[i] *= xgamma; |
| 1201 | |
| 1202 | for(i=0; i<m; i++) |
| 1203 | CINC[i] = (CC[i] - C[i]) * iprd / fprd; |
| 1204 | |
| 1205 | } |
| 1206 | |
| 1207 | /* p=f0 in c code!!! */ |
| 1208 | if( p1 != 0.0 && f0 != 0.0 ) { |
| 1209 | inc = (f0 - p1) * (double)iprd/(double)fprd; |
| 1210 | //System.out.println(" inc=(f0-p1)/80=" + inc ); |
| 1211 | } else { |
| 1212 | inc = 0.0; |
| 1213 | pc = f0; |
| 1214 | p1 = 0.0; |
| 1215 | } |
| 1216 | |
| 1217 | /* Here need to generate both xp:pulse and xn:noise signals seprately*/ |
| 1218 | gauss = false0; /* Mixed excitation works better with nomal noise */ |
| 1219 | |
| 1220 | /* Generate fperiod samples per feature vector, normally 80 samples per frame */ |
| 1221 | //p1=0.0; |
| 1222 | gauss=false0; |
| 1223 | for(j=fprd-1, i=(iprd+1)/2; j>=0; j--) { |
| 1224 | if(p1 == 0.0) { |
| 1225 | if(gauss) |
| 1226 | x = 0 /* rand.nextGaussian() */; /* XXX returns double, gaussian distribution mean=0.0 and var=1.0 */ |
| 1227 | else |
| 1228 | x = uniformRand(); /* returns 1.0 or -1.0 uniformly distributed */ |
| 1229 | |
| 1230 | if(mixedExcitation) { |
| 1231 | xn = x; |
| 1232 | xp = 0.0; |
| 1233 | } |
| 1234 | } else { |
| 1235 | if( (pc += 1.0) >= p1 ){ |
| 1236 | if(fourierMagnitudes){ |
| 1237 | /* jitter is applied just in voiced frames when the stregth of the first band is < 0.5*/ |
| 1238 | /* this will work just if Radix FFT is used */ |
| 1239 | /*if(strPst.getPar(mcepframe, 0) < 0.5) |
| 1240 | aperiodicFlag = true; |
| 1241 | else |
| 1242 | aperiodicFlag = false; |
| 1243 | magPulse = genPulseFromFourierMagRadix(magPst, mcepframe, p1, aperiodicFlag); |
| 1244 | */ |
| 1245 | |
| 1246 | magPulse = genPulseFromFourierMag(magPst, mcepframe, p1, aperiodicFlag); |
| 1247 | magSample = 0; |
| 1248 | magPulseSize = -27 /* magPulse.length*/; /** XXX **/ |
| 1249 | x = magPulse[magSample]; |
| 1250 | magSample++; |
| 1251 | } else |
| 1252 | x = sqrt(p1); |
| 1253 | |
| 1254 | pc = pc - p1; |
| 1255 | } else { |
| 1256 | |
| 1257 | if(fourierMagnitudes){ |
| 1258 | if(magSample >= magPulseSize ){ |
| 1259 | x = 0.0; |
| 1260 | } |
| 1261 | else |
| 1262 | x = magPulse[magSample]; |
| 1263 | magSample++; |
| 1264 | } else |
| 1265 | x = 0.0; |
| 1266 | } |
| 1267 | |
| 1268 | if(mixedExcitation) { |
| 1269 | xp = x; |
| 1270 | if(gauss) |
| 1271 | xn = 0 /* rand.nextGaussian() */ ; /* XXX */ |
| 1272 | else |
| 1273 | xn = uniformRand(); |
| 1274 | } |
| 1275 | } |
| 1276 | |
| 1277 | /* apply the shaping filters to the pulse and noise samples */ |
| 1278 | /* i need memory of at least for M samples in both signals */ |
| 1279 | if(mixedExcitation) { |
| 1280 | fxp = 0.0; |
| 1281 | fxn = 0.0; |
| 1282 | for(k=orderM-1; k>0; k--) { |
| 1283 | fxp += hp[k] * xpulseSignal[k]; |
| 1284 | fxn += hn[k] * xnoiseSignal[k]; |
| 1285 | xpulseSignal[k] = xpulseSignal[k-1]; |
| 1286 | xnoiseSignal[k] = xnoiseSignal[k-1]; |
| 1287 | } |
| 1288 | fxp += hp[0] * xp; |
| 1289 | fxn += hn[0] * xn; |
| 1290 | xpulseSignal[0] = xp; |
| 1291 | xnoiseSignal[0] = xn; |
| 1292 | |
| 1293 | /* x is a pulse noise excitation and mix is mixed excitation */ |
| 1294 | mix = fxp+fxn; |
| 1295 | |
| 1296 | /* comment this line if no mixed excitation, just pulse and noise */ |
| 1297 | x = mix; /* excitation sample */ |
| 1298 | /* printf("awb_debug me %d %f\n",(int)(s_double),(float)x); */ |
| 1299 | } |
| 1300 | |
| 1301 | if(lpcVocoder){ |
| 1302 | // LPC filter C[k=0] = gain is not used! |
| 1303 | if(!NGAIN) |
| 1304 | x *= C[0]; |
| 1305 | for(k=(m-1); k>1; k--){ |
| 1306 | x = x - (C[k] * d[k]); |
| 1307 | d[k] = d[k-1]; |
| 1308 | } |
| 1309 | x = x - (C[1] * d[1]); |
| 1310 | d[1] = x; |
| 1311 | |
| 1312 | } else if(stage == 0 ){ |
| 1313 | if(x != 0.0 ) |
| 1314 | x *= exp(C[0]); |
| 1315 | x = mlsadf(x, C, m, alpha, aa, D1); |
| 1316 | |
| 1317 | } else { |
| 1318 | if(!NGAIN) |
| 1319 | x *= C[0]; |
| 1320 | x = mglsadf(x, C, (m-1), alpha, stage, D1); |
| 1321 | } |
| 1322 | |
| 1323 | audio_double[s_double] = x; |
| 1324 | s_double++; |
| 1325 | |
| 1326 | if((--i) == 0 ) { |
| 1327 | p1 += inc; |
| 1328 | for(k=0; k<m; k++){ |
| 1329 | C[k] += CINC[k]; |
| 1330 | } |
| 1331 | i = iprd; |
| 1332 | } |
| 1333 | } /* for each sample in a period fprd */ |
| 1334 | |
| 1335 | p1 = f0; |
| 1336 | |
| 1337 | /* move elements in c */ |
| 1338 | /* HTS_movem(v->cc, v->c, m + 1); */ |
| 1339 | for(i=0; i<m; i++){ |
| 1340 | C[i] = CC[i]; |
| 1341 | } |
| 1342 | |
| 1343 | } /* for each mcep frame */ |
| 1344 | |
| 1345 | /* printf("Finish processing %d mcep frames.\n",mcepframe); */ |
| 1346 | |
| 1347 | wave->resize(audio_size,1); |
| 1348 | for (i=0; i<s_double; i++) |
| 1349 | wave->a(i) = (short)audio_double[i]; |
| 1350 | |
| 1351 | return 0; |
| 1352 | |
| 1353 | } /* method htsMLSAVocoder() */ |
| 1354 | |
| 1355 |