File: | modules/clustergen/me_mlsa.cc |
Location: | line 1120, column 5 |
Description: | Value stored to 's' is never read |
1 | /** |
2 | * The HMM-Based Speech Synthesis System (HTS) |
3 | * HTS Working Group |
4 | * |
5 | * Department of Computer Science |
6 | * Nagoya Institute of Technology |
7 | * and |
8 | * Interdisciplinary Graduate School of Science and Engineering |
9 | * Tokyo Institute of Technology |
10 | * |
11 | * Portions Copyright (c) 2001-2006 |
12 | * All Rights Reserved. |
13 | * |
14 | * Portions Copyright 2000-2007 DFKI GmbH. |
15 | * All Rights Reserved. |
16 | * |
17 | * Permission is hereby granted, free of charge, to use and |
18 | * distribute this software and its documentation without |
19 | * restriction, including without limitation the rights to use, |
20 | * copy, modify, merge, publish, distribute, sublicense, and/or |
21 | * sell copies of this work, and to permit persons to whom this |
22 | * work is furnished to do so, subject to the following conditions: |
23 | * |
24 | * 1. The source code must retain the above copyright notice, |
25 | * this list of conditions and the following disclaimer. |
26 | * |
27 | * 2. Any modifications to the source code must be clearly |
28 | * marked as such. |
29 | * |
30 | * 3. Redistributions in binary form must reproduce the above |
31 | * copyright notice, this list of conditions and the |
32 | * following disclaimer in the documentation and/or other |
33 | * materials provided with the distribution. Otherwise, one |
34 | * must contact the HTS working group. |
35 | * |
36 | * NAGOYA INSTITUTE OF TECHNOLOGY, TOKYO INSTITUTE OF TECHNOLOGY, |
37 | * HTS WORKING GROUP, AND THE CONTRIBUTORS TO THIS WORK DISCLAIM |
38 | * ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL |
39 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT |
40 | * SHALL NAGOYA INSTITUTE OF TECHNOLOGY, TOKYO INSTITUTE OF |
41 | * TECHNOLOGY, HTS WORKING GROUP, NOR THE CONTRIBUTORS BE LIABLE |
42 | * FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY |
43 | * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, |
44 | * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTUOUS |
45 | * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR |
46 | * PERFORMANCE OF THIS SOFTWARE. |
47 | * |
48 | * |
49 | * This software was translated to C for use within Festival to offer |
50 | * multi-excitation MLSA |
51 | * Alan W Black (awb@cs.cmu.edu) 3rd April 2009 |
52 | * |
53 | */ |
54 | |
55 | #include <stdio.h> |
56 | #include <stdlib.h> |
57 | #include <string.h> |
58 | #include <math.h> |
59 | #include <EST_walloc.h> |
60 | #include "festival.h" |
61 | |
62 | #include "mlsa_resynthesis.h" |
63 | |
64 | /** |
65 | * Synthesis of speech out of speech parameters. |
66 | * Mixed excitation MLSA vocoder. |
67 | * |
68 | * Java port and extension of HTS engine version 2.0 |
69 | * Extension: mixed excitation |
70 | * @author Marcela Charfuelan |
71 | * And ported to C by Alan W Black (awb@cs.cmu.edu) |
72 | */ |
73 | |
74 | #define booleanint int |
75 | #define true1 1 |
76 | #define false0 0 |
77 | |
78 | typedef struct HTSData_struct { |
79 | |
80 | int rate; |
81 | int fperiod; |
82 | double rhos; |
83 | |
84 | int stage; |
85 | double alpha; |
86 | double beta; |
87 | booleanint useLogGain; |
88 | double uf; |
89 | booleanint algnst; /* use state level alignment for duration */ |
90 | booleanint algnph; /* use phoneme level alignment for duration */ |
91 | booleanint useMixExc; /* use Mixed Excitation */ |
92 | booleanint useFourierMag; /* use Fourier magnitudes for pulse generation */ |
93 | booleanint useGV; /* use global variance in parameter generation */ |
94 | booleanint useGmmGV; /* use global variance as a Gaussian Mixture Model */ |
95 | booleanint useUnitDurationContinuousFeature; /* for using external duration, so it will not be generated from HMMs*/ |
96 | booleanint useUnitLogF0ContinuousFeature; /* for using external f0, so it will not be generated from HMMs*/ |
97 | |
98 | /** variables for controling generation of speech in the vocoder |
99 | * these variables have default values but can be fixed and read from the |
100 | * audio effects component. [Default][min--max] */ |
101 | double length; /* total number of frame for generated speech */ |
102 | /* length of generated speech (in seconds) [N/A][0.0--30.0] */ |
103 | double durationScale; /* less than 1.0 is faster and more than 1.0 is slower, min=0.1 max=3.0 */ |
104 | |
105 | booleanint LogGain; |
106 | char *PdfStrFile, *PdfMagFile; |
107 | |
108 | int NumFilters, OrderFilters; |
109 | double **MixFilters; |
110 | double F0Std; |
111 | double F0Mean; |
112 | |
113 | } HTSData; |
114 | |
115 | #if 0 |
116 | typedef struct HTSData_struct { |
117 | |
118 | int rate = 16000; |
119 | int fperiod = 80; |
120 | double rhos = 0.0; |
121 | |
122 | int stage = 0; |
123 | double alpha = 0.42; |
124 | booleanint useLogGain = false0; |
125 | double uf = 0.5; |
126 | booleanint algnst = false0; /* use state level alignment for duration */ |
127 | booleanint algnph = false0; /* use phoneme level alignment for duration */ |
128 | booleanint useMixExc = true1; /* use Mixed Excitation */ |
129 | booleanint useFourierMag = false0; /* use Fourier magnitudes for pulse generation */ |
130 | booleanint useGV = false0; /* use global variance in parameter generation */ |
131 | booleanint useGmmGV = false0; /* use global variance as a Gaussian Mixture Model */ |
132 | booleanint useUnitDurationContinuousFeature = false0; /* for using external duration, so it will not be generated from HMMs*/ |
133 | booleanint useUnitLogF0ContinuousFeature = false0; /* for using external f0, so it will not be generated from HMMs*/ |
134 | |
135 | /** variables for controling generation of speech in the vocoder |
136 | * these variables have default values but can be fixed and read from the |
137 | * audio effects component. [Default][min--max] */ |
138 | double f0Std = 1.0; /* variable for f0 control, multiply f0 [1.0][0.0--5.0] */ |
139 | double f0Mean = 0.0; /* variable for f0 control, add f0 [0.0][0.0--100.0] */ |
140 | double length = 0.0; /* total number of frame for generated speech */ |
141 | /* length of generated speech (in seconds) [N/A][0.0--30.0] */ |
142 | double durationScale = 1.0; /* less than 1.0 is faster and more than 1.0 is slower, min=0.1 max=3.0 */ |
143 | |
144 | } HTSData; |
145 | #endif |
146 | |
147 | static int IPERIOD = 1; |
148 | static booleanint GAUSS = true1; |
149 | static int PADEORDER = 5; /* pade order for MLSA filter */ |
150 | static int IRLENG = 96; /* length of impulse response */ |
151 | |
152 | /* for MGLSA filter (mel-generalised log spectrum approximation filter) */ |
153 | static booleanint NORMFLG1 = true1; |
154 | static booleanint NORMFLG2 = false0; |
155 | static booleanint MULGFLG1 = true1; |
156 | static booleanint MULGFLG2 = false0; |
157 | static booleanint NGAIN = false0; |
158 | |
159 | static double ZERO = 1.0e-10; /* ~(0) */ |
160 | static double LZERO = (-1.0e+10); /* ~log(0) */ |
161 | |
162 | static int stage; /* Gamma=-1/stage : if stage=0 then Gamma=0 */ |
163 | static double xgamma; /* Gamma */ |
164 | static booleanint use_log_gain; /* log gain flag (for LSP) */ |
165 | static int fprd; /* frame shift */ |
166 | static int iprd; /* interpolation period */ |
167 | static booleanint gauss; /* flag to use Gaussian noise */ |
168 | static double p1; /* used in excitation generation */ |
169 | static double pc; /* used in excitation generation */ |
170 | static double *pade; /* used in mlsadf */ |
171 | static int ppade; /* offset for vector ppade */ |
172 | |
173 | static double *C; /* used in the MLSA/MGLSA filter */ |
174 | static double *CC; /* used in the MLSA/MGLSA filter */ |
175 | static double *CINC; /* used in the MLSA/MGLSA filter */ |
176 | static double *D1; /* used in the MLSA/MGLSA filter */ |
177 | static int CINC_length, CC_length, C_length, D1_length; |
178 | |
179 | static double rate; |
180 | static int pt1; /* used in mlsadf1 */ |
181 | static int pt2; /* used in mlsadf2 */ |
182 | static int *pt3; /* used in mlsadf2 */ |
183 | |
184 | /* mixed excitation variables */ |
185 | static int numM; /* Number of bandpass filters for mixed excitation */ |
186 | static int orderM; /* Order of filters for mixed excitation */ |
187 | static double **h; /* filters for mixed excitation */ |
188 | static double *xpulseSignal; /* the size of this should be orderM */ |
189 | static double *xnoiseSignal; /* the size of this should be orderM */ |
190 | static booleanint mixedExcitation = false0; |
191 | static booleanint fourierMagnitudes = false0; |
192 | |
193 | static booleanint lpcVocoder = false0; /* true if lpc vocoder is used, then the input should be lsp parameters */ |
194 | |
195 | void initVocoder(int mcep_order, int mcep_vsize, HTSData *htsData); |
196 | int htsMLSAVocoder(EST_Track *lf0Pst, |
197 | EST_Track *mcepPst, |
198 | EST_Track *strPst, |
199 | EST_Track *magPst, |
200 | int *voiced, |
201 | HTSData *htsData, |
202 | EST_Wave *wave); |
203 | |
204 | |
205 | LISP me_mlsa_resynthesis(LISP ltrack, LISP strack) |
206 | { |
207 | /* Resynthesizes a wave from given track with mixed excitation*/ |
208 | EST_Track *t; |
209 | EST_Track *str_track; |
210 | EST_Wave *wave = 0; |
211 | EST_Track *mcep; |
212 | EST_Track *f0v; |
213 | EST_Track *str; |
214 | EST_Track *mag; |
215 | int *voiced; |
216 | int sr = 16000; |
217 | int i,j; |
218 | double shift; |
219 | HTSData htsData; |
220 | |
221 | htsData.alpha = 0.42; |
222 | htsData.beta = 0.0; |
223 | |
224 | if ((ltrack == NULL__null) || |
225 | (TYPEP(ltrack,tc_string)( (ltrack != __null) && ((((ltrack) == ((struct obj * ) 0)) ? 0 : ((*(ltrack)).type)) == (13)) ) && |
226 | (streq(get_c_string(ltrack),"nil")(strcmp(get_c_string(ltrack),"nil")==0)))) |
227 | return siod(new EST_Wave(0,1,sr)); |
228 | |
229 | t = track(ltrack); |
230 | str_track = track(strack); |
231 | |
232 | f0v = new EST_Track(t->num_frames(),1); |
233 | mcep = new EST_Track(t->num_frames(),25); |
234 | str = new EST_Track(t->num_frames(),5); |
235 | mag = new EST_Track(t->num_frames(),10); |
236 | voiced = walloc(int,t->num_frames())((int *)safe_walloc(sizeof(int)*(t->num_frames()))); |
237 | |
238 | for (i=0; i<t->num_frames(); i++) |
239 | { |
240 | f0v->a(i) = t->a(i,0); |
241 | if (f0v->a(i) > 0) |
242 | voiced[i] = 1; |
243 | else |
244 | voiced[i] = 0; |
245 | for (j=1; j<26; j++) |
246 | mcep->a(i,j-1) = t->a(i,j); |
247 | |
248 | for (j=0; j<5; j++) |
249 | { |
250 | str->a(i,j) = str_track->a(i,j); |
251 | } |
252 | /* printf("awb_debug str %d 0 %f 1 %f 2 %f 3 %f 4 %f\n", |
253 | i,str->a(i,0),str->a(i,1),str->a(i,2),str->a(i,3),str->a(i,4));*/ |
254 | #if 0 |
255 | for (j=57; j<66; j++) |
256 | mag->a(i,j-57) = t->a(i,j); |
257 | #endif |
258 | } |
259 | |
260 | if (t->num_frames() > 1) |
261 | shift = 1000.0*(t->t(1)-t->t(0)); |
262 | else |
263 | shift = 5.0; |
264 | |
265 | htsData.alpha = FLONM(siod_get_lval("mlsa_alpha_param",((*siod_get_lval("mlsa_alpha_param", "mlsa: mlsa_alpha_param not set" )).storage_as.flonum.data) |
266 | "mlsa: mlsa_alpha_param not set"))((*siod_get_lval("mlsa_alpha_param", "mlsa: mlsa_alpha_param not set" )).storage_as.flonum.data); |
267 | htsData.beta = FLONM(siod_get_lval("mlsa_beta_param",((*siod_get_lval("mlsa_beta_param", "mlsa: mlsa_beta_param not set" )).storage_as.flonum.data) |
268 | "mlsa: mlsa_beta_param not set"))((*siod_get_lval("mlsa_beta_param", "mlsa: mlsa_beta_param not set" )).storage_as.flonum.data); |
269 | htsData.stage = 0; |
270 | htsData.LogGain = false0; |
271 | htsData.fperiod = 80; |
272 | htsData.rate = 16000; |
273 | htsData.rhos = 0.0; |
274 | |
275 | htsData.uf = 0.5; |
276 | htsData.algnst = false0; /* use state level alignment for duration */ |
277 | htsData.algnph = false0; /* use phoneme level alignment for duration */ |
278 | htsData.useMixExc = true1; /* use Mixed Excitation */ |
279 | htsData.useFourierMag = false0; /* use Fourier magnitudes for pulse generation */ |
280 | htsData.useGV = false0; /* use global variance in parameter generation */ |
281 | htsData.useGmmGV = false0; /* use global variance as a Gaussian Mixture Model */ |
282 | htsData.useUnitDurationContinuousFeature = false0; /* for using external duration, so it will not be generated from HMMs*/ |
283 | htsData.useUnitLogF0ContinuousFeature = false0; /* for using external f0, so it will not be generated from HMMs*/ |
284 | |
285 | /** variables for controling generation of speech in the vocoder |
286 | * these variables have default values but can be fixed and read from the |
287 | * audio effects component. [Default][min--max] */ |
288 | htsData.F0Std = 1.0; /* variable for f0 control, multiply f0 [1.0][0.0--5.0] */ |
289 | htsData.F0Mean = 0.0; /* variable for f0 control, add f0 [0.0][0.0--100.0] */ |
290 | htsData.length = 0.0; /* total number of frame for generated speech */ |
291 | /* length of generated speech (in seconds) [N/A][0.0--30.0] */ |
292 | htsData.durationScale = 1.0; /* less than 1.0 is faster and more than 1.0 is slower, min=0.1 max=3.0 */ |
293 | |
294 | LISP filters = siod_get_lval("me_mix_filters", |
295 | "mlsa: me_mix_filters not set"); |
296 | LISP f; |
297 | int fl; |
298 | htsData.NumFilters = 5; |
299 | for (fl=0,f=filters; f; fl++) |
300 | f=cdr(f); |
301 | htsData.OrderFilters = fl/htsData.NumFilters; |
302 | htsData.MixFilters = walloc(double *,htsData.NumFilters)((double * *)safe_walloc(sizeof(double *)*(htsData.NumFilters ))); |
303 | for (i=0; i < htsData.NumFilters; i++) |
304 | { |
305 | htsData.MixFilters[i] = walloc(double,htsData.OrderFilters)((double *)safe_walloc(sizeof(double)*(htsData.OrderFilters)) ); |
306 | for (j=0; j<htsData.OrderFilters; j++) |
307 | { |
308 | htsData.MixFilters[i][j] = FLONM(car(filters))((*car(filters)).storage_as.flonum.data); |
309 | filters = cdr(filters); |
310 | } |
311 | } |
312 | |
313 | wave = new EST_Wave(0,1,sr); |
314 | |
315 | if (mcep->num_frames() > 0) |
316 | /* mcep_order and number of deltas */ |
317 | htsMLSAVocoder(f0v,mcep,str,mag,voiced,&htsData,wave); |
318 | |
319 | delete f0v; |
320 | delete mcep; |
321 | delete str; |
322 | delete mag; |
323 | delete voiced; |
324 | |
325 | return siod(wave); |
326 | } |
327 | |
328 | /** The initialisation of VocoderSetup should be done when there is already |
329 | * information about the number of feature vectors to be processed, |
330 | * size of the mcep vector file, etc. */ |
331 | void initVocoder(int mcep_order, int mcep_vsize, HTSData *htsData) |
332 | { |
333 | int vector_size; |
334 | double xrand; |
335 | |
336 | stage = htsData->stage; |
337 | if(stage != 0) |
338 | xgamma = -1.0 / stage; |
339 | else |
340 | xgamma = 0.0; |
341 | use_log_gain = htsData->LogGain; |
342 | |
343 | fprd = htsData->fperiod; |
344 | rate = htsData->rate; |
345 | iprd = IPERIOD; |
346 | gauss = GAUSS; |
347 | |
348 | /* XXX */ |
349 | xrand = rand(); |
350 | |
351 | if(stage == 0 ){ /* for MCP */ |
352 | |
353 | /* mcep_order=74 and pd=PADEORDER=5 (if no HTS_EMBEDDED is used) */ |
354 | vector_size = (mcep_vsize * ( 3 + PADEORDER) + 5 * PADEORDER + 6) - (3 * (mcep_order+1)); |
355 | CINC_length = CC_length = C_length = mcep_order+1; |
356 | D1_length = vector_size; |
357 | C = walloc(double,C_length)((double *)safe_walloc(sizeof(double)*(C_length))); |
358 | CC = walloc(double,CC_length)((double *)safe_walloc(sizeof(double)*(CC_length))); |
359 | CINC = walloc(double,CINC_length)((double *)safe_walloc(sizeof(double)*(CINC_length))); |
360 | D1 = walloc(double,D1_length)((double *)safe_walloc(sizeof(double)*(D1_length))); |
361 | |
362 | vector_size=21; |
363 | pade = walloc(double,vector_size)((double *)safe_walloc(sizeof(double)*(vector_size))); |
364 | /* ppade is a copy of pade in mlsadf() function : ppade = &( pade[pd*(pd+1)/2] ); */ |
365 | ppade = PADEORDER*(PADEORDER+1)/2; /* offset for vector pade */ |
366 | pade[0] = 1.0; |
367 | pade[1] = 1.0; |
368 | pade[2] = 0.0; |
369 | pade[3] = 1.0; |
370 | pade[4] = 0.0; |
371 | pade[5] = 0.0; |
372 | pade[6] = 1.0; |
373 | pade[7] = 0.0; |
374 | pade[8] = 0.0; |
375 | pade[9] = 0.0; |
376 | pade[10] = 1.0; |
377 | pade[11] = 0.4999273; |
378 | pade[12] = 0.1067005; |
379 | pade[13] = 0.01170221; |
380 | pade[14] = 0.0005656279; |
381 | pade[15] = 1.0; |
382 | pade[16] = 0.4999391; |
383 | pade[17] = 0.1107098; |
384 | pade[18] = 0.01369984; |
385 | pade[19] = 0.0009564853; |
386 | pade[20] = 0.00003041721; |
387 | |
388 | pt1 = PADEORDER+1; |
389 | pt2 = ( 2 * (PADEORDER+1)) + (PADEORDER * (mcep_order+2)); |
390 | pt3 = new int[PADEORDER+1]; |
391 | for(int i=PADEORDER; i>=1; i--) |
392 | pt3[i] = ( 2 * (PADEORDER+1)) + ((i-1)*(mcep_order+2)); |
393 | |
394 | } else { /* for LSP */ |
395 | vector_size = ((mcep_vsize+1) * (stage+3)) - ( 3 * (mcep_order+1)); |
396 | CINC_length = CC_length = C_length = mcep_order+1; |
397 | D1_length = vector_size; |
398 | C = walloc(double,C_length)((double *)safe_walloc(sizeof(double)*(C_length))); |
399 | CC = walloc(double,CC_length)((double *)safe_walloc(sizeof(double)*(CC_length))); |
400 | CINC = walloc(double,CINC_length)((double *)safe_walloc(sizeof(double)*(CINC_length))); |
401 | D1 = walloc(double,D1_length)((double *)safe_walloc(sizeof(double)*(D1_length))); |
402 | } |
403 | |
404 | /* excitation initialisation */ |
405 | p1 = -1; |
406 | pc = 0.0; |
407 | |
408 | } /* method initVocoder */ |
409 | |
410 | |
411 | |
412 | /** |
413 | * HTS_MLSA_Vocoder: Synthesis of speech out of mel-cepstral coefficients. |
414 | * This procedure uses the parameters generated in pdf2par stored in: |
415 | * PStream mceppst: Mel-cepstral coefficients |
416 | * PStream strpst : Filter bank strengths for mixed excitation |
417 | * PStream magpst : Fourier magnitudes (NOTE: this is not used yet) |
418 | * PStream lf0pst : Log F0 |
419 | */ |
420 | #if 0 |
421 | AudioInputStream htsMLSAVocoder(HTSParameterGeneration pdf2par, HMMData htsData) |
422 | { |
423 | float sampleRate = 16000.0F; //8000,11025,16000,22050,44100 |
424 | int sampleSizeInBits = 16; //8,16 |
425 | int channels = 1; //1,2 |
426 | booleanint signed = true1; //true,false |
427 | booleanint bigEndian = false0; //true,false |
428 | AudioFormat af = new AudioFormat( |
429 | sampleRate, |
430 | sampleSizeInBits, |
431 | channels, |
432 | signed, |
433 | bigEndian); |
434 | double [] audio_double = NULL__null; |
435 | |
436 | audio_double = htsMLSAVocoder(pdf2par.getlf0Pst(), pdf2par.getMcepPst(), pdf2par.getStrPst(), pdf2par.getMagPst(), |
437 | pdf2par.getVoicedArray(), htsData); |
438 | |
439 | long lengthInSamples = (audio_double.length * 2 ) / (sampleSizeInBits/8); |
440 | logger.info("length in samples=" + lengthInSamples ); |
441 | |
442 | /* Normalise the signal before return, this will normalise between 1 and -1 */ |
443 | double MaxSample = MathUtils.getAbsMax(audio_double); |
444 | for (int i=0; i<audio_double.length; i++) |
445 | audio_double[i] = 0.3 * ( audio_double[i] / MaxSample ); |
446 | |
447 | DDSAudioInputStream oais = new DDSAudioInputStream(new BufferedDoubleDataSource(audio_double), af); |
448 | return oais; |
449 | |
450 | |
451 | } /* method htsMLSAVocoder() */ |
452 | #endif |
453 | |
/**
 * mlsafir: one step of the FIR section used by the MLSA filter.
 *
 * x    : input sample
 * b    : filter coefficients; b[2..m] are used
 * m    : order
 * a    : all-pass constant, aa is the companion constant supplied by the caller
 * d    : delay buffer; the section uses d[_pt3 .. _pt3+m+1]
 * _pt3 : offset of this section inside d
 *
 * Returns the sum of d[_pt3+i]*b[i] for i = 2..m after the delay line has
 * been updated, then shifts the delay line by one position.
 */
static double mlsafir(double x, double *b, int m, double a, double aa, double *d, int _pt3 )
{
    double *z = d + _pt3;   /* this section's delay line */
    double acc = 0.0;
    int k;

    z[0] = x;
    z[1] = aa * z[0] + (a * z[1]);

    /* update each tap and accumulate its contribution in the same pass */
    for (k = 2; k <= m; k++) {
        z[k] += a * (z[k + 1] - z[k - 1]);
        acc += z[k] * b[k];
    }

    /* shift the delay line towards higher indices */
    k = m + 1;
    while (k > 1) {
        z[k] = z[k - 1];
        k--;
    }

    return acc;
}
476 | |
477 | /** mlsdaf1: sub functions for MLSA filter */ |
478 | static double mlsadf1(double x, double *b, int m, double a, double aa, double *d) |
479 | { |
480 | double v; |
481 | double out = 0.0; |
482 | int i; |
483 | //pt1 --> pt = &d1[pd+1] |
484 | |
485 | for(i=PADEORDER; i>=1; i--) { |
486 | d[i] = aa * d[pt1+i-1] + a * d[i]; |
487 | d[pt1+i] = d[i] * b[1]; |
488 | v = d[pt1+i] * pade[ppade+i]; |
489 | |
490 | //x += (1 & i) ? v : -v; |
491 | if(i == 1 || i == 3 || i == 5) |
492 | x += v; |
493 | else |
494 | x += -v; |
495 | out += v; |
496 | } |
497 | d[pt1+0] = x; |
498 | out += x; |
499 | |
500 | return(out); |
501 | |
502 | } |
503 | |
504 | /** mlsdaf2: sub functions for MLSA filter */ |
505 | static double mlsadf2(double x, double *b, int m, double a, double aa, double *d) |
506 | { |
507 | double v; |
508 | double out = 0.0; |
509 | int i; |
510 | // pt2 --> pt = &d1[pd * (m+2)] |
511 | // pt3 --> pt = &d1[ 2*(pd+1) ] |
512 | |
513 | for(i=PADEORDER; i>=1; i--) { |
514 | d[pt2+i] = mlsafir(d[(pt2+i)-1], b, m, a, aa, d, pt3[i]); |
515 | v = d[pt2+i] * pade[ppade+i]; |
516 | |
517 | if(i == 1 || i == 3 || i == 5) |
518 | x += v; |
519 | else |
520 | x += -v; |
521 | out += v; |
522 | |
523 | } |
524 | d[pt2+0] = x; |
525 | out += x; |
526 | |
527 | return out; |
528 | } |
529 | |
/**
 * mlsadf: HTS Mel Log Spectrum Approximation filter.
 *
 * Runs the sample through mlsadf1 with order m and then through mlsadf2
 * with order m-1, sharing the delay buffer d.  Returns the filtered sample.
 */
static double mlsadf(double x, double *b, int m, double a, double aa, double *d)
{
    const double stage1 = mlsadf1(x, b, m, a, aa, d);

    return mlsadf2(stage1, b, m - 1, a, aa, d);
}
539 | |
540 | |
/**
 * uniformRand: return 1.0 or -1.0 depending on the next rand() value.
 *
 * The result is 1.0 when rand() is at least RAND_MAX/2.0, otherwise -1.0.
 */
static double uniformRand()
{
    const double draw = rand();   /* integer in 0..RAND_MAX, widened to double */

    return (draw >= RAND_MAX / 2.0) ? 1.0 : -1.0;
}
552 | |
/**
 * mc2b: transform mel-cepstrum to MLSA digital filter coefficients.
 *
 * mc : input mel-cepstrum, mc[0..m]
 * b  : output filter coefficients, b[0..m]
 * m  : order
 * a  : all-pass constant
 *
 * Computes b[m] = mc[m] and b[i] = mc[i] - a*b[i+1] going downwards.
 */
static void mc2b(double *mc, double *b, int m, double a )
{
    int i = m;

    b[i] = mc[i];
    while (--i >= 0) {
        b[i] = mc[i] - a * b[i + 1];
    }
}
562 | |
/**
 * b2mc: transform MLSA digital filter coefficients to mel-cepstrum.
 *
 * b  : input filter coefficients, b[0..m]
 * mc : output mel-cepstrum, mc[0..m]
 * m  : order
 * a  : all-pass constant
 *
 * Inverse of mc2b: mc[m] = b[m] and mc[i] = b[i] + a*b[i+1] for i < m.
 * The previous value of b[i+1] is carried in a local, so the recurrence
 * does not read back from mc.
 */
static void b2mc(double *b, double *mc, int m, double a)
{
    double prev;    /* b[i+1] as it was on input */
    double cur;
    int i;

    prev = b[m];
    mc[m] = prev;

    /* Start at m-1: the earlier loop started at m and overwrote mc[m]
       with b[m] + a*b[m]. */
    for (i = m - 1; i >= 0; i--) {
        cur = b[i];
        mc[i] = cur + (a * prev);
        prev = cur;
    }
}
575 | |
576 | |
577 | /** freqt: frequency transformation */ |
578 | //private void freqt(double c1[], int m1, int cepIndex, int m2, double a){ |
579 | static void freqt(double *c1, int m1, double *c2, int m2, double a) |
580 | { |
581 | double *freqt_buff=NULL__null; /* used in freqt */ |
582 | int freqt_size=0; /* buffer size for freqt */ |
583 | int i, j; |
584 | double b = 1 - a * a; |
585 | int g; /* offset of freqt_buff */ |
586 | |
587 | if(m2 > freqt_size) { |
588 | freqt_buff = walloc(double,m2 + m2 + 2)((double *)safe_walloc(sizeof(double)*(m2 + m2 + 2))); |
589 | freqt_size = m2; |
590 | } |
591 | g = freqt_size +1; |
592 | |
593 | for(i = 0; i < m2+1; i++) |
594 | freqt_buff[g+i] = 0.0; |
595 | |
596 | for(i = -m1; i <= 0; i++){ |
597 | if(0 <= m2 ) |
598 | freqt_buff[g+0] = c1[-i] + a * (freqt_buff[0] = freqt_buff[g+0]); |
599 | if(1 <= m2) |
600 | freqt_buff[g+1] = b * freqt_buff[0] + a * (freqt_buff[1] = freqt_buff[g+1]); |
601 | |
602 | for(j=2; j<=m2; j++) |
603 | freqt_buff[g+j] = freqt_buff[j-1] + a * ( (freqt_buff[j] = freqt_buff[g+j]) - freqt_buff[g+j-1]); |
604 | |
605 | } |
606 | |
607 | /* move memory */ |
608 | for(i=0; i<m2+1; i++) |
609 | c2[i] = freqt_buff[g+i]; |
610 | |
611 | if (freqt_buff) |
612 | wfree(freqt_buff); |
613 | |
614 | } |
615 | |
616 | /** c2ir: The minimum phase impulse response is evaluated from the minimum phase cepstrum */ |
617 | static void c2ir(double *c, int nc, double *hh, int leng ) |
618 | { |
619 | int n, k, upl; |
620 | double d; |
621 | |
622 | hh[0] = exp(c[0]); |
623 | for(n = 1; n < leng; n++) { |
624 | d = 0; |
625 | upl = (n >= nc) ? nc - 1 : n; |
626 | for(k = 1; k <= upl; k++ ) |
627 | d += k * c[k] * hh[n - k]; |
628 | hh[n] = d / n; |
629 | } |
630 | } |
631 | |
632 | /** b2en: functions for postfiltering */ |
633 | static double b2en(double *b, int m, double a) |
634 | { |
635 | double *spectrum2en_buff=NULL__null; /* used in spectrum2en */ |
636 | int spectrum2en_size=0; /* buffer size for spectrum2en */ |
637 | double en = 0.0; |
638 | int i; |
639 | double *cep, *ir; |
640 | |
641 | if(spectrum2en_size < m) { |
642 | spectrum2en_buff = walloc(double,(m+1) + 2 * IRLENG)((double *)safe_walloc(sizeof(double)*((m+1) + 2 * IRLENG))); |
643 | spectrum2en_size = m; |
644 | } |
645 | cep = walloc(double,(m+1) + 2 * IRLENG)((double *)safe_walloc(sizeof(double)*((m+1) + 2 * IRLENG))); /* CHECK! these sizes!!! */ |
646 | ir = walloc(double,(m+1) + 2 * IRLENG)((double *)safe_walloc(sizeof(double)*((m+1) + 2 * IRLENG))); |
647 | |
648 | b2mc(b, spectrum2en_buff, m, a); |
649 | /* freqt(vs->mc, m, vs->cep, vs->irleng - 1, -a);*/ |
650 | freqt(spectrum2en_buff, m, cep, IRLENG-1, -a); |
651 | /* HTS_c2ir(vs->cep, vs->irleng, vs->ir, vs->irleng); */ |
652 | c2ir(cep, IRLENG, ir, IRLENG); |
653 | en = 0.0; |
654 | |
655 | for(i = 0; i < IRLENG; i++) |
656 | en += ir[i] * ir[i]; |
657 | |
658 | if (spectrum2en_buff) |
659 | wfree(spectrum2en_buff); |
660 | wfree(cep); |
661 | wfree(ir); |
662 | |
663 | return(en); |
664 | } |
665 | |
666 | /** ignorm: inverse gain normalization */ |
667 | static void ignorm(double *c1, double *c2, int m, double ng) |
668 | { |
669 | double k; |
670 | int i; |
671 | if(ng != 0.0 ) { |
672 | k = pow(c1[0], ng); |
673 | for(i=m; i>=1; i--) |
674 | c2[i] = k * c1[i]; |
675 | c2[0] = (k - 1.0) / ng; |
676 | } else { |
677 | /* movem */ |
678 | for(i=1; i<m; i++) |
679 | c2[i] = c1[i]; |
680 | c2[0] = log(c1[0]); |
681 | } |
682 | } |
683 | |
684 | /** ignorm: gain normalization */ |
685 | static void gnorm(double *c1, double *c2, int m, double g) |
686 | { |
687 | double k; |
688 | int i; |
689 | if(g != 0.0) { |
690 | k = 1.0 + g * c1[0]; |
691 | for(; m>=1; m--) |
692 | c2[m] = c1[m] / k; |
693 | c2[0] = pow(k, 1.0 / g); |
694 | } else { |
695 | /* movem */ |
696 | for(i=1; i<=m; i++) |
697 | c2[i] = c1[i]; |
698 | c2[0] = exp(c1[0]); |
699 | } |
700 | |
701 | } |
702 | |
703 | /** lsp2lpc: transform LSP to LPC. lsp[1..m] --> a=lpc[0..m] a[0]=1.0 */ |
704 | static void lsp2lpc(double *lsp, double *a, int m) |
705 | { |
706 | double *lsp2lpc_buff=NULL__null; /* used in lsp2lpc */ |
707 | int lsp2lpc_size=0; /* buffer size of lsp2lpc */ |
708 | int i, k, mh1, mh2, flag_odd; |
709 | double xx, xf, xff; |
710 | int p, q; /* offsets of lsp2lpc_buff */ |
711 | int a0, a1, a2, b0, b1, b2; /* offsets of lsp2lpc_buff */ |
712 | |
713 | flag_odd = 0; |
714 | if(m % 2 == 0) |
715 | mh1 = mh2 = m / 2; |
716 | else { |
717 | mh1 = (m+1) / 2; |
718 | mh2 = (m-1) / 2; |
719 | flag_odd = 1; |
720 | } |
721 | |
722 | if(m > lsp2lpc_size){ |
723 | lsp2lpc_buff = walloc(double,5 * m + 6)((double *)safe_walloc(sizeof(double)*(5 * m + 6))); |
724 | lsp2lpc_size = m; |
725 | } |
726 | |
727 | /* offsets of lsp2lpcbuff */ |
728 | p = m; |
729 | q = p + mh1; |
730 | a0 = q + mh2; |
731 | a1 = a0 + (mh1 +1); |
732 | a2 = a1 + (mh1 +1); |
733 | b0 = a2 + (mh1 +1); |
734 | b1 = b0 + (mh2 +1); |
735 | b2 = b1 + (mh2 +1); |
736 | |
737 | /* move lsp -> lsp2lpc_buff */ |
738 | for(i=0; i<m; i++) |
739 | lsp2lpc_buff[i] = lsp[i+1]; |
740 | |
741 | for (i = 0; i < mh1 + 1; i++) |
742 | lsp2lpc_buff[a0 + i] = 0.0; |
743 | for (i = 0; i < mh1 + 1; i++) |
744 | lsp2lpc_buff[a1 + i] = 0.0; |
745 | for (i = 0; i < mh1 + 1; i++) |
746 | lsp2lpc_buff[a2 + i] = 0.0; |
747 | for (i = 0; i < mh2 + 1; i++) |
748 | lsp2lpc_buff[b0 + i] = 0.0; |
749 | for (i = 0; i < mh2 + 1; i++) |
750 | lsp2lpc_buff[b1 + i] = 0.0; |
751 | for (i = 0; i < mh2 + 1; i++) |
752 | lsp2lpc_buff[b2 + i] = 0.0; |
753 | |
754 | /* lsp filter parameters */ |
755 | for (i = k = 0; i < mh1; i++, k += 2) |
756 | lsp2lpc_buff[p + i] = -2.0 * cos(lsp2lpc_buff[k]); |
757 | for (i = k = 0; i < mh2; i++, k += 2) |
758 | lsp2lpc_buff[q + i] = -2.0 * cos(lsp2lpc_buff[k + 1]); |
759 | |
760 | /* impulse response of analysis filter */ |
761 | xx = 1.0; |
762 | xf = xff = 0.0; |
763 | |
764 | for (k = 0; k <= m; k++) { |
765 | if (flag_odd == 1) { |
766 | lsp2lpc_buff[a0 + 0] = xx; |
767 | lsp2lpc_buff[b0 + 0] = xx - xff; |
768 | xff = xf; |
769 | xf = xx; |
770 | } else { |
771 | lsp2lpc_buff[a0 + 0] = xx + xf; |
772 | lsp2lpc_buff[b0 + 0] = xx - xf; |
773 | xf = xx; |
774 | } |
775 | |
776 | for (i = 0; i < mh1; i++) { |
777 | lsp2lpc_buff[a0 + i + 1] = lsp2lpc_buff[a0 + i] + lsp2lpc_buff[p + i] * lsp2lpc_buff[a1 + i] + lsp2lpc_buff[a2 + i]; |
778 | lsp2lpc_buff[a2 + i] = lsp2lpc_buff[a1 + i]; |
779 | lsp2lpc_buff[a1 + i] = lsp2lpc_buff[a0 + i]; |
780 | } |
781 | |
782 | for (i = 0; i < mh2; i++) { |
783 | lsp2lpc_buff[b0 + i + 1] = lsp2lpc_buff[b0 + i] + lsp2lpc_buff[q + i] * lsp2lpc_buff[b1 + i] + lsp2lpc_buff[b2 + i]; |
784 | lsp2lpc_buff[b2 + i] = lsp2lpc_buff[b1 + i]; |
785 | lsp2lpc_buff[b1 + i] = lsp2lpc_buff[b0 + i]; |
786 | } |
787 | |
788 | if (k != 0) |
789 | a[k - 1] = -0.5 * (lsp2lpc_buff[a0 + mh1] + lsp2lpc_buff[b0 + mh2]); |
790 | xx = 0.0; |
791 | } |
792 | |
793 | for (i = m - 1; i >= 0; i--) |
794 | a[i + 1] = -a[i]; |
795 | a[0] = 1.0; |
796 | |
797 | if (lsp2lpc_buff) |
798 | wfree(lsp2lpc_buff); |
799 | } |
800 | |
801 | /** gc2gc: generalized cepstral transformation */ |
802 | static void gc2gc(double *c1, int m1, double g1, double *c2, int m2, double g2) |
803 | { |
804 | double *gc2gc_buff=NULL__null; /* used in gc2gc */ |
805 | int gc2gc_size=0; /* buffer size for gc2gc */ |
806 | int i, min, k, mk; |
807 | double ss1, ss2, cc; |
808 | |
809 | if( m1 > gc2gc_size ) { |
810 | gc2gc_buff = walloc(double,m1 + 1)((double *)safe_walloc(sizeof(double)*(m1 + 1))); /* check if these buffers should be created all the time */ |
811 | gc2gc_size = m1; |
812 | } |
813 | |
814 | /* movem*/ |
815 | for(i=0; i<(m1+1); i++) |
816 | gc2gc_buff[i] = c1[i]; |
817 | |
818 | c2[0] = gc2gc_buff[0]; |
819 | |
820 | for( i=1; i<=m2; i++){ |
821 | ss1 = ss2 = 0.0; |
822 | min = m1 < i ? m1 : i - 1; |
823 | for(k=1; k<=min; k++){ |
824 | mk = i - k; |
825 | cc = gc2gc_buff[k] * c2[mk]; |
826 | ss2 += k * cc; |
827 | ss1 += mk * cc; |
828 | } |
829 | |
830 | if(i <= m1) |
831 | c2[i] = gc2gc_buff[i] + (g2 * ss2 - g1 * ss1) / i; |
832 | else |
833 | c2[i] = (g2 * ss2 - g1 * ss1) / i; |
834 | } |
835 | |
836 | if (gc2gc_buff) |
837 | wfree(gc2gc_buff); |
838 | } |
839 | |
/**
 * mgc2mgc: frequency and generalized cepstral transformation.
 *
 * c1, m1, a1, g1 : input coefficients, their order, alpha and gamma
 * c2, m2, a2, g2 : output coefficients, their order, alpha and gamma
 *
 * When the two alphas are equal only the gamma conversion is done
 * (gnorm on c1 in place, gc2gc into c2).  Otherwise c1 is first warped
 * into c2 with freqt and the gamma conversion is done on c2 in place.
 * Both paths finish with ignorm on c2.
 */
static void mgc2mgc(double *c1, int m1, double a1, double g1, double *c2, int m2, double a2, double g2)
{
    if (a1 != a2) {
        const double warp = (a2 - a1) / (1 - a1 * a2);

        freqt(c1, m1, c2, m2, warp);
        gnorm(c2, c2, m2, g1);
        gc2gc(c2, m2, g1, c2, m2, g2);
    } else {
        gnorm(c1, c1, m1, g1);
        gc2gc(c1, m1, g1, c2, m2, g2);
    }

    ignorm(c2, c2, m2, g2);
}
858 | |
859 | /** lsp2mgc: transform LSP to MGC. lsp=C[0..m] mgc=C[0..m] */ |
860 | static void lsp2mgc(double *lsp, double *mgc, int m, double alpha) |
861 | { |
862 | int i; |
863 | /* lsp2lpc */ |
864 | lsp2lpc(lsp, mgc, m); /* lsp starts in 1! lsp[1..m] --> mgc[0..m] */ |
865 | if(use_log_gain) |
866 | mgc[0] = exp(lsp[0]); |
867 | else |
868 | mgc[0] = lsp[0]; |
869 | |
870 | /* mgc2mgc*/ |
871 | if(NORMFLG1) |
872 | ignorm(mgc, mgc, m, xgamma); |
873 | else if(MULGFLG1) |
874 | mgc[0] = (1.0 - mgc[0]) * stage; |
875 | |
876 | if(MULGFLG1) |
877 | for(i=m; i>=1; i--) |
878 | mgc[i] *= -stage; |
879 | |
880 | mgc2mgc(mgc, m, alpha, xgamma, mgc, m, alpha, xgamma); /* input and output is in mgc=C */ |
881 | |
882 | if(NORMFLG2) |
883 | gnorm(mgc, mgc, m, xgamma); |
884 | else if(MULGFLG2) |
885 | mgc[0] = mgc[0] * xgamma + 1.0; |
886 | |
887 | if(MULGFLG2) |
888 | for(i=m; i>=1; i--) |
889 | mgc[i] *= xgamma; |
890 | |
891 | } |
892 | |
/**
 * mglsadff: one stage of the MGLSA filter (helper of mglsadf).
 *
 * x        input sample
 * b        filter coefficients, b[1..m] are read
 * m        filter order
 * a        all-pass constant
 * d        delay memory; this stage uses d[d_offset .. d_offset+m]
 * d_offset index of the first delay element belonging to this stage
 *
 * Returns the filtered sample and updates the stage's delay memory.
 */
static double mglsadff(double x, double *b, int m, double a, double *d, int d_offset)
{
  double *state = d + d_offset;     /* delay line of this stage */
  double acc = state[0] * b[1];
  int k;

  for (k = 1; k < m; k++) {
    state[k] += a * (state[k + 1] - state[k - 1]);
    acc += state[k] * b[k + 1];
  }
  x -= acc;

  /* shift the delay line by one position, then store the new head */
  for (k = m; k > 0; k--)
    state[k] = state[k - 1];
  state[0] = a * state[0] + (1 - a * a) * x;

  return x;
}
912 | |
913 | static double mglsadf(double x, double *b, int m, double a, int n, double *d) |
914 | { |
915 | int i; |
916 | for(i=0; i<n; i++) |
917 | x = mglsadff(x, b, m, a, d, (i*(m+1))); |
918 | |
919 | return x; |
920 | } |
921 | |
922 | /** posfilter: postfilter for mel-cepstrum. It uses alpha and beta defined in HMMData */ |
923 | static void postfilter_mcp(double *mcp, int m, double alpha, double beta) |
924 | { |
925 | double *postfilter_buff=NULL__null; /* used in postfiltering */ |
926 | int postfilter_size = 0; /* buffer size for postfiltering */ |
927 | |
928 | double e1, e2; |
929 | int k; |
930 | |
931 | if(beta > 0.0 && m > 1){ |
932 | if(postfilter_size < m){ |
933 | postfilter_buff = walloc(double,m+1)((double *)safe_walloc(sizeof(double)*(m+1))); |
934 | postfilter_size = m; |
935 | } |
936 | mc2b(mcp, postfilter_buff, m, alpha); |
937 | e1 = b2en(postfilter_buff, m, alpha); |
938 | |
939 | postfilter_buff[1] -= beta * alpha * mcp[2]; |
940 | for(k = 2; k < m; k++) |
941 | postfilter_buff[k] *= (1.0 +beta); |
942 | e2 = b2en(postfilter_buff, m, alpha); |
943 | postfilter_buff[0] += log(e1/e2) / 2; |
944 | b2mc(postfilter_buff, mcp, m, alpha); |
945 | |
946 | } |
947 | |
948 | if (postfilter_buff) |
949 | wfree(postfilter_buff); |
950 | |
951 | } |
952 | |
/**
 * modShift: wrap the index n into the range [0, N) by repeatedly adding or
 * subtracting N.  Intended for N > 0.
 */
static int modShift(int n, int N)
{
  /* at most one of the two loops does any work for a given n */
  while (n < 0)
    n += N;
  while (n >= N)
    n -= N;

  return n;
}
963 | |
964 | /** Generate one pitch period from Fourier magnitudes */ |
965 | static double *genPulseFromFourierMag(EST_Track *mag, int n, double f0, booleanint aperiodicFlag) |
966 | { |
967 | |
968 | int numHarm = mag->num_channels(); |
969 | int i; |
970 | int currentF0 = (int)round(f0); |
971 | int T, T2; |
972 | double *pulse = NULL__null; |
973 | |
974 | if(currentF0 < 512) |
975 | T = 512; |
976 | else |
977 | T = 1024; |
978 | T2 = 2*T; |
979 | |
980 | /* since is FFT2 no aperiodicFlag or jitter of 25% is applied */ |
981 | |
982 | /* get the pulse */ |
983 | pulse = walloc(double,T)((double *)safe_walloc(sizeof(double)*(T))); |
984 | EST_FVector real(T2); |
985 | EST_FVector imag(T2); |
986 | |
987 | /* copy Fourier magnitudes (Wai C. Chu "Speech Coding algorithms foundation and evolution of standardized coders" pg. 460) */ |
988 | real[0] = real[T] = 0.0; /* DC component set to zero */ |
989 | for(i=1; i<=numHarm; i++){ |
990 | real[i] = real[T-i] = real[T+i] = real[T2-i] = mag->a(n, i-1); /* Symetric extension */ |
991 | imag[i] = imag[T-i] = imag[T+i] = imag[T2-i] = 0.0; |
992 | } |
993 | for(i=(numHarm+1); i<(T-numHarm); i++){ /* Default components set to 1.0 */ |
994 | real[i] = real[T-i] = real[T+i] = real[T2-i] = 1.0; |
995 | imag[i] = imag[T-i] = imag[T+i] = imag[T2-i] = 0.0; |
996 | } |
997 | |
998 | /* Calculate inverse Fourier transform */ |
999 | IFFT(real, imag); |
1000 | |
1001 | /* circular shift and normalise multiplying by sqrt(F0) */ |
1002 | double sqrt_f0 = sqrt((float)currentF0); |
1003 | for(i=0; i<T; i++) |
1004 | pulse[i] = real[modShift(i-numHarm,T)] * sqrt_f0; |
1005 | |
1006 | return pulse; |
1007 | |
1008 | } |
1009 | |
1010 | int htsMLSAVocoder(EST_Track *lf0Pst, |
1011 | EST_Track *mcepPst, |
1012 | EST_Track *strPst, |
1013 | EST_Track *magPst, |
1014 | int *voiced, |
1015 | HTSData *htsData, |
1016 | EST_Wave *wave) |
1017 | { |
1018 | |
1019 | double inc, x; |
1020 | double xp=0.0,xn=0.0,fxp,fxn,mix; /* samples for pulse and for noise and the filtered ones */ |
1021 | int i, j, k, m, s, mcepframe, lf0frame, s_double; |
1022 | double alpha = htsData->alpha; |
1023 | double beta = htsData->beta; |
1024 | double aa = 1-alpha*alpha; |
1025 | int audio_size; /* audio size in samples, calculated as num frames * frame period */ |
1026 | double *audio_double = NULL__null; |
1027 | double *magPulse = NULL__null; /* pulse generated from Fourier magnitudes */ |
1028 | int magSample, magPulseSize; |
1029 | booleanint aperiodicFlag = false0; |
1030 | |
1031 | double *d; /* used in the lpc vocoder */ |
1032 | |
1033 | double f0, f0Std, f0Shift, f0MeanOri; |
1034 | double *mc = NULL__null; /* feature vector for a particular frame */ |
1035 | double *hp = NULL__null; /* pulse shaping filter, initialised once it is known orderM */ |
1036 | double *hn = NULL__null; /* noise shaping filter, initialised once it is known orderM */ |
1037 | |
1038 | /* Initialise vocoder and mixed excitation, once initialised it is known the order |
1039 | * of the filters so the shaping filters hp and hn can be initialised. */ |
1040 | m = mcepPst->num_channels(); |
1041 | mc = walloc(double,m)((double *)safe_walloc(sizeof(double)*(m))); |
1042 | |
1043 | initVocoder(m-1, mcepPst->num_frames(), htsData); |
1044 | |
1045 | d = walloc(double,m)((double *)safe_walloc(sizeof(double)*(m))); |
1046 | if (lpcVocoder) |
1047 | { |
1048 | /* printf("Using LPC vocoder\n"); */ |
1049 | for(i=0; i<m; i++) |
1050 | d[i] = 0.0; |
1051 | } |
1052 | mixedExcitation = htsData->useMixExc; |
1053 | fourierMagnitudes = htsData->useFourierMag; |
1054 | |
1055 | if ( mixedExcitation ) |
1056 | { |
1057 | numM = htsData->NumFilters; |
1058 | orderM = htsData->OrderFilters; |
1059 | |
1060 | xpulseSignal = walloc(double,orderM)((double *)safe_walloc(sizeof(double)*(orderM))); |
1061 | xnoiseSignal = walloc(double,orderM)((double *)safe_walloc(sizeof(double)*(orderM))); |
1062 | /* initialise xp_sig and xn_sig */ |
1063 | for(i=0; i<orderM; i++) |
1064 | xpulseSignal[i] = xnoiseSignal[i] = 0; |
1065 | |
1066 | h = htsData->MixFilters; |
1067 | hp = walloc(double,orderM)((double *)safe_walloc(sizeof(double)*(orderM))); |
1068 | hn = walloc(double,orderM)((double *)safe_walloc(sizeof(double)*(orderM))); |
1069 | |
1070 | //Check if the number of filters is equal to the order of strpst |
1071 | //i.e. the number of filters is equal to the number of generated strengths per frame. |
1072 | #if 0 |
1073 | if(numM != strPst->num_channels()) { |
1074 | printf("htsMLSAVocoder: error num mix-excitation filters = %d " |
1075 | " in configuration file is different from generated str order= %d\n", |
1076 | numM, strPst->num_channels()); |
1077 | } |
1078 | printf("HMM speech generation with mixed-excitation.\n"); |
1079 | #endif |
1080 | } |
1081 | #if 0 |
1082 | else |
1083 | printf("HMM speech generation without mixed-excitation.\n"); |
1084 | |
1085 | if( fourierMagnitudes && htsData->PdfMagFile != NULL__null) |
1086 | printf("Pulse generated with Fourier Magnitudes.\n"); |
1087 | else |
1088 | printf("Pulse generated as a unit pulse.\n"); |
1089 | |
1090 | if(beta != 0.0) |
1091 | printf("Postfiltering applied with beta=%f",(float)beta); |
1092 | else |
1093 | printf("No postfiltering applied.\n"); |
1094 | #endif |
1095 | |
1096 | /* Clear content of c, should be done if this function is |
1097 | called more than once with a new set of generated parameters. */ |
1098 | for(i=0; i< C_length; i++) |
1099 | C[i] = CC[i] = CINC[i] = 0.0; |
1100 | for(i=0; i< D1_length; i++) |
1101 | D1[i]=0.0; |
1102 | |
1103 | f0Std = htsData->F0Std; |
1104 | f0Shift = htsData->F0Mean; |
1105 | f0MeanOri = 0.0; |
1106 | |
1107 | /* XXX */ |
1108 | for (mcepframe=0,lf0frame=0; mcepframe<mcepPst->num_frames(); mcepframe++) |
1109 | { |
1110 | if(voiced[mcepframe]) |
1111 | { /* WAS WRONG */ |
1112 | f0MeanOri = f0MeanOri + lf0Pst->a(mcepframe, 0); |
1113 | lf0frame++; |
1114 | } |
1115 | } |
1116 | f0MeanOri = f0MeanOri/lf0frame; |
1117 | |
1118 | /* ____________________Synthesize speech waveforms_____________________ */ |
1119 | /* generate Nperiod samples per mcepframe */ |
1120 | s = 0; /* number of samples */ |
Value stored to 's' is never read | |
1121 | s_double = 0; |
1122 | audio_size = mcepPst->num_frames() * (fprd); |
1123 | audio_double = walloc(double,audio_size)((double *)safe_walloc(sizeof(double)*(audio_size))); /* initialise buffer for audio */ |
1124 | magSample = 1; |
1125 | magPulseSize = 0; |
1126 | |
1127 | for(mcepframe=0,lf0frame=0; mcepframe<mcepPst->num_frames(); mcepframe++) |
1128 | { |
1129 | /* get current feature vector mcp */ |
1130 | for(i=0; i<m; i++) |
1131 | mc[i] = mcepPst->a(mcepframe, i); |
1132 | |
1133 | /* f0 modification through the MARY audio effects */ |
1134 | if(voiced[mcepframe]){ |
1135 | f0 = f0Std * lf0Pst->a(mcepframe, 0) + (1-f0Std) * f0MeanOri + f0Shift; |
1136 | lf0frame++; |
1137 | if(f0 < 0.0) |
1138 | f0 = 0.0; |
1139 | } |
1140 | else{ |
1141 | f0 = 0.0; |
1142 | } |
1143 | |
1144 | /* if mixed excitation get shaping filters for this frame */ |
1145 | if (mixedExcitation) |
1146 | { |
1147 | for(j=0; j<orderM; j++) |
1148 | { |
1149 | hp[j] = hn[j] = 0.0; |
1150 | for(i=0; i<numM; i++) |
1151 | { |
1152 | hp[j] += strPst->a(mcepframe, i) * h[i][j]; |
1153 | hn[j] += ( 1 - strPst->a(mcepframe, i) ) * h[i][j]; |
1154 | } |
1155 | } |
1156 | } |
1157 | |
1158 | /* f0->pitch, in original code here it is used p, so f0=p in the c code */ |
1159 | if(f0 != 0.0) |
1160 | f0 = rate/f0; |
1161 | |
1162 | /* p1 is initialised in -1, so this will be done just for the first frame */ |
1163 | if( p1 < 0 ) { |
1164 | p1 = f0; |
1165 | pc = p1; |
1166 | /* for LSP */ |
1167 | if(stage != 0){ |
1168 | if( use_log_gain) |
1169 | C[0] = LZERO; |
1170 | else |
1171 | C[0] = ZERO; |
1172 | for(i=0; i<m; i++ ) |
1173 | C[i] = i * PI3.14159265358979323846 / m; |
1174 | /* LSP -> MGC */ |
1175 | lsp2mgc(C, C, (m-1), alpha); |
1176 | mc2b(C, C, (m-1), alpha); |
1177 | gnorm(C, C, (m-1), xgamma); |
1178 | for(i=1; i<m; i++) |
1179 | C[i] *= xgamma; |
1180 | } |
1181 | |
1182 | } |
1183 | |
1184 | if(stage == 0){ |
1185 | /* postfiltering, this is done if beta>0.0 */ |
1186 | postfilter_mcp(mc, (m-1), alpha, beta); |
1187 | /* mc2b: transform mel-cepstrum to MLSA digital filter coefficients */ |
1188 | mc2b(mc, CC, (m-1), alpha); |
1189 | for(i=0; i<m; i++) |
1190 | CINC[i] = (CC[i] - C[i]) * iprd / fprd; |
1191 | } else { |
1192 | |
1193 | lsp2mgc(mc, CC, (m-1), alpha ); |
1194 | |
1195 | mc2b(CC, CC, (m-1), alpha); |
1196 | |
1197 | gnorm(CC, CC, (m-1), xgamma); |
1198 | |
1199 | for(i=1; i<m; i++) |
1200 | CC[i] *= xgamma; |
1201 | |
1202 | for(i=0; i<m; i++) |
1203 | CINC[i] = (CC[i] - C[i]) * iprd / fprd; |
1204 | |
1205 | } |
1206 | |
1207 | /* p=f0 in c code!!! */ |
1208 | if( p1 != 0.0 && f0 != 0.0 ) { |
1209 | inc = (f0 - p1) * (double)iprd/(double)fprd; |
1210 | //System.out.println(" inc=(f0-p1)/80=" + inc ); |
1211 | } else { |
1212 | inc = 0.0; |
1213 | pc = f0; |
1214 | p1 = 0.0; |
1215 | } |
1216 | |
1217 | /* Here need to generate both xp:pulse and xn:noise signals seprately*/ |
1218 | gauss = false0; /* Mixed excitation works better with nomal noise */ |
1219 | |
1220 | /* Generate fperiod samples per feature vector, normally 80 samples per frame */ |
1221 | //p1=0.0; |
1222 | gauss=false0; |
1223 | for(j=fprd-1, i=(iprd+1)/2; j>=0; j--) { |
1224 | if(p1 == 0.0) { |
1225 | if(gauss) |
1226 | x = 0 /* rand.nextGaussian() */; /* XXX returns double, gaussian distribution mean=0.0 and var=1.0 */ |
1227 | else |
1228 | x = uniformRand(); /* returns 1.0 or -1.0 uniformly distributed */ |
1229 | |
1230 | if(mixedExcitation) { |
1231 | xn = x; |
1232 | xp = 0.0; |
1233 | } |
1234 | } else { |
1235 | if( (pc += 1.0) >= p1 ){ |
1236 | if(fourierMagnitudes){ |
1237 | /* jitter is applied just in voiced frames when the stregth of the first band is < 0.5*/ |
1238 | /* this will work just if Radix FFT is used */ |
1239 | /*if(strPst.getPar(mcepframe, 0) < 0.5) |
1240 | aperiodicFlag = true; |
1241 | else |
1242 | aperiodicFlag = false; |
1243 | magPulse = genPulseFromFourierMagRadix(magPst, mcepframe, p1, aperiodicFlag); |
1244 | */ |
1245 | |
1246 | magPulse = genPulseFromFourierMag(magPst, mcepframe, p1, aperiodicFlag); |
1247 | magSample = 0; |
1248 | magPulseSize = -27 /* magPulse.length*/; /** XXX **/ |
1249 | x = magPulse[magSample]; |
1250 | magSample++; |
1251 | } else |
1252 | x = sqrt(p1); |
1253 | |
1254 | pc = pc - p1; |
1255 | } else { |
1256 | |
1257 | if(fourierMagnitudes){ |
1258 | if(magSample >= magPulseSize ){ |
1259 | x = 0.0; |
1260 | } |
1261 | else |
1262 | x = magPulse[magSample]; |
1263 | magSample++; |
1264 | } else |
1265 | x = 0.0; |
1266 | } |
1267 | |
1268 | if(mixedExcitation) { |
1269 | xp = x; |
1270 | if(gauss) |
1271 | xn = 0 /* rand.nextGaussian() */ ; /* XXX */ |
1272 | else |
1273 | xn = uniformRand(); |
1274 | } |
1275 | } |
1276 | |
1277 | /* apply the shaping filters to the pulse and noise samples */ |
1278 | /* i need memory of at least for M samples in both signals */ |
1279 | if(mixedExcitation) { |
1280 | fxp = 0.0; |
1281 | fxn = 0.0; |
1282 | for(k=orderM-1; k>0; k--) { |
1283 | fxp += hp[k] * xpulseSignal[k]; |
1284 | fxn += hn[k] * xnoiseSignal[k]; |
1285 | xpulseSignal[k] = xpulseSignal[k-1]; |
1286 | xnoiseSignal[k] = xnoiseSignal[k-1]; |
1287 | } |
1288 | fxp += hp[0] * xp; |
1289 | fxn += hn[0] * xn; |
1290 | xpulseSignal[0] = xp; |
1291 | xnoiseSignal[0] = xn; |
1292 | |
1293 | /* x is a pulse noise excitation and mix is mixed excitation */ |
1294 | mix = fxp+fxn; |
1295 | |
1296 | /* comment this line if no mixed excitation, just pulse and noise */ |
1297 | x = mix; /* excitation sample */ |
1298 | /* printf("awb_debug me %d %f\n",(int)(s_double),(float)x); */ |
1299 | } |
1300 | |
1301 | if(lpcVocoder){ |
1302 | // LPC filter C[k=0] = gain is not used! |
1303 | if(!NGAIN) |
1304 | x *= C[0]; |
1305 | for(k=(m-1); k>1; k--){ |
1306 | x = x - (C[k] * d[k]); |
1307 | d[k] = d[k-1]; |
1308 | } |
1309 | x = x - (C[1] * d[1]); |
1310 | d[1] = x; |
1311 | |
1312 | } else if(stage == 0 ){ |
1313 | if(x != 0.0 ) |
1314 | x *= exp(C[0]); |
1315 | x = mlsadf(x, C, m, alpha, aa, D1); |
1316 | |
1317 | } else { |
1318 | if(!NGAIN) |
1319 | x *= C[0]; |
1320 | x = mglsadf(x, C, (m-1), alpha, stage, D1); |
1321 | } |
1322 | |
1323 | audio_double[s_double] = x; |
1324 | s_double++; |
1325 | |
1326 | if((--i) == 0 ) { |
1327 | p1 += inc; |
1328 | for(k=0; k<m; k++){ |
1329 | C[k] += CINC[k]; |
1330 | } |
1331 | i = iprd; |
1332 | } |
1333 | } /* for each sample in a period fprd */ |
1334 | |
1335 | p1 = f0; |
1336 | |
1337 | /* move elements in c */ |
1338 | /* HTS_movem(v->cc, v->c, m + 1); */ |
1339 | for(i=0; i<m; i++){ |
1340 | C[i] = CC[i]; |
1341 | } |
1342 | |
1343 | } /* for each mcep frame */ |
1344 | |
1345 | /* printf("Finish processing %d mcep frames.\n",mcepframe); */ |
1346 | |
1347 | wave->resize(audio_size,1); |
1348 | for (i=0; i<s_double; i++) |
1349 | wave->a(i) = (short)audio_double[i]; |
1350 | |
1351 | return 0; |
1352 | |
1353 | } /* method htsMLSAVocoder() */ |
1354 | |
1355 |