File: | modules/clustergen/me_mlsa.cc |
Location: | line 818, column 13 |
Description: | Array access (from variable 'gc2gc_buff') results in a null pointer dereference |
1 | /** | |||
2 | * The HMM-Based Speech Synthesis System (HTS) | |||
3 | * HTS Working Group | |||
4 | * | |||
5 | * Department of Computer Science | |||
6 | * Nagoya Institute of Technology | |||
7 | * and | |||
8 | * Interdisciplinary Graduate School of Science and Engineering | |||
9 | * Tokyo Institute of Technology | |||
10 | * | |||
11 | * Portions Copyright (c) 2001-2006 | |||
12 | * All Rights Reserved. | |||
13 | * | |||
14 | * Portions Copyright 2000-2007 DFKI GmbH. | |||
15 | * All Rights Reserved. | |||
16 | * | |||
17 | * Permission is hereby granted, free of charge, to use and | |||
18 | * distribute this software and its documentation without | |||
19 | * restriction, including without limitation the rights to use, | |||
20 | * copy, modify, merge, publish, distribute, sublicense, and/or | |||
21 | * sell copies of this work, and to permit persons to whom this | |||
22 | * work is furnished to do so, subject to the following conditions: | |||
23 | * | |||
24 | * 1. The source code must retain the above copyright notice, | |||
25 | * this list of conditions and the following disclaimer. | |||
26 | * | |||
27 | * 2. Any modifications to the source code must be clearly | |||
28 | * marked as such. | |||
29 | * | |||
30 | * 3. Redistributions in binary form must reproduce the above | |||
31 | * copyright notice, this list of conditions and the | |||
32 | * following disclaimer in the documentation and/or other | |||
33 | * materials provided with the distribution. Otherwise, one | |||
34 | * must contact the HTS working group. | |||
35 | * | |||
36 | * NAGOYA INSTITUTE OF TECHNOLOGY, TOKYO INSTITUTE OF TECHNOLOGY, | |||
37 | * HTS WORKING GROUP, AND THE CONTRIBUTORS TO THIS WORK DISCLAIM | |||
38 | * ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL | |||
39 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT | |||
40 | * SHALL NAGOYA INSTITUTE OF TECHNOLOGY, TOKYO INSTITUTE OF | |||
41 | * TECHNOLOGY, HTS WORKING GROUP, NOR THE CONTRIBUTORS BE LIABLE | |||
42 | * FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY | |||
43 | * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, | |||
44 | * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTUOUS | |||
45 | * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR | |||
46 | * PERFORMANCE OF THIS SOFTWARE. | |||
47 | * | |||
48 | * | |||
49 | * This software was translated to C for use within Festival to offer | |||
50 | * multi-excitation MLSA | |||
51 | * Alan W Black (awb@cs.cmu.edu) 3rd April 2009 | |||
52 | * | |||
53 | */ | |||
54 | ||||
55 | #include <stdio.h> | |||
56 | #include <stdlib.h> | |||
57 | #include <string.h> | |||
58 | #include <math.h> | |||
59 | #include <EST_walloc.h> | |||
60 | #include "festival.h" | |||
61 | ||||
62 | #include "mlsa_resynthesis.h" | |||
63 | ||||
64 | /** | |||
65 | * Synthesis of speech out of speech parameters. | |||
66 | * Mixed excitation MLSA vocoder. | |||
67 | * | |||
68 | * Java port and extension of HTS engine version 2.0 | |||
69 | * Extension: mixed excitation | |||
70 | * @author Marcela Charfuelan | |||
71 | * And ported to C by Alan W Black (awb@cs.cmu.edu) | |||
72 | */ | |||
73 | ||||
74 | #define booleanint int | |||
75 | #define true1 1 | |||
76 | #define false0 0 | |||
77 | ||||
/**
 * Configuration block passed by pointer to initVocoder() and
 * htsMLSAVocoder().  me_mlsa_resynthesis() builds one on the stack for
 * every call and fills it in field by field.
 */
78 | typedef struct HTSData_struct { | |||
79 | ||||
/* rate and fperiod are copied by initVocoder() into the file-level
   variables rate and fprd; me_mlsa_resynthesis() sets them to 16000 and 80. */
80 | int rate; | |||
81 | int fperiod; | |||
82 | double rhos; | |||
83 | ||||
/* stage selects the branch taken in initVocoder(): 0 takes the branch
   marked "for MCP" (gamma = 0), anything else the branch marked "for LSP"
   (gamma = -1/stage). */
84 | int stage; | |||
/* alpha and beta are read from the Scheme variables mlsa_alpha_param and
   mlsa_beta_param in me_mlsa_resynthesis(). */
85 | double alpha; | |||
86 | double beta; | |||
/* NOTE(review): initVocoder() reads LogGain (declared further down), not
   useLogGain; no visible code reads this field. */
87 | booleanint useLogGain; | |||
88 | double uf; | |||
89 | booleanint algnst; /* use state level alignment for duration */ | |||
90 | booleanint algnph; /* use phoneme level alignment for duration */ | |||
91 | booleanint useMixExc; /* use Mixed Excitation */ | |||
92 | booleanint useFourierMag; /* use Fourier magnitudes for pulse generation */ | |||
93 | booleanint useGV; /* use global variance in parameter generation */ | |||
94 | booleanint useGmmGV; /* use global variance as a Gaussian Mixture Model */ | |||
95 | booleanint useUnitDurationContinuousFeature; /* for using external duration, so it will not be generated from HMMs*/ | |||
96 | booleanint useUnitLogF0ContinuousFeature; /* for using external f0, so it will not be generated from HMMs*/ | |||
97 | ||||
98 | /** variables for controling generation of speech in the vocoder | |||
99 | * these variables have default values but can be fixed and read from the | |||
100 | * audio effects component. [Default][min--max] */ | |||
101 | double length; /* total number of frame for generated speech */ | |||
102 | /* length of generated speech (in seconds) [N/A][0.0--30.0] */ | |||
103 | double durationScale; /* less than 1.0 is faster and more than 1.0 is slower, min=0.1 max=3.0 */ | |||
104 | ||||
/* Copied into the file-level use_log_gain flag by initVocoder(). */
105 | booleanint LogGain; | |||
106 | char *PdfStrFile, *PdfMagFile; | |||
107 | ||||
/* MixFilters is an array of NumFilters rows, each OrderFilters doubles
   long, allocated and filled from the Scheme list me_mix_filters in
   me_mlsa_resynthesis(). */
108 | int NumFilters, OrderFilters; | |||
109 | double **MixFilters; | |||
/* me_mlsa_resynthesis() sets F0Std to 1.0 and F0Mean to 0.0. */
110 | double F0Std; | |||
111 | double F0Mean; | |||
112 | ||||
113 | } HTSData; | |||
114 | ||||
115 | #if 0 | |||
116 | typedef struct HTSData_struct { | |||
117 | ||||
118 | int rate = 16000; | |||
119 | int fperiod = 80; | |||
120 | double rhos = 0.0; | |||
121 | ||||
122 | int stage = 0; | |||
123 | double alpha = 0.42; | |||
124 | booleanint useLogGain = false0; | |||
125 | double uf = 0.5; | |||
126 | booleanint algnst = false0; /* use state level alignment for duration */ | |||
127 | booleanint algnph = false0; /* use phoneme level alignment for duration */ | |||
128 | booleanint useMixExc = true1; /* use Mixed Excitation */ | |||
129 | booleanint useFourierMag = false0; /* use Fourier magnitudes for pulse generation */ | |||
130 | booleanint useGV = false0; /* use global variance in parameter generation */ | |||
131 | booleanint useGmmGV = false0; /* use global variance as a Gaussian Mixture Model */ | |||
132 | booleanint useUnitDurationContinuousFeature = false0; /* for using external duration, so it will not be generated from HMMs*/ | |||
133 | booleanint useUnitLogF0ContinuousFeature = false0; /* for using external f0, so it will not be generated from HMMs*/ | |||
134 | ||||
135 | /** variables for controling generation of speech in the vocoder | |||
136 | * these variables have default values but can be fixed and read from the | |||
137 | * audio effects component. [Default][min--max] */ | |||
138 | double f0Std = 1.0; /* variable for f0 control, multiply f0 [1.0][0.0--5.0] */ | |||
139 | double f0Mean = 0.0; /* variable for f0 control, add f0 [0.0][0.0--100.0] */ | |||
140 | double length = 0.0; /* total number of frame for generated speech */ | |||
141 | /* length of generated speech (in seconds) [N/A][0.0--30.0] */ | |||
142 | double durationScale = 1.0; /* less than 1.0 is faster and more than 1.0 is slower, min=0.1 max=3.0 */ | |||
143 | ||||
144 | } HTSData; | |||
145 | #endif | |||
146 | ||||
/* File-level vocoder state.  initVocoder() assigns most of these; the
   filter routines below read them directly instead of taking them as
   parameters. */
147 | static int IPERIOD = 1; | |||
148 | static booleanint GAUSS = true1; | |||
/* NOTE: the value 5 is also baked into the 21-entry pade table built in
   initVocoder() and into the explicit i == 1/3/5 sign tests in mlsadf1()
   and mlsadf2(); changing it here alone is not sufficient. */
149 | static int PADEORDER = 5; /* pade order for MLSA filter */ | |||
150 | static int IRLENG = 96; /* length of impulse response */ | |||
151 | ||||
152 | /* for MGLSA filter (mel-generalised log spectrum approximation filter) */ | |||
153 | static booleanint NORMFLG1 = true1; | |||
154 | static booleanint NORMFLG2 = false0; | |||
155 | static booleanint MULGFLG1 = true1; | |||
156 | static booleanint MULGFLG2 = false0; | |||
157 | static booleanint NGAIN = false0; | |||
158 | ||||
159 | static double ZERO = 1.0e-10; /* ~(0) */ | |||
160 | static double LZERO = (-1.0e+10); /* ~log(0) */ | |||
161 | ||||
162 | static int stage; /* Gamma=-1/stage : if stage=0 then Gamma=0 */ | |||
163 | static double xgamma; /* Gamma */ | |||
164 | static booleanint use_log_gain; /* log gain flag (for LSP) */ | |||
165 | static int fprd; /* frame shift */ | |||
166 | static int iprd; /* interpolation period */ | |||
167 | static booleanint gauss; /* flag to use Gaussian noise */ | |||
168 | static double p1; /* used in excitation generation */ | |||
169 | static double pc; /* used in excitation generation */ | |||
/* pade is allocated (21 doubles) only in the stage == 0 branch of
   initVocoder(); ppade is the index PADEORDER*(PADEORDER+1)/2 into it. */
170 | static double *pade; /* used in mlsadf */ | |||
171 | static int ppade; /* offset into the pade vector */ | |||
172 | ||||
/* C, CC and CINC each hold mcep_order+1 doubles and D1 holds D1_length
   doubles; all four are allocated with walloc in initVocoder(). */
173 | static double *C; /* used in the MLSA/MGLSA filter */ | |||
174 | static double *CC; /* used in the MLSA/MGLSA filter */ | |||
175 | static double *CINC; /* used in the MLSA/MGLSA filter */ | |||
176 | static double *D1; /* used in the MLSA/MGLSA filter */ | |||
177 | static int CINC_length, CC_length, C_length, D1_length; | |||
178 | ||||
179 | static double rate; | |||
/* pt1, pt2 and pt3[1..PADEORDER] are index offsets into the delay buffer
   passed as d to mlsadf1()/mlsadf2(); pt3 is created with new[] in
   initVocoder() and its element 0 is never assigned there. */
180 | static int pt1; /* used in mlsadf1 */ | |||
181 | static int pt2; /* used in mlsadf2 */ | |||
182 | static int *pt3; /* used in mlsadf2 */ | |||
183 | ||||
184 | /* mixed excitation variables */ | |||
185 | static int numM; /* Number of bandpass filters for mixed excitation */ | |||
186 | static int orderM; /* Order of filters for mixed excitation */ | |||
187 | static double **h; /* filters for mixed excitation */ | |||
188 | static double *xpulseSignal; /* the size of this should be orderM */ | |||
189 | static double *xnoiseSignal; /* the size of this should be orderM */ | |||
190 | static booleanint mixedExcitation = false0; | |||
191 | static booleanint fourierMagnitudes = false0; | |||
192 | ||||
193 | static booleanint lpcVocoder = false0; /* true if lpc vocoder is used, then the input should be lsp parameters */ | |||
194 | ||||
195 | void initVocoder(int mcep_order, int mcep_vsize, HTSData *htsData); | |||
196 | int htsMLSAVocoder(EST_Track *lf0Pst, | |||
197 | EST_Track *mcepPst, | |||
198 | EST_Track *strPst, | |||
199 | EST_Track *magPst, | |||
200 | int *voiced, | |||
201 | HTSData *htsData, | |||
202 | EST_Wave *wave); | |||
203 | ||||
204 | ||||
205 | LISP me_mlsa_resynthesis(LISP ltrack, LISP strack) | |||
206 | { | |||
207 | /* Resynthesizes a wave from given track with mixed excitation*/ | |||
208 | EST_Track *t; | |||
209 | EST_Track *str_track; | |||
210 | EST_Wave *wave = 0; | |||
211 | EST_Track *mcep; | |||
212 | EST_Track *f0v; | |||
213 | EST_Track *str; | |||
214 | EST_Track *mag; | |||
215 | int *voiced; | |||
216 | int sr = 16000; | |||
217 | int i,j; | |||
218 | double shift; | |||
219 | HTSData htsData; | |||
220 | ||||
221 | htsData.alpha = 0.42; | |||
222 | htsData.beta = 0.0; | |||
223 | ||||
224 | if ((ltrack == NULL__null) || | |||
225 | (TYPEP(ltrack,tc_string)( (ltrack != __null) && ((((ltrack) == ((struct obj * ) 0)) ? 0 : ((*(ltrack)).type)) == (13)) ) && | |||
226 | (streq(get_c_string(ltrack),"nil")(strcmp(get_c_string(ltrack),"nil")==0)))) | |||
227 | return siod(new EST_Wave(0,1,sr)); | |||
228 | ||||
229 | t = track(ltrack); | |||
230 | str_track = track(strack); | |||
231 | ||||
232 | f0v = new EST_Track(t->num_frames(),1); | |||
233 | mcep = new EST_Track(t->num_frames(),25); | |||
234 | str = new EST_Track(t->num_frames(),5); | |||
235 | mag = new EST_Track(t->num_frames(),10); | |||
236 | voiced = walloc(int,t->num_frames())((int *)safe_walloc(sizeof(int)*(t->num_frames()))); | |||
237 | ||||
238 | for (i=0; i<t->num_frames(); i++) | |||
239 | { | |||
240 | f0v->a(i) = t->a(i,0); | |||
241 | if (f0v->a(i) > 0) | |||
242 | voiced[i] = 1; | |||
243 | else | |||
244 | voiced[i] = 0; | |||
245 | for (j=1; j<26; j++) | |||
246 | mcep->a(i,j-1) = t->a(i,j); | |||
247 | ||||
248 | for (j=0; j<5; j++) | |||
249 | { | |||
250 | str->a(i,j) = str_track->a(i,j); | |||
251 | } | |||
252 | /* printf("awb_debug str %d 0 %f 1 %f 2 %f 3 %f 4 %f\n", | |||
253 | i,str->a(i,0),str->a(i,1),str->a(i,2),str->a(i,3),str->a(i,4));*/ | |||
254 | #if 0 | |||
255 | for (j=57; j<66; j++) | |||
256 | mag->a(i,j-57) = t->a(i,j); | |||
257 | #endif | |||
258 | } | |||
259 | ||||
260 | if (t->num_frames() > 1) | |||
261 | shift = 1000.0*(t->t(1)-t->t(0)); | |||
262 | else | |||
263 | shift = 5.0; | |||
264 | ||||
265 | htsData.alpha = FLONM(siod_get_lval("mlsa_alpha_param",((*siod_get_lval("mlsa_alpha_param", "mlsa: mlsa_alpha_param not set" )).storage_as.flonum.data) | |||
266 | "mlsa: mlsa_alpha_param not set"))((*siod_get_lval("mlsa_alpha_param", "mlsa: mlsa_alpha_param not set" )).storage_as.flonum.data); | |||
267 | htsData.beta = FLONM(siod_get_lval("mlsa_beta_param",((*siod_get_lval("mlsa_beta_param", "mlsa: mlsa_beta_param not set" )).storage_as.flonum.data) | |||
268 | "mlsa: mlsa_beta_param not set"))((*siod_get_lval("mlsa_beta_param", "mlsa: mlsa_beta_param not set" )).storage_as.flonum.data); | |||
269 | htsData.stage = 0; | |||
270 | htsData.LogGain = false0; | |||
271 | htsData.fperiod = 80; | |||
272 | htsData.rate = 16000; | |||
273 | htsData.rhos = 0.0; | |||
274 | ||||
275 | htsData.uf = 0.5; | |||
276 | htsData.algnst = false0; /* use state level alignment for duration */ | |||
277 | htsData.algnph = false0; /* use phoneme level alignment for duration */ | |||
278 | htsData.useMixExc = true1; /* use Mixed Excitation */ | |||
279 | htsData.useFourierMag = false0; /* use Fourier magnitudes for pulse generation */ | |||
280 | htsData.useGV = false0; /* use global variance in parameter generation */ | |||
281 | htsData.useGmmGV = false0; /* use global variance as a Gaussian Mixture Model */ | |||
282 | htsData.useUnitDurationContinuousFeature = false0; /* for using external duration, so it will not be generated from HMMs*/ | |||
283 | htsData.useUnitLogF0ContinuousFeature = false0; /* for using external f0, so it will not be generated from HMMs*/ | |||
284 | ||||
285 | /** variables for controling generation of speech in the vocoder | |||
286 | * these variables have default values but can be fixed and read from the | |||
287 | * audio effects component. [Default][min--max] */ | |||
288 | htsData.F0Std = 1.0; /* variable for f0 control, multiply f0 [1.0][0.0--5.0] */ | |||
289 | htsData.F0Mean = 0.0; /* variable for f0 control, add f0 [0.0][0.0--100.0] */ | |||
290 | htsData.length = 0.0; /* total number of frame for generated speech */ | |||
291 | /* length of generated speech (in seconds) [N/A][0.0--30.0] */ | |||
292 | htsData.durationScale = 1.0; /* less than 1.0 is faster and more than 1.0 is slower, min=0.1 max=3.0 */ | |||
293 | ||||
294 | LISP filters = siod_get_lval("me_mix_filters", | |||
295 | "mlsa: me_mix_filters not set"); | |||
296 | LISP f; | |||
297 | int fl; | |||
298 | htsData.NumFilters = 5; | |||
299 | for (fl=0,f=filters; f; fl++) | |||
300 | f=cdr(f); | |||
301 | htsData.OrderFilters = fl/htsData.NumFilters; | |||
302 | htsData.MixFilters = walloc(double *,htsData.NumFilters)((double * *)safe_walloc(sizeof(double *)*(htsData.NumFilters ))); | |||
303 | for (i=0; i < htsData.NumFilters; i++) | |||
304 | { | |||
305 | htsData.MixFilters[i] = walloc(double,htsData.OrderFilters)((double *)safe_walloc(sizeof(double)*(htsData.OrderFilters)) ); | |||
306 | for (j=0; j<htsData.OrderFilters; j++) | |||
307 | { | |||
308 | htsData.MixFilters[i][j] = FLONM(car(filters))((*car(filters)).storage_as.flonum.data); | |||
309 | filters = cdr(filters); | |||
310 | } | |||
311 | } | |||
312 | ||||
313 | wave = new EST_Wave(0,1,sr); | |||
314 | ||||
315 | if (mcep->num_frames() > 0) | |||
316 | /* mcep_order and number of deltas */ | |||
317 | htsMLSAVocoder(f0v,mcep,str,mag,voiced,&htsData,wave); | |||
318 | ||||
319 | delete f0v; | |||
320 | delete mcep; | |||
321 | delete str; | |||
322 | delete mag; | |||
323 | delete voiced; | |||
324 | ||||
325 | return siod(wave); | |||
326 | } | |||
327 | ||||
328 | /** The initialisation of VocoderSetup should be done when there is already | |||
329 | * information about the number of feature vectors to be processed, | |||
330 | * size of the mcep vector file, etc. */ | |||
331 | void initVocoder(int mcep_order, int mcep_vsize, HTSData *htsData) | |||
332 | { | |||
333 | int vector_size; | |||
334 | double xrand; | |||
335 | ||||
336 | stage = htsData->stage; | |||
337 | if(stage != 0) | |||
338 | xgamma = -1.0 / stage; | |||
339 | else | |||
340 | xgamma = 0.0; | |||
341 | use_log_gain = htsData->LogGain; | |||
342 | ||||
343 | fprd = htsData->fperiod; | |||
344 | rate = htsData->rate; | |||
345 | iprd = IPERIOD; | |||
346 | gauss = GAUSS; | |||
347 | ||||
348 | /* XXX */ | |||
349 | xrand = rand(); | |||
350 | ||||
351 | if(stage == 0 ){ /* for MCP */ | |||
352 | ||||
353 | /* mcep_order=74 and pd=PADEORDER=5 (if no HTS_EMBEDDED is used) */ | |||
354 | vector_size = (mcep_vsize * ( 3 + PADEORDER) + 5 * PADEORDER + 6) - (3 * (mcep_order+1)); | |||
355 | CINC_length = CC_length = C_length = mcep_order+1; | |||
356 | D1_length = vector_size; | |||
357 | C = walloc(double,C_length)((double *)safe_walloc(sizeof(double)*(C_length))); | |||
358 | CC = walloc(double,CC_length)((double *)safe_walloc(sizeof(double)*(CC_length))); | |||
359 | CINC = walloc(double,CINC_length)((double *)safe_walloc(sizeof(double)*(CINC_length))); | |||
360 | D1 = walloc(double,D1_length)((double *)safe_walloc(sizeof(double)*(D1_length))); | |||
361 | ||||
362 | vector_size=21; | |||
363 | pade = walloc(double,vector_size)((double *)safe_walloc(sizeof(double)*(vector_size))); | |||
364 | /* ppade is a copy of pade in mlsadf() function : ppade = &( pade[pd*(pd+1)/2] ); */ | |||
365 | ppade = PADEORDER*(PADEORDER+1)/2; /* offset for vector pade */ | |||
366 | pade[0] = 1.0; | |||
367 | pade[1] = 1.0; | |||
368 | pade[2] = 0.0; | |||
369 | pade[3] = 1.0; | |||
370 | pade[4] = 0.0; | |||
371 | pade[5] = 0.0; | |||
372 | pade[6] = 1.0; | |||
373 | pade[7] = 0.0; | |||
374 | pade[8] = 0.0; | |||
375 | pade[9] = 0.0; | |||
376 | pade[10] = 1.0; | |||
377 | pade[11] = 0.4999273; | |||
378 | pade[12] = 0.1067005; | |||
379 | pade[13] = 0.01170221; | |||
380 | pade[14] = 0.0005656279; | |||
381 | pade[15] = 1.0; | |||
382 | pade[16] = 0.4999391; | |||
383 | pade[17] = 0.1107098; | |||
384 | pade[18] = 0.01369984; | |||
385 | pade[19] = 0.0009564853; | |||
386 | pade[20] = 0.00003041721; | |||
387 | ||||
388 | pt1 = PADEORDER+1; | |||
389 | pt2 = ( 2 * (PADEORDER+1)) + (PADEORDER * (mcep_order+2)); | |||
390 | pt3 = new int[PADEORDER+1]; | |||
391 | for(int i=PADEORDER; i>=1; i--) | |||
392 | pt3[i] = ( 2 * (PADEORDER+1)) + ((i-1)*(mcep_order+2)); | |||
393 | ||||
394 | } else { /* for LSP */ | |||
395 | vector_size = ((mcep_vsize+1) * (stage+3)) - ( 3 * (mcep_order+1)); | |||
396 | CINC_length = CC_length = C_length = mcep_order+1; | |||
397 | D1_length = vector_size; | |||
398 | C = walloc(double,C_length)((double *)safe_walloc(sizeof(double)*(C_length))); | |||
399 | CC = walloc(double,CC_length)((double *)safe_walloc(sizeof(double)*(CC_length))); | |||
400 | CINC = walloc(double,CINC_length)((double *)safe_walloc(sizeof(double)*(CINC_length))); | |||
401 | D1 = walloc(double,D1_length)((double *)safe_walloc(sizeof(double)*(D1_length))); | |||
402 | } | |||
403 | ||||
404 | /* excitation initialisation */ | |||
405 | p1 = -1; | |||
406 | pc = 0.0; | |||
407 | ||||
408 | } /* method initVocoder */ | |||
409 | ||||
410 | ||||
411 | ||||
412 | /** | |||
413 | * HTS_MLSA_Vocoder: Synthesis of speech out of mel-cepstral coefficients. | |||
414 | * This procedure uses the parameters generated in pdf2par stored in: | |||
415 | * PStream mceppst: Mel-cepstral coefficients | |||
416 | * PStream strpst : Filter bank stregths for mixed excitation | |||
417 | * PStream magpst : Fourier magnitudes ( OJO!! this is not used yet) | |||
418 | * PStream lf0pst : Log F0 | |||
419 | */ | |||
420 | #if 0 | |||
421 | AudioInputStream htsMLSAVocoder(HTSParameterGeneration pdf2par, HMMData htsData) | |||
422 | { | |||
423 | float sampleRate = 16000.0F; //8000,11025,16000,22050,44100 | |||
424 | int sampleSizeInBits = 16; //8,16 | |||
425 | int channels = 1; //1,2 | |||
426 | booleanint signed = true1; //true,false | |||
427 | booleanint bigEndian = false0; //true,false | |||
428 | AudioFormat af = new AudioFormat( | |||
429 | sampleRate, | |||
430 | sampleSizeInBits, | |||
431 | channels, | |||
432 | signed, | |||
433 | bigEndian); | |||
434 | double [] audio_double = NULL__null; | |||
435 | ||||
436 | audio_double = htsMLSAVocoder(pdf2par.getlf0Pst(), pdf2par.getMcepPst(), pdf2par.getStrPst(), pdf2par.getMagPst(), | |||
437 | pdf2par.getVoicedArray(), htsData); | |||
438 | ||||
439 | long lengthInSamples = (audio_double.length * 2 ) / (sampleSizeInBits/8); | |||
440 | logger.info("length in samples=" + lengthInSamples ); | |||
441 | ||||
442 | /* Normalise the signal before return, this will normalise between 1 and -1 */ | |||
443 | double MaxSample = MathUtils.getAbsMax(audio_double); | |||
444 | for (int i=0; i<audio_double.length; i++) | |||
445 | audio_double[i] = 0.3 * ( audio_double[i] / MaxSample ); | |||
446 | ||||
447 | DDSAudioInputStream oais = new DDSAudioInputStream(new BufferedDoubleDataSource(audio_double), af); | |||
448 | return oais; | |||
449 | ||||
450 | ||||
451 | } /* method htsMLSAVocoder() */ | |||
452 | #endif | |||
453 | ||||
/**
 * One FIR section of the MLSA filter.
 *
 * x:    input sample.
 * b:    coefficients; entries 2..m are used.
 * m:    highest coefficient index.
 * a:    all-pass constant.
 * aa:   second constant applied to the input (callers pass it in).
 * d:    delay buffer; entries _pt3 .. _pt3+m+1 are read and updated.
 * _pt3: index of this section's first delay cell inside d.
 *
 * Returns the sum of d[_pt3+i]*b[i] for i = 2..m, taken before the delay
 * line is shifted by one cell.
 */
static double mlsafir(double x, double *b, int m, double a, double aa, double *d, int _pt3 )
{
    double *cell = d + _pt3;   /* cell[0] is d[_pt3] */
    double acc = 0.0;
    int k;

    cell[0] = x;
    cell[1] = aa * cell[0] + ( a * cell[1] );

    for (k = 2; k <= m; k++)
        cell[k] += a * ( cell[k+1] - cell[k-1] );

    for (k = 2; k <= m; k++)
        acc += cell[k] * b[k];

    /* shift the line one step, working from the top down */
    for (k = m+1; k > 1; k--)
        cell[k] = cell[k-1];

    return acc;
}
476 | ||||
477 | /** mlsdaf1: sub functions for MLSA filter */ | |||
478 | static double mlsadf1(double x, double *b, int m, double a, double aa, double *d) | |||
479 | { | |||
480 | double v; | |||
481 | double out = 0.0; | |||
482 | int i; | |||
483 | //pt1 --> pt = &d1[pd+1] | |||
484 | ||||
485 | for(i=PADEORDER; i>=1; i--) { | |||
486 | d[i] = aa * d[pt1+i-1] + a * d[i]; | |||
487 | d[pt1+i] = d[i] * b[1]; | |||
488 | v = d[pt1+i] * pade[ppade+i]; | |||
489 | ||||
490 | //x += (1 & i) ? v : -v; | |||
491 | if(i == 1 || i == 3 || i == 5) | |||
492 | x += v; | |||
493 | else | |||
494 | x += -v; | |||
495 | out += v; | |||
496 | } | |||
497 | d[pt1+0] = x; | |||
498 | out += x; | |||
499 | ||||
500 | return(out); | |||
501 | ||||
502 | } | |||
503 | ||||
504 | /** mlsdaf2: sub functions for MLSA filter */ | |||
505 | static double mlsadf2(double x, double *b, int m, double a, double aa, double *d) | |||
506 | { | |||
507 | double v; | |||
508 | double out = 0.0; | |||
509 | int i; | |||
510 | // pt2 --> pt = &d1[pd * (m+2)] | |||
511 | // pt3 --> pt = &d1[ 2*(pd+1) ] | |||
512 | ||||
513 | for(i=PADEORDER; i>=1; i--) { | |||
514 | d[pt2+i] = mlsafir(d[(pt2+i)-1], b, m, a, aa, d, pt3[i]); | |||
515 | v = d[pt2+i] * pade[ppade+i]; | |||
516 | ||||
517 | if(i == 1 || i == 3 || i == 5) | |||
518 | x += v; | |||
519 | else | |||
520 | x += -v; | |||
521 | out += v; | |||
522 | ||||
523 | } | |||
524 | d[pt2+0] = x; | |||
525 | out += x; | |||
526 | ||||
527 | return out; | |||
528 | } | |||
529 | ||||
530 | /** mlsadf: HTS Mel Log Spectrum Approximation filter */ | |||
531 | static double mlsadf(double x, double *b, int m, double a, double aa, double *d) | |||
532 | { | |||
533 | ||||
534 | x = mlsadf1(x, b, m, a, aa, d); | |||
535 | x = mlsadf2(x, b, m-1, a, aa, d); | |||
536 | ||||
537 | return x; | |||
538 | } | |||
539 | ||||
540 | ||||
/**
 * uniformRand: return either 1.0 or -1.0.
 * One value is drawn with rand(); the result is 1.0 when that value is at
 * least RAND_MAX/2.0 and -1.0 otherwise.
 */
static double uniformRand()
{
    const double draw = rand();

    return (draw < RAND_MAX/2.0) ? -1.0 : 1.0;
}
552 | ||||
/**
 * mc2b: transform mel-cepstrum to MLSA digital filter coefficients.
 *
 * b[m] = mc[m], then b[k] = mc[k] - a*b[k+1] for k = m-1 down to 0.
 * Both arrays need m+1 entries.
 */
static void mc2b(double *mc, double *b, int m, double a )
{
    int k = m;

    b[k] = mc[k];
    while (--k >= 0)
        b[k] = mc[k] - a * b[k+1];
}
562 | ||||
/**
 * b2mc: transform MLSA digital filter coefficients to mel-cepstrum.
 * This is the inverse of mc2b().
 *
 * mc[m] = b[m], then mc[k] = b[k] + a*b[k+1] for k = m-1 down to 0.
 * Both arrays need m+1 entries.
 *
 * The loop used to start at k == m, which replaced mc[m] with
 * b[m]*(1+a); it now starts at m-1 so the top coefficient is kept.
 */
static void b2mc(double *b, double *mc, int m, double a)
{
    double d, o;
    int i;

    d = mc[m] = b[m];
    for (i = m - 1; i >= 0; i--) {
        o = b[i] + (a * d);
        d = b[i];     /* remember the untransformed b[i] for the next step */
        mc[i] = o;
    }
}
575 | ||||
576 | ||||
577 | /** freqt: frequency transformation */ | |||
578 | //private void freqt(double c1[], int m1, int cepIndex, int m2, double a){ | |||
579 | static void freqt(double *c1, int m1, double *c2, int m2, double a) | |||
580 | { | |||
581 | double *freqt_buff=NULL__null; /* used in freqt */ | |||
582 | int freqt_size=0; /* buffer size for freqt */ | |||
583 | int i, j; | |||
584 | double b = 1 - a * a; | |||
585 | int g; /* offset of freqt_buff */ | |||
586 | ||||
587 | if(m2 > freqt_size) { | |||
588 | freqt_buff = walloc(double,m2 + m2 + 2)((double *)safe_walloc(sizeof(double)*(m2 + m2 + 2))); | |||
589 | freqt_size = m2; | |||
590 | } | |||
591 | g = freqt_size +1; | |||
592 | ||||
593 | for(i = 0; i < m2+1; i++) | |||
594 | freqt_buff[g+i] = 0.0; | |||
595 | ||||
596 | for(i = -m1; i <= 0; i++){ | |||
597 | if(0 <= m2 ) | |||
598 | freqt_buff[g+0] = c1[-i] + a * (freqt_buff[0] = freqt_buff[g+0]); | |||
599 | if(1 <= m2) | |||
600 | freqt_buff[g+1] = b * freqt_buff[0] + a * (freqt_buff[1] = freqt_buff[g+1]); | |||
601 | ||||
602 | for(j=2; j<=m2; j++) | |||
603 | freqt_buff[g+j] = freqt_buff[j-1] + a * ( (freqt_buff[j] = freqt_buff[g+j]) - freqt_buff[g+j-1]); | |||
604 | ||||
605 | } | |||
606 | ||||
607 | /* move memory */ | |||
608 | for(i=0; i<m2+1; i++) | |||
609 | c2[i] = freqt_buff[g+i]; | |||
610 | ||||
611 | if (freqt_buff) | |||
612 | wfree(freqt_buff); | |||
613 | ||||
614 | } | |||
615 | ||||
/**
 * c2ir: evaluate the minimum phase impulse response from the minimum
 * phase cepstrum.
 *
 * c:    cepstrum; entries 0..nc-1 are read.
 * nc:   number of cepstral coefficients.
 * hh:   output, entries 0..leng-1 are written.
 * leng: number of impulse response samples to produce.
 *
 * hh[0] = exp(c[0]); every later sample is the sum of k*c[k]*hh[n-k]
 * over k = 1..min(n, nc-1), divided by n.
 */
static void c2ir(double *c, int nc, double *hh, int leng )
{
    int n;

    hh[0] = exp(c[0]);
    for (n = 1; n < leng; n++) {
        double acc = 0;
        int last = n;
        int k;

        if (n >= nc)
            last = nc - 1;
        for (k = 1; k <= last; k++)
            acc += k * c[k] * hh[n - k];
        hh[n] = acc / n;
    }
}
631 | ||||
632 | /** b2en: functions for postfiltering */ | |||
633 | static double b2en(double *b, int m, double a) | |||
634 | { | |||
635 | double *spectrum2en_buff=NULL__null; /* used in spectrum2en */ | |||
636 | int spectrum2en_size=0; /* buffer size for spectrum2en */ | |||
637 | double en = 0.0; | |||
638 | int i; | |||
639 | double *cep, *ir; | |||
640 | ||||
641 | if(spectrum2en_size < m) { | |||
642 | spectrum2en_buff = walloc(double,(m+1) + 2 * IRLENG)((double *)safe_walloc(sizeof(double)*((m+1) + 2 * IRLENG))); | |||
643 | spectrum2en_size = m; | |||
644 | } | |||
645 | cep = walloc(double,(m+1) + 2 * IRLENG)((double *)safe_walloc(sizeof(double)*((m+1) + 2 * IRLENG))); /* CHECK! these sizes!!! */ | |||
646 | ir = walloc(double,(m+1) + 2 * IRLENG)((double *)safe_walloc(sizeof(double)*((m+1) + 2 * IRLENG))); | |||
647 | ||||
648 | b2mc(b, spectrum2en_buff, m, a); | |||
649 | /* freqt(vs->mc, m, vs->cep, vs->irleng - 1, -a);*/ | |||
650 | freqt(spectrum2en_buff, m, cep, IRLENG-1, -a); | |||
651 | /* HTS_c2ir(vs->cep, vs->irleng, vs->ir, vs->irleng); */ | |||
652 | c2ir(cep, IRLENG, ir, IRLENG); | |||
653 | en = 0.0; | |||
654 | ||||
655 | for(i = 0; i < IRLENG; i++) | |||
656 | en += ir[i] * ir[i]; | |||
657 | ||||
658 | if (spectrum2en_buff) | |||
659 | wfree(spectrum2en_buff); | |||
660 | wfree(cep); | |||
661 | wfree(ir); | |||
662 | ||||
663 | return(en); | |||
664 | } | |||
665 | ||||
/**
 * ignorm: inverse gain normalization.
 *
 * c1: input coefficients, entries 0..m.
 * c2: output coefficients, entries 0..m.
 * ng: gamma.
 *
 * ng != 0: k = c1[0]^ng, c2[i] = k*c1[i] for i = 1..m, c2[0] = (k-1)/ng.
 * ng == 0: c2[i] = c1[i] for i = 1..m, c2[0] = log(c1[0]).
 *
 * The ng == 0 copy used to stop at m-1 and left c2[m] unwritten; it now
 * covers 1..m like the other branch and like gnorm().
 */
static void ignorm(double *c1, double *c2, int m, double ng)
{
    double k;
    int i;

    if(ng != 0.0 ) {
        k = pow(c1[0], ng);
        for(i=m; i>=1; i--)
            c2[i] = k * c1[i];
        c2[0] = (k - 1.0) / ng;
    } else {
        /* movem */
        for(i=1; i<=m; i++)
            c2[i] = c1[i];
        c2[0] = log(c1[0]);
    }
}
683 | ||||
/**
 * gnorm: gain normalization.
 *
 * c1: input coefficients, entries 0..m.
 * c2: output coefficients, entries 0..m.
 * g:  gamma.
 *
 * g != 0: k = 1 + g*c1[0], c2[i] = c1[i]/k for i = m..1, c2[0] = k^(1/g).
 * g == 0: c2[i] = c1[i] for i = 1..m, c2[0] = exp(c1[0]).
 */
static void gnorm(double *c1, double *c2, int m, double g)
{
    int idx;

    if (g == 0.0) {
        /* movem */
        for (idx = 1; idx <= m; idx++)
            c2[idx] = c1[idx];
        c2[0] = exp(c1[0]);
        return;
    }

    {
        const double scale = 1.0 + g * c1[0];

        for (idx = m; idx >= 1; idx--)
            c2[idx] = c1[idx] / scale;
        c2[0] = pow(scale, 1.0 / g);
    }
}
702 | ||||
703 | /** lsp2lpc: transform LSP to LPC. lsp[1..m] --> a=lpc[0..m] a[0]=1.0 */ | |||
704 | static void lsp2lpc(double *lsp, double *a, int m) | |||
705 | { | |||
706 | double *lsp2lpc_buff=NULL__null; /* used in lsp2lpc */ | |||
707 | int lsp2lpc_size=0; /* buffer size of lsp2lpc */ | |||
708 | int i, k, mh1, mh2, flag_odd; | |||
709 | double xx, xf, xff; | |||
710 | int p, q; /* offsets of lsp2lpc_buff */ | |||
711 | int a0, a1, a2, b0, b1, b2; /* offsets of lsp2lpc_buff */ | |||
712 | ||||
713 | flag_odd = 0; | |||
714 | if(m % 2 == 0) | |||
715 | mh1 = mh2 = m / 2; | |||
716 | else { | |||
717 | mh1 = (m+1) / 2; | |||
718 | mh2 = (m-1) / 2; | |||
719 | flag_odd = 1; | |||
720 | } | |||
721 | ||||
722 | if(m > lsp2lpc_size){ | |||
723 | lsp2lpc_buff = walloc(double,5 * m + 6)((double *)safe_walloc(sizeof(double)*(5 * m + 6))); | |||
724 | lsp2lpc_size = m; | |||
725 | } | |||
726 | ||||
727 | /* offsets of lsp2lpcbuff */ | |||
728 | p = m; | |||
729 | q = p + mh1; | |||
730 | a0 = q + mh2; | |||
731 | a1 = a0 + (mh1 +1); | |||
732 | a2 = a1 + (mh1 +1); | |||
733 | b0 = a2 + (mh1 +1); | |||
734 | b1 = b0 + (mh2 +1); | |||
735 | b2 = b1 + (mh2 +1); | |||
736 | ||||
737 | /* move lsp -> lsp2lpc_buff */ | |||
738 | for(i=0; i<m; i++) | |||
739 | lsp2lpc_buff[i] = lsp[i+1]; | |||
740 | ||||
741 | for (i = 0; i < mh1 + 1; i++) | |||
742 | lsp2lpc_buff[a0 + i] = 0.0; | |||
743 | for (i = 0; i < mh1 + 1; i++) | |||
744 | lsp2lpc_buff[a1 + i] = 0.0; | |||
745 | for (i = 0; i < mh1 + 1; i++) | |||
746 | lsp2lpc_buff[a2 + i] = 0.0; | |||
747 | for (i = 0; i < mh2 + 1; i++) | |||
748 | lsp2lpc_buff[b0 + i] = 0.0; | |||
749 | for (i = 0; i < mh2 + 1; i++) | |||
750 | lsp2lpc_buff[b1 + i] = 0.0; | |||
751 | for (i = 0; i < mh2 + 1; i++) | |||
752 | lsp2lpc_buff[b2 + i] = 0.0; | |||
753 | ||||
754 | /* lsp filter parameters */ | |||
755 | for (i = k = 0; i < mh1; i++, k += 2) | |||
756 | lsp2lpc_buff[p + i] = -2.0 * cos(lsp2lpc_buff[k]); | |||
757 | for (i = k = 0; i < mh2; i++, k += 2) | |||
758 | lsp2lpc_buff[q + i] = -2.0 * cos(lsp2lpc_buff[k + 1]); | |||
759 | ||||
760 | /* impulse response of analysis filter */ | |||
761 | xx = 1.0; | |||
762 | xf = xff = 0.0; | |||
763 | ||||
764 | for (k = 0; k <= m; k++) { | |||
765 | if (flag_odd == 1) { | |||
766 | lsp2lpc_buff[a0 + 0] = xx; | |||
767 | lsp2lpc_buff[b0 + 0] = xx - xff; | |||
768 | xff = xf; | |||
769 | xf = xx; | |||
770 | } else { | |||
771 | lsp2lpc_buff[a0 + 0] = xx + xf; | |||
772 | lsp2lpc_buff[b0 + 0] = xx - xf; | |||
773 | xf = xx; | |||
774 | } | |||
775 | ||||
776 | for (i = 0; i < mh1; i++) { | |||
777 | lsp2lpc_buff[a0 + i + 1] = lsp2lpc_buff[a0 + i] + lsp2lpc_buff[p + i] * lsp2lpc_buff[a1 + i] + lsp2lpc_buff[a2 + i]; | |||
778 | lsp2lpc_buff[a2 + i] = lsp2lpc_buff[a1 + i]; | |||
779 | lsp2lpc_buff[a1 + i] = lsp2lpc_buff[a0 + i]; | |||
780 | } | |||
781 | ||||
782 | for (i = 0; i < mh2; i++) { | |||
783 | lsp2lpc_buff[b0 + i + 1] = lsp2lpc_buff[b0 + i] + lsp2lpc_buff[q + i] * lsp2lpc_buff[b1 + i] + lsp2lpc_buff[b2 + i]; | |||
784 | lsp2lpc_buff[b2 + i] = lsp2lpc_buff[b1 + i]; | |||
785 | lsp2lpc_buff[b1 + i] = lsp2lpc_buff[b0 + i]; | |||
786 | } | |||
787 | ||||
788 | if (k != 0) | |||
789 | a[k - 1] = -0.5 * (lsp2lpc_buff[a0 + mh1] + lsp2lpc_buff[b0 + mh2]); | |||
790 | xx = 0.0; | |||
791 | } | |||
792 | ||||
793 | for (i = m - 1; i >= 0; i--) | |||
794 | a[i + 1] = -a[i]; | |||
795 | a[0] = 1.0; | |||
796 | ||||
797 | if (lsp2lpc_buff) | |||
798 | wfree(lsp2lpc_buff); | |||
799 | } | |||
800 | ||||
801 | /** gc2gc: generalized cepstral transformation */ | |||
802 | static void gc2gc(double *c1, int m1, double g1, double *c2, int m2, double g2) | |||
803 | { | |||
804 | double *gc2gc_buff=NULL__null; /* used in gc2gc */ | |||
805 | int gc2gc_size=0; /* buffer size for gc2gc */ | |||
806 | int i, min, k, mk; | |||
807 | double ss1, ss2, cc; | |||
808 | ||||
809 | if( m1 > gc2gc_size ) { | |||
810 | gc2gc_buff = walloc(double,m1 + 1)((double *)safe_walloc(sizeof(double)*(m1 + 1))); /* check if these buffers should be created all the time */ | |||
811 | gc2gc_size = m1; | |||
812 | } | |||
813 | ||||
814 | /* movem*/ | |||
815 | for(i=0; i<(m1+1); i++) | |||
816 | gc2gc_buff[i] = c1[i]; | |||
817 | ||||
818 | c2[0] = gc2gc_buff[0]; | |||
| ||||
819 | ||||
820 | for( i=1; i<=m2; i++){ | |||
821 | ss1 = ss2 = 0.0; | |||
822 | min = m1 < i ? m1 : i - 1; | |||
823 | for(k=1; k<=min; k++){ | |||
824 | mk = i - k; | |||
825 | cc = gc2gc_buff[k] * c2[mk]; | |||
826 | ss2 += k * cc; | |||
827 | ss1 += mk * cc; | |||
828 | } | |||
829 | ||||
830 | if(i <= m1) | |||
831 | c2[i] = gc2gc_buff[i] + (g2 * ss2 - g1 * ss1) / i; | |||
832 | else | |||
833 | c2[i] = (g2 * ss2 - g1 * ss1) / i; | |||
834 | } | |||
835 | ||||
836 | if (gc2gc_buff) | |||
837 | wfree(gc2gc_buff); | |||
838 | } | |||
839 | ||||
840 | /** mgc2mgc: frequency and generalized cepstral transformation */ | |||
841 | static void mgc2mgc(double *c1, int m1, double a1, double g1, double *c2, int m2, double a2, double g2) | |||
842 | { | |||
843 | double a; | |||
844 | ||||
845 | if(a1 == a2){ | |||
846 | gnorm(c1, c1, m1, g1); | |||
847 | gc2gc(c1, m1, g1, c2, m2, g2); | |||
848 | ignorm(c2, c2, m2, g2); | |||
849 | } else { | |||
850 | a = (a2 -a1) / (1 - a1 * a2); | |||
851 | freqt(c1, m1, c2, m2, a); | |||
852 | gnorm(c2, c2, m2, g1); | |||
853 | gc2gc(c2, m2, g1, c2, m2, g2); | |||
854 | ignorm(c2, c2, m2, g2); | |||
855 | ||||
856 | } | |||
857 | } | |||
858 | ||||
859 | /** lsp2mgc: transform LSP to MGC. lsp=C[0..m] mgc=C[0..m] */ | |||
860 | static void lsp2mgc(double *lsp, double *mgc, int m, double alpha) | |||
861 | { | |||
862 | int i; | |||
863 | /* lsp2lpc */ | |||
864 | lsp2lpc(lsp, mgc, m); /* lsp starts in 1! lsp[1..m] --> mgc[0..m] */ | |||
865 | if(use_log_gain) | |||
866 | mgc[0] = exp(lsp[0]); | |||
867 | else | |||
868 | mgc[0] = lsp[0]; | |||
869 | ||||
870 | /* mgc2mgc*/ | |||
871 | if(NORMFLG1) | |||
872 | ignorm(mgc, mgc, m, xgamma); | |||
873 | else if(MULGFLG1) | |||
874 | mgc[0] = (1.0 - mgc[0]) * stage; | |||
875 | ||||
876 | if(MULGFLG1) | |||
877 | for(i=m; i>=1; i--) | |||
878 | mgc[i] *= -stage; | |||
879 | ||||
880 | mgc2mgc(mgc, m, alpha, xgamma, mgc, m, alpha, xgamma); /* input and output is in mgc=C */ | |||
881 | ||||
882 | if(NORMFLG2) | |||
883 | gnorm(mgc, mgc, m, xgamma); | |||
884 | else if(MULGFLG2) | |||
885 | mgc[0] = mgc[0] * xgamma + 1.0; | |||
886 | ||||
887 | if(MULGFLG2) | |||
888 | for(i=m; i>=1; i--) | |||
889 | mgc[i] *= xgamma; | |||
890 | ||||
891 | } | |||
892 | ||||
/**
 * mglsadff: one section of the MGLSA filter (helper for mglsadf).
 *
 * Works on the m + 1 delay values starting at d[d_offset], using the
 * coefficients b[1..m] and warping factor a.  Returns the filtered sample
 * and updates the delay values in place.
 */
static double mglsadff(double x, double *b, int m, double a, double *d, int d_offset)
{
    double *state = d + d_offset;   /* this section's delay line */
    double acc = state[0] * b[1];
    int idx;

    for (idx = 1; idx < m; idx++) {
        state[idx] += a * (state[idx + 1] - state[idx - 1]);
        acc += state[idx] * b[idx + 1];
    }
    x -= acc;

    /* shift the delay line by one position and feed in the new sample */
    for (idx = m; idx > 0; idx--)
        state[idx] = state[idx - 1];
    state[0] = a * state[0] + (1 - a * a) * x;

    return x;
}
912 | ||||
913 | static double mglsadf(double x, double *b, int m, double a, int n, double *d) | |||
914 | { | |||
915 | int i; | |||
916 | for(i=0; i<n; i++) | |||
917 | x = mglsadff(x, b, m, a, d, (i*(m+1))); | |||
918 | ||||
919 | return x; | |||
920 | } | |||
921 | ||||
922 | /** posfilter: postfilter for mel-cepstrum. It uses alpha and beta defined in HMMData */ | |||
923 | static void postfilter_mcp(double *mcp, int m, double alpha, double beta) | |||
924 | { | |||
925 | double *postfilter_buff=NULL__null; /* used in postfiltering */ | |||
926 | int postfilter_size = 0; /* buffer size for postfiltering */ | |||
927 | ||||
928 | double e1, e2; | |||
929 | int k; | |||
930 | ||||
931 | if(beta > 0.0 && m > 1){ | |||
932 | if(postfilter_size < m){ | |||
933 | postfilter_buff = walloc(double,m+1)((double *)safe_walloc(sizeof(double)*(m+1))); | |||
934 | postfilter_size = m; | |||
935 | } | |||
936 | mc2b(mcp, postfilter_buff, m, alpha); | |||
937 | e1 = b2en(postfilter_buff, m, alpha); | |||
938 | ||||
939 | postfilter_buff[1] -= beta * alpha * mcp[2]; | |||
940 | for(k = 2; k < m; k++) | |||
941 | postfilter_buff[k] *= (1.0 +beta); | |||
942 | e2 = b2en(postfilter_buff, m, alpha); | |||
943 | postfilter_buff[0] += log(e1/e2) / 2; | |||
944 | b2mc(postfilter_buff, mcp, m, alpha); | |||
945 | ||||
946 | } | |||
947 | ||||
948 | if (postfilter_buff) | |||
949 | wfree(postfilter_buff); | |||
950 | ||||
951 | } | |||
952 | ||||
/**
 * modShift: wrap n into the range [0, N).
 *
 * Computed with a single remainder operation instead of repeated
 * addition/subtraction, so the cost no longer grows with |n| / N.
 * A non-positive N (for which the old loops never terminated) returns n
 * unchanged.
 */
static int modShift(int n, int N)
{
    int r;

    if (N <= 0)
        return n;

    r = n % N;
    if (r < 0)      /* remainder takes the sign of n; fold it back into range */
        r += N;
    return r;
}
963 | ||||
964 | /** Generate one pitch period from Fourier magnitudes */ | |||
965 | static double *genPulseFromFourierMag(EST_Track *mag, int n, double f0, booleanint aperiodicFlag) | |||
966 | { | |||
967 | ||||
968 | int numHarm = mag->num_channels(); | |||
969 | int i; | |||
970 | int currentF0 = (int)round(f0); | |||
971 | int T, T2; | |||
972 | double *pulse = NULL__null; | |||
973 | ||||
974 | if(currentF0 < 512) | |||
975 | T = 512; | |||
976 | else | |||
977 | T = 1024; | |||
978 | T2 = 2*T; | |||
979 | ||||
980 | /* since is FFT2 no aperiodicFlag or jitter of 25% is applied */ | |||
981 | ||||
982 | /* get the pulse */ | |||
983 | pulse = walloc(double,T)((double *)safe_walloc(sizeof(double)*(T))); | |||
984 | EST_FVector real(T2); | |||
985 | EST_FVector imag(T2); | |||
986 | ||||
987 | /* copy Fourier magnitudes (Wai C. Chu "Speech Coding algorithms foundation and evolution of standardized coders" pg. 460) */ | |||
988 | real[0] = real[T] = 0.0; /* DC component set to zero */ | |||
989 | for(i=1; i<=numHarm; i++){ | |||
990 | real[i] = real[T-i] = real[T+i] = real[T2-i] = mag->a(n, i-1); /* Symetric extension */ | |||
991 | imag[i] = imag[T-i] = imag[T+i] = imag[T2-i] = 0.0; | |||
992 | } | |||
993 | for(i=(numHarm+1); i<(T-numHarm); i++){ /* Default components set to 1.0 */ | |||
994 | real[i] = real[T-i] = real[T+i] = real[T2-i] = 1.0; | |||
995 | imag[i] = imag[T-i] = imag[T+i] = imag[T2-i] = 0.0; | |||
996 | } | |||
997 | ||||
998 | /* Calculate inverse Fourier transform */ | |||
999 | IFFT(real, imag); | |||
1000 | ||||
1001 | /* circular shift and normalise multiplying by sqrt(F0) */ | |||
1002 | double sqrt_f0 = sqrt((float)currentF0); | |||
1003 | for(i=0; i<T; i++) | |||
1004 | pulse[i] = real[modShift(i-numHarm,T)] * sqrt_f0; | |||
1005 | ||||
1006 | return pulse; | |||
1007 | ||||
1008 | } | |||
1009 | ||||
1010 | int htsMLSAVocoder(EST_Track *lf0Pst, | |||
1011 | EST_Track *mcepPst, | |||
1012 | EST_Track *strPst, | |||
1013 | EST_Track *magPst, | |||
1014 | int *voiced, | |||
1015 | HTSData *htsData, | |||
1016 | EST_Wave *wave) | |||
1017 | { | |||
1018 | ||||
1019 | double inc, x; | |||
1020 | double xp=0.0,xn=0.0,fxp,fxn,mix; /* samples for pulse and for noise and the filtered ones */ | |||
1021 | int i, j, k, m, s, mcepframe, lf0frame, s_double; | |||
1022 | double alpha = htsData->alpha; | |||
1023 | double beta = htsData->beta; | |||
1024 | double aa = 1-alpha*alpha; | |||
1025 | int audio_size; /* audio size in samples, calculated as num frames * frame period */ | |||
1026 | double *audio_double = NULL__null; | |||
1027 | double *magPulse = NULL__null; /* pulse generated from Fourier magnitudes */ | |||
1028 | int magSample, magPulseSize; | |||
1029 | booleanint aperiodicFlag = false0; | |||
1030 | ||||
1031 | double *d; /* used in the lpc vocoder */ | |||
1032 | ||||
1033 | double f0, f0Std, f0Shift, f0MeanOri; | |||
1034 | double *mc = NULL__null; /* feature vector for a particular frame */ | |||
1035 | double *hp = NULL__null; /* pulse shaping filter, initialised once it is known orderM */ | |||
1036 | double *hn = NULL__null; /* noise shaping filter, initialised once it is known orderM */ | |||
1037 | ||||
1038 | /* Initialise vocoder and mixed excitation, once initialised it is known the order | |||
1039 | * of the filters so the shaping filters hp and hn can be initialised. */ | |||
1040 | m = mcepPst->num_channels(); | |||
1041 | mc = walloc(double,m)((double *)safe_walloc(sizeof(double)*(m))); | |||
1042 | ||||
1043 | initVocoder(m-1, mcepPst->num_frames(), htsData); | |||
1044 | ||||
1045 | d = walloc(double,m)((double *)safe_walloc(sizeof(double)*(m))); | |||
1046 | if (lpcVocoder) | |||
| ||||
1047 | { | |||
1048 | /* printf("Using LPC vocoder\n"); */ | |||
1049 | for(i=0; i<m; i++) | |||
1050 | d[i] = 0.0; | |||
1051 | } | |||
1052 | mixedExcitation = htsData->useMixExc; | |||
1053 | fourierMagnitudes = htsData->useFourierMag; | |||
1054 | ||||
1055 | if ( mixedExcitation ) | |||
1056 | { | |||
1057 | numM = htsData->NumFilters; | |||
1058 | orderM = htsData->OrderFilters; | |||
1059 | ||||
1060 | xpulseSignal = walloc(double,orderM)((double *)safe_walloc(sizeof(double)*(orderM))); | |||
1061 | xnoiseSignal = walloc(double,orderM)((double *)safe_walloc(sizeof(double)*(orderM))); | |||
1062 | /* initialise xp_sig and xn_sig */ | |||
1063 | for(i=0; i<orderM; i++) | |||
1064 | xpulseSignal[i] = xnoiseSignal[i] = 0; | |||
1065 | ||||
1066 | h = htsData->MixFilters; | |||
1067 | hp = walloc(double,orderM)((double *)safe_walloc(sizeof(double)*(orderM))); | |||
1068 | hn = walloc(double,orderM)((double *)safe_walloc(sizeof(double)*(orderM))); | |||
1069 | ||||
1070 | //Check if the number of filters is equal to the order of strpst | |||
1071 | //i.e. the number of filters is equal to the number of generated strengths per frame. | |||
1072 | #if 0 | |||
1073 | if(numM != strPst->num_channels()) { | |||
1074 | printf("htsMLSAVocoder: error num mix-excitation filters = %d " | |||
1075 | " in configuration file is different from generated str order= %d\n", | |||
1076 | numM, strPst->num_channels()); | |||
1077 | } | |||
1078 | printf("HMM speech generation with mixed-excitation.\n"); | |||
1079 | #endif | |||
1080 | } | |||
1081 | #if 0 | |||
1082 | else | |||
1083 | printf("HMM speech generation without mixed-excitation.\n"); | |||
1084 | ||||
1085 | if( fourierMagnitudes && htsData->PdfMagFile != NULL__null) | |||
1086 | printf("Pulse generated with Fourier Magnitudes.\n"); | |||
1087 | else | |||
1088 | printf("Pulse generated as a unit pulse.\n"); | |||
1089 | ||||
1090 | if(beta != 0.0) | |||
1091 | printf("Postfiltering applied with beta=%f",(float)beta); | |||
1092 | else | |||
1093 | printf("No postfiltering applied.\n"); | |||
1094 | #endif | |||
1095 | ||||
1096 | /* Clear content of c, should be done if this function is | |||
1097 | called more than once with a new set of generated parameters. */ | |||
1098 | for(i=0; i< C_length; i++) | |||
1099 | C[i] = CC[i] = CINC[i] = 0.0; | |||
1100 | for(i=0; i< D1_length; i++) | |||
1101 | D1[i]=0.0; | |||
1102 | ||||
1103 | f0Std = htsData->F0Std; | |||
1104 | f0Shift = htsData->F0Mean; | |||
1105 | f0MeanOri = 0.0; | |||
1106 | ||||
1107 | /* XXX */ | |||
1108 | for (mcepframe=0,lf0frame=0; mcepframe<mcepPst->num_frames(); mcepframe++) | |||
1109 | { | |||
1110 | if(voiced[mcepframe]) | |||
1111 | { /* WAS WRONG */ | |||
1112 | f0MeanOri = f0MeanOri + lf0Pst->a(mcepframe, 0); | |||
1113 | lf0frame++; | |||
1114 | } | |||
1115 | } | |||
1116 | f0MeanOri = f0MeanOri/lf0frame; | |||
1117 | ||||
1118 | /* ____________________Synthesize speech waveforms_____________________ */ | |||
1119 | /* generate Nperiod samples per mcepframe */ | |||
1120 | s = 0; /* number of samples */ | |||
1121 | s_double = 0; | |||
1122 | audio_size = mcepPst->num_frames() * (fprd); | |||
1123 | audio_double = walloc(double,audio_size)((double *)safe_walloc(sizeof(double)*(audio_size))); /* initialise buffer for audio */ | |||
1124 | magSample = 1; | |||
1125 | magPulseSize = 0; | |||
1126 | ||||
1127 | for(mcepframe=0,lf0frame=0; mcepframe<mcepPst->num_frames(); mcepframe++) | |||
1128 | { | |||
1129 | /* get current feature vector mcp */ | |||
1130 | for(i=0; i<m; i++) | |||
1131 | mc[i] = mcepPst->a(mcepframe, i); | |||
1132 | ||||
1133 | /* f0 modification through the MARY audio effects */ | |||
1134 | if(voiced[mcepframe]){ | |||
1135 | f0 = f0Std * lf0Pst->a(mcepframe, 0) + (1-f0Std) * f0MeanOri + f0Shift; | |||
1136 | lf0frame++; | |||
1137 | if(f0 < 0.0) | |||
1138 | f0 = 0.0; | |||
1139 | } | |||
1140 | else{ | |||
1141 | f0 = 0.0; | |||
1142 | } | |||
1143 | ||||
1144 | /* if mixed excitation get shaping filters for this frame */ | |||
1145 | if (mixedExcitation) | |||
1146 | { | |||
1147 | for(j=0; j<orderM; j++) | |||
1148 | { | |||
1149 | hp[j] = hn[j] = 0.0; | |||
1150 | for(i=0; i<numM; i++) | |||
1151 | { | |||
1152 | hp[j] += strPst->a(mcepframe, i) * h[i][j]; | |||
1153 | hn[j] += ( 1 - strPst->a(mcepframe, i) ) * h[i][j]; | |||
1154 | } | |||
1155 | } | |||
1156 | } | |||
1157 | ||||
1158 | /* f0->pitch, in original code here it is used p, so f0=p in the c code */ | |||
1159 | if(f0 != 0.0) | |||
1160 | f0 = rate/f0; | |||
1161 | ||||
1162 | /* p1 is initialised in -1, so this will be done just for the first frame */ | |||
1163 | if( p1 < 0 ) { | |||
1164 | p1 = f0; | |||
1165 | pc = p1; | |||
1166 | /* for LSP */ | |||
1167 | if(stage != 0){ | |||
1168 | if( use_log_gain) | |||
1169 | C[0] = LZERO; | |||
1170 | else | |||
1171 | C[0] = ZERO; | |||
1172 | for(i=0; i<m; i++ ) | |||
1173 | C[i] = i * PI3.14159265358979323846 / m; | |||
1174 | /* LSP -> MGC */ | |||
1175 | lsp2mgc(C, C, (m-1), alpha); | |||
1176 | mc2b(C, C, (m-1), alpha); | |||
1177 | gnorm(C, C, (m-1), xgamma); | |||
1178 | for(i=1; i<m; i++) | |||
1179 | C[i] *= xgamma; | |||
1180 | } | |||
1181 | ||||
1182 | } | |||
1183 | ||||
1184 | if(stage == 0){ | |||
1185 | /* postfiltering, this is done if beta>0.0 */ | |||
1186 | postfilter_mcp(mc, (m-1), alpha, beta); | |||
1187 | /* mc2b: transform mel-cepstrum to MLSA digital filter coefficients */ | |||
1188 | mc2b(mc, CC, (m-1), alpha); | |||
1189 | for(i=0; i<m; i++) | |||
1190 | CINC[i] = (CC[i] - C[i]) * iprd / fprd; | |||
1191 | } else { | |||
1192 | ||||
1193 | lsp2mgc(mc, CC, (m-1), alpha ); | |||
1194 | ||||
1195 | mc2b(CC, CC, (m-1), alpha); | |||
1196 | ||||
1197 | gnorm(CC, CC, (m-1), xgamma); | |||
1198 | ||||
1199 | for(i=1; i<m; i++) | |||
1200 | CC[i] *= xgamma; | |||
1201 | ||||
1202 | for(i=0; i<m; i++) | |||
1203 | CINC[i] = (CC[i] - C[i]) * iprd / fprd; | |||
1204 | ||||
1205 | } | |||
1206 | ||||
1207 | /* p=f0 in c code!!! */ | |||
1208 | if( p1 != 0.0 && f0 != 0.0 ) { | |||
1209 | inc = (f0 - p1) * (double)iprd/(double)fprd; | |||
1210 | //System.out.println(" inc=(f0-p1)/80=" + inc ); | |||
1211 | } else { | |||
1212 | inc = 0.0; | |||
1213 | pc = f0; | |||
1214 | p1 = 0.0; | |||
1215 | } | |||
1216 | ||||
1217 | /* Here need to generate both xp:pulse and xn:noise signals seprately*/ | |||
1218 | gauss = false0; /* Mixed excitation works better with nomal noise */ | |||
1219 | ||||
1220 | /* Generate fperiod samples per feature vector, normally 80 samples per frame */ | |||
1221 | //p1=0.0; | |||
1222 | gauss=false0; | |||
1223 | for(j=fprd-1, i=(iprd+1)/2; j>=0; j--) { | |||
1224 | if(p1 == 0.0) { | |||
1225 | if(gauss) | |||
1226 | x = 0 /* rand.nextGaussian() */; /* XXX returns double, gaussian distribution mean=0.0 and var=1.0 */ | |||
1227 | else | |||
1228 | x = uniformRand(); /* returns 1.0 or -1.0 uniformly distributed */ | |||
1229 | ||||
1230 | if(mixedExcitation) { | |||
1231 | xn = x; | |||
1232 | xp = 0.0; | |||
1233 | } | |||
1234 | } else { | |||
1235 | if( (pc += 1.0) >= p1 ){ | |||
1236 | if(fourierMagnitudes){ | |||
1237 | /* jitter is applied just in voiced frames when the stregth of the first band is < 0.5*/ | |||
1238 | /* this will work just if Radix FFT is used */ | |||
1239 | /*if(strPst.getPar(mcepframe, 0) < 0.5) | |||
1240 | aperiodicFlag = true; | |||
1241 | else | |||
1242 | aperiodicFlag = false; | |||
1243 | magPulse = genPulseFromFourierMagRadix(magPst, mcepframe, p1, aperiodicFlag); | |||
1244 | */ | |||
1245 | ||||
1246 | magPulse = genPulseFromFourierMag(magPst, mcepframe, p1, aperiodicFlag); | |||
1247 | magSample = 0; | |||
1248 | magPulseSize = -27 /* magPulse.length*/; /** XXX **/ | |||
1249 | x = magPulse[magSample]; | |||
1250 | magSample++; | |||
1251 | } else | |||
1252 | x = sqrt(p1); | |||
1253 | ||||
1254 | pc = pc - p1; | |||
1255 | } else { | |||
1256 | ||||
1257 | if(fourierMagnitudes){ | |||
1258 | if(magSample >= magPulseSize ){ | |||
1259 | x = 0.0; | |||
1260 | } | |||
1261 | else | |||
1262 | x = magPulse[magSample]; | |||
1263 | magSample++; | |||
1264 | } else | |||
1265 | x = 0.0; | |||
1266 | } | |||
1267 | ||||
1268 | if(mixedExcitation) { | |||
1269 | xp = x; | |||
1270 | if(gauss) | |||
1271 | xn = 0 /* rand.nextGaussian() */ ; /* XXX */ | |||
1272 | else | |||
1273 | xn = uniformRand(); | |||
1274 | } | |||
1275 | } | |||
1276 | ||||
1277 | /* apply the shaping filters to the pulse and noise samples */ | |||
1278 | /* i need memory of at least for M samples in both signals */ | |||
1279 | if(mixedExcitation) { | |||
1280 | fxp = 0.0; | |||
1281 | fxn = 0.0; | |||
1282 | for(k=orderM-1; k>0; k--) { | |||
1283 | fxp += hp[k] * xpulseSignal[k]; | |||
1284 | fxn += hn[k] * xnoiseSignal[k]; | |||
1285 | xpulseSignal[k] = xpulseSignal[k-1]; | |||
1286 | xnoiseSignal[k] = xnoiseSignal[k-1]; | |||
1287 | } | |||
1288 | fxp += hp[0] * xp; | |||
1289 | fxn += hn[0] * xn; | |||
1290 | xpulseSignal[0] = xp; | |||
1291 | xnoiseSignal[0] = xn; | |||
1292 | ||||
1293 | /* x is a pulse noise excitation and mix is mixed excitation */ | |||
1294 | mix = fxp+fxn; | |||
1295 | ||||
1296 | /* comment this line if no mixed excitation, just pulse and noise */ | |||
1297 | x = mix; /* excitation sample */ | |||
1298 | /* printf("awb_debug me %d %f\n",(int)(s_double),(float)x); */ | |||
1299 | } | |||
1300 | ||||
1301 | if(lpcVocoder){ | |||
1302 | // LPC filter C[k=0] = gain is not used! | |||
1303 | if(!NGAIN) | |||
1304 | x *= C[0]; | |||
1305 | for(k=(m-1); k>1; k--){ | |||
1306 | x = x - (C[k] * d[k]); | |||
1307 | d[k] = d[k-1]; | |||
1308 | } | |||
1309 | x = x - (C[1] * d[1]); | |||
1310 | d[1] = x; | |||
1311 | ||||
1312 | } else if(stage == 0 ){ | |||
1313 | if(x != 0.0 ) | |||
1314 | x *= exp(C[0]); | |||
1315 | x = mlsadf(x, C, m, alpha, aa, D1); | |||
1316 | ||||
1317 | } else { | |||
1318 | if(!NGAIN) | |||
1319 | x *= C[0]; | |||
1320 | x = mglsadf(x, C, (m-1), alpha, stage, D1); | |||
1321 | } | |||
1322 | ||||
1323 | audio_double[s_double] = x; | |||
1324 | s_double++; | |||
1325 | ||||
1326 | if((--i) == 0 ) { | |||
1327 | p1 += inc; | |||
1328 | for(k=0; k<m; k++){ | |||
1329 | C[k] += CINC[k]; | |||
1330 | } | |||
1331 | i = iprd; | |||
1332 | } | |||
1333 | } /* for each sample in a period fprd */ | |||
1334 | ||||
1335 | p1 = f0; | |||
1336 | ||||
1337 | /* move elements in c */ | |||
1338 | /* HTS_movem(v->cc, v->c, m + 1); */ | |||
1339 | for(i=0; i<m; i++){ | |||
1340 | C[i] = CC[i]; | |||
1341 | } | |||
1342 | ||||
1343 | } /* for each mcep frame */ | |||
1344 | ||||
1345 | /* printf("Finish processing %d mcep frames.\n",mcepframe); */ | |||
1346 | ||||
1347 | wave->resize(audio_size,1); | |||
1348 | for (i=0; i<s_double; i++) | |||
1349 | wave->a(i) = (short)audio_double[i]; | |||
1350 | ||||
1351 | return 0; | |||
1352 | ||||
1353 | } /* method htsMLSAVocoder() */ | |||
1354 | ||||
1355 |