File: | modules/MultiSyn/DiphoneUnitVoice.cc |
Location: | line 379, column 10 |
Description: | Called C++ object pointer is null |
1 | /*************************************************************************/ | |||
2 | /* */ | |||
3 | /* Centre for Speech Technology Research */ | |||
4 | /* (University of Edinburgh, UK) and */ | |||
5 | /* Korin Richmond */ | |||
6 | /* Copyright (c) 2002 */ | |||
7 | /* All Rights Reserved. */ | |||
8 | /* */ | |||
9 | /* Permission is hereby granted, free of charge, to use and distribute */ | |||
10 | /* this software and its documentation without restriction, including */ | |||
11 | /* without limitation the rights to use, copy, modify, merge, publish, */ | |||
12 | /* distribute, sublicense, and/or sell copies of this work, and to */ | |||
13 | /* permit persons to whom this work is furnished to do so, subject to */ | |||
14 | /* the following conditions: */ | |||
15 | /* */ | |||
16 | /* 1. The code must retain the above copyright notice, this list of */ | |||
17 | /* conditions and the following disclaimer. */ | |||
18 | /* 2. Any modifications must be clearly marked as such. */ | |||
19 | /* 3. Original authors' names are not deleted. */ | |||
20 | /* 4. The authors' names are not used to endorse or promote products */ | |||
21 | /* derived from this software without specific prior written */ | |||
22 | /* permission. */ | |||
23 | /* */ | |||
24 | /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */ | |||
25 | /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */ | |||
26 | /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT */ | |||
27 | /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */ | |||
28 | /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */ | |||
29 | /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */ | |||
30 | /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */ | |||
31 | /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */ | |||
32 | /* THIS SOFTWARE. */ | |||
33 | /* */ | |||
34 | /*************************************************************************/ | |||
35 | /* */ | |||
36 | /* Author: Korin Richmond */ | |||
37 | /* Date: Aug 2002 */ | |||
38 | /* --------------------------------------------------------------------- */ | |||
39 | /* first stab at a diphone unit selection "voice" - using a list of */ | |||
40 | /* utterance objects */ | |||
41 | /*************************************************************************/ | |||
42 | ||||
43 | #include "festival.h" | |||
44 | #include "DiphoneUnitVoice.h" | |||
45 | #include "DiphoneVoiceModule.h" | |||
46 | #include "EST_DiphoneCoverage.h" | |||
47 | #include "EST_rw_status.h" | |||
48 | #include "EST_viterbi.h" | |||
49 | #include "EST_Track.h" | |||
50 | #include "EST_track_aux.h" | |||
51 | #include "EST_Wave.h" | |||
52 | #include "EST_THash.h" | |||
53 | #include "EST_TList.h" | |||
54 | #include "EST_types.h" | |||
55 | #include "ling_class/EST_Utterance.h" | |||
56 | #include "siod.h" | |||
57 | #include "siod_est.h" | |||
58 | #include "safety.h" | |||
59 | #include <cstdlib> | |||
60 | ||||
61 | #include "EST_TargetCost.h" | |||
62 | #include "TargetCostRescoring.h" | |||
63 | #include "EST_JoinCost.h" | |||
64 | #include "EST_JoinCostCache.h" | |||
65 | ||||
66 | #include "EST_Val.h" | |||
67 | ||||
68 | using namespace std; | |||
69 | ||||
70 | ||||
71 | SIOD_REGISTER_TYPE(itemlist,ItemList)ItemList *itemlist(LISP x) { return itemlist(val(x)); } int itemlist_p (LISP x) { if (val_p(x) && (val_type_itemlist == val( x).type())) return (1==1); else return (1==0); } LISP siod(const ItemList *v) { if (v == 0) return ((struct obj *) 0); else return siod(est_val(v)); } | |||
72 | VAL_REGISTER_TYPE(itemlist,ItemList)val_type val_type_itemlist="itemlist"; ItemList *itemlist(const EST_Val &v) { if (v.type() == val_type_itemlist) return ( ItemList *)v.internal_ptr(); else (EST_error_where = __null), (*EST_error_func)("val not of type val_type_""itemlist"); return __null; } static void val_delete_itemlist(void *v) { delete ( ItemList *)v; } EST_Val est_val(const ItemList *v) { return EST_Val (val_type_itemlist, (void *)v,val_delete_itemlist); } | |||
73 | ||||
74 | // from src/modules/UniSyn_diphone/us_diphone.h | |||
75 | // this won't be staying here long... | |||
76 | void parse_diphone_times(EST_Relation &diphone_stream, | |||
77 | EST_Relation &source_lab); | |||
78 | ||||
79 | SIOD_REGISTER_CLASS(du_voice,DiphoneUnitVoice)class DiphoneUnitVoice *du_voice(LISP x) { return du_voice(val (x)); } int du_voice_p(LISP x) { if (val_p(x) && (val_type_du_voice == val(x).type())) return (1==1); else return (1==0); } LISP siod(const class DiphoneUnitVoice *v) { if (v == 0) return ( (struct obj *) 0); else return siod(est_val(v)); } | |||
80 | VAL_REGISTER_CLASS(du_voice,DiphoneUnitVoice)val_type val_type_du_voice="du_voice"; class DiphoneUnitVoice *du_voice(const EST_Val &v) { if (v.type() == val_type_du_voice ) return (class DiphoneUnitVoice *)v.internal_ptr(); else (EST_error_where = __null), (*EST_error_func)("val not of type val_type_""du_voice" ); return __null; } static void val_delete_du_voice(void *v) { delete (class DiphoneUnitVoice *)v; } EST_Val est_val(const class DiphoneUnitVoice *v) { return EST_Val(val_type_du_voice, (void *)v,val_delete_du_voice); } | |||
81 | ||||
82 | static void my_parse_diphone_times(EST_Relation &diphone_stream, | |||
83 | EST_Relation &source_lab) | |||
84 | { | |||
85 | EST_Item *s, *u; | |||
86 | float dur1, dur_u, p_time=0.0; | |||
87 | ||||
88 | // NOTE: because of the extendLeft/extendRight phone join hack for missing diphones, | |||
89 | // the unit linked list *may be* shorter that the segment list. | |||
90 | //(admittedly could cause confusion) | |||
91 | ||||
92 | for( s=source_lab.head(), u=diphone_stream.head(); (u!=0)&&(s!=0); u=u->next(), s=s->next()){ | |||
93 | EST_Track *pm = track(u->f("coefs")); | |||
94 | ||||
95 | int end_frame = pm->num_frames() - 1; | |||
96 | int mid_frame = u->I("middle_frame"); | |||
97 | ||||
98 | dur1 = pm->t(mid_frame); | |||
99 | dur_u = pm->t(end_frame); | |||
100 | ||||
101 | s->set("end", (p_time+dur1) ); | |||
102 | ||||
103 | p_time += dur_u; | |||
104 | u->set("end", p_time); | |||
105 | ||||
106 | if( u->f_present("extendRight") ){//because diphone squeezed out (see above) | |||
107 | s = s->next(); | |||
108 | s->set("end", p_time ); | |||
109 | } | |||
110 | } | |||
111 | ||||
112 | if(s) | |||
113 | s->set("end", (p_time)); | |||
114 | } | |||
115 | ||||
116 | // temporary hack necessary because decoder can only take a | |||
117 | // function pointer (would be better to relax this restriction in | |||
118 | // the EST_Viterbi_Decoder class, or in a replacement class, rather | |||
119 | // than using this hack) | |||
120 | static DiphoneUnitVoice *globalTempVoicePtr = 0; | |||
121 | ||||
122 | DiphoneUnitVoice::DiphoneUnitVoice( const EST_StrList& basenames, | |||
123 | const EST_String& uttDir, | |||
124 | const EST_String& wavDir, | |||
125 | const EST_String& pmDir, | |||
126 | const EST_String& coefDir, | |||
127 | unsigned int sr, | |||
128 | const EST_String& uttExt, | |||
129 | const EST_String& wavExt, | |||
130 | const EST_String& pmExt, | |||
131 | const EST_String& coefExt ) | |||
132 | : pruning_beam( -1 ), | |||
133 | ob_pruning_beam( -1 ), | |||
134 | tc_rescoring_beam( -1 ), | |||
135 | tc_rescoring_weight( 0.0 ), | |||
136 | tc_weight( 1.0 ), | |||
137 | jc_weight( 1.0 ), | |||
138 | jc_f0_weight( 1.0 ), | |||
139 | jc_power_weight( 1.0 ), | |||
140 | jc_spectral_weight( 1.0 ), | |||
141 | prosodic_modification( 0 ), | |||
142 | wav_srate( sr ), | |||
143 | jc( 0 ), | |||
144 | jc_delete( false ), | |||
145 | tc( 0 ), | |||
146 | tc_delete( false ), | |||
147 | tcdh( 0 ) | |||
148 | ||||
149 | { | |||
150 | // make the default voice module with the supplied parameters | |||
151 | addVoiceModule( basenames, uttDir, wavDir, pmDir, coefDir, | |||
152 | wav_srate, | |||
153 | uttExt, wavExt, pmExt, coefExt ); | |||
154 | ||||
155 | diphone_backoff_rules = 0; | |||
156 | } | |||
157 | ||||
158 | void DiphoneUnitVoice::initialise( bool ignore_bad_tag ) | |||
159 | { | |||
160 | if( jc == 0 ) | |||
161 | EST_error(EST_error_where = __null), (*EST_error_func)( "Need to set join cost calculator for voice" ); | |||
162 | ||||
163 | if( tc == 0 ) | |||
164 | EST_error(EST_error_where = __null), (*EST_error_func)( "Need to set target cost calculator for voice" ); | |||
165 | ||||
166 | EST_TList<DiphoneVoiceModule*>::Entries it; | |||
167 | ||||
168 | for( it.begin(voiceModules); it; it++ ) | |||
169 | (*it)->initialise( tc, ignore_bad_tag ); | |||
170 | } | |||
171 | ||||
172 | bool DiphoneUnitVoice::addVoiceModule( const EST_StrList& basenames, | |||
173 | const EST_String& uttDir, | |||
174 | const EST_String& wavDir, | |||
175 | const EST_String& pmDir, | |||
176 | const EST_String& coefDir, | |||
177 | unsigned int srate, | |||
178 | const EST_String& uttExt, | |||
179 | const EST_String& wavExt, | |||
180 | const EST_String& pmExt, | |||
181 | const EST_String& coefExt ) | |||
182 | ||||
183 | { | |||
184 | DiphoneVoiceModule *vm; | |||
185 | ||||
186 | if( srate != wav_srate ) | |||
187 | EST_error(EST_error_where = __null), (*EST_error_func)( "Voice samplerate: %d\nmodule samplerate: %d", | |||
188 | wav_srate, srate ); | |||
189 | ||||
190 | vm = new DiphoneVoiceModule( basenames, uttDir, wavDir, pmDir, coefDir, | |||
191 | srate, | |||
192 | uttExt, wavExt, pmExt, coefExt ); | |||
193 | CHECK_PTR(vm)if((vm)==0){ (EST_error_where = __null), (*EST_error_func)("memory allocation failed (file %s, line %d)" , "DiphoneUnitVoice.cc",193);}; | |||
194 | ||||
195 | registerVoiceModule( vm ); | |||
196 | ||||
197 | return true; | |||
198 | } | |||
199 | ||||
200 | ||||
201 | void DiphoneUnitVoice::registerVoiceModule( DiphoneVoiceModule *vm ) | |||
202 | { | |||
203 | voiceModules.append( vm ); | |||
204 | } | |||
205 | ||||
206 | ||||
207 | void DiphoneUnitVoice::setJoinCost( EST_JoinCost *jcost, bool del ) | |||
208 | { | |||
209 | if( jc_delete == true ) | |||
210 | if( jc != 0 ) | |||
211 | delete jc; | |||
212 | ||||
213 | jc = jcost; | |||
214 | jc_delete = del; | |||
215 | } | |||
216 | ||||
217 | void DiphoneUnitVoice::setTargetCost( EST_TargetCost *tcost, bool del ) | |||
218 | { | |||
219 | if( tc_delete == true ) | |||
220 | if( tc != 0 ) | |||
221 | delete tc; | |||
222 | ||||
223 | tc = tcost; | |||
224 | tc_delete = del; | |||
225 | } | |||
226 | ||||
227 | ||||
228 | DiphoneUnitVoice::~DiphoneUnitVoice() | |||
229 | { | |||
230 | EST_TList<DiphoneVoiceModule*>::Entries it; | |||
231 | ||||
232 | for( it.begin(voiceModules); it; it++ ) | |||
233 | delete( *it ); | |||
234 | ||||
235 | if(diphone_backoff_rules) | |||
236 | delete diphone_backoff_rules; | |||
237 | ||||
238 | if( jc_delete == true ) | |||
239 | if( jc != 0 ) | |||
240 | delete jc; | |||
241 | ||||
242 | if( tc_delete == true ) | |||
243 | if( tc != 0 ) | |||
244 | delete tc; | |||
245 | ||||
246 | if(tcdh) | |||
247 | delete tcdh; | |||
248 | ||||
249 | } | |||
250 | ||||
251 | ||||
252 | void DiphoneUnitVoice::addToCatalogue( const EST_Utterance *utt ) | |||
253 | { | |||
254 | // needed? | |||
255 | } | |||
256 | ||||
257 | ||||
258 | void DiphoneUnitVoice::getDiphone( const EST_VTCandidate *cand, | |||
259 | EST_Track* coef, EST_Wave* sig, int *midframe, | |||
260 | bool extendLeft, bool extendRight ) | |||
261 | { | |||
262 | // The need for this function in this class is a bit messy, it would be far | |||
263 | // nicer just to be able to ask the Candidate itself to hand over the relevant | |||
264 | // synthesis parameters. In future, it will work that way ;) | |||
265 | ||||
266 | // put there by DiphoneVoiceModule::getCandidateList | |||
267 | const DiphoneCandidate *diphcand = diphonecandidate( cand->name ); | |||
268 | ||||
269 | const DiphoneVoiceModule* parentModule = diphcand->dvm; | |||
270 | EST_Item *firstPhoneInDiphone = cand->s; | |||
271 | ||||
272 | // need to call right getDiphone to do the actual work | |||
273 | parentModule->getDiphone( firstPhoneInDiphone, coef, sig, midframe, extendLeft, extendRight ); | |||
274 | } | |||
275 | ||||
276 | // REQUIREMENT: the unit relation must have previously been used to initialise the | |||
277 | // Viterbi decoder from which the path was produced. | |||
278 | void DiphoneUnitVoice::fillUnitRelation( EST_Relation *units, const EST_VTPath *path ) | |||
279 | { | |||
280 | EST_Item *it=units->tail(); | |||
281 | ||||
282 | for ( ; path != 0 && it != 0; path=path->from, it=it->prev() ){ | |||
283 | EST_Track *coefs = new EST_Track; | |||
284 | CHECK_PTR(coefs)if((coefs)==0){ (EST_error_where = __null), (*EST_error_func) ("memory allocation failed (file %s, line %d)", "DiphoneUnitVoice.cc" ,284);}; | |||
285 | EST_Wave *sig = new EST_Wave; | |||
286 | CHECK_PTR(sig)if((sig)==0){ (EST_error_where = __null), (*EST_error_func)("memory allocation failed (file %s, line %d)" , "DiphoneUnitVoice.cc",286);}; | |||
287 | int midf; | |||
288 | ||||
289 | getDiphone( path->c, coefs, sig, &midf, | |||
290 | it->f_present("extendLeft"), it->f_present("extendRight")); | |||
291 | ||||
292 | EST_Item *firstPhoneInDiphone = path->c->s; | |||
293 | it->set_val( "sig", est_val( sig ) ); | |||
294 | it->set_val( "coefs", est_val( coefs ) ); | |||
295 | it->set( "middle_frame", midf ); | |||
296 | it->set( "source_utt", firstPhoneInDiphone->relation()->utt()->f.S("fileid")); | |||
297 | it->set_val( "source_ph1", est_val( firstPhoneInDiphone )); | |||
298 | it->set( "source_end", firstPhoneInDiphone->F("end")); | |||
299 | it->set( "target_cost", path->c->score ); | |||
300 | ||||
301 | //have to recalculate join cost as it's not currently saved anywhere | |||
302 | if( path->from == 0 ) | |||
303 | it->set( "join_cost", 0.0); | |||
304 | else{ | |||
305 | // join cost between right edge of left diphone and vice versa | |||
306 | const DiphoneCandidate *l_diph = diphonecandidate(path->from->c->name); | |||
307 | const DiphoneCandidate *r_diph = diphonecandidate(path->c->name); | |||
308 | ||||
309 | it->set( "join_cost", (*jc)( l_diph, r_diph ) ); | |||
310 | } | |||
311 | } | |||
312 | } | |||
313 | ||||
314 | // The use of globalTempVoicePtr in this function is really just a temporary hack | |||
315 | // necessary because the decoder as it stands at present can only take a function pointer | |||
316 | // (would be better to relax this restriction in the EST_Viterbi_Decoder class, or in a | |||
317 | // replacement class, rather than using this hack) | |||
318 | // static EST_VTPath* extendPath( EST_VTPath *p, EST_VTCandidate *c, | |||
319 | // EST_Features&) | |||
320 | // { | |||
321 | // EST_VTPath *np = new EST_VTPath; | |||
322 | // CHECK_PTR(np); | |||
323 | ||||
324 | // if( globalTempVoicePtr ==0 ) | |||
325 | // EST_error( "globalTempVoicePtr is not set, can't continue" ); | |||
326 | ||||
327 | // const EST_JoinCost &jcost = globalTempVoicePtr->getJoinCostCalculator(); | |||
328 | ||||
329 | // np->c = c; | |||
330 | // np->from = p; | |||
331 | // np->state = c->pos; | |||
332 | ||||
333 | // if ((p == 0) || (p->c == 0)) | |||
334 | // np->score = c->score; | |||
335 | // else{ | |||
336 | // // join cost between right edge of left diphone and vice versa | |||
337 | // np->score = p->score + c->score + jcost( p->c->s->next(), c->s ); | |||
338 | // } | |||
339 | // return np; | |||
340 | // } | |||
341 | static EST_VTPath* extendPath( EST_VTPath *p, EST_VTCandidate *c, | |||
342 | EST_Features&) | |||
343 | { | |||
344 | EST_VTPath *np = new EST_VTPath; | |||
345 | CHECK_PTR(np)if((np)==0){ (EST_error_where = __null), (*EST_error_func)("memory allocation failed (file %s, line %d)" , "DiphoneUnitVoice.cc",345);}; | |||
346 | ||||
347 | if( globalTempVoicePtr ==0 ) | |||
348 | EST_error(EST_error_where = __null), (*EST_error_func)( "globalTempVoicePtr is not set, can't continue" ); | |||
349 | ||||
350 | const EST_JoinCost &jcost = globalTempVoicePtr->getJoinCostCalculator(); | |||
351 | ||||
352 | np->c = c; | |||
353 | np->from = p; | |||
354 | np->state = c->pos; | |||
355 | ||||
356 | if ((p == 0) || (p->c == 0)) | |||
357 | np->score = c->score; | |||
358 | else{ | |||
359 | const DiphoneCandidate *l_diph = diphonecandidate(p->c->name); | |||
360 | const DiphoneCandidate *r_diph = diphonecandidate(c->name); | |||
361 | ||||
362 | // join cost between right edge of left diphone and vice versa | |||
363 | np->score = p->score + c->score + jcost( l_diph, r_diph ); | |||
364 | } | |||
365 | return np; | |||
366 | } | |||
367 | ||||
368 | // This function is a really just a temporary hack necessary because the decoder | |||
369 | // as it stands at present can only take a function pointer (would be better to relax | |||
370 | // this restriction in the EST_Viterbi_Decoder class, or in a replacement class, rather | |||
371 | // than using this hack) | |||
372 | static EST_VTCandidate* getCandidatesFunction( EST_Item *s, | |||
373 | EST_Features &f) | |||
374 | { | |||
375 | DiphoneUnitVoice *duv = globalTempVoicePtr; | |||
| ||||
376 | if( duv==0 ) | |||
377 | EST_error(EST_error_where = __null), (*EST_error_func)( "Candidate source voice is unset" ); | |||
378 | ||||
379 | return duv->getCandidates( s, f ); | |||
| ||||
380 | } | |||
381 | ||||
382 | // Function which, given an item from the timeline relation that | |||
383 | // was originally used to initialise the EST_Viterbi_Decoder | |||
384 | // returns a pointer to a linked list of EST_VTCandidates | |||
385 | // (this is provided to the viterbi decoder upon its construction | |||
386 | // and (in)directly called by it as part of the decoding process...) | |||
387 | EST_VTCandidate* DiphoneUnitVoice::getCandidates( EST_Item *s, | |||
388 | EST_Features &f) const | |||
389 | { | |||
390 | EST_VTCandidate *c = 0; | |||
391 | EST_VTCandidate *moduleListHead = 0; | |||
392 | EST_VTCandidate *moduleListTail = 0; | |||
393 | ||||
394 | // these objects [c/sh]ould be a parameter visible in the user's script | |||
395 | // land, and will be in future... | |||
396 | ||||
397 | // tc now a member | |||
398 | // EST_DefaultTargetCost default_target_cost; | |||
399 | // EST_TargetCost *tc = &default_target_cost; | |||
400 | // or | |||
401 | // EST_SchemeTargetCost scheme_target_cost(rintern( "targetcost")); | |||
402 | // EST_TargetCost *tc = &scheme_target_cost; | |||
403 | ||||
404 | EST_TList<DiphoneVoiceModule*>::Entries module_iter; | |||
405 | int nfound, total=0; | |||
406 | ||||
407 | //////////////////////////////////////////////////////////////// | |||
408 | // join linked list of candidates from each module into one list | |||
409 | for( module_iter.begin(voiceModules); module_iter; module_iter++ ){ | |||
410 | nfound = (*module_iter)->getCandidateList( *s, | |||
411 | tc, | |||
412 | tcdh, | |||
413 | tc_weight, | |||
414 | &moduleListHead, | |||
415 | &moduleListTail ); | |||
416 | if( nfound>0 ){ | |||
417 | moduleListTail->next = c; | |||
418 | c = moduleListHead; | |||
419 | total += nfound; | |||
420 | } | |||
421 | } | |||
422 | ||||
423 | if( total==0 ) | |||
424 | EST_error(EST_error_where = __null), (*EST_error_func)( "Couldn't find diphone %s", (const char*)s->S("name") ); | |||
425 | ||||
426 | if( verbosity() > 0 ) | |||
427 | printf( "Number of candidates found for target \"%s\": %d\n", | |||
428 | (const char*)s->S("name"), total ); | |||
429 | ||||
430 | if( ! ((tc_rescoring_beam == -1.0) || (tc_rescoring_weight <= 0.0)) ) | |||
431 | rescoreCandidates( c, tc_rescoring_beam, tc_rescoring_weight ); | |||
432 | ||||
433 | return c; | |||
434 | } | |||
435 | ||||
436 | void DiphoneUnitVoice::diphoneCoverage(const EST_String filename) const | |||
437 | { | |||
438 | ||||
439 | EST_DiphoneCoverage dc; | |||
440 | EST_TList<DiphoneVoiceModule*>::Entries module_iter; | |||
441 | ||||
442 | // for each module | |||
443 | for( module_iter.begin(voiceModules); module_iter; module_iter++ ) | |||
444 | (*module_iter)->getDiphoneCoverageStats(&dc); | |||
445 | ||||
446 | dc.print_stats(filename); | |||
447 | ||||
448 | } | |||
449 | ||||
450 | ||||
451 | ||||
452 | bool DiphoneUnitVoice::synthesiseWave( EST_Utterance *utt ) | |||
453 | { | |||
454 | getUnitSequence( utt ); | |||
455 | ||||
456 | return true; | |||
457 | } | |||
458 | ||||
459 | ||||
460 | ||||
461 | void DiphoneUnitVoice::getUnitSequence( EST_Utterance *utt ) | |||
462 | { | |||
463 | EST_Relation *segs = utt->relation( "Segment" ); | |||
464 | EST_Relation *units = utt->create_relation( "Unit" ); | |||
465 | ||||
466 | if(!tcdh) | |||
467 | tcdh = new TCDataHash(20); | |||
468 | else | |||
469 | tcdh->clear(); | |||
470 | ||||
471 | // Initialise the Unit relation time index for decoder | |||
472 | EST_String diphone_name; | |||
473 | EST_StrList missing_diphones; | |||
474 | ||||
475 | EST_Item *it=segs->head(); | |||
476 | if( it == 0 ) | |||
477 | EST_error(EST_error_where = __null), (*EST_error_func)( "Segment relation is empty" ); | |||
478 | ||||
479 | bool extendLeftFlag = false; | |||
480 | for( ; it->next(); it=it->next() ) | |||
481 | { | |||
482 | EST_String l = it->S("name"); | |||
483 | EST_String r = it->next()->S("name"); | |||
484 | ||||
485 | EST_String diphone_name = EST_String::cat(l,"_",r); | |||
486 | EST_String orig = diphone_name; | |||
487 | ||||
488 | if(tc->is_flatpack()) | |||
489 | tcdh->add_item( it , ((EST_FlatTargetCost *)tc)->flatpack(it) ); | |||
490 | ||||
491 | ||||
492 | // First attempt back off: | |||
493 | // If missing diphone is an interword diphone, insert a silence! | |||
494 | // Perceptual results say this is prefered. | |||
495 | ||||
496 | if ( diphone_name != EST_String::Empty && | |||
497 | !this->unitAvailable(diphone_name) ) | |||
498 | { | |||
499 | EST_Item *s1,*s2; | |||
500 | EST_Item *w1=0,*w2=0; | |||
501 | ||||
502 | cout << "Missing diphone: "<< diphone_name << endl; | |||
503 | ||||
504 | if((s1 = parent(it,"SylStructure"))) | |||
505 | w1= parent(s1,"SylStructure"); | |||
506 | if( (s2 = parent(it->next(),"SylStructure"))) | |||
507 | w2= parent(s2,"SylStructure"); | |||
508 | ||||
509 | if( w1 && w2 && (w1 != w2) ) | |||
510 | { | |||
511 | EST_Item *sil; | |||
512 | ||||
513 | cout << " Interword so inseting silence.\n"; | |||
514 | ||||
515 | sil = it->insert_after(); | |||
516 | sil->set("name",ph_silence()); | |||
517 | ||||
518 | r = it->next()->S("name"); | |||
519 | diphone_name = EST_String::cat(l,"_",r); | |||
520 | ||||
521 | } | |||
522 | } | |||
523 | ||||
524 | ||||
525 | // Simple back off. | |||
526 | // Change diphone name for one we actually have. | |||
527 | ||||
528 | while(diphone_name != EST_String::Empty && | |||
529 | !this->unitAvailable(diphone_name) && | |||
530 | diphone_backoff_rules) | |||
531 | { | |||
532 | ||||
533 | cout << " diphone still missing, backing off: " << diphone_name << endl; | |||
534 | ||||
535 | diphone_name = diphone_backoff_rules->backoff(l,r); | |||
536 | l = diphone_name.before("_"); | |||
537 | r = diphone_name.after("_"); | |||
538 | ||||
539 | cout << " backed off: " << orig << " -> " << diphone_name << endl; | |||
540 | ||||
541 | if( verbosity() > 0 ){ | |||
542 | EST_warning(EST_error_where = __null), (*EST_warning_func)("Backing off requested diphone %s to %s", | |||
543 | orig.str(), | |||
544 | diphone_name.str() ); | |||
545 | } | |||
546 | } | |||
547 | ||||
548 | ||||
549 | //// Complex backoff. Changes the segment stream to the right, | |||
550 | //// may still leave a discontinuity to the left. This could be | |||
551 | //// fixed, but it would requires a better search. Rob's thoughts | |||
552 | //// are that the simple method works better, unless it resorts to | |||
553 | //// a bad default rule. | |||
554 | ||||
555 | ||||
556 | // while(!this->unitAvailable(diphone_name) && | |||
557 | // diphone_backoff_rules && | |||
558 | // !diphone_backoff_rules->backoff(it)) | |||
559 | // diphone_name = EST_String::cat(it->S("name"),"_",it->next()->S("name")); | |||
560 | ||||
561 | if( !this->unitAvailable( diphone_name ) ){ | |||
562 | missing_diphones.append( diphone_name ); | |||
563 | if(units->tail()) | |||
564 | units->tail()->set( "extendRight", 1 ); | |||
565 | extendLeftFlag = true; // trigger for next unit to make up second half of missing diphone | |||
566 | } | |||
567 | else{ | |||
568 | EST_Item *t = units->append(); | |||
569 | t->set( "name", diphone_name ); | |||
570 | if(orig != diphone_name) | |||
571 | t->set( "missing_diphone",orig); | |||
572 | t->set_val( "ph1", est_val(it) ); | |||
573 | if( extendLeftFlag == true ){ | |||
574 | t->set( "extendLeft", 1 ); | |||
575 | extendLeftFlag = false; | |||
576 | } | |||
577 | } | |||
578 | } | |||
579 | ||||
580 | // stop if necessary units are still missing. | |||
581 | if( missing_diphones.length() > 0 ){ | |||
582 | for( EST_Litem *it=missing_diphones.head(); it!=0 ; it=it->next() ) | |||
583 | printf( "requested diphone missing: %s\n", missing_diphones(it).str() ); | |||
584 | ||||
585 | EST_warning(EST_error_where = __null), (*EST_warning_func)("Making phone joins to compensate..."); | |||
586 | // EST_error("Unable to synthesise utterance due to missing diphones"); | |||
587 | } | |||
588 | ||||
589 | // Make the decoder do its thing | |||
590 | // -1 means number of states at each time point not fixed | |||
591 | EST_Viterbi_Decoder v( getCandidatesFunction, extendPath, -1 ); | |||
592 | ||||
593 | // turn on pruning if necessary | |||
594 | if( (pruning_beam>0) || (ob_pruning_beam>0) ) | |||
595 | v.set_pruning_parameters( pruning_beam, ob_pruning_beam ); | |||
596 | ||||
597 | // temporary hack necessary because decoder can only take a | |||
598 | // function pointer (would be better to relax this restriction in | |||
599 | // the EST_Viterbi_Decoder class, or in a replacement class, rather | |||
600 | // than using this hack) | |||
601 | globalTempVoicePtr = this; | |||
602 | ||||
603 | v.set_big_is_good(false); | |||
604 | ||||
605 | if( verbosity() > 0 ) | |||
606 | v.turn_on_trace(); | |||
607 | ||||
608 | v.initialise( units ); | |||
609 | v.search(); | |||
610 | ||||
611 | // take hold of the best path (end thereof) | |||
612 | EST_VTPath *bestp=0; | |||
613 | if( !v.result( &bestp ) ) | |||
614 | EST_error(EST_error_where = __null), (*EST_error_func)( "No best candidate sequence found" ); | |||
615 | ||||
616 | // fill in the best path features in the Unit Relation | |||
617 | fillUnitRelation( units, bestp ); | |||
618 | ||||
619 | my_parse_diphone_times( *units, *segs ); | |||
620 | } | |||
621 | ||||
622 | ||||
623 | ///////////////////////////////////////////////////////////////////////////////////// | |||
624 | // Canned example experimental code (proof of concept rather than intelligently done) | |||
625 | ||||
626 | static inline bool itemListContainsItem( const ItemList* il, const EST_Item *item ) | |||
627 | { | |||
628 | ItemList::Entries it; | |||
629 | ||||
630 | for( it.begin( *il ); it; it++ ) | |||
631 | if( (*it) == item ) | |||
632 | return true; | |||
633 | ||||
634 | return false; | |||
635 | } | |||
636 | ||||
637 | ||||
638 | static EST_VTCandidate* getCandidatesWithOmissionsFunction( EST_Item *s, EST_Features &f ) | |||
639 | { | |||
640 | DiphoneUnitVoice *duv = globalTempVoicePtr; | |||
641 | if( duv==0 ) | |||
642 | EST_error(EST_error_where = __null), (*EST_error_func)( "Candidate source voice is unset" ); | |||
643 | ||||
644 | //get candidate list as usual | |||
645 | EST_VTCandidate *candlist = duv->getCandidates( s, f ); | |||
646 | ||||
647 | //filter out candidates on basis of omission list (yes, this is quite dumb) | |||
648 | if( s->f_present( "omitlist" ) ){ | |||
649 | ||||
650 | EST_warning(EST_error_where = __null), (*EST_warning_func)( "omitlist found in unit %s", s->S("name").str() ); | |||
651 | ||||
652 | ItemList *omitlist = itemlist( s->f("omitlist") ); | |||
653 | ||||
654 | //until one candidate remains as head (to keep hold of list head) | |||
655 | while( candlist != 0 && itemListContainsItem( omitlist, candlist->s ) ){ | |||
656 | EST_VTCandidate *del_cand = candlist; | |||
657 | candlist = candlist->next; | |||
658 | del_cand->next = 0; //so deletion doesn't trigger total list deletion | |||
659 | delete del_cand; | |||
660 | } | |||
661 | ||||
662 | //then continue down list | |||
663 | EST_VTCandidate *prev = candlist; | |||
664 | EST_VTCandidate *cand = candlist->next; | |||
665 | while( cand!=0 ){ | |||
666 | if( itemListContainsItem( omitlist, cand->s ) ){ //delete cand on true | |||
667 | prev->next = cand->next; | |||
668 | cand->next = 0; //so deletion doesn't trigger total list deletion | |||
669 | delete cand; | |||
670 | cand = prev; | |||
671 | } | |||
672 | cand = cand->next; | |||
673 | } | |||
674 | ||||
675 | if( candlist == 0 ) | |||
676 | EST_error(EST_error_where = __null), (*EST_error_func)( "zero candidates remain after filtering" ); | |||
677 | ||||
678 | } | |||
679 | ||||
680 | return candlist; | |||
681 | } | |||
682 | ||||
683 | // For when the utterance already has the unit sequence, with certain candidates | |||
684 | // flagged as to be avoided, or mandatory and so on... | |||
685 | void DiphoneUnitVoice::regetUnitSequence( EST_Utterance *utt ) | |||
686 | { | |||
687 | // Unit relation should already be in existence for decoder | |||
688 | EST_Relation *units = utt->relation( "Unit" ); | |||
689 | EST_Item *it=units->head(); | |||
690 | if( it == 0 ) | |||
691 | EST_error(EST_error_where = __null), (*EST_error_func)( "Unit relation is empty" ); | |||
692 | ||||
693 | // Make the decoder do its thing (again) | |||
694 | // -1 means number of states at each time point not fixed | |||
695 | EST_Viterbi_Decoder v( getCandidatesWithOmissionsFunction, extendPath, -1 ); | |||
696 | ||||
697 | // turn on pruning if necessary | |||
698 | if( (pruning_beam>0) || (ob_pruning_beam>0) ) | |||
699 | v.set_pruning_parameters( pruning_beam, ob_pruning_beam ); | |||
700 | ||||
701 | // temporary hack necessary because decoder can only take a | |||
702 | // function pointer (would be better to relax this restriction in | |||
703 | // the EST_Viterbi_Decoder class, or in a replacement class, rather | |||
704 | // than using this hack) | |||
705 | globalTempVoicePtr = this; | |||
706 | ||||
707 | v.set_big_is_good(false); | |||
708 | ||||
709 | if( verbosity() > 0 ) | |||
710 | v.turn_on_trace(); | |||
711 | ||||
712 | v.initialise( units ); | |||
713 | v.search(); | |||
714 | ||||
715 | // take hold of the best path (end thereof) | |||
716 | EST_VTPath *bestp=0; | |||
717 | if( !v.result( &bestp ) ) | |||
718 | EST_error(EST_error_where = __null), (*EST_error_func)( "No best candidate sequence found" ); | |||
719 | ||||
720 | // fill in the best path features in the Unit Relation | |||
721 | fillUnitRelation( units, bestp ); | |||
722 | ||||
723 | EST_Relation *segs = utt->relation("Segment"); | |||
724 | my_parse_diphone_times( *units, *segs ); | |||
725 | } | |||
726 | ||||
727 | // End canned example experimental code /////////////////////////////////////////// | |||
728 | /////////////////////////////////////////////////////////////////////////////////// | |||
729 | ||||
730 | ||||
731 | bool DiphoneUnitVoice::unitAvailable( const EST_String &diphone ) const | |||
732 | { | |||
733 | EST_TList<DiphoneVoiceModule*>::Entries it; | |||
734 | ||||
735 | for( it.begin(voiceModules); it; it++ ) | |||
736 | if( (*it)->numAvailableCandidates(diphone) > 0 ) | |||
737 | return true; | |||
738 | ||||
739 | return false; | |||
740 | } | |||
741 | ||||
742 | unsigned int DiphoneUnitVoice::numAvailableCandidates( const EST_String &diphone ) const | |||
743 | { | |||
744 | unsigned int number = 0; | |||
745 | EST_TList<DiphoneVoiceModule*>::Entries it; | |||
746 | ||||
747 | for( it.begin(voiceModules); it; it++ ) | |||
748 | number += (*it)->numAvailableCandidates(diphone); | |||
749 | ||||
750 | return number; | |||
751 | } | |||
752 | ||||
753 | ||||
754 | //////////////////////////////////////////////////////////////////////// | |||
755 | //////////////////////////////////////////////////////////////////////// | |||
// special case of the above for utterance structures that are
// actually in the voice database, so no search is performed.
// This is useful for doing copy synthesis of utterances (e.g.
// to test out resynthesis, prosodic modification and so on)
760 | void DiphoneUnitVoice::getCopyUnitUtterance( const EST_String &utt_fname, | |||
761 | EST_Utterance **utt_out ) const | |||
762 | { | |||
763 | // need to find which, if any, voice module has this utterance | |||
764 | // in its list | |||
765 | EST_TList<DiphoneVoiceModule*>::Entries module_iter; | |||
766 | EST_Utterance *db_utt=0; | |||
767 | for( module_iter.begin(voiceModules); module_iter; module_iter++ ) | |||
768 | if( (*module_iter)->getUtterance(&db_utt, "fileid", utt_fname) == true ) | |||
769 | break; | |||
770 | ||||
771 | if( db_utt == 0 ) | |||
772 | EST_error(EST_error_where = __null), (*EST_error_func)( "Could not find Utterance %s in any voice module", | |||
773 | utt_fname.str() ); | |||
774 | else{ | |||
775 | // deep copy database utterance and fill in Unit relation | |||
776 | *utt_out = new EST_Utterance( *db_utt ); | |||
777 | CHECK_PTR(utt_out)if((utt_out)==0){ (EST_error_where = __null), (*EST_error_func )("memory allocation failed (file %s, line %d)", "DiphoneUnitVoice.cc" ,777);}; | |||
778 | ||||
779 | EST_Utterance myUtt( *db_utt ); | |||
780 | ||||
781 | cerr << myUtt.relation_present( "Segment" ) << " " | |||
782 | << myUtt.num_relations() <<endl; | |||
783 | ||||
784 | ||||
785 | cerr << db_utt->relation_present( "Segment" ) << " " | |||
786 | << (*utt_out)->relation_present( "Segment" ) << " " | |||
787 | << (*utt_out)->num_relations() <<endl; | |||
788 | ||||
789 | ||||
790 | EST_Relation *segs = (*utt_out)->relation( "Segment" ); | |||
791 | EST_Relation *units = (*utt_out)->create_relation( "Unit" ); | |||
792 | ||||
793 | // Initialise the Unit relation + fill in necessary/suitable | |||
794 | // synthesis parameters | |||
795 | EST_String ph1, ph2; | |||
796 | EST_Item *it = segs->tail(); | |||
797 | EST_Item *db_utt_seg_it = db_utt->relation( "Segment" )->tail(); | |||
798 | if( it == 0 ) | |||
799 | EST_error(EST_error_where = __null), (*EST_error_func)( "Segment relation is empty" ); | |||
800 | else{ | |||
801 | ph2 = it->S("name"); | |||
802 | while( ((it=it->prev())!=0) && | |||
803 | ((db_utt_seg_it=db_utt_seg_it->prev())!=0) ){ | |||
804 | EST_Track *coefs = new EST_Track; | |||
805 | CHECK_PTR(coefs)if((coefs)==0){ (EST_error_where = __null), (*EST_error_func) ("memory allocation failed (file %s, line %d)", "DiphoneUnitVoice.cc" ,805);}; | |||
806 | EST_Wave *sig = new EST_Wave; | |||
807 | CHECK_PTR(sig)if((sig)==0){ (EST_error_where = __null), (*EST_error_func)("memory allocation failed (file %s, line %d)" , "DiphoneUnitVoice.cc",807);}; | |||
808 | int midf; | |||
809 | ||||
810 | (*module_iter)->getDiphone( db_utt_seg_it, coefs, sig, &midf ); | |||
811 | ||||
812 | ph1 = it->S("name"); | |||
813 | EST_Item *t = units->prepend(); | |||
814 | t->set( "name", EST_String::cat(ph1,"_",ph2) ); | |||
815 | t->set_val( "ph1", est_val(it) ); | |||
816 | t->set_val( "sig", est_val( sig ) ); | |||
817 | t->set_val( "coefs", est_val( coefs ) ); | |||
818 | t->set( "middle_frame", midf ); | |||
819 | t->set( "source_utt", db_utt->f.S("fileid")); | |||
820 | t->set_val( "source_ph1", est_val( db_utt_seg_it )); | |||
821 | t->set( "source_end", db_utt_seg_it->F("end")); | |||
822 | t->set( "target_cost", 0.0 ); | |||
823 | t->set( "join_cost", 0.0); | |||
824 | ||||
825 | ph2 = ph1; | |||
826 | } | |||
827 | } | |||
828 | my_parse_diphone_times( *units, *segs ); | |||
829 | ||||
830 | // this is for copy synthesis, so copy actual timings | |||
831 | //for( EST_Item *seg = segs->head(); it!=0; it=it->next() ) | |||
832 | //seg->set( "end", seg->F("source_end") ); | |||
833 | } | |||
834 | } | |||
835 | ||||
836 | //////////////////////////////////////////////////////////////////////// | |||
837 | //////////////////////////////////////////////////////////////////////// | |||
838 | ||||
839 | ||||
840 | ||||
841 | unsigned int DiphoneUnitVoice::numUnitTypes() const | |||
842 | { | |||
843 | //necessary? | |||
844 | return 0; | |||
845 | } | |||
846 | ||||
847 | unsigned int DiphoneUnitVoice::numDatabaseUnits() const | |||
848 | { | |||
849 | unsigned int sum=0; | |||
850 | ||||
851 | EST_TList<DiphoneVoiceModule*>::Entries it; | |||
852 | ||||
853 | for( it.begin( voiceModules ); it; it++ ) | |||
854 | sum += (*it)->numModuleUnits(); | |||
855 | ||||
856 | return sum; | |||
857 | } | |||
858 | ||||
859 | ||||
860 | ////////////////////////////////////////////////////////////////////////// | |||
861 | ||||
862 | void DiphoneUnitVoice::set_diphone_backoff(DiphoneBackoff *dbo) | |||
863 | { | |||
864 | if (diphone_backoff_rules) | |||
865 | delete diphone_backoff_rules; | |||
866 | diphone_backoff_rules = dbo; | |||
867 | } | |||
868 | ||||
869 | ||||
870 | int DiphoneUnitVoice::getPhoneList( const EST_String &phone, ItemList &list ) | |||
871 | { | |||
872 | unsigned int n=0; | |||
873 | ||||
874 | EST_TList<DiphoneVoiceModule*>::Entries it; | |||
875 | for( it.begin( voiceModules ); it; it++ ) | |||
876 | n += (*it)->getPhoneList( phone, list ); | |||
877 | ||||
878 | return n; | |||
879 | } | |||
880 | ||||
881 | ||||
882 | ||||
883 | void DiphoneUnitVoice::precomputeJoinCosts( const EST_StrList &phones, bool verbose ) | |||
884 | { | |||
885 | EST_StrList::Entries it; | |||
886 | for( it.begin( phones ); it; it++ ){ | |||
887 | ItemList *l = new ItemList; | |||
888 | CHECK_PTR(l)if((l)==0){ (EST_error_where = __null), (*EST_error_func)("memory allocation failed (file %s, line %d)" , "DiphoneUnitVoice.cc",888);}; | |||
889 | ||||
890 | unsigned int n = getPhoneList( (*it), *l ); | |||
891 | ||||
892 | if( verbose==true ) | |||
893 | cerr << "phone " << (*it) << " " << n << " instances\n"; | |||
894 | ||||
895 | if( n>0 ){ | |||
896 | jc->computeAndCache( *l, true ); //verbose=true | |||
897 | } | |||
898 | else | |||
899 | EST_warning(EST_error_where = __null), (*EST_warning_func)( "Phone %s not listed in voice", (*it).str() ); | |||
900 | ||||
901 | delete l; | |||
902 | } | |||
903 | } |