us_unit.cc

Bug Summary

File:	modules/UniSyn/us_unit.cc
Location:	line 104, column 5
Description:	Value stored to 'print_centre' is never read

Annotated Source Code

1	/*************************************************************************/
2	/* */
3	/* Centre for Speech Technology Research */
4	/* University of Edinburgh, UK */
5	/* Copyright (c) 1996,1997 */
6	/* All Rights Reserved. */
7	/* */
8	/* Permission is hereby granted, free of charge, to use and distribute */
9	/* this software and its documentation without restriction, including */
10	/* without limitation the rights to use, copy, modify, merge, publish, */
11	/* distribute, sublicense, and/or sell copies of this work, and to */
12	/* permit persons to whom this work is furnished to do so, subject to */
13	/* the following conditions: */
14	/* 1. The code must retain the above copyright notice, this list of */
15	/* conditions and the following disclaimer. */
16	/* 2. Any modifications must be clearly marked as such. */
17	/* 3. Original authors' names are not deleted. */
18	/* 4. The authors' names are not used to endorse or promote products */
19	/* derived from this software without specific prior written */
20	/* permission. */
21	/* */
22	/* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23	/* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24	/* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25	/* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26	/* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27	/* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28	/* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29	/* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30	/* THIS SOFTWARE. */
31	/* */
32	/*************************************************************************/
33	/* */
34	/* Author: Paul Taylor */
35	/* Date: 6 Jan 1998 */
36	/* --------------------------------------------------------------------- */
37	/* Acoustic Unit Concatenation */
38	/* */
39	/*************************************************************************/
40
41
42	#include "siod.h"
43	#include "EST_sigpr.h"
44	#include "EST_wave_aux.h"
45	#include "EST_track_aux.h"
46	#include "EST_ling_class.h"
47	#include "us_synthesis.h"
48	#include <cmath>
49	#include "Phone.h"
50
51	using namespace std;
52
53	void merge_features(EST_Item from, EST_Item to, int keep_id);
54
55	void dp_time_align(EST_Utterance &utt, const EST_String &source_name,
56	const EST_String &target_name,
57	const EST_String &time_name,
58	bool do_start);
59
60	void concatenate_unit_coefs(EST_Relation &unit_stream, EST_Track &source_lpc);
61	void us_unit_raw_concat(EST_Utterance &utt);
62
63	void window_units(EST_Relation &unit_stream,
64	EST_TVector<EST_Wave> &frames,
65	float window_factor,
66	EST_String window_name,
67	bool window_symmetric,
68	EST_IVector *pm_indices=0);
69
70	bool dp_match(const EST_Relation &lexical,
71	const EST_Relation &surface,
72	EST_Relation &match,
73	float ins, float del, float sub);
74
75	void map_match_times(EST_Relation &target, const EST_String &match_name,
76	const EST_String &time_name, bool do_start);
77
78
79	static void window_frame(EST_Wave &frame, EST_Wave &whole, float scale,
80	int start, int end, EST_WindowFunc *window_function,
81	int centre_index=-1)
82	{
83	int i, j, send;
84	EST_TBuffer<float> window;
85	int window_length = (end-start)+1;
86
87	if (frame.num_samples() != (window_length))
88	frame.resize(window_length);
89	frame.set_sample_rate(whole.sample_rate());
90	// Ensure we have a safe end
91	if (end < whole.num_samples())
92	send = end;
93	else
94	send = whole.num_samples();
95
96
97	int print_centre;
98	if ( centre_index < 0 ){
99	window_function( window_length, window, -1 );
100	print_centre = (window_length-1)/2+start;
101	}
102	else{
103	window_function( window_length, window, (centre_index-start));
104	print_centre = centre_index;
	Value stored to 'print_centre' is never read
105	}
106
107
108	#if defined(EST_DEBUGGING)
109	cerr << "(start centre end window_length wholewavelen) "
110	<< start << " "
111	<< print_centre << " "
112	<< end << " "
113	<< window_length << " "
114	<< whole.num_samples() << endl;
115	#endif
116
117
118	// To allow a_no_check access we do this in three stages
119	for (i = 0, j = start; j < 0; ++i, ++j)
120	frame.a_no_check(i) = 0;
121	for ( ; j < send; ++i, ++j)
122	frame.a_no_check(i) = (int)((float)whole.a_no_check(j) * window(i) * scale);
123	for ( ; j < end; ++j,++i)
124	frame.a_no_check(i) = 0;
125
126
127	#if defined(EST_DEBUGGING)
128	// It's not always very nice to resynthesise speech from
129	// inserted zeros! These checks should alert the user (me ;)
130	if( start<0 )
131	EST_warning(EST_error_where = __null), (*EST_warning_func)( "padded start of pitch period with zeros (index %d)", i );
132
133	if( end>whole.num_samples() )
134	EST_warning(EST_error_where = __null), (*EST_warning_func)( "padded end of pitch period with zeros (frame %d)", i );
135	#endif
136	}
137
138
139	// The window_signal function has been changed in several ways:
140	//
141	// *) The function now has an asymmetric window mode.
142	//
143	// In this mode, asymmetric windows are used from pitchmark at t-1
144	// to pitchmark at time t+1, with the maximum value of 1.0 at
145	// pitchmark at time t.
146	//
147	// *) In the original symmetric mode:
148	//
149	// The first change is to ensure the window frames always have an
150	// odd number of samples (a convention for how to handle rounding
151	// problems when converting from times (float) to sample numbers
152	// (int)). The centre sample corresponds to the pitch mark time.
153	//
154	// The second change is that the estimate of local pitch period is
155	// always based in current and previous pitchmark. In the case
156	// of the first pitch mark in track pm, the previous pitchmark is
157	// assumed to be at zero time. Hopefully, this won't break much.
158	// However, if this convention is not used everywhere else that
159	// it's needed and some things break, then arguably those
160	// things need to be fixed to adhere to this same convention...
161	void window_signal(EST_Wave &sig, EST_Track &pm,
162	EST_WaveVector &frames, int &i, float scale,
163	float window_factor,
164	EST_WindowFunc *window_function,
165	bool window_symmetric,
166	EST_IVector *pm_indices=0)
167	{
168	float first_pos, period=0.0;
169	float prev_pm, current_pm;
170	int first_sample, centre_sample, last_sample;
171	int sample_rate = sig.sample_rate();
172	int pm_num_frames = pm.num_frames();
173
174	// estimate first period as pitchmark time itself (i.e. assume a previous
175	// pitchmark at 0.0 time, waveform sample 0)
176	prev_pm = 0.0;
177
178
179	if( window_symmetric )
180	{
181	if (pm_num_frames < 1 )
182	EST_error(EST_error_where = __null), (*EST_error_func)( "Attempted to Window around less than 1 pitchmark" );
183
184	for( int j=0; j<pm_num_frames; ++j, ++i ){
185	current_pm = pm.t(j);
186	period = current_pm - prev_pm;
187	centre_sample = (int)rint( current_pm*(float)sample_rate );
188
189	first_pos = prev_pm - (period * (window_factor-1.0));
190	first_sample = (int)rint( first_pos*(float)sample_rate );
191
192	last_sample = (2*centre_sample)-first_sample;
193
194	window_frame(frames[i], sig, scale, first_sample, last_sample, window_function);
195
196	prev_pm = current_pm;
197	}
198	}
199	else{
200	if( pm_indices == 0 )
201	EST_error(EST_error_where = __null), (*EST_error_func)( "required pitchmark indices EST_IVector is null" );
202
203	int j;
204
205	// Rob's experiment to see if we can handle small bits of speech with no pitchmarks.
206	// We just 0 the frames in this case.
207
208	if (pm_num_frames < 1 )
209	{
210	EST_warning(EST_error_where = __null), (*EST_warning_func)( "Attempted to Window around less than 1 pitchmark" );
211	}
212	else
213	{
214	for( j=0; j<pm_num_frames-1; ++j, ++i ){
215	current_pm = pm.t(j);
216	period = current_pm - prev_pm;
217	centre_sample = (int)rint( current_pm*(float)sample_rate );
218
219	first_pos = prev_pm - (period * (window_factor-1.0));
220	first_sample = (int)rint( first_pos*(float)sample_rate );
221
222	float next_pm = pm.t(j+1);
223	float last_pos = next_pm + ((next_pm-current_pm)*(window_factor-1.0));
224	last_sample = (int)rint( last_pos*(float)sample_rate );
225
226	window_frame(frames[i], sig, scale, first_sample,
227	last_sample, window_function, centre_sample);
228	(*pm_indices)[i] = centre_sample - first_sample;
229
230	prev_pm = current_pm;
231	}
232
233	//last frame window size is set according to pm.t(end) and the number
234	//of samples in the waveform (it is presumed the waveform begins at the
235	//preceeding pitchmark and ends at the pitchmark following the current
236	//unit...)
237
238	current_pm = pm.t(j);
239	centre_sample = (int)rint( current_pm*(float)sample_rate );
240	first_pos = prev_pm - (period * (window_factor-1.0));
241	first_sample = (int)rint( first_pos*(float)sample_rate );
242	last_sample = sig.num_samples()-1;
243	window_frame(frames[i], sig, scale, first_sample,
244	last_sample, window_function);
245	(*pm_indices)[i] = centre_sample - first_sample;
246
247	#if defined(EST_DEBUGGING)
248	cerr << "changed: " << i << " " << pm_indices->n() << endl;
249	#endif
250
251	++i;
252	}
253	}
254	}
255
256	void window_units( EST_Relation &unit_stream,
257	EST_TVector<EST_Wave> &frames,
258	float window_factor,
259	EST_String window_name,
260	bool window_symmetric,
261	EST_IVector *pm_indices )
262	{
263	int i;
264	EST_Wave *sig;
265	EST_Item *u;
266	EST_Track *coefs;
267	int num = 0;
268	float scale;
269	EST_WindowFunc *window_function;
270
271	for (u = unit_stream.head(); u; u = u->next())
272	num += track(u->f("coefs"))->num_frames();
273	frames.resize(num);
274
275	if( pm_indices != 0 )
276	pm_indices->resize(num);
277
278	if (window_name == "")
279	window_name = "hanning";
280
281	window_function = EST_Window::creator(window_name);
282
283	for (i = 0, u = unit_stream.head(); u; u = u->next())
284	{
285	sig = wave(u->f("sig"));
286	coefs = track(u->f("coefs"));
287	scale = (u->f_present("scale") ? u->F("scale") : 1.0);
288
289	window_signal(sig, coefs, frames, i, scale, window_factor,
290	window_function, window_symmetric, pm_indices);
291	}
292	}
293
294
295	void us_unit_concat(EST_Utterance &utt, float window_factor,
296	const EST_String &window_name,
297	bool no_waveform=false,
298	bool window_symmetric=true)
299
300	{
301	EST_Relation *unit_stream;
302	EST_Track *source_coef = new EST_Track;
303	EST_WaveVector *frames = new EST_WaveVector;
304	EST_IVector *pm_indices = 0;
305
306	unit_stream = utt.relation("Unit", 1);
307
308	concatenate_unit_coefs(unit_stream, source_coef);
309
310	utt.create_relation("SourceCoef");
311	EST_Item *item = utt.relation("SourceCoef")->append();
312	item->set("name", "coef");
313	item->set_val("coefs", est_val(source_coef));
314
315	if (!no_waveform){
316	if( !window_symmetric )
317	pm_indices = new EST_IVector;
318
319	window_units(unit_stream, frames,
320	window_factor, window_name, window_symmetric, pm_indices);
321
322	item->set_val("frame", est_val(frames));
323
324	if( !window_symmetric )
325	item->set_val("pm_indices", est_val(pm_indices));
326	}
327	}
328
329
330	void us_get_copy_wave(EST_Utterance &utt, EST_Wave &source_sig,
331	EST_Track &source_coefs, EST_Relation &source_seg)
332	{
333	EST_Item s, n;
334
335	if (!utt.relation_present("Segment"))
336	EST_error(EST_error_where = __null), (*EST_error_func)("utterance must have \"Segment\" relation\n");
337
338	utt.create_relation("TmpSegment");
339
340	for (s = source_seg.head(); s; s = s->next())
341	{
342	n = utt.relation("TmpSegment")->append();
343	merge_features(n, s, 0);
344	}
345
346	utt.relation("Segment")->remove_item_feature("source_end");
347
348	dp_time_align(utt, "TmpSegment", "Segment", "source_", 0);
349
350	utt.create_relation("Unit");
351	EST_Item *d = utt.relation("Unit")->append();
352
353
354	EST_Wave *ss = new EST_Wave;
355	*ss = source_sig;
356
357	EST_Track *c = new EST_Track;
358	*c = source_coefs;
359
360	d->set_val("sig", est_val(ss));
361	d->set_val("coefs", est_val(c));
362
363	utt.remove_relation("TmpSegment");
364	}
365
366
367	void us_energy_normalise(EST_Relation &unit)
368	{
369	EST_Wave *sig;
370
371	for (EST_Item *s = unit.head(); s; s = s->next())
372	{
373	sig = wave(s->f("sig"));
374	if (s->f_present("energy_factor"))
375	sig->rescale(s->F("energy_factor"));
376	}
377	}
378
379	void us_unit_raw_concat(EST_Utterance &utt)
380	{
381	EST_Wave sig, unit_sig;
382	EST_Track *unit_coefs=0;
383	float window_factor;
384	int i, j, k;
385	int first_pm, last_pm, last_length;
386	float first_pos, last_pos;
387
388	window_factor = get_c_float(siod_get_lval("window_factor",
389	"UniSyn: no window_factor"));
390	sig = new EST_Wave;
391
392	sig->resize(1000000);
393	sig->fill(0);
394	j = 0;
395
396	for (EST_Item *s = utt.relation("Unit", 1)->head(); s; s = s->next())
397	{
398	unit_sig = wave(s->f("sig"));
399	unit_coefs = track(s->f("coefs"));
400
401	first_pos = unit_coefs->t(1);
402	first_pm = (int)(first_pos * (float)unit_sig->sample_rate());
403
404	last_pos = unit_coefs->t(unit_coefs->num_frames()-2);
405	last_pm = (int)(last_pos * (float)unit_sig->sample_rate());
406	last_length = unit_sig->num_samples() - last_pm;
407
408	// std::cout << "first pm: " << first_pm << endl;
409	// std::cout << "last pm: " << last_pm << endl;
410	// std::cout << "last length: " << last_length << endl;
411
412	j -= first_pm;
413
414	for (i = 0; i < first_pm; ++i, ++j)
415	sig->a_safe(j) += (short)((((float) i)/ (float)first_pm) *(float)unit_sig->a_safe(i)+0.5);
416
417	for (; i < last_pm; ++i, ++j)
418	sig->a(j) = unit_sig->a(i);
419
420	for (k = 0; i < unit_sig->num_samples(); ++i, ++j, ++k)
421	sig->a_safe(j) += (short)((1.0 - (((float) k) / (float) last_length))
422	* (float)unit_sig->a_safe(i) + 0.5);
423
424	// j -= last_length;
425	// j += 2000;
426	}
427
428	sig->resize(j);
429	sig->set_sample_rate(16000);
430
431	add_wave_to_utterance(utt, *sig, "Wave");
432	}
433
434
435	void concatenate_unit_coefs(EST_Relation &unit_stream, EST_Track &source_lpc)
436	{
437	int num_source_frames = 0;
438	int num_source_channels = 0;;
439	float prev_time, abs_offset, rel_offset, period, offset;
440	int i, j, k, l;
441	EST_Track *coefs;
442
443	EST_Item *u = unit_stream.head();
444	if( u == 0 ){
445	//sometimes we are just asked to synthesise empty utterances, and
446	//code elsewhere wants us to continue...
447	source_lpc.resize(0,0);
448	}
449	else{
450	EST_Track *t = 0;
451	for ( ; u; u = u->next())
452	{
453	t = track(u->f("coefs"));
454	num_source_frames += t->num_frames();
455	}
456
457	num_source_channels = t->num_channels();
458
459	source_lpc.resize(num_source_frames, num_source_channels);
460	source_lpc.copy_setup(*t);
461
462	prev_time = 0.0;
463	// copy basic information
464	for (i = 0, l = 0, u = unit_stream.head(); u; u = u->next())
465	{
466	coefs = track(u->f("coefs"));
467
468	for (j = 0; j < coefs->num_frames(); ++j, ++i)
469	{
470	for (k = 0; k < coefs->num_channels(); ++k)
471	source_lpc.a_no_check(i, k) = coefs->a_no_check(j, k);
472	source_lpc.t(i) = coefs->t(j) + prev_time;
473	}
474
475	prev_time = source_lpc.t(i - 1);
476	u->set("end", prev_time);
477	u->set("num_frames", coefs->num_frames());
478	}
479	}
480
481	// adjust pitchmarks
482	abs_offset = 0.0;
483	rel_offset = 0.0;
484	// absolute offset in seconds
485	abs_offset = get_c_float(siod_get_lval("us_abs_offset", "zz"));
486	// relative offset as a function of local pitch period
487	rel_offset = get_c_float(siod_get_lval("us_rel_offset", "zz"));
488
489	if( abs_offset!=0.0 \|\| rel_offset!=0.0 ){
490	std::cerr << "Adjusting pitchmarks" << std::endl;
491	for (i = 0; i < source_lpc.num_frames(); ++i){
492	period = get_time_frame_size(source_lpc, (i));
493	offset = abs_offset + (rel_offset * period);
494	source_lpc.t(i) = source_lpc.t(i) + offset;
495	}
496	}
497	}
498
499	// jointimes specifies centre of last pitch period in each
500	// concatenated unit
501	// void us_linear_smooth_amplitude( EST_Wave *w,
502	// const EST_Track &pm,
503	// const EST_FVector &jointimes)
504	// {
505	// int num_joins = jointimes.length();
506
507	// EST_Track *factor_contour = new EST_Track( num_joins );
508
509	// for( int i=0; i<num_joins; ++i ){
510	// float join_t = jointimes(i);
511	// int join_indx = pm.index_below( join_t );
512
513	// // estimate local short-time energy function either side of join
514	// int left_start = rount(pm.t(join_indx-2)*(float)16000);
515	// int left_end = rount(pm.t(join_indx)*(float)16000);
516	// float left_power = 0.0 ;
517	// for( int j=left_start; j<left_end; ++j )
518	// left_power += pow( w[j], 2 );
519
520	// left_power /= (left_end - left_start); //normalise for frame length
521
522	// int right_start = rount(pm.t(join_indx+1)*(float)16000);
523	// int right_end = rount(pm.t(join_indx+3)*(float)16000);
524	// float right_power = 0.0;
525	// for( int j=right_start; j<right_end; ++j )
526	// right_power += pow( w[j], 2 );
527
528	// right_power /= (right_end - right_start); //normalise for frame length
529
530	// float mean_power = (left_power+right_power)/2.0;
531
532	// float left_factor = left_power/mean_power;
533	// float right_factor = right_power/mean_power;
534
535	// (*factor_contour)[i] = left_factor;
536	// (*factor_contour)[i+1] = right_factor;
537	// }
538
539	// }
540
541	static EST_Track* us_pitch_period_energy_contour( const EST_WaveVector &pp,
542	const EST_Track &pm )
543	{
544	const int pp_length = pp.length();
545
546	EST_Track *contour = new EST_Track;
547	contour->resize( pp_length, 1 );
548
549	for( int i=0; i<pp_length; ++i ){
550	const EST_Wave &frame = pp(i);
551	const int frame_length = frame.length();
552
553	// RMSE for EST_Wave window
554	int j;
555	for( contour->a_no_check(i,0) = 0.0, j=0; j<frame_length; ++j )
556	contour->a_no_check( i, 0 ) += pow( float(frame.a_no_check( j )), float(2.0) );
557
558	contour->a_no_check(i,0) = sqrt( contour->a_no_check(i,0) / (float)j );
559	contour->t(i) = pm.t(i);
560	}
561
562	return contour;
563	}
564
565	EST_Val ffeature(EST_Item *item,const EST_String &fname);
566
567	void us_linear_smooth_amplitude( EST_Utterance *utt )
568	{
569	EST_WaveVector *pp = wavevector(utt->relation("SourceCoef")->first()->f("frame"));
570	EST_Track *pm = track(utt->relation("SourceCoef")->first()->f("coefs"));
571
572	EST_Track energy = us_pitch_period_energy_contour( pp, *pm );
573	energy->save( "./energy_track.est", "est" );
574
575	FILE *ofile = fopen( "./join_times.est", "w" );
576	EST_Relation *units = utt->relation("Unit");
577	for( EST_Item *u=units->head(); u; u=u->next() ){
578
579	EST_Item *diphone_left = u;
580	// EST_Item *diphone_right = u->next();
581
582	fprintf( ofile, "%s\t%f\n", diphone_left->S("name").str(), diphone_left->F("end"));
583
584	EST_Item *join_phone_left = item(diphone_left->f("ph1"))->next();
585	EST_String phone_name = join_phone_left->S("name");
586	if( ph_is_sonorant( phone_name ) && !ph_is_silence( phone_name )){
587
588	//if( (ffeature(join_phone_left, "ph_vc")).S() == "+"){ // ideally for sonorants
589
590	std::cerr << "smoothing phone " << join_phone_left->S("name") << std::endl;
591
592	// EST_Item *join_phone_right = item(diphone_right->f("ph1"));
593
594	int left_end_index = energy->index(diphone_left->F("end"));
595	int right_start_index = left_end_index + 1;
596	float left_power = energy->a(left_end_index,0);
597	float right_power = energy->a(right_start_index,0);
598
599	float mean_power = (left_power+right_power)/2.0;
600	float left_factor = left_power/mean_power;
601	float right_factor = right_power/mean_power;
602
603	int smooth_start_index = left_end_index-5;
604	int smooth_end_index = right_start_index+5;
605
606
607	// rescale left pitch periods
608	float factor = 1.0;
609	float factor_incr = (left_factor-1.0)/(float)(left_end_index - smooth_start_index);
610	for( int i=smooth_start_index; i<=left_end_index; ++i, factor+=factor_incr ){
611	(*pp)[i].rescale( factor, 0 );
612	std::cerr << "rescaled frame " << i << "(factor " << factor << ")" << std::endl;
613	}
614
615	// rescale right pitch periods
616	factor = right_factor;
617	factor_incr = (1.0-right_factor)/(float)(smooth_end_index-right_start_index);
618	for( int i=right_start_index; i<=smooth_end_index; ++i, factor+=factor_incr){
619	(*pp)[i].rescale( factor, 0 );
620	std::cerr << "rescaled frame " << i << "(factor " << factor << ")" << std::endl;
621	}
622	}
623	else
624	std::cerr << "no smoothing for " << join_phone_left->S("name") << std::endl;
625
626	std::cerr << std::endl;
627	}
628
629	fclose( ofile );
630	delete energy;
631	}
632