us_prosody.cc

Bug Summary

File:	modules/UniSyn/us_prosody.cc
Location:	line 191, column 5
Description:	Value stored to 'm' is never read

Annotated Source Code

1	/*************************************************************************/
2	/* */
3	/* Centre for Speech Technology Research */
4	/* University of Edinburgh, UK */
5	/* Copyright (c) 1996,1997 */
6	/* All Rights Reserved. */
7	/* */
8	/* Permission is hereby granted, free of charge, to use and distribute */
9	/* this software and its documentation without restriction, including */
10	/* without limitation the rights to use, copy, modify, merge, publish, */
11	/* distribute, sublicense, and/or sell copies of this work, and to */
12	/* permit persons to whom this work is furnished to do so, subject to */
13	/* the following conditions: */
14	/* 1. The code must retain the above copyright notice, this list of */
15	/* conditions and the following disclaimer. */
16	/* 2. Any modifications must be clearly marked as such. */
17	/* 3. Original authors' names are not deleted. */
18	/* 4. The authors' names are not used to endorse or promote products */
19	/* derived from this software without specific prior written */
20	/* permission. */
21	/* */
22	/* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23	/* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24	/* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25	/* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26	/* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27	/* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28	/* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29	/* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30	/* THIS SOFTWARE. */
31	/* */
32	/*************************************************************************/
33	/* */
34	/* Author: Paul Taylor */
35	/* Date: 6 Jan 1998 */
36	/* --------------------------------------------------------------------- */
37	/* UniSyn prosody manipulation functions */
38	/* */
39	/*************************************************************************/
40
41	#include "us_synthesis.h"
42	#include "Phone.h"
43
44	//static void add_end_silences(EST_Relation &segment);
45	//static void add_end_silences(EST_Relation &segment, EST_Relation &target);
46
47	void pitchmarks_to_f0(EST_Track &pm, EST_Track &fz, float shift)
48	{
49	int i;
50	float period;
51
52	fz.resize((int)(pm.end()/shift), 1);
53	fz.fill_time(shift);
54
55	for (i = 0; i < fz.num_frames() -1 ; ++i)
56	{
57	period = get_time_frame_size(pm, pm.index_below(fz.t(i)));
58	fz.a(i) = 1.0 /period;
59	}
60	}
61
62	void f0_to_pitchmarks(EST_Track &fz, EST_Track &pm, int num_channels,
63	float default_f0, float target_end)
64	{
65	int i;
66	float max = 0.0;
67	float fz_end;
68
69	// Its impossible to guess the length of the pitchmark array before
70	// hand. Here we find the upper limit and resize at the end
71	for (i = 0; i < fz.num_frames(); ++i)
72	{
73	if (fz.a_no_check(i) < 0)
74	fz.a_no_check(i) = 0;
75	if (fz.a_no_check(i) > 500)
76	fz.a_no_check(i) = fz.a_no_check(i-1);
77	if (fz.a_no_check(i) > max)
78	max = fz.a_no_check(i);
79	}
80
81	// Coefficients will also be placed in here, so its best allocate
82	// space for their channels now
83	fz_end = fz.end();
84	pm.resize(int(max * (Gof(fz_end, target_end)(((fz_end) > (target_end)) ? (fz_end) : (target_end)))) + 10, num_channels);
85
86
87	int fz_len = fz.length();
88	float t1 = 0.0; //first pitchmark convention
89	float t2;
90
91	float f1 = fz.a_no_check(0); //arbitrary init
92	float f2;
93
94	double area = 0.5; // init value
95	int pm_i = 0;
96	int pm_len = pm.length();
97	for( int i=0; i<fz_len; i++ ){
98	t2 = fz.t( i );
99	f2 = fz.a_no_check( i );
100
101	float slope = (f2-f1)/(t2-t1);
102	area += (t2 - t1) * 0.5 * (f1 + f2);
103	while( (area >= 1.0) && (pm_i < pm_len) ){
104	area -= 1.0;
105	float discriminant = f2f2 - 2.0 area * slope;
106	if (discriminant < 0.0) discriminant = 0.0;
107	pm.t(pm_i++) = t2 - 2.0 * area / (f2 + sqrt (discriminant));
108	}
109	t1 = t2;
110	f1 = f2;
111	}
112
113	float default_shift = 1.0 / default_f0;
114	if (target_end > fz_end)
115	for (; t1 < target_end; ++pm_i)
116	t1 = pm.t(pm_i) = t1 + default_shift;
117
118	pm.resize(pm_i-1, num_channels);
119	}
120
121
122
123	/* Convert an F0 contour into a set of pitchmarks. This is done by the
124	obvious iterative function.
125
126	Space before the first defined F0 value is filled with regularly space
127	pitchmarks at intervals 1/def_f0. If the target_end value is
128	specified, more default pitchmarks are placed after the end of the
129	last f0 value until time target_end has been reached.
130	*/
131
132	void f0_to_pitchmarks_orig(EST_Track &fz, EST_Track &pm, int num_channels,
133	float default_f0, float target_end)
134	{
135	int i;
136	float max = 0.0, prev_pm = 0.0, val;
137	float fz_end;
138
139	// cout << "fz end: " << fz.end() << endl;
140	// cout << "fz n fg: " << fz.num_frames() << endl;
141
142	// Its impossible to guess the length of the pitchmark array before
143	// hand. Here we find the upper limit and resize at the end
144	for (i = 0; i < fz.num_frames(); ++i)
145	{
146	if (fz.a_no_check(i) < 0)
147	fz.a_no_check(i) = 0;
148	if (fz.a_no_check(i) > 500)
149	fz.a_no_check(i) = fz.a_no_check(i-1);
150	if (fz.a_no_check(i) > max)
151	max = fz.a_no_check(i);
152	}
153
154	// Coefficients will also be placed in here, so its best allocate
155	// space for their channels now
156	fz_end = fz.end();
157	pm.resize(int(max * (Gof(fz_end, target_end)(((fz_end) > (target_end)) ? (fz_end) : (target_end)))) + 10, num_channels);
158
159	// cout << "fz end: " << fz.end() << endl;
160	// cout << "fz n fg: " << fz.num_frames() << endl;
161	// cout << "pmn fg: " << pm.num_frames() << endl;
162
163	for (i = 0; prev_pm < fz_end; ++i)
164	{
165	val = fz.a(prev_pm) > 0.0 ? fz.a(prev_pm) : default_f0;
166	pm.t(i) = prev_pm + (1.0 / val);
167	prev_pm = pm.t(i);
168	}
169
170	if (target_end > fz_end)
171	for (; prev_pm < target_end; ++i)
172	{
173	pm.t(i) = prev_pm + (1.0 / default_f0);
174	prev_pm = pm.t(i);
175	}
176
177	pm.resize(i - 1, num_channels);
178	}
179
180	// not sure if this is useful
181	void linear_pitchmarks(EST_Track &source_pm, EST_Track &target_pm,
182	float start_f0, float end_f0)
183	{
184	int i;
185	float m, length, pitch;
186	target_pm.resize(source_pm.num_frames(), source_pm.num_channels());
187
188	length = (float)source_pm.num_frames() / (end_f0 - start_f0);
189
190	target_pm.t(0) = 0.0;
191	m = (end_f0 - start_f0) / length;
	Value stored to 'm' is never read
192
193	for(i = 1; i < target_pm.num_frames(); ++i)
194	{
195	pitch = (((float)i / (float) target_pm.num_frames())
196	* (end_f0 - start_f0)) + start_f0;
197	target_pm.t(i) = target_pm.t(i - 1) + (1 /pitch);
198	}
199	}
200
201	// not sure if this is useful
202	void stretch_f0_time(EST_Track &f0, float stretch,
203	float s_last_time, float t_last_time)
204	{
205	for (int i = 0 ; i < f0.num_frames(); ++i)
206	{
207	// cout << i << " o t:" << f0.t(i) << endl;
208	f0.t(i) = ((f0.t(i) - s_last_time) * stretch) + t_last_time;
209	// cout << i << " m t:" << f0.t(i) << endl;
210	}
211	}
212
213	// make target F0 from source F0, with same F0 values as original,
214	// but durations specified by target_seg.
215
216	/*
217	void us_F0targets_to_pitchmarks(EST_Utterance &utt,
218	const EST_String &seg_relation)
219	{
220	utt.create_relation("TargetCoef");
221	EST_Track *target_coef = new EST_Track;
222	EST_Item *end_seg;
223	int num_channels = 0;
224	float end;
225
226	if (utt.relation_present("SourceCoef"))
227	{
228	EST_Track *source_coef =
229	track(utt.relation("SourceCoef")->head()->f("coefs"));
230	num_channels = source_coef->num_channels();
231	}
232
233	if (seg_relation == "")
234	end_seg = utt.relation("Segment", 1)->last();
235	else
236	end_seg = utt.relation(seg_relation, 1)->last();
237
238	if (end_seg)
239	end = end_seg->F("end");
240	else
241	end = 0;
242
243	targets_to_pitchmarks((utt.relation("Target")), target_coef,
244	num_channels,end);
245
246	EST_Item *item = utt.relation("TargetCoef")->append();
247	item->set("name", "coef");
248	item->set_val("coefs",est_val(target_coef));
249
250	}
251
252	void targets_to_pitchmarks(EST_Relation &targ, EST_Track &pitchmarks,
253	int num_channels,float end)
254	{
255	EST_Item *s;
256	float time, f0, prev_time, prev_f0, m, max;
257	int i;
258
259	// Its impossible to guess the length of the pitchmark array before
260	// hand. Here we find the upper limit and resize at the end
261	for (max = 0.0, s = targ.first_leaf(); s; s = next_leaf(s))
262	if (s->F("f0") > max)
263	max = s->F("f0");
264
265	pitchmarks.resize((int)(max * 1.1 * end)+1, num_channels);
266
267	prev_time = 0;
268	prev_f0 = targ.first_leaf() ? targ.first_leaf()->F("f0") : 120;
269	pitchmarks.t(0) = 0.0;
270
271	for (i = 1, s = targ.first_leaf(); s; s = next_leaf(s))
272	{
273	time = s->f("pos");
274	f0 = s->F("f0");
275
276	if (f0 < 30) // to protect against with duff IntTarget algorithms
277	continue;
278	if (time == prev_time)
279	continue;
280	else if (time < prev_time)
281	{
282	cerr << "UniSyn: warning target in wrong order at " << prev_time;
283	cerr << " ignored" << endl;
284	continue;
285	}
286	m = (f0 - prev_f0) / (time - prev_time);
287
288
289	{
290	f0 = (m * (pitchmarks.t(i - 1) - prev_time)) + prev_f0;
291	pitchmarks.t(i) = pitchmarks.t(i - 1) + 1.0/f0;
292	}
293	prev_time = time;
294	prev_f0 = f0;
295	}
296	// Ensure pitch marks go to the end of the utterance
297	// This will effectively mean the last half diphone will be extend over
298	// the whol final segment. This will only be reasonable if the
299	// final segment is a silence.
300	for (; pitchmarks.t(i - 1) < end; ++i)
301	pitchmarks.t(i) = pitchmarks.t(i - 1) + 1.0/prev_f0;
302	pitchmarks.resize(i, pitchmarks.num_channels());
303	}
304	*/
305
306
307	/*static void add_end_silences(EST_Relation &segment, EST_Relation &target)
308	{
309	EST_Item t, n;
310	float shift = 0.0;
311	const float pause_duration = 0.1;
312
313	t = segment.head();
314	if (!ph_is_silence(t->f("name")))
315	{
316	n = t->insert_before();
317	n->set("name", ph_silence());
318	n->set("dur", pause_duration);
319	shift += pause_duration;
320	}
321
322	t = segment.tail();
323	if (!ph_is_silence(t->f("name")))
324	{
325	n = t->insert_after();
326	n->set("name", ph_silence());
327	n->set("dur", pause_duration);
328	shift += pause_duration;
329	}
330	dur_to_end(segment);
331
332	target.tail()->set("pos", (target.tail()->F("pos") + shift));
333	}
334
335	void merge_pitchmarks(EST_Utterance &u, EST_Track &pm1,
336	EST_Track &pm2, EST_Track &target_pm,
337	EST_Relation &guide)
338	{
339	EST_Item *s;
340	float s_end, s_start;
341	int s_i_start, s_i_end;
342	int i, j = 0;
343	(void) u;
344
345	target_pm.resize(1000000, 0);
346	s_start = 0.0;
347
348	for (s = guide.head(); s; s = s->next())
349	{
350	s_end = s->F("end", 1);
351	if (s->fI("use_pm") == 1)
352	{
353	s_i_start = pm1.index_below(s_start);
354	s_i_end = pm1.index_below(s_end);
355	for (i = s_i_start; i < s_i_end; ++i, ++j)
356	target_pm.t(j) = pm1.t(i);
357	}
358	else
359	{
360	s_i_start = pm2.index_below(s_start);
361	s_i_end = pm2.index_below(s_end);
362	for (i = s_i_start; i < s_i_end; ++i, ++j)
363	target_pm.t(j) = pm2.t(i);
364	}
365	s_start = s_end;
366	}
367	}
368
369	void warp_f0(EST_Track &source_f0, EST_Relation &source_seg,
370	EST_Track &target_f0, EST_Relation &target_seg)
371	{
372	EST_Item s, t;
373	float prev_source_end = 0.0, prev_target_end = 0.0;
374	EST_Track part;
375	int frame_start, frame_end;
376	float stretch, t_last_time = 0, s_last_time = 0;
377	EST_Relation match("Match");
378	EST_Item xx;
379	EST_Track str;
380	int i = 0;
381
382	dp_match(target_seg, source_seg, match, local_cost, &xx);
383
384	target_f0 = source_f0;
385	frame_start = 0;
386	frame_end = 0;
387
388	str.resize(target_seg.length(), 1);
389
390	cout << "tag: " << target_seg << endl;
391
392	for (t = target_seg.head(); t; t = t->next())
393	{
394	s = daughter1(t,"Match");
395	if (s == 0) // ie extra phone in target specification
396	continue;
397
398	frame_end = source_f0.index(s->f("end"));
399	if ((frame_end - frame_start) < 1)
400	{
401	cout << "Warning no frames for: " << *t << endl;
402	continue;
403	}
404	target_f0.sub_track(part, frame_start, (frame_end - frame_start + 1),
405	0, EST_ALL);
406
407	stretch = (t->F("end") - prev_target_end) /
408	(s->F("end") - prev_source_end);
409
410	str.a(i) = stretch;
411	str.t(i++) = t->F("end");
412
413	cout << "\nstretch: " << stretch << endl;
414	cout << "source: " << *s << endl;
415	cout << "target: " << *t << endl;
416	cout << "frames: " << frame_start << " " << frame_end << endl;
417
418	stretch_f0_time(part, stretch, s_last_time, t_last_time);
419
420	prev_target_end = t->f("end");
421	prev_source_end = s->f("end");
422	frame_start = frame_end + 1;
423	t_last_time = part.end();
424	s_last_time = source_f0.t(frame_end);
425	cout << "last time = " << s_last_time << " " << t_last_time << endl;
426	}
427	target_f0.resize(frame_end, 1);
428	target_f0.a(target_f0.num_frames() - 1) = 100;
429	str.save("zz_stretch");
430	}
431
432	void warp_pitchmarks(EST_Utterance &utt, EST_Track *source_pm,
433	EST_Relation &source_seg, EST_Relation &target_seg)
434	{
435	EST_Track source_f0, target_f0, *target_pm;
436
437	target_pm = new EST_Track;
438
439	cout << "tag: "<< target_seg << endl;
440
441	add_end_silences(target_seg);
442
443
444	cout << "tag 2: "<< target_seg << endl;
445
446	pitchmarks_to_f0(*source_pm, source_f0, 0.01);
447
448	cout << "tag 3: "<< target_seg << endl;
449
450	warp_f0(source_f0, source_seg, target_f0, target_seg);
451
452	f0_to_pitchmarks(target_f0, *target_pm);
453
454	utt.create_relation("TargetCoef");
455	utt.create_relation("SourceSegments");
456
457	*utt.relation("SourceSegments") = source_seg;
458
459	EST_Item *item = utt.relation("TargetCoef")->append();
460
461	target_f0.save("tt_tar.f0", "est");
462	target_seg.save("tt_tar.lab");
463	source_seg.save("tt_sou.lab");
464	source_f0.save("tt_sou.f0", "est");
465
466	target_pm->save("target_coef_a.pm","est");
467	item->set("name", "coefs");
468	item->set_val("coefs", est_val(target_pm));
469	}
470
471	float local_cost(const EST_Item s1, const EST_Item s2)
472	{
473	<<<<<<< us_prosody.cc
474	utt.create_relation("TargetCoef");
475	EST_Track *target_coef = new EST_Track;
476	EST_Item *end_seg;
477	int num_channels = 0;
478	float end;
479
480	if (utt.relation_present("SourceCoef"))
481	{
482	EST_Track *source_coef =
483	track(utt.relation("SourceCoef")->head()->f("coefs"));
484	num_channels = source_coef->num_channels();
485	}
486	=======
487	float insertion_cost = get_c_int(siod_get_lval("met_insertion", NULL));
488	float deletion_cost = get_c_int(siod_get_lval("met_deletion", NULL));
489	float substitution_cost =
490	get_c_int(siod_get_lval("met_substitution", NULL));
491	>>>>>>> 1.14
492
493	EST_String null_sym = "nil";
494
495	// otherwise cost is either insertion cost, or cost_matrix value
496	if (s1->name() == s2->name())
497	return 0;
498	else
499	{
500	if (s1->name() == null_sym)
501	return insertion_cost;
502	else if (s2->name() == null_sym)
503	return deletion_cost;
504	else
505	return substitution_cost;
506	}
507	}
508	typedef
509	float (local_cost_function)(const EST_Item item1,
510	const EST_Item *item2);
511
512	bool dp_match(const EST_Relation &lexical,
513	const EST_Relation &surface,
514	EST_Relation &match,
515	local_cost_function lcf,
516	EST_Item *null_syl);
517
518
519
520	*/
521
522	/*static void add_end_silences(EST_Relation &segment)
523	{
524	EST_Item t, n;
525
526	t = segment.head();
527	if (!ph_is_silence(t->f("name")))
528	{
529	n = t->insert_before();
530	n->set("name", ph_silence());
531	}
532
533	t = segment.tail();
534	if (!ph_is_silence(t->f("name")))
535	{
536	n = t->insert_after();
537	n->set("name", ph_silence());
538	}
539	}
540
541	*/