1 | |
2 | |
3 | |
4 | |
5 | |
6 | |
7 | |
8 | |
9 | |
10 | |
11 | |
12 | |
13 | |
14 | |
15 | |
16 | |
17 | |
18 | |
19 | |
20 | |
21 | |
22 | |
23 | |
24 | |
25 | |
26 | |
27 | |
28 | |
29 | |
30 | |
31 | |
32 | |
33 | |
34 | |
35 | |
36 | |
37 | |
38 | |
39 | |
40 | |
41 | |
42 | #include "siod.h" |
43 | #include "EST_sigpr.h" |
44 | #include "EST_wave_aux.h" |
45 | #include "EST_track_aux.h" |
46 | #include "EST_ling_class.h" |
47 | #include "us_synthesis.h" |
48 | #include <cmath> |
49 | #include "Phone.h" |
50 | |
51 | using namespace std; |
52 | |
53 | void merge_features(EST_Item *from, EST_Item *to, int keep_id); |
54 | |
55 | void dp_time_align(EST_Utterance &utt, const EST_String &source_name, |
56 | const EST_String &target_name, |
57 | const EST_String &time_name, |
58 | bool do_start); |
59 | |
60 | void concatenate_unit_coefs(EST_Relation &unit_stream, EST_Track &source_lpc); |
61 | void us_unit_raw_concat(EST_Utterance &utt); |
62 | |
63 | void window_units(EST_Relation &unit_stream, |
64 | EST_TVector<EST_Wave> &frames, |
65 | float window_factor, |
66 | EST_String window_name, |
67 | bool window_symmetric, |
68 | EST_IVector *pm_indices=0); |
69 | |
70 | bool dp_match(const EST_Relation &lexical, |
71 | const EST_Relation &surface, |
72 | EST_Relation &match, |
73 | float ins, float del, float sub); |
74 | |
75 | void map_match_times(EST_Relation &target, const EST_String &match_name, |
76 | const EST_String &time_name, bool do_start); |
77 | |
78 | |
79 | static void window_frame(EST_Wave &frame, EST_Wave &whole, float scale, |
80 | int start, int end, EST_WindowFunc *window_function, |
81 | int centre_index=-1) |
82 | { |
83 | int i, j, send; |
84 | EST_TBuffer<float> window; |
85 | int window_length = (end-start)+1; |
86 | |
87 | if (frame.num_samples() != (window_length)) |
88 | frame.resize(window_length); |
89 | frame.set_sample_rate(whole.sample_rate()); |
90 | |
91 | if (end < whole.num_samples()) |
92 | send = end; |
93 | else |
94 | send = whole.num_samples(); |
95 | |
96 | |
97 | int print_centre; |
98 | if ( centre_index < 0 ){ |
99 | window_function( window_length, window, -1 ); |
100 | print_centre = (window_length-1)/2+start; |
101 | } |
102 | else{ |
103 | window_function( window_length, window, (centre_index-start)); |
104 | print_centre = centre_index; |
| Value stored to 'print_centre' is never read |
105 | } |
106 | |
107 | |
108 | #if defined(EST_DEBUGGING) |
109 | cerr << "(start centre end window_length wholewavelen) " |
110 | << start << " " |
111 | << print_centre << " " |
112 | << end << " " |
113 | << window_length << " " |
114 | << whole.num_samples() << endl; |
115 | #endif |
116 | |
117 | |
118 | |
119 | for (i = 0, j = start; j < 0; ++i, ++j) |
120 | frame.a_no_check(i) = 0; |
121 | for ( ; j < send; ++i, ++j) |
122 | frame.a_no_check(i) = (int)((float)whole.a_no_check(j) * window(i) * scale); |
123 | for ( ; j < end; ++j,++i) |
124 | frame.a_no_check(i) = 0; |
125 | |
126 | |
127 | #if defined(EST_DEBUGGING) |
128 | |
129 | |
130 | if( start<0 ) |
131 | EST_warning(EST_error_where = __null), (*EST_warning_func)( "padded start of pitch period with zeros (index %d)", i ); |
132 | |
133 | if( end>whole.num_samples() ) |
134 | EST_warning(EST_error_where = __null), (*EST_warning_func)( "padded end of pitch period with zeros (frame %d)", i ); |
135 | #endif |
136 | } |
137 | |
138 | |
139 | |
140 | |
141 | |
142 | |
143 | |
144 | |
145 | |
146 | |
147 | |
148 | |
149 | |
150 | |
151 | |
152 | |
153 | |
154 | |
155 | |
156 | |
157 | |
158 | |
159 | |
160 | |
161 | void window_signal(EST_Wave &sig, EST_Track &pm, |
162 | EST_WaveVector &frames, int &i, float scale, |
163 | float window_factor, |
164 | EST_WindowFunc *window_function, |
165 | bool window_symmetric, |
166 | EST_IVector *pm_indices=0) |
167 | { |
168 | float first_pos, period=0.0; |
169 | float prev_pm, current_pm; |
170 | int first_sample, centre_sample, last_sample; |
171 | int sample_rate = sig.sample_rate(); |
172 | int pm_num_frames = pm.num_frames(); |
173 | |
174 | |
175 | |
176 | prev_pm = 0.0; |
177 | |
178 | |
179 | if( window_symmetric ) |
180 | { |
181 | if (pm_num_frames < 1 ) |
182 | EST_error(EST_error_where = __null), (*EST_error_func)( "Attempted to Window around less than 1 pitchmark" ); |
183 | |
184 | for( int j=0; j<pm_num_frames; ++j, ++i ){ |
185 | current_pm = pm.t(j); |
186 | period = current_pm - prev_pm; |
187 | centre_sample = (int)rint( current_pm*(float)sample_rate ); |
188 | |
189 | first_pos = prev_pm - (period * (window_factor-1.0)); |
190 | first_sample = (int)rint( first_pos*(float)sample_rate ); |
191 | |
192 | last_sample = (2*centre_sample)-first_sample; |
193 | |
194 | window_frame(frames[i], sig, scale, first_sample, last_sample, window_function); |
195 | |
196 | prev_pm = current_pm; |
197 | } |
198 | } |
199 | else{ |
200 | if( pm_indices == 0 ) |
201 | EST_error(EST_error_where = __null), (*EST_error_func)( "required pitchmark indices EST_IVector is null" ); |
202 | |
203 | int j; |
204 | |
205 | |
206 | |
207 | |
208 | if (pm_num_frames < 1 ) |
209 | { |
210 | EST_warning(EST_error_where = __null), (*EST_warning_func)( "Attempted to Window around less than 1 pitchmark" ); |
211 | } |
212 | else |
213 | { |
214 | for( j=0; j<pm_num_frames-1; ++j, ++i ){ |
215 | current_pm = pm.t(j); |
216 | period = current_pm - prev_pm; |
217 | centre_sample = (int)rint( current_pm*(float)sample_rate ); |
218 | |
219 | first_pos = prev_pm - (period * (window_factor-1.0)); |
220 | first_sample = (int)rint( first_pos*(float)sample_rate ); |
221 | |
222 | float next_pm = pm.t(j+1); |
223 | float last_pos = next_pm + ((next_pm-current_pm)*(window_factor-1.0)); |
224 | last_sample = (int)rint( last_pos*(float)sample_rate ); |
225 | |
226 | window_frame(frames[i], sig, scale, first_sample, |
227 | last_sample, window_function, centre_sample); |
228 | (*pm_indices)[i] = centre_sample - first_sample; |
229 | |
230 | prev_pm = current_pm; |
231 | } |
232 | |
233 | |
234 | |
235 | |
236 | |
237 | |
238 | current_pm = pm.t(j); |
239 | centre_sample = (int)rint( current_pm*(float)sample_rate ); |
240 | first_pos = prev_pm - (period * (window_factor-1.0)); |
241 | first_sample = (int)rint( first_pos*(float)sample_rate ); |
242 | last_sample = sig.num_samples()-1; |
243 | window_frame(frames[i], sig, scale, first_sample, |
244 | last_sample, window_function); |
245 | (*pm_indices)[i] = centre_sample - first_sample; |
246 | |
247 | #if defined(EST_DEBUGGING) |
248 | cerr << "changed: " << i << " " << pm_indices->n() << endl; |
249 | #endif |
250 | |
251 | ++i; |
252 | } |
253 | } |
254 | } |
255 | |
256 | void window_units( EST_Relation &unit_stream, |
257 | EST_TVector<EST_Wave> &frames, |
258 | float window_factor, |
259 | EST_String window_name, |
260 | bool window_symmetric, |
261 | EST_IVector *pm_indices ) |
262 | { |
263 | int i; |
264 | EST_Wave *sig; |
265 | EST_Item *u; |
266 | EST_Track *coefs; |
267 | int num = 0; |
268 | float scale; |
269 | EST_WindowFunc *window_function; |
270 | |
271 | for (u = unit_stream.head(); u; u = u->next()) |
272 | num += track(u->f("coefs"))->num_frames(); |
273 | frames.resize(num); |
274 | |
275 | if( pm_indices != 0 ) |
276 | pm_indices->resize(num); |
277 | |
278 | if (window_name == "") |
279 | window_name = "hanning"; |
280 | |
281 | window_function = EST_Window::creator(window_name); |
282 | |
283 | for (i = 0, u = unit_stream.head(); u; u = u->next()) |
284 | { |
285 | sig = wave(u->f("sig")); |
286 | coefs = track(u->f("coefs")); |
287 | scale = (u->f_present("scale") ? u->F("scale") : 1.0); |
288 | |
289 | window_signal(*sig, *coefs, frames, i, scale, window_factor, |
290 | window_function, window_symmetric, pm_indices); |
291 | } |
292 | } |
293 | |
294 | |
295 | void us_unit_concat(EST_Utterance &utt, float window_factor, |
296 | const EST_String &window_name, |
297 | bool no_waveform=false, |
298 | bool window_symmetric=true) |
299 | |
300 | { |
301 | EST_Relation *unit_stream; |
302 | EST_Track *source_coef = new EST_Track; |
303 | EST_WaveVector *frames = new EST_WaveVector; |
304 | EST_IVector *pm_indices = 0; |
305 | |
306 | unit_stream = utt.relation("Unit", 1); |
307 | |
308 | concatenate_unit_coefs(*unit_stream, *source_coef); |
309 | |
310 | utt.create_relation("SourceCoef"); |
311 | EST_Item *item = utt.relation("SourceCoef")->append(); |
312 | item->set("name", "coef"); |
313 | item->set_val("coefs", est_val(source_coef)); |
314 | |
315 | if (!no_waveform){ |
316 | if( !window_symmetric ) |
317 | pm_indices = new EST_IVector; |
318 | |
319 | window_units(*unit_stream, *frames, |
320 | window_factor, window_name, window_symmetric, pm_indices); |
321 | |
322 | item->set_val("frame", est_val(frames)); |
323 | |
324 | if( !window_symmetric ) |
325 | item->set_val("pm_indices", est_val(pm_indices)); |
326 | } |
327 | } |
328 | |
329 | |
330 | void us_get_copy_wave(EST_Utterance &utt, EST_Wave &source_sig, |
331 | EST_Track &source_coefs, EST_Relation &source_seg) |
332 | { |
333 | EST_Item *s, *n; |
334 | |
335 | if (!utt.relation_present("Segment")) |
336 | EST_error(EST_error_where = __null), (*EST_error_func)("utterance must have \"Segment\" relation\n"); |
337 | |
338 | utt.create_relation("TmpSegment"); |
339 | |
340 | for (s = source_seg.head(); s; s = s->next()) |
341 | { |
342 | n = utt.relation("TmpSegment")->append(); |
343 | merge_features(n, s, 0); |
344 | } |
345 | |
346 | utt.relation("Segment")->remove_item_feature("source_end"); |
347 | |
348 | dp_time_align(utt, "TmpSegment", "Segment", "source_", 0); |
349 | |
350 | utt.create_relation("Unit"); |
351 | EST_Item *d = utt.relation("Unit")->append(); |
352 | |
353 | |
354 | EST_Wave *ss = new EST_Wave; |
355 | *ss = source_sig; |
356 | |
357 | EST_Track *c = new EST_Track; |
358 | *c = source_coefs; |
359 | |
360 | d->set_val("sig", est_val(ss)); |
361 | d->set_val("coefs", est_val(c)); |
362 | |
363 | utt.remove_relation("TmpSegment"); |
364 | } |
365 | |
366 | |
367 | void us_energy_normalise(EST_Relation &unit) |
368 | { |
369 | EST_Wave *sig; |
370 | |
371 | for (EST_Item *s = unit.head(); s; s = s->next()) |
372 | { |
373 | sig = wave(s->f("sig")); |
374 | if (s->f_present("energy_factor")) |
375 | sig->rescale(s->F("energy_factor")); |
376 | } |
377 | } |
378 | |
379 | void us_unit_raw_concat(EST_Utterance &utt) |
380 | { |
381 | EST_Wave *sig, *unit_sig; |
382 | EST_Track *unit_coefs=0; |
383 | float window_factor; |
384 | int i, j, k; |
385 | int first_pm, last_pm, last_length; |
386 | float first_pos, last_pos; |
387 | |
388 | window_factor = get_c_float(siod_get_lval("window_factor", |
389 | "UniSyn: no window_factor")); |
390 | sig = new EST_Wave; |
391 | |
392 | sig->resize(1000000); |
393 | sig->fill(0); |
394 | j = 0; |
395 | |
396 | for (EST_Item *s = utt.relation("Unit", 1)->head(); s; s = s->next()) |
397 | { |
398 | unit_sig = wave(s->f("sig")); |
399 | unit_coefs = track(s->f("coefs")); |
400 | |
401 | first_pos = unit_coefs->t(1); |
402 | first_pm = (int)(first_pos * (float)unit_sig->sample_rate()); |
403 | |
404 | last_pos = unit_coefs->t(unit_coefs->num_frames()-2); |
405 | last_pm = (int)(last_pos * (float)unit_sig->sample_rate()); |
406 | last_length = unit_sig->num_samples() - last_pm; |
407 | |
408 | |
409 | |
410 | |
411 | |
412 | j -= first_pm; |
413 | |
414 | for (i = 0; i < first_pm; ++i, ++j) |
415 | sig->a_safe(j) += (short)((((float) i)/ (float)first_pm) *(float)unit_sig->a_safe(i)+0.5); |
416 | |
417 | for (; i < last_pm; ++i, ++j) |
418 | sig->a(j) = unit_sig->a(i); |
419 | |
420 | for (k = 0; i < unit_sig->num_samples(); ++i, ++j, ++k) |
421 | sig->a_safe(j) += (short)((1.0 - (((float) k) / (float) last_length)) |
422 | * (float)unit_sig->a_safe(i) + 0.5); |
423 | |
424 | |
425 | |
426 | } |
427 | |
428 | sig->resize(j); |
429 | sig->set_sample_rate(16000); |
430 | |
431 | add_wave_to_utterance(utt, *sig, "Wave"); |
432 | } |
433 | |
434 | |
435 | void concatenate_unit_coefs(EST_Relation &unit_stream, EST_Track &source_lpc) |
436 | { |
437 | int num_source_frames = 0; |
438 | int num_source_channels = 0;; |
439 | float prev_time, abs_offset, rel_offset, period, offset; |
440 | int i, j, k, l; |
441 | EST_Track *coefs; |
442 | |
443 | EST_Item *u = unit_stream.head(); |
444 | if( u == 0 ){ |
445 | |
446 | |
447 | source_lpc.resize(0,0); |
448 | } |
449 | else{ |
450 | EST_Track *t = 0; |
451 | for ( ; u; u = u->next()) |
452 | { |
453 | t = track(u->f("coefs")); |
454 | num_source_frames += t->num_frames(); |
455 | } |
456 | |
457 | num_source_channels = t->num_channels(); |
458 | |
459 | source_lpc.resize(num_source_frames, num_source_channels); |
460 | source_lpc.copy_setup(*t); |
461 | |
462 | prev_time = 0.0; |
463 | |
464 | for (i = 0, l = 0, u = unit_stream.head(); u; u = u->next()) |
465 | { |
466 | coefs = track(u->f("coefs")); |
467 | |
468 | for (j = 0; j < coefs->num_frames(); ++j, ++i) |
469 | { |
470 | for (k = 0; k < coefs->num_channels(); ++k) |
471 | source_lpc.a_no_check(i, k) = coefs->a_no_check(j, k); |
472 | source_lpc.t(i) = coefs->t(j) + prev_time; |
473 | } |
474 | |
475 | prev_time = source_lpc.t(i - 1); |
476 | u->set("end", prev_time); |
477 | u->set("num_frames", coefs->num_frames()); |
478 | } |
479 | } |
480 | |
481 | |
482 | abs_offset = 0.0; |
483 | rel_offset = 0.0; |
484 | |
485 | abs_offset = get_c_float(siod_get_lval("us_abs_offset", "zz")); |
486 | |
487 | rel_offset = get_c_float(siod_get_lval("us_rel_offset", "zz")); |
488 | |
489 | if( abs_offset!=0.0 || rel_offset!=0.0 ){ |
490 | std::cerr << "Adjusting pitchmarks" << std::endl; |
491 | for (i = 0; i < source_lpc.num_frames(); ++i){ |
492 | period = get_time_frame_size(source_lpc, (i)); |
493 | offset = abs_offset + (rel_offset * period); |
494 | source_lpc.t(i) = source_lpc.t(i) + offset; |
495 | } |
496 | } |
497 | } |
498 | |
499 | |
500 | |
501 | |
502 | |
503 | |
504 | |
505 | |
506 | |
507 | |
508 | |
509 | |
510 | |
511 | |
512 | |
513 | |
514 | |
515 | |
516 | |
517 | |
518 | |
519 | |
520 | |
521 | |
522 | |
523 | |
524 | |
525 | |
526 | |
527 | |
528 | |
529 | |
530 | |
531 | |
532 | |
533 | |
534 | |
535 | |
536 | |
537 | |
538 | |
539 | |
540 | |
541 | static EST_Track* us_pitch_period_energy_contour( const EST_WaveVector &pp, |
542 | const EST_Track &pm ) |
543 | { |
544 | const int pp_length = pp.length(); |
545 | |
546 | EST_Track *contour = new EST_Track; |
547 | contour->resize( pp_length, 1 ); |
548 | |
549 | for( int i=0; i<pp_length; ++i ){ |
550 | const EST_Wave &frame = pp(i); |
551 | const int frame_length = frame.length(); |
552 | |
553 | |
554 | int j; |
555 | for( contour->a_no_check(i,0) = 0.0, j=0; j<frame_length; ++j ) |
556 | contour->a_no_check( i, 0 ) += pow( float(frame.a_no_check( j )), float(2.0) ); |
557 | |
558 | contour->a_no_check(i,0) = sqrt( contour->a_no_check(i,0) / (float)j ); |
559 | contour->t(i) = pm.t(i); |
560 | } |
561 | |
562 | return contour; |
563 | } |
564 | |
565 | EST_Val ffeature(EST_Item *item,const EST_String &fname); |
566 | |
567 | void us_linear_smooth_amplitude( EST_Utterance *utt ) |
568 | { |
569 | EST_WaveVector *pp = wavevector(utt->relation("SourceCoef")->first()->f("frame")); |
570 | EST_Track *pm = track(utt->relation("SourceCoef")->first()->f("coefs")); |
571 | |
572 | EST_Track *energy = us_pitch_period_energy_contour( *pp, *pm ); |
573 | energy->save( "./energy_track.est", "est" ); |
574 | |
575 | FILE *ofile = fopen( "./join_times.est", "w" ); |
576 | EST_Relation *units = utt->relation("Unit"); |
577 | for( EST_Item *u=units->head(); u; u=u->next() ){ |
578 | |
579 | EST_Item *diphone_left = u; |
580 | |
581 | |
582 | fprintf( ofile, "%s\t%f\n", diphone_left->S("name").str(), diphone_left->F("end")); |
583 | |
584 | EST_Item *join_phone_left = item(diphone_left->f("ph1"))->next(); |
585 | EST_String phone_name = join_phone_left->S("name"); |
586 | if( ph_is_sonorant( phone_name ) && !ph_is_silence( phone_name )){ |
587 | |
588 | |
589 | |
590 | std::cerr << "smoothing phone " << join_phone_left->S("name") << std::endl; |
591 | |
592 | |
593 | |
594 | int left_end_index = energy->index(diphone_left->F("end")); |
595 | int right_start_index = left_end_index + 1; |
596 | float left_power = energy->a(left_end_index,0); |
597 | float right_power = energy->a(right_start_index,0); |
598 | |
599 | float mean_power = (left_power+right_power)/2.0; |
600 | float left_factor = left_power/mean_power; |
601 | float right_factor = right_power/mean_power; |
602 | |
603 | int smooth_start_index = left_end_index-5; |
604 | int smooth_end_index = right_start_index+5; |
605 | |
606 | |
607 | |
608 | float factor = 1.0; |
609 | float factor_incr = (left_factor-1.0)/(float)(left_end_index - smooth_start_index); |
610 | for( int i=smooth_start_index; i<=left_end_index; ++i, factor+=factor_incr ){ |
611 | (*pp)[i].rescale( factor, 0 ); |
612 | std::cerr << "rescaled frame " << i << "(factor " << factor << ")" << std::endl; |
613 | } |
614 | |
615 | |
616 | factor = right_factor; |
617 | factor_incr = (1.0-right_factor)/(float)(smooth_end_index-right_start_index); |
618 | for( int i=right_start_index; i<=smooth_end_index; ++i, factor+=factor_incr){ |
619 | (*pp)[i].rescale( factor, 0 ); |
620 | std::cerr << "rescaled frame " << i << "(factor " << factor << ")" << std::endl; |
621 | } |
622 | } |
623 | else |
624 | std::cerr << "no smoothing for " << join_phone_left->S("name") << std::endl; |
625 | |
626 | std::cerr << std::endl; |
627 | } |
628 | |
629 | fclose( ofile ); |
630 | delete energy; |
631 | } |
632 | |