1 | |
2 | |
3 | |
4 | |
5 | |
6 | |
7 | |
8 | |
9 | |
10 | |
11 | |
12 | |
13 | |
14 | |
15 | |
16 | |
17 | |
18 | |
19 | |
20 | |
21 | |
22 | |
23 | |
24 | |
25 | |
26 | |
27 | |
28 | |
29 | |
30 | |
31 | |
32 | |
33 | |
34 | |
35 | |
36 | |
37 | |
38 | |
39 | |
40 | |
41 | #include "EST_error.h" |
42 | #include "EST_inline_utils.h" |
43 | #include "us_synthesis.h" |
44 | |
45 | #include "Phone.h" |
46 | |
47 | #include <fstream> |
48 | |
49 | using namespace std; |
50 | |
51 | |
52 | |
53 | |
54 | |
55 | |
56 | |
57 | |
58 | |
59 | |
60 | |
61 | |
62 | |
63 | |
64 | |
65 | |
66 | |
67 | |
68 | |
69 | |
70 | |
71 | |
72 | |
73 | |
74 | |
75 | |
76 | |
77 | |
78 | |
79 | |
80 | |
81 | |
82 | |
83 | |
84 | |
85 | |
86 | |
87 | |
88 | |
89 | |
90 | |
91 | |
92 | |
93 | |
94 | |
95 | |
96 | |
97 | |
98 | |
99 | |
100 | |
101 | |
102 | |
103 | |
104 | |
105 | void make_segment_single_mapping(EST_Relation &source_lab, |
106 | EST_Track &source_pm, |
107 | EST_Track &target_pm, EST_IVector &map) |
108 | { |
109 | int i = 0; |
110 | int s_i_start, s_i_end, t_i_start, t_i_end; |
111 | EST_Item *s; |
112 | float s_end, s_start, t_end, t_start, m; |
113 | map.resize(target_pm.num_frames()); |
114 | |
115 | s_start = t_start = 0.0; |
116 | s_i_start = t_i_start = 0; |
117 | |
118 | if (target_pm.t(target_pm.num_frames() - 1) < |
119 | source_lab.tail()->F("end",0)) |
120 | { |
121 | EST_warning(EST_error_where = __null), (*EST_warning_func)("Target pitchmarks end before end of target segment " |
122 | "timings (%f vs %f). Expect a truncated utterance\n", |
123 | target_pm.t(target_pm.num_frames() - 1), |
124 | source_lab.tail()->F("end",0.0)); |
125 | } |
126 | |
127 | |
128 | |
129 | for (s = source_lab.head(); s; s = s->next()) |
130 | { |
131 | |
132 | |
133 | |
134 | s_end = s->F("source_end"); |
135 | t_end = s->F("end"); |
136 | |
137 | s_i_end = source_pm.index_below(s_end); |
138 | t_i_end = target_pm.index_below(t_end); |
139 | |
140 | |
141 | if (s_i_end <= s_i_start) |
142 | s_i_end += 1; |
143 | |
144 | |
145 | |
146 | |
147 | |
148 | |
149 | |
150 | |
151 | |
152 | |
153 | |
154 | m = (t_end-t_start)/(s_end-s_start); |
155 | |
156 | |
157 | |
158 | |
159 | float apm_t_off = (s_i_start==0) ? 0.0 : source_pm.t(s_i_start-1); |
160 | float tpm_t_off = (t_i_start==0) ? 0.0 : target_pm.t(t_i_start-1); |
161 | |
162 | |
163 | |
164 | int apm_i = s_i_start; |
165 | float apm_t = source_pm.t(apm_i)-apm_t_off; |
166 | float next_apm_t = source_pm.t(apm_i+1)-apm_t_off; |
167 | |
168 | for( i=t_i_start; i<=t_i_end; ++i ){ |
169 | float tpm_t = target_pm.t(i)-tpm_t_off; |
170 | |
171 | |
172 | |
173 | while( (apm_i<=s_i_end) && (fabs((next_apm_t*m)-tpm_t) <= fabs((apm_t*m)-tpm_t)) ){ |
174 | |
175 | |
176 | apm_t = next_apm_t; |
177 | ++apm_i; |
178 | next_apm_t = source_pm.t(apm_i+1)-apm_t_off; |
179 | } |
180 | |
181 | |
182 | |
183 | |
184 | |
185 | |
186 | |
187 | map[i] = apm_i; |
188 | } |
189 | |
190 | |
191 | s_i_start = s_i_end+1; |
192 | t_i_start = t_i_end+1; |
193 | s_start = source_pm.t(s_i_start); |
194 | t_start = target_pm.t(t_i_start); |
195 | |
196 | } |
197 | if (i == 0) |
198 | map.resize(0); |
199 | else |
200 | map.resize(i); |
201 | } |
202 | |
203 | |
204 | void make_linear_mapping(EST_Track &pm, EST_IVector &map) |
205 | { |
206 | int pm_num_frames = pm.num_frames(); |
207 | |
208 | map.resize(pm_num_frames); |
209 | |
210 | for (int i = 0; i < pm_num_frames; ++i) |
211 | map[i] = i; |
212 | } |
213 | |
214 | |
215 | static bool contiguous( const EST_Item*left, const EST_Item* right ) |
216 | { |
217 | if( (item(left->f("source_ph1")))->next() == item(right->f("source_ph1")) ) |
218 | return true; |
219 | |
220 | return false; |
221 | } |
222 | |
223 | |
224 | |
225 | |
226 | static void pitchmarksToSpaces( const EST_Track &pm, EST_IVector *spaces, |
227 | int start_pm, int end_pm, int wav_srate ) |
228 | { |
229 | int left_pm, right_pm; |
230 | int num_frames = end_pm-start_pm; |
231 | spaces->resize( num_frames, 0 ); |
232 | |
233 | left_pm = (int) rint( pm.t(start_pm)*wav_srate ); |
234 | for( int i=0; i<num_frames; ++i ){ |
235 | right_pm = (int) rint( pm.t(start_pm+i+1)*wav_srate ); |
236 | (*spaces)[i] = right_pm - left_pm; |
237 | left_pm = right_pm; |
238 | } |
239 | } |
240 | |
241 | |
242 | void make_join_interpolate_mapping( const EST_Track &source_pm, |
243 | EST_Track &target_pm, |
244 | const EST_Relation &units, |
245 | EST_IVector &map ) |
246 | { |
247 | |
248 | |
249 | float wav_srate = wave(units.head()->f("sig"))->sample_rate(); |
250 | |
251 | |
252 | |
253 | int target_pm_length = source_pm.length(); |
254 | target_pm.resize(target_pm_length, source_pm.num_channels()); |
255 | |
256 | |
257 | EST_IVector source_spacing(target_pm_length); |
258 | EST_IVector target_spacing(target_pm_length); |
259 | EST_IVector voicing(target_pm_length); |
260 | |
261 | |
262 | EST_Item *diphone_left = units.head(); |
263 | |
264 | int left_start_index = diphone_left->I("middle_frame"); |
265 | int left_end_index = source_pm.index(diphone_left->F("end")); |
266 | |
267 | for( int i=0; i<left_start_index; ++i ){ |
268 | target_pm.t(i) = source_pm.t(i); |
269 | voicing[i] = 0; |
270 | } |
271 | |
272 | for( EST_Item *diphone_right=diphone_left->next(); |
273 | diphone_right; |
274 | diphone_right=diphone_left->next() ){ |
275 | |
276 | printf( "%s\t%f\n", diphone_left->S("name").str(), diphone_left->F("end")); |
277 | |
278 | int right_start_index = left_end_index + 1; |
279 | int right_end_index = right_start_index + diphone_right->I("middle_frame"); |
280 | |
281 | printf( "%d %d %d %d (l_start, l_end, r_start, r_end\n", |
282 | left_start_index, |
283 | left_end_index, |
284 | right_start_index, |
285 | right_end_index ); |
286 | |
287 | EST_String join_phone_name = item(diphone_left->f("ph1"))->next()->S("name"); |
288 | |
289 | cerr << "phone contigous " << contiguous(diphone_left,diphone_right) << endl; |
290 | |
291 | |
292 | int voicing_val; |
293 | if( ph_is_sonorant( join_phone_name ) && |
294 | ! ph_is_silence( join_phone_name ) ){ |
295 | voicing_val = 1; |
296 | } |
297 | else |
298 | voicing_val = 0; |
299 | |
300 | for( int i=left_start_index; i<right_end_index; ++i ) |
301 | voicing[i] = voicing_val; |
302 | |
303 | |
304 | |
305 | |
306 | if( (!contiguous(diphone_left,diphone_right)) && |
307 | ph_is_sonorant( join_phone_name ) && |
308 | (!ph_is_silence( join_phone_name) )){ |
309 | |
310 | cerr << "smoothing phone " << join_phone_name << "\n"; |
311 | |
312 | printf( "** Calculating spaces **\n" ); |
313 | |
314 | |
315 | |
316 | EST_IVector spaces; |
317 | |
318 | pitchmarksToSpaces( source_pm, &spaces, |
319 | left_start_index, right_end_index, |
320 | (int)wav_srate ); |
321 | |
322 | int num_frames = right_end_index-left_start_index; |
323 | |
324 | printf( "** Adjusting spaces**\n" ); |
325 | |
326 | |
327 | |
328 | |
329 | |
330 | |
331 | |
332 | |
333 | |
334 | |
335 | |
336 | |
337 | |
338 | |
339 | |
340 | |
341 | |
342 | |
343 | |
344 | |
345 | int join_i = left_end_index-left_start_index; |
346 | int joindiff = spaces[join_i+1] - spaces[join_i]; |
347 | |
348 | |
349 | const unsigned int DEFAULT_SMOOTHN = 5; |
350 | |
351 | |
352 | |
353 | int smoothn = min( DEFAULT_SMOOTHN, join_i ); |
354 | |
355 | for( int i=0; i<smoothn; ++i ) |
356 | spaces[join_i-i] += (int) rint(joindiff*((float)(smoothn-i)/(2*smoothn))); |
357 | |
358 | |
359 | joindiff = -joindiff; |
360 | smoothn = min( DEFAULT_SMOOTHN, num_frames-join_i ); |
361 | |
362 | for( int i=0; i<smoothn; ++i ) |
363 | spaces[join_i+1+i] += (int) rint( joindiff*((float)(smoothn-i)/(2*smoothn))); |
364 | |
365 | |
366 | |
367 | |
368 | printf( "** using modified spaces ** \n" ); |
369 | |
370 | for( int i=left_start_index; i<right_end_index; ++i ){ |
371 | printf( "Using space %d for target pitchmark %d\n", i-left_start_index, i ); |
372 | target_pm.t(i) = target_pm.t(i-1) + ((float)spaces[i-left_start_index]/wav_srate); |
373 | } |
374 | } |
375 | else{ |
376 | cerr << "no smoothing for " << join_phone_name << "\n"; |
377 | for( int i=left_start_index; i<right_end_index; ++i ){ |
378 | printf( "Using source pm %d for target pitchmark %d\n", i, i ); |
379 | target_pm.t(i) = source_pm.t(i); |
380 | } |
381 | } |
382 | |
383 | cerr <<endl; |
384 | |
385 | |
386 | left_start_index = right_end_index; |
387 | left_end_index = source_pm.index( diphone_right->F("end") ); |
388 | diphone_left = diphone_right; |
389 | } |
390 | |
391 | |
392 | for( int i=left_start_index; i<target_pm_length; ++i ) |
393 | target_pm.t(i) = source_pm.t(i); |
394 | |
395 | make_linear_mapping( target_pm, map ); |
396 | |
397 | |
398 | |
399 | |
400 | |
401 | pitchmarksToSpaces( source_pm, |
402 | &source_spacing, |
403 | 0, target_pm_length-1, |
404 | (int)wav_srate ); |
405 | |
406 | ofstream outfile( "/home/korin/projects/smoothing_temp/f0/source_spacing.est" ); |
407 | if( !outfile ) |
408 | EST_error(EST_error_where = __null), (*EST_error_func)( "Couldn't open source pitchmark spacing output file" ); |
409 | |
410 | outfile << source_spacing << endl; |
411 | outfile.close(); |
412 | |
413 | |
414 | pitchmarksToSpaces( target_pm, |
415 | &target_spacing, |
416 | 0, target_pm_length-1, |
417 | (int)wav_srate ); |
418 | |
419 | ofstream afterfile( "/home/korin/projects/smoothing_temp/f0/target_spacing.est" ); |
420 | if( !afterfile) |
421 | EST_error(EST_error_where = __null), (*EST_error_func)( "Couldn't open target pitchmark spacing output file" ); |
422 | |
423 | afterfile << target_spacing << endl; |
424 | afterfile.close(); |
425 | |
426 | ofstream voicingfile( "/home/korin/projects/smoothing_temp/f0/voicing.est" ); |
427 | if( !voicingfile) |
428 | EST_error(EST_error_where = __null), (*EST_error_func)( "Couldn't open target pitchmark spacing output file" ); |
429 | |
430 | voicingfile << voicing << endl; |
431 | voicingfile.close(); |
432 | } |
433 | |
434 | void make_join_interpolate_mapping2( const EST_Track &source_pm, |
435 | EST_Track &target_pm, |
436 | const EST_Relation &units, |
437 | EST_IVector &map ) |
438 | { |
439 | |
440 | |
441 | float wav_srate = wave(units.head()->f("sig"))->sample_rate(); |
442 | |
443 | |
444 | |
445 | int target_pm_length = source_pm.length(); |
446 | target_pm.resize(target_pm_length, source_pm.num_channels()); |
447 | |
448 | |
449 | EST_IVector source_spacing(target_pm_length); |
450 | EST_IVector target_spacing(target_pm_length); |
451 | EST_IVector voicing(target_pm_length); |
452 | |
453 | |
454 | EST_Item *diphone_left = units.head(); |
455 | |
456 | int left_start_index = diphone_left->I("middle_frame"); |
457 | int left_end_index = source_pm.index(diphone_left->F("end")); |
458 | |
459 | for( int i=0; i<left_start_index; ++i ){ |
460 | target_pm.t(i) = source_pm.t(i); |
461 | voicing[i] = 0; |
462 | } |
463 | |
464 | for( EST_Item *diphone_right=diphone_left->next(); |
465 | diphone_right; |
466 | diphone_right=diphone_left->next() ){ |
467 | |
468 | printf( "%s\t%f\n", diphone_left->S("name").str(), diphone_left->F("end")); |
469 | |
470 | int right_start_index = left_end_index + 1; |
471 | int right_end_index = right_start_index + diphone_right->I("middle_frame"); |
472 | |
473 | printf( "%d %d %d %d (l_start, l_end, r_start, r_end\n", |
474 | left_start_index, |
475 | left_end_index, |
476 | right_start_index, |
477 | right_end_index ); |
478 | |
479 | EST_String join_phone_name = item(diphone_left->f("ph1"))->next()->S("name"); |
480 | |
481 | cerr << "phone contigous " << contiguous(diphone_left,diphone_right) << endl; |
482 | |
483 | |
484 | int voicing_val; |
485 | if( ph_is_sonorant( join_phone_name ) && |
486 | ! ph_is_silence( join_phone_name ) ){ |
487 | voicing_val = 1; |
488 | } |
489 | else |
490 | voicing_val = 0; |
491 | |
492 | for( int i=left_start_index; i<right_end_index; ++i ) |
493 | voicing[i] = voicing_val; |
494 | |
495 | |
496 | |
497 | |
498 | |
499 | |
500 | |
501 | |
502 | |
503 | |
504 | |
505 | |
506 | |
507 | |
508 | |
509 | |
510 | |
511 | |
512 | |
513 | |
514 | |
515 | |
516 | |
517 | |
518 | |
519 | |
520 | |
521 | |
522 | |
523 | |
524 | |
525 | |
526 | |
527 | |
528 | |
529 | |
530 | |
531 | |
532 | |
533 | |
534 | |
535 | |
536 | |
537 | |
538 | |
539 | |
540 | |
541 | |
542 | |
543 | |
544 | |
545 | |
546 | cerr << "no smoothing for " << join_phone_name << "\n"; |
547 | for( int i=left_start_index; i<right_end_index; ++i ){ |
548 | printf( "Using source pm %d for target pitchmark %d\n", i, i ); |
549 | target_pm.t(i) = source_pm.t(i); |
550 | } |
551 | |
552 | |
553 | cerr <<endl; |
554 | |
555 | |
556 | left_start_index = right_end_index; |
557 | left_end_index = source_pm.index( diphone_right->F("end") ); |
558 | diphone_left = diphone_right; |
559 | } |
560 | |
561 | |
562 | for( int i=left_start_index; i<target_pm_length; ++i ) |
563 | target_pm.t(i) = source_pm.t(i); |
564 | |
565 | make_linear_mapping( target_pm, map ); |
566 | |
567 | |
568 | |
569 | |
570 | |
571 | pitchmarksToSpaces( source_pm, |
572 | &source_spacing, |
573 | 0, target_pm_length-1, |
574 | (int)wav_srate ); |
575 | |
576 | ofstream outfile( "/home/korin/projects/smoothing_temp/f0/source_spacing.est" ); |
577 | if( !outfile ) |
578 | EST_error(EST_error_where = __null), (*EST_error_func)( "Couldn't open source pitchmark spacing output file" ); |
579 | |
580 | outfile << source_spacing << endl; |
581 | outfile.close(); |
582 | |
583 | |
584 | pitchmarksToSpaces( target_pm, |
585 | &target_spacing, |
586 | 0, target_pm_length-1, |
587 | (int)wav_srate ); |
588 | |
589 | ofstream afterfile( "/home/korin/projects/smoothing_temp/f0/target_spacing.est" ); |
590 | if( !afterfile) |
591 | EST_error(EST_error_where = __null), (*EST_error_func)( "Couldn't open target pitchmark spacing output file" ); |
592 | |
593 | afterfile << target_spacing << endl; |
594 | afterfile.close(); |
595 | |
596 | ofstream voicingfile( "/home/korin/projects/smoothing_temp/f0/voicing.est" ); |
597 | if( !voicingfile) |
598 | EST_error(EST_error_where = __null), (*EST_error_func)( "Couldn't open target pitchmark spacing output file" ); |
599 | |
600 | voicingfile << voicing << endl; |
601 | voicingfile.close(); |
602 | |
603 | if( const_cast<EST_Track&>(source_pm).save( "/home/korin/projects/smoothing_temp/f0/sourceCoef.est" ) |
604 | != write_ok ) |
605 | EST_warning(EST_error_where = __null), (*EST_warning_func)( "couldn't write sourceCoef.est file" ); |
606 | } |
607 | |
608 | |
609 | void us_mapping(EST_Utterance &utt, const EST_String &method) |
610 | { |
611 | EST_Relation *source_lab, *target_lab; |
612 | EST_IVector *map; |
613 | EST_Track *source_coef=0, *target_coef=0; |
614 | |
615 | source_coef = track(utt.relation("SourceCoef")->head()->f("coefs")); |
616 | target_coef = track(utt.relation("TargetCoef")->head()->f("coefs")); |
617 | |
618 | map = new EST_IVector; |
619 | |
620 | |
621 | if (method != "segment_single") |
622 | source_lab = utt.relation("SourceSegments"); |
| Value stored to 'source_lab' is never read |
623 | target_lab = utt.relation("Segment", 1); |
624 | |
625 | |
626 | |
627 | |
628 | |
629 | |
630 | |
631 | |
632 | if (method == "linear") |
633 | make_linear_mapping(*source_coef, *map); |
634 | else if (method == "segment_single") |
635 | make_segment_single_mapping(*target_lab, *source_coef, |
636 | *target_coef, *map); |
637 | else if (method == "interpolate_joins"){ |
638 | cerr << "Doing interpolate_joins\n"; |
639 | EST_Relation *units = utt.relation("Unit"); |
640 | make_join_interpolate_mapping(*source_coef, *target_coef, *units,*map); |
641 | } |
642 | else if (method == "interpolate_joins2"){ |
643 | cerr << "Doing interpolate_joins2\n"; |
644 | EST_Relation *units = utt.relation("Unit"); |
645 | make_join_interpolate_mapping2(*source_coef, *target_coef, *units,*map); |
646 | } |
647 | else |
648 | EST_error(EST_error_where = __null), (*EST_error_func)("Mapping method \"%s\" not found\n", (const char *)method); |
649 | |
650 | utt.create_relation("US_map"); |
651 | EST_Item *item = utt.relation("US_map")->append(); |
652 | item->set_val("map", est_val(map)); |
653 | } |
654 | |
655 | |
656 | void add_wave_to_utterance(EST_Utterance &u, EST_Wave &sig, |
657 | const EST_String &name) |
658 | { |
659 | u.create_relation(name); |
660 | EST_Item *item = u.relation(name)->append(); |
661 | item->set_val("wave", est_val(&sig)); |
662 | } |
663 | |
664 | void map_to_relation(EST_IVector &map, EST_Relation &r, |
665 | const EST_Track &source_pm, |
666 | const EST_Track &target_pm) |
667 | { |
668 | EST_Item *s, *t, *a=NULL__null; |
669 | EST_Utterance *u = r.utt(); |
670 | int i; |
671 | |
672 | |
673 | |
674 | |
675 | u->create_relation("smap"); |
676 | u->create_relation("tmap"); |
677 | |
678 | for (i = 0; i < source_pm.num_frames(); ++i) |
679 | { |
680 | s = u->relation("smap")->append(); |
681 | s->set("index", i); |
682 | s->set("end", source_pm.t(i)); |
683 | } |
684 | |
685 | for (i = 0; i < target_pm.num_frames(); ++i) |
686 | { |
687 | s = u->relation("tmap")->append(); |
688 | s->set("index", i); |
689 | s->set("end", target_pm.t(i)); |
690 | } |
691 | |
692 | EST_Item *last_s = 0; |
693 | |
694 | for (s = u->relation("smap")->head(); s; s = s->next()) |
695 | { |
696 | int n = s->I("index"); |
697 | for (t = u->relation("tmap")->head(); t; t = t->next()) |
698 | { |
699 | if (map(t->I("index")) == n) |
700 | { |
701 | if (last_s != s) |
702 | a = u->relation("lmap")->append(s); |
703 | last_s = s; |
704 | a->append_daughter(t); |
705 | t->set("map", n); |
706 | } |
707 | } |
708 | } |
709 | } |
710 | |
711 | |
712 | |
713 | |
714 | |
715 | |
716 | |
717 | |
718 | |
719 | |
720 | |
721 | |
722 | |
723 | |
724 | |
725 | |
726 | |
727 | |
728 | |
729 | |
730 | |
731 | |
732 | |
733 | |
734 | |
735 | |
736 | |
737 | |
738 | |
739 | |
740 | |
741 | |
742 | |
743 | |
744 | |
745 | |
746 | |
747 | |
748 | |
749 | |
750 | |
751 | |
752 | |
753 | |
754 | |
755 | |
756 | |
757 | |
758 | |
759 | |
760 | |
761 | |
762 | |
763 | |
764 | |
765 | |
766 | |
767 | |
768 | |
769 | |
770 | |
771 | |
772 | |
773 | |
774 | |
775 | |
776 | |
777 | |
778 | |
779 | |
780 | |
781 | |
782 | |
783 | |
784 | |
785 | |
786 | |
787 | |
788 | |
789 | |
790 | |
791 | |
792 | |
793 | |
794 | |
795 | |
796 | |
797 | |
798 | |
799 | |
800 | |
801 | |
802 | |
803 | |
804 | |
805 | |
806 | |
807 | |
808 | |
809 | |
810 | |
811 | |
812 | |
813 | |
814 | |
815 | |
816 | |
817 | |
818 | |
819 | |
820 | |
821 | |
822 | |
823 | |
824 | |
825 | |
826 | |
827 | |
828 | |
829 | |
830 | |
831 | |
832 | |
833 | |
834 | |
835 | |
836 | |
837 | |
838 | |
839 | |
840 | |
841 | |
842 | |
843 | |
844 | |
845 | |
846 | |
847 | |
848 | |
849 | |
850 | |
851 | |
852 | |
853 | |
854 | |
855 | |
856 | |
857 | |
858 | |