Edinburgh Speech Tools  2.1-release
ngram_build_main.cc
Go to the documentation of this file.
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1995,1996 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Authors: Simon King */
34 /* Date : July 1995 */
35 /*-----------------------------------------------------------------------*/
36 /* EST_Ngrammar build program */
37 /* */
38 /*=======================================================================*/
39 
40 #include <cstdlib>
41 #include "EST.h"
42 #include "EST_Ngrammar.h"
43 #include "EST_Pathname.h"
44 
45 using namespace std;
46 
47 int main(int argc, char **argv)
48 {
49  int order;
50  EST_StrList files;
51  EST_Option al, op;
52  EST_String wordlist_file,wordlist_file2, out_file, format;
53  EST_String prev_tag(""), prev_prev_tag(""), last_tag("");
54  EST_String input_format(""), oov_mode(""), oov_marker("");
55  EST_Ngrammar::representation_t representation =
57 
58  EST_StrList wordlist,wordlist2;
59  EST_Ngrammar ngrammar;
60  bool trace=false;
61  double floor=0.0;
62 
64  (argc, argv,
65  EST_String("[input file0] [input file1] ... -o [output file]\n")+
66  "-w <ifile> filename containing word list (required)\n"+
67  "-p <ifile> filename containing predictee word list\n"+
68  " (default is to use wordlist given by -w)\n"+
69  "-order <int> order, 1=unigram, 2=bigram etc. (default 2)\n"+
70  "-smooth <int> Good-Turing smooth the grammar up to the\n"+
71  " given frequency\n"+
72  "-o <ofile> Output file for constructed ngram\n"+
73  "\n"
74  "-input_format <string>\n"+
75  " format of input data (default sentence_per_line)\n"+
76  " may be sentence_per_file, ngram_per_line.\n"+
77  "-otype <string> format of output file, one of cstr_ascii\n"+
78  " cstr_bin or htk_ascii\n"+
79  "-sparse build ngram in sparse representation\n"+
80  "-dense build ngram in dense representation (default)\n"+
81  "-backoff <int>\n"+
82  " build backoff ngram (requires -smooth)\n"+
83  "-floor <double>\n"+
84  " frequency floor value used with some ngrams\n"+
85  "-freqsmooth <int>\n"+
86  " build frequency backed off smoothed ngram, this\n"+
87  " requires -smooth option\n"+
88  "-trace give verbose outout about build process\n"+
89  "-save_compressed save ngram in gzipped format\n"+
90  "-oov_mode <string>\n"+
91  " what to do about out-of-vocabulary words,\n"+
92  " one of skip_ngram, skip_sentence (default),\n"+
93  " skip_file, or use_oov_marker\n"+
94  "-oov_marker <string>\n"+
95  " special word for oov words (default "+OOV_MARKER+")\n"+
96  " (use in conjunction with '-oov_mode use_oov_marker'\n"+
97  "\n"+
98  "Pseudo-words :\n"+
99  "-prev_tag <string>\n"+
100  " tag before sentence start\n"+
101  "-prev_prev_tag <string>\n"+
102  " all words before 'prev_tag'\n"+
103  "-last_tag <string>\n"+
104  " after sentence end\n"+
105  "-default_tags use default tags of "+SENTENCE_START_MARKER+
106  ","+SENTENCE_END_MARKER+" and "+SENTENCE_END_MARKER+"\n"+
107  " respectively\n",
108  files, al);
109 
110  if (al.present("-input_format"))
111  input_format = al.val("-input_format");
112  else
113  input_format = "sentence_per_line";
114 
115  if (al.present("-oov_mode"))
116  oov_mode = al.val("-oov_mode");
117  else
118  oov_mode = "skip_sentence";
119 
120 
121  if(al.present("-oov_marker"))
122  {
123  if(oov_mode != "use_oov_marker")
124  {
125  cerr << "Error : can only use -oov_marker with '-oov_mode use_oov_marker'" << endl;
126  exit(1);
127  }
128  else
129  oov_marker = al.val("-oov_marker");
130 
131  // should check oov marker is/isn't (?) in vocab
132  // ......
133  }
134 
135  if( (oov_mode != "skip_ngram") &&
136  (oov_mode != "skip_sentence") &&
137  (oov_mode != "skip_file") &&
138  (oov_mode != "use_oov_marker") )
139  {
140  cerr << oov_mode << " is not a valid oov_mode !" << endl;
141  exit(1);
142  }
143 
144  if (al.present("-w"))
145  wordlist_file = al.val("-w");
146  else{
147  cerr << "build_ngram: Must specify a wordlist with -w" << endl;
148  exit(1);
149  }
150 
151  if (load_StrList(wordlist_file,wordlist) != format_ok)
152  {
153  cerr << "build_ngram: Could not read wordlist from file "
154  << wordlist_file << endl;
155  exit(1);
156  }
157 
158 
159  if (al.present("-p"))
160  {
161 
162  if(input_format != "ngram_per_line")
163  {
164  cerr << "Can't have differering predictor/predictee lists unless data is in ngram_per_line format !" << endl;
165  exit(1);
166  }
167 
168  wordlist_file2 = al.val("-p");
169  if (load_StrList(wordlist_file2,wordlist2) != format_ok)
170  {
171  cerr << "build_ngram: Could not read predictee list from file "
172  << wordlist_file2 << endl;
173  exit(1);
174  }
175  }
176 
177  if (al.present("-trace"))
178  trace=true;
179 
180  if (al.present("-o"))
181  out_file = al.val("-o");
182  else
183  out_file = "-";
184 
185  if (al.present("-default_tags"))
186  {
187  prev_tag = SENTENCE_START_MARKER;
188  prev_prev_tag = SENTENCE_END_MARKER;
189  last_tag = SENTENCE_END_MARKER;
190 
191  wordlist.append(SENTENCE_START_MARKER);
192  wordlist.append(SENTENCE_END_MARKER);
193 
194  if (al.present("-p"))
195  {
196  wordlist2.append(SENTENCE_START_MARKER);
197  wordlist2.append(SENTENCE_END_MARKER);
198  }
199  }
200 
201  if (al.present("-prev_tag"))
202  {
203  if (al.present("-default_tags"))
204  cerr << "build_ngram: WARNING : -prev_tag overrides -default_tags"
205  << endl;
206  prev_tag = al.val("-prev_tag");
207  }
208 
209  if (al.present("-prev_prev_tag"))
210  {
211  if (al.present("-default_tags"))
212  cerr << "build_ngram: WARNING : -prev_prev_tag overrides -default_tags"
213  << endl;
214  prev_prev_tag = al.val("-prev_prev_tag");
215  }
216 
217  if (al.present("-last_tag"))
218  {
219  if (al.present("-default_tags"))
220  cerr << "build_ngram: WARNING : -last_tag overrides -default_tags"
221  << endl;
222  last_tag = al.val("-last_tag");
223  }
224 
225  if ( ( (prev_tag=="") || (prev_prev_tag=="") || (last_tag=="") )
226  && ( (prev_tag!="") || (prev_prev_tag!="") || (last_tag!="") ) )
227  {
228  cerr << "build_ngram: ERROR : if any tags are given, ALL must be given"
229  << endl;
230  exit(1);
231  }
232 
233  if (al.present("-order"))
234  order = al.ival("-order");
235  else
236  {
237  cerr << "build_ngram: WARNING : No order specified with -order : defaulting to bigram"
238  << endl;
239  order = 2;
240  }
241 
242  if (al.present("-otype"))
243  format = al.val("-otype");
244  else
245  format = "";
246 
247  if (al.present("-floor"))
248  floor = al.dval("-floor");
249  else
250  floor = 0.0;
251 
252  if (al.present("-backoff"))
253  if (!al.present("-smooth"))
254  {
255  cerr << "build_ngram: backoff requires smooth value" << endl;
256  exit(-1);
257  }
258  if (al.present("-freqsmooth"))
259  if (!al.present("-smooth"))
260  {
261  cerr << "build_ngram: frequency smooth requires smooth value"
262  << endl;
263  exit(-1);
264  }
265 
266  if (al.present("-dense"))
267  representation = EST_Ngrammar::dense;
268  else if (al.present("-sparse"))
269  {
270  cerr << "build_ngram: Sorry, sparse representation is not yet available " << endl;
271  exit(1);
272  representation = EST_Ngrammar::sparse;
273  }
274  else if (al.present("-backoff"))
275  representation = EST_Ngrammar::backoff;
276  else
277  cerr << "build_ngram: Defaulting to dense representation" << endl;
278 
279  if (al.present("-p"))
280  {
281  if (!ngrammar.init(order,representation,wordlist,wordlist2))
282  {
283  cerr << "build_ngram: Failed to initialise " << order << "-gram" << endl;
284  exit(1);
285  }
286  }
287  else
288  {
289  if (!ngrammar.init(order,representation,wordlist))
290  {
291  cerr << "build_ngram: Failed to initialise " << order << "-gram" << endl;
292  exit(1);
293  }
294  }
295 
296 
297  if ( al.present("-backoff") )
298  {
299  if (!ngrammar.build(files,prev_tag,prev_prev_tag,
300  last_tag,input_format,oov_mode,
301  al.ival("-backoff"),al.ival("-smooth")))
302  {
303  cerr << "build_ngram: Failed to build backoff " << order
304  << "-gram" << endl;
305  exit(1);
306  }
307  else if (trace)
308  cerr << "build_ngram: Built backoff " << order <<
309  "-gram" << endl;
310  }
311  else
312  {
313  if (!ngrammar.build(files,prev_tag,prev_prev_tag,
314  last_tag,input_format,oov_mode))
315  {
316  cerr << "build_ngram: Failed to build " << order << "-gram" << endl;
317  exit(1);
318  }
319  else
320  if(trace)
321  cerr << "build_ngram: Built " << order << "-gram" << endl;
322  }
323 
324 
325  // Posit processing functions
326  if (al.present("-freqsmooth"))
327  {
328  Ngram_freqsmooth(ngrammar,al.ival("-smooth"),al.ival("-freqsmooth"));
329  }
330  else if (al.present("-smooth") && !al.present("-backoff"))
331  {
332  int smoothcount = atoi(al.val("-smooth"));
333  if(!Good_Turing_smooth(ngrammar,smoothcount))
334  {
335  cerr << "build_ngram: Failed to smooth " << order << "-gram" << endl;
336  exit(1);
337  }
338  else
339  if(trace)
340  cerr << "build_ngram: Good Turing smoothed " << order << "-gram" << endl;
341 
342  }
343 
344  // save
345  if (al.present("-save_compressed"))
346  {
347  EST_String tmp_file = make_tmp_filename();
348  if (ngrammar.save(tmp_file,format,trace,floor) == write_ok)
349  {
350  EST_String prog_name;
351  EST_Pathname tmp(out_file);
352  if (tmp.extension() == GZIP_FILENAME_EXTENSION)
353  prog_name = "gzip --stdout";
354  else if (tmp.extension() == COMPRESS_FILENAME_EXTENSION)
355  prog_name = "compress -c";
356  else // default
357  {
358  prog_name = "gzip --stdout";
359  if(out_file != "-")
360  out_file = out_file + "." + GZIP_FILENAME_EXTENSION;
361  }
362 
363  if (trace)
364  cerr << "build_ngram: Compressing with '" << prog_name << "'" << endl;
365 
366  // now compress
367  if(compress_file(tmp_file,out_file,prog_name) != 0)
368  {
369  cerr << "build_ngram: Failed to compress to file "
370  << out_file << endl;
371  (void)delete_file(tmp_file);
372  exit(1);
373  }
374 
375  (void)delete_file(tmp_file);
376 
377  if(trace)
378  cerr << "build_ngram: Saved in compressed " << format
379  << " format to " << out_file << endl;
380  }
381  else
382  {
383  cerr << "build_ngram: Failed to write temporary file "
384  << tmp_file << endl;
385  exit(1);
386  }
387 
388 
389  }
390  else
391  {
392  if (ngrammar.save(out_file,format,trace,floor) == write_ok)
393  {
394  if(trace)
395  cerr << "build_ngram: Saved in " << format
396  << " format to " << out_file << endl;
397  }
398  else
399  {
400  cerr << "build_ngram: Failed to save " << format << " format data to "
401  << out_file << endl;
402  exit(1);
403  }
404  }
405 
406 
407  // everything went okay
408  return 0;
409 }
#define GZIP_FILENAME_EXTENSION
Definition: EST_Ngrammar.h:66
int main(int argc, char **argv)
#define SENTENCE_END_MARKER
Definition: EST_Ngrammar.h:60
int ival(const EST_String &rkey, int m=1) const
Definition: EST_Option.cc:82
double dval(const EST_String &rkey, int m=1) const
Definition: EST_Option.cc:115
EST_String make_tmp_filename()
Make a unique temporary filename.
Definition: util_io.cc:56
The file was written successfully.
#define COMPRESS_FILENAME_EXTENSION
Definition: EST_Ngrammar.h:67
bool Good_Turing_smooth(EST_Ngrammar &ngrammar, int maxcount, int mincount)
const V & val(const K &rkey, bool m=0) const
return value according to key (const)
Definition: EST_TKVL.cc:145
void append(const T &item)
add item onto end of list
Definition: EST_TList.h:196
int delete_file(const EST_String &filename)
OS independent way of removing a file.
Definition: EST_io_aux.h:81
int compress_file(const EST_String &filename, const EST_String &new_filename, const EST_String &prog_name)
compress file by calling program prog, writing result to new_filename
Definition: util_io.cc:229
#define format_ok
int present(const K &rkey) const
Returns true if key is present.
Definition: EST_TKVL.cc:222
#define OOV_MARKER
Definition: EST_Ngrammar.h:61
EST_String extension(void) const
bool build(const EST_StrList &filenames, const EST_String &prev=SENTENCE_START_MARKER, const EST_String &prev_prev=SENTENCE_END_MARKER, const EST_String &last=SENTENCE_END_MARKER, const EST_String &input_format="", const EST_String &oov_mode="", const int mincount=1, const int maxcount=10)
EST_String
#define SENTENCE_START_MARKER
Definition: EST_Ngrammar.h:59
EST_read_status load_StrList(EST_String filename, EST_StrList &l)
Load tokens from a file and return them in a EST_StrList.
bool init(int o, representation_t r, const EST_StrList &wordlist)
int parse_command_line(int argc, char *argv[], const EST_String &usage, EST_StrList &files, EST_Option &al, int make_stdio=1)
Definition: cmd_line.cc:101
EST_write_status save(const EST_String &filename, const EST_String type="cstr_ascii", const bool trace=false, double floor=0.0)
void Ngram_freqsmooth(EST_Ngrammar &ngram, int smooth_thresh1, int smooth_thresh2)
Definition: freqsmooth.cc:58