47 int main(
int argc,
char **argv)
52 EST_String wordlist_file,wordlist_file2, out_file, format;
53 EST_String prev_tag(
""), prev_prev_tag(
""), last_tag(
"");
54 EST_String input_format(
""), oov_mode(
""), oov_marker(
"");
65 EST_String(
"[input file0] [input file1] ... -o [output file]\n")+
66 "-w <ifile> filename containing word list (required)\n"+
67 "-p <ifile> filename containing predictee word list\n"+
68 " (default is to use wordlist given by -w)\n"+
69 "-order <int> order, 1=unigram, 2=bigram etc. (default 2)\n"+
70 "-smooth <int> Good-Turing smooth the grammar up to the\n"+
72 "-o <ofile> Output file for constructed ngram\n"+
74 "-input_format <string>\n"+
75 " format of input data (default sentence_per_line)\n"+
76 " may be sentence_per_file, ngram_per_line.\n"+
77 "-otype <string> format of output file, one of cstr_ascii\n"+
78 " cstr_bin or htk_ascii\n"+
79 "-sparse build ngram in sparse representation\n"+
80 "-dense build ngram in dense representation (default)\n"+
82 " build backoff ngram (requires -smooth)\n"+
84 " frequency floor value used with some ngrams\n"+
85 "-freqsmooth <int>\n"+
86 " build frequency backed off smoothed ngram, this\n"+
87 " requires -smooth option\n"+
88 "-trace give verbose outout about build process\n"+
89 "-save_compressed save ngram in gzipped format\n"+
90 "-oov_mode <string>\n"+
91 " what to do about out-of-vocabulary words,\n"+
92 " one of skip_ngram, skip_sentence (default),\n"+
93 " skip_file, or use_oov_marker\n"+
94 "-oov_marker <string>\n"+
95 " special word for oov words (default "+
OOV_MARKER+
")\n"+
96 " (use in conjunction with '-oov_mode use_oov_marker'\n"+
99 "-prev_tag <string>\n"+
100 " tag before sentence start\n"+
101 "-prev_prev_tag <string>\n"+
102 " all words before 'prev_tag'\n"+
103 "-last_tag <string>\n"+
104 " after sentence end\n"+
110 if (al.
present(
"-input_format"))
111 input_format = al.
val(
"-input_format");
113 input_format =
"sentence_per_line";
116 oov_mode = al.
val(
"-oov_mode");
118 oov_mode =
"skip_sentence";
123 if(oov_mode !=
"use_oov_marker")
125 cerr <<
"Error : can only use -oov_marker with '-oov_mode use_oov_marker'" << endl;
129 oov_marker = al.
val(
"-oov_marker");
135 if( (oov_mode !=
"skip_ngram") &&
136 (oov_mode !=
"skip_sentence") &&
137 (oov_mode !=
"skip_file") &&
138 (oov_mode !=
"use_oov_marker") )
140 cerr << oov_mode <<
" is not a valid oov_mode !" << endl;
145 wordlist_file = al.
val(
"-w");
147 cerr <<
"build_ngram: Must specify a wordlist with -w" << endl;
153 cerr <<
"build_ngram: Could not read wordlist from file " 154 << wordlist_file << endl;
162 if(input_format !=
"ngram_per_line")
164 cerr <<
"Can't have differering predictor/predictee lists unless data is in ngram_per_line format !" << endl;
168 wordlist_file2 = al.
val(
"-p");
171 cerr <<
"build_ngram: Could not read predictee list from file " 172 << wordlist_file2 << endl;
181 out_file = al.
val(
"-o");
185 if (al.
present(
"-default_tags"))
203 if (al.
present(
"-default_tags"))
204 cerr <<
"build_ngram: WARNING : -prev_tag overrides -default_tags" 206 prev_tag = al.
val(
"-prev_tag");
209 if (al.
present(
"-prev_prev_tag"))
211 if (al.
present(
"-default_tags"))
212 cerr <<
"build_ngram: WARNING : -prev_prev_tag overrides -default_tags" 214 prev_prev_tag = al.
val(
"-prev_prev_tag");
219 if (al.
present(
"-default_tags"))
220 cerr <<
"build_ngram: WARNING : -last_tag overrides -default_tags" 222 last_tag = al.
val(
"-last_tag");
225 if ( ( (prev_tag==
"") || (prev_prev_tag==
"") || (last_tag==
"") )
226 && ( (prev_tag!=
"") || (prev_prev_tag!=
"") || (last_tag!=
"") ) )
228 cerr <<
"build_ngram: ERROR : if any tags are given, ALL must be given" 234 order = al.
ival(
"-order");
237 cerr <<
"build_ngram: WARNING : No order specified with -order : defaulting to bigram" 243 format = al.
val(
"-otype");
248 floor = al.
dval(
"-floor");
255 cerr <<
"build_ngram: backoff requires smooth value" << endl;
261 cerr <<
"build_ngram: frequency smooth requires smooth value" 268 else if (al.
present(
"-sparse"))
270 cerr <<
"build_ngram: Sorry, sparse representation is not yet available " << endl;
274 else if (al.
present(
"-backoff"))
277 cerr <<
"build_ngram: Defaulting to dense representation" << endl;
281 if (!ngrammar.
init(order,representation,wordlist,wordlist2))
283 cerr <<
"build_ngram: Failed to initialise " << order <<
"-gram" << endl;
289 if (!ngrammar.
init(order,representation,wordlist))
291 cerr <<
"build_ngram: Failed to initialise " << order <<
"-gram" << endl;
299 if (!ngrammar.
build(files,prev_tag,prev_prev_tag,
300 last_tag,input_format,oov_mode,
301 al.
ival(
"-backoff"),al.
ival(
"-smooth")))
303 cerr <<
"build_ngram: Failed to build backoff " << order
308 cerr <<
"build_ngram: Built backoff " << order <<
313 if (!ngrammar.
build(files,prev_tag,prev_prev_tag,
314 last_tag,input_format,oov_mode))
316 cerr <<
"build_ngram: Failed to build " << order <<
"-gram" << endl;
321 cerr <<
"build_ngram: Built " << order <<
"-gram" << endl;
332 int smoothcount = atoi(al.
val(
"-smooth"));
335 cerr <<
"build_ngram: Failed to smooth " << order <<
"-gram" << endl;
340 cerr <<
"build_ngram: Good Turing smoothed " << order <<
"-gram" << endl;
345 if (al.
present(
"-save_compressed"))
348 if (ngrammar.
save(tmp_file,format,trace,floor) ==
write_ok)
353 prog_name =
"gzip --stdout";
355 prog_name =
"compress -c";
358 prog_name =
"gzip --stdout";
364 cerr <<
"build_ngram: Compressing with '" << prog_name <<
"'" << endl;
369 cerr <<
"build_ngram: Failed to compress to file " 378 cerr <<
"build_ngram: Saved in compressed " << format
379 <<
" format to " << out_file << endl;
383 cerr <<
"build_ngram: Failed to write temporary file " 392 if (ngrammar.
save(out_file,format,trace,floor) ==
write_ok)
395 cerr <<
"build_ngram: Saved in " << format
396 <<
" format to " << out_file << endl;
400 cerr <<
"build_ngram: Failed to save " << format <<
" format data to " #define GZIP_FILENAME_EXTENSION
int main(int argc, char **argv)
#define SENTENCE_END_MARKER
int ival(const EST_String &rkey, int m=1) const
double dval(const EST_String &rkey, int m=1) const
EST_String make_tmp_filename()
Make a unique temporary filename.
The file was written successfully.
#define COMPRESS_FILENAME_EXTENSION
bool Good_Turing_smooth(EST_Ngrammar &ngrammar, int maxcount, int mincount)
const V & val(const K &rkey, bool m=0) const
return value according to key (const)
void append(const T &item)
add item onto end of list
int delete_file(const EST_String &filename)
OS-independent way of removing a file.
int compress_file(const EST_String &filename, const EST_String &new_filename, const EST_String &prog_name)
Compress file by calling the program prog_name, writing the result to new_filename.
int present(const K &rkey) const
Returns true if key is present.
EST_String extension(void) const
bool build(const EST_StrList &filenames, const EST_String &prev=SENTENCE_START_MARKER, const EST_String &prev_prev=SENTENCE_END_MARKER, const EST_String &last=SENTENCE_END_MARKER, const EST_String &input_format="", const EST_String &oov_mode="", const int mincount=1, const int maxcount=10)
#define SENTENCE_START_MARKER
EST_read_status load_StrList(EST_String filename, EST_StrList &l)
Load tokens from a file and return them in an EST_StrList.
bool init(int o, representation_t r, const EST_StrList &wordlist)
int parse_command_line(int argc, char *argv[], const EST_String &usage, EST_StrList &files, EST_Option &al, int make_stdio=1)
EST_write_status save(const EST_String &filename, const EST_String type="cstr_ascii", const bool trace=false, double floor=0.0)
void Ngram_freqsmooth(EST_Ngrammar &ngram, int smooth_thresh1, int smooth_thresh2)