Edinburgh Speech Tools  2.1-release
scfg_make_main.cc
Go to the documentation of this file.
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1996,1997 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Alan W Black */
34 /* Date : October 1997 */
35 /*-----------------------------------------------------------------------*/
36 /* Build a stochastic context free grammar with N non-terminals and */
37 /* M terminals specified as lists or numbers */
38 /* Probabilities are either even or random on rules and specified as */
39 /* probs or -log prob */
40 /* */
41 /*=======================================================================*/
42 #include <cstdlib>
43 #include <cstdio>
44 #include <iostream>
45 #include <fstream>
46 #include <cstring>
47 #include "EST.h"
48 #include "EST_SCFG.h"
49 #include "siod.h"
50 
51 using namespace std;
52 
54 EST_String domain = "nlogp";
55 EST_String values = "equal";
56 
57 static int scfg_make_main(int argc, char **argv);
58 
59 static void load_symbols(EST_StrList &syms,const EST_String &filename);
60 static void make_symbols(EST_StrList &syms,int n,const EST_String &prefix);
61 static LISP assign_probs(LISP rules, const EST_String &domain,
62  const EST_String &values);
63 static LISP make_all_rules(const EST_StrList &NonTerminals,
64  const EST_StrList &Terminals);
65 static void generate_probs(double *probs,int num);
66 
67 
68 
69 
70 int main(int argc, char **argv)
71 {
72 
73  scfg_make_main(argc,argv);
74 
75  exit(0);
76  return 0;
77 }
78 
79 static int scfg_make_main(int argc, char **argv)
80 {
81  // Top level function generates a probabilistic grammar
82  EST_Option al;
83  EST_StrList files;
84  EST_StrList NonTerminals, Terminals;
85  LISP rules,r;
86  FILE *fd;
87 
89  (argc, argv,
90  EST_String("[options]\n")+
91  "Summary: Build a stochastic context free grammar\n"+
92  "-nonterms <string> Number of nonterminals or file containing them\n"+
93  "-terms <string> Number of terminals or file containing them\n"+
94  "-domain <string> {nlogp}\n"+
95  " Values to be nlogp (negative log probabilities)\n"+
96  " or prob (probabilities)\n"+
97  "-values <string> {equal}\n"+
98  " General initial scores on rules as equal or\n"
99  " random\n"+
100  "-heap <int> {500000}\n"+
101  " Set size of Lisp heap, only needed for large grammars\n"+
102  "-o <ofile> File to save grammar (default stdout)\n",
103  files, al);
104 
105  if (al.present("-o"))
106  outfile = al.val("-o");
107  else
108  outfile = "-";
109 
110  if (al.present("-domain"))
111  {
112  if (al.val("-domain") == "nlogp")
113  domain = "nlogp";
114  else if (al.val("-domain") == "prob")
115  domain = "prob";
116  else
117  {
118  cerr << "scfg_make: domain must be nlogp or prob" << endl;
119  exit(1);
120  }
121  }
122 
123  if (al.present("-values"))
124  {
125  if (al.val("-values") == "equal")
126  values = "equal";
127  else if (al.val("-values") == "random")
128  values = "random";
129  else
130  {
131  cerr << "scfg_make: values must be equal or random" << endl;
132  exit(1);
133  }
134  }
135 
136  if (al.present("-nonterms"))
137  {
138  if (al.val("-nonterms").matches(RXint))
139  make_symbols(NonTerminals,al.ival("-nonterms"),"NT");
140  else
141  load_symbols(NonTerminals,al.val("-nonterms"));
142  }
143  else
144  {
145  cerr << "scfg_make: no nonterminals specified" << endl;
146  exit(1);
147  }
148 
149  if (al.present("-terms"))
150  {
151  if (al.val("-terms").matches(RXint))
152  make_symbols(Terminals,al.ival("-terms"),"T");
153  else
154  load_symbols(Terminals,al.val("-terms"));
155  }
156  else
157  {
158  cerr << "scfg_make: no terminals specified" << endl;
159  exit(1);
160  }
161 
162  siod_init(al.ival("-heap"));
163 
164  rules = make_all_rules(NonTerminals,Terminals);
165  rules = assign_probs(rules,domain,values);
166 
167  if (outfile == "-")
168  fd = stdout;
169  else
170  {
171  if ((fd=fopen(outfile,"w")) == NULL)
172  {
173  cerr << "scfg_make: failed to open file \"" << outfile <<
174  "\" for writing" << endl;
175  exit(1);
176  }
177  }
178 
179  for (r=rules; r != NIL; r=cdr(r))
180  pprint_to_fd(fd,car(r));
181 
182  if (fd != stdout)
183  fclose(fd);
184 
185 
186  return 0;
187 }
188 
189 static LISP make_all_rules(const EST_StrList &NonTerminals,
190  const EST_StrList &Terminals)
191 {
192  // Build all possibly rules (CNF)
193  // NT -> NT NT and NT -> T
194  EST_Litem *p,*q,*r;
195  LISP rules = NIL;
196 
197  for (p=NonTerminals.head(); p != 0; p=p->next())
198  {
199  int num_rules_nt = (NonTerminals.length()*NonTerminals.length())+
200  Terminals.length();
201  double *probs = new double[num_rules_nt];
202  generate_probs(probs,num_rules_nt);
203  int i=0;
204  for (q=NonTerminals.head(); q != 0; q=q->next())
205  for (r=NonTerminals.head(); r != 0; r=r->next(),i++)
206  rules = cons(cons(flocons(probs[i]),
207  cons(rintern(NonTerminals(p)),
208  cons(rintern(NonTerminals(q)),
209  cons(rintern(NonTerminals(r)),NIL)))),
210  rules);
211  for (q=Terminals.head(); q != 0; q=q->next(),i++)
212  rules = cons(cons(flocons(probs[i]),
213  cons(rintern(NonTerminals(p)),
214  cons(rintern(Terminals(q)),NIL))),
215  rules);
216  delete [] probs;
217  }
218 
219  return reverse(rules);
220 }
221 
222 static void generate_probs(double *probs,int num)
223 {
224  // Generate probabilities
225  int i;
226 
227  if (values == "equal")
228  {
229  double defp = 1.0/(float)num;
230  for (i=0; i < num; i++)
231  probs[i] = defp;
232  }
233  else if (values == "random")
234  {
235  // This isn't random but is somewhat arbitrary
236  double sum = 0;
237  for (i=0; i < num; i++)
238  {
239  probs[i] = (double)abs(rand())/(double)0x7fff;
240  sum += probs[i];
241  }
242  for (i=0; i < num; i++)
243  {
244  probs[i] /= sum;
245  }
246  }
247  else
248  {
249  cerr << "scfg_make: unknown value for probability distribution"
250  << endl;
251  exit(1);
252  }
253 }
254 
255 static LISP assign_probs(LISP rules, const EST_String &domain,
256  const EST_String &values)
257 {
258  // Modify probs (don't know how to do random probs yet)
259  LISP r;
260  (void)values;
261 
262  if (domain == "nlogp")
263  for (r=rules; r != NIL; r = cdr(r))
264  {
265  if (get_c_float(car(car(r))) == 0)
266  CAR(car(r)) = flocons(40);
267  else
268  CAR(car(r)) = flocons(-log(get_c_float(car(car(r)))));
269  }
270 
271  return rules;
272 }
273 
274 static void make_symbols(EST_StrList &syms,int n,const EST_String &prefix)
275 {
276  // Generate n symbols with given prefix
277  int i;
278  int magnitude,t;
279 
280  for (magnitude=0,t=n; t > 0; t=t/10)
281  magnitude++;
282 
283  char *name = walloc(char,prefix.length()+magnitude+1);
284  char *skel = walloc(char,prefix.length()+5);
285  sprintf(skel,"%s%%%02dd",(const char *)prefix,magnitude);
286 
287  for (i=0; i < n; i++)
288  {
289  sprintf(name,skel,i);
290  syms.append(name);
291  }
292 
293  wfree(name);
294  wfree(skel);
295 
296 }
297 
298 
299 static void load_symbols(EST_StrList &syms,const EST_String &filename)
300 {
301  // Load symbol list for file
302 
303  if (load_StrList(filename,syms) != format_ok)
304  exit(-1);
305 
306 }
int main(int argc, char **argv)
#define walloc(TYPE, SIZE)
Definition: EST_walloc.h:52
float get_c_float(LISP x)
Definition: slib.cc:1858
int ival(const EST_String &rkey, int m=1) const
Definition: EST_Option.cc:82
#define NIL
Definition: siod_defs.h:92
EST_String values
EST_UItem * next()
Definition: EST_UList.h:55
EST_Regex RXint("-?[0-9]+")
Integer.
LISP cons(LISP x, LISP y)
Definition: slib_list.cc:97
EST_String domain
EST_String outfile
NULL
Definition: EST_WFST.cc:55
int matches(const char *e, ssize_t pos=0) const
Exactly match this string?
Definition: EST_String.cc:651
const V & val(const K &rkey, bool m=0) const
return value according to key (const)
Definition: EST_TKVL.cc:145
void append(const T &item)
add item onto end of list
Definition: EST_TList.h:196
void pprint_to_fd(FILE *fd, LISP exp)
Definition: slib_file.cc:74
#define CAR(x)
Definition: siod_defs.h:76
int length() const
Definition: EST_UList.cc:57
size_t length(void) const
Length of string ({not} length of underlying chunk)
Definition: EST_String.h:231
LISP rintern(const char *name)
Definition: slib.cc:734
#define format_ok
int present(const K &rkey) const
Returns true if key is present.
Definition: EST_TKVL.cc:222
LISP flocons(double x)
Definition: slib.cc:673
EST_UItem * head() const
Definition: EST_UList.h:97
int siod_init(int heap_size=DEFAULT_HEAP_SIZE)
Definition: siod.cc:58
LISP car(LISP x)
Definition: slib_list.cc:115
EST_String
void wfree(void *p)
Definition: walloc.c:131
float sum(const EST_FMatrix &a)
sum of elements
Definition: vec_mat_aux.cc:147
EST_read_status load_StrList(EST_String filename, EST_StrList &l)
Load tokens from a file and return them in a EST_StrList.
void reverse(EST_Wave &sig)
int parse_command_line(int argc, char *argv[], const EST_String &usage, EST_StrList &files, EST_Option &al, int make_stdio=1)
Definition: cmd_line.cc:101
LISP cdr(LISP x)
Definition: slib_list.cc:124