Edinburgh Speech Tools  2.1-release
EST_SCFG.h
Go to the documentation of this file.
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1997 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Alan W Black */
34 /* Date : October 1997 */
35 /*-----------------------------------------------------------------------*/
36 /* */
37 /* Stochastic context free grammars */
38 /* */
39 /*=======================================================================*/
40 #ifndef __EST_SCFG_H__
41 #define __EST_SCFG_H__
42 
43 #include "EST_simplestats.h"
44 #include "EST_rw_status.h"
45 #include "EST_TList.h"
46 #include "siod.h"
47 
48 /** \class EST_bracketed_string
49  \brief This class represents a bracketed string used in training of SCFGs.
50 
51  An object in this class builds an index of valid bracketing of
52  the string, thus offering both a tree like access and direct
53  access to the leafs of the tree. The definition of ``valid
54  bracketing'' is any substring \f$ W_{i,j} \f$ that doesn't cross any
55  brackets.
56 */
58  private:
59  int p_length;
60  LISP *symbols;
61  LISP bs;
62  int **valid_spans; // triangular matrix
63  int find_num_nodes(LISP string);
64  int set_leaf_indices(LISP string,int i,LISP *symbols);
65  int num_leafs(LISP l) const;
66  void find_valid(int i,LISP t) const;
67  void init();
68  public:
69  ///
71  ///
72  EST_bracketed_string(LISP string);
73  ///
75 
76  ///
77  void set_bracketed_string(LISP string);
78  ///
79  int length() const {return p_length;}
80  ///
81  LISP string() const { return bs; }
82  /// The nth symbol in the string.
83  const EST_String symbol_at(int i) const
84  { return EST_String(get_c_string(car(symbols[i]))); }
85  /// If a bracketing from i to k is valid in string
86  int valid(int i,int k) const { return valid_spans[i][k]; }
87 
88  ///
89  int operator !=(const EST_bracketed_string &a) const
90  { return (!(this == &a)); }
91  int operator ==(const EST_bracketed_string &a) const
92  { return ((this == &a)); }
93  ///
94  friend ostream& operator << (ostream &s, const EST_bracketed_string &a)
95  { (void)a; s << "[a bracketed string]" << std::endl; return s; }
96 
97 };
98 
100 
101 // Only support Chomsky Normal Form at present
104 
105 /** \class EST_SCFG_Rule
106  \brief A stochastic context free grammar rule.
107 
108  At present only two types of rule are supported:
109  `est\_scfg\_binary\_rule` and `est\_scfg\_unary\_rule`.
110  This is sufficient for the representation of grammars in
111  Chomsky Normal Form. Each rule also has a probability associated
112  with it. Terminals and nonterminals are represented as ints using
113  the \ref EST_Discrete s in \ref EST_SCFG to reference the actual
114  alphabets.
115 
116  Although this class includes a "probability" nothing in the rule
117  itself enforces it to be a true probability. It is the responsibility
118  of the classes that use this rule to enforce that condition if
119  desired.
120 
121  @author Alan W Black (awb@cstr.ed.ac.uk): October 1997
122 */
124  private:
125  int p_mother;
126  int p_daughter1;
127  int p_daughter2;
128  est_scfg_rtype p_type;
129  double p_prob;
130  public:
131  ///
132  EST_SCFG_Rule() {p_type=est_scfg_unset; p_prob=0; p_mother=0;
133  p_daughter1 = 0;p_daughter2 =0;}
134  ///
136  {p_mother = r.p_mother; p_daughter1 = r.p_daughter1;
137  p_daughter2 = r.p_daughter2; p_type=r.p_type; p_prob = r.p_prob;}
138  /// Create a unary rule.
139  EST_SCFG_Rule(double prob,int p,int m);
140  /// Create a binary rule.
141  EST_SCFG_Rule(double prob,int p, int q, int r);
142  /// The rule's probability
143  double prob() const {return p_prob;}
144  /// set the probability
145  void set_prob(double p) { p_prob=p;}
146  /// rule type
147  est_scfg_rtype type() const { return p_type; }
148  ///
149  int mother() const {return p_mother;}
150  /** In a unary rule this is a terminal, in a binary rule it
151  is a nonterminal
152  */
153  int daughter1() const {return p_daughter1;}
154  ///
155  int daughter2() const {return p_daughter2;}
156  ///
157  void set_rule(double prob,int p, int m);
158  ///
159  void set_rule(double prob,int p, int q, int r);
160 };
161 
163 
164 /** \class EST_SCFG
165  \brief A class representing a stochastic context free grammar (SCFG).
166 
167  This class includes the representation of the grammar itself and
168  methods for training and testing it against some corpus.
169 
170  At present only grammars in Chomsky Normal Form are supported. That
171  is, rules may be binary or unary. If binary, the mother and two
172  daughters are nonterminals, if unary the mother must be nonterminal
173  and daughter a terminal symbol.
174 
175  The terminals and nonterminals symbol sets are derived automatically
176  from the LISP representation of the rules at initialization time
177  and are represented as \ref EST_Discrete. The distinguished
178  symbol is assumed to be the first mother of the first rule in
179  the given grammar.
180 
181 */
182 class EST_SCFG {
183  private:
184  EST_Discrete nonterminals;
185  EST_Discrete terminals;
186  int p_distinguished_symbol;
187  // Index of binary rule probabilities; prob_B() reads it as p_prob_B[p][q][r]
188  double ***p_prob_B;
189  // Index of unary rule probabilities; prob_U() reads it as p_prob_U[p][m]
190  double **p_prob_U;
191  // Build rule probability caches
192  void rule_prob_cache();
193  // Delete rule probability caches
194  void delete_rule_prob_cache();
195  public:
196  /**@name Constructor and initialisation functions */
197  ///@{
198  EST_SCFG();
199  /// Initialize from a set of rules given as a LISP value
200  EST_SCFG(LISP rules);
201  ~EST_SCFG();
202  ///@}
203 
204  /**@name utility functions */
205  ///@{
206  /// Set (or reset) rules from external source after construction
207  void set_rules(LISP rules);
208  /// Return rules as LISP list.
209  LISP get_rules();
210  /// The rules themselves
212  int distinguished_symbol() const { return p_distinguished_symbol; }
213  /** Find the terminals and nonterminals in the given grammar, adding
214  them to the appropriate given string lists.
215  */
216  void find_terms_nonterms(EST_StrList &nt, EST_StrList &t,LISP rules);
217  /// Convert nonterminal index to string form
218  EST_String nonterminal(int p) const { return nonterminals.name(p); }
219  /// Convert terminal index to string form
220  EST_String terminal(int m) const { return terminals.name(m); }
221  /// Convert nonterminal string to index
222  int nonterminal(const EST_String &p) const { return nonterminals.name(p); }
223  /// Convert terminal string to index
224  int terminal(const EST_String &m) const { return terminals.name(m); }
225  /// Number of nonterminals
226  int num_nonterminals() const { return nonterminals.length(); }
227  /// Number of terminals
228  int num_terminals() const { return terminals.length(); }
229  /// The rule probability of the given binary rule (direct lookup in p_prob_B; indices are not range-checked here)
230  double prob_B(int p, int q, int r) const { return p_prob_B[p][q][r]; }
231  /// The rule probability of the given unary rule (direct lookup in p_prob_U; indices are not range-checked here)
232  double prob_U(int p, int m) const { return p_prob_U[p][m]; }
233  /// (re-)set rule probability caches
234  void set_rule_prob_cache();
235  ///@}
236 
237  /**@name file i/o functions */
238  ///@{
239  /// Load grammar from named file
240  EST_read_status load(const EST_String &filename);
241  /// Save current grammar to named file
242  EST_write_status save(const EST_String &filename);
243  ///@}
244 };
245 
246 /** \class EST_SCFG_traintest
247  \brief A class used to train (and test) SCFGs; it is an extension of
248  \ref EST_SCFG.
249 
250  This offers an implementation of Pereira and Schabes ``Inside-Outside
251  reestimation from partially bracketed corpora.'' ACL 1992.
252 
253  An SCFG may be trained from a corpus (optionally) containing brackets
254  over a series of passes reestimating the grammar probabilities
255  after each pass. This basically extends the \ref EST_SCFG class
256  adding support for a bracket corpus and various indexes for efficient
257  use of the grammar.
258 */
259 class EST_SCFG_traintest : public EST_SCFG {
260  private:
261  /// Index for inside probabilities, read as inside[p][i][k]; f_I() treats an entry of -1 as missing
262  double ***inside;
263  /// Index for outside probabilities, read as outside[p][i][k]; f_O() treats an entry of -1 as missing
264  double ***outside;
265  EST_Bcorpus corpus;
266  /// Partial (numerator) for reestimation
267  EST_DVector n;
268  /// Partial (denominator) for reestimation
269  EST_DVector d;
270 
271  /// Calculate inside probability.
272  double f_I_cal(int c, int p, int i, int k);
273  /// Return the stored inside probability, or call f_I_cal() when the stored entry is -1.
274  double f_I(int c, int p, int i, int k)
275  { double r;
276  if ((r=inside[p][i][k]) != -1) return r;
277  else return f_I_cal(c,p,i,k); }
278  /// Calculate outside probability.
279  double f_O_cal(int c, int p, int i, int k);
280  /// Return the stored outside probability, or call f_O_cal() when the stored entry is -1.
281  double f_O(int c, int p, int i, int k)
282  { double r;
283  if ((r=outside[p][i][k]) != -1) return r;
284  else return f_O_cal(c,p,i,k); }
285  /// Find probability of parse of corpus sentence `c`
286  double f_P(int c);
287  /** Find probability of parse of corpus sentence `c` for
288  nonterminal `p`
289  */
290  double f_P(int c,int p);
291  /// Re-estimate probability of binary rule using inside-outside algorithm
292  void reestimate_rule_prob_B(int c, int ri, int p, int q, int r);
293  /// Re-estimate probability of unary rule using inside-outside algorithm
294  void reestimate_rule_prob_U(int c, int ri, int p, int m);
295  /// Do grammar re-estimation
296  void reestimate_grammar_probs(int passes,
297  int startpass,
298  int checkpoint,
299  int spread,
300  const EST_String &outfile);
301  /// Cross entropy measure. NOTE(review): presumably of the grammar over the loaded corpus; confirm in the implementation.
302  double cross_entropy();
303  /// Initialize the cache for inside/outside values for sentence `c`
304  void init_io_cache(int c,int nt);
305  /// Clear the cache for inside/outside values for sentence `c`
306  void clear_io_cache(int c);
307  public:
310 
311  /** Test the current grammar against the current corpus and print a summary.
312 
313  Only the cross entropy measure is given.
314  */
315  void test_corpus();
316  /** Test the current grammar against the current corpus.
317 
318  Summary includes percentage of cross bracketing accuracy
319  and percentage of fully correct parses.
320  */
321  void test_crossbrackets();
322 
323  /** Load a corpus from the given file.
324 
325  Each sentence in the corpus should be contained in parentheses.
326  Additional parentheses may be used to denote phrasing within
327  a sentence. The corpus is read using the LISP reader so LISP
328  conventions should apply; notably, single quotes should appear
329  within double quotes.
330  */
331  void load_corpus(const EST_String &filename);
332 
333  /** Train a grammar using the loaded corpus.
334 
335  @param passes the number of training passes desired.
336  @param startpass the pass from which to start
337  @param checkpoint save the grammar every n passes
338  @param spread Percentage of corpus to use on each pass, this cycles through the corpus on each pass.
339  @param outfile Output file name
340  */
341  void train_inout(int passes,
342  int startpass,
343  int checkpoint,
344  int spread,
345  const EST_String &outfile);
346 };
347 
348 /** From a full parse, extract the string with bracketing only.
349 */
350 LISP scfg_bracketing_only(LISP parse);
351 /** Accumulate cross bracketing information between ref and test.
352  */
354  const EST_bracketed_string &test,
355  EST_SuffStats &vs);
356 
357 #endif
int valid(int i, int k) const
If a bracketing from i to k is valid in string.
Definition: EST_SCFG.h:86
void set_prob(double p)
set the probability
Definition: EST_SCFG.h:145
void set_bracketed_string(LISP string)
int operator!=(const EST_bracketed_string &a) const
Definition: EST_SCFG.h:89
EST_write_status
EST_String nonterminal(int p) const
Convert nonterminal index to string form.
Definition: EST_SCFG.h:218
A class used to train (and test) SCFGs is an extension of EST_SCFG.
Definition: EST_SCFG.h:259
double prob_B(int p, int q, int r) const
The rule probability of given binary rule.
Definition: EST_SCFG.h:230
int terminal(const EST_String &m) const
Convert terminal string to index.
Definition: EST_SCFG.h:224
bool save(Lattice &lattice, EST_String filename)
const EST_String & name(const int n) const
The name given the index.
int nonterminal(const EST_String &p) const
Convert nonterminal string to index.
Definition: EST_SCFG.h:222
int length(void) const
The number of members in the discrete.
A class representing a stochastic context free grammar (SCFG).
Definition: EST_SCFG.h:182
bool load(Lattice &lattice, EST_String filename)
This class represents a bracketed string used in training of SCFGs.
Definition: EST_SCFG.h:57
A stochastic context free grammar rule.
Definition: EST_SCFG.h:123
int distinguished_symbol() const
Definition: EST_SCFG.h:212
int num_nonterminals() const
Number of nonterminals.
Definition: EST_SCFG.h:226
EST_SCFG_Rule(const EST_SCFG_Rule &r)
Definition: EST_SCFG.h:135
int daughter2() const
Definition: EST_SCFG.h:155
int length() const
Definition: EST_SCFG.h:79
est_scfg_rtype type() const
rule type
Definition: EST_SCFG.h:147
const char * get_c_string(LISP x)
Definition: slib.cc:638
EST_String terminal(int m) const
Convert terminal index to string form.
Definition: EST_SCFG.h:220
est_scfg_rtype
Definition: EST_SCFG.h:102
double prob() const
The rule's probability.
Definition: EST_SCFG.h:143
int num_terminals() const
Number of terminals.
Definition: EST_SCFG.h:228
int daughter1() const
Definition: EST_SCFG.h:153
SCFGRuleList rules
The rules themselves.
Definition: EST_SCFG.h:211
EST_String outfile
A vector class for double precision floating point numbers. EST_DVector x should be used instead of f...
Definition: EST_DMatrix.h:122
void count_bracket_crossing(const EST_bracketed_string &ref, const EST_bracketed_string &test, EST_SuffStats &vs)
int mother() const
Definition: EST_SCFG.h:149
const EST_String symbol_at(int i) const
The nth symbol in the string.
Definition: EST_SCFG.h:83
EST_read_status
LISP string() const
Definition: EST_SCFG.h:81
EST_TVector< EST_bracketed_string > EST_Bcorpus
Definition: EST_SCFG.h:99
LISP scfg_bracketing_only(LISP parse)
LISP car(LISP x)
Definition: slib_list.cc:115
double prob_U(int p, int m) const
The rule probability of given unary rule.
Definition: EST_SCFG.h:232
EST_TList< EST_SCFG_Rule > SCFGRuleList
Definition: EST_SCFG.h:162
EST_String
friend ostream & operator<<(ostream &s, const EST_bracketed_string &a)
Definition: EST_SCFG.h:94
int operator==(const EST_bracketed_string &a) const
Definition: EST_SCFG.h:91