Edinburgh Speech Tools  2.1-release
EST_Token.h
Go to the documentation of this file.
1 /*************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1996 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /*************************************************************************/
33 /* Author : Alan W Black */
34 /* Date : April 1996 */
35 /*-----------------------------------------------------------------------*/
36 /* Token/Tokenizer class */
37 /* */
38 /*=======================================================================*/
39 
40 #ifndef __EST_TOKEN_H__
41 #define __EST_TOKEN_H__
42 
43 #include <cstdio>
44 #include <istream>
45 #include "EST_String.h"
46 #include "EST_common.h"
47 #include "EST_File.h"
48 
49 // I can never really remember this so we'll define it here
50 /// The default whitespace characters
52 ///
54 ///
56 ///
58 
59 /** \class EST_Token
60  * @ingroup stringclasses
61  This class is similar to \ref EST_String but also maintains
62  the original punctuation and whitespace found around the
63  token.
64 
65  \ref EST_Token 's primary use is with \ref EST_TokenStream class
66  which allows easy tokenizing of ascii files.
67 
68  A token consists of four parts, any of which may be empty: the
69  name (the actual token), preceding whitespace, preceding
70  punctuation, and succeeding punctuation.
71 
72  @author Alan W Black (awb@cstr.ed.ac.uk): April 1996
73 */
74 class EST_Token {
75  private:
76  EST_String space;
77  EST_String prepunc;
78  EST_String pname;
79  EST_String punc;
80  int linenum;
81  int linepos;
82  EST_FilePos p_filepos;
83  int p_quoted;
84 
85  public:
86  ///
87  EST_Token() {init();}
88  ///
89  EST_Token(const EST_String p) {init(); pname = p; }
90  ///
91  void init() {p_quoted=linenum=linepos=p_filepos=0;}
92 
93  /**@name Basic access to fields */
94  ///@{
95  /// set token from a string
96  void set_token(const EST_String &p) { pname = p; }
97  ///
98  void set_token(const char *p) { pname = p; }
99  /// set whitespace of token.
100  void set_whitespace(const EST_String &p) { space = p; }
101  ///
102  void set_whitespace(const char *p) { space = p; }
103  /// set (post) punctuation of token.
104  void set_punctuation(const EST_String &p) { punc = p; }
105  ///
106  void set_punctuation(const char *p) { punc = p; }
107  /// set prepunction
108  void set_prepunctuation(const EST_String &p) { prepunc = p; }
109  ///
110  void set_prepunctuation(const char *p) { prepunc = p; }
111  ///
112  const EST_String &whitespace() { return space; }
113  ///
114  const EST_String &punctuation() { return punc; }
115  ///
116  const EST_String &prepunctuation() { return prepunc; }
117 
118  /**@name Access token as a string */
119  ///@{
120  const EST_String &string() const { return String(); }
121  /// Access token as a string
122  const EST_String &S() const { return S(); }
123  /// Access token as a string
124  const EST_String &String() const { return pname; }
125  /// For automatic coercion to \ref EST_String
126  operator EST_String() const { return String(); }
127  ///@}
128 
129  /**@name Access token as a int */
130  ///@{
131  int Int(bool &valid) const { return String().Int(valid); }
132  int Int() const { return String().Int(); }
133  int I(bool &valid) const { return Int(valid); }
134  int I() const { return Int(); }
135  operator int() const { return Int(); }
136  ///@}
137 
138  /**@name Access token as a long */
139  ///@{
140  long Long(bool &valid) const { return String().Long(valid); }
141  long Long() const { return String().Long(); }
142  long L(bool &valid) const { return Long(valid); }
143  long L() const { return Long(); }
144  operator long() const { return Long(); }
145  ///@}
146 
147  /**@name Access token as a float */
148  ///@{
149  float Float(bool &valid) const { return String().Float(valid); }
150  float Float() const { return String().Float(); }
151  float F(bool &valid) const { return Float(valid); }
152  float F() const { return Float(); }
153  operator float() const { return Float(); }
154  ///@}
155 
156  /**@name Access token as a double */
157  ///@{
158  double Double(bool &valid) const { return String().Double(valid); }
159  double Double() const { return String().Double(); }
160  double D(bool &valid) const { return Double(valid); }
161  double D() const { return Double(); }
162  operator double() const { return Double(); }
163  ///@}
164 
165  ///@}
166 
167  /**@name Quotation related methods */
168  ///@{
169  /// Note that this token was quoted (or not)
170  void set_quoted(int q) { p_quoted = q; }
171  /// TRUE is token was quoted
172  int quoted() const { return p_quoted; }
173  ///@}
174  ///
175  void set_row(int r) { linenum = r; }
176  ///
177  void set_col(int c) { linepos = c; }
178  /// Set file position in original \ref EST_TokenStream
179  void set_filepos(EST_FilePos c) { p_filepos = c; }
180  /// Return lower case version of token name
181  EST_String lstring() { return downcase(pname); }
182  /// Return upper case version of token name
183  EST_String ustring() { return upcase(pname); }
184  /// Line number in original \ref EST_TokenStream.
185  int row(void) const { return linenum; }
186  /// Line position in original \ref EST_TokenStream.
187  int col(void) const { return linepos; }
188  /// file position in original \ref EST_TokenStream.
189  EST_FilePos filepos(void) const { return p_filepos; }
190 
191  /// A string describing current position, suitable for error messages
192  const EST_String pos_description() const;
193 
194  ///
195  friend std::ostream& operator << (std::ostream& s, const EST_Token &p);
196 
197  ///
198  EST_Token & operator = (const EST_Token &a);
199  ///
200  EST_Token & operator = (const EST_String &a);
201  ///
202  int operator == (const EST_String &a) { return (pname == a); }
203  ///
204  int operator != (const EST_String &a) { return (pname != a); }
205  ///
206  int operator == (const char *a) { return (strcmp(pname,a)==0); }
207  ///
208  int operator != (const char *a) { return (strcmp(pname,a)!=0); }
209 };
210 
212 
213 /** \class EST_TokenStream
214  A class that allows the reading of \ref EST_Token from a file
215  stream, pipe or string. It automatically tokenizes a file based on
216  user definable whitespace and punctuation.
217 
218  The definitions of whitespace and punctuation are user definable.
219  Also support for single character symbols is included. Single
220  character symbols *always* are treated as individual tokens
221  irrespective of their white space context. Also a quote
222  mode can be used to read quoted tokens.
223 
224  The setting of whitespace, pre and post punctuation, single character
225  symbols and quote mode must be done (immediately) after opening
226  the stream.
227 
228  There is no unget but peek provides look ahead of one token.
229 
230  Note there is an interesting issue about what to do about
231  the last whitespace in the file. Should it be ignored or should
232  it be attached to a token with a name string of length zero?
233  In unquoted mode the eof() will return TRUE if the next token name
234  is empty (the mythical last token). In quoted mode the last token
235  must be returned, so eof will not be raised.
236 
237  @author Alan W Black (awb@cstr.ed.ac.uk): April 1996
238 */
240  private:
242  EST_String WhiteSpaceChars;
243  EST_String SingleCharSymbols;
244  EST_String PunctuationSymbols;
245  EST_String PrePunctuationSymbols;
246  EST_String Origin;
247  FILE *fp;
248  std::istream *is;
249  int fd;
250  char *buffer;
251  int buffer_length;
252  int pos;
253  int linepos;
254  EST_FilePos p_filepos;
255  int getch(void);
256  EST_TokenStream &getch(char &C);
257  int peeked_charp;
258  int peeked_char; // ungot character
259  int peekch(void);
260  int peeked_tokp;
261  int eof_flag;
262  int quotes;
263  char quote;
264  char escape;
265  EST_Token current_tok;
266  void default_values(void);
267  /* local buffers to save reallocating */
268  int tok_wspacelen;
269  char *tok_wspace;
270  int tok_stufflen;
271  char *tok_stuff;
272  int tok_prepuncslen;
273  char *tok_prepuncs;
274  int close_at_end;
275 
276  /* character class map */
277  char p_table[256];
278  bool p_table_wrong;
279 
280  /** This function is deliberately private so that you'll get a compilation
281  error if you assign a token stream or pass it as a (non-reference)
282  argument. The problem with copying is that you need to copy the
283  file descriptors too (which can't be done for pipes). You probably
284  don't really want a copy anyway and meant to pass it as a reference.
285  If you really need this (some sort of clever look ahead) I am not
286  sure what the consequences really are (or how portable they are).
287  Pass the \ref EST_TokenStream by reference instead.
288  */
290 
291  void build_table();
292 
293  inline int getch_internal();
294  inline int peekch_internal();
295  inline int getpeeked_internal();
296  public:
297  ///
298  EST_TokenStream();
299  /// will close file if appropriate for type
300  ~EST_TokenStream();
301  ///@{
302  /// open a \ref EST_TokenStream for a file.
303  int open(const EST_String &filename);
304  /// open a \ref EST_TokenStream for an already opened file
305  int open(FILE *ofp, int close_when_finished);
306  /// open a \ref EST_TokenStream for an already open istream
307  int open(std::istream &newis);
308  /// open a \ref EST_TokenStream for string rather than a file
309  int open_string(const EST_String &newbuffer);
310  /// Close stream.
311  void close(void);
312  ///@}
313  /**@name stream access functions */
314  ///@{
315  /// get next token in stream
316  EST_TokenStream &get(EST_Token &t);
317  /// get next token in stream
318  EST_Token &get();
319  /**@name get the next token which must be the argument. */
320  ///@{
321  EST_Token &must_get(EST_String expected, bool *ok);
322  EST_Token &must_get(EST_String expected, bool &ok)
323  { return must_get(expected, &ok); }
325  { return must_get(expected, (bool *)NULL); }
326  ///@}
327  /// get up to `s` in stream as a single token.
328  EST_Token get_upto(const EST_String &s);
329  /// get up to the end of the line as a single token.
330  EST_Token get_upto_eoln(void);
331  /// peek at next token
333  { if (!peeked_tokp) get();
334  peeked_tokp = TRUE; return current_tok; }
335  /// Reading binary data, (don't use peek() immediately beforehand)
336  int fread(void *buff,int size,int nitems) EST_WARN_UNUSED_RESULT;
337  ///@}
338  /**@name stream initialization functions */
339  ///@{
340  /// set which characters are to be treated as whitespace
342  { WhiteSpaceChars = ws; p_table_wrong=1;}
343  /// set which characters are to be treated as single character symbols
345  { SingleCharSymbols = sc; p_table_wrong=1;}
346  /// set which characters are to be treated as (post) punctuation
348  { PunctuationSymbols = ps; p_table_wrong=1;}
349  /// set which characters are to be treated as pre-punctuation
351  { PrePunctuationSymbols = ps; p_table_wrong=1;}
352  /// set characters to be used as quotes and escape, and set quote mode
353  void set_quotes(char q, char e) { quotes = TRUE; quote = q; escape = e; p_table_wrong=1;}
354  /// query quote mode
355  int quoted_mode(void) { return quotes; }
356  ///@}
357  /**@name miscellaneous */
358  ///@{
359  /// returns line number of \ref EST_TokenStream
360  int linenum(void) const {return linepos;}
361  /// end of file
362  int eof()
363  { return (eof_flag || ((!quotes) && (peek() == ""))); }
364  /// end of line
365  int eoln();
366  /// current file position in \ref EST_TokenStream
367  EST_FilePos filepos(void) const { return (type == tst_string) ? pos : p_filepos; }
368  /// tell, synonym for filepos
369  EST_FilePos tell(void) const { return filepos(); }
370  /// seek, reposition file pointer
371  int seek(int position);
372  int seek_end();
373  /// Reset to start of file/string
374  int restart(void);
375  /// A string describing current position, suitable for error messages
376  const EST_String pos_description();
377  /// The originating filename (if there is one)
378  const EST_String filename() const { return Origin; }
379  /// For the people who *need* the actual description (if possible)
380  FILE *filedescriptor() { return (type == tst_file) ? fp : 0; }
381  ///
382  EST_TokenStream & operator >>(EST_Token &p);
383  ///
384  EST_TokenStream & operator >>(EST_String &p);
385  ///
386  friend std::ostream& operator <<(std::ostream& s, EST_TokenStream &p);
387  ///@}
388 };
389 
390 /** Quote a string with given quotes and escape character
391 */
393  const EST_String &quote = "\"",
394  const EST_String &escape = "\\",
395  int force=0);
396 
397 #endif // __EST_TOKEN_H__
void set_WhiteSpaceChars(const EST_String &ws)
set which characters are to be treated as whitespace
Definition: EST_Token.h:341
int row(void) const
Line number in original EST_TokenStream.
Definition: EST_Token.h:185
const EST_String EST_Token_Default_SingleCharSymbols
Definition: EST_Token.cc:56
EST_Token(const EST_String p)
Definition: EST_Token.h:89
float F(bool &valid) const
Definition: EST_Token.h:151
EST_FilePos filepos(void) const
file position in original EST_TokenStream.
Definition: EST_Token.h:189
const EST_String EST_Token_Default_PrePunctuationSymbols
Definition: EST_Token.cc:57
FILE * filedescriptor()
For the people who need the actual description (if possible)
Definition: EST_Token.h:380
EST_String quote_string(const EST_String &s, const EST_String &quote="\"", const EST_String &escape="\\", int force=0)
Definition: EST_Token.cc:844
const EST_String pos_description() const
A string describing current position, suitable for error messages.
Definition: EST_Token.cc:109
void set_token(const char *p)
Definition: EST_Token.h:98
const EST_String & punctuation()
Definition: EST_Token.h:114
void set_filepos(EST_FilePos c)
Set file position in original EST_TokenStream.
Definition: EST_Token.h:179
const EST_String EST_Token_Default_PunctuationSymbols
Definition: EST_Token.cc:58
void set_prepunctuation(const EST_String &p)
set prepunction
Definition: EST_Token.h:108
long Long() const
Definition: EST_Token.h:141
double Double() const
Definition: EST_Token.h:159
EST_String lstring()
Return lower case version of token name.
Definition: EST_Token.h:181
void set_SingleCharSymbols(const EST_String &sc)
set which characters are to be treated as single character symbols
Definition: EST_Token.h:344
long L(bool &valid) const
Definition: EST_Token.h:142
void set_punctuation(const char *p)
Definition: EST_Token.h:106
const EST_String filename() const
The originating filename (if there is one)
Definition: EST_Token.h:378
EST_FilePos tell(void) const
tell, synonym for filepos
Definition: EST_Token.h:369
const EST_String & whitespace()
Definition: EST_Token.h:112
double D() const
Definition: EST_Token.h:161
int quoted() const
TRUE is token was quoted.
Definition: EST_Token.h:172
int quoted_mode(void)
query quote mode
Definition: EST_Token.h:355
void set_punctuation(const EST_String &p)
set (post) punctuation of token.
Definition: EST_Token.h:104
EST_String upcase(const EST_String &s)
Definition: EST_String.cc:955
void set_PrePunctuationSymbols(const EST_String &ps)
set which characters are to be treated as (post) punctuation
Definition: EST_Token.h:350
EST_Token & must_get(EST_String expected, bool &ok)
Definition: EST_Token.h:322
void set_PunctuationSymbols(const EST_String &ps)
set which characters are to be treated as (post) punctuation
Definition: EST_Token.h:347
int eof()
end of file
Definition: EST_Token.h:362
void set_token(const EST_String &p)
set token from a string
Definition: EST_Token.h:96
EST_String downcase(const EST_String &s)
Definition: EST_String.cc:942
const EST_String & prepunctuation()
Definition: EST_Token.h:116
void set_prepunctuation(const char *p)
Definition: EST_Token.h:110
int linenum(void) const
returns line number of EST_TokenStream
Definition: EST_Token.h:360
const EST_String & S() const
Access token as a string.
Definition: EST_Token.h:122
int operator==(const EST_String &a)
Definition: EST_Token.h:202
EST_Token & operator=(const EST_Token &a)
Definition: EST_Token.cc:96
float Float() const
Definition: EST_Token.h:150
void set_row(int r)
Definition: EST_Token.h:175
#define EST_WARN_UNUSED_RESULT
Definition: EST_common.h:51
NULL
Definition: EST_WFST.cc:55
EST_Token()
Definition: EST_Token.h:87
long Long(bool &valid) const
Definition: EST_Token.h:140
int col(void) const
Line position in original EST_TokenStream.
Definition: EST_Token.h:187
EST_Token & peek(void)
peek at next token
Definition: EST_Token.h:332
EST_String ustring()
Return upper case version of token name.
Definition: EST_Token.h:183
void set_quotes(char q, char e)
set characters to be used as quotes and escape, and set quote mode
Definition: EST_Token.h:353
getString int
Definition: EST_item_aux.cc:50
int I(bool &valid) const
Definition: EST_Token.h:133
int operator!=(const EST_String &a)
Definition: EST_Token.h:204
long L() const
Definition: EST_Token.h:143
int Int() const
Definition: EST_Token.h:132
EST_FilePos filepos(void) const
current file position in EST_TokenStream
Definition: EST_Token.h:367
double D(bool &valid) const
Definition: EST_Token.h:160
const EST_String & string() const
Definition: EST_Token.h:120
LISP quote(LISP item)
Definition: siod.cc:252
FILE16 *(* open)(const char *, const char *, int, const char *, const char *)
Definition: url.c:107
float F() const
Definition: EST_Token.h:152
void init()
Definition: EST_Token.h:91
const EST_String & String() const
Access token as a string.
Definition: EST_Token.h:124
EST_tokenstream_type
Definition: EST_Token.h:211
int I() const
Definition: EST_Token.h:134
void set_whitespace(const char *p)
Definition: EST_Token.h:102
const EST_String EST_Token_Default_WhiteSpaceChars
The default whitespace characters.
Definition: EST_Token.cc:55
LISP fp
Definition: kkcompile.cc:63
EST_String
float Float(bool &valid) const
Definition: EST_Token.h:149
int Int(bool &valid) const
Definition: EST_Token.h:131
#define TRUE
Definition: EST_bool.h:118
void set_whitespace(const EST_String &p)
set whitespace of token.
Definition: EST_Token.h:100
off_t EST_FilePos
Definition: EST_File.h:69
EST_Token & must_get(EST_String expected)
Definition: EST_Token.h:324
friend std::ostream & operator<<(std::ostream &s, const EST_Token &p)
void set_col(int c)
Definition: EST_Token.h:177
void set_quoted(int q)
Note that this token was quoted (or not)
Definition: EST_Token.h:170
double Double(bool &valid) const
Definition: EST_Token.h:158