Edinburgh Speech Tools  2.1-release
token_example.cc
Go to the documentation of this file.
1 /************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1996,1997 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /************************************************************************/
33 /* Author: Alan W Black */
34 /* Date: May 1997 */
35 /************************************************************************/
36 /* */
37 /* Example of reading a file using the tokenizer */
38 /* */
39 /************************************************************************/
40 
41 #include <cstdlib>
42 #include "EST_Token.h"
43 
44 using namespace std;
45 
// Two-level stringification.  The `#` operator does not macro-expand its
// operand, so stringizing DATAC directly would always produce the literal
// "DATAC".  Routing the argument through an extra macro lets the
// preprocessor replace DATAC by its value first, and only then quote it.
#define TOKEN_EXAMPLE_STRINGIZE_RAW(X) #X
#define TOKEN_EXAMPLE_STRINGIZE(X) TOKEN_EXAMPLE_STRINGIZE_RAW(X)

// DATA is a string literal holding the value of the DATAC macro; it is
// only defined when DATAC itself is defined.
#if defined(DATAC)
# define DATA TOKEN_EXAMPLE_STRINGIZE(DATAC)
#endif
50 
51 int main(int argc,char **argv)
52 {
53  // Simple program to read all the tokens in the named file
54  // a print a summary of them
55  EST_TokenStream ts;
56  int tokens, alices, quotes;
57  EST_Token t;
58  EST_String fname;
59 
60  if (argc > 2)
61  {
62  cerr << argv[0] << ": wrong number of arguments\n";
63  exit(-1);
64  }
65  else if (argc == 2)
66  fname = argv[1];
67  else
68  fname = DATA "/alice";
69 
70  if (ts.open(fname) == -1)
71  {
72  cerr << argv[0] << ": can't open input file \"" << argv[1] <<
73  "\"\n";
74  exit(-1);
75  }
76 
77  // Control of whitespace characters, single character symbols,
78  // pre and post punctuation may be set here.
79 
80  // The defaults are standard whitespace, and nothing for the rest
81  // (this is like awk's basic tokenizer). For language analysis
82  // you'll probably want to modify the punctuation
83  // \173 is '{', it is inserted by number because of a doc++ problem.
84 
85  ts.set_PrePunctuationSymbols("\173[(\"'");
87 
88  // Note you may set quotes so quoted tokens are read as single
89  // tokens (a la C)
90 
91  for (tokens=quotes=alices=0; !ts.eof(); tokens++)
92  {
93  t = ts.get();
94  if (t == "Alice")
95  alices++;
96  if (t.prepunctuation().contains("\""))
97  quotes++;
98  }
99 
100  printf("Input file contains:\n");
101  printf(" %5d tokens\n",tokens);
102  printf(" %5d tokens preceeded by double quotes\n",quotes);
103  printf(" %5d occurrences of Alice\n",alices);
104 
105  return 0;
106 }
107 
108 
EST_TokenStream & get(EST_Token &t)
get next token in stream
Definition: EST_Token.cc:499
int contains(const char *s, ssize_t pos=-1) const
Does it contain this substring?
Definition: EST_String.h:365
int main(int argc, char **argv)
const EST_String EST_Token_Default_PunctuationSymbols
Definition: EST_Token.cc:58
void set_PrePunctuationSymbols(const EST_String &ps)
set which characters are to be treated as pre-punctuation
Definition: EST_Token.h:350
int open(const EST_String &filename)
open an EST_TokenStream for a file.
Definition: EST_Token.cc:213
void set_PunctuationSymbols(const EST_String &ps)
set which characters are to be treated as (post) punctuation
Definition: EST_Token.h:347
int eof()
end of file
Definition: EST_Token.h:362
const EST_String & prepunctuation()
Definition: EST_Token.h:116