Edinburgh Speech Tools  2.1-release
token_regression.cc
Go to the documentation of this file.
1 /************************************************************************/
2 /* */
3 /* Centre for Speech Technology Research */
4 /* University of Edinburgh, UK */
5 /* Copyright (c) 1996,1997 */
6 /* All Rights Reserved. */
7 /* */
8 /* Permission is hereby granted, free of charge, to use and distribute */
9 /* this software and its documentation without restriction, including */
10 /* without limitation the rights to use, copy, modify, merge, publish, */
11 /* distribute, sublicense, and/or sell copies of this work, and to */
12 /* permit persons to whom this work is furnished to do so, subject to */
13 /* the following conditions: */
14 /* 1. The code must retain the above copyright notice, this list of */
15 /* conditions and the following disclaimer. */
16 /* 2. Any modifications must be clearly marked as such. */
17 /* 3. Original authors' names are not deleted. */
18 /* 4. The authors' names are not used to endorse or promote products */
19 /* derived from this software without specific prior written */
20 /* permission. */
21 /* */
22 /* THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK */
23 /* DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING */
24 /* ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT */
25 /* SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE */
26 /* FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES */
27 /* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN */
28 /* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, */
29 /* ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF */
30 /* THIS SOFTWARE. */
31 /* */
32 /************************************************************************/
33 /* Author: Alan W Black */
34 /* Date: May 1997 */
35 /************************************************************************/
36 /* */
37 /* Lets see if we can break the TokenStream class */
38 /* */
39 /************************************************************************/
40 
41 #include <cstdlib>
42 #include "EST_Token.h"
43 
44 using namespace std;
45 
46 static void binary_read_test();
47 
48 static void find_tokens(EST_TokenStream &ts)
49 {
50  // Count and display the tokens in this stream
51  int tokens;
52 
53  for (tokens=0; !ts.eof(); tokens++)
54  cout << ts.get().string() << endl;
55  cout << "Total: " << tokens << endl << endl;;
56 
57 }
58 
// Regression driver for EST_TokenStream.  Tokenizes a fixed series of
// in-memory strings under different punctuation, single-character-symbol
// and quote settings, printing every token through find_tokens(), and then
// runs binary_read_test().  Always returns 0.
59 int main(int argc,char **argv)
60 {
61  // Simple program that tokenizes a series of built-in test strings
62  // and prints a summary of them
    // The command line is not consulted; the casts only mark the
    // parameters as deliberately unused.
63  (void)argc;
64  (void)argv;
65  EST_TokenStream ts;
66  EST_String s;
67 
68  // Basic tokenizing tasks changing punctuation, whitespace and
69  // single character symbols etc.
    // Test 1: plain sentence, no stream settings changed before reading.
70  s = "This is a test.";
71  cout << "Test 1: " << quote_string(s) << endl;
72  ts.open_string(s);
73  find_tokens(ts);
74  ts.close();
75 
    // Test 2: same as test 1 but the input contains parentheses.
76  s = "This (is) a test.";
77  cout << "Test 2: " << quote_string(s) << endl;
78  ts.open_string(s);
79  find_tokens(ts);
80  ts.close();
81 
    // Test 3: "({[" are set as pre-punctuation symbols.  The stream is
    // opened on the literal rather than on s; both hold the same text.
    // NOTE(review): this listing skips source line 86 right after the
    // set_PrePunctuationSymbols call - confirm against the original file.
82  s = "This (is) a test.";
83  cout << "Test 3: " << quote_string(s) << endl;
84  ts.open_string("This (is) a test.");
85  ts.set_PrePunctuationSymbols("({[");
87  find_tokens(ts);
88  ts.close();
89 
    // Test 4: '(' and ')' are set as single character symbols.
    // NOTE(review): this listing skips source line 94 right after the
    // set_SingleCharSymbols call - confirm against the original file.
90  s = "This (is) a test.";
91  cout << "Test 4: " << quote_string(s) << endl;
92  ts.open_string(s);
93  ts.set_SingleCharSymbols("()");
95  find_tokens(ts);
96  ts.close();
97 
    // Test 5: input with a double-quoted section and a backslash-escaped
    // quote, read without calling set_quotes.
    // NOTE(review): this listing skips source lines 101-102 right after
    // open_string - confirm against the original file.
98  s = "This \"is a\" te\\\"st.";
99  cout << "Test 5: " << quote_string(s) << endl;
100  ts.open_string(s);
103  find_tokens(ts);
104  ts.close();
105 
    // Test 6: same input as test 5, now with '"' as the quote character
    // and '\' as the escape character (set_quotes also sets quote mode).
106  s = "This \"is a\" te\\\"st.";
107  cout << "Test 6: " << quote_string(s) << endl;
108  ts.open_string(s);
109  ts.set_quotes('"','\\');
110  find_tokens(ts);
111  ts.close();
112 
    // Test 7: as test 6, but the quoted section contains a newline.  The
    // literal is continued onto the next source line with a backslash.
113  s = "This \"is \n\
114 a\" te\\\"st.";
115  cout << "Test 7: " << quote_string(s) << endl;
116  ts.open_string(s);
117  ts.set_quotes('"','\\');
118  find_tokens(ts);
119  ts.close();
120 
121  // test of reading binary data
122  binary_read_test();
123 
124  return 0;
125 }
126 
128 {
129  FILE *fd;
130  char buff[64];
131  int a[2];
132  int numbytes;
133  // Make a buffer with both tokens and binary data
134  sprintf(buff,"a buffer BINARY ");
135  a[0] = 7;
136  a[1] = -34;
137  memmove(buff+16,a,sizeof(int)*2);
138  sprintf(buff+16+(sizeof(int)*2)," and tokens");
139 
140  if ((fd=fopen(filename,"w")) == NULL)
141  {
142  cerr << "Token_regression: failed to open " << filename << endl;
143  exit(-1);
144  }
145 
146  numbytes = fwrite(buff,1,16+(sizeof(int)*2)+11,fd);
147  fclose(fd);
148 
149  // Special constructions as the string contains nulls
150  return EST_String(buff,numbytes,0,numbytes);
151 }
152 
153 static void binary_read_test()
154 {
155  // You can use fread to read directly from a token stream
156  // but care should be take at the boundaries. Reading a
157  // token will always read the character following it. By
158  // convention it is recommended you include the single token
159  // BINARY follow by a single space in the stream before each
160  // binary section.
161  int b[2];
162  EST_String tokbinbuf;
163  EST_TokenStream ts;
164 
165  tokbinbuf = make_tokbins("tmp/tokbin.dat");
166 
167  // Do the reading
168 
169  cout << "Reading tokens and binary from string\n";
170 
171  ts.open_string(tokbinbuf);
172 
173  cout << ts.get() << endl;
174  cout << ts.get() << endl;
175  if (ts.get() != "BINARY")
176  {
177  cout << "failed to read binary data, missing BINARY token." << endl;
178  exit(-1);
179  }
180  if(ts.fread(b,sizeof(int),2) != 2)
181  {
182  cerr << "Error reading 2 integers from tokenstream" << endl;
183  exit(-1);
184  }
185  cout << b[0] << endl;
186  cout << b[1] << endl;
187  cout << ts.get() << endl;
188  cout << ts.get() << endl;
189  ts.close();
190 
191  cout << "Reading tokens and binary from file\n";
192 
193  ts.open("tmp/tokbin.dat");
194 
195  cout << ts.get() << endl;
196  cout << ts.get() << endl;
197  if (ts.get() != "BINARY")
198  {
199  cout << "failed to read binary data, missing BINARY token." << endl;
200  exit(-1);
201  }
202  if (ts.fread(b,sizeof(int),2) != 2)
203  {
204  cerr << "Error reading 2 int from tokenstream" << endl;
205  exit(-1);
206  }
207  cout << b[0] << endl;
208  cout << b[1] << endl;
209  cout << ts.get() << endl;
210  cout << ts.get() << endl;
211  ts.close();
212 
213 }
214 
215 
EST_TokenStream & get(EST_Token &t)
get next token in stream
Definition: EST_Token.cc:499
int fread(void *buff, int size, int nitems) EST_WARN_UNUSED_RESULT
Reading binary data, (don't use peek() immediately beforehand)
Definition: EST_Token.cc:368
void set_SingleCharSymbols(const EST_String &sc)
set which characters are to be treated as single character symbols
Definition: EST_Token.h:344
void close(void)
Close stream.
Definition: EST_Token.cc:419
const EST_String EST_Token_Default_PunctuationSymbols
Definition: EST_Token.cc:58
void set_PrePunctuationSymbols(const EST_String &ps)
set which characters are to be treated as (pre) punctuation
Definition: EST_Token.h:350
int open(const EST_String &filename)
open a EST_TokenStream for a file.
Definition: EST_Token.cc:213
int open_string(const EST_String &newbuffer)
open a EST_TokenStream for string rather than a file
Definition: EST_Token.cc:264
void set_PunctuationSymbols(const EST_String &ps)
set which characters are to be treated as (post) punctuation
Definition: EST_Token.h:347
int eof()
end of file
Definition: EST_Token.h:362
const EST_String EST_Token_Default_PrePunctuationSymbols
Definition: EST_Token.cc:57
NULL
Definition: EST_WFST.cc:55
void set_quotes(char q, char e)
set characters to be used as quotes and escape, and set quote mode
Definition: EST_Token.h:353
int main(int argc, char **argv)
EST_String make_tokbins(const EST_String &filename)
EST_String
EST_String quote_string(const EST_String &s, const EST_String &quote, const EST_String &escape, int force)
Definition: EST_Token.cc:844