1   /*
2    *  CookBook.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Hamish Cunningham, 16/Feb/2000
12   *
13   *  $Id: CookBook.java,v 1.29 2001/12/03 12:29:46 valyt Exp $
14   */
15  
16  package gate;
17  
18  import java.util.*;
19  import java.net.*;
20  import java.io.*;
21  import junit.framework.*;
22  
23  import gate.*;
24  import gate.util.*;
25  import gate.creole.*;
26  import gate.creole.tokeniser.*;
27  import gate.creole.splitter.*;
28  import gate.creole.gazetteer.*;
29  import gate.creole.orthomatcher.*;
30  
31  
32  /**
33    * <P>
34    * This class provides examples of using the GATE APIs.
35    * Read this documentation along with a copy of the
36    * <A HREF=http://gate.ac.uk/gate/doc/java2html/gate/CookBook.java.html>source
37    * code</A>.
38    *
39    * <P>
40    * The CookBook is set up as
41    * part of the GATE test suite (using the
42    * <A HREF="http://www.junit.org/">JUnit testing framework</A>), so there's
43    * an easy way to run the examples (viz.,
44    * <A HREF=../gate/TestGate.html>gate.TestGate</A>'s <TT>main</TT> method,
45    * which will invoke the
46    * JUnit test runner). Also, we can use JUnit's assert methods: e.g.
47    * <TT>assertTrue(corpus.isEmpty());</TT>
48    * tests that a corpus object is empty, and creates a test failure report if
49    * this is not the case. (To add a new test class to the suite, see the
50    * <A HREF=../gate/util/TestTemplate.html>gate.util.TestTemplate</A> class.)
51    *
52    * <P>
53    * Programming to the GATE Java API involves manipulating the classes and
54    * interfaces in the <A HREF=package-summary.html>gate package</A>
55    * (and to a lesser extent other packages). These are
56    * often interfaces; classes there are often to do with getting
57    * access to objects that implement the interfaces (without exposing those
58    * implementations). In other words, there's a lot of interface-based design
59    * around.
60    *
61    * <P>
62    * For more details and for a conceptual view, see
63    * <A HREF=http://gate.ac.uk/sale/tao/>Developing Language Processing
64    * Components with GATE</A> (for which this class provides some of the
65    * examples).
66    *
67    * <P>
68    * The rest of this documentation refers to methods in the code that
69    * provide examples of using the GATE API.
70    *
71    * <P>
72    * The <A HREF=#testResourceCreation()>testResourceCreation</A> method gives
73    * an example of creating a resource via
74    * <A HREF=../gate/Factory.html>gate.Factory</A>.
75    *
76    * <P>
77    * The <A HREF=Corpus.html>Corpus interface</A> represents collections of
78    * <A HREF=Document.html>Documents</A> (and takes the place of the old TIPSTER
79    * <TT>Collection</TT> class).
80    *
81    * <P>
82    * The <A HREF=#testCorpusConstruction()>testCorpusConstruction</A> method
83    * gives an example of how to create a new transient Corpus object.
84    *
85    * <P>
86    * The <A HREF=#testAddingDocuments()>testAddingDocuments</A> method gives
87    * examples of adding documents to corpora.
88    *
89    * <P>
90    * The <A HREF=#testAddingAnnotations()>testAddingAnnotations</A> method gives
91    * examples of adding annotations to documents.
92    *
93    *
94    * <P>
95    * The <A HREF=#testUsingFeatures()>testUsingFeatures</A> method gives
96    * examples of using features. <A HREF=FeatureMap.html>The FeatureMap
97    * interface</A> is a mechanism for associating arbitrary data with GATE
98    * entities. Corpora, documents and annotations all share this
99    * mechanism. Simple feature maps use Java's Map interface.
100   *
101   *
102   * <H3>Other sources of examples</H3>
103   *
104   * <P>
105   * See also the other test classes, although note that they also use methods
106   * that are not part of the public API. Test classes include:
107   * <A HREF=corpora/TestCreole.html>TestCreole</A>;
108   * <A HREF=corpora/TestCorpus.html>TestCorpus</A>;
109   * <A HREF=corpora/TestDocument.html>TestDocument</A>;
110   * <A HREF=corpora/TestAnnotation.html>TestAnnotation</A>; anything
111   * else starting "Test" - about 30 of them at the last count.
112   */
113 public class CookBook extends TestCase
114 {
115   /** Debug flag */
116   private static final boolean DEBUG = false;
117 
118   /** A corpus */
119   Corpus corpus = null;
120 
121   /** A document */
122   Document doc1 = null;
123 
124   /** Another document */
125   Document doc2 = null;
126 
127   /** Constructing a resource */
128   public void testResourceCreation() throws GateException {
129 
130     // before creating a resource we need a feature map to store
131     // parameter values
132     FeatureMap params = Factory.newFeatureMap();
133 
134     // to create a document we need a sourceUrlName parameter giving
135     // the location of the source for the document content
136     params.put("sourceUrl", Gate.getUrl("tests/doc0.html"));
137     params.put("markupAware", new Boolean(true));
138     Resource res = Factory.createResource("gate.corpora.DocumentImpl", params);
139 
140     // now we have a document
141     assertTrue(
142       "should be document but the class is: " + res.getClass().getName(),
143       res instanceof gate.Document
144     );
145     Document doc = (Document) res;
146     AnnotationSet markupAnnotations = doc.getAnnotations(
147                         GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
148     //this is useless as doc.getAnnotations() will never return null!
149     assertNotNull("no markup annotations on doc " + doc, markupAnnotations);
150     int numMarkupAnnotations = markupAnnotations.size();
151     if(DEBUG)
152       Out.prln("annotations on doc after unpack= " + numMarkupAnnotations);
153     assertTrue(
154       "wrong number annots on doc: " + doc + numMarkupAnnotations,
155       numMarkupAnnotations == 27
156     );
157 
158   } // testResourceCreation
159 
160   /** Constructing a corpus */
161   public void testCorpusConstruction() throws GateException {
162 
163     // corpus constructors require a name
164     corpus = Factory.newCorpus("My example corpus");
165 
166     // the corpus interface inherits all the sorted set methods
167     assertTrue(corpus.isEmpty());
168 
169   } // testCorpusConstruction
170 
171   /** Adding documents to a corpus */
172   public void testAddingDocuments() throws GateException {
173 
174     corpus = Factory.newCorpus("My example corpus");
175 
176     // add a document or two....
177     corpus.add(doc1);
178     corpus.add(doc2);
179 
180     // iterate the corpus members and do some random tests
181     Iterator iter = corpus.iterator();
182     while(iter.hasNext()) {
183       Document doc = (Document) iter.next();
184       assertTrue(
185         "document url not as expected",
186         doc.getSourceUrl().toExternalForm().endsWith("doc0.html") ||
187           doc.getSourceUrl().toExternalForm().endsWith("test1.htm")
188       );
189     } // while
190 
191   } // testAddingDocuments
192 
193   /** Adding annotations to documents */
194   public void testAddingAnnotations() {
195     AnnotationSet as = doc1.getAnnotations();
196     FeatureMap fm = doc1.getFeatures();
197     Integer id;
198 
199     // during creation of annotations offsets are checked and an invalid
200     // offset exception thrown if they are invalid
201     try {
202       id = as.add(new Long(10), new Long(20), "T1", fm);
203     } catch (InvalidOffsetException e) {
204       fail(e.toString());
205     }
206   } // testAddingAnnotations
207 
208   /** Using the FeatureMap interface */
209   public void testUsingFeatures() {
210     AnnotationSet as = doc1.getAnnotations();
211     Integer id; // the id of new annotations
212 
213     // putting features on documents
214     FeatureMap fm = Factory.newFeatureMap();
215     doc1.setFeatures(fm);
216     assertTrue(fm.size() == 0);
217     fm.put("author", "segovia");
218     assertTrue(fm.get("author").equals("segovia"));
219     fm.put("author", "brendl"); // map puts overwrite existing values
220     assertTrue(fm.get("author").equals("brendl"));
221     assertTrue(fm.size() == 1);
222 
223   } // testUsingFeatures
224 
225   /** String to print when wrong command-line args */
226   private static String usage =
227     "usage: CookBook [-dir directory-name | file(s)]";
228 
229   /**
230    * Main function: an example of embedding GATE-based
231    * batch processing. The method:
232    * <UL>
233    * <LI>
234    * initialises the GATE library, and creates PRs for
235    * tokenisation, sentence splitting and part of speech tagging
236    * <LI>
237    * takes a directory name as argument (-dir option) or just a list
238    * of files
239    * <LI>
240    * creates a directory called "out" and an index.html file there
241    * <LI>
242    * for each .html file in that directory:
243    * <BR> create a GATE document from the file
244    * <BR> run the PRs on the document
245    * <BR> dump some output for the file to "out/gate__[file name].txt",
246    * and add a line to the index
247    * </UL>
248    */
249   public static void main(String[] args) throws Exception {
250     // say "hi"
251     Out.prln("CookBook.main");
252     Out.prln("processing command line arguments");
253 
254     // check we have a directory name or list of files
255     List inputFiles = null;
256     if(args.length < 1) throw new GateException(usage);
257 
258     // set up a list of all the files to process
259     if(args[0].equals("-dir")) { // list all the files in the dir
260       if(args.length < 2) throw new GateException(usage);
261       File dir = new File(args[1]);
262       File[] filesArray = dir.listFiles();
263       if(filesArray == null)
264         throw new GateException(
265           dir.getPath() + " is not a directory; " + usage
266         );
267       inputFiles = Arrays.asList(filesArray);
268 
269     } else { // all args should be file names
270       inputFiles = new ArrayList();
271       for(int i = 0; i < args.length; i++)
272         inputFiles.add(new File(args[i]));
273     }
274 
275     // did we get some file names?
276     if(inputFiles.isEmpty()) {
277       throw new GateException("No files to process!");
278     }
279 
280     // initialise GATE
281     Out.prln("initialising GATE");
282     Gate.init();
283 
284     // create some processing resources
285     Out.prln("creating PRs");
286     //create a tokeniser
287     DefaultTokeniser tokeniser = (DefaultTokeniser)Factory.createResource(
288                                       "gate.creole.tokeniser.DefaultTokeniser");
289     //create a sentence splitter
290     SentenceSplitter splitter = (SentenceSplitter)Factory.createResource(
291                                       "gate.creole.splitter.SentenceSplitter");
292     //create a POS tagger
293     POSTagger tagger = (POSTagger)Factory.createResource(
294                                       "gate.creole.POSTagger");
295 
296     //create  a gazetteer
297     DefaultGazetteer gazetteer = (DefaultGazetteer)Factory.createResource(
298                                       "gate.creole.gazetteer.DefaultGazetteer");
299 
300     //create a grammar
301     ANNIETransducer transducer = (ANNIETransducer)Factory.createResource(
302                                       "gate.creole.ANNIETransducer");
303 
304     //create an orthomatcher
305     OrthoMatcher orthomatcher = (OrthoMatcher) Factory.createResource(
306                                 "gate.creole.orthomatcher.OrthoMatcher");
307 
308     // make the "out" directory that will contain the results.
309     String outDirName =
310       ((File) inputFiles.get(0)).getParent() + Strings.getFileSep() + "out";
311     if(! new File(outDirName).mkdir()){
312       throw new GateException("Could not create the output directory");
313     }
314 
315     // construct a name for the output index file; open; dump header
316     String nl = Strings.getNl(); // shorthand for platform's newline
317     String fsep =
318       Strings.getFileSep(); // shorthand for platform's file separator
319     String indexName =
320       ( (File) inputFiles.get(0) ).getParent() + fsep + "index.html";
321     FileWriter indexWriter = new FileWriter(new File(indexName));
322     indexWriter.write("<HTML><HEAD><TITLE>Documents list</TITLE></HEAD>");
323     indexWriter.write(nl + "<BODY>" + nl + "<UL>" + nl);
324 
325     // main loop:
326     // for each document
327     //   create a gate doc
328     //   set as the document for the PRs
329     //   run the PRs
330     //   dump output from the doc to out/gate__.....txt
331     //   delete the doc
332 
333     // loop on files list
334     Iterator filesIter = inputFiles.iterator();
335     Out.prln("looping on input files list");
336     while(filesIter.hasNext()) {
337       File inFile = (File) filesIter.next(); // the current file
338       Out.prln("processing file " + inFile.getPath());
339       FeatureMap params = Factory.newFeatureMap(); // params list for new doc
340 
341       // set the source URL parameter to a "file:..." URL string
342       params.put("sourceUrl", inFile.toURL().toExternalForm());
343 
344       // use the platform's default encoding rather than GATE's
345       params.put("encoding", "");
346 
347       // create the document
348       Document doc = (Document) Factory.createResource(
349         "gate.corpora.DocumentImpl", params
350       );
351 
352       // set the document param on the PRs
353        tokeniser.setDocument(doc);
354        splitter.setDocument(doc);
355        tagger.setDocument(doc);
356        gazetteer.setDocument(doc);
357        transducer.setDocument(doc);
358        orthomatcher.setDocument(doc);
359 
360       // run each PR
361       tokeniser.execute();
362       splitter.execute();
363       tagger.execute();
364       gazetteer.execute();
365       transducer.execute();
366       orthomatcher.execute();
367 
368       // dump out results
369 
370       // construct a name for the output file and open a stream
371       StringBuffer outFileName = new StringBuffer(inFile.getParent());
372       outFileName.append(fsep);
373       outFileName.append("out");
374       outFileName.append(fsep);
375       outFileName.append("gate__");
376       outFileName.append(inFile.getName());
377       outFileName.append(".txt");
378       File outFile = new File(outFileName.toString());
379       FileWriter outFileWriter = new FileWriter(outFile);
380       Out.prln("dumping " + outFile.getPath());
381 
382       // iterate round the token annotations writing to the out file
383       // NOTE: to dump all to XML: outFileWriter.write(doc.toXml(tokens));
384       AnnotationSet tokens = doc.getAnnotations("nercAS").get("Token");
385       Iterator iter = tokens.iterator();
386       while(iter.hasNext()) {
387         Annotation token = (Annotation) iter.next();
388         FeatureMap tokFeats = token.getFeatures();
389         String tokStr = (String) tokFeats.get("string");
390         String tokPos = (String) tokFeats.get("category");
391         outFileWriter.write(tokStr + "\t" + tokPos + nl);
392       }
393       outFileWriter.write(doc.getFeatures().get("entitySet").toString());
394 
395       // close the out file stream; add an index line
396       outFileWriter.close();
397       indexWriter.write(
398         "<LI><A href=\"" + inFile.getName() + "\">" + inFile.getName() +
399         "</a>" + " -> " + "<a href=\"" + "out" + fsep + outFile.getName() +
400         "\">" + "out" + fsep + outFile.getName() + "</a></LI>\n"
401       );
402 
403       // make the doc a candidate for garbage collection
404       Out.prln("deleting gate doc");
405 
406       Factory.deleteResource(doc);
407     } // input files loop
408 
409     // finish the index file
410     indexWriter.write(nl + "</UL>" + nl + "</BODY></HTML>" + nl);
411     indexWriter.close();
412 
413     Out.prln("The End (roll credits)");
414   } // main
415 
416   /** Fixture set up: initialise members before each test method */
417   public void setUp() throws GateException, IOException {
418     corpus = Factory.newCorpus("My example corpus");
419 
420     doc1 = Factory.newDocument(Gate.getUrl("tests/doc0.html"));
421     doc2 = Factory.newDocument(Gate.getUrl("tests/html/test1.htm"));
422   } // setUp
423 
424   /** Construction */
425   public CookBook(String name) { super(name); }
426 
427   /** Test suite routine for the test runner */
428   public static Test suite() {
429     return new TestSuite(CookBook.class);
430   } // suite
431 
432 } // class CookBook
433