|
CookBook |
|
1 /* 2 * CookBook.java 3 * 4 * Copyright (c) 1998-2001, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Hamish Cunningham, 16/Feb/2000 12 * 13 * $Id: CookBook.java,v 1.29 2001/12/03 12:29:46 valyt Exp $ 14 */ 15 16 package gate; 17 18 import java.util.*; 19 import java.net.*; 20 import java.io.*; 21 import junit.framework.*; 22 23 import gate.*; 24 import gate.util.*; 25 import gate.creole.*; 26 import gate.creole.tokeniser.*; 27 import gate.creole.splitter.*; 28 import gate.creole.gazetteer.*; 29 import gate.creole.orthomatcher.*; 30 31 32 /** 33 * <P> 34 * This class provides examples of using the GATE APIs. 35 * Read this documentation along with a copy of the 36 * <A HREF=http://gate.ac.uk/gate/doc/java2html/gate/CookBook.java.html>source 37 * code</A>. 38 * 39 * <P> 40 * The CookBook is set up as 41 * part of the GATE test suite (using the 42 * <A HREF="http://www.junit.org/>JUnit testing framework</A>), so there's 43 * an easy way to run the examples (viz., 44 * <A HREF=../gate/TestGate.html>gate.TestGate</A>'s <TT>main</TT> method, 45 * which will invoke the 46 * JUnit test runner). Also, we can use JUnit's assert methods: e.g. 47 * <TT>assertTrue(corpus.isEmpty());</TT> 48 * tests that a corpus object is empty, and creates a test failure report if 49 * this is not the case. (To add a new test class to the suite, see the 50 * <A HREF=../gate/util/TestTemplate.html>gate.util.TestTemplate</A> class.) 51 * 52 * <P> 53 * Programming to the GATE Java API involves manipulating the classes and 54 * interfaces in the <A HREF=package-summary.html>gate package</A> 55 * (and to a lesser extent other packages). These are 56 * often interfaces; classes there are often to do with getting 57 * access to objects that implement the interfaces (without exposing those 58 * implementations). In other words, there's a lot of interface-based design 59 * around. 60 * 61 * <P> 62 * For more details and for a conceptual view, see 63 * <A HREF=http://gate.ac.uk/sale/tao/>Developing Language Processing 64 * Components with GATE</A> (for which this class provides some of the 65 * examples). 66 * 67 * <P> 68 * The rest of this documentation refers to methods in the code that 69 * provide examples of using the GATE API. 70 * 71 * <P> 72 * The <A HREF=#testResourceCreation()>testResourceCreation</A> method gives 73 * an example of creating a resource via 74 * <A HREF=../gate/Factory.html>gate.Factory</A>. 75 * 76 * <P> 77 * The <A HREF=Corpus.html>Corpus interface</A> represents collections of 78 * <A HREF=Document.html>Documents</A> (and takes the place of the old TIPSTER 79 * <TT>Collection</TT> class). 80 * 81 * <P> 82 * The <A HREF=#testCorpusConstruction()>testCorpusConstruction</A> method 83 * gives an example of how to create a new transient Corpus object. 84 * 85 * <P> 86 * The <A HREF=#testAddingDocuments()>testAddingDocuments</A> method gives 87 * examples of adding documents to corpora. 88 * 89 * <P> 90 * The <A HREF=#testAddingAnnotations()>testAddingAnnotations</A> method gives 91 * examples of adding annotations to documents. 92 * 93 * 94 * <P> 95 * The <A HREF=#testUsingFeatures()>testUsingFeatures</A> method gives 96 * examples of using features. <A HREF=FeatureMap.html>The FeatureMap 97 * interface</A> is a mechanism for associating arbitrary data with GATE 98 * entities. Corpora, documents and annotations all share this 99 * mechanism. Simple feature maps use Java's Map interface. 100 * 101 * 102 * <H3>Other sources of examples</H3> 103 * 104 * <P> 105 * See also the other test classes, although note that they also use methods 106 * that are not part of the public API. Test classes include: 107 * <A HREF=corpora/TestCreole.html>TestCreole</A>; 108 * <A HREF=corpora/TestCorpus.html>TestCorpus</A>; 109 * <A HREF=corpora/TestDocument.html>TestDocument</A>; 110 * <A HREF=corpora/TestAnnotation.html>TestAnnotation</A>; anything 111 * else starting "Test" - about 30 of them at the last count. 112 */ 113 public class CookBook extends TestCase 114 { 115 /** Debug flag */ 116 private static final boolean DEBUG = false; 117 118 /** A corpus */ 119 Corpus corpus = null; 120 121 /** A document */ 122 Document doc1 = null; 123 124 /** Another document */ 125 Document doc2 = null; 126 127 /** Constructing a resource */ 128 public void testResourceCreation() throws GateException { 129 130 // before creating a resource we need a feature map to store 131 // parameter values 132 FeatureMap params = Factory.newFeatureMap(); 133 134 // to create a document we need a sourceUrlName parameter giving 135 // the location of the source for the document content 136 params.put("sourceUrl", Gate.getUrl("tests/doc0.html")); 137 params.put("markupAware", new Boolean(true)); 138 Resource res = Factory.createResource("gate.corpora.DocumentImpl", params); 139 140 // now we have a document 141 assertTrue( 142 "should be document but the class is: " + res.getClass().getName(), 143 res instanceof gate.Document 144 ); 145 Document doc = (Document) res; 146 AnnotationSet markupAnnotations = doc.getAnnotations( 147 GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME); 148 //this is useless as doc.getAnnotations() will never return null! 149 assertNotNull("no markup annotations on doc " + doc, markupAnnotations); 150 int numMarkupAnnotations = markupAnnotations.size(); 151 if(DEBUG) 152 Out.prln("annotations on doc after unpack= " + numMarkupAnnotations); 153 assertTrue( 154 "wrong number annots on doc: " + doc + numMarkupAnnotations, 155 numMarkupAnnotations == 27 156 ); 157 158 } // testResourceCreation 159 160 /** Constructing a corpus */ 161 public void testCorpusConstruction() throws GateException { 162 163 // corpus constructors require a name 164 corpus = Factory.newCorpus("My example corpus"); 165 166 // the corpus interface inherits all the sorted set methods 167 assertTrue(corpus.isEmpty()); 168 169 } // testCorpusConstruction 170 171 /** Adding documents to a corpus */ 172 public void testAddingDocuments() throws GateException { 173 174 corpus = Factory.newCorpus("My example corpus"); 175 176 // add a document or two.... 177 corpus.add(doc1); 178 corpus.add(doc2); 179 180 // iterate the corpus members and do some random tests 181 Iterator iter = corpus.iterator(); 182 while(iter.hasNext()) { 183 Document doc = (Document) iter.next(); 184 assertTrue( 185 "document url not as expected", 186 doc.getSourceUrl().toExternalForm().endsWith("doc0.html") || 187 doc.getSourceUrl().toExternalForm().endsWith("test1.htm") 188 ); 189 } // while 190 191 } // testAddingDocuments 192 193 /** Adding annotations to documents */ 194 public void testAddingAnnotations() { 195 AnnotationSet as = doc1.getAnnotations(); 196 FeatureMap fm = doc1.getFeatures(); 197 Integer id; 198 199 // during creation of annotations offsets are checked and an invalid 200 // offset exception thrown if they are invalid 201 try { 202 id = as.add(new Long(10), new Long(20), "T1", fm); 203 } catch (InvalidOffsetException e) { 204 fail(e.toString()); 205 } 206 } // testAddingAnnotations 207 208 /** Using the FeatureMap interface */ 209 public void testUsingFeatures() { 210 AnnotationSet as = doc1.getAnnotations(); 211 Integer id; // the id of new annotations 212 213 // putting features on documents 214 FeatureMap fm = Factory.newFeatureMap(); 215 doc1.setFeatures(fm); 216 assertTrue(fm.size() == 0); 217 fm.put("author", "segovia"); 218 assertTrue(fm.get("author").equals("segovia")); 219 fm.put("author", "brendl"); // map puts overwrite existing values 220 assertTrue(fm.get("author").equals("brendl")); 221 assertTrue(fm.size() == 1); 222 223 } // testUsingFeatures 224 225 /** String to print when wrong command-line args */ 226 private static String usage = 227 "usage: CookBook [-dir directory-name | file(s)]"; 228 229 /** 230 * Main function: an example of embedding GATE-based 231 * batch processing. The method: 232 * <UL> 233 * <LI> 234 * initialises the GATE library, and creates PRs for 235 * tokenisation, sentence splitting and part of speech tagging 236 * <LI> 237 * takes a directory name as argument (-dir option) or just a list 238 * of files 239 * <LI> 240 * creates a directory called "out" and an index.html file there 241 * <LI> 242 * for each .html file in that directory: 243 * <BR> create a GATE document from the file 244 * <BR> run the PRs on the document 245 * <BR> dump some output for the file to "out/gate__[file name].txt", 246 * and add a line to the index 247 * </UL> 248 */ 249 public static void main(String[] args) throws Exception { 250 // say "hi" 251 Out.prln("CookBook.main"); 252 Out.prln("processing command line arguments"); 253 254 // check we have a directory name or list of files 255 List inputFiles = null; 256 if(args.length < 1) throw new GateException(usage); 257 258 // set up a list of all the files to process 259 if(args[0].equals("-dir")) { // list all the files in the dir 260 if(args.length < 2) throw new GateException(usage); 261 File dir = new File(args[1]); 262 File[] filesArray = dir.listFiles(); 263 if(filesArray == null) 264 throw new GateException( 265 dir.getPath() + " is not a directory; " + usage 266 ); 267 inputFiles = Arrays.asList(filesArray); 268 269 } else { // all args should be file names 270 inputFiles = new ArrayList(); 271 for(int i = 0; i < args.length; i++) 272 inputFiles.add(new File(args[i])); 273 } 274 275 // did we get some file names? 276 if(inputFiles.isEmpty()) { 277 throw new GateException("No files to process!"); 278 } 279 280 // initialise GATE 281 Out.prln("initialising GATE"); 282 Gate.init(); 283 284 // create some processing resources 285 Out.prln("creating PRs"); 286 //create a tokeniser 287 DefaultTokeniser tokeniser = (DefaultTokeniser)Factory.createResource( 288 "gate.creole.tokeniser.DefaultTokeniser"); 289 //create a sentence splitter 290 SentenceSplitter splitter = (SentenceSplitter)Factory.createResource( 291 "gate.creole.splitter.SentenceSplitter"); 292 //create a POS tagger 293 POSTagger tagger = (POSTagger)Factory.createResource( 294 "gate.creole.POSTagger"); 295 296 //create a gazetteer 297 DefaultGazetteer gazetteer = (DefaultGazetteer)Factory.createResource( 298 "gate.creole.gazetteer.DefaultGazetteer"); 299 300 //create a grammar 301 ANNIETransducer transducer = (ANNIETransducer)Factory.createResource( 302 "gate.creole.ANNIETransducer"); 303 304 //create an orthomatcher 305 OrthoMatcher orthomatcher = (OrthoMatcher) Factory.createResource( 306 "gate.creole.orthomatcher.OrthoMatcher"); 307 308 // make the "out" directory that will contain the results. 309 String outDirName = 310 ((File) inputFiles.get(0)).getParent() + Strings.getFileSep() + "out"; 311 if(! new File(outDirName).mkdir()){ 312 throw new GateException("Could not create the output directory"); 313 } 314 315 // construct a name for the output index file; open; dump header 316 String nl = Strings.getNl(); // shorthand for platform's newline 317 String fsep = 318 Strings.getFileSep(); // shorthand for platform's file separator 319 String indexName = 320 ( (File) inputFiles.get(0) ).getParent() + fsep + "index.html"; 321 FileWriter indexWriter = new FileWriter(new File(indexName)); 322 indexWriter.write("<HTML><HEAD><TITLE>Documents list</TITLE></HEAD>"); 323 indexWriter.write(nl + "<BODY>" + nl + "<UL>" + nl); 324 325 // main loop: 326 // for each document 327 // create a gate doc 328 // set as the document for the PRs 329 // run the PRs 330 // dump output from the doc to out/gate__.....txt 331 // delete the doc 332 333 // loop on files list 334 Iterator filesIter = inputFiles.iterator(); 335 Out.prln("looping on input files list"); 336 while(filesIter.hasNext()) { 337 File inFile = (File) filesIter.next(); // the current file 338 Out.prln("processing file " + inFile.getPath()); 339 FeatureMap params = Factory.newFeatureMap(); // params list for new doc 340 341 // set the source URL parameter to a "file:..." URL string 342 params.put("sourceUrl", inFile.toURL().toExternalForm()); 343 344 // use the platform's default encoding rather than GATE's 345 params.put("encoding", ""); 346 347 // create the document 348 Document doc = (Document) Factory.createResource( 349 "gate.corpora.DocumentImpl", params 350 ); 351 352 // set the document param on the PRs 353 tokeniser.setDocument(doc); 354 splitter.setDocument(doc); 355 tagger.setDocument(doc); 356 gazetteer.setDocument(doc); 357 transducer.setDocument(doc); 358 orthomatcher.setDocument(doc); 359 360 // run each PR 361 tokeniser.execute(); 362 splitter.execute(); 363 tagger.execute(); 364 gazetteer.execute(); 365 transducer.execute(); 366 orthomatcher.execute(); 367 368 // dump out results 369 370 // construct a name for the output file and open a stream 371 StringBuffer outFileName = new StringBuffer(inFile.getParent()); 372 outFileName.append(fsep); 373 outFileName.append("out"); 374 outFileName.append(fsep); 375 outFileName.append("gate__"); 376 outFileName.append(inFile.getName()); 377 outFileName.append(".txt"); 378 File outFile = new File(outFileName.toString()); 379 FileWriter outFileWriter = new FileWriter(outFile); 380 Out.prln("dumping " + outFile.getPath()); 381 382 // iterate round the token annotations writing to the out file 383 // NOTE: to dump all to XML: outFileWriter.write(doc.toXml(tokens)); 384 AnnotationSet tokens = doc.getAnnotations("nercAS").get("Token"); 385 Iterator iter = tokens.iterator(); 386 while(iter.hasNext()) { 387 Annotation token = (Annotation) iter.next(); 388 FeatureMap tokFeats = token.getFeatures(); 389 String tokStr = (String) tokFeats.get("string"); 390 String tokPos = (String) tokFeats.get("category"); 391 outFileWriter.write(tokStr + "\t" + tokPos + nl); 392 } 393 outFileWriter.write(doc.getFeatures().get("entitySet").toString()); 394 395 // close the out file stream; add an index line 396 outFileWriter.close(); 397 indexWriter.write( 398 "<LI><A href=\"" + inFile.getName() + "\">" + inFile.getName() + 399 "</a>" + " -> " + "<a href=\"" + "out" + fsep + outFile.getName() + 400 "\">" + "out" + fsep + outFile.getName() + "</a></LI>\n" 401 ); 402 403 // make the doc a candidate for garbage collection 404 Out.prln("deleting gate doc"); 405 406 Factory.deleteResource(doc); 407 } // input files loop 408 409 // finish the index file 410 indexWriter.write(nl + "</UL>" + nl + "</BODY></HTML>" + nl); 411 indexWriter.close(); 412 413 Out.prln("The End (roll credits)"); 414 } // main 415 416 /** Fixture set up: initialise members before each test method */ 417 public void setUp() throws GateException, IOException { 418 corpus = Factory.newCorpus("My example corpus"); 419 420 doc1 = Factory.newDocument(Gate.getUrl("tests/doc0.html")); 421 doc2 = Factory.newDocument(Gate.getUrl("tests/html/test1.htm")); 422 } // setUp 423 424 /** Construction */ 425 public CookBook(String name) { super(name); } 426 427 /** Test suite routine for the test runner */ 428 public static Test suite() { 429 return new TestSuite(CookBook.class); 430 } // suite 431 432 } // class CookBook 433
|
CookBook |
|