1   /*
2    *  TestPR.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Oana Hamza,
12   *
13   *  $Id: TestPR.java,v 1.22 2001/11/24 18:23:35 hamish Exp $
14   */
15  
16  package gate.creole;
17  
18  import java.util.*;
19  import java.io.*;
20  import java.net.*;
21  import junit.framework.*;
22  import gnu.regexp.*;
23  
24  import gate.*;
25  import gate.util.*;
26  import gate.corpora.TestDocument;
27  import gate.creole.tokeniser.*;
28  import gate.creole.gazetteer.*;
29  import gate.creole.splitter.*;
30  import gate.creole.orthomatcher.*;
31  import gate.persist.*;
32  import gate.annotation.*;
33  
34  /** Test the PRs on three documents */
35  public class TestPR extends TestCase
36  {
37    /** Debug flag */
38    private static final boolean DEBUG = false;
39  
40    protected static Document doc1;
41    protected static Document doc2;
42    protected static Document doc3;
43  
44    protected static List annotationTypes = new ArrayList(10);
45  
46    /** Construction */
47    public TestPR(String name) { super(name); }
48  
49    /** Fixture set up */
50    public void setUp() throws Exception {
51      //get 3 documents
52      if (doc1 == null)
53        doc1 = Factory.newDocument(
54          new URL(TestDocument.getTestServerName() +
55          "tests/ft-bt-03-aug-2001.html")
56        );
57  
58      if (doc2 == null)
59        doc2 = Factory.newDocument(
60          new URL(TestDocument.getTestServerName() +
61            "tests/gu-Am-Brit-4-aug-2001.html")
62        );
63  
64      if (doc3 == null)
65        doc3 = Factory.newDocument(
66          new URL(TestDocument.getTestServerName() +
67            "tests/in-outlook-09-aug-2001.html")
68        );
69  
70      annotationTypes.add("Sentence");
71      annotationTypes.add("Organization");
72      annotationTypes.add("Location");
73      annotationTypes.add("Person");
74      annotationTypes.add("Date");
75      annotationTypes.add("Money");
76      annotationTypes.add("Lookup");
77      annotationTypes.add("Token");
78    } // setUp
79  
80    /** Put things back as they should be after running tests.
81      */
82    public void tearDown() throws Exception {
83    } // tearDown
84  
85    public void testTokenizer() throws Exception {
86      FeatureMap params = Factory.newFeatureMap();
87      DefaultTokeniser tokeniser = (DefaultTokeniser) Factory.createResource(
88                      "gate.creole.tokeniser.DefaultTokeniser", params);
89  
90  
91      //run the tokeniser for doc1
92      tokeniser.setDocument(doc1);
93      tokeniser.execute();
94      assertTrue("Found in "+doc1.getSourceUrl().getFile()+ " "+
95        doc1.getAnnotations().size() +
96        " Token annotations, instead of the expected 1286.",
97        doc1.getAnnotations().size()== 1286);
98  
99      //run the tokeniser for doc2
100     tokeniser.setDocument(doc2);
101     tokeniser.execute();
102     assertTrue("Found in "+ doc2.getSourceUrl().getFile()+ " "+
103       doc2.getAnnotations().size() +
104       " Token annotations, instead of the expected 2144.",
105       doc2.getAnnotations().size()== 2144);
106 
107     //run the tokeniser for doc3
108     tokeniser.setDocument(doc3);
109     tokeniser.execute();
110     assertTrue("Found in "+ doc3.getSourceUrl().getFile()+ " "+
111       doc3.getAnnotations().size() +
112       " Token annotations, instead of the expected 2812.",
113       doc3.getAnnotations().size()== 2812);
114 
115     Factory.deleteResource(tokeniser);
116   }// testTokenizer
117 
118   public void testGazetteer() throws Exception {
119     FeatureMap params = Factory.newFeatureMap();
120     DefaultGazetteer gaz = (DefaultGazetteer) Factory.createResource(
121                           "gate.creole.gazetteer.DefaultGazetteer", params);
122 
123     //run gazetteer for doc1
124     gaz.setDocument(doc1);
125     gaz.execute();
126     assertTrue("Found in "+ doc1.getSourceUrl().getFile()+ " "+
127       doc1.getAnnotations().get("Lookup").size() +
128       " Lookup annotations, instead of the expected 47.",
129       doc1.getAnnotations().get("Lookup").size()== 47);
130 
131     //run gazetteer for doc2
132     gaz.setDocument(doc2);
133     gaz.execute();
134     assertTrue("Found in "+ doc2.getSourceUrl().getFile()+ " "+
135       doc2.getAnnotations().get("Lookup").size() +
136       " Lookup annotations, instead of the expected 99.",
137       doc2.getAnnotations().get("Lookup").size()== 99);
138 
139     //run gazetteer for doc3
140     gaz.setDocument(doc3);
141     gaz.execute();
142     assertTrue("Found in "+ doc3.getSourceUrl().getFile()+ " "+
143       doc3.getAnnotations().get("Lookup").size() +
144       " Lookup annotations, instead of the expected 112.",
145       doc3.getAnnotations().get("Lookup").size()== 112);
146     Factory.deleteResource(gaz);
147   }//testGazetteer
148 
149   public void testSplitter() throws Exception {
150     FeatureMap params = Factory.newFeatureMap();
151     SentenceSplitter splitter = (SentenceSplitter) Factory.createResource(
152                           "gate.creole.splitter.SentenceSplitter", params);
153 
154     //run splitter for doc1
155     splitter.setDocument(doc1);
156     splitter.execute();
157     assertTrue("Found in "+ doc1.getSourceUrl().getFile()+ " "+
158       doc1.getAnnotations().get("Sentence").size() +
159       " Sentence annotations, instead of the expected 22.",
160       doc1.getAnnotations().get("Sentence").size()== 22);
161 
162     assertTrue("Found in "+ doc1.getSourceUrl().getFile()+ " "+
163       doc1.getAnnotations().get("Split").size() +
164       " Split annotations, instead of the expected 36.",
165       doc1.getAnnotations().get("Split").size()== 36);
166 
167 
168     //run splitter for doc2
169     splitter.setDocument(doc2);
170     splitter.execute();
171     assertTrue("Found in "+ doc2.getSourceUrl().getFile()+ " "+
172       doc2.getAnnotations().get("Sentence").size() +
173       " Sentence annotations, instead of the expected 53.",
174       doc2.getAnnotations().get("Sentence").size()== 53);
175 
176     assertTrue("Found in "+ doc2.getSourceUrl().getFile()+ " "+
177       doc2.getAnnotations().get("Split").size() +
178       " Split annotations, instead of the expected 71.",
179       doc2.getAnnotations().get("Split").size()== 71);
180 
181     //run splitter for doc3
182     splitter.setDocument(doc3);
183     splitter.execute();
184 
185     assertTrue("Found in "+ doc3.getSourceUrl().getFile()+ " "+
186       doc3.getAnnotations().get("Sentence").size() +
187       " Sentence annotations, instead of the expected 65.",
188       doc3.getAnnotations().get("Sentence").size()== 65);
189     if (DEBUG)
190       Out.prln(doc3.getAnnotations().get("Sentence"));
191 
192     assertTrue("Found in "+ doc3.getSourceUrl().getFile()+ " "+
193       doc3.getAnnotations().get("Split").size() +
194       " Split annotations, instead of the expected 85.",
195       doc3.getAnnotations().get("Split").size()== 85);
196     Factory.deleteResource(splitter);
197   }//testSplitter
198 
199   public void testTagger() throws Exception {
200     FeatureMap params = Factory.newFeatureMap();
201     POSTagger tagger = (POSTagger) Factory.createResource(
202                           "gate.creole.POSTagger", params);
203 
204 
205     //run the tagger for doc1
206     tagger.setDocument(doc1);
207     tagger.execute();
208 
209     HashSet fType = new HashSet();
210     fType.add("category");
211     AnnotationSet annots =
212                   doc1.getAnnotations().get("Token", fType);
213 
214     assertTrue("Found in "+ doc1.getSourceUrl().getFile()+ " "+ annots.size() +
215       " Token annotations with category feature, instead of the expected 675.",
216       annots.size() == 675);
217 
218     //run the tagger for doc2
219     tagger.setDocument(doc2);
220     tagger.execute();
221     annots = doc2.getAnnotations().get("Token", fType);
222     assertTrue("Found in "+  doc2.getSourceUrl().getFile()+ " "+annots.size() +
223       " Token annotations with category feature, instead of the expected 1131.",
224       annots.size() == 1131);
225 
226     //run the tagger for doc3
227     tagger.setDocument(doc3);
228     tagger.execute();
229     annots = doc3.getAnnotations().get("Token", fType);
230     assertTrue("Found in "+ doc3.getSourceUrl().getFile()+ " "+ annots.size() +
231       " Token annotations with category feature, instead of the expected 1426.",
232       annots.size() == 1426);
233     Factory.deleteResource(tagger);
234   }//testTagger()
235 
236   public void testTransducer() throws Exception {
237     FeatureMap params = Factory.newFeatureMap();
238     ANNIETransducer transducer = (ANNIETransducer) Factory.createResource(
239                           "gate.creole.ANNIETransducer", params);
240 
241     //run the transducer for doc1
242     transducer.setDocument(doc1);
243     transducer.execute();
244     assertTrue("Found in "+ doc1.getSourceUrl().getFile()+ " "+
245       doc1.getAnnotations().get("Organization").size() +
246       " Organization annotations, instead of the expected 16",
247       doc1.getAnnotations().get("Organization").size()== 16);
248     assertTrue("Found in "+doc1.getSourceUrl().getFile()+ " "+
249       doc1.getAnnotations().get("Location").size() +
250       " Location annotations, instead of the expected 3",
251       doc1.getAnnotations().get("Location").size()== 3);
252     assertTrue("Found in "+doc1.getSourceUrl().getFile()+ " "+
253       doc1.getAnnotations().get("Person").size() +
254       " Person annotations, instead of the expected 3",
255       doc1.getAnnotations().get("Person").size()== 3);
256     assertTrue("Found in "+doc1.getSourceUrl().getFile()+ " "+
257       doc1.getAnnotations().get("Date").size() +
258       " Date annotations, instead of the expected 6",
259       doc1.getAnnotations().get("Date").size()== 6);
260     assertTrue("Found in "+doc1.getSourceUrl().getFile()+ " "+
261       doc1.getAnnotations().get("Money").size() +
262       " Money annotations, instead of the expected 1",
263       doc1.getAnnotations().get("Money").size()== 1);
264 
265     //run the transducer for doc2
266     transducer.setDocument(doc2);
267     transducer.execute();
268     assertTrue("Found in "+doc2.getSourceUrl().getFile()+ " "+
269       doc2.getAnnotations().get("Organization").size() +
270       " Organization annotations, instead of the expected 18",
271       doc2.getAnnotations().get("Organization").size()== 18);
272     assertTrue("Found in "+doc2.getSourceUrl().getFile()+ " "+
273       doc2.getAnnotations().get("Location").size() +
274       " Location annotations, instead of the expected 9",
275       doc2.getAnnotations().get("Location").size()== 9);
276     assertTrue("Found in "+doc2.getSourceUrl().getFile()+ " "+
277       doc2.getAnnotations().get("Person").size() +
278       " Person annotations, instead of the expected 1",
279       doc2.getAnnotations().get("Person").size()== 1);
280     assertTrue("Found in "+doc2.getSourceUrl().getFile()+ " "+
281       doc2.getAnnotations().get("Date").size() +
282       " Date annotations, instead of the expected 6",
283       doc2.getAnnotations().get("Date").size()== 6);
284     assertTrue("Found in "+doc2.getSourceUrl().getFile()+ " "+
285       doc2.getAnnotations().get("Money").size() +
286       " Money annotations, instead of the expected 3",
287       doc2.getAnnotations().get("Money").size()== 3);
288 
289     //run the transducer for doc3
290     transducer.setDocument(doc3);
291     transducer.execute();
292     assertTrue("Found in "+doc3.getSourceUrl().getFile()+ " "+
293       doc3.getAnnotations().get("Organization").size() +
294       " Organization annotations, instead of the expected 9",
295       doc3.getAnnotations().get("Organization").size()== 9);
296     assertTrue("Found in "+doc3.getSourceUrl().getFile()+ " "+
297       doc3.getAnnotations().get("Location").size() +
298       " Location annotations, instead of the expected 12",
299       doc3.getAnnotations().get("Location").size()== 12);
300     assertTrue("Found in "+doc3.getSourceUrl().getFile()+ " "+
301       doc3.getAnnotations().get("Person").size() +
302       " Person annotations, instead of the expected 8",
303       doc3.getAnnotations().get("Person").size()== 8);
304     assertTrue("Found in "+doc3.getSourceUrl().getFile()+ " "+
305       doc3.getAnnotations().get("Date").size() +
306       " Date annotations, instead of the expected 7",
307       doc3.getAnnotations().get("Date").size()== 7);
308     assertTrue("Found in "+doc3.getSourceUrl().getFile()+ " "+
309       doc3.getAnnotations().get("Money").size() +
310       " Money annotations, instead of the expected 4",
311       doc3.getAnnotations().get("Money").size()== 4);
312 
313     Factory.deleteResource(transducer);
314   }//testTransducer
315 
316   public void testOrthomatcher() throws Exception {
317     FeatureMap params = Factory.newFeatureMap();
318 
319     OrthoMatcher orthomatcher = (OrthoMatcher) Factory.createResource(
320                           "gate.creole.orthomatcher.OrthoMatcher", params);
321 
322 
323     // run the orthomatcher for doc1
324     orthomatcher.setDocument(doc1);
325     orthomatcher.execute();
326 
327     HashSet fType = new HashSet();
328     fType.add("matches");
329     AnnotationSet annots =
330                   doc1.getAnnotations().get(null,fType);
331 
332     assertTrue("Found in "+doc1.getSourceUrl().getFile()+ " "+ annots.size() +
333       " annotations with matches feature, instead of the expected 29.",
334       annots.size() == 29);
335 
336     //run the orthomatcher for doc2
337     orthomatcher.setDocument(doc2);
338     orthomatcher.execute();
339     annots = doc2.getAnnotations().get(null,fType);
340     assertTrue("Found in "+doc2.getSourceUrl().getFile()+ " "+ annots.size() +
341       " annotations with matches feature, instead of the expected 35.",
342       annots.size() == 33);
343 
344     //run the orthomatcher for doc3
345     orthomatcher.setDocument(doc3);
346     orthomatcher.execute();
347 
348     annots = doc3.getAnnotations().get(null,fType);
349     assertTrue("Found in "+doc3.getSourceUrl().getFile()+ " "+ annots.size() +
350       " annotations with matches feature, instead of the expected 20.",
351       annots.size() == 20);
352     Factory.deleteResource(orthomatcher);
353   }//testOrthomatcher
354 
355   /** A test for comparing the annotation sets*/
356   public void testAllPR() throws Exception {
357 
358     // verify if the saved data store is the same with the just processed file
359     // first document
360     String urlBaseName = Gate.locateGateFiles();
361 //    RE re1 = new RE("build/gate.jar!");
362 //    RE re2 = new RE("jar:");
363 //    urlBaseName = re1.substituteAll( urlBaseName,"classes");
364 //    urlBaseName = re2.substituteAll( urlBaseName,"");
365 
366     if (urlBaseName.endsWith("/gate/build/gate.jar!/")) {
367       StringBuffer buff = new StringBuffer(
368                             urlBaseName.substring(
369                               0,
370                               urlBaseName.lastIndexOf("build/gate.jar!/"))
371                             );
372       buff.append("classes/");
373       buff.delete(0, "jar:file:".length());
374       buff.insert(0, "file://");
375       urlBaseName = buff.toString();
376     }
377 
378     URL urlBase = new URL(urlBaseName + "gate/resources/gate.ac.uk/");
379 
380     URL storageDir = null;
381     storageDir = new URL(urlBase, "tests/ft");
382 
383     //open the data store
384     DataStore ds = Factory.openDataStore
385                     ("gate.persist.SerialDataStore",
386                      storageDir.toExternalForm());
387 
388     //get LR id
389     String lrId = (String)ds.getLrIds
390                                 ("gate.corpora.DocumentImpl").get(0);
391 
392 
393     // get the document from data store
394     FeatureMap features = Factory.newFeatureMap();
395     features.put(DataStore.DATASTORE_FEATURE_NAME, ds);
396     features.put(DataStore.LR_ID_FEATURE_NAME, lrId);
397     Document document = (Document) Factory.createResource(
398                                       "gate.corpora.DocumentImpl",
399                                       features);
400     compareAnnots(document, doc1);
401 
402     // second document
403     storageDir = null;
404     storageDir = new URL(urlBase, "tests/gu");
405 
406     //open the data store
407     ds = Factory.openDataStore("gate.persist.SerialDataStore",
408                                storageDir.toExternalForm());
409     //get LR id
410     lrId = (String)ds.getLrIds("gate.corpora.DocumentImpl").get(0);
411     // get the document from data store
412     features = Factory.newFeatureMap();
413     features.put(DataStore.DATASTORE_FEATURE_NAME, ds);
414     features.put(DataStore.LR_ID_FEATURE_NAME, lrId);
415     document = (Document) Factory.createResource(
416                                       "gate.corpora.DocumentImpl",
417                                       features);
418     compareAnnots(document,doc2);
419 
420     // third document
421     storageDir = null;
422     storageDir = new URL(urlBase, "tests/in");
423 
424     //open the data store
425     ds = Factory.openDataStore("gate.persist.SerialDataStore",
426                                storageDir.toExternalForm());
427     //get LR id
428     lrId = (String)ds.getLrIds("gate.corpora.DocumentImpl").get(0);
429     // get the document from data store
430     features = Factory.newFeatureMap();
431     features.put(DataStore.DATASTORE_FEATURE_NAME, ds);
432     features.put(DataStore.LR_ID_FEATURE_NAME, lrId);
433     document = (Document) Factory.createResource(
434                                 "gate.corpora.DocumentImpl",
435                                 features);
436     compareAnnots(document,doc3);
437   } // testAllPR()
438 
439   public void compareAnnots(Document keyDocument, Document responseDocument){
440 
441     // create annotation schema
442     AnnotationSchema annotationSchema = new AnnotationSchema();
443     String annotType = null;
444 
445     // organization type
446     Iterator iteratorTypes = annotationTypes.iterator();
447     while (iteratorTypes.hasNext()){
448       // get the type of annotation
449       annotType = (String)iteratorTypes.next();
450 
451       annotationSchema.setAnnotationName(annotType);
452 
453       // create an annotation diff
454       FeatureMap parameters = Factory.newFeatureMap();
455       parameters.put("keyDocument",keyDocument);
456       parameters.put("responseDocument",responseDocument);
457       parameters.put("annotationSchema",annotationSchema);
458       parameters.put("keyAnnotationSetName",null);
459       parameters.put("responseAnnotationSetName",null);
460 
461       // Create Annotation Diff visual resource
462       try {
463       AnnotationDiff annotDiff = (AnnotationDiff)
464           Factory.createResource("gate.annotation.AnnotationDiff",parameters);
465 
466       if (DEBUG){
467         if (annotDiff.getFMeasureAverage() != 1.0) {
468           assertTrue("missing annotations " +
469             annotDiff.getAnnotationsOfType(AnnotationDiff.MISSING_TYPE),false);
470           assertTrue("spurious annotations " +
471             annotDiff.getAnnotationsOfType(AnnotationDiff.SPURIOUS_TYPE),false);
472           assertTrue("partially-correct annotations " +
473             annotDiff.getAnnotationsOfType(
474                             AnnotationDiff.PARTIALLY_CORRECT_TYPE),false);
475         }
476       }//if
477 
478       assertTrue(annotType+ " precision average in "+
479         responseDocument.getSourceUrl().getFile()+
480         " is "+ annotDiff.getPrecisionAverage()+ " instead of 1.0 ",
481         annotDiff.getPrecisionAverage()== 1.0);
482       assertTrue(annotType+" recall average in "
483         +responseDocument.getSourceUrl().getFile()+
484         " is " + annotDiff.getRecallAverage()+ " instead of 1.0 ",
485         annotDiff.getRecallAverage()== 1.0);
486       assertTrue(annotType+" f-measure average in "
487         +responseDocument.getSourceUrl().getFile()+
488         " is "+ annotDiff.getFMeasureAverage()+ " instead of 1.0 ",
489         annotDiff.getFMeasureAverage()== 1.0);
490       } catch (ResourceInstantiationException rie) {
491         rie.printStackTrace(Err.getPrintWriter());
492       }
493 
494      }//while
495    }// public void compareAnnots
496 
497   /** Test suite routine for the test runner */
498   public static Test suite() {
499     return new TestSuite(TestPR.class);
500   } // suite
501 
502   public static void main(String[] args) {
503     try{
504 
505       Gate.init();
506       TestPR testPR = new TestPR("");
507       testPR.setUp();
508       testPR.testTokenizer();
509       testPR.testGazetteer();
510       testPR.testSplitter();
511       testPR.testTagger();
512       testPR.testTransducer();
513       testPR.testOrthomatcher();
514       testPR.testAllPR();
515       testPR.tearDown();
516     } catch(Exception e) {
517       e.printStackTrace();
518     }
519   } // main
520 } // class TestPR
521