1   /*
2    *  CorpusBenchmarkTool.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Kalina Bontcheva, 24/Oct/2001
12   *
13   *  $Id: CorpusBenchmarkTool.java,v 1.18 2001/11/13 15:32:05 valyt Exp $
14   */
15  
16  package gate.util;
17  
18  import java.util.*;
19  import java.io.*;
20  
21  import gate.*;
22  import gate.creole.*;
23  import gate.util.*;
24  import gate.persist.*;
25  import gate.creole.tokeniser.*;
26  import gate.creole.gazetteer.*;
27  import gate.creole.splitter.*;
28  import gate.creole.orthomatcher.*;
29  import gate.creole.annotransfer.*;
30  import gate.annotation.*;
31  
32  public class CorpusBenchmarkTool {
  /** Name of the sub-directory that holds the human-annotated documents. */
  private static final String MARKED_DIR_NAME = "marked";
  /** Name of the sub-directory that holds the original documents. */
  private static final String CLEAN_DIR_NAME = "clean";
  /**
   * Name of the directories that are skipped while traversing the corpus.
   * NOTE(review): CVS normally names its bookkeeping directories "CVS" -
   * confirm that this casing is intended.
   */
  private static final String CVS_DIR_NAME = "Cvs";
  /** Name of the sub-directory that holds the datastore of processed documents. */
  private static final String PROCESSED_DIR_NAME = "processed";

  /** Debug flag. */
  private static final boolean DEBUG = true;
39  
40    public CorpusBenchmarkTool() {}
41  
42    public void initPRs() {
43      try {
44        FeatureMap params = Factory.newFeatureMap();
45  
46        //create a default tokeniser
47        Out.prln("Loading tokeniser <P>");
48        String rulesURL = this.configs.getProperty("tokeniserRulesURL");
49        if (rulesURL != null && !rulesURL.equals(""))
50          params.put("tokeniserRulesURL", rulesURL);
51        String grammarsURL = this.configs.getProperty("tokeniserGrammarURL");
52        if (grammarsURL != null && !grammarsURL.equals(""))
53          params.put("transducerGrammarURL", grammarsURL);
54        //the annots are put in temp, as they are going to be transfered to the
55        //new set
56        params.put("annotationSetName", "temp");
57        tokeniser = (DefaultTokeniser) Factory.createResource(
58                        "gate.creole.tokeniser.DefaultTokeniser", params);
59  
60        //create a default gazetteer
61        Out.prln("Loading gazetteer <P>");
62        params.clear();
63        String listsURL = this.configs.getProperty("gazetteerListsURL");
64        if (listsURL != null && !listsURL.equals(""))
65          params.put("listsURL", listsURL);
66        String caseSensitive = this.configs.getProperty("gazetteerCaseSensitive");
67        if (caseSensitive != null && !caseSensitive.equals(""))
68          params.put("caseSensitive", new Boolean(caseSensitive));
69        params.put("annotationSetName", "temp");
70        gazetteer = (DefaultGazetteer) Factory.createResource(
71                        "gate.creole.gazetteer.DefaultGazetteer", params);
72  
73        //create the Annotation set transfer
74        Out.prln("Loading annotation set transfer <P>");
75        params.clear();
76        params.put("inputASName", "temp");
77        params.put("outputASName", annotSetName);
78        //by default make it transfer all annotations
79        params.put("textTagName", "");
80        setTransfer = (AnnotationSetTransfer) Factory.createResource(
81                        "gate.creole.annotransfer.AnnotationSetTransfer", params);
82  
83        //create a splitter
84        Out.prln("Loading sentence splitter <P>");
85        params.clear();
86        listsURL = this.configs.getProperty("splitterGazetteerURL");
87        if (listsURL != null && !listsURL.equals(""))
88          params.put("gazetteerListsURL", listsURL);
89        grammarsURL = this.configs.getProperty("splitterGrammarURL");
90        if (grammarsURL != null && !grammarsURL.equals(""))
91          params.put("transducerURL", grammarsURL);
92        params.put("inputASName", annotSetName);
93        params.put("outputASName", annotSetName);
94        splitter = (SentenceSplitter) Factory.createResource(
95                        "gate.creole.splitter.SentenceSplitter", params);
96  
97        //create a tagger
98        Out.prln("Loading POS tagger <P>");
99        params.clear();
100       String lexiconURL = this.configs.getProperty("taggerLexiconURL");
101       if (lexiconURL != null && !lexiconURL.equals(""))
102         params.put("lexiconURL", lexiconURL);
103       rulesURL = this.configs.getProperty("taggerRulesURL");
104       if (rulesURL != null && !rulesURL.equals(""))
105         params.put("rulesURL", rulesURL);
106       params.put("inputASName", annotSetName);
107       params.put("outputASName", annotSetName);
108       tagger = (POSTagger) Factory.createResource(
109                       "gate.creole.POSTagger", params);
110 
111       //create a grammar
112       Out.prln("Loading grammars for transducer <P>");
113       params.clear();
114       String grammarURL = this.configs.getProperty("grammarURL");
115       if (grammarURL != null && !grammarURL.equals(""))
116         params.put("grammarURL", grammarURL);
117       params.put("inputASName", annotSetName);
118       params.put("outputASName", annotSetName);
119       transducer = (ANNIETransducer) Factory.createResource(
120                       "gate.creole.ANNIETransducer", params);
121 
122       //create an orthomatcher
123       Out.prln("Loading orthomatcher <P>");
124       params.clear();
125       params.put("annotationSetName", annotSetName);
126       orthomatcher = (OrthoMatcher) Factory.createResource(
127                       "gate.creole.orthomatcher.OrthoMatcher", params);
128     } catch (ResourceInstantiationException ex) {
129       throw new GateRuntimeException("Corpus Benchmark Tool:"+ex.getMessage());
130     }
131   }//initPRs
132 
133   public void execute() {
134     execute(startDir);
135   }
136 
137   public void init() {
138     //we only initialise the PRs if they are going to be used
139     //for processing unprocessed documents
140     if (!this.isMarkedStored)
141       initPRs();
142 
143     annotTypes = new ArrayList();
144     annotTypes.add("Organization");
145     annotTypes.add("Person");
146     annotTypes.add("Date");
147     annotTypes.add("Location");
148     annotTypes.add("Address");
149     annotTypes.add("Money");
150     annotTypes.add("Percent");
151     annotTypes.add("GPE");
152     annotTypes.add("Facility");
153 
154   }
155 
156   public void execute(File dir) {
157     if (dir == null)
158       return;
159     //first set the current directory to be the given one
160     currDir = dir;
161     Out.prln("Processing directory: " + currDir + "<P>");
162 
163     File processedDir = null;
164     File cleanDir = null;
165     File markedDir = null;
166 
167     ArrayList subDirs = new ArrayList();
168     File[] dirArray = currDir.listFiles();
169     for (int i = 0; i < dirArray.length; i++) {
170       if (dirArray[i].isFile() || dirArray[i].getName().equals(CVS_DIR_NAME))
171         continue;
172       if (dirArray[i].getName().equals(CLEAN_DIR_NAME))
173         cleanDir = dirArray[i];
174       else if (dirArray[i].getName().equals(MARKED_DIR_NAME))
175         markedDir = dirArray[i];
176       else if (dirArray[i].getName().equals(PROCESSED_DIR_NAME))
177         processedDir = dirArray[i];
178       else
179         subDirs.add(dirArray[i]);
180     }
181 
182     if (this.isGenerateMode)
183       generateCorpus(cleanDir, processedDir);
184     else
185       evaluateCorpus(cleanDir, processedDir, markedDir);
186 
187     //if no more subdirs left, return
188     if (subDirs.isEmpty())
189       return;
190 
191     //there are more subdirectories to traverse, so iterate through
192     for (int j = 0; j < subDirs.size(); j++)
193       execute((File) subDirs.get(j));
194 
195   }//execute(dir)
196 
197 
198   public static void main(String[] args) throws GateException {
199     Out.prln("<HTML>");
200     Out.prln("<HEAD>");
201     Out.prln("<TITLE> Corpus benchmark tool: ran with args " +
202             args.toString() + " on " +
203             new Date() + "</TITLE> </HEAD>");
204     Out.prln("<BODY>");
205     Out.prln("Please wait while GATE tools are initialised. <P>");
206     // initialise GATE
207     Gate.init();
208 
209     CorpusBenchmarkTool corpusTool = new CorpusBenchmarkTool();
210 
211     List inputFiles = null;
212     if(args.length < 1) throw new GateException(usage);
213     int i = 0;
214     while (i < args.length && args[i].startsWith("-")) {
215       if(args[i].equals("-generate")) {
216         Out.prln("Generating the corpus... <P>");
217         corpusTool.setGenerateMode(true);
218       } else if (args[i].equals("-marked_clean")) {
219         Out.prln("Evaluating current grammars against human-annotated...<P>");
220         corpusTool.setMarkedClean(true);
221       } else if (args[i].equals("-marked_stored")) {
222         Out.prln("Evaluating stored documents against human-annotated...<P>");
223         corpusTool.setMarkedStored(true);
224       } else if (args[i].equals("-verbose")) {
225         Out.prln("Running in verbose mode. Will generate annotation " +
226           "information when precision/recall are lower than " +
227           corpusTool.getThreshold() +"<P>");
228         corpusTool.setVerboseMode(true);
229       }
230       i++; //just ignore the option, which we do not recognise
231     }//while
232 
233     String dirName = args[i];
234     File dir = new File(dirName);
235     if (!dir.isDirectory())
236       throw new GateException(usage);
237 
238     File propFile = new File("corpus_tool.properties");
239     Out.prln(propFile.getAbsolutePath());
240     if (propFile.exists()) {
241       try {
242         InputStream inputStream = new FileInputStream(propFile);
243         corpusTool.configs.load(inputStream);
244         String thresholdString = corpusTool.configs.getProperty("threshold");
245         if (thresholdString != null && !thresholdString.equals("")) {
246           corpusTool.threshold = (new Double(thresholdString)).doubleValue();
247           Out.prln("new threshold is: " + corpusTool.threshold);
248         }
249         String setName = corpusTool.configs.getProperty("annotSetName");
250         if (setName != null && !setName.equals(""))
251           corpusTool.annotSetName = setName;
252       } catch (IOException ex) {
253         //just ignore the file and go on with the defaults
254         corpusTool.configs = new Properties();
255       }
256     } else
257       corpusTool.configs = new Properties();
258 
259     corpusTool.init();
260 
261     Out.prln("Measuring annotaitions of types: " + corpusTool.annotTypes + "<P>");
262 
263     corpusTool.setStartDirectory(dir);
264     corpusTool.execute();
265 
266     //if we're not generating the corpus, then print the precision and recall
267     //statistics for the processed corpus
268     if (! corpusTool.getGenerateMode())
269       corpusTool.printStatistics();
270 
271     Out.prln("Finished! <P>");
272     Out.prln("</BODY>");
273     Out.prln("</HTML>");
274 
275     System.exit(0);
276 
277   }//main
278 
279   public void setGenerateMode(boolean mode) {
280     isGenerateMode = mode;
281   }//setGenerateMode
282 
283   public boolean getGenerateMode() {
284     return isGenerateMode;
285   }//getGenerateMode
286 
287   public boolean getVerboseMode() {
288     return isVerboseMode;
289   }//getVerboseMode
290 
291   public void setVerboseMode(boolean mode) {
292     isVerboseMode = mode;
293   }//setVerboseMode
294 
295   public void setMarkedStored(boolean mode) {
296     isMarkedStored = mode;
297   }//
298 
299   public boolean getMarkedStored() {
300     return isMarkedStored;
301   }//
302 
303   public void setMarkedClean(boolean mode) {
304     isMarkedClean = mode;
305   }//
306 
307   public boolean getMarkedClean() {
308     return isMarkedClean;
309   }//
310 
311   /**
312    * Returns the average precision over the entire set of processed documents.
313    * <P>
314    * If the tool has been evaluating the original documents against the
315    * previously-stored automatically annotated ones, then the precision
316    * will be the average precision on those two sets. <P>
317    * If the tool was run in -marked mode, i.e., was evaluating the stored
318    * automatically processed ones against the human-annotated ones, then
319    * the precision will be the average precision on those two sets of documents.
320    */
321   public double getPrecisionAverage() {
322     return precisionSum/docNumber;
323   }
324 
325   /**
326    * Returns the average recall over the entire set of processed documents.
327    * <P>
328    * If the tool has been evaluating the original documents against the
329    * previously-stored automatically annotated ones, then the recall
330    * will be the average recall on those two sets. <P>
331    * If the tool was run in -marked mode, i.e., was evaluating the stored
332    * automatically processed ones against the human-annotated ones, then
333    * the recall will be the average recall on those two sets of documents.
334    */
335   public double getRecallAverage() {
336     return recallSum/docNumber;
337   }
338 
339   public boolean isGenerateMode() {
340     return isGenerateMode == true;
341   }//isGenerateMode
342 
343   public double getThreshold() {
344     return threshold;
345   }
346 
347   public void setThreshold(double newValue) {
348     threshold = newValue;
349   }
350 
351   public File getStartDirectory() {
352     return startDir;
353   }//getStartDirectory
354 
355   public void setStartDirectory(File dir) {
356     startDir = dir;
357   }//setStartDirectory
358 
359   protected void generateCorpus(File fileDir, File outputDir) {
360     //1. check if we have input files
361     if (fileDir == null)
362       return;
363     //2. create the output directory or clean it up if needed
364     File outDir = outputDir;
365     if (outputDir == null) {
366       outDir = new File(currDir, PROCESSED_DIR_NAME);
367     } else {
368       // get rid of the directory, coz datastore wants it clean
369       if (!Files.rmdir(outDir))
370         Out.prln("cannot delete old output directory: " + outDir);
371     }
372     outDir.mkdir();
373 
374     //create the datastore and process each document
375     try {
376       SerialDataStore sds = new SerialDataStore(outDir.toURL().toString());
377       sds.create();
378       sds.open();
379 
380       File[] files = fileDir.listFiles();
381       for (int i=0; i < files.length; i++) {
382         if (!files[i].isFile())
383           continue;
384         // create a document
385         Out.prln("Processing and storing document: " + files[i].toURL() +"<P>");
386 
387         FeatureMap params = Factory.newFeatureMap();
388         params.put("sourceUrl", files[i].toURL());
389         params.put("encoding", "");
390 
391         // create the document
392         Document doc = (Document) Factory.createResource(
393           "gate.corpora.DocumentImpl", params
394         );
395 
396         doc.setName(files[i].getName());
397         if (doc == null)
398           continue;
399         processDocument(doc);
400         LanguageResource lr = sds.adopt(doc, null);
401         sds.sync(lr);
402       }//for
403       sds.close();
404     } catch (java.net.MalformedURLException ex) {
405       throw new GateRuntimeException("CorpusBenchmark: " + ex.getMessage());
406     } catch (PersistenceException ex1) {
407       throw new GateRuntimeException("CorpusBenchmark: " + ex1.getMessage());
408     } catch (ResourceInstantiationException ex2) {
409       throw new GateRuntimeException("CorpusBenchmark: " + ex2.getMessage());
410     } catch (gate.security.SecurityException ex3) {
411       throw new GateRuntimeException("CorpusBenchmark: " + ex3.getMessage());
412     }
413 
414   }//generateCorpus
415 
416   protected void evaluateCorpus(File fileDir,
417                     File processedDir, File markedDir) {
418     //1. check if we have input files and the processed Dir
419     if (fileDir == null || !fileDir.exists())
420       return;
421     if (processedDir == null || !processedDir.exists())
422       //if the user wants evaluation of marked and stored that's not possible
423       if (isMarkedStored) {
424         Out.prln("Cannot evaluate because no processed documents exist.");
425         return;
426       }
427       else
428         isMarkedClean = true;
429 
430     //looked for marked texts only if the directory exists
431     boolean processMarked = markedDir != null && markedDir.exists();
432     if (!processMarked && (isMarkedStored || isMarkedClean)) {
433         Out.prln("Cannot evaluate because no human-annotated documents exist.");
434         return;
435     }
436 
437     if (isMarkedStored) {
438       evaluateMarkedStored(markedDir, processedDir);
439       return;
440     } else if (isMarkedClean) {
441       evaluateMarkedClean(markedDir, fileDir);
442       return;
443     }
444 
445     Document persDoc = null;
446     Document cleanDoc = null;
447     Document markedDoc = null;
448 
449     //open the datastore and process each document
450     try {
451       //open the data store
452       DataStore sds = Factory.openDataStore
453                       ("gate.persist.SerialDataStore",
454                        processedDir.toURL().toExternalForm());
455 
456       List lrIDs = sds.getLrIds("gate.corpora.DocumentImpl");
457       for (int i=0; i < lrIDs.size(); i++) {
458         String docID = (String) lrIDs.get(i);
459 
460         //read the stored document
461         FeatureMap features = Factory.newFeatureMap();
462         features.put(DataStore.DATASTORE_FEATURE_NAME, sds);
463         features.put(DataStore.LR_ID_FEATURE_NAME, docID);
464         persDoc = (Document) Factory.createResource(
465                                     "gate.corpora.DocumentImpl",
466                                     features);
467 
468         Out.prln("<H2>" + persDoc.getName() + "</H2>");
469 
470         File cleanDocFile = new File(fileDir, persDoc.getName());
471         //try reading the original document from clean
472         if (! cleanDocFile.exists()) {
473           Out.prln("Warning: Cannot find original document " +
474                    persDoc.getName() + " in " + fileDir);
475         } else {
476           FeatureMap params = Factory.newFeatureMap();
477           params.put("sourceUrl", cleanDocFile.toURL());
478           params.put("encoding", "");
479 
480           // create the document
481           cleanDoc = (Document) Factory.createResource(
482                                   "gate.corpora.DocumentImpl", params);
483           cleanDoc.setName(persDoc.getName());
484         }
485 
486         //try finding the marked document
487         StringBuffer docName = new StringBuffer(persDoc.getName());
488         docName.replace(
489           persDoc.getName().lastIndexOf("."),
490           docName.length(),
491           ".xml");
492         File markedDocFile = new File(markedDir, docName.toString());
493         if (! processMarked || ! markedDocFile.exists()) {
494           Out.prln("Warning: Cannot find human-annotated document " +
495                    markedDocFile + " in " + markedDir);
496         } else {
497           FeatureMap params = Factory.newFeatureMap();
498           params.put("sourceUrl", markedDocFile.toURL());
499           params.put("encoding", "");
500 
501           // create the document
502           markedDoc = (Document) Factory.createResource(
503                                    "gate.corpora.DocumentImpl", params);
504           markedDoc.setName(persDoc.getName());
505         }
506 
507         evaluateDocuments(persDoc, cleanDoc, markedDoc);
508         if (persDoc != null)
509           Factory.deleteResource(persDoc);
510         if (cleanDoc != null)
511           Factory.deleteResource(cleanDoc);
512         if (markedDoc != null)
513           Factory.deleteResource(markedDoc);
514 
515       }//for loop through saved docs
516       sds.close();
517     } catch (java.net.MalformedURLException ex) {
518       throw new GateRuntimeException("CorpusBenchmark: " + ex.getMessage());
519     } catch (PersistenceException ex1) {
520       throw new GateRuntimeException("CorpusBenchmark: " + ex1.getMessage());
521     } catch (ResourceInstantiationException ex2) {
522       throw new GateRuntimeException("CorpusBenchmark: " + ex2.getMessage());
523     }
524 
525   }//evaluateCorpus
526 
527   protected void evaluateMarkedStored(File markedDir, File storedDir) {
528     Document persDoc = null;
529     Document cleanDoc = null;
530     Document markedDoc = null;
531 
532     //open the datastore and process each document
533     try {
534       //open the data store
535       DataStore sds = Factory.openDataStore
536                       ("gate.persist.SerialDataStore",
537                        storedDir.toURL().toExternalForm());
538 
539       List lrIDs = sds.getLrIds("gate.corpora.DocumentImpl");
540       for (int i=0; i < lrIDs.size(); i++) {
541         String docID = (String) lrIDs.get(i);
542 
543         //read the stored document
544         FeatureMap features = Factory.newFeatureMap();
545         features.put(DataStore.DATASTORE_FEATURE_NAME, sds);
546         features.put(DataStore.LR_ID_FEATURE_NAME, docID);
547         persDoc = (Document) Factory.createResource(
548                                     "gate.corpora.DocumentImpl",
549                                     features);
550 
551         Out.prln("<H2>" + persDoc.getName() + "</H2>");
552 
553         //try finding the marked document
554         StringBuffer docName = new StringBuffer(persDoc.getName());
555         docName.replace(
556           persDoc.getName().lastIndexOf("."),
557           docName.length(),
558           ".xml");
559         File markedDocFile = new File(markedDir, docName.toString());
560         if (! markedDocFile.exists()) {
561           Out.prln("Warning: Cannot find human-annotated document " +
562                    markedDocFile + " in " + markedDir);
563         } else {
564           FeatureMap params = Factory.newFeatureMap();
565           params.put("sourceUrl", markedDocFile.toURL());
566           params.put("encoding", "");
567 
568           // create the document
569           markedDoc = (Document) Factory.createResource(
570                                    "gate.corpora.DocumentImpl", params);
571           markedDoc.setName(persDoc.getName());
572         }
573 
574         evaluateDocuments(persDoc, cleanDoc, markedDoc);
575         if (persDoc != null)
576           Factory.deleteResource(persDoc);
577         if (markedDoc != null)
578           Factory.deleteResource(markedDoc);
579 
580       }//for loop through saved docs
581       sds.close();
582 
583     } catch (java.net.MalformedURLException ex) {
584       throw new GateRuntimeException("CorpusBenchmark: " + ex.getMessage());
585     } catch (PersistenceException ex1) {
586       throw new GateRuntimeException("CorpusBenchmark: " + ex1.getMessage());
587     } catch (ResourceInstantiationException ex2) {
588       throw new GateRuntimeException("CorpusBenchmark: " + ex2.getMessage());
589     }
590 
591   }//evaluateMarkedStored
592 
593 
594   protected void evaluateMarkedClean(File markedDir, File cleanDir) {
595     Document persDoc = null;
596     Document cleanDoc = null;
597     Document markedDoc = null;
598 
599     File[] cleanDocs = cleanDir.listFiles();
600     for (int i = 0; i< cleanDocs.length; i++) {
601       if (!cleanDocs[i].isFile())
602         continue;
603 
604       //try reading the original document from clean
605       FeatureMap params = Factory.newFeatureMap();
606       try {
607         params.put("sourceUrl", cleanDocs[i].toURL());
608       } catch (java.net.MalformedURLException ex) {
609         Out.prln("Cannot create document from file: " +
610           cleanDocs[i].getAbsolutePath());
611         continue;
612       }
613       params.put("encoding", "");
614 
615       // create the document
616       try {
617         cleanDoc = (Document) Factory.createResource(
618                               "gate.corpora.DocumentImpl", params,
619                               null, cleanDocs[i].getName());
620       } catch (gate.creole.ResourceInstantiationException ex) {
621         Out.prln("Cannot create document from file: " +
622           cleanDocs[i].getAbsolutePath());
623         continue;
624       }
625 
626       Out.prln("<TD>" + cleanDocs[i].getName() + "</TD>");
627 
628       //try finding the marked document
629       StringBuffer docName = new StringBuffer(cleanDoc.getName());
630       docName.replace(
631         cleanDoc.getName().lastIndexOf("."),
632         docName.length(),
633         ".xml");
634       File markedDocFile = new File(markedDir, docName.toString());
635       if (! markedDocFile.exists()) {
636         Out.prln("Warning: Cannot find human-annotated document " +
637                  markedDocFile + " in " + markedDir);
638         continue;
639       } else {
640         params = Factory.newFeatureMap();
641         try {
642           params.put("sourceUrl", markedDocFile.toURL());
643         } catch (java.net.MalformedURLException ex) {
644           Out.prln("Cannot create document from file: " +
645             markedDocFile.getAbsolutePath());
646           continue;
647         }
648         params.put("encoding", "");
649 
650         // create the document
651         try {
652           markedDoc = (Document) Factory.createResource(
653                                  "gate.corpora.DocumentImpl", params,
654                                  null, cleanDoc.getName());
655         } catch (gate.creole.ResourceInstantiationException ex) {
656           Out.prln("Cannot create document from file: " +
657             markedDocFile.getAbsolutePath());
658           continue;
659         }
660 
661       }//if markedDoc exists
662 
663       try {
664         evaluateDocuments(persDoc, cleanDoc, markedDoc);
665       } catch (gate.creole.ResourceInstantiationException ex) {
666         Out.prln("Evaluate failed on document: " + cleanDoc.getName());
667       }
668       if (persDoc != null)
669         Factory.deleteResource(persDoc);
670       if (cleanDoc != null)
671         Factory.deleteResource(cleanDoc);
672       if (markedDoc != null)
673         Factory.deleteResource(markedDoc);
674 
675     }//for loop through clean docs
676 
677 
678   }//evaluateMarkedClean
679 
680   protected void processDocument(Document doc) {
681     try {
682       tokeniser.setDocument(doc);
683       tokeniser.execute();
684 
685       gazetteer.setDocument(doc);
686       gazetteer.execute();
687 
688       String textTagName = configs.getProperty("astTEXTTagName");
689       if (textTagName != null && !textTagName.equals(""))
690         setTransfer.setTextTagName(textTagName);
691       setTransfer.setDocument(doc);
692       setTransfer.execute();
693 
694       splitter.setDocument(doc);
695       splitter.execute();
696 
697       tagger.setDocument(doc);
698       tagger.execute();
699 
700       transducer.setDocument(doc);
701       transducer.execute();
702 
703       orthomatcher.setDocument(doc);
704       orthomatcher.execute();
705     } catch (gate.creole.ExecutionException ex) {
706       throw new GateRuntimeException("Corpus generation error: " +
707                                      ex.getMessage());
708     }
709   }
710 
711   protected void evaluateDocuments(Document persDoc,
712                     Document cleanDoc, Document markedDoc)
713                         throws ResourceInstantiationException {
714     if (cleanDoc == null && markedDoc == null)
715       return;
716 
717     //we've got no types to compare
718     if (annotTypes == null || annotTypes.isEmpty())
719       return;
720 
721     if (cleanDoc != null && !isMarkedStored) {
722 
723       processDocument(cleanDoc);
724 
725       if(!isMarkedClean)
726         evaluateAllThree(persDoc, cleanDoc, markedDoc);
727       else
728         evaluateTwoDocs(markedDoc, cleanDoc);
729 
730     } else
731       evaluateTwoDocs(markedDoc, persDoc);
732 
733   }
734 
735   protected void evaluateAllThree(Document persDoc,
736                                   Document cleanDoc, Document markedDoc)
737                                   throws ResourceInstantiationException {
738     //first start the table and its header
739     printTableHeader();
740     for (int jj= 0; jj< annotTypes.size(); jj++) {
741       String annotType = (String) annotTypes.get(jj);
742 
743       AnnotationDiff annotDiff=measureDocs(markedDoc, cleanDoc, annotType);
744       //we don't have this annotation type in this document
745       if (annotDiff == null)
746         continue;
747       Out.prln("<TR>");
748 
749       //increase the number of processed documents
750       docNumber++;
751       //add precison and recall to the sums
752       updateStatistics(annotDiff, annotType);
753 
754       Out.prln("<TD> Annotation type: " + annotType + "</TD>");
755 
756       AnnotationDiff annotDiff1 =
757         measureDocs(markedDoc, persDoc, annotType);
758 
759       Out.prln("<TD>" + annotDiff.getPrecisionAverage());
760       //check the precision first
761       if (annotDiff1 != null &&
762           annotDiff!= null &&
763           annotDiff1.getPrecisionAverage()<annotDiff.getPrecisionAverage()
764          )
765         Out.prln("<P> Precision increase on human-marked from " +
766                  annotDiff1.getPrecisionAverage() + " to " +
767                  annotDiff.getPrecisionAverage() + "</P>");
768       else if (annotDiff1 != null
769                && annotDiff != null
770                && annotDiff1.getPrecisionAverage()
771                    > annotDiff.getPrecisionAverage())
772         Out.prln("<P> Precision decrease on human-marked from " +
773                  annotDiff1.getPrecisionAverage() + " to " +
774                  annotDiff.getPrecisionAverage() + "</P>");
775       Out.prln("</TD>");
776 
777       Out.prln("<TD>" + annotDiff.getRecallAverage());
778       //check the recall now
779       if (annotDiff1 != null &&
780           annotDiff!= null &&
781           annotDiff1.getRecallAverage()<annotDiff.getRecallAverage()
782          )
783         Out.prln("<P> Recall increase on human-marked from " +
784                  annotDiff1.getRecallAverage() + " to " +
785                  annotDiff.getRecallAverage() + "</P>");
786       else if (annotDiff1 != null
787                && annotDiff != null
788                && annotDiff1.getRecallAverage()
789                    > annotDiff.getRecallAverage())
790         Out.prln("<P> Recall decrease on human-marked from " +
791                  annotDiff1.getRecallAverage() + " to " +
792                  annotDiff.getRecallAverage() + "</P>");
793 
794       Out.prln("</TD>");
795 
796       //check the recall now
797       if ( isVerboseMode
798            &&
799            ((annotDiff.getRecallAverage() < threshold
800              ||
801              annotDiff.getRecallAverage() < threshold)
802            )
803          )
804         printAnnotations(annotDiff, markedDoc, cleanDoc);
805 
806 
807       Out.prln("</TR>");
808     }//for loop through annotation types
809     Out.prln("</TABLE>");
810 
811   }//evaluateAllThree
812 
813   protected void evaluateTwoDocs(Document keyDoc, Document respDoc)
814         throws ResourceInstantiationException {
815 
816     //first start the table and its header
817     printTableHeader();
818     for (int jj= 0; jj< annotTypes.size(); jj++) {
819       String annotType = (String) annotTypes.get(jj);
820 
821       AnnotationDiff annotDiff=measureDocs(keyDoc, respDoc, annotType);
822       //we don't have this annotation type in this document
823       if (annotDiff == null)
824         continue;
825       Out.prln("<TR>");
826 
827       //increase the number of processed documents
828       docNumber++;
829       //add precison and recall to the sums
830       updateStatistics(annotDiff, annotType);
831 
832       Out.prln("<TD>" + annotType + "</TD>");
833 
834       Out.prln("<TD>" + annotDiff.getPrecisionAverage() + "</TD>");
835       Out.prln("<TD>" + annotDiff.getRecallAverage() + "</TD>");
836       //check the recall now
837       if ( isVerboseMode
838            &&
839            ((annotDiff.getRecallAverage() < threshold
840              ||
841              annotDiff.getRecallAverage() < threshold)
842            )
843          )
844         printAnnotations(annotDiff, keyDoc, respDoc);
845 
846       Out.prln("</TR>");
847     }//for loop through annotation types
848     Out.prln("</TABLE>");
849 
850   }//evaluateTwoDocs
851 
852   protected void printTableHeader() {
853     Out.prln("<TABLE BORDER=1");
854     if (isVerboseMode)
855       Out.prln("<TR> <TD><B>Annotation Type</B></TD> <TD><B>Precision</B></TD> "
856               + "<TD><B>Recall</B></TD> <TD><B>Annotations<B></TD>");
857     else
858       Out.prln("<TR> <TD><B>Annotation Type</B></TD> <TD><B>Precision</B></TD> "
859               + "<TD><B>Recall</B></TD>");
860   }
861 
862   protected void updateStatistics(AnnotationDiff annotDiff, String annotType){
863       precisionSum += annotDiff.getPrecisionAverage();
864       recallSum += annotDiff.getRecallAverage();
865       Double oldPrecision = (Double) precisionByType.get(annotType);
866       if (oldPrecision == null)
867         precisionByType.put(annotType,
868                             new Double(annotDiff.getPrecisionAverage()));
869       else
870         precisionByType.put(annotType,
871                             new Double(oldPrecision.doubleValue() +
872                                        annotDiff.getPrecisionAverage()));
873       Integer precCount = (Integer) prCountByType.get(annotType);
874       if (precCount == null)
875         prCountByType.put(annotType, new Integer(1));
876       else
877         prCountByType.put(annotType, new Integer(precCount.intValue() + 1));
878 
879 
880       Double oldRecall = (Double) recallByType.get(annotType);
881       if (oldRecall == null)
882         recallByType.put(annotType,
883                          new Double(annotDiff.getRecallAverage()));
884       else
885         recallByType.put(annotType,
886                          new Double(oldRecall.doubleValue() +
887                                     annotDiff.getRecallAverage()));
888       Integer recCount = (Integer) recCountByType.get(annotType);
889       if (recCount == null)
890         recCountByType.put(annotType, new Integer(1));
891       else
892         recCountByType.put(annotType, new Integer(recCount.intValue() + 1));
893 
894   }
895 
896   protected void printStatistics() {
897 
898     Out.prln("<H2> Statistics </H2>");
899     Out.prln("<H3> Precision </H3>");
900     if (precisionByType != null && !precisionByType.isEmpty()) {
901       Iterator iter = precisionByType.keySet().iterator();
902       while (iter.hasNext()) {
903         String annotType = (String) iter.next();
904         Out.prln(annotType + ": "
905           + ((Double)precisionByType.get(annotType)).doubleValue()
906               /
907               ((Integer)prCountByType.get(annotType)).intValue()
908           + "<P>");
909       }//while
910     }
911     Out.prln("Overall precision: " + getPrecisionAverage() + "<P>");
912 
913     Out.prln("<H3> Recall </H3>");
914     if (recallByType != null && !recallByType.isEmpty()) {
915       Iterator iter = recallByType.keySet().iterator();
916       while (iter.hasNext()) {
917         String annotType = (String) iter.next();
918         Out.prln(annotType + ": "
919           + ((Double)recallByType.get(annotType)).doubleValue()
920               /
921               ((Integer)recCountByType.get(annotType)).intValue()
922           + "<P>");
923       }//while
924     }
925 
926     Out.prln("Overall recall: " + getRecallAverage()
927              + "<P>");
928   }
929 
930   protected AnnotationDiff measureDocs(
931     Document keyDoc, Document respDoc, String annotType)
932       throws ResourceInstantiationException {
933 
934     if (keyDoc == null || respDoc == null)
935       return null;
936 
937     if (annotSetName != null
938         && keyDoc.getAnnotations(annotSetName).get(annotType) == null)
939       return null;
940     else if ((annotSetName == null || annotSetName.equals(""))
941         && keyDoc.getAnnotations().get(annotType) == null)
942       return null;
943 
944     // create the annotation schema needed for AnnotationDiff
945     AnnotationSchema annotationSchema = new AnnotationSchema();
946 
947     // organization type
948     annotationSchema.setAnnotationName(annotType);
949     // create an annotation diff
950     FeatureMap parameters = Factory.newFeatureMap();
951     parameters.put("keyDocument",keyDoc);
952     parameters.put("responseDocument",respDoc);
953     parameters.put("annotationSchema",annotationSchema);
954     parameters.put("keyAnnotationSetName",annotSetName);
955     parameters.put("responseAnnotationSetName",annotSetName);
956     //for a start, do not compare the features of the annotations
957     parameters.put("keyFeatureNamesSet", new HashSet());
958     parameters.put("textMode", new Boolean(true));
959 
960     // Create Annotation Diff visual resource
961     AnnotationDiff annotDiff = (AnnotationDiff)
962           Factory.createResource("gate.annotation.AnnotationDiff",parameters);
963 
964     return annotDiff;
965   }
966 
967   protected void printAnnotations(AnnotationDiff annotDiff,
968                     Document keyDoc, Document respDoc) {
969     Out.prln("<TD>");
970     Out.pr("MISSING ANNOTATIONS in the automatic texts: ");
971     Set missingSet =
972       annotDiff.getAnnotationsOfType(AnnotationDiff.MISSING_TYPE);
973     printAnnotations(missingSet, keyDoc);
974     Out.prln("<BR>");
975 
976     Out.pr("SPURIOUS ANNOTATIONS in the automatic texts: ");
977     Set spuriousSet =
978       annotDiff.getAnnotationsOfType(AnnotationDiff.SPURIOUS_TYPE);
979     printAnnotations(spuriousSet, respDoc);
980     Out.prln("</BR>");
981 
982     Out.pr("PARTIALLY CORRECT ANNOTATIONS in the automatic texts: ");
983     Set partialSet =
984       annotDiff.getAnnotationsOfType(AnnotationDiff.PARTIALLY_CORRECT_TYPE);
985     printAnnotations(partialSet, respDoc);
986     Out.prln("</TD>");
987 
988   }
989 
990   protected void printAnnotations(Set set, Document doc) {
991     if (set == null || set.isEmpty())
992       return;
993 
994     Iterator iter = set.iterator();
995     while (iter.hasNext()) {
996       Annotation ann = (Annotation) iter.next();
997       Out.prln(
998         "<B>" +
999         doc.getContent().toString().substring(
1000          ann.getStartNode().getOffset().intValue(),
1001          ann.getEndNode().getOffset().intValue()) +
1002        "</B>: <I>[" + ann.getStartNode().getOffset() +
1003        "," + ann.getEndNode().getOffset() + "]</I>"
1004//        + "; features" + ann.getFeatures()
1005        );
1006    }//while
1007  }
1008
1009  /**
1010   * The directory from which we should generate/evaluate the corpus
1011   */
1012  private File startDir;
1013  private File currDir;
1014  private static List annotTypes;
1015
1016  private DefaultTokeniser tokeniser;
1017  private DefaultGazetteer gazetteer;
1018  private SentenceSplitter splitter;
1019  private POSTagger tagger;
1020  private ANNIETransducer transducer;
1021  private OrthoMatcher orthomatcher;
1022  private AnnotationSetTransfer setTransfer;
1023
1024  //collect the sum of all precisions and recalls of all docs
1025  //and the number of docs, so I can calculate the average for
1026  //the corpus at the end
1027  private double precisionSum = 0;
1028  private double recallSum = 0;
1029  private HashMap precisionByType = new HashMap();
1030  private HashMap prCountByType = new HashMap();
1031  private HashMap recallByType = new HashMap();
1032  private HashMap recCountByType = new HashMap();
1033  private int docNumber = 0;
1034
1035  /**
1036   * If true, the corpus tool will generate the corpus, otherwise it'll
1037   * run in evaluate mode
1038   */
1039  private boolean isGenerateMode = false;
1040  private boolean isVerboseMode = false;
1041
1042  /**
1043   * If true, the corpus tool will evaluate stored against the human-marked
1044   * documents
1045   */
1046  private boolean isMarkedStored = false;
1047  private boolean isMarkedClean = false;
1048
1049  private String annotSetName = "Key";
1050
1051  private double threshold = 0.5;
1052  private Properties configs = new Properties();
1053
1054  /** String to print when wrong command-line args */
1055  private static String usage =
1056    "usage: CorpusBenchmarkTool [-generate|-marked_stored|-marked_clean] [-verbose] directory-name";
1057
1058}