1   /*
2    *  CorpusBenchmarkTool.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Kalina Bontcheva, 24/Oct/2001
12   *
13   *  $Id: CorpusBenchmarkTool.java,v 1.24 2002/03/06 17:15:48 kalina Exp $
14   */
15  
16  package gate.util;
17  
18  import java.util.*;
19  import java.io.*;
20  
21  import gate.*;
22  import gate.creole.*;
23  import gate.util.*;
24  import gate.persist.*;
25  import gate.creole.tokeniser.*;
26  import gate.creole.gazetteer.*;
27  import gate.creole.splitter.*;
28  import gate.creole.orthomatcher.*;
29  import gate.creole.annotransfer.*;
30  import gate.annotation.*;
31  
32  public class CorpusBenchmarkTool {
  /** Sub-directory name that execute(File) picks up as the human-annotated documents. */
  private static final String MARKED_DIR_NAME = "marked";
  /** Sub-directory name that execute(File) picks up as the original (clean) documents. */
  private static final String CLEAN_DIR_NAME = "clean";
  /** Directory name that execute(File) skips while traversing. */
  private static final String CVS_DIR_NAME = "Cvs";
  /** Sub-directory name that execute(File) picks up as the stored, processed documents. */
  private static final String PROCESSED_DIR_NAME = "processed";

  /** Debug flag. */
  private static final boolean DEBUG = true;
39  
40    public CorpusBenchmarkTool() {}
41  
42    public void initPRs() {
43      try {
44        FeatureMap params = Factory.newFeatureMap();
45  
46        //create a default tokeniser
47        Out.prln("Loading tokeniser <P>");
48        String rulesURL = this.configs.getProperty("tokeniserRulesURL");
49        if (rulesURL != null && !rulesURL.equals(""))
50          params.put(
51            DefaultTokeniser.DEF_TOK_TOKRULES_URL_PARAMETER_NAME, rulesURL);
52        String grammarsURL = this.configs.getProperty("tokeniserGrammarURL");
53        if (grammarsURL != null && !grammarsURL.equals(""))
54          params.put(
55            DefaultTokeniser.DEF_TOK_GRAMRULES_URL_PARAMETER_NAME, grammarsURL);
56        //the annots are put in temp, as they are going to be transfered to the
57        //new set
58        params.put(DefaultTokeniser.DEF_TOK_ANNOT_SET_PARAMETER_NAME, "temp");
59        tokeniser = (DefaultTokeniser) Factory.createResource(
60                        "gate.creole.tokeniser.DefaultTokeniser", params);
61  
62        //create a default gazetteer
63        Out.prln("Loading gazetteer <P>");
64        params.clear();
65        String listsURL = this.configs.getProperty("gazetteerListsURL");
66        if (listsURL != null && !listsURL.equals("")) {
67          params.put(DefaultGazetteer.DEF_GAZ_LISTS_URL_PARAMETER_NAME, listsURL);
68          Out.prln("Running gazetteer on lists in: " + listsURL + "<P>");
69        }
70        String caseSensitive = this.configs.getProperty("gazetteerCaseSensitive");
71        if (caseSensitive != null && !caseSensitive.equals(""))
72          params.put(DefaultGazetteer.DEF_GAZ_CASE_SENSITIVE_PARAMETER_NAME,
73            new Boolean(caseSensitive));
74        params.put(DefaultGazetteer.DEF_GAZ_ANNOT_SET_PARAMETER_NAME, "temp");
75        gazetteer = (DefaultGazetteer) Factory.createResource(
76                        "gate.creole.gazetteer.DefaultGazetteer", params);
77  
78        //create the Annotation set transfer
79        Out.prln("Loading annotation set transfer <P>");
80        params.clear();
81        params.put(AnnotationSetTransfer.AST_INPUT_AS_PARAMETER_NAME, "temp");
82        params.put(AnnotationSetTransfer.AST_OUTPUT_AS_PARAMETER_NAME, annotSetName);
83        //by default make it transfer all annotations
84        params.put(AnnotationSetTransfer.AST_TEXT_TAG_PARAMETER_NAME, "");
85        setTransfer = (AnnotationSetTransfer) Factory.createResource(
86                        "gate.creole.annotransfer.AnnotationSetTransfer", params);
87  
88        //create a splitter
89        Out.prln("Loading sentence splitter <P>");
90        params.clear();
91        listsURL = this.configs.getProperty("splitterGazetteerURL");
92        if (listsURL != null && !listsURL.equals(""))
93          params.put(SentenceSplitter.SPLIT_GAZ_URL_PARAMETER_NAME, listsURL);
94        grammarsURL = this.configs.getProperty("splitterGrammarURL");
95        if (grammarsURL != null && !grammarsURL.equals(""))
96          params.put(SentenceSplitter.SPLIT_TRANSD_URL_PARAMETER_NAME, grammarsURL);
97        params.put(SentenceSplitter.SPLIT_INPUT_AS_PARAMETER_NAME, annotSetName);
98        params.put(SentenceSplitter.SPLIT_OUTPUT_AS_PARAMETER_NAME, annotSetName);
99        splitter = (SentenceSplitter) Factory.createResource(
100                       "gate.creole.splitter.SentenceSplitter", params);
101 
102       //create a tagger
103       Out.prln("Loading POS tagger <P>");
104       params.clear();
105       String lexiconURL = this.configs.getProperty("taggerLexiconURL");
106       if (lexiconURL != null && !lexiconURL.equals(""))
107         params.put(POSTagger.TAG_LEXICON_URL_PARAMETER_NAME, lexiconURL);
108       rulesURL = this.configs.getProperty("taggerRulesURL");
109       if (rulesURL != null && !rulesURL.equals(""))
110         params.put(POSTagger.TAG_RULES_URL_PARAMETER_NAME, rulesURL);
111       params.put(POSTagger.TAG_INPUT_AS_PARAMETER_NAME, annotSetName);
112       params.put(POSTagger.TAG_OUTPUT_AS_PARAMETER_NAME, annotSetName);
113       tagger = (POSTagger) Factory.createResource(
114                       "gate.creole.POSTagger", params);
115 
116       //create a grammar
117       Out.prln("Loading grammars for transducer <P>");
118       params.clear();
119       String grammarURL = this.configs.getProperty("grammarURL");
120       if (grammarURL != null && !grammarURL.equals("")) {
121         params.put(ANNIETransducer.TRANSD_GRAMMAR_URL_PARAMETER_NAME, grammarURL);
122         Out.prln("Running transducer on grammars in: " + grammarURL + "<P>");
123       }
124       params.put(ANNIETransducer.TRANSD_INPUT_AS_PARAMETER_NAME, annotSetName);
125       params.put(ANNIETransducer.TRANSD_OUTPUT_AS_PARAMETER_NAME, annotSetName);
126       transducer = (ANNIETransducer) Factory.createResource(
127                       "gate.creole.ANNIETransducer", params);
128 
129       //create an orthomatcher
130       Out.prln("Loading orthomatcher <P>");
131       params.clear();
132       params.put(OrthoMatcher.OM_ANN_SET_PARAMETER_NAME, annotSetName);
133       orthomatcher = (OrthoMatcher) Factory.createResource(
134                       "gate.creole.orthomatcher.OrthoMatcher", params);
135     } catch (ResourceInstantiationException ex) {
136       throw new GateRuntimeException("Corpus Benchmark Tool:"+ex.getMessage());
137     }
138   }//initPRs
139 
140   public void unloadPRs() {
141     //we have nothing to unload if no PRs are loaded
142     if (isMarkedStored)
143       return;
144 
145     Factory.deleteResource(this.tokeniser);
146     Factory.deleteResource(this.gazetteer);
147     Factory.deleteResource(this.setTransfer);
148     Factory.deleteResource(this.splitter);
149     Factory.deleteResource(this.tagger);
150     Factory.deleteResource(this.transducer);
151     Factory.deleteResource(this.orthomatcher);
152   }
153 
154   public void execute() {
155     execute(startDir);
156   }
157 
158   public void init() {
159     //first read the corpus_tool.properties file
160     File propFile = new File("corpus_tool.properties");
161     Out.prln(propFile.getAbsolutePath());
162     if (propFile.exists()) {
163       try {
164         InputStream inputStream = new FileInputStream(propFile);
165         this.configs.load(inputStream);
166         String thresholdString = this.configs.getProperty("threshold");
167         if (thresholdString != null && !thresholdString.equals("")) {
168           this.threshold = (new Double(thresholdString)).doubleValue();
169           Out.prln("New threshold is: " + this.threshold + "<P>\n");
170         }
171         String setName = this.configs.getProperty("annotSetName");
172         if (setName != null && !setName.equals(""))
173           this.annotSetName = setName;
174       } catch (IOException ex) {
175         //just ignore the file and go on with the defaults
176         this.configs = new Properties();
177       }
178     } else
179       this.configs = new Properties();
180 
181 
182     //we only initialise the PRs if they are going to be used
183     //for processing unprocessed documents
184     if (!this.isMarkedStored)
185       initPRs();
186 
187     annotTypes = new ArrayList();
188     annotTypes.add("Organization");
189     annotTypes.add("Person");
190     annotTypes.add("Date");
191     annotTypes.add("Location");
192     annotTypes.add("Address");
193     annotTypes.add("Money");
194     annotTypes.add("Percent");
195     annotTypes.add("GPE");
196     annotTypes.add("Facility");
197 
198   }
199 
200   public void execute(File dir) {
201     if (dir == null)
202       return;
203     //first set the current directory to be the given one
204     currDir = dir;
205     Out.prln("Processing directory: " + currDir + "<P>");
206 
207     File processedDir = null;
208     File cleanDir = null;
209     File markedDir = null;
210 
211     ArrayList subDirs = new ArrayList();
212     File[] dirArray = currDir.listFiles();
213     for (int i = 0; i < dirArray.length; i++) {
214       if (dirArray[i].isFile() || dirArray[i].getName().equals(CVS_DIR_NAME))
215         continue;
216       if (dirArray[i].getName().equals(CLEAN_DIR_NAME))
217         cleanDir = dirArray[i];
218       else if (dirArray[i].getName().equals(MARKED_DIR_NAME))
219         markedDir = dirArray[i];
220       else if (dirArray[i].getName().equals(PROCESSED_DIR_NAME))
221         processedDir = dirArray[i];
222       else
223         subDirs.add(dirArray[i]);
224     }
225 
226     if (this.isGenerateMode)
227       generateCorpus(cleanDir, processedDir);
228     else
229       evaluateCorpus(cleanDir, processedDir, markedDir);
230 
231     //if no more subdirs left, return
232     if (subDirs.isEmpty())
233       return;
234 
235     //there are more subdirectories to traverse, so iterate through
236     for (int j = 0; j < subDirs.size(); j++)
237       execute((File) subDirs.get(j));
238 
239   }//execute(dir)
240 
241 
242   public static void main(String[] args) throws GateException {
243     Out.prln("<HTML>");
244     Out.prln("<HEAD>");
245     Out.prln("<TITLE> Corpus benchmark tool: ran with args " +
246             args.toString() + " on " +
247             new Date() + "</TITLE> </HEAD>");
248     Out.prln("<BODY>");
249     Out.prln("Please wait while GATE tools are initialised. <P>");
250     // initialise GATE
251     Gate.init();
252 
253     CorpusBenchmarkTool corpusTool = new CorpusBenchmarkTool();
254 
255     List inputFiles = null;
256     if(args.length < 1) throw new GateException(usage);
257     int i = 0;
258     while (i < args.length && args[i].startsWith("-")) {
259       if(args[i].equals("-generate")) {
260         Out.prln("Generating the corpus... <P>");
261         corpusTool.setGenerateMode(true);
262       } else if (args[i].equals("-marked_clean")) {
263         Out.prln("Evaluating current grammars against human-annotated...<P>");
264         corpusTool.setMarkedClean(true);
265       } else if (args[i].equals("-marked_stored")) {
266         Out.prln("Evaluating stored documents against human-annotated...<P>");
267         corpusTool.setMarkedStored(true);
268       } else if (args[i].equals("-verbose")) {
269         Out.prln("Running in verbose mode. Will generate annotation " +
270           "information when precision/recall are lower than " +
271           corpusTool.getThreshold() +"<P>");
272         corpusTool.setVerboseMode(true);
273       }
274       i++; //just ignore the option, which we do not recognise
275     }//while
276 
277     String dirName = args[i];
278     File dir = new File(dirName);
279     if (!dir.isDirectory())
280       throw new GateException(usage);
281 
282     corpusTool.init();
283 
284     Out.prln("Measuring annotaitions of types: " + corpusTool.annotTypes + "<P>");
285 
286     corpusTool.setStartDirectory(dir);
287     corpusTool.execute();
288 
289     //if we're not generating the corpus, then print the precision and recall
290     //statistics for the processed corpus
291     if (! corpusTool.getGenerateMode())
292       corpusTool.printStatistics();
293 
294     Out.prln("Finished! <P>");
295     Out.prln("</BODY>");
296     Out.prln("</HTML>");
297 
298     System.exit(0);
299 
300   }//main
301 
302   public void setGenerateMode(boolean mode) {
303     isGenerateMode = mode;
304   }//setGenerateMode
305 
306   public boolean getGenerateMode() {
307     return isGenerateMode;
308   }//getGenerateMode
309 
310   public boolean getVerboseMode() {
311     return isVerboseMode;
312   }//getVerboseMode
313 
314   public void setVerboseMode(boolean mode) {
315     isVerboseMode = mode;
316   }//setVerboseMode
317 
318   public void setMarkedStored(boolean mode) {
319     isMarkedStored = mode;
320   }//
321 
322   public boolean getMarkedStored() {
323     return isMarkedStored;
324   }//
325 
326   public void setMarkedClean(boolean mode) {
327     isMarkedClean = mode;
328   }//
329 
330   public boolean getMarkedClean() {
331     return isMarkedClean;
332   }//
333 
334   /**
335    * Returns the average precision over the entire set of processed documents.
336    * <P>
337    * If the tool has been evaluating the original documents against the
338    * previously-stored automatically annotated ones, then the precision
339    * will be the average precision on those two sets. <P>
340    * If the tool was run in -marked mode, i.e., was evaluating the stored
341    * automatically processed ones against the human-annotated ones, then
342    * the precision will be the average precision on those two sets of documents.
343    */
344   public double getPrecisionAverage() {
345     return precisionSum/docNumber;
346   }
347 
348   /**
349    * Returns the average recall over the entire set of processed documents.
350    * <P>
351    * If the tool has been evaluating the original documents against the
352    * previously-stored automatically annotated ones, then the recall
353    * will be the average recall on those two sets. <P>
354    * If the tool was run in -marked mode, i.e., was evaluating the stored
355    * automatically processed ones against the human-annotated ones, then
356    * the recall will be the average recall on those two sets of documents.
357    */
358   public double getRecallAverage() {
359     return recallSum/docNumber;
360   }
361 
362   public boolean isGenerateMode() {
363     return isGenerateMode == true;
364   }//isGenerateMode
365 
366   public double getThreshold() {
367     return threshold;
368   }
369 
370   public void setThreshold(double newValue) {
371     threshold = newValue;
372   }
373 
374   public File getStartDirectory() {
375     return startDir;
376   }//getStartDirectory
377 
378   public void setStartDirectory(File dir) {
379     startDir = dir;
380   }//setStartDirectory
381 
382   protected void generateCorpus(File fileDir, File outputDir) {
383     //1. check if we have input files
384     if (fileDir == null)
385       return;
386     //2. create the output directory or clean it up if needed
387     File outDir = outputDir;
388     if (outputDir == null) {
389       outDir = new File(currDir, PROCESSED_DIR_NAME);
390     } else {
391       // get rid of the directory, coz datastore wants it clean
392       if (!Files.rmdir(outDir))
393         Out.prln("cannot delete old output directory: " + outDir);
394     }
395     outDir.mkdir();
396 
397     //create the datastore and process each document
398     try {
399       SerialDataStore sds = new SerialDataStore(outDir.toURL().toString());
400       sds.create();
401       sds.open();
402 
403       File[] files = fileDir.listFiles();
404       for (int i=0; i < files.length; i++) {
405         if (!files[i].isFile())
406           continue;
407         // create a document
408         Out.prln("Processing and storing document: " + files[i].toURL() +"<P>");
409 
410         FeatureMap params = Factory.newFeatureMap();
411         params.put(Document.DOCUMENT_URL_PARAMETER_NAME, files[i].toURL());
412         params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, "");
413 
414         // create the document
415         Document doc = (Document) Factory.createResource(
416           "gate.corpora.DocumentImpl", params
417         );
418 
419         doc.setName(files[i].getName());
420         if (doc == null)
421           continue;
422         processDocument(doc);
423         LanguageResource lr = sds.adopt(doc, null);
424         sds.sync(lr);
425         Factory.deleteResource(doc);
426         Factory.deleteResource(lr);
427       }//for
428       sds.close();
429     } catch (java.net.MalformedURLException ex) {
430       throw new GateRuntimeException("CorpusBenchmark: " + ex.getMessage());
431     } catch (PersistenceException ex1) {
432       throw new GateRuntimeException("CorpusBenchmark: " + ex1.getMessage());
433     } catch (ResourceInstantiationException ex2) {
434       throw new GateRuntimeException("CorpusBenchmark: " + ex2.getMessage());
435     } catch (gate.security.SecurityException ex3) {
436       throw new GateRuntimeException("CorpusBenchmark: " + ex3.getMessage());
437     }
438 
439   }//generateCorpus
440 
441   protected void evaluateCorpus(File fileDir,
442                     File processedDir, File markedDir) {
443     //1. check if we have input files and the processed Dir
444     if (fileDir == null || !fileDir.exists())
445       return;
446     if (processedDir == null || !processedDir.exists())
447       //if the user wants evaluation of marked and stored that's not possible
448       if (isMarkedStored) {
449         Out.prln("Cannot evaluate because no processed documents exist.");
450         return;
451       }
452       else
453         isMarkedClean = true;
454 
455     //looked for marked texts only if the directory exists
456     boolean processMarked = markedDir != null && markedDir.exists();
457     if (!processMarked && (isMarkedStored || isMarkedClean)) {
458         Out.prln("Cannot evaluate because no human-annotated documents exist.");
459         return;
460     }
461 
462     if (isMarkedStored) {
463       evaluateMarkedStored(markedDir, processedDir);
464       return;
465     } else if (isMarkedClean) {
466       evaluateMarkedClean(markedDir, fileDir);
467       return;
468     }
469 
470     Document persDoc = null;
471     Document cleanDoc = null;
472     Document markedDoc = null;
473 
474     //open the datastore and process each document
475     try {
476       //open the data store
477       DataStore sds = Factory.openDataStore
478                       ("gate.persist.SerialDataStore",
479                        processedDir.toURL().toExternalForm());
480 
481       List lrIDs = sds.getLrIds("gate.corpora.DocumentImpl");
482       for (int i=0; i < lrIDs.size(); i++) {
483         String docID = (String) lrIDs.get(i);
484 
485         //read the stored document
486         FeatureMap features = Factory.newFeatureMap();
487         features.put(DataStore.DATASTORE_FEATURE_NAME, sds);
488         features.put(DataStore.LR_ID_FEATURE_NAME, docID);
489         persDoc = (Document) Factory.createResource(
490                                     "gate.corpora.DocumentImpl",
491                                     features);
492 
493         Out.prln("<H2>" + persDoc.getName() + "</H2>");
494 
495         File cleanDocFile = new File(fileDir, persDoc.getName());
496         //try reading the original document from clean
497         if (! cleanDocFile.exists()) {
498           Out.prln("Warning: Cannot find original document " +
499                    persDoc.getName() + " in " + fileDir);
500         } else {
501           FeatureMap params = Factory.newFeatureMap();
502           params.put(Document.DOCUMENT_URL_PARAMETER_NAME, cleanDocFile.toURL());
503           params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, "");
504 
505           // create the document
506           cleanDoc = (Document) Factory.createResource(
507                                   "gate.corpora.DocumentImpl", params);
508           cleanDoc.setName(persDoc.getName());
509         }
510 
511         //try finding the marked document
512         StringBuffer docName = new StringBuffer(persDoc.getName());
513         docName.replace(
514           persDoc.getName().lastIndexOf("."),
515           docName.length(),
516           ".xml");
517         File markedDocFile = new File(markedDir, docName.toString());
518         if (! processMarked || ! markedDocFile.exists()) {
519           Out.prln("Warning: Cannot find human-annotated document " +
520                    markedDocFile + " in " + markedDir);
521         } else {
522           FeatureMap params = Factory.newFeatureMap();
523           params.put(Document.DOCUMENT_URL_PARAMETER_NAME, markedDocFile.toURL());
524           params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, "");
525 
526           // create the document
527           markedDoc = (Document) Factory.createResource(
528                                    "gate.corpora.DocumentImpl", params);
529           markedDoc.setName(persDoc.getName());
530         }
531 
532         evaluateDocuments(persDoc, cleanDoc, markedDoc);
533         if (persDoc != null)
534           Factory.deleteResource(persDoc);
535         if (cleanDoc != null)
536           Factory.deleteResource(cleanDoc);
537         if (markedDoc != null)
538           Factory.deleteResource(markedDoc);
539 
540       }//for loop through saved docs
541       sds.close();
542     } catch (java.net.MalformedURLException ex) {
543       throw new GateRuntimeException("CorpusBenchmark: " + ex.getMessage());
544     } catch (PersistenceException ex1) {
545       throw new GateRuntimeException("CorpusBenchmark: " + ex1.getMessage());
546     } catch (ResourceInstantiationException ex2) {
547       throw new GateRuntimeException("CorpusBenchmark: " + ex2.getMessage());
548     }
549 
550   }//evaluateCorpus
551 
552   protected void evaluateMarkedStored(File markedDir, File storedDir) {
553     Document persDoc = null;
554     Document cleanDoc = null;
555     Document markedDoc = null;
556 
557     //open the datastore and process each document
558     try {
559       //open the data store
560       DataStore sds = Factory.openDataStore
561                       ("gate.persist.SerialDataStore",
562                        storedDir.toURL().toExternalForm());
563 
564       List lrIDs = sds.getLrIds("gate.corpora.DocumentImpl");
565       for (int i=0; i < lrIDs.size(); i++) {
566         String docID = (String) lrIDs.get(i);
567 
568         //read the stored document
569         FeatureMap features = Factory.newFeatureMap();
570         features.put(DataStore.DATASTORE_FEATURE_NAME, sds);
571         features.put(DataStore.LR_ID_FEATURE_NAME, docID);
572         persDoc = (Document) Factory.createResource(
573                                     "gate.corpora.DocumentImpl",
574                                     features);
575 
576         Out.prln("<H2>" + persDoc.getName() + "</H2>");
577 
578         //try finding the marked document
579         StringBuffer docName = new StringBuffer(persDoc.getName());
580         docName.replace(
581           persDoc.getName().lastIndexOf("."),
582           docName.length(),
583           ".xml");
584         File markedDocFile = new File(markedDir, docName.toString());
585         if (! markedDocFile.exists()) {
586           Out.prln("Warning: Cannot find human-annotated document " +
587                    markedDocFile + " in " + markedDir);
588         } else {
589           FeatureMap params = Factory.newFeatureMap();
590           params.put(Document.DOCUMENT_URL_PARAMETER_NAME, markedDocFile.toURL());
591           params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, "");
592 
593           // create the document
594           markedDoc = (Document) Factory.createResource(
595                                    "gate.corpora.DocumentImpl", params);
596           markedDoc.setName(persDoc.getName());
597         }
598 
599         evaluateDocuments(persDoc, cleanDoc, markedDoc);
600         if (persDoc != null)
601           Factory.deleteResource(persDoc);
602         if (markedDoc != null)
603           Factory.deleteResource(markedDoc);
604 
605       }//for loop through saved docs
606       sds.close();
607 
608     } catch (java.net.MalformedURLException ex) {
609       throw new GateRuntimeException("CorpusBenchmark: " + ex.getMessage());
610     } catch (PersistenceException ex1) {
611       throw new GateRuntimeException("CorpusBenchmark: " + ex1.getMessage());
612     } catch (ResourceInstantiationException ex2) {
613       throw new GateRuntimeException("CorpusBenchmark: " + ex2.getMessage());
614     }
615 
616   }//evaluateMarkedStored
617 
618 
619   protected void evaluateMarkedClean(File markedDir, File cleanDir) {
620     Document persDoc = null;
621     Document cleanDoc = null;
622     Document markedDoc = null;
623 
624     File[] cleanDocs = cleanDir.listFiles();
625     for (int i = 0; i< cleanDocs.length; i++) {
626       if (!cleanDocs[i].isFile())
627         continue;
628 
629       //try reading the original document from clean
630       FeatureMap params = Factory.newFeatureMap();
631       try {
632         params.put(Document.DOCUMENT_URL_PARAMETER_NAME, cleanDocs[i].toURL());
633       } catch (java.net.MalformedURLException ex) {
634         Out.prln("Cannot create document from file: " +
635           cleanDocs[i].getAbsolutePath());
636         continue;
637       }
638       params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, "");
639 
640       // create the document
641       try {
642         cleanDoc = (Document) Factory.createResource(
643                               "gate.corpora.DocumentImpl", params,
644                               null, cleanDocs[i].getName());
645       } catch (gate.creole.ResourceInstantiationException ex) {
646         Out.prln("Cannot create document from file: " +
647           cleanDocs[i].getAbsolutePath());
648         continue;
649       }
650 
651       Out.prln("<TD>" + cleanDocs[i].getName() + "</TD>");
652 
653       //try finding the marked document
654       StringBuffer docName = new StringBuffer(cleanDoc.getName());
655       docName.replace(
656         cleanDoc.getName().lastIndexOf("."),
657         docName.length(),
658         ".xml");
659       File markedDocFile = new File(markedDir, docName.toString());
660       if (! markedDocFile.exists()) {
661         Out.prln("Warning: Cannot find human-annotated document " +
662                  markedDocFile + " in " + markedDir);
663         continue;
664       } else {
665         params = Factory.newFeatureMap();
666         try {
667           params.put(Document.DOCUMENT_URL_PARAMETER_NAME, markedDocFile.toURL());
668         } catch (java.net.MalformedURLException ex) {
669           Out.prln("Cannot create document from file: " +
670             markedDocFile.getAbsolutePath());
671           continue;
672         }
673         params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, "");
674 
675         // create the document
676         try {
677           markedDoc = (Document) Factory.createResource(
678                                  "gate.corpora.DocumentImpl", params,
679                                  null, cleanDoc.getName());
680         } catch (gate.creole.ResourceInstantiationException ex) {
681           Out.prln("Cannot create document from file: " +
682             markedDocFile.getAbsolutePath());
683           continue;
684         }
685 
686       }//if markedDoc exists
687 
688       try {
689         evaluateDocuments(persDoc, cleanDoc, markedDoc);
690       } catch (gate.creole.ResourceInstantiationException ex) {
691 ex.printStackTrace();
692         Out.prln("Evaluate failed on document: " + cleanDoc.getName());
693       }
694       if (persDoc != null)
695         Factory.deleteResource(persDoc);
696       if (cleanDoc != null)
697         Factory.deleteResource(cleanDoc);
698       if (markedDoc != null)
699         Factory.deleteResource(markedDoc);
700 
701     }//for loop through clean docs
702 
703 
704   }//evaluateMarkedClean
705 
706   protected void processDocument(Document doc) {
707     try {
708       tokeniser.setDocument(doc);
709       tokeniser.execute();
710 
711       gazetteer.setDocument(doc);
712       gazetteer.execute();
713 
714       String textTagName = configs.getProperty("astTEXTTagName");
715       if (textTagName != null && !textTagName.equals(""))
716         setTransfer.setTextTagName(textTagName);
717       setTransfer.setDocument(doc);
718       setTransfer.execute();
719 
720       splitter.setDocument(doc);
721       splitter.execute();
722 
723       tagger.setDocument(doc);
724       tagger.execute();
725 
726       transducer.setDocument(doc);
727       transducer.execute();
728 
729       orthomatcher.setDocument(doc);
730       orthomatcher.execute();
731     } catch (gate.creole.ExecutionException ex) {
732       throw new GateRuntimeException("Corpus generation error: " +
733                                      ex.getMessage());
734     }
735   }
736 
737   protected void evaluateDocuments(Document persDoc,
738                     Document cleanDoc, Document markedDoc)
739                         throws ResourceInstantiationException {
740     if (cleanDoc == null && markedDoc == null)
741       return;
742 
743     //we've got no types to compare
744     if (annotTypes == null || annotTypes.isEmpty())
745       return;
746 
747     if (cleanDoc != null && !isMarkedStored) {
748 
749       processDocument(cleanDoc);
750 
751       if(!isMarkedClean)
752         evaluateAllThree(persDoc, cleanDoc, markedDoc);
753       else
754         evaluateTwoDocs(markedDoc, cleanDoc);
755 
756     } else
757       evaluateTwoDocs(markedDoc, persDoc);
758 
759   }
760 
761   protected void evaluateAllThree(Document persDoc,
762                                   Document cleanDoc, Document markedDoc)
763                                   throws ResourceInstantiationException {
764     //first start the table and its header
765     printTableHeader();
766     for (int jj= 0; jj< annotTypes.size(); jj++) {
767       String annotType = (String) annotTypes.get(jj);
768 
769       AnnotationDiff annotDiff=measureDocs(markedDoc, cleanDoc, annotType);
770       //we don't have this annotation type in this document
771       if (annotDiff == null)
772         continue;
773       Out.prln("<TR>");
774 
775       //increase the number of processed documents
776       docNumber++;
777       //add precison and recall to the sums
778       updateStatistics(annotDiff, annotType);
779 
780       Out.prln("<TD> Annotation type: " + annotType + "</TD>");
781 
782       AnnotationDiff annotDiff1 =
783         measureDocs(markedDoc, persDoc, annotType);
784 
785       Out.prln("<TD>" + annotDiff.getPrecisionAverage());
786       //check the precision first
787       if (annotDiff1 != null &&
788           annotDiff!= null &&
789           annotDiff1.getPrecisionAverage()<annotDiff.getPrecisionAverage()
790          )
791         Out.prln("<P> Precision increase on human-marked from " +
792                  annotDiff1.getPrecisionAverage() + " to " +
793                  annotDiff.getPrecisionAverage() + "</P>");
794       else if (annotDiff1 != null
795                && annotDiff != null
796                && annotDiff1.getPrecisionAverage()
797                    > annotDiff.getPrecisionAverage())
798         Out.prln("<P> Precision decrease on human-marked from " +
799                  annotDiff1.getPrecisionAverage() + " to " +
800                  annotDiff.getPrecisionAverage() + "</P>");
801       Out.prln("</TD>");
802 
803       Out.prln("<TD>" + annotDiff.getRecallAverage());
804       //check the recall now
805       if (annotDiff1 != null &&
806           annotDiff!= null &&
807           annotDiff1.getRecallAverage()<annotDiff.getRecallAverage()
808          )
809         Out.prln("<P> Recall increase on human-marked from " +
810                  annotDiff1.getRecallAverage() + " to " +
811                  annotDiff.getRecallAverage() + "</P>");
812       else if (annotDiff1 != null
813                && annotDiff != null
814                && annotDiff1.getRecallAverage()
815                    > annotDiff.getRecallAverage())
816         Out.prln("<P> Recall decrease on human-marked from " +
817                  annotDiff1.getRecallAverage() + " to " +
818                  annotDiff.getRecallAverage() + "</P>");
819 
820       Out.prln("</TD>");
821 
822       //check the recall now
823       if ( isVerboseMode
824            &&
825            ((annotDiff.getRecallAverage() < threshold
826              ||
827              annotDiff.getRecallAverage() < threshold)
828            )
829          )
830         printAnnotations(annotDiff, markedDoc, cleanDoc);
831 
832 
833       Out.prln("</TR>");
834     }//for loop through annotation types
835     Out.prln("</TABLE>");
836 
837   }//evaluateAllThree
838 
839   protected void evaluateTwoDocs(Document keyDoc, Document respDoc)
840         throws ResourceInstantiationException {
841 
842     //first start the table and its header
843     printTableHeader();
844     for (int jj= 0; jj< annotTypes.size(); jj++) {
845       String annotType = (String) annotTypes.get(jj);
846 
847       AnnotationDiff annotDiff=measureDocs(keyDoc, respDoc, annotType);
848       //we don't have this annotation type in this document
849       if (annotDiff == null)
850         continue;
851       Out.prln("<TR>");
852 
853       //increase the number of processed documents
854       docNumber++;
855       //add precison and recall to the sums
856       updateStatistics(annotDiff, annotType);
857 
858       Out.prln("<TD>" + annotType + "</TD>");
859 
860       Out.prln("<TD>" + annotDiff.getPrecisionAverage() + "</TD>");
861       Out.prln("<TD>" + annotDiff.getRecallAverage() + "</TD>");
862       //check the recall now
863       if ( isVerboseMode
864            &&
865            ((annotDiff.getRecallAverage() < threshold
866              ||
867              annotDiff.getRecallAverage() < threshold)
868            )
869          )
870         printAnnotations(annotDiff, keyDoc, respDoc);
871 
872       Out.prln("</TR>");
873     }//for loop through annotation types
874     Out.prln("</TABLE>");
875 
876   }//evaluateTwoDocs
877 
878   protected void printTableHeader() {
879     Out.prln("<TABLE BORDER=1");
880     if (isVerboseMode)
881       Out.prln("<TR> <TD><B>Annotation Type</B></TD> <TD><B>Precision</B></TD> "
882               + "<TD><B>Recall</B></TD> <TD><B>Annotations<B></TD>");
883     else
884       Out.prln("<TR> <TD><B>Annotation Type</B></TD> <TD><B>Precision</B></TD> "
885               + "<TD><B>Recall</B></TD>");
886   }
887 
888   protected void updateStatistics(AnnotationDiff annotDiff, String annotType){
889       precisionSum += annotDiff.getPrecisionAverage();
890       recallSum += annotDiff.getRecallAverage();
891       Double oldPrecision = (Double) precisionByType.get(annotType);
892       if (oldPrecision == null)
893         precisionByType.put(annotType,
894                             new Double(annotDiff.getPrecisionAverage()));
895       else
896         precisionByType.put(annotType,
897                             new Double(oldPrecision.doubleValue() +
898                                        annotDiff.getPrecisionAverage()));
899       Integer precCount = (Integer) prCountByType.get(annotType);
900       if (precCount == null)
901         prCountByType.put(annotType, new Integer(1));
902       else
903         prCountByType.put(annotType, new Integer(precCount.intValue() + 1));
904 
905 
906       Double oldRecall = (Double) recallByType.get(annotType);
907       if (oldRecall == null)
908         recallByType.put(annotType,
909                          new Double(annotDiff.getRecallAverage()));
910       else
911         recallByType.put(annotType,
912                          new Double(oldRecall.doubleValue() +
913                                     annotDiff.getRecallAverage()));
914       Integer recCount = (Integer) recCountByType.get(annotType);
915       if (recCount == null)
916         recCountByType.put(annotType, new Integer(1));
917       else
918         recCountByType.put(annotType, new Integer(recCount.intValue() + 1));
919 
920   }
921 
922   protected void printStatistics() {
923 
924     Out.prln("<H2> Statistics </H2>");
925     Out.prln("<H3> Precision </H3>");
926     if (precisionByType != null && !precisionByType.isEmpty()) {
927       Iterator iter = precisionByType.keySet().iterator();
928       while (iter.hasNext()) {
929         String annotType = (String) iter.next();
930         Out.prln(annotType + ": "
931           + ((Double)precisionByType.get(annotType)).doubleValue()
932               /
933               ((Integer)prCountByType.get(annotType)).intValue()
934           + "<P>");
935       }//while
936     }
937     Out.prln("Overall precision: " + getPrecisionAverage() + "<P>");
938 
939     Out.prln("<H3> Recall </H3>");
940     if (recallByType != null && !recallByType.isEmpty()) {
941       Iterator iter = recallByType.keySet().iterator();
942       while (iter.hasNext()) {
943         String annotType = (String) iter.next();
944         Out.prln(annotType + ": "
945           + ((Double)recallByType.get(annotType)).doubleValue()
946               /
947               ((Integer)recCountByType.get(annotType)).intValue()
948           + "<P>");
949       }//while
950     }
951 
952     Out.prln("Overall recall: " + getRecallAverage()
953              + "<P>");
954   }
955 
956   protected AnnotationDiff measureDocs(
957     Document keyDoc, Document respDoc, String annotType)
958       throws ResourceInstantiationException {
959 
960     if (keyDoc == null || respDoc == null)
961       return null;
962 
963     if (annotSetName != null
964         && keyDoc.getAnnotations(annotSetName).get(annotType) == null)
965       return null;
966     else if ((annotSetName == null || annotSetName.equals(""))
967         && keyDoc.getAnnotations().get(annotType) == null)
968       return null;
969 
970     // create the annotation schema needed for AnnotationDiff
971     AnnotationSchema annotationSchema = new AnnotationSchema();
972 
973     // organization type
974     annotationSchema.setAnnotationName(annotType);
975     // create an annotation diff
976     AnnotationDiff annotDiff = new AnnotationDiff();
977     annotDiff.setAnnotationSchema(annotationSchema);
978     annotDiff.setKeyDocument(keyDoc);
979     annotDiff.setResponseDocument(respDoc);
980     annotDiff.setKeyAnnotationSetName(annotSetName);
981     annotDiff.setResponseAnnotationSetName(annotSetName);
982     annotDiff.setKeyFeatureNamesSet(new HashSet());
983     annotDiff.setTextMode(new Boolean(true));
984     annotDiff.init();
985 
986     return annotDiff;
987   }
988 
989   protected void printAnnotations(AnnotationDiff annotDiff,
990                     Document keyDoc, Document respDoc) {
991     Out.prln("<TD>");
992     Out.pr("MISSING ANNOTATIONS in the automatic texts: ");
993     Set missingSet =
994       annotDiff.getAnnotationsOfType(AnnotationDiff.MISSING_TYPE);
995     printAnnotations(missingSet, keyDoc);
996     Out.prln("<BR>");
997 
998     Out.pr("SPURIOUS ANNOTATIONS in the automatic texts: ");
999     Set spuriousSet =
1000      annotDiff.getAnnotationsOfType(AnnotationDiff.SPURIOUS_TYPE);
1001    printAnnotations(spuriousSet, respDoc);
1002    Out.prln("</BR>");
1003
1004    Out.pr("PARTIALLY CORRECT ANNOTATIONS in the automatic texts: ");
1005    Set partialSet =
1006      annotDiff.getAnnotationsOfType(AnnotationDiff.PARTIALLY_CORRECT_TYPE);
1007    printAnnotations(partialSet, respDoc);
1008    Out.prln("</TD>");
1009
1010  }
1011
1012  protected void printAnnotations(Set set, Document doc) {
1013    if (set == null || set.isEmpty())
1014      return;
1015
1016    Iterator iter = set.iterator();
1017    while (iter.hasNext()) {
1018      Annotation ann = (Annotation) iter.next();
1019      Out.prln(
1020        "<B>" +
1021        doc.getContent().toString().substring(
1022          ann.getStartNode().getOffset().intValue(),
1023          ann.getEndNode().getOffset().intValue()) +
1024        "</B>: <I>[" + ann.getStartNode().getOffset() +
1025        "," + ann.getEndNode().getOffset() + "]</I>"
1026//        + "; features" + ann.getFeatures()
1027        );
1028    }//while
1029  }
1030
1031  /**
1032   * The directory from which we should generate/evaluate the corpus
1033   */
1034  private File startDir;
1035  private File currDir;
1036  private static List annotTypes;
1037
1038  private DefaultTokeniser tokeniser;
1039  private DefaultGazetteer gazetteer;
1040  private SentenceSplitter splitter;
1041  private POSTagger tagger;
1042  private ANNIETransducer transducer;
1043  private OrthoMatcher orthomatcher;
1044  private AnnotationSetTransfer setTransfer;
1045
1046  //collect the sum of all precisions and recalls of all docs
1047  //and the number of docs, so I can calculate the average for
1048  //the corpus at the end
1049  private double precisionSum = 0;
1050  private double recallSum = 0;
1051  private HashMap precisionByType = new HashMap();
1052  private HashMap prCountByType = new HashMap();
1053  private HashMap recallByType = new HashMap();
1054  private HashMap recCountByType = new HashMap();
1055  private int docNumber = 0;
1056
1057  /**
1058   * If true, the corpus tool will generate the corpus, otherwise it'll
1059   * run in evaluate mode
1060   */
1061  private boolean isGenerateMode = false;
1062  private boolean isVerboseMode = false;
1063
1064  /**
1065   * If true, the corpus tool will evaluate stored against the human-marked
1066   * documents
1067   */
1068  private boolean isMarkedStored = false;
1069  private boolean isMarkedClean = false;
1070
1071  private String annotSetName = "Key";
1072
1073  private double threshold = 0.5;
1074  private Properties configs = new Properties();
1075
1076  /** String to print when wrong command-line args */
1077  private static String usage =
1078    "usage: CorpusBenchmarkTool [-generate|-marked_stored|-marked_clean] [-verbose] directory-name";
1079
1080}