1   /*
2    *  CorpusBenchmarkTool.java
3    *
4    *  Copyright (c) 1998-2004, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Kalina Bontcheva, 24/Oct/2001
12   *
13   *  $Id: CorpusBenchmarkTool.java,v 1.49 2004/07/21 17:10:09 akshay Exp $
14   */
15  
16  package gate.util;
17  
18  import java.io.*;
19  import java.util.*;
20  
21  import gate.*;
22  import gate.util.AnnotationDiffer;
23  import gate.creole.*;
24  import gate.persist.PersistenceException;
25  import gate.persist.SerialDataStore;
26  
27  public class CorpusBenchmarkTool {
28    private static final String MARKED_DIR_NAME = "marked"; // sub-directory holding the human-annotated documents
29    private static final String CLEAN_DIR_NAME = "clean"; // sub-directory holding the original input documents
30    private static final String CVS_DIR_NAME = "Cvs"; // directory name that is skipped while traversing the corpus tree
31    private static final String PROCESSED_DIR_NAME = "processed"; // sub-directory used as the datastore of processed documents
32    private static final String ERROR_DIR_NAME = "err"; // sub-directory created for error output in more-info mode
33  
34    private static final boolean DEBUG = true; // NOTE(review): not referenced in the visible part of this file - confirm it is still needed
35  
36    public CorpusBenchmarkTool() {}
37  
38    public void initPRs() {
39      try {
40        if (applicationFile == null)
41          Out.prln("Application not set!");
42        Out.prln("App file is: " + applicationFile.getAbsolutePath());
43        application = (Controller) gate.util.persistence.PersistenceManager
44                                     .loadObjectFromFile(applicationFile);
45      } catch (Exception ex) {
46        throw new GateRuntimeException("Corpus Benchmark Tool:"+ex.getMessage());
47      }
48    }//initPRs
49  
50    public void unloadPRs() {
51      //we have nothing to unload if no PRs are loaded
52      if (isMarkedStored)
53        return;
54  
55    }
56  
57    public void execute() {
58  /*
59      Out.prln("Flags Gen Cln Str Vrb Minf: "
60               + isGenerateMode +" "+ isMarkedClean +" "+ isMarkedStored
61               +" "+ isVerboseMode +" "+ isMoreInfoMode);
62  */
63      execute(startDir);
64      System.out.println("Done execute");
65  /*    if (application != null) {
66        Iterator iter = new ArrayList(application.getPRs()).iterator();
67        while (iter.hasNext())
68          Factory.deleteResource((Resource) iter.next());
69  
70        Factory.deleteResource(application);
71      }*/
72      System.out.println("Done execute");
73    }
74  
75    public void init() {
76      //first read the corpus_tool.properties file
77      File propFile = new File("corpus_tool.properties");
78      Out.prln(propFile.getAbsolutePath());
79      if (propFile.exists()) {
80        try {
81          InputStream inputStream = new FileInputStream(propFile);
82          this.configs.load(inputStream);
83          String thresholdString = this.configs.getProperty("threshold");
84          if (thresholdString != null && !thresholdString.equals("")) {
85            this.threshold = (new Double(thresholdString)).doubleValue();
86            Out.prln("New threshold is: " + this.threshold + "<P>\n");
87          }
88          String setName = this.configs.getProperty("annotSetName");
89          if (setName != null && !setName.equals("")) {
90            Out.prln("Annotation set in marked docs is: " + setName + " <P>\n");
91            this.annotSetName = setName;
92          }
93          setName = this.configs.getProperty("outputSetName");
94          if (setName != null && !setName.equals("")) {
95            Out.prln("Annotation set in processed docs is: " + setName + " <P>\n");
96            this.outputSetName = setName;
97          }
98          String encodingString = this.configs.getProperty("encoding");
99          if (encodingString != null && !encodingString.equals("")) {
100           this.documentEncoding = encodingString;
101           Out.prln("New encoding is: " + this.documentEncoding + "<P>\n");
102         }
103         String types = this.configs.getProperty("annotTypes");
104         if (types != null && !types.equals("")) {
105           Out.prln("Using annotation types from the properties file. <P>\n");
106           StringTokenizer strTok = new StringTokenizer(types, ";");
107           annotTypes = new ArrayList();
108           while (strTok.hasMoreTokens())
109             annotTypes.add(strTok.nextToken());
110         } else {
111           annotTypes = new ArrayList();
112           annotTypes.add("Organization");
113           annotTypes.add("Person");
114           annotTypes.add("Date");
115           annotTypes.add("Location");
116           annotTypes.add("Address");
117           annotTypes.add("Money");
118           annotTypes.add("Percent");
119           annotTypes.add("GPE");
120           annotTypes.add("Facility");
121         }
122         String features = this.configs.getProperty("annotFeatures");
123         HashSet result = new HashSet();
124         if (features != null && !features.equals("")) {
125           Out.pr("Using annotation features from the properties file. \n");
126           java.util.StringTokenizer tok =
127               new java.util.StringTokenizer(features, ";");
128           String current;
129           while(tok.hasMoreTokens()) {
130             current = tok.nextToken();
131             result.add(current);
132           } // while
133         }
134         diffFeaturesSet = result;
135         Out.prln("Features: "+diffFeaturesSet+" <P>\n");
136 
137       } catch (IOException ex) {
138         //just ignore the file and go on with the defaults
139         this.configs = new Properties();
140       }
141     } else
142       this.configs = new Properties();
143 
144 
145     //we only initialise the PRs if they are going to be used
146     //for processing unprocessed documents
147     if (!this.isMarkedStored)
148       initPRs();
149 
150   }
151 
152   public void execute(File dir) {
153     if (dir == null)
154       return;
155     //first set the current directory to be the given one
156     currDir = dir;
157 
158     File processedDir = null;
159     File cleanDir = null;
160     File markedDir = null;
161     File errorDir = null;
162 
163     ArrayList subDirs = new ArrayList();
164     File[] dirArray = currDir.listFiles();
165     if(dirArray == null) return;
166     for (int i = 0; i < dirArray.length; i++) {
167       if (dirArray[i].isFile() || dirArray[i].getName().equals(CVS_DIR_NAME))
168         continue;
169       if (dirArray[i].getName().equals(CLEAN_DIR_NAME))
170         cleanDir = dirArray[i];
171       else if (dirArray[i].getName().equals(MARKED_DIR_NAME))
172         markedDir = dirArray[i];
173       else if (dirArray[i].getName().equals(PROCESSED_DIR_NAME))
174         processedDir = dirArray[i];
175       else if (dirArray[i].getName().equals(ERROR_DIR_NAME))
176         errorDir = dirArray[i];
177       else
178         subDirs.add(dirArray[i]);
179     }
180 
181     if(cleanDir == null) return;
182     Out.prln("Processing directory: " + currDir + "<P>");
183 
184     if (this.isGenerateMode)
185       generateCorpus(cleanDir, processedDir);
186     else
187       evaluateCorpus(cleanDir, processedDir, markedDir, errorDir);
188 
189     //if no more subdirs left, return
190     if (subDirs.isEmpty())
191       return;
192 
193     //there are more subdirectories to traverse, so iterate through
194     for (int j = 0; j < subDirs.size(); j++)
195       execute((File) subDirs.get(j));
196 
197   }//execute(dir)
198 
199 
200   public static void main(String[] args) throws GateException {
201     Out.prln("<HTML>");
202     Out.prln("<HEAD>");
203     Out.prln("<TITLE> Corpus benchmark tool: ran with args ");
204     for(int argC=0; argC < args.length; ++argC)
205       Out.pr(args[argC]+" ");
206     Out.pr(" on " + new Date() + "</TITLE> </HEAD>");
207     Out.prln("<BODY>");
208     Out.prln("Please wait while GATE tools are initialised. <P>");
209     // initialise GATE
210     Gate.init();
211 
212     CorpusBenchmarkTool corpusTool = new CorpusBenchmarkTool();
213 
214     List inputFiles = null;
215     if(args.length < 1) throw new GateException(usage);
216     int i = 0;
217     while (i < args.length && args[i].startsWith("-")) {
218       if(args[i].equals("-generate")) {
219         Out.prln("Generating the corpus... <P>");
220         corpusTool.setGenerateMode(true);
221       } else if (args[i].equals("-marked_clean")) {
222         Out.prln("Evaluating current grammars against human-annotated...<P>");
223         corpusTool.setMarkedClean(true);
224       } else if (args[i].equals("-marked_stored")) {
225         Out.prln("Evaluating stored documents against human-annotated...<P>");
226         corpusTool.setMarkedStored(true);
227       } else if (args[i].equals("-marked_ds")) {
228         Out.prln("Looking for marked docs in a datastore...<P>");
229         corpusTool.setMarkedDS(true);
230       } else if (args[i].equals("-verbose")) {
231         Out.prln("Running in verbose mode. Will generate annotation " +
232           "information when precision/recall are lower than " +
233           corpusTool.getThreshold() +"<P>");
234         corpusTool.setVerboseMode(true);
235       } else if (args[i].equals("-moreinfo")) {
236         Out.prln("Show more details in document table...<P>");
237         corpusTool.setMoreInfo(true);
238       }
239       i++; //just ignore the option, which we do not recognise
240     }//while
241 
242     String dirName = args[i];
243     File dir = new File(dirName);
244     if (!dir.isDirectory())
245       throw new GateException(usage);
246 
247     //get the last argument which is the application
248     i++;
249     String appName = args[i];
250     File appFile = new File(appName);
251     if (!appFile.isFile())
252       throw new GateException(usage);
253     else
254       corpusTool.setApplicationFile(appFile);
255 
256     corpusTool.init();
257     corpusWordCount = 0;
258 
259     Out.prln("Measuring annotaitions of types: " + CorpusBenchmarkTool.annotTypes + "<P>");
260 
261     corpusTool.setStartDirectory(dir);
262     corpusTool.execute();
263     System.out.println("Done Executing");
264     //if we're not generating the corpus, then print the precision and recall
265     //statistics for the processed corpus
266     if (! corpusTool.getGenerateMode())
267       corpusTool.printStatistics();
268 
269     Out.prln("<BR>Overall average precision: " + corpusTool.getPrecisionAverage());
270     Out.prln("<BR>Overall average recall: " + corpusTool.getRecallAverage());
271     if(corpusWordCount == 0)
272       Out.prln("<BR>No Token annotations to count words in the corpus.");
273     else
274       Out.prln("<BR>Overall word count: " + corpusWordCount);
275 
276 
277     if(hasProcessed) {
278       Out.prln("<P>Old Processed: ");
279       Out.prln("<BR>Overall average precision: "
280                + corpusTool.getPrecisionAverageProc());
281       Out.prln("<BR>Overall average recall: "
282                + corpusTool.getRecallAverageProc());
283     }
284     Out.prln("<BR>Finished! <P>");
285     Out.prln("</BODY>");
286     Out.prln("</HTML>");
287 
288     System.exit(0);
289 
290   }//main
291 
292   public void setGenerateMode(boolean mode) {
293     isGenerateMode = mode;
294   }//setGenerateMode
295 
296   public boolean getGenerateMode() {
297     return isGenerateMode;
298   }//getGenerateMode
299 
300   public boolean getVerboseMode() {
301     return isVerboseMode;
302   }//getVerboseMode
303 
304   public void setVerboseMode(boolean mode) {
305     isVerboseMode = mode;
306   }//setVerboseMode
307 
308   public void setMoreInfo(boolean mode) {
309     isMoreInfoMode = mode;
310   } // setMoreInfo
311 
312   public boolean getMoreInfo() {
313     return isMoreInfoMode;
314   } // getMoreInfo
315 
316   public void setDiffFeaturesList(Set features) {
317     diffFeaturesSet = features;
318   } // setDiffFeaturesList
319 
320   public Set getDiffFeaturesList() {
321     return diffFeaturesSet;
322   } // getDiffFeaturesList
323 
324   public void setMarkedStored(boolean mode) {
325     isMarkedStored = mode;
326   }// setMarkedStored
327 
328 
329   public boolean getMarkedStored() {
330     return isMarkedStored;
331   }// getMarkedStored
332 
333   public void setMarkedClean(boolean mode) {
334     isMarkedClean = mode;
335   }//
336 
337   public boolean getMarkedClean() {
338     return isMarkedClean;
339   }//
340 
341   public void setMarkedDS(boolean mode) {
342     isMarkedDS = mode;
343   }//
344 
345   public boolean getMarkedDS() {
346     return isMarkedDS;
347   }//
348 
349   public void setApplicationFile(File newAppFile) {
350     applicationFile = newAppFile;
351   }
352 
353   /**
354    * Returns the average precision over the entire set of processed documents.
355    * <P>
356    * If the tool has been evaluating the original documents against the
357    * previously-stored automatically annotated ones, then the precision
358    * will be the average precision on those two sets. <P>
359    * If the tool was run in -marked mode, i.e., was evaluating the stored
360    * automatically processed ones against the human-annotated ones, then
361    * the precision will be the average precision on those two sets of documents.
362    */
363   public double getPrecisionAverage() {
364     return precisionSum/docNumber;
365   }
366 
367   /**
368    * Returns the average recall over the entire set of processed documents.
369    * <P>
370    * If the tool has been evaluating the original documents against the
371    * previously-stored automatically annotated ones, then the recall
372    * will be the average recall on those two sets. <P>
373    * If the tool was run in -marked mode, i.e., was evaluating the stored
374    * automatically processed ones against the human-annotated ones, then
375    * the recall will be the average recall on those two sets of documents.
376    */
377   public double getRecallAverage() {
378     return recallSum/docNumber;
379   }
380 
381   /** For processed documents */
382   public double getPrecisionAverageProc() {
383     return proc_precisionSum/docNumber;
384   }
385   public double getRecallAverageProc() {
386     return proc_recallSum/docNumber;
387   }
388 
389 
390   public boolean isGenerateMode() {
391     return isGenerateMode == true;
392   }//isGenerateMode
393 
394   public double getThreshold() {
395     return threshold;
396   }
397 
398   public void setThreshold(double newValue) {
399     threshold = newValue;
400   }
401 
402   public File getStartDirectory() {
403     return startDir;
404   }//getStartDirectory
405 
406   public void setStartDirectory(File dir) {
407     startDir = dir;
408   }//setStartDirectory
409 
410   protected void generateCorpus(File fileDir, File outputDir) {
411     //1. check if we have input files
412     if (fileDir == null)
413       return;
414     //2. create the output directory or clean it up if needed
415     File outDir = outputDir;
416     if (outputDir == null) {
417       outDir = new File(currDir, PROCESSED_DIR_NAME);
418     } else {
419       // get rid of the directory, coz datastore wants it clean
420       if (!Files.rmdir(outDir))
421         Out.prln("cannot delete old output directory: " + outDir);
422     }
423     outDir.mkdir();
424 
425     //create the datastore and process each document
426     try {
427       SerialDataStore sds = new SerialDataStore(outDir.toURL().toString());
428       sds.create();
429       sds.open();
430 
431       File[] files = fileDir.listFiles();
432       for (int i=0; i < files.length; i++) {
433         if (!files[i].isFile())
434           continue;
435         // create a document
436         Out.prln("Processing and storing document: " + files[i].toURL() +"<P>");
437 
438         FeatureMap params = Factory.newFeatureMap();
439         params.put(Document.DOCUMENT_URL_PARAMETER_NAME, files[i].toURL());
440         params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, documentEncoding);
441 
442         // create the document
443         Document doc = (Document) Factory.createResource(
444           "gate.corpora.DocumentImpl", params
445         );
446 
447         doc.setName(files[i].getName());
448         if (doc == null)
449           continue;
450         processDocument(doc);
451         LanguageResource lr = sds.adopt(doc, null);
452         sds.sync(lr);
453         Factory.deleteResource(doc);
454         Factory.deleteResource(lr);
455       }//for
456       sds.close();
457     } catch (java.net.MalformedURLException ex) {
458       throw new GateRuntimeException("CorpusBenchmark: " + ex.getMessage());
459     } catch (PersistenceException ex1) {
460       throw new GateRuntimeException("CorpusBenchmark: " + ex1.getMessage());
461     } catch (ResourceInstantiationException ex2) {
462       throw new GateRuntimeException("CorpusBenchmark: " + ex2.getMessage());
463     } catch (gate.security.SecurityException ex3) {
464       throw new GateRuntimeException("CorpusBenchmark: " + ex3.getMessage());
465     }
466     System.out.println("Done");
467   }//generateCorpus
468 
469   protected void evaluateCorpus(File fileDir,
470                     File processedDir, File markedDir,
471                     File errorDir) {
472     //1. check if we have input files and the processed Dir
473     if (fileDir == null || !fileDir.exists())
474       return;
475     if (processedDir == null || !processedDir.exists())
476       //if the user wants evaluation of marked and stored that's not possible
477       if (isMarkedStored) {
478         Out.prln("Cannot evaluate because no processed documents exist.");
479         return;
480       }
481       else
482         isMarkedClean = true;
483 
484     // create the error directory or clean it up if needed
485     File errDir = null;
486     if(isMoreInfoMode) {
487       errDir = errorDir;
488       if (errDir == null) {
489         errDir = new File(currDir, ERROR_DIR_NAME);
490       }
491       else {
492         // get rid of the directory, coz we wants it clean
493         if (!Files.rmdir(errDir))
494           Out.prln("cannot delete old error directory: " + errDir);
495       }
496       Out.prln("Create error directory: " + errDir + "<BR><BR>");
497       errDir.mkdir();
498     }
499 
500     //looked for marked texts only if the directory exists
501     boolean processMarked = markedDir != null && markedDir.exists();
502     if (!processMarked && (isMarkedStored || isMarkedClean)) {
503         Out.prln("Cannot evaluate because no human-annotated documents exist.");
504         return;
505     }
506 
507     if (isMarkedStored) {
508       evaluateMarkedStored(markedDir, processedDir, errDir);
509       return;
510     } else if (isMarkedClean) {
511       evaluateMarkedClean(markedDir, fileDir, errDir);
512       return;
513     }
514 
515     Document persDoc = null;
516     Document cleanDoc = null;
517     Document markedDoc = null;
518 
519     //open the datastore and process each document
520     try {
521       //open the data store
522       DataStore sds = Factory.openDataStore
523                       ("gate.persist.SerialDataStore",
524                        processedDir.toURL().toExternalForm());
525 
526       List lrIDs = sds.getLrIds("gate.corpora.DocumentImpl");
527       for (int i=0; i < lrIDs.size(); i++) {
528         String docID = (String) lrIDs.get(i);
529 
530         //read the stored document
531         FeatureMap features = Factory.newFeatureMap();
532         features.put(DataStore.DATASTORE_FEATURE_NAME, sds);
533         features.put(DataStore.LR_ID_FEATURE_NAME, docID);
534         persDoc = (Document) Factory.createResource(
535                                     "gate.corpora.DocumentImpl",
536                                     features);
537 
538         if(isMoreInfoMode) {
539           StringBuffer errName = new StringBuffer(persDoc.getName());
540           errName.replace(
541             persDoc.getName().lastIndexOf("."),
542             persDoc.getName().length(),
543             ".err");
544           Out.prln("<H2>" +
545                    "<a href=\"err/" + errName.toString() + "\">"
546                    + persDoc.getName() + "</a>" + "</H2>");
547         } else
548           Out.prln("<H2>" + persDoc.getName() + "</H2>");
549 
550         File cleanDocFile = new File(fileDir, persDoc.getName());
551         //try reading the original document from clean
552         if (! cleanDocFile.exists()) {
553           Out.prln("Warning: Cannot find original document " +
554                    persDoc.getName() + " in " + fileDir);
555         } else {
556           FeatureMap params = Factory.newFeatureMap();
557           params.put(Document.DOCUMENT_URL_PARAMETER_NAME, cleanDocFile.toURL());
558           params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, documentEncoding);
559 
560           // create the document
561           cleanDoc = (Document) Factory.createResource(
562                                   "gate.corpora.DocumentImpl", params);
563           cleanDoc.setName(persDoc.getName());
564         }
565 
566         //try finding the marked document
567         StringBuffer docName = new StringBuffer(persDoc.getName());
568         if (! isMarkedDS) {
569           docName.replace(
570             persDoc.getName().lastIndexOf("."),
571             docName.length(),
572             ".xml");
573           File markedDocFile = new File(markedDir, docName.toString());
574           if (! processMarked || ! markedDocFile.exists()) {
575             Out.prln("Warning: Cannot find human-annotated document " +
576                      markedDocFile + " in " + markedDir);
577           } else {
578             FeatureMap params = Factory.newFeatureMap();
579             params.put(Document.DOCUMENT_URL_PARAMETER_NAME, markedDocFile.toURL());
580             params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, documentEncoding);
581 
582             // create the document
583             markedDoc = (Document) Factory.createResource(
584                                      "gate.corpora.DocumentImpl", params);
585             markedDoc.setName(persDoc.getName());
586           }
587         } else {
588           //open marked from a DS
589           //open the data store
590           DataStore sds1 = Factory.openDataStore
591                           ("gate.persist.SerialDataStore",
592                            markedDir.toURL().toExternalForm());
593 
594           List lrIDs1 = sds1.getLrIds("gate.corpora.DocumentImpl");
595           boolean found = false;
596           int k = 0;
597           //search for the marked doc with the same name
598           while (k < lrIDs1.size() && !found) {
599             String docID1 = (String) lrIDs1.get(k);
600 
601             //read the stored document
602             FeatureMap features1 = Factory.newFeatureMap();
603             features1.put(DataStore.DATASTORE_FEATURE_NAME, sds1);
604             features1.put(DataStore.LR_ID_FEATURE_NAME, docID1);
605             Document tempDoc = (Document) Factory.createResource(
606                                         "gate.corpora.DocumentImpl",
607                                         features1);
608             //check whether this is our doc
609             if ( ((String)tempDoc.getFeatures().get("gate.SourceURL")).
610                  endsWith(persDoc.getName())) {
611               found = true;
612               markedDoc = tempDoc;
613             } else k++;
614           }
615         }
616 
617         evaluateDocuments(persDoc, cleanDoc, markedDoc, errDir);
618         if (persDoc != null)
619           Factory.deleteResource(persDoc);
620         if (cleanDoc != null)
621           Factory.deleteResource(cleanDoc);
622         if (markedDoc != null)
623           Factory.deleteResource(markedDoc);
624 
625       }//for loop through saved docs
626       sds.close();
627     } catch (java.net.MalformedURLException ex) {
628       throw new GateRuntimeException("CorpusBenchmark: " + ex.getMessage());
629     } catch (PersistenceException ex1) {
630       throw new GateRuntimeException("CorpusBenchmark: " + ex1.getMessage());
631     } catch (ResourceInstantiationException ex2) {
632       throw new GateRuntimeException("CorpusBenchmark: " + ex2.getMessage());
633     }
634 
635   }//evaluateCorpus
636 
637   protected void evaluateMarkedStored(File markedDir, File storedDir, File errDir) {
638     Document persDoc = null;
639     Document cleanDoc = null;
640     Document markedDoc = null;
641 
642     //open the datastore and process each document
643     try {
644       //open the data store
645       DataStore sds = Factory.openDataStore
646                       ("gate.persist.SerialDataStore",
647                        storedDir.toURL().toExternalForm());
648 
649       List lrIDs = sds.getLrIds("gate.corpora.DocumentImpl");
650       for (int i=0; i < lrIDs.size(); i++) {
651         String docID = (String) lrIDs.get(i);
652 
653         //read the stored document
654         FeatureMap features = Factory.newFeatureMap();
655         features.put(DataStore.DATASTORE_FEATURE_NAME, sds);
656         features.put(DataStore.LR_ID_FEATURE_NAME, docID);
657         persDoc = (Document) Factory.createResource(
658                                     "gate.corpora.DocumentImpl",
659                                     features);
660 
661         if(isMoreInfoMode) {
662           StringBuffer errName = new StringBuffer(persDoc.getName());
663           errName.replace(
664             persDoc.getName().lastIndexOf("."),
665             persDoc.getName().length(),
666             ".err");
667           Out.prln("<H2>" +
668                    "<a href=\"err/" + errName.toString() + "\">"
669                    + persDoc.getName() + "</a>" + "</H2>");
670         } else
671           Out.prln("<H2>" + persDoc.getName() + "</H2>");
672 
673         if (! this.isMarkedDS) { //try finding the marked document as file
674           StringBuffer docName = new StringBuffer(persDoc.getName());
675           docName.replace(
676             persDoc.getName().lastIndexOf("."),
677             docName.length(),
678             ".xml");
679           File markedDocFile = new File(markedDir, docName.toString());
680           if (! markedDocFile.exists()) {
681             Out.prln("Warning: Cannot find human-annotated document " +
682                      markedDocFile + " in " + markedDir);
683           } else {
684             FeatureMap params = Factory.newFeatureMap();
685             params.put(Document.DOCUMENT_URL_PARAMETER_NAME, markedDocFile.toURL());
686             params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, documentEncoding);
687 
688             // create the document
689             markedDoc = (Document) Factory.createResource(
690                                      "gate.corpora.DocumentImpl", params);
691             markedDoc.setName(persDoc.getName());
692           }//find marked as file
693         } else {
694           try {
695             //open marked from a DS
696             //open the data store
697             DataStore sds1 = Factory.openDataStore
698                             ("gate.persist.SerialDataStore",
699                              markedDir.toURL().toExternalForm());
700 
701             List lrIDs1 = sds1.getLrIds("gate.corpora.DocumentImpl");
702             boolean found = false;
703             int k = 0;
704             //search for the marked doc with the same name
705             while (k < lrIDs1.size() && !found) {
706               String docID1 = (String) lrIDs1.get(k);
707 
708               //read the stored document
709               FeatureMap features1 = Factory.newFeatureMap();
710               features1.put(DataStore.DATASTORE_FEATURE_NAME, sds1);
711               features1.put(DataStore.LR_ID_FEATURE_NAME, docID1);
712               Document tempDoc = (Document) Factory.createResource(
713                                           "gate.corpora.DocumentImpl",
714                                           features1);
715               //check whether this is our doc
716               if ( ((String)tempDoc.getFeatures().get("gate.SourceURL")).
717                    endsWith(persDoc.getName())) {
718                 found = true;
719                 markedDoc = tempDoc;
720               } else k++;
721             }
722           } catch (java.net.MalformedURLException ex) {
723             Out.prln("Error finding marked directory " + markedDir.getAbsolutePath());
724           } catch (gate.persist.PersistenceException ex1) {
725             Out.prln("Error opening marked as a datastore (-marked_ds specified)");
726           } catch (gate.creole.ResourceInstantiationException ex2) {
727             Out.prln("Error opening marked as a datastore (-marked_ds specified)");
728           }
729         }
730 
731         evaluateDocuments(persDoc, cleanDoc, markedDoc, errDir);
732         if (persDoc != null)
733           Factory.deleteResource(persDoc);
734         if (markedDoc != null)
735           Factory.deleteResource(markedDoc);
736 
737       }//for loop through saved docs
738       sds.close();
739 
740     } catch (java.net.MalformedURLException ex) {
741       throw new GateRuntimeException("CorpusBenchmark: " + ex.getMessage());
742     } catch (PersistenceException ex1) {
743       throw new GateRuntimeException("CorpusBenchmark: " + ex1.getMessage());
744     } catch (ResourceInstantiationException ex2) {
745       throw new GateRuntimeException("CorpusBenchmark: " + ex2.getMessage());
746     }
747 
748   }//evaluateMarkedStored
749 
750 
751   protected void evaluateMarkedClean(File markedDir, File cleanDir, File errDir) {
752     Document persDoc = null;
753     Document cleanDoc = null;
754     Document markedDoc = null;
755 
756     File[] cleanDocs = cleanDir.listFiles();
757     for (int i = 0; i< cleanDocs.length; i++) {
758       if (!cleanDocs[i].isFile())
759         continue;
760 
761       //try reading the original document from clean
762       FeatureMap params = Factory.newFeatureMap();
763       try {
764         params.put(Document.DOCUMENT_URL_PARAMETER_NAME, cleanDocs[i].toURL());
765       } catch (java.net.MalformedURLException ex) {
766         Out.prln("Cannot create document from file: " +
767           cleanDocs[i].getAbsolutePath());
768         continue;
769       }
770       params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, "");
771 
772       // create the document
773       try {
774         cleanDoc = (Document) Factory.createResource(
775                               "gate.corpora.DocumentImpl", params,
776                               null, cleanDocs[i].getName());
777       } catch (gate.creole.ResourceInstantiationException ex) {
778         Out.prln("Cannot create document from file: " +
779           cleanDocs[i].getAbsolutePath());
780         continue;
781       }
782 
783       if(isMoreInfoMode) {
784         StringBuffer errName = new StringBuffer(cleanDocs[i].getName());
785         errName.replace(
786           cleanDocs[i].getName().lastIndexOf("."),
787           cleanDocs[i].getName().length(),
788           ".err");
789         Out.prln("<H2>" +
790                  "<a href=\"err/" + errName.toString() + "\">"
791                  + cleanDocs[i].getName() + "</a>" + "</H2>");
792       } else
793         Out.prln("<H2>" + cleanDocs[i].getName() + "</H2>");
794 
795       //try finding the marked document
796       if (! isMarkedDS) {
797         StringBuffer docName = new StringBuffer(cleanDoc.getName());
798         docName.replace(
799           cleanDoc.getName().lastIndexOf("."),
800           docName.length(),
801           ".xml");
802         File markedDocFile = new File(markedDir, docName.toString());
803         if (! markedDocFile.exists()) {
804           Out.prln("Warning: Cannot find human-annotated document " +
805                    markedDocFile + " in " + markedDir);
806           continue;
807         } else {
808           params = Factory.newFeatureMap();
809           try {
810             params.put(Document.DOCUMENT_URL_PARAMETER_NAME, markedDocFile.toURL());
811           } catch (java.net.MalformedURLException ex) {
812             Out.prln("Cannot create document from file: " +
813               markedDocFile.getAbsolutePath());
814             continue;
815           }
816           params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, "");
817 
818           // create the document
819           try {
820             markedDoc = (Document) Factory.createResource(
821                                    "gate.corpora.DocumentImpl", params,
822                                    null, cleanDoc.getName());
823           } catch (gate.creole.ResourceInstantiationException ex) {
824             Out.prln("Cannot create document from file: " +
825               markedDocFile.getAbsolutePath());
826             continue;
827           }
828 
829         }//if markedDoc exists
830       } else {
831         try {
832           //open marked from a DS
833           //open the data store
834           DataStore sds1 = Factory.openDataStore
835                           ("gate.persist.SerialDataStore",
836                            markedDir.toURL().toExternalForm());
837 
838           List lrIDs1 = sds1.getLrIds("gate.corpora.DocumentImpl");
839           boolean found = false;
840           int k = 0;
841           //search for the marked doc with the same name
842           while (k < lrIDs1.size() && !found) {
843             String docID1 = (String) lrIDs1.get(k);
844 
845             //read the stored document
846             FeatureMap features1 = Factory.newFeatureMap();
847             features1.put(DataStore.DATASTORE_FEATURE_NAME, sds1);
848             features1.put(DataStore.LR_ID_FEATURE_NAME, docID1);
849             Document tempDoc = (Document) Factory.createResource(
850                                         "gate.corpora.DocumentImpl",
851                                         features1);
852             //check whether this is our doc
853             if ( ((String)tempDoc.getFeatures().get("gate.SourceURL")).
854                  endsWith(cleanDoc.getName())) {
855               found = true;
856               markedDoc = tempDoc;
857             } else k++;
858           }
859         } catch (java.net.MalformedURLException ex) {
860           Out.prln("Error finding marked directory " + markedDir.getAbsolutePath());
861         } catch (gate.persist.PersistenceException ex1) {
862           Out.prln("Error opening marked as a datastore (-marked_ds specified)");
863         } catch (gate.creole.ResourceInstantiationException ex2) {
864           Out.prln("Error opening marked as a datastore (-marked_ds specified)");
865         }
866       } //if using a DS for marked
867 
868       try {
869         evaluateDocuments(persDoc, cleanDoc, markedDoc, errDir);
870       } catch (gate.creole.ResourceInstantiationException ex) {
871         ex.printStackTrace();
872         Out.prln("Evaluate failed on document: " + cleanDoc.getName());
873       }
874       if (persDoc != null)
875         Factory.deleteResource(persDoc);
876       if (cleanDoc != null)
877         Factory.deleteResource(cleanDoc);
878       if (markedDoc != null)
879         Factory.deleteResource(markedDoc);
880 
881     }//for loop through clean docs
882 
883 
884   }//evaluateMarkedClean
885 
886   protected void processDocument(Document doc) {
887     try {
888       if (application instanceof CorpusController) {
889         Corpus tempCorpus = Factory.newCorpus("temp");
890         tempCorpus.add(doc);
891         ((CorpusController)application).setCorpus(tempCorpus);
892         application.execute();
893         Factory.deleteResource(tempCorpus);
894         tempCorpus = null;
895       } else {
896         Iterator iter = application.getPRs().iterator();
897         while (iter.hasNext())
898           ((ProcessingResource) iter.next()).setParameterValue("document", doc);
899         application.execute();
900       }
901     } catch (ResourceInstantiationException ex) {
902       throw new RuntimeException("Error executing application: "
903                                     + ex.getMessage());
904     } catch (ExecutionException ex) {
905       throw new RuntimeException("Error executing application: "
906                                     + ex.getMessage());
907     }
908   }
909 
910   protected void evaluateDocuments(Document persDoc,
911                     Document cleanDoc, Document markedDoc,
912                     File errDir)
913                         throws ResourceInstantiationException {
914     if (cleanDoc == null && markedDoc == null)
915       return;
916 
917     //we've got no types to compare
918     if (annotTypes == null || annotTypes.isEmpty())
919       return;
920 
921     if (cleanDoc != null && !isMarkedStored) {
922 
923       processDocument(cleanDoc);
924 
925 
926       int wordCount = countWords(cleanDoc);
927       if(wordCount == 0)
928         Out.prln("<BR>No Token annotations to count words in the document.");
929       else
930         Out.prln("<BR>Word count: " + wordCount);
931       corpusWordCount += wordCount;
932 
933       if(!isMarkedClean)
934         evaluateAllThree(persDoc, cleanDoc, markedDoc, errDir);
935       else
936         evaluateTwoDocs(markedDoc, cleanDoc, errDir);
937 
938     } else
939       evaluateTwoDocs(markedDoc, persDoc, errDir);
940 
941   }
942 
943   /**
944    * Count all Token.kind=word annotations in the document
945    */
946   protected int countWords(Document annotDoc) {
947     int count = 0;
948 
949     if (annotDoc == null) return 0;
950     // check for Token in outputSetName
951     AnnotationSet tokens = annotDoc.getAnnotations(outputSetName).get("Token");
952     if (tokens == null) return 0;
953 
954     Iterator it = tokens.iterator();
955     Annotation currAnnotation;
956     while (it.hasNext()) {
957       currAnnotation = (Annotation) it.next();
958       Object feature = currAnnotation.getFeatures().get("kind");
959       if(feature != null && "word".equalsIgnoreCase((String)feature)) ++count;
960     } // while
961 
962     return count;
963   }
964 
965   protected void evaluateAllThree(Document persDoc,
966                                   Document cleanDoc, Document markedDoc,
967                                   File errDir)
968                                   throws ResourceInstantiationException {
969     //first start the table and its header
970     printTableHeader();
971 
972     // store annotation diff in .err file
973     Writer errWriter = null;
974     if (isMoreInfoMode && errDir != null) {
975       StringBuffer docName = new StringBuffer(cleanDoc.getName());
976       docName.replace(
977           cleanDoc.getName().lastIndexOf("."),
978           docName.length(),
979           ".err");
980       File errFile = new File(errDir, docName.toString());
981       String encoding = ((gate.corpora.DocumentImpl)cleanDoc).getEncoding();
982       try {
983         errWriter = new FileWriter(errFile, false);
984         /*
985         if(encoding == null) {
986           errWriter = new OutputStreamWriter(
987               new FileOutputStream(errFile, false));
988         } else {
989           errWriter = new OutputStreamWriter(
990               new FileOutputStream(errFile, false), encoding);
991         }*/
992       }
993       catch (Exception ex) {
994         Out.prln("Exception when creating the error file " + errFile + ": "
995                  + ex.getMessage());
996         errWriter = null;
997       }
998     }
999 
1000    for (int jj= 0; jj< annotTypes.size(); jj++) {
1001      String annotType = (String) annotTypes.get(jj);
1002
1003      AnnotationDiffer annotDiffer = measureDocs(markedDoc, cleanDoc, annotType);
1004      //we don't have this annotation type in this document
1005      if (annotDiffer == null)
1006        continue;
1007
1008      //increase the number of processed documents
1009      docNumber++;
1010      //add precison and recall to the sums
1011      updateStatistics(annotDiffer, annotType);
1012
1013      AnnotationDiffer annotDiffer1 =
1014        measureDocs(markedDoc, persDoc, annotType);
1015
1016      Out.prln("<TR>");
1017
1018      if(isMoreInfoMode && annotDiffer1 != null
1019         && (annotDiffer1.getPrecisionAverage() != annotDiffer.getPrecisionAverage()
1020         || annotDiffer1.getRecallAverage() != annotDiffer.getRecallAverage())
1021         )
1022        Out.prln("<TD> " + annotType + "_new"+ "</TD>");
1023      else
1024        Out.prln("<TD> " + annotType + "</TD>");
1025
1026      if (isMoreInfoMode) {
1027        if(annotDiffer1 != null) updateStatisticsProc(annotDiffer1, annotType);
1028
1029        Out.prln("<TD>" + annotDiffer.getCorrectMatches() + "</TD>");
1030        Out.prln("<TD>" + annotDiffer.getPartiallyCorrectMatches() + "</TD>");
1031        Out.prln("<TD>" + annotDiffer.getMissing() + "</TD>");
1032        Out.prln("<TD>" + annotDiffer.getSpurious() + "</TD>");
1033      }
1034
1035      Out.prln("<TD>");
1036
1037      //check the precision first
1038      if (annotDiffer1 != null) {
1039
1040        if (annotDiffer1.getPrecisionAverage()
1041              < annotDiffer.getPrecisionAverage()) {
1042            Out.prln("<P><Font color=blue> ");
1043            Out.prln(annotDiffer.getPrecisionAverage());
1044
1045            if(!isMoreInfoMode) {
1046              Out.pr("<BR>Precision increase on human-marked from ");
1047              Out.pr(annotDiffer1.getPrecisionAverage() + " to ");
1048              Out.prln(annotDiffer.getPrecisionAverage());
1049            }
1050            Out.prln(" </Font></P>");
1051          }
1052        else if (annotDiffer1.getPrecisionAverage()
1053               > annotDiffer.getPrecisionAverage()) {
1054          Out.prln("<P><Font color=red> ");
1055          Out.prln(annotDiffer.getPrecisionAverage());
1056
1057          if(!isMoreInfoMode) {
1058            Out.pr("<BR>Precision decrease on human-marked from ");
1059            Out.pr(annotDiffer1.getPrecisionAverage() + " to ");
1060            Out.prln(annotDiffer.getPrecisionAverage());
1061          }
1062          Out.prln(" </Font></P>");
1063        }
1064        else
1065          Out.prln("<P> " + (double) annotDiffer.getPrecisionAverage() + " </P>");
1066      }
1067      else
1068        Out.prln("<P> " + annotDiffer.getPrecisionAverage() + " </P>");
1069
1070      Out.prln("</TD>");
1071
1072      Out.prln("<TD>");
1073
1074      //check the recall now
1075      if (annotDiffer1 != null) {
1076
1077        if (annotDiffer1.getRecallAverage() < annotDiffer.getRecallAverage()) {
1078          Out.prln("<P><Font color=blue> ");
1079          Out.prln(annotDiffer.getRecallAverage());
1080
1081          if(!isMoreInfoMode) {
1082            Out.pr("<BR>Recall increase on human-marked from ");
1083            Out.pr(annotDiffer1.getRecallAverage() + " to ");
1084            Out.prln(annotDiffer.getRecallAverage());
1085          }
1086          Out.prln(" </Font></P>");
1087        }
1088        else if (annotDiffer1.getRecallAverage() > annotDiffer.getRecallAverage()) {
1089          Out.prln("<P><Font color=red> ");
1090          Out.prln(annotDiffer.getRecallAverage());
1091
1092          if(!isMoreInfoMode) {
1093            Out.pr("<BR>Recall decrease on human-marked from ");
1094            Out.pr(annotDiffer1.getRecallAverage() + " to ");
1095            Out.prln(annotDiffer.getRecallAverage());
1096          }
1097          Out.prln(" </Font></P>");
1098        }
1099        else
1100          Out.prln("<P> " + annotDiffer.getRecallAverage() + " </P>");
1101      } else
1102        Out.prln("<P> " + annotDiffer.getRecallAverage() + " </P>");
1103
1104
1105      Out.prln("</TD>");
1106
1107      //check the recall now
1108      if ( isVerboseMode ) {
1109        Out.prln("<TD>");
1110        if (annotDiffer.getRecallAverage() < threshold) {
1111          printAnnotations(annotDiffer, markedDoc, cleanDoc);
1112        }
1113        else {
1114          Out.prln("&nbsp;");
1115        }
1116        Out.prln("</TD>");
1117      }
1118
1119      Out.prln("</TR>");
1120
1121      // show one more table line for processed document
1122      if(isMoreInfoMode && annotDiffer1 != null
1123         && (annotDiffer1.getPrecisionAverage() != annotDiffer.getPrecisionAverage()
1124         || annotDiffer1.getRecallAverage() != annotDiffer.getRecallAverage())
1125         ) {
1126
1127        Out.prln("<TR>");
1128        Out.prln("<TD> " + annotType + "_old" + "</TD>");
1129
1130        Out.prln("<TD>" + annotDiffer1.getCorrectMatches() + "</TD>");
1131        Out.prln("<TD>" + annotDiffer1.getPartiallyCorrectMatches() + "</TD>");
1132        Out.prln("<TD>" + annotDiffer1.getMissing() + "</TD>");
1133        Out.prln("<TD>" + annotDiffer1.getSpurious() + "</TD>");
1134
1135        Out.prln("<TD>");
1136        if (annotDiffer1.getPrecisionAverage() < annotDiffer.getPrecisionAverage())
1137
1138          Out.prln("<P><Font color=blue> "  + annotDiffer1.getPrecisionAverage()
1139                + "</Font></P>");
1140        else if (annotDiffer1.getPrecisionAverage() > annotDiffer.getPrecisionAverage())
1141          Out.prln(
1142             "<P><Font color=red> " + annotDiffer1.getPrecisionAverage()
1143             + " </Font></P>");
1144        else
1145          Out.prln(annotDiffer1.getPrecisionAverage());
1146
1147        Out.prln("</TD>");
1148
1149        Out.prln("<TD>");
1150        if (annotDiffer1.getRecallAverage() < annotDiffer.getRecallAverage())
1151          Out.prln("<P><Font color=blue> " + annotDiffer1.getRecallAverage()
1152                   + " </Font></P>");
1153        else if (annotDiffer1.getRecallAverage() > annotDiffer.getRecallAverage())
1154          Out.prln("<P><Font color=red> " + annotDiffer1.getRecallAverage()
1155                    + " </Font></P>");
1156        else
1157           Out.prln(annotDiffer1.getRecallAverage());
1158
1159        Out.prln("</TD>");
1160
1161        //check the recall now
1162        if ( isVerboseMode ) {
1163          // create error file and start writing
1164
1165          Out.prln("<TD>");
1166          if (annotDiffer.getRecallAverage() < threshold) {
1167            printAnnotations(annotDiffer, markedDoc, cleanDoc);
1168          }
1169          else {
1170            Out.prln("&nbsp;");
1171          }
1172          Out.prln("</TD>");
1173        }
1174        Out.prln("</TR>");
1175      } // if(isMoreInfoMode && annotDiff1 != null)
1176
1177      if (isMoreInfoMode && errDir != null)
1178        storeAnnotations(annotType, annotDiffer, markedDoc, cleanDoc, errWriter);
1179    }//for loop through annotation types
1180    Out.prln("</TABLE>");
1181
1182    try {
1183      if(errWriter != null)
1184        errWriter.close();
1185    }
1186    catch (Exception ex) {
1187      Out.prln("Exception on close of error file " + errWriter + ": "
1188               + ex.getMessage());
1189    }
1190  }//evaluateAllThree
1191
1192  protected void evaluateTwoDocs(Document keyDoc, Document respDoc,
1193                                 File errDir)
1194        throws ResourceInstantiationException {
1195
1196    //first start the table and its header
1197    printTableHeader();
1198
1199    // store annotation diff in .err file
1200    Writer errWriter = null;
1201    if (isMoreInfoMode && errDir != null) {
1202      StringBuffer docName = new StringBuffer(keyDoc.getName());
1203      docName.replace(
1204          keyDoc.getName().lastIndexOf("."),
1205          docName.length(),
1206          ".err");
1207      File errFile = new File(errDir, docName.toString());
1208      String encoding = ((gate.corpora.DocumentImpl)keyDoc).getEncoding();
1209      try {
1210        errWriter = new FileWriter(errFile, false);
1211        /*
1212        if(encoding == null) {
1213          errWriter = new OutputStreamWriter(
1214              new FileOutputStream(errFile, false));
1215        } else {
1216          errWriter = new OutputStreamWriter(
1217              new FileOutputStream(errFile, false), encoding);
1218        }*/
1219      }
1220      catch (Exception ex) {
1221        Out.prln("Exception when creating the error file " + errFile + ": "
1222                 + ex.getMessage());
1223        errWriter = null;
1224      }
1225    }
1226
1227    for (int jj= 0; jj< annotTypes.size(); jj++) {
1228      String annotType = (String) annotTypes.get(jj);
1229
1230      AnnotationDiffer annotDiff = measureDocs(keyDoc, respDoc, annotType);
1231      //we don't have this annotation type in this document
1232      if (annotDiff == null)
1233        continue;
1234
1235      //increase the number of processed documents
1236      docNumber++;
1237      //add precison and recall to the sums
1238      updateStatistics(annotDiff, annotType);
1239
1240      Out.prln("<TR>");
1241      Out.prln("<TD>" + annotType + "</TD>");
1242
1243      if(isMoreInfoMode) {
1244        Out.prln("<TD>" + annotDiff.getCorrectMatches() + "</TD>");
1245        Out.prln("<TD>" + annotDiff.getPartiallyCorrectMatches() + "</TD>");
1246        Out.prln("<TD>" + annotDiff.getMissing() + "</TD>");
1247        Out.prln("<TD>" + annotDiff.getSpurious() + "</TD>");
1248      }
1249
1250      Out.prln("<TD>" + annotDiff.getPrecisionAverage() + "</TD>");
1251      Out.prln("<TD>" + annotDiff.getRecallAverage() + "</TD>");
1252      //check the recall now
1253      if ( isVerboseMode ) {
1254        Out.prln("<TD>");
1255        if (annotDiff.getRecallAverage() < threshold) {
1256          printAnnotations(annotDiff, keyDoc, respDoc);
1257        }
1258        else {
1259          Out.prln("&nbsp;");
1260        }
1261        Out.prln("</TD>");
1262      }
1263      Out.prln("</TR>");
1264
1265      if (isMoreInfoMode && errDir != null)
1266        storeAnnotations(annotType, annotDiff, keyDoc, respDoc, errWriter);
1267    }//for loop through annotation types
1268    Out.prln("</TABLE>");
1269
1270    try {
1271      if(errWriter != null)
1272        errWriter.close();
1273    }
1274    catch (Exception ex) {
1275      Out.prln("Exception on close of error file " + errWriter + ": "
1276               + ex.getMessage());
1277    }
1278  }//evaluateTwoDocs
1279
1280  protected void printTableHeader() {
1281    Out.prln("<TABLE BORDER=1");
1282    Out.pr("<TR> <TD><B>Annotation Type</B></TD> ");
1283
1284    if(isMoreInfoMode)
1285     Out.pr("<TD><B>Correct</B></TD> <TD><B>Partially Correct</B></TD> "
1286             + "<TD><B>Missing</B></TD> <TD><B>Spurious<B></TD>");
1287
1288    Out.pr("<TD><B>Precision</B></TD> <TD><B>Recall</B></TD>");
1289
1290    if (isVerboseMode)
1291      Out.pr("<TD><B>Annotations</B></TD>");
1292
1293    Out.prln("</TR>");
1294  }
1295
1296  protected void updateStatistics(AnnotationDiffer annotDiffer, String annotType){
1297    double precisionAverage = ((double)(annotDiffer.getPrecisionLenient() + annotDiffer.getPrecisionStrict()) /
1298(double)(2.0));
1299    precisionSum += precisionAverage;
1300
1301    double recallAverage = ((double)(annotDiffer.getRecallLenient() + annotDiffer.getRecallStrict()) / (double) (2.0));
1302    recallSum += recallAverage;
1303
1304    double fMeasureAverage = ((double) (annotDiffer.getFMeasureLenient(1) + annotDiffer.getFMeasureStrict(1)) /
1305(double) (2.0));
1306    fMeasureSum += fMeasureAverage;
1307
1308    Double oldPrecision = (Double) precisionByType.get(annotType);
1309    if (oldPrecision == null)
1310        precisionByType.put(annotType, new Double(precisionAverage));
1311    else
1312        precisionByType.put(annotType, new Double(oldPrecision.doubleValue() + precisionAverage));
1313
1314    Integer precCount = (Integer) prCountByType.get(annotType);
1315    if (precCount == null)
1316        prCountByType.put(annotType, new Integer(1));
1317    else
1318       prCountByType.put(annotType, new Integer(precCount.intValue() + 1));
1319
1320
1321    Double oldFMeasure = (Double) fMeasureByType.get(annotType);
1322    if (oldFMeasure == null)
1323       fMeasureByType.put(annotType, new Double(fMeasureAverage));
1324    else
1325       fMeasureByType.put(annotType, new Double(oldFMeasure.doubleValue() + fMeasureAverage));
1326
1327    Integer fCount = (Integer) fMeasureCountByType.get(annotType);
1328    if (fCount == null)
1329       fMeasureCountByType.put(annotType, new Integer(1));
1330    else
1331       fMeasureCountByType.put(annotType, new Integer(fCount.intValue() + 1));
1332
1333    Double oldRecall = (Double) recallByType.get(annotType);
1334    if (oldRecall == null)
1335       recallByType.put(annotType, new Double(recallAverage));
1336    else
1337       recallByType.put(annotType, new Double(oldRecall.doubleValue() + recallAverage));
1338
1339    Integer recCount = (Integer) recCountByType.get(annotType);
1340    if (recCount == null)
1341       recCountByType.put(annotType, new Integer(1));
1342    else
1343       recCountByType.put(annotType, new Integer(recCount.intValue() + 1));
1344
1345    //Update the missing, spurious, correct, and partial counts
1346    Long oldMissingNo = (Long) missingByType.get(annotType);
1347    if (oldMissingNo == null)
1348       missingByType.put(annotType, new Long(annotDiffer.getMissing()));
1349    else
1350       missingByType.put(annotType, new Long(oldMissingNo.longValue() + annotDiffer.getMissing()));
1351
1352    Long oldCorrectNo = (Long) correctByType.get(annotType);
1353    if (oldCorrectNo == null)
1354       correctByType.put(annotType, new Long(annotDiffer.getCorrectMatches()));
1355    else
1356       correctByType.put(annotType, new Long(oldCorrectNo.longValue() + annotDiffer.getCorrectMatches()));
1357
1358    Long oldPartialNo = (Long) partialByType.get(annotType);
1359    if (oldPartialNo == null)
1360       partialByType.put(annotType, new Long(annotDiffer.getPartiallyCorrectMatches()));
1361    else
1362       partialByType.put(annotType, new Long(oldPartialNo.longValue() + annotDiffer.getPartiallyCorrectMatches()));
1363
1364    Long oldSpuriousNo = (Long) spurByType.get(annotType);
1365    if (oldSpuriousNo == null)
1366       spurByType.put(annotType, new Long(annotDiffer.getSpurious()));
1367    else
1368       spurByType.put(annotType, new Long(oldSpuriousNo.longValue() + annotDiffer.getSpurious()));
1369  }
1370
1371  /**
1372   * Update statistics for processed documents
1373   * The same procedure as updateStatistics with different hashTables
1374   */
1375  protected void updateStatisticsProc(AnnotationDiffer annotDiffer, String annotType){
1376    hasProcessed = true;
1377    double precisionAverage = ((double)(annotDiffer.getPrecisionLenient() + annotDiffer.getPrecisionStrict()) /
1378(double)(2.0));
1379    proc_precisionSum += precisionAverage;
1380
1381    double recallAverage = ((double)(annotDiffer.getRecallLenient() + annotDiffer.getRecallStrict()) / (double) (2.0));
1382    proc_recallSum += recallAverage;
1383
1384    double fMeasureAverage = ((double) (annotDiffer.getFMeasureLenient(1) + annotDiffer.getFMeasureStrict(1)) /
1385(double) (2.0));
1386    proc_fMeasureSum += fMeasureAverage;
1387
1388    Double oldPrecision = (Double) proc_precisionByType.get(annotType);
1389    if (oldPrecision == null)
1390        proc_precisionByType.put(annotType, new Double(precisionAverage));
1391      else
1392        proc_precisionByType.put(annotType,
1393                            new Double(oldPrecision.doubleValue() +
1394                                       precisionAverage));
1395      Integer precCount = (Integer) proc_prCountByType.get(annotType);
1396      if (precCount == null)
1397        proc_prCountByType.put(annotType, new Integer(1));
1398      else
1399        proc_prCountByType.put(annotType, new Integer(precCount.intValue() + 1));
1400
1401
1402      Double oldFMeasure = (Double) proc_fMeasureByType.get(annotType);
1403      if (oldFMeasure == null)
1404        proc_fMeasureByType.put(annotType,
1405                         new Double(fMeasureAverage));
1406      else
1407        proc_fMeasureByType.put(annotType,
1408                         new Double(oldFMeasure.doubleValue() +
1409                                    fMeasureAverage));
1410      Integer fCount = (Integer) proc_fMeasureCountByType.get(annotType);
1411      if (fCount == null)
1412        proc_fMeasureCountByType.put(annotType, new Integer(1));
1413      else
1414        proc_fMeasureCountByType.put(annotType, new Integer(fCount.intValue() + 1));
1415
1416      Double oldRecall = (Double) proc_recallByType.get(annotType);
1417      if (oldRecall == null)
1418        proc_recallByType.put(annotType,
1419                            new Double(recallAverage));
1420      else
1421        proc_recallByType.put(annotType,
1422                            new Double(oldRecall.doubleValue() +
1423                                       recallAverage));
1424      Integer recCount = (Integer) proc_recCountByType.get(annotType);
1425      if (recCount == null)
1426        proc_recCountByType.put(annotType, new Integer(1));
1427      else
1428        proc_recCountByType.put(annotType, new Integer(recCount.intValue() + 1));
1429
1430      //Update the missing, spurious, correct, and partial counts
1431      Long oldMissingNo = (Long) proc_missingByType.get(annotType);
1432      if (oldMissingNo == null)
1433        proc_missingByType.put(annotType, new Long(annotDiffer.getMissing()));
1434      else
1435        proc_missingByType.put(annotType,
1436                        new Long(oldMissingNo.longValue() +
1437                                  annotDiffer.getMissing()));
1438
1439      Long oldCorrectNo = (Long) proc_correctByType.get(annotType);
1440      if (oldCorrectNo == null)
1441        proc_correctByType.put(annotType, new Long(annotDiffer.getCorrectMatches()));
1442      else
1443        proc_correctByType.put(annotType,
1444                        new Long(oldCorrectNo.longValue() +
1445                                  annotDiffer.getCorrectMatches()));
1446
1447      Long oldPartialNo = (Long) proc_partialByType.get(annotType);
1448      if (oldPartialNo == null)
1449        proc_partialByType.put(annotType, new Long(annotDiffer.getPartiallyCorrectMatches()));
1450      else
1451        proc_partialByType.put(annotType,
1452                        new Long(oldPartialNo.longValue() +
1453                                  annotDiffer.getPartiallyCorrectMatches()));
1454
1455      Long oldSpuriousNo = (Long) proc_spurByType.get(annotType);
1456      if (oldSpuriousNo == null)
1457        proc_spurByType.put(annotType, new Long(annotDiffer.getSpurious()));
1458      else
1459        proc_spurByType.put(annotType,
1460                        new Long(oldSpuriousNo.longValue() +
1461                                  annotDiffer.getSpurious()));
1462  }
1463
1464  public void printStatistics() {
1465
1466    Out.prln("<H2> Statistics </H2>");
1467
1468/*
1469    Out.prln("<H3> Precision </H3>");
1470    if (precisionByType != null && !precisionByType.isEmpty()) {
1471      Iterator iter = precisionByType.keySet().iterator();
1472      while (iter.hasNext()) {
1473        String annotType = (String) iter.next();
1474        Out.prln(annotType + ": "
1475          + ((Double)precisionByType.get(annotType)).doubleValue()
1476              /
1477              ((Integer)prCountByType.get(annotType)).intValue()
1478          + "<P>");
1479      }//while
1480    }
1481    Out.prln("Overall precision: " + getPrecisionAverage() + "<P>");
1482
1483    Out.prln("<H3> Recall </H3>");
1484    if (recallByType != null && !recallByType.isEmpty()) {
1485      Iterator iter = recallByType.keySet().iterator();
1486      while (iter.hasNext()) {
1487        String annotType = (String) iter.next();
1488        Out.prln(annotType + ": "
1489          + ((Double)recallByType.get(annotType)).doubleValue()
1490              /
1491              ((Integer)recCountByType.get(annotType)).intValue()
1492          + "<P>");
1493      }//while
1494    }
1495
1496    Out.prln("Overall recall: " + getRecallAverage()
1497             + "<P>");
1498*/
1499    if (annotTypes == null) {
1500      Out.prln("No types given for evaluation, cannot obtain precision/recall");
1501      return;
1502    }
1503    Out.prln("<table border=1>");
1504    Out.prln("<TR> <TD><B>Annotation Type</B></TD> <TD><B>Correct</B></TD>" +
1505              "<TD><B>Partially Correct</B></TD> <TD><B>Missing</B></TD>" +
1506              "<TD><B>Spurious</B></TD> <TD><B>Precision</B></TD>" +
1507              "<TD><B>Recall</B></TD> <TD><B>F-Measure</B></TD> </TR>");
1508    String annotType;
1509    for (int i = 0; i < annotTypes.size(); i++) {
1510      annotType = (String) annotTypes.get(i);
1511      printStatsForType(annotType);
1512    }//for
1513    Out.prln("</table>");
1514  } // updateStatisticsProc
1515
1516  protected void printStatsForType(String annotType){
1517    long correct = (correctByType.get(annotType) == null)? 0 :
1518                      ((Long)correctByType.get(annotType)).longValue();
1519    long partial = (partialByType.get(annotType) == null)? 0 :
1520                      ((Long)partialByType.get(annotType)).longValue();
1521    long spurious = (spurByType.get(annotType) == null)? 0 :
1522                      ((Long)spurByType.get(annotType)).longValue();
1523    long missing = (missingByType.get(annotType) == null)? 0:
1524                      ((Long)missingByType.get(annotType)).longValue();
1525    long actual = correct + partial + spurious;
1526    long possible = correct + partial + missing;
1527    //precision strict is correct/actual
1528    //precision is (correct + 0.5 * partially correct)/actual
1529    double precision = (correct + 0.5 * partial) / actual;
1530    //recall strict is correct/possible
1531    double recall = (correct + 0.5*partial)/possible;
1532    //F-measure = ( (beta*beta + 1)*P*R ) / ((beta*beta*P) + R)
1533    double fmeasure =
1534      ((beta*beta + 1)*precision*recall)
1535      /
1536      ((beta*beta*precision) + recall);
1537
1538    long proc_correct=0;
1539    long proc_partial=0;
1540    long proc_spurious=0;
1541    long proc_missing=0;
1542    long proc_actual=0;
1543    long proc_possible=0;
1544    double proc_precision=0;
1545    double proc_recall=0;
1546    double proc_fmeasure=0;
1547
1548    if(hasProcessed) {
1549      // calculate values for processed
1550      proc_correct = (proc_correctByType.get(annotType) == null)? 0 :
1551                        ((Long)proc_correctByType.get(annotType)).longValue();
1552      proc_partial = (proc_partialByType.get(annotType) == null)? 0 :
1553                        ((Long)proc_partialByType.get(annotType)).longValue();
1554      proc_spurious = (proc_spurByType.get(annotType) == null)? 0 :
1555                        ((Long)proc_spurByType.get(annotType)).longValue();
1556      proc_missing = (proc_missingByType.get(annotType) == null)? 0:
1557                        ((Long)proc_missingByType.get(annotType)).longValue();
1558      proc_actual = proc_correct + proc_partial + proc_spurious;
1559      proc_possible = proc_correct + proc_partial + proc_missing;
1560      //precision strict is correct/actual
1561      //precision is (correct + 0.5 * partially correct)/actual
1562      proc_precision = (proc_correct + 0.5*proc_partial)/proc_actual;
1563      //recall strict is correct/possible
1564      proc_recall = (proc_correct + 0.5*proc_partial)/proc_possible;
1565      //F-measure = ( (beta*beta + 1)*P*R ) / ((beta*beta*P) + R)
1566      proc_fmeasure =
1567        ((beta*beta + 1)*proc_precision*proc_recall)
1568        /
1569        ((beta*beta*proc_precision) + proc_recall);
1570    }
1571
1572    // output data
1573    Out.prln("<TR>");
1574    if(hasProcessed)
1575      Out.prln("<TD>" + annotType+ "_new"  + "</TD>");
1576    else
1577      Out.prln("<TD>" + annotType + "</TD>");
1578
1579    Out.prln("<TD>" + correct + "</TD>");
1580    Out.prln("<TD>" + partial + "</TD>");
1581    Out.prln("<TD>" + missing + "</TD>");
1582    Out.prln("<TD>" + spurious + "</TD>");
1583
1584    String strPrec = (isMoreInfoMode)?
1585        avgPrint(precision, 4)
1586        :Double.toString(precision);
1587    String strRec = (isMoreInfoMode)?
1588        avgPrint(recall, 4)
1589        :Double.toString(recall);
1590    String strFmes = (isMoreInfoMode)?
1591        avgPrint(fmeasure, 4)
1592        :Double.toString(fmeasure);
1593
1594    if(hasProcessed && (precision < proc_precision))
1595      Out.prln("<TD><Font color=red>" + strPrec + "</TD>");
1596      else if(hasProcessed && (precision > proc_precision))
1597        Out.prln("<TD><Font color=blue>" + strPrec + "</TD>");
1598        else
1599          Out.prln("<TD>" + strPrec + "</TD>");
1600    if(hasProcessed && (recall < proc_recall))
1601      Out.prln("<TD><Font color=red>" + strRec + "</TD>");
1602      else if(hasProcessed && (recall > proc_recall))
1603        Out.prln("<TD><Font color=blue>" + strRec + "</TD>");
1604        else
1605          Out.prln("<TD>" + strRec + "</TD>");
1606    Out.prln("<TD>" + strFmes + "</TD>");
1607    Out.prln("</TR>");
1608
1609    if(hasProcessed) {
1610      // output data
1611      Out.prln("<TR>");
1612      Out.prln("<TD>" + annotType + "_old" + "</TD>");
1613
1614      Out.prln("<TD>" + proc_correct + "</TD>");
1615      Out.prln("<TD>" + proc_partial + "</TD>");
1616      Out.prln("<TD>" + proc_missing + "</TD>");
1617      Out.prln("<TD>" + proc_spurious + "</TD>");
1618
1619      String strProcPrec = (isMoreInfoMode)?
1620          avgPrint(proc_precision, 4)
1621          :Double.toString(proc_precision);
1622      String strProcRec = (isMoreInfoMode)?
1623          avgPrint(proc_recall, 4)
1624          :Double.toString(proc_recall);
1625      String strProcFmes = (isMoreInfoMode)?
1626          avgPrint(proc_fmeasure, 4)
1627          :Double.toString(proc_fmeasure);
1628
1629      if(precision < proc_precision)
1630        Out.prln("<TD><Font color=red>" + strProcPrec + "</TD>");
1631        else if(precision > proc_precision)
1632          Out.prln("<TD><Font color=blue>" + strProcPrec + "</TD>");
1633          else
1634            Out.prln("<TD>" + strProcPrec + "</TD>");
1635      if(recall < proc_recall)
1636        Out.prln("<TD><Font color=red>" + strProcRec + "</TD>");
1637        else if(recall > proc_recall)
1638          Out.prln("<TD><Font color=blue>" + strProcRec + "</TD>");
1639          else
1640            Out.prln("<TD>" + strProcRec + "</TD>");
1641      Out.prln("<TD>" + strProcFmes + "</TD>");
1642      Out.prln("</TR>");
1643    }
1644  }//printStatsForType
1645
1646  //** Print @param value with @param count digits after decimal point */
1647  protected String avgPrint(double value, int count) {
1648    double newvalue;
1649    double power = Math.pow(10, count);
1650    newvalue = Math.round( value * power )/ power;
1651    return Double.toString(newvalue);
1652  }
1653
1654
  // Corpus-level precision, recall and F-measure computed from the summed
  // per-type counts. They are assigned by calculateAvgTotal() and exposed
  // through the get*AverageCalc() accessors below.
  private double precisionSumCalc = 0;
  private double recallSumCalc = 0;
  private double fMeasureSumCalc = 0;
1658
1659  public double getPrecisionAverageCalc() {
1660    return precisionSumCalc;
1661  }
1662
1663  public double getRecallAverageCalc() {
1664    return recallSumCalc;
1665  }
1666
1667  public double getFmeasureAverageCalc() {
1668    return fMeasureSumCalc;
1669  }
1670
1671  protected void calculateAvgTotal() {
1672    long correct, partial, spurious, missing;
1673    long correctSum, partialSum, spuriousSum, missingSum;
1674
1675    if (annotTypes == null) {
1676      return;
1677    }
1678    correctSum = partialSum = spuriousSum = missingSum = 0;
1679
1680    String annotType;
1681    for (int i = 0; i < annotTypes.size(); i++) {
1682      annotType = (String) annotTypes.get(i);
1683      correct = (correctByType.get(annotType) == null)? 0 :
1684                        ((Long)correctByType.get(annotType)).longValue();
1685      partial = (partialByType.get(annotType) == null)? 0 :
1686                        ((Long)partialByType.get(annotType)).longValue();
1687      spurious = (spurByType.get(annotType) == null)? 0 :
1688                        ((Long)spurByType.get(annotType)).longValue();
1689      missing = (missingByType.get(annotType) == null)? 0:
1690                        ((Long)missingByType.get(annotType)).longValue();
1691      correctSum += correct;
1692      partialSum += partial;
1693      spuriousSum += spurious;
1694      missingSum += missing;
1695    }//for
1696
1697    long actual = correctSum + partialSum + spuriousSum;
1698    long possible = correctSum + partialSum + missingSum;
1699
1700    if(actual == 0) {
1701      precisionSumCalc = 0;
1702    }
1703    else {
1704      precisionSumCalc = (correctSum + 0.5 * partialSum) / actual;
1705    }
1706
1707    if(possible == 0) {
1708      recallSumCalc = 0;
1709    }
1710    else {
1711      recallSumCalc = (correctSum + 0.5 * partialSum) / actual;
1712    }
1713
1714    if(precisionSumCalc == 0 && recallSumCalc == 0) {
1715      fMeasureSumCalc = 0;
1716    }
1717    else {
1718      fMeasureSumCalc =
1719        ((beta*beta + 1)*precisionSumCalc*recallSumCalc)
1720        /
1721        ((beta*beta*precisionSumCalc) + recallSumCalc);
1722
1723    }
1724  } // calculateAvgTotal
1725
1726  protected AnnotationDiffer measureDocs(
1727    Document keyDoc, Document respDoc, String annotType)
1728      throws ResourceInstantiationException {
1729
1730    if (keyDoc == null || respDoc == null)
1731      return null;
1732
1733    if (annotSetName != null
1734        && keyDoc.getAnnotations(annotSetName).get(annotType) == null)
1735      return null;
1736    else if ((annotSetName == null || annotSetName.equals(""))
1737        && keyDoc.getAnnotations().get(annotType) == null)
1738      return null;
1739
1740    // create an annotation diff
1741    AnnotationDiffer annotDiffer = new AnnotationDiffer();
1742    // set the feature names set for annotation differ
1743    annotDiffer.setSignificantFeaturesSet(diffFeaturesSet);
1744    // we need to find the sets
1745    AnnotationSet keys, responses;
1746    if(annotSetName == null || annotSetName.equals("")) {
1747      keys = keyDoc.getAnnotations();
1748      responses = respDoc.getAnnotations();
1749    } else {
1750      keys = keyDoc.getAnnotations(annotSetName)/*.get()*/;
1751      System.out.println("Keys : "+keys.size());
1752      responses = respDoc.getAnnotations(outputSetName)/*.get()*/;
1753      System.out.println("Resp : "+responses.size());
1754    }
1755
1756    // we have annotation sets so call the annotationDiffer
1757    List pairings = annotDiffer.calculateDiff(keys,responses);
1758    return annotDiffer;
1759  } // measureDocs
1760
1761  protected void storeAnnotations(String type, AnnotationDiffer annotDiffer,
1762                  Document keyDoc, Document respDoc, Writer errFileWriter) {
1763    if(errFileWriter == null) return; // exit on "no file"
1764
1765    try {
1766      // extract and store annotations
1767      Comparator comp = new OffsetComparator();
1768      TreeSet sortedSet = new TreeSet(comp);
1769      Set missingSet =
1770          annotDiffer.getAnnotationsOfType(AnnotationDiffer.MISSING_TYPE);
1771      sortedSet.clear();
1772      sortedSet.addAll(missingSet);
1773      storeAnnotations(type+".miss", sortedSet, keyDoc, errFileWriter);
1774      Set spuriousSet =
1775          annotDiffer.getAnnotationsOfType(AnnotationDiffer.SPURIOUS_TYPE);
1776      sortedSet.clear();
1777      sortedSet.addAll(spuriousSet);
1778      storeAnnotations(type+".spur", sortedSet, respDoc, errFileWriter);
1779      Set partialSet =
1780          annotDiffer.getAnnotationsOfType(AnnotationDiffer.PARTIALLY_CORRECT_TYPE);
1781      sortedSet.clear();
1782      sortedSet.addAll(partialSet);
1783      storeAnnotations(type+".part", sortedSet, respDoc, errFileWriter);
1784    } catch (Exception ex) {
1785      Out.prln("Exception on close of error file "+errFileWriter+": "
1786               +ex.getMessage());
1787    }
1788  }// storeAnnotations
1789
1790  protected void storeAnnotations(String type, Set set, Document doc,
1791                                  Writer file) throws IOException{
1792
1793    if (set == null || set.isEmpty())
1794      return;
1795
1796    Iterator iter = set.iterator();
1797    Annotation ann;
1798    while (iter.hasNext()) {
1799      ann = (Annotation) iter.next();
1800      file.write(type);
1801      file.write(".");
1802      file.write(doc.getContent().toString().substring(
1803          ann.getStartNode().getOffset().intValue(),
1804          ann.getEndNode().getOffset().intValue()));
1805      file.write(".");
1806      file.write(ann.getStartNode().getOffset().toString());
1807      file.write(".");
1808      file.write(ann.getEndNode().getOffset().toString());
1809      file.write("\n");
1810    }//while
1811  }// storeAnnotations
1812
1813  protected void printAnnotations(AnnotationDiffer annotDiff,
1814                    Document keyDoc, Document respDoc) {
1815    Out.pr("MISSING ANNOTATIONS in the automatic texts: ");
1816    Set missingSet =
1817      annotDiff.getAnnotationsOfType(AnnotationDiffer.MISSING_TYPE);
1818    printAnnotations(missingSet, keyDoc);
1819    Out.prln("<BR>");
1820
1821    Out.pr("SPURIOUS ANNOTATIONS in the automatic texts: ");
1822    Set spuriousSet =
1823      annotDiff.getAnnotationsOfType(AnnotationDiffer.SPURIOUS_TYPE);
1824    printAnnotations(spuriousSet, respDoc);
1825    Out.prln("</BR>");
1826
1827    Out.pr("PARTIALLY CORRECT ANNOTATIONS in the automatic texts: ");
1828    Set partialSet =
1829      annotDiff.getAnnotationsOfType(AnnotationDiffer.PARTIALLY_CORRECT_TYPE);
1830    printAnnotations(partialSet, respDoc);
1831  }
1832
1833  protected void printAnnotations(Set set, Document doc) {
1834    if (set == null || set.isEmpty())
1835      return;
1836
1837    Iterator iter = set.iterator();
1838    while (iter.hasNext()) {
1839      Annotation ann = (Annotation) iter.next();
1840      Out.prln(
1841        "<B>" +
1842        doc.getContent().toString().substring(
1843          ann.getStartNode().getOffset().intValue(),
1844          ann.getEndNode().getOffset().intValue()) +
1845        "</B>: <I>[" + ann.getStartNode().getOffset() +
1846        "," + ann.getEndNode().getOffset() + "]</I>"
1847//        + "; features" + ann.getFeatures()
1848        );
1849    }//while
1850  }//printAnnotations
1851
1852  /**
1853   * The directory from which we should generate/evaluate the corpus
1854   */
1855  private File startDir;
1856  private File currDir;
1857  private static List annotTypes;
1858
1859  private Controller application = null;
1860  private File applicationFile = null;
1861
  //collect the sum of all precisions and recalls of all docs
  //and the number of docs, so I can calculate the average for
  //the corpus at the end
  private double precisionSum = 0;
  private double recallSum = 0;
  private double fMeasureSum = 0;
  // NOTE(review): presumably per-type sums of the measures and the number
  // of values summed, keyed by annotation type - the code filling these
  // maps is not visible in this part of the file, confirm.
  private HashMap precisionByType = new HashMap();
  private HashMap prCountByType = new HashMap();
  private HashMap recallByType = new HashMap();
  private HashMap recCountByType = new HashMap();
  private HashMap fMeasureByType = new HashMap();
  private HashMap fMeasureCountByType = new HashMap();

  // Per-type annotation counts: annotation type name (String) -> Long.
  // calculateAvgTotal() reads them and treats a missing entry as 0.
  private HashMap missingByType = new HashMap();
  private HashMap spurByType = new HashMap();
  private HashMap correctByType = new HashMap();
  private HashMap partialByType = new HashMap();
1879
  // statistic for processed
  // When hasProcessed is true, printStatsForType prints a second "_old" row
  // per annotation type, computed from the proc_ counters below, next to
  // the "_new" row. The flag is static and package-visible.
  static boolean hasProcessed = false;
  private double proc_precisionSum = 0;
  private double proc_recallSum = 0;
  private double proc_fMeasureSum = 0;
  // NOTE(review): presumably the "processed" counterparts of the per-type
  // measure sums and counts above - their use is not visible here, confirm.
  private HashMap proc_precisionByType = new HashMap();
  private HashMap proc_prCountByType = new HashMap();
  private HashMap proc_recallByType = new HashMap();
  private HashMap proc_recCountByType = new HashMap();
  private HashMap proc_fMeasureByType = new HashMap();
  private HashMap proc_fMeasureCountByType = new HashMap();

  // Per-type annotation counts for the processed documents: annotation type
  // name (String) -> Long; printStatsForType treats a missing entry as 0.
  private HashMap proc_missingByType = new HashMap();
  private HashMap proc_spurByType = new HashMap();
  private HashMap proc_correctByType = new HashMap();
  private HashMap proc_partialByType = new HashMap();
1896
  // Weight used in the F-measure formula
  // ((beta*beta + 1)*P*R) / ((beta*beta*P) + R); with beta = 1 precision
  // and recall are weighted equally.
  double beta = 1;

  // NOTE(review): presumably the number of documents processed so far -
  // its use is not visible in this part of the file, confirm.
  private int docNumber = 0;
1900
1901  /**
1902   * If true, the corpus tool will generate the corpus, otherwise it'll
1903   * run in evaluate mode
1904   */
1905  private boolean isGenerateMode = false;
1906
1907  /**
1908   * If true - show annotations for docs below threshold
1909   */
1910  private boolean isVerboseMode = false;
1911
1912  /**
1913   * If true - show more info in document table
1914   */
1915  private boolean isMoreInfoMode = false;
1916
1917  /**
1918   * The list of features used in the AnnotationDiff separated by comma
1919   * Example: "class;inst"
1920   */
1921  private Set diffFeaturesSet;
1922
1923  /**
1924   * If true, the corpus tool will evaluate stored against the human-marked
1925   * documents
1926   */
1927  private boolean isMarkedStored = false;
1928  private boolean isMarkedClean = false;
1929  //whether marked are in a DS, not xml
1930  private boolean isMarkedDS = false;
1931
1932  private String annotSetName = "Key";
1933  private String outputSetName = null;
1934
1935  private double threshold = 0.5;
1936  private Properties configs = new Properties();
1937  private static int corpusWordCount = 0;
1938
1939  private String documentEncoding = "";
1940
1941  /** String to print when wrong command-line args */
1942  private static String usage =
1943    "usage: CorpusBenchmarkTool [-generate|-marked_stored|-marked_clean] "
1944    +"[-verbose] [-moreinfo] directory-name application";
1945
1946}
1947