|
CorpusBenchmarkTool |
|
1 /* 2 * CorpusBenchmarkTool.java 3 * 4 * Copyright (c) 1998-2001, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Kalina Bontcheva, 24/Oct/2001 12 * 13 * $Id: CorpusBenchmarkTool.java,v 1.24 2002/03/06 17:15:48 kalina Exp $ 14 */ 15 16 package gate.util; 17 18 import java.util.*; 19 import java.io.*; 20 21 import gate.*; 22 import gate.creole.*; 23 import gate.util.*; 24 import gate.persist.*; 25 import gate.creole.tokeniser.*; 26 import gate.creole.gazetteer.*; 27 import gate.creole.splitter.*; 28 import gate.creole.orthomatcher.*; 29 import gate.creole.annotransfer.*; 30 import gate.annotation.*; 31 32 public class CorpusBenchmarkTool { 33 private static final String MARKED_DIR_NAME = "marked"; 34 private static final String CLEAN_DIR_NAME = "clean"; 35 private static final String CVS_DIR_NAME = "Cvs"; 36 private static final String PROCESSED_DIR_NAME = "processed"; 37 38 private static final boolean DEBUG = true; 39 40 public CorpusBenchmarkTool() {} 41 42 public void initPRs() { 43 try { 44 FeatureMap params = Factory.newFeatureMap(); 45 46 //create a default tokeniser 47 Out.prln("Loading tokeniser <P>"); 48 String rulesURL = this.configs.getProperty("tokeniserRulesURL"); 49 if (rulesURL != null && !rulesURL.equals("")) 50 params.put( 51 DefaultTokeniser.DEF_TOK_TOKRULES_URL_PARAMETER_NAME, rulesURL); 52 String grammarsURL = this.configs.getProperty("tokeniserGrammarURL"); 53 if (grammarsURL != null && !grammarsURL.equals("")) 54 params.put( 55 DefaultTokeniser.DEF_TOK_GRAMRULES_URL_PARAMETER_NAME, grammarsURL); 56 //the annots are put in temp, as they are going to be transfered to the 57 //new set 58 params.put(DefaultTokeniser.DEF_TOK_ANNOT_SET_PARAMETER_NAME, "temp"); 59 tokeniser = (DefaultTokeniser) Factory.createResource( 60 "gate.creole.tokeniser.DefaultTokeniser", params); 61 62 //create a default gazetteer 63 Out.prln("Loading gazetteer <P>"); 64 params.clear(); 65 String listsURL = this.configs.getProperty("gazetteerListsURL"); 66 if (listsURL != null && !listsURL.equals("")) { 67 params.put(DefaultGazetteer.DEF_GAZ_LISTS_URL_PARAMETER_NAME, listsURL); 68 Out.prln("Running gazetteer on lists in: " + listsURL + "<P>"); 69 } 70 String caseSensitive = this.configs.getProperty("gazetteerCaseSensitive"); 71 if (caseSensitive != null && !caseSensitive.equals("")) 72 params.put(DefaultGazetteer.DEF_GAZ_CASE_SENSITIVE_PARAMETER_NAME, 73 new Boolean(caseSensitive)); 74 params.put(DefaultGazetteer.DEF_GAZ_ANNOT_SET_PARAMETER_NAME, "temp"); 75 gazetteer = (DefaultGazetteer) Factory.createResource( 76 "gate.creole.gazetteer.DefaultGazetteer", params); 77 78 //create the Annotation set transfer 79 Out.prln("Loading annotation set transfer <P>"); 80 params.clear(); 81 params.put(AnnotationSetTransfer.AST_INPUT_AS_PARAMETER_NAME, "temp"); 82 params.put(AnnotationSetTransfer.AST_OUTPUT_AS_PARAMETER_NAME, annotSetName); 83 //by default make it transfer all annotations 84 params.put(AnnotationSetTransfer.AST_TEXT_TAG_PARAMETER_NAME, ""); 85 setTransfer = (AnnotationSetTransfer) Factory.createResource( 86 "gate.creole.annotransfer.AnnotationSetTransfer", params); 87 88 //create a splitter 89 Out.prln("Loading sentence splitter <P>"); 90 params.clear(); 91 listsURL = this.configs.getProperty("splitterGazetteerURL"); 92 if (listsURL != null && !listsURL.equals("")) 93 params.put(SentenceSplitter.SPLIT_GAZ_URL_PARAMETER_NAME, listsURL); 94 grammarsURL = this.configs.getProperty("splitterGrammarURL"); 95 if (grammarsURL != null && !grammarsURL.equals("")) 96 params.put(SentenceSplitter.SPLIT_TRANSD_URL_PARAMETER_NAME, grammarsURL); 97 params.put(SentenceSplitter.SPLIT_INPUT_AS_PARAMETER_NAME, annotSetName); 98 params.put(SentenceSplitter.SPLIT_OUTPUT_AS_PARAMETER_NAME, annotSetName); 99 splitter = (SentenceSplitter) Factory.createResource( 100 "gate.creole.splitter.SentenceSplitter", params); 101 102 //create a tagger 103 Out.prln("Loading POS tagger <P>"); 104 params.clear(); 105 String lexiconURL = this.configs.getProperty("taggerLexiconURL"); 106 if (lexiconURL != null && !lexiconURL.equals("")) 107 params.put(POSTagger.TAG_LEXICON_URL_PARAMETER_NAME, lexiconURL); 108 rulesURL = this.configs.getProperty("taggerRulesURL"); 109 if (rulesURL != null && !rulesURL.equals("")) 110 params.put(POSTagger.TAG_RULES_URL_PARAMETER_NAME, rulesURL); 111 params.put(POSTagger.TAG_INPUT_AS_PARAMETER_NAME, annotSetName); 112 params.put(POSTagger.TAG_OUTPUT_AS_PARAMETER_NAME, annotSetName); 113 tagger = (POSTagger) Factory.createResource( 114 "gate.creole.POSTagger", params); 115 116 //create a grammar 117 Out.prln("Loading grammars for transducer <P>"); 118 params.clear(); 119 String grammarURL = this.configs.getProperty("grammarURL"); 120 if (grammarURL != null && !grammarURL.equals("")) { 121 params.put(ANNIETransducer.TRANSD_GRAMMAR_URL_PARAMETER_NAME, grammarURL); 122 Out.prln("Running transducer on grammars in: " + grammarURL + "<P>"); 123 } 124 params.put(ANNIETransducer.TRANSD_INPUT_AS_PARAMETER_NAME, annotSetName); 125 params.put(ANNIETransducer.TRANSD_OUTPUT_AS_PARAMETER_NAME, annotSetName); 126 transducer = (ANNIETransducer) Factory.createResource( 127 "gate.creole.ANNIETransducer", params); 128 129 //create an orthomatcher 130 Out.prln("Loading orthomatcher <P>"); 131 params.clear(); 132 params.put(OrthoMatcher.OM_ANN_SET_PARAMETER_NAME, annotSetName); 133 orthomatcher = (OrthoMatcher) Factory.createResource( 134 "gate.creole.orthomatcher.OrthoMatcher", params); 135 } catch (ResourceInstantiationException ex) { 136 throw new GateRuntimeException("Corpus Benchmark Tool:"+ex.getMessage()); 137 } 138 }//initPRs 139 140 public void unloadPRs() { 141 //we have nothing to unload if no PRs are loaded 142 if (isMarkedStored) 143 return; 144 145 Factory.deleteResource(this.tokeniser); 146 Factory.deleteResource(this.gazetteer); 147 Factory.deleteResource(this.setTransfer); 148 Factory.deleteResource(this.splitter); 149 Factory.deleteResource(this.tagger); 150 Factory.deleteResource(this.transducer); 151 Factory.deleteResource(this.orthomatcher); 152 } 153 154 public void execute() { 155 execute(startDir); 156 } 157 158 public void init() { 159 //first read the corpus_tool.properties file 160 File propFile = new File("corpus_tool.properties"); 161 Out.prln(propFile.getAbsolutePath()); 162 if (propFile.exists()) { 163 try { 164 InputStream inputStream = new FileInputStream(propFile); 165 this.configs.load(inputStream); 166 String thresholdString = this.configs.getProperty("threshold"); 167 if (thresholdString != null && !thresholdString.equals("")) { 168 this.threshold = (new Double(thresholdString)).doubleValue(); 169 Out.prln("New threshold is: " + this.threshold + "<P>\n"); 170 } 171 String setName = this.configs.getProperty("annotSetName"); 172 if (setName != null && !setName.equals("")) 173 this.annotSetName = setName; 174 } catch (IOException ex) { 175 //just ignore the file and go on with the defaults 176 this.configs = new Properties(); 177 } 178 } else 179 this.configs = new Properties(); 180 181 182 //we only initialise the PRs if they are going to be used 183 //for processing unprocessed documents 184 if (!this.isMarkedStored) 185 initPRs(); 186 187 annotTypes = new ArrayList(); 188 annotTypes.add("Organization"); 189 annotTypes.add("Person"); 190 annotTypes.add("Date"); 191 annotTypes.add("Location"); 192 annotTypes.add("Address"); 193 annotTypes.add("Money"); 194 annotTypes.add("Percent"); 195 annotTypes.add("GPE"); 196 annotTypes.add("Facility"); 197 198 } 199 200 public void execute(File dir) { 201 if (dir == null) 202 return; 203 //first set the current directory to be the given one 204 currDir = dir; 205 Out.prln("Processing directory: " + currDir + "<P>"); 206 207 File processedDir = null; 208 File cleanDir = null; 209 File markedDir = null; 210 211 ArrayList subDirs = new ArrayList(); 212 File[] dirArray = currDir.listFiles(); 213 for (int i = 0; i < dirArray.length; i++) { 214 if (dirArray[i].isFile() || dirArray[i].getName().equals(CVS_DIR_NAME)) 215 continue; 216 if (dirArray[i].getName().equals(CLEAN_DIR_NAME)) 217 cleanDir = dirArray[i]; 218 else if (dirArray[i].getName().equals(MARKED_DIR_NAME)) 219 markedDir = dirArray[i]; 220 else if (dirArray[i].getName().equals(PROCESSED_DIR_NAME)) 221 processedDir = dirArray[i]; 222 else 223 subDirs.add(dirArray[i]); 224 } 225 226 if (this.isGenerateMode) 227 generateCorpus(cleanDir, processedDir); 228 else 229 evaluateCorpus(cleanDir, processedDir, markedDir); 230 231 //if no more subdirs left, return 232 if (subDirs.isEmpty()) 233 return; 234 235 //there are more subdirectories to traverse, so iterate through 236 for (int j = 0; j < subDirs.size(); j++) 237 execute((File) subDirs.get(j)); 238 239 }//execute(dir) 240 241 242 public static void main(String[] args) throws GateException { 243 Out.prln("<HTML>"); 244 Out.prln("<HEAD>"); 245 Out.prln("<TITLE> Corpus benchmark tool: ran with args " + 246 args.toString() + " on " + 247 new Date() + "</TITLE> </HEAD>"); 248 Out.prln("<BODY>"); 249 Out.prln("Please wait while GATE tools are initialised. <P>"); 250 // initialise GATE 251 Gate.init(); 252 253 CorpusBenchmarkTool corpusTool = new CorpusBenchmarkTool(); 254 255 List inputFiles = null; 256 if(args.length < 1) throw new GateException(usage); 257 int i = 0; 258 while (i < args.length && args[i].startsWith("-")) { 259 if(args[i].equals("-generate")) { 260 Out.prln("Generating the corpus... <P>"); 261 corpusTool.setGenerateMode(true); 262 } else if (args[i].equals("-marked_clean")) { 263 Out.prln("Evaluating current grammars against human-annotated...<P>"); 264 corpusTool.setMarkedClean(true); 265 } else if (args[i].equals("-marked_stored")) { 266 Out.prln("Evaluating stored documents against human-annotated...<P>"); 267 corpusTool.setMarkedStored(true); 268 } else if (args[i].equals("-verbose")) { 269 Out.prln("Running in verbose mode. Will generate annotation " + 270 "information when precision/recall are lower than " + 271 corpusTool.getThreshold() +"<P>"); 272 corpusTool.setVerboseMode(true); 273 } 274 i++; //just ignore the option, which we do not recognise 275 }//while 276 277 String dirName = args[i]; 278 File dir = new File(dirName); 279 if (!dir.isDirectory()) 280 throw new GateException(usage); 281 282 corpusTool.init(); 283 284 Out.prln("Measuring annotaitions of types: " + corpusTool.annotTypes + "<P>"); 285 286 corpusTool.setStartDirectory(dir); 287 corpusTool.execute(); 288 289 //if we're not generating the corpus, then print the precision and recall 290 //statistics for the processed corpus 291 if (! corpusTool.getGenerateMode()) 292 corpusTool.printStatistics(); 293 294 Out.prln("Finished! <P>"); 295 Out.prln("</BODY>"); 296 Out.prln("</HTML>"); 297 298 System.exit(0); 299 300 }//main 301 302 public void setGenerateMode(boolean mode) { 303 isGenerateMode = mode; 304 }//setGenerateMode 305 306 public boolean getGenerateMode() { 307 return isGenerateMode; 308 }//getGenerateMode 309 310 public boolean getVerboseMode() { 311 return isVerboseMode; 312 }//getVerboseMode 313 314 public void setVerboseMode(boolean mode) { 315 isVerboseMode = mode; 316 }//setVerboseMode 317 318 public void setMarkedStored(boolean mode) { 319 isMarkedStored = mode; 320 }// 321 322 public boolean getMarkedStored() { 323 return isMarkedStored; 324 }// 325 326 public void setMarkedClean(boolean mode) { 327 isMarkedClean = mode; 328 }// 329 330 public boolean getMarkedClean() { 331 return isMarkedClean; 332 }// 333 334 /** 335 * Returns the average precision over the entire set of processed documents. 336 * <P> 337 * If the tool has been evaluating the original documents against the 338 * previously-stored automatically annotated ones, then the precision 339 * will be the average precision on those two sets. <P> 340 * If the tool was run in -marked mode, i.e., was evaluating the stored 341 * automatically processed ones against the human-annotated ones, then 342 * the precision will be the average precision on those two sets of documents. 343 */ 344 public double getPrecisionAverage() { 345 return precisionSum/docNumber; 346 } 347 348 /** 349 * Returns the average recall over the entire set of processed documents. 350 * <P> 351 * If the tool has been evaluating the original documents against the 352 * previously-stored automatically annotated ones, then the recall 353 * will be the average recall on those two sets. <P> 354 * If the tool was run in -marked mode, i.e., was evaluating the stored 355 * automatically processed ones against the human-annotated ones, then 356 * the recall will be the average recall on those two sets of documents. 357 */ 358 public double getRecallAverage() { 359 return recallSum/docNumber; 360 } 361 362 public boolean isGenerateMode() { 363 return isGenerateMode == true; 364 }//isGenerateMode 365 366 public double getThreshold() { 367 return threshold; 368 } 369 370 public void setThreshold(double newValue) { 371 threshold = newValue; 372 } 373 374 public File getStartDirectory() { 375 return startDir; 376 }//getStartDirectory 377 378 public void setStartDirectory(File dir) { 379 startDir = dir; 380 }//setStartDirectory 381 382 protected void generateCorpus(File fileDir, File outputDir) { 383 //1. check if we have input files 384 if (fileDir == null) 385 return; 386 //2. create the output directory or clean it up if needed 387 File outDir = outputDir; 388 if (outputDir == null) { 389 outDir = new File(currDir, PROCESSED_DIR_NAME); 390 } else { 391 // get rid of the directory, coz datastore wants it clean 392 if (!Files.rmdir(outDir)) 393 Out.prln("cannot delete old output directory: " + outDir); 394 } 395 outDir.mkdir(); 396 397 //create the datastore and process each document 398 try { 399 SerialDataStore sds = new SerialDataStore(outDir.toURL().toString()); 400 sds.create(); 401 sds.open(); 402 403 File[] files = fileDir.listFiles(); 404 for (int i=0; i < files.length; i++) { 405 if (!files[i].isFile()) 406 continue; 407 // create a document 408 Out.prln("Processing and storing document: " + files[i].toURL() +"<P>"); 409 410 FeatureMap params = Factory.newFeatureMap(); 411 params.put(Document.DOCUMENT_URL_PARAMETER_NAME, files[i].toURL()); 412 params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, ""); 413 414 // create the document 415 Document doc = (Document) Factory.createResource( 416 "gate.corpora.DocumentImpl", params 417 ); 418 419 doc.setName(files[i].getName()); 420 if (doc == null) 421 continue; 422 processDocument(doc); 423 LanguageResource lr = sds.adopt(doc, null); 424 sds.sync(lr); 425 Factory.deleteResource(doc); 426 Factory.deleteResource(lr); 427 }//for 428 sds.close(); 429 } catch (java.net.MalformedURLException ex) { 430 throw new GateRuntimeException("CorpusBenchmark: " + ex.getMessage()); 431 } catch (PersistenceException ex1) { 432 throw new GateRuntimeException("CorpusBenchmark: " + ex1.getMessage()); 433 } catch (ResourceInstantiationException ex2) { 434 throw new GateRuntimeException("CorpusBenchmark: " + ex2.getMessage()); 435 } catch (gate.security.SecurityException ex3) { 436 throw new GateRuntimeException("CorpusBenchmark: " + ex3.getMessage()); 437 } 438 439 }//generateCorpus 440 441 protected void evaluateCorpus(File fileDir, 442 File processedDir, File markedDir) { 443 //1. check if we have input files and the processed Dir 444 if (fileDir == null || !fileDir.exists()) 445 return; 446 if (processedDir == null || !processedDir.exists()) 447 //if the user wants evaluation of marked and stored that's not possible 448 if (isMarkedStored) { 449 Out.prln("Cannot evaluate because no processed documents exist."); 450 return; 451 } 452 else 453 isMarkedClean = true; 454 455 //looked for marked texts only if the directory exists 456 boolean processMarked = markedDir != null && markedDir.exists(); 457 if (!processMarked && (isMarkedStored || isMarkedClean)) { 458 Out.prln("Cannot evaluate because no human-annotated documents exist."); 459 return; 460 } 461 462 if (isMarkedStored) { 463 evaluateMarkedStored(markedDir, processedDir); 464 return; 465 } else if (isMarkedClean) { 466 evaluateMarkedClean(markedDir, fileDir); 467 return; 468 } 469 470 Document persDoc = null; 471 Document cleanDoc = null; 472 Document markedDoc = null; 473 474 //open the datastore and process each document 475 try { 476 //open the data store 477 DataStore sds = Factory.openDataStore 478 ("gate.persist.SerialDataStore", 479 processedDir.toURL().toExternalForm()); 480 481 List lrIDs = sds.getLrIds("gate.corpora.DocumentImpl"); 482 for (int i=0; i < lrIDs.size(); i++) { 483 String docID = (String) lrIDs.get(i); 484 485 //read the stored document 486 FeatureMap features = Factory.newFeatureMap(); 487 features.put(DataStore.DATASTORE_FEATURE_NAME, sds); 488 features.put(DataStore.LR_ID_FEATURE_NAME, docID); 489 persDoc = (Document) Factory.createResource( 490 "gate.corpora.DocumentImpl", 491 features); 492 493 Out.prln("<H2>" + persDoc.getName() + "</H2>"); 494 495 File cleanDocFile = new File(fileDir, persDoc.getName()); 496 //try reading the original document from clean 497 if (! cleanDocFile.exists()) { 498 Out.prln("Warning: Cannot find original document " + 499 persDoc.getName() + " in " + fileDir); 500 } else { 501 FeatureMap params = Factory.newFeatureMap(); 502 params.put(Document.DOCUMENT_URL_PARAMETER_NAME, cleanDocFile.toURL()); 503 params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, ""); 504 505 // create the document 506 cleanDoc = (Document) Factory.createResource( 507 "gate.corpora.DocumentImpl", params); 508 cleanDoc.setName(persDoc.getName()); 509 } 510 511 //try finding the marked document 512 StringBuffer docName = new StringBuffer(persDoc.getName()); 513 docName.replace( 514 persDoc.getName().lastIndexOf("."), 515 docName.length(), 516 ".xml"); 517 File markedDocFile = new File(markedDir, docName.toString()); 518 if (! processMarked || ! markedDocFile.exists()) { 519 Out.prln("Warning: Cannot find human-annotated document " + 520 markedDocFile + " in " + markedDir); 521 } else { 522 FeatureMap params = Factory.newFeatureMap(); 523 params.put(Document.DOCUMENT_URL_PARAMETER_NAME, markedDocFile.toURL()); 524 params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, ""); 525 526 // create the document 527 markedDoc = (Document) Factory.createResource( 528 "gate.corpora.DocumentImpl", params); 529 markedDoc.setName(persDoc.getName()); 530 } 531 532 evaluateDocuments(persDoc, cleanDoc, markedDoc); 533 if (persDoc != null) 534 Factory.deleteResource(persDoc); 535 if (cleanDoc != null) 536 Factory.deleteResource(cleanDoc); 537 if (markedDoc != null) 538 Factory.deleteResource(markedDoc); 539 540 }//for loop through saved docs 541 sds.close(); 542 } catch (java.net.MalformedURLException ex) { 543 throw new GateRuntimeException("CorpusBenchmark: " + ex.getMessage()); 544 } catch (PersistenceException ex1) { 545 throw new GateRuntimeException("CorpusBenchmark: " + ex1.getMessage()); 546 } catch (ResourceInstantiationException ex2) { 547 throw new GateRuntimeException("CorpusBenchmark: " + ex2.getMessage()); 548 } 549 550 }//evaluateCorpus 551 552 protected void evaluateMarkedStored(File markedDir, File storedDir) { 553 Document persDoc = null; 554 Document cleanDoc = null; 555 Document markedDoc = null; 556 557 //open the datastore and process each document 558 try { 559 //open the data store 560 DataStore sds = Factory.openDataStore 561 ("gate.persist.SerialDataStore", 562 storedDir.toURL().toExternalForm()); 563 564 List lrIDs = sds.getLrIds("gate.corpora.DocumentImpl"); 565 for (int i=0; i < lrIDs.size(); i++) { 566 String docID = (String) lrIDs.get(i); 567 568 //read the stored document 569 FeatureMap features = Factory.newFeatureMap(); 570 features.put(DataStore.DATASTORE_FEATURE_NAME, sds); 571 features.put(DataStore.LR_ID_FEATURE_NAME, docID); 572 persDoc = (Document) Factory.createResource( 573 "gate.corpora.DocumentImpl", 574 features); 575 576 Out.prln("<H2>" + persDoc.getName() + "</H2>"); 577 578 //try finding the marked document 579 StringBuffer docName = new StringBuffer(persDoc.getName()); 580 docName.replace( 581 persDoc.getName().lastIndexOf("."), 582 docName.length(), 583 ".xml"); 584 File markedDocFile = new File(markedDir, docName.toString()); 585 if (! markedDocFile.exists()) { 586 Out.prln("Warning: Cannot find human-annotated document " + 587 markedDocFile + " in " + markedDir); 588 } else { 589 FeatureMap params = Factory.newFeatureMap(); 590 params.put(Document.DOCUMENT_URL_PARAMETER_NAME, markedDocFile.toURL()); 591 params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, ""); 592 593 // create the document 594 markedDoc = (Document) Factory.createResource( 595 "gate.corpora.DocumentImpl", params); 596 markedDoc.setName(persDoc.getName()); 597 } 598 599 evaluateDocuments(persDoc, cleanDoc, markedDoc); 600 if (persDoc != null) 601 Factory.deleteResource(persDoc); 602 if (markedDoc != null) 603 Factory.deleteResource(markedDoc); 604 605 }//for loop through saved docs 606 sds.close(); 607 608 } catch (java.net.MalformedURLException ex) { 609 throw new GateRuntimeException("CorpusBenchmark: " + ex.getMessage()); 610 } catch (PersistenceException ex1) { 611 throw new GateRuntimeException("CorpusBenchmark: " + ex1.getMessage()); 612 } catch (ResourceInstantiationException ex2) { 613 throw new GateRuntimeException("CorpusBenchmark: " + ex2.getMessage()); 614 } 615 616 }//evaluateMarkedStored 617 618 619 protected void evaluateMarkedClean(File markedDir, File cleanDir) { 620 Document persDoc = null; 621 Document cleanDoc = null; 622 Document markedDoc = null; 623 624 File[] cleanDocs = cleanDir.listFiles(); 625 for (int i = 0; i< cleanDocs.length; i++) { 626 if (!cleanDocs[i].isFile()) 627 continue; 628 629 //try reading the original document from clean 630 FeatureMap params = Factory.newFeatureMap(); 631 try { 632 params.put(Document.DOCUMENT_URL_PARAMETER_NAME, cleanDocs[i].toURL()); 633 } catch (java.net.MalformedURLException ex) { 634 Out.prln("Cannot create document from file: " + 635 cleanDocs[i].getAbsolutePath()); 636 continue; 637 } 638 params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, ""); 639 640 // create the document 641 try { 642 cleanDoc = (Document) Factory.createResource( 643 "gate.corpora.DocumentImpl", params, 644 null, cleanDocs[i].getName()); 645 } catch (gate.creole.ResourceInstantiationException ex) { 646 Out.prln("Cannot create document from file: " + 647 cleanDocs[i].getAbsolutePath()); 648 continue; 649 } 650 651 Out.prln("<TD>" + cleanDocs[i].getName() + "</TD>"); 652 653 //try finding the marked document 654 StringBuffer docName = new StringBuffer(cleanDoc.getName()); 655 docName.replace( 656 cleanDoc.getName().lastIndexOf("."), 657 docName.length(), 658 ".xml"); 659 File markedDocFile = new File(markedDir, docName.toString()); 660 if (! markedDocFile.exists()) { 661 Out.prln("Warning: Cannot find human-annotated document " + 662 markedDocFile + " in " + markedDir); 663 continue; 664 } else { 665 params = Factory.newFeatureMap(); 666 try { 667 params.put(Document.DOCUMENT_URL_PARAMETER_NAME, markedDocFile.toURL()); 668 } catch (java.net.MalformedURLException ex) { 669 Out.prln("Cannot create document from file: " + 670 markedDocFile.getAbsolutePath()); 671 continue; 672 } 673 params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, ""); 674 675 // create the document 676 try { 677 markedDoc = (Document) Factory.createResource( 678 "gate.corpora.DocumentImpl", params, 679 null, cleanDoc.getName()); 680 } catch (gate.creole.ResourceInstantiationException ex) { 681 Out.prln("Cannot create document from file: " + 682 markedDocFile.getAbsolutePath()); 683 continue; 684 } 685 686 }//if markedDoc exists 687 688 try { 689 evaluateDocuments(persDoc, cleanDoc, markedDoc); 690 } catch (gate.creole.ResourceInstantiationException ex) { 691 ex.printStackTrace(); 692 Out.prln("Evaluate failed on document: " + cleanDoc.getName()); 693 } 694 if (persDoc != null) 695 Factory.deleteResource(persDoc); 696 if (cleanDoc != null) 697 Factory.deleteResource(cleanDoc); 698 if (markedDoc != null) 699 Factory.deleteResource(markedDoc); 700 701 }//for loop through clean docs 702 703 704 }//evaluateMarkedClean 705 706 protected void processDocument(Document doc) { 707 try { 708 tokeniser.setDocument(doc); 709 tokeniser.execute(); 710 711 gazetteer.setDocument(doc); 712 gazetteer.execute(); 713 714 String textTagName = configs.getProperty("astTEXTTagName"); 715 if (textTagName != null && !textTagName.equals("")) 716 setTransfer.setTextTagName(textTagName); 717 setTransfer.setDocument(doc); 718 setTransfer.execute(); 719 720 splitter.setDocument(doc); 721 splitter.execute(); 722 723 tagger.setDocument(doc); 724 tagger.execute(); 725 726 transducer.setDocument(doc); 727 transducer.execute(); 728 729 orthomatcher.setDocument(doc); 730 orthomatcher.execute(); 731 } catch (gate.creole.ExecutionException ex) { 732 throw new GateRuntimeException("Corpus generation error: " + 733 ex.getMessage()); 734 } 735 } 736 737 protected void evaluateDocuments(Document persDoc, 738 Document cleanDoc, Document markedDoc) 739 throws ResourceInstantiationException { 740 if (cleanDoc == null && markedDoc == null) 741 return; 742 743 //we've got no types to compare 744 if (annotTypes == null || annotTypes.isEmpty()) 745 return; 746 747 if (cleanDoc != null && !isMarkedStored) { 748 749 processDocument(cleanDoc); 750 751 if(!isMarkedClean) 752 evaluateAllThree(persDoc, cleanDoc, markedDoc); 753 else 754 evaluateTwoDocs(markedDoc, cleanDoc); 755 756 } else 757 evaluateTwoDocs(markedDoc, persDoc); 758 759 } 760 761 protected void evaluateAllThree(Document persDoc, 762 Document cleanDoc, Document markedDoc) 763 throws ResourceInstantiationException { 764 //first start the table and its header 765 printTableHeader(); 766 for (int jj= 0; jj< annotTypes.size(); jj++) { 767 String annotType = (String) annotTypes.get(jj); 768 769 AnnotationDiff annotDiff=measureDocs(markedDoc, cleanDoc, annotType); 770 //we don't have this annotation type in this document 771 if (annotDiff == null) 772 continue; 773 Out.prln("<TR>"); 774 775 //increase the number of processed documents 776 docNumber++; 777 //add precison and recall to the sums 778 updateStatistics(annotDiff, annotType); 779 780 Out.prln("<TD> Annotation type: " + annotType + "</TD>"); 781 782 AnnotationDiff annotDiff1 = 783 measureDocs(markedDoc, persDoc, annotType); 784 785 Out.prln("<TD>" + annotDiff.getPrecisionAverage()); 786 //check the precision first 787 if (annotDiff1 != null && 788 annotDiff!= null && 789 annotDiff1.getPrecisionAverage()<annotDiff.getPrecisionAverage() 790 ) 791 Out.prln("<P> Precision increase on human-marked from " + 792 annotDiff1.getPrecisionAverage() + " to " + 793 annotDiff.getPrecisionAverage() + "</P>"); 794 else if (annotDiff1 != null 795 && annotDiff != null 796 && annotDiff1.getPrecisionAverage() 797 > annotDiff.getPrecisionAverage()) 798 Out.prln("<P> Precision decrease on human-marked from " + 799 annotDiff1.getPrecisionAverage() + " to " + 800 annotDiff.getPrecisionAverage() + "</P>"); 801 Out.prln("</TD>"); 802 803 Out.prln("<TD>" + annotDiff.getRecallAverage()); 804 //check the recall now 805 if (annotDiff1 != null && 806 annotDiff!= null && 807 annotDiff1.getRecallAverage()<annotDiff.getRecallAverage() 808 ) 809 Out.prln("<P> Recall increase on human-marked from " + 810 annotDiff1.getRecallAverage() + " to " + 811 annotDiff.getRecallAverage() + "</P>"); 812 else if (annotDiff1 != null 813 && annotDiff != null 814 && annotDiff1.getRecallAverage() 815 > annotDiff.getRecallAverage()) 816 Out.prln("<P> Recall decrease on human-marked from " + 817 annotDiff1.getRecallAverage() + " to " + 818 annotDiff.getRecallAverage() + "</P>"); 819 820 Out.prln("</TD>"); 821 822 //check the recall now 823 if ( isVerboseMode 824 && 825 ((annotDiff.getRecallAverage() < threshold 826 || 827 annotDiff.getRecallAverage() < threshold) 828 ) 829 ) 830 printAnnotations(annotDiff, markedDoc, cleanDoc); 831 832 833 Out.prln("</TR>"); 834 }//for loop through annotation types 835 Out.prln("</TABLE>"); 836 837 }//evaluateAllThree 838 839 protected void evaluateTwoDocs(Document keyDoc, Document respDoc) 840 throws ResourceInstantiationException { 841 842 //first start the table and its header 843 printTableHeader(); 844 for (int jj= 0; jj< annotTypes.size(); jj++) { 845 String annotType = (String) annotTypes.get(jj); 846 847 AnnotationDiff annotDiff=measureDocs(keyDoc, respDoc, annotType); 848 //we don't have this annotation type in this document 849 if (annotDiff == null) 850 continue; 851 Out.prln("<TR>"); 852 853 //increase the number of processed documents 854 docNumber++; 855 //add precison and recall to the sums 856 updateStatistics(annotDiff, annotType); 857 858 Out.prln("<TD>" + annotType + "</TD>"); 859 860 Out.prln("<TD>" + annotDiff.getPrecisionAverage() + "</TD>"); 861 Out.prln("<TD>" + annotDiff.getRecallAverage() + "</TD>"); 862 //check the recall now 863 if ( isVerboseMode 864 && 865 ((annotDiff.getRecallAverage() < threshold 866 || 867 annotDiff.getRecallAverage() < threshold) 868 ) 869 ) 870 printAnnotations(annotDiff, keyDoc, respDoc); 871 872 Out.prln("</TR>"); 873 }//for loop through annotation types 874 Out.prln("</TABLE>"); 875 876 }//evaluateTwoDocs 877 878 protected void printTableHeader() { 879 Out.prln("<TABLE BORDER=1"); 880 if (isVerboseMode) 881 Out.prln("<TR> <TD><B>Annotation Type</B></TD> <TD><B>Precision</B></TD> " 882 + "<TD><B>Recall</B></TD> <TD><B>Annotations<B></TD>"); 883 else 884 Out.prln("<TR> <TD><B>Annotation Type</B></TD> <TD><B>Precision</B></TD> " 885 + "<TD><B>Recall</B></TD>"); 886 } 887 888 protected void updateStatistics(AnnotationDiff annotDiff, String annotType){ 889 precisionSum += annotDiff.getPrecisionAverage(); 890 recallSum += annotDiff.getRecallAverage(); 891 Double oldPrecision = (Double) precisionByType.get(annotType); 892 if (oldPrecision == null) 893 precisionByType.put(annotType, 894 new Double(annotDiff.getPrecisionAverage())); 895 else 896 precisionByType.put(annotType, 897 new Double(oldPrecision.doubleValue() + 898 annotDiff.getPrecisionAverage())); 899 Integer precCount = (Integer) prCountByType.get(annotType); 900 if (precCount == null) 901 prCountByType.put(annotType, new Integer(1)); 902 else 903 prCountByType.put(annotType, new Integer(precCount.intValue() + 1)); 904 905 906 Double oldRecall = (Double) recallByType.get(annotType); 907 if (oldRecall == null) 908 recallByType.put(annotType, 909 new Double(annotDiff.getRecallAverage())); 910 else 911 recallByType.put(annotType, 912 new Double(oldRecall.doubleValue() + 913 annotDiff.getRecallAverage())); 914 Integer recCount = (Integer) recCountByType.get(annotType); 915 if (recCount == null) 916 recCountByType.put(annotType, new Integer(1)); 917 else 918 recCountByType.put(annotType, new Integer(recCount.intValue() + 1)); 919 920 } 921 922 protected void printStatistics() { 923 924 Out.prln("<H2> Statistics </H2>"); 925 Out.prln("<H3> Precision </H3>"); 926 if (precisionByType != null && !precisionByType.isEmpty()) { 927 Iterator iter = precisionByType.keySet().iterator(); 928 while (iter.hasNext()) { 929 String annotType = (String) iter.next(); 930 Out.prln(annotType + ": " 931 + ((Double)precisionByType.get(annotType)).doubleValue() 932 / 933 ((Integer)prCountByType.get(annotType)).intValue() 934 + "<P>"); 935 }//while 936 } 937 Out.prln("Overall precision: " + getPrecisionAverage() + "<P>"); 938 939 Out.prln("<H3> Recall </H3>"); 940 if (recallByType != null && !recallByType.isEmpty()) { 941 Iterator iter = recallByType.keySet().iterator(); 942 while (iter.hasNext()) { 943 String annotType = (String) iter.next(); 944 Out.prln(annotType + ": " 945 + ((Double)recallByType.get(annotType)).doubleValue() 946 / 947 ((Integer)recCountByType.get(annotType)).intValue() 948 + "<P>"); 949 }//while 950 } 951 952 Out.prln("Overall recall: " + getRecallAverage() 953 + "<P>"); 954 } 955 956 protected AnnotationDiff measureDocs( 957 Document keyDoc, Document respDoc, String annotType) 958 throws ResourceInstantiationException { 959 960 if (keyDoc == null || respDoc == null) 961 return null; 962 963 if (annotSetName != null 964 && keyDoc.getAnnotations(annotSetName).get(annotType) == null) 965 return null; 966 else if ((annotSetName == null || annotSetName.equals("")) 967 && keyDoc.getAnnotations().get(annotType) == null) 968 return null; 969 970 // create the annotation schema needed for AnnotationDiff 971 AnnotationSchema annotationSchema = new AnnotationSchema(); 972 973 // organization type 974 annotationSchema.setAnnotationName(annotType); 975 // create an annotation diff 976 AnnotationDiff annotDiff = new AnnotationDiff(); 977 annotDiff.setAnnotationSchema(annotationSchema); 978 annotDiff.setKeyDocument(keyDoc); 979 annotDiff.setResponseDocument(respDoc); 980 annotDiff.setKeyAnnotationSetName(annotSetName); 981 annotDiff.setResponseAnnotationSetName(annotSetName); 982 annotDiff.setKeyFeatureNamesSet(new HashSet()); 983 annotDiff.setTextMode(new Boolean(true)); 984 annotDiff.init(); 985 986 return annotDiff; 987 } 988 989 protected void printAnnotations(AnnotationDiff annotDiff, 990 Document keyDoc, Document respDoc) { 991 Out.prln("<TD>"); 992 Out.pr("MISSING ANNOTATIONS in the automatic texts: "); 993 Set missingSet = 994 annotDiff.getAnnotationsOfType(AnnotationDiff.MISSING_TYPE); 995 printAnnotations(missingSet, keyDoc); 996 Out.prln("<BR>"); 997 998 Out.pr("SPURIOUS ANNOTATIONS in the automatic texts: "); 999 Set spuriousSet = 1000 annotDiff.getAnnotationsOfType(AnnotationDiff.SPURIOUS_TYPE); 1001 printAnnotations(spuriousSet, respDoc); 1002 Out.prln("</BR>"); 1003 1004 Out.pr("PARTIALLY CORRECT ANNOTATIONS in the automatic texts: "); 1005 Set partialSet = 1006 annotDiff.getAnnotationsOfType(AnnotationDiff.PARTIALLY_CORRECT_TYPE); 1007 printAnnotations(partialSet, respDoc); 1008 Out.prln("</TD>"); 1009 1010 } 1011 1012 protected void printAnnotations(Set set, Document doc) { 1013 if (set == null || set.isEmpty()) 1014 return; 1015 1016 Iterator iter = set.iterator(); 1017 while (iter.hasNext()) { 1018 Annotation ann = (Annotation) iter.next(); 1019 Out.prln( 1020 "<B>" + 1021 doc.getContent().toString().substring( 1022 ann.getStartNode().getOffset().intValue(), 1023 ann.getEndNode().getOffset().intValue()) + 1024 "</B>: <I>[" + ann.getStartNode().getOffset() + 1025 "," + ann.getEndNode().getOffset() + "]</I>" 1026// + "; features" + ann.getFeatures() 1027 ); 1028 }//while 1029 } 1030 1031 /** 1032 * The directory from which we should generate/evaluate the corpus 1033 */ 1034 private File startDir; 1035 private File currDir; 1036 private static List annotTypes; 1037 1038 private DefaultTokeniser tokeniser; 1039 private DefaultGazetteer gazetteer; 1040 private SentenceSplitter splitter; 1041 private POSTagger tagger; 1042 private ANNIETransducer transducer; 1043 private OrthoMatcher orthomatcher; 1044 private AnnotationSetTransfer setTransfer; 1045 1046 //collect the sum of all precisions and recalls of all docs 1047 //and the number of docs, so I can calculate the average for 1048 //the corpus at the end 1049 private double precisionSum = 0; 1050 private double recallSum = 0; 1051 private HashMap precisionByType = new HashMap(); 1052 private HashMap prCountByType = new HashMap(); 1053 private HashMap recallByType = new HashMap(); 1054 private HashMap recCountByType = new HashMap(); 1055 private int docNumber = 0; 1056 1057 /** 1058 * If true, the corpus tool will generate the corpus, otherwise it'll 1059 * run in evaluate mode 1060 */ 1061 private boolean isGenerateMode = false; 1062 private boolean isVerboseMode = false; 1063 1064 /** 1065 * If true, the corpus tool will evaluate stored against the human-marked 1066 * documents 1067 */ 1068 private boolean isMarkedStored = false; 1069 private boolean isMarkedClean = false; 1070 1071 private String annotSetName = "Key"; 1072 1073 private double threshold = 0.5; 1074 private Properties configs = new Properties(); 1075 1076 /** String to print when wrong command-line args */ 1077 private static String usage = 1078 "usage: CorpusBenchmarkTool [-generate|-marked_stored|-marked_clean] [-verbose] directory-name"; 1079 1080}
|
CorpusBenchmarkTool |
|