|
CorpusBenchmarkTool |
|
1 /* 2 * CorpusBenchmarkTool.java 3 * 4 * Copyright (c) 1998-2001, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Kalina Bontcheva, 24/Oct/2001 12 * 13 * $Id: CorpusBenchmarkTool.java,v 1.18 2001/11/13 15:32:05 valyt Exp $ 14 */ 15 16 package gate.util; 17 18 import java.util.*; 19 import java.io.*; 20 21 import gate.*; 22 import gate.creole.*; 23 import gate.util.*; 24 import gate.persist.*; 25 import gate.creole.tokeniser.*; 26 import gate.creole.gazetteer.*; 27 import gate.creole.splitter.*; 28 import gate.creole.orthomatcher.*; 29 import gate.creole.annotransfer.*; 30 import gate.annotation.*; 31 32 public class CorpusBenchmarkTool { 33 private static final String MARKED_DIR_NAME = "marked"; 34 private static final String CLEAN_DIR_NAME = "clean"; 35 private static final String CVS_DIR_NAME = "Cvs"; 36 private static final String PROCESSED_DIR_NAME = "processed"; 37 38 private static final boolean DEBUG = true; 39 40 public CorpusBenchmarkTool() {} 41 42 public void initPRs() { 43 try { 44 FeatureMap params = Factory.newFeatureMap(); 45 46 //create a default tokeniser 47 Out.prln("Loading tokeniser <P>"); 48 String rulesURL = this.configs.getProperty("tokeniserRulesURL"); 49 if (rulesURL != null && !rulesURL.equals("")) 50 params.put("tokeniserRulesURL", rulesURL); 51 String grammarsURL = this.configs.getProperty("tokeniserGrammarURL"); 52 if (grammarsURL != null && !grammarsURL.equals("")) 53 params.put("transducerGrammarURL", grammarsURL); 54 //the annots are put in temp, as they are going to be transfered to the 55 //new set 56 params.put("annotationSetName", "temp"); 57 tokeniser = (DefaultTokeniser) Factory.createResource( 58 "gate.creole.tokeniser.DefaultTokeniser", params); 59 60 //create a default gazetteer 61 Out.prln("Loading gazetteer <P>"); 62 params.clear(); 63 String listsURL = this.configs.getProperty("gazetteerListsURL"); 64 if (listsURL != null && !listsURL.equals("")) 65 params.put("listsURL", listsURL); 66 String caseSensitive = this.configs.getProperty("gazetteerCaseSensitive"); 67 if (caseSensitive != null && !caseSensitive.equals("")) 68 params.put("caseSensitive", new Boolean(caseSensitive)); 69 params.put("annotationSetName", "temp"); 70 gazetteer = (DefaultGazetteer) Factory.createResource( 71 "gate.creole.gazetteer.DefaultGazetteer", params); 72 73 //create the Annotation set transfer 74 Out.prln("Loading annotation set transfer <P>"); 75 params.clear(); 76 params.put("inputASName", "temp"); 77 params.put("outputASName", annotSetName); 78 //by default make it transfer all annotations 79 params.put("textTagName", ""); 80 setTransfer = (AnnotationSetTransfer) Factory.createResource( 81 "gate.creole.annotransfer.AnnotationSetTransfer", params); 82 83 //create a splitter 84 Out.prln("Loading sentence splitter <P>"); 85 params.clear(); 86 listsURL = this.configs.getProperty("splitterGazetteerURL"); 87 if (listsURL != null && !listsURL.equals("")) 88 params.put("gazetteerListsURL", listsURL); 89 grammarsURL = this.configs.getProperty("splitterGrammarURL"); 90 if (grammarsURL != null && !grammarsURL.equals("")) 91 params.put("transducerURL", grammarsURL); 92 params.put("inputASName", annotSetName); 93 params.put("outputASName", annotSetName); 94 splitter = (SentenceSplitter) Factory.createResource( 95 "gate.creole.splitter.SentenceSplitter", params); 96 97 //create a tagger 98 Out.prln("Loading POS tagger <P>"); 99 params.clear(); 100 String lexiconURL = this.configs.getProperty("taggerLexiconURL"); 101 if (lexiconURL != null && !lexiconURL.equals("")) 102 params.put("lexiconURL", lexiconURL); 103 rulesURL = this.configs.getProperty("taggerRulesURL"); 104 if (rulesURL != null && !rulesURL.equals("")) 105 params.put("rulesURL", rulesURL); 106 params.put("inputASName", annotSetName); 107 params.put("outputASName", annotSetName); 108 tagger = (POSTagger) Factory.createResource( 109 "gate.creole.POSTagger", params); 110 111 //create a grammar 112 Out.prln("Loading grammars for transducer <P>"); 113 params.clear(); 114 String grammarURL = this.configs.getProperty("grammarURL"); 115 if (grammarURL != null && !grammarURL.equals("")) 116 params.put("grammarURL", grammarURL); 117 params.put("inputASName", annotSetName); 118 params.put("outputASName", annotSetName); 119 transducer = (ANNIETransducer) Factory.createResource( 120 "gate.creole.ANNIETransducer", params); 121 122 //create an orthomatcher 123 Out.prln("Loading orthomatcher <P>"); 124 params.clear(); 125 params.put("annotationSetName", annotSetName); 126 orthomatcher = (OrthoMatcher) Factory.createResource( 127 "gate.creole.orthomatcher.OrthoMatcher", params); 128 } catch (ResourceInstantiationException ex) { 129 throw new GateRuntimeException("Corpus Benchmark Tool:"+ex.getMessage()); 130 } 131 }//initPRs 132 133 public void execute() { 134 execute(startDir); 135 } 136 137 public void init() { 138 //we only initialise the PRs if they are going to be used 139 //for processing unprocessed documents 140 if (!this.isMarkedStored) 141 initPRs(); 142 143 annotTypes = new ArrayList(); 144 annotTypes.add("Organization"); 145 annotTypes.add("Person"); 146 annotTypes.add("Date"); 147 annotTypes.add("Location"); 148 annotTypes.add("Address"); 149 annotTypes.add("Money"); 150 annotTypes.add("Percent"); 151 annotTypes.add("GPE"); 152 annotTypes.add("Facility"); 153 154 } 155 156 public void execute(File dir) { 157 if (dir == null) 158 return; 159 //first set the current directory to be the given one 160 currDir = dir; 161 Out.prln("Processing directory: " + currDir + "<P>"); 162 163 File processedDir = null; 164 File cleanDir = null; 165 File markedDir = null; 166 167 ArrayList subDirs = new ArrayList(); 168 File[] dirArray = currDir.listFiles(); 169 for (int i = 0; i < dirArray.length; i++) { 170 if (dirArray[i].isFile() || dirArray[i].getName().equals(CVS_DIR_NAME)) 171 continue; 172 if (dirArray[i].getName().equals(CLEAN_DIR_NAME)) 173 cleanDir = dirArray[i]; 174 else if (dirArray[i].getName().equals(MARKED_DIR_NAME)) 175 markedDir = dirArray[i]; 176 else if (dirArray[i].getName().equals(PROCESSED_DIR_NAME)) 177 processedDir = dirArray[i]; 178 else 179 subDirs.add(dirArray[i]); 180 } 181 182 if (this.isGenerateMode) 183 generateCorpus(cleanDir, processedDir); 184 else 185 evaluateCorpus(cleanDir, processedDir, markedDir); 186 187 //if no more subdirs left, return 188 if (subDirs.isEmpty()) 189 return; 190 191 //there are more subdirectories to traverse, so iterate through 192 for (int j = 0; j < subDirs.size(); j++) 193 execute((File) subDirs.get(j)); 194 195 }//execute(dir) 196 197 198 public static void main(String[] args) throws GateException { 199 Out.prln("<HTML>"); 200 Out.prln("<HEAD>"); 201 Out.prln("<TITLE> Corpus benchmark tool: ran with args " + 202 args.toString() + " on " + 203 new Date() + "</TITLE> </HEAD>"); 204 Out.prln("<BODY>"); 205 Out.prln("Please wait while GATE tools are initialised. <P>"); 206 // initialise GATE 207 Gate.init(); 208 209 CorpusBenchmarkTool corpusTool = new CorpusBenchmarkTool(); 210 211 List inputFiles = null; 212 if(args.length < 1) throw new GateException(usage); 213 int i = 0; 214 while (i < args.length && args[i].startsWith("-")) { 215 if(args[i].equals("-generate")) { 216 Out.prln("Generating the corpus... <P>"); 217 corpusTool.setGenerateMode(true); 218 } else if (args[i].equals("-marked_clean")) { 219 Out.prln("Evaluating current grammars against human-annotated...<P>"); 220 corpusTool.setMarkedClean(true); 221 } else if (args[i].equals("-marked_stored")) { 222 Out.prln("Evaluating stored documents against human-annotated...<P>"); 223 corpusTool.setMarkedStored(true); 224 } else if (args[i].equals("-verbose")) { 225 Out.prln("Running in verbose mode. Will generate annotation " + 226 "information when precision/recall are lower than " + 227 corpusTool.getThreshold() +"<P>"); 228 corpusTool.setVerboseMode(true); 229 } 230 i++; //just ignore the option, which we do not recognise 231 }//while 232 233 String dirName = args[i]; 234 File dir = new File(dirName); 235 if (!dir.isDirectory()) 236 throw new GateException(usage); 237 238 File propFile = new File("corpus_tool.properties"); 239 Out.prln(propFile.getAbsolutePath()); 240 if (propFile.exists()) { 241 try { 242 InputStream inputStream = new FileInputStream(propFile); 243 corpusTool.configs.load(inputStream); 244 String thresholdString = corpusTool.configs.getProperty("threshold"); 245 if (thresholdString != null && !thresholdString.equals("")) { 246 corpusTool.threshold = (new Double(thresholdString)).doubleValue(); 247 Out.prln("new threshold is: " + corpusTool.threshold); 248 } 249 String setName = corpusTool.configs.getProperty("annotSetName"); 250 if (setName != null && !setName.equals("")) 251 corpusTool.annotSetName = setName; 252 } catch (IOException ex) { 253 //just ignore the file and go on with the defaults 254 corpusTool.configs = new Properties(); 255 } 256 } else 257 corpusTool.configs = new Properties(); 258 259 corpusTool.init(); 260 261 Out.prln("Measuring annotaitions of types: " + corpusTool.annotTypes + "<P>"); 262 263 corpusTool.setStartDirectory(dir); 264 corpusTool.execute(); 265 266 //if we're not generating the corpus, then print the precision and recall 267 //statistics for the processed corpus 268 if (! corpusTool.getGenerateMode()) 269 corpusTool.printStatistics(); 270 271 Out.prln("Finished! <P>"); 272 Out.prln("</BODY>"); 273 Out.prln("</HTML>"); 274 275 System.exit(0); 276 277 }//main 278 279 public void setGenerateMode(boolean mode) { 280 isGenerateMode = mode; 281 }//setGenerateMode 282 283 public boolean getGenerateMode() { 284 return isGenerateMode; 285 }//getGenerateMode 286 287 public boolean getVerboseMode() { 288 return isVerboseMode; 289 }//getVerboseMode 290 291 public void setVerboseMode(boolean mode) { 292 isVerboseMode = mode; 293 }//setVerboseMode 294 295 public void setMarkedStored(boolean mode) { 296 isMarkedStored = mode; 297 }// 298 299 public boolean getMarkedStored() { 300 return isMarkedStored; 301 }// 302 303 public void setMarkedClean(boolean mode) { 304 isMarkedClean = mode; 305 }// 306 307 public boolean getMarkedClean() { 308 return isMarkedClean; 309 }// 310 311 /** 312 * Returns the average precision over the entire set of processed documents. 313 * <P> 314 * If the tool has been evaluating the original documents against the 315 * previously-stored automatically annotated ones, then the precision 316 * will be the average precision on those two sets. <P> 317 * If the tool was run in -marked mode, i.e., was evaluating the stored 318 * automatically processed ones against the human-annotated ones, then 319 * the precision will be the average precision on those two sets of documents. 320 */ 321 public double getPrecisionAverage() { 322 return precisionSum/docNumber; 323 } 324 325 /** 326 * Returns the average recall over the entire set of processed documents. 327 * <P> 328 * If the tool has been evaluating the original documents against the 329 * previously-stored automatically annotated ones, then the recall 330 * will be the average recall on those two sets. <P> 331 * If the tool was run in -marked mode, i.e., was evaluating the stored 332 * automatically processed ones against the human-annotated ones, then 333 * the recall will be the average recall on those two sets of documents. 334 */ 335 public double getRecallAverage() { 336 return recallSum/docNumber; 337 } 338 339 public boolean isGenerateMode() { 340 return isGenerateMode == true; 341 }//isGenerateMode 342 343 public double getThreshold() { 344 return threshold; 345 } 346 347 public void setThreshold(double newValue) { 348 threshold = newValue; 349 } 350 351 public File getStartDirectory() { 352 return startDir; 353 }//getStartDirectory 354 355 public void setStartDirectory(File dir) { 356 startDir = dir; 357 }//setStartDirectory 358 359 protected void generateCorpus(File fileDir, File outputDir) { 360 //1. check if we have input files 361 if (fileDir == null) 362 return; 363 //2. create the output directory or clean it up if needed 364 File outDir = outputDir; 365 if (outputDir == null) { 366 outDir = new File(currDir, PROCESSED_DIR_NAME); 367 } else { 368 // get rid of the directory, coz datastore wants it clean 369 if (!Files.rmdir(outDir)) 370 Out.prln("cannot delete old output directory: " + outDir); 371 } 372 outDir.mkdir(); 373 374 //create the datastore and process each document 375 try { 376 SerialDataStore sds = new SerialDataStore(outDir.toURL().toString()); 377 sds.create(); 378 sds.open(); 379 380 File[] files = fileDir.listFiles(); 381 for (int i=0; i < files.length; i++) { 382 if (!files[i].isFile()) 383 continue; 384 // create a document 385 Out.prln("Processing and storing document: " + files[i].toURL() +"<P>"); 386 387 FeatureMap params = Factory.newFeatureMap(); 388 params.put("sourceUrl", files[i].toURL()); 389 params.put("encoding", ""); 390 391 // create the document 392 Document doc = (Document) Factory.createResource( 393 "gate.corpora.DocumentImpl", params 394 ); 395 396 doc.setName(files[i].getName()); 397 if (doc == null) 398 continue; 399 processDocument(doc); 400 LanguageResource lr = sds.adopt(doc, null); 401 sds.sync(lr); 402 }//for 403 sds.close(); 404 } catch (java.net.MalformedURLException ex) { 405 throw new GateRuntimeException("CorpusBenchmark: " + ex.getMessage()); 406 } catch (PersistenceException ex1) { 407 throw new GateRuntimeException("CorpusBenchmark: " + ex1.getMessage()); 408 } catch (ResourceInstantiationException ex2) { 409 throw new GateRuntimeException("CorpusBenchmark: " + ex2.getMessage()); 410 } catch (gate.security.SecurityException ex3) { 411 throw new GateRuntimeException("CorpusBenchmark: " + ex3.getMessage()); 412 } 413 414 }//generateCorpus 415 416 protected void evaluateCorpus(File fileDir, 417 File processedDir, File markedDir) { 418 //1. check if we have input files and the processed Dir 419 if (fileDir == null || !fileDir.exists()) 420 return; 421 if (processedDir == null || !processedDir.exists()) 422 //if the user wants evaluation of marked and stored that's not possible 423 if (isMarkedStored) { 424 Out.prln("Cannot evaluate because no processed documents exist."); 425 return; 426 } 427 else 428 isMarkedClean = true; 429 430 //looked for marked texts only if the directory exists 431 boolean processMarked = markedDir != null && markedDir.exists(); 432 if (!processMarked && (isMarkedStored || isMarkedClean)) { 433 Out.prln("Cannot evaluate because no human-annotated documents exist."); 434 return; 435 } 436 437 if (isMarkedStored) { 438 evaluateMarkedStored(markedDir, processedDir); 439 return; 440 } else if (isMarkedClean) { 441 evaluateMarkedClean(markedDir, fileDir); 442 return; 443 } 444 445 Document persDoc = null; 446 Document cleanDoc = null; 447 Document markedDoc = null; 448 449 //open the datastore and process each document 450 try { 451 //open the data store 452 DataStore sds = Factory.openDataStore 453 ("gate.persist.SerialDataStore", 454 processedDir.toURL().toExternalForm()); 455 456 List lrIDs = sds.getLrIds("gate.corpora.DocumentImpl"); 457 for (int i=0; i < lrIDs.size(); i++) { 458 String docID = (String) lrIDs.get(i); 459 460 //read the stored document 461 FeatureMap features = Factory.newFeatureMap(); 462 features.put(DataStore.DATASTORE_FEATURE_NAME, sds); 463 features.put(DataStore.LR_ID_FEATURE_NAME, docID); 464 persDoc = (Document) Factory.createResource( 465 "gate.corpora.DocumentImpl", 466 features); 467 468 Out.prln("<H2>" + persDoc.getName() + "</H2>"); 469 470 File cleanDocFile = new File(fileDir, persDoc.getName()); 471 //try reading the original document from clean 472 if (! cleanDocFile.exists()) { 473 Out.prln("Warning: Cannot find original document " + 474 persDoc.getName() + " in " + fileDir); 475 } else { 476 FeatureMap params = Factory.newFeatureMap(); 477 params.put("sourceUrl", cleanDocFile.toURL()); 478 params.put("encoding", ""); 479 480 // create the document 481 cleanDoc = (Document) Factory.createResource( 482 "gate.corpora.DocumentImpl", params); 483 cleanDoc.setName(persDoc.getName()); 484 } 485 486 //try finding the marked document 487 StringBuffer docName = new StringBuffer(persDoc.getName()); 488 docName.replace( 489 persDoc.getName().lastIndexOf("."), 490 docName.length(), 491 ".xml"); 492 File markedDocFile = new File(markedDir, docName.toString()); 493 if (! processMarked || ! markedDocFile.exists()) { 494 Out.prln("Warning: Cannot find human-annotated document " + 495 markedDocFile + " in " + markedDir); 496 } else { 497 FeatureMap params = Factory.newFeatureMap(); 498 params.put("sourceUrl", markedDocFile.toURL()); 499 params.put("encoding", ""); 500 501 // create the document 502 markedDoc = (Document) Factory.createResource( 503 "gate.corpora.DocumentImpl", params); 504 markedDoc.setName(persDoc.getName()); 505 } 506 507 evaluateDocuments(persDoc, cleanDoc, markedDoc); 508 if (persDoc != null) 509 Factory.deleteResource(persDoc); 510 if (cleanDoc != null) 511 Factory.deleteResource(cleanDoc); 512 if (markedDoc != null) 513 Factory.deleteResource(markedDoc); 514 515 }//for loop through saved docs 516 sds.close(); 517 } catch (java.net.MalformedURLException ex) { 518 throw new GateRuntimeException("CorpusBenchmark: " + ex.getMessage()); 519 } catch (PersistenceException ex1) { 520 throw new GateRuntimeException("CorpusBenchmark: " + ex1.getMessage()); 521 } catch (ResourceInstantiationException ex2) { 522 throw new GateRuntimeException("CorpusBenchmark: " + ex2.getMessage()); 523 } 524 525 }//evaluateCorpus 526 527 protected void evaluateMarkedStored(File markedDir, File storedDir) { 528 Document persDoc = null; 529 Document cleanDoc = null; 530 Document markedDoc = null; 531 532 //open the datastore and process each document 533 try { 534 //open the data store 535 DataStore sds = Factory.openDataStore 536 ("gate.persist.SerialDataStore", 537 storedDir.toURL().toExternalForm()); 538 539 List lrIDs = sds.getLrIds("gate.corpora.DocumentImpl"); 540 for (int i=0; i < lrIDs.size(); i++) { 541 String docID = (String) lrIDs.get(i); 542 543 //read the stored document 544 FeatureMap features = Factory.newFeatureMap(); 545 features.put(DataStore.DATASTORE_FEATURE_NAME, sds); 546 features.put(DataStore.LR_ID_FEATURE_NAME, docID); 547 persDoc = (Document) Factory.createResource( 548 "gate.corpora.DocumentImpl", 549 features); 550 551 Out.prln("<H2>" + persDoc.getName() + "</H2>"); 552 553 //try finding the marked document 554 StringBuffer docName = new StringBuffer(persDoc.getName()); 555 docName.replace( 556 persDoc.getName().lastIndexOf("."), 557 docName.length(), 558 ".xml"); 559 File markedDocFile = new File(markedDir, docName.toString()); 560 if (! markedDocFile.exists()) { 561 Out.prln("Warning: Cannot find human-annotated document " + 562 markedDocFile + " in " + markedDir); 563 } else { 564 FeatureMap params = Factory.newFeatureMap(); 565 params.put("sourceUrl", markedDocFile.toURL()); 566 params.put("encoding", ""); 567 568 // create the document 569 markedDoc = (Document) Factory.createResource( 570 "gate.corpora.DocumentImpl", params); 571 markedDoc.setName(persDoc.getName()); 572 } 573 574 evaluateDocuments(persDoc, cleanDoc, markedDoc); 575 if (persDoc != null) 576 Factory.deleteResource(persDoc); 577 if (markedDoc != null) 578 Factory.deleteResource(markedDoc); 579 580 }//for loop through saved docs 581 sds.close(); 582 583 } catch (java.net.MalformedURLException ex) { 584 throw new GateRuntimeException("CorpusBenchmark: " + ex.getMessage()); 585 } catch (PersistenceException ex1) { 586 throw new GateRuntimeException("CorpusBenchmark: " + ex1.getMessage()); 587 } catch (ResourceInstantiationException ex2) { 588 throw new GateRuntimeException("CorpusBenchmark: " + ex2.getMessage()); 589 } 590 591 }//evaluateMarkedStored 592 593 594 protected void evaluateMarkedClean(File markedDir, File cleanDir) { 595 Document persDoc = null; 596 Document cleanDoc = null; 597 Document markedDoc = null; 598 599 File[] cleanDocs = cleanDir.listFiles(); 600 for (int i = 0; i< cleanDocs.length; i++) { 601 if (!cleanDocs[i].isFile()) 602 continue; 603 604 //try reading the original document from clean 605 FeatureMap params = Factory.newFeatureMap(); 606 try { 607 params.put("sourceUrl", cleanDocs[i].toURL()); 608 } catch (java.net.MalformedURLException ex) { 609 Out.prln("Cannot create document from file: " + 610 cleanDocs[i].getAbsolutePath()); 611 continue; 612 } 613 params.put("encoding", ""); 614 615 // create the document 616 try { 617 cleanDoc = (Document) Factory.createResource( 618 "gate.corpora.DocumentImpl", params, 619 null, cleanDocs[i].getName()); 620 } catch (gate.creole.ResourceInstantiationException ex) { 621 Out.prln("Cannot create document from file: " + 622 cleanDocs[i].getAbsolutePath()); 623 continue; 624 } 625 626 Out.prln("<TD>" + cleanDocs[i].getName() + "</TD>"); 627 628 //try finding the marked document 629 StringBuffer docName = new StringBuffer(cleanDoc.getName()); 630 docName.replace( 631 cleanDoc.getName().lastIndexOf("."), 632 docName.length(), 633 ".xml"); 634 File markedDocFile = new File(markedDir, docName.toString()); 635 if (! markedDocFile.exists()) { 636 Out.prln("Warning: Cannot find human-annotated document " + 637 markedDocFile + " in " + markedDir); 638 continue; 639 } else { 640 params = Factory.newFeatureMap(); 641 try { 642 params.put("sourceUrl", markedDocFile.toURL()); 643 } catch (java.net.MalformedURLException ex) { 644 Out.prln("Cannot create document from file: " + 645 markedDocFile.getAbsolutePath()); 646 continue; 647 } 648 params.put("encoding", ""); 649 650 // create the document 651 try { 652 markedDoc = (Document) Factory.createResource( 653 "gate.corpora.DocumentImpl", params, 654 null, cleanDoc.getName()); 655 } catch (gate.creole.ResourceInstantiationException ex) { 656 Out.prln("Cannot create document from file: " + 657 markedDocFile.getAbsolutePath()); 658 continue; 659 } 660 661 }//if markedDoc exists 662 663 try { 664 evaluateDocuments(persDoc, cleanDoc, markedDoc); 665 } catch (gate.creole.ResourceInstantiationException ex) { 666 Out.prln("Evaluate failed on document: " + cleanDoc.getName()); 667 } 668 if (persDoc != null) 669 Factory.deleteResource(persDoc); 670 if (cleanDoc != null) 671 Factory.deleteResource(cleanDoc); 672 if (markedDoc != null) 673 Factory.deleteResource(markedDoc); 674 675 }//for loop through clean docs 676 677 678 }//evaluateMarkedClean 679 680 protected void processDocument(Document doc) { 681 try { 682 tokeniser.setDocument(doc); 683 tokeniser.execute(); 684 685 gazetteer.setDocument(doc); 686 gazetteer.execute(); 687 688 String textTagName = configs.getProperty("astTEXTTagName"); 689 if (textTagName != null && !textTagName.equals("")) 690 setTransfer.setTextTagName(textTagName); 691 setTransfer.setDocument(doc); 692 setTransfer.execute(); 693 694 splitter.setDocument(doc); 695 splitter.execute(); 696 697 tagger.setDocument(doc); 698 tagger.execute(); 699 700 transducer.setDocument(doc); 701 transducer.execute(); 702 703 orthomatcher.setDocument(doc); 704 orthomatcher.execute(); 705 } catch (gate.creole.ExecutionException ex) { 706 throw new GateRuntimeException("Corpus generation error: " + 707 ex.getMessage()); 708 } 709 } 710 711 protected void evaluateDocuments(Document persDoc, 712 Document cleanDoc, Document markedDoc) 713 throws ResourceInstantiationException { 714 if (cleanDoc == null && markedDoc == null) 715 return; 716 717 //we've got no types to compare 718 if (annotTypes == null || annotTypes.isEmpty()) 719 return; 720 721 if (cleanDoc != null && !isMarkedStored) { 722 723 processDocument(cleanDoc); 724 725 if(!isMarkedClean) 726 evaluateAllThree(persDoc, cleanDoc, markedDoc); 727 else 728 evaluateTwoDocs(markedDoc, cleanDoc); 729 730 } else 731 evaluateTwoDocs(markedDoc, persDoc); 732 733 } 734 735 protected void evaluateAllThree(Document persDoc, 736 Document cleanDoc, Document markedDoc) 737 throws ResourceInstantiationException { 738 //first start the table and its header 739 printTableHeader(); 740 for (int jj= 0; jj< annotTypes.size(); jj++) { 741 String annotType = (String) annotTypes.get(jj); 742 743 AnnotationDiff annotDiff=measureDocs(markedDoc, cleanDoc, annotType); 744 //we don't have this annotation type in this document 745 if (annotDiff == null) 746 continue; 747 Out.prln("<TR>"); 748 749 //increase the number of processed documents 750 docNumber++; 751 //add precison and recall to the sums 752 updateStatistics(annotDiff, annotType); 753 754 Out.prln("<TD> Annotation type: " + annotType + "</TD>"); 755 756 AnnotationDiff annotDiff1 = 757 measureDocs(markedDoc, persDoc, annotType); 758 759 Out.prln("<TD>" + annotDiff.getPrecisionAverage()); 760 //check the precision first 761 if (annotDiff1 != null && 762 annotDiff!= null && 763 annotDiff1.getPrecisionAverage()<annotDiff.getPrecisionAverage() 764 ) 765 Out.prln("<P> Precision increase on human-marked from " + 766 annotDiff1.getPrecisionAverage() + " to " + 767 annotDiff.getPrecisionAverage() + "</P>"); 768 else if (annotDiff1 != null 769 && annotDiff != null 770 && annotDiff1.getPrecisionAverage() 771 > annotDiff.getPrecisionAverage()) 772 Out.prln("<P> Precision decrease on human-marked from " + 773 annotDiff1.getPrecisionAverage() + " to " + 774 annotDiff.getPrecisionAverage() + "</P>"); 775 Out.prln("</TD>"); 776 777 Out.prln("<TD>" + annotDiff.getRecallAverage()); 778 //check the recall now 779 if (annotDiff1 != null && 780 annotDiff!= null && 781 annotDiff1.getRecallAverage()<annotDiff.getRecallAverage() 782 ) 783 Out.prln("<P> Recall increase on human-marked from " + 784 annotDiff1.getRecallAverage() + " to " + 785 annotDiff.getRecallAverage() + "</P>"); 786 else if (annotDiff1 != null 787 && annotDiff != null 788 && annotDiff1.getRecallAverage() 789 > annotDiff.getRecallAverage()) 790 Out.prln("<P> Recall decrease on human-marked from " + 791 annotDiff1.getRecallAverage() + " to " + 792 annotDiff.getRecallAverage() + "</P>"); 793 794 Out.prln("</TD>"); 795 796 //check the recall now 797 if ( isVerboseMode 798 && 799 ((annotDiff.getRecallAverage() < threshold 800 || 801 annotDiff.getRecallAverage() < threshold) 802 ) 803 ) 804 printAnnotations(annotDiff, markedDoc, cleanDoc); 805 806 807 Out.prln("</TR>"); 808 }//for loop through annotation types 809 Out.prln("</TABLE>"); 810 811 }//evaluateAllThree 812 813 protected void evaluateTwoDocs(Document keyDoc, Document respDoc) 814 throws ResourceInstantiationException { 815 816 //first start the table and its header 817 printTableHeader(); 818 for (int jj= 0; jj< annotTypes.size(); jj++) { 819 String annotType = (String) annotTypes.get(jj); 820 821 AnnotationDiff annotDiff=measureDocs(keyDoc, respDoc, annotType); 822 //we don't have this annotation type in this document 823 if (annotDiff == null) 824 continue; 825 Out.prln("<TR>"); 826 827 //increase the number of processed documents 828 docNumber++; 829 //add precison and recall to the sums 830 updateStatistics(annotDiff, annotType); 831 832 Out.prln("<TD>" + annotType + "</TD>"); 833 834 Out.prln("<TD>" + annotDiff.getPrecisionAverage() + "</TD>"); 835 Out.prln("<TD>" + annotDiff.getRecallAverage() + "</TD>"); 836 //check the recall now 837 if ( isVerboseMode 838 && 839 ((annotDiff.getRecallAverage() < threshold 840 || 841 annotDiff.getRecallAverage() < threshold) 842 ) 843 ) 844 printAnnotations(annotDiff, keyDoc, respDoc); 845 846 Out.prln("</TR>"); 847 }//for loop through annotation types 848 Out.prln("</TABLE>"); 849 850 }//evaluateTwoDocs 851 852 protected void printTableHeader() { 853 Out.prln("<TABLE BORDER=1"); 854 if (isVerboseMode) 855 Out.prln("<TR> <TD><B>Annotation Type</B></TD> <TD><B>Precision</B></TD> " 856 + "<TD><B>Recall</B></TD> <TD><B>Annotations<B></TD>"); 857 else 858 Out.prln("<TR> <TD><B>Annotation Type</B></TD> <TD><B>Precision</B></TD> " 859 + "<TD><B>Recall</B></TD>"); 860 } 861 862 protected void updateStatistics(AnnotationDiff annotDiff, String annotType){ 863 precisionSum += annotDiff.getPrecisionAverage(); 864 recallSum += annotDiff.getRecallAverage(); 865 Double oldPrecision = (Double) precisionByType.get(annotType); 866 if (oldPrecision == null) 867 precisionByType.put(annotType, 868 new Double(annotDiff.getPrecisionAverage())); 869 else 870 precisionByType.put(annotType, 871 new Double(oldPrecision.doubleValue() + 872 annotDiff.getPrecisionAverage())); 873 Integer precCount = (Integer) prCountByType.get(annotType); 874 if (precCount == null) 875 prCountByType.put(annotType, new Integer(1)); 876 else 877 prCountByType.put(annotType, new Integer(precCount.intValue() + 1)); 878 879 880 Double oldRecall = (Double) recallByType.get(annotType); 881 if (oldRecall == null) 882 recallByType.put(annotType, 883 new Double(annotDiff.getRecallAverage())); 884 else 885 recallByType.put(annotType, 886 new Double(oldRecall.doubleValue() + 887 annotDiff.getRecallAverage())); 888 Integer recCount = (Integer) recCountByType.get(annotType); 889 if (recCount == null) 890 recCountByType.put(annotType, new Integer(1)); 891 else 892 recCountByType.put(annotType, new Integer(recCount.intValue() + 1)); 893 894 } 895 896 protected void printStatistics() { 897 898 Out.prln("<H2> Statistics </H2>"); 899 Out.prln("<H3> Precision </H3>"); 900 if (precisionByType != null && !precisionByType.isEmpty()) { 901 Iterator iter = precisionByType.keySet().iterator(); 902 while (iter.hasNext()) { 903 String annotType = (String) iter.next(); 904 Out.prln(annotType + ": " 905 + ((Double)precisionByType.get(annotType)).doubleValue() 906 / 907 ((Integer)prCountByType.get(annotType)).intValue() 908 + "<P>"); 909 }//while 910 } 911 Out.prln("Overall precision: " + getPrecisionAverage() + "<P>"); 912 913 Out.prln("<H3> Recall </H3>"); 914 if (recallByType != null && !recallByType.isEmpty()) { 915 Iterator iter = recallByType.keySet().iterator(); 916 while (iter.hasNext()) { 917 String annotType = (String) iter.next(); 918 Out.prln(annotType + ": " 919 + ((Double)recallByType.get(annotType)).doubleValue() 920 / 921 ((Integer)recCountByType.get(annotType)).intValue() 922 + "<P>"); 923 }//while 924 } 925 926 Out.prln("Overall recall: " + getRecallAverage() 927 + "<P>"); 928 } 929 930 protected AnnotationDiff measureDocs( 931 Document keyDoc, Document respDoc, String annotType) 932 throws ResourceInstantiationException { 933 934 if (keyDoc == null || respDoc == null) 935 return null; 936 937 if (annotSetName != null 938 && keyDoc.getAnnotations(annotSetName).get(annotType) == null) 939 return null; 940 else if ((annotSetName == null || annotSetName.equals("")) 941 && keyDoc.getAnnotations().get(annotType) == null) 942 return null; 943 944 // create the annotation schema needed for AnnotationDiff 945 AnnotationSchema annotationSchema = new AnnotationSchema(); 946 947 // organization type 948 annotationSchema.setAnnotationName(annotType); 949 // create an annotation diff 950 FeatureMap parameters = Factory.newFeatureMap(); 951 parameters.put("keyDocument",keyDoc); 952 parameters.put("responseDocument",respDoc); 953 parameters.put("annotationSchema",annotationSchema); 954 parameters.put("keyAnnotationSetName",annotSetName); 955 parameters.put("responseAnnotationSetName",annotSetName); 956 //for a start, do not compare the features of the annotations 957 parameters.put("keyFeatureNamesSet", new HashSet()); 958 parameters.put("textMode", new Boolean(true)); 959 960 // Create Annotation Diff visual resource 961 AnnotationDiff annotDiff = (AnnotationDiff) 962 Factory.createResource("gate.annotation.AnnotationDiff",parameters); 963 964 return annotDiff; 965 } 966 967 protected void printAnnotations(AnnotationDiff annotDiff, 968 Document keyDoc, Document respDoc) { 969 Out.prln("<TD>"); 970 Out.pr("MISSING ANNOTATIONS in the automatic texts: "); 971 Set missingSet = 972 annotDiff.getAnnotationsOfType(AnnotationDiff.MISSING_TYPE); 973 printAnnotations(missingSet, keyDoc); 974 Out.prln("<BR>"); 975 976 Out.pr("SPURIOUS ANNOTATIONS in the automatic texts: "); 977 Set spuriousSet = 978 annotDiff.getAnnotationsOfType(AnnotationDiff.SPURIOUS_TYPE); 979 printAnnotations(spuriousSet, respDoc); 980 Out.prln("</BR>"); 981 982 Out.pr("PARTIALLY CORRECT ANNOTATIONS in the automatic texts: "); 983 Set partialSet = 984 annotDiff.getAnnotationsOfType(AnnotationDiff.PARTIALLY_CORRECT_TYPE); 985 printAnnotations(partialSet, respDoc); 986 Out.prln("</TD>"); 987 988 } 989 990 protected void printAnnotations(Set set, Document doc) { 991 if (set == null || set.isEmpty()) 992 return; 993 994 Iterator iter = set.iterator(); 995 while (iter.hasNext()) { 996 Annotation ann = (Annotation) iter.next(); 997 Out.prln( 998 "<B>" + 999 doc.getContent().toString().substring( 1000 ann.getStartNode().getOffset().intValue(), 1001 ann.getEndNode().getOffset().intValue()) + 1002 "</B>: <I>[" + ann.getStartNode().getOffset() + 1003 "," + ann.getEndNode().getOffset() + "]</I>" 1004// + "; features" + ann.getFeatures() 1005 ); 1006 }//while 1007 } 1008 1009 /** 1010 * The directory from which we should generate/evaluate the corpus 1011 */ 1012 private File startDir; 1013 private File currDir; 1014 private static List annotTypes; 1015 1016 private DefaultTokeniser tokeniser; 1017 private DefaultGazetteer gazetteer; 1018 private SentenceSplitter splitter; 1019 private POSTagger tagger; 1020 private ANNIETransducer transducer; 1021 private OrthoMatcher orthomatcher; 1022 private AnnotationSetTransfer setTransfer; 1023 1024 //collect the sum of all precisions and recalls of all docs 1025 //and the number of docs, so I can calculate the average for 1026 //the corpus at the end 1027 private double precisionSum = 0; 1028 private double recallSum = 0; 1029 private HashMap precisionByType = new HashMap(); 1030 private HashMap prCountByType = new HashMap(); 1031 private HashMap recallByType = new HashMap(); 1032 private HashMap recCountByType = new HashMap(); 1033 private int docNumber = 0; 1034 1035 /** 1036 * If true, the corpus tool will generate the corpus, otherwise it'll 1037 * run in evaluate mode 1038 */ 1039 private boolean isGenerateMode = false; 1040 private boolean isVerboseMode = false; 1041 1042 /** 1043 * If true, the corpus tool will evaluate stored against the human-marked 1044 * documents 1045 */ 1046 private boolean isMarkedStored = false; 1047 private boolean isMarkedClean = false; 1048 1049 private String annotSetName = "Key"; 1050 1051 private double threshold = 0.5; 1052 private Properties configs = new Properties(); 1053 1054 /** String to print when wrong command-line args */ 1055 private static String usage = 1056 "usage: CorpusBenchmarkTool [-generate|-marked_stored|-marked_clean] [-verbose] directory-name"; 1057 1058}
|
CorpusBenchmarkTool |
|