|
OrthoMatcher |
|
1 /* 2 * OrthoMatcher.java 3 * 4 * Copyright (c) 1998-2001, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Kalina Bontcheva, 24/August/2001 12 * 13 * $Id: OrthoMatcher.java,v 1.36 2001/12/01 17:47:04 kalina Exp $ 14 */ 15 16 17 package gate.creole.orthomatcher; 18 19 import gate.*; 20 import gate.util.*; 21 import gate.creole.*; 22 import gate.corpora.*; 23 import gate.annotation.*; 24 import java.util.*; 25 import java.io.*; 26 import java.net.*; 27 import gnu.regexp.*; 28 29 public class OrthoMatcher extends AbstractLanguageAnalyser 30 implements ANNIEConstants{ 31 32 protected static final String CDGLISTNAME = "cdg"; 33 protected static final String ALIASLISTNAME = "alias"; 34 protected static final String ARTLISTNAME = "def_art"; 35 protected static final String PREPLISTNAME = "prepos"; 36 protected static final String CONNECTORLISTNAME = "connector"; 37 protected static final String SPURLISTNAME = "spur_match"; 38 39 protected static final String LOOKUPNAME = "Lookup"; 40 protected static final String GENDER_FEATURE = "gender"; 41 protected static final String KIND_FEATURE = "kind"; 42 protected static final String STRING_FEATURE = "string"; 43 protected static final String THE_VALUE = "The"; 44 45 46 /**the name of the annotation set*/ 47 protected String annotationSetName; 48 49 /** the types of the annotation */ 50 protected List annotationTypes = new ArrayList(10); 51 52 /** the organization type*/ 53 protected String organizationType = "Organization"; 54 55 /** the person type*/ 56 protected String personType = "Person"; 57 58 protected String unknownType = "Unknown"; 59 60 /** internal or external list */ 61 protected boolean extLists = true; 62 63 protected boolean matchingUnknowns = true; 64 65 /** This is an internal variable to indicate whether 66 * we matched using a rule that requires that 67 * the newly matched annotation matches all the others 68 * This is needed, because organizations can share 69 * first/last tokens like News and be different 70 */ 71 private boolean allMatchingNeeded = false; 72 73 //** Orthomatching is not case-sensitive by default*/ 74 protected boolean caseSensitive = false; 75 76 protected FeatureMap queryFM = Factory.newFeatureMap(); 77 78 // protected ExecutionException executionException; 79 80 // name lookup tables (used for namematch) 81 //gave them bigger default size, coz rehash is expensive 82 protected HashMap alias = new HashMap(100); 83 protected HashSet cdg = new HashSet(50); 84 protected HashMap spur_match = new HashMap(100); 85 protected HashMap def_art = new HashMap(20); 86 protected HashMap connector = new HashMap(20); 87 protected HashMap prepos = new HashMap(30); 88 89 90 protected AnnotationSet nameAllAnnots = null; 91 protected HashMap processedAnnots = new HashMap(150); 92 protected HashMap annots2Remove = new HashMap(75); 93 protected List matchesDocFeature = new ArrayList(); 94 //maps annotation ids to array lists of tokens 95 protected HashMap tokensMap = new HashMap(150); 96 97 protected Annotation shortAnnot, longAnnot; 98 99 protected ArrayList tokensLongAnnot, tokensShortAnnot; 100 101 /** a feature map to be used when retrieving annotations 102 * declared here so can be reused for efficiency 103 * clear() before each use 104 */ 105 protected FeatureMap tempMap = Factory.newFeatureMap(); 106 107 /** a buffer in order to read an array of char */ 108 private char[] cbuffer = null; 109 110 /** the size of the buffer */ 111 private final static int BUFF_SIZE = 65000; 112 113 /** @link dependency */ 114 /*#OrthoMatcher lnkOrthoMatcher;*/ 115 116 public OrthoMatcher () { 117 annotationTypes.add(organizationType); 118 annotationTypes.add(personType); 119 annotationTypes.add("Location"); 120 annotationTypes.add("Date"); 121 } 122 123 /** Initialise this resource, and return it. */ 124 public Resource init() throws ResourceInstantiationException { 125 cbuffer = new char[BUFF_SIZE]; 126 127 //initialise the list of annotations which we will match 128 try { 129 createLists(); 130 } catch (IOException ioe) {ioe.printStackTrace();} 131 return this; 132 } // init() 133 134 /** Run the resource. It doesn't make sense not to override 135 * this in subclasses so the default implementation signals an 136 * exception. 137 */ 138 public void execute() throws ExecutionException{ 139 140 //check the input 141 if(document == null) { 142 throw new ExecutionException( 143 "No document for namematch!" 144 ); 145 } 146 147 // get the annotations from document 148 if ((annotationSetName == null)|| (annotationSetName.equals(""))) 149 nameAllAnnots = document.getAnnotations(); 150 else 151 nameAllAnnots = document.getAnnotations(annotationSetName); 152 153 //if none found, print warning and exit 154 if ((nameAllAnnots == null) || nameAllAnnots.isEmpty()) { 155 Out.prln("OrthoMatcher Warning: No annotations found for processing"); 156 return; 157 } 158 159 //check if we've been run on this document before 160 //and clean the doc if needed 161 docCleanup(); 162 Map matchesMap = (Map)document.getFeatures(). 163 get(DOCUMENT_COREF_FEATURE_NAME); 164 // if(matchesMap != null && matchesMap.containsKey(nameAllAnnots.getName())){ 165 // docCleanup(); 166 // } 167 168 // creates the cdg list from the document 169 //no need to create otherwise, coz already done in init() 170 if (!extLists) 171 buildTables(nameAllAnnots); 172 173 //first match all name annotations 174 matchNameAnnotations(); 175 176 //then match the unknown ones to all name ones 177 if (matchingUnknowns) 178 matchUnknown(); 179 180 // set the matches of the document 181 // determineMatchesDocument(); 182 if (! matchesDocFeature.isEmpty()) { 183 if(matchesMap == null){ 184 matchesMap = new HashMap(); 185 } 186 matchesMap.put(nameAllAnnots.getName(), matchesDocFeature); 187 //we need to put it even if it was already present in order to triger 188 //the update events 189 document.getFeatures().put(DOCUMENT_COREF_FEATURE_NAME, matchesMap); 190 191 //cannot do clear() as this has already been put on the document 192 //so I need a new one for the next run of matcher 193 matchesDocFeature = new ArrayList(); 194 } 195 196 // Out.prln("Processed strings" + processedAnnots.values()); 197 //clean-up the internal data structures for next run 198 nameAllAnnots = null; 199 processedAnnots.clear(); 200 annots2Remove.clear(); 201 tokensMap.clear(); 202 matchesDocFeature = new ArrayList(); 203 longAnnot = null; 204 shortAnnot = null; 205 tokensLongAnnot = null; 206 tokensShortAnnot = null; 207 208 } // run() 209 210 protected void matchNameAnnotations() throws ExecutionException{ 211 // go through all the annotation types 212 Iterator iterAnnotationTypes = annotationTypes.iterator(); 213 while (iterAnnotationTypes.hasNext()) { 214 String annotationType = (String)iterAnnotationTypes.next(); 215 216 AnnotationSet nameAnnots = nameAllAnnots.get(annotationType); 217 218 // continue if no such annotations exist 219 if ((nameAnnots == null) || nameAnnots.isEmpty()) 220 continue; 221 222 Iterator iterNames = nameAnnots.iterator(); 223 while (iterNames.hasNext()) { 224 Annotation nameAnnot = (Annotation) iterNames.next(); 225 Integer id = nameAnnot.getId(); 226 227 // get string and value 228 String annotString = null; 229 try { 230 annotString = document.getContent().getContent( 231 nameAnnot.getStartNode().getOffset(), 232 nameAnnot.getEndNode().getOffset() 233 ).toString(); 234 // now do the reg. exp. substitutions 235 annotString = regularExpressions(annotString," ", "\\s+"); 236 237 } catch (InvalidOffsetException ioe) { 238 throw new ExecutionException 239 ("Invalid offset of the annotation"); 240 } 241 //convert to lower case if we are not doing a case sensitive match 242 if (!caseSensitive) 243 annotString = annotString.toLowerCase(); 244 245 //get the tokens 246 List tokens = new ArrayList((Set) 247 nameAllAnnots.get("Token", 248 nameAnnot.getStartNode().getOffset(), 249 nameAnnot.getEndNode().getOffset() 250 )); 251 //if no tokens to match, do nothing 252 if (tokens.isEmpty()) 253 continue; 254 Collections.sort(tokens, new gate.util.OffsetComparator()); 255 //check if these actually do not end after the name 256 //needed coz new tokeniser conflates 257 //strings with dashes. So British Gas-style is two tokens 258 //instead of three. So cannot match properly British Gas 259 // tokens = checkTokens(tokens); 260 tokensMap.put(nameAnnot.getId(), tokens); 261 262 // Out.prln("Matching annot " + nameAnnot + ": string " + annotString); 263 264 //first check whether we have not matched such a string already 265 //if so, just consider it matched, don't bother calling the rules 266 if (processedAnnots.containsValue(annotString)) { 267 // Out.prln("Contained string found " + annotString); 268 updateMatches(nameAnnot, annotString); 269 processedAnnots.put(nameAnnot.getId(), annotString); 270 continue; 271 } else if (processedAnnots.isEmpty()) { 272 processedAnnots.put(nameAnnot.getId(), annotString); 273 continue; 274 } 275 276 //if a person, then remove their title before matching 277 if (nameAnnot.getType().equals(personType)) 278 annotString = containTitle(annotString, nameAnnot); 279 else if (nameAnnot.getType().equals(organizationType)) 280 annotString = stripCDG(annotString, nameAnnot); 281 282 //otherwise try matching with previous annotations 283 matchWithPrevious(nameAnnot, annotString); 284 285 // Out.prln("Putting in previous " + nameAnnot + ": string " + annotString); 286 //finally add the current annotations to the processed map 287 processedAnnots.put(nameAnnot.getId(), annotString); 288 }//while through name annotations 289 290 }//while through annotation types 291 292 } 293 294 protected void matchUnknown() throws ExecutionException { 295 //get all Unknown annotations 296 AnnotationSet unknownAnnots = nameAllAnnots.get(unknownType); 297 298 if ((unknownAnnots == null) || unknownAnnots.isEmpty()) 299 return; 300 301 Iterator iter = unknownAnnots.iterator(); 302 //loop through the unknown annots 303 while (iter.hasNext()) { 304 Annotation unknown = (Annotation) iter.next(); 305 306 // get string and value 307 String unknownString = null; 308 try { 309 unknownString = document.getContent().getContent( 310 unknown.getStartNode().getOffset(), 311 unknown.getEndNode().getOffset() 312 ).toString(); 313 // now do the reg. exp. substitutions 314 unknownString = regularExpressions(unknownString," ", "\\s+"); 315 } catch (InvalidOffsetException ioe) { 316 throw new ExecutionException 317 ("Invalid offset of the annotation"); 318 } 319 //convert to lower case if we are not doing a case sensitive match 320 if (!caseSensitive) 321 unknownString = unknownString.toLowerCase(); 322 323 //get the tokens 324 List tokens = new ArrayList((Set) 325 nameAllAnnots.get("Token", 326 unknown.getStartNode().getOffset(), 327 unknown.getEndNode().getOffset() 328 )); 329 if (tokens.isEmpty()) 330 continue; 331 Collections.sort(tokens, new gate.util.OffsetComparator()); 332 tokensMap.put(unknown.getId(), tokens); 333 334 335 //first check whether we have not matched such a string already 336 //if so, just consider it matched, don't bother calling the rules 337 if (processedAnnots.containsValue(unknownString)) { 338 Annotation matchedAnnot = updateMatches(unknown, unknownString); 339 // Out.prln("Matched " + unknown + "with string " + unknownString); 340 // Out.prln("That's same as " + matchedAnnot); 341 if (matchedAnnot.getType().equals(unknownType)) { 342 annots2Remove.put(unknown.getId(), 343 annots2Remove.get(matchedAnnot.getId())); 344 } 345 else 346 annots2Remove.put(unknown.getId(), matchedAnnot.getType()); 347 processedAnnots.put(unknown.getId(), unknownString); 348 unknown.getFeatures().put("NMRule", unknownType); 349 continue; 350 } 351 352 matchWithPrevious(unknown, unknownString); 353 } //while though unknowns 354 355 if (! annots2Remove.isEmpty()) { 356 Iterator unknownIter = annots2Remove.keySet().iterator(); 357 while (unknownIter.hasNext()) { 358 Integer unknId = (Integer) unknownIter.next(); 359 Annotation unknown = nameAllAnnots.get(unknId); 360 Integer newID = nameAllAnnots.add( 361 unknown.getStartNode(), 362 unknown.getEndNode(), 363 (String) annots2Remove.get(unknId), 364 unknown.getFeatures() 365 ); 366 nameAllAnnots.remove(unknown); 367 368 //change the id in the matches list 369 List mList = (List)unknown.getFeatures(). 370 get(ANNOTATION_COREF_FEATURE_NAME); 371 mList.remove(unknId); 372 mList.add(newID); 373 }//while 374 }//if 375 } 376 377 protected void matchWithPrevious(Annotation nameAnnot, String annotString) { 378 boolean matchedUnknown = false; 379 380 Iterator prevIter = processedAnnots.keySet().iterator(); 381 while (prevIter.hasNext()) { 382 Integer prevId = (Integer) prevIter.next(); 383 Annotation prevAnnot = nameAllAnnots.get(prevId); 384 385 //check if the two are from the same type or the new one is unknown 386 if (prevAnnot == null || (! prevAnnot.getType().equals(nameAnnot.getType()) 387 && ! nameAnnot.getType().equals(unknownType)) 388 ) 389 continue; 390 //do not compare two unknown annotations either 391 //they are only matched to those of known types 392 if ( nameAnnot.getType().equals(unknownType) 393 && prevAnnot.getType().equals(unknownType)) 394 continue; 395 396 //check if we have already matched this annotation to the new one 397 if (matchedAlready(nameAnnot, prevAnnot) ) 398 continue; 399 400 // determine the title from annotation string 401 //now changed to a rule, here we just match by gender 402 if (prevAnnot.getType().equals(personType)) { 403 String prevGender = (String) prevAnnot.getFeatures().get(GENDER_FEATURE); 404 String nameGender = (String) nameAnnot.getFeatures().get(GENDER_FEATURE); 405 if ( prevGender != null 406 && nameGender != null 407 && ( (nameGender.equalsIgnoreCase("female") 408 && 409 prevGender.equalsIgnoreCase("male") 410 ) 411 || 412 (prevGender.equalsIgnoreCase("female") 413 && nameGender.equalsIgnoreCase("male") 414 ) 415 ) 416 ) //if condition 417 continue; //we don't have a match if the two genders are different 418 419 }//if 420 421 //if the two annotations match 422 if (matchAnnotations(nameAnnot, annotString, prevAnnot)) { 423 // Out.prln("Matched " + shortName + "and " + longName); 424 updateMatches(nameAnnot, prevAnnot); 425 //if unknown annotation, we need to change to the new type 426 if (nameAnnot.getType().equals(unknownType)) { 427 matchedUnknown = true; 428 if (prevAnnot.getType().equals(unknownType)) 429 annots2Remove.put(nameAnnot.getId(), 430 annots2Remove.get(prevAnnot.getId())); 431 else 432 annots2Remove.put(nameAnnot.getId(), prevAnnot.getType()); 433 //also put an attribute to indicate that 434 nameAnnot.getFeatures().put("NMRule", unknownType); 435 }//if unknown 436 break; //no need to match further 437 }//if annotations matched 438 439 }//while through previous annotations 440 441 if (matchedUnknown) 442 processedAnnots.put(nameAnnot.getId(), annotString); 443 444 445 }//matchWithPrevious 446 447 protected boolean matchAnnotations(Annotation newAnnot, String annotString, 448 Annotation prevAnnot) { 449 450 // find which annotation string of the two is longer 451 // this is useful for some of the matching rules 452 String prevAnnotString = (String) processedAnnots.get(prevAnnot.getId()); 453 454 String longName = prevAnnotString; 455 String shortName = annotString; 456 longAnnot = prevAnnot; 457 shortAnnot = newAnnot; 458 459 if (shortName.length()>=longName.length()) { 460 String temp = longName; 461 longName = shortName; 462 shortName = temp; 463 Annotation tempAnn = longAnnot; 464 longAnnot = shortAnnot; 465 shortAnnot = tempAnn; 466 }//if 467 468 tokensLongAnnot = (ArrayList) tokensMap.get(longAnnot.getId()); 469 tokensShortAnnot = (ArrayList) tokensMap.get(shortAnnot.getId()); 470 471 List matchesList = (List) prevAnnot.getFeatures(). 472 get(ANNOTATION_COREF_FEATURE_NAME); 473 if (matchesList == null || matchesList.isEmpty()) 474 return apply_rules_namematch(prevAnnot.getType(), shortName,longName); 475 476 //if these two match, then let's see if all the other matching one will too 477 //that's needed, because sometimes names can share a token (e.g., first or 478 //last but not be the same 479 if (apply_rules_namematch(prevAnnot.getType(), shortName,longName)) { 480 List toMatchList = new ArrayList(matchesList); 481 // if (newAnnot.getType().equals(unknownType)) 482 // Out.prln("Matching new " + annotString + " with annots " + toMatchList); 483 toMatchList.remove(prevAnnot.getId()); 484 485 /** 486 * Check whether we need to ensure that there is a match with the rest 487 * of the matching annotations, because the rule requires that 488 * transtivity is not assummed. 489 */ 490 if (allMatchingNeeded) { 491 allMatchingNeeded = false; 492 return matchOtherAnnots(toMatchList, newAnnot, annotString); 493 } else 494 return true; 495 } 496 return false; 497 } 498 499 /** This method checkes whether the new annotation matches 500 * all annotations given in the toMatchList (it contains ids) 501 * The idea is that the new annotation needs to match all those, 502 * because assuming transitivity does not always work, when 503 * two different entities share a common token: e.g., BT Cellnet 504 * and BT and British Telecom. 505 */ 506 protected boolean matchOtherAnnots( List toMatchList, Annotation newAnnot, 507 String annotString) { 508 509 //if the list is empty, then we're matching all right :-) 510 if (toMatchList.isEmpty()) 511 return true; 512 513 boolean matchedAll = true; 514 int i = 0; 515 516 while (matchedAll && i < toMatchList.size()) { 517 Annotation prevAnnot = nameAllAnnots.get((Integer) toMatchList.get(i)); 518 519 // find which annotation string of the two is longer 520 // this is useful for some of the matching rules 521 String prevAnnotString = (String) processedAnnots.get(prevAnnot.getId()); 522 if (prevAnnotString == null) 523 try { 524 prevAnnotString = document.getContent().getContent( 525 prevAnnot.getStartNode().getOffset(), 526 prevAnnot.getEndNode().getOffset() 527 ).toString(); 528 } catch (InvalidOffsetException ioe) { 529 return false; 530 }//try 531 532 533 String longName = prevAnnotString; 534 String shortName = annotString; 535 longAnnot = prevAnnot; 536 shortAnnot = newAnnot; 537 538 if (shortName.length()>=longName.length()) { 539 String temp = longName; 540 longName = shortName; 541 shortName = temp; 542 Annotation tempAnn = longAnnot; 543 longAnnot = shortAnnot; 544 shortAnnot = tempAnn; 545 }//if 546 547 tokensLongAnnot = (ArrayList) tokensMap.get(longAnnot.getId()); 548 tokensShortAnnot = (ArrayList) tokensMap.get(shortAnnot.getId()); 549 550 matchedAll = apply_rules_namematch(prevAnnot.getType(), shortName,longName); 551 // if (newAnnot.getType().equals(unknownType)) 552 // Out.prln("Loop: " + shortName + " and " + longName + ": result: " + matchedAll); 553 554 i++; 555 }//while 556 return matchedAll; 557 } 558 559 560 protected boolean matchedAlready(Annotation annot1, Annotation annot2) { 561 //the two annotations are already matched if the matches list of the first 562 //contains the id of the second 563 List matchesList = (List) annot1.getFeatures(). 564 get(ANNOTATION_COREF_FEATURE_NAME); 565 if ((matchesList == null) || matchesList.isEmpty()) 566 return false; 567 else if (matchesList.contains(annot2.getId())) 568 return true; 569 return false; 570 } 571 572 protected Annotation updateMatches(Annotation newAnnot, String annotString) { 573 Annotation matchedAnnot = null; 574 Integer id; 575 576 //first find a processed annotation with the same string 577 Iterator iter = processedAnnots.keySet().iterator(); 578 while (iter.hasNext()) { 579 id = (Integer) iter.next(); 580 String oldString = (String) processedAnnots.get(id); 581 if (annotString.equals(oldString)) { 582 matchedAnnot = nameAllAnnots.get(id); 583 break; 584 }//if 585 }//while 586 587 if (matchedAnnot == null) return null; 588 //if the two matching annotations are of different type which is not 589 //unknown, do not match them 590 if (! matchedAnnot.getType().equals(newAnnot.getType()) 591 && !newAnnot.getType().equals(unknownType) ) 592 return matchedAnnot; 593 594 List matchesList = (List) matchedAnnot.getFeatures(). 595 get(ANNOTATION_COREF_FEATURE_NAME); 596 if ((matchesList == null) || matchesList.isEmpty()) { 597 //no previous matches, so need to add 598 if (matchesList == null) { 599 matchesList = new ArrayList(); 600 matchedAnnot.getFeatures().put(ANNOTATION_COREF_FEATURE_NAME, 601 matchesList); 602 matchesDocFeature.add(matchesList); 603 }//if 604 matchesList.add(matchedAnnot.getId()); 605 matchesList.add(newAnnot.getId()); 606 } else { 607 //just add the new annotation 608 matchesList.add(newAnnot.getId()); 609 }//if 610 //add the matches list to the new annotation 611 newAnnot.getFeatures().put(ANNOTATION_COREF_FEATURE_NAME, matchesList); 612 return matchedAnnot; 613 } 614 615 protected void updateMatches(Annotation newAnnot, Annotation prevAnnot) { 616 617 List matchesList = (List) prevAnnot.getFeatures(). 618 get(ANNOTATION_COREF_FEATURE_NAME); 619 if ((matchesList == null) || matchesList.isEmpty()) { 620 //no previous matches, so need to add 621 if (matchesList == null) { 622 matchesList = new ArrayList(); 623 prevAnnot.getFeatures().put(ANNOTATION_COREF_FEATURE_NAME, matchesList); 624 matchesDocFeature.add(matchesList); 625 }//if 626 matchesList.add(prevAnnot.getId()); 627 matchesList.add(newAnnot.getId()); 628 } else { 629 //just add the new annotation 630 matchesList.add(newAnnot.getId()); 631 }//if 632 //add the matches list to the new annotation 633 newAnnot.getFeatures().put(ANNOTATION_COREF_FEATURE_NAME, matchesList); 634 //propagate the gender if two persons are matched 635 if (prevAnnot.getType().equals(personType)) { 636 String prevGender = (String) prevAnnot.getFeatures().get(GENDER_FEATURE); 637 String newGender = (String) newAnnot.getFeatures().get(GENDER_FEATURE); 638 boolean unknownPrevGender = isUnknownGender(prevGender); 639 boolean unknownNewGender = isUnknownGender(newGender); 640 if (unknownPrevGender && !unknownNewGender) 641 prevAnnot.getFeatures().put(GENDER_FEATURE, newGender); 642 else if (unknownNewGender && !unknownPrevGender) 643 newAnnot.getFeatures().put(GENDER_FEATURE, prevGender); 644 }//if 645 } 646 647 648 protected void docCleanup() { 649 Object matchesValue = document.getFeatures().get(DOCUMENT_COREF_FEATURE_NAME); 650 if (matchesValue != null && (matchesValue instanceof Map)) 651 ((Map)matchesValue).remove(nameAllAnnots.getName()); 652 else if (matchesValue != null) 653 document.getFeatures().remove(DOCUMENT_COREF_FEATURE_NAME); 654 655 //get all annotations that have a matches feature 656 HashSet fNames = new HashSet(); 657 fNames.add(ANNOTATION_COREF_FEATURE_NAME); 658 AnnotationSet annots = 659 nameAllAnnots.get(null, fNames); 660 661 // Out.prln("Annots to cleanup" + annots); 662 663 if (annots == null || annots.isEmpty()) 664 return; 665 666 Iterator iter = annots.iterator(); 667 while (iter.hasNext()) { 668 while (iter.hasNext()) 669 ((Annotation) iter.next()).getFeatures(). 670 remove(ANNOTATION_COREF_FEATURE_NAME); 671 } //while 672 }//cleanup 673 674 /** return a person name without title */ 675 protected String containTitle (String annotString, Annotation annot) 676 throws ExecutionException { 677 // get the offsets 678 Long startAnnot = annot.getStartNode().getOffset(); 679 Long endAnnot = annot.getEndNode().getOffset(); 680 681 // determine "Lookup" annotation set 682 queryFM.clear(); 683 queryFM.put("majorType", "title"); 684 AnnotationSet as = 685 nameAllAnnots.get(startAnnot,endAnnot).get("Lookup", queryFM); 686 if (as !=null && ! as.isEmpty()) { 687 List titles = new ArrayList((Set)as); 688 Collections.sort(titles, new gate.util.OffsetComparator()); 689 690 Iterator iter = titles.iterator(); 691 while (iter.hasNext()) { 692 Annotation titleAnn = (Annotation)(iter.next()); 693 694 //we've not found a title at the start offset, 695 //there's no point in looking further 696 //coz titles come first 697 if (titleAnn.getStartNode().getOffset().compareTo(startAnnot) != 0) 698 return annotString; 699 700 try { 701 // the title from the current annotation 702 String annotTitle = 703 document.getContent().getContent( 704 titleAnn.getStartNode().getOffset(), 705 titleAnn.getEndNode().getOffset() 706 ).toString(); 707 708 // eliminate the title from annotation string and return the result 709 if (annotTitle.length()<annotString.length()) { 710 //remove from the array of tokens, so then we can compare properly 711 //the remaining tokens 712 // Out.prln("Removing title from: " + annot + " with string " + annotString); 713 // Out.prln("Tokens are" + tokensMap.get(annot.getId())); 714 // Out.prln("Title is" + annotTitle); 715 ((ArrayList) tokensMap.get(annot.getId())).remove(0); 716 return annotString.substring( 717 annotTitle.length()+1,annotString.length()); 718 } 719 } catch (InvalidOffsetException ioe) { 720 throw new ExecutionException 721 ("Invalid offset of the annotation"); 722 }//try 723 }// while 724 }//if 725 return annotString; 726 727 } 728 729 /** return an organization without a designator and starting The*/ 730 protected String stripCDG (String annotString, Annotation annot){ 731 732 ArrayList tokens = (ArrayList) tokensMap.get(annot.getId()); 733 734 //strip starting The first 735 if ( ((String) ((Annotation) tokens.get(0) 736 ).getFeatures().get(STRING_FEATURE)).equalsIgnoreCase(THE_VALUE)) 737 tokens.remove(0); 738 739 //no need to check for cdg if there is only 1 token or less 740 if (tokens.size()<2 && cdg.contains(((Annotation) tokens.get(tokens.size()-1) 741 ).getFeatures().get(STRING_FEATURE)) ) 742 tokens.remove(tokens.size()-1); 743 744 StringBuffer newString = new StringBuffer(50); 745 for (int i = 0; i < tokens.size(); i++){ 746 newString.append((String) ((Annotation) tokens.get(i) 747 ).getFeatures().get(STRING_FEATURE) ); 748 if (i != tokens.size()-1) 749 newString.append(" "); 750 } 751 752 if (caseSensitive) 753 return newString.toString(); 754 755 return newString.toString().toLowerCase(); 756 } 757 758 /* 759 public void check() throws ExecutionException { 760 if (executionException != null) { 761 ExecutionException e = executionException; 762 executionException = null; 763 throw e; 764 } 765 } // check() 766 */ 767 768 /** if ( == false) then reads the names of files in order 769 * to create the lookup tables 770 */ 771 protected void createLists() throws IOException { 772 InputStream inputStream = Files.getGateResourceAsStream( 773 "creole/namematcher/listsNM.def"); 774 InputStreamReader inputStreamReader = new InputStreamReader ( 775 inputStream); 776 BufferedReader bufferedReader = new BufferedReader(inputStreamReader); 777 778 String lineRead = null; 779 while ((lineRead = bufferedReader.readLine()) != null){ 780 int index = lineRead.indexOf(":"); 781 if (index != -1){ 782 String nameFile = lineRead.substring(0,index); 783 String nameList = lineRead.substring(index+1,lineRead.length()); 784 createAnnotList(nameFile,nameList); 785 }// if 786 }//while 787 bufferedReader.close(); 788 inputStreamReader.close(); 789 inputStream.close(); 790 }// createLists() 791 792 /** creates the lookup tables */ 793 protected void createAnnotList(String nameFile,String nameList) 794 throws IOException{ 795 InputStream inputStream = Files.getGateResourceAsStream( 796 "creole/namematcher/"+nameFile); 797 InputStreamReader inputStreamReader = new InputStreamReader ( 798 inputStream); 799 BufferedReader bufferedReader = new BufferedReader(inputStreamReader); 800 801 String lineRead = null; 802 while ((lineRead = bufferedReader.readLine()) != null){ 803 if (nameList.compareTo(CDGLISTNAME)==0){ 804 if (caseSensitive) 805 cdg.add(lineRead); 806 else 807 cdg.add(lineRead.toLowerCase()); 808 }// if 809 else { 810 int index = lineRead.indexOf("£"); 811 if (index != -1){ 812 String expr = lineRead.substring(0,index); 813 //if not case-sensitive, we need to downcase all strings 814 if (!caseSensitive) 815 expr = expr.toLowerCase(); 816 String code = lineRead.substring(index+1,lineRead.length()); 817 if (nameList.equals(ALIASLISTNAME)) 818 alias.put(expr, code); 819 else 820 if (nameList.equals(ARTLISTNAME)) 821 def_art.put(expr, code); 822 else 823 if (nameList.equals(PREPLISTNAME)) 824 prepos.put(expr, code); 825 else 826 if (nameList.equals(CONNECTORLISTNAME)) 827 connector.put(expr, code); 828 else 829 if (nameList.equals(SPURLISTNAME)) 830 spur_match.put(expr, code); 831 832 }//if 833 }// else 834 835 }//while 836 }//createAnnotList 837 838 839 /** apply_rules_namematch: apply rules similarly to lasie1.5's namematch */ 840 private boolean apply_rules_namematch(String annotationType, String shortName, 841 String longName) { 842 // first apply rule for spurius matches i.e. rule0 843 if (matchRule0(longName, shortName)) 844 return false; 845 if ( 846 (// rules for all annotations 847 //no longer use rule1, coz I do the check for same string via the 848 //hash table 849 matchRule2(longName, shortName) 850 || 851 matchRule3(longName, shortName) 852 ) // rules for all annotations 853 || 854 (// rules for organisation annotations 855 ( annotationType.equals(organizationType) 856 //ACE addition 857 || annotationType.equals("Facility")) 858 && 859 ( matchRule4(longName, shortName) 860 || 861 matchRule5(longName, shortName) 862 || 863 matchRule6(longName, shortName) 864 || 865 matchRule7(longName, shortName) 866 || 867 // matchRule8(longName, shortName) 868 // || 869 matchRule9(longName, shortName) 870 || 871 matchRule10(longName, shortName) 872 || 873 matchRule11(longName, shortName) 874 || 875 matchRule12(longName, shortName) 876 || 877 matchRule13(shortName, longName) 878 ) 879 )// rules for organisation annotations 880 || 881 (// rules for person annotations 882 ( annotationType.equals(personType)) 883 && 884 ( matchRule4(longName, shortName) 885 || 886 matchRule5(longName, shortName) 887 || 888 matchRule14(longName, shortName) 889 || //kalina: added this, so it matches names when contain more 890 //than one first and one last name 891 matchRule15(longName, shortName) 892 ) 893 )// rules for person annotations 894 ) //if 895 return true; 896 return false; 897 }//apply_rules 898 899 900 /** set the extLists flag */ 901 public void setExtLists(Boolean newExtLists) { 902 extLists = newExtLists.booleanValue(); 903 }//setextLists 904 905 /** set the caseSensitive flag */ 906 public void setCaseSensitive(Boolean newCase) { 907 caseSensitive = newCase.booleanValue(); 908 }//setextLists 909 910 /** set the annotation set name*/ 911 public void setAnnotationSetName(String newAnnotationSetName) { 912 annotationSetName = newAnnotationSetName; 913 }//setAnnotationSetName 914 915 /** set the types of the annotations*/ 916 public void setAnnotationTypes(List newType) { 917 annotationTypes = newType; 918 }//setAnnotationTypes 919 920 public void setOrganizationType(String newOrganizationType) { 921 organizationType = newOrganizationType; 922 }//setOrganizationType 923 924 public void setPersonType(String newPersonType) { 925 personType = newPersonType; 926 }//setPersonType 927 928 /**get the name of the annotation set*/ 929 public String getAnnotationSetName() { 930 return annotationSetName; 931 }//getAnnotationSetName 932 933 /** get the types of the annotation*/ 934 public List getAnnotationTypes() { 935 return annotationTypes; 936 }//getAnnotationTypes 937 938 public String getOrganizationType() { 939 return organizationType; 940 } 941 942 public String getPersonType() { 943 return personType; 944 } 945 946 public Boolean getExtLists() { 947 return new Boolean(extLists); 948 } 949 950 public Boolean getCaseSensitive() { 951 return new Boolean(caseSensitive); 952 } 953 954 /* 955 public List getMatchesDocument() { 956 return matchesDocument; 957 } 958 */ 959 960 protected boolean isUnknownGender(String gender) { 961 if (gender == null) 962 return true; 963 if (gender.equalsIgnoreCase("male") || gender.equalsIgnoreCase("female")) 964 return false; 965 return true; 966 967 } //isUnknownGender 968 969 /** RULE #0: If the two names are listed in table of 970 * spurius matches then they do NOT match 971 * Condition(s): - 972 * Applied to: all name annotations 973 */ 974 public boolean matchRule0(String s1, 975 String s2) { 976 if (spur_match.containsKey(s1) 977 && spur_match.containsKey(s2) ) 978 return 979 spur_match.get(s1).toString().equals(spur_match.get(s2).toString()); 980 981 return false; 982 }//matchRule0 983 984 /** RULE #1: If the two names are identical then they are the same 985 * no longer used, because I do the check for same string via the 986 * hash table of previous annotations 987 * Condition(s): depend on case 988 * Applied to: all name annotations 989 */ 990 public boolean matchRule1(String s1, 991 String s2, 992 boolean matchCase) { 993 // Out.prln("Rule1: Matching " + s1 + "and " + s2); 994 995 boolean matched = false; 996 if (!matchCase) 997 matched = s1.equalsIgnoreCase(s2); 998 else matched = s1.equals(s2) ; 999 //kalina: do not remove, nice for debug 1000// if (matched && (s2.equalsIgnoreCase("m") || s1.equalsIgnoreCase("m"))) 1001// Out.prln("Rule1: Matched " + s1 + "and " + s2); 1002 return matched; 1003 }//matchRule1 1004 1005 1006 /** 1007 * RULE #2: if the two names are listed as equivalent in the 1008 * lookup table (alias) then they match 1009 * Condition(s): - 1010 * Applied to: all name annotations 1011 */ 1012 public boolean matchRule2(String s1, 1013 String s2) { 1014 1015 if (alias.containsKey(s1) && alias.containsKey(s2)) 1016 return (alias.get(s1).toString().equals(alias.get(s2).toString())); 1017 1018 return false; 1019 }//matchRule2 1020 1021 /** 1022 * RULE #3: adding a possessive at the end 1023 * of one name causes a match 1024 * e.g. "Standard and Poor" == "Standard and Poor's" 1025 * and also "Standard and Poor" == "Standard's" 1026 * Condition(s): case-insensitive match 1027 * Applied to: all name annotations 1028 */ 1029 public boolean matchRule3(String s1, //long string 1030 String s2) { //short string 1031 1032 if (s2.endsWith("'s") || s2.endsWith("'") 1033 ||(s1.endsWith("'s")|| s1.endsWith("'"))) { 1034 1035 1036 String s2_poss = null; 1037 1038 if (!s2.endsWith("'s")) s2_poss = s2.concat("'s"); 1039 else s2_poss = s2.concat("'"); 1040 1041 if (s2_poss != null && matchRule1(s1, s2_poss,caseSensitive)) return true; 1042 1043 // now check the second case i.e. "Standard and Poor" == "Standard's" 1044 String token = (String) 1045 ((Annotation) tokensLongAnnot.get(0)).getFeatures().get(STRING_FEATURE); 1046 1047 if (!token.endsWith("'s")) s2_poss = token.concat("'s"); 1048 else s2_poss = token.concat("'"); 1049 1050 if (s2_poss != null && matchRule1(s2_poss,s2,caseSensitive)) return true; 1051 1052 } // if (s2.endsWith("'s") 1053 return false; 1054 }//matchRule3 1055 1056 /** 1057 * RULE #4: Do all tokens other than the punctuation marks 1058 * , and . match? 1059 * e.g. "Smith, Jones" == "Smith Jones" 1060 * Condition(s): case-insensitive match 1061 * Applied to: organisation and person annotations 1062 */ 1063 public boolean matchRule4(String s1, 1064 String s2) { 1065 1066 boolean allTokensMatch = true; 1067 1068 Iterator tokensLongAnnotIter = tokensLongAnnot.iterator(); 1069 Iterator tokensShortAnnotIter = tokensShortAnnot.iterator(); 1070 while (tokensLongAnnotIter.hasNext() && tokensShortAnnotIter.hasNext()) { 1071 Annotation token = (Annotation) tokensLongAnnotIter.next(); 1072 if (((String)token.getFeatures().get(KIND_FEATURE)).equals("punctuation")) 1073 continue; 1074// Out.prln("Matching" + tokensLongAnnot + " with " + tokensShortAnnot); 1075 if (! token.getFeatures().get(STRING_FEATURE).equals( 1076 ((Annotation) tokensShortAnnotIter.next()).getFeatures().get(STRING_FEATURE))) { 1077 allTokensMatch = false; 1078 break; 1079 } // if (!tokensLongAnnot.nextToken() 1080 } // while 1081// Out.prln("result is: " + allTokensMatch); 1082 return allTokensMatch; 1083 }//matchRule4 1084 1085 /** 1086 * RULE #5: if the 1st token of one name 1087 * matches the second name 1088 * e.g. "Pepsi Cola" == "Pepsi" 1089 * Condition(s): case-insensitive match 1090 * Applied to: all name annotations 1091 */ 1092 public boolean matchRule5(String s1, 1093 String s2) { 1094 1095 //do not match numbers by this rule 1096 if (tokensLongAnnot.size()> 1 && 1097 ((Annotation) tokensLongAnnot.get(0)).getFeatures().get("kind").equals("number")) 1098 return false; 1099 1100// if (s1.equalsIgnoreCase("chin") || s2.equalsIgnoreCase("chin")) 1101// Out.prln("Rule 5: " + s1 + "and " + s2); 1102 if (tokensLongAnnot.size()>1) 1103 return matchRule1((String) 1104 ((Annotation) tokensLongAnnot.get(0)).getFeatures().get(STRING_FEATURE), 1105 s2, 1106 caseSensitive); 1107 1108 return false; 1109 1110 }//matchRule5 1111 1112 /** 1113 * RULE #6: if one name is the acronym of the other 1114 * e.g. "Imperial Chemical Industries" == "ICI" 1115 * Applied to: organisation annotations only 1116 */ 1117 public boolean matchRule6(String s1, 1118 String s2) { 1119 1120 int i = 0; 1121 1122 //check and if the shorted string has a space in it, then it's not 1123 //an acronym 1124 if (s2.indexOf(" ") > 0) 1125 return false; 1126 1127 //Out.prln("Acronym: Matching " + s1 + "and " + s2); 1128 StringBuffer acronym_s1 = new StringBuffer(""); 1129 StringBuffer acronymDot_s1 = new StringBuffer(""); 1130 1131 for ( ;i < tokensLongAnnot.size(); i++ ) { 1132 String toAppend = ( (String) ((Annotation) tokensLongAnnot.get(i) 1133 ).getFeatures().get(STRING_FEATURE)).substring(0,1); 1134 acronym_s1.append(toAppend); 1135 acronymDot_s1.append(toAppend); 1136 acronymDot_s1.append("."); 1137 } 1138 1139 //Out.prln("Acronym dot: To Match " + acronymDot_s1 + "and " + s2); 1140 //Out.prln("Result: " + matchRule1(acronymDot_s1.toString(),s2,caseSensitive)); 1141 1142 if (matchRule1(acronym_s1.toString(),s2,caseSensitive) || 1143 matchRule1(acronymDot_s1.toString(),s2,caseSensitive) ) 1144 return true; 1145 1146 return false; 1147 }//matchRule6 1148 1149 /** 1150 * RULE #7: if one of the tokens in one of the 1151 * names is in the list of separators eg. "&" 1152 * then check if the token before the separator 1153 * matches the other name 1154 * e.g. "R.H. Macy & Co." == "Macy" 1155 * Condition(s): case-sensitive match 1156 * Applied to: organisation annotations only 1157 */ 1158 public boolean matchRule7(String s1, 1159 String s2) { 1160 1161 //don't try it unless the second string is just one token 1162 if (tokensShortAnnot.size() != 1) 1163 return false; 1164 1165 String previous_token = null; 1166 1167 for (int i = 0; i < tokensLongAnnot.size(); i++ ) { 1168 if (connector.containsKey( ((Annotation) tokensLongAnnot.get(i) 1169 ).getFeatures().get(STRING_FEATURE) )) { 1170 previous_token = (String) ((Annotation) tokensLongAnnot.get(i-1) 1171 ).getFeatures().get(STRING_FEATURE); 1172 1173 break; 1174 } 1175 } 1176 1177 //now match previous_token with other name 1178 if (previous_token != null) { 1179// if (s1.equalsIgnoreCase("chin") || s2.equalsIgnoreCase("chin")) 1180// Out.prln("Rule7"); 1181 return matchRule1(previous_token,s2,caseSensitive); 1182 1183 } 1184 return false; 1185 }//matchRule7 1186 1187 /** 1188 * This rule is now obsolete, as The and the trailing CDG 1189 * are stripped before matching. 1190 * DO NOT CALL!!! 1191 * 1192 * RULE #8: if the names match, ignoring The and 1193 * and trailing company designator (which have already been stripped) 1194 * e.g. "The Magic Tricks Co." == "Magic Tricks" 1195 * Condition(s): case-sensitive match 1196 * Applied to: organisation annotations only 1197 */ 1198 public boolean matchRule8(String s1, 1199 String s2) { 1200 Out.prln("OrthoMatcher warning: This rule has been discontinued!"); 1201/* 1202 if (s1.startsWith("The ")) s1 = s1.substring(4); 1203 if (s2.startsWith("The ")) s2 = s2.substring(4); 1204 1205 // check that cdg is not empty 1206 if (!cdg.isEmpty()) { 1207 String stringToTokenize1 = s1; 1208 StringTokenizer tokensLongAnnot = new StringTokenizer(stringToTokenize1," "); 1209 1210 String stringToTokenize2 = s2; 1211 StringTokenizer tokensShortAnnot = new StringTokenizer(stringToTokenize2," "); 1212 String token = null; 1213 String cdg1 = null; 1214 String cdg2 = null; 1215 1216 s1 = ""; 1217 s2 = ""; 1218 1219 //check last token of s1 1220 while (tokensLongAnnot.hasMoreTokens()) { 1221 token = tokensLongAnnot.nextToken(); 1222 if (!tokensLongAnnot.hasMoreTokens() 1223 && cdg.contains(token)) cdg1=token; 1224 else s1 = s1+token; 1225 } 1226 1227 // do the same for s2 1228 while (tokensShortAnnot.hasMoreTokens()) { 1229 token = tokensShortAnnot.nextToken(); 1230 if (!tokensShortAnnot.hasMoreTokens() 1231 && cdg.contains(token)) cdg2=token; 1232 else s2 = s2+token; 1233 } 1234 1235 // if the company designators are different 1236 // then they are NOT the same organisations 1237 if ((cdg1!=null && cdg2!=null) 1238 && !cdg1.equalsIgnoreCase(cdg2)) return false; 1239 } 1240 if (!s1.equals("") && !s2.equals("")) return matchRule1(s1,s2,caseSensitive); 1241*/ 1242 return false; 1243 1244 }//matchRule8 1245 1246 /** 1247 * RULE #9: does one of the names match the token 1248 * just before a trailing company designator 1249 * in the other name? 1250 * The company designator has already been chopped off, 1251 * so the token before it, is in fact the last token 1252 * e.g. "R.H. Macy Co." == "Macy" 1253 * Applied to: organisation annotations only 1254 */ 1255 public boolean matchRule9(String s1, 1256 String s2) { 1257 1258// if (s1.equalsIgnoreCase("news") || s2.equalsIgnoreCase("news")) 1259// Out.prln("Rule 9 " + s1 + " and " + s2); 1260 String s1_short = (String) 1261 ((Annotation) tokensLongAnnot.get( 1262 tokensLongAnnot.size()-1)).getFeatures().get(STRING_FEATURE); 1263// Out.prln("Converted to " + s1_short); 1264 if (tokensLongAnnot.size()>1) { 1265 boolean matched = matchRule1(s1_short, s2, caseSensitive); 1266 //we need to make sure all names match, instead of assuming transitivity, 1267 //to avoid matching BBC News with News then News with ITV News, which 1268 //by transitivity leads to BBC News matching ITV News which is not what 1269 //we want 1270 if (matched) 1271 allMatchingNeeded = true; 1272 return matched; 1273 } //if 1274 1275 return false; 1276 }//matchRule9 1277 1278 /** 1279 * RULE #10: is one name the reverse of the other 1280 * reversing around prepositions only? 1281 * e.g. "Department of Defence" == "Defence Department" 1282 * Condition(s): case-sensitive match 1283 * Applied to: organisation annotations only 1284 */ 1285 public boolean matchRule10(String s1, 1286 String s2) { 1287 1288 String token = null; 1289 String previous_token = null; 1290 String next_token = null; 1291 boolean invoke_rule=false; 1292 1293 if (tokensLongAnnot.size() >= 3 1294 && tokensShortAnnot.size() >= 2) { 1295 1296 // first get the tokens before and after the preposition 1297 int i = 0; 1298 for (; i< tokensLongAnnot.size(); i++) { 1299 token = (String) 1300 ((Annotation) tokensLongAnnot.get(i)).getFeatures().get(STRING_FEATURE); 1301 if (prepos.containsKey(token)) { 1302 invoke_rule=true; 1303 break; 1304 }//if 1305 previous_token = token; 1306 }//while 1307 1308 if (! invoke_rule) 1309 return false; 1310 1311 if (i < tokensLongAnnot.size() 1312 && previous_token != null) 1313 next_token= (String) 1314 ((Annotation) tokensLongAnnot.get(i++)).getFeatures().get(STRING_FEATURE); 1315 else return false; 1316 1317 String s21 = (String) 1318 ((Annotation) tokensShortAnnot.get(0)).getFeatures().get(STRING_FEATURE); 1319 String s22 = (String) 1320 ((Annotation) tokensShortAnnot.get(1)).getFeatures().get(STRING_FEATURE); 1321 // then compare (in reverse) with the first two tokens of s2 1322 if (matchRule1(next_token,(String) s21,caseSensitive) 1323 && matchRule1(previous_token, s22,caseSensitive)) 1324 return true ; 1325 }//if (tokensLongAnnot.countTokens() >= 3 1326 return false; 1327 }//matchRule10 1328 1329 /** 1330 * RULE #11: does one name consist of contractions 1331 * of the first two tokens of the other name? 1332 * e.g. "Communications Satellite" == "ComSat" 1333 * and "Pan American" == "Pan Am" 1334 * Condition(s): case-sensitive match 1335 * Applied to: organisation annotations only 1336 */ 1337 public boolean matchRule11(String s1, 1338 String s2) { 1339 1340 1341 // first do the easy case e.g. "Pan American" == "Pan Am" 1342 1343 String token11 = null; 1344 String token12 = null; 1345 String token21 = null; 1346 String token22 = null; 1347 1348 if (tokensLongAnnot.size() < 2) 1349 return false; 1350 1351 // 1st get the first two tokens of s1 1352 token11 = (String) 1353 ((Annotation) tokensLongAnnot.get(0)).getFeatures().get(STRING_FEATURE); 1354 token12 = (String) 1355 ((Annotation) tokensLongAnnot.get(1)).getFeatures().get(STRING_FEATURE); 1356 1357 // now check for the first case i.e. "Pan American" == "Pan Am" 1358 if (tokensShortAnnot.size() == 2) { 1359 1360 token21 = (String) 1361 ((Annotation) tokensShortAnnot.get(0)).getFeatures().get(STRING_FEATURE); 1362 token22 = (String) 1363 ((Annotation) tokensShortAnnot.get(0)).getFeatures().get(STRING_FEATURE); 1364 1365 if (token11.startsWith(token21) 1366 && token12.startsWith(token22)) 1367 return true; 1368 1369 } // if (tokensShortAnnot.countTokens() == 2) 1370 1371 // now the second case e.g. "Communications Satellite" == "ComSat" 1372 else if (tokensShortAnnot.size()==1 && s2.length()>=3) { 1373 1374 // split the token into possible contractions 1375 // ignore case for matching 1376 for (int i=2;i<s2.length();i++) { 1377 token21=s2.substring(0,i+1); 1378 token22=s2.substring(i+1); 1379 1380 if (token11.startsWith(token21) 1381 && token12.startsWith(token22)) 1382 return true; 1383 }// for 1384 } // else if 1385 1386 return false; 1387 }//matchRule11 1388 1389 /** 1390 * RULE #12: do the first and last tokens of one name 1391 * match the first and last tokens of the other? 1392 * Condition(s): case-sensitive match 1393 * Applied to: organisation annotations only 1394 */ 1395 public boolean matchRule12(String s1, 1396 String s2) { 1397 1398 // first do the easy case e.g. "Pan American" == "Pan Am" 1399 1400 if (tokensLongAnnot.size()>1 && tokensShortAnnot.size()>1) { 1401// Out.prln("Rule 12"); 1402 1403 // get first and last tokens of s1 & s2 1404 String s1_first = (String) 1405 ((Annotation) tokensLongAnnot.get(0)).getFeatures().get(STRING_FEATURE); 1406 String s2_first = (String) 1407 ((Annotation) tokensShortAnnot.get(0)).getFeatures().get(STRING_FEATURE); 1408 1409 if (!matchRule1(s1_first,s2_first,caseSensitive)) 1410 return false; 1411 1412 String s1_last = (String) 1413 ((Annotation) tokensLongAnnot.get(tokensLongAnnot.size()-1)).getFeatures().get(STRING_FEATURE); 1414 String s2_last = (String) 1415 ((Annotation) tokensShortAnnot.get(tokensShortAnnot.size()-1)).getFeatures().get(STRING_FEATURE); 1416 1417 return matchRule1(s1_last,s2_last,caseSensitive); 1418 } // if (tokensLongAnnot.countTokens()>1 1419 return false; 1420 }//matchRule12 1421 1422 /** 1423 * RULE #13: do multi-word names match except for 1424 * one token e.g. 1425 * "Second Force Recon Company" == "Force Recon Company" 1426 * Note that this rule has NOT been used in LaSIE's 1.5 1427 * namematcher 1428 * Restrictions: - remove cdg first 1429 * - shortest name should be 2 words or more 1430 * - if N is the number of tokens of the longest 1431 * name, then N-1 tokens should be matched 1432 * Condition(s): case-sensitive match 1433 * Applied to: organisation or person annotations only 1434 */ 1435 public boolean matchRule13(String s1, 1436 String s2) { 1437 1438 1439 String token1 = null; 1440 String token2 = null; 1441 1442 int matched_tokens = 0, mismatches = 0;; 1443 1444 // if names < 2 words then rule is invalid 1445 if (tokensLongAnnot.size() < 3 || tokensShortAnnot.size() < 2) return false; 1446 1447// if (s1.equalsIgnoreCase("chin") || s2.equalsIgnoreCase("chin")) { 1448// Out.prln("Rule 13: Matching tokens" + tokensLongAnnot); 1449// Out.prln("with tokens " + tokensShortAnnot); 1450// } 1451 1452 // now do the matching 1453 for (int i=0,j= 0; i < tokensShortAnnot.size() && mismatches < 2; i++) { 1454 1455// Out.prln("i = " + i); 1456// Out.prln("j = " + j); 1457 if ( ((Annotation) tokensLongAnnot.get(j)).getFeatures().get(STRING_FEATURE).equals( 1458 ((Annotation) tokensShortAnnot.get(i)).getFeatures().get(STRING_FEATURE)) ) { 1459 matched_tokens++; 1460 j++; 1461 } else 1462 mismatches++; 1463 } // for 1464 1465 if (matched_tokens >= tokensLongAnnot.size()-1) 1466 return true; 1467 1468 return false; 1469 }//matchRule13 1470 1471 /** 1472 * RULE #14: if the last token of one name 1473 * matches the second name 1474 * e.g. "Hamish Cunningham" == "Cunningham" 1475 * Condition(s): case-insensitive match 1476 * Applied to: all person annotations 1477 */ 1478 public boolean matchRule14(String s1, 1479 String s2) { 1480 1481// if (s1.equalsIgnoreCase("chin") || s2.equalsIgnoreCase("chin")) 1482// Out.prln("Rule 14 " + s1 + " and " + s2); 1483 String s1_short = (String) 1484 ((Annotation) tokensLongAnnot.get( 1485 tokensLongAnnot.size()-1)).getFeatures().get(STRING_FEATURE); 1486// Out.prln("Converted to " + s1_short); 1487 if (tokensLongAnnot.size()>1) 1488 return matchRule1(s1_short, 1489 s2, 1490 caseSensitive); 1491 1492 return false; 1493 1494 }//matchRule14 1495 1496 /** 1497 * RULE #15: does one token from a Person name appear as the other token 1498 * Note that this rule has NOT been used in LaSIE's 1.5 1499 * namematcher; added for ACE by Di's request 1500 * Applied to: organisation annotations only 1501 */ 1502 public boolean matchRule15(String s1, 1503 String s2) { 1504 1505 1506 String token1 = null; 1507 String token2 = null; 1508 1509 int matched_tokens = 0; 1510 1511 // if names < 2 words then rule is invalid 1512 1513// if (s1.equalsIgnoreCase("chin") || s2.equalsIgnoreCase("chin")) { 1514// Out.prln("Rule 15:" ); 1515// Out.prln("with tokens " + tokensShortAnnot); 1516// } 1517 1518 // now do the matching 1519 for (int i=0; i < tokensShortAnnot.size() && matched_tokens == 0; i++) { 1520 1521 for (int j=0; j<tokensLongAnnot.size() && matched_tokens ==0; j++) 1522// Out.prln("i = " + i); 1523 if ( ((Annotation) tokensLongAnnot.get(j)).getFeatures( 1524 ).get(STRING_FEATURE).equals( 1525 ((Annotation) tokensShortAnnot.get(i)).getFeatures( 1526 ).get(STRING_FEATURE)) ) { 1527 matched_tokens++; 1528 } 1529 } // for 1530 1531 if (matched_tokens > 0) 1532 return true; 1533 1534 return false; 1535 }//matchRule15 1536 1537 /** Tables for namematch info 1538 * (used by the namematch rules) 1539 */ 1540 private void buildTables(AnnotationSet nameAllAnnots) { 1541 1542 //reset the tables first 1543 cdg.clear(); 1544 1545 if (! extLists) { 1546 // i.e. get cdg from Lookup annotations 1547 // get all Lookup annotations 1548 tempMap.clear(); 1549 tempMap.put("majorType", "cdg"); 1550 //now get all lookup annotations which are cdg 1551 AnnotationSet nameAnnots = 1552 nameAllAnnots.get(LOOKUPNAME, tempMap); 1553 1554 if ((nameAnnots ==null) || nameAnnots.isEmpty()) 1555 return; 1556 1557 Iterator iter = nameAnnots.iterator(); 1558 while (iter.hasNext()) { 1559 Annotation annot = (Annotation)iter.next(); 1560 // get the actual string 1561 Long offsetStartAnnot = annot.getStartNode().getOffset(); 1562 Long offsetEndAnnot = annot.getEndNode().getOffset(); 1563 try { 1564 gate.Document doc = nameAllAnnots.getDocument(); 1565 String annotString = 1566 doc.getContent().getContent( 1567 offsetStartAnnot,offsetEndAnnot 1568 ).toString(); 1569 cdg.add(annotString); 1570 } catch (InvalidOffsetException ioe) { 1571 ioe.printStackTrace(Err.getPrintWriter()); 1572 } 1573 }// while 1574 }//if 1575 }//buildTables 1576 1577 /** substitute all multiple spaces, tabes and newlines 1578 * with a single space 1579 */ 1580 public String regularExpressions ( String text, String replacement, 1581 String regEx) { 1582 String result = text; 1583 try { 1584 RE re = new RE(regEx); 1585 result = re.substituteAll( text,replacement); 1586 } catch (REException ree) {ree.printStackTrace();} 1587 return result; 1588 }//regularExpressions 1589 1590 1591 private static class Class1 { 1592 } 1593} // public class OrthoMatcher 1594 1595
|
OrthoMatcher |
|