|
OrthoMatcher |
|
/*
 *  OrthoMatcher.java
 *
 *  Copyright (c) 1998-2001, The University of Sheffield.
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  Kalina Bontcheva, 24/August/2001
 *
 *  $Id: OrthoMatcher.java,v 1.43 2002/04/12 14:35:22 kalina Exp $
 */


package gate.creole.orthomatcher;

import gate.*;
import gate.util.*;
import gate.creole.*;
import gate.corpora.*;
import gate.annotation.*;
import java.util.*;
import java.io.*;
import java.net.*;
import gnu.regexp.*;

/**
 * Language analyser that links annotations which refer to the same name.
 * execute() compares the annotations of the configured types (and,
 * optionally, the annotations of the unknown type) using a set of
 * orthographic rules. Matching annotations share one list of annotation
 * ids, stored under ANNOTATION_COREF_FEATURE_NAME on each annotation; all
 * such lists for an annotation set are stored on the document under
 * DOCUMENT_COREF_FEATURE_NAME.
 */
public class OrthoMatcher extends AbstractLanguageAnalyser
                          implements ANNIEConstants{

  // Parameter-name constants. Most values equal the property names of the
  // set.../get... accessors defined further down in this class.
  public static final String
    OM_DOCUMENT_PARAMETER_NAME = "document";

  public static final String
    OM_ANN_SET_PARAMETER_NAME = "annotationSetName";

  public static final String
    OM_CASE_SENSITIVE_PARAMETER_NAME = "caseSensitive";

  public static final String
    OM_ANN_TYPES_PARAMETER_NAME = "annotationTypes";

  public static final String
    OM_ORG_TYPE_PARAMETER_NAME = "organizationType";

  public static final String
    OM_PERSON_TYPE_PARAMETER_NAME = "personType";

  public static final String
    OM_EXT_LISTS_PARAMETER_NAME = "extLists";

  // List names as they appear in the list definition file; createAnnotList()
  // uses them to decide which lookup table a list file is loaded into.
  protected static final String CDGLISTNAME = "cdg";
  protected static final String ALIASLISTNAME = "alias";
  protected static final String ARTLISTNAME = "def_art";
  protected static final String PREPLISTNAME = "prepos";
  protected static final String CONNECTORLISTNAME = "connector";
  protected static final String SPURLISTNAME = "spur_match";

  /** value of the token kind feature that marks punctuation tokens */
  protected static final String PUNCTUATION_VALUE = "punctuation";
  /** leading article that stripCDG() removes from organization names */
  protected static final String THE_VALUE = "The";


  /** the name of the annotation set; null or "" selects the default set */
  protected String annotationSetName;

  /** the types of the annotations to be matched (the constructor adds the
   *  default types)
   */
  protected List annotationTypes = new ArrayList(10);

  /** the organization type*/
  protected String organizationType = ORGANIZATION_ANNOTATION_TYPE;

  /** the person type*/
  protected String personType = PERSON_ANNOTATION_TYPE;

  /** the type of the annotations that are matched against the named ones
   *  and re-typed when a match is found
   */
  protected String unknownType = "Unknown";

  /** internal or external list: when false, execute() calls buildTables()
   *  on the annotations of the document before matching
   */
  protected boolean extLists = true;

  /** matching unknowns or not*/
  protected boolean matchingUnknowns = true;

  /** This is an internal variable to indicate whether
   *  we matched using a rule that requires that
   *  the newly matched annotation matches all the others
   *  This is needed, because organizations can share
   *  first/last tokens like News and be different
   */
  private   boolean allMatchingNeeded = false;

  /** Orthomatching is not case-sensitive by default */
  protected boolean caseSensitive = false;

  /** feature map used by containTitle() to query for title Lookups */
  protected FeatureMap queryFM = Factory.newFeatureMap();

//  protected ExecutionException executionException;

  // name lookup tables (used for namematch)
  //gave them bigger default size, coz rehash is expensive
  protected HashMap alias = new HashMap(100);
  protected HashSet cdg = new HashSet(50);
  protected HashMap spur_match = new HashMap(100);
  protected HashMap def_art = new HashMap(20);
  protected HashMap connector = new HashMap(20);
  protected HashMap prepos = new HashMap(30);


  /** the annotation set being processed; only set while execute() runs */
  protected AnnotationSet nameAllAnnots = null;
  /** maps the id of each processed annotation to its (normalised) string */
  protected HashMap processedAnnots = new HashMap(150);
  /** maps the id of a matched unknown annotation to the type it is to be
   *  re-created with
   */
  protected HashMap annots2Remove = new HashMap(75);
  /** the match lists created during the current run */
  protected List matchesDocFeature = new ArrayList();
  //maps annotation ids to array lists of tokens
  protected HashMap tokensMap = new HashMap(150);

  /** the pair of annotations currently being compared, ordered by the
   *  length of their strings
   */
  protected Annotation shortAnnot, longAnnot;

  /** the token lists of longAnnot and shortAnnot */
  protected ArrayList tokensLongAnnot, tokensShortAnnot;

  /** a feature map to be used when retrieving annotations
   *  declared here so can be reused for efficiency
   *
clear() before each use 123 */ 124 protected FeatureMap tempMap = Factory.newFeatureMap(); 125 126 /** a buffer in order to read an array of char */ 127 private char[] cbuffer = null; 128 129 /** the size of the buffer */ 130 private final static int BUFF_SIZE = 65000; 131 132 /** @link dependency */ 133 /*#OrthoMatcher lnkOrthoMatcher;*/ 134 135 public OrthoMatcher () { 136 annotationTypes.add(organizationType); 137 annotationTypes.add(personType); 138 annotationTypes.add("Location"); 139 annotationTypes.add("Date"); 140 } 141 142 /** Initialise this resource, and return it. */ 143 public Resource init() throws ResourceInstantiationException { 144 cbuffer = new char[BUFF_SIZE]; 145 146 //initialise the list of annotations which we will match 147 try { 148 createLists(); 149 } catch (IOException ioe) {ioe.printStackTrace();} 150 return this; 151 } // init() 152 153 /** Run the resource. It doesn't make sense not to override 154 * this in subclasses so the default implementation signals an 155 * exception. 156 */ 157 public void execute() throws ExecutionException{ 158 159 //check the input 160 if(document == null) { 161 throw new ExecutionException( 162 "No document for namematch!" 163 ); 164 } 165 166 // get the annotations from document 167 if ((annotationSetName == null)|| (annotationSetName.equals(""))) 168 nameAllAnnots = document.getAnnotations(); 169 else 170 nameAllAnnots = document.getAnnotations(annotationSetName); 171 172 //if none found, print warning and exit 173 if ((nameAllAnnots == null) || nameAllAnnots.isEmpty()) { 174 Out.prln("OrthoMatcher Warning: No annotations found for processing"); 175 return; 176 } 177 178 //check if we've been run on this document before 179 //and clean the doc if needed 180 docCleanup(); 181 Map matchesMap = (Map)document.getFeatures(). 
182 get(DOCUMENT_COREF_FEATURE_NAME); 183 184 // creates the cdg list from the document 185 //no need to create otherwise, coz already done in init() 186 if (!extLists) 187 buildTables(nameAllAnnots); 188 189 //first match all name annotations 190 matchNameAnnotations(); 191 192 //then match the unknown ones to all name ones 193 if (matchingUnknowns) 194 matchUnknown(); 195 196 // set the matches of the document 197 // determineMatchesDocument(); 198 if (! matchesDocFeature.isEmpty()) { 199 if(matchesMap == null){ 200 matchesMap = new HashMap(); 201 } 202 matchesMap.put(nameAllAnnots.getName(), matchesDocFeature); 203 //we need to put it even if it was already present in order to triger 204 //the update events 205 document.getFeatures().put(DOCUMENT_COREF_FEATURE_NAME, matchesMap); 206 207 //cannot do clear() as this has already been put on the document 208 //so I need a new one for the next run of matcher 209 matchesDocFeature = new ArrayList(); 210 } 211 212 // Out.prln("Processed strings" + processedAnnots.values()); 213 //clean-up the internal data structures for next run 214 nameAllAnnots = null; 215 processedAnnots.clear(); 216 annots2Remove.clear(); 217 tokensMap.clear(); 218 matchesDocFeature = new ArrayList(); 219 longAnnot = null; 220 shortAnnot = null; 221 tokensLongAnnot = null; 222 tokensShortAnnot = null; 223 224 } // run() 225 226 protected void matchNameAnnotations() throws ExecutionException{ 227 // go through all the annotation types 228 Iterator iterAnnotationTypes = annotationTypes.iterator(); 229 while (iterAnnotationTypes.hasNext()) { 230 String annotationType = (String)iterAnnotationTypes.next(); 231 232 AnnotationSet nameAnnots = nameAllAnnots.get(annotationType); 233 234 // continue if no such annotations exist 235 if ((nameAnnots == null) || nameAnnots.isEmpty()) 236 continue; 237 238 Iterator iterNames = nameAnnots.iterator(); 239 while (iterNames.hasNext()) { 240 Annotation nameAnnot = (Annotation) iterNames.next(); 241 Integer id = 
nameAnnot.getId(); 242 243 // get string and value 244 String annotString = null; 245 try { 246 annotString = document.getContent().getContent( 247 nameAnnot.getStartNode().getOffset(), 248 nameAnnot.getEndNode().getOffset() 249 ).toString(); 250 // now do the reg. exp. substitutions 251 annotString = regularExpressions(annotString," ", "\\s+"); 252 253 } catch (InvalidOffsetException ioe) { 254 throw new ExecutionException 255 ("Invalid offset of the annotation"); 256 } 257 //convert to lower case if we are not doing a case sensitive match 258 if (!caseSensitive) 259 annotString = annotString.toLowerCase(); 260 261 //get the tokens 262 List tokens = new ArrayList((Set) 263 nameAllAnnots.get(TOKEN_ANNOTATION_TYPE, 264 nameAnnot.getStartNode().getOffset(), 265 nameAnnot.getEndNode().getOffset() 266 )); 267 //if no tokens to match, do nothing 268 if (tokens.isEmpty()) 269 continue; 270 Collections.sort(tokens, new gate.util.OffsetComparator()); 271 //check if these actually do not end after the name 272 //needed coz new tokeniser conflates 273 //strings with dashes. So British Gas-style is two tokens 274 //instead of three. 
So cannot match properly British Gas 275 // tokens = checkTokens(tokens); 276 tokensMap.put(nameAnnot.getId(), tokens); 277 278 // Out.prln("Matching annot " + nameAnnot + ": string " + annotString); 279 280 //first check whether we have not matched such a string already 281 //if so, just consider it matched, don't bother calling the rules 282 if (processedAnnots.containsValue(annotString)) { 283 // Out.prln("Contained string found " + annotString); 284 updateMatches(nameAnnot, annotString); 285 processedAnnots.put(nameAnnot.getId(), annotString); 286 continue; 287 } else if (processedAnnots.isEmpty()) { 288 processedAnnots.put(nameAnnot.getId(), annotString); 289 continue; 290 } 291 292 //if a person, then remove their title before matching 293 if (nameAnnot.getType().equals(personType)) 294 annotString = containTitle(annotString, nameAnnot); 295 else if (nameAnnot.getType().equals(organizationType)) 296 annotString = stripCDG(annotString, nameAnnot); 297 298 if(null == annotString || "".equals(annotString)) 299 continue; 300 301 //otherwise try matching with previous annotations 302 matchWithPrevious(nameAnnot, annotString); 303 304 // Out.prln("Putting in previous " + nameAnnot + ": string " + annotString); 305 //finally add the current annotations to the processed map 306 processedAnnots.put(nameAnnot.getId(), annotString); 307 }//while through name annotations 308 309 }//while through annotation types 310 311 } 312 313 protected void matchUnknown() throws ExecutionException { 314 //get all Unknown annotations 315 AnnotationSet unknownAnnots = nameAllAnnots.get(unknownType); 316 317 if ((unknownAnnots == null) || unknownAnnots.isEmpty()) 318 return; 319 320 Iterator iter = unknownAnnots.iterator(); 321 //loop through the unknown annots 322 while (iter.hasNext()) { 323 Annotation unknown = (Annotation) iter.next(); 324 325 // get string and value 326 String unknownString = null; 327 try { 328 unknownString = document.getContent().getContent( 329 
unknown.getStartNode().getOffset(), 330 unknown.getEndNode().getOffset() 331 ).toString(); 332 // now do the reg. exp. substitutions 333 unknownString = regularExpressions(unknownString," ", "\\s+"); 334 } catch (InvalidOffsetException ioe) { 335 throw new ExecutionException 336 ("Invalid offset of the annotation"); 337 } 338 //convert to lower case if we are not doing a case sensitive match 339 if (!caseSensitive) 340 unknownString = unknownString.toLowerCase(); 341 342 //get the tokens 343 List tokens = new ArrayList((Set) 344 nameAllAnnots.get(TOKEN_ANNOTATION_TYPE, 345 unknown.getStartNode().getOffset(), 346 unknown.getEndNode().getOffset() 347 )); 348 if (tokens.isEmpty()) 349 continue; 350 Collections.sort(tokens, new gate.util.OffsetComparator()); 351 tokensMap.put(unknown.getId(), tokens); 352 353 354 //first check whether we have not matched such a string already 355 //if so, just consider it matched, don't bother calling the rules 356 if (processedAnnots.containsValue(unknownString)) { 357 Annotation matchedAnnot = updateMatches(unknown, unknownString); 358 // Out.prln("Matched " + unknown + "with string " + unknownString); 359 // Out.prln("That's same as " + matchedAnnot); 360 if (matchedAnnot.getType().equals(unknownType)) { 361 annots2Remove.put(unknown.getId(), 362 annots2Remove.get(matchedAnnot.getId())); 363 } 364 else 365 annots2Remove.put(unknown.getId(), matchedAnnot.getType()); 366 processedAnnots.put(unknown.getId(), unknownString); 367 unknown.getFeatures().put("NMRule", unknownType); 368 continue; 369 } 370 371 //check if we should do sub-string matching in case it's hyphenated 372 //for example US-led 373 if (tokens.size() == 1 374 && "hyphen".equals(unknown.getFeatures().get(TOKEN_KIND_FEATURE_NAME))) { 375 if (matchHyphenatedUnknowns(unknown, unknownString, iter)) 376 continue; 377 }//if 378 379 matchWithPrevious(unknown, unknownString); 380 381 } //while though unknowns 382 383 if (! 
annots2Remove.isEmpty()) { 384 Iterator unknownIter = annots2Remove.keySet().iterator(); 385 while (unknownIter.hasNext()) { 386 Integer unknId = (Integer) unknownIter.next(); 387 Annotation unknown = nameAllAnnots.get(unknId); 388 Integer newID = nameAllAnnots.add( 389 unknown.getStartNode(), 390 unknown.getEndNode(), 391 (String) annots2Remove.get(unknId), 392 unknown.getFeatures() 393 ); 394 nameAllAnnots.remove(unknown); 395 396 //change the id in the matches list 397 List mList = (List)unknown.getFeatures(). 398 get(ANNOTATION_COREF_FEATURE_NAME); 399 mList.remove(unknId); 400 mList.add(newID); 401 }//while 402 }//if 403 } 404 405 private boolean matchHyphenatedUnknowns(Annotation unknown, String unknownString, 406 Iterator iter){ 407 boolean matched = false; 408 409 //only take the substring before the hyphen 410 int stringEnd = unknownString.indexOf("-"); 411 unknownString = unknownString.substring(0, stringEnd); 412 //check if we've already matched this string 413 //because only exact match of the substring are considered 414 if (processedAnnots.containsValue(unknownString)) { 415 matched = true; 416 Annotation matchedAnnot = updateMatches(unknown, unknownString); 417 //only do the matching if not a person, because we do not match 418 //those on sub-strings 419 iter.remove(); 420 String newType; 421 if (matchedAnnot.getType().equals(unknownType)) 422 newType = (String)annots2Remove.get(matchedAnnot.getId()); 423 else 424 newType = matchedAnnot.getType(); 425 426 Integer newID = new Integer(-1); 427 try { 428 newID = nameAllAnnots.add( 429 unknown.getStartNode().getOffset(), 430 new Long(unknown.getStartNode().getOffset().longValue() 431 + stringEnd), 432 newType, 433 unknown.getFeatures() 434 ); 435 } catch (InvalidOffsetException ex) { 436 throw new GateRuntimeException(ex.getMessage()); 437 } 438 nameAllAnnots.remove(unknown); 439 440 //change the id in the matches list 441 List mList = (List)unknown.getFeatures(). 
442 get(ANNOTATION_COREF_FEATURE_NAME); 443 mList.remove(unknown.getId()); 444 mList.add(newID); 445 446 } 447 return matched; 448 } 449 450 protected void matchWithPrevious(Annotation nameAnnot, String annotString) { 451 boolean matchedUnknown = false; 452 453 Iterator prevIter = processedAnnots.keySet().iterator(); 454 while (prevIter.hasNext()) { 455 Integer prevId = (Integer) prevIter.next(); 456 Annotation prevAnnot = nameAllAnnots.get(prevId); 457 458 //check if the two are from the same type or the new one is unknown 459 if (prevAnnot == null || (! prevAnnot.getType().equals(nameAnnot.getType()) 460 && ! nameAnnot.getType().equals(unknownType)) 461 ) 462 continue; 463 //do not compare two unknown annotations either 464 //they are only matched to those of known types 465 if ( nameAnnot.getType().equals(unknownType) 466 && prevAnnot.getType().equals(unknownType)) 467 continue; 468 469 //check if we have already matched this annotation to the new one 470 if (matchedAlready(nameAnnot, prevAnnot) ) 471 continue; 472 473 //now changed to a rule, here we just match by gender 474 if (prevAnnot.getType().equals(personType)) { 475 String prevGender = 476 (String) prevAnnot.getFeatures().get(PERSON_GENDER_FEATURE_NAME); 477 String nameGender = 478 (String) nameAnnot.getFeatures().get(PERSON_GENDER_FEATURE_NAME); 479 if ( prevGender != null 480 && nameGender != null 481 && ( (nameGender.equalsIgnoreCase("female") 482 && 483 prevGender.equalsIgnoreCase("male") 484 ) 485 || 486 (prevGender.equalsIgnoreCase("female") 487 && nameGender.equalsIgnoreCase("male") 488 ) 489 ) 490 ) //if condition 491 continue; //we don't have a match if the two genders are different 492 493 }//if 494 495 //if the two annotations match 496 if (matchAnnotations(nameAnnot, annotString, prevAnnot)) { 497 // Out.prln("Matched " + shortName + "and " + longName); 498 updateMatches(nameAnnot, prevAnnot); 499 //if unknown annotation, we need to change to the new type 500 if 
(nameAnnot.getType().equals(unknownType)) { 501 matchedUnknown = true; 502 if (prevAnnot.getType().equals(unknownType)) 503 annots2Remove.put(nameAnnot.getId(), 504 annots2Remove.get(prevAnnot.getId())); 505 else 506 annots2Remove.put(nameAnnot.getId(), prevAnnot.getType()); 507 //also put an attribute to indicate that 508 nameAnnot.getFeatures().put("NMRule", unknownType); 509 }//if unknown 510 break; //no need to match further 511 }//if annotations matched 512 513 }//while through previous annotations 514 515 if (matchedUnknown) 516 processedAnnots.put(nameAnnot.getId(), annotString); 517 518 519 }//matchWithPrevious 520 521 protected boolean matchAnnotations(Annotation newAnnot, String annotString, 522 Annotation prevAnnot) { 523 //do not match two annotations that overlap 524 if (newAnnot.overlaps(prevAnnot)) 525 return false; 526 527 // find which annotation string of the two is longer 528 // this is useful for some of the matching rules 529 String prevAnnotString = (String) processedAnnots.get(prevAnnot.getId()); 530 531 String longName = prevAnnotString; 532 String shortName = annotString; 533 longAnnot = prevAnnot; 534 shortAnnot = newAnnot; 535 536 if (shortName.length()>=longName.length()) { 537 String temp = longName; 538 longName = shortName; 539 shortName = temp; 540 Annotation tempAnn = longAnnot; 541 longAnnot = shortAnnot; 542 shortAnnot = tempAnn; 543 }//if 544 545 tokensLongAnnot = (ArrayList) tokensMap.get(longAnnot.getId()); 546 tokensShortAnnot = (ArrayList) tokensMap.get(shortAnnot.getId()); 547 548 List matchesList = (List) prevAnnot.getFeatures(). 
549 get(ANNOTATION_COREF_FEATURE_NAME); 550 if (matchesList == null || matchesList.isEmpty()) 551 return apply_rules_namematch(prevAnnot.getType(), shortName,longName); 552 553 //if these two match, then let's see if all the other matching one will too 554 //that's needed, because sometimes names can share a token (e.g., first or 555 //last but not be the same 556 if (apply_rules_namematch(prevAnnot.getType(), shortName,longName)) { 557 /** 558 * Check whether we need to ensure that there is a match with the rest 559 * of the matching annotations, because the rule requires that 560 * transtivity is not assummed. 561 */ 562 if (allMatchingNeeded) { 563 allMatchingNeeded = false; 564 565 List toMatchList = new ArrayList(matchesList); 566 // if (newAnnot.getType().equals(unknownType)) 567 // Out.prln("Matching new " + annotString + " with annots " + toMatchList); 568 toMatchList.remove(prevAnnot.getId()); 569 570 return matchOtherAnnots(toMatchList, newAnnot, annotString); 571 } else 572 return true; 573 } 574 return false; 575 } 576 577 /** This method checkes whether the new annotation matches 578 * all annotations given in the toMatchList (it contains ids) 579 * The idea is that the new annotation needs to match all those, 580 * because assuming transitivity does not always work, when 581 * two different entities share a common token: e.g., BT Cellnet 582 * and BT and British Telecom. 
583 */ 584 protected boolean matchOtherAnnots( List toMatchList, Annotation newAnnot, 585 String annotString) { 586 587 //if the list is empty, then we're matching all right :-) 588 if (toMatchList.isEmpty()) 589 return true; 590 591 boolean matchedAll = true; 592 int i = 0; 593 594 while (matchedAll && i < toMatchList.size()) { 595 Annotation prevAnnot = nameAllAnnots.get((Integer) toMatchList.get(i)); 596 597 // find which annotation string of the two is longer 598 // this is useful for some of the matching rules 599 String prevAnnotString = (String) processedAnnots.get(prevAnnot.getId()); 600 if (prevAnnotString == null) 601 try { 602 prevAnnotString = document.getContent().getContent( 603 prevAnnot.getStartNode().getOffset(), 604 prevAnnot.getEndNode().getOffset() 605 ).toString(); 606 } catch (InvalidOffsetException ioe) { 607 return false; 608 }//try 609 610 611 String longName = prevAnnotString; 612 String shortName = annotString; 613 longAnnot = prevAnnot; 614 shortAnnot = newAnnot; 615 616 if (shortName.length()>=longName.length()) { 617 String temp = longName; 618 longName = shortName; 619 shortName = temp; 620 Annotation tempAnn = longAnnot; 621 longAnnot = shortAnnot; 622 shortAnnot = tempAnn; 623 }//if 624 625 tokensLongAnnot = (ArrayList) tokensMap.get(longAnnot.getId()); 626 tokensShortAnnot = (ArrayList) tokensMap.get(shortAnnot.getId()); 627 628 matchedAll = apply_rules_namematch(prevAnnot.getType(), shortName,longName); 629 // if (newAnnot.getType().equals(unknownType)) 630 // Out.prln("Loop: " + shortName + " and " + longName + ": result: " + matchedAll); 631 632 i++; 633 }//while 634 return matchedAll; 635 } 636 637 638 protected boolean matchedAlready(Annotation annot1, Annotation annot2) { 639 //the two annotations are already matched if the matches list of the first 640 //contains the id of the second 641 List matchesList = (List) annot1.getFeatures(). 
642 get(ANNOTATION_COREF_FEATURE_NAME); 643 if ((matchesList == null) || matchesList.isEmpty()) 644 return false; 645 else if (matchesList.contains(annot2.getId())) 646 return true; 647 return false; 648 } 649 650 protected Annotation updateMatches(Annotation newAnnot, String annotString) { 651 Annotation matchedAnnot = null; 652 Integer id; 653 654 //first find a processed annotation with the same string 655 Iterator iter = processedAnnots.keySet().iterator(); 656 while (iter.hasNext()) { 657 id = (Integer) iter.next(); 658 String oldString = (String) processedAnnots.get(id); 659 if (annotString.equals(oldString)) { 660 matchedAnnot = nameAllAnnots.get(id); 661 break; 662 }//if 663 }//while 664 665 if (matchedAnnot == null) return null; 666 //if the two matching annotations are of different type which is not 667 //unknown, do not match them 668 if (! matchedAnnot.getType().equals(newAnnot.getType()) 669 && !newAnnot.getType().equals(unknownType) ) 670 return matchedAnnot; 671 672 List matchesList = (List) matchedAnnot.getFeatures(). 673 get(ANNOTATION_COREF_FEATURE_NAME); 674 if ((matchesList == null) || matchesList.isEmpty()) { 675 //no previous matches, so need to add 676 if (matchesList == null) { 677 matchesList = new ArrayList(); 678 matchedAnnot.getFeatures().put(ANNOTATION_COREF_FEATURE_NAME, 679 matchesList); 680 matchesDocFeature.add(matchesList); 681 }//if 682 matchesList.add(matchedAnnot.getId()); 683 matchesList.add(newAnnot.getId()); 684 } else { 685 //just add the new annotation 686 matchesList.add(newAnnot.getId()); 687 }//if 688 //add the matches list to the new annotation 689 newAnnot.getFeatures().put(ANNOTATION_COREF_FEATURE_NAME, matchesList); 690 return matchedAnnot; 691 } 692 693 protected void updateMatches(Annotation newAnnot, Annotation prevAnnot) { 694 695 List matchesList = (List) prevAnnot.getFeatures(). 
696 get(ANNOTATION_COREF_FEATURE_NAME); 697 if ((matchesList == null) || matchesList.isEmpty()) { 698 //no previous matches, so need to add 699 if (matchesList == null) { 700 matchesList = new ArrayList(); 701 prevAnnot.getFeatures().put(ANNOTATION_COREF_FEATURE_NAME, matchesList); 702 matchesDocFeature.add(matchesList); 703 }//if 704 matchesList.add(prevAnnot.getId()); 705 matchesList.add(newAnnot.getId()); 706 } else { 707 //just add the new annotation 708 matchesList.add(newAnnot.getId()); 709 }//if 710 //add the matches list to the new annotation 711 newAnnot.getFeatures().put(ANNOTATION_COREF_FEATURE_NAME, matchesList); 712 //propagate the gender if two persons are matched 713 if (prevAnnot.getType().equals(personType)) { 714 String prevGender = 715 (String) prevAnnot.getFeatures().get(PERSON_GENDER_FEATURE_NAME); 716 String newGender = 717 (String) newAnnot.getFeatures().get(PERSON_GENDER_FEATURE_NAME); 718 boolean unknownPrevGender = isUnknownGender(prevGender); 719 boolean unknownNewGender = isUnknownGender(newGender); 720 if (unknownPrevGender && !unknownNewGender) 721 prevAnnot.getFeatures().put(PERSON_GENDER_FEATURE_NAME, newGender); 722 else if (unknownNewGender && !unknownPrevGender) 723 newAnnot.getFeatures().put(PERSON_GENDER_FEATURE_NAME, prevGender); 724 }//if 725 } 726 727 728 protected void docCleanup() { 729 Object matchesValue = document.getFeatures().get(DOCUMENT_COREF_FEATURE_NAME); 730 if (matchesValue != null && (matchesValue instanceof Map)) 731 ((Map)matchesValue).remove(nameAllAnnots.getName()); 732 else if (matchesValue != null) { 733 document.getFeatures().put(DOCUMENT_COREF_FEATURE_NAME, new HashMap()); 734 } 735 736 //get all annotations that have a matches feature 737 HashSet fNames = new HashSet(); 738 fNames.add(ANNOTATION_COREF_FEATURE_NAME); 739 AnnotationSet annots = 740 nameAllAnnots.get(null, fNames); 741 742 // Out.prln("Annots to cleanup" + annots); 743 744 if (annots == null || annots.isEmpty()) 745 return; 746 747 
Iterator iter = annots.iterator(); 748 while (iter.hasNext()) { 749 while (iter.hasNext()) 750 ((Annotation) iter.next()).getFeatures(). 751 remove(ANNOTATION_COREF_FEATURE_NAME); 752 } //while 753 }//cleanup 754 755 /** return a person name without title */ 756 protected String containTitle (String annotString, Annotation annot) 757 throws ExecutionException { 758 // get the offsets 759 Long startAnnot = annot.getStartNode().getOffset(); 760 Long endAnnot = annot.getEndNode().getOffset(); 761 762 // determine "Lookup" annotation set 763 queryFM.clear(); 764 queryFM.put("majorType", "title"); 765 AnnotationSet as1 = nameAllAnnots.get(startAnnot,endAnnot); 766 if (as1 == null || as1.isEmpty()) 767 return annotString; 768 AnnotationSet as = 769 as1.get("Lookup", queryFM); 770 if (as !=null && ! as.isEmpty()) { 771 List titles = new ArrayList((Set)as); 772 Collections.sort(titles, new gate.util.OffsetComparator()); 773 774 Iterator iter = titles.iterator(); 775 while (iter.hasNext()) { 776 Annotation titleAnn = (Annotation)(iter.next()); 777 778 //we've not found a title at the start offset, 779 //there's no point in looking further 780 //coz titles come first 781 if (titleAnn.getStartNode().getOffset().compareTo(startAnnot) != 0) 782 return annotString; 783 784 try { 785 // the title from the current annotation 786 String annotTitle = 787 document.getContent().getContent( 788 titleAnn.getStartNode().getOffset(), 789 titleAnn.getEndNode().getOffset() 790 ).toString(); 791 792 // eliminate the title from annotation string and return the result 793 if (annotTitle.length()<annotString.length()) { 794 //remove from the array of tokens, so then we can compare properly 795 //the remaining tokens 796 // Out.prln("Removing title from: " + annot + " with string " + annotString); 797 // Out.prln("Tokens are" + tokensMap.get(annot.getId())); 798 // Out.prln("Title is" + annotTitle); 799 ((ArrayList) tokensMap.get(annot.getId())).remove(0); 800 return annotString.substring( 801 
annotTitle.length()+1,annotString.length()); 802 } 803 } catch (InvalidOffsetException ioe) { 804 throw new ExecutionException 805 ("Invalid offset of the annotation"); 806 }//try 807 }// while 808 }//if 809 return annotString; 810 811 } 812 813 /** return an organization without a designator and starting The*/ 814 protected String stripCDG (String annotString, Annotation annot){ 815 816 ArrayList tokens = (ArrayList) tokensMap.get(annot.getId()); 817 818 //strip starting The first 819 if ( ((String) ((Annotation) tokens.get(0) 820 ).getFeatures().get(TOKEN_STRING_FEATURE_NAME)) 821 .equalsIgnoreCase(THE_VALUE)) 822 tokens.remove(0); 823 824 //no need to check for cdg if there is only 1 token or less 825 if (tokens.size()>1 && cdg.contains(((Annotation) tokens.get(tokens.size()-1) 826 ).getFeatures().get(TOKEN_STRING_FEATURE_NAME)) ) 827 tokens.remove(tokens.size()-1); 828 829 StringBuffer newString = new StringBuffer(50); 830 for (int i = 0; i < tokens.size(); i++){ 831 newString.append((String) ((Annotation) tokens.get(i) 832 ).getFeatures().get(TOKEN_STRING_FEATURE_NAME) ); 833 if (i != tokens.size()-1) 834 newString.append(" "); 835 } 836 // Out.prln("Strip CDG returned: " + newString + "for string " + annotString); 837 838 if (caseSensitive) 839 return newString.toString(); 840 841 return newString.toString().toLowerCase(); 842 } 843 844 /* 845 public void check() throws ExecutionException { 846 if (executionException != null) { 847 ExecutionException e = executionException; 848 executionException = null; 849 throw e; 850 } 851 } // check() 852 */ 853 854 /** if ( == false) then reads the names of files in order 855 * to create the lookup tables 856 */ 857 protected void createLists() throws IOException { 858 InputStream inputStream = Files.getGateResourceAsStream( 859 "creole/namematcher/listsNM.def"); 860 InputStreamReader inputStreamReader = new InputStreamReader ( 861 inputStream); 862 BufferedReader bufferedReader = new BufferedReader(inputStreamReader); 
863 864 String lineRead = null; 865 while ((lineRead = bufferedReader.readLine()) != null){ 866 int index = lineRead.indexOf(":"); 867 if (index != -1){ 868 String nameFile = lineRead.substring(0,index); 869 String nameList = lineRead.substring(index+1,lineRead.length()); 870 createAnnotList(nameFile,nameList); 871 }// if 872 }//while 873 bufferedReader.close(); 874 inputStreamReader.close(); 875 inputStream.close(); 876 }// createLists() 877 878 /** creates the lookup tables */ 879 protected void createAnnotList(String nameFile,String nameList) 880 throws IOException{ 881 InputStream inputStream = Files.getGateResourceAsStream( 882 "creole/namematcher/"+nameFile); 883 InputStreamReader inputStreamReader = new InputStreamReader ( 884 inputStream); 885 BufferedReader bufferedReader = new BufferedReader(inputStreamReader); 886 887 String lineRead = null; 888 while ((lineRead = bufferedReader.readLine()) != null){ 889 if (nameList.compareTo(CDGLISTNAME)==0){ 890 if (caseSensitive) 891 cdg.add(lineRead); 892 else 893 cdg.add(lineRead.toLowerCase()); 894 }// if 895 else { 896 int index = lineRead.indexOf("£"); 897 if (index != -1){ 898 String expr = lineRead.substring(0,index); 899 //if not case-sensitive, we need to downcase all strings 900 if (!caseSensitive) 901 expr = expr.toLowerCase(); 902 String code = lineRead.substring(index+1,lineRead.length()); 903 if (nameList.equals(ALIASLISTNAME)) 904 alias.put(expr, code); 905 else 906 if (nameList.equals(ARTLISTNAME)) 907 def_art.put(expr, code); 908 else 909 if (nameList.equals(PREPLISTNAME)) 910 prepos.put(expr, code); 911 else 912 if (nameList.equals(CONNECTORLISTNAME)) 913 connector.put(expr, code); 914 else 915 if (nameList.equals(SPURLISTNAME)) 916 spur_match.put(expr, code); 917 918 }//if 919 }// else 920 921 }//while 922 }//createAnnotList 923 924 925 /** apply_rules_namematch: apply rules similarly to lasie1.5's namematch */ 926 private boolean apply_rules_namematch(String annotationType, String shortName, 927 
String longName) { 928 // first apply rule for spurius matches i.e. rule0 929 if (matchRule0(longName, shortName)) 930 return false; 931 if ( 932 (// rules for all annotations 933 //no longer use rule1, coz I do the check for same string via the 934 //hash table 935 matchRule2(longName, shortName) 936 || 937 matchRule3(longName, shortName) 938 ) // rules for all annotations 939 || 940 (// rules for organisation annotations 941 ( annotationType.equals(organizationType) 942 //ACE addition 943 || annotationType.equals("Facility")) 944 && 945 ( matchRule4(longName, shortName) 946 || 947 matchRule5(longName, shortName) 948 || 949 matchRule6(longName, shortName) 950 || 951 matchRule7(longName, shortName) 952 || 953 // matchRule8(longName, shortName) 954 // || 955 matchRule9(longName, shortName) 956 || 957 matchRule10(longName, shortName) 958 || 959 matchRule11(longName, shortName) 960 || 961 matchRule12(longName, shortName) 962 || 963 matchRule13(shortName, longName) 964 ) 965 )// rules for organisation annotations 966 || 967 (// rules for person annotations 968 ( annotationType.equals(personType)) 969 && 970 ( matchRule4(longName, shortName) 971 || 972 matchRule5(longName, shortName) 973 || 974 matchRule14(longName, shortName) 975 || //kalina: added this, so it matches names when contain more 976 //than one first and one last name 977 matchRule15(longName, shortName) 978 ) 979 )// rules for person annotations 980 ) //if 981 return true; 982 return false; 983 }//apply_rules 984 985 986 /** set the extLists flag */ 987 public void setExtLists(Boolean newExtLists) { 988 extLists = newExtLists.booleanValue(); 989 }//setextLists 990 991 /** set the caseSensitive flag */ 992 public void setCaseSensitive(Boolean newCase) { 993 caseSensitive = newCase.booleanValue(); 994 }//setextLists 995 996 /** set the annotation set name*/ 997 public void setAnnotationSetName(String newAnnotationSetName) { 998 annotationSetName = newAnnotationSetName; 999 }//setAnnotationSetName 1000 1001 
/** set the types of the annotations*/ 1002 public void setAnnotationTypes(List newType) { 1003 annotationTypes = newType; 1004 }//setAnnotationTypes 1005 1006 /** set whether to process the Unknown annotations*/ 1007 public void setProcessUnknown(Boolean processOrNot) { 1008 this.matchingUnknowns = processOrNot.booleanValue(); 1009 }//setAnnotationTypes 1010 1011 public void setOrganizationType(String newOrganizationType) { 1012 organizationType = newOrganizationType; 1013 }//setOrganizationType 1014 1015 public void setPersonType(String newPersonType) { 1016 personType = newPersonType; 1017 }//setPersonType 1018 1019 /**get the name of the annotation set*/ 1020 public String getAnnotationSetName() { 1021 return annotationSetName; 1022 }//getAnnotationSetName 1023 1024 /** get the types of the annotation*/ 1025 public List getAnnotationTypes() { 1026 return annotationTypes; 1027 }//getAnnotationTypes 1028 1029 public String getOrganizationType() { 1030 return organizationType; 1031 } 1032 1033 public String getPersonType() { 1034 return personType; 1035 } 1036 1037 public Boolean getExtLists() { 1038 return new Boolean(extLists); 1039 } 1040 1041 /** Are we running in a case-sensitive mode?*/ 1042 public Boolean getCaseSensitive() { 1043 return new Boolean(caseSensitive); 1044 } 1045 1046 /** Return whether or not we're processing the Unknown annots*/ 1047 public Boolean getProcessUnknown() { 1048 return new Boolean(matchingUnknowns); 1049 } 1050 1051/* 1052 public List getMatchesDocument() { 1053 return matchesDocument; 1054 } 1055*/ 1056 1057 protected boolean isUnknownGender(String gender) { 1058 if (gender == null) 1059 return true; 1060 if (gender.equalsIgnoreCase("male") || gender.equalsIgnoreCase("female")) 1061 return false; 1062 return true; 1063 1064 } //isUnknownGender 1065 1066 /** RULE #0: If the two names are listed in table of 1067 * spurius matches then they do NOT match 1068 * Condition(s): - 1069 * Applied to: all name annotations 1070 */ 1071 
public boolean matchRule0(String s1, 1072 String s2) { 1073 if (spur_match.containsKey(s1) 1074 && spur_match.containsKey(s2) ) 1075 return 1076 spur_match.get(s1).toString().equals(spur_match.get(s2).toString()); 1077 1078 return false; 1079 }//matchRule0 1080 1081 /** RULE #1: If the two names are identical then they are the same 1082 * no longer used, because I do the check for same string via the 1083 * hash table of previous annotations 1084 * Condition(s): depend on case 1085 * Applied to: all name annotations 1086 */ 1087 public boolean matchRule1(String s1, 1088 String s2, 1089 boolean matchCase) { 1090// Out.prln("Rule1: Matching " + s1 + "and " + s2); 1091 1092 boolean matched = false; 1093 if (!matchCase) 1094 matched = s1.equalsIgnoreCase(s2); 1095 else matched = s1.equals(s2) ; 1096//kalina: do not remove, nice for debug 1097// if (matched && (s2.startsWith("Kenneth") || s1.startsWith("Kenneth"))) 1098// Out.prln("Rule1: Matched " + s1 + "and " + s2); 1099 return matched; 1100 }//matchRule1 1101 1102 1103 /** 1104 * RULE #2: if the two names are listed as equivalent in the 1105 * lookup table (alias) then they match 1106 * Condition(s): - 1107 * Applied to: all name annotations 1108 */ 1109 public boolean matchRule2(String s1, 1110 String s2) { 1111 1112 if (alias.containsKey(s1) && alias.containsKey(s2)) 1113 return (alias.get(s1).toString().equals(alias.get(s2).toString())); 1114 1115 return false; 1116 }//matchRule2 1117 1118 /** 1119 * RULE #3: adding a possessive at the end 1120 * of one name causes a match 1121 * e.g. 
"Standard and Poor" == "Standard and Poor's" 1122 * and also "Standard and Poor" == "Standard's" 1123 * Condition(s): case-insensitive match 1124 * Applied to: all name annotations 1125 */ 1126 public boolean matchRule3(String s1, //long string 1127 String s2) { //short string 1128 1129 if (s2.endsWith("'s") || s2.endsWith("'") 1130 ||(s1.endsWith("'s")|| s1.endsWith("'"))) { 1131 1132 1133 String s2_poss = null; 1134 1135 if (!s2.endsWith("'s")) s2_poss = s2.concat("'s"); 1136 else s2_poss = s2.concat("'"); 1137 1138 if (s2_poss != null && matchRule1(s1, s2_poss,caseSensitive)) return true; 1139 1140 // now check the second case i.e. "Standard and Poor" == "Standard's" 1141 String token = (String) 1142 ((Annotation) tokensLongAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME); 1143 1144 if (!token.endsWith("'s")) s2_poss = token.concat("'s"); 1145 else s2_poss = token.concat("'"); 1146 1147 if (s2_poss != null && matchRule1(s2_poss,s2,caseSensitive)) return true; 1148 1149 } // if (s2.endsWith("'s") 1150 return false; 1151 }//matchRule3 1152 1153 /** 1154 * RULE #4: Do all tokens other than the punctuation marks 1155 * , and . match? 1156 * e.g. "Smith, Jones" == "Smith Jones" 1157 * Condition(s): case-insensitive match 1158 * Applied to: organisation and person annotations 1159 */ 1160 public boolean matchRule4(String s1, 1161 String s2) { 1162 1163 boolean allTokensMatch = true; 1164 1165 Iterator tokensLongAnnotIter = tokensLongAnnot.iterator(); 1166 Iterator tokensShortAnnotIter = tokensShortAnnot.iterator(); 1167 while (tokensLongAnnotIter.hasNext() && tokensShortAnnotIter.hasNext()) { 1168 Annotation token = (Annotation) tokensLongAnnotIter.next(); 1169 if (((String)token.getFeatures().get(TOKEN_KIND_FEATURE_NAME)).equals(PUNCTUATION_VALUE)) 1170 continue; 1171// Out.prln("Matching" + tokensLongAnnot + " with " + tokensShortAnnot); 1172 if (! 
token.getFeatures().get(TOKEN_STRING_FEATURE_NAME).equals( 1173 ((Annotation) tokensShortAnnotIter.next()).getFeatures().get(TOKEN_STRING_FEATURE_NAME))) { 1174 allTokensMatch = false; 1175 break; 1176 } // if (!tokensLongAnnot.nextToken() 1177 } // while 1178// if (allTokensMatch) 1179// Out.prln("rule4 fired. result is: " + allTokensMatch); 1180 return allTokensMatch; 1181 }//matchRule4 1182 1183 /** 1184 * RULE #5: if the 1st token of one name 1185 * matches the second name 1186 * e.g. "Pepsi Cola" == "Pepsi" 1187 * Condition(s): case-insensitive match 1188 * Applied to: all name annotations 1189 */ 1190 public boolean matchRule5(String s1, 1191 String s2) { 1192 1193 //do not match numbers by this rule 1194 if (tokensLongAnnot.size()> 1 && 1195 ((Annotation) tokensLongAnnot.get(0)).getFeatures().get("kind").equals("number")) 1196 return false; 1197 1198// if (s1.startsWith("Patrick") || s2.startsWith("Patrick")) { 1199// Out.prln("Rule 5: " + s1 + "and " + s2); 1200// } 1201 1202 //require that when matching person names, the shorter one to be of length 1 1203 //for the rule to apply. In other words, avoid matching Peter Smith and 1204 //Peter Kline, because they share a Peter token. 1205 if ( (shortAnnot.getType().equals(personType) 1206 || longAnnot.getType().equals(personType) 1207 ) 1208 && 1209 tokensShortAnnot.size()>1 1210 ) 1211 return false; 1212 1213 if (tokensLongAnnot.size()<=1) 1214 return false; 1215 boolean result = matchRule1((String) 1216 ((Annotation) tokensLongAnnot.get(0) 1217 ).getFeatures().get(TOKEN_STRING_FEATURE_NAME), 1218 s2, 1219 caseSensitive); 1220 1221// if (s1.startsWith("Patrick") || s2.startsWith("Patrick")) 1222// Out.prln("rule 5 result: " + result); 1223 return result; 1224 1225 }//matchRule5 1226 1227 /** 1228 * RULE #6: if one name is the acronym of the other 1229 * e.g. 
"Imperial Chemical Industries" == "ICI" 1230 * Applied to: organisation annotations only 1231 */ 1232 public boolean matchRule6(String s1, 1233 String s2) { 1234 1235 int i = 0; 1236 1237 //check and if the shorted string has a space in it, then it's not 1238 //an acronym 1239 if (s2.indexOf(" ") > 0) 1240 return false; 1241 1242 //Out.prln("Acronym: Matching " + s1 + "and " + s2); 1243 StringBuffer acronym_s1 = new StringBuffer(""); 1244 StringBuffer acronymDot_s1 = new StringBuffer(""); 1245 1246 for ( ;i < tokensLongAnnot.size(); i++ ) { 1247 String toAppend = ( (String) ((Annotation) tokensLongAnnot.get(i) 1248 ).getFeatures().get(TOKEN_STRING_FEATURE_NAME)).substring(0,1); 1249 acronym_s1.append(toAppend); 1250 acronymDot_s1.append(toAppend); 1251 acronymDot_s1.append("."); 1252 } 1253 1254 //Out.prln("Acronym dot: To Match " + acronymDot_s1 + "and " + s2); 1255 //Out.prln("Result: " + matchRule1(acronymDot_s1.toString(),s2,caseSensitive)); 1256 1257 if (matchRule1(acronym_s1.toString(),s2,caseSensitive) || 1258 matchRule1(acronymDot_s1.toString(),s2,caseSensitive) ) 1259 return true; 1260 1261 return false; 1262 }//matchRule6 1263 1264 /** 1265 * RULE #7: if one of the tokens in one of the 1266 * names is in the list of separators eg. "&" 1267 * then check if the token before the separator 1268 * matches the other name 1269 * e.g. "R.H. Macy & Co." 
== "Macy" 1270 * Condition(s): case-sensitive match 1271 * Applied to: organisation annotations only 1272 */ 1273 public boolean matchRule7(String s1, 1274 String s2) { 1275 1276 //don't try it unless the second string is just one token 1277 if (tokensShortAnnot.size() != 1) 1278 return false; 1279 1280 String previous_token = null; 1281 1282 for (int i = 0; i < tokensLongAnnot.size(); i++ ) { 1283 if (connector.containsKey( ((Annotation) tokensLongAnnot.get(i) 1284 ).getFeatures().get(TOKEN_STRING_FEATURE_NAME) )) { 1285 previous_token = (String) ((Annotation) tokensLongAnnot.get(i-1) 1286 ).getFeatures().get(TOKEN_STRING_FEATURE_NAME); 1287 1288 break; 1289 } 1290 } 1291 1292 //now match previous_token with other name 1293 if (previous_token != null) { 1294// if (s1.equalsIgnoreCase("chin") || s2.equalsIgnoreCase("chin")) 1295// Out.prln("Rule7"); 1296 return matchRule1(previous_token,s2,caseSensitive); 1297 1298 } 1299 return false; 1300 }//matchRule7 1301 1302 /** 1303 * This rule is now obsolete, as The and the trailing CDG 1304 * are stripped before matching. 1305 * DO NOT CALL!!! 1306 * 1307 * RULE #8: if the names match, ignoring The and 1308 * and trailing company designator (which have already been stripped) 1309 * e.g. "The Magic Tricks Co." 
== "Magic Tricks" 1310 * Condition(s): case-sensitive match 1311 * Applied to: organisation annotations only 1312 */ 1313 public boolean matchRule8(String s1, 1314 String s2) { 1315 Out.prln("OrthoMatcher warning: This rule has been discontinued!"); 1316/* 1317 if (s1.startsWith("The ")) s1 = s1.substring(4); 1318 if (s2.startsWith("The ")) s2 = s2.substring(4); 1319 1320 // check that cdg is not empty 1321 if (!cdg.isEmpty()) { 1322 String stringToTokenize1 = s1; 1323 StringTokenizer tokensLongAnnot = new StringTokenizer(stringToTokenize1," "); 1324 1325 String stringToTokenize2 = s2; 1326 StringTokenizer tokensShortAnnot = new StringTokenizer(stringToTokenize2," "); 1327 String token = null; 1328 String cdg1 = null; 1329 String cdg2 = null; 1330 1331 s1 = ""; 1332 s2 = ""; 1333 1334 //check last token of s1 1335 while (tokensLongAnnot.hasMoreTokens()) { 1336 token = tokensLongAnnot.nextToken(); 1337 if (!tokensLongAnnot.hasMoreTokens() 1338 && cdg.contains(token)) cdg1=token; 1339 else s1 = s1+token; 1340 } 1341 1342 // do the same for s2 1343 while (tokensShortAnnot.hasMoreTokens()) { 1344 token = tokensShortAnnot.nextToken(); 1345 if (!tokensShortAnnot.hasMoreTokens() 1346 && cdg.contains(token)) cdg2=token; 1347 else s2 = s2+token; 1348 } 1349 1350 // if the company designators are different 1351 // then they are NOT the same organisations 1352 if ((cdg1!=null && cdg2!=null) 1353 && !cdg1.equalsIgnoreCase(cdg2)) return false; 1354 } 1355 if (!s1.equals("") && !s2.equals("")) return matchRule1(s1,s2,caseSensitive); 1356*/ 1357 return false; 1358 1359 }//matchRule8 1360 1361 /** 1362 * RULE #9: does one of the names match the token 1363 * just before a trailing company designator 1364 * in the other name? 1365 * The company designator has already been chopped off, 1366 * so the token before it, is in fact the last token 1367 * e.g. "R.H. Macy Co." 
== "Macy" 1368 * Applied to: organisation annotations only 1369 */ 1370 public boolean matchRule9(String s1, 1371 String s2) { 1372 1373// if (s1.equalsIgnoreCase("news") || s2.equalsIgnoreCase("news")) 1374// Out.prln("Rule 9 " + s1 + " and " + s2); 1375 String s1_short = (String) 1376 ((Annotation) tokensLongAnnot.get( 1377 tokensLongAnnot.size()-1)).getFeatures().get(TOKEN_STRING_FEATURE_NAME); 1378// Out.prln("Converted to " + s1_short); 1379 if (tokensLongAnnot.size()>1) { 1380 boolean matched = matchRule1(s1_short, s2, caseSensitive); 1381 //we need to make sure all names match, instead of assuming transitivity, 1382 //to avoid matching BBC News with News then News with ITV News, which 1383 //by transitivity leads to BBC News matching ITV News which is not what 1384 //we want 1385 if (matched) 1386 allMatchingNeeded = true; 1387 return matched; 1388 } //if 1389 1390 return false; 1391 }//matchRule9 1392 1393 /** 1394 * RULE #10: is one name the reverse of the other 1395 * reversing around prepositions only? 1396 * e.g. "Department of Defence" == "Defence Department" 1397 * Condition(s): case-sensitive match 1398 * Applied to: organisation annotations only 1399 */ 1400 public boolean matchRule10(String s1, 1401 String s2) { 1402 1403 String token = null; 1404 String previous_token = null; 1405 String next_token = null; 1406 boolean invoke_rule=false; 1407 1408 if (tokensLongAnnot.size() >= 3 1409 && tokensShortAnnot.size() >= 2) { 1410 1411 // first get the tokens before and after the preposition 1412 int i = 0; 1413 for (; i< tokensLongAnnot.size(); i++) { 1414 token = (String) 1415 ((Annotation) tokensLongAnnot.get(i)).getFeatures().get(TOKEN_STRING_FEATURE_NAME); 1416 if (prepos.containsKey(token)) { 1417 invoke_rule=true; 1418 break; 1419 }//if 1420 previous_token = token; 1421 }//while 1422 1423 if (! 
invoke_rule) 1424 return false; 1425 1426 if (i < tokensLongAnnot.size() 1427 && previous_token != null) 1428 next_token= (String) 1429 ((Annotation) tokensLongAnnot.get(i++)).getFeatures().get(TOKEN_STRING_FEATURE_NAME); 1430 else return false; 1431 1432 String s21 = (String) 1433 ((Annotation) tokensShortAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME); 1434 String s22 = (String) 1435 ((Annotation) tokensShortAnnot.get(1)).getFeatures().get(TOKEN_STRING_FEATURE_NAME); 1436 // then compare (in reverse) with the first two tokens of s2 1437 if (matchRule1(next_token,(String) s21,caseSensitive) 1438 && matchRule1(previous_token, s22,caseSensitive)) 1439 return true ; 1440 }//if (tokensLongAnnot.countTokens() >= 3 1441 return false; 1442 }//matchRule10 1443 1444 /** 1445 * RULE #11: does one name consist of contractions 1446 * of the first two tokens of the other name? 1447 * e.g. "Communications Satellite" == "ComSat" 1448 * and "Pan American" == "Pan Am" 1449 * Condition(s): case-sensitive match 1450 * Applied to: organisation annotations only 1451 */ 1452 public boolean matchRule11(String s1, 1453 String s2) { 1454 1455 1456 // first do the easy case e.g. "Pan American" == "Pan Am" 1457 1458 String token11 = null; 1459 String token12 = null; 1460 String token21 = null; 1461 String token22 = null; 1462 1463 if (tokensLongAnnot.size() < 2) 1464 return false; 1465 1466 // 1st get the first two tokens of s1 1467 token11 = (String) 1468 ((Annotation) tokensLongAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME); 1469 token12 = (String) 1470 ((Annotation) tokensLongAnnot.get(1)).getFeatures().get(TOKEN_STRING_FEATURE_NAME); 1471 1472 // now check for the first case i.e. 
"Pan American" == "Pan Am" 1473 if (tokensShortAnnot.size() == 2) { 1474 1475 token21 = (String) 1476 ((Annotation) tokensShortAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME); 1477 token22 = (String) 1478 ((Annotation) tokensShortAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME); 1479 1480 if (token11.startsWith(token21) 1481 && token12.startsWith(token22)) 1482 return true; 1483 1484 } // if (tokensShortAnnot.countTokens() == 2) 1485 1486 // now the second case e.g. "Communications Satellite" == "ComSat" 1487 else if (tokensShortAnnot.size()==1 && s2.length()>=3) { 1488 1489 // split the token into possible contractions 1490 // ignore case for matching 1491 for (int i=2;i<s2.length();i++) { 1492 token21=s2.substring(0,i+1); 1493 token22=s2.substring(i+1); 1494 1495 if (token11.startsWith(token21) 1496 && token12.startsWith(token22)) 1497 return true; 1498 }// for 1499 } // else if 1500 1501 return false; 1502 }//matchRule11 1503 1504 /** 1505 * RULE #12: do the first and last tokens of one name 1506 * match the first and last tokens of the other? 1507 * Condition(s): case-sensitive match 1508 * Applied to: organisation annotations only 1509 */ 1510 public boolean matchRule12(String s1, 1511 String s2) { 1512 1513 // first do the easy case e.g. 
"Pan American" == "Pan Am" 1514 1515 if (tokensLongAnnot.size()>1 && tokensShortAnnot.size()>1) { 1516// Out.prln("Rule 12"); 1517 1518 // get first and last tokens of s1 & s2 1519 String s1_first = (String) 1520 ((Annotation) tokensLongAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME); 1521 String s2_first = (String) 1522 ((Annotation) tokensShortAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME); 1523 1524 if (!matchRule1(s1_first,s2_first,caseSensitive)) 1525 return false; 1526 1527 String s1_last = (String) 1528 ((Annotation) tokensLongAnnot.get(tokensLongAnnot.size()-1)).getFeatures().get(TOKEN_STRING_FEATURE_NAME); 1529 String s2_last = (String) 1530 ((Annotation) tokensShortAnnot.get(tokensShortAnnot.size()-1)).getFeatures().get(TOKEN_STRING_FEATURE_NAME); 1531 1532 return matchRule1(s1_last,s2_last,caseSensitive); 1533 } // if (tokensLongAnnot.countTokens()>1 1534 return false; 1535 }//matchRule12 1536 1537 /** 1538 * RULE #13: do multi-word names match except for 1539 * one token e.g. 
1540 * "Second Force Recon Company" == "Force Recon Company" 1541 * Note that this rule has NOT been used in LaSIE's 1.5 1542 * namematcher 1543 * Restrictions: - remove cdg first 1544 * - shortest name should be 2 words or more 1545 * - if N is the number of tokens of the longest 1546 * name, then N-1 tokens should be matched 1547 * Condition(s): case-sensitive match 1548 * Applied to: organisation or person annotations only 1549 */ 1550 public boolean matchRule13(String s1, 1551 String s2) { 1552 1553 1554 String token1 = null; 1555 String token2 = null; 1556 1557 int matched_tokens = 0, mismatches = 0;; 1558 1559 // if names < 2 words then rule is invalid 1560 if (tokensLongAnnot.size() < 3 || tokensShortAnnot.size() < 2) return false; 1561 1562// if (s1.equalsIgnoreCase("chin") || s2.equalsIgnoreCase("chin")) { 1563// Out.prln("Rule 13: Matching tokens" + tokensLongAnnot); 1564// Out.prln("with tokens " + tokensShortAnnot); 1565// } 1566 1567 // now do the matching 1568 for (int i=0,j= 0; i < tokensShortAnnot.size() && mismatches < 2; i++) { 1569 1570// Out.prln("i = " + i); 1571// Out.prln("j = " + j); 1572 if ( ((Annotation) tokensLongAnnot.get(j)).getFeatures().get(TOKEN_STRING_FEATURE_NAME).equals( 1573 ((Annotation) tokensShortAnnot.get(i)).getFeatures().get(TOKEN_STRING_FEATURE_NAME)) ) { 1574 matched_tokens++; 1575 j++; 1576 } else 1577 mismatches++; 1578 } // for 1579 1580 if (matched_tokens >= tokensLongAnnot.size()-1) 1581 return true; 1582 1583 return false; 1584 }//matchRule13 1585 1586 /** 1587 * RULE #14: if the last token of one name 1588 * matches the second name 1589 * e.g. 
"Hamish Cunningham" == "Cunningham" 1590 * Condition(s): case-insensitive match 1591 * Applied to: all person annotations 1592 */ 1593 public boolean matchRule14(String s1, 1594 String s2) { 1595 1596// if (s1.equalsIgnoreCase("chin") || s2.equalsIgnoreCase("chin")) 1597// Out.prln("Rule 14 " + s1 + " and " + s2); 1598 String s1_short = (String) 1599 ((Annotation) tokensLongAnnot.get( 1600 tokensLongAnnot.size()-1)).getFeatures().get(TOKEN_STRING_FEATURE_NAME); 1601// Out.prln("Converted to " + s1_short); 1602 if (tokensLongAnnot.size()>1) 1603 return matchRule1(s1_short, 1604 s2, 1605 caseSensitive); 1606 1607 return false; 1608 1609 }//matchRule14 1610 1611 /** 1612 * RULE #15: does one token from a Person name appear as the other token 1613 * Note that this rule has NOT been used in LaSIE's 1.5 1614 * namematcher; added for ACE by Di's request 1615 */ 1616 public boolean matchRule15(String s1, 1617 String s2) { 1618 1619 int matched_tokens = 0; 1620 1621 // if names < 2 words then rule is invalid 1622 1623// if (s1.equalsIgnoreCase("chin") || s2.equalsIgnoreCase("chin")) { 1624// Out.prln("Rule 15:" ); 1625// Out.prln("with tokens " + tokensShortAnnot); 1626// } 1627 1628 // now do the matching 1629 Annotation token1, token2; 1630 for (int i=0; i < tokensShortAnnot.size() && matched_tokens == 0; i++) { 1631 token1 = (Annotation) tokensShortAnnot.get(i); 1632 //first check if not punctuation, because we need to skip it 1633 if (token1.getFeatures().get(TOKEN_KIND_FEATURE_NAME).equals(PUNCTUATION_VALUE)) 1634 continue; 1635 1636 for (int j=0; j<tokensLongAnnot.size() && matched_tokens ==0; j++) { 1637// Out.prln("i = " + i); 1638 token2 = (Annotation) tokensLongAnnot.get(j); 1639 if (token2.getFeatures().get(TOKEN_KIND_FEATURE_NAME).equals(PUNCTUATION_VALUE)) 1640 continue; 1641 if ( token1.getFeatures().get(TOKEN_STRING_FEATURE_NAME).equals( 1642 token2.getFeatures().get(TOKEN_STRING_FEATURE_NAME)) ) 1643 matched_tokens++; 1644 }//for 1645 } // for 1646 1647 //19 
February 2002: kalina 1648 //was originally > 0 (i.e., any match is good) 1649 //ensure that we've matched all the tokens in the short annotation 1650 //the reason for that is, because otherwise we match 1651 //Patrick Viera and Patrick Somebody - not good! 1652 if (matched_tokens == tokensShortAnnot.size()) 1653 return true; 1654 1655 return false; 1656 }//matchRule15 1657 1658 /** Tables for namematch info 1659 * (used by the namematch rules) 1660 */ 1661 private void buildTables(AnnotationSet nameAllAnnots) { 1662 1663 //reset the tables first 1664 cdg.clear(); 1665 1666 if (! extLists) { 1667 // i.e. get cdg from Lookup annotations 1668 // get all Lookup annotations 1669 tempMap.clear(); 1670 tempMap.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME, "cdg"); 1671 //now get all lookup annotations which are cdg 1672 AnnotationSet nameAnnots = 1673 nameAllAnnots.get(LOOKUP_ANNOTATION_TYPE, tempMap); 1674 1675 if ((nameAnnots ==null) || nameAnnots.isEmpty()) 1676 return; 1677 1678 Iterator iter = nameAnnots.iterator(); 1679 while (iter.hasNext()) { 1680 Annotation annot = (Annotation)iter.next(); 1681 // get the actual string 1682 Long offsetStartAnnot = annot.getStartNode().getOffset(); 1683 Long offsetEndAnnot = annot.getEndNode().getOffset(); 1684 try { 1685 gate.Document doc = nameAllAnnots.getDocument(); 1686 String annotString = 1687 doc.getContent().getContent( 1688 offsetStartAnnot,offsetEndAnnot 1689 ).toString(); 1690 cdg.add(annotString); 1691 } catch (InvalidOffsetException ioe) { 1692 ioe.printStackTrace(Err.getPrintWriter()); 1693 } 1694 }// while 1695 }//if 1696 }//buildTables 1697 1698 /** substitute all multiple spaces, tabes and newlines 1699 * with a single space 1700 */ 1701 public String regularExpressions ( String text, String replacement, 1702 String regEx) { 1703 String result = text; 1704 try { 1705 RE re = new RE(regEx); 1706 result = re.substituteAll( text,replacement); 1707 } catch (REException ree) {ree.printStackTrace();} 1708 return result; 1709 
}//regularExpressions 1710 1711 1712 private static class Class1 { 1713 } 1714} // public class OrthoMatcher 1715 1716
|
OrthoMatcher |
|