|
PronominalCoref |
|
1 /* 2 * PronominalCoref.java 3 * 4 * Copyright (c) 1998-2001, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Marin Dimitrov, 30/Dec/2001 12 * 13 * $Id: PronominalCoref.java,v 1.25 2002/04/10 11:07:59 marin Exp $ 14 */ 15 16 package gate.creole.coref; 17 18 import java.util.*; 19 import java.net.*; 20 21 import junit.framework.*; 22 23 import gate.*; 24 import gate.creole.*; 25 import gate.util.*; 26 import gate.annotation.*; 27 28 public class PronominalCoref extends AbstractLanguageAnalyser 29 implements ProcessingResource, ANNIEConstants{ 30 31 public static final String COREF_DOCUMENT_PARAMETER_NAME = "document"; 32 33 public static final String COREF_ANN_SET_PARAMETER_NAME = "annotationSetName"; 34 35 /** --- */ 36 private static final boolean DEBUG = false; 37 38 //JAPE grammars 39 private static final String QT_GRAMMAR_URL = "gate://gate/creole/coref/quoted_text.jape"; 40 private static final String PLEON_GRAMMAR_URL = "gate://gate/creole/coref/pleonasm.jape"; 41 42 //annotation types 43 private static final String QUOTED_TEXT_TYPE = "Quoted Text"; 44 private static final String PLEONASTIC_TYPE = "PleonasticIt"; 45 46 //annotation features 47 private static final String PRP_CATEGORY = "PRP"; 48 private static final String PRP$_CATEGORY = "PRP$"; 49 50 //scope 51 private static final int SENTENCES_IN_SCOPE = 3; 52 /** --- */ 53 private static AnnotationOffsetComparator ANNOTATION_OFFSET_COMPARATOR; 54 /** --- */ 55 private String annotationSetName; 56 /** --- */ 57 private Transducer qtTransducer; 58 /** --- */ 59 private Transducer pleonTransducer; 60 /** --- */ 61 private AnnotationSet defaultAnnotations; 62 /** --- */ 63 private Sentence[] textSentences; 64 /** --- */ 65 private Quote[] 
quotedText; 66 /** --- */ 67 private Annotation[] pleonasticIt; 68 /** --- */ 69 private HashMap personGender; 70 /** --- */ 71 private HashMap anaphor2antecedent; 72 /** --- */ 73 private static final FeatureMap PRP_RESTRICTION; 74 75 /** --- */ 76 static { 77 ANNOTATION_OFFSET_COMPARATOR = new AnnotationOffsetComparator(); 78 PRP_RESTRICTION = new SimpleFeatureMapImpl(); 79 PRP_RESTRICTION.put(TOKEN_CATEGORY_FEATURE_NAME,PRP_CATEGORY); 80 } 81 82 /** --- */ 83 public PronominalCoref() { 84 85 this.personGender = new HashMap(); 86 this.anaphor2antecedent = new HashMap(); 87 this.qtTransducer = new gate.creole.Transducer(); 88 this.pleonTransducer = new gate.creole.Transducer(); 89 } 90 91 /** Initialise this resource, and return it. */ 92 public Resource init() throws ResourceInstantiationException { 93 94 //0. preconditions 95 Assert.assertNotNull(this.qtTransducer); 96 97 //1. initialise quoted text transducer 98 URL qtGrammarURL = null; 99 try { 100 qtGrammarURL = new URL(QT_GRAMMAR_URL); 101 } 102 catch(MalformedURLException mue) { 103 throw new ResourceInstantiationException(mue); 104 } 105 this.qtTransducer.setGrammarURL(qtGrammarURL); 106 this.qtTransducer.setEncoding("UTF-8"); 107 this.qtTransducer.init(); 108 109 //2. initialise pleonastic transducer 110 URL pleonGrammarURL = null; 111 try { 112 pleonGrammarURL = new URL(PLEON_GRAMMAR_URL); 113 } 114 catch(MalformedURLException mue) { 115 throw new ResourceInstantiationException(mue); 116 } 117 this.pleonTransducer.setGrammarURL(pleonGrammarURL); 118 this.pleonTransducer.setEncoding("UTF-8"); 119 this.pleonTransducer.init(); 120 121 122 //3. delegate 123 return super.init(); 124 } // init() 125 126 /** 127 * Reinitialises the processing resource. After calling this method the 128 * resource should be in the state it is after calling init. 129 * If the resource depends on external resources (such as rules files) then 130 * the resource will re-read those resources. 
If the data used to create 131 * the resource has changed since the resource has been created then the 132 * resource will change too after calling reInit(). 133 */ 134 public void reInit() throws ResourceInstantiationException { 135 136 if (null != this.qtTransducer) { 137 this.qtTransducer.reInit(); 138 } 139 140 if (null != this.pleonTransducer) { 141 this.pleonTransducer.reInit(); 142 } 143 144 init(); 145 } // reInit() 146 147 148 /** Set the document to run on. */ 149 public void setDocument(Document newDocument) { 150 151 //0. precondition 152 // Assert.assertNotNull(newDocument); 153 154 //1. set doc for aggregated components 155 this.qtTransducer.setDocument(newDocument); 156 this.pleonTransducer.setDocument(newDocument); 157 158 //3. delegate 159 super.setDocument(newDocument); 160 } 161 162 /** --- */ 163 public void setAnnotationSetName(String annotationSetName) { 164 this.annotationSetName = annotationSetName; 165 } 166 167 168 /** --- */ 169 public String getAnnotationSetName() { 170 return annotationSetName; 171 } 172 173 /** 174 * This method runs the coreferencer. It assumes that all the needed parameters 175 * are set. If they are not, an exception will be fired. 176 */ 177 public void execute() throws ExecutionException{ 178 179 //0. preconditions 180 if(null == this.document) { 181 throw new ExecutionException("[coreference] Document is not set!"); 182 } 183 184 //1. preprocess 185 preprocess(); 186 /* 187 //2. remove corefs from previous run 188 String annSetName = this.annotationSetName == null ? 
"COREF" 189 : this.annotationSetName; 190 191 AnnotationSet corefSet = this.document.getAnnotations(annSetName); 192 if (false == corefSet.isEmpty()) { 193 corefSet.clear(); 194 } 195 */ 196 //3.get personal pronouns 197 FeatureMap constraintPRP = new SimpleFeatureMapImpl(); 198 constraintPRP.put(TOKEN_CATEGORY_FEATURE_NAME,PRP_CATEGORY); 199 AnnotationSet personalPronouns = this.defaultAnnotations.get(TOKEN_ANNOTATION_TYPE,constraintPRP); 200 201 //4.get possesive pronouns 202 FeatureMap constraintPRP$ = new SimpleFeatureMapImpl(); 203 constraintPRP$.put(TOKEN_CATEGORY_FEATURE_NAME,PRP$_CATEGORY); 204 AnnotationSet possesivePronouns = this.defaultAnnotations.get(TOKEN_ANNOTATION_TYPE,constraintPRP$); 205 206 //5.combine them 207 AnnotationSet pronouns = personalPronouns; 208 if (null == personalPronouns) { 209 pronouns = possesivePronouns; 210 } 211 else if (null != possesivePronouns) { 212 pronouns.addAll(possesivePronouns); 213 } 214 215 //6.do we have pronouns at all? 216 if (null == pronouns) { 217 //do nothing 218 return; 219 } 220 221 //7.sort them according to offset 222 Object[] arrPronouns = pronouns.toArray(); 223 java.util.Arrays.sort(arrPronouns,ANNOTATION_OFFSET_COMPARATOR); 224 225 //8.cleanup - ease the GC 226 pronouns = personalPronouns = possesivePronouns = null; 227 228 int prnSentIndex = 0; 229 230 231 //10. process all pronouns 232 for (int i=0; i< arrPronouns.length; i++) { 233 Annotation currPronoun = (Annotation)arrPronouns[i]; 234 while (this.textSentences[prnSentIndex].getEndOffset().longValue() < 235 currPronoun.getEndNode().getOffset().longValue()) { 236 prnSentIndex++; 237 } 238 239 Sentence currSentence = this.textSentences[prnSentIndex]; 240 Assert.assertTrue(currSentence.getStartOffset().longValue() <= currPronoun.getStartNode().getOffset().longValue()); 241 Assert.assertTrue(currSentence.getEndOffset().longValue() >= currPronoun.getEndNode().getOffset().longValue()); 242 243 //11. 
find antecedent (if any) for pronoun 244 Annotation antc = findAntecedent(currPronoun,prnSentIndex); 245 246 //12. add to the ana2ant hashtable 247 this.anaphor2antecedent.put(currPronoun,antc); 248 } 249 250 //done 251 } 252 253 254 /** --- */ 255 public HashMap getResolvedAnaphora() { 256 return this.anaphor2antecedent; 257 } 258 259 /** --- */ 260 private Annotation findAntecedent(Annotation currPronoun,int prnSentIndex) { 261 262 //0. preconditions 263 Assert.assertNotNull(currPronoun); 264 Assert.assertTrue(prnSentIndex >= 0); 265 Assert.assertTrue(currPronoun.getType().equals(TOKEN_ANNOTATION_TYPE)); 266 Assert.assertTrue(currPronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) || 267 currPronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY)); 268 269 //1. 270 String strPronoun = (String)currPronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME); 271 272 Assert.assertNotNull(strPronoun); 273 274 //2. delegate processing to the appropriate methods 275 if (strPronoun.equalsIgnoreCase("HE") || 276 strPronoun.equalsIgnoreCase("HIM") || 277 strPronoun.equalsIgnoreCase("HIS") || 278 strPronoun.equalsIgnoreCase("HIMSELF")) { 279 return _resolve$HE$HIM$HIS$HIMSELF$(currPronoun,prnSentIndex); 280 } 281 else if (strPronoun.equalsIgnoreCase("SHE") || 282 strPronoun.equalsIgnoreCase("HER")) { 283 return _resolve$SHE$HER$(currPronoun,prnSentIndex); 284 } 285 else if (strPronoun.equalsIgnoreCase("IT") || 286 strPronoun.equalsIgnoreCase("ITS") || 287 strPronoun.equalsIgnoreCase("ITSELF")) { 288 return _resolve$IT$ITS$ITSELF$(currPronoun,prnSentIndex); 289 } 290 else if (strPronoun.equalsIgnoreCase("I") || 291 strPronoun.equalsIgnoreCase("ME") || 292 strPronoun.equalsIgnoreCase("MY") || 293 strPronoun.equalsIgnoreCase("MYSELF")) { 294 return _resolve$I$ME$MY$MYSELF$(currPronoun,prnSentIndex); 295 } 296 else { 297 if (DEBUG) { 298 gate.util.Err.println("["+strPronoun+"] is not handled yet..."); 299 } 300 return null; 301 } 302 } 303 
304 305 boolean isPleonastic(Annotation pronoun) { 306 307 //0. preconditions 308 Assert.assertNotNull(pronoun); 309 String str = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME); 310 Assert.assertTrue(str.equalsIgnoreCase("IT")); 311 312 //1. do we have pleonasms in this text? 313 if (this.pleonasticIt.length == 0) { 314 return false; 315 } 316 317 //2. find closest pleonasm index 318 int closestPleonasmIndex = java.util.Arrays.binarySearch(this.pleonasticIt, 319 pronoun, 320 ANNOTATION_OFFSET_COMPARATOR); 321 //normalize index 322 if (closestPleonasmIndex < 0) { 323 closestPleonasmIndex = -closestPleonasmIndex -1 -1; 324 } 325 326 //still not good? 327 if (closestPleonasmIndex < 0) { 328 closestPleonasmIndex = 0; 329 } 330 331 //get closest pleonasm 332 Annotation pleonasm = this.pleonasticIt[closestPleonasmIndex]; 333 334 //System.out.println(pleonasm); 335 //System.out.println(pronoun); 336 337 //3. return true only if the proboun is contained in pleonastic fragment 338 boolean result = (pleonasm.getStartNode().getOffset().intValue() <= pronoun.getStartNode().getOffset().intValue() 339 && 340 pleonasm.getEndNode().getOffset().intValue() >= pronoun.getEndNode().getOffset().intValue()); 341 //System.out.println("is pleon=["+result+"]"); 342 return result; 343 } 344 345 346 /** --- */ 347 private Annotation _resolve$HE$HIM$HIS$HIMSELF$(Annotation pronoun, int sentenceIndex) { 348 349 //0. preconditions 350 Assert.assertTrue(pronoun.getType().equals(TOKEN_ANNOTATION_TYPE)); 351 Assert.assertTrue(pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) || 352 pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY)); 353 String pronounString = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME); 354 Assert.assertTrue(pronounString.equalsIgnoreCase("HE") || 355 pronounString.equalsIgnoreCase("HIM") || 356 pronounString.equalsIgnoreCase("HIS") || 357 pronounString.equalsIgnoreCase("HIMSELF")); 358 359 //1. 
360 boolean antecedentFound = false; 361 int scopeFirstIndex = sentenceIndex - SENTENCES_IN_SCOPE; 362 if (scopeFirstIndex < 0 ) scopeFirstIndex = 0; 363 364 int currSentenceIndex = sentenceIndex; 365 Annotation bestAntecedent = null; 366 367 while (currSentenceIndex >= scopeFirstIndex || antecedentFound == false) { 368 Sentence currSentence = this.textSentences[currSentenceIndex]; 369 AnnotationSet persons = currSentence.getPersons(); 370 371 Iterator it = persons.iterator(); 372 while (it.hasNext()) { 373 Annotation currPerson = (Annotation)it.next(); 374 String gender = (String)this.personGender.get(currPerson); 375 376 if (null == gender || 377 gender.equalsIgnoreCase("MALE") || 378 gender.equalsIgnoreCase("UNKNOWN")) { 379 //hit 380 antecedentFound = true; 381 382 if (null == bestAntecedent) { 383 bestAntecedent = currPerson; 384 } 385 else { 386 bestAntecedent = _chooseAntecedent$HE$HIM$HIS$SHE$HER$HIMSELF$(bestAntecedent,currPerson,pronoun); 387 } 388 } 389 } 390 391 if (0 == currSentenceIndex--) 392 break; 393 394 } 395 396 return bestAntecedent; 397 } 398 399 400 /** --- */ 401 private Annotation _resolve$SHE$HER$(Annotation pronoun, int sentenceIndex) { 402 403 //0. preconditions 404 Assert.assertTrue(pronoun.getType().equals(TOKEN_ANNOTATION_TYPE)); 405 Assert.assertTrue(pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) || 406 pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY)); 407 String pronounString = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME); 408 Assert.assertTrue(pronounString.equalsIgnoreCase("SHE") || 409 pronounString.equalsIgnoreCase("HER")); 410 411 //1. 
412 boolean antecedentFound = false; 413 int scopeFirstIndex = sentenceIndex - SENTENCES_IN_SCOPE; 414 if (scopeFirstIndex < 0 ) scopeFirstIndex = 0; 415 int currSentenceIndex = sentenceIndex; 416 Annotation bestAntecedent = null; 417 418 while (currSentenceIndex >= scopeFirstIndex || antecedentFound == false) { 419 Sentence currSentence = this.textSentences[currSentenceIndex]; 420 AnnotationSet persons = currSentence.getPersons(); 421 422 Iterator it = persons.iterator(); 423 while (it.hasNext()) { 424 Annotation currPerson = (Annotation)it.next(); 425 String gender = (String)this.personGender.get(currPerson); 426 427 if (null == gender || 428 gender.equalsIgnoreCase("FEMALE") || 429 gender.equalsIgnoreCase("UNKNOWN")) { 430 //hit 431 antecedentFound = true; 432 433 if (null == bestAntecedent) { 434 bestAntecedent = currPerson; 435 } 436 else { 437 bestAntecedent = _chooseAntecedent$HE$HIM$HIS$SHE$HER$HIMSELF$(bestAntecedent,currPerson,pronoun); 438 } 439 } 440 } 441 442 if (0 == currSentenceIndex--) 443 break; 444 } 445 446 return bestAntecedent; 447 } 448 449 450 /** --- */ 451 private Annotation _resolve$IT$ITS$ITSELF$(Annotation pronoun, int sentenceIndex) { 452 453 //0. preconditions 454 Assert.assertTrue(pronoun.getType().equals(TOKEN_ANNOTATION_TYPE)); 455 Assert.assertTrue(pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) || 456 pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY)); 457 String pronounString = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME); 458 Assert.assertTrue(pronounString.equalsIgnoreCase("IT") || 459 pronounString.equalsIgnoreCase("ITS") || 460 pronounString.equalsIgnoreCase("ITSELF")); 461 462 //0.5 check if the IT is pleonastic 463 if (pronounString.equalsIgnoreCase("IT") && 464 isPleonastic(pronoun)) { 465 //System.out.println("PLEONASM..."); 466 return null; 467 } 468 469 //1. 
470 int scopeFirstIndex = sentenceIndex - 1; 471 if (scopeFirstIndex < 0 ) scopeFirstIndex = 0; 472 473 int currSentenceIndex = sentenceIndex; 474 Annotation bestAntecedent = null; 475 476 while (currSentenceIndex >= scopeFirstIndex) { 477 478 Sentence currSentence = this.textSentences[currSentenceIndex]; 479 AnnotationSet org = currSentence.getOrganizations(); 480 AnnotationSet loc = currSentence.getLocations(); 481 //combine them 482 AnnotationSet org_loc = org; 483 org_loc.addAll(loc); 484 485 Iterator it = org_loc.iterator(); 486 while (it.hasNext()) { 487 Annotation currOrgLoc = (Annotation)it.next(); 488 489 if (null == bestAntecedent) { 490 //discard cataphoric references 491 if (currOrgLoc.getStartNode().getOffset().longValue() < 492 pronoun.getStartNode().getOffset().longValue()) { 493 bestAntecedent = currOrgLoc; 494 } 495 } 496 else { 497 bestAntecedent = this._chooseAntecedent$IT$ITS$ITSELF$(bestAntecedent,currOrgLoc,pronoun); 498 } 499 } 500 501 if (0 == currSentenceIndex--) 502 break; 503 } 504 505 return bestAntecedent; 506 } 507 508 509 /** --- */ 510 private Annotation _resolve$I$ME$MY$MYSELF$(Annotation pronoun, int sentenceIndex) { 511 512 //0. preconditions 513 Assert.assertTrue(pronoun.getType().equals(TOKEN_ANNOTATION_TYPE)); 514 Assert.assertTrue(pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) || 515 pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY)); 516 String pronounString = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME); 517 Assert.assertTrue(pronounString.equalsIgnoreCase("I") || 518 pronounString.equalsIgnoreCase("MY") || 519 pronounString.equalsIgnoreCase("ME") || 520 pronounString.equalsIgnoreCase("MYSELF")); 521 522 //0.5 sanity check 523 //if there are not quotes at all in the text then exit 524 if (0 == this.quotedText.length) { 525 //System.out.println("TEXT WITH NO QUOTES ENCOUNTERED..."); 526 return null; 527 } 528 529 530 //1. 
531 Annotation bestAntecedent = null; 532 533 int closestQuoteIndex = java.util.Arrays.binarySearch(this.quotedText,pronoun,ANNOTATION_OFFSET_COMPARATOR); 534 //normalize index 535 if (closestQuoteIndex < 0) { 536 closestQuoteIndex = -closestQuoteIndex -1 -1; 537 } 538 539 //still not good? 540 if (closestQuoteIndex < 0) { 541 closestQuoteIndex = 0; 542 } 543 544 //get closest Quote 545 Quote quoteContext = this.quotedText[closestQuoteIndex]; 546 547 //assure that the pronoun is contained in the quoted text fragment 548 //otherwise exit 549 550 if (pronoun.getStartNode().getOffset().intValue() > quoteContext.getEndOffset().intValue() || 551 pronoun.getEndNode().getOffset().intValue() < quoteContext.getStartOffset().intValue()) { 552 //oops, probably incorrect text - I/My/Me is not part of quoted text fragment 553 //exit 554 //System.out.println("Oops! ["+pronounString+"] not part of quoted fragment..."); 555 return null; 556 } 557 558 //get the Persons that precede/succeed the quoted fragment 559 //the order is: 560 // 561 //[1]. if there exists a Person or pronoun in {he, she} following the quoted fragment but 562 //in the same sentence, then use it 563 //i.e. ["PRN1(x)...", said X ...A, B, C ....] 564 // 565 //[2]. if there is a Person (NOT a pronoun) in the same sentence, 566 // preceding the quote, then use it 567 //i.e. . [A, B, C...X ..."PRN1(x) ..."...] 
568 // 569 570 //try [1] 571 //get the succeeding Persons/pronouns 572 AnnotationSet succCandidates = quoteContext.getAntecedentCandidates(Quote.ANTEC_AFTER); 573 if (false == succCandidates.isEmpty()) { 574 //cool, we have candidates, pick up the one closest to the end quote 575 Iterator it = succCandidates.iterator(); 576 577 while (it.hasNext()) { 578 Annotation currCandidate = (Annotation)it.next(); 579 if (null == bestAntecedent || ANNOTATION_OFFSET_COMPARATOR.compare(bestAntecedent,currCandidate) > 0) { 580 //wow, we have a candidate that is closer to the quote 581 bestAntecedent = currCandidate; 582 } 583 } 584 } 585 586 //try [2] 587 //get the preceding Persons/pronouns 588 if (null == bestAntecedent) { 589 AnnotationSet precCandidates = quoteContext.getAntecedentCandidates(Quote.ANTEC_BEFORE); 590 if (false == precCandidates.isEmpty()) { 591 //cool, we have candidates, pick up the one closest to the end quote 592 Iterator it = precCandidates.iterator(); 593 594 while (it.hasNext()) { 595 Annotation currCandidate = (Annotation)it.next(); 596 if (null == bestAntecedent || ANNOTATION_OFFSET_COMPARATOR.compare(bestAntecedent,currCandidate) < 0) { 597 //wow, we have a candidate that is closer to the quote 598 bestAntecedent = currCandidate; 599 } 600 } 601 } 602 } 603 604 //try [3] 605 //get the Persons/pronouns back in context 606 if (null == bestAntecedent) { 607 AnnotationSet precCandidates = quoteContext.getAntecedentCandidates(Quote.ANTEC_BACK); 608 if (false == precCandidates.isEmpty()) { 609 //cool, we have candidates, pick up the one closest to the end quote 610 Iterator it = precCandidates.iterator(); 611 612 while (it.hasNext()) { 613 Annotation currCandidate = (Annotation)it.next(); 614 if (null == bestAntecedent || ANNOTATION_OFFSET_COMPARATOR.compare(bestAntecedent,currCandidate) > 0) { 615 //wow, we have a candidate that is closer to the quote 616 bestAntecedent = currCandidate; 617 } 618 } 619 } 620 } 621 622 return bestAntecedent; 623 } 624 625 
626 /** --- */ 627 private void preprocess() throws ExecutionException { 628 629 //0.5 cleanup 630 this.personGender.clear(); 631 this.anaphor2antecedent.clear(); 632 633 //1.get all annotation in the input set 634 if ( this.annotationSetName == null || this.annotationSetName.equals("")) { 635 this.defaultAnnotations = this.document.getAnnotations(); 636 } 637 else { 638 this.defaultAnnotations = this.document.getAnnotations(annotationSetName); 639 } 640 641 //if none found, print warning and exit 642 if (this.defaultAnnotations == null || this.defaultAnnotations.isEmpty()) { 643 Err.prln("Coref Warning: No annotations found for processing!"); 644 return; 645 } 646 647 648 649 //2.1 remove QT annotations if left from previous execution 650 AnnotationSet qtSet = this.defaultAnnotations.get(QUOTED_TEXT_TYPE); 651 if (null != qtSet) { 652 qtSet.clear(); 653 } 654 655 //2.2. run quoted text transducer to generate "Quoted Text" annotations 656 this.qtTransducer.execute(); 657 658 //3.1 remove pleonastic annotations if left from previous execution 659 AnnotationSet pleonSet = this.defaultAnnotations.get(PLEONASTIC_TYPE); 660 if (null != pleonSet) { 661 pleonSet.clear(); 662 } 663 664 //3.2 run quoted text transducer to generate "Pleonasm" annotations 665 this.pleonTransducer.execute(); 666 667 //4.get all SENTENCE annotations 668 AnnotationSet sentenceAnnotations = this.defaultAnnotations.get(SENTENCE_ANNOTATION_TYPE); 669 670 this.textSentences = new Sentence[sentenceAnnotations.size()]; 671 Object[] sentenceArray = sentenceAnnotations.toArray(); 672 673 java.util.Arrays.sort(sentenceArray,ANNOTATION_OFFSET_COMPARATOR); 674 675 for (int i=0; i< sentenceArray.length; i++) { 676 677 Annotation currSentence = (Annotation)sentenceArray[i]; 678 Long sentStartOffset = currSentence.getStartNode().getOffset(); 679 Long sentEndOffset = currSentence.getEndNode().getOffset(); 680 681 //4.1. 
get PERSOSNS in this sentence 682 AnnotationSet sentPersons = this.defaultAnnotations.get(PERSON_ANNOTATION_TYPE, 683 sentStartOffset, 684 sentEndOffset); 685 686 //4.2. get ORGANIZATIONS in this sentence 687 AnnotationSet sentOrgs = this.defaultAnnotations.get(ORGANIZATION_ANNOTATION_TYPE, 688 sentStartOffset, 689 sentEndOffset); 690 691 //4.3. get LOCATION in this sentence 692 AnnotationSet sentLocs = this.defaultAnnotations.get(LOCATION_ANNOTATION_TYPE, 693 sentStartOffset, 694 sentEndOffset); 695 696 //4.5. create a Sentence for thei SENTENCE annotation 697 this.textSentences[i] = new Sentence(i, 698 0, 699 sentStartOffset, 700 sentEndOffset, 701 sentPersons, 702 sentOrgs, 703 sentLocs 704 ); 705 706 //4.6. for all PERSONs in the sentence - find their gender using the 707 //orthographic coreferences if the gender of some entity is unknown 708 Iterator itPersons = sentPersons.iterator(); 709 while (itPersons.hasNext()) { 710 Annotation currPerson = (Annotation)itPersons.next(); 711 String gender = this.findPersonGender(currPerson); 712 this.personGender.put(currPerson,gender); 713 } 714 } 715 716 //5. initialise the quoted text fragments 717 AnnotationSet sentQuotes = this.defaultAnnotations.get(QUOTED_TEXT_TYPE); 718 719 //if none then return 720 if (null == sentQuotes) { 721 this.quotedText = new Quote[0]; 722 } 723 else { 724 this.quotedText = new Quote[sentQuotes.size()]; 725 726 Object[] quotesArray = sentQuotes.toArray(); 727 java.util.Arrays.sort(quotesArray,ANNOTATION_OFFSET_COMPARATOR); 728 729 for (int i =0; i < quotesArray.length; i++) { 730 this.quotedText[i] = new Quote((Annotation)quotesArray[i],i); 731 } 732 } 733 734 //6. 
initialuse the plonastic It annotations 735 AnnotationSet plaonasticSet = this.defaultAnnotations.get(PLEONASTIC_TYPE); 736 737 if (null == plaonasticSet) { 738 this.pleonasticIt = new Annotation[0]; 739 } 740 else { 741 this.pleonasticIt = new Annotation[plaonasticSet.size()]; 742 743 Object[] quotesArray = plaonasticSet.toArray(); 744 java.util.Arrays.sort(quotesArray,ANNOTATION_OFFSET_COMPARATOR); 745 746 for (int i=0; i< this.pleonasticIt.length; i++) { 747 this.pleonasticIt[i] = (Annotation)quotesArray[i]; 748 } 749 } 750 751 } 752 753 754 /** --- */ 755 private String findPersonGender(Annotation person) { 756 757 String result = (String)person.getFeatures().get(PERSON_GENDER_FEATURE_NAME); 758 759 if (null==result) { 760 //gender is unknown - try to find it from the ortho coreferences 761 List orthoMatches = (List)person.getFeatures().get(ANNOTATION_COREF_FEATURE_NAME); 762 763 if (null != orthoMatches) { 764 Iterator itMatches = orthoMatches.iterator(); 765 766 while (itMatches.hasNext()) { 767 Integer correferringID = (Integer)itMatches.next(); 768 Annotation coreferringEntity = this.defaultAnnotations.get(correferringID); 769 Assert.assertTrue(coreferringEntity.getType().equalsIgnoreCase(PERSON_ANNOTATION_TYPE)); 770 String correferringGender = (String)coreferringEntity.getFeatures().get(PERSON_GENDER_FEATURE_NAME); 771 772 if (null != correferringGender) { 773 result = correferringGender; 774 break; 775 } 776 } 777 } 778 } 779 780 return result; 781 } 782 783 784 /** --- */ 785 private static class AnnotationOffsetComparator implements Comparator { 786 787 private int _getOffset(Object o) { 788 789 if (o instanceof Annotation) { 790 return ((Annotation)o).getEndNode().getOffset().intValue(); 791 } 792 else if (o instanceof Sentence) { 793 return ((Sentence)o).getStartOffset().intValue(); 794 } 795 else if (o instanceof Quote) { 796 return ((Quote)o).getStartOffset().intValue(); 797 } 798 else if (o instanceof Node) { 799 return 
((Node)o).getOffset().intValue(); 800 } 801 else { 802 throw new IllegalArgumentException(); 803 } 804 } 805 806 public int compare(Object o1,Object o2) { 807 808 //0. preconditions 809 Assert.assertNotNull(o1); 810 Assert.assertNotNull(o2); 811 Assert.assertTrue(o1 instanceof Annotation || 812 o1 instanceof Sentence || 813 o1 instanceof Quote || 814 o1 instanceof Node); 815 Assert.assertTrue(o2 instanceof Annotation || 816 o2 instanceof Sentence || 817 o2 instanceof Quote || 818 o2 instanceof Node); 819 820 int offset1 = _getOffset(o1); 821 int offset2 = _getOffset(o2); 822 823 return offset1 - offset2; 824 } 825 } 826 827 828 /** --- */ 829 private Annotation _chooseAntecedent$HE$HIM$HIS$SHE$HER$HIMSELF$(Annotation ant1, Annotation ant2, Annotation pronoun) { 830 831 //0. preconditions 832 Assert.assertNotNull(ant1); 833 Assert.assertNotNull(ant2); 834 Assert.assertNotNull(pronoun); 835 Assert.assertTrue(pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) || 836 pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY)); 837 String pronounString = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME); 838 Assert.assertTrue(pronounString.equalsIgnoreCase("SHE") || 839 pronounString.equalsIgnoreCase("HER") || 840 pronounString.equalsIgnoreCase("HE") || 841 pronounString.equalsIgnoreCase("HIM") || 842 pronounString.equalsIgnoreCase("HIS") || 843 pronounString.equalsIgnoreCase("HIMSELF")); 844 845 Long offset1 = ant1.getStartNode().getOffset(); 846 Long offset2 = ant2.getStartNode().getOffset(); 847 Long offsetPrn = pronoun.getStartNode().getOffset(); 848 849 long diff1 = offsetPrn.longValue() - offset1.longValue(); 850 long diff2 = offsetPrn.longValue() - offset2.longValue(); 851 // Assert.assertTrue(diff1 != 0 && diff2 != 0); 852 //reject candidates that overlap with the pronoun 853 if (diff1 == 0) { 854 return ant2; 855 } 856 else if (diff2 == 0) { 857 return ant1; 858 } 859 860 //get the one CLOSEST AND PRECEDING 
the pronoun 861 if (diff1 > 0 && diff2 > 0) { 862 //we have [...antecedentA...AntecedentB....pronoun...] ==> choose B 863 if (diff1 < diff2) 864 return ant1; 865 else 866 return ant2; 867 } 868 else if (diff1 < 0 && diff2 < 0) { 869 //we have [...pronoun ...antecedentA...AntecedentB.......] ==> choose A 870 if (Math.abs(diff1) < Math.abs(diff2)) 871 return ant1; 872 else 873 return ant2; 874 } 875 else { 876 Assert.assertTrue(Math.abs(diff1 + diff2) < Math.abs(diff1) + Math.abs(diff2)); 877 //we have [antecedentA...pronoun...AntecedentB] ==> choose A 878 if (diff1 > 0) 879 return ant1; 880 else 881 return ant2; 882 } 883 } 884 885 /** --- */ 886 private Annotation _chooseAntecedent$IT$ITS$ITSELF$(Annotation ant1, Annotation ant2, Annotation pronoun) { 887 888 //0. preconditions 889 Assert.assertNotNull(ant1); 890 Assert.assertNotNull(ant2); 891 Assert.assertNotNull(pronoun); 892 Assert.assertTrue(pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) || 893 pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY)); 894 String pronounString = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME); 895 896 Assert.assertTrue(pronounString.equalsIgnoreCase("IT") || 897 pronounString.equalsIgnoreCase("ITS") || 898 pronounString.equalsIgnoreCase("ITSELF")); 899 900 Long offset1 = ant1.getStartNode().getOffset(); 901 Long offset2 = ant2.getStartNode().getOffset(); 902 Long offsetPrn = pronoun.getStartNode().getOffset(); 903 long diff1 = offsetPrn.longValue() - offset1.longValue(); 904 long diff2 = offsetPrn.longValue() - offset2.longValue(); 905 // Assert.assertTrue(diff1 != 0 && diff2 != 0); 906 //reject candidates that overlap with the pronoun 907 if (diff1 == 0) { 908 return ant2; 909 } 910 else if (diff2 == 0) { 911 return ant1; 912 } 913 914 915 //get the one CLOSEST AND PRECEDING the pronoun 916 if (diff1 > 0 && diff2 > 0) { 917 //we have [...antecedentA...AntecedentB....pronoun...] 
// ==> choose B
      if (diff1 < diff2)
        return ant1;
      else
        return ant2;
    }
    else if (diff1 > 0){
      Assert.assertTrue(Math.abs(diff1 + diff2) < Math.abs(diff1) + Math.abs(diff2));
      //we have [antecedentA...pronoun...AntecedentB] ==> choose A
      return ant1;
    }
    else if (diff2 > 0){
      Assert.assertTrue(Math.abs(diff1 + diff2) < Math.abs(diff1) + Math.abs(diff2));
      //we have [antecedentA...pronoun...AntecedentB] ==> choose A
      //NOTE(review): this comment repeats the one in the branch above although this
      //branch returns ant2 - confirm the intended wording
      return ant2;
    }
    else {
      //both possible antecedents are BEHIND the anaophoric pronoun - i.e. we have either
      //cataphora, or nominal antecedent, or an antecedent that is further back in scope
      //in any case - discard the antecedents
      return null;
    }
  }


  /**
   * A quoted-text annotation together with the antecedent candidates that were
   * collected around it when the object was constructed.
   *
   * Three candidate sets are kept, one per mode constant: candidates preceding
   * the quote (ANTEC_BEFORE), candidates succeeding it (ANTEC_AFTER) and
   * candidates from the sentence before the one in which the quote starts
   * (ANTEC_BACK).
   */
  private class Quote {

    /** Mode: candidates that succeed the quote, in the sentence where the quote ends. */
    public static final int ANTEC_AFTER = 1;
    /** Mode: candidates that precede the quote, in the sentence where the quote starts. */
    public static final int ANTEC_BEFORE = 2;
    /** Mode: candidates from the sentence previous to the one where the quote starts. */
    public static final int ANTEC_BACK = 3;
    /** Candidates collected in ANTEC_BEFORE mode. */
    private AnnotationSet antecedentsBefore;
    /** Candidates collected in ANTEC_AFTER mode. */
    private AnnotationSet antecedentsAfter;
    /** Candidates collected in ANTEC_BACK mode; stays null if the quote starts in sentence 0. */
    private AnnotationSet antecedentsBackInContext;
    /** The annotation that delimits the quoted fragment. */
    private Annotation quoteAnnotation;
    /** Index of this quote, used to look up the previous quote in quotedText. */
    private int quoteIndex;

    /**
     * Stores the quote annotation and its index and immediately collects the
     * three candidate sets.
     *
     * @param quoteAnnotation the annotation covering the quoted fragment
     * @param index the index of this quote; must be >= 0
     */
    public Quote(Annotation quoteAnnotation, int index) {

      this.quoteAnnotation = quoteAnnotation;
      this.quoteIndex = index;
      init();
    }

    /**
     * Locates the sentences in which the quote starts and ends and fills the
     * three candidate sets from them.
     */
    private void init() {

      //0. preconditions
      Assert.assertNotNull(textSentences);

      //1. candidates preceding the quote
      //1.1 locate the sentence containing the opening quote marks
      int startSentenceIndex = findSentenceIndex(this.quoteAnnotation.getStartNode());

      //1.2 get the persons and restrict to those that precede the quote (i.e. are not
      //contained in the quote)
      this.antecedentsBefore = generateAntecedentCandidates(startSentenceIndex,
                                                            this.quoteIndex,
                                                            ANTEC_BEFORE);

      //2. candidates further back in context
      //2.1 get the persons from the sentence preceding the sentence containing the
      //quote start; there is no such sentence when the quote starts in sentence 0
      if (startSentenceIndex > 0) {
        this.antecedentsBackInContext = generateAntecedentCandidates(startSentenceIndex - 1,
                                                                     this.quoteIndex,
                                                                     ANTEC_BACK);
      }

      //3. candidates succeeding the quote
      //3.1 locate the sentence containing the closing quote marks
      int endSentenceIndex = findSentenceIndex(this.quoteAnnotation.getEndNode());

      this.antecedentsAfter = generateAntecedentCandidates(endSentenceIndex,
                                                           this.quoteIndex,
                                                           ANTEC_AFTER);
    }

    /**
     * Binary-searches textSentences for the given key and turns the result
     * into a usable sentence index.
     *
     * Arrays.binarySearch returns (-(insertion point) - 1) when there is no
     * exact match; in that case the sentence just before the insertion point
     * is taken. The result is never negative.
     * Assumes textSentences is ordered consistently with
     * ANNOTATION_OFFSET_COMPARATOR, as binarySearch requires - TODO confirm.
     *
     * @param key the object to look up (a start or end node of the quote)
     * @return a sentence index >= 0
     */
    private int findSentenceIndex(Object key) {

      int position = java.util.Arrays.binarySearch(textSentences,
                                                   key,
                                                   ANNOTATION_OFFSET_COMPARATOR);

      //exact hit, or (insertion point - 1) otherwise
      int index = position >= 0 ? position : -position - 2;

      //an insertion point of 0 would give -1, clamp it
      return index < 0 ? 0 : index;
    }


    /**
     * Collects the antecedent candidates for this quote from one sentence.
     *
     * The result starts as a copy of the sentence's persons. Token annotations
     * matching PRP_RESTRICTION whose string is "he" or "she" (ignoring case)
     * are added from a span that depends on the mode. Finally the candidates
     * on the wrong side of the quote are removed, see the comment in the body.
     *
     * @param sentenceNumber index into textSentences; must be >= 0
     * @param quoteNumber index of the quote being resolved; must be >= 0
     * @param mode one of ANTEC_BEFORE, ANTEC_AFTER, ANTEC_BACK
     * @return the candidates that remain after filtering
     */
    private AnnotationSet generateAntecedentCandidates(int sentenceNumber,
                                                        int quoteNumber ,
                                                        int mode) {

      //0. preconditions
      Assert.assertTrue(sentenceNumber >=0);
      Assert.assertTrue(quoteNumber >=0);
      Assert.assertTrue(mode == Quote.ANTEC_AFTER ||
                        mode == Quote.ANTEC_BEFORE ||
                        mode == Quote.ANTEC_BACK);

      //1. get sentence
      Sentence sentence = textSentences[sentenceNumber];

      //2. get the persons (copied, the set is modified below)
      AnnotationSet antecedents = new AnnotationSetImpl(sentence.getPersons());

      //3. get the annotations in the relevant context
      AnnotationSet annotations = null;

      switch(mode) {

        case ANTEC_BEFORE:
          //from the sentence start up to the opening of the quote
          annotations = defaultAnnotations.getContained(sentence.getStartOffset(),
                                                        this.getStartOffset());
          break;

        case ANTEC_AFTER:
          //from the closing of the quote up to the sentence end
          annotations = defaultAnnotations.getContained(this.getEndOffset(),
                                                        sentence.getEndOffset());
          break;

        case ANTEC_BACK:
          //the whole sentence
          annotations = defaultAnnotations.getContained(sentence.getStartOffset(),
                                                        sentence.getEndOffset());
          break;
      }

      //4. get the pronouns, restricted to he/she
      if (null != annotations) {
        AnnotationSet pronouns = annotations.get(TOKEN_ANNOTATION_TYPE,PRP_RESTRICTION);

        if (null != pronouns) {

          Iterator it = pronouns.iterator();
          while (it.hasNext()) {
            Annotation currPronoun = (Annotation)it.next();
            //add to the candidates only if HE/SHE
            String pronounString = (String)currPronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);

            if (null != pronounString &&
                (pronounString.equalsIgnoreCase("he") || pronounString.equalsIgnoreCase("she"))
                )
              antecedents.add(currPronoun);
          }//while
        }//if
      }//if


      //5. depending on the mode, may have to restrict persons to these that precede/succeed
      //the quoted fragment
      //
      //for ANTEC_BEFORE, get the ones #preceding# the quote, contained in the sentence where
      //the quote *starts*
      //
      //for ANTEC_AFTER, get the ones #succeeding# the quote, contained in the sentence where
      //the quote *ends*
      //
      //for ANTEC_BACK, we are operating in the context of the sentence previous to the
      //sentence where the quote starts. I.e. we're resolving a case like
      // [sss "q1q1q1q1" s1s1s1s1]["q2q2q2q2"]
      //...and we want to get the entities from the s1s1 part - they *succeed* the #previous# quote
      //Note that the current sentence is the first one, not the second
      //
      //offsets are compared as long values in all three modes
      Iterator itPersons = antecedents.iterator();

      while (itPersons.hasNext()) {
        Annotation currPerson = (Annotation)itPersons.next();

        //cut
        if (Quote.ANTEC_BEFORE == mode &&
            currPerson.getStartNode().getOffset().longValue() > getStartOffset().longValue()) {
          //restrict only to persons preceding the quote
          itPersons.remove();
        }
        else if (Quote.ANTEC_AFTER == mode &&
                 currPerson.getStartNode().getOffset().longValue() < getEndOffset().longValue()) {
          //restrict only to persons succeeding the quote
          itPersons.remove();
        }
        else if (Quote.ANTEC_BACK == mode) {
          //this one is tricky
          //locate the quote previous to the one we're resolving
          //(since we're operating in the sentence previous to the quote being resolved
          //we try to find if any quote (prevQuote) exists in this sentence and get the
          //persons succeeding it)

          //get prev quote
          //is the curr quote the first one? then there is nothing to restrict against
          if (quoteNumber >0) {
            Quote prevQuote = PronominalCoref.this.quotedText[quoteNumber-1];

            //restrict to the succeeding persons
            if (currPerson.getStartNode().getOffset().longValue() < prevQuote.getEndOffset().longValue()) {
              itPersons.remove();
            }
          }
        }
      }

      return antecedents;
    }

    /**
     * @return the offset of the start node of the quote annotation
     */
    public Long getStartOffset() {
      return this.quoteAnnotation.getStartNode().getOffset();
    }

    /**
     * @return the offset of the end node of the quote annotation
     */
    public Long getEndOffset() {
      return this.quoteAnnotation.getEndNode().getOffset();
    }

    /**
     * Returns the candidate set collected for the given mode.
     *
     * @param type one of ANTEC_AFTER, ANTEC_BEFORE, ANTEC_BACK
     * @return the matching set; for ANTEC_BACK this is null when the quote
     *         starts in the first sentence
     * @throws IllegalArgumentException for any other value of type
     */
    public AnnotationSet getAntecedentCandidates(int type) {

      switch(type) {

        case ANTEC_AFTER:
          return this.antecedentsAfter;

        case ANTEC_BEFORE:
          return this.antecedentsBefore;

        case ANTEC_BACK:
          return this.antecedentsBackInContext;

        default:
          throw new IllegalArgumentException();
      }
    }

  }


  /**
   * Plain holder for one sentence: its numbers, its start and end offsets and
   * the person, organization and location annotation sets passed to the
   * constructor. All values are stored as given, nothing is copied.
   */
  private class Sentence {

    /** Sentence number as passed to the constructor; this class has no accessor for it. */
    private int sentNumber;
    /** Paragraph number as passed to the constructor; this class has no accessor for it. */
    private int paraNumber;
    /** Offset at which the sentence starts. */
    private Long startOffset;
    /** Offset at which the sentence ends. */
    private Long endOffset;
    /** Person annotations of the sentence. */
    private AnnotationSet persons;
    /** Organization annotations of the sentence. */
    private AnnotationSet organizations;
    /** Location annotations of the sentence. */
    private AnnotationSet locations;

    /**
     * Stores all arguments in the fields of the same name.
     */
    public Sentence(int sentNumber,
                    int paraNumber,
                    Long startOffset,
                    Long endOffset,
                    AnnotationSet persons,
                    AnnotationSet organizations,
                    AnnotationSet locations) {

      this.sentNumber = sentNumber;
      this.paraNumber = paraNumber;
      this.startOffset = startOffset;
      this.endOffset = endOffset;
      this.persons = persons;
      this.organizations = organizations;
      this.locations = locations;
    }

    /**
     * @return the start offset given at construction
     */
    public Long getStartOffset() {
      return this.startOffset;
    }

    /**
     * @return the end offset given at construction
     */
    public Long getEndOffset() {
      return this.endOffset;
    }

    /**
     * @return the person annotations given at construction
     */
    public AnnotationSet getPersons() {
      return this.persons;
    }

    /**
     * @return the organization annotations given at construction
     */
    public AnnotationSet getOrganizations() {
      return this.organizations;
    }

    /**
     * @return the location annotations given at construction
     */
    public AnnotationSet getLocations() {
      return this.locations;
    }
  }

}
|
PronominalCoref |
|