|
PronominalCoref |
|
1 /* 2 * PronominalCoref.java 3 * 4 * Copyright (c) 1998-2001, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Marin Dimitrov, 30/Dec/2001 12 * 13 * $Id: PronominalCoref.java,v 1.23 2002/03/11 10:00:18 marin Exp $ 14 */ 15 16 package gate.creole.coref; 17 18 import java.util.*; 19 import java.net.*; 20 21 import junit.framework.*; 22 23 import gate.*; 24 import gate.creole.*; 25 import gate.util.*; 26 import gate.annotation.*; 27 28 public class PronominalCoref extends AbstractLanguageAnalyser 29 implements ProcessingResource, ANNIEConstants{ 30 31 public static final String COREF_DOCUMENT_PARAMETER_NAME = "document"; 32 33 public static final String COREF_ANN_SET_PARAMETER_NAME = "annotationSetName"; 34 35 /** --- */ 36 private static final boolean DEBUG = false; 37 38 //JAPE grammars 39 private static final String QT_GRAMMAR_URL = "gate://gate/creole/coref/quoted_text.jape"; 40 private static final String PLEON_GRAMMAR_URL = "gate://gate/creole/coref/pleonasm.jape"; 41 42 //annotation types 43 private static final String QUOTED_TEXT_TYPE = "Quoted Text"; 44 private static final String PLEONASTIC_TYPE = "PleonasticIt"; 45 46 //annotation features 47 private static final String PRP_CATEGORY = "PRP"; 48 private static final String PRP$_CATEGORY = "PRP$"; 49 50 //scope 51 private static final int SENTENCES_IN_SCOPE = 3; 52 /** --- */ 53 private static AnnotationOffsetComparator ANNOTATION_OFFSET_COMPARATOR; 54 /** --- */ 55 private String annotationSetName; 56 /** --- */ 57 private Transducer qtTransducer; 58 /** --- */ 59 private Transducer pleonTransducer; 60 /** --- */ 61 private AnnotationSet defaultAnnotations; 62 /** --- */ 63 private Sentence[] textSentences; 64 /** --- */ 65 private Quote[] 
quotedText; 66 /** --- */ 67 private Annotation[] pleonasticIt; 68 /** --- */ 69 private HashMap personGender; 70 /** --- */ 71 private HashMap anaphor2antecedent; 72 /** --- */ 73 private static final FeatureMap PRP_RESTRICTION; 74 75 /** --- */ 76 static { 77 ANNOTATION_OFFSET_COMPARATOR = new AnnotationOffsetComparator(); 78 PRP_RESTRICTION = new SimpleFeatureMapImpl(); 79 PRP_RESTRICTION.put(TOKEN_CATEGORY_FEATURE_NAME,PRP_CATEGORY); 80 } 81 82 /** --- */ 83 public PronominalCoref() { 84 85 this.personGender = new HashMap(); 86 this.anaphor2antecedent = new HashMap(); 87 this.qtTransducer = new gate.creole.Transducer(); 88 this.pleonTransducer = new gate.creole.Transducer(); 89 } 90 91 /** Initialise this resource, and return it. */ 92 public Resource init() throws ResourceInstantiationException { 93 94 //0. preconditions 95 Assert.assertNotNull(this.qtTransducer); 96 97 //1. initialise quoted text transducer 98 URL qtGrammarURL = null; 99 try { 100 qtGrammarURL = new URL(QT_GRAMMAR_URL); 101 } 102 catch(MalformedURLException mue) { 103 throw new ResourceInstantiationException(mue); 104 } 105 this.qtTransducer.setGrammarURL(qtGrammarURL); 106 this.qtTransducer.setEncoding("UTF-8"); 107 this.qtTransducer.init(); 108 109 //2. initialise pleonastic transducer 110 URL pleonGrammarURL = null; 111 try { 112 pleonGrammarURL = new URL(PLEON_GRAMMAR_URL); 113 } 114 catch(MalformedURLException mue) { 115 throw new ResourceInstantiationException(mue); 116 } 117 this.pleonTransducer.setGrammarURL(pleonGrammarURL); 118 this.pleonTransducer.setEncoding("UTF-8"); 119 this.pleonTransducer.init(); 120 121 122 //3. delegate 123 return super.init(); 124 } // init() 125 126 /** 127 * Reinitialises the processing resource. After calling this method the 128 * resource should be in the state it is after calling init. 129 * If the resource depends on external resources (such as rules files) then 130 * the resource will re-read those resources. 
If the data used to create 131 * the resource has changed since the resource has been created then the 132 * resource will change too after calling reInit(). 133 */ 134 public void reInit() throws ResourceInstantiationException { 135 136 if (null != this.qtTransducer) { 137 this.qtTransducer.reInit(); 138 } 139 140 if (null != this.pleonTransducer) { 141 this.pleonTransducer.reInit(); 142 } 143 144 init(); 145 } // reInit() 146 147 148 /** Set the document to run on. */ 149 public void setDocument(Document newDocument) { 150 151 //0. precondition 152 // Assert.assertNotNull(newDocument); 153 154 //1. set doc for aggregated components 155 this.qtTransducer.setDocument(newDocument); 156 this.pleonTransducer.setDocument(newDocument); 157 158 //3. delegate 159 super.setDocument(newDocument); 160 } 161 162 /** --- */ 163 public void setAnnotationSetName(String annotationSetName) { 164 this.annotationSetName = annotationSetName; 165 } 166 167 168 /** --- */ 169 public String getAnnotationSetName() { 170 return annotationSetName; 171 } 172 173 /** 174 * This method runs the coreferencer. It assumes that all the needed parameters 175 * are set. If they are not, an exception will be fired. 176 */ 177 public void execute() throws ExecutionException{ 178 179 //0. preconditions 180 if(null == this.document) { 181 throw new ExecutionException("[coreference] Document is not set!"); 182 } 183 184 //1. preprocess 185 preprocess(); 186 /* 187 //2. remove corefs from previous run 188 String annSetName = this.annotationSetName == null ? 
"COREF" 189 : this.annotationSetName; 190 191 AnnotationSet corefSet = this.document.getAnnotations(annSetName); 192 if (false == corefSet.isEmpty()) { 193 corefSet.clear(); 194 } 195 */ 196 //3.get personal pronouns 197 FeatureMap constraintPRP = new SimpleFeatureMapImpl(); 198 constraintPRP.put(TOKEN_CATEGORY_FEATURE_NAME,PRP_CATEGORY); 199 AnnotationSet personalPronouns = this.defaultAnnotations.get(TOKEN_ANNOTATION_TYPE,constraintPRP); 200 201 //4.get possesive pronouns 202 FeatureMap constraintPRP$ = new SimpleFeatureMapImpl(); 203 constraintPRP$.put(TOKEN_CATEGORY_FEATURE_NAME,PRP$_CATEGORY); 204 AnnotationSet possesivePronouns = this.defaultAnnotations.get(TOKEN_ANNOTATION_TYPE,constraintPRP$); 205 206 //5.combine them 207 AnnotationSet pronouns = personalPronouns; 208 if (null == personalPronouns) { 209 pronouns = possesivePronouns; 210 } 211 else if (null != possesivePronouns) { 212 pronouns.addAll(possesivePronouns); 213 } 214 215 //6.do we have pronouns at all? 216 if (null == pronouns) { 217 //do nothing 218 return; 219 } 220 221 //7.sort them according to offset 222 Object[] arrPronouns = pronouns.toArray(); 223 java.util.Arrays.sort(arrPronouns,ANNOTATION_OFFSET_COMPARATOR); 224 225 //8.cleanup - ease the GC 226 pronouns = personalPronouns = possesivePronouns = null; 227 228 int prnSentIndex = 0; 229 230 231 //10. process all pronouns 232 for (int i=0; i< arrPronouns.length; i++) { 233 Annotation currPronoun = (Annotation)arrPronouns[i]; 234 while (this.textSentences[prnSentIndex].getEndOffset().longValue() < 235 currPronoun.getEndNode().getOffset().longValue()) { 236 prnSentIndex++; 237 } 238 239 Sentence currSentence = this.textSentences[prnSentIndex]; 240 Assert.assertTrue(currSentence.getStartOffset().longValue() <= currPronoun.getStartNode().getOffset().longValue()); 241 Assert.assertTrue(currSentence.getEndOffset().longValue() >= currPronoun.getEndNode().getOffset().longValue()); 242 243 //11. 
find antecedent (if any) for pronoun 244 Annotation antc = findAntecedent(currPronoun,prnSentIndex); 245 246 //12. add to the ana2ant hashtable 247 this.anaphor2antecedent.put(currPronoun,antc); 248 } 249 250 //done 251 } 252 253 254 /** --- */ 255 public HashMap getResolvedAnaphora() { 256 return this.anaphor2antecedent; 257 } 258 259 /** --- */ 260 private Annotation findAntecedent(Annotation currPronoun,int prnSentIndex) { 261 262 //0. preconditions 263 Assert.assertNotNull(currPronoun); 264 Assert.assertTrue(prnSentIndex >= 0); 265 Assert.assertTrue(currPronoun.getType().equals(TOKEN_ANNOTATION_TYPE)); 266 Assert.assertTrue(currPronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) || 267 currPronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY)); 268 269 //1. 270 String strPronoun = (String)currPronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME); 271 272 Assert.assertNotNull(strPronoun); 273 274 //2. delegate processing to the appropriate methods 275 if (strPronoun.equalsIgnoreCase("HE") || 276 strPronoun.equalsIgnoreCase("HIM") || 277 strPronoun.equalsIgnoreCase("HIS") || 278 strPronoun.equalsIgnoreCase("HIMSELF")) { 279 return _resolve$HE$HIM$HIS$HIMSELF$(currPronoun,prnSentIndex); 280 } 281 else if (strPronoun.equalsIgnoreCase("SHE") || 282 strPronoun.equalsIgnoreCase("HER")) { 283 return _resolve$SHE$HER$(currPronoun,prnSentIndex); 284 } 285 else if (strPronoun.equalsIgnoreCase("IT") || 286 strPronoun.equalsIgnoreCase("ITS") || 287 strPronoun.equalsIgnoreCase("ITSELF")) { 288 return _resolve$IT$ITS$ITSELF$(currPronoun,prnSentIndex); 289 } 290 else if (strPronoun.equalsIgnoreCase("I") || 291 strPronoun.equalsIgnoreCase("ME") || 292 strPronoun.equalsIgnoreCase("MY") || 293 strPronoun.equalsIgnoreCase("MYSELF")) { 294 return _resolve$I$ME$MY$MYSELF$(currPronoun,prnSentIndex); 295 } 296 else { 297 if (DEBUG) { 298 gate.util.Err.println("["+strPronoun+"] is not handled yet..."); 299 } 300 return null; 301 } 302 } 303 
304 305 boolean isPleonastic(Annotation pronoun) { 306 307 //0. preconditions 308 Assert.assertNotNull(pronoun); 309 String str = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME); 310 Assert.assertTrue(str.equalsIgnoreCase("IT")); 311 312 //1. do we have pleonasms in this text? 313 if (this.pleonasticIt.length == 0) { 314 return false; 315 } 316 317 //2. find closest pleonasm index 318 int closestPleonasmIndex = java.util.Arrays.binarySearch(this.pleonasticIt, 319 pronoun, 320 ANNOTATION_OFFSET_COMPARATOR); 321 //normalize index 322 if (closestPleonasmIndex < 0) { 323 closestPleonasmIndex = -closestPleonasmIndex -1 -1; 324 } 325 326 //still not good? 327 if (closestPleonasmIndex < 0) { 328 closestPleonasmIndex = 0; 329 } 330 331 //get closest pleonasm 332 Annotation pleonasm = this.pleonasticIt[closestPleonasmIndex]; 333 334 //System.out.println(pleonasm); 335 //System.out.println(pronoun); 336 337 //3. return true only if the proboun is contained in pleonastic fragment 338 boolean result = (pleonasm.getStartNode().getOffset().intValue() <= pronoun.getStartNode().getOffset().intValue() 339 && 340 pleonasm.getEndNode().getOffset().intValue() >= pronoun.getEndNode().getOffset().intValue()); 341 //System.out.println("is pleon=["+result+"]"); 342 return result; 343 } 344 345 346 /** --- */ 347 private Annotation _resolve$HE$HIM$HIS$HIMSELF$(Annotation pronoun, int sentenceIndex) { 348 349 //0. preconditions 350 Assert.assertTrue(pronoun.getType().equals(TOKEN_ANNOTATION_TYPE)); 351 Assert.assertTrue(pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) || 352 pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY)); 353 String pronounString = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME); 354 Assert.assertTrue(pronounString.equalsIgnoreCase("HE") || 355 pronounString.equalsIgnoreCase("HIM") || 356 pronounString.equalsIgnoreCase("HIS") || 357 pronounString.equalsIgnoreCase("HIMSELF")); 358 359 //1. 
360 boolean antecedentFound = false; 361 int scopeFirstIndex = sentenceIndex - SENTENCES_IN_SCOPE; 362 if (scopeFirstIndex < 0 ) scopeFirstIndex = 0; 363 364 int currSentenceIndex = sentenceIndex; 365 Annotation bestAntecedent = null; 366 367 while (currSentenceIndex >= scopeFirstIndex || antecedentFound == false) { 368 Sentence currSentence = this.textSentences[currSentenceIndex]; 369 AnnotationSet persons = currSentence.getPersons(); 370 371 Iterator it = persons.iterator(); 372 while (it.hasNext()) { 373 Annotation currPerson = (Annotation)it.next(); 374 String gender = (String)this.personGender.get(currPerson); 375 376 if (null == gender || 377 gender.equalsIgnoreCase("MALE") || 378 gender.equalsIgnoreCase("UNKNOWN")) { 379 //hit 380 antecedentFound = true; 381 382 if (null == bestAntecedent) { 383 bestAntecedent = currPerson; 384 } 385 else { 386 bestAntecedent = _chooseAntecedent$HE$HIM$HIS$SHE$HER$HIMSELF$(bestAntecedent,currPerson,pronoun); 387 } 388 } 389 } 390 391 if (0 == currSentenceIndex--) 392 break; 393 394 } 395 396 return bestAntecedent; 397 } 398 399 400 /** --- */ 401 private Annotation _resolve$SHE$HER$(Annotation pronoun, int sentenceIndex) { 402 403 //0. preconditions 404 Assert.assertTrue(pronoun.getType().equals(TOKEN_ANNOTATION_TYPE)); 405 Assert.assertTrue(pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) || 406 pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY)); 407 String pronounString = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME); 408 Assert.assertTrue(pronounString.equalsIgnoreCase("SHE") || 409 pronounString.equalsIgnoreCase("HER")); 410 411 //1. 
412 boolean antecedentFound = false; 413 int scopeFirstIndex = sentenceIndex - SENTENCES_IN_SCOPE; 414 if (scopeFirstIndex < 0 ) scopeFirstIndex = 0; 415 int currSentenceIndex = sentenceIndex; 416 Annotation bestAntecedent = null; 417 418 while (currSentenceIndex >= scopeFirstIndex || antecedentFound == false) { 419 Sentence currSentence = this.textSentences[currSentenceIndex]; 420 AnnotationSet persons = currSentence.getPersons(); 421 422 Iterator it = persons.iterator(); 423 while (it.hasNext()) { 424 Annotation currPerson = (Annotation)it.next(); 425 String gender = (String)this.personGender.get(currPerson); 426 427 if (null == gender || 428 gender.equalsIgnoreCase("FEMALE") || 429 gender.equalsIgnoreCase("UNKNOWN")) { 430 //hit 431 antecedentFound = true; 432 433 if (null == bestAntecedent) { 434 bestAntecedent = currPerson; 435 } 436 else { 437 bestAntecedent = _chooseAntecedent$HE$HIM$HIS$SHE$HER$HIMSELF$(bestAntecedent,currPerson,pronoun); 438 } 439 } 440 } 441 442 if (0 == currSentenceIndex--) 443 break; 444 } 445 446 return bestAntecedent; 447 } 448 449 450 /** --- */ 451 private Annotation _resolve$IT$ITS$ITSELF$(Annotation pronoun, int sentenceIndex) { 452 453 //0. preconditions 454 Assert.assertTrue(pronoun.getType().equals(TOKEN_ANNOTATION_TYPE)); 455 Assert.assertTrue(pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) || 456 pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY)); 457 String pronounString = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME); 458 Assert.assertTrue(pronounString.equalsIgnoreCase("IT") || 459 pronounString.equalsIgnoreCase("ITS") || 460 pronounString.equalsIgnoreCase("ITSELF")); 461 462 //0.5 check if the IT is pleonastic 463 if (pronounString.equalsIgnoreCase("IT") && 464 isPleonastic(pronoun)) { 465 //System.out.println("PLEONASM..."); 466 return null; 467 } 468 469 //1. 
470 int scopeFirstIndex = sentenceIndex - 1; 471 if (scopeFirstIndex < 0 ) scopeFirstIndex = 0; 472 473 int currSentenceIndex = sentenceIndex; 474 Annotation bestAntecedent = null; 475 476 while (currSentenceIndex >= scopeFirstIndex) { 477 478 Sentence currSentence = this.textSentences[currSentenceIndex]; 479 AnnotationSet org = currSentence.getOrganizations(); 480 AnnotationSet loc = currSentence.getLocations(); 481 //combine them 482 AnnotationSet org_loc = org; 483 org_loc.addAll(loc); 484 485 Iterator it = org_loc.iterator(); 486 while (it.hasNext()) { 487 Annotation currOrgLoc = (Annotation)it.next(); 488 489 if (null == bestAntecedent) { 490 //discard cataphoric references 491 if (currOrgLoc.getStartNode().getOffset().longValue() < 492 pronoun.getStartNode().getOffset().longValue()) { 493 bestAntecedent = currOrgLoc; 494 } 495 } 496 else { 497 bestAntecedent = this._chooseAntecedent$IT$ITS$ITSELF$(bestAntecedent,currOrgLoc,pronoun); 498 } 499 } 500 501 if (0 == currSentenceIndex--) 502 break; 503 } 504 505 return bestAntecedent; 506 } 507 508 509 /** --- */ 510 private Annotation _resolve$I$ME$MY$MYSELF$(Annotation pronoun, int sentenceIndex) { 511 512 //0. preconditions 513 Assert.assertTrue(pronoun.getType().equals(TOKEN_ANNOTATION_TYPE)); 514 Assert.assertTrue(pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) || 515 pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY)); 516 String pronounString = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME); 517 Assert.assertTrue(pronounString.equalsIgnoreCase("I") || 518 pronounString.equalsIgnoreCase("MY") || 519 pronounString.equalsIgnoreCase("ME") || 520 pronounString.equalsIgnoreCase("MYSELF")); 521 522 //0.5 sanity check 523 //if there are not quotes at all in the text then exit 524 if (0 == this.quotedText.length) { 525 //System.out.println("TEXT WITH NO QUOTES ENCOUNTERED..."); 526 return null; 527 } 528 529 530 //1. 
531 Annotation bestAntecedent = null; 532 533 int closestQuoteIndex = java.util.Arrays.binarySearch(this.quotedText,pronoun,ANNOTATION_OFFSET_COMPARATOR); 534 //normalize index 535 if (closestQuoteIndex < 0) { 536 closestQuoteIndex = -closestQuoteIndex -1 -1; 537 } 538 539 //get closest Quote 540 Quote quoteContext = this.quotedText[closestQuoteIndex]; 541 542 //assure that the pronoun is contained in the quoted text fragment 543 //otherwise exit 544 545 if (pronoun.getStartNode().getOffset().intValue() > quoteContext.getEndOffset().intValue() || 546 pronoun.getEndNode().getOffset().intValue() < quoteContext.getStartOffset().intValue()) { 547 //oops, probably incorrect text - I/My/Me is not part of quoted text fragment 548 //exit 549 //System.out.println("Oops! ["+pronounString+"] not part of quoted fragment..."); 550 return null; 551 } 552 553 //get the Persons that precede/succeed the quoted fragment 554 //the order is: 555 // 556 //[1]. if there exists a Person or pronoun in {he, she} following the quoted fragment but 557 //in the same sentence, then use it 558 //i.e. ["PRN1(x)...", said X ...A, B, C ....] 559 // 560 //[2]. if there is a Person (NOT a pronoun) in the same sentence, 561 // preceding the quote, then use it 562 //i.e. . [A, B, C...X ..."PRN1(x) ..."...] 
563 // 564 565 //try [1] 566 //get the succeeding Persons/pronouns 567 AnnotationSet succCandidates = quoteContext.getAntecedentCandidates(Quote.ANTEC_AFTER); 568 if (false == succCandidates.isEmpty()) { 569 //cool, we have candidates, pick up the one closest to the end quote 570 Iterator it = succCandidates.iterator(); 571 572 while (it.hasNext()) { 573 Annotation currCandidate = (Annotation)it.next(); 574 if (null == bestAntecedent || ANNOTATION_OFFSET_COMPARATOR.compare(bestAntecedent,currCandidate) > 0) { 575 //wow, we have a candidate that is closer to the quote 576 bestAntecedent = currCandidate; 577 } 578 } 579 } 580 581 //try [2] 582 //get the preceding Persons/pronouns 583 if (null == bestAntecedent) { 584 AnnotationSet precCandidates = quoteContext.getAntecedentCandidates(Quote.ANTEC_BEFORE); 585 if (false == precCandidates.isEmpty()) { 586 //cool, we have candidates, pick up the one closest to the end quote 587 Iterator it = precCandidates.iterator(); 588 589 while (it.hasNext()) { 590 Annotation currCandidate = (Annotation)it.next(); 591 if (null == bestAntecedent || ANNOTATION_OFFSET_COMPARATOR.compare(bestAntecedent,currCandidate) < 0) { 592 //wow, we have a candidate that is closer to the quote 593 bestAntecedent = currCandidate; 594 } 595 } 596 } 597 } 598 599 //try [3] 600 //get the Persons/pronouns back in context 601 if (null == bestAntecedent) { 602 AnnotationSet precCandidates = quoteContext.getAntecedentCandidates(Quote.ANTEC_BACK); 603 if (false == precCandidates.isEmpty()) { 604 //cool, we have candidates, pick up the one closest to the end quote 605 Iterator it = precCandidates.iterator(); 606 607 while (it.hasNext()) { 608 Annotation currCandidate = (Annotation)it.next(); 609 if (null == bestAntecedent || ANNOTATION_OFFSET_COMPARATOR.compare(bestAntecedent,currCandidate) > 0) { 610 //wow, we have a candidate that is closer to the quote 611 bestAntecedent = currCandidate; 612 } 613 } 614 } 615 } 616 617 return bestAntecedent; 618 } 619 620 
621 /** --- */ 622 private void preprocess() throws ExecutionException { 623 624 //0.5 cleanup 625 this.personGender.clear(); 626 this.anaphor2antecedent.clear(); 627 628 //1.get all annotation in the input set 629 if ( this.annotationSetName == null || this.annotationSetName.equals("")) { 630 this.defaultAnnotations = this.document.getAnnotations(); 631 } 632 else { 633 this.defaultAnnotations = this.document.getAnnotations(annotationSetName); 634 } 635 636 //if none found, print warning and exit 637 if (this.defaultAnnotations == null || this.defaultAnnotations.isEmpty()) { 638 Err.prln("Coref Warning: No annotations found for processing!"); 639 return; 640 } 641 642 643 644 //2.1 remove QT annotations if left from previous execution 645 AnnotationSet qtSet = this.defaultAnnotations.get(QUOTED_TEXT_TYPE); 646 if (null != qtSet) { 647 qtSet.clear(); 648 } 649 650 //2.2. run quoted text transducer to generate "Quoted Text" annotations 651 this.qtTransducer.execute(); 652 653 //3.1 remove pleonastic annotations if left from previous execution 654 AnnotationSet pleonSet = this.defaultAnnotations.get(PLEONASTIC_TYPE); 655 if (null != pleonSet) { 656 pleonSet.clear(); 657 } 658 659 //3.2 run quoted text transducer to generate "Pleonasm" annotations 660 this.pleonTransducer.execute(); 661 662 //4.get all SENTENCE annotations 663 AnnotationSet sentenceAnnotations = this.defaultAnnotations.get(SENTENCE_ANNOTATION_TYPE); 664 665 this.textSentences = new Sentence[sentenceAnnotations.size()]; 666 Object[] sentenceArray = sentenceAnnotations.toArray(); 667 668 java.util.Arrays.sort(sentenceArray,ANNOTATION_OFFSET_COMPARATOR); 669 670 for (int i=0; i< sentenceArray.length; i++) { 671 672 Annotation currSentence = (Annotation)sentenceArray[i]; 673 Long sentStartOffset = currSentence.getStartNode().getOffset(); 674 Long sentEndOffset = currSentence.getEndNode().getOffset(); 675 676 //4.1. 
get PERSOSNS in this sentence 677 AnnotationSet sentPersons = this.defaultAnnotations.get(PERSON_ANNOTATION_TYPE, 678 sentStartOffset, 679 sentEndOffset); 680 681 //4.2. get ORGANIZATIONS in this sentence 682 AnnotationSet sentOrgs = this.defaultAnnotations.get(ORGANIZATION_ANNOTATION_TYPE, 683 sentStartOffset, 684 sentEndOffset); 685 686 //4.3. get LOCATION in this sentence 687 AnnotationSet sentLocs = this.defaultAnnotations.get(LOCATION_ANNOTATION_TYPE, 688 sentStartOffset, 689 sentEndOffset); 690 691 //4.5. create a Sentence for thei SENTENCE annotation 692 this.textSentences[i] = new Sentence(i, 693 0, 694 sentStartOffset, 695 sentEndOffset, 696 sentPersons, 697 sentOrgs, 698 sentLocs 699 ); 700 701 //4.6. for all PERSONs in the sentence - find their gender using the 702 //orthographic coreferences if the gender of some entity is unknown 703 Iterator itPersons = sentPersons.iterator(); 704 while (itPersons.hasNext()) { 705 Annotation currPerson = (Annotation)itPersons.next(); 706 String gender = this.findPersonGender(currPerson); 707 this.personGender.put(currPerson,gender); 708 } 709 } 710 711 //5. initialise the quoted text fragments 712 AnnotationSet sentQuotes = this.defaultAnnotations.get(QUOTED_TEXT_TYPE); 713 714 //if none then return 715 if (null == sentQuotes) { 716 this.quotedText = new Quote[0]; 717 } 718 else { 719 this.quotedText = new Quote[sentQuotes.size()]; 720 721 Object[] quotesArray = sentQuotes.toArray(); 722 java.util.Arrays.sort(quotesArray,ANNOTATION_OFFSET_COMPARATOR); 723 724 for (int i =0; i < quotesArray.length; i++) { 725 this.quotedText[i] = new Quote((Annotation)quotesArray[i],i); 726 } 727 } 728 729 //6. 
initialuse the plonastic It annotations 730 AnnotationSet plaonasticSet = this.defaultAnnotations.get(PLEONASTIC_TYPE); 731 732 if (null == plaonasticSet) { 733 this.pleonasticIt = new Annotation[0]; 734 } 735 else { 736 this.pleonasticIt = new Annotation[plaonasticSet.size()]; 737 738 Object[] quotesArray = plaonasticSet.toArray(); 739 java.util.Arrays.sort(quotesArray,ANNOTATION_OFFSET_COMPARATOR); 740 741 for (int i=0; i< this.pleonasticIt.length; i++) { 742 this.pleonasticIt[i] = (Annotation)quotesArray[i]; 743 } 744 } 745 746 } 747 748 749 /** --- */ 750 private String findPersonGender(Annotation person) { 751 752 String result = (String)person.getFeatures().get(PERSON_GENDER_FEATURE_NAME); 753 754 if (null==result) { 755 //gender is unknown - try to find it from the ortho coreferences 756 List orthoMatches = (List)person.getFeatures().get(ANNOTATION_COREF_FEATURE_NAME); 757 758 if (null != orthoMatches) { 759 Iterator itMatches = orthoMatches.iterator(); 760 761 while (itMatches.hasNext()) { 762 Integer correferringID = (Integer)itMatches.next(); 763 Annotation coreferringEntity = this.defaultAnnotations.get(correferringID); 764 Assert.assertTrue(coreferringEntity.getType().equalsIgnoreCase(PERSON_ANNOTATION_TYPE)); 765 String correferringGender = (String)coreferringEntity.getFeatures().get(PERSON_GENDER_FEATURE_NAME); 766 767 if (null != correferringGender) { 768 result = correferringGender; 769 break; 770 } 771 } 772 } 773 } 774 775 return result; 776 } 777 778 779 /** --- */ 780 private static class AnnotationOffsetComparator implements Comparator { 781 782 private int _getOffset(Object o) { 783 784 if (o instanceof Annotation) { 785 return ((Annotation)o).getEndNode().getOffset().intValue(); 786 } 787 else if (o instanceof Sentence) { 788 return ((Sentence)o).getStartOffset().intValue(); 789 } 790 else if (o instanceof Quote) { 791 return ((Quote)o).getStartOffset().intValue(); 792 } 793 else if (o instanceof Node) { 794 return 
((Node)o).getOffset().intValue(); 795 } 796 else { 797 throw new IllegalArgumentException(); 798 } 799 } 800 801 public int compare(Object o1,Object o2) { 802 803 //0. preconditions 804 Assert.assertNotNull(o1); 805 Assert.assertNotNull(o2); 806 Assert.assertTrue(o1 instanceof Annotation || 807 o1 instanceof Sentence || 808 o1 instanceof Quote || 809 o1 instanceof Node); 810 Assert.assertTrue(o2 instanceof Annotation || 811 o2 instanceof Sentence || 812 o2 instanceof Quote || 813 o2 instanceof Node); 814 815 int offset1 = _getOffset(o1); 816 int offset2 = _getOffset(o2); 817 818 return offset1 - offset2; 819 } 820 } 821 822 823 /** --- */ 824 private Annotation _chooseAntecedent$HE$HIM$HIS$SHE$HER$HIMSELF$(Annotation ant1, Annotation ant2, Annotation pronoun) { 825 826 //0. preconditions 827 Assert.assertNotNull(ant1); 828 Assert.assertNotNull(ant2); 829 Assert.assertNotNull(pronoun); 830 Assert.assertTrue(pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) || 831 pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY)); 832 String pronounString = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME); 833 Assert.assertTrue(pronounString.equalsIgnoreCase("SHE") || 834 pronounString.equalsIgnoreCase("HER") || 835 pronounString.equalsIgnoreCase("HE") || 836 pronounString.equalsIgnoreCase("HIM") || 837 pronounString.equalsIgnoreCase("HIS") || 838 pronounString.equalsIgnoreCase("HIMSELF")); 839 840 Long offset1 = ant1.getStartNode().getOffset(); 841 Long offset2 = ant2.getStartNode().getOffset(); 842 Long offsetPrn = pronoun.getStartNode().getOffset(); 843 844 long diff1 = offsetPrn.longValue() - offset1.longValue(); 845 long diff2 = offsetPrn.longValue() - offset2.longValue(); 846 // Assert.assertTrue(diff1 != 0 && diff2 != 0); 847 //reject candidates that overlap with the pronoun 848 if (diff1 == 0) { 849 return ant2; 850 } 851 else if (diff2 == 0) { 852 return ant1; 853 } 854 855 //get the one CLOSEST AND PRECEDING 
the pronoun 856 if (diff1 > 0 && diff2 > 0) { 857 //we have [...antecedentA...AntecedentB....pronoun...] ==> choose B 858 if (diff1 < diff2) 859 return ant1; 860 else 861 return ant2; 862 } 863 else if (diff1 < 0 && diff2 < 0) { 864 //we have [...pronoun ...antecedentA...AntecedentB.......] ==> choose A 865 if (Math.abs(diff1) < Math.abs(diff2)) 866 return ant1; 867 else 868 return ant2; 869 } 870 else { 871 Assert.assertTrue(Math.abs(diff1 + diff2) < Math.abs(diff1) + Math.abs(diff2)); 872 //we have [antecedentA...pronoun...AntecedentB] ==> choose A 873 if (diff1 > 0) 874 return ant1; 875 else 876 return ant2; 877 } 878 } 879 880 /** --- */ 881 private Annotation _chooseAntecedent$IT$ITS$ITSELF$(Annotation ant1, Annotation ant2, Annotation pronoun) { 882 883 //0. preconditions 884 Assert.assertNotNull(ant1); 885 Assert.assertNotNull(ant2); 886 Assert.assertNotNull(pronoun); 887 Assert.assertTrue(pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) || 888 pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY)); 889 String pronounString = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME); 890 891 Assert.assertTrue(pronounString.equalsIgnoreCase("IT") || 892 pronounString.equalsIgnoreCase("ITS") || 893 pronounString.equalsIgnoreCase("ITSELF")); 894 895 Long offset1 = ant1.getStartNode().getOffset(); 896 Long offset2 = ant2.getStartNode().getOffset(); 897 Long offsetPrn = pronoun.getStartNode().getOffset(); 898 long diff1 = offsetPrn.longValue() - offset1.longValue(); 899 long diff2 = offsetPrn.longValue() - offset2.longValue(); 900 // Assert.assertTrue(diff1 != 0 && diff2 != 0); 901 //reject candidates that overlap with the pronoun 902 if (diff1 == 0) { 903 return ant2; 904 } 905 else if (diff2 == 0) { 906 return ant1; 907 } 908 909 910 //get the one CLOSEST AND PRECEDING the pronoun 911 if (diff1 > 0 && diff2 > 0) { 912 //we have [...antecedentA...AntecedentB....pronoun...] 
// ==> choose B
      if (diff1 < diff2)
        return ant1;
      else
        return ant2;
    }
    else if (diff1 > 0){
      Assert.assertTrue(Math.abs(diff1 + diff2) < Math.abs(diff1) + Math.abs(diff2));
      //we have [antecedentA...pronoun...AntecedentB] ==> choose A
      return ant1;
    }
    else if (diff2 > 0){
      Assert.assertTrue(Math.abs(diff1 + diff2) < Math.abs(diff1) + Math.abs(diff2));
      //we have [antecedentA...pronoun...AntecedentB] ==> choose A
      //NOTE(review): the comment above is identical to the one in the previous branch,
      //but this branch returns ant2 - confirm which antecedent the comment should name
      return ant2;
    }
    else {
      //both possible antecedents are BEHIND the anaphoric pronoun - i.e. we have either
      //cataphora, or nominal antecedent, or an antecedent that is further back in scope
      //in any case - discard the antecedents
      return null;
    }
  }


  /**
   * A quoted fragment of the text together with the antecedent candidates collected
   * around it. The candidates are computed once, in the constructor, and are split
   * into three sets (see the ANTEC_* constants).
   */
  private class Quote {

    /** Candidates following the quote, taken from the sentence in which the quote ends. */
    public static final int ANTEC_AFTER = 1;
    /** Candidates preceding the quote, taken from the sentence in which the quote starts. */
    public static final int ANTEC_BEFORE = 2;
    /** Candidates taken from the sentence preceding the one in which the quote starts. */
    public static final int ANTEC_BACK = 3;
    /** Candidates for mode ANTEC_BEFORE. */
    private AnnotationSet antecedentsBefore;
    /** Candidates for mode ANTEC_AFTER. */
    private AnnotationSet antecedentsAfter;
    /**
     * Candidates for mode ANTEC_BACK; stays null when the quote starts in the
     * first sentence (index 0), because init() does not populate it in that case.
     */
    private AnnotationSet antecedentsBackInContext;
    /** The annotation that marks the quoted text. */
    private Annotation quoteAnnotation;
    /** Index of this quote; used to look up the previous quote in quotedText. */
    private int quoteIndex;

    /**
     * Creates the quote and immediately computes its three candidate sets.
     *
     * @param quoteAnnotation the annotation marking the quoted text
     * @param index the index of this quote (must be >= 0)
     */
    public Quote(Annotation quoteAnnotation, int index) {

      this.quoteAnnotation = quoteAnnotation;
      this.quoteIndex = index;
      init();
    }

    /**
     * Locates the sentences in which the quote starts and ends and fills the
     * three candidate sets from them.
     */
    private void init() {

      //0. preconditions
      Assert.assertNotNull(textSentences);

      //1. generate the set of candidates preceding the quote

      //1.1 locate the sentence containing the opening quote marks
      int quoteStartPos = java.util.Arrays.binarySearch(textSentences,
                                                  this.quoteAnnotation.getStartNode(),
                                                  ANNOTATION_OFFSET_COMPARATOR);

      //normalize index: on a miss binarySearch returns (-(insertion point) - 1), so
      //(-pos - 1) is the insertion point and one less than that is the last sentence
      //that sorts before the quote start
      //NOTE(review): if the quote start sorts before every sentence this yields -1,
      //which fails the sentenceNumber >= 0 precondition of generateAntecedentCandidates
      int startSentenceIndex = quoteStartPos >= 0 ? quoteStartPos
                                                  : -quoteStartPos -1 -1;

      //1.2. get the persons and restrict to these that precede the quote (i.e. not contained
      //in the quote)
      this.antecedentsBefore = generateAntecedentCandidates(startSentenceIndex,
                                                            this.quoteIndex,
                                                            ANTEC_BEFORE);


      //2. generate the set of candidates further back in context
      //2.1. get the persons from the sentence preceding the sentence containing the quote start
      //(there is no such sentence when the quote starts in the first one)
      if (startSentenceIndex > 0) {
        this.antecedentsBackInContext = generateAntecedentCandidates(startSentenceIndex -1,
                                                                    this.quoteIndex,
                                                                    ANTEC_BACK);
      }

      //3. generate the set of candidates succeeding the quote
      //3.1 locate the sentence containing the closing quote marks
      int quoteEndPos = java.util.Arrays.binarySearch(textSentences,
                                                  this.quoteAnnotation.getEndNode(),
                                                  ANNOTATION_OFFSET_COMPARATOR);

      //normalize it the same way as the start index above
      int endSentenceIndex = quoteEndPos >= 0 ? quoteEndPos
                                              : -quoteEndPos -1 -1;

      this.antecedentsAfter = generateAntecedentCandidates(endSentenceIndex,
                                                            this.quoteIndex,
                                                            ANTEC_AFTER);
    }


    /**
     * Collects the antecedent candidates for this quote from one sentence: the
     * persons of that sentence plus the "he"/"she" pronoun tokens found in the
     * part of the text selected by the mode, filtered by position.
     *
     * @param sentenceNumber index into textSentences of the sentence to inspect (>= 0)
     * @param quoteNumber index of this quote (>= 0); for ANTEC_BACK the quote at
     *        quoteNumber - 1 is used as the lower position bound
     * @param mode one of ANTEC_BEFORE, ANTEC_AFTER, ANTEC_BACK
     * @return a new annotation set holding the remaining candidates
     */
    private AnnotationSet generateAntecedentCandidates(int sentenceNumber,
                                                        int quoteNumber ,
                                                        int mode) {

      //0. preconditions
      Assert.assertTrue(sentenceNumber >=0);
      Assert.assertTrue(quoteNumber >=0);
      Assert.assertTrue(mode == Quote.ANTEC_AFTER ||
                        mode == Quote.ANTEC_BEFORE ||
                        mode == Quote.ANTEC_BACK);

      //1. get sentence
      Sentence sentence = textSentences[sentenceNumber];

      //2. get the persons (copied, so that the removals below do not touch the sentence's set)
      AnnotationSet antecedents = new AnnotationSetImpl(sentence.getPersons());

      //3. get the annotations in the relevant context
      AnnotationSet annotations = null;

      switch(mode) {

        case ANTEC_BEFORE:
          //from the start of the sentence up to the start of the quote
          annotations = defaultAnnotations.getContained(sentence.getStartOffset(),
                                                      this.getStartOffset());
          break;

        case ANTEC_AFTER:
          //from the end of the quote up to the end of the sentence
          annotations = defaultAnnotations.getContained(this.getEndOffset(),
                                                      sentence.getEndOffset());
          break;

        case ANTEC_BACK:
          //the whole sentence
          annotations = defaultAnnotations.getContained(sentence.getStartOffset(),
                                                      sentence.getEndOffset());
          break;
      }

      //4. get the pronouns
      //restrict to he/she pronouns
      if (null != annotations) {
        AnnotationSet pronouns = annotations.get(TOKEN_ANNOTATION_TYPE,PRP_RESTRICTION);

        if (null != pronouns) {

          Iterator it = pronouns.iterator();
          while (it.hasNext()) {
            Annotation currPronoun = (Annotation)it.next();
            //add to the candidates only if HE/SHE
            String pronounString = (String)currPronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);

            if (null != pronounString &&
                (pronounString.equalsIgnoreCase("he") || pronounString.equalsIgnoreCase("she"))
                ) {
              antecedents.add(currPronoun);
            }
          }//while
        }//if
      }//if


      //5. depending on the mode, may have to restrict persons to these that precede/succeed
      //the quoted fragment
      //
      //for ANTEC_BEFORE, get the ones #preceding# the quote, contained in the sentence where
      //the quote *starts*
      //
      //for ANTEC_AFTER, get the ones #succeeding# the quote, contained in the sentence where
      //the quote *ends*
      //
      //for ANTEC_BACK, we are operating in the context of the sentence previous to the
      //sentence where the quote starts. I.e. we're resolving a case like
      // [sss "q1q1q1q1" s1s1s1s1]["q2q2q2q2"]
      //...and we want to get the entities from the s1s1 part - they *succeed* the #previous# quote
      //Note that the current sentence is the first one, not the second
      //
      //offsets are compared as long values in every branch, so that large offsets
      //are not truncated
      Iterator itPersons = antecedents.iterator();

      while (itPersons.hasNext()) {
        Annotation currPerson = (Annotation)itPersons.next();

        //cut
        if (Quote.ANTEC_BEFORE == mode &&
            currPerson.getStartNode().getOffset().longValue() > getStartOffset().longValue()) {
          //restrict only to persons preceding the quote
          itPersons.remove();
        }
        else if (Quote.ANTEC_AFTER == mode &&
                currPerson.getStartNode().getOffset().longValue() < getEndOffset().longValue()) {
          //restrict only to persons succeeding the quote
          itPersons.remove();
        }
        else if (Quote.ANTEC_BACK == mode) {
          //this one is tricky
          //locate the quote previous to the one we're resolving
          //(since we're operating in the sentence previous to the quote being resolved
          //we try to find if any quote (prevQuote) exists in this sentence and get the
          //persons succeeding it)

          //get prev quote
          //is the curr quote the first one? if so there is nothing to restrict against
          if (quoteNumber >0) {
            Quote prevQuote = PronominalCoref.this.quotedText[quoteNumber-1];

            //restrict to the succeeding persons
            if (currPerson.getStartNode().getOffset().longValue() < prevQuote.getEndOffset().longValue()) {
              itPersons.remove();
            }
          }
        }
      }

      return antecedents;
    }

    /**
     * @return the offset of the start node of the quote annotation
     */
    public Long getStartOffset() {
      return this.quoteAnnotation.getStartNode().getOffset();
    }

    /**
     * @return the offset of the end node of the quote annotation
     */
    public Long getEndOffset() {
      return this.quoteAnnotation.getEndNode().getOffset();
    }

    /**
     * Returns one of the candidate sets computed at construction time.
     *
     * @param type one of ANTEC_AFTER, ANTEC_BEFORE, ANTEC_BACK
     * @return the matching candidate set; for ANTEC_BACK this is null when the
     *         quote starts in the first sentence
     * @throws IllegalArgumentException if type is none of the three constants
     */
    public AnnotationSet getAntecedentCandidates(int type) {

      switch(type) {

        case ANTEC_AFTER:
          return this.antecedentsAfter;

        case ANTEC_BEFORE:
          return this.antecedentsBefore;

        case ANTEC_BACK:
          return this.antecedentsBackInContext;

        default:
          throw new IllegalArgumentException();
      }
    }

  }


  /**
   * Holder for the data of one sentence: its number, the number of its paragraph,
   * its start and end offsets and the person, organization and location annotation
   * sets handed to the constructor. The sentence and paragraph numbers are stored
   * but this class exposes no accessor for them.
   */
  private class Sentence {

    /** Sentence number, as passed to the constructor. */
    private int sentNumber;
    /** Paragraph number, as passed to the constructor. */
    private int paraNumber;
    /** Start offset of the sentence. */
    private Long startOffset;
    /** End offset of the sentence. */
    private Long endOffset;
    /** Person annotations associated with the sentence. */
    private AnnotationSet persons;
    /** Organization annotations associated with the sentence. */
    private AnnotationSet organizations;
    /** Location annotations associated with the sentence. */
    private AnnotationSet locations;

    /**
     * Stores all arguments as they are; nothing is copied or validated.
     */
    public Sentence(int sentNumber,
                    int paraNumber,
                    Long startOffset,
                    Long endOffset,
                    AnnotationSet persons,
                    AnnotationSet organizations,
                    AnnotationSet locations) {

      this.sentNumber = sentNumber;
      this.paraNumber = paraNumber;
      this.startOffset = startOffset;
      this.endOffset = endOffset;
      this.persons = persons;
      this.organizations = organizations;
      this.locations = locations;
    }

    /** @return the start offset of the sentence */
    public Long getStartOffset() {
      return this.startOffset;
    }

    /** @return the end offset of the sentence */
    public Long getEndOffset() {
      return this.endOffset;
    }

    /** @return the person annotations given at construction time (not a copy) */
    public AnnotationSet getPersons() {
      return this.persons;
    }

    /** @return the organization annotations given at construction time (not a copy) */
    public AnnotationSet getOrganizations() {
      return this.organizations;
    }

    /** @return the location annotations given at construction time (not a copy) */
    public AnnotationSet getLocations() {
      return this.locations;
    }
  }

}
|
PronominalCoref |
|