1   /*
2    *  PronominalCoref.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Marin Dimitrov, 30/Dec/2001
12   *
13   *  $Id: PronominalCoref.java,v 1.25 2002/04/10 11:07:59 marin Exp $
14   */
15  
16  package gate.creole.coref;
17  
18  import java.util.*;
19  import java.net.*;
20  
21  import junit.framework.*;
22  
23  import gate.*;
24  import gate.creole.*;
25  import gate.util.*;
26  import gate.annotation.*;
27  
28  public class PronominalCoref extends AbstractLanguageAnalyser
29                                implements ProcessingResource, ANNIEConstants{
30  
/** Parameter name constant; its value is "document". */
public static final String COREF_DOCUMENT_PARAMETER_NAME = "document";

/** Parameter name constant; its value is "annotationSetName". */
public static final String COREF_ANN_SET_PARAMETER_NAME = "annotationSetName";

/** When true, pronouns that no resolver handles are reported on the error stream. */
private static final boolean DEBUG = false;

/** Location of the JAPE grammar loaded into the quoted text transducer. */
private static final String QT_GRAMMAR_URL = "gate://gate/creole/coref/quoted_text.jape";

/** Location of the JAPE grammar loaded into the pleonastic "it" transducer. */
private static final String PLEON_GRAMMAR_URL = "gate://gate/creole/coref/pleonasm.jape";

/** Type of the annotations read back as quoted text fragments. */
private static final String QUOTED_TEXT_TYPE = "Quoted Text";

/** Type of the annotations read back as pleonastic "it" fragments. */
private static final String PLEONASTIC_TYPE = "PleonasticIt";

/** Token category value used to select personal pronouns. */
private static final String PRP_CATEGORY = "PRP";

/** Token category value used to select possessive pronouns. */
private static final String PRP$_CATEGORY = "PRP$";

/** Number of preceding sentences the he/she resolvers always inspect. */
private static final int SENTENCES_IN_SCOPE = 3;

/** Shared comparator used for every offset based sort and binary search in this class. */
private static AnnotationOffsetComparator ANNOTATION_OFFSET_COMPARATOR =
                                              new AnnotationOffsetComparator();

/** Feature restriction "token category == PRP"; filled in by the static initialiser. */
private static final FeatureMap PRP_RESTRICTION = new SimpleFeatureMapImpl();

/** Name of the annotation set to work on; null or "" selects the document's default set. */
private String annotationSetName;

/** Transducer loaded with the quoted text grammar. */
private Transducer qtTransducer;

/** Transducer loaded with the pleonastic "it" grammar. */
private Transducer pleonTransducer;

/** The annotation set selected by the last run of preprocess(). */
private AnnotationSet defaultAnnotations;

/** Sentences of the current document, sorted with the offset comparator. */
private Sentence[] textSentences;

/** Quoted text fragments of the current document, sorted with the offset comparator. */
private Quote[] quotedText;

/** Pleonastic "it" annotations of the current document, sorted with the offset comparator. */
private Annotation[] pleonasticIt;

/** Maps each Person annotation to its gender string (the value may be null). */
private HashMap personGender;

/** Maps each processed pronoun annotation to its antecedent (null if none was found). */
private HashMap anaphor2antecedent;

/** Populates the shared PRP feature restriction. */
static {
  PRP_RESTRICTION.put(TOKEN_CATEGORY_FEATURE_NAME,PRP_CATEGORY);
}
81  
/**
 * Creates the coreferencer together with its two (still uninitialised)
 * transducers and the empty lookup tables. The transducers are configured
 * later, in init().
 */
public PronominalCoref() {
  // aggregated components
  qtTransducer = new gate.creole.Transducer();
  pleonTransducer = new gate.creole.Transducer();

  // per-document lookup tables
  personGender = new HashMap();
  anaphor2antecedent = new HashMap();
}
90  
91    /** Initialise this resource, and return it. */
92    public Resource init() throws ResourceInstantiationException {
93  
94      //0. preconditions
95      Assert.assertNotNull(this.qtTransducer);
96  
97      //1. initialise quoted text transducer
98      URL qtGrammarURL = null;
99      try {
100       qtGrammarURL = new URL(QT_GRAMMAR_URL);
101     }
102     catch(MalformedURLException mue) {
103       throw new ResourceInstantiationException(mue);
104     }
105     this.qtTransducer.setGrammarURL(qtGrammarURL);
106     this.qtTransducer.setEncoding("UTF-8");
107     this.qtTransducer.init();
108 
109     //2. initialise pleonastic transducer
110     URL pleonGrammarURL = null;
111     try {
112       pleonGrammarURL = new URL(PLEON_GRAMMAR_URL);
113     }
114     catch(MalformedURLException mue) {
115       throw new ResourceInstantiationException(mue);
116     }
117     this.pleonTransducer.setGrammarURL(pleonGrammarURL);
118     this.pleonTransducer.setEncoding("UTF-8");
119     this.pleonTransducer.init();
120 
121 
122     //3. delegate
123     return super.init();
124   } // init()
125 
126   /**
127    * Reinitialises the processing resource. After calling this method the
128    * resource should be in the state it is after calling init.
129    * If the resource depends on external resources (such as rules files) then
130    * the resource will re-read those resources. If the data used to create
131    * the resource has changed since the resource has been created then the
132    * resource will change too after calling reInit().
133   */
134   public void reInit() throws ResourceInstantiationException {
135 
136     if (null != this.qtTransducer) {
137       this.qtTransducer.reInit();
138     }
139 
140     if (null != this.pleonTransducer) {
141       this.pleonTransducer.reInit();
142     }
143 
144     init();
145   } // reInit()
146 
147 
148   /** Set the document to run on. */
149   public void setDocument(Document newDocument) {
150 
151     //0. precondition
152 //    Assert.assertNotNull(newDocument);
153 
154     //1. set doc for aggregated components
155     this.qtTransducer.setDocument(newDocument);
156     this.pleonTransducer.setDocument(newDocument);
157 
158     //3. delegate
159     super.setDocument(newDocument);
160   }
161 
162   /** --- */
163   public void setAnnotationSetName(String annotationSetName) {
164     this.annotationSetName = annotationSetName;
165   }
166 
167 
168   /** --- */
169   public String getAnnotationSetName() {
170     return annotationSetName;
171   }
172 
173   /**
174    * This method runs the coreferencer. It assumes that all the needed parameters
175    * are set. If they are not, an exception will be fired.
176    */
177   public void execute() throws ExecutionException{
178 
179     //0. preconditions
180     if(null == this.document) {
181       throw new ExecutionException("[coreference] Document is not set!");
182     }
183 
184     //1. preprocess
185     preprocess();
186 /*
187     //2. remove corefs from previous run
188     String annSetName = this.annotationSetName == null ? "COREF"
189                                                        : this.annotationSetName;
190 
191     AnnotationSet corefSet = this.document.getAnnotations(annSetName);
192     if (false == corefSet.isEmpty()) {
193       corefSet.clear();
194     }
195 */
196     //3.get personal pronouns
197     FeatureMap constraintPRP = new SimpleFeatureMapImpl();
198     constraintPRP.put(TOKEN_CATEGORY_FEATURE_NAME,PRP_CATEGORY);
199     AnnotationSet personalPronouns = this.defaultAnnotations.get(TOKEN_ANNOTATION_TYPE,constraintPRP);
200 
201     //4.get possesive pronouns
202     FeatureMap constraintPRP$ = new SimpleFeatureMapImpl();
203     constraintPRP$.put(TOKEN_CATEGORY_FEATURE_NAME,PRP$_CATEGORY);
204     AnnotationSet possesivePronouns = this.defaultAnnotations.get(TOKEN_ANNOTATION_TYPE,constraintPRP$);
205 
206     //5.combine them
207     AnnotationSet pronouns = personalPronouns;
208     if (null == personalPronouns) {
209       pronouns = possesivePronouns;
210     }
211     else if (null != possesivePronouns) {
212       pronouns.addAll(possesivePronouns);
213     }
214 
215     //6.do we have pronouns at all?
216     if (null == pronouns) {
217       //do nothing
218       return;
219     }
220 
221     //7.sort them according to offset
222     Object[] arrPronouns = pronouns.toArray();
223     java.util.Arrays.sort(arrPronouns,ANNOTATION_OFFSET_COMPARATOR);
224 
225     //8.cleanup - ease the GC
226     pronouns = personalPronouns = possesivePronouns = null;
227 
228     int prnSentIndex = 0;
229 
230 
231     //10. process all pronouns
232     for (int i=0; i< arrPronouns.length; i++) {
233       Annotation currPronoun = (Annotation)arrPronouns[i];
234       while (this.textSentences[prnSentIndex].getEndOffset().longValue() <
235                                       currPronoun.getEndNode().getOffset().longValue()) {
236         prnSentIndex++;
237       }
238 
239       Sentence currSentence = this.textSentences[prnSentIndex];
240       Assert.assertTrue(currSentence.getStartOffset().longValue() <= currPronoun.getStartNode().getOffset().longValue());
241       Assert.assertTrue(currSentence.getEndOffset().longValue() >= currPronoun.getEndNode().getOffset().longValue());
242 
243       //11. find antecedent (if any) for pronoun
244       Annotation antc = findAntecedent(currPronoun,prnSentIndex);
245 
246       //12. add to the ana2ant hashtable
247       this.anaphor2antecedent.put(currPronoun,antc);
248     }
249 
250     //done
251   }
252 
253 
254   /** --- */
255   public HashMap getResolvedAnaphora() {
256     return this.anaphor2antecedent;
257   }
258 
259   /** --- */
260   private Annotation findAntecedent(Annotation currPronoun,int prnSentIndex) {
261 
262     //0. preconditions
263     Assert.assertNotNull(currPronoun);
264     Assert.assertTrue(prnSentIndex >= 0);
265     Assert.assertTrue(currPronoun.getType().equals(TOKEN_ANNOTATION_TYPE));
266     Assert.assertTrue(currPronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) ||
267                       currPronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY));
268 
269     //1.
270     String strPronoun = (String)currPronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
271 
272     Assert.assertNotNull(strPronoun);
273 
274     //2. delegate processing to the appropriate methods
275     if (strPronoun.equalsIgnoreCase("HE") ||
276         strPronoun.equalsIgnoreCase("HIM") ||
277         strPronoun.equalsIgnoreCase("HIS") ||
278         strPronoun.equalsIgnoreCase("HIMSELF")) {
279       return _resolve$HE$HIM$HIS$HIMSELF$(currPronoun,prnSentIndex);
280     }
281     else if (strPronoun.equalsIgnoreCase("SHE") ||
282               strPronoun.equalsIgnoreCase("HER")) {
283       return _resolve$SHE$HER$(currPronoun,prnSentIndex);
284     }
285     else if (strPronoun.equalsIgnoreCase("IT") ||
286               strPronoun.equalsIgnoreCase("ITS") ||
287               strPronoun.equalsIgnoreCase("ITSELF")) {
288       return _resolve$IT$ITS$ITSELF$(currPronoun,prnSentIndex);
289     }
290     else if (strPronoun.equalsIgnoreCase("I") ||
291               strPronoun.equalsIgnoreCase("ME") ||
292               strPronoun.equalsIgnoreCase("MY") ||
293               strPronoun.equalsIgnoreCase("MYSELF")) {
294       return _resolve$I$ME$MY$MYSELF$(currPronoun,prnSentIndex);
295     }
296     else {
297       if (DEBUG) {
298         gate.util.Err.println("["+strPronoun+"] is not handled yet...");
299       }
300       return null;
301     }
302   }
303 
304 
305   boolean isPleonastic(Annotation pronoun) {
306 
307     //0. preconditions
308     Assert.assertNotNull(pronoun);
309     String str = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
310     Assert.assertTrue(str.equalsIgnoreCase("IT"));
311 
312     //1. do we have pleonasms in this text?
313     if (this.pleonasticIt.length == 0) {
314       return false;
315     }
316 
317     //2. find closest pleonasm index
318     int closestPleonasmIndex = java.util.Arrays.binarySearch(this.pleonasticIt,
319                                                              pronoun,
320                                                              ANNOTATION_OFFSET_COMPARATOR);
321     //normalize index
322     if (closestPleonasmIndex < 0) {
323       closestPleonasmIndex = -closestPleonasmIndex -1 -1;
324     }
325 
326     //still not good?
327     if (closestPleonasmIndex < 0) {
328       closestPleonasmIndex = 0;
329     }
330 
331     //get closest pleonasm
332     Annotation pleonasm = this.pleonasticIt[closestPleonasmIndex];
333 
334 //System.out.println(pleonasm);
335 //System.out.println(pronoun);
336 
337     //3. return true only if the proboun is contained in pleonastic fragment
338     boolean result =  (pleonasm.getStartNode().getOffset().intValue() <= pronoun.getStartNode().getOffset().intValue()
339             &&
340             pleonasm.getEndNode().getOffset().intValue() >= pronoun.getEndNode().getOffset().intValue());
341 //System.out.println("is pleon=["+result+"]");
342     return result;
343   }
344 
345 
346   /** --- */
347   private Annotation _resolve$HE$HIM$HIS$HIMSELF$(Annotation pronoun, int sentenceIndex) {
348 
349     //0. preconditions
350     Assert.assertTrue(pronoun.getType().equals(TOKEN_ANNOTATION_TYPE));
351     Assert.assertTrue(pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) ||
352                       pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY));
353     String pronounString = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
354     Assert.assertTrue(pronounString.equalsIgnoreCase("HE") ||
355                       pronounString.equalsIgnoreCase("HIM") ||
356                       pronounString.equalsIgnoreCase("HIS") ||
357                       pronounString.equalsIgnoreCase("HIMSELF"));
358 
359     //1.
360     boolean antecedentFound = false;
361     int scopeFirstIndex = sentenceIndex - SENTENCES_IN_SCOPE;
362     if (scopeFirstIndex < 0 ) scopeFirstIndex = 0;
363 
364     int currSentenceIndex = sentenceIndex;
365     Annotation bestAntecedent = null;
366 
367     while (currSentenceIndex >= scopeFirstIndex || antecedentFound == false) {
368       Sentence currSentence = this.textSentences[currSentenceIndex];
369       AnnotationSet persons = currSentence.getPersons();
370 
371       Iterator it = persons.iterator();
372       while (it.hasNext()) {
373         Annotation currPerson = (Annotation)it.next();
374         String gender = (String)this.personGender.get(currPerson);
375 
376         if (null == gender ||
377             gender.equalsIgnoreCase("MALE") ||
378             gender.equalsIgnoreCase("UNKNOWN")) {
379           //hit
380           antecedentFound = true;
381 
382           if (null == bestAntecedent) {
383             bestAntecedent = currPerson;
384           }
385           else {
386             bestAntecedent = _chooseAntecedent$HE$HIM$HIS$SHE$HER$HIMSELF$(bestAntecedent,currPerson,pronoun);
387           }
388         }
389       }
390 
391       if (0 == currSentenceIndex--)
392         break;
393 
394     }
395 
396     return bestAntecedent;
397   }
398 
399 
400   /** --- */
401   private Annotation _resolve$SHE$HER$(Annotation pronoun, int sentenceIndex) {
402 
403     //0. preconditions
404     Assert.assertTrue(pronoun.getType().equals(TOKEN_ANNOTATION_TYPE));
405     Assert.assertTrue(pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) ||
406                       pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY));
407     String pronounString = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
408     Assert.assertTrue(pronounString.equalsIgnoreCase("SHE") ||
409                       pronounString.equalsIgnoreCase("HER"));
410 
411     //1.
412     boolean antecedentFound = false;
413     int scopeFirstIndex = sentenceIndex - SENTENCES_IN_SCOPE;
414     if (scopeFirstIndex < 0 ) scopeFirstIndex = 0;
415     int currSentenceIndex = sentenceIndex;
416     Annotation bestAntecedent = null;
417 
418     while (currSentenceIndex >= scopeFirstIndex || antecedentFound == false) {
419       Sentence currSentence = this.textSentences[currSentenceIndex];
420       AnnotationSet persons = currSentence.getPersons();
421 
422       Iterator it = persons.iterator();
423       while (it.hasNext()) {
424         Annotation currPerson = (Annotation)it.next();
425         String gender = (String)this.personGender.get(currPerson);
426 
427         if (null == gender ||
428             gender.equalsIgnoreCase("FEMALE") ||
429             gender.equalsIgnoreCase("UNKNOWN")) {
430           //hit
431           antecedentFound = true;
432 
433           if (null == bestAntecedent) {
434             bestAntecedent = currPerson;
435           }
436           else {
437             bestAntecedent = _chooseAntecedent$HE$HIM$HIS$SHE$HER$HIMSELF$(bestAntecedent,currPerson,pronoun);
438           }
439         }
440       }
441 
442       if (0 == currSentenceIndex--)
443         break;
444     }
445 
446     return bestAntecedent;
447   }
448 
449 
450   /** --- */
451   private Annotation _resolve$IT$ITS$ITSELF$(Annotation pronoun, int sentenceIndex) {
452 
453     //0. preconditions
454     Assert.assertTrue(pronoun.getType().equals(TOKEN_ANNOTATION_TYPE));
455     Assert.assertTrue(pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) ||
456                       pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY));
457     String pronounString = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
458     Assert.assertTrue(pronounString.equalsIgnoreCase("IT") ||
459                       pronounString.equalsIgnoreCase("ITS") ||
460                       pronounString.equalsIgnoreCase("ITSELF"));
461 
462     //0.5 check if the IT is pleonastic
463     if (pronounString.equalsIgnoreCase("IT") &&
464         isPleonastic(pronoun)) {
465 //System.out.println("PLEONASM...");
466       return null;
467     }
468 
469     //1.
470     int scopeFirstIndex = sentenceIndex - 1;
471     if (scopeFirstIndex < 0 ) scopeFirstIndex = 0;
472 
473     int currSentenceIndex = sentenceIndex;
474     Annotation bestAntecedent = null;
475 
476     while (currSentenceIndex >= scopeFirstIndex) {
477 
478       Sentence currSentence = this.textSentences[currSentenceIndex];
479       AnnotationSet org = currSentence.getOrganizations();
480       AnnotationSet loc = currSentence.getLocations();
481       //combine them
482       AnnotationSet org_loc = org;
483       org_loc.addAll(loc);
484 
485       Iterator it = org_loc.iterator();
486       while (it.hasNext()) {
487         Annotation currOrgLoc = (Annotation)it.next();
488 
489         if (null == bestAntecedent) {
490           //discard cataphoric references
491           if (currOrgLoc.getStartNode().getOffset().longValue() <
492                                           pronoun.getStartNode().getOffset().longValue()) {
493             bestAntecedent = currOrgLoc;
494           }
495         }
496         else {
497           bestAntecedent = this._chooseAntecedent$IT$ITS$ITSELF$(bestAntecedent,currOrgLoc,pronoun);
498         }
499       }
500 
501       if (0 == currSentenceIndex--)
502         break;
503     }
504 
505     return bestAntecedent;
506   }
507 
508 
509   /** --- */
510   private Annotation _resolve$I$ME$MY$MYSELF$(Annotation pronoun, int sentenceIndex) {
511 
512     //0. preconditions
513     Assert.assertTrue(pronoun.getType().equals(TOKEN_ANNOTATION_TYPE));
514     Assert.assertTrue(pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) ||
515                       pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY));
516     String pronounString = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
517     Assert.assertTrue(pronounString.equalsIgnoreCase("I") ||
518                       pronounString.equalsIgnoreCase("MY") ||
519                       pronounString.equalsIgnoreCase("ME") ||
520                       pronounString.equalsIgnoreCase("MYSELF"));
521 
522     //0.5 sanity check
523     //if there are not quotes at all in the text then exit
524     if (0 == this.quotedText.length) {
525 //System.out.println("TEXT WITH NO QUOTES ENCOUNTERED...");
526       return null;
527     }
528 
529 
530     //1.
531     Annotation bestAntecedent = null;
532 
533     int closestQuoteIndex = java.util.Arrays.binarySearch(this.quotedText,pronoun,ANNOTATION_OFFSET_COMPARATOR);
534     //normalize index
535     if (closestQuoteIndex < 0) {
536       closestQuoteIndex = -closestQuoteIndex -1 -1;
537     }
538 
539     //still not good?
540     if (closestQuoteIndex < 0) {
541       closestQuoteIndex = 0;
542     }
543 
544     //get closest Quote
545     Quote quoteContext = this.quotedText[closestQuoteIndex];
546 
547     //assure that the pronoun is contained in the quoted text fragment
548     //otherwise exit
549 
550     if (pronoun.getStartNode().getOffset().intValue() > quoteContext.getEndOffset().intValue() ||
551         pronoun.getEndNode().getOffset().intValue() < quoteContext.getStartOffset().intValue()) {
552       //oops, probably incorrect text - I/My/Me is not part of quoted text fragment
553       //exit
554 //System.out.println("Oops! ["+pronounString+"] not part of quoted fragment...");
555       return null;
556     }
557 
558     //get the Persons that precede/succeed the quoted fragment
559     //the order is:
560     //
561     //[1]. if there exists a Person or pronoun in {he, she} following the quoted fragment but
562     //in the same sentence, then use it
563     //i.e.  ["PRN1(x)...", said X ...A, B, C ....]
564     //
565     //[2]. if there is a Person (NOT a pronoun) in the same sentence,
566     // preceding the quote, then use it
567     //i.e. . [A, B, C...X ..."PRN1(x) ..."...]
568     //
569 
570     //try [1]
571     //get the succeeding Persons/pronouns
572     AnnotationSet succCandidates = quoteContext.getAntecedentCandidates(Quote.ANTEC_AFTER);
573     if (false == succCandidates.isEmpty()) {
574       //cool, we have candidates, pick up the one closest to the end quote
575       Iterator it = succCandidates.iterator();
576 
577       while (it.hasNext()) {
578         Annotation currCandidate = (Annotation)it.next();
579         if (null == bestAntecedent || ANNOTATION_OFFSET_COMPARATOR.compare(bestAntecedent,currCandidate) > 0) {
580           //wow, we have a candidate that is closer to the quote
581           bestAntecedent = currCandidate;
582         }
583       }
584     }
585 
586     //try [2]
587     //get the preceding Persons/pronouns
588     if (null == bestAntecedent) {
589       AnnotationSet precCandidates = quoteContext.getAntecedentCandidates(Quote.ANTEC_BEFORE);
590       if (false == precCandidates.isEmpty()) {
591         //cool, we have candidates, pick up the one closest to the end quote
592         Iterator it = precCandidates.iterator();
593 
594         while (it.hasNext()) {
595           Annotation currCandidate = (Annotation)it.next();
596           if (null == bestAntecedent || ANNOTATION_OFFSET_COMPARATOR.compare(bestAntecedent,currCandidate) < 0) {
597             //wow, we have a candidate that is closer to the quote
598             bestAntecedent = currCandidate;
599           }
600         }
601       }
602     }
603 
604     //try [3]
605     //get the Persons/pronouns back in context
606     if (null == bestAntecedent) {
607       AnnotationSet precCandidates = quoteContext.getAntecedentCandidates(Quote.ANTEC_BACK);
608       if (false == precCandidates.isEmpty()) {
609         //cool, we have candidates, pick up the one closest to the end quote
610         Iterator it = precCandidates.iterator();
611 
612         while (it.hasNext()) {
613           Annotation currCandidate = (Annotation)it.next();
614           if (null == bestAntecedent || ANNOTATION_OFFSET_COMPARATOR.compare(bestAntecedent,currCandidate) > 0) {
615             //wow, we have a candidate that is closer to the quote
616             bestAntecedent = currCandidate;
617           }
618         }
619       }
620     }
621 
622     return bestAntecedent;
623   }
624 
625 
626   /** --- */
627   private void preprocess() throws ExecutionException {
628 
629     //0.5 cleanup
630     this.personGender.clear();
631     this.anaphor2antecedent.clear();
632 
633     //1.get all annotation in the input set
634     if ( this.annotationSetName == null || this.annotationSetName.equals("")) {
635       this.defaultAnnotations = this.document.getAnnotations();
636     }
637     else {
638       this.defaultAnnotations = this.document.getAnnotations(annotationSetName);
639     }
640 
641     //if none found, print warning and exit
642     if (this.defaultAnnotations == null || this.defaultAnnotations.isEmpty()) {
643       Err.prln("Coref Warning: No annotations found for processing!");
644       return;
645     }
646 
647 
648 
649     //2.1 remove QT annotations if left from previous execution
650     AnnotationSet qtSet = this.defaultAnnotations.get(QUOTED_TEXT_TYPE);
651     if (null != qtSet) {
652       qtSet.clear();
653     }
654 
655     //2.2. run quoted text transducer to generate "Quoted Text" annotations
656     this.qtTransducer.execute();
657 
658     //3.1 remove pleonastic annotations if left from previous execution
659     AnnotationSet pleonSet = this.defaultAnnotations.get(PLEONASTIC_TYPE);
660     if (null != pleonSet) {
661       pleonSet.clear();
662     }
663 
664     //3.2 run quoted text transducer to generate "Pleonasm" annotations
665     this.pleonTransducer.execute();
666 
667     //4.get all SENTENCE annotations
668     AnnotationSet sentenceAnnotations = this.defaultAnnotations.get(SENTENCE_ANNOTATION_TYPE);
669 
670     this.textSentences = new Sentence[sentenceAnnotations.size()];
671     Object[]  sentenceArray = sentenceAnnotations.toArray();
672 
673     java.util.Arrays.sort(sentenceArray,ANNOTATION_OFFSET_COMPARATOR);
674 
675     for (int i=0; i< sentenceArray.length; i++) {
676 
677       Annotation currSentence = (Annotation)sentenceArray[i];
678       Long sentStartOffset = currSentence.getStartNode().getOffset();
679       Long sentEndOffset = currSentence.getEndNode().getOffset();
680 
681       //4.1. get PERSOSNS in this sentence
682       AnnotationSet sentPersons = this.defaultAnnotations.get(PERSON_ANNOTATION_TYPE,
683                                                               sentStartOffset,
684                                                               sentEndOffset);
685 
686       //4.2. get ORGANIZATIONS in this sentence
687       AnnotationSet sentOrgs = this.defaultAnnotations.get(ORGANIZATION_ANNOTATION_TYPE,
688                                                               sentStartOffset,
689                                                               sentEndOffset);
690 
691       //4.3. get LOCATION in this sentence
692       AnnotationSet sentLocs = this.defaultAnnotations.get(LOCATION_ANNOTATION_TYPE,
693                                                               sentStartOffset,
694                                                               sentEndOffset);
695 
696       //4.5. create a Sentence for thei SENTENCE annotation
697       this.textSentences[i] = new Sentence(i,
698                                             0,
699                                             sentStartOffset,
700                                             sentEndOffset,
701                                             sentPersons,
702                                             sentOrgs,
703                                             sentLocs
704                                   );
705 
706       //4.6. for all PERSONs in the sentence - find their gender using the
707       //orthographic coreferences if the gender of some entity is unknown
708       Iterator itPersons = sentPersons.iterator();
709       while (itPersons.hasNext()) {
710         Annotation currPerson = (Annotation)itPersons.next();
711         String gender = this.findPersonGender(currPerson);
712         this.personGender.put(currPerson,gender);
713       }
714     }
715 
716     //5. initialise the quoted text fragments
717     AnnotationSet sentQuotes = this.defaultAnnotations.get(QUOTED_TEXT_TYPE);
718 
719     //if none then return
720     if (null == sentQuotes) {
721       this.quotedText = new Quote[0];
722     }
723     else {
724       this.quotedText = new Quote[sentQuotes.size()];
725 
726       Object[] quotesArray = sentQuotes.toArray();
727       java.util.Arrays.sort(quotesArray,ANNOTATION_OFFSET_COMPARATOR);
728 
729       for (int i =0; i < quotesArray.length; i++) {
730         this.quotedText[i] = new Quote((Annotation)quotesArray[i],i);
731       }
732     }
733 
734     //6. initialuse the plonastic It annotations
735     AnnotationSet plaonasticSet = this.defaultAnnotations.get(PLEONASTIC_TYPE);
736 
737     if (null == plaonasticSet) {
738       this.pleonasticIt = new Annotation[0];
739     }
740     else {
741       this.pleonasticIt = new Annotation[plaonasticSet.size()];
742 
743       Object[] quotesArray = plaonasticSet.toArray();
744       java.util.Arrays.sort(quotesArray,ANNOTATION_OFFSET_COMPARATOR);
745 
746       for (int i=0; i< this.pleonasticIt.length; i++) {
747         this.pleonasticIt[i] = (Annotation)quotesArray[i];
748       }
749     }
750 
751   }
752 
753 
754   /** --- */
755   private String findPersonGender(Annotation person) {
756 
757     String result = (String)person.getFeatures().get(PERSON_GENDER_FEATURE_NAME);
758 
759     if (null==result) {
760       //gender is unknown - try to find it from the ortho coreferences
761       List orthoMatches  = (List)person.getFeatures().get(ANNOTATION_COREF_FEATURE_NAME);
762 
763       if (null != orthoMatches) {
764         Iterator itMatches = orthoMatches.iterator();
765 
766         while (itMatches.hasNext()) {
767           Integer correferringID = (Integer)itMatches.next();
768           Annotation coreferringEntity = this.defaultAnnotations.get(correferringID);
769           Assert.assertTrue(coreferringEntity.getType().equalsIgnoreCase(PERSON_ANNOTATION_TYPE));
770           String correferringGender = (String)coreferringEntity.getFeatures().get(PERSON_GENDER_FEATURE_NAME);
771 
772           if (null != correferringGender) {
773             result = correferringGender;
774             break;
775           }
776         }
777       }
778     }
779 
780     return result;
781   }
782 
783 
784   /** --- */
785   private static class AnnotationOffsetComparator implements Comparator {
786 
787     private int _getOffset(Object o) {
788 
789       if (o instanceof Annotation) {
790         return ((Annotation)o).getEndNode().getOffset().intValue();
791       }
792       else if (o instanceof Sentence) {
793         return ((Sentence)o).getStartOffset().intValue();
794       }
795       else if (o instanceof Quote) {
796         return ((Quote)o).getStartOffset().intValue();
797       }
798       else if (o instanceof Node) {
799         return ((Node)o).getOffset().intValue();
800       }
801       else {
802         throw new IllegalArgumentException();
803       }
804     }
805 
806     public int compare(Object o1,Object o2) {
807 
808       //0. preconditions
809       Assert.assertNotNull(o1);
810       Assert.assertNotNull(o2);
811       Assert.assertTrue(o1 instanceof Annotation ||
812                         o1 instanceof Sentence ||
813                         o1 instanceof Quote ||
814                         o1 instanceof Node);
815       Assert.assertTrue(o2 instanceof Annotation ||
816                         o2 instanceof Sentence ||
817                         o2 instanceof Quote ||
818                         o2 instanceof Node);
819 
820       int offset1 = _getOffset(o1);
821       int offset2 = _getOffset(o2);
822 
823       return offset1 - offset2;
824     }
825   }
826 
827 
828   /** --- */
829   private Annotation _chooseAntecedent$HE$HIM$HIS$SHE$HER$HIMSELF$(Annotation ant1, Annotation ant2, Annotation pronoun) {
830 
831     //0. preconditions
832     Assert.assertNotNull(ant1);
833     Assert.assertNotNull(ant2);
834     Assert.assertNotNull(pronoun);
835     Assert.assertTrue(pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) ||
836                       pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY));
837     String pronounString = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
838     Assert.assertTrue(pronounString.equalsIgnoreCase("SHE") ||
839                       pronounString.equalsIgnoreCase("HER") ||
840                       pronounString.equalsIgnoreCase("HE") ||
841                       pronounString.equalsIgnoreCase("HIM") ||
842                       pronounString.equalsIgnoreCase("HIS") ||
843                       pronounString.equalsIgnoreCase("HIMSELF"));
844 
845     Long offset1 = ant1.getStartNode().getOffset();
846     Long offset2 = ant2.getStartNode().getOffset();
847     Long offsetPrn = pronoun.getStartNode().getOffset();
848 
849     long diff1 = offsetPrn.longValue() - offset1.longValue();
850     long diff2 = offsetPrn.longValue() - offset2.longValue();
851 //    Assert.assertTrue(diff1 != 0 && diff2 != 0);
852     //reject candidates that overlap with the pronoun
853     if (diff1 == 0) {
854       return ant2;
855     }
856     else if (diff2 == 0) {
857       return ant1;
858     }
859 
860     //get the one CLOSEST AND PRECEDING the pronoun
861     if (diff1 > 0 && diff2 > 0) {
862       //we have [...antecedentA...AntecedentB....pronoun...] ==> choose B
863       if (diff1 < diff2)
864         return ant1;
865       else
866         return ant2;
867     }
868     else if (diff1 < 0 && diff2 < 0) {
869       //we have [...pronoun ...antecedentA...AntecedentB.......] ==> choose A
870       if (Math.abs(diff1) < Math.abs(diff2))
871         return ant1;
872       else
873           return ant2;
874     }
875     else {
876       Assert.assertTrue(Math.abs(diff1 + diff2) < Math.abs(diff1) + Math.abs(diff2));
877       //we have [antecedentA...pronoun...AntecedentB] ==> choose A
878       if (diff1 > 0)
879         return ant1;
880       else
881         return ant2;
882     }
883   }
884 
885   /** --- */
886   private Annotation _chooseAntecedent$IT$ITS$ITSELF$(Annotation ant1, Annotation ant2, Annotation pronoun) {
887 
888     //0. preconditions
889     Assert.assertNotNull(ant1);
890     Assert.assertNotNull(ant2);
891     Assert.assertNotNull(pronoun);
892     Assert.assertTrue(pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) ||
893                       pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY));
894     String pronounString = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
895 
896     Assert.assertTrue(pronounString.equalsIgnoreCase("IT") ||
897                       pronounString.equalsIgnoreCase("ITS") ||
898                       pronounString.equalsIgnoreCase("ITSELF"));
899 
900     Long offset1 = ant1.getStartNode().getOffset();
901     Long offset2 = ant2.getStartNode().getOffset();
902     Long offsetPrn = pronoun.getStartNode().getOffset();
903     long diff1 = offsetPrn.longValue() - offset1.longValue();
904     long diff2 = offsetPrn.longValue() - offset2.longValue();
905 //    Assert.assertTrue(diff1 != 0 && diff2 != 0);
906     //reject candidates that overlap with the pronoun
907     if (diff1 == 0) {
908       return ant2;
909     }
910     else if (diff2 == 0) {
911       return ant1;
912     }
913 
914 
915     //get the one CLOSEST AND PRECEDING the pronoun
916     if (diff1 > 0 && diff2 > 0) {
917       //we have [...antecedentA...AntecedentB....pronoun...] ==> choose B
918       if (diff1 < diff2)
919         return ant1;
920       else
921         return ant2;
922     }
923     else if (diff1 > 0){
924       Assert.assertTrue(Math.abs(diff1 + diff2) < Math.abs(diff1) + Math.abs(diff2));
925       //we have [antecedentA...pronoun...AntecedentB] ==> choose A
926       return ant1;
927     }
928     else if (diff2 > 0){
929       Assert.assertTrue(Math.abs(diff1 + diff2) < Math.abs(diff1) + Math.abs(diff2));
930       //we have [antecedentA...pronoun...AntecedentB] ==> choose A
931       return ant2;
932     }
933     else {
934       //both possible antecedents are BEHIND the anaophoric pronoun - i.e. we have either
935       //cataphora, or nominal antecedent, or an antecedent that is further back in scope
936       //in any case - discard the antecedents
937       return null;
938     }
939   }
940 
941 
942   /** --- */
943   private class Quote {
944 
945     /** --- */
946     public static final int ANTEC_AFTER = 1;
947     /** --- */
948     public static final int ANTEC_BEFORE = 2;
949     /** --- */
950     public static final int ANTEC_BACK = 3;
951     /** --- */
952     private AnnotationSet antecedentsBefore;
953     /** --- */
954     private AnnotationSet antecedentsAfter;
955     /** --- */
956     private AnnotationSet antecedentsBackInContext;
957     /** --- */
958     private Annotation quoteAnnotation;
959     /** --- */
960     private int quoteIndex;
961 
962     /** --- */
963     public Quote(Annotation quoteAnnotation, int index) {
964 
965       this.quoteAnnotation = quoteAnnotation;
966       this.quoteIndex = index;
967       init();
968     }
969 
970     /** --- */
971     private void init() {
972 
973       //0.preconditions
974       Assert.assertNotNull(textSentences);
975 
976       //0.5 create a restriction for PRP pos tokens
977       FeatureMap prpTokenRestriction = new SimpleFeatureMapImpl();
978       prpTokenRestriction.put(TOKEN_CATEGORY_FEATURE_NAME,PRP_CATEGORY);
979 
980       //1. generate the precPersons set
981 
982       //1.1 locate the sentece containing the opening quote marks
983       int quoteStartPos = java.util.Arrays.binarySearch(textSentences,
984                                                         this.quoteAnnotation.getStartNode(),
985                                                         ANNOTATION_OFFSET_COMPARATOR);
986 
987       //normalize index
988       int startSentenceIndex = quoteStartPos >= 0 ? quoteStartPos
989                                                   : -quoteStartPos -1 -1; // blame Sun, not me
990       //still not good?
991       if (startSentenceIndex < 0) {
992         startSentenceIndex = 0;
993       }
994 
995       //1.2. get the persons and restrict to these that precede the quote (i.e. not contained
996       //in the quote)
997       this.antecedentsBefore = generateAntecedentCandidates(startSentenceIndex,
998                                                             this.quoteIndex,
999                                                             ANTEC_BEFORE);
1000
1001
1002      //2. generate the precPersonsInCOntext set
1003      //2.1. get the persons from the sentence precedeing the sentence containing the quote start
1004      if (startSentenceIndex > 0) {
1005        this.antecedentsBackInContext = generateAntecedentCandidates(startSentenceIndex -1,
1006                                                                    this.quoteIndex,
1007                                                                    ANTEC_BACK);
1008      }
1009
1010      //2. generate the succ  Persons set
1011      //2.1 locate the sentece containing the closing quote marks
1012      int quoteEndPos = java.util.Arrays.binarySearch(textSentences,
1013                                                        this.quoteAnnotation.getEndNode(),
1014                                                        ANNOTATION_OFFSET_COMPARATOR);
1015
1016      //normalize it
1017      int endSentenceIndex = quoteEndPos >= 0 ? quoteEndPos
1018                                              : -quoteEndPos -1 -1; // blame Sun, not me
1019      //still not good?
1020      if (endSentenceIndex < 0) {
1021        endSentenceIndex = 0;
1022      }
1023
1024      this.antecedentsAfter = generateAntecedentCandidates(endSentenceIndex,
1025                                                            this.quoteIndex,
1026                                                            ANTEC_AFTER);
1027      //generate t
1028    }
1029
1030
1031    /** --- */
1032    private AnnotationSet generateAntecedentCandidates(int sentenceNumber,
1033                                                        int quoteNumber ,
1034                                                        int mode) {
1035
1036      //0. preconditions
1037      Assert.assertTrue(sentenceNumber >=0);
1038      Assert.assertTrue(quoteNumber >=0);
1039      Assert.assertTrue(mode == Quote.ANTEC_AFTER ||
1040                        mode == Quote.ANTEC_BEFORE ||
1041                        mode == Quote.ANTEC_BACK);
1042
1043      //1. get sentence
1044     Sentence sentence = textSentences[sentenceNumber];
1045
1046      //2. get the persons
1047      AnnotationSet antecedents = new AnnotationSetImpl(sentence.getPersons());
1048
1049      //4. now get the he/she pronouns in the relevant context
1050      AnnotationSet annotations = null;
1051
1052      switch(mode) {
1053
1054        case ANTEC_BEFORE:
1055          annotations = defaultAnnotations.getContained(sentence.getStartOffset(),
1056                                                      this.getStartOffset());
1057          break;
1058
1059        case ANTEC_AFTER:
1060          annotations = defaultAnnotations.getContained(this.getEndOffset(),
1061                                                     sentence.getEndOffset());
1062          break;
1063
1064        case ANTEC_BACK:
1065          annotations = defaultAnnotations.getContained(sentence.getStartOffset(),
1066                                                     sentence.getEndOffset());
1067          break;
1068      }
1069
1070      //4. get the pronouns
1071      //restrict to he/she pronouns
1072      if (null != annotations) {
1073        AnnotationSet pronouns = annotations.get(TOKEN_ANNOTATION_TYPE,PRP_RESTRICTION);
1074
1075        if (null != pronouns) {
1076
1077          Iterator it = pronouns.iterator();
1078          while (it.hasNext()) {
1079            Annotation currPronoun = (Annotation)it.next();
1080            //add to succPersons only if HE/SHE
1081            String pronounString = (String)currPronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1082
1083            if (null != pronounString &&
1084                (pronounString.equalsIgnoreCase("he") || pronounString.equalsIgnoreCase("she"))
1085                )
1086              antecedents.add(currPronoun);
1087          }//while
1088        }//if
1089      }//if
1090
1091
1092      //3. depending on the mode, may have to restrict persons to these that precede/succeed
1093      //the quoted fragment
1094      //
1095      //for ANTEC_BEFORE, get the ones #preceding# the quote, contained in the sentence where
1096      //the quote *starts*
1097      //
1098      //for ANTEC_AFTER, get the ones #succeeding# the quote, contained in the sentence where
1099      //the quote *ends*
1100      //
1101      //for ANTEC_BACK, we are operating in the context of the sentence previous to the
1102      //sentence where the quote starts. I.e. we're resolbinf a case like
1103      // [sss "q1q1q1q1" s1s1s1s1]["q2q2q2q2"]
1104      //...and we want to get the entities from the s1s1 part - they *succeed* the #previous# quote
1105      //Note that the cirrent sentence is the first one, not the second
1106      //
1107      Iterator itPersons = antecedents.iterator();
1108
1109      while (itPersons.hasNext()) {
1110        Annotation currPerson = (Annotation)itPersons.next();
1111
1112        //cut
1113        if (Quote.ANTEC_BEFORE == mode &&
1114            currPerson.getStartNode().getOffset().intValue() > getStartOffset().intValue()) {
1115          //restrict only to persosn preceding
1116          itPersons.remove();
1117        }
1118        else if (Quote.ANTEC_AFTER == mode &&
1119                currPerson.getStartNode().getOffset().intValue() < getEndOffset().intValue()) {
1120          //restrict only to persons succeeding the quote
1121          itPersons.remove();
1122        }
1123        else if (Quote.ANTEC_BACK == mode) {
1124          //this one is tricky
1125          //locate the quote previous to the one we're resolving
1126          //(since we're operating in the sentence previous to the quote being resolved
1127          //wew try to find if any quote (prevQuote) exist in this sentence and get the
1128          //persons succeeding it)
1129
1130          //get prev quote
1131          //is the curr quote the first one?
1132          if (quoteNumber >0) {
1133            Quote prevQuote = PronominalCoref.this.quotedText[quoteNumber-1];
1134
1135            //restrict to the succeeding persons
1136            if (currPerson.getStartNode().getOffset().longValue() < prevQuote.getEndOffset().longValue()) {
1137              itPersons.remove();
1138            }
1139          }
1140        }
1141      }
1142
1143      return antecedents;
1144    }
1145
1146    /** --- */
1147    public Long getStartOffset() {
1148      return this.quoteAnnotation.getStartNode().getOffset();
1149    }
1150
1151    /** --- */
1152    public Long getEndOffset() {
1153      return this.quoteAnnotation.getEndNode().getOffset();
1154    }
1155
1156    /** --- */
1157    public AnnotationSet getAntecedentCandidates(int type) {
1158
1159      switch(type) {
1160
1161        case ANTEC_AFTER:
1162          return this.antecedentsAfter;
1163
1164        case ANTEC_BEFORE:
1165          return this.antecedentsBefore;
1166
1167        case ANTEC_BACK:
1168          return this.antecedentsBackInContext;
1169
1170        default:
1171          throw new IllegalArgumentException();
1172      }
1173    }
1174
1175  }
1176
1177
1178  /** --- */
1179  private class Sentence {
1180
1181    /** --- */
1182    private int sentNumber;
1183    /** --- */
1184    private int paraNumber;
1185    /** --- */
1186    private Long startOffset;
1187    /** --- */
1188    private Long endOffset;
1189    /** --- */
1190    private AnnotationSet persons;
1191    /** --- */
1192    private AnnotationSet organizations;
1193    /** --- */
1194    private AnnotationSet locations;
1195
1196    /** --- */
1197    public Sentence(int sentNumber,
1198                    int paraNumber,
1199                    Long startOffset,
1200                    Long endOffset,
1201                    AnnotationSet persons,
1202                    AnnotationSet organizations,
1203                    AnnotationSet locations) {
1204
1205      this.sentNumber = sentNumber;
1206      this.paraNumber = paraNumber;
1207      this.startOffset = startOffset;
1208      this.endOffset = endOffset;
1209      this.persons = persons;
1210      this.organizations = organizations;
1211      this.locations = locations;
1212    }
1213
1214    /** --- */
1215    public Long getStartOffset() {
1216      return this.startOffset;
1217    }
1218
1219    /** --- */
1220    public Long getEndOffset() {
1221      return this.endOffset;
1222    }
1223
1224    /** --- */
1225    public AnnotationSet getPersons() {
1226      return this.persons;
1227    }
1228
1229    /** --- */
1230    public AnnotationSet getOrganizations() {
1231      return this.organizations;
1232    }
1233
1234    /** --- */
1235    public AnnotationSet getLocations() {
1236      return this.locations;
1237    }
1238  }
1239
1240}