1   /*
2    *  PronominalCoref.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Marin Dimitrov, 30/Dec/2001
12   *
13   *  $Id: PronominalCoref.java,v 1.23 2002/03/11 10:00:18 marin Exp $
14   */
15  
16  package gate.creole.coref;
17  
18  import java.util.*;
19  import java.net.*;
20  
21  import junit.framework.*;
22  
23  import gate.*;
24  import gate.creole.*;
25  import gate.util.*;
26  import gate.annotation.*;
27  
28  public class PronominalCoref extends AbstractLanguageAnalyser
29                                implements ProcessingResource, ANNIEConstants{
30  
  /** Name of the "document" parameter. */
  public static final String COREF_DOCUMENT_PARAMETER_NAME = "document";

  /** Name of the "annotationSetName" parameter. */
  public static final String COREF_ANN_SET_PARAMETER_NAME = "annotationSetName";

  /** When true, pronouns that are not handled are reported on the GATE error stream. */
  private static final boolean DEBUG = false;

  // locations of the JAPE grammars loaded by the two aggregated transducers
  private static final String QT_GRAMMAR_URL = "gate://gate/creole/coref/quoted_text.jape";
  private static final String PLEON_GRAMMAR_URL = "gate://gate/creole/coref/pleonasm.jape";

  // types of the helper annotations read back after the transducers have run
  private static final String QUOTED_TEXT_TYPE = "Quoted Text";
  private static final String PLEONASTIC_TYPE = "PleonasticIt";

  // token category values identifying personal (PRP) and possessive (PRP$) pronouns
  private static final String PRP_CATEGORY = "PRP";
  private static final String PRP$_CATEGORY = "PRP$";

  // number of sentences before the pronoun's own sentence that form the
  // search scope when resolving he/she pronouns
  private static final int SENTENCES_IN_SCOPE = 3;
  /** Shared comparator used for every sort and binary search in this class. */
  private static AnnotationOffsetComparator ANNOTATION_OFFSET_COMPARATOR;
  /** Name of the annotation set to process; null or empty selects the document's default set. */
  private String annotationSetName;
  /** Transducer running the quoted text grammar. */
  private Transducer qtTransducer;
  /** Transducer running the pleonastic "it" grammar. */
  private Transducer pleonTransducer;
  /** The annotation set being processed (the named set if a name was given, the default set otherwise). */
  private AnnotationSet defaultAnnotations;
  /** Sentences of the document, ordered with the offset comparator; rebuilt by preprocess(). */
  private Sentence[] textSentences;
  /** Quoted text fragments of the document, ordered with the offset comparator; rebuilt by preprocess(). */
  private Quote[] quotedText;
  /** Pleonastic "it" annotations, ordered with the offset comparator; rebuilt by preprocess(). */
  private Annotation[] pleonasticIt;
  /** Maps each Person annotation to its gender string; the value is null when no gender was found. */
  private HashMap personGender;
  /** Maps each processed pronoun annotation to its antecedent annotation; the value is null when unresolved. */
  private HashMap anaphor2antecedent;
  /** Feature map restricting the token category to PRP; populated in the static initialiser. */
  private static final FeatureMap PRP_RESTRICTION;

  /** Creates the shared comparator and the PRP category restriction. */
  static {
    ANNOTATION_OFFSET_COMPARATOR = new AnnotationOffsetComparator();
    PRP_RESTRICTION = new SimpleFeatureMapImpl();
    PRP_RESTRICTION.put(TOKEN_CATEGORY_FEATURE_NAME,PRP_CATEGORY);
  }
81  
82    /** --- */
83    public PronominalCoref() {
84  
85      this.personGender = new HashMap();
86      this.anaphor2antecedent = new HashMap();
87      this.qtTransducer = new gate.creole.Transducer();
88      this.pleonTransducer = new gate.creole.Transducer();
89    }
90  
91    /** Initialise this resource, and return it. */
92    public Resource init() throws ResourceInstantiationException {
93  
94      //0. preconditions
95      Assert.assertNotNull(this.qtTransducer);
96  
97      //1. initialise quoted text transducer
98      URL qtGrammarURL = null;
99      try {
100       qtGrammarURL = new URL(QT_GRAMMAR_URL);
101     }
102     catch(MalformedURLException mue) {
103       throw new ResourceInstantiationException(mue);
104     }
105     this.qtTransducer.setGrammarURL(qtGrammarURL);
106     this.qtTransducer.setEncoding("UTF-8");
107     this.qtTransducer.init();
108 
109     //2. initialise pleonastic transducer
110     URL pleonGrammarURL = null;
111     try {
112       pleonGrammarURL = new URL(PLEON_GRAMMAR_URL);
113     }
114     catch(MalformedURLException mue) {
115       throw new ResourceInstantiationException(mue);
116     }
117     this.pleonTransducer.setGrammarURL(pleonGrammarURL);
118     this.pleonTransducer.setEncoding("UTF-8");
119     this.pleonTransducer.init();
120 
121 
122     //3. delegate
123     return super.init();
124   } // init()
125 
126   /**
127    * Reinitialises the processing resource. After calling this method the
128    * resource should be in the state it is after calling init.
129    * If the resource depends on external resources (such as rules files) then
130    * the resource will re-read those resources. If the data used to create
131    * the resource has changed since the resource has been created then the
132    * resource will change too after calling reInit().
133   */
134   public void reInit() throws ResourceInstantiationException {
135 
136     if (null != this.qtTransducer) {
137       this.qtTransducer.reInit();
138     }
139 
140     if (null != this.pleonTransducer) {
141       this.pleonTransducer.reInit();
142     }
143 
144     init();
145   } // reInit()
146 
147 
148   /** Set the document to run on. */
149   public void setDocument(Document newDocument) {
150 
151     //0. precondition
152 //    Assert.assertNotNull(newDocument);
153 
154     //1. set doc for aggregated components
155     this.qtTransducer.setDocument(newDocument);
156     this.pleonTransducer.setDocument(newDocument);
157 
158     //3. delegate
159     super.setDocument(newDocument);
160   }
161 
162   /** --- */
163   public void setAnnotationSetName(String annotationSetName) {
164     this.annotationSetName = annotationSetName;
165   }
166 
167 
168   /** --- */
169   public String getAnnotationSetName() {
170     return annotationSetName;
171   }
172 
173   /**
174    * This method runs the coreferencer. It assumes that all the needed parameters
175    * are set. If they are not, an exception will be fired.
176    */
177   public void execute() throws ExecutionException{
178 
179     //0. preconditions
180     if(null == this.document) {
181       throw new ExecutionException("[coreference] Document is not set!");
182     }
183 
184     //1. preprocess
185     preprocess();
186 /*
187     //2. remove corefs from previous run
188     String annSetName = this.annotationSetName == null ? "COREF"
189                                                        : this.annotationSetName;
190 
191     AnnotationSet corefSet = this.document.getAnnotations(annSetName);
192     if (false == corefSet.isEmpty()) {
193       corefSet.clear();
194     }
195 */
196     //3.get personal pronouns
197     FeatureMap constraintPRP = new SimpleFeatureMapImpl();
198     constraintPRP.put(TOKEN_CATEGORY_FEATURE_NAME,PRP_CATEGORY);
199     AnnotationSet personalPronouns = this.defaultAnnotations.get(TOKEN_ANNOTATION_TYPE,constraintPRP);
200 
201     //4.get possesive pronouns
202     FeatureMap constraintPRP$ = new SimpleFeatureMapImpl();
203     constraintPRP$.put(TOKEN_CATEGORY_FEATURE_NAME,PRP$_CATEGORY);
204     AnnotationSet possesivePronouns = this.defaultAnnotations.get(TOKEN_ANNOTATION_TYPE,constraintPRP$);
205 
206     //5.combine them
207     AnnotationSet pronouns = personalPronouns;
208     if (null == personalPronouns) {
209       pronouns = possesivePronouns;
210     }
211     else if (null != possesivePronouns) {
212       pronouns.addAll(possesivePronouns);
213     }
214 
215     //6.do we have pronouns at all?
216     if (null == pronouns) {
217       //do nothing
218       return;
219     }
220 
221     //7.sort them according to offset
222     Object[] arrPronouns = pronouns.toArray();
223     java.util.Arrays.sort(arrPronouns,ANNOTATION_OFFSET_COMPARATOR);
224 
225     //8.cleanup - ease the GC
226     pronouns = personalPronouns = possesivePronouns = null;
227 
228     int prnSentIndex = 0;
229 
230 
231     //10. process all pronouns
232     for (int i=0; i< arrPronouns.length; i++) {
233       Annotation currPronoun = (Annotation)arrPronouns[i];
234       while (this.textSentences[prnSentIndex].getEndOffset().longValue() <
235                                       currPronoun.getEndNode().getOffset().longValue()) {
236         prnSentIndex++;
237       }
238 
239       Sentence currSentence = this.textSentences[prnSentIndex];
240       Assert.assertTrue(currSentence.getStartOffset().longValue() <= currPronoun.getStartNode().getOffset().longValue());
241       Assert.assertTrue(currSentence.getEndOffset().longValue() >= currPronoun.getEndNode().getOffset().longValue());
242 
243       //11. find antecedent (if any) for pronoun
244       Annotation antc = findAntecedent(currPronoun,prnSentIndex);
245 
246       //12. add to the ana2ant hashtable
247       this.anaphor2antecedent.put(currPronoun,antc);
248     }
249 
250     //done
251   }
252 
253 
254   /** --- */
255   public HashMap getResolvedAnaphora() {
256     return this.anaphor2antecedent;
257   }
258 
259   /** --- */
260   private Annotation findAntecedent(Annotation currPronoun,int prnSentIndex) {
261 
262     //0. preconditions
263     Assert.assertNotNull(currPronoun);
264     Assert.assertTrue(prnSentIndex >= 0);
265     Assert.assertTrue(currPronoun.getType().equals(TOKEN_ANNOTATION_TYPE));
266     Assert.assertTrue(currPronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) ||
267                       currPronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY));
268 
269     //1.
270     String strPronoun = (String)currPronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
271 
272     Assert.assertNotNull(strPronoun);
273 
274     //2. delegate processing to the appropriate methods
275     if (strPronoun.equalsIgnoreCase("HE") ||
276         strPronoun.equalsIgnoreCase("HIM") ||
277         strPronoun.equalsIgnoreCase("HIS") ||
278         strPronoun.equalsIgnoreCase("HIMSELF")) {
279       return _resolve$HE$HIM$HIS$HIMSELF$(currPronoun,prnSentIndex);
280     }
281     else if (strPronoun.equalsIgnoreCase("SHE") ||
282               strPronoun.equalsIgnoreCase("HER")) {
283       return _resolve$SHE$HER$(currPronoun,prnSentIndex);
284     }
285     else if (strPronoun.equalsIgnoreCase("IT") ||
286               strPronoun.equalsIgnoreCase("ITS") ||
287               strPronoun.equalsIgnoreCase("ITSELF")) {
288       return _resolve$IT$ITS$ITSELF$(currPronoun,prnSentIndex);
289     }
290     else if (strPronoun.equalsIgnoreCase("I") ||
291               strPronoun.equalsIgnoreCase("ME") ||
292               strPronoun.equalsIgnoreCase("MY") ||
293               strPronoun.equalsIgnoreCase("MYSELF")) {
294       return _resolve$I$ME$MY$MYSELF$(currPronoun,prnSentIndex);
295     }
296     else {
297       if (DEBUG) {
298         gate.util.Err.println("["+strPronoun+"] is not handled yet...");
299       }
300       return null;
301     }
302   }
303 
304 
305   boolean isPleonastic(Annotation pronoun) {
306 
307     //0. preconditions
308     Assert.assertNotNull(pronoun);
309     String str = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
310     Assert.assertTrue(str.equalsIgnoreCase("IT"));
311 
312     //1. do we have pleonasms in this text?
313     if (this.pleonasticIt.length == 0) {
314       return false;
315     }
316 
317     //2. find closest pleonasm index
318     int closestPleonasmIndex = java.util.Arrays.binarySearch(this.pleonasticIt,
319                                                              pronoun,
320                                                              ANNOTATION_OFFSET_COMPARATOR);
321     //normalize index
322     if (closestPleonasmIndex < 0) {
323       closestPleonasmIndex = -closestPleonasmIndex -1 -1;
324     }
325 
326     //still not good?
327     if (closestPleonasmIndex < 0) {
328       closestPleonasmIndex = 0;
329     }
330 
331     //get closest pleonasm
332     Annotation pleonasm = this.pleonasticIt[closestPleonasmIndex];
333 
334 //System.out.println(pleonasm);
335 //System.out.println(pronoun);
336 
337     //3. return true only if the proboun is contained in pleonastic fragment
338     boolean result =  (pleonasm.getStartNode().getOffset().intValue() <= pronoun.getStartNode().getOffset().intValue()
339             &&
340             pleonasm.getEndNode().getOffset().intValue() >= pronoun.getEndNode().getOffset().intValue());
341 //System.out.println("is pleon=["+result+"]");
342     return result;
343   }
344 
345 
346   /** --- */
347   private Annotation _resolve$HE$HIM$HIS$HIMSELF$(Annotation pronoun, int sentenceIndex) {
348 
349     //0. preconditions
350     Assert.assertTrue(pronoun.getType().equals(TOKEN_ANNOTATION_TYPE));
351     Assert.assertTrue(pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) ||
352                       pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY));
353     String pronounString = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
354     Assert.assertTrue(pronounString.equalsIgnoreCase("HE") ||
355                       pronounString.equalsIgnoreCase("HIM") ||
356                       pronounString.equalsIgnoreCase("HIS") ||
357                       pronounString.equalsIgnoreCase("HIMSELF"));
358 
359     //1.
360     boolean antecedentFound = false;
361     int scopeFirstIndex = sentenceIndex - SENTENCES_IN_SCOPE;
362     if (scopeFirstIndex < 0 ) scopeFirstIndex = 0;
363 
364     int currSentenceIndex = sentenceIndex;
365     Annotation bestAntecedent = null;
366 
367     while (currSentenceIndex >= scopeFirstIndex || antecedentFound == false) {
368       Sentence currSentence = this.textSentences[currSentenceIndex];
369       AnnotationSet persons = currSentence.getPersons();
370 
371       Iterator it = persons.iterator();
372       while (it.hasNext()) {
373         Annotation currPerson = (Annotation)it.next();
374         String gender = (String)this.personGender.get(currPerson);
375 
376         if (null == gender ||
377             gender.equalsIgnoreCase("MALE") ||
378             gender.equalsIgnoreCase("UNKNOWN")) {
379           //hit
380           antecedentFound = true;
381 
382           if (null == bestAntecedent) {
383             bestAntecedent = currPerson;
384           }
385           else {
386             bestAntecedent = _chooseAntecedent$HE$HIM$HIS$SHE$HER$HIMSELF$(bestAntecedent,currPerson,pronoun);
387           }
388         }
389       }
390 
391       if (0 == currSentenceIndex--)
392         break;
393 
394     }
395 
396     return bestAntecedent;
397   }
398 
399 
400   /** --- */
401   private Annotation _resolve$SHE$HER$(Annotation pronoun, int sentenceIndex) {
402 
403     //0. preconditions
404     Assert.assertTrue(pronoun.getType().equals(TOKEN_ANNOTATION_TYPE));
405     Assert.assertTrue(pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) ||
406                       pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY));
407     String pronounString = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
408     Assert.assertTrue(pronounString.equalsIgnoreCase("SHE") ||
409                       pronounString.equalsIgnoreCase("HER"));
410 
411     //1.
412     boolean antecedentFound = false;
413     int scopeFirstIndex = sentenceIndex - SENTENCES_IN_SCOPE;
414     if (scopeFirstIndex < 0 ) scopeFirstIndex = 0;
415     int currSentenceIndex = sentenceIndex;
416     Annotation bestAntecedent = null;
417 
418     while (currSentenceIndex >= scopeFirstIndex || antecedentFound == false) {
419       Sentence currSentence = this.textSentences[currSentenceIndex];
420       AnnotationSet persons = currSentence.getPersons();
421 
422       Iterator it = persons.iterator();
423       while (it.hasNext()) {
424         Annotation currPerson = (Annotation)it.next();
425         String gender = (String)this.personGender.get(currPerson);
426 
427         if (null == gender ||
428             gender.equalsIgnoreCase("FEMALE") ||
429             gender.equalsIgnoreCase("UNKNOWN")) {
430           //hit
431           antecedentFound = true;
432 
433           if (null == bestAntecedent) {
434             bestAntecedent = currPerson;
435           }
436           else {
437             bestAntecedent = _chooseAntecedent$HE$HIM$HIS$SHE$HER$HIMSELF$(bestAntecedent,currPerson,pronoun);
438           }
439         }
440       }
441 
442       if (0 == currSentenceIndex--)
443         break;
444     }
445 
446     return bestAntecedent;
447   }
448 
449 
450   /** --- */
451   private Annotation _resolve$IT$ITS$ITSELF$(Annotation pronoun, int sentenceIndex) {
452 
453     //0. preconditions
454     Assert.assertTrue(pronoun.getType().equals(TOKEN_ANNOTATION_TYPE));
455     Assert.assertTrue(pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) ||
456                       pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY));
457     String pronounString = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
458     Assert.assertTrue(pronounString.equalsIgnoreCase("IT") ||
459                       pronounString.equalsIgnoreCase("ITS") ||
460                       pronounString.equalsIgnoreCase("ITSELF"));
461 
462     //0.5 check if the IT is pleonastic
463     if (pronounString.equalsIgnoreCase("IT") &&
464         isPleonastic(pronoun)) {
465 //System.out.println("PLEONASM...");
466       return null;
467     }
468 
469     //1.
470     int scopeFirstIndex = sentenceIndex - 1;
471     if (scopeFirstIndex < 0 ) scopeFirstIndex = 0;
472 
473     int currSentenceIndex = sentenceIndex;
474     Annotation bestAntecedent = null;
475 
476     while (currSentenceIndex >= scopeFirstIndex) {
477 
478       Sentence currSentence = this.textSentences[currSentenceIndex];
479       AnnotationSet org = currSentence.getOrganizations();
480       AnnotationSet loc = currSentence.getLocations();
481       //combine them
482       AnnotationSet org_loc = org;
483       org_loc.addAll(loc);
484 
485       Iterator it = org_loc.iterator();
486       while (it.hasNext()) {
487         Annotation currOrgLoc = (Annotation)it.next();
488 
489         if (null == bestAntecedent) {
490           //discard cataphoric references
491           if (currOrgLoc.getStartNode().getOffset().longValue() <
492                                           pronoun.getStartNode().getOffset().longValue()) {
493             bestAntecedent = currOrgLoc;
494           }
495         }
496         else {
497           bestAntecedent = this._chooseAntecedent$IT$ITS$ITSELF$(bestAntecedent,currOrgLoc,pronoun);
498         }
499       }
500 
501       if (0 == currSentenceIndex--)
502         break;
503     }
504 
505     return bestAntecedent;
506   }
507 
508 
509   /** --- */
510   private Annotation _resolve$I$ME$MY$MYSELF$(Annotation pronoun, int sentenceIndex) {
511 
512     //0. preconditions
513     Assert.assertTrue(pronoun.getType().equals(TOKEN_ANNOTATION_TYPE));
514     Assert.assertTrue(pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) ||
515                       pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY));
516     String pronounString = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
517     Assert.assertTrue(pronounString.equalsIgnoreCase("I") ||
518                       pronounString.equalsIgnoreCase("MY") ||
519                       pronounString.equalsIgnoreCase("ME") ||
520                       pronounString.equalsIgnoreCase("MYSELF"));
521 
522     //0.5 sanity check
523     //if there are not quotes at all in the text then exit
524     if (0 == this.quotedText.length) {
525 //System.out.println("TEXT WITH NO QUOTES ENCOUNTERED...");
526       return null;
527     }
528 
529 
530     //1.
531     Annotation bestAntecedent = null;
532 
533     int closestQuoteIndex = java.util.Arrays.binarySearch(this.quotedText,pronoun,ANNOTATION_OFFSET_COMPARATOR);
534     //normalize index
535     if (closestQuoteIndex < 0) {
536       closestQuoteIndex = -closestQuoteIndex -1 -1;
537     }
538 
539     //get closest Quote
540     Quote quoteContext = this.quotedText[closestQuoteIndex];
541 
542     //assure that the pronoun is contained in the quoted text fragment
543     //otherwise exit
544 
545     if (pronoun.getStartNode().getOffset().intValue() > quoteContext.getEndOffset().intValue() ||
546         pronoun.getEndNode().getOffset().intValue() < quoteContext.getStartOffset().intValue()) {
547       //oops, probably incorrect text - I/My/Me is not part of quoted text fragment
548       //exit
549 //System.out.println("Oops! ["+pronounString+"] not part of quoted fragment...");
550       return null;
551     }
552 
553     //get the Persons that precede/succeed the quoted fragment
554     //the order is:
555     //
556     //[1]. if there exists a Person or pronoun in {he, she} following the quoted fragment but
557     //in the same sentence, then use it
558     //i.e.  ["PRN1(x)...", said X ...A, B, C ....]
559     //
560     //[2]. if there is a Person (NOT a pronoun) in the same sentence,
561     // preceding the quote, then use it
562     //i.e. . [A, B, C...X ..."PRN1(x) ..."...]
563     //
564 
565     //try [1]
566     //get the succeeding Persons/pronouns
567     AnnotationSet succCandidates = quoteContext.getAntecedentCandidates(Quote.ANTEC_AFTER);
568     if (false == succCandidates.isEmpty()) {
569       //cool, we have candidates, pick up the one closest to the end quote
570       Iterator it = succCandidates.iterator();
571 
572       while (it.hasNext()) {
573         Annotation currCandidate = (Annotation)it.next();
574         if (null == bestAntecedent || ANNOTATION_OFFSET_COMPARATOR.compare(bestAntecedent,currCandidate) > 0) {
575           //wow, we have a candidate that is closer to the quote
576           bestAntecedent = currCandidate;
577         }
578       }
579     }
580 
581     //try [2]
582     //get the preceding Persons/pronouns
583     if (null == bestAntecedent) {
584       AnnotationSet precCandidates = quoteContext.getAntecedentCandidates(Quote.ANTEC_BEFORE);
585       if (false == precCandidates.isEmpty()) {
586         //cool, we have candidates, pick up the one closest to the end quote
587         Iterator it = precCandidates.iterator();
588 
589         while (it.hasNext()) {
590           Annotation currCandidate = (Annotation)it.next();
591           if (null == bestAntecedent || ANNOTATION_OFFSET_COMPARATOR.compare(bestAntecedent,currCandidate) < 0) {
592             //wow, we have a candidate that is closer to the quote
593             bestAntecedent = currCandidate;
594           }
595         }
596       }
597     }
598 
599     //try [3]
600     //get the Persons/pronouns back in context
601     if (null == bestAntecedent) {
602       AnnotationSet precCandidates = quoteContext.getAntecedentCandidates(Quote.ANTEC_BACK);
603       if (false == precCandidates.isEmpty()) {
604         //cool, we have candidates, pick up the one closest to the end quote
605         Iterator it = precCandidates.iterator();
606 
607         while (it.hasNext()) {
608           Annotation currCandidate = (Annotation)it.next();
609           if (null == bestAntecedent || ANNOTATION_OFFSET_COMPARATOR.compare(bestAntecedent,currCandidate) > 0) {
610             //wow, we have a candidate that is closer to the quote
611             bestAntecedent = currCandidate;
612           }
613         }
614       }
615     }
616 
617     return bestAntecedent;
618   }
619 
620 
621   /** --- */
622   private void preprocess() throws ExecutionException {
623 
624     //0.5 cleanup
625     this.personGender.clear();
626     this.anaphor2antecedent.clear();
627 
628     //1.get all annotation in the input set
629     if ( this.annotationSetName == null || this.annotationSetName.equals("")) {
630       this.defaultAnnotations = this.document.getAnnotations();
631     }
632     else {
633       this.defaultAnnotations = this.document.getAnnotations(annotationSetName);
634     }
635 
636     //if none found, print warning and exit
637     if (this.defaultAnnotations == null || this.defaultAnnotations.isEmpty()) {
638       Err.prln("Coref Warning: No annotations found for processing!");
639       return;
640     }
641 
642 
643 
644     //2.1 remove QT annotations if left from previous execution
645     AnnotationSet qtSet = this.defaultAnnotations.get(QUOTED_TEXT_TYPE);
646     if (null != qtSet) {
647       qtSet.clear();
648     }
649 
650     //2.2. run quoted text transducer to generate "Quoted Text" annotations
651     this.qtTransducer.execute();
652 
653     //3.1 remove pleonastic annotations if left from previous execution
654     AnnotationSet pleonSet = this.defaultAnnotations.get(PLEONASTIC_TYPE);
655     if (null != pleonSet) {
656       pleonSet.clear();
657     }
658 
659     //3.2 run quoted text transducer to generate "Pleonasm" annotations
660     this.pleonTransducer.execute();
661 
662     //4.get all SENTENCE annotations
663     AnnotationSet sentenceAnnotations = this.defaultAnnotations.get(SENTENCE_ANNOTATION_TYPE);
664 
665     this.textSentences = new Sentence[sentenceAnnotations.size()];
666     Object[]  sentenceArray = sentenceAnnotations.toArray();
667 
668     java.util.Arrays.sort(sentenceArray,ANNOTATION_OFFSET_COMPARATOR);
669 
670     for (int i=0; i< sentenceArray.length; i++) {
671 
672       Annotation currSentence = (Annotation)sentenceArray[i];
673       Long sentStartOffset = currSentence.getStartNode().getOffset();
674       Long sentEndOffset = currSentence.getEndNode().getOffset();
675 
676       //4.1. get PERSOSNS in this sentence
677       AnnotationSet sentPersons = this.defaultAnnotations.get(PERSON_ANNOTATION_TYPE,
678                                                               sentStartOffset,
679                                                               sentEndOffset);
680 
681       //4.2. get ORGANIZATIONS in this sentence
682       AnnotationSet sentOrgs = this.defaultAnnotations.get(ORGANIZATION_ANNOTATION_TYPE,
683                                                               sentStartOffset,
684                                                               sentEndOffset);
685 
686       //4.3. get LOCATION in this sentence
687       AnnotationSet sentLocs = this.defaultAnnotations.get(LOCATION_ANNOTATION_TYPE,
688                                                               sentStartOffset,
689                                                               sentEndOffset);
690 
691       //4.5. create a Sentence for thei SENTENCE annotation
692       this.textSentences[i] = new Sentence(i,
693                                             0,
694                                             sentStartOffset,
695                                             sentEndOffset,
696                                             sentPersons,
697                                             sentOrgs,
698                                             sentLocs
699                                   );
700 
701       //4.6. for all PERSONs in the sentence - find their gender using the
702       //orthographic coreferences if the gender of some entity is unknown
703       Iterator itPersons = sentPersons.iterator();
704       while (itPersons.hasNext()) {
705         Annotation currPerson = (Annotation)itPersons.next();
706         String gender = this.findPersonGender(currPerson);
707         this.personGender.put(currPerson,gender);
708       }
709     }
710 
711     //5. initialise the quoted text fragments
712     AnnotationSet sentQuotes = this.defaultAnnotations.get(QUOTED_TEXT_TYPE);
713 
714     //if none then return
715     if (null == sentQuotes) {
716       this.quotedText = new Quote[0];
717     }
718     else {
719       this.quotedText = new Quote[sentQuotes.size()];
720 
721       Object[] quotesArray = sentQuotes.toArray();
722       java.util.Arrays.sort(quotesArray,ANNOTATION_OFFSET_COMPARATOR);
723 
724       for (int i =0; i < quotesArray.length; i++) {
725         this.quotedText[i] = new Quote((Annotation)quotesArray[i],i);
726       }
727     }
728 
729     //6. initialuse the plonastic It annotations
730     AnnotationSet plaonasticSet = this.defaultAnnotations.get(PLEONASTIC_TYPE);
731 
732     if (null == plaonasticSet) {
733       this.pleonasticIt = new Annotation[0];
734     }
735     else {
736       this.pleonasticIt = new Annotation[plaonasticSet.size()];
737 
738       Object[] quotesArray = plaonasticSet.toArray();
739       java.util.Arrays.sort(quotesArray,ANNOTATION_OFFSET_COMPARATOR);
740 
741       for (int i=0; i< this.pleonasticIt.length; i++) {
742         this.pleonasticIt[i] = (Annotation)quotesArray[i];
743       }
744     }
745 
746   }
747 
748 
749   /** --- */
750   private String findPersonGender(Annotation person) {
751 
752     String result = (String)person.getFeatures().get(PERSON_GENDER_FEATURE_NAME);
753 
754     if (null==result) {
755       //gender is unknown - try to find it from the ortho coreferences
756       List orthoMatches  = (List)person.getFeatures().get(ANNOTATION_COREF_FEATURE_NAME);
757 
758       if (null != orthoMatches) {
759         Iterator itMatches = orthoMatches.iterator();
760 
761         while (itMatches.hasNext()) {
762           Integer correferringID = (Integer)itMatches.next();
763           Annotation coreferringEntity = this.defaultAnnotations.get(correferringID);
764           Assert.assertTrue(coreferringEntity.getType().equalsIgnoreCase(PERSON_ANNOTATION_TYPE));
765           String correferringGender = (String)coreferringEntity.getFeatures().get(PERSON_GENDER_FEATURE_NAME);
766 
767           if (null != correferringGender) {
768             result = correferringGender;
769             break;
770           }
771         }
772       }
773     }
774 
775     return result;
776   }
777 
778 
779   /** --- */
780   private static class AnnotationOffsetComparator implements Comparator {
781 
782     private int _getOffset(Object o) {
783 
784       if (o instanceof Annotation) {
785         return ((Annotation)o).getEndNode().getOffset().intValue();
786       }
787       else if (o instanceof Sentence) {
788         return ((Sentence)o).getStartOffset().intValue();
789       }
790       else if (o instanceof Quote) {
791         return ((Quote)o).getStartOffset().intValue();
792       }
793       else if (o instanceof Node) {
794         return ((Node)o).getOffset().intValue();
795       }
796       else {
797         throw new IllegalArgumentException();
798       }
799     }
800 
801     public int compare(Object o1,Object o2) {
802 
803       //0. preconditions
804       Assert.assertNotNull(o1);
805       Assert.assertNotNull(o2);
806       Assert.assertTrue(o1 instanceof Annotation ||
807                         o1 instanceof Sentence ||
808                         o1 instanceof Quote ||
809                         o1 instanceof Node);
810       Assert.assertTrue(o2 instanceof Annotation ||
811                         o2 instanceof Sentence ||
812                         o2 instanceof Quote ||
813                         o2 instanceof Node);
814 
815       int offset1 = _getOffset(o1);
816       int offset2 = _getOffset(o2);
817 
818       return offset1 - offset2;
819     }
820   }
821 
822 
823   /** --- */
824   private Annotation _chooseAntecedent$HE$HIM$HIS$SHE$HER$HIMSELF$(Annotation ant1, Annotation ant2, Annotation pronoun) {
825 
826     //0. preconditions
827     Assert.assertNotNull(ant1);
828     Assert.assertNotNull(ant2);
829     Assert.assertNotNull(pronoun);
830     Assert.assertTrue(pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) ||
831                       pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY));
832     String pronounString = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
833     Assert.assertTrue(pronounString.equalsIgnoreCase("SHE") ||
834                       pronounString.equalsIgnoreCase("HER") ||
835                       pronounString.equalsIgnoreCase("HE") ||
836                       pronounString.equalsIgnoreCase("HIM") ||
837                       pronounString.equalsIgnoreCase("HIS") ||
838                       pronounString.equalsIgnoreCase("HIMSELF"));
839 
840     Long offset1 = ant1.getStartNode().getOffset();
841     Long offset2 = ant2.getStartNode().getOffset();
842     Long offsetPrn = pronoun.getStartNode().getOffset();
843 
844     long diff1 = offsetPrn.longValue() - offset1.longValue();
845     long diff2 = offsetPrn.longValue() - offset2.longValue();
846 //    Assert.assertTrue(diff1 != 0 && diff2 != 0);
847     //reject candidates that overlap with the pronoun
848     if (diff1 == 0) {
849       return ant2;
850     }
851     else if (diff2 == 0) {
852       return ant1;
853     }
854 
855     //get the one CLOSEST AND PRECEDING the pronoun
856     if (diff1 > 0 && diff2 > 0) {
857       //we have [...antecedentA...AntecedentB....pronoun...] ==> choose B
858       if (diff1 < diff2)
859         return ant1;
860       else
861         return ant2;
862     }
863     else if (diff1 < 0 && diff2 < 0) {
864       //we have [...pronoun ...antecedentA...AntecedentB.......] ==> choose A
865       if (Math.abs(diff1) < Math.abs(diff2))
866         return ant1;
867       else
868           return ant2;
869     }
870     else {
871       Assert.assertTrue(Math.abs(diff1 + diff2) < Math.abs(diff1) + Math.abs(diff2));
872       //we have [antecedentA...pronoun...AntecedentB] ==> choose A
873       if (diff1 > 0)
874         return ant1;
875       else
876         return ant2;
877     }
878   }
879 
880   /** --- */
881   private Annotation _chooseAntecedent$IT$ITS$ITSELF$(Annotation ant1, Annotation ant2, Annotation pronoun) {
882 
883     //0. preconditions
884     Assert.assertNotNull(ant1);
885     Assert.assertNotNull(ant2);
886     Assert.assertNotNull(pronoun);
887     Assert.assertTrue(pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) ||
888                       pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY));
889     String pronounString = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
890 
891     Assert.assertTrue(pronounString.equalsIgnoreCase("IT") ||
892                       pronounString.equalsIgnoreCase("ITS") ||
893                       pronounString.equalsIgnoreCase("ITSELF"));
894 
895     Long offset1 = ant1.getStartNode().getOffset();
896     Long offset2 = ant2.getStartNode().getOffset();
897     Long offsetPrn = pronoun.getStartNode().getOffset();
898     long diff1 = offsetPrn.longValue() - offset1.longValue();
899     long diff2 = offsetPrn.longValue() - offset2.longValue();
900 //    Assert.assertTrue(diff1 != 0 && diff2 != 0);
901     //reject candidates that overlap with the pronoun
902     if (diff1 == 0) {
903       return ant2;
904     }
905     else if (diff2 == 0) {
906       return ant1;
907     }
908 
909 
910     //get the one CLOSEST AND PRECEDING the pronoun
911     if (diff1 > 0 && diff2 > 0) {
912       //we have [...antecedentA...AntecedentB....pronoun...] ==> choose B
913       if (diff1 < diff2)
914         return ant1;
915       else
916         return ant2;
917     }
918     else if (diff1 > 0){
919       Assert.assertTrue(Math.abs(diff1 + diff2) < Math.abs(diff1) + Math.abs(diff2));
920       //we have [antecedentA...pronoun...AntecedentB] ==> choose A
921       return ant1;
922     }
923     else if (diff2 > 0){
924       Assert.assertTrue(Math.abs(diff1 + diff2) < Math.abs(diff1) + Math.abs(diff2));
925       //we have [antecedentA...pronoun...AntecedentB] ==> choose A
926       return ant2;
927     }
928     else {
929       //both possible antecedents are BEHIND the anaophoric pronoun - i.e. we have either
930       //cataphora, or nominal antecedent, or an antecedent that is further back in scope
931       //in any case - discard the antecedents
932       return null;
933     }
934   }
935 
936 
937   /** --- */
938   private class Quote {
939 
940     /** --- */
941     public static final int ANTEC_AFTER = 1;
942     /** --- */
943     public static final int ANTEC_BEFORE = 2;
944     /** --- */
945     public static final int ANTEC_BACK = 3;
946     /** --- */
947     private AnnotationSet antecedentsBefore;
948     /** --- */
949     private AnnotationSet antecedentsAfter;
950     /** --- */
951     private AnnotationSet antecedentsBackInContext;
952     /** --- */
953     private Annotation quoteAnnotation;
954     /** --- */
955     private int quoteIndex;
956 
957     /** --- */
958     public Quote(Annotation quoteAnnotation, int index) {
959 
960       this.quoteAnnotation = quoteAnnotation;
961       this.quoteIndex = index;
962       init();
963     }
964 
965     /** --- */
966     private void init() {
967 
968       //0.preconditions
969       Assert.assertNotNull(textSentences);
970 
971       //0.5 create a restriction for PRP pos tokens
972       FeatureMap prpTokenRestriction = new SimpleFeatureMapImpl();
973       prpTokenRestriction.put(TOKEN_CATEGORY_FEATURE_NAME,PRP_CATEGORY);
974 
975       //1. generate the precPersons set
976 
977       //1.1 locate the sentece containing the opening quote marks
978       int quoteStartPos = java.util.Arrays.binarySearch(textSentences,
979                                                         this.quoteAnnotation.getStartNode(),
980                                                         ANNOTATION_OFFSET_COMPARATOR);
981 
982       //normalize index
983       int startSentenceIndex = quoteStartPos >= 0 ? quoteStartPos
984                                                   : -quoteStartPos -1 -1; // blame Sun, not me
985 
986       //1.2. get the persons and restrict to these that precede the quote (i.e. not contained
987       //in the quote)
988       this.antecedentsBefore = generateAntecedentCandidates(startSentenceIndex,
989                                                             this.quoteIndex,
990                                                             ANTEC_BEFORE);
991 
992 
993       //2. generate the precPersonsInCOntext set
994       //2.1. get the persons from the sentence precedeing the sentence containing the quote start
995       if (startSentenceIndex > 0) {
996         this.antecedentsBackInContext = generateAntecedentCandidates(startSentenceIndex -1,
997                                                                     this.quoteIndex,
998                                                                     ANTEC_BACK);
999       }
1000
1001      //2. generate the succ  Persons set
1002      //2.1 locate the sentece containing the closing quote marks
1003      int quoteEndPos = java.util.Arrays.binarySearch(textSentences,
1004                                                        this.quoteAnnotation.getEndNode(),
1005                                                        ANNOTATION_OFFSET_COMPARATOR);
1006
1007      //normalize it
1008      int endSentenceIndex = quoteEndPos >= 0 ? quoteEndPos
1009                                              : -quoteEndPos -1 -1; // blame Sun, not me
1010      this.antecedentsAfter = generateAntecedentCandidates(endSentenceIndex,
1011                                                            this.quoteIndex,
1012                                                            ANTEC_AFTER);
1013      //generate t
1014    }
1015
1016
1017    /** --- */
1018    private AnnotationSet generateAntecedentCandidates(int sentenceNumber,
1019                                                        int quoteNumber ,
1020                                                        int mode) {
1021
1022      //0. preconditions
1023      Assert.assertTrue(sentenceNumber >=0);
1024      Assert.assertTrue(quoteNumber >=0);
1025      Assert.assertTrue(mode == Quote.ANTEC_AFTER ||
1026                        mode == Quote.ANTEC_BEFORE ||
1027                        mode == Quote.ANTEC_BACK);
1028
1029      //1. get sentence
1030     Sentence sentence = textSentences[sentenceNumber];
1031
1032      //2. get the persons
1033      AnnotationSet antecedents = new AnnotationSetImpl(sentence.getPersons());
1034
1035      //4. now get the he/she pronouns in the relevant context
1036      AnnotationSet annotations = null;
1037
1038      switch(mode) {
1039
1040        case ANTEC_BEFORE:
1041          annotations = defaultAnnotations.getContained(sentence.getStartOffset(),
1042                                                      this.getStartOffset());
1043          break;
1044
1045        case ANTEC_AFTER:
1046          annotations = defaultAnnotations.getContained(this.getEndOffset(),
1047                                                     sentence.getEndOffset());
1048          break;
1049
1050        case ANTEC_BACK:
1051          annotations = defaultAnnotations.getContained(sentence.getStartOffset(),
1052                                                     sentence.getEndOffset());
1053          break;
1054      }
1055
1056      //4. get the pronouns
1057      //restrict to he/she pronouns
1058      if (null != annotations) {
1059        AnnotationSet pronouns = annotations.get(TOKEN_ANNOTATION_TYPE,PRP_RESTRICTION);
1060
1061        if (null != pronouns) {
1062
1063          Iterator it = pronouns.iterator();
1064          while (it.hasNext()) {
1065            Annotation currPronoun = (Annotation)it.next();
1066            //add to succPersons only if HE/SHE
1067            String pronounString = (String)currPronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1068
1069            if (null != pronounString &&
1070                (pronounString.equalsIgnoreCase("he") || pronounString.equalsIgnoreCase("she"))
1071                )
1072              antecedents.add(currPronoun);
1073          }//while
1074        }//if
1075      }//if
1076
1077
1078      //3. depending on the mode, may have to restrict persons to these that precede/succeed
1079      //the quoted fragment
1080      //
1081      //for ANTEC_BEFORE, get the ones #preceding# the quote, contained in the sentence where
1082      //the quote *starts*
1083      //
1084      //for ANTEC_AFTER, get the ones #succeeding# the quote, contained in the sentence where
1085      //the quote *ends*
1086      //
1087      //for ANTEC_BACK, we are operating in the context of the sentence previous to the
1088      //sentence where the quote starts. I.e. we're resolbinf a case like
1089      // [sss "q1q1q1q1" s1s1s1s1]["q2q2q2q2"]
1090      //...and we want to get the entities from the s1s1 part - they *succeed* the #previous# quote
1091      //Note that the cirrent sentence is the first one, not the second
1092      //
1093      Iterator itPersons = antecedents.iterator();
1094
1095      while (itPersons.hasNext()) {
1096        Annotation currPerson = (Annotation)itPersons.next();
1097
1098        //cut
1099        if (Quote.ANTEC_BEFORE == mode &&
1100            currPerson.getStartNode().getOffset().intValue() > getStartOffset().intValue()) {
1101          //restrict only to persosn preceding
1102          itPersons.remove();
1103        }
1104        else if (Quote.ANTEC_AFTER == mode &&
1105                currPerson.getStartNode().getOffset().intValue() < getEndOffset().intValue()) {
1106          //restrict only to persons succeeding the quote
1107          itPersons.remove();
1108        }
1109        else if (Quote.ANTEC_BACK == mode) {
1110          //this one is tricky
1111          //locate the quote previous to the one we're resolving
1112          //(since we're operating in the sentence previous to the quote being resolved
1113          //wew try to find if any quote (prevQuote) exist in this sentence and get the
1114          //persons succeeding it)
1115
1116          //get prev quote
1117          //is the curr quote the first one?
1118          if (quoteNumber >0) {
1119            Quote prevQuote = PronominalCoref.this.quotedText[quoteNumber-1];
1120
1121            //restrict to the succeeding persons
1122            if (currPerson.getStartNode().getOffset().longValue() < prevQuote.getEndOffset().longValue()) {
1123              itPersons.remove();
1124            }
1125          }
1126        }
1127      }
1128
1129      return antecedents;
1130    }
1131
1132    /** --- */
1133    public Long getStartOffset() {
1134      return this.quoteAnnotation.getStartNode().getOffset();
1135    }
1136
1137    /** --- */
1138    public Long getEndOffset() {
1139      return this.quoteAnnotation.getEndNode().getOffset();
1140    }
1141
1142    /** --- */
1143    public AnnotationSet getAntecedentCandidates(int type) {
1144
1145      switch(type) {
1146
1147        case ANTEC_AFTER:
1148          return this.antecedentsAfter;
1149
1150        case ANTEC_BEFORE:
1151          return this.antecedentsBefore;
1152
1153        case ANTEC_BACK:
1154          return this.antecedentsBackInContext;
1155
1156        default:
1157          throw new IllegalArgumentException();
1158      }
1159    }
1160
1161  }
1162
1163
1164  /** --- */
1165  private class Sentence {
1166
1167    /** --- */
1168    private int sentNumber;
1169    /** --- */
1170    private int paraNumber;
1171    /** --- */
1172    private Long startOffset;
1173    /** --- */
1174    private Long endOffset;
1175    /** --- */
1176    private AnnotationSet persons;
1177    /** --- */
1178    private AnnotationSet organizations;
1179    /** --- */
1180    private AnnotationSet locations;
1181
1182    /** --- */
1183    public Sentence(int sentNumber,
1184                    int paraNumber,
1185                    Long startOffset,
1186                    Long endOffset,
1187                    AnnotationSet persons,
1188                    AnnotationSet organizations,
1189                    AnnotationSet locations) {
1190
1191      this.sentNumber = sentNumber;
1192      this.paraNumber = paraNumber;
1193      this.startOffset = startOffset;
1194      this.endOffset = endOffset;
1195      this.persons = persons;
1196      this.organizations = organizations;
1197      this.locations = locations;
1198    }
1199
1200    /** --- */
1201    public Long getStartOffset() {
1202      return this.startOffset;
1203    }
1204
1205    /** --- */
1206    public Long getEndOffset() {
1207      return this.endOffset;
1208    }
1209
1210    /** --- */
1211    public AnnotationSet getPersons() {
1212      return this.persons;
1213    }
1214
1215    /** --- */
1216    public AnnotationSet getOrganizations() {
1217      return this.organizations;
1218    }
1219
1220    /** --- */
1221    public AnnotationSet getLocations() {
1222      return this.locations;
1223    }
1224  }
1225
1226}