1   /*
2    *  OrthoMatcher.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Kalina Bontcheva, 24/August/2001
12   *
13   *  $Id: OrthoMatcher.java,v 1.43 2002/04/12 14:35:22 kalina Exp $
14   */
15  
16  
17  package gate.creole.orthomatcher;
18  
19  import gate.*;
20  import gate.util.*;
21  import gate.creole.*;
22  import gate.corpora.*;
23  import gate.annotation.*;
24  import java.util.*;
25  import java.io.*;
26  import java.net.*;
27  import gnu.regexp.*;
28  
29  public class OrthoMatcher extends AbstractLanguageAnalyser
30                            implements ANNIEConstants{
31  
32    public static final String
33      OM_DOCUMENT_PARAMETER_NAME = "document";
34  
35    public static final String
36      OM_ANN_SET_PARAMETER_NAME = "annotationSetName";
37  
38    public static final String
39      OM_CASE_SENSITIVE_PARAMETER_NAME = "caseSensitive";
40  
41    public static final String
42      OM_ANN_TYPES_PARAMETER_NAME = "annotationTypes";
43  
44    public static final String
45      OM_ORG_TYPE_PARAMETER_NAME = "organizationType";
46  
47    public static final String
48      OM_PERSON_TYPE_PARAMETER_NAME = "personType";
49  
50    public static final String
51      OM_EXT_LISTS_PARAMETER_NAME = "extLists";
52  
53    protected static final String CDGLISTNAME = "cdg";
54    protected static final String ALIASLISTNAME = "alias";
55    protected static final String ARTLISTNAME = "def_art";
56    protected static final String PREPLISTNAME = "prepos";
57    protected static final String CONNECTORLISTNAME = "connector";
58    protected static final String SPURLISTNAME = "spur_match";
59  
60    protected static final String PUNCTUATION_VALUE = "punctuation";
61    protected static final String THE_VALUE = "The";
62  
63  
64    /**the name of the annotation set*/
65    protected String annotationSetName;
66  
67    /** the types of the annotation */
68    protected List annotationTypes = new ArrayList(10);
69  
70    /** the organization type*/
71    protected String organizationType = ORGANIZATION_ANNOTATION_TYPE;
72  
73    /** the person type*/
74    protected String personType = PERSON_ANNOTATION_TYPE;
75  
76    protected String unknownType = "Unknown";
77  
  /** true to rely on the lists created in init(); false to build the
   *  lookup tables from the document's annotations in execute() */
79    protected boolean extLists = true;
80  
81    /** matching unknowns or not*/
82    protected boolean matchingUnknowns = true;
83  
  /** Internal flag indicating that the match was made by a rule
   *  which requires the newly matched annotation to match all the
   *  other annotations in the matches list as well.
   *  This is needed because organizations can share
   *  first/last tokens (e.g., News) and still be different.
   */
90    private   boolean allMatchingNeeded = false;
91  
  /** Orthomatching is not case-sensitive by default */
93    protected boolean caseSensitive = false;
94  
95    protected FeatureMap queryFM = Factory.newFeatureMap();
96  
97  //  protected ExecutionException executionException;
98  
99    // name lookup tables (used for namematch)
100   //gave them bigger default size, coz rehash is expensive
101   protected HashMap alias = new HashMap(100);
102   protected HashSet cdg = new HashSet(50);
103   protected HashMap spur_match = new HashMap(100);
104   protected HashMap def_art = new HashMap(20);
105   protected HashMap connector = new HashMap(20);
106   protected HashMap prepos = new HashMap(30);
107 
108 
109   protected AnnotationSet nameAllAnnots = null;
110   protected HashMap processedAnnots = new HashMap(150);
111   protected HashMap annots2Remove = new HashMap(75);
112   protected List matchesDocFeature = new ArrayList();
113   //maps annotation ids to array lists of tokens
114   protected HashMap tokensMap = new HashMap(150);
115 
116   protected Annotation shortAnnot, longAnnot;
117 
118   protected ArrayList tokensLongAnnot, tokensShortAnnot;
119 
120   /** a feature map to be used when retrieving annotations
121    *  declared here so can be reused for efficiency
122    *  clear() before each use
123    */
124   protected FeatureMap tempMap = Factory.newFeatureMap();
125 
126   /** a buffer in order to read an array of char */
127   private char[] cbuffer = null;
128 
129   /** the size of the buffer */
130   private final static int BUFF_SIZE = 65000;
131 
132   /** @link dependency */
133   /*#OrthoMatcher lnkOrthoMatcher;*/
134 
135   public OrthoMatcher () {
136     annotationTypes.add(organizationType);
137     annotationTypes.add(personType);
138     annotationTypes.add("Location");
139     annotationTypes.add("Date");
140   }
141 
142   /** Initialise this resource, and return it. */
143   public Resource init() throws ResourceInstantiationException {
144     cbuffer = new char[BUFF_SIZE];
145 
146     //initialise the list of annotations which we will match
147     try {
148       createLists();
149     } catch (IOException ioe) {ioe.printStackTrace();}
150     return this;
151   } // init()
152 
153   /**  Run the resource. It doesn't make sense not to override
154     *  this in subclasses so the default implementation signals an
155     *  exception.
156     */
157   public void execute() throws ExecutionException{
158 
159     //check the input
160     if(document == null) {
161       throw new ExecutionException(
162         "No document for namematch!"
163       );
164     }
165 
166     // get the annotations from document
167     if ((annotationSetName == null)|| (annotationSetName.equals("")))
168       nameAllAnnots = document.getAnnotations();
169     else
170       nameAllAnnots = document.getAnnotations(annotationSetName);
171 
172     //if none found, print warning and exit
173     if ((nameAllAnnots == null) || nameAllAnnots.isEmpty()) {
174       Out.prln("OrthoMatcher Warning: No annotations found for processing");
175       return;
176     }
177 
178     //check if we've been run on this document before
179     //and clean the doc if needed
180     docCleanup();
181     Map matchesMap = (Map)document.getFeatures().
182                      get(DOCUMENT_COREF_FEATURE_NAME);
183 
184     // creates the cdg list from the document
185     //no need to create otherwise, coz already done in init()
186     if (!extLists)
187       buildTables(nameAllAnnots);
188 
189     //first match all name annotations
190     matchNameAnnotations();
191 
192     //then match the unknown ones to all name ones
193     if (matchingUnknowns)
194       matchUnknown();
195 
196     // set the matches of the document
197 //    determineMatchesDocument();
198     if (! matchesDocFeature.isEmpty()) {
199       if(matchesMap == null){
200         matchesMap = new HashMap();
201       }
202       matchesMap.put(nameAllAnnots.getName(), matchesDocFeature);
203       //we need to put it even if it was already present in order to triger
204       //the update events
205       document.getFeatures().put(DOCUMENT_COREF_FEATURE_NAME, matchesMap);
206 
207       //cannot do clear() as this has already been put on the document
208       //so I need a new one for the next run of matcher
209       matchesDocFeature = new ArrayList();
210     }
211 
212 //    Out.prln("Processed strings" + processedAnnots.values());
213     //clean-up the internal data structures for next run
214     nameAllAnnots = null;
215     processedAnnots.clear();
216     annots2Remove.clear();
217     tokensMap.clear();
218     matchesDocFeature = new ArrayList();
219     longAnnot = null;
220     shortAnnot = null;
221     tokensLongAnnot = null;
222     tokensShortAnnot = null;
223 
224   } // run()
225 
226   protected void matchNameAnnotations() throws ExecutionException{
227     // go through all the annotation types
228     Iterator iterAnnotationTypes = annotationTypes.iterator();
229     while (iterAnnotationTypes.hasNext()) {
230       String annotationType = (String)iterAnnotationTypes.next();
231 
232       AnnotationSet nameAnnots = nameAllAnnots.get(annotationType);
233 
234       // continue if no such annotations exist
235       if ((nameAnnots == null) || nameAnnots.isEmpty())
236         continue;
237 
238       Iterator iterNames = nameAnnots.iterator();
239       while (iterNames.hasNext()) {
240         Annotation nameAnnot = (Annotation) iterNames.next();
241         Integer id = nameAnnot.getId();
242 
243         // get string and value
244         String annotString = null;
245         try {
246             annotString = document.getContent().getContent(
247             nameAnnot.getStartNode().getOffset(),
248             nameAnnot.getEndNode().getOffset()
249             ).toString();
250           // now do the reg. exp. substitutions
251           annotString = regularExpressions(annotString," ", "\\s+");
252 
253         } catch (InvalidOffsetException ioe) {
254             throw new ExecutionException
255                                    ("Invalid offset of the annotation");
256         }
257         //convert to lower case if we are not doing a case sensitive match
258         if (!caseSensitive)
259           annotString = annotString.toLowerCase();
260 
261         //get the tokens
262         List tokens = new ArrayList((Set)
263                         nameAllAnnots.get(TOKEN_ANNOTATION_TYPE,
264                           nameAnnot.getStartNode().getOffset(),
265                           nameAnnot.getEndNode().getOffset()
266                         ));
267         //if no tokens to match, do nothing
268         if (tokens.isEmpty())
269           continue;
270         Collections.sort(tokens, new gate.util.OffsetComparator());
271         //check if these actually do not end after the name
272         //needed coz new tokeniser conflates
273         //strings with dashes. So British Gas-style is two tokens
274         //instead of three. So cannot match properly British Gas
275 //        tokens = checkTokens(tokens);
276         tokensMap.put(nameAnnot.getId(), tokens);
277 
278 //        Out.prln("Matching annot " + nameAnnot + ": string " + annotString);
279 
280         //first check whether we have not matched such a string already
281         //if so, just consider it matched, don't bother calling the rules
282         if (processedAnnots.containsValue(annotString)) {
283 //          Out.prln("Contained string found " + annotString);
284           updateMatches(nameAnnot, annotString);
285           processedAnnots.put(nameAnnot.getId(), annotString);
286           continue;
287         } else if (processedAnnots.isEmpty()) {
288           processedAnnots.put(nameAnnot.getId(), annotString);
289           continue;
290         }
291 
292         //if a person, then remove their title before matching
293         if (nameAnnot.getType().equals(personType))
294           annotString = containTitle(annotString, nameAnnot);
295         else if (nameAnnot.getType().equals(organizationType))
296           annotString = stripCDG(annotString, nameAnnot);
297 
298         if(null == annotString || "".equals(annotString))
299           continue;
300 
301         //otherwise try matching with previous annotations
302         matchWithPrevious(nameAnnot, annotString);
303 
304 //        Out.prln("Putting in previous " + nameAnnot + ": string " + annotString);
305         //finally add the current annotations to the processed map
306         processedAnnots.put(nameAnnot.getId(), annotString);
307       }//while through name annotations
308 
309     }//while through annotation types
310 
311   }
312 
313   protected void matchUnknown() throws ExecutionException {
314     //get all Unknown annotations
315     AnnotationSet unknownAnnots = nameAllAnnots.get(unknownType);
316 
317     if ((unknownAnnots == null) || unknownAnnots.isEmpty())
318       return;
319 
320     Iterator iter = unknownAnnots.iterator();
321     //loop through the unknown annots
322     while (iter.hasNext()) {
323       Annotation unknown = (Annotation) iter.next();
324 
325       // get string and value
326       String unknownString = null;
327       try {
328           unknownString = document.getContent().getContent(
329             unknown.getStartNode().getOffset(),
330             unknown.getEndNode().getOffset()
331             ).toString();
332         // now do the reg. exp. substitutions
333         unknownString = regularExpressions(unknownString," ", "\\s+");
334       } catch (InvalidOffsetException ioe) {
335           throw new ExecutionException
336                                  ("Invalid offset of the annotation");
337       }
338       //convert to lower case if we are not doing a case sensitive match
339       if (!caseSensitive)
340         unknownString = unknownString.toLowerCase();
341 
342       //get the tokens
343       List tokens = new ArrayList((Set)
344                       nameAllAnnots.get(TOKEN_ANNOTATION_TYPE,
345                         unknown.getStartNode().getOffset(),
346                         unknown.getEndNode().getOffset()
347                       ));
348       if (tokens.isEmpty())
349         continue;
350       Collections.sort(tokens, new gate.util.OffsetComparator());
351       tokensMap.put(unknown.getId(), tokens);
352 
353 
354       //first check whether we have not matched such a string already
355       //if so, just consider it matched, don't bother calling the rules
356       if (processedAnnots.containsValue(unknownString)) {
357         Annotation matchedAnnot = updateMatches(unknown, unknownString);
358 //        Out.prln("Matched " + unknown + "with string " + unknownString);
359 //        Out.prln("That's same as " + matchedAnnot);
360         if (matchedAnnot.getType().equals(unknownType)) {
361           annots2Remove.put(unknown.getId(),
362                             annots2Remove.get(matchedAnnot.getId()));
363         }
364         else
365           annots2Remove.put(unknown.getId(), matchedAnnot.getType());
366         processedAnnots.put(unknown.getId(), unknownString);
367         unknown.getFeatures().put("NMRule", unknownType);
368         continue;
369       }
370 
371       //check if we should do sub-string matching in case it's hyphenated
372       //for example US-led
373       if (tokens.size() == 1
374           && "hyphen".equals(unknown.getFeatures().get(TOKEN_KIND_FEATURE_NAME))) {
375         if (matchHyphenatedUnknowns(unknown, unknownString, iter))
376           continue;
377       }//if
378 
379       matchWithPrevious(unknown, unknownString);
380 
381     } //while though unknowns
382 
383     if (! annots2Remove.isEmpty()) {
384       Iterator unknownIter = annots2Remove.keySet().iterator();
385       while (unknownIter.hasNext()) {
386         Integer unknId = (Integer) unknownIter.next();
387         Annotation unknown = nameAllAnnots.get(unknId);
388         Integer newID = nameAllAnnots.add(
389           unknown.getStartNode(),
390           unknown.getEndNode(),
391           (String) annots2Remove.get(unknId),
392           unknown.getFeatures()
393         );
394         nameAllAnnots.remove(unknown);
395 
396         //change the id in the matches list
397         List mList = (List)unknown.getFeatures().
398                      get(ANNOTATION_COREF_FEATURE_NAME);
399         mList.remove(unknId);
400         mList.add(newID);
401       }//while
402     }//if
403   }
404 
405   private boolean matchHyphenatedUnknowns(Annotation unknown, String unknownString,
406                                        Iterator iter){
407     boolean matched = false;
408 
409     //only take the substring before the hyphen
410     int stringEnd = unknownString.indexOf("-");
411     unknownString = unknownString.substring(0, stringEnd);
412     //check if we've already matched this string
413     //because only exact match of the substring are considered
414     if (processedAnnots.containsValue(unknownString)) {
415       matched = true;
416       Annotation matchedAnnot = updateMatches(unknown, unknownString);
417       //only do the matching if not a person, because we do not match
418       //those on sub-strings
419       iter.remove();
420       String newType;
421       if (matchedAnnot.getType().equals(unknownType))
422         newType = (String)annots2Remove.get(matchedAnnot.getId());
423       else
424         newType = matchedAnnot.getType();
425 
426       Integer newID = new Integer(-1);
427       try {
428         newID = nameAllAnnots.add(
429           unknown.getStartNode().getOffset(),
430           new Long(unknown.getStartNode().getOffset().longValue()
431                   + stringEnd),
432           newType,
433           unknown.getFeatures()
434         );
435       } catch (InvalidOffsetException ex) {
436         throw new GateRuntimeException(ex.getMessage());
437       }
438       nameAllAnnots.remove(unknown);
439 
440       //change the id in the matches list
441       List mList = (List)unknown.getFeatures().
442                    get(ANNOTATION_COREF_FEATURE_NAME);
443       mList.remove(unknown.getId());
444       mList.add(newID);
445 
446     }
447     return matched;
448   }
449 
450   protected void matchWithPrevious(Annotation nameAnnot, String annotString) {
451     boolean matchedUnknown = false;
452 
453     Iterator prevIter = processedAnnots.keySet().iterator();
454     while (prevIter.hasNext()) {
455       Integer prevId = (Integer) prevIter.next();
456       Annotation prevAnnot = nameAllAnnots.get(prevId);
457 
458       //check if the two are from the same type or the new one is unknown
459       if (prevAnnot == null || (! prevAnnot.getType().equals(nameAnnot.getType())
460           && ! nameAnnot.getType().equals(unknownType))
461          )
462         continue;
463       //do not compare two unknown annotations either
464       //they are only matched to those of known types
465       if (  nameAnnot.getType().equals(unknownType)
466             && prevAnnot.getType().equals(unknownType))
467       continue;
468 
469       //check if we have already matched this annotation to the new one
470       if (matchedAlready(nameAnnot, prevAnnot) )
471         continue;
472 
473       //now changed to a rule, here we just match by gender
474       if (prevAnnot.getType().equals(personType)) {
475         String prevGender =
476           (String) prevAnnot.getFeatures().get(PERSON_GENDER_FEATURE_NAME);
477         String nameGender =
478           (String) nameAnnot.getFeatures().get(PERSON_GENDER_FEATURE_NAME);
479         if (   prevGender != null
480             && nameGender != null
481             && ( (nameGender.equalsIgnoreCase("female")
482                   &&
483                   prevGender.equalsIgnoreCase("male")
484                   )
485                ||
486                   (prevGender.equalsIgnoreCase("female")
487                    && nameGender.equalsIgnoreCase("male")
488                   )
489                 )
490             ) //if condition
491           continue; //we don't have a match if the two genders are different
492 
493       }//if
494 
495       //if the two annotations match
496       if (matchAnnotations(nameAnnot, annotString,  prevAnnot)) {
497 //        Out.prln("Matched " + shortName + "and " + longName);
498         updateMatches(nameAnnot, prevAnnot);
499         //if unknown annotation, we need to change to the new type
500         if (nameAnnot.getType().equals(unknownType)) {
501           matchedUnknown = true;
502           if (prevAnnot.getType().equals(unknownType))
503             annots2Remove.put(nameAnnot.getId(),
504                               annots2Remove.get(prevAnnot.getId()));
505           else
506             annots2Remove.put(nameAnnot.getId(), prevAnnot.getType());
507          //also put an attribute to indicate that
508           nameAnnot.getFeatures().put("NMRule", unknownType);
509         }//if unknown
510         break; //no need to match further
511       }//if annotations matched
512 
513     }//while through previous annotations
514 
515     if (matchedUnknown)
516       processedAnnots.put(nameAnnot.getId(), annotString);
517 
518 
519   }//matchWithPrevious
520 
521   protected boolean matchAnnotations(Annotation newAnnot, String annotString,
522                                      Annotation prevAnnot) {
523     //do not match two annotations that overlap
524     if (newAnnot.overlaps(prevAnnot))
525       return false;
526 
527     // find which annotation string of the two is longer
528     //  this is useful for some of the matching rules
529     String prevAnnotString = (String) processedAnnots.get(prevAnnot.getId());
530 
531     String longName = prevAnnotString;
532     String shortName = annotString;
533     longAnnot = prevAnnot;
534     shortAnnot = newAnnot;
535 
536     if (shortName.length()>=longName.length()) {
537       String temp = longName;
538       longName = shortName;
539       shortName = temp;
540       Annotation tempAnn = longAnnot;
541       longAnnot = shortAnnot;
542       shortAnnot = tempAnn;
543     }//if
544 
545     tokensLongAnnot = (ArrayList) tokensMap.get(longAnnot.getId());
546     tokensShortAnnot = (ArrayList) tokensMap.get(shortAnnot.getId());
547 
548     List matchesList = (List) prevAnnot.getFeatures().
549                               get(ANNOTATION_COREF_FEATURE_NAME);
550     if (matchesList == null || matchesList.isEmpty())
551       return apply_rules_namematch(prevAnnot.getType(), shortName,longName);
552 
553     //if these two match, then let's see if all the other matching one will too
554     //that's needed, because sometimes names can share a token (e.g., first or
555     //last but not be the same
556     if (apply_rules_namematch(prevAnnot.getType(), shortName,longName)) {
557       /**
558        * Check whether we need to ensure that there is a match with the rest
559        * of the matching annotations, because the rule requires that
560        * transtivity is not assummed.
561        */
562       if (allMatchingNeeded) {
563         allMatchingNeeded = false;
564 
565         List toMatchList = new ArrayList(matchesList);
566   //      if (newAnnot.getType().equals(unknownType))
567   //        Out.prln("Matching new " + annotString + " with annots " + toMatchList);
568         toMatchList.remove(prevAnnot.getId());
569 
570         return matchOtherAnnots(toMatchList, newAnnot, annotString);
571       } else
572         return true;
573     }
574     return false;
575   }
576 
577   /** This method checkes whether the new annotation matches
578    *  all annotations given in the toMatchList (it contains ids)
579    *  The idea is that the new annotation needs to match all those,
580    *  because assuming transitivity does not always work, when
581    *  two different entities share a common token: e.g., BT Cellnet
582    *  and BT and British Telecom.
583   */
584   protected boolean matchOtherAnnots( List toMatchList, Annotation newAnnot,
585                                       String annotString) {
586 
587     //if the list is empty, then we're matching all right :-)
588     if (toMatchList.isEmpty())
589       return true;
590 
591     boolean matchedAll = true;
592     int i = 0;
593 
594     while (matchedAll && i < toMatchList.size()) {
595       Annotation prevAnnot = nameAllAnnots.get((Integer) toMatchList.get(i));
596 
597       // find which annotation string of the two is longer
598       //  this is useful for some of the matching rules
599       String prevAnnotString = (String) processedAnnots.get(prevAnnot.getId());
600       if (prevAnnotString == null)
601         try {
602           prevAnnotString = document.getContent().getContent(
603             prevAnnot.getStartNode().getOffset(),
604             prevAnnot.getEndNode().getOffset()
605             ).toString();
606         } catch (InvalidOffsetException ioe) {
607           return false;
608         }//try
609 
610 
611       String longName = prevAnnotString;
612       String shortName = annotString;
613       longAnnot = prevAnnot;
614       shortAnnot = newAnnot;
615 
616       if (shortName.length()>=longName.length()) {
617         String temp = longName;
618         longName = shortName;
619         shortName = temp;
620         Annotation tempAnn = longAnnot;
621         longAnnot = shortAnnot;
622         shortAnnot = tempAnn;
623       }//if
624 
625       tokensLongAnnot = (ArrayList) tokensMap.get(longAnnot.getId());
626       tokensShortAnnot = (ArrayList) tokensMap.get(shortAnnot.getId());
627 
628       matchedAll = apply_rules_namematch(prevAnnot.getType(), shortName,longName);
629 //      if (newAnnot.getType().equals(unknownType))
630 //        Out.prln("Loop: " + shortName + " and " + longName + ": result: " + matchedAll);
631 
632       i++;
633     }//while
634     return matchedAll;
635   }
636 
637 
638   protected boolean matchedAlready(Annotation annot1, Annotation annot2) {
639     //the two annotations are already matched if the matches list of the first
640     //contains the id of the second
641     List matchesList = (List) annot1.getFeatures().
642                        get(ANNOTATION_COREF_FEATURE_NAME);
643     if ((matchesList == null) || matchesList.isEmpty())
644       return false;
645     else if (matchesList.contains(annot2.getId()))
646       return true;
647     return false;
648   }
649 
650   protected Annotation updateMatches(Annotation newAnnot, String annotString) {
651     Annotation matchedAnnot = null;
652     Integer id;
653 
654     //first find a processed annotation with the same string
655     Iterator iter = processedAnnots.keySet().iterator();
656     while (iter.hasNext()) {
657       id = (Integer) iter.next();
658       String oldString = (String) processedAnnots.get(id);
659       if (annotString.equals(oldString)) {
660         matchedAnnot = nameAllAnnots.get(id);
661         break;
662       }//if
663     }//while
664 
665     if (matchedAnnot == null) return null;
666     //if the two matching annotations are of different type which is not
667     //unknown, do not match them
668     if (! matchedAnnot.getType().equals(newAnnot.getType())
669         && !newAnnot.getType().equals(unknownType) )
670       return matchedAnnot;
671 
672     List matchesList = (List) matchedAnnot.getFeatures().
673                        get(ANNOTATION_COREF_FEATURE_NAME);
674     if ((matchesList == null) || matchesList.isEmpty()) {
675       //no previous matches, so need to add
676       if (matchesList == null) {
677         matchesList = new ArrayList();
678         matchedAnnot.getFeatures().put(ANNOTATION_COREF_FEATURE_NAME,
679                                        matchesList);
680         matchesDocFeature.add(matchesList);
681       }//if
682       matchesList.add(matchedAnnot.getId());
683       matchesList.add(newAnnot.getId());
684     } else {
685       //just add the new annotation
686       matchesList.add(newAnnot.getId());
687     }//if
688     //add the matches list to the new annotation
689     newAnnot.getFeatures().put(ANNOTATION_COREF_FEATURE_NAME, matchesList);
690     return matchedAnnot;
691   }
692 
693   protected void updateMatches(Annotation newAnnot, Annotation prevAnnot) {
694 
695     List matchesList = (List) prevAnnot.getFeatures().
696                               get(ANNOTATION_COREF_FEATURE_NAME);
697     if ((matchesList == null) || matchesList.isEmpty()) {
698       //no previous matches, so need to add
699       if (matchesList == null) {
700         matchesList = new ArrayList();
701         prevAnnot.getFeatures().put(ANNOTATION_COREF_FEATURE_NAME, matchesList);
702         matchesDocFeature.add(matchesList);
703       }//if
704       matchesList.add(prevAnnot.getId());
705       matchesList.add(newAnnot.getId());
706     } else {
707       //just add the new annotation
708       matchesList.add(newAnnot.getId());
709     }//if
710     //add the matches list to the new annotation
711     newAnnot.getFeatures().put(ANNOTATION_COREF_FEATURE_NAME, matchesList);
712     //propagate the gender if two persons are matched
713     if (prevAnnot.getType().equals(personType)) {
714       String prevGender =
715         (String) prevAnnot.getFeatures().get(PERSON_GENDER_FEATURE_NAME);
716       String newGender =
717         (String) newAnnot.getFeatures().get(PERSON_GENDER_FEATURE_NAME);
718       boolean unknownPrevGender = isUnknownGender(prevGender);
719       boolean unknownNewGender = isUnknownGender(newGender);
720       if (unknownPrevGender && !unknownNewGender)
721         prevAnnot.getFeatures().put(PERSON_GENDER_FEATURE_NAME, newGender);
722       else if (unknownNewGender && !unknownPrevGender)
723         newAnnot.getFeatures().put(PERSON_GENDER_FEATURE_NAME, prevGender);
724     }//if
725   }
726 
727 
728   protected void docCleanup() {
729     Object matchesValue = document.getFeatures().get(DOCUMENT_COREF_FEATURE_NAME);
730     if (matchesValue != null && (matchesValue instanceof Map))
731       ((Map)matchesValue).remove(nameAllAnnots.getName());
732     else if (matchesValue != null) {
733       document.getFeatures().put(DOCUMENT_COREF_FEATURE_NAME, new HashMap());
734     }
735 
736     //get all annotations that have a matches feature
737     HashSet fNames = new HashSet();
738     fNames.add(ANNOTATION_COREF_FEATURE_NAME);
739     AnnotationSet annots =
740                   nameAllAnnots.get(null, fNames);
741 
742 //    Out.prln("Annots to cleanup" + annots);
743 
744     if (annots == null || annots.isEmpty())
745       return;
746 
747     Iterator iter = annots.iterator();
748     while (iter.hasNext()) {
749       while (iter.hasNext())
750         ((Annotation) iter.next()).getFeatures().
751                                    remove(ANNOTATION_COREF_FEATURE_NAME);
752     } //while
753   }//cleanup
754 
755   /** return a person name without title */
756   protected String containTitle (String annotString, Annotation annot)
757                       throws ExecutionException {
758     // get the offsets
759     Long startAnnot = annot.getStartNode().getOffset();
760     Long endAnnot = annot.getEndNode().getOffset();
761 
762     // determine "Lookup" annotation set
763     queryFM.clear();
764     queryFM.put("majorType", "title");
765     AnnotationSet as1 = nameAllAnnots.get(startAnnot,endAnnot);
766     if (as1 == null || as1.isEmpty())
767       return annotString;
768     AnnotationSet as =
769       as1.get("Lookup", queryFM);
770     if (as !=null && ! as.isEmpty()) {
771       List titles = new ArrayList((Set)as);
772       Collections.sort(titles, new gate.util.OffsetComparator());
773 
774       Iterator iter = titles.iterator();
775       while (iter.hasNext()) {
776         Annotation titleAnn = (Annotation)(iter.next());
777 
778         //we've not found a title at the start offset,
779         //there's no point in looking further
780         //coz titles come first
781         if (titleAnn.getStartNode().getOffset().compareTo(startAnnot) != 0)
782           return annotString;
783 
784         try {
785           // the title from the current annotation
786           String annotTitle =
787             document.getContent().getContent(
788               titleAnn.getStartNode().getOffset(),
789               titleAnn.getEndNode().getOffset()
790             ).toString();
791 
792           // eliminate the title from annotation string and return the result
793           if (annotTitle.length()<annotString.length()) {
794             //remove from the array of tokens, so then we can compare properly
795             //the remaining tokens
796 //            Out.prln("Removing title from: " + annot + " with string " + annotString);
797 //            Out.prln("Tokens are" + tokensMap.get(annot.getId()));
798 //            Out.prln("Title is" + annotTitle);
799             ((ArrayList) tokensMap.get(annot.getId())).remove(0);
800             return annotString.substring(
801                                  annotTitle.length()+1,annotString.length());
802           }
803         } catch (InvalidOffsetException ioe) {
804             throw new ExecutionException
805                                ("Invalid offset of the annotation");
806         }//try
807       }// while
808     }//if
809     return annotString;
810 
811   }
812 
813   /** return an organization  without a designator and starting The*/
814   protected String stripCDG (String annotString, Annotation annot){
815 
816     ArrayList tokens = (ArrayList) tokensMap.get(annot.getId());
817 
818     //strip starting The first
819     if ( ((String) ((Annotation) tokens.get(0)
820           ).getFeatures().get(TOKEN_STRING_FEATURE_NAME))
821           .equalsIgnoreCase(THE_VALUE))
822       tokens.remove(0);
823 
824     //no need to check for cdg if there is only 1 token or less
825     if (tokens.size()>1 && cdg.contains(((Annotation) tokens.get(tokens.size()-1)
826           ).getFeatures().get(TOKEN_STRING_FEATURE_NAME)) )
827       tokens.remove(tokens.size()-1);
828 
829     StringBuffer newString = new StringBuffer(50);
830     for (int i = 0; i < tokens.size(); i++){
831       newString.append((String) ((Annotation) tokens.get(i)
832           ).getFeatures().get(TOKEN_STRING_FEATURE_NAME) );
833       if (i != tokens.size()-1)
834         newString.append(" ");
835     }
836 //    Out.prln("Strip CDG returned: " + newString + "for string " + annotString);
837 
838     if (caseSensitive)
839       return newString.toString();
840 
841     return newString.toString().toLowerCase();
842   }
843 
844 /*
845   public void check() throws ExecutionException {
846     if (executionException != null) {
847       ExecutionException e = executionException;
848       executionException = null;
849       throw e;
850     }
851   } // check()
852 */
853 
854   /** if ( == false) then reads the names of files in order
855     *  to create the lookup tables
856     */
857   protected void createLists() throws IOException {
858     InputStream inputStream = Files.getGateResourceAsStream(
859                                               "creole/namematcher/listsNM.def");
860     InputStreamReader inputStreamReader = new InputStreamReader (
861                                                     inputStream);
862     BufferedReader bufferedReader = new BufferedReader(inputStreamReader);
863 
864     String lineRead = null;
865     while ((lineRead = bufferedReader.readLine()) != null){
866       int index = lineRead.indexOf(":");
867       if (index != -1){
868         String nameFile = lineRead.substring(0,index);
869         String nameList = lineRead.substring(index+1,lineRead.length());
870         createAnnotList(nameFile,nameList);
871       }// if
872     }//while
873     bufferedReader.close();
874     inputStreamReader.close();
875     inputStream.close();
876   }// createLists()
877 
878   /** creates the lookup tables */
879   protected void createAnnotList(String nameFile,String nameList)
880                                                           throws IOException{
881     InputStream inputStream = Files.getGateResourceAsStream(
882                                               "creole/namematcher/"+nameFile);
883     InputStreamReader inputStreamReader = new InputStreamReader (
884                                                     inputStream);
885     BufferedReader bufferedReader = new BufferedReader(inputStreamReader);
886 
887     String lineRead = null;
888     while ((lineRead = bufferedReader.readLine()) != null){
889       if (nameList.compareTo(CDGLISTNAME)==0){
890         if (caseSensitive)
891           cdg.add(lineRead);
892         else
893           cdg.add(lineRead.toLowerCase());
894       }// if
895       else {
896         int index = lineRead.indexOf("£");
897         if (index != -1){
898           String  expr = lineRead.substring(0,index);
899           //if not case-sensitive, we need to downcase all strings
900           if (!caseSensitive)
901             expr = expr.toLowerCase();
902           String code = lineRead.substring(index+1,lineRead.length());
903           if (nameList.equals(ALIASLISTNAME))
904                             alias.put(expr, code);
905           else
906           if (nameList.equals(ARTLISTNAME))
907                             def_art.put(expr, code);
908           else
909           if (nameList.equals(PREPLISTNAME))
910                             prepos.put(expr, code);
911           else
912           if (nameList.equals(CONNECTORLISTNAME))
913                             connector.put(expr, code);
914           else
915           if (nameList.equals(SPURLISTNAME))
916                             spur_match.put(expr, code);
917 
918         }//if
919       }// else
920 
921     }//while
922   }//createAnnotList
923 
924 
925   /** apply_rules_namematch: apply rules similarly to lasie1.5's namematch */
926   private boolean apply_rules_namematch(String annotationType, String shortName,
927                                         String longName) {
928     // first apply rule for spurius matches i.e. rule0
929     if (matchRule0(longName, shortName))
930       return false;
931     if (
932          (// rules for all annotations
933           //no longer use rule1, coz I do the check for same string via the
934           //hash table
935             matchRule2(longName, shortName)
936          ||
937             matchRule3(longName, shortName)
938          ) // rules for all annotations
939          ||
940          (// rules for organisation annotations
941              ( annotationType.equals(organizationType)
942                //ACE addition
943                || annotationType.equals("Facility"))
944              &&
945              (    matchRule4(longName, shortName)
946                ||
947                   matchRule5(longName, shortName)
948                ||
949                   matchRule6(longName, shortName)
950                ||
951                   matchRule7(longName, shortName)
952                ||
953 //                  matchRule8(longName, shortName)
954 //               ||
955                   matchRule9(longName, shortName)
956                ||
957                   matchRule10(longName, shortName)
958                ||
959                   matchRule11(longName, shortName)
960                ||
961                   matchRule12(longName, shortName)
962                ||
963                   matchRule13(shortName, longName)
964               )
965            )// rules for organisation annotations
966          ||
967          (// rules for person annotations
968              (    annotationType.equals(personType))
969                &&
970              (    matchRule4(longName, shortName)
971                ||
972                   matchRule5(longName, shortName)
973                ||
974                   matchRule14(longName, shortName)
975                || //kalina: added this, so it matches names when contain more
976                   //than one first and one last name
977                   matchRule15(longName, shortName)
978               )
979           )// rules for person annotations
980          ) //if
981       return true;
982     return false;
983   }//apply_rules
984 
985 
986   /** set the extLists flag */
987   public void setExtLists(Boolean newExtLists) {
988     extLists = newExtLists.booleanValue();
989   }//setextLists
990 
991   /** set the caseSensitive flag */
992   public void setCaseSensitive(Boolean newCase) {
993     caseSensitive = newCase.booleanValue();
994   }//setextLists
995 
996   /** set the annotation set name*/
997   public void setAnnotationSetName(String newAnnotationSetName) {
998     annotationSetName = newAnnotationSetName;
999   }//setAnnotationSetName
1000
1001  /** set the types of the annotations*/
1002  public void setAnnotationTypes(List newType) {
1003    annotationTypes = newType;
1004  }//setAnnotationTypes
1005
1006  /** set whether to process the Unknown annotations*/
1007  public void setProcessUnknown(Boolean processOrNot) {
1008    this.matchingUnknowns = processOrNot.booleanValue();
1009  }//setAnnotationTypes
1010
1011  public void setOrganizationType(String newOrganizationType) {
1012    organizationType = newOrganizationType;
1013  }//setOrganizationType
1014
1015  public void setPersonType(String newPersonType) {
1016    personType = newPersonType;
1017  }//setPersonType
1018
1019  /**get the name of the annotation set*/
1020  public String getAnnotationSetName() {
1021    return annotationSetName;
1022  }//getAnnotationSetName
1023
1024  /** get the types of the annotation*/
1025  public List getAnnotationTypes() {
1026    return annotationTypes;
1027  }//getAnnotationTypes
1028
1029  public String getOrganizationType() {
1030    return organizationType;
1031  }
1032
1033  public String getPersonType() {
1034    return personType;
1035  }
1036
1037  public Boolean getExtLists() {
1038    return new Boolean(extLists);
1039  }
1040
1041  /** Are we running in a case-sensitive mode?*/
1042  public Boolean getCaseSensitive() {
1043    return new Boolean(caseSensitive);
1044  }
1045
1046  /** Return whether or not we're processing the Unknown annots*/
1047  public Boolean getProcessUnknown() {
1048    return new Boolean(matchingUnknowns);
1049  }
1050
1051/*
1052  public List getMatchesDocument() {
1053    return matchesDocument;
1054  }
1055*/
1056
1057  protected boolean isUnknownGender(String gender) {
1058    if (gender == null)
1059      return true;
1060    if (gender.equalsIgnoreCase("male") || gender.equalsIgnoreCase("female"))
1061      return false;
1062    return true;
1063
1064  } //isUnknownGender
1065
1066  /** RULE #0: If the two names are listed in table of
1067    * spurius matches then they do NOT match
1068    * Condition(s): -
1069    * Applied to: all name annotations
1070    */
1071  public boolean matchRule0(String s1,
1072           String s2) {
1073    if (spur_match.containsKey(s1)
1074        && spur_match.containsKey(s2) )
1075      return
1076        spur_match.get(s1).toString().equals(spur_match.get(s2).toString());
1077
1078    return false;
1079  }//matchRule0
1080
1081  /** RULE #1: If the two names are identical then they are the same
1082    * no longer used, because I do the check for same string via the
1083    * hash table of previous annotations
1084    * Condition(s): depend on case
1085    * Applied to: all name annotations
1086    */
1087  public boolean matchRule1(String s1,
1088           String s2,
1089           boolean matchCase) {
1090//    Out.prln("Rule1: Matching " + s1 + "and " + s2);
1091
1092    boolean matched = false;
1093    if (!matchCase)
1094        matched = s1.equalsIgnoreCase(s2);
1095    else matched =  s1.equals(s2) ;
1096//kalina: do not remove, nice for debug
1097//    if (matched && (s2.startsWith("Kenneth") || s1.startsWith("Kenneth")))
1098//        Out.prln("Rule1: Matched " + s1 + "and " + s2);
1099    return matched;
1100  }//matchRule1
1101
1102
1103  /**
1104    * RULE #2: if the two names are listed as equivalent in the
1105    * lookup table (alias) then they match
1106    * Condition(s): -
1107    * Applied to: all name annotations
1108    */
1109  public boolean matchRule2(String s1,
1110           String s2) {
1111
1112    if (alias.containsKey(s1) && alias.containsKey(s2))
1113      return (alias.get(s1).toString().equals(alias.get(s2).toString()));
1114
1115    return false;
1116  }//matchRule2
1117
1118  /**
1119    * RULE #3: adding a possessive at the end
1120    * of one name causes a match
1121    * e.g. "Standard and Poor" == "Standard and Poor's"
1122    * and also "Standard and Poor" == "Standard's"
1123    * Condition(s): case-insensitive match
1124    * Applied to: all name annotations
1125    */
1126  public boolean matchRule3(String s1, //long string
1127                             String s2) { //short string
1128
1129    if (s2.endsWith("'s") || s2.endsWith("'")
1130        ||(s1.endsWith("'s")|| s1.endsWith("'"))) {
1131
1132
1133      String s2_poss = null;
1134
1135      if (!s2.endsWith("'s")) s2_poss = s2.concat("'s");
1136      else s2_poss = s2.concat("'");
1137
1138      if (s2_poss != null && matchRule1(s1, s2_poss,caseSensitive)) return true;
1139
1140      // now check the second case i.e. "Standard and Poor" == "Standard's"
1141      String token = (String)
1142        ((Annotation) tokensLongAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1143
1144      if (!token.endsWith("'s")) s2_poss = token.concat("'s");
1145      else s2_poss = token.concat("'");
1146
1147      if (s2_poss != null && matchRule1(s2_poss,s2,caseSensitive)) return true;
1148
1149    } // if (s2.endsWith("'s")
1150    return false;
1151  }//matchRule3
1152
1153  /**
1154    * RULE #4: Do all tokens other than the punctuation marks
1155    * , and . match?
1156    * e.g. "Smith, Jones" == "Smith Jones"
1157    * Condition(s): case-insensitive match
1158    * Applied to: organisation and person annotations
1159    */
1160  public boolean matchRule4(String s1,
1161           String s2) {
1162
1163    boolean allTokensMatch = true;
1164
1165    Iterator tokensLongAnnotIter = tokensLongAnnot.iterator();
1166    Iterator tokensShortAnnotIter = tokensShortAnnot.iterator();
1167    while (tokensLongAnnotIter.hasNext() && tokensShortAnnotIter.hasNext()) {
1168      Annotation token = (Annotation) tokensLongAnnotIter.next();
1169      if (((String)token.getFeatures().get(TOKEN_KIND_FEATURE_NAME)).equals(PUNCTUATION_VALUE))
1170        continue;
1171//      Out.prln("Matching" + tokensLongAnnot + " with " + tokensShortAnnot);
1172      if (! token.getFeatures().get(TOKEN_STRING_FEATURE_NAME).equals(
1173             ((Annotation) tokensShortAnnotIter.next()).getFeatures().get(TOKEN_STRING_FEATURE_NAME))) {
1174        allTokensMatch = false;
1175        break;
1176      } // if (!tokensLongAnnot.nextToken()
1177    } // while
1178//    if (allTokensMatch)
1179//      Out.prln("rule4 fired. result is: " + allTokensMatch);
1180    return allTokensMatch;
1181  }//matchRule4
1182
1183  /**
1184    * RULE #5: if the 1st token of one name
1185    * matches the second name
1186    * e.g. "Pepsi Cola" == "Pepsi"
1187    * Condition(s): case-insensitive match
1188    * Applied to: all name annotations
1189    */
1190  public boolean matchRule5(String s1,
1191           String s2) {
1192
1193    //do not match numbers by this rule
1194    if (tokensLongAnnot.size()> 1 &&
1195        ((Annotation) tokensLongAnnot.get(0)).getFeatures().get("kind").equals("number"))
1196      return false;
1197
1198//    if (s1.startsWith("Patrick") || s2.startsWith("Patrick")) {
1199//      Out.prln("Rule 5: " + s1 + "and " + s2);
1200//    }
1201
1202    //require that when matching person names, the shorter one to be of length 1
1203    //for the rule to apply. In other words, avoid matching Peter Smith and
1204    //Peter Kline, because they share a Peter token.
1205    if ( (shortAnnot.getType().equals(personType)
1206         || longAnnot.getType().equals(personType)
1207         )
1208       &&
1209         tokensShortAnnot.size()>1
1210       )
1211       return false;
1212
1213    if (tokensLongAnnot.size()<=1)
1214      return false;
1215    boolean result = matchRule1((String)
1216                      ((Annotation) tokensLongAnnot.get(0)
1217                        ).getFeatures().get(TOKEN_STRING_FEATURE_NAME),
1218                      s2,
1219                      caseSensitive);
1220
1221//    if (s1.startsWith("Patrick") || s2.startsWith("Patrick"))
1222//      Out.prln("rule 5 result: " + result);
1223    return result;
1224
1225  }//matchRule5
1226
1227  /**
1228    * RULE #6: if one name is the acronym of the other
1229    * e.g. "Imperial Chemical Industries" == "ICI"
1230    * Applied to: organisation annotations only
1231    */
1232  public boolean matchRule6(String s1,
1233           String s2) {
1234
1235    int i = 0;
1236
1237    //check and if the shorted string has a space in it, then it's not
1238    //an acronym
1239    if (s2.indexOf(" ") > 0)
1240      return false;
1241
1242    //Out.prln("Acronym: Matching " + s1 + "and " + s2);
1243    StringBuffer acronym_s1 = new StringBuffer("");
1244    StringBuffer acronymDot_s1 = new StringBuffer("");
1245
1246    for ( ;i < tokensLongAnnot.size(); i++ ) {
1247      String toAppend = ( (String) ((Annotation) tokensLongAnnot.get(i)
1248                         ).getFeatures().get(TOKEN_STRING_FEATURE_NAME)).substring(0,1);
1249      acronym_s1.append(toAppend);
1250      acronymDot_s1.append(toAppend);
1251      acronymDot_s1.append(".");
1252    }
1253
1254    //Out.prln("Acronym dot: To Match " + acronymDot_s1 + "and " + s2);
1255    //Out.prln("Result: " + matchRule1(acronymDot_s1.toString(),s2,caseSensitive));
1256
1257    if (matchRule1(acronym_s1.toString(),s2,caseSensitive) ||
1258        matchRule1(acronymDot_s1.toString(),s2,caseSensitive) )
1259      return true;
1260
1261    return false;
1262  }//matchRule6
1263
1264  /**
1265    * RULE #7: if one of the tokens in one of the
1266    * names is in the list of separators eg. "&"
1267    * then check if the token before the separator
1268    * matches the other name
1269    * e.g. "R.H. Macy & Co." == "Macy"
1270    * Condition(s): case-sensitive match
1271    * Applied to: organisation annotations only
1272    */
1273  public boolean matchRule7(String s1,
1274           String s2) {
1275
1276    //don't try it unless the second string is just one token
1277    if (tokensShortAnnot.size() != 1)
1278      return false;
1279
1280    String previous_token = null;
1281
1282    for (int i = 0;  i < tokensLongAnnot.size(); i++ ) {
1283      if (connector.containsKey( ((Annotation) tokensLongAnnot.get(i)
1284          ).getFeatures().get(TOKEN_STRING_FEATURE_NAME) )) {
1285        previous_token = (String) ((Annotation) tokensLongAnnot.get(i-1)
1286                                    ).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1287
1288        break;
1289      }
1290    }
1291
1292    //now match previous_token with other name
1293    if (previous_token != null) {
1294//      if (s1.equalsIgnoreCase("chin") || s2.equalsIgnoreCase("chin"))
1295//        Out.prln("Rule7");
1296      return matchRule1(previous_token,s2,caseSensitive);
1297
1298    }
1299    return false;
1300  }//matchRule7
1301
1302  /**
1303   * This rule is now obsolete, as The and the trailing CDG
1304   * are stripped before matching.
1305   * DO NOT CALL!!!
1306   *
1307    * RULE #8: if the names match, ignoring The and
1308    * and trailing company designator (which have already been stripped)
1309    * e.g. "The Magic Tricks Co." == "Magic Tricks"
1310    * Condition(s): case-sensitive match
1311    * Applied to: organisation annotations only
1312    */
1313  public boolean matchRule8(String s1,
1314           String s2) {
1315    Out.prln("OrthoMatcher warning: This rule has been discontinued!");
1316/*
1317    if (s1.startsWith("The ")) s1 = s1.substring(4);
1318    if (s2.startsWith("The ")) s2 = s2.substring(4);
1319
1320    // check that cdg is not empty
1321    if (!cdg.isEmpty()) {
1322      String stringToTokenize1 = s1;
1323      StringTokenizer tokensLongAnnot = new StringTokenizer(stringToTokenize1," ");
1324
1325      String stringToTokenize2 = s2;
1326      StringTokenizer tokensShortAnnot = new StringTokenizer(stringToTokenize2," ");
1327      String token = null;
1328      String cdg1 = null;
1329      String cdg2 = null;
1330
1331      s1 = "";
1332      s2 = "";
1333
1334      //check last token of s1
1335      while (tokensLongAnnot.hasMoreTokens()) {
1336        token = tokensLongAnnot.nextToken();
1337        if (!tokensLongAnnot.hasMoreTokens()
1338            && cdg.contains(token)) cdg1=token;
1339        else s1 = s1+token;
1340      }
1341
1342      // do the same for s2
1343      while (tokensShortAnnot.hasMoreTokens()) {
1344        token = tokensShortAnnot.nextToken();
1345        if (!tokensShortAnnot.hasMoreTokens()
1346          && cdg.contains(token)) cdg2=token;
1347        else s2 = s2+token;
1348      }
1349
1350      // if the company designators are different
1351      // then they are NOT the same organisations
1352      if ((cdg1!=null && cdg2!=null)
1353    && !cdg1.equalsIgnoreCase(cdg2)) return false;
1354    }
1355    if (!s1.equals("") && !s2.equals("")) return matchRule1(s1,s2,caseSensitive);
1356*/
1357    return false;
1358
1359  }//matchRule8
1360
1361  /**
1362    * RULE #9: does one of the names match the token
1363    * just before a trailing company designator
1364    * in the other name?
1365    * The company designator has already been chopped off,
1366    * so the token before it, is in fact the last token
1367    * e.g. "R.H. Macy Co." == "Macy"
1368    * Applied to: organisation annotations only
1369    */
1370  public boolean matchRule9(String s1,
1371           String s2) {
1372
1373//    if (s1.equalsIgnoreCase("news") || s2.equalsIgnoreCase("news"))
1374//      Out.prln("Rule 9 " + s1 + " and " + s2);
1375    String s1_short = (String)
1376                      ((Annotation) tokensLongAnnot.get(
1377                          tokensLongAnnot.size()-1)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1378//    Out.prln("Converted to " + s1_short);
1379    if (tokensLongAnnot.size()>1) {
1380      boolean matched = matchRule1(s1_short, s2, caseSensitive);
1381      //we need to make sure all names match, instead of assuming transitivity,
1382      //to avoid matching BBC News with News then News with ITV News, which
1383      //by transitivity leads to BBC News matching ITV News which is not what
1384      //we want
1385      if (matched)
1386        allMatchingNeeded = true;
1387      return matched;
1388    } //if
1389
1390    return false;
1391  }//matchRule9
1392
1393  /**
1394    * RULE #10: is one name the reverse of the other
1395    * reversing around prepositions only?
1396    * e.g. "Department of Defence" == "Defence Department"
1397    * Condition(s): case-sensitive match
1398    * Applied to: organisation annotations only
1399    */
1400  public boolean matchRule10(String s1,
1401            String s2) {
1402
1403    String token = null;
1404    String previous_token = null;
1405    String next_token = null;
1406    boolean invoke_rule=false;
1407
1408    if (tokensLongAnnot.size() >= 3
1409        && tokensShortAnnot.size() >= 2) {
1410
1411      // first get the tokens before and after the preposition
1412      int i = 0;
1413      for (; i< tokensLongAnnot.size(); i++) {
1414        token = (String)
1415                  ((Annotation) tokensLongAnnot.get(i)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1416        if (prepos.containsKey(token)) {
1417          invoke_rule=true;
1418          break;
1419        }//if
1420        previous_token = token;
1421      }//while
1422
1423      if (! invoke_rule)
1424        return false;
1425
1426      if (i < tokensLongAnnot.size()
1427          && previous_token != null)
1428        next_token= (String)
1429                    ((Annotation) tokensLongAnnot.get(i++)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1430      else return false;
1431
1432      String s21 = (String)
1433                    ((Annotation) tokensShortAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1434      String s22 = (String)
1435                    ((Annotation) tokensShortAnnot.get(1)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1436      // then compare (in reverse) with the first two tokens of s2
1437      if (matchRule1(next_token,(String) s21,caseSensitive)
1438          && matchRule1(previous_token, s22,caseSensitive))
1439        return true ;
1440    }//if (tokensLongAnnot.countTokens() >= 3
1441    return false;
1442  }//matchRule10
1443
1444  /**
1445    * RULE #11: does one name consist of contractions
1446    * of the first two tokens of the other name?
1447    * e.g. "Communications Satellite" == "ComSat"
1448    * and "Pan American" == "Pan Am"
1449    * Condition(s): case-sensitive match
1450    * Applied to: organisation annotations only
1451    */
1452  public boolean matchRule11(String s1,
1453            String s2) {
1454
1455
1456    // first do the easy case e.g. "Pan American" == "Pan Am"
1457
1458    String token11 = null;
1459    String token12 = null;
1460    String token21 = null;
1461    String token22 = null;
1462
1463    if (tokensLongAnnot.size() < 2)
1464      return false;
1465
1466    // 1st get the first two tokens of s1
1467    token11 = (String)
1468                ((Annotation) tokensLongAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1469    token12 = (String)
1470                ((Annotation) tokensLongAnnot.get(1)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1471
1472    // now check for the first case i.e. "Pan American" == "Pan Am"
1473    if (tokensShortAnnot.size() == 2)  {
1474
1475      token21 = (String)
1476                  ((Annotation) tokensShortAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1477      token22 = (String)
1478                  ((Annotation) tokensShortAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1479
1480      if (token11.startsWith(token21)
1481          && token12.startsWith(token22))
1482        return true;
1483
1484    } // if (tokensShortAnnot.countTokens() == 2)
1485
1486    // now the second case e.g.  "Communications Satellite" == "ComSat"
1487    else if (tokensShortAnnot.size()==1 && s2.length()>=3) {
1488
1489      // split the token into possible contractions
1490      // ignore case for matching
1491      for (int i=2;i<s2.length();i++) {
1492        token21=s2.substring(0,i+1);
1493        token22=s2.substring(i+1);
1494
1495        if (token11.startsWith(token21)
1496            && token12.startsWith(token22))
1497          return true;
1498      }// for
1499    } // else if
1500
1501    return false;
1502  }//matchRule11
1503
1504  /**
1505    * RULE #12: do the first and last tokens of one name
1506    * match the first and last tokens of the other?
1507    * Condition(s): case-sensitive match
1508    * Applied to: organisation annotations only
1509    */
1510  public boolean matchRule12(String s1,
1511            String s2) {
1512
1513    // first do the easy case e.g. "Pan American" == "Pan Am"
1514
1515    if (tokensLongAnnot.size()>1 && tokensShortAnnot.size()>1) {
1516//     Out.prln("Rule 12");
1517
1518      // get first and last tokens of s1 & s2
1519      String s1_first = (String)
1520                     ((Annotation) tokensLongAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1521      String s2_first = (String)
1522                     ((Annotation) tokensShortAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1523
1524      if (!matchRule1(s1_first,s2_first,caseSensitive))
1525        return false;
1526
1527      String s1_last = (String)
1528         ((Annotation) tokensLongAnnot.get(tokensLongAnnot.size()-1)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1529      String s2_last = (String)
1530         ((Annotation) tokensShortAnnot.get(tokensShortAnnot.size()-1)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1531
1532      return matchRule1(s1_last,s2_last,caseSensitive);
1533    } // if (tokensLongAnnot.countTokens()>1
1534    return false;
1535  }//matchRule12
1536
1537  /**
1538    * RULE #13: do multi-word names match except for
1539    * one token e.g.
1540    * "Second Force Recon Company" == "Force Recon Company"
1541    * Note that this rule has NOT been used in LaSIE's 1.5
1542    * namematcher
1543    * Restrictions: - remove cdg first
1544    *               - shortest name should be 2 words or more
1545    *               - if N is the number of tokens of the longest
1546    *                 name, then N-1 tokens should be matched
1547    * Condition(s): case-sensitive match
1548    * Applied to: organisation or person annotations only
1549    */
1550  public boolean matchRule13(String s1,
1551            String s2) {
1552
1553
1554    String token1 = null;
1555    String token2 = null;
1556
1557    int matched_tokens = 0, mismatches = 0;;
1558
1559    // if names < 2 words then rule is invalid
1560    if (tokensLongAnnot.size() < 3 || tokensShortAnnot.size() < 2) return false;
1561
1562//    if (s1.equalsIgnoreCase("chin") || s2.equalsIgnoreCase("chin")) {
1563//      Out.prln("Rule 13: Matching tokens" + tokensLongAnnot);
1564//      Out.prln("with tokens " + tokensShortAnnot);
1565//    }
1566
1567    // now do the matching
1568    for (int i=0,j= 0; i < tokensShortAnnot.size() && mismatches < 2; i++) {
1569
1570//      Out.prln("i = " + i);
1571//      Out.prln("j = " + j);
1572      if ( ((Annotation) tokensLongAnnot.get(j)).getFeatures().get(TOKEN_STRING_FEATURE_NAME).equals(
1573           ((Annotation) tokensShortAnnot.get(i)).getFeatures().get(TOKEN_STRING_FEATURE_NAME)) ) {
1574        matched_tokens++;
1575        j++;
1576      } else
1577        mismatches++;
1578    } // for
1579
1580    if (matched_tokens >= tokensLongAnnot.size()-1)
1581      return true;
1582
1583    return false;
1584  }//matchRule13
1585
1586  /**
1587    * RULE #14: if the last token of one name
1588    * matches the second name
1589    * e.g. "Hamish Cunningham" == "Cunningham"
1590    * Condition(s): case-insensitive match
1591    * Applied to: all person annotations
1592    */
1593  public boolean matchRule14(String s1,
1594           String s2) {
1595
1596//    if (s1.equalsIgnoreCase("chin") || s2.equalsIgnoreCase("chin"))
1597//      Out.prln("Rule 14 " + s1 + " and " + s2);
1598    String s1_short = (String)
1599                      ((Annotation) tokensLongAnnot.get(
1600                          tokensLongAnnot.size()-1)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1601//    Out.prln("Converted to " + s1_short);
1602    if (tokensLongAnnot.size()>1)
1603      return matchRule1(s1_short,
1604                      s2,
1605                      caseSensitive);
1606
1607    return false;
1608
1609  }//matchRule14
1610
1611  /**
1612    * RULE #15: does one token from a Person name appear as the other token
1613    * Note that this rule has NOT been used in LaSIE's 1.5
1614    * namematcher; added for ACE by Di's request
1615    */
1616  public boolean matchRule15(String s1,
1617            String s2) {
1618
1619    int matched_tokens = 0;
1620
1621    // if names < 2 words then rule is invalid
1622
1623//    if (s1.equalsIgnoreCase("chin") || s2.equalsIgnoreCase("chin")) {
1624//      Out.prln("Rule 15:" );
1625//      Out.prln("with tokens " + tokensShortAnnot);
1626//    }
1627
1628    // now do the matching
1629    Annotation token1, token2;
1630    for (int i=0; i < tokensShortAnnot.size() && matched_tokens == 0; i++) {
1631      token1 = (Annotation) tokensShortAnnot.get(i);
1632      //first check if not punctuation, because we need to skip it
1633      if (token1.getFeatures().get(TOKEN_KIND_FEATURE_NAME).equals(PUNCTUATION_VALUE))
1634        continue;
1635
1636      for (int j=0; j<tokensLongAnnot.size() && matched_tokens ==0; j++) {
1637//      Out.prln("i = " + i);
1638        token2 = (Annotation) tokensLongAnnot.get(j);
1639        if (token2.getFeatures().get(TOKEN_KIND_FEATURE_NAME).equals(PUNCTUATION_VALUE))
1640          continue;
1641        if ( token1.getFeatures().get(TOKEN_STRING_FEATURE_NAME).equals(
1642             token2.getFeatures().get(TOKEN_STRING_FEATURE_NAME)) )
1643          matched_tokens++;
1644      }//for
1645    } // for
1646
1647    //19 February 2002: kalina
1648    //was originally > 0 (i.e., any match is good)
1649    //ensure that we've matched all the tokens in the short annotation
1650    //the reason for that is, because otherwise we match
1651    //Patrick Viera and Patrick Somebody - not good!
1652    if (matched_tokens == tokensShortAnnot.size())
1653      return true;
1654
1655    return false;
1656  }//matchRule15
1657
1658  /** Tables for namematch info
1659    * (used by the namematch rules)
1660    */
1661  private void buildTables(AnnotationSet nameAllAnnots) {
1662
1663    //reset the tables first
1664    cdg.clear();
1665
1666    if (! extLists) {
1667    // i.e. get cdg from Lookup annotations
1668      // get all Lookup annotations
1669      tempMap.clear();
1670      tempMap.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME, "cdg");
1671      //now get all lookup annotations which are cdg
1672      AnnotationSet nameAnnots =
1673        nameAllAnnots.get(LOOKUP_ANNOTATION_TYPE, tempMap);
1674
1675      if ((nameAnnots ==null) || nameAnnots.isEmpty())
1676        return;
1677
1678      Iterator iter = nameAnnots.iterator();
1679      while (iter.hasNext()) {
1680         Annotation annot = (Annotation)iter.next();
1681         // get the actual string
1682         Long offsetStartAnnot = annot.getStartNode().getOffset();
1683         Long offsetEndAnnot = annot.getEndNode().getOffset();
1684         try {
1685           gate.Document doc = nameAllAnnots.getDocument();
1686           String annotString =
1687                            doc.getContent().getContent(
1688                            offsetStartAnnot,offsetEndAnnot
1689                            ).toString();
1690                cdg.add(annotString);
1691         } catch (InvalidOffsetException ioe) {
1692             ioe.printStackTrace(Err.getPrintWriter());
1693         }
1694      }// while
1695    }//if
1696  }//buildTables
1697
1698  /** substitute all multiple spaces, tabes and newlines
1699    * with a single space
1700    */
1701  public String regularExpressions ( String text, String replacement,
1702                                      String regEx) {
1703    String result = text;
1704    try {
1705      RE re = new RE(regEx);
1706      result = re.substituteAll( text,replacement);
1707    } catch (REException ree) {ree.printStackTrace();}
1708    return result;
1709  }//regularExpressions
1710
1711
1712  private static class Class1 {
1713  }
1714} // public class OrthoMatcher
1715
1716