1   /*
2    *  OrthoMatcher.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Kalina Bontcheva, 24/August/2001
12   *
13   *  $Id: OrthoMatcher.java,v 1.36 2001/12/01 17:47:04 kalina Exp $
14   */
15  
16  
17  package gate.creole.orthomatcher;
18  
19  import gate.*;
20  import gate.util.*;
21  import gate.creole.*;
22  import gate.corpora.*;
23  import gate.annotation.*;
24  import java.util.*;
25  import java.io.*;
26  import java.net.*;
27  import gnu.regexp.*;
28  
29  public class OrthoMatcher extends AbstractLanguageAnalyser
30                            implements ANNIEConstants{
31  
32    protected static final String CDGLISTNAME = "cdg";
33    protected static final String ALIASLISTNAME = "alias";
34    protected static final String ARTLISTNAME = "def_art";
35    protected static final String PREPLISTNAME = "prepos";
36    protected static final String CONNECTORLISTNAME = "connector";
37    protected static final String SPURLISTNAME = "spur_match";
38  
39    protected static final String LOOKUPNAME = "Lookup";
40    protected static final String GENDER_FEATURE = "gender";
41    protected static final String KIND_FEATURE = "kind";
42    protected static final String STRING_FEATURE = "string";
43    protected static final String THE_VALUE = "The";
44  
45  
46    /**the name of the annotation set*/
47    protected String annotationSetName;
48  
49    /** the types of the annotation */
50    protected List annotationTypes = new ArrayList(10);
51  
52    /** the organization type*/
53    protected String organizationType = "Organization";
54  
55    /** the person type*/
56    protected String personType = "Person";
57  
58    protected String unknownType = "Unknown";
59  
60    /** internal or external list */
61    protected boolean extLists = true;
62  
63    protected boolean matchingUnknowns = true;
64  
65    /** This is an internal variable to indicate whether
66     *  we matched using a rule that requires that
67     *  the newly matched annotation matches all the others
68     *  This is needed, because organizations can share
69     *  first/last tokens like News and be different
70     */
71    private   boolean allMatchingNeeded = false;
72  
  /** Orthomatching is not case-sensitive by default */
74    protected boolean caseSensitive = false;
75  
76    protected FeatureMap queryFM = Factory.newFeatureMap();
77  
78  //  protected ExecutionException executionException;
79  
80    // name lookup tables (used for namematch)
81    //gave them bigger default size, coz rehash is expensive
82    protected HashMap alias = new HashMap(100);
83    protected HashSet cdg = new HashSet(50);
84    protected HashMap spur_match = new HashMap(100);
85    protected HashMap def_art = new HashMap(20);
86    protected HashMap connector = new HashMap(20);
87    protected HashMap prepos = new HashMap(30);
88  
89  
90    protected AnnotationSet nameAllAnnots = null;
91    protected HashMap processedAnnots = new HashMap(150);
92    protected HashMap annots2Remove = new HashMap(75);
93    protected List matchesDocFeature = new ArrayList();
94    //maps annotation ids to array lists of tokens
95    protected HashMap tokensMap = new HashMap(150);
96  
97    protected Annotation shortAnnot, longAnnot;
98  
99    protected ArrayList tokensLongAnnot, tokensShortAnnot;
100 
101   /** a feature map to be used when retrieving annotations
102    *  declared here so can be reused for efficiency
103    *  clear() before each use
104    */
105   protected FeatureMap tempMap = Factory.newFeatureMap();
106 
107   /** a buffer in order to read an array of char */
108   private char[] cbuffer = null;
109 
110   /** the size of the buffer */
111   private final static int BUFF_SIZE = 65000;
112 
113   /** @link dependency */
114   /*#OrthoMatcher lnkOrthoMatcher;*/
115 
116   public OrthoMatcher () {
117     annotationTypes.add(organizationType);
118     annotationTypes.add(personType);
119     annotationTypes.add("Location");
120     annotationTypes.add("Date");
121   }
122 
123   /** Initialise this resource, and return it. */
124   public Resource init() throws ResourceInstantiationException {
125     cbuffer = new char[BUFF_SIZE];
126 
127     //initialise the list of annotations which we will match
128     try {
129       createLists();
130     } catch (IOException ioe) {ioe.printStackTrace();}
131     return this;
132   } // init()
133 
134   /**  Run the resource. It doesn't make sense not to override
135     *  this in subclasses so the default implementation signals an
136     *  exception.
137     */
138   public void execute() throws ExecutionException{
139 
140     //check the input
141     if(document == null) {
142       throw new ExecutionException(
143         "No document for namematch!"
144       );
145     }
146 
147     // get the annotations from document
148     if ((annotationSetName == null)|| (annotationSetName.equals("")))
149       nameAllAnnots = document.getAnnotations();
150     else
151       nameAllAnnots = document.getAnnotations(annotationSetName);
152 
153     //if none found, print warning and exit
154     if ((nameAllAnnots == null) || nameAllAnnots.isEmpty()) {
155       Out.prln("OrthoMatcher Warning: No annotations found for processing");
156       return;
157     }
158 
159     //check if we've been run on this document before
160     //and clean the doc if needed
161     docCleanup();
162     Map matchesMap = (Map)document.getFeatures().
163                      get(DOCUMENT_COREF_FEATURE_NAME);
164 //    if(matchesMap != null && matchesMap.containsKey(nameAllAnnots.getName())){
165 //      docCleanup();
166 //    }
167 
168     // creates the cdg list from the document
169     //no need to create otherwise, coz already done in init()
170     if (!extLists)
171       buildTables(nameAllAnnots);
172 
173     //first match all name annotations
174     matchNameAnnotations();
175 
176     //then match the unknown ones to all name ones
177     if (matchingUnknowns)
178       matchUnknown();
179 
180     // set the matches of the document
181 //    determineMatchesDocument();
182     if (! matchesDocFeature.isEmpty()) {
183       if(matchesMap == null){
184         matchesMap = new HashMap();
185       }
186       matchesMap.put(nameAllAnnots.getName(), matchesDocFeature);
187       //we need to put it even if it was already present in order to triger
188       //the update events
189       document.getFeatures().put(DOCUMENT_COREF_FEATURE_NAME, matchesMap);
190 
191       //cannot do clear() as this has already been put on the document
192       //so I need a new one for the next run of matcher
193       matchesDocFeature = new ArrayList();
194     }
195 
196 //    Out.prln("Processed strings" + processedAnnots.values());
197     //clean-up the internal data structures for next run
198     nameAllAnnots = null;
199     processedAnnots.clear();
200     annots2Remove.clear();
201     tokensMap.clear();
202     matchesDocFeature = new ArrayList();
203     longAnnot = null;
204     shortAnnot = null;
205     tokensLongAnnot = null;
206     tokensShortAnnot = null;
207 
208   } // run()
209 
210   protected void matchNameAnnotations() throws ExecutionException{
211     // go through all the annotation types
212     Iterator iterAnnotationTypes = annotationTypes.iterator();
213     while (iterAnnotationTypes.hasNext()) {
214       String annotationType = (String)iterAnnotationTypes.next();
215 
216       AnnotationSet nameAnnots = nameAllAnnots.get(annotationType);
217 
218       // continue if no such annotations exist
219       if ((nameAnnots == null) || nameAnnots.isEmpty())
220         continue;
221 
222       Iterator iterNames = nameAnnots.iterator();
223       while (iterNames.hasNext()) {
224         Annotation nameAnnot = (Annotation) iterNames.next();
225         Integer id = nameAnnot.getId();
226 
227         // get string and value
228         String annotString = null;
229         try {
230             annotString = document.getContent().getContent(
231             nameAnnot.getStartNode().getOffset(),
232             nameAnnot.getEndNode().getOffset()
233             ).toString();
234           // now do the reg. exp. substitutions
235           annotString = regularExpressions(annotString," ", "\\s+");
236 
237         } catch (InvalidOffsetException ioe) {
238             throw new ExecutionException
239                                    ("Invalid offset of the annotation");
240         }
241         //convert to lower case if we are not doing a case sensitive match
242         if (!caseSensitive)
243           annotString = annotString.toLowerCase();
244 
245         //get the tokens
246         List tokens = new ArrayList((Set)
247                         nameAllAnnots.get("Token",
248                           nameAnnot.getStartNode().getOffset(),
249                           nameAnnot.getEndNode().getOffset()
250                         ));
251         //if no tokens to match, do nothing
252         if (tokens.isEmpty())
253           continue;
254         Collections.sort(tokens, new gate.util.OffsetComparator());
255         //check if these actually do not end after the name
256         //needed coz new tokeniser conflates
257         //strings with dashes. So British Gas-style is two tokens
258         //instead of three. So cannot match properly British Gas
259 //        tokens = checkTokens(tokens);
260         tokensMap.put(nameAnnot.getId(), tokens);
261 
262 //        Out.prln("Matching annot " + nameAnnot + ": string " + annotString);
263 
264         //first check whether we have not matched such a string already
265         //if so, just consider it matched, don't bother calling the rules
266         if (processedAnnots.containsValue(annotString)) {
267 //          Out.prln("Contained string found " + annotString);
268           updateMatches(nameAnnot, annotString);
269           processedAnnots.put(nameAnnot.getId(), annotString);
270           continue;
271         } else if (processedAnnots.isEmpty()) {
272           processedAnnots.put(nameAnnot.getId(), annotString);
273           continue;
274         }
275 
276         //if a person, then remove their title before matching
277         if (nameAnnot.getType().equals(personType))
278           annotString = containTitle(annotString, nameAnnot);
279         else if (nameAnnot.getType().equals(organizationType))
280           annotString = stripCDG(annotString, nameAnnot);
281 
282         //otherwise try matching with previous annotations
283         matchWithPrevious(nameAnnot, annotString);
284 
285 //        Out.prln("Putting in previous " + nameAnnot + ": string " + annotString);
286         //finally add the current annotations to the processed map
287         processedAnnots.put(nameAnnot.getId(), annotString);
288       }//while through name annotations
289 
290     }//while through annotation types
291 
292   }
293 
294   protected void matchUnknown() throws ExecutionException {
295     //get all Unknown annotations
296     AnnotationSet unknownAnnots = nameAllAnnots.get(unknownType);
297 
298     if ((unknownAnnots == null) || unknownAnnots.isEmpty())
299       return;
300 
301     Iterator iter = unknownAnnots.iterator();
302     //loop through the unknown annots
303     while (iter.hasNext()) {
304       Annotation unknown = (Annotation) iter.next();
305 
306       // get string and value
307       String unknownString = null;
308       try {
309           unknownString = document.getContent().getContent(
310             unknown.getStartNode().getOffset(),
311             unknown.getEndNode().getOffset()
312             ).toString();
313         // now do the reg. exp. substitutions
314         unknownString = regularExpressions(unknownString," ", "\\s+");
315       } catch (InvalidOffsetException ioe) {
316           throw new ExecutionException
317                                  ("Invalid offset of the annotation");
318       }
319       //convert to lower case if we are not doing a case sensitive match
320       if (!caseSensitive)
321         unknownString = unknownString.toLowerCase();
322 
323       //get the tokens
324       List tokens = new ArrayList((Set)
325                       nameAllAnnots.get("Token",
326                         unknown.getStartNode().getOffset(),
327                         unknown.getEndNode().getOffset()
328                       ));
329       if (tokens.isEmpty())
330         continue;
331       Collections.sort(tokens, new gate.util.OffsetComparator());
332       tokensMap.put(unknown.getId(), tokens);
333 
334 
335       //first check whether we have not matched such a string already
336       //if so, just consider it matched, don't bother calling the rules
337       if (processedAnnots.containsValue(unknownString)) {
338         Annotation matchedAnnot = updateMatches(unknown, unknownString);
339 //        Out.prln("Matched " + unknown + "with string " + unknownString);
340 //        Out.prln("That's same as " + matchedAnnot);
341         if (matchedAnnot.getType().equals(unknownType)) {
342           annots2Remove.put(unknown.getId(),
343                             annots2Remove.get(matchedAnnot.getId()));
344         }
345         else
346           annots2Remove.put(unknown.getId(), matchedAnnot.getType());
347         processedAnnots.put(unknown.getId(), unknownString);
348         unknown.getFeatures().put("NMRule", unknownType);
349         continue;
350       }
351 
352       matchWithPrevious(unknown, unknownString);
353     } //while though unknowns
354 
355     if (! annots2Remove.isEmpty()) {
356       Iterator unknownIter = annots2Remove.keySet().iterator();
357       while (unknownIter.hasNext()) {
358         Integer unknId = (Integer) unknownIter.next();
359         Annotation unknown = nameAllAnnots.get(unknId);
360         Integer newID = nameAllAnnots.add(
361           unknown.getStartNode(),
362           unknown.getEndNode(),
363           (String) annots2Remove.get(unknId),
364           unknown.getFeatures()
365         );
366         nameAllAnnots.remove(unknown);
367 
368         //change the id in the matches list
369         List mList = (List)unknown.getFeatures().
370                      get(ANNOTATION_COREF_FEATURE_NAME);
371         mList.remove(unknId);
372         mList.add(newID);
373       }//while
374     }//if
375   }
376 
377   protected void matchWithPrevious(Annotation nameAnnot, String annotString) {
378     boolean matchedUnknown = false;
379 
380     Iterator prevIter = processedAnnots.keySet().iterator();
381     while (prevIter.hasNext()) {
382       Integer prevId = (Integer) prevIter.next();
383       Annotation prevAnnot = nameAllAnnots.get(prevId);
384 
385       //check if the two are from the same type or the new one is unknown
386       if (prevAnnot == null || (! prevAnnot.getType().equals(nameAnnot.getType())
387           && ! nameAnnot.getType().equals(unknownType))
388          )
389         continue;
390       //do not compare two unknown annotations either
391       //they are only matched to those of known types
392       if (  nameAnnot.getType().equals(unknownType)
393             && prevAnnot.getType().equals(unknownType))
394       continue;
395 
396       //check if we have already matched this annotation to the new one
397       if (matchedAlready(nameAnnot, prevAnnot) )
398         continue;
399 
400       // determine the title from annotation string
401       //now changed to a rule, here we just match by gender
402       if (prevAnnot.getType().equals(personType)) {
403         String prevGender = (String) prevAnnot.getFeatures().get(GENDER_FEATURE);
404         String nameGender = (String) nameAnnot.getFeatures().get(GENDER_FEATURE);
405         if (   prevGender != null
406             && nameGender != null
407             && ( (nameGender.equalsIgnoreCase("female")
408                   &&
409                   prevGender.equalsIgnoreCase("male")
410                   )
411                ||
412                   (prevGender.equalsIgnoreCase("female")
413                    && nameGender.equalsIgnoreCase("male")
414                   )
415                 )
416             ) //if condition
417           continue; //we don't have a match if the two genders are different
418 
419       }//if
420 
421       //if the two annotations match
422       if (matchAnnotations(nameAnnot, annotString,  prevAnnot)) {
423 //        Out.prln("Matched " + shortName + "and " + longName);
424         updateMatches(nameAnnot, prevAnnot);
425         //if unknown annotation, we need to change to the new type
426         if (nameAnnot.getType().equals(unknownType)) {
427           matchedUnknown = true;
428           if (prevAnnot.getType().equals(unknownType))
429             annots2Remove.put(nameAnnot.getId(),
430                               annots2Remove.get(prevAnnot.getId()));
431           else
432             annots2Remove.put(nameAnnot.getId(), prevAnnot.getType());
433          //also put an attribute to indicate that
434           nameAnnot.getFeatures().put("NMRule", unknownType);
435         }//if unknown
436         break; //no need to match further
437       }//if annotations matched
438 
439     }//while through previous annotations
440 
441     if (matchedUnknown)
442       processedAnnots.put(nameAnnot.getId(), annotString);
443 
444 
445   }//matchWithPrevious
446 
447   protected boolean matchAnnotations(Annotation newAnnot, String annotString,
448                                      Annotation prevAnnot) {
449 
450     // find which annotation string of the two is longer
451     //  this is useful for some of the matching rules
452     String prevAnnotString = (String) processedAnnots.get(prevAnnot.getId());
453 
454     String longName = prevAnnotString;
455     String shortName = annotString;
456     longAnnot = prevAnnot;
457     shortAnnot = newAnnot;
458 
459     if (shortName.length()>=longName.length()) {
460       String temp = longName;
461       longName = shortName;
462       shortName = temp;
463       Annotation tempAnn = longAnnot;
464       longAnnot = shortAnnot;
465       shortAnnot = tempAnn;
466     }//if
467 
468     tokensLongAnnot = (ArrayList) tokensMap.get(longAnnot.getId());
469     tokensShortAnnot = (ArrayList) tokensMap.get(shortAnnot.getId());
470 
471     List matchesList = (List) prevAnnot.getFeatures().
472                               get(ANNOTATION_COREF_FEATURE_NAME);
473     if (matchesList == null || matchesList.isEmpty())
474       return apply_rules_namematch(prevAnnot.getType(), shortName,longName);
475 
476     //if these two match, then let's see if all the other matching one will too
477     //that's needed, because sometimes names can share a token (e.g., first or
478     //last but not be the same
479     if (apply_rules_namematch(prevAnnot.getType(), shortName,longName)) {
480       List toMatchList = new ArrayList(matchesList);
481 //      if (newAnnot.getType().equals(unknownType))
482 //        Out.prln("Matching new " + annotString + " with annots " + toMatchList);
483       toMatchList.remove(prevAnnot.getId());
484 
485       /**
486        * Check whether we need to ensure that there is a match with the rest
487        * of the matching annotations, because the rule requires that
488        * transtivity is not assummed.
489        */
490       if (allMatchingNeeded) {
491         allMatchingNeeded = false;
492         return matchOtherAnnots(toMatchList, newAnnot, annotString);
493       } else
494         return true;
495     }
496     return false;
497   }
498 
499   /** This method checkes whether the new annotation matches
500    *  all annotations given in the toMatchList (it contains ids)
501    *  The idea is that the new annotation needs to match all those,
502    *  because assuming transitivity does not always work, when
503    *  two different entities share a common token: e.g., BT Cellnet
504    *  and BT and British Telecom.
505   */
506   protected boolean matchOtherAnnots( List toMatchList, Annotation newAnnot,
507                                       String annotString) {
508 
509     //if the list is empty, then we're matching all right :-)
510     if (toMatchList.isEmpty())
511       return true;
512 
513     boolean matchedAll = true;
514     int i = 0;
515 
516     while (matchedAll && i < toMatchList.size()) {
517       Annotation prevAnnot = nameAllAnnots.get((Integer) toMatchList.get(i));
518 
519       // find which annotation string of the two is longer
520       //  this is useful for some of the matching rules
521       String prevAnnotString = (String) processedAnnots.get(prevAnnot.getId());
522       if (prevAnnotString == null)
523         try {
524           prevAnnotString = document.getContent().getContent(
525             prevAnnot.getStartNode().getOffset(),
526             prevAnnot.getEndNode().getOffset()
527             ).toString();
528         } catch (InvalidOffsetException ioe) {
529           return false;
530         }//try
531 
532 
533       String longName = prevAnnotString;
534       String shortName = annotString;
535       longAnnot = prevAnnot;
536       shortAnnot = newAnnot;
537 
538       if (shortName.length()>=longName.length()) {
539         String temp = longName;
540         longName = shortName;
541         shortName = temp;
542         Annotation tempAnn = longAnnot;
543         longAnnot = shortAnnot;
544         shortAnnot = tempAnn;
545       }//if
546 
547       tokensLongAnnot = (ArrayList) tokensMap.get(longAnnot.getId());
548       tokensShortAnnot = (ArrayList) tokensMap.get(shortAnnot.getId());
549 
550       matchedAll = apply_rules_namematch(prevAnnot.getType(), shortName,longName);
551 //      if (newAnnot.getType().equals(unknownType))
552 //        Out.prln("Loop: " + shortName + " and " + longName + ": result: " + matchedAll);
553 
554       i++;
555     }//while
556     return matchedAll;
557   }
558 
559 
560   protected boolean matchedAlready(Annotation annot1, Annotation annot2) {
561     //the two annotations are already matched if the matches list of the first
562     //contains the id of the second
563     List matchesList = (List) annot1.getFeatures().
564                        get(ANNOTATION_COREF_FEATURE_NAME);
565     if ((matchesList == null) || matchesList.isEmpty())
566       return false;
567     else if (matchesList.contains(annot2.getId()))
568       return true;
569     return false;
570   }
571 
572   protected Annotation updateMatches(Annotation newAnnot, String annotString) {
573     Annotation matchedAnnot = null;
574     Integer id;
575 
576     //first find a processed annotation with the same string
577     Iterator iter = processedAnnots.keySet().iterator();
578     while (iter.hasNext()) {
579       id = (Integer) iter.next();
580       String oldString = (String) processedAnnots.get(id);
581       if (annotString.equals(oldString)) {
582         matchedAnnot = nameAllAnnots.get(id);
583         break;
584       }//if
585     }//while
586 
587     if (matchedAnnot == null) return null;
588     //if the two matching annotations are of different type which is not
589     //unknown, do not match them
590     if (! matchedAnnot.getType().equals(newAnnot.getType())
591         && !newAnnot.getType().equals(unknownType) )
592       return matchedAnnot;
593 
594     List matchesList = (List) matchedAnnot.getFeatures().
595                        get(ANNOTATION_COREF_FEATURE_NAME);
596     if ((matchesList == null) || matchesList.isEmpty()) {
597       //no previous matches, so need to add
598       if (matchesList == null) {
599         matchesList = new ArrayList();
600         matchedAnnot.getFeatures().put(ANNOTATION_COREF_FEATURE_NAME,
601                                        matchesList);
602         matchesDocFeature.add(matchesList);
603       }//if
604       matchesList.add(matchedAnnot.getId());
605       matchesList.add(newAnnot.getId());
606     } else {
607       //just add the new annotation
608       matchesList.add(newAnnot.getId());
609     }//if
610     //add the matches list to the new annotation
611     newAnnot.getFeatures().put(ANNOTATION_COREF_FEATURE_NAME, matchesList);
612     return matchedAnnot;
613   }
614 
615   protected void updateMatches(Annotation newAnnot, Annotation prevAnnot) {
616 
617     List matchesList = (List) prevAnnot.getFeatures().
618                               get(ANNOTATION_COREF_FEATURE_NAME);
619     if ((matchesList == null) || matchesList.isEmpty()) {
620       //no previous matches, so need to add
621       if (matchesList == null) {
622         matchesList = new ArrayList();
623         prevAnnot.getFeatures().put(ANNOTATION_COREF_FEATURE_NAME, matchesList);
624         matchesDocFeature.add(matchesList);
625       }//if
626       matchesList.add(prevAnnot.getId());
627       matchesList.add(newAnnot.getId());
628     } else {
629       //just add the new annotation
630       matchesList.add(newAnnot.getId());
631     }//if
632     //add the matches list to the new annotation
633     newAnnot.getFeatures().put(ANNOTATION_COREF_FEATURE_NAME, matchesList);
634     //propagate the gender if two persons are matched
635     if (prevAnnot.getType().equals(personType)) {
636       String prevGender = (String) prevAnnot.getFeatures().get(GENDER_FEATURE);
637       String newGender = (String) newAnnot.getFeatures().get(GENDER_FEATURE);
638       boolean unknownPrevGender = isUnknownGender(prevGender);
639       boolean unknownNewGender = isUnknownGender(newGender);
640       if (unknownPrevGender && !unknownNewGender)
641         prevAnnot.getFeatures().put(GENDER_FEATURE, newGender);
642       else if (unknownNewGender && !unknownPrevGender)
643         newAnnot.getFeatures().put(GENDER_FEATURE, prevGender);
644     }//if
645   }
646 
647 
648   protected void docCleanup() {
649     Object matchesValue = document.getFeatures().get(DOCUMENT_COREF_FEATURE_NAME);
650     if (matchesValue != null && (matchesValue instanceof Map))
651       ((Map)matchesValue).remove(nameAllAnnots.getName());
652     else if (matchesValue != null)
653       document.getFeatures().remove(DOCUMENT_COREF_FEATURE_NAME);
654 
655     //get all annotations that have a matches feature
656     HashSet fNames = new HashSet();
657     fNames.add(ANNOTATION_COREF_FEATURE_NAME);
658     AnnotationSet annots =
659                   nameAllAnnots.get(null, fNames);
660 
661 //    Out.prln("Annots to cleanup" + annots);
662 
663     if (annots == null || annots.isEmpty())
664       return;
665 
666     Iterator iter = annots.iterator();
667     while (iter.hasNext()) {
668       while (iter.hasNext())
669         ((Annotation) iter.next()).getFeatures().
670                                    remove(ANNOTATION_COREF_FEATURE_NAME);
671     } //while
672   }//cleanup
673 
674   /** return a person name without title */
675   protected String containTitle (String annotString, Annotation annot)
676                       throws ExecutionException {
677     // get the offsets
678     Long startAnnot = annot.getStartNode().getOffset();
679     Long endAnnot = annot.getEndNode().getOffset();
680 
681     // determine "Lookup" annotation set
682     queryFM.clear();
683     queryFM.put("majorType", "title");
684     AnnotationSet as =
685       nameAllAnnots.get(startAnnot,endAnnot).get("Lookup", queryFM);
686     if (as !=null && ! as.isEmpty()) {
687       List titles = new ArrayList((Set)as);
688       Collections.sort(titles, new gate.util.OffsetComparator());
689 
690       Iterator iter = titles.iterator();
691       while (iter.hasNext()) {
692         Annotation titleAnn = (Annotation)(iter.next());
693 
694         //we've not found a title at the start offset,
695         //there's no point in looking further
696         //coz titles come first
697         if (titleAnn.getStartNode().getOffset().compareTo(startAnnot) != 0)
698           return annotString;
699 
700         try {
701           // the title from the current annotation
702           String annotTitle =
703             document.getContent().getContent(
704               titleAnn.getStartNode().getOffset(),
705               titleAnn.getEndNode().getOffset()
706             ).toString();
707 
708           // eliminate the title from annotation string and return the result
709           if (annotTitle.length()<annotString.length()) {
710             //remove from the array of tokens, so then we can compare properly
711             //the remaining tokens
712 //            Out.prln("Removing title from: " + annot + " with string " + annotString);
713 //            Out.prln("Tokens are" + tokensMap.get(annot.getId()));
714 //            Out.prln("Title is" + annotTitle);
715             ((ArrayList) tokensMap.get(annot.getId())).remove(0);
716             return annotString.substring(
717                                  annotTitle.length()+1,annotString.length());
718           }
719         } catch (InvalidOffsetException ioe) {
720             throw new ExecutionException
721                                ("Invalid offset of the annotation");
722         }//try
723       }// while
724     }//if
725     return annotString;
726 
727   }
728 
729   /** return an organization  without a designator and starting The*/
730   protected String stripCDG (String annotString, Annotation annot){
731 
732     ArrayList tokens = (ArrayList) tokensMap.get(annot.getId());
733 
734     //strip starting The first
735     if ( ((String) ((Annotation) tokens.get(0)
736           ).getFeatures().get(STRING_FEATURE)).equalsIgnoreCase(THE_VALUE))
737       tokens.remove(0);
738 
739     //no need to check for cdg if there is only 1 token or less
740     if (tokens.size()<2 && cdg.contains(((Annotation) tokens.get(tokens.size()-1)
741           ).getFeatures().get(STRING_FEATURE)) )
742       tokens.remove(tokens.size()-1);
743 
744     StringBuffer newString = new StringBuffer(50);
745     for (int i = 0; i < tokens.size(); i++){
746       newString.append((String) ((Annotation) tokens.get(i)
747           ).getFeatures().get(STRING_FEATURE) );
748       if (i != tokens.size()-1)
749         newString.append(" ");
750     }
751 
752     if (caseSensitive)
753       return newString.toString();
754 
755     return newString.toString().toLowerCase();
756   }
757 
758 /*
759   public void check() throws ExecutionException {
760     if (executionException != null) {
761       ExecutionException e = executionException;
762       executionException = null;
763       throw e;
764     }
765   } // check()
766 */
767 
768   /** if ( == false) then reads the names of files in order
769     *  to create the lookup tables
770     */
771   protected void createLists() throws IOException {
772     InputStream inputStream = Files.getGateResourceAsStream(
773                                               "creole/namematcher/listsNM.def");
774     InputStreamReader inputStreamReader = new InputStreamReader (
775                                                     inputStream);
776     BufferedReader bufferedReader = new BufferedReader(inputStreamReader);
777 
778     String lineRead = null;
779     while ((lineRead = bufferedReader.readLine()) != null){
780       int index = lineRead.indexOf(":");
781       if (index != -1){
782         String nameFile = lineRead.substring(0,index);
783         String nameList = lineRead.substring(index+1,lineRead.length());
784         createAnnotList(nameFile,nameList);
785       }// if
786     }//while
787     bufferedReader.close();
788     inputStreamReader.close();
789     inputStream.close();
790   }// createLists()
791 
792   /** creates the lookup tables */
793   protected void createAnnotList(String nameFile,String nameList)
794                                                           throws IOException{
795     InputStream inputStream = Files.getGateResourceAsStream(
796                                               "creole/namematcher/"+nameFile);
797     InputStreamReader inputStreamReader = new InputStreamReader (
798                                                     inputStream);
799     BufferedReader bufferedReader = new BufferedReader(inputStreamReader);
800 
801     String lineRead = null;
802     while ((lineRead = bufferedReader.readLine()) != null){
803       if (nameList.compareTo(CDGLISTNAME)==0){
804         if (caseSensitive)
805           cdg.add(lineRead);
806         else
807           cdg.add(lineRead.toLowerCase());
808       }// if
809       else {
810         int index = lineRead.indexOf("£");
811         if (index != -1){
812           String  expr = lineRead.substring(0,index);
813           //if not case-sensitive, we need to downcase all strings
814           if (!caseSensitive)
815             expr = expr.toLowerCase();
816           String code = lineRead.substring(index+1,lineRead.length());
817           if (nameList.equals(ALIASLISTNAME))
818                             alias.put(expr, code);
819           else
820           if (nameList.equals(ARTLISTNAME))
821                             def_art.put(expr, code);
822           else
823           if (nameList.equals(PREPLISTNAME))
824                             prepos.put(expr, code);
825           else
826           if (nameList.equals(CONNECTORLISTNAME))
827                             connector.put(expr, code);
828           else
829           if (nameList.equals(SPURLISTNAME))
830                             spur_match.put(expr, code);
831 
832         }//if
833       }// else
834 
835     }//while
836   }//createAnnotList
837 
838 
839   /** apply_rules_namematch: apply rules similarly to lasie1.5's namematch */
840   private boolean apply_rules_namematch(String annotationType, String shortName,
841                                         String longName) {
842     // first apply rule for spurius matches i.e. rule0
843     if (matchRule0(longName, shortName))
844       return false;
845     if (
846          (// rules for all annotations
847           //no longer use rule1, coz I do the check for same string via the
848           //hash table
849             matchRule2(longName, shortName)
850          ||
851             matchRule3(longName, shortName)
852          ) // rules for all annotations
853          ||
854          (// rules for organisation annotations
855              ( annotationType.equals(organizationType)
856                //ACE addition
857                || annotationType.equals("Facility"))
858              &&
859              (    matchRule4(longName, shortName)
860                ||
861                   matchRule5(longName, shortName)
862                ||
863                   matchRule6(longName, shortName)
864                ||
865                   matchRule7(longName, shortName)
866                ||
867 //                  matchRule8(longName, shortName)
868 //               ||
869                   matchRule9(longName, shortName)
870                ||
871                   matchRule10(longName, shortName)
872                ||
873                   matchRule11(longName, shortName)
874                ||
875                   matchRule12(longName, shortName)
876                ||
877                   matchRule13(shortName, longName)
878               )
879            )// rules for organisation annotations
880          ||
881          (// rules for person annotations
882              (    annotationType.equals(personType))
883                &&
884              (    matchRule4(longName, shortName)
885                ||
886                   matchRule5(longName, shortName)
887                ||
888                   matchRule14(longName, shortName)
889                || //kalina: added this, so it matches names when contain more
890                   //than one first and one last name
891                   matchRule15(longName, shortName)
892               )
893           )// rules for person annotations
894          ) //if
895       return true;
896     return false;
897   }//apply_rules
898 
899 
900   /** set the extLists flag */
901   public void setExtLists(Boolean newExtLists) {
902     extLists = newExtLists.booleanValue();
903   }//setextLists
904 
905   /** set the caseSensitive flag */
906   public void setCaseSensitive(Boolean newCase) {
907     caseSensitive = newCase.booleanValue();
908   }//setextLists
909 
910   /** set the annotation set name*/
911   public void setAnnotationSetName(String newAnnotationSetName) {
912     annotationSetName = newAnnotationSetName;
913   }//setAnnotationSetName
914 
915   /** set the types of the annotations*/
916   public void setAnnotationTypes(List newType) {
917     annotationTypes = newType;
918   }//setAnnotationTypes
919 
920   public void setOrganizationType(String newOrganizationType) {
921     organizationType = newOrganizationType;
922   }//setOrganizationType
923 
924   public void setPersonType(String newPersonType) {
925     personType = newPersonType;
926   }//setPersonType
927 
928   /**get the name of the annotation set*/
929   public String getAnnotationSetName() {
930     return annotationSetName;
931   }//getAnnotationSetName
932 
933   /** get the types of the annotation*/
934   public List getAnnotationTypes() {
935     return annotationTypes;
936   }//getAnnotationTypes
937 
938   public String getOrganizationType() {
939     return organizationType;
940   }
941 
942   public String getPersonType() {
943     return personType;
944   }
945 
946   public Boolean getExtLists() {
947     return new Boolean(extLists);
948   }
949 
950   public Boolean getCaseSensitive() {
951     return new Boolean(caseSensitive);
952   }
953 
954 /*
955   public List getMatchesDocument() {
956     return matchesDocument;
957   }
958 */
959 
960   protected boolean isUnknownGender(String gender) {
961     if (gender == null)
962       return true;
963     if (gender.equalsIgnoreCase("male") || gender.equalsIgnoreCase("female"))
964       return false;
965     return true;
966 
967   } //isUnknownGender
968 
969   /** RULE #0: If the two names are listed in table of
970     * spurius matches then they do NOT match
971     * Condition(s): -
972     * Applied to: all name annotations
973     */
974   public boolean matchRule0(String s1,
975            String s2) {
976     if (spur_match.containsKey(s1)
977         && spur_match.containsKey(s2) )
978       return
979         spur_match.get(s1).toString().equals(spur_match.get(s2).toString());
980 
981     return false;
982   }//matchRule0
983 
984   /** RULE #1: If the two names are identical then they are the same
985     * no longer used, because I do the check for same string via the
986     * hash table of previous annotations
987     * Condition(s): depend on case
988     * Applied to: all name annotations
989     */
990   public boolean matchRule1(String s1,
991            String s2,
992            boolean matchCase) {
993 //    Out.prln("Rule1: Matching " + s1 + "and " + s2);
994 
995     boolean matched = false;
996     if (!matchCase)
997         matched = s1.equalsIgnoreCase(s2);
998     else matched =  s1.equals(s2) ;
999 //kalina: do not remove, nice for debug
1000//    if (matched && (s2.equalsIgnoreCase("m") || s1.equalsIgnoreCase("m")))
1001//        Out.prln("Rule1: Matched " + s1 + "and " + s2);
1002    return matched;
1003  }//matchRule1
1004
1005
1006  /**
1007    * RULE #2: if the two names are listed as equivalent in the
1008    * lookup table (alias) then they match
1009    * Condition(s): -
1010    * Applied to: all name annotations
1011    */
1012  public boolean matchRule2(String s1,
1013           String s2) {
1014
1015    if (alias.containsKey(s1) && alias.containsKey(s2))
1016      return (alias.get(s1).toString().equals(alias.get(s2).toString()));
1017
1018    return false;
1019  }//matchRule2
1020
1021  /**
1022    * RULE #3: adding a possessive at the end
1023    * of one name causes a match
1024    * e.g. "Standard and Poor" == "Standard and Poor's"
1025    * and also "Standard and Poor" == "Standard's"
1026    * Condition(s): case-insensitive match
1027    * Applied to: all name annotations
1028    */
1029  public boolean matchRule3(String s1, //long string
1030                             String s2) { //short string
1031
1032    if (s2.endsWith("'s") || s2.endsWith("'")
1033        ||(s1.endsWith("'s")|| s1.endsWith("'"))) {
1034
1035
1036      String s2_poss = null;
1037
1038      if (!s2.endsWith("'s")) s2_poss = s2.concat("'s");
1039      else s2_poss = s2.concat("'");
1040
1041      if (s2_poss != null && matchRule1(s1, s2_poss,caseSensitive)) return true;
1042
1043      // now check the second case i.e. "Standard and Poor" == "Standard's"
1044      String token = (String)
1045        ((Annotation) tokensLongAnnot.get(0)).getFeatures().get(STRING_FEATURE);
1046
1047      if (!token.endsWith("'s")) s2_poss = token.concat("'s");
1048      else s2_poss = token.concat("'");
1049
1050      if (s2_poss != null && matchRule1(s2_poss,s2,caseSensitive)) return true;
1051
1052    } // if (s2.endsWith("'s")
1053    return false;
1054  }//matchRule3
1055
1056  /**
1057    * RULE #4: Do all tokens other than the punctuation marks
1058    * , and . match?
1059    * e.g. "Smith, Jones" == "Smith Jones"
1060    * Condition(s): case-insensitive match
1061    * Applied to: organisation and person annotations
1062    */
1063  public boolean matchRule4(String s1,
1064           String s2) {
1065
1066    boolean allTokensMatch = true;
1067
1068    Iterator tokensLongAnnotIter = tokensLongAnnot.iterator();
1069    Iterator tokensShortAnnotIter = tokensShortAnnot.iterator();
1070    while (tokensLongAnnotIter.hasNext() && tokensShortAnnotIter.hasNext()) {
1071      Annotation token = (Annotation) tokensLongAnnotIter.next();
1072      if (((String)token.getFeatures().get(KIND_FEATURE)).equals("punctuation"))
1073        continue;
1074//      Out.prln("Matching" + tokensLongAnnot + " with " + tokensShortAnnot);
1075      if (! token.getFeatures().get(STRING_FEATURE).equals(
1076             ((Annotation) tokensShortAnnotIter.next()).getFeatures().get(STRING_FEATURE))) {
1077        allTokensMatch = false;
1078        break;
1079      } // if (!tokensLongAnnot.nextToken()
1080    } // while
1081//    Out.prln("result is: " + allTokensMatch);
1082    return allTokensMatch;
1083  }//matchRule4
1084
1085  /**
1086    * RULE #5: if the 1st token of one name
1087    * matches the second name
1088    * e.g. "Pepsi Cola" == "Pepsi"
1089    * Condition(s): case-insensitive match
1090    * Applied to: all name annotations
1091    */
1092  public boolean matchRule5(String s1,
1093           String s2) {
1094
1095    //do not match numbers by this rule
1096    if (tokensLongAnnot.size()> 1 &&
1097        ((Annotation) tokensLongAnnot.get(0)).getFeatures().get("kind").equals("number"))
1098      return false;
1099
1100//    if (s1.equalsIgnoreCase("chin") || s2.equalsIgnoreCase("chin"))
1101//    Out.prln("Rule 5: " + s1 + "and " + s2);
1102    if (tokensLongAnnot.size()>1)
1103      return matchRule1((String)
1104                      ((Annotation) tokensLongAnnot.get(0)).getFeatures().get(STRING_FEATURE),
1105                      s2,
1106                      caseSensitive);
1107
1108    return false;
1109
1110  }//matchRule5
1111
1112  /**
1113    * RULE #6: if one name is the acronym of the other
1114    * e.g. "Imperial Chemical Industries" == "ICI"
1115    * Applied to: organisation annotations only
1116    */
1117  public boolean matchRule6(String s1,
1118           String s2) {
1119
1120    int i = 0;
1121
1122    //check and if the shorted string has a space in it, then it's not
1123    //an acronym
1124    if (s2.indexOf(" ") > 0)
1125      return false;
1126
1127    //Out.prln("Acronym: Matching " + s1 + "and " + s2);
1128    StringBuffer acronym_s1 = new StringBuffer("");
1129    StringBuffer acronymDot_s1 = new StringBuffer("");
1130
1131    for ( ;i < tokensLongAnnot.size(); i++ ) {
1132      String toAppend = ( (String) ((Annotation) tokensLongAnnot.get(i)
1133                         ).getFeatures().get(STRING_FEATURE)).substring(0,1);
1134      acronym_s1.append(toAppend);
1135      acronymDot_s1.append(toAppend);
1136      acronymDot_s1.append(".");
1137    }
1138
1139    //Out.prln("Acronym dot: To Match " + acronymDot_s1 + "and " + s2);
1140    //Out.prln("Result: " + matchRule1(acronymDot_s1.toString(),s2,caseSensitive));
1141
1142    if (matchRule1(acronym_s1.toString(),s2,caseSensitive) ||
1143        matchRule1(acronymDot_s1.toString(),s2,caseSensitive) )
1144      return true;
1145
1146    return false;
1147  }//matchRule6
1148
1149  /**
1150    * RULE #7: if one of the tokens in one of the
1151    * names is in the list of separators eg. "&"
1152    * then check if the token before the separator
1153    * matches the other name
1154    * e.g. "R.H. Macy & Co." == "Macy"
1155    * Condition(s): case-sensitive match
1156    * Applied to: organisation annotations only
1157    */
1158  public boolean matchRule7(String s1,
1159           String s2) {
1160
1161    //don't try it unless the second string is just one token
1162    if (tokensShortAnnot.size() != 1)
1163      return false;
1164
1165    String previous_token = null;
1166
1167    for (int i = 0;  i < tokensLongAnnot.size(); i++ ) {
1168      if (connector.containsKey( ((Annotation) tokensLongAnnot.get(i)
1169          ).getFeatures().get(STRING_FEATURE) )) {
1170        previous_token = (String) ((Annotation) tokensLongAnnot.get(i-1)
1171                                    ).getFeatures().get(STRING_FEATURE);
1172
1173        break;
1174      }
1175    }
1176
1177    //now match previous_token with other name
1178    if (previous_token != null) {
1179//      if (s1.equalsIgnoreCase("chin") || s2.equalsIgnoreCase("chin"))
1180//        Out.prln("Rule7");
1181      return matchRule1(previous_token,s2,caseSensitive);
1182
1183    }
1184    return false;
1185  }//matchRule7
1186
1187  /**
1188   * This rule is now obsolete, as The and the trailing CDG
1189   * are stripped before matching.
1190   * DO NOT CALL!!!
1191   *
1192    * RULE #8: if the names match, ignoring The and
1193    * and trailing company designator (which have already been stripped)
1194    * e.g. "The Magic Tricks Co." == "Magic Tricks"
1195    * Condition(s): case-sensitive match
1196    * Applied to: organisation annotations only
1197    */
1198  public boolean matchRule8(String s1,
1199           String s2) {
1200    Out.prln("OrthoMatcher warning: This rule has been discontinued!");
1201/*
1202    if (s1.startsWith("The ")) s1 = s1.substring(4);
1203    if (s2.startsWith("The ")) s2 = s2.substring(4);
1204
1205    // check that cdg is not empty
1206    if (!cdg.isEmpty()) {
1207      String stringToTokenize1 = s1;
1208      StringTokenizer tokensLongAnnot = new StringTokenizer(stringToTokenize1," ");
1209
1210      String stringToTokenize2 = s2;
1211      StringTokenizer tokensShortAnnot = new StringTokenizer(stringToTokenize2," ");
1212      String token = null;
1213      String cdg1 = null;
1214      String cdg2 = null;
1215
1216      s1 = "";
1217      s2 = "";
1218
1219      //check last token of s1
1220      while (tokensLongAnnot.hasMoreTokens()) {
1221        token = tokensLongAnnot.nextToken();
1222        if (!tokensLongAnnot.hasMoreTokens()
1223            && cdg.contains(token)) cdg1=token;
1224        else s1 = s1+token;
1225      }
1226
1227      // do the same for s2
1228      while (tokensShortAnnot.hasMoreTokens()) {
1229        token = tokensShortAnnot.nextToken();
1230        if (!tokensShortAnnot.hasMoreTokens()
1231          && cdg.contains(token)) cdg2=token;
1232        else s2 = s2+token;
1233      }
1234
1235      // if the company designators are different
1236      // then they are NOT the same organisations
1237      if ((cdg1!=null && cdg2!=null)
1238    && !cdg1.equalsIgnoreCase(cdg2)) return false;
1239    }
1240    if (!s1.equals("") && !s2.equals("")) return matchRule1(s1,s2,caseSensitive);
1241*/
1242    return false;
1243
1244  }//matchRule8
1245
1246  /**
1247    * RULE #9: does one of the names match the token
1248    * just before a trailing company designator
1249    * in the other name?
1250    * The company designator has already been chopped off,
1251    * so the token before it, is in fact the last token
1252    * e.g. "R.H. Macy Co." == "Macy"
1253    * Applied to: organisation annotations only
1254    */
1255  public boolean matchRule9(String s1,
1256           String s2) {
1257
1258//    if (s1.equalsIgnoreCase("news") || s2.equalsIgnoreCase("news"))
1259//      Out.prln("Rule 9 " + s1 + " and " + s2);
1260    String s1_short = (String)
1261                      ((Annotation) tokensLongAnnot.get(
1262                          tokensLongAnnot.size()-1)).getFeatures().get(STRING_FEATURE);
1263//    Out.prln("Converted to " + s1_short);
1264    if (tokensLongAnnot.size()>1) {
1265      boolean matched = matchRule1(s1_short, s2, caseSensitive);
1266      //we need to make sure all names match, instead of assuming transitivity,
1267      //to avoid matching BBC News with News then News with ITV News, which
1268      //by transitivity leads to BBC News matching ITV News which is not what
1269      //we want
1270      if (matched)
1271        allMatchingNeeded = true;
1272      return matched;
1273    } //if
1274
1275    return false;
1276  }//matchRule9
1277
1278  /**
1279    * RULE #10: is one name the reverse of the other
1280    * reversing around prepositions only?
1281    * e.g. "Department of Defence" == "Defence Department"
1282    * Condition(s): case-sensitive match
1283    * Applied to: organisation annotations only
1284    */
1285  public boolean matchRule10(String s1,
1286            String s2) {
1287
1288    String token = null;
1289    String previous_token = null;
1290    String next_token = null;
1291    boolean invoke_rule=false;
1292
1293    if (tokensLongAnnot.size() >= 3
1294        && tokensShortAnnot.size() >= 2) {
1295
1296      // first get the tokens before and after the preposition
1297      int i = 0;
1298      for (; i< tokensLongAnnot.size(); i++) {
1299        token = (String)
1300                  ((Annotation) tokensLongAnnot.get(i)).getFeatures().get(STRING_FEATURE);
1301        if (prepos.containsKey(token)) {
1302          invoke_rule=true;
1303          break;
1304        }//if
1305        previous_token = token;
1306      }//while
1307
1308      if (! invoke_rule)
1309        return false;
1310
1311      if (i < tokensLongAnnot.size()
1312          && previous_token != null)
1313        next_token= (String)
1314                    ((Annotation) tokensLongAnnot.get(i++)).getFeatures().get(STRING_FEATURE);
1315      else return false;
1316
1317      String s21 = (String)
1318                    ((Annotation) tokensShortAnnot.get(0)).getFeatures().get(STRING_FEATURE);
1319      String s22 = (String)
1320                    ((Annotation) tokensShortAnnot.get(1)).getFeatures().get(STRING_FEATURE);
1321      // then compare (in reverse) with the first two tokens of s2
1322      if (matchRule1(next_token,(String) s21,caseSensitive)
1323          && matchRule1(previous_token, s22,caseSensitive))
1324        return true ;
1325    }//if (tokensLongAnnot.countTokens() >= 3
1326    return false;
1327  }//matchRule10
1328
1329  /**
1330    * RULE #11: does one name consist of contractions
1331    * of the first two tokens of the other name?
1332    * e.g. "Communications Satellite" == "ComSat"
1333    * and "Pan American" == "Pan Am"
1334    * Condition(s): case-sensitive match
1335    * Applied to: organisation annotations only
1336    */
1337  public boolean matchRule11(String s1,
1338            String s2) {
1339
1340
1341    // first do the easy case e.g. "Pan American" == "Pan Am"
1342
1343    String token11 = null;
1344    String token12 = null;
1345    String token21 = null;
1346    String token22 = null;
1347
1348    if (tokensLongAnnot.size() < 2)
1349      return false;
1350
1351    // 1st get the first two tokens of s1
1352    token11 = (String)
1353                ((Annotation) tokensLongAnnot.get(0)).getFeatures().get(STRING_FEATURE);
1354    token12 = (String)
1355                ((Annotation) tokensLongAnnot.get(1)).getFeatures().get(STRING_FEATURE);
1356
1357    // now check for the first case i.e. "Pan American" == "Pan Am"
1358    if (tokensShortAnnot.size() == 2)  {
1359
1360      token21 = (String)
1361                  ((Annotation) tokensShortAnnot.get(0)).getFeatures().get(STRING_FEATURE);
1362      token22 = (String)
1363                  ((Annotation) tokensShortAnnot.get(0)).getFeatures().get(STRING_FEATURE);
1364
1365      if (token11.startsWith(token21)
1366          && token12.startsWith(token22))
1367        return true;
1368
1369    } // if (tokensShortAnnot.countTokens() == 2)
1370
1371    // now the second case e.g.  "Communications Satellite" == "ComSat"
1372    else if (tokensShortAnnot.size()==1 && s2.length()>=3) {
1373
1374      // split the token into possible contractions
1375      // ignore case for matching
1376      for (int i=2;i<s2.length();i++) {
1377        token21=s2.substring(0,i+1);
1378        token22=s2.substring(i+1);
1379
1380        if (token11.startsWith(token21)
1381            && token12.startsWith(token22))
1382          return true;
1383      }// for
1384    } // else if
1385
1386    return false;
1387  }//matchRule11
1388
1389  /**
1390    * RULE #12: do the first and last tokens of one name
1391    * match the first and last tokens of the other?
1392    * Condition(s): case-sensitive match
1393    * Applied to: organisation annotations only
1394    */
1395  public boolean matchRule12(String s1,
1396            String s2) {
1397
1398    // first do the easy case e.g. "Pan American" == "Pan Am"
1399
1400    if (tokensLongAnnot.size()>1 && tokensShortAnnot.size()>1) {
1401//     Out.prln("Rule 12");
1402
1403      // get first and last tokens of s1 & s2
1404      String s1_first = (String)
1405                     ((Annotation) tokensLongAnnot.get(0)).getFeatures().get(STRING_FEATURE);
1406      String s2_first = (String)
1407                     ((Annotation) tokensShortAnnot.get(0)).getFeatures().get(STRING_FEATURE);
1408
1409      if (!matchRule1(s1_first,s2_first,caseSensitive))
1410        return false;
1411
1412      String s1_last = (String)
1413         ((Annotation) tokensLongAnnot.get(tokensLongAnnot.size()-1)).getFeatures().get(STRING_FEATURE);
1414      String s2_last = (String)
1415         ((Annotation) tokensShortAnnot.get(tokensShortAnnot.size()-1)).getFeatures().get(STRING_FEATURE);
1416
1417      return matchRule1(s1_last,s2_last,caseSensitive);
1418    } // if (tokensLongAnnot.countTokens()>1
1419    return false;
1420  }//matchRule12
1421
1422  /**
1423    * RULE #13: do multi-word names match except for
1424    * one token e.g.
1425    * "Second Force Recon Company" == "Force Recon Company"
1426    * Note that this rule has NOT been used in LaSIE's 1.5
1427    * namematcher
1428    * Restrictions: - remove cdg first
1429    *               - shortest name should be 2 words or more
1430    *               - if N is the number of tokens of the longest
1431    *                 name, then N-1 tokens should be matched
1432    * Condition(s): case-sensitive match
1433    * Applied to: organisation or person annotations only
1434    */
1435  public boolean matchRule13(String s1,
1436            String s2) {
1437
1438
1439    String token1 = null;
1440    String token2 = null;
1441
1442    int matched_tokens = 0, mismatches = 0;;
1443
1444    // if names < 2 words then rule is invalid
1445    if (tokensLongAnnot.size() < 3 || tokensShortAnnot.size() < 2) return false;
1446
1447//    if (s1.equalsIgnoreCase("chin") || s2.equalsIgnoreCase("chin")) {
1448//      Out.prln("Rule 13: Matching tokens" + tokensLongAnnot);
1449//      Out.prln("with tokens " + tokensShortAnnot);
1450//    }
1451
1452    // now do the matching
1453    for (int i=0,j= 0; i < tokensShortAnnot.size() && mismatches < 2; i++) {
1454
1455//      Out.prln("i = " + i);
1456//      Out.prln("j = " + j);
1457      if ( ((Annotation) tokensLongAnnot.get(j)).getFeatures().get(STRING_FEATURE).equals(
1458           ((Annotation) tokensShortAnnot.get(i)).getFeatures().get(STRING_FEATURE)) ) {
1459        matched_tokens++;
1460        j++;
1461      } else
1462        mismatches++;
1463    } // for
1464
1465    if (matched_tokens >= tokensLongAnnot.size()-1)
1466      return true;
1467
1468    return false;
1469  }//matchRule13
1470
1471  /**
1472    * RULE #14: if the last token of one name
1473    * matches the second name
1474    * e.g. "Hamish Cunningham" == "Cunningham"
1475    * Condition(s): case-insensitive match
1476    * Applied to: all person annotations
1477    */
1478  public boolean matchRule14(String s1,
1479           String s2) {
1480
1481//    if (s1.equalsIgnoreCase("chin") || s2.equalsIgnoreCase("chin"))
1482//      Out.prln("Rule 14 " + s1 + " and " + s2);
1483    String s1_short = (String)
1484                      ((Annotation) tokensLongAnnot.get(
1485                          tokensLongAnnot.size()-1)).getFeatures().get(STRING_FEATURE);
1486//    Out.prln("Converted to " + s1_short);
1487    if (tokensLongAnnot.size()>1)
1488      return matchRule1(s1_short,
1489                      s2,
1490                      caseSensitive);
1491
1492    return false;
1493
1494  }//matchRule14
1495
1496  /**
1497    * RULE #15: does one token from a Person name appear as the other token
1498    * Note that this rule has NOT been used in LaSIE's 1.5
1499    * namematcher; added for ACE by Di's request
1500    * Applied to: organisation annotations only
1501    */
1502  public boolean matchRule15(String s1,
1503            String s2) {
1504
1505
1506    String token1 = null;
1507    String token2 = null;
1508
1509    int matched_tokens = 0;
1510
1511    // if names < 2 words then rule is invalid
1512
1513//    if (s1.equalsIgnoreCase("chin") || s2.equalsIgnoreCase("chin")) {
1514//      Out.prln("Rule 15:" );
1515//      Out.prln("with tokens " + tokensShortAnnot);
1516//    }
1517
1518    // now do the matching
1519    for (int i=0; i < tokensShortAnnot.size() && matched_tokens == 0; i++) {
1520
1521      for (int j=0; j<tokensLongAnnot.size() && matched_tokens ==0; j++)
1522//      Out.prln("i = " + i);
1523        if ( ((Annotation) tokensLongAnnot.get(j)).getFeatures(
1524                                                   ).get(STRING_FEATURE).equals(
1525             ((Annotation) tokensShortAnnot.get(i)).getFeatures(
1526                                                   ).get(STRING_FEATURE)) ) {
1527          matched_tokens++;
1528      }
1529    } // for
1530
1531    if (matched_tokens > 0)
1532      return true;
1533
1534    return false;
1535  }//matchRule15
1536
1537  /** Tables for namematch info
1538    * (used by the namematch rules)
1539    */
1540  private void buildTables(AnnotationSet nameAllAnnots) {
1541
1542    //reset the tables first
1543    cdg.clear();
1544
1545    if (! extLists) {
1546    // i.e. get cdg from Lookup annotations
1547      // get all Lookup annotations
1548      tempMap.clear();
1549      tempMap.put("majorType", "cdg");
1550      //now get all lookup annotations which are cdg
1551      AnnotationSet nameAnnots =
1552        nameAllAnnots.get(LOOKUPNAME, tempMap);
1553
1554      if ((nameAnnots ==null) || nameAnnots.isEmpty())
1555        return;
1556
1557      Iterator iter = nameAnnots.iterator();
1558      while (iter.hasNext()) {
1559         Annotation annot = (Annotation)iter.next();
1560         // get the actual string
1561         Long offsetStartAnnot = annot.getStartNode().getOffset();
1562         Long offsetEndAnnot = annot.getEndNode().getOffset();
1563         try {
1564           gate.Document doc = nameAllAnnots.getDocument();
1565           String annotString =
1566                            doc.getContent().getContent(
1567                            offsetStartAnnot,offsetEndAnnot
1568                            ).toString();
1569                cdg.add(annotString);
1570         } catch (InvalidOffsetException ioe) {
1571             ioe.printStackTrace(Err.getPrintWriter());
1572         }
1573      }// while
1574    }//if
1575  }//buildTables
1576
1577  /** substitute all multiple spaces, tabes and newlines
1578    * with a single space
1579    */
1580  public String regularExpressions ( String text, String replacement,
1581                                      String regEx) {
1582    String result = text;
1583    try {
1584      RE re = new RE(regEx);
1585      result = re.substituteAll( text,replacement);
1586    } catch (REException ree) {ree.printStackTrace();}
1587    return result;
1588  }//regularExpressions
1589
1590
  // Empty private nested class: it declares no members, and no use of it is
  // visible in this part of the file.
  // NOTE(review): presumably a leftover placeholder - confirm it is unused
  // elsewhere before removing it.
  private static class Class1 {
  }
1593} // public class OrthoMatcher
1594
1595