1   /*
2    *  OrthoMatcher.java
3    *
4    *  Copyright (c) 1998-2004, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Kalina Bontcheva, 24/August/2001
12   *
13   *  $Id: OrthoMatcher.java,v 1.48 2004/07/21 17:10:05 akshay Exp $
14   */
15  
16  
17  package gate.creole.orthomatcher;
18  
19  import java.io.*;
20  import java.net.URL;
21  import java.util.*;
22  
23  import gate.*;
24  import gate.creole.*;
25  import gate.util.*;
26  
27  import gnu.regexp.RE;
28  import gnu.regexp.REException;
29  
30  public class OrthoMatcher extends AbstractLanguageAnalyser
31                            implements ANNIEConstants{
32  
33    public static final String
34      OM_DOCUMENT_PARAMETER_NAME = "document";
35  
36    public static final String
37      OM_ANN_SET_PARAMETER_NAME = "annotationSetName";
38  
39    public static final String
40      OM_CASE_SENSITIVE_PARAMETER_NAME = "caseSensitive";
41  
42    public static final String
43      OM_ANN_TYPES_PARAMETER_NAME = "annotationTypes";
44  
45    public static final String
46      OM_ORG_TYPE_PARAMETER_NAME = "organizationType";
47  
48    public static final String
49      OM_PERSON_TYPE_PARAMETER_NAME = "personType";
50  
51    public static final String
52      OM_EXT_LISTS_PARAMETER_NAME = "extLists";
53  
54    protected static final String CDGLISTNAME = "cdg";
55    protected static final String ALIASLISTNAME = "alias";
56    protected static final String ARTLISTNAME = "def_art";
57    protected static final String PREPLISTNAME = "prepos";
58    protected static final String CONNECTORLISTNAME = "connector";
59    protected static final String SPURLISTNAME = "spur_match";
60  
61    protected static final String PUNCTUATION_VALUE = "punctuation";
62    protected static final String THE_VALUE = "The";
63  
64  
65    /**the name of the annotation set*/
66    protected String annotationSetName;
67  
68    /** the types of the annotation */
69    protected List annotationTypes = new ArrayList(10);
70  
71    /** the organization type*/
72    protected String organizationType = ORGANIZATION_ANNOTATION_TYPE;
73  
74    /** the person type*/
75    protected String personType = PERSON_ANNOTATION_TYPE;
76  
77    protected String unknownType = "Unknown";
78  
79    /** internal or external list */
80    protected boolean extLists = true;
81  
82    /** matching unknowns or not*/
83    protected boolean matchingUnknowns = true;
84  
85    /** This is an internal variable to indicate whether
86     *  we matched using a rule that requires that
87     *  the newly matched annotation matches all the others
88     *  This is needed, because organizations can share
89     *  first/last tokens like News and be different
90     */
91    private   boolean allMatchingNeeded = false;
92  
93    //** Orthomatching is not case-sensitive by default*/
94    protected boolean caseSensitive = false;
95  
96    protected FeatureMap queryFM = Factory.newFeatureMap();
97  
98  //  protected ExecutionException executionException;
99  
100   // name lookup tables (used for namematch)
101   //gave them bigger default size, coz rehash is expensive
102   protected HashMap alias = new HashMap(100);
103   protected HashSet cdg = new HashSet(50);
104   protected HashMap spur_match = new HashMap(100);
105   protected HashMap def_art = new HashMap(20);
106   protected HashMap connector = new HashMap(20);
107   protected HashMap prepos = new HashMap(30);
108 
109 
110   protected AnnotationSet nameAllAnnots = null;
111   protected HashMap processedAnnots = new HashMap(150);
112   protected HashMap annots2Remove = new HashMap(75);
113   protected List matchesDocFeature = new ArrayList();
114   //maps annotation ids to array lists of tokens
115   protected HashMap tokensMap = new HashMap(150);
116 
117   protected Annotation shortAnnot, longAnnot;
118 
119   protected ArrayList tokensLongAnnot, tokensShortAnnot;
120 
121   /** a feature map to be used when retrieving annotations
122    *  declared here so can be reused for efficiency
123    *  clear() before each use
124    */
125   protected FeatureMap tempMap = Factory.newFeatureMap();
126 
127   /** the size of the buffer */
128   private final static int BUFF_SIZE = 65000;
129 
130   /**
131    * URL to the file containing the definition for this orthomatcher
132    */
133   private java.net.URL definitionFileURL;
134 
135   /** The encoding used for the definition file and associated lists.*/
136   private String encoding;
137 
138   /** @link dependency */
139   /*#OrthoMatcher lnkOrthoMatcher;*/
140 
141   public OrthoMatcher () {
142     annotationTypes.add(organizationType);
143     annotationTypes.add(personType);
144     annotationTypes.add("Location");
145     annotationTypes.add("Date");
146   }
147 
148   /** Initialise this resource, and return it. */
149   public Resource init() throws ResourceInstantiationException {
150     //initialise the list of annotations which we will match
151     if(definitionFileURL == null){
152       throw new ResourceInstantiationException(
153                 "No URL provided for the definition file!");
154     }
155 
156     //at this point we have the definition file
157     try{
158       BufferedReader reader = new BufferedReader(
159                       new InputStreamReader(definitionFileURL.openStream(),
160                                             encoding));
161       String lineRead = null;
162       while ((lineRead = reader.readLine()) != null){
163         int index = lineRead.indexOf(":");
164         if (index != -1){
165           String nameFile = lineRead.substring(0,index);
166           String nameList = lineRead.substring(index+1,lineRead.length());
167           createAnnotList(nameFile,nameList);
168         }// if
169       }//while
170       reader.close();
171     }catch(IOException ioe){
172       throw new ResourceInstantiationException(ioe);
173     }
174 
175     return this;
176   } // init()
177 
178   /**  Run the resource. It doesn't make sense not to override
179     *  this in subclasses so the default implementation signals an
180     *  exception.
181     */
182   public void execute() throws ExecutionException{
183 
184     //check the input
185     if(document == null) {
186       throw new ExecutionException(
187         "No document for namematch!"
188       );
189     }
190 
191     // get the annotations from document
192     if ((annotationSetName == null)|| (annotationSetName.equals("")))
193       nameAllAnnots = document.getAnnotations();
194     else
195       nameAllAnnots = document.getAnnotations(annotationSetName);
196 
197     //if none found, print warning and exit
198     if ((nameAllAnnots == null) || nameAllAnnots.isEmpty()) {
199       Out.prln("OrthoMatcher Warning: No annotations found for processing");
200       return;
201     }
202 
203     //check if we've been run on this document before
204     //and clean the doc if needed
205     docCleanup();
206     Map matchesMap = (Map)document.getFeatures().
207                      get(DOCUMENT_COREF_FEATURE_NAME);
208 
209     // creates the cdg list from the document
210     //no need to create otherwise, coz already done in init()
211     if (!extLists)
212       buildTables(nameAllAnnots);
213 
214     //first match all name annotations
215     matchNameAnnotations();
216 
217     //then match the unknown ones to all name ones
218     if (matchingUnknowns)
219       matchUnknown();
220 
221     // set the matches of the document
222 //    determineMatchesDocument();
223     if (! matchesDocFeature.isEmpty()) {
224       if(matchesMap == null){
225         matchesMap = new HashMap();
226       }
227       matchesMap.put(nameAllAnnots.getName(), matchesDocFeature);
228       //we need to put it even if it was already present in order to triger
229       //the update events
230       document.getFeatures().put(DOCUMENT_COREF_FEATURE_NAME, matchesMap);
231 
232       //cannot do clear() as this has already been put on the document
233       //so I need a new one for the next run of matcher
234       matchesDocFeature = new ArrayList();
235     }
236 
237 //    Out.prln("Processed strings" + processedAnnots.values());
238     //clean-up the internal data structures for next run
239     nameAllAnnots = null;
240     processedAnnots.clear();
241     annots2Remove.clear();
242     tokensMap.clear();
243     matchesDocFeature = new ArrayList();
244     longAnnot = null;
245     shortAnnot = null;
246     tokensLongAnnot = null;
247     tokensShortAnnot = null;
248 
249   } // run()
250 
251   protected void matchNameAnnotations() throws ExecutionException{
252     // go through all the annotation types
253     Iterator iterAnnotationTypes = annotationTypes.iterator();
254     while (iterAnnotationTypes.hasNext()) {
255       String annotationType = (String)iterAnnotationTypes.next();
256 
257       AnnotationSet nameAnnots = nameAllAnnots.get(annotationType);
258 
259       // continue if no such annotations exist
260       if ((nameAnnots == null) || nameAnnots.isEmpty())
261         continue;
262 
263       Iterator iterNames = nameAnnots.iterator();
264       while (iterNames.hasNext()) {
265         Annotation nameAnnot = (Annotation) iterNames.next();
266         Integer id = nameAnnot.getId();
267 
268         // get string and value
269         String annotString = null;
270         try {
271             annotString = document.getContent().getContent(
272             nameAnnot.getStartNode().getOffset(),
273             nameAnnot.getEndNode().getOffset()
274             ).toString();
275           // now do the reg. exp. substitutions
276           annotString = regularExpressions(annotString," ", "\\s+");
277 
278         } catch (InvalidOffsetException ioe) {
279             throw new ExecutionException
280                                    ("Invalid offset of the annotation");
281         }
282         //convert to lower case if we are not doing a case sensitive match
283         if (!caseSensitive)
284           annotString = annotString.toLowerCase();
285 
286         //get the tokens
287         List tokens = new ArrayList((Set)
288                         nameAllAnnots.get(TOKEN_ANNOTATION_TYPE,
289                           nameAnnot.getStartNode().getOffset(),
290                           nameAnnot.getEndNode().getOffset()
291                         ));
292         //if no tokens to match, do nothing
293         if (tokens.isEmpty())
294           continue;
295         Collections.sort(tokens, new gate.util.OffsetComparator());
296         //check if these actually do not end after the name
297         //needed coz new tokeniser conflates
298         //strings with dashes. So British Gas-style is two tokens
299         //instead of three. So cannot match properly British Gas
300 //        tokens = checkTokens(tokens);
301         tokensMap.put(nameAnnot.getId(), tokens);
302 
303 //        Out.prln("Matching annot " + nameAnnot + ": string " + annotString);
304 
305         //first check whether we have not matched such a string already
306         //if so, just consider it matched, don't bother calling the rules
307         if (processedAnnots.containsValue(annotString)) {
308 //          Out.prln("Contained string found " + annotString);
309           updateMatches(nameAnnot, annotString);
310           processedAnnots.put(nameAnnot.getId(), annotString);
311           continue;
312         } else if (processedAnnots.isEmpty()) {
313           processedAnnots.put(nameAnnot.getId(), annotString);
314           continue;
315         }
316 
317         //if a person, then remove their title before matching
318         if (nameAnnot.getType().equals(personType))
319           annotString = containTitle(annotString, nameAnnot);
320         else if (nameAnnot.getType().equals(organizationType))
321           annotString = stripCDG(annotString, nameAnnot);
322 
323         if(null == annotString || "".equals(annotString))
324           continue;
325 
326         //otherwise try matching with previous annotations
327         matchWithPrevious(nameAnnot, annotString);
328 
329 //        Out.prln("Putting in previous " + nameAnnot + ": string " + annotString);
330         //finally add the current annotations to the processed map
331         processedAnnots.put(nameAnnot.getId(), annotString);
332       }//while through name annotations
333 
334     }//while through annotation types
335 
336   }
337 
338   protected void matchUnknown() throws ExecutionException {
339     //get all Unknown annotations
340     AnnotationSet unknownAnnots = nameAllAnnots.get(unknownType);
341 
342     if ((unknownAnnots == null) || unknownAnnots.isEmpty())
343       return;
344 
345     Iterator iter = unknownAnnots.iterator();
346     //loop through the unknown annots
347     while (iter.hasNext()) {
348       Annotation unknown = (Annotation) iter.next();
349 
350       // get string and value
351       String unknownString = null;
352       try {
353           unknownString = document.getContent().getContent(
354             unknown.getStartNode().getOffset(),
355             unknown.getEndNode().getOffset()
356             ).toString();
357         // now do the reg. exp. substitutions
358         unknownString = regularExpressions(unknownString," ", "\\s+");
359       } catch (InvalidOffsetException ioe) {
360           throw new ExecutionException
361                                  ("Invalid offset of the annotation");
362       }
363       //convert to lower case if we are not doing a case sensitive match
364       if (!caseSensitive)
365         unknownString = unknownString.toLowerCase();
366 
367       //get the tokens
368       List tokens = new ArrayList((Set)
369                       nameAllAnnots.get(TOKEN_ANNOTATION_TYPE,
370                         unknown.getStartNode().getOffset(),
371                         unknown.getEndNode().getOffset()
372                       ));
373       if (tokens.isEmpty())
374         continue;
375       Collections.sort(tokens, new gate.util.OffsetComparator());
376       tokensMap.put(unknown.getId(), tokens);
377 
378 
379       //first check whether we have not matched such a string already
380       //if so, just consider it matched, don't bother calling the rules
381       if (processedAnnots.containsValue(unknownString)) {
382         Annotation matchedAnnot = updateMatches(unknown, unknownString);
383 //        Out.prln("Matched " + unknown + "with string " + unknownString);
384 //        Out.prln("That's same as " + matchedAnnot);
385         if (matchedAnnot.getType().equals(unknownType)) {
386           annots2Remove.put(unknown.getId(),
387                             annots2Remove.get(matchedAnnot.getId()));
388         }
389         else
390           annots2Remove.put(unknown.getId(), matchedAnnot.getType());
391         processedAnnots.put(unknown.getId(), unknownString);
392         unknown.getFeatures().put("NMRule", unknownType);
393         continue;
394       }
395 
396       //check if we should do sub-string matching in case it's hyphenated
397       //for example US-led
398       if (tokens.size() == 1
399           && "hyphen".equals(unknown.getFeatures().get(TOKEN_KIND_FEATURE_NAME))) {
400         if (matchHyphenatedUnknowns(unknown, unknownString, iter))
401           continue;
402       }//if
403 
404       matchWithPrevious(unknown, unknownString);
405 
406     } //while though unknowns
407 
408     if (! annots2Remove.isEmpty()) {
409       Iterator unknownIter = annots2Remove.keySet().iterator();
410       while (unknownIter.hasNext()) {
411         Integer unknId = (Integer) unknownIter.next();
412         Annotation unknown = nameAllAnnots.get(unknId);
413         Integer newID = nameAllAnnots.add(
414           unknown.getStartNode(),
415           unknown.getEndNode(),
416           (String) annots2Remove.get(unknId),
417           unknown.getFeatures()
418         );
419         nameAllAnnots.remove(unknown);
420 
421         //change the id in the matches list
422         List mList = (List)unknown.getFeatures().
423                      get(ANNOTATION_COREF_FEATURE_NAME);
424         mList.remove(unknId);
425         mList.add(newID);
426       }//while
427     }//if
428   }
429 
430   private boolean matchHyphenatedUnknowns(Annotation unknown, String unknownString,
431                                        Iterator iter){
432     boolean matched = false;
433 
434     //only take the substring before the hyphen
435     int stringEnd = unknownString.indexOf("-");
436     unknownString = unknownString.substring(0, stringEnd);
437     //check if we've already matched this string
438     //because only exact match of the substring are considered
439     if (processedAnnots.containsValue(unknownString)) {
440       matched = true;
441       Annotation matchedAnnot = updateMatches(unknown, unknownString);
442       //only do the matching if not a person, because we do not match
443       //those on sub-strings
444       iter.remove();
445       String newType;
446       if (matchedAnnot.getType().equals(unknownType))
447         newType = (String)annots2Remove.get(matchedAnnot.getId());
448       else
449         newType = matchedAnnot.getType();
450 
451       Integer newID = new Integer(-1);
452       try {
453         newID = nameAllAnnots.add(
454           unknown.getStartNode().getOffset(),
455           new Long(unknown.getStartNode().getOffset().longValue()
456                   + stringEnd),
457           newType,
458           unknown.getFeatures()
459         );
460       } catch (InvalidOffsetException ex) {
461         throw new GateRuntimeException(ex.getMessage());
462       }
463       nameAllAnnots.remove(unknown);
464 
465       //change the id in the matches list
466       List mList = (List)unknown.getFeatures().
467                    get(ANNOTATION_COREF_FEATURE_NAME);
468       mList.remove(unknown.getId());
469       mList.add(newID);
470 
471     }
472     return matched;
473   }
474 
475   protected void matchWithPrevious(Annotation nameAnnot, String annotString) {
476     boolean matchedUnknown = false;
477 
478     Iterator prevIter = processedAnnots.keySet().iterator();
479     while (prevIter.hasNext()) {
480       Integer prevId = (Integer) prevIter.next();
481       Annotation prevAnnot = nameAllAnnots.get(prevId);
482 
483       //check if the two are from the same type or the new one is unknown
484       if (prevAnnot == null || (! prevAnnot.getType().equals(nameAnnot.getType())
485           && ! nameAnnot.getType().equals(unknownType))
486          )
487         continue;
488       //do not compare two unknown annotations either
489       //they are only matched to those of known types
490       if (  nameAnnot.getType().equals(unknownType)
491             && prevAnnot.getType().equals(unknownType))
492       continue;
493 
494       //check if we have already matched this annotation to the new one
495       if (matchedAlready(nameAnnot, prevAnnot) )
496         continue;
497 
498       //now changed to a rule, here we just match by gender
499       if (prevAnnot.getType().equals(personType)) {
500         String prevGender =
501           (String) prevAnnot.getFeatures().get(PERSON_GENDER_FEATURE_NAME);
502         String nameGender =
503           (String) nameAnnot.getFeatures().get(PERSON_GENDER_FEATURE_NAME);
504         if (   prevGender != null
505             && nameGender != null
506             && ( (nameGender.equalsIgnoreCase("female")
507                   &&
508                   prevGender.equalsIgnoreCase("male")
509                   )
510                ||
511                   (prevGender.equalsIgnoreCase("female")
512                    && nameGender.equalsIgnoreCase("male")
513                   )
514                 )
515             ) //if condition
516           continue; //we don't have a match if the two genders are different
517 
518       }//if
519 
520       //if the two annotations match
521       if (matchAnnotations(nameAnnot, annotString,  prevAnnot)) {
522 //        Out.prln("Matched " + shortName + "and " + longName);
523         updateMatches(nameAnnot, prevAnnot);
524         //if unknown annotation, we need to change to the new type
525         if (nameAnnot.getType().equals(unknownType)) {
526           matchedUnknown = true;
527           if (prevAnnot.getType().equals(unknownType))
528             annots2Remove.put(nameAnnot.getId(),
529                               annots2Remove.get(prevAnnot.getId()));
530           else
531             annots2Remove.put(nameAnnot.getId(), prevAnnot.getType());
532          //also put an attribute to indicate that
533           nameAnnot.getFeatures().put("NMRule", unknownType);
534         }//if unknown
535         break; //no need to match further
536       }//if annotations matched
537 
538     }//while through previous annotations
539 
540     if (matchedUnknown)
541       processedAnnots.put(nameAnnot.getId(), annotString);
542 
543 
544   }//matchWithPrevious
545 
546   protected boolean matchAnnotations(Annotation newAnnot, String annotString,
547                                      Annotation prevAnnot) {
548     //do not match two annotations that overlap
549     if (newAnnot.overlaps(prevAnnot))
550       return false;
551 
552     // find which annotation string of the two is longer
553     //  this is useful for some of the matching rules
554     String prevAnnotString = (String) processedAnnots.get(prevAnnot.getId());
555 
556     String longName = prevAnnotString;
557     String shortName = annotString;
558     longAnnot = prevAnnot;
559     shortAnnot = newAnnot;
560 
561     if (shortName.length()>longName.length()) {
562       String temp = longName;
563       longName = shortName;
564       shortName = temp;
565       Annotation tempAnn = longAnnot;
566       longAnnot = shortAnnot;
567       shortAnnot = tempAnn;
568     }//if
569 
570     tokensLongAnnot = (ArrayList) tokensMap.get(longAnnot.getId());
571     tokensShortAnnot = (ArrayList) tokensMap.get(shortAnnot.getId());
572 
573     List matchesList = (List) prevAnnot.getFeatures().
574                               get(ANNOTATION_COREF_FEATURE_NAME);
575     if (matchesList == null || matchesList.isEmpty())
576       return apply_rules_namematch(prevAnnot.getType(), shortName,longName);
577 
578     //if these two match, then let's see if all the other matching one will too
579     //that's needed, because sometimes names can share a token (e.g., first or
580     //last but not be the same
581     if (apply_rules_namematch(prevAnnot.getType(), shortName,longName)) {
582       /**
583        * Check whether we need to ensure that there is a match with the rest
584        * of the matching annotations, because the rule requires that
585        * transtivity is not assummed.
586        */
587       if (allMatchingNeeded) {
588         allMatchingNeeded = false;
589 
590         List toMatchList = new ArrayList(matchesList);
591   //      if (newAnnot.getType().equals(unknownType))
592   //        Out.prln("Matching new " + annotString + " with annots " + toMatchList);
593         toMatchList.remove(prevAnnot.getId());
594 
595         return matchOtherAnnots(toMatchList, newAnnot, annotString);
596       } else
597         return true;
598     }
599     return false;
600   }
601 
602   /** This method checkes whether the new annotation matches
603    *  all annotations given in the toMatchList (it contains ids)
604    *  The idea is that the new annotation needs to match all those,
605    *  because assuming transitivity does not always work, when
606    *  two different entities share a common token: e.g., BT Cellnet
607    *  and BT and British Telecom.
608   */
609   protected boolean matchOtherAnnots( List toMatchList, Annotation newAnnot,
610                                       String annotString) {
611 
612     //if the list is empty, then we're matching all right :-)
613     if (toMatchList.isEmpty())
614       return true;
615 
616     boolean matchedAll = true;
617     int i = 0;
618 
619     while (matchedAll && i < toMatchList.size()) {
620       Annotation prevAnnot = nameAllAnnots.get((Integer) toMatchList.get(i));
621 
622       // find which annotation string of the two is longer
623       //  this is useful for some of the matching rules
624       String prevAnnotString = (String) processedAnnots.get(prevAnnot.getId());
625       if (prevAnnotString == null)
626         try {
627           prevAnnotString = document.getContent().getContent(
628             prevAnnot.getStartNode().getOffset(),
629             prevAnnot.getEndNode().getOffset()
630             ).toString();
631         } catch (InvalidOffsetException ioe) {
632           return false;
633         }//try
634 
635 
636       String longName = prevAnnotString;
637       String shortName = annotString;
638       longAnnot = prevAnnot;
639       shortAnnot = newAnnot;
640 
641       if (shortName.length()>=longName.length()) {
642         String temp = longName;
643         longName = shortName;
644         shortName = temp;
645         Annotation tempAnn = longAnnot;
646         longAnnot = shortAnnot;
647         shortAnnot = tempAnn;
648       }//if
649 
650       tokensLongAnnot = (ArrayList) tokensMap.get(longAnnot.getId());
651       tokensShortAnnot = (ArrayList) tokensMap.get(shortAnnot.getId());
652 
653       matchedAll = apply_rules_namematch(prevAnnot.getType(), shortName,longName);
654 //      if (newAnnot.getType().equals(unknownType))
655 //        Out.prln("Loop: " + shortName + " and " + longName + ": result: " + matchedAll);
656 
657       i++;
658     }//while
659     return matchedAll;
660   }
661 
662 
663   protected boolean matchedAlready(Annotation annot1, Annotation annot2) {
664     //the two annotations are already matched if the matches list of the first
665     //contains the id of the second
666     List matchesList = (List) annot1.getFeatures().
667                        get(ANNOTATION_COREF_FEATURE_NAME);
668     if ((matchesList == null) || matchesList.isEmpty())
669       return false;
670     else if (matchesList.contains(annot2.getId()))
671       return true;
672     return false;
673   }
674 
675   protected Annotation updateMatches(Annotation newAnnot, String annotString) {
676     Annotation matchedAnnot = null;
677     Integer id;
678 
679     //first find a processed annotation with the same string
680     Iterator iter = processedAnnots.keySet().iterator();
681     while (iter.hasNext()) {
682       id = (Integer) iter.next();
683       String oldString = (String) processedAnnots.get(id);
684       if (annotString.equals(oldString)) {
685         matchedAnnot = nameAllAnnots.get(id);
686         break;
687       }//if
688     }//while
689 
690     if (matchedAnnot == null) return null;
691     //if the two matching annotations are of different type which is not
692     //unknown, do not match them
693     if (! matchedAnnot.getType().equals(newAnnot.getType())
694         && !newAnnot.getType().equals(unknownType) )
695       return matchedAnnot;
696 
697     List matchesList = (List) matchedAnnot.getFeatures().
698                        get(ANNOTATION_COREF_FEATURE_NAME);
699     if ((matchesList == null) || matchesList.isEmpty()) {
700       //no previous matches, so need to add
701       if (matchesList == null) {
702         matchesList = new ArrayList();
703         matchedAnnot.getFeatures().put(ANNOTATION_COREF_FEATURE_NAME,
704                                        matchesList);
705         matchesDocFeature.add(matchesList);
706       }//if
707       matchesList.add(matchedAnnot.getId());
708       matchesList.add(newAnnot.getId());
709     } else {
710       //just add the new annotation
711       matchesList.add(newAnnot.getId());
712     }//if
713     //add the matches list to the new annotation
714     newAnnot.getFeatures().put(ANNOTATION_COREF_FEATURE_NAME, matchesList);
715     return matchedAnnot;
716   }
717 
718   protected void updateMatches(Annotation newAnnot, Annotation prevAnnot) {
719 
720     List matchesList = (List) prevAnnot.getFeatures().
721                               get(ANNOTATION_COREF_FEATURE_NAME);
722     if ((matchesList == null) || matchesList.isEmpty()) {
723       //no previous matches, so need to add
724       if (matchesList == null) {
725         matchesList = new ArrayList();
726         prevAnnot.getFeatures().put(ANNOTATION_COREF_FEATURE_NAME, matchesList);
727         matchesDocFeature.add(matchesList);
728       }//if
729       matchesList.add(prevAnnot.getId());
730       matchesList.add(newAnnot.getId());
731     } else {
732       //just add the new annotation
733       matchesList.add(newAnnot.getId());
734     }//if
735     //add the matches list to the new annotation
736     newAnnot.getFeatures().put(ANNOTATION_COREF_FEATURE_NAME, matchesList);
737     //propagate the gender if two persons are matched
738     if (prevAnnot.getType().equals(personType)) {
739       String prevGender =
740         (String) prevAnnot.getFeatures().get(PERSON_GENDER_FEATURE_NAME);
741       String newGender =
742         (String) newAnnot.getFeatures().get(PERSON_GENDER_FEATURE_NAME);
743       boolean unknownPrevGender = isUnknownGender(prevGender);
744       boolean unknownNewGender = isUnknownGender(newGender);
745       if (unknownPrevGender && !unknownNewGender)
746         prevAnnot.getFeatures().put(PERSON_GENDER_FEATURE_NAME, newGender);
747       else if (unknownNewGender && !unknownPrevGender)
748         newAnnot.getFeatures().put(PERSON_GENDER_FEATURE_NAME, prevGender);
749     }//if
750   }
751 
752 
753   protected void docCleanup() {
754     Object matchesValue = document.getFeatures().get(DOCUMENT_COREF_FEATURE_NAME);
755     if (matchesValue != null && (matchesValue instanceof Map))
756       ((Map)matchesValue).remove(nameAllAnnots.getName());
757     else if (matchesValue != null) {
758       document.getFeatures().put(DOCUMENT_COREF_FEATURE_NAME, new HashMap());
759     }
760 
761     //get all annotations that have a matches feature
762     HashSet fNames = new HashSet();
763     fNames.add(ANNOTATION_COREF_FEATURE_NAME);
764     AnnotationSet annots =
765                   nameAllAnnots.get(null, fNames);
766 
767 //    Out.prln("Annots to cleanup" + annots);
768 
769     if (annots == null || annots.isEmpty())
770       return;
771 
772     Iterator iter = annots.iterator();
773     while (iter.hasNext()) {
774       while (iter.hasNext())
775         ((Annotation) iter.next()).getFeatures().
776                                    remove(ANNOTATION_COREF_FEATURE_NAME);
777     } //while
778   }//cleanup
779 
780   /** return a person name without title */
781   protected String containTitle (String annotString, Annotation annot)
782                       throws ExecutionException {
783     // get the offsets
784     Long startAnnot = annot.getStartNode().getOffset();
785     Long endAnnot = annot.getEndNode().getOffset();
786 
787     // determine "Lookup" annotation set
788     queryFM.clear();
789     queryFM.put("majorType", "title");
790     AnnotationSet as1 = nameAllAnnots.get(startAnnot,endAnnot);
791     if (as1 == null || as1.isEmpty())
792       return annotString;
793     AnnotationSet as =
794       as1.get("Lookup", queryFM);
795     if (as !=null && ! as.isEmpty()) {
796       List titles = new ArrayList((Set)as);
797       Collections.sort(titles, new gate.util.OffsetComparator());
798 
799       Iterator iter = titles.iterator();
800       while (iter.hasNext()) {
801         Annotation titleAnn = (Annotation)(iter.next());
802 
803         //we've not found a title at the start offset,
804         //there's no point in looking further
805         //coz titles come first
806         if (titleAnn.getStartNode().getOffset().compareTo(startAnnot) != 0)
807           return annotString;
808 
809         try {
810           // the title from the current annotation
811           String annotTitle =
812             document.getContent().getContent(
813               titleAnn.getStartNode().getOffset(),
814               titleAnn.getEndNode().getOffset()
815             ).toString();
816 
817           // eliminate the title from annotation string and return the result
818           if (annotTitle.length()<annotString.length()) {
819             //remove from the array of tokens, so then we can compare properly
820             //the remaining tokens
821 //            Out.prln("Removing title from: " + annot + " with string " + annotString);
822 //            Out.prln("Tokens are" + tokensMap.get(annot.getId()));
823 //            Out.prln("Title is" + annotTitle);
824             ((ArrayList) tokensMap.get(annot.getId())).remove(0);
825             return annotString.substring(
826                                  annotTitle.length()+1,annotString.length());
827           }
828         } catch (InvalidOffsetException ioe) {
829             throw new ExecutionException
830                                ("Invalid offset of the annotation");
831         }//try
832       }// while
833     }//if
834     return annotString;
835 
836   }
837 
838   /** return an organization  without a designator and starting The*/
839   protected String stripCDG (String annotString, Annotation annot){
840 
841     ArrayList tokens = (ArrayList) tokensMap.get(annot.getId());
842 
843     //strip starting The first
844     if ( ((String) ((Annotation) tokens.get(0)
845           ).getFeatures().get(TOKEN_STRING_FEATURE_NAME))
846           .equalsIgnoreCase(THE_VALUE))
847       tokens.remove(0);
848 
849     //no need to check for cdg if there is only 1 token or less
850     if (tokens.size()>1 && cdg.contains(((Annotation) tokens.get(tokens.size()-1)
851           ).getFeatures().get(TOKEN_STRING_FEATURE_NAME)) )
852       tokens.remove(tokens.size()-1);
853 
854     StringBuffer newString = new StringBuffer(50);
855     for (int i = 0; i < tokens.size(); i++){
856       newString.append((String) ((Annotation) tokens.get(i)
857           ).getFeatures().get(TOKEN_STRING_FEATURE_NAME) );
858       if (i != tokens.size()-1)
859         newString.append(" ");
860     }
861 //    Out.prln("Strip CDG returned: " + newString + "for string " + annotString);
862 
863     if (caseSensitive)
864       return newString.toString();
865 
866     return newString.toString().toLowerCase();
867   }
868 
869 /*
870   public void check() throws ExecutionException {
871     if (executionException != null) {
872       ExecutionException e = executionException;
873       executionException = null;
874       throw e;
875     }
876   } // check()
877 */
878 
879   /** if ( == false) then reads the names of files in order
880     *  to create the lookup tables
881     */
882 //  protected void createLists() throws IOException {
883 //
884 //    InputStream inputStream = Files.getGateResourceAsStream(
885 //                                              "creole/namematcher/listsNM.def");
886 //    InputStreamReader inputStreamReader = new InputStreamReader (
887 //                                                    inputStream);
888 //    BufferedReader bufferedReader = new BufferedReader(inputStreamReader);
889 //
890 //    String lineRead = null;
891 //    while ((lineRead = bufferedReader.readLine()) != null){
892 //      int index = lineRead.indexOf(":");
893 //      if (index != -1){
894 //        String nameFile = lineRead.substring(0,index);
895 //        String nameList = lineRead.substring(index+1,lineRead.length());
896 //        createAnnotList(nameFile,nameList);
897 //      }// if
898 //    }//while
899 //    bufferedReader.close();
900 //    inputStreamReader.close();
901 //    inputStream.close();
902 //  }// createLists()
903 
904   /** creates the lookup tables */
905   protected void createAnnotList(String nameFile,String nameList)
906                                                           throws IOException{
907 
908 //    InputStream inputStream = Files.getGateResourceAsStream(
909 //                                              "creole/namematcher/"+nameFile);
910 //    InputStreamReader inputStreamReader = new InputStreamReader (
911 //                                                    inputStream);
912 //    BufferedReader bufferedReader = new BufferedReader(inputStreamReader);
913 
914     //create the relative URL
915     URL fileURL = new URL(definitionFileURL, nameFile);
916     BufferedReader bufferedReader =
917       new BufferedReader(new InputStreamReader(fileURL.openStream(),
918                          encoding));
919 
920     String lineRead = null;
921     while ((lineRead = bufferedReader.readLine()) != null){
922       if (nameList.compareTo(CDGLISTNAME)==0){
923         if (caseSensitive)
924           cdg.add(lineRead);
925         else
926           cdg.add(lineRead.toLowerCase());
927       }// if
928       else {
929         int index = lineRead.indexOf("£");
930         if (index != -1){
931           String  expr = lineRead.substring(0,index);
932           //if not case-sensitive, we need to downcase all strings
933           if (!caseSensitive)
934             expr = expr.toLowerCase();
935           String code = lineRead.substring(index+1,lineRead.length());
936           if (nameList.equals(ALIASLISTNAME))
937                             alias.put(expr, code);
938           else
939           if (nameList.equals(ARTLISTNAME))
940                             def_art.put(expr, code);
941           else
942           if (nameList.equals(PREPLISTNAME))
943                             prepos.put(expr, code);
944           else
945           if (nameList.equals(CONNECTORLISTNAME))
946                             connector.put(expr, code);
947           else
948           if (nameList.equals(SPURLISTNAME))
949                             spur_match.put(expr, code);
950 
951         }//if
952       }// else
953 
954     }//while
955   }//createAnnotList
956 
957 
958   /** apply_rules_namematch: apply rules similarly to lasie1.5's namematch */
959   private boolean apply_rules_namematch(String annotationType, String shortName,
960                                         String longName) {
961     // first apply rule for spurius matches i.e. rule0
962     if (matchRule0(longName, shortName))
963       return false;
964     if (
965          (// rules for all annotations
966           //no longer use rule1, coz I do the check for same string via the
967           //hash table
968             matchRule2(longName, shortName)
969          ||
970             matchRule3(longName, shortName)
971          ) // rules for all annotations
972          ||
973          (// rules for organisation annotations
974              ( annotationType.equals(organizationType)
975                //ACE addition
976                || annotationType.equals("Facility"))
977              &&
978              (    matchRule4(longName, shortName)
979                ||
980                   matchRule5(longName, shortName)
981                ||
982                   matchRule6(longName, shortName)
983                ||
984                   matchRule7(longName, shortName)
985                ||
986 //                  matchRule8(longName, shortName)
987 //               ||
988                   matchRule9(longName, shortName)
989                ||
990                   matchRule10(longName, shortName)
991                ||
992                   matchRule11(longName, shortName)
993                ||
994                   matchRule12(longName, shortName)
995                ||
996                   matchRule13(shortName, longName)
997               )
998            )// rules for organisation annotations
999          ||
1000         (// rules for person annotations
1001             (    annotationType.equals(personType))
1002               &&
1003             (    matchRule4(longName, shortName)
1004               ||
1005                  matchRule5(longName, shortName)
1006               ||
1007                  matchRule14(longName, shortName)
1008               || //kalina: added this, so it matches names when contain more
1009                  //than one first and one last name
1010                  matchRule15(longName, shortName)
1011              )
1012          )// rules for person annotations
1013         ) //if
1014      return true;
1015    return false;
1016  }//apply_rules
1017
1018
1019  /** set the extLists flag */
1020  public void setExtLists(Boolean newExtLists) {
1021    extLists = newExtLists.booleanValue();
1022  }//setextLists
1023
1024  /** set the caseSensitive flag */
1025  public void setCaseSensitive(Boolean newCase) {
1026    caseSensitive = newCase.booleanValue();
1027  }//setextLists
1028
1029  /** set the annotation set name*/
1030  public void setAnnotationSetName(String newAnnotationSetName) {
1031    annotationSetName = newAnnotationSetName;
1032  }//setAnnotationSetName
1033
1034  /** set the types of the annotations*/
1035  public void setAnnotationTypes(List newType) {
1036    annotationTypes = newType;
1037  }//setAnnotationTypes
1038
1039  /** set whether to process the Unknown annotations*/
1040  public void setProcessUnknown(Boolean processOrNot) {
1041    this.matchingUnknowns = processOrNot.booleanValue();
1042  }//setAnnotationTypes
1043
1044  public void setOrganizationType(String newOrganizationType) {
1045    organizationType = newOrganizationType;
1046  }//setOrganizationType
1047
1048  public void setPersonType(String newPersonType) {
1049    personType = newPersonType;
1050  }//setPersonType
1051
1052  /**get the name of the annotation set*/
1053  public String getAnnotationSetName() {
1054    return annotationSetName;
1055  }//getAnnotationSetName
1056
1057  /** get the types of the annotation*/
1058  public List getAnnotationTypes() {
1059    return annotationTypes;
1060  }//getAnnotationTypes
1061
1062  public String getOrganizationType() {
1063    return organizationType;
1064  }
1065
1066  public String getPersonType() {
1067    return personType;
1068  }
1069
1070  public Boolean getExtLists() {
1071    return new Boolean(extLists);
1072  }
1073
1074  /** Are we running in a case-sensitive mode?*/
1075  public Boolean getCaseSensitive() {
1076    return new Boolean(caseSensitive);
1077  }
1078
1079  /** Return whether or not we're processing the Unknown annots*/
1080  public Boolean getProcessUnknown() {
1081    return new Boolean(matchingUnknowns);
1082  }
1083
1084/*
1085  public List getMatchesDocument() {
1086    return matchesDocument;
1087  }
1088*/
1089
1090  protected boolean isUnknownGender(String gender) {
1091    if (gender == null)
1092      return true;
1093    if (gender.equalsIgnoreCase("male") || gender.equalsIgnoreCase("female"))
1094      return false;
1095    return true;
1096
1097  } //isUnknownGender
1098
1099  /** RULE #0: If the two names are listed in table of
1100    * spurius matches then they do NOT match
1101    * Condition(s): -
1102    * Applied to: all name annotations
1103    */
1104  public boolean matchRule0(String s1,
1105           String s2) {
1106    if (spur_match.containsKey(s1)
1107        && spur_match.containsKey(s2) )
1108      return
1109        spur_match.get(s1).toString().equals(spur_match.get(s2).toString());
1110
1111    return false;
1112  }//matchRule0
1113
1114  /** RULE #1: If the two names are identical then they are the same
1115    * no longer used, because I do the check for same string via the
1116    * hash table of previous annotations
1117    * Condition(s): depend on case
1118    * Applied to: all name annotations
1119    */
1120  public boolean matchRule1(String s1,
1121           String s2,
1122           boolean matchCase) {
1123//    Out.prln("Rule1: Matching " + s1 + "and " + s2);
1124
1125    boolean matched = false;
1126    if (!matchCase)
1127        matched = s1.equalsIgnoreCase(s2);
1128    else matched =  s1.equals(s2) ;
1129//kalina: do not remove, nice for debug
1130//    if (matched && (s2.startsWith("Kenneth") || s1.startsWith("Kenneth")))
1131//        Out.prln("Rule1: Matched " + s1 + "and " + s2);
1132    return matched;
1133  }//matchRule1
1134
1135
1136  /**
1137    * RULE #2: if the two names are listed as equivalent in the
1138    * lookup table (alias) then they match
1139    * Condition(s): -
1140    * Applied to: all name annotations
1141    */
1142  public boolean matchRule2(String s1,
1143           String s2) {
1144
1145    if (alias.containsKey(s1) && alias.containsKey(s2))
1146      return (alias.get(s1).toString().equals(alias.get(s2).toString()));
1147
1148    return false;
1149  }//matchRule2
1150
1151  /**
1152    * RULE #3: adding a possessive at the end
1153    * of one name causes a match
1154    * e.g. "Standard and Poor" == "Standard and Poor's"
1155    * and also "Standard and Poor" == "Standard's"
1156    * Condition(s): case-insensitive match
1157    * Applied to: all name annotations
1158    */
1159  public boolean matchRule3(String s1, //long string
1160                             String s2) { //short string
1161
1162    if (s2.endsWith("'s") || s2.endsWith("'")
1163        ||(s1.endsWith("'s")|| s1.endsWith("'"))) {
1164
1165
1166      String s2_poss = null;
1167
1168      if (!s2.endsWith("'s")) s2_poss = s2.concat("'s");
1169      else s2_poss = s2.concat("'");
1170
1171      if (s2_poss != null && matchRule1(s1, s2_poss,caseSensitive)) return true;
1172
1173      // now check the second case i.e. "Standard and Poor" == "Standard's"
1174      String token = (String)
1175        ((Annotation) tokensLongAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1176
1177      if (!token.endsWith("'s")) s2_poss = token.concat("'s");
1178      else s2_poss = token.concat("'");
1179
1180      if (s2_poss != null && matchRule1(s2_poss,s2,caseSensitive)) return true;
1181
1182    } // if (s2.endsWith("'s")
1183    return false;
1184  }//matchRule3
1185
1186  /**
1187    * RULE #4: Do all tokens other than the punctuation marks
1188    * , and . match?
1189    * e.g. "Smith, Jones" == "Smith Jones"
1190    * Condition(s): case-insensitive match
1191    * Applied to: organisation and person annotations
1192    */
1193  public boolean matchRule4(String s1,
1194           String s2) {
1195
1196    boolean allTokensMatch = true;
1197
1198    Iterator tokensLongAnnotIter = tokensLongAnnot.iterator();
1199    Iterator tokensShortAnnotIter = tokensShortAnnot.iterator();
1200    while (tokensLongAnnotIter.hasNext() && tokensShortAnnotIter.hasNext()) {
1201      Annotation token = (Annotation) tokensLongAnnotIter.next();
1202      if (((String)token.getFeatures().get(TOKEN_KIND_FEATURE_NAME)).equals(PUNCTUATION_VALUE))
1203        continue;
1204//      Out.prln("Matching" + tokensLongAnnot + " with " + tokensShortAnnot);
1205      if (! token.getFeatures().get(TOKEN_STRING_FEATURE_NAME).equals(
1206             ((Annotation) tokensShortAnnotIter.next()).getFeatures().get(TOKEN_STRING_FEATURE_NAME))) {
1207        allTokensMatch = false;
1208        break;
1209      } // if (!tokensLongAnnot.nextToken()
1210    } // while
1211//    if (allTokensMatch)
1212//      Out.prln("rule4 fired. result is: " + allTokensMatch);
1213    return allTokensMatch;
1214  }//matchRule4
1215
1216  /**
1217    * RULE #5: if the 1st token of one name
1218    * matches the second name
1219    * e.g. "Pepsi Cola" == "Pepsi"
1220    * Condition(s): case-insensitive match
1221    * Applied to: all name annotations
1222    */
1223  public boolean matchRule5(String s1,
1224           String s2) {
1225
1226    //do not match numbers by this rule
1227    if (tokensLongAnnot.size()> 1 &&
1228        ((Annotation) tokensLongAnnot.get(0)).getFeatures().get("kind").equals("number"))
1229      return false;
1230
1231//    if (s1.startsWith("Patrick") || s2.startsWith("Patrick")) {
1232//      Out.prln("Rule 5: " + s1 + "and " + s2);
1233//    }
1234
1235    //require that when matching person names, the shorter one to be of length 1
1236    //for the rule to apply. In other words, avoid matching Peter Smith and
1237    //Peter Kline, because they share a Peter token.
1238    if ( (shortAnnot.getType().equals(personType)
1239         || longAnnot.getType().equals(personType)
1240         )
1241       &&
1242         tokensShortAnnot.size()>1
1243       )
1244       return false;
1245
1246    if (tokensLongAnnot.size()<=1)
1247      return false;
1248    boolean result = matchRule1((String)
1249                      ((Annotation) tokensLongAnnot.get(0)
1250                        ).getFeatures().get(TOKEN_STRING_FEATURE_NAME),
1251                      s2,
1252                      caseSensitive);
1253
1254//    if (s1.startsWith("Patrick") || s2.startsWith("Patrick"))
1255//      Out.prln("rule 5 result: " + result);
1256    return result;
1257
1258  }//matchRule5
1259
1260  /**
1261    * RULE #6: if one name is the acronym of the other
1262    * e.g. "Imperial Chemical Industries" == "ICI"
1263    * Applied to: organisation annotations only
1264    */
1265  public boolean matchRule6(String s1,
1266           String s2) {
1267
1268    int i = 0;
1269
1270    //check and if the shorted string has a space in it, then it's not
1271    //an acronym
1272    if (s2.indexOf(" ") > 0)
1273      return false;
1274
1275    //Out.prln("Acronym: Matching " + s1 + "and " + s2);
1276    StringBuffer acronym_s1 = new StringBuffer("");
1277    StringBuffer acronymDot_s1 = new StringBuffer("");
1278
1279    for ( ;i < tokensLongAnnot.size(); i++ ) {
1280      String toAppend = ( (String) ((Annotation) tokensLongAnnot.get(i)
1281                         ).getFeatures().get(TOKEN_STRING_FEATURE_NAME)).substring(0,1);
1282      acronym_s1.append(toAppend);
1283      acronymDot_s1.append(toAppend);
1284      acronymDot_s1.append(".");
1285    }
1286
1287    //Out.prln("Acronym dot: To Match " + acronymDot_s1 + "and " + s2);
1288    //Out.prln("Result: " + matchRule1(acronymDot_s1.toString(),s2,caseSensitive));
1289
1290    if (matchRule1(acronym_s1.toString(),s2,caseSensitive) ||
1291        matchRule1(acronymDot_s1.toString(),s2,caseSensitive) )
1292      return true;
1293
1294    return false;
1295  }//matchRule6
1296
1297  /**
1298    * RULE #7: if one of the tokens in one of the
1299    * names is in the list of separators eg. "&"
1300    * then check if the token before the separator
1301    * matches the other name
1302    * e.g. "R.H. Macy & Co." == "Macy"
1303    * Condition(s): case-sensitive match
1304    * Applied to: organisation annotations only
1305    */
1306  public boolean matchRule7(String s1,
1307           String s2) {
1308
1309    //don't try it unless the second string is just one token
1310    if (tokensShortAnnot.size() != 1)
1311      return false;
1312
1313    String previous_token = null;
1314
1315    for (int i = 0;  i < tokensLongAnnot.size(); i++ ) {
1316      if (connector.containsKey( ((Annotation) tokensLongAnnot.get(i)
1317          ).getFeatures().get(TOKEN_STRING_FEATURE_NAME) )) {
1318        previous_token = (String) ((Annotation) tokensLongAnnot.get(i-1)
1319                                    ).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1320
1321        break;
1322      }
1323    }
1324
1325    //now match previous_token with other name
1326    if (previous_token != null) {
1327//      if (s1.equalsIgnoreCase("chin") || s2.equalsIgnoreCase("chin"))
1328//        Out.prln("Rule7");
1329      return matchRule1(previous_token,s2,caseSensitive);
1330
1331    }
1332    return false;
1333  }//matchRule7
1334
  /**
   * This rule is now obsolete, as The and the trailing CDG
   * are stripped before matching.
   * DO NOT CALL!!!
   *
    * RULE #8: if the names match, ignoring The and
    * and trailing company designator (which have already been stripped)
    * e.g. "The Magic Tricks Co." == "Magic Tricks"
    * Condition(s): case-sensitive match
    * Applied to: organisation annotations only
    *
    * The method is only kept as a stub: it prints a warning and never
    * reports a match.
    *
    * @param s1 ignored
    * @param s2 ignored
    * @return always false
    */
  public boolean matchRule8(String s1,
           String s2) {
    Out.prln("OrthoMatcher warning: This rule has been discontinued!");
    // The retired implementation removed a leading "The " from both names,
    // dropped a last token found in the cdg list, refused the match when
    // both names carried different designators, and finally compared what
    // was left with matchRule1.
    return false;

  }//matchRule8
1393
1394  /**
1395    * RULE #9: does one of the names match the token
1396    * just before a trailing company designator
1397    * in the other name?
1398    * The company designator has already been chopped off,
1399    * so the token before it, is in fact the last token
1400    * e.g. "R.H. Macy Co." == "Macy"
1401    * Applied to: organisation annotations only
1402    */
1403  public boolean matchRule9(String s1,
1404           String s2) {
1405
1406//    if (s1.equalsIgnoreCase("news") || s2.equalsIgnoreCase("news"))
1407//      Out.prln("Rule 9 " + s1 + " and " + s2);
1408    String s1_short = (String)
1409                      ((Annotation) tokensLongAnnot.get(
1410                          tokensLongAnnot.size()-1)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1411//    Out.prln("Converted to " + s1_short);
1412    if (tokensLongAnnot.size()>1) {
1413      boolean matched = matchRule1(s1_short, s2, caseSensitive);
1414      //we need to make sure all names match, instead of assuming transitivity,
1415      //to avoid matching BBC News with News then News with ITV News, which
1416      //by transitivity leads to BBC News matching ITV News which is not what
1417      //we want
1418      if (matched)
1419        allMatchingNeeded = true;
1420      return matched;
1421    } //if
1422
1423    return false;
1424  }//matchRule9
1425
1426  /**
1427    * RULE #10: is one name the reverse of the other
1428    * reversing around prepositions only?
1429    * e.g. "Department of Defence" == "Defence Department"
1430    * Condition(s): case-sensitive match
1431    * Applied to: organisation annotations only
1432    */
1433  public boolean matchRule10(String s1,
1434            String s2) {
1435
1436    String token = null;
1437    String previous_token = null;
1438    String next_token = null;
1439    boolean invoke_rule=false;
1440
1441    if (tokensLongAnnot.size() >= 3
1442        && tokensShortAnnot.size() >= 2) {
1443
1444      // first get the tokens before and after the preposition
1445      int i = 0;
1446      for (; i< tokensLongAnnot.size(); i++) {
1447        token = (String)
1448                  ((Annotation) tokensLongAnnot.get(i)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1449        if (prepos.containsKey(token)) {
1450          invoke_rule=true;
1451          break;
1452        }//if
1453        previous_token = token;
1454      }//while
1455
1456      if (! invoke_rule)
1457        return false;
1458
1459      if (i < tokensLongAnnot.size()
1460          && previous_token != null)
1461        next_token= (String)
1462                    ((Annotation) tokensLongAnnot.get(i++)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1463      else return false;
1464
1465      String s21 = (String)
1466                    ((Annotation) tokensShortAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1467      String s22 = (String)
1468                    ((Annotation) tokensShortAnnot.get(1)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1469      // then compare (in reverse) with the first two tokens of s2
1470      if (matchRule1(next_token,(String) s21,caseSensitive)
1471          && matchRule1(previous_token, s22,caseSensitive))
1472        return true ;
1473    }//if (tokensLongAnnot.countTokens() >= 3
1474    return false;
1475  }//matchRule10
1476
1477  /**
1478    * RULE #11: does one name consist of contractions
1479    * of the first two tokens of the other name?
1480    * e.g. "Communications Satellite" == "ComSat"
1481    * and "Pan American" == "Pan Am"
1482    * Condition(s): case-sensitive match
1483    * Applied to: organisation annotations only
1484    */
1485  public boolean matchRule11(String s1,
1486            String s2) {
1487
1488
1489    // first do the easy case e.g. "Pan American" == "Pan Am"
1490
1491    String token11 = null;
1492    String token12 = null;
1493    String token21 = null;
1494    String token22 = null;
1495
1496    if (tokensLongAnnot.size() < 2)
1497      return false;
1498
1499    // 1st get the first two tokens of s1
1500    token11 = (String)
1501                ((Annotation) tokensLongAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1502    token12 = (String)
1503                ((Annotation) tokensLongAnnot.get(1)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1504
1505    // now check for the first case i.e. "Pan American" == "Pan Am"
1506    if (tokensShortAnnot.size() == 2)  {
1507
1508      token21 = (String)
1509                  ((Annotation) tokensShortAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1510      token22 = (String)
1511                  ((Annotation) tokensShortAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1512
1513      if (token11.startsWith(token21)
1514          && token12.startsWith(token22))
1515        return true;
1516
1517    } // if (tokensShortAnnot.countTokens() == 2)
1518
1519    // now the second case e.g.  "Communications Satellite" == "ComSat"
1520    else if (tokensShortAnnot.size()==1 && s2.length()>=3) {
1521
1522      // split the token into possible contractions
1523      // ignore case for matching
1524      for (int i=2;i<s2.length();i++) {
1525        token21=s2.substring(0,i+1);
1526        token22=s2.substring(i+1);
1527
1528        if (token11.startsWith(token21)
1529            && token12.startsWith(token22))
1530          return true;
1531      }// for
1532    } // else if
1533
1534    return false;
1535  }//matchRule11
1536
1537  /**
1538    * RULE #12: do the first and last tokens of one name
1539    * match the first and last tokens of the other?
1540    * Condition(s): case-sensitive match
1541    * Applied to: organisation annotations only
1542    */
1543  public boolean matchRule12(String s1,
1544            String s2) {
1545
1546    // first do the easy case e.g. "Pan American" == "Pan Am"
1547
1548    if (tokensLongAnnot.size()>1 && tokensShortAnnot.size()>1) {
1549//     Out.prln("Rule 12");
1550
1551      // get first and last tokens of s1 & s2
1552      String s1_first = (String)
1553                     ((Annotation) tokensLongAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1554      String s2_first = (String)
1555                     ((Annotation) tokensShortAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1556
1557      if (!matchRule1(s1_first,s2_first,caseSensitive))
1558        return false;
1559
1560      String s1_last = (String)
1561         ((Annotation) tokensLongAnnot.get(tokensLongAnnot.size()-1)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1562      String s2_last = (String)
1563         ((Annotation) tokensShortAnnot.get(tokensShortAnnot.size()-1)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1564
1565      return matchRule1(s1_last,s2_last,caseSensitive);
1566    } // if (tokensLongAnnot.countTokens()>1
1567    return false;
1568  }//matchRule12
1569
1570  /**
1571    * RULE #13: do multi-word names match except for
1572    * one token e.g.
1573    * "Second Force Recon Company" == "Force Recon Company"
1574    * Note that this rule has NOT been used in LaSIE's 1.5
1575    * namematcher
1576    * Restrictions: - remove cdg first
1577    *               - shortest name should be 2 words or more
1578    *               - if N is the number of tokens of the longest
1579    *                 name, then N-1 tokens should be matched
1580    * Condition(s): case-sensitive match
1581    * Applied to: organisation or person annotations only
1582    */
1583  public boolean matchRule13(String s1,
1584            String s2) {
1585
1586
1587    String token1 = null;
1588    String token2 = null;
1589
1590    int matched_tokens = 0, mismatches = 0;;
1591
1592    // if names < 2 words then rule is invalid
1593    if (tokensLongAnnot.size() < 3 || tokensShortAnnot.size() < 2) return false;
1594
1595//    if (s1.equalsIgnoreCase("chin") || s2.equalsIgnoreCase("chin")) {
1596//      Out.prln("Rule 13: Matching tokens" + tokensLongAnnot);
1597//      Out.prln("with tokens " + tokensShortAnnot);
1598//    }
1599
1600    // now do the matching
1601    for (int i=0,j= 0; i < tokensShortAnnot.size() && mismatches < 2; i++) {
1602
1603//      Out.prln("i = " + i);
1604//      Out.prln("j = " + j);
1605      if ( ((Annotation) tokensLongAnnot.get(j)).getFeatures().get(TOKEN_STRING_FEATURE_NAME).equals(
1606           ((Annotation) tokensShortAnnot.get(i)).getFeatures().get(TOKEN_STRING_FEATURE_NAME)) ) {
1607        matched_tokens++;
1608        j++;
1609      } else
1610        mismatches++;
1611    } // for
1612
1613    if (matched_tokens >= tokensLongAnnot.size()-1)
1614      return true;
1615
1616    return false;
1617  }//matchRule13
1618
1619  /**
1620    * RULE #14: if the last token of one name
1621    * matches the second name
1622    * e.g. "Hamish Cunningham" == "Cunningham"
1623    * Condition(s): case-insensitive match
1624    * Applied to: all person annotations
1625    */
1626  public boolean matchRule14(String s1,
1627           String s2) {
1628
1629//    if (s1.equalsIgnoreCase("chin") || s2.equalsIgnoreCase("chin"))
1630//      Out.prln("Rule 14 " + s1 + " and " + s2);
1631    String s1_short = (String)
1632                      ((Annotation) tokensLongAnnot.get(
1633                          tokensLongAnnot.size()-1)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1634//    Out.prln("Converted to " + s1_short);
1635    if (tokensLongAnnot.size()>1)
1636      return matchRule1(s1_short,
1637                      s2,
1638                      caseSensitive);
1639
1640    return false;
1641
1642  }//matchRule14
1643
1644  /**
1645    * RULE #15: does one token from a Person name appear as the other token
1646    * Note that this rule has NOT been used in LaSIE's 1.5
1647    * namematcher; added for ACE by Di's request
1648    */
1649  public boolean matchRule15(String s1,
1650            String s2) {
1651
1652    int matched_tokens = 0;
1653
1654    // if names < 2 words then rule is invalid
1655
1656//    if (s1.equalsIgnoreCase("chin") || s2.equalsIgnoreCase("chin")) {
1657//      Out.prln("Rule 15:" );
1658//      Out.prln("with tokens " + tokensShortAnnot);
1659//    }
1660
1661    // now do the matching
1662    Annotation token1, token2;
1663    for (int i=0; i < tokensShortAnnot.size() && matched_tokens == 0; i++) {
1664      token1 = (Annotation) tokensShortAnnot.get(i);
1665      //first check if not punctuation, because we need to skip it
1666      if (token1.getFeatures().get(TOKEN_KIND_FEATURE_NAME).equals(PUNCTUATION_VALUE))
1667        continue;
1668
1669      for (int j=0; j<tokensLongAnnot.size() && matched_tokens ==0; j++) {
1670//      Out.prln("i = " + i);
1671        token2 = (Annotation) tokensLongAnnot.get(j);
1672        if (token2.getFeatures().get(TOKEN_KIND_FEATURE_NAME).equals(PUNCTUATION_VALUE))
1673          continue;
1674        if ( token1.getFeatures().get(TOKEN_STRING_FEATURE_NAME).equals(
1675             token2.getFeatures().get(TOKEN_STRING_FEATURE_NAME)) )
1676          matched_tokens++;
1677      }//for
1678    } // for
1679
1680    //19 February 2002: kalina
1681    //was originally > 0 (i.e., any match is good)
1682    //ensure that we've matched all the tokens in the short annotation
1683    //the reason for that is, because otherwise we match
1684    //Patrick Viera and Patrick Somebody - not good!
1685    if (matched_tokens == tokensShortAnnot.size())
1686      return true;
1687
1688    return false;
1689  }//matchRule15
1690
1691
1692  /** Tables for namematch info
1693    * (used by the namematch rules)
1694    */
1695  private void buildTables(AnnotationSet nameAllAnnots) {
1696
1697    //reset the tables first
1698    cdg.clear();
1699
1700    if (! extLists) {
1701    // i.e. get cdg from Lookup annotations
1702      // get all Lookup annotations
1703      tempMap.clear();
1704      tempMap.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME, "cdg");
1705      //now get all lookup annotations which are cdg
1706      AnnotationSet nameAnnots =
1707        nameAllAnnots.get(LOOKUP_ANNOTATION_TYPE, tempMap);
1708
1709      if ((nameAnnots ==null) || nameAnnots.isEmpty())
1710        return;
1711
1712      Iterator iter = nameAnnots.iterator();
1713      while (iter.hasNext()) {
1714         Annotation annot = (Annotation)iter.next();
1715         // get the actual string
1716         Long offsetStartAnnot = annot.getStartNode().getOffset();
1717         Long offsetEndAnnot = annot.getEndNode().getOffset();
1718         try {
1719           gate.Document doc = nameAllAnnots.getDocument();
1720           String annotString =
1721                            doc.getContent().getContent(
1722                            offsetStartAnnot,offsetEndAnnot
1723                            ).toString();
1724                cdg.add(annotString);
1725         } catch (InvalidOffsetException ioe) {
1726             ioe.printStackTrace(Err.getPrintWriter());
1727         }
1728      }// while
1729    }//if
1730  }//buildTables
1731
1732  /** substitute all multiple spaces, tabes and newlines
1733    * with a single space
1734    */
1735  public String regularExpressions ( String text, String replacement,
1736                                      String regEx) {
1737    String result = text;
1738    try {
1739      RE re = new RE(regEx);
1740      result = re.substituteAll( text,replacement);
1741    } catch (REException ree) {ree.printStackTrace();}
1742    return result;
1743  }
1744
1745  public void setDefinitionFileURL(java.net.URL definitionFileURL) {
1746    this.definitionFileURL = definitionFileURL;
1747  }
1748
1749  public java.net.URL getDefinitionFileURL() {
1750    return definitionFileURL;
1751  }
1752  public void setEncoding(String encoding) {
1753    this.encoding = encoding;
1754  }
1755  public String getEncoding() {
1756    return encoding;
1757  }//regularExpressions
1758
1759
1760  private static class Class1 {
1761  }
1762} // public class OrthoMatcher
1763
1764