1   /*
2    * DefaultGazeteer.java
3    *
4    * Copyright (c) 2000-2001, The University of Sheffield.
5    *
6    * This file is part of GATE (see http://gate.ac.uk/), and is free
7    * software, licenced under the GNU Library General Public License,
8    * Version 2, June1991.
9    *
10   * A copy of this licence is included in the distribution in the file
11   * licence.html, and is also available at http://gate.ac.uk/gate/licence.html.
12   *
13   * Valentin Tablan, 03/07/2000
14   * borislav popov 24/03/2002
15   *
16   * $Id: DefaultGazetteer.java,v 1.42 2002/06/26 14:23:22 nasso Exp $
17   */
18  package gate.creole.gazetteer;
19  
20  import java.io.*;
21  import java.util.*;
22  import java.net.*;
23  
24  import gate.util.*;
25  import gate.creole.*;
26  import gate.event.*;
27  import gate.*;
28  
/** This component is responsible for doing list lookup. The implementation is
 * based on finite state machines.
 * The phrases to be recognised should be listed in a set of files, one for
 * each type of occurrence.
 * The gazetteer is built with the information from a file that contains the
 * set of lists (which are files as well) and the associated type for each list.
 * The file defining the set of lists should have the following syntax:
 * each list definition should be written on its own line and should contain:
 * <ol>
 * <li>the file name (required) </li>
 * <li>the major type (required) </li>
 * <li>the minor type (optional)</li>
 * <li>the language(s) (optional) </li>
 * </ol>
 * The elements of each definition are separated by &quot;:&quot;.
 * The following is an example of a valid definition: <br>
 * <code>personmale.lst:person:male:english</code>
 * Each list file named in the lists definition file is just a list containing
 * one entry per line.
 * When this gazetteer is run over some input text (a GATE document) it
 * will generate annotations of type Lookup having the attributes specified in
 * the definition file.
 */
52  public class DefaultGazetteer extends AbstractGazetteer {
53  
54    /** Debug flag
55     */
56    private static final boolean DEBUG = false;
57  
58    public static final String
59      DEF_GAZ_DOCUMENT_PARAMETER_NAME = "document";
60  
61    public static final String
62      DEF_GAZ_ANNOT_SET_PARAMETER_NAME = "annotationSetName";
63  
64    public static final String
65      DEF_GAZ_LISTS_URL_PARAMETER_NAME = "listsURL";
66  
67    public static final String
68      DEF_GAZ_ENCODING_PARAMETER_NAME = "encoding";
69  
70    public static final String
71      DEF_GAZ_CASE_SENSITIVE_PARAMETER_NAME = "caseSensitive";
72  
73  
74    /** a map of nodes vs gaz lists */
75    private Map listsByNode;
76  
/**
 * Creates a gazetteer without loading anything. The phrase lists are read
 * and parsed later, by {@link #init()}.
 */
public DefaultGazetteer() {
  super();
}
82  
83    /** Does the actual loading and parsing of the lists. This method must be
84     * called before the gazetteer can be used
85     */
86    public Resource init()throws ResourceInstantiationException{
87      fsmStates = new HashSet();
88      initialState = new FSMState(this);
89      if(listsURL == null){
90        throw new ResourceInstantiationException (
91              "No URL provided for gazetteer creation!");
92      }
93      definition = new LinearDefinition();
94      definition.setURL(listsURL);
95      definition.load();
96      int linesCnt = definition.size();
97      listsByNode = definition.loadLists();
98      Iterator inodes = definition.iterator();
99  
100     String line;
101     int nodeIdx = 0;
102     LinearNode node;
103     while (inodes.hasNext()) {
104       node = (LinearNode) inodes.next();
105       fireStatusChanged("Reading " + node.toString());
106       fireProgressChanged(++nodeIdx * 100 / linesCnt);
107       readList(node,true);
108     } // while iline
109     fireProcessFinished();
110     return this;
111   }
112 
113 
114   /** Reads one lists (one file) of phrases
115    *
116    * @param listDesc the line from the definition file
117    * @param add
118    * @add if <b>true</b> will add the phrases found in the list to the ones
119    *     recognised by this gazetter, if <b>false</b> the phrases found in the
120    *     list will be removed from the list of phrases recognised by this
121    *     gazetteer.
122    */
123   void readList(LinearNode node, boolean add) throws ResourceInstantiationException{
124     String listName, majorType, minorType, languages;
125     if ( null == node ) {
126       throw new ResourceInstantiationException(" LinearNode node is null ");
127     }
128 
129     listName = node.getList();
130     majorType = node.getMajorType();
131     minorType = node.getMinorType();
132     languages = node.getLanguage();
133     GazetteerList gazList = (GazetteerList)listsByNode.get(node);
134     if (null == gazList) {
135       throw new ResourceInstantiationException("gazetteer list not found by node");
136     }
137 
138     Iterator iline = gazList.iterator();
139 
140     Lookup lookup = new Lookup(listName,majorType, minorType, languages);
141     lookup.list = node.getList();
142     if ( null != mappingDefinition){
143       MappingNode mnode = mappingDefinition.getNodeByList(lookup.list);
144       if (null!=mnode){
145         lookup.oClass = mnode.getClassID();
146         lookup.ontology = mnode.getOntologyID();
147       }
148     }//if mapping def
149 
150     String line;
151     while(iline.hasNext()){
152       line = iline.next().toString();
153       if(add)addLookup(line, lookup);
154       else removeLookup(line, lookup);
155     }
156   } // void readList(String listDesc)
157 
158   /** Adds one phrase to the list of phrases recognised by this gazetteer
159    *
160    * @param text the phrase to be added
161    * @param lookup the description of the annotation to be added when this
162    *     phrase is recognised
163    */
164 // >>> DAM, was
165 /*
166   public void addLookup(String text, Lookup lookup) {
167     Character currentChar;
168     FSMState currentState = initialState;
169     FSMState nextState;
170     Lookup oldLookup;
171     boolean isSpace;
172 
173     for(int i = 0; i< text.length(); i++) {
174       isSpace = Character.isWhitespace(text.charAt(i));
175       if(isSpace) currentChar = new Character(' ');
176       else currentChar = (caseSensitive.booleanValue()) ?
177                           new Character(text.charAt(i)) :
178                           new Character(Character.toUpperCase(text.charAt(i))) ;
179       nextState = currentState.next(currentChar);
180       if(nextState == null){
181         nextState = new FSMState(this);
182         currentState.put(currentChar, nextState);
183         if(isSpace) nextState.put(new Character(' '),nextState);
184       }
185       currentState = nextState;
186     } //for(int i = 0; i< text.length(); i++)
187 
188     currentState.addLookup(lookup);
189     //Out.println(text + "|" + lookup.majorType + "|" + lookup.minorType);
190 
191   } // addLookup
192 */
193 // >>> DAM: TransArray optimization
194   public void addLookup(String text, Lookup lookup) {
195     char currentChar;
196     FSMState currentState = initialState;
197     FSMState nextState;
198     Lookup oldLookup;
199     boolean isSpace;
200 
201     for(int i = 0; i< text.length(); i++) {
202         currentChar = text.charAt(i);
203         isSpace = Character.isWhitespace(currentChar);
204         if(isSpace) currentChar = ' ';
205         else currentChar = (caseSensitive.booleanValue()) ?
206                           currentChar :
207                           Character.toUpperCase(currentChar) ;
208       nextState = currentState.next(currentChar);
209       if(nextState == null){
210         nextState = new FSMState(this);
211         currentState.put(currentChar, nextState);
212         if(isSpace) nextState.put(' ',nextState);
213       }
214       currentState = nextState;
215     } //for(int i = 0; i< text.length(); i++)
216 
217     currentState.addLookup(lookup);
218     //Out.println(text + "|" + lookup.majorType + "|" + lookup.minorType);
219 
220   } // addLookup
221 // >>> DAM, end
222 
223   /** Removes one phrase to the list of phrases recognised by this gazetteer
224    *
225    * @param text the phrase to be removed
226    * @param lookup the description of the annotation associated to this phrase
227    */
228 // >>> DAM, was
229 /*
230   public void removeLookup(String text, Lookup lookup) {
231     Character currentChar;
232     FSMState currentState = initialState;
233     FSMState nextState;
234     Lookup oldLookup;
235     boolean isSpace;
236 
237     for(int i = 0; i< text.length(); i++) {
238       isSpace = Character.isWhitespace(text.charAt(i));
239       if(isSpace) currentChar = new Character(' ');
240       else currentChar = new Character(text.charAt(i));
241       nextState = currentState.next(currentChar);
242       if(nextState == null) return;//nothing to remove
243       currentState = nextState;
244     } //for(int i = 0; i< text.length(); i++)
245     currentState.removeLookup(lookup);
246   } // removeLookup
247 */
248 // >>> DAM: TransArray optimization
249   public void removeLookup(String text, Lookup lookup) {
250     char currentChar;
251     FSMState currentState = initialState;
252     FSMState nextState;
253     Lookup oldLookup;
254 
255     for(int i = 0; i< text.length(); i++) {
256         currentChar = text.charAt(i);
257         if(Character.isWhitespace(currentChar)) currentChar = ' ';
258         nextState = currentState.next(currentChar);
259         if(nextState == null) return;//nothing to remove
260         currentState = nextState;
261     } //for(int i = 0; i< text.length(); i++)
262     currentState.removeLookup(lookup);
263   } // removeLookup
264 // >>> DAM, end
265 
266   /** Returns a string representation of the deterministic FSM graph using
267    * GML.
268    */
269   public String getFSMgml() {
270     String res = "graph[ \ndirected 1\n";
271     ///String nodes = "", edges = "";
272     StringBuffer nodes = new StringBuffer(gate.Gate.STRINGBUFFER_SIZE),
273                 edges = new StringBuffer(gate.Gate.STRINGBUFFER_SIZE);
274     Iterator fsmStatesIter = fsmStates.iterator();
275     while (fsmStatesIter.hasNext()){
276       FSMState currentState = (FSMState)fsmStatesIter.next();
277       int stateIndex = currentState.getIndex();
278       /*nodes += "node[ id " + stateIndex +
279                " label \"" + stateIndex;
280       */
281       nodes.append("node[ id ");
282       nodes.append(stateIndex);
283       nodes.append(" label \"");
284       nodes.append(stateIndex);
285 
286              if(currentState.isFinal()){
287               ///nodes += ",F\\n" + currentState.getLookupSet();
288               nodes.append(",F\\n");
289               nodes.append(currentState.getLookupSet());
290              }
291              ///nodes +=  "\"  ]\n";
292              nodes.append("\"  ]\n");
293       //edges += currentState.getEdgesGML();
294       edges.append(currentState.getEdgesGML());
295     }
296     res += nodes.toString() + edges.toString() + "]\n";
297     return res;
298   } // getFSMgml
299 
300 
301   /**
302    * This method runs the gazetteer. It assumes that all the needed parameters
303    * are set. If they are not, an exception will be fired.
304    */
305   public void execute() throws ExecutionException{
306     interrupted = false;
307     AnnotationSet annotationSet;
308     //check the input
309     if(document == null) {
310       throw new ExecutionException(
311         "No document to process!"
312       );
313     }
314 
315     if(annotationSetName == null ||
316        annotationSetName.equals("")) annotationSet = document.getAnnotations();
317     else annotationSet = document.getAnnotations(annotationSetName);
318 
319     fireStatusChanged("Doing lookup in " +
320                            document.getName() + "...");
321     String content = document.getContent().toString();
322     int length = content.length();
323 // >>> DAM, was
324 /*
325     Character currentChar;
326 */
327 // >>> DAM: TransArray optimization
328     char currentChar;
329 // >>> DAM, end
330     FSMState currentState = initialState;
331     FSMState nextState;
332     FSMState lastMatchingState = null;
333     int matchedRegionEnd = 0;
334     int matchedRegionStart = 0;
335     int charIdx = 0;
336     int oldCharIdx = 0;
337     FeatureMap fm;
338     Lookup currentLookup;
339 
340 // >>> DAM, was
341 /*
342     while(charIdx < length) {
343       if(Character.isWhitespace(content.charAt(charIdx)))
344         currentChar = new Character(' ');
345       else currentChar = (caseSensitive.booleanValue()) ?
346                          new Character(content.charAt(charIdx)) :
347                          new Character(Character.toUpperCase(
348                                        content.charAt(charIdx)));
349 */
350 // >>> DAM: TransArray optimization
351     while(charIdx < length) {
352       currentChar = content.charAt(charIdx);
353       if(Character.isWhitespace(currentChar)) currentChar = ' ';
354       else currentChar = caseSensitive.booleanValue() ?
355                           currentChar :
356                           Character.toUpperCase(currentChar);
357 // >>> DAM, end
358       nextState = currentState.next(currentChar);
359       if(nextState == null) {
360         //the matching stopped
361 
362         //if we had a successful match then act on it;
363         if(lastMatchingState != null){
364           //let's add the new annotation(s)
365           Iterator lookupIter = lastMatchingState.getLookupSet().iterator();
366 
367           while(lookupIter.hasNext()) {
368             currentLookup = (Lookup)lookupIter.next();
369             fm = Factory.newFeatureMap();
370             fm.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME, currentLookup.majorType);
371             if (null!= currentLookup.oClass && null!=currentLookup.ontology){
372               fm.put(LOOKUP_CLASS_FEATURE_NAME,currentLookup.oClass);
373               fm.put(LOOKUP_ONTOLOGY_FEATURE_NAME,currentLookup.ontology);
374             }
375             if(null != currentLookup.minorType) {
376               fm.put(LOOKUP_MINOR_TYPE_FEATURE_NAME, currentLookup.minorType);
377               if(null != currentLookup.languages)
378                 fm.put("language", currentLookup.languages);
379             }
380             try {
381               annotationSet.add(new Long(matchedRegionStart),
382                               new Long(matchedRegionEnd + 1),
383                               LOOKUP_ANNOTATION_TYPE,
384                               fm);
385             } catch(InvalidOffsetException ioe) {
386               throw new LuckyException(ioe.toString());
387             }
388           }//while(lookupIter.hasNext())
389           lastMatchingState = null;
390         }
391 
392         //reset the FSM
393         charIdx = matchedRegionStart + 1;
394         matchedRegionStart = charIdx;
395         currentState = initialState;
396 
397       } else{//go on with the matching
398         currentState = nextState;
399         //if we have a successful state then store it
400         if(currentState.isFinal() &&
401            (matchedRegionStart == 0 ||
402             !Character.isLetter(content.charAt(matchedRegionStart - 1))) &&
403            (charIdx + 1 >= content.length()   ||
404             !Character.isLetter(content.charAt(charIdx + 1)))
405           ){
406           matchedRegionEnd = charIdx;
407           lastMatchingState = currentState;
408         }
409         charIdx ++;
410         if(charIdx == content.length()){
411           //we can't go on, use the last matching state and restart matching
412           //from the next char
413           if(lastMatchingState != null){
414             //let's add the new annotation(s)
415             Iterator lookupIter = lastMatchingState.getLookupSet().iterator();
416 
417             while(lookupIter.hasNext()) {
418               currentLookup = (Lookup)lookupIter.next();
419               fm = Factory.newFeatureMap();
420               fm.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME, currentLookup.majorType);
421               if (null!= currentLookup.oClass && null!=currentLookup.ontology){
422                 fm.put(LOOKUP_CLASS_FEATURE_NAME,currentLookup.oClass);
423                 fm.put(LOOKUP_ONTOLOGY_FEATURE_NAME,currentLookup.ontology);
424               }
425               if(null != currentLookup.minorType) {
426                 fm.put(LOOKUP_MINOR_TYPE_FEATURE_NAME, currentLookup.minorType);
427                 if(null != currentLookup.languages)
428                   fm.put("language", currentLookup.languages);
429               }
430               try {
431                 annotationSet.add(new Long(matchedRegionStart),
432                                 new Long(matchedRegionEnd + 1),
433                                 LOOKUP_ANNOTATION_TYPE,
434                                 fm);
435               } catch(InvalidOffsetException ioe) {
436                 throw new LuckyException(ioe.toString());
437               }
438             }//while(lookupIter.hasNext())
439             lastMatchingState = null;
440           }
441 
442           //reset the FSM
443           charIdx = matchedRegionStart + 1;
444           matchedRegionStart = charIdx;
445           currentState = initialState;
446         }
447       }
448       if(charIdx - oldCharIdx > 256) {
449         fireProgressChanged((100 * charIdx )/ length );
450         oldCharIdx = charIdx;
451         if(isInterrupted()) throw new ExecutionInterruptedException(
452             "The execution of the " + getName() +
453             " gazetteer has been abruptly interrupted!");
454       }
455     } // while(charIdx < length)
456 
457     if(lastMatchingState != null) {
458       Iterator lookupIter = lastMatchingState.getLookupSet().iterator();
459       while(lookupIter.hasNext()) {
460         currentLookup = (Lookup)lookupIter.next();
461         fm = Factory.newFeatureMap();
462         fm.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME, currentLookup.majorType);
463         if (null!= currentLookup.oClass && null!=currentLookup.ontology){
464           fm.put(LOOKUP_CLASS_FEATURE_NAME,currentLookup.oClass);
465           fm.put(LOOKUP_ONTOLOGY_FEATURE_NAME,currentLookup.ontology);
466         }
467 
468         if(null != currentLookup.minorType)
469           fm.put(LOOKUP_MINOR_TYPE_FEATURE_NAME, currentLookup.minorType);
470         try{
471           annotationSet.add(new Long(matchedRegionStart),
472                           new Long(matchedRegionEnd + 1),
473                           LOOKUP_ANNOTATION_TYPE,
474                           fm);
475         } catch(InvalidOffsetException ioe) {
476           throw new GateRuntimeException(ioe.toString());
477         }
478       }//while(lookupIter.hasNext())
479     }
480     fireProcessFinished();
481     fireStatusChanged("Lookup complete!");
482   } // execute
483 
484 
485   /** The initial state of the FSM that backs this gazetteer
486    */
487   FSMState initialState;
488 
489   /** A set containing all the states of the FSM backing the gazetteer
490    */
491   Set fsmStates;
492 
493   /**lookup <br>
494    * @param singleItem a single string to be looked up by the gazetteer
495    * @return set of the Lookups associated with the parameter*/
496   public Set lookup(String singleItem) {
497     char currentChar;
498     Set set = new HashSet();
499     FSMState currentState = initialState;
500     FSMState nextState;
501 
502     for(int i = 0; i< singleItem.length(); i++) {
503         currentChar = singleItem.charAt(i);
504         if(Character.isWhitespace(currentChar)) currentChar = ' ';
505         nextState = currentState.next(currentChar);
506         if(nextState == null) {
507           return set;
508         }
509         currentState = nextState;
510     } //for(int i = 0; i< text.length(); i++)
511     set = currentState.getLookupSet();
512     return set;
513   }
514 
515   public boolean remove(String singleItem) {
516     char currentChar;
517     FSMState currentState = initialState;
518     FSMState nextState;
519     Lookup oldLookup;
520 
521     for(int i = 0; i< singleItem.length(); i++) {
522         currentChar = singleItem.charAt(i);
523         if(Character.isWhitespace(currentChar)) currentChar = ' ';
524         nextState = currentState.next(currentChar);
525         if(nextState == null) {
526           return false;
527         }//nothing to remove
528         currentState = nextState;
529     } //for(int i = 0; i< text.length(); i++)
530     currentState.lookupSet = new HashSet();
531     return true;
532   }
533 
534   public boolean add(String singleItem, Lookup lookup) {
535     addLookup(singleItem,lookup);
536     return true;
537   }
538 
539 
540 } // DefaultGazetteer
541 
542 // >>> DAM: TransArray optimization, new charMap implementation
/**
 * A minimal iterator over primitive <code>char</code> values.
 */
interface Iter
{
    /** @return true while there is another char to hand out */
    boolean hasNext();

    /** @return the next char */
    char next();
} // Iter
548 
549 /**
550  * class implementing the map using binary serach by char as key
551  * to retrive the coresponding object.
552  */
553 class charMap
554 {
555     char[] itemsKeys = null;
556     Object[] itemsObjs = null;
557 
558     /**
559      * resize the containers by one leavaing empty elemant at position 'index'
560      */
561     void resize(int index)
562     {
563         int newsz = itemsKeys.length + 1;
564         char[] tempKeys = new char[newsz];
565         Object[] tempObjs = new Object[newsz];
566         int i;
567         for (i= 0; i < index; i++)
568         {
569             tempKeys[i] = itemsKeys[i];
570             tempObjs[i] = itemsObjs[i];
571         }
572         for (i= index+1; i < newsz; i++)
573         {
574             tempKeys[i] = itemsKeys[i-1];
575             tempObjs[i] = itemsObjs[i-1];
576         }
577 
578         itemsKeys = tempKeys;
579         itemsObjs = tempObjs;
580     } // resize
581 
582 /**
583  * get the object from the map using the char key
584  */
585     Object get(char key)
586     {
587         if (itemsKeys == null) return null;
588         int index = Arrays.binarySearch(itemsKeys, key);
589         if (index<0)
590             return null;
591         return itemsObjs[index];
592     }
593 /**
594  * put the object into the char map using the chat as the key
595  */
596     Object put(char key, Object value)
597     {
598         if (itemsKeys == null)
599         {
600             itemsKeys = new char[1];
601             itemsKeys[0] = key;
602             itemsObjs = new Object[1];
603             itemsObjs[0] = value;
604             return value;
605         }// if first time
606         int index = Arrays.binarySearch(itemsKeys, key);
607         if (index<0)
608         {
609             index = ~index;
610             resize(index);
611             itemsKeys[index] = key;
612             itemsObjs[index] = value;
613         }
614         return itemsObjs[index];
615     } // put
616 /**
617  * the keys itereator
618  * /
619     public Iter iter()
620     {
621         return new Iter()
622         {
623             int counter = 0;
624             public boolean hasNext() {return counter < itemsKeys.length;}
625             public char next() { return itemsKeys[counter];}
626         };
627     } // iter()
628  */
629 
630 } // class charMap
631 // >>> DAM, end, new charMap instead MAP for transition function in the FSMState