1   /*
2    * DefaultGazeteer.java
3    *
4    * Copyright (c) 2000-2001, The University of Sheffield.
5    *
6    * This file is part of GATE (see http://gate.ac.uk/), and is free
7    * software, licenced under the GNU Library General Public License,
8    * Version 2, June1991.
9    *
10   * A copy of this licence is included in the distribution in the file
11   * licence.html, and is also available at http://gate.ac.uk/gate/licence.html.
12   *
13   * Valentin Tablan, 03/07/2000
14   *
15   * $Id: DefaultGazetteer.java,v 1.33 2001/11/12 15:04:28 valyt Exp $
16   */
17  
18  package gate.creole.gazetteer;
19  
20  import java.io.*;
21  import java.util.*;
22  import java.net.*;
23  
24  import gate.util.*;
25  import gate.creole.*;
26  import gate.event.*;
27  import gate.*;
28  
29  /** This component is responsible for doing lists lookup. The implementaion is
30   * based on finite state machines.
31   * The phrases to be recognised should be listed in a set of files, one for
32   * each type of occurences.
33   * The gazeteer is build with the information from a file that contains the set
34   * of lists (which are files as well) and the associated type for each list.
35   * The file defining the set of lists should have the following syntax:
36   * each list definition should be written on its own line and should contain:
37   * <ol>
38   * <li>the file name (required) </li>
39   * <li>the major type (required) </li>
40   * <li>the minor type (optional)</li>
41   * <li>the language(s) (optional) </li>
42   * </ol>
43   * The elements of each definition are separated by &quot;:&quot;.
44   * The following is an example of a valid definition: <br>
45   * <code>personmale.lst:person:male:english</code>
46   * Each list file named in the lists definition file is just a list containing
47   * one entry per line.
48   * When this gazetter will be run over some input text (a Gate document) it
49   * will generate annotations of type Lookup having the attributes specified in
50   * the definition file.
51   */
52  public class DefaultGazetteer extends AbstractLanguageAnalyser
53               implements ProcessingResource {
54  
55    /** Debug flag
56     */
57    private static final boolean DEBUG = false;
58  
59    /** Build a gazetter using the default lists from the agte resources
60     * {@see init()}
61     */
62    public DefaultGazetteer(){
63    }
64  
65    /** Does the actual loading and parsing of the lists. This method must be
66     * called before the gazetteer can be used
67     */
68    public Resource init()throws ResourceInstantiationException{
69      fsmStates = new HashSet();
70      try{
71        initialState = new FSMState(this);
72        if(listsURL == null){
73          throw new ResourceInstantiationException (
74                "No URL provided for gazetteer creation!");
75        }
76  
77        //find the number of lines
78        Reader reader = new InputStreamReader(listsURL.openStream(), encoding);
79        int linesCnt = 0;
80        BufferedReader bReader = new BufferedReader(reader);
81        String line = bReader.readLine();
82        while (line != null) {
83          linesCnt++;
84          line = bReader.readLine();
85        }
86        bReader.close();
87  
88        //parse the file
89        reader = new InputStreamReader(listsURL.openStream(), encoding);
90        bReader = new BufferedReader(reader);
91        line = bReader.readLine();
92        ///String toParse = "";
93        StringBuffer toParse = new StringBuffer(gate.Gate.STRINGBUFFER_SIZE);
94  
95        int lineIdx = 0;
96        while (line != null) {
97          if(line.endsWith("\\")) {
98            ///toParse += line.substring(0,line.length()-1);
99            toParse.append(line.substring(0,line.length()-1));
100         } else {
101           ///toParse += line;
102           toParse.append(line);
103           fireStatusChanged("Reading " + toParse.toString());
104           fireProgressChanged(lineIdx * 100 / linesCnt);
105           lineIdx ++;
106           readList(toParse.toString(), true);
107           ///toParse = "";
108           toParse.delete(0,toParse.length());
109         }
110         line = bReader.readLine();
111       }
112       fireProcessFinished();
113     }catch(IOException ioe){
114       throw new ResourceInstantiationException(ioe);
115     }catch(GazetteerException ge){
116       throw new ResourceInstantiationException(ge);
117     }
118     return this;
119   }
120 
121 
122   /** Reads one lists (one file) of phrases
123    *
124    * @param listDesc the line from the definition file
125    * @param add
126    * @add if <b>true</b> will add the phrases found in the list to the ones
127    *     recognised by this gazetter, if <b>false</b> the phrases found in the
128    *     list will be removed from the list of phrases recognised by this
129    *     gazetteer.
130    */
131   void readList(String listDesc, boolean add) throws FileNotFoundException,
132                                         IOException,
133                                         GazetteerException{
134     String listName, majorType, minorType, languages;
135     int firstColon = listDesc.indexOf(':');
136     int secondColon = listDesc.indexOf(':', firstColon + 1);
137     int thirdColon = listDesc.indexOf(':', secondColon + 1);
138     if(firstColon == -1){
139       throw new GazetteerException("Invalid list definition: " + listDesc);
140     }
141     listName = listDesc.substring(0, firstColon);
142 
143     if(secondColon == -1){
144       majorType = listDesc.substring(firstColon + 1);
145       minorType = null;
146       languages = null;
147     } else {
148       majorType = listDesc.substring(firstColon + 1, secondColon);
149       if(thirdColon == -1) {
150         minorType = listDesc.substring(secondColon + 1);
151         languages = null;
152       } else {
153         minorType = listDesc.substring(secondColon + 1, thirdColon);
154         languages = listDesc.substring(thirdColon + 1);
155       }
156     }
157     BufferedReader listReader;
158 
159     listReader = new BufferedReader(new InputStreamReader(
160                             (new URL(listsURL, listName)).openStream(), encoding));
161 
162     Lookup lookup = new Lookup(majorType, minorType, languages);
163     String line = listReader.readLine();
164     while(null != line){
165       if(add)addLookup(line, lookup);
166       else removeLookup(line, lookup);
167       line = listReader.readLine();
168     }
169   } // void readList(String listDesc)
170 
171   /** Adds one phrase to the list of phrases recognised by this gazetteer
172    *
173    * @param text the phrase to be added
174    * @param lookup the description of the annotation to be added when this
175    *     phrase is recognised
176    */
177 // >>> DAM, was
178 /*
179   public void addLookup(String text, Lookup lookup) {
180     Character currentChar;
181     FSMState currentState = initialState;
182     FSMState nextState;
183     Lookup oldLookup;
184     boolean isSpace;
185 
186     for(int i = 0; i< text.length(); i++) {
187       isSpace = Character.isWhitespace(text.charAt(i));
188       if(isSpace) currentChar = new Character(' ');
189       else currentChar = (caseSensitive.booleanValue()) ?
190                           new Character(text.charAt(i)) :
191                           new Character(Character.toUpperCase(text.charAt(i))) ;
192       nextState = currentState.next(currentChar);
193       if(nextState == null){
194         nextState = new FSMState(this);
195         currentState.put(currentChar, nextState);
196         if(isSpace) nextState.put(new Character(' '),nextState);
197       }
198       currentState = nextState;
199     } //for(int i = 0; i< text.length(); i++)
200 
201     currentState.addLookup(lookup);
202     //Out.println(text + "|" + lookup.majorType + "|" + lookup.minorType);
203 
204   } // addLookup
205 */
206 // >>> DAM: TransArray optimization
207   public void addLookup(String text, Lookup lookup) {
208     char currentChar;
209     FSMState currentState = initialState;
210     FSMState nextState;
211     Lookup oldLookup;
212     boolean isSpace;
213 
214     for(int i = 0; i< text.length(); i++) {
215         currentChar = text.charAt(i);
216         isSpace = Character.isWhitespace(currentChar);
217         if(isSpace) currentChar = ' ';
218         else currentChar = (caseSensitive.booleanValue()) ?
219                           currentChar :
220                           Character.toUpperCase(currentChar) ;
221       nextState = currentState.next(currentChar);
222       if(nextState == null){
223         nextState = new FSMState(this);
224         currentState.put(currentChar, nextState);
225         if(isSpace) nextState.put(' ',nextState);
226       }
227       currentState = nextState;
228     } //for(int i = 0; i< text.length(); i++)
229 
230     currentState.addLookup(lookup);
231     //Out.println(text + "|" + lookup.majorType + "|" + lookup.minorType);
232 
233   } // addLookup
234 // >>> DAM, end
235 
236   /** Removes one phrase to the list of phrases recognised by this gazetteer
237    *
238    * @param text the phrase to be removed
239    * @param lookup the description of the annotation associated to this phrase
240    */
241 // >>> DAM, was
242 /*
243   public void removeLookup(String text, Lookup lookup) {
244     Character currentChar;
245     FSMState currentState = initialState;
246     FSMState nextState;
247     Lookup oldLookup;
248     boolean isSpace;
249 
250     for(int i = 0; i< text.length(); i++) {
251       isSpace = Character.isWhitespace(text.charAt(i));
252       if(isSpace) currentChar = new Character(' ');
253       else currentChar = new Character(text.charAt(i));
254       nextState = currentState.next(currentChar);
255       if(nextState == null) return;//nothing to remove
256       currentState = nextState;
257     } //for(int i = 0; i< text.length(); i++)
258     currentState.removeLookup(lookup);
259   } // removeLookup
260 */
261 // >>> DAM: TransArray optimization
262   public void removeLookup(String text, Lookup lookup) {
263     char currentChar;
264     FSMState currentState = initialState;
265     FSMState nextState;
266     Lookup oldLookup;
267 
268     for(int i = 0; i< text.length(); i++) {
269         currentChar = text.charAt(i);
270         if(Character.isWhitespace(currentChar)) currentChar = ' ';
271         nextState = currentState.next(currentChar);
272         if(nextState == null) return;//nothing to remove
273         currentState = nextState;
274     } //for(int i = 0; i< text.length(); i++)
275     currentState.removeLookup(lookup);
276   } // removeLookup
277 // >>> DAM, end
278 
279   /** Returns a string representation of the deterministic FSM graph using
280    * GML.
281    */
282   public String getFSMgml() {
283     String res = "graph[ \ndirected 1\n";
284     ///String nodes = "", edges = "";
285     StringBuffer nodes = new StringBuffer(gate.Gate.STRINGBUFFER_SIZE),
286                 edges = new StringBuffer(gate.Gate.STRINGBUFFER_SIZE);
287     Iterator fsmStatesIter = fsmStates.iterator();
288     while (fsmStatesIter.hasNext()){
289       FSMState currentState = (FSMState)fsmStatesIter.next();
290       int stateIndex = currentState.getIndex();
291       /*nodes += "node[ id " + stateIndex +
292                " label \"" + stateIndex;
293       */
294       nodes.append("node[ id ");
295       nodes.append(stateIndex);
296       nodes.append(" label \"");
297       nodes.append(stateIndex);
298 
299              if(currentState.isFinal()){
300               ///nodes += ",F\\n" + currentState.getLookupSet();
301               nodes.append(",F\\n");
302               nodes.append(currentState.getLookupSet());
303              }
304              ///nodes +=  "\"  ]\n";
305              nodes.append("\"  ]\n");
306       //edges += currentState.getEdgesGML();
307       edges.append(currentState.getEdgesGML());
308     }
309     res += nodes.toString() + edges.toString() + "]\n";
310     return res;
311   } // getFSMgml
312 
313   //no doc required: javadoc will copy it from the interface
314   /**    */
315   public FeatureMap getFeatures(){
316     return features;
317   } // getFeatures
318 
319   /**    */
320   public void setFeatures(FeatureMap features){
321     this.features = features;
322   } // setFeatures
323 
324 
325 
326   /**
327    * This method runs the gazetteer. It assumes that all the needed parameters
328    * are set. If they are not, an exception will be fired.
329    */
330   public void execute() throws ExecutionException{
331     interrupted = false;
332     AnnotationSet annotationSet;
333     //check the input
334     if(document == null) {
335       throw new ExecutionException(
336         "No document to process!"
337       );
338     }
339 
340     if(annotationSetName == null ||
341        annotationSetName.equals("")) annotationSet = document.getAnnotations();
342     else annotationSet = document.getAnnotations(annotationSetName);
343 
344     fireStatusChanged("Doing lookup in " +
345                            document.getSourceUrl().getFile() + "...");
346     String content = document.getContent().toString();
347     int length = content.length();
348 // >>> DAM, was
349 /*
350     Character currentChar;
351 */
352 // >>> DAM: TransArray optimization
353     char currentChar;
354 // >>> DAM, end
355     FSMState currentState = initialState;
356     FSMState nextState;
357     FSMState lastMatchingState = null;
358     int matchedRegionEnd = 0;
359     int matchedRegionStart = 0;
360     int charIdx = 0;
361     int oldCharIdx = 0;
362     FeatureMap fm;
363     Lookup currentLookup;
364 
365 // >>> DAM, was
366 /*
367     while(charIdx < length) {
368       if(Character.isWhitespace(content.charAt(charIdx)))
369         currentChar = new Character(' ');
370       else currentChar = (caseSensitive.booleanValue()) ?
371                          new Character(content.charAt(charIdx)) :
372                          new Character(Character.toUpperCase(
373                                        content.charAt(charIdx)));
374 */
375 // >>> DAM: TransArray optimization
376     while(charIdx < length) {
377       currentChar = content.charAt(charIdx);
378       if(Character.isWhitespace(currentChar)) currentChar = ' ';
379       else currentChar = caseSensitive.booleanValue() ?
380                           currentChar :
381                           Character.toUpperCase(currentChar);
382 // >>> DAM, end
383       nextState = currentState.next(currentChar);
384       if(nextState == null) {
385         //the matching stopped
386 
387         //if we had a successful match then act on it;
388         if(lastMatchingState != null){
389           //let's add the new annotation(s)
390           Iterator lookupIter = lastMatchingState.getLookupSet().iterator();
391 
392           while(lookupIter.hasNext()) {
393             currentLookup = (Lookup)lookupIter.next();
394             fm = Factory.newFeatureMap();
395             fm.put("majorType", currentLookup.majorType);
396             if(null != currentLookup.minorType) {
397               fm.put("minorType", currentLookup.minorType);
398               if(null != currentLookup.languages)
399                 fm.put("language", currentLookup.languages);
400             }
401             try {
402               annotationSet.add(new Long(matchedRegionStart),
403                               new Long(matchedRegionEnd + 1),
404                               "Lookup",
405                               fm);
406             } catch(InvalidOffsetException ioe) {
407               throw new LuckyException(ioe.toString());
408             }
409           }//while(lookupIter.hasNext())
410           lastMatchingState = null;
411         }
412 
413         //reset the FSM
414         charIdx = matchedRegionStart + 1;
415         matchedRegionStart = charIdx;
416         currentState = initialState;
417 
418       } else{//go on with the matching
419         currentState = nextState;
420         //if we have a successful state then store it
421         if(currentState.isFinal() &&
422            (matchedRegionStart == 0 ||
423             !Character.isLetter(content.charAt(matchedRegionStart - 1))) &&
424            (charIdx + 1 >= content.length()   ||
425             !Character.isLetter(content.charAt(charIdx + 1)))
426           ){
427           matchedRegionEnd = charIdx;
428           lastMatchingState = currentState;
429         }
430         charIdx ++;
431         if(charIdx == content.length()){
432           //we can't go on, use the last matching state and restart matching
433           //from the next char
434           if(lastMatchingState != null){
435             //let's add the new annotation(s)
436             Iterator lookupIter = lastMatchingState.getLookupSet().iterator();
437 
438             while(lookupIter.hasNext()) {
439               currentLookup = (Lookup)lookupIter.next();
440               fm = Factory.newFeatureMap();
441               fm.put("majorType", currentLookup.majorType);
442               if(null != currentLookup.minorType) {
443                 fm.put("minorType", currentLookup.minorType);
444                 if(null != currentLookup.languages)
445                   fm.put("language", currentLookup.languages);
446               }
447               try {
448                 annotationSet.add(new Long(matchedRegionStart),
449                                 new Long(matchedRegionEnd + 1),
450                                 "Lookup",
451                                 fm);
452               } catch(InvalidOffsetException ioe) {
453                 throw new LuckyException(ioe.toString());
454               }
455             }//while(lookupIter.hasNext())
456             lastMatchingState = null;
457           }
458 
459           //reset the FSM
460           charIdx = matchedRegionStart + 1;
461           matchedRegionStart = charIdx;
462           currentState = initialState;
463         }
464       }
465       if(charIdx - oldCharIdx > 256) {
466         fireProgressChanged((100 * charIdx )/ length );
467         oldCharIdx = charIdx;
468         if(isInterrupted()) throw new ExecutionInterruptedException(
469             "The execution of the " + getName() +
470             " gazetteer has been abruptly interrupted!");
471       }
472     } // while(charIdx < length)
473 
474     if(lastMatchingState != null) {
475       Iterator lookupIter = lastMatchingState.getLookupSet().iterator();
476       while(lookupIter.hasNext()) {
477         currentLookup = (Lookup)lookupIter.next();
478         fm = Factory.newFeatureMap();
479         fm.put("majorType", currentLookup.majorType);
480         if(null != currentLookup.minorType)
481           fm.put("minorType", currentLookup.minorType);
482         try{
483           annotationSet.add(new Long(matchedRegionStart),
484                           new Long(matchedRegionEnd + 1),
485                           "Lookup",
486                           fm);
487         } catch(InvalidOffsetException ioe) {
488           throw new GateRuntimeException(ioe.toString());
489         }
490       }//while(lookupIter.hasNext())
491     }
492     fireProcessFinished();
493     fireStatusChanged("Lookup complete!");
494   } // execute
495 
496 
497   /**
498    * Sets the AnnotationSet that will be used at the next run for the newly
499    * produced annotations.
500    */
501   public void setAnnotationSetName(String newAnnotationSetName) {
502     annotationSetName = newAnnotationSetName;
503   }
504 
505 
506   /** The initial state of the FSM that backs this gazetteer
507    */
508   FSMState initialState;
509 
510   /** A set containing all the states of the FSM backing the gazetteer
511    */
512   Set fsmStates;
513 
514   protected FeatureMap features  = null;
515 
516   /** Used to store the annotation set currently being used for the newly
517    * generated annotations
518    */
519   protected String annotationSetName;
520 
521   private String encoding = "UTF-8";
522 
523   /**
524    * The value of this property is the URL that will be used for reading the
525    * lists dtaht define this Gazetteer
526    */
527   private java.net.URL listsURL;
528 
529   /**
530    * Should this gazetteer be case sensitive. The default value is true.
531    */
532   private Boolean caseSensitive = new Boolean(true);
533 
534   public void setEncoding(String newEncoding) {
535     encoding = newEncoding;
536   }
537   public String getEncoding() {
538     return encoding;
539   }
540   public void setListsURL(java.net.URL newListsURL) {
541     listsURL = newListsURL;
542   }
543   public java.net.URL getListsURL() {
544     return listsURL;
545   }
546   public void setCaseSensitive(Boolean newCaseSensitive) {
547     caseSensitive = newCaseSensitive;
548   }
549   public Boolean getCaseSensitive() {
550     return caseSensitive;
551   }
552   public String getAnnotationSetName() {
553     return annotationSetName;
554   }
555 
556 } // DefaultGazetteer
557 
558 // >>> DAM: TransArray optimization, new charMap implementation
/**
 * A minimal forward iterator producing primitive <code>char</code> values.
 * It is declared next to charMap, apparently as the type of its key
 * iterator.
 */
interface Iter
{
    /** Tells whether a further call to {@link #next()} is possible. */
    boolean hasNext();

    /** Returns the next char of the iteration. */
    char next();
} // iter class
564 
565 /**
566  * class implementing the map using binary serach by char as key
567  * to retrive the coresponding object.
568  */
569 class charMap
570 {
571     char[] itemsKeys = null;
572     Object[] itemsObjs = null;
573 
574     /**
575      * resize the containers by one leavaing empty elemant at position 'index'
576      */
577     void resize(int index)
578     {
579         int newsz = itemsKeys.length + 1;
580         char[] tempKeys = new char[newsz];
581         Object[] tempObjs = new Object[newsz];
582         int i;
583         for (i= 0; i < index; i++)
584         {
585             tempKeys[i] = itemsKeys[i];
586             tempObjs[i] = itemsObjs[i];
587         }
588         for (i= index+1; i < newsz; i++)
589         {
590             tempKeys[i] = itemsKeys[i-1];
591             tempObjs[i] = itemsObjs[i-1];
592         }
593 
594         itemsKeys = tempKeys;
595         itemsObjs = tempObjs;
596     } // resize
597 
598 /**
599  * get the object from the map using the char key
600  */
601     Object get(char key)
602     {
603         if (itemsKeys == null) return null;
604         int index = Arrays.binarySearch(itemsKeys, key);
605         if (index<0)
606             return null;
607         return itemsObjs[index];
608     }
609 /**
610  * put the object into the char map using the chat as the key
611  */
612     Object put(char key, Object value)
613     {
614         if (itemsKeys == null)
615         {
616             itemsKeys = new char[1];
617             itemsKeys[0] = key;
618             itemsObjs = new Object[1];
619             itemsObjs[0] = value;
620             return value;
621         }// if first time
622         int index = Arrays.binarySearch(itemsKeys, key);
623         if (index<0)
624         {
625             index = ~index;
626             resize(index);
627             itemsKeys[index] = key;
628             itemsObjs[index] = value;
629         }
630         return itemsObjs[index];
631     } // put
632 /**
633  * the keys itereator
634  * /
635     public Iter iter()
636     {
637         return new Iter()
638         {
639             int counter = 0;
640             public boolean hasNext() {return counter < itemsKeys.length;}
641             public char next() { return itemsKeys[counter];}
642         };
643     } // iter()
644  */
645 
646 } // class charMap
647 // >>> DAM, end, new charMap instead MAP for transition function in the FSMState