1   /*
2    *  DefaultTokeniser.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Valentin Tablan, 2000
12   *
13   *  $Id: SimpleTokeniser.java,v 1.10 2001/10/05 15:40:07 valyt Exp $
14   */
15  
16  package gate.creole.tokeniser;
17  
18  import java.util.*;
19  import java.io.*;
20  import java.net.*;
21  import java.lang.reflect.*;
22  
23  import gate.*;
24  import gate.creole.*;
25  import gate.event.*;
26  import gate.util.*;
27  
28  //import EDU.auburn.VGJ.graph.ParseError;
29  
30  /** Implementation of a Unicode rule based tokeniser.
 * The tokeniser gets its rules from a file, an {@link java.io.InputStream
 * InputStream} or a {@link java.io.Reader Reader}, which should be sent to
 * one of the constructors.
 * The implementation is based on a finite state machine that is built from
 * the set of rules.
 * A rule has two sides, the left hand side (LHS) and the right hand side (RHS)
37   * that are separated by the ">" character. The LHS represents a
38   * regular expression that will be matched against the input while the RHS
39   * describes a Gate2 annotation in terms of annotation type and attribute-value
40   * pairs.
 * The matching is done using Unicode enumerated types as defined by the {@link
 * java.lang.Character Character} class. At the time of writing this class the
 * supported Unicode categories were:
44   * <ul>
45   * <li>UNASSIGNED
46   * <li>UPPERCASE_LETTER
47   * <li>LOWERCASE_LETTER
48   * <li>TITLECASE_LETTER
49   * <li>MODIFIER_LETTER
50   * <li>OTHER_LETTER
51   * <li>NON_SPACING_MARK
52   * <li>ENCLOSING_MARK
53   * <li>COMBINING_SPACING_MARK
54   * <li>DECIMAL_DIGIT_NUMBER
55   * <li>LETTER_NUMBER
56   * <li>OTHER_NUMBER
57   * <li>SPACE_SEPARATOR
58   * <li>LINE_SEPARATOR
59   * <li>PARAGRAPH_SEPARATOR
60   * <li>CONTROL
61   * <li>FORMAT
62   * <li>PRIVATE_USE
63   * <li>SURROGATE
64   * <li>DASH_PUNCTUATION
65   * <li>START_PUNCTUATION
66   * <li>END_PUNCTUATION
67   * <li>CONNECTOR_PUNCTUATION
68   * <li>OTHER_PUNCTUATION
69   * <li>MATH_SYMBOL
70   * <li>CURRENCY_SYMBOL
71   * <li>MODIFIER_SYMBOL
72   * <li>OTHER_SYMBOL
73   * </ul>
 * The accepted operators for the LHS are "+", "*" and "|" having the usual
 * interpretations of "1 to n occurrences", "0 to n occurrences" and
 * "boolean OR".
77   * For instance this is a valid LHS:
78   * <br>"UPPERCASE_LETTER" "LOWERCASE_LETTER"+
79   * <br>meaning an uppercase letter followed by one or more lowercase letters.
80   *
81   * The RHS describes an annotation that is to be created and inserted in the
82   * annotation set provided in case of a match. The new annotation will span the
 * text that has been recognised. The RHS consists of the annotation type
84   * followed by pairs of attributes and associated values.
85   * E.g. for the LHS above a possible RHS can be:<br>
86   * Token;kind=upperInitial;<br>
87   * representing an annotation of type &quot;Token&quot; having one attribute
88   * named &quot;kind&quot; with the value &quot;upperInitial&quot;<br>
 * The entire rule will be:<br>
90   * <pre>"UPPERCASE_LETTER" "LOWERCASE_LETTER"+ > Token;kind=upperInitial;</pre>
91   * <br>
92   * The tokeniser ignores all the empty lines or the ones that start with # or
93   * //.
94   *
95   */
public class SimpleTokeniser extends AbstractLanguageAnalyser{
  /** Debug flag; compile-time constant, no debug output is produced when
   * false.
   */
  private static final boolean DEBUG = false;

  /**
   * Creates an uninitialised tokeniser. {@link #init()} must be called
   * before use so that the rules are read and the finite state machine is
   * built.
   */
  public SimpleTokeniser(){
  }
106 
107   /**
108    * Initialises this tokeniser by reading the rules from an external source (provided through an URL) and building
109    * the finite state machine at the core of the tokeniser.
110    *
111    * @exception ResourceInstantiationException
112    */
113   public Resource init() throws ResourceInstantiationException{
114     Reader rulesReader;
115     try{
116       if(rulesURL != null){
117         rulesReader = new InputStreamReader(rulesURL.openStream(), encoding);
118       }else{
119         //no init data, Scream!
120         throw new ResourceInstantiationException(
121           "No URL provided for the rules!");
122       }
123       initialState = new FSMState(this);
124       BufferedReader bRulesReader = new BufferedReader(rulesReader);
125       String line = bRulesReader.readLine();
126       ///String toParse = "";
127       StringBuffer toParse = new StringBuffer(Gate.STRINGBUFFER_SIZE);
128 
129       while (line != null){
130         if(line.endsWith("\\")){
131           ///toParse += line.substring(0,line.length()-1);
132           toParse.append(line.substring(0,line.length()-1));
133         }else{
134           /*toParse += line;
135           parseRule(toParse);
136           toParse = "";
137           */
138           toParse.append(line);
139           parseRule(toParse.toString());
140           toParse.delete(0,toParse.length());
141         }
142         line = bRulesReader.readLine();
143       }
144       eliminateVoidTransitions();
145     }catch(java.io.IOException ioe){
146       throw new ResourceInstantiationException(ioe);
147     }catch(TokeniserException te){
148       throw new ResourceInstantiationException(te);
149     }
150     return this;
151   }
152 
153   /**
154    * Prepares this Processing resource for a new run.
155    */
156   public void reset(){
157     document = null;
158     annotationSetName = null;
159   }
160 
161   /** Parses one input line containing a tokeniser rule.
162    * This will create the necessary FSMState objects and the links
163    * between them.
164    *
165    * @param line the string containing the rule
166    */
167   void parseRule(String line)throws TokeniserException{
168     //ignore comments
169     if(line.startsWith("#")) return;
170 
171     if(line.startsWith("//")) return;
172 
173     StringTokenizer st = new StringTokenizer(line, "()+*|\" \t\f>", true);
174     FSMState newState = new FSMState(this);
175 
176     initialState.put(null, newState);
177     FSMState finalState = parseLHS(newState, st, LHStoRHS);
178     String rhs = "";
179 
180     if(st.hasMoreTokens()) rhs = st.nextToken("\f");
181 
182     if(rhs.length() > 0)finalState.setRhs(rhs);
183   } // parseRule
184 
185   /** Parses a part or the entire LHS.
186    *
187    * @param startState a FSMState object representing the initial state for
188    *     the small FSM that will recognise the (part of) the rule parsed by this
189    *     method.
190    * @param st a {@link java.util.StringTokenizer StringTokenizer} that
191    *     provides the input
192    * @param until the string that marks the end of the section to be
193    *     recognised. This method will first be called by {@link
194    *     #parseRule(String)} with &quot; &gt;&quot; in order to parse the entire
195    *     LHS. when necessary it will make itself another call to {@link #parseLHS
196    *     parseLHS} to parse a region of the LHS (e.g. a
197    *     &quot;(&quot;,&quot;)&quot; enclosed part.
198    */
199   FSMState parseLHS(FSMState startState, StringTokenizer st, String until)
200        throws TokeniserException{
201 
202     FSMState currentState = startState;
203     boolean orFound = false;
204     List orList = new LinkedList();
205     String token;
206     token = skipIgnoreTokens(st);
207 
208     if(null == token) return currentState;
209 
210     FSMState newState;
211     Integer typeId;
212     UnicodeType uType;
213 
214     bigwhile: while(!token.equals(until)){
215       if(token.equals("(")){//(..)
216         newState = parseLHS(currentState, st,")");
217       } else if(token.equals("\"")){//"unicode_type"
218         String sType = parseQuotedString(st, "\"");
219         //Out.println(sType);
220         newState = new FSMState(this);
221         typeId = (Integer)stringTypeIds.get(sType);
222 
223         if(null == typeId)
224           throw new InvalidRuleException("Invalid type: \"" + sType + "\"");
225         else uType = new UnicodeType(typeId.intValue());
226 
227         currentState.put(uType ,newState);
228       } else {// a type with no quotes
229         String sType = token;
230         //Out.println(sType);
231         newState = new FSMState(this);
232         typeId = (Integer)stringTypeIds.get(sType);
233 
234         if(null == typeId)
235           throw new InvalidRuleException("Invalid type: \"" + sType + "\"");
236         else uType = new UnicodeType(typeId.intValue());
237 
238         currentState.put(uType ,newState);
239       }
240       //treat the operators
241       token = skipIgnoreTokens(st);
242       if(null == token) throw
243         new InvalidRuleException("Tokeniser rule ended too soon!");
244 
245       if(token.equals("|")) {
246 
247         orFound = true;
248         orList.add(newState);
249         token = skipIgnoreTokens(st);
250         if(null == token) throw
251           new InvalidRuleException("Tokeniser rule ended too soon!");
252 
253         continue bigwhile;
254       } else if(orFound) {//done parsing the "|"
255         orFound = false;
256         orList.add(newState);
257         newState = new FSMState(this);
258         Iterator orListIter = orList.iterator();
259 
260         while(orListIter.hasNext())
261           ((FSMState)orListIter.next()).put(null, newState);
262         orList.clear();
263       }
264 
265       if(token.equals("+")) {
266 
267         newState.put(null,currentState);
268         currentState = newState;
269         newState = new FSMState(this);
270         currentState.put(null,newState);
271         token = skipIgnoreTokens(st);
272 
273         if(null == token) throw
274           new InvalidRuleException("Tokeniser rule ended too soon!");
275       } else if(token.equals("*")) {
276 
277         currentState.put(null,newState);
278         newState.put(null,currentState);
279         currentState = newState;
280         newState = new FSMState(this);
281         currentState.put(null,newState);
282         token = skipIgnoreTokens(st);
283 
284         if(null == token) throw
285           new InvalidRuleException("Tokeniser rule ended too soon!");
286       }
287       currentState = newState;
288     }
289     return currentState;
290   } // parseLHS
291 
292   /** Parses from the given string tokeniser until it finds a specific
293    * delimiter.
294    * One use for this method is to read everything until the first quote.
295    *
296    * @param st a {@link java.util.StringTokenizer StringTokenizer} that
297    *     provides the input
298    * @param until a String representing the end delimiter.
299    */
300   String parseQuotedString(StringTokenizer st, String until)
301     throws TokeniserException {
302 
303     String token;
304 
305     if(st.hasMoreElements()) token = st.nextToken();
306     else return null;
307 
308     ///String type = "";
309     StringBuffer type = new StringBuffer(Gate.STRINGBUFFER_SIZE);
310 
311     while(!token.equals(until)){
312       //type += token;
313       type.append(token);
314       if(st.hasMoreElements())token = st.nextToken();
315       else throw new InvalidRuleException("Tokeniser rule ended too soon!");
316     }
317     return type.toString();
318   } // parseQuotedString
319 
320   /** Skips the ignorable tokens from the input returning the first significant
321    * token.
322    * The ignorable tokens are defined by {@link #ignoreTokens a set}
323    */
324   protected static String skipIgnoreTokens(StringTokenizer st){
325     Iterator ignorables;
326     boolean ignorableFound = false;
327     String currentToken;
328 
329     while(true){
330       if(st.hasMoreTokens()){
331         currentToken = st.nextToken();
332         ignorables = ignoreTokens.iterator();
333         ignorableFound = false;
334 
335         while(!ignorableFound && ignorables.hasNext()){
336           if(currentToken.equals((String)ignorables.next()))
337             ignorableFound = true;
338         }
339 
340         if(!ignorableFound) return currentToken;
341       } else return null;
342     }
343   }//skipIgnoreTokens
344 
345   /* Computes the lambda-closure (aka epsilon closure) of the given set of
346    * states, that is the set of states that are accessible from any of the
347    * states in the given set using only unrestricted transitions.
348    * @return a set containing all the states accessible from this state via
349    * transitions that bear no restrictions.
350    */
351   /**
352    * Converts the finite state machine to a deterministic one.
353    *
354    * @param s
355    */
356   private AbstractSet lambdaClosure(Set s){
357 
358     //the stack/queue used by the algorithm
359     LinkedList list = new LinkedList(s);
360 
361     //the set to be returned
362     AbstractSet lambdaClosure = new HashSet(s);
363 
364     FSMState top;
365     FSMState currentState;
366     Set nextStates;
367     Iterator statesIter;
368 
369     while(!list.isEmpty()) {
370       top = (FSMState)list.removeFirst();
371       nextStates = top.nextSet(null);
372 
373       if(null != nextStates){
374         statesIter = nextStates.iterator();
375 
376         while(statesIter.hasNext()) {
377           currentState = (FSMState)statesIter.next();
378           if(!lambdaClosure.contains(currentState)){
379             lambdaClosure.add(currentState);
380             list.addFirst(currentState);
381           }//if(!lambdaClosure.contains(currentState))
382         }//while(statesIter.hasNext())
383 
384       }//if(null != nextStates)
385     }
386     return lambdaClosure;
387   } // lambdaClosure
388 
  /** Converts the FSM from a non-deterministic to a deterministic one by
   * eliminating all the unrestricted transitions. This is the classical
   * subset construction: every state of the deterministic machine
   * corresponds to a set of states of the non-deterministic one.
   * Fills {@link #newStates} (mapping sets of FSMState to their DFSMState)
   * and sets {@link #dInitialState}.
   */
  void eliminateVoidTransitions() throws TokeniserException {

    //kalina:clear() faster than init() which is called with init()
    newStates.clear();
    Set sdStates = new HashSet();
    LinkedList unmarkedDStates = new LinkedList();
    DFSMState dCurrentState = new DFSMState(this);
    Set sdCurrentState = new HashSet();

    //the initial deterministic state is the lambda-closure of the
    //non-deterministic initial state
    sdCurrentState.add(initialState);
    sdCurrentState = lambdaClosure(sdCurrentState);
    newStates.put(sdCurrentState, dCurrentState);
    sdStates.add(sdCurrentState);

    //find out if the new state is a final one
    Iterator innerStatesIter = sdCurrentState.iterator();
    String rhs;
    FSMState currentInnerState;
    Set rhsClashSet = new HashSet();
    boolean newRhs = false;

    while(innerStatesIter.hasNext()){
      currentInnerState = (FSMState)innerStatesIter.next();
      if(currentInnerState.isFinal()){
        rhs = currentInnerState.getRhs();
        rhsClashSet.add(rhs);
        //when several final states clash, the last RHS seen wins
        dCurrentState.rhs = rhs;
        newRhs = true;
      }
    }

    if(rhsClashSet.size() > 1){
      Err.println("Warning, rule clash: " +  rhsClashSet +
                         "\nSelected last definition: " + dCurrentState.rhs);
    }

    if(newRhs)dCurrentState.buildTokenDesc();
    rhsClashSet.clear();
    unmarkedDStates.addFirst(sdCurrentState);
    dInitialState = dCurrentState;
    Set nextSet;

    //process unmarked deterministic states until none is left
    while(!unmarkedDStates.isEmpty()){
      //Out.println("\n\n=====================" + unmarkedDStates.size());
      sdCurrentState = (Set)unmarkedDStates.removeFirst();
      //for every possible input (internal Unicode category id)...
      for(int type = 0; type < maxTypeId; type++){
      //Out.print(type);
        //...compute the set of NFA states reachable on that input
        nextSet = new HashSet();
        innerStatesIter = sdCurrentState.iterator();

        while(innerStatesIter.hasNext()){
          currentInnerState = (FSMState)innerStatesIter.next();
          Set tempSet = currentInnerState.nextSet(type);
          if(null != tempSet) nextSet.addAll(tempSet);
        }//while(innerStatesIter.hasNext())

        if(!nextSet.isEmpty()){
          nextSet = lambdaClosure(nextSet);
          dCurrentState = (DFSMState)newStates.get(nextSet);

          if(dCurrentState == null){

            //we have a new DFSMState
            dCurrentState = new DFSMState(this);
            sdStates.add(nextSet);
            unmarkedDStates.add(nextSet);

            //check to see whether the new state is a final one
            innerStatesIter = nextSet.iterator();
            newRhs =false;

            while(innerStatesIter.hasNext()){
              currentInnerState = (FSMState)innerStatesIter.next();
              if(currentInnerState.isFinal()){
                rhs = currentInnerState.getRhs();
                rhsClashSet.add(rhs);
                //when several final states clash, the last RHS seen wins
                dCurrentState.rhs = rhs;
                newRhs = true;
              }
            }

            if(rhsClashSet.size() > 1){
              Err.println("Warning, rule clash: " +  rhsClashSet +
                            "\nSelected last definition: " + dCurrentState.rhs);
            }

            if(newRhs)dCurrentState.buildTokenDesc();
            rhsClashSet.clear();
            newStates.put(nextSet, dCurrentState);
          }
          //link the current deterministic state to the target one
          ((DFSMState)newStates.get(sdCurrentState)).put(type,dCurrentState);
        } // if(!nextSet.isEmpty())

      } // for(byte type = 0; type < 256; type++)

    } // while(!unmarkedDStates.isEmpty())

  } // eliminateVoidTransitions
490 
491   /** Returns a string representation of the non-deterministic FSM graph using
492    * GML (Graph modelling language).
493    */
494   public String getFSMgml(){
495     String res = "graph[ \ndirected 1\n";
496     ///String nodes = "", edges = "";
497     StringBuffer nodes = new StringBuffer(Gate.STRINGBUFFER_SIZE),
498                  edges = new StringBuffer(Gate.STRINGBUFFER_SIZE);
499 
500     Iterator fsmStatesIter = fsmStates.iterator();
501     while (fsmStatesIter.hasNext()){
502       FSMState currentState = (FSMState)fsmStatesIter.next();
503       int stateIndex = currentState.getIndex();
504       /*nodes += "node[ id " + stateIndex +
505                " label \"" + stateIndex;
506         */
507         nodes.append("node[ id ");
508         nodes.append(stateIndex);
509         nodes.append(" label \"");
510         nodes.append(stateIndex);
511 
512              if(currentState.isFinal()){
513               ///nodes += ",F\\n" + currentState.getRhs();
514               nodes.append(",F\\n" + currentState.getRhs());
515              }
516              ///nodes +=  "\"  ]\n";
517              nodes.append("\"  ]\n");
518       ///edges += currentState.getEdgesGML();
519       edges.append(currentState.getEdgesGML());
520     }
521     res += nodes.toString() + edges.toString() + "]\n";
522     return res;
523   } // getFSMgml
524 
525   /** Returns a string representation of the deterministic FSM graph using
526    * GML.
527    */
528   public String getDFSMgml() {
529     String res = "graph[ \ndirected 1\n";
530     ///String nodes = "", edges = "";
531     StringBuffer nodes = new StringBuffer(Gate.STRINGBUFFER_SIZE),
532                  edges = new StringBuffer(Gate.STRINGBUFFER_SIZE);
533 
534     Iterator dfsmStatesIter = dfsmStates.iterator();
535     while (dfsmStatesIter.hasNext()) {
536       DFSMState currentState = (DFSMState)dfsmStatesIter.next();
537       int stateIndex = currentState.getIndex();
538 /*      nodes += "node[ id " + stateIndex +
539                " label \"" + stateIndex;
540 */
541         nodes.append("node[ id ");
542         nodes.append(stateIndex);
543         nodes.append(" label \"");
544         nodes.append(stateIndex);
545 
546              if(currentState.isFinal()){
547 ///              nodes += ",F\\n" + currentState.getRhs();
548               nodes.append(",F\\n" + currentState.getRhs());
549              }
550 ///             nodes +=  "\"  ]\n";
551              nodes.append("\"  ]\n");
552 ///      edges += currentState.getEdgesGML();
553         edges.append(currentState.getEdgesGML());
554     }
555     res += nodes.toString() + edges.toString() + "]\n";
556     return res;
557   } // getDFSMgml
558 
  /** Returns the features of this resource.
   * @return the current {@link FeatureMap}, possibly null
   */
  public FeatureMap getFeatures(){
    return features;
  } // getFeatures

  /** Sets the features of this resource.
   * @param features the new {@link FeatureMap}
   */
  public void setFeatures(FeatureMap features){
    this.features = features;
  } // setFeatures
569 
570   /**
571    * The method that does the actual tokenisation.
572    */
573   public void execute() throws ExecutionException {
574     interrupted = false;
575     AnnotationSet annotationSet;
576     //check the input
577     if(document == null) {
578       throw new ExecutionException(
579         "No document to tokenise!"
580       );
581     }
582 
583     if(annotationSetName == null ||
584        annotationSetName.equals("")) annotationSet = document.getAnnotations();
585     else annotationSet = document.getAnnotations(annotationSetName);
586 
587     fireStatusChanged(
588         "Tokenising " + document.getSourceUrl().getFile() + "...");
589 
590     String content = document.getContent().toString();
591     int length = content.length();
592     char currentChar;
593 
594     DFSMState graphPosition = dInitialState;
595 
596     //the index of the first character of the token trying to be recognised
597     int tokenStart = 0;
598 
599     //the index of the last character of the last token recognised
600     int lastMatch = -1;
601 
602     DFSMState lastMatchingState = null;
603     DFSMState nextState;
604     String tokenString;
605     int charIdx = 0;
606     int oldCharIdx = 0;
607     FeatureMap newTokenFm;
608 
609     while(charIdx < length){
610       currentChar = content.charAt(charIdx);
611 //      Out.println(
612 //      currentChar + typesMnemonics[Character.getType(currentChar)+128]);
613       nextState = graphPosition.next(((Integer)typeIds.get(
614                   new Integer(Character.getType(currentChar)))).intValue());
615 
616       if( null != nextState ) {
617         graphPosition = nextState;
618         if(graphPosition.isFinal()) {
619           lastMatch = charIdx;
620           lastMatchingState = graphPosition;
621         }
622         charIdx ++;
623       } else {//we have a match!
624         newTokenFm = Factory.newFeatureMap();
625 
626         if (null == lastMatchingState) {
627           tokenString = content.substring(tokenStart, tokenStart +1);
628           newTokenFm.put("type","UNKNOWN");
629           newTokenFm.put("string", tokenString);
630           newTokenFm.put("length", Integer.toString(tokenString.length()));
631 
632           try {
633             annotationSet.add(new Long(tokenStart),
634                               new Long(tokenStart + 1),
635                               "DEFAULT_TOKEN", newTokenFm);
636           } catch (InvalidOffsetException ioe) {
637             //This REALLY shouldn't happen!
638             ioe.printStackTrace(Err.getPrintWriter());
639           }
640           // Out.println("Default token: " + tokenStart +
641           //             "->" + tokenStart + " :" + tokenString + ";");
642           charIdx  = tokenStart + 1;
643         } else {
644           tokenString = content.substring(tokenStart, lastMatch + 1);
645           newTokenFm.put("string", tokenString);
646           newTokenFm.put("length", Integer.toString(tokenString.length()));
647 
648           for(int i = 1; i < lastMatchingState.getTokenDesc().length; i++){
649             newTokenFm.put(lastMatchingState.getTokenDesc()[i][0],
650                            lastMatchingState.getTokenDesc()[i][1]);
651           //Out.println(lastMatchingState.getTokenDesc()[i][0] + "=" +
652           //                       lastMatchingState.getTokenDesc()[i][1]);
653           }
654 
655 
656           try {
657             annotationSet.add(new Long(tokenStart),
658                             new Long(lastMatch + 1),
659                             lastMatchingState.getTokenDesc()[0][0], newTokenFm);
660           } catch(InvalidOffsetException ioe) {
661             //This REALLY shouldn't happen!
662             throw new GateRuntimeException(ioe.toString());
663           }
664 
665           // Out.println(lastMatchingState.getTokenDesc()[0][0] +
666           //              ": " + tokenStart + "->" + lastMatch +
667           //              " :" + tokenString + ";");
668           charIdx = lastMatch + 1;
669         }
670 
671         lastMatchingState = null;
672         graphPosition = dInitialState;
673         tokenStart = charIdx;
674       }
675 
676       if((charIdx - oldCharIdx > 256)){
677         fireProgressChanged((100 * charIdx )/ length );
678         oldCharIdx = charIdx;
679         if(isInterrupted()) throw new ExecutionInterruptedException();
680       }
681 
682     } // while(charIdx < length)
683 
684     if (null != lastMatchingState) {
685       tokenString = content.substring(tokenStart, lastMatch + 1);
686       newTokenFm = Factory.newFeatureMap();
687       newTokenFm.put("string", tokenString);
688       newTokenFm.put("length", Integer.toString(tokenString.length()));
689 
690       for(int i = 1; i < lastMatchingState.getTokenDesc().length; i++){
691         newTokenFm.put(lastMatchingState.getTokenDesc()[i][0],
692                        lastMatchingState.getTokenDesc()[i][1]);
693       }
694 
695 
696       try {
697         annotationSet.add(new Long(tokenStart),
698                           new Long(lastMatch + 1),
699                           lastMatchingState.getTokenDesc()[0][0], newTokenFm);
700       } catch(InvalidOffsetException ioe) {
701         //This REALLY shouldn't happen!
702         throw new GateRuntimeException(ioe.toString());
703       }
704 
705     }
706 
707     reset();
708     fireProcessFinished();
709     fireStatusChanged("Tokenisation complete!");
710   } // run
711 
712   /**
713    * Sets the value of the <code>rulesURL</code> property which holds an URL
714    * to the file containing the rules for this tokeniser.
715    * @param newRulesURL
716    */
717   public void setRulesURL(java.net.URL newRulesURL) {
718     rulesURL = newRulesURL;
719   }
720   /**
721    * Gets the value of the <code>rulesURL</code> property hich holds an
722    * URL to the file containing the rules for this tokeniser.
723    */
724   public java.net.URL getRulesURL() {
725     return rulesURL;
726   }
727   /**    */
728   public void setAnnotationSetName(String newAnnotationSetName) {
729     annotationSetName = newAnnotationSetName;
730   }
731   /**    */
732   public String getAnnotationSetName() {
733     return annotationSetName;
734   }
735   public void setRulesResourceName(String newRulesResourceName) {
736     rulesResourceName = newRulesResourceName;
737   }
738   public String getRulesResourceName() {
739     return rulesResourceName;
740   }
741   public void setEncoding(String newEncoding) {
742     encoding = newEncoding;
743   }
744   public String getEncoding() {
745     return encoding;
746   }
747 
748   /**    */
749   protected FeatureMap features  = null;
750 
751   /** the annotations et where the new annotations will be adde
752    */
753   protected String annotationSetName;
754 
755   /** The initial state of the non deterministic machin
756    */
757   protected FSMState initialState;
758 
759   /** A set containng all the states of the non deterministic machin
760    */
761   protected Set fsmStates = new HashSet();
762 
763   /** The initial state of the deterministic machin
764    */
765   protected DFSMState dInitialState;
766 
767   /** A set containng all the states of the deterministic machin
768    */
769   protected Set dfsmStates = new HashSet();
770 
771   /** The separator from LHS to RH
772    */
773   static String LHStoRHS = ">";
774 
775   /** A set of string representing tokens to be ignored (e.g. blanks
776    */
777   static Set ignoreTokens;
778 
779   /** maps from int (the static value on {@link java.lang.Character} to int
780    * the internal value used by the tokeniser. The ins values used by the
781    * tokeniser are consecutive values, starting from 0 and going as high as
782    * necessary.
783    * They map all the public static int members on{@link java.lang.Character}
784    */
785   public static Map typeIds;
786 
787   /** The maximum int value used internally as a type i
788    */
789   public static int maxTypeId;
790 
791   /** Maps the internal type ids to the type name
792    */
793   public static String[] typeMnemonics;
794 
795   /** Maps from type names to type internal id
796    */
797   public static Map stringTypeIds;
798 
799   /**
800    * This property holds an URL to the file containing the rules for this tokeniser
801    *
802    */
803 
804   /**    */
805   static protected String defaultResourceName =
806                             "creole/tokeniser/DefaultTokeniser.rules";
807 
808   private String rulesResourceName;
809   private java.net.URL rulesURL;
810   private String encoding;
811   private transient Vector progressListeners;
812   //kalina: added this as method to minimise too many init() calls
813   protected transient Map newStates = new HashMap();
814 
815 
816   /** The static initialiser will inspect the class {@link java.lang.Character}
817     * using reflection to find all the public static members and will map them
818     * to ids starting from 0.
819     * After that it will build all the static data: {@link #typeIds}, {@link
820     * #maxTypeId}, {@link #typeMnemonics}, {@link #stringTypeIds}
821     */
822   static{
823     Field[] characterClassFields;
824 
825     try{
826       characterClassFields = Class.forName("java.lang.Character").getFields();
827     }catch(ClassNotFoundException cnfe){
828       throw new LuckyException("Could not find the java.lang.Character class!");
829     }
830 
831     Collection staticFields = new LinkedList();
832 
833     for(int i = 0; i< characterClassFields.length; i++)
834       if(Modifier.isStatic(characterClassFields[i].getModifiers()))
835          staticFields.add(characterClassFields[i]);
836 
837     typeIds = new HashMap();
838     maxTypeId = staticFields.size() -1;
839     typeMnemonics = new String[maxTypeId + 1];
840     stringTypeIds = new HashMap();
841 
842     Iterator staticFieldsIter = staticFields.iterator();
843     Field currentField;
844     int currentId = 0;
845     String fieldName;
846 
847     try {
848       while(staticFieldsIter.hasNext()){
849         currentField = (Field)staticFieldsIter.next();
850         if(currentField.getType().toString().equals("byte")){
851           fieldName = currentField.getName();
852           typeIds.put(new Integer(currentField.getInt(null)),
853                                     new Integer(currentId));
854           typeMnemonics[currentId] = fieldName;
855           stringTypeIds.put(fieldName, new Integer(currentId));
856           currentId++;
857         }
858       }
859     } catch(Exception e) {
860       throw new LuckyException(e.toString());
861     }
862 
863     ignoreTokens = new HashSet();
864     ignoreTokens.add(" ");
865     ignoreTokens.add("\t");
866     ignoreTokens.add("\f");
867 
868   }
869 
870 } // class DefaultTokeniser