1   /*
2    *  DefaultTokeniser.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Valentin Tablan, 2000
12   *
13   *  $Id: SimpleTokeniser.java,v 1.13 2002/03/06 17:15:45 kalina Exp $
14   */
15  
16  package gate.creole.tokeniser;
17  
18  import java.util.*;
19  import java.io.*;
20  import java.net.*;
21  import java.lang.reflect.*;
22  
23  import gate.*;
24  import gate.creole.*;
25  import gate.event.*;
26  import gate.util.*;
27  
28  //import EDU.auburn.VGJ.graph.ParseError;
29  
30  /** Implementation of a Unicode rule based tokeniser.
31   * The tokeniser gets its rules from a file an {@link java.io.InputStream
32   * InputStream} or a {@link java.io.Reader Reader} which should be sent to one
33   * of the constructors.
34   * The implementations is based on a finite state machine that is built based
35   * on the set of rules.
 * A rule has two sides, the left hand side (LHS) and the right hand side (RHS)
 * that are separated by the ">" character. The LHS represents a
38   * regular expression that will be matched against the input while the RHS
39   * describes a Gate2 annotation in terms of annotation type and attribute-value
40   * pairs.
 * The matching is done using Unicode enumerated types as defined by the {@link
 * java.lang.Character Character} class. At the time of writing this class the
 * supported Unicode categories were:
44   * <ul>
45   * <li>UNASSIGNED
46   * <li>UPPERCASE_LETTER
47   * <li>LOWERCASE_LETTER
48   * <li>TITLECASE_LETTER
49   * <li>MODIFIER_LETTER
50   * <li>OTHER_LETTER
51   * <li>NON_SPACING_MARK
52   * <li>ENCLOSING_MARK
53   * <li>COMBINING_SPACING_MARK
54   * <li>DECIMAL_DIGIT_NUMBER
55   * <li>LETTER_NUMBER
56   * <li>OTHER_NUMBER
57   * <li>SPACE_SEPARATOR
58   * <li>LINE_SEPARATOR
59   * <li>PARAGRAPH_SEPARATOR
60   * <li>CONTROL
61   * <li>FORMAT
62   * <li>PRIVATE_USE
63   * <li>SURROGATE
64   * <li>DASH_PUNCTUATION
65   * <li>START_PUNCTUATION
66   * <li>END_PUNCTUATION
67   * <li>CONNECTOR_PUNCTUATION
68   * <li>OTHER_PUNCTUATION
69   * <li>MATH_SYMBOL
70   * <li>CURRENCY_SYMBOL
71   * <li>MODIFIER_SYMBOL
72   * <li>OTHER_SYMBOL
73   * </ul>
 * The accepted operators for the LHS are "+", "*" and "|" having the usual
 * interpretations of "1 to n occurrences", "0 to n occurrences" and
 * "boolean OR".
77   * For instance this is a valid LHS:
78   * <br>"UPPERCASE_LETTER" "LOWERCASE_LETTER"+
79   * <br>meaning an uppercase letter followed by one or more lowercase letters.
80   *
81   * The RHS describes an annotation that is to be created and inserted in the
82   * annotation set provided in case of a match. The new annotation will span the
83   * text that has been recognised. The RHS consists in the annotation type
84   * followed by pairs of attributes and associated values.
85   * E.g. for the LHS above a possible RHS can be:<br>
86   * Token;kind=upperInitial;<br>
87   * representing an annotation of type &quot;Token&quot; having one attribute
88   * named &quot;kind&quot; with the value &quot;upperInitial&quot;<br>
 * The entire rule will be:<br>
90   * <pre>"UPPERCASE_LETTER" "LOWERCASE_LETTER"+ > Token;kind=upperInitial;</pre>
91   * <br>
92   * The tokeniser ignores all the empty lines or the ones that start with # or
93   * //.
94   *
95   */
public class SimpleTokeniser extends AbstractLanguageAnalyser{
  /** The name of the "document" runtime parameter */
  public static final String
    SIMP_TOK_DOCUMENT_PARAMETER_NAME = "document";

  /** The name of the "annotationSetName" runtime parameter */
  public static final String
    SIMP_TOK_ANNOT_SET_PARAMETER_NAME = "annotationSetName";

  /** The name of the "rulesURL" init-time parameter */
  public static final String
    SIMP_TOK_RULES_URL_PARAMETER_NAME = "rulesURL";

  /** The name of the "encoding" init-time parameter */
  public static final String
    SIMP_TOK_ENCODING_PARAMETER_NAME = "encoding";

  /** Debug flag
   */
  private static final boolean DEBUG = false;

  /**
   * Creates a tokeniser
   */
  public SimpleTokeniser(){
  }
118 
119   /**
120    * Initialises this tokeniser by reading the rules from an external source (provided through an URL) and building
121    * the finite state machine at the core of the tokeniser.
122    *
123    * @exception ResourceInstantiationException
124    */
125   public Resource init() throws ResourceInstantiationException{
126     Reader rulesReader;
127     try{
128       if(rulesURL != null){
129         rulesReader = new InputStreamReader(rulesURL.openStream(), encoding);
130       }else{
131         //no init data, Scream!
132         throw new ResourceInstantiationException(
133           "No URL provided for the rules!");
134       }
135       initialState = new FSMState(this);
136       BufferedReader bRulesReader = new BufferedReader(rulesReader);
137       String line = bRulesReader.readLine();
138       ///String toParse = "";
139       StringBuffer toParse = new StringBuffer(Gate.STRINGBUFFER_SIZE);
140 
141       while (line != null){
142         if(line.endsWith("\\")){
143           ///toParse += line.substring(0,line.length()-1);
144           toParse.append(line.substring(0,line.length()-1));
145         }else{
146           /*toParse += line;
147           parseRule(toParse);
148           toParse = "";
149           */
150           toParse.append(line);
151           parseRule(toParse.toString());
152           toParse.delete(0,toParse.length());
153         }
154         line = bRulesReader.readLine();
155       }
156       eliminateVoidTransitions();
157     }catch(java.io.IOException ioe){
158       throw new ResourceInstantiationException(ioe);
159     }catch(TokeniserException te){
160       throw new ResourceInstantiationException(te);
161     }
162     return this;
163   }
164 
165   /**
166    * Prepares this Processing resource for a new run.
167    */
168   public void reset(){
169     document = null;
170     annotationSetName = null;
171   }
172 
173   /** Parses one input line containing a tokeniser rule.
174    * This will create the necessary FSMState objects and the links
175    * between them.
176    *
177    * @param line the string containing the rule
178    */
179   void parseRule(String line)throws TokeniserException{
180     //ignore comments
181     if(line.startsWith("#")) return;
182 
183     if(line.startsWith("//")) return;
184 
185     StringTokenizer st = new StringTokenizer(line, "()+*|\" \t\f>", true);
186     FSMState newState = new FSMState(this);
187 
188     initialState.put(null, newState);
189     FSMState finalState = parseLHS(newState, st, LHStoRHS);
190     String rhs = "";
191 
192     if(st.hasMoreTokens()) rhs = st.nextToken("\f");
193 
194     if(rhs.length() > 0)finalState.setRhs(rhs);
195   } // parseRule
196 
  /** Parses a part or the entire LHS.
   *
   * @param startState a FSMState object representing the initial state for
   *     the small FSM that will recognise the (part of) the rule parsed by this
   *     method.
   * @param st a {@link java.util.StringTokenizer StringTokenizer} that
   *     provides the input
   * @param until the string that marks the end of the section to be
   *     recognised. This method will first be called by {@link
   *     #parseRule(String)} with &quot;&gt;&quot; in order to parse the entire
   *     LHS. When necessary it will make itself another call to {@link #parseLHS
   *     parseLHS} to parse a region of the LHS (e.g. a
   *     &quot;(&quot;,&quot;)&quot; enclosed part.
   * @return the final state of the sub-automaton built for this (part of the)
   *     LHS
   * @throws TokeniserException if the rule ends prematurely or names an
   *     unknown Unicode category
   */
  FSMState parseLHS(FSMState startState, StringTokenizer st, String until)
       throws TokeniserException{

    FSMState currentState = startState;
    //true while we are inside a chain of "|" alternatives
    boolean orFound = false;
    //collects the end states of each alternative in a "|" chain
    List orList = new LinkedList();
    String token;
    token = skipIgnoreTokens(st);

    if(null == token) return currentState;

    FSMState newState;
    Integer typeId;
    UnicodeType uType;

    //each iteration consumes one atom (group, quoted type or bare type)
    //plus any operator ("|", "+", "*") that follows it
    bigwhile: while(!token.equals(until)){
      if(token.equals("(")){//(..) - a parenthesised group, parsed recursively
        newState = parseLHS(currentState, st,")");
      } else if(token.equals("\"")){//"unicode_type" - a quoted category name
        String sType = parseQuotedString(st, "\"");
        newState = new FSMState(this);
        //map the category name to its internal id (see stringTypeIds)
        typeId = (Integer)stringTypeIds.get(sType);

        if(null == typeId)
          throw new InvalidRuleException("Invalid type: \"" + sType + "\"");
        else uType = new UnicodeType(typeId.intValue());

        currentState.put(uType ,newState);
      } else {// a type with no quotes - same handling as the quoted case
        String sType = token;
        newState = new FSMState(this);
        typeId = (Integer)stringTypeIds.get(sType);

        if(null == typeId)
          throw new InvalidRuleException("Invalid type: \"" + sType + "\"");
        else uType = new UnicodeType(typeId.intValue());

        currentState.put(uType ,newState);
      }
      //treat the operators
      token = skipIgnoreTokens(st);
      if(null == token) throw
        new InvalidRuleException("Tokeniser rule ended too soon!");

      if(token.equals("|")) {

        //remember this alternative's end state and keep reading alternatives
        orFound = true;
        orList.add(newState);
        token = skipIgnoreTokens(st);
        if(null == token) throw
          new InvalidRuleException("Tokeniser rule ended too soon!");

        continue bigwhile;
      } else if(orFound) {//done parsing the "|"
        //join all the alternatives' end states into a single new state
        //through lambda transitions
        orFound = false;
        orList.add(newState);
        newState = new FSMState(this);
        Iterator orListIter = orList.iterator();

        while(orListIter.hasNext())
          ((FSMState)orListIter.next()).put(null, newState);
        orList.clear();
      }

      if(token.equals("+")) {

        //"1 to n": loop back from the atom's end to its start, then move on
        newState.put(null,currentState);
        currentState = newState;
        newState = new FSMState(this);
        currentState.put(null,newState);
        token = skipIgnoreTokens(st);

        if(null == token) throw
          new InvalidRuleException("Tokeniser rule ended too soon!");
      } else if(token.equals("*")) {

        //"0 to n": like "+" but also allow skipping the atom entirely
        currentState.put(null,newState);
        newState.put(null,currentState);
        currentState = newState;
        newState = new FSMState(this);
        currentState.put(null,newState);
        token = skipIgnoreTokens(st);

        if(null == token) throw
          new InvalidRuleException("Tokeniser rule ended too soon!");
      }
      currentState = newState;
    }
    return currentState;
  } // parseLHS
301 
302   /** Parses from the given string tokeniser until it finds a specific
303    * delimiter.
304    * One use for this method is to read everything until the first quote.
305    *
306    * @param st a {@link java.util.StringTokenizer StringTokenizer} that
307    *     provides the input
308    * @param until a String representing the end delimiter.
309    */
310   String parseQuotedString(StringTokenizer st, String until)
311     throws TokeniserException {
312 
313     String token;
314 
315     if(st.hasMoreElements()) token = st.nextToken();
316     else return null;
317 
318     ///String type = "";
319     StringBuffer type = new StringBuffer(Gate.STRINGBUFFER_SIZE);
320 
321     while(!token.equals(until)){
322       //type += token;
323       type.append(token);
324       if(st.hasMoreElements())token = st.nextToken();
325       else throw new InvalidRuleException("Tokeniser rule ended too soon!");
326     }
327     return type.toString();
328   } // parseQuotedString
329 
330   /** Skips the ignorable tokens from the input returning the first significant
331    * token.
332    * The ignorable tokens are defined by {@link #ignoreTokens a set}
333    */
334   protected static String skipIgnoreTokens(StringTokenizer st){
335     Iterator ignorables;
336     boolean ignorableFound = false;
337     String currentToken;
338 
339     while(true){
340       if(st.hasMoreTokens()){
341         currentToken = st.nextToken();
342         ignorables = ignoreTokens.iterator();
343         ignorableFound = false;
344 
345         while(!ignorableFound && ignorables.hasNext()){
346           if(currentToken.equals((String)ignorables.next()))
347             ignorableFound = true;
348         }
349 
350         if(!ignorableFound) return currentToken;
351       } else return null;
352     }
353   }//skipIgnoreTokens
354 
355   /* Computes the lambda-closure (aka epsilon closure) of the given set of
356    * states, that is the set of states that are accessible from any of the
357    * states in the given set using only unrestricted transitions.
358    * @return a set containing all the states accessible from this state via
359    * transitions that bear no restrictions.
360    */
361   /**
362    * Converts the finite state machine to a deterministic one.
363    *
364    * @param s
365    */
366   private AbstractSet lambdaClosure(Set s){
367 
368     //the stack/queue used by the algorithm
369     LinkedList list = new LinkedList(s);
370 
371     //the set to be returned
372     AbstractSet lambdaClosure = new HashSet(s);
373 
374     FSMState top;
375     FSMState currentState;
376     Set nextStates;
377     Iterator statesIter;
378 
379     while(!list.isEmpty()) {
380       top = (FSMState)list.removeFirst();
381       nextStates = top.nextSet(null);
382 
383       if(null != nextStates){
384         statesIter = nextStates.iterator();
385 
386         while(statesIter.hasNext()) {
387           currentState = (FSMState)statesIter.next();
388           if(!lambdaClosure.contains(currentState)){
389             lambdaClosure.add(currentState);
390             list.addFirst(currentState);
391           }//if(!lambdaClosure.contains(currentState))
392         }//while(statesIter.hasNext())
393 
394       }//if(null != nextStates)
395     }
396     return lambdaClosure;
397   } // lambdaClosure
398 
  /** Converts the FSM from a non-deterministic to a deterministic one by
   * eliminating all the unrestricted transitions.
   * This is the classic subset construction: each deterministic state
   * ({@link DFSMState}) corresponds to a lambda-closed set of
   * non-deterministic {@link FSMState}s. The result is stored in
   * {@link #dInitialState} / the {@link #newStates} map.
   *
   * @throws TokeniserException declared for the callees; not thrown directly
   *     here
   */
  void eliminateVoidTransitions() throws TokeniserException {

    //kalina:clear() faster than init() which is called with init()
    newStates.clear();
    //the sets of NFA states discovered so far
    Set sdStates = new HashSet();
    //sets of NFA states whose outgoing transitions are not yet processed
    LinkedList unmarkedDStates = new LinkedList();
    DFSMState dCurrentState = new DFSMState(this);
    Set sdCurrentState = new HashSet();

    //seed with the lambda-closure of the NFA initial state
    sdCurrentState.add(initialState);
    sdCurrentState = lambdaClosure(sdCurrentState);
    newStates.put(sdCurrentState, dCurrentState);
    sdStates.add(sdCurrentState);

    //find out if the new state is a final one: any final NFA state inside
    //the set makes the DFA state final and donates its RHS
    Iterator innerStatesIter = sdCurrentState.iterator();
    String rhs;
    FSMState currentInnerState;
    Set rhsClashSet = new HashSet();
    boolean newRhs = false;

    while(innerStatesIter.hasNext()){
      currentInnerState = (FSMState)innerStatesIter.next();
      if(currentInnerState.isFinal()){
        rhs = currentInnerState.getRhs();
        rhsClashSet.add(rhs);
        //when several rules match, the last RHS seen wins (warned below)
        dCurrentState.rhs = rhs;
        newRhs = true;
      }
    }

    if(rhsClashSet.size() > 1){
      Err.println("Warning, rule clash: " +  rhsClashSet +
                         "\nSelected last definition: " + dCurrentState.rhs);
    }

    if(newRhs)dCurrentState.buildTokenDesc();
    rhsClashSet.clear();
    unmarkedDStates.addFirst(sdCurrentState);
    dInitialState = dCurrentState;
    Set nextSet;

    while(!unmarkedDStates.isEmpty()){
      //Out.println("\n\n=====================" + unmarkedDStates.size());
      sdCurrentState = (Set)unmarkedDStates.removeFirst();
      //NOTE(review): this loop stops at type < maxTypeId, so the highest
      //internal type id is never expanded; harmless only if no category maps
      //to maxTypeId itself -- TODO confirm against the static initialiser
      for(int type = 0; type < maxTypeId; type++){
      //Out.print(type);
        //the move set: every NFA state reachable on this input type
        nextSet = new HashSet();
        innerStatesIter = sdCurrentState.iterator();

        while(innerStatesIter.hasNext()){
          currentInnerState = (FSMState)innerStatesIter.next();
          Set tempSet = currentInnerState.nextSet(type);
          if(null != tempSet) nextSet.addAll(tempSet);
        }//while(innerStatesIter.hasNext())

        if(!nextSet.isEmpty()){
          nextSet = lambdaClosure(nextSet);
          dCurrentState = (DFSMState)newStates.get(nextSet);

          if(dCurrentState == null){

            //we have a new DFSMState
            dCurrentState = new DFSMState(this);
            sdStates.add(nextSet);
            unmarkedDStates.add(nextSet);

            //check to see whether the new state is a final one
            innerStatesIter = nextSet.iterator();
            newRhs =false;

            while(innerStatesIter.hasNext()){
              currentInnerState = (FSMState)innerStatesIter.next();
              if(currentInnerState.isFinal()){
                rhs = currentInnerState.getRhs();
                rhsClashSet.add(rhs);
                dCurrentState.rhs = rhs;
                newRhs = true;
              }
            }

            if(rhsClashSet.size() > 1){
              Err.println("Warning, rule clash: " +  rhsClashSet +
                            "\nSelected last definition: " + dCurrentState.rhs);
            }

            if(newRhs)dCurrentState.buildTokenDesc();
            rhsClashSet.clear();
            newStates.put(nextSet, dCurrentState);
          }
          //register the DFA transition on this input type
          ((DFSMState)newStates.get(sdCurrentState)).put(type,dCurrentState);
        } // if(!nextSet.isEmpty())

      } // for(int type = 0; type < maxTypeId; type++)

    } // while(!unmarkedDStates.isEmpty())

  } // eliminateVoidTransitions
500 
501   /** Returns a string representation of the non-deterministic FSM graph using
502    * GML (Graph modelling language).
503    */
504   public String getFSMgml(){
505     String res = "graph[ \ndirected 1\n";
506     ///String nodes = "", edges = "";
507     StringBuffer nodes = new StringBuffer(Gate.STRINGBUFFER_SIZE),
508                  edges = new StringBuffer(Gate.STRINGBUFFER_SIZE);
509 
510     Iterator fsmStatesIter = fsmStates.iterator();
511     while (fsmStatesIter.hasNext()){
512       FSMState currentState = (FSMState)fsmStatesIter.next();
513       int stateIndex = currentState.getIndex();
514       /*nodes += "node[ id " + stateIndex +
515                " label \"" + stateIndex;
516         */
517         nodes.append("node[ id ");
518         nodes.append(stateIndex);
519         nodes.append(" label \"");
520         nodes.append(stateIndex);
521 
522              if(currentState.isFinal()){
523               ///nodes += ",F\\n" + currentState.getRhs();
524               nodes.append(",F\\n" + currentState.getRhs());
525              }
526              ///nodes +=  "\"  ]\n";
527              nodes.append("\"  ]\n");
528       ///edges += currentState.getEdgesGML();
529       edges.append(currentState.getEdgesGML());
530     }
531     res += nodes.toString() + edges.toString() + "]\n";
532     return res;
533   } // getFSMgml
534 
535   /** Returns a string representation of the deterministic FSM graph using
536    * GML.
537    */
538   public String getDFSMgml() {
539     String res = "graph[ \ndirected 1\n";
540     ///String nodes = "", edges = "";
541     StringBuffer nodes = new StringBuffer(Gate.STRINGBUFFER_SIZE),
542                  edges = new StringBuffer(Gate.STRINGBUFFER_SIZE);
543 
544     Iterator dfsmStatesIter = dfsmStates.iterator();
545     while (dfsmStatesIter.hasNext()) {
546       DFSMState currentState = (DFSMState)dfsmStatesIter.next();
547       int stateIndex = currentState.getIndex();
548 /*      nodes += "node[ id " + stateIndex +
549                " label \"" + stateIndex;
550 */
551         nodes.append("node[ id ");
552         nodes.append(stateIndex);
553         nodes.append(" label \"");
554         nodes.append(stateIndex);
555 
556              if(currentState.isFinal()){
557 ///              nodes += ",F\\n" + currentState.getRhs();
558               nodes.append(",F\\n" + currentState.getRhs());
559              }
560 ///             nodes +=  "\"  ]\n";
561              nodes.append("\"  ]\n");
562 ///      edges += currentState.getEdgesGML();
563         edges.append(currentState.getEdgesGML());
564     }
565     res += nodes.toString() + edges.toString() + "]\n";
566     return res;
567   } // getDFSMgml
568 
  //no doc required: javadoc will copy it from the interface
  /** Gets the features of this resource */
  public FeatureMap getFeatures(){
    return features;
  } // getFeatures

  /** Sets the features of this resource */
  public void setFeatures(FeatureMap features){
    this.features = features;
  } // setFeatures
579 
580   /**
581    * The method that does the actual tokenisation.
582    */
583   public void execute() throws ExecutionException {
584     interrupted = false;
585     AnnotationSet annotationSet;
586     //check the input
587     if(document == null) {
588       throw new ExecutionException(
589         "No document to tokenise!"
590       );
591     }
592 
593     if(annotationSetName == null ||
594        annotationSetName.equals("")) annotationSet = document.getAnnotations();
595     else annotationSet = document.getAnnotations(annotationSetName);
596 
597     fireStatusChanged(
598         "Tokenising " + document.getName() + "...");
599 
600     String content = document.getContent().toString();
601     int length = content.length();
602     char currentChar;
603 
604     DFSMState graphPosition = dInitialState;
605 
606     //the index of the first character of the token trying to be recognised
607     int tokenStart = 0;
608 
609     //the index of the last character of the last token recognised
610     int lastMatch = -1;
611 
612     DFSMState lastMatchingState = null;
613     DFSMState nextState;
614     String tokenString;
615     int charIdx = 0;
616     int oldCharIdx = 0;
617     FeatureMap newTokenFm;
618 
619     while(charIdx < length){
620       currentChar = content.charAt(charIdx);
621 //      Out.println(
622 //      currentChar + typesMnemonics[Character.getType(currentChar)+128]);
623       nextState = graphPosition.next(((Integer)typeIds.get(
624                   new Integer(Character.getType(currentChar)))).intValue());
625 
626       if( null != nextState ) {
627         graphPosition = nextState;
628         if(graphPosition.isFinal()) {
629           lastMatch = charIdx;
630           lastMatchingState = graphPosition;
631         }
632         charIdx ++;
633       } else {//we have a match!
634         newTokenFm = Factory.newFeatureMap();
635 
636         if (null == lastMatchingState) {
637           tokenString = content.substring(tokenStart, tokenStart +1);
638           newTokenFm.put("type","UNKNOWN");
639           newTokenFm.put(TOKEN_STRING_FEATURE_NAME, tokenString);
640           newTokenFm.put(TOKEN_LENGTH_FEATURE_NAME,
641                          Integer.toString(tokenString.length()));
642 
643           try {
644             annotationSet.add(new Long(tokenStart),
645                               new Long(tokenStart + 1),
646                               "DEFAULT_TOKEN", newTokenFm);
647           } catch (InvalidOffsetException ioe) {
648             //This REALLY shouldn't happen!
649             ioe.printStackTrace(Err.getPrintWriter());
650           }
651           // Out.println("Default token: " + tokenStart +
652           //             "->" + tokenStart + " :" + tokenString + ";");
653           charIdx  = tokenStart + 1;
654         } else {
655           tokenString = content.substring(tokenStart, lastMatch + 1);
656           newTokenFm.put(TOKEN_STRING_FEATURE_NAME, tokenString);
657           newTokenFm.put(TOKEN_LENGTH_FEATURE_NAME,
658                          Integer.toString(tokenString.length()));
659 
660           for(int i = 1; i < lastMatchingState.getTokenDesc().length; i++){
661             newTokenFm.put(lastMatchingState.getTokenDesc()[i][0],
662                            lastMatchingState.getTokenDesc()[i][1]);
663           //Out.println(lastMatchingState.getTokenDesc()[i][0] + "=" +
664           //                       lastMatchingState.getTokenDesc()[i][1]);
665           }
666 
667 
668           try {
669             annotationSet.add(new Long(tokenStart),
670                             new Long(lastMatch + 1),
671                             lastMatchingState.getTokenDesc()[0][0], newTokenFm);
672           } catch(InvalidOffsetException ioe) {
673             //This REALLY shouldn't happen!
674             throw new GateRuntimeException(ioe.toString());
675           }
676 
677           // Out.println(lastMatchingState.getTokenDesc()[0][0] +
678           //              ": " + tokenStart + "->" + lastMatch +
679           //              " :" + tokenString + ";");
680           charIdx = lastMatch + 1;
681         }
682 
683         lastMatchingState = null;
684         graphPosition = dInitialState;
685         tokenStart = charIdx;
686       }
687 
688       if((charIdx - oldCharIdx > 256)){
689         fireProgressChanged((100 * charIdx )/ length );
690         oldCharIdx = charIdx;
691         if(isInterrupted()) throw new ExecutionInterruptedException();
692       }
693 
694     } // while(charIdx < length)
695 
696     if (null != lastMatchingState) {
697       tokenString = content.substring(tokenStart, lastMatch + 1);
698       newTokenFm = Factory.newFeatureMap();
699       newTokenFm.put(TOKEN_STRING_FEATURE_NAME, tokenString);
700       newTokenFm.put(TOKEN_LENGTH_FEATURE_NAME,
701                      Integer.toString(tokenString.length()));
702 
703       for(int i = 1; i < lastMatchingState.getTokenDesc().length; i++){
704         newTokenFm.put(lastMatchingState.getTokenDesc()[i][0],
705                        lastMatchingState.getTokenDesc()[i][1]);
706       }
707 
708 
709       try {
710         annotationSet.add(new Long(tokenStart),
711                           new Long(lastMatch + 1),
712                           lastMatchingState.getTokenDesc()[0][0], newTokenFm);
713       } catch(InvalidOffsetException ioe) {
714         //This REALLY shouldn't happen!
715         throw new GateRuntimeException(ioe.toString());
716       }
717 
718     }
719 
720     reset();
721     fireProcessFinished();
722     fireStatusChanged("Tokenisation complete!");
723   } // run
724 
725   /**
726    * Sets the value of the <code>rulesURL</code> property which holds an URL
727    * to the file containing the rules for this tokeniser.
728    * @param newRulesURL
729    */
730   public void setRulesURL(java.net.URL newRulesURL) {
731     rulesURL = newRulesURL;
732   }
733   /**
734    * Gets the value of the <code>rulesURL</code> property hich holds an
735    * URL to the file containing the rules for this tokeniser.
736    */
737   public java.net.URL getRulesURL() {
738     return rulesURL;
739   }
740   /**    */
741   public void setAnnotationSetName(String newAnnotationSetName) {
742     annotationSetName = newAnnotationSetName;
743   }
744   /**    */
745   public String getAnnotationSetName() {
746     return annotationSetName;
747   }
748   public void setRulesResourceName(String newRulesResourceName) {
749     rulesResourceName = newRulesResourceName;
750   }
751   public String getRulesResourceName() {
752     return rulesResourceName;
753   }
754   public void setEncoding(String newEncoding) {
755     encoding = newEncoding;
756   }
757   public String getEncoding() {
758     return encoding;
759   }
760 
761   /**    */
762   protected FeatureMap features  = null;
763 
764   /** the annotations et where the new annotations will be adde
765    */
766   protected String annotationSetName;
767 
768   /** The initial state of the non deterministic machin
769    */
770   protected FSMState initialState;
771 
772   /** A set containng all the states of the non deterministic machin
773    */
774   protected Set fsmStates = new HashSet();
775 
776   /** The initial state of the deterministic machin
777    */
778   protected DFSMState dInitialState;
779 
780   /** A set containng all the states of the deterministic machin
781    */
782   protected Set dfsmStates = new HashSet();
783 
784   /** The separator from LHS to RH
785    */
786   static String LHStoRHS = ">";
787 
788   /** A set of string representing tokens to be ignored (e.g. blanks
789    */
790   static Set ignoreTokens;
791 
792   /** maps from int (the static value on {@link java.lang.Character} to int
793    * the internal value used by the tokeniser. The ins values used by the
794    * tokeniser are consecutive values, starting from 0 and going as high as
795    * necessary.
796    * They map all the public static int members on{@link java.lang.Character}
797    */
798   public static Map typeIds;
799 
800   /** The maximum int value used internally as a type i
801    */
802   public static int maxTypeId;
803 
804   /** Maps the internal type ids to the type name
805    */
806   public static String[] typeMnemonics;
807 
808   /** Maps from type names to type internal id
809    */
810   public static Map stringTypeIds;
811 
812   /**
813    * This property holds an URL to the file containing the rules for this tokeniser
814    *
815    */
816 
817   /**    */
818   static protected String defaultResourceName =
819                             "creole/tokeniser/DefaultTokeniser.rules";
820 
821   private String rulesResourceName;
822   private java.net.URL rulesURL;
823   private String encoding;
824   private transient Vector progressListeners;
825   //kalina: added this as method to minimise too many init() calls
826   protected transient Map newStates = new HashMap();
827 
828 
829   /** The static initialiser will inspect the class {@link java.lang.Character}
830     * using reflection to find all the public static members and will map them
831     * to ids starting from 0.
832     * After that it will build all the static data: {@link #typeIds}, {@link
833     * #maxTypeId}, {@link #typeMnemonics}, {@link #stringTypeIds}
834     */
835   static{
836     Field[] characterClassFields;
837 
838     try{
839       characterClassFields = Class.forName("java.lang.Character").getFields();
840     }catch(ClassNotFoundException cnfe){
841       throw new LuckyException("Could not find the java.lang.Character class!");
842     }
843 
844     Collection staticFields = new LinkedList();
845     // JDK 1.4 introduced directionality constants that have the same values as
846     //character types; we need to skip those as well
847     for(int i = 0; i< characterClassFields.length; i++)
848       if(Modifier.isStatic(characterClassFields[i].getModifiers()) &&
849          characterClassFields[i].getName().indexOf("DIRECTIONALITY") == -1)
850         staticFields.add(characterClassFields[i]);
851 
852     typeIds = new HashMap();
853     maxTypeId = staticFields.size() -1;
854     typeMnemonics = new String[maxTypeId + 1];
855     stringTypeIds = new HashMap();
856 
857     Iterator staticFieldsIter = staticFields.iterator();
858     Field currentField;
859     int currentId = 0;
860     String fieldName;
861 
862     try {
863       while(staticFieldsIter.hasNext()){
864         currentField = (Field)staticFieldsIter.next();
865         if(currentField.getType().toString().equals("byte")){
866           fieldName = currentField.getName();
867           typeIds.put(new Integer(currentField.getInt(null)),
868                                     new Integer(currentId));
869           typeMnemonics[currentId] = fieldName;
870           stringTypeIds.put(fieldName, new Integer(currentId));
871           currentId++;
872         }
873       }
874     } catch(Exception e) {
875       throw new LuckyException(e.toString());
876     }
877 
878     ignoreTokens = new HashSet();
879     ignoreTokens.add(" ");
880     ignoreTokens.add("\t");
881     ignoreTokens.add("\f");
882   }
883 
} // class SimpleTokeniser