|
SimpleTokeniser |
|
/*
 *  DefaultTokeniser.java
 *
 *  Copyright (c) 1998-2001, The University of Sheffield.
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  Valentin Tablan, 2000
 *
 *  $Id: SimpleTokeniser.java,v 1.10 2001/10/05 15:40:07 valyt Exp $
 */

package gate.creole.tokeniser;

import java.util.*;
import java.io.*;
import java.net.*;
import java.lang.reflect.*;

import gate.*;
import gate.creole.*;
import gate.event.*;
import gate.util.*;

//import EDU.auburn.VGJ.graph.ParseError;

/** Implementation of a Unicode rule based tokeniser.
 * The tokeniser gets its rules from a file, an {@link java.io.InputStream
 * InputStream} or a {@link java.io.Reader Reader} which should be sent to one
 * of the constructors.
 * The implementation is based on a finite state machine that is built from
 * the set of rules.
 * A rule has two sides, the left hand side (LHS) and the right hand side (RHS)
 * that are separated by the ">" character. The LHS represents a
 * regular expression that will be matched against the input while the RHS
 * describes a Gate2 annotation in terms of annotation type and attribute-value
 * pairs.
 * The matching is done using Unicode enumerated types as defined by the {@link
 * java.lang.Character Character} class.
 * At the time of writing this class the
 * supported Unicode categories were:
 * <ul>
 * <li>UNASSIGNED
 * <li>UPPERCASE_LETTER
 * <li>LOWERCASE_LETTER
 * <li>TITLECASE_LETTER
 * <li>MODIFIER_LETTER
 * <li>OTHER_LETTER
 * <li>NON_SPACING_MARK
 * <li>ENCLOSING_MARK
 * <li>COMBINING_SPACING_MARK
 * <li>DECIMAL_DIGIT_NUMBER
 * <li>LETTER_NUMBER
 * <li>OTHER_NUMBER
 * <li>SPACE_SEPARATOR
 * <li>LINE_SEPARATOR
 * <li>PARAGRAPH_SEPARATOR
 * <li>CONTROL
 * <li>FORMAT
 * <li>PRIVATE_USE
 * <li>SURROGATE
 * <li>DASH_PUNCTUATION
 * <li>START_PUNCTUATION
 * <li>END_PUNCTUATION
 * <li>CONNECTOR_PUNCTUATION
 * <li>OTHER_PUNCTUATION
 * <li>MATH_SYMBOL
 * <li>CURRENCY_SYMBOL
 * <li>MODIFIER_SYMBOL
 * <li>OTHER_SYMBOL
 * </ul>
 * The accepted operators for the LHS are "+", "*" and "|" having the usual
 * interpretations of "1 to n occurrences", "0 to n occurrences" and
 * "boolean OR".
 * For instance this is a valid LHS:
 * <br>"UPPERCASE_LETTER" "LOWERCASE_LETTER"+
 * <br>meaning an uppercase letter followed by one or more lowercase letters.
 *
 * The RHS describes an annotation that is to be created and inserted in the
 * annotation set provided in case of a match. The new annotation will span the
 * text that has been recognised. The RHS consists of the annotation type
 * followed by pairs of attributes and associated values.
 * E.g. for the LHS above a possible RHS can be:<br>
 * Token;kind=upperInitial;<br>
 * representing an annotation of type "Token" having one attribute
 * named "kind" with the value "upperInitial"<br>
 * The entire rule will be:<br>
 * <pre>"UPPERCASE_LETTER" "LOWERCASE_LETTER"+ > Token;kind=upperInitial;</pre>
 * <br>
 * The tokeniser ignores all the empty lines or the ones that start with # or
 * //.
94 * 95 */ 96 public class SimpleTokeniser extends AbstractLanguageAnalyser{ 97 /** Debug flag 98 */ 99 private static final boolean DEBUG = false; 100 101 /** 102 * Creates a tokeniser 103 */ 104 public SimpleTokeniser(){ 105 } 106 107 /** 108 * Initialises this tokeniser by reading the rules from an external source (provided through an URL) and building 109 * the finite state machine at the core of the tokeniser. 110 * 111 * @exception ResourceInstantiationException 112 */ 113 public Resource init() throws ResourceInstantiationException{ 114 Reader rulesReader; 115 try{ 116 if(rulesURL != null){ 117 rulesReader = new InputStreamReader(rulesURL.openStream(), encoding); 118 }else{ 119 //no init data, Scream! 120 throw new ResourceInstantiationException( 121 "No URL provided for the rules!"); 122 } 123 initialState = new FSMState(this); 124 BufferedReader bRulesReader = new BufferedReader(rulesReader); 125 String line = bRulesReader.readLine(); 126 ///String toParse = ""; 127 StringBuffer toParse = new StringBuffer(Gate.STRINGBUFFER_SIZE); 128 129 while (line != null){ 130 if(line.endsWith("\\")){ 131 ///toParse += line.substring(0,line.length()-1); 132 toParse.append(line.substring(0,line.length()-1)); 133 }else{ 134 /*toParse += line; 135 parseRule(toParse); 136 toParse = ""; 137 */ 138 toParse.append(line); 139 parseRule(toParse.toString()); 140 toParse.delete(0,toParse.length()); 141 } 142 line = bRulesReader.readLine(); 143 } 144 eliminateVoidTransitions(); 145 }catch(java.io.IOException ioe){ 146 throw new ResourceInstantiationException(ioe); 147 }catch(TokeniserException te){ 148 throw new ResourceInstantiationException(te); 149 } 150 return this; 151 } 152 153 /** 154 * Prepares this Processing resource for a new run. 155 */ 156 public void reset(){ 157 document = null; 158 annotationSetName = null; 159 } 160 161 /** Parses one input line containing a tokeniser rule. 162 * This will create the necessary FSMState objects and the links 163 * between them. 
164 * 165 * @param line the string containing the rule 166 */ 167 void parseRule(String line)throws TokeniserException{ 168 //ignore comments 169 if(line.startsWith("#")) return; 170 171 if(line.startsWith("//")) return; 172 173 StringTokenizer st = new StringTokenizer(line, "()+*|\" \t\f>", true); 174 FSMState newState = new FSMState(this); 175 176 initialState.put(null, newState); 177 FSMState finalState = parseLHS(newState, st, LHStoRHS); 178 String rhs = ""; 179 180 if(st.hasMoreTokens()) rhs = st.nextToken("\f"); 181 182 if(rhs.length() > 0)finalState.setRhs(rhs); 183 } // parseRule 184 185 /** Parses a part or the entire LHS. 186 * 187 * @param startState a FSMState object representing the initial state for 188 * the small FSM that will recognise the (part of) the rule parsed by this 189 * method. 190 * @param st a {@link java.util.StringTokenizer StringTokenizer} that 191 * provides the input 192 * @param until the string that marks the end of the section to be 193 * recognised. This method will first be called by {@link 194 * #parseRule(String)} with " >" in order to parse the entire 195 * LHS. when necessary it will make itself another call to {@link #parseLHS 196 * parseLHS} to parse a region of the LHS (e.g. a 197 * "(",")" enclosed part. 198 */ 199 FSMState parseLHS(FSMState startState, StringTokenizer st, String until) 200 throws TokeniserException{ 201 202 FSMState currentState = startState; 203 boolean orFound = false; 204 List orList = new LinkedList(); 205 String token; 206 token = skipIgnoreTokens(st); 207 208 if(null == token) return currentState; 209 210 FSMState newState; 211 Integer typeId; 212 UnicodeType uType; 213 214 bigwhile: while(!token.equals(until)){ 215 if(token.equals("(")){//(..) 
216 newState = parseLHS(currentState, st,")"); 217 } else if(token.equals("\"")){//"unicode_type" 218 String sType = parseQuotedString(st, "\""); 219 //Out.println(sType); 220 newState = new FSMState(this); 221 typeId = (Integer)stringTypeIds.get(sType); 222 223 if(null == typeId) 224 throw new InvalidRuleException("Invalid type: \"" + sType + "\""); 225 else uType = new UnicodeType(typeId.intValue()); 226 227 currentState.put(uType ,newState); 228 } else {// a type with no quotes 229 String sType = token; 230 //Out.println(sType); 231 newState = new FSMState(this); 232 typeId = (Integer)stringTypeIds.get(sType); 233 234 if(null == typeId) 235 throw new InvalidRuleException("Invalid type: \"" + sType + "\""); 236 else uType = new UnicodeType(typeId.intValue()); 237 238 currentState.put(uType ,newState); 239 } 240 //treat the operators 241 token = skipIgnoreTokens(st); 242 if(null == token) throw 243 new InvalidRuleException("Tokeniser rule ended too soon!"); 244 245 if(token.equals("|")) { 246 247 orFound = true; 248 orList.add(newState); 249 token = skipIgnoreTokens(st); 250 if(null == token) throw 251 new InvalidRuleException("Tokeniser rule ended too soon!"); 252 253 continue bigwhile; 254 } else if(orFound) {//done parsing the "|" 255 orFound = false; 256 orList.add(newState); 257 newState = new FSMState(this); 258 Iterator orListIter = orList.iterator(); 259 260 while(orListIter.hasNext()) 261 ((FSMState)orListIter.next()).put(null, newState); 262 orList.clear(); 263 } 264 265 if(token.equals("+")) { 266 267 newState.put(null,currentState); 268 currentState = newState; 269 newState = new FSMState(this); 270 currentState.put(null,newState); 271 token = skipIgnoreTokens(st); 272 273 if(null == token) throw 274 new InvalidRuleException("Tokeniser rule ended too soon!"); 275 } else if(token.equals("*")) { 276 277 currentState.put(null,newState); 278 newState.put(null,currentState); 279 currentState = newState; 280 newState = new FSMState(this); 281 
currentState.put(null,newState); 282 token = skipIgnoreTokens(st); 283 284 if(null == token) throw 285 new InvalidRuleException("Tokeniser rule ended too soon!"); 286 } 287 currentState = newState; 288 } 289 return currentState; 290 } // parseLHS 291 292 /** Parses from the given string tokeniser until it finds a specific 293 * delimiter. 294 * One use for this method is to read everything until the first quote. 295 * 296 * @param st a {@link java.util.StringTokenizer StringTokenizer} that 297 * provides the input 298 * @param until a String representing the end delimiter. 299 */ 300 String parseQuotedString(StringTokenizer st, String until) 301 throws TokeniserException { 302 303 String token; 304 305 if(st.hasMoreElements()) token = st.nextToken(); 306 else return null; 307 308 ///String type = ""; 309 StringBuffer type = new StringBuffer(Gate.STRINGBUFFER_SIZE); 310 311 while(!token.equals(until)){ 312 //type += token; 313 type.append(token); 314 if(st.hasMoreElements())token = st.nextToken(); 315 else throw new InvalidRuleException("Tokeniser rule ended too soon!"); 316 } 317 return type.toString(); 318 } // parseQuotedString 319 320 /** Skips the ignorable tokens from the input returning the first significant 321 * token. 
322 * The ignorable tokens are defined by {@link #ignoreTokens a set} 323 */ 324 protected static String skipIgnoreTokens(StringTokenizer st){ 325 Iterator ignorables; 326 boolean ignorableFound = false; 327 String currentToken; 328 329 while(true){ 330 if(st.hasMoreTokens()){ 331 currentToken = st.nextToken(); 332 ignorables = ignoreTokens.iterator(); 333 ignorableFound = false; 334 335 while(!ignorableFound && ignorables.hasNext()){ 336 if(currentToken.equals((String)ignorables.next())) 337 ignorableFound = true; 338 } 339 340 if(!ignorableFound) return currentToken; 341 } else return null; 342 } 343 }//skipIgnoreTokens 344 345 /* Computes the lambda-closure (aka epsilon closure) of the given set of 346 * states, that is the set of states that are accessible from any of the 347 * states in the given set using only unrestricted transitions. 348 * @return a set containing all the states accessible from this state via 349 * transitions that bear no restrictions. 350 */ 351 /** 352 * Converts the finite state machine to a deterministic one. 
353 * 354 * @param s 355 */ 356 private AbstractSet lambdaClosure(Set s){ 357 358 //the stack/queue used by the algorithm 359 LinkedList list = new LinkedList(s); 360 361 //the set to be returned 362 AbstractSet lambdaClosure = new HashSet(s); 363 364 FSMState top; 365 FSMState currentState; 366 Set nextStates; 367 Iterator statesIter; 368 369 while(!list.isEmpty()) { 370 top = (FSMState)list.removeFirst(); 371 nextStates = top.nextSet(null); 372 373 if(null != nextStates){ 374 statesIter = nextStates.iterator(); 375 376 while(statesIter.hasNext()) { 377 currentState = (FSMState)statesIter.next(); 378 if(!lambdaClosure.contains(currentState)){ 379 lambdaClosure.add(currentState); 380 list.addFirst(currentState); 381 }//if(!lambdaClosure.contains(currentState)) 382 }//while(statesIter.hasNext()) 383 384 }//if(null != nextStates) 385 } 386 return lambdaClosure; 387 } // lambdaClosure 388 389 /** Converts the FSM from a non-deterministic to a deterministic one by 390 * eliminating all the unrestricted transitions. 
391 */ 392 void eliminateVoidTransitions() throws TokeniserException { 393 394 //kalina:clear() faster than init() which is called with init() 395 newStates.clear(); 396 Set sdStates = new HashSet(); 397 LinkedList unmarkedDStates = new LinkedList(); 398 DFSMState dCurrentState = new DFSMState(this); 399 Set sdCurrentState = new HashSet(); 400 401 sdCurrentState.add(initialState); 402 sdCurrentState = lambdaClosure(sdCurrentState); 403 newStates.put(sdCurrentState, dCurrentState); 404 sdStates.add(sdCurrentState); 405 406 //find out if the new state is a final one 407 Iterator innerStatesIter = sdCurrentState.iterator(); 408 String rhs; 409 FSMState currentInnerState; 410 Set rhsClashSet = new HashSet(); 411 boolean newRhs = false; 412 413 while(innerStatesIter.hasNext()){ 414 currentInnerState = (FSMState)innerStatesIter.next(); 415 if(currentInnerState.isFinal()){ 416 rhs = currentInnerState.getRhs(); 417 rhsClashSet.add(rhs); 418 dCurrentState.rhs = rhs; 419 newRhs = true; 420 } 421 } 422 423 if(rhsClashSet.size() > 1){ 424 Err.println("Warning, rule clash: " + rhsClashSet + 425 "\nSelected last definition: " + dCurrentState.rhs); 426 } 427 428 if(newRhs)dCurrentState.buildTokenDesc(); 429 rhsClashSet.clear(); 430 unmarkedDStates.addFirst(sdCurrentState); 431 dInitialState = dCurrentState; 432 Set nextSet; 433 434 while(!unmarkedDStates.isEmpty()){ 435 //Out.println("\n\n=====================" + unmarkedDStates.size()); 436 sdCurrentState = (Set)unmarkedDStates.removeFirst(); 437 for(int type = 0; type < maxTypeId; type++){ 438 //Out.print(type); 439 nextSet = new HashSet(); 440 innerStatesIter = sdCurrentState.iterator(); 441 442 while(innerStatesIter.hasNext()){ 443 currentInnerState = (FSMState)innerStatesIter.next(); 444 Set tempSet = currentInnerState.nextSet(type); 445 if(null != tempSet) nextSet.addAll(tempSet); 446 }//while(innerStatesIter.hasNext()) 447 448 if(!nextSet.isEmpty()){ 449 nextSet = lambdaClosure(nextSet); 450 dCurrentState = 
(DFSMState)newStates.get(nextSet); 451 452 if(dCurrentState == null){ 453 454 //we have a new DFSMState 455 dCurrentState = new DFSMState(this); 456 sdStates.add(nextSet); 457 unmarkedDStates.add(nextSet); 458 459 //check to see whether the new state is a final one 460 innerStatesIter = nextSet.iterator(); 461 newRhs =false; 462 463 while(innerStatesIter.hasNext()){ 464 currentInnerState = (FSMState)innerStatesIter.next(); 465 if(currentInnerState.isFinal()){ 466 rhs = currentInnerState.getRhs(); 467 rhsClashSet.add(rhs); 468 dCurrentState.rhs = rhs; 469 newRhs = true; 470 } 471 } 472 473 if(rhsClashSet.size() > 1){ 474 Err.println("Warning, rule clash: " + rhsClashSet + 475 "\nSelected last definition: " + dCurrentState.rhs); 476 } 477 478 if(newRhs)dCurrentState.buildTokenDesc(); 479 rhsClashSet.clear(); 480 newStates.put(nextSet, dCurrentState); 481 } 482 ((DFSMState)newStates.get(sdCurrentState)).put(type,dCurrentState); 483 } // if(!nextSet.isEmpty()) 484 485 } // for(byte type = 0; type < 256; type++) 486 487 } // while(!unmarkedDStates.isEmpty()) 488 489 } // eliminateVoidTransitions 490 491 /** Returns a string representation of the non-deterministic FSM graph using 492 * GML (Graph modelling language). 
493 */ 494 public String getFSMgml(){ 495 String res = "graph[ \ndirected 1\n"; 496 ///String nodes = "", edges = ""; 497 StringBuffer nodes = new StringBuffer(Gate.STRINGBUFFER_SIZE), 498 edges = new StringBuffer(Gate.STRINGBUFFER_SIZE); 499 500 Iterator fsmStatesIter = fsmStates.iterator(); 501 while (fsmStatesIter.hasNext()){ 502 FSMState currentState = (FSMState)fsmStatesIter.next(); 503 int stateIndex = currentState.getIndex(); 504 /*nodes += "node[ id " + stateIndex + 505 " label \"" + stateIndex; 506 */ 507 nodes.append("node[ id "); 508 nodes.append(stateIndex); 509 nodes.append(" label \""); 510 nodes.append(stateIndex); 511 512 if(currentState.isFinal()){ 513 ///nodes += ",F\\n" + currentState.getRhs(); 514 nodes.append(",F\\n" + currentState.getRhs()); 515 } 516 ///nodes += "\" ]\n"; 517 nodes.append("\" ]\n"); 518 ///edges += currentState.getEdgesGML(); 519 edges.append(currentState.getEdgesGML()); 520 } 521 res += nodes.toString() + edges.toString() + "]\n"; 522 return res; 523 } // getFSMgml 524 525 /** Returns a string representation of the deterministic FSM graph using 526 * GML. 
527 */ 528 public String getDFSMgml() { 529 String res = "graph[ \ndirected 1\n"; 530 ///String nodes = "", edges = ""; 531 StringBuffer nodes = new StringBuffer(Gate.STRINGBUFFER_SIZE), 532 edges = new StringBuffer(Gate.STRINGBUFFER_SIZE); 533 534 Iterator dfsmStatesIter = dfsmStates.iterator(); 535 while (dfsmStatesIter.hasNext()) { 536 DFSMState currentState = (DFSMState)dfsmStatesIter.next(); 537 int stateIndex = currentState.getIndex(); 538 /* nodes += "node[ id " + stateIndex + 539 " label \"" + stateIndex; 540 */ 541 nodes.append("node[ id "); 542 nodes.append(stateIndex); 543 nodes.append(" label \""); 544 nodes.append(stateIndex); 545 546 if(currentState.isFinal()){ 547 /// nodes += ",F\\n" + currentState.getRhs(); 548 nodes.append(",F\\n" + currentState.getRhs()); 549 } 550 /// nodes += "\" ]\n"; 551 nodes.append("\" ]\n"); 552 /// edges += currentState.getEdgesGML(); 553 edges.append(currentState.getEdgesGML()); 554 } 555 res += nodes.toString() + edges.toString() + "]\n"; 556 return res; 557 } // getDFSMgml 558 559 //no doc required: javadoc will copy it from the interface 560 /** */ 561 public FeatureMap getFeatures(){ 562 return features; 563 } // getFeatures 564 565 /** */ 566 public void setFeatures(FeatureMap features){ 567 this.features = features; 568 } // setFeatures 569 570 /** 571 * The method that does the actual tokenisation. 572 */ 573 public void execute() throws ExecutionException { 574 interrupted = false; 575 AnnotationSet annotationSet; 576 //check the input 577 if(document == null) { 578 throw new ExecutionException( 579 "No document to tokenise!" 
580 ); 581 } 582 583 if(annotationSetName == null || 584 annotationSetName.equals("")) annotationSet = document.getAnnotations(); 585 else annotationSet = document.getAnnotations(annotationSetName); 586 587 fireStatusChanged( 588 "Tokenising " + document.getSourceUrl().getFile() + "..."); 589 590 String content = document.getContent().toString(); 591 int length = content.length(); 592 char currentChar; 593 594 DFSMState graphPosition = dInitialState; 595 596 //the index of the first character of the token trying to be recognised 597 int tokenStart = 0; 598 599 //the index of the last character of the last token recognised 600 int lastMatch = -1; 601 602 DFSMState lastMatchingState = null; 603 DFSMState nextState; 604 String tokenString; 605 int charIdx = 0; 606 int oldCharIdx = 0; 607 FeatureMap newTokenFm; 608 609 while(charIdx < length){ 610 currentChar = content.charAt(charIdx); 611 // Out.println( 612 // currentChar + typesMnemonics[Character.getType(currentChar)+128]); 613 nextState = graphPosition.next(((Integer)typeIds.get( 614 new Integer(Character.getType(currentChar)))).intValue()); 615 616 if( null != nextState ) { 617 graphPosition = nextState; 618 if(graphPosition.isFinal()) { 619 lastMatch = charIdx; 620 lastMatchingState = graphPosition; 621 } 622 charIdx ++; 623 } else {//we have a match! 624 newTokenFm = Factory.newFeatureMap(); 625 626 if (null == lastMatchingState) { 627 tokenString = content.substring(tokenStart, tokenStart +1); 628 newTokenFm.put("type","UNKNOWN"); 629 newTokenFm.put("string", tokenString); 630 newTokenFm.put("length", Integer.toString(tokenString.length())); 631 632 try { 633 annotationSet.add(new Long(tokenStart), 634 new Long(tokenStart + 1), 635 "DEFAULT_TOKEN", newTokenFm); 636 } catch (InvalidOffsetException ioe) { 637 //This REALLY shouldn't happen! 
638 ioe.printStackTrace(Err.getPrintWriter()); 639 } 640 // Out.println("Default token: " + tokenStart + 641 // "->" + tokenStart + " :" + tokenString + ";"); 642 charIdx = tokenStart + 1; 643 } else { 644 tokenString = content.substring(tokenStart, lastMatch + 1); 645 newTokenFm.put("string", tokenString); 646 newTokenFm.put("length", Integer.toString(tokenString.length())); 647 648 for(int i = 1; i < lastMatchingState.getTokenDesc().length; i++){ 649 newTokenFm.put(lastMatchingState.getTokenDesc()[i][0], 650 lastMatchingState.getTokenDesc()[i][1]); 651 //Out.println(lastMatchingState.getTokenDesc()[i][0] + "=" + 652 // lastMatchingState.getTokenDesc()[i][1]); 653 } 654 655 656 try { 657 annotationSet.add(new Long(tokenStart), 658 new Long(lastMatch + 1), 659 lastMatchingState.getTokenDesc()[0][0], newTokenFm); 660 } catch(InvalidOffsetException ioe) { 661 //This REALLY shouldn't happen! 662 throw new GateRuntimeException(ioe.toString()); 663 } 664 665 // Out.println(lastMatchingState.getTokenDesc()[0][0] + 666 // ": " + tokenStart + "->" + lastMatch + 667 // " :" + tokenString + ";"); 668 charIdx = lastMatch + 1; 669 } 670 671 lastMatchingState = null; 672 graphPosition = dInitialState; 673 tokenStart = charIdx; 674 } 675 676 if((charIdx - oldCharIdx > 256)){ 677 fireProgressChanged((100 * charIdx )/ length ); 678 oldCharIdx = charIdx; 679 if(isInterrupted()) throw new ExecutionInterruptedException(); 680 } 681 682 } // while(charIdx < length) 683 684 if (null != lastMatchingState) { 685 tokenString = content.substring(tokenStart, lastMatch + 1); 686 newTokenFm = Factory.newFeatureMap(); 687 newTokenFm.put("string", tokenString); 688 newTokenFm.put("length", Integer.toString(tokenString.length())); 689 690 for(int i = 1; i < lastMatchingState.getTokenDesc().length; i++){ 691 newTokenFm.put(lastMatchingState.getTokenDesc()[i][0], 692 lastMatchingState.getTokenDesc()[i][1]); 693 } 694 695 696 try { 697 annotationSet.add(new Long(tokenStart), 698 new Long(lastMatch 
+ 1), 699 lastMatchingState.getTokenDesc()[0][0], newTokenFm); 700 } catch(InvalidOffsetException ioe) { 701 //This REALLY shouldn't happen! 702 throw new GateRuntimeException(ioe.toString()); 703 } 704 705 } 706 707 reset(); 708 fireProcessFinished(); 709 fireStatusChanged("Tokenisation complete!"); 710 } // run 711 712 /** 713 * Sets the value of the <code>rulesURL</code> property which holds an URL 714 * to the file containing the rules for this tokeniser. 715 * @param newRulesURL 716 */ 717 public void setRulesURL(java.net.URL newRulesURL) { 718 rulesURL = newRulesURL; 719 } 720 /** 721 * Gets the value of the <code>rulesURL</code> property hich holds an 722 * URL to the file containing the rules for this tokeniser. 723 */ 724 public java.net.URL getRulesURL() { 725 return rulesURL; 726 } 727 /** */ 728 public void setAnnotationSetName(String newAnnotationSetName) { 729 annotationSetName = newAnnotationSetName; 730 } 731 /** */ 732 public String getAnnotationSetName() { 733 return annotationSetName; 734 } 735 public void setRulesResourceName(String newRulesResourceName) { 736 rulesResourceName = newRulesResourceName; 737 } 738 public String getRulesResourceName() { 739 return rulesResourceName; 740 } 741 public void setEncoding(String newEncoding) { 742 encoding = newEncoding; 743 } 744 public String getEncoding() { 745 return encoding; 746 } 747 748 /** */ 749 protected FeatureMap features = null; 750 751 /** the annotations et where the new annotations will be adde 752 */ 753 protected String annotationSetName; 754 755 /** The initial state of the non deterministic machin 756 */ 757 protected FSMState initialState; 758 759 /** A set containng all the states of the non deterministic machin 760 */ 761 protected Set fsmStates = new HashSet(); 762 763 /** The initial state of the deterministic machin 764 */ 765 protected DFSMState dInitialState; 766 767 /** A set containng all the states of the deterministic machin 768 */ 769 protected Set dfsmStates = new 
HashSet(); 770 771 /** The separator from LHS to RH 772 */ 773 static String LHStoRHS = ">"; 774 775 /** A set of string representing tokens to be ignored (e.g. blanks 776 */ 777 static Set ignoreTokens; 778 779 /** maps from int (the static value on {@link java.lang.Character} to int 780 * the internal value used by the tokeniser. The ins values used by the 781 * tokeniser are consecutive values, starting from 0 and going as high as 782 * necessary. 783 * They map all the public static int members on{@link java.lang.Character} 784 */ 785 public static Map typeIds; 786 787 /** The maximum int value used internally as a type i 788 */ 789 public static int maxTypeId; 790 791 /** Maps the internal type ids to the type name 792 */ 793 public static String[] typeMnemonics; 794 795 /** Maps from type names to type internal id 796 */ 797 public static Map stringTypeIds; 798 799 /** 800 * This property holds an URL to the file containing the rules for this tokeniser 801 * 802 */ 803 804 /** */ 805 static protected String defaultResourceName = 806 "creole/tokeniser/DefaultTokeniser.rules"; 807 808 private String rulesResourceName; 809 private java.net.URL rulesURL; 810 private String encoding; 811 private transient Vector progressListeners; 812 //kalina: added this as method to minimise too many init() calls 813 protected transient Map newStates = new HashMap(); 814 815 816 /** The static initialiser will inspect the class {@link java.lang.Character} 817 * using reflection to find all the public static members and will map them 818 * to ids starting from 0. 
819 * After that it will build all the static data: {@link #typeIds}, {@link 820 * #maxTypeId}, {@link #typeMnemonics}, {@link #stringTypeIds} 821 */ 822 static{ 823 Field[] characterClassFields; 824 825 try{ 826 characterClassFields = Class.forName("java.lang.Character").getFields(); 827 }catch(ClassNotFoundException cnfe){ 828 throw new LuckyException("Could not find the java.lang.Character class!"); 829 } 830 831 Collection staticFields = new LinkedList(); 832 833 for(int i = 0; i< characterClassFields.length; i++) 834 if(Modifier.isStatic(characterClassFields[i].getModifiers())) 835 staticFields.add(characterClassFields[i]); 836 837 typeIds = new HashMap(); 838 maxTypeId = staticFields.size() -1; 839 typeMnemonics = new String[maxTypeId + 1]; 840 stringTypeIds = new HashMap(); 841 842 Iterator staticFieldsIter = staticFields.iterator(); 843 Field currentField; 844 int currentId = 0; 845 String fieldName; 846 847 try { 848 while(staticFieldsIter.hasNext()){ 849 currentField = (Field)staticFieldsIter.next(); 850 if(currentField.getType().toString().equals("byte")){ 851 fieldName = currentField.getName(); 852 typeIds.put(new Integer(currentField.getInt(null)), 853 new Integer(currentId)); 854 typeMnemonics[currentId] = fieldName; 855 stringTypeIds.put(fieldName, new Integer(currentId)); 856 currentId++; 857 } 858 } 859 } catch(Exception e) { 860 throw new LuckyException(e.toString()); 861 } 862 863 ignoreTokens = new HashSet(); 864 ignoreTokens.add(" "); 865 ignoreTokens.add("\t"); 866 ignoreTokens.add("\f"); 867 868 } 869 870 } // class DefaultTokeniser
|
SimpleTokeniser |
|