|
SimpleTokeniser |
|
/*
 *  DefaultTokeniser.java
 *
 *  Copyright (c) 1998-2001, The University of Sheffield.
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  Valentin Tablan, 2000
 *
 *  $Id: SimpleTokeniser.java,v 1.13 2002/03/06 17:15:45 kalina Exp $
 */

package gate.creole.tokeniser;

import java.util.*;
import java.io.*;
import java.net.*;
import java.lang.reflect.*;

import gate.*;
import gate.creole.*;
import gate.event.*;
import gate.util.*;

//import EDU.auburn.VGJ.graph.ParseError;

/** Implementation of a Unicode rule based tokeniser.
 * The tokeniser reads its rules from the URL set through
 * {@link #setRulesURL(java.net.URL)}, decoding them with the encoding set
 * through {@link #setEncoding(String)}.
 * The implementation is based on a finite state machine that is built from
 * the set of rules.
 * A rule has two sides, the left hand side (LHS) and the right hand side (RHS)
 * that are separated by the "&gt;" character. The LHS represents a
 * regular expression that will be matched against the input while the RHS
 * describes a Gate2 annotation in terms of annotation type and attribute-value
 * pairs.
 * The matching is done using Unicode enumerated types as defined by the {@link
 * java.lang.Character Character} class. At the time of writing this class the
 * supported Unicode categories were:
 * <ul>
 * <li>UNASSIGNED
 * <li>UPPERCASE_LETTER
 * <li>LOWERCASE_LETTER
 * <li>TITLECASE_LETTER
 * <li>MODIFIER_LETTER
 * <li>OTHER_LETTER
 * <li>NON_SPACING_MARK
 * <li>ENCLOSING_MARK
 * <li>COMBINING_SPACING_MARK
 * <li>DECIMAL_DIGIT_NUMBER
 * <li>LETTER_NUMBER
 * <li>OTHER_NUMBER
 * <li>SPACE_SEPARATOR
 * <li>LINE_SEPARATOR
 * <li>PARAGRAPH_SEPARATOR
 * <li>CONTROL
 * <li>FORMAT
 * <li>PRIVATE_USE
 * <li>SURROGATE
 * <li>DASH_PUNCTUATION
 * <li>START_PUNCTUATION
 * <li>END_PUNCTUATION
 * <li>CONNECTOR_PUNCTUATION
 * <li>OTHER_PUNCTUATION
 * <li>MATH_SYMBOL
 * <li>CURRENCY_SYMBOL
 * <li>MODIFIER_SYMBOL
 * <li>OTHER_SYMBOL
 * </ul>
 * The accepted operators for the LHS are "+", "*" and "|" having the usual
 * interpretations of "1 to n occurrences", "0 to n occurrences" and
 * "boolean OR".
 * For instance this is a valid LHS:
 * <br>"UPPERCASE_LETTER" "LOWERCASE_LETTER"+
 * <br>meaning an uppercase letter followed by one or more lowercase letters.
 *
 * The RHS describes an annotation that is to be created and inserted in the
 * annotation set provided in case of a match. The new annotation will span the
 * text that has been recognised. The RHS consists of the annotation type
 * followed by pairs of attributes and associated values.
 * E.g. for the LHS above a possible RHS can be:<br>
 * Token;kind=upperInitial;<br>
 * representing an annotation of type "Token" having one attribute
 * named "kind" with the value "upperInitial"<br>
 * The entire rule will be:<br>
 * <pre>"UPPERCASE_LETTER" "LOWERCASE_LETTER"+ &gt; Token;kind=upperInitial;</pre>
 * <br>
 * The tokeniser ignores all the empty lines or the ones that start with # or
 * //.
 *
 */
public class SimpleTokeniser extends AbstractLanguageAnalyser{
  public static final String
    SIMP_TOK_DOCUMENT_PARAMETER_NAME = "document";

  public static final String
    SIMP_TOK_ANNOT_SET_PARAMETER_NAME = "annotationSetName";

  public static final String
    SIMP_TOK_RULES_URL_PARAMETER_NAME = "rulesURL";

  public static final String
    SIMP_TOK_ENCODING_PARAMETER_NAME = "encoding";

  /** Debug flag
   */
  private static final boolean DEBUG = false;

  /**
   * Creates a tokeniser
   */
  public SimpleTokeniser(){
  }

  /**
   * Initialises this tokeniser by reading the rules from an external source
   * (provided through an URL) and building the finite state machine at the
   * core of the tokeniser.
   * A line ending in a backslash is joined with the line that follows it
   * before being parsed as a rule. The rules stream is always closed before
   * this method returns.
   *
   * @return this tokeniser
   * @exception ResourceInstantiationException if no rules URL was set, if the
   * rules cannot be read or if a rule is invalid
   */
  public Resource init() throws ResourceInstantiationException{
    if(rulesURL == null){
      //no init data, Scream!
      throw new ResourceInstantiationException(
        "No URL provided for the rules!");
    }

    InputStream rulesStream = null;
    try{
      rulesStream = rulesURL.openStream();
      //InputStreamReader rejects a null charset name with a
      //NullPointerException, so fall back to the platform default instead
      Reader rulesReader = (encoding == null) ?
                           new InputStreamReader(rulesStream) :
                           new InputStreamReader(rulesStream, encoding);
      BufferedReader bRulesReader = new BufferedReader(rulesReader);

      initialState = new FSMState(this);
      StringBuffer toParse = new StringBuffer(Gate.STRINGBUFFER_SIZE);
      String line = bRulesReader.readLine();

      while(line != null){
        if(line.endsWith("\\")){
          //continuation line: drop the backslash and keep accumulating
          toParse.append(line.substring(0, line.length() - 1));
        }else{
          toParse.append(line);
          parseRule(toParse.toString());
          toParse.setLength(0);
        }
        line = bRulesReader.readLine();
      }

      //the last line of the file ended with a continuation mark: parse what
      //was accumulated rather than silently dropping the rule
      if(toParse.length() > 0) parseRule(toParse.toString());

      eliminateVoidTransitions();
    }catch(java.io.IOException ioe){
      throw new ResourceInstantiationException(ioe);
    }catch(TokeniserException te){
      throw new ResourceInstantiationException(te);
    }finally{
      if(rulesStream != null){
        try{
          rulesStream.close();
        }catch(java.io.IOException ignored){
          //nothing useful can be done if closing fails; any primary failure
          //has already been reported above
        }
      }
    }
    return this;
  }

  /**
   * Prepares this Processing resource for a new run by clearing the document
   * and the annotation set name.
   */
  public void reset(){
    document = null;
    annotationSetName = null;
  }

  /** Parses one input line containing a tokeniser rule.
   * This will create the necessary FSMState objects and the links
   * between them. Blank lines and lines starting with # or // are ignored.
   *
   * @param line the string containing the rule
   * @exception TokeniserException if the rule is not valid
   */
  void parseRule(String line)throws TokeniserException{
    //ignore empty lines: they carry no rule, so they should not add a dead
    //state to the machine
    if(line.trim().length() == 0) return;

    //ignore comments
    if(line.startsWith("#") || line.startsWith("//")) return;

    StringTokenizer st = new StringTokenizer(line, "()+*|\" \t\f>", true);
    FSMState ruleStart = new FSMState(this);

    initialState.put(null, ruleStart);
    FSMState finalState = parseLHS(ruleStart, st, LHStoRHS);

    //everything left after the separator (up to a form feed) is the RHS
    String rhs = "";
    if(st.hasMoreTokens()) rhs = st.nextToken("\f");

    if(rhs.length() > 0) finalState.setRhs(rhs);
  } // parseRule

  /** Parses a part or the entire LHS.
   *
   * @param startState a FSMState object representing the initial state for
   * the small FSM that will recognise the (part of) the rule parsed by this
   * method.
   * @param st a {@link java.util.StringTokenizer StringTokenizer} that
   * provides the input
   * @param until the string that marks the end of the section to be
   * recognised. This method will first be called by {@link
   * #parseRule(String)} with "&gt;" in order to parse the entire
   * LHS. When necessary it will call itself to parse a region of the LHS
   * (e.g. a "(",")" enclosed part).
   * @return the state reached after recognising the parsed section
   * @exception TokeniserException if an unknown type name is found or the
   * rule ends before the <code>until</code> marker
   */
  FSMState parseLHS(FSMState startState, StringTokenizer st, String until)
       throws TokeniserException{

    FSMState currentState = startState;
    boolean orFound = false;
    List orList = new LinkedList();
    String token = skipIgnoreTokens(st);

    if(null == token) return currentState;

    FSMState newState;

    while(!token.equals(until)){
      if(token.equals("(")){//(..)
        newState = parseLHS(currentState, st, ")");
      }else{
        //a type name, either "quoted" or bare
        String sType = token.equals("\"") ? parseQuotedString(st, "\"")
                                          : token;
        newState = new FSMState(this);
        Integer typeId = (Integer)stringTypeIds.get(sType);

        if(null == typeId)
          throw new InvalidRuleException("Invalid type: \"" + sType + "\"");

        currentState.put(new UnicodeType(typeId.intValue()), newState);
      }

      //treat the operators
      token = skipIgnoreTokens(st);
      if(null == token) throw
        new InvalidRuleException("Tokeniser rule ended too soon!");

      if(token.equals("|")){
        //remember this alternative; the next one starts from the same state
        orFound = true;
        orList.add(newState);
        token = skipIgnoreTokens(st);
        if(null == token) throw
          new InvalidRuleException("Tokeniser rule ended too soon!");

        continue;
      }else if(orFound){//done parsing the "|"
        //join the ends of all the alternatives into one fresh state
        orFound = false;
        orList.add(newState);
        newState = new FSMState(this);
        Iterator orListIter = orList.iterator();

        while(orListIter.hasNext())
          ((FSMState)orListIter.next()).put(null, newState);
        orList.clear();
      }

      if(token.equals("+")){
        //1..n: allow looping back from the end to the start
        newState.put(null, currentState);
        currentState = newState;
        newState = new FSMState(this);
        currentState.put(null, newState);
        token = skipIgnoreTokens(st);

        if(null == token) throw
          new InvalidRuleException("Tokeniser rule ended too soon!");
      }else if(token.equals("*")){
        //0..n: as above, plus a way to skip the section altogether
        currentState.put(null, newState);
        newState.put(null, currentState);
        currentState = newState;
        newState = new FSMState(this);
        currentState.put(null, newState);
        token = skipIgnoreTokens(st);

        if(null == token) throw
          new InvalidRuleException("Tokeniser rule ended too soon!");
      }
      currentState = newState;
    }
    return currentState;
  } // parseLHS

  /** Parses from the given string tokeniser until it finds a specific
   * delimiter.
   * One use for this method is to read everything until the first quote.
   *
   * @param st a {@link java.util.StringTokenizer StringTokenizer} that
   * provides the input
   * @param until a String representing the end delimiter.
   * @return the concatenation of all the tokens read before the delimiter, or
   * <code>null</code> if the tokeniser had no tokens left
   * @exception TokeniserException if the input ends before the delimiter
   */
  String parseQuotedString(StringTokenizer st, String until)
       throws TokeniserException {

    if(!st.hasMoreElements()) return null;

    String token = st.nextToken();
    StringBuffer type = new StringBuffer(Gate.STRINGBUFFER_SIZE);

    while(!token.equals(until)){
      type.append(token);
      if(!st.hasMoreElements())
        throw new InvalidRuleException("Tokeniser rule ended too soon!");
      token = st.nextToken();
    }
    return type.toString();
  } // parseQuotedString

  /** Skips the ignorable tokens from the input returning the first significant
   * token.
   * The ignorable tokens are defined by {@link #ignoreTokens a set}
   *
   * @param st the source of tokens
   * @return the first token that is not ignorable or <code>null</code> if the
   * input is exhausted
   */
  protected static String skipIgnoreTokens(StringTokenizer st){
    while(st.hasMoreTokens()){
      String currentToken = st.nextToken();
      if(!ignoreTokens.contains(currentToken)) return currentToken;
    }
    return null;
  }//skipIgnoreTokens

  /** Computes the lambda-closure (aka epsilon closure) of the given set of
   * states, that is the set of states that are accessible from any of the
   * states in the given set using only unrestricted transitions.
   *
   * @param s the set of {@link FSMState}s to start from
   * @return a new set containing the given states plus all the states
   * accessible from them via transitions that bear no restrictions.
   */
  private AbstractSet lambdaClosure(Set s){

    //the stack used by the algorithm
    LinkedList pending = new LinkedList(s);

    //the set to be returned
    AbstractSet closure = new HashSet(s);

    while(!pending.isEmpty()){
      FSMState top = (FSMState)pending.removeFirst();
      Set nextStates = top.nextSet(null);
      if(null == nextStates) continue;

      Iterator statesIter = nextStates.iterator();
      while(statesIter.hasNext()){
        FSMState reachable = (FSMState)statesIter.next();
        //add() reports whether the state was new to the closure
        if(closure.add(reachable)) pending.addFirst(reachable);
      }
    }
    return closure;
  } // lambdaClosure

  /** Converts the FSM from a non-deterministic to a deterministic one by
   * eliminating all the unrestricted transitions.
   * Each deterministic state stands for the lambda-closure of a set of
   * non-deterministic states; {@link #newStates} maps from such sets to the
   * deterministic states created for them.
   *
   * @exception TokeniserException if thrown while building the description
   * of the token for a final state
   */
  void eliminateVoidTransitions() throws TokeniserException {

    //kalina:clear() faster than init() which is called with init()
    newStates.clear();
    LinkedList unmarkedDStates = new LinkedList();

    DFSMState dCurrentState = new DFSMState(this);
    Set sdCurrentState = new HashSet();
    sdCurrentState.add(initialState);
    sdCurrentState = lambdaClosure(sdCurrentState);
    newStates.put(sdCurrentState, dCurrentState);

    //find out if the new state is a final one
    markIfFinal(dCurrentState, sdCurrentState);

    unmarkedDStates.addFirst(sdCurrentState);
    dInitialState = dCurrentState;

    while(!unmarkedDStates.isEmpty()){
      sdCurrentState = (Set)unmarkedDStates.removeFirst();
      DFSMState dSourceState = (DFSMState)newStates.get(sdCurrentState);

      for(int type = 0; type < maxTypeId; type++){
        //collect everything reachable from the current set on this type
        Set nextSet = new HashSet();
        Iterator innerStatesIter = sdCurrentState.iterator();

        while(innerStatesIter.hasNext()){
          Set tempSet = ((FSMState)innerStatesIter.next()).nextSet(type);
          if(null != tempSet) nextSet.addAll(tempSet);
        }

        if(nextSet.isEmpty()) continue;

        nextSet = lambdaClosure(nextSet);
        DFSMState dTargetState = (DFSMState)newStates.get(nextSet);

        if(dTargetState == null){
          //we have a new DFSMState
          dTargetState = new DFSMState(this);
          unmarkedDStates.add(nextSet);

          //check to see whether the new state is a final one
          markIfFinal(dTargetState, nextSet);
          newStates.put(nextSet, dTargetState);
        }
        dSourceState.put(type, dTargetState);
      } // for(type)

    } // while(!unmarkedDStates.isEmpty())

  } // eliminateVoidTransitions

  /** Makes a deterministic state final if any of the non-deterministic states
   * it stands for is final. When several of them are final with different
   * right hand sides a warning is printed and the last one encountered wins.
   *
   * @param dState the deterministic state to be updated
   * @param innerStates the set of {@link FSMState}s represented by
   * <code>dState</code>
   */
  private void markIfFinal(DFSMState dState, Set innerStates)
          throws TokeniserException {
    Set rhsClashSet = new HashSet();
    boolean newRhs = false;
    Iterator innerStatesIter = innerStates.iterator();

    while(innerStatesIter.hasNext()){
      FSMState innerState = (FSMState)innerStatesIter.next();
      if(innerState.isFinal()){
        String rhs = innerState.getRhs();
        rhsClashSet.add(rhs);
        dState.rhs = rhs;
        newRhs = true;
      }
    }

    if(rhsClashSet.size() > 1){
      Err.println("Warning, rule clash: " +  rhsClashSet +
                  "\nSelected last definition: " + dState.rhs);
    }

    if(newRhs) dState.buildTokenDesc();
  } // markIfFinal

  /** Returns a string representation of the non-deterministic FSM graph using
   * GML (Graph modelling language).
   */
  public String getFSMgml(){
    StringBuffer nodes = new StringBuffer(Gate.STRINGBUFFER_SIZE),
                 edges = new StringBuffer(Gate.STRINGBUFFER_SIZE);

    Iterator fsmStatesIter = fsmStates.iterator();
    while(fsmStatesIter.hasNext()){
      FSMState currentState = (FSMState)fsmStatesIter.next();
      int stateIndex = currentState.getIndex();

      nodes.append("node[ id ");
      nodes.append(stateIndex);
      nodes.append(" label \"");
      nodes.append(stateIndex);

      if(currentState.isFinal()){
        nodes.append(",F\\n");
        nodes.append(currentState.getRhs());
      }
      nodes.append("\"  ]\n");
      edges.append(currentState.getEdgesGML());
    }
    return "graph[ \ndirected 1\n" +
           nodes.toString() + edges.toString() + "]\n";
  } // getFSMgml

  /** Returns a string representation of the deterministic FSM graph using
   * GML.
   */
  public String getDFSMgml() {
    StringBuffer nodes = new StringBuffer(Gate.STRINGBUFFER_SIZE),
                 edges = new StringBuffer(Gate.STRINGBUFFER_SIZE);

    Iterator dfsmStatesIter = dfsmStates.iterator();
    while(dfsmStatesIter.hasNext()){
      DFSMState currentState = (DFSMState)dfsmStatesIter.next();
      int stateIndex = currentState.getIndex();

      nodes.append("node[ id ");
      nodes.append(stateIndex);
      nodes.append(" label \"");
      nodes.append(stateIndex);

      if(currentState.isFinal()){
        nodes.append(",F\\n");
        nodes.append(currentState.getRhs());
      }
      nodes.append("\"  ]\n");
      edges.append(currentState.getEdgesGML());
    }
    return "graph[ \ndirected 1\n" +
           nodes.toString() + edges.toString() + "]\n";
  } // getDFSMgml

  //no doc required: javadoc will copy it from the interface
  /**    */
  public FeatureMap getFeatures(){
    return features;
  } // getFeatures

  /**    */
  public void setFeatures(FeatureMap features){
    this.features = features;
  } // setFeatures

  /**
   * The method that does the actual tokenisation.
   * The text of the document is run through the deterministic machine; each
   * time the machine gets stuck (or the text ends) the longest match found
   * since the start of the current token is turned into an annotation whose
   * type and features are given by the rule that matched. If nothing matched,
   * the single character at the start of the current token is annotated as a
   * "DEFAULT_TOKEN" of type "UNKNOWN". Scanning then restarts right after the
   * annotated text, so text remaining after the last match at the end of the
   * document is tokenised as well instead of being dropped.
   *
   * @exception ExecutionException if there is no document to process, if the
   * tokeniser was not initialised or if the execution was interrupted
   */
  public void execute() throws ExecutionException {
    interrupted = false;
    //check the input
    if(document == null) {
      throw new ExecutionException(
        "No document to tokenise!"
      );
    }

    if(dInitialState == null) {
      throw new ExecutionException(
        "The tokeniser has not been initialised: no rules were loaded!"
      );
    }

    AnnotationSet annotationSet;
    if(annotationSetName == null ||
       annotationSetName.equals("")) annotationSet = document.getAnnotations();
    else annotationSet = document.getAnnotations(annotationSetName);

    fireStatusChanged(
        "Tokenising " + document.getName() + "...");

    String content = document.getContent().toString();
    int length = content.length();

    DFSMState graphPosition = dInitialState;

    //the index of the first character of the token trying to be recognised
    int tokenStart = 0;

    //the index of the last character of the last token recognised
    int lastMatch = -1;

    DFSMState lastMatchingState = null;
    int charIdx = 0;
    int oldCharIdx = 0;

    while(tokenStart < length){
      //the end of the text is handled like a character with no transition
      DFSMState nextState = null;
      if(charIdx < length){
        Integer typeId = (Integer)typeIds.get(
                    new Integer(Character.getType(content.charAt(charIdx))));
        //a category that was not mapped cannot be matched by any rule
        if(typeId != null) nextState = graphPosition.next(typeId.intValue());
      }

      if(null != nextState){
        graphPosition = nextState;
        if(graphPosition.isFinal()){
          lastMatch = charIdx;
          lastMatchingState = graphPosition;
        }
        charIdx++;
      }else{//we have a match!
        if(null == lastMatchingState){
          addDefaultToken(annotationSet, content, tokenStart);
          charIdx = tokenStart + 1;
        }else{
          addMatchedToken(annotationSet, content, tokenStart, lastMatch + 1,
                          lastMatchingState);
          charIdx = lastMatch + 1;
        }

        lastMatchingState = null;
        graphPosition = dInitialState;
        tokenStart = charIdx;
      }

      if((charIdx - oldCharIdx > 256)){
        fireProgressChanged((100 * charIdx )/ length );
        oldCharIdx = charIdx;
        if(isInterrupted()) throw new ExecutionInterruptedException();
      }

    } // while(tokenStart < length)

    reset();
    fireProcessFinished();
    fireStatusChanged("Tokenisation complete!");
  } // execute

  /** Annotates the single character found at the given offset as a
   * "DEFAULT_TOKEN" of type "UNKNOWN".
   *
   * @param annotationSet the set receiving the new annotation
   * @param content the text being tokenised
   * @param start the offset of the character to annotate
   */
  private void addDefaultToken(AnnotationSet annotationSet, String content,
                               int start){
    String tokenString = content.substring(start, start + 1);
    FeatureMap newTokenFm = Factory.newFeatureMap();
    newTokenFm.put("type","UNKNOWN");
    newTokenFm.put(TOKEN_STRING_FEATURE_NAME, tokenString);
    newTokenFm.put(TOKEN_LENGTH_FEATURE_NAME,
                   Integer.toString(tokenString.length()));
    addToken(annotationSet, start, start + 1, "DEFAULT_TOKEN", newTokenFm);
  } // addDefaultToken

  /** Annotates the text between the given offsets using the token description
   * of the final state that recognised it: the first entry of the description
   * gives the annotation type, the following ones give the features.
   *
   * @param annotationSet the set receiving the new annotation
   * @param content the text being tokenised
   * @param start the offset of the first character of the token
   * @param end the offset following the last character of the token
   * @param matchingState the final state that recognised the token
   */
  private void addMatchedToken(AnnotationSet annotationSet, String content,
                               int start, int end, DFSMState matchingState){
    String tokenString = content.substring(start, end);
    String[][] tokenDesc = matchingState.getTokenDesc();
    FeatureMap newTokenFm = Factory.newFeatureMap();
    newTokenFm.put(TOKEN_STRING_FEATURE_NAME, tokenString);
    newTokenFm.put(TOKEN_LENGTH_FEATURE_NAME,
                   Integer.toString(tokenString.length()));

    for(int i = 1; i < tokenDesc.length; i++){
      newTokenFm.put(tokenDesc[i][0], tokenDesc[i][1]);
    }
    addToken(annotationSet, start, end, tokenDesc[0][0], newTokenFm);
  } // addMatchedToken

  /** Adds one annotation to the given set, reporting invalid offsets as a
   * {@link GateRuntimeException} since the offsets are always computed from
   * the document's own content.
   */
  private void addToken(AnnotationSet annotationSet, int start, int end,
                        String type, FeatureMap tokenFeatures){
    try {
      annotationSet.add(new Long(start), new Long(end), type, tokenFeatures);
    } catch(InvalidOffsetException ioe) {
      //This REALLY shouldn't happen!
      throw new GateRuntimeException(ioe.toString());
    }
  } // addToken

  /**
   * Sets the value of the <code>rulesURL</code> property which holds an URL
   * to the file containing the rules for this tokeniser.
   * @param newRulesURL
   */
  public void setRulesURL(java.net.URL newRulesURL) {
    rulesURL = newRulesURL;
  }
  /**
   * Gets the value of the <code>rulesURL</code> property which holds an
   * URL to the file containing the rules for this tokeniser.
   */
  public java.net.URL getRulesURL() {
    return rulesURL;
  }
  /**    */
  public void setAnnotationSetName(String newAnnotationSetName) {
    annotationSetName = newAnnotationSetName;
  }
  /**    */
  public String getAnnotationSetName() {
    return annotationSetName;
  }
  public void setRulesResourceName(String newRulesResourceName) {
    rulesResourceName = newRulesResourceName;
  }
  public String getRulesResourceName() {
    return rulesResourceName;
  }
  public void setEncoding(String newEncoding) {
    encoding = newEncoding;
  }
  public String getEncoding() {
    return encoding;
  }

  /**    */
  protected FeatureMap features  = null;

  /** the annotation set where the new annotations will be added
   */
  protected String annotationSetName;

  /** The initial state of the non deterministic machine
   */
  protected FSMState initialState;

  /** A set containing all the states of the non deterministic machine
   */
  protected Set fsmStates = new HashSet();

  /** The initial state of the deterministic machine
   */
  protected DFSMState dInitialState;

  /** A set containing all the states of the deterministic machine
   */
  protected Set dfsmStates = new HashSet();

  /** The separator from LHS to RHS
   */
  static String LHStoRHS = ">";

  /** A set of strings representing tokens to be ignored (e.g. blanks)
   */
  static Set ignoreTokens;

  /** maps from int (the static value on {@link java.lang.Character}) to int
   * the internal value used by the tokeniser. The int values used by the
   * tokeniser are consecutive values, starting from 0 and going as high as
   * necessary.
   * They map all the public static byte members on
   * {@link java.lang.Character} that were not skipped by the static
   * initialiser.
   */
  public static Map typeIds;

  /** The maximum int value used internally as a type id
   */
  public static int maxTypeId;

  /** Maps the internal type ids to the type names
   */
  public static String[] typeMnemonics;

  /** Maps from type names to type internal ids
   */
  public static Map stringTypeIds;

  /**    */
  static protected String defaultResourceName =
                            "creole/tokeniser/DefaultTokeniser.rules";

  private String rulesResourceName;

  /**
   * This property holds an URL to the file containing the rules for this
   * tokeniser
   */
  private java.net.URL rulesURL;
  private String encoding;
  private transient Vector progressListeners;
  //kalina: added this as method to minimise too many init() calls
  protected transient Map newStates = new HashMap();


  /** The static initialiser will inspect the class {@link java.lang.Character}
   * using reflection to find all the public static members and will map the
   * ones of type byte to ids starting from 0.
   * After that it will build all the static data: {@link #typeIds}, {@link
   * #maxTypeId}, {@link #typeMnemonics}, {@link #stringTypeIds}
   */
  static{
    Field[] characterClassFields = Character.class.getFields();

    Collection staticFields = new LinkedList();
    // JDK 1.4 introduced directionality constants that have the same values as
    //character types; we need to skip those as well
    for(int i = 0; i < characterClassFields.length; i++){
      Field candidate = characterClassFields[i];
      if(Modifier.isStatic(candidate.getModifiers()) &&
         candidate.getName().indexOf("DIRECTIONALITY") == -1)
        staticFields.add(candidate);
    }

    typeIds = new HashMap();
    maxTypeId = staticFields.size() -1;
    typeMnemonics = new String[maxTypeId + 1];
    stringTypeIds = new HashMap();

    Iterator staticFieldsIter = staticFields.iterator();
    int currentId = 0;

    try {
      while(staticFieldsIter.hasNext()){
        Field currentField = (Field)staticFieldsIter.next();
        //only the byte constants are character categories
        if(currentField.getType() == Byte.TYPE){
          String fieldName = currentField.getName();
          typeIds.put(new Integer(currentField.getInt(null)),
                      new Integer(currentId));
          typeMnemonics[currentId] = fieldName;
          stringTypeIds.put(fieldName, new Integer(currentId));
          currentId++;
        }
      }
    } catch(Exception e) {
      throw new LuckyException(e.toString());
    }

    ignoreTokens = new HashSet();
    ignoreTokens.add(" ");
    ignoreTokens.add("\t");
    ignoreTokens.add("\f");
  }

} // class SimpleTokeniser
|
SimpleTokeniser |
|