|
DefaultGazetteer |
|
1 /* 2 * DefaultGazeteer.java 3 * 4 * Copyright (c) 2000-2001, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June1991. 9 * 10 * A copy of this licence is included in the distribution in the file 11 * licence.html, and is also available at http://gate.ac.uk/gate/licence.html. 12 * 13 * Valentin Tablan, 03/07/2000 14 * 15 * $Id: DefaultGazetteer.java,v 1.33 2001/11/12 15:04:28 valyt Exp $ 16 */ 17 18 package gate.creole.gazetteer; 19 20 import java.io.*; 21 import java.util.*; 22 import java.net.*; 23 24 import gate.util.*; 25 import gate.creole.*; 26 import gate.event.*; 27 import gate.*; 28 29 /** This component is responsible for doing lists lookup. The implementaion is 30 * based on finite state machines. 31 * The phrases to be recognised should be listed in a set of files, one for 32 * each type of occurences. 33 * The gazeteer is build with the information from a file that contains the set 34 * of lists (which are files as well) and the associated type for each list. 35 * The file defining the set of lists should have the following syntax: 36 * each list definition should be written on its own line and should contain: 37 * <ol> 38 * <li>the file name (required) </li> 39 * <li>the major type (required) </li> 40 * <li>the minor type (optional)</li> 41 * <li>the language(s) (optional) </li> 42 * </ol> 43 * The elements of each definition are separated by ":". 44 * The following is an example of a valid definition: <br> 45 * <code>personmale.lst:person:male:english</code> 46 * Each list file named in the lists definition file is just a list containing 47 * one entry per line. 48 * When this gazetter will be run over some input text (a Gate document) it 49 * will generate annotations of type Lookup having the attributes specified in 50 * the definition file. 51 */ 52 public class DefaultGazetteer extends AbstractLanguageAnalyser 53 implements ProcessingResource { 54 55 /** Debug flag 56 */ 57 private static final boolean DEBUG = false; 58 59 /** Build a gazetter using the default lists from the agte resources 60 * {@see init()} 61 */ 62 public DefaultGazetteer(){ 63 } 64 65 /** Does the actual loading and parsing of the lists. This method must be 66 * called before the gazetteer can be used 67 */ 68 public Resource init()throws ResourceInstantiationException{ 69 fsmStates = new HashSet(); 70 try{ 71 initialState = new FSMState(this); 72 if(listsURL == null){ 73 throw new ResourceInstantiationException ( 74 "No URL provided for gazetteer creation!"); 75 } 76 77 //find the number of lines 78 Reader reader = new InputStreamReader(listsURL.openStream(), encoding); 79 int linesCnt = 0; 80 BufferedReader bReader = new BufferedReader(reader); 81 String line = bReader.readLine(); 82 while (line != null) { 83 linesCnt++; 84 line = bReader.readLine(); 85 } 86 bReader.close(); 87 88 //parse the file 89 reader = new InputStreamReader(listsURL.openStream(), encoding); 90 bReader = new BufferedReader(reader); 91 line = bReader.readLine(); 92 ///String toParse = ""; 93 StringBuffer toParse = new StringBuffer(gate.Gate.STRINGBUFFER_SIZE); 94 95 int lineIdx = 0; 96 while (line != null) { 97 if(line.endsWith("\\")) { 98 ///toParse += line.substring(0,line.length()-1); 99 toParse.append(line.substring(0,line.length()-1)); 100 } else { 101 ///toParse += line; 102 toParse.append(line); 103 fireStatusChanged("Reading " + toParse.toString()); 104 fireProgressChanged(lineIdx * 100 / linesCnt); 105 lineIdx ++; 106 readList(toParse.toString(), true); 107 ///toParse = ""; 108 toParse.delete(0,toParse.length()); 109 } 110 line = bReader.readLine(); 111 } 112 fireProcessFinished(); 113 }catch(IOException ioe){ 114 throw new ResourceInstantiationException(ioe); 115 }catch(GazetteerException ge){ 116 throw new ResourceInstantiationException(ge); 117 } 118 return this; 119 } 120 121 122 /** Reads one lists (one file) of phrases 123 * 124 * @param listDesc the line from the definition file 125 * @param add 126 * @add if <b>true</b> will add the phrases found in the list to the ones 127 * recognised by this gazetter, if <b>false</b> the phrases found in the 128 * list will be removed from the list of phrases recognised by this 129 * gazetteer. 130 */ 131 void readList(String listDesc, boolean add) throws FileNotFoundException, 132 IOException, 133 GazetteerException{ 134 String listName, majorType, minorType, languages; 135 int firstColon = listDesc.indexOf(':'); 136 int secondColon = listDesc.indexOf(':', firstColon + 1); 137 int thirdColon = listDesc.indexOf(':', secondColon + 1); 138 if(firstColon == -1){ 139 throw new GazetteerException("Invalid list definition: " + listDesc); 140 } 141 listName = listDesc.substring(0, firstColon); 142 143 if(secondColon == -1){ 144 majorType = listDesc.substring(firstColon + 1); 145 minorType = null; 146 languages = null; 147 } else { 148 majorType = listDesc.substring(firstColon + 1, secondColon); 149 if(thirdColon == -1) { 150 minorType = listDesc.substring(secondColon + 1); 151 languages = null; 152 } else { 153 minorType = listDesc.substring(secondColon + 1, thirdColon); 154 languages = listDesc.substring(thirdColon + 1); 155 } 156 } 157 BufferedReader listReader; 158 159 listReader = new BufferedReader(new InputStreamReader( 160 (new URL(listsURL, listName)).openStream(), encoding)); 161 162 Lookup lookup = new Lookup(majorType, minorType, languages); 163 String line = listReader.readLine(); 164 while(null != line){ 165 if(add)addLookup(line, lookup); 166 else removeLookup(line, lookup); 167 line = listReader.readLine(); 168 } 169 } // void readList(String listDesc) 170 171 /** Adds one phrase to the list of phrases recognised by this gazetteer 172 * 173 * @param text the phrase to be added 174 * @param lookup the description of the annotation to be added when this 175 * phrase is recognised 176 */ 177 // >>> DAM, was 178 /* 179 public void addLookup(String text, Lookup lookup) { 180 Character currentChar; 181 FSMState currentState = initialState; 182 FSMState nextState; 183 Lookup oldLookup; 184 boolean isSpace; 185 186 for(int i = 0; i< text.length(); i++) { 187 isSpace = Character.isWhitespace(text.charAt(i)); 188 if(isSpace) currentChar = new Character(' '); 189 else currentChar = (caseSensitive.booleanValue()) ? 190 new Character(text.charAt(i)) : 191 new Character(Character.toUpperCase(text.charAt(i))) ; 192 nextState = currentState.next(currentChar); 193 if(nextState == null){ 194 nextState = new FSMState(this); 195 currentState.put(currentChar, nextState); 196 if(isSpace) nextState.put(new Character(' '),nextState); 197 } 198 currentState = nextState; 199 } //for(int i = 0; i< text.length(); i++) 200 201 currentState.addLookup(lookup); 202 //Out.println(text + "|" + lookup.majorType + "|" + lookup.minorType); 203 204 } // addLookup 205 */ 206 // >>> DAM: TransArray optimization 207 public void addLookup(String text, Lookup lookup) { 208 char currentChar; 209 FSMState currentState = initialState; 210 FSMState nextState; 211 Lookup oldLookup; 212 boolean isSpace; 213 214 for(int i = 0; i< text.length(); i++) { 215 currentChar = text.charAt(i); 216 isSpace = Character.isWhitespace(currentChar); 217 if(isSpace) currentChar = ' '; 218 else currentChar = (caseSensitive.booleanValue()) ? 219 currentChar : 220 Character.toUpperCase(currentChar) ; 221 nextState = currentState.next(currentChar); 222 if(nextState == null){ 223 nextState = new FSMState(this); 224 currentState.put(currentChar, nextState); 225 if(isSpace) nextState.put(' ',nextState); 226 } 227 currentState = nextState; 228 } //for(int i = 0; i< text.length(); i++) 229 230 currentState.addLookup(lookup); 231 //Out.println(text + "|" + lookup.majorType + "|" + lookup.minorType); 232 233 } // addLookup 234 // >>> DAM, end 235 236 /** Removes one phrase to the list of phrases recognised by this gazetteer 237 * 238 * @param text the phrase to be removed 239 * @param lookup the description of the annotation associated to this phrase 240 */ 241 // >>> DAM, was 242 /* 243 public void removeLookup(String text, Lookup lookup) { 244 Character currentChar; 245 FSMState currentState = initialState; 246 FSMState nextState; 247 Lookup oldLookup; 248 boolean isSpace; 249 250 for(int i = 0; i< text.length(); i++) { 251 isSpace = Character.isWhitespace(text.charAt(i)); 252 if(isSpace) currentChar = new Character(' '); 253 else currentChar = new Character(text.charAt(i)); 254 nextState = currentState.next(currentChar); 255 if(nextState == null) return;//nothing to remove 256 currentState = nextState; 257 } //for(int i = 0; i< text.length(); i++) 258 currentState.removeLookup(lookup); 259 } // removeLookup 260 */ 261 // >>> DAM: TransArray optimization 262 public void removeLookup(String text, Lookup lookup) { 263 char currentChar; 264 FSMState currentState = initialState; 265 FSMState nextState; 266 Lookup oldLookup; 267 268 for(int i = 0; i< text.length(); i++) { 269 currentChar = text.charAt(i); 270 if(Character.isWhitespace(currentChar)) currentChar = ' '; 271 nextState = currentState.next(currentChar); 272 if(nextState == null) return;//nothing to remove 273 currentState = nextState; 274 } //for(int i = 0; i< text.length(); i++) 275 currentState.removeLookup(lookup); 276 } // removeLookup 277 // >>> DAM, end 278 279 /** Returns a string representation of the deterministic FSM graph using 280 * GML. 281 */ 282 public String getFSMgml() { 283 String res = "graph[ \ndirected 1\n"; 284 ///String nodes = "", edges = ""; 285 StringBuffer nodes = new StringBuffer(gate.Gate.STRINGBUFFER_SIZE), 286 edges = new StringBuffer(gate.Gate.STRINGBUFFER_SIZE); 287 Iterator fsmStatesIter = fsmStates.iterator(); 288 while (fsmStatesIter.hasNext()){ 289 FSMState currentState = (FSMState)fsmStatesIter.next(); 290 int stateIndex = currentState.getIndex(); 291 /*nodes += "node[ id " + stateIndex + 292 " label \"" + stateIndex; 293 */ 294 nodes.append("node[ id "); 295 nodes.append(stateIndex); 296 nodes.append(" label \""); 297 nodes.append(stateIndex); 298 299 if(currentState.isFinal()){ 300 ///nodes += ",F\\n" + currentState.getLookupSet(); 301 nodes.append(",F\\n"); 302 nodes.append(currentState.getLookupSet()); 303 } 304 ///nodes += "\" ]\n"; 305 nodes.append("\" ]\n"); 306 //edges += currentState.getEdgesGML(); 307 edges.append(currentState.getEdgesGML()); 308 } 309 res += nodes.toString() + edges.toString() + "]\n"; 310 return res; 311 } // getFSMgml 312 313 //no doc required: javadoc will copy it from the interface 314 /** */ 315 public FeatureMap getFeatures(){ 316 return features; 317 } // getFeatures 318 319 /** */ 320 public void setFeatures(FeatureMap features){ 321 this.features = features; 322 } // setFeatures 323 324 325 326 /** 327 * This method runs the gazetteer. It assumes that all the needed parameters 328 * are set. If they are not, an exception will be fired. 329 */ 330 public void execute() throws ExecutionException{ 331 interrupted = false; 332 AnnotationSet annotationSet; 333 //check the input 334 if(document == null) { 335 throw new ExecutionException( 336 "No document to process!" 337 ); 338 } 339 340 if(annotationSetName == null || 341 annotationSetName.equals("")) annotationSet = document.getAnnotations(); 342 else annotationSet = document.getAnnotations(annotationSetName); 343 344 fireStatusChanged("Doing lookup in " + 345 document.getSourceUrl().getFile() + "..."); 346 String content = document.getContent().toString(); 347 int length = content.length(); 348 // >>> DAM, was 349 /* 350 Character currentChar; 351 */ 352 // >>> DAM: TransArray optimization 353 char currentChar; 354 // >>> DAM, end 355 FSMState currentState = initialState; 356 FSMState nextState; 357 FSMState lastMatchingState = null; 358 int matchedRegionEnd = 0; 359 int matchedRegionStart = 0; 360 int charIdx = 0; 361 int oldCharIdx = 0; 362 FeatureMap fm; 363 Lookup currentLookup; 364 365 // >>> DAM, was 366 /* 367 while(charIdx < length) { 368 if(Character.isWhitespace(content.charAt(charIdx))) 369 currentChar = new Character(' '); 370 else currentChar = (caseSensitive.booleanValue()) ? 371 new Character(content.charAt(charIdx)) : 372 new Character(Character.toUpperCase( 373 content.charAt(charIdx))); 374 */ 375 // >>> DAM: TransArray optimization 376 while(charIdx < length) { 377 currentChar = content.charAt(charIdx); 378 if(Character.isWhitespace(currentChar)) currentChar = ' '; 379 else currentChar = caseSensitive.booleanValue() ? 380 currentChar : 381 Character.toUpperCase(currentChar); 382 // >>> DAM, end 383 nextState = currentState.next(currentChar); 384 if(nextState == null) { 385 //the matching stopped 386 387 //if we had a successful match then act on it; 388 if(lastMatchingState != null){ 389 //let's add the new annotation(s) 390 Iterator lookupIter = lastMatchingState.getLookupSet().iterator(); 391 392 while(lookupIter.hasNext()) { 393 currentLookup = (Lookup)lookupIter.next(); 394 fm = Factory.newFeatureMap(); 395 fm.put("majorType", currentLookup.majorType); 396 if(null != currentLookup.minorType) { 397 fm.put("minorType", currentLookup.minorType); 398 if(null != currentLookup.languages) 399 fm.put("language", currentLookup.languages); 400 } 401 try { 402 annotationSet.add(new Long(matchedRegionStart), 403 new Long(matchedRegionEnd + 1), 404 "Lookup", 405 fm); 406 } catch(InvalidOffsetException ioe) { 407 throw new LuckyException(ioe.toString()); 408 } 409 }//while(lookupIter.hasNext()) 410 lastMatchingState = null; 411 } 412 413 //reset the FSM 414 charIdx = matchedRegionStart + 1; 415 matchedRegionStart = charIdx; 416 currentState = initialState; 417 418 } else{//go on with the matching 419 currentState = nextState; 420 //if we have a successful state then store it 421 if(currentState.isFinal() && 422 (matchedRegionStart == 0 || 423 !Character.isLetter(content.charAt(matchedRegionStart - 1))) && 424 (charIdx + 1 >= content.length() || 425 !Character.isLetter(content.charAt(charIdx + 1))) 426 ){ 427 matchedRegionEnd = charIdx; 428 lastMatchingState = currentState; 429 } 430 charIdx ++; 431 if(charIdx == content.length()){ 432 //we can't go on, use the last matching state and restart matching 433 //from the next char 434 if(lastMatchingState != null){ 435 //let's add the new annotation(s) 436 Iterator lookupIter = lastMatchingState.getLookupSet().iterator(); 437 438 while(lookupIter.hasNext()) { 439 currentLookup = (Lookup)lookupIter.next(); 440 fm = Factory.newFeatureMap(); 441 fm.put("majorType", currentLookup.majorType); 442 if(null != currentLookup.minorType) { 443 fm.put("minorType", currentLookup.minorType); 444 if(null != currentLookup.languages) 445 fm.put("language", currentLookup.languages); 446 } 447 try { 448 annotationSet.add(new Long(matchedRegionStart), 449 new Long(matchedRegionEnd + 1), 450 "Lookup", 451 fm); 452 } catch(InvalidOffsetException ioe) { 453 throw new LuckyException(ioe.toString()); 454 } 455 }//while(lookupIter.hasNext()) 456 lastMatchingState = null; 457 } 458 459 //reset the FSM 460 charIdx = matchedRegionStart + 1; 461 matchedRegionStart = charIdx; 462 currentState = initialState; 463 } 464 } 465 if(charIdx - oldCharIdx > 256) { 466 fireProgressChanged((100 * charIdx )/ length ); 467 oldCharIdx = charIdx; 468 if(isInterrupted()) throw new ExecutionInterruptedException( 469 "The execution of the " + getName() + 470 " gazetteer has been abruptly interrupted!"); 471 } 472 } // while(charIdx < length) 473 474 if(lastMatchingState != null) { 475 Iterator lookupIter = lastMatchingState.getLookupSet().iterator(); 476 while(lookupIter.hasNext()) { 477 currentLookup = (Lookup)lookupIter.next(); 478 fm = Factory.newFeatureMap(); 479 fm.put("majorType", currentLookup.majorType); 480 if(null != currentLookup.minorType) 481 fm.put("minorType", currentLookup.minorType); 482 try{ 483 annotationSet.add(new Long(matchedRegionStart), 484 new Long(matchedRegionEnd + 1), 485 "Lookup", 486 fm); 487 } catch(InvalidOffsetException ioe) { 488 throw new GateRuntimeException(ioe.toString()); 489 } 490 }//while(lookupIter.hasNext()) 491 } 492 fireProcessFinished(); 493 fireStatusChanged("Lookup complete!"); 494 } // execute 495 496 497 /** 498 * Sets the AnnotationSet that will be used at the next run for the newly 499 * produced annotations. 500 */ 501 public void setAnnotationSetName(String newAnnotationSetName) { 502 annotationSetName = newAnnotationSetName; 503 } 504 505 506 /** The initial state of the FSM that backs this gazetteer 507 */ 508 FSMState initialState; 509 510 /** A set containing all the states of the FSM backing the gazetteer 511 */ 512 Set fsmStates; 513 514 protected FeatureMap features = null; 515 516 /** Used to store the annotation set currently being used for the newly 517 * generated annotations 518 */ 519 protected String annotationSetName; 520 521 private String encoding = "UTF-8"; 522 523 /** 524 * The value of this property is the URL that will be used for reading the 525 * lists dtaht define this Gazetteer 526 */ 527 private java.net.URL listsURL; 528 529 /** 530 * Should this gazetteer be case sensitive. The default value is true. 531 */ 532 private Boolean caseSensitive = new Boolean(true); 533 534 public void setEncoding(String newEncoding) { 535 encoding = newEncoding; 536 } 537 public String getEncoding() { 538 return encoding; 539 } 540 public void setListsURL(java.net.URL newListsURL) { 541 listsURL = newListsURL; 542 } 543 public java.net.URL getListsURL() { 544 return listsURL; 545 } 546 public void setCaseSensitive(Boolean newCaseSensitive) { 547 caseSensitive = newCaseSensitive; 548 } 549 public Boolean getCaseSensitive() { 550 return caseSensitive; 551 } 552 public String getAnnotationSetName() { 553 return annotationSetName; 554 } 555 556 } // DefaultGazetteer 557 558 // >>> DAM: TransArray optimization, new charMap implementation 559 interface Iter 560 { 561 public boolean hasNext(); 562 public char next(); 563 } // iter class 564 565 /** 566 * class implementing the map using binary serach by char as key 567 * to retrive the coresponding object. 568 */ 569 class charMap 570 { 571 char[] itemsKeys = null; 572 Object[] itemsObjs = null; 573 574 /** 575 * resize the containers by one leavaing empty elemant at position 'index' 576 */ 577 void resize(int index) 578 { 579 int newsz = itemsKeys.length + 1; 580 char[] tempKeys = new char[newsz]; 581 Object[] tempObjs = new Object[newsz]; 582 int i; 583 for (i= 0; i < index; i++) 584 { 585 tempKeys[i] = itemsKeys[i]; 586 tempObjs[i] = itemsObjs[i]; 587 } 588 for (i= index+1; i < newsz; i++) 589 { 590 tempKeys[i] = itemsKeys[i-1]; 591 tempObjs[i] = itemsObjs[i-1]; 592 } 593 594 itemsKeys = tempKeys; 595 itemsObjs = tempObjs; 596 } // resize 597 598 /** 599 * get the object from the map using the char key 600 */ 601 Object get(char key) 602 { 603 if (itemsKeys == null) return null; 604 int index = Arrays.binarySearch(itemsKeys, key); 605 if (index<0) 606 return null; 607 return itemsObjs[index]; 608 } 609 /** 610 * put the object into the char map using the chat as the key 611 */ 612 Object put(char key, Object value) 613 { 614 if (itemsKeys == null) 615 { 616 itemsKeys = new char[1]; 617 itemsKeys[0] = key; 618 itemsObjs = new Object[1]; 619 itemsObjs[0] = value; 620 return value; 621 }// if first time 622 int index = Arrays.binarySearch(itemsKeys, key); 623 if (index<0) 624 { 625 index = ~index; 626 resize(index); 627 itemsKeys[index] = key; 628 itemsObjs[index] = value; 629 } 630 return itemsObjs[index]; 631 } // put 632 /** 633 * the keys itereator 634 * / 635 public Iter iter() 636 { 637 return new Iter() 638 { 639 int counter = 0; 640 public boolean hasNext() {return counter < itemsKeys.length;} 641 public char next() { return itemsKeys[counter];} 642 }; 643 } // iter() 644 */ 645 646 } // class charMap 647 // >>> DAM, end, new charMap instead MAP for transition function in the FSMState
|
DefaultGazetteer |
|