|
DefaultGazetteer |
|
/*
 * DefaultGazetteer.java
 *
 * Copyright (c) 2000-2001, The University of Sheffield.
 *
 * This file is part of GATE (see http://gate.ac.uk/), and is free
 * software, licenced under the GNU Library General Public License,
 * Version 2, June1991.
 *
 * A copy of this licence is included in the distribution in the file
 * licence.html, and is also available at http://gate.ac.uk/gate/licence.html.
 *
 * Valentin Tablan, 03/07/2000
 * borislav popov 24/03/2002
 *
 * $Id: DefaultGazetteer.java,v 1.42 2002/06/26 14:23:22 nasso Exp $
 */
package gate.creole.gazetteer;

import java.io.*;
import java.util.*;
import java.net.*;

import gate.util.*;
import gate.creole.*;
import gate.event.*;
import gate.*;

/** This component is responsible for doing list lookup. The implementation
 * is based on finite state machines.
 * The phrases to be recognised should be listed in a set of files, one for
 * each type of occurrence.
 * The gazetteer is built with the information from a file that contains the
 * set of lists (which are files as well) and the associated type for each
 * list.
 * The file defining the set of lists should have the following syntax:
 * each list definition should be written on its own line and should contain:
 * <ol>
 * <li>the file name (required) </li>
 * <li>the major type (required) </li>
 * <li>the minor type (optional)</li>
 * <li>the language(s) (optional) </li>
 * </ol>
 * The elements of each definition are separated by ":".
 * The following is an example of a valid definition: <br>
 * <code>personmale.lst:person:male:english</code>
 * Each list file named in the lists definition file is just a list containing
 * one entry per line.
 * When this gazetteer is run over some input text (a GATE document) it
 * will generate annotations of type Lookup having the attributes specified in
 * the definition file.
 */
public class DefaultGazetteer extends AbstractGazetteer {

  /** Debug flag
   */
  private static final boolean DEBUG = false;

  public static final String
    DEF_GAZ_DOCUMENT_PARAMETER_NAME = "document";

  public static final String
    DEF_GAZ_ANNOT_SET_PARAMETER_NAME = "annotationSetName";

  public static final String
    DEF_GAZ_LISTS_URL_PARAMETER_NAME = "listsURL";

  public static final String
    DEF_GAZ_ENCODING_PARAMETER_NAME = "encoding";

  public static final String
    DEF_GAZ_CASE_SENSITIVE_PARAMETER_NAME = "caseSensitive";


  /** a map of nodes vs gaz lists */
  private Map listsByNode;

  /** The initial state of the FSM that backs this gazetteer
   */
  FSMState initialState;

  /** A set containing all the states of the FSM backing the gazetteer
   */
  Set fsmStates;

  /** Builds an empty gazetteer; {@link #init()} must be called before the
   * gazetteer can be used.
   */
  public DefaultGazetteer(){
  }

  /** Does the actual loading and parsing of the lists. This method must be
   * called before the gazetteer can be used.
   *
   * @return this gazetteer
   * @throws ResourceInstantiationException if no lists URL has been set, or
   * if one of the lists cannot be read
   */
  public Resource init() throws ResourceInstantiationException {
    fsmStates = new HashSet();
    initialState = new FSMState(this);
    if(listsURL == null){
      throw new ResourceInstantiationException (
            "No URL provided for gazetteer creation!");
    }
    definition = new LinearDefinition();
    definition.setURL(listsURL);
    definition.load();
    int linesCnt = definition.size();
    listsByNode = definition.loadLists();

    int nodeIdx = 0;
    Iterator inodes = definition.iterator();
    while(inodes.hasNext()) {
      LinearNode node = (LinearNode) inodes.next();
      fireStatusChanged("Reading " + node.toString());
      fireProgressChanged(++nodeIdx * 100 / linesCnt);
      readList(node, true);
    } // while inodes
    fireProcessFinished();
    return this;
  }


  /** Reads one list (one file) of phrases.
   *
   * @param node the node of the lists definition that describes the list
   * @param add if <b>true</b> will add the phrases found in the list to the
   * ones recognised by this gazetteer, if <b>false</b> the phrases found in
   * the list will be removed from the list of phrases recognised by this
   * gazetteer.
   * @throws ResourceInstantiationException if the node is null or no list has
   * been loaded for it
   */
  void readList(LinearNode node, boolean add)
      throws ResourceInstantiationException {
    if(null == node) {
      throw new ResourceInstantiationException(" LinearNode node is null ");
    }

    String listName = node.getList();
    String majorType = node.getMajorType();
    String minorType = node.getMinorType();
    String languages = node.getLanguage();
    GazetteerList gazList = (GazetteerList)listsByNode.get(node);
    if(null == gazList) {
      throw new ResourceInstantiationException(
            "gazetteer list not found by node");
    }

    // one Lookup instance is shared by every phrase of the list
    Lookup lookup = new Lookup(listName, majorType, minorType, languages);
    lookup.list = listName;
    if(null != mappingDefinition){
      MappingNode mnode = mappingDefinition.getNodeByList(lookup.list);
      if(null != mnode){
        lookup.oClass = mnode.getClassID();
        lookup.ontology = mnode.getOntologyID();
      }
    }//if mapping def

    Iterator iline = gazList.iterator();
    while(iline.hasNext()){
      String line = iline.next().toString();
      if(add) addLookup(line, lookup);
      else removeLookup(line, lookup);
    }
  } // void readList(LinearNode node, boolean add)

  /** Maps a character to the form used as a transition key in the FSM: every
   * whitespace character becomes a single space and, when the gazetteer is
   * not case sensitive, every other character is converted to upper case.
   *
   * @param c the raw character
   * @return the key to be used for the FSM transition
   */
  private char normalize(char c) {
    if(Character.isWhitespace(c)) return ' ';
    return caseSensitive.booleanValue() ? c : Character.toUpperCase(c);
  }

  /** Walks the FSM from the initial state along the (normalised) characters
   * of the given text.
   *
   * @param text the phrase to follow
   * @return the state reached after consuming the whole text, or
   * <code>null</code> if there is no such path in the FSM
   */
  private FSMState findState(String text) {
    FSMState currentState = initialState;
    for(int i = 0; i < text.length() && currentState != null; i++) {
      currentState = currentState.next(normalize(text.charAt(i)));
    }
    return currentState;
  }

  /** Adds one phrase to the list of phrases recognised by this gazetteer
   *
   * @param text the phrase to be added
   * @param lookup the description of the annotation to be added when this
   * phrase is recognised
   */
  public void addLookup(String text, Lookup lookup) {
    FSMState currentState = initialState;

    for(int i = 0; i < text.length(); i++) {
      boolean isSpace = Character.isWhitespace(text.charAt(i));
      char currentChar = normalize(text.charAt(i));
      FSMState nextState = currentState.next(currentChar);
      if(nextState == null){
        nextState = new FSMState(this);
        currentState.put(currentChar, nextState);
        // a state reached through a space loops on itself for further spaces,
        // so any run of whitespace in the input matches a single space here
        if(isSpace) nextState.put(' ', nextState);
      }
      currentState = nextState;
    } //for(int i = 0; i< text.length(); i++)

    currentState.addLookup(lookup);
  } // addLookup

  /** Removes one phrase from the list of phrases recognised by this gazetteer.
   * The phrase is normalised in the same way as in
   * {@link #addLookup(String, Lookup)}, so in case-insensitive mode the case
   * of the supplied text does not matter.
   *
   * @param text the phrase to be removed
   * @param lookup the description of the annotation associated to this phrase
   */
  public void removeLookup(String text, Lookup lookup) {
    FSMState state = findState(text);
    if(state == null) return;//nothing to remove
    state.removeLookup(lookup);
  } // removeLookup

  /** Returns a string representation of the deterministic FSM graph using
   * GML.
   *
   * @return the GML description of the graph
   */
  public String getFSMgml() {
    StringBuffer nodes = new StringBuffer(gate.Gate.STRINGBUFFER_SIZE);
    StringBuffer edges = new StringBuffer(gate.Gate.STRINGBUFFER_SIZE);
    Iterator fsmStatesIter = fsmStates.iterator();
    while(fsmStatesIter.hasNext()){
      FSMState currentState = (FSMState)fsmStatesIter.next();
      int stateIndex = currentState.getIndex();
      nodes.append("node[ id ");
      nodes.append(stateIndex);
      nodes.append(" label \"");
      nodes.append(stateIndex);
      if(currentState.isFinal()){
        nodes.append(",F\\n");
        nodes.append(currentState.getLookupSet());
      }
      nodes.append("\" ]\n");
      edges.append(currentState.getEdgesGML());
    }
    return "graph[ \ndirected 1\n" + nodes.toString() + edges.toString() +
           "]\n";
  } // getFSMgml

  /** Creates one Lookup annotation for each lookup attached to the given
   * final state.
   *
   * @param annotationSet the set the new annotations are added to
   * @param matchingState the final state whose lookups are to be annotated
   * @param regionStart offset of the first matched character
   * @param regionEnd offset of the last matched character (inclusive)
   */
  private void addLookupAnnotations(AnnotationSet annotationSet,
                                    FSMState matchingState,
                                    int regionStart, int regionEnd) {
    Iterator lookupIter = matchingState.getLookupSet().iterator();
    while(lookupIter.hasNext()) {
      Lookup currentLookup = (Lookup)lookupIter.next();
      FeatureMap fm = Factory.newFeatureMap();
      fm.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME, currentLookup.majorType);
      if(null != currentLookup.oClass && null != currentLookup.ontology){
        fm.put(LOOKUP_CLASS_FEATURE_NAME, currentLookup.oClass);
        fm.put(LOOKUP_ONTOLOGY_FEATURE_NAME, currentLookup.ontology);
      }
      if(null != currentLookup.minorType) {
        fm.put(LOOKUP_MINOR_TYPE_FEATURE_NAME, currentLookup.minorType);
        // the language is only recorded when a minor type is present
        if(null != currentLookup.languages)
          fm.put("language", currentLookup.languages);
      }
      try {
        // the end offset passed to the annotation set is exclusive
        annotationSet.add(new Long(regionStart),
                          new Long(regionEnd + 1),
                          LOOKUP_ANNOTATION_TYPE,
                          fm);
      } catch(InvalidOffsetException ioe) {
        throw new LuckyException(ioe.toString());
      }
    }//while(lookupIter.hasNext())
  }

  /**
   * This method runs the gazetteer. It assumes that all the needed parameters
   * are set. If they are not, an exception will be fired.
   * <p>
   * The document content is scanned from left to right. From each start
   * position the FSM is followed as far as possible, remembering the last
   * final state that was reached on a boundary where neither the character
   * before the match nor the character after it is a letter. When the FSM
   * cannot advance (or the end of the content is reached) the remembered match
   * is annotated and matching restarts from the character following the start
   * of the previous attempt.
   *
   * @throws ExecutionException if no document has been set or the execution
   * has been interrupted
   */
  public void execute() throws ExecutionException {
    interrupted = false;
    //check the input
    if(document == null) {
      throw new ExecutionException(
        "No document to process!"
      );
    }

    AnnotationSet annotationSet;
    if(annotationSetName == null ||
       annotationSetName.equals("")) annotationSet = document.getAnnotations();
    else annotationSet = document.getAnnotations(annotationSetName);

    fireStatusChanged("Doing lookup in " +
                      document.getName() + "...");
    String content = document.getContent().toString();
    int length = content.length();

    FSMState currentState = initialState;
    FSMState lastMatchingState = null;
    int matchedRegionEnd = 0;
    int matchedRegionStart = 0;
    int charIdx = 0;
    int oldCharIdx = 0;

    while(charIdx < length) {
      FSMState nextState = currentState.next(normalize(content.charAt(charIdx)));
      boolean restart;
      if(nextState == null) {
        //the matching stopped
        restart = true;
      } else {
        //go on with the matching
        currentState = nextState;
        //if we have a successful state then store it
        if(currentState.isFinal() &&
           (matchedRegionStart == 0 ||
            !Character.isLetter(content.charAt(matchedRegionStart - 1))) &&
           (charIdx + 1 >= length ||
            !Character.isLetter(content.charAt(charIdx + 1)))
          ){
          matchedRegionEnd = charIdx;
          lastMatchingState = currentState;
        }
        charIdx++;
        //at the end of the content we can't go on: use the last matching
        //state and restart matching from the next char
        restart = (charIdx == length);
      }

      if(restart) {
        //if we had a successful match then act on it
        if(lastMatchingState != null){
          addLookupAnnotations(annotationSet, lastMatchingState,
                               matchedRegionStart, matchedRegionEnd);
          lastMatchingState = null;
        }
        //reset the FSM
        charIdx = matchedRegionStart + 1;
        matchedRegionStart = charIdx;
        currentState = initialState;
      }

      if(charIdx - oldCharIdx > 256) {
        fireProgressChanged((100 * charIdx )/ length );
        oldCharIdx = charIdx;
        if(isInterrupted()) throw new ExecutionInterruptedException(
            "The execution of the " + getName() +
            " gazetteer has been abruptly interrupted!");
      }
    } // while(charIdx < length)

    // No flush is needed here: every path that lets the loop terminate goes
    // through the restart step above, which has already annotated and cleared
    // any pending match.
    fireProcessFinished();
    fireStatusChanged("Lookup complete!");
  } // execute


  /** Looks up a single phrase. The phrase is normalised in the same way as
   * in {@link #addLookup(String, Lookup)}.
   *
   * @param singleItem a single string to be looked up by the gazetteer
   * @return an empty set if the FSM has no path for the phrase; otherwise the
   * lookup set of the state reached (NOTE(review): whether that can be
   * <code>null</code> for a non-final state depends on FSMState - confirm)
   */
  public Set lookup(String singleItem) {
    FSMState state = findState(singleItem);
    if(state == null) {
      return new HashSet();
    }
    return state.getLookupSet();
  }

  /** Removes all the lookups associated with a phrase. The phrase is
   * normalised in the same way as in {@link #addLookup(String, Lookup)}.
   *
   * @param singleItem the phrase to be removed
   * @return <code>false</code> if the FSM has no path for the phrase,
   * <code>true</code> otherwise
   */
  public boolean remove(String singleItem) {
    FSMState state = findState(singleItem);
    if(state == null) {
      return false;
    }//nothing to remove
    state.lookupSet = new HashSet();
    return true;
  }

  /** Adds a phrase; delegates to {@link #addLookup(String, Lookup)}.
   *
   * @param singleItem the phrase to be added
   * @param lookup the lookup to be associated with the phrase
   * @return always <code>true</code>
   */
  public boolean add(String singleItem, Lookup lookup) {
    addLookup(singleItem, lookup);
    return true;
  }


} // DefaultGazetteer

// >>> DAM: TransArray optimization, new charMap implementation
/** Iterator over primitive char values. */
interface Iter
{
    public boolean hasNext();
    public char next();
} // iter class

/**
 * Class implementing a map that uses binary search by char key
 * to retrieve the corresponding object. The keys are kept sorted.
 */
class charMap
{
    char[] itemsKeys = null;
    Object[] itemsObjs = null;

    /**
     * Grows the containers by one, leaving an empty element at position
     * 'index'.
     */
    void resize(int index)
    {
        int oldsz = itemsKeys.length;
        char[] tempKeys = new char[oldsz + 1];
        Object[] tempObjs = new Object[oldsz + 1];
        // elements before the gap keep their position...
        System.arraycopy(itemsKeys, 0, tempKeys, 0, index);
        System.arraycopy(itemsObjs, 0, tempObjs, 0, index);
        // ...the remaining ones are shifted right by one
        System.arraycopy(itemsKeys, index, tempKeys, index + 1, oldsz - index);
        System.arraycopy(itemsObjs, index, tempObjs, index + 1, oldsz - index);

        itemsKeys = tempKeys;
        itemsObjs = tempObjs;
    } // resize

    /**
     * Gets the object from the map using the char key.
     *
     * @return the object stored for the key, or null if there is none
     */
    Object get(char key)
    {
        if (itemsKeys == null) return null;
        int index = Arrays.binarySearch(itemsKeys, key);
        if (index<0)
            return null;
        return itemsObjs[index];
    }

    /**
     * Puts the object into the char map using the char as the key. If the
     * key is already present the stored object is NOT replaced.
     *
     * @return the object stored for the key after the call
     */
    Object put(char key, Object value)
    {
        if (itemsKeys == null)
        {
            itemsKeys = new char[1];
            itemsKeys[0] = key;
            itemsObjs = new Object[1];
            itemsObjs[0] = value;
            return value;
        }// if first time
        int index = Arrays.binarySearch(itemsKeys, key);
        if (index<0)
        {
            // binarySearch returns (-(insertion point) - 1) for a missing key
            index = ~index;
            resize(index);
            itemsKeys[index] = key;
            itemsObjs[index] = value;
        }
        return itemsObjs[index];
    } // put

    /*
     * The keys iterator. Disabled: in the original source this method was
     * enclosed in a comment. NOTE(review): next() never advances the
     * counter, so it would have to be fixed before being re-enabled.
     *
    public Iter iter()
    {
        return new Iter()
        {
            int counter = 0;
            public boolean hasNext() {return counter < itemsKeys.length;}
            public char next() { return itemsKeys[counter];}
        };
    } // iter()
     */

} // class charMap
// >>> DAM, end, new charMap instead MAP for transition function in the FSMState
|
DefaultGazetteer |
|