|
XmlDocumentHandler |
|
1 /* 2 * XmlDocumentHandler.java 3 * 4 * Copyright (c) 1998-2001, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Cristian URSU, 9 May 2000 12 * 13 * $Id: XmlDocumentHandler.java,v 1.43 2002/10/29 10:21:51 valyt Exp $ 14 */ 15 16 package gate.xml; 17 18 import java.util.*; 19 20 import gate.corpora.*; 21 import gate.util.*; 22 import gate.*; 23 import gate.event.*; 24 25 26 import org.xml.sax.*; 27 import org.xml.sax.helpers.*; 28 29 30 /** 31 * Implements the behaviour of the XML reader 32 * Methods of an object of this class are called by the SAX parser when 33 * events will appear. 34 * The idea is to parse the XML document and construct Gate annotations 35 * objects. 36 * This class also will replace the content of the Gate document with a 37 * new one containing only text from the XML document. 38 */ 39 public class XmlDocumentHandler extends XmlPositionCorrectionHandler { 40 /** Debug flag */ 41 private static final boolean DEBUG = false; 42 43 /** Keep the refference to this structure */ 44 private RepositioningInfo reposInfo = null; 45 46 /** Keep the refference to this structure */ 47 private RepositioningInfo ampCodingInfo = null; 48 49 /** Set repositioning information structure refference. If you set this 50 * refference to <B>null</B> information wouldn't be collected. 51 */ 52 public void setRepositioningInfo(RepositioningInfo info) { 53 reposInfo = info; 54 } // setRepositioningInfo 55 56 /** Return current RepositioningInfo object */ 57 public RepositioningInfo getRepositioningInfo() { 58 return reposInfo; 59 } // getRepositioningInfo 60 61 /** Set repositioning information structure refference for ampersand coding. 62 * If you set this refference to <B>null</B> information wouldn't be used. 63 */ 64 public void setAmpCodingInfo(RepositioningInfo info) { 65 ampCodingInfo = info; 66 } // setRepositioningInfo 67 68 /** Return current RepositioningInfo object for ampersand coding. */ 69 public RepositioningInfo getAmpCodingInfo() { 70 return ampCodingInfo; 71 } // getRepositioningInfo 72 73 /** 74 * Constructs a XmlDocumentHandler object. The annotationSet set will be the 75 * default one taken from the gate document. 76 * @param aDocument the Gate document that will be processed. 77 * @param aMarkupElementsMap this map contains the elements name that we 78 * want to create. 79 * @param anElement2StringMap this map contains the strings that will be 80 * added to the text contained by the key element. 81 */ 82 public XmlDocumentHandler(gate.Document aDocument, Map aMarkupElementsMap, 83 Map anElement2StringMap){ 84 this(aDocument,aMarkupElementsMap,anElement2StringMap,null); 85 } // XmlDocumentHandler 86 87 /** 88 * Constructs a XmlDocumentHandler object. 89 * @param aDocument the Gate document that will be processed. 90 * @param aMarkupElementsMap this map contains the elements name that we 91 * want to create. 92 * @param anElement2StringMap this map contains the strings that will be 93 * added to the text contained by the key element. 94 * @param anAnnotationSet is the annotation set that will be filled when the 95 * document was processed 96 */ 97 public XmlDocumentHandler(gate.Document aDocument, 98 Map aMarkupElementsMap, 99 Map anElement2StringMap, 100 gate.AnnotationSet anAnnotationSet){ 101 // init parent 102 super(); 103 // init stack 104 stack = new java.util.Stack(); 105 106 // this string contains the plain text (the text without markup) 107 tmpDocContent = new StringBuffer(aDocument.getContent().size().intValue()); 108 109 // colector is used later to transform all custom objects into annotation 110 // objects 111 colector = new LinkedList(); 112 113 // the Gate document 114 doc = aDocument; 115 116 // this map contains the elements name that we want to create 117 // if it's null all the elements from the XML documents will be transformed 118 // into Gate annotation objects 119 markupElementsMap = aMarkupElementsMap; 120 121 // this map contains the string that we want to insert iside the document 122 // content, when a certain element is found 123 // if the map is null then no string is added 124 element2StringMap = anElement2StringMap; 125 126 basicAS = anAnnotationSet; 127 customObjectsId = 0; 128 }// XmlDocumentHandler()/ 129 130 /** 131 * This method is called when the SAX parser encounts the beginning of the 132 * XML document. 133 */ 134 public void startDocument() throws org.xml.sax.SAXException { 135 // init of variables in the parent 136 super.startDocument(); 137 } 138 139 /** 140 * This method is called when the SAX parser encounts the end of the 141 * XML document. 142 * Here we set the content of the gate Document to be the one generated 143 * inside this class (tmpDocContent). 144 * After that we use the colector to generate all the annotation reffering 145 * this new gate document. 146 */ 147 public void endDocument() throws org.xml.sax.SAXException { 148 149 // replace the document content with the one without markups 150 doc.setContent(new DocumentContentImpl(tmpDocContent.toString())); 151 152 // fire the status listener 153 fireStatusChangedEvent("Total elements: " + elements); 154 155 // If basicAs is null then get the default AnnotationSet, 156 // based on the gate document. 157 if (basicAS == null) 158 basicAS=doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME); 159 160 // sort colector ascending on its id 161 Collections.sort(colector); 162 Set testIdsSet = new HashSet(); 163 // create all the annotations (on this new document) from the collector 164 while (!colector.isEmpty()){ 165 CustomObject obj = (CustomObject) colector.getFirst(); 166 // Test to see if there are two annotation objects with the same id. 167 if (testIdsSet.contains(obj.getId())){ 168 throw new GateSaxException("Found two annotations with the same Id("+ 169 obj.getId()+ 170 ").The document is inconsistent."); 171 }else{ 172 testIdsSet.add(obj.getId()); 173 }// End iff 174 // create a new annotation and add it to the annotation set 175 try{ 176 // the annotation type will be conforming with markupElementsMap 177 //add the annotation to the Annotation Set 178 if (markupElementsMap == null) 179 basicAS.add( obj.getId(), 180 obj.getStart(), 181 obj.getEnd(), 182 obj.getElemName(), 183 obj.getFM ()); 184 else { 185 // get the type of the annotation from Map 186 String annotationType = (String) 187 markupElementsMap.get(obj.getElemName()); 188 if (annotationType != null) 189 basicAS.add( obj.getId(), 190 obj.getStart(), 191 obj.getEnd(), 192 annotationType, 193 obj.getFM()); 194 }// End if 195 }catch (gate.util.InvalidOffsetException e){ 196 Err.prln("InvalidOffsetException for annot :" + obj.getElemName() + 197 " with Id =" + obj.getId() + ". Discarded..."); 198 }// End try 199 colector.remove(obj); 200 }// End while 201 }// endDocument(); 202 203 /** 204 * This method is called when the SAX parser encounts the beginning of an 205 * XML element. 206 */ 207 public void startElement (String uri, String qName, String elemName, 208 Attributes atts){ 209 // Inform the progress listener to fire only if no of elements processed 210 // so far is a multiple of ELEMENTS_RATE 211 if ((++elements % ELEMENTS_RATE) == 0) 212 fireStatusChangedEvent("Processed elements : " + elements); 213 214 Integer customObjectId = null; 215 // Construct a SimpleFeatureMapImpl from the list of attributes 216 FeatureMap fm = Factory.newFeatureMap(); 217 //Get the name and the value of the attributes and add them to a FeaturesMAP 218 for (int i = 0; i < atts.getLength(); i++) { 219 String attName = atts.getLocalName(i); 220 String attValue = atts.getValue(i); 221 String attUri = atts.getURI(i); 222 if (attUri != null && Gate.URI.equals(attUri)){ 223 if ("gateId".equals(attName)){ 224 customObjectId = new Integer(attValue); 225 }// End if 226 if ("annotMaxId".equals(attName)){ 227 customObjectsId = new Integer(attValue).intValue(); 228 }// End if 229 if ("matches".equals(attName)){ 230 StringTokenizer strTokenizer = new StringTokenizer(attValue,";"); 231 List list = new ArrayList(); 232 // Take all tokens,create Integers and add them to the list 233 while (strTokenizer.hasMoreTokens()){ 234 String token = strTokenizer.nextToken(); 235 list.add(new Integer(token)); 236 }// End while 237 fm.put(attName,list); 238 }// End if 239 }else{ 240 fm.put(atts.getQName(i), attValue); 241 }// End if 242 }// End for 243 244 // create the START index of the annotation 245 Long startIndex = new Long(tmpDocContent.length()); 246 247 // initialy the Start index is equal with End index 248 CustomObject obj = new CustomObject(customObjectId,elemName,fm, 249 startIndex, startIndex); 250 251 // put this object into the stack 252 stack.push(obj); 253 }// startElement(); 254 255 /** 256 * This method is called when the SAX parser encounts the end of an 257 * XML element. 258 * Here we extract 259 */ 260 public void endElement (String uri, String qName, String elemName ) 261 throws SAXException{ 262 // obj is for internal use 263 CustomObject obj = null; 264 265 // if the stack is not empty, we extract the custom object and delete it 266 if (!stack.isEmpty ()){ 267 obj = (CustomObject) stack.pop(); 268 }// End if 269 270 // Before adding it to the colector, we need to check if is an 271 // emptyAndSpan one. See CustomObject's isEmptyAndSpan field. 272 if (obj.getStart().equals(obj.getEnd())){ 273 // The element had an end tag and its start was equal to its end. Hence 274 // it is anEmptyAndSpan one. 275 obj.getFM().put("isEmptyAndSpan","true"); 276 }// End iff 277 278 // Put the object into colector 279 // Later, when the document ends we will use colector to create all the 280 // annotations 281 colector.add(obj); 282 283 // if element is found on Element2String map, then add the string to the 284 // end of the document content 285 if (element2StringMap != null){ 286 String stringFromMap = null; 287 288 // test to see if element is inside the map 289 // if it is then get the string value and add it to the document content 290 stringFromMap = (String) element2StringMap.get(elemName); 291 if (stringFromMap != null) 292 tmpDocContent.append(stringFromMap); 293 }// End if 294 }// endElement(); 295 296 /** 297 * This method is called when the SAX parser encounts text in the XML doc. 298 * Here we calculate the end indices for all the elements present inside the 299 * stack and update with the new values. For entities, this method is called 300 * separatley regardless of the text sourinding the entity. 301 */ 302 public void characters( char[] text,int start,int length) throws SAXException{ 303 // correction of real offset. Didn't affect on other data. 304 super.characters(text, start, length); 305 // create a string object based on the reported text 306 String content = new String(text, start, length); 307 StringBuffer contentBuffer = new StringBuffer(""); 308 int tmpDocContentSize = tmpDocContent.length(); 309 boolean incrementStartIndex = false; 310 boolean addExtraSpace = true; 311 if ( Gate.getUserConfig().get( 312 GateConstants.DOCUMENT_ADD_SPACE_ON_UNPACK_FEATURE_NAME)!= null) 313 addExtraSpace = 314 Gate.getUserConfig().getBoolean( 315 GateConstants.DOCUMENT_ADD_SPACE_ON_UNPACK_FEATURE_NAME 316 ).booleanValue(); 317 // If the first char of the text just read "text[0]" is NOT whitespace AND 318 // the last char of the tmpDocContent[SIZE-1] is NOT whitespace then 319 // concatenation "tmpDocContent + content" will result into a new different 320 // word... and we want to avoid that, because the tokenizer, gazetter and 321 // Jape work on the raw text and concatenating tokens might be not good. 322 if ( tmpDocContentSize != 0 && 323 content.length() != 0 && 324 !Character.isWhitespace(content.charAt(0)) && 325 !Character.isWhitespace(tmpDocContent.charAt(tmpDocContentSize - 1))){ 326 327 // If we are here it means that a concatenation between the last 328 // token in the tmpDocContent and the content(which doesn't start 329 // with a white space) will be performed. In order to prevent this, 330 // we will add a " " space char in order to assure that the 2 tokens 331 // stay apart. Howerver we will except from this rule the most known 332 // internal entities like &, <, >, etc 333 if ( 334 ( 335 // Testing the length against 1 makes it more likely that 336 // an internal entity was called. characters() gets called for 337 // each entity separately. 338 (content.length() == 1) 339 && 340 (content.charAt(0) == '&' || 341 content.charAt(0) == '<' || 342 content.charAt(0) == '>' || 343 content.charAt(0) == '"' || 344 content.charAt(0) == '\'' 345 ) 346 ) || 347 (tmpDocContent.charAt(tmpDocContentSize - 1) == '&' || 348 tmpDocContent.charAt(tmpDocContentSize - 1) == '<' || 349 tmpDocContent.charAt(tmpDocContentSize - 1) == '>' || 350 tmpDocContent.charAt(tmpDocContentSize - 1) == '"' || 351 tmpDocContent.charAt(tmpDocContentSize - 1) == '\'' 352 )){// do nothing. The content will be appended 353 }else if (!addExtraSpace) { 354 }else 355 { 356 // In all other cases append " " 357 contentBuffer.append(" "); 358 incrementStartIndex = true; 359 }// End if 360 }// End if 361 362 // put the repositioning information 363 if(reposInfo != null) { 364 if(! (start == 0 && length == 1 && text.length <= 2)) { 365 // normal piece of text 366 reposInfo.addPositionInfo(getRealOffset(), content.length(), 367 tmpDocContent.length()+contentBuffer.length(), 368 content.length()); 369 if(DEBUG) { 370 Out.println("Info: "+getRealOffset()+", "+content.length()); 371 Out.println("Start: "+start+" len"+length); 372 } // DEBUG 373 } 374 else { 375 // unicode char or &xxx; coding 376 // Reported from the parser offset is 0 377 // The real offset should be found in the ampCodingInfo structure. 378 379 long lastPosition = 0; 380 RepositioningInfo.PositionInfo pi; 381 382 if(reposInfo.size() > 0) { 383 pi = 384 (RepositioningInfo.PositionInfo) reposInfo.get(reposInfo.size()-1); 385 lastPosition = pi.getOriginalPosition(); 386 } // if 387 388 for(int i = 0; i < ampCodingInfo.size(); ++i) { 389 pi = (RepositioningInfo.PositionInfo) ampCodingInfo.get(i); 390 if(pi.getOriginalPosition() > lastPosition) { 391 // found 392 reposInfo.addPositionInfo(pi.getOriginalPosition(), 393 pi.getOriginalLength(), 394 tmpDocContent.length()+contentBuffer.length(), 395 content.length()); 396 break; 397 } // if 398 } // for 399 } // if 400 } // if 401 402 // update the document content 403 contentBuffer.append(content); 404 // calculate the End index for all the elements of the stack 405 // the expression is : End index = Current doc length + text length 406 Long end = new Long(tmpDocContent.length() + contentBuffer.length()); 407 408 CustomObject obj = null; 409 // Iterate through stack to modify the End index of the existing elements 410 411 java.util.Iterator anIterator = stack.iterator(); 412 while (anIterator.hasNext ()){ 413 // get the object and move to the next one 414 obj = (CustomObject) anIterator.next (); 415 if (incrementStartIndex && obj.getStart().equals(obj.getEnd())){ 416 obj.setStart(new Long(obj.getStart().longValue() + 1)); 417 }// End if 418 // sets its End index 419 obj.setEnd(end); 420 }// End while 421 422 tmpDocContent.append(contentBuffer.toString()); 423 }// characters(); 424 425 /** 426 * This method is called when the SAX parser encounts white spaces 427 */ 428 public void ignorableWhitespace(char ch[],int start,int length) throws 429 SAXException{ 430 431 // internal String object 432 String text = new String(ch, start, length); 433 // if the last character in tmpDocContent is \n and the read whitespace is 434 // \n then don't add it to tmpDocContent... 435 436 if (tmpDocContent.length () != 0) 437 if (tmpDocContent.charAt (tmpDocContent.length () - 1) != '\n' || 438 !text.equalsIgnoreCase("\n") 439 ) 440 tmpDocContent.append(text); 441 } 442 443 /** 444 * Error method.We deal with this exception inside SimpleErrorHandler class 445 */ 446 public void error(SAXParseException ex) throws SAXException { 447 // deal with a SAXParseException 448 // see SimpleErrorhandler class 449 _seh.error(ex); 450 } 451 452 /** 453 * FatalError method. 454 */ 455 public void fatalError(SAXParseException ex) throws SAXException { 456 // deal with a SAXParseException 457 // see SimpleErrorhandler class 458 _seh.fatalError(ex); 459 } 460 461 /** 462 * Warning method comment. 463 */ 464 public void warning(SAXParseException ex) throws SAXException { 465 // deal with a SAXParseException 466 // see SimpleErrorhandler class 467 _seh.warning(ex); 468 } 469 470 /** 471 * This method is called when the SAX parser encounts a comment 472 * It works only if the XmlDocumentHandler implements a 473 * com.sun.parser.LexicalEventListener 474 */ 475 public void comment(String text) throws SAXException { 476 // create a FeatureMap and then add the comment to the annotation set. 477 /* 478 gate.util.SimpleFeatureMapImpl fm = new gate.util.SimpleFeatureMapImpl(); 479 fm.put ("text_comment",text); 480 Long node = new Long (tmpDocContent.length()); 481 CustomObject anObject = new CustomObject("Comment",fm,node,node); 482 colector.add(anObject); 483 */ 484 } 485 486 /** 487 * This method is called when the SAX parser encounts a start of a CDATA 488 * section 489 * It works only if the XmlDocumentHandler implements a 490 * com.sun.parser.LexicalEventListener 491 */ 492 public void startCDATA()throws SAXException { 493 } 494 495 /** 496 * This method is called when the SAX parser encounts the end of a CDATA 497 * section. 498 * It works only if the XmlDocumentHandler implements a 499 * com.sun.parser.LexicalEventListener 500 */ 501 public void endCDATA() throws SAXException { 502 } 503 504 /** 505 * This method is called when the SAX parser encounts a parsed Entity 506 * It works only if the XmlDocumentHandler implements a 507 * com.sun.parser.LexicalEventListener 508 */ 509 public void startParsedEntity(String name) throws SAXException { 510 } 511 512 /** 513 * This method is called when the SAX parser encounts a parsed entity and 514 * informs the application if that entity was parsed or not 515 * It's working only if the CustomDocumentHandler implements a 516 * com.sun.parser.LexicalEventListener 517 */ 518 public void endParsedEntity(String name, boolean included)throws SAXException{ 519 } 520 521 //StatusReporter Implementation 522 523 /** 524 * This methos is called when a listener is registered with this class 525 */ 526 public void addStatusListener(StatusListener listener){ 527 myStatusListeners.add(listener); 528 } 529 /** 530 * This methos is called when a listener is removed 531 */ 532 public void removeStatusListener(StatusListener listener){ 533 myStatusListeners.remove(listener); 534 } 535 /** 536 * This methos is called whenever we need to inform the listener about an 537 * event. 538 */ 539 protected void fireStatusChangedEvent(String text){ 540 Iterator listenersIter = myStatusListeners.iterator(); 541 while(listenersIter.hasNext()) 542 ((StatusListener)listenersIter.next()).statusChanged(text); 543 } 544 545 /** This method is a workaround of the java 4 non namespace supporting parser 546 * It receives a qualified name and returns its local name. 547 * For eg. if it receives gate:gateId it will return gateId 548 */ 549 private String getMyLocalName(String aQName){ 550 if (aQName == null) return ""; 551 StringTokenizer strToken = new StringTokenizer(aQName,":"); 552 if (strToken.countTokens()<= 1) return aQName; 553 // The nr of tokens is >= than 2 554 // Skip the first token which is the QName 555 strToken.nextToken(); 556 return strToken.nextToken(); 557 }//getMyLocalName() 558 559 /** Also a workaround for URI identifier. If the QName is gate it will return 560 * GATE's. Otherwhise it will return the empty string 561 */ 562 private String getMyURI(String aQName){ 563 if (aQName == null) return ""; 564 StringTokenizer strToken = new StringTokenizer(aQName,":"); 565 if (strToken.countTokens()<= 1) return ""; 566 // If first token is "gate" then return GATE's URI 567 if ("gate".equalsIgnoreCase(strToken.nextToken())) 568 return Gate.URI; 569 return ""; 570 }// getMyURI() 571 572 // XmlDocumentHandler member data 573 574 // this constant indicates when to fire the status listener 575 // this listener will add an overhead and we don't want a big overhead 576 // this listener will be callled from ELEMENTS_RATE to ELEMENTS_RATE 577 final static int ELEMENTS_RATE = 128; 578 579 // this map contains the elements name that we want to create 580 // if it's null all the elements from the XML documents will be transformed 581 // into Gate annotation objects otherwise only the elements it contains will 582 // be transformed 583 private Map markupElementsMap = null; 584 585 // this map contains the string that we want to insert iside the document 586 // content, when a certain element is found 587 // if the map is null then no string is added 588 private Map element2StringMap = null; 589 590 /**This object inducates what to do when the parser encounts an error*/ 591 private SimpleErrorHandler _seh = new SimpleErrorHandler(); 592 593 /**The content of the XML document, without any tag for internal use*/ 594 private StringBuffer tmpDocContent = null; 595 596 /**A stack used to remember elements and to keep the order */ 597 private java.util.Stack stack = null; 598 599 /**A gate document */ 600 private gate.Document doc = null; 601 602 /**An annotation set used for creating annotation reffering the doc */ 603 private gate.AnnotationSet basicAS = null; 604 605 /**Listeners for status report */ 606 protected List myStatusListeners = new LinkedList(); 607 608 /**This reports the the number of elements that have beed processed so far*/ 609 private int elements = 0; 610 611 /** We need a colection to retain all the CustomObjects that will be 612 * transformed into annotation over the gate document... 613 * the transformation will take place inside onDocumentEnd() method 614 */ 615 private LinkedList colector = null; 616 617 /** This is used to generate unique Ids for the CustomObjects read*/ 618 protected int customObjectsId = 0; 619 620 /** Accesor method for the customObjectsId field*/ 621 public int getCustomObjectsId(){ return customObjectsId;} 622 623 //////// INNER CLASS 624 /** 625 * The objects belonging to this class are used inside the stack. 626 * This class is for internal needs 627 */ 628 class CustomObject implements Comparable { 629 630 // constructor 631 public CustomObject(Integer anId,String anElemName, FeatureMap aFm, 632 Long aStart, Long anEnd) { 633 elemName = anElemName; 634 fm = aFm; 635 start = aStart; 636 end = anEnd; 637 if (anId == null){ 638 id = new Integer(customObjectsId ++); 639 }else{ 640 id = anId; 641 if (customObjectsId <= anId.intValue()) 642 customObjectsId = anId.intValue() + 1 ; 643 }// End if 644 }// End CustomObject() 645 646 // Methos implemented as required by Comparable interface 647 public int compareTo(Object o){ 648 CustomObject obj = (CustomObject) o; 649 return this.id.compareTo(obj.getId()); 650 }// compareTo(); 651 652 // accesor 653 public String getElemName() { 654 return elemName; 655 }// getElemName() 656 657 public FeatureMap getFM() { 658 return fm; 659 }// getFM() 660 661 public Long getStart() { 662 return start; 663 }// getStart() 664 665 public Long getEnd() { 666 return end; 667 }// getEnd() 668 669 public Integer getId(){ return id;} 670 671 // mutator 672 public void setElemName(String anElemName) { 673 elemName = anElemName; 674 }// getElemName() 675 676 public void setFM(FeatureMap aFm) { 677 fm = aFm; 678 }// setFM(); 679 680 public void setStart(Long aStart) { 681 start = aStart; 682 }// setStart(); 683 684 public void setEnd(Long anEnd) { 685 end = anEnd; 686 }// setEnd(); 687 688 // data fields 689 private String elemName = null; 690 private FeatureMap fm = null; 691 private Long start = null; 692 private Long end = null; 693 private Integer id = null; 694 695 } // End inner class CustomObject 696 697 } //XmlDocumentHandler 698 699 700 701
|
XmlDocumentHandler |
|