|
XmlDocumentHandler |
|
1 /* 2 * XmlDocumentHandler.java 3 * 4 * Copyright (c) 1998-2001, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Cristian URSU, 9 May 2000 12 * 13 * $Id: XmlDocumentHandler.java,v 1.40 2002/03/19 14:47:33 nasso Exp $ 14 */ 15 16 package gate.xml; 17 18 import java.util.*; 19 20 import gate.corpora.*; 21 import gate.util.*; 22 import gate.*; 23 import gate.event.*; 24 25 26 import org.xml.sax.*; 27 import org.xml.sax.helpers.*; 28 29 30 /** 31 * Implements the behaviour of the XML reader 32 * Methods of an object of this class are called by the SAX parser when 33 * events will appear. 34 * The idea is to parse the XML document and construct Gate annotations 35 * objects. 36 * This class also will replace the content of the Gate document with a 37 * new one containing only text from the XML document. 38 */ 39 public class XmlDocumentHandler extends XmlPositionCorrectionHandler { 40 /** Debug flag */ 41 private static final boolean DEBUG = false; 42 43 /** Keep the refference to this structure */ 44 private RepositioningInfo reposInfo = null; 45 46 /** Keep the refference to this structure */ 47 private RepositioningInfo ampCodingInfo = null; 48 49 /** Set repositioning information structure refference. If you set this 50 * refference to <B>null</B> information wouldn't be collected. 51 */ 52 public void setRepositioningInfo(RepositioningInfo info) { 53 reposInfo = info; 54 } // setRepositioningInfo 55 56 /** Return current RepositioningInfo object */ 57 public RepositioningInfo getRepositioningInfo() { 58 return reposInfo; 59 } // getRepositioningInfo 60 61 /** Set repositioning information structure refference for ampersand coding. 62 * If you set this refference to <B>null</B> information wouldn't be used. 63 */ 64 public void setAmpCodingInfo(RepositioningInfo info) { 65 ampCodingInfo = info; 66 } // setRepositioningInfo 67 68 /** Return current RepositioningInfo object for ampersand coding. */ 69 public RepositioningInfo getAmpCodingInfo() { 70 return ampCodingInfo; 71 } // getRepositioningInfo 72 73 /** 74 * Constructs a XmlDocumentHandler object. The annotationSet set will be the 75 * default one taken from the gate document. 76 * @param aDocument the Gate document that will be processed. 77 * @param aMarkupElementsMap this map contains the elements name that we 78 * want to create. 79 * @param anElement2StringMap this map contains the strings that will be 80 * added to the text contained by the key element. 81 */ 82 public XmlDocumentHandler(gate.Document aDocument, Map aMarkupElementsMap, 83 Map anElement2StringMap){ 84 this(aDocument,aMarkupElementsMap,anElement2StringMap,null); 85 } // XmlDocumentHandler 86 87 /** 88 * Constructs a XmlDocumentHandler object. 89 * @param aDocument the Gate document that will be processed. 90 * @param aMarkupElementsMap this map contains the elements name that we 91 * want to create. 92 * @param anElement2StringMap this map contains the strings that will be 93 * added to the text contained by the key element. 94 * @param anAnnotationSet is the annotation set that will be filled when the 95 * document was processed 96 */ 97 public XmlDocumentHandler(gate.Document aDocument, 98 Map aMarkupElementsMap, 99 Map anElement2StringMap, 100 gate.AnnotationSet anAnnotationSet){ 101 // init parent 102 super(); 103 // init stack 104 stack = new java.util.Stack(); 105 106 // this string contains the plain text (the text without markup) 107 tmpDocContent = new StringBuffer(aDocument.getContent().size().intValue()); 108 109 // colector is used later to transform all custom objects into annotation 110 // objects 111 colector = new LinkedList(); 112 113 // the Gate document 114 doc = aDocument; 115 116 // this map contains the elements name that we want to create 117 // if it's null all the elements from the XML documents will be transformed 118 // into Gate annotation objects 119 markupElementsMap = aMarkupElementsMap; 120 121 // this map contains the string that we want to insert iside the document 122 // content, when a certain element is found 123 // if the map is null then no string is added 124 element2StringMap = anElement2StringMap; 125 126 basicAS = anAnnotationSet; 127 customObjectsId = 0; 128 }// XmlDocumentHandler()/ 129 130 /** 131 * This method is called when the SAX parser encounts the beginning of the 132 * XML document. 133 */ 134 public void startDocument() throws org.xml.sax.SAXException { 135 // init of variables in the parent 136 super.startDocument(); 137 } 138 139 /** 140 * This method is called when the SAX parser encounts the end of the 141 * XML document. 142 * Here we set the content of the gate Document to be the one generated 143 * inside this class (tmpDocContent). 144 * After that we use the colector to generate all the annotation reffering 145 * this new gate document. 146 */ 147 public void endDocument() throws org.xml.sax.SAXException { 148 149 // replace the document content with the one without markups 150 doc.setContent(new DocumentContentImpl(tmpDocContent.toString())); 151 152 // fire the status listener 153 fireStatusChangedEvent("Total elements: " + elements); 154 155 // If basicAs is null then get the default AnnotationSet, 156 // based on the gate document. 157 if (basicAS == null) 158 basicAS=doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME); 159 160 // sort colector ascending on its id 161 Collections.sort(colector); 162 Set testIdsSet = new HashSet(); 163 // create all the annotations (on this new document) from the collector 164 while (!colector.isEmpty()){ 165 CustomObject obj = (CustomObject) colector.getFirst(); 166 // Test to see if there are two annotation objects with the same id. 167 if (testIdsSet.contains(obj.getId())){ 168 throw new GateSaxException("Found two annotations with the same Id("+ 169 obj.getId()+ 170 ").The document is inconsistent."); 171 }else{ 172 testIdsSet.add(obj.getId()); 173 }// End iff 174 // create a new annotation and add it to the annotation set 175 try{ 176 // the annotation type will be conforming with markupElementsMap 177 //add the annotation to the Annotation Set 178 if (markupElementsMap == null) 179 basicAS.add( obj.getId(), 180 obj.getStart(), 181 obj.getEnd(), 182 obj.getElemName(), 183 obj.getFM ()); 184 else { 185 // get the type of the annotation from Map 186 String annotationType = (String) 187 markupElementsMap.get(obj.getElemName()); 188 if (annotationType != null) 189 basicAS.add( obj.getId(), 190 obj.getStart(), 191 obj.getEnd(), 192 annotationType, 193 obj.getFM()); 194 }// End if 195 }catch (gate.util.InvalidOffsetException e){ 196 Err.prln("InvalidOffsetException for annot :" + obj.getElemName() + 197 " with Id =" + obj.getId() + ". Discarded..."); 198 }// End try 199 colector.remove(obj); 200 }// End while 201 }// endDocument(); 202 203 /** 204 * This method is called when the SAX parser encounts the beginning of an 205 * XML element. 206 */ 207 public void startElement (String uri, String qName, String elemName, 208 Attributes atts){ 209 // Inform the progress listener to fire only if no of elements processed 210 // so far is a multiple of ELEMENTS_RATE 211 if ((++elements % ELEMENTS_RATE) == 0) 212 fireStatusChangedEvent("Processed elements : " + elements); 213 214 Integer customObjectId = null; 215 // Construct a SimpleFeatureMapImpl from the list of attributes 216 FeatureMap fm = Factory.newFeatureMap(); 217 //Get the name and the value of the attributes and add them to a FeaturesMAP 218 for (int i = 0; i < atts.getLength(); i++) { 219 String attName = atts.getLocalName(i); 220 String attValue = atts.getValue(i); 221 String attUri = atts.getURI(i); 222 if (attUri != null && Gate.URI.equals(attUri)){ 223 if ("gateId".equals(attName)){ 224 customObjectId = new Integer(attValue); 225 }// End if 226 if ("annotMaxId".equals(attName)){ 227 customObjectsId = new Integer(attValue).intValue(); 228 }// End if 229 if ("matches".equals(attName)){ 230 StringTokenizer strTokenizer = new StringTokenizer(attValue,";"); 231 List list = new ArrayList(); 232 // Take all tokens,create Integers and add them to the list 233 while (strTokenizer.hasMoreTokens()){ 234 String token = strTokenizer.nextToken(); 235 list.add(new Integer(token)); 236 }// End while 237 fm.put(attName,list); 238 }// End if 239 }else{ 240 fm.put(attName,attValue); 241 }// End if 242 }// End for 243 244 // create the START index of the annotation 245 Long startIndex = new Long(tmpDocContent.length()); 246 247 // initialy the Start index is equal with End index 248 CustomObject obj = new CustomObject(customObjectId,elemName,fm, 249 startIndex, startIndex); 250 251 // put this object into the stack 252 stack.push(obj); 253 }// startElement(); 254 255 /** 256 * This method is called when the SAX parser encounts the end of an 257 * XML element. 258 * Here we extract 259 */ 260 public void endElement (String uri, String qName, String elemName ) 261 throws SAXException{ 262 // obj is for internal use 263 CustomObject obj = null; 264 265 // if the stack is not empty, we extract the custom object and delete it 266 if (!stack.isEmpty ()){ 267 obj = (CustomObject) stack.pop(); 268 }// End if 269 270 // Before adding it to the colector, we need to check if is an 271 // emptyAndSpan one. See CustomObject's isEmptyAndSpan field. 272 if (obj.getStart().equals(obj.getEnd())){ 273 // The element had an end tag and its start was equal to its end. Hence 274 // it is anEmptyAndSpan one. 275 obj.getFM().put("isEmptyAndSpan","true"); 276 }// End iff 277 278 // Put the object into colector 279 // Later, when the document ends we will use colector to create all the 280 // annotations 281 colector.add(obj); 282 283 // if element is found on Element2String map, then add the string to the 284 // end of the document content 285 if (element2StringMap != null){ 286 String stringFromMap = null; 287 288 // test to see if element is inside the map 289 // if it is then get the string value and add it to the document content 290 stringFromMap = (String) element2StringMap.get(elemName); 291 if (stringFromMap != null) 292 tmpDocContent.append(stringFromMap); 293 }// End if 294 }// endElement(); 295 296 /** 297 * This method is called when the SAX parser encounts text in the XML doc. 298 * Here we calculate the end indices for all the elements present inside the 299 * stack and update with the new values. For entities, this method is called 300 * separatley regardless of the text sourinding the entity. 301 */ 302 public void characters( char[] text,int start,int length) throws SAXException{ 303 // correction of real offset. Didn't affect on other data. 304 super.characters(text, start, length); 305 // create a string object based on the reported text 306 String content = new String(text, start, length); 307 StringBuffer contentBuffer = new StringBuffer(""); 308 int tmpDocContentSize = tmpDocContent.length(); 309 boolean incrementStartIndex = false; 310 // If the first char of the text just read "text[0]" is NOT whitespace AND 311 // the last char of the tmpDocContent[SIZE-1] is NOT whitespace then 312 // concatenation "tmpDocContent + content" will result into a new different 313 // word... and we want to avoid that, because the tokenizer, gazetter and 314 // Jape work on the raw text and concatenating tokens might be not good. 315 if ( tmpDocContentSize != 0 && 316 content.length() != 0 && 317 !Character.isWhitespace(content.charAt(0)) && 318 !Character.isWhitespace(tmpDocContent.charAt(tmpDocContentSize - 1))){ 319 320 // If we are here it means that a concatenation between the last 321 // token in the tmpDocContent and the content(which doesn't start 322 // with a white space) will be performed. In order to prevent this, 323 // we will add a " " space char in order to assure that the 2 tokens 324 // stay apart. Howerver we will except from this rule the most known 325 // internal entities like &, <, >, etc 326 if ( 327 ( 328 // Testing the length against 1 makes it more likely that 329 // an internal entity was called. characters() gets called for 330 // each entity separately. 331 (content.length() == 1) 332 && 333 (content.charAt(0) == '&' || 334 content.charAt(0) == '<' || 335 content.charAt(0) == '>' || 336 content.charAt(0) == '"' || 337 content.charAt(0) == '\'' 338 ) 339 ) || 340 (tmpDocContent.charAt(tmpDocContentSize - 1) == '&' || 341 tmpDocContent.charAt(tmpDocContentSize - 1) == '<' || 342 tmpDocContent.charAt(tmpDocContentSize - 1) == '>' || 343 tmpDocContent.charAt(tmpDocContentSize - 1) == '"' || 344 tmpDocContent.charAt(tmpDocContentSize - 1) == '\'' 345 )){// do nothing. The content will be appended 346 }else{ 347 // In all other cases append " " 348 contentBuffer.append(" "); 349 incrementStartIndex = true; 350 }// End if 351 }// End if 352 353 // put the repositioning information 354 if(reposInfo != null) { 355 if(! (start == 0 && length == 1 && text.length <= 2)) { 356 // normal piece of text 357 reposInfo.addPositionInfo(getRealOffset(), content.length(), 358 tmpDocContent.length()+contentBuffer.length(), 359 content.length()); 360 if(DEBUG) { 361 Out.println("Info: "+getRealOffset()+", "+content.length()); 362 Out.println("Start: "+start+" len"+length); 363 } // DEBUG 364 } 365 else { 366 // unicode char or &xxx; coding 367 // Reported from the parser offset is 0 368 // The real offset should be found in the ampCodingInfo structure. 369 370 long lastPosition = 0; 371 RepositioningInfo.PositionInfo pi; 372 373 if(reposInfo.size() > 0) { 374 pi = 375 (RepositioningInfo.PositionInfo) reposInfo.get(reposInfo.size()-1); 376 lastPosition = pi.getOriginalPosition(); 377 } // if 378 379 for(int i = 0; i < ampCodingInfo.size(); ++i) { 380 pi = (RepositioningInfo.PositionInfo) ampCodingInfo.get(i); 381 if(pi.getOriginalPosition() > lastPosition) { 382 // found 383 reposInfo.addPositionInfo(pi.getOriginalPosition(), 384 pi.getOriginalLength(), 385 tmpDocContent.length()+contentBuffer.length(), 386 content.length()); 387 break; 388 } // if 389 } // for 390 } // if 391 } // if 392 393 // update the document content 394 contentBuffer.append(content); 395 // calculate the End index for all the elements of the stack 396 // the expression is : End index = Current doc length + text length 397 Long end = new Long(tmpDocContent.length() + contentBuffer.length()); 398 399 CustomObject obj = null; 400 // Iterate through stack to modify the End index of the existing elements 401 402 java.util.Iterator anIterator = stack.iterator(); 403 while (anIterator.hasNext ()){ 404 // get the object and move to the next one 405 obj = (CustomObject) anIterator.next (); 406 if (incrementStartIndex && obj.getStart().equals(obj.getEnd())){ 407 obj.setStart(new Long(obj.getStart().longValue() + 1)); 408 }// End if 409 // sets its End index 410 obj.setEnd(end); 411 }// End while 412 413 tmpDocContent.append(contentBuffer.toString()); 414 }// characters(); 415 416 /** 417 * This method is called when the SAX parser encounts white spaces 418 */ 419 public void ignorableWhitespace(char ch[],int start,int length) throws 420 SAXException{ 421 422 // internal String object 423 String text = new String(ch, start, length); 424 // if the last character in tmpDocContent is \n and the read whitespace is 425 // \n then don't add it to tmpDocContent... 426 427 if (tmpDocContent.length () != 0) 428 if (tmpDocContent.charAt (tmpDocContent.length () - 1) != '\n' || 429 !text.equalsIgnoreCase("\n") 430 ) 431 tmpDocContent.append(text); 432 } 433 434 /** 435 * Error method.We deal with this exception inside SimpleErrorHandler class 436 */ 437 public void error(SAXParseException ex) throws SAXException { 438 // deal with a SAXParseException 439 // see SimpleErrorhandler class 440 _seh.error(ex); 441 } 442 443 /** 444 * FatalError method. 445 */ 446 public void fatalError(SAXParseException ex) throws SAXException { 447 // deal with a SAXParseException 448 // see SimpleErrorhandler class 449 _seh.fatalError(ex); 450 } 451 452 /** 453 * Warning method comment. 454 */ 455 public void warning(SAXParseException ex) throws SAXException { 456 // deal with a SAXParseException 457 // see SimpleErrorhandler class 458 _seh.warning(ex); 459 } 460 461 /** 462 * This method is called when the SAX parser encounts a comment 463 * It works only if the XmlDocumentHandler implements a 464 * com.sun.parser.LexicalEventListener 465 */ 466 public void comment(String text) throws SAXException { 467 // create a FeatureMap and then add the comment to the annotation set. 468 /* 469 gate.util.SimpleFeatureMapImpl fm = new gate.util.SimpleFeatureMapImpl(); 470 fm.put ("text_comment",text); 471 Long node = new Long (tmpDocContent.length()); 472 CustomObject anObject = new CustomObject("Comment",fm,node,node); 473 colector.add(anObject); 474 */ 475 } 476 477 /** 478 * This method is called when the SAX parser encounts a start of a CDATA 479 * section 480 * It works only if the XmlDocumentHandler implements a 481 * com.sun.parser.LexicalEventListener 482 */ 483 public void startCDATA()throws SAXException { 484 } 485 486 /** 487 * This method is called when the SAX parser encounts the end of a CDATA 488 * section. 489 * It works only if the XmlDocumentHandler implements a 490 * com.sun.parser.LexicalEventListener 491 */ 492 public void endCDATA() throws SAXException { 493 } 494 495 /** 496 * This method is called when the SAX parser encounts a parsed Entity 497 * It works only if the XmlDocumentHandler implements a 498 * com.sun.parser.LexicalEventListener 499 */ 500 public void startParsedEntity(String name) throws SAXException { 501 } 502 503 /** 504 * This method is called when the SAX parser encounts a parsed entity and 505 * informs the application if that entity was parsed or not 506 * It's working only if the CustomDocumentHandler implements a 507 * com.sun.parser.LexicalEventListener 508 */ 509 public void endParsedEntity(String name, boolean included)throws SAXException{ 510 } 511 512 //StatusReporter Implementation 513 514 /** 515 * This methos is called when a listener is registered with this class 516 */ 517 public void addStatusListener(StatusListener listener){ 518 myStatusListeners.add(listener); 519 } 520 /** 521 * This methos is called when a listener is removed 522 */ 523 public void removeStatusListener(StatusListener listener){ 524 myStatusListeners.remove(listener); 525 } 526 /** 527 * This methos is called whenever we need to inform the listener about an 528 * event. 529 */ 530 protected void fireStatusChangedEvent(String text){ 531 Iterator listenersIter = myStatusListeners.iterator(); 532 while(listenersIter.hasNext()) 533 ((StatusListener)listenersIter.next()).statusChanged(text); 534 } 535 536 /** This method is a workaround of the java 4 non namespace supporting parser 537 * It receives a qualified name and returns its local name. 538 * For eg. if it receives gate:gateId it will return gateId 539 */ 540 private String getMyLocalName(String aQName){ 541 if (aQName == null) return ""; 542 StringTokenizer strToken = new StringTokenizer(aQName,":"); 543 if (strToken.countTokens()<= 1) return aQName; 544 // The nr of tokens is >= than 2 545 // Skip the first token which is the QName 546 strToken.nextToken(); 547 return strToken.nextToken(); 548 }//getMyLocalName() 549 550 /** Also a workaround for URI identifier. If the QName is gate it will return 551 * GATE's. Otherwhise it will return the empty string 552 */ 553 private String getMyURI(String aQName){ 554 if (aQName == null) return ""; 555 StringTokenizer strToken = new StringTokenizer(aQName,":"); 556 if (strToken.countTokens()<= 1) return ""; 557 // If first token is "gate" then return GATE's URI 558 if ("gate".equalsIgnoreCase(strToken.nextToken())) 559 return Gate.URI; 560 return ""; 561 }// getMyURI() 562 563 // XmlDocumentHandler member data 564 565 // this constant indicates when to fire the status listener 566 // this listener will add an overhead and we don't want a big overhead 567 // this listener will be callled from ELEMENTS_RATE to ELEMENTS_RATE 568 final static int ELEMENTS_RATE = 128; 569 570 // this map contains the elements name that we want to create 571 // if it's null all the elements from the XML documents will be transformed 572 // into Gate annotation objects otherwise only the elements it contains will 573 // be transformed 574 private Map markupElementsMap = null; 575 576 // this map contains the string that we want to insert iside the document 577 // content, when a certain element is found 578 // if the map is null then no string is added 579 private Map element2StringMap = null; 580 581 /**This object inducates what to do when the parser encounts an error*/ 582 private SimpleErrorHandler _seh = new SimpleErrorHandler(); 583 584 /**The content of the XML document, without any tag for internal use*/ 585 private StringBuffer tmpDocContent = null; 586 587 /**A stack used to remember elements and to keep the order */ 588 private java.util.Stack stack = null; 589 590 /**A gate document */ 591 private gate.Document doc = null; 592 593 /**An annotation set used for creating annotation reffering the doc */ 594 private gate.AnnotationSet basicAS = null; 595 596 /**Listeners for status report */ 597 protected List myStatusListeners = new LinkedList(); 598 599 /**This reports the the number of elements that have beed processed so far*/ 600 private int elements = 0; 601 602 /** We need a colection to retain all the CustomObjects that will be 603 * transformed into annotation over the gate document... 604 * the transformation will take place inside onDocumentEnd() method 605 */ 606 private LinkedList colector = null; 607 608 /** This is used to generate unique Ids for the CustomObjects read*/ 609 protected int customObjectsId = 0; 610 611 /** Accesor method for the customObjectsId field*/ 612 public int getCustomObjectsId(){ return customObjectsId;} 613 614 //////// INNER CLASS 615 /** 616 * The objects belonging to this class are used inside the stack. 617 * This class is for internal needs 618 */ 619 class CustomObject implements Comparable { 620 621 // constructor 622 public CustomObject(Integer anId,String anElemName, FeatureMap aFm, 623 Long aStart, Long anEnd) { 624 elemName = anElemName; 625 fm = aFm; 626 start = aStart; 627 end = anEnd; 628 if (anId == null){ 629 id = new Integer(customObjectsId ++); 630 }else{ 631 id = anId; 632 if (customObjectsId <= anId.intValue()) 633 customObjectsId = anId.intValue() + 1 ; 634 }// End if 635 }// End CustomObject() 636 637 // Methos implemented as required by Comparable interface 638 public int compareTo(Object o){ 639 CustomObject obj = (CustomObject) o; 640 return this.id.compareTo(obj.getId()); 641 }// compareTo(); 642 643 // accesor 644 public String getElemName() { 645 return elemName; 646 }// getElemName() 647 648 public FeatureMap getFM() { 649 return fm; 650 }// getFM() 651 652 public Long getStart() { 653 return start; 654 }// getStart() 655 656 public Long getEnd() { 657 return end; 658 }// getEnd() 659 660 public Integer getId(){ return id;} 661 662 // mutator 663 public void setElemName(String anElemName) { 664 elemName = anElemName; 665 }// getElemName() 666 667 public void setFM(FeatureMap aFm) { 668 fm = aFm; 669 }// setFM(); 670 671 public void setStart(Long aStart) { 672 start = aStart; 673 }// setStart(); 674 675 public void setEnd(Long anEnd) { 676 end = anEnd; 677 }// setEnd(); 678 679 // data fields 680 private String elemName = null; 681 private FeatureMap fm = null; 682 private Long start = null; 683 private Long end = null; 684 private Integer id = null; 685 686 } // End inner class CustomObject 687 688 } //XmlDocumentHandler 689 690 691 692
|
XmlDocumentHandler |
|