|
XmlDocumentHandler |
|
/*
 *  XmlDocumentHandler.java
 *
 *  Copyright (c) 1998-2001, The University of Sheffield.
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  Cristian URSU,  9 May 2000
 *
 *  $Id: XmlDocumentHandler.java,v 1.35 2001/11/08 17:23:34 cursu Exp $
 */

package gate.xml;

import java.util.*;

import gate.corpora.*;
import gate.util.*;
import gate.*;
import gate.event.*;


import org.xml.sax.*;
import org.xml.sax.helpers.*;


/**
  * SAX content handler that turns an XML document into a GATE document.
  * The SAX parser calls the methods of this class as parsing events occur.
  * While parsing, the handler accumulates the plain text of the document
  * (everything except markup) and records one {@link CustomObject} per XML
  * element. When the document ends, the content of the GATE document is
  * replaced with the accumulated plain text and every recorded element is
  * converted into a GATE annotation.
  */
public class XmlDocumentHandler extends DefaultHandler{
  /** Debug flag */
  private static final boolean DEBUG = false;

  /**
    * Constructs a XmlDocumentHandler object. No annotation set is supplied, so
    * the one used is looked up on the document when parsing ends
    * (see {@link #endDocument()}).
    * @param aDocument the Gate document that will be processed.
    * @param aMarkupElementsMap maps element names to the annotation types that
    * should be created for them; if null, every element becomes an annotation
    * whose type is the element name.
    * @param anElement2StringMap maps element names to a string that is
    * appended to the document text when that element ends; may be null.
    */
  public XmlDocumentHandler(gate.Document aDocument, Map aMarkupElementsMap,
                            Map anElement2StringMap){
    this(aDocument, aMarkupElementsMap, anElement2StringMap, null);
  } // XmlDocumentHandler

  /**
    * Constructs a XmlDocumentHandler object.
    * @param aDocument the Gate document that will be processed.
    * @param aMarkupElementsMap maps element names to the annotation types that
    * should be created for them; if null, every element becomes an annotation
    * whose type is the element name.
    * @param anElement2StringMap maps element names to a string that is
    * appended to the document text when that element ends; may be null.
    * @param anAnnotationSet the annotation set that will be filled once the
    * document has been processed; if null, the set is obtained from the
    * document in {@link #endDocument()}.
    */
  public XmlDocumentHandler(gate.Document       aDocument,
                            Map                 aMarkupElementsMap,
                            Map                 anElement2StringMap,
                            gate.AnnotationSet  anAnnotationSet){
    // stack of the elements that are currently open
    stack = new java.util.Stack();

    // buffer for the plain text (the text without markup); pre-sized with the
    // size of the current document content
    tmpDocContent = new StringBuffer(aDocument.getContent().size().intValue());

    // colector holds the closed elements until they are turned into
    // annotations at the end of the document
    colector = new LinkedList();

    // the Gate document
    doc = aDocument;

    // element name -> annotation type; null means "create them all"
    markupElementsMap = aMarkupElementsMap;

    // element name -> string to insert in the content when the element ends;
    // null means "insert nothing"
    element2StringMap = anElement2StringMap;

    basicAS = anAnnotationSet;
    customObjectsId = 0;
  }// XmlDocumentHandler()

  /**
    * This method is called when the SAX parser encounters the beginning of the
    * XML document. Nothing needs to be done here.
    */
  public void startDocument() throws org.xml.sax.SAXException {
  }

  /**
    * This method is called when the SAX parser encounters the end of the
    * XML document.
    * The content of the gate Document is set to the text accumulated by this
    * handler (tmpDocContent) and then every object held by the colector is
    * turned into an annotation over that new content. The colector is empty
    * when this method returns normally.
    * @throws org.xml.sax.SAXException (a GateSaxException) if two collected
    * objects carry the same id.
    */
  public void endDocument() throws org.xml.sax.SAXException {

    // replace the document content with the one without markups
    doc.setContent(new DocumentContentImpl(tmpDocContent.toString()));

    // fire the status listener
    fireStatusChangedEvent("Total elements: " + elements);

    // If no annotation set was supplied, take the original markups one from
    // the gate document.
    if (basicAS == null){
      basicAS = doc.getAnnotations(
                            GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
    }// End if

    // sort colector ascending on its id
    Collections.sort(colector);
    Set testIdsSet = new HashSet();
    // create all the annotations (on this new document) from the collector
    while (!colector.isEmpty()){
      CustomObject obj = (CustomObject) colector.getFirst();
      // Two annotation objects with the same id make the document unusable.
      if (testIdsSet.contains(obj.getId())){
        throw new GateSaxException("Found two annotations with the same Id("+
        obj.getId()+
        ").The document is inconsistent.");
      }// End if
      testIdsSet.add(obj.getId());

      // create a new annotation and add it to the annotation set
      try{
        if (markupElementsMap == null){
          // no filter: the annotation type is the element name
          basicAS.add(obj.getId(),
                      obj.getStart(),
                      obj.getEnd(),
                      obj.getElemName(),
                      obj.getFM());
        }else{
          // the annotation type comes from the map; elements that are not
          // in the map produce no annotation
          String annotationType = (String)
                                markupElementsMap.get(obj.getElemName());
          if (annotationType != null){
            basicAS.add(obj.getId(),
                        obj.getStart(),
                        obj.getEnd(),
                        annotationType,
                        obj.getFM());
          }// End if
        }// End if
      }catch (gate.util.InvalidOffsetException e){
        Err.prln("InvalidOffsetException for annot :" + obj.getElemName() +
         " with Id =" + obj.getId() + ". Discarded...");
      }// End try
      // obj is the head of the list, so drop the head directly
      colector.removeFirst();
    }// End while
  }// endDocument();

  /**
    * This method is called when the SAX parser encounters the beginning of an
    * XML element. The attributes are copied into a feature map and a
    * {@link CustomObject} starting at the current end of the text is pushed
    * on the stack.
    * <p>
    * Attributes in the GATE namespace are handled specially: "gateId" sets
    * the id of the object, "annotMaxId" sets the id counter and "matches" is
    * split on ';' into a list of Integers stored as a feature. Any other
    * attribute in the GATE namespace is not copied into the feature map.
    * <p>
    * Note: SAX passes (uri, localName, qName) to this callback, so the
    * parameter named <code>qName</code> receives the local name and
    * <code>elemName</code> receives the qualified name.
    * @param uri the namespace URI reported by the parser (not used)
    * @param qName second argument of the SAX callback (not used)
    * @param elemName third argument of the SAX callback; used as the element
    * name
    * @param atts the attributes of the element
    * @throws NumberFormatException if a "gateId", "annotMaxId" or "matches"
    * value is not an integer
    */
  public void startElement (String uri, String qName, String elemName,
                                                             Attributes atts){
    // Inform the status listeners only once every ELEMENTS_RATE elements
    if ((++elements % ELEMENTS_RATE) == 0){
      fireStatusChangedEvent("Processed elements : " + elements);
    }// End if

    Integer customObjectId = null;
    FeatureMap fm = Factory.newFeatureMap();
    // Copy the attributes into the feature map
    for (int i = 0; i < atts.getLength(); i++) {
      String attName  = atts.getLocalName(i);
      String attValue = atts.getValue(i);
      String attUri   = atts.getURI(i);
      if (attUri != null && Gate.URI.equals(attUri)){
        if ("gateId".equals(attName)){
          customObjectId = Integer.valueOf(attValue);
        }// End if
        if ("annotMaxId".equals(attName)){
          customObjectsId = Integer.parseInt(attValue);
        }// End if
        if ("matches".equals(attName)){
          StringTokenizer strTokenizer = new StringTokenizer(attValue,";");
          List list = new ArrayList();
          // Take all tokens, create Integers and add them to the list
          while (strTokenizer.hasMoreTokens()){
            list.add(Integer.valueOf(strTokenizer.nextToken()));
          }// End while
          fm.put(attName,list);
        }// End if
      }else{
        fm.put(attName,attValue);
      }// End if
    }// End for

    // the START index of the annotation is the current end of the text;
    // initially the End index is equal to the Start index
    Long startIndex = new Long(tmpDocContent.length());
    CustomObject obj = new CustomObject(customObjectId, elemName, fm,
                                                 startIndex, startIndex);

    // remember the element until its end tag is met
    stack.push(obj);
  }// startElement();

  /**
    * This method is called when the SAX parser encounters the end of an
    * XML element. The matching object is popped from the stack, marked with
    * the "isEmptyAndSpan" feature when it covers no text, and moved to the
    * colector. If the element name is a key of the element2String map, the
    * mapped string is appended to the document text.
    * <p>
    * Note: as in {@link #startElement}, <code>elemName</code> receives the
    * third argument of the SAX callback, which is the qualified name.
    * @param uri the namespace URI reported by the parser (not used)
    * @param qName second argument of the SAX callback (not used)
    * @param elemName third argument of the SAX callback; used as the element
    * name
    * @throws SAXException if no start element is pending on the stack
    */
  public void endElement (String uri, String qName, String elemName )
                                                         throws SAXException{
    // An end event without a pending start event cannot be processed.
    if (stack.isEmpty()){
      throw new SAXException("Unbalanced end of element '" + elemName +
                             "': no matching start element was recorded.");
    }// End if
    CustomObject obj = (CustomObject) stack.pop();

    // The element had an end tag and its start is equal to its end, hence
    // it is an emptyAndSpan one.
    if (obj.getStart().equals(obj.getEnd())){
      obj.getFM().put("isEmptyAndSpan","true");
    }// End if

    // Keep the object; the annotations are created when the document ends
    colector.add(obj);

    // if the element is found in the element2String map, then add the mapped
    // string to the end of the document content
    if (element2StringMap != null){
      String stringFromMap = (String) element2StringMap.get(elemName);
      if (stringFromMap != null){
        tmpDocContent.append(stringFromMap);
      }// End if
    }// End if
  }// endElement();

  /**
    * This method is called when the SAX parser encounters text in the XML doc.
    * The text is appended to the document content and the end index of every
    * element on the stack is moved to the new end of the content. For
    * entities, this method is called separately regardless of the text
    * surrounding the entity.
    * <p>
    * When neither the last character already stored nor the first character
    * of the new text is whitespace, a single space is inserted between them
    * so that two tokens are not glued into one. No space is inserted when the
    * new text is exactly one of the characters &amp; &lt; &gt; " ' or when
    * the stored text ends with one of them.
    * @param text the character buffer supplied by the parser
    * @param start index of the first character to read from text
    * @param length number of characters to read from text
    */
  public void characters( char[] text,int start,int length) throws SAXException{
    String content = new String(text, start, length);
    StringBuffer contentBuffer = new StringBuffer("");
    int tmpDocContentSize = tmpDocContent.length();
    boolean incrementStartIndex = false;

    if (tmpDocContentSize != 0 && content.length() != 0){
      char firstNew = content.charAt(0);
      char lastOld  = tmpDocContent.charAt(tmpDocContentSize - 1);
      if (!Character.isWhitespace(firstNew) &&
          !Character.isWhitespace(lastOld)){
        // Testing the length against 1 makes it more likely that an internal
        // entity was reported, as characters() is called for each entity
        // separately.
        boolean newTextIsEntity =
                  content.length() == 1 && isPredefinedEntityChar(firstNew);
        if (!newTextIsEntity && !isPredefinedEntityChar(lastOld)){
          // keep the two tokens apart
          contentBuffer.append(" ");
          incrementStartIndex = true;
        }// End if
      }// End if
    }// End if

    contentBuffer.append(content);
    // End index = current doc length + length of the text to be appended
    Long end = new Long(tmpDocContent.length() + contentBuffer.length());

    // Update every element that is still open
    java.util.Iterator anIterator = stack.iterator();
    while (anIterator.hasNext()){
      CustomObject obj = (CustomObject) anIterator.next();
      // An element that has no text yet must start after the inserted space
      if (incrementStartIndex && obj.getStart().equals(obj.getEnd())){
        obj.setStart(new Long(obj.getStart().longValue() + 1));
      }// End if
      obj.setEnd(end);
    }// End while

    tmpDocContent.append(contentBuffer.toString());
  }// characters();

  /**
    * Tells whether the given character is one of the five characters
    * &amp; &lt; &gt; " ' that characters() exempts from space insertion.
    */
  private static boolean isPredefinedEntityChar(char aChar){
    return aChar == '&' ||
           aChar == '<' ||
           aChar == '>' ||
           aChar == '"' ||
           aChar == '\'';
  }// isPredefinedEntityChar()

  /**
    * This method is called when the SAX parser encounters ignorable white
    * space. The white space is appended to the document content unless the
    * content is still empty, or the content ends with '\n' and the white
    * space is exactly "\n". The offsets of the open elements are not touched
    * here.
    */
  public void ignorableWhitespace(char ch[],int start,int length) throws
                                                                 SAXException{
    String text = new String(ch, start, length);

    if (tmpDocContent.length() != 0){
      boolean endsWithNewLine =
                  tmpDocContent.charAt(tmpDocContent.length() - 1) == '\n';
      // do not store two consecutive new lines
      if (!endsWithNewLine || !text.equals("\n")){
        tmpDocContent.append(text);
      }// End if
    }// End if
  }

  /**
    * Error method. The exception is delegated to the SimpleErrorHandler.
    */
  public void error(SAXParseException ex) throws SAXException {
    _seh.error(ex);
  }

  /**
    * FatalError method. The exception is delegated to the SimpleErrorHandler.
    */
  public void fatalError(SAXParseException ex) throws SAXException {
    _seh.fatalError(ex);
  }

  /**
    * Warning method. The exception is delegated to the SimpleErrorHandler.
    */
  public void warning(SAXParseException ex) throws SAXException {
    _seh.warning(ex);
  }

  /**
    * This method is called when the SAX parser encounters a comment.
    * It works only if the XmlDocumentHandler implements a
    * com.sun.parser.LexicalEventListener
    */
  public void comment(String text) throws SAXException {
    // Comments are currently ignored; the former code that recorded them as
    // "Comment" objects in the colector is disabled.
  }

  /**
    * This method is called when the SAX parser encounters a start of a CDATA
    * section.
    * It works only if the XmlDocumentHandler implements a
    * com.sun.parser.LexicalEventListener
    */
  public void startCDATA()throws SAXException {
  }

  /**
    * This method is called when the SAX parser encounters the end of a CDATA
    * section.
    * It works only if the XmlDocumentHandler implements a
    * com.sun.parser.LexicalEventListener
    */
  public void endCDATA() throws SAXException {
  }

  /**
    * This method is called when the SAX parser encounters a parsed Entity.
    * It works only if the XmlDocumentHandler implements a
    * com.sun.parser.LexicalEventListener
    */
  public void startParsedEntity(String name) throws SAXException {
  }

  /**
    * This method is called when the SAX parser encounters a parsed entity and
    * informs the application if that entity was parsed or not.
    * It works only if the XmlDocumentHandler implements a
    * com.sun.parser.LexicalEventListener
    */
  public void endParsedEntity(String name, boolean included)throws SAXException{
  }

  //StatusReporter Implementation

  /**
    * Registers a status listener with this handler.
    */
  public void addStatusListener(StatusListener listener){
    myStatusListeners.add(listener);
  }
  /**
    * Removes a previously registered status listener.
    */
  public void removeStatusListener(StatusListener listener){
    myStatusListeners.remove(listener);
  }
  /**
    * Sends the given text to every registered status listener.
    */
  protected void fireStatusChangedEvent(String text){
    Iterator listenersIter = myStatusListeners.iterator();
    while(listenersIter.hasNext()){
      ((StatusListener)listenersIter.next()).statusChanged(text);
    }// End while
  }

  /** This method is a workaround of the java 4 non namespace supporting parser
    * It receives a qualified name and returns its local name.
    * For eg. if it receives gate:gateId it will return gateId
    */
  private String getMyLocalName(String aQName){
    if (aQName == null){
      return "";
    }// End if
    StringTokenizer strToken = new StringTokenizer(aQName,":");
    if (strToken.countTokens() <= 1){
      return aQName;
    }// End if
    // There are at least 2 tokens: skip the first one, which is the prefix
    strToken.nextToken();
    return strToken.nextToken();
  }//getMyLocalName()

  /** Also a workaround for URI identifier. If the prefix of the qualified
    * name is "gate" (in any letter case) it will return GATE's URI. Otherwise
    * it will return the empty string.
    */
  private String getMyURI(String aQName){
    if (aQName == null){
      return "";
    }// End if
    StringTokenizer strToken = new StringTokenizer(aQName,":");
    if (strToken.countTokens() <= 1){
      return "";
    }// End if
    // If first token is "gate" then return GATE's URI
    if ("gate".equalsIgnoreCase(strToken.nextToken())){
      return Gate.URI;
    }// End if
    return "";
  }// getMyURI()

  // XmlDocumentHandler member data

  // this constant indicates when to fire the status listener
  // this listener will add an overhead and we don't want a big overhead
  // this listener will be called from ELEMENTS_RATE to ELEMENTS_RATE
  final static  int ELEMENTS_RATE = 128;

  // this map contains the elements name that we want to create
  // if it's null all the elements from the XML documents will be transformed
  // into Gate annotation objects otherwise only the elements it contains will
  // be transformed
  private Map markupElementsMap = null;

  // this map contains the string that we want to insert inside the document
  // content, when a certain element is found
  // if the map is null then no string is added
  private Map element2StringMap = null;

  /**This object indicates what to do when the parser encounters an error*/
  private SimpleErrorHandler _seh = new SimpleErrorHandler();

  /**The content of the XML document, without any tag, for internal use*/
  private StringBuffer tmpDocContent = null;

  /**A stack used to remember elements and to keep the order */
  private java.util.Stack stack = null;

  /**A gate document */
  private gate.Document doc = null;

  /**An annotation set used for creating annotations referring to the doc */
  private gate.AnnotationSet basicAS = null;

  /**Listeners for status report */
  protected List myStatusListeners = new LinkedList();

  /**This reports the number of elements that have been processed so far*/
  private int elements = 0;

  /** We need a collection to retain all the CustomObjects that will be
    * transformed into annotations over the gate document...
    * the transformation will take place inside the endDocument() method
    */
  private LinkedList colector = null;

  /** This is used to generate unique Ids for the CustomObjects read*/
  protected  int customObjectsId = 0;

  /** Accessor method for the customObjectsId field*/
  public int getCustomObjectsId(){ return customObjectsId;}

  //////// INNER CLASS
  /**
    * The objects belonging to this class are used inside the stack.
    * This class is for internal needs. It is an inner (non static) class
    * because its constructor reads and updates the customObjectsId counter
    * of the enclosing handler.
    */
  class  CustomObject implements Comparable {

    /**
      * Creates a new object. When anId is null a fresh id is taken from the
      * customObjectsId counter of the handler; otherwise anId is used and the
      * counter is moved past it if necessary, so that generated ids stay
      * greater than the supplied ones.
      */
    public CustomObject(Integer anId,String anElemName, FeatureMap aFm,
                           Long aStart, Long anEnd) {
      elemName = anElemName;
      fm = aFm;
      start = aStart;
      end = anEnd;
      if (anId == null){
        id = new Integer(customObjectsId ++);
      }else{
        id = anId;
        if (customObjectsId <= anId.intValue()){
          customObjectsId = anId.intValue() + 1 ;
        }// End if
      }// End if
    }// End CustomObject()

    /** Orders the objects ascending on their id, as required by Comparable */
    public int compareTo(Object o){
      CustomObject obj = (CustomObject) o;
      return this.id.compareTo(obj.getId());
    }// compareTo();

    // accessors
    public String getElemName() {
      return elemName;
    }// getElemName()

    public FeatureMap getFM() {
      return fm;
    }// getFM()

    public Long getStart() {
      return start;
    }// getStart()

    public Long getEnd() {
      return end;
    }// getEnd()

    public Integer getId(){ return id;}

    // mutators
    public void setElemName(String anElemName) {
      elemName = anElemName;
    }// setElemName()

    public void setFM(FeatureMap aFm) {
      fm = aFm;
    }// setFM();

    public void setStart(Long aStart) {
      start = aStart;
    }// setStart();

    public void setEnd(Long anEnd) {
      end = anEnd;
    }// setEnd();

    // data fields
    private String elemName = null;
    private FeatureMap fm = null;
    private Long start = null;
    private Long end  = null;
    private Integer id = null;

  } // End inner class CustomObject

} //XmlDocumentHandler
|
XmlDocumentHandler |
|