|
HtmlDocumentHandler |
|
1 /* 2 * HtmlDocumentHandler.java 3 * 4 * Copyright (c) 1998-2001, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Cristian URSU, 12/June/2000 12 * 13 * $Id: HtmlDocumentHandler.java,v 1.31 2002/02/28 15:11:13 nasso Exp $ 14 */ 15 16 package gate.html; 17 18 import javax.swing.text.html.*; 19 import javax.swing.text.html.parser.*; 20 import javax.swing.text.html.HTMLEditorKit.*; 21 import javax.swing.text.BadLocationException; 22 import javax.swing.text.MutableAttributeSet; 23 24 import java.util.*; 25 26 import gate.corpora.*; 27 import gate.util.*; 28 import gate.*; 29 import gate.event.*; 30 31 32 /** Implements the behaviour of the HTML reader. 33 * Methods of an object of this class are called by the HTML parser when 34 * events will appear. 35 * The idea is to parse the HTML document and construct Gate annotations 36 * objects. 37 * This class also will replace the content of the Gate document with a 38 * new one containing anly text from the HTML document. 39 */ 40 public class HtmlDocumentHandler extends ParserCallback { 41 42 /** Debug flag */ 43 private static final boolean DEBUG = false; 44 45 /** Constructor initialises all the private memeber data. 46 * This will use the default annotation set taken from the gate document. 47 * @param aDocument The gate document that will be processed 48 * @param aMarkupElementsMap The map containing the elements that will 49 * transform into annotations 50 */ 51 public HtmlDocumentHandler(gate.Document aDocument, Map aMarkupElementsMap) { 52 this(aDocument,aMarkupElementsMap,null); 53 } 54 55 /** Constructor initialises all the private memeber data 56 * @param aDocument The gate document that will be processed 57 * @param aMarkupElementsMap The map containing the elements that will 58 * transform into annotations 59 * @param anAnnoatationSet The annotation set that will contain annotations 60 * resulted from the processing of the gate document 61 */ 62 public HtmlDocumentHandler(gate.Document aDocument, 63 Map aMarkupElementsMap, 64 gate.AnnotationSet anAnnotationSet) { 65 // init stack 66 stack = new java.util.Stack(); 67 68 // this string contains the plain text (the text without markup) 69 tmpDocContent = new StringBuffer(aDocument.getContent().size().intValue()); 70 71 // colector is used later to transform all custom objects into 72 // annotation objects 73 colector = new LinkedList(); 74 75 // the Gate document 76 doc = aDocument; 77 78 // this map contains the elements name that we want to create 79 // if it's null all the elements from the XML documents will be transformed 80 // into Gate annotation objects 81 markupElementsMap = aMarkupElementsMap; 82 83 // init an annotation set for this gate document 84 basicAS = anAnnotationSet; 85 86 customObjectsId = 0; 87 }//HtmlDocumentHandler 88 89 /** Keep the refference to this structure */ 90 private RepositioningInfo reposInfo = null; 91 92 /** Keep the refference to this structure */ 93 private RepositioningInfo ampCodingInfo = null; 94 95 /** Set repositioning information structure refference. If you set this 96 * refference to <B>null</B> information wouldn't be collected. 97 */ 98 public void setRepositioningInfo(RepositioningInfo info) { 99 reposInfo = info; 100 } // setRepositioningInfo 101 102 /** Return current RepositioningInfo object */ 103 public RepositioningInfo getRepositioningInfo() { 104 return reposInfo; 105 } // getRepositioningInfo 106 107 /** Set repositioning information structure refference for ampersand coding. 108 * If you set this refference to <B>null</B> information wouldn't be used. 109 */ 110 public void setAmpCodingInfo(RepositioningInfo info) { 111 ampCodingInfo = info; 112 } // setRepositioningInfo 113 114 /** Return current RepositioningInfo object for ampersand coding. */ 115 public RepositioningInfo getAmpCodingInfo() { 116 return ampCodingInfo; 117 } // getRepositioningInfo 118 119 /** The text inside the STYLE tag is processed with <code>handleText()</code>. 120 * We should skip inserting of this text in the document. */ 121 private boolean isInsideStyleTag = false; 122 123 /** This method is called when the HTML parser encounts the beginning 124 * of a tag that means that the tag is paired by an end tag and it's 125 * not an empty one. 126 */ 127 public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) { 128 // Fire the status listener if the elements processed exceded the rate 129 if (0 == (++elements % ELEMENTS_RATE)) 130 fireStatusChangedEvent("Processed elements : " + elements); 131 132 // Start of STYLE tag 133 if(HTML.Tag.STYLE.equals(t)) { 134 isInsideStyleTag = true; 135 } // if 136 137 // Construct a feature map from the attributes list 138 FeatureMap fm = Factory.newFeatureMap(); 139 140 // Take all the attributes an put them into the feature map 141 if (0 != a.getAttributeCount()){ 142 Enumeration enum = a.getAttributeNames(); 143 while (enum.hasMoreElements()){ 144 Object attribute = enum.nextElement(); 145 fm.put(attribute.toString(),(a.getAttribute(attribute)).toString()); 146 }// while 147 }// if 148 149 // Just analize the tag t and add some\n chars and spaces to the 150 // tmpDocContent.The reason behind is that we need to have a readable form 151 // for the final document. 152 customizeAppearanceOfDocumentWithStartTag(t); 153 154 // If until here the "tmpDocContent" ends with a NON whitespace char, 155 // then we add a space char before calculating the START index of this 156 // tag. 157 // This is done in order not to concatenate the content of two separate tags 158 // and obtain a different NEW word. 159 int tmpDocContentSize = tmpDocContent.length(); 160 if ( tmpDocContentSize != 0 && 161 !Character.isWhitespace(tmpDocContent.charAt(tmpDocContentSize - 1)) 162 ) tmpDocContent.append(" "); 163 164 // create the start index of the annotation 165 Long startIndex = new Long(tmpDocContent.length()); 166 167 // initialy the start index is equal with the End index 168 CustomObject obj = new CustomObject(t.toString(),fm,startIndex,startIndex); 169 170 // put it into the stack 171 stack.push (obj); 172 173 }//handleStartTag 174 175 /** This method is called when the HTML parser encounts the end of a tag 176 * that means that the tag is paired by a beginning tag 177 */ 178 public void handleEndTag(HTML.Tag t, int pos){ 179 // obj is for internal use 180 CustomObject obj = null; 181 182 // end of STYLE tag 183 if(HTML.Tag.STYLE.equals(t)) { 184 isInsideStyleTag = false; 185 } // if 186 187 // If the stack is not empty then we get the object from the stack 188 if (!stack.isEmpty()){ 189 obj = (CustomObject) stack.pop(); 190 // Before adding it to the colector, we need to check if is an 191 // emptyAndSpan one. See CustomObject's isEmptyAndSpan field. 192 if (obj.getStart().equals(obj.getEnd())){ 193 // The element had an end tag and its start was equal to its end. Hence 194 // it is anEmptyAndSpan one. 195 obj.getFM().put("isEmptyAndSpan","true"); 196 }// End iff 197 // we add it to the colector 198 colector.add(obj); 199 }// End if 200 201 // If element has text between, then customize its apearance 202 if ( obj != null && 203 obj.getStart().longValue() != obj.getEnd().longValue() 204 ) 205 // Customize the appearance of the document 206 customizeAppearanceOfDocumentWithEndTag(t); 207 208 // if t is the </HTML> tag then we reached the end of theHTMLdocument 209 if (t == HTML.Tag.HTML){ 210 // replace the old content with the new one 211 doc.setContent (new DocumentContentImpl(tmpDocContent.toString())); 212 213 // If basicAs is null then get the default annotation 214 // set from this gate document 215 if (basicAS == null) 216 basicAS = doc.getAnnotations( 217 GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME); 218 219 // sort colector ascending on its id 220 Collections.sort(colector); 221 // iterate through colector and construct annotations 222 while (!colector.isEmpty()){ 223 obj = (CustomObject) colector.getFirst(); 224 colector.remove(obj); 225 // Construct an annotation from this obj 226 try{ 227 if (markupElementsMap == null){ 228 basicAS.add( obj.getStart(), 229 obj.getEnd(), 230 obj.getElemName(), 231 obj.getFM() 232 ); 233 }else{ 234 String annotationType = 235 (String) markupElementsMap.get(obj.getElemName()); 236 if (annotationType != null) 237 basicAS.add( obj.getStart(), 238 obj.getEnd(), 239 annotationType, 240 obj.getFM() 241 ); 242 } 243 }catch (InvalidOffsetException e){ 244 Err.prln("Error creating an annot :" + obj + " Discarded..."); 245 }// end try 246 // }// end if 247 }//while 248 249 // notify the listener about the total amount of elements that 250 // has been processed 251 fireStatusChangedEvent("Total elements : " + elements); 252 253 }//else 254 255 }//handleEndTag 256 257 /** This method is called when the HTML parser encounts an empty tag 258 */ 259 public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos){ 260 // fire the status listener if the elements processed exceded the rate 261 if ((++elements % ELEMENTS_RATE) == 0) 262 fireStatusChangedEvent("Processed elements : " + elements); 263 264 // construct a feature map from the attributes list 265 // these are empty elements 266 FeatureMap fm = Factory.newFeatureMap(); 267 268 // take all the attributes an put them into the feature map 269 if (0 != a.getAttributeCount ()){ 270 271 // Out.println("HAS attributes = " + a.getAttributeCount ()); 272 Enumeration enum = a.getAttributeNames (); 273 while (enum.hasMoreElements ()){ 274 Object attribute = enum.nextElement (); 275 fm.put ( attribute.toString(),(a.getAttribute(attribute)).toString()); 276 277 }//while 278 279 }//if 280 281 // create the start index of the annotation 282 Long startIndex = new Long(tmpDocContent.length()); 283 284 // initialy the start index is equal with the End index 285 CustomObject obj = new CustomObject(t.toString(),fm,startIndex,startIndex); 286 287 // we add the object directly into the colector 288 // we don't add it to the stack because this is an empty tag 289 colector.add(obj); 290 291 // Just analize the tag t and add some\n chars and spaces to the 292 // tmpDocContent.The reason behind is that we need to have a readable form 293 // for the final document. 294 customizeAppearanceOfDocumentWithSimpleTag(t); 295 296 } // handleSimpleTag 297 298 /** This method is called when the HTML parser encounts text (PCDATA) 299 */ 300 public void handleText(char[] text, int pos){ 301 302 // Skip the STYLE tag content 303 if(isInsideStyleTag) return; 304 305 // create a string object based on the reported text 306 String content = new String(text); 307 308 // remove the difference between JDK 1.3 and JDK 1.4 309 String trimContent = content.trim(); 310 if(trimContent.length() == 0) { 311 return; 312 } // if 313 314 int trimCorrection = content.indexOf(trimContent.charAt(0)); 315 content = trimContent; 316 317 StringBuffer contentBuffer = new StringBuffer(""); 318 int tmpDocContentSize = tmpDocContent.length(); 319 boolean incrementStartIndex = false; 320 // If the first char of the text just read "text[0]" is NOT whitespace AND 321 // the last char of the tmpDocContent[SIZE-1] is NOT whitespace then 322 // concatenation "tmpDocContent + content" will result into a new different 323 // word... and we want to avoid that... 324 if ( tmpDocContentSize != 0 && 325 content.length() != 0 && 326 !Character.isWhitespace(content.charAt(0)) && 327 !Character.isWhitespace(tmpDocContent.charAt(tmpDocContentSize - 1))){ 328 329 contentBuffer.append(" "); 330 incrementStartIndex = true; 331 }// End if 332 // update the document content 333 334 // put the repositioning information 335 if(reposInfo != null) { 336 int extractedPos = tmpDocContent.length()+contentBuffer.length() 337 +trimCorrection; 338 addRepositioningInfo(content, pos, extractedPos); 339 } // if 340 341 contentBuffer.append(content); 342 // calculate the End index for all the elements of the stack 343 // the expression is : End index = Current doc length + text length 344 Long end = new Long(tmpDocContent.length() + contentBuffer.length()); 345 346 CustomObject obj = null; 347 // Iterate through stack to modify the End index of the existing elements 348 349 java.util.Iterator anIterator = stack.iterator(); 350 while (anIterator.hasNext ()){ 351 // get the object and move to the next one 352 obj = (CustomObject) anIterator.next (); 353 if (incrementStartIndex && obj.getStart().equals(obj.getEnd())){ 354 obj.setStart(new Long(obj.getStart().longValue() + 1)); 355 }// End if 356 // sets its End index 357 obj.setEnd(end); 358 }// End while 359 360 tmpDocContent.append(contentBuffer.toString()); 361 }// end handleText(); 362 363 /** For given content the list with shrink position information is searched 364 * and on the corresponding positions the correct repositioning information 365 * is calculated and generated. 366 */ 367 public void addRepositioningInfo(String content, int pos, int extractedPos) { 368 int contentLength = content.length(); 369 370 // wrong way (without correction and analysing) 371 //reposInfo.addPositionInfo(pos, contentLength, extractedPos, contentLength); 372 373 RepositioningInfo.PositionInfo pi = null; 374 long startPos = pos; 375 long correction = 0; 376 long substituteStart; 377 long remainingLen; 378 long offsetInExtracted; 379 380 for(int i = 0; i < ampCodingInfo.size(); ++i) { 381 pi = (RepositioningInfo.PositionInfo) ampCodingInfo.get(i); 382 substituteStart = pi.getOriginalPosition(); 383 384 if(substituteStart >= startPos) { 385 if(substituteStart > pos + contentLength + correction) { 386 break; // outside the current text 387 } // if 388 389 // should create two repositioning information records 390 remainingLen = substituteStart - (startPos + correction); 391 offsetInExtracted = startPos - pos; 392 if(remainingLen > 0) { 393 reposInfo.addPositionInfo(startPos + correction, remainingLen, 394 extractedPos + offsetInExtracted, remainingLen); 395 } // if 396 // record for shrank text 397 reposInfo.addPositionInfo(substituteStart, pi.getOriginalLength(), 398 extractedPos + offsetInExtracted + remainingLen, 399 pi.getCurrentLength()); 400 startPos = startPos + remainingLen + pi.getCurrentLength(); 401 correction += pi.getOriginalLength() - pi.getCurrentLength(); 402 } // if 403 } // for 404 405 // there is some text remaining for repositioning 406 offsetInExtracted = startPos - pos; 407 remainingLen = contentLength - offsetInExtracted; 408 if(remainingLen > 0) { 409 reposInfo.addPositionInfo(startPos + correction, remainingLen, 410 extractedPos + offsetInExtracted, remainingLen); 411 } // if 412 } // addRepositioningInfo 413 414 /** This method analizes the tag t and adds some \n chars and spaces to the 415 * tmpDocContent.The reason behind is that we need to have a readable form 416 * for the final document. This method modifies the content of tmpDocContent. 417 * @param t the Html tag encounted by the HTML parser 418 */ 419 protected void customizeAppearanceOfDocumentWithSimpleTag(HTML.Tag t){ 420 boolean modification = false; 421 // if the HTML tag is BR then we add a new line character to the document 422 if (HTML.Tag.BR == t){ 423 tmpDocContent.append("\n"); 424 modification = true; 425 }// End if 426 if (modification == true){ 427 Long end = new Long (tmpDocContent.length()); 428 java.util.Iterator anIterator = stack.iterator(); 429 while (anIterator.hasNext ()){ 430 // get the object and move to the next one 431 CustomObject obj = (CustomObject) anIterator.next(); 432 // sets its End index 433 obj.setEnd(end); 434 }// End while 435 }//End if 436 }// customizeAppearanceOfDocumentWithSimpleTag 437 438 /** This method analizes the tag t and adds some \n chars and spaces to the 439 * tmpDocContent.The reason behind is that we need to have a readable form 440 * for the final document. This method modifies the content of tmpDocContent. 441 * @param t the Html tag encounted by the HTML parser 442 */ 443 protected void customizeAppearanceOfDocumentWithStartTag(HTML.Tag t){ 444 boolean modification = false; 445 if (HTML.Tag.P == t){ 446 int tmpDocContentSize = tmpDocContent.length(); 447 if ( tmpDocContentSize >= 2 && 448 '\n' != tmpDocContent.charAt(tmpDocContentSize - 2) 449 ) { tmpDocContent.append("\n"); modification = true;} 450 }// End if 451 if (modification == true){ 452 Long end = new Long (tmpDocContent.length()); 453 java.util.Iterator anIterator = stack.iterator(); 454 while (anIterator.hasNext ()){ 455 // get the object and move to the next one 456 CustomObject obj = (CustomObject) anIterator.next(); 457 // sets its End index 458 obj.setEnd(end); 459 }// End while 460 }//End if 461 }// customizeAppearanceOfDocumentWithStartTag 462 463 /** This method analizes the tag t and adds some \n chars and spaces to the 464 * tmpDocContent.The reason behind is that we need to have a readable form 465 * for the final document. This method modifies the content of tmpDocContent. 466 * @param t the Html tag encounted by the HTML parser 467 */ 468 protected void customizeAppearanceOfDocumentWithEndTag(HTML.Tag t){ 469 boolean modification = false; 470 // if the HTML tag is BR then we add a new line character to the document 471 if ( (HTML.Tag.P == t) || 472 473 (HTML.Tag.H1 == t) || 474 (HTML.Tag.H2 == t) || 475 (HTML.Tag.H3 == t) || 476 (HTML.Tag.H4 == t) || 477 (HTML.Tag.H5 == t) || 478 (HTML.Tag.H6 == t) || 479 (HTML.Tag.TR == t) || 480 (HTML.Tag.CENTER == t) || 481 (HTML.Tag.LI == t) 482 ){ tmpDocContent.append("\n"); modification = true;} 483 484 if (HTML.Tag.TITLE == t){ 485 tmpDocContent.append("\n\n"); 486 modification = true; 487 }// End if 488 489 if (modification == true){ 490 Long end = new Long (tmpDocContent.length()); 491 java.util.Iterator anIterator = stack.iterator(); 492 while (anIterator.hasNext ()){ 493 // get the object and move to the next one 494 CustomObject obj = (CustomObject) anIterator.next(); 495 // sets its End index 496 obj.setEnd(end); 497 }// End while 498 }//End if 499 }// customizeAppearanceOfDocumentWithEndTag 500 501 /** 502 * This method is called when the HTML parser encounts an error 503 * it depends on the programmer if he wants to deal with that error 504 */ 505 public void handleError(String errorMsg, int pos) { 506 //Out.println ("ERROR CALLED : " + errorMsg); 507 } 508 509 /** This method is called once, when the HTML parser reaches the end 510 * of its input streamin order to notify the parserCallback that there 511 * is nothing more to parse. 512 */ 513 public void flush() throws BadLocationException{ 514 }// flush 515 516 /** This method is called when the HTML parser encounts a comment 517 */ 518 public void handleComment(char[] text, int pos) { 519 } 520 521 //StatusReporter Implementation 522 523 public void addStatusListener(StatusListener listener) { 524 myStatusListeners.add(listener); 525 } 526 527 public void removeStatusListener(StatusListener listener) { 528 myStatusListeners.remove(listener); 529 } 530 531 protected void fireStatusChangedEvent(String text) { 532 Iterator listenersIter = myStatusListeners.iterator(); 533 while(listenersIter.hasNext()) 534 ((StatusListener)listenersIter.next()).statusChanged(text); 535 } 536 537 /** 538 * This method verifies if data contained by the CustomObject can be used 539 * to create a GATE annotation. 540 */ 541 /* private boolean canCreateAnnotation(CustomObject aCustomObject){ 542 long start = aCustomObject.getStart().longValue(); 543 long end = aCustomObject.getEnd().longValue(); 544 long gateDocumentSize = doc.getContent().size().longValue(); 545 546 if (start < 0 || end < 0 ) return false; 547 if (start > end ) return false; 548 if ((start > gateDocumentSize) || (end > gateDocumentSize)) return false; 549 return true; 550 }// canCreateAnnotation 551 */ 552 553 // HtmlDocumentHandler member data 554 555 // this constant indicates when to fire the status listener 556 // this listener will add an overhead and we don't want a big overhead 557 // this listener will be callled from ELEMENTS_RATE to ELEMENTS_RATE 558 final static int ELEMENTS_RATE = 128; 559 560 // this map contains the elements name that we want to create 561 // if it's null all the elements from the HTML documents will be transformed 562 // into Gate annotation objects otherwise only the elements it contains will 563 // be transformed 564 private Map markupElementsMap = null; 565 566 // the content of the HTML document, without any tag 567 // for internal use 568 private StringBuffer tmpDocContent = null; 569 570 // a stack used to remember elements and to keep the order 571 private java.util.Stack stack = null; 572 573 // a gate document 574 private gate.Document doc = null; 575 576 // an annotation set used for creating annotation reffering the doc 577 private gate.AnnotationSet basicAS; 578 579 // listeners for status report 580 protected List myStatusListeners = new LinkedList(); 581 582 // this reports the the number of elements that have beed processed so far 583 private int elements = 0; 584 585 protected long customObjectsId = 0; 586 // we need a colection to retain all the CustomObjects that will be 587 // transformed into annotation over the gate document... 588 // the transformation will take place inside onDocumentEnd() method 589 private LinkedList colector = null; 590 591 // Inner class 592 /** 593 * The objects belonging to this class are used inside the stack. 594 * This class is for internal needs 595 */ 596 class CustomObject implements Comparable { 597 598 // constructor 599 public CustomObject(String anElemName, FeatureMap aFm, 600 Long aStart, Long anEnd) { 601 elemName = anElemName; 602 fm = aFm; 603 start = aStart; 604 end = anEnd; 605 id = new Long(customObjectsId ++); 606 }// End CustomObject() 607 608 // Methos implemented as required by Comparable interface 609 public int compareTo(Object o){ 610 CustomObject obj = (CustomObject) o; 611 return this.id.compareTo(obj.getId()); 612 }// compareTo(); 613 614 // accesor 615 public String getElemName() { 616 return elemName; 617 }// getElemName() 618 619 public FeatureMap getFM() { 620 return fm; 621 }// getFM() 622 623 public Long getStart() { 624 return start; 625 }// getStart() 626 627 public Long getEnd() { 628 return end; 629 }// getEnd() 630 631 public Long getId(){ return id;} 632 633 // mutator 634 public void setElemName(String anElemName) { 635 elemName = anElemName; 636 }// getElemName() 637 638 public void setFM(FeatureMap aFm) { 639 fm = aFm; 640 }// setFM(); 641 642 public void setStart(Long aStart) { 643 start = aStart; 644 }// setStart(); 645 646 public void setEnd(Long anEnd) { 647 end = anEnd; 648 }// setEnd(); 649 650 // data fields 651 private String elemName = null; 652 private FeatureMap fm = null; 653 private Long start = null; 654 private Long end = null; 655 private Long id = null; 656 657 } // End inner class CustomObject 658 659 }//End class HtmlDocumentHandler 660 661 662 663
|
HtmlDocumentHandler |
|