|
HtmlDocumentHandler |
|
1 /* 2 * HtmlDocumentHandler.java 3 * 4 * Copyright (c) 1998-2001, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Cristian URSU, 12/June/2000 12 * 13 * $Id: HtmlDocumentHandler.java,v 1.27 2001/11/08 17:23:32 cursu Exp $ 14 */ 15 16 package gate.html; 17 18 import javax.swing.text.html.*; 19 import javax.swing.text.html.parser.*; 20 import javax.swing.text.html.HTMLEditorKit.*; 21 import javax.swing.text.*; 22 23 import java.util.*; 24 25 import gate.corpora.*; 26 import gate.util.*; 27 import gate.*; 28 import gate.event.*; 29 30 31 /** Implements the behaviour of the HTML reader. 32 * Methods of an object of this class are called by the HTML parser when 33 * events will appear. 34 * The idea is to parse the HTML document and construct Gate annotations 35 * objects. 36 * This class also will replace the content of the Gate document with a 37 * new one containing anly text from the HTML document. 38 */ 39 public class HtmlDocumentHandler extends ParserCallback { 40 41 /** Debug flag */ 42 private static final boolean DEBUG = false; 43 44 /** Constructor initialises all the private memeber data. 45 * This will use the default annotation set taken from the gate document. 46 * @param aDocument The gate document that will be processed 47 * @param aMarkupElementsMap The map containing the elements that will 48 * transform into annotations 49 */ 50 public HtmlDocumentHandler(gate.Document aDocument, Map aMarkupElementsMap) { 51 this(aDocument,aMarkupElementsMap,null); 52 } 53 54 /** Constructor initialises all the private memeber data 55 * @param aDocument The gate document that will be processed 56 * @param aMarkupElementsMap The map containing the elements that will 57 * transform into annotations 58 * @param anAnnoatationSet The annotation set that will contain annotations 59 * resulted from the processing of the gate document 60 */ 61 public HtmlDocumentHandler(gate.Document aDocument, 62 Map aMarkupElementsMap, 63 gate.AnnotationSet anAnnotationSet) { 64 // init stack 65 stack = new java.util.Stack(); 66 67 // this string contains the plain text (the text without markup) 68 tmpDocContent = new StringBuffer(aDocument.getContent().size().intValue()); 69 70 // colector is used later to transform all custom objects into 71 // annotation objects 72 colector = new LinkedList(); 73 74 // the Gate document 75 doc = aDocument; 76 77 // this map contains the elements name that we want to create 78 // if it's null all the elements from the XML documents will be transformed 79 // into Gate annotation objects 80 markupElementsMap = aMarkupElementsMap; 81 82 // init an annotation set for this gate document 83 basicAS = anAnnotationSet; 84 85 customObjectsId = 0; 86 }//HtmlDocumentHandler 87 88 /** This method is called when the HTML parser encounts the beginning 89 * of a tag that means that the tag is paired by an end tag and it's 90 * not an empty one. 91 */ 92 public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) { 93 // Fire the status listener if the elements processed exceded the rate 94 if (0 == (++elements % ELEMENTS_RATE)) 95 fireStatusChangedEvent("Processed elements : " + elements); 96 97 // Construct a feature map from the attributes list 98 FeatureMap fm = Factory.newFeatureMap(); 99 100 // Take all the attributes an put them into the feature map 101 if (0 != a.getAttributeCount()){ 102 Enumeration enum = a.getAttributeNames(); 103 while (enum.hasMoreElements()){ 104 Object attribute = enum.nextElement(); 105 fm.put(attribute.toString(),(a.getAttribute(attribute)).toString()); 106 }// while 107 }// if 108 109 // Just analize the tag t and add some\n chars and spaces to the 110 // tmpDocContent.The reason behind is that we need to have a readable form 111 // for the final document. 112 customizeAppearanceOfDocumentWithStartTag(t); 113 114 // If until here the "tmpDocContent" ends with a NON whitespace char, 115 // then we add a space char before calculating the START index of this 116 // tag. 117 // This is done in order not to concatenate the content of two separate tags 118 // and obtain a different NEW word. 119 int tmpDocContentSize = tmpDocContent.length(); 120 if ( tmpDocContentSize != 0 && 121 !Character.isWhitespace(tmpDocContent.charAt(tmpDocContentSize - 1)) 122 ) tmpDocContent.append(" "); 123 124 // create the start index of the annotation 125 Long startIndex = new Long(tmpDocContent.length()); 126 127 // initialy the start index is equal with the End index 128 CustomObject obj = new CustomObject(t.toString(),fm,startIndex,startIndex); 129 130 // put it into the stack 131 stack.push (obj); 132 133 }//handleStartTag 134 135 /** This method is called when the HTML parser encounts the end of a tag 136 * that means that the tag is paired by a beginning tag 137 */ 138 public void handleEndTag(HTML.Tag t, int pos){ 139 // obj is for internal use 140 CustomObject obj = null; 141 142 // If the stack is not empty then we get the object from the stack 143 if (!stack.isEmpty()){ 144 obj = (CustomObject) stack.pop(); 145 // Before adding it to the colector, we need to check if is an 146 // emptyAndSpan one. See CustomObject's isEmptyAndSpan field. 147 if (obj.getStart().equals(obj.getEnd())){ 148 // The element had an end tag and its start was equal to its end. Hence 149 // it is anEmptyAndSpan one. 150 obj.getFM().put("isEmptyAndSpan","true"); 151 }// End iff 152 // we add it to the colector 153 colector.add(obj); 154 }// End if 155 156 // If element has text between, then customize its apearance 157 if ( obj != null && 158 obj.getStart().longValue() != obj.getEnd().longValue() 159 ) 160 // Customize the appearance of the document 161 customizeAppearanceOfDocumentWithEndTag(t); 162 163 // if t is the </HTML> tag then we reached the end of theHTMLdocument 164 if (t == HTML.Tag.HTML){ 165 // replace the old content with the new one 166 doc.setContent (new DocumentContentImpl(tmpDocContent.toString())); 167 168 // If basicAs is null then get the default annotation 169 // set from this gate document 170 if (basicAS == null) 171 basicAS = doc.getAnnotations( 172 GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME); 173 174 // sort colector ascending on its id 175 Collections.sort(colector); 176 // iterate through colector and construct annotations 177 while (!colector.isEmpty()){ 178 obj = (CustomObject) colector.getFirst(); 179 colector.remove(obj); 180 // Construct an annotation from this obj 181 try{ 182 if (markupElementsMap == null){ 183 basicAS.add( obj.getStart(), 184 obj.getEnd(), 185 obj.getElemName(), 186 obj.getFM() 187 ); 188 }else{ 189 String annotationType = 190 (String) markupElementsMap.get(obj.getElemName()); 191 if (annotationType != null) 192 basicAS.add( obj.getStart(), 193 obj.getEnd(), 194 annotationType, 195 obj.getFM() 196 ); 197 } 198 }catch (InvalidOffsetException e){ 199 Err.prln("Error creating an annot :" + obj + " Discarded..."); 200 }// end try 201 // }// end if 202 }//while 203 204 // notify the listener about the total amount of elements that 205 // has been processed 206 fireStatusChangedEvent("Total elements : " + elements); 207 208 }//else 209 210 }//handleEndTag 211 212 /** This method is called when the HTML parser encounts an empty tag 213 */ 214 public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos){ 215 // fire the status listener if the elements processed exceded the rate 216 if ((++elements % ELEMENTS_RATE) == 0) 217 fireStatusChangedEvent("Processed elements : " + elements); 218 219 // construct a feature map from the attributes list 220 // these are empty elements 221 FeatureMap fm = Factory.newFeatureMap(); 222 223 // take all the attributes an put them into the feature map 224 if (0 != a.getAttributeCount ()){ 225 226 // Out.println("HAS attributes = " + a.getAttributeCount ()); 227 Enumeration enum = a.getAttributeNames (); 228 while (enum.hasMoreElements ()){ 229 Object attribute = enum.nextElement (); 230 fm.put ( attribute.toString(),(a.getAttribute(attribute)).toString()); 231 232 }//while 233 234 }//if 235 236 // create the start index of the annotation 237 Long startIndex = new Long(tmpDocContent.length()); 238 239 // initialy the start index is equal with the End index 240 CustomObject obj = new CustomObject(t.toString(),fm,startIndex,startIndex); 241 242 // we add the object directly into the colector 243 // we don't add it to the stack because this is an empty tag 244 colector.add(obj); 245 246 // Just analize the tag t and add some\n chars and spaces to the 247 // tmpDocContent.The reason behind is that we need to have a readable form 248 // for the final document. 249 customizeAppearanceOfDocumentWithSimpleTag(t); 250 251 } // handleSimpleTag 252 253 /** This method is called when the HTML parser encounts text (PCDATA) 254 */ 255 public void handleText(char[] text, int pos){ 256 // create a string object based on the reported text 257 String content = new String(text); 258 StringBuffer contentBuffer = new StringBuffer(""); 259 int tmpDocContentSize = tmpDocContent.length(); 260 boolean incrementStartIndex = false; 261 // If the first char of the text just read "text[0]" is NOT whitespace AND 262 // the last char of the tmpDocContent[SIZE-1] is NOT whitespace then 263 // concatenation "tmpDocContent + content" will result into a new different 264 // word... and we want to avoid that... 265 if ( tmpDocContentSize != 0 && 266 content.length() != 0 && 267 !Character.isWhitespace(content.charAt(0)) && 268 !Character.isWhitespace(tmpDocContent.charAt(tmpDocContentSize - 1))){ 269 270 contentBuffer.append(" "); 271 incrementStartIndex = true; 272 }// End if 273 // update the document content 274 contentBuffer.append(content); 275 // calculate the End index for all the elements of the stack 276 // the expression is : End index = Current doc length + text length 277 Long end = new Long(tmpDocContent.length() + contentBuffer.length()); 278 279 CustomObject obj = null; 280 // Iterate through stack to modify the End index of the existing elements 281 282 java.util.Iterator anIterator = stack.iterator(); 283 while (anIterator.hasNext ()){ 284 // get the object and move to the next one 285 obj = (CustomObject) anIterator.next (); 286 if (incrementStartIndex && obj.getStart().equals(obj.getEnd())){ 287 obj.setStart(new Long(obj.getStart().longValue() + 1)); 288 }// End if 289 // sets its End index 290 obj.setEnd(end); 291 }// End while 292 293 tmpDocContent.append(contentBuffer.toString()); 294 }// end handleText(); 295 296 /** This method analizes the tag t and adds some \n chars and spaces to the 297 * tmpDocContent.The reason behind is that we need to have a readable form 298 * for the final document. This method modifies the content of tmpDocContent. 299 * @param t the Html tag encounted by the HTML parser 300 */ 301 protected void customizeAppearanceOfDocumentWithSimpleTag(HTML.Tag t){ 302 boolean modification = false; 303 // if the HTML tag is BR then we add a new line character to the document 304 if (HTML.Tag.BR == t){ 305 tmpDocContent.append("\n"); 306 modification = true; 307 }// End if 308 if (modification == true){ 309 Long end = new Long (tmpDocContent.length()); 310 java.util.Iterator anIterator = stack.iterator(); 311 while (anIterator.hasNext ()){ 312 // get the object and move to the next one 313 CustomObject obj = (CustomObject) anIterator.next(); 314 // sets its End index 315 obj.setEnd(end); 316 }// End while 317 }//End if 318 }// customizeAppearanceOfDocumentWithSimpleTag 319 320 /** This method analizes the tag t and adds some \n chars and spaces to the 321 * tmpDocContent.The reason behind is that we need to have a readable form 322 * for the final document. This method modifies the content of tmpDocContent. 323 * @param t the Html tag encounted by the HTML parser 324 */ 325 protected void customizeAppearanceOfDocumentWithStartTag(HTML.Tag t){ 326 boolean modification = false; 327 if (HTML.Tag.P == t){ 328 int tmpDocContentSize = tmpDocContent.length(); 329 if ( tmpDocContentSize >= 2 && 330 '\n' != tmpDocContent.charAt(tmpDocContentSize - 2) 331 ) { tmpDocContent.append("\n"); modification = true;} 332 }// End if 333 if (modification == true){ 334 Long end = new Long (tmpDocContent.length()); 335 java.util.Iterator anIterator = stack.iterator(); 336 while (anIterator.hasNext ()){ 337 // get the object and move to the next one 338 CustomObject obj = (CustomObject) anIterator.next(); 339 // sets its End index 340 obj.setEnd(end); 341 }// End while 342 }//End if 343 }// customizeAppearanceOfDocumentWithStartTag 344 345 /** This method analizes the tag t and adds some \n chars and spaces to the 346 * tmpDocContent.The reason behind is that we need to have a readable form 347 * for the final document. This method modifies the content of tmpDocContent. 348 * @param t the Html tag encounted by the HTML parser 349 */ 350 protected void customizeAppearanceOfDocumentWithEndTag(HTML.Tag t){ 351 boolean modification = false; 352 // if the HTML tag is BR then we add a new line character to the document 353 if ( (HTML.Tag.P == t) || 354 355 (HTML.Tag.H1 == t) || 356 (HTML.Tag.H2 == t) || 357 (HTML.Tag.H3 == t) || 358 (HTML.Tag.H4 == t) || 359 (HTML.Tag.H5 == t) || 360 (HTML.Tag.H6 == t) || 361 (HTML.Tag.TR == t) || 362 (HTML.Tag.CENTER == t) || 363 (HTML.Tag.LI == t) 364 ){ tmpDocContent.append("\n"); modification = true;} 365 366 if (HTML.Tag.TITLE == t){ 367 tmpDocContent.append("\n\n"); 368 modification = true; 369 }// End if 370 371 if (modification == true){ 372 Long end = new Long (tmpDocContent.length()); 373 java.util.Iterator anIterator = stack.iterator(); 374 while (anIterator.hasNext ()){ 375 // get the object and move to the next one 376 CustomObject obj = (CustomObject) anIterator.next(); 377 // sets its End index 378 obj.setEnd(end); 379 }// End while 380 }//End if 381 }// customizeAppearanceOfDocumentWithEndTag 382 383 /** 384 * This method is called when the HTML parser encounts an error 385 * it depends on the programmer if he wants to deal with that error 386 */ 387 public void handleError(String errorMsg, int pos) { 388 //Out.println ("ERROR CALLED : " + errorMsg); 389 } 390 391 /** This method is called once, when the HTML parser reaches the end 392 * of its input streamin order to notify the parserCallback that there 393 * is nothing more to parse. 394 */ 395 public void flush() throws BadLocationException{ 396 }// flush 397 398 /** This method is called when the HTML parser encounts a comment 399 */ 400 public void handleComment(char[] text, int pos) { 401 } 402 403 //StatusReporter Implementation 404 405 public void addStatusListener(StatusListener listener) { 406 myStatusListeners.add(listener); 407 } 408 409 public void removeStatusListener(StatusListener listener) { 410 myStatusListeners.remove(listener); 411 } 412 413 protected void fireStatusChangedEvent(String text) { 414 Iterator listenersIter = myStatusListeners.iterator(); 415 while(listenersIter.hasNext()) 416 ((StatusListener)listenersIter.next()).statusChanged(text); 417 } 418 419 /** 420 * This method verifies if data contained by the CustomObject can be used 421 * to create a GATE annotation. 422 */ 423 /* private boolean canCreateAnnotation(CustomObject aCustomObject){ 424 long start = aCustomObject.getStart().longValue(); 425 long end = aCustomObject.getEnd().longValue(); 426 long gateDocumentSize = doc.getContent().size().longValue(); 427 428 if (start < 0 || end < 0 ) return false; 429 if (start > end ) return false; 430 if ((start > gateDocumentSize) || (end > gateDocumentSize)) return false; 431 return true; 432 }// canCreateAnnotation 433 */ 434 435 // HtmlDocumentHandler member data 436 437 // this constant indicates when to fire the status listener 438 // this listener will add an overhead and we don't want a big overhead 439 // this listener will be callled from ELEMENTS_RATE to ELEMENTS_RATE 440 final static int ELEMENTS_RATE = 128; 441 442 // this map contains the elements name that we want to create 443 // if it's null all the elements from the HTML documents will be transformed 444 // into Gate annotation objects otherwise only the elements it contains will 445 // be transformed 446 private Map markupElementsMap = null; 447 448 // the content of the HTML document, without any tag 449 // for internal use 450 private StringBuffer tmpDocContent = null; 451 452 // a stack used to remember elements and to keep the order 453 private java.util.Stack stack = null; 454 455 // a gate document 456 private gate.Document doc = null; 457 458 // an annotation set used for creating annotation reffering the doc 459 private gate.AnnotationSet basicAS; 460 461 // listeners for status report 462 protected List myStatusListeners = new LinkedList(); 463 464 // this reports the the number of elements that have beed processed so far 465 private int elements = 0; 466 467 protected long customObjectsId = 0; 468 // we need a colection to retain all the CustomObjects that will be 469 // transformed into annotation over the gate document... 470 // the transformation will take place inside onDocumentEnd() method 471 private LinkedList colector = null; 472 473 // Inner class 474 /** 475 * The objects belonging to this class are used inside the stack. 476 * This class is for internal needs 477 */ 478 class CustomObject implements Comparable { 479 480 // constructor 481 public CustomObject(String anElemName, FeatureMap aFm, 482 Long aStart, Long anEnd) { 483 elemName = anElemName; 484 fm = aFm; 485 start = aStart; 486 end = anEnd; 487 id = new Long(customObjectsId ++); 488 }// End CustomObject() 489 490 // Methos implemented as required by Comparable interface 491 public int compareTo(Object o){ 492 CustomObject obj = (CustomObject) o; 493 return this.id.compareTo(obj.getId()); 494 }// compareTo(); 495 496 // accesor 497 public String getElemName() { 498 return elemName; 499 }// getElemName() 500 501 public FeatureMap getFM() { 502 return fm; 503 }// getFM() 504 505 public Long getStart() { 506 return start; 507 }// getStart() 508 509 public Long getEnd() { 510 return end; 511 }// getEnd() 512 513 public Long getId(){ return id;} 514 515 // mutator 516 public void setElemName(String anElemName) { 517 elemName = anElemName; 518 }// getElemName() 519 520 public void setFM(FeatureMap aFm) { 521 fm = aFm; 522 }// setFM(); 523 524 public void setStart(Long aStart) { 525 start = aStart; 526 }// setStart(); 527 528 public void setEnd(Long anEnd) { 529 end = anEnd; 530 }// setEnd(); 531 532 // data fields 533 private String elemName = null; 534 private FeatureMap fm = null; 535 private Long start = null; 536 private Long end = null; 537 private Long id = null; 538 539 } // End inner class CustomObject 540 541 }//End class HtmlDocumentHandler 542 543 544 545
|
HtmlDocumentHandler |
|