1   /*
2    *  HtmlDocumentHandler.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Cristian URSU,  12/June/2000
12   *
13   *  $Id: HtmlDocumentHandler.java,v 1.27 2001/11/08 17:23:32 cursu Exp $
14   */
15  
16  package gate.html;
17  
18  import javax.swing.text.html.*;
19  import javax.swing.text.html.parser.*;
20  import javax.swing.text.html.HTMLEditorKit.*;
21  import javax.swing.text.*;
22  
23  import java.util.*;
24  
25  import gate.corpora.*;
26  import gate.util.*;
27  import gate.*;
28  import gate.event.*;
29  
30  
31  /** Implements the behaviour of the HTML reader.
32    * Methods of an object of this class are called by the HTML parser when
33    * events will appear.
34    * The idea is to parse the HTML document and construct Gate annotations
35    * objects.
36    * This class also will replace the content of the Gate document with a
37    * new one containing anly text from the HTML document.
38    */
39  public class HtmlDocumentHandler extends ParserCallback {
40  
41    /** Debug flag */
42    private static final boolean DEBUG = false;
43  
44    /** Constructor initialises all the private memeber data.
45      * This will use the default annotation set taken from the gate document.
46      * @param aDocument The gate document that will be processed
47      * @param aMarkupElementsMap The map containing the elements that will
48      * transform into annotations
49      */
50    public HtmlDocumentHandler(gate.Document aDocument, Map aMarkupElementsMap) {
51      this(aDocument,aMarkupElementsMap,null);
52    }
53  
54    /** Constructor initialises all the private memeber data
55      * @param aDocument The gate document that will be processed
56      * @param aMarkupElementsMap The map containing the elements that will
57      * transform into annotations
58      * @param anAnnoatationSet The annotation set that will contain annotations
59      * resulted from the processing of the gate document
60      */
61    public HtmlDocumentHandler(gate.Document       aDocument,
62                               Map                 aMarkupElementsMap,
63                               gate.AnnotationSet  anAnnotationSet) {
64      // init stack
65      stack = new java.util.Stack();
66  
67      // this string contains the plain text (the text without markup)
68      tmpDocContent = new StringBuffer(aDocument.getContent().size().intValue());
69  
70      // colector is used later to transform all custom objects into
71      // annotation objects
72      colector = new LinkedList();
73  
74      // the Gate document
75      doc = aDocument;
76  
77      // this map contains the elements name that we want to create
78      // if it's null all the elements from the XML documents will be transformed
79      // into Gate annotation objects
80      markupElementsMap = aMarkupElementsMap;
81  
82      // init an annotation set for this gate document
83      basicAS = anAnnotationSet;
84  
85      customObjectsId = 0;
86    }//HtmlDocumentHandler
87  
88    /** This method is called when the HTML parser encounts the beginning
89      * of a tag that means that the tag is paired by an end tag and it's
90      * not an empty one.
91      */
92    public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
93      // Fire the status listener if the elements processed exceded the rate
94      if (0 == (++elements % ELEMENTS_RATE))
95        fireStatusChangedEvent("Processed elements : " + elements);
96  
97      // Construct a feature map from the attributes list
98      FeatureMap fm = Factory.newFeatureMap();
99  
100     // Take all the attributes an put them into the feature map
101     if (0 != a.getAttributeCount()){
102       Enumeration enum = a.getAttributeNames();
103       while (enum.hasMoreElements()){
104         Object attribute = enum.nextElement();
105         fm.put(attribute.toString(),(a.getAttribute(attribute)).toString());
106       }// while
107     }// if
108 
109     // Just analize the tag t and add some\n chars and spaces to the
110     // tmpDocContent.The reason behind is that we need to have a readable form
111     // for the final document.
112     customizeAppearanceOfDocumentWithStartTag(t);
113 
114     // If until here the "tmpDocContent" ends with a NON whitespace char,
115     // then we add a space char before calculating the START index of this
116     // tag.
117     // This is done in order not to concatenate the content of two separate tags
118     // and obtain a different NEW word.
119     int tmpDocContentSize = tmpDocContent.length();
120     if ( tmpDocContentSize != 0 &&
121          !Character.isWhitespace(tmpDocContent.charAt(tmpDocContentSize - 1))
122        ) tmpDocContent.append(" ");
123 
124     // create the start index of the annotation
125     Long startIndex = new Long(tmpDocContent.length());
126 
127     // initialy the start index is equal with the End index
128     CustomObject obj = new CustomObject(t.toString(),fm,startIndex,startIndex);
129 
130     // put it into the stack
131     stack.push (obj);
132 
133   }//handleStartTag
134 
135    /** This method is called when the HTML parser encounts the end of a tag
136      * that means that the tag is paired by a beginning tag
137      */
138   public void handleEndTag(HTML.Tag t, int pos){
139     // obj is for internal use
140     CustomObject obj = null;
141 
142     // If the stack is not empty then we get the object from the stack
143     if (!stack.isEmpty()){
144       obj = (CustomObject) stack.pop();
145       // Before adding it to the colector, we need to check if is an
146       // emptyAndSpan one. See CustomObject's isEmptyAndSpan field.
147       if (obj.getStart().equals(obj.getEnd())){
148         // The element had an end tag and its start was equal to its end. Hence
149         // it is anEmptyAndSpan one.
150         obj.getFM().put("isEmptyAndSpan","true");
151       }// End iff
152       // we add it to the colector
153       colector.add(obj);
154     }// End if
155 
156     // If element has text between, then customize its apearance
157     if ( obj != null &&
158          obj.getStart().longValue() != obj.getEnd().longValue()
159        )
160       // Customize the appearance of the document
161       customizeAppearanceOfDocumentWithEndTag(t);
162 
163     // if t is the </HTML> tag then we reached the end of theHTMLdocument
164     if (t == HTML.Tag.HTML){
165       // replace the old content with the new one
166       doc.setContent (new DocumentContentImpl(tmpDocContent.toString()));
167 
168       // If basicAs is null then get the default annotation
169       // set from this gate document
170       if (basicAS == null)
171         basicAS = doc.getAnnotations(
172                                 GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
173 
174       // sort colector ascending on its id
175       Collections.sort(colector);
176       // iterate through colector and construct annotations
177       while (!colector.isEmpty()){
178         obj = (CustomObject) colector.getFirst();
179         colector.remove(obj);
180           // Construct an annotation from this obj
181           try{
182             if (markupElementsMap == null){
183                basicAS.add( obj.getStart(),
184                             obj.getEnd(),
185                             obj.getElemName(),
186                             obj.getFM()
187                            );
188             }else{
189               String annotationType =
190                      (String) markupElementsMap.get(obj.getElemName());
191               if (annotationType != null)
192                  basicAS.add( obj.getStart(),
193                               obj.getEnd(),
194                               annotationType,
195                               obj.getFM()
196                              );
197             }
198           }catch (InvalidOffsetException e){
199               Err.prln("Error creating an annot :" + obj + " Discarded...");
200           }// end try
201 //        }// end if
202       }//while
203 
204       // notify the listener about the total amount of elements that
205       // has been processed
206       fireStatusChangedEvent("Total elements : " + elements);
207 
208     }//else
209 
210   }//handleEndTag
211 
212   /** This method is called when the HTML parser encounts an empty tag
213     */
214   public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos){
215     // fire the status listener if the elements processed exceded the rate
216     if ((++elements % ELEMENTS_RATE) == 0)
217       fireStatusChangedEvent("Processed elements : " + elements);
218 
219     // construct a feature map from the attributes list
220     // these are empty elements
221     FeatureMap fm = Factory.newFeatureMap();
222 
223     // take all the attributes an put them into the feature map
224     if (0 != a.getAttributeCount ()){
225 
226        // Out.println("HAS  attributes = " + a.getAttributeCount ());
227         Enumeration enum = a.getAttributeNames ();
228         while (enum.hasMoreElements ()){
229           Object attribute = enum.nextElement ();
230           fm.put ( attribute.toString(),(a.getAttribute(attribute)).toString());
231 
232         }//while
233 
234     }//if
235 
236     // create the start index of the annotation
237     Long startIndex = new Long(tmpDocContent.length());
238 
239     // initialy the start index is equal with the End index
240     CustomObject obj = new CustomObject(t.toString(),fm,startIndex,startIndex);
241 
242     // we add the object directly into the colector
243     // we don't add it to the stack because this is an empty tag
244     colector.add(obj);
245 
246     // Just analize the tag t and add some\n chars and spaces to the
247     // tmpDocContent.The reason behind is that we need to have a readable form
248     // for the final document.
249     customizeAppearanceOfDocumentWithSimpleTag(t);
250 
251   } // handleSimpleTag
252 
253   /** This method is called when the HTML parser encounts text (PCDATA)
254     */
255   public void handleText(char[] text, int pos){
256     // create a string object based on the reported text
257     String content = new String(text);
258     StringBuffer contentBuffer = new StringBuffer("");
259     int tmpDocContentSize = tmpDocContent.length();
260     boolean incrementStartIndex = false;
261     // If the first char of the text just read "text[0]" is NOT whitespace AND
262     // the last char of the tmpDocContent[SIZE-1] is NOT whitespace then
263     // concatenation "tmpDocContent + content" will result into a new different
264     // word... and we want to avoid that...
265     if ( tmpDocContentSize != 0 &&
266          content.length() != 0 &&
267          !Character.isWhitespace(content.charAt(0)) &&
268          !Character.isWhitespace(tmpDocContent.charAt(tmpDocContentSize - 1))){
269 
270             contentBuffer.append(" ");
271             incrementStartIndex = true;
272     }// End if
273     // update the document content
274     contentBuffer.append(content);
275     // calculate the End index for all the elements of the stack
276     // the expression is : End index = Current doc length + text length
277     Long end = new Long(tmpDocContent.length() + contentBuffer.length());
278 
279     CustomObject obj = null;
280     // Iterate through stack to modify the End index of the existing elements
281 
282     java.util.Iterator anIterator = stack.iterator();
283     while (anIterator.hasNext ()){
284       // get the object and move to the next one
285       obj = (CustomObject) anIterator.next ();
286       if (incrementStartIndex && obj.getStart().equals(obj.getEnd())){
287         obj.setStart(new Long(obj.getStart().longValue() + 1));
288       }// End if
289       // sets its End index
290       obj.setEnd(end);
291     }// End while
292 
293     tmpDocContent.append(contentBuffer.toString());
294   }// end handleText();
295 
296   /** This method analizes the tag t and adds some \n chars and spaces to the
297     * tmpDocContent.The reason behind is that we need to have a readable form
298     * for the final document. This method modifies the content of tmpDocContent.
299     * @param t the Html tag encounted by the HTML parser
300     */
301   protected void customizeAppearanceOfDocumentWithSimpleTag(HTML.Tag t){
302     boolean modification = false;
303     // if the HTML tag is BR then we add a new line character to the document
304     if (HTML.Tag.BR == t){
305       tmpDocContent.append("\n");
306       modification = true;
307     }// End if
308     if (modification == true){
309       Long end = new Long (tmpDocContent.length());
310       java.util.Iterator anIterator = stack.iterator();
311       while (anIterator.hasNext ()){
312         // get the object and move to the next one
313         CustomObject obj = (CustomObject) anIterator.next();
314         // sets its End index
315         obj.setEnd(end);
316       }// End while
317     }//End if
318   }// customizeAppearanceOfDocumentWithSimpleTag
319 
320   /** This method analizes the tag t and adds some \n chars and spaces to the
321     * tmpDocContent.The reason behind is that we need to have a readable form
322     * for the final document. This method modifies the content of tmpDocContent.
323     * @param t the Html tag encounted by the HTML parser
324     */
325   protected void customizeAppearanceOfDocumentWithStartTag(HTML.Tag t){
326     boolean modification = false;
327     if (HTML.Tag.P == t){
328       int tmpDocContentSize = tmpDocContent.length();
329       if ( tmpDocContentSize >= 2 &&
330            '\n' != tmpDocContent.charAt(tmpDocContentSize - 2)
331          ) { tmpDocContent.append("\n"); modification = true;}
332     }// End if
333     if (modification == true){
334       Long end = new Long (tmpDocContent.length());
335       java.util.Iterator anIterator = stack.iterator();
336       while (anIterator.hasNext ()){
337         // get the object and move to the next one
338         CustomObject obj = (CustomObject) anIterator.next();
339         // sets its End index
340         obj.setEnd(end);
341       }// End while
342     }//End if
343   }// customizeAppearanceOfDocumentWithStartTag
344 
345   /** This method analizes the tag t and adds some \n chars and spaces to the
346     * tmpDocContent.The reason behind is that we need to have a readable form
347     * for the final document. This method modifies the content of tmpDocContent.
348     * @param t the Html tag encounted by the HTML parser
349     */
350   protected void customizeAppearanceOfDocumentWithEndTag(HTML.Tag t){
351     boolean modification = false;
352     // if the HTML tag is BR then we add a new line character to the document
353     if ( (HTML.Tag.P == t) ||
354 
355          (HTML.Tag.H1 == t) ||
356          (HTML.Tag.H2 == t) ||
357          (HTML.Tag.H3 == t) ||
358          (HTML.Tag.H4 == t) ||
359          (HTML.Tag.H5 == t) ||
360          (HTML.Tag.H6 == t) ||
361          (HTML.Tag.TR == t) ||
362          (HTML.Tag.CENTER == t) ||
363          (HTML.Tag.LI == t)
364        ){ tmpDocContent.append("\n"); modification = true;}
365 
366     if (HTML.Tag.TITLE == t){
367       tmpDocContent.append("\n\n");
368       modification = true;
369     }// End if
370 
371     if (modification == true){
372       Long end = new Long (tmpDocContent.length());
373       java.util.Iterator anIterator = stack.iterator();
374       while (anIterator.hasNext ()){
375         // get the object and move to the next one
376         CustomObject obj = (CustomObject) anIterator.next();
377         // sets its End index
378         obj.setEnd(end);
379       }// End while
380     }//End if
381   }// customizeAppearanceOfDocumentWithEndTag
382 
383   /**
384     * This method is called when the HTML parser encounts an error
385     * it depends on the programmer if he wants to deal with that error
386     */
387   public void handleError(String errorMsg, int pos) {
388     //Out.println ("ERROR CALLED : " + errorMsg);
389   }
390 
391   /** This method is called once, when the HTML parser reaches the end
392     * of its input streamin order to notify the parserCallback that there
393     * is nothing more to parse.
394     */
395   public void flush() throws BadLocationException{
396   }// flush
397 
398   /** This method is called when the HTML parser encounts a comment
399     */
400   public void handleComment(char[] text, int pos) {
401   }
402 
403   //StatusReporter Implementation
404 
405   public void addStatusListener(StatusListener listener) {
406     myStatusListeners.add(listener);
407   }
408 
409   public void removeStatusListener(StatusListener listener) {
410     myStatusListeners.remove(listener);
411   }
412 
413   protected void fireStatusChangedEvent(String text) {
414     Iterator listenersIter = myStatusListeners.iterator();
415     while(listenersIter.hasNext())
416       ((StatusListener)listenersIter.next()).statusChanged(text);
417   }
418 
419   /**
420     * This method verifies if data contained by the CustomObject can be used
421     * to create a GATE annotation.
422     */
423 /*  private boolean canCreateAnnotation(CustomObject aCustomObject){
424     long start            = aCustomObject.getStart().longValue();
425     long end              = aCustomObject.getEnd().longValue();
426     long gateDocumentSize = doc.getContent().size().longValue();
427 
428     if (start < 0 || end < 0 ) return false;
429     if (start > end ) return false;
430     if ((start > gateDocumentSize) || (end > gateDocumentSize)) return false;
431     return true;
432   }// canCreateAnnotation
433 */
434 
435   // HtmlDocumentHandler member data
436 
437   // this constant indicates when to fire the status listener
438   // this listener will add an overhead and we don't want a big overhead
439   // this listener will be callled from ELEMENTS_RATE to ELEMENTS_RATE
440   final static  int ELEMENTS_RATE = 128;
441 
442   // this map contains the elements name that we want to create
443   // if it's null all the elements from the HTML documents will be transformed
444   // into Gate annotation objects otherwise only the elements it contains will
445   // be transformed
446   private Map markupElementsMap = null;
447 
448   // the content of the HTML document, without any tag
449   // for internal use
450   private StringBuffer tmpDocContent = null;
451 
452   // a stack used to remember elements and to keep the order
453   private java.util.Stack stack = null;
454 
455   // a gate document
456   private gate.Document doc = null;
457 
458   // an annotation set used for creating annotation reffering the doc
459   private gate.AnnotationSet basicAS;
460 
461   // listeners for status report
462   protected List myStatusListeners = new LinkedList();
463 
464   // this reports the the number of elements that have beed processed so far
465   private int elements = 0;
466 
467   protected  long customObjectsId = 0;
468   // we need a colection to retain all the CustomObjects that will be
469   // transformed into annotation over the gate document...
470   // the transformation will take place inside onDocumentEnd() method
471   private LinkedList colector = null;
472 
473   // Inner class
474   /**
475     * The objects belonging to this class are used inside the stack.
476     * This class is for internal needs
477     */
478   class  CustomObject implements Comparable {
479 
480     // constructor
481     public CustomObject(String anElemName, FeatureMap aFm,
482                            Long aStart, Long anEnd) {
483       elemName = anElemName;
484       fm = aFm;
485       start = aStart;
486       end = anEnd;
487       id = new Long(customObjectsId ++);
488     }// End CustomObject()
489 
490     // Methos implemented as required by Comparable interface
491     public int compareTo(Object o){
492       CustomObject obj = (CustomObject) o;
493       return this.id.compareTo(obj.getId());
494     }// compareTo();
495 
496     // accesor
497     public String getElemName() {
498       return elemName;
499     }// getElemName()
500 
501     public FeatureMap getFM() {
502       return fm;
503     }// getFM()
504 
505     public Long getStart() {
506       return start;
507     }// getStart()
508 
509     public Long getEnd() {
510       return end;
511     }// getEnd()
512 
513     public Long getId(){ return id;}
514 
515     // mutator
516     public void setElemName(String anElemName) {
517       elemName = anElemName;
518     }// getElemName()
519 
520     public void setFM(FeatureMap aFm) {
521       fm = aFm;
522     }// setFM();
523 
524     public void setStart(Long aStart) {
525       start = aStart;
526     }// setStart();
527 
528     public void setEnd(Long anEnd) {
529       end = anEnd;
530     }// setEnd();
531 
532     // data fields
533     private String elemName = null;
534     private FeatureMap fm = null;
535     private Long start = null;
536     private Long end  = null;
537     private Long id = null;
538 
539   } // End inner class CustomObject
540 
541 }//End class HtmlDocumentHandler
542 
543 
544 
545