1   /*
2    *  XmlDocumentHandler.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Cristian URSU,  9 May 2000
12   *
13   *  $Id: XmlDocumentHandler.java,v 1.35 2001/11/08 17:23:34 cursu Exp $
14   */
15  
16  package gate.xml;
17  
18  import java.util.*;
19  
20  import gate.corpora.*;
21  import gate.util.*;
22  import gate.*;
23  import gate.event.*;
24  
25  
26  import org.xml.sax.*;
27  import org.xml.sax.helpers.*;
28  
29  
30  /**
31    * Implements the behaviour of the XML reader
32    * Methods of an object of this class are called by the SAX parser when
33    * events will appear.
34    * The idea is to parse the XML document and construct Gate annotations
35    * objects.
36    * This class also will replace the content of the Gate document with a
37    * new one containing anly text from the XML document.
38    */
39  public class XmlDocumentHandler extends DefaultHandler{
40    /** Debug flag */
41    private static final boolean DEBUG = false;
42  
43    /**
44      * Constructs a XmlDocumentHandler object. The annotationSet set will be the
45      * default one taken from the gate document.
46      * @param aDocument the Gate document that will be processed.
47      * @param aMarkupElementsMap this map contains the elements name that we
48      * want to create.
49      * @param anElement2StringMap this map contains the strings that will be
50      * added to the text contained by the key element.
51      */
52    public XmlDocumentHandler(gate.Document aDocument, Map  aMarkupElementsMap,
53                              Map anElement2StringMap){
54      this(aDocument,aMarkupElementsMap,anElement2StringMap,null);
55    } // XmlDocumentHandler
56  
57    /**
58      * Constructs a XmlDocumentHandler object.
59      * @param aDocument the Gate document that will be processed.
60      * @param aMarkupElementsMap this map contains the elements name that we
61      * want to create.
62      * @param anElement2StringMap this map contains the strings that will be
63      * added to the text contained by the key element.
64      * @param anAnnotationSet is the annotation set that will be filled when the
65      * document was processed
66      */
67    public XmlDocumentHandler(gate.Document       aDocument,
68                              Map                 aMarkupElementsMap,
69                              Map                 anElement2StringMap,
70                              gate.AnnotationSet  anAnnotationSet){
71      // init stack
72      stack = new java.util.Stack();
73  
74      // this string contains the plain text (the text without markup)
75      tmpDocContent = new StringBuffer(aDocument.getContent().size().intValue());
76  
77      // colector is used later to transform all custom objects into annotation
78      // objects
79      colector = new LinkedList();
80  
81      // the Gate document
82      doc = aDocument;
83  
84      // this map contains the elements name that we want to create
85      // if it's null all the elements from the XML documents will be transformed
86      // into Gate annotation objects
87      markupElementsMap = aMarkupElementsMap;
88  
89      // this map contains the string that we want to insert iside the document
90      // content, when a certain element is found
91      // if the map is null then no string is added
92      element2StringMap = anElement2StringMap;
93  
94      basicAS = anAnnotationSet;
95      customObjectsId = 0;
96    }// XmlDocumentHandler()/
97  
98    /**
99      * This method is called when the SAX parser encounts the beginning of the
100     * XML document.
101     */
102   public void startDocument() throws org.xml.sax.SAXException {
103   }
104 
105   /**
106     * This method is called when the SAX parser encounts the end of the
107     * XML document.
108     * Here we set the content of the gate Document to be the one generated
109     * inside this class (tmpDocContent).
110     * After that we use the colector to generate all the annotation reffering
111     * this new gate document.
112     */
113   public void endDocument() throws org.xml.sax.SAXException {
114 
115     // replace the document content with the one without markups
116     doc.setContent(new DocumentContentImpl(tmpDocContent.toString()));
117 
118     // fire the status listener
119     fireStatusChangedEvent("Total elements: " + elements);
120 
121     // If basicAs is null then get the default AnnotationSet,
122     // based on the gate document.
123     if (basicAS == null)
124       basicAS=doc.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
125 
126     // sort colector ascending on its id
127     Collections.sort(colector);
128     Set testIdsSet = new HashSet();
129     // create all the annotations (on this new document) from the collector
130     while (!colector.isEmpty()){
131       CustomObject obj = (CustomObject) colector.getFirst();
132       // Test to see if there are two annotation objects with the same id.
133       if (testIdsSet.contains(obj.getId())){
134         throw new GateSaxException("Found two annotations with the same Id("+
135         obj.getId()+
136         ").The document is inconsistent.");
137       }else{
138         testIdsSet.add(obj.getId());
139       }// End iff
140       // create a new annotation and add it to the annotation set
141       try{
142         // the annotation type will be conforming with markupElementsMap
143         //add the annotation to the Annotation Set
144         if (markupElementsMap == null)
145           basicAS.add(  obj.getId(),
146                         obj.getStart(),
147                         obj.getEnd(),
148                         obj.getElemName(),
149                         obj.getFM ());
150         else {
151           // get the type of the annotation from Map
152           String annotationType = (String)
153                                 markupElementsMap.get(obj.getElemName());
154           if (annotationType != null)
155             basicAS.add( obj.getId(),
156                          obj.getStart(),
157                          obj.getEnd(),
158                          annotationType,
159                          obj.getFM());
160         }// End if
161       }catch (gate.util.InvalidOffsetException e){
162         Err.prln("InvalidOffsetException for annot :" + obj.getElemName() +
163          " with Id =" + obj.getId() + ". Discarded...");
164       }// End try
165       colector.remove(obj);
166     }// End while
167   }// endDocument();
168 
169   /**
170     * This method is called when the SAX parser encounts the beginning of an
171     * XML element.
172     */
173   public void startElement (String uri, String qName, String elemName,
174                                                              Attributes atts){
175     // Inform the progress listener to fire only if no of elements processed
176     // so far is a multiple of ELEMENTS_RATE
177     if ((++elements % ELEMENTS_RATE) == 0)
178         fireStatusChangedEvent("Processed elements : " + elements);
179 
180     Integer customObjectId = null;
181     // Construct a SimpleFeatureMapImpl from the list of attributes
182     FeatureMap fm = Factory.newFeatureMap();
183     //Get the name and the value of the attributes and add them to a FeaturesMAP
184     for (int i = 0; i < atts.getLength(); i++) {
185       String attName  = atts.getLocalName(i);
186       String attValue = atts.getValue(i);
187       String attUri =   atts.getURI(i);
188       if (attUri != null && Gate.URI.equals(attUri)){
189         if ("gateId".equals(attName)){
190           customObjectId = new Integer(attValue);
191         }// End if
192         if ("annotMaxId".equals(attName)){
193           customObjectsId = new Integer(attValue).intValue();
194         }// End if
195         if ("matches".equals(attName)){
196           StringTokenizer strTokenizer = new StringTokenizer(attValue,";");
197           List list = new ArrayList();
198           // Take all tokens,create Integers and add them to the list
199           while (strTokenizer.hasMoreTokens()){
200             String token = strTokenizer.nextToken();
201             list.add(new Integer(token));
202           }// End while
203           fm.put(attName,list);
204         }// End if
205       }else{
206         fm.put(attName,attValue);
207       }// End if
208     }// End for
209 
210     // create the START index of the annotation
211     Long startIndex = new Long(tmpDocContent.length());
212 
213     // initialy the Start index is equal with End index
214     CustomObject obj = new CustomObject(customObjectId,elemName,fm,
215                                                  startIndex, startIndex);
216 
217     // put this object into the stack
218     stack.push(obj);
219   }// startElement();
220 
221   /**
222     * This method is called when the SAX parser encounts the end of an
223     * XML element.
224     * Here we extract
225     */
226   public void endElement (String uri, String qName, String elemName )
227                                                          throws SAXException{
228     // obj is for internal use
229     CustomObject obj = null;
230 
231     // if the stack is not empty, we extract the custom object and delete it
232     if (!stack.isEmpty ()){
233       obj = (CustomObject) stack.pop();
234     }// End if
235 
236     // Before adding it to the colector, we need to check if is an
237     // emptyAndSpan one. See CustomObject's isEmptyAndSpan field.
238     if (obj.getStart().equals(obj.getEnd())){
239       // The element had an end tag and its start was equal to its end. Hence
240       // it is anEmptyAndSpan one.
241       obj.getFM().put("isEmptyAndSpan","true");
242     }// End iff
243 
244     // Put the object into colector
245     // Later, when the document ends we will use colector to create all the
246     // annotations
247     colector.add(obj);
248 
249     // if element is found on Element2String map, then add the string to the
250     // end of the document content
251     if (element2StringMap != null){
252       String stringFromMap = null;
253 
254       // test to see if element is inside the map
255       // if it is then get the string value and add it to the document content
256       stringFromMap = (String) element2StringMap.get(elemName);
257       if (stringFromMap != null)
258           tmpDocContent.append(stringFromMap);
259     }// End if
260   }// endElement();
261 
262   /**
263     * This method is called when the SAX parser encounts text in the XML doc.
264     * Here we calculate the end indices for all the elements present inside the
265     * stack and update with the new values. For entities, this method is called
266     * separatley regardless of the text sourinding the entity.
267     */
268   public void characters( char[] text,int start,int length) throws SAXException{
269     // create a string object based on the reported text
270     String content = new String(text, start, length);
271     StringBuffer contentBuffer = new StringBuffer("");
272     int tmpDocContentSize = tmpDocContent.length();
273     boolean incrementStartIndex = false;
274     // If the first char of the text just read "text[0]" is NOT whitespace AND
275     // the last char of the tmpDocContent[SIZE-1] is NOT whitespace then
276     // concatenation "tmpDocContent + content" will result into a new different
277     // word... and we want to avoid that, because the tokenizer, gazetter and
278     // Jape work on the raw text and concatenating tokens might be not good.
279     if ( tmpDocContentSize != 0 &&
280          content.length() != 0 &&
281          !Character.isWhitespace(content.charAt(0)) &&
282          !Character.isWhitespace(tmpDocContent.charAt(tmpDocContentSize - 1))){
283 
284          // If we are here it means that a concatenation between the last
285          // token in the tmpDocContent and the content(which doesn't start
286          // with a white space) will be performed. In order to prevent this,
287          // we will add a " " space char in order to assure taht the 2 tokens
288          // stay apart. Howerver we will except from this rule the most known
289          // internal entities like &, <, >, etc
290          if (
291               (
292                  // Testing the length against 1 makes it more likely that
293                  // an internal entity was called. characters() gets called for
294                  // each entity separately.
295                  (content.length() == 1)
296                   &&
297                  (content.charAt(0) == '&' ||
298                   content.charAt(0) == '<' ||
299                   content.charAt(0) == '>' ||
300                   content.charAt(0) == '"' ||
301                   content.charAt(0) == '\''
302                   )
303                ) ||
304                (tmpDocContent.charAt(tmpDocContentSize - 1) == '&' ||
305                 tmpDocContent.charAt(tmpDocContentSize - 1) == '<' ||
306                 tmpDocContent.charAt(tmpDocContentSize - 1) == '>' ||
307                 tmpDocContent.charAt(tmpDocContentSize - 1) == '"' ||
308                 tmpDocContent.charAt(tmpDocContentSize - 1) == '\''
309                )){// do nothing. The content will be appended
310          }else{
311             // In all other cases append " "
312             contentBuffer.append(" ");
313             incrementStartIndex = true;
314         }// End if
315     }// End if
316     // update the document content
317     contentBuffer.append(content);
318     // calculate the End index for all the elements of the stack
319     // the expression is : End index = Current doc length + text length
320     Long end = new Long(tmpDocContent.length() + contentBuffer.length());
321 
322     CustomObject obj = null;
323     // Iterate through stack to modify the End index of the existing elements
324 
325     java.util.Iterator anIterator = stack.iterator();
326     while (anIterator.hasNext ()){
327       // get the object and move to the next one
328       obj = (CustomObject) anIterator.next ();
329       if (incrementStartIndex && obj.getStart().equals(obj.getEnd())){
330         obj.setStart(new Long(obj.getStart().longValue() + 1));
331       }// End if
332       // sets its End index
333       obj.setEnd(end);
334     }// End while
335 
336     tmpDocContent.append(contentBuffer.toString());
337   }// characters();
338 
339   /**
340     * This method is called when the SAX parser encounts white spaces
341     */
342   public void ignorableWhitespace(char ch[],int start,int length) throws
343                                                                    SAXException{
344 
345     // internal String object
346     String  text = new String(ch, start, length);
347     // if the last character in tmpDocContent is \n and the read whitespace is
348     // \n then don't add it to tmpDocContent...
349 
350     if (tmpDocContent.length () != 0)
351       if (tmpDocContent.charAt (tmpDocContent.length () - 1) != '\n' ||
352         !text.equalsIgnoreCase("\n")
353       )
354          tmpDocContent.append(text);
355   }
356 
357   /**
358     * Error method.We deal with this exception inside SimpleErrorHandler class
359     */
360   public void error(SAXParseException ex) throws SAXException {
361     // deal with a SAXParseException
362     // see SimpleErrorhandler class
363     _seh.error(ex);
364   }
365 
366   /**
367     * FatalError method.
368     */
369   public void fatalError(SAXParseException ex) throws SAXException {
370     // deal with a SAXParseException
371     // see SimpleErrorhandler class
372     _seh.fatalError(ex);
373   }
374 
375   /**
376     * Warning method comment.
377     */
378   public void warning(SAXParseException ex) throws SAXException {
379     // deal with a SAXParseException
380     // see SimpleErrorhandler class
381     _seh.warning(ex);
382   }
383 
384   /**
385     * This method is called when the SAX parser encounts a comment
386     * It works only if the XmlDocumentHandler implements a
387     * com.sun.parser.LexicalEventListener
388     */
389   public void comment(String text) throws SAXException {
390     // create a FeatureMap and then add the comment to the annotation set.
391     /*
392     gate.util.SimpleFeatureMapImpl fm = new gate.util.SimpleFeatureMapImpl();
393     fm.put ("text_comment",text);
394     Long node = new Long (tmpDocContent.length());
395     CustomObject anObject = new CustomObject("Comment",fm,node,node);
396     colector.add(anObject);
397     */
398   }
399 
400   /**
401     * This method is called when the SAX parser encounts a start of a CDATA
402     * section
403     * It works only if the XmlDocumentHandler implements a
404     * com.sun.parser.LexicalEventListener
405     */
406   public void startCDATA()throws SAXException {
407   }
408 
409   /**
410     * This method is called when the SAX parser encounts the end of a CDATA
411     * section.
412     * It works only if the XmlDocumentHandler implements a
413     * com.sun.parser.LexicalEventListener
414     */
415   public void endCDATA() throws SAXException {
416   }
417 
418   /**
419     * This method is called when the SAX parser encounts a parsed Entity
420     * It works only if the XmlDocumentHandler implements a
421     * com.sun.parser.LexicalEventListener
422     */
423   public void startParsedEntity(String name) throws SAXException {
424   }
425 
426   /**
427     * This method is called when the SAX parser encounts a parsed entity and
428     * informs the application if that entity was parsed or not
429     * It's working only if the CustomDocumentHandler implements a
430     *  com.sun.parser.LexicalEventListener
431     */
432   public void endParsedEntity(String name, boolean included)throws SAXException{
433   }
434 
435   //StatusReporter Implementation
436 
437   /**
438     * This methos is called when a listener is registered with this class
439     */
440   public void addStatusListener(StatusListener listener){
441     myStatusListeners.add(listener);
442   }
443   /**
444     * This methos is called when a listener is removed
445     */
446   public void removeStatusListener(StatusListener listener){
447     myStatusListeners.remove(listener);
448   }
449   /**
450     * This methos is called whenever we need to inform the listener about an
451     * event.
452   */
453   protected void fireStatusChangedEvent(String text){
454     Iterator listenersIter = myStatusListeners.iterator();
455     while(listenersIter.hasNext())
456       ((StatusListener)listenersIter.next()).statusChanged(text);
457   }
458 
459   /** This method is a workaround of the java 4 non namespace supporting parser
460     * It receives a qualified name and returns its local name.
461     * For eg. if it receives gate:gateId it will return gateId
462     */
463   private String getMyLocalName(String aQName){
464     if (aQName == null) return "";
465     StringTokenizer strToken = new StringTokenizer(aQName,":");
466     if (strToken.countTokens()<= 1) return aQName;
467     // The nr of tokens is >= than 2
468     // Skip the first token which is the QName
469     strToken.nextToken();
470     return strToken.nextToken();
471   }//getMyLocalName()
472 
473   /** Also a workaround for URI identifier. If the QName is gate it will return
474     *  GATE's. Otherwhise it will return the empty string
475     */
476   private String getMyURI(String aQName){
477     if (aQName == null) return "";
478     StringTokenizer strToken = new StringTokenizer(aQName,":");
479     if (strToken.countTokens()<= 1) return "";
480     // If first token is "gate" then return GATE's URI
481     if ("gate".equalsIgnoreCase(strToken.nextToken()))
482       return Gate.URI;
483     return "";
484   }// getMyURI()
485 
486   // XmlDocumentHandler member data
487 
488   // this constant indicates when to fire the status listener
489   // this listener will add an overhead and we don't want a big overhead
490   // this listener will be callled from ELEMENTS_RATE to ELEMENTS_RATE
491   final static  int ELEMENTS_RATE = 128;
492 
493   // this map contains the elements name that we want to create
494   // if it's null all the elements from the XML documents will be transformed
495   // into Gate annotation objects otherwise only the elements it contains will
496   // be transformed
497   private Map markupElementsMap = null;
498 
499   // this map contains the string that we want to insert iside the document
500   // content, when a certain element is found
501   // if the map is null then no string is added
502   private Map element2StringMap = null;
503 
504   /**This object inducates what to do when the parser encounts an error*/
505   private SimpleErrorHandler _seh = new SimpleErrorHandler();
506 
507   /**The content of the XML document, without any tag for internal use*/
508   private StringBuffer tmpDocContent = null;
509 
510   /**A stack used to remember elements and to keep the order */
511   private java.util.Stack stack = null;
512 
513   /**A gate document */
514   private gate.Document doc = null;
515 
516   /**An annotation set used for creating annotation reffering the doc */
517   private gate.AnnotationSet basicAS = null;
518 
519   /**Listeners for status report */
520   protected List myStatusListeners = new LinkedList();
521 
522   /**This reports the the number of elements that have beed processed so far*/
523   private int elements = 0;
524 
525   /** We need a colection to retain all the CustomObjects that will be
526     * transformed into annotation over the gate document...
527     * the transformation will take place inside onDocumentEnd() method
528     */
529   private LinkedList colector = null;
530 
531   /** This is used to generate unique Ids for the CustomObjects read*/
532   protected  int customObjectsId = 0;
533 
534   /** Accesor method for the customObjectsId field*/
535   public int getCustomObjectsId(){ return customObjectsId;}
536 
537   //////// INNER CLASS
538   /**
539     * The objects belonging to this class are used inside the stack.
540     * This class is for internal needs
541     */
542   class  CustomObject implements Comparable {
543 
544     // constructor
545     public CustomObject(Integer anId,String anElemName, FeatureMap aFm,
546                            Long aStart, Long anEnd) {
547       elemName = anElemName;
548       fm = aFm;
549       start = aStart;
550       end = anEnd;
551       if (anId == null){
552         id = new Integer(customObjectsId ++);
553       }else{
554         id = anId;
555         if (customObjectsId <= anId.intValue())
556           customObjectsId = anId.intValue() + 1 ;
557       }// End if
558     }// End CustomObject()
559 
560     // Methos implemented as required by Comparable interface
561     public int compareTo(Object o){
562       CustomObject obj = (CustomObject) o;
563       return this.id.compareTo(obj.getId());
564     }// compareTo();
565 
566     // accesor
567     public String getElemName() {
568       return elemName;
569     }// getElemName()
570 
571     public FeatureMap getFM() {
572       return fm;
573     }// getFM()
574 
575     public Long getStart() {
576       return start;
577     }// getStart()
578 
579     public Long getEnd() {
580       return end;
581     }// getEnd()
582 
583     public Integer getId(){ return id;}
584 
585     // mutator
586     public void setElemName(String anElemName) {
587       elemName = anElemName;
588     }// getElemName()
589 
590     public void setFM(FeatureMap aFm) {
591       fm = aFm;
592     }// setFM();
593 
594     public void setStart(Long aStart) {
595       start = aStart;
596     }// setStart();
597 
598     public void setEnd(Long anEnd) {
599       end = anEnd;
600     }// setEnd();
601 
602     // data fields
603     private String elemName = null;
604     private FeatureMap fm = null;
605     private Long start = null;
606     private Long end  = null;
607     private Integer id = null;
608 
609   } // End inner class CustomObject
610 
611 } //XmlDocumentHandler
612 
613 
614 
615