1   /*
2    *  XmlDocumentFormat.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Cristian URSU, 26/May/2000
12   *
13   *  $Id: XmlDocumentFormat.java,v 1.34 2001/11/30 14:38:44 cursu Exp $
14   */
15  
16  package gate.corpora;
17  
18  //import com.sun.xml.parser.* ;
19  import java.util.*;
20  import java.io.*;
21  import java.net.*;
22  
23  import gate.util.*;
24  import gate.*;
25  import gate.xml.*;
26  import gate.event.*;
27  import gate.creole.*;
28  
29  // xml tools
30  import javax.xml.parsers.*;
31  import org.xml.sax.*;
32  import org.xml.sax.helpers.*;
33  import org.w3c.www.mime.*;
34  
35  /** The format of Documents. Subclasses of DocumentFormat know about
36    * particular MIME types and how to unpack the information in any
37    * markup or formatting they contain into GATE annotations. Each MIME
38    * type has its own subclass of DocumentFormat, e.g. XmlDocumentFormat,
39    * RtfDocumentFormat, MpegDocumentFormat. These classes register themselves
40    * with a static index residing here when they are constructed. Static
41    * getDocumentFormat methods can then be used to get the appropriate
42    * format class for a particular document.
43    */
44  public class XmlDocumentFormat extends TextualDocumentFormat
45  {
46    /** Debug flag */
47    private static final boolean DEBUG = false;
48  
49    /** Default construction */
50    public XmlDocumentFormat() { super(); }
51  
52    /** Unpack the markup in the document. This converts markup from the
53      * native format (e.g. XML) into annotations in GATE format.
54      * Uses the markupElementsMap to determine which elements to convert, and
55      * what annotation type names to use. If the document was created from a
56      * String, then is recomandable to set the doc's sourceUrl to <b>null</b>.
57      * So, if the document has a valid URL, then the parser will try to
58      * parse the XML document pointed by the URL.If the URL is not valid, or
59      * is null, then the doc's content will be parsed. If the doc's content is
60      * not a valid XML then the parser might crash.
61      *
62      * @param Document doc The gate document you want to parse. If
63      * <code>doc.getSourceUrl()</code> returns <b>null</b> then the content of
64      * doc will be parsed. Using a URL is recomended because the parser will
65      * report errors corectlly if the XML document is not well formed.
66      */
67    public void unpackMarkup(Document doc) throws DocumentFormatException{
68      if( (doc == null) ||
69          (doc.getSourceUrl() == null && doc.getContent() == null)){
70  
71        throw new DocumentFormatException(
72                 "GATE document is null or no content found. Nothing to parse!");
73      }// End if
74  
75      boolean docHasContentButNoValidURL = false;
76      // This is a test to see if the GATE document has a valid URL or a valid
77      // content. If doesn't has a valid URL then try to parse its content as XML
78      try{
79        if (doc.getSourceUrl() == null && doc.getContent() != null){
80          // The doc's url is null but there is a content.
81          docHasContentButNoValidURL = true;
82        }else {URLConnection conn = doc.getSourceUrl().openConnection();}
83      }catch (IOException ex1){
84        // The URL is not null but is not valid.
85        if(doc.getContent() == null)
86          // The document content is also null. There is nothing we can do.
87          throw new DocumentFormatException("The document doesn't have a" +
88          " valid URL and also no content");
89        docHasContentButNoValidURL = true;
90      }// End try
91  
92      // Create a status listener
93      StatusListener statusListener = new StatusListener(){
94            public void statusChanged(String text){
95              // This is implemented in DocumentFormat.java and inherited here
96              fireStatusChanged(text);
97            }
98      };
99      GateFormatXmlDocumentHandler gateXmlHandler = null;
100     XmlDocumentHandler xmlDocHandler = null;
101     if (docHasContentButNoValidURL)
102       parseDocumentWithoutURL(doc);
103     else try {
104       // use Excerces XML parser with JAXP
105       // System.setProperty("javax.xml.parsers.SAXParserFactory",
106       //                         "org.apache.xerces.jaxp.SAXParserFactoryImpl");
107       // Get a parser factory.
108       SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
109       // Set up the factory to create the appropriate type of parser
110       // non validating one
111       saxParserFactory.setValidating(false);
112       // non namesapace aware one
113       saxParserFactory.setNamespaceAware(true);
114       // create it
115       SAXParser xmlParser = saxParserFactory.newSAXParser();
116       if (isGateXmlDocument){
117         // Construct the appropiate xml handler for the job.
118         gateXmlHandler = new GateFormatXmlDocumentHandler(doc);
119         // Register a status listener
120         gateXmlHandler.addStatusListener(statusListener);
121         // Parse the Gate Document
122         xmlParser.parse(doc.getSourceUrl().toString(), gateXmlHandler);
123         gateXmlHandler.removeStatusListener(statusListener);
124       }else{
125         // Create a new Xml document handler
126         xmlDocHandler =  new XmlDocumentHandler( doc,
127                                                  this.markupElementsMap,
128                                                  this.element2StringMap);
129         // Register a status listener with it
130         xmlDocHandler.addStatusListener(statusListener);
131         // Parse the document handler
132         xmlParser.parse(doc.getSourceUrl().toString(), xmlDocHandler );
133         ((DocumentImpl) doc).setNextAnnotationId(
134                                           xmlDocHandler.getCustomObjectsId());
135         xmlDocHandler.removeStatusListener(statusListener);
136       }// End if
137     } catch (ParserConfigurationException e){
138         throw
139         new DocumentFormatException("XML parser configuration exception ", e);
140     } catch (SAXException e){
141         throw new DocumentFormatException(e);
142     } catch (IOException e){
143         throw new DocumentFormatException("I/O exception for " +
144                                       doc.getSourceUrl().toString());
145     }finally{
146       if(gateXmlHandler != null)
147         gateXmlHandler.removeStatusListener(statusListener);
148       if (xmlDocHandler != null)
149         xmlDocHandler.removeStatusListener(statusListener);
150     }// End if else try
151   }// unpackMarkup
152 
153   /** Called from unpackMarkup() if the document have been created from a
154    *  string
155    */
156   private void parseDocumentWithoutURL(gate.Document aDocument)
157                                               throws DocumentFormatException {
158 
159     XmlDocumentHandler xmlDocHandler = null;
160     // Create a status listener
161     StatusListener statusList = new StatusListener(){
162         public void statusChanged(String text){
163           // this is implemented in DocumentFormat.java and inherited here
164           fireStatusChanged(text);
165         }
166     };
167     try{
168       Reader reader = new InputStreamReader(
169         new ByteArrayInputStream(aDocument.getContent().toString().getBytes()),
170         "UTF-8");
171       InputSource is = new InputSource(reader);
172 
173 
174       // use Excerces XML parser with JAXP
175       // System.setProperty("javax.xml.parsers.SAXParserFactory",
176       //                         "org.apache.xerces.jaxp.SAXParserFactoryImpl");
177       // Get a parser factory.
178       SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
179       // Set up the factory to create the appropriate type of parser
180       // non validating one
181       saxParserFactory.setValidating(false);
182       // non namesapace aware one
183       saxParserFactory.setNamespaceAware(true);
184       // create it
185       SAXParser xmlParser = saxParserFactory.newSAXParser();
186       // create a new Xml document handler
187       xmlDocHandler =  new XmlDocumentHandler(aDocument,
188                                               this.markupElementsMap,
189                                               this.element2StringMap);
190       // Regsiter the statusListener with xmlDocHandler
191       xmlDocHandler.addStatusListener(statusList);
192       // Parse the document handler
193       xmlParser.parse(is, xmlDocHandler);
194       ((DocumentImpl) aDocument).setNextAnnotationId(
195                                           xmlDocHandler.getCustomObjectsId());
196     } catch (ParserConfigurationException e){
197         throw new DocumentFormatException(
198                         "XML parser configuration exception ", e);
199     } catch (SAXException e){
200         throw new DocumentFormatException(e);
201     } catch (IOException e){
202         throw new DocumentFormatException(e);
203     }finally{
204       // Remove the statusListener with xmlDocHandler
205       xmlDocHandler.removeStatusListener(statusList);
206     }// End try
207   }// End parseDocumentWithoutURL()
208 
209   /** Initialise this resource, and return it. */
210   public Resource init() throws ResourceInstantiationException{
211     // Register XML mime type
212     MimeType mime = new MimeType("text","xml");
213     // Register the class handler for this mime type
214     mimeString2ClassHandlerMap.put(mime.getType()+ "/" + mime.getSubtype(),
215                                                                           this);
216     // Register the mime type with mine string
217     mimeString2mimeTypeMap.put(mime.getType() + "/" + mime.getSubtype(), mime);
218     // Register file sufixes for this mime type
219     suffixes2mimeTypeMap.put("xml",mime);
220     suffixes2mimeTypeMap.put("xhtm",mime);
221     suffixes2mimeTypeMap.put("xhtml",mime);
222     // Register magic numbers for this mime type
223     magic2mimeTypeMap.put("<?xml",mime);
224     // Set the mimeType for this language resource
225     setMimeType(mime);
226     return this;
227   }// init()
228 
229 }//class XmlDocumentFormat
230