|
XmlDocumentFormat |
|
1 /* 2 * XmlDocumentFormat.java 3 * 4 * Copyright (c) 1998-2001, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Cristian URSU, 26/May/2000 12 * 13 * $Id: XmlDocumentFormat.java,v 1.34 2001/11/30 14:38:44 cursu Exp $ 14 */ 15 16 package gate.corpora; 17 18 //import com.sun.xml.parser.* ; 19 import java.util.*; 20 import java.io.*; 21 import java.net.*; 22 23 import gate.util.*; 24 import gate.*; 25 import gate.xml.*; 26 import gate.event.*; 27 import gate.creole.*; 28 29 // xml tools 30 import javax.xml.parsers.*; 31 import org.xml.sax.*; 32 import org.xml.sax.helpers.*; 33 import org.w3c.www.mime.*; 34 35 /** The format of Documents. Subclasses of DocumentFormat know about 36 * particular MIME types and how to unpack the information in any 37 * markup or formatting they contain into GATE annotations. Each MIME 38 * type has its own subclass of DocumentFormat, e.g. XmlDocumentFormat, 39 * RtfDocumentFormat, MpegDocumentFormat. These classes register themselves 40 * with a static index residing here when they are constructed. Static 41 * getDocumentFormat methods can then be used to get the appropriate 42 * format class for a particular document. 43 */ 44 public class XmlDocumentFormat extends TextualDocumentFormat 45 { 46 /** Debug flag */ 47 private static final boolean DEBUG = false; 48 49 /** Default construction */ 50 public XmlDocumentFormat() { super(); } 51 52 /** Unpack the markup in the document. This converts markup from the 53 * native format (e.g. XML) into annotations in GATE format. 54 * Uses the markupElementsMap to determine which elements to convert, and 55 * what annotation type names to use. If the document was created from a 56 * String, then is recomandable to set the doc's sourceUrl to <b>null</b>. 57 * So, if the document has a valid URL, then the parser will try to 58 * parse the XML document pointed by the URL.If the URL is not valid, or 59 * is null, then the doc's content will be parsed. If the doc's content is 60 * not a valid XML then the parser might crash. 61 * 62 * @param Document doc The gate document you want to parse. If 63 * <code>doc.getSourceUrl()</code> returns <b>null</b> then the content of 64 * doc will be parsed. Using a URL is recomended because the parser will 65 * report errors corectlly if the XML document is not well formed. 66 */ 67 public void unpackMarkup(Document doc) throws DocumentFormatException{ 68 if( (doc == null) || 69 (doc.getSourceUrl() == null && doc.getContent() == null)){ 70 71 throw new DocumentFormatException( 72 "GATE document is null or no content found. Nothing to parse!"); 73 }// End if 74 75 boolean docHasContentButNoValidURL = false; 76 // This is a test to see if the GATE document has a valid URL or a valid 77 // content. If doesn't has a valid URL then try to parse its content as XML 78 try{ 79 if (doc.getSourceUrl() == null && doc.getContent() != null){ 80 // The doc's url is null but there is a content. 81 docHasContentButNoValidURL = true; 82 }else {URLConnection conn = doc.getSourceUrl().openConnection();} 83 }catch (IOException ex1){ 84 // The URL is not null but is not valid. 85 if(doc.getContent() == null) 86 // The document content is also null. There is nothing we can do. 87 throw new DocumentFormatException("The document doesn't have a" + 88 " valid URL and also no content"); 89 docHasContentButNoValidURL = true; 90 }// End try 91 92 // Create a status listener 93 StatusListener statusListener = new StatusListener(){ 94 public void statusChanged(String text){ 95 // This is implemented in DocumentFormat.java and inherited here 96 fireStatusChanged(text); 97 } 98 }; 99 GateFormatXmlDocumentHandler gateXmlHandler = null; 100 XmlDocumentHandler xmlDocHandler = null; 101 if (docHasContentButNoValidURL) 102 parseDocumentWithoutURL(doc); 103 else try { 104 // use Excerces XML parser with JAXP 105 // System.setProperty("javax.xml.parsers.SAXParserFactory", 106 // "org.apache.xerces.jaxp.SAXParserFactoryImpl"); 107 // Get a parser factory. 108 SAXParserFactory saxParserFactory = SAXParserFactory.newInstance(); 109 // Set up the factory to create the appropriate type of parser 110 // non validating one 111 saxParserFactory.setValidating(false); 112 // non namesapace aware one 113 saxParserFactory.setNamespaceAware(true); 114 // create it 115 SAXParser xmlParser = saxParserFactory.newSAXParser(); 116 if (isGateXmlDocument){ 117 // Construct the appropiate xml handler for the job. 118 gateXmlHandler = new GateFormatXmlDocumentHandler(doc); 119 // Register a status listener 120 gateXmlHandler.addStatusListener(statusListener); 121 // Parse the Gate Document 122 xmlParser.parse(doc.getSourceUrl().toString(), gateXmlHandler); 123 gateXmlHandler.removeStatusListener(statusListener); 124 }else{ 125 // Create a new Xml document handler 126 xmlDocHandler = new XmlDocumentHandler( doc, 127 this.markupElementsMap, 128 this.element2StringMap); 129 // Register a status listener with it 130 xmlDocHandler.addStatusListener(statusListener); 131 // Parse the document handler 132 xmlParser.parse(doc.getSourceUrl().toString(), xmlDocHandler ); 133 ((DocumentImpl) doc).setNextAnnotationId( 134 xmlDocHandler.getCustomObjectsId()); 135 xmlDocHandler.removeStatusListener(statusListener); 136 }// End if 137 } catch (ParserConfigurationException e){ 138 throw 139 new DocumentFormatException("XML parser configuration exception ", e); 140 } catch (SAXException e){ 141 throw new DocumentFormatException(e); 142 } catch (IOException e){ 143 throw new DocumentFormatException("I/O exception for " + 144 doc.getSourceUrl().toString()); 145 }finally{ 146 if(gateXmlHandler != null) 147 gateXmlHandler.removeStatusListener(statusListener); 148 if (xmlDocHandler != null) 149 xmlDocHandler.removeStatusListener(statusListener); 150 }// End if else try 151 }// unpackMarkup 152 153 /** Called from unpackMarkup() if the document have been created from a 154 * string 155 */ 156 private void parseDocumentWithoutURL(gate.Document aDocument) 157 throws DocumentFormatException { 158 159 XmlDocumentHandler xmlDocHandler = null; 160 // Create a status listener 161 StatusListener statusList = new StatusListener(){ 162 public void statusChanged(String text){ 163 // this is implemented in DocumentFormat.java and inherited here 164 fireStatusChanged(text); 165 } 166 }; 167 try{ 168 Reader reader = new InputStreamReader( 169 new ByteArrayInputStream(aDocument.getContent().toString().getBytes()), 170 "UTF-8"); 171 InputSource is = new InputSource(reader); 172 173 174 // use Excerces XML parser with JAXP 175 // System.setProperty("javax.xml.parsers.SAXParserFactory", 176 // "org.apache.xerces.jaxp.SAXParserFactoryImpl"); 177 // Get a parser factory. 178 SAXParserFactory saxParserFactory = SAXParserFactory.newInstance(); 179 // Set up the factory to create the appropriate type of parser 180 // non validating one 181 saxParserFactory.setValidating(false); 182 // non namesapace aware one 183 saxParserFactory.setNamespaceAware(true); 184 // create it 185 SAXParser xmlParser = saxParserFactory.newSAXParser(); 186 // create a new Xml document handler 187 xmlDocHandler = new XmlDocumentHandler(aDocument, 188 this.markupElementsMap, 189 this.element2StringMap); 190 // Regsiter the statusListener with xmlDocHandler 191 xmlDocHandler.addStatusListener(statusList); 192 // Parse the document handler 193 xmlParser.parse(is, xmlDocHandler); 194 ((DocumentImpl) aDocument).setNextAnnotationId( 195 xmlDocHandler.getCustomObjectsId()); 196 } catch (ParserConfigurationException e){ 197 throw new DocumentFormatException( 198 "XML parser configuration exception ", e); 199 } catch (SAXException e){ 200 throw new DocumentFormatException(e); 201 } catch (IOException e){ 202 throw new DocumentFormatException(e); 203 }finally{ 204 // Remove the statusListener with xmlDocHandler 205 xmlDocHandler.removeStatusListener(statusList); 206 }// End try 207 }// End parseDocumentWithoutURL() 208 209 /** Initialise this resource, and return it. */ 210 public Resource init() throws ResourceInstantiationException{ 211 // Register XML mime type 212 MimeType mime = new MimeType("text","xml"); 213 // Register the class handler for this mime type 214 mimeString2ClassHandlerMap.put(mime.getType()+ "/" + mime.getSubtype(), 215 this); 216 // Register the mime type with mine string 217 mimeString2mimeTypeMap.put(mime.getType() + "/" + mime.getSubtype(), mime); 218 // Register file sufixes for this mime type 219 suffixes2mimeTypeMap.put("xml",mime); 220 suffixes2mimeTypeMap.put("xhtm",mime); 221 suffixes2mimeTypeMap.put("xhtml",mime); 222 // Register magic numbers for this mime type 223 magic2mimeTypeMap.put("<?xml",mime); 224 // Set the mimeType for this language resource 225 setMimeType(mime); 226 return this; 227 }// init() 228 229 }//class XmlDocumentFormat 230
|
XmlDocumentFormat |
|