|
XmlDocumentFormat |
|
1 /* 2 * XmlDocumentFormat.java 3 * 4 * Copyright (c) 1998-2001, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Cristian URSU, 26/May/2000 12 * 13 * $Id: XmlDocumentFormat.java,v 1.41 2002/06/10 12:51:31 nasso Exp $ 14 */ 15 16 package gate.corpora; 17 18 //import com.sun.xml.parser.* ; 19 import java.util.*; 20 import java.io.*; 21 import java.net.*; 22 23 import gate.util.*; 24 import gate.*; 25 import gate.xml.*; 26 import gate.event.*; 27 import gate.creole.*; 28 29 // xml tools 30 import javax.xml.parsers.*; 31 import org.xml.sax.*; 32 import org.xml.sax.helpers.*; 33 import org.w3c.www.mime.*; 34 35 /** The format of Documents. Subclasses of DocumentFormat know about 36 * particular MIME types and how to unpack the information in any 37 * markup or formatting they contain into GATE annotations. Each MIME 38 * type has its own subclass of DocumentFormat, e.g. XmlDocumentFormat, 39 * RtfDocumentFormat, MpegDocumentFormat. These classes register themselves 40 * with a static index residing here when they are constructed. Static 41 * getDocumentFormat methods can then be used to get the appropriate 42 * format class for a particular document. 43 */ 44 public class XmlDocumentFormat extends TextualDocumentFormat 45 { 46 /** Debug flag */ 47 private static final boolean DEBUG = false; 48 49 /** Default construction */ 50 public XmlDocumentFormat() { super(); } 51 52 /** We could collect repositioning information during XML parsing */ 53 public Boolean supportsRepositioning() { 54 return new Boolean(true); 55 } // supportsRepositioning 56 57 /** Old style of unpackMarkup (without collecting of RepositioningInfo) */ 58 public void unpackMarkup(Document doc) throws DocumentFormatException { 59 unpackMarkup(doc, (RepositioningInfo) null, (RepositioningInfo) null); 60 } // unpackMarkup 61 62 63 /** Unpack the markup in the document. This converts markup from the 64 * native format (e.g. XML) into annotations in GATE format. 65 * Uses the markupElementsMap to determine which elements to convert, and 66 * what annotation type names to use. If the document was created from a 67 * String, then is recomandable to set the doc's sourceUrl to <b>null</b>. 68 * So, if the document has a valid URL, then the parser will try to 69 * parse the XML document pointed by the URL.If the URL is not valid, or 70 * is null, then the doc's content will be parsed. If the doc's content is 71 * not a valid XML then the parser might crash. 72 * 73 * @param Document doc The gate document you want to parse. If 74 * <code>doc.getSourceUrl()</code> returns <b>null</b> then the content of 75 * doc will be parsed. Using a URL is recomended because the parser will 76 * report errors corectlly if the XML document is not well formed. 77 */ 78 public void unpackMarkup(Document doc, RepositioningInfo repInfo, 79 RepositioningInfo ampCodingInfo) throws DocumentFormatException { 80 if( (doc == null) || 81 (doc.getSourceUrl() == null && doc.getContent() == null)){ 82 83 throw new DocumentFormatException( 84 "GATE document is null or no content found. Nothing to parse!"); 85 }// End if 86 87 boolean docHasContentButNoValidURL = false; 88 // This is a test to see if the GATE document has a valid URL or a valid 89 // content. If doesn't has a valid URL then try to parse its content as XML 90 try{ 91 if (doc.getSourceUrl() == null && doc.getContent() != null){ 92 // The doc's url is null but there is a content. 93 docHasContentButNoValidURL = true; 94 }else {URLConnection conn = doc.getSourceUrl().openConnection();} 95 }catch (IOException ex1){ 96 // The URL is not null but is not valid. 97 if(doc.getContent() == null) 98 // The document content is also null. There is nothing we can do. 99 throw new DocumentFormatException("The document doesn't have a" + 100 " valid URL and also no content"); 101 docHasContentButNoValidURL = true; 102 }// End try 103 104 // Create a status listener 105 StatusListener statusListener = new StatusListener(){ 106 public void statusChanged(String text){ 107 // This is implemented in DocumentFormat.java and inherited here 108 fireStatusChanged(text); 109 } 110 }; 111 GateFormatXmlDocumentHandler gateXmlHandler = null; 112 XmlDocumentHandler xmlDocHandler = null; 113 if (docHasContentButNoValidURL) 114 parseDocumentWithoutURL(doc, repInfo, ampCodingInfo); 115 else try { 116 // use Excerces XML parser with JAXP 117 // System.setProperty("javax.xml.parsers.SAXParserFactory", 118 // "org.apache.xerces.jaxp.SAXParserFactoryImpl"); 119 // Get a parser factory. 120 SAXParserFactory saxParserFactory = SAXParserFactory.newInstance(); 121 // Set up the factory to create the appropriate type of parser 122 // non validating one 123 saxParserFactory.setValidating(false); 124 // non namesapace aware one 125 saxParserFactory.setNamespaceAware(true); 126 // create it 127 SAXParser xmlParser = saxParserFactory.newSAXParser(); 128 if (isGateXmlDocument){ 129 // Construct the appropiate xml handler for the job. 130 gateXmlHandler = new GateFormatXmlDocumentHandler(doc); 131 // Register a status listener 132 gateXmlHandler.addStatusListener(statusListener); 133 // Parse the Gate Document 134 xmlParser.parse(doc.getSourceUrl().toString(), gateXmlHandler); 135 gateXmlHandler.removeStatusListener(statusListener); 136 }else{ 137 // Create a new Xml document handler 138 xmlDocHandler = new XmlDocumentHandler( doc, 139 this.markupElementsMap, 140 this.element2StringMap); 141 // Register a status listener with it 142 xmlDocHandler.addStatusListener(statusListener); 143 // set repositioning object 144 xmlDocHandler.setRepositioningInfo(repInfo); 145 // set the object with ampersand coding positions 146 xmlDocHandler.setAmpCodingInfo(ampCodingInfo); 147 148 // Parse the document handler 149 /* Angel 150 xmlParser.parse(doc.getSourceUrl().toString(), xmlDocHandler ); 151 Angel */ 152 // try to choose concret parser (Xerces) 153 // Angel - start 154 org.apache.xerces.parsers.SAXParser newxmlParser = 155 new org.apache.xerces.parsers.SAXParser(); 156 // Set up the factory to create the appropriate type of parser 157 // non validating one 158 // http://xml.org/sax/features/validation set to false 159 newxmlParser.setFeature("http://xml.org/sax/features/validation", false); 160 // namesapace aware one 161 // http://xml.org/sax/features/namespaces set to true 162 newxmlParser.setFeature("http://xml.org/sax/features/namespaces", true); 163 newxmlParser.setContentHandler(xmlDocHandler); 164 newxmlParser.setErrorHandler(xmlDocHandler); 165 newxmlParser.setDTDHandler(xmlDocHandler); 166 newxmlParser.setEntityResolver(xmlDocHandler); 167 newxmlParser.setReaderFactory(new StreamingCharFactory()); 168 newxmlParser.parse(doc.getSourceUrl().toString()); 169 // Angel - end 170 ((DocumentImpl) doc).setNextAnnotationId( 171 xmlDocHandler.getCustomObjectsId()); 172 xmlDocHandler.removeStatusListener(statusListener); 173 }// End if 174 } catch (ParserConfigurationException e){ 175 throw 176 new DocumentFormatException("XML parser configuration exception ", e); 177 } catch (SAXException e){ 178 doc.getFeatures().put("parsingError", new Boolean(true)); 179 180 Boolean bThrow = (Boolean) 181 doc.getFeatures().get(GateConstants.THROWEX_FORMAT_PROPERTY_NAME); 182 183 if(bThrow != null && bThrow.booleanValue()) { 184 // the next line is commented to avoid Document creation fail on error 185 throw new DocumentFormatException(e); 186 } 187 else { 188 Out.println("Warning: Document remains unparsed. \n" 189 +"\n Stack Dump: "); 190 e.printStackTrace(Out.getPrintWriter()); 191 } // if 192 193 } catch (IOException e){ 194 throw new DocumentFormatException("I/O exception for " + 195 doc.getSourceUrl().toString()); 196 }finally{ 197 if(gateXmlHandler != null) 198 gateXmlHandler.removeStatusListener(statusListener); 199 if (xmlDocHandler != null) 200 xmlDocHandler.removeStatusListener(statusListener); 201 }// End if else try 202 }// unpackMarkup 203 204 /** Called from unpackMarkup() if the document have been created from a 205 * string 206 */ 207 private void parseDocumentWithoutURL(gate.Document aDocument, 208 RepositioningInfo repInfo, 209 RepositioningInfo ampCodingInfo) 210 throws DocumentFormatException { 211 212 XmlDocumentHandler xmlDocHandler = null; 213 // Create a status listener 214 StatusListener statusList = new StatusListener(){ 215 public void statusChanged(String text){ 216 // this is implemented in DocumentFormat.java and inherited here 217 fireStatusChanged(text); 218 } 219 }; 220 try{ 221 Reader reader = new InputStreamReader( 222 new ByteArrayInputStream(aDocument.getContent().toString().getBytes()), 223 "UTF-8"); 224 InputSource is = new InputSource(reader); 225 226 227 // use Excerces XML parser with JAXP 228 // System.setProperty("javax.xml.parsers.SAXParserFactory", 229 // "org.apache.xerces.jaxp.SAXParserFactoryImpl"); 230 // Get a parser factory. 231 SAXParserFactory saxParserFactory = SAXParserFactory.newInstance(); 232 // Set up the factory to create the appropriate type of parser 233 // non validating one 234 saxParserFactory.setValidating(false); 235 // non namesapace aware one 236 saxParserFactory.setNamespaceAware(true); 237 // create it 238 SAXParser xmlParser = saxParserFactory.newSAXParser(); 239 240 // create a new Xml document handler 241 xmlDocHandler = new XmlDocumentHandler(aDocument, 242 this.markupElementsMap, 243 this.element2StringMap); 244 // Regsiter the statusListener with xmlDocHandler 245 xmlDocHandler.addStatusListener(statusList); 246 // set repositioning object 247 xmlDocHandler.setRepositioningInfo(repInfo); 248 // set the object with ampersand coding positions 249 xmlDocHandler.setAmpCodingInfo(ampCodingInfo); 250 // Parse the document handler 251 /* Angel 252 // xmlParser.parse(is, xmlDocHandler); 253 Angel */ 254 255 // Angel - start 256 // try to choose concret parser 257 org.apache.xerces.parsers.SAXParser newxmlParser = 258 new org.apache.xerces.parsers.SAXParser(); 259 // Set up the factory to create the appropriate type of parser 260 // non validating one 261 // http://xml.org/sax/features/validation set to false 262 newxmlParser.setFeature("http://xml.org/sax/features/validation", false); 263 // namesapace aware one 264 // http://xml.org/sax/features/namespaces set to true 265 newxmlParser.setFeature("http://xml.org/sax/features/namespaces", true); 266 newxmlParser.setContentHandler(xmlDocHandler); 267 newxmlParser.setErrorHandler(xmlDocHandler); 268 newxmlParser.setDTDHandler(xmlDocHandler); 269 newxmlParser.setEntityResolver(xmlDocHandler); 270 newxmlParser.setReaderFactory(new StreamingCharFactory()); 271 newxmlParser.parse(is); 272 // Angel - end 273 274 ((DocumentImpl) aDocument).setNextAnnotationId( 275 xmlDocHandler.getCustomObjectsId()); 276 } catch (ParserConfigurationException e){ 277 throw new DocumentFormatException( 278 "XML parser configuration exception ", e); 279 } catch (SAXException e){ 280 throw new DocumentFormatException(e); 281 } catch (IOException e){ 282 throw new DocumentFormatException(e); 283 }finally{ 284 // Remove the statusListener with xmlDocHandler 285 xmlDocHandler.removeStatusListener(statusList); 286 }// End try 287 }// End parseDocumentWithoutURL() 288 289 /** Initialise this resource, and return it. */ 290 public Resource init() throws ResourceInstantiationException{ 291 // Register XML mime type 292 MimeType mime = new MimeType("text","xml"); 293 // Register the class handler for this mime type 294 mimeString2ClassHandlerMap.put(mime.getType()+ "/" + mime.getSubtype(), 295 this); 296 // Register the mime type with mine string 297 mimeString2mimeTypeMap.put(mime.getType() + "/" + mime.getSubtype(), mime); 298 // Register file sufixes for this mime type 299 suffixes2mimeTypeMap.put("xml",mime); 300 suffixes2mimeTypeMap.put("xhtm",mime); 301 suffixes2mimeTypeMap.put("xhtml",mime); 302 // Register magic numbers for this mime type 303 magic2mimeTypeMap.put("<?xml",mime); 304 // Set the mimeType for this language resource 305 setMimeType(mime); 306 return this; 307 }// init() 308 309 }//class XmlDocumentFormat 310
|
XmlDocumentFormat |
|