|
XmlDocumentFormat |
|
/*
 *  XmlDocumentFormat.java
 *
 *  Copyright (c) 1998-2001, The University of Sheffield.
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  Cristian URSU, 26/May/2000
 *
 *  $Id: XmlDocumentFormat.java,v 1.39 2002/01/28 14:25:09 nasso Exp $
 */

package gate.corpora;

//import com.sun.xml.parser.* ;
import java.util.*;
import java.io.*;
import java.net.*;

import gate.util.*;
import gate.*;
import gate.xml.*;
import gate.event.*;
import gate.creole.*;

// xml tools
import javax.xml.parsers.*;
import org.xml.sax.*;
import org.xml.sax.helpers.*;
import org.w3c.www.mime.*;

/** The document format for XML (MIME type <code>text/xml</code>).
  * It unpacks the markup of an XML document into GATE annotations, either
  * by parsing the resource the document's source URL points to or, when no
  * usable URL is available, by parsing the document's content.
  * <code>init()</code> registers this format for the <code>text/xml</code>
  * MIME type, the <code>xml</code>, <code>xhtm</code> and <code>xhtml</code>
  * file suffixes and the <code>&lt;?xml</code> magic number.
  */
public class XmlDocumentFormat extends TextualDocumentFormat
{
  /** Debug flag */
  private static final boolean DEBUG = false;

  /** Default construction */
  public XmlDocumentFormat() { super(); }

  /** Repositioning information can be collected during XML parsing.
    *
    * @return always <code>Boolean.TRUE</code>
    */
  public Boolean supportsRepositioning() {
    // Boolean.TRUE avoids allocating a new Boolean on every call.
    return Boolean.TRUE;
  } // supportsRepositioning

  /** Old style of unpackMarkup (without collecting of RepositioningInfo).
    *
    * @param doc the GATE document to parse
    * @throws DocumentFormatException see the three argument variant
    */
  public void unpackMarkup(Document doc) throws DocumentFormatException {
    unpackMarkup(doc, (RepositioningInfo) null, (RepositioningInfo) null);
  } // unpackMarkup


  /** Unpack the markup in the document. This converts markup from the
    * native format (e.g. XML) into annotations in GATE format.
    * Uses the markupElementsMap to determine which elements to convert, and
    * what annotation type names to use. If the document was created from a
    * String, then it is advisable to set the doc's sourceUrl to <b>null</b>.
    * If the document has a source URL, the parser will try to parse the XML
    * document the URL points to. If the URL is null, or opening a connection
    * for it fails, the doc's content is parsed instead.
    * <p>
    * When parsing from the URL, a SAX parsing error does not fail the call:
    * a warning and the stack trace are written to <code>Out</code> and the
    * document is left unparsed. When parsing from the content, SAX errors
    * are reported as a <code>DocumentFormatException</code>.
    *
    * @param doc the GATE document you want to parse. If
    * <code>doc.getSourceUrl()</code> returns <b>null</b> then the content of
    * doc will be parsed.
    * @param repInfo repositioning information object handed to the XML
    * handler; may be <b>null</b> (the one argument variant passes null)
    * @param ampCodingInfo object with ampersand coding positions handed to
    * the XML handler; may be <b>null</b>
    * @throws DocumentFormatException if doc is null, if it has neither a
    * source URL nor content, if the parser cannot be configured, or if an
    * I/O error occurs while parsing
    */
  public void unpackMarkup(Document doc, RepositioningInfo repInfo,
              RepositioningInfo ampCodingInfo) throws DocumentFormatException {
    if( (doc == null) ||
        (doc.getSourceUrl() == null && doc.getContent() == null)){

      throw new DocumentFormatException(
               "GATE document is null or no content found. Nothing to parse!");
    }// End if

    boolean docHasContentButNoValidURL = false;
    // Decide whether to parse from the URL or from the document's content.
    try{
      if (doc.getSourceUrl() == null && doc.getContent() != null){
        // The doc's url is null but there is a content.
        docHasContentButNoValidURL = true;
      }else {
        // NOTE: URL.openConnection() only creates the connection object, it
        // does not contact the resource itself, so this only detects URLs
        // for which creating the connection fails straight away.
        doc.getSourceUrl().openConnection();
      }
    }catch (IOException ex1){
      // The URL is not null but is not valid.
      if(doc.getContent() == null) {
        // The document content is also null. There is nothing we can do.
        throw new DocumentFormatException("The document doesn't have a" +
        " valid URL and also no content");
      }
      docHasContentButNoValidURL = true;
    }// End try

    if (docHasContentButNoValidURL) {
      parseDocumentWithoutURL(doc, repInfo, ampCodingInfo);
      return;
    }

    // Create a status listener
    StatusListener statusListener = new StatusListener(){
      public void statusChanged(String text){
        // This is implemented in DocumentFormat.java and inherited here
        fireStatusChanged(text);
      }
    };
    GateFormatXmlDocumentHandler gateXmlHandler = null;
    XmlDocumentHandler xmlDocHandler = null;
    try {
      if (isGateXmlDocument){
        // The JAXP parser is only needed for documents in GATE's own XML
        // format, so it is created in this branch only.
        SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
        // non validating one
        saxParserFactory.setValidating(false);
        // namespace aware one
        saxParserFactory.setNamespaceAware(true);
        // create it
        SAXParser xmlParser = saxParserFactory.newSAXParser();

        // Construct the appropriate xml handler for the job.
        gateXmlHandler = new GateFormatXmlDocumentHandler(doc);
        // Register a status listener (removed in the finally clause)
        gateXmlHandler.addStatusListener(statusListener);
        // Parse the Gate Document
        xmlParser.parse(doc.getSourceUrl().toString(), gateXmlHandler);
      }else{
        // Create a new Xml document handler
        xmlDocHandler =  new XmlDocumentHandler( doc,
                                                 this.markupElementsMap,
                                                 this.element2StringMap);
        // Register a status listener with it (removed in the finally clause)
        xmlDocHandler.addStatusListener(statusListener);
        // set repositioning object
        xmlDocHandler.setRepositioningInfo(repInfo);
        // set the object with ampersand coding positions
        xmlDocHandler.setAmpCodingInfo(ampCodingInfo);

        // Use a concrete parser (Xerces) rather than the JAXP one
        org.apache.xerces.parsers.SAXParser newxmlParser =
          new org.apache.xerces.parsers.SAXParser();
        // non validating one
        newxmlParser.setFeature("http://xml.org/sax/features/validation", false);
        // namespace aware one
        newxmlParser.setFeature("http://xml.org/sax/features/namespaces", true);
        newxmlParser.setContentHandler(xmlDocHandler);
        newxmlParser.setErrorHandler(xmlDocHandler);
        newxmlParser.setDTDHandler(xmlDocHandler);
        newxmlParser.setEntityResolver(xmlDocHandler);
        newxmlParser.setReaderFactory(new StreamingCharFactory());
        newxmlParser.parse(doc.getSourceUrl().toString());

        ((DocumentImpl) doc).setNextAnnotationId(
                                          xmlDocHandler.getCustomObjectsId());
      }// End if
    } catch (ParserConfigurationException e){
        throw
        new DocumentFormatException("XML parser configuration exception ", e);
    } catch (SAXException e){
      // Deliberately not rethrown, to avoid Document creation failing on
      // a parse error.
      Out.println("Warning: Document remains unparsed. \n"
          +"\n  Stack Dump: ");
      e.printStackTrace(Out.getPrintWriter());
    } catch (IOException e){
      // Keep the original exception as the cause so that the reason for
      // the failure is not lost.
      throw new DocumentFormatException("I/O exception for " +
                                      doc.getSourceUrl().toString(), e);
    }finally{
      if(gateXmlHandler != null) {
        gateXmlHandler.removeStatusListener(statusListener);
      }
      if (xmlDocHandler != null) {
        xmlDocHandler.removeStatusListener(statusListener);
      }
    }// End try
  }// unpackMarkup

  /** Called from unpackMarkup() if the document has to be parsed from its
    * content (it was created from a string, or its URL is not usable).
    *
    * @param aDocument the GATE document whose content is parsed; it is cast
    * to <code>DocumentImpl</code> after a successful parse
    * @param repInfo repositioning information object handed to the XML
    * handler; may be <b>null</b>
    * @param ampCodingInfo object with ampersand coding positions handed to
    * the XML handler; may be <b>null</b>
    * @throws DocumentFormatException if a SAX or I/O error occurs while
    * parsing the content
    */
  private void parseDocumentWithoutURL(gate.Document aDocument,
                                       RepositioningInfo repInfo,
                                       RepositioningInfo ampCodingInfo)
                                       throws DocumentFormatException {

    XmlDocumentHandler xmlDocHandler = null;
    // Create a status listener
    StatusListener statusList = new StatusListener(){
        public void statusChanged(String text){
          // this is implemented in DocumentFormat.java and inherited here
          fireStatusChanged(text);
        }
    };
    try{
      // Hand the characters to the parser directly. Encoding the text with
      // the platform default charset and decoding it again as UTF-8 would
      // corrupt non ASCII characters wherever the default is not UTF-8.
      Reader reader = new StringReader(aDocument.getContent().toString());
      InputSource is = new InputSource(reader);

      // create a new Xml document handler
      xmlDocHandler = new XmlDocumentHandler(aDocument,
                                             this.markupElementsMap,
                                             this.element2StringMap);
      // Register the statusListener with xmlDocHandler
      xmlDocHandler.addStatusListener(statusList);
      // set repositioning object
      xmlDocHandler.setRepositioningInfo(repInfo);
      // set the object with ampersand coding positions
      xmlDocHandler.setAmpCodingInfo(ampCodingInfo);

      // Use a concrete parser (Xerces)
      org.apache.xerces.parsers.SAXParser newxmlParser =
        new org.apache.xerces.parsers.SAXParser();
      // non validating one
      newxmlParser.setFeature("http://xml.org/sax/features/validation", false);
      // namespace aware one
      newxmlParser.setFeature("http://xml.org/sax/features/namespaces", true);
      newxmlParser.setContentHandler(xmlDocHandler);
      newxmlParser.setErrorHandler(xmlDocHandler);
      newxmlParser.setDTDHandler(xmlDocHandler);
      newxmlParser.setEntityResolver(xmlDocHandler);
      newxmlParser.setReaderFactory(new StreamingCharFactory());
      newxmlParser.parse(is);

      ((DocumentImpl) aDocument).setNextAnnotationId(
                                          xmlDocHandler.getCustomObjectsId());
    } catch (SAXException e){
        throw new DocumentFormatException(e);
    } catch (IOException e){
        throw new DocumentFormatException(e);
    }finally{
      // The handler is still null if a failure happened before it was
      // created; guard so that the finally clause cannot mask that failure
      // with a NullPointerException.
      if (xmlDocHandler != null) {
        xmlDocHandler.removeStatusListener(statusList);
      }
    }// End try
  }// End parseDocumentWithoutURL()

  /** Initialise this resource, and return it.
    * Registers this format for the <code>text/xml</code> MIME type together
    * with its file suffixes and magic number.
    *
    * @return this format object
    * @throws ResourceInstantiationException declared by the signature; the
    * code of this method does not throw it explicitly
    */
  public Resource init() throws ResourceInstantiationException{
    // Register XML mime type
    MimeType mime = new MimeType("text","xml");
    String mimeString = mime.getType() + "/" + mime.getSubtype();
    // Register the class handler for this mime type
    mimeString2ClassHandlerMap.put(mimeString, this);
    // Register the mime type with the mime string
    mimeString2mimeTypeMap.put(mimeString, mime);
    // Register file suffixes for this mime type
    suffixes2mimeTypeMap.put("xml",mime);
    suffixes2mimeTypeMap.put("xhtm",mime);
    suffixes2mimeTypeMap.put("xhtml",mime);
    // Register magic numbers for this mime type
    magic2mimeTypeMap.put("<?xml",mime);
    // Set the mimeType for this language resource
    setMimeType(mime);
    return this;
  }// init()

}//class XmlDocumentFormat
|
XmlDocumentFormat |
|