|
HtmlDocumentFormat |
|
1 /* 2 * HtmlDocumentFormat.java 3 * 4 * Copyright (c) 1998-2001, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Cristian URSU, 26/May/2000 12 * 13 * $Id: HtmlDocumentFormat.java,v 1.29 2002/02/05 12:50:31 nasso Exp $ 14 */ 15 16 package gate.corpora; 17 18 import java.util.*; 19 import java.io.*; 20 import java.net.*; 21 22 // html tools 23 import javax.swing.text.html.*; 24 import javax.swing.text.html.parser.*; 25 import javax.swing.text.html.HTMLEditorKit.*; 26 //import javax.swing.text.*; 27 28 import gate.util.*; 29 import gate.*; 30 import gate.html.*; 31 import gate.event.*; 32 import gate.creole.*; 33 34 import org.w3c.www.mime.*; 35 36 /** The format of Documents. Subclasses of DocumentFormat know about 37 * particular MIME types and how to unpack the information in any 38 * markup or formatting they contain into GATE annotations. Each MIME 39 * type has its own subclass of DocumentFormat, e.g. XmlDocumentFormat, 40 * RtfDocumentFormat, MpegDocumentFormat. These classes register themselves 41 * with a static index residing here when they are constructed. Static 42 * getDocumentFormat methods can then be used to get the appropriate 43 * format class for a particular document. 44 */ 45 public class HtmlDocumentFormat extends TextualDocumentFormat 46 { 47 48 /** Debug flag */ 49 private static final boolean DEBUG = false; 50 51 /** Default construction */ 52 public HtmlDocumentFormat() { super(); } 53 54 /** We could collect repositioning information during XML parsing */ 55 public Boolean supportsRepositioning() { 56 return new Boolean(true); 57 } // supportsRepositioning 58 59 /** Old style of unpackMarkup (without collecting of RepositioningInfo) */ 60 public void unpackMarkup(Document doc) throws DocumentFormatException { 61 unpackMarkup(doc, (RepositioningInfo) null, (RepositioningInfo) null); 62 } // unpackMarkup 63 64 /** Unpack the markup in the document. This converts markup from the 65 * native format (e.g. HTML) into annotations in GATE format. 66 * Uses the markupElementsMap to determine which elements to convert, and 67 * what annotation type names to use. 68 * It always tryes to parse te doc's content. It doesn't matter if the 69 * sourceUrl is null or not. 70 * 71 * @param Document doc The gate document you want to parse. 72 * 73 */ 74 public void unpackMarkup(Document doc, RepositioningInfo repInfo, 75 RepositioningInfo ampCodingInfo) throws DocumentFormatException{ 76 Reader reader = null; 77 URLConnection conn = null; 78 PrintWriter out = null; 79 HTMLEditorKit.Parser parser = new ParserDelegator(); 80 81 if ( doc == null || doc.getContent() == null ){ 82 throw new DocumentFormatException( 83 "GATE document is null or no content found. Nothing to parse!"); 84 }// End if 85 86 reader = new InputStreamReader( 87 new ByteArrayInputStream(doc.getContent().toString().getBytes())); 88 89 // create a new Htmldocument handler 90 HtmlDocumentHandler htmlDocHandler = new 91 HtmlDocumentHandler(doc, this.markupElementsMap); 92 // Create a Status Listener 93 StatusListener statusListener = new StatusListener(){ 94 public void statusChanged(String text){ 95 fireStatusChanged(text); 96 } 97 }; 98 // Register the listener with htmlDocHandler 99 htmlDocHandler.addStatusListener(statusListener); 100 // set repositioning object 101 htmlDocHandler.setRepositioningInfo(repInfo); 102 // set the object with ampersand coding positions 103 htmlDocHandler.setAmpCodingInfo(ampCodingInfo); 104 105 try{ 106 // parse the HTML document 107 parser.parse(reader, htmlDocHandler, true); 108 } catch (IOException e){ 109 throw new DocumentFormatException(e); 110 }finally{ 111 if (htmlDocHandler != null) 112 htmlDocHandler.removeStatusListener(statusListener); 113 }// End try 114 }//unpackMarkup(doc) 115 116 /** Initialise this resource, and return it. */ 117 public Resource init() throws ResourceInstantiationException{ 118 // Register HTML mime type 119 MimeType mime = new MimeType("text","html"); 120 // Register the class handler for this mime type 121 mimeString2ClassHandlerMap.put(mime.getType()+ "/" + mime.getSubtype(), 122 this); 123 // Register the mime type with mine string 124 mimeString2mimeTypeMap.put(mime.getType() + "/" + mime.getSubtype(), mime); 125 // Register file sufixes for this mime type 126 suffixes2mimeTypeMap.put("html",mime); 127 suffixes2mimeTypeMap.put("htm",mime); 128 // Register magic numbers for this mime type 129 magic2mimeTypeMap.put("<html",mime); 130 // Set the mimeType for this language resource 131 setMimeType(mime); 132 return this; 133 }// init() 134 }// class HtmlDocumentFormat 135
|
HtmlDocumentFormat |
|