|
HtmlDocumentFormat |
|
1 /* 2 * HtmlDocumentFormat.java 3 * 4 * Copyright (c) 1998-2001, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Cristian URSU, 26/May/2000 12 * 13 * $Id: HtmlDocumentFormat.java,v 1.28 2001/11/30 14:38:44 cursu Exp $ 14 */ 15 16 package gate.corpora; 17 18 import java.util.*; 19 import java.io.*; 20 import java.net.*; 21 22 // html tools 23 import javax.swing.text.html.*; 24 import javax.swing.text.html.parser.*; 25 import javax.swing.text.html.HTMLEditorKit.*; 26 import javax.swing.text.*; 27 28 import gate.util.*; 29 import gate.*; 30 import gate.html.*; 31 import gate.event.*; 32 import gate.creole.*; 33 34 import org.w3c.www.mime.*; 35 36 /** The format of Documents. Subclasses of DocumentFormat know about 37 * particular MIME types and how to unpack the information in any 38 * markup or formatting they contain into GATE annotations. Each MIME 39 * type has its own subclass of DocumentFormat, e.g. XmlDocumentFormat, 40 * RtfDocumentFormat, MpegDocumentFormat. These classes register themselves 41 * with a static index residing here when they are constructed. Static 42 * getDocumentFormat methods can then be used to get the appropriate 43 * format class for a particular document. 44 */ 45 public class HtmlDocumentFormat extends TextualDocumentFormat 46 { 47 48 /** Debug flag */ 49 private static final boolean DEBUG = false; 50 51 /** Default construction */ 52 public HtmlDocumentFormat() { super(); } 53 54 /** Unpack the markup in the document. This converts markup from the 55 * native format (e.g. HTML) into annotations in GATE format. 56 * Uses the markupElementsMap to determine which elements to convert, and 57 * what annotation type names to use. 58 * It always tryes to parse te doc's content. It doesn't matter if the 59 * sourceUrl is null or not. 60 * 61 * @param Document doc The gate document you want to parse. 62 * 63 */ 64 public void unpackMarkup(gate.Document doc) throws DocumentFormatException{ 65 Reader reader = null; 66 URLConnection conn = null; 67 PrintWriter out = null; 68 HTMLEditorKit.Parser parser = new ParserDelegator(); 69 70 if ( doc == null || doc.getContent() == null ){ 71 throw new DocumentFormatException( 72 "GATE document is null or no content found. Nothing to parse!"); 73 }// End if 74 75 reader = new InputStreamReader( 76 new ByteArrayInputStream(doc.getContent().toString().getBytes())); 77 78 // create a new Htmldocument handler 79 HtmlDocumentHandler htmlDocHandler = new 80 HtmlDocumentHandler(doc, this.markupElementsMap); 81 // Create a Status Listener 82 StatusListener statusListener = new StatusListener(){ 83 public void statusChanged(String text){ 84 fireStatusChanged(text); 85 } 86 }; 87 // Register the listener with htmlDocHandler 88 htmlDocHandler.addStatusListener(statusListener); 89 try{ 90 // parse the HTML document 91 parser.parse(reader, htmlDocHandler, true); 92 } catch (IOException e){ 93 throw new DocumentFormatException(e); 94 }finally{ 95 if (htmlDocHandler != null) 96 htmlDocHandler.removeStatusListener(statusListener); 97 }// End try 98 }//unpackMarkup(doc) 99 100 /** Initialise this resource, and return it. */ 101 public Resource init() throws ResourceInstantiationException{ 102 // Register HTML mime type 103 MimeType mime = new MimeType("text","html"); 104 // Register the class handler for this mime type 105 mimeString2ClassHandlerMap.put(mime.getType()+ "/" + mime.getSubtype(), 106 this); 107 // Register the mime type with mine string 108 mimeString2mimeTypeMap.put(mime.getType() + "/" + mime.getSubtype(), mime); 109 // Register file sufixes for this mime type 110 suffixes2mimeTypeMap.put("html",mime); 111 suffixes2mimeTypeMap.put("htm",mime); 112 // Register magic numbers for this mime type 113 magic2mimeTypeMap.put("<html",mime); 114 // Set the mimeType for this language resource 115 setMimeType(mime); 116 return this; 117 }// init() 118 }// class HtmlDocumentFormat 119
|
HtmlDocumentFormat |
|