|
EmailDocumentFormat |
|
1 /* 2 * EmailDocumentFormat.java 3 * 4 * Copyright (c) 1998-2001, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Cristian URSU, 3/Aug/2000 12 * 13 * $Id: EmailDocumentFormat.java,v 1.23 2001/11/30 14:38:44 cursu Exp $ 14 */ 15 16 package gate.corpora; 17 18 import java.util.*; 19 import java.io.*; 20 import java.net.*; 21 22 import gate.util.*; 23 import gate.*; 24 import gate.email.*; 25 import gate.event.*; 26 import gate.creole.*; 27 28 import org.w3c.www.mime.*; 29 30 /** The format of Documents. Subclasses of DocumentFormat know about 31 * particular MIME types and how to unpack the information in any 32 * markup or formatting they contain into GATE annotations. Each MIME 33 * type has its own subclass of DocumentFormat, e.g. XmlDocumentFormat, 34 * RtfDocumentFormat, MpegDocumentFormat. These classes register themselves 35 * with a static index residing here when they are constructed. Static 36 * getDocumentFormat methods can then be used to get the appropriate 37 * format class for a particular document. 38 */ 39 public class EmailDocumentFormat extends TextualDocumentFormat 40 { 41 /** Debug flag */ 42 private static final boolean DEBUG = false; 43 44 /** Default construction */ 45 public EmailDocumentFormat() { super();} 46 47 /** Unpack the markup in the document. This converts markup from the 48 * native format (e.g. EMAIL) into annotations in GATE format. 49 * Uses the markupElementsMap to determine which elements to convert, and 50 * what annotation type names to use. 51 * It always tryes to parse te doc's content. It doesn't matter if the 52 * sourceUrl is null or not. 53 * 54 * @param Document doc The gate document you want to parse. 55 * 56 */ 57 58 public void unpackMarkup(gate.Document doc) throws DocumentFormatException{ 59 if ( (doc == null) || 60 (doc.getSourceUrl() == null && doc.getContent() == null)){ 61 62 throw new DocumentFormatException( 63 "GATE document is null or no content found. Nothing to parse!"); 64 }// End if 65 // create an EmailDocumentHandler 66 EmailDocumentHandler emailDocHandler = null; 67 emailDocHandler = new gate.email.EmailDocumentHandler( 68 doc, 69 this.markupElementsMap, 70 this.element2StringMap); 71 StatusListener statusListener = new StatusListener(){ 72 public void statusChanged(String text) { 73 // this is implemented in DocumentFormat.java and inherited here 74 fireStatusChanged(text); 75 }//statusChanged(String text) 76 }; 77 // Register a status listener with it 78 emailDocHandler.addStatusListener(statusListener); 79 try{ 80 // Call the method that creates annotations on the gate document 81 emailDocHandler.annotateMessages(); 82 // Process the body annotations and search for paragraphs 83 AnnotationSet bodyAnnotations = doc.getAnnotations( 84 GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME).get("body"); 85 if (bodyAnnotations != null && !bodyAnnotations.isEmpty()){ 86 Iterator iter = bodyAnnotations.iterator(); 87 while(iter.hasNext()){ 88 Annotation a = (Annotation)iter.next(); 89 annotateParagraphs(doc,a.getStartNode().getOffset().intValue(), 90 a.getEndNode().getOffset().intValue(), 91 GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME); 92 }// End while 93 }// End if 94 } catch (IOException e){ 95 throw new DocumentFormatException("Couldn't create a buffered reader ",e); 96 } catch (InvalidOffsetException e){ 97 throw new DocumentFormatException(e); 98 }finally{ 99 emailDocHandler.removeStatusListener(statusListener); 100 }// End try 101 }//unpackMarkup(doc) 102 103 /** Initialise this resource, and return it. */ 104 public Resource init() throws ResourceInstantiationException{ 105 // Register EMAIL mime type 106 MimeType mime = new MimeType("text","email"); 107 // Register the class handler for this mime type 108 mimeString2ClassHandlerMap.put(mime.getType()+ "/" + mime.getSubtype(), 109 this); 110 // Register the mime type with mine string 111 mimeString2mimeTypeMap.put(mime.getType() + "/" + mime.getSubtype(), mime); 112 // Register file sufixes for this mime type 113 suffixes2mimeTypeMap.put("eml",mime); 114 suffixes2mimeTypeMap.put("email",mime); 115 suffixes2mimeTypeMap.put("mail",mime); 116 // Register magic numbers for this mime type 117 magic2mimeTypeMap.put("Subject:",mime); 118 // Set the mimeType for this language resource 119 setMimeType(mime); 120 return this; 121 }// init() 122 }// class EmailDocumentFormat 123 124
|
EmailDocumentFormat |
|