|
EmailDocumentFormat |
|
1 /* 2 * EmailDocumentFormat.java 3 * 4 * Copyright (c) 1998-2001, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Cristian URSU, 3/Aug/2000 12 * 13 * $Id: EmailDocumentFormat.java,v 1.24 2002/07/05 08:54:08 nasso Exp $ 14 */ 15 16 package gate.corpora; 17 18 import java.util.*; 19 import java.io.*; 20 import java.net.*; 21 22 import gate.util.*; 23 import gate.*; 24 import gate.email.*; 25 import gate.event.*; 26 import gate.creole.*; 27 28 import org.w3c.www.mime.*; 29 30 /** The format of Documents. Subclasses of DocumentFormat know about 31 * particular MIME types and how to unpack the information in any 32 * markup or formatting they contain into GATE annotations. Each MIME 33 * type has its own subclass of DocumentFormat, e.g. XmlDocumentFormat, 34 * RtfDocumentFormat, MpegDocumentFormat. These classes register themselves 35 * with a static index residing here when they are constructed. Static 36 * getDocumentFormat methods can then be used to get the appropriate 37 * format class for a particular document. 38 */ 39 public class EmailDocumentFormat extends TextualDocumentFormat 40 { 41 /** Debug flag */ 42 private static final boolean DEBUG = false; 43 44 /** Default construction */ 45 public EmailDocumentFormat() { super();} 46 47 /** Unpack the markup in the document. This converts markup from the 48 * native format (e.g. EMAIL) into annotations in GATE format. 49 * Uses the markupElementsMap to determine which elements to convert, and 50 * what annotation type names to use. 51 * It always tryes to parse te doc's content. It doesn't matter if the 52 * sourceUrl is null or not. 53 * 54 * @param Document doc The gate document you want to parse. 55 * 56 */ 57 58 public void unpackMarkup(gate.Document doc) throws DocumentFormatException{ 59 if ( (doc == null) || 60 (doc.getSourceUrl() == null && doc.getContent() == null)){ 61 62 throw new DocumentFormatException( 63 "GATE document is null or no content found. Nothing to parse!"); 64 }// End if 65 66 setNewLineProperty(doc); 67 68 // create an EmailDocumentHandler 69 EmailDocumentHandler emailDocHandler = null; 70 emailDocHandler = new gate.email.EmailDocumentHandler( 71 doc, 72 this.markupElementsMap, 73 this.element2StringMap); 74 StatusListener statusListener = new StatusListener(){ 75 public void statusChanged(String text) { 76 // this is implemented in DocumentFormat.java and inherited here 77 fireStatusChanged(text); 78 }//statusChanged(String text) 79 }; 80 // Register a status listener with it 81 emailDocHandler.addStatusListener(statusListener); 82 try{ 83 // Call the method that creates annotations on the gate document 84 emailDocHandler.annotateMessages(); 85 // Process the body annotations and search for paragraphs 86 AnnotationSet bodyAnnotations = doc.getAnnotations( 87 GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME).get("body"); 88 if (bodyAnnotations != null && !bodyAnnotations.isEmpty()){ 89 Iterator iter = bodyAnnotations.iterator(); 90 while(iter.hasNext()){ 91 Annotation a = (Annotation)iter.next(); 92 annotateParagraphs(doc,a.getStartNode().getOffset().intValue(), 93 a.getEndNode().getOffset().intValue(), 94 GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME); 95 }// End while 96 }// End if 97 } catch (IOException e){ 98 throw new DocumentFormatException("Couldn't create a buffered reader ",e); 99 } catch (InvalidOffsetException e){ 100 throw new DocumentFormatException(e); 101 }finally{ 102 emailDocHandler.removeStatusListener(statusListener); 103 }// End try 104 }//unpackMarkup(doc) 105 106 /** Initialise this resource, and return it. */ 107 public Resource init() throws ResourceInstantiationException{ 108 // Register EMAIL mime type 109 MimeType mime = new MimeType("text","email"); 110 // Register the class handler for this mime type 111 mimeString2ClassHandlerMap.put(mime.getType()+ "/" + mime.getSubtype(), 112 this); 113 // Register the mime type with mine string 114 mimeString2mimeTypeMap.put(mime.getType() + "/" + mime.getSubtype(), mime); 115 // Register file sufixes for this mime type 116 suffixes2mimeTypeMap.put("eml",mime); 117 suffixes2mimeTypeMap.put("email",mime); 118 suffixes2mimeTypeMap.put("mail",mime); 119 // Register magic numbers for this mime type 120 magic2mimeTypeMap.put("Subject:",mime); 121 // Set the mimeType for this language resource 122 setMimeType(mime); 123 return this; 124 }// init() 125 }// class EmailDocumentFormat 126 127
|
EmailDocumentFormat |
|