1   /*
2    *  EmailDocumentFormat.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Cristian URSU, 3/Aug/2000
12   *
13   *  $Id: EmailDocumentFormat.java,v 1.23 2001/11/30 14:38:44 cursu Exp $
14   */
15  
16  package gate.corpora;
17  
18  import java.util.*;
19  import java.io.*;
20  import java.net.*;
21  
22  import gate.util.*;
23  import gate.*;
24  import gate.email.*;
25  import gate.event.*;
26  import gate.creole.*;
27  
28  import org.w3c.www.mime.*;
29  
30  /** The format of Documents. Subclasses of DocumentFormat know about
31    * particular MIME types and how to unpack the information in any
32    * markup or formatting they contain into GATE annotations. Each MIME
33    * type has its own subclass of DocumentFormat, e.g. XmlDocumentFormat,
34    * RtfDocumentFormat, MpegDocumentFormat. These classes register themselves
35    * with a static index residing here when they are constructed. Static
36    * getDocumentFormat methods can then be used to get the appropriate
37    * format class for a particular document.
38    */
39  public class EmailDocumentFormat extends TextualDocumentFormat
40  {
41    /** Debug flag */
42    private static final boolean DEBUG = false;
43  
44    /** Default construction */
45    public EmailDocumentFormat() { super();}
46  
47    /** Unpack the markup in the document. This converts markup from the
48      * native format (e.g. EMAIL) into annotations in GATE format.
49      * Uses the markupElementsMap to determine which elements to convert, and
50      * what annotation type names to use.
51      * It always tryes to parse te doc's content. It doesn't matter if the
52      * sourceUrl is null or not.
53      *
54      * @param Document doc The gate document you want to parse.
55      *
56      */
57  
58    public void unpackMarkup(gate.Document doc) throws DocumentFormatException{
59      if ( (doc == null) ||
60           (doc.getSourceUrl() == null && doc.getContent() == null)){
61  
62        throw new DocumentFormatException(
63                 "GATE document is null or no content found. Nothing to parse!");
64      }// End if
65      // create an EmailDocumentHandler
66      EmailDocumentHandler emailDocHandler = null;
67      emailDocHandler = new  gate.email.EmailDocumentHandler(
68                                                         doc,
69                                                         this.markupElementsMap,
70                                                         this.element2StringMap);
71      StatusListener statusListener = new StatusListener(){
72          public void statusChanged(String text) {
73            // this is implemented in DocumentFormat.java and inherited here
74            fireStatusChanged(text);
75          }//statusChanged(String text)
76      };
77      // Register a status listener with it
78      emailDocHandler.addStatusListener(statusListener);
79      try{
80        // Call the method that creates annotations on the gate document
81        emailDocHandler.annotateMessages();
82        // Process the body annotations and search for paragraphs
83        AnnotationSet bodyAnnotations = doc.getAnnotations(
84                      GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME).get("body");
85        if (bodyAnnotations != null && !bodyAnnotations.isEmpty()){
86          Iterator iter = bodyAnnotations.iterator();
87          while(iter.hasNext()){
88            Annotation a = (Annotation)iter.next();
89            annotateParagraphs(doc,a.getStartNode().getOffset().intValue(),
90                                   a.getEndNode().getOffset().intValue(),
91                                   GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
92          }// End while
93        }// End if
94      } catch (IOException e){
95        throw new DocumentFormatException("Couldn't create a buffered reader ",e);
96      } catch (InvalidOffsetException e){
97        throw new DocumentFormatException(e);
98      }finally{
99        emailDocHandler.removeStatusListener(statusListener);
100     }// End try
101   }//unpackMarkup(doc)
102 
103   /** Initialise this resource, and return it. */
104   public Resource init() throws ResourceInstantiationException{
105     // Register EMAIL mime type
106     MimeType mime = new MimeType("text","email");
107     // Register the class handler for this mime type
108     mimeString2ClassHandlerMap.put(mime.getType()+ "/" + mime.getSubtype(),
109                                                                           this);
110     // Register the mime type with mine string
111     mimeString2mimeTypeMap.put(mime.getType() + "/" + mime.getSubtype(), mime);
112     // Register file sufixes for this mime type
113     suffixes2mimeTypeMap.put("eml",mime);
114     suffixes2mimeTypeMap.put("email",mime);
115     suffixes2mimeTypeMap.put("mail",mime);
116     // Register magic numbers for this mime type
117     magic2mimeTypeMap.put("Subject:",mime);
118     // Set the mimeType for this language resource
119     setMimeType(mime);
120     return this;
121   }// init()
122 }// class EmailDocumentFormat
123 
124