1   /*
2    *  RtfDocumentFormat.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Cristian URSU, 26/July/2000
12   *
13   *  $Id: RtfDocumentFormat.java,v 1.15 2001/11/30 14:38:44 cursu Exp $
14   */
15  
16  package gate.corpora;
17  
18  import java.util.*;
19  import java.io.*;
20  import java.net.*;
21  
22  import gate.util.*;
23  import gate.*;
24  import gate.gui.*;
25  import gate.creole.*;
26  
27  // rtf tools
28  import javax.swing.text.rtf.*;
29  import javax.swing.text.*;
30  import org.w3c.www.mime.*;
31  
32  /** The format of Documents. Subclasses of DocumentFormat know about
33    * particular MIME types and how to unpack the information in any
34    * markup or formatting they contain into GATE annotations. Each MIME
35    * type has its own subclass of DocumentFormat, e.g. XmlDocumentFormat,
36    * RtfDocumentFormat, MpegDocumentFormat. These classes register themselves
37    * with a static index residing here when they are constructed. Static
38    * getDocumentFormat methods can then be used to get the appropriate
39    * format class for a particular document.
40    */
41  public class RtfDocumentFormat extends TextualDocumentFormat{
42  
43    /** Debug flag */
44    private static final boolean DEBUG = false;
45  
46    /** Default construction */
47    public RtfDocumentFormat() { super(); }
48  
49    /** Unpack the markup in the document. This converts markup from the
50      * native format (e.g.RTF) into annotations in GATE format.
51      * Uses the markupElementsMap to determine which elements to convert, and
52      * what annotation type names to use.
53      * It always tryes to parse te doc's content. It doesn't matter if the
54      * sourceUrl is null or not.
55      *
56      * @param Document doc The gate document you want to parse.
57      *
58      */
59    public void unpackMarkup(gate.Document doc) throws DocumentFormatException {
60  
61      if ( (doc == null) ||
62           (doc.getSourceUrl() == null && doc.getContent() == null)){
63  
64        throw new DocumentFormatException(
65                 "GATE document is null or no content found. Nothing to parse!");
66      }// End if
67  
68      // create a RTF editor kit
69      RTFEditorKit aRtfEditorkit = new RTFEditorKit();
70  
71      // create a Styled Document
72      // NOTE that RTF Kit works only with Systled Document interface
73      StyledDocument styledDoc = new DefaultStyledDocument();
74  
75      // get an Input stream from the gate document
76      InputStream in = new ByteArrayInputStream(
77                                           doc.getContent().toString().getBytes()
78                                           );
79  
80      try {
81        aRtfEditorkit.read(in, styledDoc, 0);
82        // replace the document content with the one without markups
83        doc.setContent(new DocumentContentImpl(
84                                        styledDoc.getText(0,styledDoc.getLength())
85                                              )
86                      );
87      } catch (BadLocationException e) {
88        throw new DocumentFormatException(e);
89      } catch (IOException e){
90        throw new DocumentFormatException("I/O exception for " +
91                                          doc.getSourceUrl().toExternalForm(),e);
92      }
93    } // unpackMarkup(doc)
94  
95    /** Initialise this resource, and return it. */
96    public Resource init() throws ResourceInstantiationException{
97      // Register RTF mime type
98      MimeType mime = new MimeType("text","rtf");
99      // Register the class handler for this mime type
100     mimeString2ClassHandlerMap.put(mime.getType()+ "/" + mime.getSubtype(),
101                                                                           this);
102     // Register the mime type with mine string
103     mimeString2mimeTypeMap.put(mime.getType() + "/" + mime.getSubtype(), mime);
104     // Register file sufixes for this mime type
105     suffixes2mimeTypeMap.put("rtf",mime);
106     // Register magic numbers for this mime type
107     magic2mimeTypeMap.put("{\\rtf1",mime);
108     // Set the mimeType for this language resource
109     setMimeType(mime);
110     return this;
111   }// init()
112 }// class RtfDocumentFormat
113