1   /*
2    *  TextualDocumentFormat.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Cristian URSU, 26/May/2000
12   *
13   *  $Id: TextualDocumentFormat.java,v 1.18 2001/11/30 14:38:44 cursu Exp $
14   */
15  
16  package gate.corpora;
17  
18  import java.util.*;
19  import java.net.*;
20  
21  import gate.util.*;
22  import gate.*;
23  import gate.creole.*;
24  
25  import org.w3c.www.mime.*;
26  
27  /** The format of Documents. Subclasses of DocumentFormat know about
28    * particular MIME types and how to unpack the information in any
29    * markup or formatting they contain into GATE annotations. Each MIME
30    * type has its own subclass of DocumentFormat, e.g. XmlDocumentFormat,
31    * RtfDocumentFormat, MpegDocumentFormat. These classes register themselves
32    * with a static index residing here when they are constructed. Static
33    * getDocumentFormat methods can then be used to get the appropriate
34    * format class for a particular document.
35    */
36  public class TextualDocumentFormat extends DocumentFormat
37  {
38  
39    /** Debug flag */
40    private static final boolean DEBUG = false;
41  
42    /** Default construction */
43    public TextualDocumentFormat() { super(); }
44  
45    /** Initialise this resource, and return it. */
46    public Resource init() throws ResourceInstantiationException{
47      // Register plain text mime type
48      MimeType mime = new MimeType("text","plain");
49      // Register the class handler for this mime type
50      mimeString2ClassHandlerMap.put(mime.getType()+ "/" + mime.getSubtype(),
51                                                                            this);
52      // Register the mime type with mine string
53      mimeString2mimeTypeMap.put(mime.getType() + "/" + mime.getSubtype(), mime);
54      // Register file sufixes for this mime type
55      suffixes2mimeTypeMap.put("txt",mime);
56      suffixes2mimeTypeMap.put("text",mime);
57      // Set the mimeType for this language resource
58      setMimeType(mime);
59      return this;
60    } // init()
61  
62    /** Unpack the markup in the document. This converts markup from the
63      * native format (e.g. XML, RTF) into annotations in GATE format.
64      * Uses the markupElementsMap to determine which elements to convert, and
65      * what annotation type names to use.
66      */
67    public void unpackMarkup(Document doc) throws DocumentFormatException{
68      if (doc == null || doc.getContent() == null) return;
69      // Create paragraph annotations in the specified annotation set
70      int endOffset = doc.getContent().toString().length();
71      int startOffset = 0;
72      annotateParagraphs(doc,startOffset,endOffset,
73                                  GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
74    }//unpackMarkup
75  
76    /** This method annotates paragraphs in a GATE document. The investigated text
77      * spans beetween start and end offsets and the paragraph annotations are
78      * created in the annotSetName. If annotSetName is null then they are creted
79      * in the default annotation set.
80      * @param aDoc is the gate document on which the paragraph detection would
81      *  be performed.If it is null or its content it's null then the method woul
82      *  simply return doing nothing.
83      * @param startOffset is the index  form the document content from which the
84      * paragraph detection will start
85      * @param endOffset is the offset where the detection will end.
86      * @param annotSetName is the name of the set in which paragraph annotation
87      * would be created.The annotation type created will be "paragraph"
88      */
89    public void annotateParagraphs(Document aDoc,int startOffset,int endOffset,
90                              String annotSetName)throws DocumentFormatException{
91      // Simply return if the document is null or its content
92      if (aDoc == null || aDoc.getContent() == null) return;
93      // Simply return if the start is > than the end
94      if (startOffset > endOffset) return;
95      // Decide where to put the newly detected annotations
96      AnnotationSet annotSet = null;
97      if (annotSetName == null)
98        annotSet = aDoc.getAnnotations();
99      else
100       annotSet = aDoc.getAnnotations(annotSetName);
101     // Extract the document content
102     String content = aDoc.getContent().toString();
103     // This is the offset marking the start of a para
104     int startOffsetPara = startOffset;
105     // This marks the ned of a para
106     int endOffsetPara = endOffset;
107     // The initial sate of the FSA
108     int state = 1;
109     // This field marks that a BR entity was read
110     // A BR entity can be NL or NL CR, depending on the operating system (UNIX
111     // or DOS)
112     boolean readBR = false;
113     int index = startOffset;
114     while (index < endOffset){
115       // Read the current char
116       char ch = content.charAt(index);
117       // Test if a BR entity was read
118       if (ch =='\n'){
119         readBR = true;
120         // If \n is followed by a \r then advance the index in order to read a
121         // BR entity
122         while ((index+1 < endOffset) && (content.charAt(index+1) == '\r'))
123           index ++;
124       }// End if
125       switch(state){
126         // It is the initial and also a final state
127         // Stay in state 1 while it reads whitespaces
128         case 1:{
129           // If reads a non whitespace char then move to state 2 and record
130           // the beggining of a paragraph
131           if (!Character.isWhitespace(ch)){
132             state = 2;
133             startOffsetPara = index;
134           }// End if
135         }break;
136         // It can be also a final state.
137         case 2:{
138           // Stay in state 2 while reading chars != BR entities
139           if (readBR){
140             // If you find a BR char go to state 3. The possible end of the para
141             // can be index. This will be confirmed by state 3. So, this is why
142             // the end of a para is recorded here.
143             readBR = false;
144             endOffsetPara = index;
145             state = 3;
146           }// End if
147         }break;
148         // It can be also a final state
149         // From state 3 there are only 2 possible ways: (state 2 or state1)
150         // In state 1 it needs to read a BR
151         // For state 2 it nead to read something different then a BR
152         case 3:{
153           if (readBR){
154             // A BR was read. Go to state 1
155             readBR = false;
156             state = 1;
157             // Create an annotation type paragraph
158             try{
159               annotSet.add( new Long(startOffsetPara),
160                             new Long(endOffsetPara),
161                             "paragraph",
162                             Factory.newFeatureMap());
163             } catch (gate.util.InvalidOffsetException ioe){
164               throw new DocumentFormatException("Coudn't create a paragraph"+
165               " annotation",ioe);
166             }// End try
167           }else{
168             // Go to state 2 an keep reading chars
169             state = 2;
170           }// End if
171         }break;
172       }// End switch
173       // Prepare to read the next char.
174       index ++;
175     }// End while
176     endOffsetPara = index;
177     // Investigate where the finite automata has stoped
178     if ( state==2 || state==3 ){
179       // Create an annotation type paragraph
180       try{
181         annotSet.add( new Long(startOffsetPara),
182                       // Create the final annotation using the endOffset
183                       new Long(endOffsetPara),
184                       "paragraph",
185                       Factory.newFeatureMap());
186       } catch (gate.util.InvalidOffsetException ioe){
187               throw new DocumentFormatException("Coudn't create a paragraph"+
188               " annotation",ioe);
189       }// End try
190     }// End if
191   }// End annotateParagraphs();
192 
193   public DataStore getDataStore(){ return null;}
194 
195 } // class TextualDocumentFormat
196