1   /*
2    *  TextualDocumentFormat.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Cristian URSU, 26/May/2000
12   *
13   *  $Id: TextualDocumentFormat.java,v 1.20 2002/01/28 14:25:09 nasso Exp $
14   */
15  
16  package gate.corpora;
17  
18  import java.util.*;
19  import java.net.*;
20  
21  import gate.util.*;
22  import gate.*;
23  import gate.creole.*;
24  
25  import org.w3c.www.mime.*;
26  
27  /** The format of Documents. Subclasses of DocumentFormat know about
28    * particular MIME types and how to unpack the information in any
29    * markup or formatting they contain into GATE annotations. Each MIME
30    * type has its own subclass of DocumentFormat, e.g. XmlDocumentFormat,
31    * RtfDocumentFormat, MpegDocumentFormat. These classes register themselves
32    * with a static index residing here when they are constructed. Static
33    * getDocumentFormat methods can then be used to get the appropriate
34    * format class for a particular document.
35    */
36  public class TextualDocumentFormat extends DocumentFormat
37  {
38  
39    /** Debug flag */
40    private static final boolean DEBUG = false;
41  
42    /** Default construction */
43    public TextualDocumentFormat() { super(); }
44  
45    /** Initialise this resource, and return it. */
46    public Resource init() throws ResourceInstantiationException{
47      // Register plain text mime type
48      MimeType mime = new MimeType("text","plain");
49      // Register the class handler for this mime type
50      mimeString2ClassHandlerMap.put(mime.getType()+ "/" + mime.getSubtype(),
51                                                                            this);
52      // Register the mime type with mine string
53      mimeString2mimeTypeMap.put(mime.getType() + "/" + mime.getSubtype(), mime);
54      // Register file sufixes for this mime type
55      suffixes2mimeTypeMap.put("txt",mime);
56      suffixes2mimeTypeMap.put("text",mime);
57      // Set the mimeType for this language resource
58      setMimeType(mime);
59      return this;
60    } // init()
61  
62    /** Unpack the markup in the document. This converts markup from the
63      * native format (e.g. XML, RTF) into annotations in GATE format.
64      * Uses the markupElementsMap to determine which elements to convert, and
65      * what annotation type names to use.
66      */
67    public void unpackMarkup(Document doc) throws DocumentFormatException{
68      if (doc == null || doc.getContent() == null) return;
69      // Create paragraph annotations in the specified annotation set
70      int endOffset = doc.getContent().toString().length();
71      int startOffset = 0;
72      annotateParagraphs(doc,startOffset,endOffset,
73                                  GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
74    }//unpackMarkup
75  
76    public void unpackMarkup(Document doc, RepositioningInfo repInfo,
77                              RepositioningInfo ampCodingInfo)
78                                        throws DocumentFormatException {
79      unpackMarkup(doc);
80    } // unpackMarkup
81  
82  
83    /** This method annotates paragraphs in a GATE document. The investigated text
84      * spans beetween start and end offsets and the paragraph annotations are
85      * created in the annotSetName. If annotSetName is null then they are creted
86      * in the default annotation set.
87      * @param aDoc is the gate document on which the paragraph detection would
88      *  be performed.If it is null or its content it's null then the method woul
89      *  simply return doing nothing.
90      * @param startOffset is the index  form the document content from which the
91      * paragraph detection will start
92      * @param endOffset is the offset where the detection will end.
93      * @param annotSetName is the name of the set in which paragraph annotation
94      * would be created.The annotation type created will be "paragraph"
95      */
96    public void annotateParagraphs(Document aDoc,int startOffset,int endOffset,
97                              String annotSetName)throws DocumentFormatException{
98      // Simply return if the document is null or its content
99      if (aDoc == null || aDoc.getContent() == null) return;
100     // Simply return if the start is > than the end
101     if (startOffset > endOffset) return;
102     // Decide where to put the newly detected annotations
103     AnnotationSet annotSet = null;
104     if (annotSetName == null)
105       annotSet = aDoc.getAnnotations();
106     else
107       annotSet = aDoc.getAnnotations(annotSetName);
108     // Extract the document content
109     String content = aDoc.getContent().toString();
110     // This is the offset marking the start of a para
111     int startOffsetPara = startOffset;
112     // This marks the ned of a para
113     int endOffsetPara = endOffset;
114     // The initial sate of the FSA
115     int state = 1;
116     // This field marks that a BR entity was read
117     // A BR entity can be NL or NL CR, depending on the operating system (UNIX
118     // or DOS)
119     boolean readBR = false;
120     int index = startOffset;
121     while (index < endOffset){
122       // Read the current char
123       char ch = content.charAt(index);
124       // Test if a BR entity was read
125       if (ch =='\n'){
126         readBR = true;
127         // If \n is followed by a \r then advance the index in order to read a
128         // BR entity
129         while ((index+1 < endOffset) && (content.charAt(index+1) == '\r'))
130           index ++;
131       }// End if
132       switch(state){
133         // It is the initial and also a final state
134         // Stay in state 1 while it reads whitespaces
135         case 1:{
136           // If reads a non whitespace char then move to state 2 and record
137           // the beggining of a paragraph
138           if (!Character.isWhitespace(ch)){
139             state = 2;
140             startOffsetPara = index;
141           }// End if
142         }break;
143         // It can be also a final state.
144         case 2:{
145           // Stay in state 2 while reading chars != BR entities
146           if (readBR){
147             // If you find a BR char go to state 3. The possible end of the para
148             // can be index. This will be confirmed by state 3. So, this is why
149             // the end of a para is recorded here.
150             readBR = false;
151             endOffsetPara = index;
152             state = 3;
153           }// End if
154         }break;
155         // It can be also a final state
156         // From state 3 there are only 2 possible ways: (state 2 or state1)
157         // In state 1 it needs to read a BR
158         // For state 2 it nead to read something different then a BR
159         case 3:{
160           if (readBR){
161             // A BR was read. Go to state 1
162             readBR = false;
163             state = 1;
164             // Create an annotation type paragraph
165             try{
166               annotSet.add( new Long(startOffsetPara),
167                             new Long(endOffsetPara),
168                             "paragraph",
169                             Factory.newFeatureMap());
170             } catch (gate.util.InvalidOffsetException ioe){
171               throw new DocumentFormatException("Coudn't create a paragraph"+
172               " annotation",ioe);
173             }// End try
174           }else{
175             // Go to state 2 an keep reading chars
176             state = 2;
177           }// End if
178         }break;
179       }// End switch
180       // Prepare to read the next char.
181       index ++;
182     }// End while
183     endOffsetPara = index;
184     // Investigate where the finite automata has stoped
185     if ( state==2 || state==3 ){
186       // Create an annotation type paragraph
187       try{
188         annotSet.add( new Long(startOffsetPara),
189                       // Create the final annotation using the endOffset
190                       new Long(endOffsetPara),
191                       "paragraph",
192                       Factory.newFeatureMap());
193       } catch (gate.util.InvalidOffsetException ioe){
194               throw new DocumentFormatException("Coudn't create a paragraph"+
195               " annotation",ioe);
196       }// End try
197     }// End if
198   }// End annotateParagraphs();
199 
200   public DataStore getDataStore(){ return null;}
201 
202 } // class TextualDocumentFormat
203