1   /*
2    *  TextualDocumentFormat.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Cristian URSU, 26/May/2000
12   *
13   *  $Id: TextualDocumentFormat.java,v 1.21 2002/07/05 08:54:08 nasso Exp $
14   */
15  
16  package gate.corpora;
17  
18  import java.util.*;
19  import java.net.*;
20  
21  import gate.util.*;
22  import gate.*;
23  import gate.creole.*;
24  
25  import org.w3c.www.mime.*;
26  
/** The format of plain-text Documents. Subclasses of DocumentFormat know
  * about particular MIME types and how to unpack the information in any
  * markup or formatting they contain into GATE annotations. Each MIME
  * type has its own subclass of DocumentFormat, e.g. XmlDocumentFormat,
  * RtfDocumentFormat, MpegDocumentFormat. This subclass handles the
  * <code>text/plain</code> MIME type (file suffixes <code>txt</code> and
  * <code>text</code>): it registers itself in the format lookup maps when
  * it is initialised, and unpacking a document records the new line
  * convention of its content and creates "paragraph" annotations.
  */
36  public class TextualDocumentFormat extends DocumentFormat
37  {
38  
39    /** Debug flag */
40    private static final boolean DEBUG = false;
41  
42    /** Default construction */
43    public TextualDocumentFormat() { super(); }
44  
45    /** Initialise this resource, and return it. */
46    public Resource init() throws ResourceInstantiationException{
47      // Register plain text mime type
48      MimeType mime = new MimeType("text","plain");
49      // Register the class handler for this mime type
50      mimeString2ClassHandlerMap.put(mime.getType()+ "/" + mime.getSubtype(),
51                                                                            this);
52      // Register the mime type with mine string
53      mimeString2mimeTypeMap.put(mime.getType() + "/" + mime.getSubtype(), mime);
54      // Register file sufixes for this mime type
55      suffixes2mimeTypeMap.put("txt",mime);
56      suffixes2mimeTypeMap.put("text",mime);
57      // Set the mimeType for this language resource
58      setMimeType(mime);
59      return this;
60    } // init()
61  
62    /** Unpack the markup in the document. This converts markup from the
63      * native format (e.g. XML, RTF) into annotations in GATE format.
64      * Uses the markupElementsMap to determine which elements to convert, and
65      * what annotation type names to use.
66      */
67    public void unpackMarkup(Document doc) throws DocumentFormatException{
68      if (doc == null || doc.getContent() == null) return;
69      setNewLineProperty(doc);
70      // Create paragraph annotations in the specified annotation set
71      int endOffset = doc.getContent().toString().length();
72      int startOffset = 0;
73      annotateParagraphs(doc,startOffset,endOffset,
74                                  GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
75    }//unpackMarkup
76  
77    public void unpackMarkup(Document doc, RepositioningInfo repInfo,
78                              RepositioningInfo ampCodingInfo)
79                                        throws DocumentFormatException {
80      unpackMarkup(doc);
81    } // unpackMarkup
82  
83  
84    /**
85     * Check the new line sequence and set document property.
86     * <BR>
87     * Possible values are CRLF, LFCR, CR, LF
88     */
89    protected void setNewLineProperty(Document doc) {
90      String content = doc.getContent().toString();
91      String newLineType = "";
92      
93      char ch = ' ';
94      char lastch = ' ';
95      for(int i=0; i < content.length(); ++i) {
96        ch = content.charAt(i);
97        if(lastch == '\r') {
98          if(ch == '\n') {
99            newLineType = "CRLF";
100           break;
101         }
102         else {
103           newLineType = "CR";
104           break;
105         }
106       }
107       if(lastch == '\n') {
108         if(ch == '\r') {
109           newLineType = "LFCR";
110           break;
111         }
112         else {
113           newLineType = "LF";
114           break;
115         }
116       }
117       lastch = ch;
118     } // for
119     
120     doc.getFeatures().put(GateConstants.DOCUMENT_NEW_LINE_TYPE, newLineType);
121   } // setNewLineProperty()
122   
123   /** Delete '\r' in combination CRLF or LFCR in document content */
124   private void removeExtraNewLine(Document doc) {
125     String content = doc.getContent().toString();
126     StringBuffer buff = new StringBuffer(content);
127     
128     char ch = ' ';
129     char lastch = ' ';
130     for(int i=content.length()-1; i > -1; --i) {
131       ch = content.charAt(i);
132       if(ch == '\n' && lastch == '\r') {
133         buff.deleteCharAt(i+1);
134       }
135       if(ch == '\r' && lastch == '\n') {
136         buff.deleteCharAt(i);
137         ch = lastch;
138       }
139       lastch = ch;
140     } // for
141     
142     doc.setContent(new DocumentContentImpl(buff.toString()));
143   } // removeExtraNewLine(Document doc)
144 
145   /** This method annotates paragraphs in a GATE document. The investigated text
146     * spans beetween start and end offsets and the paragraph annotations are
147     * created in the annotSetName. If annotSetName is null then they are creted
148     * in the default annotation set.
149     * @param aDoc is the gate document on which the paragraph detection would
150     *  be performed.If it is null or its content it's null then the method woul
151     *  simply return doing nothing.
152     * @param startOffset is the index  form the document content from which the
153     * paragraph detection will start
154     * @param endOffset is the offset where the detection will end.
155     * @param annotSetName is the name of the set in which paragraph annotation
156     * would be created.The annotation type created will be "paragraph"
157     */
158   public void annotateParagraphs(Document aDoc,int startOffset,int endOffset,
159                             String annotSetName)throws DocumentFormatException{
160     // Simply return if the document is null or its content
161     if (aDoc == null || aDoc.getContent() == null) return;
162     // Simply return if the start is > than the end
163     if (startOffset > endOffset) return;
164     // Decide where to put the newly detected annotations
165     AnnotationSet annotSet = null;
166     if (annotSetName == null)
167       annotSet = aDoc.getAnnotations();
168     else
169       annotSet = aDoc.getAnnotations(annotSetName);
170     // Extract the document content
171     String content = aDoc.getContent().toString();
172     // This is the offset marking the start of a para
173     int startOffsetPara = startOffset;
174     // This marks the ned of a para
175     int endOffsetPara = endOffset;
176     // The initial sate of the FSA
177     int state = 1;
178     // This field marks that a BR entity was read
179     // A BR entity can be NL or NL CR, depending on the operating system (UNIX
180     // or DOS)
181     boolean readBR = false;
182     int index = startOffset;
183     while (index < endOffset){
184       // Read the current char
185       char ch = content.charAt(index);
186       // Test if a BR entity was read
187       if (ch =='\n'){
188         readBR = true;
189         // If \n is followed by a \r then advance the index in order to read a
190         // BR entity
191         while ((index+1 < endOffset) && (content.charAt(index+1) == '\r'))
192           index ++;
193       }// End if
194       switch(state){
195         // It is the initial and also a final state
196         // Stay in state 1 while it reads whitespaces
197         case 1:{
198           // If reads a non whitespace char then move to state 2 and record
199           // the beggining of a paragraph
200           if (!Character.isWhitespace(ch)){
201             state = 2;
202             startOffsetPara = index;
203           }// End if
204         }break;
205         // It can be also a final state.
206         case 2:{
207           // Stay in state 2 while reading chars != BR entities
208           if (readBR){
209             // If you find a BR char go to state 3. The possible end of the para
210             // can be index. This will be confirmed by state 3. So, this is why
211             // the end of a para is recorded here.
212             readBR = false;
213             endOffsetPara = index;
214             state = 3;
215           }// End if
216         }break;
217         // It can be also a final state
218         // From state 3 there are only 2 possible ways: (state 2 or state1)
219         // In state 1 it needs to read a BR
220         // For state 2 it nead to read something different then a BR
221         case 3:{
222           if (readBR){
223             // A BR was read. Go to state 1
224             readBR = false;
225             state = 1;
226             // Create an annotation type paragraph
227             try{
228               annotSet.add( new Long(startOffsetPara),
229                             new Long(endOffsetPara),
230                             "paragraph",
231                             Factory.newFeatureMap());
232             } catch (gate.util.InvalidOffsetException ioe){
233               throw new DocumentFormatException("Coudn't create a paragraph"+
234               " annotation",ioe);
235             }// End try
236           }else{
237             // Go to state 2 an keep reading chars
238             state = 2;
239           }// End if
240         }break;
241       }// End switch
242       // Prepare to read the next char.
243       index ++;
244     }// End while
245     endOffsetPara = index;
246     // Investigate where the finite automata has stoped
247     if ( state==2 || state==3 ){
248       // Create an annotation type paragraph
249       try{
250         annotSet.add( new Long(startOffsetPara),
251                       // Create the final annotation using the endOffset
252                       new Long(endOffsetPara),
253                       "paragraph",
254                       Factory.newFeatureMap());
255       } catch (gate.util.InvalidOffsetException ioe){
256               throw new DocumentFormatException("Coudn't create a paragraph"+
257               " annotation",ioe);
258       }// End try
259     }// End if
260   }// End annotateParagraphs();
261 
262   public DataStore getDataStore(){ return null;}
263 
264 } // class TextualDocumentFormat
265