|
TextualDocumentFormat |
|
1 /* 2 * TextualDocumentFormat.java 3 * 4 * Copyright (c) 1998-2001, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Cristian URSU, 26/May/2000 12 * 13 * $Id: TextualDocumentFormat.java,v 1.18 2001/11/30 14:38:44 cursu Exp $ 14 */ 15 16 package gate.corpora; 17 18 import java.util.*; 19 import java.net.*; 20 21 import gate.util.*; 22 import gate.*; 23 import gate.creole.*; 24 25 import org.w3c.www.mime.*; 26 27 /** The format of Documents. Subclasses of DocumentFormat know about 28 * particular MIME types and how to unpack the information in any 29 * markup or formatting they contain into GATE annotations. Each MIME 30 * type has its own subclass of DocumentFormat, e.g. XmlDocumentFormat, 31 * RtfDocumentFormat, MpegDocumentFormat. These classes register themselves 32 * with a static index residing here when they are constructed. Static 33 * getDocumentFormat methods can then be used to get the appropriate 34 * format class for a particular document. 35 */ 36 public class TextualDocumentFormat extends DocumentFormat 37 { 38 39 /** Debug flag */ 40 private static final boolean DEBUG = false; 41 42 /** Default construction */ 43 public TextualDocumentFormat() { super(); } 44 45 /** Initialise this resource, and return it. */ 46 public Resource init() throws ResourceInstantiationException{ 47 // Register plain text mime type 48 MimeType mime = new MimeType("text","plain"); 49 // Register the class handler for this mime type 50 mimeString2ClassHandlerMap.put(mime.getType()+ "/" + mime.getSubtype(), 51 this); 52 // Register the mime type with mine string 53 mimeString2mimeTypeMap.put(mime.getType() + "/" + mime.getSubtype(), mime); 54 // Register file sufixes for this mime type 55 suffixes2mimeTypeMap.put("txt",mime); 56 suffixes2mimeTypeMap.put("text",mime); 57 // Set the mimeType for this language resource 58 setMimeType(mime); 59 return this; 60 } // init() 61 62 /** Unpack the markup in the document. This converts markup from the 63 * native format (e.g. XML, RTF) into annotations in GATE format. 64 * Uses the markupElementsMap to determine which elements to convert, and 65 * what annotation type names to use. 66 */ 67 public void unpackMarkup(Document doc) throws DocumentFormatException{ 68 if (doc == null || doc.getContent() == null) return; 69 // Create paragraph annotations in the specified annotation set 70 int endOffset = doc.getContent().toString().length(); 71 int startOffset = 0; 72 annotateParagraphs(doc,startOffset,endOffset, 73 GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME); 74 }//unpackMarkup 75 76 /** This method annotates paragraphs in a GATE document. The investigated text 77 * spans beetween start and end offsets and the paragraph annotations are 78 * created in the annotSetName. If annotSetName is null then they are creted 79 * in the default annotation set. 80 * @param aDoc is the gate document on which the paragraph detection would 81 * be performed.If it is null or its content it's null then the method woul 82 * simply return doing nothing. 83 * @param startOffset is the index form the document content from which the 84 * paragraph detection will start 85 * @param endOffset is the offset where the detection will end. 86 * @param annotSetName is the name of the set in which paragraph annotation 87 * would be created.The annotation type created will be "paragraph" 88 */ 89 public void annotateParagraphs(Document aDoc,int startOffset,int endOffset, 90 String annotSetName)throws DocumentFormatException{ 91 // Simply return if the document is null or its content 92 if (aDoc == null || aDoc.getContent() == null) return; 93 // Simply return if the start is > than the end 94 if (startOffset > endOffset) return; 95 // Decide where to put the newly detected annotations 96 AnnotationSet annotSet = null; 97 if (annotSetName == null) 98 annotSet = aDoc.getAnnotations(); 99 else 100 annotSet = aDoc.getAnnotations(annotSetName); 101 // Extract the document content 102 String content = aDoc.getContent().toString(); 103 // This is the offset marking the start of a para 104 int startOffsetPara = startOffset; 105 // This marks the ned of a para 106 int endOffsetPara = endOffset; 107 // The initial sate of the FSA 108 int state = 1; 109 // This field marks that a BR entity was read 110 // A BR entity can be NL or NL CR, depending on the operating system (UNIX 111 // or DOS) 112 boolean readBR = false; 113 int index = startOffset; 114 while (index < endOffset){ 115 // Read the current char 116 char ch = content.charAt(index); 117 // Test if a BR entity was read 118 if (ch =='\n'){ 119 readBR = true; 120 // If \n is followed by a \r then advance the index in order to read a 121 // BR entity 122 while ((index+1 < endOffset) && (content.charAt(index+1) == '\r')) 123 index ++; 124 }// End if 125 switch(state){ 126 // It is the initial and also a final state 127 // Stay in state 1 while it reads whitespaces 128 case 1:{ 129 // If reads a non whitespace char then move to state 2 and record 130 // the beggining of a paragraph 131 if (!Character.isWhitespace(ch)){ 132 state = 2; 133 startOffsetPara = index; 134 }// End if 135 }break; 136 // It can be also a final state. 137 case 2:{ 138 // Stay in state 2 while reading chars != BR entities 139 if (readBR){ 140 // If you find a BR char go to state 3. The possible end of the para 141 // can be index. This will be confirmed by state 3. So, this is why 142 // the end of a para is recorded here. 143 readBR = false; 144 endOffsetPara = index; 145 state = 3; 146 }// End if 147 }break; 148 // It can be also a final state 149 // From state 3 there are only 2 possible ways: (state 2 or state1) 150 // In state 1 it needs to read a BR 151 // For state 2 it nead to read something different then a BR 152 case 3:{ 153 if (readBR){ 154 // A BR was read. Go to state 1 155 readBR = false; 156 state = 1; 157 // Create an annotation type paragraph 158 try{ 159 annotSet.add( new Long(startOffsetPara), 160 new Long(endOffsetPara), 161 "paragraph", 162 Factory.newFeatureMap()); 163 } catch (gate.util.InvalidOffsetException ioe){ 164 throw new DocumentFormatException("Coudn't create a paragraph"+ 165 " annotation",ioe); 166 }// End try 167 }else{ 168 // Go to state 2 an keep reading chars 169 state = 2; 170 }// End if 171 }break; 172 }// End switch 173 // Prepare to read the next char. 174 index ++; 175 }// End while 176 endOffsetPara = index; 177 // Investigate where the finite automata has stoped 178 if ( state==2 || state==3 ){ 179 // Create an annotation type paragraph 180 try{ 181 annotSet.add( new Long(startOffsetPara), 182 // Create the final annotation using the endOffset 183 new Long(endOffsetPara), 184 "paragraph", 185 Factory.newFeatureMap()); 186 } catch (gate.util.InvalidOffsetException ioe){ 187 throw new DocumentFormatException("Coudn't create a paragraph"+ 188 " annotation",ioe); 189 }// End try 190 }// End if 191 }// End annotateParagraphs(); 192 193 public DataStore getDataStore(){ return null;} 194 195 } // class TextualDocumentFormat 196
|
TextualDocumentFormat |
|