|
DocumentFormat |
|
1 /* 2 * DocumentFormat.java 3 * 4 * Copyright (c) 1998-2001, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Hamish Cunningham, 25/May/2000 12 * 13 * $Id: DocumentFormat.java,v 1.44 2001/11/30 14:38:43 cursu Exp $ 14 */ 15 16 package gate; 17 18 import java.util.*; 19 import java.net.*; 20 import java.io.*; 21 22 import gate.util.*; 23 import gate.event.*; 24 import gate.creole.*; 25 26 import org.w3c.www.mime.*; 27 28 29 /** The format of Documents. Subclasses of DocumentFormat know about 30 * particular MIME types and how to unpack the information in any 31 * markup or formatting they contain into GATE annotations. Each MIME 32 * type has its own subclass of DocumentFormat, e.g. XmlDocumentFormat, 33 * RtfDocumentFormat, MpegDocumentFormat. These classes register themselves 34 * with a static index residing here when they are constructed. Static 35 * getDocumentFormat methods can then be used to get the appropriate 36 * format class for a particular document. 37 */ 38 public abstract class DocumentFormat 39 extends AbstractLanguageResource implements LanguageResource{ 40 /** Debug flag */ 41 private static final boolean DEBUG = false; 42 43 /** This fields indicates whether the document being processed is in a 44 * Gate XML custom format. 45 * Detection is done in runMagicNumbers(). 46 */ 47 protected static boolean isGateXmlDocument = false; 48 49 /** The MIME type of this format. */ 50 private MimeType mimeType = null; 51 52 /** Map of MimeTypeString to ClassHandler class. This is used to find the 53 * language resource that deals with the specific Document format 54 */ 55 protected static Map mimeString2ClassHandlerMap = new HashMap(); 56 /** Map of MimeType to DocumentFormat Class. This is used to find the 57 * DocumentFormat subclass that deals with a particular MIME type. 58 */ 59 protected static Map mimeString2mimeTypeMap = new HashMap(); 60 61 /** Map of Set of file suffixes to MimeType. This is used to figure 62 * out what MIME type a document is from its file name. 63 */ 64 protected static Map suffixes2mimeTypeMap = new HashMap(); 65 66 /** Map of Set of magic numbers to MimeType. This is used to guess the 67 * MIME type of a document, when we don't have any other clues. 68 */ 69 protected static Map magic2mimeTypeMap = new HashMap(); 70 71 /** Map of markup elements to annotation types. If it is null, the 72 * unpackMarkup() method will convert all markup, using the element names 73 * for annotation types. If it is non-null, only those elements specified 74 * here will be converted. 75 */ 76 protected Map markupElementsMap = null; 77 78 /** This map is used inside uppackMarkup() method... 79 * When an element from the map is encounted, The corresponding string 80 * element is added to the document content 81 */ 82 protected Map element2StringMap = null; 83 84 /** The features of this resource */ 85 private FeatureMap features = null; 86 87 /** Default construction */ 88 public DocumentFormat() {} 89 90 /** listeners for status report */ 91 private transient Vector statusListeners; 92 93 /** Unpack the markup in the document. This converts markup from the 94 * native format (e.g. XML, RTF) into annotations in GATE format. 95 * Uses the markupElementsMap to determine which elements to convert, and 96 * what annotation type names to use. 97 */ 98 abstract public void unpackMarkup(Document doc) 99 throws DocumentFormatException; 100 101 /** Unpack the markup in the document. This method calls unpackMarkup on the 102 * GATE document, but after it saves its content as a feature atached to 103 * the document. This method is usefull if one wants to save the content 104 * of the document being unpacked. After the markups have been unpacked, 105 * the content of the document will be replaced with a new one containing 106 * the text between markups. 107 * 108 * @param doc the document that will be upacked 109 * @param originalContentFeatureType the name of the feature that will hold 110 * the document's content. 111 */ 112 public void unpackMarkup( Document doc, 113 String originalContentFeatureType ) 114 throws DocumentFormatException{ 115 FeatureMap fm = doc.getFeatures(); 116 if (fm == null) fm = Factory.newFeatureMap(); 117 fm.put(originalContentFeatureType, doc.getContent().toString()); 118 doc.setFeatures(fm); 119 unpackMarkup(doc); 120 }// unpackMarkup(); 121 122 /** 123 * Returns a MimeType having as input a fileSufix. 124 * If the file sufix is <b>null</b> or not recognised then, 125 * <b>null</b> will be returned. 126 * @param fileSufix The file sufix associated with a recognisabe mime type. 127 * @return The MimeType associated with this file suffix. 128 */ 129 static private MimeType getMimeType(String fileSufix){ 130 // Get a mimeType string associated with this fileSuffix 131 // Eg: for html returns MimeType("text/html"), for xml returns 132 // MimeType("text/xml") 133 if(fileSufix == null) return null; 134 return (MimeType) suffixes2mimeTypeMap.get(fileSufix.toLowerCase()); 135 }//getMimeType 136 137 /** 138 * Returns a MymeType having as input a URL object. If the MimeType wasn't 139 * recognized it returns <b>null</b>. 140 * @param url The URL object from which the MimeType will be extracted 141 * @return A MimeType object for that URL, or <b>null</b> if the Mime Type is 142 * unknown. 143 */ 144 static private MimeType getMimeType(URL url) { 145 String mimeTypeString = null; 146 String charsetFromWebServer = null; 147 String contentType = null; 148 InputStream is = null; 149 MimeType mimeTypeFromWebServer = null; 150 MimeType mimeTypeFromFileSuffix = null; 151 MimeType mimeTypeFromMagicNumbers = null; 152 String fileSufix = null; 153 154 if (url == null) 155 return null; 156 // Ask the web server for the content type 157 // We expect to get contentType something like this: 158 // "text/html; charset=iso-8859-1" 159 // Charset is optional 160 try{ 161 is = url.openConnection().getInputStream(); 162 contentType = url.openConnection().getContentType(); 163 } catch (IOException e){ 164 // Failed to get the content type with te Web server. 165 // Let's try some other methods like FileSuffix or magic numbers. 166 } 167 // If a content Type was returned by the server, try to get the mime Type 168 // string 169 // If contentType is something like this:"text/html; charset=iso-8859-1" 170 // try to get content Type string (text/html) 171 if (contentType != null){ 172 StringTokenizer st = new StringTokenizer(contentType, ";"); 173 // We assume that the first token is the mime type string... 174 // If this doesn't happen then BAD LUCK :(( ... 175 if (st.hasMoreTokens()) 176 mimeTypeString = st.nextToken().toLowerCase(); 177 // The next token it should be the CharSet 178 if (st.hasMoreTokens()) 179 charsetFromWebServer = st.nextToken().toLowerCase(); 180 if (charsetFromWebServer != null){ 181 //We have something like : "charset=iso-8859-1" and let's extract the 182 // encoding. 183 st = new StringTokenizer(charsetFromWebServer, "="); 184 // Don't need this anymore 185 charsetFromWebServer = null; 186 // Discarding the first token which is : "charset" 187 if (st.hasMoreTokens()) 188 st.nextToken().toUpperCase(); 189 // Get the encoding : "ISO-8859-1" 190 if (st.hasMoreTokens()) 191 charsetFromWebServer = st.nextToken().toUpperCase(); 192 } // End if 193 }// end if 194 // Return the corresponding MimeType with WebServer from the associated MAP 195 mimeTypeFromWebServer = (MimeType) 196 mimeString2mimeTypeMap.get(mimeTypeString); 197 // Let's try a file suffix detection 198 // Get the file sufix from the URL.See method definition for more details 199 fileSufix = getFileSufix(url); 200 // Get the mime type based on the on file sufix 201 mimeTypeFromFileSuffix = getMimeType(fileSufix); 202 203 // Let's perform a magic numbers guess.. 204 mimeTypeFromMagicNumbers = guessTypeUsingMagicNumbers(is, 205 charsetFromWebServer); 206 //All those types enter into a deciding system 207 return decideBetweenThreeMimeTypes( mimeTypeFromWebServer, 208 mimeTypeFromFileSuffix, 209 mimeTypeFromMagicNumbers); 210 }//getMimeType 211 212 /** 213 * This method decides what mimeType is in majority 214 * @param aMimeTypeFromWebServer a MimeType 215 * @param aMimeTypeFromFileSuffix a MimeType 216 * @param aMimeTypeFromMagicNumbers a MimeType 217 * @return the MimeType which occurs most. If all are null, then returns 218 * <b>null</b> 219 */ 220 protected static MimeType decideBetweenThreeMimeTypes( 221 MimeType aMimeTypeFromWebServer, 222 MimeType aMimeTypeFromFileSuffix, 223 MimeType aMimeTypeFromMagicNumbers){ 224 225 // First a voting system 226 if (areEqual(aMimeTypeFromWebServer,aMimeTypeFromFileSuffix)) 227 return aMimeTypeFromFileSuffix; 228 if (areEqual(aMimeTypeFromFileSuffix,aMimeTypeFromMagicNumbers)) 229 return aMimeTypeFromFileSuffix; 230 if (areEqual(aMimeTypeFromWebServer,aMimeTypeFromMagicNumbers)) 231 return aMimeTypeFromWebServer; 232 233 // 1 is the highest priority 234 if (aMimeTypeFromFileSuffix != null) 235 aMimeTypeFromFileSuffix.addParameter("Priority","1"); 236 // 2 is the second priority 237 if (aMimeTypeFromWebServer != null) 238 aMimeTypeFromWebServer.addParameter("Priority","2"); 239 // 3 is the third priority 240 if (aMimeTypeFromMagicNumbers != null) 241 aMimeTypeFromMagicNumbers.addParameter("Priority","3"); 242 243 return decideBetweenTwoMimeTypes( 244 decideBetweenTwoMimeTypes(aMimeTypeFromWebServer, 245 aMimeTypeFromFileSuffix), 246 aMimeTypeFromMagicNumbers); 247 248 }// decideBetweenThreeMimeTypes 249 250 /** Decide between two mimeTypes. The decistion is made on "Priority" 251 * parameter set into decideBetweenThreeMimeTypes method. If both mimeTypes 252 * doesn't have "Priority" paramether set, it will return one on them. 253 * @param aMimeType a MimeType object with "Prority" parameter set 254 * @param anotherMimeType a MimeType object with "Prority" parameter set 255 * @return One of the two mime types. 256 */ 257 protected static MimeType decideBetweenTwoMimeTypes( MimeType aMimeType, 258 MimeType anotherMimeType){ 259 if (aMimeType == null) return anotherMimeType; 260 if (anotherMimeType == null) return aMimeType; 261 262 int priority1 = 0; 263 int priority2 = 0; 264 // Both of them are not null 265 if (aMimeType.hasParameter("Priority")) 266 try{ 267 priority1 = 268 new Integer(aMimeType.getParameterValue("Priority")).intValue(); 269 }catch (NumberFormatException e){ 270 return anotherMimeType; 271 } 272 if (anotherMimeType.hasParameter("Priority")) 273 try{ 274 priority2 = 275 new Integer(anotherMimeType.getParameterValue("Priority")).intValue(); 276 }catch (NumberFormatException e){ 277 return aMimeType; 278 } 279 280 // The lower the number, the highest the priority 281 if (priority1 <= priority2) 282 return aMimeType; 283 else 284 return anotherMimeType; 285 }// decideBetweenTwoMimeTypes 286 287 /** 288 * Tests if two MimeType objects are equal. 289 * @return true only if boths MimeType objects are different than <b>null</b> 290 * and their Types and Subtypes are equals. The method is case sensitive. 291 */ 292 protected static boolean areEqual( MimeType aMimeType, 293 MimeType anotherMimeType){ 294 if (aMimeType == null || anotherMimeType == null) 295 return false; 296 297 if ( aMimeType.getType().equals(anotherMimeType.getType()) && 298 aMimeType.getSubtype().equals(anotherMimeType.getSubtype()) 299 ) return true; 300 else 301 return false; 302 }// are Equal 303 304 /** 305 * This method tries to guess the mime Type using some magic numbers. 306 * @param aInputStream a InputStream which has to be transformed into a 307 * InputStreamReader 308 * @param anEncoding the encoding. If is null or unknown then a 309 * InputStreamReader with default encodings will be created. 310 * @return the mime type associated with magic numbers 311 */ 312 protected static MimeType guessTypeUsingMagicNumbers(InputStream aInputStream, 313 String anEncoding){ 314 315 if (aInputStream == null) return null; 316 InputStreamReader reader = null; 317 if (anEncoding != null) 318 try{ 319 reader = new InputStreamReader(aInputStream, anEncoding); 320 } catch (UnsupportedEncodingException e){ 321 reader = null; 322 } 323 if (reader == null) 324 // Create a reader with the default encoding system 325 reader = new InputStreamReader(aInputStream); 326 327 // We have a input stream reader 328 return runMagicNumbers(reader); 329 }//guessTypeUsingMagicNumbers 330 331 /** Performs magic over Gate Document */ 332 protected static MimeType runMagicNumbers(InputStreamReader aReader){ 333 // No reader, nothing to detect 334 if( aReader == null) return null; 335 336 // Prepare to run the magic stuff 337 String strBuffer = null; 338 int bufferSize = 2048; 339 int charReads = 0; 340 char[] cbuf = new char[bufferSize]; 341 342 try { 343 charReads = aReader.read(cbuf,0,bufferSize); 344 } catch (IOException e){ 345 return null; 346 }// End try 347 348 if (charReads == -1) 349 // the document is empty 350 return null; 351 352 // Create a string form the buffer and perform some search on it. 353 strBuffer = new String(cbuf,0,charReads); 354 355 // Detect whether or not is a GateXmlDocument 356 if ( strBuffer.indexOf("<GateDocument") != -1 || 357 strBuffer.indexOf(" GateDocument") != -1) 358 isGateXmlDocument = true; 359 else 360 isGateXmlDocument = false; 361 362 // Run the magic numbers test 363 Set magicSet = magic2mimeTypeMap.keySet(); 364 Iterator iterator=magicSet.iterator(); 365 while (iterator.hasNext()){ 366 String magic = (String) iterator.next(); 367 if (strBuffer.indexOf(magic) != -1) 368 return (MimeType) magic2mimeTypeMap.get(magic); 369 }// End while 370 371 // If this fails then surrender 372 return null; 373 }// runMagicNumbers 374 375 /** 376 * Return the fileSuffix or null if the url doesn't have a file suffix 377 * If the url is null then the file suffix will be null also 378 */ 379 private static String getFileSufix(URL url){ 380 String fileName = null; 381 String fileSuffix = null; 382 383 // GIGO test (garbage in garbage out) 384 if (url != null){ 385 // get the file name from the URL 386 fileName = url.getFile(); 387 388 // tokenize this file name with "." as separator... 389 // the last token will be the file suffix 390 StringTokenizer st = new StringTokenizer(fileName,"."); 391 392 // fileSuffix is the last token 393 while (st.hasMoreTokens()) 394 fileSuffix = st.nextToken(); 395 // here fileSuffix is the last token 396 } // End if 397 return fileSuffix; 398 }//getFileSufix 399 400 /** 401 * Find a DocumentFormat implementation that deals with a particular 402 * MIME type, given that type. 403 * @param aGateDocument this document will receive as a feature 404 * the associated Mime Type. The name of the feature is 405 * MimeType and its value is in the format type/subtype 406 * @param mimeType the mime type that is given as input 407 */ 408 static public DocumentFormat getDocumentFormat(gate.Document aGateDocument, 409 MimeType mimeType){ 410 FeatureMap aFeatureMap = null; 411 if (mimeType != null){ 412 // If the Gate Document doesn't have a feature map atached then 413 // We will create and set one. 414 if(aGateDocument.getFeatures() == null){ 415 aFeatureMap = Factory.newFeatureMap(); 416 aGateDocument.setFeatures(aFeatureMap); 417 }// end if 418 aGateDocument.getFeatures().put("MimeType",mimeType.getType() + "/" + 419 mimeType.getSubtype()); 420 421 return (DocumentFormat) mimeString2ClassHandlerMap.get(mimeType.getType() 422 + "/" + mimeType.getSubtype()); 423 }// end If 424 return null; 425 } // getDocumentFormat(aGateDocument, MimeType) 426 427 /** 428 * Find a DocumentFormat implementation that deals with a particular 429 * MIME type, given the file suffix (e.g. ".txt") that the document came 430 * from. 431 * @param aGateDocument this document will receive as a feature 432 * the associated Mime Type. The name of the feature is 433 * MimeType and its value is in the format type/subtype 434 * @param fileSuffix the file suffix that is given as input 435 */ 436 static public DocumentFormat getDocumentFormat(gate.Document aGateDocument, 437 String fileSuffix) { 438 return getDocumentFormat(aGateDocument, getMimeType(fileSuffix)); 439 } // getDocumentFormat(String) 440 441 /** 442 * Find a DocumentFormat implementation that deals with a particular 443 * MIME type, given the URL of the Document. If it is an HTTP URL, we 444 * can ask the web server. If it has a recognised file extension, we 445 * can use that. Otherwise we need to use a map of magic numbers 446 * to MIME types to guess the type, and then look up the format using the 447 * type. 448 * @param aGateDocument this document will receive as a feature 449 * the associated Mime Type. The name of the feature is 450 * MimeType and its value is in the format type/subtype 451 * @param url the URL that is given as input 452 */ 453 static public DocumentFormat getDocumentFormat(gate.Document aGateDocument, 454 URL url) { 455 return getDocumentFormat(aGateDocument, getMimeType(url)); 456 } // getDocumentFormat(URL) 457 458 /** Get the feature set */ 459 public FeatureMap getFeatures() { return features; } 460 461 /** Get the markup elements map */ 462 public Map getMarkupElementsMap() { return markupElementsMap; } 463 464 /** Get the element 2 string map */ 465 public Map getElement2StringMap() { return element2StringMap; } 466 467 /** Set the markup elements map */ 468 public void setMarkupElementsMap(Map markupElementsMap) { 469 this.markupElementsMap = markupElementsMap; 470 } 471 472 /** Set the element 2 string map */ 473 public void setElement2StringMap(Map anElement2StringMap) { 474 element2StringMap = anElement2StringMap; 475 } 476 477 /** Set the features map*/ 478 public void setFeatures(FeatureMap features){this.features = features;} 479 480 /** Set the mime type*/ 481 482 public void setMimeType(MimeType aMimeType){mimeType = aMimeType;} 483 /** Gets the mime Type*/ 484 public MimeType getMimeType(){return mimeType;} 485 486 //StatusReporter Implementation 487 488 489 public synchronized void removeStatusListener(StatusListener l) { 490 if (statusListeners != null && statusListeners.contains(l)) { 491 Vector v = (Vector) statusListeners.clone(); 492 v.removeElement(l); 493 statusListeners = v; 494 } 495 } 496 public synchronized void addStatusListener(StatusListener l) { 497 Vector v = statusListeners == null ? new Vector(2) : (Vector) statusListeners.clone(); 498 if (!v.contains(l)) { 499 v.addElement(l); 500 statusListeners = v; 501 } 502 } 503 protected void fireStatusChanged(String e) { 504 if (statusListeners != null) { 505 Vector listeners = statusListeners; 506 int count = listeners.size(); 507 for (int i = 0; i < count; i++) { 508 ((StatusListener) listeners.elementAt(i)).statusChanged(e); 509 } 510 } 511 } 512 513 } // class DocumentFormat 514
|
DocumentFormat |
|