|
DocumentFormat |
|
/*
 *  DocumentFormat.java
 *
 *  Copyright (c) 1998-2001, The University of Sheffield.
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  Hamish Cunningham, 25/May/2000
 *
 *  $Id: DocumentFormat.java,v 1.46 2002/01/28 14:26:58 nasso Exp $
 */

package gate;

import java.util.*;
import java.net.*;
import java.io.*;

import gate.util.*;
import gate.event.*;
import gate.creole.*;
import gate.corpora.RepositioningInfo;

import org.w3c.www.mime.*;


/** The format of Documents. Subclasses of DocumentFormat know about
  * particular MIME types and how to unpack the information in any
  * markup or formatting they contain into GATE annotations. Each MIME
  * type has its own subclass of DocumentFormat, e.g. XmlDocumentFormat,
  * RtfDocumentFormat, MpegDocumentFormat. These classes register themselves
  * with a static index residing here when they are constructed. Static
  * getDocumentFormat methods can then be used to get the appropriate
  * format class for a particular document.
  * <p>
  * The MIME type of a URL is worked out from up to three sources (the
  * content type reported by the connection, the file suffix and a scan of
  * the first characters of the content for known "magic" strings); see
  * {@link #decideBetweenThreeMimeTypes} for how they are reconciled.
  */
public abstract class DocumentFormat
       extends AbstractLanguageResource implements LanguageResource{
  /** Debug flag */
  private static final boolean DEBUG = false;

  /** This field indicates whether the document being processed is in the
    * GATE XML custom format.
    * Detection is done in runMagicNumbers(). Note that this is static
    * state shared by all formats: it reflects the last document on which
    * runMagicNumbers() managed to read some content.
    */
  protected static boolean isGateXmlDocument = false;

  /** The MIME type of this format. */
  private MimeType mimeType = null;

  /** Map of MIME type string ("type/subtype") to the DocumentFormat that
    * handles it. This is used to find the language resource that deals with
    * a specific document format.
    */
  protected static Map mimeString2ClassHandlerMap = new HashMap();

  /** Map of MIME type string to MimeType object. This is used to turn the
    * content type reported for a URL into a MimeType.
    */
  protected static Map mimeString2mimeTypeMap = new HashMap();

  /** Map of file suffix to MimeType. This is used to figure
    * out what MIME type a document is from its file name. It is looked up
    * with lower-cased suffixes.
    */
  protected static Map suffixes2mimeTypeMap = new HashMap();

  /** Map of magic string to MimeType. This is used to guess the
    * MIME type of a document, when we don't have any other clues.
    */
  protected static Map magic2mimeTypeMap = new HashMap();

  /** Map of markup elements to annotation types. If it is null, the
    * unpackMarkup() method will convert all markup, using the element names
    * for annotation types. If it is non-null, only those elements specified
    * here will be converted.
    */
  protected Map markupElementsMap = null;

  /** This map is used inside the unpackMarkup() method.
    * When an element from the map is encountered, the corresponding string
    * element is added to the document content.
    */
  protected Map element2StringMap = null;

  /** The features of this resource */
  private FeatureMap features = null;

  /** Default construction */
  public DocumentFormat() {}

  /** Listeners for status reports. The vector is never modified in place:
    * add/remove replace it with a modified copy, so fireStatusChanged() can
    * iterate over a snapshot without locking.
    */
  private transient Vector statusListeners;

  /** Flag for enable/disable collecting of repositioning information */
  private Boolean shouldCollectRepositioning = Boolean.FALSE;

  /** If the document format could collect repositioning information
    * during the unpack phase this method will return <B>true</B>.
    * <BR>
    * You should override this method in the child class of the defined
    * document format if it could collect the repositioning information.
    *
    * @return <B>false</B> in this base implementation.
    */
  public Boolean supportsRepositioning() {
    return Boolean.FALSE;
  } // supportsRepositioning

  /** Enables or disables the collecting of repositioning information.
    * The flag can only be switched on when {@link #supportsRepositioning()}
    * returns true; in every other case (including a <b>null</b> argument)
    * it is switched off.
    *
    * @param b the requested value of the flag.
    */
  public void setShouldCollectRepositioning(Boolean b) {
    boolean supported = supportsRepositioning().booleanValue();
    // a null request is treated as "false" instead of failing with an NPE
    boolean requested = (b != null) && b.booleanValue();
    shouldCollectRepositioning =
      (supported && requested) ? Boolean.TRUE : Boolean.FALSE;
  } // setShouldCollectRepositioning

  /** Tells whether repositioning information should be collected.
    *
    * @return the value stored by
    * {@link #setShouldCollectRepositioning(Boolean)}, false by default.
    */
  public Boolean getShouldCollectRepositioning() {
    return shouldCollectRepositioning;
  } // getShouldCollectRepositioning

  /** Unpack the markup in the document. This converts markup from the
    * native format (e.g. XML, RTF) into annotations in GATE format.
    * Uses the markupElementsMap to determine which elements to convert, and
    * what annotation type names to use.
    *
    * @param doc the document whose markup will be unpacked.
    * @throws DocumentFormatException declared for implementations to
    * report a failure to unpack.
    */
  abstract public void unpackMarkup(Document doc)
                                      throws DocumentFormatException;

  /** Unpack the markup in the document, with two RepositioningInfo
    * arguments. How they are used is up to the implementing subclass.
    *
    * @param doc the document whose markup will be unpacked.
    * @param repInfo repositioning information (see RepositioningInfo).
    * @param ampCodingInfo a second set of repositioning information.
    * @throws DocumentFormatException declared for implementations to
    * report a failure to unpack.
    */
  abstract public void unpackMarkup(Document doc, RepositioningInfo repInfo,
                                          RepositioningInfo ampCodingInfo)
                                      throws DocumentFormatException;

  /** Unpack the markup in the document. This method calls unpackMarkup on the
    * GATE document, but first it saves the document's content as a feature
    * attached to the document. This method is useful if one wants to keep the
    * content of the document being unpacked. After the markup has been
    * unpacked, the content of the document will be replaced with a new one
    * containing the text between markups.
    *
    * @param doc the document that will be unpacked
    * @param originalContentFeatureType the name of the feature that will hold
    * the document's original content.
    * @throws DocumentFormatException if unpackMarkup(doc) throws it.
    */
  public void unpackMarkup( Document doc,
                            String  originalContentFeatureType )
                                              throws DocumentFormatException{
    FeatureMap fm = doc.getFeatures();
    if (fm == null) fm = Factory.newFeatureMap();
    // the content must be captured BEFORE unpackMarkup(doc) runs
    fm.put(originalContentFeatureType, doc.getContent().toString());
    doc.setFeatures(fm);
    unpackMarkup(doc);
  }// unpackMarkup();

  /**
    * Returns a MimeType having as input a file suffix.
    * If the file suffix is <b>null</b> or not recognised then
    * <b>null</b> will be returned.
    * @param fileSufix The file suffix associated with a recognisable mime
    * type, e.g. "html" or "xml". The look-up ignores case.
    * @return The MimeType associated with this file suffix.
    */
  static private MimeType getMimeType(String fileSufix){
    if(fileSufix == null) return null;
    // Locale.ENGLISH keeps the lower-casing independent of the default
    // locale (e.g. "HTML" must not become "html" with a dotless i).
    return (MimeType)
      suffixes2mimeTypeMap.get(fileSufix.toLowerCase(Locale.ENGLISH));
  }//getMimeType

  /**
    * Returns a MimeType having as input a URL object. If the MimeType wasn't
    * recognized it returns <b>null</b>.
    * <p>
    * Three guesses are made (content type reported by the connection, file
    * suffix, magic strings found at the start of the content) and
    * {@link #decideBetweenThreeMimeTypes} picks the result. The connection
    * is opened once and its stream is closed before this method returns.
    *
    * @param url The URL object from which the MimeType will be extracted
    * @return A MimeType object for that URL, or <b>null</b> if the Mime Type is
    * unknown.
    */
  static private MimeType getMimeType(URL url) {
    if (url == null)
      return null;

    String contentType = null;
    InputStream is = null;
    try {
      // Ask the connection for the content and the content type.
      // We expect to get contentType something like this:
      // "text/html; charset=iso-8859-1"
      // Charset is optional
      try{
        // One connection serves both requests. The stream is asked for
        // first so that, when the content cannot be opened, no content
        // type is used either.
        URLConnection connection = url.openConnection();
        is = connection.getInputStream();
        contentType = connection.getContentType();
      } catch (IOException e){
        // Failed to get the content type from the connection.
        // Let's try some other methods like file suffix or magic numbers.
      }

      // "text/html; charset=iso-8859-1" -> "text/html" and "ISO-8859-1"
      String mimeTypeString = extractMimeTypeString(contentType);
      String charsetFromWebServer = extractCharset(contentType);

      // The MimeType registered for the reported content type, if any
      MimeType mimeTypeFromWebServer = (mimeTypeString == null) ? null :
        (MimeType) mimeString2mimeTypeMap.get(mimeTypeString);

      // The MimeType registered for the file suffix of the URL, if any
      MimeType mimeTypeFromFileSuffix = getMimeType(getFileSufix(url));

      // Let's perform a magic numbers guess..
      MimeType mimeTypeFromMagicNumbers =
        guessTypeUsingMagicNumbers(is, charsetFromWebServer);

      // All those types enter into a deciding system
      return decideBetweenThreeMimeTypes( mimeTypeFromWebServer,
                                          mimeTypeFromFileSuffix,
                                          mimeTypeFromMagicNumbers);
    } finally {
      if (is != null) {
        try {
          is.close();
        } catch (IOException e) {
          // nothing useful can be done if closing fails
        }
      }
    }
  }//getMimeType

  /** Extracts the lower-cased "type/subtype" part of a content type header
    * value such as "text/html; charset=iso-8859-1".
    * @return the part before the first ';', trimmed and lower-cased, or
    * <b>null</b> if contentType is null or has no such part.
    */
  private static String extractMimeTypeString(String contentType) {
    if (contentType == null) return null;
    StringTokenizer st = new StringTokenizer(contentType, ";");
    if (!st.hasMoreTokens()) return null;
    return st.nextToken().trim().toLowerCase(Locale.ENGLISH);
  }// extractMimeTypeString

  /** Extracts the upper-cased value of the "charset" parameter of a content
    * type header value such as "text/html; charset=iso-8859-1".
    * Surrounding double quotes are removed from the value.
    * @return the charset name (e.g. "ISO-8859-1"), or <b>null</b> if
    * contentType is null or carries no charset parameter.
    */
  private static String extractCharset(String contentType) {
    if (contentType == null) return null;
    StringTokenizer st = new StringTokenizer(contentType, ";");
    // the first token is the type/subtype part, not a parameter
    if (st.hasMoreTokens()) st.nextToken();
    while (st.hasMoreTokens()) {
      String parameter = st.nextToken().trim();
      int equalsPos = parameter.indexOf('=');
      if (equalsPos == -1) continue;
      String name = parameter.substring(0, equalsPos).trim();
      if (!"charset".equalsIgnoreCase(name)) continue;
      String value = parameter.substring(equalsPos + 1).trim();
      if (value.length() >= 2 &&
          value.charAt(0) == '"' &&
          value.charAt(value.length() - 1) == '"')
        value = value.substring(1, value.length() - 1).trim();
      if (value.length() > 0)
        return value.toUpperCase(Locale.ENGLISH);
    }// End while
    return null;
  }// extractCharset

  /**
    * This method decides what mimeType is in majority
    * @param aMimeTypeFromWebServer a MimeType
    * @param aMimeTypeFromFileSuffix a MimeType
    * @param aMimeTypeFromMagicNumbers a MimeType
    * @return the MimeType which occurs most. If no two of them agree, the
    * first non-null one in the order file suffix, web server, magic numbers
    * is chosen through the "Priority" parameter. If all are null, then
    * returns <b>null</b>
    */
  protected static MimeType decideBetweenThreeMimeTypes(
                                    MimeType aMimeTypeFromWebServer,
                                    MimeType aMimeTypeFromFileSuffix,
                                    MimeType aMimeTypeFromMagicNumbers){

    // First a voting system
    if (areEqual(aMimeTypeFromWebServer,aMimeTypeFromFileSuffix))
      return aMimeTypeFromFileSuffix;
    if (areEqual(aMimeTypeFromFileSuffix,aMimeTypeFromMagicNumbers))
      return aMimeTypeFromFileSuffix;
    if (areEqual(aMimeTypeFromWebServer,aMimeTypeFromMagicNumbers))
      return aMimeTypeFromWebServer;

    // No agreement: tag each candidate with its priority.
    // NOTE(review): addParameter is called on the instances passed in;
    // getMimeType(URL) takes those from the shared static maps, so the tag
    // presumably stays on the shared objects - confirm against the
    // MimeType implementation before relying on it.
    // 1 is the highest priority
    if (aMimeTypeFromFileSuffix != null)
      aMimeTypeFromFileSuffix.addParameter("Priority","1");
    // 2 is the second priority
    if (aMimeTypeFromWebServer != null)
      aMimeTypeFromWebServer.addParameter("Priority","2");
    // 3 is the third priority
    if (aMimeTypeFromMagicNumbers != null)
      aMimeTypeFromMagicNumbers.addParameter("Priority","3");

    return decideBetweenTwoMimeTypes(
                             decideBetweenTwoMimeTypes(aMimeTypeFromWebServer,
                                                       aMimeTypeFromFileSuffix),
                             aMimeTypeFromMagicNumbers);

  }// decideBetweenThreeMimeTypes

  /** Decide between two mimeTypes. The decision is made on the "Priority"
    * parameter set in the decideBetweenThreeMimeTypes method. A mime type
    * without that parameter counts as priority 0. If one argument is
    * <b>null</b> the other one is returned; if the priority of one of them
    * cannot be parsed as a number the other one is returned.
    * @param aMimeType a MimeType object with "Priority" parameter set
    * @param anotherMimeType a MimeType object with "Priority" parameter set
    * @return One of the two mime types: the one with the lower priority
    * number, or aMimeType when both numbers are the same.
    */
  protected static MimeType decideBetweenTwoMimeTypes( MimeType aMimeType,
                                                MimeType anotherMimeType){
    if (aMimeType == null) return anotherMimeType;
    if (anotherMimeType == null) return aMimeType;

    int priority1 = 0;
    int priority2 = 0;
    // Both of them are not null
    if (aMimeType.hasParameter("Priority"))
      try{
        priority1 = Integer.parseInt(aMimeType.getParameterValue("Priority"));
      }catch (NumberFormatException e){
        return anotherMimeType;
      }
    if (anotherMimeType.hasParameter("Priority"))
      try{
        priority2 =
          Integer.parseInt(anotherMimeType.getParameterValue("Priority"));
      }catch (NumberFormatException e){
        return aMimeType;
      }

    // The lower the number, the higher the priority
    return (priority1 <= priority2) ? aMimeType : anotherMimeType;
  }// decideBetweenTwoMimeTypes

  /**
    * Tests if two MimeType objects are equal.
    * @param aMimeType a MimeType, may be <b>null</b>
    * @param anotherMimeType a MimeType, may be <b>null</b>
    * @return true only if both MimeType objects are different from <b>null</b>
    * and their types and subtypes are equal. The method is case sensitive.
    */
  protected static boolean areEqual( MimeType aMimeType,
                                     MimeType anotherMimeType){
    if (aMimeType == null || anotherMimeType == null)
      return false;

    return aMimeType.getType().equals(anotherMimeType.getType()) &&
           aMimeType.getSubtype().equals(anotherMimeType.getSubtype());
  }// areEqual

  /**
    * This method tries to guess the mime Type using some magic numbers.
    * @param aInputStream an InputStream which has to be transformed into an
    * InputStreamReader
    * @param anEncoding the encoding. If it is null or unsupported then an
    * InputStreamReader with the default encoding will be created.
    * @return the mime type associated with magic numbers, or <b>null</b> if
    * the stream is null or nothing was recognised.
    */
  protected static MimeType guessTypeUsingMagicNumbers(InputStream aInputStream,
                                                            String anEncoding){

    if (aInputStream == null) return null;
    InputStreamReader reader = null;
    if (anEncoding != null)
      try{
        reader = new InputStreamReader(aInputStream, anEncoding);
      } catch (UnsupportedEncodingException e){
        // unknown encoding: fall back on the default one below
        reader = null;
      }
    if (reader == null)
      // Create a reader with the default encoding system
      reader = new InputStreamReader(aInputStream);

    // We have an input stream reader
    return runMagicNumbers(reader);
  }//guessTypeUsingMagicNumbers

  /** Scans the first characters (at most 2048) of a document for the magic
    * strings registered in magic2mimeTypeMap. As a side effect it sets
    * {@link #isGateXmlDocument} according to whether the scanned text
    * contains "&lt;GateDocument" or " GateDocument".
    * <p>
    * If several magic strings occur, which one wins depends on the
    * iteration order of the map.
    *
    * @param aReader the reader positioned at the start of the document.
    * @return the MimeType of a magic string found, or <b>null</b> if the
    * reader is null, nothing could be read or nothing matched.
    */
  protected static MimeType runMagicNumbers(InputStreamReader aReader){
    // No reader, nothing to detect
    if( aReader == null) return null;

    // Prepare to run the magic stuff
    int bufferSize = 2048;
    char[] cbuf = new char[bufferSize];
    int charReads = 0;

    // A single read() may deliver fewer characters than requested, so keep
    // reading until the buffer is full or the stream ends.
    try {
      while (charReads < bufferSize){
        int count = aReader.read(cbuf, charReads, bufferSize - charReads);
        if (count <= 0) break;
        charReads += count;
      }// End while
    } catch (IOException e){
      // fall back on whatever was read before the failure (maybe nothing)
    }// End try

    if (charReads == 0)
      // the document is empty or unreadable
      return null;

    // Create a string from the buffer and perform some search on it.
    String strBuffer = new String(cbuf,0,charReads);

    // Detect whether or not it is a GateXmlDocument
    isGateXmlDocument = strBuffer.indexOf("<GateDocument") != -1 ||
                        strBuffer.indexOf(" GateDocument") != -1;

    // Run the magic numbers test
    Iterator iterator = magic2mimeTypeMap.entrySet().iterator();
    while (iterator.hasNext()){
      Map.Entry entry = (Map.Entry) iterator.next();
      String magic = (String) entry.getKey();
      if (strBuffer.indexOf(magic) != -1)
        return (MimeType) entry.getValue();
    }// End while

    // If this fails then surrender
    return null;
  }// runMagicNumbers

  /**
    * Return the file suffix (the text after the last '.' of the last path
    * segment, e.g. "xml" for ".../dir.v2/doc.xml?rev=1.2") or null if the
    * url doesn't have a file suffix.
    * If the url is null then the file suffix will be null also.
    */
  private static String getFileSufix(URL url){
    // GIGO test (garbage in garbage out)
    if (url == null) return null;

    // get the file name from the URL
    String fileName = url.getFile();
    if (fileName == null) return null;

    // getFile() includes the query string; it is not part of the file name
    int queryStart = fileName.indexOf('?');
    if (queryStart != -1) fileName = fileName.substring(0, queryStart);

    // only the last path segment can carry the suffix, dots in directory
    // names must not count
    fileName = fileName.substring(fileName.lastIndexOf('/') + 1);

    int lastDot = fileName.lastIndexOf('.');
    if (lastDot == -1 || lastDot == fileName.length() - 1)
      // no dot at all, or nothing after it
      return null;
    return fileName.substring(lastDot + 1);
  }//getFileSufix

  /**
    * Find a DocumentFormat implementation that deals with a particular
    * MIME type, given that type.
    * @param  aGateDocument this document will receive as a feature
    *                      the associated Mime Type. The name of the feature is
    *                      MimeType and its value is in the format type/subtype.
    *                      If it is <b>null</b> no feature is recorded.
    * @param  mimeType the mime type that is given as input
    * @return the format registered for that type/subtype, or <b>null</b> if
    * mimeType is null or no format is registered for it.
    */
  static public DocumentFormat getDocumentFormat(gate.Document aGateDocument,
                                                            MimeType mimeType){
    if (mimeType == null) return null;

    String mimeTypeString = mimeType.getType() + "/" + mimeType.getSubtype();
    if (aGateDocument != null){
      // If the Gate Document doesn't have a feature map attached then
      // we will create and set one.
      if(aGateDocument.getFeatures() == null)
        aGateDocument.setFeatures(Factory.newFeatureMap());
      aGateDocument.getFeatures().put("MimeType", mimeTypeString);
    }// end if

    return (DocumentFormat) mimeString2ClassHandlerMap.get(mimeTypeString);
  } // getDocumentFormat(aGateDocument, MimeType)

  /**
    * Find a DocumentFormat implementation that deals with a particular
    * MIME type, given the file suffix (e.g. "txt") that the document came
    * from.
    * @param  aGateDocument this document will receive as a feature
    *                     the associated Mime Type. The name of the feature is
    *                     MimeType and its value is in the format type/subtype
    * @param  fileSuffix the file suffix that is given as input
    * @return the matching format, or <b>null</b> if the suffix is not
    * recognised.
    */
  static public DocumentFormat getDocumentFormat(gate.Document aGateDocument,
                                                            String fileSuffix) {
    return getDocumentFormat(aGateDocument, getMimeType(fileSuffix));
  } // getDocumentFormat(String)

  /**
    * Find a DocumentFormat implementation that deals with a particular
    * MIME type, given the URL of the Document. If it is an HTTP URL, we
    * can ask the web server. If it has a recognised file extension, we
    * can use that. Otherwise we need to use a map of magic numbers
    * to MIME types to guess the type, and then look up the format using the
    * type.
    * @param  aGateDocument this document will receive as a feature
    *                      the associated Mime Type. The name of the feature is
    *                      MimeType and its value is in the format type/subtype
    * @param  url  the URL that is given as input
    * @return the matching format, or <b>null</b> if the type of the URL
    * could not be worked out.
    */
  static public DocumentFormat getDocumentFormat(gate.Document aGateDocument,
                                                                    URL url) {
    return getDocumentFormat(aGateDocument, getMimeType(url));
  } // getDocumentFormat(URL)

  /** Get the feature set */
  public FeatureMap getFeatures() { return features; }

  /** Get the markup elements map */
  public Map getMarkupElementsMap() { return markupElementsMap; }

  /** Get the element 2 string map */
  public Map getElement2StringMap() { return element2StringMap; }

  /** Set the markup elements map */
  public void setMarkupElementsMap(Map markupElementsMap) {
   this.markupElementsMap = markupElementsMap;
  }

  /** Set the element 2 string map */
  public void setElement2StringMap(Map anElement2StringMap) {
   element2StringMap = anElement2StringMap;
  }

  /** Set the features map*/
  public void setFeatures(FeatureMap features){this.features = features;}

  /** Set the mime type*/
  public void setMimeType(MimeType aMimeType){mimeType = aMimeType;}

  /** Gets the mime Type*/
  public MimeType getMimeType(){return mimeType;}

  //StatusReporter Implementation

  /** Unregisters a status listener. The listener vector is replaced by a
    * modified copy, never changed in place.
    * @param l the listener to remove; unknown listeners are ignored.
    */
  public synchronized void removeStatusListener(StatusListener l) {
    if (statusListeners != null && statusListeners.contains(l)) {
      Vector v = (Vector) statusListeners.clone();
      v.removeElement(l);
      statusListeners = v;
    }
  }

  /** Registers a status listener. The listener vector is replaced by a
    * modified copy, never changed in place.
    * @param l the listener to add; it is not added twice.
    */
  public synchronized void addStatusListener(StatusListener l) {
    Vector v = statusListeners == null ? new Vector(2)
                                       : (Vector) statusListeners.clone();
    if (!v.contains(l)) {
      v.addElement(l);
      statusListeners = v;
    }
  }

  /** Sends a status message to every registered status listener.
    * @param e the status message.
    */
  protected void fireStatusChanged(String e) {
    // take one snapshot of the field; add/remove swap in a new vector
    Vector listeners = statusListeners;
    if (listeners != null) {
      int count = listeners.size();
      for (int i = 0; i < count; i++) {
        ((StatusListener) listeners.elementAt(i)).statusChanged(e);
      }
    }
  }

} // class DocumentFormat
|
DocumentFormat |
|