1   /*
2    *  DocumentFormat.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Hamish Cunningham, 25/May/2000
12   *
13   *  $Id: DocumentFormat.java,v 1.44 2001/11/30 14:38:43 cursu Exp $
14   */
15  
16  package gate;
17  
18  import java.util.*;
19  import java.net.*;
20  import java.io.*;
21  
22  import gate.util.*;
23  import gate.event.*;
24  import gate.creole.*;
25  
26  import org.w3c.www.mime.*;
27  
28  
29  /** The format of Documents. Subclasses of DocumentFormat know about
30    * particular MIME types and how to unpack the information in any
31    * markup or formatting they contain into GATE annotations. Each MIME
32    * type has its own subclass of DocumentFormat, e.g. XmlDocumentFormat,
33    * RtfDocumentFormat, MpegDocumentFormat. These classes register themselves
34    * with a static index residing here when they are constructed. Static
35    * getDocumentFormat methods can then be used to get the appropriate
36    * format class for a particular document.
37    */
38  public abstract class DocumentFormat
39  extends AbstractLanguageResource implements LanguageResource{
40    /** Debug flag */
41    private static final boolean DEBUG = false;
42  
43    /** This fields indicates whether the document being processed is in a
44      * Gate XML custom format.
45      * Detection is done in runMagicNumbers().
46      */
47    protected static boolean isGateXmlDocument = false;
48  
49    /** The MIME type of this format. */
50    private MimeType mimeType = null;
51  
52    /** Map of MimeTypeString to ClassHandler class. This is used to find the
53      * language resource that deals with the specific Document format
54      */
55    protected static Map mimeString2ClassHandlerMap = new HashMap();
56    /** Map of MimeType to DocumentFormat Class. This is used to find the
57      * DocumentFormat subclass that deals with a particular MIME type.
58      */
59    protected static Map mimeString2mimeTypeMap = new HashMap();
60  
61    /** Map of Set of file suffixes to MimeType. This is used to figure
62      * out what MIME type a document is from its file name.
63      */
64    protected static Map suffixes2mimeTypeMap = new HashMap();
65  
66    /** Map of Set of magic numbers to MimeType. This is used to guess the
67      * MIME type of a document, when we don't have any other clues.
68      */
69    protected static Map magic2mimeTypeMap = new HashMap();
70  
71    /** Map of markup elements to annotation types. If it is null, the
72      * unpackMarkup() method will convert all markup, using the element names
73      * for annotation types. If it is non-null, only those elements specified
74      * here will be converted.
75      */
76    protected Map markupElementsMap = null;
77  
78    /** This map is used inside uppackMarkup() method...
79      * When an element from the map is encounted, The corresponding string
80      * element is added to the document content
81      */
82    protected Map element2StringMap = null;
83  
84    /** The features of this resource */
85    private FeatureMap features = null;
86  
87    /** Default construction */
88    public DocumentFormat() {}
89  
90    /** listeners for status report */
91    private transient Vector statusListeners;
92  
93    /** Unpack the markup in the document. This converts markup from the
94      * native format (e.g. XML, RTF) into annotations in GATE format.
95      * Uses the markupElementsMap to determine which elements to convert, and
96      * what annotation type names to use.
97      */
98    abstract public void unpackMarkup(Document doc)
99                                        throws DocumentFormatException;
100 
101   /** Unpack the markup in the document. This method calls unpackMarkup on the
102     * GATE document, but after it saves its content as a feature atached to
103     * the document. This method is usefull if one wants to save the content
104     * of the document being unpacked. After the markups have been unpacked,
105     * the content of the document will be replaced with a new one containing
106     * the text between markups.
107     *
108     * @param doc the document that will be upacked
109     * @param originalContentFeatureType the name of the feature that will hold
110     * the document's content.
111     */
112   public void unpackMarkup( Document doc,
113                             String  originalContentFeatureType )
114                                               throws DocumentFormatException{
115      FeatureMap fm = doc.getFeatures();
116      if (fm == null) fm = Factory.newFeatureMap();
117      fm.put(originalContentFeatureType, doc.getContent().toString());
118      doc.setFeatures(fm);
119      unpackMarkup(doc);
120   }// unpackMarkup();
121 
122   /**
123     * Returns a MimeType having as input a fileSufix.
124     * If the file sufix is <b>null</b> or not recognised then,
125     * <b>null</b> will be returned.
126     * @param fileSufix The file sufix associated with a recognisabe mime type.
127     * @return The MimeType associated with this file suffix.
128     */
129   static private MimeType  getMimeType(String fileSufix){
130     // Get a mimeType string associated with this fileSuffix
131     // Eg: for html returns  MimeType("text/html"), for xml returns
132     // MimeType("text/xml")
133     if(fileSufix == null) return null;
134     return  (MimeType) suffixes2mimeTypeMap.get(fileSufix.toLowerCase());
135   }//getMimeType
136 
137   /**
138     * Returns a MymeType having as input a URL object. If the MimeType wasn't
139     * recognized it returns <b>null</b>.
140     * @param url The URL object from which the MimeType will be extracted
141     * @return A MimeType object for that URL, or <b>null</b> if the Mime Type is
142     * unknown.
143     */
144   static private MimeType  getMimeType(URL url) {
145     String mimeTypeString = null;
146     String charsetFromWebServer = null;
147     String contentType = null;
148     InputStream is = null;
149     MimeType mimeTypeFromWebServer = null;
150     MimeType mimeTypeFromFileSuffix = null;
151     MimeType mimeTypeFromMagicNumbers = null;
152     String fileSufix = null;
153 
154     if (url == null)
155       return null;
156     // Ask the web server for the content type
157     // We expect to get contentType something like this:
158     // "text/html; charset=iso-8859-1"
159     // Charset is optional
160     try{
161       is = url.openConnection().getInputStream();
162       contentType = url.openConnection().getContentType();
163     } catch (IOException e){
164       // Failed to get the content type with te Web server.
165       // Let's try some other methods like FileSuffix or magic numbers.
166     }
167     // If a content Type was returned by the server, try to get the mime Type
168     // string
169     // If contentType is something like this:"text/html; charset=iso-8859-1"
170     // try to get content Type string (text/html)
171     if (contentType != null){
172       StringTokenizer st = new StringTokenizer(contentType, ";");
173       // We assume that the first token is the mime type string...
174       // If this doesn't happen then BAD LUCK :(( ...
175       if (st.hasMoreTokens())
176         mimeTypeString     = st.nextToken().toLowerCase();
177       // The next token it should be the CharSet
178       if (st.hasMoreTokens())
179         charsetFromWebServer = st.nextToken().toLowerCase();
180       if (charsetFromWebServer != null){
181         //We have something like : "charset=iso-8859-1" and let's extract the
182         // encoding.
183         st = new StringTokenizer(charsetFromWebServer, "=");
184         // Don't need this anymore
185         charsetFromWebServer = null;
186         // Discarding the first token which is : "charset"
187         if (st.hasMoreTokens())
188           st.nextToken().toUpperCase();
189         // Get the encoding : "ISO-8859-1"
190         if (st.hasMoreTokens())
191           charsetFromWebServer = st.nextToken().toUpperCase();
192       } // End if
193     }// end if
194     // Return the corresponding MimeType with WebServer from the associated MAP
195     mimeTypeFromWebServer = (MimeType)
196                                 mimeString2mimeTypeMap.get(mimeTypeString);
197     // Let's try a file suffix detection
198     // Get the file sufix from the URL.See method definition for more details
199     fileSufix = getFileSufix(url);
200     // Get the mime type based on the on file sufix
201     mimeTypeFromFileSuffix = getMimeType(fileSufix);
202 
203     // Let's perform a magic numbers guess..
204     mimeTypeFromMagicNumbers = guessTypeUsingMagicNumbers(is,
205                                                     charsetFromWebServer);
206     //All those types enter into a deciding system
207     return decideBetweenThreeMimeTypes( mimeTypeFromWebServer,
208                                         mimeTypeFromFileSuffix,
209                                         mimeTypeFromMagicNumbers);
210   }//getMimeType
211 
212   /**
213     * This method decides what mimeType is in majority
214     * @param aMimeTypeFromWebServer a MimeType
215     * @param aMimeTypeFromFileSuffix a MimeType
216     * @param aMimeTypeFromMagicNumbers a MimeType
217     * @return the MimeType which occurs most. If all are null, then returns
218     * <b>null</b>
219     */
220   protected static MimeType decideBetweenThreeMimeTypes(
221                                     MimeType aMimeTypeFromWebServer,
222                                     MimeType aMimeTypeFromFileSuffix,
223                                     MimeType aMimeTypeFromMagicNumbers){
224 
225     // First a voting system
226     if (areEqual(aMimeTypeFromWebServer,aMimeTypeFromFileSuffix))
227       return aMimeTypeFromFileSuffix;
228     if (areEqual(aMimeTypeFromFileSuffix,aMimeTypeFromMagicNumbers))
229       return aMimeTypeFromFileSuffix;
230     if (areEqual(aMimeTypeFromWebServer,aMimeTypeFromMagicNumbers))
231       return aMimeTypeFromWebServer;
232 
233     // 1 is the highest priority
234     if (aMimeTypeFromFileSuffix != null)
235       aMimeTypeFromFileSuffix.addParameter("Priority","1");
236     // 2 is the second priority
237     if (aMimeTypeFromWebServer != null)
238       aMimeTypeFromWebServer.addParameter("Priority","2");
239     // 3 is the third priority
240     if (aMimeTypeFromMagicNumbers != null)
241       aMimeTypeFromMagicNumbers.addParameter("Priority","3");
242 
243     return decideBetweenTwoMimeTypes(
244                              decideBetweenTwoMimeTypes(aMimeTypeFromWebServer,
245                                                        aMimeTypeFromFileSuffix),
246                              aMimeTypeFromMagicNumbers);
247 
248   }// decideBetweenThreeMimeTypes
249 
250   /** Decide between two mimeTypes. The decistion is made on "Priority"
251     * parameter set into decideBetweenThreeMimeTypes method. If both mimeTypes
252     * doesn't have "Priority" paramether set, it will return one on them.
253     * @param aMimeType a MimeType object with "Prority" parameter set
254     * @param anotherMimeType a MimeType object with "Prority" parameter set
255     * @return One of the two mime types.
256     */
257   protected static MimeType decideBetweenTwoMimeTypes( MimeType aMimeType,
258                                                 MimeType anotherMimeType){
259     if (aMimeType == null) return anotherMimeType;
260     if (anotherMimeType == null) return aMimeType;
261 
262     int priority1 = 0;
263     int priority2 = 0;
264     // Both of them are not null
265     if (aMimeType.hasParameter("Priority"))
266       try{
267         priority1 =
268               new Integer(aMimeType.getParameterValue("Priority")).intValue();
269       }catch (NumberFormatException e){
270         return anotherMimeType;
271       }
272     if (anotherMimeType.hasParameter("Priority"))
273       try{
274         priority2 =
275           new Integer(anotherMimeType.getParameterValue("Priority")).intValue();
276       }catch (NumberFormatException e){
277         return aMimeType;
278       }
279 
280     // The lower the number, the highest the priority
281     if (priority1 <= priority2)
282       return aMimeType;
283     else
284       return anotherMimeType;
285   }// decideBetweenTwoMimeTypes
286 
287   /**
288     * Tests if two MimeType objects are equal.
289     * @return true only if boths MimeType objects are different than <b>null</b>
290     * and their Types and Subtypes are equals. The method is case sensitive.
291     */
292   protected static boolean areEqual( MimeType aMimeType,
293                                      MimeType anotherMimeType){
294     if (aMimeType == null || anotherMimeType == null)
295       return false;
296 
297     if ( aMimeType.getType().equals(anotherMimeType.getType()) &&
298          aMimeType.getSubtype().equals(anotherMimeType.getSubtype())
299        ) return true;
300     else
301       return false;
302   }// are Equal
303 
304   /**
305     * This method tries to guess the mime Type using some magic numbers.
306     * @param aInputStream a InputStream which has to be transformed into a
307     *        InputStreamReader
308     * @param anEncoding the encoding. If is null or unknown then a
309     * InputStreamReader with default encodings will be created.
310     * @return the mime type associated with magic numbers
311     */
312   protected static MimeType guessTypeUsingMagicNumbers(InputStream aInputStream,
313                                                             String anEncoding){
314 
315     if (aInputStream == null) return null;
316     InputStreamReader reader = null;
317     if (anEncoding != null)
318       try{
319         reader = new InputStreamReader(aInputStream, anEncoding);
320       } catch (UnsupportedEncodingException e){
321         reader = null;
322       }
323     if (reader == null)
324       // Create a reader with the default encoding system
325       reader = new InputStreamReader(aInputStream);
326 
327     // We have a input stream reader
328     return runMagicNumbers(reader);
329   }//guessTypeUsingMagicNumbers
330 
331   /** Performs magic over Gate Document */
332   protected static MimeType runMagicNumbers(InputStreamReader aReader){
333     // No reader, nothing to detect
334     if( aReader == null) return null;
335 
336     // Prepare to run the magic stuff
337     String strBuffer = null;
338     int bufferSize = 2048;
339     int charReads = 0;
340     char[] cbuf = new char[bufferSize];
341 
342     try {
343       charReads = aReader.read(cbuf,0,bufferSize);
344     } catch (IOException e){
345       return null;
346     }// End try
347 
348     if (charReads == -1)
349       // the document is empty
350       return null;
351 
352     // Create a string form the buffer and perform some search on it.
353     strBuffer = new String(cbuf,0,charReads);
354 
355     // Detect whether or not is a GateXmlDocument
356     if (  strBuffer.indexOf("<GateDocument") != -1  ||
357           strBuffer.indexOf(" GateDocument") != -1)
358       isGateXmlDocument = true;
359     else
360       isGateXmlDocument = false;
361 
362     // Run the magic numbers test
363     Set magicSet = magic2mimeTypeMap.keySet();
364     Iterator iterator=magicSet.iterator();
365     while (iterator.hasNext()){
366       String magic = (String) iterator.next();
367       if (strBuffer.indexOf(magic) != -1)
368         return (MimeType) magic2mimeTypeMap.get(magic);
369     }// End while
370 
371     // If this fails then surrender
372     return null;
373   }// runMagicNumbers
374 
375   /**
376     * Return the fileSuffix or null if the url doesn't have a file suffix
377     * If the url is null then the file suffix will be null also
378     */
379   private static String getFileSufix(URL url){
380     String fileName = null;
381     String fileSuffix = null;
382 
383     // GIGO test  (garbage in garbage out)
384     if (url != null){
385       // get the file name from the URL
386       fileName = url.getFile();
387 
388       // tokenize this file name with "." as separator...
389       // the last token will be the file suffix
390       StringTokenizer st = new StringTokenizer(fileName,".");
391 
392       // fileSuffix is the last token
393       while (st.hasMoreTokens())
394         fileSuffix = st.nextToken();
395       // here fileSuffix is the last token
396     } // End if
397     return fileSuffix;
398   }//getFileSufix
399 
400   /**
401     * Find a DocumentFormat implementation that deals with a particular
402     * MIME type, given that type.
403     * @param  aGateDocument this document will receive as a feature
404     *                      the associated Mime Type. The name of the feature is
405     *                      MimeType and its value is in the format type/subtype
406     * @param  mimeType the mime type that is given as input
407     */
408   static public DocumentFormat getDocumentFormat(gate.Document aGateDocument,
409                                                             MimeType mimeType){
410     FeatureMap      aFeatureMap    = null;
411     if (mimeType != null){
412       // If the Gate Document doesn't have a feature map atached then
413       // We will create and set one.
414       if(aGateDocument.getFeatures() == null){
415             aFeatureMap = Factory.newFeatureMap();
416             aGateDocument.setFeatures(aFeatureMap);
417       }// end if
418       aGateDocument.getFeatures().put("MimeType",mimeType.getType() + "/" +
419                                           mimeType.getSubtype());
420 
421       return (DocumentFormat) mimeString2ClassHandlerMap.get(mimeType.getType()
422                                                + "/" + mimeType.getSubtype());
423     }// end If
424     return null;
425   } // getDocumentFormat(aGateDocument, MimeType)
426 
427   /**
428     * Find a DocumentFormat implementation that deals with a particular
429     * MIME type, given the file suffix (e.g. ".txt") that the document came
430     * from.
431     * @param  aGateDocument this document will receive as a feature
432     *                     the associated Mime Type. The name of the feature is
433     *                     MimeType and its value is in the format type/subtype
434     * @param  fileSuffix the file suffix that is given as input
435     */
436   static public DocumentFormat getDocumentFormat(gate.Document aGateDocument,
437                                                             String fileSuffix) {
438     return getDocumentFormat(aGateDocument, getMimeType(fileSuffix));
439   } // getDocumentFormat(String)
440 
441   /**
442     * Find a DocumentFormat implementation that deals with a particular
443     * MIME type, given the URL of the Document. If it is an HTTP URL, we
444     * can ask the web server. If it has a recognised file extension, we
445     * can use that. Otherwise we need to use a map of magic numbers
446     * to MIME types to guess the type, and then look up the format using the
447     * type.
448     * @param  aGateDocument this document will receive as a feature
449     *                      the associated Mime Type. The name of the feature is
450     *                      MimeType and its value is in the format type/subtype
451     * @param  url  the URL that is given as input
452     */
453   static public DocumentFormat getDocumentFormat(gate.Document aGateDocument,
454                                                                       URL url) {
455     return getDocumentFormat(aGateDocument, getMimeType(url));
456   } // getDocumentFormat(URL)
457 
458   /** Get the feature set */
459   public FeatureMap getFeatures() { return features; }
460 
461    /** Get the markup elements map */
462   public Map getMarkupElementsMap() { return markupElementsMap; }
463 
464    /** Get the element 2 string map */
465   public Map getElement2StringMap() { return element2StringMap; }
466 
467   /** Set the markup elements map */
468   public void setMarkupElementsMap(Map markupElementsMap) {
469    this.markupElementsMap = markupElementsMap;
470   }
471 
472   /** Set the element 2 string map */
473   public void setElement2StringMap(Map anElement2StringMap) {
474    element2StringMap = anElement2StringMap;
475   }
476 
477   /** Set the features map*/
478   public void setFeatures(FeatureMap features){this.features = features;}
479 
480   /** Set the mime type*/
481 
482   public void setMimeType(MimeType aMimeType){mimeType = aMimeType;}
483   /** Gets the mime Type*/
484   public MimeType getMimeType(){return mimeType;}
485 
486   //StatusReporter Implementation
487 
488 
489   public synchronized void removeStatusListener(StatusListener l) {
490     if (statusListeners != null && statusListeners.contains(l)) {
491       Vector v = (Vector) statusListeners.clone();
492       v.removeElement(l);
493       statusListeners = v;
494     }
495   }
496   public synchronized void addStatusListener(StatusListener l) {
497     Vector v = statusListeners == null ? new Vector(2) : (Vector) statusListeners.clone();
498     if (!v.contains(l)) {
499       v.addElement(l);
500       statusListeners = v;
501     }
502   }
503   protected void fireStatusChanged(String e) {
504     if (statusListeners != null) {
505       Vector listeners = statusListeners;
506       int count = listeners.size();
507       for (int i = 0; i < count; i++) {
508         ((StatusListener) listeners.elementAt(i)).statusChanged(e);
509       }
510     }
511   }
512 
513 } // class DocumentFormat
514