1   /*
2    *  DumpingPR.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Kalina Bontcheva, 19/10/2001
12   *
13   *  $Id: DumpingPR.java,v 1.9 2002/04/12 14:35:21 kalina Exp $
14   */
15  
16  package gate.creole.dumpingPR;
17  
18  import java.util.*;
19  import gate.*;
20  import gate.creole.*;
21  import gate.corpora.DocumentImpl;
22  import gate.util.*;
23  import java.net.URL;
24  import java.io.*;
25  
26  /**
27   * This class implements a DumpingPR which exports a given set of annotation
28   * types + the original markup, back into the document's native format.
29   * The export might also include the GATE features of those annotations or
30   * not (the default). One can also control whether the export files have a
31   * new suffix (useSuffixForDumpFiles) and what this suffix is
32   * (suffixForDumpFiles). By default, a suffix is used and it is .gate.
33   */
34  public class DumpingPR extends AbstractLanguageAnalyser
35    implements ProcessingResource {
36  
37    public static final String
38      DPR_DOCUMENT_PARAMETER_NAME = "document";
39  
40    public static final String
41      DPR_ANN_SET_PARAMETER_NAME = "annotationSetName";
42  
43    public static final String
44      DPR_ANN_TYPES_PARAMETER_NAME = "annotationTypes";
45  
46    public static final String
47      DPR_DUMP_TYPES_PARAMETER_NAME = "dumpTypes";
48  
49    public static final String
50      DPR_OUTPUR_URL_PARAMETER_NAME = "outputFileUrl";
51  
52    public static final String
53      DPR_INCLUDE_FEAT_PARAMETER_NAME = "includeFeatures";
54  
55    public static final String
56      DPR_USE_SUFFIX_PARAMETER_NAME = "useSuffixForDumpFiles";
57  
58    public static final String
59      DPR_FILE_SUFFIX_PARAMETER_NAME = "suffixForDumpFiles";
60  
61    private static final boolean DEBUG = false;
62  
63    /**
64     * A list of annotation types, which are to be dumped into the output file
65     */
66    protected List annotationTypes;
67  
68    /**
69     * A list of strings specifying new names to be used instead of the original
70     * annotation types given in the annotationTypes parameter. For example, if
71     * annotationTypes was set to [Location, Date], then if dumpTypes is set to
72     * [Place, Date-expr], then the labels <Place> and <Date-expr> will be inserted
73     * instead of <Location> and <Date>.
74     */
75    protected List dumpTypes;
76  
77    /**the name of the annotation set
78     * from which to take the annotations for dumping
79     */
80    protected String annotationSetName;
81  
82    /**
83     * Whether or not to include the annotation features during export
84     */
85    protected boolean includeFeatures = false;
86  
87    /**
88     * What suffix to use for the dump files. .gate by default, but can be
89     * changed via the set method.
90     */
91    protected String suffixForDumpFiles = ".gate";
92  
93    /**
94     * Whether or not to use the special suffix fo the dump files. True by
95     * default.
96     */
97    protected boolean useSuffixForDumpFiles = true;
98  
99    protected java.net.URL outputFileUrl;
100 
101   private static final String DUMPING_PR_SET = "DumpingPRTempSet";
102 
103   /** Initialise this resource, and return it. */
104   public Resource init() throws ResourceInstantiationException
105   {
106     return super.init();
107   } // init()
108 
109   /**
110   * Reinitialises the processing resource. After calling this method the
111   * resource should be in the state it is after calling init.
112   * If the resource depends on external resources (such as rules files) then
113   * the resource will re-read those resources. If the data used to create
114   * the resource has changed since the resource has been created then the
115   * resource will change too after calling reInit().
116   */
117   public void reInit() throws ResourceInstantiationException
118   {
119     init();
120   } // reInit()
121 
122   /** Run the resource. */
123   public void execute() throws ExecutionException {
124 
125     if(document == null)
126       throw new GateRuntimeException("No document to process!");
127 
128     AnnotationSet allAnnots;
129     // get the annotations from document
130     if ((annotationSetName == null)|| (annotationSetName.equals("")))
131       allAnnots = document.getAnnotations();
132     else
133       allAnnots = document.getAnnotations(annotationSetName);
134 
135     //if none found, print warning and exit
136     if ((allAnnots == null) || allAnnots.isEmpty()) {
137       Out.prln("DumpingPR Warning: No annotations found for export. "
138                + "Including only those from the Original markups set.");
139       write2File(null);
140       return;
141     }
142 
143     //first transfer the annotation types from a list to a set
144     //don't I just hate this!
145     Set types2Export = new HashSet();
146     for(int i=0; i<annotationTypes.size(); i++)
147       types2Export.add(annotationTypes.get(i));
148 
149     //then get the annotations for export
150     AnnotationSet annots2Export = allAnnots.get(types2Export);
151 
152     //check whether we want the annotations to be renamed before
153     //export (that's what dumpTypes is for)
154     if (dumpTypes != null && !dumpTypes.isEmpty()) {
155       HashMap renameMap = new HashMap();
156       for(int i=0; i<dumpTypes.size() && i<annotationTypes.size(); i++) {
157         //check if we have a corresponding annotationType and if yes,
158         //then add to the hash map for renaming
159         renameMap.put(annotationTypes.get(i), dumpTypes.get(i));
160       }//for
161       //if we have to rename annotations, then do so
162       if(!renameMap.isEmpty() && annots2Export != null)
163         annots2Export = renameAnnotations(annots2Export, renameMap);
164     }//if
165 
166     write2File(annots2Export);
167     document.removeAnnotationSet(this.DUMPING_PR_SET);
168 
169   } // execute()
170 
171   protected void write2File(AnnotationSet exportSet) {
172     File outputFile;
173     String source = (String) document.getFeatures().get("gate.SourceURL");
174     try {
175       URL sourceURL = new URL(source);
176       StringBuffer tempBuff = new StringBuffer(sourceURL.getFile());
177       //now append the special suffix if we want to use it
178       if (useSuffixForDumpFiles)
179         tempBuff.append(this.suffixForDumpFiles);
180       String outputPath = tempBuff.toString();
181       if (DEBUG)
182         Out.prln(outputPath);
183       outputFile = new File(outputPath);
184     } catch (java.net.MalformedURLException ex) {
185       if (outputFileUrl != null)
186         outputFile = new File(outputFileUrl.getFile());
187       else
188         throw new GateRuntimeException("Cannot export GATE annotations because"
189                      + "document does not have a valid source URL.");
190     }
191 
192     try {
193       // Prepare to write into the xmlFile using the doc's encoding if there
194       OutputStreamWriter writer;
195       if (document instanceof DocumentImpl) {
196         String encoding = ((DocumentImpl) document).getEncoding();
197         if (encoding == null || "".equals(encoding))
198           writer = new OutputStreamWriter(new FileOutputStream(outputFile));
199         else
200           writer = new OutputStreamWriter(
201                             new FileOutputStream(outputFile), encoding);
202       } else
203           writer = new OutputStreamWriter(
204                             new FileOutputStream(outputFile));
205 
206       // Write (test the toXml() method)
207       // This Action is added only when a gate.Document is created.
208       // So, is for sure that the resource is a gate.Document
209       writer.write(document.toXml(exportSet, includeFeatures));
210       writer.flush();
211       writer.close();
212     } catch (IOException ex) {
213       throw new GateRuntimeException("Dumping PR: Error writing document "
214                                      + document.getName() + ": "
215                                      + ex.getMessage());
216     }
217 
218 
219   }//write2File
220 
221   protected AnnotationSet renameAnnotations(AnnotationSet annots2Export,
222                                    HashMap renameMap){
223     Iterator iter = annots2Export.iterator();
224     AnnotationSet as = document.getAnnotations(DUMPING_PR_SET);
225     if (!as.isEmpty())
226       as.clear();
227     while(iter.hasNext()) {
228       Annotation annot = (Annotation) iter.next();
229       //first check whether this type needs to be renamed
230       //if not, continue
231       if (!renameMap.containsKey(annot.getType()))
232         renameMap.put(annot.getType(), annot.getType());
233       try{
234         as.add(annot.getId(),
235             annot.getStartNode().getOffset(),
236             annot.getEndNode().getOffset(),
237             (String) renameMap.get(annot.getType()),
238             annot.getFeatures());
239       } catch (InvalidOffsetException ex) {
240         throw new GateRuntimeException("DumpingPR: " + ex.getMessage());
241       }
242     }//while
243     return as;
244   }//renameAnnotations
245 
246 
247   /**get the name of the annotation set*/
248   public String getAnnotationSetName() {
249     return annotationSetName;
250   }//getAnnotationSetName
251 
252   /** set the annotation set name*/
253   public void setAnnotationSetName(String newAnnotationSetName) {
254     annotationSetName = newAnnotationSetName;
255   }//setAnnotationSetName
256 
257   public List getAnnotationTypes() {
258     return this.annotationTypes;
259   }
260 
261   public void setAnnotationTypes(List newTypes) {
262     annotationTypes = newTypes;
263   }
264 
265   public List getDumpTypes() {
266     return this.dumpTypes;
267   }
268 
269   public void setDumpTypes(List newTypes) {
270     dumpTypes = newTypes;
271   }
272 
273   public URL getOutputFileUrl() {
274     return this.outputFileUrl;
275   }
276 
277   public void setOutputFileUrl(URL file) {
278     outputFileUrl = file;
279   }
280 
281   public void setIncludeFeatures(Boolean inclFeatures) {
282     if (inclFeatures != null)
283       includeFeatures = inclFeatures.booleanValue();
284   }
285 
286   public Boolean getIncludeFeatures() {
287     return new Boolean(includeFeatures);
288   }
289 
290   public String getSuffixForDumpFiles() {
291     return suffixForDumpFiles;
292   }
293 
294   public void setSuffixForDumpFiles(String newSuffix) {
295     this.suffixForDumpFiles = newSuffix;
296   }
297 
298   public Boolean getUseSuffixForDumpFiles() {
299     return new Boolean(this.useSuffixForDumpFiles);
300   }
301 
302   public void setUseSuffixForDumpFiles(Boolean useOrNot) {
303     if (useOrNot != null)
304       this.useSuffixForDumpFiles = useOrNot.booleanValue();
305   }
306 
307 } // class AnnotationSetTransfer
308