1   /*
2    *  CorpusSaver.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Kalina Bontcheva, 22/Nov/2001
12   *
13   *  $Id: CorpusSaver.java,v 1.4 2002/04/30 10:12:07 valyt Exp $
14   */
15  
16  package gate.util;
17  
18  import java.util.*;
19  import java.io.*;
20  
21  import gate.*;
22  import gate.creole.*;
23  import gate.util.*;
24  import gate.persist.*;
25  import java.net.*;
26  import java.text.NumberFormat;
27  
28  import gate.creole.tokeniser.*;
29  import gate.creole.gazetteer.*;
30  import gate.creole.splitter.*;
31  import gate.creole.orthomatcher.*;
32  import gate.creole.annotransfer.*;
33  import gate.creole.annotdelete.*;
34  
35  public class CorpusSaver {
36  
37    private static final boolean DEBUG = true;
38  
39    public CorpusSaver() {
40    }
41  
42    public void init() {
43      File path = new File(dsPath);
44      try {
45       ds = Factory.openDataStore("gate.persist.SerialDataStore",
46                                  path.toURL().toString());
47      } catch (Exception ex) {
48        throw new gate.util.GateRuntimeException(ex.getMessage());
49      }
50  
51      try {
52        Corpus corpus = Factory.newCorpus("bnc");
53        LanguageResource lr = ds.adopt(corpus, null);
54        ds.sync(lr);
55        theCorpus = (Corpus) lr;
56      } catch (Exception ex) {
57        throw new GateRuntimeException(ex.getMessage());
58      }
59  
60      if (processMode)
61        initPRs();
62  
63    }
64  
65    public void initPRs() {
66      try {
67        FeatureMap params = Factory.newFeatureMap();
68  
69        //create a default tokeniser
70        Out.prln("Loading tokeniser <P>");
71  //      String rulesURL = this.configs.getProperty("tokeniserRulesURL");
72  //      if (rulesURL != null && !rulesURL.equals(""))
73  //        params.put("tokeniserRulesURL", rulesURL);
74  //      String grammarsURL = this.configs.getProperty("tokeniserGrammarURL");
75  //      if (grammarsURL != null && !grammarsURL.equals(""))
76  //        params.put("transducerGrammarURL", grammarsURL);
77        //the annots are put in temp, as they are going to be transfered to the
78        //new set
79        params.put(DefaultTokeniser.DEF_TOK_ANNOT_SET_PARAMETER_NAME, "temp");
80        tokeniser = (DefaultTokeniser) Factory.createResource(
81                        "gate.creole.tokeniser.DefaultTokeniser", params);
82  
83        //create a default gazetteer
84        Out.prln("Loading gazetteer <P>");
85        params.clear();
86  //      String listsURL = this.configs.getProperty("gazetteerListsURL");
87  //      if (listsURL != null && !listsURL.equals(""))
88  //        params.put("listsURL", listsURL);
89  //      String caseSensitive = this.configs.getProperty("gazetteerCaseSensitive");
90  //      if (caseSensitive != null && !caseSensitive.equals(""))
91  //        params.put("caseSensitive", new Boolean(caseSensitive));
92        params.put(DefaultGazetteer.DEF_GAZ_ANNOT_SET_PARAMETER_NAME, "temp");
93        gazetteer = (DefaultGazetteer) Factory.createResource(
94                        "gate.creole.gazetteer.DefaultGazetteer", params);
95  
96        //create the Annotation set transfer
97        Out.prln("Loading annotation set transfer <P>");
98        params.clear();
99        params.put("inputASName", "temp");
100       params.put("outputASName", annotSetName);
101       //transfer only the annotations under the body tag (BNC spesific)
102       setTransfer = (AnnotationSetTransfer) Factory.createResource(
103                       "gate.creole.annotransfer.AnnotationSetTransfer", params);
104 
105       //create a splitter
106       Out.prln("Loading sentence splitter <P>");
107       params.clear();
108 //      listsURL = this.configs.getProperty("splitterGazetteerURL");
109 //      if (listsURL != null && !listsURL.equals(""))
110 //        params.put("gazetteerListsURL", listsURL);
111 //      grammarsURL = this.configs.getProperty("splitterGrammarURL");
112 //      if (grammarsURL != null && !grammarsURL.equals(""))
113 //        params.put("transducerURL", grammarsURL);
114       params.put(SentenceSplitter.SPLIT_INPUT_AS_PARAMETER_NAME, annotSetName);
115       params.put(SentenceSplitter.SPLIT_OUTPUT_AS_PARAMETER_NAME, annotSetName);
116       splitter = (SentenceSplitter) Factory.createResource(
117                       "gate.creole.splitter.SentenceSplitter", params);
118 
119       //create a tagger
120       Out.prln("Loading POS tagger <P>");
121       params.clear();
122 //      String lexiconURL = this.configs.getProperty("taggerLexiconURL");
123 //      if (lexiconURL != null && !lexiconURL.equals(""))
124 //        params.put("lexiconURL", lexiconURL);
125 //      rulesURL = this.configs.getProperty("taggerRulesURL");
126 //      if (rulesURL != null && !rulesURL.equals(""))
127 //        params.put("rulesURL", rulesURL);
128       params.put(POSTagger.TAG_INPUT_AS_PARAMETER_NAME, annotSetName);
129       tagger = (POSTagger) Factory.createResource(
130                       "gate.creole.POSTagger", params);
131 
132       //create a grammar
133       Out.prln("Loading grammars for transducer <P>");
134       params.clear();
135 //      String grammarURL = this.configs.getProperty("grammarURL");
136 //      if (grammarURL != null && !grammarURL.equals(""))
137 //        params.put("grammarURL", grammarURL);
138       params.put(ANNIETransducer.TRANSD_INPUT_AS_PARAMETER_NAME, annotSetName);
139       params.put(ANNIETransducer.TRANSD_OUTPUT_AS_PARAMETER_NAME, annotSetName);
140       transducer = (ANNIETransducer) Factory.createResource(
141                       "gate.creole.ANNIETransducer", params);
142 
143       //create an orthomatcher
144       Out.prln("Loading orthomatcher <P>");
145       params.clear();
146       params.put(OrthoMatcher.OM_ANN_SET_PARAMETER_NAME, annotSetName);
147       orthomatcher = (OrthoMatcher) Factory.createResource(
148                       "gate.creole.orthomatcher.OrthoMatcher", params);
149 
150       Out.prln("Loading document reset PR <P>");
151       params.clear();
152       annotDeletePR = (AnnotationDeletePR) Factory.createResource(
153                     "gate.creole.annotdelete.AnnotationDeletePR", params);
154     } catch (ResourceInstantiationException ex) {
155       throw new GateRuntimeException("Corpus Benchmark Tool:"+ex.getMessage());
156     }
157   }//initPRs
158 
159   public void execute() {
160     execute(startDir);
161     try {
162       ds.sync(theCorpus);
163       Factory.deleteResource(theCorpus);
164       if(ds !=null)
165         ds.close();
166     } catch (Exception ex) {
167       throw new GateRuntimeException(ex.getMessage());
168     }
169   }
170 
171   public void execute(File dir) {
172     if (dir == null || ds == null)
173       return;
174     //first set the current directory to be the given one
175     currDir = dir;
176     Out.prln("Processing directory: " + currDir);
177 
178     ArrayList files = new ArrayList();
179     ArrayList dirs = new ArrayList();
180     File[] dirArray = currDir.listFiles();
181     for (int i = 0; i < dirArray.length; i++) {
182       if (dirArray[i].isDirectory())
183         dirs.add(dirArray[i]);
184       else if (dirArray[i].isFile())
185         files.add(dirArray[i]);
186     }
187 
188     saveFiles(files);
189 
190     //if no more subdirs left, return
191     if (dirs.isEmpty())
192       return;
193 
194     //there are more subdirectories to traverse, so iterate through
195     for (int j = 0; j < dirs.size(); j++)
196       execute((File) dirs.get(j));
197 
198   }//execute(dir)
199 
200 
201   public static void main(String[] args) throws GateException {
202     Gate.init();
203 
204     CorpusSaver corpusSaver1 = new CorpusSaver();
205 
206     if(args.length < 2)
207       throw new GateException("usage: [-process] source_directory datastore_path");
208     int i = 0;
209     while (i < args.length && args[i].startsWith("-")) {
210       if(args[i].equals("-process")) {
211         Out.prln("ANNIE processing the corpus enabled. <P>");
212         corpusSaver1.setProcessMode(true);
213       }
214       i++; //just ignore the option, which we do not recognise
215     }//while
216 
217     String dirName = args[i];
218     File dir = new File(dirName);
219     if (!dir.isDirectory())
220       throw new GateRuntimeException("Corpus directory should be "
221                                      + "provided as a parameter");
222 
223     if(i+1 >= args.length)
224       throw new GateRuntimeException("Datastore path not provided");
225 
226     String storagePath = args[i+1];
227     File storage = new File(storagePath);
228     if (!storage.isDirectory())
229       throw new GateRuntimeException("Please provide path to an existing "
230                                      + "GATE serial datastore");
231     corpusSaver1.setDSPath(storagePath);
232 
233     corpusSaver1.init();
234     corpusSaver1.setStartDir(dir);
235     double timeBefore = System.currentTimeMillis();
236     corpusSaver1.execute();
237     double timeAfter = System.currentTimeMillis();
238     Out.prln("BNC saved in " +
239       NumberFormat.getInstance().format((timeAfter-timeBefore)/1000)
240       + " seconds");
241 
242   }
243 
244   public void setStartDir(File newDir) {
245     startDir = newDir;
246   }
247 
248   public void setProcessMode(boolean mode) {
249     processMode = mode;
250   }
251 
252   public void setDSPath(String path){
253     dsPath = path;
254   }
255 
256   protected void saveFiles(List files) {
257     if (files==null || files.isEmpty() || theCorpus == null || ds == null)
258       return;
259 
260     for(int i=0; i<files.size(); i++) {
261       try {
262         Document doc = Factory.newDocument(((File)files.get(i)).toURL());
263         doc.setName(Files.getLastPathComponent(((File)files.get(i)).toURL().toString()));
264         Out.prln("Storing document: " + doc.getName());
265         //first process it with ANNIE if in process mode
266         if (processMode)
267           processDocument(doc);
268         //then store it in the DS and add to corpus
269         LanguageResource lr = ds.adopt(doc, null);
270         theCorpus.add(lr);
271         theCorpus.unloadDocument((Document)lr);
272         Factory.deleteResource(doc);
273         if (lr != doc)
274           Factory.deleteResource(lr);
275       } catch (Exception ex) {
276         throw new GateRuntimeException(ex.getClass() + " " + ex.getMessage());
277       }
278     }//for
279   }//saveFiles
280 
281   protected void processDocument(Document doc) {
282     try {
283       tokeniser.setDocument(doc);
284       tokeniser.execute();
285 
286       gazetteer.setDocument(doc);
287       gazetteer.execute();
288 
289       setTransfer.setDocument(doc);
290       String tagName = "text";
291       AnnotationSet body = doc.getAnnotations(
292                     GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME).get(tagName);
293       if (body == null || body.isEmpty())
294         tagName = "stext";
295       body = doc.getAnnotations(
296                     GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME).get(tagName);
297       if (body == null || body.isEmpty())
298         tagName = "body";
299       setTransfer.setTextTagName(tagName);
300       setTransfer.execute();
301 
302       splitter.setDocument(doc);
303       splitter.execute();
304 
305       tagger.setDocument(doc);
306       tagger.execute();
307 
308       transducer.setDocument(doc);
309       transducer.execute();
310 
311       orthomatcher.setDocument(doc);
312       orthomatcher.execute();
313 
314       annotDeletePR.setDocument(doc);
315       List annotTypes = new ArrayList();
316       annotTypes.add(ANNIEConstants.TOKEN_ANNOTATION_TYPE);
317       annotTypes.add(ANNIEConstants.SPACE_TOKEN_ANNOTATION_TYPE);
318       annotTypes.add("Unknown");
319       annotTypes.add("TempIdentifier");
320       annotTypes.add("Temp");
321       annotTypes.add(ANNIEConstants.LOOKUP_ANNOTATION_TYPE);
322       annotTypes.add("Split");
323       annotDeletePR.setAnnotationTypes(annotTypes);
324       annotDeletePR.execute();
325     } catch (gate.creole.ExecutionException ex) {
326       throw new GateRuntimeException("Corpus generation error: " +
327                                      ex.getMessage());
328     }
329   }
330 
331 
332   /**
333    * The directory from which we should generate/evaluate the corpus
334    */
335   private File startDir;
336   private File currDir;
337 
338   private DataStore ds;
339   private Corpus theCorpus;
340   private String annotSetName = "NE";
341   private String dsPath = "d:\\bnc";
342 
343   private DefaultTokeniser tokeniser;
344   private DefaultGazetteer gazetteer;
345   private SentenceSplitter splitter;
346   private POSTagger tagger;
347   private ANNIETransducer transducer;
348   private OrthoMatcher orthomatcher;
349   private AnnotationSetTransfer setTransfer;
350   private AnnotationDeletePR annotDeletePR;
351 
352   private boolean processMode = false;
353 }