1   /*
2    *  CorpusSaver.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Kalina Bontcheva, 22/Nov/2001
12   *
13   *  $Id: CorpusSaver.java,v 1.2 2001/11/27 14:27:31 kalina Exp $
14   */
15  
16  package gate.util;
17  
18  import java.util.*;
19  import java.io.*;
20  
21  import gate.*;
22  import gate.creole.*;
23  import gate.util.*;
24  import gate.persist.*;
25  import java.net.*;
26  import java.text.NumberFormat;
27  
28  import gate.creole.tokeniser.*;
29  import gate.creole.gazetteer.*;
30  import gate.creole.splitter.*;
31  import gate.creole.orthomatcher.*;
32  import gate.creole.annotransfer.*;
33  import gate.creole.annotdelete.*;
34  
35  public class CorpusSaver {
36  
37    private static final boolean DEBUG = true;
38  
39    public CorpusSaver() {
40    }
41  
42    public void init() {
43      File path = new File(dsPath);
44      try {
45       ds = Factory.openDataStore("gate.persist.SerialDataStore",
46                                  path.toURL().toString());
47      } catch (Exception ex) {
48        throw new gate.util.GateRuntimeException(ex.getMessage());
49      }
50  
51      try {
52        Corpus corpus = Factory.newCorpus("bnc");
53        LanguageResource lr = ds.adopt(corpus, null);
54        ds.sync(lr);
55        theCorpus = (Corpus) lr;
56      } catch (Exception ex) {
57        throw new GateRuntimeException(ex.getMessage());
58      }
59  
60      if (processMode)
61        initPRs();
62  
63    }
64  
65    public void initPRs() {
66      try {
67        FeatureMap params = Factory.newFeatureMap();
68  
69        //create a default tokeniser
70        Out.prln("Loading tokeniser <P>");
71  //      String rulesURL = this.configs.getProperty("tokeniserRulesURL");
72  //      if (rulesURL != null && !rulesURL.equals(""))
73  //        params.put("tokeniserRulesURL", rulesURL);
74  //      String grammarsURL = this.configs.getProperty("tokeniserGrammarURL");
75  //      if (grammarsURL != null && !grammarsURL.equals(""))
76  //        params.put("transducerGrammarURL", grammarsURL);
77        //the annots are put in temp, as they are going to be transfered to the
78        //new set
79        params.put("annotationSetName", "temp");
80        tokeniser = (DefaultTokeniser) Factory.createResource(
81                        "gate.creole.tokeniser.DefaultTokeniser", params);
82  
83        //create a default gazetteer
84        Out.prln("Loading gazetteer <P>");
85        params.clear();
86  //      String listsURL = this.configs.getProperty("gazetteerListsURL");
87  //      if (listsURL != null && !listsURL.equals(""))
88  //        params.put("listsURL", listsURL);
89  //      String caseSensitive = this.configs.getProperty("gazetteerCaseSensitive");
90  //      if (caseSensitive != null && !caseSensitive.equals(""))
91  //        params.put("caseSensitive", new Boolean(caseSensitive));
92        params.put("annotationSetName", "temp");
93        gazetteer = (DefaultGazetteer) Factory.createResource(
94                        "gate.creole.gazetteer.DefaultGazetteer", params);
95  
96        //create the Annotation set transfer
97        Out.prln("Loading annotation set transfer <P>");
98        params.clear();
99        params.put("inputASName", "temp");
100       params.put("outputASName", annotSetName);
101       //transfer only the annotations under the body tag (BNC spesific)
102       setTransfer = (AnnotationSetTransfer) Factory.createResource(
103                       "gate.creole.annotransfer.AnnotationSetTransfer", params);
104 
105       //create a splitter
106       Out.prln("Loading sentence splitter <P>");
107       params.clear();
108 //      listsURL = this.configs.getProperty("splitterGazetteerURL");
109 //      if (listsURL != null && !listsURL.equals(""))
110 //        params.put("gazetteerListsURL", listsURL);
111 //      grammarsURL = this.configs.getProperty("splitterGrammarURL");
112 //      if (grammarsURL != null && !grammarsURL.equals(""))
113 //        params.put("transducerURL", grammarsURL);
114       params.put("inputASName", annotSetName);
115       params.put("outputASName", annotSetName);
116       splitter = (SentenceSplitter) Factory.createResource(
117                       "gate.creole.splitter.SentenceSplitter", params);
118 
119       //create a tagger
120       Out.prln("Loading POS tagger <P>");
121       params.clear();
122 //      String lexiconURL = this.configs.getProperty("taggerLexiconURL");
123 //      if (lexiconURL != null && !lexiconURL.equals(""))
124 //        params.put("lexiconURL", lexiconURL);
125 //      rulesURL = this.configs.getProperty("taggerRulesURL");
126 //      if (rulesURL != null && !rulesURL.equals(""))
127 //        params.put("rulesURL", rulesURL);
128       params.put("inputASName", annotSetName);
129       params.put("outputASName", annotSetName);
130       tagger = (POSTagger) Factory.createResource(
131                       "gate.creole.POSTagger", params);
132 
133       //create a grammar
134       Out.prln("Loading grammars for transducer <P>");
135       params.clear();
136 //      String grammarURL = this.configs.getProperty("grammarURL");
137 //      if (grammarURL != null && !grammarURL.equals(""))
138 //        params.put("grammarURL", grammarURL);
139       params.put("inputASName", annotSetName);
140       params.put("outputASName", annotSetName);
141       transducer = (ANNIETransducer) Factory.createResource(
142                       "gate.creole.ANNIETransducer", params);
143 
144       //create an orthomatcher
145       Out.prln("Loading orthomatcher <P>");
146       params.clear();
147       params.put("annotationSetName", annotSetName);
148       orthomatcher = (OrthoMatcher) Factory.createResource(
149                       "gate.creole.orthomatcher.OrthoMatcher", params);
150 
151       Out.prln("Loading document reset PR <P>");
152       params.clear();
153       annotDeletePR = (AnnotationDeletePR) Factory.createResource(
154                     "gate.creole.annotdelete.AnnotationDeletePR", params);
155     } catch (ResourceInstantiationException ex) {
156       throw new GateRuntimeException("Corpus Benchmark Tool:"+ex.getMessage());
157     }
158   }//initPRs
159 
160   public void execute() {
161     execute(startDir);
162     try {
163       ds.sync(theCorpus);
164       Factory.deleteResource(theCorpus);
165       if(ds !=null)
166         ds.close();
167     } catch (Exception ex) {
168       throw new GateRuntimeException(ex.getMessage());
169     }
170   }
171 
172   public void execute(File dir) {
173     if (dir == null || ds == null)
174       return;
175     //first set the current directory to be the given one
176     currDir = dir;
177     Out.prln("Processing directory: " + currDir);
178 
179     ArrayList files = new ArrayList();
180     ArrayList dirs = new ArrayList();
181     File[] dirArray = currDir.listFiles();
182     for (int i = 0; i < dirArray.length; i++) {
183       if (dirArray[i].isDirectory())
184         dirs.add(dirArray[i]);
185       else if (dirArray[i].isFile())
186         files.add(dirArray[i]);
187     }
188 
189     saveFiles(files);
190 
191     //if no more subdirs left, return
192     if (dirs.isEmpty())
193       return;
194 
195     //there are more subdirectories to traverse, so iterate through
196     for (int j = 0; j < dirs.size(); j++)
197       execute((File) dirs.get(j));
198 
199   }//execute(dir)
200 
201 
202   public static void main(String[] args) throws GateException {
203     Gate.init();
204 
205     CorpusSaver corpusSaver1 = new CorpusSaver();
206 
207     if(args.length < 2)
208       throw new GateException("usage: [-process] source_directory datastore_path");
209     int i = 0;
210     while (i < args.length && args[i].startsWith("-")) {
211       if(args[i].equals("-process")) {
212         Out.prln("ANNIE processing the corpus enabled. <P>");
213         corpusSaver1.setProcessMode(true);
214       }
215       i++; //just ignore the option, which we do not recognise
216     }//while
217 
218     String dirName = args[i];
219     File dir = new File(dirName);
220     if (!dir.isDirectory())
221       throw new GateRuntimeException("Corpus directory should be "
222                                      + "provided as a parameter");
223 
224     if(i+1 >= args.length)
225       throw new GateRuntimeException("Datastore path not provided");
226 
227     String storagePath = args[i+1];
228     File storage = new File(storagePath);
229     if (!storage.isDirectory())
230       throw new GateRuntimeException("Please provide path to an existing "
231                                      + "GATE serial datastore");
232     corpusSaver1.setDSPath(storagePath);
233 
234     corpusSaver1.init();
235     corpusSaver1.setStartDir(dir);
236     double timeBefore = System.currentTimeMillis();
237     corpusSaver1.execute();
238     double timeAfter = System.currentTimeMillis();
239     Out.prln("BNC saved in " +
240       NumberFormat.getInstance().format((timeAfter-timeBefore)/1000)
241       + " seconds");
242 
243   }
244 
245   public void setStartDir(File newDir) {
246     startDir = newDir;
247   }
248 
249   public void setProcessMode(boolean mode) {
250     processMode = mode;
251   }
252 
253   public void setDSPath(String path){
254     dsPath = path;
255   }
256 
257   protected void saveFiles(List files) {
258     if (files==null || files.isEmpty() || theCorpus == null || ds == null)
259       return;
260 
261     for(int i=0; i<files.size(); i++) {
262       try {
263         Document doc = Factory.newDocument(((File)files.get(i)).toURL());
264         doc.setName(Files.getLastPathComponent(((File)files.get(i)).toURL().toString()));
265         Out.prln("Storing document: " + doc.getName());
266         //first process it with ANNIE if in process mode
267         if (processMode)
268           processDocument(doc);
269         //then store it in the DS and add to corpus
270         LanguageResource lr = ds.adopt(doc, null);
271         theCorpus.add(lr);
272         theCorpus.unloadDocument((Document)lr);
273         Factory.deleteResource(doc);
274         if (lr != doc)
275           Factory.deleteResource(lr);
276       } catch (Exception ex) {
277         throw new GateRuntimeException(ex.getClass() + " " + ex.getMessage());
278       }
279     }//for
280   }//saveFiles
281 
282   protected void processDocument(Document doc) {
283     try {
284       tokeniser.setDocument(doc);
285       tokeniser.execute();
286 
287       gazetteer.setDocument(doc);
288       gazetteer.execute();
289 
290       setTransfer.setDocument(doc);
291       String tagName = "text";
292       AnnotationSet body = doc.getAnnotations(
293                     GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME).get(tagName);
294       if (body == null || body.isEmpty())
295         tagName = "stext";
296       body = doc.getAnnotations(
297                     GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME).get(tagName);
298       if (body == null || body.isEmpty())
299         tagName = "body";
300       setTransfer.setTextTagName(tagName);
301       setTransfer.execute();
302 
303       splitter.setDocument(doc);
304       splitter.execute();
305 
306       tagger.setDocument(doc);
307       tagger.execute();
308 
309       transducer.setDocument(doc);
310       transducer.execute();
311 
312       orthomatcher.setDocument(doc);
313       orthomatcher.execute();
314 
315       annotDeletePR.setDocument(doc);
316       List annotTypes = new ArrayList();
317       annotTypes.add("Token");
318       annotTypes.add("SpaceToken");
319       annotTypes.add("Unknown");
320       annotTypes.add("TempIdentifier");
321       annotTypes.add("Temp");
322       annotTypes.add("Lookup");
323       annotTypes.add("Split");
324       annotDeletePR.setAnnotationTypes(annotTypes);
325       annotDeletePR.execute();
326     } catch (gate.creole.ExecutionException ex) {
327       throw new GateRuntimeException("Corpus generation error: " +
328                                      ex.getMessage());
329     }
330   }
331 
332 
333   /**
334    * The directory from which we should generate/evaluate the corpus
335    */
336   private File startDir;
337   private File currDir;
338 
339   private DataStore ds;
340   private Corpus theCorpus;
341   private String annotSetName = "NE";
342   private String dsPath = "d:\\bnc";
343 
344   private DefaultTokeniser tokeniser;
345   private DefaultGazetteer gazetteer;
346   private SentenceSplitter splitter;
347   private POSTagger tagger;
348   private ANNIETransducer transducer;
349   private OrthoMatcher orthomatcher;
350   private AnnotationSetTransfer setTransfer;
351   private AnnotationDeletePR annotDeletePR;
352 
353   private boolean processMode = false;
354 }