|
CorpusSaver |
|
1 /* 2 * CorpusSaver.java 3 * 4 * Copyright (c) 1998-2001, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Kalina Bontcheva, 22/Nov/2001 12 * 13 * $Id: CorpusSaver.java,v 1.2 2001/11/27 14:27:31 kalina Exp $ 14 */ 15 16 package gate.util; 17 18 import java.util.*; 19 import java.io.*; 20 21 import gate.*; 22 import gate.creole.*; 23 import gate.util.*; 24 import gate.persist.*; 25 import java.net.*; 26 import java.text.NumberFormat; 27 28 import gate.creole.tokeniser.*; 29 import gate.creole.gazetteer.*; 30 import gate.creole.splitter.*; 31 import gate.creole.orthomatcher.*; 32 import gate.creole.annotransfer.*; 33 import gate.creole.annotdelete.*; 34 35 public class CorpusSaver { 36 37 private static final boolean DEBUG = true; 38 39 public CorpusSaver() { 40 } 41 42 public void init() { 43 File path = new File(dsPath); 44 try { 45 ds = Factory.openDataStore("gate.persist.SerialDataStore", 46 path.toURL().toString()); 47 } catch (Exception ex) { 48 throw new gate.util.GateRuntimeException(ex.getMessage()); 49 } 50 51 try { 52 Corpus corpus = Factory.newCorpus("bnc"); 53 LanguageResource lr = ds.adopt(corpus, null); 54 ds.sync(lr); 55 theCorpus = (Corpus) lr; 56 } catch (Exception ex) { 57 throw new GateRuntimeException(ex.getMessage()); 58 } 59 60 if (processMode) 61 initPRs(); 62 63 } 64 65 public void initPRs() { 66 try { 67 FeatureMap params = Factory.newFeatureMap(); 68 69 //create a default tokeniser 70 Out.prln("Loading tokeniser <P>"); 71 // String rulesURL = this.configs.getProperty("tokeniserRulesURL"); 72 // if (rulesURL != null && !rulesURL.equals("")) 73 // params.put("tokeniserRulesURL", rulesURL); 74 // String grammarsURL = this.configs.getProperty("tokeniserGrammarURL"); 75 // if (grammarsURL != null && !grammarsURL.equals("")) 76 // params.put("transducerGrammarURL", grammarsURL); 77 //the annots are put in temp, as they are going to be transfered to the 78 //new set 79 params.put("annotationSetName", "temp"); 80 tokeniser = (DefaultTokeniser) Factory.createResource( 81 "gate.creole.tokeniser.DefaultTokeniser", params); 82 83 //create a default gazetteer 84 Out.prln("Loading gazetteer <P>"); 85 params.clear(); 86 // String listsURL = this.configs.getProperty("gazetteerListsURL"); 87 // if (listsURL != null && !listsURL.equals("")) 88 // params.put("listsURL", listsURL); 89 // String caseSensitive = this.configs.getProperty("gazetteerCaseSensitive"); 90 // if (caseSensitive != null && !caseSensitive.equals("")) 91 // params.put("caseSensitive", new Boolean(caseSensitive)); 92 params.put("annotationSetName", "temp"); 93 gazetteer = (DefaultGazetteer) Factory.createResource( 94 "gate.creole.gazetteer.DefaultGazetteer", params); 95 96 //create the Annotation set transfer 97 Out.prln("Loading annotation set transfer <P>"); 98 params.clear(); 99 params.put("inputASName", "temp"); 100 params.put("outputASName", annotSetName); 101 //transfer only the annotations under the body tag (BNC spesific) 102 setTransfer = (AnnotationSetTransfer) Factory.createResource( 103 "gate.creole.annotransfer.AnnotationSetTransfer", params); 104 105 //create a splitter 106 Out.prln("Loading sentence splitter <P>"); 107 params.clear(); 108 // listsURL = this.configs.getProperty("splitterGazetteerURL"); 109 // if (listsURL != null && !listsURL.equals("")) 110 // params.put("gazetteerListsURL", listsURL); 111 // grammarsURL = this.configs.getProperty("splitterGrammarURL"); 112 // if (grammarsURL != null && !grammarsURL.equals("")) 113 // params.put("transducerURL", grammarsURL); 114 params.put("inputASName", annotSetName); 115 params.put("outputASName", annotSetName); 116 splitter = (SentenceSplitter) Factory.createResource( 117 "gate.creole.splitter.SentenceSplitter", params); 118 119 //create a tagger 120 Out.prln("Loading POS tagger <P>"); 121 params.clear(); 122 // String lexiconURL = this.configs.getProperty("taggerLexiconURL"); 123 // if (lexiconURL != null && !lexiconURL.equals("")) 124 // params.put("lexiconURL", lexiconURL); 125 // rulesURL = this.configs.getProperty("taggerRulesURL"); 126 // if (rulesURL != null && !rulesURL.equals("")) 127 // params.put("rulesURL", rulesURL); 128 params.put("inputASName", annotSetName); 129 params.put("outputASName", annotSetName); 130 tagger = (POSTagger) Factory.createResource( 131 "gate.creole.POSTagger", params); 132 133 //create a grammar 134 Out.prln("Loading grammars for transducer <P>"); 135 params.clear(); 136 // String grammarURL = this.configs.getProperty("grammarURL"); 137 // if (grammarURL != null && !grammarURL.equals("")) 138 // params.put("grammarURL", grammarURL); 139 params.put("inputASName", annotSetName); 140 params.put("outputASName", annotSetName); 141 transducer = (ANNIETransducer) Factory.createResource( 142 "gate.creole.ANNIETransducer", params); 143 144 //create an orthomatcher 145 Out.prln("Loading orthomatcher <P>"); 146 params.clear(); 147 params.put("annotationSetName", annotSetName); 148 orthomatcher = (OrthoMatcher) Factory.createResource( 149 "gate.creole.orthomatcher.OrthoMatcher", params); 150 151 Out.prln("Loading document reset PR <P>"); 152 params.clear(); 153 annotDeletePR = (AnnotationDeletePR) Factory.createResource( 154 "gate.creole.annotdelete.AnnotationDeletePR", params); 155 } catch (ResourceInstantiationException ex) { 156 throw new GateRuntimeException("Corpus Benchmark Tool:"+ex.getMessage()); 157 } 158 }//initPRs 159 160 public void execute() { 161 execute(startDir); 162 try { 163 ds.sync(theCorpus); 164 Factory.deleteResource(theCorpus); 165 if(ds !=null) 166 ds.close(); 167 } catch (Exception ex) { 168 throw new GateRuntimeException(ex.getMessage()); 169 } 170 } 171 172 public void execute(File dir) { 173 if (dir == null || ds == null) 174 return; 175 //first set the current directory to be the given one 176 currDir = dir; 177 Out.prln("Processing directory: " + currDir); 178 179 ArrayList files = new ArrayList(); 180 ArrayList dirs = new ArrayList(); 181 File[] dirArray = currDir.listFiles(); 182 for (int i = 0; i < dirArray.length; i++) { 183 if (dirArray[i].isDirectory()) 184 dirs.add(dirArray[i]); 185 else if (dirArray[i].isFile()) 186 files.add(dirArray[i]); 187 } 188 189 saveFiles(files); 190 191 //if no more subdirs left, return 192 if (dirs.isEmpty()) 193 return; 194 195 //there are more subdirectories to traverse, so iterate through 196 for (int j = 0; j < dirs.size(); j++) 197 execute((File) dirs.get(j)); 198 199 }//execute(dir) 200 201 202 public static void main(String[] args) throws GateException { 203 Gate.init(); 204 205 CorpusSaver corpusSaver1 = new CorpusSaver(); 206 207 if(args.length < 2) 208 throw new GateException("usage: [-process] source_directory datastore_path"); 209 int i = 0; 210 while (i < args.length && args[i].startsWith("-")) { 211 if(args[i].equals("-process")) { 212 Out.prln("ANNIE processing the corpus enabled. <P>"); 213 corpusSaver1.setProcessMode(true); 214 } 215 i++; //just ignore the option, which we do not recognise 216 }//while 217 218 String dirName = args[i]; 219 File dir = new File(dirName); 220 if (!dir.isDirectory()) 221 throw new GateRuntimeException("Corpus directory should be " 222 + "provided as a parameter"); 223 224 if(i+1 >= args.length) 225 throw new GateRuntimeException("Datastore path not provided"); 226 227 String storagePath = args[i+1]; 228 File storage = new File(storagePath); 229 if (!storage.isDirectory()) 230 throw new GateRuntimeException("Please provide path to an existing " 231 + "GATE serial datastore"); 232 corpusSaver1.setDSPath(storagePath); 233 234 corpusSaver1.init(); 235 corpusSaver1.setStartDir(dir); 236 double timeBefore = System.currentTimeMillis(); 237 corpusSaver1.execute(); 238 double timeAfter = System.currentTimeMillis(); 239 Out.prln("BNC saved in " + 240 NumberFormat.getInstance().format((timeAfter-timeBefore)/1000) 241 + " seconds"); 242 243 } 244 245 public void setStartDir(File newDir) { 246 startDir = newDir; 247 } 248 249 public void setProcessMode(boolean mode) { 250 processMode = mode; 251 } 252 253 public void setDSPath(String path){ 254 dsPath = path; 255 } 256 257 protected void saveFiles(List files) { 258 if (files==null || files.isEmpty() || theCorpus == null || ds == null) 259 return; 260 261 for(int i=0; i<files.size(); i++) { 262 try { 263 Document doc = Factory.newDocument(((File)files.get(i)).toURL()); 264 doc.setName(Files.getLastPathComponent(((File)files.get(i)).toURL().toString())); 265 Out.prln("Storing document: " + doc.getName()); 266 //first process it with ANNIE if in process mode 267 if (processMode) 268 processDocument(doc); 269 //then store it in the DS and add to corpus 270 LanguageResource lr = ds.adopt(doc, null); 271 theCorpus.add(lr); 272 theCorpus.unloadDocument((Document)lr); 273 Factory.deleteResource(doc); 274 if (lr != doc) 275 Factory.deleteResource(lr); 276 } catch (Exception ex) { 277 throw new GateRuntimeException(ex.getClass() + " " + ex.getMessage()); 278 } 279 }//for 280 }//saveFiles 281 282 protected void processDocument(Document doc) { 283 try { 284 tokeniser.setDocument(doc); 285 tokeniser.execute(); 286 287 gazetteer.setDocument(doc); 288 gazetteer.execute(); 289 290 setTransfer.setDocument(doc); 291 String tagName = "text"; 292 AnnotationSet body = doc.getAnnotations( 293 GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME).get(tagName); 294 if (body == null || body.isEmpty()) 295 tagName = "stext"; 296 body = doc.getAnnotations( 297 GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME).get(tagName); 298 if (body == null || body.isEmpty()) 299 tagName = "body"; 300 setTransfer.setTextTagName(tagName); 301 setTransfer.execute(); 302 303 splitter.setDocument(doc); 304 splitter.execute(); 305 306 tagger.setDocument(doc); 307 tagger.execute(); 308 309 transducer.setDocument(doc); 310 transducer.execute(); 311 312 orthomatcher.setDocument(doc); 313 orthomatcher.execute(); 314 315 annotDeletePR.setDocument(doc); 316 List annotTypes = new ArrayList(); 317 annotTypes.add("Token"); 318 annotTypes.add("SpaceToken"); 319 annotTypes.add("Unknown"); 320 annotTypes.add("TempIdentifier"); 321 annotTypes.add("Temp"); 322 annotTypes.add("Lookup"); 323 annotTypes.add("Split"); 324 annotDeletePR.setAnnotationTypes(annotTypes); 325 annotDeletePR.execute(); 326 } catch (gate.creole.ExecutionException ex) { 327 throw new GateRuntimeException("Corpus generation error: " + 328 ex.getMessage()); 329 } 330 } 331 332 333 /** 334 * The directory from which we should generate/evaluate the corpus 335 */ 336 private File startDir; 337 private File currDir; 338 339 private DataStore ds; 340 private Corpus theCorpus; 341 private String annotSetName = "NE"; 342 private String dsPath = "d:\\bnc"; 343 344 private DefaultTokeniser tokeniser; 345 private DefaultGazetteer gazetteer; 346 private SentenceSplitter splitter; 347 private POSTagger tagger; 348 private ANNIETransducer transducer; 349 private OrthoMatcher orthomatcher; 350 private AnnotationSetTransfer setTransfer; 351 private AnnotationDeletePR annotDeletePR; 352 353 private boolean processMode = false; 354 }
|
CorpusSaver |
|