|
CorpusSaver |
|
1 /* 2 * CorpusSaver.java 3 * 4 * Copyright (c) 1998-2001, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Kalina Bontcheva, 22/Nov/2001 12 * 13 * $Id: CorpusSaver.java,v 1.4 2002/04/30 10:12:07 valyt Exp $ 14 */ 15 16 package gate.util; 17 18 import java.util.*; 19 import java.io.*; 20 21 import gate.*; 22 import gate.creole.*; 23 import gate.util.*; 24 import gate.persist.*; 25 import java.net.*; 26 import java.text.NumberFormat; 27 28 import gate.creole.tokeniser.*; 29 import gate.creole.gazetteer.*; 30 import gate.creole.splitter.*; 31 import gate.creole.orthomatcher.*; 32 import gate.creole.annotransfer.*; 33 import gate.creole.annotdelete.*; 34 35 public class CorpusSaver { 36 37 private static final boolean DEBUG = true; 38 39 public CorpusSaver() { 40 } 41 42 public void init() { 43 File path = new File(dsPath); 44 try { 45 ds = Factory.openDataStore("gate.persist.SerialDataStore", 46 path.toURL().toString()); 47 } catch (Exception ex) { 48 throw new gate.util.GateRuntimeException(ex.getMessage()); 49 } 50 51 try { 52 Corpus corpus = Factory.newCorpus("bnc"); 53 LanguageResource lr = ds.adopt(corpus, null); 54 ds.sync(lr); 55 theCorpus = (Corpus) lr; 56 } catch (Exception ex) { 57 throw new GateRuntimeException(ex.getMessage()); 58 } 59 60 if (processMode) 61 initPRs(); 62 63 } 64 65 public void initPRs() { 66 try { 67 FeatureMap params = Factory.newFeatureMap(); 68 69 //create a default tokeniser 70 Out.prln("Loading tokeniser <P>"); 71 // String rulesURL = this.configs.getProperty("tokeniserRulesURL"); 72 // if (rulesURL != null && !rulesURL.equals("")) 73 // params.put("tokeniserRulesURL", rulesURL); 74 // String grammarsURL = this.configs.getProperty("tokeniserGrammarURL"); 75 // if (grammarsURL != null && !grammarsURL.equals("")) 76 // params.put("transducerGrammarURL", grammarsURL); 77 //the annots are put in temp, as they are going to be transfered to the 78 //new set 79 params.put(DefaultTokeniser.DEF_TOK_ANNOT_SET_PARAMETER_NAME, "temp"); 80 tokeniser = (DefaultTokeniser) Factory.createResource( 81 "gate.creole.tokeniser.DefaultTokeniser", params); 82 83 //create a default gazetteer 84 Out.prln("Loading gazetteer <P>"); 85 params.clear(); 86 // String listsURL = this.configs.getProperty("gazetteerListsURL"); 87 // if (listsURL != null && !listsURL.equals("")) 88 // params.put("listsURL", listsURL); 89 // String caseSensitive = this.configs.getProperty("gazetteerCaseSensitive"); 90 // if (caseSensitive != null && !caseSensitive.equals("")) 91 // params.put("caseSensitive", new Boolean(caseSensitive)); 92 params.put(DefaultGazetteer.DEF_GAZ_ANNOT_SET_PARAMETER_NAME, "temp"); 93 gazetteer = (DefaultGazetteer) Factory.createResource( 94 "gate.creole.gazetteer.DefaultGazetteer", params); 95 96 //create the Annotation set transfer 97 Out.prln("Loading annotation set transfer <P>"); 98 params.clear(); 99 params.put("inputASName", "temp"); 100 params.put("outputASName", annotSetName); 101 //transfer only the annotations under the body tag (BNC spesific) 102 setTransfer = (AnnotationSetTransfer) Factory.createResource( 103 "gate.creole.annotransfer.AnnotationSetTransfer", params); 104 105 //create a splitter 106 Out.prln("Loading sentence splitter <P>"); 107 params.clear(); 108 // listsURL = this.configs.getProperty("splitterGazetteerURL"); 109 // if (listsURL != null && !listsURL.equals("")) 110 // params.put("gazetteerListsURL", listsURL); 111 // grammarsURL = this.configs.getProperty("splitterGrammarURL"); 112 // if (grammarsURL != null && !grammarsURL.equals("")) 113 // params.put("transducerURL", grammarsURL); 114 params.put(SentenceSplitter.SPLIT_INPUT_AS_PARAMETER_NAME, annotSetName); 115 params.put(SentenceSplitter.SPLIT_OUTPUT_AS_PARAMETER_NAME, annotSetName); 116 splitter = (SentenceSplitter) Factory.createResource( 117 "gate.creole.splitter.SentenceSplitter", params); 118 119 //create a tagger 120 Out.prln("Loading POS tagger <P>"); 121 params.clear(); 122 // String lexiconURL = this.configs.getProperty("taggerLexiconURL"); 123 // if (lexiconURL != null && !lexiconURL.equals("")) 124 // params.put("lexiconURL", lexiconURL); 125 // rulesURL = this.configs.getProperty("taggerRulesURL"); 126 // if (rulesURL != null && !rulesURL.equals("")) 127 // params.put("rulesURL", rulesURL); 128 params.put(POSTagger.TAG_INPUT_AS_PARAMETER_NAME, annotSetName); 129 tagger = (POSTagger) Factory.createResource( 130 "gate.creole.POSTagger", params); 131 132 //create a grammar 133 Out.prln("Loading grammars for transducer <P>"); 134 params.clear(); 135 // String grammarURL = this.configs.getProperty("grammarURL"); 136 // if (grammarURL != null && !grammarURL.equals("")) 137 // params.put("grammarURL", grammarURL); 138 params.put(ANNIETransducer.TRANSD_INPUT_AS_PARAMETER_NAME, annotSetName); 139 params.put(ANNIETransducer.TRANSD_OUTPUT_AS_PARAMETER_NAME, annotSetName); 140 transducer = (ANNIETransducer) Factory.createResource( 141 "gate.creole.ANNIETransducer", params); 142 143 //create an orthomatcher 144 Out.prln("Loading orthomatcher <P>"); 145 params.clear(); 146 params.put(OrthoMatcher.OM_ANN_SET_PARAMETER_NAME, annotSetName); 147 orthomatcher = (OrthoMatcher) Factory.createResource( 148 "gate.creole.orthomatcher.OrthoMatcher", params); 149 150 Out.prln("Loading document reset PR <P>"); 151 params.clear(); 152 annotDeletePR = (AnnotationDeletePR) Factory.createResource( 153 "gate.creole.annotdelete.AnnotationDeletePR", params); 154 } catch (ResourceInstantiationException ex) { 155 throw new GateRuntimeException("Corpus Benchmark Tool:"+ex.getMessage()); 156 } 157 }//initPRs 158 159 public void execute() { 160 execute(startDir); 161 try { 162 ds.sync(theCorpus); 163 Factory.deleteResource(theCorpus); 164 if(ds !=null) 165 ds.close(); 166 } catch (Exception ex) { 167 throw new GateRuntimeException(ex.getMessage()); 168 } 169 } 170 171 public void execute(File dir) { 172 if (dir == null || ds == null) 173 return; 174 //first set the current directory to be the given one 175 currDir = dir; 176 Out.prln("Processing directory: " + currDir); 177 178 ArrayList files = new ArrayList(); 179 ArrayList dirs = new ArrayList(); 180 File[] dirArray = currDir.listFiles(); 181 for (int i = 0; i < dirArray.length; i++) { 182 if (dirArray[i].isDirectory()) 183 dirs.add(dirArray[i]); 184 else if (dirArray[i].isFile()) 185 files.add(dirArray[i]); 186 } 187 188 saveFiles(files); 189 190 //if no more subdirs left, return 191 if (dirs.isEmpty()) 192 return; 193 194 //there are more subdirectories to traverse, so iterate through 195 for (int j = 0; j < dirs.size(); j++) 196 execute((File) dirs.get(j)); 197 198 }//execute(dir) 199 200 201 public static void main(String[] args) throws GateException { 202 Gate.init(); 203 204 CorpusSaver corpusSaver1 = new CorpusSaver(); 205 206 if(args.length < 2) 207 throw new GateException("usage: [-process] source_directory datastore_path"); 208 int i = 0; 209 while (i < args.length && args[i].startsWith("-")) { 210 if(args[i].equals("-process")) { 211 Out.prln("ANNIE processing the corpus enabled. <P>"); 212 corpusSaver1.setProcessMode(true); 213 } 214 i++; //just ignore the option, which we do not recognise 215 }//while 216 217 String dirName = args[i]; 218 File dir = new File(dirName); 219 if (!dir.isDirectory()) 220 throw new GateRuntimeException("Corpus directory should be " 221 + "provided as a parameter"); 222 223 if(i+1 >= args.length) 224 throw new GateRuntimeException("Datastore path not provided"); 225 226 String storagePath = args[i+1]; 227 File storage = new File(storagePath); 228 if (!storage.isDirectory()) 229 throw new GateRuntimeException("Please provide path to an existing " 230 + "GATE serial datastore"); 231 corpusSaver1.setDSPath(storagePath); 232 233 corpusSaver1.init(); 234 corpusSaver1.setStartDir(dir); 235 double timeBefore = System.currentTimeMillis(); 236 corpusSaver1.execute(); 237 double timeAfter = System.currentTimeMillis(); 238 Out.prln("BNC saved in " + 239 NumberFormat.getInstance().format((timeAfter-timeBefore)/1000) 240 + " seconds"); 241 242 } 243 244 public void setStartDir(File newDir) { 245 startDir = newDir; 246 } 247 248 public void setProcessMode(boolean mode) { 249 processMode = mode; 250 } 251 252 public void setDSPath(String path){ 253 dsPath = path; 254 } 255 256 protected void saveFiles(List files) { 257 if (files==null || files.isEmpty() || theCorpus == null || ds == null) 258 return; 259 260 for(int i=0; i<files.size(); i++) { 261 try { 262 Document doc = Factory.newDocument(((File)files.get(i)).toURL()); 263 doc.setName(Files.getLastPathComponent(((File)files.get(i)).toURL().toString())); 264 Out.prln("Storing document: " + doc.getName()); 265 //first process it with ANNIE if in process mode 266 if (processMode) 267 processDocument(doc); 268 //then store it in the DS and add to corpus 269 LanguageResource lr = ds.adopt(doc, null); 270 theCorpus.add(lr); 271 theCorpus.unloadDocument((Document)lr); 272 Factory.deleteResource(doc); 273 if (lr != doc) 274 Factory.deleteResource(lr); 275 } catch (Exception ex) { 276 throw new GateRuntimeException(ex.getClass() + " " + ex.getMessage()); 277 } 278 }//for 279 }//saveFiles 280 281 protected void processDocument(Document doc) { 282 try { 283 tokeniser.setDocument(doc); 284 tokeniser.execute(); 285 286 gazetteer.setDocument(doc); 287 gazetteer.execute(); 288 289 setTransfer.setDocument(doc); 290 String tagName = "text"; 291 AnnotationSet body = doc.getAnnotations( 292 GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME).get(tagName); 293 if (body == null || body.isEmpty()) 294 tagName = "stext"; 295 body = doc.getAnnotations( 296 GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME).get(tagName); 297 if (body == null || body.isEmpty()) 298 tagName = "body"; 299 setTransfer.setTextTagName(tagName); 300 setTransfer.execute(); 301 302 splitter.setDocument(doc); 303 splitter.execute(); 304 305 tagger.setDocument(doc); 306 tagger.execute(); 307 308 transducer.setDocument(doc); 309 transducer.execute(); 310 311 orthomatcher.setDocument(doc); 312 orthomatcher.execute(); 313 314 annotDeletePR.setDocument(doc); 315 List annotTypes = new ArrayList(); 316 annotTypes.add(ANNIEConstants.TOKEN_ANNOTATION_TYPE); 317 annotTypes.add(ANNIEConstants.SPACE_TOKEN_ANNOTATION_TYPE); 318 annotTypes.add("Unknown"); 319 annotTypes.add("TempIdentifier"); 320 annotTypes.add("Temp"); 321 annotTypes.add(ANNIEConstants.LOOKUP_ANNOTATION_TYPE); 322 annotTypes.add("Split"); 323 annotDeletePR.setAnnotationTypes(annotTypes); 324 annotDeletePR.execute(); 325 } catch (gate.creole.ExecutionException ex) { 326 throw new GateRuntimeException("Corpus generation error: " + 327 ex.getMessage()); 328 } 329 } 330 331 332 /** 333 * The directory from which we should generate/evaluate the corpus 334 */ 335 private File startDir; 336 private File currDir; 337 338 private DataStore ds; 339 private Corpus theCorpus; 340 private String annotSetName = "NE"; 341 private String dsPath = "d:\\bnc"; 342 343 private DefaultTokeniser tokeniser; 344 private DefaultGazetteer gazetteer; 345 private SentenceSplitter splitter; 346 private POSTagger tagger; 347 private ANNIETransducer transducer; 348 private OrthoMatcher orthomatcher; 349 private AnnotationSetTransfer setTransfer; 350 private AnnotationDeletePR annotDeletePR; 351 352 private boolean processMode = false; 353 }
|
CorpusSaver |
|