|
MachineLearningPR |
|
1 /* 2 * Copyright (c) 1998-2001, The University of Sheffield. 3 * 4 * This file is part of GATE (see http://gate.ac.uk/), and is free 5 * software, licenced under the GNU Library General Public License, 6 * Version 2, June 1991 (in the distribution as file licence.html, 7 * and also available at http://gate.ac.uk/gate/licence.html). 8 * 9 * Valentin Tablan 19/11/2002 10 * 11 * $Id: MachineLearningPR.java,v 1.4 2002/12/20 12:02:29 valyt Exp $ 12 * 13 */ 14 package gate.creole.ml; 15 16 import java.util.*; 17 18 import gate.*; 19 import gate.creole.*; 20 import gate.gui.*; 21 import gate.util.*; 22 import org.jdom.*; 23 import org.jdom.input.*; 24 25 /** 26 * This processing resource is used to train a machine learning algorithm with 27 * data extracted from a corpus. 28 */ 29 30 public class MachineLearningPR extends AbstractLanguageAnalyser 31 implements gate.gui.ActionsPublisher{ 32 33 public MachineLearningPR(){ 34 actionList = new ArrayList(); 35 actionList.add(null); 36 } 37 38 /** Initialise this resource, and return it. */ 39 public Resource init() throws ResourceInstantiationException { 40 if(configFileURL == null){ 41 throw new ResourceInstantiationException( 42 "No configuration file provided!"); 43 } 44 45 org.jdom.Document jdomDoc; 46 SAXBuilder saxBuilder = new SAXBuilder(false); 47 try{ 48 jdomDoc = saxBuilder.build(configFileURL); 49 }catch(JDOMException jde){ 50 throw new ResourceInstantiationException(jde); 51 } 52 53 //go through the jdom document to extract the data we need 54 Element rootElement = jdomDoc.getRootElement(); 55 if(!rootElement.getName().equals("ML-CONFIG")) 56 throw new ResourceInstantiationException( 57 "Root element of dataset defintion file is \"" + rootElement.getName() + 58 "\" instead of \"ML-CONFIG\"!"); 59 60 //create the dataset defintion 61 Element datasetElement = rootElement.getChild("DATASET"); 62 if(datasetElement == null) throw new ResourceInstantiationException( 63 "No dataset definition provided in the configuration file!"); 64 try{ 65 datasetDefinition = new DatasetDefintion(datasetElement); 66 }catch(GateException ge){ 67 throw new ResourceInstantiationException(ge); 68 } 69 70 //create the engine 71 Element engineElement = rootElement.getChild("ENGINE"); 72 if(engineElement == null) throw new ResourceInstantiationException( 73 "No engine option provided in the configuration file!"); 74 Element engineClassElement = engineElement.getChild("WRAPPER"); 75 if(engineClassElement == null) throw new ResourceInstantiationException( 76 "No ML engine class provided!"); 77 String engineClassName = engineClassElement.getTextTrim(); 78 try{ 79 Class engineClass = Class.forName(engineClassName); 80 engine = (MLEngine)engineClass.newInstance(); 81 }catch(ClassNotFoundException cnfe){ 82 throw new ResourceInstantiationException( 83 "ML engine class:" + engineClassName + "not found!"); 84 }catch(IllegalAccessException iae){ 85 throw new ResourceInstantiationException(iae); 86 }catch(InstantiationException ie){ 87 throw new ResourceInstantiationException(ie); 88 } 89 engine.setDatasetDefinition(datasetDefinition); 90 engine.setOptions(engineElement.getChild("OPTIONS")); 91 engine.setOwnerPR(this); 92 try{ 93 engine.init(); 94 }catch(GateException ge){ 95 throw new ResourceInstantiationException(ge); 96 } 97 98 return this; 99 } // init() 100 101 102 /** 103 * Run the resource. 104 */ 105 public void execute() throws ExecutionException{ 106 interrupted = false; 107 //check the input 108 if(document == null) { 109 throw new ExecutionException( 110 "No document provided!" 111 ); 112 } 113 114 if(inputASName == null || 115 inputASName.equals("")) annotationSet = document.getAnnotations(); 116 else annotationSet = document.getAnnotations(inputASName); 117 118 if(training.booleanValue()){ 119 fireStatusChanged( 120 "Collecting training data from " + document.getName() + "..."); 121 }else{ 122 fireStatusChanged( 123 "Applying ML model to " + document.getName() + "..."); 124 } 125 fireProgressChanged(0); 126 AnnotationSet anns = annotationSet. 127 get(datasetDefinition.getInstanceType()); 128 annotations = (anns == null || anns.isEmpty()) ? 129 new ArrayList() : new ArrayList(anns); 130 Collections.sort(annotations, new OffsetComparator()); 131 Iterator annotationIter = annotations.iterator(); 132 int index = 0; 133 int size = annotations.size(); 134 135 //create the cache structure 136 cache = new Cache(); 137 138 while(annotationIter.hasNext()){ 139 Annotation instanceAnn = (Annotation)annotationIter.next(); 140 List attributeValues = new ArrayList(datasetDefinition. 141 getAttributes().size()); 142 //find the values for all attributes 143 Iterator attrIter = datasetDefinition.getAttributes().iterator(); 144 while(attrIter.hasNext()){ 145 Attribute attr = (Attribute)attrIter.next(); 146 if(attr.isClass && !training.booleanValue()){ 147 //we're not training so the class will be undefined 148 attributeValues.add(null); 149 }else{ 150 attributeValues.add(cache.getAttributeValue(index, attr)); 151 } 152 } 153 154 if(training.booleanValue()){ 155 engine.addTrainingInstance(attributeValues); 156 }else{ 157 Object result = engine.classifyInstance(attributeValues); 158 if(result instanceof Collection){ 159 Iterator resIter = ((Collection)result).iterator(); 160 while(resIter.hasNext()) updateDocument(resIter.next(), index); 161 }else{ 162 updateDocument(result, index); 163 } 164 } 165 166 cache.shift(); 167 //every 10 instances fire an event 168 if(index % 10 == 0){ 169 fireProgressChanged(index * 100 / size); 170 if(isInterrupted()) throw new ExecutionInterruptedException(); 171 } 172 index++; 173 } 174 annotations = null; 175 } // execute() 176 177 178 protected void updateDocument(Object classificationResult, int instanceIndex){ 179 //interpret the result according to the attribute semantics 180 Attribute classAttr = datasetDefinition.getClassAttribute(); 181 String type = classAttr.getType(); 182 String feature = classAttr.getFeature(); 183 List classValues = classAttr.getValues(); 184 FeatureMap features = Factory.newFeatureMap(); 185 boolean shouldCreateAnnotation = true; 186 if(classValues != null && !classValues.isEmpty()){ 187 //nominal attribute -> AnnotationType.feature 188 //the result is the value for the feature 189 String featureValue = (String)classificationResult; 190 features.put(feature, featureValue); 191 }else{ 192 if(feature == null){ 193 //boolean attribute 194 shouldCreateAnnotation = classificationResult.equals("true"); 195 }else{ 196 //numeric attribute 197 String featureValue = classificationResult.toString(); 198 features.put(feature, featureValue); 199 } 200 } 201 202 if(shouldCreateAnnotation){ 203 //generate the new annotation 204 int coveredInstanceIndex = instanceIndex + classAttr.getPosition(); 205 if(coveredInstanceIndex >= 0 && 206 coveredInstanceIndex < annotations.size()){ 207 Annotation coveredInstance = (Annotation)annotations. 208 get(coveredInstanceIndex); 209 annotationSet.add(coveredInstance.getStartNode(), 210 coveredInstance.getEndNode(), 211 type, features); 212 } 213 } 214 } 215 216 217 /** 218 * Gets the list of actions that can be performed on this resource. 219 * @return a List of Action objects (or null values) 220 */ 221 public List getActions(){ 222 List result = new ArrayList(); 223 result.addAll(actionList); 224 if(engine instanceof ActionsPublisher){ 225 result.addAll(((ActionsPublisher)engine).getActions()); 226 } 227 return result; 228 } 229 230 protected class Cache{ 231 public Cache(){ 232 //find the sizes for the two caches 233 int forwardCacheSize = 0; 234 int backwardCacheSize = 0; 235 Iterator attrIter = datasetDefinition.getAttributes().iterator(); 236 while(attrIter.hasNext()){ 237 Attribute anAttribute = (Attribute)attrIter.next(); 238 if(anAttribute.getPosition() > 0){ 239 //forward looking 240 if(anAttribute.getPosition() > forwardCacheSize){ 241 forwardCacheSize = anAttribute.getPosition(); 242 } 243 }else if(anAttribute.getPosition() < 0){ 244 //backward looking 245 if(-anAttribute.getPosition() > backwardCacheSize){ 246 backwardCacheSize = -anAttribute.getPosition(); 247 } 248 } 249 } 250 //create the caches filled with null values 251 forwardCache = new ArrayList(forwardCacheSize); 252 for(int i =0; i < forwardCacheSize; i++) forwardCache.add(null); 253 backwardCache = new ArrayList(backwardCacheSize); 254 for(int i =0; i < backwardCacheSize; i++) backwardCache.add(null); 255 } 256 257 /** 258 * Finds the value of a specified attribute for a particular instance. 259 * @param instanceIndex the index of the current instance in the annotations 260 * List. 261 * @param attribute the attribute whose value needs to be found 262 * @return a String representing the value for the attribute. 263 */ 264 public String getAttributeValue(int instanceIndex, Attribute attribute){ 265 //sanity check 266 int actualPosition = instanceIndex + attribute.getPosition(); 267 if(actualPosition < 0 || actualPosition >= annotations.size()) return null; 268 269 //check caches first 270 if(attribute.getPosition() == 0){ 271 //current instance 272 if(currentAttributes == null) currentAttributes = new HashMap(); 273 return getValue(attribute, instanceIndex, currentAttributes); 274 }else if(attribute.getPosition() > 0){ 275 //check forward cache 276 Map attributesMap = (Map)forwardCache.get(attribute.getPosition() - 1); 277 if(attributesMap == null){ 278 attributesMap = new HashMap(); 279 forwardCache.set(attribute.getPosition() - 1, attributesMap); 280 } 281 return getValue(attribute, actualPosition, attributesMap); 282 }else if(attribute.getPosition() < 0){ 283 //check bacward cache 284 Map attributesMap = (Map)backwardCache.get(-attribute.getPosition() - 1); 285 if(attributesMap == null){ 286 attributesMap = new HashMap(); 287 backwardCache.set(-attribute.getPosition() - 1, attributesMap); 288 } 289 return getValue(attribute, actualPosition, attributesMap); 290 } 291 //we should never get here 292 throw new LuckyException( 293 "Attribute position is neither 0, nor negative nor positive!"); 294 } 295 296 /** 297 * Notifies the cache that it should advance its internal structures one 298 * step forward. 299 */ 300 public void shift(){ 301 if(backwardCache.isEmpty()){ 302 //no backward caching, all attributes have position "0" or more 303 //nothing to do 304 }else{ 305 backwardCache.remove(backwardCache.size() - 1); 306 backwardCache.add(0, currentAttributes); 307 } 308 if(forwardCache.isEmpty()){ 309 //no forward caching, all attributes have position "0" or less 310 if(currentAttributes != null) currentAttributes.clear(); 311 }else{ 312 currentAttributes = (Map) forwardCache.remove(0); 313 forwardCache.add(null); 314 } 315 } 316 317 /** 318 * Finds the value for a particular attribute and returns it. 319 * If the value is not present in the cache it will be retrieved from the 320 * document and the cache will be updated. 321 * @param attribute the attribute whose value is requested. 322 * @param cache the Map containing the cache for the appropriate position 323 * for the attribute 324 * @param instanceIndex the index of the instance annotation which is 325 * covered by the sought attribute 326 * @return a String value. 327 */ 328 protected String getValue(Attribute attribute, 329 int instanceIndex, 330 Map cache){ 331 String value = null; 332 String annType = attribute.getType(); 333 String featureName = attribute.getFeature(); 334 Map typeData = (Map)cache.get(annType); 335 if(typeData != null){ 336 if(featureName == null){ 337 //we're only interested in the presence of the annotation 338 value = (String)typeData.get(null); 339 }else{ 340 value = (String)typeData.get(featureName); 341 } 342 }else{ 343 //type data was null -> nothing known about this type of annotations 344 //get the insformation; update the cache and return the right value 345 Annotation instanceAnnot = (Annotation)annotations.get(instanceIndex); 346 AnnotationSet coverSubset = annotationSet.get( 347 annType, 348 instanceAnnot.getStartNode().getOffset(), 349 instanceAnnot.getEndNode().getOffset()); 350 typeData = new HashMap(); 351 cache.put(annType, typeData); 352 if(coverSubset == null || coverSubset.isEmpty()){ 353 //no such annotations at given location 354 typeData.put(null, "false"); 355 if(featureName == null) value = "false"; 356 else value = null; 357 }else{ 358 typeData.putAll(((Annotation)coverSubset.iterator().next()). 359 getFeatures()); 360 typeData.put(null, "true"); 361 if(featureName == null) value = "true"; 362 else value = (String)typeData.get(featureName); 363 } 364 } 365 return value; 366 } 367 368 /** 369 * Stores cached data with attribute values for instances placed 370 * <b>after</b> the current instance. 371 * For each instance (i.e. for each position in the list) the data is a Map 372 * with annotationTypes as keys. For each annotation type the data stored is 373 * another Map with feature names as keys and feature values as values. 374 * The <tt>null</tt> key is used for a boolean value (stored as one of the 375 * "true" or "false" strings) signifying the presence 376 * (or lack of presence) of the required type of annotation at the location. 377 * forwardCache[2].get("Lookup").get(null) == "false" means that no lookup 378 * annotation covers the second instance to the right from the current 379 * instance. 380 */ 381 protected List forwardCache; 382 383 /** 384 * Stores cached data with attribute values for instances placed 385 * <b>before</b> the current instance. 386 * For each instance (i.e. for each position in the list) the data is a Map 387 * with annotationTypes as keys. For each annotation type the data stored is 388 * another Map with feature names as keys and feature values as values. 389 * The <tt>null</tt> key is used for a boolean value (stored as one of the 390 * "true" or "false" strings) signifying the presence 391 * (or lack of presence) of the required type of annotation at the location. 392 * backwardCache[2].get("Lookup").get(null) == "false" means that no lookup 393 * annotation covers the second instance to the left from the current 394 * instance. 395 */ 396 protected List backwardCache; 397 398 /** 399 * A Map 400 * with annotationTypes as keys. For each annotation type the data stored is 401 * another Map with feature names as keys and feature values as values. 402 * The <tt>null</tt> key is used for a boolean value (stored as one of the 403 * "true" or "false" strings) signifying the presence 404 * (or lack of presence) of the required type of annotation at the location. 405 * currentAttributes.get(Lookup).get(null) == "false" means that the current 406 * instance is not covered by a Lookup annotation. 407 * currentAttributes.get(Lookup) == null menas nothing is known about Lookup 408 * annotations caovering the current instance. 409 */ 410 protected Map currentAttributes; 411 412 } 413 414 415 public void setInputASName(String inputASName) { 416 this.inputASName = inputASName; 417 } 418 public String getInputASName() { 419 return inputASName; 420 } 421 public java.net.URL getConfigFileURL() { 422 return configFileURL; 423 } 424 public void setConfigFileURL(java.net.URL configFileURL) { 425 this.configFileURL = configFileURL; 426 } 427 public void setTraining(Boolean training) { 428 this.training = training; 429 } 430 public Boolean getTraining() { 431 return training; 432 } 433 434 private java.net.URL configFileURL; 435 protected DatasetDefintion datasetDefinition; 436 437 protected MLEngine engine; 438 439 protected String inputASName; 440 441 protected AnnotationSet annotationSet; 442 443 protected List annotations; 444 445 protected List actionList; 446 447 protected Cache cache; 448 private Boolean training; 449 }
|
MachineLearningPR |
|