1   /*
2    *  Copyright (c) 1998-2001, The University of Sheffield.
3    *
4    *  This file is part of GATE (see http://gate.ac.uk/), and is free
5    *  software, licenced under the GNU Library General Public License,
6    *  Version 2, June 1991 (in the distribution as file licence.html,
7    *  and also available at http://gate.ac.uk/gate/licence.html).
8    *
9    *  Valentin Tablan 19/11/2002
10   *
11   *  $Id: MachineLearningPR.java,v 1.4 2002/12/20 12:02:29 valyt Exp $
12   *
13   */
14  package gate.creole.ml;
15  
16  import java.util.*;
17  
18  import gate.*;
19  import gate.creole.*;
20  import gate.gui.*;
21  import gate.util.*;
22  import org.jdom.*;
23  import org.jdom.input.*;
24  
/**
 * This processing resource is used to train a machine learning algorithm with
 * data extracted from a corpus or, when the training parameter is false, to
 * apply the ML model to the documents it is run over.
 */
29  
30  public class MachineLearningPR extends AbstractLanguageAnalyser
31                         implements gate.gui.ActionsPublisher{
32  
33    public MachineLearningPR(){
34      actionList = new ArrayList();
35      actionList.add(null);
36    }
37  
38    /** Initialise this resource, and return it. */
39    public Resource init() throws ResourceInstantiationException {
40      if(configFileURL == null){
41        throw new ResourceInstantiationException(
42          "No configuration file provided!");
43      }
44  
45      org.jdom.Document jdomDoc;
46      SAXBuilder saxBuilder = new SAXBuilder(false);
47      try{
48        jdomDoc = saxBuilder.build(configFileURL);
49      }catch(JDOMException jde){
50        throw new ResourceInstantiationException(jde);
51      }
52  
53      //go through the jdom document to extract the data we need
54      Element rootElement = jdomDoc.getRootElement();
55      if(!rootElement.getName().equals("ML-CONFIG"))
56        throw new ResourceInstantiationException(
57          "Root element of dataset defintion file is \"" + rootElement.getName() +
58          "\" instead of \"ML-CONFIG\"!");
59  
60      //create the dataset defintion
61      Element datasetElement = rootElement.getChild("DATASET");
62      if(datasetElement == null) throw new ResourceInstantiationException(
63        "No dataset definition provided in the configuration file!");
64      try{
65        datasetDefinition = new DatasetDefintion(datasetElement);
66      }catch(GateException ge){
67        throw new ResourceInstantiationException(ge);
68      }
69  
70      //create the engine
71      Element engineElement = rootElement.getChild("ENGINE");
72      if(engineElement == null) throw new ResourceInstantiationException(
73        "No engine option provided in the configuration file!");
74      Element engineClassElement = engineElement.getChild("WRAPPER");
75      if(engineClassElement == null) throw new ResourceInstantiationException(
76        "No ML engine class provided!");
77      String engineClassName = engineClassElement.getTextTrim();
78      try{
79        Class engineClass = Class.forName(engineClassName);
80        engine = (MLEngine)engineClass.newInstance();
81      }catch(ClassNotFoundException cnfe){
82        throw new ResourceInstantiationException(
83          "ML engine class:" + engineClassName + "not found!");
84      }catch(IllegalAccessException iae){
85        throw new ResourceInstantiationException(iae);
86      }catch(InstantiationException ie){
87        throw new ResourceInstantiationException(ie);
88      }
89      engine.setDatasetDefinition(datasetDefinition);
90      engine.setOptions(engineElement.getChild("OPTIONS"));
91      engine.setOwnerPR(this);
92      try{
93        engine.init();
94      }catch(GateException ge){
95        throw new ResourceInstantiationException(ge);
96      }
97  
98      return this;
99    } // init()
100 
101 
102   /**
103    * Run the resource.
104    */
105   public void execute() throws ExecutionException{
106     interrupted = false;
107     //check the input
108     if(document == null) {
109       throw new ExecutionException(
110         "No document provided!"
111       );
112     }
113 
114     if(inputASName == null ||
115        inputASName.equals("")) annotationSet = document.getAnnotations();
116     else annotationSet = document.getAnnotations(inputASName);
117 
118     if(training.booleanValue()){
119       fireStatusChanged(
120           "Collecting training data from " + document.getName() + "...");
121     }else{
122       fireStatusChanged(
123           "Applying ML model to " + document.getName() + "...");
124     }
125     fireProgressChanged(0);
126     AnnotationSet anns = annotationSet.
127                                      get(datasetDefinition.getInstanceType());
128     annotations = (anns == null || anns.isEmpty()) ?
129                   new ArrayList() : new ArrayList(anns);
130     Collections.sort(annotations, new OffsetComparator());
131     Iterator annotationIter = annotations.iterator();
132     int index = 0;
133     int size = annotations.size();
134 
135     //create the cache structure
136     cache = new Cache();
137 
138     while(annotationIter.hasNext()){
139       Annotation instanceAnn = (Annotation)annotationIter.next();
140       List attributeValues = new ArrayList(datasetDefinition.
141                                            getAttributes().size());
142       //find the values for all attributes
143       Iterator attrIter = datasetDefinition.getAttributes().iterator();
144       while(attrIter.hasNext()){
145         Attribute attr = (Attribute)attrIter.next();
146         if(attr.isClass && !training.booleanValue()){
147           //we're not training so the class will be undefined
148           attributeValues.add(null);
149         }else{
150           attributeValues.add(cache.getAttributeValue(index, attr));
151         }
152       }
153 
154       if(training.booleanValue()){
155         engine.addTrainingInstance(attributeValues);
156       }else{
157         Object result = engine.classifyInstance(attributeValues);
158         if(result instanceof Collection){
159           Iterator resIter = ((Collection)result).iterator();
160           while(resIter.hasNext()) updateDocument(resIter.next(), index);
161         }else{
162           updateDocument(result, index);
163         }
164       }
165 
166       cache.shift();
167       //every 10 instances fire an event
168       if(index % 10 == 0){
169         fireProgressChanged(index * 100 / size);
170         if(isInterrupted()) throw new ExecutionInterruptedException();
171       }
172       index++;
173     }
174     annotations = null;
175   } // execute()
176 
177 
178   protected void updateDocument(Object classificationResult, int instanceIndex){
179     //interpret the result according to the attribute semantics
180     Attribute classAttr = datasetDefinition.getClassAttribute();
181     String type = classAttr.getType();
182     String feature = classAttr.getFeature();
183     List classValues = classAttr.getValues();
184     FeatureMap features = Factory.newFeatureMap();
185     boolean shouldCreateAnnotation = true;
186     if(classValues != null && !classValues.isEmpty()){
187       //nominal attribute -> AnnotationType.feature
188       //the result is the value for the feature
189       String featureValue = (String)classificationResult;
190       features.put(feature, featureValue);
191     }else{
192       if(feature == null){
193         //boolean attribute
194         shouldCreateAnnotation = classificationResult.equals("true");
195       }else{
196         //numeric attribute
197         String featureValue = classificationResult.toString();
198         features.put(feature, featureValue);
199       }
200     }
201 
202     if(shouldCreateAnnotation){
203       //generate the new annotation
204       int coveredInstanceIndex = instanceIndex + classAttr.getPosition();
205       if(coveredInstanceIndex >= 0 &&
206          coveredInstanceIndex < annotations.size()){
207         Annotation coveredInstance = (Annotation)annotations.
208                                      get(coveredInstanceIndex);
209         annotationSet.add(coveredInstance.getStartNode(),
210                           coveredInstance.getEndNode(),
211                           type, features);
212       }
213     }
214   }
215 
216 
217   /**
218    * Gets the list of actions that can be performed on this resource.
219    * @return a List of Action objects (or null values)
220    */
221   public List getActions(){
222     List result = new ArrayList();
223     result.addAll(actionList);
224     if(engine instanceof ActionsPublisher){
225       result.addAll(((ActionsPublisher)engine).getActions());
226     }
227     return result;
228   }
229 
230   protected class Cache{
231     public Cache(){
232       //find the sizes for the two caches
233       int forwardCacheSize = 0;
234       int backwardCacheSize = 0;
235       Iterator attrIter = datasetDefinition.getAttributes().iterator();
236       while(attrIter.hasNext()){
237         Attribute anAttribute = (Attribute)attrIter.next();
238         if(anAttribute.getPosition() > 0){
239           //forward looking
240           if(anAttribute.getPosition() > forwardCacheSize){
241             forwardCacheSize = anAttribute.getPosition();
242           }
243         }else if(anAttribute.getPosition() < 0){
244           //backward looking
245           if(-anAttribute.getPosition() > backwardCacheSize){
246             backwardCacheSize = -anAttribute.getPosition();
247           }
248         }
249       }
250       //create the caches filled with null values
251       forwardCache = new ArrayList(forwardCacheSize);
252       for(int i =0; i < forwardCacheSize; i++) forwardCache.add(null);
253       backwardCache = new ArrayList(backwardCacheSize);
254       for(int i =0; i < backwardCacheSize; i++) backwardCache.add(null);
255     }
256 
257     /**
258      * Finds the value of a specified attribute for a particular instance.
259      * @param instanceIndex the index of the current instance in the annotations
260      * List.
261      * @param attribute the attribute whose value needs to be found
262      * @return a String representing the value for the attribute.
263      */
264     public String getAttributeValue(int instanceIndex, Attribute attribute){
265       //sanity check
266       int actualPosition = instanceIndex + attribute.getPosition();
267       if(actualPosition < 0 || actualPosition >= annotations.size()) return null;
268 
269       //check caches first
270       if(attribute.getPosition() == 0){
271         //current instance
272         if(currentAttributes == null) currentAttributes = new HashMap();
273         return getValue(attribute, instanceIndex, currentAttributes);
274       }else if(attribute.getPosition() > 0){
275         //check forward cache
276         Map attributesMap = (Map)forwardCache.get(attribute.getPosition() - 1);
277         if(attributesMap == null){
278           attributesMap = new HashMap();
279           forwardCache.set(attribute.getPosition() - 1, attributesMap);
280         }
281         return getValue(attribute, actualPosition, attributesMap);
282       }else if(attribute.getPosition() < 0){
283         //check bacward cache
284         Map attributesMap = (Map)backwardCache.get(-attribute.getPosition() - 1);
285         if(attributesMap == null){
286           attributesMap = new HashMap();
287           backwardCache.set(-attribute.getPosition() - 1, attributesMap);
288         }
289         return getValue(attribute, actualPosition, attributesMap);
290       }
291       //we should never get here
292       throw new LuckyException(
293         "Attribute position is neither 0, nor negative nor positive!");
294     }
295 
296     /**
297      * Notifies the cache that it should advance its internal structures one
298      * step forward.
299      */
300     public void shift(){
301       if(backwardCache.isEmpty()){
302         //no backward caching, all attributes have position "0" or more
303         //nothing to do
304       }else{
305         backwardCache.remove(backwardCache.size() - 1);
306         backwardCache.add(0, currentAttributes);
307       }
308       if(forwardCache.isEmpty()){
309         //no forward caching, all attributes have position "0" or less
310         if(currentAttributes != null) currentAttributes.clear();
311       }else{
312         currentAttributes = (Map) forwardCache.remove(0);
313         forwardCache.add(null);
314       }
315     }
316 
317     /**
318      * Finds the value for a particular attribute and returns it.
319      * If the value is not present in the cache it will be retrieved from the
320      * document and the cache will be updated.
321      * @param attribute the attribute whose value is requested.
322      * @param cache the Map containing the cache for the appropriate position
323      * for the attribute
324      * @param instanceIndex the index of the instance annotation which is
325      * covered by the sought attribute
326      * @return a String value.
327      */
328     protected String getValue(Attribute attribute,
329                               int instanceIndex,
330                               Map cache){
331       String value = null;
332       String annType = attribute.getType();
333       String featureName = attribute.getFeature();
334       Map typeData = (Map)cache.get(annType);
335       if(typeData != null){
336         if(featureName == null){
337           //we're only interested in the presence of the annotation
338           value = (String)typeData.get(null);
339         }else{
340           value = (String)typeData.get(featureName);
341         }
342       }else{
343         //type data was null -> nothing known about this type of annotations
344         //get the insformation; update the cache and return the right value
345         Annotation instanceAnnot = (Annotation)annotations.get(instanceIndex);
346         AnnotationSet coverSubset = annotationSet.get(
347                                       annType,
348                                       instanceAnnot.getStartNode().getOffset(),
349                                       instanceAnnot.getEndNode().getOffset());
350         typeData = new HashMap();
351         cache.put(annType, typeData);
352         if(coverSubset == null || coverSubset.isEmpty()){
353           //no such annotations at given location
354           typeData.put(null, "false");
355           if(featureName == null) value = "false";
356           else value = null;
357         }else{
358           typeData.putAll(((Annotation)coverSubset.iterator().next()).
359                           getFeatures());
360           typeData.put(null, "true");
361           if(featureName == null) value = "true";
362           else value = (String)typeData.get(featureName);
363         }
364       }
365       return value;
366     }
367 
368     /**
369      * Stores cached data with attribute values for instances placed
370      * <b>after</b> the current instance.
371      * For each instance (i.e. for each position in the list) the data is a Map
372      * with annotationTypes as keys. For each annotation type the data stored is
373      * another Map with feature names as keys and feature values as values.
374      * The <tt>null</tt> key is used for a boolean value (stored as one of the
375      * &quot;true&quot; or &quot;false&quot; strings) signifying the presence
376      * (or lack of presence) of the required type of annotation at the location.
     * forwardCache[2].get("Lookup").get(null) == "false" means that no Lookup
     * annotation covers the third instance to the right of the current
     * instance (index i holds the data for the instance i + 1 places ahead).
380      */
381     protected List forwardCache;
382 
383     /**
384      * Stores cached data with attribute values for instances placed
385      * <b>before</b> the current instance.
386      * For each instance (i.e. for each position in the list) the data is a Map
387      * with annotationTypes as keys. For each annotation type the data stored is
388      * another Map with feature names as keys and feature values as values.
389      * The <tt>null</tt> key is used for a boolean value (stored as one of the
390      * &quot;true&quot; or &quot;false&quot; strings) signifying the presence
391      * (or lack of presence) of the required type of annotation at the location.
     * backwardCache[2].get("Lookup").get(null) == "false" means that no Lookup
     * annotation covers the third instance to the left of the current
     * instance (index i holds the data for the instance i + 1 places back).
395      */
396     protected List backwardCache;
397 
398     /**
399      * A Map
400      * with annotationTypes as keys. For each annotation type the data stored is
401      * another Map with feature names as keys and feature values as values.
402      * The <tt>null</tt> key is used for a boolean value (stored as one of the
403      * &quot;true&quot; or &quot;false&quot; strings) signifying the presence
404      * (or lack of presence) of the required type of annotation at the location.
     * currentAttributes.get(Lookup).get(null) == "false" means that the current
     * instance is not covered by a Lookup annotation.
     * currentAttributes.get(Lookup) == null means nothing is known about Lookup
     * annotations covering the current instance.
409      */
410     protected Map currentAttributes;
411 
412   }
413 
414 
415   public void setInputASName(String inputASName) {
416     this.inputASName = inputASName;
417   }
418   public String getInputASName() {
419     return inputASName;
420   }
421   public java.net.URL getConfigFileURL() {
422     return configFileURL;
423   }
424   public void setConfigFileURL(java.net.URL configFileURL) {
425     this.configFileURL = configFileURL;
426   }
427   public void setTraining(Boolean training) {
428     this.training = training;
429   }
430   public Boolean getTraining() {
431     return training;
432   }
433 
  /** Location of the XML configuration file parsed by init(). */
  private java.net.URL configFileURL;
  /** Dataset definition built by init() from the DATASET element. */
  protected DatasetDefintion datasetDefinition;

  /** The ML engine instantiated by init() from the WRAPPER class name. */
  protected MLEngine engine;

  /**
   * Name of the annotation set used by execute(); null or empty selects the
   * default annotation set of the document.
   */
  protected String inputASName;

  /**
   * The annotation set resolved by execute(); instances are read from it and
   * the annotations created by updateDocument() are added to it.
   */
  protected AnnotationSet annotationSet;

  /**
   * The instance annotations of the document being processed, sorted with an
   * OffsetComparator; set by execute() and reset to null when it completes.
   */
  protected List annotations;

  /** The actions returned by getActions(); created by the constructor. */
  protected List actionList;

  /** The attribute value cache, recreated on every call to execute(). */
  protected Cache cache;
  /** True when execute() collects training data, false when it classifies. */
  private Boolean training;
449 }