|
DataCollector |
|
1 /* 2 * Copyright (c) 1998-2001, The University of Sheffield. 3 * 4 * This file is part of GATE (see http://gate.ac.uk/), and is free 5 * software, licenced under the GNU Library General Public License, 6 * Version 2, June 1991 (in the distribution as file licence.html, 7 * and also available at http://gate.ac.uk/gate/licence.html). 8 * 9 * Valentin Tablan 28 May 2002 10 * 11 * $Id: DataCollector.java,v 1.1 2002/06/27 17:12:32 valyt Exp $ 12 */ 13 package gate.ml; 14 15 import java.util.*; 16 import javax.xml.parsers.*; 17 import java.net.*; 18 19 import gate.*; 20 import gate.util.*; 21 import gate.creole.*; 22 23 import weka.core.*; 24 /** 25 * Collects training data from a corpus. 26 * It iterates through the offsets in the annotation set and uses an instance 27 * detector to find instances and a set of attribute detectors to find the 28 * associated attributes. 29 */ 30 public class DataCollector extends AbstractLanguageAnalyser { 31 /** 32 * Default constructor. 33 */ 34 public DataCollector() { 35 attributeDetectors = new ArrayList(); 36 inputTypes = new ArrayList(); 37 } 38 39 /** 40 * Gets the annotations that start at a given offset. 41 * Returns null or an empty list if none found. 42 * @param offset a Long value 43 * @return a Set value. 44 */ 45 public Set getStartingAnnotations(Long offset){ 46 AnnotationsLists existingAnnotations = (AnnotationsLists) 47 annotationsByOffset.get(offset); 48 if(existingAnnotations == null) return null; 49 else return existingAnnotations.startingAnnotations; 50 } 51 52 /** 53 * Gets the annotations that end at a given offset. 54 * Returns null or an empty list if none found. 55 * @param offset a Long value 56 * @return a Set value. 57 */ 58 public Set getEndingAnnotations(Long offset){ 59 AnnotationsLists existingAnnotations = (AnnotationsLists) 60 annotationsByOffset.get(offset); 61 if(existingAnnotations == null) return null; 62 else return existingAnnotations.endingAnnotations; 63 } 64 65 /** 66 * Gets the next offset for a given offset using the natural ordering. 67 * @param offset a Long value 68 * @return a Long value. 69 */ 70 public Long nextOffset(Long offset){ 71 if(annotationsByOffset == null || 72 annotationsByOffset.isEmpty()) return null; 73 SortedMap tailMap = annotationsByOffset.tailMap( 74 new Long(offset.longValue() + 1)); 75 return (Long)((tailMap == null || tailMap.isEmpty()) ? null : 76 tailMap.firstKey()); 77 } 78 79 /** 80 * Gets the next offset for a given offset using the natural ordering. 81 * @param offset a Long value 82 * @return a Long value. 83 */ 84 public Long previousOffset(Long offset){ 85 if(annotationsByOffset == null || 86 annotationsByOffset.isEmpty()) return null; 87 SortedMap headMap = annotationsByOffset.subMap( 88 annotationsByOffset.firstKey(), offset); 89 return (Long)((headMap == null || headMap.isEmpty()) ? null : 90 headMap.lastKey()); 91 } 92 93 public void execute() throws ExecutionException{ 94 //check the input 95 if(document == null) 96 throw new ExecutionException("No document to process!"); 97 if(annotationSetName == null || 98 annotationSetName.equals("")) annotationSet = document.getAnnotations(); 99 else annotationSet = document.getAnnotations(annotationSetName); 100 101 102 fireStatusChanged("Extracting data from " + document.getName() + "..."); 103 104 //get all the relevant offsets 105 annotationsByOffset = new TreeMap(); 106 107 Iterator annIter = annotationSet.iterator(); 108 while(annIter.hasNext()){ 109 Annotation annotation = (Annotation)annIter.next(); 110 Long startOffset = annotation.getStartNode().getOffset(); 111 AnnotationsLists existingAnnotations = (AnnotationsLists) 112 annotationsByOffset. 113 get(startOffset); 114 if(existingAnnotations == null){ 115 existingAnnotations = new AnnotationsLists(); 116 annotationsByOffset.put(startOffset, existingAnnotations); 117 } 118 existingAnnotations.startingAnnotations.add(annotation); 119 120 Long endOffset = annotation.getEndNode().getOffset(); 121 existingAnnotations = (AnnotationsLists)annotationsByOffset. 122 get(endOffset); 123 if(existingAnnotations == null){ 124 existingAnnotations = new AnnotationsLists(); 125 annotationsByOffset.put(endOffset, existingAnnotations); 126 } 127 existingAnnotations.endingAnnotations.add(annotation); 128 } 129 130 //parse through all the offsets 131 Iterator offsetsIter = annotationsByOffset.keySet().iterator(); 132 while(offsetsIter.hasNext()){ 133 fireDataAdvance((Long) offsetsIter.next()); 134 } 135 }//public void execute() throws ExecutionException{ 136 137 /** 138 * Adds a new instance to the dataset being constructed. 139 * @param instance the instance value to be added. 140 */ 141 public void addInstance(Instance instance){ 142 dataSet.add(instance); 143 } 144 145 public static void main(String[] args) { 146 } 147 148 149 public void setConfigFileURL(URL configFileURL) { 150 this.configFileURL = configFileURL; 151 } 152 153 public URL getConfigFileURL() { 154 return configFileURL; 155 } 156 157 public Resource init(){ 158 readConfigFile(); 159 //prepare the dataset 160 FastVector attributes = new FastVector(); 161 Iterator attIter = attributeDetectors.iterator(); 162 while(attIter.hasNext()){ 163 attributes.addElement(((AttributeDetector)attIter.next()).getAttribute()); 164 } 165 //add the attribute for the class 166 attributes.addElement(instanceDetector.getClassAttribute()); 167 dataSet = new Instances(getName() + " Dataset", attributes, 0); 168 169 return this; 170 } 171 172 /** 173 * Reads the configuration file and populates internal data with values. 174 */ 175 protected void readConfigFile(){ 176 //hardcoded for now 177 AnnotationDetector annotationDetector = new AnnotationDetector(); 178 annotationDetector.setAnnotationTypes("Date,Person,Location,Organization,Money"); 179 setInstanceDetector(annotationDetector); 180 //Add attributes now 181 182 //annotation length (in tokens) 183 addAttributeDetector(new AnnotationLengthExtractor()); 184 //POS category and orthography for the first 7 tokens 185 for(int i = 1; i <= 7; i++){ 186 POSCategoryExtractor posExtractor = new POSCategoryExtractor(); 187 posExtractor.setPosition(i); 188 //look in the right context too 189 // posExtractor.setIgnoreRightContext(false); 190 addAttributeDetector(posExtractor); 191 192 TokenOrthographyExtractor orthExtractor = new TokenOrthographyExtractor(); 193 orthExtractor.setPosition(i); 194 addAttributeDetector(orthExtractor); 195 } 196 197 198 //POS category and orthography for 3 tokens left context 199 for(int i = -1; i >= -3; i--){ 200 POSCategoryExtractor posExtractor = new POSCategoryExtractor(); 201 posExtractor.setPosition(i); 202 addAttributeDetector(posExtractor); 203 204 TokenOrthographyExtractor orthExtractor = new TokenOrthographyExtractor(); 205 orthExtractor.setPosition(i); 206 addAttributeDetector(orthExtractor); 207 } 208 209 //Lookup type and position for the first 3 lookups 210 LookupDetector lookupDetector = new LookupDetector(); 211 //type - 1 212 addAttributeDetector(lookupDetector); 213 //position - 1 214 addAttributeDetector(lookupDetector); 215 //type - 2 216 addAttributeDetector(lookupDetector); 217 //position - 2 218 addAttributeDetector(lookupDetector); 219 //type -3 220 addAttributeDetector(lookupDetector); 221 //position - 3 222 addAttributeDetector(lookupDetector); 223 } 224 225 public void setState(int state) { 226 this.state = state; 227 } 228 229 public int getState() { 230 return state; 231 } 232 public synchronized void removeDataListener(DataListener l) { 233 if (dataListeners != null && dataListeners.contains(l)) { 234 Vector v = (Vector) dataListeners.clone(); 235 v.removeElement(l); 236 dataListeners = v; 237 } 238 } 239 public synchronized void addDataListener(DataListener l) { 240 Vector v = dataListeners == null ? new Vector(2) : (Vector) dataListeners.clone(); 241 if (!v.contains(l)) { 242 v.addElement(l); 243 dataListeners = v; 244 l.setDataCollector(this); 245 } 246 } 247 248 /** 249 * URL to the file containing the configuration. 250 */ 251 protected URL configFileURL; 252 253 /** 254 * The types of annotation to be considered. Annotations of types not 255 * contained here will be ignored. 256 */ 257 List inputTypes; 258 259 protected AnnotationSet annotationSet; 260 public Instances getDataSet(){ 261 return dataSet; 262 } 263 264 protected Instances dataSet; 265 266 protected InstanceDetector instanceDetector; 267 268 /** 269 * Stores the annotations from the input annotation set by offset (starting 270 * and ending). Maps from Long (offset) to {@link AnnotationsLists}. 271 */ 272 protected SortedMap annotationsByOffset; 273 274 /** 275 * A structure that stores the annotations relevant for an offset: a list of 276 * annotations that start at the offset and a list of annotations that end at 277 * the offset. 278 */ 279 protected static class AnnotationsLists{ 280 public AnnotationsLists(){ 281 startingAnnotations = new HashSet(); 282 endingAnnotations = new HashSet(); 283 } 284 285 public Set startingAnnotations; 286 public Set endingAnnotations; 287 } 288 289 List attributeDetectors; 290 291 /** 292 * The state of the data collector. Can be one of {@link BEFORE}, 293 * {@link INSIDE} or {@link AFTER} according to the relation between the 294 * current location in the document and the instance being constructed. 295 * The value of the state is controlled by the instance detector. 296 */ 297 protected int state; 298 299 private transient Vector dataListeners; 300 private String annotationSetName; 301 302 protected void fireDataAdvance(Long e) { 303 if (dataListeners != null) { 304 Vector listeners = dataListeners; 305 int count = listeners.size(); 306 for (int i = 0; i < count; i++) { 307 ((DataListener) listeners.elementAt(i)).dataAdvance(e); 308 } 309 } 310 } 311 312 public void addAttributeDetector(AttributeDetector attrDetector){ 313 attributeDetectors.add(attrDetector); 314 attrDetector.setDataCollector(this); 315 } 316 317 public List getAttributeDetectors(){ 318 return attributeDetectors; 319 } 320 321 public InstanceDetector getInstanceDetector() { 322 return instanceDetector; 323 } 324 325 public void setInstanceDetector(InstanceDetector instanceDetector) { 326 if(instanceDetector != null) removeDataListener(instanceDetector); 327 this.instanceDetector = instanceDetector; 328 addDataListener(instanceDetector); 329 } 330 public void setAnnotationSetName(String annotationSetName) { 331 this.annotationSetName = annotationSetName; 332 } 333 public String getAnnotationSetName() { 334 return annotationSetName; 335 } 336 }
|
DataCollector |
|