1
14
15 package gate.creole.ml.weka;
16
17 import java.io.Serializable;
18 import java.util.*;
19
20 import weka.core.*;
21 import weka.filters.Filter;
22
23
27 public class StringToNominalFilter extends Filter implements OptionHandler{
28
31 public StringToNominalFilter() {
32 }
33
34
45 public boolean setInputFormat(Instances instanceInfo)
46 throws Exception {
47 super.setInputFormat(instanceInfo);
48 Iterator attIter = attributesData.iterator();
49 while(attIter.hasNext()){
50 AttributeData aData = (AttributeData)attIter.next();
51 if (!instanceInfo.attribute(aData.index).isString()) {
52 throw new UnsupportedAttributeTypeException(
53 "Attribute at selcted index " + aData.index +
54 " is not of type string!");
55 }
56 }
57 return false;
58 }
59
60
69 public boolean input(Instance instance) {
70 if (getInputFormat() == null) {
71 throw new IllegalStateException("No input instance format defined");
72 }
73
74 if (m_NewBatch) {
75 resetQueue();
76 m_NewBatch = false;
77 }
78
79 bufferInput(instance);
80 return false;
81 }
82
83
91 public boolean batchFinished() {
92 if (getInputFormat() == null) {
93 throw new IllegalStateException("No input instance format defined");
94 }
95 buildOutputFormat();
97
98 for(int i = 0; i < getInputFormat().numInstances(); i++) {
100 push(processInstance(getInputFormat().instance(i)));
101 }
102
103 flushInput();
104 m_NewBatch = true;
105 return (numPendingOutput() != 0);
106 }
107
108
109
114 protected void buildOutputFormat(){
115 Map wordData = new HashMap();
120 for(int i = 0; i < getInputFormat().numInstances(); i++) {
122 Instance instance = getInputFormat().instance(i);
123 Iterator attIter = attributesData.iterator();
125 while(attIter.hasNext()){
126 AttributeData aData = (AttributeData)attIter.next();
127 String word = instance.stringValue(aData.index);
128 Map wMap = (Map)wordData.get(word);
130 if(wMap == null){
131 wMap = new HashMap();
132 wordData.put(word, wMap);
133 }
134 Integer attIndex = new Integer(aData.index);
136 Map w_aMap = (Map)wMap.get(attIndex);
137 if(w_aMap == null){
138 w_aMap = new HashMap();
139 wMap.put(attIndex, w_aMap);
140 }
141 Double classValue = new Double(instance.classValue());
143 WordData w_a_cCount = (WordData)w_aMap.get(classValue);
144 if(w_a_cCount == null){
146 w_a_cCount = new WordData(word, aData.index, classValue, 1);
147 w_aMap.put(classValue, w_a_cCount);
148 }else{
149 w_a_cCount.inc();
150 }
151 }
152 }
153 Instances newData;
155 FastVector newAtts, newVals;
156 newAtts = new FastVector(getInputFormat().numAttributes());
158 for (int i = 0; i < getInputFormat().numAttributes(); i++) {
159 Attribute att = getInputFormat().attribute(i);
160 newAtts.addElement(att.copy());
161 }
162
163 Iterator attIter = attributesData.iterator();
165 while(attIter.hasNext()){
166 AttributeData aData = (AttributeData)attIter.next();
167 FastVector values = new FastVector(aData.maxCount);
168 if(aData.method.equalsIgnoreCase(FREQUENCY)){
169 List wordFreqs = new ArrayList(wordData.size());
170 Iterator entryIter = wordData.entrySet().iterator();
171 while(entryIter.hasNext()){
172 Map.Entry entry = (Map.Entry)entryIter.next();
173 String word = (String)entry.getKey();
174 Map w_aMap = (Map)((Map)entry.getValue()).get(new Integer(aData.index));
175 int count = addLeaves(w_aMap);
176 wordFreqs.add(new WordCount(word, count));
177 }
178 int start = 0;
179 if(wordFreqs.size() > aData.maxCount){
180 Collections.sort(wordFreqs);
181 start = wordFreqs.size() - aData.maxCount;
182 }
183 for(int i = wordFreqs.size() -1; i >= start; i--){
184 values.addElement(((WordCount)wordFreqs.get(i)).word);
185 }
186 System.out.println("Values count" + values.size());
187 }else if(aData.method.equalsIgnoreCase(TFIDF)){
188 int classCount = getInputFormat().classAttribute().numValues();
189 List wordTFIDFValues = new ArrayList(wordData.size());
190 Iterator entryIter = wordData.entrySet().iterator();
191 while(entryIter.hasNext()){
192 Map.Entry entry = (Map.Entry)entryIter.next();
193 String word = (String)entry.getKey();
194 Map w_aMap = (Map)((Map)entry.getValue()).get(new Integer(aData.index));
195 if(w_aMap == null || w_aMap.isEmpty()) continue;
196 int count = addLeaves(w_aMap);
197 int classFreq = w_aMap.size();
198 double tfidf = count * Math.log(classCount/classFreq);
199 wordTFIDFValues.add(new WordCount(word, count, tfidf));
200 }
201 int start = 0;
202 if(wordTFIDFValues.size() > aData.maxCount){
203 Collections.sort(wordTFIDFValues, new Comparator(){
204 public int compare(Object o1, Object o2){
205 double value = ((WordCount)o1).tfidf - ((WordCount)o2).tfidf;
206 if(value > Utils.SMALL) return 1;
207 else if(value < -Utils.SMALL) return -1;
208 else return 0;
209 }
210 });
211 start = wordTFIDFValues.size() - aData.maxCount;
212 }
213 for(int i = wordTFIDFValues.size() -1; i >= start; i--){
214 values.addElement(((WordCount)wordTFIDFValues.get(i)).word);
215 }
216
217 }
218 Attribute oldAttr = (Attribute)newAtts.elementAt(aData.index);
219 Attribute newAttribute = new Attribute(oldAttr.name(), values);
220 System.out.println("Atribute \"" + newAttribute.name() + "\":" + values.size());
221 newAtts.setElementAt(newAttribute, aData.index);
222 }
223
224 newData = new Instances(getInputFormat().relationName(), newAtts, 0);
226 newData.setClassIndex(getInputFormat().classIndex());
227 setOutputFormat(newData);
228 }
229
230 public static void main(String[] args){
231 try{
232 StringToNominalFilter filter = new StringToNominalFilter();
233 filter.setOptions(new String[]{"-A", "10,200,TFIDF",
234 "-A", "11,200,TFIDF",
235 "-A", "12,200,TFIDF"});
236 Instances input = new Instances(
237 new java.io.FileReader("D:\\tmp\\ML-Weka\\Strings\\MUC7dataset.arff"));
238 input.setClassIndex(18);
239 filter.setInputFormat(input);
240 for(int i = 0; i < input.numInstances(); i++){
241 filter.input(input.instance(i));
242 }
243 filter.batchFinished();
244 Instances output = filter.getOutputFormat();
245 Instance instance = filter.output();
246 while(instance != null){
247 output.add(instance);
248 instance = filter.output();
249 }
250 java.io.FileWriter fw = new java.io.FileWriter("D:\\tmp\\ML-Weka\\Strings\\MUC7dataset.filtered.arff");
251 fw.write(output.toString());
252 fw.flush();
253 fw.close();
254 }catch(Exception e){
255 e.printStackTrace();
256 }
257 }
258
259 protected int addLeaves(Map map){
260 int res = 0;
261 Iterator valuesIter = map.values().iterator();
262 while(valuesIter.hasNext()){
263 Object value = valuesIter.next();
264 if(value instanceof WordData) res += ((WordData)value).count;
265 else if(value instanceof Map) res += addLeaves((Map)value);
266 }
267 return res;
268 }
269
270
276 protected Instance processInstance(Instance inputInstance){
277 Instance newInstance = new Instance(getOutputFormat().numAttributes());
278 newInstance.setDataset(getOutputFormat());
279 for(int i = 0; i < getOutputFormat().numAttributes(); i++){
280 if(inputInstance.isMissing(i)) newInstance.setMissing(i);
281 else{
282 if(isString(i)){
283 String value = inputInstance.stringValue(i);
284 if(getOutputFormat().attribute(i).indexOfValue(value) == -1){
285 newInstance.setMissing(i);
286 }else{
287 newInstance.setValue(i, value);
288 }
289 }else{
290 newInstance.setValue(i, inputInstance.value(i));
291 }
292 }
293 }
294 return newInstance;
295 }
296
297
303 protected boolean isString(int index){
304 int[] stringIndices = getInputStringIndex();
305 for(int i = 0; i < stringIndices.length; i++)
306 if(stringIndices[i] == index) return true;
307 return false;
308 }
309
310 public Enumeration listOptions() {
311 return optionsDesc.elements();
312 }
313
314 public void setOptions(String[] options) throws java.lang.Exception {
315 this.options = options;
316 parseOptions();
317 Iterator itr = attributesData.iterator();
318 while(itr.hasNext()){
319 AttributeData aData = (AttributeData)itr.next();
320 System.out.println("Attribute " + aData.index + " " + aData.maxCount + " " + aData.method);
321 }
322 }
323
324 public String[] getOptions() {
325 return options;
326 }
327
328
331 protected void parseOptions() throws Exception{
332 attributesData = new ArrayList();
333 String option = Utils.getOption('A', options);
334 System.out.print("Option " + option);
335 while(option != null && option.length() > 0){
336 StringTokenizer strTok = new StringTokenizer(option, ",", false);
337 int index = Integer.parseInt(strTok.nextToken());
338 System.out.print(": " + index);
339 int maxCnt = Integer.parseInt(strTok.nextToken());
340 System.out.print(": " + maxCnt);
341 String method = null;
343 if(strTok.hasMoreTokens()){
344 method = strTok.nextToken();
345 if(!method.equalsIgnoreCase(FREQUENCY) &&
346 !method.equalsIgnoreCase(TFIDF)){
347 throw new Exception("Unknown filtering method: " + method);
348 }
349 }
350 attributesData.add(new AttributeData(index, maxCnt, method));
351 option = Utils.getOption('A', options);
353 }
354 }
355
356
359 protected static class AttributeData implements Serializable{
360 public AttributeData(int index, int count, String method){
361 this.index = index;
362 this.maxCount = count;
363 this.method = method;
364 }
365
366 int index;
367 int maxCount;
368 String method;
369 }
370
371 protected static class WordData{
372 public WordData(String word, int attrIndex, Double classValue, int count){
373 this.word = word;
374 this.attributeIndex = attrIndex;
375 this.classValue = classValue;
376 this.count = count;
377 }
378
379 public void inc(){
380 count ++;
381 }
382 String word;
383 int attributeIndex;
384 Double classValue;
385 int count;
386 }
387
388 protected static class WordCount implements Comparable{
389 public WordCount(String word, int count){
390 this.word = word;
391 this.count = count;
392 tfidf = -1;
393 }
394
395 public WordCount(String word, int count, double tfidf){
396 this.word = word;
397 this.count = count;
398 this.tfidf = tfidf;
399 }
400
401 public int compareTo(Object other){
402 return count - ((WordCount)other).count;
403 }
404
405 String word;
406 int count;
407 double tfidf;
408 }
409
410
413 private String[] options;
414
415 protected List attributesData;
416
419 protected static Vector optionsDesc;
420
421
424 public static final String FREQUENCY = "FREQ";
425
426
429 public static final String TFIDF = "TFIDF";
430
433 static{
434 optionsDesc = new Vector(1);
435 Option option = new Option(
436 "Selects one attribute for conversion. " +
437 "The optional <method> argument can be one of FREQ or TFIDF " +
438 "(the default is FREQ). " +
439 "This option can be repeated for as many attributes as necessary.",
440 "A", 1, "-A <index>,<max count>[,<method>] ...");
441 optionsDesc.add(option);
442 }
443 }