|
TokenOrthographyExtractor |
|
1 /* 2 * Copyright (c) 1998-2001, The University of Sheffield. 3 * 4 * This file is part of GATE (see http://gate.ac.uk/), and is free 5 * software, licenced under the GNU Library General Public License, 6 * Version 2, June 1991 (in the distribution as file licence.html, 7 * and also available at http://gate.ac.uk/gate/licence.html). 8 * 9 * Valentin Tablan 27 June 2002 10 * 11 * $Id: TokenOrthographyExtractor.java,v 1.1 2002/06/27 17:12:32 valyt Exp $ 12 */ 13 package gate.ml; 14 15 import java.util.*; 16 17 import weka.core.*; 18 19 20 import gate.*; 21 import gate.util.*; 22 import gate.creole.ANNIEConstants; 23 import weka.core.Attribute; 24 25 public class TokenOrthographyExtractor extends AbstractAttributeExtractor { 26 public Attribute getAttribute() { 27 FastVector values = new FastVector(ORTHOGRAPHY_VALUES.length); 28 for(int i = 0; i < ORTHOGRAPHY_VALUES.length; i++) 29 values.addElement(ORTHOGRAPHY_VALUES[i]); 30 Attribute attribute = new Attribute("Orth(" + position + ")", values); 31 return attribute; 32 } 33 34 public Object getAttributeValue(Object data){ 35 if(position > 0) return getInsideOrthValue(data); 36 else return getLeftContextOrth(data); 37 } 38 39 /** 40 * This method will find POS category for tokens in the left context of the 41 * target annotation (where position is negative). 42 * @param data 43 * @return 44 */ 45 protected Object getLeftContextOrth(Object data){ 46 //the data is an annotation in this case. 47 Annotation ann = (Annotation)data; 48 Long previousOffset = dataCollector.previousOffset( 49 ann.getStartNode().getOffset()); 50 //we start looking for Tokens going backwards from the annotation start. 51 int skippedTokens = 0; 52 while(previousOffset != null && 53 skippedTokens < -position){ 54 Set startingAnnots = dataCollector.getStartingAnnotations(previousOffset); 55 if(startingAnnots != null && (!startingAnnots.isEmpty())){ 56 Iterator annIter = startingAnnots.iterator(); 57 while(annIter.hasNext()){ 58 Annotation annotation = (Annotation)annIter.next(); 59 if(annotation.getType().equals(ANNIEConstants.TOKEN_ANNOTATION_TYPE)){ 60 skippedTokens++; 61 if(skippedTokens == -position){ 62 //the token we just skipped was the one we needed 63 if(annotation.getFeatures() != null){ 64 String kindOrth = (String)annotation.getFeatures(). 65 get(ANNIEConstants.TOKEN_KIND_FEATURE_NAME); 66 String orth = (String)annotation.getFeatures(). 67 get(ANNIEConstants.TOKEN_ORTH_FEATURE_NAME); 68 if(orth != null && orth.length() > 0){ 69 kindOrth += ":" + orth; 70 } 71 if(orthValues.contains(kindOrth)) return kindOrth; 72 else{ 73 Out.prln("Warning: unknown orthography value: " + kindOrth); 74 } 75 } 76 return null; 77 } 78 } 79 } 80 } 81 previousOffset = dataCollector.previousOffset(previousOffset); 82 } 83 //could not find the token 84 return null; 85 } 86 87 /** 88 * This method will find the POS category for tokens covered by the instance 89 * annotation and tokens that are part of the right context. 90 * @param data the instance annotation 91 * @return the POS category as a string. 92 */ 93 protected Object getInsideOrthValue(Object data){ 94 //the data is an annotation in this case. 95 Annotation ann = (Annotation)data; 96 Long endOffset = ann.getEndNode().getOffset(); 97 Long nextOffset = ann.getStartNode().getOffset(); 98 int skippedTokens = 0; 99 while(nextOffset != null && 100 ((!ignoreRightContext) || (nextOffset.compareTo(endOffset) < 0)) && 101 skippedTokens < position){ 102 //advance offset skipping all tokens found 103 Set startingAnnots = dataCollector.getStartingAnnotations(nextOffset); 104 if(startingAnnots != null && (!startingAnnots.isEmpty())){ 105 Iterator annIter = startingAnnots.iterator(); 106 while(annIter.hasNext()){ 107 Annotation annotation = (Annotation)annIter.next(); 108 if(annotation.getType().equals(ANNIEConstants.TOKEN_ANNOTATION_TYPE)){ 109 skippedTokens++; 110 if(skippedTokens == position){ 111 //the token we just skipped was the one we needed 112 if(annotation.getFeatures() != null){ 113 String kindOrth = (String)annotation.getFeatures(). 114 get(ANNIEConstants.TOKEN_KIND_FEATURE_NAME); 115 String orth = (String)annotation.getFeatures(). 116 get(ANNIEConstants.TOKEN_ORTH_FEATURE_NAME); 117 if(orth != null && orth.length() > 0){ 118 kindOrth += ":" + orth; 119 } 120 if(orthValues.contains(kindOrth)) return kindOrth; 121 else{ 122 Out.prln("Warning: unknown orthography value: " + kindOrth); 123 } 124 } 125 return null; 126 } 127 } 128 } 129 } 130 nextOffset = dataCollector.nextOffset(nextOffset); 131 } 132 //could not find the token 133 return null; 134 } 135 136 /** 137 * Sets the (1-based) location of the word inside the instance that this 138 * extractor targets. 139 * Negative positions mean tokens in the right context. 140 * Position cannot be zero! 141 * @param position an int value. 142 */ 143 public void setPosition(int position){ 144 this.position = position; 145 } 146 147 public void setIgnoreRightContext(boolean ignoreRightContext) { 148 this.ignoreRightContext = ignoreRightContext; 149 } 150 151 public boolean isIgnoreRightContext() { 152 return ignoreRightContext; 153 } 154 155 /** 156 * The 1-based position of the Token (for which the POS will gbe extracted) 157 * inside the instance annotation. 158 */ 159 protected int position; 160 161 /** 162 * Used internally for easy element-of tests 163 */ 164 private List orthValues = Arrays.asList(ORTHOGRAPHY_VALUES); 165 166 static protected final String[] ORTHOGRAPHY_VALUES = new String[] 167 {"word:upperInitial", "word:allCaps", "word:lowercase", 168 "word:mixedCaps", "number", "symbol", "punctuation", "word:apostrophe"}; 169 170 private boolean ignoreRightContext = true;}
|
TokenOrthographyExtractor |
|