|
POSCategoryExtractor |
|
/*
 *  Copyright (c) 1998-2001, The University of Sheffield.
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  Valentin Tablan 28 May 2002
 *
 *  $Id: POSCategoryExtractor.java,v 1.1 2002/06/27 17:12:32 valyt Exp $
 */
package gate.ml;

import java.util.*;

import weka.core.*;


import gate.*;
import gate.util.*;
import gate.creole.ANNIEConstants;

/**
 * Extracts the POS category of the n-th word relative to the instance
 * annotation. Positive positions count tokens forward from the start of the
 * instance; negative positions count tokens backward from the start of the
 * instance (the left context).
 */
public class POSCategoryExtractor extends AbstractAttributeExtractor {

  /**
   * Builds the attribute describing the values this extractor can produce.
   * The attribute is named <code>POS(position)</code> and its values are the
   * entries of {@link #POS_CATEGORIES}, in the same order.
   * @return a newly created attribute.
   */
  public Attribute getAttribute() {
    FastVector values = new FastVector(POS_CATEGORIES.length);
    for(int i = 0; i < POS_CATEGORIES.length; i++)
      values.addElement(POS_CATEGORIES[i]);
    return new Attribute("POS(" + position + ")", values);
  }

  /**
   * Gets the POS category of the targeted token for one instance.
   * Positive positions are resolved by {@link #getInsidePOSValue(Object)},
   * all other positions by {@link #getLeftContextPOS(Object)}. A position of
   * zero never matches a token, so <code>null</code> is returned for it.
   * @param data the instance annotation.
   * @return the POS category, or <code>null</code> if the token could not be
   * found or its category is not one of {@link #POS_CATEGORIES}.
   */
  public Object getAttributeValue(Object data){
    if(position > 0) return getInsidePOSValue(data);
    else return getLeftContextPOS(data);
  }

  /**
   * This method will find POS category for tokens in the left context of the
   * target annotation (where position is negative).
   * @param data the instance annotation.
   * @return the POS category of the token found <code>-position</code> tokens
   * before the start of the instance, or <code>null</code> if there is no
   * such token or its category is unknown.
   */
  protected Object getLeftContextPOS(Object data){
    //the data is an annotation in this case.
    Annotation ann = (Annotation)data;
    //number of tokens to walk back; zero or less means nothing is searched
    int wantedTokens = -position;
    int skippedTokens = 0;
    //we start looking for Tokens going backwards from the annotation start.
    Long previousOffset = dataCollector.previousOffset(
                                        ann.getStartNode().getOffset());
    while(previousOffset != null && skippedTokens < wantedTokens){
      Set startingAnnots = dataCollector.getStartingAnnotations(previousOffset);
      if(startingAnnots != null){
        Iterator annIter = startingAnnots.iterator();
        while(annIter.hasNext()){
          Annotation annotation = (Annotation)annIter.next();
          if(annotation.getType().equals(ANNIEConstants.TOKEN_ANNOTATION_TYPE)){
            skippedTokens++;
            //the token we just skipped was the one we needed
            if(skippedTokens == wantedTokens) return getKnownPOS(annotation);
          }
        }
      }
      previousOffset = dataCollector.previousOffset(previousOffset);
    }
    //could not find the token
    return null;
  }

  /**
   * This method will find the POS category for tokens covered by the instance
   * annotation and tokens that are part of the right context.
   * Tokens are counted from the start offset of the instance. While
   * <code>ignoreRightContext</code> is set, only offsets strictly before the
   * end offset of the instance are examined.
   * @param data the instance annotation
   * @return the POS category as a string, or <code>null</code> if there is
   * no such token or its category is unknown.
   */
  protected Object getInsidePOSValue(Object data){
    //the data is an annotation in this case.
    Annotation ann = (Annotation)data;
    Long endOffset = ann.getEndNode().getOffset();
    Long nextOffset = ann.getStartNode().getOffset();
    int skippedTokens = 0;
    while(nextOffset != null &&
          ((!ignoreRightContext) || (nextOffset.compareTo(endOffset) < 0)) &&
          skippedTokens < position){
      //advance offset skipping all tokens found
      Set startingAnnots = dataCollector.getStartingAnnotations(nextOffset);
      if(startingAnnots != null){
        Iterator annIter = startingAnnots.iterator();
        while(annIter.hasNext()){
          Annotation annotation = (Annotation)annIter.next();
          if(annotation.getType().equals(ANNIEConstants.TOKEN_ANNOTATION_TYPE)){
            skippedTokens++;
            //the token we just skipped was the one we needed
            if(skippedTokens == position) return getKnownPOS(annotation);
          }
        }
      }
      nextOffset = dataCollector.nextOffset(nextOffset);
    }
    //could not find the token
    return null;
  }

  /**
   * Reads the category feature of a token and checks it against the list of
   * known POS categories. A warning is printed when the token has features
   * but its category is missing or not recognised. Values that are not
   * strings are treated as unrecognised rather than causing a
   * ClassCastException.
   * @param token the token annotation to read the category from.
   * @return the category if it is one of {@link #POS_CATEGORIES},
   * <code>null</code> otherwise.
   */
  private Object getKnownPOS(Annotation token){
    if(token.getFeatures() == null) return null;
    Object pos = token.getFeatures().
                 get(ANNIEConstants.TOKEN_CATEGORY_FEATURE_NAME);
    if(posValues.contains(pos)) return pos;
    Out.prln("Warning: unknown POS category: " + pos);
    return null;
  }

  /**
   * Sets the (1-based) location of the word that this extractor targets.
   * Positive positions count tokens starting from the beginning of the
   * instance annotation.
   * Negative positions mean tokens in the left context, counted backwards
   * from the start of the instance annotation.
   * Position cannot be zero! (This is not enforced here; with a zero position
   * no token is ever matched.)
   * @param position an int value.
   */
  public void setPosition(int position){
    this.position = position;
  }

  /**
   * Sets whether the search for positive positions stops at the end of the
   * instance annotation.
   * @param ignoreRightContext <code>true</code> to only look at offsets
   * before the end of the instance, <code>false</code> to also continue into
   * the text that follows it.
   */
  public void setIgnoreRightContext(boolean ignoreRightContext) {
    this.ignoreRightContext = ignoreRightContext;
  }

  /**
   * Tells whether the search for positive positions stops at the end of the
   * instance annotation.
   * @return the current value of the flag.
   */
  public boolean isIgnoreRightContext() {
    return ignoreRightContext;
  }

  /**
   * The 1-based position of the Token (for which the POS will be extracted)
   * relative to the instance annotation; negative values address the left
   * context.
   */
  protected int position;

  /**
   * Used internally for easy element-of tests
   */
  private List posValues = Arrays.asList(POS_CATEGORIES);

  /**
   * The POS categories accepted as attribute values. The order matters: it
   * is the order in which the values are added to the attribute built by
   * {@link #getAttribute()}.
   */
  static protected final String[] POS_CATEGORIES = new String[]
        {"NN", "NNP", "NNPS", "NNS", "NP", "NPS", "JJ", "JJR", "JJS",
         "JJSS", "RB", "RBR", "RBS", "VB", "VBD", "VBG", "VBN",
         "VBP", "VBZ", "FW", "CD", "CC", "DT", "EX", "IN", "LS",
         "MD", "PDT", "POS", "PP", "PRP", "PRP$", "PRPR$", "RP",
         "TO", "UH", "WDT", "WP", "WP$", "WRB", "SYM", "\"", "#",
         "$", "'", "(", ")", ",", "--", "-LRB-", ".", "''", ":" ,"::", "`"};

  /**
   * When <code>true</code> (the default), tokens after the end of the
   * instance annotation are not considered for positive positions.
   */
  private boolean ignoreRightContext = true;
}
|
POSCategoryExtractor |
|