1   /*
2    *  Copyright (c) 1998-2001, The University of Sheffield.
3    *
4    *  This file is part of GATE (see http://gate.ac.uk/), and is free
5    *  software, licenced under the GNU Library General Public License,
6    *  Version 2, June 1991 (in the distribution as file licence.html,
7    *  and also available at http://gate.ac.uk/gate/licence.html).
8    *
9    *  Valentin Tablan 27 June 2002
10   *
11   *  $Id: TokenOrthographyExtractor.java,v 1.1 2002/06/27 17:12:32 valyt Exp $
12   */
13  package gate.ml;
14  
15  import java.util.*;
16  
17  import weka.core.*;
18  
19  
20  import gate.*;
21  import gate.util.*;
22  import gate.creole.ANNIEConstants;
23  import weka.core.Attribute;
24  
25  public class TokenOrthographyExtractor extends AbstractAttributeExtractor {
26    public Attribute getAttribute() {
27      FastVector values = new FastVector(ORTHOGRAPHY_VALUES.length);
28      for(int i = 0; i < ORTHOGRAPHY_VALUES.length; i++)
29        values.addElement(ORTHOGRAPHY_VALUES[i]);
30      Attribute attribute = new Attribute("Orth(" + position + ")", values);
31      return attribute;
32    }
33  
34    public Object getAttributeValue(Object data){
35      if(position > 0) return getInsideOrthValue(data);
36      else return getLeftContextOrth(data);
37    }
38  
39    /**
40     * This method will find POS category for tokens in the left context of the
41     * target annotation (where position is negative).
42     * @param data
43     * @return
44     */
45    protected Object getLeftContextOrth(Object data){
46      //the data is an annotation in this case.
47      Annotation ann = (Annotation)data;
48      Long previousOffset = dataCollector.previousOffset(
49                                          ann.getStartNode().getOffset());
50      //we start looking for Tokens going backwards from the annotation start.
51      int skippedTokens = 0;
52      while(previousOffset != null &&
53            skippedTokens < -position){
54        Set startingAnnots = dataCollector.getStartingAnnotations(previousOffset);
55        if(startingAnnots != null && (!startingAnnots.isEmpty())){
56          Iterator annIter = startingAnnots.iterator();
57          while(annIter.hasNext()){
58            Annotation annotation = (Annotation)annIter.next();
59            if(annotation.getType().equals(ANNIEConstants.TOKEN_ANNOTATION_TYPE)){
60              skippedTokens++;
61              if(skippedTokens == -position){
62                //the token we just skipped was the one we needed
63                if(annotation.getFeatures() != null){
64                  String kindOrth = (String)annotation.getFeatures().
65                               get(ANNIEConstants.TOKEN_KIND_FEATURE_NAME);
66                  String orth = (String)annotation.getFeatures().
67                               get(ANNIEConstants.TOKEN_ORTH_FEATURE_NAME);
68                  if(orth != null && orth.length() > 0){
69                    kindOrth += ":" + orth;
70                  }
71                  if(orthValues.contains(kindOrth)) return kindOrth;
72                  else{
73                    Out.prln("Warning: unknown orthography value: " + kindOrth);
74                  }
75                }
76                return null;
77              }
78            }
79          }
80        }
81        previousOffset = dataCollector.previousOffset(previousOffset);
82      }
83      //could not find the token
84      return null;
85    }
86  
87    /**
88     * This method will find the POS category for tokens covered by the instance
89     * annotation and tokens that are part of the right context.
90     * @param data the instance annotation
91     * @return the POS category as a string.
92     */
93    protected Object getInsideOrthValue(Object data){
94      //the data is an annotation in this case.
95      Annotation ann = (Annotation)data;
96      Long endOffset = ann.getEndNode().getOffset();
97      Long nextOffset = ann.getStartNode().getOffset();
98      int skippedTokens = 0;
99      while(nextOffset != null &&
100           ((!ignoreRightContext) || (nextOffset.compareTo(endOffset) < 0)) &&
101           skippedTokens < position){
102       //advance offset skipping all tokens found
103       Set startingAnnots = dataCollector.getStartingAnnotations(nextOffset);
104       if(startingAnnots != null && (!startingAnnots.isEmpty())){
105         Iterator annIter = startingAnnots.iterator();
106         while(annIter.hasNext()){
107           Annotation annotation = (Annotation)annIter.next();
108           if(annotation.getType().equals(ANNIEConstants.TOKEN_ANNOTATION_TYPE)){
109             skippedTokens++;
110             if(skippedTokens == position){
111               //the token we just skipped was the one we needed
112               if(annotation.getFeatures() != null){
113                 String kindOrth = (String)annotation.getFeatures().
114                              get(ANNIEConstants.TOKEN_KIND_FEATURE_NAME);
115                 String orth = (String)annotation.getFeatures().
116                              get(ANNIEConstants.TOKEN_ORTH_FEATURE_NAME);
117                 if(orth != null && orth.length() > 0){
118                   kindOrth += ":" + orth;
119                 }
120                 if(orthValues.contains(kindOrth)) return kindOrth;
121                 else{
122                   Out.prln("Warning: unknown orthography value: " + kindOrth);
123                 }
124               }
125               return null;
126             }
127           }
128         }
129       }
130       nextOffset = dataCollector.nextOffset(nextOffset);
131     }
132     //could not find the token
133     return null;
134   }
135 
136   /**
137    * Sets the (1-based) location of the word inside the instance that this
138    * extractor targets.
139    * Negative positions mean tokens in the right context.
140    * Position cannot be zero!
141    * @param position an int value.
142    */
143   public void setPosition(int position){
144     this.position = position;
145   }
146 
147   public void setIgnoreRightContext(boolean ignoreRightContext) {
148     this.ignoreRightContext = ignoreRightContext;
149   }
150 
151   public boolean isIgnoreRightContext() {
152     return ignoreRightContext;
153   }
154 
155   /**
156    * The 1-based position of the Token (for which the POS will gbe extracted)
157    * inside the instance annotation.
158    */
159   protected int position;
160 
161   /**
162    * Used internally for easy element-of tests
163    */
164   private List orthValues = Arrays.asList(ORTHOGRAPHY_VALUES);
165 
166   static protected final String[] ORTHOGRAPHY_VALUES = new String[]
167         {"word:upperInitial", "word:allCaps", "word:lowercase",
168         "word:mixedCaps", "number", "symbol", "punctuation", "word:apostrophe"};
169 
170   private boolean ignoreRightContext = true;}