1   package gate.ml;
2   
3   import java.util.*;
4   
5   import weka.core.*;
6   
7   
8   import gate.*;
9   import gate.util.*;
10  import gate.creole.ANNIEConstants;
11  /**
12   * Detects lookup major and minor types and their location.
13   * This attribute detector is used to detect both lookup types (a nominal
14   * attribute) and their location (a numerical one).
15   * A sequence of calls to {@link #getAttribute()} will return alternatively
16   * the two types of attributes.
17   */
18  public class LookupDetector extends AbstractAttributeExtractor{
19  
20    public LookupDetector() {
21    }
22  
23    public Attribute getAttribute() {
24      Attribute attribute = null;
25      String attributeNameBase = "Lookup-" + ((int)attributesReturned / 2 + 1);
26      if(attributesReturned % 2 == 0){
27        //even value -> Lookup type
28        FastVector values = new FastVector(LOOKUP_TYPES.length);
29        for(int i = 0; i < LOOKUP_TYPES.length; i++)
30          values.addElement(LOOKUP_TYPES[i]);
31        attribute = new Attribute(attributeNameBase,
32                                  values);
33      }else{
34        //odd value ->lookup position
35        attribute = new Attribute(attributeNameBase + " (position)");
36      }
37      attributesReturned++;
38      return attribute;
39    }
40  
41  
42    public Object getAttributeValue(Object data) {
43      if(data == lastAnnotationInstance){
44        if(lastLookupPosition != -1){
45          //this is a second question for the same annotation instance and the
46          //same lookup -> return lookup position
47          Object returnValue = lastLookupPosition == -2 ? null :
48                               new Double(lastLookupPosition);
49          lastLookupPosition = -1;
50          return returnValue;
51        }
52      }else{
53        //new annotation instance
54        lookupsReturned = 0;
55      }
56  
57      //if we reached this point we need to return the lookup type
58  
59      //the data is an annotation in this case.
60      Annotation ann = (Annotation)data;
61      Long endOffset = ann.getEndNode().getOffset();
62      Long nextOffset = ann.getStartNode().getOffset();
63      int skippedLookups = 0;
64      int skippedTokens = 0;
65      while(nextOffset != null &&
66            nextOffset.compareTo(endOffset) < 0){
67        //advance offset skipping all Lookups found until the one that needs
68        //returning
69        Set startingAnnots = dataCollector.getStartingAnnotations(nextOffset);
70        if(startingAnnots != null && (!startingAnnots.isEmpty())){
71          //first count skipped tokens
72          Iterator annIter = startingAnnots.iterator();
73          while(annIter.hasNext()){
74            Annotation annotation = (Annotation)annIter.next();
75            if(annotation.getType().equals(ANNIEConstants.TOKEN_ANNOTATION_TYPE)){
76              skippedTokens++;
77            }
78          }
79  
80          annIter = startingAnnots.iterator();
81          while(annIter.hasNext()){
82            Annotation annotation = (Annotation)annIter.next();
83            if(annotation.getType().equals(ANNIEConstants.LOOKUP_ANNOTATION_TYPE)){
84              skippedLookups++;
85              if(skippedLookups == (lookupsReturned + 1)){
86                //the lookup we just skipped was never returned before
87                //it needs to be returned now
88                String lookupType = (String)annotation.getFeatures().
89                              get(ANNIEConstants.LOOKUP_MAJOR_TYPE_FEATURE_NAME);
90                String minorType = (String)annotation.getFeatures().
91                              get(ANNIEConstants.LOOKUP_MINOR_TYPE_FEATURE_NAME);
92                if(minorType != null) lookupType += ":" + minorType;
93  
94                //save the last annotation instance we examined
95                lastAnnotationInstance = ann;
96                //save the location for the last lookup found
97                lastLookupPosition = skippedTokens;
98                lookupsReturned ++;
99                if(LOOKUP_TYPES_LIST.contains(lookupType)){
100                 return lookupType;
101               }else{
102                 Out.prln("Warning: unknown lookup type: " + lookupType);
103                 return null;
104               }
105             }
106           }
107         }
108       }
109       nextOffset = dataCollector.nextOffset(nextOffset);
110     }
111     //no more lookups
112     lastLookupPosition = -2;
113     lastAnnotationInstance = ann;
114     return null;
115   }
116 
117 
118   /**
119    * This attribute detector is used to detect both lookup types (a nominal
120    * attribute) and their location (a numerical one).
121    * A sequence of calls to {@link #getAttribute()} will return alternatively
122    * the two types of attributes.
123    * This value is used to determine what attribute will be returned based on
124    * its parity.
125    */
126   protected int attributesReturned = 0;
127 
128   /**
129    * This attribute detector can be used repeatedly to get the values for more
130    * than one lookup annotations inside the annotation instance under scrutiny.
131    * This value will mark the number of lookups returned for the current target
132    * entity in order to avoid returning the same value twice.
133    */
134   protected int lookupsReturned = 0;
135 
136   protected Annotation lastAnnotationInstance = null;
137 
138   protected int lastLookupPosition = -1;
139 
140   protected static final String[] LOOKUP_TYPES;
141   protected static final List LOOKUP_TYPES_LIST;
142 
143   static{
144     LOOKUP_TYPES = new String[]{
145     "sport", "stop", "organization", "location:city", "organization:company",
146     "location:country_abbrev", "country_adj", "location:country",
147     "currency_unit:pre_amount", "currency_unit:post_amount", "date_key",
148     "date_unit", "date:day", "organization:departmen", "facility_key_ext",
149     "facility_key", "facility:building", "date:festival", "govern_key",
150     "organization:government", "greeting", "time:hour", "ident_key:pre",
151     "jobtitle", "loc_general_key", "loc_key:post", "loc_key:pre",
152     "location:relig", "date:month", "location:region", "cdg",
153     "organization:newspaper", "number", "date:ordinal", "organization",
154     "org_base", "org_key:cap", "org_key", "org_pre", "spur",
155     "person_first:ambig", "person_ending", "person_first:female", "person_full",
156     "person_first:male", "person_full:relig", "person_full:sci", "phone_prefix",
157     "location:province", "location:racecourse", "spur_ident", "address:street",
158     "surname:prefix", "organization:team", "time:ampm", "time_modifier",
159     "time_unit", "time:zone", "title:female", "title:civilian", "title:male",
160     "title:military", "title:police", "organization:company", "year"};
161 
162     LOOKUP_TYPES_LIST = Arrays.asList(LOOKUP_TYPES);
163   }
164 
165 }