1   /*
2    * FlexibleGazetteer.java
3    *
4    * Copyright (c) 2004, The University of Sheffield.
5    *
6    * This file is part of GATE (see http://gate.ac.uk/), and is free
7    * software, licenced under the GNU Library General Public License,
8    * Version 2, June1991.
9    *
10   * A copy of this licence is included in the distribution in the file
11   * licence.html, and is also available at http://gate.ac.uk/gate/licence.html.
12   *
13   * Niraj Aswani 02/2002
14   *
15   */
16  
17  package gate.creole.gazetteer;
18  
19  import java.util.*;
20  import gate.util.*;
21  import gate.*;
22  import gate.creole.*;
23  
24  /**
25   * <p>Title: Flexible Gazetteer </p>
26       * <p> The Flexible Gazetteer provides users with the flexibility to choose </p>
27   * <p> their own customized input and an external Gazetteer. For example, </p>
28   * <p> the user might want to replace words in the text with their base </p>
29   * <p> forms (which is an output of the Morphological Analyser) or to segment </p>
30   * <p> a Chinese text (using the Chinese Tokeniser) before running the </p>
31   * <p> Gazetteer on the Chinese text. </p>
32   *
33       * <p> The Flexible Gazetteer performs lookup over a document based on the  </p>
34   * <p> values of an arbitrary feature of an arbitrary annotation type, by </p>
35   * <p> using an externally provided gazetteer. It is important to use an </p>
36   * <p> external gazetteer as this allows the use of any type of gazetteer </p>
37   * <p> (e.g. an Ontological gazetteer). </p>
38   * <p>Copyright: Copyright (c) 2003</p>
39   * <p>Company: </p>
40   * @author not attributable
41   * @version 1.0
42   */
43  
44  public class FlexibleGazetteer
45      extends AbstractLanguageAnalyser
46      implements ProcessingResource {
47  
48    /**
49     * Constructor
50     */
51    public FlexibleGazetteer() {
52      changedNodes = new ArrayList();
53    }
54  
55    /** Does the actual loading and parsing of the lists. This method must be
56     * called before the gazetteer can be used
57     */
58    public Resource init() throws ResourceInstantiationException {
59  
60      /*
61      if (listsURL == null) {
62        throw new ResourceInstantiationException(
63            "No URL provided for gazetteer creation!");
64      }
65  
66      if (gazetteerClassName == null) {
67        throw new ResourceInstantiationException(
68            "No Gazetter Name provided");
69      }
70      */
71      return this;
72    }
73  
74    /**
75     * This method runs the gazetteer. It assumes that all the needed parameters
76     * are set. If they are not, an exception will be fired.
77     */
78    public void execute() throws ExecutionException {
79      fireProgressChanged(0);
80      fireStatusChanged("Checking Document...");
81      if (document == null) {
82        throw new ExecutionException(
83            "No document to process!"
84            );
85      }
86  
87      fireStatusChanged("Creating temporary Document...");
88      StringBuffer newdocString = new StringBuffer(document.getContent().toString());
89      Document tempDoc = null;
90      boolean chineseSplit = false;
91  
92      if (inputFeatureNames == null || inputFeatureNames.size() == 0) {
93        inputFeatureNames = new ArrayList();
94      }
95  
96      Iterator tokenIter = getTokenIterator(document, inputAnnotationSetName);
97      long totalDeductedSpaces = 0;
98      fireStatusChanged("Replacing contents with the feature value...");
99  
100     outer:while (tokenIter != null && tokenIter.hasNext()) {
101       Annotation currentToken = (Annotation) tokenIter.next();
102 
103       // check if it is a chinesesplit
104       // if it is, replace no space character with a single space
105       if (currentToken.getType().equals(ANNIEConstants.
106                                         SPACE_TOKEN_ANNOTATION_TYPE) &&
107           ( (String) (currentToken.getFeatures().get(ANNIEConstants.
108           TOKEN_KIND_FEATURE_NAME))).equals("ChineseSplit")) {
109 
110         // for chinese split startnode and end node are same
111         long startOffset = currentToken.getStartNode().getOffset().
112                            longValue();
113 
114         // because we are adding a space in place of chinesesplit
115         // the endoffset will become newStartOffset + 1
116         long newStartOffset = startOffset - totalDeductedSpaces;
117         long newEndOffset = newStartOffset + 1;
118         NodePosition newNode = new NodePosition(startOffset, startOffset,
119                                                 newStartOffset, newEndOffset,
120                                                 totalDeductedSpaces);
121         chineseSplit = true;
122 
123         // here is the addition of space in the document
124         totalDeductedSpaces--;
125         changedNodes.add(newNode);
126         newdocString = newdocString.insert( (int) newStartOffset, ' ');
127         continue outer;
128       }
129 
130       // search in the provided inputFeaturesNames
131       // if the current token has a feature value that user
132       // wants to paste on and replace the original string of the token
133       inner:for (int i = 0; i < inputFeatureNames.size(); i++) {
134         String[] keyVal = ( (String) (inputFeatureNames.get(i))).split("[.]");
135 
136         if (keyVal.length == 2) {
137           // val is the feature name
138           // key is the annotationName
139           if (currentToken.getType().equals(keyVal[0])) {
140             FeatureMap features = currentToken.getFeatures();
141             String newTokenValue = (String) (features.get(keyVal[1]));
142 
143             // what if provided feature doesnot exist
144             if (newTokenValue == null) {
145               continue;
146 
147             }
148             else {
149               // feature value found so we need to replace it
150               // find the start and end offsets for this token
151               long startOffset = currentToken.getStartNode().getOffset().
152                                  longValue();
153               long endOffset = currentToken.getEndNode().getOffset().
154                                longValue();
155 
156               // what is the actual string
157               String actualString = (String) (features.get(ANNIEConstants.
158                   TOKEN_STRING_FEATURE_NAME));
159 
160               // if the feature value and the actual string both are same
161               // we don't need to replace it
162               if (actualString.equals(newTokenValue)) {
163                 // there is no need to change anything for this
164                 break inner;
165               }
166 
167               // let us find the difference between the lengths of the
168               // actual string and the newTokenValue
169               long lengthDifference = actualString.length() -
170                                       newTokenValue.length();
171 
172               // so lets find the new startOffset and endOffset
173               long newStartOffset = startOffset - totalDeductedSpaces;
174               long newEndOffset = newStartOffset + newTokenValue.length();
175 
176               // and make the entry for this
177               NodePosition newNode = new NodePosition(startOffset,
178                   endOffset,
179                   newStartOffset, newEndOffset, totalDeductedSpaces);
180               changedNodes.add(newNode);
181               // how many spaces have been added or removed till the current
182               // position of the token
183               totalDeductedSpaces += lengthDifference;
184 
185               // and finally replace the actual string in the document
186               // with the new document
187               newdocString = newdocString.replace( (int) newStartOffset,
188                                                   (int) newStartOffset +
189                                                   actualString.length(),
190                                                   newTokenValue);
191               break inner;
192             }
193           }
194         }
195       }
196     }
197 
198     fireStatusChanged("New Document to be processed with Gazetteer...");
199     try {
200       FeatureMap params = Factory.newFeatureMap();
201       params.put("stringContent", newdocString.toString());
202       FeatureMap features = Factory.newFeatureMap();
203       Gate.setHiddenAttribute(features, true);
204       tempDoc = (Document) Factory.createResource("gate.corpora.DocumentImpl",
205                                                   params, features);
206     }
207     catch (ResourceInstantiationException rie) {
208       throw new ExecutionException("Temporary document cannot be created");
209     }
210 
211     // lets create the gazetteer based on the provided gazetteer name
212     FeatureMap params = Factory.newFeatureMap();
213     gazetteerInst.setDocument(tempDoc);
214     gazetteerInst.setAnnotationSetName(this.outputAnnotationSetName);
215 
216     fireStatusChanged("Executing Gazetteer...");
217     gazetteerInst.execute();
218 
219     // now the tempDoc has been looked up, we need to shift the tokens from
220     // this temp document to the original document
221     fireStatusChanged("Transfering new tags to the original one...");
222     Iterator tokensIter = getTokenIterator(tempDoc, outputAnnotationSetName);
223     AnnotationSet original = (outputAnnotationSetName == null) ?
224                              document.getAnnotations() :
225                              document.getAnnotations(outputAnnotationSetName);
226     long totalSpaceAdded = 0;
227     long difference = 0;
228 
229     int foundNode = -1;
230     while (tokensIter != null && tokensIter.hasNext()) {
231       Annotation currentToken = (Annotation) (tokensIter.next());
232       long startOffset = currentToken.getStartNode().getOffset().longValue();
233       long endOffset = currentToken.getEndNode().getOffset().longValue();
234 
235       // search through the changedNodes and if it is found we will have to
236       // find the new offsets
237       int i = foundNode + 1;
238       boolean found = false;
239       inner1:for (; i < changedNodes.size(); i++) {
240 
241         NodePosition tempNode = (NodePosition) (changedNodes.get(i));
242 
243         // all the nodes are in the sorted order based on there offsets
244         // so if we reach beyond the position of the current text
245         // under consideration, simply terminate the loop
246         if (tempNode.getNewStartNode() > startOffset) {
247           // so we lets point to the node whose startOffset
248           // is less than the startOffset of the current node
249           // this will allow us to find out how many
250           // extra spaces were added or removed before the current token
251           i = i - 1;
252           break inner1;
253         }
254 
255         // how do we know if we want to change the offset
256         if (tempNode.getNewStartNode() == startOffset) {
257           // yes it is available
258 
259           // lets find the end node
260           int k = i;
261           for (;
262                k >= 0 && k < changedNodes.size() &&
263                endOffset >
264                ( (NodePosition) changedNodes.get(k)).getNewStartNode(); k++)
265             ;
266           long spacesToAdd = 0;
267           if (k - 1 == i && k - 1 >= 0) {
268             spacesToAdd = (tempNode.getOldEndNode() - tempNode.getNewEndNode());
269           }
270           else if (k - 1 < 0) {
271             spacesToAdd = 0;
272           }
273           else {
274             spacesToAdd = ( (NodePosition) changedNodes.get(k - 1)).
275                           getOldEndNode() -
276                           ( (NodePosition) changedNodes.get(k - 1)).
277                           getNewEndNode();
278           }
279 
280           // and how many to be added before the endnode
281           // as any look up notation can be for the text with one or more tokens
282           FeatureMap newFeatureMap = currentToken.getFeatures();
283           try {
284 
285             original.add(new Long(startOffset +
286                                   (tempNode.getOldStartNode() -
287                                    tempNode.getNewStartNode())),
288                          new Long(endOffset + spacesToAdd),
289                          //new Long(endOffset + (tempNode.getOldEndNode()
290                          //          - tempNode.getNewEndNode())),
291                          ANNIEConstants.LOOKUP_ANNOTATION_TYPE,
292                          newFeatureMap);
293 
294           }
295           catch (InvalidOffsetException ioe) {
296             throw new ExecutionException("Offset Error");
297           }
298           found = true;
299           foundNode = i;
300           break inner1;
301         }
302       }
303 
304       if (!found) {
305         long totalStartSpaces = 0;
306         long totalEndSpaces = 0;
307 
308         // check if we have reached at the end of the changedNodes
309         // if yes we need to find the last node
310         i = (changedNodes.size() == i) ? i - 1 : i;
311 
312         // lets find the end node
313         int k = i;
314         for (;
315              k > 0 && k < changedNodes.size() &&
316              endOffset > ( (NodePosition) changedNodes.get(k)).getNewStartNode();
317              k++)
318           ;
319         long spacesToAdd = 0;
320         if (k - 1 == i && k - 1 >= 0) {
321           spacesToAdd = ( ( (NodePosition) changedNodes.get(i)).getOldEndNode() -
322                          ( (NodePosition) changedNodes.get(i)).getNewEndNode());
323         }
324         else if (k - 1 < 0) {
325           spacesToAdd = 0;
326         }
327         else {
328           spacesToAdd = ( (NodePosition) changedNodes.get(k - 1)).
329                         getOldEndNode() -
330                         ( (NodePosition) changedNodes.get(k - 1)).getNewEndNode();
331         }
332 
333         if (i >= 0) {
334           //totalStartSpaces = ((NodePosition)
335           // changedNodes.get(i)).getOldStartNode()
336           // - ((NodePosition) changedNodes.get(i)).getNewStartNode();
337           totalStartSpaces = ( (NodePosition) changedNodes.get(i)).
338                              getOldEndNode() -
339                              ( (NodePosition) changedNodes.get(i)).
340                              getNewEndNode();
341           //totalEndSpaces = ((NodePosition)
342           // changedNodes.get(i)).getOldEndNode() -
343           // ((NodePosition) changedNodes.get(i)).getNewEndNode();
344           totalEndSpaces = spacesToAdd;
345           foundNode = i;
346         }
347 
348         // no it is not available
349         FeatureMap newFeatureMap = currentToken.getFeatures();
350         try {
351           original.add(new Long(startOffset + totalStartSpaces),
352                        new Long(endOffset + totalEndSpaces),
353                        ANNIEConstants.LOOKUP_ANNOTATION_TYPE,
354                        newFeatureMap);
355         }
356         catch (InvalidOffsetException ioe) {
357           throw new ExecutionException("Offset Error");
358         }
359 
360       }
361     }
362 
363     // now remove the newDoc
364     Factory.deleteResource(tempDoc);
365     fireProcessFinished();
366   }
367 
368   /**
369    * Sets the document to work on
370    * @param doc
371    */
372   public void setDocument(gate.Document doc) {
373     this.document = doc;
374   }
375 
376   /**
377    * Returns the document set up by user to work on
378    * @return a {@link Document}
379    */
380   public gate.Document getDocument() {
381     return this.document;
382   }
383 
384   /**
385    * sets the outputAnnotationSetName
386    * @param annName
387    */
388   public void setOutputAnnotationSetName(String annName) {
389     this.outputAnnotationSetName = annName;
390   }
391 
392   /**
393    * Returns the outputAnnotationSetName
394    * @return a {@link String} value.
395    */
396   public String getOutputAnnotationSetName() {
397     return this.outputAnnotationSetName;
398   }
399 
400   /**
401    * sets the inputAnnotationSetName
402    * @param annName
403    */
404   public void setInputAnnotationSetName(String annName) {
405     this.inputAnnotationSetName = annName;
406   }
407 
408   /**
409    * Returns the inputAnnotationSetName
410    * @return a {@link String} value.
411    */
412   public String getInputAnnotationSetName() {
413     return this.inputAnnotationSetName;
414   }
415 
416   /**
417    * Feature names for example: Token.string, Token.root etc... Values of these
418        * features should be used to replace the actual string of these features. This
419    * method allows a user to set the name of such features
420    * @param inputs
421    */
422   public void setInputFeatureNames(java.util.List inputs) {
423     this.inputFeatureNames = inputs;
424   }
425 
426   /**
427    * Returns the feature names that are provided by the user to use their values
428    * to replace their actual strings in the document
429    * @return a {@link List} value.
430    */
431   public java.util.List getInputFeatureNames() {
432     return this.inputFeatureNames;
433   }
434 
435   public Gazetteer getGazetteerInst() {
436     return this.gazetteerInst;
437   }
438 
439   public void setGazetteerInst(gate.creole.gazetteer.Gazetteer gazetteerInst) {
440     this.gazetteerInst = gazetteerInst;
441   }
442 
443   /**
444    * This method takes the document and the annotationSetName and then creates
445    * a interator for the annotations available in the document under the
446    * provided annotationSetName
447    * @param doc
448    * @param annotationSetName
449    * @return an {@link Iterator}
450    */
451   public Iterator getTokenIterator(gate.Document doc, String annotationSetName) {
452     AnnotationSet inputAs = (annotationSetName == null) ? doc.getAnnotations() :
453                             doc.getAnnotations(annotationSetName);
454     AnnotationSet tempSet = inputAs.get();
455     if(tempSet == null)
456       return null;
457 
458     List tokens = new ArrayList(inputAs.get());
459 
460     if(tokens == null)
461       return null;
462 
463     Comparator offsetComparator = new OffsetComparator();
464     Collections.sort(tokens, offsetComparator);
465     Iterator tokenIter = tokens.iterator();
466     return tokenIter;
467   }
468 
469   // Gazetteer Runtime parameters
470   private gate.Document document;
471   private java.lang.String outputAnnotationSetName;
472   private java.lang.String inputAnnotationSetName;
473 
474   // Flexible Gazetteer parameter
475   private Gazetteer gazetteerInst;
476   private java.util.List inputFeatureNames;
477 
478   // parameters required within the program
479   private ArrayList changedNodes;
480 }