1   /*
2    *  Copyright (c) 1998-2001, The University of Sheffield.
3    *
4    *  This file is part of GATE (see http://gate.ac.uk/), and is free
5    *  software, licenced under the GNU Library General Public License,
6    *  Version 2, June 1991 (in the distribution as file licence.html,
7    *  and also available at http://gate.ac.uk/gate/licence.html).
8    *
9    *  Valentin Tablan, 01 Feb 2000
10   *
11   *  $Id: POSTagger.java,v 1.13 2001/10/15 11:21:32 valyt Exp $
12   */
13  
14  package gate.creole;
15  
16  import gate.*;
17  import gate.creole.*;
18  import gate.util.*;
19  import gate.event.*;
20  
21  import hepple.postag.*;
22  
23  import java.util.*;
24  import java.io.*;
25  import java.net.URL;
26  import java.text.NumberFormat;
27  /**
28   * This class is a wrapper for HepTag, Mark Hepple's POS tagger.
29   */
30  public class POSTagger extends AbstractLanguageAnalyser {
31  
32    public POSTagger() {
33    }
34  
35    public Resource init()throws ResourceInstantiationException{
36      if(lexiconURL == null){
37        throw new ResourceInstantiationException(
38          "NoURL provided for the lexicon!");
39      }
40      if(rulesURL == null){
41        throw new ResourceInstantiationException(
42          "No URL provided for the rules!");
43      }
44      try{
45        tagger = new hepple.postag.POSTagger(lexiconURL,rulesURL);
46      }catch(Exception e){
47        throw new ResourceInstantiationException(e);
48      }
49      return this;
50    }
51  
52  
53    public void execute() throws ExecutionException{
54      try{
55        //check the parameters
56        if(document == null) throw new GateRuntimeException(
57          "No document to process!");
58        if(inputASName != null && inputASName.equals("")) inputASName = null;
59        if(outputASName != null && outputASName.equals("")) outputASName = null;
60        AnnotationSet inputAS = (inputASName == null) ?
61                                document.getAnnotations() :
62                                document.getAnnotations(inputASName);
63        AnnotationSet outputAS = (outputASName == null) ?
64                                 document.getAnnotations() :
65                                 document.getAnnotations(outputASName);
66  
67        fireStatusChanged("POS tagging " + document.getName());
68        fireProgressChanged(0);
69        //prepare the input for HepTag
70        //define a comparator for annotations by start offset
71        Comparator offsetComparator = new OffsetComparator();
72        AnnotationSet as = inputAS.get("Sentence");
73        if(as != null && as.size() > 0){
74          List sentences = new ArrayList(as);
75          Collections.sort(sentences, offsetComparator);
76          Iterator sentIter = sentences.iterator();
77          int sentIndex = 0;
78          int sentCnt = sentences.size();
79          long startTime= System.currentTimeMillis();
80          while(sentIter.hasNext()){
81            Annotation sentenceAnn = (Annotation)sentIter.next();
82            AnnotationSet rangeSet = inputAS.get(
83                                      sentenceAnn.getStartNode().getOffset(),
84                                      sentenceAnn.getEndNode().getOffset());
85            if(rangeSet == null) continue;
86            AnnotationSet tokensSet = rangeSet.get("Token");
87            if(tokensSet == null) continue;
88            List tokens = new ArrayList(tokensSet);
89            Collections.sort(tokens, offsetComparator);
90  
91  //          List tokens = (List)sentenceAnn.getFeatures().get("tokens");
92            List sentence = new ArrayList(tokens.size());
93            Iterator tokIter = tokens.iterator();
94            while(tokIter.hasNext()){
95              Annotation token = (Annotation)tokIter.next();
96              String text = (String)token.getFeatures().get("string");
97              sentence.add(text);
98            }//while(tokIter.hasNext())
99  
100           //run the POSTagger over this sentence
101           List sentences4tagger = new ArrayList(1);
102           sentences4tagger.add(sentence);
103           List taggerResults = tagger.runTagger(sentences4tagger);
104           //add the results to the output annotation set
105           //we only get one sentence
106           List sentenceFromTagger = (List)taggerResults.get(0);
107           if(sentenceFromTagger.size() != sentence.size()){
108             String taggerResult = "";
109             for(int i = 0; i< sentenceFromTagger.size(); i++){
110               taggerResult += ((String[])sentenceFromTagger.get(i))[1] + ", ";
111             }
112             throw new GateRuntimeException(
113               "POS Tagger malfunction: the output size (" +
114               sentenceFromTagger.size() +
115               ") is different from the input size (" +
116               sentence.size() + ")!" +
117               "\n Input: " + sentence + "\nOutput: " + taggerResult);
118           }
119           for(int i = 0; i< sentence.size(); i++){
120             String category = ((String[])sentenceFromTagger.get(i))[1];
121             Annotation token = (Annotation)tokens.get(i);
122             token.getFeatures().put("category", category);
123           }//for(i = 0; i<= sentence.size(); i++)
124           fireProgressChanged(sentIndex++ * 100 / sentCnt);
125         }//while(sentIter.hasNext())
126 
127           fireProcessFinished();
128           long endTime = System.currentTimeMillis();
129           fireStatusChanged(document.getName() + " tagged in " +
130                           NumberFormat.getInstance().format(
131                           (double)(endTime - startTime) / 1000) + " seconds!");
132       }else{
133         throw new GateRuntimeException("No sentences to process!\n" +
134                                        "Please run a sentence splitter first!");
135       }//if(as != null && as.size() > 0)
136     }catch(Exception e){
137       throw new ExecutionException(e);
138     }
139   }
140 
141 
142   public void setLexiconURL(java.net.URL newLexiconURL) {
143     lexiconURL = newLexiconURL;
144   }
145   public java.net.URL getLexiconURL() {
146     return lexiconURL;
147   }
148   public void setRulesURL(java.net.URL newRulesURL) {
149     rulesURL = newRulesURL;
150   }
151   public java.net.URL getRulesURL() {
152     return rulesURL;
153   }
154   public void setInputASName(String newInputASName) {
155     inputASName = newInputASName;
156   }
157   public String getInputASName() {
158     return inputASName;
159   }
160   public void setOutputASName(String newOutputASName) {
161     outputASName = newOutputASName;
162   }
163   public String getOutputASName() {
164     return outputASName;
165   }
166 
167   protected hepple.postag.POSTagger tagger;
168   private java.net.URL lexiconURL;
169   private java.net.URL rulesURL;
170   private String inputASName;
171   private String outputASName;
172 }