1   /*
2    *  Copyright (c) 1998-2001, The University of Sheffield.
3    *
4    *  This file is part of GATE (see http://gate.ac.uk/), and is free
5    *  software, licenced under the GNU Library General Public License,
6    *  Version 2, June 1991 (in the distribution as file licence.html,
7    *  and also available at http://gate.ac.uk/gate/licence.html).
8    *
9    *  Valentin Tablan, 01 Feb 2000
10   *
11   *  $Id: SentenceSplitter.java,v 1.24 2002/03/06 17:15:44 kalina Exp $
12   */
13  
14  
15  package gate.creole.splitter;
16  
17  import gate.*;
18  import gate.util.*;
19  import gate.event.*;
20  import gate.creole.tokeniser.*;
21  import gate.creole.gazetteer.*;
22  import gate.creole.*;
23  
24  import java.util.*;
25  /**
26   * A sentence splitter. This is module similar to a
27   * {@link gate.creole.nerc.Nerc} in the fact that it conatins a tokeniser, a
28   * gazetteer and a Jape grammar. This class is used so we can have a different
29   * entry in the creole.xml file describing the default resources and to add
30   * some minor processing after running the components in order to extract the
31   * results in a usable form.
32   */
33  public class SentenceSplitter extends AbstractLanguageAnalyser{
34  
35    public static final String
36      SPLIT_DOCUMENT_PARAMETER_NAME = "document";
37  
38    public static final String
39      SPLIT_INPUT_AS_PARAMETER_NAME = "inputASName";
40  
41    public static final String
42      SPLIT_OUTPUT_AS_PARAMETER_NAME = "outputASName";
43  
44    public static final String
45      SPLIT_ENCODING_PARAMETER_NAME = "encoding";
46  
47    public static final String
48      SPLIT_GAZ_URL_PARAMETER_NAME = "gazetteerListsURL";
49  
50    public static final String
51      SPLIT_TRANSD_URL_PARAMETER_NAME = "transducerURL";
52  
53    public Resource init()throws ResourceInstantiationException{
54      //create all the componets
55      FeatureMap params;
56      FeatureMap features;
57  
58      //gazetteer
59      fireStatusChanged("Creating the gazetteer");
60      params = Factory.newFeatureMap();
61      if(gazetteerListsURL != null)
62        params.put(DefaultGazetteer.DEF_GAZ_LISTS_URL_PARAMETER_NAME,
63                                               gazetteerListsURL);
64      params.put(DefaultGazetteer.DEF_GAZ_ENCODING_PARAMETER_NAME, encoding);
65      features = Factory.newFeatureMap();
66      Gate.setHiddenAttribute(features, true);
67  
68  
69      gazetteer = (DefaultGazetteer)Factory.createResource(
70                      "gate.creole.gazetteer.DefaultGazetteer",
71                      params, features);
72      gazetteer.setName("Gazetteer " + System.currentTimeMillis());
73      fireProgressChanged(10);
74  
75      //transducer
76      fireStatusChanged("Creating the JAPE transducer");
77  
78      params = Factory.newFeatureMap();
79      if(transducerURL != null)
80        params.put(Transducer.TRANSD_GRAMMAR_URL_PARAMETER_NAME, transducerURL);
81      params.put(Transducer.TRANSD_ENCODING_PARAMETER_NAME, encoding);
82      features = Factory.newFeatureMap();
83      Gate.setHiddenAttribute(features, true);
84  
85      transducer = (Transducer)Factory.createResource(
86                      "gate.creole.Transducer",
87                      params, features);
88      transducer.setName("Transducer " + System.currentTimeMillis());
89  
90      fireProgressChanged(100);
91      fireProcessFinished();
92  
93      return this;
94    }
95  
96    public void execute() throws ExecutionException{
97      interrupted = false;
98      //set the runtime parameters
99      FeatureMap params;
100     if(inputASName != null && inputASName.equals("")) inputASName = null;
101     if(outputASName != null && outputASName.equals("")) outputASName = null;
102     try{
103       fireProgressChanged(0);
104       params = Factory.newFeatureMap();
105       params.put(DefaultGazetteer.DEF_GAZ_DOCUMENT_PARAMETER_NAME, document);
106       params.put(DefaultGazetteer.DEF_GAZ_ANNOT_SET_PARAMETER_NAME, inputASName);
107       gazetteer.setParameterValues(params);
108 
109       params = Factory.newFeatureMap();
110       params.put(Transducer.TRANSD_DOCUMENT_PARAMETER_NAME, document);
111       params.put(Transducer.TRANSD_INPUT_AS_PARAMETER_NAME, inputASName);
112       params.put(Transducer.TRANSD_OUTPUT_AS_PARAMETER_NAME, inputASName);
113       transducer.setParameterValues(params);
114     }catch(Exception e){
115       throw new ExecutionException(e);
116     }
117     ProgressListener pListener = null;
118     StatusListener sListener = null;
119     fireProgressChanged(5);
120 
121     //run the gazetteer
122     if(isInterrupted()) throw new ExecutionInterruptedException(
123         "The execution of the \"" + getName() +
124         "\" sentence splitter has been abruptly interrupted!");
125     pListener = new IntervalProgressListener(5, 10);
126     sListener = new StatusListener(){
127       public void statusChanged(String text){
128         fireStatusChanged(text);
129       }
130     };
131     gazetteer.addProgressListener(pListener);
132     gazetteer.addStatusListener(sListener);
133     gazetteer.execute();
134     gazetteer.removeProgressListener(pListener);
135     gazetteer.removeStatusListener(sListener);
136 
137     //run the transducer
138     if(isInterrupted()) throw new ExecutionInterruptedException(
139         "The execution of the \"" + getName() +
140         "\" sentence splitter has been abruptly interrupted!");
141     pListener = new IntervalProgressListener(11, 90);
142     transducer.addProgressListener(pListener);
143     transducer.addStatusListener(sListener);
144     transducer.execute();
145     transducer.removeProgressListener(pListener);
146     transducer.removeStatusListener(sListener);
147 
148     //get pointers to the annotation sets
149     AnnotationSet inputAS = (inputASName == null) ?
150                             document.getAnnotations() :
151                             document.getAnnotations(inputASName);
152 
153     AnnotationSet outputAS = (outputASName == null) ?
154                              document.getAnnotations() :
155                              document.getAnnotations(outputASName);
156 
157     //copy the results to the output set if they are different
158     if(inputAS != outputAS){
159       outputAS.addAll(inputAS.get(SENTENCE_ANNOTATION_TYPE));
160     }
161 
162     //create one big sentence if none were found
163     AnnotationSet sentences = outputAS.get(SENTENCE_ANNOTATION_TYPE);
164     if(sentences == null || sentences.isEmpty()){
165       outputAS.add(outputAS.firstNode(), outputAS.lastNode(),
166                    SENTENCE_ANNOTATION_TYPE,
167                    Factory.newFeatureMap());;
168     }else{
169       //add a sentence covering all the tokens after the last sentence
170       Long endSentences = sentences.lastNode().getOffset();
171       AnnotationSet remainingTokens = inputAS.get(TOKEN_ANNOTATION_TYPE, endSentences,
172                                                   inputAS.lastNode().getOffset());
173       if(remainingTokens != null && !remainingTokens.isEmpty()){
174         try{
175           outputAS.add(remainingTokens.firstNode().getOffset(),
176                        remainingTokens.lastNode().getOffset(),
177                        SENTENCE_ANNOTATION_TYPE,
178                        Factory.newFeatureMap());
179         }catch(InvalidOffsetException ioe){
180           throw new ExecutionException(ioe);
181         }
182       }
183     }
184     fireProcessFinished();
185   }//execute()
186 
187   /**
188    * Notifies all the PRs in this controller that they should stop their
189    * execution as soon as possible.
190    */
191   public synchronized void interrupt(){
192     interrupted = true;
193     gazetteer.interrupt();
194     transducer.interrupt();
195   }
196 
197   public void setTransducerURL(java.net.URL newTransducerURL) {
198     transducerURL = newTransducerURL;
199   }
200   public java.net.URL getTransducerURL() {
201     return transducerURL;
202   }
203   DefaultGazetteer gazetteer;
204   Transducer transducer;
205   private java.net.URL transducerURL;
206   private String encoding;
207   private java.net.URL gazetteerListsURL;
208 
209 
210   public void setEncoding(String newEncoding) {
211     encoding = newEncoding;
212   }
213   public String getEncoding() {
214     return encoding;
215   }
216   public void setGazetteerListsURL(java.net.URL newGazetteerListsURL) {
217     gazetteerListsURL = newGazetteerListsURL;
218   }
219   public java.net.URL getGazetteerListsURL() {
220     return gazetteerListsURL;
221   }
222   public void setInputASName(String newInputASName) {
223     inputASName = newInputASName;
224   }
225 
226   public String getInputASName() {
227     return inputASName;
228   }
229   public void setOutputASName(String newOutputASName) {
230     outputASName = newOutputASName;
231   }
232   public String getOutputASName() {
233     return outputASName;
234   }
235 
236 
237 
238   private static final boolean DEBUG = false;
239   private String inputASName;
240   private String outputASName;
241 }//public class SentenceSplitter extends Nerc