1 package gate.creole.tokeniser.chinesetokeniser;
2
3
19
20 import java.util.*;
21
22 import gate.*;
23 import gate.creole.*;
24 import gate.creole.tokeniser.SimpleTokeniser;
25 import gate.util.*;
26
27
34 public class ChineseTokeniser
35 extends AbstractLanguageAnalyser
36 implements ProcessingResource {
37
38
39 private Segmenter segmenter;
40
41
42 private String encoding;
43
44
45 private gate.Document document;
46
47
48 private gate.Document tempDoc;
49
50
51 private SimpleTokeniser tokeniser;
52
53
54 private Boolean runSegmenter;
55
56
57 private Boolean generateSpaceTokens;
58
59
60 private java.net.URL rulesURL;
61
62 private String annotationSetName;
63
64 private int charform;
65
66
67 public ChineseTokeniser() {
68
69 }
70
71 public Resource init() throws ResourceInstantiationException {
72 fireProgressChanged(0);
73 fireStatusChanged("Loading Data Files...");
74 if (encoding == null) {
76 encoding = "UTF8";
78 }
79 else {
80 if (encoding.equals("BIG5")) {
81 charform = Segmenter.TRAD;
82 }
83 else if (encoding.equals("GBK")) {
84 charform = Segmenter.SIMP;
85 }
86 else if (encoding.equals("UTF8")) {
87 charform = Segmenter.BOTH;
88 }
89 else {
90 encoding = "UTF8";
92 charform = Segmenter.BOTH;
93 }
94 }
95
96 if (rulesURL == null) {
97 throw new ResourceInstantiationException(
98 "No URL provided for the tokeniser rules");
99 }
100 segmenter = new Segmenter(charform, true);
102 fireProcessFinished();
103
104 return this;
106 }
107
108
109 public void reInit() throws ResourceInstantiationException {
110 segmenter = new Segmenter(charform, true);
111 }
112
113
121 public void execute() throws ExecutionException {
122 fireProgressChanged(0);
124
125 if (document == null) {
127 throw new GateRuntimeException("No document to process!");
128 }
129
130 String segmentedData = null;
131
132 if (runSegmenter.booleanValue()) {
134 segmentedData = segmenter.segmentData(
136 document.getContent().toString(),
137 encoding);
138 } else {
139 segmentedData = document.getContent().toString();
140 }
141
142 if(encoding.equals("UTF8")) {
143 encoding = "UTF-8";
144 }
145
146 try {
150 FeatureMap params = Factory.newFeatureMap();
151 params.put("stringContent", segmentedData);
152 FeatureMap features = Factory.newFeatureMap();
153
154 Gate.setHiddenAttribute(features, true);
156
157 tempDoc = (Document) Factory.createResource("gate.corpora.DocumentImpl",
158 params, features);
159 }
160 catch (ResourceInstantiationException rie) {
161 throw new ExecutionException("Temporary document cannot be created");
162 }
163
164 ArrayList marks = null;
167
168 if (runSegmenter.booleanValue()) {
170 marks = segmenter.getMarks();
171 }
172
173 FeatureMap features = Factory.newFeatureMap();
175 Gate.setHiddenAttribute(features, true);
176
177 FeatureMap params = Factory.newFeatureMap();
179 params.put("rulesURL", rulesURL);
180 params.put("encoding", encoding);
181 params.put("document", tempDoc);
182 params.put("annotationSetName", annotationSetName);
183
184 try {
185 tokeniser = (gate.creole.tokeniser.SimpleTokeniser) Factory.
186 createResource(
187 "gate.creole.tokeniser.SimpleTokeniser", params, features);
188 }
189 catch (ResourceInstantiationException rie) {
190 throw new ExecutionException(
191 "Instance of SimpleTokeniser cannot be created");
192 }
193
194 tokeniser.execute();
196
197 AnnotationSet anns;
200 AnnotationSet original;
201
202 if(annotationSetName == null || annotationSetName.length() == 0) {
203 anns = tempDoc.getAnnotations();
204 original = document.getAnnotations();
205 } else {
206 anns = tempDoc.getAnnotations(annotationSetName);
207 original = document.getAnnotations(annotationSetName);
208 }
209
210 List tokens = new ArrayList(anns.get());
211 Comparator offsetComparator = new OffsetComparator();
212 Collections.sort(tokens, offsetComparator);
213 Iterator tokenIter = tokens.iterator();
214
215
216 long[] markValues = (runSegmenter.booleanValue())? new long[marks.size()] : null;
218 if(markValues != null) {
219 for (int i = 0; i < marks.size(); i++) {
220 markValues[i] = ( (Long) marks.get(i)).longValue();
221 }
222 Arrays.sort(markValues);
223 }
224
225 while (tokenIter.hasNext()) {
227 Annotation currentToken = ( (Annotation) tokenIter.next());
228 long startOffset =
229 currentToken.getStartNode().getOffset().longValue();
230 long endOffset =
231 currentToken.getEndNode().getOffset().longValue();
232
233 int index = (markValues == null) ? -1 : Arrays.binarySearch(markValues, startOffset);
235 if (index >= 0) {
236 if (generateSpaceTokens.booleanValue()) {
238 try {
239 FeatureMap newFeatures = Factory.newFeatureMap();
240 newFeatures.put("kind", "ChineseSplit");
241 original.add(new Long(startOffset - index),
242 new Long(startOffset - index),
243 SPACE_TOKEN_ANNOTATION_TYPE, newFeatures);
244 }
245 catch (InvalidOffsetException ioe) {
246 throw new ExecutionException("Offset Error");
247 }
248 }
249
250 }
251 else {
252 index = Math.abs(index) - 1;
253
254 String annotSetName = currentToken.getType();
257 FeatureMap newFeatureMap = currentToken.getFeatures();
258 try {
259 original.add(new Long(startOffset - index),
260 new Long(endOffset - index), annotSetName,
261 newFeatureMap);
262 }
263 catch (InvalidOffsetException ioe) {
264 throw new ExecutionException(
265 "Problem with the invalid offset while adding annotations" +
266 "to the original document");
267 }
268 }
269 }
270 Factory.deleteResource(tempDoc);
272
273 fireProcessFinished();
275 }
276
277
279
283 public void setRunSegmenter(Boolean runSegmenter) {
284 this.runSegmenter = runSegmenter;
285 }
286
287
288
290 public Boolean getRunSegmenter() {
291 return this.runSegmenter;
292 }
293
294
298 public void setGenerateSpaceTokens(Boolean value) {
299 this.generateSpaceTokens = value;
300 }
301
302
306 public Boolean getGenerateSpaceTokens() {
307 return this.generateSpaceTokens;
308 }
309
310
314 public void setDocument(gate.Document document) {
315 this.document = document;
316 }
317
318
321 public gate.Document getDocument() {
322 return this.document;
323 }
324
325
329 public void setEncoding(String encoding) {
330 this.encoding = encoding;
331 }
332
333
336 public String getEncoding() {
337 return this.encoding;
338 }
339
340
344 public void setRulesURL(java.net.URL rules) {
345 this.rulesURL = rules;
346 }
347
348
352 public java.net.URL getRulesURL() {
353 return rulesURL;
354 }
355
356
360 public void setAnnotationSetName(String name) {
361 this.annotationSetName = name;
362 }
363
364
368 public String getAnnotationSetName() {
369 return this.annotationSetName;
370 }
371 }