1   package gate.creole.tokeniser.chinesetokeniser;
2   
3   import java.io.*;
4   import java.util.*;
5   
6   /**
7    * <p>Title: Segmenter.java</p>
8    * <p>Description: This class segments the Chinese Text by adding extra spaces
9    * </p>
10   * <p>Company: University Of Sheffield</p>
11   * @author Erik E. Peterson - modified by Niraj Aswani
12   * @see <a href="http://www.mandarintools.com/segmenter.html">source</a>
13   */
public class Segmenter {
  // Character forms accepted by the constructor.
  public final static int TRAD = 0;
  public final static int SIMP = 1;
  public final static int BOTH = 2;

  /** Common location of every data file read by this class. */
  private static final String RESOURCE_BASE =
      "gate:/creole/tokeniser/chinesetokeniser/";

  /** Lexicon value marking a key that is a complete word. */
  private static final String WORD = "1";

  /** Lexicon value marking a key that is only the beginning of a longer word. */
  private static final String PREFIX = "2";

  /** Single characters that stemWord strips from the front of a word. */
  private static final String[] STEM_PREFIXES = new String[] {
      "\u7b2c", "\u526f", "\u4e0d"};

  /** Single characters that stemWord strips from the end of a word. */
  private static final String[] STEM_SUFFIXES = new String[] {
      "\u4e86", "\u7684", "\u5730", "\u4e0b", "\u4e0a", "\u4e2d", "\u91cc",
      "\u5230", "\u5185", "\u5916", "\u4eec"};

  /** Single characters that stemWord strips from the middle of a 3-character word. */
  private static final String[] STEM_INFIXES = new String[] {
      "\u5f97", "\u4e0d"};

  /** Maps a string to WORD or PREFIX; see registerWord. */
  private final TreeMap zhwords;

  /** Single-character sets loaded from the *_u8.txt data files. */
  private final TreeSet csurname, cforeign, cnumbers, cnotname;

  /**
   * Offsets of the separators inserted by the most recent segmentLine call
   * (stored as Long); null until segmentLine has been called. By niraj.
   */
  private ArrayList marks;

  /**
   * Creates a segmenter and loads its data files.
   *
   * @param charform TRAD, SIMP or BOTH; any other value is treated as BOTH
   * @param loadwordfile when false only the character sets are loaded and the
   *        lexicon starts empty (words can still be added with addword)
   */
  public Segmenter(int charform, boolean loadwordfile) {
    csurname = new TreeSet();
    cforeign = new TreeSet();
    cnumbers = new TreeSet();
    cnotname = new TreeSet();
    zhwords = new TreeMap();

    // Everything except TRAD needs the simplified sets, everything except
    // SIMP needs the traditional ones (so BOTH/unknown loads both).
    if (charform != TRAD) {
      loadCharacterSets("s");
    }
    if (charform != SIMP) {
      loadCharacterSets("t");
    }

    if (!loadwordfile) {
      return;
    }

    String lexicon;
    if (charform == SIMP) {
      lexicon = "simplexu8.txt";
    }
    else if (charform == TRAD) {
      lexicon = "tradlexu8.txt";
    }
    else {
      // BOTH, and any unrecognised value: consistent with the set loading
      // above instead of failing on a null stream.
      lexicon = "bothlexu8.txt";
    }
    loadLexicon(RESOURCE_BASE + lexicon);
  }

  /** Loads the four character sets whose file names start with the given letter. */
  private void loadCharacterSets(String variant) {
    loadset(cnumbers, RESOURCE_BASE + variant + "numbers_u8.txt");
    loadset(cforeign, RESOURCE_BASE + variant + "foreign_u8.txt");
    loadset(csurname, RESOURCE_BASE + variant + "surname_u8.txt");
    loadset(cnotname, RESOURCE_BASE + variant + "notname_u8.txt");
  }

  /**
   * Reads the lexicon, one word per line. Lines containing '#', blank lines
   * and lines of five or more characters are ignored. A failure is reported
   * on System.err and leaves whatever was read so far in place.
   */
  private void loadLexicon(String sourcefile) {
    InputStream worddata = null;
    try {
      worddata = new java.net.URL(sourcefile).openStream();
      BufferedReader in = new BufferedReader(new InputStreamReader(worddata,
          "UTF8"));
      String newword;
      while ( (newword = in.readLine()) != null) {
        if (newword.length() == 0 || newword.length() >= 5 ||
            newword.indexOf("#") != -1) {
          continue;
        }
        registerWord(newword);
      }
    }
    catch (IOException e) {
      System.err.println("Segmenter: unable to load lexicon " + sourcefile +
                         ": " + e);
    }
    finally {
      closeQuietly(worddata);
    }
  }

  /**
   * Load a set of character data: every non-empty line without '#' is added
   * to the target set. A failure is reported on System.err and otherwise
   * ignored, so a missing file leaves the set as it was.
   */
  private void loadset(TreeSet targetset, String sourcefile) {
    InputStream setdata = null;
    try {
      setdata = new java.net.URL(sourcefile).openStream();
      BufferedReader in = new BufferedReader(new InputStreamReader(setdata,
          "UTF-8"));
      String dataline;
      while ( (dataline = in.readLine()) != null) {
        if ( (dataline.indexOf("#") > -1) || (dataline.length() == 0)) {
          continue;
        }
        targetset.add(dataline.intern());
      }
    }
    catch (Exception e) {
      System.err.println("Segmenter: unable to load data file " + sourcefile +
                         ": " + e);
    }
    finally {
      closeQuietly(setdata);
    }
  }

  /** Closes the stream if it was opened; a failure to close is not actionable. */
  private static void closeQuietly(InputStream stream) {
    if (stream == null) {
      return;
    }
    try {
      stream.close();
    }
    catch (IOException ignored) {
      // nothing useful can be done at this point
    }
  }

  /**
   * Stores the word as WORD and every proper prefix of two or more characters
   * as PREFIX. An existing entry is never downgraded to PREFIX, but a PREFIX
   * entry is upgraded when the same string is later registered as a word.
   */
  private void registerWord(String newword) {
    zhwords.put(newword.intern(), WORD);
    for (int end = 2; end < newword.length(); end++) {
      String head = newword.substring(0, end).intern();
      if (!zhwords.containsKey(head)) {
        zhwords.put(head, PREFIX);
      }
    }
  }

  /** True when every character of the text is a member of the set (also for ""). */
  private static boolean allCharsIn(TreeSet charset, String text) {
    for (int i = 0; i < text.length(); i++) {
      if (!charset.contains(text.substring(i, i + 1))) {
        return false;
      }
    }
    return true;
  }

  /** True when the character lies in the CJK Unified Ideographs block. */
  private static boolean isCJK(char c) {
    return Character.UnicodeBlock.of(c) ==
        Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS;
  }

  /**
   * @return true when every character of testword is in the loaded number
   *         set; the empty string yields true
   */
  public boolean isNumber(String testword) {
    return allCharsIn(cnumbers, testword);
  }

  /**
   * @return true when every character of testword is in the loaded
   *         foreign-name set; the empty string yields true
   */
  public boolean isAllForeign(String testword) {
    return allCharsIn(cforeign, testword);
  }

  /**
   * @return true when no character of testword lies in the CJK Unified
   *         Ideographs block
   */
  public boolean isNotCJK(String testword) {
    for (int i = 0; i < testword.length(); i++) {
      if (isCJK(testword.charAt(i))) {
        return false;
      }
    }
    return true;
  }

  /**
   * Removes at most one affix character from the word. Tried in order: a
   * leading prefix, a trailing suffix, then (3-character words only) a middle
   * infix. A prefix or suffix is removed when the remainder is in the lexicon
   * or the word has exactly two characters; an infix is removed when the two
   * outer characters together are in the lexicon.
   *
   * @param word the word to stem; the empty string is returned unchanged
   * @return the stemmed word, or the word itself when nothing applies
   */
  public String stemWord(String word) {
    int length = word.length();
    if (length == 0) {
      // nothing to strip; also avoids substring(0, 1) on an empty string
      return word;
    }

    String first = word.substring(0, 1);
    String withoutFirst = word.substring(1);
    for (int i = 0; i < STEM_PREFIXES.length; i++) {
      if (first.equals(STEM_PREFIXES[i]) &&
          (zhwords.get(withoutFirst) != null || length == 2)) {
        return withoutFirst;
      }
    }

    String last = word.substring(length - 1);
    String withoutLast = word.substring(0, length - 1);
    for (int i = 0; i < STEM_SUFFIXES.length; i++) {
      if (last.equals(STEM_SUFFIXES[i]) &&
          (zhwords.get(withoutLast) != null || length == 2)) {
        return withoutLast;
      }
    }

    if (length == 3) {
      String middle = word.substring(1, 2);
      String outer = word.substring(0, 1) + word.substring(2, 3);
      for (int i = 0; i < STEM_INFIXES.length; i++) {
        if (middle.equals(STEM_INFIXES[i]) && zhwords.get(outer) != null) {
          return outer;
        }
      }
    }

    return word;
  }

  /**
   * Segments a line by inserting a space at every detected word boundary
   * that is not already marked by whitespace. The offset of each inserted
   * space in the returned string is recorded and available from getMarks().
   *
   * @param cline the text to segment
   * @param separator ignored: a single space is always used, because the
   *        recorded offsets assume each separator is exactly one character
   * @return the text with the extra spaces added
   */
  public String segmentLine(String cline, String separator) {
    separator = " ";

    int clength = cline.length();
    StringBuffer currentword = new StringBuffer();
    StringBuffer outline = new StringBuffer();
    marks = new ArrayList(); // addition by Niraj

    for (int i = 0; i < clength; i++) {
      char currentchar = cline.charAt(i);
      String currentstr = String.valueOf(currentchar);

      if (!isCJK(currentchar) && !cnumbers.contains(currentstr)) {
        // Not a Chinese character: flush the pending word and copy through
        if (currentword.length() > 0) {
          outline.append(currentword.toString());
          if (!Character.isWhitespace(currentchar)) {
            insertSeparator(outline, separator, i);
          }
          currentword.setLength(0);
        }
        outline.append(currentchar);
        continue;
      }

      if (currentword.length() == 0) {
        // First character of a new word; separate it from preceding text
        if (i > 0 && !Character.isWhitespace(cline.charAt(i - 1))) {
          insertSeparator(outline, separator, i);
        }
        currentword.append(currentchar);
        continue;
      }

      if (extendsCurrentWord(currentword.toString(), currentstr, cline, i)) {
        currentword.append(currentchar);
      }
      else { // Start anew
        outline.append(currentword.toString());
        if (!Character.isWhitespace(currentchar)) {
          insertSeparator(outline, separator, i);
        }
        currentword.setLength(0);
        currentword.append(currentchar);
      }
    }

    outline.append(currentword.toString());

    return outline.toString();
  }

  /**
   * Decides whether the character at position i continues the pending word.
   *
   * @param pending the non-empty word collected so far
   * @param currentstr the character at position i, as a string
   * @param cline the whole line being segmented
   * @param i index of the character under consideration
   */
  private boolean extendsCurrentWord(String pending, String currentstr,
                                     String cline, int i) {
    int clength = cline.length();
    String candidate = pending + currentstr;
    Object status = zhwords.get(candidate);

    // word is in lexicon
    if (WORD.equals(status)) {
      return true;
    }

    // Possible a transliteration of a foreign name
    // NOTE(review): "i + 2 < clength" skips this rule when exactly two
    // characters remain; kept as-is, confirm whether "<=" was intended.
    if (isAllForeign(pending) && cforeign.contains(currentstr) &&
        i + 2 < clength &&
        !zhwords.containsKey(cline.substring(i, i + 2))) {
      return true;
    }

    // Put all consecutive number characters together
    if (isNumber(pending) && cnumbers.contains(currentstr)) {
      return true;
    }

    // Starts a word in the lexicon, and the next character keeps it going
    return PREFIX.equals(status) &&
        i + 1 < clength &&
        zhwords.containsKey(candidate + cline.charAt(i + 1));
  }

  /**
   * Appends the separator and records its offset in the output: the input
   * index shifted by the number of separators inserted before it.
   */
  private void insertSeparator(StringBuffer outline, String separator, int i) {
    marks.add(new Long(i + marks.size())); // addition by Niraj
    outline.append(separator);
  }

  /**
   * Adds a word to the lexicon together with all of its proper prefixes of
   * two or more characters, so that segmentLine can grow a match one
   * character at a time up to the full word, whatever its length.
   *
   * @param newword the word to add
   */
  public void addword(String newword) {
    registerWord(newword);
  }

  /**
   * This method returns the marks where the spaces were added by the segmenter
   *
   * @return the offsets (Long) recorded by the last segmentLine call, or
   *         null when segmentLine has not been called yet
   */
  // addition by Niraj
  public ArrayList getMarks() {
    return marks;
  }

  /**
   * Segments the given text with segmentLine.
   *
   * @param fileContents the text to segment
   * @param encoding not used
   * @return the segmented text, or the empty string when fileContents is
   *         null or segmentation fails
   */
  public String segmentData(String fileContents, String encoding) {
    if (fileContents == null) {
      return "";
    }
    try {
      return segmentLine(fileContents, " ");
    }
    catch (RuntimeException e) {
      // best effort: report the problem and hand back an empty result
      System.err.println("Segmenter: segmentation failed: " + e);
      return "";
    }
  }
}