gate.creole.tokeniser.chinesetokeniser
Class Segmenter

java.lang.Object
  extended by gate.creole.tokeniser.chinesetokeniser.Segmenter

public class Segmenter
extends Object

Title: Segmenter.java

Description: This class segments Chinese text by adding extra spaces.

Company: University of Sheffield

Author:
Erik E. Peterson - modified by Niraj Aswani
See Also:
source

Field Summary
static int BOTH
           
private  TreeSet cforeign
           
private  TreeSet cnotname
           
private  TreeSet cnumbers
           
private  TreeSet csurname
           
private  boolean debug
           
private  ArrayList marks
           
static int SIMP
           
static int TRAD
           
private  TreeMap zhwords
           
 
Constructor Summary
Segmenter(int charform, boolean loadwordfile)
           
 
Method Summary
 void addword(String newword)
           
 ArrayList getMarks()
          This method returns the marks where the spaces were added by the segmenter
 boolean isAllForeign(String testword)
           
 boolean isNotCJK(String testword)
           
 boolean isNumber(String testword)
           
private  void loadset(TreeSet targetset, String sourcefile)
          Load a set of character data
 String segmentData(String fileContents, String encoding)
           
 String segmentLine(String cline, String separator)
           
 String stemWord(String word)
           
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Field Detail

zhwords

private TreeMap zhwords

csurname

private TreeSet csurname

cforeign

private TreeSet cforeign

cnumbers

private TreeSet cnumbers

cnotname

private TreeSet cnotname

debug

private boolean debug

TRAD

public static final int TRAD
See Also:
Constant Field Values

SIMP

public static final int SIMP
See Also:
Constant Field Values

BOTH

public static final int BOTH
See Also:
Constant Field Values

marks

private ArrayList marks
Constructor Detail

Segmenter

public Segmenter(int charform,
                 boolean loadwordfile)
Method Detail

loadset

private void loadset(TreeSet targetset,
                     String sourcefile)
Load a set of character data


isNumber

public boolean isNumber(String testword)

isAllForeign

public boolean isAllForeign(String testword)

isNotCJK

public boolean isNotCJK(String testword)

stemWord

public String stemWord(String word)

segmentLine

public String segmentLine(String cline,
                          String separator)

addword

public void addword(String newword)

getMarks

public ArrayList getMarks()
This method returns the marks where the spaces were added by the segmenter


segmentData

public String segmentData(String fileContents,
                          String encoding)