gate.creole.namematch
Class Namematch

java.lang.Object
  |
  +--gate.util.AbstractFeatureBearer
        |
        +--gate.creole.AbstractResource
              |
              +--gate.creole.AbstractProcessingResource
                    |
                    +--gate.creole.namematch.Namematch
All Implemented Interfaces:
FeatureBearer, ProcessingResource, Resource, Runnable, Serializable

public class Namematch
extends AbstractProcessingResource
implements ProcessingResource

See Also:
Serialized Form

Field Summary
protected  HashMap alias
           
protected  String annotationSetName
          the name of the annotation set
protected  String annotationType
          the type of annotation
protected  Set annotationTypes
          the types of the annotation
private static int BUFF_SIZE
          the size of the buffer
private  char[] cbuffer
          a buffer in order to read an array of char
protected  HashMap cdg
           
protected  HashMap connector
           
protected  HashMap def_art
           
protected  Document document
          the document for namematch
protected  ExecutionException executionException
           
protected  Boolean extLists
          internal or external list
protected  List matchesDocument
          the set with all the matches from document
protected  AnnotationSet nameAnnots
          the annotation set for the document
protected  String organizationType
          the organization type
protected  String personType
          the person type
protected  HashMap prepos
           
protected  HashMap spur_match
           
 
Fields inherited from class gate.creole.AbstractResource
serialVersionUID
 
Fields inherited from class gate.util.AbstractFeatureBearer
features
 
Constructor Summary
Namematch()
           
 
Method Summary
private  boolean apply_rules_namematch(String shortName, String longName)
          apply_rules_namematch: apply rules similarly to lasie1.5's namematch
private  void buildTables(Document doc)
          Tables for namematch info (used by the namematch rules)
 void check()
          Trigger any exception that was caught when run() was invoked.
 void createAnnotList(String nameFile, String nameList)
          creates the lookup tables
 void createLists()
          if (extLists == false) then reads the names of files in order to create the lookup tables
 void determineMatchesDocument()
          all the matches from the current document are placed in a list
 String getAnnotationSetName()
          get the name of the annotation set
 Set getAnnotationTypes()
          get the types of the annotation
 Document getDocument()
          Gets the document currently set as target for this namematch.
 Boolean getExtList()
           
 List getMatchesDocument()
           
 String getOrganizationType()
           
 String getPersonType()
           
 Resource init()
          Initialise this resource, and return it.
 boolean matchRule0(String s1, String s2)
          RULE #0: If the two names are listed in the table of spurious matches then they do NOT match Condition(s): - Applied to: all name annotations
 boolean matchRule1(String s1, String s2, boolean MatchCase)
          RULE #1: If the two names are identical then they are the same Condition(s): depend on case Applied to: all name annotations
 boolean matchRule10(String s1, String s2)
          RULE #10: is one name the reverse of the other reversing around prepositions only? e.g.
 boolean matchRule11(String s1, String s2)
          RULE #11: does one name consist of contractions of the first two tokens of the other name? e.g.
 boolean matchRule12(String s1, String s2)
          RULE #12: do the first and last tokens of one name match the first and last tokens of the other? Condition(s): case-sensitive match Applied to: organisation annotations only
 boolean matchRule13(String s1, String s2)
          RULE #13: do multi-word names match except for one token e.g.
 boolean matchRule2(String s1, String s2)
          RULE #2: if the two names are listed as equivalent in the lookup table (alias) then they match Condition(s): - Applied to: all name annotations
 boolean matchRule3(String s1, String s2)
          RULE #3: adding a possessive at the end of one name causes a match e.g.
 boolean matchRule4(String s1, String s2)
          RULE #4: Do all tokens other than the punctuation marks "," and "." match?
 boolean matchRule5(String s1, String s2)
          RULE #5: if the 1st token of one name matches the second name e.g.
 boolean matchRule6(String s1, String s2)
          RULE #6: if one name is the acronym of the other e.g.
 boolean matchRule7(String s1, String s2)
          RULE #7: if one of the tokens in one of the names is in the list of separators eg.
 boolean matchRule8(String s1, String s2)
          RULE #8: if the names match after stripping off "The" and trailing company designator e.g.
 boolean matchRule9(String s1, String s2)
          RULE #9: does one of the names match the token just before a trailing company designator in the other name? e.g.
 String regularExpressions(String text, String replacement, String regEx)
          substitute all multiple spaces, tabs and newlines with a single space
 void run()
          Run the resource.
 void setAnnotationSetName(String newAnnotationSetName)
          set the annotation set name
 void setAnnotationTypes(Set newType)
          set the types of the annotations
 void setDocument(Document newDocument)
          set the document
 void setExtLists(Boolean newExtLists)
          set the extLists flag (internal or external list)
 void setOrganizationType(String newOrganizationType)
           
 void setPersonType(String newPersonType)
           
 
Methods inherited from class gate.creole.AbstractProcessingResource
reInit
 
Methods inherited from class gate.creole.AbstractResource
getName, setName
 
Methods inherited from class gate.util.AbstractFeatureBearer
getFeatures, setFeatures
 
Methods inherited from class java.lang.Object
<clinit>, clone, equals, finalize, getClass, hashCode, notify, notifyAll, registerNatives, toString, wait, wait, wait
 
Methods inherited from interface gate.ProcessingResource
reInit
 
Methods inherited from interface gate.util.FeatureBearer
getFeatures, getName, setFeatures, setName
 

Field Detail

document

protected Document document
the document for namematch

annotationSetName

protected String annotationSetName
the name of the annotation set

annotationTypes

protected Set annotationTypes
the types of the annotation

organizationType

protected String organizationType
the organization type

personType

protected String personType
the person type

annotationType

protected String annotationType
the type of annotation

extLists

protected Boolean extLists
internal or external list

nameAnnots

protected AnnotationSet nameAnnots
the annotation set for the document

matchesDocument

protected List matchesDocument
the set with all the matches from document

executionException

protected ExecutionException executionException

alias

protected HashMap alias

cdg

protected HashMap cdg

spur_match

protected HashMap spur_match

def_art

protected HashMap def_art

connector

protected HashMap connector

prepos

protected HashMap prepos

cbuffer

private char[] cbuffer
a buffer in order to read an array of char

BUFF_SIZE

private static final int BUFF_SIZE
the size of the buffer
Constructor Detail

Namematch

public Namematch()
Method Detail

init

public Resource init()
              throws ResourceInstantiationException
Initialise this resource, and return it.
Specified by:
init in interface Resource
Overrides:
init in class AbstractProcessingResource

run

public void run()
Run the resource. It doesn't make sense not to override this in subclasses so the default implementation signals an exception.
Specified by:
run in interface Runnable
Overrides:
run in class AbstractProcessingResource

determineMatchesDocument

public void determineMatchesDocument()
all the matches from the current document are placed in a list

check

public void check()
           throws ExecutionException
Description copied from interface: ProcessingResource
Trigger any exception that was caught when run() was invoked.
Specified by:
check in interface ProcessingResource
Overrides:
check in class AbstractProcessingResource

createLists

public void createLists()
                 throws IOException
if (extLists == false) then reads the names of files in order to create the lookup tables

createAnnotList

public void createAnnotList(String nameFile,
                            String nameList)
                     throws IOException
creates the lookup tables

apply_rules_namematch

private boolean apply_rules_namematch(String shortName,
                                      String longName)
apply_rules_namematch: apply rules similarly to lasie1.5's namematch

setDocument

public void setDocument(Document newDocument)
set the document

setExtLists

public void setExtLists(Boolean newExtLists)
set the extLists flag (internal or external list)

setAnnotationSetName

public void setAnnotationSetName(String newAnnotationSetName)
set the annotation set name

setAnnotationTypes

public void setAnnotationTypes(Set newType)
set the types of the annotations

setOrganizationType

public void setOrganizationType(String newOrganizationType)

setPersonType

public void setPersonType(String newPersonType)

getDocument

public Document getDocument()
Gets the document currently set as target for this namematch.
Returns:
a Document

getAnnotationSetName

public String getAnnotationSetName()
get the name of the annotation set

getAnnotationTypes

public Set getAnnotationTypes()
get the types of the annotation

getOrganizationType

public String getOrganizationType()

getPersonType

public String getPersonType()

getExtList

public Boolean getExtList()

getMatchesDocument

public List getMatchesDocument()

matchRule0

public boolean matchRule0(String s1,
                          String s2)
RULE #0: If the two names are listed in the table of spurious matches then they do NOT match Condition(s): - Applied to: all name annotations

matchRule1

public boolean matchRule1(String s1,
                          String s2,
                          boolean MatchCase)
RULE #1: If the two names are identical then they are the same Condition(s): depend on case Applied to: all name annotations

matchRule2

public boolean matchRule2(String s1,
                          String s2)
RULE #2: if the two names are listed as equivalent in the lookup table (alias) then they match Condition(s): - Applied to: all name annotations

matchRule3

public boolean matchRule3(String s1,
                          String s2)
RULE #3: adding a possessive at the end of one name causes a match e.g. "Standard and Poor" == "Standard and Poor's" and also "Standard and Poor" == "Standard's" Condition(s): case-insensitive match Applied to: all name annotations

matchRule4

public boolean matchRule4(String s1,
                          String s2)
RULE #4: Do all tokens other than the punctuation marks "," and "." match? e.g. "Smith, Jones" == "Smith Jones" Condition(s): case-insensitive match Applied to: organisation annotations only

matchRule5

public boolean matchRule5(String s1,
                          String s2)
RULE #5: if the 1st token of one name matches the second name e.g. "Pepsi Cola" == "Pepsi" Condition(s): case-insensitive match Applied to: all name annotations

matchRule6

public boolean matchRule6(String s1,
                          String s2)
RULE #6: if one name is the acronym of the other e.g. "Imperial Chemical Industries" == "ICI" Condition(s): case-sensitive match, remove initial "The" Applied to: organisation annotations only

matchRule7

public boolean matchRule7(String s1,
                          String s2)
RULE #7: if one of the tokens in one of the names is in the list of separators eg. "&" then check if the token before the separator matches the other name e.g. "R.H. Macy & Co." == "Macy" Condition(s): case-sensitive match Applied to: organisation and person annotations only

matchRule8

public boolean matchRule8(String s1,
                          String s2)
RULE #8: if the names match after stripping off "The" and trailing company designator e.g. "The Magic Tricks Co." == "Magic Tricks" Condition(s): case-sensitive match Applied to: organisation annotations only

matchRule9

public boolean matchRule9(String s1,
                          String s2)
RULE #9: does one of the names match the token just before a trailing company designator in the other name? e.g. "R.H. Macy Co." == "Macy" Condition(s): case-sensitive match Applied to: organisation and person annotations only

matchRule10

public boolean matchRule10(String s1,
                           String s2)
RULE #10: is one name the reverse of the other reversing around prepositions only? e.g. "Department of Defence" == "Defence Department" Condition(s): case-sensitive match Applied to: organisation annotations only

matchRule11

public boolean matchRule11(String s1,
                           String s2)
RULE #11: does one name consist of contractions of the first two tokens of the other name? e.g. "Communications Satellite" == "ComSat" and "Pan American" == "Pan Am" Condition(s): case-sensitive match Applied to: organisation annotations only

matchRule12

public boolean matchRule12(String s1,
                           String s2)
RULE #12: do the first and last tokens of one name match the first and last tokens of the other? Condition(s): case-sensitive match Applied to: organisation annotations only

matchRule13

public boolean matchRule13(String s1,
                           String s2)
RULE #13: do multi-word names match except for one token e.g. "Second Force Recon Company" == "Force Recon Company" Note that this rule has NOT been used in LaSIE's 1.5 namematcher Restrictions: - remove cdg first - shortest name should be 2 words or more - if N is the number of tokens of the longest name, then N-1 tokens should be matched Condition(s): case-sensitive match Applied to: organisation annotations only

buildTables

private void buildTables(Document doc)
Tables for namematch info (used by the namematch rules)

regularExpressions

public String regularExpressions(String text,
                                 String replacement,
                                 String regEx)
substitute all multiple spaces, tabs and newlines with a single space