1   /*
2    *  EntityDescriptor.java
3    *
4    *  Copyright (c) 1998-2004, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Valentin Tablan, July/2000
12   *
13   *  $Id: EntityDescriptor.java,v 1.7 2004/07/21 17:10:05 akshay Exp $
14   */
15  
16  package gate.creole.nerc;
17  
18  import java.io.Serializable;
19  
20  import gate.Annotation;
21  import gate.Document;
22  import gate.util.InvalidOffsetException;
23  
24  /** Represents a single named entity */
25  public class EntityDescriptor implements Serializable{
26  
27    /** Constructs a new entity descriptor */
28    public EntityDescriptor(String string, String category, int start, int end) {
29      this.string = normaliseString(string);
30      this.category = category;
31      offsets = new int[2];
32      offsets[0] = start;
33      offsets[1] = end;
34    }
35  
36    /** Constructs a new entity descriptor starting from a Gate annotation */
37    public EntityDescriptor(Document document, Annotation annotation) {
38      offsets = new int[2];
39      offsets[0] = annotation.getStartNode().getOffset().intValue();
40      offsets[1] = annotation.getEndNode().getOffset().intValue();
41      try{
42        string = normaliseString(document.getContent().getContent(
43                                      annotation.getStartNode().getOffset(),
44                                      annotation.getEndNode().getOffset()).
45                                      toString());
46      } catch(InvalidOffsetException ioe){
47        ioe.printStackTrace();
48      }
49      category = annotation.getType();
50    }
51  
52    /** Returns a normalised string for the entity. This is the string from the
53      * text document the entity was descovered in, with all whitespace sequences
54      * replaced by a single space character
55      */
56    public String getString(){
57      return string;
58    }
59  
60    /** Returns the category of the entity*/
61    public String getCategory(){
62      return category;
63    }
64  
65    /** Returns a pair of integers specifying the character offsets in the
66      * original file where the entity occured
67      */
68    public int[] getOffsets(){
69      return offsets;
70    }
71  
72    /** Returns a string giving the category, offsets and normalised string for
73      * the entity, with no newlines.
74      */
75    public String toString(){
76      return category + " " + offsets[0] + " " + offsets[1] + " " + string;
77    }
78  
79    String string;
80    String category;
81    int[] offsets;
82  
83    /** Normalises a string. That is removes all the leading and trailing
84      * whitespace characters and replaces all inner whitespace sequences with a
85      * single space character
86      */
87    protected String normaliseString(String text){
88  ///    String res = "";
89      StringBuffer res = new StringBuffer(gate.Gate.STRINGBUFFER_SIZE);
90      if(text == null) return null;
91      int charIdx = 0;
92      boolean lastWasSpace = false;
93      //skip the leading spaces
94      while(charIdx < text.length() &&
95            Character.isWhitespace(text.charAt(charIdx))) charIdx++;
96      //parse the rest of the text
97      while(charIdx < text.length()){
98        if(Character.isWhitespace(text.charAt(charIdx))){
99          //reading spaces
100         lastWasSpace = true;
101       }else{
102         //reading non-spaces
103         if(lastWasSpace) ///res += " ";
104                 res.append(" ");
105 ///        res += text.charAt(charIdx);
106         res.append(text.charAt(charIdx));
107         lastWasSpace = false;
108       }
109       charIdx++;
110     }//while(charIdx < text.length())
111     return res.toString();
112   }
113 
114 }
115