|
EntityDescriptor |
|
1 /* 2 * EntityDescriptor.java 3 * 4 * Copyright (c) 1998-2001, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Valentin Tablan, July/2000 12 * 13 * $Id: EntityDescriptor.java,v 1.5 2001/09/26 11:41:05 marin Exp $ 14 */ 15 16 package gate.creole.nerc; 17 18 import gate.*; 19 import gate.util.*; 20 21 import java.io.Serializable; 22 23 /** Represents a single named entity */ 24 public class EntityDescriptor implements Serializable{ 25 26 /** Constructs a new entity descriptor */ 27 public EntityDescriptor(String string, String category, int start, int end) { 28 this.string = normaliseString(string); 29 this.category = category; 30 offsets = new int[2]; 31 offsets[0] = start; 32 offsets[1] = end; 33 } 34 35 /** Constructs a new entity descriptor starting from a Gate annotation */ 36 public EntityDescriptor(Document document, Annotation annotation) { 37 offsets = new int[2]; 38 offsets[0] = annotation.getStartNode().getOffset().intValue(); 39 offsets[1] = annotation.getEndNode().getOffset().intValue(); 40 try{ 41 string = normaliseString(document.getContent().getContent( 42 annotation.getStartNode().getOffset(), 43 annotation.getEndNode().getOffset()). 44 toString()); 45 } catch(InvalidOffsetException ioe){ 46 ioe.printStackTrace(); 47 } 48 category = annotation.getType(); 49 } 50 51 /** Returns a normalised string for the entity. This is the string from the 52 * text document the entity was descovered in, with all whitespace sequences 53 * replaced by a single space character 54 */ 55 public String getString(){ 56 return string; 57 } 58 59 /** Returns the category of the entity*/ 60 public String getCategory(){ 61 return category; 62 } 63 64 /** Returns a pair of integers specifying the character offsets in the 65 * original file where the entity occured 66 */ 67 public int[] getOffsets(){ 68 return offsets; 69 } 70 71 /** Returns a string giving the category, offsets and normalised string for 72 * the entity, with no newlines. 73 */ 74 public String toString(){ 75 return category + " " + offsets[0] + " " + offsets[1] + " " + string; 76 } 77 78 String string; 79 String category; 80 int[] offsets; 81 82 /** Normalises a string. That is removes all the leading and trailing 83 * whitespace characters and replaces all inner whitespace sequences with a 84 * single space character 85 */ 86 protected String normaliseString(String text){ 87 /// String res = ""; 88 StringBuffer res = new StringBuffer(gate.Gate.STRINGBUFFER_SIZE); 89 if(text == null) return null; 90 int charIdx = 0; 91 boolean lastWasSpace = false; 92 //skip the leading spaces 93 while(charIdx < text.length() && 94 Character.isWhitespace(text.charAt(charIdx))) charIdx++; 95 //parse the rest of the text 96 while(charIdx < text.length()){ 97 if(Character.isWhitespace(text.charAt(charIdx))){ 98 //reading spaces 99 lastWasSpace = true; 100 }else{ 101 //reading non-spaces 102 if(lastWasSpace) ///res += " "; 103 res.append(" "); 104 /// res += text.charAt(charIdx); 105 res.append(text.charAt(charIdx)); 106 lastWasSpace = false; 107 } 108 charIdx++; 109 }//while(charIdx < text.length()) 110 return res.toString(); 111 } 112 113 } 114
|
EntityDescriptor |
|