|
POSTagger |
|
1 /* 2 * Copyright (c) 1998-2001, The University of Sheffield. 3 * 4 * This file is part of GATE (see http://gate.ac.uk/), and is free 5 * software, licenced under the GNU Library General Public License, 6 * Version 2, June 1991 (in the distribution as file licence.html, 7 * and also available at http://gate.ac.uk/gate/licence.html). 8 * 9 * Valentin Tablan, 01 Feb 2000 10 * 11 * $Id: POSTagger.java,v 1.14 2002/03/06 17:15:39 kalina Exp $ 12 */ 13 14 package gate.creole; 15 16 import gate.*; 17 import gate.creole.*; 18 import gate.util.*; 19 import gate.event.*; 20 21 import hepple.postag.*; 22 23 import java.util.*; 24 import java.io.*; 25 import java.net.URL; 26 import java.text.NumberFormat; 27 /** 28 * This class is a wrapper for HepTag, Mark Hepple's POS tagger. 29 */ 30 public class POSTagger extends AbstractLanguageAnalyser { 31 32 public static final String 33 TAG_DOCUMENT_PARAMETER_NAME = "document"; 34 35 public static final String 36 TAG_INPUT_AS_PARAMETER_NAME = "inputASName"; 37 38 public static final String 39 TAG_OUTPUT_AS_PARAMETER_NAME = "outputASName"; 40 41 public static final String 42 TAG_LEXICON_URL_PARAMETER_NAME = "lexiconURL"; 43 44 public static final String 45 TAG_RULES_URL_PARAMETER_NAME = "rulesURL"; 46 47 public POSTagger() { 48 } 49 50 public Resource init()throws ResourceInstantiationException{ 51 if(lexiconURL == null){ 52 throw new ResourceInstantiationException( 53 "NoURL provided for the lexicon!"); 54 } 55 if(rulesURL == null){ 56 throw new ResourceInstantiationException( 57 "No URL provided for the rules!"); 58 } 59 try{ 60 tagger = new hepple.postag.POSTagger(lexiconURL,rulesURL); 61 }catch(Exception e){ 62 throw new ResourceInstantiationException(e); 63 } 64 return this; 65 } 66 67 68 public void execute() throws ExecutionException{ 69 try{ 70 //check the parameters 71 if(document == null) throw new GateRuntimeException( 72 "No document to process!"); 73 if(inputASName != null && inputASName.equals("")) inputASName = null; 74 if(outputASName != null && outputASName.equals("")) outputASName = null; 75 AnnotationSet inputAS = (inputASName == null) ? 76 document.getAnnotations() : 77 document.getAnnotations(inputASName); 78 AnnotationSet outputAS = (outputASName == null) ? 79 document.getAnnotations() : 80 document.getAnnotations(outputASName); 81 82 fireStatusChanged("POS tagging " + document.getName()); 83 fireProgressChanged(0); 84 //prepare the input for HepTag 85 //define a comparator for annotations by start offset 86 Comparator offsetComparator = new OffsetComparator(); 87 AnnotationSet as = inputAS.get(SENTENCE_ANNOTATION_TYPE); 88 if(as != null && as.size() > 0){ 89 List sentences = new ArrayList(as); 90 Collections.sort(sentences, offsetComparator); 91 Iterator sentIter = sentences.iterator(); 92 int sentIndex = 0; 93 int sentCnt = sentences.size(); 94 long startTime= System.currentTimeMillis(); 95 while(sentIter.hasNext()){ 96 Annotation sentenceAnn = (Annotation)sentIter.next(); 97 AnnotationSet rangeSet = inputAS.get( 98 sentenceAnn.getStartNode().getOffset(), 99 sentenceAnn.getEndNode().getOffset()); 100 if(rangeSet == null) continue; 101 AnnotationSet tokensSet = rangeSet.get(TOKEN_ANNOTATION_TYPE); 102 if(tokensSet == null) continue; 103 List tokens = new ArrayList(tokensSet); 104 Collections.sort(tokens, offsetComparator); 105 106 // List tokens = (List)sentenceAnn.getFeatures().get("tokens"); 107 List sentence = new ArrayList(tokens.size()); 108 Iterator tokIter = tokens.iterator(); 109 while(tokIter.hasNext()){ 110 Annotation token = (Annotation)tokIter.next(); 111 String text = (String)token.getFeatures().get(TOKEN_STRING_FEATURE_NAME); 112 sentence.add(text); 113 }//while(tokIter.hasNext()) 114 115 //run the POSTagger over this sentence 116 List sentences4tagger = new ArrayList(1); 117 sentences4tagger.add(sentence); 118 List taggerResults = tagger.runTagger(sentences4tagger); 119 //add the results to the output annotation set 120 //we only get one sentence 121 List sentenceFromTagger = (List)taggerResults.get(0); 122 if(sentenceFromTagger.size() != sentence.size()){ 123 String taggerResult = ""; 124 for(int i = 0; i< sentenceFromTagger.size(); i++){ 125 taggerResult += ((String[])sentenceFromTagger.get(i))[1] + ", "; 126 } 127 throw new GateRuntimeException( 128 "POS Tagger malfunction: the output size (" + 129 sentenceFromTagger.size() + 130 ") is different from the input size (" + 131 sentence.size() + ")!" + 132 "\n Input: " + sentence + "\nOutput: " + taggerResult); 133 } 134 for(int i = 0; i< sentence.size(); i++){ 135 String category = ((String[])sentenceFromTagger.get(i))[1]; 136 Annotation token = (Annotation)tokens.get(i); 137 token.getFeatures(). 138 put(TOKEN_CATEGORY_FEATURE_NAME, category); 139 }//for(i = 0; i<= sentence.size(); i++) 140 fireProgressChanged(sentIndex++ * 100 / sentCnt); 141 }//while(sentIter.hasNext()) 142 143 fireProcessFinished(); 144 long endTime = System.currentTimeMillis(); 145 fireStatusChanged(document.getName() + " tagged in " + 146 NumberFormat.getInstance().format( 147 (double)(endTime - startTime) / 1000) + " seconds!"); 148 }else{ 149 throw new GateRuntimeException("No sentences to process!\n" + 150 "Please run a sentence splitter first!"); 151 }//if(as != null && as.size() > 0) 152 }catch(Exception e){ 153 throw new ExecutionException(e); 154 } 155 } 156 157 158 public void setLexiconURL(java.net.URL newLexiconURL) { 159 lexiconURL = newLexiconURL; 160 } 161 public java.net.URL getLexiconURL() { 162 return lexiconURL; 163 } 164 public void setRulesURL(java.net.URL newRulesURL) { 165 rulesURL = newRulesURL; 166 } 167 public java.net.URL getRulesURL() { 168 return rulesURL; 169 } 170 public void setInputASName(String newInputASName) { 171 inputASName = newInputASName; 172 } 173 public String getInputASName() { 174 return inputASName; 175 } 176 public void setOutputASName(String newOutputASName) { 177 outputASName = newOutputASName; 178 } 179 public String getOutputASName() { 180 return outputASName; 181 } 182 183 protected hepple.postag.POSTagger tagger; 184 private java.net.URL lexiconURL; 185 private java.net.URL rulesURL; 186 private String inputASName; 187 private String outputASName; 188 }
|
POSTagger |
|