|
POSTagger |
|
1 /* 2 * Copyright (c) 1998-2001, The University of Sheffield. 3 * 4 * This file is part of GATE (see http://gate.ac.uk/), and is free 5 * software, licenced under the GNU Library General Public License, 6 * Version 2, June 1991 (in the distribution as file licence.html, 7 * and also available at http://gate.ac.uk/gate/licence.html). 8 * 9 * Valentin Tablan, 01 Feb 2000 10 * 11 * $Id: POSTagger.java,v 1.13 2001/10/15 11:21:32 valyt Exp $ 12 */ 13 14 package gate.creole; 15 16 import gate.*; 17 import gate.creole.*; 18 import gate.util.*; 19 import gate.event.*; 20 21 import hepple.postag.*; 22 23 import java.util.*; 24 import java.io.*; 25 import java.net.URL; 26 import java.text.NumberFormat; 27 /** 28 * This class is a wrapper for HepTag, Mark Hepple's POS tagger. 29 */ 30 public class POSTagger extends AbstractLanguageAnalyser { 31 32 public POSTagger() { 33 } 34 35 public Resource init()throws ResourceInstantiationException{ 36 if(lexiconURL == null){ 37 throw new ResourceInstantiationException( 38 "NoURL provided for the lexicon!"); 39 } 40 if(rulesURL == null){ 41 throw new ResourceInstantiationException( 42 "No URL provided for the rules!"); 43 } 44 try{ 45 tagger = new hepple.postag.POSTagger(lexiconURL,rulesURL); 46 }catch(Exception e){ 47 throw new ResourceInstantiationException(e); 48 } 49 return this; 50 } 51 52 53 public void execute() throws ExecutionException{ 54 try{ 55 //check the parameters 56 if(document == null) throw new GateRuntimeException( 57 "No document to process!"); 58 if(inputASName != null && inputASName.equals("")) inputASName = null; 59 if(outputASName != null && outputASName.equals("")) outputASName = null; 60 AnnotationSet inputAS = (inputASName == null) ? 61 document.getAnnotations() : 62 document.getAnnotations(inputASName); 63 AnnotationSet outputAS = (outputASName == null) ? 64 document.getAnnotations() : 65 document.getAnnotations(outputASName); 66 67 fireStatusChanged("POS tagging " + document.getName()); 68 fireProgressChanged(0); 69 //prepare the input for HepTag 70 //define a comparator for annotations by start offset 71 Comparator offsetComparator = new OffsetComparator(); 72 AnnotationSet as = inputAS.get("Sentence"); 73 if(as != null && as.size() > 0){ 74 List sentences = new ArrayList(as); 75 Collections.sort(sentences, offsetComparator); 76 Iterator sentIter = sentences.iterator(); 77 int sentIndex = 0; 78 int sentCnt = sentences.size(); 79 long startTime= System.currentTimeMillis(); 80 while(sentIter.hasNext()){ 81 Annotation sentenceAnn = (Annotation)sentIter.next(); 82 AnnotationSet rangeSet = inputAS.get( 83 sentenceAnn.getStartNode().getOffset(), 84 sentenceAnn.getEndNode().getOffset()); 85 if(rangeSet == null) continue; 86 AnnotationSet tokensSet = rangeSet.get("Token"); 87 if(tokensSet == null) continue; 88 List tokens = new ArrayList(tokensSet); 89 Collections.sort(tokens, offsetComparator); 90 91 // List tokens = (List)sentenceAnn.getFeatures().get("tokens"); 92 List sentence = new ArrayList(tokens.size()); 93 Iterator tokIter = tokens.iterator(); 94 while(tokIter.hasNext()){ 95 Annotation token = (Annotation)tokIter.next(); 96 String text = (String)token.getFeatures().get("string"); 97 sentence.add(text); 98 }//while(tokIter.hasNext()) 99 100 //run the POSTagger over this sentence 101 List sentences4tagger = new ArrayList(1); 102 sentences4tagger.add(sentence); 103 List taggerResults = tagger.runTagger(sentences4tagger); 104 //add the results to the output annotation set 105 //we only get one sentence 106 List sentenceFromTagger = (List)taggerResults.get(0); 107 if(sentenceFromTagger.size() != sentence.size()){ 108 String taggerResult = ""; 109 for(int i = 0; i< sentenceFromTagger.size(); i++){ 110 taggerResult += ((String[])sentenceFromTagger.get(i))[1] + ", "; 111 } 112 throw new GateRuntimeException( 113 "POS Tagger malfunction: the output size (" + 114 sentenceFromTagger.size() + 115 ") is different from the input size (" + 116 sentence.size() + ")!" + 117 "\n Input: " + sentence + "\nOutput: " + taggerResult); 118 } 119 for(int i = 0; i< sentence.size(); i++){ 120 String category = ((String[])sentenceFromTagger.get(i))[1]; 121 Annotation token = (Annotation)tokens.get(i); 122 token.getFeatures().put("category", category); 123 }//for(i = 0; i<= sentence.size(); i++) 124 fireProgressChanged(sentIndex++ * 100 / sentCnt); 125 }//while(sentIter.hasNext()) 126 127 fireProcessFinished(); 128 long endTime = System.currentTimeMillis(); 129 fireStatusChanged(document.getName() + " tagged in " + 130 NumberFormat.getInstance().format( 131 (double)(endTime - startTime) / 1000) + " seconds!"); 132 }else{ 133 throw new GateRuntimeException("No sentences to process!\n" + 134 "Please run a sentence splitter first!"); 135 }//if(as != null && as.size() > 0) 136 }catch(Exception e){ 137 throw new ExecutionException(e); 138 } 139 } 140 141 142 public void setLexiconURL(java.net.URL newLexiconURL) { 143 lexiconURL = newLexiconURL; 144 } 145 public java.net.URL getLexiconURL() { 146 return lexiconURL; 147 } 148 public void setRulesURL(java.net.URL newRulesURL) { 149 rulesURL = newRulesURL; 150 } 151 public java.net.URL getRulesURL() { 152 return rulesURL; 153 } 154 public void setInputASName(String newInputASName) { 155 inputASName = newInputASName; 156 } 157 public String getInputASName() { 158 return inputASName; 159 } 160 public void setOutputASName(String newOutputASName) { 161 outputASName = newOutputASName; 162 } 163 public String getOutputASName() { 164 return outputASName; 165 } 166 167 protected hepple.postag.POSTagger tagger; 168 private java.net.URL lexiconURL; 169 private java.net.URL rulesURL; 170 private String inputASName; 171 private String outputASName; 172 }
|
POSTagger |
|