|
APFormatExporter |
|
/*
 *  APFormatExporter.java
 *
 *  Copyright (c) 1998-2001, The University of Sheffield.
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  Cristian URSU, 26/Oct/2001
 *
 *  $Id: APFormatExporter.java,v 1.9 2001/11/21 14:18:23 cursu Exp $
 */

package gate.creole;

import gate.*;
import gate.creole.orthomatcher.*;
import gate.creole.ANNIEConstants;
import gate.util.*;

import java.util.*;
import java.net.*;
import java.io.*;

/** This class implements an APF xml exporter. It works on documents or corpora
  * to export them in the APF format.
  * <p>
  * The exporter is configured bean-style (exported types, DTD file name,
  * export path, source) and writes one <code>.apf.xml</code> file per
  * execution.
  */
public class APFormatExporter extends AbstractLanguageAnalyser
                              implements ANNIEConstants{
  /** Debug flag */
  private static final boolean DEBUG = false;

  /** Constructor does nothing. This PR is bean like initialized*/
  public APFormatExporter() {}

  /** Runs the resource and does the entire export process: builds the APF
    * xml for the current document in memory and writes it to the export file.
    * <p>
    * The export file is <code>&lt;exportFilePath&gt;/&lt;docId&gt;.apf.xml</code>
    * when an export path was set, otherwise the document's source file name
    * with <code>.apf.xml</code> appended.
    *
    * @throws ExecutionException if there is no document, no exported types,
    * the document ID cannot be derived, or writing the file fails.
    */
  public void execute() throws ExecutionException{
    // Check if the thing can be run
    if (document == null){
      throw new ExecutionException("No document found to export in APF format!");
    }
    if (exportedTypes == null){
      throw new ExecutionException("No export types found.");
    }
    xmlDoc = new StringBuffer(10*(document.getContent().size().intValue()));
    initDocId();
    if (docId == null){
      throw new ExecutionException("Couldn't detect the document's ID");
    }
    if (DEBUG){
      Out.prln("Document id = "+ docId);
    }

    // A non-null docId guarantees that the document has a source URL
    // (see initDocId()), so dereferencing it here is safe.
    String exportFilePathStr = null;
    if (exportFilePath == null){
      exportFilePathStr = document.getSourceUrl().getFile() + ".apf.xml";
    }else{
      exportFilePathStr = exportFilePath.getPath()+ "/"+docId + ".apf.xml";
    }

    if (DEBUG){
      Out.prln("Export file path = "+ exportFilePathStr);
    }

    // Prepare to write into the xmlFile
    OutputStreamWriter writer = null;
    try{
      // The XML declaration written by serializeDocumentToAPF() names no
      // encoding, which an XML parser reads as UTF-8; write the bytes
      // accordingly instead of relying on the platform default charset.
      writer = new OutputStreamWriter(
                    new FileOutputStream(new File(exportFilePathStr)), "UTF-8");

      serializeDocumentToAPF();
      writer.write(xmlDoc.toString());
      writer.flush();
      writer.close();
      // Successfully closed: nothing left for the finally block to release.
      writer = null;
    }catch (Exception e){
      throw new ExecutionException(e);
    }finally{
      if (writer != null){
        // Only reached when something above failed; release the file handle
        // and let the original failure propagate.
        try{
          writer.close();
        }catch (IOException ignored){
          // The primary exception is already on its way to the caller.
        }
      }
    }// End try
  } // execute()


  /** Initialise this resource, and returns it. */
  public Resource init() throws ResourceInstantiationException {
    return this;
  } // init()

  /** Java bean style mutator for exportedTypes */
  public void setExportedTypes(List anExportedTypesList){
    exportedTypes = anExportedTypesList;
  }// setExportedTypes();

  /** Java bean style accesor for exportedTypes */
  public List getExportedTypes(){
    return exportedTypes;
  }// getExportedTypes()

  /** Java bean style mutator for dtdFileName */
  public void setDtdFileName(String aDtdFileName){
    dtdFileName = aDtdFileName;
  }// setDtdFileName();

  /** Java bean style accesor for DtdFileName */
  public String getDtdFileName(){
    return dtdFileName;
  }// getDtdFileName()

  /** Java bean style mutator for exportFilePath */
  public void setExportFilePath(URL anExportFilePath){
    exportFilePath = anExportFilePath;
  }// setExportFilePath();

  /** Java bean style accesor for exportFilePath */
  public URL getExportFilePath(){
    return exportFilePath;
  }// getExportFilePath()

  /** Java bean style mutator for source */
  public void setSource(String aSource){
    source = aSource;
  }// setSource();

  /** Java bean style accesor for source */
  public String getSource(){
    return source;
  }// getSource()


  /** Initialises the docId with the document's file name, without the
    * complete path and without the last dot-separated extension.
    * <p>
    * docId is left <code>null</code> when the document has no source URL or
    * the URL has no file name, so that execute() can report the problem.
    * A file name without any extension is used unchanged.
    */
  private void initDocId(){
    // Never keep an ID computed for a previously processed document.
    docId = null;

    URL sourceUrl = document.getSourceUrl();
    if (sourceUrl == null){
      return;
    }
    String fileName =
                gate.util.Files.getLastPathComponent(sourceUrl.getFile());
    // File name contains now the last token
    if (DEBUG){
      Out.prln("From initDocId, fileName ="+ fileName);
    }
    if (fileName == null || fileName.length() == 0){
      return;
    }

    // Join every dot-separated token except the last one (the extension).
    StringTokenizer fileNameTokenizer = new StringTokenizer(fileName,".");
    StringBuffer tmpDocId = new StringBuffer("");
    while (fileNameTokenizer.hasMoreTokens()){
      String token = fileNameTokenizer.nextToken();
      // We don't want to append the last token
      if (fileNameTokenizer.hasMoreTokens()){
        if (tmpDocId.length() > 0){
          tmpDocId.append('.');
        }
        tmpDocId.append(token);
      }
    }// End while

    if (tmpDocId.length() > 0){
      docId = tmpDocId.toString();
    }else{
      // There was no extension to strip: the file name itself is the ID.
      docId = fileName;
    }// End if
  }// initDocId()

  /** Appends to xmlDoc the whole document in the form this exporter produces
    * for the APF dtd: xml declaration, DOCTYPE, the source_file and document
    * elements and, inside them, all the exported entities.
    * When no dtd file name was set, "ace-pilot-ref.dtd" is referenced.
    */
  protected void serializeDocumentToAPF(){
    xmlDoc.append("<?xml version=\"1.0\" ?>\n");
    xmlDoc.append("<!DOCTYPE source_file SYSTEM ");
    if (dtdFileName == null){
      xmlDoc.append("\"ace-pilot-ref.dtd\"");
    }else{
      // A system literal does not expand entity references, so the name is
      // written as given.
      xmlDoc.append("\""+dtdFileName+"\"");
    }
    xmlDoc.append(">\n");
    xmlDoc.append("<source_file TYPE=\"text\" SOURCE=\""+
                            escapeXml(source)+ "\" VERSION=\"1.2\" URI=\"");
    xmlDoc.append(escapeXml(docId));
    xmlDoc.append("-lf\">\n");
    xmlDoc.append(" <document DOCID=\"");
    xmlDoc.append(escapeXml(docId) + "\">\n");
    serializeEntities();
    xmlDoc.append(" </document>\n");
    xmlDoc.append("</source_file>");
  }// serializeDocumentToAPF()

  /** Transforms all the entities from exportedTypes found in the GATE document
    * into their xml representation.
    * <p>
    * For every exported type and every annotation set (the default one is
    * kept under the <code>null</code> key), the co-reference lists stored in
    * the document feature DOCUMENT_COREF_FEATURE_NAME are serialized first,
    * each list as one entity. Every remaining annotation of that type is then
    * serialized as an entity with a single mention.
    */
  protected void serializeEntities(){
    // If no types founded then simply return
    if (exportedTypes == null || exportedTypes.isEmpty()){
      return;
    }

    // The entities map is a map from annotation sets names to list of lists
    // Each list element is composed from annotations refering the same entity
    Map entitiesMap = null;
    if (document.getFeatures() == null ||
        document.getFeatures().get(DOCUMENT_COREF_FEATURE_NAME) == null){
      entitiesMap = new HashMap();
    }else{
      entitiesMap = (Map)document.getFeatures().
                                        get(DOCUMENT_COREF_FEATURE_NAME);
    }

    Map namedAnnotSetMap = null;
    if (document.getNamedAnnotationSets() == null){
      namedAnnotSetMap = new HashMap();
    }else{
      namedAnnotSetMap = new HashMap(document.getNamedAnnotationSets());
    }
    // Add the default annotation set
    namedAnnotSetMap.put(null,document.getAnnotations());

    // All the entities that are in the exportedTypes need to be serialized.
    Iterator exportedTypesIter = exportedTypes.iterator();
    while (exportedTypesIter.hasNext()){
      String entityType = (String)exportedTypesIter.next();
      // The keys in the entitesMap are annotation sets names. The null key
      // designates the default annotation set.
      Iterator annotationSetNamesIter = namedAnnotSetMap.keySet().iterator();
      while (annotationSetNamesIter.hasNext()){
        Object annotSetName = annotationSetNamesIter.next();
        // This list contains entities found in the annotSetName
        List entitiesList = (List) entitiesMap.get(annotSetName);
        if (entitiesList == null){
          entitiesList = new ArrayList();
        }

        AnnotationSet annotSet =
                          (AnnotationSet)namedAnnotSetMap.get(annotSetName);
        if (annotSet == null){
          continue;
        }
        AnnotationSet annotsOfType = annotSet.get(entityType);
        if (annotsOfType == null){
          continue;
        }
        // Working copy: annotations are removed from it as soon as they are
        // written as part of a co-reference entity.
        Set serializationAnnotSet = new HashSet(annotsOfType);

        // Iterate through the entitiesList in searching for entityType
        Iterator entitiesListIter = entitiesList.iterator();
        while (entitiesListIter.hasNext()){
          List entity = (List)entitiesListIter.next();
          // The type of an entity is the type of its first annotation
          String theEntityType = "";
          if (entity != null && !entity.isEmpty()){
            Integer annotId = (Integer)entity.get(0);
            Annotation a = (Annotation)annotSet.get(annotId);
            if (a != null){
              theEntityType = a.getType();
            }
          }// End if
          // If the types are equal then serialize the entity
          if (theEntityType.equals(entityType)){
            List ent = new ArrayList();
            Iterator entityIter = entity.iterator();
            while (entityIter.hasNext()){
              Integer id = (Integer)entityIter.next();
              ent.add(annotSet.get(id));
            }// End while
            serializeAnEntity(ent);
            // Those annotations must not be exported a second time below
            serializationAnnotSet.removeAll(ent);
          }// End if
        }// End while(entitiesListIter.hasNext())

        // Serialize the remaining annotations, one entity each
        Iterator serializationAnnotSetIter = serializationAnnotSet.iterator();
        while (serializationAnnotSetIter.hasNext()){
          Annotation annotEntity =
                              (Annotation) serializationAnnotSetIter.next();
          List ent = new ArrayList();
          ent.add(annotEntity);
          serializeAnEntity(ent);
        }// End while(serializationAnnotSetIter.hasNext())
      }// End while(annotationSetNamesIter.hasNext())
    }// End while(exportedTypesIter.hasNext())
  }// serializeEntities()

  /** Writes an entity in the xmlDoc conforming to APF standards.
    * @param anEntity represents a list with annotations that refer the same
    * entity (according to the original author, those annotations are
    * detected and constructed by the orthomatcher). A null or empty list
    * writes nothing. The entity type is taken from the first annotation.
    */
  private void serializeAnEntity(List anEntity){
    if (anEntity == null || anEntity.isEmpty()){
      return;
    }
    // Write the entities tags
    xmlDoc.append(" <entity ID=\"" + escapeXml(docId) + "-" +
                                              getNextEntityId() + "\">\n");
    // We know for sure that the list is not empty (see above)
    Annotation a = (Annotation) anEntity.get(0);
    // Locale.ENGLISH keeps the upper-casing independent of the JVM's default
    // locale (a Turkish default would turn 'i' into a dotted capital I).
    xmlDoc.append(" <entity_type>" +
                    escapeXml(a.getType().toUpperCase(Locale.ENGLISH)) +
                    "</entity_type>\n");
    // Write the entities mentions
    Iterator anEntityIter = anEntity.iterator();
    while (anEntityIter.hasNext()){
      Annotation ann = (Annotation)anEntityIter.next();
      serializeAnEntityMention(ann);
    }// End while(anEntityIter.hasNext())
    // Write the entities attributes
    xmlDoc.append(" <entity_attributes>\n");
    anEntityIter = anEntity.iterator();
    while (anEntityIter.hasNext()){
      Annotation ann = (Annotation)anEntityIter.next();
      serializeAnEntityAttributes(ann);
    }// End while(anEntityIter.hasNext())
    xmlDoc.append(" </entity_attributes>\n");
    xmlDoc.append(" </entity>\n");
  }// End serializeAnEntity();

  /** This method serializes an entity mention from an Annotation.
    * The mention type is read from the annotation feature
    * "ENTITY_MENTION_TYPE" and defaults to "NAME". The extent and the head
    * are both written with the full span of the annotation.
    * @param ann the annotation to serialize; null writes nothing.
    */
  private void serializeAnEntityMention(Annotation ann){
    if (ann == null){
      return;
    }
    String entityMentionType = "NAME";
    FeatureMap fm = ann.getFeatures();
    if (fm != null && null != fm.get("ENTITY_MENTION_TYPE")){
      entityMentionType = (String) fm.get("ENTITY_MENTION_TYPE");
    }
    xmlDoc.append(" <entity_mention TYPE=\""+
                                    escapeXml(entityMentionType)+"\">\n");
    // extent
    xmlDoc.append(" <extent>\n");
    serializeCharseq(ann);
    xmlDoc.append(" </extent>\n");
    // head
    xmlDoc.append(" <head>\n");
    serializeCharseq(ann);
    xmlDoc.append(" </head>\n");
    xmlDoc.append(" </entity_mention>\n");
  }//serializeAnEntityMention();

  /** This method serializes an entity attribute (its name) from an Annotation.
    * @param ann the annotation to serialize; null writes nothing.
    */
  private void serializeAnEntityAttributes(Annotation ann){
    if (ann == null){
      return;
    }
    // name
    xmlDoc.append(" <name>\n");
    serializeCharseq(ann);
    xmlDoc.append(" </name>\n");
  }//serializeAnEntityAttributes();

  /** Appends to xmlDoc the charseq element describing the span of an
    * annotation: an xml comment quoting the covered document text, followed
    * by the start offset and the inclusive end offset (the annotation's end
    * offset minus one).
    * When the covered text cannot be read, a warning is printed and the
    * comment is left out; the offsets are still written.
    * @param ann the annotation whose span is written; must not be null.
    */
  private void serializeCharseq(Annotation ann){
    xmlDoc.append(" <charseq>\n");
    try{
      Object coveredText = document.getContent().getContent(
                  ann.getStartNode().getOffset(),ann.getEndNode().getOffset());
      // Document text is arbitrary and "--" is not allowed inside an xml
      // comment, so it has to be made comment-safe first.
      xmlDoc.append(" <!-- string = \"" +
                toCommentSafeText(String.valueOf(coveredText)) + "\" -->\n");
    }catch (InvalidOffsetException ioe){
      Err.prln("APFormatExporter:Warning: Couldn't access text between"+
      " offsets:" + ann.getStartNode().getOffset() + " and "+
      ann.getEndNode().getOffset());
    }// End try
    xmlDoc.append(" <start>"+ann.getStartNode().getOffset()+
        "</start><end>"+(ann.getEndNode().getOffset().longValue() - 1)+
        "</end>\n");
    xmlDoc.append(" </charseq>\n");
  }// serializeCharseq()

  /** Escapes the characters that have a special meaning in xml attribute
    * values and element text: ampersand, less-than, greater-than and the
    * double quote.
    * @param value the text to escape; null is rendered as the text "null",
    * exactly as plain string concatenation would do.
    * @return the escaped text, never null.
    */
  private static String escapeXml(String value){
    String text = String.valueOf((Object)value);
    StringBuffer escaped = new StringBuffer(text.length() + 16);
    for (int i = 0; i < text.length(); i++){
      char c = text.charAt(i);
      switch (c){
        case '&':
          escaped.append("&amp;");
          break;
        case '<':
          escaped.append("&lt;");
          break;
        case '>':
          escaped.append("&gt;");
          break;
        case '"':
          escaped.append("&quot;");
          break;
        default:
          escaped.append(c);
      }// End switch
    }// End for
    return escaped.toString();
  }// escapeXml()

  /** Makes a text safe for inclusion in an xml comment by separating any two
    * adjacent hyphens with a space, because the sequence "--" must not occur
    * within a comment.
    * @param text the text to be quoted inside a comment; must not be null.
    * @return the text with every run of hyphens broken up by spaces.
    */
  private static String toCommentSafeText(String text){
    StringBuffer safe = new StringBuffer(text.length() + 8);
    for (int i = 0; i < text.length(); i++){
      char c = text.charAt(i);
      if (c == '-' && i > 0 && text.charAt(i - 1) == '-'){
        safe.append(' ');
      }
      safe.append(c);
    }// End for
    return safe.toString();
  }// toCommentSafeText()

  /** Returns the next safe ID for an entity*/
  private int getNextEntityId(){
    return entityId ++;
  }// getNextEntityId()

  /** This list of strings represents the entities type that will be exported*/
  private List exportedTypes = null;

  /** This is the name of the dtd file referenced from the DOCTYPE
    * declaration. If it's not present, "ace-pilot-ref.dtd" is referenced.
    */
  private String dtdFileName = null;

  /** This field represent the document id and it is used in generating the
    * entities IDs. It is the file name of the document, without the extension
    */
  private String docId = null;

  /** This field represent an unique entity ID generator. It is not reset
    * between executions, so the numbering continues across documents.
    */
  private int entityId = 1;

  /** This is the xmlDoc that will be created*/
  private StringBuffer xmlDoc = null;

  /** The location under which the export file is created. When it is null
    * the file is written next to the document's source file.
    */
  private URL exportFilePath = null;

  /** The value written in the SOURCE attribute of the source_file element*/
  private String source = null;

}// APFormatExporter
|
APFormatExporter |
|