|
APFormatExporter |
|
/*
 *  APFormatExporter.java
 *
 *  Copyright (c) 1998-2001, The University of Sheffield.
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  Cristian URSU, 26/Oct/2001
 *
 *  $Id: APFormatExporter.java,v 1.15 2002/03/06 17:15:39 kalina Exp $
 */

package gate.creole;

import gate.*;
import gate.creole.orthomatcher.*;
import gate.creole.ANNIEConstants;
import gate.util.*;

import java.util.*;
import java.net.*;
import java.io.*;

/** This class implements an APF xml exporter. It works on a GATE document and
  * writes the annotations of the configured types as APF entities into a
  * file named after the document.
  */
public class APFormatExporter extends AbstractLanguageAnalyser
                              implements ANNIEConstants{
  public static final String
    APF_EXP_DOCUMENT_PARAMETER_NAME = "document";

  public static final String
    APF_EXP_SOURCE_PARAMETER_NAME = "source";

  public static final String
    APF_EXP_DTD_PARAMETER_NAME = "dtdFileName";

  public static final String
    APF_EXP_PATH_PARAMETER_NAME = "exportFilePath";

  public static final String
    APF_EXP_TYPES_PARAMETER_NAME = "exportedTypes";

  public static final String
    APF_EXP_WRITE_SOURCE_PARAMETER_NAME = "isSourceWritten";

  /** Debug flag */
  private static final boolean DEBUG = false;

  /** Encoding used for the exported file. The XML declaration written by
    * serializeDocumentToAPF() names no encoding, which XML parsers read as
    * UTF-8, so the bytes on disk have to be UTF-8 as well.
    */
  private static final String OUTPUT_ENCODING = "UTF-8";

  /** Constructor does nothing. This PR is bean like initialized*/
  public APFormatExporter() {}

  /** Runs the resource and does the entire export process: builds the APF
    * xml in memory and writes it to the export file.
    * The file is <code>exportFilePath/docId.apf.xml</code> when an export
    * path was set, otherwise the document's source file name with
    * <code>.apf.xml</code> appended.
    * @throws ExecutionException if there is no document, no exported types,
    * no document ID could be derived, or anything fails while serializing or
    * writing the file.
    */
  public void execute() throws ExecutionException{
    // Check if the thing can be run
    if(document == null)
      throw new ExecutionException("No document found to export in APF format!");
    if (exportedTypes == null)
      throw new ExecutionException("No export types found.");
    xmlDoc = new StringBuffer(10*(document.getContent().size().intValue()));
    initDocId();
    if (docId == null)
      throw new ExecutionException("Couldn't detect the document's ID");
    if (DEBUG)
      Out.prln("Document id = "+ docId);

    // A non null docId implies that the document has a source URL
    // (see initDocId()), so it is safe to dereference it here.
    String exportFilePathStr = null;
    if (exportFilePath == null)
      exportFilePathStr = document.getSourceUrl().getFile() + ".apf.xml";
    else
      exportFilePathStr = exportFilePath.getPath()+ "/"+docId + ".apf.xml";

    if (DEBUG)
      Out.prln("Export file path = "+ exportFilePathStr);

    // Prepare to write into the xmlFile
    Writer writer = null;
    try{
      writer = new OutputStreamWriter(
                 new FileOutputStream(new File(exportFilePathStr)),
                 OUTPUT_ENCODING);
      serializeDocumentToAPF();
      writer.write(xmlDoc.toString());
      writer.flush();
      // Close on the normal path so that a failure to close is reported
      writer.close();
      writer = null;
    }catch (Exception e){
      throw new ExecutionException(e);
    }finally{
      // Only reached with an open writer when something above failed
      if (writer != null){
        try{
          writer.close();
        }catch (IOException ignored){
          // The original failure is already being propagated to the caller
        }// End try
      }// End if
    }// End try
  } // execute()


  /** Initialise this resource, and returns it. */
  public Resource init() throws ResourceInstantiationException {
    return this;
  } // init()

  /** Java bean style mutator for exportedTypes */
  public void setExportedTypes(List anExportedTypesList){
    exportedTypes = anExportedTypesList;
  }// setExportedTypes();

  /** Java bean style accesor for exportedTypes */
  public List getExportedTypes(){
    return exportedTypes;
  }// getExportedTypes()

  /** Java bean style mutator for dtdFileName */
  public void setDtdFileName(String aDtdFileName){
    dtdFileName = aDtdFileName;
  }// setDtdFileName();

  /** Java bean style accesor for DtdFileName */
  public String getDtdFileName(){
    return dtdFileName;
  }// getDtdFileName()

  /** Java bean style mutator for exportFilePath */
  public void setExportFilePath(URL anExportFilePath){
    exportFilePath = anExportFilePath;
  }// setExportFilePath();

  /** Java bean style accesor for exportFilePath */
  public URL getExportFilePath(){
    return exportFilePath;
  }// getExportFilePath()

  /** Java bean style mutator for source */
  public void setSource(String aSource){
    source = aSource;
  }// setSource();

  /** Java bean style accesor for source */
  public String getSource(){
    return source;
  }// getSource()

  /** Java bean style accesor for isSourceWritten */
  public Boolean getIsSourceWritten() {
    return isSourceWritten ? Boolean.TRUE : Boolean.FALSE;
  }// getIsSourceWritten()

  /** Java bean style mutator for isSourceWritten */
  public void setIsSourceWritten(Boolean aIsSourceWritten){
    isSourceWritten = aIsSourceWritten.booleanValue();
  }// setIsSourceWritten();


  /** Initialises the docId with the document's file name, without the path
    * and without the last dot separated token (the extension).
    * docId is left null when the document has no source URL or when the file
    * name contains no extension to strip; execute() reports that case.
    */
  private void initDocId(){
    // Never reuse the ID computed for a previously exported document
    docId = null;
    URL sourceUrl = document.getSourceUrl();
    if (sourceUrl == null) return;
    String fileName = gate.util.Files.getLastPathComponent(sourceUrl.getFile());
    // File name contains now the last token
    if (DEBUG)
      Out.prln("From initDocId, fileName ="+ fileName);
    if (fileName == null) return;
    StringTokenizer fileNameTokenizer = new StringTokenizer(fileName,".");
    StringBuffer tmpDocId = new StringBuffer();
    while(fileNameTokenizer.hasMoreTokens()){
      String token = fileNameTokenizer.nextToken();
      // We don't want to append the last token
      if (fileNameTokenizer.hasMoreTokens())
        tmpDocId.append(token).append('.');
    }// End while
    // If at least one token was kept, the buffer ends with a dot to drop.
    // (Comparing the buffer itself with "" is always false, hence length().)
    if (tmpDocId.length() > 0){
      tmpDocId.setLength(tmpDocId.length() - 1);
      docId = tmpDocId.toString();
    }// End if
  }// initDocId()

  /** Appends to xmlDoc the whole APF document: xml declaration, DOCTYPE,
    * the source_file and document elements and all exported entities.
    */
  protected void serializeDocumentToAPF(){
    xmlDoc.append("<?xml version=\"1.0\" ?>\n");
    xmlDoc.append("<!DOCTYPE source_file SYSTEM ");
    if (dtdFileName == null)
      xmlDoc.append("\"ace-pilot-ref.dtd\"");
    else
      xmlDoc.append("\""+dtdFileName+"\"");
    xmlDoc.append(">\n");
    xmlDoc.append("<source_file TYPE=\"text\"");
    if (isSourceWritten)
      xmlDoc.append(" SOURCE=\""+ escapeAttribute(source) + "\"");
    // The leading blank is required: attributes must be separated by
    // white space for the document to be well formed.
    xmlDoc.append(" VERSION=\"1.2\" URI=\"");
    xmlDoc.append(escapeAttribute(docId));
    xmlDoc.append("-lf\">\n");
    xmlDoc.append(" <document DOCID=\"");
    xmlDoc.append(escapeAttribute(docId) + "\">\n");
    serializeEntities();
    xmlDoc.append(" </document>\n");
    xmlDoc.append("</source_file>");
  }// serializeDocumentToAPF()

  /** Transforms all the entities from exportedTypes found in the GATE document
    * into their xml representation.
    * For each exported type and each annotation set (named sets plus the
    * default one), the coreference chains stored under the document feature
    * DOCUMENT_COREF_FEATURE_NAME are written first, one entity per chain;
    * every remaining annotation of that type is then written as an entity
    * of its own.
    */
  protected void serializeEntities(){
    // If no types founded then simply return
    if (exportedTypes == null || exportedTypes.isEmpty()) return;

    // The entities map is a map from annotation sets names to list of lists
    // Each list element is composed from annotation IDs refering the same
    // entity
    Map entitiesMap = null;
    if (document.getFeatures() != null)
      entitiesMap = (Map) document.getFeatures().
                                           get(DOCUMENT_COREF_FEATURE_NAME);
    if (entitiesMap == null) entitiesMap = new HashMap();

    Map namedAnnotSetMap = null;
    if (document.getNamedAnnotationSets() == null)
      namedAnnotSetMap = new HashMap();
    else
      namedAnnotSetMap = new HashMap(document.getNamedAnnotationSets());
    // Add the default annotation set. The null key designates it.
    namedAnnotSetMap.put(null,document.getAnnotations());

    // All the entities that are in the exportedTypes need to be serialized.
    Iterator exportedTypesIter = exportedTypes.iterator();
    while(exportedTypesIter.hasNext()){
      String entityType = (String)exportedTypesIter.next();
      Iterator annotationSetNamesIter = namedAnnotSetMap.keySet().iterator();
      while (annotationSetNamesIter.hasNext()){
        Object annotSetName = annotationSetNamesIter.next();
        AnnotationSet annotSet =
                            (AnnotationSet)namedAnnotSetMap.get(annotSetName);
        if (annotSet == null) continue;
        AnnotationSet typedAnnots = annotSet.get(entityType);
        if (typedAnnots == null) continue;
        // Annotations of entityType still waiting to be serialized
        Set serializationAnnotSet = new HashSet(typedAnnots);

        // This list contains entities found in the annotSetName
        List entitiesList = (List) entitiesMap.get(annotSetName);
        if (entitiesList == null) entitiesList = new ArrayList();

        // Serialize the chains whose first annotation is of entityType
        Iterator entitiesListIter = entitiesList.iterator();
        while (entitiesListIter.hasNext()){
          List entity = (List)entitiesListIter.next();
          if (entity == null || entity.isEmpty()) continue;
          // The type of a chain is the type of its first annotation
          Annotation first = (Annotation)annotSet.get((Integer)entity.get(0));
          if (first == null || !entityType.equals(first.getType())) continue;

          List ent = new ArrayList();
          Iterator entityIter = entity.iterator();
          while(entityIter.hasNext()){
            Annotation member = (Annotation)annotSet.get(
                                                  (Integer)entityIter.next());
            // IDs that are not in this annotation set are skipped
            if (member != null) ent.add(member);
          }// End while
          serializeAnEntity(ent);
          // Those annotations must not be serialized a second time
          serializationAnnotSet.removeAll(ent);
        }// End while(entitiesListIter.hasNext())

        // Serialize the remaining annotations, each one as its own entity
        Iterator serializationAnnotSetIter = serializationAnnotSet.iterator();
        while(serializationAnnotSetIter.hasNext()){
          List ent = new ArrayList();
          ent.add(serializationAnnotSetIter.next());
          serializeAnEntity(ent);
        }// End while(serializationAnnotSetIter.hasNext())
      }// End while(annotationSetNamesIter.hasNext())
    }// End while(exportedTypesIter.hasNext())
  }// serializeEntities()

  /** Writes an entity in the xmlDoc conforming to APF standards.
    * @param anEntity represents a list with annotations that refer the same
    * entity. Those annotations were detected and constructed by the
    * orthomatcher. Nothing is written for a null or empty list.
    */
  private void serializeAnEntity(List anEntity){
    if (anEntity == null || anEntity.isEmpty()) return;
    // Write the entities tags
    xmlDoc.append(" <entity ID=\"" + escapeAttribute(docId) + "-" +
                                                getNextEntityId() + "\">\n");
    // We know for sure that the list is not empty (see above)
    Annotation a = (Annotation) anEntity.get(0);
    xmlDoc.append(" <entity_type>" + a.getType().toUpperCase() +
                                                        "</entity_type>\n");
    // Write the entities mentions
    Iterator anEntityIter = anEntity.iterator();
    while(anEntityIter.hasNext()){
      serializeAnEntityMention((Annotation)anEntityIter.next());
    }// End while(anEntityIter.hasNext())
    // Write the entities attributes
    xmlDoc.append(" <entity_attributes>\n");
    anEntityIter = anEntity.iterator();
    while(anEntityIter.hasNext()){
      serializeAnEntityAttributes((Annotation)anEntityIter.next());
    }// End while(anEntityIter.hasNext())
    xmlDoc.append(" </entity_attributes>\n");
    xmlDoc.append(" </entity>\n");
  }// End serializeAnEntity();

  /** This method serializes an entity mention from an Annotation.
    * The mention TYPE is taken from the ENTITY_MENTION_TYPE feature and
    * defaults to NAME; the ROLE, REFERENCE and GENERIC attributes are written
    * only when the annotation has the feature with the same name.
    */
  private void serializeAnEntityMention(Annotation ann){
    if (ann == null) return;
    Object entityMentionType = "NAME";
    Object entityMentionRole = null;
    Object entityMentionReference = null;
    Object entityMentionGeneric = null;

    FeatureMap fm = ann.getFeatures();
    if (fm != null){
      if( null != fm.get("ENTITY_MENTION_TYPE"))
        entityMentionType = fm.get("ENTITY_MENTION_TYPE");

      entityMentionRole = fm.get("ROLE");
      entityMentionReference = fm.get("REFERENCE");
      entityMentionGeneric = fm.get("GENERIC");
    }// End if

    // Every attribute carries its own leading blank, so that consecutive
    // attributes are always separated by white space.
    xmlDoc.append(" <entity_mention TYPE=\"" +
                  escapeAttribute(entityMentionType.toString()) + "\"");
    if (entityMentionRole != null)
      xmlDoc.append(" ROLE=\"" +
                    escapeAttribute(entityMentionRole.toString()) + "\"");
    if (entityMentionReference != null)
      xmlDoc.append(" REFERENCE=\"" +
                    escapeAttribute(entityMentionReference.toString()) + "\"");
    if (entityMentionGeneric != null)
      xmlDoc.append(" GENERIC=\"" +
                    escapeAttribute(entityMentionGeneric.toString()) + "\"");
    xmlDoc.append(">\n");
    // extent
    xmlDoc.append(" <extent>\n");
    appendCharseq(ann);
    xmlDoc.append(" </extent>\n");
    // head
    xmlDoc.append(" <head>\n");
    appendCharseq(ann);
    xmlDoc.append(" </head>\n");
    xmlDoc.append(" </entity_mention>\n");
  }//serializeAnEntityMention();

  /** This method serializes an entity attribute from an Annotation.
    * A name element is written only for annotations whose
    * ENTITY_MENTION_TYPE feature is either missing or equal to NAME.
    */
  private void serializeAnEntityAttributes(Annotation ann){
    if (ann == null) return;
    FeatureMap fm = ann.getFeatures();
    Object mentionType = (fm == null)? null : fm.get("ENTITY_MENTION_TYPE");
    if (mentionType != null && !"NAME".equals(mentionType))
      return;

    // name
    xmlDoc.append(" <name>\n");
    appendCharseq(ann);
    xmlDoc.append(" </name>\n");
  }//serializeAnEntityAttributes();

  /** Appends the charseq element of an annotation: a comment holding the
    * covered text followed by the start offset and the end offset minus one.
    * If the text cannot be read a warning is printed and the comment is
    * left out; the offsets are written anyway.
    */
  private void appendCharseq(Annotation ann){
    Long startOffset = ann.getStartNode().getOffset();
    Long endOffset = ann.getEndNode().getOffset();
    xmlDoc.append(" <charseq>\n");
    try{
      String text = document.getContent().
                              getContent(startOffset, endOffset).toString();
      xmlDoc.append(" <!-- string = \"" + toCommentText(text) + "\" -->\n");
    }catch (InvalidOffsetException ioe){
      Err.prln("APFormatExporter:Warning: Couldn't access text between"+
               " offsets:" + startOffset + " and "+ endOffset);
    }// End try
    xmlDoc.append(" <start>"+ startOffset +
                  "</start><end>"+(endOffset.longValue() - 1)+"</end>\n");
    xmlDoc.append(" </charseq>\n");
  }// appendCharseq()

  /** Escapes the characters that are not allowed inside a double quoted
    * xml attribute value. A null value is written as the text "null".
    */
  private static String escapeAttribute(String value){
    String text = (value == null)? "null" : value;
    StringBuffer escaped = new StringBuffer(text.length() + 16);
    for (int i = 0; i < text.length(); i++){
      char c = text.charAt(i);
      switch (c){
        case '&': escaped.append("&amp;"); break;
        case '<': escaped.append("&lt;"); break;
        case '>': escaped.append("&gt;"); break;
        case '"': escaped.append("&quot;"); break;
        default: escaped.append(c);
      }// End switch
    }// End for
    return escaped.toString();
  }// escapeAttribute()

  /** Makes a text safe to be placed inside an xml comment by putting a blank
    * between consecutive hyphens ("--" is not allowed inside a comment).
    */
  private static String toCommentText(String text){
    StringBuffer safe = new StringBuffer(text.length() + 8);
    for (int i = 0; i < text.length(); i++){
      char c = text.charAt(i);
      if (c == '-' && safe.length() > 0 && safe.charAt(safe.length() - 1) == '-')
        safe.append(' ');
      safe.append(c);
    }// End for
    return safe.toString();
  }// toCommentText()

  /** Returns the next safe ID for an entity*/
  private int getNextEntityId(){
    return entityId ++;
  }// getNextEntityId()

  /** This list of strings represents the entities type that will be exported*/
  private List exportedTypes = null;
  /** This is the name of the dtd file. If it's not present the default
    * "ace-pilot-ref.dtd" is written in the APF file.
    */
  private String dtdFileName = null;
  /** This field represent the document id and it is used in generating the
    * entities IDs. It is the file name of the document, without the extension
    */
  private String docId = null;

  /** This field represent an unique entity ID generator. It is never reset
    * by this class, so numbering continues across executions.
    */
  private int entityId = 1;
  /** This is the xmlDoc that will be created*/
  private StringBuffer xmlDoc = null;

  /** The location where the APF file is written. If it's not present the
    * file is written next to the document's source file.
    */
  private URL exportFilePath = null;

  /** The value of the SOURCE attribute of the source_file element*/
  private String source = null;

  /** Tells if the SOURCE attribute is written in the source_file element*/
  private boolean isSourceWritten = true;


}// APFormatExporter
|
APFormatExporter |
|