1   /*
2    *  APFormatExporter.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Cristian URSU, 26/Oct/2001
12   *
13   *  $Id: APFormatExporter.java,v 1.9 2001/11/21 14:18:23 cursu Exp $
14   */
15  
16  package gate.creole;
17  
18  import gate.*;
19  import gate.creole.orthomatcher.*;
20  import gate.creole.ANNIEConstants;
21  import gate.util.*;
22  
23  import java.util.*;
24  import java.net.*;
25  import java.io.*;
26  
/** This class implements an APF XML exporter. It works on documents or corpora
  * and exports them in the APF format.
  */
30  public class APFormatExporter extends AbstractLanguageAnalyser
31                                implements ANNIEConstants{
32    /** Debug flag */
33    private static final boolean DEBUG = false;
34    /** Constructor does nothing. This PR is bean like initialized*/
35    public APFormatExporter() {}
36  
37    /** Run the resource and does the entire export process*/
38    public void execute() throws ExecutionException{
39      // Check if the thing can be run
40      if(document == null)
41        throw new ExecutionException("No document found to export in APF format!");
42      if (exportedTypes == null)
43        throw new ExecutionException("No export types found.");
44      xmlDoc = new StringBuffer(10*(document.getContent().size().intValue()));
45      initDocId();
46      if (docId == null)
47        throw new ExecutionException("Couldn't detect the document's ID");
48      if (DEBUG)
49        Out.prln("Document id = "+ docId);
50  
51      String exportFilePathStr = null;
52      if (exportFilePath == null)
53        exportFilePathStr = new String(document.getSourceUrl().getFile() +
54                                                                    ".apf.xml");
55      else
56        exportFilePathStr = exportFilePath.getPath()+ "/"+docId + ".apf.xml";
57  
58      if (DEBUG)
59        Out.prln("Export file path = "+ exportFilePathStr);
60  //*
61      // Prepare to write into the xmlFile
62      OutputStreamWriter writer = null;
63      try{
64        writer = new OutputStreamWriter(
65                new FileOutputStream(new File(exportFilePathStr)));
66  
67        // Write (test the toXml() method)
68        // This Action is added only when a gate.Document is created.
69        // So, is Bor sure that the resource is a gate.Document
70        serializeDocumentToAPF();
71        writer.write(xmlDoc.toString());
72        writer.flush();
73        writer.close();
74      }catch (Exception e){
75        throw new ExecutionException(e);
76      }// End try
77  //*/
78    } // execute()
79  
80  
81    /** Initialise this resource, and returns it. */
82    public Resource init() throws ResourceInstantiationException {
83      return this;
84    } // init()
85  
86    /** Java bean style mutator for exportedTypes */
87    public void setExportedTypes(List anExportedTypesList){
88      exportedTypes = anExportedTypesList;
89    }// setExportedTypes();
90  
91    /** Java bean style accesor for exportedTypes */
92    public List getExportedTypes(){
93      return exportedTypes;
94    }// getExportedTypes()
95  
96    /** Java bean style mutator for dtdFileName */
97    public void setDtdFileName(String aDtdFileName){
98      dtdFileName = aDtdFileName;
99    }// setDtdFileName();
100 
101   /** Java bean style accesor for DtdFileName */
102   public String getDtdFileName(){
103     return dtdFileName;
104   }// getDtdFileName()
105 
106   /** Java bean style mutator for exportFilePath */
107   public void setExportFilePath(URL anExportFilePath){
108     exportFilePath = anExportFilePath;
109   }// setExportFilePath();
110 
111   /** Java bean style accesor for exportFilePath */
112   public URL getExportFilePath(){
113     return exportFilePath;
114   }// getDtdFileName()
115 
116   /** Java bean style mutator for source */
117   public void setSource(String aSource){
118     source = aSource;
119   }// setSource();
120 
121   /** Java bean style accesor for source */
122   public String getSource(){
123     return source;
124   }// getSource()
125 
126 
127   /** Initialises the docId with documents' file name without the complete path*/
128   private void initDocId(){
129     String fileName = "";
130     fileName = gate.util.Files.getLastPathComponent(
131                                             document.getSourceUrl().getFile());
132     // File name contains now the last token
133     if (DEBUG)
134       Out.prln("From initDocId, fileName ="+ fileName);
135     StringTokenizer fileNameTokenizer = new StringTokenizer(fileName,".");
136     StringBuffer tmpDocId = new StringBuffer("");
137     while(fileNameTokenizer.hasMoreTokens()){
138       String token = (String)fileNameTokenizer.nextToken();
139       // We don't want to append the last token
140       if (fileNameTokenizer.hasMoreTokens())
141         tmpDocId.append(token + ".");
142     }// End while
143     // if tokenization had place
144     if (!"".equals(tmpDocId)){
145       // Remove the last dot
146       tmpDocId.replace(tmpDocId.length()-1,tmpDocId.length(),"");
147       docId = tmpDocId.toString();
148     }// End if
149   }// initDocId()
150 
151   /** Returns the xml document conforming to APF dtd.*/
152   protected void serializeDocumentToAPF(){
153     xmlDoc.append("<?xml version=\"1.0\" ?>\n");
154     xmlDoc.append("<!DOCTYPE source_file SYSTEM ");
155     if (dtdFileName == null)
156       xmlDoc.append("\"ace-pilot-ref.dtd\"");
157     else
158       xmlDoc.append("\""+dtdFileName+"\"");
159     xmlDoc.append(">\n");
160     xmlDoc.append("<source_file TYPE=\"text\" SOURCE=\""+
161                                     source+ "\" VERSION=\"1.2\" URI=\"");
162     xmlDoc.append(docId);
163     xmlDoc.append("-lf\">\n");
164     xmlDoc.append("  <document DOCID=\"");
165     xmlDoc.append(docId + "\">\n");
166     serializeEntities();
167     xmlDoc.append("  </document>\n");
168     xmlDoc.append("</source_file>");
169   }// serializeDocumentToAPF()
170 
171   /** Transforms all the entities from exportedTypes found in the GATE document
172     * into their xml representation
173     */
174   protected void serializeEntities(){
175     // If no types founded then simply return
176     if (exportedTypes == null || exportedTypes.isEmpty()) return;
177 
178     Map entitiesMap = null;
179     if ( document.getFeatures() == null ||
180          document.getFeatures().get(DOCUMENT_COREF_FEATURE_NAME)== null)
181       entitiesMap = new HashMap();
182     else
183       entitiesMap = (Map)document.getFeatures().
184                                         get(DOCUMENT_COREF_FEATURE_NAME);
185     Map namedAnnotSetMap = null;
186     if (document.getNamedAnnotationSets() == null)
187       namedAnnotSetMap = new HashMap();
188     else
189       namedAnnotSetMap = new HashMap(document.getNamedAnnotationSets());
190     // Add the default annoattion set
191     namedAnnotSetMap.put(null,document.getAnnotations());
192     // The entities map is a map from annotation sets names to list of lists
193     // Each list element is composed from annotations refering the same entity
194     // All the entities that are in the exportedTypes need to be serialized.
195     Iterator exportedTypesIter = exportedTypes.iterator();
196     while(exportedTypesIter.hasNext()){
197       String entityType = (String)exportedTypesIter.next();
198       // Serialize all entities of type
199       // The keys in the entitesMap are annotation sets names. The null key
200       // designates the default annotation.
201       Set annotationSetNames = namedAnnotSetMap.keySet();
202       Iterator annotationSetNamesIter = annotationSetNames.iterator();
203       while (annotationSetNamesIter.hasNext()){
204         Object annotSetName = annotationSetNamesIter.next();
205         // This list contains entities found in the annotSetName
206         List entitiesList = (List) entitiesMap.get(annotSetName);
207         if (entitiesList == null) entitiesList = new ArrayList();
208         // This annotation set will contain all annotations of "entityType"
209         AnnotationSet annotSet = null;
210         Set serializationAnnotSet = null;
211         annotSet = (AnnotationSet)namedAnnotSetMap.get(annotSetName);
212         if (annotSet == null || annotSet.get(entityType) == null) continue;
213         serializationAnnotSet = new HashSet(annotSet.get(entityType));
214         // All annotations from annotSet will be serialized as entities unless
215         // some of them are present in the entities map
216         // Now we are searching for the entityType in the entitiesMap and
217         // serialize it from there. After that, remove all annotations
218         // entityType present in entitiesMap from annotSet and serialize the
219         // remaining entities.
220         //Iterate through the entitiesList in searching for entityType
221         Iterator entitiesListIter = entitiesList.iterator();
222         while (entitiesListIter.hasNext()){
223           List entity = (List)entitiesListIter.next();
224           // We want now to accesate an annotation from the entity list to get
225           // its type and compare it with entityType
226           String theEntityType = new String("");
227           if (entity != null && !entity.isEmpty()){
228             Integer annotId = (Integer)entity.get(0);
229             Annotation a = (Annotation)annotSet.get(annotId);
230             if (a != null) theEntityType = a.getType();
231           }// End if
232           // The the types are equal then serialize the entities
233           if (theEntityType.equals(entityType)){
234             List ent = new ArrayList();
235             Iterator entityIter = entity.iterator();
236             while(entityIter.hasNext()){
237               Integer id = (Integer)entityIter.next();
238               ent.add(annotSet.get(id));
239             }// End while
240             serializeAnEntity(ent);
241             // Remove all annotation from entity that apear in annotSet
242             serializationAnnotSet.removeAll(ent);
243           }// End if
244         }// End while(entitiesListIter.hasNext())
245         // Serialize the remaining entities in annotSet
246         Iterator serializationAnnotSetIter = serializationAnnotSet.iterator();
247         while(serializationAnnotSetIter.hasNext()){
248           Annotation annotEntity = (Annotation) serializationAnnotSetIter.next();
249           List ent = new ArrayList();
250           ent.add(annotEntity);
251           serializeAnEntity(ent);
252         }// End while(annotSetIter.hasNext())
253       }// End while(entitiesKeysIter.hasNext())
254     }// End while(exportedTypesIter.hasNext())
255   }// serializeEntities()
256 
257   /** Writes an entity in the xmlDoc conforming to APF standards.
258     * @param anEntity represents a list with annotations that refer the same
259     * entity. Those annotations were detected and constructed by the
260     * orthomatcher.
261     */
262   private void serializeAnEntity(List anEntity){
263     if (anEntity == null || anEntity.isEmpty()) return;
264     // Write the entities tags
265     xmlDoc.append("  <entity ID=\"" + docId + "-" + getNextEntityId() + "\">\n");
266     // We know for sure that the list is not empty (see above)
267     Annotation a = (Annotation) anEntity.get(0);
268     xmlDoc.append("    <entity_type>" + a.getType().toUpperCase() +
269      "</entity_type>\n");
270     // Write the entities mentions
271     Iterator anEntityIter = anEntity.iterator();
272     while(anEntityIter.hasNext()){
273       Annotation ann = (Annotation)anEntityIter.next();
274       serializeAnEntityMention(ann);
275     }// End while(anEntityIter.hasNext())
276     // Write the entities attributes
277     xmlDoc.append("      <entity_attributes>\n");
278     anEntityIter = anEntity.iterator();
279     while(anEntityIter.hasNext()){
280       Annotation ann = (Annotation)anEntityIter.next();
281       serializeAnEntityAttributes(ann);
282     }// End while(anEntityIter.hasNext())
283     xmlDoc.append("      </entity_attributes>\n");
284     xmlDoc.append("  </entity>\n");
285   }// End serializeAnEntity();
286 
287   /** This method serializes an entity mention from an Annotation*/
288   private void serializeAnEntityMention(Annotation ann){
289     if (ann == null) return;
290     String entityMentionType = "NAME";
291     FeatureMap fm = ann.getFeatures();
292     if (fm != null && null != fm.get("ENTITY_MENTION_TYPE"))
293       entityMentionType = (String) fm.get("ENTITY_MENTION_TYPE");
294     xmlDoc.append("      <entity_mention TYPE=\""+entityMentionType+"\">\n");
295     // extent
296     xmlDoc.append("        <extent>\n");
297     xmlDoc.append("          <charseq>\n");
298     try{
299       xmlDoc.append("          <!-- string = \"" +
300             document.getContent().getContent(ann.getStartNode().getOffset(),
301                                       ann.getEndNode().getOffset())+"\" -->\n");
302     }catch (InvalidOffsetException ioe){
303       Err.prln("APFormatExporter:Warning: Couldn't access text between"+
304       " offsets:" + ann.getStartNode().getOffset() + " and "+
305       ann.getEndNode().getOffset());
306     }// End try
307     xmlDoc.append("          <start>"+ann.getStartNode().getOffset()+
308         "</start><end>"+(ann.getEndNode().getOffset().longValue() - 1)+"</end>\n");
309     xmlDoc.append("          </charseq>\n");
310     xmlDoc.append("        </extent>\n");
311     // head
312     xmlDoc.append("        <head>\n");
313     xmlDoc.append("          <charseq>\n");
314     try{
315       xmlDoc.append("          <!-- string = \"" +
316             document.getContent().getContent(ann.getStartNode().getOffset(),
317                                       ann.getEndNode().getOffset())+"\" -->\n");
318     }catch (InvalidOffsetException ioe){
319       Err.prln("APFormatExporter:Warning: Couldn't access text between"+
320       " offsets:" + ann.getStartNode().getOffset() + " and "+
321       ann.getEndNode().getOffset());
322     }// End try
323     xmlDoc.append("          <start>"+ann.getStartNode().getOffset()+
324         "</start><end>"+(ann.getEndNode().getOffset().longValue() - 1)+"</end>\n");
325     xmlDoc.append("          </charseq>\n");
326     xmlDoc.append("        </head>\n");
327     xmlDoc.append("      </entity_mention>\n");
328   }//serializeAnEntityMention();
329 
330   /** This method serializes an entity attribute from an Annotation*/
331   private void serializeAnEntityAttributes(Annotation ann){
332     if (ann == null) return;
333 
334     // name
335     xmlDoc.append("        <name>\n");
336     xmlDoc.append("          <charseq>\n");
337     try{
338       xmlDoc.append("          <!-- string = \"" +
339             document.getContent().getContent(ann.getStartNode().getOffset(),
340                                       ann.getEndNode().getOffset())+"\" -->\n");
341     }catch (InvalidOffsetException ioe){
342       Err.prln("APFormatExporter:Warning: Couldn't access text between"+
343       " offsets:" + ann.getStartNode().getOffset() + " and "+
344       ann.getEndNode().getOffset());
345     }// End try
346     xmlDoc.append("          <start>"+ann.getStartNode().getOffset()+
347         "</start><end>"+(ann.getEndNode().getOffset().longValue() - 1)+"</end>\n");
348     xmlDoc.append("          </charseq>\n");
349     xmlDoc.append("        </name>\n");
350   }//serializeAnEntityMention();
351 
352   /** Returns the next safe ID for an entity*/
353   private int getNextEntityId(){
354     return entityId ++;
355   }// getNextEntityId()
356 
357   /** This list of strings represents the entities type that will be exported*/
358   private List exportedTypes = null;
359   /** This is the name of the dtd file. If it's not present no dtd would be
360     * written in the APF file.
361     */
362   private String dtdFileName = null;
363   /** This field represent the document id and it is used in generating the
364     * entities IDs. It is the file name of the document, without the extension
365     */
366   private String docId = null;
367 
368   /** This field represent an unique entity ID generator*/
369   private int entityId = 1;
370   /** This is the xmlDoc that will be created*/
371   private StringBuffer xmlDoc = null;
372 
373   private URL exportFilePath = null;
374 
375   /** The source attribute for source*/
376   private String source = null;
377 
378 }// APFormatExporter