1   /*
2    *  DocumentImpl.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Hamish Cunningham, 11/Feb/2000
12   *
13   *  $Id: DocumentImpl.java,v 1.118 2003/01/20 15:45:06 valyt Exp $
14   */
15  
16  package gate.corpora;
17  
18  import java.util.*;
19  import java.net.*;
20  import java.io.*;
21  
22  import gate.*;
23  import gate.annotation.*;
24  import gate.util.*;
25  import gate.creole.*;
26  import gate.gui.*;
27  import gate.event.*;
28  
29  /** Represents the commonalities between all sorts of documents.
30    *
31    * <H2>Editing</H2>
32    *
33    * <P>
34    * The DocumentImpl class implements the Document interface.
35    * The DocumentContentImpl class models the textual or audio-visual
36    * materials which are the source and content of Documents.
37    * The AnnotationSetImpl class supplies annotations on Documents.
38    *
39    * <P>
40    * Abbreviations:
41    *
42    * <UL>
43    * <LI>
44    * DC = DocumentContent
45    * <LI>
46    * D = Document
47    * <LI>
48    * AS = AnnotationSet
49    * </UL>
50    *
51    * <P>
52    * We add an edit method to each of these classes; for DC and AS
53    * the methods are package private; D has the public method.
54    *
55    * <PRE>
56    *   void edit(Long start, Long end, DocumentContent replacement)
57    *   throws InvalidOffsetException;
58    * </PRE>
59    *
60    * <P>
61    * D receives edit requests and forwards them to DC and AS.
62    * On DC, this method makes a change to the content - e.g. replacing
63    * a String range from start to end with replacement. (Deletions
64    * are catered for by having replacement = null.) D then calls
65    * AS.edit on each of its annotation sets.
66    *
67    * <P>
68    * On AS, edit calls replacement.size() (i.e. DC.size()) to
69    * figure out how long the replacement is (0 for null). It then
70    * considers annotations that terminate (start or end) in
71    * the altered or deleted range as invalid; annotations that
72    * terminate after the range have their offsets adjusted.
73    * I.e.:
74    * <UL>
75    * <LI>
76    * the nodes that pointed inside the old modified area are invalid now and
77    * will be deleted along with the connected annotations;
78    * <LI>
79    * the nodes that are before the start of the modified area remain
80    * untouched;
81    * <LI>
82    * the nodes that are after the end of the affected area will have the
83    * offset changed according to the formula below.
84    * </UL>
85    *
86    * <P>
87    * A note re. AS and annotations: annotations no longer have
88    * offsets as in the old model, they now have nodes, and nodes
89    * have offsets.
90    *
91    * <P>
92    * To implement AS.edit, we have several indices:
93    * <PRE>
94    *   HashMap annotsByStartNode, annotsByEndNode;
95    * </PRE>
96    * which map node ids to annotations;
97    * <PRE>
98    *   RBTreeMap nodesByOffset;
99    * </PRE>
100   * which maps offset to Nodes.
101   *
102   * <P>
103   * When we get an edit request, we traverse that part of the
104   * nodesByOffset tree representing the altered or deleted
105   * range of the DC. For each node found, we delete any annotations
106   * that terminate on the node, and then delete the node itself.
107   * We then traverse the rest of the tree, changing the offset
108   * on all remaining nodes by:
109   * <PRE>
110   *   newOffset =
111   *     oldOffset -
112   *     (
113   *       (end - start) -                                     // size of mod
114   *       ( (replacement == null) ? 0 : replacement.size() )  // size of repl
115   *     );
116   * </PRE>
117   * Note that we use the same convention as e.g. java.lang.String: start
118   * offsets are inclusive; end offsets are exclusive. I.e. for string "abcd"
119   * range 1-3 = "bc". Examples, for a node with offset 4:
120   * <PRE>
121   * edit(1, 3, "BC");
122   * newOffset = 4 - ( (3 - 1) - 2 ) = 4
123   *
124   * edit(1, 3, null);
125   * newOffset = 4 - ( (3 - 1) - 0 ) = 2
126   *
127   * edit(1, 3, "BBCC");
128   * newOffset = 4 - ( (3 - 1) - 4 ) = 6
129   * </PRE>
130   */
131 public class DocumentImpl
132 extends AbstractLanguageResource implements TextualDocument, CreoleListener,
133                                             DatastoreListener {
134   /** Debug flag */
135   private static final boolean DEBUG = false;
136 
137   /** If you set this flag to true the original content of the document will
138    *  be kept in the document feature. <br>
139    *  Default value is false to avoid the unnecessary waste of memory */
140   private Boolean preserveOriginalContent = new Boolean(false);
141 
142   /** If you set this flag to true the repositioning information for
143    *  the document will be kept in the document feature. <br>
144    *  Default value is false to avoid the unnecessary waste of time and memory
145    */
146   private Boolean collectRepositioningInfo = new Boolean(false);
147 
148   /**
149    * This is a variable which contains the latest crossed over annotation
150    * found during export with preserving format, i.e., toXml(annotations)
151    * method.
152    */
153   private Annotation crossedOverAnnotation = null;
154 
155   /** Default construction. Content left empty. */
156   public DocumentImpl() {
157     content = new DocumentContentImpl();
158   } // default construction
159 
160   /** Initialise this resource, and return it. */
161   public Resource init() throws ResourceInstantiationException {
162     // set up the source URL and create the content
163     if(sourceUrl == null) {
164       if(stringContent == null) {
165         throw new ResourceInstantiationException(
166           "The sourceURL and document's content were null."
167         );
168       }
169 
170       content = new DocumentContentImpl(stringContent);
171       getFeatures().put("gate.SourceURL", "created from String");
172     } else {
173       try {
174         content = new DocumentContentImpl(
175           sourceUrl, getEncoding(), sourceUrlStartOffset, sourceUrlEndOffset);
176         getFeatures().put("gate.SourceURL", sourceUrl.toExternalForm());
177       } catch(IOException e) {
178         e.printStackTrace();
179 //        throw new ResourceInstantiationException("DocumentImpl.init: " + e);
180       }
181 
182       if(preserveOriginalContent.booleanValue() && content != null) {
183         String originalContent = new String(
184           ((DocumentContentImpl) content).getOriginalContent());
185         getFeatures().put(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME,
186                       originalContent);
187       } // if
188     }
189 
190     // set up a DocumentFormat if markup unpacking required
191     if(getMarkupAware().booleanValue()) {
192       DocumentFormat docFormat =
193         DocumentFormat.getDocumentFormat(this, sourceUrl);
194       try {
195         if(docFormat != null){
196           StatusListener sListener = (StatusListener)
197                                       gate.gui.MainFrame.getListeners().
198                                       get("gate.event.StatusListener");
199           if(sListener != null) docFormat.addStatusListener(sListener);
200 
201           // set the flag if true and if the document format support collecting
202           docFormat.setShouldCollectRepositioning(collectRepositioningInfo);
203 
204           if(docFormat.getShouldCollectRepositioning().booleanValue()) {
205             // unpack with collectiong of repositioning information
206             RepositioningInfo info = new RepositioningInfo();
207 
208             String origContent = (String) getFeatures().get(
209                 GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME);
210 
211             RepositioningInfo ampCodingInfo = new RepositioningInfo();
212             if(origContent != null) {
213               boolean shouldCorrectCR = docFormat instanceof XmlDocumentFormat;
214               collectInformationForAmpCodding(origContent, ampCodingInfo,
215                                               shouldCorrectCR);
216               if(docFormat instanceof HtmlDocumentFormat) {
217                 collectInformationForWS(origContent, ampCodingInfo);
218               } // if
219             } // if
220 
221             docFormat.unpackMarkup(this, info, ampCodingInfo);
222 
223             if(origContent != null
224                 && docFormat instanceof XmlDocumentFormat) {
225               // CRLF correction of RepositioningInfo
226               correctRepositioningForCRLFInXML(origContent, info);
227             } // if
228 
229             getFeatures().put(
230                 GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME, info);
231           }
232           else {
233             // normal old fashioned unpack
234             docFormat.unpackMarkup(this);
235           }
236           docFormat.removeStatusListener(sListener);
237        } //if format != null
238       } catch(DocumentFormatException e) {
239         throw new ResourceInstantiationException(
240           "Couldn't unpack markup in document " + sourceUrl.toExternalForm() +
241           " " + e
242         );
243       }
244     } // if markup aware
245 
246 //try{
247 //  FileWriter fw = new FileWriter("d:/temp/doccontent.txt");
248 //  fw.write(getContent().toString());
249 //  fw.flush();
250 //  fw.close();
251 //}catch(IOException ioe){
252 //  ioe.printStackTrace();
253 //}
254 
255     return this;
256   } // init()
257 
258   /**
259    * Correct repositioning information for substitution of "\r\n" with "\n"
260    */
261   private void correctRepositioningForCRLFInXML(String content,
262                                             RepositioningInfo info) {
263     int index = -1;
264 
265     do {
266       index = content.indexOf("\r\n", index+1);
267       if(index != -1) {
268         info.correctInformationOriginalMove(index, 1);
269       } // if
270     } while(index != -1);
271   } // correctRepositioningForCRLF
272 
273   /**
274    * Collect information for substitution of "&xxx;" with "y"
275    *
276    * It couldn't be collected a position information about
277    * some unicode and &-coded symbols during parsing. The parser "hide" the
278    * information about the position of such kind of parsed text.
279    * So, there is minimal chance to have &-coded symbol inside the covered by
280    * repositioning records area. The new record should be created for every
281    * coded symbol outside the existing records.
282    * <BR>
283    * If <code>shouldCorrectCR</code> flag is <code>true</code> the correction
284    * for CRLF substitution is performed.
285    */
286   private void collectInformationForAmpCodding(String content,
287                                             RepositioningInfo info,
288                                             boolean shouldCorrectCR) {
289 
290     if(content == null || info == null) return;
291 
292     int ampIndex = -1;
293     int semiIndex;
294 
295     do {
296       ampIndex = content.indexOf('&', ampIndex+1);
297       if(ampIndex != -1) {
298         semiIndex = content.indexOf(';', ampIndex+1);
299         // have semicolon and it is near enough for amp codding
300         if(semiIndex != -1 && (semiIndex-ampIndex) < 8) {
301           info.addPositionInfo(ampIndex, semiIndex-ampIndex+1, 0, 1);
302         }
303         else {
304           // no semicolon or it is too far
305           // analyse for amp codding without semicolon
306           int maxEnd = Math.min(ampIndex+8, content.length());
307           String ampCandidate = content.substring(ampIndex, maxEnd);
308           int ampCodingSize = analyseAmpCodding(ampCandidate);
309 
310           if(ampCodingSize != -1) {
311             info.addPositionInfo(ampIndex, ampCodingSize, 0, 1);
312           } // if
313 
314         } // if - semicolon found
315       } // if - ampersand found
316     } while (ampIndex != -1);
317 
318     // correct the collected information to adjust it's positions
319     // with reported by the parser
320     int index = -1;
321 
322     if(shouldCorrectCR) {
323       do {
324         index = content.indexOf("\r\n", index+1);
325         if(index != -1) {
326           info.correctInformationOriginalMove(index, -1);
327         } // if
328       } while(index != -1);
329     } // if
330   } // collectInformationForAmpCodding
331 
332   /**
333    * This function compute size of the ampersand codded sequence when
334    * semicolin is not present.
335    */
336   private int analyseAmpCodding(String content) {
337     int result = -1;
338 
339     try {
340       char ch = content.charAt(1);
341 
342       switch(ch) {
343         case 'l' : // &lt
344         case 'L' : // &lt
345           if(content.charAt(2) == 't' || content.charAt(2) == 'T') {
346             result = 3;
347           } // if
348           break;
349         case 'g' : // &gt
350         case 'G' : // &gt
351           if(content.charAt(2) == 't' || content.charAt(2) == 'T') {
352             result = 3;
353           } // if
354           break;
355         case 'a' : // &amp
356         case 'A' : // &amp
357           if(content.substring(2, 4).equalsIgnoreCase("mp")) {
358             result = 4;
359           } // if
360           break;
361         case 'q' : // &quot
362         case 'Q' : // &quot
363           if(content.substring(2, 5).equalsIgnoreCase("uot")) {
364             result = 5;
365           } // if
366           break;
367         case '#' : // #number (example &#145, &#x4C38)
368           int endIndex = 2;
369           boolean hexCoded = false;
370           if(content.charAt(2) == 'x' || content.charAt(2) == 'X') {
371             // Hex codding
372             ++endIndex;
373             hexCoded = true;
374           } // if
375 
376           while (endIndex < 8
377                   && isNumber(content.charAt(endIndex), hexCoded) ) {
378             ++endIndex;
379           } // while
380           result = endIndex;
381           break;
382       } // switch
383     } catch (StringIndexOutOfBoundsException ex) {
384       // do nothing
385     } // catch
386 
387     return result;
388   } // analyseAmpCodding
389 
390   /** Check for numeric range. If hex is true the A..F range is included */
391   private boolean isNumber(char ch, boolean hex) {
392     if(ch >= '0' && ch <= '9') return true;
393 
394     if(hex) {
395       if(ch >= 'A' && ch <= 'F') return true;
396       if(ch >= 'a' && ch <= 'f') return true;
397     } // if
398 
399     return false;
400   } // isNumber
401 
402   /** HTML parser perform substitution of multiple whitespaces (WS) with
403    *  a single WS. To create correct repositioning information structure we
404    *  should keep the information for such multiple WS.
405    *  <BR>
406    *  The criteria for WS is <code>(ch <= ' ')</code>.
407    */
408   private void collectInformationForWS(String content, RepositioningInfo info) {
409 
410     if(content == null || info == null) return;
411 
412     // analyse the content and correct the repositioning information
413     char ch;
414     int startWS, endWS;
415 
416     startWS = endWS = -1;
417     int contentLength = content.length();
418 
419     for(int i=0; i<contentLength; ++i) {
420       ch = content.charAt(i);
421 
422       // is whitespace
423       if(ch <= ' ') {
424         if(startWS == -1) {
425           startWS = i;
426         } // if
427         endWS = i;
428       }
429       else {
430         if(endWS - startWS > 0) {
431           // put the repositioning information about the WS substitution
432           info.addPositionInfo(
433             (long)startWS, (long)(endWS - startWS + 1), 0, 1);
434         } // if
435         // clear positions
436         startWS = endWS = -1;
437       }// if
438     } // for
439   } // collectInformationForWS
440 
441   /** Clear all the data members of the object. */
442   public void cleanup() {
443 
444     defaultAnnots = null;
445     if ( (namedAnnotSets != null) && (!namedAnnotSets.isEmpty()))
446         namedAnnotSets.clear();
447     if (DEBUG) Out.prln("Document cleanup called");
448     if (this.lrPersistentId != null)
449       Gate.getCreoleRegister().removeCreoleListener(this);
450     if(this.getDataStore() != null)
451       this.getDataStore().removeDatastoreListener(this);
452   } // cleanup()
453 
454 
455   /** Documents are identified by URLs */
456   public URL getSourceUrl() { return sourceUrl; }
457 
458   /** Set method for the document's URL */
459   public void setSourceUrl(URL sourceUrl) {
460     this.sourceUrl = sourceUrl;
461   } // setSourceUrl
462 
463   /** Documents may be packed within files; in this case an optional pair of
464     * offsets refer to the location of the document.
465     */
466   public Long[] getSourceUrlOffsets() {
467     Long[] sourceUrlOffsets = new Long[2];
468     sourceUrlOffsets[0] = sourceUrlStartOffset;
469     sourceUrlOffsets[1] = sourceUrlEndOffset;
470     return sourceUrlOffsets;
471   } // getSourceUrlOffsets
472 
473   /**
474    * Allow/disallow preserving of the original document content.
475    * If is <B>true</B> the original content will be retrieved from
476    * the DocumentContent object and preserved as document feature.
477    */
478   public void setPreserveOriginalContent(Boolean b) {
479     preserveOriginalContent = b;
480   } // setPreserveOriginalContent
481 
482   /** Get the preserving of content status of the Document.
483    *
484    *  @return whether the Document should preserve it's original content.
485    */
486   public Boolean getPreserveOriginalContent() {
487     return preserveOriginalContent;
488   } // getPreserveOriginalContent
489 
490   /**
491    *  Allow/disallow collecting of repositioning information.
492    *  If is <B>true</B> information will be retrieved and preserved
493    *  as document feature.<BR>
494    *  Preserving of repositioning information give the possibilities
495    *  for converting of coordinates between the original document content and
496    *  extracted from the document text.
497    */
498   public void setCollectRepositioningInfo(Boolean b) {
499     collectRepositioningInfo = b;
500   } // setCollectRepositioningInfo
501 
502   /** Get the collectiong and preserving of repositioning information
503    *  for the Document. <BR>
504    *  Preserving of repositioning information give the possibilities
505    *  for converting of coordinates between the original document content and
506    *  extracted from the document text.
507    *
508    *  @return whether the Document should collect and preserve information.
509    */
510   public Boolean getCollectRepositioningInfo() {
511     return collectRepositioningInfo;
512   } // getCollectRepositioningInfo
513 
514   /** Documents may be packed within files; in this case an optional pair of
515     * offsets refer to the location of the document. This method gets the
516     * start offset.
517     */
518   public Long getSourceUrlStartOffset() { return sourceUrlStartOffset; }
519 
520   /** Documents may be packed within files; in this case an optional pair of
521     * offsets refer to the location of the document. This method sets the
522     * start offset.
523     */
524   public void setSourceUrlStartOffset(Long sourceUrlStartOffset) {
525     this.sourceUrlStartOffset = sourceUrlStartOffset;
526   } // setSourceUrlStartOffset
527 
528   /** Documents may be packed within files; in this case an optional pair of
529     * offsets refer to the location of the document. This method gets the
530     * end offset.
531     */
532   public Long getSourceUrlEndOffset() { return sourceUrlEndOffset; }
533 
534   /** Documents may be packed within files; in this case an optional pair of
535     * offsets refer to the location of the document. This method sets the
536     * end offset.
537     */
538   public void setSourceUrlEndOffset(Long sourceUrlEndOffset) {
539     this.sourceUrlEndOffset = sourceUrlEndOffset;
540   } // setSourceUrlStartOffset
541 
542   /** The content of the document: a String for text; MPEG for video; etc. */
543   public DocumentContent getContent() { return content; }
544 
545   /** Set method for the document content */
546   public void setContent(DocumentContent content) { this.content = content; }
547 
548   /** Get the encoding of the document content source */
549   public String getEncoding() {
550     //we need to make sure we ALWAYS have an encoding
551     if(encoding == null || encoding.trim().length() == 0){
552       //no encoding definded: use the platform default
553       encoding = java.nio.charset.Charset.forName(
554           System.getProperty("file.encoding")).name();
555     }
556     return encoding;
557   }
558 
559   /** Set the encoding of the document content source */
560   public void setEncoding(String encoding) { this.encoding = encoding; }
561 
562   /** Get the default set of annotations. The set is created if it
563     * doesn't exist yet.
564     */
565   public AnnotationSet getAnnotations() {
566     if(defaultAnnots == null){
567       defaultAnnots = new AnnotationSetImpl(this);
568       fireAnnotationSetAdded(new DocumentEvent(
569            this, DocumentEvent.ANNOTATION_SET_ADDED, null));
570     }//if
571     return defaultAnnots;
572   } // getAnnotations()
573 
574   /** Get a named set of annotations. Creates a new set if one with this
575     * name doesn't exist yet.
576     * If the provided name is null then it returns the default annotation set.
577     */
578   public AnnotationSet getAnnotations(String name) {
579     if(name == null) return getAnnotations();
580     if(namedAnnotSets == null)
581       namedAnnotSets = new HashMap();
582     AnnotationSet namedSet = (AnnotationSet) namedAnnotSets.get(name);
583 
584     if(namedSet == null) {
585       namedSet = new AnnotationSetImpl(this, name);
586       namedAnnotSets.put(name, namedSet);
587 
588       DocumentEvent evt = new DocumentEvent(
589         this, DocumentEvent.ANNOTATION_SET_ADDED, name
590       );
591       fireAnnotationSetAdded(evt);
592     }
593     return namedSet;
594   } // getAnnotations(name)
595 
596   /** Make the document markup-aware. This will trigger the creation
597    *  of a DocumentFormat object at Document initialisation time; the
598    *  DocumentFormat object will unpack the markup in the Document and
599    *  add it as annotations. Documents are <B>not</B> markup-aware by default.
600    *
601    *  @param b markup awareness status.
602    */
603   public void setMarkupAware(Boolean newMarkupAware) {
604       this.markupAware = newMarkupAware;
605   }
606 
607   /** Get the markup awareness status of the Document.
608    *  <B>Documents are markup-aware by default.</B>
609    *  @return whether the Document is markup aware.
610    */
611   public Boolean getMarkupAware() { return markupAware; }
612 
613   /** Returns an XML document aming to preserve the original markups(
614     * the original markup will be in the same place and format as it was
615     * before processing the document) and include (if possible)
616     * the annotations specified in the aSourceAnnotationSet.
617     * It is equivalent to toXml(aSourceAnnotationSet, true).
618     */
619   public String toXml(Set aSourceAnnotationSet){
620     return toXml(aSourceAnnotationSet, true);
621   }
622 
623   /** Returns an XML document aming to preserve the original markups(
624     * the original markup will be in the same place and format as it was
625     * before processing the document) and include (if possible)
626     * the annotations specified in the aSourceAnnotationSet.
627     * <b>Warning:</b> Annotations from the aSourceAnnotationSet will be lost
628     * if they will cause a crosed over situation.
629     * @param aSourceAnnotationSet is an annotation set containing all the
630     * annotations that will be combined with the original marup set. If the
631     * param is <code>null</code> it will only dump the original markups.
632     * @param includeFeatures is a boolean that controls whether the annotation
633     * features should be included or not. If false, only the annotation type
634     * is included in the tag.
635     * @return a string representing an XML document containing the original
636     * markup + dumped annotations form the aSourceAnnotationSet
637     */
638   public String toXml(Set aSourceAnnotationSet, boolean includeFeatures){
639 
640     if(hasOriginalContentFeatures()) {
641       return saveAnnotationSetAsXmlInOrig(aSourceAnnotationSet,includeFeatures);
642     } // if
643 
644     AnnotationSet originalMarkupsAnnotSet =
645             this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
646 
647     // Create a dumping annotation set on the document. It will be used for
648     // dumping annotations...
649     AnnotationSet dumpingSet = new AnnotationSetImpl((Document) this);
650 
651     // This set will be constructed inside this method. If is not empty, the
652     // annotation contained will be lost.
653     if (!dumpingSet.isEmpty()){
654       Out.prln("WARNING: The dumping annotation set was not empty."+
655       "All annotation it contained were lost.");
656       dumpingSet.clear();
657     }// End if
658 
659     StatusListener sListener = (StatusListener)
660                                gate.gui.MainFrame.getListeners().
661                                get("gate.event.StatusListener");
662     // Construct the dumping set in that way that all annotations will verify
663     // the condition that there are not annotations which are crossed.
664     // First add all annotation from the original markups
665     if(sListener != null)
666       sListener.statusChanged("Constructing the dumping annotation set.");
667     dumpingSet.addAll(originalMarkupsAnnotSet);
668     // Then take all the annotations from aSourceAnnotationSet and verify if
669     // they can be inserted safely into the dumpingSet. Where not possible,
670     // report.
671     if (aSourceAnnotationSet != null){
672       Iterator iter = aSourceAnnotationSet.iterator();
673       while (iter.hasNext()){
674         Annotation currentAnnot = (Annotation) iter.next();
675         if(insertsSafety(dumpingSet,currentAnnot)){
676           dumpingSet.add(currentAnnot);
677         }else if (crossedOverAnnotation != null){
678           try {
679             Out.prln("Warning: Annotations were found to violate the " +
680             "crossed over condition: \n" +
681             "1. [" +
682             getContent().getContent(
683                            crossedOverAnnotation.getStartNode().getOffset(),
684                            crossedOverAnnotation.getEndNode().getOffset()) +
685             " (" + crossedOverAnnotation.getType() + ": " +
686             crossedOverAnnotation.getStartNode().getOffset() +
687             ";" + crossedOverAnnotation.getEndNode().getOffset() +
688             ")]\n" +
689             "2. [" +
690             getContent().getContent(
691                            currentAnnot.getStartNode().getOffset(),
692                            currentAnnot.getEndNode().getOffset()) +
693             " (" + currentAnnot.getType() + ": " +
694             currentAnnot.getStartNode().getOffset() +
695             ";" + currentAnnot.getEndNode().getOffset() +
696             ")]\nThe second one will be discarded.\n"  );
697           } catch (gate.util.InvalidOffsetException ex) {
698             throw new GateRuntimeException(ex.getMessage());
699           }
700         }// End if
701       }// End while
702     }// End if
703 
704     // The dumpingSet is ready to be exported as XML
705     // Here we go.
706     if(sListener != null) sListener.statusChanged("Dumping annotations as XML");
707     StringBuffer xmlDoc = new StringBuffer(
708           DOC_SIZE_MULTIPLICATION_FACTOR*(this.getContent().size().intValue()));
709 
710     // Add xml header if original format was xml
711     String mimeType = getFeatures() == null ?
712                       null :
713                       (String)getFeatures().get("MimeType");
714     boolean wasXML = mimeType != null && mimeType.equalsIgnoreCase("text/xml");
715 
716     if(wasXML){
717       xmlDoc.append("<?xml version=\"1.0\" encoding=\"");
718       xmlDoc.append(getEncoding());
719       xmlDoc.append("\" ?>");
720       xmlDoc.append(Strings.getNl());
721     }// ENd if
722     // Identify and extract the root annotation from the dumpingSet.
723     theRootAnnotation = identifyTheRootAnnotation(dumpingSet);
724     // If a root annotation has been identified then add it eplicitley at the
725     // beginning of the document
726     if (theRootAnnotation != null){
727       dumpingSet.remove(theRootAnnotation);
728       xmlDoc.append(writeStartTag(theRootAnnotation,includeFeatures));
729     }// End if
730     // Construct and append the rest of the document
731     xmlDoc.append(saveAnnotationSetAsXml(dumpingSet, includeFeatures));
732     // If a root annotation has been identified then add it eplicitley at the
733     // end of the document
734     if (theRootAnnotation != null){
735       xmlDoc.append(writeEndTag(theRootAnnotation));
736     }// End if
737 
738     if(sListener != null) sListener.statusChanged("Done.");
739     return xmlDoc.toString();
740   }//End toXml()
741 
742   /** This method verifies if aSourceAnnotation can ve inserted safety into the
743     * aTargetAnnotSet. Safety means that it doesn't violate the crossed over
744     * contition with any annotation from the aTargetAnnotSet.
745     * @param aTargetAnnotSet the annotation set to include the aSourceAnnotation
746     * @param aSourceAnnotation the annotation to be inserted into the
747     * aTargetAnnotSet
748     * @return true if the annotation inserts safety, or false otherwise.
749     */
750   private boolean insertsSafety(AnnotationSet aTargetAnnotSet,
751                                                 Annotation aSourceAnnotation){
752 
753     if (aTargetAnnotSet == null || aSourceAnnotation == null) {
754       this.crossedOverAnnotation = null;
755       return false;
756     }
757     if (aSourceAnnotation.getStartNode() == null ||
758         aSourceAnnotation.getStartNode().getOffset()== null) {
759       this.crossedOverAnnotation = null;
760       return false;
761     }
762     if (aSourceAnnotation.getEndNode() == null ||
763         aSourceAnnotation.getEndNode().getOffset()== null) {
764       this.crossedOverAnnotation = null;
765       return false;
766     }
767 
768     // Get the start and end offsets
769     Long start = aSourceAnnotation.getStartNode().getOffset();
770     Long end =   aSourceAnnotation.getEndNode().getOffset();
771     // Read aSourceAnnotation offsets long
772     long s2 = start.longValue();
773     long e2 = end.longValue();
774 
775     // Obtain a set with all annotations annotations that overlap
776     // totaly or partially with the interval defined by the two provided offsets
777     AnnotationSet as = aTargetAnnotSet.get(start,end);
778 
779     // Investigate all the annotations from as to see if there is one that
780     // comes in conflict with aSourceAnnotation
781     Iterator it = as.iterator();
782     while(it.hasNext()){
783       Annotation ann = (Annotation) it.next();
784       // Read ann offsets
785       long s1 = ann.getStartNode().getOffset().longValue();
786       long e1 = ann.getEndNode().getOffset().longValue();
787 
788       if (s1<s2 && s2<e1 && e1<e2) {
789         this.crossedOverAnnotation = ann;
790         return false;
791       }
792       if (s2<s1 && s1<e2 && e2<e1) {
793         this.crossedOverAnnotation = ann;
794         return false;
795       }
796     }// End while
797     return true;
798   }// insertsSafety()
799 
  /** This method saves all the annotations from aDumpAnnotSet and combines
    * them with the document content.
    * @param aDumpAnnotSet is a GATE annotation set prepared to be used
    * on the raw text from document content. If aDumpAnnotSet is <b>null</b>
    * then the document content (after non-XML characters have been filtered)
    * is returned with no annotation tags added.
    * @param includeFeatures is a boolean, which controls whether the annotation
    * features and gate ID are included or not.
    * @return The XML document obtained from raw text + the information from
    * the dump annotation set.
    */
810   private String saveAnnotationSetAsXml(AnnotationSet aDumpAnnotSet,
811                                         boolean includeFeatures){
812     String content = null;
813     if (this.getContent()== null)
814       content = new String("");
815     else
816       content = this.getContent().toString();
817     StringBuffer docContStrBuff = filterNonXmlChars(new StringBuffer(content));
818     if (aDumpAnnotSet == null)   return docContStrBuff.toString();
819 
820     TreeMap offsets2CharsMap = new TreeMap();
821     if (this.getContent().size().longValue() != 0){
822       // Fill the offsets2CharsMap with all the indices where
823       // special chars appear
824       buildEntityMapFromString(content,offsets2CharsMap);
825     }//End if
826     // The saving alghorithm is as follows:
827     ///////////////////////////////////////////
828     // Construct a set of annot with all IDs in asc order.
829     // All annotations that end at that offset swap their place in descending
830     // order. For each node write all the tags from left to right.
831 
832     // Construct the node set
833     TreeSet offsets = new TreeSet();
834     Iterator iter = aDumpAnnotSet.iterator();
835     while (iter.hasNext()){
836       Annotation annot = (Annotation) iter.next();
837       offsets.add(annot.getStartNode().getOffset());
838       offsets.add(annot.getEndNode().getOffset());
839     }// End while
840 
841     // ofsets is sorted in ascending order.
842     // Iterate this set in descending order and remove an offset at each
843     // iteration
844     while (!offsets.isEmpty()){
845       Long offset = (Long)offsets.last();
846       // Remove the offset from the set
847       offsets.remove(offset);
848       // Now, use it.
849       // Returns a list with annotations that needs to be serialized in that
850       // offset.
851       List annotations = getAnnotationsForOffset(aDumpAnnotSet,offset);
852       // Attention: the annotation are serialized from left to right
853       StringBuffer tmpBuff = new StringBuffer("");
854       Stack stack = new Stack();
855       // Iterate through all these annotations and serialize them
856       Iterator it = annotations.iterator();
857       while(it.hasNext()){
858         Annotation a = (Annotation) it.next();
859         it.remove();
860         // Test if a Ends at offset
861         if ( offset.equals(a.getEndNode().getOffset()) ){
862           // Test if a Starts at offset
863           if ( offset.equals(a.getStartNode().getOffset()) ){
864             // Here, the annotation a Starts and Ends at the offset
865             if ( null != a.getFeatures().get("isEmptyAndSpan") &&
866                  "true".equals((String)a.getFeatures().get("isEmptyAndSpan"))){
867 
868               // Assert: annotation a with start == end and isEmptyAndSpan
869               tmpBuff.append(writeStartTag(a, includeFeatures));
870               stack.push(a);
871             }else{
872               // Assert annotation a with start == end and an empty tag
873               tmpBuff.append(writeEmptyTag(a));
874               // The annotation is removed from dumped set
875               aDumpAnnotSet.remove(a);
876             }// End if
877           }else{
878             // Here the annotation a Ends at the offset.
879             // In this case empty the stack and write the end tag
880             if (!stack.isEmpty()){
881               while(!stack.isEmpty()){
882                 Annotation a1 = (Annotation)stack.pop();
883                 tmpBuff.append(writeEndTag(a1));
884               }// End while
885             }// End if
886             tmpBuff.append(writeEndTag(a));
887           }// End if
888         }else{
889           // The annotation a does NOT end at the offset. Let's see if it starts
890           // at the offset
891           if ( offset.equals(a.getStartNode().getOffset()) ){
892             // The annotation a starts at the offset.
893             // In this case empty the stack and write the end tag
894             if (!stack.isEmpty()){
895               while(!stack.isEmpty()){
896                 Annotation a1 = (Annotation)stack.pop();
897                 tmpBuff.append(writeEndTag(a1));
898               }// End while
899             }// End if
900             tmpBuff.append(writeStartTag(a, includeFeatures));
901             // The annotation is removed from dumped set
902             aDumpAnnotSet.remove(a);
903           }// End if ( offset.equals(a.getStartNode().getOffset()) )
904         }// End if ( offset.equals(a.getEndNode().getOffset()) )
905       }// End while(it.hasNext()){
906 
907       // In this case empty the stack and write the end tag
908       if (!stack.isEmpty()){
909         while(!stack.isEmpty()){
910           Annotation a1 = (Annotation)stack.pop();
911           tmpBuff.append(writeEndTag(a1));
912         }// End while
913       }// End if
914 
915       // Before inserting tmpBuff into docContStrBuff we need to check
916       // if there are chars to be replaced and if there are, they would be
917       // replaced.
918       if (!offsets2CharsMap.isEmpty()){
919         Integer offsChar = (Integer) offsets2CharsMap.lastKey();
920         while( !offsets2CharsMap.isEmpty() &&
921                        offsChar.intValue() >= offset.intValue()){
922           // Replace the char at offsChar with its corresponding entity form
923           // the entitiesMap.
924           docContStrBuff.replace(offsChar.intValue(),offsChar.intValue()+1,
925           (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar)));
926           // Discard the offsChar after it was used.
927           offsets2CharsMap.remove(offsChar);
928           // Investigate next offsChar
929           if (!offsets2CharsMap.isEmpty())
930             offsChar = (Integer) offsets2CharsMap.lastKey();
931         }// End while
932       }// End if
933       // Insert tmpBuff to the location where it belongs in docContStrBuff
934       docContStrBuff.insert(offset.intValue(),tmpBuff.toString());
935     }// End while(!offsets.isEmpty())
936     // Need to replace the entities in the remaining text, if there is any text
937     // So, if there are any more items in offsets2CharsMap they need to be
938     // replaced
939     while (!offsets2CharsMap.isEmpty()){
940       Integer offsChar = (Integer) offsets2CharsMap.lastKey();
941       // Replace the char with its entity
942       docContStrBuff.replace(offsChar.intValue(),offsChar.intValue()+1,
943       (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar)));
944       // remove the offset from the map
945       offsets2CharsMap.remove(offsChar);
946     }// End while
947     return docContStrBuff.toString();
948   }// saveAnnotationSetAsXml()
949 
950   /**
951    *  Return true only if the document has features for original content and
952    *  repositioning information.
953    */
954   private boolean hasOriginalContentFeatures() {
955     FeatureMap features = getFeatures();
956     boolean result = false;
957 
958     result =
959     (features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME) != null)
960       &&
961     (features.get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME)
962       != null);
963 
964     return result;
965   } // hasOriginalContentFeatures
966 
  /** This method saves all the annotations from aSourceAnnotationSet and
    * combines them with the original document content, if preserved as
    * feature.
    * @param aSourceAnnotationSet is a set of GATE annotations to be written
    * into the original document content. If aSourceAnnotationSet is
    * <b>null</b> then the original content is returned unchanged.
    * @param includeFeatures is a boolean, which controls whether the annotation
    * features and gate ID are included or not.
    * @return The XML document obtained from the original content + the
    * information from the source annotation set.
    */
977   private String saveAnnotationSetAsXmlInOrig(Set aSourceAnnotationSet,
978                                         boolean includeFeatures){
979     StringBuffer docContStrBuff;
980 
981     String origContent;
982 
983     origContent =
984      (String)features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME);
985     if(origContent == null) {
986       origContent = "";
987     } // if
988 
989     long originalContentSize = origContent.length();
990 
991     RepositioningInfo repositioning = (RepositioningInfo)
992       getFeatures().get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME);
993 
994     docContStrBuff = new StringBuffer(origContent);
995     if (aSourceAnnotationSet == null) return docContStrBuff.toString();
996 
997     StatusListener sListener = (StatusListener)
998                                gate.gui.MainFrame.getListeners().
999                                get("gate.event.StatusListener");
1000
1001    AnnotationSet originalMarkupsAnnotSet =
1002            this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
1003    // Create a dumping annotation set on the document. It will be used for
1004    // dumping annotations...
1005    AnnotationSet dumpingSet = new AnnotationSetImpl((Document) this);
1006    if(sListener != null)
1007      sListener.statusChanged("Constructing the dumping annotation set.");
1008    // Then take all the annotations from aSourceAnnotationSet and verify if
1009    // they can be inserted safely into the dumpingSet. Where not possible,
1010    // report.
1011    if (aSourceAnnotationSet != null){
1012      Iterator iter = aSourceAnnotationSet.iterator();
1013      Annotation currentAnnot;
1014      while (iter.hasNext()){
1015        currentAnnot = (Annotation) iter.next();
1016        if(insertsSafety(originalMarkupsAnnotSet, currentAnnot)
1017            && insertsSafety(dumpingSet, currentAnnot)){
1018          dumpingSet.add(currentAnnot);
1019        }else{
1020          Out.prln("Warning: Annotation with ID=" + currentAnnot.getId() +
1021          ", startOffset=" + currentAnnot.getStartNode().getOffset() +
1022          ", endOffset=" + currentAnnot.getEndNode().getOffset() +
1023          ", type=" + currentAnnot.getType()+ " was found to violate the" +
1024          " crossed over condition. It will be discarded");
1025        }// End if
1026      }// End while
1027    }// End if
1028
1029    // The dumpingSet is ready to be exported as XML
1030    // Here we go.
1031    if(sListener != null) sListener.statusChanged("Dumping annotations as XML");
1032
1033    ///////////////////////////////////////////
1034    // Construct a set of annot with all IDs in asc order.
1035    // All annotations that end at that offset swap their place in descending
1036    // order. For each node write all the tags from left to right.
1037
1038    // Construct the node set
1039    TreeSet offsets = new TreeSet();
1040    Iterator iter = aSourceAnnotationSet.iterator();
1041    while (iter.hasNext()){
1042      Annotation annot = (Annotation) iter.next();
1043      offsets.add(annot.getStartNode().getOffset());
1044      offsets.add(annot.getEndNode().getOffset());
1045    }// End while
1046
1047    // ofsets is sorted in ascending order.
1048    // Iterate this set in descending order and remove an offset at each
1049    // iteration
1050    while (!offsets.isEmpty()){
1051      Long offset = (Long)offsets.last();
1052      // Remove the offset from the set
1053      offsets.remove(offset);
1054      // Now, use it.
1055      // Returns a list with annotations that needs to be serialized in that
1056      // offset.
1057      List annotations = getAnnotationsForOffset(aSourceAnnotationSet,offset);
1058      // Attention: the annotation are serialized from left to right
1059      StringBuffer tmpBuff = new StringBuffer("");
1060      Stack stack = new Stack();
1061      // Iterate through all these annotations and serialize them
1062      Iterator it = annotations.iterator();
1063      Annotation a = null;
1064      while(it.hasNext()) {
1065        a = (Annotation) it.next();
1066        it.remove();
1067        // Test if a Ends at offset
1068        if ( offset.equals(a.getEndNode().getOffset()) ){
1069          // Test if a Starts at offset
1070          if ( offset.equals(a.getStartNode().getOffset()) ){
1071            // Here, the annotation a Starts and Ends at the offset
1072            if ( null != a.getFeatures().get("isEmptyAndSpan") &&
1073                 "true".equals((String)a.getFeatures().get("isEmptyAndSpan"))){
1074
1075              // Assert: annotation a with start == end and isEmptyAndSpan
1076              tmpBuff.append(writeStartTag(a, includeFeatures, false));
1077              stack.push(a);
1078            }else{
1079              // Assert annotation a with start == end and an empty tag
1080              tmpBuff.append(writeEmptyTag(a, false));
1081              // The annotation is removed from dumped set
1082              aSourceAnnotationSet.remove(a);
1083            }// End if
1084          }else{
1085            // Here the annotation a Ends at the offset.
1086            // In this case empty the stack and write the end tag
1087            while(!stack.isEmpty()){
1088              Annotation a1 = (Annotation)stack.pop();
1089              tmpBuff.append(writeEndTag(a1));
1090            }// End while
1091            tmpBuff.append(writeEndTag(a));
1092          }// End if
1093        }else{
1094          // The annotation a does NOT end at the offset. Let's see if it starts
1095          // at the offset
1096          if ( offset.equals(a.getStartNode().getOffset()) ){
1097            // The annotation a starts at the offset.
1098            // In this case empty the stack and write the end tag
1099            while(!stack.isEmpty()){
1100              Annotation a1 = (Annotation)stack.pop();
1101              tmpBuff.append(writeEndTag(a1));
1102            }// End while
1103
1104            tmpBuff.append(writeStartTag(a, includeFeatures, false));
1105            // The annotation is removed from dumped set
1106            aSourceAnnotationSet.remove(a);
1107          }// End if ( offset.equals(a.getStartNode().getOffset()) )
1108        }// End if ( offset.equals(a.getEndNode().getOffset()) )
1109      }// End while(it.hasNext()){
1110
1111      // In this case empty the stack and write the end tag
1112      while(!stack.isEmpty()){
1113        Annotation a1 = (Annotation)stack.pop();
1114        tmpBuff.append(writeEndTag(a1));
1115      }// End while
1116
1117      long originalPosition = -1;
1118      boolean backPositioning =
1119        a != null && offset.equals(a.getEndNode().getOffset());
1120      if ( backPositioning ) {
1121        // end of the annotation correction
1122        originalPosition =
1123          repositioning.getOriginalPos(offset.intValue(), true);
1124      } // if
1125
1126      if(originalPosition == -1) {
1127        originalPosition = repositioning.getOriginalPos(offset.intValue());
1128      } // if
1129
1130      // Insert tmpBuff to the location where it belongs in docContStrBuff
1131      if(originalPosition != -1 && originalPosition <= originalContentSize ) {
1132        docContStrBuff.insert((int) originalPosition, tmpBuff.toString());
1133      }
1134      else {
1135        Out.prln("Error in the repositioning. The offset ("+offset.intValue()
1136        +") could not be positioned in the original document. \n"
1137        +"Calculated position is: "+originalPosition
1138        +" placed back: "+backPositioning);
1139      } // if
1140
1141    }// End while(!offsets.isEmpty())
1142    if (theRootAnnotation != null)
1143      docContStrBuff.append(writeEndTag(theRootAnnotation));
1144    return docContStrBuff.toString();
1145  } // saveAnnotationSetAsXmlInOrig()
1146
  /** This method returns a list with annotations ordered in such a way that
    * they can be serialized from left to right, at the offset. If one of the
    * params is null then an empty list will be returned.
    * @param aDumpAnnotSet is a set containing all annotations that will be
    * dumped.
    * @param offset represents the offset at which the annotation must start
    * AND/OR end.
    * @return a list with those annotations that need to be serialized.
    */
1156  private List getAnnotationsForOffset(Set aDumpAnnotSet, Long offset){
1157    List annotationList = new LinkedList();
1158    if (aDumpAnnotSet == null || offset == null) return annotationList;
1159    Set annotThatStartAtOffset = new TreeSet(
1160                          new AnnotationComparator(ORDER_ON_END_OFFSET,DESC));
1161    Set annotThatEndAtOffset = new TreeSet(
1162                          new AnnotationComparator(ORDER_ON_START_OFFSET,DESC));
1163    Set annotThatStartAndEndAtOffset = new TreeSet(
1164                          new AnnotationComparator(ORDER_ON_ANNOT_ID,ASC));
1165
1166    // Fill these tree lists with annotation tat start, end or start and
1167    // end at the offset.
1168    Iterator iter = aDumpAnnotSet.iterator();
1169    while(iter.hasNext()){
1170      Annotation ann = (Annotation) iter.next();
1171      if (offset.equals(ann.getStartNode().getOffset())){
1172        if (offset.equals(ann.getEndNode().getOffset()))
1173          annotThatStartAndEndAtOffset.add(ann);
1174        else
1175          annotThatStartAtOffset.add(ann);
1176      }else{
1177        if (offset.equals(ann.getEndNode().getOffset()))
1178          annotThatEndAtOffset.add(ann);
1179      }// End if
1180    }// End while
1181    annotationList.addAll(annotThatEndAtOffset);
1182    annotThatEndAtOffset = null;
1183    annotationList.addAll(annotThatStartAtOffset);
1184    annotThatStartAtOffset = null;
1185    iter = annotThatStartAndEndAtOffset.iterator();
1186    while(iter.hasNext()){
1187      Annotation ann = (Annotation) iter.next();
1188      Iterator it = annotationList.iterator();
1189      boolean breaked = false;
1190      while (it.hasNext()){
1191        Annotation annFromList = (Annotation) it.next();
1192        if (annFromList.getId().intValue() > ann.getId().intValue()){
1193          annotationList.add(annotationList.indexOf(annFromList),ann);
1194          breaked = true;
1195          break;
1196        }// End if
1197      }// End while
1198      if (!breaked)
1199        annotationList.add(ann);
1200      iter.remove();
1201    }// End while
1202    return annotationList;
1203  }// getAnnotationsForOffset()
1204
1205  private String writeStartTag(Annotation annot, boolean includeFeatures){
1206    return writeStartTag(annot, includeFeatures, true);
1207  } // writeStartTag
1208
1209  /** Returns a string representing a start tag based on the input annot*/
1210  private String writeStartTag(Annotation annot, boolean includeFeatures,
1211                                boolean includeNamespace){
1212    AnnotationSet originalMarkupsAnnotSet =
1213            this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
1214
1215    StringBuffer strBuff = new StringBuffer("");
1216    if (annot == null) return strBuff.toString();
1217//    if (!addGatePreserveFormatTag && isRootTag){
1218      if (theRootAnnotation != null && annot.getId().equals(theRootAnnotation.getId())){
1219      //the features are included either if desired or if that's an annotation
1220      //from the original markup of the document. We don't want for example to
1221      //spoil all links in an HTML file!
1222      if (includeFeatures) {
1223        strBuff.append("<");
1224        strBuff.append(annot.getType());
1225        strBuff.append(" ");
1226        if(includeNamespace) {
1227          strBuff.append(" xmlns:gate=\"http://www.gate.ac.uk\"");
1228          strBuff.append(" gate:");
1229        }
1230        strBuff.append("gateId=\"");
1231        strBuff.append(annot.getId());
1232        strBuff.append("\"");
1233        strBuff.append(" ");
1234        if(includeNamespace) {
1235          strBuff.append("gate:");
1236        }
1237        strBuff.append("annotMaxId=\"");
1238        strBuff.append(nextAnnotationId);
1239        strBuff.append("\"");
1240        strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace));
1241        strBuff.append(">");
1242      }
1243      else if (originalMarkupsAnnotSet.contains(annot)) {
1244          strBuff.append("<");
1245          strBuff.append(annot.getType());
1246          strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace));
1247          strBuff.append(">");
1248        }
1249      else {
1250        strBuff.append("<");
1251        strBuff.append(annot.getType());
1252        strBuff.append(">");
1253      }
1254
1255    }else{
1256      //the features are included either if desired or if that's an annotation
1257      //from the original markup of the document. We don't want for example to
1258      //spoil all links in an HTML file!
1259      if (includeFeatures) {
1260        strBuff.append("<");
1261        strBuff.append(annot.getType());
1262        strBuff.append(" ");
1263        if(includeNamespace) {
1264          strBuff.append("gate:");
1265        } // if includeNamespaces
1266        strBuff.append("gateId=\"");
1267        strBuff.append(annot.getId());
1268        strBuff.append("\"");
1269        strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace));
1270        strBuff.append(">");
1271      }
1272      else if (originalMarkupsAnnotSet.contains(annot)) {
1273        strBuff.append("<");
1274        strBuff.append(annot.getType());
1275        strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace));
1276        strBuff.append(">");
1277      }
1278      else {
1279        strBuff.append("<");
1280        strBuff.append(annot.getType());
1281        strBuff.append(">");
1282      }
1283    }// End if
1284    return strBuff.toString();
1285  }// writeStartTag()
1286
  /**
   * Identifies the root annotation inside an annotation set.
   * The root annotation is the one that starts at offset 0, and has the
   * greatest span. If there is more than one such annotation, then the
   * annotation with the smallest ID will be selected as root.
   * If none is identified it will return null.
   * @param anAnnotationSet The annotation set possibly containing
   *  the root annotation.
   * @return The root annotation or null if it fails
   */
1297  private Annotation identifyTheRootAnnotation(AnnotationSet anAnnotationSet){
1298    if (anAnnotationSet == null) return null;
1299    // If the starting node of this annotation is not null, then the annotation
1300    // set will not have a root annotation.
1301    Node startNode = anAnnotationSet.firstNode();
1302    Node endNode = anAnnotationSet.lastNode();
1303    // This is placed here just to speed things up. The alghorithm bellow can
1304    // can identity the annotation that span over the entire set and with the
1305    // smallest ID. However the root annotation will have to have the start
1306    // offset equal to 0.
1307    if (startNode.getOffset().longValue() != 0) return null;
1308    // Go anf find the annotation.
1309    Annotation theRootAnnotation = null;
1310    // Check if there are annotations starting at offset 0. If there are, then
1311    // check all of them to see which one has the greatest span. Basically its
1312    // END offset should be the bigest offset from the input annotation set.
1313    long start = startNode.getOffset().longValue();
1314    long end = endNode.getOffset().longValue();
1315    for(Iterator it = anAnnotationSet.iterator(); it.hasNext();){
1316      Annotation currentAnnot = (Annotation) it.next();
1317      // If the currentAnnot has both its Start and End equals to the Start and
1318      // end of the AnnotationSet then check to see if its ID is the smallest.
1319      if (
1320          (start == currentAnnot.getStartNode().getOffset().longValue()) &&
1321          (end   == currentAnnot.getEndNode().getOffset().longValue())
1322         ){
1323          // The currentAnnotation has is a potencial root one.
1324          if (theRootAnnotation == null)
1325            theRootAnnotation = currentAnnot;
1326          else{
1327            // If its ID is greater that the currentAnnot then update the root
1328            if ( theRootAnnotation.getId().intValue() > currentAnnot.getId().intValue())
1329              theRootAnnotation = currentAnnot;
1330          }// End if
1331      }// End if
1332    }// End for
1333    return theRootAnnotation;
1334  }// End identifyTheRootAnnotation()
1335
1336  /** This method takes aScanString and searches for those chars from
1337    * entitiesMap that appear in the string. A tree map(offset2Char) is filled
1338    * using as key the offsets where those Chars appear and the Char.
1339    * If one of the params is null the method simply returns.
1340    */
1341  private void buildEntityMapFromString(String aScanString, TreeMap aMapToFill){
1342    if (aScanString == null || aMapToFill == null) return;
1343    if (entitiesMap == null || entitiesMap.isEmpty()){
1344      Err.prln("WARNING: Entities map was not initialised !");
1345      return;
1346    }// End if
1347    // Fill the Map with the offsets of the special chars
1348    Iterator entitiesMapIterator = entitiesMap.keySet().iterator();
1349    while(entitiesMapIterator.hasNext()){
1350      Character c = (Character) entitiesMapIterator.next();
1351      int fromIndex = 0;
1352      while (-1 != fromIndex){
1353        fromIndex = aScanString.indexOf(c.charValue(),fromIndex);
1354        if (-1 != fromIndex){
1355          aMapToFill.put(new Integer(fromIndex),c);
1356          fromIndex ++;
1357        }// End if
1358      }// End while
1359    }// End while
1360  }//buildEntityMapFromString();
1361
1362  private String writeEmptyTag(Annotation annot){
1363    return writeEmptyTag(annot, true);
1364  } // writeEmptyTag
1365
1366  /** Returns a string representing an empty tag based on the input annot*/
1367  private String writeEmptyTag(Annotation annot, boolean includeNamespace){
1368    StringBuffer strBuff = new StringBuffer("");
1369    if (annot == null) return strBuff.toString();
1370
1371    strBuff.append("<");
1372    strBuff.append(annot.getType());
1373
1374    AnnotationSet originalMarkupsAnnotSet =
1375            this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
1376    if (! originalMarkupsAnnotSet.contains(annot)) {
1377      strBuff.append(" gateId=\"");
1378      strBuff.append(annot.getId());
1379      strBuff.append("\"");
1380    }
1381    strBuff.append(writeFeatures(annot.getFeatures(),includeNamespace));
1382    strBuff.append("/>");
1383
1384    return strBuff.toString();
1385  }// writeEmptyTag()
1386
1387  /** Returns a string representing an end tag based on the input annot*/
1388  private String writeEndTag(Annotation annot){
1389    StringBuffer strBuff = new StringBuffer("");
1390    if (annot == null) return strBuff.toString();
1391/*
1392    if (annot.getType().indexOf(" ") != -1)
1393      Out.prln("Warning: Truncating end tag to first word for annot type \""
1394      +annot.getType()+ "\". ");
1395*/
1396    strBuff.append("</"+annot.getType()+">");
1397
1398    return strBuff.toString();
1399  }// writeEndTag()
1400
1401  /** Returns a string representing a FeatureMap serialized as XML attributes*/
1402  private String writeFeatures(FeatureMap feat, boolean includeNamespace){
1403    StringBuffer strBuff = new StringBuffer("");
1404    if (feat == null) return strBuff.toString();
1405    Iterator it = feat.keySet().iterator();
1406    while (it.hasNext()){
1407      Object key = it.next();
1408      Object value = feat.get(key);
1409      if ( (key != null) && (value != null) ){
1410        // Eliminate a feature inserted at reading time and which help to
1411        // take some decissions at saving time
1412        if ("isEmptyAndSpan".equals(key.toString()))
1413          continue;
1414        if( !(String.class.isAssignableFrom(key.getClass()) ||
1415              Number.class.isAssignableFrom(key.getClass()))){
1416
1417            Out.prln("Warning:Found a feature NAME("+key+") that doesn't came"+
1418                             " from String or Number.(feature discarded)");
1419            continue;
1420        }// End if
1421        if ( !(String.class.isAssignableFrom(value.getClass()) ||
1422               Number.class.isAssignableFrom(value.getClass()) ||
1423               java.util.Collection.class.isAssignableFrom(value.getClass()))){
1424
1425            Out.prln("Warning:Found a feature VALUE("+value+") that doesn't came"+
1426                       " from String, Number or Collection.(feature discarded)");
1427            continue;
1428        }// End if
1429        if ("matches".equals(key)) {
1430          strBuff.append(" ");
1431          if(includeNamespace) {
1432            strBuff.append("gate:");
1433          }
1434//          strBuff.append(key);
1435          // replace non XML chars in attribute name
1436          strBuff.append(
1437            filterNonXmlChars(replaceCharsWithEntities(key.toString())));
1438          strBuff.append("=\"");
1439        }
1440        else {
1441          strBuff.append(" ");
1442//          strBuff.append(key);
1443          // replace non XML chars in attribute name
1444          strBuff.append(
1445            filterNonXmlChars(replaceCharsWithEntities(key.toString())));
1446          strBuff.append("=\"");
1447        }
1448        if (java.util.Collection.class.isAssignableFrom(value.getClass())){
1449          Iterator valueIter = ((Collection)value).iterator();
1450          while(valueIter.hasNext()){
1451            Object item = valueIter.next();
1452            if (!(String.class.isAssignableFrom(item.getClass()) ||
1453                  Number.class.isAssignableFrom(item.getClass())))
1454                  continue;
1455//            strBuff.append(item);
1456            // replace non XML chars in collection item
1457            strBuff.append(
1458              filterNonXmlChars(replaceCharsWithEntities(item.toString())));
1459            strBuff.append(";");
1460          }// End while
1461          if (strBuff.charAt(strBuff.length()-1) == ';')
1462            strBuff.deleteCharAt(strBuff.length()-1);
1463        }else{
1464//          strBuff.append(value);
1465          // replace non XML chars in attribute value
1466          strBuff.append(
1467            filterNonXmlChars(replaceCharsWithEntities(value.toString())));
1468        }// End if
1469        strBuff.append("\"");
1470      }// End if
1471    }// End while
1472    return strBuff.toString();
1473  }// writeFeatures()
1474
  /** Returns a GateXml document, that is a custom XML format for which there
    * is a reader inside GATE called gate.xml.GateFormatXmlHandler.
    * What it does is to serialize a GATE document in an XML format.
    * @return a string representing a Gate Xml document.
    */
1480  public String toXml(){
1481    // Initialize the xmlContent with 3 time the size of the current document.
1482    // This is because of the tags size. This measure is made to increase the
1483    // performance of StringBuffer.
1484    StringBuffer xmlContent = new StringBuffer(
1485         DOC_SIZE_MULTIPLICATION_FACTOR*(getContent().size().intValue()));
1486    // Add xml header
1487    xmlContent.append("<?xml version=\"1.0\" encoding=\"");
1488    xmlContent.append(getEncoding());
1489    xmlContent.append("\" ?>");
1490    xmlContent.append(Strings.getNl());
1491
1492    // Add the root element
1493    xmlContent.append("<GateDocument>\n");
1494    xmlContent.append("<!-- The document's features-->\n\n");
1495    xmlContent.append("<GateDocumentFeatures>\n");
1496
1497    xmlContent.append(featuresToXml(this.getFeatures()));
1498    xmlContent.append("</GateDocumentFeatures>\n");
1499    xmlContent.append("<!-- The document content area with serialized"+
1500                      " nodes -->\n\n");
1501    // Add plain text element
1502    xmlContent.append("<TextWithNodes>");
1503    xmlContent.append(textWithNodes(this.getContent().toString()));
1504    xmlContent.append("</TextWithNodes>\n");
1505    // Serialize as XML all document's annotation sets
1506    // Serialize the default AnnotationSet
1507    StatusListener sListener = (StatusListener)
1508                               gate.gui.MainFrame.getListeners().
1509                               get("gate.event.StatusListener");
1510    if(sListener != null)
1511      sListener.statusChanged("Saving the default annotation set ");
1512    xmlContent.append("<!-- The default annotation set -->\n\n");
1513    xmlContent.append(annotationSetToXml(this.getAnnotations()));
1514    // Serialize all others AnnotationSets
1515    // namedAnnotSets is a Map containing all other named Annotation Sets.
1516    if (namedAnnotSets != null){
1517      Iterator iter = namedAnnotSets.values().iterator();
1518      while(iter.hasNext()){
1519        AnnotationSet annotSet = (AnnotationSet) iter.next();
1520        xmlContent.append("<!-- Named annotation set -->\n\n");
1521        // Serialize it as XML
1522        if(sListener != null) sListener.statusChanged("Saving " +
1523                                                      annotSet.getName()+
1524                                                      " annotation set ");
1525        xmlContent.append(annotationSetToXml(annotSet));
1526      }// End while
1527    }// End if
1528    // Add the end of GateDocument
1529    xmlContent.append("</GateDocument>");
1530    if(sListener != null) sListener.statusChanged("Done !");
1531    // return the XmlGateDocument
1532    return xmlContent.toString();
1533  }// toXml
1534
1535  /** This method filters any non XML char
1536    * see: http://www.w3c.org/TR/2000/REC-xml-20001006#charsets
1537    * All non XML chars will be replaced with 0x20 (space char) This assures
1538    * that the next time the document is loaded there won't be any problems.
1539    * @param aStrBuffer represents the input String that is filtred. If the
1540    * aStrBuffer is null then an empty string will be returend
1541    * @return the "purified" StringBuffer version of the aStrBuffer
1542    */
1543  private StringBuffer filterNonXmlChars(StringBuffer aStrBuffer){
1544    if (aStrBuffer == null) return new StringBuffer("");
1545    String space = new String(" ");
1546    for (int i=aStrBuffer.length()-1;i>=0; i--){
1547      if (!isXmlChar(aStrBuffer.charAt(i)))
1548        aStrBuffer.replace(i,i+1,space);
1549    }// End for
1550    return aStrBuffer;
1551  }// filterNonXmlChars()
1552
1553  /** This method decide if a char is a valid XML one or not
1554    * @param ch the char to be tested
1555    * @return true if is a valid XML char and fals if is not.
1556    */
1557  public static boolean isXmlChar(char ch){
1558    if (ch == 0x9 || ch == 0xA || ch ==0xD) return true;
1559    if ((0x20 <= ch) && (ch <= 0xD7FF)) return true;
1560    if ((0xE000 <= ch) && (ch <= 0xFFFD)) return true;
1561    if ((0x10000 <= ch) && (ch <= 0x10FFFF)) return true;
1562    return false;
1563  }// End isXmlChar()
1564
1565  /** This method saves a FeatureMap as XML elements.
1566    * @ param aFeatureMap the feature map that has to be saved as XML.
1567    * @ return a String like this: <Feature><Name>...</Name>
1568    * <Value>...</Value></Feature><Feature>...</Feature>
1569    */
1570  private String featuresToXml(FeatureMap aFeatureMap){
1571    StringBuffer str = new StringBuffer("");
1572
1573    if (aFeatureMap == null) return str.toString();
1574
1575    Set keySet = aFeatureMap.keySet();
1576    Iterator keyIterator = keySet.iterator();
1577    while(keyIterator.hasNext()){
1578      Object key = keyIterator.next();
1579      Object value = aFeatureMap.get(key);
1580      if ((key != null) && (value != null)){
1581        String keyClassName = null;
1582        String keyItemClassName = null;
1583        String valueClassName = null;
1584        String valueItemClassName = null;
1585        String key2String = key.toString();
1586        String value2String = value.toString();
1587
1588        Object item = null;
1589        // Test key if it is String, Number or Collection
1590        if (key instanceof java.lang.String ||
1591            key instanceof java.lang.Number ||
1592            key instanceof java.util.Collection)
1593          keyClassName = key.getClass().getName();
1594
1595        // Test value if it is String, Number or Collection
1596        if (value instanceof java.lang.String ||
1597            value instanceof java.lang.Number ||
1598            value instanceof java.util.Collection)
1599          valueClassName = value.getClass().getName();
1600
1601        // Features and values that are not Strings, Numbers or collections
1602        // will be discarded.
1603        if (keyClassName == null || valueClassName == null) continue;
1604
1605        // If key is collection serialize the colection in a specific format
1606        if (key instanceof java.util.Collection){
1607          StringBuffer keyStrBuff = new StringBuffer("");
1608          Iterator iter = ((Collection) key).iterator();
1609          if (iter.hasNext()){
1610            item = iter.next();
1611            if (item instanceof java.lang.Number)
1612              keyItemClassName = item.getClass().getName();
1613            else
1614              keyItemClassName = String.class.getName();
1615            keyStrBuff.append(item.toString());
1616          }// End if
1617          while (iter.hasNext()){
1618            item = iter.next();
1619            keyStrBuff.append(";" + item.toString());
1620          }// End while
1621          key2String = keyStrBuff.toString();
1622        }// End if
1623        // If key is collection serialize the colection in a specific format
1624        if (value instanceof java.util.Collection){
1625          StringBuffer valueStrBuff = new StringBuffer("");
1626          Iterator iter = ((Collection) value).iterator();
1627          if (iter.hasNext()){
1628            item = iter.next();
1629            if (item instanceof java.lang.Number)
1630              valueItemClassName = item.getClass().getName();
1631            else
1632              valueItemClassName = String.class.getName();
1633            valueStrBuff.append(item.toString());
1634          }// End if
1635          while (iter.hasNext()){
1636            item = iter.next();
1637            valueStrBuff.append(";" + item.toString());
1638          }// End while
1639          value2String = valueStrBuff.toString();
1640        }// End if
1641        str.append("<Feature>\n  <Name");
1642        if (keyClassName != null)
1643          str.append(" className=\""+keyClassName+"\"");
1644        if (keyItemClassName != null)
1645          str.append(" itemClassName=\""+keyItemClassName+"\"");
1646        str.append(">");
1647        str.append(filterNonXmlChars(replaceCharsWithEntities(key2String)));
1648        str.append("</Name>\n  <Value");
1649        if (valueClassName != null)
1650          str.append(" className=\"" + valueClassName + "\"");
1651        if (valueItemClassName != null)
1652          str.append(" itemClassName=\"" + valueItemClassName + "\"");
1653        str.append(">");
1654        str.append(filterNonXmlChars(replaceCharsWithEntities(value2String)));
1655        str.append("</Value>\n</Feature>\n");
1656      }// End if
1657    }// end While
1658    return str.toString();
1659  }//featuresToXml
1660
1661  /** This method replace all chars that appears in the anInputString and also
1662    * that are in the entitiesMap with their corresponding entity
1663    * @param anInputString the string analyzed. If it is null then returns the
1664    *  empty string
1665    * @return a string representing the input string with chars replaced with
1666    *  entities
1667    */
1668  private StringBuffer replaceCharsWithEntities(String anInputString){
1669    if (anInputString == null) return new StringBuffer("");
1670    StringBuffer strBuff = new StringBuffer(anInputString);
1671    for (int i=strBuff.length()-1; i>=0; i--){
1672      Character ch = new Character(strBuff.charAt(i));
1673      if (entitiesMap.keySet().contains(ch)){
1674        strBuff.replace(i,i+1,(String) entitiesMap.get(ch));
1675      }// End if
1676    }// End for
1677    return strBuff;
1678  }//replaceCharsWithEntities()
1679
1680  /** This method creates Node XML elements and inserts them at the
1681    * corresponding offset inside the text. Nodes are created from the default
1682    * annotation set, as well as from all existing named annotation sets.
1683    * @param aText The text representing the document's plain text.
1684    * @return The text with empty <Node id="NodeId"/> elements.
1685    */
1686  private String textWithNodes(String aText){
1687    if (aText == null) return new String("");
1688    StringBuffer textWithNodes = filterNonXmlChars(new StringBuffer(aText));
1689
1690    // Construct a map from offsets to Chars
1691    TreeMap offsets2CharsMap = new TreeMap();
1692    if (aText.length()!= 0){
1693      // Fill the offsets2CharsMap with all the indices where special chars appear
1694      buildEntityMapFromString(aText,offsets2CharsMap);
1695    }//End if
1696    // Construct the offsetsSet for all nodes belonging to this document
1697    TreeSet offsetsSet = new TreeSet();
1698    Iterator annotSetIter = this.getAnnotations().iterator();
1699    while (annotSetIter.hasNext()){
1700      Annotation annot = (Annotation) annotSetIter.next();
1701      offsetsSet.add(annot.getStartNode().getOffset());
1702      offsetsSet.add(annot.getEndNode().getOffset());
1703    }// end While
1704    // Get the nodes from all other named annotation sets.
1705    if (namedAnnotSets != null){
1706      Iterator iter = namedAnnotSets.values().iterator();
1707      while(iter.hasNext()){
1708        AnnotationSet annotSet = (AnnotationSet) iter.next();
1709        Iterator iter2 = annotSet.iterator();
1710        while(iter2.hasNext()){
1711          Annotation annotTmp = (Annotation) iter2.next();
1712          offsetsSet.add(annotTmp.getStartNode().getOffset());
1713          offsetsSet.add(annotTmp.getEndNode().getOffset());
1714        }// End while
1715      }// End while
1716    }// End if
1717    // offsetsSet is ordered in ascending order because the structure
1718    // is a TreeSet
1719
1720    if (offsetsSet.isEmpty()){
1721      return replaceCharsWithEntities(aText).toString();
1722    }// End if
1723    // Iterate through all nodes from anAnnotSet and transform them to
1724    // XML elements. Then insert those elements at the node's offset into the
1725    // textWithNodes .
1726    while (!offsetsSet.isEmpty()){
1727      Long offset = (Long) offsetsSet.last();
1728      // Eliminate the offset from the list in order to create more memory space
1729      offsetsSet.remove(offset);
1730      // Use offset
1731      int offsetValue = offset.intValue();
1732      String strNode = "<Node id=\"" + offsetValue + "\"/>";
1733      // Before inserting this string into the textWithNodes, check to see if
1734      // there are any chars to be replaced with their corresponding entities
1735      if (!offsets2CharsMap.isEmpty()){
1736        Integer offsChar = (Integer) offsets2CharsMap.lastKey();
1737        while( !offsets2CharsMap.isEmpty() &&
1738                       offsChar.intValue() >= offset.intValue()){
1739          // Replace the char at offsChar with its corresponding entity form
1740          // the entitiesMap.
1741          textWithNodes.replace(offsChar.intValue(),offsChar.intValue()+1,
1742          (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar)));
1743          // Discard the offsChar after it was used because this offset will
1744          // never appear again
1745          offsets2CharsMap.remove(offsChar);
1746          // Investigate next offsChar
1747          if (!offsets2CharsMap.isEmpty())
1748            offsChar = (Integer) offsets2CharsMap.lastKey();
1749        }// End while
1750      }// End if
1751      // Now it is safe to insert the node
1752      textWithNodes.insert(offsetValue,strNode);
1753    }// end while
1754    // Need to replace the entities in the remaining text, if there is any text
1755    // So, if there are any more items in offsets2CharsMap they need to be
1756    // replaced
1757    while (!offsets2CharsMap.isEmpty()){
1758      Integer offsChar = (Integer) offsets2CharsMap.lastKey();
1759      // Replace the char with its entity
1760      textWithNodes.replace(offsChar.intValue(),offsChar.intValue()+1,
1761      (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar)));
1762      // remove the offset from the map
1763      offsets2CharsMap.remove(offsChar);
1764    }// End while
1765    return textWithNodes.toString();
1766  }//textWithNodes()
1767
1768  /** This method saves an AnnotationSet as XML.
1769    * @param anAnnotationSet The annotation set that has to be saved as XML.
1770    * @return a String like this: <AnnotationSet> <Annotation>....
1771    * </AnnotationSet>
1772    */
1773  private String annotationSetToXml(AnnotationSet anAnnotationSet){
1774    StringBuffer str = new StringBuffer("");
1775
1776    if (anAnnotationSet == null){
1777      str.append("<AnnotationSet>\n");
1778      str.append("</AnnotationSet>\n");
1779      return str.toString();
1780    }// End if
1781    if (anAnnotationSet.getName() == null)
1782      str.append("<AnnotationSet>\n");
1783    else str.append("<AnnotationSet Name=\"" + anAnnotationSet.getName()+
1784                                                                    "\" >\n");
1785    // Iterate through AnnotationSet and save each Annotation as XML
1786    Iterator iterator = anAnnotationSet.iterator();
1787    while (iterator.hasNext()){
1788      Annotation annot = (Annotation) iterator.next();
1789      str.append("<Annotation " + "Type=\"" + annot.getType() +
1790                  "\" StartNode=\"" + annot.getStartNode().getOffset() +
1791                   "\" EndNode=\"" + annot.getEndNode().getOffset() + "\">\n");
1792      str.append(featuresToXml(annot.getFeatures()));
1793      str.append("</Annotation>\n");
1794    }// End while
1795
1796    str.append("</AnnotationSet>\n");
1797    return str.toString();
1798  }// annotationSetToXml
1799
1800  /** Returns a map with the named annotation sets. It returns <code>null</code>
1801   *  if no named annotaton set exists. */
1802  public Map getNamedAnnotationSets() {
1803    return namedAnnotSets;
1804  } // getNamedAnnotationSets
1805
1806  /**
1807   * Removes one of the named annotation sets.
1808   * Note that the default annotation set cannot be removed.
1809   * @param name the name of the annotation set to be removed
1810   */
1811  public void removeAnnotationSet(String name){
1812    Object removed = namedAnnotSets.remove(name);
1813    if(removed != null){
1814      fireAnnotationSetRemoved(
1815        new DocumentEvent(this, DocumentEvent.ANNOTATION_SET_REMOVED, name));
1816    }
1817  }
1818
1819  /** Propagate edit changes to the document content and annotations. */
1820  public void edit(Long start, Long end, DocumentContent replacement)
1821    throws InvalidOffsetException
1822  {
1823    if(! isValidOffsetRange(start, end))
1824      throw new InvalidOffsetException();
1825
1826    if(content != null)
1827      ((DocumentContentImpl) content).edit(start, end, replacement);
1828
1829    if(defaultAnnots != null)
1830      ((AnnotationSetImpl) defaultAnnots).edit(start, end, replacement);
1831
1832    if(namedAnnotSets != null) {
1833      Iterator iter = namedAnnotSets.values().iterator();
1834      while(iter.hasNext())
1835        ((AnnotationSetImpl) iter.next()).edit(start, end, replacement);
1836    }
1837
1838  } // edit(start,end,replacement)
1839
1840  /** Check that an offset is valid, i.e. it is non-null, greater than
1841    * or equal to 0 and less than the size of the document content.
1842    */
1843  public boolean isValidOffset(Long offset) {
1844    if(offset == null)
1845      return false;
1846
1847    long o = offset.longValue();
1848    if(o > getContent().size().longValue() || o < 0)
1849      return false;
1850
1851    return true;
1852  } // isValidOffset
1853
1854  /** Check that both start and end are valid offsets and that
1855    * they constitute a valid offset range, i.e. start is greater
1856    * than or equal to long.
1857    */
1858  public boolean isValidOffsetRange(Long start, Long end) {
1859    return
1860      isValidOffset(start) && isValidOffset(end) &&
1861      start.longValue() <= end.longValue();
1862  } // isValidOffsetRange(start,end)
1863
1864  /** Sets the nextAnnotationId */
1865  public void setNextAnnotationId(int aNextAnnotationId){
1866    nextAnnotationId = aNextAnnotationId;
1867  }// setNextAnnotationId();
1868
1869  /** Generate and return the next annotation ID */
1870  public Integer getNextAnnotationId() {
1871    return new Integer(nextAnnotationId++);
1872  } // getNextAnnotationId
1873
1874  /** Generate and return the next node ID */
1875  public Integer getNextNodeId() { return new Integer(nextNodeId++); }
1876
1877  /** Ordering based on URL.toString() and the URL offsets (if any) */
1878  public int compareTo(Object o) throws ClassCastException {
1879    DocumentImpl other = (DocumentImpl) o;
1880    return getOrderingString().compareTo(other.getOrderingString());
1881  } // compareTo
1882
1883  /** Utility method to produce a string for comparison in ordering.
1884    * String is based on the source URL and offsets.
1885    */
1886  protected String getOrderingString() {
1887    if(sourceUrl == null) return toString();
1888
1889    StringBuffer orderingString = new StringBuffer(sourceUrl.toString());
1890    if(sourceUrlStartOffset != null && sourceUrlEndOffset != null) {
1891      orderingString.append(sourceUrlStartOffset.toString());
1892      orderingString.append(sourceUrlEndOffset.toString());
1893    }
1894
1895    return orderingString.toString();
1896  } // getOrderingString()
1897
  /** The id of the next new annotation. It is returned and then incremented
    * by getNextAnnotationId() and overwritten by setNextAnnotationId().
    */
  protected int nextAnnotationId = 0;

  /** The id of the next new node. It is returned and then incremented by
    * getNextNodeId().
    */
  protected int nextNodeId = 0;
  /** The source URL. May be null, in which case getOrderingString() falls
    * back to toString().
    */
  protected URL sourceUrl;

  /** The document's URL name. */

  /** The content of the document. May be null; edit() skips it then. */
  protected DocumentContent content;

  /** The encoding of the source of the document content */
  protected String encoding = null;
1913
  // Data needed in toXml(AnnotationSet) methods

  /** This field indicates whether or not to add the tag
    * called GatePreserveFormat to the document. HTML, XML, SGML docs won't
    * have this tag added
    */
//  private boolean addGatePreserveFormatTag = false;

  /**
   * Used by the XML dump preserving format method
   */
  private Annotation theRootAnnotation = null;

  /** This field is used when creating StringBuffers for toXml() methods.
    * The initial capacity of the StringBuffer will be the size of the
    * document content multiplied by this value. It is aimed to improve the
    * performance of StringBuffer. With the current value of 1 the buffer
    * starts at exactly the size of the content.
    */
  private final int DOC_SIZE_MULTIPLICATION_FACTOR = 1;

  /** Constant used in the inner class AnnotationComparator to order
    * annotations on their start offset
    */
  private final int ORDER_ON_START_OFFSET = 0;
  /** Constant used in the inner class AnnotationComparator to order
    * annotations on their end offset
    */
  private final int ORDER_ON_END_OFFSET = 1;
  /** Constant used in the inner class AnnotationComparator to order
    * annotations on their ID
    */
  private final int ORDER_ON_ANNOT_ID = 2;
  /** Constant used in the inner class AnnotationComparator to order
    * annotations ascending. It is also the default order type of a new
    * AnnotationComparator.
    */
  private final int ASC = 3;
  /** Constant used in the inner class AnnotationComparator to order
    * annotations descending. The comparator treats every order type other
    * than ASC as descending.
    */
  private final int DESC = -3;
1953
1954  /** A map initialized in init() containing entities that needs to be
1955    * replaced in strings
1956    */
1957  private static Map entitiesMap = null;
1958  // Initialize the entities map use when saving as xml
1959  static{
1960    entitiesMap = new HashMap();
1961    entitiesMap.put(new Character('<'),"&lt;");
1962    entitiesMap.put(new Character('>'),"&gt;");
1963    entitiesMap.put(new Character('&'),"&amp;");
1964    entitiesMap.put(new Character('\''),"&apos;");
1965    entitiesMap.put(new Character('"'),"&quot;");
1966    entitiesMap.put(new Character((char)160),"&#160;");
1967    entitiesMap.put(new Character((char)169),"&#169;");
1968  }//static
1969
  /** The range that the content comes from at the source URL
    * (or null if none).
    */
  //protected Long[] sourceUrlOffsets;

  /** The start of the range that the content comes from at the source URL
    * (or null if none). It takes part in getOrderingString() only when the
    * end offset is set as well.
    */
  protected Long sourceUrlStartOffset;

  /** The end of the range that the content comes from at the source URL
    * (or null if none). It takes part in getOrderingString() only when the
    * start offset is set as well.
    */
  protected Long sourceUrlEndOffset;

  /** The default annotation set. May be null; edit() skips it then. */
  protected AnnotationSet defaultAnnots;

  /** Named sets of annotations. The map is null when no named annotation
    * set exists, so it has to be checked for null before it is used.
    */
  protected Map namedAnnotSets;

  /**
   * A property of the document that will be set when the user
   * wants to create the document from a string, as opposed to from
   * a URL. It is read and written through getStringContent() and
   * setStringContent().
   */
  private String stringContent;
1997
1998  /**
1999   * The stringContent of a document is
2000   * a property of the document that will be set when the user
2001   * wants to create the document from a string, as opposed to from
2002   * a URL.
2003   * <B>Use the <TT>getContent</TT> method instead to get the actual document
2004   * content.</B>
2005   */
2006  public String getStringContent() { return stringContent; }
2007
2008  /**
2009   * The stringContent of a document is
2010   * a property of the document that will be set when the user
2011   * wants to create the document from a string, as opposed to from
2012   * a URL.
2013   * <B>Use the <TT>setContent</TT> method instead to update the actual
2014   * document content.</B>
2015   */
2016  public void setStringContent(String stringContent) {
2017    this.stringContent = stringContent;
2018  } // set StringContent
2019
  /** Is the document markup-aware? It starts as false and is included in
    * the output of toString().
    */
  protected Boolean markupAware = new Boolean(false);
2022
2023//  /** Hash code */
2024//  public int hashCode() {
2025//    int code = getContent().hashCode();
2026//    int memberCode = (defaultAnnots == null) ? 0 : defaultAnnots.hashCode();
2027//    code += memberCode;
2028//    memberCode = (encoding == null) ? 0 : encoding.hashCode();
2029//    code += memberCode;
2030//    memberCode = (features == null) ? 0 : features.hashCode();
2031//    code += memberCode;
2032//    code += (markupAware.booleanValue()) ? 0 : 1;
2033//    memberCode = (namedAnnotSets == null) ? 0 : namedAnnotSets.hashCode();
2034//    code += memberCode;
2035//    code += nextAnnotationId;
2036//    code += nextNodeId;
2037//    memberCode = (sourceUrl == null) ? 0 : sourceUrl.hashCode();
2038//    code += memberCode;
2039//    memberCode =
2040//      (sourceUrlStartOffset == null) ? 0 : sourceUrlStartOffset.hashCode();
2041//    code += memberCode;
2042//    memberCode =
2043//      (sourceUrlEndOffset == null) ? 0 : sourceUrlEndOffset.hashCode();
2044//    code += memberCode;
2045//    return code;
2046//  } // hashcode
2047
2048  /** String respresentation */
2049  public String toString() {
2050    String n = Strings.getNl();
2051    StringBuffer s = new StringBuffer("DocumentImpl: " + n);
2052    s.append("  content:" + content + n);
2053    s.append("  defaultAnnots:" + defaultAnnots + n);
2054    s.append("  encoding:" + encoding + n);
2055    s.append("  features:" + features + n);
2056    s.append("  markupAware:" + markupAware + n);
2057    s.append("  namedAnnotSets:" + namedAnnotSets + n);
2058    s.append("  nextAnnotationId:" + nextAnnotationId + n);
2059    s.append("  nextNodeId:" + nextNodeId + n);
2060    s.append("  sourceUrl:" + sourceUrl + n);
2061    s.append("  sourceUrlStartOffset:" + sourceUrlStartOffset + n);
2062    s.append("  sourceUrlEndOffset:" + sourceUrlEndOffset + n);
2063    s.append(n);
2064
2065    return s.toString();
2066  } // toString
2067
   /** Freeze the serialization UID, so that the value is explicit rather
     * than computed from the class. */
  static final long serialVersionUID = -8456893608311510260L;
2070
2071  /** Inner class needed to compare annotations*/
2072  class AnnotationComparator implements java.util.Comparator {
2073    int orderOn = -1;
2074    int orderType = ASC;
2075    /** Constructs a comparator according to one of three sorter types:
2076      * ORDER_ON_ANNOT_TYPE, ORDER_ON_END_OFFSET, ORDER_ON_START_OFFSET
2077      */
2078      public AnnotationComparator(int anOrderOn, int anOrderType){
2079        orderOn = anOrderOn;
2080        orderType = anOrderType;
2081      }// AnnotationComparator()
2082
2083      /**This method must be implemented according to Comparator interface */
2084      public int compare(Object o1, Object o2){
2085        Annotation a1 = (Annotation) o1;
2086        Annotation a2 = (Annotation) o2;
2087        // ORDER_ON_START_OFFSET ?
2088        if (orderOn == ORDER_ON_START_OFFSET){
2089          int result = a1.getStartNode().getOffset().compareTo(
2090                                                a2.getStartNode().getOffset());
2091          if (orderType == ASC){
2092            // ASC
2093            // If they are equal then their ID will decide.
2094            if (result == 0)
2095              return a1.getId().compareTo(a2.getId());
2096            return result;
2097          }else{
2098            // DESC
2099            if (result == 0)
2100              return - (a1.getId().compareTo(a2.getId()));
2101            return -result;
2102          }// End if (orderType == ASC)
2103        }// End if (orderOn == ORDER_ON_START_OFFSET)
2104
2105        // ORDER_ON_END_OFFSET ?
2106        if (orderOn == ORDER_ON_END_OFFSET){
2107          int result = a1.getEndNode().getOffset().compareTo(
2108                                                a2.getEndNode().getOffset());
2109          if (orderType == ASC){
2110            // ASC
2111            // If they are equal then their ID will decide.
2112            if (result == 0)
2113              return - (a1.getId().compareTo(a2.getId()));
2114            return result;
2115          }else{
2116            // DESC
2117            // If they are equal then their ID will decide.
2118            if (result == 0)
2119              return a1.getId().compareTo(a2.getId());
2120            return - result;
2121          }// End if (orderType == ASC)
2122        }// End if (orderOn == ORDER_ON_END_OFFSET)
2123
2124        // ORDER_ON_ANNOT_ID ?
2125        if (orderOn == ORDER_ON_ANNOT_ID){
2126          if (orderType == ASC)
2127            return a1.getId().compareTo(a2.getId());
2128          else
2129            return -(a1.getId().compareTo(a2.getId()));
2130        }// End if
2131        return 0;
2132      }//compare()
2133  } // End inner class AnnotationComparator
2134
2135
  /** The registered DocumentListeners, or null while nobody has registered.
    * The add and remove methods never modify the current vector; they
    * assign a modified clone instead.
    */
  private transient Vector documentListeners;
  // NOTE(review): not referenced by any of the code near this declaration -
  // confirm whether it is still needed.
  private transient Vector gateListeners;
2138
2139  public synchronized void removeDocumentListener(DocumentListener l) {
2140    if (documentListeners != null && documentListeners.contains(l)) {
2141      Vector v = (Vector) documentListeners.clone();
2142      v.removeElement(l);
2143      documentListeners = v;
2144    }
2145  }
2146  public synchronized void addDocumentListener(DocumentListener l) {
2147    Vector v = documentListeners == null ? new Vector(2) : (Vector) documentListeners.clone();
2148    if (!v.contains(l)) {
2149      v.addElement(l);
2150      documentListeners = v;
2151    }
2152  }
2153
2154  protected void fireAnnotationSetAdded(DocumentEvent e) {
2155    if (documentListeners != null) {
2156      Vector listeners = documentListeners;
2157      int count = listeners.size();
2158      for (int i = 0; i < count; i++) {
2159        ((DocumentListener) listeners.elementAt(i)).annotationSetAdded(e);
2160      }
2161    }
2162  }
2163
2164  protected void fireAnnotationSetRemoved(DocumentEvent e) {
2165    if (documentListeners != null) {
2166      Vector listeners = documentListeners;
2167      int count = listeners.size();
2168      for (int i = 0; i < count; i++) {
2169        ((DocumentListener) listeners.elementAt(i)).annotationSetRemoved(e);
2170      }
2171    }
2172  }
  /** Creole event callback. This implementation does nothing. */
  public void resourceLoaded(CreoleEvent e) {
  }
  /** Creole event callback. This implementation does nothing. */
  public void resourceUnloaded(CreoleEvent e) {
  }
  /** Creole event callback. This implementation does nothing. */
  public void datastoreOpened(CreoleEvent e) {
  }
  /** Creole event callback. This implementation does nothing. */
  public void datastoreCreated(CreoleEvent e) {
  }
  /** Callback for the renaming of a resource. This implementation does
    * nothing.
    */
  public void resourceRenamed(Resource resource, String oldName,
                              String newName){
  }
2184  public void datastoreClosed(CreoleEvent e) {
2185    if (! e.getDatastore().equals(this.getDataStore()))
2186      return;
2187    //close this lr, since it cannot stay open when the DS it comes from
2188    //is closed
2189    Factory.deleteResource(this);
2190  }
2191  public void setLRPersistenceId(Object lrID) {
2192    super.setLRPersistenceId( lrID);
2193    //make persistent documents listen to the creole register
2194    //for events about their DS
2195    Gate.getCreoleRegister().addCreoleListener(this);
2196  }
  /** Datastore event callback. This implementation does nothing. */
  public void resourceAdopted(DatastoreEvent evt) {
  }
2199  public void resourceDeleted(DatastoreEvent evt) {
2200    if(! evt.getSource().equals(this.getDataStore()))
2201      return;
2202    //if an open document is deleted from a DS, then
2203    //it must close itself immediately, as is no longer valid
2204    if(evt.getResourceID().equals(this.getLRPersistenceId()))
2205      Factory.deleteResource(this);
2206  }
  /** Datastore event callback. This implementation does nothing. */
  public void resourceWritten(DatastoreEvent evt) {
  }
2209  public void setDataStore(DataStore dataStore) throws gate.persist.PersistenceException {
2210    super.setDataStore( dataStore);
2211    if (this.dataStore != null)
2212      this.dataStore.addDatastoreListener(this);
2213  }
2214
2215} // class DocumentImpl
2216