1   /*
2    *  DocumentImpl.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Hamish Cunningham, 11/Feb/2000
12   *
13   *  $Id: DocumentImpl.java,v 1.109 2002/02/28 15:08:47 nasso Exp $
14   */
15  
16  package gate.corpora;
17  
18  import java.util.*;
19  import java.net.*;
20  import java.io.*;
21  
22  import gate.*;
23  import gate.annotation.*;
24  import gate.util.*;
25  import gate.creole.*;
26  import gate.gui.*;
27  import gate.event.*;
28  
29  /** Represents the commonalities between all sorts of documents.
30    *
31    * <H2>Editing</H2>
32    *
33    * <P>
34    * The DocumentImpl class implements the Document interface.
35    * The DocumentContentImpl class models the textual or audio-visual
36    * materials which are the source and content of Documents.
37    * The AnnotationSetImpl class supplies annotations on Documents.
38    *
39    * <P>
40    * Abbreviations:
41    *
42    * <UL>
43    * <LI>
44    * DC = DocumentContent
45    * <LI>
46    * D = Document
47    * <LI>
48    * AS = AnnotationSet
49    * </UL>
50    *
51    * <P>
52    * We add an edit method to each of these classes; for DC and AS
53    * the methods are package private; D has the public method.
54    *
55    * <PRE>
56    *   void edit(Long start, Long end, DocumentContent replacement)
57    *   throws InvalidOffsetException;
58    * </PRE>
59    *
60    * <P>
61    * D receives edit requests and forwards them to DC and AS.
62    * On DC, this method makes a change to the content - e.g. replacing
63    * a String range from start to end with replacement. (Deletions
64    * are catered for by having replacement = null.) D then calls
65    * AS.edit on each of its annotation sets.
66    *
67    * <P>
68    * On AS, edit calls replacement.size() (i.e. DC.size()) to
69    * figure out how long the replacement is (0 for null). It then
70    * considers annotations that terminate (start or end) in
71    * the altered or deleted range as invalid; annotations that
72    * terminate after the range have their offsets adjusted.
73    * I.e.:
74    * <UL>
75    * <LI>
76    * the nodes that pointed inside the old modified area are invalid now and
77    * will be deleted along with the connected annotations;
78    * <LI>
79    * the nodes that are before the start of the modified area remain
80    * untouched;
81    * <LI>
82    * the nodes that are after the end of the affected area will have the
83    * offset changed according to the formula below.
84    * </UL>
85    *
86    * <P>
87    * A note re. AS and annotations: annotations no longer have
88    * offsets as in the old model, they now have nodes, and nodes
89    * have offsets.
90    *
91    * <P>
92    * To implement AS.edit, we have several indices:
93    * <PRE>
94    *   HashMap annotsByStartNode, annotsByEndNode;
95    * </PRE>
96    * which map node ids to annotations;
97    * <PRE>
98    *   RBTreeMap nodesByOffset;
99    * </PRE>
100   * which maps offset to Nodes.
101   *
102   * <P>
103   * When we get an edit request, we traverse that part of the
104   * nodesByOffset tree representing the altered or deleted
105   * range of the DC. For each node found, we delete any annotations
106   * that terminate on the node, and then delete the node itself.
107   * We then traverse the rest of the tree, changing the offset
108   * on all remaining nodes by:
109   * <PRE>
110   *   newOffset =
111   *     oldOffset -
112   *     (
113   *       (end - start) -                                     // size of mod
114   *       ( (replacement == null) ? 0 : replacement.size() )  // size of repl
115   *     );
116   * </PRE>
117   * Note that we use the same convention as e.g. java.lang.String: start
118   * offsets are inclusive; end offsets are exclusive. I.e. for string "abcd"
119   * range 1-3 = "bc". Examples, for a node with offset 4:
120   * <PRE>
121   * edit(1, 3, "BC");
122   * newOffset = 4 - ( (3 - 1) - 2 ) = 4
123   *
124   * edit(1, 3, null);
125   * newOffset = 4 - ( (3 - 1) - 0 ) = 2
126   *
127   * edit(1, 3, "BBCC");
128   * newOffset = 4 - ( (3 - 1) - 4 ) = 6
129   * </PRE>
130   */
131 public class DocumentImpl
132 extends AbstractLanguageResource implements Document, CreoleListener, DatastoreListener {
133   /** Debug flag */
134   private static final boolean DEBUG = false;
135 
136   /** If you set this flag to true the original content of the document will
137    *  be kept in the document feature. <br>
138    *  Default value is false to avoid the unnecessary waste of memory */
139   private Boolean preserveOriginalContent = new Boolean(false);
140 
141   /** If you set this flag to true the repositioning information for
142    *  the document will be kept in the document feature. <br>
143    *  Default value is false to avoid the unnecessary waste of time and memory
144    */
145   private Boolean collectRepositioningInfo = new Boolean(false);
146 
147   /**
148    * This is a variable which contains the latest crossed over annotation
149    * found during export with preserving format, i.e., toXml(annotations)
150    * method.
151    */
152   private Annotation crossedOverAnnotation = null;
153 
154   /** Default construction. Content left empty. */
155   public DocumentImpl() {
156     content = new DocumentContentImpl();
157   } // default construction
158 
159   /** Initialise this resource, and return it. */
160   public Resource init() throws ResourceInstantiationException {
161 
162     // set up the source URL and create the content
163     if(sourceUrl == null) {
164       if(stringContent == null) {
165         throw new ResourceInstantiationException(
166           "The sourceURL and document's content were null."
167         );
168       }
169 
170       content = new DocumentContentImpl(stringContent);
171       getFeatures().put("gate.SourceURL", "created from String");
172     } else {
173       try {
174 
175         content = new DocumentContentImpl(
176           sourceUrl, encoding, sourceUrlStartOffset, sourceUrlEndOffset);
177         getFeatures().put("gate.SourceURL", sourceUrl.toExternalForm());
178       } catch(IOException e) {
179         throw new ResourceInstantiationException("DocumentImpl.init: " + e);
180       }
181 
182       if(preserveOriginalContent.booleanValue() && content != null) {
183         String originalContent = new String(
184           ((DocumentContentImpl) content).getOriginalContent());
185         getFeatures().put(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME,
186                       originalContent);
187       } // if
188     }
189 
190     // set up a DocumentFormat if markup unpacking required
191     if(getMarkupAware().booleanValue()) {
192       DocumentFormat docFormat =
193         DocumentFormat.getDocumentFormat(this, sourceUrl);
194       try {
195         if(docFormat != null){
196           StatusListener sListener = (StatusListener)
197                                       gate.gui.MainFrame.getListeners().
198                                       get("gate.event.StatusListener");
199           if(sListener != null) docFormat.addStatusListener(sListener);
200 
201           // set the flag if true and if the document format support collecting
202           docFormat.setShouldCollectRepositioning(collectRepositioningInfo);
203 
204           if(docFormat.getShouldCollectRepositioning().booleanValue()) {
205             // unpack with collectiong of repositioning information
206             RepositioningInfo info = new RepositioningInfo();
207 
208             String origContent = (String) getFeatures().get(
209                 GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME);
210 
211             RepositioningInfo ampCodingInfo = new RepositioningInfo();
212             if(origContent != null) {
213               boolean shouldCorrectCR = docFormat instanceof XmlDocumentFormat;
214               collectInformationForAmpCodding(origContent, ampCodingInfo,
215                                               shouldCorrectCR);
216               if(docFormat instanceof HtmlDocumentFormat) {
217                 collectInformationForWS(origContent, ampCodingInfo);
218               } // if
219             } // if
220 
221             docFormat.unpackMarkup(this, info, ampCodingInfo);
222 
223             if(origContent != null
224                 && docFormat instanceof XmlDocumentFormat) {
225               // CRLF correction of RepositioningInfo
226               correctRepositioningForCRLFInXML(origContent, info);
227             } // if
228 
229             getFeatures().put(
230                 GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME, info);
231           }
232           else {
233             // normal old fashioned unpack
234             docFormat.unpackMarkup(this);
235           }
236           docFormat.removeStatusListener(sListener);
237        } //if format != null
238       } catch(DocumentFormatException e) {
239         throw new ResourceInstantiationException(
240           "Couldn't unpack markup in document " + sourceUrl.toExternalForm() +
241           " " + e
242         );
243       }
244     } // if markup aware
245 
246     return this;
247   } // init()
248 
249   /**
250    * Correct repositioning information for substitution of "\r\n" with "\n"
251    */
252   private void correctRepositioningForCRLFInXML(String content,
253                                             RepositioningInfo info) {
254     int index = -1;
255 
256     do {
257       index = content.indexOf("\r\n", index+1);
258       if(index != -1) {
259         info.correctInformationOriginalMove(index, 1);
260       } // if
261     } while(index != -1);
262   } // correctRepositioningForCRLF
263 
264   /**
265    * Collect information for substitution of "&xxx;" with "y"
266    *
267    * It couldn't be collected a position information about
268    * some unicode and &-coded symbols during parsing. The parser "hide" the
269    * information about the position of such kind of parsed text.
270    * So, there is minimal chance to have &-coded symbol inside the covered by
271    * repositioning records area. The new record should be created for every
272    * coded symbol outside the existing records.
273    * <BR>
274    * If <code>shouldCorrectCR</code> flag is <code>true</code> the correction
275    * for CRLF substitution is performed.
276    */
277   private void collectInformationForAmpCodding(String content,
278                                             RepositioningInfo info,
279                                             boolean shouldCorrectCR) {
280 
281     if(content == null || info == null) return;
282 
283     int ampIndex = -1;
284     int semiIndex;
285 
286     do {
287       ampIndex = content.indexOf('&', ampIndex+1);
288       if(ampIndex != -1) {
289         semiIndex = content.indexOf(';', ampIndex+1);
290         // have semicolon and it is near enough for amp codding
291         if(semiIndex != -1 && (semiIndex-ampIndex) < 8) {
292           info.addPositionInfo(ampIndex, semiIndex-ampIndex+1, 0, 1);
293         }
294         else {
295           // no semicolon or it is too far
296           // analyse for amp codding without semicolon
297           int maxEnd = Math.min(ampIndex+8, content.length());
298           String ampCandidate = content.substring(ampIndex, maxEnd);
299           int ampCodingSize = analyseAmpCodding(ampCandidate);
300 
301           if(ampCodingSize != -1) {
302             info.addPositionInfo(ampIndex, ampCodingSize, 0, 1);
303           } // if
304 
305         } // if - semicolon found
306       } // if - ampersand found
307     } while (ampIndex != -1);
308 
309     // correct the collected information to adjust it's positions
310     // with reported by the parser
311     int index = -1;
312 
313     if(shouldCorrectCR) {
314       do {
315         index = content.indexOf("\r\n", index+1);
316         if(index != -1) {
317           info.correctInformationOriginalMove(index, -1);
318         } // if
319       } while(index != -1);
320     } // if
321   } // collectInformationForAmpCodding
322 
323   /**
324    * This function compute size of the ampersand codded sequence when
325    * semicolin is not present.
326    */
327   private int analyseAmpCodding(String content) {
328     int result = -1;
329 
330     try {
331       char ch = content.charAt(1);
332 
333       switch(ch) {
334         case 'l' : // &lt
335         case 'L' : // &lt
336           if(content.charAt(2) == 't' || content.charAt(2) == 'T') {
337             result = 3;
338           } // if
339           break;
340         case 'g' : // &gt
341         case 'G' : // &gt
342           if(content.charAt(2) == 't' || content.charAt(2) == 'T') {
343             result = 3;
344           } // if
345           break;
346         case 'a' : // &amp
347         case 'A' : // &amp
348           if(content.substring(2, 4).equalsIgnoreCase("mp")) {
349             result = 4;
350           } // if
351           break;
352         case 'q' : // &quot
353         case 'Q' : // &quot
354           if(content.substring(2, 5).equalsIgnoreCase("uot")) {
355             result = 5;
356           } // if
357           break;
358         case '#' : // #number (example &#145, &#x4C38)
359           int endIndex = 2;
360           boolean hexCoded = false;
361           if(content.charAt(2) == 'x' || content.charAt(2) == 'X') {
362             // Hex codding
363             ++endIndex;
364             hexCoded = true;
365           } // if
366 
367           while (endIndex < 8
368                   && isNumber(content.charAt(endIndex), hexCoded) ) {
369             ++endIndex;
370           } // while
371           result = endIndex;
372           break;
373       } // switch
374     } catch (StringIndexOutOfBoundsException ex) {
375       // do nothing
376     } // catch
377 
378     return result;
379   } // analyseAmpCodding
380 
381   /** Check for numeric range. If hex is true the A..F range is included */
382   private boolean isNumber(char ch, boolean hex) {
383     if(ch >= '0' && ch <= '9') return true;
384 
385     if(hex) {
386       if(ch >= 'A' && ch <= 'F') return true;
387       if(ch >= 'a' && ch <= 'f') return true;
388     } // if
389 
390     return false;
391   } // isNumber
392 
393   /** HTML parser perform substitution of multiple whitespaces (WS) with
394    *  a single WS. To create correct repositioning information structure we
395    *  should keep the information for such multiple WS.
396    *  <BR>
397    *  The criteria for WS is <code>(ch <= ' ')</code>.
398    */
399   private void collectInformationForWS(String content, RepositioningInfo info) {
400 
401     if(content == null || info == null) return;
402 
403     // analyse the content and correct the repositioning information
404     char ch;
405     int startWS, endWS;
406 
407     startWS = endWS = -1;
408     int contentLength = content.length();
409 
410     for(int i=0; i<contentLength; ++i) {
411       ch = content.charAt(i);
412 
413       // is whitespace
414       if(ch <= ' ') {
415         if(startWS == -1) {
416           startWS = i;
417         } // if
418         endWS = i;
419       }
420       else {
421         if(endWS - startWS > 0) {
422           // put the repositioning information about the WS substitution
423           info.addPositionInfo(
424             (long)startWS, (long)(endWS - startWS + 1), 0, 1);
425         } // if
426         // clear positions
427         startWS = endWS = -1;
428       }// if
429     } // for
430   } // collectInformationForWS
431 
432   /** Clear all the data members of the object. */
433   public void cleanup() {
434 
435     defaultAnnots = null;
436     if ( (namedAnnotSets != null) && (!namedAnnotSets.isEmpty()))
437         namedAnnotSets.clear();
438     if (DEBUG) Out.prln("Document cleanup called");
439     if (this.lrPersistentId != null)
440       Gate.getCreoleRegister().removeCreoleListener(this);
441     if(this.getDataStore() != null)
442       this.getDataStore().removeDatastoreListener(this);
443   } // cleanup()
444 
445 
446   /** Documents are identified by URLs */
447   public URL getSourceUrl() { return sourceUrl; }
448 
449   /** Set method for the document's URL */
450   public void setSourceUrl(URL sourceUrl) {
451     this.sourceUrl = sourceUrl;
452   } // setSourceUrl
453 
454   /** Documents may be packed within files; in this case an optional pair of
455     * offsets refer to the location of the document.
456     */
457   public Long[] getSourceUrlOffsets() {
458     Long[] sourceUrlOffsets = new Long[2];
459     sourceUrlOffsets[0] = sourceUrlStartOffset;
460     sourceUrlOffsets[1] = sourceUrlEndOffset;
461     return sourceUrlOffsets;
462   } // getSourceUrlOffsets
463 
464   /**
465    * Allow/disallow preserving of the original document content.
466    * If is <B>true</B> the original content will be retrieved from
467    * the DocumentContent object and preserved as document feature.
468    */
469   public void setPreserveOriginalContent(Boolean b) {
470     preserveOriginalContent = b;
471   } // setPreserveOriginalContent
472 
473   /** Get the preserving of content status of the Document.
474    *
475    *  @return whether the Document should preserve it's original content.
476    */
477   public Boolean getPreserveOriginalContent() {
478     return preserveOriginalContent;
479   } // getPreserveOriginalContent
480 
481   /**
482    *  Allow/disallow collecting of repositioning information.
483    *  If is <B>true</B> information will be retrieved and preserved
484    *  as document feature.<BR>
485    *  Preserving of repositioning information give the possibilities
486    *  for converting of coordinates between the original document content and
487    *  extracted from the document text.
488    */
489   public void setCollectRepositioningInfo(Boolean b) {
490     collectRepositioningInfo = b;
491   } // setCollectRepositioningInfo
492 
493   /** Get the collectiong and preserving of repositioning information
494    *  for the Document. <BR>
495    *  Preserving of repositioning information give the possibilities
496    *  for converting of coordinates between the original document content and
497    *  extracted from the document text.
498    *
499    *  @return whether the Document should collect and preserve information.
500    */
501   public Boolean getCollectRepositioningInfo() {
502     return collectRepositioningInfo;
503   } // getCollectRepositioningInfo
504 
505   /** Documents may be packed within files; in this case an optional pair of
506     * offsets refer to the location of the document. This method gets the
507     * start offset.
508     */
509   public Long getSourceUrlStartOffset() { return sourceUrlStartOffset; }
510 
511   /** Documents may be packed within files; in this case an optional pair of
512     * offsets refer to the location of the document. This method sets the
513     * start offset.
514     */
515   public void setSourceUrlStartOffset(Long sourceUrlStartOffset) {
516     this.sourceUrlStartOffset = sourceUrlStartOffset;
517   } // setSourceUrlStartOffset
518 
519   /** Documents may be packed within files; in this case an optional pair of
520     * offsets refer to the location of the document. This method gets the
521     * end offset.
522     */
523   public Long getSourceUrlEndOffset() { return sourceUrlEndOffset; }
524 
525   /** Documents may be packed within files; in this case an optional pair of
526     * offsets refer to the location of the document. This method sets the
527     * end offset.
528     */
529   public void setSourceUrlEndOffset(Long sourceUrlEndOffset) {
530     this.sourceUrlEndOffset = sourceUrlEndOffset;
531   } // setSourceUrlStartOffset
532 
533   /** The content of the document: a String for text; MPEG for video; etc. */
534   public DocumentContent getContent() { return content; }
535 
536   /** Set method for the document content */
537   public void setContent(DocumentContent content) { this.content = content; }
538 
539   /** Get the encoding of the document content source */
540   public String getEncoding() { return encoding; }
541 
542   /** Set the encoding of the document content source */
543   public void setEncoding(String encoding) { this.encoding = encoding; }
544 
545   /** Get the default set of annotations. The set is created if it
546     * doesn't exist yet.
547     */
548   public AnnotationSet getAnnotations() {
549     if(defaultAnnots == null){
550       defaultAnnots = new AnnotationSetImpl(this);
551       fireAnnotationSetAdded(new DocumentEvent(
552            this, DocumentEvent.ANNOTATION_SET_ADDED, null));
553     }//if
554     return defaultAnnots;
555   } // getAnnotations()
556 
557   /** Get a named set of annotations. Creates a new set if one with this
558     * name doesn't exist yet.
559     * If the provided name is null then it returns the default annotation set.
560     */
561   public AnnotationSet getAnnotations(String name) {
562     if(name == null) return getAnnotations();
563     if(namedAnnotSets == null)
564       namedAnnotSets = new HashMap();
565     AnnotationSet namedSet = (AnnotationSet) namedAnnotSets.get(name);
566 
567     if(namedSet == null) {
568       namedSet = new AnnotationSetImpl(this, name);
569       namedAnnotSets.put(name, namedSet);
570 
571       DocumentEvent evt = new DocumentEvent(
572         this, DocumentEvent.ANNOTATION_SET_ADDED, name
573       );
574       fireAnnotationSetAdded(evt);
575     }
576     return namedSet;
577   } // getAnnotations(name)
578 
579   /** Make the document markup-aware. This will trigger the creation
580    *  of a DocumentFormat object at Document initialisation time; the
581    *  DocumentFormat object will unpack the markup in the Document and
582    *  add it as annotations. Documents are <B>not</B> markup-aware by default.
583    *
584    *  @param b markup awareness status.
585    */
586   public void setMarkupAware(Boolean newMarkupAware) {
587       this.markupAware = newMarkupAware;
588   }
589 
590   /** Get the markup awareness status of the Document.
591    *  <B>Documents are markup-aware by default.</B>
592    *  @return whether the Document is markup aware.
593    */
594   public Boolean getMarkupAware() { return markupAware; }
595 
596   /** Returns an XML document aming to preserve the original markups(
597     * the original markup will be in the same place and format as it was
598     * before processing the document) and include (if possible)
599     * the annotations specified in the aSourceAnnotationSet.
600     * It is equivalent to toXml(aSourceAnnotationSet, true).
601     */
602   public String toXml(Set aSourceAnnotationSet){
603     return toXml(aSourceAnnotationSet, true);
604   }
605 
606   /** Returns an XML document aming to preserve the original markups(
607     * the original markup will be in the same place and format as it was
608     * before processing the document) and include (if possible)
609     * the annotations specified in the aSourceAnnotationSet.
610     * <b>Warning:</b> Annotations from the aSourceAnnotationSet will be lost
611     * if they will cause a crosed over situation.
612     * @param aSourceAnnotationSet is an annotation set containing all the
613     * annotations that will be combined with the original marup set. If the
614     * param is <code>null</code> it will only dump the original markups.
615     * @param includeFeatures is a boolean that controls whether the annotation
616     * features should be included or not. If false, only the annotation type
617     * is included in the tag.
618     * @return a string representing an XML document containing the original
619     * markup + dumped annotations form the aSourceAnnotationSet
620     */
621   public String toXml(Set aSourceAnnotationSet, boolean includeFeatures){
622 
623     if(hasOriginalContentFeatures()) {
624       return saveAnnotationSetAsXmlInOrig(aSourceAnnotationSet,includeFeatures);
625     } // if
626 
627     AnnotationSet originalMarkupsAnnotSet =
628             this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
629 
630     // Create a dumping annotation set on the document. It will be used for
631     // dumping annotations...
632     AnnotationSet dumpingSet = new AnnotationSetImpl((Document) this);
633 
634     // This set will be constructed inside this method. If is not empty, the
635     // annotation contained will be lost.
636     if (!dumpingSet.isEmpty()){
637       Out.prln("WARNING: The dumping annotation set was not empty."+
638       "All annotation it contained were lost.");
639       dumpingSet.clear();
640     }// End if
641 
642     StatusListener sListener = (StatusListener)
643                                gate.gui.MainFrame.getListeners().
644                                get("gate.event.StatusListener");
645     // Construct the dumping set in that way that all annotations will verify
646     // the condition that there are not annotations which are crossed.
647     // First add all annotation from the original markups
648     if(sListener != null)
649       sListener.statusChanged("Constructing the dumping annotation set.");
650     dumpingSet.addAll(originalMarkupsAnnotSet);
651     // Then take all the annotations from aSourceAnnotationSet and verify if
652     // they can be inserted safely into the dumpingSet. Where not possible,
653     // report.
654     if (aSourceAnnotationSet != null){
655       Iterator iter = aSourceAnnotationSet.iterator();
656       while (iter.hasNext()){
657         Annotation currentAnnot = (Annotation) iter.next();
658         if(insertsSafety(dumpingSet,currentAnnot)){
659           dumpingSet.add(currentAnnot);
660         }else if (crossedOverAnnotation != null){
661           try {
662             Out.prln("Warning: Annotations were found to violate the " +
663             "crossed over condition: \n" +
664             "1. [" +
665             getContent().getContent(
666                            crossedOverAnnotation.getStartNode().getOffset(),
667                            crossedOverAnnotation.getEndNode().getOffset()) +
668             " (" + crossedOverAnnotation.getType() + ": " +
669             crossedOverAnnotation.getStartNode().getOffset() +
670             ";" + crossedOverAnnotation.getEndNode().getOffset() +
671             ")]\n" +
672             "2. [" +
673             getContent().getContent(
674                            currentAnnot.getStartNode().getOffset(),
675                            currentAnnot.getEndNode().getOffset()) +
676             " (" + currentAnnot.getType() + ": " +
677             currentAnnot.getStartNode().getOffset() +
678             ";" + currentAnnot.getEndNode().getOffset() +
679             ")]\nThe second one will be discarded.\n"  );
680           } catch (gate.util.InvalidOffsetException ex) {
681             throw new GateRuntimeException(ex.getMessage());
682           }
683         }// End if
684       }// End while
685     }// End if
686 
687     // The dumpingSet is ready to be exported as XML
688     // Here we go.
689     if(sListener != null) sListener.statusChanged("Dumping annotations as XML");
690     StringBuffer xmlDoc = new StringBuffer(
691           DOC_SIZE_MULTIPLICATION_FACTOR*(this.getContent().size().intValue()));
692     // Add xml header
693 //    xmlDoc.append("<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n");
694 
695     // If the annotation set contains this "GatePreserveFormat"
696     // type, then this is removed because it will be added in the saving
697     // process. The reason of this removal is that if the loaded document
698     // was previously loaded from a GatePreserveFormat then we
699     // don't want to create lots of annotation for this type. This annotation
700     // type should be always the root element of a XML preserving format
701     // GATE document.
702     FeatureMap docFeatures = this.getFeatures();
703     String mimeTypeStr = null;
704 //    addGatePreserveFormatTag = false;
705     if (  docFeatures != null &&
706           null != (mimeTypeStr=(String)docFeatures.get("MimeType")) &&
707           (
708             "text/html".equalsIgnoreCase(mimeTypeStr) ||
709             "text/xml".equalsIgnoreCase(mimeTypeStr) ||
710             "text/sgml".equalsIgnoreCase(mimeTypeStr)
711            )
712        ){
713           /* don't add the root tag */
714     }else{
715       // Add the root start element
716 //      xmlDoc.append("<GatePreserveFormat"+
717 //                    " xmlns:gate=\"http://www.gate.ac.uk\"" +
718 //                    " gate:annotMaxId=\"" +
719 //                    getNextAnnotationId() +
720 //                    "\">");
721 //      addGatePreserveFormatTag = true;
722     }// End if
723 
724     xmlDoc.append(saveAnnotationSetAsXml(dumpingSet, includeFeatures));
725 
726 //    if (addGatePreserveFormatTag){
727 //      // Add the root end element
728 //      xmlDoc.append("</GatePreserveFormat>");
729 //    }// End if
730     if(sListener != null) sListener.statusChanged("Done.");
731     return xmlDoc.toString();
732   }//End toXml()
733 
734   /** This method verifies if aSourceAnnotation can ve inserted safety into the
735     * aTargetAnnotSet. Safety means that it doesn't violate the crossed over
736     * contition with any annotation from the aTargetAnnotSet.
737     * @param aTargetAnnotSet the annotation set to include the aSourceAnnotation
738     * @param aSourceAnnotation the annotation to be inserted into the
739     * aTargetAnnotSet
740     * @return true if the annotation inserts safety, or false otherwise.
741     */
742   private boolean insertsSafety(AnnotationSet aTargetAnnotSet,
743                                                 Annotation aSourceAnnotation){
744 
745     if (aTargetAnnotSet == null || aSourceAnnotation == null) {
746       this.crossedOverAnnotation = null;
747       return false;
748     }
749     if (aSourceAnnotation.getStartNode() == null ||
750         aSourceAnnotation.getStartNode().getOffset()== null) {
751       this.crossedOverAnnotation = null;
752       return false;
753     }
754     if (aSourceAnnotation.getEndNode() == null ||
755         aSourceAnnotation.getEndNode().getOffset()== null) {
756       this.crossedOverAnnotation = null;
757       return false;
758     }
759 
760     // Get the start and end offsets
761     Long start = aSourceAnnotation.getStartNode().getOffset();
762     Long end =   aSourceAnnotation.getEndNode().getOffset();
763     // Read aSourceAnnotation offsets long
764     long s2 = start.longValue();
765     long e2 = end.longValue();
766 
767     // Obtain a set with all annotations annotations that overlap
768     // totaly or partially with the interval defined by the two provided offsets
769     AnnotationSet as = aTargetAnnotSet.get(start,end);
770 
771     // Investigate all the annotations from as to see if there is one that
772     // comes in conflict with aSourceAnnotation
773     Iterator it = as.iterator();
774     while(it.hasNext()){
775       Annotation ann = (Annotation) it.next();
776       // Read ann offsets
777       long s1 = ann.getStartNode().getOffset().longValue();
778       long e1 = ann.getEndNode().getOffset().longValue();
779 
780       if (s1<s2 && s2<e1 && e1<e2) {
781         this.crossedOverAnnotation = ann;
782         return false;
783       }
784       if (s2<s1 && s1<e2 && e2<e1) {
785         this.crossedOverAnnotation = ann;
786         return false;
787       }
788     }// End while
789     return true;
790   }// insertsSafety()
791 
792   /** This method saves all the annotations from aDumpAnnotSet and combines
793     * them with the document content.
794     * @param aDumpAnnotationSet is a GATE annotation set prepared to be used
795     * on the raw text from document content. If aDumpAnnotSet is <b>null<b>
796     * then an empty string will be returned.
797     * @param includeFeatures is a boolean, which controls whether the annotation
798     * features and gate ID are included or not.
799     * @return The XML document obtained from raw text + the information from
800     * the dump annotation set.
801     */
802   private String saveAnnotationSetAsXml(AnnotationSet aDumpAnnotSet,
803                                         boolean includeFeatures){
804     String content = null;
805     if (this.getContent()== null)
806       content = new String("");
807     else
808       content = this.getContent().toString();
809     StringBuffer docContStrBuff = filterNonXmlChars(new StringBuffer(content));
810     if (aDumpAnnotSet == null)   return docContStrBuff.toString();
811 
812     TreeMap offsets2CharsMap = new TreeMap();
813     if (this.getContent().size().longValue() != 0){
814       // Fill the offsets2CharsMap with all the indices where
815       // special chars appear
816       buildEntityMapFromString(content,offsets2CharsMap);
817     }//End if
818     // The saving alghorithm is as follows:
819     ///////////////////////////////////////////
820     // Construct a set of annot with all IDs in asc order.
821     // All annotations that end at that offset swap their place in descending
822     // order. For each node write all the tags from left to right.
823 
824     // Construct the node set
825     TreeSet offsets = new TreeSet();
826     Iterator iter = aDumpAnnotSet.iterator();
827     while (iter.hasNext()){
828       Annotation annot = (Annotation) iter.next();
829       offsets.add(annot.getStartNode().getOffset());
830       offsets.add(annot.getEndNode().getOffset());
831     }// End while
832     isRootTag = false;
833     // ofsets is sorted in ascending order.
834     // Iterate this set in descending order and remove an offset at each
835     // iteration
836     while (!offsets.isEmpty()){
837       Long offset = (Long)offsets.last();
838       // Remove the offset from the set
839       offsets.remove(offset);
840       // Now, use it.
841       // Returns a list with annotations that needs to be serialized in that
842       // offset.
843       List annotations = getAnnotationsForOffset(aDumpAnnotSet,offset);
844       // Attention: the annotation are serialized from left to right
845       StringBuffer tmpBuff = new StringBuffer("");
846       Stack stack = new Stack();
847       // Iterate through all these annotations and serialize them
848       Iterator it = annotations.iterator();
849       while(it.hasNext()){
850         Annotation a = (Annotation) it.next();
851         it.remove();
852         // Test if a Ends at offset
853         if ( offset.equals(a.getEndNode().getOffset()) ){
854           // Test if a Starts at offset
855           if ( offset.equals(a.getStartNode().getOffset()) ){
856             // Here, the annotation a Starts and Ends at the offset
857             if ( null != a.getFeatures().get("isEmptyAndSpan") &&
858                  "true".equals((String)a.getFeatures().get("isEmptyAndSpan"))){
859 
860               // Assert: annotation a with start == end and isEmptyAndSpan
861               if (offsets.isEmpty() && "".equals(tmpBuff.toString())){
862                 // a is the doc's root tag to be written
863                 // The annotations are serialized from left to right.
864                 // The first annot in the last offset is the ROOT one
865                 isRootTag = true;
866               }// End if
867               tmpBuff.append(writeStartTag(a, includeFeatures));
868               stack.push(a);
869             }else{
870               // Assert annotation a with start == end and an empty tag
871               tmpBuff.append(writeEmptyTag(a));
872               // The annotation is removed from dumped set
873               aDumpAnnotSet.remove(a);
874             }// End if
875           }else{
876             // Here the annotation a Ends at the offset.
877             // In this case empty the stack and write the end tag
878             if (!stack.isEmpty()){
879               while(!stack.isEmpty()){
880                 Annotation a1 = (Annotation)stack.pop();
881                 tmpBuff.append(writeEndTag(a1));
882               }// End while
883             }// End if
884             tmpBuff.append(writeEndTag(a));
885           }// End if
886         }else{
887           // The annotation a does NOT end at the offset. Let's see if it starts
888           // at the offset
889           if ( offset.equals(a.getStartNode().getOffset()) ){
890             // The annotation a starts at the offset.
891             // In this case empty the stack and write the end tag
892             if (!stack.isEmpty()){
893               while(!stack.isEmpty()){
894                 Annotation a1 = (Annotation)stack.pop();
895                 tmpBuff.append(writeEndTag(a1));
896               }// End while
897             }// End if
898             if (offsets.isEmpty() && "".equals(tmpBuff.toString())){
899               // a is the last tag to be written
900               // The annotations are serialized from left to right.
901               // The first annot in the last offset is the ROOT one.
902               isRootTag = true;
903             }// End if
904             tmpBuff.append(writeStartTag(a, includeFeatures));
905             // The annotation is removed from dumped set
906             aDumpAnnotSet.remove(a);
907           }// End if ( offset.equals(a.getStartNode().getOffset()) )
908         }// End if ( offset.equals(a.getEndNode().getOffset()) )
909       }// End while(it.hasNext()){
910 
911       // In this case empty the stack and write the end tag
912       if (!stack.isEmpty()){
913         while(!stack.isEmpty()){
914           Annotation a1 = (Annotation)stack.pop();
915           tmpBuff.append(writeEndTag(a1));
916         }// End while
917       }// End if
918 
919       // Before inserting tmpBuff into docContStrBuff we need to check
920       // if there are chars to be replaced and if there are, they would be
921       // replaced.
922       if (!offsets2CharsMap.isEmpty()){
923         Integer offsChar = (Integer) offsets2CharsMap.lastKey();
924         while( !offsets2CharsMap.isEmpty() &&
925                        offsChar.intValue() >= offset.intValue()){
926           // Replace the char at offsChar with its corresponding entity form
927           // the entitiesMap.
928           docContStrBuff.replace(offsChar.intValue(),offsChar.intValue()+1,
929           (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar)));
930           // Discard the offsChar after it was used.
931           offsets2CharsMap.remove(offsChar);
932           // Investigate next offsChar
933           if (!offsets2CharsMap.isEmpty())
934             offsChar = (Integer) offsets2CharsMap.lastKey();
935         }// End while
936       }// End if
937       // Insert tmpBuff to the location where it belongs in docContStrBuff
938       docContStrBuff.insert(offset.intValue(),tmpBuff.toString());
939     }// End while(!offsets.isEmpty())
940     // Need to replace the entities in the remaining text, if there is any text
941     // So, if there are any more items in offsets2CharsMap they need to be
942     // replaced
943     while (!offsets2CharsMap.isEmpty()){
944       Integer offsChar = (Integer) offsets2CharsMap.lastKey();
945       // Replace the char with its entity
946       docContStrBuff.replace(offsChar.intValue(),offsChar.intValue()+1,
947       (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar)));
948       // remove the offset from the map
949       offsets2CharsMap.remove(offsChar);
950     }// End while
951     return docContStrBuff.toString();
952   }// saveAnnotationSetAsXml()
953 
954   /**
955    *  Return true only if the document has features for original content and
956    *  repositioning information.
957    */
958   private boolean hasOriginalContentFeatures() {
959     FeatureMap features = getFeatures();
960     boolean result = false;
961 
962     result =
963     (features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME) != null)
964       &&
965     (features.get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME)
966       != null);
967 
968     return result;
969   } // hasOriginalContentFeatures
970 
971   /** This method saves all the annotations from aDumpAnnotSet and combines
972     * them with the original document content, if preserved as feature.
973     * @param aDumpAnnotationSet is a GATE annotation set prepared to be used
974     * on the raw text from document content. If aDumpAnnotSet is <b>null<b>
975     * then an empty string will be returned.
976     * @param includeFeatures is a boolean, which controls whether the annotation
977     * features and gate ID are included or not.
978     * @return The XML document obtained from raw text + the information from
979     * the dump annotation set.
980     */
981   private String saveAnnotationSetAsXmlInOrig(Set aSourceAnnotationSet,
982                                         boolean includeFeatures){
983     StringBuffer docContStrBuff;
984 
985     String origContent;
986 
987     origContent =
988      (String)features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME);
989     if(origContent == null) {
990       origContent = "";
991     } // if
992 
993     long originalContentSize = origContent.length();
994 
995     RepositioningInfo repositioning = (RepositioningInfo)
996       getFeatures().get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME);
997 
998     docContStrBuff = new StringBuffer(origContent);
999     if (aSourceAnnotationSet == null) return docContStrBuff.toString();
1000
1001    StatusListener sListener = (StatusListener)
1002                               gate.gui.MainFrame.getListeners().
1003                               get("gate.event.StatusListener");
1004
1005    AnnotationSet originalMarkupsAnnotSet =
1006            this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
1007    // Create a dumping annotation set on the document. It will be used for
1008    // dumping annotations...
1009    AnnotationSet dumpingSet = new AnnotationSetImpl((Document) this);
1010    if(sListener != null)
1011      sListener.statusChanged("Constructing the dumping annotation set.");
1012    // Then take all the annotations from aSourceAnnotationSet and verify if
1013    // they can be inserted safely into the dumpingSet. Where not possible,
1014    // report.
1015    if (aSourceAnnotationSet != null){
1016      Iterator iter = aSourceAnnotationSet.iterator();
1017      Annotation currentAnnot;
1018      while (iter.hasNext()){
1019        currentAnnot = (Annotation) iter.next();
1020        if(insertsSafety(originalMarkupsAnnotSet, currentAnnot)
1021            && insertsSafety(dumpingSet, currentAnnot)){
1022          dumpingSet.add(currentAnnot);
1023        }else{
1024          Out.prln("Warning: Annotation with ID=" + currentAnnot.getId() +
1025          ", startOffset=" + currentAnnot.getStartNode().getOffset() +
1026          ", endOffset=" + currentAnnot.getEndNode().getOffset() +
1027          ", type=" + currentAnnot.getType()+ " was found to violate the" +
1028          " crossed over condition. It will be discarded");
1029        }// End if
1030      }// End while
1031    }// End if
1032
1033    // The dumpingSet is ready to be exported as XML
1034    // Here we go.
1035    if(sListener != null) sListener.statusChanged("Dumping annotations as XML");
1036
1037    ///////////////////////////////////////////
1038    // Construct a set of annot with all IDs in asc order.
1039    // All annotations that end at that offset swap their place in descending
1040    // order. For each node write all the tags from left to right.
1041
1042    // Construct the node set
1043    TreeSet offsets = new TreeSet();
1044    Iterator iter = aSourceAnnotationSet.iterator();
1045    while (iter.hasNext()){
1046      Annotation annot = (Annotation) iter.next();
1047      offsets.add(annot.getStartNode().getOffset());
1048      offsets.add(annot.getEndNode().getOffset());
1049    }// End while
1050    isRootTag = false;
1051
1052    // ofsets is sorted in ascending order.
1053    // Iterate this set in descending order and remove an offset at each
1054    // iteration
1055    while (!offsets.isEmpty()){
1056      Long offset = (Long)offsets.last();
1057      // Remove the offset from the set
1058      offsets.remove(offset);
1059      // Now, use it.
1060      // Returns a list with annotations that needs to be serialized in that
1061      // offset.
1062      List annotations = getAnnotationsForOffset(aSourceAnnotationSet,offset);
1063      // Attention: the annotation are serialized from left to right
1064      StringBuffer tmpBuff = new StringBuffer("");
1065      Stack stack = new Stack();
1066      // Iterate through all these annotations and serialize them
1067      Iterator it = annotations.iterator();
1068      Annotation a = null;
1069      while(it.hasNext()) {
1070        a = (Annotation) it.next();
1071        it.remove();
1072        // Test if a Ends at offset
1073        if ( offset.equals(a.getEndNode().getOffset()) ){
1074          // Test if a Starts at offset
1075          if ( offset.equals(a.getStartNode().getOffset()) ){
1076            // Here, the annotation a Starts and Ends at the offset
1077            if ( null != a.getFeatures().get("isEmptyAndSpan") &&
1078                 "true".equals((String)a.getFeatures().get("isEmptyAndSpan"))){
1079
1080              // Assert: annotation a with start == end and isEmptyAndSpan
1081              tmpBuff.append(writeStartTag(a, includeFeatures, false));
1082              stack.push(a);
1083            }else{
1084              // Assert annotation a with start == end and an empty tag
1085              tmpBuff.append(writeEmptyTag(a, false));
1086              // The annotation is removed from dumped set
1087              aSourceAnnotationSet.remove(a);
1088            }// End if
1089          }else{
1090            // Here the annotation a Ends at the offset.
1091            // In this case empty the stack and write the end tag
1092            while(!stack.isEmpty()){
1093              Annotation a1 = (Annotation)stack.pop();
1094              tmpBuff.append(writeEndTag(a1));
1095            }// End while
1096            tmpBuff.append(writeEndTag(a));
1097          }// End if
1098        }else{
1099          // The annotation a does NOT end at the offset. Let's see if it starts
1100          // at the offset
1101          if ( offset.equals(a.getStartNode().getOffset()) ){
1102            // The annotation a starts at the offset.
1103            // In this case empty the stack and write the end tag
1104            while(!stack.isEmpty()){
1105              Annotation a1 = (Annotation)stack.pop();
1106              tmpBuff.append(writeEndTag(a1));
1107            }// End while
1108
1109            tmpBuff.append(writeStartTag(a, includeFeatures, false));
1110            // The annotation is removed from dumped set
1111            aSourceAnnotationSet.remove(a);
1112          }// End if ( offset.equals(a.getStartNode().getOffset()) )
1113        }// End if ( offset.equals(a.getEndNode().getOffset()) )
1114      }// End while(it.hasNext()){
1115
1116      // In this case empty the stack and write the end tag
1117      while(!stack.isEmpty()){
1118        Annotation a1 = (Annotation)stack.pop();
1119        tmpBuff.append(writeEndTag(a1));
1120      }// End while
1121
1122      long originalPosition = -1;
1123      boolean backPositioning =
1124        a != null && offset.equals(a.getEndNode().getOffset());
1125      if ( backPositioning ) {
1126        // end of the annotation correction
1127        originalPosition =
1128          repositioning.getOriginalPos(offset.intValue(), true);
1129      } // if
1130
1131      if(originalPosition == -1) {
1132        originalPosition = repositioning.getOriginalPos(offset.intValue());
1133      } // if
1134
1135      // Insert tmpBuff to the location where it belongs in docContStrBuff
1136      if(originalPosition != -1 && originalPosition <= originalContentSize ) {
1137        docContStrBuff.insert((int) originalPosition, tmpBuff.toString());
1138      }
1139      else {
1140        Out.prln("Error in the repositioning. The offset ("+offset.intValue()
1141        +") could not be positioned in the original document. \n"
1142        +"Calculated position is: "+originalPosition
1143        +" placed back: "+backPositioning);
1144      } // if
1145
1146    }// End while(!offsets.isEmpty())
1147
1148    return docContStrBuff.toString();
1149  } // saveAnnotationSetAsXml()
1150
1151  /** This method returns a list with annotations ordered that way that
1152    * they can be serialized from left to right, at the offset. If one of the
1153    * params is null then an empty list will be returned.
1154    * @param aDumpAnnotSet is a set containing all annotations that will be
1155    * dumped.
1156    * @param offset represent the offset at witch the annotation must start
1157    * AND/OR end.
1158    * @return a list with those annotations that need to be serialized.
1159    */
1160  private List getAnnotationsForOffset(Set aDumpAnnotSet, Long offset){
1161    List annotationList = new LinkedList();
1162    if (aDumpAnnotSet == null || offset == null) return annotationList;
1163    Set annotThatStartAtOffset = new TreeSet(
1164                          new AnnotationComparator(ORDER_ON_END_OFFSET,DESC));
1165    Set annotThatEndAtOffset = new TreeSet(
1166                          new AnnotationComparator(ORDER_ON_START_OFFSET,DESC));
1167    Set annotThatStartAndEndAtOffset = new TreeSet(
1168                          new AnnotationComparator(ORDER_ON_ANNOT_ID,ASC));
1169
1170    // Fill these tree lists with annotation tat start, end or start and
1171    // end at the offset.
1172    Iterator iter = aDumpAnnotSet.iterator();
1173    while(iter.hasNext()){
1174      Annotation ann = (Annotation) iter.next();
1175      if (offset.equals(ann.getStartNode().getOffset())){
1176        if (offset.equals(ann.getEndNode().getOffset()))
1177          annotThatStartAndEndAtOffset.add(ann);
1178        else
1179          annotThatStartAtOffset.add(ann);
1180      }else{
1181        if (offset.equals(ann.getEndNode().getOffset()))
1182          annotThatEndAtOffset.add(ann);
1183      }// End if
1184    }// End while
1185    annotationList.addAll(annotThatEndAtOffset);
1186    annotThatEndAtOffset = null;
1187    annotationList.addAll(annotThatStartAtOffset);
1188    annotThatStartAtOffset = null;
1189    iter = annotThatStartAndEndAtOffset.iterator();
1190    while(iter.hasNext()){
1191      Annotation ann = (Annotation) iter.next();
1192      Iterator it = annotationList.iterator();
1193      boolean breaked = false;
1194      while (it.hasNext()){
1195        Annotation annFromList = (Annotation) it.next();
1196        if (annFromList.getId().intValue() > ann.getId().intValue()){
1197          annotationList.add(annotationList.indexOf(annFromList),ann);
1198          breaked = true;
1199          break;
1200        }// End if
1201      }// End while
1202      if (!breaked)
1203        annotationList.add(ann);
1204      iter.remove();
1205    }// End while
1206    return annotationList;
1207  }// getAnnotationsForOffset()
1208
1209  private String writeStartTag(Annotation annot, boolean includeFeatures){
1210    return writeStartTag(annot, includeFeatures, true);
1211  } // writeStartTag
1212
1213  /** Returns a string representing a start tag based on the input annot*/
1214  private String writeStartTag(Annotation annot, boolean includeFeatures,
1215                                boolean includeNamespace){
1216    AnnotationSet originalMarkupsAnnotSet =
1217            this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
1218
1219    StringBuffer strBuff = new StringBuffer("");
1220    if (annot == null) return strBuff.toString();
1221//    if (!addGatePreserveFormatTag && isRootTag){
1222      if (isRootTag){
1223      //the features are included either if desired or if that's an annotation
1224      //from the original markup of the document. We don't want for example to
1225      //spoil all links in an HTML file!
1226      if (includeFeatures) {
1227        strBuff.append("<");
1228        strBuff.append(annot.getType());
1229        strBuff.append(" ");
1230        if(includeNamespace) {
1231          strBuff.append(" xmlns:gate=\"http://www.gate.ac.uk\"");
1232          strBuff.append(" gate:");
1233        }
1234        strBuff.append("gateId=\"");
1235        strBuff.append(annot.getId());
1236        strBuff.append("\"");
1237        strBuff.append(" ");
1238        if(includeNamespace) {
1239          strBuff.append("gate:");
1240        }
1241        strBuff.append("annotMaxId=\"");
1242        strBuff.append(getNextAnnotationId());
1243        strBuff.append("\"");
1244        strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace));
1245        strBuff.append(">");
1246      }
1247      else if (originalMarkupsAnnotSet.contains(annot)) {
1248          strBuff.append("<");
1249          strBuff.append(annot.getType());
1250          strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace));
1251          strBuff.append(">");
1252        }
1253      else {
1254        strBuff.append("<");
1255        strBuff.append(annot.getType());
1256        strBuff.append(">");
1257      }
1258      // Once the root tag was writen then there will be no other Root tag
1259      isRootTag = false;
1260    }else{
1261      //the features are included either if desired or if that's an annotation
1262      //from the original markup of the document. We don't want for example to
1263      //spoil all links in an HTML file!
1264      if (includeFeatures) {
1265        strBuff.append("<");
1266        strBuff.append(annot.getType());
1267        strBuff.append(" ");
1268        if(includeNamespace) {
1269          strBuff.append("gate:");
1270        } // if includeNamespaces
1271        strBuff.append("gateId=\"");
1272        strBuff.append(annot.getId());
1273        strBuff.append("\"");
1274        strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace));
1275        strBuff.append(">");
1276      }
1277      else if (originalMarkupsAnnotSet.contains(annot)) {
1278        strBuff.append("<");
1279        strBuff.append(annot.getType());
1280        strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace));
1281        strBuff.append(">");
1282      }
1283      else {
1284        strBuff.append("<");
1285        strBuff.append(annot.getType());
1286        strBuff.append(">");
1287      }
1288    }// End if
1289    return strBuff.toString();
1290  }// writeStartTag()
1291
1292  /** This method takes aScanString and searches for those chars from
1293    * entitiesMap that appear in the string. A tree map(offset2Char) is filled
1294    * using as key the offsets where those Chars appear and the Char.
1295    * If one of the params is null the method simply returns.
1296    */
1297  private void buildEntityMapFromString(String aScanString, TreeMap aMapToFill){
1298    if (aScanString == null || aMapToFill == null) return;
1299    if (entitiesMap == null || entitiesMap.isEmpty()){
1300      Err.prln("WARNING: Entities map was not initialised !");
1301      return;
1302    }// End if
1303    // Fill the Map with the offsets of the special chars
1304    Iterator entitiesMapIterator = entitiesMap.keySet().iterator();
1305    while(entitiesMapIterator.hasNext()){
1306      Character c = (Character) entitiesMapIterator.next();
1307      int fromIndex = 0;
1308      while (-1 != fromIndex){
1309        fromIndex = aScanString.indexOf(c.charValue(),fromIndex);
1310        if (-1 != fromIndex){
1311          aMapToFill.put(new Integer(fromIndex),c);
1312          fromIndex ++;
1313        }// End if
1314      }// End while
1315    }// End while
1316  }//buildEntityMapFromString();
1317
1318  private String writeEmptyTag(Annotation annot){
1319    return writeEmptyTag(annot, true);
1320  } // writeEmptyTag
1321
1322  /** Returns a string representing an empty tag based on the input annot*/
1323  private String writeEmptyTag(Annotation annot, boolean includeNamespace){
1324    StringBuffer strBuff = new StringBuffer("");
1325    if (annot == null) return strBuff.toString();
1326
1327    strBuff.append("<");
1328    strBuff.append(annot.getType());
1329
1330    AnnotationSet originalMarkupsAnnotSet =
1331            this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
1332    if (! originalMarkupsAnnotSet.contains(annot)) {
1333      strBuff.append(" gateId=\"");
1334      strBuff.append(annot.getId());
1335      strBuff.append("\"");
1336    }
1337    strBuff.append(writeFeatures(annot.getFeatures(),includeNamespace));
1338    strBuff.append("/>");
1339
1340    return strBuff.toString();
1341  }// writeEmptyTag()
1342
1343  /** Returns a string representing an end tag based on the input annot*/
1344  private String writeEndTag(Annotation annot){
1345    StringBuffer strBuff = new StringBuffer("");
1346    if (annot == null) return strBuff.toString();
1347/*
1348    if (annot.getType().indexOf(" ") != -1)
1349      Out.prln("Warning: Truncating end tag to first word for annot type \""
1350      +annot.getType()+ "\". ");
1351*/
1352    strBuff.append("</"+annot.getType()+">");
1353    return strBuff.toString();
1354  }// writeEndTag()
1355
1356  /** Returns a string representing a FeatureMap serialized as XML attributes*/
1357  private String writeFeatures(FeatureMap feat, boolean includeNamespace){
1358    StringBuffer strBuff = new StringBuffer("");
1359    if (feat == null) return strBuff.toString();
1360    Iterator it = feat.keySet().iterator();
1361    while (it.hasNext()){
1362      Object key = it.next();
1363      Object value = feat.get(key);
1364      if ( (key != null) && (value != null) ){
1365        // Eliminate a feature inserted at reading time and which help to
1366        // take some decissions at saving time
1367        if ("isEmptyAndSpan".equals(key.toString()))
1368          continue;
1369        if( !(String.class.isAssignableFrom(key.getClass()) ||
1370              Number.class.isAssignableFrom(key.getClass()))){
1371
1372            Out.prln("Warning:Found a feature NAME("+key+") that doesn't came"+
1373                             " from String or Number.(feature discarded)");
1374            continue;
1375        }// End if
1376        if ( !(String.class.isAssignableFrom(value.getClass()) ||
1377               Number.class.isAssignableFrom(value.getClass()) ||
1378               java.util.Collection.class.isAssignableFrom(value.getClass()))){
1379
1380            Out.prln("Warning:Found a feature VALUE("+value+") that doesn't came"+
1381                       " from String, Number or Collection.(feature discarded)");
1382            continue;
1383        }// End if
1384        if ("matches".equals(key)) {
1385          strBuff.append(" ");
1386          if(includeNamespace) {
1387            strBuff.append("gate:");
1388          }
1389          strBuff.append(key);
1390          strBuff.append("=\"");
1391        }
1392        else {
1393          strBuff.append(" ");
1394          strBuff.append(key);
1395          strBuff.append("=\"");
1396        }
1397        if (java.util.Collection.class.isAssignableFrom(value.getClass())){
1398          Iterator valueIter = ((Collection)value).iterator();
1399          while(valueIter.hasNext()){
1400            Object item = valueIter.next();
1401            if (!(String.class.isAssignableFrom(item.getClass()) ||
1402                  Number.class.isAssignableFrom(item.getClass())))
1403                  continue;
1404            strBuff.append(item);
1405            strBuff.append(";");
1406          }// End while
1407          if (strBuff.charAt(strBuff.length()-1) == ';')
1408            strBuff.deleteCharAt(strBuff.length()-1);
1409        }else{
1410          strBuff.append(value);
1411        }// End if
1412        strBuff.append("\"");
1413      }// End if
1414    }// End while
1415    return strBuff.toString();
1416  }// writeFeatures()
1417
1418  /** Returns a GateXml document that is a custom XML format for wich there is
1419    * a reader inside GATE called gate.xml.GateFormatXmlHandler.
1420    * What it does is to serialize a GATE document in an XML format.
1421    * @return a string representing a Gate Xml document. If saved in a file,this
1422    * string must be written using the UTF-8 encoding because the first line
1423    * in the generated xml document is <?xml version="1.0" encoding="UTF-8" ?>
1424    */
1425  public String toXml(){
1426    // Initialize the xmlContent with 3 time the size of the current document.
1427    // This is because of the tags size. This measure is made to increase the
1428    // performance of StringBuffer.
1429    StringBuffer xmlContent = new StringBuffer(
1430         DOC_SIZE_MULTIPLICATION_FACTOR*(getContent().size().intValue()));
1431    // Add xml header
1432    xmlContent.append("<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n");
1433    // Add the root element
1434    xmlContent.append("<GateDocument>\n");
1435    xmlContent.append("<!-- The document's features-->\n\n");
1436    xmlContent.append("<GateDocumentFeatures>\n");
1437
1438    xmlContent.append(featuresToXml(this.getFeatures()));
1439    xmlContent.append("</GateDocumentFeatures>\n");
1440    xmlContent.append("<!-- The document content area with serialized"+
1441                      " nodes -->\n\n");
1442    // Add plain text element
1443    xmlContent.append("<TextWithNodes>");
1444    xmlContent.append(textWithNodes(this.getContent().toString()));
1445    xmlContent.append("</TextWithNodes>\n");
1446    // Serialize as XML all document's annotation sets
1447    // Serialize the default AnnotationSet
1448    StatusListener sListener = (StatusListener)
1449                               gate.gui.MainFrame.getListeners().
1450                               get("gate.event.StatusListener");
1451    if(sListener != null)
1452      sListener.statusChanged("Saving the default annotation set ");
1453    xmlContent.append("<!-- The default annotation set -->\n\n");
1454    xmlContent.append(annotationSetToXml(this.getAnnotations()));
1455    // Serialize all others AnnotationSets
1456    // namedAnnotSets is a Map containing all other named Annotation Sets.
1457    if (namedAnnotSets != null){
1458      Iterator iter = namedAnnotSets.values().iterator();
1459      while(iter.hasNext()){
1460        AnnotationSet annotSet = (AnnotationSet) iter.next();
1461        xmlContent.append("<!-- Named annotation set -->\n\n");
1462        // Serialize it as XML
1463        if(sListener != null) sListener.statusChanged("Saving " +
1464                                                      annotSet.getName()+
1465                                                      " annotation set ");
1466        xmlContent.append(annotationSetToXml(annotSet));
1467      }// End while
1468    }// End if
1469    // Add the end of GateDocument
1470    xmlContent.append("</GateDocument>");
1471    if(sListener != null) sListener.statusChanged("Done !");
1472    // return the XmlGateDocument
1473    return xmlContent.toString();
1474  }// toXml
1475
1476  /** This method filters any non XML char
1477    * see: http://www.w3c.org/TR/2000/REC-xml-20001006#charsets
1478    * All non XML chars will be replaced with 0x20 (space char) This assures
1479    * that the next time the document is loaded there won't be any problems.
1480    * @param aStrBuffer represents the input String that is filtred. If the
1481    * aStrBuffer is null then an empty string will be returend
1482    * @return the "purified" StringBuffer version of the aStrBuffer
1483    */
1484  private StringBuffer filterNonXmlChars(StringBuffer aStrBuffer){
1485    if (aStrBuffer == null) return new StringBuffer("");
1486    String space = new String(" ");
1487    for (int i=aStrBuffer.length()-1;i>=0; i--){
1488      if (!isXmlChar(aStrBuffer.charAt(i)))
1489        aStrBuffer.replace(i,i+1,space);
1490    }// End for
1491    return aStrBuffer;
1492  }// filterNonXmlChars()
1493
1494  /** This method decide if a char is a valid XML one or not
1495    * @param ch the char to be tested
1496    * @return true if is a valid XML char and fals if is not.
1497    */
1498  public static boolean isXmlChar(char ch){
1499    if (ch == 0x9 || ch == 0xA || ch ==0xD) return true;
1500    if ((0x20 <= ch) && (ch <= 0xD7FF)) return true;
1501    if ((0xE000 <= ch) && (ch <= 0xFFFD)) return true;
1502    if ((0x10000 <= ch) && (ch <= 0x10FFFF)) return true;
1503    return false;
1504  }// End isXmlChar()
1505
1506  /** This method saves a FeatureMap as XML elements.
1507    * @ param aFeatureMap the feature map that has to be saved as XML.
1508    * @ return a String like this: <Feature><Name>...</Name>
1509    * <Value>...</Value></Feature><Feature>...</Feature>
1510    */
1511  private String featuresToXml(FeatureMap aFeatureMap){
1512    StringBuffer str = new StringBuffer("");
1513
1514    if (aFeatureMap == null) return str.toString();
1515
1516    Set keySet = aFeatureMap.keySet();
1517    Iterator keyIterator = keySet.iterator();
1518    while(keyIterator.hasNext()){
1519      Object key = keyIterator.next();
1520      Object value = aFeatureMap.get(key);
1521      if ((key != null) && (value != null)){
1522        String keyClassName = null;
1523        String keyItemClassName = null;
1524        String valueClassName = null;
1525        String valueItemClassName = null;
1526        String key2String = key.toString();
1527        String value2String = value.toString();
1528
1529        Object item = null;
1530        // Test key if it is String, Number or Collection
1531        if (key instanceof java.lang.String ||
1532            key instanceof java.lang.Number ||
1533            key instanceof java.util.Collection)
1534          keyClassName = key.getClass().getName();
1535
1536        // Test value if it is String, Number or Collection
1537        if (value instanceof java.lang.String ||
1538            value instanceof java.lang.Number ||
1539            value instanceof java.util.Collection)
1540          valueClassName = value.getClass().getName();
1541
1542        // Features and values that are not Strings, Numbers or collections
1543        // will be discarded.
1544        if (keyClassName == null || valueClassName == null) continue;
1545
1546        // If key is collection serialize the colection in a specific format
1547        if (key instanceof java.util.Collection){
1548          StringBuffer keyStrBuff = new StringBuffer("");
1549          Iterator iter = ((Collection) key).iterator();
1550          if (iter.hasNext()){
1551            item = iter.next();
1552            if (item instanceof java.lang.Number)
1553              keyItemClassName = item.getClass().getName();
1554            else
1555              keyItemClassName = String.class.getName();
1556            keyStrBuff.append(item.toString());
1557          }// End if
1558          while (iter.hasNext()){
1559            item = iter.next();
1560            keyStrBuff.append(";" + item.toString());
1561          }// End while
1562          key2String = keyStrBuff.toString();
1563        }// End if
1564        // If key is collection serialize the colection in a specific format
1565        if (value instanceof java.util.Collection){
1566          StringBuffer valueStrBuff = new StringBuffer("");
1567          Iterator iter = ((Collection) value).iterator();
1568          if (iter.hasNext()){
1569            item = iter.next();
1570            if (item instanceof java.lang.Number)
1571              valueItemClassName = item.getClass().getName();
1572            else
1573              valueItemClassName = String.class.getName();
1574            valueStrBuff.append(item.toString());
1575          }// End if
1576          while (iter.hasNext()){
1577            item = iter.next();
1578            valueStrBuff.append(";" + item.toString());
1579          }// End while
1580          value2String = valueStrBuff.toString();
1581        }// End if
1582        str.append("<Feature>\n  <Name");
1583        if (keyClassName != null)
1584          str.append(" className=\""+keyClassName+"\"");
1585        if (keyItemClassName != null)
1586          str.append(" itemClassName=\""+keyItemClassName+"\"");
1587        str.append(">");
1588        str.append(filterNonXmlChars(replaceCharsWithEntities(key2String)));
1589        str.append("</Name>\n  <Value");
1590        if (valueClassName != null)
1591          str.append(" className=\"" + valueClassName + "\"");
1592        if (valueItemClassName != null)
1593          str.append(" itemClassName=\"" + valueItemClassName + "\"");
1594        str.append(">");
1595        str.append(filterNonXmlChars(replaceCharsWithEntities(value2String)));
1596        str.append("</Value>\n</Feature>\n");
1597      }// End if
1598    }// end While
1599    return str.toString();
1600  }//featuresToXml
1601
1602  /** This method replace all chars that appears in the anInputString and also
1603    * that are in the entitiesMap with their corresponding entity
1604    * @param anInputString the string analyzed. If it is null then returns the
1605    *  empty string
1606    * @return a string representing the input string with chars replaced with
1607    *  entities
1608    */
1609  private StringBuffer replaceCharsWithEntities(String anInputString){
1610    if (anInputString == null) return new StringBuffer("");
1611    StringBuffer strBuff = new StringBuffer(anInputString);
1612    for (int i=strBuff.length()-1; i>=0; i--){
1613      Character ch = new Character(strBuff.charAt(i));
1614      if (entitiesMap.keySet().contains(ch)){
1615        strBuff.replace(i,i+1,(String) entitiesMap.get(ch));
1616      }// End if
1617    }// End for
1618    return strBuff;
1619  }//replaceCharsWithEntities()
1620
1621  /** This method creates Node XML elements and inserts them at the
1622    * corresponding offset inside the text. Nodes are created from the default
1623    * annotation set, as well as from all existing named annotation sets.
1624    * @param aText The text representing the document's plain text.
1625    * @return The text with empty <Node id="NodeId"/> elements.
1626    */
1627  private String textWithNodes(String aText){
1628    if (aText == null) return new String("");
1629    StringBuffer textWithNodes = filterNonXmlChars(new StringBuffer(aText));
1630
1631    // Construct a map from offsets to Chars
1632    TreeMap offsets2CharsMap = new TreeMap();
1633    if (aText.length()!= 0){
1634      // Fill the offsets2CharsMap with all the indices where special chars appear
1635      buildEntityMapFromString(aText,offsets2CharsMap);
1636    }//End if
1637    // Construct the offsetsSet for all nodes belonging to this document
1638    TreeSet offsetsSet = new TreeSet();
1639    Iterator annotSetIter = this.getAnnotations().iterator();
1640    while (annotSetIter.hasNext()){
1641      Annotation annot = (Annotation) annotSetIter.next();
1642      offsetsSet.add(annot.getStartNode().getOffset());
1643      offsetsSet.add(annot.getEndNode().getOffset());
1644    }// end While
1645    // Get the nodes from all other named annotation sets.
1646    if (namedAnnotSets != null){
1647      Iterator iter = namedAnnotSets.values().iterator();
1648      while(iter.hasNext()){
1649        AnnotationSet annotSet = (AnnotationSet) iter.next();
1650        Iterator iter2 = annotSet.iterator();
1651        while(iter2.hasNext()){
1652          Annotation annotTmp = (Annotation) iter2.next();
1653          offsetsSet.add(annotTmp.getStartNode().getOffset());
1654          offsetsSet.add(annotTmp.getEndNode().getOffset());
1655        }// End while
1656      }// End while
1657    }// End if
1658    // offsetsSet is ordered in ascending order because the structure
1659    // is a TreeSet
1660
1661    if (offsetsSet.isEmpty()){
1662      return replaceCharsWithEntities(aText).toString();
1663    }// End if
1664    // Iterate through all nodes from anAnnotSet and transform them to
1665    // XML elements. Then insert those elements at the node's offset into the
1666    // textWithNodes .
1667    while (!offsetsSet.isEmpty()){
1668      Long offset = (Long) offsetsSet.last();
1669      // Eliminate the offset from the list in order to create more memory space
1670      offsetsSet.remove(offset);
1671      // Use offset
1672      int offsetValue = offset.intValue();
1673      String strNode = "<Node id=\"" + offsetValue + "\"/>";
1674      // Before inserting this string into the textWithNodes, check to see if
1675      // there are any chars to be replaced with their corresponding entities
1676      if (!offsets2CharsMap.isEmpty()){
1677        Integer offsChar = (Integer) offsets2CharsMap.lastKey();
1678        while( !offsets2CharsMap.isEmpty() &&
1679                       offsChar.intValue() >= offset.intValue()){
1680          // Replace the char at offsChar with its corresponding entity form
1681          // the entitiesMap.
1682          textWithNodes.replace(offsChar.intValue(),offsChar.intValue()+1,
1683          (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar)));
1684          // Discard the offsChar after it was used because this offset will
1685          // never appear again
1686          offsets2CharsMap.remove(offsChar);
1687          // Investigate next offsChar
1688          if (!offsets2CharsMap.isEmpty())
1689            offsChar = (Integer) offsets2CharsMap.lastKey();
1690        }// End while
1691      }// End if
1692      // Now it is safe to insert the node
1693      textWithNodes.insert(offsetValue,strNode);
1694    }// end while
1695    // Need to replace the entities in the remaining text, if there is any text
1696    // So, if there are any more items in offsets2CharsMap they need to be
1697    // replaced
1698    while (!offsets2CharsMap.isEmpty()){
1699      Integer offsChar = (Integer) offsets2CharsMap.lastKey();
1700      // Replace the char with its entity
1701      textWithNodes.replace(offsChar.intValue(),offsChar.intValue()+1,
1702      (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar)));
1703      // remove the offset from the map
1704      offsets2CharsMap.remove(offsChar);
1705    }// End while
1706    return textWithNodes.toString();
1707  }//textWithNodes()
1708
1709  /** This method saves an AnnotationSet as XML.
1710    * @param anAnnotationSet The annotation set that has to be saved as XML.
1711    * @return a String like this: <AnnotationSet> <Annotation>....
1712    * </AnnotationSet>
1713    */
1714  private String annotationSetToXml(AnnotationSet anAnnotationSet){
1715    StringBuffer str = new StringBuffer("");
1716
1717    if (anAnnotationSet == null){
1718      str.append("<AnnotationSet>\n");
1719      str.append("</AnnotationSet>\n");
1720      return str.toString();
1721    }// End if
1722    if (anAnnotationSet.getName() == null)
1723      str.append("<AnnotationSet>\n");
1724    else str.append("<AnnotationSet Name=\"" + anAnnotationSet.getName()+
1725                                                                    "\" >\n");
1726    // Iterate through AnnotationSet and save each Annotation as XML
1727    Iterator iterator = anAnnotationSet.iterator();
1728    while (iterator.hasNext()){
1729      Annotation annot = (Annotation) iterator.next();
1730      str.append("<Annotation " + "Type=\"" + annot.getType() +
1731                  "\" StartNode=\"" + annot.getStartNode().getOffset() +
1732                   "\" EndNode=\"" + annot.getEndNode().getOffset() + "\">\n");
1733      str.append(featuresToXml(annot.getFeatures()));
1734      str.append("</Annotation>\n");
1735    }// End while
1736
1737    str.append("</AnnotationSet>\n");
1738    return str.toString();
1739  }// annotationSetToXml
1740
1741  /** Returns a map with the named annotation sets. It returns <code>null</code>
1742   *  if no named annotaton set exists. */
1743  public Map getNamedAnnotationSets() {
1744    return namedAnnotSets;
1745  } // getNamedAnnotationSets
1746
1747  /**
1748   * Removes one of the named annotation sets.
1749   * Note that the default annotation set cannot be removed.
1750   * @param name the name of the annotation set to be removed
1751   */
1752  public void removeAnnotationSet(String name){
1753    Object removed = namedAnnotSets.remove(name);
1754    if(removed != null){
1755      fireAnnotationSetRemoved(
1756        new DocumentEvent(this, DocumentEvent.ANNOTATION_SET_REMOVED, name));
1757    }
1758  }
1759
1760  /** Propagate edit changes to the document content and annotations. */
1761  public void edit(Long start, Long end, DocumentContent replacement)
1762    throws InvalidOffsetException
1763  {
1764    if(! isValidOffsetRange(start, end))
1765      throw new InvalidOffsetException();
1766
1767    if(content != null)
1768      ((DocumentContentImpl) content).edit(start, end, replacement);
1769
1770    if(defaultAnnots != null)
1771      ((AnnotationSetImpl) defaultAnnots).edit(start, end, replacement);
1772
1773    if(namedAnnotSets != null) {
1774      Iterator iter = namedAnnotSets.values().iterator();
1775      while(iter.hasNext())
1776        ((AnnotationSetImpl) iter.next()).edit(start, end, replacement);
1777    }
1778
1779  } // edit(start,end,replacement)
1780
1781  /** Check that an offset is valid, i.e. it is non-null, greater than
1782    * or equal to 0 and less than the size of the document content.
1783    */
1784  public boolean isValidOffset(Long offset) {
1785    if(offset == null)
1786      return false;
1787
1788    long o = offset.longValue();
1789    if(o > getContent().size().longValue() || o < 0)
1790      return false;
1791
1792    return true;
1793  } // isValidOffset
1794
1795  /** Check that both start and end are valid offsets and that
1796    * they constitute a valid offset range, i.e. start is greater
1797    * than or equal to long.
1798    */
1799  public boolean isValidOffsetRange(Long start, Long end) {
1800    return
1801      isValidOffset(start) && isValidOffset(end) &&
1802      start.longValue() <= end.longValue();
1803  } // isValidOffsetRange(start,end)
1804
1805  /** Sets the nextAnnotationId */
1806  public void setNextAnnotationId(int aNextAnnotationId){
1807    nextAnnotationId = aNextAnnotationId;
1808  }// setNextAnnotationId();
1809
1810  /** Generate and return the next annotation ID */
1811  public Integer getNextAnnotationId() {
1812    return new Integer(nextAnnotationId++);
1813  } // getNextAnnotationId
1814
1815  /** Generate and return the next node ID */
1816  public Integer getNextNodeId() { return new Integer(nextNodeId++); }
1817
1818  /** Ordering based on URL.toString() and the URL offsets (if any) */
1819  public int compareTo(Object o) throws ClassCastException {
1820    DocumentImpl other = (DocumentImpl) o;
1821    return getOrderingString().compareTo(other.getOrderingString());
1822  } // compareTo
1823
1824  /** Utility method to produce a string for comparison in ordering.
1825    * String is based on the source URL and offsets.
1826    */
1827  protected String getOrderingString() {
1828    if(sourceUrl == null) return toString();
1829
1830    StringBuffer orderingString = new StringBuffer(sourceUrl.toString());
1831    if(sourceUrlStartOffset != null && sourceUrlEndOffset != null) {
1832      orderingString.append(sourceUrlStartOffset.toString());
1833      orderingString.append(sourceUrlEndOffset.toString());
1834    }
1835
1836    return orderingString.toString();
1837  } // getOrderingString()
1838
  /** The id of the next new annotation. Handed out and incremented by
    * getNextAnnotationId(); overwritten by setNextAnnotationId().
    */
  protected int nextAnnotationId = 0;

  /** The id of the next new node. Handed out and incremented by
    * getNextNodeId().
    */
  protected int nextNodeId = 0;

  /** The source URL. May be null, in which case getOrderingString() falls
    * back to toString().
    */
  protected URL sourceUrl;

  // NOTE(review): a Javadoc comment reading "The document's URL name." stood
  // here with no declaration following it; the field it described is not
  // visible in this part of the file.

  /** The content of the document */
  protected DocumentContent content;

  /** The encoding of the source of the document content; defaults to
    * "UTF-8".
    */
  protected String encoding = "UTF-8";
1854
  // Data needed in the toXml(AnnotationSet) methods

  /** This field indicates whether or not to add the tag
    * called GatePreserveFormat to the document. HTML, XML, SGML docs won't
    * have this tag added.
    * (The field itself is currently commented out.)
    */
//  private boolean addGatePreserveFormatTag = false;

  /** This field indicates if an annotation is the doc's root tag.
    * It is needed when adding the namespace information
    */
  private boolean isRootTag = false;

  /** This field is used when creating StringBuffers for toXml() methods.
    * The size of the StringBuffer will be the size of the document content
    * multiplied by this value. It is aimed at improving the performance of
    * StringBuffer
    */
  private final int DOC_SIZE_MULTIPLICATION_FACTOR = 1;

  /** Constant used in the inner class AnnotationComparator to order
    * annotations on their start offset
    */
  private final int ORDER_ON_START_OFFSET = 0;
  /** Constant used in the inner class AnnotationComparator to order
    * annotations on their end offset
    */
  private final int ORDER_ON_END_OFFSET = 1;
  /** Constant used in the inner class AnnotationComparator to order
    * annotations on their ID
    */
  private final int ORDER_ON_ANNOT_ID = 2;
  /** Constant used in the inner class AnnotationComparator to order
    * annotations ascending
    */
  private final int ASC = 3;
  /** Constant used in the inner class AnnotationComparator to order
    * annotations descending. AnnotationComparator treats every order type
    * other than ASC as descending.
    */
  private final int DESC = -3;
1894
1895  /** A map initialized in init() containing entities that needs to be
1896    * replaced in strings
1897    */
1898  private static Map entitiesMap = null;
1899  // Initialize the entities map use when saving as xml
1900  static{
1901    entitiesMap = new HashMap();
1902    entitiesMap.put(new Character('<'),"&lt;");
1903    entitiesMap.put(new Character('>'),"&gt;");
1904    entitiesMap.put(new Character('&'),"&amp;");
1905    entitiesMap.put(new Character('\''),"&apos;");
1906    entitiesMap.put(new Character('"'),"&quot;");
1907    entitiesMap.put(new Character((char)160),"&#160;");
1908    entitiesMap.put(new Character((char)169),"&#169;");
1909  }//static
1910
  /** The range that the content comes from at the source URL
    * (or null if none).
    * (Commented out; the range is held by the two separate offset fields
    * declared below.)
    */
  //protected Long[] sourceUrlOffsets;

  /** The start of the range that the content comes from at the source URL
    * (or null if none).
    */
  protected Long sourceUrlStartOffset;

  /** The end of the range that the content comes from at the source URL
    * (or null if none).
    */
  protected Long sourceUrlEndOffset;

  /** The default annotation set */
  protected AnnotationSet defaultAnnots;

  /** Named sets of annotations. May be null; the methods of this class
    * that use the map test for null first.
    */
  protected Map namedAnnotSets;

  /**
   * A property of the document that will be set when the user
   * wants to create the document from a string, as opposed to from
   * a URL. Read and written through getStringContent() and
   * setStringContent().
   */
  private String stringContent;
1938
1939  /**
1940   * The stringContent of a document is
1941   * a property of the document that will be set when the user
1942   * wants to create the document from a string, as opposed to from
1943   * a URL.
1944   * <B>Use the <TT>getContent</TT> method instead to get the actual document
1945   * content.</B>
1946   */
1947  public String getStringContent() { return stringContent; }
1948
1949  /**
1950   * The stringContent of a document is
1951   * a property of the document that will be set when the user
1952   * wants to create the document from a string, as opposed to from
1953   * a URL.
1954   * <B>Use the <TT>setContent</TT> method instead to update the actual
1955   * document content.</B>
1956   */
1957  public void setStringContent(String stringContent) {
1958    this.stringContent = stringContent;
1959  } // set StringContent
1960
  /** Is the document markup-aware? Defaults to false. The value takes part
    * in equals() and hashCode(), which call methods on it without a null
    * check.
    */
  protected Boolean markupAware = new Boolean(false);
1963
1964  /** Check: test 2 objects for equality */
1965  protected boolean check(Object a, Object b) {
1966    if( (a == null || b == null) )
1967      return a == b;
1968
1969    return a.equals(b);
1970  } // check(a,b)
1971
1972  /** Equals */
1973  public boolean equals(Object other) {
1974    if(other == null ||
1975       !(other instanceof DocumentImpl))return false;
1976    DocumentImpl doc = (DocumentImpl) other;
1977
1978// PENDING EQUALS IMPLS
1979    if(! check(content, doc.content)) return false;
1980    if(! check(defaultAnnots, doc.defaultAnnots)) return false;
1981    if(! check(encoding, doc.encoding)) return false;
1982    if(! check(features, doc.features)) return false;
1983    if(!markupAware.equals(doc.markupAware)) return false;
1984    if(! check(namedAnnotSets, doc.namedAnnotSets)) return false;
1985    if(nextAnnotationId != doc.nextAnnotationId) return false;
1986    if(nextNodeId != doc.nextNodeId) return false;
1987    if(! check(sourceUrl, doc.sourceUrl)) return false;
1988    if(! check(sourceUrlStartOffset, doc.sourceUrlStartOffset)) return false;
1989    if(! check(sourceUrlEndOffset, doc.sourceUrlEndOffset)) return false;
1990
1991    return true;
1992  } // equals
1993
1994  /** Hash code */
1995  public int hashCode() {
1996    int code = getContent().hashCode();
1997    int memberCode = (defaultAnnots == null) ? 0 : defaultAnnots.hashCode();
1998    code += memberCode;
1999    memberCode = (encoding == null) ? 0 : encoding.hashCode();
2000    code += memberCode;
2001    memberCode = (features == null) ? 0 : features.hashCode();
2002    code += memberCode;
2003    code += (markupAware.booleanValue()) ? 0 : 1;
2004    memberCode = (namedAnnotSets == null) ? 0 : namedAnnotSets.hashCode();
2005    code += memberCode;
2006    code += nextAnnotationId;
2007    code += nextNodeId;
2008    memberCode = (sourceUrl == null) ? 0 : sourceUrl.hashCode();
2009    code += memberCode;
2010    memberCode =
2011      (sourceUrlStartOffset == null) ? 0 : sourceUrlStartOffset.hashCode();
2012    code += memberCode;
2013    memberCode =
2014      (sourceUrlEndOffset == null) ? 0 : sourceUrlEndOffset.hashCode();
2015    code += memberCode;
2016    return code;
2017  } // hashcode
2018
2019  /** String respresentation */
2020  public String toString() {
2021    String n = Strings.getNl();
2022    StringBuffer s = new StringBuffer("DocumentImpl: " + n);
2023    s.append("  content:" + content + n);
2024    s.append("  defaultAnnots:" + defaultAnnots + n);
2025    s.append("  encoding:" + encoding + n);
2026    s.append("  features:" + features + n);
2027    s.append("  markupAware:" + markupAware + n);
2028    s.append("  namedAnnotSets:" + namedAnnotSets + n);
2029    s.append("  nextAnnotationId:" + nextAnnotationId + n);
2030    s.append("  nextNodeId:" + nextNodeId + n);
2031    s.append("  sourceUrl:" + sourceUrl + n);
2032    s.append("  sourceUrlStartOffset:" + sourceUrlStartOffset + n);
2033    s.append("  sourceUrlEndOffset:" + sourceUrlEndOffset + n);
2034    s.append(n);
2035
2036    return s.toString();
2037  } // toString
2038
   /** Freeze the serialization UID, so that it is not recomputed when the
     * class changes.
     */
  static final long serialVersionUID = -8456893608311510260L;
2041
2042  /** Inner class needed to compare annotations*/
2043  class AnnotationComparator implements java.util.Comparator {
2044    int orderOn = -1;
2045    int orderType = ASC;
2046    /** Constructs a comparator according to one of three sorter types:
2047      * ORDER_ON_ANNOT_TYPE, ORDER_ON_END_OFFSET, ORDER_ON_START_OFFSET
2048      */
2049      public AnnotationComparator(int anOrderOn, int anOrderType){
2050        orderOn = anOrderOn;
2051        orderType = anOrderType;
2052      }// AnnotationComparator()
2053
2054      /**This method must be implemented according to Comparator interface */
2055      public int compare(Object o1, Object o2){
2056        Annotation a1 = (Annotation) o1;
2057        Annotation a2 = (Annotation) o2;
2058        // ORDER_ON_START_OFFSET ?
2059        if (orderOn == ORDER_ON_START_OFFSET){
2060          int result = a1.getStartNode().getOffset().compareTo(
2061                                                a2.getStartNode().getOffset());
2062          if (orderType == ASC){
2063            // ASC
2064            // If they are equal then their ID will decide.
2065            if (result == 0)
2066              return a1.getId().compareTo(a2.getId());
2067            return result;
2068          }else{
2069            // DESC
2070            if (result == 0)
2071              return - (a1.getId().compareTo(a2.getId()));
2072            return -result;
2073          }// End if (orderType == ASC)
2074        }// End if (orderOn == ORDER_ON_START_OFFSET)
2075
2076        // ORDER_ON_END_OFFSET ?
2077        if (orderOn == ORDER_ON_END_OFFSET){
2078          int result = a1.getEndNode().getOffset().compareTo(
2079                                                a2.getEndNode().getOffset());
2080          if (orderType == ASC){
2081            // ASC
2082            // If they are equal then their ID will decide.
2083            if (result == 0)
2084              return - (a1.getId().compareTo(a2.getId()));
2085            return result;
2086          }else{
2087            // DESC
2088            // If they are equal then their ID will decide.
2089            if (result == 0)
2090              return a1.getId().compareTo(a2.getId());
2091            return - result;
2092          }// End if (orderType == ASC)
2093        }// End if (orderOn == ORDER_ON_END_OFFSET)
2094
2095        // ORDER_ON_ANNOT_ID ?
2096        if (orderOn == ORDER_ON_ANNOT_ID){
2097          if (orderType == ASC)
2098            return a1.getId().compareTo(a2.getId());
2099          else
2100            return -(a1.getId().compareTo(a2.getId()));
2101        }// End if
2102        return 0;
2103      }//compare()
2104  } // End inner class AnnotationComparator
2105
2106
  /** The registered DocumentListeners; null until the first listener is
    * added. addDocumentListener() and removeDocumentListener() replace the
    * Vector with a modified clone instead of changing it.
    */
  private transient Vector documentListeners;
  /** NOTE(review): not referenced in the visible part of this class -
    * confirm whether it is still in use.
    */
  private transient Vector gateListeners;
2109
2110  public synchronized void removeDocumentListener(DocumentListener l) {
2111    if (documentListeners != null && documentListeners.contains(l)) {
2112      Vector v = (Vector) documentListeners.clone();
2113      v.removeElement(l);
2114      documentListeners = v;
2115    }
2116  }
2117  public synchronized void addDocumentListener(DocumentListener l) {
2118    Vector v = documentListeners == null ? new Vector(2) : (Vector) documentListeners.clone();
2119    if (!v.contains(l)) {
2120      v.addElement(l);
2121      documentListeners = v;
2122    }
2123  }
2124  protected void fireAnnotationSetAdded(DocumentEvent e) {
2125    if (documentListeners != null) {
2126      Vector listeners = documentListeners;
2127      int count = listeners.size();
2128      for (int i = 0; i < count; i++) {
2129        ((DocumentListener) listeners.elementAt(i)).annotationSetAdded(e);
2130      }
2131    }
2132  }
2133  protected void fireAnnotationSetRemoved(DocumentEvent e) {
2134    if (documentListeners != null) {
2135      Vector listeners = documentListeners;
2136      int count = listeners.size();
2137      for (int i = 0; i < count; i++) {
2138        ((DocumentListener) listeners.elementAt(i)).annotationSetRemoved(e);
2139      }
2140    }
2141  }
  /** Does nothing: the document takes no action on this event.
    * NOTE(review): presumably the creole listener notification that a
    * resource was loaded - confirm against the interface.
    */
  public void resourceLoaded(CreoleEvent e) {
  }
  /** Does nothing: the document takes no action on this event.
    * NOTE(review): presumably the creole listener notification that a
    * resource was unloaded - confirm against the interface.
    */
  public void resourceUnloaded(CreoleEvent e) {
  }
  /** Does nothing: the document takes no action on this event.
    * NOTE(review): presumably the creole listener notification that a
    * datastore was opened - confirm against the interface.
    */
  public void datastoreOpened(CreoleEvent e) {
  }
  /** Does nothing: the document takes no action on this event.
    * NOTE(review): presumably the creole listener notification that a
    * datastore was created - confirm against the interface.
    */
  public void datastoreCreated(CreoleEvent e) {
  }
  /** Does nothing: the document takes no action on this event.
    * NOTE(review): presumably the creole listener notification that a
    * resource was renamed - confirm against the interface.
    */
  public void resourceRenamed(Resource resource, String oldName,
                              String newName){
  }
2153  public void datastoreClosed(CreoleEvent e) {
2154    if (! e.getDatastore().equals(this.getDataStore()))
2155      return;
2156    //close this lr, since it cannot stay open when the DS it comes from
2157    //is closed
2158    Factory.deleteResource(this);
2159  }
2160  public void setLRPersistenceId(Object lrID) {
2161    super.setLRPersistenceId( lrID);
2162    //make persistent documents listen to the creole register
2163    //for events about their DS
2164    Gate.getCreoleRegister().addCreoleListener(this);
2165  }
  /** Does nothing: the document takes no action on this event.
    * NOTE(review): presumably the datastore listener notification that a
    * resource was adopted by a datastore - confirm against the interface.
    */
  public void resourceAdopted(DatastoreEvent evt) {
  }
2168  public void resourceDeleted(DatastoreEvent evt) {
2169    if(! evt.getSource().equals(this.getDataStore()))
2170      return;
2171    //if an open document is deleted from a DS, then
2172    //it must close itself immediately, as is no longer valid
2173    if(evt.getResourceID().equals(this.getLRPersistenceId()))
2174      Factory.deleteResource(this);
2175  }
  /** Does nothing: the document takes no action on this event.
    * NOTE(review): presumably the datastore listener notification that a
    * resource was written to a datastore - confirm against the interface.
    */
  public void resourceWritten(DatastoreEvent evt) {
  }
2178  public void setDataStore(DataStore dataStore) throws gate.persist.PersistenceException {
2179    super.setDataStore( dataStore);
2180    if (this.dataStore != null)
2181      this.dataStore.addDatastoreListener(this);
2182  }
2183
2184} // class DocumentImpl
2185