1   /*
2    *  DocumentImpl.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Hamish Cunningham, 11/Feb/2000
12   *
13   *  $Id: DocumentImpl.java,v 1.91 2001/12/03 15:42:04 kalina Exp $
14   */
15  
16  package gate.corpora;
17  
18  import java.util.*;
19  import java.net.*;
20  import java.io.*;
21  
22  import gate.*;
23  import gate.annotation.*;
24  import gate.util.*;
25  import gate.creole.*;
26  import gate.gui.*;
27  import gate.event.*;
28  
29  /** Represents the commonalities between all sorts of documents.
30    *
31    * <H2>Editing</H2>
32    *
33    * <P>
34    * The DocumentImpl class implements the Document interface.
35    * The DocumentContentImpl class models the textual or audio-visual
36    * materials which are the source and content of Documents.
37    * The AnnotationSetImpl class supplies annotations on Documents.
38    *
39    * <P>
40    * Abbreviations:
41    *
42    * <UL>
43    * <LI>
44    * DC = DocumentContent
45    * <LI>
46    * D = Document
47    * <LI>
48    * AS = AnnotationSet
49    * </UL>
50    *
51    * <P>
52    * We add an edit method to each of these classes; for DC and AS
53    * the methods are package private; D has the public method.
54    *
55    * <PRE>
56    *   void edit(Long start, Long end, DocumentContent replacement)
57    *   throws InvalidOffsetException;
58    * </PRE>
59    *
60    * <P>
61    * D receives edit requests and forwards them to DC and AS.
62    * On DC, this method makes a change to the content - e.g. replacing
63    * a String range from start to end with replacement. (Deletions
64    * are catered for by having replacement = null.) D then calls
65    * AS.edit on each of its annotation sets.
66    *
67    * <P>
68    * On AS, edit calls replacement.size() (i.e. DC.size()) to
69    * figure out how long the replacement is (0 for null). It then
70    * considers annotations that terminate (start or end) in
71    * the altered or deleted range as invalid; annotations that
72    * terminate after the range have their offsets adjusted.
73    * I.e.:
74    * <UL>
75    * <LI>
76    * the nodes that pointed inside the old modified area are invalid now and
77    * will be deleted along with the connected annotations;
78    * <LI>
79    * the nodes that are before the start of the modified area remain
80    * untouched;
81    * <LI>
82    * the nodes that are after the end of the affected area will have the
83    * offset changed according to the formula below.
84    * </UL>
85    *
86    * <P>
87    * A note re. AS and annotations: annotations no longer have
88    * offsets as in the old model, they now have nodes, and nodes
89    * have offsets.
90    *
91    * <P>
92    * To implement AS.edit, we have several indices:
93    * <PRE>
94    *   HashMap annotsByStartNode, annotsByEndNode;
95    * </PRE>
96    * which map node ids to annotations;
97    * <PRE>
98    *   RBTreeMap nodesByOffset;
99    * </PRE>
100   * which maps offset to Nodes.
101   *
102   * <P>
103   * When we get an edit request, we traverse that part of the
104   * nodesByOffset tree representing the altered or deleted
105   * range of the DC. For each node found, we delete any annotations
106   * that terminate on the node, and then delete the node itself.
107   * We then traverse the rest of the tree, changing the offset
108   * on all remaining nodes by:
109   * <PRE>
110   *   newOffset =
111   *     oldOffset -
112   *     (
113   *       (end - start) -                                     // size of mod
114   *       ( (replacement == null) ? 0 : replacement.size() )  // size of repl
115   *     );
116   * </PRE>
117   * Note that we use the same convention as e.g. java.lang.String: start
118   * offsets are inclusive; end offsets are exclusive. I.e. for string "abcd"
119   * range 1-3 = "bc". Examples, for a node with offset 4:
120   * <PRE>
121   * edit(1, 3, "BC");
122   * newOffset = 4 - ( (3 - 1) - 2 ) = 4
123   *
124   * edit(1, 3, null);
125   * newOffset = 4 - ( (3 - 1) - 0 ) = 2
126   *
127   * edit(1, 3, "BBCC");
128   * newOffset = 4 - ( (3 - 1) - 4 ) = 6
129   * </PRE>
130   */
131 public class DocumentImpl
132 extends AbstractLanguageResource implements Document, CreoleListener {
133   /** Debug flag */
134   private static final boolean DEBUG = false;
135 
136   /** Default construction. Content left empty. */
137   public DocumentImpl() {
138     content = new DocumentContentImpl();
139   } // default construction
140 
141   /** Initialise this resource, and return it. */
142   public Resource init() throws ResourceInstantiationException {
143 
144     // set up the source URL and create the content
145     if(sourceUrl == null) {
146       if(stringContent == null) {
147         throw new ResourceInstantiationException(
148           "The sourceURL and document's content were null."
149         );
150       }
151       content = new DocumentContentImpl(stringContent);
152       getFeatures().put("gate.SourceURL", "created from String");
153     } else {
154       try {
155         content = new DocumentContentImpl(
156           sourceUrl, encoding, sourceUrlStartOffset, sourceUrlEndOffset
157         );
158         getFeatures().put("gate.SourceURL", sourceUrl.toExternalForm());
159       } catch(IOException e) {
160         throw new ResourceInstantiationException("DocumentImpl.init: " + e);
161       }
162     }
163 
164     // set up a DocumentFormat if markup unpacking required
165     if(getMarkupAware().booleanValue()) {
166       DocumentFormat docFormat =
167         DocumentFormat.getDocumentFormat(this, sourceUrl);
168       try {
169         if(docFormat != null){
170           StatusListener sListener = (StatusListener)
171                                       gate.gui.MainFrame.getListeners().
172                                       get("gate.event.StatusListener");
173           if(sListener != null) docFormat.addStatusListener(sListener);
174           docFormat.unpackMarkup(this);
175           docFormat.removeStatusListener(sListener);
176        } //if format != null
177       } catch(DocumentFormatException e) {
178         throw new ResourceInstantiationException(
179           "Couldn't unpack markup in document " + sourceUrl.toExternalForm() +
180           " " + e
181         );
182       }
183     } // if markup aware
184 
185     return this;
186   } // init()
187 
188   /** Clear all the data members of the object. */
189   public void cleanup() {
190 
191     defaultAnnots = null;
192     if ( (namedAnnotSets != null) && (!namedAnnotSets.isEmpty()))
193         namedAnnotSets.clear();
194     if (DEBUG) Out.prln("Document cleanup called");
195     if (this.lrPersistentId != null)
196       Gate.getCreoleRegister().removeCreoleListener(this);
197   } // cleanup()
198 
199 
200   /** Documents are identified by URLs */
201   public URL getSourceUrl() { return sourceUrl; }
202 
203   /** Set method for the document's URL */
204   public void setSourceUrl(URL sourceUrl) {
205     this.sourceUrl = sourceUrl;
206   } // setSourceUrl
207 
208   /** Documents may be packed within files; in this case an optional pair of
209     * offsets refer to the location of the document.
210     */
211   public Long[] getSourceUrlOffsets() {
212     Long[] sourceUrlOffsets = new Long[2];
213     sourceUrlOffsets[0] = sourceUrlStartOffset;
214     sourceUrlOffsets[1] = sourceUrlEndOffset;
215     return sourceUrlOffsets;
216   } // getSourceUrlOffsets
217 
218   /** Documents may be packed within files; in this case an optional pair of
219     * offsets refer to the location of the document. This method gets the
220     * start offset.
221     */
222   public Long getSourceUrlStartOffset() { return sourceUrlStartOffset; }
223 
224   /** Documents may be packed within files; in this case an optional pair of
225     * offsets refer to the location of the document. This method sets the
226     * start offset.
227     */
228   public void setSourceUrlStartOffset(Long sourceUrlStartOffset) {
229     this.sourceUrlStartOffset = sourceUrlStartOffset;
230   } // setSourceUrlStartOffset
231 
232   /** Documents may be packed within files; in this case an optional pair of
233     * offsets refer to the location of the document. This method gets the
234     * end offset.
235     */
236   public Long getSourceUrlEndOffset() { return sourceUrlEndOffset; }
237 
238   /** Documents may be packed within files; in this case an optional pair of
239     * offsets refer to the location of the document. This method sets the
240     * end offset.
241     */
242   public void setSourceUrlEndOffset(Long sourceUrlEndOffset) {
243     this.sourceUrlEndOffset = sourceUrlEndOffset;
244   } // setSourceUrlStartOffset
245 
246   /** The content of the document: a String for text; MPEG for video; etc. */
247   public DocumentContent getContent() { return content; }
248 
249   /** Set method for the document content */
250   public void setContent(DocumentContent content) { this.content = content; }
251 
252   /** Get the encoding of the document content source */
253   public String getEncoding() { return encoding; }
254 
255   /** Set the encoding of the document content source */
256   public void setEncoding(String encoding) { this.encoding = encoding; }
257 
258   /** Get the default set of annotations. The set is created if it
259     * doesn't exist yet.
260     */
261   public AnnotationSet getAnnotations() {
262     if(defaultAnnots == null){
263       defaultAnnots = new AnnotationSetImpl(this);
264       fireAnnotationSetAdded(new DocumentEvent(
265            this, DocumentEvent.ANNOTATION_SET_ADDED, null));
266     }//if
267     return defaultAnnots;
268   } // getAnnotations()
269 
270   /** Get a named set of annotations. Creates a new set if one with this
271     * name doesn't exist yet.
272     * If the provided name is null then it returns the default annotation set.
273     */
274   public AnnotationSet getAnnotations(String name) {
275     if(name == null) return getAnnotations();
276     if(namedAnnotSets == null)
277       namedAnnotSets = new HashMap();
278     AnnotationSet namedSet = (AnnotationSet) namedAnnotSets.get(name);
279 
280     if(namedSet == null) {
281       namedSet = new AnnotationSetImpl(this, name);
282       namedAnnotSets.put(name, namedSet);
283 
284       DocumentEvent evt = new DocumentEvent(
285         this, DocumentEvent.ANNOTATION_SET_ADDED, name
286       );
287       fireAnnotationSetAdded(evt);
288     }
289     return namedSet;
290   } // getAnnotations(name)
291 
292   /** Make the document markup-aware. This will trigger the creation
293    *  of a DocumentFormat object at Document initialisation time; the
294    *  DocumentFormat object will unpack the markup in the Document and
295    *  add it as annotations. Documents are <B>not</B> markup-aware by default.
296    *
297    *  @param b markup awareness status.
298    */
299   public void setMarkupAware(Boolean newMarkupAware) {
300       this.markupAware = newMarkupAware;
301   }
302 
303   /** Get the markup awareness status of the Document.
304    *  <B>Documents are markup-aware by default.</B>
305    *  @return whether the Document is markup aware.
306    */
307   public Boolean getMarkupAware() { return markupAware; }
308 
309   /** Returns an XML document aming to preserve the original markups(
310     * the original markup will be in the same place and format as it was
311     * before processing the document) and include (if possible)
312     * the annotations specified in the aSourceAnnotationSet.
313     * <b>Warning:</b> Annotations from the aSourceAnnotationSet will be lost
314     * if they will cause a crosed over situation.
315     * @param aSourceAnnotationSet is an annotation set containing all the
316     * annotations that will be combined with the original marup set. If the
317     * param is <code>null</code> it will only dump the original markups.
318     * @return a string representing an XML document containing the original
319     * markup + dumped annotations form the aSourceAnnotationSet
320     */
321   public String toXml(Set aSourceAnnotationSet){
322     AnnotationSet originalMarkupsAnnotSet =
323             this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
324 
325     // Create a dumping annotation set on the document. It will be used for
326     // dumping annotations...
327     AnnotationSet dumpingSet = new AnnotationSetImpl((Document) this);
328 
329     // This set will be constructed inside this method. If is not empty, the
330     // annotation contained will be lost.
331     if (!dumpingSet.isEmpty()){
332       Out.prln("WARNING: The dumping annotation set was not empty."+
333       "All annotation it contained were lost.");
334       dumpingSet.clear();
335     }// End if
336 
337     StatusListener sListener = (StatusListener)
338                                gate.gui.MainFrame.getListeners().
339                                get("gate.event.StatusListener");
340     // Construct the dumping set in that way that all annotations will verify
341     // the condition that there are not annotations which are crossed.
342     // First add all annotation from the original markups
343     if(sListener != null)
344       sListener.statusChanged("Constructing the dumping annotation set.");
345     dumpingSet.addAll(originalMarkupsAnnotSet);
346     // Then take all the annotations from aSourceAnnotationSet and verify if
347     // they can be inserted safely into the dumpingSet. Where not possible,
348     // report.
349     if (aSourceAnnotationSet != null){
350       Iterator iter = aSourceAnnotationSet.iterator();
351       while (iter.hasNext()){
352         Annotation currentAnnot = (Annotation) iter.next();
353         if(insertsSafety(dumpingSet,currentAnnot)){
354           dumpingSet.add(currentAnnot);
355         }else{
356           Out.prln("Warning: Annotation with ID=" + currentAnnot.getId() +
357           ", startOffset=" + currentAnnot.getStartNode().getOffset() +
358           ", endOffset=" + currentAnnot.getEndNode().getOffset() +
359           ", type=" + currentAnnot.getType()+ " was found to violate the" +
360           " crossed over condition. It will be discarded");
361         }// End if
362       }// End while
363     }// End if
364 
365     // The dumpingSet is ready to be exported as XML
366     // Here we go.
367     if(sListener != null) sListener.statusChanged("Dumping annotations as XML");
368     StringBuffer xmlDoc = new StringBuffer(
369           DOC_SIZE_MULTIPLICATION_FACTOR*(this.getContent().size().intValue()));
370     // Add xml header
371 //    xmlDoc.append("<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n");
372 
373     // If the annotation set contains this "GatePreserveFormat"
374     // type, then this is removed because it will be added in the saving
375     // process. The reason of this removal is that if the loaded document
376     // was previously loaded from a GatePreserveFormat then we
377     // don't want to create lots of annotation for this type. This annotation
378     // type should be always the root element of a XML preserving format
379     // GATE document.
380     FeatureMap docFeatures = this.getFeatures();
381     String mimeTypeStr = null;
382 //    addGatePreserveFormatTag = false;
383     if (  docFeatures != null &&
384           null != (mimeTypeStr=(String)docFeatures.get("MimeType")) &&
385           (
386             "text/html".equalsIgnoreCase(mimeTypeStr) ||
387             "text/xml".equalsIgnoreCase(mimeTypeStr) ||
388             "text/sgml".equalsIgnoreCase(mimeTypeStr)
389            )
390        ){
391           /* don't add the root tag */
392     }else{
393       // Add the root start element
394 //      xmlDoc.append("<GatePreserveFormat"+
395 //                    " xmlns:gate=\"http://www.gate.ac.uk\"" +
396 //                    " gate:annotMaxId=\"" +
397 //                    getNextAnnotationId() +
398 //                    "\">");
399 //      addGatePreserveFormatTag = true;
400     }// End if
401 
402     xmlDoc.append(saveAnnotationSetAsXml(dumpingSet));
403 
404 //    if (addGatePreserveFormatTag){
405 //      // Add the root end element
406 //      xmlDoc.append("</GatePreserveFormat>");
407 //    }// End if
408     if(sListener != null) sListener.statusChanged("Done.");
409     return xmlDoc.toString();
410   }//End toXml()
411 
412   /** This method verifies if aSourceAnnotation can ve inserted safety into the
413     * aTargetAnnotSet. Safety means that it doesn't violate the crossed over
414     * contition with any annotation from the aTargetAnnotSet.
415     * @param aTargetAnnotSet the annotation set to include the aSourceAnnotation
416     * @param aSourceAnnotation the annotation to be inserted into the
417     * aTargetAnnotSet
418     * @return true if the annotation inserts safety, or false otherwise.
419     */
420   private boolean insertsSafety(AnnotationSet aTargetAnnotSet,
421                                                 Annotation aSourceAnnotation){
422 
423     if (aTargetAnnotSet == null || aSourceAnnotation == null) return false;
424     if (aSourceAnnotation.getStartNode() == null ||
425         aSourceAnnotation.getStartNode().getOffset()== null) return false;
426     if (aSourceAnnotation.getEndNode() == null ||
427         aSourceAnnotation.getEndNode().getOffset()== null) return false;
428 
429     // Get the start and end offsets
430     Long start = aSourceAnnotation.getStartNode().getOffset();
431     Long end =   aSourceAnnotation.getEndNode().getOffset();
432     // Read aSourceAnnotation offsets long
433     long s2 = start.longValue();
434     long e2 = end.longValue();
435 
436     // Obtain a set with all annotations annotations that overlap
437     // totaly or partially with the interval defined by the two provided offsets
438     AnnotationSet as = aTargetAnnotSet.get(start,end);
439 
440     // Investigate all the annotations from as to see if there is one that
441     // comes in conflict with aSourceAnnotation
442     Iterator it = as.iterator();
443     while(it.hasNext()){
444       Annotation ann = (Annotation) it.next();
445       // Read ann offsets
446       long s1 = ann.getStartNode().getOffset().longValue();
447       long e1 = ann.getEndNode().getOffset().longValue();
448 
449       if (s1<s2 && s2<e1 && e1<e2) return false;
450       if (s2<s1 && s1<e2 && e2<e1) return false;
451     }// End while
452     return true;
453   }// insertsSafety()
454 
455   /** This method saves all the annotations from aDumpAnnotSet and combines
456     * them with the document content.
457     * @param aDumpAnnotationSet is a GATE annotation set prepared to be used
458     * on the raw text from document content. If aDumpAnnotSet is <b>null<b>
459     * then an empty string will be returned.
460     * @return The XML document obtained from raw text + the information from
461     * the dump annotation set.
462     */
463   private String saveAnnotationSetAsXml(AnnotationSet aDumpAnnotSet){
464     String content = null;
465     if (this.getContent()== null)
466       content = new String("");
467     else
468       content = this.getContent().toString();
469     StringBuffer docContStrBuff = filterNonXmlChars(new StringBuffer(content));
470     if (aDumpAnnotSet == null)   return docContStrBuff.toString();
471 
472     TreeMap offsets2CharsMap = new TreeMap();
473     if (this.getContent().size().longValue() != 0){
474       // Fill the offsets2CharsMap with all the indices where
475       // special chars appear
476       buildEntityMapFromString(content,offsets2CharsMap);
477     }//End if
478     // The saving alghorithm is as follows:
479     ///////////////////////////////////////////
480     // Construct a set of annot with all IDs in asc order.
481     // All annotations that end at that offset swap their place in descending
482     // order. For each node write all the tags from left to right.
483 
484     // Construct the node set
485     TreeSet offsets = new TreeSet();
486     Iterator iter = aDumpAnnotSet.iterator();
487     while (iter.hasNext()){
488       Annotation annot = (Annotation) iter.next();
489       offsets.add(annot.getStartNode().getOffset());
490       offsets.add(annot.getEndNode().getOffset());
491     }// End while
492     isRootTag = false;
493     // ofsets is sorted in ascending order.
494     // Iterate this set in descending order and remove an offset at each
495     // iteration
496     while (!offsets.isEmpty()){
497       Long offset = (Long)offsets.last();
498       // Remove the offset from the set
499       offsets.remove(offset);
500       // Now, use it.
501       // Returns a list with annotations that needs to be serialized in that
502       // offset.
503       List annotations = getAnnotationsForOffset(aDumpAnnotSet,offset);
504       // Attention: the annotation are serialized from left to right
505       StringBuffer tmpBuff = new StringBuffer("");
506       Stack stack = new Stack();
507       // Iterate through all these annotations and serialize them
508       Iterator it = annotations.iterator();
509       while(it.hasNext()){
510         Annotation a = (Annotation) it.next();
511         it.remove();
512         // Test if a Ends at offset
513         if ( offset.equals(a.getEndNode().getOffset()) ){
514           // Test if a Starts at offset
515           if ( offset.equals(a.getStartNode().getOffset()) ){
516             // Here, the annotation a Starts and Ends at the offset
517             if ( null != a.getFeatures().get("isEmptyAndSpan") &&
518                  "true".equals((String)a.getFeatures().get("isEmptyAndSpan"))){
519 
520               // Assert: annotation a with start == end and isEmptyAndSpan
521               if (offsets.isEmpty() && "".equals(tmpBuff.toString())){
522                 // a is the doc's root tag to be written
523                 // The annotations are serialized from left to right.
524                 // The first annot in the last offset is the ROOT one
525                 isRootTag = true;
526               }// End if
527               tmpBuff.append(writeStartTag(a));
528               stack.push(a);
529             }else{
530               // Assert annotation a with start == end and an empty tag
531               tmpBuff.append(writeEmptyTag(a));
532               // The annotation is removed from dumped set
533               aDumpAnnotSet.remove(a);
534             }// End if
535           }else{
536             // Here the annotation a Ends at the offset.
537             // In this case empty the stack and write the end tag
538             if (!stack.isEmpty()){
539               while(!stack.isEmpty()){
540                 Annotation a1 = (Annotation)stack.pop();
541                 tmpBuff.append(writeEndTag(a1));
542               }// End while
543             }// End if
544             tmpBuff.append(writeEndTag(a));
545           }// End if
546         }else{
547           // The annotation a does NOT end at the offset. Let's see if it starts
548           // at the offset
549           if ( offset.equals(a.getStartNode().getOffset()) ){
550             // The annotation a starts at the offset.
551             // In this case empty the stack and write the end tag
552             if (!stack.isEmpty()){
553               while(!stack.isEmpty()){
554                 Annotation a1 = (Annotation)stack.pop();
555                 tmpBuff.append(writeEndTag(a1));
556               }// End while
557             }// End if
558             if (offsets.isEmpty() && "".equals(tmpBuff.toString())){
559               // a is the last tag to be written
560               // The annotations are serialized from left to right.
561               // The first annot in the last offset is the ROOT one.
562               isRootTag = true;
563             }// End if
564             tmpBuff.append(writeStartTag(a));
565             // The annotation is removed from dumped set
566             aDumpAnnotSet.remove(a);
567           }// End if ( offset.equals(a.getStartNode().getOffset()) )
568         }// End if ( offset.equals(a.getEndNode().getOffset()) )
569       }// End while(it.hasNext()){
570 
571       // In this case empty the stack and write the end tag
572       if (!stack.isEmpty()){
573         while(!stack.isEmpty()){
574           Annotation a1 = (Annotation)stack.pop();
575           tmpBuff.append(writeEndTag(a1));
576         }// End while
577       }// End if
578 
579       // Before inserting tmpBuff into docContStrBuff we need to check
580       // if there are chars to be replaced and if there are, they would be
581       // replaced.
582       if (!offsets2CharsMap.isEmpty()){
583         Integer offsChar = (Integer) offsets2CharsMap.lastKey();
584         while( !offsets2CharsMap.isEmpty() &&
585                        offsChar.intValue() >= offset.intValue()){
586           // Replace the char at offsChar with its corresponding entity form
587           // the entitiesMap.
588           docContStrBuff.replace(offsChar.intValue(),offsChar.intValue()+1,
589           (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar)));
590           // Discard the offsChar after it was used.
591           offsets2CharsMap.remove(offsChar);
592           // Investigate next offsChar
593           if (!offsets2CharsMap.isEmpty())
594             offsChar = (Integer) offsets2CharsMap.lastKey();
595         }// End while
596       }// End if
597       // Insert tmpBuff to the location where it belongs in docContStrBuff
598       docContStrBuff.insert(offset.intValue(),tmpBuff.toString());
599     }// End while(!offsets.isEmpty())
600     // Need to replace the entities in the remaining text, if there is any text
601     // So, if there are any more items in offsets2CharsMap they need to be
602     // replaced
603     while (!offsets2CharsMap.isEmpty()){
604       Integer offsChar = (Integer) offsets2CharsMap.lastKey();
605       // Replace the char with its entity
606       docContStrBuff.replace(offsChar.intValue(),offsChar.intValue()+1,
607       (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar)));
608       // remove the offset from the map
609       offsets2CharsMap.remove(offsChar);
610     }// End while
611     return docContStrBuff.toString();
612   }// saveAnnotationSetAsXml()
613 
614   /** This method returns a list with annotations ordered that way that
615     * they can be serialized from left to right, at the offset. If one of the
616     * params is null then an empty list will be returned.
617     * @param aDumpAnnotSet is a set containing all annotations that will be
618     * dumped.
619     * @param offset represent the offset at witch the annotation must start
620     * AND/OR end.
621     * @return a list with those annotations that need to be serialized.
622     */
623   private List getAnnotationsForOffset(AnnotationSet aDumpAnnotSet,Long offset){
624     List annotationList = new LinkedList();
625     if (aDumpAnnotSet == null || offset == null) return annotationList;
626     Set annotThatStartAtOffset = new TreeSet(
627                           new AnnotationComparator(ORDER_ON_END_OFFSET,DESC));
628     Set annotThatEndAtOffset = new TreeSet(
629                           new AnnotationComparator(ORDER_ON_START_OFFSET,DESC));
630     Set annotThatStartAndEndAtOffset = new TreeSet(
631                           new AnnotationComparator(ORDER_ON_ANNOT_ID,ASC));
632 
633     // Fill these tree lists with annotation tat start, end or start and
634     // end at the offset.
635     Iterator iter = aDumpAnnotSet.iterator();
636     while(iter.hasNext()){
637       Annotation ann = (Annotation) iter.next();
638       if (offset.equals(ann.getStartNode().getOffset())){
639         if (offset.equals(ann.getEndNode().getOffset()))
640           annotThatStartAndEndAtOffset.add(ann);
641         else
642           annotThatStartAtOffset.add(ann);
643       }else{
644         if (offset.equals(ann.getEndNode().getOffset()))
645           annotThatEndAtOffset.add(ann);
646       }// End if
647     }// End while
648     annotationList.addAll(annotThatEndAtOffset);
649     annotThatEndAtOffset = null;
650     annotationList.addAll(annotThatStartAtOffset);
651     annotThatStartAtOffset = null;
652     iter = annotThatStartAndEndAtOffset.iterator();
653     while(iter.hasNext()){
654       Annotation ann = (Annotation) iter.next();
655       Iterator it = annotationList.iterator();
656       boolean breaked = false;
657       while (it.hasNext()){
658         Annotation annFromList = (Annotation) it.next();
659         if (annFromList.getId().intValue() > ann.getId().intValue()){
660           annotationList.add(annotationList.indexOf(annFromList),ann);
661           breaked = true;
662           break;
663         }// End if
664       }// End while
665       if (!breaked)
666         annotationList.add(ann);
667       iter.remove();
668     }// End while
669     return annotationList;
670   }// getAnnotationsForOffset()
671 
672   /** Returns a string representing a start tag based on the input annot*/
673   private String writeStartTag(Annotation annot){
674     StringBuffer strBuff = new StringBuffer("");
675     if (annot == null) return strBuff.toString();
676     if (!addGatePreserveFormatTag && isRootTag){
677       strBuff.append("<"+annot.getType()+
678             " xmlns:gate=\"http://www.gate.ac.uk\"" +
679             " gate:gateId=\"" + annot.getId()+"\"" +
680             " gate:annotMaxId=\"" + getNextAnnotationId() + "\""+
681                     writeFeatures(annot.getFeatures())+" >");
682       // Once the root tag was writen then there will be no other Root tag
683       isRootTag = false;
684     }else{
685       strBuff.append("<"+annot.getType()+" gate:gateId=\"" +annot.getId()+"\""+
686                     writeFeatures(annot.getFeatures())+" >");
687     }// End if
688     return strBuff.toString();
689   }// writeStartTag()
690 
691   /** This method takes aScanString and searches for those chars from
692     * entitiesMap that appear in the string. A tree map(offset2Char) is filled
693     * using as key the offsets where those Chars appear and the Char.
694     * If one of the params is null the method simply returns.
695     */
696   private void buildEntityMapFromString(String aScanString, TreeMap aMapToFill){
697     if (aScanString == null || aMapToFill == null) return;
698     if (entitiesMap == null || entitiesMap.isEmpty()){
699       Err.prln("WARNING: Entities map was not initialised !");
700       return;
701     }// End if
702     // Fill the Map with the offsets of the special chars
703     Iterator entitiesMapIterator = entitiesMap.keySet().iterator();
704     while(entitiesMapIterator.hasNext()){
705       Character c = (Character) entitiesMapIterator.next();
706       int fromIndex = 0;
707       while (-1 != fromIndex){
708         fromIndex = aScanString.indexOf(c.charValue(),fromIndex);
709         if (-1 != fromIndex){
710           aMapToFill.put(new Integer(fromIndex),c);
711           fromIndex ++;
712         }// End if
713       }// End while
714     }// End while
715   }//buildEntityMapFromString();
716 
717   /** Returns a string representing an empty tag based on the input annot*/
718   private String writeEmptyTag(Annotation annot){
719     StringBuffer strBuff = new StringBuffer("");
720     if (annot == null) return strBuff.toString();
721     strBuff.append("<"+annot.getType()+" gateId=\"" +annot.getId()+"\""+
722                     writeFeatures(annot.getFeatures())+" />");
723     return strBuff.toString();
724   }// writeEmptyTag()
725 
726   /** Returns a string representing an end tag based on the input annot*/
727   private String writeEndTag(Annotation annot){
728     StringBuffer strBuff = new StringBuffer("");
729     if (annot == null) return strBuff.toString();
730 /*
731     if (annot.getType().indexOf(" ") != -1)
732       Out.prln("Warning: Truncating end tag to first word for annot type \""
733       +annot.getType()+ "\". ");
734 */
735     strBuff.append("</"+annot.getType()+">");
736     return strBuff.toString();
737   }// writeEndTag()
738 
739   /** Returns a string representing a FeatureMap serialized as XML attributes*/
740   private String writeFeatures(FeatureMap feat){
741     StringBuffer strBuff = new StringBuffer("");
742     if (feat == null) return strBuff.toString();
743     Iterator it = feat.keySet().iterator();
744     while (it.hasNext()){
745       Object key = it.next();
746       Object value = feat.get(key);
747       if ( (key != null) && (value != null) ){
748         // Eliminate a feature inserted at reading time and which help to
749         // take some decissions at saving time
750         if ("isEmptyAndSpan".equals(key.toString()))
751           continue;
752         if( !(String.class.isAssignableFrom(key.getClass()) ||
753               Number.class.isAssignableFrom(key.getClass()))){
754 
755             Out.prln("Warning:Found a feature NAME("+key+") that doesn't came"+
756                              " from String or Number.(feature discarded)");
757             continue;
758         }// End if
759         if ( !(String.class.isAssignableFrom(value.getClass()) ||
760                Number.class.isAssignableFrom(value.getClass()) ||
761                java.util.Collection.class.isAssignableFrom(value.getClass()))){
762 
763             Out.prln("Warning:Found a feature VALUE("+value+") that doesn't came"+
764                        " from String, Number or Collection.(feature discarded)");
765             continue;
766         }// End if
767         if ("matches".equals(key))
768           strBuff.append(" gate:" + key + "=\"");
769         else
770           strBuff.append(" " + key + "=\"");
771         if (java.util.Collection.class.isAssignableFrom(value.getClass())){
772           Iterator valueIter = ((Collection)value).iterator();
773           while(valueIter.hasNext()){
774             Object item = valueIter.next();
775             if (!(String.class.isAssignableFrom(item.getClass()) ||
776                   Number.class.isAssignableFrom(item.getClass())))
777                   continue;
778             strBuff.append(item +";");
779           }// End while
780           if (strBuff.charAt(strBuff.length()-1) == ';')
781             strBuff.deleteCharAt(strBuff.length()-1);
782         }else{
783           strBuff.append(value);
784         }// End if
785         strBuff.append("\"");
786       }// End if
787     }// End while
788     return strBuff.toString();
789   }// writeFeatures()
790 
  /** Returns a GateXml document, a custom XML format for which there is
    * a reader inside GATE called gate.xml.GateFormatXmlHandler.
    * What it does is to serialize a GATE document in an XML format.
    * @return a string representing a Gate Xml document. If saved in a file,
    * this string must be written using the UTF-8 encoding because the first
    * line in the generated xml document is
    * &lt;?xml version="1.0" encoding="UTF-8" ?&gt;
    */
798   public String toXml(){
799     // Initialize the xmlContent with 3 time the size of the current document.
800     // This is because of the tags size. This measure is made to increase the
801     // performance of StringBuffer.
802     StringBuffer xmlContent = new StringBuffer(
803          DOC_SIZE_MULTIPLICATION_FACTOR*(getContent().size().intValue()));
804     // Add xml header
805     xmlContent.append("<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n");
806     // Add the root element
807     xmlContent.append("<GateDocument>\n");
808     xmlContent.append("<!-- The document's features-->\n\n");
809     xmlContent.append("<GateDocumentFeatures>\n");
810     xmlContent.append(featuresToXml(this.getFeatures()));
811     xmlContent.append("</GateDocumentFeatures>\n");
812     xmlContent.append("<!-- The document content area with serialized"+
813                       " nodes -->\n\n");
814     // Add plain text element
815     xmlContent.append("<TextWithNodes>");
816     xmlContent.append(textWithNodes(this.getContent().toString()));
817     xmlContent.append("</TextWithNodes>\n");
818     // Serialize as XML all document's annotation sets
819     // Serialize the default AnnotationSet
820     StatusListener sListener = (StatusListener)
821                                gate.gui.MainFrame.getListeners().
822                                get("gate.event.StatusListener");
823     if(sListener != null)
824       sListener.statusChanged("Saving the default annotation set ");
825     xmlContent.append("<!-- The default annotation set -->\n\n");
826     xmlContent.append(annotationSetToXml(this.getAnnotations()));
827     // Serialize all others AnnotationSets
828     // namedAnnotSets is a Map containing all other named Annotation Sets.
829     if (namedAnnotSets != null){
830       Iterator iter = namedAnnotSets.values().iterator();
831       while(iter.hasNext()){
832         AnnotationSet annotSet = (AnnotationSet) iter.next();
833         xmlContent.append("<!-- Named annotation set -->\n\n");
834         // Serialize it as XML
835         if(sListener != null) sListener.statusChanged("Saving " +
836                                                       annotSet.getName()+
837                                                       " annotation set ");
838         xmlContent.append(annotationSetToXml(annotSet));
839       }// End while
840     }// End if
841     // Add the end of GateDocument
842     xmlContent.append("</GateDocument>");
843     if(sListener != null) sListener.statusChanged("Done !");
844     // return the XmlGateDocument
845     return xmlContent.toString();
846   }// toXml
847 
  /** This method filters any non XML char
    * see: http://www.w3c.org/TR/2000/REC-xml-20001006#charsets
    * All non XML chars will be replaced with 0x20 (space char). This ensures
    * that the next time the document is loaded there won't be any problems.
    * @param aStrBuffer represents the input StringBuffer that is filtered. If
    * aStrBuffer is null then an empty StringBuffer will be returned
    * @return the "purified" StringBuffer version of the aStrBuffer
    */
856   private StringBuffer filterNonXmlChars(StringBuffer aStrBuffer){
857     if (aStrBuffer == null) return new StringBuffer("");
858     String space = new String(" ");
859     for (int i=aStrBuffer.length()-1;i>=0; i--){
860       if (!isXmlChar(aStrBuffer.charAt(i)))
861         aStrBuffer.replace(i,i+1,space);
862     }// End for
863     return aStrBuffer;
864   }// filterNonXmlChars()
865 
  /** This method decides if a char is a valid XML one or not
    * @param ch the char to be tested
    * @return true if it is a valid XML char and false if it is not.
    */
870   private boolean isXmlChar(char ch){
871     if (ch == 0x9 || ch == 0xA || ch ==0xD) return true;
872     if ((0x20 <= ch) && (ch <= 0xD7FF)) return true;
873     if ((0xE000 <= ch) && (ch <= 0xFFFD)) return true;
874     if ((0x10000 <= ch) && (ch <= 0x10FFFF)) return true;
875     return false;
876   }// End isXmlChar()
877 
  /** This method saves a FeatureMap as XML elements.
    * @param aFeatureMap the feature map that has to be saved as XML.
    * @return a String like this: <Feature><Name>...</Name>
    * <Value>...</Value></Feature><Feature>...</Feature>
    */
883   private String featuresToXml(FeatureMap aFeatureMap){
884     StringBuffer str = new StringBuffer("");
885 
886     if (aFeatureMap == null) return str.toString();
887 
888     Set keySet = aFeatureMap.keySet();
889     Iterator keyIterator = keySet.iterator();
890     while(keyIterator.hasNext()){
891       Object key = keyIterator.next();
892       Object value = aFeatureMap.get(key);
893       if ((key != null) && (value != null)){
894         String keyClassName = null;
895         String keyItemClassName = null;
896         String valueClassName = null;
897         String valueItemClassName = null;
898         String key2String = key.toString();
899         String value2String = value.toString();
900         Object item = null;
901         // Test key if it is String, Number or Collection
902         if (key instanceof java.lang.String ||
903             key instanceof java.lang.Number ||
904             key instanceof java.util.Collection)
905           keyClassName = key.getClass().getName();
906 
907         // Test value if it is String, Number or Collection
908         if (value instanceof java.lang.String ||
909             value instanceof java.lang.Number ||
910             value instanceof java.util.Collection)
911           valueClassName = value.getClass().getName();
912 
913         // Features and values that are not Strings, Numbers or collections
914         // will be discarded.
915         if (keyClassName == null || valueClassName == null) continue;
916 
917         // If key is collection serialize the colection in a specific format
918         if (key instanceof java.util.Collection){
919           StringBuffer keyStrBuff = new StringBuffer("");
920           Iterator iter = ((Collection) key).iterator();
921           if (iter.hasNext()){
922             item = iter.next();
923             if (item instanceof java.lang.Number)
924               keyItemClassName = item.getClass().getName();
925             else
926               keyItemClassName = String.class.getName();
927             keyStrBuff.append(item.toString());
928           }// End if
929           while (iter.hasNext()){
930             item = iter.next();
931             keyStrBuff.append(";" + item.toString());
932           }// End while
933           key2String = keyStrBuff.toString();
934         }// End if
935         // If key is collection serialize the colection in a specific format
936         if (value instanceof java.util.Collection){
937           StringBuffer valueStrBuff = new StringBuffer("");
938           Iterator iter = ((Collection) value).iterator();
939           if (iter.hasNext()){
940             item = iter.next();
941             if (item instanceof java.lang.Number)
942               valueItemClassName = item.getClass().getName();
943             else
944               valueItemClassName = String.class.getName();
945             valueStrBuff.append(item.toString());
946           }// End if
947           while (iter.hasNext()){
948             item = iter.next();
949             valueStrBuff.append(";" + item.toString());
950           }// End while
951           value2String = valueStrBuff.toString();
952         }// End if
953         str.append("<Feature>\n  <Name");
954         if (keyClassName != null)
955           str.append(" className=\""+keyClassName+"\"");
956         if (keyItemClassName != null)
957           str.append(" itemClassName=\""+keyItemClassName+"\"");
958         str.append(">");
959         str.append(filterNonXmlChars(replaceCharsWithEntities(key2String)));
960         str.append("</Name>\n  <Value");
961         if (valueClassName != null)
962           str.append(" className=\"" + valueClassName + "\"");
963         if (valueItemClassName != null)
964           str.append(" itemClassName=\"" + valueItemClassName + "\"");
965         str.append(">");
966         str.append(filterNonXmlChars(replaceCharsWithEntities(value2String)));
967         str.append("</Value>\n</Feature>\n");
968       }// End if
969     }// end While
970     return str.toString();
971   }//featuresToXml
972 
973   /** This method replace all chars that appears in the anInputString and also
974     * that are in the entitiesMap with their corresponding entity
975     * @param anInputString the string analyzed. If it is null then returns the
976     *  empty string
977     * @return a string representing the input string with chars replaced with
978     *  entities
979     */
980   private StringBuffer replaceCharsWithEntities(String anInputString){
981     if (anInputString == null) return new StringBuffer("");
982     StringBuffer strBuff = new StringBuffer(anInputString);
983     for (int i=strBuff.length()-1; i>=0; i--){
984       Character ch = new Character(strBuff.charAt(i));
985       if (entitiesMap.keySet().contains(ch)){
986         strBuff.replace(i,i+1,(String) entitiesMap.get(ch));
987       }// End if
988     }// End for
989     return strBuff;
990   }//replaceCharsWithEntities()
991 
992   /** This method creates Node XML elements and inserts them at the
993     * corresponding offset inside the text. Nodes are created from the default
994     * annotation set, as well as from all existing named annotation sets.
995     * @param aText The text representing the document's plain text.
996     * @return The text with empty <Node id="NodeId"/> elements.
997     */
998   private String textWithNodes(String aText){
999     if (aText == null) return new String("");
1000    StringBuffer textWithNodes = filterNonXmlChars(new StringBuffer(aText));
1001
1002    // Construct a map from offsets to Chars
1003    TreeMap offsets2CharsMap = new TreeMap();
1004    if (aText.length()!= 0){
1005      // Fill the offsets2CharsMap with all the indices where special chars appear
1006      buildEntityMapFromString(aText,offsets2CharsMap);
1007    }//End if
1008    // Construct the offsetsSet for all nodes belonging to this document
1009    TreeSet offsetsSet = new TreeSet();
1010    Iterator annotSetIter = this.getAnnotations().iterator();
1011    while (annotSetIter.hasNext()){
1012      Annotation annot = (Annotation) annotSetIter.next();
1013      offsetsSet.add(annot.getStartNode().getOffset());
1014      offsetsSet.add(annot.getEndNode().getOffset());
1015    }// end While
1016    // Get the nodes from all other named annotation sets.
1017    if (namedAnnotSets != null){
1018      Iterator iter = namedAnnotSets.values().iterator();
1019      while(iter.hasNext()){
1020        AnnotationSet annotSet = (AnnotationSet) iter.next();
1021        Iterator iter2 = annotSet.iterator();
1022        while(iter2.hasNext()){
1023          Annotation annotTmp = (Annotation) iter2.next();
1024          offsetsSet.add(annotTmp.getStartNode().getOffset());
1025          offsetsSet.add(annotTmp.getEndNode().getOffset());
1026        }// End while
1027      }// End while
1028    }// End if
1029    // offsetsSet is ordered in ascending order because the structure
1030    // is a TreeSet
1031
1032    if (offsetsSet.isEmpty()){
1033      return replaceCharsWithEntities(aText).toString();
1034    }// End if
1035    // Iterate through all nodes from anAnnotSet and transform them to
1036    // XML elements. Then insert those elements at the node's offset into the
1037    // textWithNodes .
1038    while (!offsetsSet.isEmpty()){
1039      Long offset = (Long) offsetsSet.last();
1040      // Eliminate the offset from the list in order to create more memory space
1041      offsetsSet.remove(offset);
1042      // Use offset
1043      int offsetValue = offset.intValue();
1044      String strNode = "<Node id=\"" + offsetValue + "\"/>";
1045      // Before inserting this string into the textWithNodes, check to see if
1046      // there are any chars to be replaced with their corresponding entities
1047      if (!offsets2CharsMap.isEmpty()){
1048        Integer offsChar = (Integer) offsets2CharsMap.lastKey();
1049        while( !offsets2CharsMap.isEmpty() &&
1050                       offsChar.intValue() >= offset.intValue()){
1051          // Replace the char at offsChar with its corresponding entity form
1052          // the entitiesMap.
1053          textWithNodes.replace(offsChar.intValue(),offsChar.intValue()+1,
1054          (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar)));
1055          // Discard the offsChar after it was used because this offset will
1056          // never appear again
1057          offsets2CharsMap.remove(offsChar);
1058          // Investigate next offsChar
1059          if (!offsets2CharsMap.isEmpty())
1060            offsChar = (Integer) offsets2CharsMap.lastKey();
1061        }// End while
1062      }// End if
1063      // Now it is safe to insert the node
1064      textWithNodes.insert(offsetValue,strNode);
1065    }// end while
1066    // Need to replace the entities in the remaining text, if there is any text
1067    // So, if there are any more items in offsets2CharsMap they need to be
1068    // replaced
1069    while (!offsets2CharsMap.isEmpty()){
1070      Integer offsChar = (Integer) offsets2CharsMap.lastKey();
1071      // Replace the char with its entity
1072      textWithNodes.replace(offsChar.intValue(),offsChar.intValue()+1,
1073      (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar)));
1074      // remove the offset from the map
1075      offsets2CharsMap.remove(offsChar);
1076    }// End while
1077    return textWithNodes.toString();
1078  }//textWithNodes()
1079
1080  /** This method saves an AnnotationSet as XML.
1081    * @param anAnnotationSet The annotation set that has to be saved as XML.
1082    * @return a String like this: <AnnotationSet> <Annotation>....
1083    * </AnnotationSet>
1084    */
1085  private String annotationSetToXml(AnnotationSet anAnnotationSet){
1086    StringBuffer str = new StringBuffer("");
1087
1088    if (anAnnotationSet == null){
1089      str.append("<AnnotationSet>\n");
1090      str.append("</AnnotationSet>\n");
1091      return str.toString();
1092    }// End if
1093    if (anAnnotationSet.getName() == null)
1094      str.append("<AnnotationSet>\n");
1095    else str.append("<AnnotationSet Name=\"" + anAnnotationSet.getName()+
1096                                                                    "\" >\n");
1097    // Iterate through AnnotationSet and save each Annotation as XML
1098    Iterator iterator = anAnnotationSet.iterator();
1099    while (iterator.hasNext()){
1100      Annotation annot = (Annotation) iterator.next();
1101      str.append("<Annotation " + "Type=\"" + annot.getType() +
1102                  "\" StartNode=\"" + annot.getStartNode().getOffset() +
1103                   "\" EndNode=\"" + annot.getEndNode().getOffset() + "\">\n");
1104      str.append(featuresToXml(annot.getFeatures()));
1105      str.append("</Annotation>\n");
1106    }// End while
1107
1108    str.append("</AnnotationSet>\n");
1109    return str.toString();
1110  }// annotationSetToXml
1111
  /** Returns a map with the named annotation sets. It returns <code>null</code>
   *  if no named annotation set exists. */
1114  public Map getNamedAnnotationSets() {
1115    return namedAnnotSets;
1116  } // getNamedAnnotationSets
1117
1118  /**
1119   * Removes one of the named annotation sets.
1120   * Note that the default annotation set cannot be removed.
1121   * @param name the name of the annotation set to be removed
1122   */
1123  public void removeAnnotationSet(String name){
1124    Object removed = namedAnnotSets.remove(name);
1125    if(removed != null){
1126      fireAnnotationSetRemoved(
1127        new DocumentEvent(this, DocumentEvent.ANNOTATION_SET_REMOVED, name));
1128    }
1129  }
1130
1131  /** Propagate edit changes to the document content and annotations. */
1132  public void edit(Long start, Long end, DocumentContent replacement)
1133    throws InvalidOffsetException
1134  {
1135    if(! isValidOffsetRange(start, end))
1136      throw new InvalidOffsetException();
1137
1138    if(content != null)
1139      ((DocumentContentImpl) content).edit(start, end, replacement);
1140
1141    if(defaultAnnots != null)
1142      ((AnnotationSetImpl) defaultAnnots).edit(start, end, replacement);
1143
1144    if(namedAnnotSets != null) {
1145      Iterator iter = namedAnnotSets.values().iterator();
1146      while(iter.hasNext())
1147        ((AnnotationSetImpl) iter.next()).edit(start, end, replacement);
1148    }
1149
1150  } // edit(start,end,replacement)
1151
  /** Check that an offset is valid, i.e. it is non-null, greater than
    * or equal to 0 and less than or equal to the size of the document
    * content.
    */
1155  public boolean isValidOffset(Long offset) {
1156    if(offset == null)
1157      return false;
1158
1159    long o = offset.longValue();
1160    if(o > getContent().size().longValue() || o < 0)
1161      return false;
1162
1163    return true;
1164  } // isValidOffset
1165
  /** Check that both start and end are valid offsets and that
    * they constitute a valid offset range, i.e. start is less
    * than or equal to end.
    */
1170  public boolean isValidOffsetRange(Long start, Long end) {
1171    return
1172      isValidOffset(start) && isValidOffset(end) &&
1173      start.longValue() <= end.longValue();
1174  } // isValidOffsetRange(start,end)
1175
1176  /** Sets the nextAnnotationId */
1177  public void setNextAnnotationId(int aNextAnnotationId){
1178    nextAnnotationId = aNextAnnotationId;
1179  }// setNextAnnotationId();
1180
1181  /** Generate and return the next annotation ID */
1182  public Integer getNextAnnotationId() {
1183    return new Integer(nextAnnotationId++);
1184  } // getNextAnnotationId
1185
1186  /** Generate and return the next node ID */
1187  public Integer getNextNodeId() { return new Integer(nextNodeId++); }
1188
1189  /** Ordering based on URL.toString() and the URL offsets (if any) */
1190  public int compareTo(Object o) throws ClassCastException {
1191    DocumentImpl other = (DocumentImpl) o;
1192    return getOrderingString().compareTo(other.getOrderingString());
1193  } // compareTo
1194
1195  /** Utility method to produce a string for comparison in ordering.
1196    * String is based on the source URL and offsets.
1197    */
1198  protected String getOrderingString() {
1199    if(sourceUrl == null) return toString();
1200
1201    StringBuffer orderingString = new StringBuffer(sourceUrl.toString());
1202    if(sourceUrlStartOffset != null && sourceUrlEndOffset != null) {
1203      orderingString.append(sourceUrlStartOffset.toString());
1204      orderingString.append(sourceUrlEndOffset.toString());
1205    }
1206
1207    return orderingString.toString();
1208  } // getOrderingString()
1209
  /** The id of the next new annotation. It is handed out and then
    * incremented by getNextAnnotationId().
    */
  protected int nextAnnotationId = 0;

  /** The id of the next new node. It is handed out and then incremented by
    * getNextNodeId().
    */
  protected int nextNodeId = 0;
  /** The source URL. May be null, in which case getOrderingString() falls
    * back to toString().
    */
  protected URL sourceUrl;

  // NOTE(review): a Javadoc comment reading "The document's URL name." stood
  // here without any declaration following it - confirm whether a field was
  // removed or is still missing.

  /** The content of the document. May be null; edit() and equals() check
    * for that.
    */
  protected DocumentContent content;

  /** The encoding of the source of the document content */
  protected String encoding = "UTF-8";

  // Data needed by the toXml(AnnotationSet) methods

  /** This field indicates whether or not to add the tag
    * called GatePreserveFormat to the document. HTML, XML, SGML docs won't
    * have this tag added
    */
  private boolean addGatePreserveFormatTag = false;

  /** This field indicates if an annotation is the doc's root tag.
    * It is needed when adding the namespace information
    */
  private boolean isRootTag = false;

  /** This field is used when creating StringBuffers for toXml() methods.
    * The size of the StringBuffer will be the document content size
    * multiplied by this value. It is aimed to improve the performance of
    * StringBuffer
    */
  private final int DOC_SIZE_MULTIPLICATION_FACTOR = 1;

  /** Constant used in the inner class AnnotationComparator to order
    * annotations on their start offset
    */
  private final int ORDER_ON_START_OFFSET = 0;
  /** Constant used in the inner class AnnotationComparator to order
    * annotations on their end offset
    */
  private final int ORDER_ON_END_OFFSET = 1;
  /** Constant used in the inner class AnnotationComparator to order
    * annotations on their ID
    */
  private final int ORDER_ON_ANNOT_ID = 2;
  /** Constant used in the inner class AnnotationComparator to order
    * annotations ascending
    */
  private final int ASC = 3;
  /** Constant used in the inner class AnnotationComparator to order
    * annotations descending
    */
  private final int DESC = -3;
1265
  /** A map, filled in by the static initializer below, containing the chars
    * that need to be replaced in strings together with their entities
    */
1269  private static Map entitiesMap = null;
1270  // Initialize the entities map use when saving as xml
1271  static{
1272    entitiesMap = new HashMap();
1273    entitiesMap.put(new Character('<'),"&lt;");
1274    entitiesMap.put(new Character('>'),"&gt;");
1275    entitiesMap.put(new Character('&'),"&amp;");
1276    entitiesMap.put(new Character('\''),"&apos;");
1277    entitiesMap.put(new Character('"'),"&quot;");
1278    entitiesMap.put(new Character((char)160),"&#160;");
1279    entitiesMap.put(new Character((char)169),"&#169;");
1280  }//static
1281
  // The range that the content comes from at the source URL (or null if
  // none). The array form below is disabled; the range is kept in the two
  // separate fields sourceUrlStartOffset and sourceUrlEndOffset instead.
  //protected Long[] sourceUrlOffsets;

  /** The start of the range that the content comes from at the source URL
    * (or null if none).
    */
  protected Long sourceUrlStartOffset;

  /** The end of the range that the content comes from at the source URL
    * (or null if none).
    */
  protected Long sourceUrlEndOffset;

  /** The default annotation set */
  protected AnnotationSet defaultAnnots;

  /** Named sets of annotations. May be null; the code using it checks for
    * null before iterating.
    */
  protected Map namedAnnotSets;

  /**
   * A property of the document that will be set when the user
   * wants to create the document from a string, as opposed to from
   * a URL. It is read and written by getStringContent() and
   * setStringContent().
   */
  private String stringContent;
1309
1310  /**
1311   * The stringContent of a document is
1312   * a property of the document that will be set when the user
1313   * wants to create the document from a string, as opposed to from
1314   * a URL.
1315   * <B>Use the <TT>getContent</TT> method instead to get the actual document
1316   * content.</B>
1317   */
1318  public String getStringContent() { return stringContent; }
1319
1320  /**
1321   * The stringContent of a document is
1322   * a property of the document that will be set when the user
1323   * wants to create the document from a string, as opposed to from
1324   * a URL.
1325   * <B>Use the <TT>setContent</TT> method instead to update the actual
1326   * document content.</B>
1327   */
  public void setStringContent(String stringContent) {
    // Only the property is stored; nothing else is updated here
    this.stringContent = stringContent;
  } // set StringContent
1331
1332  /** Is the document markup-aware? */
1333  protected Boolean markupAware = new Boolean(false);
1334
1335  /** Check: test 2 objects for equality */
1336  protected boolean check(Object a, Object b) {
1337    if( (a == null || b == null) )
1338      return a == b;
1339
1340    return a.equals(b);
1341  } // check(a,b)
1342
1343  /** Equals */
1344  public boolean equals(Object other) {
1345    if(other == null ||
1346       !(other instanceof DocumentImpl))return false;
1347    DocumentImpl doc = (DocumentImpl) other;
1348
1349// PENDING EQUALS IMPLS
1350    if(! check(content, doc.content)) return false;
1351    if(! check(defaultAnnots, doc.defaultAnnots)) return false;
1352    if(! check(encoding, doc.encoding)) return false;
1353    if(! check(features, doc.features)) return false;
1354    if(!markupAware.equals(doc.markupAware)) return false;
1355    if(! check(namedAnnotSets, doc.namedAnnotSets)) return false;
1356    if(nextAnnotationId != doc.nextAnnotationId) return false;
1357    if(nextNodeId != doc.nextNodeId) return false;
1358    if(! check(sourceUrl, doc.sourceUrl)) return false;
1359    if(! check(sourceUrlStartOffset, doc.sourceUrlStartOffset)) return false;
1360    if(! check(sourceUrlEndOffset, doc.sourceUrlEndOffset)) return false;
1361
1362    return true;
1363  } // equals
1364
1365  /** Hash code */
1366  public int hashCode() {
1367    int code = getContent().hashCode();
1368    int memberCode = (defaultAnnots == null) ? 0 : defaultAnnots.hashCode();
1369    code += memberCode;
1370    memberCode = (encoding == null) ? 0 : encoding.hashCode();
1371    code += memberCode;
1372    memberCode = (features == null) ? 0 : features.hashCode();
1373    code += memberCode;
1374    code += (markupAware.booleanValue()) ? 0 : 1;
1375    memberCode = (namedAnnotSets == null) ? 0 : namedAnnotSets.hashCode();
1376    code += memberCode;
1377    code += nextAnnotationId;
1378    code += nextNodeId;
1379    memberCode = (sourceUrl == null) ? 0 : sourceUrl.hashCode();
1380    code += memberCode;
1381    memberCode =
1382      (sourceUrlStartOffset == null) ? 0 : sourceUrlStartOffset.hashCode();
1383    code += memberCode;
1384    memberCode =
1385      (sourceUrlEndOffset == null) ? 0 : sourceUrlEndOffset.hashCode();
1386    code += memberCode;
1387    return code;
1388  } // hashcode
1389
  /** String representation */
1391  public String toString() {
1392    String n = Strings.getNl();
1393    StringBuffer s = new StringBuffer("DocumentImpl: " + n);
1394    s.append("  content:" + content + n);
1395    s.append("  defaultAnnots:" + defaultAnnots + n);
1396    s.append("  encoding:" + encoding + n);
1397    s.append("  features:" + features + n);
1398    s.append("  markupAware:" + markupAware + n);
1399    s.append("  namedAnnotSets:" + namedAnnotSets + n);
1400    s.append("  nextAnnotationId:" + nextAnnotationId + n);
1401    s.append("  nextNodeId:" + nextNodeId + n);
1402    s.append("  sourceUrl:" + sourceUrl + n);
1403    s.append("  sourceUrlStartOffset:" + sourceUrlStartOffset + n);
1404    s.append("  sourceUrlEndOffset:" + sourceUrlEndOffset + n);
1405    s.append(n);
1406
1407    return s.toString();
1408  } // toString
1409
  /** Freeze the serialization UID, so that it is not recomputed when the
    * class changes.
    */
  static final long serialVersionUID = -8456893608311510260L;
1412
1413  /** Inner class needed to compare annotations*/
1414  class AnnotationComparator implements java.util.Comparator {
1415    int orderOn = -1;
1416    int orderType = ASC;
1417    /** Constructs a comparator according to one of three sorter types:
1418      * ORDER_ON_ANNOT_TYPE, ORDER_ON_END_OFFSET, ORDER_ON_START_OFFSET
1419      */
1420      public AnnotationComparator(int anOrderOn, int anOrderType){
1421        orderOn = anOrderOn;
1422        orderType = anOrderType;
1423      }// AnnotationComparator()
1424
1425      /**This method must be implemented according to Comparator interface */
1426      public int compare(Object o1, Object o2){
1427        Annotation a1 = (Annotation) o1;
1428        Annotation a2 = (Annotation) o2;
1429        // ORDER_ON_START_OFFSET ?
1430        if (orderOn == ORDER_ON_START_OFFSET){
1431          int result = a1.getStartNode().getOffset().compareTo(
1432                                                a2.getStartNode().getOffset());
1433          if (orderType == ASC){
1434            // ASC
1435            // If they are equal then their ID will decide.
1436            if (result == 0)
1437              return a1.getId().compareTo(a2.getId());
1438            return result;
1439          }else{
1440            // DESC
1441            if (result == 0)
1442              return - (a1.getId().compareTo(a2.getId()));
1443            return -result;
1444          }// End if (orderType == ASC)
1445        }// End if (orderOn == ORDER_ON_START_OFFSET)
1446
1447        // ORDER_ON_END_OFFSET ?
1448        if (orderOn == ORDER_ON_END_OFFSET){
1449          int result = a1.getEndNode().getOffset().compareTo(
1450                                                a2.getEndNode().getOffset());
1451          if (orderType == ASC){
1452            // ASC
1453            // If they are equal then their ID will decide.
1454            if (result == 0)
1455              return - (a1.getId().compareTo(a2.getId()));
1456            return result;
1457          }else{
1458            // DESC
1459            // If they are equal then their ID will decide.
1460            if (result == 0)
1461              return a1.getId().compareTo(a2.getId());
1462            return - result;
1463          }// End if (orderType == ASC)
1464        }// End if (orderOn == ORDER_ON_END_OFFSET)
1465
1466        // ORDER_ON_ANNOT_ID ?
1467        if (orderOn == ORDER_ON_ANNOT_ID){
1468          if (orderType == ASC)
1469            return a1.getId().compareTo(a2.getId());
1470          else
1471            return -(a1.getId().compareTo(a2.getId()));
1472        }// End if
1473        return 0;
1474      }//compare()
1475  } // End inner class AnnotationComparator
1476
1477
  /** The DocumentListeners registered through addDocumentListener(), in
    * registration order. Starts out null; the add/remove methods never
    * modify the current Vector but replace it with an updated copy.
    */
  private transient Vector documentListeners;
  /** Transient listener list; not referenced by the listener methods that
    * follow.
    */
  private transient Vector gateListeners;
1480
1481  public synchronized void removeDocumentListener(DocumentListener l) {
1482    if (documentListeners != null && documentListeners.contains(l)) {
1483      Vector v = (Vector) documentListeners.clone();
1484      v.removeElement(l);
1485      documentListeners = v;
1486    }
1487  }
1488  public synchronized void addDocumentListener(DocumentListener l) {
1489    Vector v = documentListeners == null ? new Vector(2) : (Vector) documentListeners.clone();
1490    if (!v.contains(l)) {
1491      v.addElement(l);
1492      documentListeners = v;
1493    }
1494  }
1495  protected void fireAnnotationSetAdded(DocumentEvent e) {
1496    if (documentListeners != null) {
1497      Vector listeners = documentListeners;
1498      int count = listeners.size();
1499      for (int i = 0; i < count; i++) {
1500        ((DocumentListener) listeners.elementAt(i)).annotationSetAdded(e);
1501      }
1502    }
1503  }
1504  protected void fireAnnotationSetRemoved(DocumentEvent e) {
1505    if (documentListeners != null) {
1506      Vector listeners = documentListeners;
1507      int count = listeners.size();
1508      for (int i = 0; i < count; i++) {
1509        ((DocumentListener) listeners.elementAt(i)).annotationSetRemoved(e);
1510      }
1511    }
1512  }
1513  public void resourceLoaded(CreoleEvent e) {
1514  }
1515  public void resourceUnloaded(CreoleEvent e) {
1516  }
1517  public void datastoreOpened(CreoleEvent e) {
1518  }
1519  public void datastoreCreated(CreoleEvent e) {
1520  }
1521  public void datastoreClosed(CreoleEvent e) {
1522    if (! e.getDatastore().equals(this.getDataStore()))
1523      return;
1524    //close this lr, since it cannot stay open when the DS it comes from
1525    //is closed
1526    Factory.deleteResource(this);
1527  }
1528  public void setLRPersistenceId(Object lrID) {
1529    super.setLRPersistenceId( lrID);
1530    //make persistent documents listen to the creole register
1531    //for events about their DS
1532    Gate.getCreoleRegister().addCreoleListener(this);
1533  }
1534
1535} // class DocumentImpl
1536