|
DocumentImpl |
|
1 /* 2 * DocumentImpl.java 3 * 4 * Copyright (c) 1998-2001, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Hamish Cunningham, 11/Feb/2000 12 * 13 * $Id: DocumentImpl.java,v 1.91 2001/12/03 15:42:04 kalina Exp $ 14 */ 15 16 package gate.corpora; 17 18 import java.util.*; 19 import java.net.*; 20 import java.io.*; 21 22 import gate.*; 23 import gate.annotation.*; 24 import gate.util.*; 25 import gate.creole.*; 26 import gate.gui.*; 27 import gate.event.*; 28 29 /** Represents the commonalities between all sorts of documents. 30 * 31 * <H2>Editing</H2> 32 * 33 * <P> 34 * The DocumentImpl class implements the Document interface. 35 * The DocumentContentImpl class models the textual or audio-visual 36 * materials which are the source and content of Documents. 37 * The AnnotationSetImpl class supplies annotations on Documents. 38 * 39 * <P> 40 * Abbreviations: 41 * 42 * <UL> 43 * <LI> 44 * DC = DocumentContent 45 * <LI> 46 * D = Document 47 * <LI> 48 * AS = AnnotationSet 49 * </UL> 50 * 51 * <P> 52 * We add an edit method to each of these classes; for DC and AS 53 * the methods are package private; D has the public method. 54 * 55 * <PRE> 56 * void edit(Long start, Long end, DocumentContent replacement) 57 * throws InvalidOffsetException; 58 * </PRE> 59 * 60 * <P> 61 * D receives edit requests and forwards them to DC and AS. 62 * On DC, this method makes a change to the content - e.g. replacing 63 * a String range from start to end with replacement. (Deletions 64 * are catered for by having replacement = null.) D then calls 65 * AS.edit on each of its annotation sets. 66 * 67 * <P> 68 * On AS, edit calls replacement.size() (i.e. DC.size()) to 69 * figure out how long the replacement is (0 for null). It then 70 * considers annotations that terminate (start or end) in 71 * the altered or deleted range as invalid; annotations that 72 * terminate after the range have their offsets adjusted. 73 * I.e.: 74 * <UL> 75 * <LI> 76 * the nodes that pointed inside the old modified area are invalid now and 77 * will be deleted along with the connected annotations; 78 * <LI> 79 * the nodes that are before the start of the modified area remain 80 * untouched; 81 * <LI> 82 * the nodes that are after the end of the affected area will have the 83 * offset changed according to the formula below. 84 * </UL> 85 * 86 * <P> 87 * A note re. AS and annotations: annotations no longer have 88 * offsets as in the old model, they now have nodes, and nodes 89 * have offsets. 90 * 91 * <P> 92 * To implement AS.edit, we have several indices: 93 * <PRE> 94 * HashMap annotsByStartNode, annotsByEndNode; 95 * </PRE> 96 * which map node ids to annotations; 97 * <PRE> 98 * RBTreeMap nodesByOffset; 99 * </PRE> 100 * which maps offset to Nodes. 101 * 102 * <P> 103 * When we get an edit request, we traverse that part of the 104 * nodesByOffset tree representing the altered or deleted 105 * range of the DC. For each node found, we delete any annotations 106 * that terminate on the node, and then delete the node itself. 107 * We then traverse the rest of the tree, changing the offset 108 * on all remaining nodes by: 109 * <PRE> 110 * newOffset = 111 * oldOffset - 112 * ( 113 * (end - start) - // size of mod 114 * ( (replacement == null) ? 0 : replacement.size() ) // size of repl 115 * ); 116 * </PRE> 117 * Note that we use the same convention as e.g. java.lang.String: start 118 * offsets are inclusive; end offsets are exclusive. I.e. for string "abcd" 119 * range 1-3 = "bc". Examples, for a node with offset 4: 120 * <PRE> 121 * edit(1, 3, "BC"); 122 * newOffset = 4 - ( (3 - 1) - 2 ) = 4 123 * 124 * edit(1, 3, null); 125 * newOffset = 4 - ( (3 - 1) - 0 ) = 2 126 * 127 * edit(1, 3, "BBCC"); 128 * newOffset = 4 - ( (3 - 1) - 4 ) = 6 129 * </PRE> 130 */ 131 public class DocumentImpl 132 extends AbstractLanguageResource implements Document, CreoleListener { 133 /** Debug flag */ 134 private static final boolean DEBUG = false; 135 136 /** Default construction. Content left empty. */ 137 public DocumentImpl() { 138 content = new DocumentContentImpl(); 139 } // default construction 140 141 /** Initialise this resource, and return it. */ 142 public Resource init() throws ResourceInstantiationException { 143 144 // set up the source URL and create the content 145 if(sourceUrl == null) { 146 if(stringContent == null) { 147 throw new ResourceInstantiationException( 148 "The sourceURL and document's content were null." 149 ); 150 } 151 content = new DocumentContentImpl(stringContent); 152 getFeatures().put("gate.SourceURL", "created from String"); 153 } else { 154 try { 155 content = new DocumentContentImpl( 156 sourceUrl, encoding, sourceUrlStartOffset, sourceUrlEndOffset 157 ); 158 getFeatures().put("gate.SourceURL", sourceUrl.toExternalForm()); 159 } catch(IOException e) { 160 throw new ResourceInstantiationException("DocumentImpl.init: " + e); 161 } 162 } 163 164 // set up a DocumentFormat if markup unpacking required 165 if(getMarkupAware().booleanValue()) { 166 DocumentFormat docFormat = 167 DocumentFormat.getDocumentFormat(this, sourceUrl); 168 try { 169 if(docFormat != null){ 170 StatusListener sListener = (StatusListener) 171 gate.gui.MainFrame.getListeners(). 172 get("gate.event.StatusListener"); 173 if(sListener != null) docFormat.addStatusListener(sListener); 174 docFormat.unpackMarkup(this); 175 docFormat.removeStatusListener(sListener); 176 } //if format != null 177 } catch(DocumentFormatException e) { 178 throw new ResourceInstantiationException( 179 "Couldn't unpack markup in document " + sourceUrl.toExternalForm() + 180 " " + e 181 ); 182 } 183 } // if markup aware 184 185 return this; 186 } // init() 187 188 /** Clear all the data members of the object. */ 189 public void cleanup() { 190 191 defaultAnnots = null; 192 if ( (namedAnnotSets != null) && (!namedAnnotSets.isEmpty())) 193 namedAnnotSets.clear(); 194 if (DEBUG) Out.prln("Document cleanup called"); 195 if (this.lrPersistentId != null) 196 Gate.getCreoleRegister().removeCreoleListener(this); 197 } // cleanup() 198 199 200 /** Documents are identified by URLs */ 201 public URL getSourceUrl() { return sourceUrl; } 202 203 /** Set method for the document's URL */ 204 public void setSourceUrl(URL sourceUrl) { 205 this.sourceUrl = sourceUrl; 206 } // setSourceUrl 207 208 /** Documents may be packed within files; in this case an optional pair of 209 * offsets refer to the location of the document. 210 */ 211 public Long[] getSourceUrlOffsets() { 212 Long[] sourceUrlOffsets = new Long[2]; 213 sourceUrlOffsets[0] = sourceUrlStartOffset; 214 sourceUrlOffsets[1] = sourceUrlEndOffset; 215 return sourceUrlOffsets; 216 } // getSourceUrlOffsets 217 218 /** Documents may be packed within files; in this case an optional pair of 219 * offsets refer to the location of the document. This method gets the 220 * start offset. 221 */ 222 public Long getSourceUrlStartOffset() { return sourceUrlStartOffset; } 223 224 /** Documents may be packed within files; in this case an optional pair of 225 * offsets refer to the location of the document. This method sets the 226 * start offset. 227 */ 228 public void setSourceUrlStartOffset(Long sourceUrlStartOffset) { 229 this.sourceUrlStartOffset = sourceUrlStartOffset; 230 } // setSourceUrlStartOffset 231 232 /** Documents may be packed within files; in this case an optional pair of 233 * offsets refer to the location of the document. This method gets the 234 * end offset. 235 */ 236 public Long getSourceUrlEndOffset() { return sourceUrlEndOffset; } 237 238 /** Documents may be packed within files; in this case an optional pair of 239 * offsets refer to the location of the document. This method sets the 240 * end offset. 241 */ 242 public void setSourceUrlEndOffset(Long sourceUrlEndOffset) { 243 this.sourceUrlEndOffset = sourceUrlEndOffset; 244 } // setSourceUrlStartOffset 245 246 /** The content of the document: a String for text; MPEG for video; etc. */ 247 public DocumentContent getContent() { return content; } 248 249 /** Set method for the document content */ 250 public void setContent(DocumentContent content) { this.content = content; } 251 252 /** Get the encoding of the document content source */ 253 public String getEncoding() { return encoding; } 254 255 /** Set the encoding of the document content source */ 256 public void setEncoding(String encoding) { this.encoding = encoding; } 257 258 /** Get the default set of annotations. The set is created if it 259 * doesn't exist yet. 260 */ 261 public AnnotationSet getAnnotations() { 262 if(defaultAnnots == null){ 263 defaultAnnots = new AnnotationSetImpl(this); 264 fireAnnotationSetAdded(new DocumentEvent( 265 this, DocumentEvent.ANNOTATION_SET_ADDED, null)); 266 }//if 267 return defaultAnnots; 268 } // getAnnotations() 269 270 /** Get a named set of annotations. Creates a new set if one with this 271 * name doesn't exist yet. 272 * If the provided name is null then it returns the default annotation set. 273 */ 274 public AnnotationSet getAnnotations(String name) { 275 if(name == null) return getAnnotations(); 276 if(namedAnnotSets == null) 277 namedAnnotSets = new HashMap(); 278 AnnotationSet namedSet = (AnnotationSet) namedAnnotSets.get(name); 279 280 if(namedSet == null) { 281 namedSet = new AnnotationSetImpl(this, name); 282 namedAnnotSets.put(name, namedSet); 283 284 DocumentEvent evt = new DocumentEvent( 285 this, DocumentEvent.ANNOTATION_SET_ADDED, name 286 ); 287 fireAnnotationSetAdded(evt); 288 } 289 return namedSet; 290 } // getAnnotations(name) 291 292 /** Make the document markup-aware. This will trigger the creation 293 * of a DocumentFormat object at Document initialisation time; the 294 * DocumentFormat object will unpack the markup in the Document and 295 * add it as annotations. Documents are <B>not</B> markup-aware by default. 296 * 297 * @param b markup awareness status. 298 */ 299 public void setMarkupAware(Boolean newMarkupAware) { 300 this.markupAware = newMarkupAware; 301 } 302 303 /** Get the markup awareness status of the Document. 304 * <B>Documents are markup-aware by default.</B> 305 * @return whether the Document is markup aware. 306 */ 307 public Boolean getMarkupAware() { return markupAware; } 308 309 /** Returns an XML document aming to preserve the original markups( 310 * the original markup will be in the same place and format as it was 311 * before processing the document) and include (if possible) 312 * the annotations specified in the aSourceAnnotationSet. 313 * <b>Warning:</b> Annotations from the aSourceAnnotationSet will be lost 314 * if they will cause a crosed over situation. 315 * @param aSourceAnnotationSet is an annotation set containing all the 316 * annotations that will be combined with the original marup set. If the 317 * param is <code>null</code> it will only dump the original markups. 318 * @return a string representing an XML document containing the original 319 * markup + dumped annotations form the aSourceAnnotationSet 320 */ 321 public String toXml(Set aSourceAnnotationSet){ 322 AnnotationSet originalMarkupsAnnotSet = 323 this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME); 324 325 // Create a dumping annotation set on the document. It will be used for 326 // dumping annotations... 327 AnnotationSet dumpingSet = new AnnotationSetImpl((Document) this); 328 329 // This set will be constructed inside this method. If is not empty, the 330 // annotation contained will be lost. 331 if (!dumpingSet.isEmpty()){ 332 Out.prln("WARNING: The dumping annotation set was not empty."+ 333 "All annotation it contained were lost."); 334 dumpingSet.clear(); 335 }// End if 336 337 StatusListener sListener = (StatusListener) 338 gate.gui.MainFrame.getListeners(). 339 get("gate.event.StatusListener"); 340 // Construct the dumping set in that way that all annotations will verify 341 // the condition that there are not annotations which are crossed. 342 // First add all annotation from the original markups 343 if(sListener != null) 344 sListener.statusChanged("Constructing the dumping annotation set."); 345 dumpingSet.addAll(originalMarkupsAnnotSet); 346 // Then take all the annotations from aSourceAnnotationSet and verify if 347 // they can be inserted safely into the dumpingSet. Where not possible, 348 // report. 349 if (aSourceAnnotationSet != null){ 350 Iterator iter = aSourceAnnotationSet.iterator(); 351 while (iter.hasNext()){ 352 Annotation currentAnnot = (Annotation) iter.next(); 353 if(insertsSafety(dumpingSet,currentAnnot)){ 354 dumpingSet.add(currentAnnot); 355 }else{ 356 Out.prln("Warning: Annotation with ID=" + currentAnnot.getId() + 357 ", startOffset=" + currentAnnot.getStartNode().getOffset() + 358 ", endOffset=" + currentAnnot.getEndNode().getOffset() + 359 ", type=" + currentAnnot.getType()+ " was found to violate the" + 360 " crossed over condition. It will be discarded"); 361 }// End if 362 }// End while 363 }// End if 364 365 // The dumpingSet is ready to be exported as XML 366 // Here we go. 367 if(sListener != null) sListener.statusChanged("Dumping annotations as XML"); 368 StringBuffer xmlDoc = new StringBuffer( 369 DOC_SIZE_MULTIPLICATION_FACTOR*(this.getContent().size().intValue())); 370 // Add xml header 371 // xmlDoc.append("<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n"); 372 373 // If the annotation set contains this "GatePreserveFormat" 374 // type, then this is removed because it will be added in the saving 375 // process. The reason of this removal is that if the loaded document 376 // was previously loaded from a GatePreserveFormat then we 377 // don't want to create lots of annotation for this type. This annotation 378 // type should be always the root element of a XML preserving format 379 // GATE document. 380 FeatureMap docFeatures = this.getFeatures(); 381 String mimeTypeStr = null; 382 // addGatePreserveFormatTag = false; 383 if ( docFeatures != null && 384 null != (mimeTypeStr=(String)docFeatures.get("MimeType")) && 385 ( 386 "text/html".equalsIgnoreCase(mimeTypeStr) || 387 "text/xml".equalsIgnoreCase(mimeTypeStr) || 388 "text/sgml".equalsIgnoreCase(mimeTypeStr) 389 ) 390 ){ 391 /* don't add the root tag */ 392 }else{ 393 // Add the root start element 394 // xmlDoc.append("<GatePreserveFormat"+ 395 // " xmlns:gate=\"http://www.gate.ac.uk\"" + 396 // " gate:annotMaxId=\"" + 397 // getNextAnnotationId() + 398 // "\">"); 399 // addGatePreserveFormatTag = true; 400 }// End if 401 402 xmlDoc.append(saveAnnotationSetAsXml(dumpingSet)); 403 404 // if (addGatePreserveFormatTag){ 405 // // Add the root end element 406 // xmlDoc.append("</GatePreserveFormat>"); 407 // }// End if 408 if(sListener != null) sListener.statusChanged("Done."); 409 return xmlDoc.toString(); 410 }//End toXml() 411 412 /** This method verifies if aSourceAnnotation can ve inserted safety into the 413 * aTargetAnnotSet. Safety means that it doesn't violate the crossed over 414 * contition with any annotation from the aTargetAnnotSet. 415 * @param aTargetAnnotSet the annotation set to include the aSourceAnnotation 416 * @param aSourceAnnotation the annotation to be inserted into the 417 * aTargetAnnotSet 418 * @return true if the annotation inserts safety, or false otherwise. 419 */ 420 private boolean insertsSafety(AnnotationSet aTargetAnnotSet, 421 Annotation aSourceAnnotation){ 422 423 if (aTargetAnnotSet == null || aSourceAnnotation == null) return false; 424 if (aSourceAnnotation.getStartNode() == null || 425 aSourceAnnotation.getStartNode().getOffset()== null) return false; 426 if (aSourceAnnotation.getEndNode() == null || 427 aSourceAnnotation.getEndNode().getOffset()== null) return false; 428 429 // Get the start and end offsets 430 Long start = aSourceAnnotation.getStartNode().getOffset(); 431 Long end = aSourceAnnotation.getEndNode().getOffset(); 432 // Read aSourceAnnotation offsets long 433 long s2 = start.longValue(); 434 long e2 = end.longValue(); 435 436 // Obtain a set with all annotations annotations that overlap 437 // totaly or partially with the interval defined by the two provided offsets 438 AnnotationSet as = aTargetAnnotSet.get(start,end); 439 440 // Investigate all the annotations from as to see if there is one that 441 // comes in conflict with aSourceAnnotation 442 Iterator it = as.iterator(); 443 while(it.hasNext()){ 444 Annotation ann = (Annotation) it.next(); 445 // Read ann offsets 446 long s1 = ann.getStartNode().getOffset().longValue(); 447 long e1 = ann.getEndNode().getOffset().longValue(); 448 449 if (s1<s2 && s2<e1 && e1<e2) return false; 450 if (s2<s1 && s1<e2 && e2<e1) return false; 451 }// End while 452 return true; 453 }// insertsSafety() 454 455 /** This method saves all the annotations from aDumpAnnotSet and combines 456 * them with the document content. 457 * @param aDumpAnnotationSet is a GATE annotation set prepared to be used 458 * on the raw text from document content. If aDumpAnnotSet is <b>null<b> 459 * then an empty string will be returned. 460 * @return The XML document obtained from raw text + the information from 461 * the dump annotation set. 462 */ 463 private String saveAnnotationSetAsXml(AnnotationSet aDumpAnnotSet){ 464 String content = null; 465 if (this.getContent()== null) 466 content = new String(""); 467 else 468 content = this.getContent().toString(); 469 StringBuffer docContStrBuff = filterNonXmlChars(new StringBuffer(content)); 470 if (aDumpAnnotSet == null) return docContStrBuff.toString(); 471 472 TreeMap offsets2CharsMap = new TreeMap(); 473 if (this.getContent().size().longValue() != 0){ 474 // Fill the offsets2CharsMap with all the indices where 475 // special chars appear 476 buildEntityMapFromString(content,offsets2CharsMap); 477 }//End if 478 // The saving alghorithm is as follows: 479 /////////////////////////////////////////// 480 // Construct a set of annot with all IDs in asc order. 481 // All annotations that end at that offset swap their place in descending 482 // order. For each node write all the tags from left to right. 483 484 // Construct the node set 485 TreeSet offsets = new TreeSet(); 486 Iterator iter = aDumpAnnotSet.iterator(); 487 while (iter.hasNext()){ 488 Annotation annot = (Annotation) iter.next(); 489 offsets.add(annot.getStartNode().getOffset()); 490 offsets.add(annot.getEndNode().getOffset()); 491 }// End while 492 isRootTag = false; 493 // ofsets is sorted in ascending order. 494 // Iterate this set in descending order and remove an offset at each 495 // iteration 496 while (!offsets.isEmpty()){ 497 Long offset = (Long)offsets.last(); 498 // Remove the offset from the set 499 offsets.remove(offset); 500 // Now, use it. 501 // Returns a list with annotations that needs to be serialized in that 502 // offset. 503 List annotations = getAnnotationsForOffset(aDumpAnnotSet,offset); 504 // Attention: the annotation are serialized from left to right 505 StringBuffer tmpBuff = new StringBuffer(""); 506 Stack stack = new Stack(); 507 // Iterate through all these annotations and serialize them 508 Iterator it = annotations.iterator(); 509 while(it.hasNext()){ 510 Annotation a = (Annotation) it.next(); 511 it.remove(); 512 // Test if a Ends at offset 513 if ( offset.equals(a.getEndNode().getOffset()) ){ 514 // Test if a Starts at offset 515 if ( offset.equals(a.getStartNode().getOffset()) ){ 516 // Here, the annotation a Starts and Ends at the offset 517 if ( null != a.getFeatures().get("isEmptyAndSpan") && 518 "true".equals((String)a.getFeatures().get("isEmptyAndSpan"))){ 519 520 // Assert: annotation a with start == end and isEmptyAndSpan 521 if (offsets.isEmpty() && "".equals(tmpBuff.toString())){ 522 // a is the doc's root tag to be written 523 // The annotations are serialized from left to right. 524 // The first annot in the last offset is the ROOT one 525 isRootTag = true; 526 }// End if 527 tmpBuff.append(writeStartTag(a)); 528 stack.push(a); 529 }else{ 530 // Assert annotation a with start == end and an empty tag 531 tmpBuff.append(writeEmptyTag(a)); 532 // The annotation is removed from dumped set 533 aDumpAnnotSet.remove(a); 534 }// End if 535 }else{ 536 // Here the annotation a Ends at the offset. 537 // In this case empty the stack and write the end tag 538 if (!stack.isEmpty()){ 539 while(!stack.isEmpty()){ 540 Annotation a1 = (Annotation)stack.pop(); 541 tmpBuff.append(writeEndTag(a1)); 542 }// End while 543 }// End if 544 tmpBuff.append(writeEndTag(a)); 545 }// End if 546 }else{ 547 // The annotation a does NOT end at the offset. Let's see if it starts 548 // at the offset 549 if ( offset.equals(a.getStartNode().getOffset()) ){ 550 // The annotation a starts at the offset. 551 // In this case empty the stack and write the end tag 552 if (!stack.isEmpty()){ 553 while(!stack.isEmpty()){ 554 Annotation a1 = (Annotation)stack.pop(); 555 tmpBuff.append(writeEndTag(a1)); 556 }// End while 557 }// End if 558 if (offsets.isEmpty() && "".equals(tmpBuff.toString())){ 559 // a is the last tag to be written 560 // The annotations are serialized from left to right. 561 // The first annot in the last offset is the ROOT one. 562 isRootTag = true; 563 }// End if 564 tmpBuff.append(writeStartTag(a)); 565 // The annotation is removed from dumped set 566 aDumpAnnotSet.remove(a); 567 }// End if ( offset.equals(a.getStartNode().getOffset()) ) 568 }// End if ( offset.equals(a.getEndNode().getOffset()) ) 569 }// End while(it.hasNext()){ 570 571 // In this case empty the stack and write the end tag 572 if (!stack.isEmpty()){ 573 while(!stack.isEmpty()){ 574 Annotation a1 = (Annotation)stack.pop(); 575 tmpBuff.append(writeEndTag(a1)); 576 }// End while 577 }// End if 578 579 // Before inserting tmpBuff into docContStrBuff we need to check 580 // if there are chars to be replaced and if there are, they would be 581 // replaced. 582 if (!offsets2CharsMap.isEmpty()){ 583 Integer offsChar = (Integer) offsets2CharsMap.lastKey(); 584 while( !offsets2CharsMap.isEmpty() && 585 offsChar.intValue() >= offset.intValue()){ 586 // Replace the char at offsChar with its corresponding entity form 587 // the entitiesMap. 588 docContStrBuff.replace(offsChar.intValue(),offsChar.intValue()+1, 589 (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar))); 590 // Discard the offsChar after it was used. 591 offsets2CharsMap.remove(offsChar); 592 // Investigate next offsChar 593 if (!offsets2CharsMap.isEmpty()) 594 offsChar = (Integer) offsets2CharsMap.lastKey(); 595 }// End while 596 }// End if 597 // Insert tmpBuff to the location where it belongs in docContStrBuff 598 docContStrBuff.insert(offset.intValue(),tmpBuff.toString()); 599 }// End while(!offsets.isEmpty()) 600 // Need to replace the entities in the remaining text, if there is any text 601 // So, if there are any more items in offsets2CharsMap they need to be 602 // replaced 603 while (!offsets2CharsMap.isEmpty()){ 604 Integer offsChar = (Integer) offsets2CharsMap.lastKey(); 605 // Replace the char with its entity 606 docContStrBuff.replace(offsChar.intValue(),offsChar.intValue()+1, 607 (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar))); 608 // remove the offset from the map 609 offsets2CharsMap.remove(offsChar); 610 }// End while 611 return docContStrBuff.toString(); 612 }// saveAnnotationSetAsXml() 613 614 /** This method returns a list with annotations ordered that way that 615 * they can be serialized from left to right, at the offset. If one of the 616 * params is null then an empty list will be returned. 617 * @param aDumpAnnotSet is a set containing all annotations that will be 618 * dumped. 619 * @param offset represent the offset at witch the annotation must start 620 * AND/OR end. 621 * @return a list with those annotations that need to be serialized. 622 */ 623 private List getAnnotationsForOffset(AnnotationSet aDumpAnnotSet,Long offset){ 624 List annotationList = new LinkedList(); 625 if (aDumpAnnotSet == null || offset == null) return annotationList; 626 Set annotThatStartAtOffset = new TreeSet( 627 new AnnotationComparator(ORDER_ON_END_OFFSET,DESC)); 628 Set annotThatEndAtOffset = new TreeSet( 629 new AnnotationComparator(ORDER_ON_START_OFFSET,DESC)); 630 Set annotThatStartAndEndAtOffset = new TreeSet( 631 new AnnotationComparator(ORDER_ON_ANNOT_ID,ASC)); 632 633 // Fill these tree lists with annotation tat start, end or start and 634 // end at the offset. 635 Iterator iter = aDumpAnnotSet.iterator(); 636 while(iter.hasNext()){ 637 Annotation ann = (Annotation) iter.next(); 638 if (offset.equals(ann.getStartNode().getOffset())){ 639 if (offset.equals(ann.getEndNode().getOffset())) 640 annotThatStartAndEndAtOffset.add(ann); 641 else 642 annotThatStartAtOffset.add(ann); 643 }else{ 644 if (offset.equals(ann.getEndNode().getOffset())) 645 annotThatEndAtOffset.add(ann); 646 }// End if 647 }// End while 648 annotationList.addAll(annotThatEndAtOffset); 649 annotThatEndAtOffset = null; 650 annotationList.addAll(annotThatStartAtOffset); 651 annotThatStartAtOffset = null; 652 iter = annotThatStartAndEndAtOffset.iterator(); 653 while(iter.hasNext()){ 654 Annotation ann = (Annotation) iter.next(); 655 Iterator it = annotationList.iterator(); 656 boolean breaked = false; 657 while (it.hasNext()){ 658 Annotation annFromList = (Annotation) it.next(); 659 if (annFromList.getId().intValue() > ann.getId().intValue()){ 660 annotationList.add(annotationList.indexOf(annFromList),ann); 661 breaked = true; 662 break; 663 }// End if 664 }// End while 665 if (!breaked) 666 annotationList.add(ann); 667 iter.remove(); 668 }// End while 669 return annotationList; 670 }// getAnnotationsForOffset() 671 672 /** Returns a string representing a start tag based on the input annot*/ 673 private String writeStartTag(Annotation annot){ 674 StringBuffer strBuff = new StringBuffer(""); 675 if (annot == null) return strBuff.toString(); 676 if (!addGatePreserveFormatTag && isRootTag){ 677 strBuff.append("<"+annot.getType()+ 678 " xmlns:gate=\"http://www.gate.ac.uk\"" + 679 " gate:gateId=\"" + annot.getId()+"\"" + 680 " gate:annotMaxId=\"" + getNextAnnotationId() + "\""+ 681 writeFeatures(annot.getFeatures())+" >"); 682 // Once the root tag was writen then there will be no other Root tag 683 isRootTag = false; 684 }else{ 685 strBuff.append("<"+annot.getType()+" gate:gateId=\"" +annot.getId()+"\""+ 686 writeFeatures(annot.getFeatures())+" >"); 687 }// End if 688 return strBuff.toString(); 689 }// writeStartTag() 690 691 /** This method takes aScanString and searches for those chars from 692 * entitiesMap that appear in the string. A tree map(offset2Char) is filled 693 * using as key the offsets where those Chars appear and the Char. 694 * If one of the params is null the method simply returns. 695 */ 696 private void buildEntityMapFromString(String aScanString, TreeMap aMapToFill){ 697 if (aScanString == null || aMapToFill == null) return; 698 if (entitiesMap == null || entitiesMap.isEmpty()){ 699 Err.prln("WARNING: Entities map was not initialised !"); 700 return; 701 }// End if 702 // Fill the Map with the offsets of the special chars 703 Iterator entitiesMapIterator = entitiesMap.keySet().iterator(); 704 while(entitiesMapIterator.hasNext()){ 705 Character c = (Character) entitiesMapIterator.next(); 706 int fromIndex = 0; 707 while (-1 != fromIndex){ 708 fromIndex = aScanString.indexOf(c.charValue(),fromIndex); 709 if (-1 != fromIndex){ 710 aMapToFill.put(new Integer(fromIndex),c); 711 fromIndex ++; 712 }// End if 713 }// End while 714 }// End while 715 }//buildEntityMapFromString(); 716 717 /** Returns a string representing an empty tag based on the input annot*/ 718 private String writeEmptyTag(Annotation annot){ 719 StringBuffer strBuff = new StringBuffer(""); 720 if (annot == null) return strBuff.toString(); 721 strBuff.append("<"+annot.getType()+" gateId=\"" +annot.getId()+"\""+ 722 writeFeatures(annot.getFeatures())+" />"); 723 return strBuff.toString(); 724 }// writeEmptyTag() 725 726 /** Returns a string representing an end tag based on the input annot*/ 727 private String writeEndTag(Annotation annot){ 728 StringBuffer strBuff = new StringBuffer(""); 729 if (annot == null) return strBuff.toString(); 730 /* 731 if (annot.getType().indexOf(" ") != -1) 732 Out.prln("Warning: Truncating end tag to first word for annot type \"" 733 +annot.getType()+ "\". "); 734 */ 735 strBuff.append("</"+annot.getType()+">"); 736 return strBuff.toString(); 737 }// writeEndTag() 738 739 /** Returns a string representing a FeatureMap serialized as XML attributes*/ 740 private String writeFeatures(FeatureMap feat){ 741 StringBuffer strBuff = new StringBuffer(""); 742 if (feat == null) return strBuff.toString(); 743 Iterator it = feat.keySet().iterator(); 744 while (it.hasNext()){ 745 Object key = it.next(); 746 Object value = feat.get(key); 747 if ( (key != null) && (value != null) ){ 748 // Eliminate a feature inserted at reading time and which help to 749 // take some decissions at saving time 750 if ("isEmptyAndSpan".equals(key.toString())) 751 continue; 752 if( !(String.class.isAssignableFrom(key.getClass()) || 753 Number.class.isAssignableFrom(key.getClass()))){ 754 755 Out.prln("Warning:Found a feature NAME("+key+") that doesn't came"+ 756 " from String or Number.(feature discarded)"); 757 continue; 758 }// End if 759 if ( !(String.class.isAssignableFrom(value.getClass()) || 760 Number.class.isAssignableFrom(value.getClass()) || 761 java.util.Collection.class.isAssignableFrom(value.getClass()))){ 762 763 Out.prln("Warning:Found a feature VALUE("+value+") that doesn't came"+ 764 " from String, Number or Collection.(feature discarded)"); 765 continue; 766 }// End if 767 if ("matches".equals(key)) 768 strBuff.append(" gate:" + key + "=\""); 769 else 770 strBuff.append(" " + key + "=\""); 771 if (java.util.Collection.class.isAssignableFrom(value.getClass())){ 772 Iterator valueIter = ((Collection)value).iterator(); 773 while(valueIter.hasNext()){ 774 Object item = valueIter.next(); 775 if (!(String.class.isAssignableFrom(item.getClass()) || 776 Number.class.isAssignableFrom(item.getClass()))) 777 continue; 778 strBuff.append(item +";"); 779 }// End while 780 if (strBuff.charAt(strBuff.length()-1) == ';') 781 strBuff.deleteCharAt(strBuff.length()-1); 782 }else{ 783 strBuff.append(value); 784 }// End if 785 strBuff.append("\""); 786 }// End if 787 }// End while 788 return strBuff.toString(); 789 }// writeFeatures() 790 791 /** Returns a GateXml document that is a custom XML format for wich there is 792 * a reader inside GATE called gate.xml.GateFormatXmlHandler. 793 * What it does is to serialize a GATE document in an XML format. 794 * @return a string representing a Gate Xml document. If saved in a file,this 795 * string must be written using the UTF-8 encoding because the first line 796 * in the generated xml document is <?xml version="1.0" encoding="UTF-8" ?> 797 */ 798 public String toXml(){ 799 // Initialize the xmlContent with 3 time the size of the current document. 800 // This is because of the tags size. This measure is made to increase the 801 // performance of StringBuffer. 802 StringBuffer xmlContent = new StringBuffer( 803 DOC_SIZE_MULTIPLICATION_FACTOR*(getContent().size().intValue())); 804 // Add xml header 805 xmlContent.append("<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n"); 806 // Add the root element 807 xmlContent.append("<GateDocument>\n"); 808 xmlContent.append("<!-- The document's features-->\n\n"); 809 xmlContent.append("<GateDocumentFeatures>\n"); 810 xmlContent.append(featuresToXml(this.getFeatures())); 811 xmlContent.append("</GateDocumentFeatures>\n"); 812 xmlContent.append("<!-- The document content area with serialized"+ 813 " nodes -->\n\n"); 814 // Add plain text element 815 xmlContent.append("<TextWithNodes>"); 816 xmlContent.append(textWithNodes(this.getContent().toString())); 817 xmlContent.append("</TextWithNodes>\n"); 818 // Serialize as XML all document's annotation sets 819 // Serialize the default AnnotationSet 820 StatusListener sListener = (StatusListener) 821 gate.gui.MainFrame.getListeners(). 822 get("gate.event.StatusListener"); 823 if(sListener != null) 824 sListener.statusChanged("Saving the default annotation set "); 825 xmlContent.append("<!-- The default annotation set -->\n\n"); 826 xmlContent.append(annotationSetToXml(this.getAnnotations())); 827 // Serialize all others AnnotationSets 828 // namedAnnotSets is a Map containing all other named Annotation Sets. 829 if (namedAnnotSets != null){ 830 Iterator iter = namedAnnotSets.values().iterator(); 831 while(iter.hasNext()){ 832 AnnotationSet annotSet = (AnnotationSet) iter.next(); 833 xmlContent.append("<!-- Named annotation set -->\n\n"); 834 // Serialize it as XML 835 if(sListener != null) sListener.statusChanged("Saving " + 836 annotSet.getName()+ 837 " annotation set "); 838 xmlContent.append(annotationSetToXml(annotSet)); 839 }// End while 840 }// End if 841 // Add the end of GateDocument 842 xmlContent.append("</GateDocument>"); 843 if(sListener != null) sListener.statusChanged("Done !"); 844 // return the XmlGateDocument 845 return xmlContent.toString(); 846 }// toXml 847 848 /** This method filters any non XML char 849 * see: http://www.w3c.org/TR/2000/REC-xml-20001006#charsets 850 * All non XML chars will be replaced with 0x20 (space char) This assures 851 * that the next time the document is loaded there won't be any problems. 852 * @param aStrBuffer represents the input String that is filtred. If the 853 * aStrBuffer is null then an empty string will be returend 854 * @return the "purified" StringBuffer version of the aStrBuffer 855 */ 856 private StringBuffer filterNonXmlChars(StringBuffer aStrBuffer){ 857 if (aStrBuffer == null) return new StringBuffer(""); 858 String space = new String(" "); 859 for (int i=aStrBuffer.length()-1;i>=0; i--){ 860 if (!isXmlChar(aStrBuffer.charAt(i))) 861 aStrBuffer.replace(i,i+1,space); 862 }// End for 863 return aStrBuffer; 864 }// filterNonXmlChars() 865 866 /** This method decide if a char is a valid XML one or not 867 * @param ch the char to be tested 868 * @return true if is a valid XML char and fals if is not. 869 */ 870 private boolean isXmlChar(char ch){ 871 if (ch == 0x9 || ch == 0xA || ch ==0xD) return true; 872 if ((0x20 <= ch) && (ch <= 0xD7FF)) return true; 873 if ((0xE000 <= ch) && (ch <= 0xFFFD)) return true; 874 if ((0x10000 <= ch) && (ch <= 0x10FFFF)) return true; 875 return false; 876 }// End isXmlChar() 877 878 /** This method saves a FeatureMap as XML elements. 879 * @ param aFeatureMap the feature map that has to be saved as XML. 880 * @ return a String like this: <Feature><Name>...</Name> 881 * <Value>...</Value></Feature><Feature>...</Feature> 882 */ 883 private String featuresToXml(FeatureMap aFeatureMap){ 884 StringBuffer str = new StringBuffer(""); 885 886 if (aFeatureMap == null) return str.toString(); 887 888 Set keySet = aFeatureMap.keySet(); 889 Iterator keyIterator = keySet.iterator(); 890 while(keyIterator.hasNext()){ 891 Object key = keyIterator.next(); 892 Object value = aFeatureMap.get(key); 893 if ((key != null) && (value != null)){ 894 String keyClassName = null; 895 String keyItemClassName = null; 896 String valueClassName = null; 897 String valueItemClassName = null; 898 String key2String = key.toString(); 899 String value2String = value.toString(); 900 Object item = null; 901 // Test key if it is String, Number or Collection 902 if (key instanceof java.lang.String || 903 key instanceof java.lang.Number || 904 key instanceof java.util.Collection) 905 keyClassName = key.getClass().getName(); 906 907 // Test value if it is String, Number or Collection 908 if (value instanceof java.lang.String || 909 value instanceof java.lang.Number || 910 value instanceof java.util.Collection) 911 valueClassName = value.getClass().getName(); 912 913 // Features and values that are not Strings, Numbers or collections 914 // will be discarded. 915 if (keyClassName == null || valueClassName == null) continue; 916 917 // If key is collection serialize the colection in a specific format 918 if (key instanceof java.util.Collection){ 919 StringBuffer keyStrBuff = new StringBuffer(""); 920 Iterator iter = ((Collection) key).iterator(); 921 if (iter.hasNext()){ 922 item = iter.next(); 923 if (item instanceof java.lang.Number) 924 keyItemClassName = item.getClass().getName(); 925 else 926 keyItemClassName = String.class.getName(); 927 keyStrBuff.append(item.toString()); 928 }// End if 929 while (iter.hasNext()){ 930 item = iter.next(); 931 keyStrBuff.append(";" + item.toString()); 932 }// End while 933 key2String = keyStrBuff.toString(); 934 }// End if 935 // If key is collection serialize the colection in a specific format 936 if (value instanceof java.util.Collection){ 937 StringBuffer valueStrBuff = new StringBuffer(""); 938 Iterator iter = ((Collection) value).iterator(); 939 if (iter.hasNext()){ 940 item = iter.next(); 941 if (item instanceof java.lang.Number) 942 valueItemClassName = item.getClass().getName(); 943 else 944 valueItemClassName = String.class.getName(); 945 valueStrBuff.append(item.toString()); 946 }// End if 947 while (iter.hasNext()){ 948 item = iter.next(); 949 valueStrBuff.append(";" + item.toString()); 950 }// End while 951 value2String = valueStrBuff.toString(); 952 }// End if 953 str.append("<Feature>\n <Name"); 954 if (keyClassName != null) 955 str.append(" className=\""+keyClassName+"\""); 956 if (keyItemClassName != null) 957 str.append(" itemClassName=\""+keyItemClassName+"\""); 958 str.append(">"); 959 str.append(filterNonXmlChars(replaceCharsWithEntities(key2String))); 960 str.append("</Name>\n <Value"); 961 if (valueClassName != null) 962 str.append(" className=\"" + valueClassName + "\""); 963 if (valueItemClassName != null) 964 str.append(" itemClassName=\"" + valueItemClassName + "\""); 965 str.append(">"); 966 str.append(filterNonXmlChars(replaceCharsWithEntities(value2String))); 967 str.append("</Value>\n</Feature>\n"); 968 }// End if 969 }// end While 970 return str.toString(); 971 }//featuresToXml 972 973 /** This method replace all chars that appears in the anInputString and also 974 * that are in the entitiesMap with their corresponding entity 975 * @param anInputString the string analyzed. If it is null then returns the 976 * empty string 977 * @return a string representing the input string with chars replaced with 978 * entities 979 */ 980 private StringBuffer replaceCharsWithEntities(String anInputString){ 981 if (anInputString == null) return new StringBuffer(""); 982 StringBuffer strBuff = new StringBuffer(anInputString); 983 for (int i=strBuff.length()-1; i>=0; i--){ 984 Character ch = new Character(strBuff.charAt(i)); 985 if (entitiesMap.keySet().contains(ch)){ 986 strBuff.replace(i,i+1,(String) entitiesMap.get(ch)); 987 }// End if 988 }// End for 989 return strBuff; 990 }//replaceCharsWithEntities() 991 992 /** This method creates Node XML elements and inserts them at the 993 * corresponding offset inside the text. Nodes are created from the default 994 * annotation set, as well as from all existing named annotation sets. 995 * @param aText The text representing the document's plain text. 996 * @return The text with empty <Node id="NodeId"/> elements. 997 */ 998 private String textWithNodes(String aText){ 999 if (aText == null) return new String(""); 1000 StringBuffer textWithNodes = filterNonXmlChars(new StringBuffer(aText)); 1001 1002 // Construct a map from offsets to Chars 1003 TreeMap offsets2CharsMap = new TreeMap(); 1004 if (aText.length()!= 0){ 1005 // Fill the offsets2CharsMap with all the indices where special chars appear 1006 buildEntityMapFromString(aText,offsets2CharsMap); 1007 }//End if 1008 // Construct the offsetsSet for all nodes belonging to this document 1009 TreeSet offsetsSet = new TreeSet(); 1010 Iterator annotSetIter = this.getAnnotations().iterator(); 1011 while (annotSetIter.hasNext()){ 1012 Annotation annot = (Annotation) annotSetIter.next(); 1013 offsetsSet.add(annot.getStartNode().getOffset()); 1014 offsetsSet.add(annot.getEndNode().getOffset()); 1015 }// end While 1016 // Get the nodes from all other named annotation sets. 1017 if (namedAnnotSets != null){ 1018 Iterator iter = namedAnnotSets.values().iterator(); 1019 while(iter.hasNext()){ 1020 AnnotationSet annotSet = (AnnotationSet) iter.next(); 1021 Iterator iter2 = annotSet.iterator(); 1022 while(iter2.hasNext()){ 1023 Annotation annotTmp = (Annotation) iter2.next(); 1024 offsetsSet.add(annotTmp.getStartNode().getOffset()); 1025 offsetsSet.add(annotTmp.getEndNode().getOffset()); 1026 }// End while 1027 }// End while 1028 }// End if 1029 // offsetsSet is ordered in ascending order because the structure 1030 // is a TreeSet 1031 1032 if (offsetsSet.isEmpty()){ 1033 return replaceCharsWithEntities(aText).toString(); 1034 }// End if 1035 // Iterate through all nodes from anAnnotSet and transform them to 1036 // XML elements. Then insert those elements at the node's offset into the 1037 // textWithNodes . 1038 while (!offsetsSet.isEmpty()){ 1039 Long offset = (Long) offsetsSet.last(); 1040 // Eliminate the offset from the list in order to create more memory space 1041 offsetsSet.remove(offset); 1042 // Use offset 1043 int offsetValue = offset.intValue(); 1044 String strNode = "<Node id=\"" + offsetValue + "\"/>"; 1045 // Before inserting this string into the textWithNodes, check to see if 1046 // there are any chars to be replaced with their corresponding entities 1047 if (!offsets2CharsMap.isEmpty()){ 1048 Integer offsChar = (Integer) offsets2CharsMap.lastKey(); 1049 while( !offsets2CharsMap.isEmpty() && 1050 offsChar.intValue() >= offset.intValue()){ 1051 // Replace the char at offsChar with its corresponding entity form 1052 // the entitiesMap. 1053 textWithNodes.replace(offsChar.intValue(),offsChar.intValue()+1, 1054 (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar))); 1055 // Discard the offsChar after it was used because this offset will 1056 // never appear again 1057 offsets2CharsMap.remove(offsChar); 1058 // Investigate next offsChar 1059 if (!offsets2CharsMap.isEmpty()) 1060 offsChar = (Integer) offsets2CharsMap.lastKey(); 1061 }// End while 1062 }// End if 1063 // Now it is safe to insert the node 1064 textWithNodes.insert(offsetValue,strNode); 1065 }// end while 1066 // Need to replace the entities in the remaining text, if there is any text 1067 // So, if there are any more items in offsets2CharsMap they need to be 1068 // replaced 1069 while (!offsets2CharsMap.isEmpty()){ 1070 Integer offsChar = (Integer) offsets2CharsMap.lastKey(); 1071 // Replace the char with its entity 1072 textWithNodes.replace(offsChar.intValue(),offsChar.intValue()+1, 1073 (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar))); 1074 // remove the offset from the map 1075 offsets2CharsMap.remove(offsChar); 1076 }// End while 1077 return textWithNodes.toString(); 1078 }//textWithNodes() 1079 1080 /** This method saves an AnnotationSet as XML. 1081 * @param anAnnotationSet The annotation set that has to be saved as XML. 1082 * @return a String like this: <AnnotationSet> <Annotation>.... 1083 * </AnnotationSet> 1084 */ 1085 private String annotationSetToXml(AnnotationSet anAnnotationSet){ 1086 StringBuffer str = new StringBuffer(""); 1087 1088 if (anAnnotationSet == null){ 1089 str.append("<AnnotationSet>\n"); 1090 str.append("</AnnotationSet>\n"); 1091 return str.toString(); 1092 }// End if 1093 if (anAnnotationSet.getName() == null) 1094 str.append("<AnnotationSet>\n"); 1095 else str.append("<AnnotationSet Name=\"" + anAnnotationSet.getName()+ 1096 "\" >\n"); 1097 // Iterate through AnnotationSet and save each Annotation as XML 1098 Iterator iterator = anAnnotationSet.iterator(); 1099 while (iterator.hasNext()){ 1100 Annotation annot = (Annotation) iterator.next(); 1101 str.append("<Annotation " + "Type=\"" + annot.getType() + 1102 "\" StartNode=\"" + annot.getStartNode().getOffset() + 1103 "\" EndNode=\"" + annot.getEndNode().getOffset() + "\">\n"); 1104 str.append(featuresToXml(annot.getFeatures())); 1105 str.append("</Annotation>\n"); 1106 }// End while 1107 1108 str.append("</AnnotationSet>\n"); 1109 return str.toString(); 1110 }// annotationSetToXml 1111 1112 /** Returns a map with the named annotation sets. It returns <code>null</code> 1113 * if no named annotaton set exists. */ 1114 public Map getNamedAnnotationSets() { 1115 return namedAnnotSets; 1116 } // getNamedAnnotationSets 1117 1118 /** 1119 * Removes one of the named annotation sets. 1120 * Note that the default annotation set cannot be removed. 1121 * @param name the name of the annotation set to be removed 1122 */ 1123 public void removeAnnotationSet(String name){ 1124 Object removed = namedAnnotSets.remove(name); 1125 if(removed != null){ 1126 fireAnnotationSetRemoved( 1127 new DocumentEvent(this, DocumentEvent.ANNOTATION_SET_REMOVED, name)); 1128 } 1129 } 1130 1131 /** Propagate edit changes to the document content and annotations. */ 1132 public void edit(Long start, Long end, DocumentContent replacement) 1133 throws InvalidOffsetException 1134 { 1135 if(! isValidOffsetRange(start, end)) 1136 throw new InvalidOffsetException(); 1137 1138 if(content != null) 1139 ((DocumentContentImpl) content).edit(start, end, replacement); 1140 1141 if(defaultAnnots != null) 1142 ((AnnotationSetImpl) defaultAnnots).edit(start, end, replacement); 1143 1144 if(namedAnnotSets != null) { 1145 Iterator iter = namedAnnotSets.values().iterator(); 1146 while(iter.hasNext()) 1147 ((AnnotationSetImpl) iter.next()).edit(start, end, replacement); 1148 } 1149 1150 } // edit(start,end,replacement) 1151 1152 /** Check that an offset is valid, i.e. it is non-null, greater than 1153 * or equal to 0 and less than the size of the document content. 1154 */ 1155 public boolean isValidOffset(Long offset) { 1156 if(offset == null) 1157 return false; 1158 1159 long o = offset.longValue(); 1160 if(o > getContent().size().longValue() || o < 0) 1161 return false; 1162 1163 return true; 1164 } // isValidOffset 1165 1166 /** Check that both start and end are valid offsets and that 1167 * they constitute a valid offset range, i.e. start is greater 1168 * than or equal to long. 1169 */ 1170 public boolean isValidOffsetRange(Long start, Long end) { 1171 return 1172 isValidOffset(start) && isValidOffset(end) && 1173 start.longValue() <= end.longValue(); 1174 } // isValidOffsetRange(start,end) 1175 1176 /** Sets the nextAnnotationId */ 1177 public void setNextAnnotationId(int aNextAnnotationId){ 1178 nextAnnotationId = aNextAnnotationId; 1179 }// setNextAnnotationId(); 1180 1181 /** Generate and return the next annotation ID */ 1182 public Integer getNextAnnotationId() { 1183 return new Integer(nextAnnotationId++); 1184 } // getNextAnnotationId 1185 1186 /** Generate and return the next node ID */ 1187 public Integer getNextNodeId() { return new Integer(nextNodeId++); } 1188 1189 /** Ordering based on URL.toString() and the URL offsets (if any) */ 1190 public int compareTo(Object o) throws ClassCastException { 1191 DocumentImpl other = (DocumentImpl) o; 1192 return getOrderingString().compareTo(other.getOrderingString()); 1193 } // compareTo 1194 1195 /** Utility method to produce a string for comparison in ordering. 1196 * String is based on the source URL and offsets. 1197 */ 1198 protected String getOrderingString() { 1199 if(sourceUrl == null) return toString(); 1200 1201 StringBuffer orderingString = new StringBuffer(sourceUrl.toString()); 1202 if(sourceUrlStartOffset != null && sourceUrlEndOffset != null) { 1203 orderingString.append(sourceUrlStartOffset.toString()); 1204 orderingString.append(sourceUrlEndOffset.toString()); 1205 } 1206 1207 return orderingString.toString(); 1208 } // getOrderingString() 1209 1210 /** The id of the next new annotation */ 1211 protected int nextAnnotationId = 0; 1212 1213 /** The id of the next new node */ 1214 protected int nextNodeId = 0; 1215 /** The source URL */ 1216 protected URL sourceUrl; 1217 1218 /** The document's URL name. */ 1219 1220 /** The content of the document */ 1221 protected DocumentContent content; 1222 1223 /** The encoding of the source of the document content */ 1224 protected String encoding = "UTF-8"; 1225 1226 // Data needed in toXml(AnnotationSet) methos 1227 1228 /** This field indicates whether or not to add the tag 1229 * called GatePreserveFormat to the document. HTML, XML, SGML docs won't 1230 * have this tag added 1231 */ 1232 private boolean addGatePreserveFormatTag = false; 1233 1234 /** This field indicates if an annotation is the doc's root tag. 1235 * It is needed when adding the namespace information 1236 */ 1237 private boolean isRootTag = false; 1238 1239 /** This field is used when creating StringBuffers for toXml() methods. 1240 * The size of the StringBuffer will be docDonctent.size() multiplied by this 1241 * value. It is aimed to improve the performance of StringBuffer 1242 */ 1243 private final int DOC_SIZE_MULTIPLICATION_FACTOR = 1; 1244 1245 /** Constant used in the inner class AnnotationComparator to order 1246 * annotations on their start offset 1247 */ 1248 private final int ORDER_ON_START_OFFSET = 0; 1249 /** Constant used in the inner class AnnotationComparator to order 1250 * annotations on their end offset 1251 */ 1252 private final int ORDER_ON_END_OFFSET = 1; 1253 /** Constant used in the inner class AnnotationComparator to order 1254 * annotations on their ID 1255 */ 1256 private final int ORDER_ON_ANNOT_ID = 2; 1257 /** Constant used in the inner class AnnotationComparator to order 1258 * annotations ascending 1259 */ 1260 private final int ASC = 3; 1261 /** Constant used in the inner class AnnotationComparator to order 1262 * annotations descending 1263 */ 1264 private final int DESC = -3; 1265 1266 /** A map initialized in init() containing entities that needs to be 1267 * replaced in strings 1268 */ 1269 private static Map entitiesMap = null; 1270 // Initialize the entities map use when saving as xml 1271 static{ 1272 entitiesMap = new HashMap(); 1273 entitiesMap.put(new Character('<'),"<"); 1274 entitiesMap.put(new Character('>'),">"); 1275 entitiesMap.put(new Character('&'),"&"); 1276 entitiesMap.put(new Character('\''),"'"); 1277 entitiesMap.put(new Character('"'),"""); 1278 entitiesMap.put(new Character((char)160)," "); 1279 entitiesMap.put(new Character((char)169),"©"); 1280 }//static 1281 1282 /** The range that the content comes from at the source URL 1283 * (or null if none). 1284 */ 1285 //protected Long[] sourceUrlOffsets; 1286 1287 /** The start of the range that the content comes from at the source URL 1288 * (or null if none). 1289 */ 1290 protected Long sourceUrlStartOffset; 1291 1292 /** The end of the range that the content comes from at the source URL 1293 * (or null if none). 1294 */ 1295 protected Long sourceUrlEndOffset; 1296 1297 /** The default annotation set */ 1298 protected AnnotationSet defaultAnnots; 1299 1300 /** Named sets of annotations */ 1301 protected Map namedAnnotSets; 1302 1303 /** 1304 * A property of the document that will be set when the user 1305 * wants to create the document from a string, as opposed to from 1306 * a URL. 1307 */ 1308 private String stringContent; 1309 1310 /** 1311 * The stringContent of a document is 1312 * a property of the document that will be set when the user 1313 * wants to create the document from a string, as opposed to from 1314 * a URL. 1315 * <B>Use the <TT>getContent</TT> method instead to get the actual document 1316 * content.</B> 1317 */ 1318 public String getStringContent() { return stringContent; } 1319 1320 /** 1321 * The stringContent of a document is 1322 * a property of the document that will be set when the user 1323 * wants to create the document from a string, as opposed to from 1324 * a URL. 1325 * <B>Use the <TT>setContent</TT> method instead to update the actual 1326 * document content.</B> 1327 */ 1328 public void setStringContent(String stringContent) { 1329 this.stringContent = stringContent; 1330 } // set StringContent 1331 1332 /** Is the document markup-aware? */ 1333 protected Boolean markupAware = new Boolean(false); 1334 1335 /** Check: test 2 objects for equality */ 1336 protected boolean check(Object a, Object b) { 1337 if( (a == null || b == null) ) 1338 return a == b; 1339 1340 return a.equals(b); 1341 } // check(a,b) 1342 1343 /** Equals */ 1344 public boolean equals(Object other) { 1345 if(other == null || 1346 !(other instanceof DocumentImpl))return false; 1347 DocumentImpl doc = (DocumentImpl) other; 1348 1349// PENDING EQUALS IMPLS 1350 if(! check(content, doc.content)) return false; 1351 if(! check(defaultAnnots, doc.defaultAnnots)) return false; 1352 if(! check(encoding, doc.encoding)) return false; 1353 if(! check(features, doc.features)) return false; 1354 if(!markupAware.equals(doc.markupAware)) return false; 1355 if(! check(namedAnnotSets, doc.namedAnnotSets)) return false; 1356 if(nextAnnotationId != doc.nextAnnotationId) return false; 1357 if(nextNodeId != doc.nextNodeId) return false; 1358 if(! check(sourceUrl, doc.sourceUrl)) return false; 1359 if(! check(sourceUrlStartOffset, doc.sourceUrlStartOffset)) return false; 1360 if(! check(sourceUrlEndOffset, doc.sourceUrlEndOffset)) return false; 1361 1362 return true; 1363 } // equals 1364 1365 /** Hash code */ 1366 public int hashCode() { 1367 int code = getContent().hashCode(); 1368 int memberCode = (defaultAnnots == null) ? 0 : defaultAnnots.hashCode(); 1369 code += memberCode; 1370 memberCode = (encoding == null) ? 0 : encoding.hashCode(); 1371 code += memberCode; 1372 memberCode = (features == null) ? 0 : features.hashCode(); 1373 code += memberCode; 1374 code += (markupAware.booleanValue()) ? 0 : 1; 1375 memberCode = (namedAnnotSets == null) ? 0 : namedAnnotSets.hashCode(); 1376 code += memberCode; 1377 code += nextAnnotationId; 1378 code += nextNodeId; 1379 memberCode = (sourceUrl == null) ? 0 : sourceUrl.hashCode(); 1380 code += memberCode; 1381 memberCode = 1382 (sourceUrlStartOffset == null) ? 0 : sourceUrlStartOffset.hashCode(); 1383 code += memberCode; 1384 memberCode = 1385 (sourceUrlEndOffset == null) ? 0 : sourceUrlEndOffset.hashCode(); 1386 code += memberCode; 1387 return code; 1388 } // hashcode 1389 1390 /** String respresentation */ 1391 public String toString() { 1392 String n = Strings.getNl(); 1393 StringBuffer s = new StringBuffer("DocumentImpl: " + n); 1394 s.append(" content:" + content + n); 1395 s.append(" defaultAnnots:" + defaultAnnots + n); 1396 s.append(" encoding:" + encoding + n); 1397 s.append(" features:" + features + n); 1398 s.append(" markupAware:" + markupAware + n); 1399 s.append(" namedAnnotSets:" + namedAnnotSets + n); 1400 s.append(" nextAnnotationId:" + nextAnnotationId + n); 1401 s.append(" nextNodeId:" + nextNodeId + n); 1402 s.append(" sourceUrl:" + sourceUrl + n); 1403 s.append(" sourceUrlStartOffset:" + sourceUrlStartOffset + n); 1404 s.append(" sourceUrlEndOffset:" + sourceUrlEndOffset + n); 1405 s.append(n); 1406 1407 return s.toString(); 1408 } // toString 1409 1410 /** Freeze the serialization UID. */ 1411 static final long serialVersionUID = -8456893608311510260L; 1412 1413 /** Inner class needed to compare annotations*/ 1414 class AnnotationComparator implements java.util.Comparator { 1415 int orderOn = -1; 1416 int orderType = ASC; 1417 /** Constructs a comparator according to one of three sorter types: 1418 * ORDER_ON_ANNOT_TYPE, ORDER_ON_END_OFFSET, ORDER_ON_START_OFFSET 1419 */ 1420 public AnnotationComparator(int anOrderOn, int anOrderType){ 1421 orderOn = anOrderOn; 1422 orderType = anOrderType; 1423 }// AnnotationComparator() 1424 1425 /**This method must be implemented according to Comparator interface */ 1426 public int compare(Object o1, Object o2){ 1427 Annotation a1 = (Annotation) o1; 1428 Annotation a2 = (Annotation) o2; 1429 // ORDER_ON_START_OFFSET ? 1430 if (orderOn == ORDER_ON_START_OFFSET){ 1431 int result = a1.getStartNode().getOffset().compareTo( 1432 a2.getStartNode().getOffset()); 1433 if (orderType == ASC){ 1434 // ASC 1435 // If they are equal then their ID will decide. 1436 if (result == 0) 1437 return a1.getId().compareTo(a2.getId()); 1438 return result; 1439 }else{ 1440 // DESC 1441 if (result == 0) 1442 return - (a1.getId().compareTo(a2.getId())); 1443 return -result; 1444 }// End if (orderType == ASC) 1445 }// End if (orderOn == ORDER_ON_START_OFFSET) 1446 1447 // ORDER_ON_END_OFFSET ? 1448 if (orderOn == ORDER_ON_END_OFFSET){ 1449 int result = a1.getEndNode().getOffset().compareTo( 1450 a2.getEndNode().getOffset()); 1451 if (orderType == ASC){ 1452 // ASC 1453 // If they are equal then their ID will decide. 1454 if (result == 0) 1455 return - (a1.getId().compareTo(a2.getId())); 1456 return result; 1457 }else{ 1458 // DESC 1459 // If they are equal then their ID will decide. 1460 if (result == 0) 1461 return a1.getId().compareTo(a2.getId()); 1462 return - result; 1463 }// End if (orderType == ASC) 1464 }// End if (orderOn == ORDER_ON_END_OFFSET) 1465 1466 // ORDER_ON_ANNOT_ID ? 1467 if (orderOn == ORDER_ON_ANNOT_ID){ 1468 if (orderType == ASC) 1469 return a1.getId().compareTo(a2.getId()); 1470 else 1471 return -(a1.getId().compareTo(a2.getId())); 1472 }// End if 1473 return 0; 1474 }//compare() 1475 } // End inner class AnnotationComparator 1476 1477 1478 private transient Vector documentListeners; 1479 private transient Vector gateListeners; 1480 1481 public synchronized void removeDocumentListener(DocumentListener l) { 1482 if (documentListeners != null && documentListeners.contains(l)) { 1483 Vector v = (Vector) documentListeners.clone(); 1484 v.removeElement(l); 1485 documentListeners = v; 1486 } 1487 } 1488 public synchronized void addDocumentListener(DocumentListener l) { 1489 Vector v = documentListeners == null ? new Vector(2) : (Vector) documentListeners.clone(); 1490 if (!v.contains(l)) { 1491 v.addElement(l); 1492 documentListeners = v; 1493 } 1494 } 1495 protected void fireAnnotationSetAdded(DocumentEvent e) { 1496 if (documentListeners != null) { 1497 Vector listeners = documentListeners; 1498 int count = listeners.size(); 1499 for (int i = 0; i < count; i++) { 1500 ((DocumentListener) listeners.elementAt(i)).annotationSetAdded(e); 1501 } 1502 } 1503 } 1504 protected void fireAnnotationSetRemoved(DocumentEvent e) { 1505 if (documentListeners != null) { 1506 Vector listeners = documentListeners; 1507 int count = listeners.size(); 1508 for (int i = 0; i < count; i++) { 1509 ((DocumentListener) listeners.elementAt(i)).annotationSetRemoved(e); 1510 } 1511 } 1512 } 1513 public void resourceLoaded(CreoleEvent e) { 1514 } 1515 public void resourceUnloaded(CreoleEvent e) { 1516 } 1517 public void datastoreOpened(CreoleEvent e) { 1518 } 1519 public void datastoreCreated(CreoleEvent e) { 1520 } 1521 public void datastoreClosed(CreoleEvent e) { 1522 if (! e.getDatastore().equals(this.getDataStore())) 1523 return; 1524 //close this lr, since it cannot stay open when the DS it comes from 1525 //is closed 1526 Factory.deleteResource(this); 1527 } 1528 public void setLRPersistenceId(Object lrID) { 1529 super.setLRPersistenceId( lrID); 1530 //make persistent documents listen to the creole register 1531 //for events about their DS 1532 Gate.getCreoleRegister().addCreoleListener(this); 1533 } 1534 1535} // class DocumentImpl 1536
|
DocumentImpl |
|