|
DocumentImpl |
|
1 /* 2 * DocumentImpl.java 3 * 4 * Copyright (c) 1998-2001, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Hamish Cunningham, 11/Feb/2000 12 * 13 * $Id: DocumentImpl.java,v 1.115 2002/07/12 13:24:28 valyt Exp $ 14 */ 15 16 package gate.corpora; 17 18 import java.util.*; 19 import java.net.*; 20 import java.io.*; 21 22 import gate.*; 23 import gate.annotation.*; 24 import gate.util.*; 25 import gate.creole.*; 26 import gate.gui.*; 27 import gate.event.*; 28 29 /** Represents the commonalities between all sorts of documents. 30 * 31 * <H2>Editing</H2> 32 * 33 * <P> 34 * The DocumentImpl class implements the Document interface. 35 * The DocumentContentImpl class models the textual or audio-visual 36 * materials which are the source and content of Documents. 37 * The AnnotationSetImpl class supplies annotations on Documents. 38 * 39 * <P> 40 * Abbreviations: 41 * 42 * <UL> 43 * <LI> 44 * DC = DocumentContent 45 * <LI> 46 * D = Document 47 * <LI> 48 * AS = AnnotationSet 49 * </UL> 50 * 51 * <P> 52 * We add an edit method to each of these classes; for DC and AS 53 * the methods are package private; D has the public method. 54 * 55 * <PRE> 56 * void edit(Long start, Long end, DocumentContent replacement) 57 * throws InvalidOffsetException; 58 * </PRE> 59 * 60 * <P> 61 * D receives edit requests and forwards them to DC and AS. 62 * On DC, this method makes a change to the content - e.g. replacing 63 * a String range from start to end with replacement. (Deletions 64 * are catered for by having replacement = null.) D then calls 65 * AS.edit on each of its annotation sets. 66 * 67 * <P> 68 * On AS, edit calls replacement.size() (i.e. DC.size()) to 69 * figure out how long the replacement is (0 for null). 
It then 70 * considers annotations that terminate (start or end) in 71 * the altered or deleted range as invalid; annotations that 72 * terminate after the range have their offsets adjusted. 73 * I.e.: 74 * <UL> 75 * <LI> 76 * the nodes that pointed inside the old modified area are invalid now and 77 * will be deleted along with the connected annotations; 78 * <LI> 79 * the nodes that are before the start of the modified area remain 80 * untouched; 81 * <LI> 82 * the nodes that are after the end of the affected area will have the 83 * offset changed according to the formula below. 84 * </UL> 85 * 86 * <P> 87 * A note re. AS and annotations: annotations no longer have 88 * offsets as in the old model, they now have nodes, and nodes 89 * have offsets. 90 * 91 * <P> 92 * To implement AS.edit, we have several indices: 93 * <PRE> 94 * HashMap annotsByStartNode, annotsByEndNode; 95 * </PRE> 96 * which map node ids to annotations; 97 * <PRE> 98 * RBTreeMap nodesByOffset; 99 * </PRE> 100 * which maps offset to Nodes. 101 * 102 * <P> 103 * When we get an edit request, we traverse that part of the 104 * nodesByOffset tree representing the altered or deleted 105 * range of the DC. For each node found, we delete any annotations 106 * that terminate on the node, and then delete the node itself. 107 * We then traverse the rest of the tree, changing the offset 108 * on all remaining nodes by: 109 * <PRE> 110 * newOffset = 111 * oldOffset - 112 * ( 113 * (end - start) - // size of mod 114 * ( (replacement == null) ? 0 : replacement.size() ) // size of repl 115 * ); 116 * </PRE> 117 * Note that we use the same convention as e.g. java.lang.String: start 118 * offsets are inclusive; end offsets are exclusive. I.e. for string "abcd" 119 * range 1-3 = "bc". 
Examples, for a node with offset 4: 120 * <PRE> 121 * edit(1, 3, "BC"); 122 * newOffset = 4 - ( (3 - 1) - 2 ) = 4 123 * 124 * edit(1, 3, null); 125 * newOffset = 4 - ( (3 - 1) - 0 ) = 2 126 * 127 * edit(1, 3, "BBCC"); 128 * newOffset = 4 - ( (3 - 1) - 4 ) = 6 129 * </PRE> 130 */ 131 public class DocumentImpl 132 extends AbstractLanguageResource implements TextualDocument, CreoleListener, 133 DatastoreListener { 134 /** Debug flag */ 135 private static final boolean DEBUG = false; 136 137 /** If you set this flag to true the original content of the document will 138 * be kept in the document feature. <br> 139 * Default value is false to avoid the unnecessary waste of memory */ 140 private Boolean preserveOriginalContent = new Boolean(false); 141 142 /** If you set this flag to true the repositioning information for 143 * the document will be kept in the document feature. <br> 144 * Default value is false to avoid the unnecessary waste of time and memory 145 */ 146 private Boolean collectRepositioningInfo = new Boolean(false); 147 148 /** 149 * This is a variable which contains the latest crossed over annotation 150 * found during export with preserving format, i.e., toXml(annotations) 151 * method. 152 */ 153 private Annotation crossedOverAnnotation = null; 154 155 /** Default construction. Content left empty. */ 156 public DocumentImpl() { 157 content = new DocumentContentImpl(); 158 } // default construction 159 160 /** Initialise this resource, and return it. */ 161 public Resource init() throws ResourceInstantiationException { 162 //make sure we have an encoding 163 if(encoding == null || encoding.length() == 0) 164 encoding = System.getProperty("file.encoding"); 165 if(encoding == null || encoding.length() == 0) encoding = "UTF-8"; 166 167 // set up the source URL and create the content 168 if(sourceUrl == null) { 169 if(stringContent == null) { 170 throw new ResourceInstantiationException( 171 "The sourceURL and document's content were null." 
172 ); 173 } 174 175 content = new DocumentContentImpl(stringContent); 176 getFeatures().put("gate.SourceURL", "created from String"); 177 } else { 178 try { 179 content = new DocumentContentImpl( 180 sourceUrl, encoding, sourceUrlStartOffset, sourceUrlEndOffset); 181 getFeatures().put("gate.SourceURL", sourceUrl.toExternalForm()); 182 } catch(IOException e) { 183 e.printStackTrace(); 184 // throw new ResourceInstantiationException("DocumentImpl.init: " + e); 185 } 186 187 if(preserveOriginalContent.booleanValue() && content != null) { 188 String originalContent = new String( 189 ((DocumentContentImpl) content).getOriginalContent()); 190 getFeatures().put(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME, 191 originalContent); 192 } // if 193 } 194 195 // set up a DocumentFormat if markup unpacking required 196 if(getMarkupAware().booleanValue()) { 197 DocumentFormat docFormat = 198 DocumentFormat.getDocumentFormat(this, sourceUrl); 199 try { 200 if(docFormat != null){ 201 StatusListener sListener = (StatusListener) 202 gate.gui.MainFrame.getListeners(). 
203 get("gate.event.StatusListener"); 204 if(sListener != null) docFormat.addStatusListener(sListener); 205 206 // set the flag if true and if the document format support collecting 207 docFormat.setShouldCollectRepositioning(collectRepositioningInfo); 208 209 if(docFormat.getShouldCollectRepositioning().booleanValue()) { 210 // unpack with collectiong of repositioning information 211 RepositioningInfo info = new RepositioningInfo(); 212 213 String origContent = (String) getFeatures().get( 214 GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME); 215 216 RepositioningInfo ampCodingInfo = new RepositioningInfo(); 217 if(origContent != null) { 218 boolean shouldCorrectCR = docFormat instanceof XmlDocumentFormat; 219 collectInformationForAmpCodding(origContent, ampCodingInfo, 220 shouldCorrectCR); 221 if(docFormat instanceof HtmlDocumentFormat) { 222 collectInformationForWS(origContent, ampCodingInfo); 223 } // if 224 } // if 225 226 docFormat.unpackMarkup(this, info, ampCodingInfo); 227 228 if(origContent != null 229 && docFormat instanceof XmlDocumentFormat) { 230 // CRLF correction of RepositioningInfo 231 correctRepositioningForCRLFInXML(origContent, info); 232 } // if 233 234 getFeatures().put( 235 GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME, info); 236 } 237 else { 238 // normal old fashioned unpack 239 docFormat.unpackMarkup(this); 240 } 241 docFormat.removeStatusListener(sListener); 242 } //if format != null 243 } catch(DocumentFormatException e) { 244 throw new ResourceInstantiationException( 245 "Couldn't unpack markup in document " + sourceUrl.toExternalForm() + 246 " " + e 247 ); 248 } 249 } // if markup aware 250 251 //try{ 252 // FileWriter fw = new FileWriter("d:/temp/doccontent.txt"); 253 // fw.write(getContent().toString()); 254 // fw.flush(); 255 // fw.close(); 256 //}catch(IOException ioe){ 257 // ioe.printStackTrace(); 258 //} 259 260 return this; 261 } // init() 262 263 /** 264 * Correct repositioning information for substitution of 
"\r\n" with "\n" 265 */ 266 private void correctRepositioningForCRLFInXML(String content, 267 RepositioningInfo info) { 268 int index = -1; 269 270 do { 271 index = content.indexOf("\r\n", index+1); 272 if(index != -1) { 273 info.correctInformationOriginalMove(index, 1); 274 } // if 275 } while(index != -1); 276 } // correctRepositioningForCRLF 277 278 /** 279 * Collect information for substitution of "&xxx;" with "y" 280 * 281 * It couldn't be collected a position information about 282 * some unicode and &-coded symbols during parsing. The parser "hide" the 283 * information about the position of such kind of parsed text. 284 * So, there is minimal chance to have &-coded symbol inside the covered by 285 * repositioning records area. The new record should be created for every 286 * coded symbol outside the existing records. 287 * <BR> 288 * If <code>shouldCorrectCR</code> flag is <code>true</code> the correction 289 * for CRLF substitution is performed. 290 */ 291 private void collectInformationForAmpCodding(String content, 292 RepositioningInfo info, 293 boolean shouldCorrectCR) { 294 295 if(content == null || info == null) return; 296 297 int ampIndex = -1; 298 int semiIndex; 299 300 do { 301 ampIndex = content.indexOf('&', ampIndex+1); 302 if(ampIndex != -1) { 303 semiIndex = content.indexOf(';', ampIndex+1); 304 // have semicolon and it is near enough for amp codding 305 if(semiIndex != -1 && (semiIndex-ampIndex) < 8) { 306 info.addPositionInfo(ampIndex, semiIndex-ampIndex+1, 0, 1); 307 } 308 else { 309 // no semicolon or it is too far 310 // analyse for amp codding without semicolon 311 int maxEnd = Math.min(ampIndex+8, content.length()); 312 String ampCandidate = content.substring(ampIndex, maxEnd); 313 int ampCodingSize = analyseAmpCodding(ampCandidate); 314 315 if(ampCodingSize != -1) { 316 info.addPositionInfo(ampIndex, ampCodingSize, 0, 1); 317 } // if 318 319 } // if - semicolon found 320 } // if - ampersand found 321 } while (ampIndex != -1); 322 323 // 
correct the collected information to adjust it's positions 324 // with reported by the parser 325 int index = -1; 326 327 if(shouldCorrectCR) { 328 do { 329 index = content.indexOf("\r\n", index+1); 330 if(index != -1) { 331 info.correctInformationOriginalMove(index, -1); 332 } // if 333 } while(index != -1); 334 } // if 335 } // collectInformationForAmpCodding 336 337 /** 338 * This function compute size of the ampersand codded sequence when 339 * semicolin is not present. 340 */ 341 private int analyseAmpCodding(String content) { 342 int result = -1; 343 344 try { 345 char ch = content.charAt(1); 346 347 switch(ch) { 348 case 'l' : // < 349 case 'L' : // < 350 if(content.charAt(2) == 't' || content.charAt(2) == 'T') { 351 result = 3; 352 } // if 353 break; 354 case 'g' : // > 355 case 'G' : // > 356 if(content.charAt(2) == 't' || content.charAt(2) == 'T') { 357 result = 3; 358 } // if 359 break; 360 case 'a' : // & 361 case 'A' : // & 362 if(content.substring(2, 4).equalsIgnoreCase("mp")) { 363 result = 4; 364 } // if 365 break; 366 case 'q' : // " 367 case 'Q' : // " 368 if(content.substring(2, 5).equalsIgnoreCase("uot")) { 369 result = 5; 370 } // if 371 break; 372 case '#' : // #number (example ‘, 䰸) 373 int endIndex = 2; 374 boolean hexCoded = false; 375 if(content.charAt(2) == 'x' || content.charAt(2) == 'X') { 376 // Hex codding 377 ++endIndex; 378 hexCoded = true; 379 } // if 380 381 while (endIndex < 8 382 && isNumber(content.charAt(endIndex), hexCoded) ) { 383 ++endIndex; 384 } // while 385 result = endIndex; 386 break; 387 } // switch 388 } catch (StringIndexOutOfBoundsException ex) { 389 // do nothing 390 } // catch 391 392 return result; 393 } // analyseAmpCodding 394 395 /** Check for numeric range. 
If hex is true the A..F range is included */ 396 private boolean isNumber(char ch, boolean hex) { 397 if(ch >= '0' && ch <= '9') return true; 398 399 if(hex) { 400 if(ch >= 'A' && ch <= 'F') return true; 401 if(ch >= 'a' && ch <= 'f') return true; 402 } // if 403 404 return false; 405 } // isNumber 406 407 /** HTML parser perform substitution of multiple whitespaces (WS) with 408 * a single WS. To create correct repositioning information structure we 409 * should keep the information for such multiple WS. 410 * <BR> 411 * The criteria for WS is <code>(ch <= ' ')</code>. 412 */ 413 private void collectInformationForWS(String content, RepositioningInfo info) { 414 415 if(content == null || info == null) return; 416 417 // analyse the content and correct the repositioning information 418 char ch; 419 int startWS, endWS; 420 421 startWS = endWS = -1; 422 int contentLength = content.length(); 423 424 for(int i=0; i<contentLength; ++i) { 425 ch = content.charAt(i); 426 427 // is whitespace 428 if(ch <= ' ') { 429 if(startWS == -1) { 430 startWS = i; 431 } // if 432 endWS = i; 433 } 434 else { 435 if(endWS - startWS > 0) { 436 // put the repositioning information about the WS substitution 437 info.addPositionInfo( 438 (long)startWS, (long)(endWS - startWS + 1), 0, 1); 439 } // if 440 // clear positions 441 startWS = endWS = -1; 442 }// if 443 } // for 444 } // collectInformationForWS 445 446 /** Clear all the data members of the object. 
*/ 447 public void cleanup() { 448 449 defaultAnnots = null; 450 if ( (namedAnnotSets != null) && (!namedAnnotSets.isEmpty())) 451 namedAnnotSets.clear(); 452 if (DEBUG) Out.prln("Document cleanup called"); 453 if (this.lrPersistentId != null) 454 Gate.getCreoleRegister().removeCreoleListener(this); 455 if(this.getDataStore() != null) 456 this.getDataStore().removeDatastoreListener(this); 457 } // cleanup() 458 459 460 /** Documents are identified by URLs */ 461 public URL getSourceUrl() { return sourceUrl; } 462 463 /** Set method for the document's URL */ 464 public void setSourceUrl(URL sourceUrl) { 465 this.sourceUrl = sourceUrl; 466 } // setSourceUrl 467 468 /** Documents may be packed within files; in this case an optional pair of 469 * offsets refer to the location of the document. 470 */ 471 public Long[] getSourceUrlOffsets() { 472 Long[] sourceUrlOffsets = new Long[2]; 473 sourceUrlOffsets[0] = sourceUrlStartOffset; 474 sourceUrlOffsets[1] = sourceUrlEndOffset; 475 return sourceUrlOffsets; 476 } // getSourceUrlOffsets 477 478 /** 479 * Allow/disallow preserving of the original document content. 480 * If is <B>true</B> the original content will be retrieved from 481 * the DocumentContent object and preserved as document feature. 482 */ 483 public void setPreserveOriginalContent(Boolean b) { 484 preserveOriginalContent = b; 485 } // setPreserveOriginalContent 486 487 /** Get the preserving of content status of the Document. 488 * 489 * @return whether the Document should preserve it's original content. 490 */ 491 public Boolean getPreserveOriginalContent() { 492 return preserveOriginalContent; 493 } // getPreserveOriginalContent 494 495 /** 496 * Allow/disallow collecting of repositioning information. 
497 * If is <B>true</B> information will be retrieved and preserved 498 * as document feature.<BR> 499 * Preserving of repositioning information give the possibilities 500 * for converting of coordinates between the original document content and 501 * extracted from the document text. 502 */ 503 public void setCollectRepositioningInfo(Boolean b) { 504 collectRepositioningInfo = b; 505 } // setCollectRepositioningInfo 506 507 /** Get the collectiong and preserving of repositioning information 508 * for the Document. <BR> 509 * Preserving of repositioning information give the possibilities 510 * for converting of coordinates between the original document content and 511 * extracted from the document text. 512 * 513 * @return whether the Document should collect and preserve information. 514 */ 515 public Boolean getCollectRepositioningInfo() { 516 return collectRepositioningInfo; 517 } // getCollectRepositioningInfo 518 519 /** Documents may be packed within files; in this case an optional pair of 520 * offsets refer to the location of the document. This method gets the 521 * start offset. 522 */ 523 public Long getSourceUrlStartOffset() { return sourceUrlStartOffset; } 524 525 /** Documents may be packed within files; in this case an optional pair of 526 * offsets refer to the location of the document. This method sets the 527 * start offset. 528 */ 529 public void setSourceUrlStartOffset(Long sourceUrlStartOffset) { 530 this.sourceUrlStartOffset = sourceUrlStartOffset; 531 } // setSourceUrlStartOffset 532 533 /** Documents may be packed within files; in this case an optional pair of 534 * offsets refer to the location of the document. This method gets the 535 * end offset. 536 */ 537 public Long getSourceUrlEndOffset() { return sourceUrlEndOffset; } 538 539 /** Documents may be packed within files; in this case an optional pair of 540 * offsets refer to the location of the document. This method sets the 541 * end offset. 
542 */ 543 public void setSourceUrlEndOffset(Long sourceUrlEndOffset) { 544 this.sourceUrlEndOffset = sourceUrlEndOffset; 545 } // setSourceUrlStartOffset 546 547 /** The content of the document: a String for text; MPEG for video; etc. */ 548 public DocumentContent getContent() { return content; } 549 550 /** Set method for the document content */ 551 public void setContent(DocumentContent content) { this.content = content; } 552 553 /** Get the encoding of the document content source */ 554 public String getEncoding() { return encoding; } 555 556 /** Set the encoding of the document content source */ 557 public void setEncoding(String encoding) { this.encoding = encoding; } 558 559 /** Get the default set of annotations. The set is created if it 560 * doesn't exist yet. 561 */ 562 public AnnotationSet getAnnotations() { 563 if(defaultAnnots == null){ 564 defaultAnnots = new AnnotationSetImpl(this); 565 fireAnnotationSetAdded(new DocumentEvent( 566 this, DocumentEvent.ANNOTATION_SET_ADDED, null)); 567 }//if 568 return defaultAnnots; 569 } // getAnnotations() 570 571 /** Get a named set of annotations. Creates a new set if one with this 572 * name doesn't exist yet. 573 * If the provided name is null then it returns the default annotation set. 574 */ 575 public AnnotationSet getAnnotations(String name) { 576 if(name == null) return getAnnotations(); 577 if(namedAnnotSets == null) 578 namedAnnotSets = new HashMap(); 579 AnnotationSet namedSet = (AnnotationSet) namedAnnotSets.get(name); 580 581 if(namedSet == null) { 582 namedSet = new AnnotationSetImpl(this, name); 583 namedAnnotSets.put(name, namedSet); 584 585 DocumentEvent evt = new DocumentEvent( 586 this, DocumentEvent.ANNOTATION_SET_ADDED, name 587 ); 588 fireAnnotationSetAdded(evt); 589 } 590 return namedSet; 591 } // getAnnotations(name) 592 593 /** Make the document markup-aware. 
This will trigger the creation
   *  of a DocumentFormat object at Document initialisation time; the
   *  DocumentFormat object will unpack the markup in the Document and
   *  add it as annotations.
   *
   *  @param newMarkupAware markup awareness status.
   */
  public void setMarkupAware(Boolean newMarkupAware) {
    this.markupAware = newMarkupAware;
  }

  /** Get the markup awareness status of the Document.
   *  NOTE(review): the default value is set where the field is declared,
   *  which is not visible here; earlier versions of the comments in this
   *  class disagreed about it - confirm before relying on a default.
   *  @return whether the Document is markup aware.
   */
  public Boolean getMarkupAware() { return markupAware; }

  /** Returns an XML document aiming to preserve the original markups(
    * the original markup will be in the same place and format as it was
    * before processing the document) and include (if possible)
    * the annotations specified in the aSourceAnnotationSet.
    * It is equivalent to toXml(aSourceAnnotationSet, true).
    */
  public String toXml(Set aSourceAnnotationSet){
    return toXml(aSourceAnnotationSet, true);
  }

  /** Returns an XML document aiming to preserve the original markups(
    * the original markup will be in the same place and format as it was
    * before processing the document) and include (if possible)
    * the annotations specified in the aSourceAnnotationSet.
    * <P>
    * If the document carries both the original content and the repositioning
    * information features, the work is delegated to
    * saveAnnotationSetAsXmlInOrig. Otherwise a dumping set is built from the
    * original markups plus every source annotation which does not cross over
    * an annotation already in the dumping set, and that set is serialised.
    * <P>
    * <b>Warning:</b> Annotations from the aSourceAnnotationSet will be lost
    * if they will cause a crossed over situation; a warning is printed for
    * each of them.
    * @param  aSourceAnnotationSet is an annotation set containing all the
    * annotations that will be combined with the original markup set. If the
    * param is <code>null</code> it will only dump the original markups.
    * @param includeFeatures is a boolean that controls whether the annotation
    * features should be included or not. If false, only the annotation type
    * is included in the tag.
    * @return a string representing an XML document containing the original
    * markup + dumped annotations from the aSourceAnnotationSet
    */
  public String toXml(Set aSourceAnnotationSet, boolean includeFeatures){

    // documents which kept their original content are serialised inside it
    if(hasOriginalContentFeatures()) {
      return saveAnnotationSetAsXmlInOrig(aSourceAnnotationSet,includeFeatures);
    } // if

    AnnotationSet originalMarkupsAnnotSet =
            this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);

    // Create a dumping annotation set on the document. It will be used for
    // dumping annotations...
    AnnotationSet dumpingSet = new AnnotationSetImpl((Document) this);

    // This set will be constructed inside this method. If is not empty, the
    // annotation contained will be lost.
    if (!dumpingSet.isEmpty()){
      Out.prln("WARNING: The dumping annotation set was not empty."+
      "All annotation it contained were lost.");
      dumpingSet.clear();
    }// End if

    StatusListener sListener = (StatusListener)
                               gate.gui.MainFrame.getListeners().
                               get("gate.event.StatusListener");
    // Construct the dumping set in that way that all annotations will verify
    // the condition that there are not annotations which are crossed.
    // First add all annotation from the original markups
    if(sListener != null)
      sListener.statusChanged("Constructing the dumping annotation set.");
    dumpingSet.addAll(originalMarkupsAnnotSet);
    // Then take all the annotations from aSourceAnnotationSet and verify if
    // they can be inserted safely into the dumpingSet. Where not possible,
    // report.
    if (aSourceAnnotationSet != null){
      Iterator iter = aSourceAnnotationSet.iterator();
      while (iter.hasNext()){
        Annotation currentAnnot = (Annotation) iter.next();
        if(insertsSafety(dumpingSet,currentAnnot)){
          dumpingSet.add(currentAnnot);
        }else if (crossedOverAnnotation != null){
          // insertsSafety left the conflicting annotation in
          // crossedOverAnnotation; report both and drop the current one
          try {
            Out.prln("Warning: Annotations were found to violate the " +
              "crossed over condition: \n" +
              "1. [" +
              getContent().getContent(
                     crossedOverAnnotation.getStartNode().getOffset(),
                     crossedOverAnnotation.getEndNode().getOffset()) +
              " (" + crossedOverAnnotation.getType() + ": " +
              crossedOverAnnotation.getStartNode().getOffset() +
              ";" + crossedOverAnnotation.getEndNode().getOffset() +
              ")]\n" +
              "2. [" +
              getContent().getContent(
                     currentAnnot.getStartNode().getOffset(),
                     currentAnnot.getEndNode().getOffset()) +
              " (" + currentAnnot.getType() + ": " +
              currentAnnot.getStartNode().getOffset() +
              ";" + currentAnnot.getEndNode().getOffset() +
              ")]\nThe second one will be discarded.\n"  );
          } catch (gate.util.InvalidOffsetException ex) {
            throw new GateRuntimeException(ex.getMessage());
          }
        }// End if
      }// End while
    }// End if

    // The dumpingSet is ready to be exported as XML
    // Here we go.
    if(sListener != null) sListener.statusChanged("Dumping annotations as XML");
    // the buffer is pre-sized from the content size
    StringBuffer xmlDoc = new StringBuffer(
          DOC_SIZE_MULTIPLICATION_FACTOR*(this.getContent().size().intValue()));

    // Add xml header if original format was xml
    String mimeType = getFeatures() == null ?
                      null :
                      (String)getFeatures().get("MimeType");
    boolean wasXML = mimeType != null && mimeType.equalsIgnoreCase("text/xml");
    if(wasXML){
      // the header names the document encoding, or the platform default
      // (falling back to UTF-8) when the document has none
      String defaultEncoding = System.getProperty("file.encoding");
      if(defaultEncoding == null) defaultEncoding = "UTF-8";
      xmlDoc.append("<?xml version=\"1.0\" encoding=\"" +
                    (encoding == null ? defaultEncoding : encoding) +
                    "\" ?>" + Strings.getNl());
      // (a GatePreserveFormat root start element used to be added here;
      // that code is disabled)
    }

    xmlDoc.append(saveAnnotationSetAsXml(dumpingSet, includeFeatures));

    // NOTE(review): rootEnd is declared and filled outside this part of the
    // file - presumably the end tag of the root element; confirm
    xmlDoc.append(rootEnd);
    if(sListener != null) sListener.statusChanged("Done.");
    return xmlDoc.toString();
  }//End toXml()

  /** This method verifies if aSourceAnnotation can be inserted safely into
    * the aTargetAnnotSet. Safely means that it doesn't violate the crossed
    * over condition with any annotation from the aTargetAnnotSet.
    * @param aTargetAnnotSet the annotation set to include the aSourceAnnotation
    * @param aSourceAnnotation the annotation to be inserted into the
    * aTargetAnnotSet
    * @return true if the annotation inserts safely, or false otherwise.
746 */ 747 private boolean insertsSafety(AnnotationSet aTargetAnnotSet, 748 Annotation aSourceAnnotation){ 749 750 if (aTargetAnnotSet == null || aSourceAnnotation == null) { 751 this.crossedOverAnnotation = null; 752 return false; 753 } 754 if (aSourceAnnotation.getStartNode() == null || 755 aSourceAnnotation.getStartNode().getOffset()== null) { 756 this.crossedOverAnnotation = null; 757 return false; 758 } 759 if (aSourceAnnotation.getEndNode() == null || 760 aSourceAnnotation.getEndNode().getOffset()== null) { 761 this.crossedOverAnnotation = null; 762 return false; 763 } 764 765 // Get the start and end offsets 766 Long start = aSourceAnnotation.getStartNode().getOffset(); 767 Long end = aSourceAnnotation.getEndNode().getOffset(); 768 // Read aSourceAnnotation offsets long 769 long s2 = start.longValue(); 770 long e2 = end.longValue(); 771 772 // Obtain a set with all annotations annotations that overlap 773 // totaly or partially with the interval defined by the two provided offsets 774 AnnotationSet as = aTargetAnnotSet.get(start,end); 775 776 // Investigate all the annotations from as to see if there is one that 777 // comes in conflict with aSourceAnnotation 778 Iterator it = as.iterator(); 779 while(it.hasNext()){ 780 Annotation ann = (Annotation) it.next(); 781 // Read ann offsets 782 long s1 = ann.getStartNode().getOffset().longValue(); 783 long e1 = ann.getEndNode().getOffset().longValue(); 784 785 if (s1<s2 && s2<e1 && e1<e2) { 786 this.crossedOverAnnotation = ann; 787 return false; 788 } 789 if (s2<s1 && s1<e2 && e2<e1) { 790 this.crossedOverAnnotation = ann; 791 return false; 792 } 793 }// End while 794 return true; 795 }// insertsSafety() 796 797 /** This method saves all the annotations from aDumpAnnotSet and combines 798 * them with the document content. 799 * @param aDumpAnnotationSet is a GATE annotation set prepared to be used 800 * on the raw text from document content. 
If aDumpAnnotSet is <b>null<b> 801 * then an empty string will be returned. 802 * @param includeFeatures is a boolean, which controls whether the annotation 803 * features and gate ID are included or not. 804 * @return The XML document obtained from raw text + the information from 805 * the dump annotation set. 806 */ 807 private String saveAnnotationSetAsXml(AnnotationSet aDumpAnnotSet, 808 boolean includeFeatures){ 809 String content = null; 810 if (this.getContent()== null) 811 content = new String(""); 812 else 813 content = this.getContent().toString(); 814 StringBuffer docContStrBuff = filterNonXmlChars(new StringBuffer(content)); 815 if (aDumpAnnotSet == null) return docContStrBuff.toString(); 816 817 TreeMap offsets2CharsMap = new TreeMap(); 818 if (this.getContent().size().longValue() != 0){ 819 // Fill the offsets2CharsMap with all the indices where 820 // special chars appear 821 buildEntityMapFromString(content,offsets2CharsMap); 822 }//End if 823 // The saving alghorithm is as follows: 824 /////////////////////////////////////////// 825 // Construct a set of annot with all IDs in asc order. 826 // All annotations that end at that offset swap their place in descending 827 // order. For each node write all the tags from left to right. 828 829 // Construct the node set 830 TreeSet offsets = new TreeSet(); 831 Iterator iter = aDumpAnnotSet.iterator(); 832 while (iter.hasNext()){ 833 Annotation annot = (Annotation) iter.next(); 834 offsets.add(annot.getStartNode().getOffset()); 835 offsets.add(annot.getEndNode().getOffset()); 836 //compute the smallest ID 837 if(smallestAnnotationID == null || 838 smallestAnnotationID.compareTo(annot.getId()) > 0){ 839 smallestAnnotationID = annot.getId(); 840 } 841 }// End while 842 843 // ofsets is sorted in ascending order. 
844 // Iterate this set in descending order and remove an offset at each 845 // iteration 846 while (!offsets.isEmpty()){ 847 Long offset = (Long)offsets.last(); 848 // Remove the offset from the set 849 offsets.remove(offset); 850 // Now, use it. 851 // Returns a list with annotations that needs to be serialized in that 852 // offset. 853 List annotations = getAnnotationsForOffset(aDumpAnnotSet,offset); 854 // Attention: the annotation are serialized from left to right 855 StringBuffer tmpBuff = new StringBuffer(""); 856 Stack stack = new Stack(); 857 // Iterate through all these annotations and serialize them 858 Iterator it = annotations.iterator(); 859 while(it.hasNext()){ 860 Annotation a = (Annotation) it.next(); 861 it.remove(); 862 // Test if a Ends at offset 863 if ( offset.equals(a.getEndNode().getOffset()) ){ 864 // Test if a Starts at offset 865 if ( offset.equals(a.getStartNode().getOffset()) ){ 866 // Here, the annotation a Starts and Ends at the offset 867 if ( null != a.getFeatures().get("isEmptyAndSpan") && 868 "true".equals((String)a.getFeatures().get("isEmptyAndSpan"))){ 869 870 // Assert: annotation a with start == end and isEmptyAndSpan 871 tmpBuff.append(writeStartTag(a, includeFeatures)); 872 stack.push(a); 873 }else{ 874 // Assert annotation a with start == end and an empty tag 875 tmpBuff.append(writeEmptyTag(a)); 876 // The annotation is removed from dumped set 877 aDumpAnnotSet.remove(a); 878 }// End if 879 }else{ 880 // Here the annotation a Ends at the offset. 881 // In this case empty the stack and write the end tag 882 if (!stack.isEmpty()){ 883 while(!stack.isEmpty()){ 884 Annotation a1 = (Annotation)stack.pop(); 885 tmpBuff.append(writeEndTag(a1)); 886 }// End while 887 }// End if 888 tmpBuff.append(writeEndTag(a)); 889 }// End if 890 }else{ 891 // The annotation a does NOT end at the offset. 
Let's see if it starts 892 // at the offset 893 if ( offset.equals(a.getStartNode().getOffset()) ){ 894 // The annotation a starts at the offset. 895 // In this case empty the stack and write the end tag 896 if (!stack.isEmpty()){ 897 while(!stack.isEmpty()){ 898 Annotation a1 = (Annotation)stack.pop(); 899 tmpBuff.append(writeEndTag(a1)); 900 }// End while 901 }// End if 902 tmpBuff.append(writeStartTag(a, includeFeatures)); 903 // The annotation is removed from dumped set 904 aDumpAnnotSet.remove(a); 905 }// End if ( offset.equals(a.getStartNode().getOffset()) ) 906 }// End if ( offset.equals(a.getEndNode().getOffset()) ) 907 }// End while(it.hasNext()){ 908 909 // In this case empty the stack and write the end tag 910 if (!stack.isEmpty()){ 911 while(!stack.isEmpty()){ 912 Annotation a1 = (Annotation)stack.pop(); 913 tmpBuff.append(writeEndTag(a1)); 914 }// End while 915 }// End if 916 917 // Before inserting tmpBuff into docContStrBuff we need to check 918 // if there are chars to be replaced and if there are, they would be 919 // replaced. 920 if (!offsets2CharsMap.isEmpty()){ 921 Integer offsChar = (Integer) offsets2CharsMap.lastKey(); 922 while( !offsets2CharsMap.isEmpty() && 923 offsChar.intValue() >= offset.intValue()){ 924 // Replace the char at offsChar with its corresponding entity form 925 // the entitiesMap. 926 docContStrBuff.replace(offsChar.intValue(),offsChar.intValue()+1, 927 (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar))); 928 // Discard the offsChar after it was used. 
929 offsets2CharsMap.remove(offsChar); 930 // Investigate next offsChar 931 if (!offsets2CharsMap.isEmpty()) 932 offsChar = (Integer) offsets2CharsMap.lastKey(); 933 }// End while 934 }// End if 935 // Insert tmpBuff to the location where it belongs in docContStrBuff 936 docContStrBuff.insert(offset.intValue(),tmpBuff.toString()); 937 }// End while(!offsets.isEmpty()) 938 // Need to replace the entities in the remaining text, if there is any text 939 // So, if there are any more items in offsets2CharsMap they need to be 940 // replaced 941 while (!offsets2CharsMap.isEmpty()){ 942 Integer offsChar = (Integer) offsets2CharsMap.lastKey(); 943 // Replace the char with its entity 944 docContStrBuff.replace(offsChar.intValue(),offsChar.intValue()+1, 945 (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar))); 946 // remove the offset from the map 947 offsets2CharsMap.remove(offsChar); 948 }// End while 949 return docContStrBuff.toString(); 950 }// saveAnnotationSetAsXml() 951 952 /** 953 * Return true only if the document has features for original content and 954 * repositioning information. 955 */ 956 private boolean hasOriginalContentFeatures() { 957 FeatureMap features = getFeatures(); 958 boolean result = false; 959 960 result = 961 (features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME) != null) 962 && 963 (features.get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME) 964 != null); 965 966 return result; 967 } // hasOriginalContentFeatures 968 969 /** This method saves all the annotations from aDumpAnnotSet and combines 970 * them with the original document content, if preserved as feature. 971 * @param aDumpAnnotationSet is a GATE annotation set prepared to be used 972 * on the raw text from document content. If aDumpAnnotSet is <b>null<b> 973 * then an empty string will be returned. 974 * @param includeFeatures is a boolean, which controls whether the annotation 975 * features and gate ID are included or not. 
    * @return the original document content (an empty string when that
    * feature is absent) with the start, end and empty tags of the annotations
    * inserted at their repositioned offsets, followed by the root end tag.
    */
  private String saveAnnotationSetAsXmlInOrig(Set aSourceAnnotationSet,
                                        boolean includeFeatures){
    StringBuffer docContStrBuff;

    String origContent;

    // The original content is stored as a document feature; fall back to the
    // empty string when it is missing.
    origContent =
     (String)features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME);
    if(origContent == null) {
      origContent = "";
    } // if

    long originalContentSize = origContent.length();

    // NOTE(review): repositioning is dereferenced further down without a null
    // check; presumably callers test hasOriginalContentFeatures() first -
    // confirm.
    RepositioningInfo repositioning = (RepositioningInfo)
      getFeatures().get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME);

    docContStrBuff = new StringBuffer(origContent);
    // Nothing to dump: hand back the original content untouched.
    if (aSourceAnnotationSet == null) return docContStrBuff.toString();

    StatusListener sListener = (StatusListener)
                               gate.gui.MainFrame.getListeners().
                               get("gate.event.StatusListener");

    AnnotationSet originalMarkupsAnnotSet =
            this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
    // Create a dumping annotation set on the document. It will be used for
    // dumping annotations...
    AnnotationSet dumpingSet = new AnnotationSetImpl((Document) this);
    if(sListener != null)
      sListener.statusChanged("Constructing the dumping annotation set.");
    // Then take all the annotations from aSourceAnnotationSet and verify if
    // they can be inserted safely into the dumpingSet. Where not possible,
    // report.
    // NOTE(review): dumpingSet is only filled here; the serialisation below
    // iterates aSourceAnnotationSet, so an annotation reported as "discarded"
    // is still written out. Confirm whether dumpingSet was meant to be used.
    // The null test is redundant (null was handled by the early return above).
    if (aSourceAnnotationSet != null){
      Iterator iter = aSourceAnnotationSet.iterator();
      Annotation currentAnnot;
      while (iter.hasNext()){
        currentAnnot = (Annotation) iter.next();
        if(insertsSafety(originalMarkupsAnnotSet, currentAnnot)
           && insertsSafety(dumpingSet, currentAnnot)){
          dumpingSet.add(currentAnnot);
        }else{
          Out.prln("Warning: Annotation with ID=" + currentAnnot.getId() +
          ", startOffset=" + currentAnnot.getStartNode().getOffset() +
          ", endOffset=" + currentAnnot.getEndNode().getOffset() +
          ", type=" + currentAnnot.getType()+ " was found to violate the" +
          " crossed over condition. It will be discarded");
        }// End if
      }// End while
    }// End if

    // The dumpingSet is ready to be exported as XML
    // Here we go.
    if(sListener != null) sListener.statusChanged("Dumping annotations as XML");

    ///////////////////////////////////////////
    // Collect every distinct start/end offset, then process the offsets from
    // the largest to the smallest so that inserting text at one offset never
    // shifts the positions that are still to be processed.
    // For each offset all the tags are written from left to right.

    // Construct the node set
    TreeSet offsets = new TreeSet();
    Iterator iter = aSourceAnnotationSet.iterator();
    while (iter.hasNext()){
      Annotation annot = (Annotation) iter.next();
      offsets.add(annot.getStartNode().getOffset());
      offsets.add(annot.getEndNode().getOffset());
    }// End while

    // offsets is sorted in ascending order.
    // Iterate this set in descending order and remove an offset at each
    // iteration
    while (!offsets.isEmpty()){
      Long offset = (Long)offsets.last();
      // Remove the offset from the set
      offsets.remove(offset);
      // Now, use it.
      // Returns a list with annotations that needs to be serialized in that
      // offset.
      List annotations = getAnnotationsForOffset(aSourceAnnotationSet,offset);
      // Attention: the annotation are serialized from left to right
      StringBuffer tmpBuff = new StringBuffer("");
      // Holds zero-length "isEmptyAndSpan" annotations whose start tag has been
      // written and whose end tag is still pending.
      Stack stack = new Stack();
      // Iterate through all these annotations and serialize them
      Iterator it = annotations.iterator();
      // Declared outside the loop: after the loop it refers to the LAST
      // annotation handled at this offset and drives backPositioning below.
      Annotation a = null;
      while(it.hasNext()) {
        a = (Annotation) it.next();
        it.remove();
        // Test if a Ends at offset
        if ( offset.equals(a.getEndNode().getOffset()) ){
          // Test if a Starts at offset
          if ( offset.equals(a.getStartNode().getOffset()) ){
            // Here, the annotation a Starts and Ends at the offset
            if ( null != a.getFeatures().get("isEmptyAndSpan") &&
                 "true".equals((String)a.getFeatures().get("isEmptyAndSpan"))){

              // Assert: annotation a with start == end and isEmptyAndSpan
              tmpBuff.append(writeStartTag(a, includeFeatures, false));
              stack.push(a);
            }else{
              // Assert annotation a with start == end and an empty tag
              tmpBuff.append(writeEmptyTag(a, false));
              // The annotation is removed from the set passed in by the
              // caller (the caller's set is modified).
              aSourceAnnotationSet.remove(a);
            }// End if
          }else{
            // Here the annotation a Ends at the offset.
            // In this case empty the stack and write the end tag
            while(!stack.isEmpty()){
              Annotation a1 = (Annotation)stack.pop();
              tmpBuff.append(writeEndTag(a1));
            }// End while
            tmpBuff.append(writeEndTag(a));
          }// End if
        }else{
          // The annotation a does NOT end at the offset. Let's see if it starts
          // at the offset
          if ( offset.equals(a.getStartNode().getOffset()) ){
            // The annotation a starts at the offset.
            // In this case empty the stack and write the end tag
            while(!stack.isEmpty()){
              Annotation a1 = (Annotation)stack.pop();
              tmpBuff.append(writeEndTag(a1));
            }// End while

            tmpBuff.append(writeStartTag(a, includeFeatures, false));
            // The start tag was the last thing left to write for this
            // annotation, so it is removed from the caller's set.
            aSourceAnnotationSet.remove(a);
          }// End if ( offset.equals(a.getStartNode().getOffset()) )
        }// End if ( offset.equals(a.getEndNode().getOffset()) )
      }// End while(it.hasNext()){

      // Close whatever isEmptyAndSpan annotations are still open
      while(!stack.isEmpty()){
        Annotation a1 = (Annotation)stack.pop();
        tmpBuff.append(writeEndTag(a1));
      }// End while

      // Map the offset in the current content back to a position in the
      // original content. -1 means "no position found".
      long originalPosition = -1;
      boolean backPositioning =
        a != null && offset.equals(a.getEndNode().getOffset());
      if ( backPositioning ) {
        // end of the annotation correction
        originalPosition =
          repositioning.getOriginalPos(offset.intValue(), true);
      } // if

      // Fall back to the plain lookup when the end-correction lookup was not
      // attempted or failed.
      if(originalPosition == -1) {
        originalPosition = repositioning.getOriginalPos(offset.intValue());
      } // if

      // Insert tmpBuff to the location where it belongs in docContStrBuff
      if(originalPosition != -1 && originalPosition <= originalContentSize ) {
        docContStrBuff.insert((int) originalPosition, tmpBuff.toString());
      }
      else {
        // The tags for this offset are dropped; only a message is printed.
        Out.prln("Error in the repositioning. The offset ("+offset.intValue()
        +") could not be positioned in the original document. \n"
        +"Calculated position is: "+originalPosition
        +" placed back: "+backPositioning);
      } // if

    }// End while(!offsets.isEmpty())
    // writeEndTag() withholds the root element's end tag and stores it in
    // rootEnd, so it is appended once, at the very end.
    docContStrBuff.append(rootEnd);
    return docContStrBuff.toString();
  } // saveAnnotationSetAsXmlInOrig()

  /** This method returns a list with annotations ordered that way that
    * they can be serialized from left to right, at the offset. If one of the
    * params is null then an empty list will be returned.
1151 * @param aDumpAnnotSet is a set containing all annotations that will be 1152 * dumped. 1153 * @param offset represent the offset at witch the annotation must start 1154 * AND/OR end. 1155 * @return a list with those annotations that need to be serialized. 1156 */ 1157 private List getAnnotationsForOffset(Set aDumpAnnotSet, Long offset){ 1158 List annotationList = new LinkedList(); 1159 if (aDumpAnnotSet == null || offset == null) return annotationList; 1160 Set annotThatStartAtOffset = new TreeSet( 1161 new AnnotationComparator(ORDER_ON_END_OFFSET,DESC)); 1162 Set annotThatEndAtOffset = new TreeSet( 1163 new AnnotationComparator(ORDER_ON_START_OFFSET,DESC)); 1164 Set annotThatStartAndEndAtOffset = new TreeSet( 1165 new AnnotationComparator(ORDER_ON_ANNOT_ID,ASC)); 1166 1167 // Fill these tree lists with annotation tat start, end or start and 1168 // end at the offset. 1169 Iterator iter = aDumpAnnotSet.iterator(); 1170 while(iter.hasNext()){ 1171 Annotation ann = (Annotation) iter.next(); 1172 if (offset.equals(ann.getStartNode().getOffset())){ 1173 if (offset.equals(ann.getEndNode().getOffset())) 1174 annotThatStartAndEndAtOffset.add(ann); 1175 else 1176 annotThatStartAtOffset.add(ann); 1177 }else{ 1178 if (offset.equals(ann.getEndNode().getOffset())) 1179 annotThatEndAtOffset.add(ann); 1180 }// End if 1181 }// End while 1182 annotationList.addAll(annotThatEndAtOffset); 1183 annotThatEndAtOffset = null; 1184 annotationList.addAll(annotThatStartAtOffset); 1185 annotThatStartAtOffset = null; 1186 iter = annotThatStartAndEndAtOffset.iterator(); 1187 while(iter.hasNext()){ 1188 Annotation ann = (Annotation) iter.next(); 1189 Iterator it = annotationList.iterator(); 1190 boolean breaked = false; 1191 while (it.hasNext()){ 1192 Annotation annFromList = (Annotation) it.next(); 1193 if (annFromList.getId().intValue() > ann.getId().intValue()){ 1194 annotationList.add(annotationList.indexOf(annFromList),ann); 1195 breaked = true; 1196 break; 1197 }// End if 1198 }// 
End while 1199 if (!breaked) 1200 annotationList.add(ann); 1201 iter.remove(); 1202 }// End while 1203 return annotationList; 1204 }// getAnnotationsForOffset() 1205 1206 private String writeStartTag(Annotation annot, boolean includeFeatures){ 1207 return writeStartTag(annot, includeFeatures, true); 1208 } // writeStartTag 1209 1210 /** Returns a string representing a start tag based on the input annot*/ 1211 private String writeStartTag(Annotation annot, boolean includeFeatures, 1212 boolean includeNamespace){ 1213 AnnotationSet originalMarkupsAnnotSet = 1214 this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME); 1215 1216 StringBuffer strBuff = new StringBuffer(""); 1217 if (annot == null) return strBuff.toString(); 1218// if (!addGatePreserveFormatTag && isRootTag){ 1219 if (annot.getId().equals(smallestAnnotationID)){ 1220 //the features are included either if desired or if that's an annotation 1221 //from the original markup of the document. We don't want for example to 1222 //spoil all links in an HTML file! 
1223 if (includeFeatures) { 1224 strBuff.append("<"); 1225 strBuff.append(annot.getType()); 1226 strBuff.append(" "); 1227 if(includeNamespace) { 1228 strBuff.append(" xmlns:gate=\"http://www.gate.ac.uk\""); 1229 strBuff.append(" gate:"); 1230 } 1231 strBuff.append("gateId=\""); 1232 strBuff.append(annot.getId()); 1233 strBuff.append("\""); 1234 strBuff.append(" "); 1235 if(includeNamespace) { 1236 strBuff.append("gate:"); 1237 } 1238 strBuff.append("annotMaxId=\""); 1239 strBuff.append(nextAnnotationId); 1240 strBuff.append("\""); 1241 strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace)); 1242 strBuff.append(">"); 1243 } 1244 else if (originalMarkupsAnnotSet.contains(annot)) { 1245 strBuff.append("<"); 1246 strBuff.append(annot.getType()); 1247 strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace)); 1248 strBuff.append(">"); 1249 } 1250 else { 1251 strBuff.append("<"); 1252 strBuff.append(annot.getType()); 1253 strBuff.append(">"); 1254 } 1255 1256 }else{ 1257 //the features are included either if desired or if that's an annotation 1258 //from the original markup of the document. We don't want for example to 1259 //spoil all links in an HTML file! 
1260 if (includeFeatures) { 1261 strBuff.append("<"); 1262 strBuff.append(annot.getType()); 1263 strBuff.append(" "); 1264 if(includeNamespace) { 1265 strBuff.append("gate:"); 1266 } // if includeNamespaces 1267 strBuff.append("gateId=\""); 1268 strBuff.append(annot.getId()); 1269 strBuff.append("\""); 1270 strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace)); 1271 strBuff.append(">"); 1272 } 1273 else if (originalMarkupsAnnotSet.contains(annot)) { 1274 strBuff.append("<"); 1275 strBuff.append(annot.getType()); 1276 strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace)); 1277 strBuff.append(">"); 1278 } 1279 else { 1280 strBuff.append("<"); 1281 strBuff.append(annot.getType()); 1282 strBuff.append(">"); 1283 } 1284 }// End if 1285 return strBuff.toString(); 1286 }// writeStartTag() 1287 1288 /** This method takes aScanString and searches for those chars from 1289 * entitiesMap that appear in the string. A tree map(offset2Char) is filled 1290 * using as key the offsets where those Chars appear and the Char. 1291 * If one of the params is null the method simply returns. 
1292 */ 1293 private void buildEntityMapFromString(String aScanString, TreeMap aMapToFill){ 1294 if (aScanString == null || aMapToFill == null) return; 1295 if (entitiesMap == null || entitiesMap.isEmpty()){ 1296 Err.prln("WARNING: Entities map was not initialised !"); 1297 return; 1298 }// End if 1299 // Fill the Map with the offsets of the special chars 1300 Iterator entitiesMapIterator = entitiesMap.keySet().iterator(); 1301 while(entitiesMapIterator.hasNext()){ 1302 Character c = (Character) entitiesMapIterator.next(); 1303 int fromIndex = 0; 1304 while (-1 != fromIndex){ 1305 fromIndex = aScanString.indexOf(c.charValue(),fromIndex); 1306 if (-1 != fromIndex){ 1307 aMapToFill.put(new Integer(fromIndex),c); 1308 fromIndex ++; 1309 }// End if 1310 }// End while 1311 }// End while 1312 }//buildEntityMapFromString(); 1313 1314 private String writeEmptyTag(Annotation annot){ 1315 return writeEmptyTag(annot, true); 1316 } // writeEmptyTag 1317 1318 /** Returns a string representing an empty tag based on the input annot*/ 1319 private String writeEmptyTag(Annotation annot, boolean includeNamespace){ 1320 StringBuffer strBuff = new StringBuffer(""); 1321 if (annot == null) return strBuff.toString(); 1322 1323 strBuff.append("<"); 1324 strBuff.append(annot.getType()); 1325 1326 AnnotationSet originalMarkupsAnnotSet = 1327 this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME); 1328 if (! 
originalMarkupsAnnotSet.contains(annot)) { 1329 strBuff.append(" gateId=\""); 1330 strBuff.append(annot.getId()); 1331 strBuff.append("\""); 1332 } 1333 strBuff.append(writeFeatures(annot.getFeatures(),includeNamespace)); 1334 strBuff.append("/>"); 1335 1336 return strBuff.toString(); 1337 }// writeEmptyTag() 1338 1339 /** Returns a string representing an end tag based on the input annot*/ 1340 private String writeEndTag(Annotation annot){ 1341 StringBuffer strBuff = new StringBuffer(""); 1342 if (annot == null) return strBuff.toString(); 1343/* 1344 if (annot.getType().indexOf(" ") != -1) 1345 Out.prln("Warning: Truncating end tag to first word for annot type \"" 1346 +annot.getType()+ "\". "); 1347*/ 1348 strBuff.append("</"+annot.getType()+">"); 1349 1350 //don't write the end for the root element as it will be added 1351 //automatically at the end. 1352 if(annot.getId().equals(smallestAnnotationID)){ 1353 rootEnd = strBuff.toString(); 1354 return ""; 1355 } 1356 return strBuff.toString(); 1357 }// writeEndTag() 1358 1359 /** Returns a string representing a FeatureMap serialized as XML attributes*/ 1360 private String writeFeatures(FeatureMap feat, boolean includeNamespace){ 1361 StringBuffer strBuff = new StringBuffer(""); 1362 if (feat == null) return strBuff.toString(); 1363 Iterator it = feat.keySet().iterator(); 1364 while (it.hasNext()){ 1365 Object key = it.next(); 1366 Object value = feat.get(key); 1367 if ( (key != null) && (value != null) ){ 1368 // Eliminate a feature inserted at reading time and which help to 1369 // take some decissions at saving time 1370 if ("isEmptyAndSpan".equals(key.toString())) 1371 continue; 1372 if( !(String.class.isAssignableFrom(key.getClass()) || 1373 Number.class.isAssignableFrom(key.getClass()))){ 1374 1375 Out.prln("Warning:Found a feature NAME("+key+") that doesn't came"+ 1376 " from String or Number.(feature discarded)"); 1377 continue; 1378 }// End if 1379 if ( !(String.class.isAssignableFrom(value.getClass()) || 
1380 Number.class.isAssignableFrom(value.getClass()) || 1381 java.util.Collection.class.isAssignableFrom(value.getClass()))){ 1382 1383 Out.prln("Warning:Found a feature VALUE("+value+") that doesn't came"+ 1384 " from String, Number or Collection.(feature discarded)"); 1385 continue; 1386 }// End if 1387 if ("matches".equals(key)) { 1388 strBuff.append(" "); 1389 if(includeNamespace) { 1390 strBuff.append("gate:"); 1391 } 1392// strBuff.append(key); 1393 // replace non XML chars in attribute name 1394 strBuff.append( 1395 filterNonXmlChars(replaceCharsWithEntities(key.toString()))); 1396 strBuff.append("=\""); 1397 } 1398 else { 1399 strBuff.append(" "); 1400// strBuff.append(key); 1401 // replace non XML chars in attribute name 1402 strBuff.append( 1403 filterNonXmlChars(replaceCharsWithEntities(key.toString()))); 1404 strBuff.append("=\""); 1405 } 1406 if (java.util.Collection.class.isAssignableFrom(value.getClass())){ 1407 Iterator valueIter = ((Collection)value).iterator(); 1408 while(valueIter.hasNext()){ 1409 Object item = valueIter.next(); 1410 if (!(String.class.isAssignableFrom(item.getClass()) || 1411 Number.class.isAssignableFrom(item.getClass()))) 1412 continue; 1413// strBuff.append(item); 1414 // replace non XML chars in collection item 1415 strBuff.append( 1416 filterNonXmlChars(replaceCharsWithEntities(item.toString()))); 1417 strBuff.append(";"); 1418 }// End while 1419 if (strBuff.charAt(strBuff.length()-1) == ';') 1420 strBuff.deleteCharAt(strBuff.length()-1); 1421 }else{ 1422// strBuff.append(value); 1423 // replace non XML chars in attribute value 1424 strBuff.append( 1425 filterNonXmlChars(replaceCharsWithEntities(value.toString()))); 1426 }// End if 1427 strBuff.append("\""); 1428 }// End if 1429 }// End while 1430 return strBuff.toString(); 1431 }// writeFeatures() 1432 1433 /** Returns a GateXml document that is a custom XML format for wich there is 1434 * a reader inside GATE called gate.xml.GateFormatXmlHandler. 
1435 * What it does is to serialize a GATE document in an XML format. 1436 * @return a string representing a Gate Xml document. If saved in a file,this 1437 * string must be written using the UTF-8 encoding because the first line 1438 * in the generated xml document is <?xml version="1.0" encoding="UTF-8" ?> 1439 */ 1440 public String toXml(){ 1441 // Initialize the xmlContent with 3 time the size of the current document. 1442 // This is because of the tags size. This measure is made to increase the 1443 // performance of StringBuffer. 1444 StringBuffer xmlContent = new StringBuffer( 1445 DOC_SIZE_MULTIPLICATION_FACTOR*(getContent().size().intValue())); 1446 // Add xml header 1447 xmlContent.append("<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n"); 1448 // Add the root element 1449 xmlContent.append("<GateDocument>\n"); 1450 xmlContent.append("<!-- The document's features-->\n\n"); 1451 xmlContent.append("<GateDocumentFeatures>\n"); 1452 1453 xmlContent.append(featuresToXml(this.getFeatures())); 1454 xmlContent.append("</GateDocumentFeatures>\n"); 1455 xmlContent.append("<!-- The document content area with serialized"+ 1456 " nodes -->\n\n"); 1457 // Add plain text element 1458 xmlContent.append("<TextWithNodes>"); 1459 xmlContent.append(textWithNodes(this.getContent().toString())); 1460 xmlContent.append("</TextWithNodes>\n"); 1461 // Serialize as XML all document's annotation sets 1462 // Serialize the default AnnotationSet 1463 StatusListener sListener = (StatusListener) 1464 gate.gui.MainFrame.getListeners(). 1465 get("gate.event.StatusListener"); 1466 if(sListener != null) 1467 sListener.statusChanged("Saving the default annotation set "); 1468 xmlContent.append("<!-- The default annotation set -->\n\n"); 1469 xmlContent.append(annotationSetToXml(this.getAnnotations())); 1470 // Serialize all others AnnotationSets 1471 // namedAnnotSets is a Map containing all other named Annotation Sets. 
1472 if (namedAnnotSets != null){ 1473 Iterator iter = namedAnnotSets.values().iterator(); 1474 while(iter.hasNext()){ 1475 AnnotationSet annotSet = (AnnotationSet) iter.next(); 1476 xmlContent.append("<!-- Named annotation set -->\n\n"); 1477 // Serialize it as XML 1478 if(sListener != null) sListener.statusChanged("Saving " + 1479 annotSet.getName()+ 1480 " annotation set "); 1481 xmlContent.append(annotationSetToXml(annotSet)); 1482 }// End while 1483 }// End if 1484 // Add the end of GateDocument 1485 xmlContent.append("</GateDocument>"); 1486 if(sListener != null) sListener.statusChanged("Done !"); 1487 // return the XmlGateDocument 1488 return xmlContent.toString(); 1489 }// toXml 1490 1491 /** This method filters any non XML char 1492 * see: http://www.w3c.org/TR/2000/REC-xml-20001006#charsets 1493 * All non XML chars will be replaced with 0x20 (space char) This assures 1494 * that the next time the document is loaded there won't be any problems. 1495 * @param aStrBuffer represents the input String that is filtred. If the 1496 * aStrBuffer is null then an empty string will be returend 1497 * @return the "purified" StringBuffer version of the aStrBuffer 1498 */ 1499 private StringBuffer filterNonXmlChars(StringBuffer aStrBuffer){ 1500 if (aStrBuffer == null) return new StringBuffer(""); 1501 String space = new String(" "); 1502 for (int i=aStrBuffer.length()-1;i>=0; i--){ 1503 if (!isXmlChar(aStrBuffer.charAt(i))) 1504 aStrBuffer.replace(i,i+1,space); 1505 }// End for 1506 return aStrBuffer; 1507 }// filterNonXmlChars() 1508 1509 /** This method decide if a char is a valid XML one or not 1510 * @param ch the char to be tested 1511 * @return true if is a valid XML char and fals if is not. 
1512 */ 1513 public static boolean isXmlChar(char ch){ 1514 if (ch == 0x9 || ch == 0xA || ch ==0xD) return true; 1515 if ((0x20 <= ch) && (ch <= 0xD7FF)) return true; 1516 if ((0xE000 <= ch) && (ch <= 0xFFFD)) return true; 1517 if ((0x10000 <= ch) && (ch <= 0x10FFFF)) return true; 1518 return false; 1519 }// End isXmlChar() 1520 1521 /** This method saves a FeatureMap as XML elements. 1522 * @ param aFeatureMap the feature map that has to be saved as XML. 1523 * @ return a String like this: <Feature><Name>...</Name> 1524 * <Value>...</Value></Feature><Feature>...</Feature> 1525 */ 1526 private String featuresToXml(FeatureMap aFeatureMap){ 1527 StringBuffer str = new StringBuffer(""); 1528 1529 if (aFeatureMap == null) return str.toString(); 1530 1531 Set keySet = aFeatureMap.keySet(); 1532 Iterator keyIterator = keySet.iterator(); 1533 while(keyIterator.hasNext()){ 1534 Object key = keyIterator.next(); 1535 Object value = aFeatureMap.get(key); 1536 if ((key != null) && (value != null)){ 1537 String keyClassName = null; 1538 String keyItemClassName = null; 1539 String valueClassName = null; 1540 String valueItemClassName = null; 1541 String key2String = key.toString(); 1542 String value2String = value.toString(); 1543 1544 Object item = null; 1545 // Test key if it is String, Number or Collection 1546 if (key instanceof java.lang.String || 1547 key instanceof java.lang.Number || 1548 key instanceof java.util.Collection) 1549 keyClassName = key.getClass().getName(); 1550 1551 // Test value if it is String, Number or Collection 1552 if (value instanceof java.lang.String || 1553 value instanceof java.lang.Number || 1554 value instanceof java.util.Collection) 1555 valueClassName = value.getClass().getName(); 1556 1557 // Features and values that are not Strings, Numbers or collections 1558 // will be discarded. 
1559 if (keyClassName == null || valueClassName == null) continue; 1560 1561 // If key is collection serialize the colection in a specific format 1562 if (key instanceof java.util.Collection){ 1563 StringBuffer keyStrBuff = new StringBuffer(""); 1564 Iterator iter = ((Collection) key).iterator(); 1565 if (iter.hasNext()){ 1566 item = iter.next(); 1567 if (item instanceof java.lang.Number) 1568 keyItemClassName = item.getClass().getName(); 1569 else 1570 keyItemClassName = String.class.getName(); 1571 keyStrBuff.append(item.toString()); 1572 }// End if 1573 while (iter.hasNext()){ 1574 item = iter.next(); 1575 keyStrBuff.append(";" + item.toString()); 1576 }// End while 1577 key2String = keyStrBuff.toString(); 1578 }// End if 1579 // If key is collection serialize the colection in a specific format 1580 if (value instanceof java.util.Collection){ 1581 StringBuffer valueStrBuff = new StringBuffer(""); 1582 Iterator iter = ((Collection) value).iterator(); 1583 if (iter.hasNext()){ 1584 item = iter.next(); 1585 if (item instanceof java.lang.Number) 1586 valueItemClassName = item.getClass().getName(); 1587 else 1588 valueItemClassName = String.class.getName(); 1589 valueStrBuff.append(item.toString()); 1590 }// End if 1591 while (iter.hasNext()){ 1592 item = iter.next(); 1593 valueStrBuff.append(";" + item.toString()); 1594 }// End while 1595 value2String = valueStrBuff.toString(); 1596 }// End if 1597 str.append("<Feature>\n <Name"); 1598 if (keyClassName != null) 1599 str.append(" className=\""+keyClassName+"\""); 1600 if (keyItemClassName != null) 1601 str.append(" itemClassName=\""+keyItemClassName+"\""); 1602 str.append(">"); 1603 str.append(filterNonXmlChars(replaceCharsWithEntities(key2String))); 1604 str.append("</Name>\n <Value"); 1605 if (valueClassName != null) 1606 str.append(" className=\"" + valueClassName + "\""); 1607 if (valueItemClassName != null) 1608 str.append(" itemClassName=\"" + valueItemClassName + "\""); 1609 str.append(">"); 1610 
str.append(filterNonXmlChars(replaceCharsWithEntities(value2String))); 1611 str.append("</Value>\n</Feature>\n"); 1612 }// End if 1613 }// end While 1614 return str.toString(); 1615 }//featuresToXml 1616 1617 /** This method replace all chars that appears in the anInputString and also 1618 * that are in the entitiesMap with their corresponding entity 1619 * @param anInputString the string analyzed. If it is null then returns the 1620 * empty string 1621 * @return a string representing the input string with chars replaced with 1622 * entities 1623 */ 1624 private StringBuffer replaceCharsWithEntities(String anInputString){ 1625 if (anInputString == null) return new StringBuffer(""); 1626 StringBuffer strBuff = new StringBuffer(anInputString); 1627 for (int i=strBuff.length()-1; i>=0; i--){ 1628 Character ch = new Character(strBuff.charAt(i)); 1629 if (entitiesMap.keySet().contains(ch)){ 1630 strBuff.replace(i,i+1,(String) entitiesMap.get(ch)); 1631 }// End if 1632 }// End for 1633 return strBuff; 1634 }//replaceCharsWithEntities() 1635 1636 /** This method creates Node XML elements and inserts them at the 1637 * corresponding offset inside the text. Nodes are created from the default 1638 * annotation set, as well as from all existing named annotation sets. 1639 * @param aText The text representing the document's plain text. 1640 * @return The text with empty <Node id="NodeId"/> elements. 
1641 */ 1642 private String textWithNodes(String aText){ 1643 if (aText == null) return new String(""); 1644 StringBuffer textWithNodes = filterNonXmlChars(new StringBuffer(aText)); 1645 1646 // Construct a map from offsets to Chars 1647 TreeMap offsets2CharsMap = new TreeMap(); 1648 if (aText.length()!= 0){ 1649 // Fill the offsets2CharsMap with all the indices where special chars appear 1650 buildEntityMapFromString(aText,offsets2CharsMap); 1651 }//End if 1652 // Construct the offsetsSet for all nodes belonging to this document 1653 TreeSet offsetsSet = new TreeSet(); 1654 Iterator annotSetIter = this.getAnnotations().iterator(); 1655 while (annotSetIter.hasNext()){ 1656 Annotation annot = (Annotation) annotSetIter.next(); 1657 offsetsSet.add(annot.getStartNode().getOffset()); 1658 offsetsSet.add(annot.getEndNode().getOffset()); 1659 }// end While 1660 // Get the nodes from all other named annotation sets. 1661 if (namedAnnotSets != null){ 1662 Iterator iter = namedAnnotSets.values().iterator(); 1663 while(iter.hasNext()){ 1664 AnnotationSet annotSet = (AnnotationSet) iter.next(); 1665 Iterator iter2 = annotSet.iterator(); 1666 while(iter2.hasNext()){ 1667 Annotation annotTmp = (Annotation) iter2.next(); 1668 offsetsSet.add(annotTmp.getStartNode().getOffset()); 1669 offsetsSet.add(annotTmp.getEndNode().getOffset()); 1670 }// End while 1671 }// End while 1672 }// End if 1673 // offsetsSet is ordered in ascending order because the structure 1674 // is a TreeSet 1675 1676 if (offsetsSet.isEmpty()){ 1677 return replaceCharsWithEntities(aText).toString(); 1678 }// End if 1679 // Iterate through all nodes from anAnnotSet and transform them to 1680 // XML elements. Then insert those elements at the node's offset into the 1681 // textWithNodes . 
1682 while (!offsetsSet.isEmpty()){ 1683 Long offset = (Long) offsetsSet.last(); 1684 // Eliminate the offset from the list in order to create more memory space 1685 offsetsSet.remove(offset); 1686 // Use offset 1687 int offsetValue = offset.intValue(); 1688 String strNode = "<Node id=\"" + offsetValue + "\"/>"; 1689 // Before inserting this string into the textWithNodes, check to see if 1690 // there are any chars to be replaced with their corresponding entities 1691 if (!offsets2CharsMap.isEmpty()){ 1692 Integer offsChar = (Integer) offsets2CharsMap.lastKey(); 1693 while( !offsets2CharsMap.isEmpty() && 1694 offsChar.intValue() >= offset.intValue()){ 1695 // Replace the char at offsChar with its corresponding entity form 1696 // the entitiesMap. 1697 textWithNodes.replace(offsChar.intValue(),offsChar.intValue()+1, 1698 (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar))); 1699 // Discard the offsChar after it was used because this offset will 1700 // never appear again 1701 offsets2CharsMap.remove(offsChar); 1702 // Investigate next offsChar 1703 if (!offsets2CharsMap.isEmpty()) 1704 offsChar = (Integer) offsets2CharsMap.lastKey(); 1705 }// End while 1706 }// End if 1707 // Now it is safe to insert the node 1708 textWithNodes.insert(offsetValue,strNode); 1709 }// end while 1710 // Need to replace the entities in the remaining text, if there is any text 1711 // So, if there are any more items in offsets2CharsMap they need to be 1712 // replaced 1713 while (!offsets2CharsMap.isEmpty()){ 1714 Integer offsChar = (Integer) offsets2CharsMap.lastKey(); 1715 // Replace the char with its entity 1716 textWithNodes.replace(offsChar.intValue(),offsChar.intValue()+1, 1717 (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar))); 1718 // remove the offset from the map 1719 offsets2CharsMap.remove(offsChar); 1720 }// End while 1721 return textWithNodes.toString(); 1722 }//textWithNodes() 1723 1724 /** This method saves an AnnotationSet as XML. 
1725 * @param anAnnotationSet The annotation set that has to be saved as XML. 1726 * @return a String like this: <AnnotationSet> <Annotation>.... 1727 * </AnnotationSet> 1728 */ 1729 private String annotationSetToXml(AnnotationSet anAnnotationSet){ 1730 StringBuffer str = new StringBuffer(""); 1731 1732 if (anAnnotationSet == null){ 1733 str.append("<AnnotationSet>\n"); 1734 str.append("</AnnotationSet>\n"); 1735 return str.toString(); 1736 }// End if 1737 if (anAnnotationSet.getName() == null) 1738 str.append("<AnnotationSet>\n"); 1739 else str.append("<AnnotationSet Name=\"" + anAnnotationSet.getName()+ 1740 "\" >\n"); 1741 // Iterate through AnnotationSet and save each Annotation as XML 1742 Iterator iterator = anAnnotationSet.iterator(); 1743 while (iterator.hasNext()){ 1744 Annotation annot = (Annotation) iterator.next(); 1745 str.append("<Annotation " + "Type=\"" + annot.getType() + 1746 "\" StartNode=\"" + annot.getStartNode().getOffset() + 1747 "\" EndNode=\"" + annot.getEndNode().getOffset() + "\">\n"); 1748 str.append(featuresToXml(annot.getFeatures())); 1749 str.append("</Annotation>\n"); 1750 }// End while 1751 1752 str.append("</AnnotationSet>\n"); 1753 return str.toString(); 1754 }// annotationSetToXml 1755 1756 /** Returns a map with the named annotation sets. It returns <code>null</code> 1757 * if no named annotaton set exists. */ 1758 public Map getNamedAnnotationSets() { 1759 return namedAnnotSets; 1760 } // getNamedAnnotationSets 1761 1762 /** 1763 * Removes one of the named annotation sets. 1764 * Note that the default annotation set cannot be removed. 1765 * @param name the name of the annotation set to be removed 1766 */ 1767 public void removeAnnotationSet(String name){ 1768 Object removed = namedAnnotSets.remove(name); 1769 if(removed != null){ 1770 fireAnnotationSetRemoved( 1771 new DocumentEvent(this, DocumentEvent.ANNOTATION_SET_REMOVED, name)); 1772 } 1773 } 1774 1775 /** Propagate edit changes to the document content and annotations. 
*/ 1776 public void edit(Long start, Long end, DocumentContent replacement) 1777 throws InvalidOffsetException 1778 { 1779 if(! isValidOffsetRange(start, end)) 1780 throw new InvalidOffsetException(); 1781 1782 if(content != null) 1783 ((DocumentContentImpl) content).edit(start, end, replacement); 1784 1785 if(defaultAnnots != null) 1786 ((AnnotationSetImpl) defaultAnnots).edit(start, end, replacement); 1787 1788 if(namedAnnotSets != null) { 1789 Iterator iter = namedAnnotSets.values().iterator(); 1790 while(iter.hasNext()) 1791 ((AnnotationSetImpl) iter.next()).edit(start, end, replacement); 1792 } 1793 1794 } // edit(start,end,replacement) 1795 1796 /** Check that an offset is valid, i.e. it is non-null, greater than 1797 * or equal to 0 and less than the size of the document content. 1798 */ 1799 public boolean isValidOffset(Long offset) { 1800 if(offset == null) 1801 return false; 1802 1803 long o = offset.longValue(); 1804 if(o > getContent().size().longValue() || o < 0) 1805 return false; 1806 1807 return true; 1808 } // isValidOffset 1809 1810 /** Check that both start and end are valid offsets and that 1811 * they constitute a valid offset range, i.e. start is greater 1812 * than or equal to long. 
1813 */ 1814 public boolean isValidOffsetRange(Long start, Long end) { 1815 return 1816 isValidOffset(start) && isValidOffset(end) && 1817 start.longValue() <= end.longValue(); 1818 } // isValidOffsetRange(start,end) 1819 1820 /** Sets the nextAnnotationId */ 1821 public void setNextAnnotationId(int aNextAnnotationId){ 1822 nextAnnotationId = aNextAnnotationId; 1823 }// setNextAnnotationId(); 1824 1825 /** Generate and return the next annotation ID */ 1826 public Integer getNextAnnotationId() { 1827 return new Integer(nextAnnotationId++); 1828 } // getNextAnnotationId 1829 1830 /** Generate and return the next node ID */ 1831 public Integer getNextNodeId() { return new Integer(nextNodeId++); } 1832 1833 /** Ordering based on URL.toString() and the URL offsets (if any) */ 1834 public int compareTo(Object o) throws ClassCastException { 1835 DocumentImpl other = (DocumentImpl) o; 1836 return getOrderingString().compareTo(other.getOrderingString()); 1837 } // compareTo 1838 1839 /** Utility method to produce a string for comparison in ordering. 1840 * String is based on the source URL and offsets. 1841 */ 1842 protected String getOrderingString() { 1843 if(sourceUrl == null) return toString(); 1844 1845 StringBuffer orderingString = new StringBuffer(sourceUrl.toString()); 1846 if(sourceUrlStartOffset != null && sourceUrlEndOffset != null) { 1847 orderingString.append(sourceUrlStartOffset.toString()); 1848 orderingString.append(sourceUrlEndOffset.toString()); 1849 } 1850 1851 return orderingString.toString(); 1852 } // getOrderingString() 1853 1854 /** The id of the next new annotation */ 1855 protected int nextAnnotationId = 0; 1856 1857 /** The id of the next new node */ 1858 protected int nextNodeId = 0; 1859 /** The source URL */ 1860 protected URL sourceUrl; 1861 1862 /** The document's URL name. 
*/

  /** The content of the document */
  protected DocumentContent content;

  /** The encoding of the source of the document content */
  protected String encoding = "UTF-8";

  // Data needed in toXml(AnnotationSet) methods

  /** This field indicates whether or not to add the tag
    * called GatePreserveFormat to the document. HTML, XML, SGML docs won't
    * have this tag added
    */
//  private boolean addGatePreserveFormatTag = false;

  /**
   * Used by the XML dump preserving format method to remember the smallest
   * annotation ID as a marker for the XML document root.
   */
  private Integer smallestAnnotationID = null;

  /**
   * The closing tag for the document root.
   */
  private String rootEnd;

  /** This field is used when creating StringBuffers for toXml() methods.
    * The size of the StringBuffer will be the document content size
    * multiplied by this value. It is aimed to improve the performance of
    * StringBuffer
    */
  private final int DOC_SIZE_MULTIPLICATION_FACTOR = 1;

  /** Constant used in the inner class AnnotationComparator to order
    * annotations on their start offset
    */
  private final int ORDER_ON_START_OFFSET = 0;
  /** Constant used in the inner class AnnotationComparator to order
    * annotations on their end offset
    */
  private final int ORDER_ON_END_OFFSET = 1;
  /** Constant used in the inner class AnnotationComparator to order
    * annotations on their ID
    */
  private final int ORDER_ON_ANNOT_ID = 2;
  /** Constant used in the inner class AnnotationComparator to order
    * annotations ascending
    */
  private final int ASC = 3;
  /** Constant used in the inner class AnnotationComparator to order
    * annotations descending
    */
  private final int DESC = -3;

  /** A map, filled in by a static initializer, from the characters that
    * need to be replaced in strings to their replacement strings
    */
private static Map entitiesMap = null; 1920 // Initialize the entities map use when saving as xml 1921 static{ 1922 entitiesMap = new HashMap(); 1923 entitiesMap.put(new Character('<'),"<"); 1924 entitiesMap.put(new Character('>'),">"); 1925 entitiesMap.put(new Character('&'),"&"); 1926 entitiesMap.put(new Character('\''),"'"); 1927 entitiesMap.put(new Character('"'),"""); 1928 entitiesMap.put(new Character((char)160)," "); 1929 entitiesMap.put(new Character((char)169),"©"); 1930 }//static 1931 1932 /** The range that the content comes from at the source URL 1933 * (or null if none). 1934 */ 1935 //protected Long[] sourceUrlOffsets; 1936 1937 /** The start of the range that the content comes from at the source URL 1938 * (or null if none). 1939 */ 1940 protected Long sourceUrlStartOffset; 1941 1942 /** The end of the range that the content comes from at the source URL 1943 * (or null if none). 1944 */ 1945 protected Long sourceUrlEndOffset; 1946 1947 /** The default annotation set */ 1948 protected AnnotationSet defaultAnnots; 1949 1950 /** Named sets of annotations */ 1951 protected Map namedAnnotSets; 1952 1953 /** 1954 * A property of the document that will be set when the user 1955 * wants to create the document from a string, as opposed to from 1956 * a URL. 1957 */ 1958 private String stringContent; 1959 1960 /** 1961 * The stringContent of a document is 1962 * a property of the document that will be set when the user 1963 * wants to create the document from a string, as opposed to from 1964 * a URL. 1965 * <B>Use the <TT>getContent</TT> method instead to get the actual document 1966 * content.</B> 1967 */ 1968 public String getStringContent() { return stringContent; } 1969 1970 /** 1971 * The stringContent of a document is 1972 * a property of the document that will be set when the user 1973 * wants to create the document from a string, as opposed to from 1974 * a URL. 
* <B>Use the <TT>setContent</TT> method instead to update the actual
   * document content.</B>
   * @param stringContent the new value of the stringContent property
   */
  public void setStringContent(String stringContent) {
    this.stringContent = stringContent;
  } // set StringContent

  /** Is the document markup-aware? Defaults to false. */
  // The shared Boolean.FALSE constant is used rather than allocating a new
  // Boolean instance for every document.
  protected Boolean markupAware = Boolean.FALSE;

//  /** Hash code */
//  public int hashCode() {
//    int code = getContent().hashCode();
//    int memberCode = (defaultAnnots == null) ? 0 : defaultAnnots.hashCode();
//    code += memberCode;
//    memberCode = (encoding == null) ? 0 : encoding.hashCode();
//    code += memberCode;
//    memberCode = (features == null) ? 0 : features.hashCode();
//    code += memberCode;
//    code += (markupAware.booleanValue()) ? 0 : 1;
//    memberCode = (namedAnnotSets == null) ? 0 : namedAnnotSets.hashCode();
//    code += memberCode;
//    code += nextAnnotationId;
//    code += nextNodeId;
//    memberCode = (sourceUrl == null) ? 0 : sourceUrl.hashCode();
//    code += memberCode;
//    memberCode =
//      (sourceUrlStartOffset == null) ? 0 : sourceUrlStartOffset.hashCode();
//    code += memberCode;
//    memberCode =
//      (sourceUrlEndOffset == null) ?
0 : sourceUrlEndOffset.hashCode(); 2006// code += memberCode; 2007// return code; 2008// } // hashcode 2009 2010 /** String respresentation */ 2011 public String toString() { 2012 String n = Strings.getNl(); 2013 StringBuffer s = new StringBuffer("DocumentImpl: " + n); 2014 s.append(" content:" + content + n); 2015 s.append(" defaultAnnots:" + defaultAnnots + n); 2016 s.append(" encoding:" + encoding + n); 2017 s.append(" features:" + features + n); 2018 s.append(" markupAware:" + markupAware + n); 2019 s.append(" namedAnnotSets:" + namedAnnotSets + n); 2020 s.append(" nextAnnotationId:" + nextAnnotationId + n); 2021 s.append(" nextNodeId:" + nextNodeId + n); 2022 s.append(" sourceUrl:" + sourceUrl + n); 2023 s.append(" sourceUrlStartOffset:" + sourceUrlStartOffset + n); 2024 s.append(" sourceUrlEndOffset:" + sourceUrlEndOffset + n); 2025 s.append(n); 2026 2027 return s.toString(); 2028 } // toString 2029 2030 /** Freeze the serialization UID. */ 2031 static final long serialVersionUID = -8456893608311510260L; 2032 2033 /** Inner class needed to compare annotations*/ 2034 class AnnotationComparator implements java.util.Comparator { 2035 int orderOn = -1; 2036 int orderType = ASC; 2037 /** Constructs a comparator according to one of three sorter types: 2038 * ORDER_ON_ANNOT_TYPE, ORDER_ON_END_OFFSET, ORDER_ON_START_OFFSET 2039 */ 2040 public AnnotationComparator(int anOrderOn, int anOrderType){ 2041 orderOn = anOrderOn; 2042 orderType = anOrderType; 2043 }// AnnotationComparator() 2044 2045 /**This method must be implemented according to Comparator interface */ 2046 public int compare(Object o1, Object o2){ 2047 Annotation a1 = (Annotation) o1; 2048 Annotation a2 = (Annotation) o2; 2049 // ORDER_ON_START_OFFSET ? 2050 if (orderOn == ORDER_ON_START_OFFSET){ 2051 int result = a1.getStartNode().getOffset().compareTo( 2052 a2.getStartNode().getOffset()); 2053 if (orderType == ASC){ 2054 // ASC 2055 // If they are equal then their ID will decide. 
2056 if (result == 0) 2057 return a1.getId().compareTo(a2.getId()); 2058 return result; 2059 }else{ 2060 // DESC 2061 if (result == 0) 2062 return - (a1.getId().compareTo(a2.getId())); 2063 return -result; 2064 }// End if (orderType == ASC) 2065 }// End if (orderOn == ORDER_ON_START_OFFSET) 2066 2067 // ORDER_ON_END_OFFSET ? 2068 if (orderOn == ORDER_ON_END_OFFSET){ 2069 int result = a1.getEndNode().getOffset().compareTo( 2070 a2.getEndNode().getOffset()); 2071 if (orderType == ASC){ 2072 // ASC 2073 // If they are equal then their ID will decide. 2074 if (result == 0) 2075 return - (a1.getId().compareTo(a2.getId())); 2076 return result; 2077 }else{ 2078 // DESC 2079 // If they are equal then their ID will decide. 2080 if (result == 0) 2081 return a1.getId().compareTo(a2.getId()); 2082 return - result; 2083 }// End if (orderType == ASC) 2084 }// End if (orderOn == ORDER_ON_END_OFFSET) 2085 2086 // ORDER_ON_ANNOT_ID ? 2087 if (orderOn == ORDER_ON_ANNOT_ID){ 2088 if (orderType == ASC) 2089 return a1.getId().compareTo(a2.getId()); 2090 else 2091 return -(a1.getId().compareTo(a2.getId())); 2092 }// End if 2093 return 0; 2094 }//compare() 2095 } // End inner class AnnotationComparator 2096 2097 2098 private transient Vector documentListeners; 2099 private transient Vector gateListeners; 2100 2101 public synchronized void removeDocumentListener(DocumentListener l) { 2102 if (documentListeners != null && documentListeners.contains(l)) { 2103 Vector v = (Vector) documentListeners.clone(); 2104 v.removeElement(l); 2105 documentListeners = v; 2106 } 2107 } 2108 public synchronized void addDocumentListener(DocumentListener l) { 2109 Vector v = documentListeners == null ? 
new Vector(2) : (Vector) documentListeners.clone(); 2110 if (!v.contains(l)) { 2111 v.addElement(l); 2112 documentListeners = v; 2113 } 2114 } 2115 2116 protected void fireAnnotationSetAdded(DocumentEvent e) { 2117 if (documentListeners != null) { 2118 Vector listeners = documentListeners; 2119 int count = listeners.size(); 2120 for (int i = 0; i < count; i++) { 2121 ((DocumentListener) listeners.elementAt(i)).annotationSetAdded(e); 2122 } 2123 } 2124 } 2125 2126 protected void fireAnnotationSetRemoved(DocumentEvent e) { 2127 if (documentListeners != null) { 2128 Vector listeners = documentListeners; 2129 int count = listeners.size(); 2130 for (int i = 0; i < count; i++) { 2131 ((DocumentListener) listeners.elementAt(i)).annotationSetRemoved(e); 2132 } 2133 } 2134 } 2135 public void resourceLoaded(CreoleEvent e) { 2136 } 2137 public void resourceUnloaded(CreoleEvent e) { 2138 } 2139 public void datastoreOpened(CreoleEvent e) { 2140 } 2141 public void datastoreCreated(CreoleEvent e) { 2142 } 2143 public void resourceRenamed(Resource resource, String oldName, 2144 String newName){ 2145 } 2146 public void datastoreClosed(CreoleEvent e) { 2147 if (! e.getDatastore().equals(this.getDataStore())) 2148 return; 2149 //close this lr, since it cannot stay open when the DS it comes from 2150 //is closed 2151 Factory.deleteResource(this); 2152 } 2153 public void setLRPersistenceId(Object lrID) { 2154 super.setLRPersistenceId( lrID); 2155 //make persistent documents listen to the creole register 2156 //for events about their DS 2157 Gate.getCreoleRegister().addCreoleListener(this); 2158 } 2159 public void resourceAdopted(DatastoreEvent evt) { 2160 } 2161 public void resourceDeleted(DatastoreEvent evt) { 2162 if(! 
evt.getSource().equals(this.getDataStore())) 2163 return; 2164 //if an open document is deleted from a DS, then 2165 //it must close itself immediately, as is no longer valid 2166 if(evt.getResourceID().equals(this.getLRPersistenceId())) 2167 Factory.deleteResource(this); 2168 } 2169 public void resourceWritten(DatastoreEvent evt) { 2170 } 2171 public void setDataStore(DataStore dataStore) throws gate.persist.PersistenceException { 2172 super.setDataStore( dataStore); 2173 if (this.dataStore != null) 2174 this.dataStore.addDatastoreListener(this); 2175 } 2176 2177} // class DocumentImpl 2178
|
DocumentImpl |
|