|
DocumentImpl |
|
1 /* 2 * DocumentImpl.java 3 * 4 * Copyright (c) 1998-2001, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Hamish Cunningham, 11/Feb/2000 12 * 13 * $Id: DocumentImpl.java,v 1.118 2003/01/20 15:45:06 valyt Exp $ 14 */ 15 16 package gate.corpora; 17 18 import java.util.*; 19 import java.net.*; 20 import java.io.*; 21 22 import gate.*; 23 import gate.annotation.*; 24 import gate.util.*; 25 import gate.creole.*; 26 import gate.gui.*; 27 import gate.event.*; 28 29 /** Represents the commonalities between all sorts of documents. 30 * 31 * <H2>Editing</H2> 32 * 33 * <P> 34 * The DocumentImpl class implements the Document interface. 35 * The DocumentContentImpl class models the textual or audio-visual 36 * materials which are the source and content of Documents. 37 * The AnnotationSetImpl class supplies annotations on Documents. 38 * 39 * <P> 40 * Abbreviations: 41 * 42 * <UL> 43 * <LI> 44 * DC = DocumentContent 45 * <LI> 46 * D = Document 47 * <LI> 48 * AS = AnnotationSet 49 * </UL> 50 * 51 * <P> 52 * We add an edit method to each of these classes; for DC and AS 53 * the methods are package private; D has the public method. 54 * 55 * <PRE> 56 * void edit(Long start, Long end, DocumentContent replacement) 57 * throws InvalidOffsetException; 58 * </PRE> 59 * 60 * <P> 61 * D receives edit requests and forwards them to DC and AS. 62 * On DC, this method makes a change to the content - e.g. replacing 63 * a String range from start to end with replacement. (Deletions 64 * are catered for by having replacement = null.) D then calls 65 * AS.edit on each of its annotation sets. 66 * 67 * <P> 68 * On AS, edit calls replacement.size() (i.e. DC.size()) to 69 * figure out how long the replacement is (0 for null). 
It then 70 * considers annotations that terminate (start or end) in 71 * the altered or deleted range as invalid; annotations that 72 * terminate after the range have their offsets adjusted. 73 * I.e.: 74 * <UL> 75 * <LI> 76 * the nodes that pointed inside the old modified area are invalid now and 77 * will be deleted along with the connected annotations; 78 * <LI> 79 * the nodes that are before the start of the modified area remain 80 * untouched; 81 * <LI> 82 * the nodes that are after the end of the affected area will have the 83 * offset changed according to the formula below. 84 * </UL> 85 * 86 * <P> 87 * A note re. AS and annotations: annotations no longer have 88 * offsets as in the old model, they now have nodes, and nodes 89 * have offsets. 90 * 91 * <P> 92 * To implement AS.edit, we have several indices: 93 * <PRE> 94 * HashMap annotsByStartNode, annotsByEndNode; 95 * </PRE> 96 * which map node ids to annotations; 97 * <PRE> 98 * RBTreeMap nodesByOffset; 99 * </PRE> 100 * which maps offset to Nodes. 101 * 102 * <P> 103 * When we get an edit request, we traverse that part of the 104 * nodesByOffset tree representing the altered or deleted 105 * range of the DC. For each node found, we delete any annotations 106 * that terminate on the node, and then delete the node itself. 107 * We then traverse the rest of the tree, changing the offset 108 * on all remaining nodes by: 109 * <PRE> 110 * newOffset = 111 * oldOffset - 112 * ( 113 * (end - start) - // size of mod 114 * ( (replacement == null) ? 0 : replacement.size() ) // size of repl 115 * ); 116 * </PRE> 117 * Note that we use the same convention as e.g. java.lang.String: start 118 * offsets are inclusive; end offsets are exclusive. I.e. for string "abcd" 119 * range 1-3 = "bc". 
Examples, for a node with offset 4: 120 * <PRE> 121 * edit(1, 3, "BC"); 122 * newOffset = 4 - ( (3 - 1) - 2 ) = 4 123 * 124 * edit(1, 3, null); 125 * newOffset = 4 - ( (3 - 1) - 0 ) = 2 126 * 127 * edit(1, 3, "BBCC"); 128 * newOffset = 4 - ( (3 - 1) - 4 ) = 6 129 * </PRE> 130 */ 131 public class DocumentImpl 132 extends AbstractLanguageResource implements TextualDocument, CreoleListener, 133 DatastoreListener { 134 /** Debug flag */ 135 private static final boolean DEBUG = false; 136 137 /** If you set this flag to true the original content of the document will 138 * be kept in the document feature. <br> 139 * Default value is false to avoid the unnecessary waste of memory */ 140 private Boolean preserveOriginalContent = new Boolean(false); 141 142 /** If you set this flag to true the repositioning information for 143 * the document will be kept in the document feature. <br> 144 * Default value is false to avoid the unnecessary waste of time and memory 145 */ 146 private Boolean collectRepositioningInfo = new Boolean(false); 147 148 /** 149 * This is a variable which contains the latest crossed over annotation 150 * found during export with preserving format, i.e., toXml(annotations) 151 * method. 152 */ 153 private Annotation crossedOverAnnotation = null; 154 155 /** Default construction. Content left empty. */ 156 public DocumentImpl() { 157 content = new DocumentContentImpl(); 158 } // default construction 159 160 /** Initialise this resource, and return it. */ 161 public Resource init() throws ResourceInstantiationException { 162 // set up the source URL and create the content 163 if(sourceUrl == null) { 164 if(stringContent == null) { 165 throw new ResourceInstantiationException( 166 "The sourceURL and document's content were null." 
167 ); 168 } 169 170 content = new DocumentContentImpl(stringContent); 171 getFeatures().put("gate.SourceURL", "created from String"); 172 } else { 173 try { 174 content = new DocumentContentImpl( 175 sourceUrl, getEncoding(), sourceUrlStartOffset, sourceUrlEndOffset); 176 getFeatures().put("gate.SourceURL", sourceUrl.toExternalForm()); 177 } catch(IOException e) { 178 e.printStackTrace(); 179 // throw new ResourceInstantiationException("DocumentImpl.init: " + e); 180 } 181 182 if(preserveOriginalContent.booleanValue() && content != null) { 183 String originalContent = new String( 184 ((DocumentContentImpl) content).getOriginalContent()); 185 getFeatures().put(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME, 186 originalContent); 187 } // if 188 } 189 190 // set up a DocumentFormat if markup unpacking required 191 if(getMarkupAware().booleanValue()) { 192 DocumentFormat docFormat = 193 DocumentFormat.getDocumentFormat(this, sourceUrl); 194 try { 195 if(docFormat != null){ 196 StatusListener sListener = (StatusListener) 197 gate.gui.MainFrame.getListeners(). 
198 get("gate.event.StatusListener"); 199 if(sListener != null) docFormat.addStatusListener(sListener); 200 201 // set the flag if true and if the document format support collecting 202 docFormat.setShouldCollectRepositioning(collectRepositioningInfo); 203 204 if(docFormat.getShouldCollectRepositioning().booleanValue()) { 205 // unpack with collectiong of repositioning information 206 RepositioningInfo info = new RepositioningInfo(); 207 208 String origContent = (String) getFeatures().get( 209 GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME); 210 211 RepositioningInfo ampCodingInfo = new RepositioningInfo(); 212 if(origContent != null) { 213 boolean shouldCorrectCR = docFormat instanceof XmlDocumentFormat; 214 collectInformationForAmpCodding(origContent, ampCodingInfo, 215 shouldCorrectCR); 216 if(docFormat instanceof HtmlDocumentFormat) { 217 collectInformationForWS(origContent, ampCodingInfo); 218 } // if 219 } // if 220 221 docFormat.unpackMarkup(this, info, ampCodingInfo); 222 223 if(origContent != null 224 && docFormat instanceof XmlDocumentFormat) { 225 // CRLF correction of RepositioningInfo 226 correctRepositioningForCRLFInXML(origContent, info); 227 } // if 228 229 getFeatures().put( 230 GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME, info); 231 } 232 else { 233 // normal old fashioned unpack 234 docFormat.unpackMarkup(this); 235 } 236 docFormat.removeStatusListener(sListener); 237 } //if format != null 238 } catch(DocumentFormatException e) { 239 throw new ResourceInstantiationException( 240 "Couldn't unpack markup in document " + sourceUrl.toExternalForm() + 241 " " + e 242 ); 243 } 244 } // if markup aware 245 246 //try{ 247 // FileWriter fw = new FileWriter("d:/temp/doccontent.txt"); 248 // fw.write(getContent().toString()); 249 // fw.flush(); 250 // fw.close(); 251 //}catch(IOException ioe){ 252 // ioe.printStackTrace(); 253 //} 254 255 return this; 256 } // init() 257 258 /** 259 * Correct repositioning information for substitution of 
"\r\n" with "\n" 260 */ 261 private void correctRepositioningForCRLFInXML(String content, 262 RepositioningInfo info) { 263 int index = -1; 264 265 do { 266 index = content.indexOf("\r\n", index+1); 267 if(index != -1) { 268 info.correctInformationOriginalMove(index, 1); 269 } // if 270 } while(index != -1); 271 } // correctRepositioningForCRLF 272 273 /** 274 * Collect information for substitution of "&xxx;" with "y" 275 * 276 * It couldn't be collected a position information about 277 * some unicode and &-coded symbols during parsing. The parser "hide" the 278 * information about the position of such kind of parsed text. 279 * So, there is minimal chance to have &-coded symbol inside the covered by 280 * repositioning records area. The new record should be created for every 281 * coded symbol outside the existing records. 282 * <BR> 283 * If <code>shouldCorrectCR</code> flag is <code>true</code> the correction 284 * for CRLF substitution is performed. 285 */ 286 private void collectInformationForAmpCodding(String content, 287 RepositioningInfo info, 288 boolean shouldCorrectCR) { 289 290 if(content == null || info == null) return; 291 292 int ampIndex = -1; 293 int semiIndex; 294 295 do { 296 ampIndex = content.indexOf('&', ampIndex+1); 297 if(ampIndex != -1) { 298 semiIndex = content.indexOf(';', ampIndex+1); 299 // have semicolon and it is near enough for amp codding 300 if(semiIndex != -1 && (semiIndex-ampIndex) < 8) { 301 info.addPositionInfo(ampIndex, semiIndex-ampIndex+1, 0, 1); 302 } 303 else { 304 // no semicolon or it is too far 305 // analyse for amp codding without semicolon 306 int maxEnd = Math.min(ampIndex+8, content.length()); 307 String ampCandidate = content.substring(ampIndex, maxEnd); 308 int ampCodingSize = analyseAmpCodding(ampCandidate); 309 310 if(ampCodingSize != -1) { 311 info.addPositionInfo(ampIndex, ampCodingSize, 0, 1); 312 } // if 313 314 } // if - semicolon found 315 } // if - ampersand found 316 } while (ampIndex != -1); 317 318 // 
correct the collected information to adjust it's positions 319 // with reported by the parser 320 int index = -1; 321 322 if(shouldCorrectCR) { 323 do { 324 index = content.indexOf("\r\n", index+1); 325 if(index != -1) { 326 info.correctInformationOriginalMove(index, -1); 327 } // if 328 } while(index != -1); 329 } // if 330 } // collectInformationForAmpCodding 331 332 /** 333 * This function compute size of the ampersand codded sequence when 334 * semicolin is not present. 335 */ 336 private int analyseAmpCodding(String content) { 337 int result = -1; 338 339 try { 340 char ch = content.charAt(1); 341 342 switch(ch) { 343 case 'l' : // < 344 case 'L' : // < 345 if(content.charAt(2) == 't' || content.charAt(2) == 'T') { 346 result = 3; 347 } // if 348 break; 349 case 'g' : // > 350 case 'G' : // > 351 if(content.charAt(2) == 't' || content.charAt(2) == 'T') { 352 result = 3; 353 } // if 354 break; 355 case 'a' : // & 356 case 'A' : // & 357 if(content.substring(2, 4).equalsIgnoreCase("mp")) { 358 result = 4; 359 } // if 360 break; 361 case 'q' : // " 362 case 'Q' : // " 363 if(content.substring(2, 5).equalsIgnoreCase("uot")) { 364 result = 5; 365 } // if 366 break; 367 case '#' : // #number (example ‘, 䰸) 368 int endIndex = 2; 369 boolean hexCoded = false; 370 if(content.charAt(2) == 'x' || content.charAt(2) == 'X') { 371 // Hex codding 372 ++endIndex; 373 hexCoded = true; 374 } // if 375 376 while (endIndex < 8 377 && isNumber(content.charAt(endIndex), hexCoded) ) { 378 ++endIndex; 379 } // while 380 result = endIndex; 381 break; 382 } // switch 383 } catch (StringIndexOutOfBoundsException ex) { 384 // do nothing 385 } // catch 386 387 return result; 388 } // analyseAmpCodding 389 390 /** Check for numeric range. 
If hex is true the A..F range is included */ 391 private boolean isNumber(char ch, boolean hex) { 392 if(ch >= '0' && ch <= '9') return true; 393 394 if(hex) { 395 if(ch >= 'A' && ch <= 'F') return true; 396 if(ch >= 'a' && ch <= 'f') return true; 397 } // if 398 399 return false; 400 } // isNumber 401 402 /** HTML parser perform substitution of multiple whitespaces (WS) with 403 * a single WS. To create correct repositioning information structure we 404 * should keep the information for such multiple WS. 405 * <BR> 406 * The criteria for WS is <code>(ch <= ' ')</code>. 407 */ 408 private void collectInformationForWS(String content, RepositioningInfo info) { 409 410 if(content == null || info == null) return; 411 412 // analyse the content and correct the repositioning information 413 char ch; 414 int startWS, endWS; 415 416 startWS = endWS = -1; 417 int contentLength = content.length(); 418 419 for(int i=0; i<contentLength; ++i) { 420 ch = content.charAt(i); 421 422 // is whitespace 423 if(ch <= ' ') { 424 if(startWS == -1) { 425 startWS = i; 426 } // if 427 endWS = i; 428 } 429 else { 430 if(endWS - startWS > 0) { 431 // put the repositioning information about the WS substitution 432 info.addPositionInfo( 433 (long)startWS, (long)(endWS - startWS + 1), 0, 1); 434 } // if 435 // clear positions 436 startWS = endWS = -1; 437 }// if 438 } // for 439 } // collectInformationForWS 440 441 /** Clear all the data members of the object. 
*/ 442 public void cleanup() { 443 444 defaultAnnots = null; 445 if ( (namedAnnotSets != null) && (!namedAnnotSets.isEmpty())) 446 namedAnnotSets.clear(); 447 if (DEBUG) Out.prln("Document cleanup called"); 448 if (this.lrPersistentId != null) 449 Gate.getCreoleRegister().removeCreoleListener(this); 450 if(this.getDataStore() != null) 451 this.getDataStore().removeDatastoreListener(this); 452 } // cleanup() 453 454 455 /** Documents are identified by URLs */ 456 public URL getSourceUrl() { return sourceUrl; } 457 458 /** Set method for the document's URL */ 459 public void setSourceUrl(URL sourceUrl) { 460 this.sourceUrl = sourceUrl; 461 } // setSourceUrl 462 463 /** Documents may be packed within files; in this case an optional pair of 464 * offsets refer to the location of the document. 465 */ 466 public Long[] getSourceUrlOffsets() { 467 Long[] sourceUrlOffsets = new Long[2]; 468 sourceUrlOffsets[0] = sourceUrlStartOffset; 469 sourceUrlOffsets[1] = sourceUrlEndOffset; 470 return sourceUrlOffsets; 471 } // getSourceUrlOffsets 472 473 /** 474 * Allow/disallow preserving of the original document content. 475 * If is <B>true</B> the original content will be retrieved from 476 * the DocumentContent object and preserved as document feature. 477 */ 478 public void setPreserveOriginalContent(Boolean b) { 479 preserveOriginalContent = b; 480 } // setPreserveOriginalContent 481 482 /** Get the preserving of content status of the Document. 483 * 484 * @return whether the Document should preserve it's original content. 485 */ 486 public Boolean getPreserveOriginalContent() { 487 return preserveOriginalContent; 488 } // getPreserveOriginalContent 489 490 /** 491 * Allow/disallow collecting of repositioning information. 
492 * If is <B>true</B> information will be retrieved and preserved 493 * as document feature.<BR> 494 * Preserving of repositioning information give the possibilities 495 * for converting of coordinates between the original document content and 496 * extracted from the document text. 497 */ 498 public void setCollectRepositioningInfo(Boolean b) { 499 collectRepositioningInfo = b; 500 } // setCollectRepositioningInfo 501 502 /** Get the collectiong and preserving of repositioning information 503 * for the Document. <BR> 504 * Preserving of repositioning information give the possibilities 505 * for converting of coordinates between the original document content and 506 * extracted from the document text. 507 * 508 * @return whether the Document should collect and preserve information. 509 */ 510 public Boolean getCollectRepositioningInfo() { 511 return collectRepositioningInfo; 512 } // getCollectRepositioningInfo 513 514 /** Documents may be packed within files; in this case an optional pair of 515 * offsets refer to the location of the document. This method gets the 516 * start offset. 517 */ 518 public Long getSourceUrlStartOffset() { return sourceUrlStartOffset; } 519 520 /** Documents may be packed within files; in this case an optional pair of 521 * offsets refer to the location of the document. This method sets the 522 * start offset. 523 */ 524 public void setSourceUrlStartOffset(Long sourceUrlStartOffset) { 525 this.sourceUrlStartOffset = sourceUrlStartOffset; 526 } // setSourceUrlStartOffset 527 528 /** Documents may be packed within files; in this case an optional pair of 529 * offsets refer to the location of the document. This method gets the 530 * end offset. 531 */ 532 public Long getSourceUrlEndOffset() { return sourceUrlEndOffset; } 533 534 /** Documents may be packed within files; in this case an optional pair of 535 * offsets refer to the location of the document. This method sets the 536 * end offset. 
537 */ 538 public void setSourceUrlEndOffset(Long sourceUrlEndOffset) { 539 this.sourceUrlEndOffset = sourceUrlEndOffset; 540 } // setSourceUrlStartOffset 541 542 /** The content of the document: a String for text; MPEG for video; etc. */ 543 public DocumentContent getContent() { return content; } 544 545 /** Set method for the document content */ 546 public void setContent(DocumentContent content) { this.content = content; } 547 548 /** Get the encoding of the document content source */ 549 public String getEncoding() { 550 //we need to make sure we ALWAYS have an encoding 551 if(encoding == null || encoding.trim().length() == 0){ 552 //no encoding definded: use the platform default 553 encoding = java.nio.charset.Charset.forName( 554 System.getProperty("file.encoding")).name(); 555 } 556 return encoding; 557 } 558 559 /** Set the encoding of the document content source */ 560 public void setEncoding(String encoding) { this.encoding = encoding; } 561 562 /** Get the default set of annotations. The set is created if it 563 * doesn't exist yet. 564 */ 565 public AnnotationSet getAnnotations() { 566 if(defaultAnnots == null){ 567 defaultAnnots = new AnnotationSetImpl(this); 568 fireAnnotationSetAdded(new DocumentEvent( 569 this, DocumentEvent.ANNOTATION_SET_ADDED, null)); 570 }//if 571 return defaultAnnots; 572 } // getAnnotations() 573 574 /** Get a named set of annotations. Creates a new set if one with this 575 * name doesn't exist yet. 576 * If the provided name is null then it returns the default annotation set. 
*/
  public AnnotationSet getAnnotations(String name) {
    if(name == null) return getAnnotations();
    // the map of named sets is created lazily
    if(namedAnnotSets == null)
      namedAnnotSets = new HashMap();
    AnnotationSet namedSet = (AnnotationSet) namedAnnotSets.get(name);

    if(namedSet == null) {
      namedSet = new AnnotationSetImpl(this, name);
      namedAnnotSets.put(name, namedSet);

      // tell the listeners that a new named set exists
      DocumentEvent evt = new DocumentEvent(
        this, DocumentEvent.ANNOTATION_SET_ADDED, name
      );
      fireAnnotationSetAdded(evt);
    }
    return namedSet;
  } // getAnnotations(name)

  /** Make the document markup-aware. This will trigger the creation
   *  of a DocumentFormat object at Document initialisation time; the
   *  DocumentFormat object will unpack the markup in the Document and
   *  add it as annotations.
   *  NOTE(review): this comment used to say documents are <B>not</B>
   *  markup-aware by default while getMarkupAware() said the opposite; the
   *  field's initial value is not visible here - confirm and state it once.
   *
   *  @param newMarkupAware markup awareness status.
   */
  public void setMarkupAware(Boolean newMarkupAware) {
    this.markupAware = newMarkupAware;
  }

  /** Get the markup awareness status of the Document.
   *  @return whether the Document is markup aware.
   */
  public Boolean getMarkupAware() { return markupAware; }

  /** Returns an XML document aiming to preserve the original markups (
   *  the original markup will be in the same place and format as it was
   *  before processing the document) and include (if possible)
   *  the annotations specified in the aSourceAnnotationSet.
   *  It is equivalent to toXml(aSourceAnnotationSet, true).
   */
  public String toXml(Set aSourceAnnotationSet){
    return toXml(aSourceAnnotationSet, true);
  }

  /** Returns an XML document aiming to preserve the original markups (
   *  the original markup will be in the same place and format as it was
   *  before processing the document) and include (if possible)
   *  the annotations specified in the aSourceAnnotationSet.
   *  <P>
   *  When the document carries both the original content and the
   *  repositioning information features, the work is delegated to
   *  saveAnnotationSetAsXmlInOrig.
   *  <P>
   *  <b>Warning:</b> Annotations from the aSourceAnnotationSet will be lost
   *  if they would cause a crossed over situation.
   *  @param aSourceAnnotationSet is an annotation set containing all the
   *  annotations that will be combined with the original markup set. If the
   *  param is <code>null</code> it will only dump the original markups.
   *  @param includeFeatures is a boolean that controls whether the annotation
   *  features should be included or not. If false, only the annotation type
   *  is included in the tag.
   *  @return a string representing an XML document containing the original
   *  markup + dumped annotations from the aSourceAnnotationSet
   */
  public String toXml(Set aSourceAnnotationSet, boolean includeFeatures){

    if(hasOriginalContentFeatures()) {
      return saveAnnotationSetAsXmlInOrig(aSourceAnnotationSet,includeFeatures);
    } // if

    AnnotationSet originalMarkupsAnnotSet =
            this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);

    // Create a dumping annotation set on the document. It will be used for
    // dumping annotations...
    AnnotationSet dumpingSet = new AnnotationSetImpl((Document) this);

    // This set will be constructed inside this method. If it is not empty,
    // the annotations it contains will be lost.
    if (!dumpingSet.isEmpty()){
      Out.prln("WARNING: The dumping annotation set was not empty."+
      "All annotation it contained were lost.");
      dumpingSet.clear();
    }// End if

    StatusListener sListener = (StatusListener)
                               gate.gui.MainFrame.getListeners().
                               get("gate.event.StatusListener");
    // Construct the dumping set in such a way that no two of its annotations
    // are crossed over.
    // First add all annotations from the original markups
    if(sListener != null)
      sListener.statusChanged("Constructing the dumping annotation set.");
    dumpingSet.addAll(originalMarkupsAnnotSet);
    // Then take all the annotations from aSourceAnnotationSet and verify if
    // they can be inserted safely into the dumpingSet. Where not possible,
    // report.
    if (aSourceAnnotationSet != null){
      Iterator iter = aSourceAnnotationSet.iterator();
      while (iter.hasNext()){
        Annotation currentAnnot = (Annotation) iter.next();
        if(insertsSafety(dumpingSet,currentAnnot)){
          dumpingSet.add(currentAnnot);
        }else if (crossedOverAnnotation != null){
          // insertsSafety() stored the conflicting annotation in
          // crossedOverAnnotation; report both and drop the current one
          try {
            Out.prln("Warning: Annotations were found to violate the " +
            "crossed over condition: \n" +
            "1. [" +
            getContent().getContent(
                           crossedOverAnnotation.getStartNode().getOffset(),
                           crossedOverAnnotation.getEndNode().getOffset()) +
            " (" + crossedOverAnnotation.getType() + ": " +
            crossedOverAnnotation.getStartNode().getOffset() +
            ";" + crossedOverAnnotation.getEndNode().getOffset() +
            ")]\n" +
            "2. [" +
            getContent().getContent(
                           currentAnnot.getStartNode().getOffset(),
                           currentAnnot.getEndNode().getOffset()) +
            " (" + currentAnnot.getType() + ": " +
            currentAnnot.getStartNode().getOffset() +
            ";" + currentAnnot.getEndNode().getOffset() +
            ")]\nThe second one will be discarded.\n"  );
          } catch (gate.util.InvalidOffsetException ex) {
            // NOTE(review): only the message is kept, the original
            // exception (and its stack trace) is dropped here.
            throw new GateRuntimeException(ex.getMessage());
          }
        }// End if
      }// End while
    }// End if

    // The dumpingSet is ready to be exported as XML
    // Here we go.
    if(sListener != null) sListener.statusChanged("Dumping annotations as XML");
    // pre-size the buffer from the content size
    StringBuffer xmlDoc = new StringBuffer(
          DOC_SIZE_MULTIPLICATION_FACTOR*(this.getContent().size().intValue()));

    // Add xml header if original format was xml
    String mimeType = getFeatures() == null ?
                      null :
                      (String)getFeatures().get("MimeType");
    boolean wasXML = mimeType != null && mimeType.equalsIgnoreCase("text/xml");

    if(wasXML){
      xmlDoc.append("<?xml version=\"1.0\" encoding=\"");
      xmlDoc.append(getEncoding());
      xmlDoc.append("\" ?>");
      xmlDoc.append(Strings.getNl());
    }// End if
    // Identify and extract the root annotation from the dumpingSet.
    theRootAnnotation = identifyTheRootAnnotation(dumpingSet);
    // If a root annotation has been identified then add it explicitly at the
    // beginning of the document
    if (theRootAnnotation != null){
      dumpingSet.remove(theRootAnnotation);
      xmlDoc.append(writeStartTag(theRootAnnotation,includeFeatures));
    }// End if
    // Construct and append the rest of the document
    xmlDoc.append(saveAnnotationSetAsXml(dumpingSet, includeFeatures));
    // If a root annotation has been identified then add it explicitly at the
    // end of the document
    if (theRootAnnotation != null){
      xmlDoc.append(writeEndTag(theRootAnnotation));
    }// End if

    if(sListener != null) sListener.statusChanged("Done.");
    return xmlDoc.toString();
  }//End toXml()

  /** This method verifies if aSourceAnnotation can be inserted safely into
   *  the aTargetAnnotSet. Safely means that it doesn't violate the crossed
   *  over condition with any annotation from the aTargetAnnotSet.
   *  @param aTargetAnnotSet the annotation set to include the aSourceAnnotation
   *  @param aSourceAnnotation the annotation to be inserted into the
   *  aTargetAnnotSet
   *  @return true if the annotation inserts safely, or false otherwise.
*/
  private boolean insertsSafety(AnnotationSet aTargetAnnotSet,
                                                Annotation aSourceAnnotation){

    // Missing arguments, nodes or offsets: the annotation cannot be inserted,
    // and crossedOverAnnotation is cleared because no conflict was found.
    if (aTargetAnnotSet == null || aSourceAnnotation == null) {
      this.crossedOverAnnotation = null;
      return false;
    }
    if (aSourceAnnotation.getStartNode() == null ||
        aSourceAnnotation.getStartNode().getOffset()== null) {
      this.crossedOverAnnotation = null;
      return false;
    }
    if (aSourceAnnotation.getEndNode() == null ||
        aSourceAnnotation.getEndNode().getOffset()== null) {
      this.crossedOverAnnotation = null;
      return false;
    }

    // Get the start and end offsets
    Long start = aSourceAnnotation.getStartNode().getOffset();
    Long end =   aSourceAnnotation.getEndNode().getOffset();
    // Read aSourceAnnotation offsets long
    long s2 = start.longValue();
    long e2 = end.longValue();

    // Obtain the annotations the target set returns for the interval defined
    // by the two offsets (the ones overlapping it totally or partially)
    AnnotationSet as = aTargetAnnotSet.get(start,end);

    // Investigate all the annotations from as to see if there is one that
    // comes in conflict with aSourceAnnotation
    Iterator it = as.iterator();
    while(it.hasNext()){
      Annotation ann = (Annotation) it.next();
      // Read ann offsets
      long s1 = ann.getStartNode().getOffset().longValue();
      long e1 = ann.getEndNode().getOffset().longValue();

      // ann starts first and ends strictly inside the source annotation
      if (s1<s2 && s2<e1 && e1<e2) {
        this.crossedOverAnnotation = ann;
        return false;
      }
      // the source annotation starts first and ends strictly inside ann
      if (s2<s1 && s1<e2 && e2<e1) {
        this.crossedOverAnnotation = ann;
        return false;
      }
    }// End while
    // NOTE(review): crossedOverAnnotation keeps its previous value when the
    // annotation inserts safely.
    return true;
  }// insertsSafety()

  /** This method saves all the annotations from aDumpAnnotSet and combines
   *  them with the document content.
   *  <P>
   *  The offsets of the annotations are visited from the last one to the
   *  first one, so that inserting tags does not shift positions still to be
   *  processed. Characters recorded by buildEntityMapFromString are replaced
   *  with their entity from entitiesMap on the way.
   *  @param aDumpAnnotSet is a GATE annotation set prepared to be used
   *  on the raw text from document content. If aDumpAnnotSet is
   *  <b>null</b> the filtered document content is returned without any tag.
   *  Annotations are removed from this set while it is being serialized.
   *  @param includeFeatures is a boolean, which controls whether the annotation
   *  features and gate ID are included or not.
   *  @return The XML document obtained from raw text + the information from
   *  the dump annotation set.
   */
  private String saveAnnotationSetAsXml(AnnotationSet aDumpAnnotSet,
                                        boolean includeFeatures){
    String content = null;
    if (this.getContent()== null)
      content = new String("");
    else
      content = this.getContent().toString();
    StringBuffer docContStrBuff = filterNonXmlChars(new StringBuffer(content));
    if (aDumpAnnotSet == null)   return docContStrBuff.toString();

    TreeMap offsets2CharsMap = new TreeMap();
    // NOTE(review): getContent() is dereferenced here although the null case
    // was handled above - a null content would throw at this point.
    if (this.getContent().size().longValue() != 0){
      // Fill the offsets2CharsMap with all the indices where
      // special chars appear
      buildEntityMapFromString(content,offsets2CharsMap);
    }//End if
    // The saving algorithm is as follows:
    ///////////////////////////////////////////
    // Construct a set of all the offsets in ascending order.
    // All annotations that end at that offset swap their place in descending
    // order. For each node write all the tags from left to right.

    // Construct the node set
    TreeSet offsets = new TreeSet();
    Iterator iter = aDumpAnnotSet.iterator();
    while (iter.hasNext()){
      Annotation annot = (Annotation) iter.next();
      offsets.add(annot.getStartNode().getOffset());
      offsets.add(annot.getEndNode().getOffset());
    }// End while

    // offsets is sorted in ascending order.
    // Iterate this set in descending order and remove an offset at each
    // iteration
    while (!offsets.isEmpty()){
      Long offset = (Long)offsets.last();
      // Remove the offset from the set
      offsets.remove(offset);
      // Now, use it.
      // Returns a list with annotations that need to be serialized at that
      // offset.
      List annotations = getAnnotationsForOffset(aDumpAnnotSet,offset);
      // Attention: the annotations are serialized from left to right
      StringBuffer tmpBuff = new StringBuffer("");
      // holds the start == end annotations flagged isEmptyAndSpan whose end
      // tag is still to be written
      Stack stack = new Stack();
      // Iterate through all these annotations and serialize them
      Iterator it = annotations.iterator();
      while(it.hasNext()){
        Annotation a = (Annotation) it.next();
        it.remove();
        // Test if a Ends at offset
        if ( offset.equals(a.getEndNode().getOffset()) ){
          // Test if a Starts at offset
          if ( offset.equals(a.getStartNode().getOffset()) ){
            // Here, the annotation a Starts and Ends at the offset
            if ( null != a.getFeatures().get("isEmptyAndSpan") &&
                 "true".equals((String)a.getFeatures().get("isEmptyAndSpan"))){

              // Assert: annotation a with start == end and isEmptyAndSpan
              tmpBuff.append(writeStartTag(a, includeFeatures));
              stack.push(a);
            }else{
              // Assert annotation a with start == end and an empty tag
              tmpBuff.append(writeEmptyTag(a));
              // The annotation is removed from dumped set
              aDumpAnnotSet.remove(a);
            }// End if
          }else{
            // Here the annotation a Ends at the offset.
            // In this case empty the stack and write the end tag
            if (!stack.isEmpty()){
              while(!stack.isEmpty()){
                Annotation a1 = (Annotation)stack.pop();
                tmpBuff.append(writeEndTag(a1));
              }// End while
            }// End if
            tmpBuff.append(writeEndTag(a));
          }// End if
        }else{
          // The annotation a does NOT end at the offset. Let's see if it starts
          // at the offset
          if ( offset.equals(a.getStartNode().getOffset()) ){
            // The annotation a starts at the offset.
            // In this case empty the stack and write the start tag
            if (!stack.isEmpty()){
              while(!stack.isEmpty()){
                Annotation a1 = (Annotation)stack.pop();
                tmpBuff.append(writeEndTag(a1));
              }// End while
            }// End if
            tmpBuff.append(writeStartTag(a, includeFeatures));
            // The annotation is removed from dumped set
            aDumpAnnotSet.remove(a);
          }// End if ( offset.equals(a.getStartNode().getOffset()) )
        }// End if ( offset.equals(a.getEndNode().getOffset()) )
      }// End while(it.hasNext()){

      // Close whatever is still open on the stack
      if (!stack.isEmpty()){
        while(!stack.isEmpty()){
          Annotation a1 = (Annotation)stack.pop();
          tmpBuff.append(writeEndTag(a1));
        }// End while
      }// End if

      // Before inserting tmpBuff into docContStrBuff we need to check
      // if there are chars to be replaced at or after this offset and, if
      // there are, replace them.
      if (!offsets2CharsMap.isEmpty()){
        Integer offsChar = (Integer) offsets2CharsMap.lastKey();
        while( !offsets2CharsMap.isEmpty() &&
                       offsChar.intValue() >= offset.intValue()){
          // Replace the char at offsChar with its corresponding entity from
          // the entitiesMap.
          docContStrBuff.replace(offsChar.intValue(),offsChar.intValue()+1,
          (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar)));
          // Discard the offsChar after it was used.
          offsets2CharsMap.remove(offsChar);
          // Investigate next offsChar
          if (!offsets2CharsMap.isEmpty())
            offsChar = (Integer) offsets2CharsMap.lastKey();
        }// End while
      }// End if
      // Insert tmpBuff to the location where it belongs in docContStrBuff
      docContStrBuff.insert(offset.intValue(),tmpBuff.toString());
    }// End while(!offsets.isEmpty())
    // Need to replace the entities in the remaining text, if there is any text
    // So, if there are any more items in offsets2CharsMap they need to be
    // replaced
    while (!offsets2CharsMap.isEmpty()){
      Integer offsChar = (Integer) offsets2CharsMap.lastKey();
      // Replace the char with its entity
      docContStrBuff.replace(offsChar.intValue(),offsChar.intValue()+1,
      (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar)));
      // remove the offset from the map
      offsets2CharsMap.remove(offsChar);
    }// End while
    return docContStrBuff.toString();
  }// saveAnnotationSetAsXml()

  /**
   * Return true only if the document has features for original content and
   * repositioning information.
   * NOTE(review): getFeatures() is dereferenced without a null check, while
   * toXml() guards against a null feature map - confirm it can't be null here.
   */
  private boolean hasOriginalContentFeatures() {
    FeatureMap features = getFeatures();
    boolean result = false;

    result =
    (features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME) != null)
      &&
    (features.get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME)
      != null);

    return result;
  } // hasOriginalContentFeatures

  /** This method saves all the annotations from aDumpAnnotSet and combines
   *  them with the original document content, if preserved as feature.
   *  @param aDumpAnnotationSet is a GATE annotation set prepared to be used
   *  on the raw text from document content. If aDumpAnnotSet is <b>null</b>
   *  then an empty string will be returned.
   *  @param includeFeatures is a boolean, which controls whether the annotation
   *  features and gate ID are included or not.
   * @return The XML document obtained from raw text + the information from
   * the dump annotation set.
   */
  private String saveAnnotationSetAsXmlInOrig(Set aSourceAnnotationSet,
                                              boolean includeFeatures){
    // Working buffer: starts as a copy of the preserved original content and
    // receives the serialized tags at the repositioned offsets.
    StringBuffer docContStrBuff;

    String origContent;

    // The original content is stored as a document feature; a missing feature
    // is treated as empty content.
    origContent =
     (String)features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME);
    if(origContent == null) {
      origContent = "";
    } // if

    long originalContentSize = origContent.length();

    // NOTE(review): repositioning is dereferenced further down without a null
    // check; presumably callers test hasOriginalContentFeatures() first -
    // confirm against the call sites.
    RepositioningInfo repositioning = (RepositioningInfo)
      getFeatures().get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME);

    docContStrBuff = new StringBuffer(origContent);
    // Nothing to dump: return the original content unchanged.
    if (aSourceAnnotationSet == null) return docContStrBuff.toString();

    // Optional listener used only for progress messages.
    StatusListener sListener = (StatusListener)
                               gate.gui.MainFrame.getListeners().
                               get("gate.event.StatusListener");

    AnnotationSet originalMarkupsAnnotSet =
            this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
    // Create a dumping annotation set on the document. It will be used for
    // dumping annotations...
    // NOTE(review): in this method dumpingSet is only filled and used for the
    // insertsSafety() checks; the serialization below iterates
    // aSourceAnnotationSet, so annotations reported as "discarded" are still
    // written out. Confirm whether dumpingSet was meant to be used instead.
    AnnotationSet dumpingSet = new AnnotationSetImpl((Document) this);
    if(sListener != null)
      sListener.statusChanged("Constructing the dumping annotation set.");
    // Then take all the annotations from aSourceAnnotationSet and verify if
    // they can be inserted safely into the dumpingSet. Where not possible,
    // report.
    // (The null test below is always true here because of the early return.)
    if (aSourceAnnotationSet != null){
      Iterator iter = aSourceAnnotationSet.iterator();
      Annotation currentAnnot;
      while (iter.hasNext()){
        currentAnnot = (Annotation) iter.next();
        if(insertsSafety(originalMarkupsAnnotSet, currentAnnot)
           && insertsSafety(dumpingSet, currentAnnot)){
          dumpingSet.add(currentAnnot);
        }else{
          Out.prln("Warning: Annotation with ID=" + currentAnnot.getId() +
          ", startOffset=" + currentAnnot.getStartNode().getOffset() +
          ", endOffset=" + currentAnnot.getEndNode().getOffset() +
          ", type=" + currentAnnot.getType()+ " was found to violate the" +
          " crossed over condition. It will be discarded");
        }// End if
      }// End while
    }// End if

    // The dumpingSet is ready to be exported as XML
    // Here we go.
    if(sListener != null) sListener.statusChanged("Dumping annotations as XML");

    ///////////////////////////////////////////
    // Construct a set of annot with all IDs in asc order.
    // All annotations that end at that offset swap their place in descending
    // order. For each node write all the tags from left to right.

    // Construct the node set: every start and end offset of the source
    // annotations, sorted ascending by the TreeSet.
    TreeSet offsets = new TreeSet();
    Iterator iter = aSourceAnnotationSet.iterator();
    while (iter.hasNext()){
      Annotation annot = (Annotation) iter.next();
      offsets.add(annot.getStartNode().getOffset());
      offsets.add(annot.getEndNode().getOffset());
    }// End while

    // offsets is sorted in ascending order.
    // Iterate this set in descending order and remove an offset at each
    // iteration. Working from the largest offset down means an insertion never
    // shifts a position that still has to be processed.
    while (!offsets.isEmpty()){
      Long offset = (Long)offsets.last();
      // Remove the offset from the set
      offsets.remove(offset);
      // Now, use it.
      // Returns a list with annotations that needs to be serialized in that
      // offset.
      List annotations = getAnnotationsForOffset(aSourceAnnotationSet,offset);
      // Attention: the annotation are serialized from left to right
      StringBuffer tmpBuff = new StringBuffer("");
      // Holds the zero-length "isEmptyAndSpan" annotations whose start tag was
      // written at this offset and whose end tag is still pending.
      Stack stack = new Stack();
      // Iterate through all these annotations and serialize them
      Iterator it = annotations.iterator();
      // Declared outside the loop: after the loop it refers to the last
      // annotation handled at this offset (see backPositioning below).
      Annotation a = null;
      while(it.hasNext()) {
        a = (Annotation) it.next();
        it.remove();
        // Test if a Ends at offset
        if ( offset.equals(a.getEndNode().getOffset()) ){
          // Test if a Starts at offset
          if ( offset.equals(a.getStartNode().getOffset()) ){
            // Here, the annotation a Starts and Ends at the offset
            if ( null != a.getFeatures().get("isEmptyAndSpan") &&
                 "true".equals((String)a.getFeatures().get("isEmptyAndSpan"))){

              // Assert: annotation a with start == end and isEmptyAndSpan
              tmpBuff.append(writeStartTag(a, includeFeatures, false));
              stack.push(a);
            }else{
              // Assert annotation a with start == end and an empty tag
              tmpBuff.append(writeEmptyTag(a, false));
              // The annotation is removed from the set that was passed in
              // (this mutates the caller's collection).
              aSourceAnnotationSet.remove(a);
            }// End if
          }else{
            // Here the annotation a Ends at the offset.
            // In this case empty the stack and write the end tag
            while(!stack.isEmpty()){
              Annotation a1 = (Annotation)stack.pop();
              tmpBuff.append(writeEndTag(a1));
            }// End while
            tmpBuff.append(writeEndTag(a));
          }// End if
        }else{
          // The annotation a does NOT end at the offset. Let's see if it starts
          // at the offset
          if ( offset.equals(a.getStartNode().getOffset()) ){
            // The annotation a starts at the offset.
            // In this case empty the stack and write the end tag
            while(!stack.isEmpty()){
              Annotation a1 = (Annotation)stack.pop();
              tmpBuff.append(writeEndTag(a1));
            }// End while

            tmpBuff.append(writeStartTag(a, includeFeatures, false));
            // The start offset is the last offset at which this annotation is
            // visited (offsets are processed in descending order), so it is
            // removed from the passed-in set (mutating the caller's collection).
            aSourceAnnotationSet.remove(a);
          }// End if ( offset.equals(a.getStartNode().getOffset()) )
        }// End if ( offset.equals(a.getEndNode().getOffset()) )
      }// End while(it.hasNext()){

      // In this case empty the stack and write the end tag
      while(!stack.isEmpty()){
        Annotation a1 = (Annotation)stack.pop();
        tmpBuff.append(writeEndTag(a1));
      }// End while

      // Map the offset in the current content to a position in the original
      // content. -1 is treated as "no position found".
      long originalPosition = -1;
      // True when the last annotation handled at this offset ends here; in
      // that case the two-argument getOriginalPos variant is tried first.
      boolean backPositioning =
        a != null && offset.equals(a.getEndNode().getOffset());
      if ( backPositioning ) {
        // end of the annotation correction
        originalPosition =
          repositioning.getOriginalPos(offset.intValue(), true);
      } // if

      // Fall back to the one-argument lookup when nothing was found yet.
      if(originalPosition == -1) {
        originalPosition = repositioning.getOriginalPos(offset.intValue());
      } // if

      // Insert tmpBuff to the location where it belongs in docContStrBuff.
      // Positions outside the original content are reported and the tags for
      // this offset are dropped.
      if(originalPosition != -1 && originalPosition <= originalContentSize ) {
        docContStrBuff.insert((int) originalPosition, tmpBuff.toString());
      }
      else {
        Out.prln("Error in the repositioning. The offset ("+offset.intValue()
        +") could not be positioned in the original document. \n"
        +"Calculated position is: "+originalPosition
        +" placed back: "+backPositioning);
      } // if

    }// End while(!offsets.isEmpty())
    // Close the root element, if one is known, at the very end of the output.
    if (theRootAnnotation != null)
      docContStrBuff.append(writeEndTag(theRootAnnotation));
    return docContStrBuff.toString();
  } // saveAnnotationSetAsXmlInOrig()

  /** This method returns a list with annotations ordered that way that
   * they can be serialized from left to right, at the offset.
If one of the 1149 * params is null then an empty list will be returned. 1150 * @param aDumpAnnotSet is a set containing all annotations that will be 1151 * dumped. 1152 * @param offset represent the offset at witch the annotation must start 1153 * AND/OR end. 1154 * @return a list with those annotations that need to be serialized. 1155 */ 1156 private List getAnnotationsForOffset(Set aDumpAnnotSet, Long offset){ 1157 List annotationList = new LinkedList(); 1158 if (aDumpAnnotSet == null || offset == null) return annotationList; 1159 Set annotThatStartAtOffset = new TreeSet( 1160 new AnnotationComparator(ORDER_ON_END_OFFSET,DESC)); 1161 Set annotThatEndAtOffset = new TreeSet( 1162 new AnnotationComparator(ORDER_ON_START_OFFSET,DESC)); 1163 Set annotThatStartAndEndAtOffset = new TreeSet( 1164 new AnnotationComparator(ORDER_ON_ANNOT_ID,ASC)); 1165 1166 // Fill these tree lists with annotation tat start, end or start and 1167 // end at the offset. 1168 Iterator iter = aDumpAnnotSet.iterator(); 1169 while(iter.hasNext()){ 1170 Annotation ann = (Annotation) iter.next(); 1171 if (offset.equals(ann.getStartNode().getOffset())){ 1172 if (offset.equals(ann.getEndNode().getOffset())) 1173 annotThatStartAndEndAtOffset.add(ann); 1174 else 1175 annotThatStartAtOffset.add(ann); 1176 }else{ 1177 if (offset.equals(ann.getEndNode().getOffset())) 1178 annotThatEndAtOffset.add(ann); 1179 }// End if 1180 }// End while 1181 annotationList.addAll(annotThatEndAtOffset); 1182 annotThatEndAtOffset = null; 1183 annotationList.addAll(annotThatStartAtOffset); 1184 annotThatStartAtOffset = null; 1185 iter = annotThatStartAndEndAtOffset.iterator(); 1186 while(iter.hasNext()){ 1187 Annotation ann = (Annotation) iter.next(); 1188 Iterator it = annotationList.iterator(); 1189 boolean breaked = false; 1190 while (it.hasNext()){ 1191 Annotation annFromList = (Annotation) it.next(); 1192 if (annFromList.getId().intValue() > ann.getId().intValue()){ 1193 
annotationList.add(annotationList.indexOf(annFromList),ann); 1194 breaked = true; 1195 break; 1196 }// End if 1197 }// End while 1198 if (!breaked) 1199 annotationList.add(ann); 1200 iter.remove(); 1201 }// End while 1202 return annotationList; 1203 }// getAnnotationsForOffset() 1204 1205 private String writeStartTag(Annotation annot, boolean includeFeatures){ 1206 return writeStartTag(annot, includeFeatures, true); 1207 } // writeStartTag 1208 1209 /** Returns a string representing a start tag based on the input annot*/ 1210 private String writeStartTag(Annotation annot, boolean includeFeatures, 1211 boolean includeNamespace){ 1212 AnnotationSet originalMarkupsAnnotSet = 1213 this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME); 1214 1215 StringBuffer strBuff = new StringBuffer(""); 1216 if (annot == null) return strBuff.toString(); 1217// if (!addGatePreserveFormatTag && isRootTag){ 1218 if (theRootAnnotation != null && annot.getId().equals(theRootAnnotation.getId())){ 1219 //the features are included either if desired or if that's an annotation 1220 //from the original markup of the document. We don't want for example to 1221 //spoil all links in an HTML file! 
1222 if (includeFeatures) { 1223 strBuff.append("<"); 1224 strBuff.append(annot.getType()); 1225 strBuff.append(" "); 1226 if(includeNamespace) { 1227 strBuff.append(" xmlns:gate=\"http://www.gate.ac.uk\""); 1228 strBuff.append(" gate:"); 1229 } 1230 strBuff.append("gateId=\""); 1231 strBuff.append(annot.getId()); 1232 strBuff.append("\""); 1233 strBuff.append(" "); 1234 if(includeNamespace) { 1235 strBuff.append("gate:"); 1236 } 1237 strBuff.append("annotMaxId=\""); 1238 strBuff.append(nextAnnotationId); 1239 strBuff.append("\""); 1240 strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace)); 1241 strBuff.append(">"); 1242 } 1243 else if (originalMarkupsAnnotSet.contains(annot)) { 1244 strBuff.append("<"); 1245 strBuff.append(annot.getType()); 1246 strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace)); 1247 strBuff.append(">"); 1248 } 1249 else { 1250 strBuff.append("<"); 1251 strBuff.append(annot.getType()); 1252 strBuff.append(">"); 1253 } 1254 1255 }else{ 1256 //the features are included either if desired or if that's an annotation 1257 //from the original markup of the document. We don't want for example to 1258 //spoil all links in an HTML file! 
1259 if (includeFeatures) { 1260 strBuff.append("<"); 1261 strBuff.append(annot.getType()); 1262 strBuff.append(" "); 1263 if(includeNamespace) { 1264 strBuff.append("gate:"); 1265 } // if includeNamespaces 1266 strBuff.append("gateId=\""); 1267 strBuff.append(annot.getId()); 1268 strBuff.append("\""); 1269 strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace)); 1270 strBuff.append(">"); 1271 } 1272 else if (originalMarkupsAnnotSet.contains(annot)) { 1273 strBuff.append("<"); 1274 strBuff.append(annot.getType()); 1275 strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace)); 1276 strBuff.append(">"); 1277 } 1278 else { 1279 strBuff.append("<"); 1280 strBuff.append(annot.getType()); 1281 strBuff.append(">"); 1282 } 1283 }// End if 1284 return strBuff.toString(); 1285 }// writeStartTag() 1286 1287 /** 1288 * Identifies the root annotations inside an annotation set. 1289 * The root annotation is the one that starts at offset 0, and has the 1290 * greatest span. If there are more than one with this function, then the 1291 * annotation with the smalled ID wil be selected as root. 1292 * If none is identified it will return null. 1293 * @param anAnnotationSet The annotation set possibly containing 1294 * the root annotation. 1295 * @return The root annotation or null is it fails 1296 */ 1297 private Annotation identifyTheRootAnnotation(AnnotationSet anAnnotationSet){ 1298 if (anAnnotationSet == null) return null; 1299 // If the starting node of this annotation is not null, then the annotation 1300 // set will not have a root annotation. 1301 Node startNode = anAnnotationSet.firstNode(); 1302 Node endNode = anAnnotationSet.lastNode(); 1303 // This is placed here just to speed things up. The alghorithm bellow can 1304 // can identity the annotation that span over the entire set and with the 1305 // smallest ID. However the root annotation will have to have the start 1306 // offset equal to 0. 
1307 if (startNode.getOffset().longValue() != 0) return null; 1308 // Go anf find the annotation. 1309 Annotation theRootAnnotation = null; 1310 // Check if there are annotations starting at offset 0. If there are, then 1311 // check all of them to see which one has the greatest span. Basically its 1312 // END offset should be the bigest offset from the input annotation set. 1313 long start = startNode.getOffset().longValue(); 1314 long end = endNode.getOffset().longValue(); 1315 for(Iterator it = anAnnotationSet.iterator(); it.hasNext();){ 1316 Annotation currentAnnot = (Annotation) it.next(); 1317 // If the currentAnnot has both its Start and End equals to the Start and 1318 // end of the AnnotationSet then check to see if its ID is the smallest. 1319 if ( 1320 (start == currentAnnot.getStartNode().getOffset().longValue()) && 1321 (end == currentAnnot.getEndNode().getOffset().longValue()) 1322 ){ 1323 // The currentAnnotation has is a potencial root one. 1324 if (theRootAnnotation == null) 1325 theRootAnnotation = currentAnnot; 1326 else{ 1327 // If its ID is greater that the currentAnnot then update the root 1328 if ( theRootAnnotation.getId().intValue() > currentAnnot.getId().intValue()) 1329 theRootAnnotation = currentAnnot; 1330 }// End if 1331 }// End if 1332 }// End for 1333 return theRootAnnotation; 1334 }// End identifyTheRootAnnotation() 1335 1336 /** This method takes aScanString and searches for those chars from 1337 * entitiesMap that appear in the string. A tree map(offset2Char) is filled 1338 * using as key the offsets where those Chars appear and the Char. 1339 * If one of the params is null the method simply returns. 
1340 */ 1341 private void buildEntityMapFromString(String aScanString, TreeMap aMapToFill){ 1342 if (aScanString == null || aMapToFill == null) return; 1343 if (entitiesMap == null || entitiesMap.isEmpty()){ 1344 Err.prln("WARNING: Entities map was not initialised !"); 1345 return; 1346 }// End if 1347 // Fill the Map with the offsets of the special chars 1348 Iterator entitiesMapIterator = entitiesMap.keySet().iterator(); 1349 while(entitiesMapIterator.hasNext()){ 1350 Character c = (Character) entitiesMapIterator.next(); 1351 int fromIndex = 0; 1352 while (-1 != fromIndex){ 1353 fromIndex = aScanString.indexOf(c.charValue(),fromIndex); 1354 if (-1 != fromIndex){ 1355 aMapToFill.put(new Integer(fromIndex),c); 1356 fromIndex ++; 1357 }// End if 1358 }// End while 1359 }// End while 1360 }//buildEntityMapFromString(); 1361 1362 private String writeEmptyTag(Annotation annot){ 1363 return writeEmptyTag(annot, true); 1364 } // writeEmptyTag 1365 1366 /** Returns a string representing an empty tag based on the input annot*/ 1367 private String writeEmptyTag(Annotation annot, boolean includeNamespace){ 1368 StringBuffer strBuff = new StringBuffer(""); 1369 if (annot == null) return strBuff.toString(); 1370 1371 strBuff.append("<"); 1372 strBuff.append(annot.getType()); 1373 1374 AnnotationSet originalMarkupsAnnotSet = 1375 this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME); 1376 if (! 
originalMarkupsAnnotSet.contains(annot)) { 1377 strBuff.append(" gateId=\""); 1378 strBuff.append(annot.getId()); 1379 strBuff.append("\""); 1380 } 1381 strBuff.append(writeFeatures(annot.getFeatures(),includeNamespace)); 1382 strBuff.append("/>"); 1383 1384 return strBuff.toString(); 1385 }// writeEmptyTag() 1386 1387 /** Returns a string representing an end tag based on the input annot*/ 1388 private String writeEndTag(Annotation annot){ 1389 StringBuffer strBuff = new StringBuffer(""); 1390 if (annot == null) return strBuff.toString(); 1391/* 1392 if (annot.getType().indexOf(" ") != -1) 1393 Out.prln("Warning: Truncating end tag to first word for annot type \"" 1394 +annot.getType()+ "\". "); 1395*/ 1396 strBuff.append("</"+annot.getType()+">"); 1397 1398 return strBuff.toString(); 1399 }// writeEndTag() 1400 1401 /** Returns a string representing a FeatureMap serialized as XML attributes*/ 1402 private String writeFeatures(FeatureMap feat, boolean includeNamespace){ 1403 StringBuffer strBuff = new StringBuffer(""); 1404 if (feat == null) return strBuff.toString(); 1405 Iterator it = feat.keySet().iterator(); 1406 while (it.hasNext()){ 1407 Object key = it.next(); 1408 Object value = feat.get(key); 1409 if ( (key != null) && (value != null) ){ 1410 // Eliminate a feature inserted at reading time and which help to 1411 // take some decissions at saving time 1412 if ("isEmptyAndSpan".equals(key.toString())) 1413 continue; 1414 if( !(String.class.isAssignableFrom(key.getClass()) || 1415 Number.class.isAssignableFrom(key.getClass()))){ 1416 1417 Out.prln("Warning:Found a feature NAME("+key+") that doesn't came"+ 1418 " from String or Number.(feature discarded)"); 1419 continue; 1420 }// End if 1421 if ( !(String.class.isAssignableFrom(value.getClass()) || 1422 Number.class.isAssignableFrom(value.getClass()) || 1423 java.util.Collection.class.isAssignableFrom(value.getClass()))){ 1424 1425 Out.prln("Warning:Found a feature VALUE("+value+") that doesn't came"+ 1426 " 
from String, Number or Collection.(feature discarded)"); 1427 continue; 1428 }// End if 1429 if ("matches".equals(key)) { 1430 strBuff.append(" "); 1431 if(includeNamespace) { 1432 strBuff.append("gate:"); 1433 } 1434// strBuff.append(key); 1435 // replace non XML chars in attribute name 1436 strBuff.append( 1437 filterNonXmlChars(replaceCharsWithEntities(key.toString()))); 1438 strBuff.append("=\""); 1439 } 1440 else { 1441 strBuff.append(" "); 1442// strBuff.append(key); 1443 // replace non XML chars in attribute name 1444 strBuff.append( 1445 filterNonXmlChars(replaceCharsWithEntities(key.toString()))); 1446 strBuff.append("=\""); 1447 } 1448 if (java.util.Collection.class.isAssignableFrom(value.getClass())){ 1449 Iterator valueIter = ((Collection)value).iterator(); 1450 while(valueIter.hasNext()){ 1451 Object item = valueIter.next(); 1452 if (!(String.class.isAssignableFrom(item.getClass()) || 1453 Number.class.isAssignableFrom(item.getClass()))) 1454 continue; 1455// strBuff.append(item); 1456 // replace non XML chars in collection item 1457 strBuff.append( 1458 filterNonXmlChars(replaceCharsWithEntities(item.toString()))); 1459 strBuff.append(";"); 1460 }// End while 1461 if (strBuff.charAt(strBuff.length()-1) == ';') 1462 strBuff.deleteCharAt(strBuff.length()-1); 1463 }else{ 1464// strBuff.append(value); 1465 // replace non XML chars in attribute value 1466 strBuff.append( 1467 filterNonXmlChars(replaceCharsWithEntities(value.toString()))); 1468 }// End if 1469 strBuff.append("\""); 1470 }// End if 1471 }// End while 1472 return strBuff.toString(); 1473 }// writeFeatures() 1474 1475 /** Returns a GateXml document that is a custom XML format for wich there is 1476 * a reader inside GATE called gate.xml.GateFormatXmlHandler. 1477 * What it does is to serialize a GATE document in an XML format. 1478 * @return a string representing a Gate Xml document. 
1479 */ 1480 public String toXml(){ 1481 // Initialize the xmlContent with 3 time the size of the current document. 1482 // This is because of the tags size. This measure is made to increase the 1483 // performance of StringBuffer. 1484 StringBuffer xmlContent = new StringBuffer( 1485 DOC_SIZE_MULTIPLICATION_FACTOR*(getContent().size().intValue())); 1486 // Add xml header 1487 xmlContent.append("<?xml version=\"1.0\" encoding=\""); 1488 xmlContent.append(getEncoding()); 1489 xmlContent.append("\" ?>"); 1490 xmlContent.append(Strings.getNl()); 1491 1492 // Add the root element 1493 xmlContent.append("<GateDocument>\n"); 1494 xmlContent.append("<!-- The document's features-->\n\n"); 1495 xmlContent.append("<GateDocumentFeatures>\n"); 1496 1497 xmlContent.append(featuresToXml(this.getFeatures())); 1498 xmlContent.append("</GateDocumentFeatures>\n"); 1499 xmlContent.append("<!-- The document content area with serialized"+ 1500 " nodes -->\n\n"); 1501 // Add plain text element 1502 xmlContent.append("<TextWithNodes>"); 1503 xmlContent.append(textWithNodes(this.getContent().toString())); 1504 xmlContent.append("</TextWithNodes>\n"); 1505 // Serialize as XML all document's annotation sets 1506 // Serialize the default AnnotationSet 1507 StatusListener sListener = (StatusListener) 1508 gate.gui.MainFrame.getListeners(). 1509 get("gate.event.StatusListener"); 1510 if(sListener != null) 1511 sListener.statusChanged("Saving the default annotation set "); 1512 xmlContent.append("<!-- The default annotation set -->\n\n"); 1513 xmlContent.append(annotationSetToXml(this.getAnnotations())); 1514 // Serialize all others AnnotationSets 1515 // namedAnnotSets is a Map containing all other named Annotation Sets. 
1516 if (namedAnnotSets != null){ 1517 Iterator iter = namedAnnotSets.values().iterator(); 1518 while(iter.hasNext()){ 1519 AnnotationSet annotSet = (AnnotationSet) iter.next(); 1520 xmlContent.append("<!-- Named annotation set -->\n\n"); 1521 // Serialize it as XML 1522 if(sListener != null) sListener.statusChanged("Saving " + 1523 annotSet.getName()+ 1524 " annotation set "); 1525 xmlContent.append(annotationSetToXml(annotSet)); 1526 }// End while 1527 }// End if 1528 // Add the end of GateDocument 1529 xmlContent.append("</GateDocument>"); 1530 if(sListener != null) sListener.statusChanged("Done !"); 1531 // return the XmlGateDocument 1532 return xmlContent.toString(); 1533 }// toXml 1534 1535 /** This method filters any non XML char 1536 * see: http://www.w3c.org/TR/2000/REC-xml-20001006#charsets 1537 * All non XML chars will be replaced with 0x20 (space char) This assures 1538 * that the next time the document is loaded there won't be any problems. 1539 * @param aStrBuffer represents the input String that is filtred. If the 1540 * aStrBuffer is null then an empty string will be returend 1541 * @return the "purified" StringBuffer version of the aStrBuffer 1542 */ 1543 private StringBuffer filterNonXmlChars(StringBuffer aStrBuffer){ 1544 if (aStrBuffer == null) return new StringBuffer(""); 1545 String space = new String(" "); 1546 for (int i=aStrBuffer.length()-1;i>=0; i--){ 1547 if (!isXmlChar(aStrBuffer.charAt(i))) 1548 aStrBuffer.replace(i,i+1,space); 1549 }// End for 1550 return aStrBuffer; 1551 }// filterNonXmlChars() 1552 1553 /** This method decide if a char is a valid XML one or not 1554 * @param ch the char to be tested 1555 * @return true if is a valid XML char and fals if is not. 
1556 */ 1557 public static boolean isXmlChar(char ch){ 1558 if (ch == 0x9 || ch == 0xA || ch ==0xD) return true; 1559 if ((0x20 <= ch) && (ch <= 0xD7FF)) return true; 1560 if ((0xE000 <= ch) && (ch <= 0xFFFD)) return true; 1561 if ((0x10000 <= ch) && (ch <= 0x10FFFF)) return true; 1562 return false; 1563 }// End isXmlChar() 1564 1565 /** This method saves a FeatureMap as XML elements. 1566 * @ param aFeatureMap the feature map that has to be saved as XML. 1567 * @ return a String like this: <Feature><Name>...</Name> 1568 * <Value>...</Value></Feature><Feature>...</Feature> 1569 */ 1570 private String featuresToXml(FeatureMap aFeatureMap){ 1571 StringBuffer str = new StringBuffer(""); 1572 1573 if (aFeatureMap == null) return str.toString(); 1574 1575 Set keySet = aFeatureMap.keySet(); 1576 Iterator keyIterator = keySet.iterator(); 1577 while(keyIterator.hasNext()){ 1578 Object key = keyIterator.next(); 1579 Object value = aFeatureMap.get(key); 1580 if ((key != null) && (value != null)){ 1581 String keyClassName = null; 1582 String keyItemClassName = null; 1583 String valueClassName = null; 1584 String valueItemClassName = null; 1585 String key2String = key.toString(); 1586 String value2String = value.toString(); 1587 1588 Object item = null; 1589 // Test key if it is String, Number or Collection 1590 if (key instanceof java.lang.String || 1591 key instanceof java.lang.Number || 1592 key instanceof java.util.Collection) 1593 keyClassName = key.getClass().getName(); 1594 1595 // Test value if it is String, Number or Collection 1596 if (value instanceof java.lang.String || 1597 value instanceof java.lang.Number || 1598 value instanceof java.util.Collection) 1599 valueClassName = value.getClass().getName(); 1600 1601 // Features and values that are not Strings, Numbers or collections 1602 // will be discarded. 
1603 if (keyClassName == null || valueClassName == null) continue; 1604 1605 // If key is collection serialize the colection in a specific format 1606 if (key instanceof java.util.Collection){ 1607 StringBuffer keyStrBuff = new StringBuffer(""); 1608 Iterator iter = ((Collection) key).iterator(); 1609 if (iter.hasNext()){ 1610 item = iter.next(); 1611 if (item instanceof java.lang.Number) 1612 keyItemClassName = item.getClass().getName(); 1613 else 1614 keyItemClassName = String.class.getName(); 1615 keyStrBuff.append(item.toString()); 1616 }// End if 1617 while (iter.hasNext()){ 1618 item = iter.next(); 1619 keyStrBuff.append(";" + item.toString()); 1620 }// End while 1621 key2String = keyStrBuff.toString(); 1622 }// End if 1623 // If key is collection serialize the colection in a specific format 1624 if (value instanceof java.util.Collection){ 1625 StringBuffer valueStrBuff = new StringBuffer(""); 1626 Iterator iter = ((Collection) value).iterator(); 1627 if (iter.hasNext()){ 1628 item = iter.next(); 1629 if (item instanceof java.lang.Number) 1630 valueItemClassName = item.getClass().getName(); 1631 else 1632 valueItemClassName = String.class.getName(); 1633 valueStrBuff.append(item.toString()); 1634 }// End if 1635 while (iter.hasNext()){ 1636 item = iter.next(); 1637 valueStrBuff.append(";" + item.toString()); 1638 }// End while 1639 value2String = valueStrBuff.toString(); 1640 }// End if 1641 str.append("<Feature>\n <Name"); 1642 if (keyClassName != null) 1643 str.append(" className=\""+keyClassName+"\""); 1644 if (keyItemClassName != null) 1645 str.append(" itemClassName=\""+keyItemClassName+"\""); 1646 str.append(">"); 1647 str.append(filterNonXmlChars(replaceCharsWithEntities(key2String))); 1648 str.append("</Name>\n <Value"); 1649 if (valueClassName != null) 1650 str.append(" className=\"" + valueClassName + "\""); 1651 if (valueItemClassName != null) 1652 str.append(" itemClassName=\"" + valueItemClassName + "\""); 1653 str.append(">"); 1654 
str.append(filterNonXmlChars(replaceCharsWithEntities(value2String))); 1655 str.append("</Value>\n</Feature>\n"); 1656 }// End if 1657 }// end While 1658 return str.toString(); 1659 }//featuresToXml 1660 1661 /** This method replace all chars that appears in the anInputString and also 1662 * that are in the entitiesMap with their corresponding entity 1663 * @param anInputString the string analyzed. If it is null then returns the 1664 * empty string 1665 * @return a string representing the input string with chars replaced with 1666 * entities 1667 */ 1668 private StringBuffer replaceCharsWithEntities(String anInputString){ 1669 if (anInputString == null) return new StringBuffer(""); 1670 StringBuffer strBuff = new StringBuffer(anInputString); 1671 for (int i=strBuff.length()-1; i>=0; i--){ 1672 Character ch = new Character(strBuff.charAt(i)); 1673 if (entitiesMap.keySet().contains(ch)){ 1674 strBuff.replace(i,i+1,(String) entitiesMap.get(ch)); 1675 }// End if 1676 }// End for 1677 return strBuff; 1678 }//replaceCharsWithEntities() 1679 1680 /** This method creates Node XML elements and inserts them at the 1681 * corresponding offset inside the text. Nodes are created from the default 1682 * annotation set, as well as from all existing named annotation sets. 1683 * @param aText The text representing the document's plain text. 1684 * @return The text with empty <Node id="NodeId"/> elements. 
1685 */ 1686 private String textWithNodes(String aText){ 1687 if (aText == null) return new String(""); 1688 StringBuffer textWithNodes = filterNonXmlChars(new StringBuffer(aText)); 1689 1690 // Construct a map from offsets to Chars 1691 TreeMap offsets2CharsMap = new TreeMap(); 1692 if (aText.length()!= 0){ 1693 // Fill the offsets2CharsMap with all the indices where special chars appear 1694 buildEntityMapFromString(aText,offsets2CharsMap); 1695 }//End if 1696 // Construct the offsetsSet for all nodes belonging to this document 1697 TreeSet offsetsSet = new TreeSet(); 1698 Iterator annotSetIter = this.getAnnotations().iterator(); 1699 while (annotSetIter.hasNext()){ 1700 Annotation annot = (Annotation) annotSetIter.next(); 1701 offsetsSet.add(annot.getStartNode().getOffset()); 1702 offsetsSet.add(annot.getEndNode().getOffset()); 1703 }// end While 1704 // Get the nodes from all other named annotation sets. 1705 if (namedAnnotSets != null){ 1706 Iterator iter = namedAnnotSets.values().iterator(); 1707 while(iter.hasNext()){ 1708 AnnotationSet annotSet = (AnnotationSet) iter.next(); 1709 Iterator iter2 = annotSet.iterator(); 1710 while(iter2.hasNext()){ 1711 Annotation annotTmp = (Annotation) iter2.next(); 1712 offsetsSet.add(annotTmp.getStartNode().getOffset()); 1713 offsetsSet.add(annotTmp.getEndNode().getOffset()); 1714 }// End while 1715 }// End while 1716 }// End if 1717 // offsetsSet is ordered in ascending order because the structure 1718 // is a TreeSet 1719 1720 if (offsetsSet.isEmpty()){ 1721 return replaceCharsWithEntities(aText).toString(); 1722 }// End if 1723 // Iterate through all nodes from anAnnotSet and transform them to 1724 // XML elements. Then insert those elements at the node's offset into the 1725 // textWithNodes . 
1726 while (!offsetsSet.isEmpty()){ 1727 Long offset = (Long) offsetsSet.last(); 1728 // Eliminate the offset from the list in order to create more memory space 1729 offsetsSet.remove(offset); 1730 // Use offset 1731 int offsetValue = offset.intValue(); 1732 String strNode = "<Node id=\"" + offsetValue + "\"/>"; 1733 // Before inserting this string into the textWithNodes, check to see if 1734 // there are any chars to be replaced with their corresponding entities 1735 if (!offsets2CharsMap.isEmpty()){ 1736 Integer offsChar = (Integer) offsets2CharsMap.lastKey(); 1737 while( !offsets2CharsMap.isEmpty() && 1738 offsChar.intValue() >= offset.intValue()){ 1739 // Replace the char at offsChar with its corresponding entity form 1740 // the entitiesMap. 1741 textWithNodes.replace(offsChar.intValue(),offsChar.intValue()+1, 1742 (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar))); 1743 // Discard the offsChar after it was used because this offset will 1744 // never appear again 1745 offsets2CharsMap.remove(offsChar); 1746 // Investigate next offsChar 1747 if (!offsets2CharsMap.isEmpty()) 1748 offsChar = (Integer) offsets2CharsMap.lastKey(); 1749 }// End while 1750 }// End if 1751 // Now it is safe to insert the node 1752 textWithNodes.insert(offsetValue,strNode); 1753 }// end while 1754 // Need to replace the entities in the remaining text, if there is any text 1755 // So, if there are any more items in offsets2CharsMap they need to be 1756 // replaced 1757 while (!offsets2CharsMap.isEmpty()){ 1758 Integer offsChar = (Integer) offsets2CharsMap.lastKey(); 1759 // Replace the char with its entity 1760 textWithNodes.replace(offsChar.intValue(),offsChar.intValue()+1, 1761 (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar))); 1762 // remove the offset from the map 1763 offsets2CharsMap.remove(offsChar); 1764 }// End while 1765 return textWithNodes.toString(); 1766 }//textWithNodes() 1767 1768 /** This method saves an AnnotationSet as XML. 
1769 * @param anAnnotationSet The annotation set that has to be saved as XML. 1770 * @return a String like this: <AnnotationSet> <Annotation>.... 1771 * </AnnotationSet> 1772 */ 1773 private String annotationSetToXml(AnnotationSet anAnnotationSet){ 1774 StringBuffer str = new StringBuffer(""); 1775 1776 if (anAnnotationSet == null){ 1777 str.append("<AnnotationSet>\n"); 1778 str.append("</AnnotationSet>\n"); 1779 return str.toString(); 1780 }// End if 1781 if (anAnnotationSet.getName() == null) 1782 str.append("<AnnotationSet>\n"); 1783 else str.append("<AnnotationSet Name=\"" + anAnnotationSet.getName()+ 1784 "\" >\n"); 1785 // Iterate through AnnotationSet and save each Annotation as XML 1786 Iterator iterator = anAnnotationSet.iterator(); 1787 while (iterator.hasNext()){ 1788 Annotation annot = (Annotation) iterator.next(); 1789 str.append("<Annotation " + "Type=\"" + annot.getType() + 1790 "\" StartNode=\"" + annot.getStartNode().getOffset() + 1791 "\" EndNode=\"" + annot.getEndNode().getOffset() + "\">\n"); 1792 str.append(featuresToXml(annot.getFeatures())); 1793 str.append("</Annotation>\n"); 1794 }// End while 1795 1796 str.append("</AnnotationSet>\n"); 1797 return str.toString(); 1798 }// annotationSetToXml 1799 1800 /** Returns a map with the named annotation sets. It returns <code>null</code> 1801 * if no named annotaton set exists. */ 1802 public Map getNamedAnnotationSets() { 1803 return namedAnnotSets; 1804 } // getNamedAnnotationSets 1805 1806 /** 1807 * Removes one of the named annotation sets. 1808 * Note that the default annotation set cannot be removed. 1809 * @param name the name of the annotation set to be removed 1810 */ 1811 public void removeAnnotationSet(String name){ 1812 Object removed = namedAnnotSets.remove(name); 1813 if(removed != null){ 1814 fireAnnotationSetRemoved( 1815 new DocumentEvent(this, DocumentEvent.ANNOTATION_SET_REMOVED, name)); 1816 } 1817 } 1818 1819 /** Propagate edit changes to the document content and annotations. 
*/ 1820 public void edit(Long start, Long end, DocumentContent replacement) 1821 throws InvalidOffsetException 1822 { 1823 if(! isValidOffsetRange(start, end)) 1824 throw new InvalidOffsetException(); 1825 1826 if(content != null) 1827 ((DocumentContentImpl) content).edit(start, end, replacement); 1828 1829 if(defaultAnnots != null) 1830 ((AnnotationSetImpl) defaultAnnots).edit(start, end, replacement); 1831 1832 if(namedAnnotSets != null) { 1833 Iterator iter = namedAnnotSets.values().iterator(); 1834 while(iter.hasNext()) 1835 ((AnnotationSetImpl) iter.next()).edit(start, end, replacement); 1836 } 1837 1838 } // edit(start,end,replacement) 1839 1840 /** Check that an offset is valid, i.e. it is non-null, greater than 1841 * or equal to 0 and less than the size of the document content. 1842 */ 1843 public boolean isValidOffset(Long offset) { 1844 if(offset == null) 1845 return false; 1846 1847 long o = offset.longValue(); 1848 if(o > getContent().size().longValue() || o < 0) 1849 return false; 1850 1851 return true; 1852 } // isValidOffset 1853 1854 /** Check that both start and end are valid offsets and that 1855 * they constitute a valid offset range, i.e. start is greater 1856 * than or equal to long. 
1857 */ 1858 public boolean isValidOffsetRange(Long start, Long end) { 1859 return 1860 isValidOffset(start) && isValidOffset(end) && 1861 start.longValue() <= end.longValue(); 1862 } // isValidOffsetRange(start,end) 1863 1864 /** Sets the nextAnnotationId */ 1865 public void setNextAnnotationId(int aNextAnnotationId){ 1866 nextAnnotationId = aNextAnnotationId; 1867 }// setNextAnnotationId(); 1868 1869 /** Generate and return the next annotation ID */ 1870 public Integer getNextAnnotationId() { 1871 return new Integer(nextAnnotationId++); 1872 } // getNextAnnotationId 1873 1874 /** Generate and return the next node ID */ 1875 public Integer getNextNodeId() { return new Integer(nextNodeId++); } 1876 1877 /** Ordering based on URL.toString() and the URL offsets (if any) */ 1878 public int compareTo(Object o) throws ClassCastException { 1879 DocumentImpl other = (DocumentImpl) o; 1880 return getOrderingString().compareTo(other.getOrderingString()); 1881 } // compareTo 1882 1883 /** Utility method to produce a string for comparison in ordering. 1884 * String is based on the source URL and offsets. 1885 */ 1886 protected String getOrderingString() { 1887 if(sourceUrl == null) return toString(); 1888 1889 StringBuffer orderingString = new StringBuffer(sourceUrl.toString()); 1890 if(sourceUrlStartOffset != null && sourceUrlEndOffset != null) { 1891 orderingString.append(sourceUrlStartOffset.toString()); 1892 orderingString.append(sourceUrlEndOffset.toString()); 1893 } 1894 1895 return orderingString.toString(); 1896 } // getOrderingString() 1897 1898 /** The id of the next new annotation */ 1899 protected int nextAnnotationId = 0; 1900 1901 /** The id of the next new node */ 1902 protected int nextNodeId = 0; 1903 /** The source URL */ 1904 protected URL sourceUrl; 1905 1906 /** The document's URL name. 
*/ 1907 1908 /** The content of the document */ 1909 protected DocumentContent content; 1910 1911 /** The encoding of the source of the document content */ 1912 protected String encoding = null; 1913 1914 // Data needed in toXml(AnnotationSet) methos 1915 1916 /** This field indicates whether or not to add the tag 1917 * called GatePreserveFormat to the document. HTML, XML, SGML docs won't 1918 * have this tag added 1919 */ 1920// private boolean addGatePreserveFormatTag = false; 1921 1922 /** 1923 * Used by the XML dump preserving format method 1924 */ 1925 private Annotation theRootAnnotation = null; 1926 1927 /** This field is used when creating StringBuffers for toXml() methods. 1928 * The size of the StringBuffer will be docDonctent.size() multiplied by this 1929 * value. It is aimed to improve the performance of StringBuffer 1930 */ 1931 private final int DOC_SIZE_MULTIPLICATION_FACTOR = 1; 1932 1933 /** Constant used in the inner class AnnotationComparator to order 1934 * annotations on their start offset 1935 */ 1936 private final int ORDER_ON_START_OFFSET = 0; 1937 /** Constant used in the inner class AnnotationComparator to order 1938 * annotations on their end offset 1939 */ 1940 private final int ORDER_ON_END_OFFSET = 1; 1941 /** Constant used in the inner class AnnotationComparator to order 1942 * annotations on their ID 1943 */ 1944 private final int ORDER_ON_ANNOT_ID = 2; 1945 /** Constant used in the inner class AnnotationComparator to order 1946 * annotations ascending 1947 */ 1948 private final int ASC = 3; 1949 /** Constant used in the inner class AnnotationComparator to order 1950 * annotations descending 1951 */ 1952 private final int DESC = -3; 1953 1954 /** A map initialized in init() containing entities that needs to be 1955 * replaced in strings 1956 */ 1957 private static Map entitiesMap = null; 1958 // Initialize the entities map use when saving as xml 1959 static{ 1960 entitiesMap = new HashMap(); 1961 entitiesMap.put(new 
Character('<'),"<"); 1962 entitiesMap.put(new Character('>'),">"); 1963 entitiesMap.put(new Character('&'),"&"); 1964 entitiesMap.put(new Character('\''),"'"); 1965 entitiesMap.put(new Character('"'),"""); 1966 entitiesMap.put(new Character((char)160)," "); 1967 entitiesMap.put(new Character((char)169),"©"); 1968 }//static 1969 1970 /** The range that the content comes from at the source URL 1971 * (or null if none). 1972 */ 1973 //protected Long[] sourceUrlOffsets; 1974 1975 /** The start of the range that the content comes from at the source URL 1976 * (or null if none). 1977 */ 1978 protected Long sourceUrlStartOffset; 1979 1980 /** The end of the range that the content comes from at the source URL 1981 * (or null if none). 1982 */ 1983 protected Long sourceUrlEndOffset; 1984 1985 /** The default annotation set */ 1986 protected AnnotationSet defaultAnnots; 1987 1988 /** Named sets of annotations */ 1989 protected Map namedAnnotSets; 1990 1991 /** 1992 * A property of the document that will be set when the user 1993 * wants to create the document from a string, as opposed to from 1994 * a URL. 1995 */ 1996 private String stringContent; 1997 1998 /** 1999 * The stringContent of a document is 2000 * a property of the document that will be set when the user 2001 * wants to create the document from a string, as opposed to from 2002 * a URL. 2003 * <B>Use the <TT>getContent</TT> method instead to get the actual document 2004 * content.</B> 2005 */ 2006 public String getStringContent() { return stringContent; } 2007 2008 /** 2009 * The stringContent of a document is 2010 * a property of the document that will be set when the user 2011 * wants to create the document from a string, as opposed to from 2012 * a URL. 
2013 * <B>Use the <TT>setContent</TT> method instead to update the actual 2014 * document content.</B> 2015 */ 2016 public void setStringContent(String stringContent) { 2017 this.stringContent = stringContent; 2018 } // set StringContent 2019 2020 /** Is the document markup-aware? */ 2021 protected Boolean markupAware = new Boolean(false); 2022 2023// /** Hash code */ 2024// public int hashCode() { 2025// int code = getContent().hashCode(); 2026// int memberCode = (defaultAnnots == null) ? 0 : defaultAnnots.hashCode(); 2027// code += memberCode; 2028// memberCode = (encoding == null) ? 0 : encoding.hashCode(); 2029// code += memberCode; 2030// memberCode = (features == null) ? 0 : features.hashCode(); 2031// code += memberCode; 2032// code += (markupAware.booleanValue()) ? 0 : 1; 2033// memberCode = (namedAnnotSets == null) ? 0 : namedAnnotSets.hashCode(); 2034// code += memberCode; 2035// code += nextAnnotationId; 2036// code += nextNodeId; 2037// memberCode = (sourceUrl == null) ? 0 : sourceUrl.hashCode(); 2038// code += memberCode; 2039// memberCode = 2040// (sourceUrlStartOffset == null) ? 0 : sourceUrlStartOffset.hashCode(); 2041// code += memberCode; 2042// memberCode = 2043// (sourceUrlEndOffset == null) ? 
0 : sourceUrlEndOffset.hashCode(); 2044// code += memberCode; 2045// return code; 2046// } // hashcode 2047 2048 /** String respresentation */ 2049 public String toString() { 2050 String n = Strings.getNl(); 2051 StringBuffer s = new StringBuffer("DocumentImpl: " + n); 2052 s.append(" content:" + content + n); 2053 s.append(" defaultAnnots:" + defaultAnnots + n); 2054 s.append(" encoding:" + encoding + n); 2055 s.append(" features:" + features + n); 2056 s.append(" markupAware:" + markupAware + n); 2057 s.append(" namedAnnotSets:" + namedAnnotSets + n); 2058 s.append(" nextAnnotationId:" + nextAnnotationId + n); 2059 s.append(" nextNodeId:" + nextNodeId + n); 2060 s.append(" sourceUrl:" + sourceUrl + n); 2061 s.append(" sourceUrlStartOffset:" + sourceUrlStartOffset + n); 2062 s.append(" sourceUrlEndOffset:" + sourceUrlEndOffset + n); 2063 s.append(n); 2064 2065 return s.toString(); 2066 } // toString 2067 2068 /** Freeze the serialization UID. */ 2069 static final long serialVersionUID = -8456893608311510260L; 2070 2071 /** Inner class needed to compare annotations*/ 2072 class AnnotationComparator implements java.util.Comparator { 2073 int orderOn = -1; 2074 int orderType = ASC; 2075 /** Constructs a comparator according to one of three sorter types: 2076 * ORDER_ON_ANNOT_TYPE, ORDER_ON_END_OFFSET, ORDER_ON_START_OFFSET 2077 */ 2078 public AnnotationComparator(int anOrderOn, int anOrderType){ 2079 orderOn = anOrderOn; 2080 orderType = anOrderType; 2081 }// AnnotationComparator() 2082 2083 /**This method must be implemented according to Comparator interface */ 2084 public int compare(Object o1, Object o2){ 2085 Annotation a1 = (Annotation) o1; 2086 Annotation a2 = (Annotation) o2; 2087 // ORDER_ON_START_OFFSET ? 2088 if (orderOn == ORDER_ON_START_OFFSET){ 2089 int result = a1.getStartNode().getOffset().compareTo( 2090 a2.getStartNode().getOffset()); 2091 if (orderType == ASC){ 2092 // ASC 2093 // If they are equal then their ID will decide. 
2094 if (result == 0) 2095 return a1.getId().compareTo(a2.getId()); 2096 return result; 2097 }else{ 2098 // DESC 2099 if (result == 0) 2100 return - (a1.getId().compareTo(a2.getId())); 2101 return -result; 2102 }// End if (orderType == ASC) 2103 }// End if (orderOn == ORDER_ON_START_OFFSET) 2104 2105 // ORDER_ON_END_OFFSET ? 2106 if (orderOn == ORDER_ON_END_OFFSET){ 2107 int result = a1.getEndNode().getOffset().compareTo( 2108 a2.getEndNode().getOffset()); 2109 if (orderType == ASC){ 2110 // ASC 2111 // If they are equal then their ID will decide. 2112 if (result == 0) 2113 return - (a1.getId().compareTo(a2.getId())); 2114 return result; 2115 }else{ 2116 // DESC 2117 // If they are equal then their ID will decide. 2118 if (result == 0) 2119 return a1.getId().compareTo(a2.getId()); 2120 return - result; 2121 }// End if (orderType == ASC) 2122 }// End if (orderOn == ORDER_ON_END_OFFSET) 2123 2124 // ORDER_ON_ANNOT_ID ? 2125 if (orderOn == ORDER_ON_ANNOT_ID){ 2126 if (orderType == ASC) 2127 return a1.getId().compareTo(a2.getId()); 2128 else 2129 return -(a1.getId().compareTo(a2.getId())); 2130 }// End if 2131 return 0; 2132 }//compare() 2133 } // End inner class AnnotationComparator 2134 2135 2136 private transient Vector documentListeners; 2137 private transient Vector gateListeners; 2138 2139 public synchronized void removeDocumentListener(DocumentListener l) { 2140 if (documentListeners != null && documentListeners.contains(l)) { 2141 Vector v = (Vector) documentListeners.clone(); 2142 v.removeElement(l); 2143 documentListeners = v; 2144 } 2145 } 2146 public synchronized void addDocumentListener(DocumentListener l) { 2147 Vector v = documentListeners == null ? 
new Vector(2) : (Vector) documentListeners.clone(); 2148 if (!v.contains(l)) { 2149 v.addElement(l); 2150 documentListeners = v; 2151 } 2152 } 2153 2154 protected void fireAnnotationSetAdded(DocumentEvent e) { 2155 if (documentListeners != null) { 2156 Vector listeners = documentListeners; 2157 int count = listeners.size(); 2158 for (int i = 0; i < count; i++) { 2159 ((DocumentListener) listeners.elementAt(i)).annotationSetAdded(e); 2160 } 2161 } 2162 } 2163 2164 protected void fireAnnotationSetRemoved(DocumentEvent e) { 2165 if (documentListeners != null) { 2166 Vector listeners = documentListeners; 2167 int count = listeners.size(); 2168 for (int i = 0; i < count; i++) { 2169 ((DocumentListener) listeners.elementAt(i)).annotationSetRemoved(e); 2170 } 2171 } 2172 } 2173 public void resourceLoaded(CreoleEvent e) { 2174 } 2175 public void resourceUnloaded(CreoleEvent e) { 2176 } 2177 public void datastoreOpened(CreoleEvent e) { 2178 } 2179 public void datastoreCreated(CreoleEvent e) { 2180 } 2181 public void resourceRenamed(Resource resource, String oldName, 2182 String newName){ 2183 } 2184 public void datastoreClosed(CreoleEvent e) { 2185 if (! e.getDatastore().equals(this.getDataStore())) 2186 return; 2187 //close this lr, since it cannot stay open when the DS it comes from 2188 //is closed 2189 Factory.deleteResource(this); 2190 } 2191 public void setLRPersistenceId(Object lrID) { 2192 super.setLRPersistenceId( lrID); 2193 //make persistent documents listen to the creole register 2194 //for events about their DS 2195 Gate.getCreoleRegister().addCreoleListener(this); 2196 } 2197 public void resourceAdopted(DatastoreEvent evt) { 2198 } 2199 public void resourceDeleted(DatastoreEvent evt) { 2200 if(! 
evt.getSource().equals(this.getDataStore())) 2201 return; 2202 //if an open document is deleted from a DS, then 2203 //it must close itself immediately, as is no longer valid 2204 if(evt.getResourceID().equals(this.getLRPersistenceId())) 2205 Factory.deleteResource(this); 2206 } 2207 public void resourceWritten(DatastoreEvent evt) { 2208 } 2209 public void setDataStore(DataStore dataStore) throws gate.persist.PersistenceException { 2210 super.setDataStore( dataStore); 2211 if (this.dataStore != null) 2212 this.dataStore.addDatastoreListener(this); 2213 } 2214 2215} // class DocumentImpl 2216
|
DocumentImpl |
|