|
DocumentContentImpl |
|
1 /* 2 * DocumentContentImpl.java 3 * 4 * Copyright (c) 1998-2001, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Hamish Cunningham, 11/Feb/2000 12 * 13 * $Id: DocumentContentImpl.java,v 1.26 2002/01/03 12:46:44 nasso Exp $ 14 */ 15 16 package gate.corpora; 17 18 import java.util.*; 19 import java.net.*; 20 import java.io.*; 21 22 import gate.*; 23 import gate.annotation.*; 24 import gate.util.*; 25 26 /** Represents the commonalities between all sorts of document contents. 27 */ 28 public class DocumentContentImpl implements DocumentContent 29 { 30 /** Debug flag */ 31 private static final boolean DEBUG = false; 32 33 /** Buffer size for reading 34 * 16k is 4 times the block size on most filesystems 35 * so it should be efficient for most cases 36 * */ 37 private static final int INTERNAL_BUFFER_SIZE = 16*1024; 38 39 /** Default construction */ 40 public DocumentContentImpl() { 41 content = new String(); 42 } // default construction 43 44 /** Contruction from URL and offsets. */ 45 public DocumentContentImpl(URL u, String encoding, Long start, Long end) 46 throws IOException { 47 48 int readLength = 0; 49 char[] readBuffer = new char[INTERNAL_BUFFER_SIZE]; 50 51 BufferedReader uReader = null; 52 StringBuffer buf = new StringBuffer(); 53 char c; 54 long s = 0, e = Long.MAX_VALUE, counter = 0; 55 if(start != null && end != null) { 56 s = start.longValue(); 57 e = end.longValue(); 58 } 59 60 if(encoding != null && !encoding.equalsIgnoreCase("")) { 61 uReader = new BufferedReader( 62 new InputStreamReader(u.openStream(), encoding), INTERNAL_BUFFER_SIZE 63 ); 64 } else { 65 uReader = new BufferedReader( 66 new InputStreamReader(u.openStream()), INTERNAL_BUFFER_SIZE 67 ); 68 }; 69 70 // 1. skip S characters 71 uReader.skip(s); 72 73 // 2. how many character shall I read? 74 long toRead = e - s; 75 76 // 3. read gtom source into buffer 77 while ( 78 toRead > 0 && 79 (readLength = uReader.read(readBuffer, 0, INTERNAL_BUFFER_SIZE)) != -1 80 ) { 81 if (toRead < readLength) { 82 //well, if toRead(long) is less than readLenght(int) 83 //then there can be no overflow, so the cast is safe 84 readLength = (int)toRead; 85 } 86 87 buf.append(readBuffer, 0, readLength); 88 toRead -= readLength; 89 } 90 91 // 4.close reader 92 uReader.close(); 93 94 content = new String(buf); 95 originalContent = content; 96 } // Contruction from URL and offsets 97 98 /** Propagate changes to the document content. */ 99 void edit(Long start, Long end, DocumentContent replacement) 100 { 101 int s = start.intValue(), e = end.intValue(); 102 String repl = ((DocumentContentImpl) replacement).content; 103 StringBuffer newContent = new StringBuffer(content); 104 newContent.replace(s, e, repl); 105 content = newContent.toString(); 106 } // edit(start,end,replacement) 107 108 /** The contents under a particular span. */ 109 public DocumentContent getContent(Long start, Long end) 110 throws InvalidOffsetException 111 { 112 if(! isValidOffsetRange(start, end)) 113 throw new InvalidOffsetException(); 114 115 return new DocumentContentImpl( 116 content.substring(start.intValue(), end.intValue()) 117 ); 118 } // getContent(start, end) 119 120 /** Returns the String representing the content in case of a textual document. 121 * NOTE: this is a temporary solution until we have a more generic one. 122 */ 123 public String toString(){ 124 return content; 125 } 126 127 /** The size of this content (e.g. character length for textual 128 * content). 129 */ 130 public Long size() { 131 return new Long(content.length()); 132 } // size() 133 134 /** Check that an offset is valid */ 135 boolean isValidOffset(Long offset) { 136 if(offset == null) 137 return false; 138 139 long o = offset.longValue(); 140 long len = content.length(); 141 if(o > len || o < 0) 142 return false; 143 144 return true; 145 } // isValidOffset 146 147 /** Check that both start and end are valid offsets and that 148 * they constitute a valid offset range 149 */ 150 boolean isValidOffsetRange(Long start, Long end) { 151 return 152 isValidOffset(start) && isValidOffset(end) && 153 start.longValue() <= end.longValue(); 154 } // isValidOffsetRange(start,end) 155 156 /** Two documents are the same if their contents is the same 157 */ 158 public boolean equals(Object other) { 159 if (!(other instanceof DocumentContentImpl)) return false; 160 161 DocumentContentImpl docImpl = (DocumentContentImpl) other; 162 return content.equals(docImpl.toString()); 163 } // equals 164 165 /** Calculate the hash value for the object. */ 166 public int hashCode(){ return toString().hashCode(); } 167 168 /** Just for now - later we have to cater for different types of 169 * content. 170 */ 171 String content; 172 173 /** 174 * For preserving the original content of the document. 175 * The edit command didn't affect on the original content. 176 * If you construct the content by URL the originalContent will keep 177 * whole information retrieved by URL even you set some start and end. 178 */ 179 String originalContent; 180 181 /** 182 * Return the original content of the document received during the loading 183 * phase or on construction from string. 184 */ 185 public String getOriginalContent() { return originalContent; } 186 187 /** For ranges */ 188 public DocumentContentImpl(String s) 189 { content = s; originalContent = content; } 190 191 /** Freeze the serialization UID. */ 192 static final long serialVersionUID = -1426940535575467461L; 193 } // class DocumentContentImpl 194
|
DocumentContentImpl |
|