|
DocumentContentImpl |
|
1 /* 2 * DocumentContentImpl.java 3 * 4 * Copyright (c) 1998-2001, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Hamish Cunningham, 11/Feb/2000 12 * 13 * $Id: DocumentContentImpl.java,v 1.24 2001/11/08 17:13:08 hamish Exp $ 14 */ 15 16 package gate.corpora; 17 18 import java.util.*; 19 import java.net.*; 20 import java.io.*; 21 22 import gate.*; 23 import gate.annotation.*; 24 import gate.util.*; 25 26 /** Represents the commonalities between all sorts of document contents. 27 */ 28 public class DocumentContentImpl implements DocumentContent 29 { 30 /** Debug flag */ 31 private static final boolean DEBUG = false; 32 33 /** Buffer size for reading 34 * 16k is 4 times the block size on most filesystems 35 * so it should be efficient for most cases 36 * */ 37 private static final int INTERNAL_BUFFER_SIZE = 16*1024; 38 39 /** Default construction */ 40 public DocumentContentImpl() { 41 content = new String(); 42 } // default construction 43 44 /** Contruction from URL and offsets. */ 45 public DocumentContentImpl(URL u, String encoding, Long start, Long end) 46 throws IOException { 47 48 int readLength = 0; 49 char[] readBuffer = new char[INTERNAL_BUFFER_SIZE]; 50 51 BufferedReader uReader = null; 52 StringBuffer buf = new StringBuffer(); 53 char c; 54 long s = 0, e = Long.MAX_VALUE, counter = 0; 55 if(start != null && end != null) { 56 s = start.longValue(); 57 e = end.longValue(); 58 } 59 60 if(encoding != null && !encoding.equalsIgnoreCase("")) { 61 uReader = new BufferedReader( 62 new InputStreamReader(u.openStream(), encoding), INTERNAL_BUFFER_SIZE 63 ); 64 } else { 65 uReader = new BufferedReader( 66 new InputStreamReader(u.openStream()), INTERNAL_BUFFER_SIZE 67 ); 68 }; 69 70 // 1. skip S characters 71 uReader.skip(s); 72 73 // 2. how many character shall I read? 74 long toRead = e - s; 75 76 // 3. read gtom source into buffer 77 while ( 78 toRead > 0 && 79 (readLength = uReader.read(readBuffer, 0, INTERNAL_BUFFER_SIZE)) != -1 80 ) { 81 if (toRead < readLength) { 82 //well, if toRead(long) is less than readLenght(int) 83 //then there can be no overflow, so the cast is safe 84 readLength = (int)toRead; 85 } 86 87 buf.append(readBuffer, 0, readLength); 88 toRead -= readLength; 89 } 90 91 // 4.close reader 92 uReader.close(); 93 94 content = new String(buf); 95 } // Contruction from URL and offsets 96 97 /** Propagate changes to the document content. */ 98 void edit(Long start, Long end, DocumentContent replacement) 99 { 100 int s = start.intValue(), e = end.intValue(); 101 String repl = ((DocumentContentImpl) replacement).content; 102 StringBuffer newContent = new StringBuffer(content); 103 newContent.replace(s, e, repl); 104 content = newContent.toString(); 105 } // edit(start,end,replacement) 106 107 /** The contents under a particular span. */ 108 public DocumentContent getContent(Long start, Long end) 109 throws InvalidOffsetException 110 { 111 if(! isValidOffsetRange(start, end)) 112 throw new InvalidOffsetException(); 113 114 return new DocumentContentImpl( 115 content.substring(start.intValue(), end.intValue()) 116 ); 117 } // getContent(start, end) 118 119 /** Returns the String representing the content in case of a textual document. 120 * NOTE: this is a temporary solution until we have a more generic one. 121 */ 122 public String toString(){ 123 return content; 124 } 125 126 /** The size of this content (e.g. character length for textual 127 * content). 128 */ 129 public Long size() { 130 return new Long(content.length()); 131 } // size() 132 133 /** Check that an offset is valid */ 134 boolean isValidOffset(Long offset) { 135 if(offset == null) 136 return false; 137 138 long o = offset.longValue(); 139 long len = content.length(); 140 if(o > len || o < 0) 141 return false; 142 143 return true; 144 } // isValidOffset 145 146 /** Check that both start and end are valid offsets and that 147 * they constitute a valid offset range 148 */ 149 boolean isValidOffsetRange(Long start, Long end) { 150 return 151 isValidOffset(start) && isValidOffset(end) && 152 start.longValue() <= end.longValue(); 153 } // isValidOffsetRange(start,end) 154 155 /** Two documents are the same if their contents is the same 156 */ 157 public boolean equals(Object other) { 158 if (!(other instanceof DocumentContentImpl)) return false; 159 160 DocumentContentImpl docImpl = (DocumentContentImpl) other; 161 return content.equals(docImpl.toString()); 162 } // equals 163 164 /** Calculate the hash value for the object. */ 165 public int hashCode(){ return toString().hashCode(); } 166 167 /** Just for now - later we have to cater for different types of 168 * content. 169 */ 170 String content; 171 172 /** For ranges */ 173 public DocumentContentImpl(String s) { content = s; } 174 175 /** Freeze the serialization UID. */ 176 static final long serialVersionUID = -1426940535575467461L; 177 } // class DocumentContentImpl 178
|
DocumentContentImpl |
|