1   /*
2    *  DocumentContentImpl.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Hamish Cunningham, 11/Feb/2000
12   *
13   *  $Id: DocumentContentImpl.java,v 1.24 2001/11/08 17:13:08 hamish Exp $
14   */
15  
16  package gate.corpora;
17  
18  import java.util.*;
19  import java.net.*;
20  import java.io.*;
21  
22  import gate.*;
23  import gate.annotation.*;
24  import gate.util.*;
25  
26  /** Represents the commonalities between all sorts of document contents.
27    */
28  public class DocumentContentImpl implements DocumentContent
29  {
30    /** Debug flag */
31    private static final boolean DEBUG = false;
32  
33    /** Buffer size for reading
34     *  16k is 4 times the block size on most filesystems
35     *  so it should be efficient for most cases
36     *  */
37    private static final int INTERNAL_BUFFER_SIZE  = 16*1024;
38  
39    /** Default construction */
40    public DocumentContentImpl() {
41      content = new String();
42    } // default construction
43  
44    /** Contruction from URL and offsets. */
45    public DocumentContentImpl(URL u, String encoding, Long start, Long end)
46    throws IOException {
47  
48      int readLength = 0;
49      char[] readBuffer = new char[INTERNAL_BUFFER_SIZE];
50  
51      BufferedReader uReader = null;
52      StringBuffer buf = new StringBuffer();
53      char c;
54      long s = 0, e = Long.MAX_VALUE, counter = 0;
55      if(start != null && end != null) {
56        s = start.longValue();
57        e = end.longValue();
58      }
59  
60      if(encoding != null && !encoding.equalsIgnoreCase("")) {
61        uReader = new BufferedReader(
62          new InputStreamReader(u.openStream(), encoding), INTERNAL_BUFFER_SIZE
63        );
64      } else {
65        uReader = new BufferedReader(
66          new InputStreamReader(u.openStream()), INTERNAL_BUFFER_SIZE
67        );
68      };
69  
70      // 1. skip S characters
71      uReader.skip(s);
72  
73      // 2. how many character shall I read?
74      long toRead = e - s;
75  
76      // 3. read gtom source into buffer
77      while (
78        toRead > 0 &&
79        (readLength = uReader.read(readBuffer, 0, INTERNAL_BUFFER_SIZE)) != -1
80      ) {
81        if (toRead <  readLength) {
82          //well, if toRead(long) is less than readLenght(int)
83          //then there can be no overflow, so the cast is safe
84          readLength = (int)toRead;
85        }
86  
87        buf.append(readBuffer, 0, readLength);
88        toRead -= readLength;
89      }
90  
91      // 4.close reader
92      uReader.close();
93  
94      content = new String(buf);
95    } // Contruction from URL and offsets
96  
97    /** Propagate changes to the document content. */
98    void edit(Long start, Long end, DocumentContent replacement)
99    {
100     int s = start.intValue(), e = end.intValue();
101     String repl = ((DocumentContentImpl) replacement).content;
102     StringBuffer newContent = new StringBuffer(content);
103     newContent.replace(s, e, repl);
104     content = newContent.toString();
105   } // edit(start,end,replacement)
106 
107   /** The contents under a particular span. */
108   public DocumentContent getContent(Long start, Long end)
109     throws InvalidOffsetException
110   {
111     if(! isValidOffsetRange(start, end))
112       throw new InvalidOffsetException();
113 
114     return new DocumentContentImpl(
115       content.substring(start.intValue(), end.intValue())
116     );
117   } // getContent(start, end)
118 
119   /** Returns the String representing the content in case of a textual document.
120     * NOTE: this is a temporary solution until we have a more generic one.
121     */
122   public String toString(){
123     return content;
124   }
125 
126   /** The size of this content (e.g. character length for textual
127     * content).
128     */
129   public Long size() {
130     return new Long(content.length());
131   } // size()
132 
133   /** Check that an offset is valid */
134   boolean isValidOffset(Long offset) {
135     if(offset == null)
136       return false;
137 
138     long o = offset.longValue();
139     long len = content.length();
140     if(o > len || o < 0)
141       return false;
142 
143     return true;
144   } // isValidOffset
145 
146   /** Check that both start and end are valid offsets and that
147     * they constitute a valid offset range
148     */
149   boolean isValidOffsetRange(Long start, Long end) {
150     return
151       isValidOffset(start) && isValidOffset(end) &&
152       start.longValue() <= end.longValue();
153   } // isValidOffsetRange(start,end)
154 
155   /** Two documents are the same if their contents is the same
156    */
157   public boolean equals(Object other) {
158     if (!(other instanceof DocumentContentImpl)) return false;
159 
160     DocumentContentImpl docImpl = (DocumentContentImpl) other;
161     return content.equals(docImpl.toString());
162   } // equals
163 
164   /** Calculate the hash value for the object. */
165   public int hashCode(){ return toString().hashCode(); }
166 
167   /** Just for now - later we have to cater for different types of
168     * content.
169     */
170   String content;
171 
172   /** For ranges */
173   public DocumentContentImpl(String s) { content = s; }
174 
175   /** Freeze the serialization UID. */
176   static final long serialVersionUID = -1426940535575467461L;
177 } // class DocumentContentImpl
178