1   /*
2    *  CorpusImpl.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Hamish Cunningham, 11/Feb/2000
12   *
13   *  $Id: CorpusImpl.java,v 1.47 2001/11/29 15:15:00 valyt Exp $
14   */
15  
16  package gate.corpora;
17  
18  import java.util.*;
19  
20  import gate.*;
21  import gate.util.*;
22  import gate.annotation.*;
23  import gate.persist.*;
24  import java.io.*;
25  import java.net.*;
26  import gate.event.*;
27  import gate.creole.*;
28  
29  /** Corpora are sets of Document. They are ordered by lexicographic collation
30    * on Url.
31    */
32  public class CorpusImpl extends AbstractLanguageResource
33                          implements Corpus, CreoleListener {
34  
35    /** Debug flag */
36    private static final boolean DEBUG = false;
37  
38    public CorpusImpl(){
39      supportList = Collections.synchronizedList(new VerboseList());
40      Gate.getCreoleRegister().addCreoleListener(this);
41    }
42  
43  
44    /**
45     * Gets the names of the documents in this corpus.
46     * @return a {@link List} of Strings representing the names of the documents
47     * in this corpus.
48     */
49    public List getDocumentNames(){
50      ArrayList res = new ArrayList(supportList.size());
51      Iterator docIter = supportList.iterator();
52      while(docIter.hasNext()){
53        res.add(((Document)docIter.next()).getName());
54      }
55      return res;
56    }
57  
58    /**
59     * Gets the name of a document in this corpus.
60     * @param index the index of the document
61     * @return a String value representing the name of the document at
62     * <tt>index</tt> in this corpus.
63     */
64    public String getDocumentName(int index){
65      return ((Document)supportList.get(index)).getName();
66    }
67  
68    /**
69     * This method does not make sense for transient corpora, so it does
70     * nothing.
71     */
72    public void unloadDocument(Document doc) {
73      return;
74    }
75  
76  
77    /**
78     * The underlying list that holds the documents in this corpus.
79     */
80    protected List supportList = null;
81  
82    /**
83     * A proxy list that stores the actual data in an internal list and forwards
84     * all operations to that one but it also fires the appropiate corpus events
85     * when necessary.
86     * It also does some type checking so only Documents are accepted as corpus
87     * members.
88     */
89    protected class VerboseList extends AbstractList implements Serializable{
90  
91  
92  
93      VerboseList(){
94        data = new ArrayList();
95      }
96  
97      public Object get(int index){
98        return data.get(index);
99      }
100 
101     public int size(){
102       return data.size();
103     }
104 
105     public Object set(int index, Object element){
106       if(element instanceof Document){
107         Document oldDoc = (Document)data.set(index, element);
108         Document newDoc = (Document)element;
109 
110         //fire the 2 events
111         fireDocumentRemoved(new CorpusEvent(CorpusImpl.this,
112                                             oldDoc,
113                                             index,
114                                             CorpusEvent.DOCUMENT_REMOVED));
115         fireDocumentAdded(new CorpusEvent(CorpusImpl.this,
116                                           newDoc,
117                                           index,
118                                           CorpusEvent.DOCUMENT_ADDED));
119         return oldDoc;
120       }else{
121         throw new UnsupportedOperationException(
122           getClass().getName() +
123           " only accepts gate.Document values as members!\n" +
124           element.getClass().getName() + " is not a gate.Document");
125       }
126     }
127 
128     public void add(int index, Object element){
129       if(element instanceof Document){
130         data.add(index, element);
131 
132         //fire the event
133         fireDocumentAdded(new CorpusEvent(CorpusImpl.this,
134                                           (Document)element,
135                                           index,
136                                           CorpusEvent.DOCUMENT_ADDED));
137       }else{
138         throw new UnsupportedOperationException(
139           getClass().getName() +
140           " only accepts gate.Document values as members!\n" +
141           element.getClass().getName() + " is not a gate.Document");
142       }
143     }
144 
145     public Object remove(int index){
146       Document oldDoc = (Document)data.remove(index);
147 
148       fireDocumentRemoved(new CorpusEvent(CorpusImpl.this,
149                                           oldDoc,
150                                           index,
151                                           CorpusEvent.DOCUMENT_REMOVED));
152       return oldDoc;
153     }
154 
155     /**
156      * The List containing the actual data.
157      */
158     ArrayList data;
159   }
160 
161   /**
162    * This method returns true when the document is already loaded in memory
163    */
164   public boolean isDocumentLoaded(int index) {
165     return true;
166   }
167 
168 
169   protected void clearDocList() {
170     if (supportList == null)
171       return;
172     supportList.clear();
173   }
174 
175 
176   //List methods
177   //java docs will be automatically copied from the List interface.
178 
179   public int size() {
180     return supportList.size();
181   }
182 
183   public boolean isEmpty() {
184     return supportList.isEmpty();
185   }
186 
187   public boolean contains(Object o){
188     return supportList.contains(o);
189   }
190 
191   public Iterator iterator(){
192     return supportList.iterator();
193   }
194 
195   public Object[] toArray(){
196     return supportList.toArray();
197   }
198 
199   public Object[] toArray(Object[] a){
200     return supportList.toArray(a);
201   }
202 
203   public boolean add(Object o){
204     return supportList.add(o);
205   }
206 
207   public boolean remove(Object o){
208     return supportList.remove(o);
209   }
210 
211   public boolean containsAll(Collection c){
212     return supportList.containsAll(c);
213   }
214 
215   public boolean addAll(Collection c){
216     return supportList.addAll(c);
217   }
218 
219   public boolean addAll(int index, Collection c){
220     return supportList.addAll(index, c);
221   }
222 
223   public boolean removeAll(Collection c){
224     return supportList.removeAll(c);
225   }
226 
227   public boolean retainAll(Collection c){
228     return supportList.retainAll(c);
229   }
230 
231   public void clear(){
232     supportList.clear();
233   }
234 
235   public boolean equals(Object o){
236     if (! (o instanceof CorpusImpl))
237       return false;
238 
239     return supportList.equals(o);
240   }
241 
242   public int hashCode(){
243     return supportList.hashCode();
244   }
245 
246   public Object get(int index){
247     return supportList.get(index);
248   }
249 
250   public Object set(int index, Object element){
251     return supportList.set(index, element);
252   }
253 
254   public void add(int index, Object element){
255     supportList.add(index, element);
256   }
257 
258   public Object remove(int index){
259     return supportList.remove(index);
260   }
261 
262   public int indexOf(Object o){
263     return supportList.indexOf(o);
264   }
265 
266   public int lastIndexOf(Object o){
267     return lastIndexOf(o);
268   }
269 
270   public ListIterator listIterator(){
271     return supportList.listIterator();
272   }
273 
274   public ListIterator listIterator(int index){
275     return supportList.listIterator(index);
276   }
277 
278   public List subList(int fromIndex, int toIndex){
279     return supportList.subList(fromIndex, toIndex);
280   }
281 
282 
283   /** Construction */
284 
285   public void cleanup(){
286   }
287 
288   /** Initialise this resource, and return it. */
289   public Resource init() {
290     if(documentsList != null && !documentsList.isEmpty()){
291       addAll(documentsList);
292     }
293     return this;
294   } // init()
295 
296 
297   /**
298    * Fills the provided corpus with documents created on the fly from selected
299    * files in a directory. Uses a link {@FileFilter} to select which files will
300    * be used and which will be ignored.
301    * A simple file filter based on extensions is provided in the Gate
302    * distribution ({@link gate.util.ExtensionFileFilter}).
303    * @param corpus the corpus to be populated
304    * @param directory the directory from which the files will be picked. This
305    * parameter is an URL for uniformity. It needs to be a URL of type file
306    * otherwise an InvalidArgumentException will be thrown.
307    * @param filter the file filter used to select files from the target
308    * directory. If the filter is <tt>null</tt> all the files will be accepted.
309    * @param encoding the encoding to be used for reading the documents
310    * @param recurseDirectories should the directory be parsed recursively?. If
311    * <tt>true</tt> all the files from the provided directory and all its
312    * children directories (on as many levels as necessary) will be picked if
313    * accepted by the filter otherwise the children directories will be ignored.
314    */
315   public static void populate(Corpus corpus, URL directory, FileFilter filter,
316                               String encoding, boolean recurseDirectories)
317                      throws IOException, ResourceInstantiationException{
318     //check input
319     if(!directory.getProtocol().equalsIgnoreCase("file"))
320       throw new IllegalArgumentException(
321         "The URL provided is not of type \"file:\"!");
322 
323     File dir = new File(directory.getPath());
324     if(!dir.exists())
325       throw new FileNotFoundException(dir.toString());
326 
327     if(!dir.isDirectory())
328       throw new IllegalArgumentException(
329         dir.getAbsolutePath() + " is not a directory!");
330 
331     //populate the corpus
332     File[] files = dir.listFiles(filter);
333     if(files != null){
334       for(int i = 0; i < files.length; i++){
335         File aFile = files[i];
336         if(aFile.isDirectory()){
337           //recurse dir if required
338           if(recurseDirectories){
339             populate(corpus, aFile.toURL(), filter,
340                      encoding, recurseDirectories);
341           }
342         }else{
343           //create the doc
344           StatusListener sListener = (StatusListener)
345                                      gate.gui.MainFrame.getListeners().
346                                      get("gate.event.StatusListener");
347           if(sListener != null) sListener.statusChanged(
348             "Reading: " + aFile.getName());
349           String docName = aFile.getName() + "_" + Gate.genSym();
350           FeatureMap params = Factory.newFeatureMap();
351           params.put("sourceUrl", aFile.toURL());
352           if(encoding != null) params.put("encoding", encoding);
353 
354           corpus.add(Factory.createResource(DocumentImpl.class.getName(),
355                                             params, null, docName));
356           if(sListener != null) sListener.statusChanged(
357             aFile.getName() + " read");
358         }
359       }
360     }
361   }//public static void populate
362 
363   /**
364    * Fills this corpus with documents created from files in a directory.
365    * @param filter the file filter used to select files from the target
366    * directory. If the filter is <tt>null</tt> all the files will be accepted.
367    * @param directory the directory from which the files will be picked. This
368    * parameter is an URL for uniformity. It needs to be a URL of type file
369    * otherwise an InvalidArgumentException will be thrown.
370    * An implementation for this method is provided as a static method at
371    * {@link gate.corpora.CorpusImpl#populate(Corpus,URL,FileFilter,boolean)}.
372    * @param encoding the encoding to be used for reading the documents
373    * @param recurseDirectories should the directory be parsed recursively?. If
374    * <tt>true</tt> all the files from the provided directory and all its
375    * children directories (on as many levels as necessary) will be picked if
376    * accepted by the filter otherwise the children directories will be ignored.
377    */
378   public void populate(URL directory, FileFilter filter, String encoding,
379                        boolean recurseDirectories)
380               throws IOException, ResourceInstantiationException{
381     populate(this, directory, filter, encoding, recurseDirectories);
382   }
383 
384   public synchronized void removeCorpusListener(CorpusListener l) {
385     if (corpusListeners != null && corpusListeners.contains(l)) {
386       Vector v = (Vector) corpusListeners.clone();
387       v.removeElement(l);
388       corpusListeners = v;
389     }
390   }
391   public synchronized void addCorpusListener(CorpusListener l) {
392     Vector v = corpusListeners == null ? new Vector(2) : (Vector) corpusListeners.clone();
393     if (!v.contains(l)) {
394       v.addElement(l);
395       corpusListeners = v;
396     }
397   }
398 
399   /** Freeze the serialization UID. */
400   static final long serialVersionUID = -1113142759053898456L;
401   private transient Vector corpusListeners;
402   protected transient java.util.List documentsList;
403 
404 
405   protected void fireDocumentAdded(CorpusEvent e) {
406     if (corpusListeners != null) {
407       Vector listeners = corpusListeners;
408       int count = listeners.size();
409       for (int i = 0; i < count; i++) {
410         ((CorpusListener) listeners.elementAt(i)).documentAdded(e);
411       }
412     }
413   }
414   protected void fireDocumentRemoved(CorpusEvent e) {
415     if (corpusListeners != null) {
416       Vector listeners = corpusListeners;
417       int count = listeners.size();
418       for (int i = 0; i < count; i++) {
419         ((CorpusListener) listeners.elementAt(i)).documentRemoved(e);
420       }
421     }
422   }
423   public void setDocumentsList(java.util.List documentsList) {
424     this.documentsList = documentsList;
425   }
426   public java.util.List getDocumentsList() {
427     return documentsList;
428   }
429   public void resourceLoaded(CreoleEvent e) {
430   }
431   public void resourceUnloaded(CreoleEvent e) {
432     Resource res = e.getResource();
433     //remove all occurences
434     if(res instanceof Document) while(contains(res)) remove(res);
435   }
436   public void datastoreOpened(CreoleEvent e) {
437   }
438   public void datastoreCreated(CreoleEvent e) {
439   }
440   public void datastoreClosed(CreoleEvent e) {
441   }
442 } // class CorpusImpl
443