1   /*
2    *  Corpus.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Hamish Cunningham, 19/Jan/2000
12   *
13   *  $Id: Corpus.java,v 1.18 2001/11/29 15:14:59 valyt Exp $
14   */
15  
16  package gate;
17  import java.util.*;
18  import java.net.URL;
19  import java.io.FileFilter;
20  import java.io.IOException;
21  
22  import gate.util.*;
23  import gate.event.*;
24  import gate.creole.ResourceInstantiationException;
25  
26  /** Corpora are lists of Document. TIPSTER equivalent: Collection.
27    */
28  public interface Corpus extends LanguageResource, List, NameBearer {
29  
30    /**
31     * Gets the names of the documents in this corpus.
32     * @return a {@link List} of Strings representing the names of the documents
33     * in this corpus.
34     */
35    public List getDocumentNames();
36  
37    /**
38     * Gets the name of a document in this corpus.
39     * @param index the index of the document
40     * @return a String value representing the name of the document at
41     * <tt>index</tt> in this corpus.
42     */
43    public String getDocumentName(int index);
44  
45    /**
46     * Unloads the document from memory. Only needed if memory
47     * preservation is an issue. Only supported for Corpus which is
48     * stored in a Datastore. To get this document back in memory,
49     * use get() on Corpus or if you have its persistent ID, request it
50     * from the Factory.
51     * <P>
52     * Transient Corpus objects do nothing,
53     * because there would be no way to get the document back
54     * again afterwards.
55     * @param Document to be unloaded from memory.
56     * @return void.
57     */
58    public void unloadDocument(Document doc);
59  
60    /**
61     * Fills this corpus with documents created on the fly from selected files in
62     * a directory. Uses a link {@FileFilter} to select which files will be used
63     * and which will be ignored.
64     * A simple file filter based on extensions is provided in the Gate
65     * distribution ({@link gate.util.ExtensionFileFilter}).
66     * @param directory the directory from which the files will be picked. This
67     * parameter is an URL for uniformity. It needs to be a URL of type file
68     * otherwise an InvalidArgumentException will be thrown.
69     * An implementation for this method is provided as a static method at
70     * {@link gate.corpora.CorpusImpl#populate(Corpus,URL,FileFilter,boolean)}.
71     * @param filter the file filter used to select files from the target
72     * directory. If the filter is <tt>null</tt> all the files will be accepted.
73     * @param encoding the encoding to be used for reading the documents
74     * @param recurseDirectories should the directory be parsed recursively?. If
75     * <tt>true</tt> all the files from the provided directory and all its
76     * children directories (on as many levels as necessary) will be picked if
77     * accepted by the filter otherwise the children directories will be ignored.
78     */
79    public void populate(URL directory, FileFilter filter,
80                         String encoding, boolean recurseDirectories)
81                         throws IOException, ResourceInstantiationException;
82  
83  
84    /**
85     * This method returns true when the document is already loaded in memory.
86     * The transient corpora will always return true as they can only contain
87     * documents that are present in the memory.
88     */
89    public boolean isDocumentLoaded(int index);
90  
91  
92    /**
93     * Removes one of the listeners registered with this corpus.
94     * @param l the listener to be removed.
95     */
96    public void removeCorpusListener(CorpusListener l);
97  
98    /**
99     * Registers a new {@link CorpusListener} with this corpus.
100    * @param l the listener to be added.
101    */
102   public void addCorpusListener(CorpusListener l);
103 
104 } // interface Corpus
105