1   /*
2    *  Corpus.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Hamish Cunningham, 19/Jan/2000
12   *
13   *  $Id: Corpus.java,v 1.19 2002/03/06 17:15:37 kalina Exp $
14   */
15  
16  package gate;
17  import java.util.*;
18  import java.net.URL;
19  import java.io.FileFilter;
20  import java.io.IOException;
21  
22  import gate.util.*;
23  import gate.event.*;
24  import gate.creole.ResourceInstantiationException;
25  
26  /** Corpora are lists of Document. TIPSTER equivalent: Collection.
27    */
28  public interface Corpus extends LanguageResource, List, NameBearer {
29  
30    public static final String CORPUS_NAME_PARAMETER_NAME = "name";
31    public static final String CORPUS_DOCLIST_PARAMETER_NAME = "documentsList";
32  
33    /**
34     * Gets the names of the documents in this corpus.
35     * @return a {@link List} of Strings representing the names of the documents
36     * in this corpus.
37     */
38    public List getDocumentNames();
39  
40    /**
41     * Gets the name of a document in this corpus.
42     * @param index the index of the document
43     * @return a String value representing the name of the document at
44     * <tt>index</tt> in this corpus.
45     */
46    public String getDocumentName(int index);
47  
48    /**
49     * Unloads the document from memory. Only needed if memory
50     * preservation is an issue. Only supported for Corpus which is
51     * stored in a Datastore. To get this document back in memory,
52     * use get() on Corpus or if you have its persistent ID, request it
53     * from the Factory.
54     * <P>
55     * Transient Corpus objects do nothing,
56     * because there would be no way to get the document back
57     * again afterwards.
58     * @param Document to be unloaded from memory.
59     * @return void.
60     */
61    public void unloadDocument(Document doc);
62  
63    /**
64     * Fills this corpus with documents created on the fly from selected files in
65     * a directory. Uses a link {@FileFilter} to select which files will be used
66     * and which will be ignored.
67     * A simple file filter based on extensions is provided in the Gate
68     * distribution ({@link gate.util.ExtensionFileFilter}).
69     * @param directory the directory from which the files will be picked. This
70     * parameter is an URL for uniformity. It needs to be a URL of type file
71     * otherwise an InvalidArgumentException will be thrown.
72     * An implementation for this method is provided as a static method at
73     * {@link gate.corpora.CorpusImpl#populate(Corpus,URL,FileFilter,boolean)}.
74     * @param filter the file filter used to select files from the target
75     * directory. If the filter is <tt>null</tt> all the files will be accepted.
76     * @param encoding the encoding to be used for reading the documents
77     * @param recurseDirectories should the directory be parsed recursively?. If
78     * <tt>true</tt> all the files from the provided directory and all its
79     * children directories (on as many levels as necessary) will be picked if
80     * accepted by the filter otherwise the children directories will be ignored.
81     */
82    public void populate(URL directory, FileFilter filter,
83                         String encoding, boolean recurseDirectories)
84                         throws IOException, ResourceInstantiationException;
85  
86  
87    /**
88     * This method returns true when the document is already loaded in memory.
89     * The transient corpora will always return true as they can only contain
90     * documents that are present in the memory.
91     */
92    public boolean isDocumentLoaded(int index);
93  
94  
95    /**
96     * Removes one of the listeners registered with this corpus.
97     * @param l the listener to be removed.
98     */
99    public void removeCorpusListener(CorpusListener l);
100 
101   /**
102    * Registers a new {@link CorpusListener} with this corpus.
103    * @param l the listener to be added.
104    */
105   public void addCorpusListener(CorpusListener l);
106 
107 } // interface Corpus
108