|
Corpus |
|
/*
 *  Corpus.java
 *
 *  Copyright (c) 1998-2001, The University of Sheffield.
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  Hamish Cunningham, 19/Jan/2000
 *
 *  $Id: Corpus.java,v 1.18 2001/11/29 15:14:59 valyt Exp $
 */

package gate;
import java.util.*;
import java.net.URL;
import java.io.FileFilter;
import java.io.IOException;

import gate.util.*;
import gate.event.*;
import gate.creole.ResourceInstantiationException;

/**
 * Corpora are lists of Document. TIPSTER equivalent: Collection.
 * <P>
 * Besides the {@link List} operations it inherits, a corpus exposes the names
 * of its documents, lets documents be unloaded from memory, can be populated
 * from a directory of files, and notifies registered {@link CorpusListener}s.
 */
public interface Corpus extends LanguageResource, List, NameBearer {

  /**
   * Gets the names of the documents in this corpus.
   * @return a {@link List} of Strings representing the names of the documents
   * in this corpus.
   */
  public List getDocumentNames();

  /**
   * Gets the name of a document in this corpus.
   * @param index the index of the document
   * @return a String value representing the name of the document at
   * <tt>index</tt> in this corpus.
   */
  public String getDocumentName(int index);

  /**
   * Unloads the document from memory. Only needed if memory
   * preservation is an issue. Only supported for Corpus which is
   * stored in a Datastore. To get this document back in memory,
   * use get() on Corpus or if you have its persistent ID, request it
   * from the Factory.
   * <P>
   * Transient Corpus objects do nothing,
   * because there would be no way to get the document back
   * again afterwards.
   * @param doc the document to be unloaded from memory.
   */
  public void unloadDocument(Document doc);

  /**
   * Fills this corpus with documents created on the fly from selected files in
   * a directory. Uses a {@link FileFilter} to select which files will be used
   * and which will be ignored.
   * A simple file filter based on extensions is provided in the Gate
   * distribution ({@link gate.util.ExtensionFileFilter}).
   * <P>
   * An implementation for this method is provided as a static method at
   * {@link gate.corpora.CorpusImpl#populate(Corpus,URL,FileFilter,boolean)}.
   * @param directory the directory from which the files will be picked. This
   * parameter is an URL for uniformity. It needs to be a URL of type file
   * otherwise an InvalidArgumentException will be thrown.
   * @param filter the file filter used to select files from the target
   * directory. If the filter is <tt>null</tt> all the files will be accepted.
   * @param encoding the encoding to be used for reading the documents
   * @param recurseDirectories should the directory be parsed recursively? If
   * <tt>true</tt> all the files from the provided directory and all its
   * children directories (on as many levels as necessary) will be picked if
   * accepted by the filter otherwise the children directories will be ignored.
   * @throws IOException presumably if the directory or one of the selected
   * files cannot be read; the exact conditions are implementation-defined.
   * @throws ResourceInstantiationException presumably if a document cannot be
   * created from one of the selected files; the exact conditions are
   * implementation-defined.
   */
  public void populate(URL directory, FileFilter filter,
                       String encoding, boolean recurseDirectories)
                       throws IOException, ResourceInstantiationException;


  /**
   * This method returns true when the document is already loaded in memory.
   * The transient corpora will always return true as they can only contain
   * documents that are present in the memory.
   * @param index the index of the document in this corpus.
   * @return <tt>true</tt> if the document at <tt>index</tt> is currently
   * loaded in memory.
   */
  public boolean isDocumentLoaded(int index);


  /**
   * Removes one of the listeners registered with this corpus.
   * @param l the listener to be removed.
   */
  public void removeCorpusListener(CorpusListener l);

  /**
   * Registers a new {@link CorpusListener} with this corpus.
   * @param l the listener to be added.
   */
  public void addCorpusListener(CorpusListener l);

} // interface Corpus
|
Corpus |
|