|
CorpusImpl |
|
1 /* 2 * CorpusImpl.java 3 * 4 * Copyright (c) 1998-2001, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Hamish Cunningham, 11/Feb/2000 12 * 13 * $Id: CorpusImpl.java,v 1.47 2001/11/29 15:15:00 valyt Exp $ 14 */ 15 16 package gate.corpora; 17 18 import java.util.*; 19 20 import gate.*; 21 import gate.util.*; 22 import gate.annotation.*; 23 import gate.persist.*; 24 import java.io.*; 25 import java.net.*; 26 import gate.event.*; 27 import gate.creole.*; 28 29 /** Corpora are sets of Document. They are ordered by lexicographic collation 30 * on Url. 31 */ 32 public class CorpusImpl extends AbstractLanguageResource 33 implements Corpus, CreoleListener { 34 35 /** Debug flag */ 36 private static final boolean DEBUG = false; 37 38 public CorpusImpl(){ 39 supportList = Collections.synchronizedList(new VerboseList()); 40 Gate.getCreoleRegister().addCreoleListener(this); 41 } 42 43 44 /** 45 * Gets the names of the documents in this corpus. 46 * @return a {@link List} of Strings representing the names of the documents 47 * in this corpus. 48 */ 49 public List getDocumentNames(){ 50 ArrayList res = new ArrayList(supportList.size()); 51 Iterator docIter = supportList.iterator(); 52 while(docIter.hasNext()){ 53 res.add(((Document)docIter.next()).getName()); 54 } 55 return res; 56 } 57 58 /** 59 * Gets the name of a document in this corpus. 60 * @param index the index of the document 61 * @return a String value representing the name of the document at 62 * <tt>index</tt> in this corpus. 63 */ 64 public String getDocumentName(int index){ 65 return ((Document)supportList.get(index)).getName(); 66 } 67 68 /** 69 * This method does not make sense for transient corpora, so it does 70 * nothing. 71 */ 72 public void unloadDocument(Document doc) { 73 return; 74 } 75 76 77 /** 78 * The underlying list that holds the documents in this corpus. 79 */ 80 protected List supportList = null; 81 82 /** 83 * A proxy list that stores the actual data in an internal list and forwards 84 * all operations to that one but it also fires the appropiate corpus events 85 * when necessary. 86 * It also does some type checking so only Documents are accepted as corpus 87 * members. 88 */ 89 protected class VerboseList extends AbstractList implements Serializable{ 90 91 92 93 VerboseList(){ 94 data = new ArrayList(); 95 } 96 97 public Object get(int index){ 98 return data.get(index); 99 } 100 101 public int size(){ 102 return data.size(); 103 } 104 105 public Object set(int index, Object element){ 106 if(element instanceof Document){ 107 Document oldDoc = (Document)data.set(index, element); 108 Document newDoc = (Document)element; 109 110 //fire the 2 events 111 fireDocumentRemoved(new CorpusEvent(CorpusImpl.this, 112 oldDoc, 113 index, 114 CorpusEvent.DOCUMENT_REMOVED)); 115 fireDocumentAdded(new CorpusEvent(CorpusImpl.this, 116 newDoc, 117 index, 118 CorpusEvent.DOCUMENT_ADDED)); 119 return oldDoc; 120 }else{ 121 throw new UnsupportedOperationException( 122 getClass().getName() + 123 " only accepts gate.Document values as members!\n" + 124 element.getClass().getName() + " is not a gate.Document"); 125 } 126 } 127 128 public void add(int index, Object element){ 129 if(element instanceof Document){ 130 data.add(index, element); 131 132 //fire the event 133 fireDocumentAdded(new CorpusEvent(CorpusImpl.this, 134 (Document)element, 135 index, 136 CorpusEvent.DOCUMENT_ADDED)); 137 }else{ 138 throw new UnsupportedOperationException( 139 getClass().getName() + 140 " only accepts gate.Document values as members!\n" + 141 element.getClass().getName() + " is not a gate.Document"); 142 } 143 } 144 145 public Object remove(int index){ 146 Document oldDoc = (Document)data.remove(index); 147 148 fireDocumentRemoved(new CorpusEvent(CorpusImpl.this, 149 oldDoc, 150 index, 151 CorpusEvent.DOCUMENT_REMOVED)); 152 return oldDoc; 153 } 154 155 /** 156 * The List containing the actual data. 157 */ 158 ArrayList data; 159 } 160 161 /** 162 * This method returns true when the document is already loaded in memory 163 */ 164 public boolean isDocumentLoaded(int index) { 165 return true; 166 } 167 168 169 protected void clearDocList() { 170 if (supportList == null) 171 return; 172 supportList.clear(); 173 } 174 175 176 //List methods 177 //java docs will be automatically copied from the List interface. 178 179 public int size() { 180 return supportList.size(); 181 } 182 183 public boolean isEmpty() { 184 return supportList.isEmpty(); 185 } 186 187 public boolean contains(Object o){ 188 return supportList.contains(o); 189 } 190 191 public Iterator iterator(){ 192 return supportList.iterator(); 193 } 194 195 public Object[] toArray(){ 196 return supportList.toArray(); 197 } 198 199 public Object[] toArray(Object[] a){ 200 return supportList.toArray(a); 201 } 202 203 public boolean add(Object o){ 204 return supportList.add(o); 205 } 206 207 public boolean remove(Object o){ 208 return supportList.remove(o); 209 } 210 211 public boolean containsAll(Collection c){ 212 return supportList.containsAll(c); 213 } 214 215 public boolean addAll(Collection c){ 216 return supportList.addAll(c); 217 } 218 219 public boolean addAll(int index, Collection c){ 220 return supportList.addAll(index, c); 221 } 222 223 public boolean removeAll(Collection c){ 224 return supportList.removeAll(c); 225 } 226 227 public boolean retainAll(Collection c){ 228 return supportList.retainAll(c); 229 } 230 231 public void clear(){ 232 supportList.clear(); 233 } 234 235 public boolean equals(Object o){ 236 if (! (o instanceof CorpusImpl)) 237 return false; 238 239 return supportList.equals(o); 240 } 241 242 public int hashCode(){ 243 return supportList.hashCode(); 244 } 245 246 public Object get(int index){ 247 return supportList.get(index); 248 } 249 250 public Object set(int index, Object element){ 251 return supportList.set(index, element); 252 } 253 254 public void add(int index, Object element){ 255 supportList.add(index, element); 256 } 257 258 public Object remove(int index){ 259 return supportList.remove(index); 260 } 261 262 public int indexOf(Object o){ 263 return supportList.indexOf(o); 264 } 265 266 public int lastIndexOf(Object o){ 267 return lastIndexOf(o); 268 } 269 270 public ListIterator listIterator(){ 271 return supportList.listIterator(); 272 } 273 274 public ListIterator listIterator(int index){ 275 return supportList.listIterator(index); 276 } 277 278 public List subList(int fromIndex, int toIndex){ 279 return supportList.subList(fromIndex, toIndex); 280 } 281 282 283 /** Construction */ 284 285 public void cleanup(){ 286 } 287 288 /** Initialise this resource, and return it. */ 289 public Resource init() { 290 if(documentsList != null && !documentsList.isEmpty()){ 291 addAll(documentsList); 292 } 293 return this; 294 } // init() 295 296 297 /** 298 * Fills the provided corpus with documents created on the fly from selected 299 * files in a directory. Uses a link {@FileFilter} to select which files will 300 * be used and which will be ignored. 301 * A simple file filter based on extensions is provided in the Gate 302 * distribution ({@link gate.util.ExtensionFileFilter}). 303 * @param corpus the corpus to be populated 304 * @param directory the directory from which the files will be picked. This 305 * parameter is an URL for uniformity. It needs to be a URL of type file 306 * otherwise an InvalidArgumentException will be thrown. 307 * @param filter the file filter used to select files from the target 308 * directory. If the filter is <tt>null</tt> all the files will be accepted. 309 * @param encoding the encoding to be used for reading the documents 310 * @param recurseDirectories should the directory be parsed recursively?. If 311 * <tt>true</tt> all the files from the provided directory and all its 312 * children directories (on as many levels as necessary) will be picked if 313 * accepted by the filter otherwise the children directories will be ignored. 314 */ 315 public static void populate(Corpus corpus, URL directory, FileFilter filter, 316 String encoding, boolean recurseDirectories) 317 throws IOException, ResourceInstantiationException{ 318 //check input 319 if(!directory.getProtocol().equalsIgnoreCase("file")) 320 throw new IllegalArgumentException( 321 "The URL provided is not of type \"file:\"!"); 322 323 File dir = new File(directory.getPath()); 324 if(!dir.exists()) 325 throw new FileNotFoundException(dir.toString()); 326 327 if(!dir.isDirectory()) 328 throw new IllegalArgumentException( 329 dir.getAbsolutePath() + " is not a directory!"); 330 331 //populate the corpus 332 File[] files = dir.listFiles(filter); 333 if(files != null){ 334 for(int i = 0; i < files.length; i++){ 335 File aFile = files[i]; 336 if(aFile.isDirectory()){ 337 //recurse dir if required 338 if(recurseDirectories){ 339 populate(corpus, aFile.toURL(), filter, 340 encoding, recurseDirectories); 341 } 342 }else{ 343 //create the doc 344 StatusListener sListener = (StatusListener) 345 gate.gui.MainFrame.getListeners(). 346 get("gate.event.StatusListener"); 347 if(sListener != null) sListener.statusChanged( 348 "Reading: " + aFile.getName()); 349 String docName = aFile.getName() + "_" + Gate.genSym(); 350 FeatureMap params = Factory.newFeatureMap(); 351 params.put("sourceUrl", aFile.toURL()); 352 if(encoding != null) params.put("encoding", encoding); 353 354 corpus.add(Factory.createResource(DocumentImpl.class.getName(), 355 params, null, docName)); 356 if(sListener != null) sListener.statusChanged( 357 aFile.getName() + " read"); 358 } 359 } 360 } 361 }//public static void populate 362 363 /** 364 * Fills this corpus with documents created from files in a directory. 365 * @param filter the file filter used to select files from the target 366 * directory. If the filter is <tt>null</tt> all the files will be accepted. 367 * @param directory the directory from which the files will be picked. This 368 * parameter is an URL for uniformity. It needs to be a URL of type file 369 * otherwise an InvalidArgumentException will be thrown. 370 * An implementation for this method is provided as a static method at 371 * {@link gate.corpora.CorpusImpl#populate(Corpus,URL,FileFilter,boolean)}. 372 * @param encoding the encoding to be used for reading the documents 373 * @param recurseDirectories should the directory be parsed recursively?. If 374 * <tt>true</tt> all the files from the provided directory and all its 375 * children directories (on as many levels as necessary) will be picked if 376 * accepted by the filter otherwise the children directories will be ignored. 377 */ 378 public void populate(URL directory, FileFilter filter, String encoding, 379 boolean recurseDirectories) 380 throws IOException, ResourceInstantiationException{ 381 populate(this, directory, filter, encoding, recurseDirectories); 382 } 383 384 public synchronized void removeCorpusListener(CorpusListener l) { 385 if (corpusListeners != null && corpusListeners.contains(l)) { 386 Vector v = (Vector) corpusListeners.clone(); 387 v.removeElement(l); 388 corpusListeners = v; 389 } 390 } 391 public synchronized void addCorpusListener(CorpusListener l) { 392 Vector v = corpusListeners == null ? new Vector(2) : (Vector) corpusListeners.clone(); 393 if (!v.contains(l)) { 394 v.addElement(l); 395 corpusListeners = v; 396 } 397 } 398 399 /** Freeze the serialization UID. */ 400 static final long serialVersionUID = -1113142759053898456L; 401 private transient Vector corpusListeners; 402 protected transient java.util.List documentsList; 403 404 405 protected void fireDocumentAdded(CorpusEvent e) { 406 if (corpusListeners != null) { 407 Vector listeners = corpusListeners; 408 int count = listeners.size(); 409 for (int i = 0; i < count; i++) { 410 ((CorpusListener) listeners.elementAt(i)).documentAdded(e); 411 } 412 } 413 } 414 protected void fireDocumentRemoved(CorpusEvent e) { 415 if (corpusListeners != null) { 416 Vector listeners = corpusListeners; 417 int count = listeners.size(); 418 for (int i = 0; i < count; i++) { 419 ((CorpusListener) listeners.elementAt(i)).documentRemoved(e); 420 } 421 } 422 } 423 public void setDocumentsList(java.util.List documentsList) { 424 this.documentsList = documentsList; 425 } 426 public java.util.List getDocumentsList() { 427 return documentsList; 428 } 429 public void resourceLoaded(CreoleEvent e) { 430 } 431 public void resourceUnloaded(CreoleEvent e) { 432 Resource res = e.getResource(); 433 //remove all occurences 434 if(res instanceof Document) while(contains(res)) remove(res); 435 } 436 public void datastoreOpened(CreoleEvent e) { 437 } 438 public void datastoreCreated(CreoleEvent e) { 439 } 440 public void datastoreClosed(CreoleEvent e) { 441 } 442 } // class CorpusImpl 443
|
CorpusImpl |
|