|
SerialCorpusImpl |
|
/*
 *  SerialCorpusImpl.java
 *
 *  Copyright (c) 1998-2001, The University of Sheffield.
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  Kalina Bontcheva, 19/Oct/2001
 *
 *  $Id: SerialCorpusImpl.java,v 1.20 2001/12/03 14:03:05 kalina Exp $
 */

package gate.corpora;

import java.util.*;

import gate.*;
import gate.util.*;
import gate.annotation.*;
import gate.persist.*;
import java.io.*;
import java.net.*;
import gate.event.*;
import gate.creole.*;
import gate.security.SecurityException;

//The initial design was to implement this on the basis of a WeakValueHashMap.
//However this creates problems, because the user might e.g., add a transient
//document to the corpus and then if the Document variable goes out of scope
//before sync() is called, nothing will be saved of the new document. Bad!
//Instead, to cope with the unloading for memory saving use, I implemented
//a documentUnload() method, which sets the in-memory copy to null but can
//always restore the doc, because it has its persistence ID.

/**
 * A corpus that keeps, for every document, a serialisable
 * {@link DocumentData} record (name + persistent ID) and, in parallel, a
 * transient list holding the documents that are currently loaded in memory.
 * An entry of the in-memory list is <tt>null</tt> while the corresponding
 * document is unloaded; {@link #get(int)} re-creates it on demand from the
 * datastore using the stored persistent ID.
 */
public class SerialCorpusImpl extends
          AbstractLanguageResource
                      implements Corpus, CreoleListener, DatastoreListener {

  /** Debug flag */
  private static final boolean DEBUG = false;

  static final long serialVersionUID = 3632609241787241616L;

  private transient Vector corpusListeners;

  /** The serialised part of the corpus: one DocumentData per document. */
  private java.util.List docDataList = null;

  //here I keep the loaded Documents; the position of a document in this
  //list is the same as the position of its DocumentData in docDataList
  //(which defines the document order). Unloaded documents are null.
  private transient List documents = null;

  public SerialCorpusImpl() {
  }

  /**
   * Constructor to create a SerialCorpus from a transient one.
   * Copies the name, the features, the document names and the documents
   * themselves from the given corpus and registers this corpus as a
   * listener with the creole register.
   *
   * @param tCorpus the transient corpus to copy from
   */
  protected SerialCorpusImpl(Corpus tCorpus){
    initFromTransientCorpus(tCorpus);
  }

  /**
   * Shared initialisation used by the constructor and by
   * {@link #setTransientSource(Object)}: copies name, features, document
   * names and documents from a transient corpus and starts listening to
   * creole events.
   */
  private void initFromTransientCorpus(Corpus tCorpus) {
    //copy the corpus name and features from the one in memory
    this.setName(tCorpus.getName());
    this.setFeatures(tCorpus.getFeatures());

    docDataList = new ArrayList();
    //now cache the names of all docs for future use;
    //none of them has a persistent ID recorded yet
    Iterator iter = tCorpus.getDocumentNames().iterator();
    while (iter.hasNext()) {
      docDataList.add(new DocumentData((String) iter.next(), null));
    }

    //copy all the documents from the transient corpus
    documents = new ArrayList();
    documents.addAll(tCorpus);

    //make sure we fire events when docs are added/removed/etc
    Gate.getCreoleRegister().addCreoleListener(this);
  }

  /** Null-safe equality test (the file predates java.util.Objects). */
  private static boolean nullSafeEquals(Object a, Object b) {
    return a == b || (a != null && a.equals(b));
  }

  /**
   * Tells whether the given document already lives in a datastore which is
   * not the datastore of this corpus. Documents without a datastore
   * (transient ones) never belong to another datastore.
   */
  private boolean belongsToOtherDataStore(Document doc) {
    DataStore docDataStore = doc.getDataStore();
    //compare from the document's side so that a corpus which has no
    //datastore yet does not cause a NullPointerException
    return docDataStore != null && !docDataStore.equals(this.dataStore);
  }

  /**
   * Gets the names of the documents in this corpus.
   * @return a {@link List} of Strings representing the names of the documents
   * in this corpus; an empty list if no document data exists yet.
   */
  public List getDocumentNames(){
    List docsNames = new ArrayList();
    if (docDataList == null) {
      return docsNames;
    }
    Iterator iter = docDataList.iterator();
    while (iter.hasNext()) {
      DocumentData data = (DocumentData) iter.next();
      docsNames.add(data.getDocumentName());
    }
    return docsNames;
  }

  /**
   * This method should only be used by the Serial Datastore to set
   * the persistent ID of the document stored at the given index.
   * Indexes past the end of the corpus are silently ignored.
   *
   * @param index the index of the document
   * @param persID the persistent ID to record for that document
   */
  public void setDocumentPersistentID(int index, Object persID){
    if (index >= docDataList.size()) {
      return;
    }
    ((DocumentData) docDataList.get(index)).setPersistentID(persID);
    if (DEBUG) Out.prln("IDs are now: " + docDataList);
  }

  /**
   * Gets the name of a document in this corpus.
   * @param index the index of the document
   * @return a String value representing the name of the document at
   * <tt>index</tt> in this corpus, or the text "No such document" when the
   * index is past the end of the corpus.
   */
  public String getDocumentName(int index){
    if (index >= docDataList.size()) {
      return "No such document";
    }
    return ((DocumentData) docDataList.get(index)).getDocumentName();
  }

  /**
   * Unloads the document from memory, but calls sync() first, to store the
   * changes. A document that has no persistent ID yet is adopted by the
   * datastore of this corpus before being synced. Does nothing when the
   * document at <tt>index</tt> is not currently loaded.
   *
   * @param index the index of the document to unload
   * @throws GateRuntimeException if adopting or syncing the document fails;
   * in that case the document stays in memory.
   */
  public void unloadDocument(int index) {
    //1. if the document is not in memory there is nothing to sync or release
    if (!isDocumentLoaded(index)) {
      return;
    }

    //2. sync the document before releasing it from memory, because the
    //creole register garbage collects all LRs which are not used any more
    Document doc = (Document) documents.get(index);
    try {
      if (doc.getLRPersistenceId() == null) {
        //the document is not already adopted, so we need to do that first
        doc = (Document) this.getDataStore().adopt(doc, null);
        this.getDataStore().sync(doc);
        this.setDocumentPersistentID(index, doc.getLRPersistenceId());
      } else {
        //it is adopted, just sync it
        this.getDataStore().sync(doc);
      }

      //3. remove the document from the memory
      //do this, only if the saving has succeeded
      documents.set(index, null);

    } catch (PersistenceException ex) {
      throw new GateRuntimeException("Error unloading document from corpus "
                      + "because document sync failed: " + ex.getMessage());
    } catch (gate.security.SecurityException ex1) {
      throw new GateRuntimeException("Error unloading document from corpus "
                + "because of document access error: " + ex1.getMessage());
    }
  }

  /**
   * Unloads a document from memory. Does nothing if the document cannot be
   * found in this corpus.
   *
   * @param doc the document to unload
   */
  public void unloadDocument(Document doc) {
    if (DEBUG) Out.prln("Document to be unloaded :" + doc.getName());
    //1. determine the index of the document; if not there, do nothing
    int index = findDocument(doc);
    if (index == -1) {
      return;
    }
    if (DEBUG) Out.prln("Index of doc: " + index);
    if (DEBUG) Out.prln("Size of corpus: " + documents.size());
    unloadDocument(index);
  }

  /**
   * This method returns true when the document is already loaded in memory
   */
  public boolean isDocumentLoaded(int index) {
    if (documents == null || documents.isEmpty()) {
      return false;
    }
    return documents.get(index) != null;
  }

  /**
   * This method returns true when the document is already stored on disk
   * i.e., is not transient (a persistent ID is recorded for it).
   */
  public boolean isPersistentDocument(int index) {
    if (documents == null || documents.isEmpty()) {
      return false;
    }
    return ((DocumentData) docDataList.get(index)).getPersistentID() != null;
  }

  /**
   * Every LR that is a CreoleListener (and other Listeners too) must
   * override this method and make sure it removes itself from the
   * objects which it has been listening to. Otherwise, the object will
   * not be released from memory (memory leak!).
   */
  public void cleanup() {
    if (DEBUG) Out.prln("serial corpus cleanup called");
    corpusListeners = null;
    if (documents != null) {
      documents.clear();
    }
    //docDataList is still null on an instance that was never initialised
    if (docDataList != null) {
      docDataList.clear();
    }
    Gate.getCreoleRegister().removeCreoleListener(this);
    if (this.dataStore != null) {
      this.dataStore.removeDatastoreListener(this);
    }
  }

  /**
   * Fills this corpus with documents created from files in a directory.
   * The work is delegated to the static method
   * {@link gate.corpora.CorpusImpl#populate(Corpus,URL,FileFilter,String,boolean)}.
   * @param directory the directory from which the files will be picked. This
   * parameter is an URL for uniformity. It needs to be a URL of type file
   * otherwise an InvalidArgumentException will be thrown.
   * @param filter the file filter used to select files from the target
   * directory. If the filter is <tt>null</tt> all the files will be accepted.
   * @param encoding the encoding to be used for reading the documents
   * @param recurseDirectories should the directory be parsed recursively?. If
   * <tt>true</tt> all the files from the provided directory and all its
   * children directories (on as many levels as necessary) will be picked if
   * accepted by the filter otherwise the children directories will be ignored.
   */
  public void populate(URL directory, FileFilter filter, String encoding,
                       boolean recurseDirectories)
              throws IOException, ResourceInstantiationException{
    CorpusImpl.populate(this, directory, filter, encoding, recurseDirectories);
  }

  /**
   * Removes a corpus listener. The listener vector is replaced by a
   * modified copy, so a vector being traversed by a fire method is never
   * changed underneath it.
   */
  public synchronized void removeCorpusListener(CorpusListener l) {
    if (corpusListeners != null && corpusListeners.contains(l)) {
      Vector v = (Vector) corpusListeners.clone();
      v.removeElement(l);
      corpusListeners = v;
    }
  }

  /**
   * Adds a corpus listener, unless it is already registered. The listener
   * vector is replaced by a modified copy (see
   * {@link #removeCorpusListener(CorpusListener)}).
   */
  public synchronized void addCorpusListener(CorpusListener l) {
    Vector v = corpusListeners == null
               ? new Vector(2)
               : (Vector) corpusListeners.clone();
    if (!v.contains(l)) {
      v.addElement(l);
      corpusListeners = v;
    }
  }

  /** Notifies all registered corpus listeners that a document was added. */
  protected void fireDocumentAdded(CorpusEvent e) {
    if (corpusListeners != null) {
      Vector listeners = corpusListeners;
      int count = listeners.size();
      for (int i = 0; i < count; i++) {
        ((CorpusListener) listeners.elementAt(i)).documentAdded(e);
      }
    }
  }

  /** Notifies all registered corpus listeners that a document was removed. */
  protected void fireDocumentRemoved(CorpusEvent e) {
    if (corpusListeners != null) {
      Vector listeners = corpusListeners;
      int count = listeners.size();
      for (int i = 0; i < count; i++) {
        ((CorpusListener) listeners.elementAt(i)).documentRemoved(e);
      }
    }
  }

  public void resourceLoaded(CreoleEvent e) {
  }

  /**
   * When a document of this corpus is unloaded from the system, drop the
   * in-memory reference to it; its entry in the corpus is kept.
   */
  public void resourceUnloaded(CreoleEvent e) {
    Resource res = e.getResource();
    if (res instanceof Document) {
      if (DEBUG)
        Out.prln("resource Unloaded called ");
      //unload the occurrence, but no need to remove it from the corpus too
      int index = indexOf(res);
      if (index < 0) {
        return;
      }
      documents.set(index, null);
      if (DEBUG)
        Out.prln("corpus: document "+ index + " unloaded and set to null");
    }
  }

  public void datastoreOpened(CreoleEvent e) {
  }

  public void datastoreCreated(CreoleEvent e) {
  }

  /**
   * Closes this corpus when the datastore it comes from is closed.
   */
  public void datastoreClosed(CreoleEvent e) {
    if (! e.getDatastore().equals(this.getDataStore())) {
      return;
    }
    if (this.getDataStore() != null) {
      this.getDataStore().removeDatastoreListener(this);
    }
    //close this corpus, since it cannot stay open when the DS it comes from
    //is closed
    Factory.deleteResource(this);
  }

  /**
   * Called by a datastore when a new resource has been adopted
   */
  public void resourceAdopted(DatastoreEvent evt){
  }

  /**
   * Called by a datastore when a resource has been deleted. Every entry of
   * this corpus whose persistent ID equals the deleted resource's ID is
   * removed and, if anything was removed, the corpus is synced.
   *
   * @throws GateRuntimeException if syncing the corpus fails
   */
  public void resourceDeleted(DatastoreEvent evt){
    DataStore ds = (DataStore) evt.getSource();
    //1. check whether this datastore fired the event. If not, return.
    if (!ds.equals(this.dataStore)) {
      return;
    }

    Object docID = evt.getResourceID();
    if (docID == null) {
      return;
    }

    if (DEBUG) Out.prln("Resource deleted called for: " + docID);
    boolean isDirty = false;
    //the problem here is that I only have the doc persistent ID
    //and nothing else, so I need to determine the index of the doc first.
    //Don't stop at the first match, because it might appear more than once.
    int i = 0;
    while (i < docDataList.size()) {
      DocumentData docData = (DocumentData) docDataList.get(i);
      if (docID.equals(docData.getPersistentID())) {
        //after the removal the next entry has moved down to position i,
        //so the index must NOT be advanced here
        remove(i);
        isDirty = true;
      } else {
        i++;
      }
    }

    if (isDirty) {
      try {
        this.dataStore.sync(this);
      } catch (PersistenceException ex) {
        throw new GateRuntimeException("SerialCorpusImpl: " + ex.getMessage());
      } catch (SecurityException sex) {
        throw new GateRuntimeException("SerialCorpusImpl: " + sex.getMessage());
      }
    }
  }//resourceDeleted

  /**
   * Called by a datastore when a resource has been written into the datastore
   */
  public void resourceWritten(DatastoreEvent evt){
  }


  //List methods
  //java docs will be automatically copied from the List interface.

  public int size() {
    return docDataList.size();
  }

  public boolean isEmpty() {
    return docDataList.isEmpty();
  }

  public boolean contains(Object o){
    //true if the object is a document that findDocument() can locate,
    //i.e. it is loaded in this corpus or the document data list contains
    //a document with such a name and persistent id
    if (! (o instanceof Document)) {
      return false;
    }
    return findDocument((Document) o) >= 0;
  }

  /**
   * Returns an iterator over the documents, in corpus order. Documents
   * that are not in memory are loaded by {@link #get(int)} as the iterator
   * reaches them. Removal through the iterator is not supported.
   */
  public Iterator iterator(){
    return new Iterator(){
      //walks the document data so that hasNext() and the checks done by
      //the underlying list iterator stay in charge of the traversal
      Iterator docDataIter = docDataList.iterator();
      //position of the element the next call to next() will return
      int nextIndex = 0;

      public boolean hasNext() {
        return docDataIter.hasNext();
      }

      public Object next(){
        //advance the underlying iterator first: it is the one that
        //signals the end of the list
        docDataIter.next();
        return SerialCorpusImpl.this.get(nextIndex++);
      }

      public void remove() {
        throw new UnsupportedOperationException("SerialCorpusImpl does not " +
                    "support remove in the iterators");
      }
    }; //return

  }//iterator

  public String toString() {
    return "document data " + docDataList.toString() + " documents " + documents;
  }

  public Object[] toArray(){
    //there is a problem here, because some docs might not be instantiated
    throw new MethodNotImplementedException(
                "toArray() is not implemented for SerialCorpusImpl");
  }

  public Object[] toArray(Object[] a){
    //there is a problem here, because some docs might not be instantiated
    throw new MethodNotImplementedException(
                "toArray(Object[] a) is not implemented for SerialCorpusImpl");
  }

  /**
   * Appends a document to the corpus and notifies the corpus listeners.
   *
   * @param o the document to add
   * @return <tt>false</tt> if <tt>o</tt> is not a {@link Document} or if it
   * belongs to a datastore other than the one of this corpus.
   */
  public boolean add(Object o){
    //instanceof is false for null, so this also rejects null
    if (! (o instanceof Document)) {
      return false;
    }
    Document doc = (Document) o;

    //make it accept only docs from its own datastore
    if (belongsToOtherDataStore(doc)) {
      Err.prln("Error: Persistent corpus can only accept documents " +
               "from its own datastore!");
      return false;
    }

    //the document goes to the end of both lists, so its index is the
    //size of the docDataList before the addition
    DocumentData docData = new DocumentData(doc.getName(),
                                            doc.getLRPersistenceId());
    boolean result = docDataList.add(docData);
    documents.add(doc);
    fireDocumentAdded(new CorpusEvent(SerialCorpusImpl.this,
                                      doc,
                                      docDataList.size()-1,
                                      CorpusEvent.DOCUMENT_ADDED));

    return result;
  }

  /**
   * Removes a document from the corpus and notifies the corpus listeners.
   *
   * @return <tt>true</tt> if the document was found and removed.
   */
  public boolean remove(Object o){
    if (DEBUG) Out.prln("SerialCorpus:Remove object called");
    if (! (o instanceof Document)) {
      return false;
    }
    Document doc = (Document) o;

    //see if we can find it first. If not, then just return
    int index = findDocument(doc);
    if (index == -1) {
      return false;
    }

    if (index < docDataList.size()) { //we found it, so remove it
      docDataList.remove(index);
      //oldDoc is null when the document was not loaded in memory
      Document oldDoc = (Document) documents.remove(index);
      if (DEBUG) Out.prln("documents after remove of "
                          + (oldDoc == null ? null : oldDoc.getName())
                          + " are " + documents);
      fireDocumentRemoved(new CorpusEvent(SerialCorpusImpl.this,
                                          oldDoc,
                                          index,
                                          CorpusEvent.DOCUMENT_REMOVED));
    }

    return true;
  }

  /**
   * Finds the index of a document in this corpus.
   * The document is first looked up among the documents loaded in memory;
   * failing that, an entry with the same name and the same (non-null)
   * persistent ID is searched for in the document data.
   *
   * @param doc the document to look for
   * @return the index of the document, or -1 if it is not in the corpus.
   */
  public int findDocument(Document doc) {
    //a null argument must not match the null slots of unloaded documents
    if (doc == null || docDataList == null) {
      return -1;
    }

    //first try finding the document in memory
    if (documents != null) {
      int memIndex = documents.indexOf(doc);
      if (memIndex > -1 && memIndex < docDataList.size()) {
        return memIndex;
      }
    }

    //else try finding a document with the same name and persistent ID.
    //Entries without a persistent ID are transient documents; those can
    //only be identified through the in-memory list checked above.
    String docName = doc.getName();
    Object docID = doc.getLRPersistenceId();
    for (int index = 0; index < docDataList.size(); index++) {
      DocumentData docData = (DocumentData) docDataList.get(index);
      Object storedID = docData.getPersistentID();
      if (storedID != null
          && storedID.equals(docID)
          && nullSafeEquals(docData.getDocumentName(), docName)) {
        return index;
      }
    }
    return -1;
  }//findDocument

  public boolean containsAll(Collection c){
    Iterator iter = c.iterator();
    while (iter.hasNext()) {
      if (! contains(iter.next())) {
        return false;
      }
    }
    return true;
  }

  public boolean addAll(Collection c){
    boolean allAdded = true;
    Iterator iter = c.iterator();
    while (iter.hasNext()) {
      if (! add(iter.next())) {
        allAdded = false;
      }
    }
    return allAdded;
  }

  public boolean addAll(int index, Collection c){
    throw new UnsupportedOperationException();
  }

  public boolean removeAll(Collection c){
    boolean allRemoved = true;
    Iterator iter = c.iterator();
    while (iter.hasNext()) {
      if (! remove(iter.next())) {
        allRemoved = false;
      }
    }
    return allRemoved;
  }

  public boolean retainAll(Collection c){
    throw new UnsupportedOperationException();
  }

  public void clear(){
    documents.clear();
    docDataList.clear();
  }

  /**
   * Two serial corpora are equal when they have equal persistent IDs,
   * names, datastores and document data lists. All comparisons tolerate
   * null values.
   */
  public boolean equals(Object o){
    if (! (o instanceof SerialCorpusImpl)) {
      return false;
    }
    if (o == this) {
      return true;
    }
    SerialCorpusImpl oCorpus = (SerialCorpusImpl) o;
    return nullSafeEquals(this.lrPersistentId, oCorpus.lrPersistentId)
           && nullSafeEquals(oCorpus.name, this.name)
           && nullSafeEquals(oCorpus.dataStore, this.dataStore)
           && nullSafeEquals(oCorpus.docDataList, this.docDataList);
  }

  public int hashCode(){
    //docDataList is null until the corpus has been initialised
    return docDataList == null ? 0 : docDataList.hashCode();
  }

  /**
   * Returns the document at the given index, loading it from the datastore
   * first if it is not in memory.
   *
   * @return the document, or <tt>null</tt> if <tt>index</tt> is past the
   * end of the corpus.
   * @throws GateRuntimeException if the document cannot be re-created from
   * the datastore.
   */
  public Object get(int index){
    if (index >= docDataList.size()) {
      return null;
    }

    Object res = documents.get(index);

    if (DEBUG)
      Out.prln("SerialCorpusImpl: get(): index " + index + "result: " + res);

    //if the document is null, then I must get it from the DS
    if (res == null) {
      FeatureMap features = Factory.newFeatureMap();
      features.put(DataStore.DATASTORE_FEATURE_NAME, this.dataStore);
      try {
        features.put(DataStore.LR_ID_FEATURE_NAME,
                    ((DocumentData) docDataList.get(index)).getPersistentID());
        Resource lr = Factory.createResource( "gate.corpora.DocumentImpl",
                                              features);
        if (DEBUG)
          Out.prln("Loaded document :" + lr.getName());
        //change the result to the newly loaded doc
        res = lr;

        //finally replace the doc with the instantiated version
        documents.set(index, lr);
      } catch (ResourceInstantiationException ex) {
        Err.prln("Error reading document inside a serialised corpus.");
        throw new GateRuntimeException(ex.getMessage());
      }
    }

    return res;
  }

  /** Replacing a document in place is not supported by this corpus. */
  public Object set(int index, Object element){
    throw new gate.util.MethodNotImplementedException();
  }

  /**
   * Inserts a document at the given position and notifies the corpus
   * listeners. The call is ignored if <tt>o</tt> is not a {@link Document}
   * or if the document belongs to a datastore other than the one of this
   * corpus (the same rule as {@link #add(Object)}).
   */
  public void add(int index, Object o){
    //instanceof is false for null, so this also rejects null
    if (! (o instanceof Document)) {
      return;
    }
    Document doc = (Document) o;

    //make it accept only docs from its own datastore
    if (belongsToOtherDataStore(doc)) {
      Err.prln("Error: Persistent corpus can only accept documents " +
               "from its own datastore!");
      return;
    }

    DocumentData docData = new DocumentData(doc.getName(),
                                            doc.getLRPersistenceId());
    docDataList.add(index, docData);

    documents.add(index, doc);
    fireDocumentAdded(new CorpusEvent(SerialCorpusImpl.this,
                                      doc,
                                      index,
                                      CorpusEvent.DOCUMENT_ADDED));
  }

  /**
   * Removes the document at the given index and notifies the corpus
   * listeners.
   *
   * @return the removed document, or <tt>null</tt> if it was not loaded in
   * memory at the time of the removal.
   */
  public Object remove(int index){
    if (DEBUG) Out.prln("Remove index called");
    docDataList.remove(index);
    Document res = (Document) documents.remove(index);
    fireDocumentRemoved(new CorpusEvent(SerialCorpusImpl.this,
                                        res,
                                        index,
                                        CorpusEvent.DOCUMENT_REMOVED));
    return res;
  }

  public int indexOf(Object o){
    if (o instanceof Document) {
      return findDocument((Document) o);
    }
    return -1;
  }

  public int lastIndexOf(Object o){
    throw new gate.util.MethodNotImplementedException();
  }

  public ListIterator listIterator(){
    throw new gate.util.MethodNotImplementedException();
  }

  public ListIterator listIterator(int index){
    throw new gate.util.MethodNotImplementedException();
  }

  /**
   * persistent Corpus does not support this method as all
   * the documents might not be in memory
   */
  public List subList(int fromIndex, int toIndex){
    throw new gate.util.MethodNotImplementedException();
  }

  /**
   * Sets the datastore of this corpus and registers the corpus as a
   * listener of that datastore.
   */
  public void setDataStore(DataStore dataStore)
                throws gate.persist.PersistenceException {
    super.setDataStore( dataStore);
    if (this.dataStore != null) {
      this.dataStore.addDatastoreListener(this);
    }
  }

  /**
   * Initialises this corpus from a transient one. The call is ignored when
   * <tt>source</tt> is not a {@link Corpus}, or when this corpus already
   * has both a datastore and a persistent ID.
   */
  public void setTransientSource(Object source) {
    if (! (source instanceof Corpus)) {
      return;
    }

    //the following initialisation is only valid when we're constructing
    //this object from a transient one. If it has already been stored in
    //a datastore, then the initialisation is done in readObject() since
    //this method is the one called by serialisation, when objects
    //are restored.
    if (this.dataStore != null && this.lrPersistentId != null) {
      return;
    }

    initFromTransientCorpus((Corpus) source);
  }

  //we don't keep the transient source, so always return null
  //Still this must be implemented, coz of the GUI and Factory
  public Object getTransientSource() {
    return null;
  }


  public Resource init() throws gate.creole.ResourceInstantiationException {
    super.init();

    return this;
  }


  /**
   * readObject - calls the default readObject() and then initialises the
   * transient data: every document starts unloaded (null), the listener
   * vector is empty and this corpus listens to the creole register and,
   * if present, to its datastore.
   *
   * @serialData Read serializable fields. No optional data read.
   */
  private void readObject(ObjectInputStream s)
      throws IOException, ClassNotFoundException {
    s.defaultReadObject();
    documents = new ArrayList(docDataList.size());
    for (int i = 0; i < docDataList.size(); i++) {
      documents.add(null);
    }
    corpusListeners = new Vector();
    //finally set the creole listeners if the LR is like that
    Gate.getCreoleRegister().addCreoleListener(this);
    if (this.dataStore != null) {
      this.dataStore.addDatastoreListener(this);
    }
  }//readObject

  /**
   * The serialisable record kept for every document of the corpus: the
   * document name and its persistent ID (null while the document has not
   * been stored in the datastore).
   */
  //NOTE(review): left as a non-static inner class on purpose; making it
  //static would alter its serialised fields - confirm before changing.
  protected class DocumentData implements Serializable {
    //fix the ID for serialisation
    static final long serialVersionUID = 4192762901421847525L;

    DocumentData(String name, Object ID){
      docName = name;
      persistentID = ID;
    }

    public String getDocumentName() {
      return docName;
    }

    public Object getPersistentID() {
      return persistentID;
    }

    public void setPersistentID(Object newID) {
      persistentID = newID;
    }

    public String toString() {
      return "DocumentData: " + docName + ", " + persistentID;
    }

    String docName;
    Object persistentID;
  }

}
|
SerialCorpusImpl |
|