1   /*
2    *  SerialCorpusImpl.java
3    *
4    *  Copyright (c) 1998-2001, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Kalina Bontcheva, 19/Oct/2001
12   *
13   *  $Id: SerialCorpusImpl.java,v 1.20 2001/12/03 14:03:05 kalina Exp $
14   */
15  
16  package gate.corpora;
17  
18  import java.util.*;
19  
20  import gate.*;
21  import gate.util.*;
22  import gate.annotation.*;
23  import gate.persist.*;
24  import java.io.*;
25  import java.net.*;
26  import gate.event.*;
27  import gate.creole.*;
28  import gate.security.SecurityException;
29  
//The initial design was to implement this on the basis of a WeakValueHashMap.
//However, this creates problems: the user might, e.g., add a transient
//document to the corpus and, if the Document variable then goes out of scope
//before sync() is called, nothing of the new document will be saved. Bad!
//Instead, to allow documents to be unloaded in order to save memory, I
//implemented an unloadDocument() method, which sets the in-memory copy to
//null but can always restore the doc, because it has its persistence ID.
37  
38  public class SerialCorpusImpl extends
39            AbstractLanguageResource
40                        implements Corpus, CreoleListener, DatastoreListener {
41  
  /** Debug flag; when true, diagnostic messages are printed via Out.prln. */
  private static final boolean DEBUG = false;

  /** Fixed serialisation version ID of this class. */
  static final long serialVersionUID = 3632609241787241616L;

  /** The registered corpus listeners; transient, so not serialised. */
  private transient Vector corpusListeners;

  /**
   * One DocumentData entry (document name plus persistent ID) per document,
   * in corpus order. This list is not transient.
   */
  private java.util.List docDataList = null;

  //the in-memory documents, kept in parallel with docDataList: the element at
  //a given index is the Document for the docDataList entry at the same index,
  //or null when that document is not currently loaded
  private transient List documents = null;
53  
  /**
   * Creates an instance without initialising any state: this constructor
   * leaves both docDataList and documents null.
   */
  public SerialCorpusImpl() {
  }
56  
57    /**
58     * Constructor to create a SerialCorpus from a transient one.
59     * This is called by adopt() to store the transient corpus
60     * and re-route the methods calls to it, until the corpus is
61     * sync-ed on disk. After that, the transientCorpus will always
62     * be null, so the new functionality will be used instead.
63     */
64    protected SerialCorpusImpl(Corpus tCorpus){
65      //copy the corpus name and features from the one in memory
66      this.setName(tCorpus.getName());
67      this.setFeatures(tCorpus.getFeatures());
68  
69      docDataList = new ArrayList();
70      //now cache the names of all docs for future use
71      Iterator iter = tCorpus.getDocumentNames().iterator();
72      while (iter.hasNext())
73        docDataList.add(new DocumentData((String) iter.next(), null));
74  
75      //copy all the documents from the transient corpus
76      documents = new ArrayList();
77      documents.addAll(tCorpus);
78  
79      //make sure we fire events when docs are added/removed/etc
80      Gate.getCreoleRegister().addCreoleListener(this);
81    }
82  
83    /**
84     * Gets the names of the documents in this corpus.
85     * @return a {@link List} of Strings representing the names of the documents
86     * in this corpus.
87     */
88    public List getDocumentNames(){
89      List docsNames = new ArrayList();
90      if(docDataList == null)
91        return docsNames;
92      Iterator iter = docDataList.iterator();
93      while (iter.hasNext()) {
94        DocumentData data = (DocumentData) iter.next();
95        docsNames.add(data.getDocumentName());
96      }
97      return docsNames;
98    }
99  
100   /**
101    * This method should only be used by the Serial Datastore to set
102    */
103   public void setDocumentPersistentID(int index, Object persID){
104     if (index >= docDataList.size()) return;
105     ((DocumentData)docDataList.get(index)).setPersistentID(persID);
106     if (DEBUG) Out.prln("IDs are now: " + docDataList);
107   }
108 
109   /**
110    * Gets the name of a document in this corpus.
111    * @param index the index of the document
112    * @return a String value representing the name of the document at
113    * <tt>index</tt> in this corpus.<P>
114    */
115   public String getDocumentName(int index){
116     if (index >= docDataList.size()) return "No such document";
117 
118     return ((DocumentData) docDataList.get(index)).getDocumentName();
119   }
120 
121   /**
122    * Unloads the document from memory, but calls sync() first, to store the
123    * changes
124    */
125   public void unloadDocument(int index) {
126     //1. check whether its been loaded and is a persistent one
127     // if a persistent doc is not loaded, there's nothing we need to do
128     if ( (! isDocumentLoaded(index)) && isPersistentDocument(index))
129       return;
130 
131     //2. sync the document before releasing it from memory, because the
132     //creole register garbage collects all LRs which are not used any more
133     Document doc = (Document) documents.get(index);
134     try {
135       //if the document is not already adopted, we need to do that first
136       if (doc.getLRPersistenceId() == null) {
137         doc = (Document) this.getDataStore().adopt(doc, null);
138         this.getDataStore().sync(doc);
139         this.setDocumentPersistentID(index, doc.getLRPersistenceId());
140       } else //if it is adopted, just sync it
141         this.getDataStore().sync(doc);
142 
143       //3. remove the document from the memory
144       //do this, only if the saving has succeeded
145       documents.set(index, null);
146 
147     } catch (PersistenceException ex) {
148         throw new GateRuntimeException("Error unloading document from corpus"
149                       + "because document sync failed: " + ex.getMessage());
150     } catch (gate.security.SecurityException ex1) {
151         throw new GateRuntimeException("Error unloading document from corpus"
152                       + "because of document access error: " + ex1.getMessage());
153     }
154 
155   }
156 
157   /**
158    * Unloads a document from memory
159    */
160   public void unloadDocument(Document doc) {
161     if (DEBUG) Out.prln("Document to be unloaded :" + doc.getName());
162     //1. determine the index of the document; if not there, do nothing
163     int index = findDocument(doc);
164     if (index == -1)
165       return;
166     if (DEBUG) Out.prln("Index of doc: " + index);
167     if (DEBUG) Out.prln("Size of corpus: " + documents.size());
168     unloadDocument(index);
169 //    documents.remove(new Integer(index));
170   }
171 
172   /**
173    * This method returns true when the document is already loaded in memory
174    */
175   public boolean isDocumentLoaded(int index) {
176     if (documents == null || documents.isEmpty()) return false;
177     return documents.get(index) != null;
178   }
179 
180   /**
181    * This method returns true when the document is already stored on disk
182    * i.e., is not transient
183    */
184   public boolean isPersistentDocument(int index) {
185     if (documents == null || documents.isEmpty()) return false;
186     return (((DocumentData)docDataList.get(index)).getPersistentID() != null);
187   }
188 
189   /**
190    * Every LR that is a CreoleListener (and other Listeners too) must
191    * override this method and make sure it removes itself from the
192    * objects which it has been listening to. Otherwise, the object will
193    * not be released from memory (memory leak!).
194    */
195   public void cleanup() {
196     if (DEBUG) Out.prln("serial corpus cleanup called");
197     if (corpusListeners != null)
198       corpusListeners = null;
199     if (documents != null)
200       documents.clear();
201     docDataList.clear();
202     Gate.getCreoleRegister().removeCreoleListener(this);
203     if (this.dataStore != null) {
204       this.dataStore.removeDatastoreListener(this);
205     }
206   }
207 
  /**
   * Fills this corpus with documents created from files in a directory.
   * The work is delegated to the static <tt>populate</tt> method of
   * {@link gate.corpora.CorpusImpl}, which is called with this corpus as
   * the target.
   * @param directory the directory from which the files will be picked. This
   * parameter is an URL for uniformity. It needs to be a URL of type file
   * otherwise an InvalidArgumentException will be thrown.
   * @param filter the file filter used to select files from the target
   * directory. If the filter is <tt>null</tt> all the files will be accepted.
   * @param encoding the encoding to be used for reading the documents
   * @param recurseDirectories should the directory be parsed recursively?. If
   * <tt>true</tt> all the files from the provided directory and all its
   * children directories (on as many levels as necessary) will be picked if
   * accepted by the filter otherwise the children directories will be ignored.
   * @throws IOException declared by this method; presumably raised when the
   * files cannot be read (not visible in this file)
   * @throws ResourceInstantiationException declared by this method;
   * presumably raised when a document cannot be created (not visible in
   * this file)
   */
  public void populate(URL directory, FileFilter filter, String encoding,
                       boolean recurseDirectories)
              throws IOException, ResourceInstantiationException{
    CorpusImpl.populate(this, directory, filter, encoding, recurseDirectories);
  }
228 
229 
230   public synchronized void removeCorpusListener(CorpusListener l) {
231     if (corpusListeners != null && corpusListeners.contains(l)) {
232       Vector v = (Vector) corpusListeners.clone();
233       v.removeElement(l);
234       corpusListeners = v;
235     }
236   }
237   public synchronized void addCorpusListener(CorpusListener l) {
238     Vector v = corpusListeners == null ? new Vector(2) : (Vector) corpusListeners.clone();
239     if (!v.contains(l)) {
240       v.addElement(l);
241       corpusListeners = v;
242     }
243   }
244   protected void fireDocumentAdded(CorpusEvent e) {
245     if (corpusListeners != null) {
246       Vector listeners = corpusListeners;
247       int count = listeners.size();
248       for (int i = 0; i < count; i++) {
249         ((CorpusListener) listeners.elementAt(i)).documentAdded(e);
250       }
251     }
252   }
253   protected void fireDocumentRemoved(CorpusEvent e) {
254     if (corpusListeners != null) {
255       Vector listeners = corpusListeners;
256       int count = listeners.size();
257       for (int i = 0; i < count; i++) {
258         ((CorpusListener) listeners.elementAt(i)).documentRemoved(e);
259       }
260     }
261   }
  /**
   * Does nothing: this corpus takes no action on the resourceLoaded
   * notification.
   */
  public void resourceLoaded(CreoleEvent e) {
  }
264   public void resourceUnloaded(CreoleEvent e) {
265     Resource res = e.getResource();
266     if (res instanceof Document) {
267       if (DEBUG)
268         Out.prln("resource Unloaded called ");
269       //unload all occurences, but no need to remove them from the corpus too
270       int index = indexOf(res);
271       if (index < 0)
272         return;
273       documents.set(index, null);
274       if (DEBUG)
275         Out.prln("corpus: document "+ index + " unloaded and set to null");
276     }
277   }
  /**
   * Does nothing: this corpus takes no action on the datastoreOpened
   * notification.
   */
  public void datastoreOpened(CreoleEvent e) {
  }
  /**
   * Does nothing: this corpus takes no action on the datastoreCreated
   * notification.
   */
  public void datastoreCreated(CreoleEvent e) {
  }
282   public void datastoreClosed(CreoleEvent e) {
283     if (! e.getDatastore().equals(this.getDataStore()))
284       return;
285     if (this.getDataStore() != null)
286       this.getDataStore().removeDatastoreListener(this);
287     //close this corpus, since it cannot stay open when the DS it comes from
288     //is closed
289     Factory.deleteResource(this);
290   }
  /**
   * Called by a datastore when a new resource has been adopted.
   * This corpus takes no action.
   */
  public void resourceAdopted(DatastoreEvent evt){
  }
296 
297   /**
298    * Called by a datastore when a resource has been deleted
299    */
300   public void resourceDeleted(DatastoreEvent evt){
301     DataStore ds = (DataStore)evt.getSource();
302     //1. check whether this datastore fired the event. If not, return.
303     if (!ds.equals(this.dataStore))
304       return;
305 
306     Object docID = evt.getResourceID();
307     if (docID == null)
308       return;
309 
310     if (DEBUG) Out.prln("Resource deleted called for: " + docID);
311     boolean isDirty=false;
312     //the problem here is that I only have the doc persistent ID
313     //and nothing else, so I need to determine the index of the doc first
314     for (int i=0; i< docDataList.size(); i++) {
315       DocumentData docData = (DocumentData)docDataList.get(i);
316       //we've found the correct document
317       //don't break the loop, because it might appear more than once
318       if (docID.equals(docData.getPersistentID())) {
319         remove(i);
320         isDirty = true;
321       }//if
322     }//for loop through the doc data
323 
324     if (isDirty)
325       try {
326         this.dataStore.sync(this);
327       } catch (PersistenceException ex) {
328         throw new GateRuntimeException("SerialCorpusImpl: " + ex.getMessage());
329       } catch (SecurityException sex) {
330         throw new GateRuntimeException("SerialCorpusImpl: " + sex.getMessage());
331       }
332   }//resourceDeleted
333 
  /**
   * Called by a datastore when a resource has been written into the
   * datastore. This corpus takes no action.
   */
  public void resourceWritten(DatastoreEvent evt){
  }
339 
340 
341 
342   //List methods
343   //java docs will be automatically copied from the List interface.
344 
345   public int size() {
346     return docDataList.size();
347   }
348 
349   public boolean isEmpty() {
350     return docDataList.isEmpty();
351   }
352 
353   public boolean contains(Object o){
354     //return true if:
355     // - the document data list contains a document with such a name
356     //   and persistent id
357 
358     if(! (o instanceof Document))
359       return false;
360 
361     int index = findDocument((Document) o);
362     if (index < 0)
363       return false;
364     else
365       return true;
366   }
367 
368   public Iterator iterator(){
369     return new Iterator(){
370       Iterator docDataIter = docDataList.iterator();
371 
372       public boolean hasNext() {
373         return docDataIter.hasNext();
374       }
375 
376       public Object next(){
377 
378         //try finding a document with the same name and persistent ID
379         DocumentData docData = (DocumentData) docDataIter.next();
380         int index = docDataList.indexOf(docData);
381         return SerialCorpusImpl.this.get(index);
382       }
383 
384       public void remove() {
385         throw new UnsupportedOperationException("SerialCorpusImpl does not " +
386                     "support remove in the iterators");
387       }
388     }; //return
389 
390   }//iterator
391 
392   public String toString() {
393     return "document data " + docDataList.toString() + " documents " + documents;
394   }
395 
  /**
   * Not supported: always throws MethodNotImplementedException.
   */
  public Object[] toArray(){
    //there is a problem here, because some docs might not be instantiated
    throw new MethodNotImplementedException(
                "toArray() is not implemented for SerialCorpusImpl");
  }
401 
  /**
   * Not supported: always throws MethodNotImplementedException.
   */
  public Object[] toArray(Object[] a){
    //there is a problem here, because some docs might not be instantiated
    throw new MethodNotImplementedException(
                "toArray(Object[] a) is not implemented for SerialCorpusImpl");
  }
407 
408   public boolean add(Object o){
409     if (! (o instanceof Document) || o == null)
410       return false;
411     Document doc = (Document) o;
412 
413     //make it accept only docs from its own datastore
414     if (doc.getDataStore() != null
415         && !this.dataStore.equals(doc.getDataStore())) {
416       Err.prln("Error: Persistent corpus can only accept documents " +
417                "from its own datastore!");
418       return false;
419     }//if
420 
421     //add the document with its index in the docDataList
422     //in this case, since it's going to be added to the end
423     //the index will be the size of the docDataList before
424     //the addition
425     DocumentData docData = new DocumentData(doc.getName(),
426                                             doc.getLRPersistenceId());
427     boolean result = docDataList.add(docData);
428     documents.add(doc);
429     fireDocumentAdded(new CorpusEvent(SerialCorpusImpl.this,
430                                       doc,
431                                       docDataList.size()-1,
432                                       CorpusEvent.DOCUMENT_ADDED));
433 
434     return result;
435   }
436 
437   public boolean remove(Object o){
438     if (DEBUG) Out.prln("SerialCorpus:Remove object called");
439     if (! (o instanceof Document))
440       return false;
441     Document doc = (Document) o;
442 
443     //see if we can find it first. If not, then judt return
444     int index = findDocument(doc);
445     if (index == -1)
446       return false;
447 
448     if(index < docDataList.size()) { //we found it, so remove it
449       docDataList.remove(index);
450       Document oldDoc =  (Document) documents.remove(index);
451       if (DEBUG) Out.prln("documents after remove of " + oldDoc.getName()
452                           + " are " + documents);
453       fireDocumentRemoved(new CorpusEvent(SerialCorpusImpl.this,
454                                           oldDoc,
455                                           index,
456                                           CorpusEvent.DOCUMENT_REMOVED));
457     }
458 
459     return true;
460   }
461 
462   public int findDocument(Document doc) {
463     boolean found = false;
464     DocumentData docData = null;
465 
466     //first try finding the document in memory
467     int index = documents.indexOf(doc);
468     if (index > -1 && index < docDataList.size())
469       return index;
470 
471     //else try finding a document with the same name and persistent ID
472     Iterator iter = docDataList.iterator();
473     for (index = 0;  iter.hasNext(); index++) {
474       docData = (DocumentData) iter.next();
475       if (docData.getDocumentName().equals(doc.getName()) &&
476           docData.getPersistentID().equals(doc.getLRPersistenceId())) {
477         found = true;
478         break;
479       }
480     }
481     if (found && index < docDataList.size())
482       return index;
483     else
484       return -1;
485   }//findDocument
486 
487   public boolean containsAll(Collection c){
488     Iterator iter = c.iterator();
489     while (iter.hasNext()) {
490       if (! contains(iter.next()))
491         return false;
492     }
493     return true;
494   }
495 
496   public boolean addAll(Collection c){
497     boolean allAdded = true;
498     Iterator iter = c.iterator();
499     while (iter.hasNext()) {
500       if (! add(iter.next()))
501         allAdded = false;
502     }
503     return allAdded;
504   }
505 
  /**
   * Not supported: always throws UnsupportedOperationException.
   */
  public boolean addAll(int index, Collection c){
    throw new UnsupportedOperationException();
  }
509 
510   public boolean removeAll(Collection c){
511     boolean allRemoved = true;
512     Iterator iter = c.iterator();
513     while (iter.hasNext()) {
514       if (! remove(iter.next()))
515         allRemoved = false;
516     }
517     return allRemoved;
518 
519   }
520 
  /**
   * Not supported: always throws UnsupportedOperationException.
   */
  public boolean retainAll(Collection c){
    throw new UnsupportedOperationException();
  }
524 
  /**
   * Empties this corpus: both the in-memory document list and the document
   * data list are cleared. This method itself fires no corpus events.
   */
  public void clear(){
    documents.clear();
    docDataList.clear();
  }
529 
530   public boolean equals(Object o){
531     if (! (o instanceof SerialCorpusImpl))
532       return false;
533     SerialCorpusImpl oCorpus = (SerialCorpusImpl) o;
534     if ((this == null && oCorpus != null) || (oCorpus == null && this != null))
535       return false;
536     if (oCorpus == this)
537       return true;
538     if ((oCorpus.lrPersistentId == this.lrPersistentId ||
539           ( this.lrPersistentId != null &&
540             this.lrPersistentId.equals(oCorpus.lrPersistentId))
541           )
542         &&
543         oCorpus.name.equals(this.name)
544         &&
545         (oCorpus.dataStore == this.dataStore
546           || oCorpus.dataStore.equals(this.dataStore))
547         &&
548         oCorpus.docDataList.equals(docDataList))
549       return true;
550     return false;
551   }
552 
  /**
   * Returns the hash code of the document data list.
   */
  public int hashCode(){
    return docDataList.hashCode();
  }
556 
557   public Object get(int index){
558       if (index >= docDataList.size())
559         return null;
560 
561       Object res = documents.get(index);
562 
563       if (DEBUG)
564         Out.prln("SerialCorpusImpl: get(): index " + index + "result: " + res);
565 
566       //if the document is null, then I must get it from the DS
567       if (res == null) {
568         FeatureMap features = Factory.newFeatureMap();
569         features.put(DataStore.DATASTORE_FEATURE_NAME, this.dataStore);
570         try {
571           features.put(DataStore.LR_ID_FEATURE_NAME,
572                       ((DocumentData)docDataList.get(index)).getPersistentID());
573           Resource lr = Factory.createResource( "gate.corpora.DocumentImpl",
574                                                 features);
575           if (DEBUG)
576             Out.prln("Loaded document :" + lr.getName());
577           //change the result to the newly loaded doc
578           res = lr;
579 
580           //finally replace the doc with the instantiated version
581           documents.set(index, lr);
582         } catch (ResourceInstantiationException ex) {
583           Err.prln("Error reading document inside a serialised corpus.");
584           throw new GateRuntimeException(ex.getMessage());
585         }
586       }
587 
588       return res;
589   }
590 
  /**
   * Not implemented: always throws MethodNotImplementedException.
   */
  public Object set(int index, Object element){
    throw new gate.util.MethodNotImplementedException();
        //disabled code, never compiled: it would fire the 2 events
        //(a removal followed by an addition) for a replaced document
/*        fireDocumentRemoved(new CorpusEvent(SerialCorpusImpl.this,
                                            oldDoc,
                                            ((Integer) key).intValue(),
                                            CorpusEvent.DOCUMENT_REMOVED));
        fireDocumentAdded(new CorpusEvent(SerialCorpusImpl.this,
                                          newDoc,
                                          ((Integer) key).intValue(),
                                          CorpusEvent.DOCUMENT_ADDED));
*/
  }
604 
605   public void add(int index, Object o){
606     if (! (o instanceof Document) || o == null)
607       return;
608     Document doc = (Document) o;
609 
610     DocumentData docData = new DocumentData(doc.getName(),
611                                             doc.getLRPersistenceId());
612     docDataList.add(index, docData);
613 
614     documents.add(index, doc);
615     fireDocumentAdded(new CorpusEvent(SerialCorpusImpl.this,
616                                       doc,
617                                       index,
618                                       CorpusEvent.DOCUMENT_ADDED));
619 
620   }
621 
622   public Object remove(int index){
623     if (DEBUG) Out.prln("Remove index called");
624     docDataList.remove(index);
625     Document res = (Document) documents.remove(index);
626     fireDocumentRemoved(new CorpusEvent(SerialCorpusImpl.this,
627                                         res,
628                                         index,
629                                         CorpusEvent.DOCUMENT_REMOVED));
630     return res;
631 
632   }
633 
634   public int indexOf(Object o){
635     if (o instanceof Document)
636       return findDocument((Document) o);
637 
638     return -1;
639   }
640 
  /**
   * Not implemented: always throws MethodNotImplementedException.
   */
  public int lastIndexOf(Object o){
    throw new gate.util.MethodNotImplementedException();
  }
644 
  /**
   * Not implemented: always throws MethodNotImplementedException.
   */
  public ListIterator listIterator(){
    throw new gate.util.MethodNotImplementedException();
  }
648 
  /**
   * Not implemented: always throws MethodNotImplementedException.
   */
  public ListIterator listIterator(int index){
    throw new gate.util.MethodNotImplementedException();
  }
652 
  /**
   * The persistent corpus does not support this method, as all
   * the documents might not be in memory.
   * Always throws MethodNotImplementedException.
   */
  public List subList(int fromIndex, int toIndex){
    throw new gate.util.MethodNotImplementedException();
  }
660 
  /**
   * Sets the datastore of this corpus through the superclass and then, if
   * the dataStore field is not null, registers this corpus as a listener of
   * that datastore.
   * @param dataStore the datastore to set
   * @throws gate.persist.PersistenceException declared by this method;
   * nothing in this body throws it directly
   */
  public void setDataStore(DataStore dataStore)
                throws gate.persist.PersistenceException {
    super.setDataStore( dataStore);
    if (this.dataStore != null)
      this.dataStore.addDatastoreListener(this);
  }
667 
668   public void setTransientSource(Object source) {
669     if (! (source instanceof Corpus))
670       return;
671 
672     //the following initialisation is only valid when we're constructing
673     //this object from a transient one. If it has already been stored in
674     //a datastore, then the initialisation is done in readObject() since
675     //this method is the one called by serialisation, when objects
676     //are restored.
677     if (this.dataStore != null && this.lrPersistentId != null)
678       return;
679 
680     Corpus tCorpus = (Corpus) source;
681 
682     //copy the corpus name and features from the one in memory
683     this.setName(tCorpus.getName());
684     this.setFeatures(tCorpus.getFeatures());
685 
686     docDataList = new ArrayList();
687     //now cache the names of all docs for future use
688     Iterator iter = tCorpus.getDocumentNames().iterator();
689     while (iter.hasNext())
690       docDataList.add(new DocumentData((String) iter.next(), null));
691 
692     //copy all the documents from the transient corpus
693     documents = new ArrayList();
694     documents.addAll(tCorpus);
695 
696     //make sure we fire events when docs are added/removed/etc
697     Gate.getCreoleRegister().addCreoleListener(this);
698 
699   }
700 
  //we don't keep the transient source, so always return null.
  //Still, this must be implemented, because of the GUI and Factory
  public Object getTransientSource() {
    return null;
  }
706 
707 
  /**
   * Initialises this resource by calling the superclass init().
   * @return this corpus
   * @throws gate.creole.ResourceInstantiationException declared by this
   * method; nothing in this body throws it directly
   */
  public Resource init() throws gate.creole.ResourceInstantiationException {
    super.init();

    return this;

  }
714 
715 
716   /**
717    * readObject - calls the default readObject() and then initialises the
718    * transient data
719    *
720    * @serialData Read serializable fields. No optional data read.
721    */
722   private void readObject(ObjectInputStream s)
723       throws IOException, ClassNotFoundException {
724     s.defaultReadObject();
725     documents = new ArrayList(docDataList.size());
726     for (int i = 0; i < docDataList.size(); i++)
727       documents.add(null);
728     corpusListeners = new Vector();
729     //finally set the creole listeners if the LR is like that
730     Gate.getCreoleRegister().addCreoleListener(this);
731     if (this.dataStore != null)
732       this.dataStore.addDatastoreListener(this);
733 
734   }//readObject
735 
736   protected class DocumentData implements Serializable {
737     //fix the ID for serialisation
738     static final long serialVersionUID = 4192762901421847525L;
739 
740     DocumentData(String name, Object ID){
741       docName = name;
742       persistentID = ID;
743     }
744 
745     public String getDocumentName() {
746       return docName;
747     }
748 
749     public Object getPersistentID() {
750       return persistentID;
751     }
752 
753     public void setPersistentID(Object newID) {
754       persistentID = newID;
755     }
756 
757     public String toString() {
758       return new String("DocumentData: " + docName + ", " + persistentID);
759     }
760 
761     String docName;
762     Object persistentID;
763   }
764 
765 }