1   /*
2    *  LuceneIndexManager.java
3    *
4    *  Copyright (c) 1998-2004, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Rosen Marinov, 19/Apr/2002
12   *
13   */
14  
15  package gate.creole.ir.lucene;
16  
17  import java.io.File;
18  import java.util.Iterator;
19  import java.util.List;
20  
21  import org.apache.lucene.analysis.SimpleAnalyzer;
22  import org.apache.lucene.document.Field;
23  import org.apache.lucene.index.IndexReader;
24  import org.apache.lucene.index.IndexWriter;
25  
26  import gate.Corpus;
27  import gate.creole.ir.*;
28  import gate.util.GateRuntimeException;
29  
30  /** This class represents Lucene implementation of IndexManeager interface.*/
31  public class LuceneIndexManager implements IndexManager{
32  
33    /** used in Lucene Documents as a key for gate document ID value. */
34    public final static String DOCUMENT_ID = "DOCUMENT_ID";
35  
36    /** IndexDefinition - location, type, fields, etc.*/
37    private IndexDefinition indexDefinition;
38  
39    /** An corpus for indexing*/
40    private Corpus corpus;
41  
42    /** Constructor of the class. */
43    public LuceneIndexManager(){
44    }
45  
46    /** Creates index directory and indexing all
47     *  documents in the corpus. */
48    public void createIndex() throws IndexException{
49      if(indexDefinition == null)
50        throw new GateRuntimeException("Index definition is null!");
51      if(corpus == null)
52        throw new GateRuntimeException("Corpus is null!");
53  
54      String location = indexDefinition.getIndexLocation();
55      try {
56        File file = new File(location);
57        if (file.exists()){
58          if (file.isDirectory() && file.listFiles().length>0) {
59            throw new IndexException(location+ " is not empty directory");
60          }
61          if (!file.isDirectory()){
62            throw new IndexException("Only empty directory can be index path");
63          }
64        }
65  
66        IndexWriter writer = new IndexWriter(location,
67                                             new SimpleAnalyzer(), true);
68  
69        for(int i = 0; i<corpus.size(); i++) {
70          boolean isLoaded = corpus.isDocumentLoaded(i);
71          gate.Document gateDoc = (gate.Document) corpus.get(i);
72          writer.addDocument(getLuceneDoc(gateDoc));
73          if (!isLoaded) {
74            corpus.unloadDocument(gateDoc);
75          }
76        }//for (all documents)
77  
78        writer.close();
79        corpus.sync();
80      } catch (java.io.IOException ioe){
81        throw new IndexException(ioe.getMessage());
82      } catch (gate.persist.PersistenceException pe){
83        pe.printStackTrace();
84      } catch (gate.security.SecurityException se){
85        se.printStackTrace();
86      }
87    }
88  
89    /** Optimize existing index. */
90    public void optimizeIndex() throws IndexException{
91      if(indexDefinition == null)
92        throw new GateRuntimeException("Index definition is null!");
93      try {
94        IndexWriter writer = new IndexWriter(indexDefinition.getIndexLocation(),
95                                           new SimpleAnalyzer(), false);
96        writer.optimize();
97        writer.close();
98      } catch (java.io.IOException ioe){
99        throw new IndexException(ioe.getMessage());
100     }
101   }
102 
103   /** Delete index. */
104   public void deleteIndex() throws IndexException{
105     if(indexDefinition == null)
106       throw new GateRuntimeException("Index definition is null!");
107     boolean isDeleted = true;
108     File dir = new File(indexDefinition.getIndexLocation());
109     if (dir.exists() && dir.isDirectory()) {
110       File[] files = dir.listFiles();
111       for (int i =0; i<files.length; i++){
112         File f = files[i];
113         isDeleted = f.delete();
114       }
115     }
116     dir.delete();
117     if (!isDeleted) {
118       throw new IndexException("Can't delete directory"
119                                + indexDefinition.getIndexLocation());
120     }
121   }
122 
123   /** Reindexing changed documents, removing removed documents and
124    *  add to the index new corpus documents. */
125   public void sync(List added, List removedIDs, List changed) throws IndexException{
126     String location = indexDefinition.getIndexLocation();
127     try {
128 
129       IndexReader reader = IndexReader.open(location);
130 
131       for (int i = 0; i<removedIDs.size(); i++) {
132         String id = removedIDs.get(i).toString();
133         org.apache.lucene.index.Term term =
134                                new org.apache.lucene.index.Term(DOCUMENT_ID,id);
135         reader.delete(term);
136       }//for (remove all removed documents)
137 
138       for (int i = 0; i<changed.size(); i++) {
139         gate.Document gateDoc = (gate.Document) changed.get(i);
140         String id = gateDoc.getLRPersistenceId().toString();
141         org.apache.lucene.index.Term term =
142                                new org.apache.lucene.index.Term(DOCUMENT_ID,id);
143         reader.delete(term);
144       }//for (remove all changed documents)
145 
146       reader.close();
147 
148       IndexWriter writer = new IndexWriter(location,
149                                           new SimpleAnalyzer(), false);
150 
151       for(int i = 0; i<added.size(); i++) {
152         gate.Document gateDoc = (gate.Document) added.get(i);
153         writer.addDocument(getLuceneDoc(gateDoc));
154       }//for (add all added documents)
155 
156       for(int i = 0; i<changed.size(); i++) {
157         gate.Document gateDoc = (gate.Document) changed.get(i);
158         writer.addDocument(getLuceneDoc(gateDoc));
159       }//for (add all changed documents)
160 
161       writer.close();
162     } catch (java.io.IOException ioe) {
163       throw new IndexException(ioe.getMessage());
164     }
165   }
166 
167   private org.apache.lucene.document.Document getLuceneDoc(gate.Document gateDoc){
168     org.apache.lucene.document.Document luceneDoc =
169                                      new org.apache.lucene.document.Document();
170     Iterator fields = indexDefinition.getIndexFields();
171 
172     luceneDoc.add(Field.Keyword(DOCUMENT_ID,
173                                 gateDoc.getLRPersistenceId().toString()));
174 
175     while (fields.hasNext()) {
176       IndexField field = (IndexField) fields.next();
177       String valueForIndexing;
178 
179       if (field.getReader() == null){
180         valueForIndexing = gateDoc.getFeatures().get(field.getName()).toString();
181       } else {
182         valueForIndexing = field.getReader().getPropertyValue(gateDoc);
183       } //if-else reader or feature
184 
185       if (field.isPreseved()) {
186         luceneDoc.add(Field.Keyword(field.getName(),valueForIndexing));
187       } else {
188         luceneDoc.add(Field.UnStored(field.getName(),valueForIndexing));
189       } // if-else keyword or text
190 
191     }// while (add all fields)
192 
193     return luceneDoc;
194   }
195 
196   public Corpus getCorpus() {
197     return corpus;
198   }
199   public void setCorpus(Corpus corpus) {
200     this.corpus = corpus;
201   }
202   public IndexDefinition getIndexDefinition() {
203     return indexDefinition;
204   }
205   public void setIndexDefinition(IndexDefinition indexDefinition) {
206     this.indexDefinition = indexDefinition;
207   }
208 
209 }