// largely what I learned from http://javatechniques.com/blog/lucene-in-memory-text-search-example/ // but with a few functions/classes tweaked because Lucene has been updated /** * A simple example of an in-memory search using Lucene. */ import java.io.IOException; import java.io.StringReader; import java.io.InputStream; import java.io.FileOutputStream; import java.io.ObjectInputStream; import java.io.ObjectOutputStream; import java.util.List; import org.apache.lucene.search.Hits; import org.apache.lucene.search.Query; import org.apache.lucene.document.Field; import org.apache.lucene.search.Searcher; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.document.Document; import org.apache.lucene.store.RAMDirectory; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.queryParser.MultiFieldQueryParser; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.analysis.SimpleAnalyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import processing.core.PApplet; public class SimpleLucene { public static final String FULL = "full"; public static final String LINE_NUM = "line number"; public static final int MAX_HITS = 10; // Construct a RAMDirectory to hold the in-memory representation of the index. RAMDirectory idx = null; Searcher searcher; MultiFieldQueryParser parser; public SimpleLucene(PApplet parent, List data) { this(parent, data, false); } // assume that data maps a string (labelId) to a string (label) public SimpleLucene(PApplet parent, List data, boolean useDisk) { // first try and load the index from disk because it's faster than creating a new one if (useDisk) { try { InputStream fis = parent.openStream("lucene_data"); ObjectInputStream ois = new ObjectInputStream(fis); idx = (RAMDirectory)ois.readObject(); ois.close(); } catch(Exception e1) { // nothing to worry about, probably the file wasn't there, that's all parent.println("problem loading index, let's create a new one"); } } if (idx == null) { try { // this is an index that sits in memory - // Lucene also (more commonly, actually) has indexes that live on disk idx = new RAMDirectory(); // Make an writer to create the index IndexWriter writer = new IndexWriter(idx, new StandardAnalyzer(), true); // pull the data from our list and add it to the index for (int i = 0; i < data.size(); i++) { Object part = data.get(i); Document doc = new Document(); Field index = new Field(LINE_NUM, ""+i, Field.Store.YES, Field.Index.NO); doc.add(index); Field full = new Field(FULL, part.toString(), Field.Store.NO, Field.Index.TOKENIZED); doc.add(full); writer.addDocument(doc); } // Optimize and close the writer to finish building the index writer.optimize(); writer.close(); // if we're not an applet, save the index to disk for next time // NB:- the lucene docs don't talk much about this: // ...RAMDirectory wasn't Serializable for a long time, caveat saver, I suppose! // also, be sure to delete lucene_data if you change the data or indexing! if (useDisk && !parent.online) { FileOutputStream fos = new FileOutputStream(parent.dataPath("lucene_data")); ObjectOutputStream oos = new ObjectOutputStream(fos); oos.writeObject(idx); oos.close(); } } catch(Exception e2) { e2.printStackTrace(); System.err.println("eek! error saving database"); } } try { // you can get away without using a multifieldqueryparser here, // I left it in because it was useful for the project this code came from parser = new MultiFieldQueryParser(new String[] { FULL } , new StandardAnalyzer()); // Build an IndexSearcher using the in-memory index searcher = new IndexSearcher(idx); } catch(Exception e) { e.printStackTrace(); } } int[] search(String queryString) { int[] indexes = null; if (queryString == null || queryString.length() == 0) { return null; } try { Query query = parser.parse( new String[] { queryString }, new String[] { FULL }, new BooleanClause.Occur[] { BooleanClause.Occur.MUST }, parser.getAnalyzer() ); // Search for the query Hits hits = searcher.search(query); // put all the hit ids into the labels array if (hits.length() > 0) { int numHits = Math.min(MAX_HITS, hits.length()); indexes = new int[numHits]; for (int i = 0; i < numHits; i++) { Document doc = hits.doc(i); indexes[i] = Integer.parseInt(doc.get(LINE_NUM)); } } } catch (Exception e) { e.printStackTrace(); } return indexes; } }