首页 > 代码库 > lucene对文件做简单的索引
lucene对文件做简单的索引
package com.mylucene;

import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.nio.CharBuffer;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.LucenePackage;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

/**
 * Minimal Lucene 4.7 demo: reads every file of a directory into an {@link Item},
 * indexes the String properties of those items, and runs a keyword search over
 * all indexed properties.
 *
 * <p>Properties are discovered by reflection: every non-static, non-synthetic
 * {@code String} field is indexed under its own name and accessed through the
 * matching {@code getXxx}/{@code setXxx} method.
 */
public class MyLuceneTest {

    /** Maximum number of hits returned by {@link #searchIndexer}. */
    private static final int MAX_HITS = 10;

    /** Size of the chunk buffer used when reading a file into memory. */
    private static final int READ_BUFFER_SIZE = 8192;

    /**
     * Rebuilds the index from scratch with one document per item.
     *
     * <p>All existing documents are deleted first. Properties whose value is
     * {@code null} are skipped, because Lucene rejects null field values.
     *
     * @param analyzer  analyzer used to tokenize the field values
     * @param directory directory that receives the index; it is NOT closed here
     * @param items     items to index
     * @return {@code true} if every document was written and the writer was
     *         closed (committed) successfully, {@code false} otherwise
     */
    private boolean buildIndexer(Analyzer analyzer, Directory directory, List<Item> items) {
        IndexWriter iwriter = null;
        boolean success = false;
        try {
            // Configure the writer.
            iwriter = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_47, analyzer));
            // Drop every previously indexed document.
            iwriter.deleteAll();
            // Store each item as one document.
            for (Item item : items) {
                Document doc = new Document();
                for (String fieldName : stringPropertyNames(item.getClass())) {
                    String getMethodName = "get" + toFirstLetterUpperCase(fieldName);
                    Object value = item.getClass().getMethod(getMethodName).invoke(item);
                    if (value == null) {
                        continue;
                    }
                    doc.add(new Field(fieldName, value.toString(), TextField.TYPE_STORED));
                }
                iwriter.addDocument(doc);
            }
            success = true;
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // The writer is null when its constructor threw.
            if (iwriter != null) {
                try {
                    iwriter.close();
                } catch (IOException e) {
                    // close() commits pending changes, so a failure here means
                    // the index was not written completely.
                    e.printStackTrace();
                    success = false;
                }
            }
        }
        return success;
    }

    /**
     * Searches all String properties of {@link Item} for the given keyword.
     *
     * <p>Note: this method closes {@code directory} before returning, whether
     * the search succeeded or not.
     *
     * @param analyzer  analyzer used to parse the keyword
     * @param directory directory holding the index; closed by this method
     * @param keyword   query text in Lucene classic query-parser syntax
     * @return at most {@value #MAX_HITS} matching items, or {@code null} if the
     *         search failed (the exception is printed to standard error)
     */
    public List<Item> searchIndexer(Analyzer analyzer, Directory directory, String keyword) {
        DirectoryReader ireader = null;
        List<Item> result = new ArrayList<Item>();
        try {
            // Open the index.
            ireader = DirectoryReader.open(directory);
            IndexSearcher isearcher = new IndexSearcher(ireader);
            // Search across every indexed property.
            String[] multiFields = stringPropertyNames(Item.class);
            MultiFieldQueryParser parser =
                    new MultiFieldQueryParser(Version.LUCENE_47, multiFields, analyzer);
            // Parse the keyword and run the query.
            Query query = parser.parse(keyword);
            ScoreDoc[] hits = isearcher.search(query, null, MAX_HITS).scoreDocs;
            for (ScoreDoc hit : hits) {
                Document hitDoc = isearcher.doc(hit.doc);
                Item item = new Item();
                for (String field : multiFields) {
                    String setMethodName = "set" + toFirstLetterUpperCase(field);
                    Item.class.getMethod(setMethodName, String.class)
                            .invoke(item, hitDoc.get(field));
                }
                result.add(item);
            }
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        } finally {
            // Close each resource independently so that a failure (or a reader
            // that was never opened) cannot prevent the other from closing.
            closeAndReport(ireader);
            closeAndReport(directory);
        }
        return result;
    }

    /**
     * Returns the string with its first character converted to upper case.
     *
     * @param str input string, may be {@code null}
     * @return {@code str} itself when it is {@code null} or empty, otherwise a
     *         copy whose first character is upper-cased
     */
    public static String toFirstLetterUpperCase(String str) {
        if (str == null || str.isEmpty()) {
            return str;
        }
        // Locale.ROOT keeps accessor names stable regardless of the default
        // locale (e.g. Turkish would turn "id" into a dotted capital I).
        return str.substring(0, 1).toUpperCase(java.util.Locale.ROOT) + str.substring(1);
    }

    /**
     * Demo entry point: indexes the files found in {@code C:/mylucene} into an
     * on-disk index at {@code c:/lucene} and prints the hits for a fixed query.
     *
     * @param args unused
     * @throws Exception if the data directory cannot be listed or a file cannot be read
     */
    public static void main(String[] args) throws Exception {
        System.out.println(LucenePackage.get());
        MyLuceneTest demo = new MyLuceneTest();
        // SmartChineseAnalyzer (imported above) is an alternative analyzer that
        // can be swapped in here.
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47);

        // Items can also be created inline instead of from files, e.g.
        //   items.add(new Item("1", "中国", "This is the text to be greatly indexed."));
        List<Item> items = new ArrayList<Item>();
        File dataFile = new File("C:/mylucene");
        File[] dataFiles = dataFile.listFiles();
        if (dataFiles == null) {
            // listFiles() returns null when the path is not a readable directory.
            throw new IOException("Cannot list data directory: " + dataFile.getAbsolutePath());
        }
        for (File dataEntry : dataFiles) {
            if (!dataEntry.isFile()) {
                // Sub-directories cannot be opened with a FileReader.
                continue;
            }
            String str = readFully(dataEntry);
            System.out.println(str);
            items.add(new Item(dataEntry.getCanonicalPath(), dataEntry.getName(), str));
        }

        // Store the index on disk (an in-memory directory would work as well).
        File indexDir = new File("c:/lucene");
        Directory directory = FSDirectory.open(indexDir);
        if (!demo.buildIndexer(analyzer, directory, items)) {
            System.err.println("Building the index failed; see the stack trace above.");
            closeAndReport(directory);
            return;
        }

        // searchIndexer closes the directory itself.
        List<Item> result = demo.searchIndexer(analyzer, directory, "中国");
        if (result == null) {
            System.err.println("Searching the index failed; see the stack trace above.");
            return;
        }
        for (Item item : result) {
            System.out.println(item.toString());
        }
    }

    /** Lists the names of the non-static, non-synthetic String fields declared by the type. */
    private static String[] stringPropertyNames(Class<?> type) {
        List<String> names = new ArrayList<String>();
        for (java.lang.reflect.Field field : type.getDeclaredFields()) {
            boolean isStatic = java.lang.reflect.Modifier.isStatic(field.getModifiers());
            if (!field.isSynthetic() && !isStatic && field.getType() == String.class) {
                names.add(field.getName());
            }
        }
        return names.toArray(new String[names.size()]);
    }

    /** Reads the whole file as text using the platform default charset, then closes it. */
    private static String readFully(File file) throws IOException {
        Reader reader = new FileReader(file);
        try {
            StringBuilder text = new StringBuilder();
            char[] buffer = new char[READ_BUFFER_SIZE];
            int count;
            // read() may return fewer characters than requested, so loop until EOF
            // and append only the characters that were actually read.
            while ((count = reader.read(buffer)) != -1) {
                text.append(buffer, 0, count);
            }
            return text.toString();
        } finally {
            reader.close();
        }
    }

    /** Closes the resource if it is non-null, printing (not propagating) any failure. */
    private static void closeAndReport(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
package com.mylucene;
/**
 * Mutable value holder with three String properties: id, title and content.
 *
 * <p>In this file's demo the id carries a file's canonical path, the title its
 * name and the content its text. The indexing code discovers the properties by
 * reflection over the declared fields and calls the matching
 * {@code getXxx}/{@code setXxx} methods, so field and accessor names must stay
 * in sync.
 */
public class Item {

    private String id;
    private String title;
    private String content;

    /** Creates an item whose properties are all {@code null}. */
    public Item() {
        this(null, null, null);
    }

    /**
     * Creates a fully populated item.
     *
     * @param id      identifier of the item
     * @param title   title of the item
     * @param content content of the item
     */
    public Item(String id, String title, String content) {
        this.id = id;
        this.title = title;
        this.content = content;
    }

    /** @return the identifier, possibly {@code null} */
    public String getId() {
        return this.id;
    }

    /** @param id the new identifier */
    public void setId(String id) {
        this.id = id;
    }

    /** @return the title, possibly {@code null} */
    public String getTitle() {
        return this.title;
    }

    /** @param title the new title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the content, possibly {@code null} */
    public String getContent() {
        return this.content;
    }

    /** @param content the new content */
    public void setContent(String content) {
        this.content = content;
    }

    /**
     * Renders the item as {@code [id=...,title=...,content=...]}.
     *
     * @return the textual form of this item
     */
    @Override
    public String toString() {
        return "[id=" + id + ",title=" + title + ",content=" + content + "]";
    }
}
这里将文件的三个属性做了一下抽象,并用另一个类来表示。在以前的版本中,是用 Reader 读取文件,并且在为文件添加索引时直接把 Reader 对象添加进去,不需要先把文件内容全部读出来再进行封装。而这里的做法在文件非常大的时候,内存可能会存不下,从而导致内存不足或者数组越界。这里应该还可以像以前的版本一样直接对文件建立索引,我相信只是我还没有找到好的解决办法,所以应该多研究一下 4.8 的 API。