首页 > 代码库 > Lucene学习笔记

Lucene学习笔记

用Lucene实现了一个简单的文件检索功能,作为最近学习Lucene的成果。

 

定义常量类:

/**
 * Names of the Lucene document fields used when indexing and searching files.
 *
 * <p>The fields are declared {@code final} so that no caller can reassign a
 * field name at runtime.
 */
public class Constant {
    /** Name of the document field that holds the file name. */
    public static final String FILE_NAME = "fileName";

    /** Name of the document field that holds the file content. */
    public static final String FILE_CONTENT = "fileContent";

    /** Name of the document field that holds the file path. */
    public static final String FILE_PATH = "filePath";
}

 

索引创建类:

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class IndexGenerator {

    /** File-system location of the Lucene index that this class writes to. */
    private static final String INDEX_DIR = "E:\\exercise\\luceneFS";

    /**
     * Indexes the given file, or every non-directory entry found beneath the
     * given directory, so that it can be searched later.
     *
     * <p>Each file becomes one document with its name and absolute path stored
     * un-analyzed, plus its content indexed from a reader. I/O failures are
     * printed to standard error and stop the indexing loop; the writer and the
     * directory are always closed.
     *
     * <p>NOTE(review): the writer is created with the default
     * {@code IndexWriterConfig} open mode, so re-running this method presumably
     * appends to an existing index and the returned count would then include
     * documents from earlier runs -- confirm.
     *
     * @param fileDir path of a file, or of a directory to walk recursively
     * @return the document count reported by the writer after indexing, or
     *         {@code 0} if indexing failed before the count was read
     */
    public static int generatorIndex(String fileDir) {
        List<File> fileList = collectFiles(new File(fileDir));

        Directory dir = null;
        IndexWriter indexWriter = null;
        int num = 0;
        try {
            dir = FSDirectory.open(new File(INDEX_DIR));
            IndexWriterConfig config =
                    new IndexWriterConfig(Version.LUCENE_36, new StandardAnalyzer(Version.LUCENE_36));
            indexWriter = new IndexWriter(dir, config);

            for (File f : fileList) {
                FileReader contentReader = new FileReader(f);
                try {
                    Document doc = new Document();
                    // File name, stored and indexed as a single un-analyzed term.
                    doc.add(new Field(Constant.FILE_NAME, f.getName(), Store.YES, Index.NOT_ANALYZED));
                    // Path of THIS file (not of the root argument), stored for display.
                    doc.add(new Field(Constant.FILE_PATH, f.getAbsolutePath(), Store.YES, Index.NOT_ANALYZED));
                    // File content, read from the reader while the document is added.
                    doc.add(new Field(Constant.FILE_CONTENT, contentReader));
                    indexWriter.addDocument(doc);
                } finally {
                    // Close explicitly so the handle is released even if addDocument fails.
                    try {
                        contentReader.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
            }

            num = indexWriter.numDocs();
            System.out.println("共对" + num + "个文件生成了索引");
        } catch (IOException e) {
            // Also covers FileNotFoundException raised when a file cannot be opened.
            e.printStackTrace();
        } finally {
            if (indexWriter != null) {
                try {
                    indexWriter.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (dir != null) {
                try {
                    dir.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }

        return num;
    }

    /**
     * Collects {@code root} itself when it is not a directory, otherwise every
     * non-directory entry beneath it (breadth-first).
     */
    private static List<File> collectFiles(File root) {
        List<File> found = new ArrayList<File>();
        if (!root.isDirectory()) {
            found.add(root);
            return found;
        }

        Queue<File> pending = new LinkedList<File>();
        pending.add(root);
        while (!pending.isEmpty()) {
            File[] children = pending.poll().listFiles();
            if (children == null) {
                // listFiles() returns null when the directory cannot be listed; skip it.
                continue;
            }
            for (File child : children) {
                if (child.isDirectory()) {
                    pending.add(child);
                } else {
                    found.add(child);
                }
            }
        }
        return found;
    }
}

 

搜索类:

package com.insaneXs.learnLucene;

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.SearcherFactory;
import org.apache.lucene.search.SearcherManager;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class LuceneSearcher {
    /**
     * Searches the index for the given keyword and prints the matches.
     *
     * <p>The keyword is lowercased and used as an exact term against the file
     * name field (required) and the file content field (optional). At most ten
     * hits are fetched; the total hit count and each hit's stored file name and
     * path are printed to standard output. I/O failures are printed to standard
     * error.
     *
     * <p>The index is opened read-only through a {@code Directory}-based
     * {@code SearcherManager}; no {@code IndexWriter} is created, so searching
     * does not take the index write lock.
     *
     * @param str the keyword to search for; must not be {@code null}
     * @throws IllegalArgumentException if {@code str} is {@code null}
     */
    public static void search(String str){
        if (str == null) {
            throw new IllegalArgumentException("str must not be null");
        }
        // TermQuery does no analysis, so the keyword is lowercased by hand here.
        // Locale.ROOT keeps the result independent of the JVM's default locale.
        // (A QueryParser configured with StandardAnalyzer would lowercase the
        // keyword itself, but requires handling ParseException.)
        String keyword = str.toLowerCase(java.util.Locale.ROOT);

        Directory dir = null;
        SearcherManager searcherManager = null;
        IndexSearcher indexSearcher = null;
        try {
            dir = FSDirectory.open(new File("E:\\exercise\\luceneFS"));
            // An IndexSearcher could also be constructed directly.
            searcherManager = new SearcherManager(dir, new SearcherFactory());
            indexSearcher = searcherManager.acquire();

            Query titleQ = new TermQuery(new Term(Constant.FILE_NAME, keyword));
            Query contentQ = new TermQuery(new Term(Constant.FILE_CONTENT, keyword));

            // From Lucene 5.3 on, BooleanQuery is built through BooleanQuery.Builder
            // instead of this constructor.
            // NOTE(review): FILE_NAME is indexed NOT_ANALYZED by IndexGenerator, so the
            // MUST clause only matches a file whose whole name equals the lowercased
            // keyword -- confirm that this is the intended behavior.
            BooleanQuery query = new BooleanQuery();
            query.add(titleQ, Occur.MUST);
            query.add(contentQ, Occur.SHOULD);

            // SortField takes the field to sort on and the sort type.
            // NOTE(review): with type SortField.DOC the ordering is presumably by
            // document number and the field name is ignored -- confirm.
            Sort sort = new Sort(new SortField(Constant.FILE_NAME, SortField.DOC));
            // Fetch at most ten hits.
            TopDocs res = indexSearcher.search(query, 10, sort);
            // Total number of matching documents.
            int totalLength = res.totalHits;
            System.out.println("一共有" + totalLength + "条记录符合条件");
            // Number of hits actually returned by this query.
            int length = res.scoreDocs.length;
            System.out.println("本次查询" + length + "条记录");
            for(ScoreDoc scoreDoc : res.scoreDocs){
                // Load the stored document for this hit and print its display fields.
                Document doc = indexSearcher.doc(scoreDoc.doc);
                System.out.println("文件名:" + doc.get(Constant.FILE_NAME) + "; 路径:" + doc.get(Constant.FILE_PATH));
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if(searcherManager != null){
                // Only release a searcher that was actually acquired.
                if(indexSearcher != null){
                    try {
                        searcherManager.release(indexSearcher);
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
                // Close the manager so the reader it holds is released.
                try {
                    searcherManager.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if(dir != null){
                try {
                    dir.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}

 

测试类:

/**
 * Manual driver that builds the index and then runs one sample search.
 */
public class LuceneTest {

    /**
     * Indexes the sample data directory, then searches the index for a fixed keyword.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        final String dataDir = "E:\\exercise\\testData";
        final String keyword = "happy";

        // Step 1: (re)build the index from the sample data.
        IndexGenerator.generatorIndex(dataDir);

        // Step 2: query the index that was just written.
        LuceneSearcher.search(keyword);
    }

}

 

开发过程中使用TermQuery时踩了不小的坑,主要是索引关键字的大小写问题:经StandardAnalyzer分词的域在建立索引时,关键字会被自动存为小写;而TermQuery不会对查询词做任何处理,因此用TermQuery查询之前要先手动把关键字转成小写。

而使用QueryParser(并指定StandardAnalyzer)时,底层会自动把关键字处理成小写,所以不用关心这个问题。

 

另外,Lucene版本变动对API的影响也较大,不少接口已经被废弃了。

上述代码使用的是Lucene 3.6版本。

 

参考资料:

Lucene主要API介绍

luceneapi.com

lucene易佰教程

Lucene学习笔记