首页 > 代码库 > lucene创建索引的几种方式(一)

lucene创建索引的几种方式(一)

什么是索引:

索引是为了加快检索而预先建立的数据结构:Lucene 会把文档内容分词,建立"词 → 文档"的对应关系(倒排索引);搜索时根据你输入的关键词,直接在索引中找到包含它的文档。

第一种创建索引的方式:

根据文件来生成索引,如后缀为.txt等的文件

步骤:

第一步:FSDirectory.open(Paths.get(url));根据路径获取存储索引的目录。

FSDirectory:表示对文件系统目录的操作。RAMDirectory :内存中的目录操作。

Paths 是 NIO(New IO)中的一个工具类;Path 可以看作 java.io.File 类的升级版:File file = new File("index.html"),而 Path path = Paths.get("index.html")。由于 Path 只是基于字符串创建的路径对象,因此它引用的资源也有可能不存在。

关于NIO:传统的IO流是面向流的,数据以字节为单位顺序地读写,因此面向流的输入/输出系统通常效率不高;为此引入了新IO(New IO,NIO)。NIO可以采用内存映射文件的方式来处理输入/输出,即将文件或文件的一段区域映射到内存中,这样就可以像访问内存一样来访问文件了(这种方式模拟了操作系统上的虚拟内存的概念),所以NIO的效率更高。

第二步:new IndexWriter(Directory,IndexWriterConfig)创建写索引的实例(IndexWriter)

第三步:索引指定目录的文件

第四步:将文件写入lucene中的文档(Document)

package com.wp.util;import java.io.File;import java.io.FileReader;import java.nio.file.Paths;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.analysis.standard.StandardAnalyzer;import org.apache.lucene.document.Document;import org.apache.lucene.document.Field;import org.apache.lucene.document.TextField;import org.apache.lucene.index.IndexWriter;import org.apache.lucene.index.IndexWriterConfig;import org.apache.lucene.store.Directory;import org.apache.lucene.store.FSDirectory;public class Indexer {    private IndexWriter writer; // 写索引实例    /**     * 构造方法 实例化IndexWriter     *      * @param indexDir     * @throws Exception     */    public Indexer(String indexDir) throws Exception {        Directory dir = FSDirectory.open(Paths.get(indexDir));// 根据路径获取存储索引的目录        Analyzer analyzer = new StandardAnalyzer(); // 标准分词器        IndexWriterConfig iwc = new IndexWriterConfig(analyzer);        writer = new IndexWriter(dir, iwc);    }    /**     * 关闭写索引     *      * @throws Exception     */    public void close() throws Exception {        writer.close();    }    /**     * 索引指定目录的所有文件     *      * @param dataDir     * @throws Exception     */    public int index(String dataDir) throws Exception {        File[] files = new File(dataDir).listFiles();        for (File f : files) {            indexFile(f);        }        return writer.numDocs();    }    /**     * 索引指定文件     *      * @param f     */    private void indexFile(File f) throws Exception {        // 关于f.getCanonicalPath()查看http://www.blogjava.net/dreamstone/archive/2007/08/08/134968.html        System.out.println("索引文件:" + f.getCanonicalPath());        Document doc = getDocument(f);        writer.addDocument(doc);    }    /**     * 获取文档,文档里再设置每个字段     *      * @param f     */    private Document getDocument(File f) throws Exception {        Document doc = new Document();        doc.add(new TextField("contents", new FileReader(f)));        doc.add(new TextField("fileName", f.getName(), 
Field.Store.YES));        doc                .add(new TextField("fullPath", f.getCanonicalPath(),                        Field.Store.YES));        return doc;    }    public static void main(String[] args) {        String indexDir = "D:\\lucene4";        String dataDir = "D:\\lucene4\\data";        Indexer indexer = null;        int numIndexed = 0;        long start = System.currentTimeMillis();        try {            indexer = new Indexer(indexDir);            numIndexed = indexer.index(dataDir);        } catch (Exception e) {            e.printStackTrace();        } finally {            try {                indexer.close();            } catch (Exception e) {                e.printStackTrace();            }        }        long end = System.currentTimeMillis();        System.out.println("索引:" + numIndexed + " 个文件 花费了" + (end - start)                + " 毫秒");    }}

第二种创建索引的方式:

根据字段来生成索引,我用的是数组

第一步:创建索引

第二步:将字段添加到文档中

package com.wp.util;import java.nio.file.Paths;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.analysis.standard.StandardAnalyzer;import org.apache.lucene.document.Document;import org.apache.lucene.document.Field;import org.apache.lucene.document.StringField;import org.apache.lucene.document.TextField;import org.apache.lucene.index.DirectoryReader;import org.apache.lucene.index.IndexReader;import org.apache.lucene.index.IndexWriter;import org.apache.lucene.index.IndexWriterConfig;import org.apache.lucene.index.Term;import org.apache.lucene.store.Directory;import org.apache.lucene.store.FSDirectory;import org.junit.Before;import org.junit.Test;public class IndexIngTest {    private String ids[] = { "1", "2", "3" };    private String citys[] = { "qingdao", "nanjing", "shanghai" };    private String descs[] = { "Qingdao is a beautiful city.",            "Nanjing is a city of culture.", "Shanghai is a bustling city." };    private Directory dir;// 目录    /**     * 获取IndexWriter实例     *      * @return     * @throws Exception     */    private IndexWriter getWriter() throws Exception {        Analyzer analyzer = new StandardAnalyzer(); // 标准分词器        IndexWriterConfig iwc = new IndexWriterConfig(analyzer);        IndexWriter writer = new IndexWriter(dir, iwc);        return writer;    }    /**     * 添加文档     *      * @throws Exception     */    @Before    public void setUp() throws Exception {        dir = FSDirectory.open(Paths.get("D:\\lucene\\luceneIndex"));// 得到luceneIndex目录        IndexWriter writer = getWriter();// 得到索引        for (int i = 0; i < ids.length; i++) {            Document doc = new Document();// 创建文档            doc.add(new StringField("id", ids[i], Field.Store.YES));// 将id属性存入内存中            doc.add(new StringField("city", citys[i], Field.Store.YES));            doc.add(new TextField("desc", descs[i], Field.Store.NO));            writer.addDocument(doc); // 添加文档        }        writer.close();    }    /**     * 测试写了几个文档     *      * 
@throws Exception     */    @Test    public void testIndexWriter() throws Exception {        IndexWriter writer = getWriter();        System.out.println("写入了" + writer.numDocs() + "个文档");        writer.close();    }    /**     * 测试读取文档     *      * @throws Exception     */    @Test    public void testIndexReader() throws Exception {        IndexReader reader = DirectoryReader.open(dir);        System.out.println("最大文档数:" + reader.maxDoc());        System.out.println("实际文档数:" + reader.numDocs());        reader.close();    }    /**     * 测试删除 在合并前     *      * @throws Exception     */    @Test    public void testDeleteBeforeMerge() throws Exception {        IndexWriter writer = getWriter();        System.out.println("删除前:" + writer.numDocs());        writer.deleteDocuments(new Term("id", "1"));// term:根据id找到为1的        writer.commit();        System.out.println("writer.maxDoc():" + writer.maxDoc());        System.out.println("writer.numDocs():" + writer.numDocs());        writer.close();    }    /**     * 测试删除 在合并后     *      * @throws Exception     */    @Test    public void testDeleteAfterMerge() throws Exception {        IndexWriter writer = getWriter();        System.out.println("删除前:" + writer.numDocs());        writer.deleteDocuments(new Term("id", "1"));        writer.forceMergeDeletes(); // 强制删除        writer.commit();        System.out.println("writer.maxDoc():" + writer.maxDoc());        System.out.println("writer.numDocs():" + writer.numDocs());        writer.close();    }    /**     * 测试更新     *      * @throws Exception     */    @Test    public void testUpdate() throws Exception {        IndexWriter writer = getWriter();        Document doc = new Document();        doc.add(new StringField("id", "1", Field.Store.YES));        doc.add(new StringField("city", "qingdao", Field.Store.YES));        doc.add(new TextField("desc", "dsss is a city.", Field.Store.NO));        writer.updateDocument(new Term("id", "1"), doc);        writer.close();    }}

生成的索引文件如下:

技术分享

 

关于索引的搜索:

这里有一个要注意的地方:一定要先创建出索引后才能去进行查找,否则会报如下异常:

org.apache.lucene.index.IndexNotFoundException:
no segments* file found in MMapDirectory@D:\lucene lockFactory=org.apache.lucene.store.NativeFSLockFactory@753f67a9: files: [data, lucene-5.3.1, lucene-5.3.1.zip]
package com.wp.lucene;import java.nio.file.Paths;import org.apache.lucene.analysis.Analyzer;import org.apache.lucene.analysis.standard.StandardAnalyzer;import org.apache.lucene.document.Document;import org.apache.lucene.index.DirectoryReader;import org.apache.lucene.index.IndexReader;import org.apache.lucene.queryparser.classic.QueryParser;import org.apache.lucene.search.IndexSearcher;import org.apache.lucene.search.Query;import org.apache.lucene.search.ScoreDoc;import org.apache.lucene.search.TopDocs;import org.apache.lucene.store.Directory;import org.apache.lucene.store.FSDirectory;public class Searcher {    /**     *      * @param indexDir     *            哪个目录     * @param q     *            要查询的字段     * @throws Exception     */    public static void search(String indexDir, String q) throws Exception {        Directory dir = FSDirectory.open(Paths.get(indexDir));// 打开目录        IndexReader reader = DirectoryReader.open(dir);// 进行读取        IndexSearcher is = new IndexSearcher(reader);// 索引查询器        Analyzer analyzer = new StandardAnalyzer(); // 标准分词器        QueryParser parser = new QueryParser("contents", analyzer);// 在哪查询,第一个参数为查询的Document,在Indexer中创建了        Query query = parser.parse(q);// 对字段进行解析后返回给查询        long start = System.currentTimeMillis();        TopDocs hits = is.search(query, 10);// 开始查询,10代表前10条数据;返回一个文档        long end = System.currentTimeMillis();        System.out.println("匹配 " + q + " ,总共花费" + (end - start) + "毫秒" + "查询到"                + hits.totalHits + "个记录");        for (ScoreDoc scoreDoc : hits.scoreDocs) {            Document doc = is.doc(scoreDoc.doc);// 根据文档的标识获取文档            System.out.println(doc.get("fullPath"));        }        reader.close();    }    /**     * 执行这个main方法进行查询之前,必须要有索引,即先执行Indexer这个类     *      * @param args     */    public static void main(String[] args) {        String indexDir = "D:\\lucene";        String q = "ADD";        try {            search(indexDir, q);        } catch (Exception e) {            
e.printStackTrace();        }    }}

 

lucene创建索引的几种方式(一)