首页 > 代码库 > lucene+IKAnalyzer实现中文纯文本检索系统

lucene+IKAnalyzer实现中文纯文本检索系统

首先IntelliJ IDEA中搭建Maven项目(web):spring+SpringMVC+Lucene+IKAnalyzer

spring+SpringMVC搭建项目可以参考我的博客

整合Lucene 4.9.0

pom.xml添加lucene依赖

properties标签添加<lucene.version>4.9.0</lucene.version>

dependencies添加:
 1 <!-- lucene start --> 2       <dependency> 3           <groupId> org.apache.lucene</groupId> 4           <artifactId>lucene-analyzers-common</artifactId> 5           <version> ${lucene.version}</version> 6       </dependency> 7  8       <dependency> 9           <groupId> org.apache.lucene</groupId>10           <artifactId>lucene-core</artifactId>11           <version> ${lucene.version}</version>12       </dependency>13 14       <dependency>15           <groupId> org.apache.lucene</groupId>16           <artifactId>lucene-highlighter</artifactId>17           <version> ${lucene.version}</version>18       </dependency>19 20       <dependency>21           <groupId> org.apache.lucene</groupId>22           <artifactId>lucene-queryparser</artifactId>23           <version> ${lucene.version}</version>24       </dependency>25 <!-- lucene end -->

整合IKAnalyzer 2012FF_hf1,中文分词器的版本要和Lucene的版本对应,Lucene 4.X对应IKAnalyzer 2012FF版本

maven依赖配置参考我的博客

将IKAnalyzer的配置文件拷到resources目录里,如图:

技术分享

IKAnalyzer.cfg.xml可以配置扩展词典以及停用词典,其它文件为自定义的停用词典

 1 <?xml version="1.0" encoding="UTF-8"?> 2 <!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">   3 <properties>   4     <comment>IK Analyzer 扩展配置</comment> 5     <!--用户可以在这里配置自己的扩展字典  6     <entry key="ext_dict">ext.dic;</entry>  7     --> 8     <!--用户可以在这里配置自己的扩展停止词字典--> 9     <entry key="ext_stopwords">classpath:stopword.dic;classpath:x-noise-charactor.dic;classpath:x-noise-word.dic;</entry>10     11 </properties>

 

好了,到此项目就配置好了,接下来进行Lucene的核心操作——建索引和检索

IndexService.java

 1 package com.ssm.demo.web.service; 2  3 import com.ssm.demo.core.dto.DocDto; 4 import org.springframework.ui.Model; 5  6 import java.util.List; 7  8 /** 9  * Describe: sevice接口10  * Author: ouym11  * Created Date: 2016/11/30.12  */13 public interface IndexService {14 15     /**16      * 构建索引,传入参数:文档路径17      * @param path18      * @return19      */20     public boolean createIndex(String path);21 22 23     /**24      * 通过query查询索引25      * @param query26      */27     public List<DocDto> searchIndex(String query, Model model);28 }

IndexServiceImpl.java

  1 package com.ssm.demo.web.service.impl;  2   3 import com.ssm.demo.core.constants.MyConstant;  4 import com.ssm.demo.core.dto.DocDto;  5 import com.ssm.demo.core.util.MyFileUtil;  6 import com.ssm.demo.web.service.IndexService;  7 import org.apache.lucene.analysis.Analyzer;  8 import org.apache.lucene.analysis.standard.StandardAnalyzer;  9 import org.apache.lucene.document.Document; 10 import org.apache.lucene.document.Field; 11 import org.apache.lucene.document.TextField; 12 import org.apache.lucene.index.DirectoryReader; 13 import org.apache.lucene.index.IndexWriter; 14 import org.apache.lucene.index.IndexWriterConfig; 15 import org.apache.lucene.queryparser.classic.QueryParser; 16 import org.apache.lucene.search.IndexSearcher; 17 import org.apache.lucene.search.Query; 18 import org.apache.lucene.search.ScoreDoc; 19 import org.apache.lucene.search.highlight.Highlighter; 20 import org.apache.lucene.search.highlight.QueryScorer; 21 import org.apache.lucene.search.highlight.SimpleHTMLFormatter; 22 import org.apache.lucene.store.Directory; 23 import org.apache.lucene.store.FSDirectory; 24 import org.apache.lucene.util.Version; 25 import org.springframework.stereotype.Service; 26 import org.springframework.ui.Model; 27 import org.wltea.analyzer.lucene.IKAnalyzer; 28  29 import java.io.File; 30 import java.nio.file.Paths; 31 import java.util.ArrayList; 32 import java.util.Date; 33 import java.util.List; 34  35 import static com.ssm.demo.core.util.MyFileUtil.*; 36  37 /** 38  * Describe: description of this class 39  * Author: ouym 40  * Created Date: 2016/11/30. 
41  */ 42 @Service("indexService") 43 public class IndexServiceImpl implements IndexService { 44  45     public boolean createIndex(String path) { 46         Date date1 = new Date(); 47         List<File> fileList = getFileList(path); 48         File indexFile = new File(MyConstant.INDEX_PATH); 49         //避免重复索引 50         if (indexFile.exists()){ 51             MyFileUtil.deleteDir(indexFile); 52         }else { 53             indexFile.mkdirs(); 54         } 55         String content=""; 56         Analyzer analyzer = null; 57         Directory directory = null; 58         IndexWriter indexWriter = null; 59  60         for (File file : fileList) { 61             content = ""; 62             //获取文件后缀,只对.doc和.txt文件建索引 63             String type = file.getName().substring(file.getName().lastIndexOf(".")+1); 64             if("txt".equalsIgnoreCase(type)){ 65                 content += txt2String(file); 66             }else if("doc".equalsIgnoreCase(type)){ 67                 content += doc2String(file); 68             } 69  70             try{ 71  72                 //使用第三方中文分词器IKAnalyzer 73                 analyzer = new IKAnalyzer(true); 74                 directory = FSDirectory.open(new File(MyConstant.INDEX_PATH)); 75                 IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_CURRENT,analyzer); 76                 indexWriter = new IndexWriter(directory, config); 77  78                 Document document = new Document(); 79                 document.add(new TextField("filename", file.getName(), Field.Store.YES)); 80                 document.add(new TextField("content", content, Field.Store.YES)); 81                 document.add(new TextField("path", file.getPath(), Field.Store.YES)); 82                 indexWriter.addDocument(document); 83                 indexWriter.commit(); 84                 indexWriter.close(); 85  86             }catch(Exception e){ 87                 e.printStackTrace(); 88             } 89             content = ""; 
90         } 91         Date date2 = new Date(); 92         System.out.println("创建索引-----耗时:" + (date2.getTime() - date1.getTime()) + "ms\n"); 93         return false; 94     } 95  96     public List<DocDto> searchIndex(String queryStr,Model model) { 97  98         Date date1 = new Date(); 99         Analyzer analyzer = null;100         Directory directory = null;101         IndexWriter indexWriter = null;102         String prefixHTML = "<font color=‘red‘>";103         String suffixHTML = "</font>";104         List<DocDto> docDtoList = new ArrayList<>();105         try{106             directory = FSDirectory.open(new File(MyConstant.INDEX_PATH));107             //analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);108             analyzer = new IKAnalyzer(true);109             DirectoryReader ireader = DirectoryReader.open(directory);110             IndexSearcher isearcher = new IndexSearcher(ireader);111 112             QueryParser parser = new QueryParser(Version.LUCENE_CURRENT,"content", analyzer);113             Query query = parser.parse(queryStr);114 115             ScoreDoc[] hits = isearcher.search(query, null, 10).scoreDocs;116             //ScoreDoc[] hits = isearcher.search(query, 10).scoreDocs;117 118             for (int i = 0; i < hits.length; i++) {119                 DocDto docDto = new DocDto();120                 Document hitDoc = isearcher.doc(hits[i].doc);121                 //自动摘要,查询关键词高亮122                 SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter(prefixHTML, suffixHTML);123                 Highlighter highlighter = new Highlighter(simpleHTMLFormatter,new QueryScorer(query));124                 String highLightText = highlighter.getBestFragment(analyzer,"content",hitDoc.get("content"));125 126                 docDto.setDocName(hitDoc.get("filename"));127                 String path = hitDoc.get("path");128                 path = path.replaceAll("\\\\", "/");129                 docDto.setDocPath(path);130             
    docDto.setDocAbstract(highLightText+"...");131                 docDtoList.add(docDto);132             }133             ireader.close();134             directory.close();135         }catch(Exception e){136             e.printStackTrace();137         }138         Date date2 = new Date();139         //System.out.println("查看索引-----耗时:" + (date2.getTime() - date1.getTime()) + "ms\n");140         model.addAttribute("spendTimes",(date2.getTime() - date1.getTime()));141         return docDtoList;142     }143 }

省略了一些常量类和自定义的工具类,接下来只需要在controller里面调用service就行了

 1 @RequestMapping("/index") 2     public String index(@RequestParam("wd")String wd, Model model){ 3  4         //建立索引 5         indexService.createIndex(MyConstant.DATA_PATH); 6         if (wd.trim().equals("")){ 7             return "redirect:/index/index"; 8         } 9 10         List<DocDto> docDtoList = indexService.searchIndex(wd,model);11         if (!StringUtils.isEmpty(wd)) {12             model.addAttribute("query",wd);13         }14         model.addAttribute("docDtoList",docDtoList);15         model.addAttribute("listSize",docDtoList.size());16         return "result";17     }

我的测试文档集是30篇doc文档,然后自己简单仿百度首页做了一个界面,效果图如下:

首页:

技术分享

检索结果:

技术分享

结果按相关度排好序了~

 

实现过程中注意事项:

1.中文分词器的版本要和Lucene的版本对应,Lucene 4.X对应IKAnalyzer  2012FF版本

2.Maven仓库中没有IKAnalyzer 的jar包依赖,需要自己手动添加本地jar包

3. IKAnalyzer 分词器有自己的智能切词优化,声明时参数为true即可开启:analyzer = new IKAnalyzer(true);

若要添加自己的词典和停用词典,将true改为false效果可能更好(有待确认)。analyzer = new IKAnalyzer(false);

lucene+IKAnalyzer实现中文纯文本检索系统