首页 > 代码库 > 探索Lucene.Net全文检索
探索Lucene.Net全文检索
在CSDN,博客园找了一番Lucene.Net相关资料后,最后发现还是没有自己想要的,毕竟lucene.net版本一直在变,这里我用的是Lucene.Net 3.0的版本,demo是在http://www.dotlucene.net/ 网站里面找到的。方法很完善,API也比较全面,http://www.dotlucene.net/30648/lucene-net-api-search-demo。
不过demo里面讲解是检索文件内容服务,而我想要的是检索数据内容服务,我就在它的基础上改了一番。
过程:
1.建立索引,更新索引,删除索引
/// <summary>
/// Retained from the original sample so existing references still compile;
/// the handlers below no longer use it and leave it null.
/// </summary>
protected luseneTxt m = null;

protected void Page_Load(object sender, EventArgs e)
{
}

/// <summary>
/// Builds <paramref name="count"/> identical sample records used to exercise the indexer.
/// </summary>
/// <param name="count">Number of records to generate.</param>
/// <returns>A list holding <paramref name="count"/> sample records.</returns>
private static List<luseneTxt> BuildSampleData(int count)
{
    List<luseneTxt> items = new List<luseneTxt>(count);
    for (int i = 0; i < count; i++)
    {
        items.Add(new luseneTxt
        {
            text = "test",
            path = "http://www.baidu.com/?i=",
            title = "mofijeck ",
            des = "12",
            keyword = "34"
        });
    }
    return items;
}

/// <summary>
/// Creates the index from 99,999 sample records and shows the elapsed time in Label1.
/// </summary>
protected void creatIndexBt_Click(object sender, EventArgs e)
{
    // Stopwatch is used instead of DateTime.Now subtraction so clock adjustments cannot skew the figure.
    System.Diagnostics.Stopwatch timer = System.Diagnostics.Stopwatch.StartNew();

    LuceneSearch ls = new LuceneSearch();
    ls.CreatIndexByData(BuildSampleData(99999));

    timer.Stop();
    Label1.Text = "建立索引耗时" + timer.Elapsed.TotalSeconds + "秒";
}

/// <summary>
/// Deletes every document from the index and shows the elapsed time in Label1.
/// </summary>
protected void deleteIndexBt_Click(object sender, EventArgs e)
{
    System.Diagnostics.Stopwatch timer = System.Diagnostics.Stopwatch.StartNew();

    LuceneSearch ls = new LuceneSearch();
    ls.DeleteIndex();

    timer.Stop();
    Label1.Text = "删除索引耗时" + timer.Elapsed.TotalSeconds + "秒";
}

/// <summary>
/// Adds 999,999 sample records to the existing index and shows the elapsed time in Label1.
/// </summary>
protected void updateIndexBt_Click(object sender, EventArgs e)
{
    System.Diagnostics.Stopwatch timer = System.Diagnostics.Stopwatch.StartNew();

    LuceneSearch ls = new LuceneSearch();
    ls.UpdateIndexByData(BuildSampleData(999999));

    timer.Stop();
    // The original reported this as index *creation* time (copy-paste slip); it is the update time.
    Label1.Text = "更新索引耗时" + timer.Elapsed.TotalSeconds + "秒";
}
2.搜索
其实这里说白了就是增删改查,没有比较特别的东西,但是关于全文检索原理上的东西我也不是特别懂,网上很多大牛有相关博客解释的。
/// <summary>
/// Runs a search for <paramref name="q"/>, binds the current page of hits to Repeater1
/// and fills <c>dateTimeMsg</c> and <c>pagerStr</c> for the page markup.
/// </summary>
/// <param name="q">The query text typed by the user; null or empty shows a validation message.</param>
protected void search(string q)
{
    if (string.IsNullOrEmpty(q))
    {
        Label1.Text = "不能为空";
        return;
    }

    System.Diagnostics.Stopwatch timer = System.Diagnostics.Stopwatch.StartNew();
    LuceneSearch ls = new LuceneSearch();

    // The pager emits 1-based "Page" values. A missing, malformed or out-of-range
    // value falls back to the first page instead of throwing from int.Parse.
    const int pageSize = 10;
    int pageIndex;
    if (!int.TryParse(Request["Page"], out pageIndex) || pageIndex < 1)
    {
        pageIndex = 1;
    }

    string colName = tbcol.Text.Trim();
    if (colName == "")
    {
        colName = "text";
    }
    tbcol.Text = colName;

    // LuceneSearch.Search takes a zero-based page number, hence the "- 1".
    Repeater1.DataSource = ls.Search(q, colName, pageSize, pageIndex - 1);
    Repeater1.DataBind();

    timer.Stop();
    dateTimeMsg = "耗时" + timer.Elapsed.TotalSeconds + "秒";

    // The query is URL-encoded because the pager copies it verbatim into href attributes
    // and inline JavaScript; raw user input there would allow markup/script injection.
    SqlPager sqlpager = new SqlPager("", pageSize, ls.getSearchCount(q, colName), "Search.aspx", "q=" + Server.UrlEncode(q), pageIndex);
    pagerStr = sqlpager.GetShowPageStr("2", 1, 10);
}
// Front-end markup:
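<div> <asp:repeater id="Repeater1" runat="server"> <ItemTemplate> <p><a href='<%# Eval("url") %>' class="link"><%# Eval("title") %></a><br /><%# Eval("lightText") %></p> </ItemTemplate> </asp:repeater> <%= dateTimeMsg %> <%= pagerStr %> </div>
源码如下: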
数据索引类(DataIndexer):
/// <summary>
/// Thin wrapper around a Lucene.Net <c>IndexWriter</c> that indexes <c>luseneTxt</c> records
/// (rows of data rather than files on disk).
/// </summary>
public class DataIndexer : System.IDisposable
{
    private IndexWriter writer;

    /// <summary>
    /// Opens a writer on <paramref name="directory"/> with the writer's <c>create</c> flag set to true.
    /// </summary>
    /// <param name="directory">File-system path of the index directory.</param>
    public DataIndexer(string directory)
        : this(directory, true)
    {
    }

    /// <summary>
    /// Opens a writer on <paramref name="directory"/>.
    /// </summary>
    /// <param name="directory">File-system path of the index directory.</param>
    /// <param name="create">Passed straight to the <c>IndexWriter</c> constructor's create argument.</param>
    public DataIndexer(string directory, bool create)
    {
        writer = new IndexWriter(FSDirectory.Open(directory), new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30), create, IndexWriter.MaxFieldLength.LIMITED);
        writer.UseCompoundFile = true;
    }

    /// <summary>
    /// Indexes every record in <paramref name="list"/>.
    /// </summary>
    /// <param name="list">Records to add; must not be null.</param>
    /// <exception cref="System.ArgumentNullException"><paramref name="list"/> is null.</exception>
    public void AddHtmlData(List<luseneTxt> list)
    {
        if (list == null)
        {
            throw new System.ArgumentNullException("list");
        }

        foreach (luseneTxt t in list)
        {
            AddHtmlDocument(t);
        }
    }

    /// <summary>
    /// Adds one record to the index as a document with the stored fields
    /// "text", "path", "title", "des" and "keyword".
    /// </summary>
    /// <param name="lt">The record to index; must not be null. Null property values are indexed as empty strings.</param>
    /// <exception cref="System.ArgumentNullException"><paramref name="lt"/> is null.</exception>
    public void AddHtmlDocument(luseneTxt lt)
    {
        if (lt == null)
        {
            throw new System.ArgumentNullException("lt");
        }

        Document doc = new Document();
        // The original stored lt.title under "text" and lt.text under "title";
        // each field now receives the property of the same name.
        doc.Add(new Field("text", lt.text ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED));
        // The path is indexed as a single un-analyzed term.
        doc.Add(new Field("path", lt.path ?? string.Empty, Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.Add(new Field("title", lt.title ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED));
        doc.Add(new Field("des", lt.des ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED));
        doc.Add(new Field("keyword", lt.keyword ?? string.Empty, Field.Store.YES, Field.Index.ANALYZED));
        writer.AddDocument(doc);
    }

    /// <summary>
    /// Optimizes the index, then disposes the writer.
    /// </summary>
    public void Close()
    {
        if (writer == null)
        {
            return;
        }

        try
        {
            writer.Optimize();
        }
        finally
        {
            // Dispose even if Optimize throws so the writer is not left open.
            Dispose();
        }
    }

    /// <summary>
    /// Deletes every document in the index. Call <see cref="Close"/> afterwards.
    /// </summary>
    public void Delete()
    {
        writer.DeleteAll();
    }

    /// <summary>
    /// Disposes the underlying writer without optimizing. Safe to call more than once.
    /// </summary>
    public void Dispose()
    {
        if (writer != null)
        {
            writer.Dispose();
            writer = null;
        }
    }
}
// Lucene.Net search class (LuceneSearch):
/// <summary>
/// Facade over the index: builds, extends and clears it, and runs paged, highlighted searches.
/// </summary>
public class LuceneSearch
{
    // Default index location.
    private string indexDirectory = System.Web.HttpContext.Current.Server.MapPath("~/App_Data/index");

    /// <summary>
    /// Uses the default index location, ~/App_Data/index.
    /// </summary>
    public LuceneSearch()
    {
    }

    /// <summary>
    /// Uses a custom index location.
    /// </summary>
    /// <param name="filePath">Virtual path of the index directory; it is resolved with Server.MapPath.</param>
    public LuceneSearch(string filePath)
    {
        indexDirectory = System.Web.HttpContext.Current.Server.MapPath(filePath);
    }

    /// <summary>
    /// Indexes files on disk.
    /// </summary>
    /// <param name="url">Virtual path of the directory that holds the files.</param>
    /// <param name="pattenRegex">File-name pattern, for example *.htm*.</param>
    public void CreatIndex(string url, string pattenRegex)
    {
        string dataDirectory = System.Web.HttpContext.Current.Server.MapPath(url);
        IntranetIndexer indexer = new IntranetIndexer(indexDirectory);
        try
        {
            indexer.AddDirectory(new DirectoryInfo(dataDirectory), pattenRegex);
        }
        finally
        {
            // Close even when indexing fails so the indexer is not left open.
            indexer.Close();
        }
    }

    /// <summary>
    /// Builds the index from data records, using the indexer's "create" mode.
    /// </summary>
    /// <param name="list">Records to index.</param>
    public void CreatIndexByData(List<luseneTxt> list)
    {
        DataIndexer indexer = new DataIndexer(indexDirectory);
        try
        {
            indexer.AddHtmlData(list);
        }
        finally
        {
            indexer.Close();
        }
    }

    /// <summary>
    /// Adds data records to the index using the indexer's non-create mode.
    /// Records are only added; nothing in this method removes or replaces earlier documents.
    /// </summary>
    /// <param name="list">Records to add.</param>
    public void UpdateIndexByData(List<luseneTxt> list)
    {
        DataIndexer indexer = new DataIndexer(indexDirectory, false);
        try
        {
            indexer.AddHtmlData(list);
        }
        finally
        {
            indexer.Close();
        }
    }

    /// <summary>
    /// Deletes every document from the index.
    /// </summary>
    public void DeleteIndex()
    {
        DataIndexer indexer = new DataIndexer(indexDirectory);
        try
        {
            indexer.Delete();
        }
        finally
        {
            indexer.Close();
        }
    }

    #region Search

    /// <summary>
    /// Parses <paramref name="q"/> with <paramref name="colname"/> as the default field.
    /// </summary>
    private static Query ParseQuery(string q, string colname, StandardAnalyzer analyzer)
    {
        var parser = new QueryParser(Lucene.Net.Util.Version.LUCENE_30, colname, analyzer);
        return parser.Parse(q);
    }

    /// <summary>
    /// Returns one page of hits for <paramref name="q"/>.
    /// </summary>
    /// <param name="q">Query text in Lucene query syntax.</param>
    /// <param name="colname">Default field to search when the query names none.</param>
    /// <param name="pageSize">Hits per page; values below 1 yield an empty list.</param>
    /// <param name="page">Zero-based page number; negative values are treated as 0.</param>
    /// <returns>
    /// The hits on the requested page, or an empty list when the page lies past the last hit.
    /// <c>lightText</c> holds highlighted fragments of the stored "text" field.
    /// </returns>
    public List<SearchLucene> Search(string q, string colname, int pageSize, int page)
    {
        List<SearchLucene> list = new List<SearchLucene>();
        if (pageSize <= 0)
        {
            return list;
        }
        if (page < 0)
        {
            page = 0;
        }

        // The original started at "page" rather than "page * pageSize", so every page
        // after the first overlapped the previous ones.
        int first = page * pageSize;
        int wanted = first + pageSize;

        var analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
        // Parse before opening the searcher so a malformed query cannot leak an open reader.
        Query query = ParseQuery(q, colname, analyzer);

        using (IndexSearcher searcher = new IndexSearcher(FSDirectory.Open(indexDirectory)))
        {
            // Fetch exactly as many top hits as this page needs. The original fetched a fixed 200
            // yet looped up to TotalHits, which overran ScoreDocs for larger result sets.
            TopDocs hits = searcher.Search(query, wanted);
            int last = wanted < hits.ScoreDocs.Length ? wanted : hits.ScoreDocs.Length;

            // Highlighter: matched terms are wrapped in a bold span, fragments are about 80 characters.
            IFormatter formatter = new SimpleHTMLFormatter("<span style=\"font-weight:bold;\">", "</span>");
            Highlighter highlighter = new Highlighter(formatter, new QueryScorer(query));
            highlighter.TextFragmenter = new SimpleFragmenter(80);

            for (int i = first; i < last; i++)
            {
                Document doc = searcher.Doc(hits.ScoreDocs[i].Doc);
                string text = doc.Get("text");

                string highText = string.Empty;
                if (!string.IsNullOrEmpty(text))
                {
                    // Guarded: StringReader rejects null, and a document may lack a stored "text" field.
                    TokenStream stream = analyzer.TokenStream("", new StringReader(text));
                    highText = highlighter.GetBestFragments(stream, text, 2, "...");
                }

                SearchLucene sm = new SearchLucene();
                sm.title = doc.Get("title");
                sm.des = doc.Get("des");
                sm.url = doc.Get("path");
                sm.text = text;
                sm.keyword = doc.Get("keyword");
                sm.lightText = highText;
                list.Add(sm);
            }
        }

        return list;
    }

    /// <summary>
    /// Returns the total number of hits for <paramref name="q"/>.
    /// </summary>
    /// <param name="q">Query text in Lucene query syntax.</param>
    /// <param name="colname">Default field to search when the query names none.</param>
    /// <returns>The <c>TotalHits</c> value reported by the searcher.</returns>
    public int getSearchCount(string q, string colname)
    {
        var analyzer = new StandardAnalyzer(Lucene.Net.Util.Version.LUCENE_30);
        Query query = ParseQuery(q, colname, analyzer);

        // The original never disposed this searcher.
        using (IndexSearcher searcher = new IndexSearcher(FSDirectory.Open(indexDirectory)))
        {
            // Only the total is needed, so a single top document is requested.
            return searcher.Search(query, 1).TotalHits;
        }
    }

    #endregion
}

/// <summary>
/// One data record to be indexed.
/// </summary>
public class luseneTxt
{
    public string title { set; get; }
    public string path { set; get; }
    public string text { set; get; }
    public string des { set; get; }
    public string keyword { set; get; }
}

/// <summary>
/// One search hit as returned by <c>LuceneSearch.Search</c>.
/// </summary>
public class SearchLucene
{
    public string title { set; get; }
    public string des { set; get; }
    public string keyword { set; get; }
    // Filled from the stored "path" field.
    public string url { set; get; }
    public string text { set; get; }
    // Highlighted fragments of the "text" field (HTML).
    public string lightText { set; get; }
}
// Paging class library:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace LuceneSolution.Code
{
    /// <summary>
    /// Renders the HTML for a pager (page numbers, first/previous/next/last links and a
    /// "go to page" box). It only formats; it performs no data access.
    /// Typical use: construct, then call GetShowPageStr("2", 3, 10).
    /// </summary>
    public class SqlPager
    {
        #region Construction

        /// <summary>
        /// Creates a pager with the default texts for <paramref name="Language"/>.
        /// </summary>
        /// <param name="Language">"EN" for English texts; anything else selects Chinese.</param>
        public SqlPager(string Language)
        {
            this.unitLa(Language);
        }

        /// <summary>
        /// Creates a fully configured pager.
        /// </summary>
        /// <param name="Language">"EN" for English texts; anything else selects Chinese.</param>
        /// <param name="MyPageSize">Records per page.</param>
        /// <param name="Counts">Total number of records.</param>
        /// <param name="TurnUrlStr">URL of the page the links point to (normally the current page).</param>
        /// <param name="MyParameters">
        /// Extra query-string parameters without a leading '?' or '&amp;'. The value is copied verbatim
        /// into href attributes and inline JavaScript, so the caller must URL-encode it.
        /// </param>
        /// <param name="PageIndex">Page to display, starting at 1.</param>
        public SqlPager(string Language, int MyPageSize, int Counts, string TurnUrlStr, string MyParameters, int PageIndex)
        {
            this.unitLa(Language);
            this.PageSize = MyPageSize;
            this.TotalRecord = Counts;
            this.TurnUrlStr = TurnUrlStr;
            this.Parameters = MyParameters;
            this.PageIndex = PageIndex;
            // PageString is left as chosen by unitLa. The original reassigned the Chinese
            // template here, which silently discarded the English one for "EN".
        }

        /// <summary>
        /// Resets every setting to its default and selects the texts for <paramref name="Language"/>.
        /// </summary>
        /// <param name="Language">"EN" for English texts; anything else selects Chinese.</param>
        protected void unitLa(string Language)
        {
            TurnUrlStr = "";
            PageSize = 20;
            PageIndex = 1;
            TotalRecord = 0;
            TotalPage = 0;
            TurnControl = "";
            PageNumber = "";
            ShowPageStr = "";

            if (Language == "EN")
            {
                PageString = "Page [PageIndex]/[TotalPage] Info [PageSize]/[TotalRecord] [FirstStr] [PrevStr] [PageNumber] [NextStr] [LastStr] [TurnControl]";
                FirstStr = "<<";
                PrevStr = "<";
                NextStr = ">";
                LastStr = ">>";
                NoRecord = "Total 0";
                ButtonStr = " GO ";
            }
            else
            {
                PageString = "第[PageIndex]页/共[TotalPage]页 第每[PageSize]条/共[TotalRecord]条 [FirstStr] [PrevStr] [PageNumber] [NextStr] [LastStr] [TurnControl]";
                FirstStr = "首页";
                PrevStr = "上一页";
                NextStr = "下一页";
                LastStr = "尾页";
                NoRecord = "共有 0 条信息";
                ButtonStr = "跳转";
            }
        }

        #endregion

        #region Public fields

        /// <summary>
        /// Output template; the bracketed tokens are replaced by GetShowPageStr.
        /// </summary>
        public string PageString;

        /// <summary>
        /// URL the pager links point to.
        /// </summary>
        public string TurnUrlStr;

        /// <summary>
        /// Extra query-string parameters, without a leading '?' or '&amp;'.
        /// </summary>
        public string Parameters;

        /// <summary>
        /// Records per page.
        /// </summary>
        public int PageSize;

        /// <summary>
        /// Page to display, starting at 1.
        /// </summary>
        public int PageIndex;

        /// <summary>
        /// Total number of records.
        /// </summary>
        public int TotalRecord;

        /// <summary>
        /// Label of the "first page" link.
        /// </summary>
        public string FirstStr;

        /// <summary>
        /// Label of the "previous page" link.
        /// </summary>
        public string PrevStr;

        /// <summary>
        /// Label of the "next page" link.
        /// </summary>
        public string NextStr;

        /// <summary>
        /// Label of the "last page" link.
        /// </summary>
        public string LastStr;

        /// <summary>
        /// Text returned when there are no records.
        /// </summary>
        public string NoRecord;

        /// <summary>
        /// Caption of the "go to page" button.
        /// </summary>
        public string ButtonStr;

        #endregion

        #region Private fields

        /// <summary>
        /// Total number of pages, computed by GetShowPageStr.
        /// </summary>
        private int TotalPage;

        /// <summary>
        /// HTML of the "go to page" input and button.
        /// </summary>
        private string TurnControl;

        /// <summary>
        /// HTML of the numbered page links (1 2 3 ...).
        /// </summary>
        private string PageNumber;

        /// <summary>
        /// The most recently rendered pager string.
        /// </summary>
        private string ShowPageStr;

        #endregion

        #region Rendering

        /// <summary>
        /// Renders the pager HTML.
        /// </summary>
        /// <param name="Style">"1" shows only first/previous/next/last; "2" also shows a run of page numbers.</param>
        /// <param name="PlaceIn">Style 2: position of the current page inside the run; values outside 1..ShowNum select position 3.</param>
        /// <param name="ShowNum">Style 2: how many page numbers to show at once.</param>
        /// <returns>NoRecord when TotalRecord is not positive; otherwise PageString with all tokens replaced.</returns>
        public string GetShowPageStr(string Style, int PlaceIn, int ShowNum)
        {
            TotalPage = (TotalRecord + PageSize - 1) / PageSize;

            // Clamp the requested page into 1..TotalPage.
            if (PageIndex < 1)
            {
                PageIndex = 1;
            }
            if (PageIndex > TotalPage)
            {
                PageIndex = TotalPage;
            }

            // "Go to page" box: Enter in the text box or a click on the button navigates.
            string goScript = "window.location.href='" + TurnUrlStr + "?Page=' + document.getElementById('txtPageGo').value + '&" + Parameters + "'";
            TurnControl = "<input value='" + PageIndex.ToString() + "' id='txtPageGo' name='txtPageGo' type='text' style='width:35px;' onkeydown=\"if(event.keyCode==13)" + goScript + "\"><input name='btnGo' type='button' id='btnGo' value='" + ButtonStr + "' onclick=\"javascript:" + goScript + "\">";

            // The original assigned NoRecord here and then overwrote it with the rendered
            // template, so the "no records" text could never be shown.
            if (TotalRecord <= 0)
            {
                ShowPageStr = NoRecord;
                return ShowPageStr;
            }

            // Work on local copies of the labels: wrapping the public fields in place, as the
            // original did, nested the anchors one level deeper on every further call.
            string first = FirstStr;
            string prev = PrevStr;
            string next = NextStr;
            string last = LastStr;
            if (TotalPage > 1)
            {
                if (PageIndex != 1)
                {
                    first = BuildLink(1, FirstStr);
                    prev = BuildLink(PageIndex - 1, PrevStr);
                }
                if (PageIndex != TotalPage)
                {
                    next = BuildLink(PageIndex + 1, NextStr);
                    last = BuildLink(TotalPage, LastStr);
                }
            }

            if (Style == "1")
            {
                PageNumber = "";
            }
            else if (Style == "2")
            {
                PageNumber = BuildPageNumbers(PlaceIn, ShowNum);
            }

            ShowPageStr = PageString;
            ShowPageStr = ShowPageStr.Replace("[PageIndex]", PageIndex.ToString());
            ShowPageStr = ShowPageStr.Replace("[TotalPage]", TotalPage.ToString());
            ShowPageStr = ShowPageStr.Replace("[PageSize]", PageSize.ToString());
            ShowPageStr = ShowPageStr.Replace("[TotalRecord]", TotalRecord.ToString());
            ShowPageStr = ShowPageStr.Replace("[FirstStr]", first);
            ShowPageStr = ShowPageStr.Replace("[PrevStr]", prev);
            ShowPageStr = ShowPageStr.Replace("[NextStr]", next);
            ShowPageStr = ShowPageStr.Replace("[LastStr]", last);
            ShowPageStr = ShowPageStr.Replace("[TurnControl]", TurnControl);
            ShowPageStr = ShowPageStr.Replace("[PageNumber]", PageNumber);
            return ShowPageStr;
        }

        /// <summary>
        /// Builds an anchor that navigates to <paramref name="page"/> and shows <paramref name="label"/>.
        /// </summary>
        private string BuildLink(int page, string label)
        {
            return "<a href=\"" + TurnUrlStr + "?Page=" + Convert.ToString(page) + "&" + Parameters + "\">" + label + "</a>";
        }

        /// <summary>
        /// Builds the style-2 run of page numbers: the current page in bold, the others as links.
        /// </summary>
        private string BuildPageNumbers(int PlaceIn, int ShowNum)
        {
            // Number of pages kept in front of the current one.
            int before = (PlaceIn < 1 || PlaceIn > ShowNum) ? 2 : PlaceIn - 1;

            int firstShown;
            int lastShown;
            if (TotalPage <= ShowNum)
            {
                // Everything fits into one run.
                firstShown = 1;
                lastShown = TotalPage;
            }
            else if (PageIndex <= before)
            {
                // Near the start: the run is pinned to page 1.
                firstShown = 1;
                lastShown = ShowNum;
            }
            else if (PageIndex >= TotalPage - ShowNum + before + 1)
            {
                // Near the end: the run is pinned to the last page.
                firstShown = TotalPage - ShowNum + 1;
                lastShown = TotalPage;
            }
            else
            {
                // In the middle: the run slides with the current page.
                firstShown = PageIndex - before;
                lastShown = PageIndex - before + ShowNum - 1;
            }

            StringBuilder html = new StringBuilder();
            for (int i = firstShown; i <= lastShown; i++)
            {
                if (i == PageIndex)
                {
                    // The current page is not a link.
                    html.Append("<b>").Append(i).Append("</b> ");
                }
                else
                {
                    html.Append(BuildLink(i, "[" + Convert.ToString(i) + "]")).Append(" ");
                }
            }
            return html.ToString();
        }

        #endregion

        #region Notes
        /*
         Generic .NET pager class (stored-procedure paging version; the display style is fully
         customizable, including Chinese/English texts).
         The idea: paging itself is done in the database by a stored procedure, so this class
         handles no data and only formats the page numbers. It works well together with
         SQL Server 2005 ROW_NUMBER().
         The display style is fully customizable; images work too, as long as the strings are
         built as image HTML.
         Screenshot: http://images.cnblogs.com/cnblogs_com/84ww/128905/r_PageStore.gif
        */
        #endregion

        #region Usage example
        /*
        protected void BindData()
        {
            SqlPager MyPage = new SqlPager("EN");           // create the pager
            MyPage.PageSize = MyPageSize;                   // records per page
            MyPage.TotalRecord = Counts;                    // total number of records
            MyPage.TurnUrlStr = "AskSearchList.aspx";       // page the links point to (current page)
            MyPage.Parameters = MyParameters;               // extra query-string parameters
            MyPage.PageIndex = PageIndex;                   // page to display
            // output template
            MyPage.PageString = "第[PageIndex]页/共[TotalPage]页 第每[PageSize]条/共[TotalRecord]条 [FirstStr] [PrevStr] [PageNumber] [NextStr] [LastStr] [TurnControl]";
            string strPage = MyPage.GetShowPageStr("2", 3, 10);
            labShowPage.Text = strPage;
        }
        */
        #endregion
    }
}
// That is all of the code. If you have a better way to write it or other suggestions, feedback is welcome.
探索Lucene.Net全文检索
声明:以上内容来自用户投稿及互联网公开渠道收集整理发布,本网站不拥有所有权,未作人工编辑处理,也不承担相关法律责任,若内容有误或涉及侵权可进行投诉: 投诉/举报 工作人员会在5个工作日内联系你,一经查实,本站将立刻删除涉嫌侵权内容。