首页 > 代码库 > 记一次企业级爬虫系统升级改造(五):基于JieBaNet+Lucene.Net实现全文搜索
记一次企业级爬虫系统升级改造(五):基于JieBaNet+Lucene.Net实现全文搜索
实现效果:
上一篇文章有附全文搜索结果的设计图,下面截一张开发完成上线后的实图:
基本风格是模仿的百度搜索结果,绿色的分页略显小清新。
目前已采集并创建索引的文章约3W多篇,索引文件不算太大,查询速度非常棒。
刀不磨要生锈,人不学要落后。每天都要学一些新东西。
基本技术介绍:
还记得上一次做全文搜索是在2013年,主要核心设计与代码均是当时的架构师写的,自己只能算是全程参与。
当时使用的是经典搭配:盘古分词+Lucene.net。
前几篇文章有说到,盘古分词已经很多年不更新了,我在SupportYun系统一直引用的JieBaNet来做分词技术。
那么是否也有成型的JieBaNet+Lucene.Net的全文搜索方案呢?
经过多番寻找,在GitHub上面找到一个简易的例子:https://github.com/anderscui/jiebaForLuceneNet
博主下面要讲的实现方案就是从这个demo得到的启发,大家有兴趣可以去看看这个demo。
博主使用的具体版本:Lucene.net 3.0.3.0 ,JieBaNet 0.38.3.0(做过简易的调整与扩展,前面文章有讲到)
首先我们对Lucene.Net的分词器Tokenizer、分析器Analyzer做一个基于JieBaNet的扩展。
1.基于LuceneNet扩展的JieBa分析器JiebaForLuceneAnalyzer
/// <summary>
/// Lucene.Net analyzer that tokenizes with JiebaNet, lower-cases every token
/// and then drops stop words.
/// </summary>
public class JiebaForLuceneAnalyzer : Analyzer
{
    /// <summary>Fallback stop words used when the Jieba stop-word file is not found.</summary>
    protected static readonly ISet<string> DefaultStopWords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;

    /// <summary>Stop words applied by every analyzer instance; populated once by the static constructor.</summary>
    private static ISet<string> StopWords;

    static JiebaForLuceneAnalyzer()
    {
        StopWords = LoadStopWords();
    }

    /// <summary>
    /// Reads one stop word per line (trimmed) from the file configured in JiebaNet,
    /// or returns <see cref="DefaultStopWords"/> when that file does not exist.
    /// </summary>
    private static ISet<string> LoadStopWords()
    {
        var path = Path.GetFullPath(JiebaNet.Analyser.ConfigManager.StopWordsFile);
        if (!File.Exists(path))
        {
            return DefaultStopWords;
        }

        ISet<string> words = new HashSet<string>();
        foreach (var entry in File.ReadAllLines(path))
        {
            words.Add(entry.Trim());
        }
        return words;
    }

    /// <summary>
    /// Builds the token pipeline for a field: Jieba tokenizer, then lower-case filter, then stop filter.
    /// </summary>
    /// <param name="fieldName">Name of the field being analyzed (not used by this analyzer).</param>
    /// <param name="reader">Source of the text to analyze.</param>
    /// <returns>The filtered token stream.</returns>
    public override TokenStream TokenStream(string fieldName, TextReader reader)
    {
        TokenStream source = new JiebaForLuceneTokenizer(new JiebaSegmenter(), reader);
        TokenStream lowered = new LowerCaseFilter(source);
        return new StopFilter(true, lowered, StopWords);
    }
}
2.基于LuceneNet扩展的JieBa分词器:JiebaForLuceneTokenizer
/// <summary>
/// Lucene.Net tokenizer backed by a JiebaNet segmenter. The whole input is
/// segmented in the constructor; <see cref="IncrementToken"/> then replays the
/// buffered tokens one at a time.
/// </summary>
public class JiebaForLuceneTokenizer : Tokenizer
{
    private readonly JiebaSegmenter segmenter;
    private readonly ITermAttribute termAtt;
    private readonly IOffsetAttribute offsetAtt;
    private readonly ITypeAttribute typeAtt;

    // Tokens produced up front from the input text.
    private readonly List<Token> tokens;

    // Index of the token most recently emitted; -1 before the first call.
    private int position = -1;

    /// <summary>Creates a tokenizer over everything that can be read from <paramref name="input"/>.</summary>
    public JiebaForLuceneTokenizer(JiebaSegmenter seg, TextReader input)
        : this(seg, input.ReadToEnd())
    {
    }

    /// <summary>Creates a tokenizer over <paramref name="input"/>, segmented in search mode.</summary>
    public JiebaForLuceneTokenizer(JiebaSegmenter seg, string input)
    {
        segmenter = seg;

        termAtt = AddAttribute<ITermAttribute>();
        offsetAtt = AddAttribute<IOffsetAttribute>();
        typeAtt = AddAttribute<ITypeAttribute>();

        tokens = segmenter.Tokenize(input, TokenizerMode.Search).ToList();
    }

    /// <summary>
    /// Advances to the next buffered token and copies its text, offsets and type
    /// into the stream attributes.
    /// </summary>
    /// <returns><c>true</c> when a token was emitted; <c>false</c> once the buffer is exhausted.</returns>
    public override bool IncrementToken()
    {
        ClearAttributes();
        position++;

        if (position >= tokens.Count)
        {
            End();
            return false;
        }

        var current = tokens[position];
        termAtt.SetTermBuffer(current.Word);
        offsetAtt.SetOffset(current.StartIndex, current.EndIndex);
        typeAtt.Type = "Jieba";
        return true;
    }

    /// <summary>Segments arbitrary text with the wrapped segmenter, independent of the buffered stream.</summary>
    /// <param name="text">Text to segment.</param>
    /// <param name="mode">Segmentation mode; defaults to search mode.</param>
    public IEnumerable<Token> Tokenize(string text, TokenizerMode mode = TokenizerMode.Search)
    {
        return segmenter.Tokenize(text, mode);
    }
}
理想如果不向现实做一点点屈服,那么理想也将归于尘土。
实现方案设计:
我们做全文搜索的设计时一定会考虑的一个问题就是:我们系统是分很多模块的,不同模块的字段差异很大,怎么才能实现同一个索引,既可以单个模块搜索又可以全站搜索,甚至按一些字段做条件来搜索呢?
这些也是SupportYun系统需要考虑的问题,因为目前的数据就天然的拆分成了活动、文章两个类别,字段也大有不同。博主想实现的是一个可以全站搜索(结果包括活动、文章),也可以在文章栏目/活动栏目分别搜索,并且可以按几个指定字段来做搜索条件。
要做一个这样的全文搜索功能,我们需要从程序设计上来下功夫。下面就介绍一下博主的设计方案:
一、索引创建
1.我们设计一个IndexManager来处理最基本的索引创建、更新、删除操作。
/// <summary>
/// Adds, replaces and deletes documents in the Lucene index stored at
/// <see cref="IndexStorePath"/>. Every operation opens its own directory and
/// writer, commits, optimizes and closes them again, so an instance holds no
/// open handles between calls and can be reused safely.
/// </summary>
public class IndexManager
{
    /// <summary>
    /// Index storage directory, read from the "IndexStorePath" app setting.
    /// </summary>
    public static readonly string IndexStorePath = ConfigurationManager.AppSettings["IndexStorePath"];

    // Dedicated gate for index writes. The writer is opened INSIDE this lock so two
    // callers never try to open a writer on the same directory at the same time.
    private static readonly object WriterLock = new object();

    /// <summary>
    /// Adds one document per entry of <paramref name="indexContents"/>.
    /// Failures are logged through <c>LogUtils.ErrorLog</c> and not rethrown.
    /// </summary>
    /// <param name="indexContents">Contents to index.</param>
    public void BuildIndex(List<IndexContent> indexContents)
    {
        ExecuteWithWriter(writer =>
        {
            foreach (var indexContent in indexContents)
            {
                writer.AddDocument(GetDocument(indexContent));
            }
        });
    }

    /// <summary>
    /// Deletes every document matching the given module, row id and (optionally) table name.
    /// Failures are logged and not rethrown.
    /// </summary>
    /// <param name="moduleType">Value of the "ModuleType" field to match.</param>
    /// <param name="tableName">Value of the "TableName" field to match; ignored when null or empty.</param>
    /// <param name="rowID">Value of the "RowId" field to match (compared lower-cased, as it is stored).</param>
    public void DeleteIndex(string moduleType, string tableName, string rowID)
    {
        ExecuteWithWriter(writer =>
            writer.DeleteDocuments(MakeIdentityQuery(moduleType, tableName, rowID)));
    }

    /// <summary>
    /// Replaces the document(s) identified by the content's module, row id and table name
    /// with a freshly built document. Failures are logged and not rethrown.
    /// </summary>
    /// <param name="indexContent">New content for the document.</param>
    public void UpdateIndex(IndexContent indexContent)
    {
        ExecuteWithWriter(writer =>
        {
            writer.DeleteDocuments(MakeIdentityQuery(
                indexContent.ModuleType, indexContent.TableName, indexContent.RowId.ToString()));
            writer.AddDocument(GetDocument(indexContent));
        });
    }

    /// <summary>
    /// Runs <paramref name="work"/> against a writer opened for the duration of the call,
    /// then commits and optimizes. The writer is disposed before the directory, exactly once each.
    /// </summary>
    private static void ExecuteWithWriter(Action<IndexWriter> work)
    {
        try
        {
            lock (WriterLock)
            {
                using (var directory = FSDirectory.Open(new DirectoryInfo(IndexStorePath)))
                using (var writer = new IndexWriter(directory, new JiebaForLuceneAnalyzer(),
                    IndexWriter.MaxFieldLength.LIMITED))
                {
                    work(writer);
                    writer.Commit();
                    writer.Optimize();
                }
            }
        }
        catch (Exception exception)
        {
            // Best effort by design: indexing problems must not break the calling business flow.
            LogUtils.ErrorLog(exception);
        }
    }

    /// <summary>
    /// Builds the query that identifies a document: ModuleType AND RowId, plus TableName when supplied.
    /// </summary>
    private static BooleanQuery MakeIdentityQuery(string moduleType, string tableName, string rowId)
    {
        var query = new BooleanQuery
        {
            {new TermQuery(new Term("ModuleType", moduleType)), Occur.MUST},
            {new TermQuery(new Term("RowId", NormalizeRowId(rowId))), Occur.MUST}
        };
        if (!string.IsNullOrEmpty(tableName))
        {
            query.Add(new TermQuery(new Term("TableName", tableName)), Occur.MUST);
        }
        return query;
    }

    /// <summary>
    /// Lower-cases a row id. "RowId" is indexed NOT_ANALYZED, so the stored term and the
    /// lookup term must go through the same normalisation to match.
    /// </summary>
    private static string NormalizeRowId(string rowId)
    {
        return rowId == null ? null : rowId.ToLowerInvariant();
    }

    /// <summary>
    /// Maps an <see cref="IndexContent"/> to a Lucene document: six base fields,
    /// eight configurable text tags and two configurable float tags.
    /// </summary>
    private Document GetDocument(IndexContent indexContent)
    {
        var doc = new Document();
        doc.Add(new Field("ModuleType", indexContent.ModuleType, Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.Add(new Field("TableName", indexContent.TableName, Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.Add(new Field("RowId", NormalizeRowId(indexContent.RowId.ToString()), Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.Add(new Field("Title", indexContent.Title, Field.Store.YES, Field.Index.ANALYZED));
        doc.Add(new Field("IndexTextContent", ReplaceIndexSensitiveWords(indexContent.IndexTextContent), Field.Store.YES, Field.Index.ANALYZED));
        // Stored for display only; not searchable.
        doc.Add(new Field("CollectTime", indexContent.CollectTime.ToString("yyyy-MM-dd HH:mm:ss"), Field.Store.YES, Field.Index.NO));

        // Reserved, per-module fields.
        AddTextTag(doc, "Tag1", indexContent.Tag1);
        AddTextTag(doc, "Tag2", indexContent.Tag2);
        AddTextTag(doc, "Tag3", indexContent.Tag3);
        AddTextTag(doc, "Tag4", indexContent.Tag4);
        AddTextTag(doc, "Tag5", indexContent.Tag5);
        AddTextTag(doc, "Tag6", indexContent.Tag6);
        AddTextTag(doc, "Tag7", indexContent.Tag7);
        AddTextTag(doc, "Tag8", indexContent.Tag8);
        AddFloatTag(doc, "FloatTag9", indexContent.FloatTag9);
        AddFloatTag(doc, "FloatTag10", indexContent.FloatTag10);
        return doc;
    }

    /// <summary>Adds a reserved text field using the tag's own store/index settings.</summary>
    private void AddTextTag(Document doc, string name, IndexContentStringValue tag)
    {
        doc.Add(new Field(name, tag.Value, GetStoreEnum(tag.Store), GetIndexEnum(tag.Index)));
    }

    /// <summary>Adds a reserved numeric field; it is indexed unless the tag says NotIndex.</summary>
    private void AddFloatTag(Document doc, string name, IndexContentFloatValue tag)
    {
        var field = new NumericField(name, GetStoreEnum(tag.Store), tag.Index != IndexEnum.NotIndex);
        doc.Add(field.SetFloatValue(tag.Value));
    }

    /// <summary>
    /// Stop-gap helper: strips spaces and line feeds from text before it is indexed.
    /// </summary>
    /// <param name="str">Text to clean.</param>
    /// <returns>The cleaned text.</returns>
    private string ReplaceIndexSensitiveWords(string str)
    {
        // NOTE(review): the two space literals below look identical in this copy of the code.
        // One of them was presumably a different whitespace character or an HTML entity
        // before the text was extracted - confirm against the project source. The three
        // passes are kept unchanged for that reason.
        for (var i = 0; i < 3; i++)
        {
            str = str.Replace(" ", "");
            str = str.Replace(" ", "").Replace("\n", "");
        }
        return str;
    }

    /// <summary>Translates the project's <see cref="IndexEnum"/> into Lucene's index mode; unknown values are not indexed.</summary>
    private Field.Index GetIndexEnum(IndexEnum index)
    {
        switch (index)
        {
            case IndexEnum.NotUseAnalyzerButIndex:
                return Field.Index.NOT_ANALYZED;
            case IndexEnum.UseAnalyzerIndex:
                return Field.Index.ANALYZED;
            case IndexEnum.NotIndex:
            default:
                return Field.Index.NO;
        }
    }

    /// <summary>Translates a store flag into Lucene's store mode.</summary>
    private Field.Store GetStoreEnum(bool store)
    {
        return store ? Field.Store.YES : Field.Store.NO;
    }
}
2.创建、更新使用到的标准数据类:IndexContent。
我们设计TableName(对应DB表名)、RowId(对应DB主键)、CollectTime(对应DB数据创建时间)、ModuleType(所属系统模块)、Title(检索标题)、IndexTextContent(检索文本)等六个基础字段,所有模块需要创建索引必须构建该6个字段(大家可据具体情况扩展)。
然后设计10个预留字段(Tag1-Tag8为文本型,FloatTag9、FloatTag10为数值型),用以兼容各大模块其他不同字段。
预留字段的存储、索引方式可独立配置。
/// <summary>
/// Index content with ten reserved fields on top of the base fields:
/// eight text tags and two float tags.
/// </summary>
public class IndexContent : BaseIndexContent
{
    /// <summary>Initialises every reserved field with its default value object, so none is ever null.</summary>
    public IndexContent()
    {
        Tag1 = new IndexContentStringValue();
        Tag2 = new IndexContentStringValue();
        Tag3 = new IndexContentStringValue();
        Tag4 = new IndexContentStringValue();
        Tag5 = new IndexContentStringValue();
        Tag6 = new IndexContentStringValue();
        Tag7 = new IndexContentStringValue();
        Tag8 = new IndexContentStringValue();
        FloatTag9 = new IndexContentFloatValue();
        FloatTag10 = new IndexContentFloatValue();
    }

    /// <summary>Reserved text field 1.</summary>
    public IndexContentStringValue Tag1 { get; set; }

    /// <summary>Reserved text field 2.</summary>
    public IndexContentStringValue Tag2 { get; set; }

    /// <summary>Reserved text field 3.</summary>
    public IndexContentStringValue Tag3 { get; set; }

    /// <summary>Reserved text field 4.</summary>
    public IndexContentStringValue Tag4 { get; set; }

    /// <summary>Reserved text field 5.</summary>
    public IndexContentStringValue Tag5 { get; set; }

    /// <summary>Reserved text field 6.</summary>
    public IndexContentStringValue Tag6 { get; set; }

    /// <summary>Reserved text field 7.</summary>
    public IndexContentStringValue Tag7 { get; set; }

    /// <summary>Reserved text field 8.</summary>
    public IndexContentStringValue Tag8 { get; set; }

    /// <summary>Reserved numeric field 9.</summary>
    public IndexContentFloatValue FloatTag9 { get; set; }

    /// <summary>Reserved numeric field 10.</summary>
    public IndexContentFloatValue FloatTag10 { get; set; }
}

/// <summary>
/// A text value together with how it should be stored and indexed.
/// </summary>
public class IndexContentStringValue
{
    /// <summary>Defaults: empty string, stored, not indexed.</summary>
    public IndexContentStringValue()
    {
        Value = "";
        Store = true;
        Index = IndexEnum.NotIndex;
    }

    /// <summary>The text value.</summary>
    public string Value { get; set; }

    /// <summary>Whether the value is stored in the index.</summary>
    public bool Store { get; set; }

    /// <summary>Indexing / analysis mode.</summary>
    public IndexEnum Index { get; set; }
}

/// <summary>
/// A float value together with how it should be stored and indexed.
/// </summary>
public class IndexContentFloatValue
{
    /// <summary>Defaults: zero, stored, not indexed.</summary>
    public IndexContentFloatValue()
    {
        Value = 0;
        Store = true;
        Index = IndexEnum.NotIndex;
    }

    /// <summary>The numeric value.</summary>
    public float Value { get; set; }

    /// <summary>Whether the value is stored in the index.</summary>
    public bool Store { get; set; }

    /// <summary>Indexing mode.</summary>
    public IndexEnum Index { get; set; }
}
其中BaseIndexContent含有六个基础字段。
3.创建一个子模块索引构建器的接口:IIndexBuilder。
各子模块通过实现IIndexBuilder接口,来完成各自索引的创建、更新与删除操作。
/// <summary>
/// Contract for a module-specific index builder.
/// </summary>
/// <typeparam name="TIndexContent">The module's own content type.</typeparam>
public interface IIndexBuilder<TIndexContent>
{
    /// <summary>
    /// Indexes a collection of contents.
    /// </summary>
    /// <param name="indexContents">Contents to index.</param>
    void BuildIndex(List<TIndexContent> indexContents);

    /// <summary>
    /// Removes an entry from the index.
    /// </summary>
    /// <param name="tableName">Name of the table the entry belongs to.</param>
    /// <param name="rowID">Id of the row the entry was built from.</param>
    void DeleteIndex(string tableName, string rowID);

    /// <summary>
    /// Refreshes the index entries for a collection of contents.
    /// </summary>
    /// <param name="indexContents">Contents whose entries are to be refreshed.</param>
    void UpdateIndex(List<TIndexContent> indexContents);
}
4.下面我们以活动模块为例,来实现索引创建。
a)首先创建一个基于活动模块的数据类:ActivityIndexContent,可以将我们需要索引或存储的字段都设计在内。
/// <summary>
/// Data carried into the index for one activity record.
/// </summary>
public class ActivityIndexContent
{
    /// <summary>Name of the related table.</summary>
    public string TableName { get; set; }

    /// <summary>Id of the related table row.</summary>
    public Guid RowId { get; set; }

    /// <summary>Time the record was collected and analysed.</summary>
    public DateTime CollectTime { get; set; }

    /// <summary>Title of the activity.</summary>
    public string Title { get; set; }

    /// <summary>Detail text of the activity.</summary>
    public string InformationContent { get; set; }

    /// <summary>Categories the activity belongs to.</summary>
    public List<ActivityType> ActivityTypes { get; set; }

    /// <summary>Id of the city associated with the activity.</summary>
    public Guid CityId { get; set; }

    /// <summary>Address of the activity.</summary>
    public string Address { get; set; }

    /// <summary>Date of the activity, when known.</summary>
    public DateTime? ActivityDate { get; set; }

    /// <summary>Link to the source page.</summary>
    public string Url { get; set; }

    /// <summary>Name of the collection source.</summary>
    public string SourceName { get; set; }

    /// <summary>Main site address of the collection source.</summary>
    public string SourceUrl { get; set; }

    /// <summary>Official hotline of the collection source.</summary>
    public string SourceOfficialHotline { get; set; }
}
b)我们再创建ActivityIndexBuilder并继承IIndexBuilder,实现其创建、更新、删除方法。
/// <summary>
/// Index builder for the activity module: maps <see cref="ActivityIndexContent"/>
/// onto the generic <see cref="IndexContent"/> and delegates to <see cref="IndexManager"/>.
/// </summary>
public class ActivityIndexBuilder : IIndexBuilder<ActivityIndexContent>
{
    /// <summary>Module type written into every activity document.</summary>
    public const string MODULETYPE = "活动";

    /// <summary>
    /// Adds the given activities to the index.
    /// </summary>
    /// <param name="activityIndexContents">Activities to index; an empty list is a no-op.</param>
    /// <exception cref="ArgumentNullException"><paramref name="activityIndexContents"/> is null.</exception>
    public void BuildIndex(List<ActivityIndexContent> activityIndexContents)
    {
        if (activityIndexContents == null)
        {
            throw new ArgumentNullException("activityIndexContents");
        }
        if (activityIndexContents.Count == 0)
        {
            // Nothing to add: skip opening the index writer altogether.
            return;
        }

        var indexContents = activityIndexContents.Select(ToIndexContent).ToList();
        new IndexManager().BuildIndex(indexContents);
    }

    /// <summary>
    /// Removes an activity from the index.
    /// </summary>
    /// <param name="tableName">Name of the table the activity belongs to.</param>
    /// <param name="rowID">Row id of the activity.</param>
    public void DeleteIndex(string tableName, string rowID)
    {
        var indexManager = new IndexManager();
        indexManager.DeleteIndex(MODULETYPE, tableName, rowID);
    }

    /// <summary>
    /// Refreshes the given activities: deletes the existing entry of every item that has a
    /// row id and table name, then adds all items again.
    /// </summary>
    /// <param name="indexContents">Activities to refresh.</param>
    /// <exception cref="ArgumentNullException"><paramref name="indexContents"/> is null.</exception>
    public void UpdateIndex(List<ActivityIndexContent> indexContents)
    {
        if (indexContents == null)
        {
            throw new ArgumentNullException("indexContents");
        }

        foreach (var indexContent in indexContents)
        {
            if (indexContent.RowId != Guid.Empty && indexContent.TableName != null)
            {
                DeleteIndex(indexContent.TableName, indexContent.RowId.ToString().ToLower());
            }
        }

        BuildIndex(indexContents);
    }

    /// <summary>
    /// Maps one activity onto the generic index content. Optional text values are
    /// coalesced to an empty string because a Lucene field value may not be null.
    /// </summary>
    private static IndexContent ToIndexContent(ActivityIndexContent activity)
    {
        return new IndexContent
        {
            ModuleType = MODULETYPE,
            TableName = activity.TableName,
            RowId = activity.RowId,
            Title = activity.Title,
            IndexTextContent = activity.InformationContent,
            CollectTime = activity.CollectTime,
            // Activity categories.
            Tag1 = new IndexContentStringValue
            {
                Value = activity.GetActivityTypeStr() ?? ""
            },
            // Source link.
            Tag2 = new IndexContentStringValue
            {
                Value = activity.Url ?? ""
            },
            // Collection source name (analysed and indexed).
            Tag3 = new IndexContentStringValue
            {
                Value = activity.SourceName ?? "",
                Index = IndexEnum.UseAnalyzerIndex
            },
            // Collection source official hotline.
            Tag4 = new IndexContentStringValue
            {
                Value = activity.SourceOfficialHotline ?? ""
            },
            // Collection source main site address.
            Tag5 = new IndexContentStringValue
            {
                Value = activity.SourceUrl ?? ""
            },
            // City id (indexed verbatim, lower-cased).
            Tag6 = new IndexContentStringValue
            {
                Value = activity.CityId.ToString().ToLower(),
                Index = IndexEnum.NotUseAnalyzerButIndex
            },
            // Activity address.
            Tag7 = new IndexContentStringValue
            {
                Value = activity.Address ?? ""
            },
            // Activity date, formatted for display; empty when unknown.
            Tag8 = new IndexContentStringValue
            {
                Value = activity.ActivityDate.HasValue
                    ? activity.ActivityDate.Value.ToString("yyyy年MM月dd日")
                    : ""
            }
        };
    }
}
代码就不解释了,很简单。主要就是调用IndexManager来执行操作。
我们只需要在需要创建活动数据索引的业务点,构建ActivityIndexBuilder对象,并构建ActivityIndexContent集合作为参数,调用BuildIndex方法即可。
二、全文搜索
全文搜索我们采用同样的设计方式。
1.设计一个抽象的搜索类:BaseIndexSearch,所有搜索模块(包括全站)均需继承它来实现搜索效果。
/// <summary>
/// Base class for full-text search over the shared Lucene index. Subclasses choose the
/// result item type, add module-specific filter terms and fill module-specific result fields.
/// </summary>
/// <typeparam name="TIndexSearchResultItem">Type of a single search result.</typeparam>
public abstract class BaseIndexSearch<TIndexSearchResultItem>
    where TIndexSearchResultItem : IndexSearchResultItem
{
    /// <summary>
    /// Index storage directory, read from the "IndexStorePath" app setting.
    /// </summary>
    private static readonly string IndexStorePath = ConfigurationManager.AppSettings["IndexStorePath"];

    // Guards creation and replacement of the shared searcher.
    private static readonly object SearcherLock = new object();

    // Characters removed from user input before it reaches the query parser. They are the
    // single-character operators of the Lucene query syntax (plus the 【】 brackets the
    // original code also stripped); left in place, an unbalanced quote, bracket or trailing
    // backslash makes the parser throw.
    // NOTE(review): the original listed most of these replacements twice with identical
    // literals; the second copy presumably targeted the full-width form of the same
    // character before the text was extracted - confirm against the project source.
    private const string QuerySyntaxCharacters = "+-!():^[]【】{}~*?\"\\";

    // Length of the plain-text snippet used when the highlighter finds no fragment.
    private const int FallbackSnippetLength = 100;

    private readonly string[] fieldsToSearch;
    protected static readonly SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<em>", "</em>");
    private static IndexSearcher indexSearcher = null;

    /// <summary>
    /// Size of the highlighted fragment taken from a matching field.
    /// </summary>
    public int FragmentSize { get; set; }

    /// <summary>
    /// Initialises the search with the text fields to query.
    /// </summary>
    /// <param name="fieldsToSearch">Names of the text fields to search.</param>
    protected BaseIndexSearch(string[] fieldsToSearch)
    {
        FragmentSize = 100;
        this.fieldsToSearch = fieldsToSearch;
    }

    /// <summary>
    /// Creates an empty result item.
    /// </summary>
    protected abstract TIndexSearchResultItem CreateIndexSearchResultItem();

    /// <summary>
    /// Lets the subclass adjust a result item (mainly the properties backed by tag fields).
    /// </summary>
    /// <param name="indexSearchResultItem">Result item being built.</param>
    /// <param name="content">Normalised user search text.</param>
    /// <param name="docIndex">Document id within the index.</param>
    /// <param name="doc">The matching document.</param>
    protected abstract void ModifyIndexSearchResultItem(ref TIndexSearchResultItem indexSearchResultItem, string content, int docIndex, Document doc);

    /// <summary>
    /// Lets the subclass adjust the filter terms (for example restrict to one module).
    /// </summary>
    /// <param name="filter">Field name to required value; never null.</param>
    protected abstract void ModifySearchFilter(ref Dictionary<string, string> filter);

    /// <summary>
    /// Searches the index.
    /// </summary>
    /// <param name="content">Search text.</param>
    /// <param name="filter">Exact-match restrictions (field name to value); null means no restriction.</param>
    /// <param name="fieldSorts">Fields to sort by; null or empty sorts by relevance.</param>
    /// <param name="pageIndex">1-based page to return; defaults to 1.</param>
    /// <param name="pageSize">Results per page; defaults to 20.</param>
    /// <returns>The requested page, or null when the input is invalid or the search fails (the error is logged).</returns>
    public PagedIndexSearchResult<TIndexSearchResultItem> Search(string content
        , Dictionary<string, string> filter = null, List<FieldSort> fieldSorts = null
        , int pageIndex = 1, int pageSize = 20)
    {
        try
        {
            if (!string.IsNullOrEmpty(content))
            {
                content = ReplaceIndexSensitiveWords(content);
                content = GetKeywordsSplitBySpace(content,
                    new JiebaForLuceneTokenizer(new JiebaSegmenter(), content));
            }
            if (string.IsNullOrEmpty(content) || pageIndex < 1 || pageSize < 1)
            {
                throw new ArgumentException("输入参数不符合要求(用户输入为空,页码或每页条数小于1)");
            }

            var stopWatch = Stopwatch.StartNew();

            Analyzer analyzer = new JiebaForLuceneAnalyzer();
            var query = MakeSearchQuery(content, analyzer);

            // Work on a copy so the subclass hook never mutates the caller's dictionary.
            filter = filter == null ? new Dictionary<string, string>() : new Dictionary<string, string>(filter);
            ModifySearchFilter(ref filter);
            Filter luceneFilter = MakeSearchFilter(filter);

            // Use ONE searcher for the whole request: document ids returned by Search are
            // only meaningful for the reader that produced them.
            IndexSearcher searcher = AcquireSearcher();

            // Collect everything up to the end of the requested page.
            int totalCollectCount = pageIndex * pageSize;
            Sort sort = GetSortByFieldSorts(fieldSorts);
            TopDocs topDocs = searcher.Search(query, luceneFilter, totalCollectCount, sort ?? Sort.RELEVANCE);

            ScoreDoc[] hits = topDocs.ScoreDocs;
            var result = new PagedIndexSearchResult<TIndexSearchResultItem>
            {
                PageIndex = pageIndex,
                PageSize = pageSize,
                TotalRecords = topDocs.TotalHits
            };

            var first = (pageIndex - 1) * pageSize;
            var last = Math.Min(totalCollectCount, hits.Length);
            for (var i = first; i < last; i++)
            {
                result.Add(BuildResultItem(searcher, hits[i], content));
            }

            stopWatch.Stop();
            result.Elapsed = stopWatch.ElapsedMilliseconds * 1.0 / 1000;
            return result;
        }
        catch (Exception exception)
        {
            LogUtils.ErrorLog(exception);
            return null;
        }
    }

    /// <summary>
    /// Returns the shared searcher, creating it on first use and replacing it when the
    /// index has changed since its reader was opened.
    /// </summary>
    private static IndexSearcher AcquireSearcher()
    {
        lock (SearcherLock)
        {
            if (indexSearcher == null)
            {
                FSDirectory entityDirectory = FSDirectory.Open(new DirectoryInfo(IndexStorePath));
                indexSearcher = new IndexSearcher(IndexReader.Open(entityDirectory, true));
            }
            else
            {
                IndexReader indexReader = indexSearcher.IndexReader;
                if (!indexReader.IsCurrent())
                {
                    indexSearcher.Dispose();
                    // NOTE(review): the superseded reader is not closed here because requests
                    // on other threads may still be reading from it; consider reference
                    // counting it so it can be released once idle.
                    indexSearcher = new IndexSearcher(indexReader.Reopen());
                }
            }
            return indexSearcher;
        }
    }

    /// <summary>
    /// Loads one hit and turns it into a result item with highlighted title and content.
    /// </summary>
    private TIndexSearchResultItem BuildResultItem(IndexSearcher searcher, ScoreDoc scoreDoc, string content)
    {
        var doc = searcher.Doc(scoreDoc.Doc);

        var item = CreateIndexSearchResultItem();
        item.DocIndex = scoreDoc.Doc;
        item.ModuleType = doc.Get("ModuleType");
        item.TableName = doc.Get("TableName");
        item.RowId = Guid.Parse(doc.Get("RowId"));

        var collectTime = doc.Get("CollectTime");
        if (!string.IsNullOrEmpty(collectTime))
        {
            item.CollectTime = DateTime.Parse(collectTime);
        }

        var storedTitle = doc.Get("Title");
        var title = GetHighlighter(formatter, FragmentSize).GetBestFragment(content, storedTitle);
        item.Title = string.IsNullOrEmpty(title) ? storedTitle : title;

        // Treat a missing stored field as empty text instead of failing the whole search.
        var storedText = doc.Get("IndexTextContent") ?? "";
        var text = GetHighlighter(formatter, FragmentSize).GetBestFragment(content, storedText);
        if (string.IsNullOrEmpty(text))
        {
            // No highlighted fragment: fall back to the leading part of the stored text.
            text = storedText.Length > FallbackSnippetLength
                ? storedText.Substring(0, FallbackSnippetLength)
                : storedText;
        }
        item.Content = text;

        ModifyIndexSearchResultItem(ref item, content, scoreDoc.Doc, doc);
        return item;
    }

    /// <summary>
    /// Builds a Lucene sort from the requested field sorts; every field is sorted as a float.
    /// Returns null (caller falls back to relevance) when nothing was requested.
    /// </summary>
    private Sort GetSortByFieldSorts(List<FieldSort> fieldSorts)
    {
        if (fieldSorts == null || fieldSorts.Count == 0)
        {
            return null;
        }
        return new Sort(fieldSorts.Select(fieldSort => new SortField(fieldSort.FieldName, SortField.FLOAT, !fieldSort.Ascend)).ToArray());
    }

    /// <summary>
    /// Turns the filter dictionary into a filter requiring an exact term match on every entry;
    /// returns null when there is nothing to filter on.
    /// </summary>
    private static Filter MakeSearchFilter(Dictionary<string, string> filter)
    {
        if (filter == null || filter.Count == 0)
        {
            return null;
        }

        var booleanQuery = new BooleanQuery();
        foreach (KeyValuePair<string, string> keyValuePair in filter)
        {
            booleanQuery.Add(new TermQuery(new Term(keyValuePair.Key, keyValuePair.Value)), Occur.MUST);
        }
        return new QueryWrapperFilter(booleanQuery);
    }

    /// <summary>
    /// Parses the normalised search text against all configured fields.
    /// </summary>
    private Query MakeSearchQuery(string content, Analyzer analyzer)
    {
        var query = new BooleanQuery();
        if (string.IsNullOrEmpty(content))
        {
            return query;
        }

        QueryParser parser = new MultiFieldQueryParser(Version.LUCENE_30, fieldsToSearch, analyzer);
        Query queryObj;
        try
        {
            queryObj = parser.Parse(content);
        }
        catch (ParseException parseException)
        {
            throw new Exception("在FileLibraryIndexSearch中构造Query时出错。", parseException);
        }
        query.Add(queryObj, Occur.MUST);
        return query;
    }

    /// <summary>
    /// Segments the keywords and joins the non-blank words with single spaces.
    /// </summary>
    private string GetKeywordsSplitBySpace(string keywords, JiebaForLuceneTokenizer jiebaForLuceneTokenizer)
    {
        var result = new StringBuilder();
        foreach (var word in jiebaForLuceneTokenizer.Tokenize(keywords))
        {
            if (string.IsNullOrWhiteSpace(word.Word))
            {
                continue;
            }
            result.AppendFormat("{0} ", word.Word);
        }
        return result.ToString().Trim();
    }

    /// <summary>
    /// Removes query-syntax characters from user input so it can be parsed safely.
    /// </summary>
    private string ReplaceIndexSensitiveWords(string str)
    {
        var cleaned = new StringBuilder(str.Length);
        foreach (var ch in str)
        {
            if (QuerySyntaxCharacters.IndexOf(ch) < 0)
            {
                cleaned.Append(ch);
            }
        }
        return cleaned.ToString();
    }

    /// <summary>
    /// Creates a highlighter producing fragments of the given size.
    /// </summary>
    /// <param name="formatter">Formatter that wraps the matched words.</param>
    /// <param name="fragmentSize">Fragment size.</param>
    protected Highlighter GetHighlighter(Formatter formatter, int fragmentSize)
    {
        return new Highlighter(formatter, new Segment()) { FragmentSize = fragmentSize };
    }
}
几个protected abstract方法,是需要继承的子类来实现的。
其中为了实现搜索结果对命中关键词进行高亮显示,特引用了盘古分词的Highlighter。原则上,此处应该参照盘古分词的源码,自己使用JieBaNet来实现高亮;由于工期较紧,就直接引用了盘古分词。
2.我们设计一个IndexSearchResultItem,表示搜索结果的基类。
/// <summary>
/// A single entry of a full-index search result; base class for module-specific results.
/// </summary>
public class IndexSearchResultItem
{
    /// <summary>Document id within the index.</summary>
    public int DocIndex { get; set; }

    /// <summary>Module the document belongs to.</summary>
    public string ModuleType { get; set; }

    /// <summary>Name of the table the document was built from.</summary>
    public string TableName { get; set; }

    /// <summary>Id of the row the document was built from.</summary>
    public Guid RowId { get; set; }

    /// <summary>Document title.</summary>
    public string Title { get; set; }

    /// <summary>Fragment of the document content.</summary>
    public string Content { get; set; }

    /// <summary>Collection time of the document, when available.</summary>
    public DateTime? CollectTime { get; set; }
}
3.我们来看看具体的实现,先来看全站搜索的实现类IndexSearch。
/// <summary>
/// Site-wide search: queries the content and title fields and returns the base
/// result item without any module-specific restriction or enrichment.
/// </summary>
public class IndexSearch : BaseIndexSearch<IndexSearchResultItem>
{
    /// <summary>Configures the search over "IndexTextContent" and "Title".</summary>
    public IndexSearch()
        : base(new string[] { "IndexTextContent", "Title" })
    {
    }

    /// <summary>Creates a plain base result item.</summary>
    protected override IndexSearchResultItem CreateIndexSearchResultItem()
    {
        var item = new IndexSearchResultItem();
        return item;
    }

    /// <summary>Intentionally empty: the base fields are all a site-wide result needs.</summary>
    protected override void ModifyIndexSearchResultItem(
        ref IndexSearchResultItem indexSearchResultItem,
        string content,
        int docIndex,
        Document doc)
    {
    }

    /// <summary>Intentionally empty: a site-wide search adds no filter terms.</summary>
    protected override void ModifySearchFilter(ref Dictionary<string, string> filter)
    {
    }
}
是不是非常简单。由于我们此处搜索的是全站,结果展示直接用基类,取出基本字段即可。
4.再列举一个活动的搜索实现。
a)我们首先创建一个活动搜索结果类ActivityIndexSearchResultItem,继承自结果基类IndexSearchResultItem
/// <summary>
/// Search result for the activity module: the base result plus activity-specific fields.
/// </summary>
public class ActivityIndexSearchResultItem : IndexSearchResultItem
{
    /// <summary>Activity categories, as text.</summary>
    public string ActivityTypes { get; set; }

    /// <summary>Id of the city associated with the activity.</summary>
    public Guid CityId { get; set; }

    /// <summary>Address of the activity.</summary>
    public string Address { get; set; }

    /// <summary>Date of the activity, as text.</summary>
    public string ActivityDate { get; set; }

    /// <summary>Link to the source page.</summary>
    public string Url { get; set; }

    /// <summary>Name of the collection source.</summary>
    public string SourceName { get; set; }

    /// <summary>Main site address of the collection source.</summary>
    public string SourceUrl { get; set; }

    /// <summary>Official hotline of the collection source.</summary>
    public string SourceOfficialHotline { get; set; }
}
b)然后创建活动模块的搜索服务:ActivityIndexSearch,同样需要继承BaseIndexSearch,这时候ActivityIndexSearch只需要相对全站搜索修改几个参数即可。
/// <summary>
/// Search restricted to the activity module; fills the activity-specific result
/// properties from the reserved tag fields.
/// </summary>
public class ActivityIndexSearch : BaseIndexSearch<ActivityIndexSearchResultItem>
{
    /// <summary>Configures the search over "IndexTextContent" and "Title".</summary>
    public ActivityIndexSearch()
        : base(new[] { "IndexTextContent", "Title" })
    {
    }

    /// <summary>Creates an empty activity result item.</summary>
    protected override ActivityIndexSearchResultItem CreateIndexSearchResultItem()
    {
        return new ActivityIndexSearchResultItem();
    }

    /// <summary>
    /// Copies the stored tag fields of the document into the activity-specific properties.
    /// </summary>
    /// <param name="indexSearchResultItem">Result item being built.</param>
    /// <param name="content">Normalised user search text (not used here).</param>
    /// <param name="docIndex">Document id within the index (not used here).</param>
    /// <param name="doc">The matching document.</param>
    protected override void ModifyIndexSearchResultItem(ref ActivityIndexSearchResultItem indexSearchResultItem, string content,
        int docIndex, Document doc)
    {
        indexSearchResultItem.ActivityTypes = doc.Get("Tag1");
        indexSearchResultItem.Url = doc.Get("Tag2");
        indexSearchResultItem.SourceName = doc.Get("Tag3");
        indexSearchResultItem.SourceOfficialHotline = doc.Get("Tag4");
        indexSearchResultItem.SourceUrl = doc.Get("Tag5");

        // A missing or malformed city id must not fail the whole search; fall back to Guid.Empty.
        Guid cityId;
        indexSearchResultItem.CityId = Guid.TryParse(doc.Get("Tag6"), out cityId) ? cityId : Guid.Empty;

        indexSearchResultItem.Address = doc.Get("Tag7");
        indexSearchResultItem.ActivityDate = doc.Get("Tag8");
    }

    /// <summary>
    /// Restricts the search to activity documents.
    /// </summary>
    /// <param name="filter">Filter terms to adjust.</param>
    protected override void ModifySearchFilter(ref Dictionary<string, string> filter)
    {
        // Indexer instead of Add: a caller-supplied "ModuleType" entry is overwritten
        // rather than causing a duplicate-key exception.
        filter["ModuleType"] = "活动";
    }
}
筛选条件加上模块=活动,返回结果数据类指定,活动特有字段返回赋值。
业务调用就非常简单了。
全站全文搜索:我们直接new IndexSearch(),然后调用其Search()方法
活动全文搜索:我们直接new ActivityIndexSearch(),然后调用其Search()方法
Search()方法几个参数:
///<param name="content">搜索文本内容</param>
/// <param name="filter">查询内容限制条件,默认为null,不限制条件.</param>
/// <param name="fieldSorts">对字段进行排序</param>
/// <param name="pageIndex">查询结果当前页,默认为1</param>
/// <param name="pageSize">查询结果每页结果数,默认为20</param>
如果我们用软能力而不是用技术能力来区分程序员的好坏 – 是不是有那么点反常和变态。
很多思路均来源于13年那次做全文搜索,跟当时的架构师学习的。
在此表示感谢。
原创文章,代码都是从自己项目里贴出来的。转载请注明出处哦,亲~~~
记一次企业级爬虫系统升级改造(五):基于JieBaNet+Lucene.Net实现全文搜索