首页 > 代码库 > Lucene 自定义分析器
Lucene 自定义分析器
自定义一个NGramAnalyzer:
package org.apache.lucene.analysis.ngram;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;

/**
 * Analyzer that splits text into character n-grams of a fixed size and then
 * removes any resulting token that appears in a stop-word list.
 *
 * <p>The stop-word list is loaded once, at construction time, from a plain
 * text file containing one stop word per line. If the file cannot be read the
 * analyzer still works, but with an empty stop-word set.
 */
public class NGramAnalyzer extends Analyzer {

    /** Stop-word file used when the caller does not supply a path. */
    private static final String DEFAULT_STOPWORD_PATH =
            "E://Java program//PatentRetrival//resouce//stopwords.txt";

    private final Version matchVersion;

    private final int gramSize;

    private final CharArraySet stopwordSet;

    /**
     * Creates an analyzer with usable defaults: Lucene 4.9 compatibility, the
     * tokenizer's default minimum gram size, and no stop words.
     *
     * <p>No stop-word file is read by this constructor.
     */
    public NGramAnalyzer() {
        this.matchVersion = Version.LUCENE_4_9;
        this.gramSize = NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE;
        this.stopwordSet = CharArraySet.EMPTY_SET;
    }

    /**
     * Creates an analyzer that reads its stop words from the default location.
     *
     * @param matchVersion Lucene compatibility version passed to the filters
     * @param gramSize     length of every n-gram produced by the tokenizer
     */
    public NGramAnalyzer(Version matchVersion, int gramSize) {
        this(matchVersion, gramSize, DEFAULT_STOPWORD_PATH);
    }

    /**
     * Creates an analyzer that reads its stop words from the given file.
     *
     * @param matchVersion Lucene compatibility version passed to the filters
     * @param gramSize     length of every n-gram produced by the tokenizer
     * @param stopWordPath path of a text file with one stop word per line
     */
    public NGramAnalyzer(Version matchVersion, int gramSize, String stopWordPath) {
        this.matchVersion = matchVersion;
        this.gramSize = gramSize;
        this.stopwordSet = buildStopwordSet(stopWordPath);
    }

    /**
     * Builds the token pipeline: fixed-size n-gram tokenizer, then an n-gram
     * token filter, then stop-word removal.
     *
     * @param fieldName name of the field being analyzed (not used here)
     * @param reader    source of the text to tokenize
     * @return the tokenizer together with the end of the filter chain
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        // Min and max gram size are identical, so every token has length gramSize.
        Tokenizer tokenizer = new NGramTokenizer(Version.LUCENE_4_9, reader, gramSize, gramSize);

        // NOTE(review): this filter is created without explicit gram sizes, so it
        // presumably re-splits each gram using its own defaults. Confirm that
        // n-gramming the tokens a second time is intended.
        TokenStream tokenStream = new NGramTokenFilter(matchVersion, tokenizer);

        tokenStream = new StopFilter(matchVersion, tokenStream, stopwordSet);
        return new TokenStreamComponents(tokenizer, tokenStream);
    }

    /**
     * Reads a stop-word file and returns its entries in file order.
     *
     * <p>The file is decoded as UTF-8. Surrounding whitespace is removed from
     * each line and blank lines are skipped.
     *
     * @param filepath path of a text file with one stop word per line
     * @return the stop words found in the file; never {@code null}
     * @throws IOException if the file cannot be opened or read
     */
    public List<String> readStopwords(String filepath) throws IOException {
        return loadStopwords(filepath);
    }

    /**
     * Loads the stop-word set, falling back to an empty set when the file is
     * unreadable so that a later token stream never sees a null set.
     */
    private static CharArraySet buildStopwordSet(String stopWordPath) {
        try {
            List<String> stopWords = loadStopwords(stopWordPath);
            // ignoreCase = true: stop words match regardless of letter case.
            return new CharArraySet(Version.LUCENE_4_9, stopWords, true);
        } catch (IOException e) {
            System.err.println("Could not read stop words from '" + stopWordPath
                    + "'; continuing without stop words.");
            e.printStackTrace();
            return CharArraySet.EMPTY_SET;
        }
    }

    /**
     * Does the actual file reading. Kept private and static so constructors
     * never call an overridable method.
     */
    private static List<String> loadStopwords(String filepath) throws IOException {
        List<String> stopwords = new ArrayList<String>();
        File file = new File(filepath);

        // try-with-resources closes the reader (and the underlying stream)
        // even when reading fails part-way through.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String word = line.trim();
                if (!word.isEmpty()) {
                    stopwords.add(word);
                }
            }
        }
        return stopwords;
    }
}
声明:以上内容来自用户投稿及互联网公开渠道收集整理发布,本网站不拥有所有权,未作人工编辑处理,也不承担相关法律责任。若内容有误或涉及侵权,可通过“投诉/举报”进行投诉,工作人员会在5个工作日内联系你;一经查实,本站将立刻删除涉嫌侵权内容。