首页 > 代码库 > Lucene 自定义分析器

Lucene 自定义分析器

自定义一个NGramAnalyzer:

  package org.apache.lucene.analysis.ngram;

  import java.io.BufferedReader;
  import java.io.File;
  import java.io.FileInputStream;
  import java.io.FileReader;
  import java.io.IOException;
  import java.io.InputStreamReader;
  import java.io.Reader;
  import java.nio.charset.Charset;
  import java.nio.charset.StandardCharsets;
  import java.util.ArrayList;
  import java.util.List;
  import java.util.logging.Level;
  import java.util.logging.Logger;

  import org.apache.lucene.analysis.Analyzer;
  import org.apache.lucene.analysis.TokenStream;
  import org.apache.lucene.analysis.Tokenizer;
  import org.apache.lucene.analysis.Analyzer.TokenStreamComponents;
  import org.apache.lucene.analysis.core.StopFilter;
  import org.apache.lucene.analysis.core.WhitespaceTokenizer;
  import org.apache.lucene.analysis.standard.StandardFilter;
  import org.apache.lucene.analysis.util.CharArraySet;
  import org.apache.lucene.util.Version;

  /**
   * Analyzer that splits text into fixed-size character n-grams and then
   * removes stop words (matched case-insensitively).
   *
   * <p>The token chain is {@code NGramTokenizer(gramSize, gramSize)} followed by
   * {@code NGramTokenFilter} with its default gram sizes, followed by
   * {@code StopFilter}.
   */
  public class NGramAnalyzer extends Analyzer {

      private static final Logger LOG = Logger.getLogger(NGramAnalyzer.class.getName());

      /** Stop-word file used by {@link #NGramAnalyzer(Version, int)}. */
      private static final String DEFAULT_STOPWORD_PATH =
              "E://Java program//PatentRetrival//resouce//stopwords.txt";

      /** Gram size used by the no-argument constructor. */
      private static final int DEFAULT_GRAM_SIZE = 2;

      private final Version matchVersion;

      private final int gramSize;

      private final CharArraySet stopwordSet;

      /**
       * Creates an analyzer with {@code Version.LUCENE_4_9}, a gram size of
       * {@value #DEFAULT_GRAM_SIZE} and no stop words. No file is read.
       */
      public NGramAnalyzer() {
          this(Version.LUCENE_4_9, DEFAULT_GRAM_SIZE, CharArraySet.EMPTY_SET);
      }

      /**
       * Creates an analyzer that loads its stop words from the built-in default path.
       *
       * @param matchVersion Lucene version passed to the n-gram filter and the stop filter
       * @param gramSize     length of the n-grams emitted by the tokenizer
       */
      public NGramAnalyzer(Version matchVersion, int gramSize) {
          this(matchVersion, gramSize, DEFAULT_STOPWORD_PATH);
      }

      /**
       * Creates an analyzer that loads its stop words from the given file
       * (one word per line, decoded as UTF-8).
       *
       * <p>If the file cannot be read, a warning is logged and the analyzer is
       * created with an empty stop-word set instead of failing.
       *
       * @param matchVersion Lucene version passed to the n-gram filter and the stop filter
       * @param gramSize     length of the n-grams emitted by the tokenizer
       * @param stopWordPath path of the stop-word file
       */
      public NGramAnalyzer(Version matchVersion, int gramSize, String stopWordPath) {
          this(matchVersion, gramSize, loadStopwordSet(stopWordPath));
      }

      /** Single assignment point so that all fields can be final. */
      private NGramAnalyzer(Version matchVersion, int gramSize, CharArraySet stopwordSet) {
          this.matchVersion = matchVersion;
          this.gramSize = gramSize;
          this.stopwordSet = stopwordSet;
      }

      /**
       * Builds the tokenizer and filter chain for one field.
       *
       * @param fieldName name of the field being analyzed (not used here)
       * @param reader    source of the text to tokenize
       * @return the tokenizer together with the fully wrapped token stream
       */
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
          // NOTE(review): the tokenizer is pinned to LUCENE_4_9 instead of
          // matchVersion, exactly as before; confirm that this is intentional.
          Tokenizer tokenizer = new NGramTokenizer(Version.LUCENE_4_9, reader, gramSize, gramSize);

          // NOTE(review): this filter re-grams the tokenizer output using the
          // filter's default sizes, which looks like it defeats the configured
          // gramSize. Kept unchanged to preserve the produced tokens; verify intent.
          TokenStream tokenStream = new NGramTokenFilter(matchVersion, tokenizer);

          tokenStream = new StopFilter(matchVersion, tokenStream, stopwordSet);
          return new TokenStreamComponents(tokenizer, tokenStream);
      }

      /**
       * Reads a stop-word file, one word per line, decoding it as UTF-8.
       *
       * @param filepath path of the stop-word file
       * @return the stop words in file order, trimmed, without blank lines
       * @throws IOException if the file cannot be opened or read
       */
      public List<String> readStopwords(String filepath) throws IOException {
          return readStopwords(filepath, StandardCharsets.UTF_8);
      }

      /**
       * Reads a stop-word file, one word per line, using an explicit charset.
       *
       * <p>The file is decoded exactly once with {@code charset}; the reader is
       * always closed, including when reading fails.
       *
       * @param filepath path of the stop-word file
       * @param charset  encoding of the file
       * @return the stop words in file order, trimmed, without blank lines
       * @throws IOException if the file cannot be opened or read
       */
      public static List<String> readStopwords(String filepath, Charset charset) throws IOException {
          List<String> stopwords = new ArrayList<String>();
          try (BufferedReader reader = new BufferedReader(
                  new InputStreamReader(new FileInputStream(new File(filepath)), charset))) {
              String line;
              while ((line = reader.readLine()) != null) {
                  // Surrounding whitespace would stop a word from ever matching a token.
                  String word = line.trim();
                  if (!word.isEmpty()) {
                      stopwords.add(word);
                  }
              }
          }
          return stopwords;
      }

      /**
       * Loads the stop words from {@code stopWordPath} into a case-insensitive set,
       * falling back to an empty set (with a logged warning) when the file is unreadable.
       */
      private static CharArraySet loadStopwordSet(String stopWordPath) {
          try {
              List<String> stopWords = readStopwords(stopWordPath, StandardCharsets.UTF_8);
              return new CharArraySet(Version.LUCENE_4_9, stopWords, true);
          } catch (IOException e) {
              LOG.log(Level.WARNING,
                      "Could not read stop words from '" + stopWordPath
                              + "'; continuing without stop words", e);
              return CharArraySet.EMPTY_SET;
          }
      }
  }