首页 > 代码库 > [solr] - spell check

[solr] - spell check

Solr 提供了拼写检查(spell check)组件,它也可以返回输入建议(suggestions),因此可用于实现查询输入的自动完成(auto-complete)功能。

参考文献:

https://cwiki.apache.org/confluence/display/solr/Spell+Checking

http://www.cnblogs.com/ibook360/archive/2011/11/30/2269077.html

 

方法:

 


 

修改core的solrconfig.xml

加入这段到<config />内

    <!-- Spell-check search component: registers one spellchecker (dictionary) named "wordbreak". -->
    <searchComponent name="spellcheck" class="solr.SpellCheckComponent">
      <lst name="spellchecker">
        <!-- Dictionary name; the request handler below selects it via spellcheck.dictionary. -->
        <str name="name">wordbreak</str>
        <!-- Spellchecker implementation and the lookup structure it uses. -->
        <str name="classname">org.apache.solr.spelling.suggest.Suggester</str>
        <str name="lookupImpl">org.apache.solr.spelling.suggest.tst.TSTLookup</str>
        <!-- Field the spellchecker is configured against. -->
        <str name="field">content</str>
        <!-- NOTE(review): combineWords / breakWords / maxChanges are documented as
             WordBreakSolrSpellChecker options; confirm whether the Suggester class
             configured above reads them at all. -->
        <str name="combineWords">true</str>
        <str name="breakWords">true</str>
        <int name="maxChanges">10</int>
      </lst>
    </searchComponent>

    <!-- Dedicated /spellcheck endpoint: spell checking is on by default, uses the
         "wordbreak" dictionary, and asks for up to 20 suggestions. -->
    <requestHandler name="/spellcheck" class="org.apache.solr.handler.component.SearchHandler">
      <lst name="defaults">
        <str name="spellcheck">true</str>
        <str name="spellcheck.dictionary">wordbreak</str>
        <str name="spellcheck.count">20</str>
      </lst>
      <!-- Listed under last-components, i.e. appended to the handler's component chain. -->
      <arr name="last-components">
        <str>spellcheck</str>
      </arr>
    </requestHandler>

 

schema.xml配置:

<?xml version="1.0" ?>
<schema name="my core" version="1.1">
    <!-- Primitive field types. -->
    <fieldtype name="string"  class="solr.StrField" sortMissingLast="true" omitNorms="true"/>
    <fieldType name="long" class="solr.TrieLongField" precisionStep="0" positionIncrementGap="0"/>
    <fieldType name="tdate" class="solr.TrieDateField" precisionStep="6" positionIncrementGap="0"/>
    <fieldType name="int" class="solr.TrieIntField" precisionStep="0" positionIncrementGap="0"/>
    <fieldType name="float" class="solr.TrieFloatField" precisionStep="0" positionIncrementGap="0"/>
    <fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" positionIncrementGap="0"/>
    <fieldType name="boolean" class="solr.BoolField" sortMissingLast="true"/>
    <fieldtype name="binary" class="solr.BinaryField"/>

    <!-- Text type used by subject, content and category_name.
         It declares three analyzers: an index-time IKAnalyzer, a query-time IKAnalyzer,
         and an untyped tokenizer/filter chain (keyword tokenizer + lower-casing).
         NOTE(review): a fieldType normally carries either one untyped analyzer or an
         index/query pair; confirm which of these Solr actually applies when all three
         are present. -->
    <fieldType name="text_cn" class="solr.TextField">
        <analyzer type="index" class="org.wltea.analyzer.lucene.IKAnalyzer" />
        <analyzer type="query" class="org.wltea.analyzer.lucene.IKAnalyzer" />
        <analyzer>
            <tokenizer class="solr.KeywordTokenizerFactory"/>
            <filter class="solr.LowerCaseFilterFactory"/>
        </analyzer>
    </fieldType>

    <!-- Document fields. "content" is the field named in the spellchecker configuration. -->
    <field name="id" type="long" indexed="true" stored="true" multiValued="false" required="true"/>
    <field name="subject" type="text_cn" indexed="true" stored="true" />
    <field name="content" type="text_cn" indexed="true" stored="true" />
    <field name="category_id" type="long" indexed="true" stored="true" />
    <field name="category_name" type="text_cn" indexed="true" stored="true" />
    <field name="last_update_time" type="tdate" indexed="true" stored="true" />
    <field name="_version_" type="long" indexed="true" stored="true"/>

    <!-- Field used to determine and enforce document uniqueness. -->
    <uniqueKey>id</uniqueKey>

    <!-- Field the query parser uses when no explicit field name is given. -->
    <defaultSearchField>subject</defaultSearchField>

    <!-- Query parser configuration: defaultOperator="AND|OR". -->
    <solrQueryParser defaultOperator="OR"/>
</schema>

关键在于这句:

        <!-- Untyped analyzer chain from the text_cn field type: a keyword tokenizer
             followed by a lower-case filter. Per the Solr documentation the keyword
             tokenizer emits the whole field value as one token. -->
        <analyzer>
            <tokenizer class="solr.KeywordTokenizerFactory"/>
            <filter class="solr.LowerCaseFilterFactory"/>
        </analyzer>

意思是按整个词组(短语)进行匹配:KeywordTokenizerFactory 不做分词,而是把整个字段值作为一个词元;LowerCaseFilterFactory 再把它转换成小写。

 

设置完xml,重启tomcat,在浏览器中运行:

http://localhost:8899/solr/mycore/spellcheck?spellcheck.build=true

运行结果:

 

然后在浏览器中运行:

http://localhost:8899/solr/mycore/spellcheck?q=中央&rows=0

运行结果:

 

Java代码:


 

Java bean:

package com.my.entity;

import java.util.Date;

import org.apache.solr.client.solrj.beans.Field;

/**
 * SolrJ bean for one indexed document.
 *
 * <p>Each member carries a {@link Field} annotation; where the Java name differs
 * from the Solr field name, the annotation value gives the Solr name
 * ({@code category_id}, {@code category_name}, {@code last_update_time}).
 */
public class Item {

    /** Document id (Solr field {@code id}). */
    @Field
    private long id;

    /** Subject text (Solr field {@code subject}). */
    @Field
    private String subject;

    /** Body text (Solr field {@code content}). */
    @Field
    private String content;

    /** Category identifier (Solr field {@code category_id}). */
    @Field("category_id")
    private long categoryId;

    /** Category display name (Solr field {@code category_name}). */
    @Field("category_name")
    private String categoryName;

    /** Last modification timestamp (Solr field {@code last_update_time}). */
    @Field("last_update_time")
    private Date lastUpdateTime;

    /** @return the document id */
    public long getId() {
        return id;
    }

    /** @param id the document id */
    public void setId(long id) {
        this.id = id;
    }

    /** @return the subject text */
    public String getSubject() {
        return subject;
    }

    /** @param subject the subject text */
    public void setSubject(String subject) {
        this.subject = subject;
    }

    /** @return the body text */
    public String getContent() {
        return content;
    }

    /** @param content the body text */
    public void setContent(String content) {
        this.content = content;
    }

    /** @return the category identifier */
    public long getCategoryId() {
        return categoryId;
    }

    /** @param categoryId the category identifier */
    public void setCategoryId(long categoryId) {
        this.categoryId = categoryId;
    }

    /** @return the category display name */
    public String getCategoryName() {
        return categoryName;
    }

    /** @param categoryName the category display name */
    public void setCategoryName(String categoryName) {
        this.categoryName = categoryName;
    }

    /** @return the last modification timestamp */
    public Date getLastUpdateTime() {
        return lastUpdateTime;
    }

    /** @param lastUpdateTime the last modification timestamp */
    public void setLastUpdateTime(Date lastUpdateTime) {
        this.lastUpdateTime = lastUpdateTime;
    }
}

 

测试代码:

package com.my.solr;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Map;

import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrServer;
import org.apache.solr.client.solrj.impl.XMLResponseParser;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.client.solrj.response.SpellCheckResponse;
import org.apache.solr.client.solrj.response.SpellCheckResponse.Collation;
import org.apache.solr.client.solrj.response.SpellCheckResponse.Correction;
import org.apache.solr.client.solrj.response.SpellCheckResponse.Suggestion;

import com.my.entity.Item;

/**
 * Command-line demo: re-indexes a small fixed set of {@link Item} documents, sends one
 * query to the {@code /spellcheck} request handler and prints the returned suggestions
 * and collations.
 */
public class TestSolr {

    /**
     * Runs the demo against {@code http://localhost:8899/solr/mycore}.
     *
     * @param args unused
     * @throws IOException         if deleting, adding or committing documents fails on I/O
     * @throws SolrServerException if Solr rejects the delete, add or commit
     */
    public static void main(String[] args) throws IOException, SolrServerException {
        String url = "http://localhost:8899/solr/mycore";
        String token = "中央";

        HttpSolrServer core = createServer(url);
        QueryResponse response;
        try {
            rebuildIndex(core);
            response = querySpellCheck(core, token);
        } finally {
            // Release the HTTP client even when indexing or querying fails.
            core.shutdown();
        }

        if (response == null) {
            // The query failed and the error has already been reported; nothing to print.
            return;
        }

        SpellCheckResponse spellCheckResponse = response.getSpellCheckResponse();
        if (spellCheckResponse != null) {
            printSpellCheck(spellCheckResponse, token);
        }
        System.out.println("查询耗时:" + response.getQTime());
    }

    /** Creates the HTTP client for the given core URL with the demo's connection settings. */
    private static HttpSolrServer createServer(String url) {
        HttpSolrServer core = new HttpSolrServer(url);
        core.setMaxRetries(1);
        core.setConnectionTimeout(5000);
        core.setParser(new XMLResponseParser()); // binary parser is used by default
        core.setSoTimeout(1000); // socket read timeout
        core.setDefaultMaxConnectionsPerHost(100);
        core.setMaxTotalConnections(100);
        core.setFollowRedirects(false); // defaults to false
        core.setAllowCompression(true);
        return core;
    }

    /** Deletes every document in the core, adds the sample items and commits. */
    private static void rebuildIndex(HttpSolrServer core) throws IOException, SolrServerException {
        core.deleteByQuery("*:*");

        List<Item> items = new ArrayList<Item>();
        items.add(makeItem(1, "cpu", "this is intel cpu", 1, "cpu-intel"));
        items.add(makeItem(2, "cpu AMD", "this is AMD cpu", 2, "cpu-AMD"));
        items.add(makeItem(3, "cpu intel", "this is intel-I7 cpu", 1, "cpu-intel"));
        items.add(makeItem(4, "cpu AMD", "this is AMD 5000x cpu", 2, "cpu-AMD"));
        items.add(makeItem(5, "cpu intel I6", "this is intel-I6 cpu", 1, "cpu-intel-I6"));
        items.add(makeItem(6, "处理器", "中央处理器英特儿", 1, "cpu-intel"));
        items.add(makeItem(7, "处理器AMD", "中央处理器AMD", 2, "cpu-AMD"));
        items.add(makeItem(8, "中央处理器", "中央处理器Intel", 1, "cpu-intel"));
        items.add(makeItem(9, "中央空调格力", "格力中央空调", 3, "air"));
        items.add(makeItem(10, "中央空调海尔", "海尔中央空调", 3, "air"));
        items.add(makeItem(11, "中央空调美的", "美的中央空调", 3, "air"));
        core.addBeans(items);

        core.commit();
    }

    /**
     * Sends the spell-check query for {@code token}.
     *
     * @return the response, or {@code null} if the query failed (the failure is printed
     *         to standard error)
     */
    private static QueryResponse querySpellCheck(HttpSolrServer core, String token) {
        SolrQuery query = new SolrQuery();
        query.set("qt", "/spellcheck");
        query.set("q", token);
        query.set("spellcheck", "on");
        // The index was just rewritten, so ask Solr to rebuild the dictionary.
        query.set("spellcheck.build", "true");
        query.set("spellcheck.onlyMorePopular", "true");
        query.set("spellcheck.count", "100");
        query.set("spellcheck.alternativeTermCount", "4");
        query.set("spellcheck.extendedResults", "true");
        query.set("spellcheck.maxResultsForSuggest", "5");
        query.set("spellcheck.collate", "true");
        query.set("spellcheck.collateExtendedResults", "true");
        query.set("spellcheck.maxCollationTries", "5");
        query.set("spellcheck.maxCollations", "3");

        try {
            QueryResponse response = core.query(query);
            System.out.println("查询耗时:" + response.getQTime());
            return response;
        } catch (Exception e) {
            // Demo boundary: report the failure and let the caller skip the output.
            System.err.println(e.getMessage());
            e.printStackTrace();
            return null;
        }
    }

    /** Prints all suggestions, the suggestion for {@code token}, and the collations. */
    private static void printSpellCheck(SpellCheckResponse spellCheckResponse, String token) {
        List<Suggestion> suggestionList = spellCheckResponse.getSuggestions();
        for (Suggestion suggestion : suggestionList) {
            System.out.println("Suggestions NumFound: " + suggestion.getNumFound());
            System.out.println("Token: " + suggestion.getToken());
            System.out.print("Suggested: ");
            printAlternatives(suggestion);
            System.out.println();
        }
        System.out.println();

        Map<String, Suggestion> suggestedMap = spellCheckResponse.getSuggestionMap();
        for (Map.Entry<String, Suggestion> entry : suggestedMap.entrySet()) {
            System.out.println("suggestionName: " + entry.getKey());
            printSuggestion(entry.getValue());
        }

        // getSuggestion returns null when Solr produced nothing for this token.
        Suggestion suggestion = spellCheckResponse.getSuggestion(token);
        if (suggestion != null) {
            printSuggestion(suggestion);
        } else {
            System.out.println("No suggestion found for token: " + token);
        }

        System.out.println("The First suggested word for solr is : " + spellCheckResponse.getFirstSuggestion(token));
        System.out.println("\n\n");

        List<Collation> collatedList = spellCheckResponse.getCollatedResults();
        if (collatedList != null) {
            for (Collation collation : collatedList) {
                System.out.println("collated query String: " + collation.getCollationQueryString());
                System.out.println("collation Num: " + collation.getNumberOfHits());
                List<Correction> correctionList = collation.getMisspellingsAndCorrections();
                for (Correction correction : correctionList) {
                    System.out.println("original: " + correction.getOriginal());
                    System.out.println("correction: " + correction.getCorrection());
                }
                System.out.println();
            }
        }
        System.out.println();
        System.out.println("The Collated word: " + spellCheckResponse.getCollatedResult());
        System.out.println();
    }

    /** Prints the hit count, token and alternatives of one suggestion, then blank lines. */
    private static void printSuggestion(Suggestion suggestion) {
        System.out.println("NumFound: " + suggestion.getNumFound());
        System.out.println("Token: " + suggestion.getToken());
        System.out.print("suggested: ");
        printAlternatives(suggestion);
        System.out.println("\n\n");
    }

    /** Prints the alternatives of a suggestion on the current line, each followed by ", ". */
    private static void printAlternatives(Suggestion suggestion) {
        for (String word : suggestion.getAlternatives()) {
            System.out.print(word + ", ");
        }
    }

    /** Builds a sample item whose last-update time is the moment of the call. */
    private static Item makeItem(long id, String subject, String content, long categoryId, String categoryName) {
        Item item = new Item();
        item.setId(id);
        item.setSubject(subject);
        item.setContent(content);
        item.setLastUpdateTime(new Date());
        item.setCategoryId(categoryId);
        item.setCategoryName(categoryName);
        return item;
    }
}

 

测试结果:

 

这种方式适用于基于现有索引数据内容的查询拼写检查。

[solr] - spell check