首页 > 代码库 > 100行实现全文检索

100行实现全文检索

最近两天看了下lucene的源码,印象还停留在2.x版本,现在都4.8了,变化不小,不过换汤不换药,还是比较容易理解的。

其实关于全文检索的倒排序,逻辑是非常简单的,“空间换时间”的概念也不复杂。

写了一段示意代码,说明一下。

以下的示意代码,采用mysql作为索引文件的存储介质。

使用“二元切分”,亦即“二元” “元切” “切分”。

import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

public class TestLucene {
	
	public static Object insertToDB(String sql) {
		return 0;
	}
	
	public static List<Object> getAllFromDB(String sql) {
		return null;
	}
	
	public static Object getOneFromDB(String sql) {
		return null;
	}
	
	public static Map<String, Integer> analyze(String text){
		
		Map<String, Integer> words = new HashMap<String, Integer>();		
		for(int i = 0; i < text.length() - 1; i++) {
			String word = text.substring(i, i + 1);
			int freq = 1;
			if(words.keySet().contains(word)) {
				freq += words.get(word);
			}
			words.put(word, freq);
		}		
		return words;
	}
	
	public static void buildIndex(String text) {		
		//保存文档
		Integer docID = (Integer)insertToDB(String.format(
				"insert into docs (doc=?) values ('%s')", text));
		
		Map<String, Integer> words = analyze(text);
		for(String word : words.keySet()) {
			
			//词频
			int freq = words.get(word);
			
			//查询是否存在word
			Integer wordID = (Integer)getOneFromDB(String.format(
					"select id from words where word='%s'", word));
			//如果没有则创建
			if(wordID == null) {
				wordID = (Integer)insertToDB(String.format(
						"insert into words (word=?) values ('%s')", word));
			}
			
			//创建关联关系
			insertToDB(String.format(
					"insert into ref_doc_words (docid=?, wordid=?, freq=?) values ('%d', '%d', '%d')", 
					docID, wordID, freq));
		}		
	}
	
	public static String buidSqlInClause(Collection coll) {
		StringBuffer sb = new StringBuffer("in (");
		Iterator<Object> itor = coll.iterator();
		while(itor.hasNext()) {
			sb.append("'").append(itor.next()).append("',");
		}
		sb.deleteCharAt(sb.length() - 1);
		sb.append(")");
		return sb.toString();
	}
	
	public static List<Object> search(String keywords) {
		List<Object> docIDs = null;
		
		//切分查询词
		Map<String, Integer> keys = analyze(keywords);
		
		//查询词对应的所有wordID		
		List<Object> wordIDs = getAllFromDB(String.format("select id from words where word in %s", 
                                buidSqlInClause(keys.keySet())));
		
		//查询文档
		docIDs = getAllFromDB(String.format(
                                "select d.id, d.doc from docs d, ref_doc_words r where r.wordid in %s order by freq desc", 
				buidSqlInClause(wordIDs)));
		
		return docIDs;
	}

	public static void main(String[] args) {
		buildIndex("100行搞定全文检索");
		buildIndex("LUCENE的逻辑很简单");
		buildIndex("我是为了凑足一百行");
		
		for(Object doc : search("检索")) {
			System.out.println(doc);
		}
	}
}