首页 > 代码库 > 朴素贝叶斯新闻分类系统

朴素贝叶斯新闻分类系统

基于搜狗语料库,建立的一个新闻分类系统;类别包括:

classifierMap.put(0, "IT");
classifierMap.put(1, "体育");
classifierMap.put(2, "健康");
classifierMap.put(3, "军事");
classifierMap.put(4, "招聘");
classifierMap.put(5, "教育");
classifierMap.put(6, "文化");
classifierMap.put(7, "旅游");
classifierMap.put(8, "财经");

分词器:可选中科院分词工具(ICTCLAS)或 IK;本人采用 IK 分词器,经测试其速度快、内存消耗低,训练数据时不会导致电脑死机。训练集采用下载的搜狗新闻数据集,用于新闻分类。

算法步骤:

1. 首先下载IK分词器和搜狗新闻训练集和搜狗词典(对词进行了词性标注,个人只选择了名词,考虑到内存和速度,准确率的因素)

2. 对训练集做分词处理,将属于不同类别的新闻分别分词;出于节省内存和提高速度的考虑,过滤掉词频低于 10 的词;结果以文本形式保存,文件名以类别命名

3. 编写朴素贝叶斯分类函数,对输入文本进行分类处理,选择概率最大的作为分类类别

4. web系统采用JSP+JavaBean+Servlet的架构,软件平台是新浪云;网址:http://naivebayes.sinaapp.com;如果无法访问,应该是服务器没有开

使用方式:输入文本,并点击新闻分类;

主程序代码:

package com.sogou.servlet;import java.io.IOException;import java.util.HashMap;import java.util.List;import java.util.Map;import javax.servlet.RequestDispatcher;import javax.servlet.ServletContext;import javax.servlet.ServletException;import javax.servlet.annotation.WebServlet;import javax.servlet.http.HttpServlet;import javax.servlet.http.HttpServletRequest;import javax.servlet.http.HttpServletResponse;import com.sogou.util.BayesUtil;/** * Servlet implementation class BayesServlet */@WebServlet("/bayes.do")public class BayesServlet extends HttpServlet {    private static final long serialVersionUID = 1L;    /**     * @see HttpServlet#HttpServlet()     */    public BayesServlet() {        super();        // TODO Auto-generated constructor stub    }    /**     * @see HttpServlet#doGet(HttpServletRequest request, HttpServletResponse     *      response)     */    protected void doGet(HttpServletRequest request,            HttpServletResponse response) throws ServletException, IOException {        // TODO Auto-generated method stub        this.doPost(request, response);    }    /**     * @see HttpServlet#doPost(HttpServletRequest request, HttpServletResponse     *      response)     */    @SuppressWarnings("unchecked")    protected void doPost(HttpServletRequest request,            HttpServletResponse response) throws ServletException, IOException {        // TODO Auto-generated method stub        String newsText = request.getParameter("newsText");        newsText = new String(newsText.getBytes("ISO8859-1"), "utf-8");        ServletContext st = this.getServletContext();        List<Map<String, Integer>> trainSets = (List<Map<String, Integer>>) st                .getAttribute("trainSets");        Map<Integer, String> classifierMap = (Map<Integer, String>) st                .getAttribute("classifierMap");        if (classifierMap == null) {            classifierMap = new HashMap<Integer, String>();            classifierMap.put(0, "IT");            classifierMap.put(1, 
"体育");            classifierMap.put(2, "健康");            classifierMap.put(3, "军事");            classifierMap.put(4, "招聘");            classifierMap.put(5, "教育");            classifierMap.put(6, "文化");            classifierMap.put(7, "旅游");            classifierMap.put(8, "财经");            st.setAttribute("classifierMap", classifierMap);        }        BayesUtil bayes = new BayesUtil();        if (trainSets == null) {            String dirName = "D:/dataMing/bys";            trainSets = bayes.loadTrainSet(dirName);            st.setAttribute("trainSets", trainSets);        }        String classifier = bayes.bayesClassifierText(trainSets, newsText,                classifierMap);        System.out.println(classifier);        request.setAttribute("classifier", classifier);        RequestDispatcher rd = request.getRequestDispatcher("./index.jsp");        rd.forward(request, response);    }}
package com.sogou.util;import java.io.BufferedReader;import java.io.File;import java.io.FileNotFoundException;import java.io.FileReader;import java.io.IOException;import java.io.StringReader;import java.util.ArrayList;import java.util.HashMap;import java.util.LinkedList;import java.util.List;import java.util.Map;import org.wltea.analyzer.core.IKSegmenter;import org.wltea.analyzer.core.Lexeme;public class BayesUtil {    /**     * 加载训练集分类词典目录,对内容分类处理     *      * @param dirName     * @param content     */    public List<Map<String, Integer>> loadTrainSet(String dirName) {        File directory = new File(dirName);        File[] files = directory.listFiles();        BufferedReader br = null;        List<Map<String, Integer>> list = new ArrayList<>(files.length);        // 加载字典        for (int i = 0; i < files.length; i++) {            try {                br = new BufferedReader(new FileReader(files[i]));                Map<String, Integer> hashMap = new HashMap<String, Integer>();                String line = null;                while ((line = br.readLine()) != null) {                    String[] values = line.split("\t");                    hashMap.put(values[0], Integer.parseInt(values[1]));                }                list.add(hashMap);            } catch (FileNotFoundException e) {                // TODO Auto-generated catch block                e.printStackTrace();            } catch (IOException e) {                // TODO Auto-generated catch block                e.printStackTrace();            } finally {                try {                    br.close();                } catch (IOException e) {                    // TODO Auto-generated catch block                    e.printStackTrace();                }            }        }        // 对传入文本或者文件处理        return list;    }    /**     * 对传入的文本分类处理     *      * @param content     */    public String bayesClassifierText(List<Map<String, Integer>> trainSets,            String content, Map<Integer, String> 
textClassifier) {        IKSegmenter ik = new IKSegmenter(new StringReader(content), true);        Lexeme value = null;        List<String> list = new LinkedList<String>();        String text = null;        try {            while ((value = http://www.mamicode.com/ik.next()) != null) {                text = value.getLexemeText();                if (text.length() >= 2) {                    list.add(text);                }            }        } catch (IOException e) {            // TODO Auto-generated catch block            e.printStackTrace();        }        int length = trainSets.size();        long[] maxCfVal = new long[length];        int[] wordsCount = new int[length];        boolean flag = false;        for (String tt : list) {            for (int i = 0; i < length; i++) {                if (!flag) {                    wordsCount[i] = trainSets.get(i).get("wordsCount");                }                Integer iv = trainSets.get(i).get(tt);                if (iv != null) {                    maxCfVal[i] += Math.log((float) iv / wordsCount[i]);                } else {                    maxCfVal[i] += Math.log(1.0 / (wordsCount[i]));                }            }            flag = true;        }        long maxValue = http://www.mamicode.com/maxCfVal[0];        int index = 0;        for (int i = 1; i < length; i++) {            if (maxCfVal[i] > maxValue) {                index = i;                maxValue = maxCfVal[i];            }        }        return textClassifier.get(index);    }    /**     * 对传入的文本文件分类     *      * @param fileName     */    public void bayesClassifierFile(String fileName) {    }}