首页 > 代码库 > 对节目微博进行强过滤之后的处理

对节目微博进行强过滤之后的处理

1,对原始数据.data进行过滤,利用Java实现

package com.bobo.DataPre;import java.io.BufferedReader;import java.io.BufferedWriter;import java.io.File;import java.io.FileReader;import java.io.FileWriter;import java.io.IOException;import java.io.PrintWriter;import com.bobo.util.Constants;import com.bobo.util.Fenci;import com.bobo.util.StopwordsRemover;import com.bobo.util.StringUtil;import com.bobo.util.UtilityForRemoveAtName;public class ProgramDataFilter {    /**     * @param args     */    public static void main(String[] args) {        long start = System.currentTimeMillis();        for (int i = 0; i < Constants.ProgramNameList.length; i++) {        }        ProgramDataFilter pre = new ProgramDataFilter();        String inFilePath;        String outFilePath;        String programName;        String[] keywords;        for (int i = 0; i < Constants.ProgramNameList.length; i++) {            programName = Constants.ProgramNameList[i];            keywords = Constants.keywordsList[i];            inFilePath = Constants.TitleDir + File.separator + programName                    + ".title.uniqByWeiboId";            outFilePath = Constants.FilterDir + File.separator + programName                    + ".filter.fenci";            pre.dataSetAndRmStop(inFilePath, outFilePath, programName, keywords);            long end = System.currentTimeMillis();            System.out.println(programName + "数据预处理,分词、去处停用时、去除@花费的时间为:"                    + (end - start) / 1000);        }    }    public boolean isRelative(String weiboText, String programName,            String[] filterWords) {        // 包含节目名称        if (!weiboText.contains(programName)) {            return false;        }        // 对于歧义性小的,单独利用名字就够了        if (filterWords.length < 1) {            return true;        }        if (weiboText.contains("" + programName + "")) {            return true;        }        // 包含节目名称的同时,包含演员名称或者节目类别        for (String keyword : filterWords) {            if (weiboText.contains(keyword)) {                return true;            }     
   }        return false;    }    // 第一步,进行分词、去除停用词、去除@后的用户名称?    private void dataSetAndRmStop(String inFilePath, String outFilePath,            String programName, String[] keywords) {        FileReader fr = null;        BufferedReader br = null;        FileWriter fw = null;        BufferedWriter bw = null;        PrintWriter pw = null;        String line = null;        Fenci fenci = new Fenci();        fenci.initial();        StopwordsRemover stop = new StopwordsRemover();        stop.setStoppingListSet(stop                .loadStoppingListSet("./conf/stopwords.list"));        String weiboText;        try {            fr = new FileReader(inFilePath);            br = new BufferedReader(fr);            fw = new FileWriter(outFilePath);            bw = new BufferedWriter(fw);            pw = new PrintWriter(bw);            while ((line = br.readLine()) != null) {                String[] lineArr = line.split("\t");                if (lineArr.length != 3) {                    continue;                }                weiboText = lineArr[1];                if (StringUtil.isNullOrEmpty(weiboText)) {                    continue;                }                if (!isRelative(weiboText, programName, keywords)) {                    continue;                }                String fenciString = stop.removeStoppingWords(fenci                        .testICTCLAS_ParagraphProcess((UtilityForRemoveAtName                                .removeName(weiboText))));                if (!StringUtil.isNullOrEmpty(fenciString)) {                    pw.println(lineArr[0]+"\t"+fenciString);                }            }        } catch (Exception e) {            e.printStackTrace();            System.out.println("RemoveUrlUtil.java文件去除链接出现异常");        } finally {            try {                br.close();            } catch (IOException e) {                // TODO Auto-generated catch block                e.printStackTrace();            }            pw.close();        }    }}
过滤和分词

生成的文件格式是

用户id “\t” 微博文本(已去除@用户名、分词并去除停用词)

2,提取每个节目下的用户列表,并将用户的列表和用户的profile进行对应