首页 > 代码库 > 对节目微博进行强过滤之后的处理
对节目微博进行强过滤之后的处理
1. 对原始数据(.data)进行过滤,利用 Java 实现
package com.bobo.DataPre;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.Closeable;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;

import com.bobo.util.Constants;
import com.bobo.util.Fenci;
import com.bobo.util.StopwordsRemover;
import com.bobo.util.StringUtil;
import com.bobo.util.UtilityForRemoveAtName;

/**
 * Filters per-program weibo files down to the posts that are relevant to the
 * program and writes the processed text of each kept post to a new file.
 *
 * <p>For every entry of {@code Constants.ProgramNameList} the input file
 * {@code <TitleDir>/<programName>.title.uniqByWeiboId} is read and the output
 * file {@code <FilterDir>/<programName>.filter.fenci} is written.
 */
public class ProgramDataFilter {

    /** Location of the stop-word list handed to {@link StopwordsRemover}. */
    private static final String STOPWORDS_PATH = "./conf/stopwords.list";

    /** Input lines must split into exactly this many tab-separated fields. */
    private static final int EXPECTED_COLUMNS = 3;

    /**
     * Runs the filter once for every program in {@code Constants.ProgramNameList},
     * pairing each name with the keyword array at the same index of
     * {@code Constants.keywordsList}, and prints the elapsed seconds per program.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        ProgramDataFilter pre = new ProgramDataFilter();
        for (int i = 0; i < Constants.ProgramNameList.length; i++) {
            // Timer restarts for each program so the printed figure matches the
            // program named in the message instead of accumulating.
            long start = System.currentTimeMillis();

            String programName = Constants.ProgramNameList[i];
            String[] keywords = Constants.keywordsList[i];
            String inFilePath = Constants.TitleDir + File.separator
                    + programName + ".title.uniqByWeiboId";
            String outFilePath = Constants.FilterDir + File.separator
                    + programName + ".filter.fenci";

            pre.dataSetAndRmStop(inFilePath, outFilePath, programName, keywords);

            long end = System.currentTimeMillis();
            System.out.println(programName + "数据预处理,分词、去处停用时、去除@花费的时间为:"
                    + (end - start) / 1000);
        }
    }

    /**
     * Decides whether a weibo text is about the given program.
     *
     * <p>The text must contain the program name. Beyond that it is accepted when
     * there are no filter words at all, when the name appears inside the
     * {@code 《…》} title marks, or when at least one filter word occurs in the text.
     *
     * @param weiboText   text to test; {@code null} is treated as not relevant
     * @param programName program name to look for; {@code null} is treated as not relevant
     * @param filterWords extra words that confirm the match; {@code null} is
     *                    treated like an empty array
     * @return {@code true} when the text is considered relevant to the program
     */
    public boolean isRelative(String weiboText, String programName,
            String[] filterWords) {
        if (weiboText == null || programName == null) {
            return false;
        }
        // The program name itself is mandatory.
        if (!weiboText.contains(programName)) {
            return false;
        }
        // With no filter words configured, the name alone is sufficient.
        if (filterWords == null || filterWords.length < 1) {
            return true;
        }
        if (weiboText.contains("《" + programName + "》")) {
            return true;
        }
        // Otherwise the name must co-occur with one of the filter words.
        for (String keyword : filterWords) {
            if (keyword != null && weiboText.contains(keyword)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Reads a tab-separated input file, keeps the lines whose second field is
     * relevant to {@code programName}, and writes
     * {@code <first field> TAB <processed text>} for each of them.
     *
     * <p>Lines that do not have exactly three fields, whose text is null/empty,
     * or whose processed text ends up null/empty are skipped. The processing
     * chain is {@code UtilityForRemoveAtName.removeName}, then
     * {@code Fenci.testICTCLAS_ParagraphProcess}, then
     * {@code StopwordsRemover.removeStoppingWords} — presumably @-name removal,
     * word segmentation and stop-word removal; confirm against those helpers.
     *
     * <p>Failures are reported on standard output / standard error and do not
     * propagate, so one bad file does not stop the batch in {@link #main}.
     *
     * @param inFilePath  path of the file to read
     * @param outFilePath path of the file to (over)write
     * @param programName program name used for the relevance check
     * @param keywords    filter words passed on to {@link #isRelative}
     */
    private void dataSetAndRmStop(String inFilePath, String outFilePath,
            String programName, String[] keywords) {
        Fenci fenci = new Fenci();
        fenci.initial();
        StopwordsRemover stop = new StopwordsRemover();
        stop.setStoppingListSet(stop.loadStoppingListSet(STOPWORDS_PATH));

        BufferedReader br = null;
        PrintWriter pw = null;
        try {
            // NOTE(review): FileReader/FileWriter use the platform default
            // charset; kept as-is so existing data is read the same way.
            br = new BufferedReader(new FileReader(inFilePath));
            pw = new PrintWriter(new BufferedWriter(new FileWriter(outFilePath)));

            String line;
            while ((line = br.readLine()) != null) {
                String[] lineArr = line.split("\t");
                if (lineArr.length != EXPECTED_COLUMNS) {
                    continue;
                }
                String weiboText = lineArr[1];
                if (StringUtil.isNullOrEmpty(weiboText)) {
                    continue;
                }
                if (!isRelative(weiboText, programName, keywords)) {
                    continue;
                }
                String withoutNames = UtilityForRemoveAtName.removeName(weiboText);
                String segmented = fenci.testICTCLAS_ParagraphProcess(withoutNames);
                String fenciString = stop.removeStoppingWords(segmented);
                if (!StringUtil.isNullOrEmpty(fenciString)) {
                    pw.println(lineArr[0] + "\t" + fenciString);
                }
            }

            // PrintWriter never throws on write; ask it explicitly.
            if (pw.checkError()) {
                System.out.println("ProgramDataFilter.java写入文件出现异常: " + outFilePath);
            }
        } catch (Exception e) {
            e.printStackTrace();
            System.out.println("ProgramDataFilter.java过滤文件出现异常: " + inFilePath);
        } finally {
            // Either stream may still be null if opening it failed.
            closeQuietly(br);
            closeQuietly(pw);
        }
    }

    /** Closes the resource if it was opened, reporting but not propagating failures. */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
生成的文件格式是
用户id “\t” 微博文本(已去除@用户名、分词并去除停用词)
2,提取每个节目下的用户列表,并将用户的列表和用户的profile进行对应
声明:以上内容来自用户投稿及互联网公开渠道收集整理发布,本网站不拥有所有权,未作人工编辑处理,也不承担相关法律责任,若内容有误或涉及侵权可进行投诉: 投诉/举报 工作人员会在5个工作日内联系你,一经查实,本站将立刻删除涉嫌侵权内容。