首页 > 代码库 > java词频统计——web版支持
java词频统计——web版支持
需求概要:
1.把程序迁移到web平台,通过用户上传TXT的方式接收文件。
2.用户直接输入要统计的文本,服务器返回统计结果。
3.在页面上给出链接 (如果有封皮、作者、字数、页数等信息更佳)或表格,展示经典英文小说词频统计结果;
4.支持用户自定义单词分隔符;
5.词汇范围对比(额外项)。
分析和设计:
1.创建web工程,利用servlet上传文件的技术实现用户向服务器上传文件。页面设置表单类型为enctype="multipart/form-data",创建文件上传文本框<input type="file" id="upfilename" name="upfilename" value="" />,服务器端使用Part p = request.getPart("upfilename");获取上传的文件,然后写入到指定地址即可。
2.直接分析用户post到服务器的内容,为了使用原有的api,可以将输入内容写到文件中,再进行分析。
3.页面展示统计结果
4.用户可以输入自定义的分隔符和设置显示统计结果前10行(可修改)。需要修改原词频统计的有效字符函数。
5.暂时不考虑
部分代码实现:
表单实现
<!--
  Two entry points for the word counter:
    1. multipart upload of a text file, posted to "upload";
    2. pasted text, posted to "wordcount" together with the number of rows to show.
  NOTE(review): id="txtform" and id="splitter" are each used twice on this page.
  They are left untouched so existing CSS/JS selectors keep working, but ids
  should be unique; consider renaming once the callers are known.
-->
<div align="center" id="txtform">
  <form action="upload" method="post" enctype="multipart/form-data">
    <input type="file" id="upfilename" name="upfilename" value="" />
    自定义分隔符<input type="text" id="splitter" name="splitter">
    <input type="submit" id="submit" value="上传" />
  </form>
</div>
<div align="center" id="txtform">
  <form action="wordcount" method="post">
    <div align="center">待统计内容</div>
    <textarea name="content" id="content"
              style="width: 700px; height: 200px;"></textarea>
    <br> 统计前<select id="num" name="num">
      <option value="10">10</option>
      <option value="20">20</option>
      <option value="0">所有</option>
    </select>项 <br>自定义分隔符<input type="text" id="splitter" name="splitter">
    <input type="submit" value="提交" />
    <!-- Plain ASCII quotes are required here; typographic quotes are a JavaScript syntax error. -->
    <input type="button"
           onclick="if(confirm('确认重置?')){this.form.reset()}" value="重置">
  </form>
</div>
文件上传:
1 request.setCharacterEncoding("UTF-8"); 2 response.setCharacterEncoding("UTF-8"); 3 response.setContentType("text/html"); 4 PrintWriter out = response.getWriter(); 5 byte b[] = new byte[2048]; 6 @SuppressWarnings("unused") 7 int len = 0; 8 Part p = request.getPart("upfilename"); 9 if(p==null){10 System.out.println("p == null");11 }12 String splitter = request.getParameter("splitter");13 InputStream in = p.getInputStream();14 String name = ""+System.currentTimeMillis();15 FileWriter fr = new FileWriter("D:\\upload\\" + name+".txt");16 while ((len = in.read(b)) > 0) {17 fr.write(new String(b));18 }19 fr.close();20 out.println("uploaded");21 response.sendRedirect("wordcount?id="+name+"&splitter"+splitter);22 out.flush();23 out.close();
servlet处理:
1 protected void doGet(HttpServletRequest request, HttpServletResponse response) 2 throws ServletException, IOException { 3 request.setCharacterEncoding("UTF-8"); 4 response.setCharacterEncoding("UTF-8"); 5 response.setContentType("text/html"); 6 PrintWriter out = response.getWriter(); 7 String id = request.getParameter("id"); 8 int num = 10; 9 String filename = "D:\\upload\\" + id + ".txt";10 WordUtil wu = WordUtilFactory.getWordUtil();11 long start = System.currentTimeMillis();12 String splitter = request.getParameter("splitter");13 wu.setSplitter(splitter);14 List<String[]> result = wu.getSortedWordGroupCountBuffered(filename, splitter);15 int size = result.size();16 for (int i = 0; i < (size > num ? num == 0 ? size : num : size); i++) {17 String[] strs = result.get(i);18 out.println(strs[1] + " : " + strs[0] + "<br>");19 }20 long end = System.currentTimeMillis();21 out.println("execution time :" + (end - start) + "ms");22 out.flush();23 out.close();24 }25 26 protected void doPost(HttpServletRequest request, HttpServletResponse response)27 throws ServletException, IOException {28 request.setCharacterEncoding("UTF-8");29 response.setCharacterEncoding("UTF-8");30 response.setContentType("text/html");31 PrintWriter out = response.getWriter();32 String content = request.getParameter("content");33 String numStr = request.getParameter("num");34 int num = 10;35 if (numStr != null) {36 num = Integer.parseInt(numStr);37 }38 WordUtil wu = WordUtilFactory.getWordUtil();39 40 long start = System.currentTimeMillis();41 String filename = "D://tmp.txt";42 43 FileWriter fr = new FileWriter(filename);44 fr.write(content);45 fr.close();46 String splitter = request.getParameter("splitter");47 wu.setSplitter(splitter);48 List<String[]> result = wu.getSortedWordGroupCountBuffered(filename, splitter);49 int size = result.size();50 for (int i = 0; i < (size > num ? num == 0 ? 
size : num : size); i++) {51 String[] strs = result.get(i);52 out.println(strs[1] + " : " + strs[0] + "<br>");53 }54 long end = System.currentTimeMillis();55 out.println("execution time :" + (end - start) + "ms");56 out.flush();57 out.close();58 }
有效字符判定(即自定义分隔符)
1 public void setSplitter(String splitter) { 2 char[] tmp = splitter.toCharArray(); 3 ArrayList<Character> deleted = new ArrayList<>(); 4 for(int i=0;i<tmp.length-1;i++){ 5 if(tmp[i]==‘\\‘){ 6 char c = tmp[i+1]; 7 if(c==‘n‘){ 8 deleted.add(‘\n‘); 9 }10 if(c==‘r‘){11 deleted.add(‘\n‘);12 }13 if(c==‘t‘){14 deleted.add(‘\n‘);15 }16 char[] copy = new char[tmp.length-2];17 for(int j = 0;j <i;j++){18 copy[j]=tmp[j];19 }20 for(int j=i;j<tmp.length-2;j++){21 copy[j]=tmp[j+2];22 }23 i++;24 }25 }26 split = new char[tmp.length+deleted.size()];27 for(int i = 0;i<tmp.length;i++){28 split[i]=tmp[i];29 }30 for(int i=tmp.length;i<split.length;i++){31 split[i]=deleted.get(split.length-tmp.length-1);32 }33 }34 35 private int isCharacter(char ch, String splitter) {36 if (split == null) {37 if ((ch >= ‘a‘ && ch <= ‘z‘))38 return 1;39 if ((ch >= ‘A‘ && ch <= ‘Z‘))40 return 1;41 if (ch >= ‘0‘ && ch <= ‘9‘)42 return 2;43 return 0;44 }45 if (split.equals("")) {46 if ((ch >= ‘a‘ && ch <= ‘z‘))47 return 1;48 if ((ch >= ‘A‘ && ch <= ‘Z‘))49 return 1;50 if (ch >= ‘0‘ && ch <= ‘9‘)51 return 2;52 return 0;53 }54 for (int i = 0; i < split.length; i++) {55 if (ch == split[i]) {56 return 0;57 }58 }59 if ((ch >= ‘a‘ && ch <= ‘z‘))60 return 1;61 if ((ch >= ‘A‘ && ch <= ‘Z‘))62 return 1;63 if (ch >= ‘0‘ && ch <= ‘9‘)64 return 2;65 return 1;66 }
web版工程地址:https://git.coding.net/jx8zjs/wordcount-web.git
ssh: git@git.coding.net:jx8zjs/wordcount-web.git
console版工程地址:https://coding.net/u/jx8zjs/p/wordCount/git
ssh: git@git.coding.net:jx8zjs/wordCount.git
java词频统计——web版支持