[Jediael Search Engine Development 4] Complete Code for V0.01
So far, the following functionality is complete:
1. Given a seed URL, download the pages pointed to by all the links contained in that page.
The main classes are:
1. Main class MyCrawler
2. Page download class PageDownloader
3. Page parsing class HtmlParserTool
4. Interface LinkFilter
The complete code is available in the archived snapshot Jediael_v0.01,
or
https://code.csdn.net/jediael_lu/daopattern/tree/d196da609baa59ef08176322ca61928fbfbdf813
or
http://download.csdn.net/download/jediael_lu/7382011
1. Main class MyCrawler
package org.ljh.search;

import java.util.Iterator;
import java.util.Set;

import org.ljh.search.downloadpage.PageDownloader;
import org.ljh.search.html.HtmlParserTool;
import org.ljh.search.html.LinkFilter;

public class MyCrawler {
    public static void main(String[] args) {
        String url = "http://www.baidu.com";

        // Only follow links whose URL contains "baidu".
        LinkFilter linkFilter = new LinkFilter() {
            @Override
            public boolean accept(String url) {
                return url.contains("baidu");
            }
        };

        try {
            // Download the seed page first, then every page it links to.
            PageDownloader.downloadPageByGetMethod(url);
            Set<String> urlSet = HtmlParserTool.extractLinks(url, linkFilter);
            Iterator<String> iterator = urlSet.iterator();
            while (iterator.hasNext()) {
                PageDownloader.downloadPageByGetMethod(iterator.next());
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
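As a usage note, the seed URL and the "baidu" keyword are hard-coded above. A small hypothetical variant (the class name MyCrawlerMain and the argument handling are assumptions, not part of V0.01) that takes them from the command line instead:

package org.ljh.search;

import org.ljh.search.downloadpage.PageDownloader;
import org.ljh.search.html.HtmlParserTool;
import org.ljh.search.html.LinkFilter;

// Hypothetical variant of MyCrawler: read the seed URL and the keyword
// used by the filter from the command line, falling back to the defaults
// used in V0.01.
public class MyCrawlerMain {
    public static void main(String[] args) throws Exception {
        final String seed = args.length > 0 ? args[0] : "http://www.baidu.com";
        final String keyword = args.length > 1 ? args[1] : "baidu";

        LinkFilter filter = new LinkFilter() {
            @Override
            public boolean accept(String url) {
                return url.contains(keyword);
            }
        };

        PageDownloader.downloadPageByGetMethod(seed);
        for (String link : HtmlParserTool.extractLinks(seed, filter)) {
            PageDownloader.downloadPageByGetMethod(link);
        }
    }
}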
2. Page download class PageDownloader

package org.ljh.search.downloadpage;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import java.io.Writer;
import java.util.Scanner;

import org.apache.http.HttpEntity;
import org.apache.http.HttpStatus;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;

// Downloads the page at the given URL into a local file.
public class PageDownloader {

    public static void downloadPageByGetMethod(String url) throws IOException {
        // 1. Execute an HTTP GET and obtain the response object.
        CloseableHttpClient httpClient = HttpClients.createDefault();
        // Note: the URL must carry the http:// prefix, otherwise a
        // "Target host is null" exception is thrown.
        HttpGet httpGet = new HttpGet(url);
        CloseableHttpResponse response = httpClient.execute(httpGet);

        InputStream is = null;
        if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
            try {
                // 2. Get the response entity.
                HttpEntity entity = response.getEntity();
                // 3. Get its InputStream and save the content to a file.
                is = entity.getContent();
                String fileName = getFileName(url);
                saveToFile("D:\\tmp\\", fileName, is);
            } catch (ClientProtocolException e) {
                e.printStackTrace();
            } finally {
                if (is != null) {
                    is.close();
                }
                if (response != null) {
                    response.close();
                }
            }
        }
    }

    // Writes the input stream to the file named fileName under path.
    private static void saveToFile(String path, String fileName, InputStream is) {
        Scanner sc = new Scanner(is);
        Writer os = null;
        try {
            os = new PrintWriter(path + fileName);
            while (sc.hasNextLine()) {
                // Re-append the line break that Scanner.nextLine() strips.
                os.write(sc.nextLine() + "\n");
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (sc != null) {
                sc.close();
            }
            if (os != null) {
                try {
                    os.flush();
                    os.close();
                } catch (IOException e) {
                    e.printStackTrace();
                    System.out.println("Failed to close the output stream!");
                }
            }
        }
    }

    // Replaces characters that are illegal in file names with underscores.
    private static String getFileName(String url) {
        // Strip the leading "http://" (7 characters).
        url = url.substring(7);
        String fileName = url.replaceAll("[\\?:*|<>\"/]", "_") + ".html";
        return fileName;
    }
}
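One caveat with saveToFile(): reading through a Scanner treats the page as text in the platform default charset, so pages in another encoding or binary resources can be corrupted. A minimal alternative sketch (the class and method names here are illustrative, not part of the project) that copies the raw bytes instead:

package org.ljh.search.downloadpage;

import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

// Hypothetical alternative to saveToFile(): copy the raw bytes instead of
// reading lines through a Scanner, so line breaks, encodings, and non-text
// content survive unchanged.
public class StreamSaver {
    public static void saveRawToFile(String path, String fileName, InputStream is)
            throws IOException {
        OutputStream os = new FileOutputStream(path + fileName);
        try {
            byte[] buffer = new byte[4096];
            int n;
            while ((n = is.read(buffer)) != -1) {
                os.write(buffer, 0, n);
            }
        } finally {
            os.close();
        }
    }
}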
3. Page parsing class HtmlParserTool
package org.ljh.search.html;

import java.util.HashSet;
import java.util.Set;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

// Utility class for parsing HTML documents.
public class HtmlParserTool {

    // Extracts the links embedded in the HTML document at the given URL.
    public static Set<String> extractLinks(String url, LinkFilter filter) {
        Set<String> links = new HashSet<String>();
        try {
            // 1. Construct a Parser and set its properties.
            Parser parser = new Parser(url);
            parser.setEncoding("gb2312");

            // 2.1. A custom filter that matches <frame> tags, so the src
            // attribute can be extracted from them.
            NodeFilter frameNodeFilter = new NodeFilter() {
                @Override
                public boolean accept(Node node) {
                    return node.getText().startsWith("frame src=");
                }
            };

            // 2.2. Combine it with a filter for <a> tags.
            OrFilter linkFilter = new OrFilter(
                    new NodeClassFilter(LinkTag.class), frameNodeFilter);

            // 3. Collect every node that matches either filter.
            NodeList list = parser.extractAllNodesThatMatch(linkFilter);
            for (int i = 0; i < list.size(); i++) {
                Node tag = list.elementAt(i);
                if (tag instanceof LinkTag) {
                    // An <a> tag: read the href directly.
                    LinkTag link = (LinkTag) tag;
                    String linkUrl = link.getLink();
                    if (filter.accept(linkUrl)) {
                        links.add(linkUrl);
                    }
                } else {
                    // A <frame> tag: pull the link out of the src attribute,
                    // e.g. <frame src="test.html"/>.
                    String frame = tag.getText();
                    int start = frame.indexOf("src=\"");
                    if (start != -1) {
                        start += "src=\"".length();
                        int end = frame.indexOf("\"", start);
                        if (end != -1) {
                            String frameUrl = frame.substring(start, end);
                            if (filter.accept(frameUrl)) {
                                links.add(frameUrl);
                            }
                        }
                    }
                }
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
        return links;
    }
}
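To exercise HtmlParserTool on its own, here is a minimal hypothetical driver (the class name PrintLinks, the seed URL, and the filter keyword are only examples) that prints the extracted links instead of downloading them:

package org.ljh.search;

import java.util.Set;

import org.ljh.search.html.HtmlParserTool;
import org.ljh.search.html.LinkFilter;

// Hypothetical standalone usage of HtmlParserTool: list every link found
// on the seed page that passes the filter.
public class PrintLinks {
    public static void main(String[] args) {
        Set<String> links = HtmlParserTool.extractLinks("http://www.baidu.com",
                new LinkFilter() {
                    @Override
                    public boolean accept(String url) {
                        return url.contains("baidu");
                    }
                });
        for (String link : links) {
            System.out.println(link);
        }
    }
}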
4. Interface LinkFilter

package org.ljh.search.html;

// The filter defined by this interface decides whether a URL falls within
// the scope of the current crawl.
public interface LinkFilter {
    public boolean accept(String url);
}
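Since LinkFilter is a single-method interface, a reusable named implementation could replace the anonymous class in MyCrawler. A hypothetical sketch (KeywordLinkFilter is an assumed name, not in the repository), which MyCrawler could then use as new KeywordLinkFilter("baidu"):

package org.ljh.search.html;

// Hypothetical reusable implementation: accept only URLs that contain a
// given keyword.
public class KeywordLinkFilter implements LinkFilter {
    private final String keyword;

    public KeywordLinkFilter(String keyword) {
        this.keyword = keyword;
    }

    @Override
    public boolean accept(String url) {
        return url != null && url.contains(keyword);
    }
}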