首页 > 代码库 > 正则表达式--网页爬虫

正则表达式--网页爬虫

 /**
  * Web crawler example: a program that collects data matching a given rule
  * from some source. This one collects e-mail addresses.
  *
  * <p>Both public entry points read their source line by line and apply the
  * same regular expression to every line, so an address that is split across
  * a line break is not found.
  */
 public class RegexTest2 {

     /**
      * Shape of the text treated as an e-mail address: word characters, an
      * {@code @}, word characters, then one or more {@code .word} groups.
      * Compiled once because both crawl methods use the same rule.
      */
     private static final Pattern MAIL_PATTERN = Pattern.compile("\\w+@\\w+(\\.\\w+)+");

     /**
      * Crawls the hard-coded web page and prints every address found, one per line.
      *
      * @param args unused
      * @throws IOException if the page cannot be opened or read
      */
     public static void main(String[] args) throws IOException {
         List<String> list = getMailsByWeb();

         for (String mail : list) {
             System.out.println(mail);
         }
     }

     /**
      * Reads the page at {@code http://192.168.1.100:8080/myweb/mail.html} and
      * collects every e-mail address it contains.
      *
      * @return the matched addresses in the order they appear; duplicates are
      *         kept; empty (never {@code null}) when nothing matches
      * @throws IOException if the URL cannot be opened or reading fails
      */
     public static List<String> getMailsByWeb() throws IOException {
         URL url = new URL("http://192.168.1.100:8080/myweb/mail.html");

         // try-with-resources closes the reader (and the underlying network
         // stream) even when reading fails part-way through.
         try (BufferedReader bufIn = new BufferedReader(new InputStreamReader(url.openStream()))) {
             return collectMails(bufIn);
         }
     }

     /**
      * Reads the local file {@code c:\mail.html} and collects every e-mail
      * address it contains.
      *
      * @return the matched addresses in the order they appear; duplicates are
      *         kept; empty (never {@code null}) when nothing matches
      * @throws IOException if the file cannot be opened or reading fails
      */
     public static List<String> getMails() throws IOException {
         // try-with-resources releases the file handle on every exit path.
         try (BufferedReader bufr = new BufferedReader(new FileReader("c:\\mail.html"))) {
             return collectMails(bufr);
         }
     }

     /**
      * Scans {@code reader} line by line and gathers every substring matching
      * {@link #MAIL_PATTERN}. The caller owns the reader and must close it.
      */
     private static List<String> collectMails(BufferedReader reader) throws IOException {
         List<String> list = new ArrayList<String>();

         String line;
         while ((line = reader.readLine()) != null) {
             Matcher m = MAIL_PATTERN.matcher(line);
             // A single line may hold several addresses; take them all.
             while (m.find()) {
                 list.add(m.group());
             }
         }
         return list;
     }
 }

 

正则表达式--网页爬虫