首页 > 代码库 > htmlparser 基础 网页爬取
htmlparser 基础 网页爬取
要爬取的网页是 http://mm.10086.cn/android/info/300008730468.html?from=www&fw=227062
打开该网页，并打开浏览器的调试模式（开发者工具，F12）
找出你想要爬取的数据
代码
package com.baidu;
import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.HasChildFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import com.baidu.apply.bean.Apply;
import com.baidu.util.WebUtils;
import com.baidu.util.httpDownload;
public class HtmlParse {
public static void main(String[] args) throws ParserException {
Parser parse=new Parser("http://mm.10086.cn/android/info/300008730468.html?from=www&fw=227062");
parse.setEncoding("UTF-8");
//获取应用的名称
Apply apply=new Apply();
String appName="";
TagNameFilter nameFilter = new TagNameFilter("div");
AndFilter andFilter = new AndFilter(nameFilter, new HasAttributeFilter("class","mj_big_title font-f-yh"));
NodeList list= parse.parse(andFilter);
Tag tag=(Tag) list.elementAt(0);
System.out.println(tag.toPlainTextString());
//清零
parse.reset();
//获取应用
andFilter = new AndFilter(nameFilter, new HasAttributeFilter("class","mj_info font-f-yh"));
list= parse.parse(andFilter);
tag=(Tag) list.elementAt(0);
Node children=tag.getFirstChild();
list = children.getChildren();
for (int i = 0; i < list.size(); i++) {
tag = (Tag) list.elementAt(i);
if(i==0){
apply.setDownloadsize(tag.toPlainTextString());
System.out.println(tag.toPlainTextString());
}
System.out.println(tag.toPlainTextString());
String[] split = tag.toPlainTextString().split(":");
for (int j = 0; j < split.length; j++) {
if(i==1&&j==1){
apply.setPrice(split[j]);
}else if(i==2&&j==1){
apply.setVersion(split[j]);
}else if(i==3&&j==1){
apply.setFilesize(split[j]);
}
else if(i==4&&j==1){
apply.setDeveloper(split[j]);
}else if(i==5&&j==1){
apply.setApptype(split[j]);
}else if(i==6&&j==1){
apply.setUpdatetime(split[j]);
}else if(i==7&&j==1){
apply.setPlatform(split[j]);
}
}
}
//清零
parse.reset();
/**
*下载图片
*/
andFilter = new AndFilter(nameFilter,new HasAttributeFilter("class","mj_lunbo"));
list = parse.parse(andFilter);
tag= (Tag) list.elementAt(0);
// parse.reset();
list = tag.getChildren();
HasChildFilter hasChildFilter = new HasChildFilter(new TagNameFilter("img"));
andFilter=new AndFilter(nameFilter,hasChildFilter);
NodeList extractAll = list.extractAllNodesThatMatch(andFilter);
for (int i = 0; i < extractAll.size(); i++) {
tag=(Tag) extractAll.elementAt(i);
System.out.println(tag.getAttribute("id"));
Tag tag1 = (Tag) tag.getFirstChild();
String pic = tag1.getAttribute("src");
httpDownload.httpDownload(pic, apply.getAppname()+WebUtils.getRandomId()+".jpg");
}
parse.reset();
//
// /**
// * 下载apk
// */
//
// nameFilter = new TagNameFilter("div");
//
// andFilter =new AndFilter(nameFilter, new HasAttributeFilter("class", "mj_cont_left_t"));
//
// list = parse.parse(andFilter);
//
// tag =(Tag) list.elementAt(0);
//
// NodeList children1 = tag.getChildren();
//
// tag= (Tag)children1.elementAt(2);
//
// String href = http://www.mamicode.com/tag.getAttribute("href");
//
// httpDownload.httpDownload(href, apply.getAppname()+WebUtils.getRandomId()+".apk");
//
//
// parse.reset();
/**
* 下载二维码
*/
nameFilter=new TagNameFilter("div");
andFilter =new AndFilter(nameFilter,new HasAttributeFilter("class","mj_ewlist"));
list=parse.parse(andFilter);
tag = (Tag) list.elementAt(0);
list = tag.getChildren();
tag = (Tag) list.elementAt(1);
list = tag.getChildren();
for (int i = 0; i < list.size(); i++) {
tag=(Tag) list.elementAt(i);
list= tag.getChildren();
tag = (Tag) list.elementAt(0);
String src = http://www.mamicode.com/tag.getAttribute("src");
httpDownload.httpDownload(src, apply.getAppname()+WebUtils.getRandomId()+".jpg");
}
//添加到数据库
String appid = WebUtils.getRandomId();
String sql="insert into t_app (id,appname,version,description,filesize,updatetime,developer,apptype,price,downloadsize,platform,status) "
+ "values(‘"+appid+"‘,‘"+apply.getAppname()+"‘,‘"+apply.getVersion()+"‘,‘"+apply.getDescription()+"‘,"
+ "‘"+apply.getFilesize()+"‘,‘"+apply.getUpdatetime()+"‘,‘"+apply.getDeveloper()+"‘,‘"+apply.getApptype()+"‘,"
+ "‘"+apply.getPrice()+"‘,‘"+apply.getDownloadsize()+"‘,‘"+apply.getPlatform()+"‘,‘0‘)";
System.out.println(sql);
}
}
htmlparser 基础 网页爬取