首页 > 代码库 > HTML、XML 等 Dom 结点类解析库Jsoup

HTML、XML 等 Dom 结点类解析库Jsoup

Jsoup 是一款 Java 的 HTML 解析器,可直接解析某个 URL 地址或 HTML 文本内容。它提供了一套非常省力的 API,可通过 DOM、CSS 以及类似于 jQuery 的操作方法来取出和操作数据。

特点:

  • HTML、XML、自定义DOM格式文本解析;
  • 可操作HTML元素、属性、文本;
  • 适用于采集解析网站HTML;
  • DOM解析功能强大。

使用开源库 jsoup-1.8.1.jar,基本用法如下:

 1 package com.zhang.jsoupdemo;
 2 
 3 import android.os.Environment;
 4 import android.support.v7.app.AppCompatActivity;
 5 import android.os.Bundle;
 6 
 7 import org.jsoup.Jsoup;
 8 import org.jsoup.nodes.Document;
 9 import org.jsoup.nodes.Element;
10 import org.jsoup.safety.Whitelist;
11 import org.jsoup.select.Elements;
12 
13 import java.io.File;
14 import java.io.IOException;
15 
16 public class MainActivity extends AppCompatActivity {
17 
18     private String html = "<html><head><title>Jsoup用法</title></head>"
19             + "<body><p><a href=http://www.mamicode.com/‘http://baidu.com‘>这里是 jsoup 项目的相关文章

"; 20 private String url = "http://baidu.com"; 21 22 @Override 23 protected void onCreate(Bundle savedInstanceState) { 24 super.onCreate(savedInstanceState); 25 setContentView(R.layout.activity_main); 26 27 //解析html文本 28 //载入数据 29 Document doc = Jsoup.parse(html); 30 //直接获取title 31 doc.title(); 32 33 34 //解析并提取 HTML 元素 35 Elements eles = doc.getElementsByTag("a"); 36 for (Element ele : eles) { 37 String linkHref = http://www.mamicode.com/ele.attr("href"); 38 String text = ele.text(); 39 } 40 41 //数据筛选、检索 42 Elements elements = doc.select("a[href]"); 43 Elements elements1 = doc.select("img[src$=.png]"); 44 Element element = doc.select("div.className").first(); 45 46 //修改数据 47 doc.select("div.className").attr("key", "value"); 48 doc.select("div.className").addClass("myClass");//class="myClass" 49 //清理数据 50 doc.select("img").removeAttr("onClick"); 51 //转换 52 String htmls = "";//不安全的 53 String safe = Jsoup.clean(htmls, Whitelist.basic());//安全的 54 55 56 //解析url 57 //get方式 58 try { 59 Document document = Jsoup.connect(url).get(); 60 } catch (IOException e) { 61 e.printStackTrace(); 62 } 63 //post方式 64 try { 65 Document document = Jsoup.connect(url).data("key", "value").timeout(3000).post(); 66 } catch (IOException e) { 67 e.printStackTrace(); 68 } 69 70 //解析本地html 71 File input = new File(Environment.getExternalStorageDirectory() + "/index.html"); 72 try { 73 Document document = Jsoup.parse(input,"utf-8","http://baidu.com");// ../baidu.png -> http://baidu.com/baidu.png 74 } catch (IOException e) { 75 e.printStackTrace(); 76 } 77 } 78 }

 

解析 HTML 和 EPUB 的应用示例:

 1 package com.zhang.jsoup;
 2 
 3 import android.support.v7.app.AppCompatActivity;
 4 import android.os.Bundle;
 5 
 6 import org.jsoup.Jsoup;
 7 import org.jsoup.nodes.Document;
 8 import org.jsoup.nodes.Element;
 9 import org.jsoup.select.Elements;
10 
11 import java.io.IOException;
12 import java.io.InputStream;
13 
14 public class MainActivity extends AppCompatActivity {
15 
16     private String url = "http://mobile.csdn.net/";
17 
18     @Override
19     protected void onCreate(Bundle savedInstanceState) {
20         super.onCreate(savedInstanceState);
21         setContentView(R.layout.activity_main);
22 
23         new Thread(new Runnable() {
24             @Override
25             public void run() {
26                 parseHtml();
27                 parseEpub();
28             }
29         }).start();
30 
31     }
32 
33     private void parseHtml() {
34         try {
35             Document doc = Jsoup.connect(url).get();
36             Elements eles = doc.select("div.unit");
37             for (Element ele : eles) {
38                 String title = ele.getElementsByTag("h1").first().text();
39                 String href = http://www.mamicode.com/ele.getElementsByTag("h1").first().getElementsByTag("a").attr("href");
40                 System.out.println(title + "\n" + href);
41             }
42         } catch (IOException e) {
43             e.printStackTrace();
44         }
45     }
46 
47     private void parseEpub() {
48         try {
49             InputStream inputStream = getAssets().open("fb.ncx");
50             int size = inputStream.available();
51             byte[] buffer = new byte[size];
52             inputStream.read(buffer);
53             inputStream.close();
54 
55             String epubText = new String(buffer, "UTF-8");
56 
57             Document document = Jsoup.parse(epubText);
58             String title = document.getElementsByTag("docTitle").first().text();
59             System.out.println(title + "\n");
60 
61             Elements elements = document.getElementsByTag("navPoint");
62             for (Element element : elements) {
63                 String s = element.text();
64                 String imgHref = http://www.mamicode.com/element.getElementsByTag("content").first().attr("src");
65                 System.out.println(s + ":" + imgHref);
66             }
67 
68 
69         } catch (IOException e) {
70             e.printStackTrace();
71         }
72     }
73 }

 

HTML、XML 等 Dom 结点类解析库Jsoup