简单爬虫从指定地址下载网站内容

首页 > 代码库 > 简单爬虫从指定地址下载网站内容

简单爬虫从指定地址下载网站内容

2024-10-13 00:54:02 214人阅读

Http01App.java 
1. 使用了多线程、IO 流和 java.net（网络包）

package main;
import java.io.*;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
/**
 * Minimal crawler example: downloads a single HTML page over HTTP on a worker thread,
 * prints the download progress and the page to the console, and saves the page to a local file.
 *
 * Created by lxj-pc on 2017/6/27.
 */
public class Http01App {
    /**
     * Starts one worker thread that downloads a fixed URL.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String url = "http://tuijian.hao123.com:80/index.html";
        // Run the download on its own thread; the task stores the page as dirPath/fileName.
        new Thread(new DownloadHtmlTask(url)).start();
    }

    /**
     * Downloads the page at {@code url}, reports progress on standard output, saves the page
     * as {@code dirPath/fileName} and echoes it to standard output.
     */
    static class DownloadHtmlTask implements Runnable {
        private String url;
        String fileName = "hao123.html";
        String dirPath = "d:/lxj";

        /**
         * @param url address of the HTML page to download
         */
        public DownloadHtmlTask(String url) {
            this.url = url;
        }

        /**
         * Performs the download. I/O failures are reported on standard error and end the task;
         * nothing is written to disk unless the server answers with HTTP 200.
         */
        @Override
        public void run() {
            HttpURLConnection conn = null;
            try {
                URL htmlURL = new URL(url);
                URLConnection urlConnection = htmlURL.openConnection();
                conn = (HttpURLConnection) urlConnection;

                // Check the status before requesting the body: getInputStream() throws for
                // error statuses, so checking afterwards would never see them.
                int status = conn.getResponseCode();
                if (status != HttpURLConnection.HTTP_OK) {
                    System.err.println("Download failed: HTTP " + status + " for " + url);
                    return;
                }

                // Total size announced by the server; -1 when it is not known.
                int contentLength = conn.getContentLength();

                byte[] bytes;
                try (InputStream is = conn.getInputStream()) {
                    bytes = readAll(is, contentLength);
                }

                // The page is assumed to be UTF-8 encoded, as in the original implementation.
                String htmlContent = new String(bytes, "utf-8");
                writerFile(htmlContent, dirPath, fileName);
                System.out.println(htmlContent);
            } catch (MalformedURLException e) {
                System.err.println("Invalid URL: " + url);
                e.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                if (conn != null) {
                    conn.disconnect();
                }
            }
        }

        /**
         * Reads the stream to its end into memory, printing the running byte count and,
         * when the total is known, the percentage downloaded.
         *
         * @param is            stream to drain; the caller closes it
         * @param contentLength expected total number of bytes, or a non-positive value if unknown
         * @return all bytes read from the stream
         * @throws IOException if reading fails
         */
        private byte[] readAll(InputStream is, int contentLength) throws IOException {
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            byte[] buffer = new byte[20 * 1024]; // read at most 20 KB per call
            long curLen = 0; // bytes received so far
            int len;
            while ((len = is.read(buffer)) != -1) {
                baos.write(buffer, 0, len);
                curLen += len;
                System.out.println(curLen + " " + contentLength);
                int p = progressPercent(curLen, contentLength);
                if (p >= 0) {
                    System.out.println("下载进度" + p + "%");
                }
            }
            return baos.toByteArray();
        }

        /**
         * Computes the download progress as a whole percentage.
         * Package-private and static so it can be exercised directly.
         *
         * @param downloaded number of bytes received so far
         * @param total      expected total number of bytes
         * @return {@code downloaded * 100 / total}, or -1 when {@code total} is not positive
         *         (the total is unknown, so no percentage can be computed)
         */
        static int progressPercent(long downloaded, long total) {
            if (total <= 0) {
                return -1;
            }
            // Multiply before dividing; plain integer division would truncate to 0.
            return (int) (downloaded * 100 / total);
        }

        /**
         * Saves the page text as {@code dirPath/fileName}, encoded as UTF-8.
         * Delegates to {@link Http01App#outputFile} so both save paths behave the same way.
         *
         * @param htmlContent text to store
         * @param dirPath     target directory; created if missing
         * @param fileName    name of the file inside {@code dirPath}
         */
        private void writerFile(String htmlContent, String dirPath, String fileName) {
            outputFile(htmlContent, dirPath, fileName);
        }
    }

    /**
     * Writes {@code content} to {@code dirPath/fileName} as UTF-8, replacing any existing file.
     * The directory is created when it does not exist. Failures are reported on standard error
     * and are not propagated to the caller.
     *
     * @param content  text to store
     * @param dirPath  target directory; created if missing
     * @param fileName name of the file inside {@code dirPath}
     */
    public static void outputFile(String content, String dirPath, String fileName) {
        File dir = new File(dirPath);
        // mkdirs() also returns false when the directory already exists, hence the second check.
        if (!dir.mkdirs() && !dir.isDirectory()) {
            System.err.println("Cannot create directory: " + dir);
            return;
        }
        // try-with-resources closes the stream even when write() throws.
        try (FileOutputStream fileOutputStream = new FileOutputStream(new File(dir, fileName))) {
            fileOutputStream.write(content.getBytes("utf-8"));
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

简单爬虫从指定地址下载网站内容

声明：以上内容来自用户投稿及互联网公开渠道收集整理发布，本网站不拥有所有权，未作人工编辑处理，也不承担相关法律责任。若内容有误或涉及侵权，可进行投诉：投诉/举报。工作人员会在5个工作日内联系你，一经查实，本站将立刻删除涉嫌侵权内容。

联系我们

首页 > 代码库 > 简单爬虫 从指定地址下载网站内容

简单爬虫 从指定地址下载网站内容

看完仍有疑问？有类似问题直接问程序猿

首页 > 代码库 > 简单爬虫从指定地址下载网站内容

简单爬虫从指定地址下载网站内容