首页 > 代码库 > PHP - php抓取页面方法汇总

PHP - php抓取页面方法汇总

    <?php
    // Survey of ways to fetch a web page from PHP.
    //
    // NOTE: this is a demo script. Every section below issues a live HTTP
    // request as soon as the file is loaded.

    // 1. Using file_get_contents()
    $timeout = array(
        'http' => array(
            'timeout' => 5, // timeout in seconds
        ),
    );
    $ctx = stream_context_create($timeout);
    // The second argument is the boolean use_include_path flag.
    $text = file_get_contents("http://www.baidu.com", false, $ctx);
    // var_dump($text);

    // 2. Using fopen()

    /**
     * Fetch a URL with fopen() and a 5 second HTTP timeout.
     *
     * @param string $url URL to open.
     * @return string Everything read from the stream, or '' when the stream
     *                could not be opened or read.
     */
    function request($url) {
        $timeout = array(
            'http' => array(
                'method'  => 'GET',
                'timeout' => 5, // timeout in seconds
            ),
        );
        $ctx = stream_context_create($timeout);
        $response = '';
        if ($fp = fopen($url, "r", false, $ctx)) {
            // Read to EOF in one call. A hand-written
            // `while ($c = fread(...))` loop stops early on a chunk that is
            // exactly "0", because that string is falsy.
            $body = stream_get_contents($fp);
            if ($body !== false) {
                $response = $body;
            }
            fclose($fp);
        }
        return $response;
    }
    $data = request('http://www.baidu.com');
    //var_dump($data);

    // file_get_contents() and fopen() can only open remote files when
    // allow_url_fopen is enabled (php.ini: allow_url_fopen = On).

    // 3. Using cURL

    /**
     * Fetch a URL with cURL.
     *
     * @param string       $url    Target URL.
     * @param string       $method 'GET' or 'POST' (compared case-sensitively).
     * @param array|string $data   POST: passed to CURLOPT_POSTFIELDS as is.
     *                             GET: appended to the URL as a query string;
     *                             arrays/objects go through http_build_query(),
     *                             strings are appended unchanged.
     * @return array array('content' => curl_exec() result (false on failure),
     *               'headers' => curl_getinfo() transfer info). Note that
     *               'headers' holds transfer metadata, not the raw response
     *               headers.
     */
    function request2($url, $method='GET', $data='') {
        $ch = curl_init();

        if ($method == 'POST') {
            curl_setopt($ch, CURLOPT_POST, 1);
            if ($data) {
                curl_setopt($ch, CURLOPT_POSTFIELDS, $data);
            }
        } elseif ($method == 'GET' && $data) {
            // http_build_query() only accepts arrays/objects; the default
            // $data is a string, so strings are appended verbatim.
            $query = (is_array($data) || is_object($data))
                ? http_build_query($data)
                : (string) $data;
            // Do not add a second '?' when the URL already has a query string.
            $url .= (strpos($url, '?') === false ? '?' : '&') . $query;
        }
        //curl_setopt($ch, CURLOPT_HEADER, 1);  // include response headers in the output
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)');
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);   // default 0 prints the result instead of returning it
        curl_setopt($ch, CURLOPT_TIMEOUT, 30);
        $res = curl_exec($ch);
        $info = curl_getinfo($ch);
        curl_close($ch);
        return array('content' => $res, 'headers' => $info);
    }
    $data = request2('http://www.baidu.com');
    //var_dump($data);

    // Author's note: fopen() / file_get_contents() repeat the DNS lookup on
    // every request, whereas cURL caches DNS results, so repeated requests to
    // the same domain need only one lookup and cURL performs better.

    // 4. Using sockets

    /**
     * Split a URL into the pieces needed for a raw HTTP request.
     * Terminates the script via die() when the URL does not match.
     *
     * The pattern accepts https:// URLs, but nothing here performs TLS:
     * callers send plaintext and the default port is always 80.
     *
     * @param string $url
     * @return array array('name' => hostname as written, 'host' => result of
     *               gethostbyname(), 'port' => int, 'suri' => request target).
     */
    function scrape_parse_url($url) {
        // '-' is included in the host class so hyphenated hostnames match.
        $pattern = '/^(http|https):\/\/([a-zA-Z0-9_.-]+)(:(\d+)){0,1}(.*)/i';
        if (!preg_match($pattern, $url, $mathes)) {
            die('URL格式错误!');
        }
        $suri = empty($mathes[5]) ? '/' : $mathes[5];
        if ($suri[0] !== '/') {
            // e.g. "http://host?x=1": the request target must start with '/'.
            $suri = '/' . $suri;
        }
        return array(
            'name' => $mathes[2],
            'host' => gethostbyname($mathes[2]),
            'port' => empty($mathes[4]) ? 80 : (int) $mathes[4],
            'suri' => $suri,
        );
    }

    /**
     * Build the raw HTTP request text shared by request3() and request4().
     *
     * HTTP/1.0 is used on purpose: the callers return the body verbatim and
     * do not decode chunked transfer encoding, which a server may only use
     * when the request declares HTTP/1.1.
     *
     * @param string $method   HTTP method; a body is attached only for 'POST'.
     * @param string $suri     Request target (path plus query).
     * @param string $hostname Hostname as written in the URL (not the IP).
     * @param int    $port     Destination port.
     * @param string $postdata Already url-encoded form body.
     * @return string
     */
    function scrape_build_request($method, $suri, $hostname, $port, $postdata) {
        $request  = "$method $suri HTTP/1.0\r\n";
        // Name-based virtual hosts need the hostname here, not the resolved IP.
        $request .= "Host: " . ($port == 80 ? $hostname : "$hostname:$port") . "\r\n";
        $request .= "Connection: Close\r\n";
        if ($method == 'POST') {
            $request .= "Content-Type: application/x-www-form-urlencoded\r\n";
            $request .= "Content-Length: " . strlen($postdata) . "\r\n";
        }
        $request .= "\r\n"; // blank line ends the header section
        if ($method == 'POST') {
            $request .= $postdata;
        }
        return $request;
    }

    /**
     * Pick the requested part of a raw HTTP response.
     *
     * @param string $response Raw response text.
     * @param int    $rettype  1 = body, 2 = header section, anything else =
     *                         the whole response.
     * @return string
     */
    function scrape_select_part($response, $rettype) {
        $data = preg_split('/\r\n\r\n/', $response, 2);
        if ($rettype == 1) {
            // No separator (e.g. empty response) means there is no body.
            return isset($data[1]) ? $data[1] : '';
        }
        if ($rettype == 2) {
            return $data[0];
        }
        return $response;
    }

    // 4.1 Using socket_create()

    /**
     * Fetch a URL over a raw socket from the sockets extension.
     * Terminates the script via die() on URL, connect or write errors.
     *
     * @param string $url      http:// URL (https is not encrypted, see scrape_parse_url()).
     * @param int    $rettype  1 = body, 2 = headers, other = full response.
     * @param string $method   'GET' or 'POST'.
     * @param string $postdata Url-encoded body for POST.
     * @return string
     */
    function request3($url, $rettype=1, $method='GET', $postdata='') {
        // Parse the URL
        $target = scrape_parse_url($url);

        // Step 1: create the socket
        $socket = socket_create(AF_INET, SOCK_STREAM, SOL_TCP);
        if (!$socket) {
            die('创建scoket失败!'.socket_strerror(socket_last_error()));
        }
        // Step 2: connect
        $sconn = socket_connect($socket, $target['host'], $target['port']);
        if (!$sconn) {
            die('初始化scoket连接失败!'.socket_strerror(socket_last_error($socket)));
        }
        // Step 3: send the request. socket_write() may write only part of the
        // buffer, so keep going until everything has been sent.
        $header = scrape_build_request($method, $target['suri'], $target['name'], $target['port'], $postdata);
        $total = strlen($header);
        $sent = 0;
        while ($sent < $total) {
            $bytes = socket_write($socket, substr($header, $sent), $total - $sent);
            if ($bytes === false) {
                die('写入scoket失败!'.socket_strerror(socket_last_error($socket)));
            }
            $sent += $bytes;
        }
        // Step 4: read until the peer closes the connection. Compare against
        // false and '' explicitly so a chunk of "0" does not end the loop.
        $response = '';
        while (true) {
            $v = socket_read($socket, 4096);
            if ($v === false || $v === '') {
                break;
            }
            $response .= $v;
        }
        // Step 5: close the socket
        socket_close($socket);

        return scrape_select_part($response, $rettype);
    }
    $response = request3('http://www.baidu.com', 1);
    // var_dump($response);

    // 4.2 Using fsockopen()

    /**
     * Fetch a URL over a stream socket opened with fsockopen().
     * Terminates the script via die() on URL or connect errors.
     *
     * @param string $url      http:// URL (https is not encrypted, see scrape_parse_url()).
     * @param int    $rettype  1 = body, 2 = headers, other = full response.
     * @param string $method   'GET' or 'POST'.
     * @param string $postdata Url-encoded body for POST.
     * @return string
     */
    function request4($url, $rettype=1, $method='GET', $postdata='') {
        // Parse the URL
        $target = scrape_parse_url($url);

        // Step 1: open the connection
        $fp = fsockopen($target['host'], $target['port'], $errno, $errstr);
        if (!$fp) {
            die('打开scoket连接失败!'.$errstr);
        }

        // Step 2: write the request
        $header = scrape_build_request($method, $target['suri'], $target['name'], $target['port'], $postdata);
        fwrite($fp, $header);

        // Step 3: read the whole response
        $response = stream_get_contents($fp);
        if ($response === false) {
            $response = '';
        }

        // Step 4: close the handle
        fclose($fp);

        return scrape_select_part($response, $rettype);
    }

    $response = request4('http://www.163.com');
    // var_dump($response);

    // 5. Using Snoopy

    /**
     * Fetch a URL through the Snoopy class loaded from
     * interview_lib/snoopy.php (not part of this file).
     *
     * @param string $url
     * @return mixed Whatever Snoopy leaves in ->results after fetch() —
     *               presumably the page content; confirm against the library.
     */
    function request5($url) {
        include_once 'interview_lib/snoopy.php';
        $snoopy = new snoopy();
        $snoopy->referer = 'http://www.sina.com';
        $snoopy->fetch($url);
        return $snoopy->results;
    }
    $data = request5('http://www.163.com');
    var_dump($data);

    // Author's note: Snoopy is convenient - a ready-made wrapper that fetched
    // every page tried in testing.
    ?>

参考:http://blog.csdn.net/lxzo123/article/details/6718771 

         http://www.nowamagic.net/librarys/veda/detail/2585

         http://www1.phpchina.com/archives/view-42979-1.html

         http://blog.csdn.net/lxzo123/article/details/6718771

 

PHP - php抓取页面方法汇总