首页 > 代码库 > 网页爬虫WebCrawler(1)-Http网页内容抓取

网页爬虫WebCrawler(1)-Http网页内容抓取

在windows下的C++通过Http协议实现对网页的内容抓取:

    首先介绍两个重要的包(一般是linux下的开源库,在windows下则调用其动态链接库dll):curl包和pthreads_dll。其中curl包可以理解为命令行浏览器,通过调用其提供的curl_easy_setopt等函数即可实现特定网页内容的获取(要正确地编译并导入curl链接库,还需要另外一个包C-ares)。pthreads是多线程控制包,其中包含了互斥变量的加锁和解锁、线程的创建与管理等函数。

下载地址:点击打开链接。要正确地导入外部动态链接库,需要以下步骤:1,项目->属性->配置属性->C/C++->常规->附加包含目录(添加include的路径);2,项目->属性->配置属性->链接器->常规->附加库目录(添加lib所在的路径);3,在链接器->输入->附加依赖项中添加libcurld.lib;pthreadVC2.lib;ws2_32.lib;winmm.lib;wldap32.lib;areslib.lib;4,在C/C++->预处理器->预处理器定义中添加_CONSOLE;BUILDING_LIBCURL;HTTP_ONLY。

    具体实现过程介绍:

1:自定义HashTable结构,用以记录已获取的字符串(实际存储的是字符串的哈希值)。以HashTable类的形式实现,包含一个std::set类型的哈希值集合,以及ForceAdd、Find和几种常见的字符串哈希函数。

Code:

/// HashTable.h
#ifndef HashTable_H
#define HashTable_H

#include <set>
#include <string>
#include <vector>

/// Records strings by hash value only: the table keeps the set of hash
/// values it has been given, not the strings themselves. Also exposes a
/// collection of classic string hash functions as member functions.
class HashTable
{
public:
    HashTable(void);
    ~HashTable(void);

    /// Hashes `str`, inserts the hash value into the internal set and
    /// returns that hash value.
    unsigned int ForceAdd(const std::string& str);

    /// Hashes `str` and looks the value up in the internal set.
    /// Returns the hash value when it is present, otherwise -1 converted
    /// to unsigned int.
    unsigned int Find(const std::string& str);

    /* Common string hash functions. */
    unsigned int RSHash  (const std::string& str);
    unsigned int JSHash  (const std::string& str);
    unsigned int PJWHash (const std::string& str);
    unsigned int ELFHash (const std::string& str);
    unsigned int BKDRHash(const std::string& str);
    unsigned int SDBMHash(const std::string& str);
    unsigned int DJBHash (const std::string& str);
    unsigned int DEKHash (const std::string& str);
    unsigned int BPHash  (const std::string& str);
    unsigned int FNVHash (const std::string& str);
    unsigned int APHash  (const std::string& str);

private:
    /// Hash values recorded so far.
    std::set<unsigned int> HashFunctionResultSet;

    /// Never filled anywhere in the class; kept so the class layout and
    /// the existing member definitions stay valid.
    std::vector<unsigned int> hhh;
};

#endif
/////HashTable.cpp#include "HashTable.h"HashTable::HashTable(void){}HashTable::~HashTable(void){}unsigned int HashTable::ForceAdd(const std::string& str){	unsigned int i=ELFHash(str);	HashFunctionResultSet.insert(i);	return i;}unsigned int HashTable::Find(const std::string& str){	int ff=hhh.size();	const unsigned int i=ELFHash(str);	std::set<unsigned int>::const_iterator it;	if(HashFunctionResultSet.size()>0)	{		it=HashFunctionResultSet.find(i);		if(it==HashFunctionResultSet.end())			return -1;	}	else	{		return -1;	}	return i;}/*几种常见的字符串hash方式实现函数*/unsigned int HashTable::APHash(const std::string& str){	unsigned int hash=0xAAAAAAAA;	for(std::size_t i=0;i<str.length();i++)	{		hash^=((i & 1) == 0) ? (  (hash <<  7) ^ str[i] * (hash >> 3)) :                               (~((hash << 11) + str[i] ^ (hash >> 5)));	}	return hash;}unsigned int HashTable::BKDRHash(const std::string& str){	unsigned int seed=131;   //31 131 1313 13131 131313 etc	unsigned int hash=0;	for(std::size_t i=0;i<str.length();i++)	{		hash=(hash*seed)+str[i];	}	return hash;}unsigned int HashTable::BPHash(const std::string& str){	unsigned int hash = 0;	for(std::size_t i = 0; i < str.length(); i++)	{		 hash = hash << 7 ^ str[i];	}	return hash;}unsigned int HashTable::DEKHash(const std::string& str){	unsigned int hash = static_cast<unsigned int>(str.length());	for(std::size_t i = 0; i < str.length(); i++)	{		hash = ((hash << 5) ^ (hash >> 27)) ^ str[i];	}	return hash;}unsigned int HashTable::DJBHash(const std::string& str){	unsigned int hash = 5381;    for(std::size_t i = 0; i < str.length(); i++)    {        hash = ((hash << 5) + hash) + str[i];    }    return hash;}unsigned int HashTable::ELFHash(const std::string& str){	unsigned int hash=0;	unsigned int x=0;	for(std::size_t i = 0; i < str.length(); i++)	{		hash=(hash<<4)+str[i];		if((x = hash & 0xF0000000L) != 0)			hash^=(x>>24);		hash&=~x;	}	return hash;}unsigned int HashTable::FNVHash(const std::string& str){	const unsigned int fnv_prime = 
0x811C9DC5;    unsigned int hash = 0;    for(std::size_t i = 0; i < str.length(); i++)    {         hash *= fnv_prime;         hash ^= str[i];    }    return hash;}unsigned int HashTable::JSHash(const std::string& str){	unsigned int hash = 1315423911;	for(std::size_t i = 0; i < str.length(); i++)	{		hash ^= ((hash << 5) + str[i] + (hash >> 2));	}	return hash;}unsigned int HashTable::PJWHash(const std::string& str){	 unsigned int BitsInUnsignedInt = (unsigned int)(sizeof(unsigned int) * 8);	 unsigned int ThreeQuarters     = (unsigned int)((BitsInUnsignedInt  * 3) / 4);	 unsigned int OneEighth         = (unsigned int)(BitsInUnsignedInt / 8);	 unsigned int HighBits          = (unsigned int)(0xFFFFFFFF) << (BitsInUnsignedInt - OneEighth);	 unsigned int hash              = 0;	 unsigned int test              = 0;	      for(std::size_t i = 0; i < str.length(); i++)	 {		  hash = (hash << OneEighth) + str[i];		  if((test = hash & HighBits)  != 0)			  hash = (( hash ^ (test >> ThreeQuarters)) & (~HighBits));	 }	 return hash;}unsigned int HashTable::RSHash(const std::string& str){	unsigned int b    = 378551;    unsigned int a    = 63689;    unsigned int hash = 0;	for(std::size_t i = 0; i < str.length(); i++)	{		hash = hash * a + str[i];        a    = a * b;	}	return hash;}unsigned int HashTable::SDBMHash(const std::string& str){	unsigned int hash = 0;	for(std::size_t i = 0; i < str.length(); i++)	{		hash = str[i] + (hash << 6) + (hash << 16) - hash;	}	return hash;}


2:实现线程间的互斥处理函数(另外提供进行当前操作的线程ID,以便实现加锁机制)。以SingleTone类实现,该类只能由静态函数Instance建立唯一的类对象,并以互斥的方式实现对HashTable的基本操作,其中变量的加锁和解锁由mutex类来实现,具体参见代码:

/// mutex.h
#ifndef mutex_H
#define mutex_H
#pragma once

#include "pthread.h"

/// Scoped lock for a pthread mutex: the constructor locks the mutex it is
/// given and the destructor unlocks it, so the mutex is released on every
/// exit path of the enclosing scope.
///
/// The guard only keeps a reference; the pthread_mutex_t must outlive it.
class mutex
{
    pthread_mutex_t& m_mutex;

    // Copying is forbidden: a copied guard would unlock the same mutex a
    // second time when it is destroyed. Declared private and left undefined
    // so the header does not require C++11 `= delete`.
    mutex(const mutex&);
    mutex& operator=(const mutex&);

public:
    /// Locks `m`. `explicit` keeps a pthread_mutex_t from being silently
    /// converted into a guard.
    explicit mutex(pthread_mutex_t& m) : m_mutex(m)
    {
        pthread_mutex_lock(&m_mutex);
    }

    /// Unlocks the mutex locked by the constructor.
    ~mutex(void)
    {
        pthread_mutex_unlock(&m_mutex);
    }
};

#endif


 

////SingleTone.h#ifndef SingleTone_H#define SingleTone_H#include <string>#include <list>#include <map>#include "Constants.h"#include "HashTable.h"#include "pthread.h"#include "curl/curl.h"class SingleTone{public:	static SingleTone* Instance();	void push_back(std::string s);	void pop_back();	int size();	std::list<std::string>::reference back();	std::list<std::string>::iterator begin();	std::list<std::string>::iterator end();	void push_front(std::string s);	bool empty();	unsigned int Get_m_UniqueMap_ForceAdd(const std::string& key,const std::string& url);	unsigned int Get_m_UniqueMap_Find(const std::string& key,const std::string& url);	HashTable Get_m_UniqueMap(const std::string& key);	void Set_m_UniqueMap(const std::string& key,HashTable& hash);	CURL* GetpCurl();	protected:	SingleTone();	~SingleTone();	pthread_mutex_t m_singleton_mutex;private:	static SingleTone* m_pSingleTone;	std::list<std::string> m_LinkStack;	std::map<std::string,HashTable> m_UniqueMap;	CURL *m_pcurl;};#endif
#include "SingleTone.h"
#include "mutex.h"

SingleTone* SingleTone::m_pSingleTone = NULL;

// Serialises the lazy creation in Instance(). Statically initialised, so it
// is usable before any SingleTone object exists.
static pthread_mutex_t s_instanceMutex = PTHREAD_MUTEX_INITIALIZER;

/// Initialises the member mutex and creates the shared CURL easy handle.
SingleTone::SingleTone()
{
    pthread_mutex_init(&m_singleton_mutex, NULL);
    m_pcurl = curl_easy_init();
}

/// Destroys the member mutex. The CURL handle is not released here;
/// Http::DeInitCurl() is the code that calls curl_easy_cleanup on it.
SingleTone::~SingleTone()
{
    pthread_mutex_destroy(&m_singleton_mutex);
}

/// Returns the shared object, creating it on the first call.
/// The check-and-create step runs under a lock so that two threads calling
/// Instance() at the same time cannot both construct an object.
SingleTone* SingleTone::Instance()
{
    mutex guard(s_instanceMutex);
    if (m_pSingleTone == NULL)
    {
        m_pSingleTone = new SingleTone();
    }
    return m_pSingleTone;
}

/// Appends `s` to the link list.
void SingleTone::push_back(std::string s)
{
    mutex m(m_singleton_mutex);
    m_LinkStack.push_back(s);
}

/// Removes the last element of the link list. The list must not be empty.
void SingleTone::pop_back()
{
    mutex m(m_singleton_mutex);
    m_LinkStack.pop_back();
}

/// Number of elements in the link list, read under the same lock the
/// mutating methods take.
int SingleTone::size()
{
    mutex m(m_singleton_mutex);
    return static_cast<int>(m_LinkStack.size());
}

/// Iterator to the first element. The iterator is handed out without any
/// lock being held while the caller uses it; callers that share the list
/// across threads must coordinate iteration themselves.
std::list<std::string>::iterator SingleTone::begin()
{
    return m_LinkStack.begin();
}

/// Reference to the last element. The lock covers only the lookup; the
/// returned reference is used by the caller after the lock is released.
std::list<std::string>::reference SingleTone::back()
{
    mutex m(m_singleton_mutex);
    return m_LinkStack.back();
}

/// Past-the-end iterator; see begin() for the locking caveat.
std::list<std::string>::iterator SingleTone::end()
{
    return m_LinkStack.end();
}

/// Prepends `s` to the link list.
void SingleTone::push_front(std::string s)
{
    mutex m(m_singleton_mutex);
    m_LinkStack.push_front(s);
}

/// True when the link list has no elements.
bool SingleTone::empty()
{
    mutex m(m_singleton_mutex);
    return m_LinkStack.empty();
}

/// Records `url` in the HashTable stored under `key` (creating an empty
/// table for a new key) and returns the hash value that was inserted.
unsigned int SingleTone::Get_m_UniqueMap_ForceAdd(const std::string& key, const std::string& url)
{
    mutex m(m_singleton_mutex);
    return m_UniqueMap[key].ForceAdd(url);
}

/// Looks `url` up in the HashTable stored under `key`.
/// operator[] may insert an empty table for an unseen key, so the map is
/// potentially modified here; the lookup therefore runs under the lock, and
/// it works on the stored table directly instead of copying it.
unsigned int SingleTone::Get_m_UniqueMap_Find(const std::string& key, const std::string& url)
{
    mutex m(m_singleton_mutex);
    return m_UniqueMap[key].Find(url);
}

/// Returns a copy of the HashTable stored under `key` (an empty table is
/// created for an unseen key).
HashTable SingleTone::Get_m_UniqueMap(const std::string& key)
{
    mutex m(m_singleton_mutex);
    return m_UniqueMap[key];
}

/// Stores a copy of `hash` under `key`, replacing any previous table.
void SingleTone::Set_m_UniqueMap(const std::string& key, HashTable& hash)
{
    mutex m(m_singleton_mutex);
    m_UniqueMap[key] = hash;
}

/// Returns the CURL easy handle created in the constructor.
CURL* SingleTone::GetpCurl()
{
    return m_pcurl;
}

3:实现HTTP对网页内容的获取:功能包含初始网页内容的获取,和URL设置等函数。这个过程要求是互斥的,所以引入SingleTone类的内容。

Code:

/// Http.h
#ifndef Http_H
#define Http_H

#include "curl/curl.h"
#include "pthread.h"
#include <string>

// NOTE(review): a using-directive in a header leaks into every file that
// includes it. It is kept because code including this header may rely on it.
using namespace std;

/// Fetches the body of a web page over HTTP with libcurl, using the CURL
/// easy handle owned by SingleTone.
class Http
{
public:
    Http(void);
    ~Http(void);

    /// Placeholder overload; always returns false.
    bool InitCurl(void);

    /// Downloads `url` and, on success, stores the response body in
    /// `szbuffer`. Returns true on success, false otherwise.
    bool InitCurl(const std::string& url, std::string& szbuffer);

    /// Releases the CURL handle and libcurl's global state.
    bool DeInitCurl();

    /// Stores `url` as the current URL.
    void setUrl(const std::string& url);
    /// Returns the current URL (a getter, despite the name).
    string setUrl();
    /// Returns a copy of the internal receive buffer.
    const string getBuffer();

private:
    /// libcurl write callback; `f` is the Http object receiving the data.
    /// Returns the number of bytes consumed, as libcurl requires.
    static size_t writer(void* buffer, size_t size, size_t nmemb, void* f);
    /// Appends size * nmemb bytes to m_szbuffer; returns the bytes appended.
    size_t setBuffer(char* buffer, size_t size, size_t nmemb);

    CURL *m_pcurl;
    char m_errorBuffer[CURL_ERROR_SIZE];
    string m_szbuffer;
    string m_szUrl;
    pthread_mutex_t m_http_mutex;
};

#endif
#include "Http.h"
#include "SingleTone.h"
#include "mutex.h"

/// Borrows the shared CURL easy handle from SingleTone.
Http::Http(void)
{
    m_pcurl = SingleTone::Instance()->GetpCurl();
    m_errorBuffer[0] = '\0';   // never expose an uninitialised error string
}

Http::~Http(void)
{
}

/// Placeholder overload; performs no work and always returns false.
bool Http::InitCurl(void)
{
    return false;
}

/// Appends one chunk of received data to m_szbuffer.
/// @return number of bytes appended; 0 when `buffer` is NULL.
size_t Http::setBuffer(char *buffer, size_t size, size_t nmemb)
{
    if (buffer == NULL)
    {
        return 0;
    }
    const size_t byteCount = size * nmemb;
    m_szbuffer.append(buffer, byteCount);
    return byteCount;
}

/// Write callback registered with CURLOPT_WRITEFUNCTION.
/// libcurl treats any return value other than size * nmemb as a write error
/// and aborts the transfer, so the byte count must be returned.
size_t Http::writer(void *buffer, size_t size, size_t nmemb, void* f)
{
    return static_cast<Http*>(f)->setBuffer(static_cast<char*>(buffer), size, nmemb);
}

/// Downloads `url` through the shared CURL handle.
/// @param url       address to fetch.
/// @param szbuffer  receives the response body; written only on success.
/// @return true when curl_easy_perform reports CURLE_OK; false when the
///         handle is NULL or the transfer fails.
bool Http::InitCurl(const std::string& url, std::string& szbuffer)
{
    m_szUrl = url;

    // Without a handle there is nothing to perform.
    if (m_pcurl == NULL)
    {
        return false;
    }

    // NOTE(review): this mutex is initialised and destroyed around the
    // transfer but is never locked anywhere in this class, so it provides
    // no mutual exclusion. Kept as-is; init and destroy are now paired on
    // every path.
    pthread_mutex_init(&m_http_mutex, NULL);

    // Start from an empty buffer so partial data left behind by an earlier
    // failed transfer is not prepended to this response.
    m_szbuffer.clear();
    m_errorBuffer[0] = '\0';

    curl_easy_setopt(m_pcurl, CURLOPT_ERRORBUFFER, m_errorBuffer);
    curl_easy_setopt(m_pcurl, CURLOPT_URL, m_szUrl.c_str());
    // These options take a long; pass long literals through the varargs call.
    curl_easy_setopt(m_pcurl, CURLOPT_HEADER, 0L);
    curl_easy_setopt(m_pcurl, CURLOPT_FOLLOWLOCATION, 1L);
    curl_easy_setopt(m_pcurl, CURLOPT_WRITEFUNCTION, &Http::writer);
    curl_easy_setopt(m_pcurl, CURLOPT_WRITEDATA, this);

    const CURLcode result = curl_easy_perform(m_pcurl);

    pthread_mutex_destroy(&m_http_mutex);

    if (result != CURLE_OK)
    {
        return false;
    }

    szbuffer = m_szbuffer;
    m_szbuffer.clear();
    m_szUrl.clear();
    return true;
}

/// Releases the CURL easy handle and libcurl's global state.
/// NOTE(review): the handle is the one returned by SingleTone::GetpCurl(),
/// so SingleTone and any other Http object still hold the old pointer after
/// this call; they must not use it afterwards.
bool Http::DeInitCurl()
{
    if (m_pcurl != NULL)
    {
        curl_easy_cleanup(m_pcurl);
        m_pcurl = NULL;
    }
    curl_global_cleanup();
    return true;
}

/// Returns a copy of the internal receive buffer.
const string Http::getBuffer()
{
    return m_szbuffer;
}

/// Returns the current URL.
string Http::setUrl()
{
    return m_szUrl;
}

/// Stores `url` as the current URL.
void Http::setUrl(const std::string& url)
{
    m_szUrl = url;
}

其中m_szbuffer存放抓取到的网页内容;抓取成功后,网页内容通过InitCurl函数的引用形参szbuffer返回给调用者。




网页爬虫WebCrawler(1)-Http网页内容抓取