首页 > 代码库 > c++ 爬虫

c++ 爬虫

这是一个简单的 C++ 爬虫（界面使用 Qt，网络部分使用 WinSock），效率并不是很高……

 1 #include<stdio.h> 2 int s1[1000000],s2[1000000]; 3 void fun(int a, int b) 4 { 5     int i,ii; 6     bool t1,t2,t3,t4; 7     s1[0] = s2[0] = s1[1] = s2[1] = 0; 8     for(i=a; i <= b; i++){ 9         ii = i;10         t1 = t2 = t3 = t4 =false;11         while(ii != 0 ){12             int a = ii %10;13             if( a == 5)14             {15                 t1 = true;16             }17             else if( a == 2)18             {19                 t2 = true;20             }21             else if( a == 1)22             {23                 t3 = true;24             }25             ii = ii / 10;26         }27         if(t1 && t2 && t3){28             s1[i-1] = s1[i-2] + 1;29             ii = i;30         while(ii != 0 ){31             int a = ii % 10;32             int b = (ii / 10) % 10;33             int c = (ii / 100) % 10;34             if( c > 0 && a == 1 && b == 2 && c ==5)35                 t4 = true;36             ii = ii / 10;37         }38         if(t4)39             s2[i-1] = s2[i-2] + 1;40         else 41             s2[i-1] = s2[i-2];42         }43         else{44             s2[i-1] = s2[i-2];45             s1[i-1] = s1[i-2];46         }47     }48 }49 50 int main()51 {52     int a,b,i=1;53     fun(2,1000000);54     while(scanf("%d%d",&a,&b) != EOF){55         if(a == 1)56             printf("Case %d:%d %d\n",i,s1[b-1]-s1[a-1],s2[b-1]-s2[a-1]);57         else58             printf("Case %d:%d %d\n",i,s1[b-1]-s1[a-2],s2[b-1]-s2[a-2]);59         i++;60     }61     return 0;62 }
 1 #include"urlThread.h" 2 #include<QFile> 3 #include<QMessageBox> 4 #include<QTextStream> 5 #include <QMainWindow> 6 void urlThread::run() 7 { 8     open(); 9 }10 11 void urlThread::startThread()12 {13     start();14 }15 16 //显示找到的url17 void urlThread::open()18 {19     QString path = "url.txt";20     QFile file(path);21     if (!file.open(QIODevice::ReadOnly | QIODevice::Text)) {22        // QMessageBox::warning(this,tr("Read File"),23         //                     tr("Cannot open file:\n%1").arg(path));24         send("error!cannot open url.txt!");25         return;26     }27     QTextStream in(&file);28     while(in.readLine().compare("") != 0){29         //ui->textBrowser->append(in.readLine());30         send(q2s(in.readLine()));31         Sleep(1);32     }33     file.close();34 }
View Code
 1 #include "mainwindow.h" 2 #include <QApplication> 3  4 int main(int argc, char *argv[]) 5 { 6     QApplication a(argc, argv); 7     MainWindow w; 8     w.setWindowTitle("小小爬虫"); 9     w.show();10 11     return a.exec();12 }
View Code
 1 #include "mainwindow.h" 2 #include "ui_mainwindow.h" 3  4 MainWindow::MainWindow(QWidget *parent) : 5     QMainWindow(parent), 6     ui(new Ui::MainWindow) 7 { 8     ui->setupUi(this); 9     QObject::connect(ui->start,SIGNAL(released()),this,SLOT(beginGeturl()));10     //QObject::connect(ui->display,SIGNAL(released()),this,SLOT(open()));11     QObject::connect(ui->display,SIGNAL(released()),&uth,SLOT(startThread()));12     QObject::connect(&uth,&urlThread::sendMessage,this,&MainWindow::receiveMessage);13     QObject::connect(&crawler,&Crawler::sendMessage,this,&MainWindow::receiveMessage);14 }15 16 MainWindow::~MainWindow()17 {18     delete ui;19 }20 21 void MainWindow::receiveMessage(const QString name)22 {23     ui->textBrowser->append(name);24     ui->textBrowser->moveCursor(QTextCursor::End);25 }26 27 void MainWindow::open()28 {29     QString path = "url.txt";30     QFile file(path);31     if (!file.open(QIODevice::ReadOnly | QIODevice::Text)) {32         QMessageBox::warning(this,tr("Read File"),33                              tr("Cannot open file:\n%1").arg(path));34         return;35     }36     QTextStream in(&file);37     while(in.readLine().compare("") != 0){38         //ui->textBrowser->append(in.readLine());39         crawler.send(q2s(in.readLine()));40     }41     file.close();42 }43 44 void MainWindow::beginGeturl()45 {46     //crawler = new Crawler();47     string url = "" ,dep, filter = "www";48     if(!ui->site->text().isEmpty())49         url = q2s(ui->site->text());50     crawler.addURL(url);51     int depth = 1;52     if(!ui->depth->text().isEmpty())53     {54         url = q2s(ui->depth->text());55         depth = atoi(url.c_str());56     }57     if(!ui->filter->text().isEmpty())58         filter = q2s(ui->filter->text());59     crawler.setJdugeDomain(filter);60     crawler.setDepth(depth);61     crawler.startThread();62 }
View Code
 1 #ifndef CRAWLER_H 2 #define CRAWLER_H 3  4 #include<set> 5 #include<string> 6 #include<queue> 7 #include "winsock2.h" 8 #include <iostream> 9 #include <fstream>10 #include <stdio.h>11 #include<time.h>12 #include<winsock.h>13 #include<QThread>14 15 #pragma comment(lib, "ws2_32.lib")16 using namespace std;17 18 bool ParseURL(const string & url, string & host, string & resource);19 bool GetHttpResponse(const string & url, char * &response, int &bytesRead);20 QString s2q(const string &s);21 string q2s(const QString &s);22 23 #define DEFAULT_PAGE_BUF_SIZE 100000024 25 class Crawler: public QThread26 {27     Q_OBJECT28 private:29     queue<string> urlWaiting;30     set<string> urlWaitset;31     set<string> urlProcessed;32     set<string> urlError;33     set<string> disallow;34     set<string>::iterator it;35     int numFindUrl;36     time_t starttime, finish;37     string filter;38     int depth;39 40 public:41     Crawler(){  filter = "\0";numFindUrl = 0;}42     ~Crawler(){}43     void begin();44     void setDepth(int depth);45     void processURL(string& strUrl);46     void addURL(string url);47     void log(string entry, int num);48     void HTMLParse(string & htmlResponse, const string & host);49     bool getRobotx(const string & url, char * &response, int &bytesRead);50     void setJdugeDomain(const string domain);51     long urlOtherWebsite(string url);52     void send(string s)53     {54         QString qs = s2q(s);55         emit sendMessage(qs);56     }57 signals:58     void sendMessage(const QString name);59 60 public slots:61         bool startThread();62 63 protected:64     void run();65 66 };67 #endif // CRAWLER_H
 1 #ifndef MAINWINDOW_H 2 #define MAINWINDOW_H 3  4 #include <QMainWindow> 5 #include<QFile> 6 #include<QMessageBox> 7 #include<QTextStream> 8 #include<QLineEdit> 9 #include<QDebug>10 #include"crawler.h"11 #include"urlThread.h"12 13 namespace Ui {14 class MainWindow;15 }16 17 class MainWindow : public QMainWindow18 {19     Q_OBJECT20 21 public:22     explicit MainWindow(QWidget *parent = 0);23     ~MainWindow();24     void receiveMessage(const QString name);25 26 public slots:27         void beginGeturl();28         void open();29 30 private:31     Ui::MainWindow *ui;32     Crawler crawler;33     urlThread uth;34 };35 36 #endif // MAINWINDOW_H
View Code
 1 #ifndef URLTHREAD_H 2 #define URLTHREAD_H 3 #include"crawler.h" 4  5 class urlThread: public QThread 6 { 7     Q_OBJECT 8 public slots: 9         void startThread();10         void open();11         void send(string s)12         {13             QString qs = s2q(s);14             emit sendMessage(qs);15         }16 signals:17         void sendMessage(const QString name);18 protected:19         void run();20 };21 22 #endif // URLTHREAD_H
View Code
<?xml version="1.0" encoding="UTF-8"?>
<!-- Qt Designer form for MainWindow: one row of inputs and buttons at the
     top and a text browser for output below. All widgets are placed with
     fixed geometry; no layout manager is used. -->
<ui version="4.0">
 <class>MainWindow</class>
 <widget class="QMainWindow" name="MainWindow">
  <property name="geometry">
   <rect>
    <x>0</x>
    <y>0</y>
    <width>706</width>
    <height>381</height>
   </rect>
  </property>
  <!-- main.cpp replaces this title at startup via setWindowTitle(). -->
  <property name="windowTitle">
   <string>MainWindow</string>
  </property>
  <widget class="QWidget" name="centralWidget">
   <!-- Caption for the start-URL field. -->
   <widget class="QLabel" name="label">
    <property name="geometry">
     <rect>
      <x>10</x>
      <y>20</y>
      <width>54</width>
      <height>21</height>
     </rect>
    </property>
    <property name="text">
     <string>初始网址:</string>
    </property>
   </widget>
   <!-- Start URL; read by MainWindow::beginGeturl() as ui->site. -->
   <widget class="QLineEdit" name="site">
    <property name="geometry">
     <rect>
      <x>70</x>
      <y>20</y>
      <width>151</width>
      <height>20</height>
     </rect>
    </property>
    <property name="text">
     <string>http://www.scut.edu.cn</string>
    </property>
   </widget>
   <!-- Caption for the search-depth field. -->
   <widget class="QLabel" name="label_2">
    <property name="geometry">
     <rect>
      <x>230</x>
      <y>20</y>
      <width>54</width>
      <height>21</height>
     </rect>
    </property>
    <property name="text">
     <string>搜索深度:</string>
    </property>
   </widget>
   <!-- Search depth; read by MainWindow::beginGeturl() as ui->depth. -->
   <widget class="QLineEdit" name="depth">
    <property name="geometry">
     <rect>
      <x>290</x>
      <y>20</y>
      <width>51</width>
      <height>20</height>
     </rect>
    </property>
    <property name="text">
     <string>2</string>
    </property>
   </widget>
   <!-- Caption for the filter-string field. -->
   <widget class="QLabel" name="label_3">
    <property name="geometry">
     <rect>
      <x>360</x>
      <y>20</y>
      <width>71</width>
      <height>21</height>
     </rect>
    </property>
    <property name="text">
     <string>过滤字符串:</string>
    </property>
   </widget>
   <!-- Filter string; read by MainWindow::beginGeturl() as ui->filter. -->
   <widget class="QLineEdit" name="filter">
    <property name="geometry">
     <rect>
      <x>430</x>
      <y>20</y>
      <width>71</width>
      <height>20</height>
     </rect>
    </property>
    <property name="text">
     <string>scut</string>
    </property>
   </widget>
   <!-- Start button; its released() signal triggers MainWindow::beginGeturl(). -->
   <widget class="QPushButton" name="start">
    <property name="geometry">
     <rect>
      <x>510</x>
      <y>20</y>
      <width>51</width>
      <height>23</height>
     </rect>
    </property>
    <property name="text">
     <string>开始</string>
    </property>
   </widget>
   <!-- Show-URL button; its released() signal triggers urlThread::startThread(). -->
   <widget class="QPushButton" name="display">
    <property name="geometry">
     <rect>
      <x>570</x>
      <y>20</y>
      <width>61</width>
      <height>23</height>
     </rect>
    </property>
    <property name="text">
     <string>显示url</string>
    </property>
   </widget>
   <!-- Output area; MainWindow::receiveMessage() appends messages here. -->
   <widget class="QTextBrowser" name="textBrowser">
    <property name="geometry">
     <rect>
      <x>10</x>
      <y>60</y>
      <width>671</width>
      <height>271</height>
     </rect>
    </property>
   </widget>
  </widget>
  <widget class="QMenuBar" name="menuBar">
   <property name="geometry">
    <rect>
     <x>0</x>
     <y>0</y>
     <width>706</width>
     <height>23</height>
    </rect>
   </property>
  </widget>
  <widget class="QToolBar" name="mainToolBar">
   <attribute name="toolBarArea">
    <enum>TopToolBarArea</enum>
   </attribute>
   <attribute name="toolBarBreak">
    <bool>false</bool>
   </attribute>
  </widget>
  <widget class="QStatusBar" name="statusBar"/>
 </widget>
 <layoutdefault spacing="6" margin="11"/>
 <resources/>
 <!-- No connections are declared in the form; they are made in the
      MainWindow constructor. -->
 <connections/>
</ui>
View Code

ui 文件需要自己设计，可以参考上面给出的 XML。

爬虫还有一些小问题：抓取到的 URL 内容并不完整，某些地址的处理也还有一点问题……

貌似博客园没有上传附件的地方?还是我没找到?希望得到提示~