首页 > 代码库 > 乌云漏洞爬虫的数据库版本(mysql)

乌云漏洞爬虫的数据库版本(mysql)

特别鸣谢阮思绮同学!虽然感觉这个冷冷的博客也没人看23333

"""Crawl WooYun vulnerability reports and store them in a MySQL table.

The script fetches the search-result pages reachable from ``starturl``,
collects the report URLs they link to, downloads every report, extracts four
text fields (type, info, detail, repair) with regular expressions and inserts
one row per report into ``mytable``.

NOTE(review): the published listing this file was recovered from had lost its
line breaks and most single-quote string delimiters.  String contents below
are restored verbatim from that listing; places where the listing itself was
ambiguous are marked with their own NOTE(review) comments.
"""
import itertools
import os
import re
import sys
import urllib.parse
import urllib.request

import mysql.connector

# --- Database configuration -------------------------------------------------
user = 'root'
pwd = ''
host = '127.0.0.1'
db = 'test'
data_file = 'wooyun.dat'  # not referenced anywhere in this script

create_table_sql = "CREATE TABLE IF NOT EXISTS mytable (id int(10) AUTO_INCREMENT PRIMARY KEY, type varchar(300) , info varchar(1000) , detail varchar(5000) , repair varchar(1000) )CHARACTER SET utf8"
insert_sql = "INSERT INTO mytable (type, info, detail, repair) VALUES ( %s, %s, %s, %s)"
select_sql = "SELECT id, type, info, detail, repair FROM mytable"

# The connection and cursor are module-level because every *_sql_api helper
# below uses the shared ``cursor``.
cnx = mysql.connector.connect(user=user, password=pwd, host=host, database=db)
cursor = cnx.cursor()


def create_table_sql_api(a):
    """Execute the CREATE TABLE statement ``a``.

    On a MySQL error the message is printed and the process exits with a
    non-zero status (raises ``SystemExit``).
    """
    try:
        cursor.execute(a)
    except mysql.connector.Error as err:
        print("create table 'mytable' failed.")
        print("Error: {}".format(err.msg))
        sys.exit(1)


def insert_sql_api(a, b):
    """Execute the parameterized INSERT statement ``a`` with values ``b``.

    On a MySQL error the message is printed and the process exits with a
    non-zero status (raises ``SystemExit``).
    """
    try:
        cursor.execute(a, b)
    except mysql.connector.Error as err:
        print("insert table 'mytable' failed.")
        print("Error: {}".format(err.msg))
        sys.exit(1)


def select_sql_api(a):
    """Execute the SELECT statement ``a`` and print every returned row.

    Each row is expected to unpack into (id, type, info, detail, repair), the
    column order used by ``select_sql``.  On a MySQL error the message is
    printed and the process exits with a non-zero status.
    """
    try:
        cursor.execute(a)
        # Local names avoid shadowing the builtins ``id`` and ``type``.
        for (row_id, bug_type, info, detail, repair) in cursor:
            # The original format string had four placeholders for five
            # values, so ``detail`` was printed under the "repair" label and
            # the real ``repair`` value was dropped.
            print("ID:{}  type:{}  info:{}  detail:{}  repair:{}".format(
                row_id, bug_type, info, detail, repair))
    except mysql.connector.Error as err:
        print("query table 'mytable' failed.")
        print("Error: {}".format(err.msg))
        sys.exit(1)


def get_html_response(url: str) -> str:
    """Download ``url`` and return the body decoded as UTF-8."""
    # The context manager closes the HTTP response even if read/decode fails.
    with urllib.request.urlopen(url) as response:
        html_response = response.read().decode('utf-8')
    return html_response


def geturl(starturl: str) -> list:
    """Return every ``/bugs/wooyun-...`` path found in the page at ``starturl``."""
    a = get_html_response(starturl)
    childurl = re.findall(r'/bugs/wooyun-\w*-\w*\b', a)
    return childurl


def get_nextpage(starturl: str) -> list:
    """Return the paginated search links found in the page at ``starturl``."""
    d = get_html_response(starturl)
    # ``\w+`` keeps multi-digit page numbers intact; the original ``\w``
    # truncated "pNO=12" to "pNO=1".  The dot is escaped so it only matches a
    # literal ".".
    nextpage = re.findall(r'searchbug\.php\?q=6YeR6J6N&pNO=\w+', d)
    return nextpage


def _clean_type(matches):
    """Strip markup from the vulnerability-type matches and concatenate them."""
    parts = []
    for text in matches:
        text = re.sub(r':\s', ':', text)
        text = re.sub(r'\t', '', text)
        text = re.sub(r'</h3>', '', text)
        parts.append(text)
    return "".join(parts)


def _clean_info(matches):
    """Strip markup from the summary matches; all whitespace is removed."""
    parts = []
    for text in matches:
        text = re.sub(r':\s', ':', text)
        text = re.sub(r'<h3>', '', text)
        text = re.sub(r'</h3>', '', text)
        # NOTE(review): the "http://www.mamicode.com/" prefix looks like link
        # rewriting done by the page this listing was published on, not part
        # of the original pattern - confirm against real report pages.
        text = re.sub(r'<a\shref="http://www.mamicode.com/.*.">', '', text)
        text = re.sub(r'</a>', '', text)
        text = re.sub(r'<imgheight=".*./>', '', text)
        # split() + join drops every run of whitespace, as the original did.
        parts.extend(text.split())
    return "".join(parts)


def _clean_detail(matches):
    """Strip markup from the detail matches and concatenate them."""
    parts = []
    for text in matches:
        text = re.sub(r':\s', ':', text)
        text = re.sub(r'<p\sclass="detail">', '', text)
        text = re.sub(r'</p>', '', text)
        # NOTE(review): the "http://www.mamicode.com/" prefixes in the next
        # two patterns look like link rewriting by the hosting page - confirm.
        text = re.sub(r'"\starget="_blank"><img\ssrc="http://www.mamicode.com/upload/.*.width="600"/></a>', ',', text)
        text = re.sub(r'<a href="http://www.mamicode.com/', ' http://www.wooyun.org', text)
        text = re.sub(r'对本漏洞信息进行评价,.*.备学习价值', '', text)
        parts.append(text)
    return "".join(parts)


def _clean_repair(matches):
    """Strip markup from the fix-suggestion matches; all whitespace is removed."""
    parts = []
    for text in matches:
        text = re.sub(r'</br>', ',', text)
        text = re.sub(r'</p>', ',', text)
        text = re.sub(r'</h3>', ',', text)
        text = re.sub(r'<p\sclass="detail">', '', text)
        # NOTE(review): the pattern of this substitution was lost in the
        # published listing (only the ":" replacement survived).  A full-width
        # colon is assumed here - confirm against the original script.
        text = re.sub(r'：', ':', text)
        parts.extend(text.split())
    return "".join(parts)


base_url = 'http://wooyun.org/'
starturl = "http://www.wooyun.org/searchbug.php?q=6YeR6J6N"
result = []
# output=open("D:\\wooyun.csv","w+")

try:
    create_table_sql_api(create_table_sql)

    # Collect the report URLs linked from every search-result page.
    # dict.fromkeys() drops repeated pagination links while keeping order, so
    # each result page is downloaded only once.
    for i in dict.fromkeys(get_nextpage(starturl)):
        # urljoin avoids the "//" the original string concatenation produced
        # for paths that already start with "/".
        result += geturl(urllib.parse.urljoin(base_url, i.replace('金融', '6YeR6J6N')))
    result = set(result)  # remove duplicate report addresses

    for i in result:
        # Download the report page into k.
        k = get_html_response(
            urllib.parse.urljoin(base_url, i.replace('金融', '%E9%87%91%E8%9E%8D')))
        type_wooyun = re.findall(r'漏洞类型:.*.</h3>', k)
        # Find all text wrapped in <h3> tags.
        info = re.findall(r'<h3>\w*:.*.</h3>', k)
        detail = re.findall(r'<p class="detail">.*.</p>', k)
        repair = re.findall(r'修复方案:</h3>\s*<p class="detail">.*.\s*</p>', k)

        final = (
            _clean_type(type_wooyun),  # vulnerability type, for later grouping
            _clean_info(info),         # summary
            _clean_detail(detail),     # details
            _clean_repair(repair),     # suggested fix
        )
        insert_sql_api(insert_sql, final)
        # NOTE(review): this dumps the whole table after every insert, as the
        # original did; move it after the loop if one final dump is enough.
        select_sql_api(select_sql)
        # output.writelines(final)
        # output.writelines('\n\n')

        cnx.commit()
finally:
    # Release the database resources on every exit path, including the
    # SystemExit raised by the *_sql_api helpers.
    cursor.close()
    cnx.close()
# output.close()