首页 > 代码库 > 网页行为分析

网页行为分析

去掉重复的id,并且存入数据库:

# Deduplicate visitor IPs from t_useripvisittrace and store each unique IP
# with a sequential id into t_userIP_list.
import MySQLdb

# Open the database connection (credentials were quoted incorrectly in the
# original scrape; host/user/passwd must be string literals).
db = MySQLdb.connect(host="localhost", user="root", passwd="123456",
                     port=3306, charset="utf8", db="db_websiterecommend")
cur = db.cursor()
try:
    cur.execute("select ip from t_useripvisittrace")
    rows = cur.fetchall()

    # Each row is a 1-tuple; a set comprehension removes duplicate IPs.
    # Sorting makes the (id, ip) assignment deterministic across runs,
    # unlike iterating a bare set().
    unique_ips = sorted({row[0] for row in rows})

    # Parameterized bulk insert: (sequential id, ip).
    values = list(enumerate(unique_ips))
    cur.executemany("insert into t_userIP_list values(%s, %s)", values)
    db.commit()
finally:
    # Always release the cursor and connection, even if the insert fails.
    cur.close()
    db.close()

 根据网站情况,进行网站的分类:

# -*- coding: utf-8 -*-
# Classify every visited page path from t_useripvisittrace into a category
# (homepage, a scraped breadcrumb label, or "other") and store the
# (path, category) pairs in t_url_classify.
import MySQLdb
import requests
from lxml import etree

BASE_URL = "http://www.tipdm.org"


def _scrape_category(path, xpath_expr):
    """Fetch BASE_URL+path and return the breadcrumb text at xpath_expr.

    The xpath result is a list; the original code inserted the whole list
    into the database, which is not a valid column value.  Return the first
    match, or the fallback label when the page yields no match.
    """
    html = requests.get(BASE_URL + path)
    nodes = etree.HTML(html.text).xpath(xpath_expr)
    return nodes[0] if nodes else "其他"


db = MySQLdb.connect(host="localhost", user="root", passwd="123456",
                     port=3306, charset="utf8", db="db_websiterecommend")
cur = db.cursor()
try:
    cur.execute("select page_path from t_useripvisittrace")
    rows = cur.fetchall()
    print(len(rows))

    count = 0
    values = []
    for (path,) in rows:
        if path in ("/", "/index.jhtml"):
            # Site root and index page are both the homepage.
            category = "首页"
        elif "index" in path:
            # Section index pages carry their category in this breadcrumb.
            category = _scrape_category(
                path, "/html/body/div[6]/div[2]/div[1]/div/a[2]/text()")
        elif ".jhtml" in path:
            # Article pages use a slightly different breadcrumb location.
            category = _scrape_category(
                path, "/html/body/div[5]/div[2]/div[1]/div[1]/a[2]/text()")
        else:
            category = "其他"

        # Insert exactly one row per page.  The original called
        # executemany() on the ever-growing accumulated list inside the
        # loop, re-inserting every previous row on each iteration.
        cur.execute("insert into t_url_classify values(%s, %s)",
                    (path, category))
        db.commit()
        values.append((path, category))
        count += 1

    print(count)
    print(values)
    print("finish")
finally:
    cur.close()
    db.close()

 使用pandas读取数据库进行统计

# Count visits per IP by streaming t_useripvisittrace through pandas in
# 10000-row chunks, then store the (ip, visit_count) totals in `userip`.
import pandas as pd
from sqlalchemy import create_engine
import MySQLdb

# Connection string format:
# dialect(mysql)+driver(pymysql)://user:password@host:port/database.
engine = create_engine(
    "mysql+pymysql://root:123456@127.0.0.1:3306/db_websiterecommend"
    "?charset=utf8")

# With chunksize set, read_sql returns a lazy iterator of DataFrames;
# no data is read until the loop below consumes it.
chunks = pd.read_sql("t_useripvisittrace", engine, chunksize=10000)

# Merge per-chunk counts with Series.add(fill_value=0) so an IP that
# spans two chunks is totalled correctly.  The original kept per-chunk
# value_counts() separately, so cross-chunk totals were wrong.
ip_counts = pd.Series(dtype="int64")
for chunk in chunks:
    ip_counts = ip_counts.add(chunk["ip"].value_counts(), fill_value=0)

db = MySQLdb.connect(host="localhost", user="root", passwd="123456",
                     port=3306, charset="utf8", db="db_websiterecommend")
cur = db.cursor()
try:
    # One (ip, count) row per IP; int() strips the float that
    # add(fill_value=0) introduces.  Placeholders are %s, %s — the
    # original's "%,%s" was not a valid paramstyle.
    rows = [(ip, int(n)) for ip, n in ip_counts.items()]
    cur.executemany("insert into userip values(%s, %s)", rows)
    db.commit()
    print(rows)
finally:
    cur.close()
    db.close()

 

网页行为分析