首页 > 代码库 > 网页行为分析
网页行为分析
去掉重复的id,并且存入数据库:
# -*- coding: utf-8 -*-
"""Read visitor IPs from t_useripvisittrace, de-duplicate them, and store
(index, ip) pairs into t_userIP_list.

Fixes vs. original: typographic quotes replaced with ASCII quotes;
scraper-injected junk removed from the fetchall() assignment; the
connection is now closed reliably via try/finally.
"""
import MySQLdb

# Open the database connection.
db = MySQLdb.connect(host='localhost', user='root', passwd='123456',
                     port=3306, charset="utf8", db="db_websiterecommend")
try:
    cur = db.cursor()

    # Pull every recorded visitor IP (one row per visit, so duplicates abound).
    cur.execute('select ip from t_useripvisittrace')
    rows = cur.fetchall()
    user_ip = [row[0] for row in rows]

    # set() removes duplicates; NOTE(review): it does not preserve first-seen
    # order, so the numeric index assigned below is arbitrary but unique.
    unique_ips = list(set(user_ip))

    # Build (sequential id, ip) tuples for bulk insertion.
    value = [(i, ip) for i, ip in enumerate(unique_ips)]
    cur.executemany('insert into t_userIP_list values(%s,%s)', value)
    db.commit()
finally:
    # Always release the connection, even if a query fails.
    db.close()
根据网站情况,进行网站的分类:
# -*- coding: utf-8 -*-
"""Classify every visited page_path and store (path, category) rows into
t_url_classify.

Classification rules (unchanged from the original):
  '/' or '/index.jhtml'      -> '首页'
  path containing 'index'    -> breadcrumb text scraped from the live page
  path containing '.jhtml'   -> breadcrumb text scraped (different xpath)
  anything else              -> '其他'

Bug fixes vs. original:
  * The original appended each row to a growing `value` list and re-ran
    executemany(value) on EVERY iteration, re-inserting all earlier rows
    again (quadratic duplicate inserts). Each row is now inserted once.
  * xpath() returns a list; the original stored that raw list as the SQL
    column value. We now store the first text node (or '其他' if absent).
  * Typographic quotes and the scraper-injected `=http://...` junk removed.
  * Dead cursor cur1 / unused `select url_list from urls` query dropped.
"""
import MySQLdb
import requests
from lxml import etree

BASE_URL = 'http://www.tipdm.org'


def _scrape_category(page_path, xpath_expr):
    """Fetch BASE_URL + page_path and return its breadcrumb category text.

    Falls back to '其他' when the xpath matches nothing (e.g. layout change
    or error page), instead of inserting an empty list into the DB.
    """
    html = requests.get(BASE_URL + page_path)
    selector = etree.HTML(html.text)
    found = selector.xpath(xpath_expr)
    return found[0] if found else '其他'


db = MySQLdb.connect(host='localhost', user='root', passwd='123456',
                     port=3306, charset="utf8", db="db_websiterecommend")
try:
    cur = db.cursor()
    cur.execute('select page_path from t_useripvisittrace')
    data = cur.fetchall()
    print(len(data))

    count = 0
    value = []  # kept only for the final summary print, as in the original
    for each in data:
        path = each[0]
        if path in ('/', '/index.jhtml'):
            category = '首页'
        elif 'index' in path:
            # Category pages: breadcrumb lives in a different div than
            # article pages, hence the separate xpath expressions.
            category = _scrape_category(
                path, '/html/body/div[6]/div[2]/div[1]/div/a[2]/text()')
        elif '.jhtml' in path:
            category = _scrape_category(
                path, '/html/body/div[5]/div[2]/div[1]/div[1]/a[2]/text()')
        else:
            category = '其他'

        row = (path, category)
        value.append(row)
        # Insert exactly this one new row (see bug-fix note in docstring).
        cur.execute('insert into t_url_classify values(%s,%s)', row)
        db.commit()
        count += 1
        print(count)

    print(value)
    print('finish')
finally:
    db.close()
使用pandas读取数据库进行统计
# -*- coding: utf-8 -*-
"""Count visits per IP with pandas (chunked read) and store the totals
into the `userip` table.

create_engine connection string format:
  dialect+driver://user:password@host:port/dbname?charset=utf8
With chunksize set, pd.read_sql returns a lazy iterator that yields
10000-row DataFrames instead of loading the whole table at once.

Bug fixes vs. original:
  * SQL placeholder was 'values(%,%s)' -- MySQLdb requires %s for every
    parameter; fixed to 'values(%s,%s)'.
  * The original inserted whole Python lists (all IPs of a chunk, all
    per-chunk counts) as single column values. Counts are now aggregated
    across ALL chunks and stored as one (ip, total_count) row per IP.
  * Python 2 print statements converted to print() calls, matching the
    other scripts in this post.
"""
from collections import Counter

import pandas as pd
from sqlalchemy import create_engine
import MySQLdb

engine = create_engine(
    'mysql+pymysql://root:123456@127.0.0.1:3306/db_websiterecommend?charset=utf8')
# Lazy container: no data is read until the iterator is consumed.
sql = pd.read_sql('t_useripvisittrace', engine, chunksize=10000)

db = MySQLdb.connect(host='localhost', user='root', passwd='123456',
                     port=3306, charset="utf8", db="db_websiterecommend")
try:
    cur = db.cursor()

    # Aggregate frequencies across every chunk; a per-chunk value_counts()
    # alone would under-count IPs that span chunk boundaries.
    totals = Counter()
    for chunk in sql:
        totals.update(chunk['ip'].value_counts().to_dict())

    # int() strips the numpy integer type so MySQLdb can bind the value.
    value = [(ip, int(n)) for ip, n in totals.items()]
    cur.executemany('insert into userip values(%s,%s)', value)
    db.commit()
    print(value)
finally:
    db.close()
网页行为分析
声明:以上内容来自用户投稿及互联网公开渠道收集整理发布,本网站不拥有所有权,未作人工编辑处理,也不承担相关法律责任,若内容有误或涉及侵权可进行投诉: 投诉/举报 工作人员会在5个工作日内联系你,一经查实,本站将立刻删除涉嫌侵权内容。