有点草率,瞎写了一下
数据库表结构
-- One row per scraped comment on a race (match).
-- Changes from the dumped definition:
--   * integer display widths (int(64), int(11), int(8)) dropped: they never
--     affected storage or range and are deprecated in current MySQL;
--   * the dumped AUTO_INCREMENT=76201 seed removed so a fresh table starts at 1;
--   * utf8 (3-byte utf8mb3) replaced by utf8mb4 so 4-byte characters such as
--     emoji can be stored;
--   * IF NOT EXISTS and a terminating semicolon added so the script can be
--     re-run safely.
CREATE TABLE IF NOT EXISTS `race` (
    `id`        INT          NOT NULL AUTO_INCREMENT,  -- surrogate key
    `race_id`   INT          DEFAULT NULL,             -- race identifier as reported by the remote API
    `race_name` VARCHAR(255) DEFAULT NULL,             -- the crawler stores the league name here
    `race_time` INT          DEFAULT NULL,             -- the crawler stores the race day as a YYYYMMDD number
    `user_id`   INT          DEFAULT NULL,             -- id of the commenting user
    `username`  VARCHAR(255) DEFAULT NULL,             -- display name of the commenting user
    `comment`   TEXT,                                  -- comment body
    `add_time`  INT          DEFAULT NULL,             -- copied from the API's add_time field; presumably a Unix timestamp (TODO confirm)
    PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
爬虫代码
注:运行前需先安装依赖模块(Python 3):pip install requests pymysql emoji fake-useragent。代码写得不够严谨,还有一些小 bug。
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Author: kkxl
"""Crawl race comments from api.dszuqiu.com and store them in the MySQL `race` table.

For every day in a date range the script asks the "diary" endpoint for that
day's races, then fetches the first page of comments for each race and inserts
one database row per comment.
"""
import datetime
import json
import logging
import time
from typing import Optional

import emoji
import pymysql
import requests
from fake_useragent import UserAgent

logging.basicConfig(filename='my.log', level=logging.DEBUG)

# Seconds to wait for an HTTP response; without a timeout `requests` can block forever.
REQUEST_TIMEOUT = 10

# Connection settings for the target database.
# NOTE(review): credentials are hard-coded placeholders; move them to
# configuration before using this anywhere real.
DB_CONFIG = dict(
    connect_timeout=15,
    host='127.0.0.1',
    user='root',
    password='password',
    database='mydb',
    port=3306,
    # Explicit so non-ASCII names/comments are encoded the same way regardless
    # of the installed PyMySQL version's default.
    charset='utf8mb4',
)

# Placeholders are filled in by the driver (never by string formatting), so
# quotes or SQL fragments inside scraped text cannot break or alter the query.
INSERT_SQL = (
    "insert into race "
    "(race_id,race_name,race_time,user_id,username,comment,add_time) "
    "values (%s, %s, %s, %s, %s, %s, %s)"
)


def insert_data(race_id, race_name, race_time, user_id, username, comment, addtime):
    """Insert a single comment row into the `race` table.

    Opens a new connection for the call, runs one parameterized INSERT and
    commits. On a database error the failure is logged with its traceback,
    the transaction is rolled back and the function returns normally
    (best-effort, as before). The connection is always closed.

    Args:
        race_id: Race identifier.
        race_name: Name stored in `race_name` (converted with ``str``).
        race_time: Value stored in `race_time`.
        user_id: Identifier of the commenting user.
        username: Name of the commenting user (converted with ``str``).
        comment: Comment text; emoji are replaced by their ``:name:`` form.
        addtime: Value stored in `add_time`.
    """
    print(race_id, race_name, race_time, user_id, username, comment, addtime)
    row = (
        race_id,
        str(race_name),
        race_time,
        user_id,
        str(username),
        # Kept from the original: emoji are stored as text aliases so the row
        # also fits a table whose charset cannot hold 4-byte characters.
        emoji.demojize(str(comment)),
        addtime,
    )

    conn = pymysql.connect(**DB_CONFIG)
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(INSERT_SQL, row)
            conn.commit()
            print(cursor.rowcount, "记录已插入")
        except pymysql.MySQLError:
            logging.exception("insert failed for row %r", row)
            conn.rollback()
            print(cursor.rowcount, "------------------异常回滚-")
    finally:
        conn.close()


def _fetch_json(url, demojize=False):
    """GET ``url`` with a random User-Agent and return the decoded JSON body.

    Args:
        url: Address to request.
        demojize: When true, emoji in the raw response text are replaced by
            their ``:name:`` aliases before the JSON is parsed.

    Raises:
        requests.RequestException: The request failed or timed out.
        ValueError: The response body is not valid JSON.
    """
    headers = {'User-Agent': str(UserAgent().random)}
    # To route through a proxy, pass proxies=... here.
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.encoding = 'utf-8'
    text = response.text
    if demojize:
        text = emoji.demojize(text)
    return json.loads(text)


def getJsonBySearchRaces(day: int) -> Optional[dict]:
    """Fetch the race list ("diary") for one day.

    Args:
        day: Day in ``YYYYMMDD`` form, interpolated into the request URL.

    Returns:
        The decoded JSON object when its ``status`` field equals 200,
        otherwise ``None`` (request error, invalid JSON or non-200 status).
    """
    url = "http://api.dszuqiu.com/v6/diary/?day=%s&page=1&token=&only_need=2&per_page=20" % (day)
    try:
        payload = _fetch_json(url)
    except (requests.RequestException, ValueError):
        logging.exception("race list request failed: %s", url)
        print("异常")
        return None
    if not isinstance(payload, dict) or payload.get("status") != 200:
        print("Request failed!")
        return None
    return payload


def getJsondata(race_id, race_name, race_time):
    """Fetch the first page of comments for a race and store each comment.

    Only page 1 (up to 30 comments) is requested. A comment that is malformed
    or cannot be stored is logged and skipped, so the remaining comments are
    still processed.

    Args:
        race_id: Race identifier; converted with ``int`` for the URL.
        race_name: Passed through to ``insert_data``.
        race_time: Passed through to ``insert_data``.
    """
    url = "http://api.dszuqiu.com/v14/race/comment?min_comment_id=0&is_inplay=0&page=1&token=&per_page=30&biaoqing_ver=1&following=0&race_id=%s&view_user_id=0" % (int(race_id))
    time.sleep(0.02)  # small pause between requests
    print(url)
    try:
        payload = _fetch_json(url, demojize=True)
    except (requests.RequestException, ValueError):
        logging.exception("comment request failed: %s", url)
        print("请求异常了")
        return
    if not isinstance(payload, dict) or payload.get("status") != 200:
        print("Comment Request failed!")
        return

    for item in payload.get('comments') or []:
        try:
            user = item['user']
            insert_data(
                race_id,
                race_name,
                race_time,
                user['user_id'],
                user['username'],
                item['comment'],
                item['add_time'],
            )
        except Exception:
            # Best-effort crawl: one bad comment must not abort the others.
            logging.exception("failed to store a comment of race %s", race_id)
            print("请求异常了")
        time.sleep(0.01)


def spider(race_day, race_time):
    """Crawl the comments of every race listed for ``race_day``.

    Args:
        race_day: Day in ``YYYYMMDD`` form used to query the race list.
        race_time: Value stored in the `race_time` column for every comment.
    """
    payload = getJsonBySearchRaces(race_day)
    if not payload:
        # The race list could not be fetched; skip the day instead of crashing.
        return
    for entry in payload.get("races") or []:
        try:
            race_name = entry['league']['name']
            race_id = entry["id"]
        except (KeyError, TypeError):
            logging.exception("unexpected race entry on %s: %r", race_day, entry)
            continue
        getJsondata(race_id, race_name, race_time)


def race(start_day, end_day, year=2020, start_month=7, end_month=12):
    """Crawl every day from ``start_month``/``start_day`` to ``end_month``/``end_day``.

    Both ends are inclusive. The defaults reproduce the original fixed range
    (July to December 2020).

    Args:
        start_day: Day of month of the first day (anything ``int`` accepts).
        end_day: Day of month of the last day (anything ``int`` accepts).
        year: Calendar year of both dates.
        start_month: Month of the first day.
        end_month: Month of the last day.

    Raises:
        ValueError: A day/month combination is not a valid calendar date.
    """
    current = datetime.date(year, start_month, int(start_day))
    last = datetime.date(year, end_month, int(end_day))
    one_day = datetime.timedelta(days=1)
    while current <= last:
        race_day = current.strftime("%Y%m%d")
        # The same YYYYMMDD string is used to query the API and as race_time.
        spider(race_day, race_day)
        current += one_day


def main():
    """Run the crawl for the built-in date range."""
    # Interactive alternative:
    # start_day = input("请输入比赛开始日期:")
    # end_day = input("请输入比赛结束日期:")
    # race(start_day, end_day)
    race(12, 15)


if __name__ == "__main__":
    main()
版权声明:如无特殊说明,文章均为本站原创,转载请注明出处
本文链接:http://kkxl95.cn/article/1608186837/
-多抓鱼
1 楼 - 4 年前
太强了