有点草率,瞎写了一下

数据库表结构

-- One row per scraped match comment; rows are written by the crawler's
-- insert_data() function.
CREATE TABLE `race` (
  -- Surrogate auto-increment primary key.
  `id` int(64) NOT NULL AUTO_INCREMENT,
  -- Match id; the crawler fills it from the "id" field of each listed race.
  `race_id` int(11) DEFAULT NULL,
  -- League name of the match (the crawler passes data['league']['name']).
  `race_name` varchar(255) DEFAULT NULL,
  -- Day that was crawled, passed by the crawler as a YYYYMMDD value.
  `race_time` int(8) DEFAULT NULL,
  -- Id and display name of the comment author.
  `user_id` int(11) DEFAULT NULL,
  `username` varchar(255) DEFAULT NULL,
  -- Comment body; the crawler replaces emoji with text aliases before insert.
  `comment` text,
  -- The comment's "add_time" value from the API
  -- (presumably a Unix timestamp -- TODO confirm).
  `add_time` int(11) DEFAULT NULL,
  PRIMARY KEY (`id`)
-- NOTE(review): AUTO_INCREMENT=76201 looks like a counter carried over from a
-- dump of a populated table; confirm whether a fresh database needs it.
-- NOTE(review): MySQL's `utf8` charset stores at most 3 bytes per character,
-- so 4-byte characters such as emoji cannot be stored in these columns.
) ENGINE=InnoDB AUTO_INCREMENT=76201 DEFAULT CHARSET=utf8

爬虫代码

注:需要 Python 3,运行前先安装依赖模块:pip install requests pymysql emoji fake-useragent。代码写得不够严谨,还有一些小 bug。

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Author:  kkxl

import requests
import json
import time
import pymysql
import datetime
import emoji
from fake_useragent import UserAgent
import logging

# Write log records of level DEBUG and above to the file "my.log".
logging.basicConfig(filename='my.log', level=logging.DEBUG)
def insert_data(race_id,race_name,race_time,user_id,username,comment,addtime):
    """Insert one scraped comment into the ``race`` table.

    A new database connection is opened for the call and is always closed
    before returning. A failed insert is logged and rolled back; it is not
    re-raised, so the crawl continues with the next comment.

    Args:
        race_id: Id of the match the comment belongs to.
        race_name: League name of the match (stored as ``str``).
        race_time: Day that was crawled, as passed by the caller.
        user_id: Id of the comment author.
        username: Display name of the comment author (stored as ``str``).
        comment: Comment text; emoji are replaced by their text aliases.
        addtime: The comment's ``add_time`` value from the API.

    Raises:
        pymysql.MySQLError: If the connection itself cannot be established.
    """
    print(race_id,race_name,race_time,user_id,username,comment,addtime)
    row = (
        race_id,
        str(race_name),
        race_time,
        user_id,
        str(username),
        # Emoji become ":alias:" text so the value fits a 3-byte utf8 column.
        emoji.demojize(str(comment)),
        addtime,
    )
    # Placeholders are filled in by the driver, which escapes every value.
    # Building the statement with "%" left race_name/username unescaped, so a
    # quote in either broke the statement (and allowed SQL injection).
    sql = (
        "insert into race "
        "(race_id,race_name,race_time,user_id,username,comment,add_time) "
        "values (%s, %s, %s, %s, %s, %s, %s)"
    )
    conn = pymysql.connect(connect_timeout=15,host='127.0.0.1', user='root', password='password', database='mydb', port=3306)
    try:
        with conn.cursor() as cursor:
            try:
                cursor.execute(sql, row)
                conn.commit()
                print(cursor.rowcount, "记录已插入")
            except Exception:
                # Best effort: record the failing row with its traceback,
                # undo the partial transaction and carry on.
                logging.exception("insert into race failed for row %r", row)
                conn.rollback()
                print(cursor.rowcount, "------------------异常回滚-")
    finally:
        conn.close()

def getJsonBySearchRaces(day: 'int | str') -> 'dict | None':
    """Fetch the first page of the race list for one day.

    Args:
        day: The day to query; it is interpolated into the ``day=`` query
            parameter as-is (the crawler passes a ``YYYYMMDD`` string).

    Returns:
        The decoded JSON object when the request succeeds and its
        ``"status"`` field equals 200; ``None`` on a network error, an
        undecodable body, or any other status.
    """
    url = "http://api.dszuqiu.com/v6/diary/?day=%s&page=1&token=&only_need=2&per_page=20" % (day)
    headers = {'User-Agent': str(UserAgent().random)}
    try:
        # To go through a proxy, also pass proxies=... to requests.get().
        # The timeout keeps a stalled server from hanging the crawl forever.
        response = requests.get(url, headers=headers, timeout=10)
        response.encoding = 'utf-8'
        jsonData = json.loads(response.text)
    except (requests.RequestException, ValueError):
        # json.JSONDecodeError is a ValueError subclass.
        logging.exception("failed to fetch race list for day %s", day)
        print("异常")
        return None
    if not isinstance(jsonData, dict) or jsonData.get("status") != 200:
        print("Request failed!")
        return None
    return jsonData


def getJsondata(race_id,race_name,race_time):
    """Fetch the first page of comments for one match and store each of them.

    Every comment in the response is passed to ``insert_data`` together with
    the match fields given here. Failures are logged and reported on stdout
    rather than raised, so one bad match does not stop the crawl.

    Args:
        race_id: Id of the match; must be convertible with ``int()``.
        race_name: League name stored alongside each comment.
        race_time: Crawled day stored alongside each comment.

    Raises:
        ValueError: If ``race_id`` cannot be converted to ``int``.
    """
    url = "http://api.dszuqiu.com/v14/race/comment?min_comment_id=0&is_inplay=0&page=1&token=&per_page=30&biaoqing_ver=1&following=0&race_id=%s&view_user_id=0" % (int(race_id))
    time.sleep(0.02)
    print(url)
    headers = {'User-Agent': str(UserAgent().random)}
    try:
        # The timeout keeps a stalled server from hanging the crawl forever.
        response = requests.get(url, headers=headers, timeout=10)
        response.encoding = 'utf-8'
        # Emoji are turned into ":alias:" text before the JSON is parsed.
        jsonData = json.loads(emoji.demojize(response.text))
    except (requests.RequestException, ValueError):
        # json.JSONDecodeError is a ValueError subclass.
        logging.exception("failed to fetch comments for race %s", race_id)
        print("请求异常了")
        return
    if not isinstance(jsonData, dict) or jsonData.get("status") != 200:
        print("Comment Request failed!")
        return
    try:
        for data in jsonData.get('comments') or []:
            try:
                user = data['user']
                fields = (user['user_id'], user['username'], data['comment'], data['add_time'])
            except (KeyError, TypeError):
                # Skip only the malformed record instead of dropping every
                # remaining comment of this match.
                logging.warning("skipping malformed comment for race %s: %r", race_id, data)
                continue
            insert_data(race_id, race_name, race_time, *fields)
            time.sleep(0.01)
    except Exception:
        # Best effort, as before: a storage failure ends this match only.
        logging.exception("failed to store comments for race %s", race_id)
        print("请求异常了")

def spider(race_day,race_time):
    """Crawl the comments of every match listed for one day.

    Args:
        race_day: Day passed to ``getJsonBySearchRaces`` to list the matches.
        race_time: Value stored as the crawled day with every comment.
    """
    jsonData = getJsonBySearchRaces(race_day)
    if not jsonData:
        # The listing request failed (it returns None in that case); there
        # is nothing to crawl for this day.
        logging.warning("no race list for day %s, skipping", race_day)
        return
    for data in jsonData.get("races") or []:
        try:
            race_name = data['league']['name']
            race_id = data["id"]
        except (KeyError, TypeError):
            # A malformed entry should not abort the rest of the day.
            logging.warning("skipping malformed race entry on %s: %r", race_day, data)
            continue
        getJsondata(race_id,race_name,race_time)

def race(start_day,end_day,year=2020,start_month=7,end_month=12):
    """Crawl every day from the start date through the end date, inclusive.

    The window runs from ``year-start_month-start_day`` to
    ``year-end_month-end_day``. With the defaults this is the original
    hard-coded window: July to December 2020. Nothing is crawled when the
    start date lies after the end date.

    Args:
        start_day: Day of the month of the first date (int or numeric str).
        end_day: Day of the month of the last date (int or numeric str).
        year: Calendar year of both dates.
        start_month: Month of the first date.
        end_month: Month of the last date.

    Raises:
        ValueError: If a value is not numeric or does not form a valid date.
    """
    begin = datetime.date(int(year), int(start_month), int(start_day))
    end = datetime.date(int(year), int(end_month), int(end_day))
    delta = datetime.timedelta(days=1)
    d = begin
    while d <= end:
        # The same YYYYMMDD string is used as the API query day and as the
        # value stored with each comment.
        race_day = d.strftime("%Y%m%d")
        spider(race_day, race_day)
        d += delta

def main():
    """Run the crawl for the fixed window 2020-07-12 through 2020-12-15."""
    # To prompt for the window instead, read the two day-of-month values
    # with input() and pass them to race(); it converts them with int().
    race(12, 15)


# Guarded so that importing this module does not start a full crawl.
if __name__ == "__main__":
    main()

版权声明:如无特殊说明,文章均为本站原创,转载请注明出处

本文链接:http://kkxl95.cn/article/1608186837/