• Create whatever database fields you need yourself
  • Only minor tweaks and debugging are needed before use
  • For learning purposes only; do not use for anything illegal
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Author:  AimerNeige

from urllib.request import urlretrieve
import requests
import json
import re
import os
import pymysql
import datetime
import time
'''
Third-party libraries required:
requests
pymysql
'''

# Root directory used when downloading files to local disk
rootPath = '/home/spider/duitang/'

# Custom insert: write one image record into MySQL
def insert_data(url, msg, picId):
    conn = pymysql.connect(connect_timeout=15,host='127.0.0.1', user='root', password='password', database='izone', port=3306)
    cursor = conn.cursor()

    # If id is set to auto-increment and used as the primary key, you can pass NULL and let it grow on its own.
    '''category_id values:
    1   stickers
    2   wallpapers
    3   avatars
    4   photography
    5   travel
    6   materials
    7   funny & cute pets
    8   humanities & art
    9   beauty & styling
    10  text & quotes
    '''
    sql = """
    insert into photos_photos(id,title,photos,create_date,likes,favorites,is_delete,author_id,category_id) value(%s,%s,%s,%s,%s,%s,%s,%s,%s)
    """
    title = msg
    photos = url
    # create_date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    likes = 0
    is_delete = 0
    favorites = 0
    author_id = 5  # publisher account id
    category_id = 6  # materials, see the category list above
    id = picId
    try:
        cursor.execute(sql, (id, title, photos, datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), likes, favorites, is_delete, author_id, category_id))
        conn.commit()
    except Exception:
        # roll back on error
        conn.rollback()
    conn.close()
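
# The note at the top says to create the database fields yourself; here is a
# minimal sketch of a table matching the INSERT in insert_data. The column
# names come from that statement, but every type and length below is an
# assumption, so adapt them to your own schema.
def create_table():
    conn = pymysql.connect(connect_timeout=15, host='127.0.0.1', user='root', password='password', database='izone', port=3306)
    cursor = conn.cursor()
    cursor.execute("""
    CREATE TABLE IF NOT EXISTS photos_photos (
        id          BIGINT PRIMARY KEY,  -- duitang picture id, supplied by the spider
        title       VARCHAR(255),        -- msg text from the API
        photos      VARCHAR(512),        -- image URL
        create_date DATETIME,
        likes       INT,
        favorites   INT,
        is_delete   TINYINT,
        author_id   INT,
        category_id INT
    )
    """)
    conn.commit()
    conn.close()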

def getJsonBySearch(keyword: str, limit: int, start: int) -> str:
    """
    通过关键词搜索获取图片的json数据
    :param keyword: 搜索关键词
    :param limit:   返回数据数量限制(最大为100,使用大于100的数据会按照100处理)
    :param start:   从何处开始访问数据(索引)
    :return str:    json数据
    """
    url = "https://www.duitang.com/napi/blog/list/by_search/?kw=%s&type=feed&limit=%d&start=%d" % (keyword, limit, start)
    response = requests.get(url)
    if response.status_code != 200:
        print("Request failed!")
        return
    response.encoding = 'utf-8'
    jsonData = response.text
    return jsonData


def getJsonByAlbum(album_id: str, limit: int, start: int) -> str:
    """
    通过专辑id获取图片的json数据
    :param album_id:专辑id
    :param limit:   返回数据数量限制(最大为100,使用大于100的数据会按照100处理)
    :param start:   从何处开始访问数据(索引)
    :return str:    json数据
    """
    url = "https://www.duitang.com/napi/blog/list/by_album/?album_id=%s&limit=%d&start=%d" % (album_id, limit, start)
    response = requests.get(url)
    if response.status_code != 200:
        print("Request failed!")
        return
    response.encoding = 'utf-8'
    jsonData = response.text
    return jsonData
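
# A quick sketch of how the returned JSON is consumed, touching only the fields
# the spider below relies on (status, data.object_list, and each entry's id,
# msg, and photo). printFirstPage is a hypothetical helper, not part of the
# original script.
def printFirstPage(keyword):
    jsonData = getJsonBySearch(keyword, 10, 0)
    if jsonData is None:
        return
    jsonItem = json.loads(jsonData)
    if jsonItem['status'] != 1:
        return
    for objectItem in jsonItem['data']['object_list']:
        # each entry carries an id, a caption (msg), and a photo dict with
        # the image URL (path) and file size in bytes (size)
        print(objectItem['id'], objectItem['msg'], objectItem['photo']['path'])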


def download(url, path, name):
    """
    下载文件
    :param url: 文件链接
    :param path: 保存目录
    :param name: 文件名称
    :return: None
    """
    def reporthook(a, b, c):
        """
        显示下载进度
        :param a: 已经下载的数据块
        :param b: 数据块的大小
        :param c: 远程文件大小
        :return: None
        """
        print("\rdownloading: %5.1f%%" % (a * b * 100.0 / c), end="")

    filePath = os.path.join(path, name)
    if not os.path.isfile(filePath):
        print("Downloading: %s" % url)
        print("File name: %s" % name)
        urlretrieve(url, filePath, reporthook=reporthook)
        print("\nDownload complete!")
    else:
        print("A file with the same name already exists in this directory! Download aborted!")
    filesize = os.path.getsize(filePath)
    print("File size: %.2f MB" % (filesize / 1024 / 1024))


def spider(value: str, getAll: str, minSize: int, withId: str):
    allFlag = getAll.lower() in ("y", "yes")
    if allFlag:
        limit = 100
    else:
        # input() returns a string; it must be converted, since limit is formatted with %d
        limit = int(input("How many images should each request fetch? (100 at most; the API treats larger values as 100)\n"))
    next_start = 0
    while True:
        idFlag = (withId == "y")
        if idFlag:
            jsonData = getJsonByAlbum(value, limit, next_start)
        else:
            jsonData = getJsonBySearch(value, limit, next_start)
        if jsonData is None:
            break
        jsonItem = json.loads(jsonData)
        if jsonItem['status'] != 1:
            print("Failed to fetch data! This information may help:\n%s" % jsonItem)
            break
        else:
            data = jsonItem['data']
            next_start = data['next_start']
            objectList = data['object_list']
            for objectItem in objectList:
                photo = objectItem['photo']
                size = int(photo['size'])
                if size < minSize:
                    continue
                # height = int(photo['height'])
                # # avatars are 1100/1080 px; wallpapers need a height limit
                # if maxheight > height > 2200:
                #     continue
                url = photo['path']
                msg = objectItem['msg']
                picId = objectItem['id']
                print(msg)
                insert_data(url, msg, picId)

                # I only need the image metadata, not the files; uncomment
                # the block below if you want local downloads as well
                nameEnd = re.findall(r'.*_(.*)', url)[0]
                #name = "%s_%s" % (picId, nameEnd)
                # if not os.path.exists(rootPath):
                #     os.makedirs(rootPath)
                #newPath = os.path.join(rootPath, "%s/" % value)
                # if not os.path.exists(newPath):
                #     os.makedirs(newPath)
                # download(url, newPath, name)
            more = data['more']
            if more != 1:
                print("已经没有更多图片了!")
                break
            if not allFlag:
                a = ("还要继续爬吗 Y/N\n")
                if a == "y" or a == "Y" or a == "yes" or a == "Yes":
                    continue
                else:
                    break
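
# time is imported at the top but never used; a polite crawler sleeps between
# pages. A minimal sketch of a throttled fetch, assuming a fixed one-second
# delay is acceptable (politeFetch is hypothetical; to use it, replace the
# getJsonByAlbum/getJsonBySearch calls inside the loop above).
def politeFetch(value, idFlag, limit, next_start, delay=1.0):
    time.sleep(delay)  # throttle so the site is not hammered
    if idFlag:
        return getJsonByAlbum(value, limit, next_start)
    return getJsonBySearch(value, limit, next_start)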


def main():
    print("当前保存目录为%s,如果需要修改请修改源码" % rootPath)
    print("请选择爬取类型")
    print("1. 通过关键词爬取\t\t爬取关键词搜索结果")
    print("2. 通过专辑id爬取\t\t爬取专辑的所有图片")
    status_way = input()
    withId = 'y'
    value = "miku"
    getAll = 'y'
    if status_way == "1":
        withId = 'n'
        value = input("请输入搜索关键词\n")
    elif status_way == "2":
        value = input("请输入专辑Id(url里面有)\n")
    else:
        print("输入有误,请重新输入。")
        main()
    # getAll = input("Automatically crawl all images? Y/N\n")
    minSize = int(input("Enter the size below which images are filtered out and not downloaded (integer, in KB)\n"))
    # maxheight = int(input("Maximum image height (squares 1100, wallpapers 2000+) (integer, in pixels)\n"))
    minSize = minSize * 1024  # convert KB to bytes for comparison with photo['size']
    spider(value, getAll, minSize, withId)
if __name__ == '__main__':
    main()
