- Create whatever database fields you need yourself (a minimal table sketch follows this list).
- Only minor modification and debugging are needed before the script is usable.
- For learning purposes only; do not use it for anything illegal.
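The `INSERT` in `insert_data()` below targets a table named `photos_photos` in the `izone` database, but the original post does not give its schema. The sketch below creates a minimally compatible table: the column names are taken from that `INSERT`, while the column types are assumptions you should adapt to your own models.

```python
# Creates an assumed-compatible photos_photos table.
# Column names come from the INSERT in insert_data(); the types are guesses.
import pymysql

DDL = """
CREATE TABLE IF NOT EXISTS photos_photos (
    id          BIGINT PRIMARY KEY,   -- picture id taken from duitang
    title       VARCHAR(255),         -- the item's `msg` caption
    photos      VARCHAR(512),         -- image URL
    create_date DATETIME,
    likes       INT DEFAULT 0,
    favorites   INT DEFAULT 0,
    is_delete   TINYINT DEFAULT 0,
    author_id   INT,
    category_id INT
)
"""

conn = pymysql.connect(host='127.0.0.1', user='root', password='password',
                       database='izone', port=3306)
try:
    with conn.cursor() as cursor:
        cursor.execute(DDL)
    conn.commit()
finally:
    conn.close()
```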
```python
#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Author: AimerNeige

from urllib.request import urlretrieve
import requests
import json
import re
import os
import pymysql
import datetime
import time

'''
Third-party libraries required:
requests, pymysql
'''

# Root directory used when saving files to local disk (customize as needed).
rootPath = '/home/spider/duitang/'


def insert_data(url, msg, picId):
    """Insert one picture record into the photos_photos table."""
    conn = pymysql.connect(connect_timeout=15, host='127.0.0.1', user='root',
                           password='password', database='izone', port=3306)
    cursor = conn.cursor()
    # If id is an auto-increment primary key, you can pass NULL and let it grow on its own.
    # category_id: 1 memes  2 wallpapers  3 avatars  4 photography  5 travel
    #              6 materials  7 funny pets  8 art & culture  9 makeup & styling  10 quotes
    sql = """
        INSERT INTO photos_photos
            (id, title, photos, create_date, likes, favorites, is_delete, author_id, category_id)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
    """
    title = msg
    photos = url
    create_date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    likes = 0
    favorites = 0
    is_delete = 0
    author_id = 5  # publisher
    category_id = 6
    picture_id = picId
    try:
        cursor.execute(sql, (picture_id, title, photos, create_date,
                             likes, favorites, is_delete, author_id, category_id))
        conn.commit()
    except Exception:
        # Roll back on any error.
        conn.rollback()
    conn.close()


def getJsonBySearch(keyword: str, limit: int, start: int) -> str:
    """
    Fetch picture JSON data by search keyword.
    :param keyword: search keyword
    :param limit: maximum number of items to return (values above 100 are treated as 100)
    :param start: offset to start from (index)
    :return: JSON data as a string
    """
    url = "https://www.duitang.com/napi/blog/list/by_search/?kw=%s&type=feed&limit=%d&start=%d" % (keyword, limit, start)
    response = requests.get(url)
    if response.status_code != 200:
        print("Request failed!")
        return None
    response.encoding = 'utf-8'
    return response.text


def getJsonByAlbum(album_id: str, limit: int, start: int) -> str:
    """
    Fetch picture JSON data by album id.
    :param album_id: album id
    :param limit: maximum number of items to return (values above 100 are treated as 100)
    :param start: offset to start from (index)
    :return: JSON data as a string
    """
    url = "https://www.duitang.com/napi/blog/list/by_album/?album_id=%s&limit=%d&start=%d" % (album_id, limit, start)
    response = requests.get(url)
    if response.status_code != 200:
        print("Request failed!")
        return None
    response.encoding = 'utf-8'
    return response.text


def download(url, path, name):
    """
    Download a file.
    :param url: file URL
    :param path: directory to save into
    :param name: file name
    :return: None
    """
    def reporthook(a, b, c):
        """
        Show download progress.
        :param a: number of blocks downloaded so far
        :param b: block size
        :param c: total size of the remote file
        :return: None
        """
        print("\rdownloading: %5.1f%%" % (a * b * 100.0 / c), end="")

    filePath = os.path.join(path, name)
    if not os.path.isfile(filePath):
        print("Start downloading: %s" % url)
        print("File name: %s" % name)
        urlretrieve(url, filePath, reporthook=reporthook)
        print("Download finished!")
    else:
        print("A file with the same name already exists in this directory! Download skipped!")
    filesize = os.path.getsize(filePath)
    print("File size: %.2f Mb" % (filesize / 1024 / 1024))


def spider(value: str, getAll: str, minSize: int, withId: str):
    allFlag = getAll in ("y", "Y", "yes", "Yes")
    if allFlag:
        limit = 1000
    else:
        limit = int(input("How many pictures do you want to fetch per request? (max 1000)\n"))
    next_start = 0
    while True:
        idFlag = (withId == "y")
        if idFlag:
            jsonData = getJsonByAlbum(value, limit, next_start)
        else:
            jsonData = getJsonBySearch(value, limit, next_start)
        if jsonData is None:
            break
        jsonItem = json.loads(jsonData)
        if jsonItem['status'] != 1:
            print("Failed to fetch data! This information may help:\n%s" % jsonItem)
            break
        data = jsonItem['data']
        next_start = data['next_start']
        objectList = data['object_list']
        for objectItem in objectList:
            photo = objectItem['photo']
            size = int(photo['size'])
            if size < minSize:
                continue
            # height = int(photo['height'])
            # # avatars: 1100 / 1080; wallpapers should also be limited by height
            # if maxheight > height > 2200:
            #     continue
            url = photo['path']
            msg = objectItem['msg']
            picId = objectItem['id']
            print(msg)
            insert_data(url, msg, picId)
            # I only need the picture data in the database; uncomment and adjust
            # the lines below if you also want local copies.
            nameEnd = re.findall(r'.*_(.*)', url)[0]
            # name = "%s_%s" % (picId, nameEnd)
            # if not os.path.exists(rootPath):
            #     os.makedirs(rootPath)
            # newPath = os.path.join(rootPath, "%s/" % value)
            # if not os.path.exists(newPath):
            #     os.makedirs(newPath)
            # download(url, newPath, name)
        more = data['more']
        if more != 1:
            print("No more pictures!")
            break
        if not allFlag:
            a = input("Keep crawling? Y/N\n")
            if a not in ("y", "Y", "yes", "Yes"):
                break


def main():
    print("The current save directory is %s; edit the source code if you need to change it." % rootPath)
    print("Please choose the crawl mode:")
    print("1. By keyword\t\tcrawl keyword search results")
    print("2. By album id\t\tcrawl all pictures in an album")
    status_way = input()
    withId = 'y'
    value = "miku"
    getAll = 'y'
    if status_way == "1":
        withId = 'n'
        value = input("Please enter the search keyword\n")
    elif status_way == "2":
        value = input("Please enter the album id (it is in the URL)\n")
    else:
        print("Invalid input, please try again.")
        main()
        return
    # getAll = input("Crawl all pictures automatically? Y/N\n")
    minSize = int(input("Enter the minimum picture size; smaller pictures are skipped (integer, unit: KB)\n"))
    # maxheight = int(input("Maximum height of downloaded pictures (square ~1100, wallpaper 2000+) (integer)\n"))
    minSize = minSize * 1024
    spider(value, getAll, minSize, withId)


main()
```
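For reference, the keys that `spider()` reads imply a response shape roughly like the one below. This is a sketch inferred only from the fields accessed in the script, not an official Duitang API schema; all values are made-up placeholders.

```python
# Inferred shape of the duitang API response, based only on the keys the
# spider reads above. All values here are placeholders.
sample_response = {
    "status": 1,                    # anything other than 1 is treated as failure
    "data": {
        "next_start": 100,          # passed back as `start` for the next page
        "more": 1,                  # 1 while further pages exist
        "object_list": [
            {
                "id": 123456789,    # stored as picId
                "msg": "example caption",   # stored as the title
                "photo": {
                    # the regex r'.*_(.*)' extracts the part after the last '_'
                    "path": "https://example.com/uploads/item/abc_fp580.jpg",
                    "size": 524288,         # compared against minSize (bytes assumed)
                    "height": 1080,
                },
            },
        ],
    },
}

# The same extraction the spider performs on each item:
for item in sample_response["data"]["object_list"]:
    print(item["photo"]["path"], item["msg"], item["id"])
```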