MENU

【日常Py】百度图片关键词多线程爬虫

January 22, 2019 • Read: 1264 • 程序源码

前几天朋友需要大量各种关键词的图片,于是帮她写了一个关于图片的多线程爬虫,爬的是百度图片,同时换了接口,也适用于搜狗图片、360 图片;除此之外,还加了一个简单的去重。

脚本用 python3 写的,用的都是一些基础模块,没有使用框架之类的,反正我也不会,这些模块也已经满足要求。使用过程中某些参数需要自己配置,比如关键词,线程数量等,如果你什么都不会,那不推荐你使用。


爬虫效果:


(效果截图:1.png、2.png)

百度图片代码:

  • import requests, queue, os, urllib3, time, threading
  • from hashlib import md5
  • urllib3.disable_warnings() # 忽略ssl报错
  • def getManyPages(keyword, pages):
  • params = {
  • 'tn': 'resultjson_com',
  • 'ipn': 'rj',
  • 'ct': 201326592,
  • 'is': '',
  • 'fp': 'result',
  • 'queryWord': keyword,
  • 'cl': 2,
  • 'lm': -1,
  • 'ie': 'utf-8',
  • 'oe': 'utf-8',
  • 'adpicid': '',
  • 'st': -1,
  • 'z': '',
  • 'ic': 0,
  • 'word': keyword,
  • 's': '',
  • 'se': '',
  • 'tab': '',
  • 'width': '',
  • 'height': '',
  • 'face': 0,
  • 'istype': 2,
  • 'qc': '',
  • 'nc': 1,
  • 'fr': '',
  • 'pn': pages, # 起始数
  • 'rn': 50, # 返回数
  • 'gsm': '1e',
  • '1532325785686': '' # 根据下图修改,这只是一个时间戳
  • }
  • url = 'https://image.baidu.com/search/acjson' # 百度图片网址
  • try:
  • res = requests.get(url, params=params, timeout=(5, 20), verify=False)
  • return res.json().get('data')
  • except:
  • print('百度图片json数据获取失败')
  • return []
  • def Run():
  • global Dnum
  • while not Plist.empty():
  • data2 = Plist.get()
  • picpath = FilePath + data2['filename']
  • try:
  • res = requests.get(data2['url'], timeout=(5, 30), stream=True)
  • if res.status_code == requests.codes.ok and int(res.headers['Content-Length']) > 0:
  • destr = (str(res.headers['Content-Length']) + data2['url']).encode(encoding='UTF-8')
  • m = md5(destr).hexdigest()
  • # --------------------------
  • if m in Mlist: # 重复重复跳过!~~
  • print('唯一表示符:' + m + ' ,已重复跳过!~~')
  • continue
  • else:
  • Mlist.append(m)
  • # --------------------------
  • with open(picpath, 'wb') as file:
  • file.write(res.content)
  • except requests.RequestException as err:
  • Plist.put(data2)
  • print('下载失败,稍后重试!' + err)
  • continue
  • if __name__ == '__main__':
  • # ------------------------------------------
  • KeyWords = '白色轿车' # 关键词
  • Num = 2 # 爬取数量,50的倍数
  • Tnum = 100 # 下载线程数
  • # ------------------------------------------
  • Name = 0 # 顺序命名
  • Tlist = list() # 子线程列表
  • Mlist = list() # 图片唯一表示符
  • Plist = queue.Queue(Num * 100) # 图片下载地址队列
  • FilePath = './baidu/' + KeyWords + '/' # 保存路径,使用相对路径
  • Start = time.time()
  • if not os.path.exists(FilePath):
  • os.makedirs(FilePath)
  • for xx in range(Num):
  • dataList = getManyPages(KeyWords, xx * 50)
  • for x in dataList:
  • if 'thumbURL' in x:
  • filename = 'baidu_' + KeyWords + '_' + str(Name) + '.jpg'
  • data = {'filename': filename, 'url': x['thumbURL']}
  • Plist.put(data)
  • Name += 1
  • print('当前已获取图片数:' + str(Plist.qsize()) + ' ' + str(xx))
  • print('实际获取数量可能略少,实际获取数量:' + str(Plist.qsize()))
  • print('开始启动下载线程')
  • for x in range(Tnum):
  • t = threading.Thread(target=Run)
  • t.setDaemon(False)
  • t.start()
  • Tlist.append(t)
  • print('所有线程启动完成,全力下载')
  • while not Plist.empty() and threading.activeCount() > 1:
  • time.sleep(1)
  • print('当前线程总数:' + str(threading.activeCount()) + ' 剩余任务数:' + str(Plist.qsize()) + ' 执行耗时:' + str(
  • round(time.time() - Start)) + 'S')
  • for x in range(len(Tlist)):
  • if not Tlist[x].isAlive() and not Plist.empty():
  • print('其中一个线程死掉,马上重启')
  • Tlist.pop(x)
  • t = threading.Thread(target=Run)
  • t.setDaemon(False)
  • t.start()
  • Tlist.append(t)
  • print('全部下载完毕,下载需求数:' + str(Num * 50) + ',实际有效下载数:' + str(len(Mlist)) + ',执行总耗时:' + str(
  • round(time.time() - Start)) + ' S')

搜狗图片代码:

  • import requests, queue, os, urllib3, time, threading
  • from hashlib import md5
  • urllib3.disable_warnings() # 忽略ssl报错
  • def getManyPages(keyword, pages):
  • url = 'https://pic.sogou.com/pics?query=' + keyword + '&start=' + str(pages) + '&reqType=ajax' # 搜狗图片接口
  • try:
  • res = requests.get(url, timeout=(5, 20), verify=False)
  • return res.json().get('items')
  • except:
  • print('json数据获取失败')
  • return []
  • def Run():
  • global Dnum
  • while not Plist.empty():
  • data2 = Plist.get()
  • picpath = FilePath + data2['filename']
  • try:
  • res = requests.get(data2['url'], timeout=(5, 30), stream=True)
  • if res.status_code == requests.codes.ok and int(res.headers['Content-Length']) > 0:
  • destr = (str(res.headers['Content-Length']) + data2['url']).encode(encoding='UTF-8')
  • m = md5(destr).hexdigest()
  • # --------------------------
  • if m in Mlist: # 重复重复跳过!~~
  • print('唯一表示符:' + m + ' ,已重复跳过!~~')
  • continue
  • else:
  • Mlist.append(m)
  • # --------------------------
  • with open(picpath, 'wb') as file:
  • file.write(res.content)
  • except requests.RequestException as err:
  • Plist.put(data2)
  • print('下载失败,稍后重试!' + err)
  • continue
  • if __name__ == '__main__':
  • # ------------------------------------------
  • KeyWords = '轿车' # 关键词
  • Num = 50 # 爬取数量,48的倍数
  • Tnum = 100 # 下载线程数
  • # ------------------------------------------
  • Name = 0 # 顺序命名
  • Tlist = list() # 子线程列表
  • Mlist = list() # 图片唯一表示符
  • Plist = queue.Queue(Num * 100) # 图片下载地址队列
  • FilePath = './sougou/' + KeyWords + '/' # 保存路径,使用相对路径
  • Start = time.time()
  • if not os.path.exists(FilePath):
  • os.makedirs(FilePath)
  • for xx in range(Num):
  • dataList = getManyPages(KeyWords, xx * 48)
  • for x in dataList:
  • if 'thumbUrl' in x:
  • filename = 'sougou_' + KeyWords + '_' + str(Name) + '.jpg'
  • data = {'filename': filename, 'url': x['thumbUrl']}
  • Plist.put(data)
  • Name += 1
  • print('当前已获取图片数:' + str(Plist.qsize()) + ' ' + str(xx))
  • print('实际获取数量可能略少,实际获取数量:' + str(Plist.qsize()))
  • print('开始启动下载线程')
  • for x in range(Tnum):
  • t = threading.Thread(target=Run)
  • t.setDaemon(False)
  • t.start()
  • Tlist.append(t)
  • print('所有线程启动完成,全力下载')
  • while not Plist.empty() and threading.activeCount() > 1:
  • time.sleep(1)
  • print('当前线程总数:' + str(threading.activeCount()) + ' 剩余任务数:' + str(Plist.qsize()) + ' 执行耗时:' + str(
  • round(time.time() - Start)) + 'S')
  • for x in range(len(Tlist)):
  • if not Tlist[x].isAlive() and not Plist.empty():
  • print('其中一个线程死掉,马上重启')
  • Tlist.pop(x)
  • t = threading.Thread(target=Run)
  • t.setDaemon(False)
  • t.start()
  • Tlist.append(t)
  • time.sleep(3)
  • print('全部下载完毕,下载需求数:' + str(Num * 50) + ',实际有效下载数:' + str(len(Mlist)) + ',执行总耗时:' + str(
  • round(time.time() - Start)) + ' S')

360 图片代码:

  • import requests, queue, os, urllib3, time, threading
  • from hashlib import md5
  • urllib3.disable_warnings() # 忽略ssl报错
  • def getManyPages(keyword, pages):
  • url = 'http://image.so.com/j?q=' + keyword + '&pn=50&sn=' + str(pages) # 百度图片网址,pn代表返回数量
  • try:
  • res = requests.get(url, timeout=(5, 20), verify=False)
  • return res.json().get('list')
  • except:
  • print('百度图片json数据获取失败')
  • return []
  • def Run():
  • global Dnum
  • while not Plist.empty():
  • data2 = Plist.get()
  • picpath = FilePath + data2['filename']
  • try:
  • res = requests.get(data2['url'], timeout=(5, 30), stream=True)
  • if res.status_code == requests.codes.ok and int(res.headers['Content-Length']) > 0:
  • destr = (str(res.headers['Content-Length']) + data2['url']).encode(encoding='UTF-8')
  • m = md5(destr).hexdigest()
  • # --------------------------
  • if m in Mlist: # 重复重复跳过!~~
  • print('唯一表示符:' + m + ' ,已重复跳过!~~')
  • continue
  • else:
  • Mlist.append(m)
  • # --------------------------
  • with open(picpath, 'wb') as file:
  • file.write(res.content)
  • except requests.RequestException as err:
  • Plist.put(data2)
  • print('下载失败,稍后重试!' + err)
  • continue
  • if __name__ == '__main__':
  • # ------------------------------------------
  • KeyWords = '黑色轿车' # 关键词
  • Num = 40 # 爬取数量,50的倍数
  • Tnum = 100 # 下载线程数
  • # ------------------------------------------
  • Name = 0 # 顺序命名
  • Tlist = list() # 子线程列表
  • Mlist = list() # 图片唯一表示符
  • Plist = queue.Queue(Num * 100) # 图片下载地址队列
  • FilePath = './360/' + KeyWords + '/' # 保存路径,使用相对路径
  • Start = time.time()
  • if not os.path.exists(FilePath):
  • os.makedirs(FilePath)
  • for xx in range(Num):
  • dataList = getManyPages(KeyWords, xx * 50)
  • for x in dataList:
  • if 'thumb_bak' in x:
  • filename = '360_' + KeyWords + '_' + str(Name) + '.jpg'
  • data = {'filename': filename, 'url': x['thumb_bak']}
  • Plist.put(data)
  • Name += 1
  • print('当前已获取图片数:' + str(Plist.qsize()) + ' ' + str(xx))
  • print('实际获取数量可能略少,实际获取数量:' + str(Plist.qsize()))
  • print('开始启动下载线程')
  • for x in range(Tnum):
  • t = threading.Thread(target=Run)
  • t.setDaemon(False)
  • t.start()
  • Tlist.append(t)
  • print('所有线程启动完成,全力下载')
  • while not Plist.empty() and threading.activeCount() > 1:
  • time.sleep(1)
  • print('当前线程总数:' + str(threading.activeCount()) + ' 剩余任务数:' + str(Plist.qsize()) + ' 执行耗时:' + str(
  • round(time.time() - Start)) + 'S')
  • for x in range(len(Tlist)):
  • if not Tlist[x].isAlive() and not Plist.empty():
  • print('其中一个线程死掉,马上重启')
  • Tlist.pop(x)
  • t = threading.Thread(target=Run)
  • t.setDaemon(False)
  • t.start()
  • Tlist.append(t)
  • time.sleep(3)
  • print('全部下载完毕,下载需求数:' + str(Num * 50) + ',实际有效下载数:' + str(len(Mlist)) + ',执行总耗时:' + str(
  • round(time.time() - Start)) + ' S')