Analysis and Demonstration
3.1 Queue basics
This project uses Python's queue module, i.e. a FIFO queue, to model how data is stored and retrieved.
For the early stage of a basic crawler, the queue knowledge you need can be summarized as follows (a short runnable sketch comes after the list):
- Initialization: queue.Queue(maxsize) creates a FIFO (first-in, first-out) queue
Common methods of the module:
- queue.qsize() returns the current size of the queue
- queue.empty() returns True if the queue is empty, otherwise False
- queue.full() returns True if the queue is full, otherwise False; "full" means the size has reached maxsize
- queue.get([block[, timeout]]) removes and returns an item from the queue; timeout is how long to wait for one
- Create a "queue" object: import queue; myqueue = queue.Queue(maxsize=10)
- Put a value into the queue: myqueue.put(10)
- Take a value out of the queue: myqueue.get()
- from queue import Queue — the module is built into Python 3, so nothing needs to be installed
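To tie these methods together, here is a minimal runnable sketch of the FIFO behavior (maxsize=3 is just an illustrative value):

from queue import Queue

q = Queue(maxsize=3)      # the queue holds at most 3 items
for i in (1, 2, 3):
    q.put(i)              # enqueue 1, 2, 3 in order

print(q.full())           # True: the size has reached maxsize
print(q.get(), q.get())   # 1 2 -- items come out first-in, first-out
print(q.qsize())          # 1 item left
print(q.empty())          # False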
3.2 Multithreading framework
First, implement the multithreaded skeleton:
import threading
from queue import Queue

class ThreadCrawl(threading.Thread):
    def __init__(self, thread_name, page_queue, data_queue):
        # threading.Thread.__init__(self)
        # Call the parent class's initializer
        super(ThreadCrawl, self).__init__()
        self.threadName = thread_name
        self.page_queue = page_queue
        self.data_queue = data_queue

    def run(self):
        print(self.threadName + ' started ************')

def main():
    # Declare a queue and use a loop to store 100 page numbers in it
    page_queue = Queue(100)
    for i in range(1, 101):
        page_queue.put(i)

    # Collected results (image URLs waiting to be downloaded)
    data_queue = Queue()

    # List for keeping track of the threads
    thread_crawl = []
    # Start 4 threads at a time
    craw_list = ['Collector thread 1', 'Collector thread 2', 'Collector thread 3', 'Collector thread 4']
    for thread_name in craw_list:
        c_thread = ThreadCrawl(thread_name, page_queue, data_queue)
        c_thread.start()
        thread_crawl.append(c_thread)

    # Busy-wait until page_queue is empty, i.e. until the work above has finished
    # (note: in this skeleton run() does not consume the queue yet, so the loop
    # only terminates once the crawling code is filled in)
    while not page_queue.empty():
        pass

if __name__ == '__main__':
    main()
Run result: each of the four collector threads prints its startup banner.
The threads are now running; all that remains is to fill the crawling code into the run method. A global variable is introduced here to flag whether crawling should stop:
CRAWL_EXIT = False
class ThreadCrawl(threading.Thread):
    def __init__(self, thread_name, page_queue):
        # threading.Thread.__init__(self)
        # Call the parent class's initializer
        super(ThreadCrawl, self).__init__()
        self.threadName = thread_name
        self.page_queue = page_queue

    def run(self):
        print(self.threadName + ' started ************')
        while not CRAWL_EXIT:
            try:
                # global tag, url, img_format  # pull in the global values
                headers = {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
                # Raises an exception when the queue is empty
                page = self.page_queue.get(block=False)  # take a page number off the queue
                spider_url = 'https://tuchong.com/rest/tags/%E8%87%AA%E7%84%B6/posts?page={}&count=20&order=weekly'.format(page)
                print(spider_url)
            except Exception:
                break

            timeout = 4  # try up to 4 times here; if every attempt fails, report a timeout
            while timeout > 0:
                timeout -= 1
                try:
                    with requests.Session() as s:
                        response = s.get(spider_url, headers=headers, timeout=3)
                    json_data = response.json()
                    if json_data is not None:
                        posts = json_data["postList"]
                        for post in posts:
                            for img in post["images"]:
                                user_id = img["user_id"]
                                img_id = img["img_id"]
                                # Build the image link; it could also be pushed onto a second
                                # queue here to be downloaded in a separate step
                                img_url = 'https://photo.tuchong.com/{}/f/{}.jpg'.format(user_id, img_id)
                                # self.data_queue.put(img_url)
                                title = 'download/' + str(img_id)
                                response = requests.get(img_url)
                                # Naming files by img_id alone may not be unique
                                with open(title + '.jpg', 'wb') as f:
                                    f.write(response.content)
                                time.sleep(3)
                    break
                except Exception as e:
                    print(e)
            if timeout <= 0:
                print('time out!')
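As an aside, the hand-rolled retry counter above can be replaced by requests' built-in retry support from urllib3; a minimal sketch (the retry count, backoff factor, and status codes here are illustrative, not taken from the original code):

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
# Retry failed requests up to 3 times, backing off between attempts
retries = Retry(total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
session.mount('https://', HTTPAdapter(max_retries=retries))
# Every session.get(...) call now retries those errors transparently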
Then add the following code to the main function:
    while not page_queue.empty():
        pass
    # Once page_queue is empty, signal the collector threads to exit their loops
    global CRAWL_EXIT
    CRAWL_EXIT = True
Summary: the code above stores and retrieves work through a queue, and uses multiple threads to speed up crawling (a cleaner shutdown pattern is sketched below).
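As an aside, the busy-wait loop plus global flag does work, but the queue module's own task_done()/join() pattern expresses the same shutdown more cleanly; a minimal sketch (worker is a placeholder name, not from the original code):

import threading
from queue import Queue, Empty

def worker(page_queue):
    while True:
        try:
            page = page_queue.get_nowait()  # raises Empty once no work is left
        except Empty:
            break
        try:
            print('processing page', page)  # stand-in for the crawl logic
        finally:
            page_queue.task_done()          # mark this page as finished

page_queue = Queue()
for i in range(1, 101):
    page_queue.put(i)

threads = [threading.Thread(target=worker, args=(page_queue,)) for _ in range(4)]
for t in threads:
    t.start()

page_queue.join()  # blocks until every put() item has been task_done()'d
for t in threads:
    t.join()

threading.Event is another common alternative to a bare global flag for signaling shutdown.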
Complete demo:
import threading
from queue import Queue
import requests
import os
import time

CRAWL_EXIT = False

class ThreadCrawl(threading.Thread):
    def __init__(self, thread_name, page_queue):
        # threading.Thread.__init__(self)
        # Call the parent class's initializer
        super(ThreadCrawl, self).__init__()
        self.threadName = thread_name
        self.page_queue = page_queue

    def run(self):
        print(self.threadName + ' started ************')
        while not CRAWL_EXIT:
            try:
                # global tag, url, img_format  # pull in the global values
                headers = {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
                # Raises an exception when the queue is empty
                page = self.page_queue.get(block=False)  # take a page number off the queue
                # The tag in the URL is '自然' (nature), URL-encoded
                spider_url = 'https://tuchong.com/rest/tags/%E8%87%AA%E7%84%B6/posts?page={}&count=20&order=weekly'.format(page)
                print(spider_url)
            except Exception:
                break

            timeout = 4  # try up to 4 times here; if every attempt fails, report a timeout
            while timeout > 0:
                timeout -= 1
                try:
                    with requests.Session() as s:
                        response = s.get(spider_url, headers=headers, timeout=3)
                    json_data = response.json()
                    if json_data is not None:
                        posts = json_data["postList"]
                        for post in posts:
                            for img in post["images"]:
                                user_id = img["user_id"]
                                img_id = img["img_id"]
                                img_url = 'https://photo.tuchong.com/{}/f/{}.jpg'.format(user_id, img_id)
                                # self.data_queue.put(img_url)  # the link could instead go to a second queue for a later download step
                                title = 'download/' + str(img_id)
                                response = requests.get(img_url)
                                # Naming files by img_id alone may not be unique
                                with open(title + '.jpg', 'wb') as f:
                                    f.write(response.content)
                                time.sleep(3)
                    break
                except Exception as e:
                    print(e)
            if timeout <= 0:
                print('time out!')

def main():
    # Declare a queue and use a loop to store 100 page numbers in it
    page_queue = Queue(100)
    for i in range(1, 101):
        page_queue.put(i)

    # Collected results (image URLs waiting to be downloaded)
    # data_queue = Queue()

    # List for keeping track of the threads
    thread_crawl = []
    # Start 4 threads at a time
    craw_list = ['Collector thread 1', 'Collector thread 2', 'Collector thread 3', 'Collector thread 4']
    if not os.path.exists('download'):
        os.mkdir('download')
    for thread_name in craw_list:
        c_thread = ThreadCrawl(thread_name, page_queue)
        c_thread.start()
        thread_crawl.append(c_thread)

    # Busy-wait until page_queue is empty, i.e. until all page numbers have been taken
    while not page_queue.empty():
        pass

    # Once page_queue is empty, signal the collector threads to exit their loops
    global CRAWL_EXIT
    CRAWL_EXIT = True

if __name__ == '__main__':
    main()
If you also need asynchronous requests, you can refer to this article: Queues and Multithreading. A minimal asynchronous sketch of the same idea follows.
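For flavor only, here is a sketch of the same fetch-pages-from-a-queue idea in asynchronous form. It assumes the third-party aiohttp library is installed; worker and the page range are illustrative, not code from this article:

import asyncio
import aiohttp

API = 'https://tuchong.com/rest/tags/%E8%87%AA%E7%84%B6/posts?page={}&count=20&order=weekly'

async def worker(name, page_queue):
    async with aiohttp.ClientSession() as session:
        while True:
            try:
                page = page_queue.get_nowait()  # raises QueueEmpty when done
            except asyncio.QueueEmpty:
                break
            async with session.get(API.format(page)) as resp:
                json_data = await resp.json()
            print(name, 'page', page, 'posts:', len(json_data.get('postList', [])))
            page_queue.task_done()

async def main():
    page_queue = asyncio.Queue()
    for i in range(1, 6):  # just a few pages for illustration
        page_queue.put_nowait(i)
    await asyncio.gather(*(worker('worker-%d' % n, page_queue) for n in range(2)))

asyncio.run(main())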