Skip to main content

Python技巧 - Python多线程爬虫

Python技巧 - Python多线程爬虫

Python技巧 - Python多线程爬虫

Python多线程爬虫 - 案例记录

用的库如下

  • requests
  • threading
  • queue
  • fake_useragent
  • lxml

实现原理:通过多线程方式读取Queue中的数据

代码实现思路如下

# -*- coding:utf-8 -*-
from queue import Empty, Queue
from threading import Lock, Thread

import requests
from fake_useragent import UserAgent
from lxml import etree


class CrawlInfo(Thread):
    """Worker thread that downloads pages and queues their HTML.

    URLs are taken from ``url_queue``; the text of every response whose
    status code is 200 is put on ``html_queue``.
    """

    def __init__(self, url_queue, html_queue):
        """Store the work queues.

        Args:
            url_queue: Queue of URLs to download.
            html_queue: Queue that receives the downloaded HTML text.
        """
        # Thread.__init__ must run, otherwise start() raises RuntimeError.
        super().__init__()
        self.url_queue = url_queue
        self.html_queue = html_queue

    # Override Thread.run
    def run(self):
        """Download queued URLs until the URL queue is drained.

        Steps: pick a browser-like User-Agent, request each URL, and hand
        successful responses over to ``html_queue`` for parsing.
        """
        headers = {'User-Agent': UserAgent().random}

        while True:
            # get_nowait() replaces the empty()/get() pair: another worker
            # may take the last item between the two calls, which would
            # leave this thread blocked in get() forever.
            try:
                url = self.url_queue.get_nowait()
            except Empty:
                break

            try:
                # A timeout keeps one stalled connection from hanging join().
                response = requests.get(url, headers=headers, timeout=10)
            except requests.RequestException as exc:
                # One bad URL must not kill the worker; report and move on.
                print('request failed: {} ({})'.format(url, exc))
                continue

            if response.status_code == 200:
                print(response.text)
                self.html_queue.put(response.text)


class ParseInfo(Thread):
    """Worker thread that extracts text from queued HTML pages.

    Each page taken from ``html_queue`` is parsed with lxml and the text of
    every matching ``span`` is appended to ``duanzi.txt``, one per line.
    """

    # Shared by all instances so that concurrent workers do not interleave
    # their output in the single result file.
    _write_lock = Lock()

    def __init__(self, html_queue):
        """Store the queue of HTML documents to parse.

        Args:
            html_queue: Queue of HTML text to parse.
        """
        # Thread.__init__ must run, otherwise start() raises RuntimeError.
        super().__init__()
        self.html_queue = html_queue

    def run(self):
        """Parse queued HTML until the queue is drained."""
        while True:
            # get_nowait() replaces the empty()/get() pair, which can block
            # forever when another worker takes the last item in between.
            try:
                html = self.html_queue.get_nowait()
            except Empty:
                break

            e = etree.HTML(html)
            span_contents = e.xpath('//div[@class="content"]/span[1]')
            # string(.) flattens the element and its descendants to text.
            lines = [span.xpath('string(.)') + '\n' for span in span_contents]

            with self._write_lock:
                with open('duanzi.txt', 'a', encoding='utf-8') as f:
                    f.writelines(lines)


if __name__ == "__main__":
    # Shared queues: pages still to download, and HTML waiting to be parsed.
    url_queue, html_queue = Queue(), Queue()

    # Seed the download queue with list pages 1 through 13.
    base_url = 'https://www.qiushibaike.com/8hr/page/{}/'
    for page in range(1, 14):
        url_queue.put(base_url.format(page))

    # Two download workers; wait for both so that html_queue is fully
    # populated before any parsing starts.
    crawl_list = [CrawlInfo(url_queue, html_queue) for _ in range(2)]
    for worker in crawl_list:
        worker.start()
    for worker in crawl_list:
        worker.join()

    # Three parse workers drain html_queue.
    parse_list = [ParseInfo(html_queue) for _ in range(3)]
    for worker in parse_list:
        worker.start()
    for worker in parse_list:
        worker.join()

代码未运行测试

版权声明

版权声明

张大鹏 创作并维护的 Walkerfree 博客采用 创作共用保留署名-非商业-禁止演绎4.0国际许可证。本文首发于 Walkerfree 博客(https://walkerfree.com/),版权所有,侵权必究。本文永久链接:https://walkerfree.com/article/270