Python crawler (3) - the Scrapy framework

This document was exported from a mind map; source: Spider mind map

Reference: heima

Spider

Scrapy framework

Data scraping basics

Overview & architecture & workflow & installation

Overview
Scrapy is an application framework written for crawling websites and extracting structured data.
It handles requests asynchronously (it is built on Twisted), so many requests can be in flight concurrently.

Architecture & workflow

(The architecture diagram from the mind map is not reproduced here. Roughly: the Engine takes requests from the Spider, schedules them via the Scheduler, the Downloader fetches the responses, the Spider parses them, and yielded items are passed to the Item Pipelines.)

Setup & installation

Code
pip3 install Scrapy
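
To confirm the install, you can print the installed version (optional):

Code
scrapy version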

Getting started example

sh
# run this in the directory where you want the project created:
scrapy startproject mySpider

$ cd mySpider
$ ls
mySpider  scrapy.cfg

# directory structure
...

# run inside the mySpider directory to generate a spider
scrapy genspider itcast "itcast.cn"
# itcast.py is generated automatically

# itcast.py
...
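
The generated layout is elided above; for reference, a standard Scrapy project skeleton looks roughly like this:

Code
mySpider/
    scrapy.cfg            # project configuration / deploy entry point
    mySpider/
        __init__.py
        items.py          # Item definitions
        middlewares.py    # downloader / spider middlewares
        pipelines.py      # item pipelines
        settings.py       # project settings
        spiders/          # spiders live here
            __init__.py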

itcast.py

python
# -*- coding: utf-8 -*-
import scrapy


class ItcastSpider(scrapy.Spider):
    name = 'itcast'  # spider name
    allowed_domains = ['itcast.cn']  # domains the spider is allowed to crawl
    start_urls = ['http://www.itcast.cn/channel/teacher.shtml']  # URLs to start crawling from

    # overrides the method inherited from scrapy.Spider
    def parse(self, response):
        # handles the responses for start_urls

        # one <li> per teacher
        li_list = response.xpath("//div[@class='tea_con']//li")
        for li in li_list:
            item = {}
            # extract_first: pull out the first matching value
            item["name"] = li.xpath(".//h3/text()").extract_first()
            item["title"] = li.xpath(".//h4/text()").extract_first()
            # yield makes parse a generator, keeps memory usage low,
            # and hands the item to the pipelines
            # print(item)
            yield item
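
A quick side note on extract_first versus extract (a sketch; the selector reuses the loop variable above):

python
# extract() returns a list of every matching string (possibly empty)
names = li.xpath(".//h3/text()").extract()
# extract_first() returns the first match, or None when nothing matched,
# so it avoids an IndexError on empty results
name = li.xpath(".//h3/text()").extract_first()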

go on

sh
# start the spider
scrapy crawl itcast


# settings.py
# set the log level
LOG_LEVEL = "WARNING"
# enable the pipelines
# with several pipelines the number is the priority (think of it as the distance to the engine): lower values run earlier
ITEM_PIPELINES = {
    'mySpider.pipelines.MyspiderPipeline': 300,
    'mySpider.pipelines.MyspiderPipeline1': 301,
}


# pipelines.py
...

pipelines.py

python
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


class MyspiderPipeline(object):
    def process_item(self, item, spider):
        item["hello"] = "world"
        # a pipeline must return the item so that later pipelines receive it
        return item


class MyspiderPipeline1(object):
    def process_item(self, item, spider):
        print(item)
        return item

About iterators and generators

Iterator: there is no need to cache all of the data to be iterated in one go for later sequential reads, which saves a large amount of storage (memory).

Generator: a generator is a special kind of iterator. Replace the return that would hand back a value on each iteration with yield, and the newly defined function is no longer an ordinary function but a generator. In short: any def that contains the yield keyword is a generator.

The yield keyword does two things (see the sketch below):

  • it saves the current execution state (a breakpoint) and then pauses, i.e. it suspends the generator (function)
  • it returns the value of the expression after yield as the return value, so at that point it plays the role of return
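
A minimal sketch of that behaviour (a hypothetical count_up_to generator, not part of the Scrapy project):

python
def count_up_to(n):
    """Yields 1..n; execution pauses at yield and resumes on the next request."""
    i = 1
    while i <= n:
        yield i   # return i, save the current state, suspend here
        i += 1    # on the next next() call execution resumes from this line


gen = count_up_to(3)
print(next(gen))  # 1
print(next(gen))  # 2 -- resumed right after the previous yield
print(list(gen))  # [3] -- whatever values remain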

Note: when several spiders feed the same pipeline, add an if check inside process_item to tell them apart (a sketch follows).
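
A minimal sketch (RoutingPipeline is a hypothetical name; the spider names are taken from the examples in these notes):

python
class RoutingPipeline(object):
    def process_item(self, item, spider):
        # route items by the spider that produced them
        if spider.name == "itcast":
            item["source"] = "itcast"   # handle teacher items
        elif spider.name == "hr":
            item["source"] = "hr"       # handle job-posting items
        return item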

About logging

Code
import logging

# configure the log output format
logging.basicConfig(...)

logger = logging.getLogger(__name__)

logger.warning("XX")

# -----------------------
# settings.py
# where Scrapy writes its log file
LOG_FILE = './log.log'
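
The basicConfig arguments are elided above; one possible configuration (the format string is only an example) is:

python
import logging

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(filename)s [line:%(lineno)d] %(levelname)s: %(message)s",
)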

Implementing pagination

hr.py

python
# -*- coding: utf-8 -*-
import scrapy


# crawl job postings and store them in MongoDB
class HrSpider(scrapy.Spider):
    name = 'hr'
    allowed_domains = ['tencent.com']
    start_urls = ['http://hr.tencent.com/position.php']

    def parse(self, response):
        tr_list = response.xpath("//table[@class='tablelist']/tr")[1:-1]
        for tr in tr_list:
            item = {}
            item["title"] = tr.xpath("./td[1]/a/text()").extract_first()
            item["position"] = tr.xpath("./td[2]/text()").extract_first()
            item["publish_date"] = tr.xpath("./td[5]/text()").extract_first()
            yield item
        # find the URL of the next page
        next_url = response.xpath("//a[@id='next']/@href").extract_first()
        if next_url != "javascript:;":
            next_url = "http://hr.tencent.com/" + next_url
            yield scrapy.Request(
                next_url,
                callback=self.parse,  # which callback handles the new request
                # meta={"item": item}
            )

    # def parse1(self, response):
    #     response.meta["item"]

settings.py

python
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'

pipelines.py

python
from pymongo import MongoClient
from tencent.items import TencentItem

client = MongoClient()
collection = client["tencent"]["hr"]


class TencentPipeline(object):
    def process_item(self, item, spider):
        print(item)
        # note: insert() is deprecated/removed in newer pymongo; insert_one() is the modern call
        collection.insert(item)
        return item

Other notes

python
scrapy.Request(url, callback, method, headers, body, cookies, meta, dont_filter=False)
# dont_filter: Scrapy de-duplicates requests by default (a URL that has already been requested is not requested again)
# sites such as Tieba serve fresh data on the same URL at different times; in that case pass dont_filter=True to skip the duplicate filter
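
For example (a sketch; the URL is only illustrative):

python
yield scrapy.Request(
    "http://tieba.baidu.com/f?kw=python",  # the same URL may return new data later
    callback=self.parse,
    dont_filter=True,  # bypass the duplicate-request filter
)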

item

items.py

python
import scrapy


class TencentItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    position = scrapy.Field()
    publish_date = scrapy.Field()
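
One benefit over a plain dict: assigning to a field that is not declared on the Item fails loudly, which catches typos early. A quick sketch:

python
item = TencentItem()
item["title"] = "Backend engineer"  # fine, "title" is declared above
item["titel"] = "oops"              # raises KeyError because "titel" is not a declared field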

hr.py (modified)

python
from tencent.items import TencentItem

for tr in tr_list:
    # use the Item class defined in items.py instead of a plain dict
    item = TencentItem()
    item["title"] = tr.xpath("./td[1]/a/text()").extract_first()
    item["position"] = tr.xpath("./td[2]/text()").extract_first()
    item["publish_date"] = tr.xpath("./td[5]/text()").extract_first()
    yield item

pipelines.py (modified)

python
class TencentPipeline(object):
    def process_item(self, item, spider):
        if isinstance(item, TencentItem):
            print(item)
            # item is not a dict, so convert it before inserting
            collection.insert(dict(item))
        return item

Item usage example - Yangguang government affairs platform crawler

Pagination plus detail-page requests

yg.py

python
# -*- coding: utf-8 -*-
import scrapy
from yangguang.items import YangguangItem
from yangguang.settings import MONGO_HOST


class YgSpider(scrapy.Spider):
    name = 'yg'
    allowed_domains = ['sun0769.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=0']

    def parse(self, response):
        # group the rows, one <tr> per complaint
        tr_list = response.xpath("//div[@class='greyframe']/table[2]/tr/td/table/tr")
        for tr in tr_list:
            item = YangguangItem()
            item["title"] = tr.xpath("./td[2]/a[@class='news14']/@title").extract_first()
            item["href"] = tr.xpath("./td[2]/a[@class='news14']/@href").extract_first()
            item["publish_date"] = tr.xpath("./td[last()]/text()").extract_first()

            # some item fields require a request to the detail page
            yield scrapy.Request(
                item["href"],
                callback=self.parse_detail,
                meta={"item": item}
            )
        # pagination
        next_url = response.xpath("//a[text()='>']/@href").extract_first()
        if next_url is not None:
            yield scrapy.Request(
                next_url,
                callback=self.parse
            )

    def parse_detail(self, response):  # handle the detail page
        item = response.meta["item"]
        item["content"] = response.xpath("//div[@class='c1 text14_2']//text()").extract()
        item["content_img"] = response.xpath("//div[@class='c1 text14_2']//img/@src").extract()
        item["content_img"] = ["http://wz.sun0769.com" + i for i in item["content_img"]]
        # print(item)
        yield item
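
The project's items.py is not shown in the original notes; a minimal sketch that matches the fields used above would be:

python
import scrapy


class YangguangItem(scrapy.Item):
    title = scrapy.Field()
    href = scrapy.Field()
    publish_date = scrapy.Field()
    content = scrapy.Field()
    content_img = scrapy.Field()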

pipelines.py

python
import re
from yangguang.settings import MONGO_HOST
from pymongo import MongoClient


class YangguangPipeline(object):
    def open_spider(self, spider):
        # spider.hello = "world"
        client = MongoClient()
        self.collection = client["test"]["test"]

    # save the item to MongoDB
    def process_item(self, item, spider):
        # settings are also reachable through the spider
        spider.settings.get("MONGO_HOST")
        item["content"] = self.process_content(item["content"])
        print(item)

        self.collection.insert(dict(item))
        return item

    # clean the scraped content a little
    def process_content(self, content):
        content = [re.sub(r"\xa0|\s", "", i) for i in content]
        content = [i for i in content if len(i) > 0]  # drop empty strings from the list
        return content

Scrapy in depth

Debug logs & scrapy shell & settings.py & pipelines

The program's debug log output

scrapy shell

python
The Scrapy shell is an interactive console where you can try out and debug code, including XPath expressions, without starting a spider.

Usage:
scrapy shell http://www.itcast.cn/channel/teacher.shtml


response.url: URL of the current response
response.request.url: URL of the request that produced the current response
response.headers: response headers
response.body: response body, i.e. the HTML, bytes by default
response.request.headers: request headers of the current response

settings.py

python
# settings.py holds shared constants
MONGO_HOST = "localhost"
# read them inside a pipeline (or spider) via
spider.settings.get("MONGO_HOST")

pipeline

python
# runs exactly once, when the spider is opened
def open_spider(self, spider):
    # spider.hello = "world"
    client = MongoClient()
    self.collection = client["test"]["test"]

# runs exactly once, when the spider is closed
def close_spider(self, spider):
    pass  # e.g. close connections, flush buffers
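
Putting the two hooks together, a pipeline that opens and closes its MongoDB connection could look like this (a sketch; MongoPipeline is a hypothetical name, the database and collection follow the example above):

python
from pymongo import MongoClient


class MongoPipeline(object):
    def open_spider(self, spider):
        self.client = MongoClient()                    # runs once at startup
        self.collection = self.client["test"]["test"]

    def process_item(self, item, spider):
        self.collection.insert_one(dict(item))         # insert_one is the modern pymongo call
        return item

    def close_spider(self, spider):
        self.client.close()                            # runs once at shutdown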

Comprehensive example - Suning Books crawler

CrawlSpider

Setting up the project

sh
$ scrapy startproject circ
$ cd circ
$ scrapy genspider -t crawl cf bxjg.circ.gov.cn

cf.py

python
"""
提取每一个详情页的某些字段

"""
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
import re


class CfSpider(CrawlSpider):
name = 'cf'
allowed_domains = ['bxjg.circ.gov.cn']
start_urls = ['http://bxjg.circ.gov.cn/web/site0/tab5240/module14430/page1.htm']

# 定义提取url地址规则
rules = (
# LinkExtractor:连接提取器,提取url地址
# allow:通过正则表达式提取url
# 提取后会交给父类parse函数发送请求,所以子类不能自定义parse函数
# callback:提取后的url的response 会交给callback处理,callback可以为null
# follow:当前url的响应,是否重新经过该Rule的规则来提取

# /web/site0/tab5240/module14430/page3.htm
# CrawlSpider会自动补充完整的url
Rule(LinkExtractor(allow=r'/web/site0/tab5240/info\d+\.htm'), callback='parse_item', follow=False),
Rule(LinkExtractor(allow=r'/web/site0/tab5240/module14430/page\d+\.htm'), follow=True),


# Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
)

def parse_item(self, response):
item = {}
#item['domain_id'] = response.xpath('//input[@id="sid"]/@value').get()
#item['name'] = response.xpath('//div[@id="name"]').get()
#item['description'] = response.xpath('//div[@id="description"]').get()

# <!--TitleStart-->中国银行保险监督管理委员会行政处罚决定书(银保监罚决字〔2019〕2号) <!--TitleEnd-->
item['title'] = re.findall("<!--TitleStart-->(.*?)<!--TitleEnd-->", response.body.decode())[0]
# 发布时间:2019-03-18
item['publish_date'] = re.findall("发布时间:(\d{4}-\d{2}-\d{2})", response.body.decode())[0]

print(item)

Other notes

Code
# run the spider
$ scrapy crawl cf

Notes on other parameters:

Author: Machine
Post link: https://machine4869.gitee.io/2019/03/21/20190321133211318/
Copyright: unless otherwise stated, all posts on this blog are licensed under CC BY-NC-SA 4.0. Please credit 哑舍 when reposting.