Python crawler (3) - the Scrapy framework

This document was exported from a mind map; source: Spider mind map

Reference: heima

Spider

Scrapy framework

Data scraping basics

Overview & architecture & workflow & installation

Overview
Scrapy is an application framework written for crawling websites and extracting structured data.
It handles requests asynchronously (it is built on Twisted), so many requests can be in flight concurrently.

Architecture & workflow

(The architecture diagram from the mind map is not reproduced here. Roughly: the Engine takes requests from the Spider, schedules them via the Scheduler, the Downloader fetches the responses, the Spider parses them, and yielded items are passed to the Item Pipelines.)

Setup & installation

Code
pip3 install Scrapy
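
To confirm the install, you can print the installed version (optional):

Code
scrapy version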

Getting started example

sh
# run this in the directory where you want the project created:
scrapy startproject mySpider

$ cd mySpider
$ ls
mySpider  scrapy.cfg

# directory structure
...

# run inside the mySpider directory to generate a spider
scrapy genspider itcast "itcast.cn"
# itcast.py is generated automatically

# itcast.py
...
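
The generated layout is elided above; for reference, a standard Scrapy project skeleton looks roughly like this:

Code
mySpider/
    scrapy.cfg            # project configuration / deploy entry point
    mySpider/
        __init__.py
        items.py          # Item definitions
        middlewares.py    # downloader / spider middlewares
        pipelines.py      # item pipelines
        settings.py       # project settings
        spiders/          # spiders live here
            __init__.py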

itcast.py

python
# -*- coding: utf-8 -*-
import scrapy


class ItcastSpider(scrapy.Spider):
    name = 'itcast'  # spider name
    allowed_domains = ['itcast.cn']  # domains the spider is allowed to crawl
    start_urls = ['http://www.itcast.cn/channel/teacher.shtml']  # URLs to start crawling from

    # overrides the method inherited from scrapy.Spider
    def parse(self, response):
        # handles the responses for start_urls

        # one <li> per teacher
        li_list = response.xpath("//div[@class='tea_con']//li")
        for li in li_list:
            item = {}
            # extract_first: pull out the first matching value
            item["name"] = li.xpath(".//h3/text()").extract_first()
            item["title"] = li.xpath(".//h4/text()").extract_first()
            # yield makes parse a generator, keeps memory usage low,
            # and hands the item to the pipelines
            # print(item)
            yield item
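
A quick side note on extract_first versus extract (a sketch; the selector reuses the loop variable above):

python
# extract() returns a list of every matching string (possibly empty)
names = li.xpath(".//h3/text()").extract()
# extract_first() returns the first match, or None when nothing matched,
# so it avoids an IndexError on empty results
name = li.xpath(".//h3/text()").extract_first()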

go on

sh
# start the spider
scrapy crawl itcast


# settings.py
# set the log level
LOG_LEVEL = "WARNING"
# enable the pipelines
# with several pipelines the number is the priority (think of it as the distance to the engine): lower values run earlier
ITEM_PIPELINES = {
    'mySpider.pipelines.MyspiderPipeline': 300,
    'mySpider.pipelines.MyspiderPipeline1': 301,
}


# pipelines.py
...

pipelines.py

python
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


class MyspiderPipeline(object):
    def process_item(self, item, spider):
        item["hello"] = "world"
        # a pipeline must return the item so that later pipelines receive it
        return item


class MyspiderPipeline1(object):
    def process_item(self, item, spider):
        print(item)
        return item

About iterators and generators

Iterator: there is no need to cache all of the data to be iterated in one go for later sequential reads, which saves a large amount of storage (memory).

Generator: a generator is a special kind of iterator. Replace the return that would hand back a value on each iteration with yield, and the newly defined function is no longer an ordinary function but a generator. In short: any def that contains the yield keyword is a generator.

The yield keyword does two things (see the sketch below):

  • it saves the current execution state (a breakpoint) and then pauses, i.e. it suspends the generator (function)
  • it returns the value of the expression after yield as the return value, so at that point it plays the role of return
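
A minimal sketch of that behaviour (a hypothetical count_up_to generator, not part of the Scrapy project):

python
def count_up_to(n):
    """Yields 1..n; execution pauses at yield and resumes on the next request."""
    i = 1
    while i <= n:
        yield i   # return i, save the current state, suspend here
        i += 1    # on the next next() call execution resumes from this line


gen = count_up_to(3)
print(next(gen))  # 1
print(next(gen))  # 2 -- resumed right after the previous yield
print(list(gen))  # [3] -- whatever values remain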

Note: when several spiders feed the same pipeline, add an if check inside process_item to tell them apart (a sketch follows).
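
A minimal sketch (RoutingPipeline is a hypothetical name; the spider names are taken from the examples in these notes):

python
class RoutingPipeline(object):
    def process_item(self, item, spider):
        # route items by the spider that produced them
        if spider.name == "itcast":
            item["source"] = "itcast"   # handle teacher items
        elif spider.name == "hr":
            item["source"] = "hr"       # handle job-posting items
        return item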

About logging

Code
import logging

# configure the log output format
logging.basicConfig(...)

logger = logging.getLogger(__name__)

logger.warning("XX")

# -----------------------
# settings.py
# where Scrapy writes its log file
LOG_FILE = './log.log'
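
The basicConfig arguments are elided above; one possible configuration (the format string is only an example) is:

python
import logging

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(filename)s [line:%(lineno)d] %(levelname)s: %(message)s",
)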

Implementing pagination

hr.py

python
# -*- coding: utf-8 -*-
import scrapy


# crawl job postings and store them in MongoDB
class HrSpider(scrapy.Spider):
    name = 'hr'
    allowed_domains = ['tencent.com']
    start_urls = ['http://hr.tencent.com/position.php']

    def parse(self, response):
        tr_list = response.xpath("//table[@class='tablelist']/tr")[1:-1]
        for tr in tr_list:
            item = {}
            item["title"] = tr.xpath("./td[1]/a/text()").extract_first()
            item["position"] = tr.xpath("./td[2]/text()").extract_first()
            item["publish_date"] = tr.xpath("./td[5]/text()").extract_first()
            yield item
        # find the URL of the next page
        next_url = response.xpath("//a[@id='next']/@href").extract_first()
        if next_url != "javascript:;":
            next_url = "http://hr.tencent.com/" + next_url
            yield scrapy.Request(
                next_url,
                callback=self.parse,  # which callback handles the new request
                # meta={"item": item}
            )

    # def parse1(self, response):
    #     response.meta["item"]

settings.py

python
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'

pipelines.py

python
from pymongo import MongoClient
from tencent.items import TencentItem

client = MongoClient()
collection = client["tencent"]["hr"]


class TencentPipeline(object):
    def process_item(self, item, spider):
        print(item)
        # note: insert() is deprecated/removed in newer pymongo; insert_one() is the modern call
        collection.insert(item)
        return item

Other notes

python
scrapy.Request(url, callback, method, headers, body, cookies, meta, dont_filter=False)
# dont_filter: Scrapy de-duplicates requests by default (a URL that has already been requested is not requested again)
# sites such as Tieba serve fresh data on the same URL at different times; in that case pass dont_filter=True to skip the duplicate filter
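
For example (a sketch; the URL is only illustrative):

python
yield scrapy.Request(
    "http://tieba.baidu.com/f?kw=python",  # the same URL may return new data later
    callback=self.parse,
    dont_filter=True,  # bypass the duplicate-request filter
)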

item

items.py

python
import scrapy


class TencentItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    position = scrapy.Field()
    publish_date = scrapy.Field()
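
One benefit over a plain dict: assigning to a field that is not declared on the Item fails loudly, which catches typos early. A quick sketch:

python
item = TencentItem()
item["title"] = "Backend engineer"  # fine, "title" is declared above
item["titel"] = "oops"              # raises KeyError because "titel" is not a declared field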

hr.py (modified)

python
from tencent.items import TencentItem

for tr in tr_list:
    # use the Item class defined in items.py instead of a plain dict
    item = TencentItem()
    item["title"] = tr.xpath("./td[1]/a/text()").extract_first()
    item["position"] = tr.xpath("./td[2]/text()").extract_first()
    item["publish_date"] = tr.xpath("./td[5]/text()").extract_first()
    yield item

pipelines.py (modified)

python
class TencentPipeline(object):
    def process_item(self, item, spider):
        if isinstance(item, TencentItem):
            print(item)
            # item is not a dict, so convert it before inserting
            collection.insert(dict(item))
        return item

Item usage example - Yangguang government affairs platform crawler

Pagination plus detail-page requests

yg.py

python
# -*- coding: utf-8 -*-
import scrapy
from yangguang.items import YangguangItem
from yangguang.settings import MONGO_HOST


class YgSpider(scrapy.Spider):
    name = 'yg'
    allowed_domains = ['sun0769.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=0']

    def parse(self, response):
        # group the rows, one <tr> per complaint
        tr_list = response.xpath("//div[@class='greyframe']/table[2]/tr/td/table/tr")
        for tr in tr_list:
            item = YangguangItem()
            item["title"] = tr.xpath("./td[2]/a[@class='news14']/@title").extract_first()
            item["href"] = tr.xpath("./td[2]/a[@class='news14']/@href").extract_first()
            item["publish_date"] = tr.xpath("./td[last()]/text()").extract_first()

            # some item fields require a request to the detail page
            yield scrapy.Request(
                item["href"],
                callback=self.parse_detail,
                meta={"item": item}
            )
        # pagination
        next_url = response.xpath("//a[text()='>']/@href").extract_first()
        if next_url is not None:
            yield scrapy.Request(
                next_url,
                callback=self.parse
            )

    def parse_detail(self, response):  # handle the detail page
        item = response.meta["item"]
        item["content"] = response.xpath("//div[@class='c1 text14_2']//text()").extract()
        item["content_img"] = response.xpath("//div[@class='c1 text14_2']//img/@src").extract()
        item["content_img"] = ["http://wz.sun0769.com" + i for i in item["content_img"]]
        # print(item)
        yield item
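
The project's items.py is not shown in the original notes; a minimal sketch that matches the fields used above would be:

python
import scrapy


class YangguangItem(scrapy.Item):
    title = scrapy.Field()
    href = scrapy.Field()
    publish_date = scrapy.Field()
    content = scrapy.Field()
    content_img = scrapy.Field()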

pipelines.py

python
import re
from yangguang.settings import MONGO_HOST
from pymongo import MongoClient


class YangguangPipeline(object):
    def open_spider(self, spider):
        # spider.hello = "world"
        client = MongoClient()
        self.collection = client["test"]["test"]

    # save the item to MongoDB
    def process_item(self, item, spider):
        # settings are also reachable through the spider
        spider.settings.get("MONGO_HOST")
        item["content"] = self.process_content(item["content"])
        print(item)

        self.collection.insert(dict(item))
        return item

    # clean the scraped content a little
    def process_content(self, content):
        content = [re.sub(r"\xa0|\s", "", i) for i in content]
        content = [i for i in content if len(i) > 0]  # drop empty strings from the list
        return content

Scrapy in depth

Debug logs & scrapy shell & settings.py & pipelines

The program's debug log output

scrapy shell

python
The Scrapy shell is an interactive console where you can try out and debug code, including XPath expressions, without starting a spider.

Usage:
scrapy shell http://www.itcast.cn/channel/teacher.shtml


response.url: URL of the current response
response.request.url: URL of the request that produced the current response
response.headers: response headers
response.body: response body, i.e. the HTML, bytes by default
response.request.headers: request headers of the current response

settings.py

python
# settings.py holds shared constants
MONGO_HOST = "localhost"
# read them inside a pipeline (or spider) via
spider.settings.get("MONGO_HOST")

pipeline

python
# runs exactly once, when the spider is opened
def open_spider(self, spider):
    # spider.hello = "world"
    client = MongoClient()
    self.collection = client["test"]["test"]

# runs exactly once, when the spider is closed
def close_spider(self, spider):
    pass  # e.g. close connections, flush buffers
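
Putting the two hooks together, a pipeline that opens and closes its MongoDB connection could look like this (a sketch; MongoPipeline is a hypothetical name, the database and collection follow the example above):

python
from pymongo import MongoClient


class MongoPipeline(object):
    def open_spider(self, spider):
        self.client = MongoClient()                    # runs once at startup
        self.collection = self.client["test"]["test"]

    def process_item(self, item, spider):
        self.collection.insert_one(dict(item))         # insert_one is the modern pymongo call
        return item

    def close_spider(self, spider):
        self.client.close()                            # runs once at shutdown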

Comprehensive example - Suning Books crawler

CrawlSpider

Setting up the project

sh
$ scrapy startproject circ
$ cd circ
$ scrapy genspider -t crawl cf bxjg.circ.gov.cn

cf.py

python
"""
提取每一个详情页的某些字段

"""
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
import re


class CfSpider(CrawlSpider):
name = 'cf'
allowed_domains = ['bxjg.circ.gov.cn']
start_urls = ['http://bxjg.circ.gov.cn/web/site0/tab5240/module14430/page1.htm']

# 定义提取url地址规则
rules = (
# LinkExtractor:连接提取器,提取url地址
# allow:通过正则表达式提取url
# 提取后会交给父类parse函数发送请求,所以子类不能自定义parse函数
# callback:提取后的url的response 会交给callback处理,callback可以为null
# follow:当前url的响应,是否重新经过该Rule的规则来提取

# /web/site0/tab5240/module14430/page3.htm
# CrawlSpider会自动补充完整的url
Rule(LinkExtractor(allow=r'/web/site0/tab5240/info\d+\.htm'), callback='parse_item', follow=False),
Rule(LinkExtractor(allow=r'/web/site0/tab5240/module14430/page\d+\.htm'), follow=True),


# Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
)

def parse_item(self, response):
item = {}
#item['domain_id'] = response.xpath('//input[@id="sid"]/@value').get()
#item['name'] = response.xpath('//div[@id="name"]').get()
#item['description'] = response.xpath('//div[@id="description"]').get()

# <!--TitleStart-->中国银行保险监督管理委员会行政处罚决定书(银保监罚决字〔2019〕2号) <!--TitleEnd-->
item['title'] = re.findall("<!--TitleStart-->(.*?)<!--TitleEnd-->", response.body.decode())[0]
# 发布时间:2019-03-18
item['publish_date'] = re.findall("发布时间:(\d{4}-\d{2}-\d{2})", response.body.decode())[0]

print(item)

Other notes

Code
# run the spider
$ scrapy crawl cf

Notes on other parameters:

Author: Machine
Post link: https://machine4869.gitee.io/2019/03/21/20190321133211318/
Copyright: unless otherwise stated, all posts on this blog are licensed under CC BY-NC-SA 4.0. Please credit 哑舍 when reposting.