This document was exported from a mind map (source: Spider mind map).
Reference: heima
Spider / Scrapy framework / data-scraping basics
Overview, architecture diagram, workflow & installation
Overview: Scrapy is an application framework written for crawling websites and extracting structured data. It issues requests concurrently and asynchronously (it is built on the Twisted networking library).
Architecture diagram & workflow: omitted.
Installation & configuration
Getting-started example
```bash
$ scrapy startproject mySpider
$ cd mySpider
# project layout (abridged):
#   mySpider/
#       scrapy.cfg
#       ...
$ scrapy genspider itcast "itcast.cn"   # itcast.py is generated automatically
```
itcast.py
```python
import scrapy


class ItcastSpider(scrapy.Spider):
    name = 'itcast'
    allowed_domains = ['itcast.cn']
    start_urls = ['http://www.itcast.cn/channel/teacher.shtml']

    def parse(self, response):
        li_list = response.xpath("//div[@class='tea_con']//li")
        for li in li_list:
            item = {}
            item["name"] = li.xpath(".//h3/text()").extract_first()
            item["title"] = li.xpath(".//h4/text()").extract_first()
            yield item
```
Continuing: run the spider and register the pipelines in settings.py.
```bash
$ scrapy crawl itcast
```

```python
# settings.py
LOG_LEVEL = "WARNING"

# the number is the pipeline's priority: lower values run first
ITEM_PIPELINES = {
    'mySpider.pipelines.MyspiderPipeline': 300,
    'mySpider.pipelines.MyspiderPipeline1': 301,
}
```
pipelines.py
```python
class MyspiderPipeline(object):
    def process_item(self, item, spider):
        item["hello"] = "world"
        return item


class MyspiderPipeline1(object):
    def process_item(self, item, spider):
        print(item)
        return item
```
About iterators and generators
Iterator: the data being iterated over does not have to be cached in full up front for later sequential reads, which saves a large amount of (memory) storage.
Generator: a generator is a special kind of iterator. Replace the return that would hand back a value on each iteration with yield, and the newly defined function is no longer an ordinary function but a generator. In short: any def that contains the yield keyword defines a generator.
The yield keyword does two things:
- it saves the current execution state (a breakpoint) and then pauses, i.e. the generator (function) is suspended;
- it hands the value of the expression after yield back to the caller, so at that point it plays the role of return (see the sketch below).
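A minimal sketch (not from the original notes) that shows both roles of yield; a Scrapy parse() method is exactly this kind of generator, since it yields items and requests one at a time:

```python
def count_up_to(n):
    """A generator: execution freezes at yield and resumes on the next call."""
    i = 0
    while i < n:
        yield i        # hand i back to the caller and suspend the function here
        i += 1         # on the next next() call, execution resumes from this line


gen = count_up_to(3)
print(next(gen))   # 0
print(next(gen))   # 1  (the value of i was preserved between calls)
```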
When several spiders feed the same pipeline, an if check is needed in process_item to tell them apart, as sketched below.
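A minimal sketch of that check, assuming the pipeline receives items from the 'itcast' spider above and from a second spider named 'hr' (introduced later in these notes):

```python
class MyspiderPipeline(object):
    def process_item(self, item, spider):
        # spider is the Spider instance that yielded this item,
        # so spider.name tells the pipeline which crawler it came from
        if spider.name == "itcast":
            item["source"] = "itcast"        # illustrative handling
        elif spider.name == "hr":
            item["source"] = "tencent_hr"
        return item
```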
About logging
```python
import logging

# configure the log output format
logging.basicConfig(...)
logger = logging.getLogger(__name__)
logger.warning("XX")

# -----------------------
# settings.py
# where Scrapy saves its log
LOG_FILE = './log.log'
```
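Inside a spider no setup is needed at all: Scrapy attaches a logger named after the spider to every Spider instance. A minimal sketch (the message text is illustrative):

```python
import scrapy


class ItcastSpider(scrapy.Spider):
    name = 'itcast'
    start_urls = ['http://www.itcast.cn/channel/teacher.shtml']

    def parse(self, response):
        # self.logger is a standard logging.Logger provided by Scrapy
        self.logger.warning("parsed %s", response.url)
```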
Pagination — hr.py
```python
import scrapy


class HrSpider(scrapy.Spider):
    name = 'hr'
    allowed_domains = ['tencent.com']
    start_urls = ['http://hr.tencent.com/position.php']

    def parse(self, response):
        tr_list = response.xpath("//table[@class='tablelist']/tr")[1:-1]
        for tr in tr_list:
            item = {}
            item["title"] = tr.xpath("./td[1]/a/text()").extract_first()
            item["position"] = tr.xpath("./td[2]/text()").extract_first()
            item["publish_date"] = tr.xpath("./td[5]/text()").extract_first()
            yield item

        # follow the "next page" link until it turns into a javascript placeholder
        next_url = response.xpath("//a[@id='next']/@href").extract_first()
        if next_url != "javascript:;":
            next_url = "http://hr.tencent.com/" + next_url
            yield scrapy.Request(
                next_url,
                callback=self.parse,
            )
```
settings.py
```python
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
```
pipelines.py
```python
from pymongo import MongoClient
from tencent.items import TencentItem

client = MongoClient()
collection = client["tencent"]["hr"]


class TencentPipeline(object):
    def process_item(self, item, spider):
        print(item)
        # insert_one replaces the old insert(), which was removed in pymongo 4
        collection.insert_one(item)
        return item
```
Other: the full scrapy.Request signature
```python
scrapy.Request(url, callback, method, headers, body, cookies, meta, dont_filter=False)
```
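A hedged sketch of how two of the less obvious parameters are typically used inside a spider callback (the URL is a placeholder): meta carries data into the next callback, and dont_filter=True tells the scheduler not to drop the request as a duplicate.

```python
yield scrapy.Request(
    "http://example.com/detail/1",   # placeholder URL
    callback=self.parse_detail,
    meta={"item": item},             # read back in the callback as response.meta["item"]
    dont_filter=True,                # bypass the duplicate-request filter for this URL
)
```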
Using Item — items.py
```python
import scrapy


class TencentItem(scrapy.Item):
    title = scrapy.Field()
    position = scrapy.Field()
    publish_date = scrapy.Field()
```
hr.py, modified to use the Item class
```python
from tencent.items import TencentItem

for tr in tr_list:
    item = TencentItem()
    item["title"] = tr.xpath("./td[1]/a/text()").extract_first()
    item["position"] = tr.xpath("./td[2]/text()").extract_first()
    item["publish_date"] = tr.xpath("./td[5]/text()").extract_first()
    yield item
```
pipelines.py, modified accordingly
```python
class TencentPipeline(object):
    def process_item(self, item, spider):
        if isinstance(item, TencentItem):
            print(item)
            collection.insert_one(dict(item))
        return item
```
Item usage case study: the Yangguang (sunshine government affairs) platform spider
Pagination plus requests for each detail page
yg.py
```python
import scrapy
from yangguang.items import YangguangItem


class YgSpider(scrapy.Spider):
    name = 'yg'
    allowed_domains = ['sun0769.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=0']

    def parse(self, response):
        tr_list = response.xpath("//div[@class='greyframe']/table[2]/tr/td/table/tr")
        for tr in tr_list:
            item = YangguangItem()
            item["title"] = tr.xpath("./td[2]/a[@class='news14']/@title").extract_first()
            item["href"] = tr.xpath("./td[2]/a[@class='news14']/@href").extract_first()
            item["publish_date"] = tr.xpath("./td[last()]/text()").extract_first()
            # request the detail page and pass the partly-filled item along via meta
            yield scrapy.Request(
                item["href"],
                callback=self.parse_detail,
                meta={"item": item}
            )

        # follow the next-page link
        next_url = response.xpath("//a[text()='>']/@href").extract_first()
        if next_url is not None:
            yield scrapy.Request(
                next_url,
                callback=self.parse
            )

    def parse_detail(self, response):
        item = response.meta["item"]
        item["content"] = response.xpath("//div[@class='c1 text14_2']//text()").extract()
        item["content_img"] = response.xpath("//div[@class='c1 text14_2']//img/@src").extract()
        item["content_img"] = ["http://wz.sun0769.com" + i for i in item["content_img"]]
        yield item
```
pipelines.py
```python
import re

from pymongo import MongoClient
from yangguang.settings import MONGO_HOST


class YangguangPipeline(object):
    def open_spider(self, spider):
        client = MongoClient()
        self.collection = client["test"]["test"]

    def process_item(self, item, spider):
        # preferred way to read settings.py values (instead of importing them directly)
        spider.settings.get("MONGO_HOST")
        item["content"] = self.process_content(item["content"])
        print(item)
        self.collection.insert_one(dict(item))
        return item

    def process_content(self, content):
        # strip non-breaking spaces and whitespace, then drop empty strings
        content = [re.sub(r"\xa0|\s", "", i) for i in content]
        content = [i for i in content if len(i) > 0]
        return content
```
Scrapy in depth: debug logs, scrapy shell, settings.py & pipelines
The program's debug log output
Omitted.
scrapy shell
Scrapy shell is an interactive console: it lets you try out and debug code, including XPath expressions, without starting a spider.

Usage: scrapy shell http://www.itcast.cn/channel/teacher.shtml

- response.url: URL of the current response
- response.request.url: URL of the request that produced this response
- response.headers: response headers
- response.body: response body, i.e. the HTML, bytes by default
- response.request.headers: headers of the request that produced this response
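For example, in the shell opened with the command above you could test the XPath used in the itcast spider interactively (output omitted):

```python
>>> response.url
>>> response.xpath("//div[@class='tea_con']//li//h3/text()").extract_first()
>>> response.request.headers
```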
settings.py
```python
# settings.py
MONGO_HOST = "localhost"

# anywhere you have a spider object (e.g. in a pipeline), read it back with:
spider.settings.get("MONGO_HOST")
```
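A minimal sketch of reading that setting where it is actually needed, assuming a pipeline that connects to MongoDB (the class and database names are illustrative):

```python
from pymongo import MongoClient


class MongoPipeline(object):   # illustrative name
    def open_spider(self, spider):
        # any value defined in settings.py can be read through the spider
        host = spider.settings.get("MONGO_HOST", "localhost")
        self.client = MongoClient(host)
        self.collection = self.client["test"]["test"]
```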
pipeline
```python
def open_spider(self, spider):
    # runs once when the spider is opened
    client = MongoClient()
    self.collection = client["test"]["test"]

def close_spider(self, spider):
    # runs once when the spider closes (release resources here)
    pass
```
Comprehensive case study: Suning books spider / CrawlSpider
Create the project
```bash
$ scrapy startproject circ
$ cd circ
$ scrapy genspider -t crawl cf bxjg.circ.gov.cn
```
cf.py
```python
"""Extract certain fields from every detail page."""
import re

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class CfSpider(CrawlSpider):
    name = 'cf'
    allowed_domains = ['bxjg.circ.gov.cn']
    start_urls = ['http://bxjg.circ.gov.cn/web/site0/tab5240/module14430/page1.htm']

    rules = (
        # detail pages: parse with parse_item, do not follow links found on them
        Rule(LinkExtractor(allow=r'/web/site0/tab5240/info\d+\.htm'),
             callback='parse_item', follow=False),
        # list pages: no callback, just keep following the pagination links
        Rule(LinkExtractor(allow=r'/web/site0/tab5240/module14430/page\d+\.htm'),
             follow=True),
    )

    def parse_item(self, response):
        item = {}
        item['title'] = re.findall("<!--TitleStart-->(.*?)<!--TitleEnd-->",
                                   response.body.decode())[0]
        item['publish_date'] = re.findall(r"发布时间:(\d{4}-\d{2}-\d{2})",
                                          response.body.decode())[0]
        print(item)
```
Other
```bash
# run the spider
$ scrapy crawl cf
```

Notes on the other parameters: omitted.