
Reverse-Engineering Crawlers 16: Persistent Storage in Scrapy (by 一个小黑酱)

In Scrapy, data can be persisted to four kinds of destinations: ① a CSV file, ② a MySQL database, ③ a MongoDB database, and ④ file storage (binary files such as images).

This section uses two hands-on cases to show how Scrapy persists data.

Note: when first learning a framework, I think the most important thing is to work out the order in which the framework runs your code. Because a framework splits different responsibilities into different modules, functions, and files, words alone do not explain this well, so this section uses screenshots with hand-numbered markers to show the execution order.
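To make that execution order concrete, the sketch below is a minimal, hypothetical pipeline (DemoPipeline is not part of either case) showing the three hooks Scrapy calls on an item pipeline and when it calls them:

```python
# Minimal sketch of an item pipeline, illustrating the call order only.
class DemoPipeline:
    def open_spider(self, spider):
        # called once, when the spider starts: open files / connections here
        print("open_spider")

    def process_item(self, item, spider):
        # called once for every item the spider yields: write the data here
        print("process_item", item)
        # hand the item to the next enabled pipeline (lower ITEM_PIPELINES number runs first)
        return item

    def close_spider(self, spider):
        # called once, when the spider finishes: close files / connections here
        print("close_spider")
```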

Case 1: Shuangseqiu (双色球 lottery results)

Goals: explain the role of the items.py file, which standardizes the field names of the scraped data, and implement persistent storage to ① a CSV file, ② a MySQL database, and ③ a MongoDB database.

File directory structure:
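Assuming the project was created with `scrapy startproject caipiao` followed by `scrapy genspider shuangseqiu 500.com`, the tree is the standard Scrapy layout, which matches the imports used in the sources:

```
caipiao/
├── scrapy.cfg
└── caipiao/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── shuangseqiu.py
```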

(The original post walks through annotated screenshots, in execution order, of shuangseqiu.py, items.py, pipelines.py (its CSV, MySQL, and MongoDB sections), and settings.py. The screenshots are not reproduced here; the full sources follow.)

shuangseqiu.py source:

```python
import scrapy
from caipiao.items import CaipiaoItem  # import the Item class first


class ShuangseqiuSpider(scrapy.Spider):
    name = 'shuangseqiu'
    allowed_domains = ['500.com']                    # restrict the crawl to this domain
    start_urls = ['http://datachart.500.com/ssq/']   # start URL

    def parse(self, resp, **kwargs):
        trs = resp.xpath('//*[@id="tdata"]/tr')
        # result = []  # don't do this -- collecting everything into one list is wasteful
        for tr in trs:
            if tr.xpath('./@class').extract_first() == 'tdbck':  # skip the blank rows
                continue
            # red_ball = tr.xpath('./td[@class="chartBall01"]/text()').extract()
            # blue_ball = tr.xpath('./td[@class="chartBall02"]/text()').extract_first()
            qihao = tr.xpath('./td[1]/text()').extract_first().strip()
            red_ball = '_'.join(tr.css(".chartBall01::text").extract())
            blue_ball = tr.css(".chartBall02::text").extract_first()
            # print(qihao, red_ball, blue_ball)  # print to check the values

            cai = CaipiaoItem()
            # cai = dict()
            cai['qihao'] = qihao
            cai['red_ball'] = red_ball
            cai['blue_ball'] = blue_ball
            yield cai  # the right way: yield one item at a time

            # dic = {
            #     'qihao': qihao,
            #     'red_ball': red_ball,
            #     'blue_ball': blue_ball
            # }
            # result.append(dic)
        # return result  # don't do this
        # yield result   # don't do this
```

items.py source:

```python
import scrapy


class CaipiaoItem(scrapy.Item):
    # define the fields for your item here like:
    qihao = scrapy.Field()      # works like a dictionary key
    red_ball = scrapy.Field()
    blue_ball = scrapy.Field()
```

pipelines.py source:

```python
from itemadapter import ItemAdapter
import pymysql
import pymongo
from caipiao.settings import MYSQL

"""
Storage options:
1. save the data to a CSV file
2. save the data to a MySQL database
3. save the data to a MongoDB database
4. file storage (binary files such as images)
"""


class CaipiaoPipeline:
    """
    What we want: open the file when the spider starts,
    keep writing rows into it while the spider runs,
    and close the file when the spider finishes.
    """
    def open_spider(self, spider):
        self.f = open("./双色球.csv", mode="a", encoding="utf-8")  # 双色球 = shuangseqiu

    def close_spider(self, spider):
        if self.f:
            self.f.close()

    def process_item(self, item, spider):
        self.f.write(f"{item['qihao']},{item['red_ball']},{item['blue_ball']}\n")
        return item


class CaipiaoMySQLPipeline:
    def open_spider(self, spider):
        self.conn = pymysql.connect(
            host=MYSQL['host'],
            port=MYSQL['port'],
            user=MYSQL['user'],
            password=MYSQL['password'],
            database=MYSQL['database']
        )

    def close_spider(self, spider):
        if self.conn:
            self.conn.close()

    def process_item(self, item, spider):
        cursor = self.conn.cursor()
        try:
            sql = "insert into caipiao (qihao, red_ball, blue_ball) values (%s, %s, %s)"
            cursor.execute(sql, (item['qihao'], item['red_ball'], item['blue_ball']))
            self.conn.commit()
        except Exception:
            self.conn.rollback()
        finally:
            cursor.close()
        return item


class CaipiaoMongoDBPipeline:
    def open_spider(self, spider):
        self.client = pymongo.MongoClient(host='localhost', port=27017)
        db = self.client['haha']         # use this database
        self.collection = db['caipiao']  # use the caipiao collection

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # insert_one(): the old insert() has been removed from current pymongo
        self.collection.insert_one({
            "qihao": item['qihao'],
            "red_ball": item['red_ball'],
            "blue_ball": item['blue_ball']
        })
        return item
```

settings.py source:

```python
BOT_NAME = 'caipiao'

SPIDER_MODULES = ['caipiao.spiders']
NEWSPIDER_MODULE = 'caipiao.spiders'

LOG_LEVEL = "WARNING"

# MySQL configuration
MYSQL = {
    'host': 'localhost',
    'port': 3306,
    'user': 'root',
    'password': 'xxxxxx',
    'database': 'spider'
}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'caipiao.pipelines.CaipiaoPipeline': 300,
    'caipiao.pipelines.CaipiaoMySQLPipeline': 301,
    'caipiao.pipelines.CaipiaoMongoDBPipeline': 302
}

# Obey robots.txt rules
ROBOTSTXT_OBEY = True
```
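One practical note: CaipiaoMySQLPipeline assumes a caipiao table already exists in the spider database. Its schema is not shown above, so the one-off setup sketch below assumes simple VARCHAR columns and reuses the same connection values as settings.py:

```python
# One-off setup sketch: create the table CaipiaoMySQLPipeline inserts into.
# The column types are an assumption; the post does not show the real schema.
import pymysql

conn = pymysql.connect(host='localhost', port=3306, user='root',
                       password='xxxxxx', database='spider')
try:
    with conn.cursor() as cursor:
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS caipiao (
                id INT AUTO_INCREMENT PRIMARY KEY,
                qihao VARCHAR(16),      -- draw number
                red_ball VARCHAR(32),   -- red balls joined with '_'
                blue_ball VARCHAR(8)    -- blue ball
            ) CHARACTER SET utf8mb4
        """)
    conn.commit()
finally:
    conn.close()
```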

Case 2: Tupianzhijia (图片之家)

Goals: show how to collect the list of detail-page URLs from the start URL and then visit each detail page to scrape its data, and implement ② MySQL database storage and ④ file storage (binary files such as images).

File directory structure: the same as in the shuangseqiu case.

(Again the original post shows annotated screenshots in execution order: meinv.py (parsing the start page), meinv.py (parsing the detail pages), items.py, pipelines.py (the image-download part), pipelines.py (the save-to-MySQL part), and settings.py. The full sources follow.)

meinv.py source:

```python
import scrapy
# from urllib.parse import urljoin
from tupianzhijia.items import MeinvItem


class MeinvSpider(scrapy.Spider):
    name = 'meinv'
    allowed_domains = ['tupianzj.com']
    start_urls = ['http://tupianzj.com/bizhi/DNmeinv/']

    def parse(self, resp, **kwargs):
        # print(resp.text)  # check whether the page source contains what we need
        li_list = resp.xpath("//ul[@class='list_con_box_ul']/li")
        for li in li_list:
            href = li.xpath("./a/@href").extract_first()
            # print(href)  # print to check
            # In theory a network request should be made here.
            # Following Scrapy's workflow, turn href into a Request and hand it to the engine.
            # print(resp.urljoin(href))  # print to check
            yield scrapy.Request(
                url=resp.urljoin(href),      # join the response URL with the href we just extracted
                method='get',
                callback=self.parse_detial   # callback: how to handle the response once it comes back
            )

        # Also handle the next page.
        # The next listing page has the same layout, so this same parse method can handle it.
        next_href = resp.xpath('//div[@class="pages"]/ul/li/a[contains(text(), "下一页")]/@href').extract_first()
        yield scrapy.Request(
            url=resp.urljoin(next_href),  # join the response URL with the next-page href
            method='get',
            callback=self.parse           # callback: parse the next page with this same method
        )

    def parse_detial(self, resp, **kwargs):
        # print(resp.text)  # check whether the page source contains what we need
        name = resp.xpath('//*[@id="container"]/div/div/div[2]/h1/text()').extract_first()
        img_src = resp.xpath("//div[@id='bigpic']/a/img/@src").extract_first()
        # print(name, img_src)  # print to check
        meinv = MeinvItem()
        meinv['name'] = name
        meinv['img_src'] = img_src
        yield meinv
```

items.py source:

```python
import scrapy


class MeinvItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    img_src = scrapy.Field()
    local_path = scrapy.Field()
```

pipelines.py source:

```python
from itemadapter import ItemAdapter
from scrapy.pipelines.images import ImagesPipeline
import scrapy
import pymysql
from tupianzhijia.settings import MYSQL


class TupianzhijiaPipeline:
    def open_spider(self, spider):
        self.conn = pymysql.connect(
            host=MYSQL['host'],
            port=MYSQL['port'],
            user=MYSQL['user'],
            password=MYSQL['password'],
            database=MYSQL['database']
        )

    def close_spider(self, spider):
        if self.conn:
            self.conn.close()

    def process_item(self, item, spider):
        cursor = self.conn.cursor()
        try:
            sql = "insert into tu (name, img_src, local_path) values (%s, %s, %s)"
            cursor.execute(sql, (item['name'], item['img_src'], item['local_path']))
            self.conn.commit()
        except Exception:
            self.conn.rollback()
        finally:
            cursor.close()
        return item


# To use ImagesPipeline you must also add a dedicated setting (IMAGES_STORE) for the folder files are saved into.
class MeinvSavePipeline(ImagesPipeline):  # let the images pipeline do the downloading for us
    """
    Override three methods of the parent class.
    """
    def get_media_requests(self, item, info):
        # handles the download: just yield a request for the image URL
        yield scrapy.Request(item['img_src'])

    def file_path(self, request, response=None, info=None, *, item=None):
        # builds the file path; request.url is the URL that was just requested
        file_name = request.url.split("/")[-1]
        return f"img/{file_name}"

    def item_completed(self, results, item, info):
        # receives the download results; record where the file was saved
        ok, finfo = results[0]
        # print(results)
        item['local_path'] = finfo['path']
        return item
```

settings.py source:

```python
BOT_NAME = 'tupianzhijia'

SPIDER_MODULES = ['tupianzhijia.spiders']
NEWSPIDER_MODULE = 'tupianzhijia.spiders'

LOG_LEVEL = "WARNING"

# MySQL configuration
MYSQL = {
    'host': 'localhost',
    'port': 3306,
    'user': 'root',
    'password': 'xxxxxxxx',
    'database': 'spider'
}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'tupianzhijia.pipelines.TupianzhijiaPipeline': 300,
    'tupianzhijia.pipelines.MeinvSavePipeline': 299   # lower number runs first: download (and set local_path) before the MySQL insert
}

IMAGES_STORE = './meinvtupian'   # required by ImagesPipeline: root folder for downloaded files

# Obey robots.txt rules
ROBOTSTXT_OBEY = True
```
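To sanity-check the end-to-end run, a minimal sketch like the following (assuming the same tu table and MySQL credentials as settings.py) reads a few rows back and confirms that the downloaded files exist under IMAGES_STORE:

```python
# Verification sketch: do the MySQL rows and the downloaded files line up?
import os
import pymysql

IMAGES_STORE = './meinvtupian'   # must match settings.py

conn = pymysql.connect(host='localhost', port=3306, user='root',
                       password='xxxxxxxx', database='spider')
try:
    with conn.cursor() as cursor:
        cursor.execute("SELECT name, img_src, local_path FROM tu LIMIT 5")
        for name, img_src, local_path in cursor.fetchall():
            full_path = os.path.join(IMAGES_STORE, local_path)
            print(name, img_src, full_path, os.path.exists(full_path))
finally:
    conn.close()
```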

Summary:

The key points to understand in this section are:

1. the order in which the code in the two cases runs;
2. how each of the persistent-storage methods is implemented.


