原创

爬虫框架scrapy学习笔记


爬虫框架scrapy学习笔记

1. 框架运行大致流程

  1. 引擎从调度器中取出一个链接(URL)用于接下来的抓取
  2. 引擎把URL封装成一个请求(Request)传给下载器
  3. 下载器把资源下载下来,并封装成应答包(Response)
  4. 爬虫解析Response
  5. 解析出实体(Item),则交给实体管道进行进一步的处理
  6. 解析出的是链接(URL),则把URL交给调度器等待抓取

2. 安装与使用

  1. pip install scrapy -i 资源网址(详见爬虫基础笔记)
  2. 使用:
  3. 定位到要创建项目的目录
  4. 创建项目:scrapy startproject 项目名(例如:scrapy startproject myfirst)
  5. 创建爬虫:scrapy genspider 爬虫名 爬虫的地址
  6. 运行爬虫:scrapy crawl 爬虫名

3. 案例(爬取小说)

81中文网小说爬取(道神和替天行道)

ttxd.py

# -*- coding: utf-8 -*-
import scrapy


class TtxdSpider(scrapy.Spider):
    """Crawl the novel 替天行道 on zwdu.com chapter by chapter.

    Starts at the first chapter and keeps following the "next chapter"
    link for as long as it points at an ``.html`` page.
    """
    name = 'ttxd'
    allowed_domains = ['zwdu.com']
    # Start from chapter one of the novel.
    start_urls = ['https://www.zwdu.com/book/28364/9673844.html']

    # For testing only the last few chapters:
    # start_urls = ['https://www.zwdu.com/book/28364/19653880.html']

    def parse(self, response):
        """Yield one {'title', 'content'} dict per chapter, then follow the next-page link."""
        title = response.xpath('//h1/text()').extract_first()
        content = ''.join(response.xpath('//div[@id="content"]/text()').extract()).replace('    ', '\n')

        yield {
            'title': title,
            'content': content
        }

        # The third link in the bottom navigation bar is "next chapter".
        next_url = response.xpath('//div[@class="bottem1"]/a[3]/@href').extract_first()
        # Fix: extract_first() may return None (e.g. on the last page); the
        # original called next_url.find(...) and crashed with AttributeError.
        if next_url and '.html' in next_url:
            yield scrapy.Request(response.urljoin(next_url))

ds.py

ds.py使用了 CrawlSpider模板
创建爬虫需执行命令:scrapy genspider -t crawl 文件名 (allowed_url)
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class DsSpider(CrawlSpider):
    """Crawl the novel 道神 on zwdu.com via CrawlSpider link-extraction rules."""
    name = 'ds'
    allowed_domains = ['zwdu.com']
    start_urls = ['https://www.zwdu.com/book/8725/']

    rules = (
        # Link to the first chapter inside the table of contents.
        Rule(LinkExtractor(restrict_xpaths=r'//div[@id="list"]//dd[2]'), callback='parse_item', follow=True),
        # "Next chapter" link at the bottom of every chapter page.
        # Fix: the original used the absolute path '/div[...]', which only
        # matches a <div> as the document root and therefore never matched;
        # it must be the descendant axis '//div[...]'.
        Rule(LinkExtractor(restrict_xpaths=r'//div[@class="bottem1"]/a[3]'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Extract the chapter title and body text from a chapter page."""
        title = response.xpath('//h1/text()').extract_first()
        content = ''.join(response.xpath('//div[@id="content"]/text()').extract()).replace('    ', '\n')

        yield {
            'title': title,
            'content': content
        }


main.py

from scrapy.cmdline import execute

# Launch the 'ttxd' spider programmatically — equivalent to running
# "scrapy crawl ttxd" from the project directory.
execute(['scrapy','crawl','ttxd'])
# execute(['scrapy','crawl','ds'])

pipline.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


class XiaoshuoPipeline(object):
    """Persist scraped chapters to a local text file, one title per line."""

    def open_spider(self, spider):
        # One output file for the whole crawl, truncated on every run.
        # self.file = open('ttxd.txt', 'w', encoding='utf-8')
        self.file = open('ds.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        title, content = item['title'], item['content']
        # Only the title is persisted; the line below would dump the body too.
        # info = '\n-------'+title+'------\n'+content+'\n'
        self.file.write(title + '\n')
        self.file.flush()  # flush per item so progress survives an interruption
        return item

    def close_spider(self, spider):
        self.file.close()

setting.py

# -*- coding: utf-8 -*-

# Scrapy settings for xiaoshuo project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'xiaoshuo'

SPIDER_MODULES = ['xiaoshuo.spiders']
NEWSPIDER_MODULE = 'xiaoshuo.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
# Fixed desktop-Chrome UA string so the novel site serves normal pages.
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36'

# Obey robots.txt rules
# NOTE(review): robots.txt is deliberately ignored for this demo crawl.
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# 2-second pause between requests to the same site — be gentle with it.
DOWNLOAD_DELAY = 2
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'xiaoshuo.middlewares.XiaoshuoSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'xiaoshuo.middlewares.XiaoshuoDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# Enable the text-file pipeline defined in pipelines.py (priority 300).
ITEM_PIPELINES = {
   'xiaoshuo.pipelines.XiaoshuoPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

4. scrapy的登录

普通传参登录
# -*- coding: utf-8 -*-
import scrapy


class Login1Spider(scrapy.Spider):
    """Log in to zengqiang.club by POSTing the credentials as form data."""
    name = 'login1'
    allowed_domains = ['zengqiang.club']
    # start_urls = ['http://zengqiang.club/admin']

    def start_requests(self):
        # POST requests are sent with scrapy.FormRequest().
        # NOTE(review): credentials are hard-coded in plain text.
        form_data = {
            'username': '曾强',
            'password': 'ZQZ981004'
        }
        yield scrapy.FormRequest(
            'http://www.zengqiang.club/admin/login',
            formdata=form_data,
            callback=self.parse,
        )

    def parse(self, response):
        # print(response.text)
        # The session cookie obtained by logging in lets us open the blog list.
        yield scrapy.Request('http://www.zengqiang.club/admin/blogs', callback=self.parse_info)

    def parse_info(self, response):
        # The second table column holds the blog titles.
        for title in response.xpath('//tr//td[2]/text()').extract():
            print(title)
cookie登录
# -*- coding: utf-8 -*-
import scrapy


class Login2Spider(scrapy.Spider):
    """Access a logged-in page by replaying an existing session cookie."""
    name = 'login2'
    allowed_domains = ['zengqiang.club']
    # start_urls = ['http://zengqiang.club/']

    def start_requests(self):
        url = 'http://www.zengqiang.club/admin/blogs'
        # Raw Cookie header value, kept for the parsing recipe below.
        cookie_str = 'JSESSIONID=CBB390075280E7FA8BB4B7A3A7890D94;'
        cookies = {
            'JSESSIONID': 'CBB390075280E7FA8BB4B7A3A7890D94'
        }
        # Handy when the cookie string carries many key=value pairs:
        # cookies = {}
        # for cookie in cookie_str.split(';'):
        #     key, value = cookie.split('=', 1)
        #     cookies[key.strip()] = value.strip()
        yield scrapy.Request(url, cookies=cookies, callback=self.parse)

    def parse(self, response):
        print(response.text)

需要验证码的登录(赶集网案例)
# -*- coding: utf-8 -*-
import scrapy,re


class Login3Spider(scrapy.Spider):
    """Log in to ganji.com, solving the image captcha by hand.

    Flow: fetch the login page -> pull the ``__hash__`` token and captcha
    image URL -> save the image locally and ask the operator to type the
    code -> POST the login form -> verify by fetching a member-only page.
    """
    name = 'login3'
    allowed_domains = ['ganji.com']
    start_urls = ['https://passport.ganji.com/login.php']

    def parse(self, response):
        # Fix: use a non-greedy group. The original r'"__hash__":"(.+)"' is
        # greedy and captures up to the LAST '"' on the line, so any text
        # following the token on the same line corrupted the hash.
        hash_code = re.findall(r'"__hash__":"(.+?)"', response.text)[0]
        image_url = response.xpath('//img[@class="login-img-checkcode"]/@data-url').extract_first()
        print(hash_code,'\n',image_url)

        # Carry the token along while downloading the captcha image.
        yield scrapy.Request(image_url,callback=self.parse_info,meta={'hash_code':hash_code})

    def parse_info(self,response):
        hash_code = response.request.meta['hash_code']
        print(hash_code)
        # Save the captcha so a human can read it off disk.
        with open('yzm.jpg','wb')as f:
            f.write(response.body)

        code = input('请输入验证码:')

        # NOTE(review): account credentials are hard-coded in plain text.
        form_data = {
            "username": "17784476955",
            "password": "ZQZ981004",
            "setcookie": "0",
            "checkCode": code,
            "next": "/",
            "source": "passport",
            "__hash__": hash_code
        }

        login_url = 'https://passport.ganji.com/login.php'
        yield scrapy.FormRequest(login_url,formdata=form_data,callback=self.login)

    def login(self,response):
        print(response.text)
        # Fetch a page that only renders for authenticated users.
        user_info_url = 'http://www.ganji.com/vip'
        yield scrapy.Request(user_info_url,callback=self.login_check)

    def login_check(self,response):
        print(response.text)

5. scrapy代理的使用

创建一个代理中间件
在setting中打开自己定义的中间件,使用代理
# proxymiddleware.py

class ProxyMiddleware(object):
    """Downloader middleware routing every request through one fixed HTTP proxy."""

    # Meta-key formats Scrapy understands:
    #   request.meta['proxy'] = 'http://ip:port'
    #   request.meta['proxy'] = 'http://user:password@ip:port'

    def process_request(self, request, spider):
        request.meta['proxy'] = 'http://222.95.240.159:3000'
# setting中打开
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# Register the custom proxy middleware at priority 301.
DOWNLOADER_MIDDLEWARES = {
   # 'zol.middlewares.ZolDownloaderMiddleware': 543,
   'zol.proxymiddleware.ProxyMiddleware': 301
}

6. 爬取zol桌面壁纸案例

wallpaper.py

# -*- coding: utf-8 -*-
import scrapy


class WallpaperSpider(scrapy.Spider):
    """Scrape ZOL desktop wallpapers, paging through one gallery."""
    name = 'wallpaper'
    allowed_domains = ['zol.com.cn']
    start_urls = ['http://desk.zol.com.cn/bizhi/8672_106957_2.html']

    def parse(self, response):
        # 'image_urls' is the key the custom ImagesPipeline consumes.
        image_url = response.xpath('//img[@id="bigImg"]/@src').extract()
        image_name = response.xpath('string(//h3)').extract_first()

        yield {
            'image_urls': image_url,
            'image_name': image_name
        }

        next_url = response.xpath('//a[@id="pageNext"]/@href').extract_first()
        # Fix: extract_first() may return None on the last gallery page; the
        # original called next_url.find(...) and crashed with AttributeError.
        if next_url and '.html' in next_url:
            yield scrapy.Request(response.urljoin(next_url), callback=self.parse)

plpeline.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.pipelines.images import ImagesPipeline
import scrapy

class ZolPipeline(object):
    # Default pass-through pipeline (disabled; ImagePipeline below is active).
    def process_item(self, item, spider):
        return item


class ImagePipeline(ImagesPipeline):
    # Custom ImagesPipeline that names downloaded files after the wallpaper
    # title instead of the default URL-hash filename.
    def get_media_requests(self, item, info):
        # One download request per URL; pass the title along for file_path().
        for image_url in item['image_urls']:
            yield scrapy.Request(image_url, meta={'image_name':item['image_name']})

    def file_path(self, request, response=None, info=None):
        # Strip the page's whitespace artifacts and replace '/' so the title
        # cannot be interpreted as a sub-directory under IMAGES_STORE.
        # NOTE(review): newer Scrapy versions pass an extra ``item`` argument
        # to file_path — confirm against the installed version.
        filename = request.meta['image_name'].strip().replace('\r\n\t\t','').replace('/','_')+'.jpg'
        return filename

setting

from fake_useragent import UserAgent


BOT_NAME = 'zol'

SPIDER_MODULES = ['zol.spiders']
NEWSPIDER_MODULE = 'zol.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
# NOTE(review): UserAgent().random is evaluated once at import time, so the
# whole run still shares a single (randomly chosen) UA string.
USER_AGENT = UserAgent().random

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0


ITEM_PIPELINES = {
   # 'zol.pipelines.ZolPipeline': 300,
   # 'scrapy.contrib.pipeline.images.ImagesPipeline': 300,  (this path does not work)
   # 'scrapy.pipelines.images.ImagesPipeline': 300,
   'zol.pipelines.ImagePipeline': 300,
}
# Root directory where the images pipeline stores downloaded files.
IMAGES_STORE = 'F:/PythonProjects/study/爬虫学习/scrapy框架/zol/zol/img'

7. scrapy使用selenium

selenium用于爬取Ajax异步请求网站的数据
爬取瓜子二手车页面代码(Ajax请求的网页)
实现爬取完成后自动关闭浏览器(而不是程序未结束时一直不关闭浏览器)

guazi.py

# -*- coding: utf-8 -*-
import scrapy
from scrapy import signals
from selenium import webdriver


class GuaziSpider(scrapy.Spider):
    """Scrape guazi.com used-car listings, rendering pages with Selenium.

    The browser is created in from_crawler() and torn down on the
    spider_closed signal, so it is closed exactly once per crawl.
    """
    name = 'guazi'
    # Fix: the original had the typo 'gauzi.com', which would make the
    # offsite middleware drop any follow-up requests to guazi.com.
    allowed_domains = ['guazi.com']
    start_urls = ['https://www.guazi.com/cd/buy/']

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(GuaziSpider, cls).from_crawler(crawler, *args, **kwargs)
        # One shared browser per spider; the Selenium middleware drives it.
        spider.driver = webdriver.Chrome()
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider

    def spider_closed(self, spider):
        # Quit the browser when the crawl finishes.
        spider.driver.quit()

    def parse(self, response):
        # The body is the Selenium-rendered page, so Ajax content is present.
        print(response.text)

middlewares.py

from selenium import webdriver
from scrapy.http import HtmlResponse


class SeleniumMiddleware(object):
    # Downloader middleware that fetches pages with the spider's Selenium
    # browser so Ajax-rendered content is present in the response body.

    # Creating the driver here would leave the browser open after the
    # program ends:
    # def __init__(self):
    #     self.driver = webdriver.Chrome()

    def process_request(self, request, spider):
        url = request.url
        # driver = webdriver.Chrome()
        # self.driver.get(url)
        # html = self.driver.page_source

        # Reuse the driver owned by the spider (quit via its spider_closed).
        spider.driver.get(url)
        html = spider.driver.page_source
        # print(html)
        return HtmlResponse(url=url, body=html, request=request, encoding='utf-8')

setting.py(略)

8. scrapy案例

爬取链家二手房的房源数据(地点,价格,大小,等等)
保存数据到Mongo数据库和Mysql数据库

lianjia.py

# -*- coding: utf-8 -*-
import scrapy


class LianjiSpider(scrapy.Spider):
    """Scrape Lianjia (Chengdu) second-hand housing listings."""
    name = 'lianjia'
    allowed_domains = ['lianjia.com']
    # For testing: crawl only the first two index pages (60 listings).
    start_urls = ['https://cd.lianjia.com/ershoufang/cd/pg{}/'.format(num) for num in range(1, 3)]
    # Single-page test:
    # start_urls = ['https://cd.lianjia.com/ershoufang/cd/pg1/']

    def parse(self, response):
        # print(response.url)
        # Follow every listing link on the index page to its detail page.
        urls = response.xpath('//div[@class="info clear"]//div[@class="title"]/a/@href').extract()
        for url in urls:
            yield scrapy.Request(url, callback=self.parse_info)

    def parse_info(self, response):
        # print(response.url)
        # Listing title
        title = response.xpath('//h1/text()').extract_first()
        # Total price (amount concatenated with its unit)
        total_price = response.xpath(
            'concat(//div[@class="price "]//span[@class="total"]/text(),//div[@class="price "]//span[@class="unit"]/span/text())').extract_first()
        # Price per square metre
        unitPriceValue = response.xpath('string(//span[@class="unitPriceValue"])').extract_first()
        # Address / district
        areaName = response.xpath(
            'concat(//div[@class="areaName"]//span[2],//div[@class="areaName"]/a)').extract_first()
        # Residential community name
        village = response.xpath('//div[@class="communityName"]/a[1]/text()').extract_first()

        # Floor plan (rooms and halls)
        hu_xing = response.xpath('//div[@class="base"]//ul/li[1]/text()').extract_first()
        # Floor
        lou_ceng = response.xpath('//div[@class="base"]//ul/li[2]/text()').extract_first()
        # Floor area
        area = response.xpath('//div[@class="base"]//ul/li[3]/text()').extract_first()
        # Property-ownership (产权) info
        chan_quan = response.xpath('//div[@class="transaction"]//ul/li[last()-2]/span[2]/text()').extract_first()

        yield {
            'title': title,
            'total_price': total_price,
            'unitPriceValue': unitPriceValue,
            'areaName': areaName,
            'village': village,
            'hu_xing': hu_xing,
            'lou_ceng': lou_ceng,
            'area': area,
            'chan_quan': chan_quan
        }

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo
import pymysql


# class LianjiaHourseMongoPipeline(object):
#
#     def open_spider(self, spider):
#         self.client = pymongo.MongoClient()
#
#     def process_item(self, item, spider):
#         self.client.lianjia.ershoufang.insert(item)
#         return item
#
#     def close_spider(self, spider):
#         self.client.close()


class LianjiaHourseMysqlPipeline(object):
    """Persist every scraped listing into the MySQL table lianjia_ershoufang."""

    def open_spider(self, spider):
        # NOTE(review): connection settings and password are hard-coded.
        self.client = pymysql.connect(host='localhost', port=3306, user='root', password='ZQZ981004', db='python',
                                      charset='utf8')
        self.cursor = self.client.cursor()

    def process_item(self, item, spider):
        # Column order must match the table definition; the literal 0 fills
        # the auto-increment id column.
        fields = ('title', 'total_price', 'unitPriceValue', 'areaName',
                  'village', 'hu_xing', 'lou_ceng', 'area', 'chan_quan')
        sql = 'insert into lianjia_ershoufang values (0,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
        self.cursor.execute(sql, [item[field] for field in fields])
        self.client.commit()  # commit per item so rows survive a crash
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.client.close()

setting.py(略)

资料

链接:https://pan.baidu.com/s/10e8PphvR7Um0-WPAylw8Yw

提取码:h8i8

python
爬虫
  • 作者:曾强(联系作者)
  • 发表时间:2020-02-29 21:23
  • 版权声明:自由转载-非商用-非衍生-保持署名
  • 转载声明:转载时请注明出处:www.zengqiang.club
  • 注:如果文章有错误,望请评论指出,谢谢;如果看了文章还有不明白的地方,欢迎进群与我交流。
  • 评论