Python Crawler Case Studies (Part 2)

1. Movie Heaven (dytt8.net) Case

import scrapy
from scrapy_movie.items import ScrapyMovieItem

class MvSpider(scrapy.Spider):
    name = 'mv'
    allowed_domains = ['www.dytt8.net']
    start_urls = ['https://www.dytt8.net/html/gndy/china/index.html']

    def parse(self, response):
        # The movie name comes from the list page, the poster image from the detail page
        a_list = response.xpath('//div[@class="co_content8"]//td[2]//a[2]')
        for a in a_list:
            # Get the name and the link to follow from the list page
            name = a.xpath('./text()').extract_first()
            href = a.xpath('./@href').extract_first()

            # Build the absolute URL of the detail page
            url = 'https://www.dytt8.net' + href
            # Request the detail page and pass the name along via meta
            yield scrapy.Request(url=url, callback=self.parse_second, meta={'name': name})

    def parse_second(self, response):
        # If no data comes back, double-check that the xpath is correct
        src = response.xpath('//div[@id="Zoom"]//img/@src').extract_first()
        # Read the name passed in through the request's meta
        name = response.meta['name']
        movie = ScrapyMovieItem(src=src, name=name)
        yield movie
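
The spider imports ScrapyMovieItem from scrapy_movie.items, but the post never shows that file; a minimal sketch of the item class the spider assumes would be:

items.py

import scrapy

class ScrapyMovieItem(scrapy.Item):
    # movie name taken from the list page
    name = scrapy.Field()
    # poster image URL taken from the detail page
    src = scrapy.Field()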

pipelines.py

from itemadapter import ItemAdapter
class ScrapyMoviePipeline:
    def open_spider(self,spider):
        self.fp = open('movie.json','w',encoding='utf-8')
    def process_item(self, item, spider):
        self.fp.write(str(item))
        return item
    def close_spider(self,spider):
        self.fp.close()
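
For this pipeline to run it must be enabled in the project's settings; the post doesn't show that step for the movie project, so the class path and priority below are assumptions following the usual Scrapy layout:

# settings.py -- enable the pipeline (priority value assumed)
ITEM_PIPELINES = {
    'scrapy_movie.pipelines.ScrapyMoviePipeline': 300,
}

The spider is then started from the project directory with scrapy crawl mv.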

2. Using a Link Extractor: CrawlSpider

Case: dushu.com (读书网)

scrapy shell https://www.dushu.com/book/1188.html

from scrapy.linkextractors import LinkExtractor

# allow(): regular expression, extract the links that match the pattern
link = LinkExtractor(allow=r'/book/1188_\d+\.html')
link.extract_links(response)

# restrict_xpaths(): extract the links found under the given xpath
link1 = LinkExtractor(restrict_xpaths='//div[@class="pages"]/a')
link1.extract_links(response)

# restrict_css(): extract the links matched by a CSS selector
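
The post stops at the restrict_css() comment; assuming the pagination links sit in the same div with class "pages" as in the xpath example above, an equivalent sketch would be:

link2 = LinkExtractor(restrict_css='.pages a')
link2.extract_links(response)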


1. Create the project: scrapy startproject scrapy_readbook (the project name)
2. Change into the spiders directory:
cd D:\PythonCode\scrapy_readbook\scrapy_readbook\spiders
3. Generate the crawl spider:
scrapy genspider -t crawl read https://www.dushu.com/book/1188.html

(1) Edit read.py

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy_readbook.items import ScrapyReadbookItem

class ReadSpider(CrawlSpider):
    name = 'read'
    allowed_domains = ['www.dushu.com']
    start_urls = ['https://www.dushu.com/book/1188_1.html']
    rules = (
        # in the regex, \d+ matches one or more digits (the page number)
        Rule(LinkExtractor(allow=r'/book/1188_\d+\.html'),
             callback='parse_item',
             follow=False),
    )

    def parse_item(self, response):
        img_list = response.xpath('//div[@class="bookslist"]//img')
        for img in img_list:
            name = img.xpath('./@alt').extract_first()
            src = img.xpath('./@data-original').extract_first()
            book = ScrapyReadbookItem(name=name, src=src)
            yield book

(2) Define the fields in items.py

import scrapy

class ScrapyReadbookItem(scrapy.Item):
    name = scrapy.Field()
    src = scrapy.Field()

(3) Enable the pipeline in settings.py

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'scrapy_readbook.pipelines.ScrapyReadbookPipeline': 300,
}

(4) Customize the pipeline in pipelines.py

from itemadapter import ItemAdapter
class ScrapyReadbookPipeline:
    def open_spider(self,spider):
        self.fp = open('book.json','w',encoding='utf-8')
    def process_item(self, item, spider):
        self.fp.write(str(item))
        return item
    def close_spider(self,spider):
        self.fp.close()
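
With the spider, item, settings and pipeline in place, the crawl is started from the project directory in the usual way:

scrapy crawl read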


3. Storing the dushu.com Data in a Database

(1) Add the database settings to settings.py

DB_HOST = '127.0.0.1'
# port number
DB_PORT = 3306
DB_USER = 'root'
DB_PASSWORD = '123456'
DB_NAME = 'book'
# note: 'utf8', without the hyphen
DB_CHARSET = 'utf8'

(2) Install pymysql

pip install pymysql -i https://pypi.douban.com/simple
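
The MysqlPipeline in the next step inserts rows into a book table (columns name and src) inside the book database configured above. The post doesn't show how that table was created; a one-off sketch using pymysql, with assumed column types, might look like this:

# create_table.py -- assumed schema, not shown in the original post
import pymysql

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       password='123456', charset='utf8')
cursor = conn.cursor()
cursor.execute('CREATE DATABASE IF NOT EXISTS book DEFAULT CHARACTER SET utf8')
cursor.execute(
    'CREATE TABLE IF NOT EXISTS book.book ('
    ' id INT PRIMARY KEY AUTO_INCREMENT,'
    ' name VARCHAR(255),'
    ' src VARCHAR(255))'
)
conn.commit()
cursor.close()
conn.close()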

(3) Add a custom database pipeline in pipelines.py

from itemadapter import ItemAdapter

class ScrapyReadbookPipeline:
    def open_spider(self, spider):
        self.fp = open('book.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        self.fp.write(str(item))
        return item

    def close_spider(self, spider):
        self.fp.close()


# load the project settings file
from scrapy.utils.project import get_project_settings
import pymysql

class MysqlPipeline:

    def open_spider(self, spider):
        # read the DB_* values defined in settings.py
        settings = get_project_settings()
        self.host = settings['DB_HOST']
        self.port = settings['DB_PORT']
        self.user = settings['DB_USER']
        self.password = settings['DB_PASSWORD']
        self.name = settings['DB_NAME']
        self.charset = settings['DB_CHARSET']
        self.connect()

    def connect(self):
        self.conn = pymysql.connect(
            host=self.host,
            port=self.port,
            user=self.user,
            password=self.password,
            db=self.name,
            charset=self.charset
        )
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # parameterized query; safer than formatting the values into the SQL string
        sql = 'insert into book(name, src) values (%s, %s)'
        # execute the SQL statement
        self.cursor.execute(sql, (item['name'], item['src']))
        # commit the transaction
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
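
Both pipelines then need to be registered in settings.py; the priorities below are assumptions (lower numbers run first):

ITEM_PIPELINES = {
    'scrapy_readbook.pipelines.ScrapyReadbookPipeline': 300,
    'scrapy_readbook.pipelines.MysqlPipeline': 301,
}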

If the follow attribute in read.py is set to True, the spider keeps following the pagination links it finds on every crawled page and therefore fetches all of the data.
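
A sketch of the changed rule (only follow differs from the version shown earlier):

rules = (
    Rule(LinkExtractor(allow=r'/book/1188_\d+\.html'),
         callback='parse_item',
         follow=True),   # also follow matching links found on the pages that are crawled
)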

4. Scrapy Log Messages and Log Levels

Specify the log level in the settings.py file.
The log output can also be saved to a file.
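
Both options are plain entries in settings.py; the level and file name below are example values:

# settings.py -- logging
LOG_LEVEL = 'WARNING'      # CRITICAL > ERROR > WARNING > INFO > DEBUG (default is DEBUG)
LOG_FILE = 'logdemo.log'   # when set, log output is written to this file instead of the console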

5. Scrapy POST Request: Baidu Translate

import scrapy
import json

class TestpostSpider(scrapy.Spider):
    name = 'testpost'
    allowed_domains = ['fanyi.baidu.com']
    # A POST request without parameters is meaningless, so start_urls
    # (and the default parse callback) are not used here.
    # start_urls = ['https://fanyi.baidu.com/sug']
    #
    # def parse(self, response):
    #     pass

    def start_requests(self):
        url = 'https://fanyi.baidu.com/sug'
        data = {
            'kw': 'final'
        }
        # FormRequest sends the data as the POST body
        yield scrapy.FormRequest(url=url, formdata=data, callback=self.parse_second)

    def parse_second(self, response):
        content = response.text
        obj = json.loads(content)
        print(obj)
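
If nothing is printed when running scrapy crawl testpost, the request may have been filtered by the site's robots.txt; a common tweak in this kind of demo (an assumption, not something stated in the post) is to disable the check in settings.py:

# settings.py -- only if requests are filtered by robots.txt (assumption, not from the post)
ROBOTSTXT_OBEY = False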

That wraps up the Scrapy series!


Reposted from blog.csdn.net/qq_45556665/article/details/125498464