Writing crawlers with the Scrapy framework

Structure diagram

There are two kinds of objects the spider can yield to the engine:

    # 1. A URL (request):
    # (Spider) yield scrapy.Request -> Engine -> Scheduler (the engine enqueues the request) -> Engine (the scheduler dequeues the request back to the engine)
    # -> Downloader (the engine hands the request to the downloader) -> Engine (the downloader reports success or failure to the engine): -> Spider (on success the engine passes the response to the spider) or -> Scheduler (on failure the request goes back to the scheduler to be downloaded again)
    # -> Engine (the spider processes the response and yields follow-up URLs: yield scrapy.Request) -> Scheduler (the engine enqueues them) -> ...
    # 2. An item (weiboitem):
    # Once the response has been received there is usually data to save, so:
    # (Spider) yield weiboitem -> Engine -> item pipelines for storage (you write the storage code in the pipeline yourself) -> MySQL or Redis
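
To make the flow concrete, here is a minimal sketch of a spider callback that yields both kinds of objects (the spider name, selector and follow-up URL are illustrative placeholders, not taken from the project below):

import scrapy

class ExampleSpider(scrapy.Spider):
    name = 'example'
    start_urls = ['http://example.com/']

    def parse(self, response):
        # Case 2: yield an item -> engine -> item pipelines (storage)
        yield {'title': response.css('title::text').get()}

        # Case 1: yield a follow-up request -> engine -> scheduler -> downloader -> back to a callback
        next_url = response.urljoin('/page/2')                  # illustrative URL
        yield scrapy.Request(next_url, callback=self.parse)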

1. Preparation

  • python, pip, scrapy (pip install Scrapy)
  • Test: scrapy fetch http://www.baidu.com

2. Build the Scrapy project

  • Create a crawler project (cmd or terminal): scrapy startproject mySpiderName
  • cd into the project: cd mySpiderName
  • Create a crawler: scrapy genspider myspidername www.dytt8.net
    (www.dytt8.net is the root domain of the site to be crawled; only URLs under this root domain will be crawled)
  • Modify the robots protocol setting in settings.py: ROBOTSTXT_OBEY = False
  • Remember to add an entry to the ITEM_PIPELINES dict in settings.py (uncomment it), otherwise the pipeline will not be executed (see the snippet after this list):
    'mySpiderName.pipelines.WeiboSpiderPipeline': 300,
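
Together, the two settings changes above look like this in settings.py (a sketch; the project is assumed to be named mySpiderName, matching the startproject command):

# settings.py
ROBOTSTXT_OBEY = False

ITEM_PIPELINES = {
    'mySpiderName.pipelines.WeiboSpiderPipeline': 300,     # lower number = runs earlier
}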

3. Fill in the code (spider, items, pipelines)

  • Edit the generated spider file (e.g. myspidername.py) in the spiders folder:
import scrapy
from hotnewsSpider.items import WeiboSpiderItem     # hotnewsSpider is the project name; WeiboSpiderItem is the spider's item class, found in items.py
                                                     # (when creating a second spider you must add its class to items.py yourself)

from bs4 import BeautifulSoup

class WeiboSpider(scrapy.Spider):
    # Using Weibo search as an example:
    name = 'weibo'                          # spider name -- auto-generated, unique, do not change
    allowed_domains = ['s.weibo.com']       # root domains the spider is allowed to visit
    start_urls = ['http://s.weibo.com/']    # initial URLs
    
    searchName = "张钧甯 感谢抬爱"
    headers = {

    }
    cookies = {

    }
    urls = [
        # Simulate a search for searchName
        "https://s.weibo.com/weibo?q=%s&Refer=SWeibo_box" % searchName
    ]
    # urls.extend(start_urls)

    # Override the default start requests so that extra information (headers, cookies) can be attached.
    def start_requests(self):
        # Send the initial requests
        for url in self.urls:
            yield scrapy.Request(url=url, headers=self.headers, cookies=self.cookies, callback=self.parse)

    # parse() is the default callback for the first response. Later responses can come back here as well,
    # or go to any callback you define -- that is decided by yield scrapy.Request(url, callback=self.your_parse).
    def parse(self, response):
    
        # The spider's item class (WeiboSpiderItem, imported from hotnewsSpider.items) behaves like a
        # dict and is used to collect the data, which is handed back to the engine via `yield weiboitem`.
        # A fresh item is created inside the loop below, one per search result.

        # BeautifulSoup block:
        html = response.text
        soup = BeautifulSoup(html, 'lxml')
        content_id_div = soup.find(id='pl_feedlist_index')
        card_wraps = content_id_div.find_all(class_='card-wrap')

        id = 0

        for card_wrap_item in card_wraps:
            # One fresh item per result (safer than reusing a single mutable item across yields)
            weiboitem = WeiboSpiderItem()

            # Username
            username = card_wrap_item.find(class_='info').find(class_='name').text

            # User avatar
            user_headimg = card_wrap_item.find(class_='avator').find('img')['src']

            # Content
            # Text (occasionally the search returns a user card rather than a post)
            content_text_html = card_wrap_item.find(class_='txt')
            content_text = ''
            if content_text_html:
                content_text = content_text_html.get_text().replace(' ', '').replace('\n', '').replace('展开全文c', '')

            # Images (some posts have none)
            img_items_html = card_wrap_item.find(class_='m3')
            content_imgs = []
            if img_items_html:
                for img_item in img_items_html.find_all('img'):
                    content_imgs.append(img_item['src'])

            # Counts of favorites (收藏), reposts (转发), comments (评论) and likes (点赞)
            other_items_html = card_wrap_item.find(class_='card-act')
            other_items_dic = {}
            if other_items_html:
                other_items_lst = other_items_html.find_all('a')
                for other_item_index in range(len(other_items_lst)):
                    if other_item_index == 0:
                        other_items_dic['收藏'] = ""
                    elif other_item_index == 1:
                        other_items_dic['转发'] = other_items_lst[other_item_index].text.strip().split()[1]
                    elif other_item_index == 2:
                        other_items_dic['评论'] = other_items_lst[other_item_index].text.strip().split()[1]
                    else:
                        other_items_dic['点赞'] = other_items_lst[other_item_index].text.strip()
            # print(other_items_dic)
            id += 1
            weiboitem['id'] = id
            weiboitem['username'] = username
            weiboitem['user_headimg'] = user_headimg
            weiboitem['content_text'] = content_text
            weiboitem['content_imgs'] = content_imgs
            weiboitem['other_items_dic'] = other_items_dic

            yield weiboitem     # hand the item to the engine, which passes it on to the pipelines
            # yield scrapy.Request(url, callback=self.parse)      # hand a follow-up url to the engine
            # yield scrapy.Request(url, callback=self.parse2)     # hand a follow-up url to the engine

            break       # for testing only: take just the first result
          
    def parse2(self, response):
        pass
  • Initialize the item fields in items.py (for the second and later spiders you need to add the corresponding class yourself)
import scrapy


# Item class for the first spider, generated automatically when the project is created
class HotnewsspiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass

# Item class added by hand for the new spider
class WeiboSpiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    id = scrapy.Field()
    username = scrapy.Field()
    user_headimg = scrapy.Field()
    content_text = scrapy.Field()
    content_imgs = scrapy.Field()
    other_items_dic = scrapy.Field()
    pass
  • Save data in pipelines.py
# Pipeline class for the first spider, generated automatically when the project is created
class HotnewsspiderPipeline(object):
    def process_item(self, item, spider):
        pass
        # return item

# Pipeline class added by hand for the new spider
# Remember to add the entry to the ITEM_PIPELINES dict in settings, otherwise it will not be executed:
# 'hotnewsSpider.pipelines.WeiboSpiderPipeline': 300,
class WeiboSpiderPipeline(object):
    def process_item(self, item, spider):
        # Store the data in MySQL or Redis here
        print(item)
        return item
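
For reference, here is a minimal sketch of what the storage step could look like with Redis, assuming a local Redis server and the third-party redis package (pip install redis); the list key name is illustrative:

import json
import redis

class WeiboSpiderPipeline(object):
    def open_spider(self, spider):
        # Connect once when the spider starts (assumes Redis on localhost:6379).
        self.r = redis.Redis(host='localhost', port=6379, db=0)

    def process_item(self, item, spider):
        # Push each item onto a Redis list as a JSON string.
        self.r.rpush('weibo:items', json.dumps(dict(item), ensure_ascii=False))
        return item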

4. Run

  • scrapy crawl myspidername (don't forget to cd into the project directory first)
  • Alternatively, add one of the following run scripts (named run.py or main.py) at the same level as the scrapy.cfg file,
    change the spider name to your own, and run the file directly (e.g. right-click -> Run in your IDE).
Option 1:
from scrapy.cmdline import execute
import sys
import os

'''
Run a Scrapy spider from the command line with:    scrapy crawl <spider_name>
A common way to debug is:                          scrapy shell <url>
'''

sys.path.append(os.path.dirname(os.path.abspath(__file__)))

execute(['scrapy', 'crawl', 'weibo'])  # replace 'weibo' with your own spider name
Option 2:
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

'''
Run a Scrapy spider from the command line with:    scrapy crawl <spider_name>
A common way to debug is:                          scrapy shell <url>
'''

if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())
    process.crawl('weibo')    # replace 'weibo' with your own spider name
    process.start()
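
Either way, before wiring the pipeline to MySQL or Redis you can sanity-check the spider with Scrapy's built-in feed export (a standard Scrapy command-line option, not covered in the steps above): scrapy crawl weibo -o weibo_items.json writes every yielded item to a JSON file.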

Origin blog.csdn.net/GeniusXYT/article/details/101511376