A complete Scrapy project

# Note: always check that the pipeline is enabled in settings.py, otherwise it will never be executed
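The note above matters because Scrapy silently ignores pipelines that are not registered. A minimal sketch of the settings.py entry, assuming the project package is named douban as in the import below (the integer is the pipeline's run order; lower values run first):

# settings.py -- without this entry, DoubanPipeline.process_item is never called
ITEM_PIPELINES = {
    'douban.pipelines.DoubanPipeline': 300,
}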

# -*- coding: utf-8 -*-
# the spider file, e.g. douban/spiders/dou.py
import scrapy
from bs4 import BeautifulSoup as bs
import re
from douban.items import DoubanItem  # import the item class that declares the fields

# run with: scrapy crawl dou

class DouSpider(scrapy.Spider):

    name = 'dou'  # spider name, used by `scrapy crawl dou`

    start_urls = ['https://movie.douban.com/subject/30314127/reviews']  # URL(s) to crawl

    def parse(self, response):

        # parse the page with BeautifulSoup (Scrapy's own selectors would also work)
        soup = bs(response.text, 'lxml')

        # each review sits in a <div class="main review-item"> block
        reviews = soup.find_all('div', class_='main review-item')

        for review in reviews:

            item = DoubanItem()  # an Item holds the scraped fields and behaves like a dict

            # header text (the reviewer line), newlines stripped
            name = re.sub(r'\n', '', review.header.text)

            # review body, newlines and spaces stripped
            con = re.sub(r'\n', '', review.div.text)
            con = re.sub(r' ', '', con)

            item['name'] = name
            item['con'] = con

            # debug logging, if needed:
            # self.log(name)
            # self.log(con)

            yield item  # yield hands the item to the pipeline as soon as it is built, without ending parse()
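Because parse() is a generator, it can yield follow-up requests as well as items. A hedged pagination sketch that could sit at the end of parse() above; the class_='next' selector is an assumption about Douban's review-list markup, not something this post verifies:

        # inside parse(), after the for loop (assumed markup: <a class="next" href="...">)
        next_link = soup.find('a', class_='next')
        if next_link:
            # response.follow resolves the relative href and re-enters parse()
            yield response.follow(next_link['href'], callback=self.parse)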

import scrapy  # items.py -- declares the fields the spider fills in

class DoubanItem(scrapy.Item):

    name = scrapy.Field()

    con = scrapy.Field()
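As the spider comment says, an Item behaves much like a dict, except that only declared fields are accepted. A quick sketch with made-up values:

item = DoubanItem()
item['name'] = 'some reviewer'
item['con'] = 'some review text'
print(dict(item))        # {'name': 'some reviewer', 'con': 'some review text'}
# item['rating'] = 5     # would raise KeyError: DoubanItem does not support field: rating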

    
# pipelines.py -- the pipeline persists the items yielded by the spider

class DoubanPipeline(object):

    def process_item(self, item, spider):

        # append each review to a text file, separated by blank lines
        with open("douban.txt", "a", encoding='utf-8') as f:
            f.write(item['name'] + '\n' + item['con'] + '\n\n\n')

        return item
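Reopening douban.txt for every single review works, but it is wasteful. A common variant (a sketch, not from the original post; the class name is hypothetical) uses Scrapy's open_spider/close_spider hooks to hold the file open for the spider's lifetime:

class DoubanFilePipeline(object):  # hypothetical alternative to DoubanPipeline

    def open_spider(self, spider):
        # called once when the spider starts
        self.f = open('douban.txt', 'a', encoding='utf-8')

    def close_spider(self, spider):
        # called once when the spider finishes
        self.f.close()

    def process_item(self, item, spider):
        self.f.write(item['name'] + '\n' + item['con'] + '\n\n\n')
        return item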



Reposted from blog.csdn.net/AnYeZhiYin/article/details/105889203