利用scrapy框架爬取网易新闻排行榜

wyxw.py中代码

# -*- coding: utf-8 -*-
import scrapy
from ..items import WyxwItem

class WyxwSpider(scrapy.Spider):
    """Spider that scrapes the NetEase (163.com) news ranking page."""
    name = 'wyxw'
    allowed_domains = ['news.163.com']
    start_urls = ['http://news.163.com/special/0001386F/rank_whole.html']

    def parse(self, response):
        """Extract title, detail URL and click count for each ranked article.

        Yields one ``WyxwItem`` per table row.

        Bug fixed: the original created ONE WyxwItem before the loop and
        mutated/yielded that same instance every iteration, so every
        reference handed to the pipeline pointed at the same object
        (ultimately holding only the last row's values). A fresh item is
        now created per row.
        """
        titles = response.xpath('//table/tr/td[1]/a/text()').extract()
        urls = response.xpath('//table/tr/td[1]/a/@href').extract()
        clicks = response.xpath('//table/tr/td[2]/text()').extract()

        # Debug output goes through the spider's logger instead of print().
        self.logger.debug('titles=%s urls=%s clicks=%s', titles, urls, clicks)

        # zip() stops at the shortest list, so a row with a missing cell can
        # no longer raise IndexError as the range(len(titles)) loop could
        # when the three lists differed in length.
        for title, url, dj in zip(titles, urls, clicks):
            item = WyxwItem()
            item['title'] = title
            item['xq_url'] = url
            item['dj'] = dj
            yield item


items.py文件代码

class WyxwItem(scrapy.Item):
    """One NetEase ranking entry: headline, detail-page URL, click count."""

    title = scrapy.Field()   # article headline
    xq_url = scrapy.Field()  # detail-page URL
    dj = scrapy.Field()      # click count

    def get_insert_sql(self):
        """Return a ``(sql, params)`` pair for a parameterized INSERT."""
        sql = 'insert into wyxw_test(title,xq_url,dj) values (%s,%s,%s)'
        params = tuple(self[key] for key in ('title', 'xq_url', 'dj'))
        return (sql, params)

pipelines.py代码

class MysqlProjectPipeline(object):
    """Pipeline that persists each scraped item to MySQL.

    Relies on the item providing ``get_insert_sql()`` returning a
    ``(sql, params)`` pair for a parameterized INSERT.
    """

    def process_item(self, item, spider):
        """Insert the item into MySQL and pass it on.

        Fix: Scrapy's pipeline contract requires ``process_item`` to return
        the item (or raise ``DropItem``); the original returned None, which
        would silently feed None to any pipeline running after this one.
        """
        insert_sql, data = item.get_insert_sql()
        # NOTE(review): a MysqlHelper is constructed per item; if it opens a
        # new DB connection each call, consider creating it once in
        # open_spider() instead — TODO confirm MysqlHelper semantics.
        myhelper = MysqlHelper()
        myhelper.execute_modify_sql(insert_sql, data)
        return item

其他文件配置看scrapy框架基本设置

猜你喜欢

转载自blog.csdn.net/majiexiong/article/details/82108025