[Python Crawler] Scraping bilibili user profiles with the Scrapy framework

Launcher script main.py

from scrapy.cmdline import execute

# Equivalent to running "scrapy crawl bili_gr_xx" from the project directory
execute('scrapy crawl bili_gr_xx'.split())
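
If you prefer not to go through scrapy.cmdline, Scrapy's CrawlerProcess API can start the same spider programmatically. A minimal sketch, assuming it is run from the project root so the project's settings.py is found:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Load the project settings and run the spider by name
process = CrawlerProcess(get_project_settings())
process.crawl('bili_gr_xx')
process.start()  # blocks until the crawl finishes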

The spider file under spiders/ that does the crawling

# -*- coding: utf-8 -*-
import json

import scrapy

from .. import items


class BiliGrXxSpider(scrapy.Spider):
    name = 'bili_gr_xx'
    allowed_domains = ['bilibili.com']
    # start_urls = ['http://bilibili.com/']

    # Use start_requests() as the entry point instead of start_urls
    def start_requests(self):
        url = 'https://space.bilibili.com/ajax/member/GetInfo'
        # Request the profiles of user IDs (mid) 1 through 200
        for i in range(1, 201):
            data_form = {
                'mid': str(i),
                'csrf': '',
            }
            url_ajax = 'https://space.bilibili.com/{}/'.format(i)
            # For a GET request you would use scrapy.Request(url=..., callback=...);
            # FormRequest sends the form data as a POST body by default
            req = scrapy.FormRequest(url=url, formdata=data_form, callback=self.parse)
            # Point the Referer header at the user's space page
            req.headers['referer'] = url_ajax

            yield req

    def parse(self, response):
        print('--' * 20)
        mysql = items.bili_mysql()
        # The endpoint returns JSON; parse it and copy the profile fields into the item
        html = json.loads(response.text)
        # print(html)
        mysql['name'] = html['data']['name']
        mysql['ID'] = html['data']['mid']
        mysql['sex'] = html['data']['sex']
        mysql['tx_img'] = html['data']['face']                    # avatar URL
        mysql['gr_biaoq'] = html['data']['sign']                  # personal signature
        mysql['chao'] = html['data']['official_verify']['desc']  # verification description

        # Normalize empty strings to None so they are stored as NULL
        for i in mysql:
            if mysql[i] == '':
                mysql[i] = None

        yield mysql
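
The parse() above assumes every response carries a complete data object; a deleted account or a profile missing a field would raise a KeyError. A defensive sketch, assuming only the JSON fields the original parser reads (the exact response envelope is not documented here):

# Drop-in replacement for parse() (sketch); .get() turns missing
# fields into None instead of raising KeyError
def parse(self, response):
    data = json.loads(response.text).get('data') or {}
    mysql = items.bili_mysql()
    mysql['name'] = data.get('name')
    mysql['ID'] = data.get('mid')
    mysql['sex'] = data.get('sex')
    mysql['tx_img'] = data.get('face')
    mysql['gr_biaoq'] = data.get('sign')
    mysql['chao'] = (data.get('official_verify') or {}).get('desc')

    # Same normalization as the original: empty strings become None
    for key in mysql:
        if mysql[key] == '':
            mysql[key] = None

    yield mysql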

The items.py file

import scrapy


class bili_mysql(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    name = scrapy.Field()
    ID = scrapy.Field()
    sex = scrapy.Field()
    tx_img = scrapy.Field()    # avatar image URL
    gr_biaoq = scrapy.Field()  # personal signature
    chao = scrapy.Field()      # official verification description
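
A scrapy.Item behaves like a dict, which is what the spider relies on when it assigns by key and loops over the item. A quick illustration:

item = bili_mysql()
item['name'] = 'demo'
item['sex'] = ''

# Iterating an Item yields only the keys that have been set
for key in item:
    print(key, item[key])

print(dict(item))  # {'name': 'demo', 'sex': ''}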

settings.py configuration

Only the modified settings are listed below.

# Imports (piaot appears to be the author's personal helper module)
from ..piaot import *

# Whether to obey robots.txt; change ROBOTSTXT_OBEY to False
ROBOTSTXT_OBEY = False

# Set default request headers with a browser User-Agent
DEFAULT_REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
}

# Register the storage pipeline and its priority
ITEM_PIPELINES = {
    'bilibili_wj.pipelines.bilibili_mysql': 300,
}
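
Firing 200 POSTs in quick succession is easy to rate-limit or ban. If needed, Scrapy's built-in throttling settings can go in the same settings.py; the values below are illustrative, not from the original post:

# Optional politeness settings (illustrative values)
DOWNLOAD_DELAY = 0.5                # seconds between requests to the same domain
CONCURRENT_REQUESTS_PER_DOMAIN = 4  # cap parallel requests per domain
RETRY_TIMES = 2                     # retry transient failures a couple of times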

pipelines.py storage pipeline

# -*- coding: utf-8 -*-
import pymysql
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


class BilibiliWjPipeline(object):
    def process_item(self, item, spider):
        return item


class bilibili_mysql(object):
    def process_item(self, item, spider):
        # Parameterized INSERT: %s placeholders let pymysql handle quoting,
        # escape special characters, and convert None values to NULL
        sql = "insert into xq_2 values(NULL, %s, %s, %s, %s, %s, %s)"
        print(sql)
        # Open the database connection
        db = pymysql.connect(host='192.168.43.128', user='root',
                             password='123456', database='xq', charset='utf8')
        # Create a cursor object with cursor()
        cursor = db.cursor()
        # Execute the INSERT with the item's values
        cursor.execute(sql, (item['name'], item['ID'], item['sex'],
                             item['tx_img'], item['gr_biaoq'], item['chao']))

        # Commit the transaction and close the connection
        db.commit()
        db.close()
        return item
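
The pipeline above opens and closes a MySQL connection for every item. The idiomatic Scrapy pattern is to open it once per crawl in open_spider() and close it in close_spider(). A sketch under assumptions: the class name bilibili_mysql_pooled is hypothetical (ITEM_PIPELINES would have to point at it), and the CREATE TABLE schema is inferred from the 7-column INSERT above, since the original post never shows it:

import pymysql


class bilibili_mysql_pooled(object):
    """Sketch: one connection per crawl instead of one per item."""

    def open_spider(self, spider):
        self.db = pymysql.connect(host='192.168.43.128', user='root',
                                  password='123456', database='xq',
                                  charset='utf8')
        self.cursor = self.db.cursor()
        # Assumed schema, inferred from the column order of the INSERT
        self.cursor.execute("""
            CREATE TABLE IF NOT EXISTS xq_2 (
                id INT PRIMARY KEY AUTO_INCREMENT,
                name VARCHAR(64),
                ID INT,
                sex VARCHAR(16),
                tx_img VARCHAR(255),
                gr_biaoq VARCHAR(255),
                chao VARCHAR(255)
            )
        """)

    def process_item(self, item, spider):
        sql = "insert into xq_2 values(NULL, %s, %s, %s, %s, %s, %s)"
        self.cursor.execute(sql, (item['name'], item['ID'], item['sex'],
                                  item['tx_img'], item['gr_biaoq'], item['chao']))
        self.db.commit()
        return item

    def close_spider(self, spider):
        self.db.close()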


Reposted from blog.csdn.net/Black_God1/article/details/82121173