Using Python to crawl Bilibili's popular-video ranking and write the video information into an Excel spreadsheet

Result preview:
Here Insert Picture Description
1. A craftsman who wants to do good work must first sharpen his tools, so we start by installing the required libraries:

pip install requests
pip install lxml
pip install xlwt

requests: sends HTTP requests to web pages
lxml: parses HTML/XML documents (and provides XPath queries)
xlwt: writes Excel (.xls) files
2. Crawling the information of Bilibili's popular videos:
  Open the Bilibili popular-video ranking page:
Here Insert Picture Description
  Press F12 to open the browser's developer tools, then use the element picker to click the information you want to extract on the page. This shows where that information sits in the HTML document, which is essential for writing the XPath expressions that retrieve element attributes and element text. For example:
Here Insert Picture Description
The code is as follows:

# Scrape the popular-video information from Bilibili
def spider(video_list):
    """Scrape Bilibili's ranking page and append one dict per video.

    Args:
        video_list: List that receives the results; it is modified in place.

    Returns:
        The same ``video_list`` object. Each appended dict has the string
        keys ``rank``, ``videolink``, ``title``, ``play``, ``comment``,
        ``upname``, ``uplink`` and ``hot``.

    Raises:
        requests.RequestException: If the page cannot be fetched (including
            when the timeout expires).
    """
    url = 'https://www.bilibili.com/ranking?spm_id_from=333.851.b_7072696d61727950616765546162.3'
    # A timeout keeps the script from hanging forever on a stalled connection.
    html_data = requests.get(url, timeout=10).text
    selector = html.fromstring(html_data)

    # Shared XPath prefixes, relative to one <li class="rank-item">.
    info = './div[@class="content"]/div[@class="info"]'
    detail = info + '/div[@class="detail"]'

    for item in selector.xpath('//li[@class="rank-item"]'):
        # Read every counter <span> on its own. Joining them into one string
        # and splitting on "万" fails when the play count has no "万"
        # (IndexError) and mislabels the values when only one of them has it.
        counts = [
            "".join(span.xpath('./text()')).strip()
            for span in item.xpath(detail + '/span')
        ]
        play = counts[0] if counts else ""
        comment = counts[1] if len(counts) > 1 else ""

        video_list.append({
            'rank': "".join(item.xpath('./div[@class="num"]/text()')),
            'videolink': "".join(item.xpath(info + '/a/@href')),
            'title': "".join(item.xpath(info + '/a/text()')),
            'play': play,
            'comment': comment,
            'upname': "".join(item.xpath(detail + '/a/span/text()')),
            # The scraped href is prefixed with a scheme to form a full URL.
            'uplink': "http:" + "".join(item.xpath(detail + '/a/@href')),
            'hot': "".join(item.xpath(info + '/div[@class="pts"]/div/text()')),
        })
    return video_list

3. Write the collected information (video_list) to an Excel spreadsheet:
  Basic usage of xlwt:

import xlwt

# Create a workbook (and set its encoding)
workbook = xlwt.Workbook(encoding='utf-8')
# Add a worksheet to the workbook
worksheet = workbook.add_sheet('My Worksheet')

# Write one cell; the arguments are row, column, value (and optionally a style)
worksheet.write(1, 0, 'this is test')
# Save the workbook; the argument is the path of the Excel file to write
workbook.save('Excel_test.xls')

  If we want the video title or the uploader's name to be clickable and jump to the corresponding page, we use Excel's HYPERLINK function:
HYPERLINK("http://www.baidu.com"; "Baidu")
"Baidu" is the text displayed in the cell, and the first argument is the link that the cell jumps to when clicked.
xlwt.Formula() takes the formula as a string s, for example s = 'HYPERLINK("http://www.baidu.com"; "Baidu")'.
The code is as follows:

# Build the HYPERLINK formula text used for clickable cells
def _hyperlink_formula(url, text):
    """Return a ``HYPERLINK("url";"text")`` formula string for xlwt.Formula."""
    def quoted(value):
        # Inside a formula string constant a double quote is written twice;
        # without this a title containing '"' produces an invalid formula.
        return '"' + value.replace('"', '""') + '"'

    return 'HYPERLINK(' + quoted(url) + ';' + quoted(text) + ')'


# Write the scraped data to an Excel spreadsheet
def write_Excel(video_list, path='D:/Test/b站热门视频信息.xls'):
    """Write the scraped video records to an .xls workbook.

    Args:
        video_list: Iterable of dicts with the keys ``videolink``, ``title``,
            ``uplink``, ``upname``, ``rank``, ``hot``, ``play`` and
            ``comment``.
        path: Destination file. Defaults to the location the original
            script used; an existing file at that path is overwritten.
    """
    print("将b站热门视频信息导入到Excel表格:")
    workbook = xlwt.Workbook()  # the workbook being built
    sheet = workbook.add_sheet('b站热门视频')  # its single worksheet
    xstyle = xlwt.XFStyle()  # cell style shared by every cell
    xstyle.alignment.horz = 0x02  # centre horizontally
    xstyle.alignment.vert = 0x01  # centre vertically

    head = ['视频名', 'up主', '排名', '热度', '播放量', '评论数']  # header row
    for col, caption in enumerate(head):
        sheet.write(0, col, caption, xstyle)

    longest_title = 0
    longest_name = 0
    for row, item in enumerate(video_list, start=1):
        # Title and uploader cells are hyperlinks to the video / uploader page.
        title_data = _hyperlink_formula(item["videolink"], item["title"])
        name_data = _hyperlink_formula(item["uplink"], item["upname"])
        longest_title = max(longest_title, len(title_data))
        longest_name = max(longest_name, len(name_data))

        sheet.write(row, 0, xlwt.Formula(title_data), xstyle)
        sheet.write(row, 1, xlwt.Formula(name_data), xstyle)
        sheet.write(row, 2, item['rank'], xstyle)
        sheet.write(row, 3, item['hot'], xstyle)
        sheet.write(row, 4, item['play'], xstyle)
        sheet.write(row, 5, item['comment'], xstyle)

    # Size the link columns from the longest entry rather than from whichever
    # row happened to be last, and base the uploader column on the uploader
    # formula instead of the title formula. Widths are in 1/256 of a character
    # and are capped at 65535.
    if longest_title:
        sheet.col(0).width = min(int(256 * longest_title * 3 / 5), 65535)
        sheet.col(1).width = min(int(256 * longest_name * 3 / 10), 65535)

    # Create the target directory if needed so save() does not fail on a
    # missing folder. No separate delete step is required: saving replaces
    # an existing file.
    folder = os.path.dirname(path)
    if folder:
        os.makedirs(folder, exist_ok=True)
    workbook.save(path)
    print('写入excel成功')
    print("文件位置:" + path)

4. Call the two functions above from the main entry point.
The complete code is as follows:

import requests
from lxml import html
import xlwt
import os

# Scrape the popular-video information from Bilibili
def spider(video_list):
    """Scrape Bilibili's ranking page and append one dict per video.

    Args:
        video_list: List that receives the results; it is modified in place.

    Returns:
        The same ``video_list`` object. Each appended dict has the string
        keys ``rank``, ``videolink``, ``title``, ``play``, ``comment``,
        ``upname``, ``uplink`` and ``hot``.

    Raises:
        requests.RequestException: If the page cannot be fetched (including
            when the timeout expires).
    """
    url = 'https://www.bilibili.com/ranking?spm_id_from=333.851.b_7072696d61727950616765546162.3'
    # A timeout keeps the script from hanging forever on a stalled connection.
    html_data = requests.get(url, timeout=10).text
    selector = html.fromstring(html_data)

    # Shared XPath prefixes, relative to one <li class="rank-item">.
    info = './div[@class="content"]/div[@class="info"]'
    detail = info + '/div[@class="detail"]'

    for item in selector.xpath('//li[@class="rank-item"]'):
        # Read every counter <span> on its own. Joining them into one string
        # and splitting on "万" fails when the play count has no "万"
        # (IndexError) and mislabels the values when only one of them has it.
        counts = [
            "".join(span.xpath('./text()')).strip()
            for span in item.xpath(detail + '/span')
        ]
        play = counts[0] if counts else ""
        comment = counts[1] if len(counts) > 1 else ""

        video_list.append({
            'rank': "".join(item.xpath('./div[@class="num"]/text()')),
            'videolink': "".join(item.xpath(info + '/a/@href')),
            'title': "".join(item.xpath(info + '/a/text()')),
            'play': play,
            'comment': comment,
            'upname': "".join(item.xpath(detail + '/a/span/text()')),
            # The scraped href is prefixed with a scheme to form a full URL.
            'uplink': "http:" + "".join(item.xpath(detail + '/a/@href')),
            'hot': "".join(item.xpath(info + '/div[@class="pts"]/div/text()')),
        })
    return video_list

# Build the HYPERLINK formula text used for clickable cells
def _hyperlink_formula(url, text):
    """Return a ``HYPERLINK("url";"text")`` formula string for xlwt.Formula."""
    def quoted(value):
        # Inside a formula string constant a double quote is written twice;
        # without this a title containing '"' produces an invalid formula.
        return '"' + value.replace('"', '""') + '"'

    return 'HYPERLINK(' + quoted(url) + ';' + quoted(text) + ')'


# Write the scraped data to an Excel spreadsheet
def write_Excel(video_list, path='D:/Test/b站热门视频信息.xls'):
    """Write the scraped video records to an .xls workbook.

    Args:
        video_list: Iterable of dicts with the keys ``videolink``, ``title``,
            ``uplink``, ``upname``, ``rank``, ``hot``, ``play`` and
            ``comment``.
        path: Destination file. Defaults to the location the original
            script used; an existing file at that path is overwritten.
    """
    print("将b站热门视频信息导入到Excel表格:")
    workbook = xlwt.Workbook()  # the workbook being built
    sheet = workbook.add_sheet('b站热门视频')  # its single worksheet
    xstyle = xlwt.XFStyle()  # cell style shared by every cell
    xstyle.alignment.horz = 0x02  # centre horizontally
    xstyle.alignment.vert = 0x01  # centre vertically

    head = ['视频名', 'up主', '排名', '热度', '播放量', '评论数']  # header row
    for col, caption in enumerate(head):
        sheet.write(0, col, caption, xstyle)

    longest_title = 0
    longest_name = 0
    for row, item in enumerate(video_list, start=1):
        # Title and uploader cells are hyperlinks to the video / uploader page.
        title_data = _hyperlink_formula(item["videolink"], item["title"])
        name_data = _hyperlink_formula(item["uplink"], item["upname"])
        longest_title = max(longest_title, len(title_data))
        longest_name = max(longest_name, len(name_data))

        sheet.write(row, 0, xlwt.Formula(title_data), xstyle)
        sheet.write(row, 1, xlwt.Formula(name_data), xstyle)
        sheet.write(row, 2, item['rank'], xstyle)
        sheet.write(row, 3, item['hot'], xstyle)
        sheet.write(row, 4, item['play'], xstyle)
        sheet.write(row, 5, item['comment'], xstyle)

    # Size the link columns from the longest entry rather than from whichever
    # row happened to be last, and base the uploader column on the uploader
    # formula instead of the title formula. Widths are in 1/256 of a character
    # and are capped at 65535.
    if longest_title:
        sheet.col(0).width = min(int(256 * longest_title * 3 / 5), 65535)
        sheet.col(1).width = min(int(256 * longest_name * 3 / 10), 65535)

    # Create the target directory if needed so save() does not fail on a
    # missing folder. No separate delete step is required: saving replaces
    # an existing file.
    folder = os.path.dirname(path)
    if folder:
        os.makedirs(folder, exist_ok=True)
    workbook.save(path)
    print('写入excel成功')
    print("文件位置:" + path)

if __name__ == '__main__':
    # Scrape first (spider fills and returns the list it is given),
    # then export the collected records to Excel.
    video_list = spider([])
    write_Excel(video_list)
Published 10 original articles · won praise 7 · views 153

Guess you like

Origin blog.csdn.net/qq_44204959/article/details/105160474