Cui Qingcai's Scraper Practice Site, Problem 1: ssr1


1. Scrape only the first page

import requests
from bs4 import BeautifulSoup
import pandas as pd
import urllib3

urllib3.disable_warnings()
# the site's SSL certificate is invalid, so verify=False is used below;
# this line suppresses the resulting InsecureRequestWarning

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/87.0.4280.141 Safari/537.36'}
page_url = 'https://ssr1.scrape.center/'
html = requests.get(page_url, headers=headers, verify=False)
soup = BeautifulSoup(html.content, 'lxml')
url_list = soup.find_all(class_='name')        # <a class="name"> links to the detail pages
title_list = soup.find_all(class_='m-b-sm')    # movie titles
theme_list = soup.find_all(class_='categories')  # genre tags
score_list = soup.find_all(class_='score m-t-md m-b-n-sm')  # ratings
url, title, theme, score = [], [], [], []
for x, y, z, i in zip(url_list, title_list, theme_list, score_list):
    url.append('https://ssr1.scrape.center' + x['href'])  # turn the relative href into an absolute link
    title.append(y.text)
    theme.append(z.text.replace('\n', '').replace('\r', ''))  # strip embedded line breaks
    score.append(i.text.strip())
df = {
    '链接': url,    # link
    '标题': title,  # title
    '主题': theme,  # genres
    '评分': score   # rating
}
work1 = pd.DataFrame(df)
work1.to_csv('work1.csv')
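If the class selectors ever stop matching (the site's markup may change over time), it helps to probe what find_all returns before building the lists. A quick check, reusing the soup object from above; the example href value is illustrative:

first = soup.find(class_='name')
print(first.get('href'))                # a relative link, e.g. /detail/1 (illustrative)
print(soup.find(class_='m-b-sm').text)  # the first movie title on the page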

The resulting CSV file looks like this:
[Screenshot: scraped CSV output]
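To sanity-check the file, it can be read straight back with pandas. Note that to_csv above also writes the DataFrame's integer index as an unnamed first column, which read_csv can absorb as the index:

import pandas as pd

work1 = pd.read_csv('work1.csv', index_col=0)  # index_col=0 re-absorbs the written index
print(work1.head())   # columns: 链接 / 标题 / 主题 / 评分
print(len(work1))     # should match the number of movies on the first page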

2. Scrape all the movie information

import requests
from bs4 import BeautifulSoup
import pandas as pd
import urllib3

urllib3.disable_warnings()
# the site's SSL certificate is invalid, so verify=False triggers a warning;
# this line suppresses it

url, title, theme, score = [], [], [], []
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/87.0.4280.141 Safari/537.36'}
for i in range(1, 11):  # the listing spans 10 pages
    the_url = 'https://ssr1.scrape.center/page/' + str(i)
    # the request URL must be named the_url, not url: reusing the name url here
    # would shadow the list defined above and break the url.append(...) call below
    html = requests.get(the_url, headers=headers, verify=False)
    soup = BeautifulSoup(html.content, 'lxml')
    url_list = soup.find_all(class_='name')
    title_list = soup.find_all(class_='m-b-sm')
    theme_list = soup.find_all(class_='categories')
    score_list = soup.find_all(class_='score m-t-md m-b-n-sm')
    for x, y, z, s in zip(url_list, title_list, theme_list, score_list):
        url.append('https://ssr1.scrape.center' + x['href'])
        title.append(y.text)
        theme.append(z.text.replace('\n', '').replace('\r', ''))
        score.append(s.text.strip())
df = {
    '链接': url,    # link
    '标题': title,  # title
    '主题': theme,  # genres
    '评分': score   # rating
}
work1 = pd.DataFrame(df)
work1.to_csv('work1.csv')
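When pulling all 10 pages it is polite to pause between requests. Below is a minimal sketch of the same crawl with a one-second delay, restructured around a helper function; scrape_page is a name introduced here for illustration, not part of the original script:

import time

import pandas as pd
import requests
import urllib3
from bs4 import BeautifulSoup

urllib3.disable_warnings()
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                         '(KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36'}

def scrape_page(page):
    """Fetch one listing page and return (link, title, genres, score) tuples."""
    resp = requests.get(f'https://ssr1.scrape.center/page/{page}',
                        headers=HEADERS, verify=False)
    soup = BeautifulSoup(resp.content, 'lxml')
    rows = zip(soup.find_all(class_='name'),
               soup.find_all(class_='m-b-sm'),
               soup.find_all(class_='categories'),
               soup.find_all(class_='score m-t-md m-b-n-sm'))
    return [('https://ssr1.scrape.center' + a['href'],
             t.text,
             c.text.replace('\n', '').replace('\r', ''),
             s.text.strip())
            for a, t, c, s in rows]

records = []
for page in range(1, 11):
    records.extend(scrape_page(page))
    time.sleep(1)  # pause between pages to avoid hammering the server

pd.DataFrame(records, columns=['链接', '标题', '主题', '评分']).to_csv('work1.csv')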

The scraped CSV file is shown below:
[Screenshot: scraped CSV output]


Reposted from blog.csdn.net/sgsdsdd/article/details/112723316