Cui Qingcai's crawler training website, exercise 1: ssr1

1. Crawl only the first page

import requests
from bs4 import BeautifulSoup
import pandas as pd
import urllib3

# The site has no valid SSL certificate, so verify=False is passed below;
# this line suppresses the resulting InsecureRequestWarning.
urllib3.disable_warnings()

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/87.0.4280.141 Safari/537.36'
}
the_url = 'https://ssr1.scrape.center/'
html = requests.get(the_url, headers=headers, verify=False)
soup = BeautifulSoup(html.content, 'lxml')

# Each movie card exposes its link, title, categories and score
# through these CSS classes.
url_list = soup.find_all(class_='name')
title_list = soup.find_all(class_='m-b-sm')
theme_list = soup.find_all(class_='categories')
score_list = soup.find_all(class_='score m-t-md m-b-n-sm')

url, title, theme, score = [], [], [], []
for x, y, z, s in zip(url_list, title_list, theme_list, score_list):
    url.append('https://ssr1.scrape.center' + x['href'])
    title.append(y.text)
    theme.append(z.text.replace('\n', '').replace('\r', ''))
    score.append(s.text.strip())

df = {
    'Link': url,
    'Title': title,
    'Theme': theme,
    'Score': score
}
work1 = pd.DataFrame(df)
work1.to_csv('work1.csv')

Screenshot: the final work1.csv after crawling the first page.
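
To sanity-check the result without opening the file by hand, the CSV can be read back with pandas. A minimal sketch, assuming the file was written by the script above:

import pandas as pd

# Load the CSV written above; index_col=0 absorbs the unnamed index column
# that to_csv() emits by default.
check = pd.read_csv('work1.csv', index_col=0)
print(check.head())   # columns: Link, Title, Theme, Score
print(len(check))     # expect 10 rows for a single page of ssr1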

2. Crawl all movie information

import requests
from bs4 import BeautifulSoup
import pandas as pd
import urllib3

# The site has no valid SSL certificate, so every verify=False request would
# print a warning; this line suppresses those warnings.
urllib3.disable_warnings()

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/87.0.4280.141 Safari/537.36'
}

url, title, theme, score = [], [], [], []
for page in range(1, 11):
    # Keep this variable distinct from the url list above; naming it url
    # would rebind the list to a string and break url.append() below.
    the_url = 'https://ssr1.scrape.center/page/' + str(page)
    html = requests.get(the_url, headers=headers, verify=False)
    soup = BeautifulSoup(html.content, 'lxml')
    url_list = soup.find_all(class_='name')
    title_list = soup.find_all(class_='m-b-sm')
    theme_list = soup.find_all(class_='categories')
    score_list = soup.find_all(class_='score m-t-md m-b-n-sm')
    for x, y, z, s in zip(url_list, title_list, theme_list, score_list):
        url.append('https://ssr1.scrape.center' + x['href'])
        title.append(y.text)
        theme.append(z.text.replace('\n', '').replace('\r', ''))
        score.append(s.text.strip())

df = {
    'Link': url,
    'Title': title,
    'Theme': theme,
    'Score': score
}
work1 = pd.DataFrame(df)
work1.to_csv('work1.csv')

Screenshot: the work1.csv after crawling all ten pages.
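
For repeated runs it is worth pausing between page requests and checking the HTTP status before parsing. A hedged sketch of how the fetch loop could be hardened; the timeout, the raise_for_status() check, and the one-second delay are additions, not part of the original exercise:

import time
import requests
from bs4 import BeautifulSoup
import urllib3

urllib3.disable_warnings()  # same no-SSL-certificate workaround as above
headers = {'User-Agent': 'Mozilla/5.0'}  # any desktop User-Agent works here

for page in range(1, 11):
    the_url = 'https://ssr1.scrape.center/page/' + str(page)
    resp = requests.get(the_url, headers=headers, verify=False, timeout=10)
    resp.raise_for_status()  # stop on 4xx/5xx instead of parsing an error page
    soup = BeautifulSoup(resp.content, 'lxml')
    # ... same find_all() extraction as in the script above ...
    time.sleep(1)  # pause between pages to be gentle on the server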
