"""Crawl the Douban TOP 250 movie list.

Pages are fetched 25 movies at a time via the `start` query parameter:
    page 1: https://movie.douban.com/top250?start=0&filter=
    page 2: https://movie.douban.com/top250?start=25&filter=
Parsed results (name, URL, rating, rater count) are appended to 豆瓣.txt.
"""
import re

# Matching rules for one movie entry:
#   .*?   lazily skips unwanted markup until the next anchor appears
#   (.*?) captures the desired field
# Captured groups, in order: movie URL, movie title, rating, number of raters.
MOVIE_PATTERN = (
    '<div class="item">.*?<a href="(.*?)">.*?'
    '<span class="title">(.*?)</span>.*?'
    '<span class="rating_num" property="v:average">(.*?)</span>.*?'
    '<span>(.*?)人评价</span>'
)


def parse_movies(html):
    """Extract (url, name, rating, rater_count) tuples from one page of HTML.

    re.S makes '.' match newlines too, because each <div class="item">
    block spans several lines in the real page source.
    """
    return re.findall(MOVIE_PATTERN, html, re.S)


def main():
    """Fetch all 10 pages and append each movie's details to 豆瓣.txt."""
    # requests (third-party) is only needed for the actual network crawl,
    # so import it lazily to keep the module importable without it.
    import requests

    # TOP 250 = 10 pages of 25 movies each; start offset grows by 25.
    for page in range(10):
        url = 'https://movie.douban.com/top250?start=%s&filter=' % (page * 25,)
        response = requests.get(url)
        for movie_url, name, point, count in parse_movies(response.text):
            movie_data = '''
电影名称: %s
电影地址: %s
电影评分: %s
评价人数: %s
''' % (name, movie_url, point, count)
            # Open in append mode so earlier pages are not overwritten.
            with open('豆瓣.txt', 'a', encoding='utf-8') as f:
                f.write(movie_data)


if __name__ == '__main__':
    main()
# Adapted from a tutorial post; original source:
# www.cnblogs.com/fage5113/p/11588560.html