Day 3 assignment: web-scraping exercise.

'' 'Crawling IMDb TOP250
first page:
https://movie.douban.com/top250?start=0&filter=
second page:
https://movie.douban.com/top250?start=25&filter=
' '
Import Requests
Import Re
# climbing film splicing address URL
NUM = 0
for Line Range in (10):
    URL =' https://movie.douban.com/top250?start=%s&filter= '% (NUM,)
    NUM = 25 +
#Print (URL)
# spliced ulr address to send a request to obtain data
Response = requests.get (URL)
# Print (response.text) acquires text data #
# 3. parse and extract data
# movie name, movie address, movie score, the number of evaluators
# re.findall ( 'text-matching rules,' 'matching text', 'pattern matching') # parse text data you want to extract data
# * ?: filter unwanted data until the desired data appears
# (. *?): extract the desired data
# matching rules
# <div class="item">.*?<a href="(.*?)">.*?<span class="title">(.*?)</span>.*?<span class="rating_num" property="v:average">(.*?)</span>.*?<span>(.*?)人评价</span>
data = re.findall('<div class="item">.*?<a href="(.*?)">.*?<span class="title">(.*?)</span>.*?<span class="rating_num" property="v:average">(.*?)</span>.*?<span>(.*?)人评价</span>', response.text, re.S)  # re.S忽略换行
# print(data)
for d in data:
    # print(d)

    url, name, point, count = d

    movie_data = '''
    电影名称: %s
    电影地址: %s
    电影评分: %s
    评价人数:S%     # a: append    # 4. Save the data    Print (movie_data)    '' '% (name, URL, Point, COUNT)
    \ n-






    with open('豆瓣.txt', 'a', encoding='utf-8') as f:
        f.write(movie_data)

Source: www.cnblogs.com/fage5113/p/11588560.html