Python 爬取豆瓣电影 Top 250

...

import urllib.request
import time
from bs4 import BeautifulSoup

def url_open(url):
    """Open ``url`` and hand back the response object.

    This is a thin wrapper over ``urllib.request.urlopen``. The response is
    returned unread and still open, so the caller is responsible for reading
    and closing it.

    Args:
        url: The URL string (or request object) passed to ``urlopen``.

    Returns:
        Whatever ``urllib.request.urlopen`` returns for ``url``.
    """
    return urllib.request.urlopen(url)
def parse_html(response):
    """Print the movies found on one Top 250 page and return the next page URL.

    The HTML read from ``response`` is parsed with BeautifulSoup. Every
    ``<li>`` that contains at least one ``<span class="title">`` is treated as
    a movie entry: its rank (text of the first ``<em>``), its rating (text of
    the first ``<span class="rating_num">``) and its first title are printed.
    Iteration stops at the first entry whose rank ends a page.

    Args:
        response: An object whose ``read()`` returns the page HTML.

    Returns:
        The URL of the next page when an entry's rank is a multiple of the
        page size, or ``None`` when the final rank has been reached or the
        page contains no entry that ends a page.

    Raises:
        ValueError: If an entry's rank text is not an integer.
    """
    page_size = 25   # entries per listing page
    last_rank = 250  # rank of the final entry in the whole list

    html_content = response.read()
    html_soup = BeautifulSoup(html_content, 'html.parser', from_encoding='utf-8')

    for li in html_soup.find_all('li'):
        titles = li.find_all('span', class_='title')
        if not titles:
            # Not a movie entry (e.g. navigation or footer list items).
            continue

        em = li.find('em')
        rating = li.find('span', class_='rating_num')
        if em is None or rating is None:
            # Entry is missing its rank or rating markup; skip it instead of
            # failing with AttributeError on None.
            continue

        rank = em.get_text()
        print("排名:" + rank + "------评分:" + rating.get_text() + "-------" + titles[0].get_text())

        # The rank is text, so it must be converted before any numeric
        # comparison; comparing the raw string with 250 is never true.
        rank_number = int(rank)
        if rank_number == last_rank:
            return None
        if rank_number % page_size == 0:
            # The rank of the last entry on a page doubles as the ``start``
            # offset of the following page.
            return "https://movie.douban.com/top250?start=" + rank + "&filter="

    return None

# First page of the listing; parse_html() supplies every following page URL.
url = "https://movie.douban.com/top250?start=0&filter="
if __name__ == '__main__':
    # Start timing before the first request so the reported total covers
    # every page fetch, including the first one.
    start_time = time.time()
    print("开始:" + str(start_time))

    next_url = url
    while next_url is not None:
        # The context manager closes each HTTP response once it has been
        # parsed, so connections are not left open across iterations.
        with url_open(next_url) as response:
            next_url = parse_html(response)

    end_time = time.time()
    print("结束:" + str(end_time))
    print("一共用了:" + str(end_time - start_time))

猜你喜欢

转载自www.cnblogs.com/mysterious-killer/p/10156985.html