# 36行代码爬取微博热搜榜和要闻榜
# (Scrape the Weibo hot-search list and top-news list in 36 lines of code.)

import requests
from bs4 import BeautifulSoup
import time
from urllib import parse

# Request headers passed to requests.get() by text(). The User-Agent value
# imitates desktop Chrome on macOS; its trailing space is kept unchanged.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/85.0.4183.83 Safari/537.36 '
    ),
}

def text(url, timeout=10):
    """Download *url* and pass its HTML to ``parse_html``.

    Sends a GET request using the module-level ``headers``. When the
    response status is 200, prints '成功' and forwards the response body to
    ``parse_html``. For any other status, or when the request itself fails
    (connection error, timeout, ...), prints '失败' and returns.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait on the server before giving up.
    """
    try:
        # requests has no default timeout; without one a stalled server
        # would block this call indefinitely.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Report transport-level failures the same way as a bad status
        # instead of aborting the whole script with a traceback.
        print('失败', exc)
        return

    if response.status_code == 200:
        print('成功')
        parse_html(response.text)
    else:
        print('失败')

def parse_html(html):
    """Print the title and link of each table row found in *html*.

    For every ``table tbody tr`` row, takes the first ``td a`` anchor and
    prints its text together with its raw ``href``, then prints the ``href``
    resolved against ``https://www.weibo.com``. Rows that contain no such
    anchor (or an anchor without ``href``) are skipped.

    Args:
        html: Page markup as a string.
    """
    soup = BeautifulSoup(html, 'lxml')
    for row in soup.select('table tbody tr'):
        # Look the anchor up once; select_one() returns None when the row
        # has no matching element, which previously raised AttributeError.
        link = row.select_one('td a')
        if link is None or not link.has_attr('href'):
            continue

        title = link.text
        href = link['href']
        print(title, href)

        # NOTE(review): the pages are fetched from s.weibo.com but relative
        # links are resolved against www.weibo.com -- confirm this base is
        # the intended host.
        print(parse.urljoin('https://www.weibo.com', href))
        
if __name__ == '__main__':
    # Fetch the two list pages one after the other and report the total
    # wall-clock time taken, in seconds.
    start = time.time()

    url = 'https://s.weibo.com/top/summary?Refer=top_hot&topnav=1&wvr=6'
    text(url=url)

    url2 = 'https://s.weibo.com/top/summary?cate=socialevent'
    text(url=url2)

    # Use a separate name for the duration: assigning it to `time` would
    # rebind the imported module and break any later time.* call.
    elapsed = time.time() - start
    print(elapsed)

# 同步爬取 (synchronous crawl)
#
# 猜你喜欢
#
# 转载自 blog.csdn.net/weixin_43554217/article/details/108442735