import requests
from bs4 import BeautifulSoup
import time
from urllib import parse
# Request headers sent with every page fetch; the User-Agent string
# identifies the client as desktop Chrome 85 on macOS.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/85.0.4183.83 Safari/537.36 "
    ),
}
def text(url, timeout=10):
    """Download *url* and pass the page body to ``parse_html`` on success.

    Prints '成功' when the server answers with HTTP 200 and then parses the
    response text; prints '失败' for any other status code.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives
            up. Without a timeout a stalled connection would block the
            script indefinitely.

    Raises:
        requests.RequestException: Connection errors and timeouts raised
            by ``requests.get`` are not caught here and propagate.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code == 200:
        print('成功')
        parse_html(response.text)
    else:
        print('失败')
def parse_html(html):
    """Print the title and link of each table row found in *html*.

    For every ``table tbody tr`` row that contains a ``td a`` anchor with
    an ``href`` attribute, prints the anchor text together with the raw
    href, then the href resolved against ``https://www.weibo.com``.

    Args:
        html: Markup of the page, parsed with the ``lxml`` parser.
    """
    soup = BeautifulSoup(html, 'lxml')
    for row in soup.select('table tbody tr'):
        # Look the anchor up once; select_one returns None when the row
        # has no matching element, which would break attribute access.
        link = row.select_one('td a')
        if link is None:
            continue
        href = link.get('href')
        if href is None:
            # An anchor without an href has nothing to resolve; skip it.
            continue
        print(link.text, href)
        print(parse.urljoin('https://www.weibo.com', href))
if __name__ == '__main__':
    # Fetch the two ranking pages one after another and report the total
    # wall-clock time in seconds.
    start = time.time()
    url = 'https://s.weibo.com/top/summary?Refer=top_hot&topnav=1&wvr=6'
    text(url=url)
    url2 = 'https://s.weibo.com/top/summary?cate=socialevent'
    text(url=url2)
    # Keep the duration under its own name so the module-level ``time``
    # import is not overwritten by a float.
    elapsed = time.time() - start
    print(elapsed)
# Synchronous crawl: the pages are fetched sequentially.