Target URL: http://www.meijutt.com/alltop_hit.html
What to scrape: the show titles on the ranking list
Goal: save the results to a local file
Techniques covered: browser user-agent spoofing, a proxy server, and exception handling
The code is as follows:
from bs4 import BeautifulSoup  # HTML parsing library
from urllib import request, error

try:
    url = 'http://www.meijutt.com/alltop_hit.html'
    headers = ("User-Agent",
               "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393")
    proxy_ip = '119.28.194.66'  # proxy server IP (free proxies expire quickly; a proxy address generally needs host:port)
    proxy_handler = request.ProxyHandler({"http": proxy_ip})  # the scheme key must be "http", not "http:"
    opener = request.build_opener(proxy_handler, request.HTTPHandler)  # route requests through the proxy
    opener.addheaders = [headers]  # disguise the request as a browser
    request.install_opener(opener)  # optional: makes plain urlopen() use this opener too
    data = opener.open(url).read().decode("gb2312")  # fetch the page source and decode it (the site is GB2312-encoded)
    soup = BeautifulSoup(data, "html.parser")
    nodes = soup.find_all("a", target="_blank")  # the ranking entries are links that open in a new tab
    with open(r"E:\crawl\meiju.txt", "a", encoding="utf-8") as file:  # "with" closes the file automatically
        for i, node in enumerate(nodes, start=1):
            file.write(str(i) + "." + node.text.replace("\n", "") + "\n")  # write one numbered title per line
except error.URLError as e:
    print("Request failed, check the program:", e.reason)