Python 爬虫的两种方式

1. 使用 urllib.request

import urllib.request
from bs4 import BeautifulSoup
import re
import os

# Target page; the site declares GB2312 as its charset.
url = 'http://cpc.people.com.cn/n1/2018/0318/c64094-29873799-8.html'

# Spoof a browser User-Agent: some servers reject unknown clients.
# If the request is blocked, swap in a different User-Agent string, e.g. from:
# http://blog.csdn.net/bone_ace/article/details/52476016
# FIX: the HTTP header key is 'User-Agent' (with a hyphen); the original
# 'User_Agent' is not a real header and would be ignored by the server.
head = {}
head['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6'

# Attach the headers to the request object.
req = urllib.request.Request(url, headers=head)
# FIX: open the Request object, not the bare URL string — the original called
# urlopen(url), so the custom headers built above were never actually sent.
response = urllib.request.urlopen(req)
# Decode the response bytes as GB2312 (the page's declared encoding),
# silently dropping any undecodable bytes.
res = response.read().decode('gb2312', 'ignore')
soup = BeautifulSoup(res, 'lxml')

2. 使用 requests

import requests
#res = requests.get('http://ldzl.people.com.cn/dfzlk/front/personProvince2580.htm')
#print(res.encoding)

import urllib.request
from bs4 import BeautifulSoup
import re
import os

# Target page (Yunnan province listing); the site serves GBK-encoded pages.
url = 'http://ldzl.people.com.cn/dfzlk/front/personProvince2580.htm'

# Spoof a browser User-Agent: some servers reject unknown clients.
# FIX: the HTTP header key is 'User-Agent' (with a hyphen); the original
# 'User_Agent' is not a real header and would be ignored by the server.
head = {}
head['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6'

# FIX: the original built a urllib Request object that was never used and then
# called requests.get(url) without headers, so the User-Agent was never sent.
# Pass the headers directly to requests.get instead.
r = requests.get(url, headers=head, timeout=30)
# Override requests' encoding guess so r.text decodes the GBK page correctly.
r.encoding = 'GBK'
soup = BeautifulSoup(r.text, 'lxml')

猜你喜欢

转载自www.cnblogs.com/polipolu/p/12968677.html