# python爬虫-使用IP代理爬取豆瓣读书图书信息 (Python crawler: scrape Douban Books listings through an IP proxy)
from urllib import request
from bs4 import BeautifulSoup
# book.douban.com
# Listing URL without paging parameters; build_page_url() appends them.
BASE_URL = ('https://market.douban.com/book/'
            '?utm_campaign=book_nav_freyr&utm_source=douban&utm_medium=pc_web')
# Value sent as the ``page_num`` query parameter.
PAGE_SIZE = 18
# Placeholder: replace with the "ip:port" of the proxy you purchased.
PROXY_ADDRESS = 'ip:port'
USER_AGENT = ('Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36')
# Seconds to wait on the proxy before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10


def build_page_url(page, page_size=PAGE_SIZE):
    """Return the listing URL for the given 1-based page number."""
    return '{}&page={}&page_num={}&'.format(BASE_URL, page, page_size)


def make_proxy_opener(proxy_address=PROXY_ADDRESS, user_agent=USER_AGENT):
    """Build a urllib opener that sends requests through ``proxy_address``.

    The proxy is registered for both ``http`` and ``https``: ProxyHandler
    picks the proxy by the scheme of the requested URL, and the listing URL
    is https, so an ``http``-only mapping would bypass the proxy entirely.
    """
    proxies = {'http': proxy_address, 'https': proxy_address}
    opener = request.build_opener(request.ProxyHandler(proxies))
    opener.addheaders = [('User-Agent', user_agent)]
    return opener


def fetch_html(opener, url, timeout=REQUEST_TIMEOUT):
    """Download ``url`` with ``opener`` and return the body decoded as UTF-8.

    Network and decoding errors propagate to the caller.
    """
    # The context manager closes the connection even if read/decode fails.
    with opener.open(url, timeout=timeout) as response:
        return response.read().decode('utf-8')


def _tag_text(tag):
    """Return the text inside ``tag``, or '' when the tag is missing.

    ``get_text()`` is used instead of ``.string`` because ``.string`` is
    None whenever the tag contains nested markup.
    """
    return '' if tag is None else tag.get_text()


def parse_books(html):
    """Extract (title, description, price, href) tuples from a listing page.

    Each book is an ``<li class="book-item">`` element; its first ``<h3>``,
    ``<p>``, ``<i>`` and ``<a>`` descendants supply the four fields.
    Missing elements yield empty strings rather than raising.
    """
    soup = BeautifulSoup(html, 'lxml')
    books = []
    for item in soup.find_all('li', attrs={'class': 'book-item'}):
        link = item.a
        href = link.get('href', '') if link is not None else ''
        books.append(
            (_tag_text(item.h3), _tag_text(item.p), _tag_text(item.i), href)
        )
    return books


def format_book_row(title, describe, price, href):
    """Join the four book fields into one comma-separated output line."""
    return ','.join((title, describe, price, href))


def main():
    """Scrape listing pages 1-4 through the proxy and print one row per book."""
    # The opener is loop-invariant, so it is built once and reused.
    opener = make_proxy_opener()
    for page in range(1, 5):
        html = fetch_html(opener, build_page_url(page))
        for book in parse_books(html):
            print(format_book_row(*book))
        # Separator line after each page of results.
        print(50 * "-")


if __name__ == '__main__':
    main()