Python 爬虫：使用 IP 代理爬取豆瓣读书图书信息

Python 爬虫：使用 IP 代理爬取豆瓣读书图书信息

from urllib import request
from bs4 import BeautifulSoup

目标站点：豆瓣读书（book.douban.com）；下面的代码实际请求的是 market.douban.com 的图书列表页。

# Listing page of the Douban book market; the page number is appended per request.
BASE_URL = 'https://market.douban.com/book/?utm_campaign=book_nav_freyr&utm_source=douban&utm_medium=pc_web'
# Placeholder proxy address: replace it with the ip:port of the proxy you purchased.
PROXY_ADDRESS = 'ip:port'
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
# Seconds to wait for the proxy/server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10


def build_page_url(page: int) -> str:
    """Return the listing URL for the given 1-based page number."""
    return BASE_URL + '&page=' + str(page) + '&page_num=18&'


def make_opener(proxy_address: str = PROXY_ADDRESS):
    """Build a urllib opener that sends requests through ``proxy_address``.

    The proxy is registered for both ``http`` and ``https``: ProxyHandler
    picks the proxy by the scheme of the requested URL, and the listing
    pages are served over https, so an ``http``-only mapping would silently
    bypass the proxy.
    """
    proxy_support = request.ProxyHandler(
        {'http': proxy_address, 'https': proxy_address}
    )
    opener = request.build_opener(proxy_support)
    # Present a browser User-Agent instead of the default Python-urllib one.
    opener.addheaders = [('User-Agent', USER_AGENT)]
    return opener


def fetch_html(opener, url: str) -> str:
    """Download ``url`` with ``opener`` and return the body decoded as UTF-8."""
    # The context manager closes the connection even if read/decode fails.
    with opener.open(url, timeout=REQUEST_TIMEOUT) as response:
        return response.read().decode("utf-8")


def tag_text(tag) -> str:
    """Return all text inside ``tag``, or ``''`` when the tag is missing.

    ``get_text()`` is used rather than ``.string`` because ``.string`` is
    ``None`` whenever the tag has more than one child, which would break
    the string formatting of the output line.
    """
    if tag is None:
        return ''
    return tag.get_text()


def format_book(title: str, describe: str, price: str, href: str) -> str:
    """Return one comma-separated output line for a book."""
    return ",".join((title, describe, price, href))


def parse_books(html: str):
    """Yield ``(title, describe, price, href)`` for every book item in ``html``.

    A book item is an ``<li class="book-item">`` element; the fields are read
    from its first ``<h3>``, ``<p>``, ``<i>`` and ``<a href>`` descendants.
    Missing fields are yielded as empty strings.
    """
    soup = BeautifulSoup(html, 'lxml')
    for item in soup.find_all('li', {'class': 'book-item'}):
        link = item.a
        href = link.get('href', '') if link is not None else ''
        yield tag_text(item.h3), tag_text(item.p), tag_text(item.i), href


def main():
    """Scrape listing pages 1-4 through the proxy and print every book found."""
    # The opener does not depend on the page, so build it once up front.
    opener = make_opener()
    for page in range(1, 5):
        html = fetch_html(opener, build_page_url(page))
        for title, describe, price, href in parse_books(html):
            print(format_book(title, describe, price, href))
            print(50 * "-")


if __name__ == '__main__':
    main()

猜你喜欢

转载自blog.csdn.net/smsmtiger/article/details/88994982