python 获取京东畅销书目录和网址!赞一个

从事python开发有一年多时间,起初是学习全栈的,工作中主要是在做后端开发,现在练练手,了解一下最新爬虫思路和爬虫方法。

#!/usr/bin/env python

# encoding=utf-8


import requests
from bs4 import BeautifulSoup
from requests import HTTPError


# Scheme prefix prepended to JD's protocol-relative ("//host/path") links
# before they are fetched.
HTTP_ = 'http:'




def download_page(url):
    """Fetch *url* and return the raw response body as bytes.

    Returns None when the request fails, instead of the original's
    UnboundLocalError (`data` was never assigned on the error path).

    The original caught builtin ConnectionError/TimeoutError, which
    requests does not raise; RequestException is the base class of
    every error requests.get can produce, including HTTPError from
    raise_for_status().
    """
    print(url)
    try:
        response = requests.get(url, timeout=10)  # avoid hanging forever
        response.raise_for_status()  # surface HTTP 4xx/5xx as HTTPError
        return response.content
    except requests.RequestException as err:
        print(err)  # print the error itself, not err.__traceback__
        return None




def parse_html(html):
    """Print each book link found in *html* and return the next-page href.

    When no "next" anchor exists (last page), returns the
    'javascript:void(0);' sentinel that main() uses as its stop
    condition, instead of crashing with TypeError on None['href'].
    """
    soup = BeautifulSoup(html, "html.parser")
    book_list_soup = soup.find_all('div', attrs={'class': 'p-detail'})
    for book_li in book_list_soup:
        a_tag = book_li.find('a', attrs={'class': 'p-name'})
        # Guard: a listing entry without the expected anchor is skipped.
        if a_tag is not None:
            # Python 3 print call; the original `print a_tag` was
            # Python 2 statement syntax and a SyntaxError here.
            print(a_tag)
    next_button = soup.find('a', attrs={'class': 'pn-next'})
    if next_button is None:
        return 'javascript:void(0);'  # end-of-pagination sentinel
    return next_button['href']




def main():
    """Crawl JD bestseller listing pages, following the "next" link
    until it degrades into the javascript:void(0); end sentinel."""
    end_sentinel = 'javascript:void(0);'
    next_url = '//book.jd.com/booktop/0-0-0.html?category=3287-0-0-0-5-1#comfort'
    while next_url != end_sentinel:
        page = download_page(HTTP_ + next_url)
        next_url = parse_html(page)




# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

运行结果:

请使用手机"扫一扫"

猜你喜欢

转载自blog.csdn.net/qq_37981867/article/details/80668733
今日推荐