python爬取douban读书

版权声明:zhiyu https://blog.csdn.net/ichglauben/article/details/82495501

通过两天的速成学习,自己写了py的代码来爬豆瓣。
需要注意的地方还是很多的,比如请求时要带上 cookie 和 User-Agent,
还有字符串编码问题(使用的是 PyCharm,记得把文件编码设置为 UTF-8)。

# -*- coding: UTF-8 -*-
import threading
import pymongo
import requests
import re
import random
import time
import MY

from  bs4  import BeautifulSoup
## 1. Crawl target: https://read.douban.com/kind/115?start=0&sort=hot&promotion_only=False&min_price=None&max_price=None&works_type=None
# Request headers passed to requests.get by get_page; only a desktop-browser User-Agent is set.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4)AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}
# Cookie data passed as the `cookies=` argument of requests.get by get_page.
# NOTE(review): the whole browser cookie string is stored under one 'cookie' key instead of
# one entry per cookie name -- confirm the server really receives the cookies as intended.
# NOTE(review): this looks like a captured login session hard-coded in source, and the `ue`
# value "[email protected]" looks like an obfuscation placeholder rather than real data -- verify.
cookies = {'cookie':'bid=gMA1Y4dROSI; ll="118172"; viewed="26878124_26957760_27191009_30259720_3351237_26745780_26638586_1944338_26670241_10607365"; ps=y; gr_session_id_22c937bbd8ebd703f2d8e9445f7dfd03=4f611dd9-6180-49d1-95d6-5c4adb435c87; gr_session_id_22c937bbd8ebd703f2d8e9445f7dfd03_4f611dd9-6180-49d1-95d6-5c4adb435c87=true; ap_v=0,6.0; gr_cs1_4f611dd9-6180-49d1-95d6-5c4adb435c87=user_id%3A1; gr_user_id=4b52f409-b910-4345-9aae-1a170df071ec; __utmt_douban=1; _vwo_uuid_v2=D3178371425194B801CFCEA97775AC031|6642ed0fea096d79e53f923c5540f473; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1536291519%2C%22https%3A%2F%2Faccounts.douban.com%2Fregister_success%22%5D; push_noty_num=0; push_doumail_num=0; __utmt=1; _gat_UA-7019765-1=1; _pk_id.100001.8cb4=1e693baa8d06e564.1536291519.1.1536291584.1536291519.; _pk_ses.100001.8cb4=*; __utma=30149280.1766689065.1533978106.1536283455.1536290506.8; __utmb=30149280.9.10.1536290506; __utmc=30149280; __utmz=30149280.1536290506.8.8.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/accounts/login; __utmv=30149280.18420; _ga=GA1.2.1766689065.1533978106; _gid=GA1.2.684466045.1536291537; ue="[email protected]"; dbcl2="184206822:Lpt4qRfe+vM"'}
def get_page(url, timeout=10):
    """Download ``url`` and return the response body as text.

    The request is sent with the module-level ``headers`` and ``cookies``.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely, which would leave the
            calling worker thread hanging on a stalled connection.

    Returns:
        The decoded response body (``response.text``).

    Raises:
        requests.exceptions.RequestException: On connection failure or timeout.
    """
    response = requests.get(url, headers=headers, cookies=cookies, timeout=timeout)
    return response.text


# Captures (title, author, genre) for one book entry of a listing page.
# re.S lets '.' span newlines, because an entry's markup covers several lines.
_BOOK_PATTERN = re.compile(
    r'<div.*?class="info">.*?"title"><a.*?="/ebook/.*?">(.*?)</a>'
    r'.*?<a.*?"author-item".*?>(.*?)</a>'
    r'.*?<span.*?"genre">(.*?)</span>.*?</div>',
    re.S,
)


def parse_page(info):
    """Extract the book records from the HTML text of a listing page.

    Args:
        info: HTML source of the page as a string.

    Returns:
        A list of ``(title, author, genre)`` string tuples in document
        order. The list is empty when no entry matches the pattern.
    """
    return _BOOK_PATTERN.findall(info)


def write_page(items):
    """Append book records to ``doubanbook.txt`` in the working directory.

    Each record is written as three labelled lines (title, author, genre)
    followed by an empty line. The file is created if it does not exist,
    even when ``items`` is empty.

    Args:
        items: Iterable of records indexable as ``(title, author, genre)``
            strings. Author and genre are stripped of surrounding
            whitespace; the title is written as given.
    """
    # newline='' turns off newline translation, so the explicit '\r\n' below
    # reaches the file unchanged instead of becoming '\r\r\n' on Windows.
    with open('doubanbook.txt', 'a', encoding='utf8', newline='') as book_file:
        # Build the text for the whole page and hand it over in one write call.
        book_file.write(''.join(
            '书名:' + book[0] + '\r\n'
            + '作者:' + book[1].strip() + '\r\n'
            + '类型:' + book[2].strip() + '\r\n\r\n'
            for book in items
        ))



def main(start):
    """Fetch, parse and store the listing page that begins at offset ``start``.

    Args:
        start: Value placed in the ``start`` query parameter of the listing URL.
    """
    base = 'https://read.douban.com/kind/115?start='
    query = '&sort=hot&promotion_only=False&min_price=None&max_price=None&works_type=None'
    url = ''.join([base, str(start), query])
    # Pipeline: download the page, extract the records, append them to the file.
    write_page(parse_page(get_page(url)))

#print(items)
if __name__ == '__main__':
    # Process the listing offsets 0, 20, ..., 80, one thread per page.
    start_time = time.time()
    workers = [threading.Thread(target=main, args=(i,)) for i in range(0, 100, 20)]
    for worker in workers:
        worker.start()

    # Wait for every worker before stopping the clock; otherwise the reported
    # time only covers starting the threads, not the work they perform.
    for worker in workers:
        worker.join()
    end_time = time.time()

    print('cost time is:', end_time - start_time)



书名:镜狱岛事件

作者:时晨

类型:图书 / 虚构



书名:怪屋女孩1

作者:〔美〕兰萨姆·里格斯

类型:图书 / 虚构



书名:特案科:刑警手记(全2册)

作者:风雨如书

类型:图书 / 虚构

后续准备连接 MySQL 数据库,把爬取的数据存储进去,做一个小项目。

猜你喜欢

转载自blog.csdn.net/ichglauben/article/details/82495501