requests + re 爬取网站图书信息(Python)

# -*- coding: utf-8 -*-


import requests
import re, json




BOOK_URL = 'https://book.douban.com/'
OUTPUT_PATH = 'touban.txt'
# Seconds to wait for the server before giving up; without a timeout
# requests.get() can block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 10


def parse_books(content):
    """Extract book records from the HTML text of the listing page.

    The page is searched for every ``<ul>`` whose class list ends with
    ``list-col list-col5 list-express slide-item``; inside each such list
    every ``<li>`` entry is matched for its cover link, title and author.

    Args:
        content: HTML source of the page as a string.

    Returns:
        A list of dicts with the keys ``'href'``, ``'title'`` and
        ``'author'`` (in that order), one per matched book, in page order.
        Values are stripped of surrounding whitespace and ``&nbsp;``
        entities are removed from the author. An empty list is returned
        when nothing matches.
    """
    # re.S lets '.' span newlines, since each entry covers several lines.
    list_pattern = re.compile(
        r'<ul.*?list-col list-col5 list-express slide-item">(.*?)</ul>',
        re.S,
    )
    href = r'<li.*?cover.*?href="(.*?)".*?'
    title = r'<div.*?title.*?title.*?>(.*?)</a>.*?'
    author = r'<div.*?more-meta.*?author.*?>(.*?)</span>.*?</li>'
    item_pattern = re.compile(href + title + author, re.S)

    books = []
    # Match the book information inside each list's child tags.
    for list_html in list_pattern.findall(content):
        for book_href, book_title, book_author in item_pattern.findall(list_html):
            books.append({
                'href': book_href.strip(),
                'title': book_title.strip(),
                'author': book_author.replace('&nbsp;', '').strip(),
            })
    return books


def main():
    """Download the listing page and write one JSON object per book.

    Raises:
        requests.RequestException: if the request times out, fails, or the
            server answers with an HTTP error status.
    """
    response = requests.get(BOOK_URL, timeout=REQUEST_TIMEOUT)
    # Fail loudly on 4xx/5xx instead of parsing an error page and silently
    # producing an empty output file.
    response.raise_for_status()
    books = parse_books(response.text)

    with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
        for book in books:
            f.write(json.dumps(book, ensure_ascii=False) + '\n')


if __name__ == '__main__':
    main()


{"author": "[法] 米歇尔·普西", "href": "https://book.douban.com/subject/30180673/?icn=index-editionrecommend", "title": "她不是我妈妈"}
{"author": "[意]马西米利亚诺·威尔吉利奥", "href": "https://book.douban.com/subject/30180821/?icn=index-editionrecommend", "title": "那不勒斯的萤火"}
{"author": "于蕾,吕逸涛", "href": "https://book.douban.com/subject/30206904/?icn=index-editionrecommend", "title": "国家宝藏"}
{"author": "张立民", "href": "https://book.douban.com/subject/30235899/?icn=index-editionrecommend", "title": "最后一公里的哲学:电商物流全链条运营管理"}
{"author": "【英】詹姆斯•霍尼伯内(James Honeyborne)/【英】马克•布朗罗(Mark Brownlow)", "href": "https://book.douban.com/subject/30183403/?icn=index-editionrecommend", "title": "BBC全新4K海洋百科:蓝色星球II"}
{"author": "[葡] 若泽·萨拉马戈", "href": "https://book.douban.com/subject/27598520/?icn=index-latestbook-subject", "title": "里卡尔多·雷耶斯离世那年"}
{"author": "[美] 史蒂芬·平克", "href": "https://book.douban.com/subject/30186025/?icn=index-latestbook-subject", "title": "风格感觉"}
{"author": "赵垒", "href": "https://book.douban.com/subject/30204837/?icn=index-latestbook-subject", "title": "傀儡城之荆轲刺秦"}
{"author": "梅贻琦/黄延复/王小宁", "href": "https://book.douban.com/subject/30197575/?icn=index-latestbook-subject", "title": "梅贻琦西南联大日记"}
{"author": "[日] 永井荷风", "href": "https://book.douban.com/subject/30171301/?icn=index-latestbook-subject", "title": "濹东绮谭"}
{"author": "[波兰] 安杰伊·瓦伊达/Andrzej Wajda", "href": "https://book.douban.com/subject/30211002/?icn=index-latestbook-subject", "title": "我们一起拍片!"}
{"author": "[德] 弗兰克·施茨廷", "href": "https://book.douban.com/subject/27604676/?icn=index-latestbook-subject", "title": "群"}
{"author": "[美] 克丽丝特尔·潘恩/Crystal Paine", "href": "https://book.douban.com/subject/30206819/?icn=index-latestbook-subject", "title": "会赚钱的妈妈"}
{"author": "[日] 石田衣良", "href": "https://book.douban.com/subject/27622428/?icn=index-latestbook-subject", "title": "美丽的孩子"}
{"author": "杨时旸", "href": "https://book.douban.com/subject/30218577/?icn=index-latestbook-subject", "title": "孤独的影猎人"}
{"author": "[德]沃尔夫冈·赫伦多夫", "href": "https://book.douban.com/subject/27598521/?icn=index-latestbook-subject", "title": "小心,沙漠有人"}
{"author": "[英] 珍妮特·温特森", "href": "https://book.douban.com/subject/27663541/?icn=index-latestbook-subject", "title": "我要快乐,不必正常"}
{"author": "朱一叶", "href": "https://book.douban.com/subject/30198364/?icn=index-latestbook-subject", "title": "死于象蹄"}
{"author": "[荷] 伊恩·布鲁玛", "href": "https://book.douban.com/subject/27662697/?icn=index-latestbook-subject", "title": "日本之镜"}
{"author": "[美] 威廉·莫尔顿·马斯顿", "href": "https://book.douban.com/subject/30210732/?icn=index-latestbook-subject", "title": "神奇女侠"}
{"author": "[美] 特德·焦亚", "href": "https://book.douban.com/subject/30203912/?icn=index-latestbook-subject", "title": "如何听爵士"}
{"author": "邓安庆", "href": "https://book.douban.com/subject/30221630/?icn=index-latestbook-subject", "title": "纸上王国"}
{"author": "朱伟", "href": "https://book.douban.com/subject/30205589/?icn=index-latestbook-subject", "title": "重读八十年代"}
{"author": "邓安庆", "href": "https://book.douban.com/subject/30190319/?icn=index-latestbook-subject", "title": "望花"}
{"author": "[美]沃尔特·李普曼", "href": "https://book.douban.com/subject/27662713/?icn=index-latestbook-subject", "title": "舆论"}
{"author": "[英] P•D•詹姆斯", "href": "https://book.douban.com/subject/27111572/?icn=index-latestbook-subject", "title": "人类之子"}
{"author": "骆仪", "href": "https://book.douban.com/subject/30198500/?icn=index-latestbook-subject", "title": "京都好物"}
{"author": "(美) 比尔·克林顿 (Bill Clinton)/[美] 詹姆斯·帕特森", "href": "https://book.douban.com/subject/30218923/?icn=index-latestbook-subject", "title": "失踪的总统"}
{"author": "刘冰/林秦文/李敏", "href": "https://book.douban.com/subject/30203973/?icn=index-latestbook-subject", "title": "中国常见植物野外识别手册(北京册)"}
{"author": "冶文彪", "href": "https://book.douban.com/subject/30205286/?icn=index-latestbook-subject", "title": "清明上河图密码 5"}
{"author": "[英] 劳拉·卡琳/Laura Carlin", "href": "https://book.douban.com/subject/30181220/?icn=index-latestbook-subject", "title": "创造自己的世界"}
{"author": "郭强生", "href": "https://book.douban.com/subject/30217599/?icn=index-latestbook-subject", "title": "断代"}
{"author": "史杰鹏", "href": "https://book.douban.com/subject/30183948/?icn=index-latestbook-subject", "title": "悠悠我心"}
{"author": "[俄] 柳德米拉·乌利茨卡娅", "href": "https://book.douban.com/subject/30205823/?icn=index-latestbook-subject", "title": "库科茨基医生的病案"}
{"author": "[美] 兰德尔·柯林斯", "href": "https://book.douban.com/subject/30143236/?icn=index-latestbook-subject", "title": "文凭社会"}
{"author": "[法] 让-皮埃尔·吉布拉", "href": "https://book.douban.com/subject/30205166/?icn=index-latestbook-subject", "title": "爱的缓刑"}
{"author": "[美]丽贝卡·特雷斯特", "href": "https://book.douban.com/subject/30128172/?icn=index-latestbook-subject", "title": "单身女性的时代"}
{"author": "[俄] 弗拉基米尔·索罗金", "href": "https://book.douban.com/subject/27200259/?icn=index-latestbook-subject", "title": "碲钉国"}
{"author": "苏精", "href": "https://book.douban.com/subject/30218894/?icn=index-latestbook-subject", "title": "铸以代刻"}
{"author": "[英] 石黑一雄", "href": "https://book.douban.com/subject/30181685/?icn=index-latestbook-subject", "title": "莫失莫忘"}

猜你喜欢

转载自blog.csdn.net/yinhangxitong36/article/details/80632502