Python 最适合练手的爬虫百思不得姐

这个网站没有反爬措施。本文使用 Python 自带的 urllib 发送请求，用 lxml 的 XPath 解析页面，通过生成器逐条产出数据，最后把数据保存到 MongoDB。
代码

from urllib import request
from pymongo import MongoClient

from lxml import etree

# Module-level MongoDB client shared by the spider, pointed at localhost:27017.
# NOTE: the name is spelled "clinet" (sic); save_data refers to it by this name.
clinet = MongoClient(host='localhost', port=27017)
class Requesterror(Exception):
    """Custom exception type used to report a failed request."""

class spider_joke(object):
    """Scrape one listing page of budejie.com and store every post in MongoDB.

    The pipeline is: download the page (``get_request``), turn it into one
    dict per post (``get_parse``), and insert each dict into the
    ``test.joke`` collection (``save_data``). ``main`` wires the steps together.
    """

    def __init__(self):
        """Set the page to crawl and the request headers sent with it."""
        self.url = "http://www.budejie.com/2"
        self.headers = {
            'User-Agent': "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50"}

    def get_request(self, url, headers, timeout=10):
        """Download *url* and return its body decoded as UTF-8 text.

        Args:
            url: Address to fetch.
            headers: Mapping of HTTP headers to send with the request.
            timeout: Seconds to wait on blocking network operations, so a
                stalled server cannot hang the crawl forever.

        Returns:
            The decoded response body, or ``None`` if the request, the read
            or the decoding failed (the error is printed, not raised).
        """
        try:
            req = request.Request(url, headers=headers)
            # urlopen raises on failure instead of returning a falsy value, so
            # no explicit "empty response" check is needed. The context manager
            # guarantees the response is closed even if read/decode fails.
            with request.urlopen(req, timeout=timeout) as response:
                return response.read().decode()
        except Exception as e:
            # Best-effort by design: report the problem and let the caller
            # decide what to do with a missing page.
            print(e)
            return None

    @staticmethod
    def _first(node, path):
        """Return the first result of XPath *path* on *node*, or None if empty."""
        matches = node.xpath(path)
        return matches[0] if matches else None

    def get_parse(self, text):
        """Parse the page HTML and yield one dict per post.

        Args:
            text: HTML source of the listing page. ``None`` or an empty
                string (e.g. after a failed download) yields nothing.

        Yields:
            A dict with the keys ``user_name``, ``pubDate``, ``pic_url``,
            ``praise_count``, ``comment-counts`` and ``cotent``; a value is
            ``None`` when the corresponding node is missing from the post.
        """
        if not text:
            return
        htm = etree.HTML(text)
        if htm is None:
            # Defensive: nothing to iterate if the parser produced no tree.
            return
        for li in htm.xpath("//div[@class='j-r-list']/ul/li"):
            # Key spellings (including "cotent") are kept as-is because they
            # are the field names of the documents stored in MongoDB.
            yield {
                "user_name": self._first(li, ".//div[@class='u-txt']/a/text()"),
                "pubDate": self._first(li, ".//div[@class='u-txt']/span/text()"),
                "pic_url": self._first(li, ".//div[@class='j-r-list-c-img']//img/@src"),
                "praise_count": self._first(li, ".//li[@class='j-r-list-tool-l-up']/span/text()"),
                "comment-counts": self._first(li, ".//li[contains(@class,'f-tac')]//a/span/text()"),
                "cotent": self._first(li, ".//div[@class='j-r-list-c-desc']/a/text()"),
            }

    def save_data(self, data):
        """Insert *data* into the ``joke`` collection of the ``test`` database.

        Errors are printed rather than raised so that one failed insert does
        not abort the rest of the crawl.
        """
        try:
            col = clinet["test"]["joke"]
            col.insert_one(data)
        except Exception as e:
            print(e)

    def main(self):
        """Run the crawl: download the page, parse it and save every post."""
        try:
            text = self.get_request(self.url, self.headers)
            if text is None:
                # get_request has already printed the reason for the failure.
                return
            for item in self.get_parse(text):
                self.save_data(item)
        except Exception as e:
            print(e)

# Run the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    spider_joke().main()

go_flush

发布了127 篇原创文章 · 获赞 25 · 访问量 3万+

私信关注

Python 最适合练手的爬虫百思不得姐

猜你喜欢