A small Python crawler for scraping articles (Zhihu columns): code snippets

Scraping a Zhihu column

# Scrape a Zhihu column through its JSON API
import json
import re

import scrapy
from lxml import etree


class XSSpider(scrapy.Spider):
    name = 'xiaoshuo'
    allowed_domains = ['zhihu.com']
    start_urls = ['https://www.zhihu.com/api/v4/columns/c_1059416559054893056/items?limit=10&offset=0']

    def parse(self, response):
        obj = json.loads(response.text)
        for p_item in obj['data']:
            # Fetch the article page itself (synchronously, outside Scrapy's scheduler)
            p_content = get_html_of_response(p_item['url'])
            p_selector = etree.HTML(p_content.text)
            # The article id is the last path segment of .../p/<id>
            sid = re.sub(r".+/p/", "", p_item['url'])
            # Zhihu embeds the article data as JSON in <script id="js-initialData">
            content_json = json.loads(p_selector.xpath("//script[@id='js-initialData']/text()")[0])
            article = content_json['initialState']['entities']['articles'][sid]
            txt = article['title'].strip() + "\r\n"
            # Turn <p> boundaries into line breaks
            txt += article['content'].replace("<p>", "\r\n").replace("</p>", "\r\n")
            with open('G:/learn/3.txt', 'ab+') as fo:  # append to the novel file
                fo.write(txt.encode('UTF-8'))
        # Auto-advance: keep following the next page until one comes back empty
        if obj['data']:
            yield scrapy.Request(obj['paging']['next'], callback=self.parse)
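
Both Zhihu spiders call get_html_of_response, which the original post never shows. A minimal sketch, assuming it is nothing more than a requests.get wrapper with a browser-like User-Agent (the header value is illustrative):

import requests

def get_html_of_response(url):
    # Assumed helper: fetch an article page synchronously, outside Scrapy's
    # scheduler. A browser-like User-Agent (value illustrative) reduces the
    # chance of the request being rejected.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
    return requests.get(url, headers=headers)

The second variant below parses the rendered article HTML with XPath instead of the embedded JSON; it starts at the highest offset and pages backwards through the column: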

class XSSpider(scrapy.Spider):
    name = 'xiaoshuo'
    allowed_domains = ['zhihu.com']
    # Start at the highest offset and walk backwards through the column
    start_urls = ['https://www.zhihu.com/api/v4/columns/c_1059416559054893056/items?limit=50&offset=490']

    def parse(self, response):
        obj = json.loads(response.text)
        # Iterate the page in reverse so the output ends up in reading order
        for p_item in reversed(obj['data']):
            p_content = get_html_of_response(p_item['url'])
            p_selector = etree.HTML(p_content.text)
            txt = ""
            i_title = p_selector.xpath("//h1[@class='Post-Title']/text()")
            if i_title:
                txt += i_title[0].strip() + "\r\n"
            # Collect every text node inside the article body
            i_p = p_selector.xpath("//div[@class='RichText ztext Post-RichText']//p//text()")
            for p in i_p:
                txt += p.strip() + "\r\n"
            with open('G:/learn/7.txt', 'ab+') as fo:  # append to the novel file
                fo.write(txt.encode('UTF-8'))
        # Auto-page backwards; on the first page 'previous' points back at the
        # URL we just fetched, which stops the recursion
        prev_page = obj['paging']['previous']
        if prev_page != response.url:
            yield scrapy.Request(prev_page, callback=self.parse)
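
For orientation, both Zhihu spiders assume the column API returns an envelope of roughly this shape (only the fields the code actually reads are shown; values are illustrative):

# Illustrative shape of a /api/v4/columns/<id>/items response
obj = {
    "data": [
        {"url": "https://zhuanlan.zhihu.com/p/123456789"},  # one entry per article
    ],
    "paging": {
        "previous": "https://www.zhihu.com/api/v4/columns/<id>/items?limit=50&offset=440",
        "next": "https://www.zhihu.com/api/v4/columns/<id>/items?limit=50&offset=540",
    },
}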

Scraping an ordinary novel site

import scrapy


class XSSpider(scrapy.Spider):
    name = 'xiaoshuo'
    allowed_domains = ['2mcn.com']
    start_urls = ['https://www.2mcn.com/html/book/73323/73323986/49627483.html']

    def parse(self, response):
        txt = response.xpath("//h1/text()").extract()[0].strip() + "\r\n"
        # Collect every text node of the chapter body
        for p in response.xpath("//div[@id='content']//text()").extract():
            txt += p.strip() + "\r\n"
        with open('3.txt', 'ab+') as fo:  # append to the novel file
            fo.write(txt.encode('UTF-8'))

        # Auto-advance: follow the "下一章" (next chapter) link
        next_page_item = response.xpath("//a[contains(text(),'下一章')]/@href").extract()
        if next_page_item:
            yield scrapy.Request(response.urljoin(next_page_item[0]), callback=self.parse)
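
All three spiders share the name xiaoshuo, so keep only one of them active in a Scrapy project at a time; assuming a standard project layout, start the crawl with scrapy crawl xiaoshuo.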

 


Reposted from blog.csdn.net/mao_mao37/article/details/107387035