A targeted (directional) crawler written in Scrapy; the crawl target is a forum built on the Discuz! framework.

heartsong/scrapy.cfg file

# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = heartsong.settings

[deploy]
#url = http://localhost:6800/
project = heartsong
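
For reference, the [settings] entry above is what Scrapy reads when the project is launched; a spider can also be started from a plain script via get_project_settings(), which picks up heartsong.settings through this file. A minimal run-script sketch (the file name run.py is an arbitrary choice, assumed to sit next to scrapy.cfg):

# run.py -- minimal sketch, assuming it lives in the same directory as scrapy.cfg
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# get_project_settings() loads heartsong.settings via the [settings] section above
process = CrawlerProcess(get_project_settings())
process.crawl("heartsong")  # the spider name defined in heartsong_scrapy.py
process.start()             # blocks until the crawl finishes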

heartsong/heartsong/settings.py file

# -*- coding: utf-8 -*-

BOT_NAME = 'heartsong'

SPIDER_MODULES = ['heartsong.spiders']
NEWSPIDER_MODULE = 'heartsong.spiders'

ROBOTSTXT_OBEY = False  # Do not obey the robots.txt protocol

# Configure the item pipeline. The number is the priority; since this project
# has only one pipeline, any value from 1 to 1000 will do.
ITEM_PIPELINES = {
    'heartsong.pipelines.HeartsongPipeline': 300,
}

MONGO_HOST = "127.0.0.1" # database host IP
MONGO_PORT = 27017 # port number
MONGO_DB = "Spider" # database name
MONGO_COLL = "heartsong"  # collection name
# MONGO_USER = "zhangsan"  # Uncomment these two lines if the database requires authentication
# MONGO_PSW = "123456"
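
Before starting the crawler, the MongoDB parameters above can be sanity-checked directly with pymongo. A throwaway sketch (not part of the project; count_documents assumes pymongo 3.7 or newer):

import pymongo

client = pymongo.MongoClient(host="127.0.0.1", port=27017)
client.admin.command('ping')  # raises an error if MongoDB is not reachable
print(client["Spider"]["heartsong"].count_documents({}))  # posts stored so far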

heartsong/heartsong/pipelines.py file

# -*- coding: utf-8 -*-

import pymongo
from scrapy.conf import settings

class HeartsongPipeline(object):
    def __init__(self):
        # Connect to the database
        self.client = pymongo.MongoClient(host=settings['MONGO_HOST'], port=settings['MONGO_PORT'])
        # If the database login requires a username and password
        # self.client.admin.authenticate(settings['MONGO_USER'], settings['MONGO_PSW'])
        self.db = self.client[settings['MONGO_DB']] # Get the handle to the database
        self.coll = self.db[settings['MONGO_COLL']] # Get the handle of the collection

    def process_item(self, item, spider):
        postItem = dict(item) # Convert item to dictionary form
        self.coll.insert(postItem) # Insert a record into the database
        return item # Returning the item echoes it in the console log; this line is optional
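
Note that scrapy.conf and Collection.insert() used above are deprecated in newer versions of Scrapy and pymongo. On a current stack, an equivalent pipeline can use the from_crawler hook and insert_one(); the sketch below keeps the same setting names and behaviour:

import pymongo

class HeartsongPipeline(object):
    @classmethod
    def from_crawler(cls, crawler):
        # Read the same settings defined in settings.py
        return cls(host=crawler.settings.get('MONGO_HOST'),
                   port=crawler.settings.get('MONGO_PORT'),
                   db=crawler.settings.get('MONGO_DB'),
                   coll=crawler.settings.get('MONGO_COLL'))

    def __init__(self, host, port, db, coll):
        self.client = pymongo.MongoClient(host=host, port=port)
        self.coll = self.client[db][coll]

    def process_item(self, item, spider):
        self.coll.insert_one(dict(item))  # insert_one replaces the deprecated insert()
        return item

    def close_spider(self, spider):
        self.client.close()  # release the connection when the crawl ends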

heartsong/heartsong/items.py file

# -*- coding: utf-8 -*-

import scrapy

class HeartsongItem(scrapy.Item):
    title = scrapy.Field() # the title of the post
    url = scrapy.Field() # web link of the post
    author = scrapy.Field() # The author of the post
    post_time = scrapy.Field() # Post time
    content = scrapy.Field() # the content of the post
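
A HeartsongItem behaves like a dict, which is why the pipeline can simply call dict(item). Note that the spider below only fills author, post_time, url and content; title is declared but never populated. A quick usage sketch:

from heartsong.items import HeartsongItem

item = HeartsongItem()
item['author'] = u'someone'    # fields are assigned by name
item['content'] = u'post body'
print(dict(item))              # {'author': u'someone', 'content': u'post body'}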

heartsong/heartsong/spiders/heartsong_scrapy.py file

# -*- coding: utf-8 -*-

# import scrapy  # This single import could replace the three imports below, but that style is not recommended here
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapy import Request
from heartsong.items import HeartsongItem # PyCharm may flag this import as an error because it misreads the project layout; it does not affect execution

class HeartsongSpider(Spider):
    name = "heartsong"
    allowed_domains = ["heartsong.top"] # Domains allowed to be crawled; pages outside this domain will not be fetched
    start_urls = [
        # Starting url: begin from the largest tid and iterate downwards towards 0
        "http://www.heartsong.top/forum.php?mod=viewthread&tid=34"
    ]

    # Used to keep the login state: convert the cookie string copied from Chrome into a dict and paste it here (see the helper sketch after this file)
    cookies = {}

    # HTTP headers sent to the server; some sites require a browser-like User-Agent to be crawled, others do not
    headers = {
        # 'Connection': 'keep - alive',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36'
    }

    # Request meta configuration controlling how responses are handled
    meta = {
        'dont_redirect': True, # do not follow redirects
        'handle_httpstatus_list': [301, 302] # status codes passed to the callback instead of being treated as errors
    }

    def get_next_url(self, oldUrl):
        '''
        description: returns the url of the next iteration
        :param oldUrl: the last url crawled
        :return: The url to be crawled next time
        '''
        # Incoming url format: http://www.heartsong.top/forum.php?mod=viewthread&tid=34
        l = oldUrl.split('=')  # Split the string at the '=' signs
        oldID = int(l[2])
        newID = oldID - 1
        if newID == 0:  # If the tid has iterated down to 0, the whole site has been crawled and the spider can stop
            return
        newUrl = l[0] + "=" + l[1] + "=" + str(newID)  # Construct the new url
        return str(newUrl)  # Return the new url

    def start_requests(self):
        """
        Overrides the default start_requests; its job is to issue the first Request
        :return:
        """
        # Request self.start_urls[0] with the headers and cookies above; the returned
        # response will be passed to the parse callback
        yield Request(self.start_urls[0],
                      callback=self.parse, headers=self.headers,
                      cookies=self.cookies, meta=self.meta)

    def parse(self, response):
        """
        Parse the first page of a thread (topic post)
        :param response:
        :return:
        """
        selector = Selector(response) # create selector

        table = selector.xpath('//*[starts-with(@id, "pid")]') # Get all floors
        if not table:
            # There is no floor in this link, indicating that this thread may have been deleted.
            # Save such urls to a file to review why
            print "bad url!"
            f = open('badurl.txt', 'a')
            f.write(response.url)
            f.write('\n')
            f.close()
            # Initiate a request for the next thread
            next_url = self.get_next_url(response.url) # response.url is the url of the original request
            if next_url is not None: # If a new url was returned
                yield Request(next_url, callback=self.parse, headers=self.headers,
                              cookies=self.cookies, meta=self.meta)
            return
        for each in table:
            item = HeartsongItem() # Instantiate an item
            try:
                # Match information through XPath, note that the extract() method returns a list
                item['author'] = each.xpath('tr[1]/td[@class="pls"]/div[@class="pls favatar"]/div[@class="pi"]/div[@class="authi"]/a/text()').extract()[0]
                item['post_time'] = each.xpath('tr[1]/td[@class="plc"]/div[@class="pi"]').re(r'[0-9]+-[0-9]+-[0-9]+ [0-9]+:[0-9]+:[0-9]+')[0]
            except:
                continue
            # XPath's string(.) handles nested tags (a tag wrapped inside another tag); see any XPath tutorial for details
            content_list = each.xpath('.//td[@class="t_f"]').xpath('string(.)').extract()
            content = "".join(content_list) # Convert list to string
            item['url'] = response.url # Get the url of the web page in this way
            # Remove line breaks, spaces, etc. from the content
            item['content'] = content.replace('\r\n', '').replace(' ', '').replace('\n', '')
            yield item # Pass the created and assigned Item object to PipeLine for processing

        pages = selector.xpath('//*[@id="pgt"]/div/div/label/span')
        if pages: # If pages is not an empty list, it means the thread is paginated
            pages = pages[0].re(r'[0-9]+')[0] # Regularly match the total number of pages
            print "This post has", pages, "pages"
            # response.url format: http://www.heartsong.top/forum.php?mod=viewthread&tid=34
            # Sub-page url format: http://www.heartsong.top/forum.php?mod=viewthread&tid=34&page=1
            tmp = response.url.split('=') # split url with =
            # Loop to generate requests for all subpages
            for page_num in xrange(2, int(pages) + 1):
                # construct new url
                sub_url = tmp[0] + '=' + tmp[1] + '=' + tmp[2] + '&page=' + str(page_num)
                # Note that the callback here is self.sub_parse, which means the response
                # of this request will be processed in self.sub_parse
                yield Request(sub_url, callback=self.sub_parse, headers=self.headers,
                              cookies=self.cookies, meta=self.meta)

        # Initiate a request for the next thread
        next_url = self.get_next_url(response.url) # response.url is the url of the original request
        if next_url is not None: # If a new url was returned
            yield Request(next_url, callback=self.parse, headers=self.headers,
                          cookies=self.cookies, meta=self.meta)

    def sub_parse(self, response):
        """
        Parse the remaining pages of a thread (every page except the first)
        :param response:
        :return:
        """
        selector = Selector(response)
        table = selector.xpath('//*[starts-with(@id, "pid")]') # Get all floors
        for each in table:
            item = HeartsongItem() # Instantiate an item
            try:
                # Match information through XPath, note that the extract() method returns a list
                item['author'] = each.xpath('tr[1]/td[@class="pls"]/div[@class="pls favatar"]/div[@class="pi"]/div[@class="authi"]/a/text()').extract()[0]
                item['post_time'] = each.xpath('tr[1]/td[@class="plc"]/div[@class="pi"]').re(r'[0-9]+-[0-9]+-[0-9]+ [0-9]+:[0-9]+:[0-9]+')[0]
            except:
                continue
            content_list = each.xpath('.//td[@class="t_f"]').xpath('string(.)').extract()
            content = "".join(content_list) # Convert list to string
            item['url'] = response.url # Get the url of the web page in this way
            # Remove line breaks, spaces, etc. from the content
            item['content'] = content.replace('\r\n', '').replace(' ', '').replace('\n', '')
            yield item # Pass the created and assigned Item object to PipeLine for processing
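
As noted next to the cookies attribute at the top of the spider, a cookie string copied from Chrome's developer tools ("name1=value1; name2=value2; ...") has to be converted into a dictionary before Scrapy can send it. A minimal helper sketch (the commented-out example string is a placeholder, not real cookie data):

def cookie_string_to_dict(cookie_string):
    """Convert "a=1; b=2" copied from the browser into {'a': '1', 'b': '2'}."""
    cookies = {}
    for pair in cookie_string.split(';'):
        if '=' in pair:
            key, _, value = pair.strip().partition('=')
            cookies[key] = value
    return cookies

# cookies = cookie_string_to_dict("pid=xxx; auth=yyy")  # paste your own string here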
