heartsong/scrapy.cfg file
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = heartsong.settings

[deploy]
#url = http://localhost:6800/
project = heartsong
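For orientation, the layout produced by scrapy startproject heartsong (which the file paths below follow; the spider file itself is added by hand) looks roughly like this:

heartsong/
├── scrapy.cfg
└── heartsong/
    ├── __init__.py
    ├── items.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        ├── __init__.py
        └── heartsong_scrapy.py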
heartsong/heartsong/settings.py file
# -*- coding: utf-8 -*-

BOT_NAME = 'heartsong'

SPIDER_MODULES = ['heartsong.spiders']
NEWSPIDER_MODULE = 'heartsong.spiders'

ROBOTSTXT_OBEY = False  # do not obey the robots.txt protocol

# Configure the item pipeline; the number is the priority. Since this project
# has only one pipeline, any value from 1-1000 will do.
ITEM_PIPELINES = {
    'heartsong.pipelines.HeartsongPipeline': 300,
}

MONGO_HOST = "127.0.0.1"   # database host IP
MONGO_PORT = 27017         # port number
MONGO_DB = "Spider"        # database name
MONGO_COLL = "heartsong"   # collection name
# MONGO_USER = "zhangsan"  # uncomment if the database requires authentication
# MONGO_PSW = "123456"
heartsong/heartsong/pipelines.py file
# -*- coding: utf-8 -*-
import pymongo
from scrapy.conf import settings


class HeartsongPipeline(object):
    def __init__(self):
        # Connect to the database
        self.client = pymongo.MongoClient(host=settings['MONGO_HOST'],
                                          port=settings['MONGO_PORT'])
        # If the database login requires an account and password:
        # self.client.admin.authenticate(settings['MONGO_USER'], settings['MONGO_PSW'])
        self.db = self.client[settings['MONGO_DB']]    # get a handle to the database
        self.coll = self.db[settings['MONGO_COLL']]    # get a handle to the collection

    def process_item(self, item, spider):
        postItem = dict(item)        # convert the item to dictionary form
        self.coll.insert(postItem)   # insert a record into the database
        return item                  # returning the item echoes it in the console; you may omit this
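To check that the pipeline is actually writing records, a minimal sketch (assuming MongoDB is running on the host and port configured above, and a pymongo version compatible with the insert() call used in the pipeline) could query the collection directly:

# -*- coding: utf-8 -*-
# Standalone check script (not part of the project): list a few stored posts.
# The host, port, database and collection names are taken from settings.py above.
import pymongo

client = pymongo.MongoClient(host="127.0.0.1", port=27017)
coll = client["Spider"]["heartsong"]

print("%d records stored so far" % coll.count())
for doc in coll.find().limit(3):
    # The spider below fills in author, post_time, url and content
    print("%s | %s" % (doc.get("author"), doc.get("url")))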
heartsong/heartsong/items.py file
# -*- coding: utf-8 -*-
import scrapy


class HeartsongItem(scrapy.Item):
    title = scrapy.Field()      # title of the post
    url = scrapy.Field()        # web link of the post
    author = scrapy.Field()     # author of the post
    post_time = scrapy.Field()  # time the post was published
    content = scrapy.Field()    # content of the post
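A HeartsongItem behaves like a dictionary, which is why the pipeline can simply call dict(item) on it. A quick illustration with made-up values, run from the project root outside of a crawl:

from heartsong.items import HeartsongItem

item = HeartsongItem()
item['author'] = 'someone'   # fields are assigned like dictionary keys
item['url'] = 'http://www.heartsong.top/forum.php?mod=viewthread&tid=34'
print(dict(item))            # only the fields that were set appear in the dict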
heartsong/heartsong/spiders/heartsong_scrapy.py file
# -*- coding: utf-8 -*-
# import scrapy  # this single import could replace the three imports below, but it is not recommended
from scrapy.spiders import Spider
from scrapy.selector import Selector
from scrapy import Request
from heartsong.items import HeartsongItem  # if PyCharm flags this import it has just misread the directory layout; it does not affect running


class HeartsongSpider(Spider):
    name = "heartsong"
    allowed_domains = ["heartsong.top"]  # domains allowed to be crawled; pages outside them are ignored
    start_urls = [
        # Start url; we start from the largest tid and iterate towards 0
        "http://www.heartsong.top/forum.php?mod=viewthread&tid=34"
    ]

    # Used to keep the login state: copy the cookie string from Chrome,
    # convert it to dictionary form and paste it here
    cookies = {}

    # HTTP headers sent to the server; some sites require a disguised browser
    # header for crawling, others do not
    headers = {
        # 'Connection': 'keep - alive',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36'
    }

    # Configuration for handling the response of the request
    meta = {
        'dont_redirect': True,                # disable web redirection
        'handle_httpstatus_list': [301, 302]  # which status codes we handle ourselves
    }

    def get_next_url(self, oldUrl):
        '''
        description: returns the url for the next iteration
        :param oldUrl: the url crawled last time
        :return: the url to crawl next
        '''
        # Incoming url format: http://www.heartsong.top/forum.php?mod=viewthread&tid=34
        l = oldUrl.split('=')  # split the string on '='
        oldID = int(l[2])
        newID = oldID - 1
        if newID == 0:  # if tid has iterated down to 0, the site has been crawled completely and the spider can stop
            return
        newUrl = l[0] + "=" + l[1] + "=" + str(newID)  # construct the new url
        return str(newUrl)  # return the new url

    def start_requests(self):
        """
        This overridden method issues the first Request
        :return:
        """
        # Request self.start_urls[0] with our headers and cookies; the returned
        # response is handed to the callback function parse
        yield Request(self.start_urls[0],
                      callback=self.parse, headers=self.headers,
                      cookies=self.cookies, meta=self.meta)

    def parse(self, response):
        """
        Handles the first page of a thread
        :param response:
        :return:
        """
        selector = Selector(response)  # create a selector
        table = selector.xpath('//*[starts-with(@id, "pid")]')  # get all floors (posts)

        if not table:
            # There is no floor in this link, so the thread has probably been deleted.
            # Save such urls to a file so we can review why later.
            print "bad url!"
            f = open('badurl.txt', 'a')
            f.write(response.url)
            f.write('\n')
            f.close()
            # Issue a request for the next thread
            next_url = self.get_next_url(response.url)  # response.url is the url of the original request
            if next_url != None:  # if a new url was returned
                yield Request(next_url, callback=self.parse, headers=self.headers,
                              cookies=self.cookies, meta=self.meta)
            return

        for each in table:
            item = HeartsongItem()  # instantiate an item
            try:
                # Match information with XPath; note that extract() returns a list
                item['author'] = each.xpath('tr[1]/td[@class="pls"]/div[@class="pls favatar"]/div[@class="pi"]/div[@class="authi"]/a/text()').extract()[0]
                item['post_time'] = each.xpath('tr[1]/td[@class="plc"]/div[@class="pi"]').re(r'[0-9]+-[0-9]+-[0-9]+ [0-9]+:[0-9]+:[0-9]+')[0]
            except:
                continue
            # XPath's string(.) handles nested tags; see any XPath tutorial for details
            content_list = each.xpath('.//td[@class="t_f"]').xpath('string(.)').extract()
            content = "".join(content_list)  # convert the list to a string
            item['url'] = response.url  # this is how to get the url of the current page
            # Remove line breaks, spaces, etc. from the content
            item['content'] = content.replace('\r\n', '').replace(' ', '').replace('\n', '')
            yield item  # hand the populated Item object to the pipeline for processing

        pages = selector.xpath('//*[@id="pgt"]/div/div/label/span')
        if pages:  # if pages is not an empty list, the thread is paginated
            pages = pages[0].re(r'[0-9]+')[0]  # extract the total number of pages with a regex
            print "This post has", pages, "pages"
            # response.url format: http://www.heartsong.top/forum.php?mod=viewthread&tid=34
            # Subpage url format: http://www.heartsong.top/forum.php?mod=viewthread&tid=34&page=1
            tmp = response.url.split('=')  # split the url on '='
            # Loop to generate requests for all subpages
            for page_num in xrange(2, int(pages) + 1):
                # Construct the new url
                sub_url = tmp[0] + '=' + tmp[1] + '=' + tmp[2] + '&page=' + str(page_num)
                # Note that the callback here is self.sub_parse, meaning the response
                # of this request will be processed in self.sub_parse
                yield Request(sub_url, callback=self.sub_parse, headers=self.headers,
                              cookies=self.cookies, meta=self.meta)

        # Issue a request for the next thread
        next_url = self.get_next_url(response.url)  # response.url is the url of the original request
        if next_url != None:  # if a new url was returned
            yield Request(next_url, callback=self.parse, headers=self.headers,
                          cookies=self.cookies, meta=self.meta)

    def sub_parse(self, response):
        """
        Crawls the pages of a thread other than its first page
        :param response:
        :return:
        """
        selector = Selector(response)
        table = selector.xpath('//*[starts-with(@id, "pid")]')  # get all floors (posts)
        for each in table:
            item = HeartsongItem()  # instantiate an item
            try:
                # Match information with XPath; note that extract() returns a list
                item['author'] = each.xpath('tr[1]/td[@class="pls"]/div[@class="pls favatar"]/div[@class="pi"]/div[@class="authi"]/a/text()').extract()[0]
                item['post_time'] = each.xpath('tr[1]/td[@class="plc"]/div[@class="pi"]').re(r'[0-9]+-[0-9]+-[0-9]+ [0-9]+:[0-9]+:[0-9]+')[0]
            except:
                continue
            content_list = each.xpath('.//td[@class="t_f"]').xpath('string(.)').extract()
            content = "".join(content_list)  # convert the list to a string
            item['url'] = response.url  # this is how to get the url of the current page
            # Remove line breaks, spaces, etc. from the content
            item['content'] = content.replace('\r\n', '').replace(' ', '').replace('\n', '')
            yield item  # hand the populated Item object to the pipeline for processing
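The cookies attribute above is left as an empty dictionary; as its comment says, you can copy the raw cookie string from Chrome and convert it to dictionary form. A small sketch of such a conversion (the helper name and example values are made up here):

def cookie_string_to_dict(cookie_str):
    """Convert a raw 'name1=value1; name2=value2' cookie string into a dict."""
    cookies = {}
    for pair in cookie_str.split(';'):
        if '=' in pair:
            name, _, value = pair.strip().partition('=')
            cookies[name] = value
    return cookies

# Example with made-up values:
# cookies = cookie_string_to_dict("session_id=abc123; uid=42")

Once the files above are in place, the spider is started from the project root with scrapy crawl heartsong, where heartsong is the name attribute of the spider.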