This crawler incrementally scrapes the whole site's news (title, article body, brief introduction, release time, source, and tag) and uses MySQL to store the content of each news page together with all the pictures in the news. Project file structure:
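The original screenshot of the project tree is not reproduced here, but judging from the settings and imports below, the layout is the standard Scrapy one, roughly:

eastday/
    scrapy.cfg
    run.py                  entry script shown below: seeds Redis and launches the crawl
    eastday/
        __init__.py
        items.py            EastdayItem definition
        middlewares.py      UserAgentMiddleware (and an unused CookiesMiddleware)
        pipelines.py        MySQL pipeline + image download pipeline
        settings.py
        spiders/
            __init__.py
            eastdayspider.py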
Here is the content of run.py:
#coding=utf-8
from scrapy import cmdline
import redis, time, threading
from multiprocessing import Process
#import scrapy.log

#cmdline.execute("scrapy crawl baoxian -s LOG_FILE=scrapy10.log".split())
#scrapy crawl myspider -s LOG_FILE=scrapy2.log

# The homepage plus one JSONP feed per news category.
start_urls = [
    'http://mini.eastday.com/',
    'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0010&param=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=170603095010319,170603093955594-2,170603093955594&jsonpcallback=jQuery18303164258797187358_1496455837718&_=1496455838146',  # domestic
    'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0011&param=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=170603142336718-2,170603142336718,170603122752716&jsonpcallback=jQuery18307262756496202201_1496477922458&_=1496477923254',  # international
    'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0005&param=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery18302500620267819613_1496483754044&_=1496483755277',  # military
    'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0003&param=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery183026658024708740413_1496480575988&_=1496480576634',  # society
    'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0002&param=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery1830691694314358756_1496480816841&_=1496480817500',  # entertainment
    'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0019&param=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery18303703077440150082_1496480892188&_=1496480892581',  # health
    'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0015&param=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery183023222095426172018_1496480961781&_=1496480962307',  # fashion
    'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0008&param=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery183017557532875798643_1496481013410&_=1496481013824',  # technology
    'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0012&param=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery18308183211348950863_1496481106550&_=1496481106993',  # cars
    'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0018&param=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery18309359942991286516_1496481227742&_=1496481228242',  # humanities
    'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0007&param=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery183019699203735217452_1496481313637&_=1496481314077',  # games
    'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0020&param=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery18307782149398699403_1496481413006&_=1496481413401',  # horoscope
    'http://ttpc.dftoutiao.com/jsonpc/refresh?type=0021&param=null%0914963741798389872%09toutiao%09DFTT%091&readhistory=n170603081129137,n170603071002231,170603142336718-2&jsonpcallback=jQuery18306590236281044781_1496481467020&_=1496481467496',  # home
]

r = redis.Redis(host='127.0.0.1', port=6379, db=0)


def check_redis_requests():
    while(1):
        '''
        for url in start_urls:
            r.rpush('eastdayspider:start_urls', url)
        print u'Insert into start_urls:', r.lrange('eastdayspider:start_urls', 0, -1)
        '''
        # Seed the start URLs into the Redis set the spider reads from.
        for url in start_urls:
            r.sadd('eastdayspider:start_urls', url)
        print u'Insert into start_urls:', r.smembers('eastdayspider:start_urls')

        # Poll the pending-request queue; once it has been empty for 30
        # consecutive checks (~5 minutes), loop around and re-seed.
        count = 0
        while (count < 30):
            if r.exists('eastdayspider:requests'):
                time.sleep(60)
                count = 0
            else:
                count += 1
                time.sleep(10)


def run_spider():
    cmdline.execute("scrapy crawl eastdayspider".split())


if __name__ == '__main__':
    p1 = Process(target=check_redis_requests)
    p2 = Process(target=run_spider)

    p1.start()
    time.sleep(5)
    p2.start()

    p1.join()
    p2.join()
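run.py drives the crawl with two processes: check_redis_requests() seeds the category URLs into the eastdayspider:start_urls set, then polls the eastdayspider:requests queue; only after the queue has stayed empty for about five minutes does it re-seed the start URLs, which is what makes the crawl incremental. The spider file itself is not posted here; given these settings it would be a scrapy_redis RedisSpider reading from that key, roughly like the minimal sketch below (the parsing logic is assumed, not the post's original code):

#coding=utf-8
from scrapy_redis.spiders import RedisSpider
from eastday.items import EastdayItem

class EastdaySpider(RedisSpider):
    name = 'eastdayspider'
    # With REDIS_START_URLS_AS_SET = True in settings.py, scrapy-redis pops
    # start URLs from this Redis set, the same key run.py seeds.
    redis_key = 'eastdayspider:start_urls'

    def parse(self, response):
        item = EastdayItem()
        item['url'] = response.url
        # ... extract title, tag, brief, pubdate, origin, article,
        # miniimg and img_urls from the page / JSONP feed here ...
        yield item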
Here is settings.py:
# -*- coding: utf-8 -*-

# Scrapy settings for eastday project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'eastday'

SPIDER_MODULES = ['eastday.spiders']
NEWSPIDER_MODULE = 'eastday.spiders'

DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
REDIS_START_URLS_AS_SET = True  # store start_urls as a Redis set (the default is a list)
SCHEDULER_PERSIST = True

DEPTH_PRIORITY = 0
RETRY_TIMES = 20

IMAGES_STORE = 'd:/'
IMAGES_EXPIRES = 90

REDIS_HOST = 'localhost'
REDIS_PORT = 6379

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'eastday (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 10

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'eastday.middlewares.EastdaySpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    "eastday.middlewares.UserAgentMiddleware": 401,
    #"eastday.middlewares.CookiesMiddleware": 402,
}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    #'eastday.pipelines.EastdayPipeline': 300,
    'eastday.pipelines.MysqlDBPipeline': 400,
    'eastday.pipelines.DownloadImagesPipeline': 200,
    #'scrapy_redis.pipelines.RedisPipeline': 400,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
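middlewares.py is not reproduced in this post, but the UserAgentMiddleware registered in DOWNLOADER_MIDDLEWARES above is the usual random User-Agent rotation; a minimal sketch of what it presumably looks like (the agent list is illustrative, not the original code):

#coding=utf-8
import random

USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/602.4.8 (KHTML, like Gecko) Version/10.0.3 Safari/602.4.8',
]

class UserAgentMiddleware(object):
    """Attach a random User-Agent header to every outgoing request."""

    def process_request(self, request, spider):
        request.headers['User-Agent'] = random.choice(USER_AGENTS)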
Here is pipelines.py, which also contains the table-creation statement. It includes a MySQL query that checks whether the URL already exists, but that check is actually redundant, because URLs are already deduplicated in Redis.
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import time, json, pymysql, re
from items import EastdayItem
from scrapy import Request
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem


'''
CREATE TABLE `eastday` (
  `id` INT(10) NOT NULL AUTO_INCREMENT,
  `title` VARCHAR(255) DEFAULT NULL,
  `url` VARCHAR(80) DEFAULT NULL,
  `tag` VARCHAR(30) DEFAULT NULL,
  `brief` VARCHAR(300) DEFAULT NULL,
  pubdate DATETIME,
  origin VARCHAR(50),
  crawled_time DATETIME,
  `miniimg` VARCHAR(500) DEFAULT NULL,
  `img_urls` TEXT,
  `article` TEXT,
  PRIMARY KEY (`id`)
) ENGINE=INNODB AUTO_INCREMENT=0 DEFAULT CHARSET=utf8
'''


class EastdayPipeline(object):

    def process_item(self, item, spider):
        print '----------------------------%s' % json.dumps(dict(item), ensure_ascii=False)
        item["crawled_time"] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        return item


class MysqlDBPipeline(object):
    def __init__(self):
        self.conn = pymysql.connect(
            host='localhost',
            port=3306,
            user='root',
            passwd='123456',
            db='test',
            charset='utf8',
        )
        self.cur = self.conn.cursor()

    def process_item(self, item, spider):
        if isinstance(item, EastdayItem):
            item["crawled_time"] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            print item['pubdate']

            try:
                # Escape quotes so the values can be embedded in the SQL below.
                for key in dict(item):
                    item[key] = str(item[key]).replace("'", "\\'")
                    item[key] = str(item[key]).replace('"', '\\"')

                sql = """insert into eastday values(NULL,"{title}","{url}","{tag}","{brief}","{pubdate}","{origin}","{crawled_time}","{miniimg}","{img_urls}","{article}")""".format(
                    title=item['title'], url=item['url'], tag=item['tag'], brief=item['brief'],
                    pubdate=item['pubdate'], origin=item['origin'], crawled_time=item['crawled_time'],
                    miniimg=item['miniimg'], img_urls=item['img_urls'], article=item['article'])
                sql2 = 'select 1 from eastday where url="%s"' % item['url']
                print 'sql:', sql

                self.cur.execute(sql2)
                is_exist = self.cur.fetchone()
                if is_exist == (1,):
                    print "'%s' already exists" % item['url']
                else:
                    self.cur.execute(sql)
                    self.conn.commit()
                    print 'insertion successful'
            except Exception as e:
                print u'database error:', e
        else:
            print 'nonnonono'
        # Return the item so any later pipeline still receives it.
        return item


class DownloadImagesPipeline(ImagesPipeline):

    def get_media_requests(self, item, info):
        if item['img_urls']:
            for img_url in item['img_urls']:
                yield Request(img_url, meta={'name': img_url})

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        return item

    def file_path(self, request, response=None, info=None):
        m = request.meta
        # Keep the source file name from the image URL, e.g. .../abc_01.jpg -> abc_01
        img_name = re.findall(r'/([a-z_0-9]+)\.(?:jpeg|jpg|png|bmp|gif)', m['name'], re.IGNORECASE)[-1]
        filename = 'full3/%s.jpg' % img_name
        return filename
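Since Redis already handles deduplication, the SELECT-before-INSERT above can go entirely; and if you want the database itself to enforce uniqueness, a cleaner route is a unique key on url plus a parameterized INSERT IGNORE, which also removes the hand-rolled quote escaping. A sketch, assuming a unique index on url; save_item is a hypothetical helper that would replace the try block in process_item:

INSERT_SQL = ("INSERT IGNORE INTO eastday VALUES "
              "(NULL, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")

def save_item(cur, conn, item):
    # pymysql escapes every parameter itself, so no manual quote handling;
    # INSERT IGNORE silently skips rows whose url already exists
    # (requires: ALTER TABLE eastday ADD UNIQUE KEY uniq_url (url);)
    cur.execute(INSERT_SQL, (item['title'], item['url'], item['tag'],
                             item['brief'], item['pubdate'], item['origin'],
                             item['crawled_time'], str(item['miniimg']),
                             str(item['img_urls']), item['article']))
    conn.commit()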
Here is items.py:
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class EastdayItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    url = scrapy.Field()
    tag = scrapy.Field()
    article = scrapy.Field()
    img_urls = scrapy.Field()
    crawled_time = scrapy.Field()
    pubdate = scrapy.Field()
    origin = scrapy.Field()

    brief = scrapy.Field()
    miniimg = scrapy.Field()


'''
class GuoneiItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    url = scrapy.Field()
    tag = scrapy.Field()
    article = scrapy.Field()
    img_urls = scrapy.Field()
    crawled_time = scrapy.Field()

    brief = scrapy.Field()
    miniimg = scrapy.Field()
'''
There are too many files to post one by one. The source files have been packaged and uploaded to the blog garden, but I can't find a place to share the file link; if you want the source code, leave a message in the comments.
This is the result as stored in MySQL:
Dongfang Toutiao itself aggregates content from other websites, newspapers, and periodicals, so the material is quite rich. Once you have crawled Dongfang Toutiao, you can quickly build a news app on top of the content.
Article images are saved under the source file name taken from the image link in the news page, which makes it easy for front-end developers to display the correct picture on the page.