建立MongoDB服务:
打开MongoDB的下载路径,进入bin文件夹下:
mongod --dbpath F:\mongod\data\db
另启一个命令行窗口(当前窗口不要关闭),进入bin文件夹下:
mongo
法一:
爬虫文件:
# Spider that scrapes a university-ranking table and stores it in MongoDB.
import bs4
import scrapy
from bs4 import BeautifulSoup
from pymongo import MongoClient


class UniversityRankSpider(scrapy.Spider):
    """Scrape the university ranking table from xdf.cn into MongoDB."""

    name = "university-rank"  # spider name used by `scrapy crawl`
    start_urls = ['http://gaokao.xdf.cn/201702/10612921.html',]  # page holding the ranking table

    def parse(self, response):
        """Parse the ranking table and insert one document per row into MongoDB.

        NOTE(review): assumes the first <tbody> on the page is the ranking
        table and that real data rows begin at the third <tr> — confirm
        against the live page.
        """
        content = response.xpath("//tbody").extract()[0]
        soup = BeautifulSoup(content, "lxml")  # re-parse the extracted fragment with BeautifulSoup
        table = soup.find('tbody')

        count = 0
        rows = []  # one [rank, university, ...] list per data row
        for tr in table.children:
            if isinstance(tr, bs4.element.Tag):
                td = tr('td')
                if count >= 2:  # skip the header rows
                    rows.append([
                        td[i]('p')[0].string.replace('\n', '').replace('\t', '')
                        for i in range(8)
                    ])
                count += 1

        conn = MongoClient('mongodb://localhost:27017/')  # connect to local MongoDB
        db = conn.testdb
        for item in rows:
            # insert_one replaces the deprecated Collection.insert API;
            # the resulting documents are identical to the original code's.
            db.university_rank.insert_one({
                'rank': '%s' % item[0],
                'university': '%s' % item[1],
                'address': '%s' % item[2],
                'local_rank': '%s' % item[3],
                'total grade': '%s' % item[4],
                'type': '%s' % item[5],
                'star rank': '%s' % item[6],
                'class': '%s' % item[7],
            })
        # print() as a function so the script also runs on Python 3
        print('Successfully downloading data from website, and write it to mongodb database!')
法二:
# Scrapy pipeline that writes scraped items into a MongoDB collection.
import pymongo
# NOTE(review): scrapy.conf is deprecated; modern Scrapy exposes settings
# via from_crawler(cls, crawler) — migrate when upgrading Scrapy.
from scrapy.conf import settings


class BankPipeline(object):
    """Persist each scraped item as a document in MongoDB."""

    def __init__(self):
        # Connect to the MongoDB server configured in the project settings.
        self.client = pymongo.MongoClient(
            host=settings['MONGO_HOST'],
            port=settings['MONGO_PORT'],
        )
        # Uncomment to authenticate against MongoDB with a user/password
        # (original had a typo: MINGO_USER -> MONGO_USER):
        # self.client.admin.authenticate(settings['MONGO_USER'], settings['MONGO_PSW'])
        # Handles for the target database and collection.
        self.db = self.client[settings['MONGO_DB']]
        self.coll = self.db[settings['MONGO_COLL']]

    def process_item(self, item, spider):
        """Insert the item into MongoDB and pass it on down the pipeline."""
        postItem = dict(item)
        # insert_one replaces the deprecated Collection.insert API
        self.coll.insert_one(postItem)
        return item