1. Create a Scrapy project from the cmd command line
Change to the directory where you want to create the project and run:
scrapy startproject <project_name>
For example, I created the Tutorial project under the G: drive.
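For reference, the command generates Scrapy's default project skeleton. A sketch of the layout, assuming the project is named tutorial (lowercase, to match the import paths used in the code below); the exact file list varies slightly between Scrapy versions:

G:\> scrapy startproject tutorial

tutorial/
    scrapy.cfg           # deploy/configuration file
    tutorial/            # the project's Python module
        __init__.py
        items.py         # item definitions (TeamInfoItem goes here)
        pipelines.py
        settings.py
        spiders/         # the spider files (dmoz_spider.py) go here
            __init__.py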
2. Scrape the data with the Scrapy framework
(1) Create a new dmoz_spider.py file under the spiders folder; this is the main spider file.
The code is as follows:
import sys
sys.path.append('tutorial')
import re

from tutorial.sqlHelper import DBHelper
from tutorial.items import TeamInfoItem
from scrapy.spiders import Spider
from scrapy.selector import HtmlXPathSelector  # deprecated alias of Selector in recent Scrapy releases


class DmozSpider(Spider):
    name = "dmoz"
    allowed_domains = ["nba.hupu.com"]
    start_urls = [
        "https://nba.hupu.com/teams"
    ]

    def parse(self, response):
        html = HtmlXPathSelector(response)
        # Database connection helper (see sqlHelper.py below)
        db = DBHelper('call', 'hdgjs@888', '192.168.1.205:6996', 'CRM_SJH')
        sql = "insert into TeamInfo (TeamName, Location) VALUES (%s, %s)"
        TeamList = []
        # Grab every <div class="all"> block, one per division
        for each in html.xpath("//div[@class='all']").extract():
            # Use a regex to pull out the division the block's teams belong to
            result = re.findall('<div.*?<span class="">(.*?)</span>', each, re.S)
            # Pull out the team names
            tresult = re.findall('<h2>(.*?)</h2>', each, re.S)
            for rlt in tresult:
                item = (rlt, result[0])
                TeamList.append(item)
        # Write all teams to the database in one batch
        db.write(sql, TeamList)
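The spider imports TeamInfoItem from tutorial/items.py, but the post never shows that file (and the spider actually builds plain tuples rather than populating the item). A minimal sketch of what items.py would contain, assuming field names that mirror the TeamName and Location columns of the INSERT statement:

import scrapy

class TeamInfoItem(scrapy.Item):
    # Assumed fields; the original items.py is not shown in the post.
    TeamName = scrapy.Field()  # team name, taken from the <h2> tag
    Location = scrapy.Field()  # division, taken from the <span class=""> tag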
(2) The SQL Server connection helper class sqlHelper.py
The code is as follows:
import pymssql


class DBHelper:
    def __init__(self, user=None, passwd=None, host=None, database=None):
        try:
            self.connection = pymssql.connect(host=host, user=user, password=passwd, database=database)
            self.cursor = self.connection.cursor()
        except Exception:
            print("Could not connect to DB server.")
            exit(1)

    def __del__(self):
        self.cursor.close()
        self.connection.close()

    def read(self, Sql, param=None):
        '''Execute a SELECT statement and return the rows as a tuple
        (use len() on the result to get the row count).
        Parameterized usage:
            Sql = "select * from table where col=%s and col1=%s"
            param = (value1, value2)
        '''
        try:
            cursor = self.connection.cursor()
            if param is None:
                cursor.execute(Sql)
            else:
                cursor.execute(Sql, param)
            rs = cursor.fetchall()
            cursor.close()
        except Exception as e:
            print(e)
            rs = ()
        return rs

    def write(self, sql, param, iscommit=True):
        '''Execute the statement once per tuple in param (a list of tuples).'''
        try:
            cursor = self.connection.cursor()
            print(sql)
            cursor.executemany(sql, param)
            n = cursor.rowcount  # executemany() itself returns None, so report rowcount
            if iscommit:
                self.connection.commit()
            return n
        except Exception as e:
            print(e)
            self.connection.rollback()
            return -1

    def writeOneRecord(self, sql):
        '''Execute a single INSERT and return the identity of the new row.'''
        try:
            cursor = self.connection.cursor()
            cursor.execute(sql)
            self.connection.commit()
            return int(cursor.lastrowid)
        except Exception:
            self.connection.rollback()
            return -1
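A quick usage sketch for DBHelper; the server address, account, and rows below are placeholders, not the values used in the spider:

from tutorial.sqlHelper import DBHelper

# Placeholder connection details; substitute your own SQL Server instance.
db = DBHelper('sa', 'your_password', '127.0.0.1:1433', 'CRM_SJH')

# Parameterized SELECT: each %s is filled from the param tuple.
rows = db.read("select TeamName, Location from TeamInfo where Location = %s", ('Atlantic',))
print(len(rows), "rows found")

# Batch INSERT: the statement runs once per tuple in the list.
db.write("insert into TeamInfo (TeamName, Location) VALUES (%s, %s)",
         [('Celtics', 'Atlantic'), ('Lakers', 'Pacific')])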
(3) Run the crawler from cmd via main.py:
The code is as follows:
from scrapy import cmdline

cmdline.execute("scrapy crawl dmoz".split())
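Note that main.py has to be run from inside the project (Scrapy locates the project through scrapy.cfg); it simply invokes the scrapy crawl dmoz command programmatically, which makes it easy to start or debug the spider from an IDE instead of typing the command in cmd each time.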