版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/qq_37049050/article/details/84313620
在 spider 中定义 Item 及对应的操作:row 中的字段对应数据库表中的字段,table 为表名。数据库连接在爬虫启动时初始化,这里用到了 Scrapy 的信号机制(spider_opened 信号),不了解的可以自行查阅文档。
class UniversalRow(Item):
    """Generic item carrying one database row and the table it belongs to."""

    # Column values for a single row (assigned as a dict of column -> value).
    row = Field()
    # Name of the table the row is written to.
    table = Field()
class BDMonitor(Spider):
    """Spider that opens a database connection on start-up and builds
    ``UniversalRow`` items destined for a database table.
    """

    name = "bd"

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """Create the spider and hook ``spider_opened`` to the matching signal.

        Returns:
            The spider instance built by the parent ``from_crawler``.
        """
        spider = super(BDMonitor, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_opened, signals.spider_opened)
        # The caller uses the return value as the spider; omitting it hands
        # back None instead of the instance that was just wired up.
        return spider

    def spider_opened(self, spider):
        """Signal handler: open the data connection and keep it on the spider."""
        self.data_conn = MySQLConnection(settings['DATA_DB']).get_conn()

    def compose_item(self, table, item_tuple):
        """Build a ``UniversalRow`` for ``table`` with its bookkeeping columns.

        Args:
            table: Name of the destination table.
            item_tuple: Not used by the visible code.

        Returns:
            A ``UniversalRow`` whose ``row`` holds ``hash_label``,
            ``crawl_date``, ``source`` and ``version``.
        """
        # NOTE(review): `item_string` and `self.version` are not defined in
        # the visible snippet, and `item_tuple` is never read - presumably the
        # code deriving them was elided; confirm against the full spider.
        item = UniversalRow()
        item['table'] = table
        # Item fields have no default value, so the row dict has to be created
        # before any column can be stored in it.
        item['row'] = {}
        item['row']['hash_label'] = hashlib.md5(item_string.encode('utf8')).hexdigest()
        # Local date as an integer in YYYYMMDD form.
        item['row']['crawl_date'] = int(time.strftime('%Y%m%d', time.localtime()))
        item['row']['source'] = 'bd'
        item['row']['version'] = self.version + 1
        return item
接下来是pipelines 中的代码
class LoadDBPipeline(object):
    """Item pipeline that inserts each item's row into its target table."""

    def process_item(self, item, spider):
        """Insert ``item['row']`` into ``item['table']`` and pass the item on.

        Database warnings and errors are logged rather than raised, so a
        failed insert does not stop the crawl.

        Args:
            item: Item providing the ``table`` and ``row`` fields.
            spider: Spider whose ``data_conn`` attribute is the connection.

        Returns:
            The unchanged ``item``.
        """
        conn = spider.data_conn
        try:
            # Reconnect first if the connection has dropped.
            conn.ping(reconnect=True)
            try:
                db(conn).Insert(item['table'], item['row'])
                # Commit explicitly: what `with conn:` does on exit differs
                # between PyMySQL versions (commit/rollback in old releases,
                # close() in 1.0+), so it cannot be relied on to commit.
                conn.commit()
            except Exception:
                conn.rollback()
                raise
        except pymysql.Warning as w:
            logging.warning("Insert Warning:%s", w)
        except pymysql.Error as e:
            logging.error("Insert Error:%s", e)
        return item
然后在 settings 中启用该 pipeline:
# Enabled item pipelines, keyed by import path; the integer is the pipeline's
# order value.
ITEM_PIPELINES = {'jihuashu.pipelines.LoadDBPipeline': 10}
最后就是插入数据库代码
class db:
    """Small insert helper around an existing database connection.

    The connection must provide ``escape_string``, ``query`` and
    ``insert_id``. No commit is issued here; that is left to the caller.
    """

    def __init__(self, conn):
        """Store the connection used for every statement."""
        self.conn = conn

    @staticmethod
    def _quote_identifier(name):
        """Wrap a table/column name in backticks, doubling embedded ones."""
        return '`' + name.replace('`', '``') + '`'

    def _quote_value(self, value):
        """Render one Python value as an SQL literal."""
        if value is None:
            # Store a real NULL rather than the text 'None'.
            return 'NULL'
        if isinstance(value, list):
            # Elements are stringified so lists of numbers can be joined too.
            text = '|'.join(map(str, value))
        elif isinstance(value, dict):
            text = json.dumps(value, ensure_ascii=False)
        elif not isinstance(value, str):
            text = str(value)
        else:
            text = value
        return "'" + self.conn.escape_string(text) + "'"

    def Insert(self, table, data):
        """Insert one row into ``table``.

        Args:
            table: Name of the target table.
            data: Mapping of column name to value. Lists are stored as
                ``|``-joined text, dicts as JSON, ``None`` as NULL, and
                anything else via ``str()``.

        Returns:
            Whatever ``conn.insert_id()`` reports after the statement.
        """
        columns = list(data.keys())
        col_sql = ','.join(self._quote_identifier(col) for col in columns)
        val_sql = ','.join(self._quote_value(data[col]) for col in columns)
        qs = "INSERT INTO %s (%s) VALUES (%s)" % (
            self._quote_identifier(table), col_sql, val_sql)
        self.conn.query(qs)
        return self.conn.insert_id()
初始化时传入数据库连接对象;Insert 的参数中,table 为表名,data 为"列名 → 值"形式的数据字典。