import re
class CsdnPipeline(object):
    """Item pipeline that cleans up the ``tag`` field of a scraped item."""

    # Matches runs of whitespace and literal "/" characters, both of which
    # are stripped from every tag. Compiled once instead of on every call.
    _TAG_NOISE = re.compile(r"\s+|/")

    def process_item(self, item, spider):
        """Normalise ``item["tag"]`` and hand the item on.

        Every entry of ``item["tag"]`` has all whitespace and ``/``
        characters removed; entries that end up empty, or that are just the
        ``标签:`` label, are discarded.

        Args:
            item: Mapping-like object with a ``"tag"`` key holding an
                iterable of strings.
            spider: The spider that produced the item (unused here).

        Returns:
            The same ``item`` object, with its ``"tag"`` value replaced by
            the cleaned list.
        """
        # NOTE: the previous code passed ``re.S`` as the 4th positional
        # argument of ``re.sub``, which is ``count`` -- that silently limited
        # the substitution to 16 matches per string. No flag is needed for
        # this pattern, so none is passed.
        stripped = [self._TAG_NOISE.sub("", raw) for raw in item["tag"]]
        item["tag"] = [tag for tag in stripped if tag and tag != '标签:']
        print(item)
        # The item must be returned; otherwise the method yields ``None``
        # to whatever consumes the pipeline's result.
        return item
# In settings.py:
# Name of this Scrapy project / bot.
BOT_NAME = "csdn"

# Packages searched for spider classes, and the package that receives
# newly generated spiders.
SPIDER_MODULES = ["csdn.spiders"]
NEWSPIDER_MODULE = "csdn.spiders"

# User-Agent header value sent with requests: a desktop Chrome 61 on macOS
# identification string.
USER_AGENT = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_0) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/61.0.3163.100 Safari/537.36"
)

# robots.txt rules are not honoured by this crawler.
ROBOTSTXT_OBEY = False