Scrapy: inserting into a MySQL database (multiple tables)

Copyright notice: this is the author's original article and may not be reposted without permission. https://blog.csdn.net/haoyuexihuai/article/details/82846381

MySQL reference:
Saving data to a MySQL database: writing crawlers with Scrapy (part 2)
Make sure the following packages are installed:

pip install pymysql
pip install Twisted
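
pymysql is the MySQL driver, and Twisted (a core Scrapy dependency) also provides twisted.enterprise.adbapi, which is a common way to run blocking database inserts in a thread pool without stalling the crawl. A minimal sketch of building such a connection pool, purely for illustration (all connection parameters are placeholders, not the author's settings):

from twisted.enterprise import adbapi

# run blocking pymysql calls in Twisted's thread pool so the reactor
# (and therefore the crawl) is never blocked by a slow INSERT
dbpool = adbapi.ConnectionPool(
    'pymysql',            # name of the DB-API module adbapi should import
    host='localhost',
    user='root',
    password='secret',
    db='scrapy_demo',
    charset='utf8mb4',
)

A plain synchronous pymysql connection also works; adbapi only matters once insert latency becomes a bottleneck.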

The requirement is a master/child insert: each master record has several child rows, and the child data comes back as JSON. The approach (this snippet lives inside the spider's parse method; json, time and BeautifulSoup must be imported at the top of the module) is:

for page in range(0,int(pages)):
           
    # get every row of the table, using find_elements_by_xpath
    included_names = self.driver.find_elements_by_xpath('/html/body/div[5]/div[2]/div/div')
       
    item['titles']="" 
    item['projId']="" 
    item['projCode']="" 
    item['projName']="" 
    item['itemType']=""
    titles=""
    # iterate over the collected divs
    for i in included_names:
        # get the child node of this element by prefixing the xpath with "."
        title=i.find_element_by_xpath('./div[4]/div[2]/div/a').text
        print(title) 
        projId=i.find_element_by_xpath('./div[4]/div[2]/div/a').get_attribute('id')
        
        # 1. main table: open the detail page in a new browser window via selenium
        href_main="https://XXXXXXXXXX?projId="+projId
        newwindow  = 'window.open("'+href_main+'");'
        self.driver.execute_script(newwindow)
      
        # switch the window handle so we operate on the newly opened page
        self.driver.switch_to.window(self.driver.window_handles[1])
        self.driver.get(href_main)
        html = self.driver.page_source
        # print(html)
        
        soup = BeautifulSoup(html, 'lxml')   
        json_text = soup.find('pre')
        if json_text is not None and json_text.get_text().strip():
            # print(json_text.get_text())
            text = json.loads(json_text.get_text())
            print("项目ID:"+text['projId'])
            print("项目编号:"+text['projCode'])
            projCode=text['projCode']
            print("项目名称:"+text['projName'])
            projName=text['projName']                 
            item['projId']=projId 
            item['projCode']=projCode 
            item['projName']=projName 
            

            # 2. project child table 1 (contract info)
            href_sub1="https://xxxxxxxxxx?projId="+projId
            self.driver.get(href_sub1)
            html = self.driver.page_source
            soup = BeautifulSoup(html, 'lxml')   
            json_text = soup.find('pre')
            if json_text is not None and json_text.get_text().strip():
                texts = json.loads(json_text.get_text())
                # parse multiple rows
                for text in texts:
                    print("合同ID:"+text['contractId'])
                    print("合同编号:"+text['contractCode'])
                    print("合同名称:"+text['contractName'])
                   
                    item['contractId']=text['contractId']
                    item['contractCode']=text['contractCode'] 
                    item['projId']=projId 
                    # item['itemType'] tells the pipeline which insert statement to run
                    item['itemType']="sub1"
                    # ----------- run only the insert for sub1 -------------
                    yield item 

            # 3. project child table 2 (order info)
            href_sub2="https://xxxxxxxxxxxx?projCode="+projCode
            self.driver.get(href_sub2)
            html = self.driver.page_source                
            soup = BeautifulSoup(html, 'lxml')   
            json_text = soup.find('pre')
            if json_text is not None and json_text.get_text().strip():
                texts = json.loads(json_text.get_text())
                # parse multiple rows
                for text in texts:
                    print("订单编号:"+text['orderCode'])                            
                    print("合同编号:"+str(text['contractCode']))
                    print("项目编号:"+text['projectCode'])
                   
                    item['orderCode']=text['orderCode']
                    item['contractCode']=text['contractCode'] 
                    item['projectCode']=projCode 
                    # ----------- run only the insert for sub2 -----------
                    item['itemType']="sub2"
                    yield item 
        else:                
            print("null")
           
        item['itemType']="main"
        # ------- run only the insert for main -------
        yield item 
               
        time.sleep(1)  # pause only for visual effect; can be removed
        # close the newly opened page
        self.driver.close()
        self.driver.switch_to.window(self.driver.window_handles[0])
    time.sleep(3)  # pause only for visual effect; can be removed
    print("page:"+str(page))
    if page != int(pages) - 1:
        # click "next page"
        self.driver.find_element_by_xpath('/html/body/div[6]/div/div[1]/div[2]/ul/li[4]/button').click()
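
The item keys used above (titles, projId, projCode, projName, itemType, contractId, contractCode, orderCode, projectCode) have to be declared on the Item class in items.py. The original post does not show that file; here is a minimal sketch (the class name ProjectItem is an assumption):

import scrapy

class ProjectItem(scrapy.Item):
    # itemType is "main", "sub1" or "sub2" and tells the pipeline which table to insert into
    itemType = scrapy.Field()
    # main table fields
    titles = scrapy.Field()
    projId = scrapy.Field()
    projCode = scrapy.Field()
    projName = scrapy.Field()
    # child table 1: contract info
    contractId = scrapy.Field()
    contractCode = scrapy.Field()
    # child table 2: order info
    orderCode = scrapy.Field()
    projectCode = scrapy.Field()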

Rewrite the corresponding method in the pipeline:

from XX.db.dbhelper import DBHelper
class XXXXPipeline(object):

    # connect to the database
    def __init__(self):
        self.db = DBHelper()
    
    def process_item(self, item, spider):
        # dispatch to the right insert based on itemType
        if item['itemType']=="main":
            self.db.insertmain(item)
        elif item['itemType']=="sub1":
            self.db.insertsub1(item)
        elif item['itemType']=="sub2":
            self.db.insertsub2(item)
        return item
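
The DBHelper imported from XX.db.dbhelper is not shown in the original post. A minimal synchronous sketch with pymysql follows; the connection parameters and every table/column name are illustrative assumptions, not the author's actual schema:

import pymysql

class DBHelper(object):
    def __init__(self):
        # connection parameters are placeholders
        self.conn = pymysql.connect(host='localhost', user='root', password='secret',
                                    db='scrapy_demo', charset='utf8mb4')

    def _execute(self, sql, params):
        # run one INSERT and commit immediately
        with self.conn.cursor() as cursor:
            cursor.execute(sql, params)
        self.conn.commit()

    def insertmain(self, item):
        sql = "INSERT INTO proj_main (proj_id, proj_code, proj_name) VALUES (%s, %s, %s)"
        self._execute(sql, (item['projId'], item['projCode'], item['projName']))

    def insertsub1(self, item):
        sql = "INSERT INTO proj_contract (contract_id, contract_code, proj_id) VALUES (%s, %s, %s)"
        self._execute(sql, (item['contractId'], item['contractCode'], item['projId']))

    def insertsub2(self, item):
        sql = "INSERT INTO proj_order (order_code, contract_code, project_code) VALUES (%s, %s, %s)"
        self._execute(sql, (item['orderCode'], item['contractCode'], item['projectCode']))

Finally, the pipeline has to be enabled in settings.py, with the real project/module name in place of XX:

ITEM_PIPELINES = {
    'XX.pipelines.XXXXPipeline': 300,
}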
