Selected Case Study: Crawl Every Image from Meishijie (meishij.net), Batch-Download Them, and Store the Records in a Database
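
The script below crawls the recipe listing pages of meishij.net: it requests each page with urllib, parses the image tags with BeautifulSoup, downloads every picture into a per-page folder under image/, and records each image's file name and source URL in a local SQLite database.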

import shutil, os  # high-level file operations and path/directory handling
from bs4 import BeautifulSoup
from urllib.request import Request,urlopen,urlretrieve
import sqlite3

class Image_downLoad(object):
    def __init__(self):
        self.base_url='https://www.meishij.net/chufang/diy/?&page=1'
        self.current_page=1
    def stat_downLoad(self):
        # If the target folder already exists, remove it first
        if os.path.exists('image'):
            # recursively delete the folder tree, ignoring errors
            shutil.rmtree('image', True)
        os.makedirs('image')  # create a fresh folder
        os.chdir('image')  # change into the folder so downloads land inside it
        self.get_page_code_with_url(self.base_url)
    def get_page_code_with_url(self,full_url):
        headers={
            'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
        }
        request = Request(full_url,headers=headers)
        try:
            response=urlopen(request)
            code=response.read().decode()
        except Exception as e:
            print('Request failed:', e)
        else:
            self.get_data_with_code(code)
    def get_data_with_code(self,code):
        print('Downloading page {}'.format(self.current_page))
        soup=BeautifulSoup(code,'lxml')
        page_name='Page{}'.format(self.current_page)
        os.mkdir(page_name)  # one sub-folder per page
        os.chdir(page_name)  # save this page's images inside it
        image_list = soup.select('div.listtyle1_list div a img')  # image tags on this page
        talk_list = soup.select('div.listtyle1_list div.c1 span')  # matching label spans
        tack_list1 = []
        for tack in talk_list:
            tack = tack.text[-8:-3]  # fixed slice of the label text, used in the file names
            tack_list1.append(tack)
        for i in range(len(image_list)):
            image = image_list[i]
            image_src = image.get('src')  # image URL
            image_alt = image.get('alt')  # image title text
            num = tack_list1[i]
            # build the file name from the title plus the sliced label
            image_alt = image_alt.split('(')[0] + str(num) + '.jpg'
            print(image_alt, image_src)
            urlretrieve(image_src, image_alt)  # download the image into the current folder
            Sql.insert_info_to_table(image_alt, image_src)


        os.chdir(os.path.pardir)  # move back up to the parent folder
        self.current_page += 1
        self.get_next_page(code)  # look for the next-page link in this page's source

    def get_next_page(self, code):
        soup = BeautifulSoup(code, 'lxml')  # re-parse the current page
        next_links = soup.select('div.listtyle1_page_w a.next')
        if not next_links:  # no "next" link means the last page has been reached
            return
        url = next_links[0].get('href')  # href of the next-page link
        self.get_page_code_with_url(url)
class Sql(object):
    connect = None
    cursor = None
    @classmethod
    def create_db_and_table(cls):  # create the database file and the table
        cls.connect = sqlite3.connect('msjDB')
        cls.cursor = cls.connect.cursor()
        cls.cursor.execute(
            'create table if not exists qbTable (name text,src text)')
        cls.connect.commit()
    @classmethod
    def insert_info_to_table(cls, image_alt, image_src):
        # parameterized query avoids breaking on quotes in the image name
        cls.cursor.execute('insert into qbTable (name, src) values (?, ?)',
                           (image_alt, image_src))
        cls.connect.commit()
    @classmethod
    def close_db(cls):
        cls.cursor.close()
        cls.connect.close()

Sql.create_db_and_table()
downLoad=Image_downLoad()
downLoad.stat_downLoad()
Sql.close_db()  # note: the site has since been changed, so this code no longer works as-is
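
After a run, you can check what was written to the database with a short query script. The sketch below assumes the msjDB file and qbTable table created by the code above, and that it is run from the same directory the crawler was started in (the database file is created there, before the script changes into the image folder). It simply prints every stored (name, src) pair.

import sqlite3

# Open the database produced by the crawler and list the stored records.
connect = sqlite3.connect('msjDB')
cursor = connect.cursor()
cursor.execute('select name, src from qbTable')
for name, src in cursor.fetchall():
    print(name, src)
cursor.close()
connect.close()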

Reprinted from blog.csdn.net/qq_38059635/article/details/81229167