python简单爬虫实例5之抓取猫眼网数据并存放数据库

继续在之前实例的基础上进行操作。

本次主要介绍SQLite的使用。

首先简单介绍一下:SQLite是一个内置数据库(Python标准库自带sqlite3模块),它以单个文件的方式存储数据。

1.创建数据库

数据库创建过程如下代码(有基础的道友可以轻松阅读,没有基础的建议预习一下数据库语言,在此不做详细介绍):

import sqlite3
import os

# Path of the SQLite database file passed to every sqlite3.connect() call below.
db_file = 'maoyan.db'

# 创建表
def create_table():
    """Create the ``movie`` table in the SQLite database at ``db_file``.

    The table has the columns ``id`` (auto-incrementing integer primary
    key), ``title``, ``star``, ``reltime`` (all text) and ``score`` (float).

    The statement uses ``if not exists``, so calling this function against a
    database that already contains the table is a no-op rather than an
    ``sqlite3.OperationalError``.
    """
    # 1. Connect to the database.
    conn = sqlite3.connect(db_file)
    try:
        # 2. Create the cursor that executes statements.
        cursor = conn.cursor()

        # 3. Execute the SQL statement.
        cursor.execute('''
            create table if not exists movie(
                id integer primary key autoincrement,
                title text,
                star text,
                reltime text,
                score float
            )
        ''')

        # 4. Commit; any operation that can modify the database must be
        #    committed.
        conn.commit()
    finally:
        # 5. Close the connection even when the statement above fails.
        conn.close()

def save(movie):
    """Insert one movie record into the ``movie`` table.

    Args:
        movie: Mapping read with ``.get`` for the keys ``title``, ``star``,
            ``time`` and ``score``; a missing key is stored as NULL.

    Raises:
        sqlite3.OperationalError: If the ``movie`` table does not exist.
    """
    # 1. Connect.
    conn = sqlite3.connect(db_file)
    try:
        # 2. Create the cursor that executes statements.
        cursor = conn.cursor()

        # 3. Execute the SQL statement. The column list must match the
        #    schema from create_table(), which has no ``country`` column.
        cursor.execute('''
            insert into movie
            (title, star, reltime, score)
            values
            (?, ?, ?, ?)
        ''', (movie.get('title'), movie.get('star'), movie.get('time'),
              movie.get('score')))

        # 4. Commit the insert.
        conn.commit()
    finally:
        # 5. Close the connection even when the insert fails.
        conn.close()


if __name__ == '__main__':
    # Create the table only when the database file does not exist yet.
    db_exists = os.path.exists(db_file)
    if not db_exists:
        create_table()

    # Sample record; uncomment the save() call below to insert it.
    movie = {
        'title': '霸王别姬',
        'star': '主演:张国荣,张丰毅,巩俐',
        'time': '上映时间:1993-01-01',
        'score': 9.6,
    }
    # save(movie)

执行完代码后,文件夹中会生成maoyan.db,打开后发现无法查看内容。

按如下操作执行:1) PyCharm 右侧点击Database


                               2) 点击 + 弹出菜单中, 选择Data Source, 再选择sqlite


                               3)  如果有 Download missing driver files, 点击 Download


                               4)  选择 file 选择需要打开的数据库文件 (提前查询好创建的db文件路径)


                               5) 完成以上操作即可。

效果如图所示:

2.向数据库中存放数据

代码如下:

from bs4 import BeautifulSoup
from urllib.request import urlopen
import sqlite3
import os
import re
# Path of the SQLite database file passed to sqlite3.connect() in save().
db_file = 'maoyan.db'
def save(movie):
    """Insert one scraped movie record into the ``movie`` table.

    Args:
        movie: Mapping read with ``.get`` for the keys ``name``, ``star``,
            ``releasetime`` and ``score``; a missing key is stored as NULL.

    Raises:
        sqlite3.OperationalError: If the ``movie`` table does not exist.
    """
    # 1. Connect.
    conn = sqlite3.connect(db_file)
    try:
        # 2. Create the cursor that executes statements.
        cursor = conn.cursor()

        # 3. Execute the SQL statement. The movie name goes into the
        #    ``title`` column: the table is created with
        #    (id, title, star, reltime, score) and has no ``name`` column.
        cursor.execute('''
            insert into movie
            (title, star, reltime, score)
            values
            (?, ?, ?, ?)
        ''', (movie.get('name'), movie.get('star'), movie.get('releasetime'),
              movie.get('score')))

        # 4. Commit the insert.
        conn.commit()
    finally:
        # 5. Close the connection even when the insert fails.
        conn.close()
def get_one_page(x):
    """Download one Maoyan board page and return its HTML as text.

    Args:
        x: Page index; the request is made with ``offset = x * 10``.

    Returns:
        The response body decoded with ``bytes.decode()`` (UTF-8 default).
    """
    # str.format fills the {} placeholder with the computed offset.
    # Equivalent %-style form: 'https://maoyan.com/board/4?offset=%d' % (x*10)
    url = 'https://maoyan.com/board/4?offset={}'.format(x * 10)
    # The with-block closes the HTTP response even if read()/decode() raises.
    with urlopen(url) as response:
        return response.read().decode()

def get_film(html):
    """Parse one board page into a list of movie dicts.

    Elements are selected by the CSS classes ``.name``, ``.star``,
    ``.releasetime``, ``.integer`` and ``.fraction`` and combined
    positionally with ``zip``, so the result is as long as the shortest of
    the five selections.

    Args:
        html: HTML text of the page.

    Returns:
        A list of dicts with the keys ``name``, ``star``, ``releasetime``
        and ``score`` (all strings).
    """
    soup = BeautifulSoup(html, 'html.parser')
    name_tags = soup.select('.name')
    star_tags = soup.select('.star')
    time_tags = soup.select('.releasetime')
    integer_tags = soup.select('.integer')
    fraction_tags = soup.select('.fraction')

    return [
        {
            'name': name_tag.get_text(),
            'star': star_tag.get_text().strip(),
            'releasetime': time_tag.get_text(),
            # The score is split over two elements; join their texts.
            'score': integer_tag.get_text() + fraction_tag.get_text(),
        }
        for name_tag, star_tag, time_tag, integer_tag, fraction_tag in zip(
            name_tags, star_tags, time_tags, integer_tags, fraction_tags)
    ]


if __name__ == '__main__':
    # Scrape pages 0-9 and gather every parsed movie into one list.
    movie_list = []
    for page in range(10):
        movie_list.extend(get_film(get_one_page(page)))

    # Show what was scraped before it is written to the database.
    print(movie_list)

    # Persist each record in SQLite.
    for record in movie_list:
        save(record)

该代码在数据库文件(以及其中的movie表)已经建立的情况下可以直接运行。

效果图如下:

3.数据库查询

编写查询模块代码:

def find_by_title(key):
    """Return every movie whose title contains ``key``.

    Args:
        key: Substring to search for. It is wrapped in ``%`` wildcards and
            bound as a parameter of a SQL ``like`` condition.

    Returns:
        A list with one dict per matching row, with the keys ``id``,
        ``name``, ``star``, ``reltime`` and ``score``. The list is empty
        when nothing matches.
    """
    # 1. Connect.
    conn = sqlite3.connect(db_file)
    try:
        # 2. Create the cursor that executes statements.
        cursor = conn.cursor()

        # 3. Run the query. Columns are listed explicitly so the positional
        #    indexes below do not depend on the table's column order. The
        #    filter uses ``title`` because the table is created with
        #    (id, title, star, reltime, score) and has no ``name`` column.
        result = cursor.execute('''
            select id, title, star, reltime, score from movie
            where title like ?
        ''', ('%' + key + '%',))

        # 4. A query does not need a commit. The ``name`` key is kept in
        #    the returned dicts for callers that already read it.
        return [
            {
                'id': row[0],
                'name': row[1],
                'star': row[2],
                'reltime': row[3],
                'score': row[4],
            }
            for row in result
        ]
    finally:
        # 5. Close the connection even when the query fails.
        conn.close()

完整代码如下:

from bs4 import BeautifulSoup
from urllib.request import urlopen
import sqlite3
import os
import re
# Path of the SQLite database file passed to every sqlite3.connect() call below.
db_file = 'maoyan.db'
def find_by_title(key):
    """Return every movie whose title contains ``key``.

    Args:
        key: Substring to search for. It is wrapped in ``%`` wildcards and
            bound as a parameter of a SQL ``like`` condition.

    Returns:
        A list with one dict per matching row, with the keys ``id``,
        ``name``, ``star``, ``reltime`` and ``score``. The list is empty
        when nothing matches.
    """
    # 1. Connect.
    conn = sqlite3.connect(db_file)
    try:
        # 2. Create the cursor that executes statements.
        cursor = conn.cursor()

        # 3. Run the query. Columns are listed explicitly so the positional
        #    indexes below do not depend on the table's column order. The
        #    filter uses ``title`` because the table is created with
        #    (id, title, star, reltime, score) and has no ``name`` column.
        result = cursor.execute('''
            select id, title, star, reltime, score from movie
            where title like ?
        ''', ('%' + key + '%',))

        # 4. A query does not need a commit. The ``name`` key is kept in
        #    the returned dicts for callers that already read it.
        return [
            {
                'id': row[0],
                'name': row[1],
                'star': row[2],
                'reltime': row[3],
                'score': row[4],
            }
            for row in result
        ]
    finally:
        # 5. Close the connection even when the query fails.
        conn.close()
def save(movie):
    """Insert one scraped movie record into the ``movie`` table.

    Args:
        movie: Mapping read with ``.get`` for the keys ``name``, ``star``,
            ``releasetime`` and ``score``; a missing key is stored as NULL.

    Raises:
        sqlite3.OperationalError: If the ``movie`` table does not exist.
    """
    # 1. Connect.
    conn = sqlite3.connect(db_file)
    try:
        # 2. Create the cursor that executes statements.
        cursor = conn.cursor()

        # 3. Execute the SQL statement. The movie name goes into the
        #    ``title`` column: the table is created with
        #    (id, title, star, reltime, score) and has no ``name`` column.
        cursor.execute('''
            insert into movie
            (title, star, reltime, score)
            values
            (?, ?, ?, ?)
        ''', (movie.get('name'), movie.get('star'), movie.get('releasetime'),
              movie.get('score')))

        # 4. Commit the insert.
        conn.commit()
    finally:
        # 5. Close the connection even when the insert fails.
        conn.close()
def get_one_page(x):
    """Download one Maoyan board page and return its HTML as text.

    Args:
        x: Page index; the request is made with ``offset = x * 10``.

    Returns:
        The response body decoded with ``bytes.decode()`` (UTF-8 default).
    """
    # str.format fills the {} placeholder with the computed offset.
    # Equivalent %-style form: 'https://maoyan.com/board/4?offset=%d' % (x*10)
    url = 'https://maoyan.com/board/4?offset={}'.format(x * 10)
    # The with-block closes the HTTP response even if read()/decode() raises.
    with urlopen(url) as response:
        return response.read().decode()

def get_film(html):
    """Parse one board page into a list of movie dicts.

    Elements are selected by the CSS classes ``.name``, ``.star``,
    ``.releasetime``, ``.integer`` and ``.fraction`` and combined
    positionally with ``zip``, so the result is as long as the shortest of
    the five selections.

    Args:
        html: HTML text of the page.

    Returns:
        A list of dicts with the keys ``name``, ``star``, ``releasetime``
        and ``score`` (all strings).
    """
    soup = BeautifulSoup(html, 'html.parser')
    name_tags = soup.select('.name')
    star_tags = soup.select('.star')
    time_tags = soup.select('.releasetime')
    integer_tags = soup.select('.integer')
    fraction_tags = soup.select('.fraction')

    return [
        {
            'name': name_tag.get_text(),
            'star': star_tag.get_text().strip(),
            'releasetime': time_tag.get_text(),
            # The score is split over two elements; join their texts.
            'score': integer_tag.get_text() + fraction_tag.get_text(),
        }
        for name_tag, star_tag, time_tag, integer_tag, fraction_tag in zip(
            name_tags, star_tags, time_tags, integer_tags, fraction_tags)
    ]


if __name__ == '__main__':
    # Scrape pages 0-9 and gather every parsed movie into one list.
    movie_list = []
    for page in range(10):
        movie_list.extend(get_film(get_one_page(page)))

    # Show what was scraped before it is written to the database.
    print(movie_list)

    # Persist each record in SQLite.
    for record in movie_list:
        save(record)

    # Query the stored rows for names containing the given character.
    print(find_by_title('王'))

猜你喜欢

转载自blog.csdn.net/RHJlife/article/details/87630216