python3简单爬虫并存入mysql数据库

网络爬虫是一种高效的信息采集器，利用它可以快速、准确地采集我们想要的各种数据资源。因此，可以说，网络爬虫技术几乎已成为大数据时代IT从业者的必修课程。

爬取当当网商品数据（图片，价格，作者）

（1）导入包

import requests
from bs4 import BeautifulSoup

BeautifulSoup 是 Python 的一个 HTML 解析库，可以用它来方便地从网页中提取数据。

（2）发送请求并打印状态码

# Add a request header: override User-Agent so the request looks like it
# comes from a regular browser.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
}
url = 'http://category.dangdang.com/cp01.19.34.00.00.00.html'
# requests applies no timeout by default, so an unresponsive server would
# block this call forever; fail after 10 seconds instead.
res = requests.get(url, headers=headers, timeout=10)
print(res.status_code)

添加headers伪装浏览器

（3）分析网页发现商品在<li>标签中

通过beautifulsoup查找到所有的商品

找到class为bigimg的<ul>标签

# Parse the downloaded HTML first: `soup` has to exist before it can be searched.
soup = BeautifulSoup(res.text, 'html.parser')
# Collect every <ul> element whose class is "bigimg".
data = soup.find_all('ul', attrs={'class': 'bigimg'})

（4）通过正则表达式取出需要的数据

# Regex with three capture groups per match: the data-original attribute of
# an <img> tag, the text inside <span class="search_now_price">, and the
# title attribute of a following <a> tag that contains the literal text
# 单品作者. re.DOTALL (the long name of re.S) lets "." match newlines too.
pertern = re.compile(
    r'<img.*?data-original="(.*?)"'
    r'.*?<span class="search_now_price">(.*?)</span>'
    r'.*?<a.*?单品作者.*?title="(.*?)">.*?</a>',
    flags=re.DOTALL,
)

生成正则模式

# Turn the find_all() result into a single string so the regex can scan it.
data = str(data)
# Run the compiled pattern over that string and keep every match.
item = pertern.findall(data)

对data进行强制类型转换（转为字符串）

匹配出数据

（5）存入数据库

def create():
    """(Re)create the EMPLOYER table in the TESTDB database.

    Connects to MySQL on localhost as user root, drops any existing
    EMPLOYER table, then creates it with the columns ID, LOGO, PRICE and
    AUTHER. The connection is closed even if a statement fails.
    """
    # Keyword arguments are required: PyMySQL 1.0+ no longer accepts
    # positional connection parameters.
    db = pymysql.connect(host="localhost", user="root", password="root",
                         database="TESTDB")

    sql = """CREATE TABLE EMPLOYER (
            ID INT PRIMARY KEY AUTO_INCREMENT,
            LOGO  CHAR(255),
            PRICE CHAR(20),
            AUTHER CHAR(255) )"""

    try:
        with db.cursor() as cursor:
            cursor.execute("DROP TABLE IF EXISTS EMPLOYER")
            cursor.execute(sql)
    finally:
        # Release the connection on both the success and the error path.
        db.close()

def insert(value):
    """Insert one row into the EMPLOYER table.

    Args:
        value: Sequence of three values bound, in order, to the LOGO,
            PRICE and AUTHER placeholders.

    A success or failure message is printed. On failure the transaction is
    rolled back and the exception is reported but not re-raised.
    """
    # Keyword arguments are required: PyMySQL 1.0+ no longer accepts
    # positional connection parameters.
    db = pymysql.connect(host="localhost", user="root", password="root",
                         database="TESTDB")

    # Must name the table that create() builds (EMPLOYER).
    sql = "INSERT INTO EMPLOYER(LOGO,PRICE,AUTHER) VALUES (%s, %s,  %s)"
    try:
        with db.cursor() as cursor:
            cursor.execute(sql, value)
        db.commit()
        print('插入数据成功')
    except Exception as exc:
        db.rollback()
        # Show the cause so a failed insert can be diagnosed.
        print("插入数据失败", exc)
    finally:
        # Close the connection even if commit or rollback raised.
        db.close()

首先创建表：如果已存在同名的表，则先将其删除，再重新创建。

插入数据。

（6）源码

import requests
from bs4 import BeautifulSoup
import re
import pymysql


def create():
    """(Re)create the EMPLOYER table in the TESTDB database.

    Connects to MySQL on localhost as user root, drops any existing
    EMPLOYER table, then creates it with the columns ID, LOGO, PRICE and
    AUTHER. The connection is closed even if a statement fails.
    """
    # Keyword arguments are required: PyMySQL 1.0+ no longer accepts
    # positional connection parameters.
    db = pymysql.connect(host="localhost", user="root", password="root",
                         database="TESTDB")

    sql = """CREATE TABLE EMPLOYER (
            ID INT PRIMARY KEY AUTO_INCREMENT,
            LOGO  CHAR(255),
            PRICE CHAR(20),
            AUTHER CHAR(255) )"""

    try:
        with db.cursor() as cursor:
            cursor.execute("DROP TABLE IF EXISTS EMPLOYER")
            cursor.execute(sql)
    finally:
        # Release the connection on both the success and the error path.
        db.close()

def insert(value):
    """Insert one row into the EMPLOYER table.

    Args:
        value: Sequence of three values bound, in order, to the LOGO,
            PRICE and AUTHER placeholders.

    A success or failure message is printed. On failure the transaction is
    rolled back and the exception is reported but not re-raised.
    """
    # Keyword arguments are required: PyMySQL 1.0+ no longer accepts
    # positional connection parameters.
    db = pymysql.connect(host="localhost", user="root", password="root",
                         database="TESTDB")

    sql = "INSERT INTO EMPLOYER(LOGO,PRICE,AUTHER) VALUES (%s, %s,  %s)"
    try:
        with db.cursor() as cursor:
            cursor.execute(sql, value)
        db.commit()
        print('插入数据成功')
    except Exception as exc:
        db.rollback()
        # Show the cause so a failed insert can be diagnosed.
        print("插入数据失败", exc)
    finally:
        # Close the connection even if commit or rollback raised.
        db.close()

create()  # drop and rebuild the EMPLOYER table before inserting

# Regex that pulls out the wanted fields; each match yields three groups:
# the <img> data-original value, the search_now_price text, and the title
# attribute of the <a> tag.
pertern = re.compile(
    r'<img.*?data-original="(.*?)".*?<span class="search_now_price">(.*?)</span>.*?<a.*?单品作者.*?title="(.*?)">.*?</a>',
    re.S)
# Add a request header: override User-Agent so the request looks like it
# comes from a regular browser.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'}
url = 'http://category.dangdang.com/cp01.19.34.00.00.00.html'
# requests applies no timeout by default, so an unresponsive server would
# block this call forever; fail after 10 seconds instead.
res = requests.get(url, headers=headers, timeout=10)
print(res.status_code)
soup = BeautifulSoup(res.text, 'html.parser')
# Collect every <ul> element whose class is "bigimg".
data = soup.find_all('ul', attrs={'class': 'bigimg'})
# Stringify the result so the regex can scan the markup.
data = str(data)
item = re.findall(pertern, data)
# Print each matched tuple and store it as one table row.
for i in item:
    print(i)
    insert(i)

python3简单爬虫并存入mysql数据库

python3简单爬虫并存入mysql数据库

猜你喜欢