如何在python3中将网页爬虫数据存储到mysql数据库

前两篇文章都在说在py中用BeautifulSoup爬取本地网页的事情,本来准备去真实网页试一下的,但是老林说不如把你之前学的mysql数据库温习一下,顺道学着把你现在爬到的网页数据存取到mysql数据库之中~
由此 本文的主题就出现了:

如何在python3中将网页爬虫数据存储到mysql数据库

先小小插播一下:为何标题强调python3!
因为py2与py3连接数据库时用的不是一个库!
PyMySQL 是在 Python3.x 版本中用于连接 MySQL 服务器的一个库,
Python2中则使用mysqldb。

from bs4 import BeautifulSoup
import pymysql

#本地网页爬取数据
#即上一篇文章所学知识
def getData():
  """Scrape house listings from a local HTML file.

  Returns a list of dicts, one per listing, with keys:
    'address' -- the listing image's src attribute
    'price'   -- price text from the caption
    'title'   -- listing title text
    'amount'  -- review-count text (first stripped string)
    'stars'   -- number of filled star icons (int)
  """
  datalist = []
  # NOTE(review): hard-coded local path from the tutorial; will fail on other machines.
  with open('D:/Study/Data Analysis/week1/1_2/1_2answer_of_homework/1_2_homework_required/index.html',
            'r', encoding='utf-8') as wb_data:
    Soup = BeautifulSoup(wb_data, 'lxml')
    # CSS selectors target the bootstrap listing cards in the sample page.
    addresses = Soup.select('body > div > div > div.col-md-9 > div > div > div > img')
    prices = Soup.select('body > div > div > div.col-md-9 > div > div > div > div.caption > h4.pull-right')
    titles = Soup.select('body > div > div > div.col-md-9 > div > div > div > div.caption > h4 > a')
    amounts = Soup.select('body > div > div > div.col-md-9 > div > div > div > div.ratings > p.pull-right')
    star_blocks = Soup.select('body > div > div > div.col-md-9 > div > div > div > div.ratings > p:nth-of-type(2)')
    print("getData--end")
    print('start-print-data')
    # Distinct loop-variable names so we don't shadow the selection lists above.
    for address, price, title, amount, stars in zip(addresses, prices, titles, amounts, star_blocks):
      data = {
        'address': address.get('src'),
        'price': price.get_text(),
        'title': title.get_text(),
        'amount': list(amount.stripped_strings)[0],
        # Star rating = count of filled glyphicon stars inside the ratings block.
        'stars': len(stars.find_all("span", class_='glyphicon glyphicon-star')),
      }
      print(data)
      datalist.append(data)
    print('end-print-data')
    return datalist


#数据库中创建新表用以存储
def mysql_create():
  """(Re)create the `schoolsheet` table in the local `school` database.

  Drops any existing `schoolsheet` table first, then creates a fresh one.
  The connection is always closed, even if a statement fails.
  """
  mysql_host = 'localhost'
  mysql_db = 'school'
  mysql_user = 'root'
  mysql_password = '123'
  mysql_port = 3306
  # charset='utf8' so Chinese text round-trips without mojibake.
  db = pymysql.connect(host=mysql_host, port=mysql_port, user=mysql_user,
                       password=mysql_password, db=mysql_db, charset='utf8')
  # NOTE(review): `price` as PRIMARY KEY means two listings with the same
  # price collide -- probably not the intended key; confirm before reuse.
  sql_create = """CREATE TABLE schoolsheet(
price VARCHAR(10),
title VARCHAR(50),
amount VARCHAR(265),
stars VARCHAR(265),
address VARCHAR(265),
PRIMARY KEY (`price`),
UNIQUE KEY `title`(`title`))ENGINE=InnoDB AUTO_INCREMENT=12 DEFAULT CHARSET=utf8"""
  try:
    cursor = db.cursor()
    cursor.execute("DROP TABLE IF EXISTS schoolsheet")
    cursor.execute(sql_create)  # execute the DDL statement
    db.commit()
  finally:
    db.close()  # close the connection even on failure


#存放爬取数据到数据库中
def IntoMysql(datalist):
  """Insert scraped listing dicts into the `schoolsheet` table.

  datalist: list of dicts with keys 'address', 'price', 'title',
            'amount', 'stars' (as produced by getData()).

  Uses parameterized queries instead of %-string formatting: the old
  string-built INSERT broke (and was injectable) whenever a value
  contained a quote character.
  """
  mysql_host = 'localhost'
  mysql_db = 'school'
  mysql_user = 'root'
  mysql_password = '123'
  mysql_port = 3306
  # charset='utf8' so Chinese text round-trips without mojibake.
  db = pymysql.connect(host=mysql_host, port=mysql_port, user=mysql_user,
                       password=mysql_password, db=mysql_db, charset='utf8')
  print('open connect!')
  sql = ("INSERT INTO schoolsheet(price,title,amount,stars,address)"
         "VALUES (%s,%s,%s,%s,%s)")
  try:
    cursor = db.cursor()
    print('start-insert-data')
    for row in datalist:
      # Let the driver quote/escape every value (PEP 249 paramstyle).
      cursor.execute(sql, (row['price'], row['title'], row['amount'],
                           row['stars'], row['address']))
    db.commit()  # one commit for the whole batch
  finally:
    db.close()  # close the connection even on failure


# Run the scrape-and-store pipeline only when executed as a script,
# so importing this module doesn't trigger network/DB side effects.
if __name__ == '__main__':
  datalist = getData()
  mysql_create()
  IntoMysql(datalist)

一切领悟都在代码之中,还不太熟的我准备再多敲一敲领会一下,没有太多讲解,没准等我领会完再来更,毕竟不懂就不瞎BB了!!!

萌星一枚,大神轻喷!!!

猜你喜欢

转载自blog.csdn.net/jessica__lu/article/details/86421580