# coding=utf-8
'''
Created on 2017年2月20日
@author: chenkai
'''
import MySQLdb
import sys
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
import time
from selenium.webdriver.remote import webelement
from selenium.webdriver.remote.webelement import WebElement
'''
Connect to the database
'''
def getConn(host='127.0.0.1', user='root', passwd='123456', port=3306):
    """Open a connection to the MySQL server.

    The defaults reproduce the values that were previously hard-coded, so
    calling ``getConn()`` with no arguments behaves as before.

    Args:
        host: Host name or IP address of the MySQL server.
        user: User name to log in with.
        passwd: Password for ``user``.
        port: TCP port of the MySQL server.

    Returns:
        The connection object returned by ``MySQLdb.connect``; it is opened
        with ``charset="utf8"``.
    """
    # NOTE(review): the default credentials are still embedded in the source;
    # consider loading them from configuration or the environment.
    dbcon = MySQLdb.connect(
        host=host,
        user=user,
        passwd=passwd,
        port=port,  # previously a literal 3306 that ignored the local `port`
        charset="utf8",
    )
    return dbcon
def getCursor(mysqlConn):
    """Return a new cursor created from ``mysqlConn``.

    Args:
        mysqlConn: A connection object exposing a ``cursor()`` method.

    Returns:
        Whatever ``mysqlConn.cursor()`` returns.
    """
    return mysqlConn.cursor()
def closeDBConnandCur(cur, mysqlConn):
    """Close the cursor, commit pending work and close the connection.

    Args:
        cur: The cursor to close.
        mysqlConn: The connection the cursor belongs to.

    Any exception raised by ``cur.close()`` or ``mysqlConn.commit()`` still
    propagates to the caller, but the connection is closed first.
    """
    try:
        cur.close()
        # Commit the pending database operations before the connection is
        # closed.
        mysqlConn.commit()
    finally:
        # Always release the connection, even when the commit fails.
        mysqlConn.close()
# Connect to the database, get a cursor and select the `test` database.
mysqlConn = getConn()
cur = getCursor(mysqlConn)
cur.execute("use test")

# Browser setup.
options = webdriver.ChromeOptions()
# The option key must be exactly "excludeSwitches"; the previous literal
# carried a trailing space.
options.add_experimental_option("excludeSwitches", ["ignore-certificate-errors"])
# Raw string so the backslashes of the Windows path are never treated as
# escape sequences.
driver = webdriver.Chrome(
    executable_path=r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe",
    chrome_options=options,
)
try:
    print(dir(driver))
    driver.get('https://sanya.nuomi.com/326')
    # To click the "next" button:
    # driver.find_element_by_class_name("next-btn").click()
    page = driver.page_source
finally:
    # Quit the browser even when loading the page fails.
    driver.quit()

# `page` is taken straight from driver.page_source, so no `from_encoding`
# hint is passed to the parser.
soup = BeautifulSoup(page, 'html.parser')
div_list = soup.find_all("div", class_="contentbox")

# Ids start at 1002, exactly as the former `index = 1001; index += 1` did.
for index, con in enumerate(div_list, 1002):
    link = con.a
    title = con.h4
    if link is None or title is None or link.get("href") is None:
        # Skip boxes without a link or a heading instead of crashing.
        continue
    # Encode to UTF-8 so the text is not garbled once inserted into MySQL.
    shopUrl = ("https:" + link.get("href")).encode('utf-8')
    shopName = title.get_text().encode('utf-8')
    print("%s %s" % (shopUrl, shopName))
    print('insert into k_bdnm_shopinfo values(%d,%s,%s)' % (index, shopUrl, shopName))
    try:
        # Let the driver quote the scraped values instead of interpolating
        # them into the SQL text.
        cur.execute(
            "insert into k_bdnm_shopinfo values(%s,%s,%s)",
            (index, shopUrl, shopName),
        )
    except MySQLdb.Error as e:
        print("Mysql Error %d: %s" % (e.args[0], e.args[1]))

# Close the cursor and the database connection.
closeDBConnandCur(cur, mysqlConn)
'''
Table schema

CREATE TABLE `k_bdnm_shopinfo` (
`shop_id` int(11) NOT NULL auto_increment,
`shop_url` varchar(300) NOT NULL,
`shop_name` varchar(100) NOT NULL,
PRIMARY KEY (`shop_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
'''
# Summary: Python + BeautifulSoup + Selenium + MySQLdb complete the data capture.
# Origin: http://43.154.161.224:23101/article/api/json?id=326332010&siteId=291194637
# (Web-page residue from the source article: "Guess you like", "Ranking".)