知识点:
爬虫的代码有一点需要注意:需要触发事件(比如 click)的时候,最好不要直接调用相应的方法,而是通过嵌入 JS 脚本的方式调用。因为爬虫代码执行速度很快,前端元素结构往往反应不过来,从而抛出元素不可见或者不存在的错误。
province_items = DRIVER.find_element_by_class_name("city-province").find_elements_by_tag_name("a")
#province_item.click() # 前端来不及加载,直接 click 可能报元素不可见/不存在的错误
DRIVER.execute_script('arguments[0].click();',province_item)
Python3爬虫三大案例实战
教程:https://edu.hellobi.com/course/156/play/lesson/2454
代码分2部分:主程序 main.py 和 MongoDB 配置文件 configs.py(模块名需与 `from configs import *` 一致)
1、main.py
# -*- coding: utf-8 -*-
import re
import time

import pymongo
from pyquery import PyQuery as pq
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

from configs import *
'''
1、搜索关键词
2、分析页码并翻页
3、分析提取数据
4、存到数据库
'''
#mongo数据库
client=pymongo.MongoClient(MONGO_URL)
db=client[MONGO_DB]
#headless chrome无界面模式
# from selenium.webdriver.chrome.options import Options
# global browser
# chrome_options = Options()
# chrome_options.add_argument('--headless')
# chrome_options.add_argument('--disable-gpu')
# browser = webdriver.Chrome(chrome_options=chrome_options)
# browser.set_window_size(1400, 900)
# wait = WebDriverWait(browser, 10)
#有界面模式,掉起浏览器便于观察
global browser
browser = webdriver.Chrome()
browser.set_window_size(1400, 900)
wait = WebDriverWait(browser, 10)
#打开函数
def search():
try:
#百度
# browser.get("https://www.baidu.com")
# input=browser.find_element_by_id("kw")
# input.send_keys("Python")
# input.send_keys(Keys.ENTER)
# wait=WebDriverWait(browser,10)
# wait.until(EC.presence_of_element_located((By.ID,"content_left")))
# print(browser.current_url)
# print(browser.get_cookies())
# print(browser.page_source)
#淘宝
browser.get("http://www.taobao.com")
#1、获取按钮
input = wait.until(
EC.presence_of_element_located((By.ID, "q")) )
submit = wait.until(
EC.presence_of_element_located((By.CSS_SELECTOR, "#J_TSearchForm > div.search-button > button")) )
#2、模拟操作
input.send_keys('美食')
submit.click()
#等待加载挖完成再提取总页数
total = wait.until(
EC.presence_of_element_located((By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.total"))
).text
# print(total.text) # '100天'提取数字
totalnum=int(re.search(r'\d+',total).group())
return totalnum
except TimeoutError:
search()
#自动翻页
def next_page(page_number):
try:
input = wait.until(
EC.presence_of_element_located((By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > input")))
submit = wait.until(
EC.presence_of_element_located((By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > div.form > span.btn.J_Submit")))
input.clear() #清除内容先
input.send_keys(page_number)
submit.click()
#判断百度首页上,“糯米”按钮这个元素中存在文本:糯米,判断翻页是否成功
wait.until(
EC.text_to_be_present_in_element(
(By.CSS_SELECTOR, "#mainsrp-pager > div > div > div > ul > li.item.active > span"),str(page_number)))
time.sleep(0.1)
getproducts()
except TimeoutError:
next_page(page_number)
#数据提取
def getproducts():
wait.until(
EC.presence_of_element_located(
(By.CSS_SELECTOR, "#mainsrp-itemlist .items .item")))
html=browser.page_source
doc=pq(html)
#items方法得到所有选择内容
items=doc('#mainsrp-itemlist .items .item').items()
for item in items:
product={
# 'image':item.find('.pic .img').attr('src'),
'name': item.find('.pic .img').attr('alt'),
'price':item.find('.price').text(),
'desl':item.find('.deal-cnt').text()[:-2],
'shop':item.find(('.shop')).text(),
'location':item.find('.location').text()
}
print(product)
# save_to_mongo(product)
def save_to_mongo(result):
try:
if db[MONGO_TABLE].insert(result):
print('存储成功',result)
except Exception:
print('存储失败')
#主函数
def main():
try:
totalnum=search()
# print(totalnum)
for i in range(2,totalnum+1):
# for i in range(2, 3):
next_page(i)
except Exception:
print('出错啊')
finally:
browser.close()
if __name__=='__main__':
main()
2、configs.py(模块名需与 main.py 中的 `from configs import *` 一致)
前提启动MongoDB数据库
MONGO_URL='localhost'
MONGO_DB='taobao'
MONGO_TABLE='table'