最近才开了博客,想着把之前写过的代码都整理整理
基本思路是利用selenium模拟鼠标点击事件,进入相应页面后根据元素提取出相应信息
再深入一点,其实还可以做一个UI
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from pyquery import PyQuery as pq
import re
import pymysql
import time
# Shared Chrome WebDriver session; it is started as soon as this module runs.
browser = webdriver.Chrome()
# Explicit-wait helper bound to the shared browser, with a 10-second timeout.
wait = WebDriverWait(browser, 10)
# Running counter that traversal_all_products() prints and then increments
# after each product it manages to open.
num = 1
def replace_addr(addr, num):
    """Return *addr* with the number ``num`` replaced by ``num + 1``.

    Only standalone occurrences of ``num`` are rewritten: digits that are
    part of a longer number are left alone, so ``num=1`` turns
    ``nth-child(1)`` into ``nth-child(2)`` but does not touch
    ``nth-child(11)``.

    Args:
        addr: Text (here a CSS selector) containing the number to bump.
        num: Integer currently present in ``addr``.

    Returns:
        The rewritten string; ``addr`` unchanged when ``num`` does not occur
        in it as a standalone number.
    """
    current = re.escape(str(num))
    # The digit lookarounds stop a short number from matching inside a longer
    # one (a plain substring replace would turn "11" into "22" for num=1).
    pattern = r"(?<!\d)" + current + r"(?!\d)"
    return re.sub(pattern, str(num + 1), addr)
def save_to_db(result):
    """Open a MySQL connection for one scraped comment record.

    The actual INSERT is not implemented yet: after validating the record the
    function only prints ``"wait"``. The connection and cursor are always
    released, cursor first, even when the record is incomplete.

    Args:
        result: Mapping that must provide the keys ``user_level``,
            ``comment_time``, ``comment_value``, ``type_p``, ``recommands``,
            ``reply_count``, ``user_image`` and ``user_name``.

    Raises:
        KeyError: If ``result`` lacks one of the expected keys.
    """
    expected_keys = (
        "user_level",
        "comment_time",
        "comment_value",
        "type_p",
        "recommands",
        "reply_count",
        "user_image",
        "user_name",
    )
    # NOTE(review): credentials are hard-coded; consider moving them to
    # configuration before this is used outside a local test database.
    conn = pymysql.connect(host='localhost', user='root', password='root', port=3306, db='ms', charset='utf8')
    try:
        cursor = conn.cursor()
        try:
            # Read every field up front so an incomplete record fails loudly
            # with KeyError; the mapping is what the future INSERT will use.
            record = {key: result[key] for key in expected_keys}
            try:
                # TODO: execute the INSERT for `record` through `cursor` here.
                print("wait")
            except Exception:
                print("fail save to database")
        finally:
            # Release the cursor before the connection that owns it.
            cursor.close()
    finally:
        conn.close()
def get_message():
    """Print every comment found on the product page currently in focus.

    Waits for the breadcrumb link and the comment tab, clicks the tab, waits
    for at least one comment item, then parses the page source with pyquery
    and prints one dict per comment item.
    """
    category_link = wait.until(EC.presence_of_element_located(
        (By.CSS_SELECTOR, "#crumb-wrap > div > div.crumb.fl.clearfix > div:nth-child(5) > a")))
    comment_tab = wait.until(EC.element_to_be_clickable(
        (By.CSS_SELECTOR, "#detail > div.tab-main.large > ul > li:nth-child(5)")))
    comment_tab.click()
    # Do not read the page source before the comment list has been rendered.
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#comment-0 .comment-item")))

    page = pq(browser.page_source)
    for comment in page("#comment-0 .comment-item").items():
        order_info = comment.find('.comment-column .comment-message .order-info').text()
        avatar = comment.find(".user-column .user-info .avatar")
        product = {
            'user_level': comment.find('.user-column .user-level').text(),
            # Fixed-width slice of the order-info text; presumably the date
            # part of a trailing timestamp - verify against the live page.
            'comment_time': order_info[-16:-6],
            'comment_value': comment.find('.comment-column .comment-con').text(),
            'type_p': category_link.text,
            'recommands': comment.find('.comment-column .comment-message .comment-op .J-nice').text(),
            'reply_count': comment.find(".comment-column .comment-message .comment-op .a").text(),
            'user_image': avatar.attr('src'),
            'user_name': avatar.attr('alt')
        }
        print(product)
def open_a_product(addres, max_retries=3):
    """Click one product in the result list and scrape the tab it opens.

    The element matched by ``addres`` is clicked; every window other than the
    result-list window is then focused, scraped with ``get_message()`` and
    closed, and focus returns to the result list - also when scraping fails,
    so one bad product cannot leave the browser stuck on a product tab.

    Args:
        addres: CSS selector of the product element to click.
        max_retries: How many additional attempts to make after a
            ``TimeoutException`` before giving up.

    Raises:
        TimeoutException: If the result list never appears, or if every
            attempt timed out.
    """
    attempts = max(max_retries, 0) + 1
    for attempt in range(attempts):
        wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#J_goodsList")))
        try:
            product_click = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, addres)))
            now_handle = browser.current_window_handle
            product_click.click()
            for handle in browser.window_handles:
                if handle == now_handle:
                    continue
                # `switch_to.window` replaces the deprecated `switch_to_window`.
                browser.switch_to.window(handle)
                try:
                    get_message()
                finally:
                    # Always close the product tab and restore the list window.
                    browser.close()
                    browser.switch_to.window(now_handle)
            return
        except TimeoutException:
            # Bounded retry instead of unbounded recursion.
            if attempt == attempts - 1:
                raise
def traversal_all_products():
    """Try to open 40 products from the current result page, one by one.

    Each step passes the running selector and the step index (0 to 39) to
    ``replace_addr`` and hands the resulting selector to ``open_a_product``.
    A failure on one product is reported and does not stop the loop; the
    module-level counter ``num`` advances only after a successful open.
    """
    global num
    wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#J_goodsList")))
    selector = "#J_goodsList > ul > li:nth-child(1) > div > div.p-img > a > img"
    for step in range(40):
        try:
            selector = replace_addr(selector, step)
            open_a_product(selector)
            print("open product sucess! count:%d"%(num))
            num += 1
        except Exception:
            print("open error")
def search():
    """Run the keyword search on jd.com and scrape the first result page.

    Loads the home page, types the keyword into the search box, submits it,
    waits for the page-count element, scrapes the products of the first page
    and finally returns the text of the page-count element. The whole
    sequence is restarted whenever a wait times out.

    Returns:
        The text of the page-count element in the bottom pager.
    """
    keyword_box_locator = (By.CSS_SELECTOR, "#key")
    submit_locator = (By.CSS_SELECTOR, "#search > div > div.form > button")
    page_total_locator = (By.CSS_SELECTOR, "#J_bottomPage > span.p-skip > em:nth-child(1) > b")
    try:
        browser.get("https://www.jd.com/")
        print("success enter jingdong\n")
        keyword_box = wait.until(EC.presence_of_element_located(keyword_box_locator))
        submit_button = wait.until(EC.element_to_be_clickable(submit_locator))
        keyword_box.send_keys("得力")
        submit_button.click()
        page_total = wait.until(EC.presence_of_element_located(page_total_locator))
        print("enter search!\n")
        traversal_all_products()
        return page_total.text
    except TimeoutException:
        # Start over from the home page after a timeout.
        return search()
def next_page(page_number):
    """Jump to result page ``page_number`` and scrape its products.

    Types the page number into the pager's jump box, clicks the jump button,
    waits until the highlighted page number contains ``page_number`` and then
    scrapes the page. The whole sequence is repeated whenever a wait times
    out.

    Args:
        page_number: Number of the result page to load.
    """
    jump_box_locator = (By.CSS_SELECTOR, "#J_bottomPage > span.p-skip > input")
    jump_button_locator = (By.CSS_SELECTOR, "#J_bottomPage > span.p-skip > a")
    current_page_locator = (By.CSS_SELECTOR, "#J_bottomPage > span.p-num > a.curr")
    try:
        jump_box = wait.until(EC.presence_of_element_located(jump_box_locator))
        jump_button = wait.until(EC.element_to_be_clickable(jump_button_locator))
        jump_box.clear()
        jump_box.send_keys(page_number)
        jump_button.click()
        # Only scrape once the pager shows the requested page as current.
        wait.until(EC.text_to_be_present_in_element(current_page_locator, str(page_number)))
        traversal_all_products()
        print("page %d"%(page_number))
    except TimeoutException:
        next_page(page_number)
def main():
    """Scrape the first result page, then walk the following pages.

    ``search()`` handles page 1 and returns the pager's page-count text; the
    first run of digits in that text is taken as the total page count.

    Raises:
        ValueError: If the page-count text contains no digits.
    """
    total_text = search()
    # Raw string: "\d" in a normal literal is an invalid escape sequence.
    match = re.search(r'(\d+)', total_text)
    if match is None:
        raise ValueError("no page count found in %r" % (total_text,))
    total = int(match.group(1))
    # NOTE(review): this range ends at page total - 3, so the last three
    # pages are never visited - confirm whether that is intentional.
    for page in range(2, total - 2):
        try:
            next_page(page)
        except Exception:
            print("page down error")
            # One more attempt; a second failure propagates to the caller.
            next_page(page)


if __name__ == "__main__":
    main()