This blog post is only a personal record of my own learning, published for readers' reference. If it infringes on anyone's rights, please let me know and I will delete it.
The code uses the webdriver module from Selenium. To run the code below, you first need to download the driver for your browser.
For Google Chrome (chromedriver): http://npm.taobao.org/mirrors/chromedriver/
For Firefox (geckodriver): https://github.com/mozilla/geckodriver/releases
Click the link for your browser, find the driver that matches your own browser version, and then download it into your Python installation directory.
Without further ado, here is the code. If anything is unclear or you do not understand something, feel free to leave a comment below.
from selenium import webdriver
import time
import csv
class JdSpider(object):
    """Selenium-driven scraper for JD.com search results.

    Opens the JD home page in Chrome, searches for a keyword, then walks
    through the result pages and appends one CSV row per product
    (price, shop, name, comment) to an output file.
    """

    # Locator strategy strings accepted by ``find_element``/``find_elements``.
    # They are the values of selenium's ``By.XPATH`` and ``By.CLASS_NAME``;
    # the ``find_element_by_*`` helpers were removed in Selenium 4.3, while
    # the generic form works on both Selenium 3 and 4.
    _BY_XPATH = 'xpath'
    _BY_CLASS_NAME = 'class name'

    # When this label is the first text line of a result item, the product
    # fields start two lines further down.
    _SINGLE_ITEM_LABEL = '单件'

    def __init__(self, keyword='爬虫书籍', output_path='xxx.csv'):
        """Create the spider and start a Chrome browser session.

        Args:
            keyword: Text typed into the JD search box.
            output_path: CSV file that scraped rows are appended to.
        """
        self.url = 'https://www.jd.com/'
        self.keyword = keyword
        self.output_path = output_path
        # Create the browser object.
        self.browser = webdriver.Chrome()

    def get_page(self):
        """Open JD, type the search keyword and click the search button."""
        self.browser.get(self.url)
        search_box = self.browser.find_element(
            self._BY_XPATH, '//*[@id="key"]')
        search_box.send_keys(self.keyword)
        search_button = self.browser.find_element(
            self._BY_XPATH, '//*[@id="search"]/div/div[2]/button')
        search_button.click()
        # Sleep two seconds so the result page can load.
        time.sleep(2)

    @classmethod
    def _parse_item(cls, text):
        """Turn the visible text of one result item into a CSV row.

        Args:
            text: The item's text, one field per line.

        Returns:
            ``[price, market, name, comment]``, or ``None`` when the text
            has too few lines to contain all four fields.
        """
        lines = text.split('\n')
        offset = 2 if lines[0] == cls._SINGLE_ITEM_LABEL else 0
        fields = lines[offset:offset + 4]
        if len(fields) < 4:
            # Item without the expected fields; skip it instead of
            # raising IndexError and aborting the whole crawl.
            return None
        price, name, comment, market = fields
        return [price, market, name, comment]

    def parse_page(self):
        """Scrape every product on the current result page into the CSV."""
        # Scroll to the bottom so the remaining products get loaded.
        self.browser.execute_script(
            'window.scrollTo(0,document.body.scrollHeight)')
        time.sleep(3)
        li_list = self.browser.find_elements(
            self._BY_XPATH, '//*[@id="J_goodsList"]/ul/li')
        # Open the file once per page; newline='' is what the csv module
        # requires to avoid blank lines between rows on Windows.
        with open(self.output_path, 'a', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            for li in li_list:
                item = self._parse_item(li.text)
                if item is None:
                    continue
                price, market, name, comment = item
                print([price, market, comment, name])
                writer.writerow(item)

    def main(self):
        """Run the search and scrape result pages until the last one.

        The browser is always closed when this method returns or raises.
        """
        try:
            self.get_page()
            while True:
                self.parse_page()
                # Decide whether there is a next page to click. Using
                # find_elements gives an empty list (not an exception)
                # when the page has no pager at all.
                next_buttons = self.browser.find_elements(
                    self._BY_CLASS_NAME, 'pn-next')
                on_last_page = 'pn-next disabled' in self.browser.page_source
                if not next_buttons or on_last_page:
                    break
                next_buttons[0].click()
                time.sleep(2)
        finally:
            # Release the browser and its driver process.
            self.browser.quit()
# Script entry point: build the spider and run the full crawl.
if __name__ == '__main__':
    JdSpider().main()
This is based on code from a class; I tidied it up, added some comments, and added a step that saves the results to a file.