#coding: utf-8
from bs4 import BeautifulSoup
# -*- coding: UTF-8 -*-
from selenium import webdriver
import time
browser = webdriver.Chrome()
# Open the Baidu Wenku home page.
browser.get("https://wenku.baidu.com/")
# Locate the search input box by its element id.
seek_input = browser.find_element_by_id("kw")
# Type the query text into the search box.
seek_input.send_keys("2018财税新政策,有哪些调整")
# Locate the "search documents" button and click it.
seek_but = browser.find_element_by_id("sb")
seek_but.click()
# FIX: wait for the result page to render before grabbing page_source;
# without this the parser frequently sees the (empty) pre-search page.
time.sleep(3)
html = browser.page_source
bf1 = BeautifulSoup(html, 'lxml')
# Each search hit is an <a> element carrying these tracking classes.
result = bf1.find_all("a", class_="log-xsend tiaoquan act-xsend")
# FIX: initialise `web` so an empty result set does not raise NameError
# when the URL is used further down the script.
web = None
for result_table in result:
    print("-----标题----\n" + result_table.get('title'))  # document title
    print("----链接----\n" + result_table.get('href'))  # document link
    # Keep the last hit's URL; it is the article opened below.
    web = result_table.get('href')
# Path to the local ChromeDriver binary.  FIX: use a raw string so
# backslash sequences such as "\s" are never treated as escapes
# (silences the invalid-escape DeprecationWarning on newer Pythons
# and avoids silent path corruption if the path ever changes).
chromedriver_dir = r'D:\software\chormedriver\chromedriver.exe'
options = webdriver.ChromeOptions()
# Masquerade as an iPhone so Wenku serves the lighter mobile page,
# which exposes the full article text more readily.
options.add_argument('--user-agent=Mozilla/5.0 (iPhone; CPU iPhone OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3')
driver = webdriver.Chrome(chromedriver_dir, chrome_options=options)
# Open the collected article URL with the mobile-UA driver.
driver.get(web)
# Scroll down to the "continue reading" fold and expand it, if present.
foldpagewg = driver.find_elements_by_xpath("//div[@class='foldpagewg-root']")
try:
    # Bring the fold marker into view so the icon becomes clickable.
    driver.execute_script('arguments[0].scrollIntoView();', foldpagewg[-1])
    # Simulate a click on the "continue reading" control.
    continue_read = driver.find_element_by_xpath("//div[@class='foldpagewg-icon']")
    continue_read.click()
except Exception:
    # FIX: narrowed from a bare `except:` (which also swallowed
    # KeyboardInterrupt/SystemExit).  IndexError when no fold exists, or a
    # Selenium error when the icon is missing — either way no expanding needed.
    print('此文章无需翻页!!!')
# Repeatedly click "load more" until it disappears, capped at 10 rounds
# so a stuck page cannot loop forever.
attempts = 0
success = True
while attempts < 10 and success:
    try:
        # Scroll down to the pager that hosts the "load more" control.
        pagerwg = driver.find_elements_by_xpath("//div[@class='pagerwg-root']")
        driver.execute_script('arguments[0].scrollIntoView();', pagerwg[-1])
        # Simulate a click to load the next batch of pages.
        more_text = driver.find_element_by_xpath("//span[@class='pagerwg-arrow-lower']")
        more_text.click()
    except Exception:
        # FIX: narrowed from a bare `except:`.  No pager / no arrow left
        # means we have reached the bottom of the document.
        success = False
        print('到底啦~~!请开始你的爬虫表演>.<')
    # Count every round so the `attempts < 10` bound is effective.
    attempts += 1
    # Pause between clicks so newly loaded content can render (3 s).
    time.sleep(3)
# Parse the fully expanded page and append the article text to "<title>.txt".
html = driver.page_source
bf1 = BeautifulSoup(html, 'lxml')
# Every paragraph of body text lives in an element with class "txt".
result = bf1.find_all(class_='txt')
texts_list = []
# Derive the output filename from the document title.
title = bf1.find_all('div', class_='doc-title')
title = title[0].text.replace('\n', '')
# FIX: the original stripped only the fullwidth '?'; also remove every
# character that is illegal in Windows filenames, which otherwise makes
# open() fail for titles containing e.g. ':' or '/'.
title = ''.join(ch for ch in title if ch not in '\\/:*?"<>|?')
filename = str(title) + '.txt'
for each_result in result:
    bf2 = BeautifulSoup(str(each_result), 'lxml')
    texts = bf2.find_all('p')
    string = texts[0].getText()
    print(texts[0].getText())
    # Paragraphs that close a sentence (contain '。') get a trailing newline.
    if string.find('。') >= 0:
        texts_list.append(string + '\n')
    else:
        texts_list.append(string)
contentss = ''.join(texts_list)
# Append mode so repeated runs / multi-part articles accumulate in one file.
with open(filename, 'a', encoding='utf-8') as f:
    # FIX: writelines() on a str iterates it character by character;
    # write() emits the whole string in one call.
    f.write(contentss)
# NOTE: this scraper is still imperfect — some articles cannot be fetched in
# full and some require a VIP account; the author is a beginner and keeps
# this only as a personal note.  (Was "//...", which is a SyntaxError in Python.)