python3 学习1（搜索关键字爬取一页word格式的百度文库并下载成文本）

#coding: utf-8

from bs4 import BeautifulSoup
# -*- coding: UTF-8 -*-
from selenium import webdriver

import time
# Search phase: drive a Chrome session through a Baidu Wenku keyword search
# and keep a snapshot of the result page's HTML.
browser = webdriver.Chrome()
try:
    # Open the Baidu Wenku home page.
    browser.get("https://wenku.baidu.com/")
    # Locate the search box by its element id and type the query into it.
    seek_input = browser.find_element_by_id("kw")
    seek_input.send_keys("2018财税新政策,有哪些调整")
    # Locate the "search documents" button by its element id and click it.
    seek_but = browser.find_element_by_id("sb")
    seek_but.click()

    # Snapshot the HTML of the page the browser is showing after the click.
    html = browser.page_source
finally:
    # Nothing after this point reads from `browser` (only `html` is used),
    # so close the session here instead of leaking the Chrome and
    # chromedriver processes -- also when one of the lookups above raises.
    browser.quit()

# Collect the anchors that carry the result-link CSS classes; each one is
# later read for its `title` and `href` attributes.
bf1 = BeautifulSoup(html, 'lxml')
result = bf1.find_all("a", class_="log-xsend tiaoquan act-xsend")

# Download phase: open every search hit in its own Chrome session, expand the
# folded document, and append the extracted text to "<title>.txt".
#
# NOTE(review): the pasted source had lost all block indentation, so the
# nesting below is a reconstruction -- confirm it against the author's intent.

# Path to the local chromedriver binary. A raw string keeps the Windows
# backslashes literal; the non-raw form only worked because "\s" and "\c"
# are unrecognised escapes, which Python flags as invalid escape sequences.
chromedriver_dir = r'D:\software\chormedriver\chromedriver.exe'

# Pretend to be an iPhone through the user-agent string.
options = webdriver.ChromeOptions()
options.add_argument('--user-agent=Mozilla/5.0 (iPhone; CPU iPhone OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3')

# Characters removed from a document title before it is used as a file name:
# the newline and '?' the script always stripped, plus the other characters
# Windows refuses in file names.
filename_strip_table = str.maketrans('', '', '\n?\\/:*"<>|')

for result_table in result:
    web = result_table.get('href')
    # `get` returns None for a missing attribute; fall back to '' so the
    # string concatenation cannot raise TypeError.
    print("-----标题----\n" + (result_table.get('title') or ''))  # title
    print("----链接----\n" + (web or ''))  # link
    if not web:
        # No link to open for this hit.
        continue

    driver = webdriver.Chrome(chromedriver_dir, chrome_options=options)
    try:
        driver.get(web)

        # Scroll down to the "continue reading" fold and click it.
        foldpagewg = driver.find_elements_by_xpath("//div[@class='foldpagewg-root']")
        try:
            # Bring the last fold element into view ...
            driver.execute_script('arguments[0].scrollIntoView();', foldpagewg[-1])
            # ... then simulate the click on "continue reading".
            continue_read = driver.find_element_by_xpath("//div[@class='foldpagewg-icon']")
            continue_read.click()
        except Exception:
            # Deliberately best-effort: an empty element list (IndexError) or
            # any WebDriver failure is treated as "nothing to unfold".
            # `Exception` rather than a bare `except` so Ctrl-C still works.
            print('此文章无需翻页！！！')

        # Click "load more" until it can no longer be found/clicked, at most
        # 10 times. `attempts` is advanced on every pass so the cap really
        # bounds the loop.
        attempts = 0
        success = True
        while attempts < 10 and success:
            try:
                # Scroll down to the "click to load more" pager.
                pagerwg = driver.find_elements_by_xpath("//div[@class='pagerwg-root']")
                # Bring the last pager element into view.
                driver.execute_script('arguments[0].scrollIntoView();', pagerwg[-1])

                # Simulate the click that loads more pages.
                more_text = driver.find_element_by_xpath("//span[@class='pagerwg-arrow-lower']")
                more_text.click()
            except Exception:
                # Best-effort, as above: failure here ends the loop.
                success = False
                print('到底啦~~！请开始你的爬虫表演>.<')
            attempts += 1

            # Give the page 3 seconds to load after each attempt.
            time.sleep(3)

        doc_html = driver.page_source
    finally:
        # One Chrome session per document: always close it, otherwise every
        # search hit leaves a browser and a chromedriver process behind.
        driver.quit()

    # Distinct names here: the original re-bound `html`, `bf1` and `result`,
    # clobbering the search-page objects (including the list being iterated).
    doc_soup = BeautifulSoup(doc_html, 'lxml')

    # Document title; fall back to the search-hit title (then a fixed name)
    # when the page has no 'doc-title' element instead of raising IndexError.
    title_nodes = doc_soup.find_all('div', class_='doc-title')
    if title_nodes:
        title = title_nodes[0].text
    else:
        title = result_table.get('title') or ''
    title = title.translate(filename_strip_table) or 'untitled'
    filename = str(title) + '.txt'

    texts_list = []
    for each_result in doc_soup.find_all(class_='txt'):
        # Re-parse the fragment and take its first <p>, as the original did;
        # fragments without any <p> are skipped rather than crashing.
        first_p = BeautifulSoup(str(each_result), 'lxml').find('p')
        if first_p is None:
            continue
        string = first_p.getText()
        print(string)
        # A fragment containing a full stop gets a trailing newline; other
        # fragments are joined without a line break.
        texts_list.append(string + '\n' if '。' in string else string)
    contentss = ''.join(texts_list)

    # Append mode is kept from the original: an existing file with the same
    # title is extended, not overwritten.
    with open(filename, 'a', encoding='utf-8') as f:
        f.write(contentss)

# 注：此爬虫仍有缺陷——有的文章无法爬取完整，有的文章只有 VIP 才能爬取。博主还是个小白，此文仅作笔记用。

python3 学习1（搜索关键字爬取一页word格式的百度文库并下载成文本）

猜你喜欢