Python3 web scraping: how to crawl pages that use pull-down (load-more) loading instead of numbered pagination

The target site is: https://www.americamakes.us/about/news/

How the page loads:
[screenshot: the request the page sends as you scroll, captured in the browser's developer tools]
The Form Data is:
[screenshot: the Form Data sent with that request]
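
Since the page has no page-number links, the usual trick is to replay that request yourself: each time you scroll, the browser POSTs the Form Data shown above (with a field that advances to the next batch) and receives an HTML fragment containing more posts. Below is a minimal sketch of that idea; the endpoint URL and the payload field names are assumptions standing in for whatever your Network tab actually shows, so copy the real values from the captured request.

```python
import requests

# Assumed endpoint and field names -- replace them with the real values
# from the captured XHR request (the screenshots above).
AJAX_URL = 'https://www.americamakes.us/wp-admin/admin-ajax.php'  # hypothetical

def fetchMorePosts(page):
    """Replay the POST the browser sends when you scroll down."""
    payload = {
        'action': 'load_more_posts',  # hypothetical: copy from the Form Data
        'page': page,                 # hypothetical: the field that picks the batch
    }
    try:
        r = requests.post(AJAX_URL, data=payload, timeout=30)
        r.raise_for_status()
        return r.text  # typically an HTML fragment holding the next posts
    except requests.RequestException:
        return ''
```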

Here is the code I wrote:

```python
import os

import requests
from bs4 import BeautifulSoup

HTML_DIR = 'html'
TXT_PATH = 'thg_news.txt'

def getHTMLText(url):
    """Fetch a URL and return its text, or '' on any request error."""
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return ''

def printUnivList(html):
    soup = BeautifulSoup(html, 'html.parser')
    div_w_blog_list = soup.find('div', class_='w-blog-list')
    if div_w_blog_list is None:  # nothing to parse
        return
    # Walk the <article> elements so each post keeps its own categories
    for article in div_w_blog_list.find_all('article'):
        TYPE = article.get('data-categories')
        body = article.find('div', class_='w-blog-post-body')
        a_page = body.find('a')
        date = body.find('time').get_text()
        title_url = a_page.get('href')
        title = a_page.get_text()
        filename = title_url.split('/')[-2]
        a_dict = {
            'TYPE': TYPE,
            'Title': title,
            'Title_Url': title_url,
            'Date': date,
            'File_Name': filename,
        }
        add_name_to_text(a_dict)
        a_html = getHTMLText(title_url)
        write_html(a_html, filename)

def add_name_to_text(newsdic):
    """Append one news record to the text file, one field per line."""
    with open(TXT_PATH, 'a', encoding='utf-8') as f:
        for k in ['TYPE', 'Title', 'Title_Url', 'Date', 'File_Name']:
            f.write('[%s]: %s\n' % (k, newsdic[k]))
        f.write('\n')

def write_html(html, name):
    """Save a fetched article page under HTML_DIR/<name>.html."""
    os.makedirs(HTML_DIR, exist_ok=True)  # create the folder if it is missing
    html_path = os.path.join(HTML_DIR, name + '.html')
    with open(html_path, 'w', encoding='utf-8') as f:
        f.write(html)

def main():
    url = 'https://www.americamakes.us/about/news/'
    html = getHTMLText(url)
    printUnivList(html)

if __name__ == '__main__':
    main()
```
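
As written, main() only parses the first batch of posts delivered with the page itself. To crawl the rest, you can call the hypothetical fetchMorePosts() sketched above in a loop, bumping the page number until the server returns an empty fragment:

```python
def main():
    # First batch arrives with the normal page load...
    printUnivList(getHTMLText('https://www.americamakes.us/about/news/'))
    # ...the rest by replaying the pull-down request until nothing comes back.
    page = 2
    while True:
        fragment = fetchMorePosts(page)  # hypothetical helper from the sketch above
        if not fragment.strip():
            break
        printUnivList(fragment)
        page += 1
```

Note that the fragment the endpoint returns may not include the w-blog-list wrapper div; if printUnivList finds nothing in it, loosen the selector to look for the article tags directly.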


Reposted from blog.csdn.net/qq_43182687/article/details/82629589