目标网站是:https://www.americamakes.us/about/news/
网页加载方式:列表页可通过普通 GET 请求直接获取(下面的代码未提交任何 Form Data)。
下面是我写的代码:
"""Scrape the news listing from americamakes.us and save each article page."""
import os
import re

import requests
from bs4 import BeautifulSoup

# Directory where each fetched article's HTML page is stored.
HTML_DIR = 'html'
# Text file accumulating one metadata record per article.
TXT_PATH = 'thg_news.txt'
def getHTMLTest(url):
    """Fetch *url* and return its decoded HTML text, or "" on any request failure.

    Callers treat the empty string as "page unavailable".
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # Use the detected encoding so non-UTF-8 pages decode correctly.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Narrowed from a bare except: only network/HTTP errors are expected here.
        return ""
def printUnivList(html):
    """Parse the news listing page; record each article's metadata and save its HTML.

    For every post found, appends one record to the text file via
    add_name_to_text() and downloads the article page via write_html().
    """
    soup = BeautifulSoup(html, "html.parser")
    blog_list = soup.find('div', class_='w-blog-list')
    if blog_list is None:
        # Fetch failed or the page layout changed; nothing to do.
        return
    # Iterate per <article> so data-categories is read from EACH article.
    # (The original read it once from the first article, tagging every
    # post with the same category.)
    for article in blog_list.find_all('article'):
        body = article.find('div', class_='w-blog-post-body')
        if body is None:
            continue
        a_page = body.find('a')
        data_page = body.find('time')
        if a_page is None or data_page is None:
            continue  # skip malformed entries instead of crashing
        title_url = a_page.get('href')
        # Article URLs end with a trailing slash, so the slug is the
        # second-to-last path segment.
        filename = title_url.split("/")[-2]
        a_dict = {
            'TYPE': article.get('data-categories'),
            'Title': a_page.get_text(),
            'Title_Url': title_url,
            'Data': data_page.get_text(),
            'File_Name': filename,
        }
        add_name_to_text(a_dict)
        a_html = getHTMLTest(title_url)
        write_html(a_html, filename)
def add_name_to_text(newsdic):
    """Append one article record to TXT_PATH.

    Writes one ``[key]: value`` line per field, then a blank line as a
    record separator. *newsdic* must contain all five keys listed below.
    """
    with open(TXT_PATH, 'a', encoding='utf-8') as f:
        for k in ['TYPE', 'Title', 'Title_Url', 'Data', 'File_Name']:
            f.write('[%s]: %s' % (k, newsdic[k]))
            f.write('\n')
        # Blank line between records.
        f.write('\n')
def write_html(html, name):
    """Save article *html* under HTML_DIR as ``<name>.html`` (UTF-8)."""
    # Create the output directory on first use so open() cannot fail
    # with FileNotFoundError on a fresh checkout.
    os.makedirs(HTML_DIR, exist_ok=True)
    html_path = os.path.join(HTML_DIR, name + '.html')
    with open(html_path, 'w', encoding='utf-8') as f:
        f.write(html)
def main():
    """Fetch the news listing page and process every article on it."""
    url = 'https://www.americamakes.us/about/news/'
    html = getHTMLTest(url)
    printUnivList(html)
# Standard script entry guard; the original `if name == 'main':` raises
# NameError because `name` is undefined — the dunder form is required.
if __name__ == '__main__':
    main()