# coding=utf-8
"""Save every WeChat article linked from an index page as a local HTML file.

The script downloads INDEX_URL, follows each anchor whose ``href`` starts
with ARTICLE_PREFIX, derives a file name from the article's
``rich_media_title`` element, prints that name, and writes the raw page
text to OUTPUT_DIR as ``<title>.html``.
"""
import os
import re  # imported by the original script; not used below
import sys

import bs4
import lxml  # imported by the original script; not used below
import requests

# Page whose anchors are scanned for article links.
INDEX_URL = "http://mp.weixin.qq.com/s/u_WmkE5meMWuZ81G5gHhBQ"
# Only anchors whose href starts with this prefix are followed.
ARTICLE_PREFIX = 'http://mp.weixin.qq.com'
# Series name that is stripped out of every article title.
SERIES_NAME = "计算机程序的思维逻辑"
# Directory prefix (including the trailing separator) for the saved files.
OUTPUT_DIR = 'D:\\Java编程的逻辑\\'
# Seconds to wait for the server; without a timeout requests may block forever.
REQUEST_TIMEOUT = 30

# Characters removed from titles before they are used as file names.
_STRIP_CHARS = str.maketrans("", "", ")(:")

csvContent = ''  # not used by this script; kept so the module-level name remains


def _clean_title(raw_title):
    """Turn an article heading into the base name of the saved file.

    Removes SERIES_NAME, ASCII parentheses and colons, trims surrounding
    whitespace, and drops a single trailing "/".
    """
    title = raw_title.strip().replace(SERIES_NAME, "")
    title = title.translate(_STRIP_CHARS).strip()
    if title.endswith("/"):
        title = title[:-1]
    return title


def _fetch(url):
    """Return the response for *url*, raising requests.RequestException on failure."""
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response


def main():
    """Download the index page and save each linked article into OUTPUT_DIR.

    Articles that cannot be fetched, or that contain no ``rich_media_title``
    element, are reported on stderr and skipped so one bad link does not
    abort the whole run.

    Raises:
        requests.RequestException: if the index page itself cannot be fetched.
        OSError: if OUTPUT_DIR cannot be created or a file cannot be written.
    """
    # Create the target directory up front instead of failing on the first
    # write after pages have already been downloaded.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    index_page = _fetch(INDEX_URL)
    soup = bs4.BeautifulSoup(index_page.text, 'html.parser')

    for link in soup.find_all('a'):
        href = link.get('href')
        # Anchors without an href attribute yield None; skip them instead of
        # crashing on None.startswith.
        if not href or not href.startswith(ARTICLE_PREFIX):
            continue

        try:
            article = _fetch(href)
        except requests.RequestException as err:
            print('skipped %s: %s' % (href, err), file=sys.stderr)
            continue

        article_soup = bs4.BeautifulSoup(article.text, 'html.parser')
        headings = article_soup.find_all(attrs={'class': 'rich_media_title'})
        if not headings:
            print('skipped %s: no rich_media_title element' % href,
                  file=sys.stderr)
            continue

        # get_text() also works when the heading has several child nodes,
        # where .string would be None.
        title = _clean_title(headings[0].get_text())
        print(title)

        file_name = OUTPUT_DIR + title + '.html'
        with open(file_name, 'w', encoding='utf-8') as out_file:
            out_file.write(article.text)


if __name__ == "__main__":
    main()
效果如下