- I spent one night debugging and experimenting before I finally finished adapting this program
- In the process I became more familiar with the BeautifulSoup library and also got some practice with regular expressions
import requests
from bs4 import BeautifulSoup
import bs4
import traceback
import re
def getHTMLText(url, code="utf-8"):
    """Download a page and return its body decoded as text.

    Args:
        url: Address of the page to fetch.
        code: Encoding applied to the response before reading its text.

    Returns:
        The decoded response body, or the string "网页访问失败" when the
        request fails (connection problem, timeout, invalid URL, or an
        HTTP error status reported by ``raise_for_status``).
    """
    # Send a browser-like User-Agent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
    }
    try:
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()
    except requests.RequestException:
        # Only request failures map to the sentinel; KeyboardInterrupt and
        # genuine programming errors are allowed to propagate.
        return "网页访问失败"
    r.encoding = code
    return r.text
def getFundList(lst, fundURL):
    """Append the fund codes found on the listing page to ``lst``.

    The page at ``fundURL`` is fetched with GB2312 decoding and every
    ``<tr>`` element is inspected. For each row that has an ``id``
    attribute containing a run of six digits, the first such run is
    appended to ``lst``. Rows without a matching ``id`` are skipped.

    Args:
        lst: List that receives the six-digit code strings (mutated in place).
        fundURL: Address of the fund listing page.
    """
    html = getHTMLText(fundURL, "GB2312")
    soup = BeautifulSoup(html, 'html.parser')
    for row in soup.find_all('tr'):
        row_id = row.get('id')
        if not isinstance(row_id, str):
            # No usable id attribute on this row.
            continue
        match = re.search(r"\d{6}", row_id)
        if match is not None:
            lst.append(match.group(0))
def _parse_fund_info(html):
    """Extract one fund's details from the HTML of its detail page.

    Returns:
        A dict with the fund name under '基金名称' plus one entry per
        ``<dt>`` label inside the ``merchandiseDetail`` container. Each
        label maps to a 3-tuple taken from the ``<dd>`` elements at
        positions ``3*i``, ``3*i + 1`` and ``3*i + 2``. The dict is empty
        when the container is not present.

    Raises:
        IndexError: If the title element, a ``<dd>`` element, or one of
            the regex matches on the first entry is missing.
    """
    info = {}
    soup = BeautifulSoup(html, 'html.parser')
    detail = soup.find('div', attrs={'class': 'merchandiseDetail'})
    if not isinstance(detail, bs4.element.Tag):
        return info

    info['基金名称'] = detail.find_all(attrs={'class': "fundDetail-tit"})[0].text
    labels = detail.find_all('dt')
    values = detail.find_all('dd')
    for i, label in enumerate(labels):
        base = 3 * i
        first = values[base].text
        second = values[base + 1].text
        third = values[base + 2].text
        if i == 0:
            # The first entry is trimmed with regexes: the label keeps
            # "净值估算" plus the next 16 characters, and the first value
            # keeps only the trailing signed percentage.
            # NOTE(review): the '.' in the percentage pattern is unescaped
            # and only one digit is allowed before it - confirm this is
            # the intended format.
            key = re.findall(r"^净值估算.{16}", label.text)[0]
            first = re.findall(r"[\+\-]\d.\d\d+%$", first)[0]
        else:
            key = label.text
        info[key] = (first, second, third)
    return info


def getFundInfo(lst, fundURL, fpath, max_funds=100):
    """Fetch the detail page of each fund and append its data to a file.

    For every code in ``lst`` the page ``fundURL + code + ".html"`` is
    downloaded and parsed, and the resulting dict is appended to ``fpath``
    as one ``str(dict)`` line. Progress is printed on a single console
    line as a percentage of ``max_funds``.

    Args:
        lst: Iterable of fund code strings.
        fundURL: Base address that the code and ".html" are appended to.
        fpath: Path of the UTF-8 text file that records are appended to.
        max_funds: Maximum number of downloaded pages to process.
    """
    count = 0
    for fund_code in lst:
        if count >= max_funds:
            break
        html = getHTMLText(fundURL + fund_code + ".html")
        # getHTMLText reports a failed download with this sentinel string
        # rather than an empty string, so both are treated as "no page".
        if html in ("", "网页访问失败"):
            continue
        try:
            info = _parse_fund_info(html)
            if info:
                # The file is reopened in append mode for every record.
                with open(fpath, 'a', encoding='utf-8') as f:
                    f.write(str(info) + '\n')
        except Exception:
            # Deliberate best-effort: a page with an unexpected layout or
            # a failed write is skipped so the remaining funds are still
            # processed. KeyboardInterrupt is not caught.
            pass
        count += 1
        print("\r当前进度: {:.2f}%".format(count * 100 / max_funds), end="")
def main():
    """Collect fund codes from the listing page, then save each fund's details.

    The codes gathered by ``getFundList`` are passed to ``getFundInfo``,
    which appends its output to 'D:/DayDayFund.txt'.
    """
    codes = []
    listing_page = 'https://fund.eastmoney.com/fund.html#os_0;isall_0;ft_;pt_1'
    detail_base = 'https://fund.eastmoney.com/'
    destination = 'D:/DayDayFund.txt'

    getFundList(codes, listing_page)
    getFundInfo(codes, detail_base, destination)
# Script entry point: start the scrape.
# NOTE(review): there is no `if __name__ == "__main__":` guard, so this call
# also runs whenever the file is imported as a module.
main()