A Python crawler for Tiantian Fund Net (fund.eastmoney.com), based on the bs4 and re libraries

  • I spent one evening debugging and experimenting before I finally got this program working
  • Along the way I became more familiar with the BeautifulSoup library and got more practice with regular expressions (a minimal sketch of the core idea follows this list)

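The core technique is simple: BeautifulSoup locates the tags, and a short regular expression pulls the 6-digit fund code out of each row's id attribute. Below is a minimal, self-contained sketch of that idea; the HTML fragment and its ids are made up for illustration, not copied from the real page.

from bs4 import BeautifulSoup
import re

# Hypothetical fragment resembling two rows of the fund list page
sample_html = '<table><tr id="tr000001"><td>Fund A</td></tr><tr id="tr000002"><td>Fund B</td></tr></table>'

soup = BeautifulSoup(sample_html, 'html.parser')
codes = []
for tr in soup.find_all('tr'):
    row_id = tr.attrs.get('id', '')       # e.g. "tr000001"
    found = re.findall(r"\d{6}", row_id)  # keep only the 6-digit fund code
    if found:
        codes.append(found[0])

print(codes)  # ['000001', '000002']
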
import requests
from bs4 import BeautifulSoup
import bs4
import traceback
import re


def getHTMLText(url, code="utf-8"):
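    # Fetch the page with a browser-like User-Agent and return its text; return "" if the request fails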
    try:
        Headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
        }
        r = requests.get(url, headers=Headers, timeout=30)
        r.raise_for_status()
        r.encoding = code
        return r.text
    except requests.RequestException:
        # Return an empty string so callers can detect the failure and skip the page
        return ""


def getFundList(lst, fundURL):
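    # The list page is GB2312-encoded; each fund row is a <tr> whose id contains the 6-digit fund code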
    html = getHTMLText(fundURL, "GB2312")
    soup = BeautifulSoup(html, 'html.parser')
    tr = soup.find_all('tr')
    for i in tr:
        try:
            row_id = i.attrs['id']                       # rows without an id raise KeyError and are skipped
            lst.append(re.findall(r"\d{6}", row_id)[0])  # keep the 6-digit fund code
        except (KeyError, IndexError):
            continue


def getFundInfo(lst, fundURL, fpath):
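    # For each fund code, fetch its detail page, collect the name plus the <dt>/<dd> value triples, and append the record to fpath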
    count = 0
    for fund_code in lst:
        url = fundURL + fund_code + ".html"
        html = getHTMLText(url)
        try:
            if html == "":
                continue
            infoDict = {}
            soup = BeautifulSoup(html, 'html.parser')
            FundInfo = soup.find('div', attrs={'class': 'merchandiseDetail'})
            if not isinstance(FundInfo, bs4.element.Tag):
                continue  # detail block missing, skip this fund
            name = FundInfo.find_all(attrs={'class': "fundDetail-tit"})[0]
            infoDict.update({'基金名称': name.text})  # '基金名称' = fund name

            keyList = FundInfo.find_all('dt')
            valueList = FundInfo.find_all('dd')
            # Each <dt> is a field name; the three <dd> cells after it hold the values
            for i in range(len(keyList)):
                if i == 0:
                    # The first field is '净值估算' (estimated NAV) plus the next 16 characters of its heading
                    key = re.findall(r"^净值估算.{16}", keyList[i].text)[0]
                    # The estimate value ends with a signed percentage such as +0.52%
                    val = (re.findall(r"[+-]\d+\.\d+%$", valueList[3 * i].text)[0],
                           valueList[3 * i + 1].text, valueList[3 * i + 2].text)
                else:
                    key = keyList[i].text
                    val = (valueList[3 * i].text, valueList[3 * i + 1].text, valueList[3 * i + 2].text)
                infoDict[key] = val

            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(str(infoDict) + '\n')
                count = count + 1
                if count > 100:  # stop after the first 100 funds
                    break
                print("\rProgress: {:.2f}%".format(count * 100 / 100), end="")  # progress against the 100-fund cap
        except Exception:
            count = count + 1
            print("\rProgress: {:.2f}%".format(count * 100 / 100), end="")
            # traceback.print_exc()  # uncomment to see the full error
            continue


def main():
    Fund_list_url = 'https://fund.eastmoney.com/fund.html#os_0;isall_0;ft_;pt_1'
    Fund_info_url = 'https://fund.eastmoney.com/'
    output_file = 'D:/DayDayFund.txt'
    slist = []
    getFundList(slist, Fund_list_url)
    getFundInfo(slist, Fund_info_url, output_file)


if __name__ == '__main__':
    main()
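If you only want to smoke-test the detail scraping without crawling the full list, you can replace the main() call with something like the snippet below; the fund codes and output filename are placeholders chosen for illustration.

# Hypothetical quick test: scrape just two hard-coded codes into a local file
test_codes = ['000001', '000002']  # placeholder 6-digit fund codes
getFundInfo(test_codes, 'https://fund.eastmoney.com/', 'test_output.txt')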


  • A record of my learning

Origin blog.csdn.net/qq_39419113/article/details/105695305