python定向爬虫实例(二)

功能描述:先从东方财富网获取股票代码列表,再根据代码从百度股票爬取每只股票的详细信息,并将结果保存到文件中

程序设计:

  • 爬取东方财富网的股票信息,并将股票代码存在列表中
  • 根据股票代码列表,爬取百度股票的详细信息存在字典中
  • 将股票信息字典存在文件中
import re
# import traceback
import requests
import bs4
from bs4 import BeautifulSoup

def getHTMLText(url):
    """Fetch *url* and return the decoded response body.

    Args:
        url: Address of the page to download.

    Returns:
        The page text, or an empty string when the request fails
        (connection error, timeout, invalid URL, or an HTTP error status).
    """
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # raise_for_status() returns None on success, so it must not be used
        # as the encoding; use the encoding detected from the body instead.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return ""

def getStockList(stock_list_url,slt):
    """Collect stock codes from the listing page into *slt*.

    Downloads the listing page, looks at every ``<a target="_blank">`` link
    and appends the first ``sh``/``sz`` + six-digit code found in its
    ``href`` to *slt*. Links without an ``href`` or without a matching code
    are skipped.

    Args:
        stock_list_url: URL of the page that lists the stocks.
        slt: List that receives the codes; it is modified in place.

    Returns:
        Always an empty string; the result is delivered through *slt*.
    """
    demo = getHTMLText(stock_list_url)
    soup = BeautifulSoup(demo, "html.parser")
    for a in soup.find_all('a', attrs={'target': '_blank'}):
        href = a.attrs.get("href")
        # Anchors without an href cannot carry a stock code.
        if not isinstance(href, str):
            continue
        match = re.search(r'[s][hz]\d{6}', href)
        if match:
            slt.append(match.group(0))
    return ""

def getStockInfo(stock_info_url,slt,file_path):
    """Download the detail page of every stock and append it to a file.

    For each code in *slt* the page ``stock_info_url + code + '.html'`` is
    fetched and parsed into a dict (stock name plus every ``dt``/``dd`` pair
    of the ``bets-content`` block). Each dict is appended to *file_path* as
    one line. Pages that cannot be parsed (missing elements, or fewer ``dd``
    than ``dt`` entries) are skipped, but still count towards the progress
    display.

    Args:
        stock_info_url: Base URL of the per-stock detail pages.
        slt: List of stock codes to look up.
        file_path: Output file; records are appended, encoded as UTF-8.

    Returns:
        Always an empty string.

    Raises:
        OSError: If the output file cannot be opened or written.
    """
    count = 0
    total = len(slt)
    for stock in slt:
        url = stock_info_url + stock + '.html'
        info_dict = {}    # details of the current stock
        try:
            demo = getHTMLText(url)
            soup = BeautifulSoup(demo, 'html.parser')
            # .text joins all strings under the tag; the first token is the name.
            info_dict['股票名称'] = soup.find('a', 'bets-name').text.split()[0]
            info_div = soup.find('div', 'bets-content')
            dt_list = info_div.find_all('dt')  # keys
            dd_list = info_div.find_all('dd')  # values
            for i, dt in enumerate(dt_list):
                info_dict[dt.string] = dd_list[i].string
        except (AttributeError, IndexError):
            # find() returned None or the page layout was unexpected.
            info_dict = None
        if info_dict is not None:
            # Append per record so finished records survive an interruption;
            # explicit UTF-8 avoids losing records on non-UTF-8 locales.
            with open(file_path, "a", encoding="utf-8") as f:
                f.write(str(info_dict) + '\n')
        # Count skipped stocks too, so the display can reach 100%.
        count += 1
        print('\r当前进度:{:.2f}%'.format(count * 100 / total), end='')
    return ""

def main():
    """Run the crawl: gather the stock codes, then save each stock's details."""
    output_file = "D://股票爬虫.txt"
    list_page = 'http://quote.eastmoney.com/stock_list.html'  # East Money listing
    detail_base = 'https://gupiao.baidu.com/stock/'  # Baidu Stocks detail pages
    codes = []  # filled in place by getStockList
    getStockList(list_page, codes)
    getStockInfo(detail_base, codes, output_file)


main()

效果显示:

猜你喜欢

转载自www.cnblogs.com/BUPT-MrWu/p/11315920.html