引言:由于构建神经网络的训练集与测试集需要大量数据,因此学习了网络爬虫,以便日后的数据搜集。下面是实现的代码框架,具体应用时可以稍作修改。
首先是淘宝定向商品信息抓取:
import requests
import re
import urllib.parse  # fix: `import urllib` alone does not make urllib.request/parse usable


def get_html(url):
    """Fetch *url* and return the decoded page text, or "" on any failure."""
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        # Use the detected encoding so Chinese page content decodes correctly.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return ""


def parse_page(infolist, html):
    """Extract [price, title] pairs from a Taobao search page into *infolist*.

    Prices and titles are appended as strings, matching the page's JSON text.
    """
    try:
        plt = re.findall(r'\"view_price\"\:\"[\d.]*\"', html)
        tlt = re.findall(r'\"raw_title\"\:\".*?\"', html)
        for price_raw, title_raw in zip(plt, tlt):
            # Strip the surrounding quotes instead of eval():
            # eval() on scraped text is a code-injection risk.
            price = price_raw.split(":")[1].strip('"')
            title = title_raw.split(":")[1].strip('"')
            infolist.append([price, title])
    except (IndexError, TypeError):
        # Malformed page fragment; keep whatever was parsed so far.
        pass


def print_list(infolist):
    """Pretty-print the collected [price, title] rows as a numbered table."""
    tplt = "{0:<6}\t{1:8}\t{2:16}"
    print(tplt.format("序号", "价格", "商品名称"))
    for count, (price, title) in enumerate(infolist, start=1):
        print(tplt.format(count, price, title))


def main():
    goods = "手机"  # search keyword
    depth = 5       # number of result pages to crawl (44 items per page)
    # fix: urllib.parse.quote — `urllib.request.quote` raised AttributeError
    # because only the top-level `urllib` package was imported.
    start_url = "https://s.taobao.com/search?q=" + urllib.parse.quote(goods)
    infolist = []
    for i in range(depth):
        try:
            url = start_url + "&s=" + str(44 * i)
            parse_page(infolist, get_html(url))
        except Exception:
            # Best-effort crawl: skip a page that fails, keep going.
            continue
    print_list(infolist)


# Guard so importing this module does not trigger a crawl.
if __name__ == "__main__":
    main()
其次是股票信息的抓取:
import re
import traceback

import requests
from bs4 import BeautifulSoup


def get_html(url):
    """Fetch *url* and return the decoded page text, or "" on any failure."""
    try:
        r = requests.get(url)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # fix: return "" (the original printed and fell through, returning
        # None, which crashed BeautifulSoup and the `html == ""` checks).
        return ""


def get_stock_list(lst, stock_url):
    """Append every stock code (sh/sz + 6 digits) linked from *stock_url* to *lst*."""
    html = get_html(stock_url)
    soup = BeautifulSoup(html, "html.parser")
    for anchor in soup.find_all("a"):
        try:
            href = anchor.attrs["href"]
            lst.append(re.findall(r"[s][hz]\d{6}", href)[0])
        except (KeyError, IndexError):
            # Anchor without an href, or href without a stock code.
            continue


def get_stock_info(lst, stock_url, fpath):
    """Scrape one detail page per code in *lst* and append a dict per stock to *fpath*.

    Prints a running progress percentage to stdout.
    """
    count = 0
    for stock in lst:
        url = stock_url + stock + ".html"
        html = get_html(url)
        try:
            if html == "":
                continue
            info_dict = {}
            soup = BeautifulSoup(html, "html.parser")
            stock_info = soup.find("div", attrs={"class": "stock-bets"})
            name = stock_info.find_all(attrs={"class": "bets-name"})[0]
            info_dict.update({"股票名称": name.text.split()[0]})
            key_list = stock_info.find_all("dt")
            value_list = stock_info.find_all("dd")
            # fix: the original stored `value_list[i].text` in a str and then
            # accessed `.text` on it again — an AttributeError on every row.
            for key_tag, value_tag in zip(key_list, value_list):
                info_dict[key_tag.text] = value_tag.text
            with open(fpath, "a", encoding="utf-8") as f:
                f.write(str(info_dict) + "\n")
            count += 1
            print("\r当前进度:{:.2f}%".format(count * 100 / len(lst)), end="")
        except Exception:
            # Log the failing page and continue with the next stock.
            traceback.print_exc()
            continue


def main():
    lst = []
    stock_list_url = 'http://quote.eastmoney.com/stocklist.html'
    stock_info_url = 'http://gupiao.baidu.com/stock/'
    output_file = 'D:/StockInfo.txt'
    get_stock_list(lst, stock_list_url)
    get_stock_info(lst, stock_info_url, output_file)


# Guard so importing this module does not trigger a crawl.
if __name__ == "__main__":
    main()