版权声明:本博客内容为学习过程中的一些记录,大部分转载内容已标明转载,部分内容如有因疏忽未注明请与我联系,转载原创内容请注明链接,谢谢! https://blog.csdn.net/xiaodongxiexie/article/details/79231946
我在github
上建了个repo
DataSetForMachineLearning,用来存放各种数据集,如果有需要,欢迎star
。
进行机器学习时,有时候需要一些数据做练手,数据从何而来呢,可以充分利用一些库,像sklearn
,seaborn
都是自带一些数据的(如常见的 iris 鸢尾花数据集、titanic 泰坦尼克号数据集等
),可以通过如下方式获取:
sklearn
In [80]: from sklearn import datasets
In [81]: list(filter(lambda x: 'load' in x, dir(datasets)))
Out[81]:
['__loader__',
'load_boston',
'load_breast_cancer',
'load_diabetes',
'load_digits',
'load_files',
'load_iris',
'load_linnerud',
'load_mlcomp',
'load_sample_image',
'load_sample_images',
'load_svmlight_file',
'load_svmlight_files',
'load_wine']
使用方法如下:
In [90]: wine = datasets.load_wine()
In [91]: wine.data.shape
Out[91]: (178, 13)
In [92]: wine.data[:10]
Out[92]:
array([[ 1.42300000e+01, 1.71000000e+00, 2.43000000e+00, ...,
1.04000000e+00, 3.92000000e+00, 1.06500000e+03],
[ 1.32000000e+01, 1.78000000e+00, 2.14000000e+00, ...,
1.05000000e+00, 3.40000000e+00, 1.05000000e+03],
[ 1.31600000e+01, 2.36000000e+00, 2.67000000e+00, ...,
1.03000000e+00, 3.17000000e+00, 1.18500000e+03],
...,
[ 1.40600000e+01, 2.15000000e+00, 2.61000000e+00, ...,
1.06000000e+00, 3.58000000e+00, 1.29500000e+03],
[ 1.48300000e+01, 1.64000000e+00, 2.17000000e+00, ...,
1.08000000e+00, 2.85000000e+00, 1.04500000e+03],
[ 1.38600000e+01, 1.35000000e+00, 2.27000000e+00, ...,
1.01000000e+00, 3.55000000e+00, 1.04500000e+03]])
In [94]: wine.keys()
Out[94]: dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names'])
In [95]: wine.feature_names
Out[95]:
['alcohol',
'malic_acid',
'ash',
'alcalinity_of_ash',
'magnesium',
'total_phenols',
'flavanoids',
'nonflavanoid_phenols',
'proanthocyanins',
'color_intensity',
'hue',
'od280/od315_of_diluted_wines',
'proline']
seaborn
In [96]: import seaborn as sns
In [97]: sns.get_dataset_names()
E:\Anaconda\lib\site-packages\bs4\__init__.py:181: UserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system ("lxml"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.
The code that caused this warning is on line 231 of the file E:\Anaconda\lib\site-packages\spyder\utils\ipython\start_kernel.py. To get rid of this warning, change code that looks like this:
BeautifulSoup([your markup])
to this:
BeautifulSoup([your markup], "lxml")
markup_type=markup_type))
Out[97]:
['anscombe',
'attention',
'brain_networks',
'car_crashes',
'dots',
'exercise',
'flights',
'fmri',
'gammas',
'iris',
'planets',
'tips',
'titanic']
In [98]: tips = sns.load_dataset('tips')
In [99]: tips.head()
Out[99]:
total_bill tip sex smoker day time size
0 16.99 1.01 Female No Sun Dinner 2
1 10.34 1.66 Male No Sun Dinner 3
2 21.01 3.50 Male No Sun Dinner 3
3 23.68 3.31 Male No Sun Dinner 2
4 24.59 3.61 Female No Sun Dinner 4
In [100]: flights = sns.load_dataset('flights')
In [101]: flights.head()
Out[101]:
year month passengers
0 1949 January 112
1 1949 February 118
2 1949 March 132
3 1949 April 129
4 1949 May 121
In [102]: titanic = sns.load_dataset('titanic')
In [103]: titanic.head()
Out[103]:
survived pclass sex age sibsp parch fare embarked class \
0 0 3 male 22.0 1 0 7.2500 S Third
1 1 1 female 38.0 1 0 71.2833 C First
2 1 3 female 26.0 0 0 7.9250 S Third
3 1 1 female 35.0 1 0 53.1000 S First
4 0 3 male 35.0 0 0 8.0500 S Third
who adult_male deck embark_town alive alone
0 man True NaN Southampton no False
1 woman False C Cherbourg yes False
2 woman False NaN Southampton yes True
3 woman False C Southampton yes False
4 man True NaN Southampton no True
altair
In [52]: import altair as alt
In [53]: cars = alt.load_dataset('cars')
In [63]: alt.list_datasets()
Out[63]:
['airports',
'anscombe',
'barley',
'birdstrikes',
'budget',
'budgets',
'burtin',
'cars',
'climate',
'countries',
'crimea',
'driving',
'flare',
'flights-10k',
'flights-20k',
'flights-2k',
'flights-3m',
'flights-5k',
'flights-airport',
'gapminder',
'gapminder-health-income',
'github',
'iris',
'jobs',
'miserables',
'monarchs',
'movies',
'points',
'population',
'seattle-temps',
'seattle-weather',
'sf-temps',
'sp500',
'stocks',
'unemployment-across-industries',
'us-10m',
'weather',
'weball26',
'wheat',
'world-110m']
这个包不常见,如果不想安装这个包可以通过如下代码使用其数据
import pandas as pd

# Python 2 / Python 3 compatibility shim for urllib names.
# NOTE(review): URLError, HTTPError and urlopen are not used in the visible
# code below — presumably kept for callers elsewhere; confirm before removing.
try:
    from urllib.error import URLError, HTTPError
    from urllib.request import urlopen
except ImportError:
    # Python 2.X
    from urllib2 import URLError, HTTPError, urlopen

try:
    from functools import lru_cache
except ImportError:
    # Python 2.X: function not available — substitute a no-op decorator
    # factory that accepts the same arguments and returns the function
    # unchanged (i.e. no caching on Python 2).
    lru_cache = lambda maxsize=128, typed=False: (lambda y: y)
all_data_name = {
"airports": {
"filename": "airports.csv",
"format": "csv"
},
"anscombe": {
"filename": "anscombe.json",
"format": "json"
},
"barley": {
"filename": "barley.json",
"format": "json"
},
"birdstrikes": {
"filename": "birdstrikes.json",
"format": "json"
},
"budget": {
"filename": "budget.json",
"format": "json"
},
"budgets": {
"filename": "budgets.json",
"format": "json"
},
"burtin": {
"filename": "burtin.json",
"format": "json"
},
"cars": {
"filename": "cars.json",
"format": "json"
},
"climate": {
"filename": "climate.json",
"format": "json"
},
"countries": {
"filename": "countries.json",
"format": "json"
},
"crimea": {
"filename": "crimea.json",
"format": "json"
},
"driving": {
"filename": "driving.json",
"format": "json"
},
"flare": {
"filename": "flare.json",
"format": "json"
},
"flights-10k": {
"filename": "flights-10k.json",
"format": "json"
},
"flights-20k": {
"filename": "flights-20k.json",
"format": "json"
},
"flights-2k": {
"filename": "flights-2k.json",
"format": "json"
},
"flights-3m": {
"filename": "flights-3m.csv",
"format": "csv"
},
"flights-5k": {
"filename": "flights-5k.json",
"format": "json"
},
"flights-airport": {
"filename": "flights-airport.csv",
"format": "csv"
},
"gapminder": {
"filename": "gapminder.json",
"format": "json"
},
"gapminder-health-income": {
"filename": "gapminder-health-income.csv",
"format": "csv"
},
"github": {
"filename": "github.csv",
"format": "csv"
},
"iris": {
"filename": "iris.json",
"format": "json"
},
"jobs": {
"filename": "jobs.json",
"format": "json"
},
"miserables": {
"filename": "miserables.json",
"format": "json"
},
"monarchs": {
"filename": "monarchs.json",
"format": "json"
},
"movies": {
"filename": "movies.json",
"format": "json"
},
"points": {
"filename": "points.json",
"format": "json"
},
"population": {
"filename": "population.json",
"format": "json"
},
"seattle-temps": {
"filename": "seattle-temps.csv",
"format": "csv"
},
"seattle-weather": {
"filename": "seattle-weather.csv",
"format": "csv"
},
"sf-temps": {
"filename": "sf-temps.csv",
"format": "csv"
},
"sp500": {
"filename": "sp500.csv",
"format": "csv"
},
"stocks": {
"filename": "stocks.csv",
"format": "csv"
},
"unemployment-across-industries": {
"filename": "unemployment-across-industries.json",
"format": "json"
},
"us-10m": {
"filename": "us-10m.json",
"format": "json"
},
"weather": {
"filename": "weather.json",
"format": "json"
},
"weball26": {
"filename": "weball26.json",
"format": "json"
},
"wheat": {
"filename": "wheat.json",
"format": "json"
},
"world-110m": {
"filename": "world-110m.json",
"format": "json"
}
}
# Root URL of the vega-datasets CDN; a dataset's filename is appended to it.
BASE_URL = 'https://vega.github.io/vega-datasets/data/'


# NOTE(review): lru_cache memoizes the returned DataFrame, so repeated calls
# hand back the *same* object — a caller that mutates it will see the
# mutation on subsequent loads. Confirm that is acceptable for this use.
@lru_cache()
def load_dataset(name, url_only=False):
    """Load a dataset by name as a pandas.DataFrame.

    name: key into ``all_data_name`` (see ``list_datasets`` in the altair
        package for the available names).
    url_only: when True, return the dataset's URL instead of fetching it.

    Raises ValueError for an unknown name or an unsupported file format.
    """
    item = all_data_name.get(name)
    if item is None:
        raise ValueError('No such dataset {0} exists, '
                         'use list_datasets to get a list'.format(name))

    url = BASE_URL + item['filename']
    if url_only:
        return url

    # Dispatch table instead of an if/elif chain; each entry maps a format
    # tag to the pandas reader that parses it.
    readers = {
        'json': pd.read_json,
        'tsv': lambda target: pd.read_csv(target, sep='\t'),
        'csv': pd.read_csv,
    }
    reader = readers.get(item['format'])
    if reader is None:
        raise ValueError("Unrecognized file format: {0}. "
                         "Valid options are ['json', 'csv', 'tsv']."
                         "".format(item['format']))
    return reader(url)