导入鸢尾属植物数据集,保持文本不变
import numpy as np
# Record layout for one CSV row: four float64 measurements followed by the
# species label stored as a unicode string of up to 30 characters.
iris_type = np.dtype(
    [
        ("sepallength", "f8"),
        ("sepalwidth", "f8"),
        ("petallength", "f8"),
        ("petalwidth", "f8"),
        ("species", "U30"),
    ]
)
# skiprows=1 skips the first line of the file (presumably a header row).
iris_data = np.loadtxt("iris.csv", dtype=iris_type, delimiter=",", skiprows=1)
求出鸢尾属植物萼片长度的平均值、中位数和标准差(第1列,sepallength)
# Summary statistics of the sepal-length field, each rounded to 3 decimals.
sepal_length = iris_data["sepallength"]
mean_value = round(np.mean(sepal_length), 3)
median_value = round(np.median(sepal_length), 3)
std_value = round(np.std(sepal_length), 3)  # np.std with its default arguments

print("鸢尾属植物萼片长度的平均值为", mean_value, "cm")
print("鸢尾属植物萼片长度的中位数为", median_value, "cm")
print("鸢尾属植物萼片长度的标准差为", std_value)
鸢尾属植物萼片长度的平均值为 5.843 cm
鸢尾属植物萼片长度的中位数为 5.8 cm
鸢尾属植物萼片长度的标准差为 0.825
创建一种标准化形式的鸢尾属植物萼片长度,其值正好介于0和1之间,这样最小值为0,最大值为1(第1列,sepallength)
# Summarise arrays with more than 20 elements when they are printed.
np.set_printoptions(threshold=20)

# Min-max scaling: (x - min) / (max - min) maps the smallest sepal length to
# exactly 0 and the largest to exactly 1. A z-score, (x - mean) / std, is NOT
# bounded by [0, 1] and yields negative values, so it does not meet the goal
# stated in the heading above.
sepal_length = iris_data["sepallength"]
sepal_min = np.min(sepal_length)
sepal_max = np.max(sepal_length)
(sepal_length - sepal_min) / (sepal_max - sepal_min)
array([0.22222222, 0.16666667, 0.11111111, ..., 0.61111111, 0.52777778,
0.44444444])
找到鸢尾属植物萼片长度的第5和第95百分位数(第1列,sepallength)
# 5th and 95th percentiles of sepal length, expressed as quantile fractions.
sepal_length = iris_data["sepallength"]
tail_fractions = [0.05, 0.95]
np.quantile(sepal_length, tail_fractions)
array([4.6 , 7.255])
把iris_data数据集中的20个随机位置修改为np.nan值
names = ["sepallength", "sepalwidth", "petallength", "petalwidth", "species"]

# Treat the table as a flat grid of (rows x fields) cells and draw 20 cell
# positions WITHOUT replacement, so exactly 20 distinct cells are overwritten.
# (randint samples with replacement and may pick the same cell twice.)
n_rows = iris_data.shape[0]
n_missing = 20
index = np.random.choice(n_rows * len(names), size=n_missing, replace=False)
index_col = index // n_rows  # which field the flat position falls in
index_row = index % n_rows   # which record within that field
index = np.vstack([index_row, index_col]).transpose()  # (row, col) pairs

for row, col in index:
    # Field access on a structured array returns a view, so this writes
    # through to iris_data. For the string-typed "species" field the
    # assigned np.nan is stored as the text "nan".
    iris_data[names[col]][row] = np.nan
print(iris_data)
[(5.1, 3.5, 1.4, 0.2, 'Iris-setosa') (4.9, 3. , 1.4, 0.2, 'Iris-setosa')
(4.7, 3.2, 1.3, 0.2, 'Iris-setosa') ...
(6.5, 3. , 5.2, nan, 'Iris-virginica')
(6.2, 3.4, 5.4, 2.3, 'Iris-virginica')
(5.9, 3. , 5.1, 1.8, 'Iris-virginica')]
在iris_data的sepallength中查找缺失值的个数和位置(第1列)
# Build the NaN mask once and reuse it for both the count and the positions.
sepal_is_missing = np.isnan(iris_data["sepallength"])
missing_total = np.sum(sepal_is_missing)
missing_positions = np.where(sepal_is_missing)
print("缺失值的个数为", missing_total)
print("缺失值的位置为", missing_positions)
缺失值的个数为 3
缺失值的位置为 (array([ 3, 79, 84], dtype=int64),)
筛选具有 sepallength(第1列)< 5.0 并且 petallength(第3列)> 1.5 的 iris_data行
# Row indices where sepallength < 5 AND petallength > 1.5.
# Comparisons against NaN evaluate to False, so rows with a missing value in
# either field are never selected.
short_sepal = iris_data["sepallength"] < 5
long_petal = iris_data["petallength"] > 1.5
np.where(short_sepal & long_petal)[0]
<ipython-input-7-af47433621a2>:1: RuntimeWarning: invalid value encountered in less
np.intersect1d(np.where(iris_data["sepallength"]<5),
<ipython-input-7-af47433621a2>:2: RuntimeWarning: invalid value encountered in greater
np.where(iris_data["petallength"]>1.5))
array([ 24, 29, 30, 57, 106], dtype=int64)
选择没有任何 nan 值的 iris_data行
# Count the missing entries in every row, then keep only the rows whose count
# is zero. The accumulator is sized from the data rather than a fixed 150, and
# the fields come from the array's own dtype.
temp = np.zeros(iris_data.shape[0])
for name in iris_data.dtype.names:
    column = iris_data[name]
    if column.dtype.kind == "f":
        # Floating-point fields mark missing values with NaN.
        temp += np.isnan(column)
    else:
        # Text fields hold the literal string "nan" for missing values.
        temp += column == "nan"
iris_data_new = iris_data[temp == 0]
print(iris_data_new)
[(5.1, 3.5, 1.4, 0.2, 'Iris-setosa') (4.9, 3. , 1.4, 0.2, 'Iris-setosa')
(4.7, 3.2, 1.3, 0.2, 'Iris-setosa') ...
(6.7, 3.3, 5.7, 2.5, 'Iris-virginica')
(6.2, 3.4, 5.4, 2.3, 'Iris-virginica')
(5.9, 3. , 5.1, 1.8, 'Iris-virginica')]
计算 iris_data 中 sepallength(第1列)和 petallength(第3列)之间的相关系数(使用已剔除缺失值的行)
# Correlation matrix of sepal length vs. petal length, computed on
# iris_data_new; the off-diagonal entries are the correlation coefficient.
sepal_clean = iris_data_new["sepallength"]
petal_clean = iris_data_new["petallength"]
np.corrcoef(sepal_clean, petal_clean)
array([[1. , 0.86547632],
[0.86547632, 1. ]])
找出iris_data是否有任何缺失值
# Per-row tally of missing entries across all fields; a non-zero total means
# the dataset contains missing values. The accumulator is sized from the data
# rather than a fixed 150, and the fields come from the array's own dtype.
temp = np.zeros(iris_data.shape[0])
for name in iris_data.dtype.names:
    column = iris_data[name]
    if column.dtype.kind == "f":
        # Floating-point fields mark missing values with NaN.
        temp += np.isnan(column)
    else:
        # Text fields hold the literal string "nan" for missing values.
        temp += column == "nan"
print("缺失值个数为", np.sum(temp))
缺失值个数为 20.0
在numpy数组中将所有出现的nan替换为0
# Overwrite every missing entry with 0, field by field.
for name in names:
    column = iris_data[name]  # a view: writes go through to iris_data
    if column.dtype == "float":
        missing = np.isnan(column)
    else:
        # Text field: missing entries are the literal string "nan"; the
        # assigned 0 is stored as text.
        missing = column == "nan"
    column[missing] = 0
iris_data
array([(5.1, 3.5, 1.4, 0.2, 'Iris-setosa'),
(4.9, 3. , 1.4, 0.2, 'Iris-setosa'),
(4.7, 3.2, 1.3, 0.2, 'Iris-setosa'), ...,
(6.5, 3. , 5.2, 0. , 'Iris-virginica'),
(6.2, 3.4, 5.4, 2.3, 'Iris-virginica'),
(5.9, 3. , 5.1, 1.8, 'Iris-virginica')],
dtype=[('sepallength', '<f8'), ('sepalwidth', '<f8'), ('petallength', '<f8'), ('petalwidth', '<f8'), ('species', '<U30')])
找出鸢尾属植物物种中的唯一值和唯一值出现的数量
# Distinct species labels together with how many times each one occurs.
species_labels = iris_data["species"].astype(str)
np.unique(species_labels, return_counts=True)
(array(['0', 'Iris-setosa', 'Iris-versicolor', 'Iris-virginica'],
dtype='<U30'),
array([ 5, 48, 50, 47], dtype=int64))
将 iris_data 的花瓣长度(第3列)转换为分类变量。定义:< 3 --> 'small';3 ≤ x < 5 --> 'medium';≥ 5 --> 'large'
# np.digitize returns i such that bin_edges[i-1] <= x < bin_edges[i]:
# 1 for [0, 3), 2 for [3, 5) and 3 for values >= 5.
bin_edges = [0, 3, 5]
petallength_bin = np.digitize(iris_data["petallength"], bin_edges)
label_map = dict(enumerate(("small", "medium", "large"), start=1))
# A value below 0 would fall in bin 0, which has no label (KeyError).
petallength_cat = [label_map[bin_id] for bin_id in petallength_bin]
在 iris_data 中创建一个新列,其中 volume 是 (pi x petallength x sepallength ^ 2)/ 3
# Reload the raw table as a 2-D object array, skipping the first line.
iris_data = np.genfromtxt("iris.csv", delimiter=",", dtype="object", skip_header=1)
sepallength = iris_data[:, 0].astype("float")
petallength = iris_data[:, 2].astype("float")

# volume = (pi * petallength * sepallength^2) / 3, reshaped into a single
# column so it can be stacked next to the existing columns.
new = np.pi * petallength * sepallength ** 2 / 3
new = new.reshape(-1, 1)

# The stacked table is only displayed; iris_data itself is left unchanged.
np.hstack([iris_data, new])
array([[b'5.1 ', b'3.5 ', b'1.4 ', b'0.2 ', b'Iris-setosa',
38.13265162927291],
[b'4.9 ', b'3.0 ', b'1.4 ', b'0.2 ', b'Iris-setosa',
35.200498485922445],
[b'4.7 ', b'3.2 ', b'1.3 ', b'0.2 ', b'Iris-setosa',
30.0723720777127],
...,
[b'6.5 ', b'3.0 ', b'5.2 ', b'2.0 ', b'Iris-virginica',
230.0693019978925],
[b'6.2 ', b'3.4 ', b'5.4 ', b'2.3 ', b'Iris-virginica',
217.373078887185],
[b'5.9 ', b'3.0 ', b'5.1 ', b'1.8 ', b'Iris-virginica',
185.9100284614832]], dtype=object)
随机抽鸢尾属植物的种类,使得Iris-setosa的数量是Iris-versicolor和Iris-virginica数量的两倍
species = iris_data[:, 4]

# One threshold per row. The first 50 thresholds span [0, 0.5], the next 50
# span [0.501, 0.75] and the last 50 span [0.751, 1.0], so a uniform draw
# lands in the first group about half of the time and in each of the other
# two about a quarter of the time.
# NOTE(review): assumes the rows are ordered as 50 setosa, 50 versicolor,
# 50 virginica - confirm against the CSV.
first_group = np.linspace(0, 0.500, num=50)
second_group = np.linspace(0.501, .750, num=50)
third_group = np.linspace(.751, 1.0, num=50)
probs = np.r_[first_group, second_group, third_group]

draws = np.random.random(150)
index = np.searchsorted(probs, draws)
species_out = species[index]
print(np.unique(species_out, return_counts=True))
(array([b'Iris-setosa', b'Iris-versicolor', b'Iris-virginica'],
dtype=object), array([71, 41, 38], dtype=int64))
根据 sepallength 列对数据集进行排序
# iris_data holds raw byte strings here, so sorting the column directly would
# order it lexicographically (e.g. b'10.1' before b'9.0'). Convert to float
# first so the rows are ordered by numeric sepal length.
sepal_order = iris_data[:, 0].astype(float).argsort()
print(iris_data[sepal_order][:20])
[[b'4.3 ' b'3.0 ' b'1.1 ' b'0.1 ' b'Iris-setosa']
[b'4.4 ' b'3.2 ' b'1.3 ' b'0.2 ' b'Iris-setosa']
[b'4.4 ' b'3.0 ' b'1.3 ' b'0.2 ' b'Iris-setosa']
...
[b'4.9 ' b'2.5 ' b'4.5 ' b'1.7 ' b'Iris-virginica']
[b'4.9 ' b'3.1 ' b'1.5 ' b'0.1 ' b'Iris-setosa']
[b'4.9 ' b'3.1 ' b'1.5 ' b'0.1 ' b'Iris-setosa']]
在鸢尾属植物数据集中找到最常见的花瓣长度值(第3列)
# Most frequent petal-length value; if several values tie, the first one in
# np.unique's sorted output is reported.
petal_lengths = iris_data[:, 2]
vals, counts = np.unique(petal_lengths, return_counts=True)
most_common_pos = np.argmax(counts)
print(vals[most_common_pos])
b'1.5 '
在鸢尾花数据集的 petalwidth(第4列)中查找第一次出现的值大于1.0的位置
# Position of the first row whose petal width exceeds 1.0.
# Raises IndexError if no row satisfies the condition.
wide_petal = iris_data[:, 3].astype(float) > 1.0
np.argwhere(wide_petal)[0]
array([50], dtype=int64)