python数据科学（2）

1. Numpy 库

import numpy as np

# Create an array from a plain Python list
a_list = [1, 2, 3]
an_array = np.array(a_list)
print(an_array)

# Create the same array with an explicit element data type
an_array = np.array(a_list, dtype=float)
print(an_array)

# Create a matrix from a list of lists
# NOTE: np.matrix is kept because this section demonstrates it; the NumPy
# documentation recommends regular 2-D arrays (np.array) for new code.
a_listoflist = [[1, 2, 3], [5, 6, 7], [8, 9, 10]]
a_matrix = np.matrix(a_listoflist, dtype=float)
print(a_matrix)

def display_shape(a):
    """Print ``a`` followed by its element count, rank and shape.

    Args:
        a: Object exposing ``size``, ``ndim`` and ``shape`` attributes,
            such as a NumPy array or matrix.
    """
    summary = (
        "Number of elements in a = %d" % (a.size),
        "Number of dimensions in a = %d" % (a.ndim),
    )
    # One call emits: blank line, the array, blank line, then both counts.
    print("", a, "", *summary, sep="\n")
    print("Rows and Columns in a ", a.shape)
    print()
    
display_shape(a_matrix)

# np.arange: consecutive values starting at 1 and stopping before 10

created_array = np.arange(1, 10, dtype=float)
display_shape(created_array)

# np.linspace: 50 evenly spaced samples from 1 to 10, both ends included

created_array = np.linspace(1, 10, num=50)
display_shape(created_array)

# np.logspace: 50 samples spaced evenly on a log scale, 10**1 up to 10**10

created_array = np.logspace(1, 10, num=50, base=10.0)
display_shape(created_array)

# np.arange with an explicit step of 2
created_array = np.arange(1, 10, 2, dtype=int)
display_shape(created_array)

# Build a few special matrices

ones_matrix = np.ones(shape=(3, 3))
display_shape(ones_matrix)

zeros_matrix = np.zeros(shape=(3, 3))
display_shape(zeros_matrix)

# Identity-style matrices built with np.eye.
# The k argument selects which diagonal is filled with ones:
# with k = 0 the cells (0, 0), (1, 1), (2, 2) of a 3 x 3 matrix are set to 1;
# with k = 1 the ones move to the diagonal directly above the main one.
identity_mateix = np.eye(3, 3, 0)
display_shape(identity_mateix)

identity_mateix = np.eye(3, k=1)
display_shape(identity_mateix)

# Reshaping arrays: 9 consecutive integers laid out as a 3 x 3 grid
a_matrix = np.reshape(np.arange(9), (3, 3))
display_shape(a_matrix)

# A target shape of -1 collapses the grid back into one dimension
back_to_array = np.reshape(a_matrix, -1)
display_shape(back_to_array)

# ravel and flatten can both turn a matrix into a one-dimensional array
a_matrix = np.reshape(np.arange(9), (3, 3))
back_array = a_matrix.ravel()
display_shape(back_array)

a_matrix = np.reshape(np.arange(9), (3, 3))
back_array = a_matrix.flatten()
display_shape(back_array)

a_matrix = np.reshape(np.arange(9), (3, 3))
b_matrix = np.reshape(np.arange(9), (3, 3))

c_matrix = np.add(a_matrix, b_matrix)        # element-wise sum
d_matrix = np.multiply(a_matrix, b_matrix)   # element-wise product
e_matrix = a_matrix.dot(b_matrix)            # matrix product
f_matrix = np.transpose(e_matrix)

print()
print("f_matrix, minimum = %d" % np.min(f_matrix))
print("f_matrix, maximum = %d" % np.max(f_matrix))
print("f_matrix, col sum", np.sum(f_matrix, axis=0))
print("f_matrix, row sum", np.sum(f_matrix, axis=1))

# Show the matrix with its rows in reverse order
display_shape(f_matrix[::-1, :])

# Assignment in Python only binds another name to the same object;
# call copy() when an independent duplicate is needed.
f_copy = f_matrix.copy()

# mgrid: build three 3 x 3 x 3 coordinate grids and flatten each to 1-D
xx, yy, zz = (axis.flatten() for axis in np.mgrid[0:3, 0:3, 0:3])

# Random numbers: ten integers taken from [1, 100)
general_random_numbers = np.random.randint(low=1, high=100, size=10)
print(general_random_numbers)

# NOTE: despite the variable's name, these ten samples come from a normal
# distribution (mean 0.2, standard deviation 0.2), not a uniform one.
uniform_rnd_numbers = np.random.normal(0.2, 0.2, 10)
print(uniform_rnd_numbers)

[1 2 3]
[1. 2. 3.]
[[ 1.  2.  3.]
 [ 5.  6.  7.]
 [ 8.  9. 10.]]

[[ 1.  2.  3.]
 [ 5.  6.  7.]
 [ 8.  9. 10.]]

Number of elements in a = 9
Number of dimensions in a = 2
Rows and Columns in a  (3, 3)


[1. 2. 3. 4. 5. 6. 7. 8. 9.]

Number of elements in a = 9
Number of dimensions in a = 1
Rows and Columns in a  (9,)


[ 1.          1.18367347  1.36734694  1.55102041  1.73469388  1.91836735
  2.10204082  2.28571429  2.46938776  2.65306122  2.83673469  3.02040816
  3.20408163  3.3877551   3.57142857  3.75510204  3.93877551  4.12244898
  4.30612245  4.48979592  4.67346939  4.85714286  5.04081633  5.2244898
  5.40816327  5.59183673  5.7755102   5.95918367  6.14285714  6.32653061
  6.51020408  6.69387755  6.87755102  7.06122449  7.24489796  7.42857143
  7.6122449   7.79591837  7.97959184  8.16326531  8.34693878  8.53061224
  8.71428571  8.89795918  9.08163265  9.26530612  9.44897959  9.63265306
  9.81632653 10.        ]

Number of elements in a = 50
Number of dimensions in a = 1
Rows and Columns in a  (50,)


[1.00000000e+01 1.52641797e+01 2.32995181e+01 3.55648031e+01
 5.42867544e+01 8.28642773e+01 1.26485522e+02 1.93069773e+02
 2.94705170e+02 4.49843267e+02 6.86648845e+02 1.04811313e+03
 1.59985872e+03 2.44205309e+03 3.72759372e+03 5.68986603e+03
 8.68511374e+03 1.32571137e+04 2.02358965e+04 3.08884360e+04
 4.71486636e+04 7.19685673e+04 1.09854114e+05 1.67683294e+05
 2.55954792e+05 3.90693994e+05 5.96362332e+05 9.10298178e+05
 1.38949549e+06 2.12095089e+06 3.23745754e+06 4.94171336e+06
 7.54312006e+06 1.15139540e+07 1.75751062e+07 2.68269580e+07
 4.09491506e+07 6.25055193e+07 9.54095476e+07 1.45634848e+08
 2.22299648e+08 3.39322177e+08 5.17947468e+08 7.90604321e+08
 1.20679264e+09 1.84206997e+09 2.81176870e+09 4.29193426e+09
 6.55128557e+09 1.00000000e+10]

Number of elements in a = 50
Number of dimensions in a = 1
Rows and Columns in a  (50,)


[1 3 5 7 9]

Number of elements in a = 5
Number of dimensions in a = 1
Rows and Columns in a  (5,)


[[1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]]

Number of elements in a = 9
Number of dimensions in a = 2
Rows and Columns in a  (3, 3)


[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]

Number of elements in a = 9
Number of dimensions in a = 2
Rows and Columns in a  (3, 3)


[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]

Number of elements in a = 9
Number of dimensions in a = 2
Rows and Columns in a  (3, 3)


[[0. 1. 0.]
 [0. 0. 1.]
 [0. 0. 0.]]

Number of elements in a = 9
Number of dimensions in a = 2
Rows and Columns in a  (3, 3)


[[0 1 2]
 [3 4 5]
 [6 7 8]]

Number of elements in a = 9
Number of dimensions in a = 2
Rows and Columns in a  (3, 3)


[0 1 2 3 4 5 6 7 8]

Number of elements in a = 9
Number of dimensions in a = 1
Rows and Columns in a  (9,)


[0 1 2 3 4 5 6 7 8]

Number of elements in a = 9
Number of dimensions in a = 1
Rows and Columns in a  (9,)


[0 1 2 3 4 5 6 7 8]

Number of elements in a = 9
Number of dimensions in a = 1
Rows and Columns in a  (9,)


f_matrix, minimum = 15
f_matrix, maximum = 111
f_matrix, col sum [ 54 162 270]
f_matrix, row sum [126 162 198]

[[ 21  66 111]
 [ 18  54  90]
 [ 15  42  69]]

Number of elements in a = 9
Number of dimensions in a = 2
Rows and Columns in a  (3, 3)

[ 4 96 48 11  2 59 76 82 26 22]
[-0.16853565  0.1799353   0.3972849   0.26343443  0.02334051  0.07958427
  0.05164016 -0.19068026  0.06433856  0.05085711]

2. matplotlib 库

import numpy as np
import matplotlib.pyplot as plt

def simple_line_plot(x, y, figure_no):
    """Draw ``y`` against ``x`` as a line on pyplot figure ``figure_no``.

    Args:
        x: Horizontal coordinates.
        y: Vertical coordinates.
        figure_no: Number of the pyplot figure to create or re-activate.
    """
    axes = plt.figure(figure_no).gca()
    axes.plot(x, y)
    axes.set_xlabel('x values')
    axes.set_ylabel('y values')
    axes.set_title('Simple Line')
    
def simple_dots(x, y, figure_no):
    """Draw the points ``(x, y)`` with the ``'or'`` format on figure ``figure_no``.

    Args:
        x: Horizontal coordinates.
        y: Vertical coordinates.
        figure_no: Number of the pyplot figure to create or re-activate.
    """
    axes = plt.figure(figure_no).gca()
    axes.plot(x, y, 'or')
    axes.set_xlabel('x values')
    axes.set_ylabel('y values')
    axes.set_title('Simple Dots')
    
def simple_scatter(x, y, figure_no):
    """Draw a scatter plot of ``(x, y)`` on pyplot figure ``figure_no``.

    Args:
        x: Horizontal coordinates.
        y: Vertical coordinates.
        figure_no: Number of the pyplot figure to create or re-activate.
    """
    plt.figure(figure_no)
    plt.scatter(x, y)
    # Caption the axes the scatter call has just drawn into.
    axes = plt.gca()
    axes.set_xlabel('x values')
    axes.set_ylabel('y values')
    axes.set_title('Simple scatter')

def scatter_with_color(x, y, labels, figure_no):
    """Draw a scatter plot of ``(x, y)`` coloured by ``labels``.

    Args:
        x: Horizontal coordinates.
        y: Vertical coordinates.
        labels: Per-point values handed to ``plt.scatter`` as ``c``.
        figure_no: Number of the pyplot figure to create or re-activate.
    """
    plt.figure(figure_no)
    plt.scatter(x, y, c=labels)
    # Caption the axes the scatter call has just drawn into.
    axes = plt.gca()
    axes.set_xlabel('x values')
    axes.set_ylabel('y values')
    axes.set_title('Scatter with color')
    
if __name__ == "__main__":
    plt.close('all')
    # Sample data for the line plot and the dot plot: y = x ** 2
    x = np.arange(1, 100, dtype=float)
    # np.power is applied to the whole array at once instead of being
    # called once per element inside a Python-level loop.
    y = np.power(x, 2)

    figure_no = 1
    simple_line_plot(x, y, figure_no)
    figure_no += 1
    simple_dots(x, y, figure_no)

    # Sample data for the scatter plots: 100 uniformly distributed points
    x = np.random.uniform(size=100)
    y = np.random.uniform(size=100)

    figure_no += 1
    simple_scatter(x, y, figure_no)

    figure_no += 1
    # A random 0/1 label per point, used to colour the last scatter plot
    label = np.random.randint(2, size=100)

    scatter_with_color(x, y, label, figure_no)
    plt.show()
    
# 生成热力图，并且给 x 和 y 轴添加标签
import numpy as np
import matplotlib.pyplot as plt
def x_y_axis_labeling(x, y, x_labels, y_labels, figure_no):
    """Plot ``(x, y)`` as ``'+r'`` markers and label the ticks with text.

    Args:
        x: Horizontal coordinates; also used as the x tick positions.
        y: Vertical coordinates; also used as the y tick positions.
        x_labels: Text shown at each x tick, drawn vertically.
        y_labels: Text shown at each y tick.
        figure_no: Number of the pyplot figure to create or re-activate.
    """
    axes = plt.figure(figure_no).gca()
    axes.plot(x, y, '+r')
    axes.margins(0.2)
    plt.xticks(x, x_labels, rotation='vertical')
    plt.yticks(y, y_labels)
    
def plot_heat_map(x, figure_no):
    """Draw ``x`` as a pseudocolor plot with a colorbar on figure ``figure_no``.

    Args:
        x: Values passed to ``plt.pcolor``.
        figure_no: Number of the pyplot figure to create or re-activate.
    """
    plt.figure(figure_no)
    heat = plt.pcolor(x)
    plt.colorbar(heat)
    
if __name__ == "__main__":
    plt.close("all")
    # Five points: x runs 1..5, y runs 100..500 in steps of 100
    x = np.arange(1, 6)
    y = np.arange(100, 600, 100)

    x_label = [
        'element 1',
        'element 2',
        'element 3',
        'element 4',
        'element 5',
    ]
    y_label = [
        'weight1',
        'weight2',
        'weight3',
        'weight4',
        'weight5',
    ]

    x_y_axis_labeling(x, y, x_label, y_label, 1)

    # A 10 x 10 grid of normally distributed values for the heat map
    x = np.random.normal(0.5, 0.2, (10, 10))
    plot_heat_map(x, 2)

    plt.show()

![output_3_0.png](output_3_0.png)

![output_3_1.png](output_3_1.png)

![output_3_2.png](output_3_2.png)

![output_3_3.png](output_3_3.png)

![output_3_4.png](output_3_4.png)

![output_3_5.png](output_3_5.png)

3. scikit-learn 进行机器学习

import sklearn
# A bare expression is only echoed by an interactive session; print it so
# the installed version is also shown when this runs as a script.
print(sklearn.__version__)

from sklearn.datasets import load_iris, load_boston, make_classification, make_circles, make_moons

# Iris data set: feature matrix, targets and their human-readable names
data = load_iris()
x, y = data['data'], data['target']
y_labels = data['target_names']
x_labels = data['feature_names']
print()
print(x.shape, y.shape, x_labels, y_labels, sep="\n")

# Boston housing data set
# NOTE: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this section needs an older scikit-learn release.
data = load_boston()
x = data['data']
y = data['target']
# This used to be bound to the misspelled name ``x_lables``, so the print
# below showed the stale Iris feature names instead of the Boston ones.
x_labels = data['feature_names']
print()
print(x.shape)
print(y.shape)
print(x_labels)

# Generate a synthetic two-class data set: 50 samples with 5 features each
x, y = make_classification(n_samples=50, n_features=5, n_classes=2)
print()
print(x.shape, y.shape, sep="\n")

# Second sample (index 1) and its class label
print(x[1], y[1], sep="\n")

# Some non-linear data sets, each shown as a scatter plot coloured by class
import numpy as np
import matplotlib.pyplot as plt
x, y = make_circles()
plt.close('all')
plt.figure(1)
first_feature, second_feature = x[:, 0], x[:, 1]
plt.scatter(first_feature, second_feature, c=y)

x, y = make_moons()
plt.figure(2)
first_feature, second_feature = x[:, 0], x[:, 1]
plt.scatter(first_feature, second_feature, c=y)
plt.show()

(150, 4)
(150,)
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
['setosa' 'versicolor' 'virginica']

(506, 13)
(506,)
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']

(50, 5)
(50,)
[-0.51412805 -0.13780935  1.6860627   1.37773306 -1.36406825]
1

![output_5_1.png](output_5_1.png)

![output_5_2.png](output_5_2.png)

import numpy as np
from sklearn.preprocessing import PolynomialFeatures

# Data preprocessing: expand the features into polynomial terms
# A plain 2-D ndarray is used instead of np.asmatrix: it prints identically,
# and recent scikit-learn releases reject np.matrix input.
x = np.array([[1, 2], [2, 4]])
poly = PolynomialFeatures(degree=2)
poly.fit(x)
x_poly = poly.transform(x)

print("Original x variable shape", x.shape)
print(x)
print()
print("transformed x variables", x_poly.shape)
print(x_poly)

# Alternative: fit and transform in a single call
x_poly = poly.fit_transform(x)
print(x_poly)

from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris

data = load_iris()
x = data['data']
y = data['target']

# Fit a decision tree and predict on the training data itself
estimator = DecisionTreeClassifier()
estimator.fit(x, y)
predicted_y = estimator.predict(x)
predicted_y_prob = estimator.predict_proba(x)
# NOTE: the recorded run emitted "RuntimeWarning: divide by zero encountered
# in log" here, presumably because some class probabilities are exactly 0.
predicted_y_lprob = estimator.predict_log_proba(x)


# Chain the polynomial expansion and the tree into a single estimator
from sklearn.pipeline import Pipeline

# PolynomialFeatures takes its order through ``degree``; the earlier
# ``n`` / ``n_estimator`` keyword raised a TypeError.
poly = PolynomialFeatures(degree=3)
tree_estimator = DecisionTreeClassifier()

steps = [('poly', poly), ('tree', tree_estimator)]
estimator = Pipeline(steps=steps)
estimator.fit(x, y)
predicted_y = estimator.predict(x)

Original x variable shape (2, 2)
[[1 2]
 [2 4]]

transformed x variables (2, 6)
[[ 1.  1.  2.  1.  2.  4.]
 [ 1.  2.  4.  4.  8. 16.]]
[[ 1.  1.  2.  1.  2.  4.]
 [ 1.  2.  4.  4.  8. 16.]]


/Users/apple/anaconda3/lib/python3.7/site-packages/sklearn/tree/tree.py:890: RuntimeWarning: divide by zero encountered in log
  return np.log(proba)



---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

<ipython-input-43-c9060e3935fd> in <module>
     34 from sklearn.pipeline import Pipeline
     35 
---> 36 poly = PolynomialFeatures(n_estimator=3)
     37 tree_estimator = DecisionTreeClassifier()
     38 


TypeError: __init__() got an unexpected keyword argument 'n_estimator'

lancecrazy

发布了182 篇原创文章 · 获赞 101 · 访问量 20万+

私信关注