Python data analysis modules

numpy module

The numpy module is mainly used for data analysis and scientific computing on numpy arrays.

The most commonly used attributes and methods are listed below; all of them are accessed with a dot on an array object created by numpy.

import numpy as np

Attributes description
T Transpose of the array: rows and columns are swapped, so a 2x3 array becomes a 3x2 array with two elements per row
dtype Data type of the array elements (e.g. int32, float64)
size The number of array elements
ndim Number of dimensions of the array
shape Size of each dimension of the array, as a tuple (how many rows and columns)
astype Data type conversion
Common method description
Slicing elements [:, :] The first slice selects rows, the second selects columns
Boolean indexing Select the elements of a numpy array that satisfy a condition, e.g. elements > 4
Assignment Select an element of a numpy array by index and assign it a value, e.g. = 0
Horizontal merge Joins arrays side by side: each row is extended with the matching row of the other array
Vertical merge Stacks arrays on top of each other, similar to extending a list with more rows
Array Functions description
np.array() Converts a list into an array; a dtype can optionally be specified
np.ones() Takes the number of rows and columns; every value is 1
np.zeros() Takes the number of rows and columns; every value is 0
np.eye() Takes the number of rows and columns; the values on the main diagonal are 1 and all others are 0
np.arange() Like the built-in range(), but returns an array and supports floating-point steps
np.linspace() Similar to arange(), but the third parameter is the number of elements in the array
np.empty() Creates an array without initializing it, so its elements are arbitrary values
np.reshape() Reshape
Array operations Arithmetic operators applied to arrays: + - * /
Generating random numbers (common) np.random.rand(x, y)
np.random.random((x, y))
np.random.choice(x, y)
np.random.shuffle(x)
numpy Statistical Methods description
sum Sum
cumsum Cumulative sum
mean Mean (average)
std Standard deviation
var Variance
min Minimum
max Maximum
argmin Index of the minimum
argmax Index of the maximum
sort Sorting

The following code explains each of these in detail.

lt1 = [1, 2, 3]
lt2 = [4, 5, 6]

# Multiplying two plain lists element by element needs an explicit loop:
# walk both lists in lockstep and collect each product.
lt = []
for left, right in zip(lt1, lt2):
    lt.append(left * right)
print(lt)


import numpy as np

# With numpy the same element-wise product is a single expression,
# which is far more convenient for matrix-style calculations.
arr1 = np.array([1, 2, 3])
arr2 = np.array([4, 5, 6])
print(np.multiply(arr1, arr2))
# [ 4 10 18]




# Creating numpy arrays with np.array(); the resulting array is mutable.

# One-dimensional array (rarely used on its own).
# Note that the printed form has no commas between the elements.
arr = np.array([1, 2, 3])
print(arr)
# [1 2 3]

# Two-dimensional array: a list of rows.
arr = np.array([
    [1, 2, 3],
    [4, 5, 6],
])
print(arr)
# [[1 2 3]
#  [4 5 6]]

# Three-dimensional array (rarely used): a list of 2-D arrays.
# A 3x3 matrix is still two-dimensional (ndim == 2); a third axis needs one
# more level of nesting, as below (shape (2, 2, 3)).
arr = np.array([
    [[1, 2, 3], [4, 5, 6]],
    [[7, 8, 9], [10, 11, 12]],
])
print(arr)
# [[[ 1  2  3]
#   [ 4  5  6]]
#
#  [[ 7  8  9]
#   [10 11 12]]]


# Attributes of a numpy array
arr = np.array([[1, 2, 3], [4, 5, 6]])

# T: transpose of the array (rows and columns swapped).
# The original array is printed first, then its transpose.
print(arr, "\n", arr.T)
# [[1 2 3]
#  [4 5 6]]
#  [[1 4]
#  [2 5]
#  [3 6]]

# dtype: data type of the array elements.
# int32 / float64 are data types provided by numpy.
print(arr.dtype)
# int32  (as printed on the author's machine; the default integer type is
#         platform dependent and is often int64)

# size: total number of elements in the array.
print(arr.size)
# 6

# ndim: number of dimensions.
print(arr.ndim)
# 2

# shape: size of each dimension, returned as a tuple.
print(arr.shape)
# (2, 3)

# astype: convert the elements to another data type (here float64).
arr = arr.astype(np.float64)
print(arr)
# [[1. 2. 3.]
#  [4. 5. 6.]]

# Slicing a numpy array
arr = np.array([[1, 2, 3], [4, 5, 6]])

print(arr[:, :])  # first slice selects rows, second selects columns
# [[1 2 3]
#  [4 5 6]]
print(arr[0, 0])
# 1
print(arr[1, 2])
# 6
print(arr[:, -2:])
# [[2 3]
#  [5 6]]

# Boolean indexing: keep only the elements for which the condition is True.
# The result is a flat (1-D) array.
print(arr[arr > 4])
# [5 6]

# Assignment through an index.
arr[0, 0] = 0
print(arr)
# [[0 2 3]
#  [4 5 6]]

# Merging arrays
arr1 = np.array([[1, 2, 3], [4, 5, 6]])

# Mixing numbers and strings in one array makes numpy store every element
# as a string, which is why the merged results below are all quoted.
arr2 = np.array([[7, 8, 9], ['a', 'b', 'c']])

# Horizontal merge: the rows of both arrays are joined side by side.
print(np.hstack([arr1, arr2]))
# [['1' '2' '3' '7' '8' '9']
#  ['4' '5' '6' 'a' 'b' 'c']]

# Vertical merge: the rows of arr2 are appended below those of arr1.
print(np.vstack([arr1, arr2]))
# [['1' '2' '3']
#  ['4' '5' '6']
#  ['7' '8' '9']
#  ['a' 'b' 'c']]

# concatenate joins along axis=0 by default (same result as vstack);
# axis=1 joins along the columns (same result as hstack).
print(np.concatenate([arr1, arr2], axis=1))
# [['1' '2' '3' '7' '8' '9']
#  ['4' '5' '6' 'a' 'b' 'c']]

# Creating numpy arrays with helper functions

print(np.ones((2, 3)))
# [[1. 1. 1.]
#  [1. 1. 1.]]

print(np.zeros((2, 3)))
# [[0. 0. 0.]
#  [0. 0. 0.]]

print(np.eye(3, 3))
# [[1. 0. 0.]
#  [0. 1. 0.]
#  [0. 0. 1.]]

print(np.linspace(1, 100, 10))
# [  1.  12.  23.  34.  45.  56.  67.  78.  89. 100.]

print(np.arange(2, 10))
# [2 3 4 5 6 7 8 9]

# Reshaping: the new shape must hold the same number of elements
# as the old one (2 * 6 == 3 * 4).
arr1 = np.zeros((2, 6))
print(arr1.reshape(3, 4))
# [[0. 0. 0. 0.]
#  [0. 0. 0. 0.]
#  [0. 0. 0. 0.]]


# Arithmetic on numpy arrays: + - * / are applied to every element.
arr = np.ones((3, 4)) * 4
print(arr)
# [[4. 4. 4. 4.]
#  [4. 4. 4. 4.]
#  [4. 4. 4. 4.]]

arr = np.ones((3, 4)) + 4
print(arr)
# [[5. 5. 5. 5.]
#  [5. 5. 5. 5.]
#  [5. 5. 5. 5.]]

# numpy math functions (good to know about); also applied element-wise.
print(np.sin(arr))
# [[-0.95892427 -0.95892427 -0.95892427 -0.95892427]
#  [-0.95892427 -0.95892427 -0.95892427 -0.95892427]
#  [-0.95892427 -0.95892427 -0.95892427 -0.95892427]]

# Matrix multiplication (dot product): (2x3) times (3x2) gives (2x2).
arr1 = np.array([[1, 2, 3], [4, 5, 6]])
arr2 = np.array([[1, 2], [4, 5], [6, 7]])
# For 2-D arrays the @ operator gives the same result as np.dot(arr1, arr2).
print(arr1 @ arr2)
# [[27 33]
#  [60 75]]

# Matrix inverse.
arr = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [9, 8, 9],
])
print(np.linalg.inv(arr))
# [[ 0.5        -1.          0.5       ]
#  [-3.          3.         -1.        ]
#  [ 2.16666667 -1.66666667  0.5       ]]





# Mathematical and statistical methods of numpy arrays

arr = np.array([[1, 2, 3], [4, 5, 6]])
print(arr[:, :].sum())
# 21

# Generating random numbers.
# The outputs shown below are examples; the values differ on every run.
print(np.random.rand(3, 4))
# [[0.76654824 0.23510842 0.79989748 0.93094884]
#  [0.97155472 0.29956374 0.27754847 0.91103403]
#  [0.43714323 0.7549109  0.14547903 0.20511579]]

print(np.random.random(size=(3, 4)))
# [[0.91673193 0.15218486 0.32976182 0.41812734]
#  [0.33360061 0.20190749 0.48689467 0.46679115]
#  [0.12490532 0.50441629 0.95525997 0.5402791 ]]


# For one-dimensional input: pick values at random.
print(np.random.choice([1, 2, 3], size=1))
# [1]

# Random integers drawn from a given range.
print(np.random.randint(1, 100, size=(3, 4)))
# [[33 40 93 18]
#  [80 65 64 51]
#  [66  6 83 10]]

matplotlib module

The matplotlib module is used for drawing charts.

# Bar chart

from matplotlib import pyplot as plt
from matplotlib.font_manager import FontProperties

# Load a font that contains Chinese glyphs; without it the Chinese labels
# are rendered garbled. NOTE: this font path is Windows-specific.
font = FontProperties(fname=r"C:\Windows\Fonts\simsun.ttc")

# Plot style (background theme).
plt.style.use("ggplot")

# Categories shown on the x axis and the value of each bar.
clas = ["3班", "4班", "5班", "6班"]
students = [50, 55, 45, 60]
clas_index = range(len(clas))

# Draw the bars.
plt.bar(clas_index, students, color="darkblue")

# Text keyword names must be lowercase: matplotlib >= 3.3 no longer accepts
# mixed-case property names such as FontProperties / Fontsize.
plt.xlabel("学生", fontproperties=font)
# The second label belongs on the y axis; calling xlabel twice would only
# overwrite the first label and leave the y axis unlabeled.
plt.ylabel("学生人数", fontproperties=font)
plt.title("班级-学生人数", fontproperties=font, fontsize=25, fontweight=20)
plt.xticks(clas_index, clas, fontproperties=font)

# Display the figure.
plt.show()

1569746051687

# Histogram
import numpy as np
from matplotlib import pyplot as plt
from matplotlib.font_manager import FontProperties

# Load a font that contains Chinese glyphs; without it the Chinese titles
# are rendered garbled. NOTE: this font path is Windows-specific.
font = FontProperties(fname=r"C:\Windows\Fonts\simsun.ttc")
plt.style.use("ggplot")

# Two independent samples drawn with np.random.randn (standard normal).
x1 = np.random.randn(10000)
x2 = np.random.randn(10000)

# Create the figure (canvas).
fig = plt.figure()

# Subplots are addressed as (rows, columns, position): 1 row, 2 columns.
ax1 = fig.add_subplot(1, 2, 1)
ax2 = fig.add_subplot(1, 2, 2)

ax1.hist(x1, bins=50, color="darkblue")
ax2.hist(x2, bins=50, color="y")

# Text keyword names must be lowercase: matplotlib >= 3.3 no longer accepts
# mixed-case property names such as FontProperties.
# The title previously read "正太分布", a typo for "正态分布" (normal
# distribution), which is the spelling used by both subplot titles.
fig.suptitle("两个正态分布", fontproperties=font, fontsize=20)
ax1.set_title("x1的正态分布", fontproperties=font)
ax2.set_title("x2的正态分布", fontproperties=font)

# Display the figure.
plt.show()

1569746069693

# Line chart

import numpy as np
from matplotlib import pyplot as plt
from matplotlib.font_manager import FontProperties

# Load a font that contains Chinese glyphs; without it the Chinese legend
# entries are rendered garbled. NOTE: this font path is Windows-specific.
font = FontProperties(fname=r"C:\Windows\Fonts\simsun.ttc")
plt.style.use("ggplot")

# Fixed seed so the same four curves are drawn on every run.
np.random.seed(10)

# Four series, each the cumulative sum of 40 random draws.
# The generator is consumed in order, so x1 is drawn first and x4 last.
x1, x2, x3, x4 = (np.random.randn(40).cumsum() for _ in range(4))

# (data, color, line style, marker, legend label) for each curve.
line_specs = [
    (x1, "r", "-", "o", "红圆线"),
    (x2, "y", "--", "*", "黄虚线"),
    (x3, "b", "-.", "s", "蓝方线"),
    (x4, "black", ":", "s", "黑方线"),
]
for series, colour, dash, mark, name in line_specs:
    plt.plot(series, color=colour, linestyle=dash, marker=mark, label=name)
plt.legend(loc="best", prop=font)

# Display the figure.
plt.show()

1569746079556

# Scatter plot + line plot
import numpy as np
from matplotlib import pyplot as plt  # 约定俗成
from matplotlib.font_manager import FontProperties  # 修改字体

# Load a font that contains Chinese glyphs; without it the Chinese titles
# are rendered garbled. NOTE: this font path is Windows-specific.
# A raw string is required: "\W", "\F" and "\s" are invalid escape sequences
# in a normal string literal and trigger a warning on recent Python versions.
font = FontProperties(fname=r"C:\Windows\Fonts\simsun.ttc")
plt.style.use("ggplot")

# One figure with two side-by-side subplots (1 row, 2 columns).
fig = plt.figure()
ax1 = fig.add_subplot(1, 2, 1)
ax2 = fig.add_subplot(1, 2, 2)

# Both subplots show y = x ** 2 for x = 0..19.
x = np.arange(20)
y = x ** 2
x2 = np.arange(20)
y2 = x2 ** 2

ax1.scatter(x, y, color="r", label="红")
ax2.scatter(x2, y2, color="b", label="蓝")

# Overlay a line through the same points.
ax1.plot(x, y)
ax2.plot(x2, y2)

# Text keyword names must be lowercase: matplotlib >= 3.3 no longer accepts
# mixed-case property names such as FontProperties.
fig.suptitle("两张图", fontproperties=font, fontsize=15)
ax1.set_title("散点图", fontproperties=font)
ax2.set_title("折线图", fontproperties=font)
ax1.legend(prop=font)
# ax2's scatter also carries a label, so give it a legend as well;
# otherwise that label is never displayed.
ax2.legend(prop=font)

# Display the figure.
plt.show()

1569746089270

pandas module

The pandas module is used to work with excel / json / sql / ini / csv files.

import pandas as pd
import numpy as np

# Fixed seed so the random values below are reproducible.
np.random.seed(10)

# Build six month-end dates starting from January 2019.
# A MonthEnd offset object is used instead of the string alias "M": that
# alias is deprecated in newer pandas releases (replaced by "ME"), while the
# offset object means the same thing on old and new versions alike.
index = pd.date_range("2019-01-01", periods=6, freq=pd.offsets.MonthEnd())
print(index)
columns = ["c1", "c2", "c3", "c4"]

# Random values: one row per date, one column per name.
val = np.random.randn(6, 4)


df = pd.DataFrame(index=index, columns=columns, data=val)
print(df)
#                   c1        c2        c3        c4
# 2019-01-31  1.331587  0.715279 -1.545400 -0.008384
# 2019-02-28  0.621336 -0.720086  0.265512  0.108549
# 2019-03-31  0.004291 -0.174600  0.433026  1.203037
# 2019-04-30 -0.965066  1.028274  0.228630  0.445138
# 2019-05-31 -1.136602  0.135137  1.484537 -1.079805
# 2019-06-30 -1.977728 -1.743372  0.266070  2.384967


# Save the DataFrame as an .xlsx workbook ...
excel_path = "date_c.xlsx"
df.to_excel(excel_path)
# ... then read the file back, using the first column as the index.
df = pd.read_excel(excel_path, index_col=[0])
print(df)
#                   c1        c2        c3        c4
# 2019-01-31  1.331587  0.715279 -1.545400 -0.008384
# 2019-02-28  0.621336 -0.720086  0.265512  0.108549
# 2019-03-31  0.004291 -0.174600  0.433026  1.203037
# 2019-04-30 -0.965066  1.028274  0.228630  0.445138
# 2019-05-31 -1.136602  0.135137  1.484537 -1.079805
# 2019-06-30 -1.977728 -1.743372  0.266070  2.384967



# Inspecting and indexing the DataFrame

# The three building blocks: row labels, column labels and the raw values.
print(df.index)
print(df.columns)
print(df.values)

# Select several columns by name.
print(df[["c1", "c2"]])

# Select rows by index label with .loc.
# (Plain df['2019-01-31'] is not used here: square brackets with a single
# label look the label up among the columns.)
first_day = "2019-01-31"
last_day = "2019-05-31"
print(df.loc[first_day])
print(df.loc[first_day:last_day])

# Select by integer position with .iloc (row number, column number).
print(df)
print(df.iloc[0, 0])

# Overwrite the whole first row with 0.
df.iloc[0, :] = 0
print(df)

Guess you like

Origin www.cnblogs.com/qinyujie/p/11608643.html