1. NumPy
1. Creation and Access
1. Difference from a Python list: all elements of an array share the same data type. The underlying implementation is heavily optimized C code, so computation is fast. NumPy also provides a comprehensive set of mathematical functions that can be applied directly to arrays.
2. The array type NumPy defines is called ndarray, short for "n-dimensional array".
import numpy as np  # imported under its conventional alias

# A list and a tuple are interchangeable here: np.array((1, 2, 3)) gives the same result.
values = [1, 2, 3]
a = np.array(values)
print(a)  # [1 2 3]
print(type(a))  # <class 'numpy.ndarray'>
3. Ways to create an ndarray
import numpy as np
# ================== Create from existing values ==================
a_list = np.array([1, 2, 3])
a_tuple = np.array((1, 2, 3))
# ================== Create filled arrays ==================
a_zeros = np.zeros((2, 3))  # 2x3 array of zeros
a_ones = np.ones((2, 3))  # 2x3 array of ones
a_empty = np.empty((2, 3))  # 2x3 array whose contents are left uninitialized
# ================== Create evenly spaced values ==================
a_ar = np.arange(6)  # integers 0..5
a_lin = np.linspace(0, 10, num=5)  # 5 evenly spaced points from 0 to 10 inclusive
# ================== Create random arrays ==================
# Fix the random seed so the draws below are reproducible.
# NOTE: the draws consume the seeded stream in order, so reordering them changes the values.
np.random.seed(10)
# Array of shape (3, 1) with random floats in [0, 1)
t1 = np.random.rand(3, 1)
# Array of shape (2, 2) with random floats in [0, 100)
t2 = np.random.uniform(0, 100, (2, 2))
# Array of shape (2, 2) with random integers in [0, 20) -- the upper bound is exclusive
t3 = np.random.randint(0, 20, (2, 2))
# Normal distribution with the given mean (0), standard deviation (1) and shape (2, 2)
t4 = np.random.normal(0, 1, (2, 2))
# Standard normal distribution, i.e. a normal distribution with mean 0 and standard deviation 1
t5 = np.random.standard_normal(size=(2, 2))
4. Access arrays
Array elements can be accessed by indexing or slicing.
import numpy as np

# Build a 5x4 two-dimensional array (5 rows, 4 columns).
c = np.array([
    [0, 1, 2, 3],
    [10, 11, 12, 13],
    [20, 21, 22, 23],
    [30, 31, 32, 33],
    [40, 41, 42, 43],
])
print(c)
# Index a single element: row 1, column 2 (both zero-based).
print(c[1, 2])  # 12
# Slice rows 1-2 and columns 2-3; the result is a 2-D array.
print(c[1:3, 2:4])
# Slice rows 1-2 of column 2 only; the result is a 1-D array.
print(c[1:3, 2])
# Row step of 2 starting at row 1: selects rows 1 and 3, columns 2-3.
print(c[1:6:2, 2:4])
# Take index 2 along the last axis (column 2 of every row).
print(c[:, 2])
# With many dimensions, "..." replaces the run of ":" before (or after) the given index.
print(c[..., 2])
2. Array operation
1. Modify the array shape
import numpy as np

# Start from the flat array 0..7 and show it.
a = np.arange(8)
print('原始数组:', a, sep='\n')
# Reinterpret the same 8 elements as 4 rows by 2 columns and show the result.
b = np.reshape(a, (4, 2))
print('修改后的数组:', b, sep='\n')
2. Matplotlib
1. Getting Started
import matplotlib.pyplot as plt
import numpy as np

# x: 50 evenly spaced points from -1 to 1 (a list or tuple would work as well).
x = np.linspace(-1, 1, num=50)
# y as a linear function of x.
slope, intercept = 2, 1
y = slope * x + intercept
# plt.plot draws points and lines and lets you control their style.
plt.plot(x, y)
# Display the figure.
plt.show()
2. Passing in data
1. x is the x-axis data, y is the y-axis data
import matplotlib.pyplot as plt

# Both coordinates are plain lists; x and y must contain the same number N of elements.
x, y = [3, 4, 5], [2, 3, 2]
plt.plot(x, y)
plt.show()
2. x and y can each be passed in as a (tuple), a [list], an np.array or a pd.Series
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

x = (3, 4, 5)  # a tuple
y1 = np.array([3, 4, 3])  # an np.array
y2 = pd.Series([4, 5, 4])  # a pd.Series

plt.plot(x, y1)
# x may be omitted; it then defaults to the increasing sequence [0, 1, ..., N-1].
plt.plot(y2)
# Several plt.plot() calls before plt.show() all draw onto the same figure.
plt.show()
3. Multiple sets of x, y can be passed in
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

x = (3, 4, 5)
y1 = np.array([3, 4, 3])
y2 = pd.Series([4, 5, 4])

# Two (x, y) pairs in one call; in this form x can no longer be omitted.
plt.plot(x, y1, x, y2)
plt.show()
4. x or y can be passed in as a two-dimensional array
import matplotlib.pyplot as plt
import numpy as np

# Two 3x3 arrays built straight from nested lists.
x = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8]])
y = np.array([[2, 3, 2], [3, 4, 3], [4, 5, 4]])
for grid in (x, y):
    print(grid)
# With 2-D inputs each column of x is plotted against the matching column of y.
plt.plot(x, y)
plt.show()
Each column of x is paired with the same column of y, which gives three lines:
Blue: x1(0,3,6) y1(2,3,4)
Orange: x2(1,4,7) y2(3,4,5)
Green: x3(2,5,8) y3(2,3,4)
3. Graphics Control
1.plt.plot(x, y, "Format Control String")
import matplotlib.pyplot as plt
import numpy as np

x = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8]])
y = np.array([[2, 3, 2], [3, 4, 3], [4, 5, 4]])
# Format string: "o" = circle marker, "b" = blue, ":" = dotted line.
fmt = "ob:"
plt.plot(x, y, fmt)
plt.show()
2. "Format control string" can include up to three parts, "color", "point type", "line type"
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

color = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'w']
line_style = ['-', '--', '-.', ':']
dic1 = [[0, 1, 2], [3, 4, 5]]
x = pd.DataFrame(dic1)
dic2 = [[2, 3, 2], [3, 4, 3], [4, 5, 4], [5, 6, 5]]
y = pd.DataFrame(dic2)
# Draw one line per (x row, y row) pair so that every colour and every line
# style is shown: 2 x rows * 4 y rows = 8 lines, one per entry in `color`.
# NOTE: the last colour is 'w' (white), so that line will not be visible on a
# white axes background.
for i in range(2):
    for j in range(4):
        # Format string = colour code + line-style code, e.g. "b-" or "g--".
        plt.plot(x.loc[i], y.loc[j], color[i * 4 + j] + line_style[j])
plt.show()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# 24 entries: the 22 marker codes, then '.' and ',' repeated to fill the
# 6 x 4 grid of lines drawn below.
marker = ['.', ',', 'o', 'v', '^', '<', '>', '1', '2', '3', '4', 's', 'p', '*', 'h', 'H', '+', 'x', 'D', 'd', '|', '_',
          '.', ',']
dic1 = [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11], [12, 13, 14], [15, 16, 17]]
x = pd.DataFrame(dic1)
dic2 = [[2, 3, 2.5], [3, 4, 3.5], [4, 5, 4.5], [5, 6, 5.5]]
y = pd.DataFrame(dic2)
# Draw one line per (x row, y row) pair so that every marker type is shown:
# 6 x rows * 4 y rows = 24 lines, one per entry in `marker`.
for i in range(6):
    for j in range(4):
        # "b" = blue, then the marker code, then ":" = dotted line.
        plt.plot(x.loc[i], y.loc[j], "b" + marker[i * 4 + j] + ":")
plt.show()
Color
"c" cyan
"r" red
"g" green
"b" blue
"w" white
"k" black
"y" yellow
"m" magenta
Line type
":" dotted line
"-." dash-dot line
"--" dashed line
"-" solid line
Point type
3. plt.plot(x, y, "format control string", keyword=parameter)
In addition to the "format control string", you can pass keyword=value arguments after it.
import matplotlib.pyplot as plt

y = [2, 3, 2]
# Styling passed as keyword arguments instead of a format string.
line_options = {
    "color": "blue",  # blue line
    "linewidth": 20,  # line width 20
    "marker": "o",  # circle markers
    "markersize": 50,  # marker size 50
    "markerfacecolor": "red",  # markers filled red
    "markeredgewidth": 6,  # marker edge width 6
    "markeredgecolor": "grey",  # marker edge drawn grey
}
plt.plot(y, **line_options)
plt.show()
4. Advanced
Line Chart
Histogram
Bar Chart
Scatter Chart
5. Three-dimensional surface
import matplotlib.pyplot as plt
import numpy as np

# A figure holding a single 3-D axes.
fig = plt.figure()
ax = fig.add_subplot(projection="3d")

# Sample the square [-5, 5) x [-5, 5) in steps of 0.25 and expand it to a grid.
x = np.arange(-5, 5, 0.25)
y = np.arange(-5, 5, 0.25)
x, y = np.meshgrid(x, y)
# Height of the surface: distance of each grid point from the origin.
z = np.sqrt(x ** 2 + y ** 2)

# Axis labels and title.
ax.set_xlabel("X")
ax.set_ylabel("Y")
ax.set_zlabel("Z")
ax.set_title("3D surface plot")

# Surface drawn at full grid resolution, coloured with the coolwarm colormap.
surface_options = dict(
    rstride=1,
    cstride=1,
    cmap=plt.cm.coolwarm,
    linewidth=0,
    antialiased=False,
)
ax.plot_surface(x, y, z, **surface_options)
plt.show()
3. Pandas
pandas is a NumPy-based tool created to solve data analysis tasks. Pandas incorporates a large number of libraries and some standard data models, providing the tools needed to efficiently manipulate large datasets. pandas provides a large number of functions and methods that allow us to process data quickly and easily.
Two different data structures are built on the basis of ndarray arrays (arrays in NumPy), namely Series (one-dimensional data structure) and DataFrame (two-dimensional data structure).
- Series is a labeled one-dimensional array, where the label can be understood as an index, but this index is not limited to integers, it can also be a character type, such as a, b, c, etc.;
- A DataFrame is a tabular data structure that has both row and column labels.
1.Series sequence
It is a structure similar to a one-dimensional array, consisting of a set of data values (value) and a set of labels, where there is a one-to-one correspondence between labels and data values.
Series can save any data type, such as integers, strings, floating-point numbers, Python objects, etc., and its labels default to integers, starting from 0 and increasing sequentially.
import pandas as pd
import numpy as np

# 1. An empty Series (dtype given explicitly).
empty_series = pd.Series([], dtype='float64')
print(empty_series)

# Create from an array
data = np.array(['a', 'b', 'c', 'd', 'e'])
# 2. Labels default to integers counting up from 0.
print(pd.Series(data))
# 3. Custom labels.
custom_labels = [101, 102, 103, 104, 105]
print(pd.Series(data, index=custom_labels))

# Create from key/value pairs
data_dict = {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5}
# 4. The dict keys become the labels and the dict values the data.
print(pd.Series(data_dict))
# 5. A requested label with no matching key ('f') is filled with NaN (not a number).
requested_labels = ['b', 'd', 'a', 'f']
print(pd.Series(data_dict, index=requested_labels))
import pandas as pd

s = pd.Series([1, 2, 3, 4, 5], index=['a', 'b', 'c', 'd', 'e'])
# Positional access.
# Use .iloc explicitly: on a Series with non-integer labels, plain s[0] and
# s[[1, 4, 3]] rely on a positional fallback that pandas has deprecated
# (FutureWarning in pandas 2.x; integers are treated as labels afterwards).
print(s.iloc[0])  # 1. a single element
print(s.iloc[:3])  # 2. a slice (end position excluded)
print(s.iloc[[1, 4, 3]])  # 3. several non-contiguous elements
# Label access.
print(s['a'])  # 4. a single element
print(s[:'d'])  # 5. a slice (the end label 'd' is included)
print(s[['a', 'd', 'e']])  # 6. several non-contiguous elements
head() & tail(): view data
import pandas as pd
import numpy as np

# Eight random integers in [1, 100).
s = pd.Series(np.random.randint(1, 100, 8))
views = (
    s,  # the whole Series
    s.head(),  # first 5 entries by default
    s.head(3),  # first n entries
    s.tail(),  # last 5 entries by default
    s.tail(3),  # last n entries
)
for view in views:
    print(view)
isnull() & notnull(): detect missing values
import pandas as pd

s = pd.Series(['a', 'b', 'c', None])
# True where the value is absent or missing.
missing_mask = pd.isnull(s)
print(missing_mask)
# True where a value is present.
present_mask = pd.notnull(s)
print(present_mask)
2. DataFrame table structure
import pandas as pd

frames = [
    # 1. An empty DataFrame.
    pd.DataFrame(),
    # 2. From a single flat list.
    pd.DataFrame(['a', 'b', 'c', 'd']),
    # 3. From a nested list, with explicit column names.
    pd.DataFrame([['Alex', 10], ['Bob', 12], ['Clarke', 13]], columns=['name', 'age']),
    # 4. From a dict of lists; the dict keys are used as column names.
    pd.DataFrame({'Name': ['Tom', 'Jack', 'Steve', 'Ricky'], 'Age': [28, 34, 29, 42]}),
    # 5. From a list of dicts; the dict keys are used as column names.
    pd.DataFrame([{'a': 1, 'b': 2}, {'a': 5, 'b': 10, 'c': 20}]),
]
for frame in frames:
    print(frame)