import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
3.1.2 随机变量及其分布
# Legacy NumPy generator with a fixed seed so every run of the notebook
# reproduces the same numbers.
r = np.random.RandomState(seed=1)
# Ten draws from the uniform distribution on [0, 1).
r.rand(10)
array([4.17022005e-01, 7.20324493e-01, 1.14374817e-04, 3.02332573e-01,
1.46755891e-01, 9.23385948e-02, 1.86260211e-01, 3.45560727e-01,
3.96767474e-01, 5.38816734e-01])
# Scatter plot of 1000 points whose coordinates are independent uniform
# draws on [0, 1); x is drawn before y so the random stream is unchanged.
x, y = r.rand(1000), r.rand(1000)
plt.scatter(x, y)
<matplotlib.collections.PathCollection at 0x114540198>
# Scatter plot of 1000 points whose coordinates are independent standard
# normal draws; x is drawn before y so the random stream is unchanged.
x, y = r.randn(1000), r.randn(1000)
plt.scatter(x, y)
<matplotlib.collections.PathCollection at 0x1145d0c88>
# Scatter plot of 1000 points whose coordinates are independent normal
# draws with mean 10 and standard deviation 5.
x = r.normal(loc=10, scale=5, size=1000)
y = r.normal(loc=10, scale=5, size=1000)
plt.scatter(x, y)
<matplotlib.collections.PathCollection at 0x1145cfda0>
# Plot the curve y = lambd * exp(-lambd * x) on a grid over [0, 15) with
# step 0.1 (the density of an exponential distribution with rate lambd).
lambd = 0.5
x = np.arange(0, 15, 0.1)
decay = np.exp(-lambd * x)
y = lambd * decay
plt.plot(x, y)
[<matplotlib.lines.Line2D at 0x1146cb860>]
# Random-walk simulation of a repeated bet.
# Each entry of `binomial` is the number of successes in 9 trials with
# success probability 0.5. Starting from a balance of 1000, every round
# after the first entry loses 8 when the count is below 5 and wins 8
# otherwise. `binomial[0]` is drawn but never used: index 0 of `money` holds
# the starting balance.
binomial = np.random.binomial(9, 0.5, 10000)

# Per-round change in balance for rounds 1..9999, computed in one
# vectorized pass instead of a Python-level loop.
steps = np.where(binomial[1:] < 5, -8.0, 8.0)

# money[i] is the balance after round i; the running total of the steps
# reproduces money[i] = money[i-1] +/- 8 exactly (all values are whole
# numbers, so the float arithmetic is exact).
money = np.empty(10000)
money[0] = 1000
money[1:] = 1000 + np.cumsum(steps)
# Balance as a function of the round index.
plt.plot(np.arange(money.size), money)
[<matplotlib.lines.Line2D at 0x114c65390>]
# Monte Carlo estimates for the number of successes in 3 trials with
# success probability 0.5: a1 is the fraction of samples equal to 1 and a2
# is the fraction of samples greater than 0. Each estimate uses its own
# independent batch of draws.
n_samples = 10000
# The mean of a boolean array is the fraction of True entries; this stays
# inside NumPy instead of iterating the array with the builtin sum().
a1 = np.mean(np.random.binomial(3, 0.5, n_samples) == 1)
a2 = np.mean(np.random.binomial(3, 0.5, n_samples) > 0)
print('恰好出现一次正面的概率为%.4f, 至少有一次出现正面的概率为%.4f' % (a1,a2))
恰好出现一次正面的概率为0.3740, 至少有一次出现正面的概率为0.8737
# Draw 40 letters from the six candidates using r.choice's default
# sampling settings.
letters = ['a', 'b', 'c', 'd', 'e', 'f']
r.choice(letters, size=40)
array(['a', 'a', 'b', 'e', 'e', 'a', 'd', 'f', 'f', 'c', 'c', 'e', 'a',
'f', 'b', 'd', 'd', 'a', 'e', 'c', 'f', 'e', 'c', 'a', 'a', 'a',
'd', 'a', 'd', 'b', 'a', 'a', 'e', 'a', 'e', 'e', 'c', 'b', 'c',
'a'], dtype='<U1')
# Build the integers 0..19, then permute them in place with the seeded
# generator; the bare name on the last line displays the result.
ar = np.arange(0, 20)
r.shuffle(ar)
ar
array([ 6, 15, 2, 0, 12, 9, 11, 1, 8, 5, 19, 16, 7, 4, 14, 18, 3,
10, 17, 13])
3.1.3 随机变量的数字特征
# Compute the population variance (divisor n) of 100 standard normal draws
# by hand and print it next to np.var for comparison.
r = np.random.RandomState(seed=2)
ar = r.randn(100)
n = ar.size
m = np.mean(ar)
deviations = ar - m
var = (deviations ** 2).sum() / n
print(var)
print(np.var(ar))
1.0754432724043381
1.0754432724043381
# Sample mean and population variance of 100 draws from a normal
# distribution with mean 10 and standard deviation 5.
ar = r.normal(10, 5, 100)
e = ar.mean()
var = ar.var()
print('结果的均值为%.2f,方差为%.2f' % (e,var))
结果的均值为9.05,方差为23.21
# Re-seed the generator, scale 1000 standard normal draws by 100, and show
# their distribution as a 50-bin histogram with a grid.
r = np.random.RandomState(seed=1)
ar = 100 * r.randn(1000)
plt.hist(ar, bins=50)
plt.grid()
# Box plot of the sample `ar`, with every display option spelled out
# explicitly and collected in one place.
boxplot_options = {
    'vert': True,
    'whis': 1.5,
    'patch_artist': True,
    'meanline': False,
    'showmeans': True,
    'showbox': True,
    'showcaps': True,
    'showfliers': True,
    'notch': False,
}
plt.boxplot(ar, **boxplot_options)
plt.grid()
# Compute the 25th, 40th and 75th percentiles of the same data twice, once
# through pandas (Series.quantile / median) and once through NumPy
# (np.percentile), and print both sets for comparison.
df = pd.DataFrame(ar, columns=['value'])
value_col = df['value']
q25, q40, q75 = (value_col.quantile(q) for q in (0.25, 0.4, 0.75))
print('df的25分位数为%.2f, 40分位数为%.2f, 75分位数为%.2f' % (q25,q40,q75))
print('df的中位数为%.2f' % value_col.median())
print('-------')
a25, a40, a75 = (np.percentile(ar, p) for p in (25, 40, 75))
print('ar的25分位数为%.2f, 40分位数为%.2f, 75分位数为%.2f' % (a25,a40,a75))
df的25分位数为-60.02, 40分位数为-19.19, 75分位数为70.40
df的中位数为4.13
-------
ar的25分位数为-60.02, 40分位数为-19.19, 75分位数为70.40