Data Mining Exercises - Chapter 2

Data mining concepts and techniques optional exercises

Chapter 2 Exercises

write picture description here
The answer in Python is as follows:

import numpy as np
import matplotlib.pyplot as plt

# Sample values from the exercise, listed in increasing order.
data = np.array([13, 15, 16, 16, 19, 20, 20, 21, 22, 22, 25, 25, 25, 25, 30,
                 33, 33, 35, 35, 35, 35, 36, 40, 45, 46, 52, 70])

# Mean of the data.
mean = np.average(data)
print("该数据的均值为:" + str(mean))

# Median: the positional lookup below is only valid on ordered values, so
# work on a sorted copy rather than relying on how the literal was typed.
sorted_data = np.sort(data)
median_index = len(sorted_data) // 2
if len(sorted_data) % 2 == 1:
    # Odd length: the median is the single middle value.
    median = sorted_data[median_index]
else:
    # Even length: the median is the average of the two middle values.
    median = (sorted_data[median_index - 1] + sorted_data[median_index]) / 2
print("该数据的中位数为: " + str(median))

# Mode(s): count every distinct value, then report all values whose count
# equals the highest count found. The highest count is derived from the data
# instead of being hard-coded, so this also works for other data sets.
unique_values, counts = np.unique(data, return_counts=True)
mode_count = counts.max()
modes = unique_values[counts == mode_count]
for mode_value in modes:
    print("该数列的众数数值为" + str(mode_value) + "有" + str(mode_count) + "个")

# Midrange: the average of the largest and the smallest value.
print("该数据的中列数为: " + str((np.max(data) + np.min(data)) / 2))

# Quartile positions; indexing with them relies on `data` being in
# increasing order.
Q1_index = len(data) // 4
Q2_index = len(data) * 2 // 4
Q3_index = len(data) * 3 // 4
quartile_report = "\n".join([
    "第一个四分位数Q1为:" + str(data[Q1_index]),
    "第三个四分位数Q3为:" + str(data[Q3_index]),
])
print(quartile_report)

# Fraction paired with each value; used as the x axis of the quantile plot.
sample_count = len(data)
f_value = [position / sample_count for position in range(sample_count)]

fig = plt.figure()
# Left panel: box plot.
ax1 = fig.add_subplot(1, 2, 1)
ax1.boxplot(data)
# Right panel: quantile plot.
ax2 = fig.add_subplot(1, 2, 2)
ax2.scatter(f_value, data)
plt.show()

The running result is:
write picture description herewrite picture description here

write picture description here

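Answer: The median of grouped (interval) data can be approximated with the formula:

$$\text{median} = L_1 + \left( \frac{N/2 - \left(\sum \text{freq}\right)_l}{\text{freq}_{\text{median}}} \right) \times \text{width}$$

where $L_1$ is the lower boundary of the median interval, $N$ is the total number of values, $\left(\sum \text{freq}\right)_l$ is the sum of the frequencies of all intervals below the median interval, $\text{freq}_{\text{median}}$ is the frequency of the median interval, and $\text{width}$ is the width of the median interval.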

First calculate the location of the median: $M_l = N/2 = (200 + 450 + 300 + 1500 + 700 + 44)/2 = 3194/2 = 1597$. From this it can be judged that the median lies in the interval 21 to 50. The lower bound of this interval is 21, its width is 30, the sum of the frequencies of all intervals below the median interval is $200 + 450 + 300 = 950$, and the frequency of the median interval is 1500. The median can then be calculated as: $$\text{median} = 21 + \frac{1597 - 950}{1500} \times 30 = 21 + 12.94 = 33.94 \approx 34$$

write picture description here

Answer with code:

import numpy as np
import matplotlib.pyplot as plt

# The two numeric variables analysed in this exercise.
age = np.array(
    [23, 23, 27, 27, 39, 41, 47, 49, 50, 52, 54, 54, 56, 57, 58, 58, 60, 61]
)
fat = np.array(
    [9.5, 26.5, 7.8, 17.8, 31.4, 25.9, 27.4, 27.2, 31.2,
     34.6, 42.5, 28.8, 33.4, 30.2, 34.1, 32.9, 41.2, 35.7]
)

# Arithmetic mean of each variable, reported one per line.
age_mean, fat_mean = (np.average(sample) for sample in (age, fat))
print("\n".join([
    "age数据的均值为: " + str(age_mean),
    "fat数据的均值为: " + str(fat_mean),
]))


def median(list):
    """Return the median of a sequence of numbers.

    The values are sorted first, so the input may be given in any order;
    the input itself is not modified.

    Args:
        list: Non-empty sequence (Python list or 1-D NumPy array) of numbers.
            The name shadows the built-in but is kept so existing keyword
            callers keep working.

    Returns:
        The middle value when the length is odd, otherwise the average of
        the two middle values.

    Raises:
        IndexError: If the sequence is empty.
    """
    # Picking the middle position is only meaningful on ordered values.
    ordered = sorted(list)
    middle = len(ordered) // 2
    if len(ordered) % 2 == 1:
        return ordered[middle]
    return (ordered[middle - 1] + ordered[middle]) / 2
# Report the median of both variables, one per line.
median_lines = [
    "age数据的中位数为:" + str(median(age)),
    "fat数据的中位数为:" + str(median(fat)),
]
print("\n".join(median_lines))


def sigma(list):
    """Return the population standard deviation of a sequence of numbers.

    This is sqrt(sum((x - mean) ** 2) / N), i.e. the squared deviations are
    averaged over the number of values before the square root is taken.

    Args:
        list: Sequence (Python list or 1-D NumPy array) of numbers. The name
            shadows the built-in but is kept so existing keyword callers
            keep working.

    Returns:
        The standard deviation as a NumPy float; 0.0 for an empty sequence.
    """
    values = np.asarray(list, dtype=float)
    if values.size == 0:
        # There is nothing to average; return 0.0 instead of dividing by
        # zero so that empty input keeps yielding a plain number.
        return np.float64(0.0)
    # np.std defaults to ddof=0, which divides by N (population form).
    return np.std(values)
# Report the standard deviation of both variables, one per line.
sigma_lines = [
    "age数据的标准差为:" + str(sigma(age)),
    "fat数据的标准差为:" + str(sigma(fat)),
]
print("\n".join(sigma_lines))

# Box plots: one panel per variable, side by side in a single figure.
fig1 = plt.figure()
ax1 = fig1.add_subplot(1, 2, 1)
ax2 = fig1.add_subplot(1, 2, 2)
for panel, sample, caption in (
    (ax1, age, "Box of age"),
    (ax2, fat, "Box of fat"),
):
    panel.boxplot(sample)
    panel.set_title(label=caption)
plt.show()

# Scatter plots: each variable against its 1-based position in the sample.
x = np.linspace(1, len(age), len(age))

fig2 = plt.figure()
ax3 = fig2.add_subplot(1, 2, 1)
ax4 = fig2.add_subplot(1, 2, 2)
for panel, sample, caption in (
    (ax3, age, "Scatter of age"),
    (ax4, fat, "Scatter of fat"),
):
    panel.scatter(x, sample)
    panel.set_title(caption)
plt.show()

# q-q plot: pair the i-th smallest age with the i-th smallest fat value.
# Both samples have the same number of values, so sorting each one and
# plotting them against each other compares their quantiles directly.
# (Plotting the raw, unsorted pairs would be an ordinary scatter plot,
# not a q-q plot.)
fig3 = plt.figure()
ax5 = fig3.add_subplot(111)
ax5.scatter(np.sort(age), np.sort(fat))
ax5.set_title("q-q of age and fat")
plt.show()

The running result is:
write picture description herewrite picture description herewrite picture description here
write picture description here

write picture description here

Answer with code:

import numpy as np

# Attribute values of the two objects whose distances are computed below.
x1 = np.array((22, 1, 42, 10))
x2 = np.array((20, 0, 36, 8))


def Ogld_dis(x1,x2):
    """Return the Euclidean distance between two equal-length vectors.

    Args:
        x1: Sequence or NumPy array of numbers.
        x2: Sequence or NumPy array of numbers with the same shape as ``x1``.

    Returns:
        The square root of the sum of squared element-wise differences.

    Raises:
        ValueError: If ``x1`` and ``x2`` do not have the same shape.
    """
    first, second = np.asarray(x1), np.asarray(x2)
    if first.shape != second.shape:
        # Mismatched inputs have no well-defined distance; refuse them
        # rather than silently truncating or broadcasting.
        raise ValueError("x1 and x2 must have the same shape")
    # axis=0 reduces over the elements of the vectors.
    return np.sqrt(np.sum((first - second) ** 2, axis=0))


def Mhd_dis(x1,x2):
    """Return the Manhattan distance between two equal-length vectors.

    Args:
        x1: Sequence or NumPy array of numbers.
        x2: Sequence or NumPy array of numbers with the same shape as ``x1``.

    Returns:
        The sum of the absolute element-wise differences.

    Raises:
        ValueError: If ``x1`` and ``x2`` do not have the same shape.
    """
    first, second = np.asarray(x1), np.asarray(x2)
    if first.shape != second.shape:
        # Mismatched inputs have no well-defined distance; refuse them
        # rather than silently truncating or broadcasting.
        raise ValueError("x1 and x2 must have the same shape")
    # axis=0 reduces over the elements of the vectors.
    return np.sum(np.abs(first - second), axis=0)


def Mkfsj_dis(x1,x2,h):
    """Return the Minkowski distance of order ``h`` between two vectors.

    Args:
        x1: Sequence or NumPy array of numbers.
        x2: Sequence or NumPy array of numbers with the same shape as ``x1``.
        h: Order of the distance. Must be non-zero because the summed
            powers are raised to ``1 / h``.

    Returns:
        ``(sum(|x1[i] - x2[i]| ** h)) ** (1 / h)``.

    Raises:
        ValueError: If ``x1`` and ``x2`` do not have the same shape.
        ZeroDivisionError: If ``h`` is the Python number 0.
    """
    first, second = np.asarray(x1), np.asarray(x2)
    if first.shape != second.shape:
        # Mismatched inputs have no well-defined distance; refuse them
        # rather than silently truncating or broadcasting.
        raise ValueError("x1 and x2 must have the same shape")
    # axis=0 reduces over the elements of the vectors.
    summed_powers = np.sum(np.abs(first - second) ** h, axis=0)
    return np.power(summed_powers, 1 / h)


def Cbxf_dis(x1, x2):
    """Return the supremum (Chebyshev) distance between two vectors.

    Args:
        x1: Non-empty sequence or NumPy array of numbers.
        x2: Sequence or NumPy array of numbers with the same shape as ``x1``.

    Returns:
        The largest absolute element-wise difference.

    Raises:
        ValueError: If ``x1`` and ``x2`` do not have the same shape, or if
            they are empty (a maximum over no elements is undefined).
    """
    first, second = np.asarray(x1), np.asarray(x2)
    if first.shape != second.shape:
        # Mismatched inputs have no well-defined distance; refuse them
        # rather than silently truncating or broadcasting.
        raise ValueError("x1 and x2 must have the same shape")
    return np.max(np.abs(first - second))

# Print the four distances between the two objects, one per line.
distance_lines = [
    "这两个对象的欧几里得距离为:" + str(Ogld_dis(x1, x2)),
    "这两个对象的曼哈顿距离为:" + str(Mhd_dis(x1, x2)),
    "这两个对象的闵可夫斯基距离(h=3)为:" + str(Mkfsj_dis(x1, x2, h=3)),
    "这两个对象之间的上确界距离为:" + str(Cbxf_dis(x1, x2)),
]
print("\n".join(distance_lines))

The calculation result is:
write picture description here

Guess you like

Origin http://43.154.161.224:23101/article/api/json?id=325690269&siteId=291194637