Udacity机器学习入门笔记——数据集与问题

    MORE DATA>FINE-TUNED ALGORITHM

    更多的数据往往比经过精细调整的算法更能带来好的结果;使用更多的数据几乎总能帮助算法取得更好的效果

数据类型:

数值数据(numerical):基本特征就是数值(例如薪水信息)

分类数据(categorical):类别变量,常见于监督分类,取值为有限数量的离散值(例如职位——工作种类是有限的)

时序数据:时间变量(邮件中的时间戳、金融)

文字数据:词袋模型(邮件内容)

其他

import pickle

# Load the dataset dictionary from the project's pickle file.
# pickle needs a binary file object, so the file is opened in "rb" mode
# (text mode "r" fails on Python 3); the with-statement closes the handle
# as soon as loading is done.
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only load a dataset file that comes from a trusted source.
with open("../final_project/final_project_dataset.pkl", "rb") as dataset_file:
    enron_data = pickle.load(dataset_file)
# How many data points (people) are in the dataset?
print(len(enron_data))

# How many features are available for each person?
metts_record = enron_data['METTS MARK']
print(len(metts_record))

# How many entries in the dataset have poi == True?
# (The explicit comparison is kept so only values equal to True are counted.)
print(sum(1 for record in enron_data.values() if record['poi'] == True))

# How many POIs are there in total, according to the names file?
with open('../final_project/poi_names.txt', 'r') as names_file:
    poi_file = names_file.readlines()
# The first two lines are skipped before counting -- presumably a header;
# verify against the file.
print(len(poi_file[2:]))

# What is the total value of the stock belonging to James Prentice?
prentice_record = enron_data['PRENTICE JAMES']
print(prentice_record['total_stock_value'])

# How many emails did Wesley Colwell send to persons of interest?
colwell_record = enron_data['COLWELL WESLEY']
print(colwell_record['from_this_person_to_poi'])

# What is the value of the stock options exercised by Jeffrey Skilling?
skilling_record = enron_data['SKILLING JEFFREY K']
print(skilling_record['exercised_stock_options'])
# Among Lay, Skilling and Fastow, who took home the most money (the largest
# "total_payments" value), and how much did that person get?
people = ["LAY KENNETH L", "SKILLING JEFFREY K", "FASTOW ANDREW S"]
money, who = 0, ""
for name in people:
    payment = enron_data[name]["total_payments"]
    # Strict comparison: on a tie the earlier name in the list is kept.
    if payment > money:
        money, who = payment, name
print(money, who)
# How many people in the dataset have a quantified salary, and how many have
# a known email address? Missing values are stored as the string 'NaN'.
count_salary = sum(
    1 for record in enron_data.values() if record['salary'] != 'NaN'
)
count_email = sum(
    1 for record in enron_data.values() if record['email_address'] != 'NaN'
)
print('count_salary:', count_salary)
print('count_email:', count_email)

# How many people in the (current) E+F dataset have "NaN" for their total
# payments, and what fraction of the dataset do they make up?
missing_payment_records = [
    record for record in enron_data.values()
    if record['total_payments'] == 'NaN'
]
count_NaN_total_payments = len(missing_payment_records)
# float() keeps the ratio a true division regardless of interpreter version.
dataset_size = float(len(enron_data))
print(count_NaN_total_payments, count_NaN_total_payments / dataset_size)

# How many POIs in the E+F dataset have "NaN" for their total payments, and
# what fraction of all POIs is that? The same count is also taken for
# total stock value.
poi_records = [
    record for record in enron_data.values() if record['poi'] == True
]
poi_num = len(poi_records)
count_POI_NaN_total_payments = len(
    [record for record in poi_records if record['total_payments'] == 'NaN']
)
count_POI_NaN_total_stock_value = len(
    [record for record in poi_records if record['total_stock_value'] == 'NaN']
)

print(count_POI_NaN_total_payments, count_POI_NaN_total_payments / float(poi_num))
print(count_POI_NaN_total_stock_value)

猜你喜欢

转载自blog.csdn.net/u012084802/article/details/79923499