# Minimal k-means usage: build a 2-cluster estimator, fit it on X, then ask
# the fitted model which cluster each row of X belongs to.
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=2)
# fit() hands back the estimator itself, so fitting in place leaves `kmeans`
# bound to the same fitted object as the chained `KMeans(...).fit(X)` form.
kmeans.fit(X)
kmeans.predict(X)
n_clusters :
The number of clusters to form as well as the number of centroids to generate.  # i.e. how many groups the data is split into
n_init :
Number of times the k-means algorithm will be run with different centroid seeds. The final results will be the best output of n_init consecutive runs in terms of inertia.  # i.e. the number of initializations
max_iter :
Maximum number of iterations of the k-means algorithm for a single run.  # i.e. the iteration limit
## Clustering with 3 features
feature_1 = "salary"
feature_2 = "exercised_stock_options"
feature_3 = "total_payments"
poi = "poi"
features_list = [poi, feature_1, feature_2, feature_3]

# featureFormat / targetFeatureSplit are project helpers defined elsewhere.
# After the split, `poi` is rebound from the feature name to the first
# returned value, and each item of finance_features is unpacked below as
# three values (one per feature_* name above).
data = featureFormat(data_dict, features_list)
poi, finance_features = targetFeatureSplit(data)

from sklearn.cluster import KMeans

# Cluster on all three features; `pred` holds one cluster label per point.
pred = KMeans(n_clusters=2).fit_predict(finance_features)

### in the "clustering with 3 features" part of the mini-project,
### the loop unpacks three values per point but only plots the first two.
# The third value must NOT be passed to plt.scatter: its third positional
# parameter is the marker size `s`, so a raw feature value there would be
# interpreted as the marker area instead of a coordinate.
for f1, f2, _ in finance_features:
    plt.scatter(f1, f2)
plt.show()
## Range of exercised stock options
import numpy as np

# Gather every record's 'exercised_stock_options' value, skipping entries
# equal to the string 'NaN' (presumably the dataset's missing-value marker),
# so that the max/min below are taken over the remaining values only.
# NOTE: np.max / np.min raise ValueError on an empty array, i.e. if every
# record is 'NaN'.
stocklist = np.array([
    record['exercised_stock_options']
    for record in data_dict.values()
    if record['exercised_stock_options'] != 'NaN'
])

# Single-argument print(...) is valid on both Python 2 and 3 and emits the
# same "max: <value>" / "min: <value>" text as the old print statement.
print("max: %s" % np.max(stocklist))
print("min: %s" % np.min(stocklist))
## Range of salaries
# Gather every record's 'salary' value, skipping entries equal to the string
# 'NaN' (presumably the dataset's missing-value marker), so that the max/min
# below are taken over the remaining values only.
# Relies on numpy already being imported as `np` earlier in the file.
# NOTE: np.max / np.min raise ValueError on an empty array, i.e. if every
# record is 'NaN'.
salarylist = np.array([
    record['salary']
    for record in data_dict.values()
    if record['salary'] != 'NaN'
])

# Single-argument print(...) is valid on both Python 2 and 3 and emits the
# same "max: <value>" / "min: <value>" text as the old print statement.
print("max: %s" % np.max(salarylist))
print("min: %s" % np.min(salarylist))