特征缩放
哪些机器学习算法会受到特征缩放的影响？答：使用 RBF 核函数的 SVM 和 K-均值聚类（二者都需要在多个特征维度上计算距离，因此取值范围较大的特征会主导结果）。
# sklearn: rescale a single feature with MinMaxScaler.
from sklearn.preprocessing import MinMaxScaler
import numpy

# Column vector of floats: 3 rows, 1 column.
weights = numpy.array([[115.], [140.], [175.]])
scaler = MinMaxScaler()
# fit_transform learns the scaling from `weights` and applies it in one step.
rescaled_weight = scaler.fit_transform(weights)
# Function-call form of print works on both Python 2 and Python 3.
print(rescaled_weight)
def featureScaling(arr):
    """Rescale the values of *arr* to the range [0, 1] (min-max scaling).

    Each value ``x`` is mapped to ``(x - min) / (max - min)``.

    Args:
        arr: Sequence (or array-like) of numbers. Nested input such as a
            column vector is flattened before scaling.

    Returns:
        A flat list of floats, one per input value. If every value is
        identical (max == min) the scaling is undefined and 0.5 is returned
        for each element. An empty input yields an empty list.
    """
    import numpy as np

    # Convert once to a float array so the division below is always true
    # division, regardless of the input's integer type.
    values = np.asarray(arr, dtype=float).ravel()
    if values.size == 0:
        # np.min / np.max raise ValueError on empty arrays.
        return []

    lowest = values.min()
    highest = values.max()
    if highest == lowest:
        # Avoid dividing by zero when there is no spread in the data.
        return [0.5] * values.size

    return ((values - lowest) / (highest - lowest)).tolist()


data = [115, 140, 175]
# Function-call form of print works on both Python 2 and Python 3.
print(featureScaling(data))
文本学习
## 从 NLTK 中获取停用词
from nltk.corpus import stopwords

# Load the English stop-word list from the NLTK corpus.
# NOTE(review): presumably the stopwords corpus must already be downloaded
# (e.g. via nltk.download) - confirm for the target environment.
sw = stopwords.words('english')
# Print the count explicitly: a bare `len(sw)` only shows a value inside an
# interactive session and is silently discarded when run as a script.
print(len(sw))
##清除“签名文字”
# Collapse every run of whitespace into a single space and drop leading and
# trailing whitespace: split() with no argument splits on any whitespace run,
# and the pieces are re-joined with exactly one space.
# NOTE(review): this statement only normalizes whitespace; the removal of the
# signature words themselves is not shown here - confirm it happens earlier.
words = ' '.join(words.split())
##进行 TfIdf
from sklearn.feature_extraction.text import TfidfVectorizer

# Build a TfIdf vocabulary over `word_data`, ignoring English stop words.
# `word_data` is defined elsewhere; presumably a list of document strings.
vectorizer = TfidfVectorizer(stop_words="english")
vectorizer.fit_transform(word_data)

# Newer scikit-learn releases replaced get_feature_names() with
# get_feature_names_out(); fall back to the old name on older releases.
try:
    feature_names = vectorizer.get_feature_names_out()
except AttributeError:
    feature_names = vectorizer.get_feature_names()

# Function-call form of print works on both Python 2 and Python 3.
print(len(feature_names))
# NOTE(review): index 34597 is hard-coded and only valid when the fitted
# vocabulary has more than 34597 entries - it depends on the data set.
print(feature_names[34597])