RandomSeedGenerator.buildRandom
//初始中心为vector自己 Kluster newCluster = new Kluster(value.get(), nextClusterId++, measure);
TFIDF建立的Vector类型
//TFIDFPartialVectorReducer Vector vector = new RandomAccessSparseVector((int) featureCount, value.getNumNondefaultElements());
默认的测量距离的方法
measureClass = SquaredEuclideanDistanceMeasure.class.getName();
聚类策略
ClusteringPolicy policy = new KMeansClusteringPolicy(convergenceDelta); ClusterClassifier prior = new ClusterClassifier(clusters, policy);
主要算法保存在
org.apache.mahout.clustering.iterator.CIMapper org.apache.mahout.clustering.iterator.CIReducer
求得某点到所有cluster中心的距离,并换算为归一化后的概率,得到的是一个向量:
AbstractClusteringPolicy.java
public Vector classify(Vector data, ClusterClassifier prior) { List<Cluster> models = prior.getModels(); int i = 0; Vector pdfs = new DenseVector(models.size()); for (Cluster model : models) { //求得该点属于某个模型的几率,为1/(1+该点到聚类中心的距离) pdfs.set(i++, model.pdf(new VectorWritable(data))); } //zSum:vector中所有元素的和,TimesFunction用来计算乘积的函数,assign对每个元素执行该函数 return pdfs.assign(new TimesFunction(), 1.0 / pdfs.zSum()); }
找出相似度最大的值
public Vector select(Vector probabilities) { //获取最大的值,因为保存的值为1/距离,所以距离越小值越大,相似性就越高 int maxValueIndex = probabilities.maxValueIndex(); Vector weights = new SequentialAccessSparseVector(probabilities.size()); weights.set(maxValueIndex, 1.0); return weights; }
合并相同的聚类,并重新计算中心点
CIReducer.java
protected void reduce(IntWritable key, Iterable<ClusterWritable> values, Context context) throws IOException, InterruptedException { Iterator<ClusterWritable> iter = values.iterator(); ClusterWritable first = null; while (iter.hasNext()) { ClusterWritable cw = iter.next(); if (first == null) { first = cw; } else { first.getValue().observe(cw.getValue()); } } List<Cluster> models = new ArrayList<Cluster>(); models.add(first.getValue()); classifier = new ClusterClassifier(models, policy); //此处进入重新计算聚类中心点,引用的是KMeansClusteringPolicy.close()方法, //KMeansClusteringPolicy在close中引用computeParameters方法计算中心点 classifier.close(); context.write(key, first); }
computeParameters方法计算中心点
public void computeParameters() { if (getS0() == 0) { return; } setNumObservations((long) getS0()); setTotalObservations(getTotalObservations() + getNumObservations()); setCenter(getS1().divide(getS0())); // compute the component stds //计算半径,貌似采用标准方差的办法 if (getS0() > 1) { setRadius(getS2().times(getS0()).minus(getS1().times(getS1())).assign(new SquareRootFunction()).divide(getS0())); } setS0(0); //设置s1为空的vector setS1(center.like()); //设置s2为空的vector setS2(center.like()); }
计算聚类归属:
在计算完中心点后进行,也可以理解为分类。仅当参数runClustering=true时才生效;此时可通过参数clusterClassificationThreshold设置阈值,只输出满足该阈值条件的分类结果,输出目录为clusteredPoints。
/**
 * Assigns the input points to the computed clusters by delegating to
 * {@code ClusterClassificationDriver.run}.
 *
 * @param conf the Hadoop configuration (not read by this method)
 * @param input the path of the input points
 * @param clustersIn the path the clustering policy is written to
 * @param output the output path; the classified points go to its
 *        {@code CLUSTERED_POINTS_DIRECTORY} child
 * @param measure the distance measure (only logged by this method)
 * @param clusterClassificationThreshold the threshold forwarded to the classification driver
 * @param runSequential forwarded to the classification driver
 * @throws IOException propagated from the policy write or the classification run
 * @throws InterruptedException propagated from the classification run
 * @throws ClassNotFoundException propagated from the classification run
 */
public static void clusterData(Configuration conf, Path input, Path clustersIn, Path output,
    DistanceMeasure measure, double clusterClassificationThreshold, boolean runSequential)
    throws IOException, InterruptedException, ClassNotFoundException {
  if (log.isInfoEnabled()) {
    Object[] logArgs = {input, clustersIn, output, measure};
    log.info("Running Clustering");
    log.info("Input: {} Clusters In: {} Out: {} Distance: {}", logArgs);
  }
  KMeansClusteringPolicy kMeansPolicy = new KMeansClusteringPolicy();
  ClusterClassifier.writePolicy(kMeansPolicy, clustersIn);
  // NOTE(review): the driver receives `output` (not `clustersIn`) as its second argument -
  // confirm this is the intended clusters location.
  Path clusteredPoints = new Path(output, CLUSTERED_POINTS_DIRECTORY);
  ClusterClassificationDriver.run(input, output, clusteredPoints, clusterClassificationThreshold, true,
      runSequential);
}