为了把LDA算法用于文本聚类,我真的是绞尽脑汁。除了去看让我头大的概率论、随机过程、高数这些基础的数学知识,还到网上找已经实现的源代码。
最先让我看到曙光的是Mallet,我研究了大概一个星期,最后决定放弃了。因为Mallet作者提供的例子实在太少了。
回到了网上找到的这样一段源代码:
- /*
- * (C) Copyright 2005, Gregor Heinrich (gregor :: arbylon : net) (This file is
- * part of the org.knowceans experimental software packages.)
- */
- /*
- * LdaGibbsSampler is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option) any
- * later version.
- */
- /*
- * LdaGibbsSampler is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
- * details.
- */
- /*
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
- * Place, Suite 330, Boston, MA 02111-1307 USA
- */
- /*
- * Created on Mar 6, 2005
- */
- package com.xh.lda;
- import java.text.DecimalFormat;
- import java.text.NumberFormat;
- /**
- * Gibbs sampler for estimating the best assignments of topics for words and
- * documents in a corpus. The algorithm is introduced in Tom Griffiths' paper
- * "Gibbs sampling in the generative model of Latent Dirichlet Allocation"
- * (2002).
- *
- * @author heinrich
- */
- public class LdaGibbsSampler {
- /**
- * Document data (term lists): documents[m] is the int array for document m.
- */
- int[][] documents;
- /**
- * Vocabulary size; used as the first dimension of nw.
- */
- int V;
- /**
- * Number of topics.
- */
- int K;
- /**
- * Dirichlet parameter (document--topic associations).
- */
- double alpha;
- /**
- * Dirichlet parameter (topic--term associations).
- */
- double beta;
- /**
- * Topic assignments for each word: z[m] has one entry per element of
- * documents[m] and is allocated in initialState.
- */
- int z[][];
- /**
- * nw[i][j]: number of instances of word (term) i assigned to topic j.
- * Allocated as [V][K] in initialState.
- */
- int[][] nw;
- /**
- * nd[i][j]: number of words in document i assigned to topic j.
- * Allocated as [documents.length][K] in initialState.
- */
- int[][] nd;
- /**
- * nwsum[j]: total number of words assigned to topic j (length K).
- */
- int[] nwsum;
- /**
- * ndsum[i]: total number of words in document i (length documents.length).
- */
- int[] ndsum;
- /**
- * Cumulative statistics of theta.
- */
- double[][] thetasum;
- /**
- * Cumulative statistics of phi.
- */
- double[][] phisum;
- /**
- * Size of statistics. NOTE(review): presumably the number of samples
- * accumulated into thetasum/phisum; the code using it is not visible here.
- */
- int numstats;
- /**
- * Thinning interval, 20 by default. The original author labelled this
- * "sampling lag (?)"; its use is not visible in this part of the file.
- */
- private static int THIN_INTERVAL = 20;
- /**
- * Burn-in period, 100 by default.
- */
- private static int BURN_IN = 100;
- /**
- * Maximum number of iterations, 1000 by default.
- */
- private static int ITERATIONS = 1000;
- /**
- * Sample lag (if -1 only one sample taken). Has no initializer, so it
- * holds the Java default of 0 until assigned.
- */
- private static int SAMPLE_LAG;
- private static int dispcol = 0;
- /**
- * Initialise the Gibbs sampler with data.
- *
- * @param documents
- * document data; the array reference is stored as-is, not copied
- * @param V
- * vocabulary size
- */
- public LdaGibbsSampler(int[][] documents, int V) {
- this.documents = documents;
- this.V = V;
- }
- /**
- * Initialisation: Must start with an assignment of observations to topics ?
- * Many alternatives are possible, I chose to perform random assignments
- * with equal probabilities
- *
- * @param K
- * number of topics
- * @return z assignment of topics to words
- */
- public void initialState(int K) {
- int i;
- int M = documents.length;
- // initialise count variables.
- nw = new int[V][K];
- nd = new int[M][K];
- nwsum = new int[K];
- ndsum = new int[M];
- // The z_i are are initialised to values in [1,K] to determine the
- // initial state of the Markov chain.
- z = new int[M][];
- for (int m = 0; m < M; m++) {
- int N = documents[m].length;
- z[m] = new int[N];
- for (int n =