LDA模型学习（代码）

为了把LDA算法用于文本聚类，我真的是绞尽脑汁。除了去看让我头大的概率论、随机过程、高数这些基础的数学知识，还到网上找已经实现的源代码。
最先让我看到曙光的是Mallet，我研究了大概一个星期，最后还是决定放弃了，因为Mallet作者提供的例子实在太少了。
回到了网上找到的这样一段源代码：
   
   
    
    /*  
    
     * (C) Copyright 2005, Gregor Heinrich (gregor :: arbylon : net) (This file is  
    
     * part of the org.knowceans experimental software packages.)  
    
     */ 
    
    /*  
    
     * LdaGibbsSampler is free software; you can redistribute it and/or modify it  
    
     * under the terms of the GNU General Public License as published by the Free  
    
     * Software Foundation; either version 2 of the License, or (at your option) any  
    
     * later version.  
    
     */ 
    
    /*  
    
     * LdaGibbsSampler is distributed in the hope that it will be useful, but  
    
     * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or  
    
     * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more  
    
     * details.  
    
     */ 
    
    /*  
    
     * You should have received a copy of the GNU General Public License along with  
    
     * this program; if not, write to the Free Software Foundation, Inc., 59 Temple  
    
     * Place, Suite 330, Boston, MA 02111-1307 USA  
    
     */ 
    
     
    
    /*  
    
     * Created on Mar 6, 2005  
    
     */ 
    
    package com.xh.lda;  
    
     
    
    import java.text.DecimalFormat;  
    
    import java.text.NumberFormat;  
    
     
    
    /**  
    
     * Gibbs sampler for estimating the best assignments of topics for words and  
    
     * documents in a corpus. The algorithm is introduced in Tom Griffiths' paper  
    
     * "Gibbs sampling in the generative model of Latent Dirichlet Allocation"  
    
     * (2002).  
    
     *   
    
     * @author heinrich  
    
     */ 
    
    public class LdaGibbsSampler {  
    
     
    
        /**
         * Document data (term lists), as handed to the constructor.
         * {@code documents[m]} is the term list of document m.
         * NOTE(review): terms are presumably integer ids in [0, V) -- confirm
         * against the sampling code.
         */ 
    
        int[][] documents;  
    
     
    
        /**
         * Vocabulary size.
         */ 
    
        int V;  
    
     
    
        /**
         * Number of topics.
         */ 
    
        int K;  
    
     
    
        /**
         * Dirichlet parameter (document--topic associations).
         */ 
    
        double alpha;  
    
     
    
        /**
         * Dirichlet parameter (topic--term associations).
         */ 
    
        double beta;  
    
     
    
        /**
         * Topic assignments for each word. Allocated in initialState with one
         * row per document; row m has {@code documents[m].length} entries.
         */ 
    
        int z[][];  
    
     
    
        /**
         * nw[i][j]: number of instances of word (term) i assigned to topic j.
         * Allocated in initialState as a V x K array.
         */ 
    
        int[][] nw;  
    
     
    
        /**
         * nd[i][j]: number of words in document i assigned to topic j.
         * Allocated in initialState as an M x K array, where M is the number
         * of documents.
         */ 
    
        int[][] nd;  
    
     
    
        /**
         * nwsum[j]: total number of words assigned to topic j.
         * Allocated in initialState with K entries.
         */ 
    
        int[] nwsum;  
    
     
    
        /**
         * ndsum[i]: total number of words in document i.
         * Allocated in initialState with one entry per document.
         */ 
    
        int[] ndsum;  
    
     
    
        /**
         * Cumulative statistics of theta.
         */ 
    
        double[][] thetasum;  
    
     
    
        /**
         * Cumulative statistics of phi.
         */ 
    
        double[][] phisum;  
    
     
    
        /**
         * Size of statistics.
         * NOTE(review): presumably the number of samples accumulated into
         * thetasum / phisum -- confirm where it is updated.
         */ 
    
        int numstats;  
    
     
    
        /**
         * Sampling lag (the original author marked this description with a
         * question mark; the exact role is not visible in this part of the file).
         */ 
    
        private static int THIN_INTERVAL = 20;  
    
     
    
        /**
         * Burn-in period.
         */ 
    
        private static int BURN_IN = 100;  
    
     
    
        /**
         * Maximum number of iterations.
         */ 
    
        private static int ITERATIONS = 1000;  
    
     
    
        /**
         * Sample lag (if -1 only one sample is taken). No initial value is
         * assigned here.
         */ 
    
        private static int SAMPLE_LAG;  
    
     
    
        // NOTE(review): undocumented in the original and not used in the visible
        // part of the file; presumably a column counter for progress display --
        // confirm against the code that reads it.
        private static int dispcol = 0;  
    
     
    
        /**  
    
         * Initialise the Gibbs sampler with data.  
    
         *   
    
         * @param documents
         *            document data (term lists)
         * @param V
         *            vocabulary size
    
         */ 
    
        public LdaGibbsSampler(int[][] documents, int V) {  
    
     
    
            this.documents = documents;  
    
            this.V = V;  
    
        }  
    
     
    
        /**  
    
         * Initialisation: Must start with an assignment of observations to topics ?  
    
         * Many alternatives are possible, I chose to perform random assignments  
    
         * with equal probabilities  
    
         *
         * The resulting assignment of topics to words is stored in the field
         * {@code z}; the method itself returns nothing.
         *
         * @param K
         *            number of topics
    
         */ 
    
        public void initialState(int K) {  
    
            int i;  
    
     
    
            int M = documents.length;  
    
     
    
            // initialise count variables.  
    
            nw = new int[V][K];  
    
            nd = new int[M][K];  
    
            nwsum = new int[K];  
    
            ndsum = new int[M];  
    
     
    
            // The z_i are are initialised to values in [1,K] to determine the  
    
            // initial state of the Markov chain.  
    
     
    
            z = new int[M][];  
    
            for (int m = 0; m < M; m++) {  
    
                int N = documents[m].length;  
    
                z[m] = new int[N];  
    
                for (int n =
LDA模型学习（代码）

猜你喜欢