PySpark LDA

A minimal walkthrough of topic modeling with Latent Dirichlet Allocation (LDA) using Spark MLlib's DataFrame-based API.

from pyspark.ml.clustering import LDA
from pyspark.sql import SparkSession
spark = SparkSession \
    .builder \
    .appName("dataFrame") \
    .getOrCreate()
# Loads the sample corpus that ships with Spark, in libsvm format
# (one document per line: a label followed by sparse term counts).
dataset = spark.read.format("libsvm").load("/home/luogan/lg/softinstall/spark-2.3.0-bin-hadoop2.7/data/mllib/sample_lda_libsvm_data.txt")
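
# Optional sanity check: the libsvm reader produces "label" and "features"
# columns; LDA reads only "features".
dataset.printSchema()
dataset.show(5, truncate=False)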

# Trains an LDA model with k=10 topics.
lda = LDA(k=10, maxIter=10)
model = lda.fit(dataset)
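
# Optional introspection of the fitted model (vocabSize() and
# topicsMatrix() are part of the LDAModel API).
print("Vocabulary size: " + str(model.vocabSize()))
print("Topics matrix (vocabSize x k):")
print(model.topicsMatrix())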

ll = model.logLikelihood(dataset)
lp = model.logPerplexity(dataset)
print("The lower bound on the log likelihood of the entire corpus: " + str(ll))
print("The upper bound on perplexity: " + str(lp))

# Describe each topic by its three top-weighted terms.
topics = model.describeTopics(3)
print("The topics described by their top-weighted terms:")
topics.show(truncate=False)

# Shows the result: transform() adds a "topicDistribution" column giving
# each document's topic mixture.
transformed = model.transform(dataset)
transformed.show(truncate=False)
Output of topics.show(truncate=False):

+-----+-----------+---------------------------------------------------------------+
|topic|termIndices|termWeights                                                    |
+-----+-----------+---------------------------------------------------------------+
|0    |[4, 2, 3]  |[0.10158114017063827, 0.09795896224032703, 0.09634513784922352]|
|1    |[10, 2, 0] |[0.10235714779173181, 0.10170547536458709, 0.10110389889666756]|
|2    |[1, 9, 4]  |[0.23131896487100045, 0.18586479080021476, 0.10634477925718694]|
|3    |[2, 0, 8]  |[0.10105099432861726, 0.09819276397111937, 0.09439293308247762]|
|4    |[2, 5, 6]  |[0.10906148029406972, 0.10299199849781812, 0.1021917583012123] |
|5    |[9, 5, 10] |[0.10994415292597645, 0.10747293316091602, 0.09798917719731169]|
|6    |[1, 10, 0] |[0.10607846050394759, 0.0993757059356977, 0.09866793692409519] |
|7    |[6, 5, 9]  |[0.12489807512390964, 0.09670014499093649, 0.09498633503471743]|
|8    |[10, 3, 6] |[0.18594749407136002, 0.17594182592557966, 0.13121226266337097]|
|9    |[9, 1, 4]  |[0.09841372306322634, 0.09649420277995173, 0.09646167974087538]|
+-----+-----------+---------------------------------------------------------------+
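
The term indices above point into the corpus vocabulary, and the bundled libsvm sample ships without a word list, so they cannot be mapped back to actual words here. Starting from raw text, the usual route is to build the "features" column with CountVectorizer and decode topics through its vocabulary. A minimal sketch, assuming a hypothetical toy DataFrame docs with a single text column:

from pyspark.ml.feature import Tokenizer, CountVectorizer

# Hypothetical toy corpus; replace with your own text DataFrame.
docs = spark.createDataFrame([
    ("spark mllib supports lda topic models",),
    ("lda groups documents into latent topics",),
], ["text"])

# Tokenize and count terms; CountVectorizer keeps the index -> word mapping.
tokens = Tokenizer(inputCol="text", outputCol="words").transform(docs)
cv_model = CountVectorizer(inputCol="words", outputCol="features").fit(tokens)
corpus = cv_model.transform(tokens)

# Fit LDA on the term counts and print each topic's top terms as words.
text_lda = LDA(k=2, maxIter=10).fit(corpus)
vocab = cv_model.vocabulary
for row in text_lda.describeTopics(3).collect():
    print(row["topic"], [vocab[i] for i in row["termIndices"]])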

Reposted from blog.csdn.net/luoganttcc/article/details/80635105