- An n-gram is a sequence of n consecutive words. The input to NGram is a sequence of strings, such as the output of a tokenizer. The parameter n sets the number of words in each n-gram. The output is a sequence of n-grams, where each n-gram is a string of n consecutive words separated by spaces. If the input sequence contains fewer than n words, the NGram output is empty.
Examples:
import org.apache.spark.ml.feature.NGram
import org.apache.spark.sql.SparkSession
/**
 * N-gram model example.
 *
 * Builds a small DataFrame of pre-tokenized sentences, converts each word
 * sequence into bigrams (n = 2) with Spark ML's `NGram` transformer, and
 * prints the resulting `ngrams` column.
 *
 * @author wangjuncheng
 */
object NGram {

  /**
   * Program entry point.
   *
   * An explicit `main` is used instead of `extends App`: Spark's documentation
   * warns that subclasses of `scala.App` may not work correctly.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // Local session using all available cores. Hive support is not needed for
    // this example; add `.enableHiveSupport()` to the builder if it is.
    val spark = SparkSession
      .builder()
      .master("local[*]")
      .appName("ml_learn")
      .getOrCreate()

    try {
      // Each row is (id, words); `words` is an already-tokenized sentence.
      val wordDataFrame = spark.createDataFrame(Seq(
        (0, Array("Hi", "I", "heard", "about", "Spark")),
        (1, Array("I", "wish", "Java", "could", "use", "case", "classes")),
        (2, Array("Logistic", "regression", "models", "are", "neat"))
      )).toDF("id", "words")

      // N-gram transformer. `NGram` in type position resolves to the imported
      // Spark ML class: the enclosing object only defines a term of that name.
      val nGram = new NGram()
        .setN(2)
        .setInputCol("words")
        .setOutputCol("ngrams")

      // Transform: appends the "ngrams" column to the input DataFrame.
      val nGramDF = nGram.transform(wordDataFrame)

      // Result: print the n-grams without truncating long cell values.
      nGramDF.select("ngrams").show(false)
    } finally {
      // Always release the session, even if the transformation or output fails.
      spark.stop()
    }
  }
}