[Spark] Build a tree based on parent and child nodes

The data set has two columns, child and parent. The task is to build the tree described by these edges and, for every node, find its root node and the full call chain up to that root. For example, given the edges (a, b), (b, c), (c, d), (d, e), every node's root is e, and the chain for a is a-b-c-d-e.

1. Recursive traversal

import org.apache.spark.sql.DataFrame

val sqlContext = new org.apache.spark.sql.SQLContext(sc)

val df = sqlContext.createDataFrame(List(
  ("a", "b", "b", "a-b", "20190201"),
  ("b", "c", "c", "b-c", "20190201"),
  ("c", "d", "d", "c-d", "20190201"),
  ("d", "e", "e", "d-e", "20190201")
)).toDF("child", "parent", "grandparent", "chain", "dt")

// Rename every column so the self-join below can tell the two sides apart.
val start = df.selectExpr("child as child1", "parent as parent1",
  "grandparent as grandparent1", "chain as chain1", "dt as dt1")

// Climb one level of the tree per call: join each edge's parent to the
// previous pass's child1. grandparent1 turns null once a row's walk has
// passed the root, so the recursion stops when no non-null value is left.
def findroot(dataframe: DataFrame): DataFrame = {
  if (dataframe.where("grandparent1 is not null").take(1).isEmpty) {
    dataframe
  } else {
    val next = df.join(dataframe, df("parent") === dataframe("child1"), "left_outer")
      .selectExpr(
        "child as child1",
        "NVL(parent1, parent) as parent1",
        "grandparent1",
        // NVL falls back to this edge's parent when the join finds no match,
        // so the last node of a finished chain is not dropped.
        "concat_ws('-', child, NVL(chain1, parent)) as chain1",
        "dt as dt1")
    findroot(next)
  }
}

val output = findroot(start)

output.show

Running this, every node resolves to the root e (row order may vary):
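
+------+-------+------------+---------+--------+
|child1|parent1|grandparent1|   chain1|     dt1|
+------+-------+------------+---------+--------+
|     a|      e|        null|a-b-c-d-e|20190201|
|     b|      e|        null|  b-c-d-e|20190201|
|     c|      e|        null|    c-d-e|20190201|
|     d|      e|        null|      d-e|20190201|
+------+-------+------------+---------+--------+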

2. Optimization
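
The recursive version works, but each call both extends the query plan by another join and re-evaluates it for the take(1) check, which gets expensive as the tree grows deeper. The version below keeps the same level-by-level walk but drives it with a while loop, caches the result of each pass, and carries a tmp column marking the node the walk currently sits on; once tmp is null on every row, all chains have reached the root.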

import org.apache.spark.sql.functions._
import org.apache.spark.sql.DataFrame

def findRoot(df: DataFrame): DataFrame = {
  import df.sparkSession.implicits._
  df.cache()

  // Seed: the chain starts as child->parent, and tmp tracks the node the
  // walk currently sits on. A null tmp means the row has reached the root.
  var result = df
    .withColumn("chain", concat_ws("->", df("child"), df("parent")))
    .withColumn("root", df("parent"))
    .withColumn("tmp", df("parent"))

  var sample = result.where("tmp is not null").take(1)
  while (sample.nonEmpty) {
    // Advance every unfinished row one level by looking up tmp's parent
    // among the source edges; rows with no match have reached the root.
    result = result.as("result")
      .join(df.as("source"),
        ($"source.child" === $"result.tmp") && $"result.tmp".isNotNull,
        "left_outer")
      .select(col("result.child"), col("result.parent"),
        // concat_ws skips nulls, so a finished row keeps its chain as is
        concat_ws("->", col("result.chain"), col("source.parent")).as("chain"),
        coalesce(col("source.parent"), col("result.root")).as("root"),
        col("source.parent").as("tmp"))
    // Cache each pass so the take(1) check does not recompute the full plan.
    result.cache()
    sample = result.where("tmp is not null").take(1)
  }

  df.unpersist()
  result.drop("tmp")
}

val sqlContext = new org.apache.spark.sql.SQLContext(sc)

val df = sqlContext.createDataFrame(List(
  ("a", "b"),
  ("b", "c"),
  ("c", "d"),
  ("d", "e")
)).toDF("child", "parent")

val result = findRoot(df)
result.show

For this sample data, every chain again resolves to the root e (row order may vary):
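
+-----+------+-------------+----+
|child|parent|        chain|root|
+-----+------+-------------+----+
|    a|     b|a->b->c->d->e|   e|
|    b|     c|   b->c->d->e|   e|
|    c|     d|      c->d->e|   e|
|    d|     e|         d->e|   e|
+-----+------+-------------+----+

One caveat: cache() pins every pass's intermediate result in memory, and the query plan still grows by one join per level. Assuming Spark 2.3+, where Dataset.localCheckpoint is available, a variation could truncate the lineage instead of caching; a minimal sketch, not from the original post:

  // inside the while loop, in place of result.cache():
  result = result.localCheckpoint() // materializes this pass and cuts its lineage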



Origin blog.csdn.net/qq_37771475/article/details/108005140