1.1 Basic concepts of graphs:
Definition: A graph is a data structure composed of a set of vertices and a set of relationships between vertices (edges).
Role: Graphs model entities and the relationships between them, so they are a natural fit for connected data such as social networks and web pages on the Internet.
Common applications: finding the shortest path in a map application, or recommending products based on a similarity graph built from other users.
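As a minimal sketch of this definition (plain Scala with made-up data, no Spark involved yet), a graph is nothing more than a vertex set plus an edge set:

// a graph represented directly as a vertex set and an edge set
val vertices: Set[String] = Set("Alice", "Bob", "Charlie")
val edges: Set[(String, String)] = Set(("Alice", "Bob"), ("Bob", "Charlie"))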
1.2 Spark GraphX:
GraphX is the component in Spark for graphs and graph-parallel computation.
GraphX extends Spark's RDD abstraction by introducing the property graph: a directed multigraph with attributes attached to vertices and edges.
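Conceptually, a property graph exposes its vertices and edges as RDD-like collections; the core class looks roughly like this (simplified from the GraphX programming guide):

class Graph[VD, ED] {
  val vertices: VertexRDD[VD]  // an RDD of (VertexId, VD) pairs
  val edges: EdgeRDD[ED]       // an RDD of Edge[ED] objects
}

Here VD and ED are the user-defined vertex and edge attribute types.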
1.3 Property graph:
The property graph is the core abstraction of GraphX. It is a directed multigraph with a user-defined object attached to each vertex (Vertex) and edge (Edge). Because there can be multiple relationships between the same pair of vertices, the property graph supports parallel edges, which simplifies modeling such scenarios.
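For example, if user 1 both follows and co-authors with user 2, the two relationships become parallel edges between the same pair of vertices (a sketch with made-up relationship labels):

import org.apache.spark.graphx.Edge

// two distinct relationships between the same pair of vertices = parallel edges
val parallelEdges = Array(
  Edge(1L, 2L, "follow"),
  Edge(1L, 2L, "coauthor")
)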
1.4 Example of a property graph:
1.5 Spark GraphX execution process:
1.6 A first Spark GraphX example:
(1) Create a graph object from the vertex and edge data above
(2) Find the vertices in the graph whose age is greater than 30
(3) Find the edges in the graph whose attribute is greater than 5
(4) Find the maximum out-degree, in-degree, and total degree in the graph
package sparkGraphX

import org.apache.spark.graphx.{Edge, Graph, VertexId}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object sparkGrapaX {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SimpleGraphX").setMaster("local")
    val sc = new SparkContext(conf)
    sc.setLogLevel("WARN")
    // define the users vertex RDD: (id, (name, age))
    val vertexArray: RDD[(VertexId, (String, Int))] =
      sc.parallelize(Array(
        (1L, ("Alice", 28)),
        (2L, ("Bob", 27)),
        (3L, ("Charlie", 65)),
        (4L, ("David", 42)),
        (5L, ("Ed", 55)),
        (6L, ("Fran", 50))
      ))
    // define the relationships edge RDD: Edge(srcId, dstId, attr)
    val edgeArray: RDD[Edge[Int]] =
      sc.parallelize(Array(
        Edge(2L, 1L, 7),
        Edge(2L, 4L, 2),
        Edge(3L, 2L, 4),
        Edge(3L, 6L, 3),
        Edge(4L, 1L, 1),
        Edge(5L, 2L, 2),
        Edge(5L, 3L, 8),
        Edge(5L, 6L, 3)
      ))
    // (1) build the graph object from the vertex and edge data above
    val graph = Graph(vertexArray, edgeArray)
    // (2) find the vertices whose age is greater than 30
    println("vertices with age > 30:")
    graph.vertices.filter { case (id, (name, age)) => age > 30 }.collect.foreach {
      case (id, (name, age)) => println(s"$name age is $age")
    }
    // (3) find the edges whose attribute is greater than 5
    println("edges with attribute > 5:")
    graph.edges.filter(e => e.attr > 5).collect
      .foreach(e => println(s"${e.srcId} to ${e.dstId} att ${e.attr}"))
    // (4) find the maximum out-degree, in-degree, and total degree
    println("maximum out-degree, in-degree, and degree:")
    val maxOutDegree = graph.outDegrees.reduce(max)
    val maxInDegree = graph.inDegrees.reduce(max)
    val maxDegree = graph.degrees.reduce(max)
    println(s"max of outDegrees: $maxOutDegree max of inDegrees: $maxInDegree max of Degrees: $maxDegree")
    sc.stop()
  }

  // keep the (vertexId, degree) pair with the larger degree
  def max(a: (VertexId, Int), b: (VertexId, Int)): (VertexId, Int) = {
    if (a._2 > b._2) a else b
  }
}
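For the data above, vertex 5 (Ed) has the largest out-degree (3) and vertex 2 (Bob) the largest total degree (4). Vertices 1, 2, and 6 tie for the largest in-degree (2), so which of them reduce(max) reports depends on the order in which reduce encounters them.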
2.1 The triplet view:
triplets: the triplet view logically joins the vertex and edge properties and exposes them as an RDD[EdgeTriplet[VD, ED]].
Expressed in SQL, the meaning of this triplet view is roughly the following join (the same query appears in the GraphX programming guide):
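SELECT src.id, dst.id, src.attr, e.attr, dst.attr
FROM edges AS e LEFT JOIN vertices AS src, vertices AS dst
ON e.srcId = src.Id AND e.dstId = dst.Id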
Graphical form:
package sparkGraphX

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.graphx.{Edge, Graph, VertexId}
import org.apache.spark.rdd.RDD

object sparkGraphX {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SimpleGraphX").setMaster("local")
    val sc = new SparkContext(conf)
    sc.setLogLevel("WARN")
    // define the users vertices
    val vertexArray: RDD[(VertexId, (String, Int))] =
      sc.parallelize(Array(
        (1L, ("Alice", 28)),
        (2L, ("Bob", 27)),
        (3L, ("Charlie", 65)),
        (4L, ("David", 42)),
        (5L, ("Ed", 55)),
        (6L, ("Fran", 50))
      ))
    // define the relationships edges
    val edgeArray: RDD[Edge[Int]] =
      sc.parallelize(Array(
        Edge(2L, 1L, 7),
        Edge(2L, 4L, 2),
        Edge(3L, 2L, 4),
        Edge(3L, 6L, 3),
        Edge(4L, 1L, 1),
        Edge(5L, 2L, 2),
        Edge(5L, 3L, 8),
        Edge(5L, 6L, 3)
      ))
    val graph = Graph(vertexArray, edgeArray)
    // (1) use mapVertices to visit every vertex, double the age, and build a new graph
    val graph1: Graph[(String, Int), Int] =
      graph.mapVertices((vid: VertexId, attr: (String, Int)) => (attr._1, attr._2 * 2))
    graph1.vertices.collect.foreach(println(_))
    println("----------------------------")
    // (2) use mapEdges to visit every edge, append a new attribute, and build a new graph
    val graph2: Graph[(String, Int), (Int, Boolean)] = graph.mapEdges(e => (e.attr, true))
    graph2.edges.collect.foreach(println(_))
    println("----------------------------")
    // (3) use mapTriplets to visit every triplet, append a new attribute, and return a new graph
    val graph3: Graph[(String, Int), (Int, String)] = graph.mapTriplets(triplets => (triplets.attr, "age"))
    graph3.edges.collect.foreach(println(_))
    sc.stop()
  }
}
2.2 Property operations on a graph:
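The main property operators are mapVertices, mapEdges, and mapTriplets; their simplified signatures (as presented in the GraphX programming guide) are shown below. Each returns a new graph with transformed attributes but the same structure, which lets GraphX reuse the indices of the original graph:

class Graph[VD, ED] {
  def mapVertices[VD2](map: (VertexId, VD) => VD2): Graph[VD2, ED]
  def mapEdges[ED2](map: Edge[ED] => ED2): Graph[VD, ED2]
  def mapTriplets[ED2](map: EdgeTriplet[VD, ED] => ED2): Graph[VD, ED2]
}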
2.3 Structural operations:
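The main structural operators are reverse, subgraph, mask, and groupEdges; again, simplified signatures from the GraphX programming guide:

class Graph[VD, ED] {
  def reverse: Graph[VD, ED]
  def subgraph(epred: EdgeTriplet[VD, ED] => Boolean,
               vpred: (VertexId, VD) => Boolean): Graph[VD, ED]
  def mask[VD2, ED2](other: Graph[VD2, ED2]): Graph[VD, ED]
  def groupEdges(merge: (ED, ED) => ED): Graph[VD, ED]
}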
Code demonstration of the structural operations:
package sparkGraphX

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.graphx.{Edge, Graph, PartitionStrategy, VertexId}
import org.apache.spark.rdd.RDD

object sparkGraphX_mask {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SimpleGraphX").setMaster("local")
    val sc = new SparkContext(conf)
    sc.setLogLevel("WARN")
    val vertexArray: RDD[(VertexId, (String, Int))] =
      sc.parallelize(Array(
        (1L, ("Alice", 28)),
        (2L, ("Bob", 27)),
        (3L, ("Charlie", 65)),
        (4L, ("David", 42)),
        (5L, ("Ed", 55)),
        (6L, ("Fran", 50))
      ))
    val edgeArray: RDD[Edge[Int]] =
      sc.parallelize(Array(
        Edge(2L, 1L, 7),
        Edge(2L, 4L, 2),
        Edge(3L, 2L, 4),
        Edge(3L, 6L, 3),
        Edge(4L, 1L, 1),
        Edge(5L, 2L, 2),
        Edge(5L, 3L, 8),
        Edge(5L, 6L, 3)
      ))
    val graph = Graph(vertexArray, edgeArray)
    val vertexArray2: RDD[(VertexId, (String, Int))] =
      sc.parallelize(Array(
        (1L, ("Alice", 28)),
        (2L, ("Bob", 27)),
        (7L, ("Tom", 24)),
        (4L, ("David", 42)),
        (8L, ("Smise", 55)),
        (6L, ("Fran", 50))
      ))
    val edgeArray2: RDD[Edge[Int]] =
      sc.parallelize(Array(
        Edge(2L, 1L, 7),
        Edge(2L, 4L, 2),
        Edge(7L, 6L, 4),
        Edge(7L, 6L, 3),
        Edge(4L, 1L, 1),
        Edge(8L, 2L, 2),
        Edge(8L, 7L, 8),
        Edge(8L, 6L, 3)
      ))
    // build a second graph; some vertices and edges in graph2 do not exist in graph
    val graph2 = Graph(vertexArray2, edgeArray2)
    // mask produces a subgraph of graph containing only the vertices and edges also present in graph2
    val graph3: Graph[(String, Int), Int] = graph.mask(graph2)
    graph3.edges.collect.foreach(println(_))
    graph3.vertices.collect.foreach(println(_))
    println("-----------------------------------")
    // reverse swaps the source and destination of every edge
    val graph_reverse: Graph[(String, Int), Int] = graph3.reverse
    graph_reverse.edges.collect.foreach(println(_))
    graph_reverse.vertices.collect.foreach(println(_))
    println("-----------------------------------")
    // subgraph keeps only the vertices and edges that satisfy the given predicates
    val graph_subgraph: Graph[(String, Int), Int] =
      graph2.subgraph(ep => !(ep.srcId == 8L || ep.dstId == 7L))
    graph_subgraph.edges.collect.foreach(println(_))
    println("-----------------------------------")
    // groupEdges merges parallel edges (same source and destination);
    // partitionBy is required first so that parallel edges land in the same partition
    val graph_GraphEdges: Graph[(String, Int), Int] =
      graph2.partitionBy(PartitionStrategy.RandomVertexCut)
        .groupEdges(merge = (ed1, ed2) => ed1 + ed2)
    graph_GraphEdges.edges.collect.foreach(println(_))
    sc.stop()
  }
}
2.4 Join operations:
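GraphX provides two join operators for merging external vertex data into a graph (simplified signatures from the GraphX programming guide). joinVertices keeps the vertex attribute type unchanged and leaves unmatched vertices as they are; outerJoinVertices visits every vertex with an Option and may change the attribute type:

class Graph[VD, ED] {
  def joinVertices[U](table: RDD[(VertexId, U)])(map: (VertexId, VD, U) => VD): Graph[VD, ED]
  def outerJoinVertices[U, VD2](table: RDD[(VertexId, U)])(map: (VertexId, VD, Option[U]) => VD2): Graph[VD2, ED]
}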
package sparkGraphX

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.graphx.{Edge, Graph, VertexId, VertexRDD}
import org.apache.spark.rdd.RDD

object sparkGraphX_Join {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SimpleGraphX").setMaster("local")
    val sc = new SparkContext(conf)
    sc.setLogLevel("WARN")
    // define the users vertices
    val vertexArray: RDD[(VertexId, (String, Int))] =
      sc.parallelize(Array(
        (1L, ("Alice", 28)),
        (2L, ("Bob", 27)),
        (3L, ("Charlie", 65)),
        (4L, ("David", 42)),
        (5L, ("Ed", 55)),
        (6L, ("Fran", 50))
      ))
    // define the relationships edges
    val edgeArray: RDD[Edge[Int]] =
      sc.parallelize(Array(
        Edge(2L, 1L, 7),
        Edge(2L, 4L, 2),
        Edge(3L, 2L, 4),
        Edge(3L, 6L, 3),
        Edge(4L, 1L, 1),
        Edge(5L, 2L, 2),
        Edge(5L, 3L, 8),
        Edge(5L, 6L, 3)
      ))
    val graph = Graph(vertexArray, edgeArray)
    // external vertex data to join in: (vertexId, flag)
    val RDD1: RDD[(Long, String)] =
      sc.parallelize(Array((4L, "true"), (3L, "true"), (5L, "false"), (2L, "true")))
    // joinVertices updates only the vertices present in RDD1; the others keep their attributes
    val joinGraph: Graph[(String, Int), Int] = graph.joinVertices(RDD1)((id, attr, flag) => {
      flag match {
        case "true"  => (attr._1, attr._2 + 1) // increment the age
        case "false" => (attr._1, attr._2 - 1) // decrement the age
        case _       => (attr._1, 0)           // any other value: reset the age
      }
    })
    joinGraph.vertices.collect.foreach(println(_))
    sc.stop()
  }
}
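Note that joinVertices only updates vertices that appear in RDD1; Alice (1L) and Fran (6L) keep their original attributes, so the catch-all case above is never reached with this data. To transform every vertex, including those without a match, outerJoinVertices could be used instead (a sketch reusing graph and RDD1 from the example above):

// outerJoinVertices visits every vertex; unmatched ones see None
val outerGraph: Graph[(String, Int), Int] = graph.outerJoinVertices(RDD1) {
  (id, attr, flagOpt) => flagOpt match {
    case Some("true")  => (attr._1, attr._2 + 1)
    case Some("false") => (attr._1, attr._2 - 1)
    case _             => attr // no entry in RDD1: keep the attribute unchanged
  }
}
outerGraph.vertices.collect.foreach(println(_))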