1.1 Diagrama conceptual básico de grafos:
Definición: Un grafo es una estructura de datos compuesta por un conjunto de vértices (Vertex) y un conjunto de relaciones entre vértices, llamadas aristas (Edge).
Rol: Los gráficos pueden modelar cosas y las relaciones entre ellos. Los gráficos se pueden usar para representar datos conectados de forma natural, tales como: redes sociales, páginas web de Internet
Aplicaciones de uso común: encuentre la ruta más corta en la aplicación de mapas y recomiende productos basados en el mapa de similitud con otros.
1.2 Spark GraphX:
GraphX es un nuevo componente de Spark para el cómputo paralelo sobre grafos.
GraphX expande el RDD de Spark al introducir gráficos de atributos: multigrafos dirigidos con atributos en vértices y bordes
1.3 Cuadro de propiedades:
El grafo de propiedades es el modelo abstracto central de GraphX: un multigrafo dirigido en el que cada vértice (Vertex) y cada arista (Edge) llevan objetos definidos por el usuario. Dado que pueden existir múltiples relaciones entre los mismos vértices, el grafo de propiedades admite aristas paralelas, lo que simplifica el modelado de estos escenarios.
1.4 Ejemplos de gráficos de atributos:
1.5 Proceso de ejecución de Spark GraphX:
1.6 El primer caso de SparkGraphX:
(1)通过上面的项点数据和边数据创建图对象
(2)找出图中年龄大于 30 的顶点
(3)找出图中属性大于 5 的边
(4)找出图中最大的出度、入度、度数
package sparkGraphX
import org.apache.spark.graphx.{Edge, Graph, VertexId}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object sparkGrapaX {
  /** First GraphX example: builds a small property graph and runs basic queries.
    *
    * Tasks:
    *  (1) build a graph from the vertex and edge data below
    *  (2) find vertices whose age is greater than 30
    *  (3) find edges whose attribute is greater than 5
    *  (4) find the maximum out-degree, in-degree and total degree
    */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SimpleGraphX").setMaster("local")
    val sc = new SparkContext(conf)
    sc.setLogLevel("WARN")
    try {
      // Vertices: (id, (name, age))
      val vertexArray: RDD[(VertexId, (String, Int))] =
        sc.parallelize(Array(
          (1L, ("Alice", 28)),
          (2L, ("Bob", 27)),
          (3L, ("Charlie", 65)),
          (4L, ("David", 42)),
          (5L, ("Ed", 55)),
          (6L, ("Fran", 50))
        ))
      // Edges: Edge(srcId, dstId, attr) with an Int attribute
      val edgeArray: RDD[Edge[Int]] =
        sc.parallelize(Array(
          Edge(2L, 1L, 7),
          Edge(2L, 4L, 2),
          Edge(3L, 2L, 4),
          Edge(3L, 6L, 3),
          Edge(4L, 1L, 1),
          Edge(5L, 2L, 2),
          Edge(5L, 3L, 8),
          Edge(5L, 6L, 3)
        ))
      // (1) build the graph and (2) list vertices with age > 30
      val graph = Graph(vertexArray, edgeArray)
      println("图中年龄大于 30 的顶点")
      graph.vertices.filter { case (_, (_, age)) => age > 30 }.collect.foreach {
        case (_, (name, age)) => println(s"$name age is $age")
      }
      // (3) edges whose attribute is greater than 5
      println("图中属性大于 5 的边")
      graph.edges.filter(e => e.attr > 5).collect.foreach(e => println(s"${e.srcId} to ${e.dstId} att ${e.attr}"))
      // (4) maximum out-degree, in-degree and total degree
      println("图中最大的出度、入度、度数")
      val maxOutDegrees = graph.outDegrees.reduce(max)
      val maxInDegrees = graph.inDegrees.reduce(max)
      val maxDegrees = graph.degrees.reduce(max)
      println("max of outDegrees:" + maxOutDegrees + " max of inDegrees:" + maxInDegrees + " max of Degrees:" + maxDegrees)
    } finally {
      // Release local Spark resources even if a job above fails.
      sc.stop()
    }
  }

  /** Returns the (vertexId, degree) pair with the larger degree count. */
  def max(a: (VertexId, Int), b: (VertexId, Int)): (VertexId, Int) =
    if (a._2 > b._2) a else b
}
2.1 Vista de tripletes (triplets):
triplets: vista de tripletes; esta vista guarda lógicamente los atributos de vértices y aristas juntos, como un RDD[EdgeTriplet[VD, ED]].
La expresión SQL indica el significado de esta vista ternaria :
Forma gráfica:
package sparkGraphX
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.graphx.{Edge, Graph, VertexId}
import org.apache.spark.rdd.RDD
object sparkGraphX {
  /** Demonstrates the GraphX property operators: mapVertices, mapEdges
    * and mapTriplets, each of which returns a new graph with transformed
    * attributes while leaving the graph structure unchanged.
    */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SimpleGraphX").setMaster("local")
    val sc = new SparkContext(conf)
    sc.setLogLevel("WARN")
    try {
      // Vertices: (id, (name, age))
      val vertexArray: RDD[(VertexId, (String, Int))] =
        sc.parallelize(Array(
          (1L, ("Alice", 28)),
          (2L, ("Bob", 27)),
          (3L, ("Charlie", 65)),
          (4L, ("David", 42)),
          (5L, ("Ed", 55)),
          (6L, ("Fran", 50))
        ))
      // Edges: Edge(srcId, dstId, attr)
      val edgeArray: RDD[Edge[Int]] =
        sc.parallelize(Array(
          Edge(2L, 1L, 7),
          Edge(2L, 4L, 2),
          Edge(3L, 2L, 4),
          Edge(3L, 6L, 3),
          Edge(4L, 1L, 1),
          Edge(5L, 2L, 2),
          Edge(5L, 3L, 8),
          Edge(5L, 6L, 3)
        ))
      val graph = Graph(vertexArray, edgeArray)
      // (1) mapVertices: transform every vertex attribute (here, double the age)
      val graph1: Graph[(String, Int), Int] =
        graph.mapVertices((_: VertexId, attr: (String, Int)) => (attr._1, attr._2 * 2))
      graph1.vertices.collect.foreach(println(_))
      println("----------------------------")
      // (2) mapEdges: visit every edge and extend its attribute with a new value
      val graph2: Graph[(String, Int), (Int, Boolean)] = graph.mapEdges(e => (e.attr, true))
      graph2.edges.collect.foreach(println(_))
      println("----------------------------")
      // (3) mapTriplets: visit every (srcVertex, edge, dstVertex) triplet and
      // extend the edge attribute with a new value
      val graph3: Graph[(String, Int), (Int, String)] =
        graph.mapTriplets(triplet => (triplet.attr, "age"))
      graph3.edges.collect.foreach(println(_))
    } finally {
      // Release local Spark resources even if a job above fails.
      sc.stop()
    }
  }
}
2.2 Operación de atributo del gráfico :
2.3 Operación estructural:
Demostración del código de operación de estructura:
package sparkGraphX
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.graphx.{Edge, Graph, VertexId}
import org.apache.spark.rdd.RDD
object sparkGraphX_mask {
  /** Demonstrates the GraphX structural operators: mask, reverse,
    * subgraph and groupEdges, using two overlapping example graphs.
    */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SimpleGraphX").setMaster("local")
    val sc = new SparkContext(conf)
    sc.setLogLevel("WARN")
    try {
      // First graph: vertices (id, (name, age)) and Int-attributed edges.
      val vertexArray: RDD[(VertexId, (String, Int))] =
        sc.parallelize(Array(
          (1L, ("Alice", 28)),
          (2L, ("Bob", 27)),
          (3L, ("Charlie", 65)),
          (4L, ("David", 42)),
          (5L, ("Ed", 55)),
          (6L, ("Fran", 50))
        ))
      val edgeArray: RDD[Edge[Int]] =
        sc.parallelize(Array(
          Edge(2L, 1L, 7),
          Edge(2L, 4L, 2),
          Edge(3L, 2L, 4),
          Edge(3L, 6L, 3),
          Edge(4L, 1L, 1),
          Edge(5L, 2L, 2),
          Edge(5L, 3L, 8),
          Edge(5L, 6L, 3)
        ))
      val graph = Graph(vertexArray, edgeArray)
      // Second graph: partially overlaps the first (shares some vertices/edges,
      // adds ids 7 and 8, and contains a duplicated 7L -> 6L edge).
      val vertexArray2: RDD[(VertexId, (String, Int))] =
        sc.parallelize(Array(
          (1L, ("Alice", 28)),
          (2L, ("Bob", 27)),
          (7L, ("Tom", 24)),
          (4L, ("David", 42)),
          (8L, ("Smise", 55)),
          (6L, ("Fran", 50))
        ))
      val edgeArray2: RDD[Edge[Int]] =
        sc.parallelize(Array(
          Edge(2L, 1L, 7),
          Edge(2L, 4L, 2),
          Edge(7L, 6L, 4),
          Edge(7L, 6L, 3),
          Edge(4L, 1L, 1),
          Edge(8L, 2L, 2),
          Edge(8L, 7L, 8),
          Edge(8L, 6L, 3)
        ))
      val graph2 = Graph(vertexArray2, edgeArray2)
      // mask: keep only the vertices and edges of `graph` that also exist in
      // `graph2`, i.e. the common subgraph.
      val graph3: Graph[(String, Int), Int] = graph.mask(graph2)
      graph3.edges.collect.foreach(println(_))
      graph3.vertices.collect.foreach(println(_))
      println("-----------------------------------")
      // reverse: swap the source and destination of every edge.
      val graphReverse: Graph[(String, Int), Int] = graph3.reverse
      graphReverse.edges.collect.foreach(println(_))
      graphReverse.vertices.collect.foreach(println(_))
      println("-----------------------------------")
      // subgraph: keep only edges that satisfy the predicate (here, drop edges
      // leaving vertex 8 or entering vertex 7).
      val graphSubgraph: Graph[(String, Int), Int] =
        graph2.subgraph(ep => !(ep.srcId == 8L || ep.dstId == 7L))
      graphSubgraph.edges.collect.foreach(println(_))
      println("-----------------------------------")
      // groupEdges: merge parallel edges (same src and dst) by summing attrs.
      val graphGroupEdges: Graph[(String, Int), Int] =
        graph2.groupEdges(merge = (ed1, ed2) => ed1 + ed2)
      graphGroupEdges.edges.collect.foreach(println(_))
    } finally {
      // Release local Spark resources even if a job above fails.
      sc.stop()
    }
  }
}
2.4 Operación de asociación:
package sparkGraphX
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.graphx.{Edge, Graph, VertexId, VertexRDD}
import org.apache.spark.rdd.RDD
object sparkGraphX_Join {
  /** Demonstrates the GraphX join operator `joinVertices`: merges an external
    * RDD of per-vertex data into the graph's vertex attributes. Vertices with
    * no matching entry in the RDD keep their original attribute.
    */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("SimpleGraphX").setMaster("local")
    val sc = new SparkContext(conf)
    sc.setLogLevel("WARN")
    try {
      // Vertices: (id, (name, age))
      val vertexArray: RDD[(VertexId, (String, Int))] =
        sc.parallelize(Array(
          (1L, ("Alice", 28)),
          (2L, ("Bob", 27)),
          (3L, ("Charlie", 65)),
          (4L, ("David", 42)),
          (5L, ("Ed", 55)),
          (6L, ("Fran", 50))
        ))
      // Edges: Edge(srcId, dstId, attr)
      val edgeArray: RDD[Edge[Int]] =
        sc.parallelize(Array(
          Edge(2L, 1L, 7),
          Edge(2L, 4L, 2),
          Edge(3L, 2L, 4),
          Edge(3L, 6L, 3),
          Edge(4L, 1L, 1),
          Edge(5L, 2L, 2),
          Edge(5L, 3L, 8),
          Edge(5L, 6L, 3)
        ))
      val graph = Graph(vertexArray, edgeArray)
      // External per-vertex data: stringly-typed flags keyed by vertex id.
      val flagsRdd: RDD[(Long, String)] =
        sc.parallelize(Array((4L, "true"), (3L, "true"), (5L, "false"), (2L, "true")))
      // joinVertices: adjust each matched vertex's age by the flag value.
      // Fixed: the original last case was `case none =>`, a lowercase variable
      // pattern that silently acts as a catch-all; `case _ =>` states the
      // intent explicitly (same behavior).
      val joinGraph: Graph[(String, Int), Int] = graph.joinVertices(flagsRdd) { (_, attr, flag) =>
        flag match {
          case "true"  => (attr._1, attr._2 + 1)
          case "false" => (attr._1, attr._2 - 1)
          case _       => (attr._1, 0)
        }
      }
      joinGraph.vertices.collect.foreach(println(_))
    } finally {
      // Release local Spark resources even if a job above fails.
      sc.stop()
    }
  }
}