一. Spark:Apache Spark是专为大规模数据处理而设计的快速通用的计算引擎/计算平台,可用来构建大型的、低延迟的数据分析应用程序。
二. Maven中添加依赖包:
<!-- Spark dependency -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.11</artifactId>
<version>2.3.0</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>2.3.0</version>
</dependency>
三. 写了2个方法,第一个是使用spark完成计算的hello world,第二个是spark常用java api的列举。程序需要打包后部署到装有spark的linux机器上运行。
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.sql.SparkSession;
import redis.clients.jedis.Jedis;
import scala.Tuple2;
import java.util.*;
/**
 * Spark learning demos: a minimal "hello world" job plus a walk-through of the
 * common JavaRDD / JavaPairRDD operations. Packaged as a jar and run on a Linux
 * machine with Spark installed (see the surrounding notes for the Maven deps).
 */
public class HelloSpark {

    public static void main(String[] args) {
        // helloSpark();
        javaApi();
    }

    /**
     * Minimal Spark job: reduce a small integer list, collect and print it,
     * save the RDD to the local file system, and store the sum in Redis (db 9).
     */
    private static void helloSpark() {
        SparkSession spark = SparkSession
                .builder()
                .appName("JavaWordCount")
                .getOrCreate();
        JavaSparkContext javaSparkContext = new JavaSparkContext(spark.sparkContext());
        List<Integer> list = Arrays.asList(1, 2, 3, 4, 5);
        JavaRDD<Integer> javaRDD = javaSparkContext.parallelize(list);
        // reduce: combine all elements pairwise with an associative function
        Integer aa = javaRDD.reduce((a, b) -> a + b);
        System.out.println("===========aa ================= " + aa);
        List<Integer> list1 = javaRDD.collect();
        for (Integer i : list1) {
            System.out.print(i + ",");
        }
        // saveAsTextFile takes a target *directory* (not a file name), and the
        // directory must not already exist or the job fails.
        javaRDD.saveAsTextFile("file:/opt/spark/test/rdd.txt");
        // try-with-resources guarantees the Redis connection is closed even if
        // select()/set() throws (the original leaked the connection on error).
        try (Jedis jedis = new Jedis("127.0.0.1")) {
            jedis.select(9);
            jedis.set("hellospark", String.valueOf(aa));
        }
        spark.stop();
    }

    /**
     * Tour of common JavaRDD transformations and actions:
     * map, flatMap, filter, union, groupByKey and reduceByKey.
     */
    private static void javaApi() {
        SparkSession spark = SparkSession
                .builder()
                .appName("JavaWordCount")
                .getOrCreate();
        JavaSparkContext javaSparkContext = new JavaSparkContext(spark.sparkContext());
        List<String> list = Arrays.asList("a,b,cde", "12,34,5");
        JavaRDD<String> javaRDD = javaSparkContext.parallelize(list);
        // map: one output element per input element (String -> String[])
        JavaRDD<String[]> map = javaRDD.map(a -> a.split(","));
        // flatMap: each input element expands to zero or more output elements
        // (lambda replaces the anonymous FlatMapFunction; the explicit
        // Iterable cast in the original was redundant)
        JavaRDD<String> flatMap = javaRDD.flatMap(s -> Arrays.asList(s.split(",")).iterator());
        List<String> collect = javaRDD.collect();
        List<String[]> collect1 = map.collect();
        List<String> collect2 = flatMap.collect();
        System.out.print("javaRdd===========");
        for (String s : collect) {
            System.out.print(s + ":");
        }
        System.out.println();
        System.out.println("map=========");
        for (String[] arr : collect1) {
            System.out.print("========arr=======");
            for (int i = 0; i < arr.length; i++) {
                System.out.print(arr[i] + ":");
            }
            System.out.println();
        }
        System.out.println("flatMap===========");
        for (String s : collect2) {
            System.out.println(s);
        }
        // filter: keep only elements the predicate accepts
        JavaRDD<String> filter = javaRDD.filter(s -> s.contains("c"));
        List<String> collect3 = filter.collect();
        System.out.println("filterRdd=================");
        for (String s : collect3) {
            System.out.print(s);
        }
        System.out.println();
        // union: concatenation of the two RDDs (duplicates are kept)
        JavaRDD<String> union = javaRDD.union(filter);
        List<String> collect4 = union.collect();
        System.out.println("unionRdd===============");
        for (String s : collect4) {
            System.out.println(s);
        }
        List<Tuple2<String, String>> tuple2s = Arrays.asList(
                new Tuple2<>("a", "1"),
                new Tuple2<>("a", "2"),
                new Tuple2<>("b", "3"),
                new Tuple2<>("b", "4"),
                new Tuple2<>("c", "5"));
        JavaPairRDD<String, String> stringStringJavaPairRDD = javaSparkContext.parallelizePairs(tuple2s);
        // groupByKey: all values of the same key gathered into one Iterable
        JavaPairRDD<String, Iterable<String>> stringIterableJavaPairRDD = stringStringJavaPairRDD.groupByKey();
        Map<String, Iterable<String>> stringIterableMap = stringIterableJavaPairRDD.collectAsMap();
        System.out.println("============pairRdd============");
        for (Map.Entry<String, Iterable<String>> entry : stringIterableMap.entrySet()) {
            String key = entry.getKey();
            Iterable<String> value = entry.getValue();
            System.out.println(key + value);
            Iterator<String> iterator = value.iterator();
            System.out.println("===iterator===");
            while (iterator.hasNext()) {
                System.out.print(iterator.next());
            }
            System.out.println();
        }
        // reduceByKey: merge the values of each key pairwise with the given function
        JavaPairRDD<String, String> stringStringJavaPairRDD1 =
                stringStringJavaPairRDD.reduceByKey((s, s2) -> s + "&" + s2);
        List<Tuple2<String, String>> collect5 = stringStringJavaPairRDD1.collect();
        System.out.println("================reduceMapRdd================");
        for (Tuple2<String, String> tuple2 : collect5) {
            System.out.println(tuple2._1 + tuple2._2);
        }
        spark.stop();
    }
}