Spark reading from ES and writing to HDFS

https://blog.csdn.net/qq_39481696/article/details/82597912
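The link above covers reading from Elasticsearch with Spark and writing the result to HDFS. As a rough orientation only, here is a minimal sketch of that flow assuming the elasticsearch-hadoop connector (JavaEsSpark); the ES host, index name, and HDFS path are placeholders and are not taken from the linked article.

import java.util.Map;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.elasticsearch.spark.rdd.api.java.JavaEsSpark;

public class SparkEsToHdfs {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("SparkEsToHdfs").setMaster("local[*]");
        conf.set("es.nodes", "node1");   // placeholder ES host
        conf.set("es.port", "9200");
        JavaSparkContext sc = new JavaSparkContext(conf);

        // read the "student/_doc" index into (docId, fieldMap) pairs
        JavaPairRDD<String, Map<String, Object>> esRDD = JavaEsSpark.esRDD(sc, "student/_doc");

        // write one document per line to HDFS (placeholder path)
        esRDD.values().map(doc -> doc.toString()).saveAsTextFile("hdfs://namenode:8020/tmp/es_student");

        sc.close();
    }
}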

HBase getting started
https://blog.csdn.net/guolindonggld/article/details/82767620
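The second link is an HBase primer. For reference, the gld:student table that the Spark job below scans could be created with the HBase 1.x Java client roughly as follows; this is a sketch that reuses the quorum, namespace, table, and column family names from the listing below and is not taken from the linked post.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HColumnDescriptor;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.NamespaceDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;

public class CreateStudentTable {
    public static void main(String[] args) throws Exception {
        Configuration conf = HBaseConfiguration.create();
        conf.set("hbase.zookeeper.quorum", "node1,node2,node3");
        conf.set("hbase.zookeeper.property.clientPort", "2181");
        try (Connection connection = ConnectionFactory.createConnection(conf);
             Admin admin = connection.getAdmin()) {
            admin.createNamespace(NamespaceDescriptor.create("gld").build()); // fails if "gld" already exists
            HTableDescriptor table = new HTableDescriptor(TableName.valueOf("gld:student"));
            table.addFamily(new HColumnDescriptor("cf1")); // column family used by the Spark job
            admin.createTable(table);
        }
    }
}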

package org.bathkafka.com;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.mapreduce.TableInputFormat;
import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.protobuf.generated.ClientProtos;
import org.apache.hadoop.hbase.util.Base64;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;

public class SparkToHbase {
    public static void main(String[] args) {
        // If the cluster has Kerberos enabled, obtain an authenticated Configuration instead
        // (uncomment the next line); otherwise leave it commented out:
        // Configuration configuration = kerberos();
        Configuration configuration = new Configuration();
        String tableName = "gld:student"; // table under the "gld" namespace

        String FAMILY = "cf1";       // column family
        String COLUM_NAME = "name";  // column qualifier
        String COLUM_SEX = "sex";    // column qualifier
        String COLUM_AGE = "age";    // column qualifier

        SparkConf sparkConf = new SparkConf().setAppName("SparkDataFromHbase").setMaster("local[*]");
        JavaSparkContext sc = new JavaSparkContext(sparkConf);

        // HBase configuration; on a Kerberos-enabled cluster this must be the authenticated conf
        Configuration hconf = HBaseConfiguration.create(configuration);
        hconf.set("hbase.zookeeper.quorum", "node1,node2,node3");
        hconf.set("hbase.zookeeper.property.clientPort", "2181");
        hconf.set(TableInputFormat.INPUT_TABLE, tableName);

        Scan scan = new Scan();
        scan.addFamily(Bytes.toBytes(FAMILY));
        scan.addColumn(Bytes.toBytes(FAMILY), Bytes.toBytes(COLUM_AGE));
        scan.addColumn(Bytes.toBytes(FAMILY), Bytes.toBytes(COLUM_SEX));
        scan.addColumn(Bytes.toBytes(FAMILY), Bytes.toBytes(COLUM_NAME));
        try {
            // serialize the Scan and pass it to TableInputFormat
            ClientProtos.Scan proto = ProtobufUtil.toScan(scan);
            String scanToString = Base64.encodeBytes(proto.toByteArray());
            hconf.set(TableInputFormat.SCAN, scanToString);
            // read the HBase table into an RDD
            JavaPairRDD<ImmutableBytesWritable, Result> hbaseRDD = sc.newAPIHadoopRDD(hconf,
                    TableInputFormat.class, ImmutableBytesWritable.class, Result.class);
            hbaseRDD.cache(); // cache hbaseRDD because it is used more than once
            System.out.println("Total row count: " + hbaseRDD.count());

            // map the HBase rows into a PairRDD of (age, name)
            JavaPairRDD<Integer, String> mapToPair = hbaseRDD.mapToPair(new PairFunction<Tuple2<ImmutableBytesWritable,
                    Result>, Integer, String>() {
                private static final long serialVersionUID = -2437063503351644147L;

                @Override
                public Tuple2<Integer, String> call(
                        Tuple2<ImmutableBytesWritable, Result> resultTuple2) throws Exception {
                    byte[] o1 = resultTuple2._2.getValue(Bytes.toBytes(FAMILY), Bytes.toBytes(COLUM_NAME)); // name value
                    byte[] o2 = resultTuple2._2.getValue(Bytes.toBytes(FAMILY), Bytes.toBytes(COLUM_AGE));  // age value
                    byte[] o3 = resultTuple2._2.getValue(Bytes.toBytes(FAMILY), Bytes.toBytes(COLUM_SEX));  // sex value (not used below)
                    return new Tuple2<Integer, String>(Integer.valueOf(Bytes.toString(o2)), Bytes.toString(o1));
                }
            });

            // sort by key (age) in descending order
            JavaPairRDD<Integer, String> sortByKey = mapToPair.sortByKey(false);
            sortByKey.foreach(new VoidFunction<Tuple2<Integer, String>>() {
                @Override
                public void call(Tuple2<Integer, String> integerStringTuple2) throws Exception {
                    System.out.println(integerStringTuple2._2);
                }
            });
            // write the result to HDFS
            // sortByKey.saveAsTextFile("hdfs://********:8020/tmp/test");

            hbaseRDD.unpersist();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            sc.close(); // release the SparkContext
        }
    }
}
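The saveAsTextFile call in the listing is commented out. As a minimal sketch of that write step, the snippet below would go inside the try block after the foreach, formatting each (age, name) pair as one tab-separated line before saving; the HDFS path is a placeholder, and it additionally needs org.apache.spark.api.java.JavaRDD and org.apache.spark.api.java.function.Function imports.

JavaRDD<String> lines = sortByKey.map(new Function<Tuple2<Integer, String>, String>() {
    @Override
    public String call(Tuple2<Integer, String> t) throws Exception {
        return t._2 + "\t" + t._1; // name<TAB>age
    }
});
lines.saveAsTextFile("hdfs://namenode:8020/tmp/student_sorted"); // placeholder HDFS path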

Reprinted from www.cnblogs.com/Mr--zhao/p/12212757.html