Pushing data from Spark to Elasticsearch

1. Project dependencies

<properties>
    <spark_version>2.3.1</spark_version>
    <!-- elasticsearch-->
    <elasticsearch.version>5.5.2</elasticsearch.version>
    <fastjson.version>1.2.28</fastjson.version>
    <elasticsearch-hadoop.version>6.3.2</elasticsearch-hadoop.version>
    <elasticsearch-spark.version>5.5.2</elasticsearch-spark.version>
</properties>
<dependencies>
    <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-core -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_2.11</artifactId>
        <version>${spark_version}</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-sql -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql_2.11</artifactId>
        <version>${spark_version}</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-yarn -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-yarn_2.11</artifactId>
        <version>${spark_version}</version>
    </dependency>
    <dependency>
        <groupId>org.elasticsearch</groupId>
        <artifactId>elasticsearch-spark-20_2.11</artifactId>
        <version>${elasticsearch-spark.version}</version>
    </dependency>
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>5.1.46</version>
    </dependency>
</dependencies>
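
The SparkConf in the next section pulls its connection settings from an ESProperties constants class that the original post does not show. A minimal sketch, with placeholder values (all of them assumptions to be replaced with your own cluster settings):

public class ESProperties {
    // Hypothetical constants holder; every value below is a placeholder
    public static final String IP = "127.0.0.1";            // es.nodes
    public static final String PORT = "9200";               // es.port
    public static final String PUSH_DOWN = "true";          // pushdown
    public static final String INDEX_AUTO_CREATE = "true";  // es.index.auto.create
    public static final String SECURITY_USER = "elastic";   // es.net.http.auth.user
    public static final String SECURITY_PWD = "changeme";   // es.net.http.auth.pass
}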

2. Spark reads data from Hadoop HDFS and pushes it to Elasticsearch

import com.google.common.collect.ImmutableMap;
import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.elasticsearch.spark.rdd.api.java.JavaEsSpark;

// PinyinTool, ESProperties and RDDKeyByCounts are project classes assumed to be on the classpath
public class PushWordCombination {

    private static PinyinTool tool = new PinyinTool();

    public static void pushDataByLen(SparkContext sc, SparkSession sparkSession, String goodsCategory, Integer len) {
        // Read the pre-computed word-combination JSON for this category and length from HDFS
        Dataset<Row> goodsDF1 = sparkSession.read().format("json").json(String.format("/data/app/%s/combination%d.json", goodsCategory, len));
        if (goodsDF1.count() == 0) {
            return;
        }

        // Register a UDF that converts a name to lowercase pinyin via the project's PinyinTool helper
        sparkSession.udf().register("pinYin", (String s) -> tool.toPinYin(s, "", PinyinTool.Type.LOWERCASE), DataTypes.StringType);

        // Map the selected columns onto the RDDKeyByCounts bean; the category id is hard-coded to 0 here
        Encoder<RDDKeyByCounts> nameKeyEncoder = Encoders.bean(RDDKeyByCounts.class);
        Dataset<RDDKeyByCounts> dataset = goodsDF1.selectExpr("name as name", "counts as counts", String.format("%d as goodsCategory", 0),
                String.format("%d as nameLen", len), "pinYin(name) as pinYin").as(nameKeyEncoder);

        // Write to the goods-category index (type category), using the name field as the document _id
        JavaEsSpark.saveToEs(dataset.javaRDD(), "goods-category/category", ImmutableMap.of("es.mapping.id", "name"));
    }

    public static void main(String[] args) {
        // Configure Spark and the elasticsearch-hadoop connector
        SparkConf conf = new SparkConf().setAppName("my-app")
                .set("es.nodes", ESProperties.IP)
                .set("es.port", ESProperties.PORT)
                .set("pushdown", ESProperties.PUSH_DOWN)
                .set("es.index.auto.create", ESProperties.INDEX_AUTO_CREATE)
                // In WAN-only mode the connector disables node discovery and routes every
                // operation, reads and writes alike, through the declared es.nodes
                .set("es.nodes.wan.only", "true")
                .set("es.net.http.auth.user", ESProperties.SECURITY_USER)
                .set("es.net.http.auth.pass", ESProperties.SECURITY_PWD);

        SparkContext sc = new SparkContext(conf);

        SparkSession sparkSession = new SparkSession(sc);

        // Push the pre-computed combination files of length 2 through 4 for the "all" category
        for (int j = 2; j <= 4; j++) {
            pushDataByLen(sc, sparkSession, "all", j);
        }
        sparkSession.stop();
    }
}
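
Encoders.bean(RDDKeyByCounts.class) expects a plain Java bean with a no-arg constructor and getters/setters matching the columns produced by selectExpr. Neither RDDKeyByCounts nor the PinyinTool helper is shown in the original post; a minimal sketch of the bean, with field names taken from the selectExpr aliases and field types assumed, could look like this:

public class RDDKeyByCounts {
    // Hypothetical bean; field types are assumptions inferred from the JSON input
    private String name;           // word combination, also used as the ES document _id
    private Long counts;           // occurrence count
    private Integer goodsCategory; // category id (hard-coded to 0 above)
    private Integer nameLen;       // combination length
    private String pinYin;         // pinyin form produced by the pinYin UDF

    public String getName() { return name; }
    public void setName(String name) { this.name = name; }
    public Long getCounts() { return counts; }
    public void setCounts(Long counts) { this.counts = counts; }
    public Integer getGoodsCategory() { return goodsCategory; }
    public void setGoodsCategory(Integer goodsCategory) { this.goodsCategory = goodsCategory; }
    public Integer getNameLen() { return nameLen; }
    public void setNameLen(Integer nameLen) { this.nameLen = nameLen; }
    public String getPinYin() { return pinYin; }
    public void setPinYin(String pinYin) { this.pinYin = pinYin; }
}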

Reprinted from blog.csdn.net/yangbosos/article/details/88620100