Use docker-compose to build a Doris and Hudi test environment

The local environment is an Apple M1 Mac running macOS 12.5.1.

Related files

docker-compose.yml

version: "3.3"

services:

  namenode:
    image: registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-namenode:arm64
    platform: linux/arm64
    hostname: namenode
    container_name: namenode
    environment:
      - CLUSTER_NAME=hudi_hadoop284_hive232_spark244
    ports:
      - "50070:50070"
      - "8020:8020"
      # JVM debugging port (will be mapped to a random port on host)
      - "5005"
    env_file:
      - ./hadoop.env
    healthcheck:
      test: [ "CMD", "curl", "-f", "http://namenode:50070" ]
      interval: 30s
      timeout: 10s
      retries: 3
    networks:
      mynet:
        ipv4_address: 172.18.0.100

  datanode1:
    image: registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-datanode:arm64
    platform: linux/arm64
    container_name: datanode1
    hostname: datanode1
    environment:
      - CLUSTER_NAME=hudi_hadoop284_hive232_spark244
    env_file:
      - ./hadoop.env
    ports:
      - "50075:50075"
      - "50010:50010"
      # JVM debugging port (will be mapped to a random port on host)
      - "5005"
    links:
      - "namenode"
      - "historyserver"
    healthcheck:
      test: [ "CMD", "curl", "-f", "http://datanode1:50075" ]
      interval: 30s
      timeout: 10s
      retries: 3
    depends_on:
      - namenode
    networks:
      mynet:
        ipv4_address: 172.18.0.101

  historyserver:
    image: registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-history:arm64
    hostname: historyserver
    container_name: historyserver
    environment:
      - CLUSTER_NAME=hudi_hadoop284_hive232_spark244
    depends_on:
      - "namenode"
    links:
      - "namenode"
    ports:
      - "58188:8188"
    healthcheck:
      test: [ "CMD", "curl", "-f", "http://historyserver:8188" ]
      interval: 30s
      timeout: 10s
      retries: 3
    env_file:
      - ./hadoop.env
    volumes:
      - historyserver:/hadoop/yarn/timeline
    networks:
      mynet:
        ipv4_address: 172.18.0.102

  hive-metastore-postgresql:
    image: menorah84/hive-metastore-postgresql:2.3.0
    platform: linux/arm64
    environment:
      - POSTGRES_HOST_AUTH_METHOD=trust
    volumes:
      - hive-metastore-postgresql:/var/lib/postgresql
    hostname: hive-metastore-postgresql
    container_name: hive-metastore-postgresql
    networks:
      mynet:
        ipv4_address: 172.18.0.103

  hivemetastore:
    image: registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-hive_2.3.3:arm64
    platform: linux/arm64
    hostname: hivemetastore
    container_name: hivemetastore
    links:
      - "hive-metastore-postgresql"
      - "namenode"
    env_file:
      - ./hadoop.env
    command: /opt/hive/bin/hive --service metastore
    environment:
      SERVICE_PRECONDITION: "namenode:50070 hive-metastore-postgresql:5432"
    ports:
      - "9083:9083"
      # JVM debugging port (will be mapped to a random port on host)
      - "5005"
    healthcheck:
      test: [ "CMD", "nc", "-z", "hivemetastore", "9083" ]
      interval: 30s
      timeout: 10s
      retries: 3
    depends_on:
      - "hive-metastore-postgresql"
      - "namenode"
    networks:
      mynet:
        ipv4_address: 172.18.0.104

  hiveserver:
    image: registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-hive_2.3.3:arm64
    platform: linux/arm64
    hostname: hiveserver
    container_name: hiveserver
    env_file:
      - ./hadoop.env
    environment:
      SERVICE_PRECONDITION: "hivemetastore:9083"
    ports:
      - "10000:10000"
      # JVM debugging port (will be mapped to a random port on host)
      - "5005"
    depends_on:
      - "hivemetastore"
    links:
      - "hivemetastore"
      - "hive-metastore-postgresql"
      - "namenode"
    volumes:
      - ${HUDI_WS}:/var/hoodie/ws
    networks:
      mynet:
        ipv4_address: 172.18.0.105

  sparkmaster:
    image: registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-hive_2.3.3-sparkmaster_2.4.4:arm64
    platform: linux/arm64
    hostname: sparkmaster
    container_name: sparkmaster
    env_file:
      - ./hadoop.env
    ports:
      - "8080:8080"
      - "7077:7077"
      # JVM debugging port (will be mapped to a random port on host)
      - "5005"
    environment:
      - INIT_DAEMON_STEP=setup_spark
    links:
      - "hivemetastore"
      - "hiveserver"
      - "hive-metastore-postgresql"
      - "namenode"
    networks:
      mynet:
        ipv4_address: 172.18.0.106

  spark-worker-1:
    image: registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-hive_2.3.3-sparkworker_2.4.4:arm64
    platform: linux/arm64
    hostname: spark-worker-1
    container_name: spark-worker-1
    env_file:
      - ./hadoop.env
    depends_on:
      - sparkmaster
    ports:
      - "8081:8081"
      # JVM debugging port (will be mapped to a random port on host)
      - "5005"
    environment:
      - "SPARK_MASTER=spark://sparkmaster:7077"
    links:
      - "hivemetastore"
      - "hiveserver"
      - "hive-metastore-postgresql"
      - "namenode"
    networks:
      mynet:
        ipv4_address: 172.18.0.107

  zookeeper:
    image: 'registry.cn-hangzhou.aliyuncs.com/jensenchen/zookeeper:3.4.12'
    platform: linux/arm64
    hostname: zookeeper
    container_name: zookeeper
    ports:
      - "2181:2181"
    environment:
      - ALLOW_ANONYMOUS_LOGIN=yes
    networks:
      mynet:
        ipv4_address: 172.18.0.108

  kafka:
    image: 'registry.cn-hangzhou.aliyuncs.com/jensenchen/kafka:2.13-2.8.1'
    platform: linux/arm64
    hostname: kafkabroker
    container_name: kafkabroker
    ports:
      - "9092:9092"
    environment:
      - KAFKA_ZOOKEEPER_CONNECT=zookeeper:2181
      - ALLOW_PLAINTEXT_LISTENER=yes
      - KAFKA_ADVERTISED_HOST_NAME=kafkabroker
    networks:
      mynet:
        ipv4_address: 172.18.0.109

  adhoc-1:
    image: registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-hive_2.3.3-sparkadhoc_2.4.4:arm64
    platform: linux/arm64
    hostname: adhoc-1
    container_name: adhoc-1
    env_file:
      - ./hadoop.env
    depends_on:
      - sparkmaster
    ports:
      - '4040:4040'
      # JVM debugging port (mapped to 5006 on the host)
      - "5006:5005"
    environment:
      - "SPARK_MASTER=spark://sparkmaster:7077"
    links:
      - "hivemetastore"
      - "hiveserver"
      - "hive-metastore-postgresql"
      - "namenode"
    volumes:
      - ${HUDI_WS}:/var/hoodie/ws
    networks:
      mynet:
        ipv4_address: 172.18.0.110

  adhoc-2:
    image: registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-hive_2.3.3-sparkadhoc_2.4.4:linux-arm64-0.10.1
    platform: linux/arm64
    hostname: adhoc-2
    container_name: adhoc-2
    env_file:
      - ./hadoop.env
    ports:
      # JVM debugging port (mapped to 5005 on the host)
      - "5005:5005"
    depends_on:
      - sparkmaster
    environment:
      - "SPARK_MASTER=spark://sparkmaster:7077"
    links:
      - "hivemetastore"
      - "hiveserver"
      - "hive-metastore-postgresql"
      - "namenode"
    volumes:
      - ${HUDI_WS}:/var/hoodie/ws
    networks:
      mynet:
        ipv4_address: 172.18.0.111

  doris-fe:
    image: registry.cn-hangzhou.aliyuncs.com/jensenchen/doris:1.2.2-fe-arm
    hostname: doris-fe
    container_name: doris-fe
    environment:
      - FE_SERVERS=fe1:172.18.0.112:9010
      - FE_ID=1
    networks:
      mynet:
        ipv4_address: 172.18.0.112

  doris-be:
    image: registry.cn-hangzhou.aliyuncs.com/jensenchen/doris:1.2.2-be-arm
    hostname: doris-be
    container_name: doris-be
    environment:
      - FE_SERVERS=fe1:172.18.0.112:9010
      - BE_ADDR=172.18.0.113:9050
    depends_on:
      - doris-fe
    networks:
      mynet:
        ipv4_address: 172.18.0.113

volumes:
  namenode:
  historyserver:
  hive-metastore-postgresql:

networks:
  mynet:
    external: true

The Hudi-related services here follow the docker-compose layout shipped with Hudi. In the upstream setup, however, both the hiveserver and adhoc containers mount the Hudi directory from the host machine, which means Hudi has to be compiled in advance. Here the hiveserver and adhoc images were rebuilt with the required packages and configuration files baked into the image, so they no longer depend on mounting an external directory.

The networks section uses a custom network that must be created in advance. By default, docker-compose names the network {folder containing docker-compose.yml}_default, and "_" is an illegal character in hostnames, so subsequent access fails with an error like:

ERROR 1105 (HY000): HMSClientException, msg: org.apache.hadoop.hive.metastore.api.MetaException: Got exception: java.net.URISyntaxException Illegal character in hostname at index 30: thrift://hivemetastore.compose_default:9083

The mynet network can be created in advance with:

docker network create mynet --driver bridge --subnet 172.18.0.0/16 --gateway 172.18.0.1
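
To double-check the subnet and gateway before starting the cluster (optional):

docker network inspect mynet | grep -E 'Subnet|Gateway'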

Exception

max_map_count: because Docker containers on macOS run inside a Linux VM, the vm.max_map_count value required by the Doris BE may not be modifiable directly on the host during deployment. First create the following container:

docker run -it --privileged --pid=host --name=change_count debian nsenter -t 1 -m -u -n -i sh

After the container starts and drops you into a shell, execute the following command:

sysctl -w vm.max_map_count=2000000

Then type exit to leave the shell and continue with creating the Doris Docker cluster.
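
To confirm that the new value took effect inside the Docker Desktop VM (an optional sanity check that reuses the nsenter trick above):

docker run -it --rm --privileged --pid=host debian nsenter -t 1 -m -u -n -i sysctl vm.max_map_count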

hadoop.env

HIVE_SITE_CONF_javax_jdo_option_ConnectionURL=jdbc:postgresql://hive-metastore-postgresql/metastore
HIVE_SITE_CONF_javax_jdo_option_ConnectionDriverName=org.postgresql.Driver
HIVE_SITE_CONF_javax_jdo_option_ConnectionUserName=hive
HIVE_SITE_CONF_javax_jdo_option_ConnectionPassword=hive
HIVE_SITE_CONF_datanucleus_autoCreateSchema=false
HIVE_SITE_CONF_hive_metastore_uris=thrift://hivemetastore:9083

HDFS_CONF_dfs_namenode_datanode_registration_ip___hostname___check=false
HDFS_CONF_dfs_webhdfs_enabled=true
HDFS_CONF_dfs_permissions_enabled=false
#HDFS_CONF_dfs_client_use_datanode_hostname=true
#HDFS_CONF_dfs_namenode_use_datanode_hostname=true
HDFS_CONF_dfs_replication=1

CORE_CONF_fs_defaultFS=hdfs://namenode:8020
CORE_CONF_hadoop_http_staticuser_user=root
CORE_CONF_hadoop_proxyuser_hue_hosts=*
CORE_CONF_hadoop_proxyuser_hue_groups=*

YARN_CONF_yarn_log___aggregation___enable=true
YARN_CONF_yarn_resourcemanager_recovery_enabled=true
YARN_CONF_yarn_resourcemanager_store_class=org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore
YARN_CONF_yarn_resourcemanager_fs_state___store_uri=/rmstate
YARN_CONF_yarn_nodemanager_remote___app___log___dir=/app-logs
YARN_CONF_yarn_log_server_url=http://historyserver:8188/applicationhistory/logs/
YARN_CONF_yarn_timeline___service_enabled=true
YARN_CONF_yarn_timeline___service_generic___application___history_enabled=true
YARN_CONF_yarn_resourcemanager_system___metrics___publisher_enabled=true
YARN_CONF_yarn_resourcemanager_hostname=resourcemanager
YARN_CONF_yarn_timeline___service_hostname=historyserver
YARN_CONF_yarn_resourcemanager_address=resourcemanager:8032
YARN_CONF_yarn_resourcemanager_scheduler_address=resourcemanager:8030
YARN_CONF_yarn_resourcemanager_resource___tracker_address=resourcemanager:8031
YARN_CONF_yarn_nodemanager_vmem___check___enabled=false

batch_1.json

{"volume": 483951, "symbol": "MSFT", "ts": "2018-08-31 09:30:00", "month": "08", "high": 111.74, "low": 111.55, "key": "MSFT_2018-08-31 09", "year": 2018, "date": "2018/08/31", "close": 111.72, "open": 111.55, "day": "31"}
{"volume": 1533226, "symbol": "AAPL", "ts": "2018-08-31 09:30:00", "month": "08", "high": 227.3101, "low": 226.23, "key": "AAPL_2018-08-31 09", "year": 2018, "date": "2018/08/31", "close": 227.3101, "open": 226.53, "day": "31"}
{"volume": 36179, "symbol": "GOOG", "ts": "2018-08-31 09:30:00", "month": "08", "high": 1236.0, "low": 1234.36, "key": "GOOG_2018-08-31 09", "year": 2018, "date": "2018/08/31", "close": 1234.54, "open": 1236.0, "day": "31"}
{"volume": 456506, "symbol": "FB", "ts": "2018-08-31 09:30:00", "month": "08", "high": 177.5, "low": 176.465, "key": "FB_2018-08-31 09", "year": 2018, "date": "2018/08/31", "close": 176.83, "open": 177.29, "day": "31"}
{"volume": 142747, "symbol": "NFLX", "ts": "2018-08-31 09:30:00", "month": "08", "high": 372.0, "low": 370.49, "key": "NFLX_2018-08-31 09", "year": 2018, "date": "2018/08/31", "close": 371.9, "open": 370.49, "day": "31"}
{"volume": 126884, "symbol": "TSLA", "ts": "2018-08-31 09:30:00", "month": "08", "high": 301.81, "low": 300.11, "key": "TSLA_2018-08-31 09", "year": 2018, "date": "2018/08/31", "close": 300.61, "open": 301.81, "day": "31"}
{"volume": 1201915, "symbol": "F", "ts": "2018-08-31 09:30:00", "month": "08", "high": 9.63, "low": 9.6, "key": "F_2018-08-31 09", "year": 2018, "date": "2018/08/31", "close": 9.61, "open": 9.63, "day": "31"}
{"volume": 176474, "symbol": "AMZN", "ts": "2018-08-31 09:30:00", "month": "08", "high": 2010.8101, "low": 2007.0, "key": "AMZN_2018-08-31 09", "year": 2018, "date": "2018/08/31", "close": 2010.5, "open": 2009.8199, "day": "31"}
{"volume": 142523, "symbol": "NVDA", "ts": "2018-08-31 09:30:00", "month": "08", "high": 277.1899, "low": 276.64, "key": "NVDA_2018-08-31 09", "year": 2018, "date": "2018/08/31", "close": 277.1899, "open": 276.875, "day": "31"}
{"volume": 351118, "symbol": "INTC", "ts": "2018-08-31 09:30:00", "month": "08", "high": 48.06, "low": 47.96, "key": "INTC_2018-08-31 09", "year": 2018, "date": "2018/08/31", "close": 48.03, "open": 47.961, "day": "31"}

The batch_1.json data is trimmed down here; the full data set can be downloaded from: https://github.com/apache/hudi/blob/master/docker/demo/data/batch_1.json

Execute cat batch_1.json | kcat -b kafkabroker -t stock_ticks -P to import the data into Kafka.
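
To sanity-check that the records actually landed in the topic, a few of them can be consumed back (same assumption as the import command: kcat is installed locally and kafkabroker resolves to the Kafka container, e.g. via an /etc/hosts entry):

# -C consume, -c 5 stop after five messages, -e exit at the end of the topic
kcat -b kafkabroker -t stock_ticks -C -c 5 -e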

start_demo.sh

#!/bin/bash

SCRIPT_PATH=$(cd `dirname $0`; pwd)
WS_ROOT=`dirname $SCRIPT_PATH`
COMPOSE_FILE_NAME="docker-compose_mac_aarch64.yml"

# restart cluster
HUDI_WS=${WS_ROOT} docker-compose -f ${SCRIPT_PATH}/${COMPOSE_FILE_NAME} down
sleep 5
HUDI_WS=${WS_ROOT} docker-compose -f ${SCRIPT_PATH}/${COMPOSE_FILE_NAME} up -d
sleep 15

docker exec -it adhoc-1 /bin/bash /var/hoodie/ws/docker/demo/setup_demo_container.sh
docker exec -it adhoc-2 /bin/bash /var/hoodie/ws/docker/demo/setup_demo_container.sh

stop_demo.sh

#!/bin/bash

SCRIPT_PATH=$(cd `dirname $0`; pwd)
HUDI_DEMO_ENV=$1
# set up root directory
WS_ROOT=`dirname $SCRIPT_PATH`

COMPOSE_FILE_NAME="docker-compose_mac_aarch64.yml"

# shut down cluster
HUDI_WS=${WS_ROOT} docker-compose -f ${SCRIPT_PATH}/${COMPOSE_FILE_NAME} down

# remove host mount directories
rm -rf /tmp/hadoop_data
rm -rf /tmp/hadoop_name

All images involved are:

docker pull registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-hive_2.3.3-sparkworker_2.4.4:arm64
docker pull registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-hive_2.3.3-sparkmaster_2.4.4:arm64
docker pull registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-datanode:arm64
docker pull registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-namenode:arm64
docker pull registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-history:arm64
docker pull registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-hive_2.3.3:arm64
docker pull registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-hive_2.3.3-sparkadhoc_2.4.4:arm64
docker pull registry.cn-hangzhou.aliyuncs.com/jensenchen/kafka:2.13-2.8.1
docker pull registry.cn-hangzhou.aliyuncs.com/jensenchen/zookeeper:3.4.12
docker pull registry.cn-hangzhou.aliyuncs.com/jensenchen/doris:1.2.2-fe-arm
docker pull registry.cn-hangzhou.aliyuncs.com/jensenchen/doris:1.2.2-be-arm

For convenience, once the images have been pulled you can use the scripts above to start the docker-compose cluster.
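
A minimal usage sketch (assuming both scripts sit next to the compose file; note they reference COMPOSE_FILE_NAME="docker-compose_mac_aarch64.yml", so either save the compose file under that name or adjust the variable):

# bring the cluster up and run the Hudi demo container setup
sh start_demo.sh

# tear everything down again when finished
sh stop_demo.sh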

Test verification

Import data using DeltaStreamer

Hudi provides the DeltaStreamer tool, which can connect to a variety of data sources (including Kafka), pull changes, and apply them to Hudi tables using upsert/insert primitives. Here it is used to consume the JSON data from Kafka and ingest it into a COW table and a MOR table. The tool automatically initializes the tables in the file system if they do not yet exist.

docker exec -it adhoc-2 /bin/bash

# Run the following spark-submit command to execute the delta-streamer and ingest to stock_ticks_cow table in HDFS
spark-submit \
  --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer $HUDI_UTILITIES_BUNDLE \
  --table-type COPY_ON_WRITE \
  --source-class org.apache.hudi.utilities.sources.JsonKafkaSource \
  --source-ordering-field ts  \
  --target-base-path /user/hive/warehouse/stock_ticks_cow \
  --target-table stock_ticks_cow --props /var/demo/config/kafka-source.properties \
  --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider

# Run the following spark-submit command to execute the delta-streamer and ingest to stock_ticks_mor table in HDFS
spark-submit \
  --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer $HUDI_UTILITIES_BUNDLE \
  --table-type MERGE_ON_READ \
  --source-class org.apache.hudi.utilities.sources.JsonKafkaSource \
  --source-ordering-field ts \
  --target-base-path /user/hive/warehouse/stock_ticks_mor \
  --target-table stock_ticks_mor \
  --props /var/demo/config/kafka-source.properties \
  --schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \
  --disable-compaction

# As part of the setup (see setup_demo.sh), the configs needed for DeltaStreamer are uploaded to HDFS. The configs
# contain mostly Kafka connectivity settings, the avro schema to be used for ingestion, and the key and partitioning fields.

exit

After the import is complete, you can open the following links to view the data:

http://namenode:50070/explorer.html#/user/hive/warehouse/stock_ticks_cow

http://namenode:50070/explorer.html#/user/hive/warehouse/stock_ticks_mor
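
If namenode does not resolve from the host browser, the same paths can be inspected from inside a container instead (a quick alternative, assuming the hadoop client is on the PATH in the adhoc image, as in the upstream Hudi demo):

docker exec -it adhoc-1 hdfs dfs -ls -R /user/hive/warehouse/stock_ticks_cow
docker exec -it adhoc-1 hdfs dfs -ls -R /user/hive/warehouse/stock_ticks_mor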

Synchronize data in Hudi to Hive

Synchronize tables on HDFS to Hive, including creating Hive tables, adding partitions, etc.

docker exec -it adhoc-2 /bin/bash

# This command takes in HiveServer URL and COW Hudi table location in HDFS and sync the HDFS state to Hive
/var/hoodie/ws/hudi-sync/hudi-hive-sync/run_sync_tool.sh \
  --jdbc-url jdbc:hive2://hiveserver:10000 \
  --user hive \
  --pass hive \
  --partitioned-by dt \
  --base-path /user/hive/warehouse/stock_ticks_cow \
  --database default \
  --table stock_ticks_cow \
  --partition-value-extractor org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor
.....
2020-01-25 19:51:28,953 INFO  [main] hive.HiveSyncTool (HiveSyncTool.java:syncHoodieTable(129)) - Sync complete for stock_ticks_cow
.....

# Now run hive-sync for the second data-set in HDFS using Merge-On-Read (MOR table type)
/var/hoodie/ws/hudi-sync/hudi-hive-sync/run_sync_tool.sh \
  --jdbc-url jdbc:hive2://hiveserver:10000 \
  --user hive \
  --pass hive \
  --partitioned-by dt \
  --base-path /user/hive/warehouse/stock_ticks_mor \
  --database default \
  --table stock_ticks_mor \
  --partition-value-extractor org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor
...
2020-01-25 19:51:51,066 INFO  [main] hive.HiveSyncTool (HiveSyncTool.java:syncHoodieTable(129)) - Sync complete for stock_ticks_mor_ro
...
2020-01-25 19:51:51,569 INFO  [main] hive.HiveSyncTool (HiveSyncTool.java:syncHoodieTable(129)) - Sync complete for stock_ticks_mor_rt
....

exit

Connect to Hive for query

docker exec -it adhoc-2 /bin/bash
beeline -u jdbc:hive2://hiveserver:10000 \
  --hiveconf hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat \
  --hiveconf hive.stats.autogather=false

# List Tables
0: jdbc:hive2://hiveserver:10000> show tables;
+---------------------+--+
|      tab_name       |
+---------------------+--+
| stock_ticks_cow     |
| stock_ticks_mor_ro  |
| stock_ticks_mor_rt  |
+---------------------+--+
3 rows selected (1.199 seconds)
0: jdbc:hive2://hiveserver:10000>


# Look at partitions that were added
0: jdbc:hive2://hiveserver:10000> show partitions stock_ticks_mor_rt;
+----------------+--+
|   partition    |
+----------------+--+
| dt=2018-08-31  |
+----------------+--+
1 row selected (0.24 seconds)


# COPY-ON-WRITE Queries:
=========================


0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG';
+---------+----------------------+--+
| symbol  |         _c1          |
+---------+----------------------+--+
| GOOG    | 2018-08-31 10:29:00  |
+---------+----------------------+--+

Now, run a projection query:

0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close  from stock_ticks_cow where  symbol = 'GOOG';
+----------------------+---------+----------------------+---------+------------+-----------+--+
| _hoodie_commit_time  | symbol  |          ts          | volume  |    open    |   close   |
+----------------------+---------+----------------------+---------+------------+-----------+--+
| 20230621074927233    | GOOG    | 2018-08-31 09:59:00  | 6330    | 1230.5     | 1230.02   |
| 20230621074927233    | GOOG    | 2018-08-31 10:29:00  | 3391    | 1230.1899  | 1230.085  |
+----------------------+---------+----------------------+---------+------------+-----------+--+


# Merge-On-Read Queries:
==========================

Let's run similar queries against the M-O-R table, looking at both the
ReadOptimized and Snapshot (realtime data) queries supported by the M-O-R table

# Run ReadOptimized Query. Notice that the latest timestamp is 10:29
0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_mor_ro group by symbol HAVING symbol = 'GOOG';
WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.
+---------+----------------------+--+
| symbol  |         _c1          |
+---------+----------------------+--+
| GOOG    | 2018-08-31 10:29:00  |
+---------+----------------------+--+
1 row selected (6.326 seconds)


# Run Snapshot Query. Notice that the latest timestamp is again 10:29

0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG';
WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.
+---------+----------------------+--+
| symbol  |         _c1          |
+---------+----------------------+--+
| GOOG    | 2018-08-31 10:29:00  |
+---------+----------------------+--+
1 row selected (1.606 seconds)


# Run Read Optimized and Snapshot project queries

0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close  from stock_ticks_mor_ro where  symbol = 'GOOG';
+----------------------+---------+----------------------+---------+------------+-----------+--+
| _hoodie_commit_time  | symbol  |          ts          | volume  |    open    |   close   |
+----------------------+---------+----------------------+---------+------------+-----------+--+
| 20180924222155       | GOOG    | 2018-08-31 09:59:00  | 6330    | 1230.5     | 1230.02   |
| 20180924222155       | GOOG    | 2018-08-31 10:29:00  | 3391    | 1230.1899  | 1230.085  |
+----------------------+---------+----------------------+---------+------------+-----------+--+

0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close  from stock_ticks_mor_rt where  symbol = 'GOOG';
+----------------------+---------+----------------------+---------+------------+-----------+--+
| _hoodie_commit_time  | symbol  |          ts          | volume  |    open    |   close   |
+----------------------+---------+----------------------+---------+------------+-----------+--+
| 20180924222155       | GOOG    | 2018-08-31 09:59:00  | 6330    | 1230.5     | 1230.02   |
| 20180924222155       | GOOG    | 2018-08-31 10:29:00  | 3391    | 1230.1899  | 1230.085  |
+----------------------+---------+----------------------+---------+------------+-----------+--+

exit

Connect to the FE

docker exec -ti doris-fe /bin/bash
# after entering the container, log in to the FE
mysql -h127.0.0.1 -uroot -P9030

Create the Hudi catalog in Doris

CREATE CATALOG hudi PROPERTIES (
    'type'='hms',
    'hive.metastore.uris' = 'thrift://172.18.0.104:9083',
    'hive.version' = '2.3.3'
);

CREATE CATALOG hudi2 PROPERTIES (
    'type'='hms',
    'hive.metastore.uris' = 'thrift://hivemetastore:9083',
    'hive.version' = '2.3.3'
);

CREATE CATALOG hudi3 PROPERTIES (
    'type'='hms',
    'hive.metastore.uris' = 'thrift://172.18.0.104:9083',
    'hadoop.username' = 'hive',
    'dfs.nameservices'='namenode',
    'dfs.ha.namenodes.namenode'='nn1',
    'dfs.namenode.rpc-address.namenode.nn1'='172.18.0.100:8020',
    'dfs.client.failover.proxy.provider.namenode'='org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider'
);

Because the hivemetastore container above has the IP 172.18.0.104, that address is specified directly here.
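
To quickly verify that the catalog can reach the metastore, the databases it exposes can be listed directly (a sketch; run it from the mysql session in the doris-fe container, or non-interactively as below):

mysql -h127.0.0.1 -P9030 -uroot -e "SHOW DATABASES FROM hudi;"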

View catalogs

MySQL [(none)]> show catalogs;
+-----------+-------------+----------+-----------+
| CatalogId | CatalogName | Type     | IsCurrent |
+-----------+-------------+----------+-----------+
|     10004 | hudi        | hms      |           |
|         0 | internal    | internal | yes       |
+-----------+-------------+----------+-----------+

Switch catalog

MySQL [(none)]> switch hudi;

Specify the database

MySQL [(none)]> use default;
Reading table information for completion of table and column names
You can turn off this feature to get a quicker startup with -A

Database changed

Execute a query

MySQL [default]> select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG';
+--------+---------------------+
| symbol | max(`ts`)           |
+--------+---------------------+
| GOOG   | 2018-08-31 10:29:00 |
+--------+---------------------+
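
The table can also be queried without the switch/use steps by fully qualifying it with the catalog and database names (a hedged sketch, using the mysql client non-interactively from the doris-fe container):

mysql -h127.0.0.1 -P9030 -uroot -e "select symbol, max(ts) from hudi.\`default\`.stock_ticks_cow group by symbol having symbol = 'GOOG';"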
