The local environment is MacOS M1, and the system version is 12.5.1.
Related documents
docker-compose.yml
version: "3.3"

services:
  namenode:
    image: registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-namenode:arm64
    platform: linux/arm64
    hostname: namenode
    container_name: namenode
    environment:
      - CLUSTER_NAME=hudi_hadoop284_hive232_spark244
    ports:
      - "50070:50070"
      - "8020:8020"
      # JVM debugging port (will be mapped to a random port on host)
      - "5005"
    env_file:
      - ./hadoop.env
    healthcheck:
      test: ["CMD", "curl", "-f", "http://namenode:50070"]
      interval: 30s
      timeout: 10s
      retries: 3
    networks:
      mynet:
        ipv4_address: 172.18.0.100

  datanode1:
    image: registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-datanode:arm64
    platform: linux/arm64
    container_name: datanode1
    hostname: datanode1
    environment:
      - CLUSTER_NAME=hudi_hadoop284_hive232_spark244
    env_file:
      - ./hadoop.env
    ports:
      - "50075:50075"
      - "50010:50010"
      # JVM debugging port (will be mapped to a random port on host)
      - "5005"
    links:
      - "namenode"
      - "historyserver"
    healthcheck:
      test: ["CMD", "curl", "-f", "http://datanode1:50075"]
      interval: 30s
      timeout: 10s
      retries: 3
    depends_on:
      - namenode
    networks:
      mynet:
        ipv4_address: 172.18.0.101

  historyserver:
    image: registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-history:arm64
    # platform added for consistency with the other arm64-tagged services
    platform: linux/arm64
    hostname: historyserver
    container_name: historyserver
    environment:
      - CLUSTER_NAME=hudi_hadoop284_hive232_spark244
    depends_on:
      - "namenode"
    links:
      - "namenode"
    ports:
      - "58188:8188"
    healthcheck:
      test: ["CMD", "curl", "-f", "http://historyserver:8188"]
      interval: 30s
      timeout: 10s
      retries: 3
    env_file:
      - ./hadoop.env
    volumes:
      - historyserver:/hadoop/yarn/timeline
    networks:
      mynet:
        ipv4_address: 172.18.0.102

  hive-metastore-postgresql:
    image: menorah84/hive-metastore-postgresql:2.3.0
    platform: linux/arm64
    environment:
      - POSTGRES_HOST_AUTH_METHOD=trust
    volumes:
      - hive-metastore-postgresql:/var/lib/postgresql
    hostname: hive-metastore-postgresql
    container_name: hive-metastore-postgresql
    networks:
      mynet:
        ipv4_address: 172.18.0.103

  hivemetastore:
    image: registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-hive_2.3.3:arm64
    platform: linux/arm64
    hostname: hivemetastore
    container_name: hivemetastore
    links:
      - "hive-metastore-postgresql"
      - "namenode"
    env_file:
      - ./hadoop.env
    command: /opt/hive/bin/hive --service metastore
    environment:
      SERVICE_PRECONDITION: "namenode:50070 hive-metastore-postgresql:5432"
    ports:
      - "9083:9083"
      # JVM debugging port (will be mapped to a random port on host)
      - "5005"
    healthcheck:
      test: ["CMD", "nc", "-z", "hivemetastore", "9083"]
      interval: 30s
      timeout: 10s
      retries: 3
    depends_on:
      - "hive-metastore-postgresql"
      - "namenode"
    networks:
      mynet:
        ipv4_address: 172.18.0.104

  hiveserver:
    image: registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-hive_2.3.3:arm64
    platform: linux/arm64
    hostname: hiveserver
    container_name: hiveserver
    env_file:
      - ./hadoop.env
    environment:
      SERVICE_PRECONDITION: "hivemetastore:9083"
    ports:
      - "10000:10000"
      # JVM debugging port (will be mapped to a random port on host)
      - "5005"
    depends_on:
      - "hivemetastore"
    links:
      - "hivemetastore"
      - "hive-metastore-postgresql"
      - "namenode"
    volumes:
      # HUDI_WS is exported by start_demo.sh and points at the Hudi workspace root
      - ${HUDI_WS}:/var/hoodie/ws
    networks:
      mynet:
        ipv4_address: 172.18.0.105

  sparkmaster:
    image: registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-hive_2.3.3-sparkmaster_2.4.4:arm64
    platform: linux/arm64
    hostname: sparkmaster
    container_name: sparkmaster
    env_file:
      - ./hadoop.env
    ports:
      - "8080:8080"
      - "7077:7077"
      # JVM debugging port (will be mapped to a random port on host)
      - "5005"
    environment:
      - INIT_DAEMON_STEP=setup_spark
    links:
      - "hivemetastore"
      - "hiveserver"
      - "hive-metastore-postgresql"
      - "namenode"
    networks:
      mynet:
        ipv4_address: 172.18.0.106

  spark-worker-1:
    image: registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-hive_2.3.3-sparkworker_2.4.4:arm64
    platform: linux/arm64
    hostname: spark-worker-1
    container_name: spark-worker-1
    env_file:
      - ./hadoop.env
    depends_on:
      - sparkmaster
    ports:
      - "8081:8081"
      # JVM debugging port (will be mapped to a random port on host)
      - "5005"
    environment:
      - "SPARK_MASTER=spark://sparkmaster:7077"
    links:
      - "hivemetastore"
      - "hiveserver"
      - "hive-metastore-postgresql"
      - "namenode"
    networks:
      mynet:
        ipv4_address: 172.18.0.107

  zookeeper:
    image: registry.cn-hangzhou.aliyuncs.com/jensenchen/zookeeper:3.4.12
    platform: linux/arm64
    hostname: zookeeper
    container_name: zookeeper
    ports:
      - "2181:2181"
    environment:
      - ALLOW_ANONYMOUS_LOGIN=yes
    networks:
      mynet:
        ipv4_address: 172.18.0.108

  kafka:
    image: registry.cn-hangzhou.aliyuncs.com/jensenchen/kafka:2.13-2.8.1
    platform: linux/arm64
    hostname: kafkabroker
    container_name: kafkabroker
    ports:
      - "9092:9092"
    environment:
      - KAFKA_ZOOKEEPER_CONNECT=zookeeper:2181
      - ALLOW_PLAINTEXT_LISTENER=yes
      - KAFKA_ADVERTISED_HOST_NAME=kafkabroker
    networks:
      mynet:
        ipv4_address: 172.18.0.109

  adhoc-1:
    image: registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-hive_2.3.3-sparkadhoc_2.4.4:arm64
    platform: linux/arm64
    hostname: adhoc-1
    container_name: adhoc-1
    env_file:
      - ./hadoop.env
    depends_on:
      - sparkmaster
    ports:
      - "4040:4040"
      # JVM debugging port (mapped to 5006 on the host)
      - "5006:5005"
    environment:
      - "SPARK_MASTER=spark://sparkmaster:7077"
    links:
      - "hivemetastore"
      - "hiveserver"
      - "hive-metastore-postgresql"
      - "namenode"
    volumes:
      - ${HUDI_WS}:/var/hoodie/ws
    networks:
      mynet:
        ipv4_address: 172.18.0.110

  adhoc-2:
    image: registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-hive_2.3.3-sparkadhoc_2.4.4:linux-arm64-0.10.1
    platform: linux/arm64
    hostname: adhoc-2
    container_name: adhoc-2
    env_file:
      - ./hadoop.env
    ports:
      # JVM debugging port (mapped to 5005 on the host)
      - "5005:5005"
    depends_on:
      - sparkmaster
    environment:
      - "SPARK_MASTER=spark://sparkmaster:7077"
    links:
      - "hivemetastore"
      - "hiveserver"
      - "hive-metastore-postgresql"
      - "namenode"
    volumes:
      - ${HUDI_WS}:/var/hoodie/ws
    networks:
      mynet:
        ipv4_address: 172.18.0.111

  doris-fe:
    image: registry.cn-hangzhou.aliyuncs.com/jensenchen/doris:1.2.2-fe-arm
    # platform added for consistency with the other arm64-tagged services
    platform: linux/arm64
    hostname: doris-fe
    container_name: doris-fe
    environment:
      - FE_SERVERS=fe1:172.18.0.112:9010
      - FE_ID=1
    networks:
      mynet:
        ipv4_address: 172.18.0.112

  doris-be:
    image: registry.cn-hangzhou.aliyuncs.com/jensenchen/doris:1.2.2-be-arm
    # platform added for consistency with the other arm64-tagged services
    platform: linux/arm64
    # fixed: hostname was "doris-fe", which collided with the FE container's hostname
    hostname: doris-be
    container_name: doris-be
    environment:
      - FE_SERVERS=fe1:172.18.0.112:9010
      - BE_ADDR=172.18.0.113:9050
    depends_on:
      - doris-fe
    networks:
      mynet:
        ipv4_address: 172.18.0.113

volumes:
  namenode:
  historyserver:
  hive-metastore-postgresql:

networks:
  # mynet must be created in advance (external) — see the note below the file
  mynet:
    external: true
This docker-compose file follows the layout used in the Hudi project. In upstream Hudi, however, both the hiveserver and adhoc containers must mount the host's Hudi directory, and Hudi has to be compiled in advance. Here, the hiveserver and adhoc images have instead been rebuilt with the required packages and configuration files baked into the image, so they no longer depend on mounting external directories.
The networks section must use a custom network created in advance, because the default network name generated by docker-compose is {name of the folder containing docker-compose.yml}_default, and "_" is an illegal character in hostnames; without a custom network, an error like the following occurs on subsequent accesses:
ERROR 1105 (HY000): HMSClientException, msg: org.apache.hadoop.hive.metastore.api.MetaException: Got exception: java.net.URISyntaxException Illegal character in hostname at index 30: thrift://hivemetastore.compose_default:9083
Mynet can be created in advance with the command:
docker network create mynet --driver bridge --subnet 172.18.0.0/16 --gateway 172.18.0.1
Exception
max_map_count
Because containers are implemented differently on macOS internally, it may not be possible to modify this value directly on the host during deployment. You need to create the following container first:
docker run -it --privileged --pid=host --name=change_count debian nsenter -t 1 -m -u -n -i sh
After the container is created successfully, execute the following command inside it:
sysctl -w vm.max_map_count=2000000
Then run exit to leave the container, and proceed to create the Doris Docker cluster.
hadoop.env
HIVE_SITE_CONF_javax_jdo_option_ConnectionURL=jdbc:postgresql://hive-metastore-postgresql/metastore
HIVE_SITE_CONF_javax_jdo_option_ConnectionDriverName=org.postgresql.Driver
HIVE_SITE_CONF_javax_jdo_option_ConnectionUserName=hive
HIVE_SITE_CONF_javax_jdo_option_ConnectionPassword=hive
HIVE_SITE_CONF_datanucleus_autoCreateSchema=false
HIVE_SITE_CONF_hive_metastore_uris=thrift://hivemetastore:9083
HDFS_CONF_dfs_namenode_datanode_registration_ip___hostname___check=false
HDFS_CONF_dfs_webhdfs_enabled=true
HDFS_CONF_dfs_permissions_enabled=false
#HDFS_CONF_dfs_client_use_datanode_hostname=true
#HDFS_CONF_dfs_namenode_use_datanode_hostname=true
HDFS_CONF_dfs_replication=1
CORE_CONF_fs_defaultFS=hdfs://namenode:8020
CORE_CONF_hadoop_http_staticuser_user=root
CORE_CONF_hadoop_proxyuser_hue_hosts=*
CORE_CONF_hadoop_proxyuser_hue_groups=*
YARN_CONF_yarn_log___aggregation___enable=true
YARN_CONF_yarn_resourcemanager_recovery_enabled=true
YARN_CONF_yarn_resourcemanager_store_class=org.apache.hadoop.yarn.server.resourcemanager.recovery.FileSystemRMStateStore
YARN_CONF_yarn_resourcemanager_fs_state___store_uri=/rmstate
YARN_CONF_yarn_nodemanager_remote___app___log___dir=/app-logs
YARN_CONF_yarn_log_server_url=http://historyserver:8188/applicationhistory/logs/
YARN_CONF_yarn_timeline___service_enabled=true
YARN_CONF_yarn_timeline___service_generic___application___history_enabled=true
YARN_CONF_yarn_resourcemanager_system___metrics___publisher_enabled=true
YARN_CONF_yarn_resourcemanager_hostname=resourcemanager
YARN_CONF_yarn_timeline___service_hostname=historyserver
YARN_CONF_yarn_resourcemanager_address=resourcemanager:8032
YARN_CONF_yarn_resourcemanager_scheduler_address=resourcemanager:8030
YARN_CONF_yarn_resourcemanager_resource___tracker_address=resourcemanager:8031
YARN_CONF_yarn_nodemanager_vmem___check___enabled=false
batch_1.json
{"volume": 483951, "symbol": "MSFT", "ts": "2018-08-31 09:30:00", "month": "08", "high": 111.74, "low": 111.55, "key": "MSFT_2018-08-31 09", "year": 2018, "date": "2018/08/31", "close": 111.72, "open": 111.55, "day": "31"}
{"volume": 1533226, "symbol": "AAPL", "ts": "2018-08-31 09:30:00", "month": "08", "high": 227.3101, "low": 226.23, "key": "AAPL_2018-08-31 09", "year": 2018, "date": "2018/08/31", "close": 227.3101, "open": 226.53, "day": "31"}
{"volume": 36179, "symbol": "GOOG", "ts": "2018-08-31 09:30:00", "month": "08", "high": 1236.0, "low": 1234.36, "key": "GOOG_2018-08-31 09", "year": 2018, "date": "2018/08/31", "close": 1234.54, "open": 1236.0, "day": "31"}
{"volume": 456506, "symbol": "FB", "ts": "2018-08-31 09:30:00", "month": "08", "high": 177.5, "low": 176.465, "key": "FB_2018-08-31 09", "year": 2018, "date": "2018/08/31", "close": 176.83, "open": 177.29, "day": "31"}
{"volume": 142747, "symbol": "NFLX", "ts": "2018-08-31 09:30:00", "month": "08", "high": 372.0, "low": 370.49, "key": "NFLX_2018-08-31 09", "year": 2018, "date": "2018/08/31", "close": 371.9, "open": 370.49, "day": "31"}
{"volume": 126884, "symbol": "TSLA", "ts": "2018-08-31 09:30:00", "month": "08", "high": 301.81, "low": 300.11, "key": "TSLA_2018-08-31 09", "year": 2018, "date": "2018/08/31", "close": 300.61, "open": 301.81, "day": "31"}
{"volume": 1201915, "symbol": "F", "ts": "2018-08-31 09:30:00", "month": "08", "high": 9.63, "low": 9.6, "key": "F_2018-08-31 09", "year": 2018, "date": "2018/08/31", "close": 9.61, "open": 9.63, "day": "31"}
{"volume": 176474, "symbol": "AMZN", "ts": "2018-08-31 09:30:00", "month": "08", "high": 2010.8101, "low": 2007.0, "key": "AMZN_2018-08-31 09", "year": 2018, "date": "2018/08/31", "close": 2010.5, "open": 2009.8199, "day": "31"}
{"volume": 142523, "symbol": "NVDA", "ts": "2018-08-31 09:30:00", "month": "08", "high": 277.1899, "low": 276.64, "key": "NVDA_2018-08-31 09", "year": 2018, "date": "2018/08/31", "close": 277.1899, "open": 276.875, "day": "31"}
{"volume": 351118, "symbol": "INTC", "ts": "2018-08-31 09:30:00", "month": "08", "high": 48.06, "low": 47.96, "key": "INTC_2018-08-31 09", "year": 2018, "date": "2018/08/31", "close": 48.03, "open": 47.961, "day": "31"}
The batch_1.json data is reduced here, if you want to get the full amount, you can download it from here: https://github.com/apache/hudi/blob/master/docker/demo/data/batch_1.json
Execute cat batch_1.json | kcat -b kafkabroker -t stock_ticks -P
to import data into Kafka.
start_demo.sh
#!/bin/bash
# Restart the demo docker-compose cluster, then initialize the adhoc containers.
SCRIPT_PATH="$(cd "$(dirname "$0")" && pwd)"
WS_ROOT="$(dirname "${SCRIPT_PATH}")"
COMPOSE_FILE_NAME="docker-compose_mac_aarch64.yml"
# Tear down any running cluster, then bring it back up.
HUDI_WS="${WS_ROOT}" docker-compose -f "${SCRIPT_PATH}/${COMPOSE_FILE_NAME}" down
sleep 5
HUDI_WS="${WS_ROOT}" docker-compose -f "${SCRIPT_PATH}/${COMPOSE_FILE_NAME}" up -d
sleep 15
# Run the demo setup script inside each adhoc container.
docker exec -it adhoc-1 /bin/bash /var/hoodie/ws/docker/demo/setup_demo_container.sh
docker exec -it adhoc-2 /bin/bash /var/hoodie/ws/docker/demo/setup_demo_container.sh
stop_demo.sh
#!/bin/bash
# Shut down the demo docker-compose cluster and clean up host-side data.
SCRIPT_PATH="$(cd "$(dirname "$0")" && pwd)"
HUDI_DEMO_ENV=$1
# Set up the workspace root directory (parent of this script's directory).
WS_ROOT="$(dirname "${SCRIPT_PATH}")"
COMPOSE_FILE_NAME="docker-compose_mac_aarch64.yml"
# Shut down the cluster.
HUDI_WS="${WS_ROOT}" docker-compose -f "${SCRIPT_PATH}/${COMPOSE_FILE_NAME}" down
# Remove the host mount directories.
rm -rf /tmp/hadoop_data
rm -rf /tmp/hadoop_name
All images involved are:
docker pull registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-hive_2.3.3-sparkworker_2.4.4:arm64
docker pull registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-hive_2.3.3-sparkmaster_2.4.4:arm64
docker pull registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-datanode:arm64
docker pull registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-namenode:arm64
docker pull registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-history:arm64
docker pull registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-hive_2.3.3:arm64
docker pull registry.cn-hangzhou.aliyuncs.com/jensenchen/hudi-hadoop_2.8.4-hive_2.3.3-sparkadhoc_2.4.4:arm64
docker pull registry.cn-hangzhou.aliyuncs.com/jensenchen/kafka:2.13-2.8.1
docker pull registry.cn-hangzhou.aliyuncs.com/jensenchen/zookeeper:3.4.12
docker pull registry.cn-hangzhou.aliyuncs.com/jensenchen/doris:1.2.2-fe-arm
docker pull registry.cn-hangzhou.aliyuncs.com/jensenchen/doris:1.2.2-be-arm
For convenience, you can use the script to start the docker-compose cluster after the download is complete.
test verification
Import data using DeltaStreamer
Hudi provides the DeltaStreamer tool. The tool can connect to various data sources (including Kafka) to pull changes and apply them to Hudi tables using upsert/insert primitives. This tool is used here to download json data from kafka and ingest it into COW and MOR tables. This tool automatically initializes tables in the file system if they do not yet exist.
docker exec -it adhoc-2 /bin/bash
# Run the following spark-submit command to execute the delta-streamer and ingest to stock_ticks_cow table in HDFS
spark-submit \
--class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer $HUDI_UTILITIES_BUNDLE \
--table-type COPY_ON_WRITE \
--source-class org.apache.hudi.utilities.sources.JsonKafkaSource \
--source-ordering-field ts \
--target-base-path /user/hive/warehouse/stock_ticks_cow \
--target-table stock_ticks_cow --props /var/demo/config/kafka-source.properties \
--schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider
# Run the following spark-submit command to execute the delta-streamer and ingest to stock_ticks_mor table in HDFS
spark-submit \
--class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer $HUDI_UTILITIES_BUNDLE \
--table-type MERGE_ON_READ \
--source-class org.apache.hudi.utilities.sources.JsonKafkaSource \
--source-ordering-field ts \
--target-base-path /user/hive/warehouse/stock_ticks_mor \
--target-table stock_ticks_mor \
--props /var/demo/config/kafka-source.properties \
--schemaprovider-class org.apache.hudi.utilities.schema.FilebasedSchemaProvider \
--disable-compaction
# As part of the setup (Look at setup_demo.sh), the configs needed for DeltaStreamer is uploaded to HDFS. The configs
# contain mostly Kafa connectivity settings, the avro-schema to be used for ingesting along with key and partitioning fields.
exit
After the import is complete, you can open the following link to view the data locally:
http://namenode:50070/explorer.html#/user/hive/warehouse/stock_ticks_cow
http://namenode:50070/explorer.html#/user/hive/warehouse/stock_ticks_mor
Synchronize data in Hudi to Hive
Synchronize tables on HDFS to Hive, including creating Hive tables, adding partitions, etc.
docker exec -it adhoc-2 /bin/bash
# This command takes in HiveServer URL and COW Hudi table location in HDFS and sync the HDFS state to Hive
/var/hoodie/ws/hudi-sync/hudi-hive-sync/run_sync_tool.sh \
--jdbc-url jdbc:hive2://hiveserver:10000 \
--user hive \
--pass hive \
--partitioned-by dt \
--base-path /user/hive/warehouse/stock_ticks_cow \
--database default \
--table stock_ticks_cow \
--partition-value-extractor org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor
.....
2020-01-25 19:51:28,953 INFO [main] hive.HiveSyncTool (HiveSyncTool.java:syncHoodieTable(129)) - Sync complete for stock_ticks_cow
.....
# Now run hive-sync for the second data-set in HDFS using Merge-On-Read (MOR table type)
/var/hoodie/ws/hudi-sync/hudi-hive-sync/run_sync_tool.sh \
--jdbc-url jdbc:hive2://hiveserver:10000 \
--user hive \
--pass hive \
--partitioned-by dt \
--base-path /user/hive/warehouse/stock_ticks_mor \
--database default \
--table stock_ticks_mor \
--partition-value-extractor org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor
...
2020-01-25 19:51:51,066 INFO [main] hive.HiveSyncTool (HiveSyncTool.java:syncHoodieTable(129)) - Sync complete for stock_ticks_mor_ro
...
2020-01-25 19:51:51,569 INFO [main] hive.HiveSyncTool (HiveSyncTool.java:syncHoodieTable(129)) - Sync complete for stock_ticks_mor_rt
....
exit
Connect to Hive for query
docker exec -it adhoc-2 /bin/bash
beeline -u jdbc:hive2://hiveserver:10000 \
--hiveconf hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat \
--hiveconf hive.stats.autogather=false
# List Tables
0: jdbc:hive2://hiveserver:10000> show tables;
+---------------------+--+
| tab_name |
+---------------------+--+
| stock_ticks_cow |
| stock_ticks_mor_ro |
| stock_ticks_mor_rt |
+---------------------+--+
3 rows selected (1.199 seconds)
0: jdbc:hive2://hiveserver:10000>
# Look at partitions that were added
0: jdbc:hive2://hiveserver:10000> show partitions stock_ticks_mor_rt;
+----------------+--+
| partition |
+----------------+--+
| dt=2018-08-31 |
+----------------+--+
1 row selected (0.24 seconds)
# COPY-ON-WRITE Queries:
=========================
0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG';
+---------+----------------------+--+
| symbol | _c1 |
+---------+----------------------+--+
| GOOG | 2018-08-31 10:29:00 |
+---------+----------------------+--+
Now, run a projection query:
0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_cow where symbol = 'GOOG';
+----------------------+---------+----------------------+---------+------------+-----------+--+
| _hoodie_commit_time | symbol | ts | volume | open | close |
+----------------------+---------+----------------------+---------+------------+-----------+--+
| 20230621074927233 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |
| 20230621074927233 | GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 |
+----------------------+---------+----------------------+---------+------------+-----------+--+
# Merge-On-Read Queries:
==========================
Lets run similar queries against M-O-R table. Lets look at both
ReadOptimized and Snapshot(realtime data) queries supported by M-O-R table
# Run ReadOptimized Query. Notice that the latest timestamp is 10:29
0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_mor_ro group by symbol HAVING symbol = 'GOOG';
WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.
+---------+----------------------+--+
| symbol | _c1 |
+---------+----------------------+--+
| GOOG | 2018-08-31 10:29:00 |
+---------+----------------------+--+
1 row selected (6.326 seconds)
# Run Snapshot Query. Notice that the latest timestamp is again 10:29
0: jdbc:hive2://hiveserver:10000> select symbol, max(ts) from stock_ticks_mor_rt group by symbol HAVING symbol = 'GOOG';
WARNING: Hive-on-MR is deprecated in Hive 2 and may not be available in the future versions. Consider using a different execution engine (i.e. spark, tez) or using Hive 1.X releases.
+---------+----------------------+--+
| symbol | _c1 |
+---------+----------------------+--+
| GOOG | 2018-08-31 10:29:00 |
+---------+----------------------+--+
1 row selected (1.606 seconds)
# Run Read Optimized and Snapshot project queries
0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_ro where symbol = 'GOOG';
+----------------------+---------+----------------------+---------+------------+-----------+--+
| _hoodie_commit_time | symbol | ts | volume | open | close |
+----------------------+---------+----------------------+---------+------------+-----------+--+
| 20180924222155 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |
| 20180924222155 | GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 |
+----------------------+---------+----------------------+---------+------------+-----------+--+
0: jdbc:hive2://hiveserver:10000> select `_hoodie_commit_time`, symbol, ts, volume, open, close from stock_ticks_mor_rt where symbol = 'GOOG';
+----------------------+---------+----------------------+---------+------------+-----------+--+
| _hoodie_commit_time | symbol | ts | volume | open | close |
+----------------------+---------+----------------------+---------+------------+-----------+--+
| 20180924222155 | GOOG | 2018-08-31 09:59:00 | 6330 | 1230.5 | 1230.02 |
| 20180924222155 | GOOG | 2018-08-31 10:29:00 | 3391 | 1230.1899 | 1230.085 |
+----------------------+---------+----------------------+---------+------------+-----------+--+
exit
connect fe
docker exec -ti doris-fe /bin/bash
# After entering the container, connect to the FE
mysql -h127.0.0.1 -uroot -P9030
Doris creates the Hudi catalog
CREATE CATALOG hudi PROPERTIES (
'type'='hms',
'hive.metastore.uris' = 'thrift://172.18.0.104:9083',
'hive.version' = '2.3.3'
);
CREATE CATALOG hudi2 PROPERTIES (
'type'='hms',
'hive.metastore.uris' = 'thrift://hivemetastore:9083',
'hive.version' = '2.3.3'
);
CREATE CATALOG hudi3 PROPERTIES (
'type'='hms',
'hive.metastore.uris' = 'thrift://172.18.0.104:9083',
'hadoop.username' = 'hive',
'dfs.nameservices'='namenode',
'dfs.ha.namenodes.namenode'='nn1',
'dfs.namenode.rpc-address.namenode.nn1'='172.18.0.100:8020',
'dfs.client.failover.proxy.provider.namenode'='org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider'
);
Because the IP of hivemetastore above is 172.18.0.104, it is specified here.
view catalog
MySQL [(none)]> show catalogs;
+-----------+-------------+----------+-----------+
| CatalogId | CatalogName | Type | IsCurrent |
+-----------+-------------+----------+-----------+
| 10004 | hudi | hms | |
| 0 | internal | internal | yes |
+-----------+-------------+----------+-----------+
switch catalog
MySQL [(none)]> switch hudi;
view catalog
MySQL [(none)]> show catalogs;
+-----------+-------------+----------+-----------+
| CatalogId | CatalogName | Type | IsCurrent |
+-----------+-------------+----------+-----------+
| 10004 | hudi | hms | |
| 0 | internal | internal | yes |
+-----------+-------------+----------+-----------+
view catalog
MySQL [(none)]> show catalogs;
+-----------+-------------+----------+-----------+
| CatalogId | CatalogName | Type | IsCurrent |
+-----------+-------------+----------+-----------+
| 10004 | hudi | hms | |
| 0 | internal | internal | yes |
+-----------+-------------+----------+-----------+
specified database
MySQL [(none)]> use default;
Reading table information for completion of table and column names
You can turn off this feature to get a quicker startup with -A
Database changed
execute query
MySQL [default]> select symbol, max(ts) from stock_ticks_cow group by symbol HAVING symbol = 'GOOG';
+--------+---------------------+
| symbol | max(`ts`) |
+--------+---------------------+
| GOOG | 2018-08-31 10:29:00 |
+--------+---------------------+