前戏
部署单机版Kafka
[hadoop@hadoop004 software]$ wget https://archive.apache.org/dist/kafka/0.8.2.2/kafka_2.11-0.8.2.2.tgz
[hadoop@hadoop004 software]$ tar -zxf kafka_2.11-0.8.2.2.tgz -C ~/app/
[hadoop@hadoop004 kafka_2.11-0.8.2.2]$ bin/zookeeper-server-start.sh config/zookeeper.properties
[hadoop@hadoop004 kafka_2.11-0.8.2.2]$ bin/kafka-server-start.sh config/server.properties
此时,zookeeper那里报了如下错误,其实也不是报错,可以先不管
[2019-05-30 09:54:28,799] INFO Established session 0x16b0670f42f0000 with negotiated timeout 6000 for client /127.0.0.1:52970 (org.apache.zookeeper.server.ZooKeeperServer)
[2019-05-30 09:54:28,830] INFO Got user-level KeeperException when processing sessionid:0x16b0670f42f0000 type:create cxid:0x5 zxid:0x3 txntype:-1 reqpath:n/a Error Path:/brokers Error:KeeperErrorCode = NoNode for /brokers (org.apache.zookeeper.server.PrepRequestProcessor)
[2019-05-30 09:54:28,840] INFO Got user-level KeeperException when processing sessionid:0x16b0670f42f0000 type:create cxid:0xb zxid:0x7 txntype:-1 reqpath:n/a Error Path:/config Error:KeeperErrorCode = NoNode for /config (org.apache.zookeeper.server.PrepRequestProcessor)
[2019-05-30 09:54:28,852] INFO Got user-level KeeperException when processing sessionid:0x16b0670f42f0000 type:create cxid:0x13 zxid:0xc txntype:-1 reqpath:n/a Error Path:/admin Error:KeeperErrorCode = NoNode for /admin (org.apache.zookeeper.server.PrepRequestProcessor)
[2019-05-30 09:54:29,081] INFO Got user-level KeeperException when processing sessionid:0x16b0670f42f0000 type:setData cxid:0x21 zxid:0x12 txntype:-1 reqpath:n/a Error Path:/controller_epoch Error:KeeperErrorCode = NoNode for /controller_epoch (org.apache.zookeeper.server.PrepRequestProcessor)
[2019-05-30 09:54:29,115] INFO Got user-level KeeperException when processing sessionid:0x16b0670f42f0000 type:delete cxid:0x30 zxid:0x14 txntype:-1 reqpath:n/a Error Path:/admin/preferred_replica_election Error:KeeperErrorCode = NoNode for /admin/preferred_replica_election (org.apache.zookeeper.server.PrepRequestProcessor)
[2019-05-30 09:54:29,162] INFO Got user-level KeeperException when processing sessionid:0x16b0670f42f0000 type:create cxid:0x37 zxid:0x15 txntype:-1 reqpath:n/a Error Path:/brokers Error:KeeperErrorCode = NodeExists for /brokers (org.apache.zookeeper.server.PrepRequestProcessor)
[2019-05-30 09:54:29,162] INFO Got user-level KeeperException when processing sessionid:0x16b0670f42f0000 type:create cxid:0x38 zxid:0x16 txntype:-1 reqpath:n/a Error Path:/brokers/ids Error:KeeperErrorCode = NodeExists for /brokers/ids (org.apache.zookeeper.server.PrepRequestProcessor)
部署安装telnet
先检查是否有下面两个安装包
[hadoop@hadoop004 conffile]$ rpm -qa telnet-server
[hadoop@hadoop004 conffile]$ rpm -qa xinetd
[hadoop@hadoop004 conffile]$ yum list |grep telnet
Repodata is over 2 weeks old. Install yum-cron? Or run: yum makecache fast
dcap-tunnel-telnet.x86_64 2.47.12-4.el7 epel
libguac-client-telnet.x86_64 1:0.9.14-1.el7 epel
libtelnet.x86_64 0.21-5.el7 epel
libtelnet-devel.x86_64 0.21-5.el7 epel
libtelnet-utils.x86_64 0.21-5.el7 epel
telnet.x86_64 1:0.17-64.el7 base
telnet-server.x86_64 1:0.17-64.el7 base
[root@hadoop004 ~]# yum install telnet-server.x86_64 -y
[root@hadoop004 ~]# yum install telnet.x86_64 -y
下载Flume
Flume CDH版本的文档地址 http://archive.cloudera.com/cdh5/cdh/5/flume-ng-1.6.0-cdh5.7.0/
[hadoop@hadoop004 software]$ wget http://archive.cloudera.com/cdh5/cdh/5/flume-ng-1.6.0-cdh5.7.0.tar.gz
[hadoop@hadoop004 software]$ tar -zxf flume-ng-1.6.0-cdh5.7.0.tar.gz -C ~/app/
[hadoop@hadoop004 app]$ cd apache-flume-1.6.0-cdh5.7.0-bin/
[hadoop@hadoop004 apache-flume-1.6.0-cdh5.7.0-bin]$ cd conf
[hadoop@hadoop004 conf]$ cp flume-env.sh.template flume-env.sh
[hadoop@hadoop004 conf]$ vim flume-env.sh
export JAVA_HOME=/usr/java/jdk1.8.0_144
调优点
export JAVA_OPTS="-Xms100m -Xmx2000m -Dcom.sun.management.jmxremote"
把-Xms -Xmx的值改为一样的
我们继续,先实践一下官网的例子
[hadoop@hadoop004 conf]$ mkdir conffile
[hadoop@hadoop004 conf]$ cd conffile/
[hadoop@hadoop004 conffile]$ vim example.conf
# example.conf — single-node Flume agent from the official quickstart:
# netcat source (listens on localhost:44444) -> in-memory channel -> logger sink.
# Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1
# Describe/configure the source
# netcat source: every line of text received on the port becomes one event
a1.sources.r1.type = netcat
a1.sources.r1.bind = localhost
a1.sources.r1.port = 44444
# Describe the sink
# logger sink: prints events to the agent's log (use -Dflume.root.logger=INFO,console to see them)
a1.sinks.k1.type = logger
# Use a channel which buffers events in memory
# capacity = max events buffered; transactionCapacity = max events per take/put transaction
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
# Bind the source and sink to the channel
# note: a source takes a list ("channels"), a sink takes exactly one ("channel")
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
# Start the agent named "a1" using example.conf; log events to the console.
./flume-ng agent \
--name a1 \
--conf /home/hadoop/app/apache-flume-1.6.0-cdh5.7.0-bin/conf/conffile \
--conf-file /home/hadoop/app/apache-flume-1.6.0-cdh5.7.0-bin/conf/conffile/example.conf \
-Dflume.root.logger=INFO,console
下面再来一个案例
首先我们看看Agent的选型
source --> channel --> sink --> source --> channel --> sink
exec memory avro avro memory logger
[root@hadoop004 conffile]# vim flume-avro-sink.conf
# flume-avro-sink.conf — upstream agent:
# exec source (tail -F a file) -> memory channel -> avro sink forwarding to localhost:44444.
flume-avro-sink.sources = exec-source
flume-avro-sink.channels = avro-memory-channel
flume-avro-sink.sinks = avro-sink
# exec source runs the command and turns each output line into an event
flume-avro-sink.sources.exec-source.type = exec
flume-avro-sink.sources.exec-source.command = tail -F /home/hadoop/data/avro_access.data
flume-avro-sink.channels.avro-memory-channel.type = memory
# avro sink: must point at the host/port where the downstream avro source listens
flume-avro-sink.sinks.avro-sink.type = avro
flume-avro-sink.sinks.avro-sink.hostname = localhost
flume-avro-sink.sinks.avro-sink.port = 44444
# wire source and sink to the channel
flume-avro-sink.sources.exec-source.channels = avro-memory-channel
flume-avro-sink.sinks.avro-sink.channel = avro-memory-channel
[root@hadoop004 conffile]# vim flume-avro-source.conf
# flume-avro-source.conf — downstream agent:
# avro source (listens on localhost:44444) -> memory channel -> logger sink.
# Must be started BEFORE the upstream agent so its port is open when the avro sink connects.
flume-avro-source.sources = avro-source
flume-avro-source.channels = avro-memory-channel
flume-avro-source.sinks = logger-sink
# avro source: accepts events sent by an upstream avro sink
flume-avro-source.sources.avro-source.type = avro
flume-avro-source.sources.avro-source.bind = localhost
flume-avro-source.sources.avro-source.port = 44444
flume-avro-source.channels.avro-memory-channel.type = memory
# logger sink prints received events to the agent's console log
flume-avro-source.sinks.logger-sink.type = logger
# wire source and sink to the channel
flume-avro-source.sources.avro-source.channels = avro-memory-channel
flume-avro-source.sinks.logger-sink.channel = avro-memory-channel
按顺序启动下面两条命令,一定要按顺序
# Start the downstream (avro source) agent FIRST so port 44444 is listening.
./flume-ng agent \
--name flume-avro-source \
--conf /home/hadoop/app/apache-flume-1.6.0-cdh5.7.0-bin/conf/conffile \
--conf-file /home/hadoop/app/apache-flume-1.6.0-cdh5.7.0-bin/conf/conffile/flume-avro-source.conf \
-Dflume.root.logger=INFO,console
# Then start the upstream (exec source -> avro sink) agent.
./flume-ng agent \
--name flume-avro-sink \
--conf /home/hadoop/app/apache-flume-1.6.0-cdh5.7.0-bin/conf/conffile \
--conf-file /home/hadoop/app/apache-flume-1.6.0-cdh5.7.0-bin/conf/conffile/flume-avro-sink.conf \
-Dflume.root.logger=INFO,console
echo一条数据进去
[root@hadoop004 ~]# echo "hello world flume" >> /home/hadoop/data/avro_access.data
这是由flume-avro-source.conf所启动的Flume进程的截图
这是由flume-avro-sink.conf所启动的Flume进程的截图
成功了!!
Flume还支持这样的架构,这里就不实践了。
下面再来一个案例
首先我们看看Agent的选型
netcat --> channel --> HDFS
--> channel --> Logger
[hadoop@hadoop004 conffile]$ vim channel-replicating-selector.conf
# channel-replicating-selector.conf — one netcat source fanned out to two channels
# via the replicating selector: c1 -> HDFS sink, c2 -> logger sink.
a1.sources = r1
a1.channels = c1 c2
a1.sinks = k1 k2
a1.sources.r1.type = netcat
a1.sources.r1.bind = localhost
a1.sources.r1.port = 44444
# replicating selector: every event is copied to ALL listed channels
a1.sources.r1.selector.type = replicating
a1.sources.r1.channels = c1 c2
a1.channels.c1.type = memory
a1.channels.c2.type = memory
# HDFS sink: path is bucketed per minute via the %y%m%d%H%M escapes
a1.sinks.k1.type = hdfs
a1.sinks.k1.hdfs.path = /flume/g6-events/%y%m%d%H%M
a1.sinks.k1.hdfs.filePrefix = events-
# round the timestamp down to 1-minute buckets
a1.sinks.k1.hdfs.round = true
a1.sinks.k1.hdfs.roundValue = 1
a1.sinks.k1.hdfs.roundUnit = minute
# use the agent's local clock instead of requiring a timestamp header on events
a1.sinks.k1.hdfs.useLocalTimeStamp=true
# write plain text files rather than the default SequenceFile
a1.sinks.k1.hdfs.fileType=DataStream
a1.sinks.k1.hdfs.writeFormat=Text
a1.sinks.k2.type = logger
a1.sinks.k1.channel = c1
a1.sinks.k2.channel = c2
执行命令
# Start the fan-out agent (HDFS must already be running for the hdfs sink).
./flume-ng agent \
--name a1 \
--conf /home/hadoop/app/apache-flume-1.6.0-cdh5.7.0-bin/conf/conffile \
--conf-file /home/hadoop/app/apache-flume-1.6.0-cdh5.7.0-bin/conf/conffile/channel-replicating-selector.conf \
-Dflume.root.logger=INFO,console
再去Hadoop Web UI界面看一看检查一下
成功了!!
我们开始来做个作业吧
首先我们看看Agent的选型
Taildir Source --> File Channel --> AVRO Sink
AVRO Source --> File Channel --> HDFS
--> Kafka
开干吧!
先启动HDFS吧,略
修改Kafka配置文件
[root@hadoop004 config]# vim server.properties
host.name=hadoop004
port=9092
zookeeper.connect=localhost:2181/kafka
但我这里还是使用了0.10.0,之前已在Kafka上创建了一个名为hi的topic,用下面的--list命令确认它存在
[hadoop@hadoop004 kafka_2.11-0.10.0.0]$ bin/kafka-topics.sh --list --zookeeper hadoop004:2181/kafka
hi
启动zookeeper
[hadoop@hadoop004 kafka_2.11-0.10.0.0]$ bin/zookeeper-server-start.sh config/zookeeper.properties
启动Kafka
[hadoop@hadoop004 kafka_2.11-0.10.0.0]$ bin/kafka-server-start.sh config/server.properties
Flume配置文件如下
[hadoop@hadoop004 conffile]$ vim homework_start.conf
# homework_start.conf — upstream agent:
# Taildir source (tails hello.log and hi.log, remembers position) ->
# file channel (durable) -> avro sink forwarding to localhost:44444.
flume-homework.sources = taildir-src
flume-homework.channels = taildir-file-channel
flume-homework.sinks = avro-sink
# TAILDIR source (backported into CDH's flume 1.6): tracks read offsets
# in positionFile so tailing resumes after an agent restart
flume-homework.sources.taildir-src.type = TAILDIR
flume-homework.sources.taildir-src.positionFile = /home/hadoop/data/hello/taildir_position.json
flume-homework.sources.taildir-src.filegroups = f1 f2
flume-homework.sources.taildir-src.filegroups.f1 = /home/hadoop/data/hello/hello.log
# FIX: was "fheaders.f1.headerKey1" — a typo, so the f1 header was never
# attached. The Taildir property is headers.<filegroupName>.<headerKey>.
flume-homework.sources.taildir-src.headers.f1.headerKey1 = hello
flume-homework.sources.taildir-src.filegroups.f2 = /home/hadoop/data/hi/hi.log
flume-homework.sources.taildir-src.headers.f2.headerKey1 = hi
flume-homework.sources.taildir-src.headers.f2.headerKey2 = hi-2
# file channel: events survive an agent crash/restart
flume-homework.channels.taildir-file-channel.type = file
flume-homework.channels.taildir-file-channel.checkpointDir = /home/hadoop/app/apache-flume-1.6.0-cdh5.7.0-bin/tmp_data/src/checkpoint
flume-homework.channels.taildir-file-channel.dataDirs = /home/hadoop/app/apache-flume-1.6.0-cdh5.7.0-bin/tmp_data/src/data
# avro sink: forwards to the downstream agent's avro source
flume-homework.sinks.avro-sink.type = avro
flume-homework.sinks.avro-sink.hostname = localhost
flume-homework.sinks.avro-sink.port = 44444
# wire source and sink to the channel
flume-homework.sources.taildir-src.channels = taildir-file-channel
flume-homework.sinks.avro-sink.channel = taildir-file-channel
[hadoop@hadoop004 conffile]$ vim homework_end.conf
# homework_end.conf — downstream agent:
# avro source -> replicating selector -> two file channels -> HDFS sink + Kafka sink.
flume-homework-end.sources = avro-source
flume-homework-end.channels = hdfs-chan kafka-chan
flume-homework-end.sinks = hdfs-end kafka-end
# Define the source type: avro, listening for the upstream agent's avro sink
flume-homework-end.sources.avro-source.type = avro
flume-homework-end.sources.avro-source.bind = localhost
flume-homework-end.sources.avro-source.port = 44444
# Replicate every event into both channels so HDFS and Kafka each get a full copy
flume-homework-end.sources.avro-source.selector.type = replicating
flume-homework-end.sources.avro-source.channels = hdfs-chan kafka-chan
# Durable file channels (one per sink, so a slow sink doesn't starve the other)
flume-homework-end.channels.hdfs-chan.type = file
flume-homework-end.channels.hdfs-chan.checkpointDir = /home/hadoop/app/apache-flume-1.6.0-cdh5.7.0-bin/tmp_data/hdfs/checkpoint
flume-homework-end.channels.hdfs-chan.dataDirs = /home/hadoop/app/apache-flume-1.6.0-cdh5.7.0-bin/tmp_data/hdfs/data
flume-homework-end.channels.kafka-chan.type = file
flume-homework-end.channels.kafka-chan.checkpointDir = /home/hadoop/app/apache-flume-1.6.0-cdh5.7.0-bin/tmp_data/kafka/checkpoint
flume-homework-end.channels.kafka-chan.dataDirs = /home/hadoop/app/apache-flume-1.6.0-cdh5.7.0-bin/tmp_data/kafka/data
# HDFS sink: text files bucketed into 1-minute directories using local time
flume-homework-end.sinks.hdfs-end.type = hdfs
flume-homework-end.sinks.hdfs-end.hdfs.path = /flume/g6-hdfs/%y%m%d%H%M
flume-homework-end.sinks.hdfs-end.hdfs.filePrefix = events-
flume-homework-end.sinks.hdfs-end.hdfs.round = true
flume-homework-end.sinks.hdfs-end.hdfs.roundValue = 1
flume-homework-end.sinks.hdfs-end.hdfs.roundUnit = minute
flume-homework-end.sinks.hdfs-end.hdfs.useLocalTimeStamp=true
flume-homework-end.sinks.hdfs-end.hdfs.fileType=DataStream
flume-homework-end.sinks.hdfs-end.hdfs.writeFormat=Text
# Kafka sink: publishes each event to the "hi" topic on broker hadoop004:9092
# (flume 1.6-style properties: topic/brokerList/requiredAcks/batchSize)
flume-homework-end.sinks.kafka-end.type = org.apache.flume.sink.kafka.KafkaSink
flume-homework-end.sinks.kafka-end.topic = hi
flume-homework-end.sinks.kafka-end.brokerList = hadoop004:9092
flume-homework-end.sinks.kafka-end.requiredAcks = 1
flume-homework-end.sinks.kafka-end.batchSize = 20
# kept for debugging: swap the kafka sink for a logger sink by toggling this line
#flume-homework-end.sinks.kafka-end.type = logger
# wire each sink to its own channel
flume-homework-end.sinks.hdfs-end.channel = hdfs-chan
flume-homework-end.sinks.kafka-end.channel = kafka-chan
启动Flume agent,一定要按顺序,先把后面的启动起来,再启动前面的
# Start the downstream agent FIRST so its avro source is listening on 44444.
./flume-ng agent \
--name flume-homework-end \
--conf /home/hadoop/app/apache-flume-1.6.0-cdh5.7.0-bin/conf/conffile \
--conf-file /home/hadoop/app/apache-flume-1.6.0-cdh5.7.0-bin/conf/conffile/homework_end.conf \
-Dflume.root.logger=INFO,console
# Then start the upstream (taildir -> avro) agent.
./flume-ng agent \
--name flume-homework \
--conf /home/hadoop/app/apache-flume-1.6.0-cdh5.7.0-bin/conf/conffile \
--conf-file /home/hadoop/app/apache-flume-1.6.0-cdh5.7.0-bin/conf/conffile/homework_start.conf \
-Dflume.root.logger=INFO,console
[root@hadoop004 config]# echo "hello homework" >> /home/hadoop/data/hello/hello.log
启动一个Kafka消费者来验证是否有数据到达Kafka
[hadoop@hadoop004 kafka_2.11-0.10.0.0]$ bin/kafka-console-consumer.sh --zookeeper localhost:2181/kafka --topic hi --from-beginning
再插入另外一条数据到另外一个被监听的文件
[root@hadoop004 config]# echo "hi homework" >> /home/hadoop/data/hi/hi.log
数据都成功进入到了HDFS和Kafka里面
至此,作业完成了!!