前戏
部署单机版Kafka
[hadoop@hadoop004 software]$ wget https://archive.apache.org/dist/kafka/0.8.2.2/kafka_2.11-0.8.2.2.tgz
[hadoop@hadoop004 software]$ tar -zxf kafka_2.11-0.8.2.2.tgz -C ~/app/
[hadoop@hadoop004 kafka_2.11-0.8.2.2]$ bin/zookeeper-server-start.sh config/zookeeper.properties
[hadoop@hadoop004 kafka_2.11-0.8.2.2]$ bin/kafka-server-start.sh config/server.properties
此时,zookeeper那里报了如下错误,其实也不是报错,可以先不管
[2019-05-30 09:54:28,799] INFO Established session 0x16b0670f42f0000 with negotiated timeout 6000 for client /127.0.0.1:52970 (org.apache.zookeeper.server.ZooKeeperServer)
[2019-05-30 09:54:28,830] INFO Got user-level KeeperException when processing sessionid:0x16b0670f42f0000 type:create cxid:0x5 zxid:0x3 txntype:-1 reqpath:n/a Error Path:/brokers Error:KeeperErrorCode = NoNode for /brokers (org.apache.zookeeper.server.PrepRequestProcessor)
[2019-05-30 09:54:28,840] INFO Got user-level KeeperException when processing sessionid:0x16b0670f42f0000 type:create cxid:0xb zxid:0x7 txntype:-1 reqpath:n/a Error Path:/config Error:KeeperErrorCode = NoNode for /config (org.apache.zookeeper.server.PrepRequestProcessor)
[2019-05-30 09:54:28,852] INFO Got user-level KeeperException when processing sessionid:0x16b0670f42f0000 type:create cxid:0x13 zxid:0xc txntype:-1 reqpath:n/a Error Path:/admin Error:KeeperErrorCode = NoNode for /admin (org.apache.zookeeper.server.PrepRequestProcessor)
[2019-05-30 09:54:29,081] INFO Got user-level KeeperException when processing sessionid:0x16b0670f42f0000 type:setData cxid:0x21 zxid:0x12 txntype:-1 reqpath:n/a Error Path:/controller_epoch Error:KeeperErrorCode = NoNode for /controller_epoch (org.apache.zookeeper.server.PrepRequestProcessor)
[2019-05-30 09:54:29,115] INFO Got user-level KeeperException when processing sessionid:0x16b0670f42f0000 type:delete cxid:0x30 zxid:0x14 txntype:-1 reqpath:n/a Error Path:/admin/preferred_replica_election Error:KeeperErrorCode = NoNode for /admin/preferred_replica_election (org.apache.zookeeper.server.PrepRequestProcessor)
[2019-05-30 09:54:29,162] INFO Got user-level KeeperException when processing sessionid:0x16b0670f42f0000 type:create cxid:0x37 zxid:0x15 txntype:-1 reqpath:n/a Error Path:/brokers Error:KeeperErrorCode = NodeExists for /brokers (org.apache.zookeeper.server.PrepRequestProcessor)
[2019-05-30 09:54:29,162] INFO Got user-level KeeperException when processing sessionid:0x16b0670f42f0000 type:create cxid:0x38 zxid:0x16 txntype:-1 reqpath:n/a Error Path:/brokers/ids Error:KeeperErrorCode = NodeExists for /brokers/ids (org.apache.zookeeper.server.PrepRequestProcessor)
部署安装telnet
先检查是否有下面两个安装包
[hadoop@hadoop004 conffile]$ rpm -qa telnet-server
[hadoop@hadoop004 conffile]$ rpm -qa xinetd
[hadoop@hadoop004 conffile]$ yum list |grep telnet
Repodata is over 2 weeks old. Install yum-cron? Or run: yum makecache fast
dcap-tunnel-telnet.x86_64 2.47.12-4.el7 epel
libguac-client-telnet.x86_64 1:0.9.14-1.el7 epel
libtelnet.x86_64 0.21-5.el7 epel
libtelnet-devel.x86_64 0.21-5.el7 epel
libtelnet-utils.x86_64 0.21-5.el7 epel
telnet.x86_64 1:0.17-64.el7 base
telnet-server.x86_64 1:0.17-64.el7 base
[root@hadoop004 ~]# yum install telnet-server.x86_64 -y
[root@hadoop004 ~]# yum install telnet.x86_64 -y
下载Flume
Flume CDH版本的文档地址 http://archive.cloudera.com/cdh5/cdh/5/flume-ng-1.6.0-cdh5.7.0/
[hadoop@hadoop004 software]$ wget http://archive.cloudera.com/cdh5/cdh/5/flume-ng-1.6.0-cdh5.7.0.tar.gz
[hadoop@hadoop004 software]$ tar -zxf flume-ng-1.6.0-cdh5.7.0.tar.gz -C ~/app/
[hadoop@hadoop004 app]$ cd apache-flume-1.6.0-cdh5.7.0-bin/
[hadoop@hadoop004 apache-flume-1.6.0-cdh5.7.0-bin]$ cd conf
[hadoop@hadoop004 conf]$ cp flume-env.sh.template flume-env.sh
[hadoop@hadoop004 conf]$ vim flume-env.sh
export JAVA_HOME=/usr/java/jdk1.8.0_144
调优点
export JAVA_OPTS="-Xms100m -Xmx2000m -Dcom.sun.management.jmxremote"
把-Xms -Xmx的值改为一样的
我们继续,先实践一下官网的例子
[hadoop@hadoop004 conf]$ mkdir conffile
[hadoop@hadoop004 conf]$ cd conffile/
[hadoop@hadoop004 conffile]$ vim example.conf
# example.conf — single-node Flume agent from the official quickstart:
# netcat source (listens on localhost:44444) -> in-memory channel -> logger sink.
# Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1
# Describe/configure the source
# netcat source: every line of text received on the port becomes one event
a1.sources.r1.type = netcat
a1.sources.r1.bind = localhost
a1.sources.r1.port = 44444
# Describe the sink
# logger sink: prints events to the agent's log (use -Dflume.root.logger=INFO,console to see them)
a1.sinks.k1.type = logger
# Use a channel which buffers events in memory
# capacity = max events buffered; transactionCapacity = max events per take/put transaction
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
# Bind the source and sink to the channel
# note: a source takes a list ("channels"), a sink takes exactly one ("channel")
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
# Start the agent named "a1" using example.conf; log events to the console.
./flume-ng agent \
--name a1 \
--conf /home/hadoop/app/apache-flume-1.6.0-cdh5.7.0-bin/conf/conffile \
--conf-file /home/hadoop/app/apache-flume-1.6.0-cdh5.7.0-bin/conf/conffile/example.conf \
-Dflume.root.logger=INFO,console
下面再来一个案例
首先我们看看Agent的选型
source --> channel --> sink --> source --> channel --> sink
exec memory avro avro memory logger
[root@hadoop004 conffile]# vim flume-avro-sink.conf
# flume-avro-sink.conf — upstream agent:
# exec source (tail -F a file) -> memory channel -> avro sink forwarding to localhost:44444.
flume-avro-sink.sources = exec-source
flume-avro-sink.channels = avro-memory-channel
flume-avro-sink.sinks = avro-sink
# exec source runs the command and turns each output line into an event
flume-avro-sink.sources.exec-source.type = exec
flume-avro-sink.sources.exec-source.command = tail -F /home/hadoop/data/avro_access.data
flume-avro-sink.channels.avro-memory-channel.type = memory
# avro sink: must point at the host/port where the downstream avro source listens
flume-avro-sink.sinks.avro-sink.type = avro
flume-avro-sink.sinks.avro-sink.hostname = localhost
flume-avro-sink.sinks.avro-sink.port = 44444
# wire source and sink to the channel
flume-avro-sink.sources.exec-source.channels = avro-memory-channel
flume-avro-sink.sinks.avro-sink.channel = avro-memory-channel
[root@hadoop004 conffile]# vim flume-avro-source.conf
# flume-avro-source.conf — downstream agent:
# avro source (listens on localhost:44444) -> memory channel -> logger sink.
# Must be started BEFORE the upstream agent so its port is open when the avro sink connects.
flume-avro-source.sources = avro-source
flume-avro-source.channels = avro-memory-channel
flume-avro-source.sinks = logger-sink
# avro source: accepts events sent by an upstream avro sink
flume-avro-source.sources.avro-source.type = avro
flume-avro-source.sources.avro-source.bind = localhost
flume-avro-source.sources.avro-source.port = 44444
flume-avro-source.channels.avro-memory-channel.type = memory
# logger sink prints received events to the agent's console log
flume-avro-source.sinks.logger-sink.type = logger
# wire source and sink to the channel
flume-avro-source.sources.avro-source.channels = avro-memory-channel
flume-avro-source.sinks.logger-sink.channel = avro-memory-channel
按顺序启动下面两条命令,一定要按顺序
# Start the downstream (avro source) agent FIRST so port 44444 is listening.
./flume-ng agent \
--name flume-avro-source \
--conf /home/hadoop/app/apache-flume-1.6.0-cdh5.7.0-bin/conf/conffile \
--conf-file /home/hadoop/app/apache-flume-1.6.0-cdh5.7.0-bin/conf/conffile/flume-avro-source.conf \
-Dflume.root.logger=INFO,console
# Then start the upstream (exec source -> avro sink) agent.
./flume-ng agent \
--name flume-avro-sink \
--conf /home/hadoop/app/apache-flume-1.6.0-cdh5.7.0-bin/conf/conffile \
--conf-file /home/hadoop/app/apache-flume-1.6.0-cdh5.7.0-bin/conf/conffile/flume-avro-sink.conf \
-Dflume.root.logger=INFO,console
echo一条数据进去
[root@hadoop004 ~]# echo "hello world flume" >> /home/hadoop/data/avro_access.data
这是由flume-avro-source.conf所启动的Flume进程的截图
这是由flume-avro-sink.conf所启动的Flume进程的截图
成功了!!
Flume还支持这样的架构,这里就不实践了。
下面再来一个案例
首先我们看看Agent的选型
netcat --> channel --> HDFS
--> channel --> Logger
[hadoop@hadoop004 conffile]$ vim channel-replicating-selector.conf
# channel-replicating-selector.conf — one netcat source fanned out to two channels
# via the replicating selector: c1 -> HDFS sink, c2 -> logger sink.
a1.sources = r1
a1.channels = c1 c2
a1.sinks = k1 k2
a1.sources.r1.type = netcat
a1.sources.r1.bind = localhost
a1.sources.r1.port = 44444
# replicating selector: every event is copied to ALL listed channels
a1.sources.r1.selector.type = replicating
a1.sources.r1.channels = c1 c2
a1.channels.c1.type = memory
a1.channels.c2.type = memory
# HDFS sink: path is bucketed per minute via the %y%m%d%H%M escapes
a1.sinks.k1.type = hdfs
a1.sinks.k1.hdfs.path = /flume/g6-events/%y%m%d%H%M
a1.sinks.k1.hdfs.filePrefix = events-
# round the timestamp down to 1-minute buckets
a1.sinks.k1.hdfs.round = true
a1.sinks.k1.hdfs.roundValue = 1
a1.sinks.k1.hdfs.roundUnit = minute
# use the agent's local clock instead of requiring a timestamp header on events
a1.sinks.k1.hdfs.useLocalTimeStamp=true
# write plain text files rather than the default SequenceFile
a1.sinks.k1.hdfs.fileType=DataStream
a1.sinks.k1.hdfs.writeFormat=Text
a1.sinks.k2.type = logger
a1.sinks.k1.channel = c1
a1.sinks.k2.channel = c2
执行命令
# Start the fan-out agent (HDFS must already be running for the hdfs sink).
./flume-ng agent \
--name a1 \
--conf /home/hadoop/app/apache-flume-1.6.0-cdh5.7.0-bin/conf/conffile \
--conf-file /home/hadoop/app/apache-flume-1.6.0-cdh5.7.0-bin/conf/conffile/channel-replicating-selector.conf \
-Dflume.root.logger=INFO,console
再去Hadoop Web UI界面看一看检查一下
成功了!!
我们开始来做个作业吧
首先我们看看Agent的选型
Taildir Source --> File Channel --> AVRO Sink
AVRO Source --> File Channel --> HDFS
--> Kafka
开干吧!
先启动HDFS吧,略
修改Kafka配置文件
[root@hadoop004 config]# vim server.properties
host.name=hadoop004
port=9092
zookeeper.connect=localhost:2181/kafka
但我这里还是使用了0.10.0,之前已在Kafka上创建了一个名为hi的topic,用下面的--list命令确认它存在
[hadoop@hadoop004 kafka_2.11-0.10.0.0]$ bin/kafka-topics.sh --list --zookeeper hadoop004:2181/kafka
hi
启动zookeeper
[hadoop@hadoop004 kafka_2.11-0.10.0.0]$ bin/zookeeper-server-start.sh config/zookeeper.properties
启动Kafka
[hadoop@hadoop004 kafka_2.11-0.10.0.0]$ bin/kafka-server-start.sh config/server.properties
Flume配置文件如下
[hadoop@hadoop004 conffile]$ vim homework_start.conf
# homework_start.conf — upstream agent:
# Taildir source (tails hello.log and hi.log, remembers position) ->
# file channel (durable) -> avro sink forwarding to localhost:44444.
flume-homework.sources = taildir-src
flume-homework.channels = taildir-file-channel
flume-homework.sinks = avro-sink
# TAILDIR source (backported into CDH's flume 1.6): tracks read offsets
# in positionFile so tailing resumes after an agent restart
flume-homework.sources.taildir-src.type = TAILDIR
flume-homework.sources.taildir-src.positionFile = /home/hadoop/data/hello/taildir_position.json
flume-homework.sources.taildir-src.filegroups = f1 f2
flume-homework.sources.taildir-src.filegroups.f1 = /home/hadoop/data/hello/hello.log
# FIX: was "fheaders.f1.headerKey1" — a typo, so the f1 header was never
# attached. The Taildir property is headers.<filegroupName>.<headerKey>.
flume-homework.sources.taildir-src.headers.f1.headerKey1 = hello
flume-homework.sources.taildir-src.filegroups.f2 = /home/hadoop/data/hi/hi.log
flume-homework.sources.taildir-src.headers.f2.headerKey1 = hi
flume-homework.sources.taildir-src.headers.f2.headerKey2 = hi-2
# file channel: events survive an agent crash/restart
flume-homework.channels.taildir-file-channel.type = file
flume-homework.channels.taildir-file-channel.checkpointDir = /home/hadoop/app/apache-flume-1.6.0-cdh5.7.0-bin/tmp_data/src/checkpoint
flume-homework.channels.taildir-file-channel.dataDirs = /home/hadoop/app/apache-flume-1.6.0-cdh5.7.0-bin/tmp_data/src/data
# avro sink: forwards to the downstream agent's avro source
flume-homework.sinks.avro-sink.type = avro
flume-homework.sinks.avro-sink.hostname = localhost
flume-homework.sinks.avro-sink.port = 44444
# wire source and sink to the channel
flume-homework.sources.taildir-src.channels = taildir-file-channel
flume-homework.sinks.avro-sink.channel = taildir-file-channel
[hadoop@hadoop004 conffile]$ vim homework_end.conf
# homework_end.conf — downstream agent:
# avro source -> replicating selector -> two file channels -> HDFS sink + Kafka sink.
flume-homework-end.sources = avro-source
flume-homework-end.channels = hdfs-chan kafka-chan
flume-homework-end.sinks = hdfs-end kafka-end
# Define the source type: avro, listening for the upstream agent's avro sink
flume-homework-end.sources.avro-source.type = avro
flume-homework-end.sources.avro-source.bind = localhost
flume-homework-end.sources.avro-source.port = 44444
# Replicate every event into both channels so HDFS and Kafka each get a full copy
flume-homework-end.sources.avro-source.selector.type = replicating
flume-homework-end.sources.avro-source.channels = hdfs-chan kafka-chan
# Durable file channels (one per sink, so a slow sink doesn't starve the other)
flume-homework-end.channels.hdfs-chan.type = file
flume-homework-end.channels.hdfs-chan.checkpointDir = /home/hadoop/app/apache-flume-1.6.0-cdh5.7.0-bin/tmp_data/hdfs/checkpoint
flume-homework-end.channels.hdfs-chan.dataDirs = /home/hadoop/app/apache-flume-1.6.0-cdh5.7.0-bin/tmp_data/hdfs/data
flume-homework-end.channels.kafka-chan.type = file
flume-homework-end.channels.kafka-chan.checkpointDir = /home/hadoop/app/apache-flume-1.6.0-cdh5.7.0-bin/tmp_data/kafka/checkpoint
flume-homework-end.channels.kafka-chan.dataDirs = /home/hadoop/app/apache-flume-1.6.0-cdh5.7.0-bin/tmp_data/kafka/data
# HDFS sink: text files bucketed into 1-minute directories using local time
flume-homework-end.sinks.hdfs-end.type = hdfs
flume-homework-end.sinks.hdfs-end.hdfs.path = /flume/g6-hdfs/%y%m%d%H%M
flume-homework-end.sinks.hdfs-end.hdfs.filePrefix = events-
flume-homework-end.sinks.hdfs-end.hdfs.round = true
flume-homework-end.sinks.hdfs-end.hdfs.roundValue = 1
flume-homework-end.sinks.hdfs-end.hdfs.roundUnit = minute
flume-homework-end.sinks.hdfs-end.hdfs.useLocalTimeStamp=true
flume-homework-end.sinks.hdfs-end.hdfs.fileType=DataStream
flume-homework-end.sinks.hdfs-end.hdfs.writeFormat=Text
# Kafka sink: publishes each event to the "hi" topic on broker hadoop004:9092
# (flume 1.6-style properties: topic/brokerList/requiredAcks/batchSize)
flume-homework-end.sinks.kafka-end.type = org.apache.flume.sink.kafka.KafkaSink
flume-homework-end.sinks.kafka-end.topic = hi
flume-homework-end.sinks.kafka-end.brokerList = hadoop004:9092
flume-homework-end.sinks.kafka-end.requiredAcks = 1
flume-homework-end.sinks.kafka-end.batchSize = 20
# kept for debugging: swap the kafka sink for a logger sink by toggling this line
#flume-homework-end.sinks.kafka-end.type = logger
# wire each sink to its own channel
flume-homework-end.sinks.hdfs-end.channel = hdfs-chan
flume-homework-end.sinks.kafka-end.channel = kafka-chan
启动Flume agent,一定要按顺序,先把后面的启动起来,再启动前面的
# Start the downstream agent FIRST so its avro source is listening on 44444.
./flume-ng agent \
--name flume-homework-end \
--conf /home/hadoop/app/apache-flume-1.6.0-cdh5.7.0-bin/conf/conffile \
--conf-file /home/hadoop/app/apache-flume-1.6.0-cdh5.7.0-bin/conf/conffile/homework_end.conf \
-Dflume.root.logger=INFO,console
# Then start the upstream (taildir -> avro) agent.
./flume-ng agent \
--name flume-homework \
--conf /home/hadoop/app/apache-flume-1.6.0-cdh5.7.0-bin/conf/conffile \
--conf-file /home/hadoop/app/apache-flume-1.6.0-cdh5.7.0-bin/conf/conffile/homework_start.conf \
-Dflume.root.logger=INFO,console
[root@hadoop004 config]# echo "hello homework" >> /home/hadoop/data/hello/hello.log
启动一个Kafka消费者来验证是否有数据到达Kafka
[hadoop@hadoop004 kafka_2.11-0.10.0.0]$ bin/kafka-console-consumer.sh --zookeeper localhost:2181/kafka --topic hi --from-beginning
再插入另外一条数据到另外一个被监听的文件
[root@hadoop004 config]# echo "hi homework" >> /home/hadoop/data/hi/hi.log
数据都成功进入到了HDFS和Kafka里面
至此,作业完成了!!