使用flume采集web服务器日志,架构见图
每台webserver 的agent的配置:
# Flume agent 'flume74Agent' on web server 10.21.3.74.
# It receives syslog events over TCP, buffers them in one channel and forwards
# them to two Avro collectors through a load-balancing sink group.
flume74Agent.sources=source74
flume74Agent.sinks=sink74-1 sink74-2
flume74Agent.channels=channel74

# Declare the sink group. The key must be spelled "sinkgroups" so that it
# matches the flume74Agent.sinkgroups.group74.* properties further down;
# a misspelled key leaves the group undeclared and its processor unused.
flume74Agent.sinkgroups=group74

# Source: syslog over TCP.
flume74Agent.sources.source74.type=syslogtcp
flume74Agent.sources.source74.port=514
flume74Agent.sources.source74.host=10.21.3.74
flume74Agent.sources.source74.channels=channel74

# Memory channel.
# capacity must be greater than transactionCapacity. The smaller the capacity,
# the less data is lost if the agent dies. keep-alive is a timeout in seconds.
flume74Agent.channels.channel74.type=memory
flume74Agent.channels.channel74.capacity=2000
flume74Agent.channels.channel74.transactionCapacity=1000
flume74Agent.channels.channel74.keep-alive=30

# Alternative: file channel. For better throughput, checkpointDir and dataDirs
# should preferably be placed in separate directories.
#flume74Agent.channels.channel74.type=file
#flume74Agent.channels.channel74.checkpointDir=/usr/local/new-cluster/apache-flume-1.6.0-bin/checkpoint
#flume74Agent.channels.channel74.dataDirs=/usr/local/new-cluster/apache-flume-1.6.0-bin/data
#flume74Agent.channels.channel74.transactionCapacity=10000
#flume74Agent.channels.channel74.checkpointInterval=60000
#flume74Agent.channels.channel74.capacity=20000
#flume74Agent.channels.channel74.keep-alive=30

# First sink, sink74-1: Avro to the collector at 10.21.3.73.
flume74Agent.sinks.sink74-1.type=avro
flume74Agent.sinks.sink74-1.port=4141
flume74Agent.sinks.sink74-1.hostname=10.21.3.73
flume74Agent.sinks.sink74-1.channel=channel74

# Second sink, sink74-2: Avro to the collector at 10.21.3.75.
flume74Agent.sinks.sink74-2.type=avro
flume74Agent.sinks.sink74-2.port=4141
flume74Agent.sinks.sink74-2.hostname=10.21.3.75
flume74Agent.sinks.sink74-2.channel=channel74

# Sink group membership.
flume74Agent.sinkgroups.group74.sinks=sink74-1 sink74-2

# Load-balance across the group: this spreads the load and also avoids losing
# data when one of the collectors goes down.
flume74Agent.sinkgroups.group74.processor.type=load_balance
flume74Agent.sinkgroups.group74.processor.backoff=true
flume74Agent.sinkgroups.group74.processor.selector=random
Flume collector(汇聚节点)的agent配置(以10.21.3.75为例):
# Flume collector agent 'collection75Agent' on 10.21.3.75.
# It receives Avro events on port 4141, stamps them with host and timestamp
# headers, and writes them to HDFS.
# NOTE: a '#' starts a comment only at the beginning of a line in a properties
# file; a trailing "# ..." after a value becomes part of that value. All
# comments below are therefore kept on their own lines.
collection75Agent.sources=source75
collection75Agent.sinks=sink75-1
collection75Agent.channels=channel75

# Source: Avro, with host and timestamp interceptors.
collection75Agent.sources.source75.type=avro
collection75Agent.sources.source75.channels=channel75
collection75Agent.sources.source75.bind=10.21.3.75
collection75Agent.sources.source75.port=4141
collection75Agent.sources.source75.interceptors=i1 i2
collection75Agent.sources.source75.interceptors.i1.type=org.apache.flume.interceptor.HostInterceptor$Builder
collection75Agent.sources.source75.interceptors.i1.preserveExisting=false
collection75Agent.sources.source75.interceptors.i1.hostHeader=hostname
collection75Agent.sources.source75.interceptors.i2.type=org.apache.flume.interceptor.TimestampInterceptor$Builder

# Memory channel.
collection75Agent.channels.channel75.type=memory
collection75Agent.channels.channel75.capacity=2000
collection75Agent.channels.channel75.transactionCapacity=1000
collection75Agent.channels.channel75.keep-alive=30

# Alternative: file channel.
#collection75Agent.channels.channel75.type=file
#collection75Agent.channels.channel75.checkpointDir=/usr/local/new-cluster/apache-flume-1.6.0-bin/checkpoint
#collection75Agent.channels.channel75.dataDirs=/usr/local/new-cluster/apache-flume-1.6.0-bin/data
#collection75Agent.channels.channel75.transactionCapacity=10000
#collection75Agent.channels.channel75.checkpointInterval=60000
#collection75Agent.channels.channel75.capacity=20000
#collection75Agent.channels.channel75.keep-alive=30

# Sink configuration: HDFS sink sink75-1.
collection75Agent.sinks.sink75-1.type=hdfs
collection75Agent.sinks.sink75-1.channel=channel75
collection75Agent.sinks.sink75-1.hdfs.path=hdfs://mycluster1/flume/%Y-%m
collection75Agent.sinks.sink75-1.hdfs.filePrefix=syslog75.%Y-%m-%d
collection75Agent.sinks.sink75-1.hdfs.fileSuffix=.log
collection75Agent.sinks.sink75-1.hdfs.round=true
collection75Agent.sinks.sink75-1.hdfs.roundValue=10
collection75Agent.sinks.sink75-1.hdfs.roundUnit=minute
# How long before rolling over to a new file; 0 means never roll on time.
collection75Agent.sinks.sink75-1.hdfs.rollInterval=0
# File size that triggers rolling over to a new file; 0 means never roll on size.
collection75Agent.sinks.sink75-1.hdfs.rollSize=0
# Number of events written before they are flushed to HDFS.
collection75Agent.sinks.sink75-1.hdfs.batchSize=1000
# Number of events that triggers rolling over to a new file; 0 means never roll on count.
collection75Agent.sinks.sink75-1.hdfs.rollCount=0
collection75Agent.sinks.sink75-1.hdfs.fileType=DataStream
collection75Agent.sinks.sink75-1.hdfs.writeFormat=Text
# Timeout for communication with HDFS.
collection75Agent.sinks.sink75-1.hdfs.callTimeout=600000
collection75Agent.sinks.sink75-1.hdfs.threadsPoolSize=20
collection75Agent.sinks.sink75-1.hdfs.rollTimerPoolSize=5
# If nothing has been written to the file for this long (in seconds), close it
# and rename it to drop the .tmp suffix. With all three roll* settings at 0,
# this is the only setting in this block that closes a file.
collection75Agent.sinks.sink75-1.hdfs.idleTimeout=600

# Sink configuration: optional second HDFS sink sink75-2 (disabled).
#collection75Agent.sinks.sink75-2.type=hdfs
#collection75Agent.sinks.sink75-2.channel=channel75
#collection75Agent.sinks.sink75-2.hdfs.path=hdfs://mycluster1/flume/%Y-%m
#collection75Agent.sinks.sink75-2.hdfs.filePrefix=syslog2.%Y-%m-%d
#collection75Agent.sinks.sink75-2.hdfs.fileSuffix=.log
#collection75Agent.sinks.sink75-2.hdfs.round=true
#collection75Agent.sinks.sink75-2.hdfs.roundValue=10
#collection75Agent.sinks.sink75-2.hdfs.roundUnit=minute
#collection75Agent.sinks.sink75-2.hdfs.rollInterval=0
#collection75Agent.sinks.sink75-2.hdfs.rollSize=0
#collection75Agent.sinks.sink75-2.hdfs.batchSize=1000
#collection75Agent.sinks.sink75-2.hdfs.rollCount=0
#collection75Agent.sinks.sink75-2.hdfs.fileType=DataStream
#collection75Agent.sinks.sink75-2.hdfs.writeFormat=Text
#collection75Agent.sinks.sink75-2.hdfs.callTimeout=600000
#collection75Agent.sinks.sink75-2.hdfs.threadsPoolSize=20
#collection75Agent.sinks.sink75-2.hdfs.rollTimerPoolSize=5
#collection75Agent.sinks.sink75-2.channel=channel75
后台启动flume Agent:
# Launch the Flume agent detached from the terminal; stdout and stderr both go
# to start.log. The value of --name has to be the agent name used as the key
# prefix inside the file given to --conf-file (here: collection73Agent).
nohup flume-ng agent \
  --conf conf/ \
  --conf-file conf/collection73Agent.conf \
  --name collection73Agent \
  > start.log 2>&1 &
rsyslog.conf配置图:
补充:flume-env.sh配置
# JVM options for the Flume agent.
# Memory sizing: fixed 2 GiB heap (-Xms = -Xmx), 256 KiB thread stacks,
# 512 MiB young generation.
JAVA_OPTS="-Xms2048m -Xmx2048m -Xss256k -Xmn512m"
# Garbage collection: ParNew for the young generation, CMS for the old
# generation, and the GC overhead limit check turned off.
JAVA_OPTS="${JAVA_OPTS} -XX:+UseParNewGC -XX:+UseConcMarkSweepGC -XX:-UseGCOverheadLimit"