Storm + Kafka Integration

Original post: http://blog.csdn.net/ch717828/article/details/50748912

 

1. Machine & Environment Preparation

I prepared three machines:

   10.101.214.71
   10.101.214.73
   10.101.214.74

Kafka and Storm are installed on all three; see my earlier two articles for the setup details.

Note: in those articles I installed Storm 0.9.1, which is missing several jars needed for Kafka integration, so I upgraded to 0.9.2.


 

2. Custom Storm Logging

To print the messages Storm receives from Kafka clearly, I defined a custom log appender.

 

<!-- On machines 73 and 74, edit /usr/share/storm/logback/cluster.xml -->
<appender name="mylog" class="ch.qos.logback.core.rolling.RollingFileAppender">
      <file>${storm.home}/logs/mylog.log</file><!-- output path of the log file -->
      <rollingPolicy class="ch.qos.logback.core.rolling.FixedWindowRollingPolicy">
        <fileNamePattern>${storm.home}/logs/mylog.log.%i</fileNamePattern><!-- naming pattern for rotated files -->
        <minIndex>1</minIndex>
        <maxIndex>20</maxIndex><!-- together, these two lines control how many rotated files are kept -->
      </rollingPolicy>
      <triggeringPolicy class="ch.qos.logback.core.rolling.SizeBasedTriggeringPolicy">
        <maxFileSize>100MB</maxFileSize><!-- maximum size of one log file -->
      </triggeringPolicy>
      <encoder>
        <pattern>%d{yyyy-MM-dd'T'HH:mm:ss.SSSZZ} %c{1} [%p] %m%n</pattern><!-- format of each log line -->
      </encoder>
  </appender>

<logger name="ch.main.MyKafkaTopology" additivity="false">
<!-- name selects which package's (or a single class's) log output to capture -->
    <level value="INFO"/><!-- minimum level to output; the business logs here are at INFO -->
    <appender-ref ref="mylog"/><!-- the name of the appender defined above -->
  </logger>


Once this is configured, all INFO logs printed by ch.main.MyKafkaTopology are written to /usr/share/storm/logs/mylog.log.
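
One subtlety: the <logger name> above names the outer class ch.main.MyKafkaTopology, while the code in section 3 logs from inner classes such as KafkaWordSplitter, whose logger is named ch.main.MyKafkaTopology$KafkaWordSplitter. This works because logback treats `$` like `.` when building the logger hierarchy, so inner-class loggers inherit the outer class's configuration. A minimal sketch of the convention (LoggerNameDemo is a hypothetical class, not part of the project):

package ch.main;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

// Hypothetical demo class: shows how logger names map onto the
// <logger name="..."> entries configured in cluster.xml.
public class LoggerNameDemo {

    public static class Inner {
        // getLogger(Class) uses the fully qualified class name, here
        // "ch.main.LoggerNameDemo$Inner"; logback resolves "$" like ".",
        // so this logger falls under a configured "ch.main.LoggerNameDemo" logger.
        private static final Logger LOG = LoggerFactory.getLogger(Inner.class);

        public void hello() {
            LOG.info("routed to whichever appender the parent logger references");
        }
    }

    public static void main(String[] args) {
        new Inner().hello();
    }
}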

 

3. Code

pom.xml. Note the exclusions on the Kafka dependency: zookeeper, log4j, and slf4j-log4j12 are excluded so they do not conflict with the versions (and the logback-based logging) that Storm itself provides.

 

<dependencies>
        <dependency>
            <groupId>org.apache.storm</groupId>
            <artifactId>storm-core</artifactId>
            <version>0.9.2-incubating</version>
        </dependency>
        <dependency>
            <groupId>org.apache.storm</groupId>
            <artifactId>storm-kafka</artifactId>
            <version>0.9.2-incubating</version>
        </dependency>
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka_2.11</artifactId>
            <version>0.9.0.0</version>
            <exclusions>
                <exclusion>
                    <groupId>org.apache.zookeeper</groupId>
                    <artifactId>zookeeper</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>log4j</groupId>
                    <artifactId>log4j</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>org.slf4j</groupId>
                    <artifactId>slf4j-log4j12</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
    </dependencies>


Java code:

 

package ch.main;

import backtype.storm.Config;
import backtype.storm.LocalCluster;
import backtype.storm.StormSubmitter;
import backtype.storm.generated.AlreadyAliveException;
import backtype.storm.generated.InvalidTopologyException;
import backtype.storm.spout.SchemeAsMultiScheme;
import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.TopologyBuilder;
import backtype.storm.topology.base.BaseRichBolt;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Tuple;
import backtype.storm.tuple.Values;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import storm.kafka.*;

import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;

/**
 * Created by chenhong on 16/2/24.
 */
public class MyKafkaTopology {

    public static class KafkaWordSplitter extends BaseRichBolt {
        private static final Logger LOG = LoggerFactory.getLogger(KafkaWordSplitter.class);
        private static final long serialVersionUID = 1L;
        private OutputCollector collector;

        @Override
        public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
            this.collector = collector;
        }

        @Override
        public void execute(Tuple input) {
            String line = input.getString(0);
            LOG.info("RECE[kafka -> splitter] " + line);
            String[] words = line.split("\\s+");
            for (String word : words) {
                LOG.info("EMIT[splitter -> counter] " + word);
                // anchor the emitted tuple to the input tuple so failed tuples are replayed
                collector.emit(input, new Values(word, 1));
            }
            collector.ack(input);
        }

        @Override
        public void declareOutputFields(OutputFieldsDeclarer declarer) {
            declarer.declare(new Fields("word", "count"));
        }
    }

    public static class WordCounter extends BaseRichBolt {
        private static final Logger LOG = LoggerFactory.getLogger(WordCounter.class);
        private static final long serialVersionUID = 1L;
        private OutputCollector collector;
        private Map<String, AtomicInteger> counterMap;

        @Override
        public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
            this.collector = collector;
            this.counterMap = new HashMap<String, AtomicInteger>();
        }

        @Override
        public void execute(Tuple input) {
            String word = input.getString(0);
            int count = input.getInteger(1);
            LOG.info("RECE[splitter -> counter] " + word + " : " + count);
            AtomicInteger ai = this.counterMap.get(word);
            if (ai == null) {
                ai = new AtomicInteger();
                this.counterMap.put(word, ai);
            }
            ai.addAndGet(count);
            collector.ack(input);
            LOG.info("CHECK statistics map: " + this.counterMap);
        }

        @Override
        public void declareOutputFields(OutputFieldsDeclarer declarer) {
            declarer.declare(new Fields("word", "count"));
        }

        @Override
        public void cleanup() {
            LOG.info("The final result:");
            Iterator<Map.Entry<String, AtomicInteger>> iter = this.counterMap.entrySet().iterator();
            while (iter.hasNext()) {
                Map.Entry<String, AtomicInteger> entry = iter.next();
                LOG.info(entry.getKey() + "\t:\t" + entry.getValue().get());
            }
        }
    }

    public static void main(String[] args) throws AlreadyAliveException, InvalidTopologyException, InterruptedException {
        String zks = "10.101.214.71:2181,10.101.214.73:2181,10.101.214.74:2181";
        String topic = "my-replicated-topic5";
        String zkRoot = "/kafka";
        String id = "word"; // consumed offsets are stored under zkRoot/id, so id acts much like a consumer group

        BrokerHosts brokerHosts = new ZkHosts(zks, "/kafka/brokers");
        SpoutConfig spoutConf = new SpoutConfig(brokerHosts, topic, zkRoot, id);
        spoutConf.scheme = new SchemeAsMultiScheme(new StringScheme());
        spoutConf.forceFromStart = false; // resume from the last recorded offset instead of the beginning of the topic
        spoutConf.zkServers = Arrays.asList(new String[]{"10.101.214.71", "10.101.214.73", "10.101.214.74"});
        spoutConf.zkPort = 2181;

        TopologyBuilder builder = new TopologyBuilder();
        builder.setSpout("kafka-reader", new KafkaSpout(spoutConf), 5); // the Kafka topic was created with 5 partitions, so spout parallelism is set to 5
        builder.setBolt("word-splitter", new KafkaWordSplitter(), 2).shuffleGrouping("kafka-reader");
        builder.setBolt("word-counter", new WordCounter()).fieldsGrouping("word-splitter", new Fields("word"));

        Config config = new Config();
        String name = MyKafkaTopology.class.getSimpleName();
        if (args != null && args.length > 0) {
            // config.put(Config.NIMBUS_HOST, args[0]);
            config.setNumWorkers(3);
            StormSubmitter.submitTopology(name, config, builder.createTopology());
        } else {
            // no arguments: run for 60 seconds in a local in-process cluster
            config.setMaxTaskParallelism(3);
            LocalCluster cluster = new LocalCluster();
            cluster.submitTopology(name, config, builder.createTopology());
            Thread.sleep(60000);
            cluster.shutdown();
        }
    }
}

4. Submission and Running

Package the project with Maven:

 

mvn clean install

 

To use Kafka from Storm, the following dependency jars must also be copied into the lib directory on each node of the Storm cluster:

 

cp /usr/local/kafka/libs/kafka_2.11-0.9.0.0.jar /usr/share/storm/lib/
cp /usr/local/kafka/libs/scala-library-2.11.7.jar /usr/share/storm/lib/
cp /usr/local/kafka/libs/metrics-core-2.2.0.jar /usr/share/storm/lib/
cp /usr/local/kafka/libs/snappy-java-1.1.1.7.jar /usr/share/storm/lib/
cp /usr/local/kafka/libs/zkclient-0.7.jar /usr/share/storm/lib/
cp /usr/local/kafka/libs/log4j-1.2.17.jar /usr/share/storm/lib/
cp /usr/local/kafka/libs/slf4j-api-1.7.6.jar /usr/share/storm/lib/
cp /usr/local/kafka/libs/jopt-simple-3.2.jar /usr/share/storm/lib/

Submit the topology. (Because we pass a topology name argument, main() takes the StormSubmitter branch; run with no arguments, it would instead execute in a LocalCluster for 60 seconds.)

 

# Submit on machine 71
storm jar StormKafka0.1-1.0-SNAPSHOT.jar ch.main.MyKafkaTopology MyKafkaTopology

# On machine 71, start a Kafka console producer to generate input
/usr/local/kafka_2.11-0.9.0.0/bin/kafka-console-producer.sh --broker-list 10.101.214.71:9092,10.101.214.73:9092,10.101.214.74:9092 --topic my-replicated-topic5
# (type some arbitrary lines)

# On machines 73 and 74, inspect the log
cat /usr/share/storm/logs/mylog.log
# (the lines printed by MyKafkaTopology should appear)
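
Instead of the console producer, the test input can also be generated programmatically. Below is a minimal producer sketch using the new-producer API bundled with kafka_2.11-0.9.0.0; the broker list and topic name match the setup above, while the TestProducer class itself is illustrative and not part of the original project:

package ch.main;

import java.util.Properties;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.Producer;
import org.apache.kafka.clients.producer.ProducerRecord;

public class TestProducer {
    public static void main(String[] args) {
        Properties props = new Properties();
        // same broker list as the console producer above
        props.put("bootstrap.servers", "10.101.214.71:9092,10.101.214.73:9092,10.101.214.74:9092");
        props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");

        Producer<String, String> producer = new KafkaProducer<String, String>(props);
        // each message becomes one line for the kafka-reader spout,
        // which the word-splitter bolt then splits on whitespace
        producer.send(new ProducerRecord<String, String>("my-replicated-topic5", "hello storm kafka"));
        producer.close();
    }
}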


Below is an excerpt of mylog.log from my run:

 

 

2016-02-26T12:47:13.238+0800 c.m.MyKafkaTopology$KafkaWordSplitter [INFO] EMIT[splitter -> counter] 123
2016-02-26T12:47:13.238+0800 c.m.MyKafkaTopology$KafkaWordSplitter [INFO] EMIT[splitter -> counter] 123123
2016-02-26T12:47:13.238+0800 c.m.MyKafkaTopology$KafkaWordSplitter [INFO] EMIT[splitter -> counter] 123123
2016-02-26T12:47:13.238+0800 c.m.MyKafkaTopology$KafkaWordSplitter [INFO] EMIT[splitter -> counter] 123123
2016-02-26T12:47:13.238+0800 c.m.MyKafkaTopology$KafkaWordSplitter [INFO] EMIT[splitter -> counter] aa
2016-02-26T12:47:13.238+0800 c.m.MyKafkaTopology$KafkaWordSplitter [INFO] EMIT[splitter -> counter] bbc
2016-02-26T12:47:13.238+0800 c.m.MyKafkaTopology$KafkaWordSplitter [INFO] EMIT[splitter -> counter] ccc
2016-02-26T12:47:13.238+0800 c.m.MyKafkaTopology$KafkaWordSplitter [INFO] EMIT[splitter -> counter] ddd
2016-02-26T12:47:13.239+0800 c.m.MyKafkaTopology$KafkaWordSplitter [INFO] EMIT[splitter -> counter] eeee
2016-02-26T12:47:13.239+0800 c.m.MyKafkaTopology$KafkaWordSplitter [INFO] EMIT[splitter -> counter] ffffff
2016-02-26T12:47:13.239+0800 c.m.MyKafkaTopology$KafkaWordSplitter [INFO] EMIT[splitter -> counter] jsdkfjasnng
2016-02-26T12:47:13.239+0800 c.m.MyKafkaTopology$WordCounter [INFO] RECE[splitter -> counter] 123 : 1
2016-02-26T12:47:13.239+0800 c.m.MyKafkaTopology$WordCounter [INFO] CHECK statistics map: {=2, aa=6, bbc=6, ccc=6, --broker-list=1, ddd=6, eeee=6, my-replicated-topic5=1, asdfasdfasdf=15, 123=7, jsdkfjasnng=6, 123123=18, 10.101.214.71:9092,10.101.214.73:9092,10.101.214.74:9092=1, --topic=1, v=1, /usr/local/kafka_2.11-0.9.0.0/bin/kafka-console-producer.sh=1, ffffff=6}
2016-02-26T12:47:13.239+0800 c.m.MyKafkaTopology$WordCounter [INFO] RECE[splitter -> counter] 123123 : 1
2016-02-26T12:47:13.239+0800 c.m.MyKafkaTopology$WordCounter [INFO] CHECK statistics map: {=2, aa=6, bbc=6, ccc=6, --broker-list=1, ddd=6, eeee=6, my-replicated-topic5=1, asdfasdfasdf=15, 123=7, jsdkfjasnng=6, 123123=19, 10.101.214.71:9092,10.101.214.73:9092,10.101.214.74:9092=1, --topic=1, v=1, /usr/local/kafka_2.11-0.9.0.0/bin/kafka-console-producer.sh=1, ffffff=6}
2016-02-26T12:47:13.239+0800 c.m.MyKafkaTopology$WordCounter [INFO] RECE[splitter -> counter] 123123 : 1
2016-02-26T12:47:13.239+0800 c.m.MyKafkaTopology$WordCounter [INFO] CHECK statistics map: {=2, aa=6, bbc=6, ccc=6, --broker-list=1, ddd=6, eeee=6, my-replicated-topic5=1, asdfasdfasdf=15, 123=7, jsdkfjasnng=6, 123123=20, 10.101.214.71:9092,10.101.214.73:9092,10.101.214.74:9092=1, --topic=1, v=1, /usr/local/kafka_2.11-0.9.0.0/bin/kafka-console-producer.sh=1, ffffff=6}



 

Other Issues

 

1. Starting Storm fails with: line 61: normclasspath = cygpath if sys.platform == 'cygwin' else identity. This is a Python version problem: the conditional-expression syntax used by the storm script requires Python 2.5 or later.

 

1. Install Python 2.7
2. Edit /usr/bin/storm
3. Change the shebang on the first line from #!/usr/bin/python to #!/home/tops/bin/python2.7

 

 

Many odd problems can crop up during this integration, and I stepped into quite a few pits along the way; if you run into trouble, feel free to leave a comment or send me a message.

Reposted from hunan.iteye.com/blog/2359660