Word statistics
1. Topological structure
1. Data source
2. Word split
3. Word count
4. Statistical results
5. Topology construction
2. Code implementation
1. Word source
package com.study.storm.test.wordcount;

import java.util.Map;

import backtype.storm.spout.SpoutOutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichSpout;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Values;

/**
 * Spout: the data source of the topology.
 * Produces sentences and sends them to one or more Bolts for processing.
 */
public class SentenceSpout extends BaseRichSpout {

    private static final long serialVersionUID = -5569170406079406193L;

    // Output component handed over in open(); SpoutOutputCollector is
    // thread-safe, so it could even be held in a static field if needed.
    private SpoutOutputCollector collector = null;

    // Fixed test data. Declared final so the shared static array reference
    // cannot be reassigned (use static mutable state with caution).
    private static final String[] sentences = {
        "i am a boy ", "i like eat", "i do not like ", "what"
    };

    // Next array index to emit; doubles as the msgId attached to each tuple.
    private int index = 0;

    /**
     * From ISpout. Storm calls this repeatedly in a loop; when there is
     * nothing to send, an implementation may sleep briefly to avoid
     * spinning the CPU.
     */
    @Override
    public void nextTuple() {
        // Emit each sentence exactly once; once the array is exhausted,
        // simply return on every subsequent call.
        if (index >= sentences.length) {
            return;
        }
        // Emit the sentence, tagging it with its index as the msgId
        // (if no msgId is passed, it defaults to null and the tuple is untracked).
        collector.emit(new Values(sentences[index]), index);
        index++;
    }

    /**
     * From ISpout. Called once when the component is initialized.
     *
     * @param arg0 configuration information
     * @param arg1 task/topology context information
     * @param arg2 the component used to emit data; thread-safe
     */
    @SuppressWarnings("rawtypes")
    @Override
    public void open(Map arg0, TopologyContext arg1, SpoutOutputCollector arg2) {
        this.collector = arg2;
    }

    /**
     * From IComponent. Declares the single output field "sentence"
     * consumed downstream via getStringByField("sentence").
     */
    @Override
    public void declareOutputFields(OutputFieldsDeclarer arg0) {
        arg0.declare(new Fields("sentence"));
    }

    /** Called when a tuple was fully processed downstream. */
    @Override
    public void ack(Object msgId) {
        System.out.println("ack : " + msgId);
    }

    /** Called when a tuple failed downstream: re-emit it by its msgId (the array index). */
    @Override
    public void fail(Object msgId) {
        System.out.println("fail : " + msgId);
        this.collector.emit(new Values(sentences[(Integer) msgId]), msgId);
    }
}
2. Word split
package com.study.storm.test.wordcount;

import java.util.Map;

import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichBolt;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Tuple;
import backtype.storm.tuple.Values;

/**
 * Splits each incoming sentence into words and emits one tuple per word.
 */
public class SentenceBolt extends BaseRichBolt {

    private static final long serialVersionUID = -5420313164197072550L;

    private OutputCollector collector;

    /**
     * Inherited from IBolt; called once when the component is initialized.
     *
     * @param stormConf this Bolt's configuration object
     * @param context   current environment information
     * @param collector the object used to emit tuples downstream
     */
    @SuppressWarnings("rawtypes")
    @Override
    public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
        this.collector = collector;
    }

    @Override
    public void execute(Tuple input) {
        try {
            // Fetch the sentence by the field name declared in the spout.
            String sentence = input.getStringByField("sentence");
            // Anchor each emitted word to the input tuple so that failures
            // can be traced back to (and replayed from) the upstream source.
            for (String word : sentence.split(" ")) {
                this.collector.emit(input, new Values(word));
            }
            // Acknowledge successful processing of the input tuple.
            collector.ack(input);
        } catch (Exception e) {
            // Any processing error marks the tuple as failed for replay.
            collector.fail(input);
        }
    }

    /** Declares the single output field "word". */
    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("word"));
    }
}
3. Word count
package com.study.storm.test.wordcount;

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichBolt;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Tuple;
import backtype.storm.tuple.Values;

/**
 * Word count statistics: maintains a running count per word and emits
 * ["word", "count"] downstream for every incoming word tuple.
 */
public class WordCountBolt extends BaseRichBolt {

    private static final long serialVersionUID = -4811405807833269368L;

    private OutputCollector collector = null;

    // ConcurrentHashMap for thread safety; note that within a single
    // executor, execute() is invoked by one thread at a time.
    private Map<String, Integer> countMap = new ConcurrentHashMap<String, Integer>();

    @SuppressWarnings("rawtypes")
    @Override
    public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
        this.collector = collector;
    }

    @Override
    public void execute(Tuple input) {
        try {
            String word = input.getStringByField("word");
            // One map lookup instead of containsKey + get + get.
            Integer previous = countMap.get(word);
            int count = (previous == null) ? 1 : previous + 1;
            countMap.put(word, count);
            // Emit in the order declared in declareOutputFields:
            // the word first, then its current count.
            this.collector.emit(input, new Values(word, count));
            collector.ack(input);
        } catch (Exception e) {
            collector.fail(input);
        }
    }

    /** Declares the two output fields: "word" then "count". */
    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("word", "count"));
    }
}
4. Statistical Results
package com.study.storm.test.wordcount;

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichBolt;
import backtype.storm.tuple.Tuple;

/**
 * Terminal bolt: records the latest count for each word and prints
 * the full result table when the topology shuts down.
 */
public class ResultBolt extends BaseRichBolt {

    private static final long serialVersionUID = 7436620687730623420L;

    // Latest count seen per word.
    private Map<String, Integer> map = new ConcurrentHashMap<String, Integer>();

    private OutputCollector collector = null;

    @SuppressWarnings("rawtypes")
    @Override
    public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
        this.collector = collector;
    }

    @Override
    public void execute(Tuple input) {
        try {
            // Read both fields emitted upstream. The count must be read via
            // getValueByField and cast — getStringByField on an Integer
            // would throw a conversion exception.
            String word = input.getStringByField("word");
            Integer count = (Integer) input.getValueByField("count");
            map.put(word, count);
            System.out.println(word + " : " + count);
            collector.ack(input);
        } catch (Exception e) {
            collector.fail(input);
        }
    }

    /** Terminal bolt: nothing is emitted downstream. */
    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
    }

    /**
     * Invoked after the topology finishes running normally
     * (e.g. on local-cluster shutdown); prints the accumulated counts.
     */
    @Override
    public void cleanup() {
        System.out.println("Statistical result: ");
        for (Map.Entry<String, Integer> entry : map.entrySet()) {
            System.out.println(entry.getKey() + " : " + entry.getValue());
        }
    }
}
5. Topology Construction
package com.study.storm.test.wordcount; import backtype.storm.Config; import backtype.storm.LocalCluster; import backtype.storm.StormSubmitter; import backtype.storm.generated.AlreadyAliveException; import backtype.storm.generated.InvalidTopologyException; import backtype.storm.generated.StormTopology; import backtype.storm.topology.TopologyBuilder; import backtype.storm.tuple.Fields; public class WordCountTopology { public static void main(String args[]) throws AlreadyAliveException, InvalidTopologyException, InterruptedException{ // Instantiate Spout Bolt SentenceSpout sentenceSpout = new SentenceSpout(); SentenceBolt sentenceBolt = new SentenceBolt(); WordCountBolt wordCountBolt = new WordCountBolt(); ResultBolt resultBolt = new ResultBolt(); // Topology TopologyBuilder builder = new TopologyBuilder(); /** * Consider how to distribute data under concurrent conditions * 1. Concurrency level * node : corresponds to the server in the storm cluster * worker : thread level * executor : thread level * task : */ builder.setSpout("sentenceSpout", sentenceSpout); // Random distribution: that is, data can be sent to any next-level processing machine without affecting the statistical results builder.setBolt("sentenceBolt", sentenceBolt).shuffleGrouping("sentenceSpout"); // Distributed according to field = word, the data passed by the same word is sent to the same machine to avoid statistical omission // hash % taskNum // If this is random distribution and the parallel level is greater than or equal to 2, word a is sent to 1 machine, count a: 1, the second time the word is sent to 2 machines, the number of a will be distorted builder.setBolt("wordCountBolt", wordCountBolt).fieldsGrouping("sentenceBolt", new Fields("word")); // If the number of parallels in the upper level is large, no matter which one is processed, it will be sent to the same processing machine. In this way, the setting of the number of parallels is invalid. 
builder.setBolt("resultBolt", resultBolt).globalGrouping("wordCountBolt"); // production topology StormTopology stormTopology = builder.createTopology(); Config config = new Config(); // cluster running // StormSubmitter.submitTopology("wordCountTopology", config, stormTopology); LocalCluster local = new LocalCluster(); local.submitTopology("wordCountTopology", config, stormTopology); // breakpoint debugging, adjust time Thread.sleep(10*1000); local.killTopology("wordCountTopology"); local.shutdown(); } }