Five: Storm-wordcount Real Time Development
1: write Spout
package wc;

import java.util.Map;

import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichSpout;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Values;

/**
 * Spout that repeatedly emits a fixed test sentence for the word-count topology.
 *
 * <p>Implementing the {@code IRichSpout} interface directly is possible, but extending
 * the {@code BaseRichSpout} abstract class is simpler because it provides empty
 * defaults for the lifecycle methods we do not need.
 *
 * @author Dawn
 * @version 1.0
 */
public class WordCountSpout extends BaseRichSpout {

    // Collector used to emit tuples to the downstream bolt; set once in open().
    private SpoutOutputCollector collector;

    /**
     * Called once when this spout is initialized on a worker; stores the collector.
     *
     * @param conf      topology configuration (unused here)
     * @param context   task/topology context (unused here)
     * @param collector collector used to emit tuples downstream
     */
    @Override
    public void open(Map conf, TopologyContext context, SpoutOutputCollector collector) {
        this.collector = collector;
    }

    /**
     * Called in a loop by Storm; emits one sentence per call.
     */
    @Override
    public void nextTuple() {
        // 1. Send the test sentence to the downstream bolt.
        collector.emit(new Values("Hello Dawn Dawn Indicate Hello"));
        // 2. Throttle the emission rate so the console output stays readable.
        try {
            Thread.sleep(500);
        } catch (InterruptedException e) {
            // Restore the interrupt status instead of swallowing it.
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Declares the single output field; downstream bolts read it by this name.
     *
     * @param declarer used to declare the output schema
     */
    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        // Field alias "Dawn" — the split bolt fetches the line with getStringByField("Dawn").
        declarer.declare(new Fields("Dawn"));
    }
}
2 : Write word bolt
package wc;

import java.util.Map;

import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Tuple;
import org.apache.storm.tuple.Values;

/**
 * Word-splitting bolt: receives a sentence from {@code WordCountSpout},
 * splits it on spaces and emits one {@code <word, 1>} tuple per word.
 *
 * @author Dawn
 * @version 1.0
 */
public class WordCountSplitBolt extends BaseRichBolt {

    // Collector used to forward tuples to the next bolt; set once in prepare().
    private OutputCollector collector;

    /**
     * Called once when this bolt is initialized; stores the collector.
     *
     * @param conf      topology configuration (unused here)
     * @param context   task/topology context (unused here)
     * @param collector collector used to emit tuples downstream
     */
    @Override
    public void prepare(Map conf, TopologyContext context, OutputCollector collector) {
        this.collector = collector;
    }

    /**
     * Splits each incoming sentence and emits {@code <word, 1>} pairs
     * for the counting bolt to accumulate.
     *
     * @param in incoming tuple containing the sentence under field "Dawn"
     */
    @Override
    public void execute(Tuple in) {
        // 1. Fetch the sentence by the field alias declared in the spout.
        String line = in.getStringByField("Dawn");
        // 2. Split the sentence into words on single spaces.
        String[] words = line.split(" ");
        // 3. Emit <word, 1> to the next bolt, which accumulates the counts.
        for (String word : words) {
            collector.emit(new Values(word, 1));
        }
    }

    /**
     * Declares the two output fields read by the counting bolt.
     *
     * @param declarer used to declare the output schema
     */
    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
        declarer.declare(new Fields("Word", "SUM"));
    }
}
3 : Write count bolt
package wc;

import java.util.HashMap;
import java.util.Map;

import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.topology.base.BaseRichBolt;
import org.apache.storm.tuple.Tuple;

/**
 * Counting bolt: accumulates the running total for each word received
 * from {@code WordCountSplitBolt} and prints it to the console.
 *
 * <p>The map is confined to a single bolt task, so no synchronization is
 * needed within one task instance.
 *
 * @author Dawn
 * @version 1.0
 */
public class WordCount extends BaseRichBolt {

    // Per-task running totals: word -> number of occurrences seen so far.
    private final Map<String, Integer> counts = new HashMap<>();

    /**
     * No initialization needed; the map is created at construction time.
     */
    @Override
    public void prepare(Map conf, TopologyContext context, OutputCollector collector) {
    }

    /**
     * Adds the incoming partial count to the running total for the word
     * and prints the updated total.
     *
     * @param in incoming tuple with fields "Word" (the word) and "SUM" (partial count)
     */
    @Override
    public void execute(Tuple in) {
        // 1. Read the word and its partial count from the tuple.
        String word = in.getStringByField("Word");
        Integer sum = in.getIntegerByField("SUM");
        // 2. Accumulate: insert if absent, otherwise add to the existing total.
        counts.merge(word, sum, Integer::sum);
        // 3. Print to the console (stderr shows up in red in most IDE consoles).
        System.err.println(Thread.currentThread().getName()
                + "\t word is:" + word
                + "\t currently occurrences as:" + counts.get(word));
    }

    /**
     * This bolt is the last stage of the topology, so it declares no output fields.
     */
    @Override
    public void declareOutputFields(OutputFieldsDeclarer declarer) {
    }
}
4 : Write driver driving class
WC Package; Import org.apache.storm.Config; Import org.apache.storm.LocalCluster; Import org.apache.storm.StormSubmitter; Import org.apache.storm.generated.AlreadyAliveException; Import org.apache.storm.generated. an AuthorizationException; Import org.apache.storm.generated.InvalidTopologyException; Import org.apache.storm.topology.TopologyBuilder; Import org.apache.storm.tuple.Fields; / ** * @author Dawn * @date June 5, 2019 day 16:18:52 * @version 1.0 * class drivers, and demonstrates the use of different grouping strategy (fields, randomized, global) * / public class WordCountDriver { public static void main (String [] args) { //1.hadoop -> Job storm-> topology to create a topology Builder = new new TopologyBuilder TopologyBuilder (); word split bolt assembly // // setup tasks are randomized parallelism of 2, the total number of tasks 4 // Specify provided 2 // setup tasks spout assembly builder.setSpout ( "WordCountSpout", new WordCountSpout (), 2); // topology name, the data source, the degree of parallelism // set the task of word split bolt assembly , is a field of 2 parallel groups, the total number of tasks. 4 builder.setBolt ( "WordCountSplitBolt", new new WordCountSplitBolt (), 2) .setNumTasks (. 4) .fieldsGrouping ( "WordCountSpout", new new fields ( "Dawn")); / / word count of the task set bolt assembly, a field of the packet, the degree of parallelism of 2 builder.setBolt ( "the wordCount", the wordCount new new (),. 4) .fieldsGrouping ( "WordCountSplitBolt", new new fields ( "Word", "SUM" )); // ============================================= ================================================== ============= " // builder.setBolt (" WordCountSplitBolt ", new new WordCountSplitBolt (), 2) .setNumTasks (. 4) .shuffleGrouping (" WordCountSpout "); // // // word count of the bolt assembly disposed task is randomized, a degree of parallelism 2 // builder.setBolt ( "the WordCount", the WordCount new new (),. 
4) .shuffleGrouping ( "WordCountSplitBolt"); // =========================================== ================================================== =============== " word split bolt assembly // // setup tasks, a global packet parallelism is 2, the total number of tasks 4 // assigned to a task id the minimum value is determined based on the thread id, only points to a thread id oh Oh minimum // builder.setBolt ( "WordCountSplitBolt", new new WordCountSplitBolt (), 2) .setNumTasks (. 4) .globalGrouping ( "WordCountSpout"); // / / // set the task of the bolt assembly word count, a global packet, a degree of parallelism 2 // builder.setBolt ( "the wordCount", the wordCount new new (),. 4) .globalGrouping ( "WordCountSplitBolt"); // // 3. Create the configuration information Config new new Config the conf = (); // conf.setNumWorkers (10); set the number of Work // // trunked mode operation // the try { // StormSubmitter.submitTopology (args [0], the conf, builder.createTopology ()); //} the catch (AlreadyAliveException E) { // // TODO Auto-generated catch block // e.printStackTrace(); // } catch (InvalidTopologyException e) { // // TODO Auto-generated catch block // e.printStackTrace(); // } catch (AuthorizationException e) { // // TODO Auto-generated catch block // e.printStackTrace(); // } //4.提交任务(本地模式) LocalCluster cluster = new LocalCluster(); cluster.submitTopology("wordcounttopology", conf, builder.createTopology()); } }
Six: Storm-wordcount submitted to run on a cluster
1 : Packager to Linux on
2 : Submit task
3 : In the Storm UI point of view task performance
Seven: grouping strategy
Using the above word count program to learn the grouping strategy
Overall diagram: refer to the figure below.
An executor is a thread.
A task is an instance of a spout or bolt; several tasks may share one executor thread.
1) Fields Grouping
Fields grouping: tuples with the same value for the grouping field are always sent to the same task.
operation result:
The output shows that grouping really is by field. Why? The sentence here is "Hello Dawn Dawn Indicate Hello", which contains only three distinct words, while the parallelism is set to 4 (think of it as the number of threads). From the results, only three threads are ever used — that is, tuples with the same field value always go to the same task.
2)shuffle Grouping
Shuffle grouping: tuples are distributed randomly in round-robin fashion, so each bolt task receives roughly the same number of tuples.
operation result:
The output shows this is clearly not fields grouping: with a parallelism of 4, all four threads are in use, and the tuples are distributed evenly at random. Note that the occurrence counts look odd here — it is somewhat analogous to a thread-synchronization issue, since each task keeps its own independent count map. (This analogy is just to aid understanding, not a precise statement.)
3)Non Grouping
No grouping (effectively equivalent to shuffle grouping).
With this strategy, each bolt task receives a different subset of the words.
4) All Grouping
Broadcast: every tuple is sent to every task of the bolt.
5) Global Grouping
Global grouping.
All tuples are routed to the single task with the lowest task id.
Judged by task id: only the task with the smallest id receives tuples.
Run result:
Although we configured 4 threads, global grouping routes everything to the task with the smallest task id — so only that one thread is ever used.
Summary: in practice, fields grouping and shuffle grouping are used most often; the other strategies are rarely needed.