storm-wordCount

Word-count statistics with Storm


1. Topological structure

Data flows SentenceSpout → SentenceBolt → WordCountBolt → ResultBolt:

1. Data source

2. Word split

3. Word count

4. Statistical results

5. Topology construction

2. Code implementation

1. Data source
package com.study.storm.test.wordcount;

import java.util.Map;

import backtype.storm.spout.SpoutOutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichSpout;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Values;

/**
 * Spout: the data source.
 * Produces data and emits it to one or more Bolts for processing.
 */
public class SentenceSpout extends BaseRichSpout {

	/**
	 *
	 */
	private static final long serialVersionUID = -5569170406079406193L;

	private SpoutOutputCollector collector = null ;

	// sample sentences to emit
	private static String [] sentences = {
		"i am a boy ","i like eat","i do not like ","what"
	};

	// array index; also used as the msgId when emitting
	private int index = 0 ;

	/**
	 * From ISpout.
	 * Called repeatedly to emit data.
	 * When there is nothing to emit, sleep briefly to avoid busy-spinning the CPU.
	 */
	@Override
	public void nextTuple() {
		// send each sentence only once
		if(index >= sentences.length){
			return ;
		}
		// emit the tuple content together with its msgId (if the msgId is omitted, it defaults to null and the tuple is not tracked)
		collector.emit(new Values(sentences[index]),index);
		// to emit in an endless loop instead, replace the increment below with:
//		index = (index + 1) % sentences.length;
		index ++ ;
	}

	/**
	 * Use static member variables with caution: they raise thread-safety issues.
	 * SpoutOutputCollector itself is thread-safe, so this collector could safely be made static.
	 */
	
	/**
	 * From the ISpout interface.
	 * Called once when the component is initialized.
	 * @param arg0 configuration map
	 * @param arg1 task/topology context
	 * @param arg2 collector used to emit tuples; thread-safe
	 */
	@SuppressWarnings("rawtypes")
	@Override
	public void open(Map arg0, TopologyContext arg1, SpoutOutputCollector arg2) {
		this.collector = arg2 ;
	}

	/**
	 * From IComponent.
	 * Declares the fields of the output tuples.
	 */
	@Override
	public void declareOutputFields(OutputFieldsDeclarer arg0) {
		arg0.declare(new Fields("sentence"));
	}

	@Override
	public void ack(Object msgId) {
		// called when the tuple with this msgId has been fully processed
		System.out.println("ack : "+msgId);
	}
	
	@Override
	public void fail(Object msgId) {
		System.out.println("fail : "+msgId);
		// processing failed: re-emit the sentence with the same msgId
		this.collector.emit(new Values(sentences[(Integer)msgId]),msgId);
	}
}
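The javadoc on nextTuple notes that the method should not busy-spin when there is nothing to emit. A minimal sketch of that pattern, assuming Storm's bundled backtype.storm.utils.Utils helper, replaces the early return with a short back-off:

import backtype.storm.utils.Utils;

	@Override
	public void nextTuple() {
		if(index >= sentences.length){
			// nothing left to emit: back off instead of spinning the CPU
			Utils.sleep(100); // milliseconds; tune as needed
			return ;
		}
		collector.emit(new Values(sentences[index]),index);
		index ++ ;
	}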




2. Word split
package com.study.storm.test.wordcount;

import java.util.Map;

import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichBolt;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Tuple;
import backtype.storm.tuple.Values;

/**
 * Word split: splits each incoming sentence into individual words.
 */
public class SentenceBolt extends BaseRichBolt {

	/**
	 *
	 */
	private static final long serialVersionUID = -5420313164197072550L;

	private OutputCollector collector ;
	/**
	 * Inherited from IBolt.
	 * Initialization method, called once when the component is initialized.
	 * @param stormConf this Bolt's configuration object
	 * @param context the current environment/topology context
	 * @param collector object used to emit tuples downstream
	 */
	@SuppressWarnings("rawtypes")
	@Override
	public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
		this.collector = collector;
	}

	@Override
	public void execute(Tuple input) {
		try {
			// read the tuple by field name; "sentence" was declared in the spout
			String sentence = input.getStringByField("sentence");
			String [] words = sentence.split(" ");
			for(String word : words){
				// anchor: emit the word tuple anchored to the input tuple, so a downstream failure
				// can be traced back to (and replayed from) the source tuple in the spout
				this.collector.emit(input,new Values(word));
			}
			// acknowledge successful processing of the input tuple
			collector.ack(input);
		} catch (Exception e) {
			collector.fail(input);
		}
		
	}

	@Override
	public void declareOutputFields(OutputFieldsDeclarer declarer) {
		declarer.declare(new Fields("word"));
	}

}
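The anchored emit above ties every word tuple to the sentence it came from, forming a tuple tree that Storm tracks. For contrast, a sketch of the unanchored form (not used in this topology): the word tuple would not be tracked, so a downstream failure could never trigger a replay from the spout.

// anchored: a downstream fail() eventually reaches the spout's fail(msgId)
collector.emit(input, new Values(word));
// unanchored: the word tuple is untracked; failures go unnoticed
collector.emit(new Values(word));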



3. Word count
package com.study.storm.test.wordcount;

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichBolt;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Tuple;
import backtype.storm.tuple.Values;

/**
 * Word count statistics
 */
public class WordCountBolt extends BaseRichBolt {

	/**
	 *
	 */
	private static final long serialVersionUID = -4811405807833269368L;

	private OutputCollector collector = null ;

	/**
	 * Word -> running count; ConcurrentHashMap for thread safety
	 */
	private Map<String,Integer> countMap = new ConcurrentHashMap<String,Integer>();
	
	@SuppressWarnings("rawtypes")
	@Override
	public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
		this.collector = collector ;
	}

	@Override
	public void execute(Tuple input) {
		try {
			String word = input.getStringByField("word");
			countMap.put(word, countMap.containsKey(word) ? countMap.get(word)+1 : 1);
			// emit values in the order declared in declareOutputFields: the word first, then its count
			this.collector.emit(input,new Values(word,countMap.get(word)));
			collector.ack(input);
		} catch (Exception e) {
			collector.fail(input);
		}
	}

	@Override
	public void declareOutputFields(OutputFieldsDeclarer declarer) {
		// declare the output fields: the word and its running count
		declarer.declare(new Fields("word","count"));
	}

}
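The containsKey/get/put sequence works, but since countMap is a ConcurrentHashMap and on Java 8+, the same update can be written as one atomic call (an equivalent alternative, not what the listing above uses):

countMap.merge(word, 1, Integer::sum); // insert 1, or add 1 to the existing count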



4. Statistical results
package com.study.storm.test.wordcount;

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichBolt;
import backtype.storm.tuple.Tuple;

public class ResultBolt extends BaseRichBolt {

	/**
	 *
	 */
	private static final long serialVersionUID = 7436620687730623420L;

	private Map<String,Integer> map = new ConcurrentHashMap<String,Integer>();
	
	private OutputCollector collector = null ;
	
	@SuppressWarnings("rawtypes")
	@Override
	public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
		this.collector = collector ;
	}

	@Override
	public void execute(Tuple input) {
		try {
			// read the word by field name
			String word = input.getStringByField("word");
			// the count is an Integer, so getStringByField would throw a ClassCastException;
			// read it as a plain Object and cast instead
			Integer count = (Integer) input.getValueByField("count");
			
			map.put(word, count);
			
			System.out.println(word +" : " + count);
			
			collector.ack(input);
			
		} catch (Exception e) {
			collector.fail(input);
		}
	}

	@Override
	public void declareOutputFields(OutputFieldsDeclarer declarer) {
		// terminal bolt: emits nothing, so no output fields to declare
	}

	/**
	 * Called when the bolt shuts down cleanly (guaranteed only in local mode);
	 * prints the final counts.
	 */
	@Override
	public void cleanup() {

		System.out.println("Statistical result: ");
		for(String key : map.keySet()){
			System.out.println(key + " : " + map.get(key));
		}
	}
}
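As the comment in execute notes, the count field arrives as an Integer. Besides casting the result of getValueByField, Storm's Tuple interface also offers a typed accessor that avoids the explicit cast:

Integer count = input.getIntegerByField("count");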



5. Topology construction
package com.study.storm.test.wordcount;

import backtype.storm.Config;
import backtype.storm.LocalCluster;
import backtype.storm.StormSubmitter;
import backtype.storm.generated.AlreadyAliveException;
import backtype.storm.generated.InvalidTopologyException;
import backtype.storm.generated.StormTopology;
import backtype.storm.topology.TopologyBuilder;
import backtype.storm.tuple.Fields;

public class WordCountTopology {

	public static void main(String args[]) throws AlreadyAliveException, InvalidTopologyException, InterruptedException{
		
		// Instantiate Spout Bolt
		SentenceSpout sentenceSpout = new SentenceSpout();
		SentenceBolt sentenceBolt = new SentenceBolt();
		WordCountBolt wordCountBolt = new WordCountBolt();
		ResultBolt resultBolt = new ResultBolt();
		
		// Topology
		TopologyBuilder builder = new TopologyBuilder();
		/**
		 * Consider how data is distributed once components run concurrently.
		 * 1. Levels of parallelism
		 * node : a physical server in the Storm cluster
		 * worker : a JVM process running on a node
		 * executor : a thread running inside a worker
		 * task : an instance of a spout/bolt executed by an executor
		 */
		builder.setSpout("sentenceSpout", sentenceSpout);
		// shuffleGrouping: tuples are distributed randomly and evenly across downstream tasks; any task may split any sentence without affecting the result
		builder.setBolt("sentenceBolt", sentenceBolt).shuffleGrouping("sentenceSpout");
		// fieldsGrouping on "word": tuples carrying the same word always go to the same task
		// (roughly hash(word) % numTasks), so no word's count is split across tasks
		// with shuffle grouping and parallelism >= 2, the same word could reach different tasks,
		// each keeping its own partial count, and the totals would be distorted
		builder.setBolt("wordCountBolt", wordCountBolt).fieldsGrouping("sentenceBolt", new Fields("word"));
		// globalGrouping: every tuple goes to a single task regardless of upstream parallelism, so a parallelism hint on this bolt would have no effect
		builder.setBolt("resultBolt", resultBolt).globalGrouping("wordCountBolt");
		
		// build the topology
		StormTopology stormTopology = builder.createTopology();
		
		Config config = new Config();
		// to run on a cluster, submit with StormSubmitter instead:
//		StormSubmitter.submitTopology("wordCountTopology", config, stormTopology);
	
		LocalCluster local = new LocalCluster();
		local.submitTopology("wordCountTopology", config, stormTopology);
		// let the topology run for a while (lengthen when breakpoint-debugging)
		Thread.sleep(10*1000);
		local.killTopology("wordCountTopology");
		local.shutdown();
	}
}
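The comment block in main lists nodes, workers, executors, and tasks, but the example itself runs everything at the default parallelism of one. A sketch of how the same builder calls accept parallelism hints (the numbers here are illustrative only):

// two worker processes for the topology
config.setNumWorkers(2);
builder.setSpout("sentenceSpout", sentenceSpout, 1);
// splitting is stateless, so shuffleGrouping plus parallelism 2 is safe
builder.setBolt("sentenceBolt", sentenceBolt, 2).shuffleGrouping("sentenceSpout");
// counting at parallelism 2 is safe because fieldsGrouping pins each word to one task
builder.setBolt("wordCountBolt", wordCountBolt, 2).fieldsGrouping("sentenceBolt", new Fields("word"));
// globalGrouping funnels everything to a single task, so keep the result bolt at 1
builder.setBolt("resultBolt", resultBolt, 1).globalGrouping("wordCountBolt");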



