I. Grouping strategies (stream grouping)
-------------------------------------------------------------------------
1.shuffle -- random grouping: tuples are handed to downstream tasks with no particular pattern (see the builder sketch after this list).
2.fields -- hashes on the chosen key field; tuples with the same key always go to the same downstream task.
3.all -- sends every tuple to all downstream tasks; each task gets its own copy of the message. Broadcast mode.
4.direct -- sends each tuple to one explicitly chosen downstream task; only that task receives the message. Point-to-point mode.
a. Get the task id of the target task
int taskID = 0;
Map<Integer, String> map = context.getTaskToComponent();
Set<Integer> set = map.keySet();
for(Integer i : set)
{
if(map.get(i).equals("spiltBolt"))
{
taskID = i;
break;
}
}
b. Emit the tuple directly to that task
collector.emitDirect(taskID,new Values(line));
c. In the App, set the grouping to direct grouping
builder.setBolt("spiltBolt", new SpiltBolt(), 8).directGrouping("wordSpout").setNumTasks(4);
5.global -- sorts the task ids of all downstream tasks and sends every message to the task whose id comes first (the lowest id). Effectively a special case of direct grouping.
a. In the App, set the grouping to global grouping
builder.setBolt("spiltBolt", new SpiltBolt(), 4).globalGrouping("wordSpout").setNumTasks(4);
6.custom -- user-defined grouping
a. Implement a custom CustomStreamGrouping class
---------------------------------------------------------
package test.storm.group.custom;
import org.apache.storm.generated.GlobalStreamId;
import org.apache.storm.grouping.CustomStreamGrouping;
import org.apache.storm.task.WorkerTopologyContext;
import java.util.ArrayList;
import java.util.List;
/**
* Custom grouping: sends each tuple to the first half of the downstream tasks
*/
public class MyGrouping implements CustomStreamGrouping {
//task ids of the downstream tasks that receive the data
private List<Integer> targetTaskIds = new ArrayList<Integer>();
public void prepare(WorkerTopologyContext context, GlobalStreamId stream, List<Integer> targetTasks) {
targetTaskIds = targetTasks;
}
public List<Integer> chooseTasks(int taskId, List<Object> values) {
List<Integer> returnIds = new ArrayList<Integer>();
//use only the first half of the target tasks
for (int i = 0; i <targetTaskIds.size() / 2 ; i++) {
returnIds.add(targetTaskIds.get(i));
}
return returnIds;
}
}
b. In the App, set the grouping to the custom grouping
builder.setBolt("spiltBolt", new SpiltBolt(), 4).customGrouping("wordSpout",new MyGrouping()).setNumTasks(4);
II. Adjusting Storm's log output
------------------------------------------------------------------
The default configuration lives inside the storm-core jar (storm-core-1.3.jar in these notes), under [main/resources/log4j2.xml]:
<configuration monitorInterval="60">
    <Appenders>
        <Console name="Console" target="SYSTEM_OUT">
            <PatternLayout pattern="%-4r [%t] %-5p %c{1.} - %msg%n"/>
        </Console>
    </Appenders>
    <Loggers>
        <Logger name="org.apache.zookeeper" level="ERROR"/>
        <Root level="error">
            <AppenderRef ref="Console"/>
        </Root>
    </Loggers>
</configuration>
III. Guaranteed processing: the ack/fail mechanism
----------------------------------------------------------------
1. The ack() function
Callback invoked on the spout when a tuple has been fully processed.
2. The fail() function
Callback invoked on the spout when processing of a tuple fails.
3. msgID
For guaranteed processing, every tuple the spout emits must carry a msgID.
4. The last bolt in the chain must ack the tuple; the spout's ack() callback then fires, signalling that the tuple was consumed successfully (see the bolt sketch after this list).
5. If the tuple fails in any bolt along the chain, the spout's fail() callback fires, signalling that consumption failed.
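A minimal sketch of the bolt side, assuming a SpiltBolt-style bolt whose execute() receives one line per tuple (collector is the OutputCollector saved in prepare()); the anchored emit and the explicit ack()/fail() calls are what drive the spout callbacks described above:
----------------------------------------------------------
public void execute(Tuple input) {
    try {
        String line = input.getString(0);
        //anchored emit: links the new tuple to the incoming one so failures propagate back to the spout
        collector.emit(input, new Values(line.toUpperCase()));
        //tell Storm this tuple is fully processed; eventually triggers the spout's ack()
        collector.ack(input);
    } catch (Exception e) {
        //tell Storm this tuple failed; triggers the spout's fail()
        collector.fail(input);
    }
}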
IV. Guaranteed consumption with two maps in the spout
------------------------------------------------------------------------
1. Rework the spout
----------------------------------------------------
package test.storm.group.custom;
import org.apache.storm.spout.SpoutOutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.IRichSpout;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.tuple.Fields;
import org.apache.storm.tuple.Values;
import java.util.HashMap;
import java.util.Map;
import java.util.Random;
/**
* Word source spout -- the "faucet" of the topology
*/
public class WordSpout implements IRichSpout {
private TopologyContext context;
private SpoutOutputCollector collector;
//holds every in-flight message, keyed by msgId
private Map<Long, String> msgMap = new HashMap<Long, String>();
//failure count per msgId
private Map<Long, Integer> failMap = new HashMap<Long, Integer>();
public void open(Map map, TopologyContext topologyContext, SpoutOutputCollector spoutOutputCollector) {
context = topologyContext;
collector = spoutOutputCollector;
}
public void close() {
}
public void activate() {
}
public void deactivate() {
}
/**
* Emit the next tuple
*/
public void nextTuple() {
String line = "how are you" + " tom" + new Random().nextInt(100);
//record the message, keyed by msgId, before emitting it
Long msgId = System.currentTimeMillis();
msgMap.put(msgId, line);
collector.emit(new Values(line),msgId);
try {
Thread.sleep(1000);
} catch (InterruptedException e) {
e.printStackTrace();
}
}
public void ack(Object o) {
if (msgMap.containsKey(o)) {
msgMap.remove(o);
}
if (failMap.containsKey(o)) {
failMap.remove(o);
}
}
public void fail(Object o) {
if (failMap.containsKey(o)) {
int count = failMap.get(o);
count ++;
if (count > 3) {
//give up on this message after 3 failed attempts
msgMap.remove(o);
failMap.remove(o);
} else {
failMap.put((Long)o,count);
//resend
String line = msgMap.get(o);
collector.emit(new Values(line),o);
}
}
else
{
failMap.put((Long)o,1);
//resend
String line = msgMap.get(o);
collector.emit(new Values(line),o);
}
}
public void declareOutputFields(OutputFieldsDeclarer outputFieldsDeclarer) {
outputFieldsDeclarer.declare(new Fields("line"));
}
public Map<String, Object> getComponentConfiguration() {
return null;
}
}
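A minimal sketch of how this spout might be wired into a topology; the component names and Config settings are illustrative assumptions, and the downstream SpiltBolt is assumed to anchor and ack as in section III:
----------------------------------------------------
package test.storm.group.custom;
import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.topology.TopologyBuilder;
public class AckApp {
    public static void main(String[] args) {
        TopologyBuilder builder = new TopologyBuilder();
        builder.setSpout("wordSpout", new WordSpout());
        //SpiltBolt anchors its emits and acks/fails each incoming tuple (see section III)
        builder.setBolt("spiltBolt", new SpiltBolt(), 2).shuffleGrouping("wordSpout");
        Config conf = new Config();
        //acker tasks track each tuple tree; the default is one per worker, set explicitly here
        conf.setNumAckers(1);
        LocalCluster cluster = new LocalCluster();
        cluster.submitTopology("ack-demo", conf, builder.createTopology());
    }
}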
V. Integrating Kafka with Storm
--------------------------------------------------------------
1. Storm acts as the Kafka consumer: it pulls messages from the Kafka topic and hands them to the Storm cluster for computation.
2. Add the Maven dependencies
<dependency>
<groupId>org.apache.storm</groupId>
<artifactId>storm-kafka</artifactId>
<version>1.0.3</version>
</dependency>
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka_2.10</artifactId>
<version>0.8.1.1</version>
<exclusions>
<exclusion>
<groupId>org.apache.zookeeper</groupId>
<artifactId>zookeeper</artifactId>
</exclusion>
<exclusion>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
</exclusion>
</exclusions>
</dependency>
3. Start the Kafka and Storm clusters (see the topic/producer commands after the App class below).
4. Rewrite the App class
----------------------------------------------
package test.storm.kafka;
import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.kafka.*;
import org.apache.storm.spout.SchemeAsMultiScheme;
import org.apache.storm.topology.TopologyBuilder;
import java.util.UUID;
public class App {
public static void main(String [] args)
{
TopologyBuilder builder = new TopologyBuilder();
//ZooKeeper connection string
String zkConnString = "s200:2181,s300:2181,s400:2181";
//broker hosts discovered via ZooKeeper
BrokerHosts hosts = new ZkHosts(zkConnString);
//spout configuration: topic "test", ZK root "/test", random consumer id
SpoutConfig spoutConfig = new SpoutConfig(hosts, "test", "/test", UUID.randomUUID().toString());
spoutConfig.scheme = new SchemeAsMultiScheme(new StringScheme());
KafkaSpout kafkaSpout = new KafkaSpout(spoutConfig);
builder.setSpout("kafkaspout", kafkaSpout).setNumTasks(2);
builder.setBolt("split-bolt", new SpiltBolt(),2).shuffleGrouping("kafkaspout").setNumTasks(2);
Config conf = new Config();
conf.setNumWorkers(2);
conf.setDebug(true);
/**
* Run Storm in local mode
*/
LocalCluster cluster = new LocalCluster();
cluster.submitTopology("wc", conf, builder.createTopology());
}
}
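To give the topology something to consume, the "test" topic has to exist and contain messages. A sketch using the Kafka command-line tools; the broker host/port (s200:9092) and the partition/replication settings are assumptions, not part of the original notes:
$>kafka-topics.sh --create --zookeeper s200:2181 --replication-factor 3 --partitions 3 --topic test
$>kafka-console-producer.sh --broker-list s200:9092 --topic test
how are you tom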
VI. Integrating HBase with Storm
--------------------------------------------------------------
1. Description
Write the computed results into an HBase table.
HBase offers high throughput, random access, and real-time reads/writes.
Components: master; regionServer | region | wal | hadoop
2. Create the HBase wordcount table with column family f1
$>hbase shell
$hbase shell>create 'ns1:wordcount' , 'f1'
3. Add the Maven dependencies
<dependencies>
<dependency>
<groupId>org.apache.storm</groupId>
<artifactId>storm-core</artifactId>
<version>1.0.3</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.11</version>
</dependency>
<dependency>
<groupId>org.apache.storm</groupId>
<artifactId>storm-kafka</artifactId>
<version>1.0.2</version>
</dependency>
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
<version>1.2.17</version>
</dependency>
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka_2.10</artifactId>
<version>0.8.1.1</version>
<exclusions>
<exclusion>
<groupId>org.apache.zookeeper</groupId>
<artifactId>zookeeper</artifactId>
</exclusion>
<exclusion>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.storm</groupId>
<artifactId>storm-hbase</artifactId>
<version>1.0.3</version>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-client</artifactId>
<version>1.2.3</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>2.7.3</version>
</dependency>
</dependencies>
4. Copy the HBase configuration files [hbase-site.xml/hdfs-site.xml] into the resources directory
5. HbaseBolt
----------------------------------------------------------
package test.storm.hbase;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.storm.task.OutputCollector;
import org.apache.storm.task.TopologyContext;
import org.apache.storm.topology.IRichBolt;
import org.apache.storm.topology.OutputFieldsDeclarer;
import org.apache.storm.tuple.Tuple;
import java.io.IOException;
import java.util.Map;
/**
* Writes data into HBase
*/
public class HbaseBolt implements IRichBolt {
private TopologyContext context;
private OutputCollector collector;
private Table tb;
public void prepare(Map stormConf, TopologyContext context, OutputCollector collector) {
this.context = context;
this.collector = collector;
try {
//load the HBase configuration (hbase-site.xml from the classpath)
Configuration conf = HBaseConfiguration.create();
//create a connection through the factory
Connection conn = ConnectionFactory.createConnection(conf);
//get table
TableName tbName = TableName.valueOf("ns1:wordcount");
tb = conn.getTable(tbName);
} catch (IOException e) {
e.printStackTrace();
}
}
public void execute(Tuple input) {
String word = input.getString(0);
int count = input.getInteger(1);
System.out.println("word : count => " + word + ":" + count);
//use HBase's increment (counter) mechanism to accumulate the word count
byte[] rowkey = Bytes.toBytes(word);
byte[] f = Bytes.toBytes("f1");
byte[] c = Bytes.toBytes("count");
try {
tb.incrementColumnValue(rowkey,f,c,count);
} catch (IOException e) {
e.printStackTrace();
}
}
public void cleanup() {
}
public void declareOutputFields(OutputFieldsDeclarer declarer) {
}
public Map<String, Object> getComponentConfiguration() {
return null;
}
}
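A minimal sketch of how HbaseBolt might be wired into a word-count topology; the component names are illustrative, and SpiltBolt is assumed (not shown in the original notes) to split each line and emit a "word" field plus a "count" field:
----------------------------------------------------------
package test.storm.hbase;
import org.apache.storm.Config;
import org.apache.storm.LocalCluster;
import org.apache.storm.topology.TopologyBuilder;
import org.apache.storm.tuple.Fields;
public class HbaseApp {
    public static void main(String[] args) {
        TopologyBuilder builder = new TopologyBuilder();
        builder.setSpout("wordSpout", new WordSpout());
        //SpiltBolt is assumed to emit ("word", "count") tuples, e.g. (word, 1) per occurrence
        builder.setBolt("spiltBolt", new SpiltBolt(), 2).shuffleGrouping("wordSpout");
        //fields grouping on "word": partial counts for the same word always hit the same HbaseBolt task
        builder.setBolt("hbaseBolt", new HbaseBolt(), 2).fieldsGrouping("spiltBolt", new Fields("word"));
        Config conf = new Config();
        LocalCluster cluster = new LocalCluster();
        cluster.submitTopology("wc-hbase", conf, builder.createTopology());
    }
}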
6. Check the HBase table data
$hbase> get_counter 'ns1:wordcount' , 'word' , 'f1:count'