Frameworks:
Flume: a distributed, reliable, and highly available system for large-scale log collection, aggregation, and transport. It supports custom data senders for collecting data from the logging system, and it can lightly process the data before writing it to various receivers (such as text files, HDFS, HBase, etc.)
Kafka: a high-throughput distributed publish-subscribe messaging system that can handle all of the activity-stream data of a consumer-scale website. Such activity (page views, searches, and other user actions) is a key ingredient of many social features on the modern web, and because of the throughput required, this data is usually handled via logging and log aggregation
Storm: real-time stream computing
MySQL: relational database, used as the bridge to the web tier (at present all of the clients involved interact with the database)
Hadoop: distributed big-data framework
Hive: data warehouse
Spark: in-memory distributed big-data framework
Redis: key-value database
We use Maven to resolve dependencies automatically, so there is no need to import jars by hand. For setup, see the article on configuring the Maven environment in Eclipse.
Development is done in Eclipse.
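With Maven, the dependencies only need to be declared once in pom.xml. A sketch of the relevant entries (the versions are assumptions chosen to match the pre-Apache backtype.storm/storm.kafka packages used below; adjust them to your cluster):
<!-- versions below are illustrative assumptions -->
<dependency>
<groupId>org.apache.storm</groupId>
<artifactId>storm-core</artifactId>
<version>0.9.6</version>
<scope>provided</scope><!-- supplied by the Storm cluster at runtime -->
</dependency>
<dependency>
<groupId>org.apache.storm</groupId>
<artifactId>storm-kafka</artifactId>
<version>0.9.6</version>
</dependency>
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka_2.10</artifactId>
<version>0.8.2.2</version>
</dependency>
<dependency>
<groupId>commons-dbutils</groupId>
<artifactId>commons-dbutils</artifactId>
<version>1.6</version>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.38</version>
</dependency>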
The telephone fraud detection project is built from several classes:
1. Database connection: JDBC
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.dbutils.DbUtils;
import org.apache.commons.dbutils.QueryRunner;
import org.apache.commons.dbutils.handlers.ArrayListHandler;
public final class MyDbUtils { // final: not meant to be subclassed
private static String className = "com.mysql.jdbc.Driver";
private static String url = "jdbc:mysql://192.168.115.130:3306/test?useUnicode=true&characterEncoding=utf-8";
private static String user = "root";
private static String password = "root";
private static QueryRunner queryRunner = new QueryRunner();
public static final String INSERT_LOG = "INSERT INTO LOG(topdomain,usetime,time) VALUES(?,?,?)";
// private constructor: utility class, never instantiated
private MyDbUtils() {
}
static { // register the JDBC driver once, when the class is first loaded
try {
Class.forName(className);
} catch (ClassNotFoundException e) {
throw new RuntimeException(e); // fail fast, preserving the cause
}
}
public static List<String> executeQuerySql(String sql) {
List<String> result = new ArrayList<String>();
Connection connection = null;
try {
connection = getConnection();
// ArrayListHandler returns each row as an Object[]
List<Object[]> rows = queryRunner.query(connection, sql, new ArrayListHandler());
for (Object[] row : rows) {
result.add(row[0].toString()); // keep only the first column of each row
}
} catch (SQLException e) {
e.printStackTrace();
} finally {
DbUtils.closeQuietly(connection); // always release the connection
}
return result;
}
public static void update(String sql, Object... params) {
Connection connection = null;
try {
connection = getConnection();
queryRunner.update(connection, sql, params);
} catch (SQLException e) {
e.printStackTrace();
} finally {
DbUtils.closeQuietly(connection); // always release the connection
}
}
// obtain a new connection
public static Connection getConnection() throws SQLException {
return DriverManager.getConnection(url, user, password);
}
}
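A minimal usage sketch of MyDbUtils (the query and sample values below are illustrative only, assuming the LOG table from INSERT_LOG exists):
List<String> domains = MyDbUtils.executeQuerySql("SELECT topdomain FROM LOG");
MyDbUtils.update(MyDbUtils.INSERT_LOG, "example.com", 120, 1480233600);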
2. Creating the Storm topology (roughly the counterpart of defining a Hadoop JobTracker job)
import storm.kafka.BrokerHosts;
import storm.kafka.KafkaSpout;
import storm.kafka.SpoutConfig;
import storm.kafka.ZkHosts;
import backtype.storm.Config;
import backtype.storm.LocalCluster;
import backtype.storm.StormSubmitter;
import backtype.storm.generated.AlreadyAliveException;
import backtype.storm.generated.InvalidTopologyException;
import backtype.storm.generated.StormTopology;
import backtype.storm.topology.TopologyBuilder;
public class CdrTopology {
public static void main(String[] args) {
TopologyBuilder topologyBuilder = new TopologyBuilder();
String KAFKASPOUT = KafkaSpout.class.getSimpleName();
String SPLIT_BOLT = SplitBolt.class.getSimpleName();
String SAVETOKAFKABOLT = SaveCallLogToKafkaBolt.class.getSimpleName();
String SPLIT_BOLT1 = SplitBolt1.class.getSimpleName();
String SAVETOKAFKABOLT1 = SaveCallLogToKafkaBolt1.class.getSimpleName();
String SAVETOMYSQL = SavaCallLogToMysql.class.getSimpleName();
//configure ZooKeeper
BrokerHosts hosts = new ZkHosts("localhost:2181");//the ZooKeeper address used by Kafka
String topic = "cdr_log";//topic to consume
String zkRoot = "/kafka";//a ZK node for offsets, created automatically if absent [this node lives in the ZK cluster used by Storm]
String id = "123";//consumer group id
SpoutConfig spoutConf = new SpoutConfig(hosts, topic, zkRoot, id);
topologyBuilder.setSpout(KAFKASPOUT, new KafkaSpout(spoutConf));
//topologyBuilder.setSpout(SPOUT_ID, new KafkaSpout(spoutConf),3);
topologyBuilder.setBolt(SPLIT_BOLT1, new SplitBolt1()).shuffleGrouping(KAFKASPOUT);
topologyBuilder.setBolt(SAVETOKAFKABOLT1, new SaveCallLogToKafkaBolt1()).shuffleGrouping(SPLIT_BOLT1,"calllog");//subscribe to the "calllog" stream of the split bolt that is actually wired in
//topologyBuilder.setBolt(SAVETOMYSQL, new SavaCallLogToMysql()).shuffleGrouping(SPLIT_BOLT);
StormTopology createTopology = topologyBuilder.createTopology();
String simpleName = CdrTopology.class.getSimpleName();
Config config = new Config();
config.setStatsSampleRate(1D);// sample every tuple, for exact counts
if(args.length==0){// no arguments: run in a local in-process cluster
LocalCluster localCluster = new LocalCluster();
localCluster.submitTopology(simpleName, config, createTopology);
}else{// any argument: submit to the real cluster
try {
//config.setNumWorkers(45);
config.setMaxSpoutPending(1000);
StormSubmitter.submitTopology(simpleName, config, createTopology);
} catch (AlreadyAliveException e) {
e.printStackTrace();
} catch (InvalidTopologyException e) {
e.printStackTrace();
}
}
}
}
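To run against a cluster, package the project with Maven and submit it through the storm client; the jar name and package below are assumptions based on this project's imports:
storm jar storm_siyuan.jar cn.com.cintel.storm_siyuan.CdrTopology cluster
Without the trailing argument, the main method above falls back to LocalCluster mode.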
3. Custom partitioning of the Kafka messages:
import kafka.producer.Partitioner;
import kafka.utils.VerifiableProperties;
/**
* Created by jason on 2016/11/27.
*/
public class PartitionerDemo implements Partitioner {
private VerifiableProperties verifiableProperties;
public PartitionerDemo(VerifiableProperties verifiableProperties) {
this.verifiableProperties=verifiableProperties;
}
public int partition(Object key, int numPartitions) {
String strKey = (String) key;
// partition by the hashCode of the key (the userid); masking off the sign bit
// guards against negative hash codes, which would yield an invalid partition
return (strKey.hashCode() & Integer.MAX_VALUE) % numPartitions;
}
}
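The partitioner only takes effect once it is registered in the producer configuration, e.g. (using this project's package name):
partitioner.class=cn.com.cintel.storm_siyuan.PartitionerDemo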
4. Creating a class for each table in the database
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.Map;
import cn.com.cintel.storm_siyuan.utils.MyDbUtils;
import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichBolt;
import backtype.storm.tuple.Tuple;
public class SavaCallLogToMysql extends BaseRichBolt{
private OutputCollector collector;
private int time=0;
private String callingnumber="0";
private String callednumber="0";
private String callingarea="0";
private String calledarea="0";
int is_land=0;
int domain=0;
private Connection connection;
@Override
public void prepare(Map stormConf, TopologyContext context,
OutputCollector collector) {
this.collector = collector;
this.connection = null;//note: a connection pool is recommended here
}
@Override
public void execute(Tuple tuple) {
try {
connection = MyDbUtils.getConnection();
time = tuple.getIntegerByField("time");
callingnumber = tuple.getStringByField("callingnumber");
callednumber = tuple.getStringByField("callednumber");
callingarea = tuple.getStringByField("callingarea");
calledarea = tuple.getStringByField("calledarea");
is_land = tuple.getIntegerByField("is_land");
domain = tuple.getIntegerByField("domain");
// a PreparedStatement avoids the quoting bugs and SQL injection risk of string concatenation
String sql = "insert into calllog(time,callingnumber,callednumber,callingarea,calledarea,is_inland,domain) values(?,?,?,?,?,?,?)";
PreparedStatement state = connection.prepareStatement(sql);
state.setInt(1, time);
state.setString(2, callingnumber);
state.setString(3, callednumber);
state.setString(4, callingarea);
state.setString(5, calledarea);
state.setInt(6, is_land);
state.setInt(7, domain);
state.executeUpdate();
collector.ack(tuple); // acknowledge so the spout does not replay the tuple
} catch (SQLException e) {
collector.fail(tuple); // request a replay on failure
e.printStackTrace();
}finally{
if(connection!=null){
try {
connection.close();
} catch (SQLException e) {
e.printStackTrace();
}
}
}
}
@Override
public void declareOutputFields(OutputFieldsDeclarer declarer) {
// terminal sink: this bolt emits no streams
}
@Override
public Map<String, Object> getComponentConfiguration() {
return super.getComponentConfiguration();
}
}
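For reference, a sketch of the calllog table this bolt writes to, reconstructed from the INSERT statement above (the column types and lengths are assumptions):
CREATE TABLE calllog (
time INT,
callingnumber VARCHAR(32),
callednumber VARCHAR(32),
callingarea VARCHAR(64),
calledarea VARCHAR(64),
is_inland INT,
domain INT
);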
import java.io.IOException;
import java.util.Map;
import java.util.Properties;
import java.util.Random;
import kafka.javaapi.producer.Producer;
import kafka.producer.KeyedMessage;
import kafka.producer.ProducerConfig;
import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichBolt;
import backtype.storm.tuple.Tuple;
public class SaveCallLogToKafkaBolt extends BaseRichBolt {
private OutputCollector collector;
private String topic;
private Properties prop;
private Producer<String, String> producer;
Random random=null;
private int time;
private String callingnumber;
private String callednumber;
private String callingarea;
private String calledarea;
int is_land;
int domain;
@Override
public void prepare(Map arg0, TopologyContext arg1, OutputCollector arg2) {
System.out.println("SaveCallLogToKafkaBolt start");
topic = "call_log";
prop = new Properties();
try {
//load the producer configuration from the classpath
prop.load(SaveCallLogToKafkaBolt.class.getClassLoader().getResourceAsStream("producer.properties"));
} catch (IOException e) {
e.printStackTrace();
}
/*prop.setProperty("metadata.broker.list", "192.168.115.130:9092,192.168.115.132:9092,192.168.115.133:9092");
prop.setProperty("partitioner.class", "cn.com.cintel.storm_siyuan.PartitionerDemo");
prop.setProperty("producer.type", "sync");
prop.setProperty("compression.codec", "none");
prop.setProperty("serializer.class", "kafka.serializer.StringEncoder");*/
producer = new Producer<>(new ProducerConfig(prop));
random=new Random();
}
@Override
public void execute(Tuple tuple) {
try {
StringBuffer keyedMessage = getKeyedMessage(tuple);
String msg = keyedMessage.toString();
// a random key in [0,3) spreads the messages across three partitions
Integer tt = random.nextInt(3);
System.out.println("SaveCallLogToKafkaBolt msg:" + msg);
producer.send(new KeyedMessage<String, String>(topic, tt.toString(), msg));
collector.ack(tuple); // acknowledge successful delivery
} catch (Exception e) {
collector.fail(tuple); // request a replay on failure
e.printStackTrace();
}
}
private StringBuffer getKeyedMessage(Tuple tuple) {
// re-assemble the tuple fields into one pipe-delimited record
time = tuple.getIntegerByField("time");
callingnumber = tuple.getStringByField("callingnumber");
callednumber = tuple.getStringByField("callednumber");
callingarea = tuple.getStringByField("callingarea");
calledarea = tuple.getStringByField("calledarea");
is_land = tuple.getIntegerByField("is_land");
domain = tuple.getIntegerByField("domain");
StringBuffer msg = new StringBuffer();
msg.append(time);
msg.append("|" +callingnumber);
msg.append("|" +callednumber);
msg.append("|" +callingarea);
msg.append("|" +calledarea);
msg.append("|" +is_land);
msg.append("|" +domain);
msg.append("|" +"55");
msg.append("|" +"66");
msg.append("|" +";");
return msg;
}
@Override
public void declareOutputFields(OutputFieldsDeclarer arg0) {
// terminal sink: this bolt emits no streams
}
}
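For reference, a producer.properties matching the commented-out inline configuration above:
metadata.broker.list=192.168.115.130:9092,192.168.115.132:9092,192.168.115.133:9092
partitioner.class=cn.com.cintel.storm_siyuan.PartitionerDemo
producer.type=sync
compression.codec=none
serializer.class=kafka.serializer.StringEncoder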
5. Storm's bolt processing (a bolt encapsulates one step of tuple processing)
import java.util.Map;
import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichBolt;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Tuple;
import backtype.storm.tuple.Values;
public class SplitBolt1 extends BaseRichBolt{
private OutputCollector collector;
String log;
String[] splited;
String time;
public void prepare(Map arg0, TopologyContext arg1, OutputCollector arg2) {
this.collector = arg2;
}
public void execute(Tuple input) {
//time,callingnumber,callednumber,callingarea,calledarea,is_land,domain
try {
log = new String(input.getBinaryByField("bytes"));
splited = log.split("\\|",-1);
Message msg = new Message();
msg.setTime(Integer.parseInt(splited[0]));
msg.setCallingnumber(splited[1]);
msg.setCallednumber(splited[2]);
msg.setCallingarea(splited[3]);
msg.setCalledarea(splited[4]);
msg.setIs_land(Integer.parseInt(splited[5]));
msg.setDomain(Integer.parseInt(splited[6]));
this.collector.emit("calllog", new Values(msg));
this.collector.ack(input);
} catch (NumberFormatException e) {
this.collector.fail(input);
e.printStackTrace();
}
}
public void declareOutputFields(OutputFieldsDeclarer declarer) {
declarer.declareStream("calllog",new Fields("calllog"));
}
}
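The Message bean used above is not shown in this part; a minimal sketch reconstructed from the setters called in SplitBolt1 (it implements Serializable so Storm can ship it between workers):
import java.io.Serializable;
public class Message implements Serializable {
private int time;
private String callingnumber;
private String callednumber;
private String callingarea;
private String calledarea;
private int is_land;
private int domain;
public int getTime() { return time; }
public void setTime(int time) { this.time = time; }
public String getCallingnumber() { return callingnumber; }
public void setCallingnumber(String callingnumber) { this.callingnumber = callingnumber; }
public String getCallednumber() { return callednumber; }
public void setCallednumber(String callednumber) { this.callednumber = callednumber; }
public String getCallingarea() { return callingarea; }
public void setCallingarea(String callingarea) { this.callingarea = callingarea; }
public String getCalledarea() { return calledarea; }
public void setCalledarea(String calledarea) { this.calledarea = calledarea; }
public int getIs_land() { return is_land; }
public void setIs_land(int is_land) { this.is_land = is_land; }
public int getDomain() { return domain; }
public void setDomain(int domain) { this.domain = domain; }
}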
The improvements with Spark and Hive are covered in the next part.