Big data framework: a real-time query framework for telecom fraud

Frameworks:

Flume: a distributed, reliable, and highly available system for collecting, aggregating, and transporting massive volumes of log data. It supports custom data senders for pulling data out of the logging system, and it can do light processing on the data and write it to a variety of receivers (plain text, HDFS, HBase, etc.). A sample agent configuration is sketched after this list.

Kafka: a high-throughput distributed publish-subscribe messaging system that can handle all the activity-stream data of a consumer-scale website. Such activity (page views, searches, and other user actions) is a key ingredient of many social features on the modern web, and because of the throughput involved this data is typically handled through log processing and log aggregation. A topic-creation command is sketched after this list.

Storm: stream-processing engine

MySQL: relational database, the bridge to the web layer (for now, every client involved talks to this database)

Hadoop: distributed big data framework

Hive: data warehouse built on Hadoop

Spark: in-memory distributed big data framework

Redis: key-value store
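
To make the Flume piece concrete, a minimal agent that tails a CDR log file and pushes each line into Kafka might look like the sketch below. This assumes Flume 1.6's KafkaSink; the agent name and file path are made up for illustration:

# flume-cdr.conf: tail a call-detail-record log into the cdr_log Kafka topic (illustrative)
a1.sources = r1
a1.channels = c1
a1.sinks = k1

a1.sources.r1.type = exec
a1.sources.r1.command = tail -F /var/log/cdr/cdr.log
a1.sources.r1.channels = c1

a1.channels.c1.type = memory
a1.channels.c1.capacity = 10000

a1.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
a1.sinks.k1.topic = cdr_log
a1.sinks.k1.brokerList = localhost:9092
a1.sinks.k1.channel = c1

The cdr_log topic consumed by the topology in section 2 has to exist first. With the 0.8-era Kafka this code targets, one way to create it (three partitions, matching the random 0-2 message key used in section 4) is:

bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 3 --topic cdr_log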

Maven resolves the dependencies automatically, so we never have to import jars by hand. A sketch of the relevant pom.xml entries follows.
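
The post pins no versions; the pom.xml fragment below is an educated guess consistent with the package names in the code (backtype.storm and storm.kafka point to Storm 0.9.x, kafka.javaapi.producer to the Kafka 0.8.x producer API):

<!-- versions are assumptions inferred from the imports used below -->
<dependency>
    <groupId>org.apache.storm</groupId>
    <artifactId>storm-core</artifactId>
    <version>0.9.6</version>
    <scope>provided</scope> <!-- supplied by the cluster at run time -->
</dependency>
<dependency>
    <groupId>org.apache.storm</groupId>
    <artifactId>storm-kafka</artifactId>
    <version>0.9.6</version>
</dependency>
<dependency>
    <groupId>org.apache.kafka</groupId>
    <artifactId>kafka_2.10</artifactId>
    <version>0.8.2.2</version>
</dependency>
<dependency>
    <groupId>commons-dbutils</groupId>
    <artifactId>commons-dbutils</artifactId>
    <version>1.6</version>
</dependency>
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>5.1.38</version>
</dependency>

In practice the kafka_2.10 artifact usually needs exclusions for zookeeper and log4j so it does not clash with the jars Storm already ships.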

For environment setup, see the companion article on configuring the Maven environment in Eclipse.

Development is done in Eclipse.

Building the main classes of the telephone-fraud pipeline

1. Database connection: JDBC

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.dbutils.QueryRunner;
import org.apache.commons.dbutils.handlers.ArrayListHandler;

public final class MyDbUtils { // final: no subclassing
	private static String className = "com.mysql.jdbc.Driver";
	private static String url = "jdbc:mysql://192.168.115.130:3306/test?useUnicode=true&characterEncoding=utf-8";
	private static String user = "root";
	private static String password = "root";
	private static QueryRunner queryRunner = new QueryRunner();

	public static final String INSERT_LOG = "INSERT INTO LOG(topdomain,usetime,time) VALUES(?,?,?)";

	// private constructor: this utility class is never instantiated
	private MyDbUtils() {
	}

	static { // register the JDBC driver once, when the class is first loaded
		try {
			Class.forName(className);
		} catch (Exception e) {
			e.printStackTrace();
			throw new RuntimeException(e);
		}
	}

	public static List<String> executeQuerySql(String sql) {
		List<String> result = new ArrayList<String>();
		// try-with-resources closes the connection even when the query fails
		try (Connection connection = getConnection()) {
			// ArrayListHandler maps every row of the ResultSet to an Object[]
			List<Object[]> rows = queryRunner.query(connection, sql, new ArrayListHandler());
			for (Object[] row : rows) {
				result.add(row[0].toString()); // keep only the first column
			}
		} catch (SQLException e) {
			e.printStackTrace();
		}
		return result;
	}

	public static void update(String sql, Object... params) {
		try (Connection connection = getConnection()) {
			queryRunner.update(connection, sql, params);
		} catch (SQLException e) {
			e.printStackTrace();
		}
	}

	// obtain a fresh connection; a connection pool is advisable in production
	public static Connection getConnection() throws SQLException {
		return DriverManager.getConnection(url, user, password);
	}

}
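
A quick usage sketch, using the INSERT_LOG statement defined above (the values are made up):

// insert one row through the predefined parameterized statement
MyDbUtils.update(MyDbUtils.INSERT_LOG, "example.com", 120, 1548650000);

// read the first column of every row back out
List<String> topDomains = MyDbUtils.executeQuerySql("SELECT topdomain FROM LOG");
System.out.println(topDomains);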

2. Creating the Storm topology (roughly the counterpart of a JobTracker job in Hadoop)

import storm.kafka.BrokerHosts;
import storm.kafka.KafkaSpout;
import storm.kafka.SpoutConfig;
import storm.kafka.ZkHosts;
import backtype.storm.Config;
import backtype.storm.LocalCluster;
import backtype.storm.StormSubmitter;
import backtype.storm.generated.AlreadyAliveException;
import backtype.storm.generated.InvalidTopologyException;
import backtype.storm.generated.StormTopology;
import backtype.storm.topology.TopologyBuilder;


public class CdrTopology {
	public static void main(String[] args) {
		TopologyBuilder topologyBuilder = new TopologyBuilder();
		String KAFKASPOUT = KafkaSpout.class.getSimpleName();
		String SPLIT_BOLT = SplitBolt.class.getSimpleName();
		String SAVETOKAFKABOLT = SaveCallLogToKafkaBolt.class.getSimpleName();
		String SPLIT_BOLT1 = SplitBolt1.class.getSimpleName();
		String SAVETOKAFKABOLT1 = SaveCallLogToKafkaBolt1.class.getSimpleName();
		String SAVETOMYSQL = SavaCallLogToMysql.class.getSimpleName();

		// zookeeper configuration
		BrokerHosts hosts = new ZkHosts("localhost:2181"); // the zk ensemble kafka registers with
		String topic = "cdr_log"; // topic to consume
		String zkRoot = "/kafka"; // zk node for offset storage; created automatically if absent (in the zk used by the storm cluster)
		String id = "123"; // consumer group id
		SpoutConfig spoutConf = new SpoutConfig(hosts, topic, zkRoot, id);
		topologyBuilder.setSpout(KAFKASPOUT, new KafkaSpout(spoutConf));
		//topologyBuilder.setSpout(KAFKASPOUT, new KafkaSpout(spoutConf), 3);
		topologyBuilder.setBolt(SPLIT_BOLT1, new SplitBolt1()).shuffleGrouping(KAFKASPOUT);
		// subscribe to the "calllog" stream declared by the registered SplitBolt1 component
		topologyBuilder.setBolt(SAVETOKAFKABOLT1, new SaveCallLogToKafkaBolt1()).shuffleGrouping(SPLIT_BOLT1, "calllog");
		//topologyBuilder.setBolt(SAVETOMYSQL, new SavaCallLogToMysql()).shuffleGrouping(SPLIT_BOLT1);

		StormTopology createTopology = topologyBuilder.createTopology();
		String simpleName = CdrTopology.class.getSimpleName();
		Config config = new Config();
		config.setStatsSampleRate(1D); // sample every tuple, for exact counters
		if (args.length == 0) {
			// no arguments: run inside an in-process local cluster for testing
			LocalCluster localCluster = new LocalCluster();
			localCluster.submitTopology(simpleName, config, createTopology);
		} else {
			try {
				//config.setNumWorkers(45);
				config.setMaxSpoutPending(1000);
				StormSubmitter.submitTopology(simpleName, config, createTopology);
			} catch (AlreadyAliveException e) {
				e.printStackTrace();
			} catch (InvalidTopologyException e) {
				e.printStackTrace();
			}
		}
	}
}
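
With no program arguments the topology runs in an in-process LocalCluster; any argument sends it through StormSubmitter to the real cluster. Assuming CdrTopology lives in the cn.com.cintel.storm_siyuan package seen in the other imports (the jar name is a placeholder), cluster submission would look roughly like:

storm jar storm_siyuan.jar cn.com.cintel.storm_siyuan.CdrTopology cluster

The trailing "cluster" token has no special meaning here; it just makes args.length non-zero.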

3. Custom partitioning of the Kafka messages:

import kafka.producer.Partitioner;
import kafka.utils.VerifiableProperties;

/**
 * Created by jason on 2016/11/27.
 */
public class PartitionerDemo implements Partitioner {

    private VerifiableProperties verifiableProperties;

    // kafka instantiates the partitioner reflectively, passing the producer properties
    public PartitionerDemo(VerifiableProperties verifiableProperties) {
        this.verifiableProperties = verifiableProperties;
    }

    public int partition(Object key, int numPartitions) {
        String strKey = (String) key;
        // partition by the key's hashCode (e.g. a user id); mask the sign bit
        // so a negative hashCode cannot produce a negative partition number
        return (strKey.hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}

4. The bolts that persist each record (to the database table and to Kafka)

import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.Map;

import cn.com.cintel.storm_siyuan.utils.MyDbUtils;
import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichBolt;
import backtype.storm.tuple.Tuple;

public class SavaCallLogToMysql extends BaseRichBolt {
	private OutputCollector collector;
	private int time = 0;
	private String callingnumber = "0";
	private String callednumber = "0";
	private String callingarea = "0";
	private String calledarea = "0";
	int is_land = 0;
	int domain = 0;
	private Connection connection;

	@Override
	public void prepare(Map stormConf, TopologyContext context,
			OutputCollector collector) {
		this.collector = collector;
		this.connection = null; // note: a connection pool is recommended here
	}

	@Override
	public void execute(Tuple tuple) {
		time = tuple.getIntegerByField("time");
		callingnumber = tuple.getStringByField("callingnumber");
		callednumber = tuple.getStringByField("callednumber");
		callingarea = tuple.getStringByField("callingarea");
		calledarea = tuple.getStringByField("calledarea");
		is_land = tuple.getIntegerByField("is_land");
		domain = tuple.getIntegerByField("domain");
		// a parameterized statement avoids quoting bugs and SQL injection
		String sql = "insert into calllog(time,callingnumber,callednumber,callingarea,calledarea,is_inland,domain) values(?,?,?,?,?,?,?)";
		try {
			connection = MyDbUtils.getConnection();
			PreparedStatement stmt = connection.prepareStatement(sql);
			stmt.setInt(1, time);
			stmt.setString(2, callingnumber);
			stmt.setString(3, callednumber);
			stmt.setString(4, callingarea);
			stmt.setString(5, calledarea);
			stmt.setInt(6, is_land);
			stmt.setInt(7, domain);
			System.out.println(sql);
			stmt.executeUpdate();
			stmt.close();
		} catch (SQLException e) {
			e.printStackTrace();
		} finally {
			if (connection != null) {
				try {
					connection.close();
				} catch (SQLException e) {
					e.printStackTrace();
				}
			}
		}
	}

	@Override
	public void declareOutputFields(OutputFieldsDeclarer declarer) {
		// terminal bolt: nothing to declare
	}

}

The second class in this step, SaveCallLogToKafkaBolt, forwards each parsed call log to another Kafka topic:

import java.io.IOException;
import java.util.Map;
import java.util.Properties;
import java.util.Random;

import kafka.javaapi.producer.Producer;
import kafka.producer.KeyedMessage;
import kafka.producer.ProducerConfig;

import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichBolt;
import backtype.storm.tuple.Tuple;

public class SaveCallLogToKafkaBolt extends BaseRichBolt {
    private String topic;
	private Properties prop;
	private Producer<String, String> producer;
	Random random=null;
	private int time;
    private String callingnumber;
    private String callednumber;
    private String callingarea;
    private String calledarea;
	int is_land;
	int domain;
	@Override
	public void prepare(Map arg0, TopologyContext arg1, OutputCollector arg2) {
		System.out.println("SaveCallLogToKafkaBolt start");
		topic = "call_log";
		prop = new Properties();
		try {
			// load the producer configuration from the classpath
			prop.load(SaveCallLogToKafkaBolt.class.getClassLoader().getResourceAsStream("producer.properties"));
		} catch (IOException e) {
			e.printStackTrace();
		}
		/* equivalent inline configuration:
		prop.setProperty("metadata.broker.list", "192.168.115.130:9092,192.168.115.132:9092,192.168.115.133:9092");
		prop.setProperty("partitioner.class", "cn.com.cintel.storm_siyuan.PartitionerDemo");
		prop.setProperty("producer.type", "sync");
		prop.setProperty("compression.codec", "none");
		prop.setProperty("serializer.class", "kafka.serializer.StringEncoder"); */
		producer = new Producer<String, String>(new ProducerConfig(prop));
		random = new Random();
	}
	@Override
	public void execute(Tuple tuple) {
		try {
			StringBuffer keyedMessage = getKeyedMessage(tuple);
			String msg = keyedMessage.toString();
			// a random key in [0,3) spreads messages across three partitions
			Integer tt = random.nextInt(3);
			System.out.println("SaveCallLogToKafkaBolt msg:" + msg);
			producer.send(new KeyedMessage<String, String>(topic, tt.toString(), msg));
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	private StringBuffer getKeyedMessage(Tuple tuple) {
		time = tuple.getIntegerByField("time");
		callingnumber = tuple.getStringByField("callingnumber");
		callednumber = tuple.getStringByField("callednumber");
		callingarea = tuple.getStringByField("callingarea");
		calledarea = tuple.getStringByField("calledarea");
		is_land = tuple.getIntegerByField("is_land");
		domain = tuple.getIntegerByField("domain");
		// assemble a pipe-delimited record; the trailing 55/66/; fields are
		// fixed padding carried over from the original post
		StringBuffer msg = new StringBuffer();
		msg.append(time);
		msg.append("|" + callingnumber);
		msg.append("|" + callednumber);
		msg.append("|" + callingarea);
		msg.append("|" + calledarea);
		msg.append("|" + is_land);
		msg.append("|" + domain);
		msg.append("|" + "55");
		msg.append("|" + "66");
		msg.append("|" + ";");
		return msg;
	}
	@Override
	public void declareOutputFields(OutputFieldsDeclarer arg0) {
		// terminal bolt: nothing to declare
	}

}
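
prepare() loads producer.properties from the classpath, and the commented-out block above spells out exactly what it is expected to contain. Written out as a file, that configuration would be:

# producer.properties (values taken from the commented-out settings above)
metadata.broker.list=192.168.115.130:9092,192.168.115.132:9092,192.168.115.133:9092
partitioner.class=cn.com.cintel.storm_siyuan.PartitionerDemo
producer.type=sync
compression.codec=none
serializer.class=kafka.serializer.StringEncoder

Note that partitioner.class wires in the PartitionerDemo from section 3.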

5. Storm's bolt processing (a bolt encapsulates one step of tuple processing)

import java.util.Map;

import backtype.storm.task.OutputCollector;
import backtype.storm.task.TopologyContext;
import backtype.storm.topology.OutputFieldsDeclarer;
import backtype.storm.topology.base.BaseRichBolt;
import backtype.storm.tuple.Fields;
import backtype.storm.tuple.Tuple;
import backtype.storm.tuple.Values;

public class SplitBolt1 extends BaseRichBolt {
	private OutputCollector collector;
	private String log;
	private String[] splited;

	public void prepare(Map arg0, TopologyContext arg1, OutputCollector arg2) {
		this.collector = arg2;
	}

	public void execute(Tuple input) {
		// field order: time,callingnumber,callednumber,callingarea,calledarea,is_land,domain
		try {
			log = new String(input.getBinaryByField("bytes"));
			splited = log.split("\\|", -1);
			Message msg = new Message();
			msg.setTime(Integer.parseInt(splited[0]));
			msg.setCallingnumber(splited[1]);
			msg.setCallednumber(splited[2]);
			msg.setCallingarea(splited[3]);
			msg.setCalledarea(splited[4]);
			msg.setIs_land(Integer.parseInt(splited[5]));
			msg.setDomain(Integer.parseInt(splited[6]));
			this.collector.emit("calllog", new Values(msg));
			this.collector.ack(input);
		} catch (Exception e) { // malformed record: fail the tuple so it can be replayed
			this.collector.fail(input);
			e.printStackTrace();
		}
	}

	public void declareOutputFields(OutputFieldsDeclarer declarer) {
		// a named stream "calllog" carrying a single field (the Message object)
		declarer.declareStream("calllog", new Fields("calllog"));
	}

}
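
The Message class that SplitBolt1 populates is not shown in the post; a minimal reconstruction with exactly the setters the bolt calls (field types inferred from the parsing above) might look like:

// hypothetical reconstruction of Message; the original class is not shown in the post
public class Message implements java.io.Serializable {
	private int time;
	private String callingnumber;
	private String callednumber;
	private String callingarea;
	private String calledarea;
	private int is_land;
	private int domain;

	public void setTime(int time) { this.time = time; }
	public void setCallingnumber(String v) { this.callingnumber = v; }
	public void setCallednumber(String v) { this.callednumber = v; }
	public void setCallingarea(String v) { this.callingarea = v; }
	public void setCalledarea(String v) { this.calledarea = v; }
	public void setIs_land(int v) { this.is_land = v; }
	public void setDomain(int v) { this.domain = v; }
}

Since the object rides inside a tuple between workers, Storm must serialize it; by default Storm falls back to Java serialization for unregistered classes, which is why this sketch implements Serializable.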

The Spark and Hive parts of the pipeline are covered in the next post.

Origin blog.csdn.net/qq_38798147/article/details/86492686