大数据项目流程1

本文链接： https://blog.csdn.net/Romantic_sir/article/details/102689153

1、将工程打成jar包放到linux中运行。

2、启动nginx 目的是为了产生日志，还有负载均衡和反向代理以后更新，重点是配置文件

#user  nobody;
worker_processes  1;
 
#error_log  logs/error.log;
#error_log  logs/error.log  notice;
#error_log  logs/error.log  info;
 
#pid        logs/nginx.pid;
 
 
events {
    worker_connections  1024;
}
 
 
http {
    include       mime.types;
    default_type  application/octet-stream;
 
    log_format  main  '$remote_addr';
 
    #access_log  logs/access.log  main;
 
    sendfile        on;
    #tcp_nopush     on;
 
    #keepalive_timeout  0;
    keepalive_timeout  65;
 
    #gzip  on;
    upstream frame-tomcat {
          server hdp-3:8888; 
##指明nginx转发地址
    }
    server {
        listen       80;
        server_name  hdp-1;
##nginx的服务地址
        #charset koi8-r;
 
        access_log  logs/log.frame.access.log  main;
##输出生成的日志文件的路径和格式
        location / {
            # root   html;
            # index  index.html index.htm;
            proxy_pass http://frame-tomcat;
##代理传递（转发）
        }
 
        error_page   500 502 503 504  /50x.html;
        location = /50x.html {
            root   html;
        }
    }
    server {
        listen       80;
        server_name  localhost;
 
        #charset koi8-r;
 
        #access_log  logs/host.access.log  main;
 
        location / {
            root   html;
            index  index.html index.htm;
        }
 
        #error_page  404              /404.html;
 
        # redirect server error pages to the static page /50x.html
        #
        error_page   500 502 503 504  /50x.html;
        location = /50x.html {
            root   html;
        }
 
        # proxy the PHP scripts to Apache listening on 127.0.0.1:80
        #
        #location ~ \.php$ {
        #    proxy_pass   http://127.0.0.1;
        #}
 
        # pass the PHP scripts to FastCGI server listening on 127.0.0.1:9000
        #
        #location ~ \.php$ {
        #    root           html;
        #    fastcgi_pass   127.0.0.1:9000;
        #    fastcgi_index  index.php;
        #    fastcgi_param  SCRIPT_FILENAME  /scripts$fastcgi_script_name;
        #    include        fastcgi_params;
        #}
 
        # deny access to .htaccess files, if Apache's document root
        # concurs with nginx's one
        #
        #location ~ /\.ht {
        #    deny  all;
        #}
    }
 
 
    # another virtual host using mix of IP-, name-, and port-based configuration
    #
    #server {
    #    listen       8000;
    #    listen       somename:8080;
    #    server_name  somename  alias  another.alias;
 
    #    location / {
    #        root   html;
    #        index  index.html index.htm;
    #    }
    #}
 
 
    # HTTPS server
    #
    #server {
    #    listen       443;
    #    server_name  localhost;
 
    #    ssl                  on;
    #    ssl_certificate      cert.pem;
    #    ssl_certificate_key  cert.key;
 
    #    ssl_session_timeout  5m;
 
    #    ssl_protocols  SSLv2 SSLv3 TLSv1;
    #    ssl_ciphers  HIGH:!aNULL:!MD5;
    #    ssl_prefer_server_ciphers   on;
 
    #    location / {
    #        root   html;
    #        index  index.html index.htm;
    #    }
    #}
 
}

3、flume采集nginx产生的日志文件到kafka，重点是配置文件

a1.sources = source1
a1.sinks = k1
a1.channels = c1
# Describe/configure the source
a1.sources.source1.type = exec
##指明数据源来自于一个可执行指令
a1.sources.source1.command = tail -F /usr/local/nginx/logs/log.frame.access.log
##可执行指令，跟踪一个文件中的内容
# Describe the sink
##下沉到kafka的下沉类型
a1.sinks.k1.type = org.apache.flume.sink.kafka.KafkaSink
a1.sinks.k1.topic = animal
a1.sinks.k1.brokerList = hdp-2:9092, hdp-3:9092
a1.sinks.k1.requiredAcks = 1
a1.sinks.k1.batchSize = 20
a1.sinks.k1.channel = c1
# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100
 
# Bind the source and sink to the channel
a1.sources.source1.channels = c1
a1.sinks.k1.channel = c1

4、启动kafka，之前启动zookeeper，在消费者中收到数据并在本地产生临时路径（log4j进行滚动文件存储）

### \u8BBE\u7F6E###
#log4j.rootLogger=debug,stdout,genlog
log4j.rootLogger=logRollingFile,stdout


log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.Target=System.out
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=[%-5p] %d{yyyy-MM-dd HH:mm:ss,SSS} method:%l%n%m%n


###
log4j.logger.logRollingFile= DEBUG,test1  
log4j.appender.test1 = org.apache.log4j.RollingFileAppender 
log4j.appender.test1.layout = org.apache.log4j.PatternLayout 
log4j.appender.test1.layout.ConversionPattern = %m%n
log4j.appender.test1.Threshold = DEBUG 
log4j.appender.test1.ImmediateFlush = TRUE 
log4j.appender.test1.Append = TRUE 
log4j.appender.test1.File = f:/testlog/access.log 
log4j.appender.test1.MaxFileSize = 10KB 
log4j.appender.test1.MaxBackupIndex = 200 
### log4j.appender.test1.Encoding = UTF-8

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.Collections;
import java.util.Properties;

import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.log4j.Logger;

public class ConsumerDemo {
    private static KafkaConsumer<String, String> consumer;
    private static Properties props;
    //static静态代码块，在main之前先执行
    static {
        props = new Properties();
        //消费者kafka地址
        props.put("bootstrap.servers", "hdp-1:9092");
            //key反序列化
        props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        //组
        props.put("group.id", "yangk");
    }

    /**
     * 从kafka中获取数据（SpringBoot也集成了kafka）
     */

    private static void ConsumerMessage() {
        Logger logger = Logger.getLogger("logRollingFile");
        //允许自动提交位移
        props.put("enable.auto.commit", true);
        consumer = new KafkaConsumer<String, String>(props);
        consumer.subscribe(Collections.singleton("first_kafka"));

        //使用轮询拉取数据--消费完成之后会根据设置时长来清除消息，被消费过的消息，如果想再次被消费，可以根据偏移量(offset)来获取
        try {
            File file = new File("F:/xiu.txt");
            FileOutputStream fos = new FileOutputStream(file);
            while (true) {
                //从kafka中读到了数据放在records中
                ConsumerRecords<String, String> records = consumer.poll(100);
                for (ConsumerRecord<String, String> r : records) {

                    String a ="topic:"+ r.topic()+",offset:"+ r.offset()+",key:"+ r.key()+",value:"+ r.value() + "\r\n";
                    //将数据写到F:/xiu.txt
//                    fos.write(a.getBytes());

                    System.out.printf("topic = %s, offset = %s, key = %s, value = %s", r.topic(), r.offset(),
                            r.key(), r.value() + "\n");
                    logger.debug(r.value());

                }
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            consumer.close();
        }
    }

    public static void main(String[] args) {
        //调用接收消息的方法
        ConsumerMessage();

    }
}

5、上传到hdfshive表中，hive中建立外部表

建表语句：create external table xiangmu1(ip string ) row format delimited location '/usr/';

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.net.URI;
import java.util.TimerTask;

public class UploadData extends TimerTask{

    public void run() {


        URI uri = null;
        try {
            uri = new URI("hdfs://hdp-1:9000");


        Configuration conf = new Configuration();
        conf.set("dfs.replication", "2");//name,value 副本个数
        conf.set("dfs.blocksize", "64m");//块的大小
        String user = "root";

        FileSystem fs = FileSystem.get(uri,conf,user);
        Path src = new Path("F:/testlog/access.log");
        Path dst = new Path("/usr/a.txt");
        fs.copyFromLocalFile(src,dst);
        fs.close();
        } catch (Exception e) {
            e.printStackTrace();
        }

    }


}

6、分析pv总值：selelct count(*) from xiangmu1;

猜你喜欢