Collecting files into Kafka

  Collect text data from the specified directories and send it to Kafka.

package com.shenyuchong;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.Date;
import java.util.Properties;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.regex.Pattern;
import org.apache.kafka.clients.producer.Callback;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.Producer;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.clients.producer.RecordMetadata;

/*
 * Collects text files from the configured directories and sends them,
 * line by line, to a Kafka topic.
 */
public class App {
    public static String fieldSequence = "";
    public static int    fieldNum = 0;
    public static String ip = "";
    public static String port = "";
    public static String path = "";
    public static String threadNum = "5";
    public static String topic = "";
    public static String lineRegex = "^.*$";
    public static String delimiter = "\\s+";
    public static String delimiter2 = "|||";
    public static String includeSuffix = "aSuffix,bSuffix";
    public static Pattern linePattern = null;
    public static Properties props = null;
    public static String noticeUrl;

    public static void main(String[] args) {
        /*
         * Exit if no configuration file is given.
         */
        if (args.length < 1) {
            System.err.println("No configuration file supplied");
            System.exit(1);
        }
        try {
            BufferedReader br = new BufferedReader(new FileReader(new File(args[0])));
            String line;
            while ((line = br.readLine()) != null) {
                line = line.replaceAll("\\s+", "");
                if (line.indexOf("=") != -1) {
                    String[] kv = line.split("=", 2);
                    String k = kv[0];
                    String v = kv[1];
                    if (k.equals("port"))          port = v;           // Kafka port
                    if (k.equals("ip"))            ip = v;             // Kafka host address
                    if (k.equals("topic"))         topic = v;          // Kafka topic
                    if (k.equals("fieldsquence"))  fieldSequence = v;  // field sequence, comma-separated
                    if (k.equals("threadnum"))     threadNum = v;      // number of collector threads
                    if (k.equals("path"))          path = v;           // directories to collect, comma-separated
                    if (k.equals("lineregex"))     lineRegex = v;      // line regex; non-matching lines are dropped
                    if (k.equals("delimiter"))     delimiter = v;      // field delimiter
                    if (k.equals("delimiter2"))    delimiter2 = v;     // reassembly delimiter (used in the Kafka message)
                    if (k.equals("includesuffix")) includeSuffix = v;  // file suffixes to include
                    if (k.equals("noticeurl"))     noticeUrl = v;      // endpoint notified when collection finishes
                }
            }
            br.close();
        } catch (IOException e1) {
            e1.printStackTrace();
        }
        /*
         * Kafka configuration.
         */
        props = new Properties();
        props.put("bootstrap.servers", ip + ":" + port);
        props.put("acks", "all");
        props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");

        fieldNum = fieldSequence.split(",").length;
        linePattern = Pattern.compile(lineRegex);

        /*
         * Thread pool.
         */
        ExecutorService es = Executors.newFixedThreadPool(Integer.parseInt(threadNum));
        /*
         * List the files under each directory in path; for every file whose
         * suffix matches includesuffix, submit a send(file) task. One task per
         * file; the actual number of threads is capped by threadnum.
         */
        for (String dirPath : path.split(",")) {
            File dir = new File(dirPath);
            File[] files = dir.listFiles();
            if (files == null) continue; // not a directory, or an I/O error
            for (final File file : files) {
                for (String suffix : includeSuffix.split(",")) {
                    if (file.getAbsolutePath().endsWith(suffix)) {
                        es.submit(new Runnable() {
                            @Override
                            public void run() {
                                send(file);
                            }
                        });
                    }
                }
            }
        }
        /*
         * Shut down the thread pool.
         */
        es.shutdown();
        /*
         * After the pool has terminated, notify the downstream service,
         * retrying until it accepts the request.
         */
        boolean stop = false, noticed = false;
        try {
            while (!stop || !noticed) {
                if (es.isTerminated()) {
                    stop = true;
                }
                Thread.sleep(2000);
                if (stop) {
                    noticed = connectSuccess(noticeUrl);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /*
     * Read a file and send its lines to Kafka; once the content has been
     * sent, rename the file with a .COMPLETED suffix.
     */
    public static void send(File file) {
        BufferedReader bf = null;
        try {
            bf = new BufferedReader(new FileReader(file));
            String line;
            Producer<String, String> producer = new KafkaProducer<>(props);
            while ((line = bf.readLine()) != null) {
                line = line.trim();
                if (!linePattern.matcher(line).matches()) continue; // drop lines that don't match lineregex
                String[] fields = line.split(delimiter);
                if (fields.length < fieldNum) continue;             // drop lines with too few fields
                StringBuilder sb = new StringBuilder();
                for (String fieldValue : fields)
                    sb.append(fieldValue).append(delimiter2);
                sb.append(file.getAbsolutePath());                  // append the source file path
                producer.send(new ProducerRecord<String, String>(topic, String.valueOf(new Date().getTime()), sb.toString()), new Callback() {
                    @Override
                    public void onCompletion(RecordMetadata metadata, Exception exception) {
                        // metadata may be null on failure, so only print the exception
                        if (exception != null)
                            System.out.println("Failed to send record to Kafka: " + exception);
                    }
                });
            }
            producer.close();
        } catch (Exception e) {
            System.out.println(e.toString());
        } finally {
            if (bf != null)
                try {
                    bf.close();
                } catch (Exception e) {
                    e.printStackTrace();
                }
        }
        file.renameTo(new File(file.getAbsolutePath() + ".COMPLETED"));
    }

    /*
     * Request the given URL; return true only on an HTTP 200 response.
     */
    public static boolean connectSuccess(String path) {
        try {
            URL url = new URL(path);
            HttpURLConnection con = (HttpURLConnection) url.openConnection();
            if (con.getResponseCode() == 200) return true;
        } catch (Exception e) {
            return false;
        }
        return false;
    }
}
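
  To make the reassembly concrete, here is a minimal sketch of the per-line transform that send() performs, using the default delimiters from the code above; the sample line and file path are invented:

// Sketch of the per-line transform in send(); the input line and path are hypothetical.
public class TransformDemo {
    public static void main(String[] args) {
        String line = "#1 tom 89";                            // hypothetical input line
        String[] fields = line.trim().split("\\s+");          // default delimiter
        StringBuilder sb = new StringBuilder();
        for (String f : fields)
            sb.append(f).append("|||");                       // default delimiter2
        sb.append("/home/ftpuser/customer/a.txt");            // source file path is appended last
        System.out.println(sb);                               // #1|||tom|||89|||/home/ftpuser/customer/a.txt
    }
}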

  Write the configuration file customer2kafka.conf:

ip=192.168.1.91
threadnum=20
port=9092
topic=customertopic
path=/home/ftpuser/customer
includesuffix=txt
lineregex=^#\d.*$
delimiter=\s+
noticeurl=http://192.168.1.92:6009/schedule/customer
fieldsquence=id,name,score
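
  Once every file has been processed, the collector keeps polling noticeurl until it receives an HTTP 200. If the downstream scheduler is not ready yet, a throwaway receiver can stand in. The sketch below uses the JDK's built-in com.sun.net.httpserver; the port and path mirror the sample conf, and the class name is made up:

import com.sun.net.httpserver.HttpServer;
import java.io.IOException;
import java.io.OutputStream;
import java.net.InetSocketAddress;

// Minimal stand-in for the noticeurl endpoint; always answers 200.
public class NoticeReceiver {
    public static void main(String[] args) throws IOException {
        HttpServer server = HttpServer.create(new InetSocketAddress(6009), 0);
        server.createContext("/schedule/customer", exchange -> {
            byte[] body = "ok".getBytes();
            exchange.sendResponseHeaders(200, body.length);
            try (OutputStream os = exchange.getResponseBody()) {
                os.write(body);
            }
        });
        server.start();
    }
}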

  Build the jar with mvn package, then run:

java -jar file2kafka-2.0.jar /opt/app/file2kafka/customer2kafka.conf
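
  After a run, the quickest sanity check is to read the topic back. A minimal throwaway consumer might look like this (a sketch against kafka-clients 2.0; the group id is arbitrary, and the broker address and topic come from the sample conf):

import java.time.Duration;
import java.util.Collections;
import java.util.Properties;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.clients.consumer.ConsumerRecords;
import org.apache.kafka.clients.consumer.KafkaConsumer;

// Polls customertopic a few times and prints whatever the collector produced.
public class TopicCheck {
    public static void main(String[] args) {
        Properties props = new Properties();
        props.put("bootstrap.servers", "192.168.1.91:9092"); // broker from the sample conf
        props.put("group.id", "file2kafka-check");           // arbitrary group id
        props.put("auto.offset.reset", "earliest");
        props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        props.put("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        try (KafkaConsumer<String, String> consumer = new KafkaConsumer<>(props)) {
            consumer.subscribe(Collections.singletonList("customertopic"));
            for (int i = 0; i < 10; i++) { // poll a few times, then exit
                ConsumerRecords<String, String> records = consumer.poll(Duration.ofSeconds(1));
                for (ConsumerRecord<String, String> r : records)
                    System.out.println(r.key() + " -> " + r.value());
            }
        }
    }
}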

  pom.xml

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>

<groupId>com.shenyuchong</groupId>
<artifactId>file2kafka</artifactId>
<version>2.0</version>
<packaging>jar</packaging>

<name>file2kafka</name>
<url>http://maven.apache.org</url>

<properties>
  <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<dependencies>
  <dependency>
    <groupId>org.apache.kafka</groupId>
    <artifactId>kafka-clients</artifactId>
    <version>2.0.0</version>
  </dependency>
</dependencies>
<build>
  <sourceDirectory>src</sourceDirectory>
  <plugins>
    <plugin>
      <artifactId>maven-assembly-plugin</artifactId>
      <configuration>
        <appendAssemblyId>false</appendAssemblyId>
        <descriptorRefs>
          <descriptorRef>jar-with-dependencies</descriptorRef>
        </descriptorRefs>
        <archive>
          <manifest>
            <mainClass>com.shenyuchong.App</mainClass>
          </manifest>
        </archive>
      </configuration>
      <executions>
        <execution>
          <id>make-assembly</id>
          <phase>package</phase>
          <goals>
            <goal>single</goal>
          </goals>
        </execution>
      </executions>
    </plugin>

</plugins>
</build>
</project>

Source: www.cnblogs.com/shenyuchong/p/11454506.html