采集指定目录下文本数据到kafka
package com.shenyuchong;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Properties;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.regex.Pattern;

import org.apache.kafka.clients.producer.Callback;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.Producer;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.clients.producer.RecordMetadata;
import org.apache.kafka.common.serialization.StringSerializer;

/**
 * Collects text files from the configured directories and streams their lines
 * to a Kafka topic.
 *
 * <p>Usage: {@code java -jar file2kafka.jar <config-file>}. The config file is
 * a list of {@code key=value} lines (one pair per line; all whitespace inside a
 * line is stripped before parsing). Each matching file is read line by line;
 * lines matching {@code lineregex} are split on {@code delimiter}, re-joined
 * with {@code delimiter2}, suffixed with the source file path and sent to
 * Kafka. A fully processed file is renamed with a {@code .COMPLETED} suffix.
 * After all files are processed, {@code noticeurl} is polled until it answers
 * HTTP 200.
 */
public class App {
    public static List<File> SendedFiles = new ArrayList<>(); // kept for interface compatibility (unused here)
    public static String fieldSquence = "";          // comma-separated field names; only the count is used
    public static int fieldNum = 0;                  // minimum number of fields a line must split into
    public static String ip = "";                    // Kafka broker host
    public static String port = "";                  // Kafka broker port
    public static String path = "";                  // comma-separated source directories
    public static String threadNum = "5";            // worker pool size
    public static String topic = "";                 // destination Kafka topic
    public static String lineRegex = "^.*$";         // lines not matching are dropped
    public static String delimiter = "\\s+";         // split regex for input fields
    public static String delimiter2 = "|||";         // literal separator for the outgoing record
    public static String includesuffix = "aSuffix,bSuffix"; // comma-separated file-name suffixes to collect
    public static Pattern linePattern = null;
    public static Properties props = null;
    public static String noticeUrl;                  // endpoint notified once collection finishes

    /**
     * Entry point: loads the configuration, schedules one send task per
     * matching file on a fixed thread pool, then notifies {@code noticeUrl}.
     *
     * @param args args[0] is the path of the configuration file (required)
     */
    public static void main(String[] args) {
        // The configuration file is mandatory: exit instead of falling through
        // (the previous code threw+caught an exception and then crashed on args[0]).
        if (args.length < 1) {
            System.err.println("无配置文件");
            System.exit(1);
        }
        try (BufferedReader br = new BufferedReader(new FileReader(new File(args[0])))) {
            String line;
            while ((line = br.readLine()) != null) {
                line = line.replaceAll("\\s+", "");
                if (line.indexOf('=') == -1) {
                    continue;
                }
                // limit=2 keeps '=' characters inside the value (e.g. URLs with query strings)
                String[] kv = line.split("=", 2);
                String k = kv[0];
                String v = kv[1];
                switch (k) {
                    case "port":          port = v; break;          // kafka port
                    case "ip":            ip = v; break;            // kafka host
                    case "topic":         topic = v; break;         // kafka topic
                    case "fieldsquence":  fieldSquence = v; break;  // field sequence, comma separated
                    case "threadnum":     threadNum = v; break;     // collector thread count
                    case "path":          path = v; break;          // source directories, comma separated
                    case "lineregex":     lineRegex = v; break;     // lines not matching are discarded
                    case "delimiter":     delimiter = v; break;     // input field delimiter (regex)
                    case "delimiter2":    delimiter2 = v; break;    // output field separator (literal)
                    case "includesuffix": includesuffix = v; break; // accepted file suffixes
                    case "noticeurl":     noticeUrl = v; break;     // completion-notification endpoint
                    default: break;                                 // unknown keys are ignored
                }
            }
        } catch (IOException e) {
            // Without a readable config every later step is meaningless.
            e.printStackTrace();
            System.exit(1);
        }

        // Kafka producer configuration.
        props = new Properties();
        props.put("bootstrap.servers", ip + ":" + port);
        props.put("acks", "all");
        props.put("key.serializer", StringSerializer.class.getName());
        props.put("value.serializer", StringSerializer.class.getName());

        fieldNum = fieldSquence.split(",").length;
        linePattern = Pattern.compile(lineRegex);

        ExecutorService es = Executors.newFixedThreadPool(Integer.parseInt(threadNum));
        // Walk every configured directory and submit one task per matching file.
        for (String dirPath : path.split(",")) {
            File[] files = new File(dirPath).listFiles();
            if (files == null) {
                // Directory missing or unreadable — previously an NPE.
                System.err.println("目录不可读: " + dirPath);
                continue;
            }
            for (final File file : files) {
                for (String suffix : includesuffix.split(",")) {
                    if (file.getAbsolutePath().endsWith(suffix)) {
                        es.submit(new Runnable() {
                            @Override
                            public void run() {
                                send(file);
                            }
                        });
                        break; // submit once even if several suffixes match
                    }
                }
            }
        }
        es.shutdown();

        // After the pool drains, poll the downstream service until it
        // acknowledges the notification with HTTP 200.
        boolean stop = false, noticed = false;
        try {
            while (!stop || !noticed) {
                if (es.isTerminated()) {
                    stop = true;
                }
                Thread.sleep(2000);
                if (stop) {
                    noticed = connectSuccess(noticeUrl);
                }
            }
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt(); // preserve interrupt status
        }
    }

    /**
     * Reads one file line by line and publishes each valid line to Kafka,
     * then renames the file with a {@code .COMPLETED} suffix.
     *
     * <p>A line is valid when it matches {@link #linePattern} and splits into
     * at least {@link #fieldNum} fields. The record value is the fields joined
     * by {@link #delimiter2} followed by the absolute file path; the record key
     * is the current epoch millis.
     *
     * @param file the file to collect; renamed even if sending partially failed
     *             (original behavior, kept intentionally)
     */
    public static void send(File file) {
        try (BufferedReader bf = new BufferedReader(new FileReader(file));
             Producer<String, String> producer =
                     new KafkaProducer<>(props, new StringSerializer(), new StringSerializer())) {
            String line;
            while ((line = bf.readLine()) != null) {
                line = line.trim();
                if (!linePattern.matcher(line).matches()) {
                    continue; // drop lines that do not match the configured pattern
                }
                String[] fields = line.split(delimiter);
                if (fields.length < fieldNum) {
                    continue; // drop incomplete lines
                }
                StringBuffer sb = new StringBuffer();
                for (String fieldValue : fields) {
                    sb.append(fieldValue).append(delimiter2);
                }
                sb.append(file.getAbsolutePath()); // provenance: which file the line came from
                producer.send(
                        new ProducerRecord<String, String>(topic,
                                String.valueOf(new Date().getTime()), sb.toString()),
                        new Callback() {
                            @Override
                            public void onCompletion(RecordMetadata metadata, Exception exception) {
                                // metadata may be null on failure; string concat tolerates null
                                if (exception != null) {
                                    System.out.println("插入数据失败" + metadata + ",e:" + exception);
                                }
                            }
                        });
            }
        } catch (Exception e) {
            System.out.println(e.toString());
        }
        // Mark the file as processed so it is not collected again.
        file.renameTo(new File(file.getAbsolutePath() + ".COMPLETED"));
    }

    /**
     * Issues a GET request to the given URL.
     *
     * @param path the URL to contact (previously ignored in favor of the
     *             static {@link #noticeUrl} field)
     * @return true iff the endpoint answered HTTP 200
     */
    public static boolean connectSuccess(String path) {
        try {
            URL url = new URL(path);
            HttpURLConnection con = (HttpURLConnection) url.openConnection();
            return con.getResponseCode() == 200;
        } catch (Exception e) {
            return false;
        }
    }
}
配置文件编写customer2kafka.conf
ip=192.168.1.91
threadnum=20
port=9092
topic=customertopic
path=/home/ftpuser/customer
includesuffix=txt
lineregex=^#\d.*$
delimiter=\s+
noticeurl=http://192.168.1.92:6009/schedule/customer
fieldsquence=id,name,score
maven打包执行:
java -jar file2kafka-2.0.jar /opt/app/file2kafka/customer2kafka.conf
pom.xml
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.shenyuchong</groupId>
<artifactId>file2kafka</artifactId>
<version>2.0</version>
<packaging>jar</packaging>
<name>file2kafka</name>
<url>http://maven.apache.org</url>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka-clients</artifactId>
<version>2.0.0</version><!--$NO-MVN-MAN-VER$ -->
</dependency>
</dependencies>
<build>
<sourceDirectory>src</sourceDirectory>
<plugins>
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<appendAssemblyId>false</appendAssemblyId>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
<archive>
<manifest>
<mainClass>com.shenyuchong.App</mainClass>
</manifest>
</archive>
</configuration>
<executions>
<execution>
<id>make-assembly</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>