1.从impala表导出文件,导出格式为Csv
impala-shell -q "select * from qscar.kcar " -B --output_delimiter="," --print_header -o kcar.csv
2.将Csv文件转化为指定格式(索引头+jsonobject),方便插入elasticsearch,以下为代码展示
import org.json.JSONArray;
import org.json.JSONObject;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
/**
 * Converts a comma-delimited text export (one header line followed by data
 * lines) into an Elasticsearch bulk file: for every data line an action line
 * is written, followed by a JSON object mapping each header name to the
 * value found in the same column.
 */
public class Change {
    /** Action line written before every document in the bulk file. */
    private static final String BULK_HEADER =
            "{ \"index\" : { \"_index\" : \"zjx\", \"_type\" : \"type2\" } }";

    /** Output file; it is opened in append mode, so repeated runs accumulate. */
    private static final String OUTPUT_PATH = "D:/test2.json";

    /** Input file used by {@link #main(String[])} when no argument is given. */
    private static final String DEFAULT_INPUT_PATH = "D:/kcar.csv";

    /** Column separator of the input file. */
    private static final String DELIMITER = ",";

    /**
     * Reads every line of a text file into memory.
     *
     * <p>The file is decoded as UTF-8 instead of the platform default so that
     * non-ASCII values are read the same way on every machine.
     *
     * @param path location of the file to read
     * @return the lines in file order; if reading fails the stack trace is
     *         printed and the lines read so far (possibly none) are returned
     */
    public List<String> readCsv(String path) {
        List<String> lines = new ArrayList<>();
        // try-with-resources guarantees the reader is closed even when a read fails.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(path), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                lines.add(line);
            }
        } catch (IOException | RuntimeException e) {
            // Same best-effort contract as before: report and return what we have.
            e.printStackTrace();
        }
        return lines;
    }

    /**
     * Appends one bulk action line plus one JSON document per data line to
     * the output file.
     *
     * <p>The first element of {@code list} is treated as the header line and
     * supplies the JSON field names; every following element is one record.
     * Nothing is written when there is no header or no data line. Null and
     * empty elements are skipped. A record with fewer columns than the header
     * produces a document with only the columns that are present; surplus
     * columns are ignored.
     *
     * @param list header line followed by data lines, as returned by
     *             {@link #readCsv(String)}
     */
    public void ToJson(List<?> list) {
        // Need a header plus at least one record; also avoids creating an empty file.
        if (list == null || list.size() < 2 || list.get(0) == null) {
            return;
        }
        String[] titles = list.get(0).toString().split(DELIMITER);

        // Open the output once for the whole batch instead of once per record.
        try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(OUTPUT_PATH, true), StandardCharsets.UTF_8))) {
            for (int i = 1; i < list.size(); i++) {
                Object row = list.get(i);
                if (row == null || row.toString().isEmpty()) {
                    continue;
                }
                writer.write(BULK_HEADER);
                writer.write('\n');
                writer.write(toJsonObject(titles, row.toString()).toString());
                writer.write('\n');
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Builds the JSON document for one data line, pairing header names with column values. */
    private static JSONObject toJsonObject(String[] titles, String row) {
        // Limit -1 keeps trailing empty columns, which a plain split would drop.
        String[] values = row.split(DELIMITER, -1);
        int fieldCount = Math.min(titles.length, values.length);
        JSONObject document = new JSONObject();
        for (int j = 0; j < fieldCount; j++) {
            document.put(titles[j], values[j]);
        }
        return document;
    }

    /**
     * Converts the input file to the bulk output file.
     *
     * @param args optional; {@code args[0]} overrides the default input path
     */
    public static void main(String[] args) {
        String path = args.length > 0 ? args[0] : DEFAULT_INPUT_PATH;
        Change change = new Change();
        List<String> lines = change.readCsv(path);
        change.ToJson(lines);
    }
}
进行转化时,需要控制行数,因为输出的文件大小需要控制在100M以下,不然插入es时会报错,所以最好分批次插入。
3.向elasticsearch插入数据
首先,bulk命令格式:
curl -XPOST localhost:9200/_bulk --data-binary @test2.json
test2.json是你所需要读取的json格式文件,文件格式:
{ "index" : { "_index" : "test", "_type" : "type2" } }
{"id":"60","areaname":"重庆市","tid":"157","mid":"158","ctime":"20180114160801","ctime1":"2018-01-14 16:08:01"}
{ "index" : { "_index" : "test", "_type" : "type2" } }
{"id":"60","areaname":"重庆市","tid":"157","mid":"158","ctime":"20180114160801","ctime1":"2018-01-14 16:08:01"}
每一条数据记录的前面都要单独加一行包含索引和类型信息的操作行(即上面的 index 行),并且文件最后一行也必须以换行符结尾。