java实现大批量json文件数据去重

上周从数据采集部门拿到一批共400份的json文件,每个文件里有30w+个json对象(每行一个),每个对象里都带有用户id(下面代码读取的字段名是user_id,输出时写成uid),需要把id重复的数据去掉.

本人电脑4核8G已经不能满足了,总是内存不够用.所以在服务器上写了一下(配置8核128G) ,结果读取文件以及去重都没问题,

在最后写入时又是内存不够了.所以总结了一下,整体的思路是:先把400份文件按每20份一组分成20组,每组去重后合并成1份,这样处理之后就只剩20份了;再把这20份json文件去重整理成一份.

准备工作: 400份json文件,每个文件名称为1 (1).json..........1 (400).json

直接上代码

package com.ufc.user.controller;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;

/**
 * Two-stage de-duplication of user ids spread over many JSON files.
 *
 * <p>Stage 1 ({@link #testjson1(int)}) reads the raw files in batches of
 * {@link #BATCH_SIZE}; every raw file holds one JSON object per line with a
 * {@code user_id} entry. Each batch is reduced to one file that contains a
 * JSON array of {@code {"uid": ...}} objects without duplicates.
 *
 * <p>Stage 2 ({@link #testjson2(int)} followed by {@link #str()}) merges the
 * batch files into a single set and writes it as {@code 666.json}, one uid per
 * line.
 */
public class test2 {

    /** Number of raw input files, named {@code 1 (1).json} .. {@code 1 (400).json}. */
    private static final int RAW_FILE_COUNT = 400;

    /** Number of raw files that are folded into one intermediate file. */
    private static final int BATCH_SIZE = 20;

    /** Directory holding the raw input files. */
    private static final String RAW_DIR = "G:\\gjx_data\\json\\";

    /** Directory receiving the intermediate batch files and the final result. */
    private static final String MERGED_DIR = "G:\\gjx_data\\json1\\";

    /** Name (without extension) of the final result file. */
    private static final int RESULT_FILE_ID = 666;

    /**
     * Runs both stages: 400 raw files -> 20 batch files -> one result file.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        test2 job = new test2();

        // Stage 1: the loop variable is the exclusive upper file index of each
        // batch (21, 41, ..., 401), which is also the name of the batch file.
        for (int end = BATCH_SIZE + 1; end <= RAW_FILE_COUNT + 1; end += BATCH_SIZE) {
            System.out.println(end);
            job.testjson1(end);
        }

        // Each batch file is duplicate-free on its own, but two different batch
        // files may still share uids, so stage 2 merges all of them in one set.
        int batchCount = RAW_FILE_COUNT / BATCH_SIZE;
        for (int batch = 1; batch <= batchCount; batch++) {
            System.out.println(batch);
            job.testjson2(batch);
        }

        // Export the merged set as 666.json.
        job.str();
    }

    /**
     * De-duplicates the {@code user_id} values of the raw files
     * {@code 1 (rr-20).json} .. {@code 1 (rr-1).json} and writes them as a JSON
     * array of {@code {"uid": ...}} objects to {@code <rr>.json}.
     *
     * <p>Any failure (missing file, malformed JSON, write error) is reported on
     * stderr and aborts this batch only; nothing is thrown to the caller.
     *
     * @param rr exclusive upper index of the batch; also the output file name
     */
    public void testjson1(int rr) {
        Set<String> userIds = new HashSet<String>();
        try {
            for (int r = rr - BATCH_SIZE; r < rr; r++) {
                System.out.println("i=" + r);
                collectUserIds(RAW_DIR + "1 (" + r + ").json", userIds);
            }

            JSONArray arr = new JSONArray();
            for (String userId : userIds) {
                JSONObject obj = new JSONObject();
                obj.put("uid", userId);
                arr.add(obj);
            }
            wirter1(arr.toJSONString(), rr);
        } catch (Exception e) {
            System.err.println("batch " + rr + " failed");
            e.printStackTrace();
        }
    }

    /**
     * Streams one raw file line by line and adds every {@code user_id} to
     * {@code sink}. Reading line by line avoids holding the whole file as a
     * byte array, a String and a String[] at the same time.
     *
     * @param path file with one JSON object per line, read as UTF-8
     * @param sink set receiving the ids
     * @throws IOException if the file cannot be read
     */
    private static void collectUserIds(String path, Set<String> sink) throws IOException {
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(path), StandardCharsets.UTF_8))) {
            int lineNo = 0;
            String line;
            while ((line = reader.readLine()) != null) {
                lineNo++;
                // Blank lines parse to null and would otherwise abort the batch.
                if (line.trim().isEmpty()) {
                    continue;
                }
                JSONObject record = JSON.parseObject(line);
                String userId = record == null ? null : record.getString("user_id");
                if (userId == null) {
                    System.err.println("skipped " + path + ":" + lineNo + " (no user_id)");
                    continue;
                }
                sink.add(userId);
            }
        }
    }

    /**
     * Writes {@code s} completely, encoded as UTF-8, to {@code <a>.json} in the
     * output directory, replacing an existing file. The output directory is
     * created when it does not exist yet.
     *
     * @param s text to write
     * @param a numeric file name (without the {@code .json} extension)
     * @throws Exception if the directory cannot be created or the write fails
     */
    public void wirter1(String s, int a) throws Exception {
        File f = new File(MERGED_DIR + a + ".json");
        File dir = f.getParentFile();
        if (dir != null && !dir.isDirectory() && !dir.mkdirs()) {
            throw new IOException("cannot create output directory " + dir);
        }
        try (BufferedWriter out = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(f), StandardCharsets.UTF_8))) {
            out.write(s);
        }
        // Only reached when the file was written and closed successfully.
        System.out.println("end");
    }

    /** Accumulates the uids of all batch files read by {@link #testjson2(int)}. */
    Set<String> testSet1 = new HashSet<String>();

    /**
     * Adds the uids of batch file number {@code rr} to {@link #testSet1}.
     *
     * <p>The file {@code 1 (rr).json} is used when it exists (manually renamed
     * batch files); otherwise the name produced by {@link #testjson1(int)} for
     * that batch, {@code <20*rr+1>.json}, is read, so no renaming is required.
     * Files whose closing {@code ]} is missing are accepted as well.
     *
     * <p>Failures are reported on stderr; nothing is thrown to the caller.
     *
     * @param rr 1-based number of the batch file
     */
    public void testjson2(int rr) {
        try {
            System.out.println("i=" + rr);
            File input = new File(MERGED_DIR + "1 (" + rr + ").json");
            if (!input.exists()) {
                input = new File(MERGED_DIR + (rr * BATCH_SIZE + 1) + ".json");
            }

            String text = new String(downFileByte(input.getPath()), StandardCharsets.UTF_8).trim();
            // Batch files written without their last character lack the closing bracket.
            if (!text.endsWith("]")) {
                text = text + "]";
            }

            JSONArray records = JSONArray.parseArray(text);
            for (int i = 0; i < records.size(); i++) {
                JSONObject record = records.getJSONObject(i);
                String uid = record == null ? null : record.getString("uid");
                if (uid != null) {
                    testSet1.add(uid);
                }
            }
        } catch (Exception e) {
            System.err.println("merge of batch file " + rr + " failed");
            e.printStackTrace();
        }
    }

    /** Scratch buffer used by {@link #str()}; it is reset on every call. */
    StringBuffer sb=new StringBuffer();

    /**
     * Writes all uids collected in {@link #testSet1} to {@code 666.json}, one
     * uid per line (lines separated by {@code \n}, no trailing newline).
     * Failures are reported on stderr.
     */
    public void str() {
        // Reset so that a repeated call does not append to the previous output.
        sb.setLength(0);
        Iterator<String> it = testSet1.iterator();
        while (it.hasNext()) {
            sb.append(it.next());
            if (it.hasNext()) {
                sb.append('\n');
            }
        }
        try {
            wirter1(sb.toString(), RESULT_FILE_ID);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads a whole file into memory.
     *
     * @param downLoadPath path of the file to read
     * @return the file content
     * @throws Exception if the file cannot be opened or read
     */
    public static byte[] downFileByte(String downLoadPath) throws Exception {
        // try-with-resources closes the stream even when a read fails.
        try (InputStream ins = new FileInputStream(downLoadPath);
             ByteArrayOutputStream byteOut = new ByteArrayOutputStream()) {
            byte[] buf = new byte[5 * 1024 * 1024];
            int read;
            while ((read = ins.read(buf, 0, buf.length)) != -1) {
                byteOut.write(buf, 0, read);
            }
            return byteOut.toByteArray();
        }
    }

}

java实现大批量json文件数据去重

猜你喜欢