背景:
文本文件a.txt,里面每行存放了一个URL。
需求:
统计出现频率最高的前100个URL(TOP100)。
NOTE:简单写了个demo,处理逻辑:1、先把大数据文件按行数分割为多个小文件;2、每个文件启动一个线程分析文件内容。
HELP:数据量在100万条以下时,耗时在1分钟以内;数据量超过200万条时处理很慢,并且多线程读取文件时会出现内存溢出(OutOfMemoryError)。
package test; import java.io.BufferedWriter; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStreamWriter; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Scanner; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; public class SortURL { public static String FilePath = "a.txt"; public static int rows = 10000*10; public static List<String> fileList = null; public static ConcurrentHashMap<String, Integer> urlMap = new ConcurrentHashMap<>(); public static void main(String[] args) { //拆分文件 cutFile(FilePath,rows); //多线程处理文件,排序文件内容 threadFile(); } /** * 排序文件内容 */ public static void sortMap() { // 频率排序 List<Map.Entry<String, Integer>> sortList = new ArrayList<>(urlMap.entrySet()); Collections.sort(sortList, new Comparator<Map.Entry<String, Integer>>() { @Override public int compare(Entry<String, Integer> o1, Entry<String, Integer> o2) { // TODO Auto-generated method stub return o2.getValue().compareTo(o1.getValue()); } }); //取前100 最多频率 // List<String> url = new ArrayList<>(); for(int i=0;i<sortList.size();i++) { if (i > 100) { break; } System.out.println("URL:"+sortList.get(i).getKey() +" ---- 出现频率:"+sortList.get(i).getValue()); } } /** * 创建线程池 ,启动线程处理文件 */ public static void threadFile() { System.out.println("多线程分析文件...."); long begin = System.currentTimeMillis(); //线程数 最好依据cpu 分配 ExecutorService es = Executors.newCachedThreadPool(); SortURL su = new SortURL(); for (int i=0 ; i< fileList.size();i++) { File file = new File(fileList.get(i)); if (file.isFile()) { es.execute(su.new readFile(file)); } else { System.out.println("未找到文件"); } } es.shutdown(); while(true) { if (es.isTerminated()) { long end = 
System.currentTimeMillis(); System.out.println("解析 "+fileList.size()+" 个文件 耗时:"+(end-begin)+" 毫秒"); sortMap(); break; } } } /** * 线程 解析文件汇总url * @author ThinkPad * */ private class readFile implements Runnable { File tFile = null; public readFile(File f) { // TODO Auto-generated constructor stub this.tFile = f; } @Override public void run() { resolverFile(); } /** * 解析文件 */ private void resolverFile() { FileInputStream fis = null; Scanner sc = null; Map<String, Integer> map = new HashMap<>(); try { fis = new FileInputStream(tFile); sc = new Scanner(fis); while(sc.hasNextLine()) { String len = sc.nextLine().trim(); if (map.containsKey(len)) { int mValue = map.get(len); map.put(len, mValue+1); }else { map.put(len, 1); } } sc.close(); //合并总得urlMap ConcurrentHashMap 线程安全 if(urlMap.size() == 0) { urlMap.putAll(map); } else { for (String key : map.keySet()) { if (urlMap.containsKey(key)) { int mValue = urlMap.get(key) + map.get(key); urlMap.put(key, mValue); }else { urlMap.put(key, 1); } } } // //清空临时map内存 // map.clear(); System.out.println(tFile.getName()+" mapsize:"+map.size() + " urlmapSize:"+urlMap.size()); } catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } /** * 大文件切割 * @param sourceFile * @param curRows */ public static void cutFile(String sourceFile,int curRows) { System.out.println("开始拆每 "+curRows+" 行,拆分文件"); FileInputStream inputstream = null; Scanner sc = null; StringBuilder sbu = null; BufferedWriter bw = null; // try { inputstream = new FileInputStream(sourceFile); sbu = new StringBuilder(); fileList = new ArrayList<>(); long begin = System.currentTimeMillis(); // Scanner 方法消耗内存低 sc = new Scanner(inputstream); int i = 1; while(sc.hasNextLine()) { sbu.append(sc.nextLine()).append("\r\n"); if ((i % curRows) == 0) { bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File(sourceFile+i+".txt")),"UTF-8")); bw.write(sbu.toString()); fileList.add(sourceFile+i+".txt"); bw.close(); sbu.setLength(0); } 
i++; } // 余下行数生成文件 if(((i-1) % curRows) != 0 ) { bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File(sourceFile+i+".txt")),"UTF-8")); bw.write(sbu.toString()); fileList.add(sourceFile+i+".txt"); bw.close(); sbu.setLength(0); } long end = System.currentTimeMillis(); System.out.println("切割文件耗时: "+(end - begin)+" 毫秒"); inputstream.close(); sc.close(); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } finally { } } }