提取出某日访问网站次数最多的那K个IP之并发版

前边提到了单线程的实现，这里贴出多线程版，此处主要用多线程去处理hash后的小文件：

    Java代码   
    
  
package com.kingdee.gmis.mass.data.ips;  
  
import static com.kingdee.gmis.mass.data.ips.MassIP.K10;  
import static com.kingdee.gmis.mass.data.ips.MassIP.getPartitionFile;  
import static com.kingdee.gmis.mass.data.ips.MassIP.partationCount;  
import static com.kingdee.gmis.mass.data.ips.MassIP.printResult;  
import static com.kingdee.gmis.mass.data.ips.MassIP.*;  
  
import java.io.BufferedInputStream;  
import java.io.DataInputStream;  
import java.io.File;  
import java.io.FileInputStream;  
import java.io.IOException;  
import java.util.HashMap;  
import java.util.Map;  
import java.util.concurrent.atomic.AtomicInteger;  
  
import com.kingdee.gmis.common.TopNHeap;  
import com.kingdee.gmis.mass.data.ips.MassIP.IPInfo;  
  
public class ConcurrentMassIP {  
  
    private static int workerCnt = 3;  
  
    /** 
     * @param args 
     * @throws Exception 
     */  
    public static void main(String[] args) throws Exception {  
//       generateMassIp("ip", "ips.txt", 100000000);  
//       generatePartitionFile("ip", "ips.txt", 100);  
        searchTopN(20);  
    }  
  
    private static AtomicInteger curIdx = new AtomicInteger(-1);  
    static File[] smallFiles;  
    static TopNHeap<IPInfo> destHeap;  
  
    /** 
     * 查找出现次数最多的K个ip地址 
     *  
     * @param count 
     * @throws Exception 
     */  
    public static void searchTopN(int count) throws Exception {  
        Thread.sleep(10000);  
        long start = System.currentTimeMillis();  
        smallFiles = getPartitionFile("ip", partationCount);  
        destHeap = new TopNHeap<MassIP.IPInfo>(count);  
        int cnt = workerCnt;  
        synchronized (lock) {  
            for (int i = 0; i < cnt; i++) {  
                new Thread(new Worker(i, count)).start();  
            }  
            lock.wait();  
            printResult(destHeap);  
        }  
        System.out.println("Total spend " + (System.currentTimeMillis() - start) + " ms");  
    }  
  
    static Object lock = new Object();  
  
    public static synchronized void mergeToResult(TopNHeap<IPInfo> srcHeap) {  
        try {  
            destHeap.mergeHeap(srcHeap);  
        } finally {  
            if (--workerCnt == 0) {  
                synchronized (lock) {  
                    lock.notify();  
                }  
            }  
        }  
    }  
  
    static class Worker implements Runnable {  
        private int topCount;  
        private int id;  
  
        public Worker(int id, int count) {  
            this.id = id;  
            this.topCount = count;  
        }  
  
        @Override  
        public void run() {  
            int curFileIdx;  
            DataInputStream dis = null;  
            Map<Integer, Integer> ipCountMap = new HashMap<Integer, Integer>();  
            TopNHeap<IPInfo> heap = new TopNHeap<IPInfo>(topCount);  
            int processCnt = 0;  
            while ((curFileIdx = curIdx.addAndGet(1)) < partationCount) {  
                processCnt++;  
                ipCountMap.clear();  
                try {  
                    dis = new DataInputStream(new BufferedInputStream(  
                            new FileInputStream(smallFiles[curFileIdx]), K10));  
                    while (dis.available() > 0) {  
                        int ip = dis.readInt();  
                        Integer cnt = ipCountMap.get(ip);  
                        ipCountMap.put(ip, cnt == null ? 1 : cnt + 1);  
                    }  
                    searchMostCountIps(ipCountMap, heap);  
                } catch (Exception e) {  
                    throw new RuntimeException(e);  
                } finally {  
                    if (dis != null) {  
                        try {  
                            dis.close();  
                        } catch (IOException e) {  
                            e.printStackTrace();  
                        }  
                    }  
                }  
            }  
            System.out.println("Thread " + this.id + " process " + processCnt  
                    + " files.");  
            mergeToResult(heap);  
        }  
    }  
}  

测试了下，3线程是120s左右，串行是320s左右，的确提高了很多。测试机子是4核cpu，如果启4个线程，机器都变得很卡，cpu居高不下。另jvm启动参数为：

    Java代码   
    
  
-server  
-Xmx1024m  
-Xms1024m  
-Xmn600m  
-XX:+UseConcMarkSweepGC  
-XX:ParallelGCThreads=4  

因为此处理过程中，大多数对象存活周期并不长，所以可以把新生代设置大一些。堆初始化设置大些，避免minor gc的时候才去扩展内存大小，因为可以预料到程序一旦启动，加载的内存的东西就会很多。

另外，此处垃圾收集器是用了cms，老生代内存回收就用了cms，新生代用了PurNew，并行收集的，设置gc线程数等于cpu内核数。当然这里也可以设置成-XX:+UseParallelGC、-XX:+UseParallelOldGC都可以，此处主要是新生代内存回收频繁，所以一定要把新生代设置成并发或并行版本的。

通过-server把虚拟机启动为server模式，这样运行时候会启用c2级别的JIT优化，能获得更高质量的编译代码。当然server模式下启动的jvm，默认使用的gc收集器跟-XX:UseParallelGC使用的一样

http://yueyemaitian.iteye.com/blog/1207913

提取出某日访问网站次数最多的那K个IP之并发版

猜你喜欢