提取出某日访问网站次数最多的那K个IP

提取出某日访问百度次数最多的那K个IP

 

基本思路:ip最多2^32个,放入内存也要40G,基本不现实。所以只有用外排序,把ip分割成到不同的小文件里,然后统计次数后,汇总。另外,ip本质就是一32bit的数值,不要拘泥于字符串表示的ip。

思路也可参考此处问题一

基于以上思路,看代码:

 

package com.kingdee.gmis.ips;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Random;

import com.kingdee.gmis.common.TopNHeap;
import com.kingdee.gmis.data.FileUtils;

/**
 * 提取出某日访问百度次数最多的那K个IP。<br>
 * 注意,此处简化起见,没有处理重复的情况,即有多个ip出现次数一样多
 * 
 * @author Administrator
 * 
 */
public class MassIP {
	public static int K10 = 1024 * 10;
	public static int M1 = 1024 * 1024;
	public static int M4 = 4 * M1;
	private static int partationCount = 100;

	/**
	 * @param args
	 * @throws IOException
	 */
	public static void main(String[] args) throws IOException {
		generateMassIp("ip", "ips.txt", 100000000);
		generatePartitionFile("ip", "ips.txt", 100);
		searchTopN(10);
		// searchTopN2("ip", "ips.txt", 10);
	}

	/**
	 * 如果是1亿条记录,直接放到内存排序会如何呢?
	 * 
	 * @param srcDirName
	 * @param srcFileName
	 * @param count
	 * @throws IOException
	 */
	public static void searchTopN2(String srcDirName, String srcFileName, int count) throws IOException {
		Map<Integer, Integer> ipCountMap = new HashMap<Integer, Integer>();
		File file = FileUtils.getFile(srcDirName, srcFileName);
		BufferedReader br = null;
		try {
			br = new BufferedReader(new FileReader(file), M4);
			String s;
			int i = 0, j = 0;
			while ((s = br.readLine()) != null) {
				int ip = parseIp2Int(s);
				Integer cnt = ipCountMap.get(ip);
				if (cnt == null) {
					i++;
					if (i % 10000000 == 0) {
						System.out.println(i);
					}
				}
				j++;
				if (j % 10000000 == 0) {
					System.out.println(j);
				}
				ipCountMap.put(ip, cnt == null ? 1 : cnt + 1);
			}
		} finally {
			if (br != null) {
				try {
					br.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
		TopNHeap<IPInfo> heap = new TopNHeap<IPInfo>(count);
		searchMostCountIps(ipCountMap, heap);
		printResult(heap);
	}

	/**
	 * 查找出现次数最多的K个ip地址
	 * 
	 * @param count
	 * @throws IOException
	 */
	public static void searchTopN(int count) throws IOException {
		File[] smallFiles = getPartitionFile("ip", partationCount);
		DataInputStream dis = null;
		Map<Integer, Integer> ipCountMap = new HashMap<Integer, Integer>();
		TopNHeap<IPInfo> heap = new TopNHeap<IPInfo>(count);
		for (int i = 0; i < partationCount; i++) {
			ipCountMap.clear();
			try {
				dis = new DataInputStream(new BufferedInputStream(new FileInputStream(smallFiles[i]), K10));
				while (dis.available() > 0) {
					int ip = dis.readInt();
					Integer cnt = ipCountMap.get(ip);
					ipCountMap.put(ip, cnt == null ? 1 : cnt + 1);
				}
				searchMostCountIps(ipCountMap, heap);
			} finally {
				if (dis != null) {
					try {
						dis.close();
					} catch (IOException e) {
						e.printStackTrace();
					}
				}
			}
		}
		printResult(heap);
	}

	/**
	 * 打印结果集
	 * 
	 * @param heap
	 */
	private static void printResult(TopNHeap<IPInfo> heap) {
		while (heap.hasNext()) {
			System.out.println(heap.removeTop().toString());
		}
	}

	/**
	 * 查找出现次数最多的ip
	 * 
	 * @param map
	 * @param heap
	 */
	private static void searchMostCountIps(Map<Integer, Integer> map, TopNHeap<IPInfo> heap) {
		Iterator<Integer> iter = map.keySet().iterator();
		Integer key = null;
		while (iter.hasNext()) {
			key = iter.next();
			int count = map.get(key);
			if (!heap.isFull() || count > heap.getHeapTop().getCount()) {
				heap.addToHeap(new IPInfo(count, key));
			}
		}
	}

	/**
	 * 把32位int值转换成ip字符串
	 * 
	 * @param value
	 * @return
	 */
	public static String parseInt2Ip(int value) {
		StringBuilder sb = new StringBuilder(15);
		String[] segs = new String[4];
		for (int i = 0; i < 4; i++) {
			segs[3 - i] = String.valueOf((0xFF & value));
			value >>= 8;
		}
		for (int i = 0; i < 4; i++) {
			sb.append(segs[i]).append(".");
		}
		sb.deleteCharAt(sb.length() - 1);
		return sb.toString();
	}

	/**
	 * 把ip字符串转换成int值
	 * 
	 * @param ip
	 * @return
	 */
	public static int parseIp2Int(String ip) {
		String[] segs = ip.split("\\.");
		int rst = 0;
		for (int i = 0; i < segs.length; i++) {
			rst = (rst << 8) | Integer.valueOf(segs[i]);
		}
		return rst;
	}

	/**
	 * 随机生成ip
	 * 
	 * @param srcDirName
	 * @param srcFileName
	 * @param count
	 * @throws IOException
	 */
	public static void generateMassIp(String srcDirName, String srcFileName, int count) throws IOException {
		File file = FileUtils.createFile(srcDirName, srcFileName);
		BufferedWriter bw = null;
		try {
			bw = new BufferedWriter(new FileWriter(file));
			count = count / 100;
			Random rd = new Random(255);
			StringBuilder sb = new StringBuilder(1500);
			for (int i = 0; i < count; i++) {
				sb.delete(0, sb.length());
				for (int j = 0; j < 100; j++) {
					sb.append(rd.nextInt(100)).append(".");
					sb.append(rd.nextInt(150)).append(".");
					sb.append(rd.nextInt(256)).append(".");
					sb.append(rd.nextInt(256)).append("\n");
				}
				bw.write(sb.toString());
				bw.flush();
			}
		} finally {
			if (bw != null) {
				bw.close();
			}
		}
	}

	public static File[] getPartitionFile(String srcDirName, int count) throws IOException {
		File[] files = new File[count];
		for (int i = 0; i < count; i++) {
			files[i] = FileUtils.getFile(srcDirName, i + ".data");
		}
		return files;
	}

	/**
	 * 分割大文件到小文件中
	 * 
	 * @param srcDirName
	 * @param srcFileName
	 * @param count
	 * @return
	 * @throws IOException
	 */
	public static File[] generatePartitionFile(String srcDirName, String srcFileName, int count) throws IOException {
		File[] files = new File[count];
		DataOutputStream[] dops = new DataOutputStream[count];
		for (int i = 0; i < count; i++) {
			files[i] = FileUtils.createFile(srcDirName, i + ".data");
		}
		File file = FileUtils.getFile(srcDirName, srcFileName);
		BufferedReader br = null;
		// Buffered
		try {
			for (int i = 0; i < count; i++) {
				dops[i] = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(files[i]), K10));
			}
			br = new BufferedReader(new FileReader(file), M4);
			String s;
			while ((s = br.readLine()) != null) {
				int ip = parseIp2Int(s);
				dops[Math.abs(ip % count)].writeInt(ip);
			}
		} finally {
			for (int i = 0; i < count; i++) {
				if (dops[i] != null) {
					try {
						dops[i].close();
					} catch (IOException e) {
						e.printStackTrace();
					}
				}
			}
			if (br != null) {
				try {
					br.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
		return files;
	}

	/**
	 * 表示IP信息,存有次数和ip值。在放入堆中时候会用作存储
	 * 
	 * @author Administrator
	 * 
	 */
	static class IPInfo implements Comparable<IPInfo> {
		private int count;
		private int ipValue;

		public IPInfo(int count, int ipValue) {
			super();
			this.count = count;
			this.ipValue = ipValue;
		}

		public int getCount() {
			return count;
		}

		public void setCount(int count) {
			this.count = count;
		}

		public int getIpValue() {
			return ipValue;
		}

		public void setIpValue(int ipValue) {
			this.ipValue = ipValue;
		}

		public String getIp() {
			return parseInt2Ip(ipValue);
		}

		public int compareTo(IPInfo o) {
			if (this.count > o.getCount()) {
				return 1;
			} else if (this.count < o.getCount()) {
				return -1;
			} else {
				return 0;
			}
		}

		public String toString() {
			return this.getIp() + " -- " + this.count;
		}

	}
}

 

package com.kingdee.gmis.data;

import java.io.File;
import java.io.IOException;

public class FileUtils {

	public static String directory = "data";

	public static String getDirPath() {
		return System.getProperty("user.dir");
	}

	public static File createFile(String dirName, String fileName) throws IOException {
		String dataDir = getDirPath() + File.separator + directory;
		String dir = dirName != null ? dataDir + File.separator + dirName : dataDir;
		ensureDirectory(dir);
		String filePath = dir + File.separator + fileName;
		ensureFile(filePath);
		return new File(filePath);
	}

	public static File getFile(String dirName, String fileName) throws IOException {
		String dataDir = getDirPath() + File.separator + directory;
		String filePath = (dirName != null ? dataDir + File.separator + dirName : dataDir) + File.separator + fileName;
		return new File(filePath);
	}

	public static synchronized void ensureFile(String filePath) throws IOException {
		File dataFile = new File(filePath);
		if (!dataFile.exists()) {
			dataFile.createNewFile();
		}
	}

	public static synchronized void ensureDirectory(String dataDir) {
		File path = new File(dataDir);
		if (!path.exists()) {
			path.mkdirs();
		}
	}

	/**
	 * @param args
	 */
	public static void main(String[] args) {
		System.out.println(getDirPath());
	}

}

 

package com.kingdee.gmis.data.ips;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Random;

import com.kingdee.gmis.common.TopNHeap;
import com.kingdee.gmis.data.FileUtils;

/**
 * 提取出某日访问百度次数最多的那K个IP。<br>
 * 注意,此处简化起见,没有处理重复的情况,即有多个ip出现次数一样多
 * 
 * @author Administrator
 * 
 */
public class MassIP {
	public static int K10 = 1024 * 10;
	public static int M1 = 1024 * 1024;
	public static int M4 = 4 * M1;
	private static int partationCount = 100;

	/**
	 * @param args
	 * @throws IOException
	 */
	public static void main(String[] args) throws IOException {
		generateMassIp("ip", "ips.txt", 100000000);
		generatePartitionFile("ip", "ips.txt", 100);
		searchTopN(10);
		// searchTopN2("ip", "ips.txt", 10);
	}

	/**
	 * 如果是1亿条记录,直接放到内存排序会如何呢?
	 * 
	 * @param srcDirName
	 * @param srcFileName
	 * @param count
	 * @throws IOException
	 */
	public static void searchTopN2(String srcDirName, String srcFileName, int count) throws IOException {
		Map<Integer, Integer> ipCountMap = new HashMap<Integer, Integer>();
		File file = FileUtils.getFile(srcDirName, srcFileName);
		BufferedReader br = null;
		try {
			br = new BufferedReader(new FileReader(file), M4);
			String s;
			int i = 0, j = 0;
			while ((s = br.readLine()) != null) {
				int ip = parseIp2Int(s);
				Integer cnt = ipCountMap.get(ip);
				if (cnt == null) {
					i++;
					if (i % 10000000 == 0) {
						System.out.println(i);
					}
				}
				j++;
				if (j % 10000000 == 0) {
					System.out.println(j);
				}
				ipCountMap.put(ip, cnt == null ? 1 : cnt + 1);
			}
		} finally {
			if (br != null) {
				try {
					br.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
		TopNHeap<IPInfo> heap = new TopNHeap<IPInfo>(count);
		searchMostCountIps(ipCountMap, heap);
		printResult(heap);
	}

	/**
	 * 查找出现次数最多的K个ip地址
	 * 
	 * @param count
	 * @throws IOException
	 */
	public static void searchTopN(int count) throws IOException {
		File[] smallFiles = getPartitionFile("ip", partationCount);
		DataInputStream dis = null;
		Map<Integer, Integer> ipCountMap = new HashMap<Integer, Integer>();
		TopNHeap<IPInfo> heap = new TopNHeap<IPInfo>(count);
		for (int i = 0; i < partationCount; i++) {
			ipCountMap.clear();
			try {
				dis = new DataInputStream(new BufferedInputStream(new FileInputStream(smallFiles[i]), K10));
				while (dis.available() > 0) {
					int ip = dis.readInt();
					Integer cnt = ipCountMap.get(ip);
					ipCountMap.put(ip, cnt == null ? 1 : cnt + 1);
				}
				searchMostCountIps(ipCountMap, heap);
			} finally {
				if (dis != null) {
					try {
						dis.close();
					} catch (IOException e) {
						e.printStackTrace();
					}
				}
			}
		}
		printResult(heap);
	}

	/**
	 * 打印结果集
	 * 
	 * @param heap
	 */
	private static void printResult(TopNHeap<IPInfo> heap) {
		while (heap.hasNext()) {
			System.out.println(heap.removeTop().toString());
		}
	}

	/**
	 * 查找出现次数最多的ip
	 * 
	 * @param map
	 * @param heap
	 */
	private static void searchMostCountIps(Map<Integer, Integer> map, TopNHeap<IPInfo> heap) {
		Iterator<Integer> iter = map.keySet().iterator();
		Integer key = null;
		while (iter.hasNext()) {
			key = iter.next();
			int count = map.get(key);
			if (!heap.isFull() || count > heap.getHeapTop().getCount()) {
				heap.addToHeap(new IPInfo(count, key));
			}
		}
	}

	/**
	 * 把32位int值转换成ip字符串
	 * 
	 * @param value
	 * @return
	 */
	public static String parseInt2Ip(int value) {
		StringBuilder sb = new StringBuilder(15);
		String[] segs = new String[4];
		for (int i = 0; i < 4; i++) {
			segs[3 - i] = String.valueOf((0xFF & value));
			value >>= 8;
		}
		for (int i = 0; i < 4; i++) {
			sb.append(segs[i]).append(".");
		}
		sb.deleteCharAt(sb.length() - 1);
		return sb.toString();
	}

	/**
	 * 把ip字符串转换成int值
	 * 
	 * @param ip
	 * @return
	 */
	public static int parseIp2Int(String ip) {
		String[] segs = ip.split("\\.");
		int rst = 0;
		for (int i = 0; i < segs.length; i++) {
			rst = (rst << 8) | Integer.valueOf(segs[i]);
		}
		return rst;
	}

	/**
	 * 随机生成ip
	 * 
	 * @param srcDirName
	 * @param srcFileName
	 * @param count
	 * @throws IOException
	 */
	public static void generateMassIp(String srcDirName, String srcFileName, int count) throws IOException {
		File file = FileUtils.createFile(srcDirName, srcFileName);
		BufferedWriter bw = null;
		try {
			bw = new BufferedWriter(new FileWriter(file));
			count = count / 100;
			Random rd = new Random(255);
			StringBuilder sb = new StringBuilder(1500);
			for (int i = 0; i < count; i++) {
				sb.delete(0, sb.length());
				for (int j = 0; j < 100; j++) {
					sb.append(rd.nextInt(100)).append(".");
					sb.append(rd.nextInt(150)).append(".");
					sb.append(rd.nextInt(256)).append(".");
					sb.append(rd.nextInt(256)).append("\n");
				}
				bw.write(sb.toString());
				bw.flush();
			}
		} finally {
			if (bw != null) {
				bw.close();
			}
		}
	}

	public static File[] getPartitionFile(String srcDirName, int count) throws IOException {
		File[] files = new File[count];
		for (int i = 0; i < count; i++) {
			files[i] = FileUtils.getFile(srcDirName, i + ".data");
		}
		return files;
	}

	/**
	 * 分割大文件到小文件中
	 * 
	 * @param srcDirName
	 * @param srcFileName
	 * @param count
	 * @return
	 * @throws IOException
	 */
	public static File[] generatePartitionFile(String srcDirName, String srcFileName, int count) throws IOException {
		File[] files = new File[count];
		DataOutputStream[] dops = new DataOutputStream[count];
		for (int i = 0; i < count; i++) {
			files[i] = FileUtils.createFile(srcDirName, i + ".data");
		}
		File file = FileUtils.getFile(srcDirName, srcFileName);
		BufferedReader br = null;
		// Buffered
		try {
			for (int i = 0; i < count; i++) {
				dops[i] = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(files[i]), K10));
			}
			br = new BufferedReader(new FileReader(file), M4);
			String s;
			while ((s = br.readLine()) != null) {
				int ip = parseIp2Int(s);
				dops[Math.abs(ip % count)].writeInt(ip);
			}
		} finally {
			for (int i = 0; i < count; i++) {
				if (dops[i] != null) {
					try {
						dops[i].close();
					} catch (IOException e) {
						e.printStackTrace();
					}
				}
			}
			if (br != null) {
				try {
					br.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
		return files;
	}

	/**
	 * 表示IP信息,存有次数和ip值。在放入堆中时候会用作存储
	 * 
	 * @author Administrator
	 * 
	 */
	static class IPInfo implements Comparable<IPInfo> {
		private int count;
		private int ipValue;

		public IPInfo(int count, int ipValue) {
			super();
			this.count = count;
			this.ipValue = ipValue;
		}

		public int getCount() {
			return count;
		}

		public void setCount(int count) {
			this.count = count;
		}

		public int getIpValue() {
			return ipValue;
		}

		public void setIpValue(int ipValue) {
			this.ipValue = ipValue;
		}

		public String getIp() {
			return parseInt2Ip(ipValue);
		}

		public int compareTo(IPInfo o) {
			if (this.count > o.getCount()) {
				return 1;
			} else if (this.count < o.getCount()) {
				return -1;
			} else {
				return 0;
			}
		}

		public String toString() {
			return this.getIp() + " -- " + this.count;
		}

	}
}

 

package com.kingdee.gmis.common;

import java.util.Random;

/**
 * 
 * @author hua_fan
 * 
 */
public class TopNHeap<T extends Comparable<? super T>> {

	private int size;
	private int maxSize;
	private Object[] data = new Object[256];

	public TopNHeap(T[] data) {
		super();
		this.initHeap(data);
	}

	public TopNHeap(int fixedSize) {
		super();
		assert fixedSize >= 1;
		this.maxSize = fixedSize;
	}
	

	@SuppressWarnings("unchecked")
	public void initHeap(T[] data) {
		assert data.length >= 1;
		if (this.data.length <= this.size) {
			this.data = new Object[(int) (data.length * 1.5)];
		}
		this.maxSize = this.size = data.length;
		System.arraycopy(data, 0, this.data, 0, this.size);
		int startPos = this.getParentIndex(this.size - 1);
		for (int i = startPos; i >= 0; i--) {
			this.shiftdown(i);
		}
	}

	@SuppressWarnings("unchecked")
	public T getHeapTop() {
		return (T) this.data[0];
	}

	/**
	 * 加元素到堆中,但是保持堆 的大小
	 * 
	 * @param value
	 */
	public void addToHeap(T value) {
		if (this.maxSize > this.size) {
			this.data[this.size] = value;
			this.shiftup(this.size++);
		} else {
			if (value.compareTo(this.getHeapTop()) > 0) {
				this.data[0] = value;
				this.shiftdown(0);
			}
		}
	}

	private void shiftup(int pos) {
		int parentIdx = this.getParentIndex(pos);
		while (pos != 0
				&& this.getValue(pos).compareTo(this.getValue(parentIdx)) < 0) {
			this.swap(pos, parentIdx);
			pos = parentIdx;
			parentIdx = this.getParentIndex(pos);
		}
	}

	public T removeTop() {
		T rst = this.getHeapTop();
		this.data[0] = this.data[--this.size];
		this.shiftdown(0);
		return rst;
	}

	public boolean hasNext() {
		return this.size > 0;
	}

	@SuppressWarnings("unchecked")
	public T[] getData() {
		return (T[]) this.data;
	}

	@SuppressWarnings("unchecked")
	public T getValue(int index) {
		return (T) this.data[index];
	}

	private int getParentIndex(int pos) {
		return (pos - 1) / 2;
	}

	private int getLeftChildIdx(int pos) {
		return pos * 2 + 1;
	}

	private int getRightChildIdx(int pos) {
		return pos * 2 + 2;
	}

	private void swap(int idx1, int idx2) {
		T tmp = this.getValue(idx1);
		this.data[idx1] = this.getValue(idx2);
		this.data[idx2] = tmp;
	}

	/**
	 * 节点值向下级交换,构造堆
	 * 
	 * @param pos
	 */
	private void shiftdown(int pos) {
		int leftChildIdx = this.getLeftChildIdx(pos);
		// 没有子节点了
		if (leftChildIdx >= this.size) {
			return;
		}
		int rightChildIdx = getRightChildIdx(pos);
		int toBeSwapIdx = leftChildIdx;
		if (rightChildIdx < this.size
				&& this.getValue(leftChildIdx).compareTo(
						this.getValue(rightChildIdx)) > 0) {
			toBeSwapIdx = rightChildIdx;
		}
		// swap
		if (this.getValue(pos).compareTo(this.getValue(toBeSwapIdx)) > 0) {
			this.swap(pos, toBeSwapIdx);
			this.shiftdown(toBeSwapIdx);
		}
	}
	
	public boolean isFull(){
		return this.maxSize == this.size;
	}


	public int getMaxSize() {
		return maxSize;
	}
//
	/**
	 * @param args
	 */
	public static void main(String[] args) {
		Integer[] data = { 7, 12, 13, 24, 8, 6, 4, 27, 14, 8, 12, 56, 22 };
		TopNHeap<Integer> heap = new TopNHeap<Integer>(data);
		while (heap.hasNext()) {
			System.out.print(heap.removeTop());
			System.out.print("  ");
		}

		System.out.println("  ");
		heap.initHeap(data);
		for (int i = 0; i < 10; i++) {
			heap.addToHeap(i);
		}
		while (heap.hasNext()) {
			System.out.print(heap.removeTop());
			System.out.print("  ");
		}

		System.out.println("  ");
		heap = new TopNHeap<Integer>(10);
		Random rd = new Random();
		for (int i = 0; i < 20; i++) {
			int value = rd.nextInt(100);
//			System.out.print(value);
//			System.out.print("  ");
			heap.addToHeap(value);
		}
		System.out.println("  ");
		while (heap.hasNext()) {
			System.out.print(heap.removeTop());
			System.out.print("  ");
		}

	}

}

 

package com.kingdee.gmis.common;

/**
 * 
 * @author hua_fan
 * 
 */
public class HeapSort<T extends Comparable<? super T>> {

	private int size;
	private Object[] data = new Object[256];

	public HeapSort(T[] data) {
		super();
		this.initHeap(data);
	}

	@SuppressWarnings("unchecked")
	public void initHeap(T[] data) {
		assert data.length >= 1;
		ensureCapacity();
		this.size = data.length;
		System.arraycopy(data, 0, this.data, 0, this.size);
		int startPos = this.getParentIndex(this.size - 1);
		for (int i = startPos; i >= 0; i--) {
			this.shiftdown(i);
		}
	}

	@SuppressWarnings("unchecked")
	public T getHeapTop() {
		return (T) this.data[0];
	}

	/**
	 * 加元素到堆中,但是保持堆 的大小
	 * 
	 * @param value
	 */
	public void addToHeapKeepFixedSize(T value) {
		if (this.getHeapTop() == null || value.compareTo(this.getHeapTop()) > 0) {
			this.data[0] = value;
			this.shiftdown(0);
		}
	}

	/**
	 * 加元素到堆中,同时扩展堆的大小
	 * 
	 * @param value
	 */
	public void addToHeap(T value) {
		this.ensureCapacity();
		this.data[this.size++] = value;
		// this.shiftdown(this.getParentIndex(this.size));
		this.shiftup(this.size - 1);
	}

	private void shiftup(int pos) {
		int parentIdx = this.getParentIndex(pos);
		while (pos != 0 && this.getValue(pos).compareTo(this.getValue(parentIdx)) < 0) {
			this.swap(pos, parentIdx);
			pos = parentIdx;
			parentIdx = this.getParentIndex(pos);
		}
	}

	private void ensureCapacity() {
		if (this.data.length <= this.size) {
			Object[] oldData = this.data;
			this.data = new Object[(int) (data.length * 1.5)];
			System.arraycopy(oldData, 0, this.data, 0, oldData.length);
		}
	}

	public T removeTop() {
		T rst = this.getHeapTop();
		this.data[0] = this.data[--this.size];
		this.shiftdown(0);
		return rst;
	}

	public boolean hasNext() {
		return this.size > 0;
	}

	@SuppressWarnings("unchecked")
	public T[] getData() {
		return (T[]) this.data;
	}

	@SuppressWarnings("unchecked")
	public T getValue(int index) {
		return (T) this.data[index];
	}

	private int getParentIndex(int pos) {
		return (pos - 1) / 2;
	}

	private int getLeftChildIdx(int pos) {
		return pos * 2 + 1;
	}

	private int getRightChildIdx(int pos) {
		return pos * 2 + 2;
	}

	private void swap(int idx1, int idx2) {
		T tmp = this.getValue(idx1);
		this.data[idx1] = this.getValue(idx2);
		this.data[idx2] = tmp;
	}

	/**
	 * 节点值向下级交换,构造堆
	 * 
	 * @param pos
	 */
	private void shiftdown(int pos) {
		int leftChildIdx = this.getLeftChildIdx(pos);
		// 没有子节点了
		if (leftChildIdx >= this.size) {
			return;
		}
		int rightChildIdx = getRightChildIdx(pos);
		int toBeSwapIdx = leftChildIdx;
		if (rightChildIdx < this.size && this.getValue(leftChildIdx).compareTo(this.getValue(rightChildIdx)) > 0) {
			toBeSwapIdx = rightChildIdx;
		}
		// swap
		if (this.getValue(pos).compareTo(this.getValue(toBeSwapIdx)) > 0) {
			this.swap(pos, toBeSwapIdx);
			this.shiftdown(toBeSwapIdx);
		}
	}

	/**
	 * @param args
	 */
	public static void main(String[] args) {
		Integer[] data = { 7, 12, 13, 24, 8, 6, 4, 27, 14, 8, 12, 56, 22 };
		HeapSort<Integer> heap = new HeapSort<Integer>(data);
		while (heap.hasNext()) {
			System.out.print(heap.removeTop());
			System.out.print("  ");
		}

		System.out.println("  ");
		heap.initHeap(data);
		for (int i = 0; i < 10; i++) {
			heap.addToHeap(i);
		}
		while (heap.hasNext()) {
			System.out.print(heap.removeTop());
			System.out.print("  ");
		}

		System.out.println("  ");
		heap.initHeap(data);
		for (int i = 0; i < 10; i++) {
			heap.addToHeapKeepFixedSize(i);
		}
		while (heap.hasNext()) {
			System.out.print(heap.removeTop());
			System.out.print("  ");
		}

	}

}

 

以上代码用到了前边文章中提到的top K堆

此方法,内存占用情况也非常理想,高峰不到100m内存就搞定了。

扩展:

 如果我们是排序1亿条ip,算下是1*4/10 = 0.4G,那么说400+m的内存也够把所有的1亿条ip的统计都放入到内存当中了?上边代码中searchTopN2是这么做了下,但是情况很不理想,读入2kw条记录的时候,内存已经占用到了快1G:

这是什么原因呢?虽然是java对象,但是分析下,HashMap的key和value都是Integer,从下图Integer的非静态变量和非静态方法看,直观分析也就value这个field占用一个字节嘛,两个Integer顶多也就是占用800+m的内存嘛,为何会1g才读入2kw整数呢?

这个问题,且等我们继续深究!

另,本题目的多线程版本后续贴出

 

转http://yueyemaitian.iteye.com/blog/1180299

猜你喜欢

转载自action-java.iteye.com/blog/2217122
今日推荐