问题描述:有N(N>>10000)个整数,求出其中的前K个最大的数。(称作Top k或者Top 10)
问题分析:由于
(1)输入的大量数据;
(2)只要前K个
对整个输入数据的保存和排序是相当的不可取的. 可以利用数据结构的最小堆来处理该问题, 最小堆如图所示.
对于每个非叶子节点的数值,一定不大于孩子节点的数值。
这样可用含有K个节点的最小堆来保存K个目前的最大值(当然根节点是其中的最小数值)。
每次有数据输入的时候可以先与根节点比较。若不大于根节点,则舍弃;否则用新数值替换根节点数值。
并进行最小堆的调整。
先看找最小的数
* 找到无序的一堆数字中最小的k个数
* MinHeap.php
<?php
/**
 * Streaming selector for the K SMALLEST values out of N (N >> 10000) integers
 * (a.k.a. "Top K" / "Top 10").
 *
 * Note on naming: although the class is called MinHeap, this variant keeps the
 * K smallest values seen so far in a binary MAX-heap. The root is therefore the
 * largest retained value, and any incoming value smaller than the root
 * replaces it.
 *
 * Position of the last non-leaf node, counted from the end:
 *   k == 10   floor(10/2)-1 = 4
 *   k == 11   floor(11/2)-1 = 4
 *   k == 12   floor(12/2)-1 = 5
 *
 *            0
 *          /   \
 *         1     2
 *        / \   / \
 *       3   4 5   6
 *      / \ /
 *     7  8 9
 */
class MinHeap {
    /** @var int top K, heapSize */
    private $heapSize;
    /** @var null|\SplFixedArray heap storage, root at index 0 */
    private $heap = null;
    /** @var int processed element count */
    private $count = 0;

    /**
     * @param int $heapSize K, the number of values to retain
     */
    public function __construct($heapSize) {
        $this->heapSize = $heapSize;
        $this->heap = new \SplFixedArray($heapSize);
    }

    /**
     * Feed one chunk of input into the selector.
     *
     * The first call seeds the heap with the first K elements of the chunk and
     * then offers only the REMAINING elements; later calls offer every element.
     *
     * @param \SplFixedArray $input buffer holding the chunk (may be larger than $n)
     * @param int $n effective element count in $input
     * @throws RuntimeException when the first chunk holds fewer than K elements
     */
    public function handle(\SplFixedArray $input, $n) {
        if ($this->heapSize < 1) {
            // Nothing can be retained; avoid reading heap[0] of an empty array.
            $this->count += $n;
            return;
        }
        $start = 0;
        if ($this->count < $this->heapSize) {
            if ($n < $this->heapSize) {
                throw new RuntimeException("Unexpected input length < heap size");
            }
            // Seed with exactly K elements: the chunk may be longer than the heap.
            for ($i = 0; $i < $this->heapSize; $i++) {
                self::heapInsert($this->heap, $input[$i], $i);
            }
            // The seeded elements are already in the heap; offering them again
            // would store duplicates.
            $start = $this->heapSize;
        }
        for ($i = $start; $i < $n; $i++) {
            if ($input[$i] < $this->heap[0]) {
                $this->heap[0] = $input[$i];
                $this->heapify(0);
            }
        }
        $this->count += $n;
    }

    /**
     * @return \SplFixedArray the retained values in heap order (not sorted)
     */
    public function getHeap() {
        return $this->heap;
    }

    /**
     * Store $value at $index and sift it up while its parent is smaller.
     *
     * @param \SplFixedArray $arr heap storage; slots 0..$index-1 must already form a max-heap
     * @param mixed $value value to insert
     * @param int $index slot to place the value in before sifting up
     */
    public static function heapInsert(\SplFixedArray $arr, $value, int $index) {
        $arr[$index] = $value;
        while ($index !== 0) {
            $parent = intval(($index - 1) / 2);
            if (!($arr[$parent] < $arr[$index])) {
                break;
            }
            self::swap($arr, $parent, $index);
            $index = $parent;
        }
    }

    /**
     * Sift the value at $index down until neither child is larger than it.
     *
     * @param int $index slot whose value may violate the max-heap property
     */
    public function heapify(int $index) {
        while (true) {
            $left = $index * 2 + 1;
            $right = $left + 1;
            $largest = $index;
            if ($left < $this->heapSize && $this->heap[$left] > $this->heap[$largest]) {
                $largest = $left;
            }
            if ($right < $this->heapSize && $this->heap[$right] > $this->heap[$largest]) {
                $largest = $right;
            }
            if ($largest === $index) {
                return;
            }
            self::swap($this->heap, $largest, $index);
            $index = $largest;
        }
    }

    /** Exchange the values stored at slots $i and $j. */
    private static function swap(\SplFixedArray $a, int $i, int $j) {
        $t = $a->offsetGet($i);
        $a->offsetSet($i, $a->offsetGet($j));
        $a->offsetSet($j, $t);
    }
}
* 测试数据准备 generate.php
把测试数据写入到文件, 然后读取到内存操作。
为了证明大数据量 不能把数据一次性全部加载到内存的场景。
生成 0-9999 1万个数字,并打乱顺序,一行一个写入data.txt
<?php
/**
 * Shuffle a fixed array in place (Fisher-Yates, walking from the last slot down).
 *
 * Elements are exchanged through a temporary variable, so any value type is
 * preserved (an XOR swap is only valid for integers).
 *
 * @param \SplFixedArray $a array to shuffle; modified in place
 */
function myshuffle(\SplFixedArray &$a) {
    for ($last = count($a) - 1; $last > 0; $last--) {
        $pick = rand(0, $last);
        if ($pick === $last) {
            continue;
        }
        $tmp = $a[$pick];
        $a[$pick] = $a[$last];
        $a[$last] = $tmp;
    }
}
// Number of integers to generate: first CLI argument, default 10000.
$requestedCount = isset($argv[1]) ? $argv[1] : 10000;
$validCount = filter_var($requestedCount, FILTER_VALIDATE_INT, ['options' => ['min_range' => 0]]);
if ($validCount === false) {
    fwrite(STDERR, 'Element count must be a non-negative integer, got: ' . $requestedCount . PHP_EOL);
    exit(1);
}
define('BUFSIZE', $validCount);
// Output file: second CLI argument, default ./data.txt.
define('OUT_PATH', isset($argv[2]) ? $argv[2] : './data.txt');

$fp = fopen(OUT_PATH, "w");
if ($fp === false) {
    fwrite(STDERR, 'Cannot open ' . OUT_PATH . ' for writing' . PHP_EOL);
    exit(1);
}

// Build 0..BUFSIZE-1, shuffle, then write one number per line.
$a = new \SplFixedArray(BUFSIZE);
for ($i = 0; $i < BUFSIZE; $i++) {
    $a[$i] = $i;
}
myshuffle($a);
// foreach works on both PHP 7 and PHP 8; SplFixedArray::valid()/current()/next()
// no longer exist in PHP 8.
foreach ($a as $value) {
    fwrite($fp, $value . PHP_EOL);
}
fclose($fp);
run:
php generate.php 1000 a.txt
写入a.txt 0-999打乱顺序的数字 一行一个
* 测试 TestMinHeap.php
注意需要测到 最后一次读取文件内容 buffer 不满的情况
比如要找到top10(前10大的数字),每次读取12个数字。
如果文件共有10003个数字, 最后一次只读入 10003 % 12 = 10003 - 833 × 12 = 7 个数字 < 10
<?php
/**
* Created by PhpStorm.
* User: Mch
* Date: 9/28/18
* Time: 10:59 PM
*/
/**
 * Reads a text file of integers (one per line) in fixed-size chunks.
 */
class TestMinHeap {
    /** @var \SplFileObject|null */
    private $file;
    /** @var int number of physical lines read so far */
    private $linum;

    /**
     * @param string $path file to read, one integer per line
     */
    public function __construct(string $path) {
        $this->file = new \SplFileObject($path, 'r');
        $this->linum = 0;
    }

    /**
     * Read up to $limit non-blank lines and invoke $callback for each of them.
     *
     * Blank lines (including the empty read produced at end of file) are
     * skipped and not counted. The callback receives ($line, $index): with a
     * limit, $index is the 0-based position inside this batch; without one
     * ($limit === 0) it is the 0-based physical line number.
     *
     * @param callable $callback
     * @param int $limit maximum number of lines to deliver; 0 means "until EOF"
     * @return int number of lines actually delivered to $callback
     */
    public function forEach(callable $callback, int $limit = 0) {
        $unlimited = (0 === $limit);
        $delivered = 0;
        while (($unlimited || $delivered < $limit) && $this->file->valid()) {
            $line = $this->file->fgets();
            $lineIndex = $this->linum++;
            // trim() rather than empty(): the string "0" is a valid data line.
            if (trim($line) === '') {
                continue;
            }
            call_user_func($callback, $line, $unlimited ? $lineIndex : $delivered);
            $delivered++;
        }
        return $delivered;
    }

    /**
     * Fill $acc with the next batch of integers from the file.
     *
     * $acc is cleared first; at most count($acc) lines are parsed with
     * sscanf("%d") and stored from offset 0. A line that does not start with an
     * integer is stored as null.
     *
     * @param SplFixedArray $acc destination buffer
     * @return int number of values stored in $acc
     */
    public function chunk(SplFixedArray &$acc) {
        self::resetFixedArray($acc);
        $capacity = $acc->count();
        if ($capacity === 0) {
            // A limit of 0 would mean "read everything" in forEach().
            return 0;
        }
        return $this->forEach(static function ($line, $slot) use ($acc) {
            sscanf($line, '%d', $value);
            $acc->offsetSet($slot, $value);
        }, $capacity);
    }

    /** Set every slot of $acc to null. */
    private static function resetFixedArray(SplFixedArray &$acc) {
        $size = $acc->count();
        for ($i = 0; $i < $size; $i++) {
            $acc->offsetSet($i, null);
        }
        // No rewind(): nothing here uses the internal iterator, and
        // SplFixedArray::rewind() does not exist in PHP 8.
    }

    public function __destruct() {
        // Drop the reference so the underlying file handle is released.
        $this->file = null;
    }
}
// Load ClassName.php from the include path on first use.
// spl_autoload_register() replaces the magic __autoload() function, which is
// deprecated since PHP 7.2 and rejected by PHP 8.
spl_autoload_register(function ($className) {
    include $className . '.php';
});

// Numbers read per batch. Must be >= HEAP_SIZE: MinHeap::handle() rejects a
// first batch that is shorter than the heap.
define('CHUNK_SIZE', 12);
define('HEAP_SIZE', 10); // TOP 10

$test = new TestMinHeap('data.txt');
$buffer = new SplFixedArray(CHUNK_SIZE);
$minHeap = new MinHeap(HEAP_SIZE);
// Stream the file batch by batch until a read yields nothing.
while (($n = $test->chunk($buffer)) >= 1) {
    $minHeap->handle($buffer, $n);
}
print_r($minHeap->getHeap());
* Run:
$ php TestMinHeap.php
SplFixedArray Object
(
[0] => 9
[1] => 7
[2] => 8
[3] => 5
[4] => 6
[5] => 4
[6] => 2
[7] => 1
[8] => 0
[9] => 3
)
反过来 再试找最大的前k个数
<?php
/**
 * Streaming selector for the K LARGEST values out of N (N >> 10000) integers
 * (a.k.a. "Top K" / "Top 10").
 *
 * The K largest values seen so far are kept in a binary min-heap: the root is
 * the smallest retained value, and any incoming value larger than the root
 * replaces it.
 *
 * The implementation in the following blog post could not be tested and
 * appears to be incorrect:
 * https://www.cnblogs.com/xudong-bupt/archive/2013/03/20/2971262.html
 *
 * Position of the last non-leaf node, counted from the end:
 *   k == 10   floor(10/2)-1 = 4
 *   k == 11   floor(11/2)-1 = 4
 *   k == 12   floor(12/2)-1 = 5
 *
 *            0
 *          /   \
 *         1     2
 *        / \   / \
 *       3   4 5   6
 *      / \ /
 *     7  8 9
 */
class MinHeap {
    /** @var int top K, heapSize */
    private $heapSize;
    /** @var null|\SplFixedArray heap storage, root at index 0 */
    private $heap = null;
    /** @var int processed element count */
    private $count = 0;

    /**
     * @param int $heapSize K, the number of values to retain
     */
    public function __construct($heapSize) {
        $this->heapSize = $heapSize;
        $this->heap = new \SplFixedArray($heapSize);
    }

    /**
     * Feed one chunk of input into the selector.
     *
     * The first call seeds the heap with the first K elements of the chunk and
     * then offers only the REMAINING elements; later calls offer every element.
     *
     * @param \SplFixedArray $input buffer holding the chunk (may be larger than $n)
     * @param int $n effective element count in $input
     * @throws RuntimeException when the first chunk holds fewer than K elements
     */
    public function handle(\SplFixedArray $input, $n) {
        if ($this->heapSize < 1) {
            // Nothing can be retained; avoid reading heap[0] of an empty array.
            $this->count += $n;
            return;
        }
        $start = 0;
        if ($this->count < $this->heapSize) {
            if ($n < $this->heapSize) {
                throw new RuntimeException("Unexpected input length < heap size");
            }
            for ($i = 0; $i < $this->heapSize; $i++) {
                self::heapInsert($this->heap, $input[$i], $i);
            }
            // The seeded elements are already in the heap; offering them again
            // would evict the root in favour of a value that is already
            // stored, leaving duplicates behind.
            $start = $this->heapSize;
        }
        for ($i = $start; $i < $n; $i++) {
            if ($input[$i] > $this->heap[0]) {
                $this->heap[0] = $input[$i];
                $this->heapify(0);
            }
        }
        $this->count += $n;
    }

    /**
     * @return \SplFixedArray the retained values in heap order (not sorted)
     */
    public function getHeap() {
        return $this->heap;
    }

    /**
     * Store $value at $index and sift it up while its parent is larger.
     *
     * @param \SplFixedArray $arr heap storage; slots 0..$index-1 must already form a min-heap
     * @param mixed $value value to insert
     * @param int $index slot to place the value in before sifting up
     */
    public static function heapInsert(\SplFixedArray $arr, $value, int $index) {
        $arr[$index] = $value;
        while ($index !== 0) {
            $parent = intval(($index - 1) / 2);
            if (!($arr[$parent] > $arr[$index])) {
                break;
            }
            self::swap($arr, $parent, $index);
            $index = $parent;
        }
    }

    /**
     * Sift the value at $index down until neither child is smaller than it.
     *
     * @param int $index slot whose value may violate the min-heap property
     */
    public function heapify(int $index) {
        while (true) {
            $left = $index * 2 + 1;
            $right = $left + 1;
            $min = $index;
            if ($left < $this->heapSize && $this->heap[$left] < $this->heap[$min]) {
                $min = $left;
            }
            if ($right < $this->heapSize && $this->heap[$right] < $this->heap[$min]) {
                $min = $right;
            }
            if ($min === $index) {
                return;
            }
            self::swap($this->heap, $min, $index);
            $index = $min;
        }
    }

    /** Exchange the values stored at slots $i and $j. */
    private static function swap(\SplFixedArray $a, int $i, int $j) {
        $t = $a->offsetGet($i);
        $a->offsetSet($i, $a->offsetGet($j));
        $a->offsetSet($j, $t);
    }
}
再测试
$ php TestMinHeap.php
SplFixedArray Object
(
[0] => 9990
[1] => 9991
[2] => 9993
[3] => 9992
[4] => 9995
[5] => 9996
[6] => 9994
[7] => 9998
[8] => 9997
[9] => 9999
)
执行上面的代码如果报错 Parse error: parse error, expecting `"identifier (T_STRING)"', 检查php版本
PHP版本 >= 7.0 才支持标量参数类型声明(如 int、string)
$ php --version
PHP 7.2.4 (cli) (built: Apr 10 2018 10:59:05) ( NTS )
Copyright (c) 1997-2018 The PHP Group
Zend Engine v3.2.0, Copyright (c) 1998-2018 Zend Technologies
with Xdebug v2.7.0alpha2-dev, Copyright (c) 2002-2018, by Derick Rethans
上面的例子比较元素的大小只适用于整数,可以添加Comparator比较大小的类.
以下文件都在同一级目录下
Comparator.php MinHeap.php TestMinHeap.php data.txt generate.php
+ Comparator.php
MinHeap.php, TestMinHeap.php都要改动
* Comparator.php
<?php
/**
 * Wraps a three-way comparison callback and exposes boolean predicates.
 *
 * The callback receives ($a, $b) and returns a negative number, zero, or a
 * positive number when $a is respectively less than, equal to, or greater
 * than $b. Any numeric zero (0 or 0.0) counts as "equal".
 */
class Comparator {
    /** @var callable */
    protected $compare;

    /**
     * @param callable|null $compareFunction custom three-way comparison;
     *        when omitted, values are ordered with PHP's native operators
     */
    public function __construct(?callable $compareFunction = null) {
        $this->compare = $compareFunction ?? static function ($a, $b) {
            if ($a === $b) {
                return 0;
            }
            return $a < $b ? -1 : 1;
        };
    }

    /** Run the wrapped callback once and return its raw result. */
    private function order($a, $b) {
        return call_user_func($this->compare, $a, $b);
    }

    public function equal($a, $b) {
        // Loose comparison on purpose: a callback such as $a - $b yields 0.0
        // for equal floats, which is not identical to the integer 0.
        return $this->order($a, $b) == 0;
    }

    public function lessThan($a, $b) {
        return $this->order($a, $b) < 0;
    }

    /** Historical (misspelled) name, kept for existing callers. */
    public function greeterThan($a, $b) {
        return $this->order($a, $b) > 0;
    }

    public function greaterThan($a, $b) {
        return $this->greeterThan($a, $b);
    }

    public function lessThanOrEqual($a, $b) {
        return $this->order($a, $b) <= 0;
    }

    /** Historical (misspelled) name, kept for existing callers. */
    public function greeterThanOrEqual($a, $b) {
        return $this->order($a, $b) >= 0;
    }

    public function greaterThanOrEqual($a, $b) {
        return $this->greeterThanOrEqual($a, $b);
    }

    /**
     * Invert the ordering in place: after this call every predicate answers
     * as if its two arguments had been swapped.
     */
    public function reverse() {
        $compareOriginal = $this->compare;
        $this->compare = static function ($a, $b) use ($compareOriginal) {
            return $compareOriginal($b, $a);
        };
    }
}
* MinHeap.php
<?php
// include_once './Comparator.php';
/**
 * Streaming selector for the K largest values, ordered by a Comparator.
 *
 * The K largest values seen so far (according to the comparator) are kept in
 * a binary min-heap: the root is the smallest retained value, and any
 * incoming value that compares greater than the root replaces it.
 */
class MinHeap {
    /** @var int top K, heapSize */
    private $heapSize;
    /** @var null|\SplFixedArray heap storage, root at index 0 */
    private $heap = null;
    /** @var int processed element count */
    private $count = 0;
    /** @var Comparator compare function */
    private $comparator;

    /**
     * @param int $heapSize K, the number of values to retain
     * @param Comparator $comparator ordering used for every comparison
     */
    public function __construct(int $heapSize, Comparator $comparator) {
        $this->heapSize = $heapSize;
        $this->heap = new \SplFixedArray($heapSize);
        $this->comparator = $comparator;
    }

    /**
     * Feed one chunk of input into the selector.
     *
     * The first call seeds the heap with the first K elements of the chunk and
     * then offers only the REMAINING elements; later calls offer every element.
     *
     * @param \SplFixedArray $input buffer holding the chunk (may be larger than $n)
     * @param int $n effective element count in $input
     * @throws RuntimeException when the first chunk holds fewer than K elements
     */
    public function handle(\SplFixedArray $input, $n) {
        if ($this->heapSize < 1) {
            // Nothing can be retained; avoid reading heap[0] of an empty array.
            $this->count += $n;
            return;
        }
        $start = 0;
        if ($this->count < $this->heapSize) {
            if ($n < $this->heapSize) {
                throw new RuntimeException("Unexpected input length < heap size");
            }
            for ($i = 0; $i < $this->heapSize; $i++) {
                self::heapInsert($this->heap, $input[$i], $i, $this->comparator);
            }
            // The seeded elements are already in the heap; offering them again
            // would store duplicates.
            $start = $this->heapSize;
        }
        for ($i = $start; $i < $n; $i++) {
            if ($this->comparator->greeterThan($input[$i], $this->heap[0])) {
                $this->heap[0] = $input[$i];
                $this->heapify(0);
            }
        }
        $this->count += $n;
    }

    /**
     * @return \SplFixedArray the retained values in heap order (not sorted)
     */
    public function getHeap() {
        return $this->heap;
    }

    /**
     * Store $value at $index and sift it up while its parent compares greater.
     *
     * @param \SplFixedArray $arr heap storage; slots 0..$index-1 must already form a min-heap
     * @param mixed $value value to insert
     * @param int $index slot to place the value in before sifting up
     * @param Comparator $c ordering to use
     */
    public static function heapInsert(\SplFixedArray $arr, $value, int $index, Comparator $c) {
        $arr[$index] = $value;
        while ($index !== 0) {
            $parent = intval(($index - 1) / 2);
            if (!$c->greeterThan($arr[$parent], $arr[$index])) {
                break;
            }
            self::swap($arr, $parent, $index);
            $index = $parent;
        }
    }

    /**
     * Sift the value at $index down until neither child compares less than it.
     *
     * @param int $index slot whose value may violate the min-heap property
     */
    public function heapify(int $index) {
        while (true) {
            $left = $index * 2 + 1;
            $right = $left + 1;
            $min = $index;
            if ($left < $this->heapSize
                && $this->comparator->lessThan($this->heap[$left], $this->heap[$min])) {
                $min = $left;
            }
            if ($right < $this->heapSize
                && $this->comparator->lessThan($this->heap[$right], $this->heap[$min])) {
                $min = $right;
            }
            if ($min === $index) {
                return;
            }
            self::swap($this->heap, $min, $index);
            $index = $min;
        }
    }

    /** Exchange the values stored at slots $i and $j. */
    private static function swap(\SplFixedArray $a, int $i, int $j) {
        $t = $a->offsetGet($i);
        $a->offsetSet($i, $a->offsetGet($j));
        $a->offsetSet($j, $t);
    }
}
* TestMinHeap.php
<?php
/**
 * Reads a text file of integers (one per line) in fixed-size chunks.
 */
class TestMinHeap {
    /** @var \SplFileObject|null */
    private $file;
    /** @var int number of physical lines read so far */
    private $linum;

    /**
     * @param string $path file to read, one integer per line
     */
    public function __construct(string $path) {
        $this->file = new \SplFileObject($path, 'r');
        $this->linum = 0;
    }

    /**
     * Read up to $limit non-blank lines and invoke $callback for each of them.
     *
     * Blank lines (including the empty read produced at end of file) are
     * skipped and not counted. The callback receives ($line, $index): with a
     * limit, $index is the 0-based position inside this batch; without one
     * ($limit === 0) it is the 0-based physical line number.
     *
     * @param callable $callback
     * @param int $limit maximum number of lines to deliver; 0 means "until EOF"
     * @return int number of lines actually delivered to $callback
     */
    public function forEach(callable $callback, int $limit = 0) {
        $unlimited = (0 === $limit);
        $delivered = 0;
        while (($unlimited || $delivered < $limit) && $this->file->valid()) {
            $line = $this->file->fgets();
            $lineIndex = $this->linum++;
            // trim() rather than empty(): the string "0" is a valid data line.
            if (trim($line) === '') {
                continue;
            }
            call_user_func($callback, $line, $unlimited ? $lineIndex : $delivered);
            $delivered++;
        }
        return $delivered;
    }

    /**
     * Fill $acc with the next batch of integers from the file.
     *
     * $acc is cleared first; at most count($acc) lines are parsed with
     * sscanf("%d") and stored from offset 0. A line that does not start with an
     * integer is stored as null.
     *
     * @param SplFixedArray $acc destination buffer
     * @return int number of values stored in $acc
     */
    public function chunk(SplFixedArray &$acc) {
        self::resetFixedArray($acc);
        $capacity = $acc->count();
        if ($capacity === 0) {
            // A limit of 0 would mean "read everything" in forEach().
            return 0;
        }
        return $this->forEach(static function ($line, $slot) use ($acc) {
            sscanf($line, '%d', $value);
            $acc->offsetSet($slot, $value);
        }, $capacity);
    }

    /** Set every slot of $acc to null. */
    private static function resetFixedArray(SplFixedArray &$acc) {
        $size = $acc->count();
        for ($i = 0; $i < $size; $i++) {
            $acc->offsetSet($i, null);
        }
        // No rewind(): nothing here uses the internal iterator, and
        // SplFixedArray::rewind() does not exist in PHP 8.
    }

    public function __destruct() {
        // Drop the reference so the underlying file handle is released.
        $this->file = null;
    }
}
// Load ClassName.php from the include path on first use.
// spl_autoload_register() replaces the magic __autoload() function, which is
// deprecated since PHP 7.2 and rejected by PHP 8.
spl_autoload_register(function ($className) {
    include $className . '.php';
});

// Numbers read per batch. Must be >= HEAP_SIZE: MinHeap::handle() rejects a
// first batch that is shorter than the heap.
define('CHUNK_SIZE', 12);
define('HEAP_SIZE', 10); // TOP 10

$test = new TestMinHeap('data.txt');
$buffer = new SplFixedArray(CHUNK_SIZE);
$minHeap = new MinHeap(
    HEAP_SIZE,
    // The spaceship operator always yields the integers -1, 0 or 1, whereas
    // $a - $b yields a float (and 0.0 for equal values) on float input.
    new Comparator(function ($a, $b) {
        return $a <=> $b;
    })
);
// Stream the file batch by batch until a read yields nothing.
while (($n = $test->chunk($buffer)) >= 1) {
    $minHeap->handle($buffer, $n);
}
print_r($minHeap->getHeap());
$ php TestMinHeap.php
...
改写generate.php, 分批次生成数据
<?php
// Total number of elements to generate: first CLI argument, default 10000.
// Validated here so that a bad argument fails before the output file is
// opened (and truncated) further down.
$requestedCount = isset($argv[1]) ? $argv[1] : 10000;
$validCount = filter_var($requestedCount, FILTER_VALIDATE_INT, ['options' => ['min_range' => 0]]);
if ($validCount === false) {
    fwrite(STDERR, 'Element count must be a non-negative integer, got: ' . $requestedCount . PHP_EOL);
    exit(1);
}
define('BUFF_SIZE', $validCount);
// Output path: second CLI argument, default ./data.txt.
define('OUT_PATH', isset($argv[2]) ? $argv[2] : './data.txt');
// Number of elements generated and shuffled per batch.
define('CHUNK_SIZE', 128);
/**
 * Shuffle a fixed array in place (Fisher-Yates, walking from the last slot down).
 *
 * Elements are exchanged through a temporary variable, so any value type is
 * preserved (an XOR swap is only valid for integers).
 *
 * @param \SplFixedArray $a array to shuffle; modified in place
 */
function myshuffle(\SplFixedArray &$a) {
    for ($last = count($a) - 1; $last > 0; $last--) {
        $pick = rand(0, $last);
        if ($pick === $last) {
            continue;
        }
        $tmp = $a[$pick];
        $a[$pick] = $a[$last];
        $a[$last] = $tmp;
    }
}
/**
 * Write the integers $begin .. $end-1 to $fp in random order, one per line.
 *
 * @param resource $fp writable stream
 * @param int $begin first value of the batch (inclusive)
 * @param int $end end of the batch (exclusive)
 */
function writeChunk($fp, $begin = 0, $end = CHUNK_SIZE) {
    $size = $end - $begin;
    if ($size <= 0) {
        // Empty (or inverted) range: nothing to write.
        return;
    }
    $a = new \SplFixedArray($size);
    for ($i = 0; $i < $size; $i++) {
        $a[$i] = $i + $begin;
    }
    myshuffle($a);
    // foreach works on both PHP 7 and PHP 8; SplFixedArray::valid()/current()/next()
    // no longer exist in PHP 8.
    foreach ($a as $value) {
        fwrite($fp, $value . PHP_EOL);
    }
}
$fp = fopen(OUT_PATH, "w");
if ($fp === false) {
    fwrite(STDERR, 'Cannot open ' . OUT_PATH . ' for writing' . PHP_EOL);
    exit(1);
}
// BUFF_SIZE may come straight from the command line; work with an integer.
$total = (int) BUFF_SIZE;
// Emit full batches of CHUNK_SIZE values; the final batch is clamped to
// $total and therefore holds whatever remainder is left.
for ($begin = 0; $begin < $total; $begin += CHUNK_SIZE) {
    writeChunk($fp, $begin, min($begin + CHUNK_SIZE, $total));
}
fclose($fp);