Top K problem 堆实现 找到无序数组中最小(最大)的k个数

问题描述:有N(N>>10000)个整数,求出其中的前K个最大的数。(称作Top k或者Top 10)

问题分析:由于

(1)输入的大量数据;

(2)只要前K个

对整个输入数据的保存和排序是相当的不可取的. 可以利用数据结构的最小堆来处理该问题, 最小堆如图所示.

对于每个非叶子节点的数值,一定不大于孩子节点的数值。

这样可用含有K个节点的最小堆来保存K个目前的最大值(当然根节点是其中的最小数值)。

每次有数据输入的时候可以先与根节点比较。若不大于根节点,则舍弃;否则用新数值替换根节点数值。

并进行最小堆的调整。

先看找最小的数

* 找到无序的一堆数字中最小的k个数

* MinHeap.php 

<?php
/**
 * Top-K selection: given N (N >> 10000) integers, keep the K SMALLEST of them
 * (a.k.a. "Top k" / "Top 10").
 *
 * NOTE: despite its name, this variant is organised as a MAX-heap. The root
 * (index 0) holds the largest of the K retained values, so every new value
 * only has to be compared with the root to decide whether it belongs to the
 * K smallest seen so far.
 *
 * Counting from the end, the position of the first non-leaf node is:
 *   k == 10  floor(10/2)-1 = 4
 *   k == 11  floor(11/2)-1 = 4
 *   k == 12  floor(12/2)-1 = 5
 *
 *         0
 *       /  \
 *      1    2
 *     / \  / \
 *    3  4  5 6
 *   /\ /
 *  7 8 9
 */

class MinHeap {
    /** @var int K: the number of values to retain (heap capacity) */
    private $heapSize;

    /** @var null|\SplFixedArray heap storage; slots [0, min(count, heapSize)) are in use */
    private $heap = null;

    /** @var int number of input elements processed so far */
    private $count = 0;

    /**
     * @param int $heapSize K, the number of smallest values to keep
     */
    public function __construct($heapSize) {
        $this->heapSize = $heapSize;
        $this->heap = new \SplFixedArray($heapSize);
    }

    /**
     * Feed the next batch of values into the heap.
     *
     * The first K values ever received are inserted directly; every later
     * value replaces the root only when it is smaller than the current root.
     * Batches may be of any length, including shorter than K.
     *
     * @param \SplFixedArray $input buffer holding the batch
     * @param int $n number of valid leading entries in $input
     */
    public function handle(\SplFixedArray $input, $n) {
        if ($this->heapSize < 1) {
            // Nothing can be retained; just account for the consumed input.
            $this->count += $n;
            return;
        }

        $i = 0;
        $filled = min($this->count, $this->heapSize);

        // Fill phase: place values into the free slots until the heap is full.
        for (; $filled < $this->heapSize && $i < $n; $i++, $filled++) {
            self::heapInsert($this->heap, $input[$i], $filled);
        }

        // Scan phase: continue AFTER the values consumed above. Re-scanning
        // them would store a value twice and evict a value that belongs in
        // the result.
        for (; $i < $n; $i++) {
            if ($input[$i] < $this->heap[0]) {
                $this->heap[0] = $input[$i];
                $this->heapify(0);
            }
        }

        $this->count += $n;
    }

    /**
     * @return null|\SplFixedArray the heap storage in heap order (not sorted);
     *         unused slots are null while fewer than K values have been seen
     */
    public function getHeap() {
        return $this->heap;
    }

    /**
     * Store $value at $index and sift it up while it is larger than its parent.
     *
     * @param \SplFixedArray $arr heap storage
     * @param mixed $value value to insert
     * @param int $index slot to place the value in (the next free slot)
     */
    public static function heapInsert(\SplFixedArray $arr, $value, int $index) {
        $arr[$index] = $value;

        while ($index > 0) {
            $parent = ($index - 1) >> 1;
            if (!($arr[$parent] < $arr[$index])) {
                break;
            }
            self::swap($arr, $parent, $index);
            $index = $parent;
        }
    }

    /**
     * Sift the value at $index down until neither child is larger than it.
     * Bounds are taken from the full capacity, so this expects a full heap.
     *
     * @param int $index slot whose value may violate the heap order
     */
    public function heapify(int $index) {
        while (true) {
            $left = $index * 2 + 1;
            $right = $left + 1;
            $largest = $index;

            if ($left < $this->heapSize && $this->heap[$left] > $this->heap[$largest]) {
                $largest = $left;
            }
            if ($right < $this->heapSize && $this->heap[$right] > $this->heap[$largest]) {
                $largest = $right;
            }
            if ($largest === $index) {
                return;
            }

            self::swap($this->heap, $largest, $index);
            $index = $largest;
        }
    }

    /** Exchange the values stored at slots $i and $j. */
    private static function swap(\SplFixedArray $a, int $i, int $j) {
        $t = $a->offsetGet($i);
        $a->offsetSet($i, $a->offsetGet($j));
        $a->offsetSet($j, $t);
    }
}

* 测试数据准备 generate.php

 把测试数据写入到文件, 然后读取到内存操作。

 这样做是为了模拟数据量很大、无法一次性全部加载到内存的场景。

生成 0-9999 1万个数字,并打乱顺序,一行一个写入data.txt

<?php

/**
 * Shuffle the array in place (Fisher-Yates, walking from the last slot down).
 *
 * Elements are exchanged through a temporary variable, so any value type is
 * supported. An XOR swap is only valid for integers: it truncates strings of
 * different lengths and floats.
 *
 * rand() is not cryptographically secure; this is only meant for test data.
 *
 * @param \SplFixedArray $a array to shuffle in place
 */
function myshuffle(\SplFixedArray &$a) {
    for ($last = count($a) - 1; $last > 0; $last--) {
        // Pick a slot from the not-yet-fixed prefix [0, $last].
        $pick = rand(0, $last);
        if ($pick === $last) {
            continue;
        }
        $tmp = $a[$pick];
        $a[$pick] = $a[$last];
        $a[$last] = $tmp;
    }
}

// Number of integers to generate: first CLI argument, default 10000.
// Cast to int so sizing and loop bounds never operate on a string.
define('BUFSIZE', isset($argv[1]) ? (int) $argv[1] : 10000);
// Output file: second CLI argument, default ./data.txt.
define('OUT_PATH', isset($argv[2]) ? $argv[2] : './data.txt');

$fp = fopen(OUT_PATH, "w");
if ($fp === false) {
    throw new \RuntimeException('Cannot open ' . OUT_PATH . ' for writing');
}

// Build 0 .. BUFSIZE-1, then shuffle the whole sequence in memory.
$a = new \SplFixedArray(BUFSIZE);
for ($i = 0; $i < BUFSIZE; $i++) {
    $a[$i] = $i;
}
myshuffle($a);

// One number per line. foreach works on PHP 7 and PHP 8; the Iterator
// methods valid()/current()/next() were removed from SplFixedArray in PHP 8.
foreach ($a as $value) {
    fwrite($fp, $value.PHP_EOL);
}

fclose($fp);

run:

php generate.php 1000 a.txt

写入a.txt 0-999打乱顺序的数字 一行一个

* 测试 TestMinHeap.php

  注意需要测到 最后一次读取文件内容 buffer 不满的情况

 比如要找到top10 前10大的数字,每次读取12个数字

 如果文件中共有10003个数字, 最后一次读入 10003 - floor(10003 / 12) × 12 = 10003 - 9996 = 7 个数字 < 10

<?php
/**
 * Created by PhpStorm.
 * User: Mch
 * Date: 9/28/18
 * Time: 10:59 PM
 */
/**
 * Reads a text file of integers (one per line) in fixed-size chunks.
 */
class TestMinHeap {
    /** @var \SplFileObject */
    private $file;

    /** @var int number of physical lines read so far */
    private $linum;

    /**
     * @param string $path file to read
     */
    public function __construct(string $path) {
        $this->file = new \SplFileObject($path, 'r');
        $this->linum = 0;
    }

    /**
     * Read lines and invoke $callback($line, $index) for each non-blank one.
     *
     * Blank lines, including the empty string fgets() yields at end of file,
     * are skipped. A line is tested with trim() rather than empty(), because
     * empty("0") is true and would drop a final "0" without a newline.
     *
     * With $limit > 0 at most $limit lines are delivered and $index is the
     * 0-based position within this call. With $limit === 0 the rest of the
     * file is consumed and $index is the 0-based physical line number.
     *
     * @param callable $callback
     * @param int $limit maximum number of lines to deliver, 0 for no limit
     * @return int number of lines actually delivered to $callback
     */
    public function forEach(callable $callback, int $limit = 0) {
        $unlimited = (0 === $limit);
        $delivered = 0;

        while (($unlimited || $delivered < $limit) && $this->file->valid()) {
            $line = $this->file->fgets();
            $this->linum++;

            if (!is_string($line) || trim($line) === '') {
                continue;
            }

            $index = $unlimited ? $this->linum - 1 : $delivered;
            call_user_func($callback, $line, $index);
            $delivered++;
        }

        return $delivered;
    }

    /**
     * Fill $acc with the next integers from the file, one per line.
     *
     * $acc is cleared first, so slots beyond the returned count are null.
     * A non-blank line that does not start with an integer stores null.
     *
     * @param SplFixedArray $acc destination buffer; its size is the chunk size
     * @return int number of values actually read into $acc
     */
    public function chunk(SplFixedArray &$acc) {
        self::resetFixedArray($acc);

        $capacity = $acc->count();
        if ($capacity < 1) {
            // A limit of 0 would mean "no limit" to forEach and overflow $acc.
            return 0;
        }

        return $this->forEach(function ($cur, $i) use ($acc) {
            sscanf($cur, "%d\n", $elem);
            $acc->offsetSet($i, $elem);
        }, $capacity);
    }

    /**
     * Set every slot of $acc to null.
     * SplFixedArray::rewind() is not called: it was removed in PHP 8.
     */
    private static function resetFixedArray(SplFixedArray &$acc) {
        $size = $acc->count();
        for ($i = 0; $i < $size; $i++) {
            $acc->offsetSet($i, null);
        }
    }

    public function __destruct() {}
}

// Load a class from "<ClassName>.php" on first use.
// Registered through spl_autoload_register(): a global __autoload() function
// is deprecated since PHP 7.2 and is a fatal error in PHP 8.
spl_autoload_register(function ($className) {
    include $className.'.php';
});

// Numbers read from the file per batch.
define('CHUNK_SIZE', 12);
// K: how many values the heap retains (TOP 10).
define('HEAP_SIZE', 10);

$test = new TestMinHeap('data.txt');
$buffer = new SplFixedArray(CHUNK_SIZE);

$minHeap = new MinHeap(HEAP_SIZE);

// Feed the file to the heap batch by batch; stop at the first empty batch.
while (($n = $test->chunk($buffer)) >= 1) {
    $minHeap->handle($buffer, $n);
}

print_r($minHeap->getHeap());

* Run:

$ php TestMinHeap.php 

SplFixedArray Object

(

    [0] => 9

    [1] => 7

    [2] => 8

    [3] => 5

    [4] => 6

    [5] => 4

    [6] => 2

    [7] => 1

    [8] => 0

    [9] => 3

)

反过来 再试找最大的前k个数

<?php
/**
 * Top-K selection: given N (N >> 10000) integers, keep the K LARGEST of them
 * (a.k.a. "Top k" / "Top 10").
 *
 * The blog post below could not be tested and appears to be incorrect:
 * https://www.cnblogs.com/xudong-bupt/archive/2013/03/20/2971262.html
 *
 * This is a min-heap: the root (index 0) holds the smallest of the K retained
 * values, so every new value only has to be compared with the root to decide
 * whether it belongs to the K largest seen so far.
 *
 * Counting from the end, the position of the first non-leaf node is:
 *   k == 10  floor(10/2)-1 = 4
 *   k == 11  floor(11/2)-1 = 4
 *   k == 12  floor(12/2)-1 = 5
 *
 *         0
 *       /  \
 *      1    2
 *     / \  / \
 *    3  4  5 6
 *   /\ /
 *  7 8 9
 */

class MinHeap {
    /** @var int K: the number of values to retain (heap capacity) */
    private $heapSize;

    /** @var null|\SplFixedArray heap storage; slots [0, min(count, heapSize)) are in use */
    private $heap = null;

    /** @var int number of input elements processed so far */
    private $count = 0;

    /**
     * @param int $heapSize K, the number of largest values to keep
     */
    public function __construct($heapSize) {
        $this->heapSize = $heapSize;
        $this->heap = new \SplFixedArray($heapSize);
    }

    /**
     * Feed the next batch of values into the heap.
     *
     * The first K values ever received are inserted directly; every later
     * value replaces the root only when it is larger than the current root.
     * Batches may be of any length, including shorter than K.
     *
     * @param \SplFixedArray $input buffer holding the batch
     * @param int $n number of valid leading entries in $input
     */
    public function handle(\SplFixedArray $input, $n) {
        if ($this->heapSize < 1) {
            // Nothing can be retained; just account for the consumed input.
            $this->count += $n;
            return;
        }

        $i = 0;
        $filled = min($this->count, $this->heapSize);

        // Fill phase: place values into the free slots until the heap is full.
        for (; $filled < $this->heapSize && $i < $n; $i++, $filled++) {
            self::heapInsert($this->heap, $input[$i], $filled);
        }

        // Scan phase: continue AFTER the values consumed above. Re-scanning
        // them would store a value twice and evict a value that belongs in
        // the result.
        for (; $i < $n; $i++) {
            if ($input[$i] > $this->heap[0]) {
                $this->heap[0] = $input[$i];
                $this->heapify(0);
            }
        }

        $this->count += $n;
    }

    /**
     * @return null|\SplFixedArray the heap storage in heap order (not sorted);
     *         unused slots are null while fewer than K values have been seen
     */
    public function getHeap() {
        return $this->heap;
    }

    /**
     * Store $value at $index and sift it up while it is smaller than its parent.
     *
     * @param \SplFixedArray $arr heap storage
     * @param mixed $value value to insert
     * @param int $index slot to place the value in (the next free slot)
     */
    public static function heapInsert(\SplFixedArray $arr, $value, int $index) {
        $arr[$index] = $value;

        while ($index > 0) {
            $parent = ($index - 1) >> 1;
            if (!($arr[$parent] > $arr[$index])) {
                break;
            }
            self::swap($arr, $parent, $index);
            $index = $parent;
        }
    }

    /**
     * Sift the value at $index down until neither child is smaller than it.
     * Bounds are taken from the full capacity, so this expects a full heap.
     *
     * @param int $index slot whose value may violate the heap order
     */
    public function heapify(int $index) {
        while (true) {
            $left = $index * 2 + 1;
            $right = $left + 1;
            $min = $index;

            if ($left < $this->heapSize && $this->heap[$left] < $this->heap[$min]) {
                $min = $left;
            }
            if ($right < $this->heapSize && $this->heap[$right] < $this->heap[$min]) {
                $min = $right;
            }
            if ($min === $index) {
                return;
            }

            self::swap($this->heap, $min, $index);
            $index = $min;
        }
    }

    /** Exchange the values stored at slots $i and $j. */
    private static function swap(\SplFixedArray $a, int $i, int $j) {
        $t = $a->offsetGet($i);
        $a->offsetSet($i, $a->offsetGet($j));
        $a->offsetSet($j, $t);
    }
}

再测试

$ php TestMinHeap.php 

SplFixedArray Object

(

    [0] => 9990

    [1] => 9991

    [2] => 9993

    [3] => 9992

    [4] => 9995

    [5] => 9996

    [6] => 9994

    [7] => 9998

    [8] => 9997

    [9] => 9999

)

执行上面的代码如果报错 Parse error: parse error, expecting `"identifier (T_STRING)"', 检查php版本

PHP 版本 >= 7.0 才支持标量类型的参数声明(如 int、string)

$ php --version

PHP 7.2.4 (cli) (built: Apr 10 2018 10:59:05) ( NTS )

Copyright (c) 1997-2018 The PHP Group

Zend Engine v3.2.0, Copyright (c) 1998-2018 Zend Technologies

with Xdebug v2.7.0alpha2-dev, Copyright (c) 2002-2018, by Derick Rethans

上面的例子比较元素的大小只适用于整数,可以添加Comparator比较大小的类.

下面这些文件都在同一级目录下

Comparator.php  MinHeap.php     TestMinHeap.php data.txt        generate.php

+ Comparator.php

MinHeap.php, TestMinHeap.php都要改动

* Comparator.php

<?php
/**
 * Wraps a three-way comparison callable and exposes boolean predicates.
 *
 * The callable receives ($a, $b) and returns a negative number, zero or a
 * positive number when $a is respectively less than, equal to or greater
 * than $b. Only the sign of the result is used.
 */
class Comparator {
    /** @var callable */
    protected $compare;

    /**
     * @param callable|null $compareFunction custom three-way comparison;
     *        when omitted, values are ordered with the <=> operator
     */
    public function __construct(?callable $compareFunction = null) {
        $this->compare = $compareFunction ?? function ($a, $b) {
            return $a <=> $b;
        };
    }

    /**
     * Loose comparison with zero is deliberate: a comparator such as
     * `$a - $b` returns float 0.0 for equal floats, which is not === 0.
     */
    public function equal($a, $b) {return call_user_func($this->compare, $a, $b) == 0;}
    public function lessThan($a, $b) {return call_user_func($this->compare, $a, $b) < 0;}
    /** Historical (misspelled) name, kept for existing callers. */
    public function greeterThan($a, $b) {return call_user_func($this->compare, $a, $b) > 0;}
    /** Correctly spelled alias of greeterThan(). */
    public function greaterThan($a, $b) {return $this->greeterThan($a, $b);}

    public function lessThanOrEqual($a, $b) {
        return call_user_func($this->compare, $a, $b) <= 0;
    }
    /** Historical (misspelled) name, kept for existing callers. */
    public function greeterThanOrEqual($a, $b) {
        return call_user_func($this->compare, $a, $b) >= 0;
    }
    /** Correctly spelled alias of greeterThanOrEqual(). */
    public function greaterThanOrEqual($a, $b) {
        return $this->greeterThanOrEqual($a, $b);
    }

    /**
     * Invert the ordering in place: afterwards every predicate behaves as if
     * its two arguments had been swapped.
     */
    public function reverse() {
        $compareOriginal = $this->compare;
        $this->compare = function ($a, $b) use ($compareOriginal) {
            return $compareOriginal($b, $a);
        };
    }
}

* MinHeap.php

<?php
// include_once './Comparator.php';

/**
 * Fixed-capacity binary heap that keeps the K "greatest" values seen so far,
 * where the ordering is supplied by a Comparator.
 *
 * The root (index 0) holds the least of the retained values according to the
 * comparator, so each new value only has to be compared with the root.
 * Passing a reversed comparator therefore keeps the K smallest values.
 */
class MinHeap {
    /** @var int K: the number of values to retain (heap capacity) */
    private $heapSize;

    /** @var null|\SplFixedArray heap storage; slots [0, min(count, heapSize)) are in use */
    private $heap = null;

    /** @var int number of input elements processed so far */
    private $count = 0;

    /** @var Comparator ordering used for every comparison */
    private $comparator;

    /**
     * @param int $heapSize K, the number of values to keep
     * @param Comparator $comparator ordering of the values
     */
    public function __construct(int $heapSize, Comparator $comparator) {
        $this->heapSize = $heapSize;
        $this->heap = new \SplFixedArray($heapSize);
        $this->comparator = $comparator;
    }

    /**
     * Feed the next batch of values into the heap.
     *
     * The first K values ever received are inserted directly; every later
     * value replaces the root only when the comparator ranks it above the
     * current root. Batches may be of any length, including shorter than K.
     *
     * @param \SplFixedArray $input buffer holding the batch
     * @param int $n number of valid leading entries in $input
     */
    public function handle(\SplFixedArray $input, $n) {
        if ($this->heapSize < 1) {
            // Nothing can be retained; just account for the consumed input.
            $this->count += $n;
            return;
        }

        $i = 0;
        $filled = min($this->count, $this->heapSize);

        // Fill phase: place values into the free slots until the heap is full.
        for (; $filled < $this->heapSize && $i < $n; $i++, $filled++) {
            self::heapInsert($this->heap, $input[$i], $filled, $this->comparator);
        }

        // Scan phase: continue AFTER the values consumed above. Re-scanning
        // them would store a value twice and evict a value that belongs in
        // the result.
        for (; $i < $n; $i++) {
            if ($this->comparator->greeterThan($input[$i], $this->heap[0])) {
                $this->heap[0] = $input[$i];
                $this->heapify(0);
            }
        }

        $this->count += $n;
    }

    /**
     * @return null|\SplFixedArray the heap storage in heap order (not sorted);
     *         unused slots are null while fewer than K values have been seen
     */
    public function getHeap() {
        return $this->heap;
    }

    /**
     * Store $value at $index and sift it up while its parent ranks above it.
     *
     * @param \SplFixedArray $arr heap storage
     * @param mixed $value value to insert
     * @param int $index slot to place the value in (the next free slot)
     * @param Comparator $c ordering of the values
     */
    public static function heapInsert(\SplFixedArray $arr, $value, int $index, Comparator $c) {
        $arr[$index] = $value;

        while ($index > 0) {
            $parent = ($index - 1) >> 1;
            if (!$c->greeterThan($arr[$parent], $arr[$index])) {
                break;
            }
            self::swap($arr, $parent, $index);
            $index = $parent;
        }
    }

    /**
     * Sift the value at $index down until neither child ranks below it.
     * Bounds are taken from the full capacity, so this expects a full heap.
     *
     * @param int $index slot whose value may violate the heap order
     */
    public function heapify(int $index) {
        while (true) {
            $left = $index * 2 + 1;
            $right = $left + 1;
            $min = $index;

            if ($left < $this->heapSize
                && $this->comparator->lessThan($this->heap[$left], $this->heap[$min])) {
                $min = $left;
            }
            if ($right < $this->heapSize
                && $this->comparator->lessThan($this->heap[$right], $this->heap[$min])) {
                $min = $right;
            }
            if ($min === $index) {
                return;
            }

            self::swap($this->heap, $min, $index);
            $index = $min;
        }
    }

    /** Exchange the values stored at slots $i and $j. */
    private static function swap(\SplFixedArray $a, int $i, int $j) {
        $t = $a->offsetGet($i);
        $a->offsetSet($i, $a->offsetGet($j));
        $a->offsetSet($j, $t);
    }
}

* TestMinHeap.php

<?php

/**
 * Reads a text file of integers (one per line) in fixed-size chunks.
 */
class TestMinHeap {
    /** @var \SplFileObject */
    private $file;

    /** @var int number of physical lines read so far */
    private $linum;

    /**
     * @param string $path file to read
     */
    public function __construct(string $path) {
        $this->file = new \SplFileObject($path, 'r');
        $this->linum = 0;
    }

    /**
     * Read lines and invoke $callback($line, $index) for each non-blank one.
     *
     * Blank lines, including the empty string fgets() yields at end of file,
     * are skipped. A line is tested with trim() rather than empty(), because
     * empty("0") is true and would drop a final "0" without a newline.
     *
     * With $limit > 0 at most $limit lines are delivered and $index is the
     * 0-based position within this call. With $limit === 0 the rest of the
     * file is consumed and $index is the 0-based physical line number.
     *
     * @param callable $callback
     * @param int $limit maximum number of lines to deliver, 0 for no limit
     * @return int number of lines actually delivered to $callback
     */
    public function forEach(callable $callback, int $limit = 0) {
        $unlimited = (0 === $limit);
        $delivered = 0;

        while (($unlimited || $delivered < $limit) && $this->file->valid()) {
            $line = $this->file->fgets();
            $this->linum++;

            if (!is_string($line) || trim($line) === '') {
                continue;
            }

            $index = $unlimited ? $this->linum - 1 : $delivered;
            call_user_func($callback, $line, $index);
            $delivered++;
        }

        return $delivered;
    }

    /**
     * Fill $acc with the next integers from the file, one per line.
     *
     * $acc is cleared first, so slots beyond the returned count are null.
     * A non-blank line that does not start with an integer stores null.
     *
     * @param SplFixedArray $acc destination buffer; its size is the chunk size
     * @return int number of values actually read into $acc
     */
    public function chunk(SplFixedArray &$acc) {
        self::resetFixedArray($acc);

        $capacity = $acc->count();
        if ($capacity < 1) {
            // A limit of 0 would mean "no limit" to forEach and overflow $acc.
            return 0;
        }

        return $this->forEach(function ($cur, $i) use ($acc) {
            sscanf($cur, "%d\n", $elem);
            $acc->offsetSet($i, $elem);
        }, $capacity);
    }

    /**
     * Set every slot of $acc to null.
     * SplFixedArray::rewind() is not called: it was removed in PHP 8.
     */
    private static function resetFixedArray(SplFixedArray &$acc) {
        $size = $acc->count();
        for ($i = 0; $i < $size; $i++) {
            $acc->offsetSet($i, null);
        }
    }

    public function __destruct() {}
}

// Load a class from "<ClassName>.php" on first use.
// Registered through spl_autoload_register(): a global __autoload() function
// is deprecated since PHP 7.2 and is a fatal error in PHP 8.
spl_autoload_register(function ($className) {
    include $className.'.php';
});

// Numbers read from the file per batch.
define('CHUNK_SIZE', 12);
// K: how many values the heap retains (TOP 10).
define('HEAP_SIZE', 10);

$test = new TestMinHeap('data.txt');
$buffer = new SplFixedArray(CHUNK_SIZE);

// Numeric ordering: only the sign of the difference matters to Comparator.
$minHeap = new MinHeap(
    HEAP_SIZE,
    new Comparator(static function ($a, $b) {
        return $a - $b;
    })
);

// Feed the file to the heap batch by batch; stop at the first empty batch.
while (($n = $test->chunk($buffer)) >= 1) {
    $minHeap->handle($buffer, $n);
}

print_r($minHeap->getHeap());

$ php TestMinHeap.php

...

改写generate.php, 分批次生成数据

<?php

// Total number of elements to generate: first CLI argument, default 10000.
// Cast to int so later arithmetic never operates on a string.
define('BUFF_SIZE', (int) ($argv[1] ?? 10000));
// Output path: second CLI argument, default ./data.txt.
define('OUT_PATH', $argv[2] ?? './data.txt');
// Number of elements generated, shuffled and written per batch.
define('CHUNK_SIZE', 128);

/**
 * Shuffle the array in place (Fisher-Yates, walking from the last slot down).
 *
 * Elements are exchanged through a temporary variable, so any value type is
 * supported. An XOR swap is only valid for integers: it truncates strings of
 * different lengths and floats.
 *
 * rand() is not cryptographically secure; this is only meant for test data.
 *
 * @param \SplFixedArray $a array to shuffle in place
 */
function myshuffle(\SplFixedArray &$a) {
    for ($last = count($a) - 1; $last > 0; $last--) {
        // Pick a slot from the not-yet-fixed prefix [0, $last].
        $pick = rand(0, $last);
        if ($pick === $last) {
            continue;
        }
        $tmp = $a[$pick];
        $a[$pick] = $a[$last];
        $a[$last] = $tmp;
    }
}

/**
 * Write the integers $begin .. $end-1 to $fp in random order, one per line.
 *
 * @param resource $fp writable stream
 * @param int $begin first value (inclusive)
 * @param int $end last value (exclusive)
 */
function writeChunk($fp, $begin = 0, $end = CHUNK_SIZE) {
    $size = $end - $begin;
    if ($size <= 0) {
        // Empty range: nothing to write (SplFixedArray rejects negative sizes).
        return;
    }

    $a = new \SplFixedArray($size);
    for ($i = 0; $i < $size; $i++) {
        $a[$i] = $i + $begin;
    }

    myshuffle($a);

    // foreach works on PHP 7 and PHP 8; the Iterator methods
    // valid()/current()/next() were removed from SplFixedArray in PHP 8.
    foreach ($a as $value) {
        fwrite($fp, $value.PHP_EOL);
    }
}

$fp = fopen(OUT_PATH, "w");
if ($fp === false) {
    throw new \RuntimeException('Cannot open ' . OUT_PATH . ' for writing');
}

// Normalise to int so a string from the command line cannot distort the
// loop bound or the range arithmetic.
$total = (int) BUFF_SIZE;

// Emit consecutive ranges of CHUNK_SIZE values; the last range is shortened
// to end at $total, which covers the final partial batch.
for ($begin = 0; $begin < $total; $begin += CHUNK_SIZE) {
    writeChunk($fp, $begin, min($begin + CHUNK_SIZE, $total));
}

fclose($fp);

猜你喜欢

转载自blog.csdn.net/fareast_mzh/article/details/82875342