BFPRT算法：时间复杂度O(n)求第k小的数字（分治算法+快排）

去年写了一篇《分治算法求第 $k$ 小元素 $O(n)$ & $O(n\log_2 n)$》的文章。介绍了一种对快排进行改进的算法，可以在时间复杂度 $O(n)$ 的情况下，找到第 $k$ 小的数字。

那时候，我还不知道这个算法叫BFPRT算法——现在知道了，还知道它又被称为中位数的中位数算法，它的最坏时间复杂度为 $O(n)$ ，它是由Blum、Floyd、Pratt、Rivest、Tarjan提出，它的思想是修改快速选择算法的主元选取方法，从而改善算法在最坏情况下的时间复杂度。

而且，我还发现了STL中有一个类似的函数——std::nth_element （位于头文件<algorithm>中）:

#include <iostream>
#include <vector>
#include <algorithm>
#include <functional>

// Demonstrates std::nth_element: once with the default ordering to read the
// element at the middle position, once with a descending comparator to read
// the element at index 1.
int main() {
    std::vector<int> values{5, 6, 4, 3, 2, 6, 7, 9, 3};

    // After this call the middle slot holds the value it would hold if the
    // whole vector were sorted ascending.
    const auto mid = values.size() / 2;
    std::nth_element(values.begin(), values.begin() + mid, values.end());
    std::cout << "The median is " << values[mid] << '\n';

    // With std::greater the ordering is descending, so slot 1 receives the
    // second largest value.
    const auto second = values.begin() + 1;
    std::nth_element(values.begin(), second, values.end(), std::greater<int>());
    std::cout << "The second largest element is " << *second << '\n';
}

The median is 5
The second largest element is 7

好了，言归正传。BFPRT算法主要由两部分组成（快排、基准选取函数）。基准选取函数也就是中位数的中位数算法（Median of Medians algorithm）的实现，具体来说——就是将快排中基准选取策略进行了优化，改为每次尽可能的选择中位数作为基准。

那么是如何尽可能的选出中位数？如果要找到一个精确的中位数，所消耗的时间代价将得不偿失，而且对于快排算法来说，只要基准尽可能的接近真正的中位数，就能形成近似的均分。我在上一篇文章中举了个例子，这里我再重复一遍：

假设，我们要找arr[18]的近似中位数——其实，也就是找到数字8。（注意到，由于使用了分组，这里产生的只是尽可能的中位数）

int arr[18] = { 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18 };

BFPRT算法规定了，将5个元素作为一组（选出中位数）。然后，再选出中位数中的中位数…一直到，最终选出一个数字。

那么，第一轮就是这样（将选出的中位数放置在最前面）：

//执行momedians之前
{ 1,2,3,4,5} {6,7,8,9,10} {11,12,13,14,15} {16,17,18 };

分别对每组sort后选取小组中位数，再使用swap将小组中位数放置在“最前面”对应位置（注意下文中*号的标注，它表示这一步选出的小组中位数放置到哪儿去了）。

//开始momedians
{ 3*,2,1*,4,5};//sort->swap(3,1)
{ 3,8*,1,4,5} {6,7,2*,9,10};//sort->swap(8,2)
{ 3,8,13*,4,5} {6,7,2,9,10} {11,12,1*,14,15};//sort->swap(13,1)
{ 3,8,13,17*,5} {6,7,2,9,10} {11,12,1,14,15} {16,4*,18 };//sort->swap(17,4)

同样，第二轮初始时就成如下样子了，很显然已经少于5个数字了：

//执行momedians之前
{ 3,8,13,17};

直接选出中位数8。

可以将上述过程使用C++代码描述如下（对于5个数字的排序，使用Insert sort可能会效率更高）。再次强调，下面这段代码唯一的作用，就是用来选出每次快排的基准：

/// Median-of-medians pivot selection over the INCLUSIVE range [low, high].
///
/// The range is cut into consecutive groups of at most five elements. Each
/// group is sorted, its median is swapped to the front of the range
/// (low[0], low[1], ...), and the procedure is repeated on the collected
/// medians until at most two elements remain.
///
/// Side effect: the elements of [low, high] are reordered (only by sorting and
/// swapping, so nothing is added or lost). On return the selected value is
/// stored at *low.
///
/// NOTE(review): the recursion re-applies the same grouping heuristic to the
/// medians instead of running an exact selection on them, so the result is an
/// approximate median.
///
/// @param low   pointer to the first element; requires low <= high
/// @param high  pointer to the LAST element (inclusive, not one-past-the-end)
/// @return      the selected pivot value (equal to *low on return)
int momedians(int* const low, int* const high) {
	// One or two elements left: take the first one.
	if (high - low <= 1) {
		return *low;
	}
	// The author also experimented with a group length of sqrt(high - low).
	const int grp_length = 5;
	int grp_idx = 0;  // number of medians collected at the front so far
	int* l = low;
	while (l <= high) {
		// r is the last element of the current group (inclusive); the final
		// group may be shorter than grp_length.
		int* const r = (high - l < grp_length) ? high : l + (grp_length - 1);
		// std::sort takes a half-open range, hence r + 1.
		// An insertion sort over [l, r] would work here as well.
		std::sort(l, r + 1);
		std::swap(*(low + grp_idx), *(l + (r - l) / 2));
		++grp_idx;
		if (r == high) {
			break;  // avoids forming a pointer beyond one-past-the-end
		}
		l = r + 1;
	}
	// The medians occupy low[0 .. grp_idx-1]; keep the bound inclusive.
	return momedians(low, low + grp_idx - 1);
}

写到这里已经将BFPRT算法核心介绍完毕了。

如果想测试使用Insert sort是否会带来效率上提升的小伙伴，可以试试下面这段代码isort，尝试着替换文中的std::sort排序函数。

/// Insertion sort, ascending, over the INCLUSIVE range [low, high].
///
/// @param low   pointer to the first element
/// @param high  pointer to the LAST element (inclusive)
void isort(int* const low, int* const high) {
	for (int* i = low + 1; i <= high; ++i) {
		if (*i < *(i - 1)) {
			const int border = *i;  // value being inserted
			int* hole = i;          // slot that will receive border
			// Shift larger elements one step right. The hole > low guard stops
			// the scan at the start of the range, so nothing in front of low
			// is ever read or overwritten.
			while (hole > low && border < *(hole - 1)) {
				*hole = *(hole - 1);
				--hole;
			}
			*hole = border;
		}
	}
}

让我们的思维再次回到momedians函数。从上述代码中可以看到grp_length = 5是一个固定值。那么，在BFPRT算法中，为什么是选5个作为分组？这个我也不是很明白，我也尝试使用sqrt(数组长度)作为分组的长度。

有兴趣的可以阅读 ACdreamer 的一篇《BFPRT算法原理》，他在文章结尾处，对为什么使用5作为固定分组长度进行了简单说明。同时，还附有BFPRT算法的最坏时间复杂度为 $O(n)$ 的证明。

好了，以下为完整的“BFPRT算法：时间复杂度 $O(n)$ 求第k小的数字”代码。如上文所说，算法主体功能是快排，只是在基准选取的时候使用了momedians算法——而不是直接取第一个数作为基准（严蔚敏版教材中的做法）。

#include <iostream>
#include <algorithm>
using namespace std;

/// Median-of-medians pivot selection over the INCLUSIVE range [low, high].
///
/// The range is cut into consecutive groups of at most five elements. Each
/// group is sorted, its median is swapped to the front of the range
/// (low[0], low[1], ...), and the procedure is repeated on the collected
/// medians until at most two elements remain.
///
/// Side effect: the elements of [low, high] are reordered (only by sorting and
/// swapping, so nothing is added or lost). On return the selected value is
/// stored at *low.
///
/// NOTE(review): the recursion re-applies the same grouping heuristic to the
/// medians instead of running an exact selection on them, so the result is an
/// approximate median.
///
/// @param low   pointer to the first element; requires low <= high
/// @param high  pointer to the LAST element (inclusive, not one-past-the-end)
/// @return      the selected pivot value (equal to *low on return)
int momedians(int* const low, int* const high) {
	// One or two elements left: take the first one.
	if (high - low <= 1) {
		return *low;
	}
	const int grp_length = 5;
	int grp_idx = 0;  // number of medians collected at the front so far
	int* l = low;
	while (l <= high) {
		// r is the last element of the current group (inclusive); the final
		// group may be shorter than grp_length.
		int* const r = (high - l < grp_length) ? high : l + (grp_length - 1);
		// std::sort takes a half-open range, hence r + 1.
		std::sort(l, r + 1);
		std::swap(*(low + grp_idx), *(l + (r - l) / 2));
		++grp_idx;
		if (r == high) {
			break;  // avoids forming a pointer beyond one-past-the-end
		}
		l = r + 1;
	}
	// The medians occupy low[0 .. grp_idx-1]; keep the bound inclusive.
	return momedians(low, low + grp_idx - 1);
}

/// Quick-select driver over the INCLUSIVE range [low, high].
///
/// Repeatedly partitions the current sub-range around a pivot obtained from
/// momedians() and narrows the sub-range towards ptrk until the pivot's final
/// slot is exactly ptrk.
///
/// @param low   pointer to the first element
/// @param high  pointer to the last element (inclusive)
/// @param ptrk  slot whose sorted-order value is wanted
/// @return      the value stored at ptrk once that slot has been settled
int qsort(int* const low, int* const high, int* const ptrk) {
	int* first = low;
	int* last = high;
	for (;;) {
		int* left = first;
		int* right = last;
		if (left < right) {
			// The pivot value is taken out of the range here; the partition
			// below treats *left as the initial free slot.
			const int pivot = momedians(left, right);
			while (left < right) {
				// Scan from the right for a value smaller than the pivot.
				while (left < right && *right >= pivot) {
					--right;
				}
				*left = *right;
				// Scan from the left for a value greater than the pivot.
				while (left < right && *left <= pivot) {
					++left;
				}
				*right = *left;
			}
			*right = pivot;
		}
		// right now marks the settled slot of this round.
		if (right == ptrk) {
			return *ptrk;
		}
		if (right > ptrk) {
			last = right - 1;
		} else {
			first = right + 1;
		}
	}
}

/// Entry point of the selection routine (named after Blum, Floyd, Pratt,
/// Rivest and Tarjan): returns the k-th smallest value of the half-open range
/// [low, high). The range is reordered in the process.
///
/// @param low   pointer to the first element
/// @param high  pointer one past the last element
/// @param k     1-based rank, 1 <= k <= high - low
/// @return      the value of rank k
/// @throws std::invalid_argument if the range is empty/reversed or k is out of
///         range
int bfprt(int* const low, int* const high, const int k = 1) {
	if (low >= high) {
		throw std::invalid_argument("bfprt: empty or reversed range (requires low < high)");
	}
	if (k < 1 || k > high - low) {
		throw std::invalid_argument("bfprt: k out of range (requires 1 <= k <= high - low)");
	}
	// qsort works on an inclusive range, so pass the last element and the
	// 0-based slot of rank k.
	return qsort(low, high - 1, low + k - 1);
}

// Runs bfprt on the 18-element sample array with k equal to the element count.
int main() {
	int arr[18] = { 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18 };
	const int count = static_cast<int>(sizeof(arr) / sizeof(arr[0]));
	int* const first = arr;

	// Single-element variant kept for reference: bfprt(first, first + 1, 1)
	cout << bfprt(first, first + count, count) << endl;

	return 0;
}

以上的代码是针对int数组编写的，下面再测试一下对于其他类型的支持情况（这里直接粗暴的加上了一个模板）。有一点想强调的是，这里的bfprt函数参数是左闭右开区间[low,high)，同时k必须是从1开始的正整数。

/// Entry point of the selection routine (named after Blum, Floyd, Pratt,
/// Rivest and Tarjan): returns the k-th smallest value of the half-open range
/// [low, high). The range is reordered in the process.
///
/// @tparam T    element type
/// @param low   pointer to the first element
/// @param high  pointer one past the last element
/// @param k     1-based rank, 1 <= k <= high - low
/// @return      the value of rank k
/// @throws std::invalid_argument if the range is empty/reversed or k is out of
///         range
template<typename T>
T bfprt(T* const low, T* const high, const int k = 1) {
	if (low >= high) {
		throw std::invalid_argument("bfprt: empty or reversed range (requires low < high)");
	}
	if (k < 1 || k > high - low) {
		throw std::invalid_argument("bfprt: k out of range (requires 1 <= k <= high - low)");
	}
	// qsort works on an inclusive range, so pass the last element and the
	// 0-based slot of rank k.
	return qsort(low, high - 1, low + k - 1);
}

在main函数中对int、char、string类型进行了测试。完整代码如下…

#include <iostream>
#include <algorithm>
using namespace std;

/// Median-of-medians pivot selection over the INCLUSIVE range [low, high].
///
/// The range is cut into consecutive groups of at most five elements. Each
/// group is sorted, its median is swapped to the front of the range
/// (low[0], low[1], ...), and the procedure is repeated on the collected
/// medians until at most two elements remain.
///
/// Side effect: the elements of [low, high] are reordered (only by sorting and
/// swapping, so nothing is added or lost). On return the selected value is
/// stored at *low.
///
/// NOTE(review): the recursion re-applies the same grouping heuristic to the
/// medians instead of running an exact selection on them, so the result is an
/// approximate median.
///
/// @tparam T    element type; must work with std::sort and std::swap
/// @param low   pointer to the first element; requires low <= high
/// @param high  pointer to the LAST element (inclusive, not one-past-the-end)
/// @return      the selected pivot value (equal to *low on return)
template<typename T>
T momedians(T* const low, T* const high) {
	// One or two elements left: take the first one.
	if (high - low <= 1) {
		return *low;
	}
	const int grp_length = 5;
	int grp_idx = 0;  // number of medians collected at the front so far
	T* l = low;
	while (l <= high) {
		// r is the last element of the current group (inclusive); the final
		// group may be shorter than grp_length.
		T* const r = (high - l < grp_length) ? high : l + (grp_length - 1);
		// std::sort takes a half-open range, hence r + 1.
		std::sort(l, r + 1);
		std::swap(*(low + grp_idx), *(l + (r - l) / 2));
		++grp_idx;
		if (r == high) {
			break;  // avoids forming a pointer beyond one-past-the-end
		}
		l = r + 1;
	}
	// The medians occupy low[0 .. grp_idx-1]; keep the bound inclusive.
	return momedians(low, low + grp_idx - 1);
}

/// Quick-select driver over the INCLUSIVE range [low, high].
///
/// Repeatedly partitions the current sub-range around a pivot obtained from
/// momedians() and narrows the sub-range towards ptrk until the pivot's final
/// slot is exactly ptrk.
///
/// @tparam T    element type
/// @param low   pointer to the first element
/// @param high  pointer to the last element (inclusive)
/// @param ptrk  slot whose sorted-order value is wanted
/// @return      the value stored at ptrk once that slot has been settled
template<typename T>
T qsort(T* const low, T* const high, T* const ptrk) {
	T* first = low;
	T* last = high;
	for (;;) {
		T* left = first;
		T* right = last;
		if (left < right) {
			// The pivot value is taken out of the range here; the partition
			// below treats *left as the initial free slot.
			T pivot = momedians(left, right);
			while (left < right) {
				// Scan from the right for a value smaller than the pivot.
				while (left < right && *right >= pivot) {
					--right;
				}
				*left = *right;
				// Scan from the left for a value greater than the pivot.
				while (left < right && *left <= pivot) {
					++left;
				}
				*right = *left;
			}
			*right = pivot;
		}
		// right now marks the settled slot of this round.
		if (right == ptrk) {
			return *ptrk;
		}
		if (right > ptrk) {
			last = right - 1;
		} else {
			first = right + 1;
		}
	}
}

/// Entry point of the selection routine (named after Blum, Floyd, Pratt,
/// Rivest and Tarjan): returns the k-th smallest value of the half-open range
/// [low, high). The range is reordered in the process.
///
/// @tparam T    element type
/// @param low   pointer to the first element
/// @param high  pointer one past the last element
/// @param k     1-based rank, 1 <= k <= high - low
/// @return      the value of rank k
/// @throws std::invalid_argument if the range is empty/reversed or k is out of
///         range
template<typename T>
T bfprt(T* const low, T* const high, const int k = 1) {
	if (low >= high) {
		throw std::invalid_argument("bfprt: empty or reversed range (requires low < high)");
	}
	if (k < 1 || k > high - low) {
		throw std::invalid_argument("bfprt: k out of range (requires 1 <= k <= high - low)");
	}
	// qsort works on an inclusive range, so pass the last element and the
	// 0-based slot of rank k.
	return qsort(low, high - 1, low + k - 1);
}

// Exercises the templated bfprt on an int array, a char array and the
// character buffer of a std::string.
int main() {
	// int: k equals the number of elements.
	int arr[18] = { 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18 };
	int* const arr_first = arr;
	cout << bfprt(arr_first, arr_first + 18, 18) << endl;

	// char: rank 4 of seven letters.
	char str[7] = {'a','b','c','d','e','f','g'};
	char* const str_first = str;
	cout << bfprt(str_first, str_first + 7, 4) << endl;

	// std::string: operate directly on the string's own storage.
	string s = "abcdefghijklmnopqrstuvwxyz";
	char* const s_first = &s[0];
	cout << bfprt(s_first, s_first + 26, 10) << endl;

	return 0;
}

References:
[1] 为径，分治算法求第k小元素 $O(n)$ & $O(n\log_2 n)$ ，2018-12-25
[2] ACdreamer，BFPRT算法原理，2018-12-25
[3] STL nth_element() , 2018-12-25

BFPRT算法：时间复杂度O(n)求第k小的数字（分治算法+快排）

猜你喜欢