STL笔记之hashtable

散列函数(hash function)：使用某种映射函数，将大数映射为小数，负责将某一元素映射为一个“大小可接受之索引”，这样的函数称为散列函数。

散列表(hashtable)：可提供对任何有名项（named item）的存取操作和删除操作，可被视为是一种字典结构，这种结构的用意在于提供常数时间的基本操作。

使用散列函数会带来一个问题：可能有不同的元素被映射到相同的位置（即有相同的索引）。只要元素个数大于array容量，这种情况就无法避免。这就是所谓的碰撞问题。解决碰撞问题的方法有很多种，包括线性检测(linear probing)、二次检测(quadratic probing)、开链(separate chaining)等。

线性检测

负载系数(loading factor)：元素个数除以表格大小。负载系数永远在0到1之间，除非采用开链策略。

当散列表计算出某个元素的插入位置，而该位置上的空间已不再可用时，就循序往下一一寻找（如果到达尾端，就绕到头部继续寻找），直到找到一个可用空间为止。这就是线性检测。

最坏的情况是线性巡访整个表格，平均情况是巡访一半的表格。与期望的常数时间天差地远。

主集团（primary clustering）：平均插入成本的成长幅度，远高于负载系数的成长幅度。

二次检测

二次检测主要用来解决主集团的问题。其命名由来是因为解决碰撞问题的方程式F(i)=i*i是个二次方程式。更明确地说，如果散列函数计算出新元素的位置为H，而该位置实际上已被使用，那么就依序尝试H+1,H+4,H+9,...,H+i*i，而不是像线性检测那样依序尝试H+1,H+2,H+3,...,H+i。

开链

开链法是在每一个表格元素中维护一个list，hash function为我们分配某个list，然后我们在那个list身上执行元素的插入、搜寻、删除等操作。虽然针对list而进行的搜寻只能是一种线性操作，但如果list够短，速度还是很快。

使用开链手法，表格的负载系数可以大于1。SGI STL的hashtable便是采用这种做法。

hashtable的桶子(buckets)与节点(nodes)

template<class Value>
struct __hashtable_node
{
	__hashtable_node* next;
	Value val;
};

bucket所维护的linked list，并不采用STL的list或slist，而是自行维护上述的hash table node。至于buckets聚合体，则是以vector完成，以便有动态扩充能力。

迭代器

template<class Value>
struct __hashtable_node
{
	__hashtable_node* next;
	Value val;
};

template<class Value,class Key,class HashFcn,
	class ExtractKey,class EqualKey,class Alloc>
struct __hashtable_iterator
{
	typedef hashtable<Value, Key, HashFcn, ExtractKey, EqualKey, Alloc>
		hashtable;
	typedef __hashtable_iterator<Value, Key, HashFcn, ExtractKey, EqualKey, Alloc>
		iterator;
	typedef __hashtable_const_iterator<Value, Key, HashFcn, ExtractKey, EqualKey, Alloc>
		const_iterator;
	typedef __hashtable_node<Value> node;

	typedef forward_iterator_tag iterator_category;
	typedef Value value_type;
	typedef ptrdiff_t difference_type;
	typedef size_t size_type;
	typedef Value& reference;
	typedef Value* pointer;

	node* cur;//迭代器目前所指定节点
	hashtable* ht;//保持对容器的连结关系

	__hashtable_iterator(node* n,hashtable* tab):cur(n),ht(tab){}
	__hashtable_iterator(){}
	reference operator*()const { return cur->val; }
	pointer operator->()const { return &(operator*()); }
	iterator& operator++();
	iterator opearator++(int);
	bool operator==(const iterator& it)const { return cur == it.cur; }
	bool operator!=(const iterator& it)const { return cur != it.cur; }
};

template<class V,class K,class HF,class ExK,class EqK,class A>
__hashtable_iterator<V,K,HF,ExK,EqK,A>& 
__hashtable_iterator<V, K, HF, ExK, EqK, A>::operator++()
{
	const node* old = cur;
	cur = cur->next;//如果存在，就是它
	if (!cur) {
		//根据元素值，定位出下一个bucket。其起头处就是我们的目的地
		size_type bucket = ht->bkt_num(old->val);
		while (!cur&&++bucket < ht->buckets.size())
			cur = ht->buckets[bucket];
	}
	return *this;
}

template<class V, class K, class HF, class ExK, class EqK, class A>
__hashtable_iterator<V, K, HF, ExK, EqK, A>&
__hashtable_iterator<V, K, HF, ExK, EqK, A>::operator++(int)
{
	iterator tmp = *this;
	++*this;
	return tmp;
}

hashtable的迭代器没有后退操作(operator--())，hashtable也没有所谓的逆向迭代器(reverse iterator)。

数据结构

template<class Value, class Key, class HashFcn,
	class ExtractKey, class EqualKey, class Alloc = alloc>
class hashtable;
//...
template<class Value, class Key, class HashFcn,
	class ExtractKey, class EqualKey, class Alloc>
class hashtable {
public:
	typedef HashFcn haseher;//为template型别参数重新定义一个名称
	typedef EqualKey key_equal;//为template型别重新定义一个名称
	typedef size_t size_type;

private:
	//以下三者都是函数对象
	//<stl_hash_fcn.h>中定义有数个标准型别(int,c-style string等)的hasher
	hasher hash;
	key_equal equals;
	ExtractKey get_key;

	typedef __hashtable_node<Value> node;
	typedef simple_alloc<node, Alloc> node_allocator;

	vector<node*, Alloc>buckets;
	size_type num_elements;

public:
	//bucket的个数即buckets vector的大小
	size_type bucket_count()const { return buckets.size(); }
//...
};

虽然开链法并不要求表格大小为质数，但SGI STL仍然以质数来设计表格大小，并且先将28个质数（逐渐呈现大约两倍的关系）计算好，以便随时访问，同时提供一个函数，用来查询在这28个质数之中，最接近某数并大于或等于某数的质数。

//注意：假设long至少有32bits
static const int __stl_num_primes = 28;
static const unsigned long __stl_prime_list[__stl_num_primes] = {
	53,97,193,389,769,
	1543,3079,6151,12289,24593,
	49157,98317,196613,393241,786433,
	1572869,3145739,6291469,12582917,25165843,
	50331653,100663319,201326611,402653189,805306457,
	1610612741,3221225473ul,4294967291ul
};

//找出上述28个质数中，最接近并大于或等于n的那个质数
inline unsigned long __stl_next_prime(unsigned long n)
{
	const unsigned long* first = __stl_prime_list;
	const unsigned long* last = __stl_prime_list + __stl_num_primes;
	const unsigned long* pos = lower_bound(first, last, n);//lower_bound是泛型算法
	return pos == last ? *(last - 1) : *pos;
}

//总共可以有多少buckets
size_type max_bucket_count()const
{
	return __stl_prime_list[__stl_num_primes - 1];
}

构造与内存管理

节点配置和节点释放：

	//创建新节点
	node* new_node(const value_type& obj)
	{
		node* n = node_allocator::allocate();
		n->next = 0;
		__STL_TRY{
			construct(&n->val,obj);
		return n;
		}
		__STL_UNWIND(node_allocator::deallocate(n));
	}

	//删除节点
	void delete_node(node* n)
	{
		destroy(&n->val);
		node_allocator::deallocate(n);
	}

构造函数：

hashtable(size_type n, const HashFcn& hf, const EqualKey& eql)
		:hash(hf), equal(eql), get_key(ExtractKey()), num_elements(0)
	{
		initialize_buckets(n);
	}

	void initialize_buckets(size_type n)
	{
		const size_type n_buckets = next_size(n);//返回最接近n并大于等于n的质数
		buckets.reserve(n_buckets);
		buckets.insert(buckets.end(), n_buckets, (node*)0);
		num_elements = 0;
	}

插入操作(insert)和表格重整(resize):

//插入元素，不允许重复
	pair<iterator, bool>insert_unique(const value_type& obj) {
		resize(num_elements + 1);//判断是否需要重建表格，如需要就扩充
		return insert_unique_noresize(obj);
	}

	//以下函数判断是否需要重建表格
	template<class V, class K, class HF, class Ex, class Eq, class A>
	void hashtable<V, K, HF, Ex, Eq, A>::resize(size_type num_elements_hint)
	{
	    //拿元素个数（把新增元素计入后）和bucket vector的大小来比较
		//如果前者大于后者，就重建表格
		//由此可知，每个bucket(list)的最大容量和buckets vector的大小相同
		const size_type old_n = buckets.size();
		if (num_elements_hint > old_n) {
			//确定真的需要重新配置
			const size_type n = next_size(num_elements_hint);//找出下一个质数
			if (n > old_n) {
				vector<node*, A>tmp(n, (node*)0);//设立新的buckets
				__STL_TRY{
					//以下处理每一个旧的bucket
					for (size_type bucket = 0; bucket < old_n; ++bucket)
					{
						node* first = buckets[bucket];//指向节点所对应之串行的起始节点
						//以下处理每一个旧bucket所含串行的每一个节点
						while (first) {
							//以下找出节点落在哪一个新bucket内
							size_type new_bucket = bkt_num(first->val, n);
							//以下四个操作颇为微妙
							//（1）令旧bucket指向其所对应之串行的下一个节点以便迭代处理
							buckets[bucket] = first->next;
							//(2)(3)将当前节点插入到新bucket内，成为其对应串行的第一个节点
							first->next = tmp[new_bucket];
							tmp[new_bucket] = first;
							//(4)回到旧bucket所指的待处理串行，准备下一个节点
							first = buckets[bucket];
						}
					}
				buckets.swap(tmp);//vector::swap,新旧两个buckets对调
				//注意，对调双方如果大小不同，大的会变小，小的会变大
				//离开时释放local tmp的内存
				}
			}
		}
	}

	template<class V, class K, class HF, class Ex, class Eq, class A>
	pair<typename hashtable<V,K,HF,Ex,Eq,A>::iterator,bool>
	hashtable<V, K, HF, Ex, Eq, A>::insert_unique_noresize(const value_type& obj)
	{
		const size_type n = bkt_num(obj);//决定obj应位于n号bucket
		node* first = buckets[n];//令first指向bucket对应之串行头部

		//如果buckets[n]已被占用，此时first将不为0，于是进入下列循环
		//走过bucket所对应的整个链表
		for (node* cur = first; cur; cur = cur->next)
			if (equals(get_key(cur->val), get_key(obj)))
				//如果发现与链表中的某键值相同，就不插入，立即返回
				return pair<iterator, bool>(iterator(cur, this), false);

		//离开以上循环（或根本没进入循环）时，first指向bucket所指链表的头部节点
		node* tmp = new_node(obj);//产生新节点
		tmp->next = first;
		buckets[n] = tmp;//令新节点成为链表的第一个节点
		++num_elements;//节点个数累加1
		return pair<iterator, bool>(iterator(tmp, this), true);
	}

	//插入元素，允许重复
	iterator insert_equal(const value_type& obj)
	{
		resize(num_elsements + 1);
		return insert_equal)noresize(obj);
	}

	template<class V, class K, class HF, class Ex, class Eq, class A>
	typename hashtable<V, K, HF, Ex, Eq, A>::iterator
		hashtable<V, K, HF, Ex, Eq, A>::insert_equal_noresize(const value_type& obj)
	{
		const size_type n = bkt_num(obj);//决定obj应位于n号bucket
		node* first = buckets[n];//令first指向bucket对应之串行头部
		//如果buckets[n]已被占用，此时first将不为0，于是进入下列循环
		//走过bucket所对应的整个链表
		for (node* cur = first; cur; cur = cur->next)
			if (equals(get_key(cur->val), get_key(obj))) {
				//如果发现与链表中的某键值相同，就插入，然后返回
				node* tmp = new_node(obj);
				tmp->next = cur->next;
				cur->next = tmp;
				++num_elements;
				return iterator(tmp, this);
			}
		//进行至此，没有发现重复的键值
		node* tmp = new_node(obj);//产生新节点
		tmp->next = first;
		buckets[n] = tmp;//令新节点成为链表的第一个节点
		++num_elements;//节点个数累加1
		return iterator(tmp, this);
	}

判断元素的落脚处（bkt_num）：

	//版本1：接受实值（value）和buckets个数
	size_type bkt_num(const value_type& obj, size_t)const
	{
		return bkt_num_key(get_key(obj), n);
	}
	//版本2：只接受实值（value）
	size_type bkt_num(const value_type& obj)const
	{
		return bkt_num_key(get_key(obj));
	}
	//版本3：只接受键值
	size_type bkt_num_key(const key_type& key)const
	{
		return bkt_num_key(key, buckets.size());
	}
	//版本4：接受键值和buckets个数
	size_type bkt_num_key(cosnt key_type& key, size_t n)const
	{
		return hash(key) % n;
	}

复制(copy_from)和整体删除(clear)：

	template<class V, class K, class HF, class Ex, class Eq, class A>
	void hashtable<V, K, HF, Ex, Eq, A>::clear()
	{
		//针对每一个bucket
		for (size_t i = 0; i < buckets.size(); ++i)
		{
			node* cur = buckets[i];
			//将bucket list中的每一个节点删除掉
			while (cur != 0) {
				node* next = cur->next;
				delete_node(cur);
				cur = next;
			}
			buckets[i] = 0;
		}
		num_elements = 0;
		//注意：buckets vector并未释放掉空间，仍保持原有大小
	}

	template<class V, class K, class HF, class Ex, class Eq, class A>
	void hashtable<V, K, HF, Ex, Eq, A>::copy_from(const hashtable& ht)
	{
		//先清除己方的buckets vector,调用vector::clear
		buckets.clear();
		//为己方的buckets vector保留空间，使与对方相同
		//如果己方空间大于对方，就不动，否则增大
		buckets.reserve(ht.buckets.size());
		//从己方的buckets vector 尾端开始，插入n个元素，其值为null指针
		//此时buckets vector为空,所以所谓尾端，就是起头处
		buckets.insert(buckets.end(), ht.buckets.size(), (node*)0);
		__STL_TRY{
			//针对buckets vector
			for (size_type i = 0; i < ht.buckets.size(); ++i)
			{
				//复制vector的每一个元素（是个指针，指向hashtable节点)
				if (const node* cur = ht.buckets[i]) {
					node* copy = new_node(cur->val);
					buckets[i] = copy;

					//针对同一个bucket list，复制每一个节点
					for (node* next = cur->next; next; cur = next, next = cur->next) {
						copy->next = new_node(next->val);
						copy = copy->next;
					}
				}
			}
		num_elements = ht.num_elements;
		}
		__STL_UNWIND(clear());
	}

find和count：

	iterator find(const key_type& key) {
		size_type n = bkt_num_key(key);//首先寻找在哪个bucket内
		node* first;
		//从bucket list的头开始，一一比对每个元素的值，比对成功就跳出
		for (first = buckets[n];
			first && !equals(get_key(first->val),key);
			first=first->next)
		{ }
		return iterator(first, this);
	}

	size_type count(const key_type& key)const
	{
		const size_type n = bkt_num_key(key);
		size_type result = 0;
		//以上，从bucket list的头开始，一一比对每个元素的键值，比对成功就累加1
		for (const node* cur = buckets[n]; cur; cur = cur->next)
			if (equals(get_key(cur->val), key))
				++result;
		return result;
	}

hash function

<stl_hash_fun.h>定义了数个现成的hash function，全都是仿函数。针对char，int，long等整数型别，这里大部分的hash function什么也没做，只是忠实返回原值。但对于字符串(const char*)，就设计了下面这个转换函数：

template<class Key>struct hash{};

inline size_t __stl_hash_string(const char* s)
{
	unsigned long h = 0;
	for (; *s; ++s)
		h = 5 * h + *s;
	return size_t(h);
}

SGI hashtable无法处理如string，double，float等型别的元素，用户必须自行为它们定义hash function。