动手实践PHP7的HashTable

参照 PHP7 的 HashTable 实现思路,用 C 写一个精简版,以加深对 hashtable 数据结构的理解。

PHP7 HashTable数据结构:

/*
 * HashTable Data Layout
 * =====================
 *
 *                 +=============================+
 *                 | HT_HASH(ht, ht->nTableMask) |
 *                 | ...                         |
 *                 | HT_HASH(ht, -1)             |
 *                 +-----------------------------+
 * ht->arData ---> | Bucket[0]                   |
 *                 | ...                         |
 *                 | Bucket[ht->nTableSize-1]    |
 *                 +=============================+
 */

详细分析参见:PHP7 HashTable源码分析

代码实现:

1.基本数据结构

// A tagged value stored in a bucket: `type` says which member of `v` is live.
typedef struct bucket_val_t {
    // one of the HASH_VAL_TYPE_* tags
    short type;
    union {
        // integer payload (presumably paired with HASH_VAL_TYPE_LONG)
        long d;
        // heap-allocated string, released by hash_free_bucket_val
        char *str;
        // nested table, released through hash_free
        struct hashtable_t *arr;
    } v;
} bucket_val;

// One key/value entry living in the bucket array of a table.
typedef struct bucket_t {
    // hash of `key`; -1 marks a bucket that is unused or has been released
    unsigned long h;
    // heap copy of the key string
    char *key;
    // the stored value
    bucket_val *val;
    // index of the next bucket in the same collision chain, -1 at the end
    size_t next;
} bucket;

// The table header; hash slots and buckets live in one separate allocation.
typedef struct hashtable_t {
    // number of hash slots, and also the number of buckets
    size_t cap;
    // -cap; OR-ed with a hash value to obtain the (negative) slot offset
    size_t sizemask;
    // number of live entries
    size_t used;
    // index of the next bucket to hand out
    size_t next;
    // bucket array; the `cap` hash slots sit in memory directly before it
    bucket *arrData;
} hashtable;

2.功能清单

// Create an empty table; cap is rounded by hash_recap. NULL on allocation failure.
hashtable *new_hash_table(size_t cap);
// Set `cap` hash slots at `data` to -1 and reset the `cap` buckets that follow them.
void init_hash_data(size_t *data,size_t cap);
// Additive string hash: 1234 plus the sum of the characters of key.
unsigned long hash(char *key);

// Look up key; returns the stored value, or NULL when ht is NULL or key is absent.
bucket_val *hash_get(hashtable *ht,char *key);
// Store a copy of val (as a string value) under a copy of key; 0 on success, negative on failure.
int hash_set(hashtable *ht,char *key,char *val);
// Returns 1 when key is found, 0 when ht is NULL or the key's hash slot is empty.
unsigned short hash_exists(hashtable *ht,char *key);
// Unlink the entry for key from its chain and release its bucket; always returns 0.
int hash_remove(hashtable *ht,char *key);

// Double the capacity and rebuild the table; 0 on success, -1 when allocation fails.
int hash_resize(hashtable *ht);
// Rebuild hash slots and chains from the bucket array, filling holes left by removals.
void hash_rehash(hashtable *ht);
// Smallest power of two >= cap, never less than HASH_TABLE_INIT_CAP.
size_t hash_recap(size_t cap);
// Copy the first `count` buckets from src to dest, skipping buckets whose h is -1.
void hash_copy_bucket(bucket* dest,bucket* src,size_t count);

// Reset a bucket to the unused state; with freeval == '1' the stored value is released too.
void hash_free_bucket(bucket *pb,char freeval);
// Release a value together with its payload, chosen by the value's type tag.
void hash_free_bucket_val(bucket_val *pval);
// Release every bucket, the slot/bucket allocation and the table itself.
void hash_free(hashtable *ht);

3.源码

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
//HashTable
// Smallest capacity a table is ever created with (see hash_recap).
#define HASH_TABLE_INIT_CAP (8)

// Tags stored in bucket_val.type to say which union member is in use.
#define HASH_VAL_TYPE_STR (1<<1)
#define HASH_VAL_TYPE_LONG (1<<2)
#define HASH_VAL_TYPE_ARR (1<<3)

// Every macro argument is parenthesized so that expressions such as `&table`
// or `cond ? a : b` expand with the intended precedence.

// Start of the allocation: the `cap` hash slots sit directly in front of
// arrData, so step back `cap` size_t entries from it.
#define HASH_DATA_START(ht) ((size_t *)(ht)->arrData - (ht)->cap)
// Mask for a table with `cap` slots: -cap in unsigned arithmetic.
#define HASH_SIZEMASK(ht) (-((ht)->cap))
// Map hash value h to a slot offset. With a power-of-two cap the result is
// one of -cap..-1 (as a wrapped size_t), to be added to (size_t *)arrData.
#define HASH_OFFSET(ht,h) ((h) | (ht)->sizemask)

// A tagged value stored in a bucket: `type` says which member of `v` is live.
typedef struct bucket_val_t {
    // one of the HASH_VAL_TYPE_* tags
    short type;
    union {
        // integer payload (presumably paired with HASH_VAL_TYPE_LONG)
        long d;
        // heap-allocated string, released by hash_free_bucket_val
        char *str;
        // nested table, released through hash_free
        struct hashtable_t *arr;
    } v;
} bucket_val;

// One key/value entry living in the bucket array of a table.
typedef struct bucket_t {
    // hash of `key`; -1 marks a bucket that is unused or has been released
    unsigned long h;
    // heap copy of the key string
    char *key;
    // the stored value
    bucket_val *val;
    // index of the next bucket in the same collision chain, -1 at the end
    size_t next;
} bucket;

// The table header; hash slots and buckets live in one separate allocation.
typedef struct hashtable_t {
    // number of hash slots, and also the number of buckets
    size_t cap;
    // -cap; OR-ed with a hash value to obtain the (negative) slot offset
    size_t sizemask;
    // number of live entries
    size_t used;
    // index of the next bucket to hand out
    size_t next;
    // bucket array; the `cap` hash slots sit in memory directly before it
    bucket *arrData;
} hashtable;


// Create an empty table; cap is rounded by hash_recap. NULL on allocation failure.
hashtable *new_hash_table(size_t cap);
// Set `cap` hash slots at `data` to -1 and reset the `cap` buckets that follow them.
void init_hash_data(size_t *data,size_t cap);
// Additive string hash: 1234 plus the sum of the characters of key.
unsigned long hash(char *key);

// Look up key; returns the stored value, or NULL when ht is NULL or key is absent.
bucket_val *hash_get(hashtable *ht,char *key);
// Store a copy of val (as a string value) under a copy of key; 0 on success, negative on failure.
int hash_set(hashtable *ht,char *key,char *val);
// Returns 1 when key is found, 0 when ht is NULL or the key's hash slot is empty.
unsigned short hash_exists(hashtable *ht,char *key);
// Unlink the entry for key from its chain and release its bucket; always returns 0.
int hash_remove(hashtable *ht,char *key);

// Double the capacity and rebuild the table; 0 on success, -1 when allocation fails.
int hash_resize(hashtable *ht);
// Rebuild hash slots and chains from the bucket array, filling holes left by removals.
void hash_rehash(hashtable *ht);
// Smallest power of two >= cap, never less than HASH_TABLE_INIT_CAP.
size_t hash_recap(size_t cap);
// Copy the first `count` buckets from src to dest, skipping buckets whose h is -1.
void hash_copy_bucket(bucket* dest,bucket* src,size_t count);

// Reset a bucket to the unused state; with freeval == '1' the stored value is released too.
void hash_free_bucket(bucket *pb,char freeval);
// Release a value together with its payload, chosen by the value's type tag.
void hash_free_bucket_val(bucket_val *pval);
// Release every bucket, the slot/bucket allocation and the table itself.
void hash_free(hashtable *ht);
(以上为 hashtable.h 的内容)
#include "hashtable.h"

/*
 * Create an empty hash table.
 *
 * The requested capacity is first rounded by hash_recap(). The hash slots
 * and the buckets share one allocation: `cap` size_t slots followed by
 * `cap` buckets, with arrData pointing at the first bucket.
 *
 * Returns the new table, or NULL when either allocation fails.
 */
hashtable *new_hash_table(size_t cap){
    size_t slots = hash_recap(cap);

    hashtable *table = malloc(sizeof *table);
    if(!table){
        return NULL;
    }

    size_t *block = malloc(slots * (sizeof(size_t) + sizeof(bucket)));
    if(!block){
        free(table);
        return NULL;
    }
    init_hash_data(block,slots);

    table->arrData = (bucket *)(block + slots);
    table->cap = slots;
    table->sizemask = HASH_SIZEMASK(table);
    table->used = 0;
    table->next = 0;
    return table;
}

/*
 * Put a freshly allocated slot/bucket block into the "empty" state.
 *
 * data: start of the block; holds `cap` size_t hash slots immediately
 *       followed by `cap` buckets.
 * cap:  number of slots (and of buckets).
 *
 * Every slot is set to -1 (empty chain) and every bucket is marked unused
 * (h == -1, next == -1, no key, no value). A NULL block is ignored.
 */
void init_hash_data(size_t *data,size_t cap){
    if(data == NULL){
        return;
    }
    bucket *arrData = (bucket *)(data + cap);
    // size_t index: an int counter would overflow for capacities above INT_MAX
    for(size_t i = 0; i < cap; ++i){
        data[i] = (size_t)-1;
        arrData[i].h = (unsigned long)-1;
        arrData[i].key = NULL;
        arrData[i].val = NULL;
        arrData[i].next = (size_t)-1;
    }
}
/*
 * Additive string hash: starts from 1234 and adds every character of the
 * NUL-terminated key. Strings made of the same characters in a different
 * order therefore hash to the same value.
 */
unsigned long hash(char *key){
    unsigned long sum = 1234;
    for(char *p = key; *p != '\0'; ++p){
        int ch = *p;
        sum = sum + ch;
    }
    return sum;
}

/*
 * Store a copy of the string `val` under a copy of `key`.
 *
 * If the key is already present, its string payload is replaced in place.
 * Otherwise a new bucket is taken from the end of the bucket array and put
 * at the head of the key's collision chain. The table is grown with
 * hash_resize() when only one free bucket is left.
 *
 * Prints a debug trace of the insertion to stdout.
 *
 * Returns 0 on success and a negative value when an argument is NULL, an
 * allocation fails, or the resize fails. Nothing is leaked on failure.
 */
int hash_set(hashtable *ht,char *key,char *val){
    if(ht == NULL || key == NULL || val == NULL){
        return -1;
    }

    char *v = malloc(strlen(val) + 1);
    if(v == NULL){
        return -1;
    }
    strcpy(v,val);

    // key already exists: swap the string payload
    // (values stored through this function are always strings)
    bucket_val *bval = hash_get(ht,key);
    if(bval != NULL){
        free(bval->v.str);
        bval->v.str = v;
        return 0;
    }

    // key not exists: build everything the new bucket will own first, so a
    // failure below can release all of it in one place
    char *k = malloc(strlen(key) + 1);
    bval = malloc(sizeof *bval);
    if(k == NULL || bval == NULL){
        free(bval);
        free(k);
        free(v);
        return -1;
    }
    strcpy(k,key);
    bval->type = HASH_VAL_TYPE_STR;
    bval->v.str = v;

    if(ht->next >= ht->cap - 1){
        printf("prepare to resize:%zu\n",ht->next);
        int res = hash_resize(ht);
        if(res < 0){
            free(bval);
            free(k);
            free(v);
            return res;
        }
    }

    // the slot must be computed after a possible resize (the mask changes)
    unsigned long h = hash(key);
    size_t *pidx = (size_t *)ht->arrData + HASH_OFFSET(ht,h);
    size_t idx = ht->next++;
    bucket *pb = ht->arrData + idx;
    pb->h = h;
    pb->key = k;
    pb->val = bval;
    pb->next = *pidx;   // previous chain head, or -1 when the slot was empty
    *pidx = idx;
    ht->used++;
    // next is shown as a signed number so that the -1 sentinel stays readable
    printf("hash_set(%s)idx:%zu,h:%lu,next:%ld\n",pb->key,*pidx,pb->h,(long)pb->next);
    return 0;
}

/*
 * Look up `key` and return the value stored for it.
 *
 * Follows the collision chain that starts at the key's hash slot. The walk
 * stops at the end of the chain or at a bucket marked unused (h == -1).
 *
 * Returns NULL when ht is NULL or no bucket in the chain has this key.
 */
bucket_val *hash_get(hashtable *ht,char *key){
    if(!ht){
        return NULL;
    }

    const size_t none = (size_t)-1;
    size_t *slots = (size_t *)ht->arrData;
    size_t cur = *(slots + HASH_OFFSET(ht,hash(key)));

    for(; cur != none; cur = ht->arrData[cur].next){
        bucket *entry = ht->arrData + cur;
        if(entry->h == (unsigned long)-1){
            break;
        }
        if(strcmp(entry->key,key) == 0){
            return entry->val;
        }
    }
    return NULL;
}

/*
 * Tell whether `key` is present in the table.
 *
 * Walks the key's collision chain and stops at the -1 end marker (or at a
 * bucket marked unused), so a missing key in a non-empty chain is reported
 * as absent instead of running past the chain.
 *
 * Returns 1 when the key is found, 0 otherwise (also for NULL arguments).
 */
unsigned short hash_exists(hashtable *ht,char *key){
    if(ht == NULL || key == NULL){
        return 0;
    }
    size_t index = *((size_t *)ht->arrData + HASH_OFFSET(ht,hash(key)));
    while(index != (size_t)-1){
        bucket *pb = ht->arrData + index;
        if(pb->h == (unsigned long)-1){
            break;  // released bucket: nothing valid behind it
        }
        if(strcmp(pb->key,key) == 0){
            return 1;
        }
        index = pb->next;
    }
    return 0;
}

/*
 * Remove the entry stored under `key`, if there is one.
 *
 * The key's collision chain is walked once. The matching bucket is unlinked
 * (from its predecessor, or from the hash slot when it is the chain head),
 * released with hash_free_bucket() and `used` is decremented. The bucket
 * stays in the array as a hole.
 *
 * Always returns 0, whether or not the key was present.
 */
int hash_remove(hashtable *ht,char *key){
    if(ht == NULL || key == NULL){
        return 0;
    }
    size_t *pidx = (size_t *)ht->arrData + HASH_OFFSET(ht,hash(key));
    bucket *preb = NULL;
    size_t index = *pidx;
    while(index != (size_t)-1){
        bucket *pb = ht->arrData + index;
        if(pb->h == (unsigned long)-1){
            break;  // released bucket: nothing valid behind it
        }
        if(strcmp(pb->key,key) == 0){
            // read pb->next before the bucket is reset below
            if(preb != NULL){
                preb->next = pb->next;
            }else{
                *pidx = pb->next;
            }
            hash_free_bucket(pb,'1');
            ht->used--;
            return 0;
        }
        preb = pb;
        index = pb->next;
    }
    return 0;
}

/*
 * Put a bucket back into the unused state (h == -1, next == -1, no key,
 * no value).
 *
 * freeval: '1' releases the key string and the value the bucket owns;
 *          any other character only clears the fields, for a bucket whose
 *          contents were moved elsewhere.
 *
 * A NULL bucket is ignored.
 */
void hash_free_bucket(bucket *pb,char freeval){
    if(pb == NULL){
        return;
    }
    if(freeval == '1'){
        // release before the pointers are cleared, otherwise the key leaks
        free(pb->key);
        hash_free_bucket_val(pb->val);
    }
    pb->h = (unsigned long)-1;
    pb->key = NULL;
    pb->val = NULL;
    pb->next = (size_t)-1;
}

/*
 * Release a value and whatever payload it owns.
 *
 * String values free their string, array values free the nested table via
 * hash_free(), long values own nothing. An unrecognised type tag is
 * reported on stdout. The value struct itself is freed in every case.
 * A NULL value is ignored.
 */
void hash_free_bucket_val(bucket_val *pval){
    if(!pval){
        return;
    }

    const short tag = pval->type;
    if(tag == HASH_VAL_TYPE_STR){
        free(pval->v.str);
    }else if(tag == HASH_VAL_TYPE_ARR){
        hash_free(pval->v.arr);
    }else if(tag != HASH_VAL_TYPE_LONG){
        printf("unknown bucket value type.\n");
    }

    free(pval);
}

/*
 * Destroy a table: release every bucket handed out so far (indices below
 * ht->next), then the shared slot/bucket allocation, then the table.
 * A NULL table is ignored, which also makes freeing an array value whose
 * nested table pointer is NULL safe.
 */
void hash_free(hashtable *ht){
    if(ht == NULL){
        return;
    }
    for(size_t idx = 0; idx < ht->next; ++idx){
        hash_free_bucket(ht->arrData + idx,'1');
    }
    free(HASH_DATA_START(ht));
    free(ht);
}

/*
 * Round a requested capacity up to the smallest power of two that is
 * greater than or equal to it, never going below HASH_TABLE_INIT_CAP.
 *
 * The loop clears the lowest set bit on each pass, so `last` ends up as
 * the highest set bit and `count` as the number of set bits. Exactly one
 * set bit means cap already is a power of two.
 *
 * All arithmetic is done in size_t so large capacities are not truncated.
 * NOTE: for a non-power-of-two cap above SIZE_MAX/2 the result is not
 * representable and the shift wraps to 0.
 */
size_t hash_recap(size_t cap){
    if(cap <= HASH_TABLE_INIT_CAP){
        return HASH_TABLE_INIT_CAP;
    }

    size_t last = 0;
    int count = 0;
    while(cap > 0){
        last = cap;
        cap &= cap - 1;
        count++;
    }

    return count == 1 ? last : last << 1;
}

/*
 * Double the capacity of a table.
 *
 * A new slot/bucket block is allocated and initialised, the buckets handed
 * out so far are copied over, and hash_rehash() rebuilds the hash slots
 * and chains for the new mask. The old block is freed afterwards; keys and
 * values are carried over by pointer, not copied.
 *
 * Prints the new capacity to stdout.
 *
 * Returns 0 on success, and -1 when ht is NULL, the doubled size would
 * overflow size_t, or the allocation fails (the table is left unchanged).
 */
int hash_resize(hashtable *ht){
    if(ht == NULL){
        // a table cannot be created through a by-value pointer; report it
        return -1;
    }
    if(ht->cap > ((size_t)-1) / 2 / (sizeof(size_t) + sizeof(bucket))){
        return -1;
    }
    size_t cap = ht->cap << 1;
    printf("new cap:%zu\n",cap);
    size_t *data = malloc(cap * (sizeof(size_t) + sizeof(bucket)));
    if(data == NULL){
        return -1;
    }
    // remember the old block before cap/arrData are overwritten
    size_t *origdata = HASH_DATA_START(ht);
    init_hash_data(data,cap);
    hash_copy_bucket((bucket *)(data + cap),ht->arrData,ht->next);
    ht->cap = cap;
    ht->sizemask = HASH_SIZEMASK(ht);
    ht->arrData = (bucket *)(data + cap);
    hash_rehash(ht);
    free(origdata);
    return 0;
}

/*
 * Copy the first `count` buckets of src to the same positions in dest.
 * Buckets marked unused (h == -1) are skipped, so the matching dest bucket
 * keeps whatever it held before. The copy is shallow: key and value
 * pointers are shared with the source bucket.
 */
void hash_copy_bucket(bucket* dest,bucket* src,size_t count){
    bucket *stop = src + count;
    for(; src != stop; ++src, ++dest){
        if(src->h != (unsigned long)-1){
            *dest = *src;
        }
    }
}

/*
 * Rebuild the hash slots and collision chains from the bucket array.
 *
 * Expects every hash slot to hold -1 already (hash_resize arranges this
 * through init_hash_data before calling).
 *
 * Live buckets are compacted towards the front in one pass, keeping their
 * order, so holes left by removals disappear. Each live bucket is then put
 * at the head of the chain for its slot under the current mask. Its `next`
 * is always overwritten, so no link from the previous layout survives.
 * Afterwards ht->next is the number of live buckets.
 */
void hash_rehash(hashtable *ht){
    if(ht == NULL){
        return;
    }
    size_t live = 0;    // next position to fill
    for(size_t idx = 0; idx < ht->next; ++idx){
        bucket *src = ht->arrData + idx;
        if(src->h == (unsigned long)-1){
            continue;   // hole
        }
        bucket *pb = ht->arrData + live;
        if(pb != src){
            // move down into the hole; '0' clears src without freeing
            // the key and value that pb now owns
            *pb = *src;
            hash_free_bucket(src,'0');
        }
        size_t *pidx = (size_t *)ht->arrData + HASH_OFFSET(ht,pb->h);
        pb->next = *pidx;   // previous head, or -1 for an empty slot
        *pidx = live;
        live++;
    }
    ht->next = live;
}
(以上为 hashtable.c 的内容)

4.测试

#include "hashtable.h"
#include <stdio.h>

/*
 * Create a table with the requested capacity and check the capacity it
 * actually got. The table is freed on every path.
 * Returns 0 on match, -1 on mismatch or when the table cannot be created.
 */
static int check_recap(size_t request,size_t expected){
    hashtable *ht = new_hash_table(request);
    if(ht == NULL){
        printf("new_hash_table(%zu) fail\n",request);
        return -1;
    }
    int res = 0;
    if(ht->cap != expected){
        printf("%zu != ht->cap=%zu\n",expected,ht->cap);
        res = -1;
    }
    hash_free(ht);
    return res;
}

/*
 * Check capacity rounding through new_hash_table: 15 -> 16, 0 -> the
 * initial capacity, 32 -> 32, 33 -> 64.
 * Returns 0 when every case matches, -1 at the first failure.
 */
int test_recap(){
    static const size_t cases[][2] = {
        {15,16},
        {0,HASH_TABLE_INIT_CAP},
        {32,32},
        {33,64},
    };
    for(size_t i = 0; i < sizeof cases / sizeof cases[0]; ++i){
        if(check_recap(cases[i][0],cases[i][1]) < 0){
            return -1;
        }
    }
    return 0;
}

/*
 * Exercise hash_set: insert "a" and "ab", overwrite "a", then insert two
 * more keys. `used` must grow on an insert and stay the same on an
 * overwrite.
 * Returns 0 on success, a negative value at the first failure.
 */
int test_set(hashtable *ht){
    int res = hash_set(ht,"a","aaa");
    if(res < 0){
        printf("hash_set(a) fail\n");
        return res;
    }
    if(ht->used != 1){
        printf("ht->used:%zu != 1\n",ht->used);
        return -1;
    }

    res = hash_set(ht,"ab","ababab");
    if(res < 0){
        printf("hash_set(ab) fail\n");
        return res;
    }
    if(ht->used != 2){
        printf("ht->used:%zu != 2\n",ht->used);
        return -1;
    }

    // overwrite of an existing key: the entry count must not change
    res = hash_set(ht,"a","aaaaaaaaaa");
    if(res < 0){
        printf("hash_set(a) fail\n");
        return res;
    }
    if(ht->used != 2){
        printf("ht->used:%zu != 2\n",ht->used);
        return -1;
    }

    res = hash_set(ht,"ccccc","cccccc");
    if(res < 0){
        printf("hash_set fail %d\n",res);
        return res;
    }

    res = hash_set(ht,"ddddddd","ddddddd");
    if(res < 0){
        printf("hash_set fail %d\n",res);
        return res;
    }

    return 0;
}

/*
 * Check that key "a" is present, holds a string value, and that the string
 * is "aaaaaaaaaa".
 * Returns 0 on success, -1 otherwise.
 */
int test_get(hashtable *ht){
    bucket_val *found = hash_get(ht,"a");
    if(!found){
        return -1;
    }

    if((found->type & HASH_VAL_TYPE_STR) == 0){
        printf("type:%d != %d\n",found->type,HASH_VAL_TYPE_STR);
        return -1;
    }

    int same = strcmp(found->v.str,"aaaaaaaaaa") == 0;
    if(same){
        return 0;
    }
    printf("value:%s != %s\n",found->v.str,"aaaaaaaaaa");
    return -1;
}

/*
 * Remove key "a" and check that hash_exists no longer reports it.
 * Returns 0 on success, a negative value otherwise.
 */
int test_remove(hashtable *ht){
    int rc = hash_remove(ht,"a");
    if(rc < 0){
        printf("hash_remove(a) fail\n");
        return rc;
    }

    unsigned short still_there = hash_exists(ht,"a");
    if(!still_there){
        return 0;
    }
    printf("hash_exists(a)=%d\n",still_there);
    return -1;
}

/*
 * Insert five more keys and check that the capacity has doubled.
 * A failing insert is reported with its 1-based position in the sequence.
 * Returns 0 on success, a negative value otherwise.
 */
int test_resize(hashtable *ht){
    static char *keys[] = {"1111","2222","3333","44444","55555"};
    static char *vals[] = {"11111","22222","33333","44444","55555"};

    if(ht == NULL){
        return -1;
    }
    size_t origcap = ht->cap;

    for(int i = 0; i < (int)(sizeof keys / sizeof keys[0]); ++i){
        int res = hash_set(ht,keys[i],vals[i]);
        if(res < 0){
            printf("hash_set fail %d\n",i + 1);
            return res;
        }
    }

    if(ht->cap != 2*origcap){
        printf("ht->cap:%zu != %zu\n",ht->cap,2*origcap);
        return -1;
    }

    return 0;
}

/*
 * Run the test functions in order on one shared table and print a PASS
 * line for each one that succeeds.
 * Exit status: 0 when everything passed, 1 when a test failed or the
 * shared table could not be created.
 */
int main(int argc,char* argv[]){
    (void)argc;
    (void)argv;
    int failed = 0;
    int res;

    res = test_recap();
    if(res == 0){
        printf("PASS hash_recap\n");
    }else{
        failed = 1;
    }

    hashtable *ht = new_hash_table(0);
    if(ht == NULL){
        // the remaining tests would dereference the table
        printf("new_hash_table fail\n");
        return 1;
    }

    res = test_set(ht);
    if(res == 0){
        printf("PASS hash_set\n");
    }else{
        failed = 1;
    }

    res = test_get(ht);
    if(res == 0){
        printf("PASS hash_get\n");
    }else{
        failed = 1;
    }

    res = test_remove(ht);
    if(res == 0){
        printf("PASS hash_remove\n");
    }else{
        failed = 1;
    }

    res = test_resize(ht);
    if(res == 0){
        printf("PASS hash_resize\n");
    }else{
        failed = 1;
    }

    hash_free(ht);
    return failed;
}
(以上为 test.c 的内容)

猜你喜欢

转载自www.cnblogs.com/ling-diary/p/10676109.html