类Hash结构词典

版权声明:本文为博主原创文章,未经博主允许不得转载。 https://blog.csdn.net/nlpzryyclxz/article/details/48047009

由于C语言中没有封装好的Hash(Python 字典)或红黑树(C++ STL map)结构,
因此在这里借用Hash的思想,实现了一个类似Hash的三级字典存储结构。

这里写图片描述

实现代码如下

/***********************************************************
*   File Name               : dict.c
*   Copyright               : 
*   Module Name             : 词典相关操作
*
*   CPU                     : i3-2328M CPU @ 2.20GHz
*   OS                      : Microsoft Windows XP
*
*   Create Date             : 2013/07/27
*   Author/Corporation      : 于飞
*
*   Abstract Description    : 
-----------------Revision History---------------------------
No  Version     Date    Revised By      Item    Description
************************************************************/
#include "dict.h"

/***********************************************************
*   Function Name           : Init_Dict_List
*   Create Date             : 2013/07/27
*   Author/Corporation      : 于飞
*   Description             : Initialise the dictionary storage. Every one
*                             of the 65536 first-level slots in address[]
*                             receives a freshly allocated, empty head node
*                             for its second-level list, and the slot's
*                             priority is reset to 0.
*   Param                   : none
*
*   Return Code             : 0 on success, -1 if an allocation fails.
*                             On failure the head nodes allocated by this
*                             call are freed again and their slots are set
*                             to NULL, so nothing is leaked and no slot is
*                             left half-initialised.
************************************************************/
extern int Init_Dict_List(void)
{
    int i = 0;
    FollowChar *head;

    for (i = 0; i < 65536; i++)
    {
        head = (FollowChar *)malloc(sizeof(FollowChar));
        if (head == NULL)
        {
            printf("没有内存啦");
            /* Roll back: release every head created so far in this call. */
            while (i-- > 0)
            {
                free(address[i].Next);
                address[i].Next = NULL;
            }
            return -1;
        }

        /* Empty head node: no successors, no third-level list yet. */
        head->Next = NULL;
        head->priority = 0;
        head->value = 0;
        head->Down = NULL;

        address[i].Next = head;
        address[i].priority = 0;
    }
    return 0;
}

/***********************************************************
*   Function Name           : Create_Dict
*   Create Date             : 2013/07/27
*   Author/Corporation      : 于飞
*   Description             : Build the dictionary. Words are read one at
*                             a time with Read_a_Word and registered in
*                             the first, second or third level depending
*                             on the length it reports; afterwards the raw
*                             counts are converted with
*                             Word_Cnt_Statistic / Wordcnt_To_Wordpro.
*   Param                   : fp: corpus file handle passed to Read_a_Word
*
*   Return Code             : 0 on success, 1 if the dictionary storage
*                             could not be initialised
************************************************************/
extern unsigned int Create_Dict(FILE *fp)
{
    char wbuffer[100] = "\0";
    char tbuffer[5] = "\0";
    unsigned int cnt = 0;
    FollowChar *stp1;
    FollowWords *stp2;
    double wordcount = 0;

    /* Without the head nodes every lookup below would dereference NULL. */
    if (Init_Dict_List() != 0)
    {
        return 1;
    }

    while ((cnt = Read_a_Word(fp, wbuffer, tbuffer)) != 0)
    {
        if (cnt == 3)
        {
            /* Single two-byte character: counted in the first level. */
            QuAndIn_First_List(&wbuffer[0], 1);
        }
        else if (cnt == 5)
        {
            /* Two characters: counted in the second level. */
            stp1 = QuAndIn_First_List(&wbuffer[0], 0);
            if (stp1 != NULL)
            {
                QuAndIn_Second_List(stp1, &wbuffer[2], 1);
            }
        }
        else if (cnt > 5)
        {
            /* Longer word: the remainder after the first two characters
               is inserted into the third-level list. */
            stp1 = QuAndIn_First_List(&wbuffer[0], 0);
            stp2 = (stp1 != NULL)
                       ? QuAndIn_Second_List(stp1, &wbuffer[2], 0)
                       : NULL;
            if (stp2 != NULL)
            {
                QuAndIn_Third_List(stp2, &wbuffer[4], cnt - 4);
            }
        }
        /* Any other length is skipped: cnt is unsigned, so cnt - 4 would
           wrap around for cnt < 4 and be 0 for cnt == 4.
           NOTE(review): assumes Read_a_Word never reports such lengths
           for valid input -- confirm against its implementation. */
    }
    printf("创建双字哈希词典中...");
    printf("词典创建完毕!\n");
    wordcount = Word_Cnt_Statistic();
    Wordcnt_To_Wordpro(wordcount);
    return 0;
}

/***********************************************************
*   Function Name           : QuAndIn_First_List
*   Create Date             : 2013/07/27
*   Author/Corporation      : 于飞
*   Description             : Look up the first-level slot of a word and
*                             optionally count it. The slot index is the
*                             16-bit value formed from the first two bytes
*                             of arr (arr[0] is the high byte).
*   Param                   : arr : bytes of the word being registered
*                             flag: non-zero for a single-character word
*                                   (the slot's priority is incremented);
*                                   0 when the word is longer
*   Return Code             : the Next pointer stored in the slot, i.e.
*                             the head of its second-level list
************************************************************/
extern FollowChar *QuAndIn_First_List(char arr[],int flag)
{
    const unsigned char high_byte = (unsigned char)arr[0];
    const unsigned char low_byte = (unsigned char)arr[1];
    const unsigned short int slot =
        (unsigned short int)((high_byte << 8) | low_byte);

    if (flag != 0)
    {
        address[slot].priority++;
    }
    return address[slot].Next;
}

/***********************************************************
*   Function Name           : QuAndIn_Second_List
*   Create Date             : 2013/07/27
*   Author/Corporation      : 于飞
*   Description             : Look up a character in a second-level list
*                             and register it. The key is the 16-bit value
*                             formed from arr[0] (high byte) and arr[1].
*                             The list behind the head node is kept in
*                             descending key order; a missing key gets a
*                             new node together with an empty third-level
*                             head node.
*   Param                   : p   : head node of the second-level list
*                             arr : bytes of the character to look up
*                             flag: non-zero for a two-character word
*                                   (the node's priority is counted);
*                                   0 when the word is longer
*   Return Code             : head of the third-level list attached to the
*                             matching or newly created node; NULL if p is
*                             NULL or memory could not be allocated (in
*                             which case the list is left unchanged)
************************************************************/
extern FollowWords *QuAndIn_Second_List(FollowChar *p,char arr[],int flag)
{
    unsigned short int s = 0;
    FollowChar *prev = p;
    FollowChar *cur;
    FollowChar *node;
    FollowWords *third_head;

    if (p == NULL)
    {
        return NULL;
    }

    s = (unsigned char)arr[0];
    s = s << 8;
    s += (unsigned char)arr[1];

    /* Walk the descending list until the key is found or passed. */
    for (cur = p->Next; cur != NULL; cur = cur->Next)
    {
        if (s > cur->value)
        {
            break;              /* insertion point is right after prev */
        }
        if (s == cur->value)
        {
            if (flag)
            {
                cur->priority++;
            }
            return cur->Down;
        }
        prev = cur;
    }

    /* Allocate both nodes before touching the list so that a failure
       never leaves a node with an unset Down pointer linked in. */
    node = (FollowChar *)malloc(sizeof(FollowChar));
    third_head = (FollowWords *)malloc(sizeof(FollowWords));
    if (node == NULL || third_head == NULL)
    {
        printf("没有内存啦!");
        free(third_head);
        free(node);
        return NULL;
    }

    third_head->Next = NULL;
    third_head->priority = 0;
    third_head->wordptr = NULL;

    node->priority = flag ? 1 : 0;
    node->value = s;
    node->Down = third_head;

    node->Next = prev->Next;
    prev->Next = node;
    return third_head;
}

/***********************************************************
*   Function Name           : QuAndIn_Third_List
*   Create Date             : 2013/07/27
*   Author/Corporation      : 于飞
*   Description             : Look up the tail of a word in a third-level
*                             list and register it. An existing entry
*                             (matched with strcmp) has its priority
*                             incremented; otherwise a new entry holding a
*                             private copy of the bytes is appended with
*                             priority 1.
*   Param                   : p   : head node of the third-level list
*                             arr : bytes of the word tail; compared with
*                                   strcmp, so it must be NUL-terminated
*                             num : number of bytes of arr to copy
*   Return Code             : the matching or newly created entry; NULL if
*                             p is NULL or memory could not be allocated
************************************************************/
extern FollowWords *QuAndIn_Third_List(FollowWords *p,char arr[],unsigned int num)
{
    FollowWords *node;
    char *copy;
    unsigned int j = 0;

    if (p == NULL)
    {
        return NULL;
    }

    /* Search the list; p ends on the last node, where a new entry goes. */
    while (p->Next)
    {
        p = p->Next;
        if (strcmp(p->wordptr, arr) == 0)
        {
            p->priority++;
            return p;
        }
    }

    node = (FollowWords *)malloc(sizeof(FollowWords));
    if (node == NULL)
    {
        printf("没有内存啦!");
        return NULL;
    }

    /* One extra byte so the stored copy is always NUL-terminated: it is
       later read by strcmp, whatever num covers. */
    copy = (char *)malloc((size_t)num + 1);
    if (copy == NULL)
    {
        printf("内存不够啦!");
        free(node);             /* do not leak the half-built entry */
        return NULL;
    }
    for (j = 0; j < num; j++)
    {
        copy[j] = arr[j];
    }
    copy[num] = '\0';

    node->priority = 1;
    node->wordptr = copy;
    node->Next = p->Next;
    p->Next = node;
    return node;
}

/***********************************************************
*   Function Name           : Word_Cnt_Statistic
*   Create Date             : 2013/07/27
*   Author/Corporation      : 于飞
*   Description             : 统计语料集中词语总数
*   Param                   : 
*   Return Code             : 返回词语总数
************************************************************/
extern double Word_Cnt_Statistic(void)
{
    unsigned int i = 0;
    double cnt = 0;
    FollowChar *p1;
    FollowWords *p2;

    for(i=0;i<65536;i++)
    {
        cnt += address[i].priority ;
        p1 = address[i].Next ;
        while(p1 = p1->Next)
        {
            cnt += p1->priority ;
            p2 = p1->Down ;
            while(p2 = p2->Next )
            {
                cnt += p2->priority ;
            }
        }
    }
    return cnt;
}

/* Convert a count into -log(count / total); a zero ratio is replaced by
   DBL_MIN so the result stays finite. */
static double Neg_Log_Frequency(double count, double total)
{
    double ratio = count / total;

    if (ratio == 0)
    {
        ratio = DBL_MIN;
    }
    return -log(ratio);
}

/***********************************************************
*   Function Name           : Wordcnt_To_Wordpro
*   Create Date             : 2013/07/27
*   Author/Corporation      : 于飞
*   Description             : Replace each stored occurrence count with
*                             -log(count / cnt). A zero ratio is replaced
*                             by DBL_MIN at every level, so second-level
*                             nodes that only serve as prefixes (count 0)
*                             get a large finite value instead of
*                             -log(0).
*   Param                   : cnt: total number of words, as returned by
*                                  Word_Cnt_Statistic
*   Return Code             : none
************************************************************/
extern void Wordcnt_To_Wordpro(double cnt)
{
    unsigned int i = 0;
    FollowChar *p1;
    FollowWords *p2;

    for (i = 0; i < 65536; i++)
    {
        /* Only slots that own at least one second-level node are
           converted.
           NOTE(review): a slot with a non-zero count but no second-level
           nodes keeps its raw count -- confirm this is intended. */
        if (address[i].Next->Next != NULL)
        {
            address[i].priority = Neg_Log_Frequency(address[i].priority, cnt);

            for (p1 = address[i].Next->Next; p1 != NULL; p1 = p1->Next)
            {
                p1->priority = Neg_Log_Frequency(p1->priority, cnt);

                for (p2 = p1->Down->Next; p2 != NULL; p2 = p2->Next)
                {
                    p2->priority = Neg_Log_Frequency(p2->priority, cnt);
                }
            }
        }
    }
}

/***********************************************************
*   Function Name           : Destroy_Dict
*   Description             : Release all heap memory owned by the
*                             dictionary: every third-level entry and its
*                             string, every third-level head, every
*                             second-level node and every second-level
*                             head. Each slot's Next pointer is set to
*                             NULL afterwards, so calling the function
*                             again (or after a failed initialisation) is
*                             harmless.
*   Param                   : none
*   Return Code             : none
************************************************************/
extern void Destroy_Dict(void)
{
    unsigned int i = 0;
    FollowChar *second;
    FollowChar *next_second;
    FollowWords *third;
    FollowWords *next_third;

    for (i = 0; i < 65536; i++)
    {
        if (address[i].Next == NULL)
        {
            continue;           /* never initialised or already destroyed */
        }

        second = address[i].Next->Next;
        while (second != NULL)
        {
            next_second = second->Next;

            if (second->Down != NULL)
            {
                /* Free the third-level entries, then their head node. */
                third = second->Down->Next;
                while (third != NULL)
                {
                    next_third = third->Next;
                    free(third->wordptr);
                    free(third);
                    third = next_third;
                }
                free(second->Down);
            }

            free(second);       /* second-level node */
            second = next_second;
        }

        free(address[i].Next);  /* second-level head node */
        address[i].Next = NULL; /* no dangling pointer, no double free */
    }
}

猜你喜欢

转载自blog.csdn.net/nlpzryyclxz/article/details/48047009