Pinyin word segmentation - C# implementation

In this project, the pinyin of a person's name needs to be extracted from arbitrary strings, which may also contain English words and non-alphabetic characters. The previous solution relied on a large number of regular expressions. That worked for ordinary names, but it broke down on inputs such as "wangaie". Rather than keep studying and adding regular expressions, I re-thought the approach: segment the string into pinyin syllables, and if every resulting segment is a valid pinyin syllable, accept the string as a name. This requires a pinyin dictionary; the following one was found on the Internet:

a 
ai 
an 
ang 
ao 
ba 
bai 
ban 
bang 
bao 
bei 
ben 
beng 
bi 
bian 
biao 
bie 
bin 
bing 
bo 
bu 
ca 
cai 
can 
cang 
cao 
ce 
ceng 
cha 
chai 
chan 
chang 
chao 
che 
chen 
cheng 
chi 
chong 
chou 
chu 
chuai 
chuan 
chuang 
chui 
chun 
chuo 
ci 
cong 
cou 
cu 
cuan 
cui 
cun 
cuo 
da 
dai 
dan 
dang 
dao 
de 
deng 
di 
dian 
diao 
die 
ding 
diu 
dong 
dou 
du 
duan 
dui 
dun 
duo 
e 
en 
er 
fa 
fan 
fang 
fei 
fen 
feng 
fo 
fou 
fu 
ga 
gai 
gan 
gang 
gao 
ge 
gei 
gen 
geng 
gong 
gou 
gu 
gua 
guai 
guan 
guang 
gui 
gun 
guo 
ha 
hai 
han 
hang 
hao 
he 
hei 
hen 
heng 
hong 
hou 
hu 
hua 
huai 
huan 
huang 
hui 
hun 
huo 
ji 
jia 
jian 
jiang 
jiao 
jie 
jin 
jing 
jiong 
jiu 
ju 
juan 
jue 
jun 
ka 
kai 
kan 
kang 
kao 
ke 
ken 
keng 
kong 
kou 
ku 
kua 
kuai 
kuan 
kuang 
kui 
kun 
kuo 
la 
lai 
lan 
lang 
lao 
le 
lei 
leng 
li 
lia 
lian 
liang 
liao 
lie 
lin 
ling 
liu 
long 
lou 
lu 
lv 
luan 
lue 
lun 
luo 
ma 
mai 
man 
mang 
mao 
me 
mei 
men 
meng 
mi 
mian 
miao 
mie 
min 
ming 
miu 
mo 
mou 
mu 
na 
nai 
nan 
nang 
nao 
ne 
nei 
nen 
neng 
ni 
nian 
niang 
niao 
nie 
nin 
ning 
niu 
nong 
nu 
nv 
nuan 
nue 
nuo 
o 
ou 
pa 
pai 
pan 
pang 
pao 
pei 
pen 
peng 
pi 
pian 
piao 
pie 
pin 
ping 
po 
pu 
qi 
qia 
qian 
qiang 
qiao 
qie 
qin 
qing 
qiong 
qiu 
qu 
quan 
que 
qun 
ran 
rang 
rao 
re 
ren 
reng 
ri 
rong 
rou 
ru 
ruan 
rui 
run 
ruo 
sa 
sai 
san 
sang 
sao 
se 
sen 
seng 
sha 
shai 
shan 
shang 
shao 
she 
shen 
sheng 
shi 
shou 
shu 
shua 
shuai 
shuan 
shuang 
shui 
shun 
shuo 
si 
song 
sou 
su 
suan 
sui 
sun 
suo 
ta 
tai 
tan 
tang 
tao 
te 
teng 
ti 
tian 
tiao 
tie 
ting 
tong 
tou 
tu 
tuan 
tui 
tun 
tuo 
wa 
wai 
wan 
wang 
wei 
wen 
weng 
wo 
wu 
xi 
xia 
xian 
xiang 
xiao 
xie 
xin 
xing 
xiong 
xiu 
xu 
xuan 
xue 
xun 
ya 
yan 
yang 
yao 
ye 
yi 
yin 
ying 
yo 
yong 
you 
yu 
yuan 
yue 
yun 
za 
zai 
zan 
zang 
zao 
ze 
zei 
zen 
zeng 
zha 
zhai 
zhan 
zhang 
zhao 
zhe 
zhen 
zheng 
zhi 
zhong 
zhou 
zhu 
zhua 
zhuai 
zhuan 
zhuang 
zhui 
zhun 
zhuo 
zi 
zong 
zou 
zu 
zuan 
zui 
zun 
zuo 

Save it as a *.txt file, such as pinyin_dataset.txt, and read it in the program.
For the word segmentation itself you can use either forward maximum matching or backward maximum matching; I use forward maximum matching. There are many descriptions of the algorithm on the Internet. The main points are to build a dictionary and to set a maximum match length. The longest pinyin syllable has 6 letters, so each step takes up to 6 characters from the current position and looks them up in the dictionary, shortening the candidate by one character at a time until a match is found. PYSplit(string py_str) in the code is the word segmentation function.

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Text.RegularExpressions;

namespace OCRConfig
{
    /// <summary>
    /// Checks whether a string can be read as the pinyin spelling of a person's name.
    /// Non-letter characters are treated as separators, and every remaining token must be
    /// fully decomposable into syllables taken from a pinyin dictionary file
    /// (one syllable per line).
    /// </summary>
    public class CPYCheck
    {
        private const string BLANK = " ";
        // Matches any character that is not an ASCII letter. Note that '|' is a literal
        // inside a character class, so it must not appear here or it counts as a letter.
        private const string NON_CHAR_PATTERN = "[^A-Za-z]";
        private const string DEFAULT_DICT_NAME = "pinyin_dataset.txt";

        // Built once; the pattern never changes.
        private static readonly Regex NonLetterRegex = new Regex(NON_CHAR_PATTERN);
        private static readonly char[] BlankSeparator = { ' ' };

        // Syllables grouped by their first letter for fast lookup.
        private readonly Dictionary<char, HashSet<string>> data_set;
        // Length of the longest syllable in the dictionary; bounds each match attempt.
        private int max_word_len;

        /// <summary>
        /// Creates a checker backed by the given pinyin dictionary file.
        /// </summary>
        /// <param name="dict_path">
        /// Path of the pinyin dictionary file. When null, "pinyin_dataset.txt" in the
        /// current directory is used.
        /// </param>
        /// <exception cref="FileNotFoundException">The dictionary file does not exist.</exception>
        public CPYCheck(string dict_path = null)
        {
            data_set = new Dictionary<char, HashSet<string>>();

            if (dict_path == null)
            {
                dict_path = DEFAULT_DICT_NAME;
            }

            if (!File.Exists(dict_path))
            {
                throw new FileNotFoundException(string.Format("the file {0} not find!", dict_path), dict_path);
            }

            CreatePYDataSet(dict_path);
        }

        /// <summary>
        /// Reads the dictionary file and fills <c>data_set</c>.
        /// Each syllable is filed under its own first letter, so the file does not need
        /// to be sorted and letters without any syllable (i, u, v) cause no misplacement.
        /// Blank lines are ignored.
        /// </summary>
        /// <param name="dict_path">Path of an existing dictionary file.</param>
        private void CreatePYDataSet(string dict_path)
        {
            using (StreamReader sr = new StreamReader(dict_path, Encoding.Default))
            {
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    // Lookups are always done in lower case, so store entries the same way.
                    string syllable = line.Trim().ToLowerInvariant();
                    if (syllable.Length == 0)
                    {
                        continue;
                    }

                    HashSet<string> bucket;
                    if (!data_set.TryGetValue(syllable[0], out bucket))
                    {
                        bucket = new HashSet<string>(StringComparer.Ordinal);
                        data_set[syllable[0]] = bucket;
                    }
                    bucket.Add(syllable);

                    if (syllable.Length > max_word_len)
                    {
                        max_word_len = syllable.Length;
                    }
                }
            }
        }

        /// <summary>
        /// Tests whether <paramref name="name_str"/> is a pinyin name.
        /// </summary>
        /// <param name="name_str">Raw text that may contain non-letter characters.</param>
        /// <returns>
        /// The name with its original letter case, with every run of non-letter characters
        /// collapsed to a single blank, or null when the input is null, has fewer than two
        /// characters after cleaning, or contains a token that is not made of pinyin syllables.
        /// </returns>
        public string NameCheck(string name_str)
        {
            if (name_str == null)
            {
                return null;
            }

            // Replace every non-letter character with a blank, then drop outer blanks.
            string cleaned = NonLetterRegex.Replace(name_str, BLANK).Trim();

            // A name is assumed to consist of at least two letters.
            if (cleaned.Length < 2)
            {
                return null;
            }

            // Consecutive separators must not produce empty tokens.
            string[] name_list = cleaned.Split(BlankSeparator, StringSplitOptions.RemoveEmptyEntries);
            foreach (string name in name_list)
            {
                // Invariant lower-casing keeps the result independent of the current culture.
                if (PYSplit(name.ToLowerInvariant()) == null)
                {
                    return null;
                }
            }

            return string.Join(BLANK, name_list);
        }

        /// <summary>
        /// Splits a lower-case letter string into pinyin syllables.
        /// The longest candidate is tried first at every position (forward maximum matching);
        /// if that choice leaves a remainder that cannot be split, shorter candidates are
        /// tried, so "jianguo" yields "jian" + "guo" instead of failing on "jiang" + "uo".
        /// </summary>
        /// <param name="py_str">Lower-case string to segment.</param>
        /// <returns>The syllables in order, or null when no complete segmentation exists.</returns>
        private List<string> PYSplit(string py_str)
        {
            if (string.IsNullOrEmpty(py_str))
            {
                return null;
            }

            List<string> parts = new List<string>();
            HashSet<int> deadEnds = new HashSet<int>();
            return TrySplitFrom(py_str, 0, parts, deadEnds) ? parts : null;
        }

        /// <summary>
        /// Tries to segment <paramref name="text"/> from <paramref name="start"/> to its end,
        /// appending the syllables found to <paramref name="parts"/>.
        /// </summary>
        /// <param name="text">Lower-case string being segmented.</param>
        /// <param name="start">Index of the first character not yet consumed.</param>
        /// <param name="parts">Syllables chosen so far; left unchanged on failure.</param>
        /// <param name="deadEnds">Start positions already known to be unsplittable.</param>
        /// <returns>True when the rest of the string was fully segmented.</returns>
        private bool TrySplitFrom(string text, int start, List<string> parts, HashSet<int> deadEnds)
        {
            if (start == text.Length)
            {
                return true;
            }
            if (deadEnds.Contains(start))
            {
                return false;
            }

            int longest = Math.Min(max_word_len, text.Length - start);
            for (int length = longest; length > 0; length--)
            {
                string candidate = text.Substring(start, length);
                if (!WordTest(candidate))
                {
                    continue;
                }

                parts.Add(candidate);
                if (TrySplitFrom(text, start + length, parts, deadEnds))
                {
                    return true;
                }
                // The remainder could not be split after this choice; undo it and try shorter.
                parts.RemoveAt(parts.Count - 1);
            }

            // Remember the failure so the same suffix is not explored again.
            deadEnds.Add(start);
            return false;
        }

        /// <summary>
        /// Tells whether <paramref name="word"/> is a syllable of the loaded dictionary
        /// (case-insensitive).
        /// </summary>
        /// <param name="word">Candidate syllable.</param>
        /// <returns>True when the dictionary contains the syllable; false otherwise.</returns>
        private bool WordTest(string word)
        {
            if (string.IsNullOrEmpty(word))
            {
                return false;
            }

            word = word.ToLowerInvariant();
            HashSet<string> bucket;
            return data_set.TryGetValue(word[0], out bucket) && bucket.Contains(word);
        }
    }
}

Guess you like

Origin http://43.154.161.224:23101/article/api/json?id=325859938&siteId=291194637