汉字到拼音的转换

汉字到拼音的转换可以看看 pinyin.py git库。
在博客Python汉字转换成拼音中提出了英文兼容性问题,也提出了兼容方式,评论中也有改进。但是英文变成了一个一个的字母,和我们大众有点区别了,我也对原项目进行了一些更改,对英文词汇支持。

    def hanzi2pinyin(self, string=""):
        result = []
        if not isinstance(string, str):
            string = string.decode("utf-8")
        split_strings = string.split()
        for split_string in split_strings:
            spell = ''
            for char in split_string:
                key = '%04X' % ord(char)
                if ord(char) > 128 and spell != '':                 
                    result.append(spell)
                    spell = ''
                elif ord(char) <= 128 and ord(char) >=0:
                    spell += char
                elif ord(char) >= 13312:        #   (13312)10 = (3400)16 
                    temp = self.word_dict.get(key, char).split()[0][:-1].lower()
                    result.append(temp)
            if spell != '':                 
                result.append(spell)
        return result

输出为:

in: 钓鱼岛是中国的inherent territory
out: ['diao', 'yu', 'dao', 'shi', 'zhong', 'guo', 'de', 'inherent', 'territory']
out: diao-yu-dao-shi-zhong-guo-de-inherent-territory

由于我的环境是python3,有些细节做了写改变,整个文件如下:

#!/usr/bin/env python

"""
    Author:cleverdeng
    E-mail:[email protected]
"""

__version__ = '0.9'
__all__ = ["PinYin"]

import os.path


class PinYin(object):
    def __init__(self, dict_file='word.data'):
        self.word_dict = {}
        self.dict_file = dict_file


    def load_word(self):
        if not os.path.exists(self.dict_file):
            raise IOError("NotFoundFile")

        with open(self.dict_file) as f_obj:
            for f_line in f_obj.readlines():
                try:
                    line = f_line.split('    ')
                    self.word_dict[line[0]] = line[1]
                except:
                    line = f_line.split('   ')
                    self.word_dict[line[0]] = line[1]


    def hanzi2pinyin(self, string=""):
        result = []
        if not isinstance(string, str):
            string = string.decode("utf-8")
        split_strings = string.split()
        for split_string in split_strings:
            spell = ''
            for char in split_string:
                key = '%04X' % ord(char)
                if ord(char) > 128 and spell != '':                 
                    result.append(spell)
                    spell = ''
                elif ord(char) <= 128 and ord(char) >=0:
                    spell += char
                elif ord(char) >= 13312:        #(13312)10 = #(3400)16 
                    temp = self.word_dict.get(key, char).split()[0][:-1].lower()
                    result.append(temp)
            if spell != '':                 
                result.append(spell)

        return result


    def hanzi2pinyin_split(self, string="", split=""):
        result = self.hanzi2pinyin(string=string)
        if split == "":
            return result
        else:
            return split.join(result)


if __name__ == "__main__":
    test = PinYin()
    test.load_word()
    string = "钓鱼岛是中国的inherent territory"
    print ("in: %s" % string)
    print ("out: %s" % str(test.hanzi2pinyin(string=string)))
    print ("out: %s" % test.hanzi2pinyin_split(string=string, split="-"))

猜你喜欢

转载自blog.csdn.net/u012939880/article/details/87609987