Python sample questions about strings (jieba library)

Preparation :

  1. Install the jieba library (for example, run `pip install jieba`)
    Insert picture description here

Example 1 : Generate a random password of specified length

import string
import random
import secrets

# Alphabet for the passwords: the ten digits plus upper- and lower-case ASCII letters.
characters = string.digits + string.ascii_letters


def generate_password(length):
    """Return a random password made of ``length`` characters from ``characters``.

    ``secrets.choice`` is used instead of ``random.choice`` because the
    ``random`` module is not intended for security-sensitive values such as
    passwords. A ``length`` of zero or less yields an empty string.
    """
    return ''.join(secrets.choice(characters) for _ in range(length))


# Demonstrate three common password lengths.
pass8 = generate_password(8)
print(pass8)
pass10 = generate_password(10)
print(pass10)
pass16 = generate_password(16)
print(pass16)

Insert picture description here
Example 2: Write a program that generates a large amount of random personal information. Such data is very useful when you need many records to test or demonstrate software: it exercises the software's functions or algorithms while avoiding leaks of real data and unnecessary disputes.

import random
import string
import codecs

# StringBase holds commonly used Chinese characters, written as Unicode
# escapes; such a table can be found by searching online.

StringBase="\u7684\u4e00\u4e86\u662f\u6211\u4e0d\u5728\u4eba\u4eec\u6709\u6765\u4ed6\u8fd9\u4e0a\u7740\u4e2a\u5730\u5230\u5927\u91cc\u8bf4\u5c31\u53bb\u5b50\u5f97\
\u4e5f\u548c\u90a3\u8981\u4e0b\u770b\u5929\u65f6\u8fc7\u51fa\u5c0f\u4e48\u8d77\u4f60\u90fd\u628a\u597d\u8fd8\u591a\u6ca1\u4e3a\u53c8\u53ef\u5bb6\u5b66\
\u53ea\u4ee5\u4e3b\u4f1a\u6837\u5e74\u60f3\u751f\u540c\u8001\u4e2d\u5341\u4ece\u81ea\u9762\u524d\u5934\u9053\u5b83\u540e\u7136\u8d70\u5f88\u50cf\u89c1\
\u4e24\u7528\u5979\u56fd\u52a8\u8fdb\u6210\u56de\u4ec0\u8fb9\u4f5c\u5bf9\u5f00\u800c\u5df1\u4e9b\u73b0\u5c71\u6c11\u5019\u7ecf\u53d1\u5de5\u5411\u4e8b\
\u547d\u7ed9\u957f\u6c34\u51e0\u4e49\u4e09\u58f0\u4e8e\u9ad8\u624b\u77e5\u7406\u773c\u5fd7\u70b9\u5fc3\u6218\u4e8c\u95ee\u4f46\u8eab\u65b9\u5b9e\
\u5403\u505a\u53eb\u5f53\u4f4f\u542c\u9769\u6253\u5462\u771f\u5168\u624d\u56db\u5df2\u6240\u654c\u4e4b\u6700\u5149\u4ea7\u60c5\u8def\u5206\u603b\u6761\
\u767d\u8bdd\u4e1c\u5e2d\u6b21\u4eb2\u5982\u88ab\u82b1\u53e3\u653e\u513f\u5e38\u6c14\u4e94\u7b2c\u4f7f\u5199\u519b\u5427\u6587\u8fd0\u518d\u679c\
\u600e\u5b9a\u8bb8\u5feb\u660e\u884c\u56e0\u522b\u98de\u5916\u6811\u7269\u6d3b\u90e8\u95e8\u65e0\u5f80\u8239\u671b\u65b0\u5e26\u961f\u5148\u529b\
\u5b8c\u5374\u7ad9\u4ee3\u5458\u673a\u66f4\u4e5d\u60a8\u6bcf\u98ce\u7ea7\u8ddf\u7b11\u554a\u5b69\u4e07\u5c11\u76f4\u610f\u591c\u6bd4\u9636\
\u8fde\u8f66\u91cd\u4fbf\u6597\u9a6c\u54ea\u5316\u592a\u6307\u53d8\u793e\u4f3c\u58eb\u8005\u5e72\u77f3\u6ee1\u65e5\u51b3\u767e\u539f\u62ff\u7fa4\
\u7a76\u5404\u516d\u672c\u601d\u89e3\u7acb\u6cb3\u6751\u516b\u96be\u65e9\u8bba\u5417\u6839\u5171\u8ba9\u76f8\u7814\u4eca\u5176\u4e66\u5750\
\u63a5\u5e94\u5173\u4fe1\u89c9\u6b65\u53cd\u5904\u8bb0\u5c06\u5343\u627e\u4e89\u9886\u6216\u5e08\u7ed3\u5757\u8dd1\u8c01\u8349\u8d8a\u5b57\
\u52a0\u811a\u7d27\u7231\u7b49\u4e60\u9635\u6015\u6708\u9752\u534a\u706b\u6cd5\u9898\u5efa\u8d76\u4f4d\u5531\u6d77\u4e03\u5973\u4efb\u4ef6\u611f\
\u51c6\u5f20\u56e2\u5c4b\u79bb\u8272\u8138\u7247\u79d1\u5012\u775b\u5229\u4e16\u521a\u4e14\u7531\u9001\u5207\u661f\u5bfc\u665a\u8868\u591f\u6574\
\u8ba4\u54cd\u96ea\u6d41\u672a\u573a\u8be5\u5e76\u5e95\u6df1\u523b\u5e73\u4f1f\u5fd9\u63d0\u786e\u8fd1\u4eae\u8f7b\u8bb2\u519c\u53e4\u9ed1\
\u544a\u754c\u62c9\u540d\u5440\u571f\u6e05\u9633\u7167\u529e\u53f2\u6539\u5386\u8f6c\u753b\u9020\u5634\u6b64\u6cbb\u5317\u5fc5\u670d\
\u96e8\u7a7f\u5185\u8bc6\u9a8c\u4f20\u4e1a\u83dc\u722c\u7761\u5174\u5f62\u91cf\u54b1\u89c2\u82e6\u4f53\u4f17\u901a\u51b2\u5408\u7834\
\u53cb\u5ea6\u672f\u996d\u516c\u65c1\u623f\u6781\u5357\u67aa\u8bfb\u6c99\u5c81\u7ebf\u91ce\u575a\u7a7a\u6536\u7b97\u81f3\u653f\u57ce\
\u52b3\u843d\u94b1\u7279\u56f4\u5f1f\u80dc\u6559\u70ed\u5c55\u5305\u6b4c\u7c7b\u6e10\u5f3a\u6570\u4e61\u547c\u6027\u97f3\u7b54\u54e5\
\u9645\u65e7\u795e\u5ea7\u7ae0\u5e2e\u5566\u53d7\u7cfb\u4ee4\u8df3\u975e\u4f55\u725b\u53d6\u5165\u5cb8\u6562\u6389\u5ffd\u79cd\u88c5\
\u9876\u6025\u6797\u505c\u606f\u53e5\u533a\u8863\u822c\u62a5\u53f6\u538b\u6162\u53d4\u80cc\u7ec6"

# Prepare the large pool of commonly used Chinese characters.
# NOTE(review): StringBase is assigned from a normal (non-raw) string literal,
# so under Python 3 its \uXXXX escapes are already decoded and the text holds
# no literal backslash-u; this split/join then appears to leave StringBase
# unchanged -- confirm before relying on it or removing it.
StringBase=''.join(StringBase.split('\\u'))

#Generate a random e-mail address
def getEmail():
    '''Return a random address of the form username@domain.suffix.

    The username has 6 to 12 characters (ASCII letters, digits, underscore),
    the domain label has 3 to 6 characters (ASCII letters and digits only),
    and the suffix is picked from a short list of common endings.
    '''
    suffix=['.com','.org','.net','.cn']  #common e-mail suffixes
    #The alphabet used to contain a space, which produced addresses such as
    #"ab c@x y.com"; an underscore takes its place in the username, and the
    #domain label is restricted to letters and digits.
    username_chars=string.ascii_letters+string.digits+'_'
    domain_chars=string.ascii_letters+string.digits
    username=''.join(random.choice(username_chars) for _ in range(random.randint(6,12)))
    domain=''.join(random.choice(domain_chars) for _ in range(random.randint(3,6)))
    return username+'@'+domain+random.choice(suffix)

#Generate a random phone number that starts with 1
def getTelNo():
    '''Return an 11-character string: '1' followed by ten random digits.'''
    digits=[str(random.randint(0,9)) for _ in range(10)]
    return '1'+''.join(digits)
#Generate a random name or a random address
def getNameOrAddress(flag):
    '''Return a random string of characters picked from StringBase.

    flag=1 returns a random name of 2 or 3 characters; flag=0 returns a
    random address of 10 to 30 characters. Any other flag prints a message
    and returns an empty string.
    '''
    if flag==1:
        low,high=2,4
    elif flag==0:
        low,high=10,31
    else:
        print('flag must be 1 or 0')
        return ''
    #randrange excludes the upper bound, hence 4 and 31 above
    length=random.randrange(low,high)
    return ''.join([random.choice(StringBase) for _ in range(length)])
#Generate a random gender
def getSex():
    '''Return either '男' or '女', chosen at random.'''
    options=('男','女')
    return random.choice(options)
#Generate a random age
def getAge():
    '''Return a random age between 18 and 100 (both inclusive) as a string.'''
    age=random.randint(18,100)
    return str(age)

def main(filename,count=20):
    '''Write random person records to a UTF-8 text file.

    filename: path of the file to create; it is opened in 'w' mode, so an
              existing file is overwritten.
    count:    number of records to generate (default 20, the value that was
              previously hard-coded). Zero or a negative number writes an
              empty file.

    Each record is one line of comma-separated fields in the order
    name,sex,age,tel,address,email.
    '''
    with open(filename,'w',encoding='utf-8') as fp:
        for _ in range(count):
            #The generators are called in this order on purpose so that the
            #sequence of random draws matches the previous implementation.
            name=getNameOrAddress(1)
            sex=getSex()
            tel=getTelNo()
            age=getAge()
            address=getNameOrAddress(0)
            email=getEmail()
            fp.write(','.join([name,sex,age,tel,address,email])+'\n')

def output(filename):
    '''Print every line of the UTF-8 text file ``filename``.

    Each line is printed with its own line ending still attached, so
    print() adds a second newline after it.
    '''
    with open(filename,'r',encoding='utf-8') as fp:
        #readline returns '' only at end of file, which stops the iteration
        for record in iter(fp.readline,''):
            print(record)

# Script entry point: generate the records into information.txt.
if __name__=='__main__':
    filename='information.txt'
    main(filename)
    # Uncomment the next line to print the generated file afterwards.
    #output(filename)

Insert picture description here
Example 3: Use the jieba library to segment Chinese and English text into words

import jieba
# Sample sentence to segment.
x='分词的准确度直接影响了后续文本处理和挖掘算法的最终效果。'
a=jieba.cut(x)  # segment using the default dictionary
print(list(a))

# Segment two short words before the dictionary is modified.
print(list(jieba.cut('纸杯')))

print(list(jieba.cut('花纸杯')))

jieba.add_word('花纸杯') # add '花纸杯' as a dictionary entry
print(list(jieba.cut('花纸杯')))  # segment again with the updated dictionary

Insert picture description here
Example 4: Count how often each character appears in Romance of the Three Kingdoms

import jieba

# Read the whole text; the context manager closes the file handle as soon as
# the read finishes instead of leaving it open.
with open("E:/zhuoyueban/Python/program/2010春python/20200420/threekingdoms.txt",'r',encoding="utf-8") as fp:
    txt=fp.read()

# Words that are removed from the tally before the ranking is printed.
excludes={"将军","却说","荆州","二人","不可","不能","如此","商议","如何",
          "引兵","次日","大喜","左右","军士","军马","天下","于是","东吴",
          "今日","不敢","一人","魏兵",'人马','不知','汉中','只见','众将',
          '后主','蜀兵','上马','大叫','太守','此人','夫人','先主','后人',
          '背后','城中'}

# Alternative spellings/titles that are counted under a single name.
aliases={"诸葛亮":"孔明","孔明曰":"孔明",
         "关公":"关羽","云长":"关羽",
         "玄德":"刘备","玄德曰":"刘备","主公":"刘备",
         "孟德":"曹操","丞相":"曹操","陛下":"曹操"}

words=jieba.lcut(txt)
counts={}
for word in words:
    # Single-character tokens are skipped.
    if len(word)==1:
        continue
    rword=aliases.get(word,word)
    counts[rword]=counts.get(rword,0)+1

# pop() with a default instead of del: an excluded word that never occurred
# in the text must not raise KeyError.
for word in excludes:
    counts.pop(word,None)

items=list(counts.items())
items.sort(key=lambda x:x[1],reverse=True)

# Slice instead of indexing 0..19 so a text with fewer than 20 distinct
# words prints what is available rather than raising IndexError.
for rank,(word,count) in enumerate(items[:20],start=1):
    print("{0:>2}:{1:<10}{2:^7}".format(rank,word,count))

Insert picture description here

Published 72 original articles · Like 3 · Visits 3528

Guess you like

Origin blog.csdn.net/id__39/article/details/105634119