NLTK: Tokenizing, POS-Tagging, and Counting Word Frequencies in Health-Domain English Text

import re
import numpy as np
import pandas as pd
import nltk.tokenize as tk
import nltk.corpus as nc
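If the NLTK data packages are not installed yet, the stop-word lookup and the pos_tag call below raise a LookupError; a one-time download fixes that (these are the standard NLTK resource names):

import nltk
nltk.download('stopwords')                    # stop-word lists used below
nltk.download('averaged_perceptron_tagger')   # model behind nltk.tag.pos_tag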
handel_file = 'health_handel.csv'  # path for saving the tokenized data
# Read the raw data
data = pd.read_excel('health.xlsx')
print(data.head(10))

[Screenshot: output of data.head(10)]

stopwords = nc.stopwords.words('english')  # English stop words
tokenizer = tk.WordPunctTokenizer()  # tokenizer (defined here, though the cleaning below splits on whitespace)
## Clean a single word: strip punctuation, collapse letter repetitions, remove - and '
def preprocess_word(word):
    # Remove punctuation
    word = word.strip('\'"?!,.():;')
    # Convert more than 2 letter repetitions to 2 letter
    # funnnnny --> funny
    word = re.sub(r'(.)\1+', r'\1\1', word)
    # Remove - & '
    word = re.sub(r'(-|\')', '', word)
    return word
## Check whether a word is valid
def is_valid_word(word):
    # A valid word starts with a letter and contains only letters, digits, '.' and '_'
    return (re.search(r'^[a-zA-Z][a-z0-9A-Z\._]*$', word) is not None)
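A quick sanity check of the two helpers on made-up inputs:

print(preprocess_word('funnnnny!!'))   # -> 'funny'
print(preprocess_word("it's"))         # -> 'its'
print(is_valid_word('health'))         # -> True
print(is_valid_word('123abc'))         # -> False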
## Strip emoticons from a tweet
def handle_emojis(tweet):
    # Smile -- :), : ), :-), (:, ( :, (-:, :')
    tweet = re.sub(r'(:\s?\)|:-\)|\(\s?:|\(-:|:\'\))', ' ', tweet)
    # Laugh -- :D, : D, :-D, xD, x-D, XD, X-D
    tweet = re.sub(r'(:\s?D|:-D|x-?D|X-?D)', ' ', tweet)
    # Love -- <3, :*
    tweet = re.sub(r'(<3|:\*)', ' ', tweet)
    # Wink -- ;-), ;), ;-D, ;D, (;,  (-;
    tweet = re.sub(r'(;-?\)|;-?D|\(-?;)', ' ', tweet)
    # Sad -- :-(, : (, :(, ):, )-:
    tweet = re.sub(r'(:\s?\(|:-\(|\)\s?:|\)-:)', ' ', tweet)
    # Cry -- :,(, :'(, :"(
    tweet = re.sub(r'(:,\(|:\'\(|:"\()', ' ', tweet)
    return tweet
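For example, on a made-up string:

print(handle_emojis('great news :) so happy :D'))   # emoticons are replaced with spaces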
## Clean a tweet: lowercase it, remove URLs, @mentions, #hashtags, etc., then filter the words
def clean_text(tweet):
    processed_tweet = []
    # Convert to lower case
    tweet = tweet.lower()
    # Remove URLs
    tweet = re.sub(r'((www\.[\S]+)|(https?://[\S]+))', ' ', tweet)
    # Replace @handle with the word USER_MENTION
    tweet = re.sub(r'@[\S]+', 'USER_MENTION', tweet)
    # Remove #hashtags
    tweet = re.sub(r'#(\S+)', ' ', tweet)
    # Remove RT (retweet)
    tweet = re.sub(r'\brt\b', ' ', tweet)
    # Replace 2+ dots with space
    tweet = re.sub(r'\.{2,}', ' ', tweet)
    # Strip space, " and ' from tweet
    tweet = tweet.strip(' "\'')
    # Strip emoticons
    tweet = handle_emojis(tweet)
    # Replace multiple spaces with a single space
    tweet = re.sub(r'\s+', ' ', tweet)
    words = tweet.split()  # tweet is already lower-cased above
    words = [w for w in words if w not in stopwords]
    for word in words:
        word = preprocess_word(word)
        if is_valid_word(word):
            processed_tweet.append(word)
    return processed_tweet
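A quick end-to-end check of clean_text on a made-up tweet (not from the dataset):

sample = 'RT @user check https://example.com #health staying fit :) !!'
print(clean_text(sample))   # expected: ['USER_MENTION', 'check', 'staying', 'fit']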
data['clean_review']=data.Tweet.apply(clean_text)
data.to_csv(handel_file, index=False)
from nltk.tag import pos_tag
sentences=data.clean_review.tolist()
# print(sentences)
words=[]
for sent in sentences:
    for word in sent:
        words.append(word)
# print(words)
word_tag=pos_tag(words)
print(word_tag)
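pos_tag returns Penn Treebank tags (NN = singular noun, JJ = adjective, VBZ = third-person singular verb, and so on); an illustrative call:

print(pos_tag(['healthy', 'food']))   # typically [('healthy', 'JJ'), ('food', 'NN')]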

[Screenshot: output of print(word_tag), the POS-tagged word list]

# Word-frequency counts
word_dict = {}
for item in words:
    if item not in word_dict:  # first occurrence of this word
        word_dict[item] = 1
    else:
        word_dict[item] += 1
        
# print(word_dict)
key=list(word_dict.keys())    
value=list(word_dict.values())
# print(key)
# print(value)
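The loop above is the textbook version; collections.Counter from the standard library produces the same dictionary in one call:

from collections import Counter
word_dict = dict(Counter(words))   # same counts as the manual loop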

[Screenshot: the word-frequency dictionary]

# Second pass: reshape the tagged pairs into parallel lists
list_words = []
list_cixing = []  # POS tags ('cixing' means part of speech)
for k in word_tag:
    list_words.append(k[0])
    list_cixing.append(k[1])
row_list = len(list_words)  # number of data rows to write
# print(list_words)
# print(list_cixing)
import xlwt
# Write the results to a new Excel file
workbook = xlwt.Workbook()
sheet1 = workbook.add_sheet('sheet', cell_overwrite_ok=True)
# Header row first, so the file can later be sorted by the 'count' column
for column, name in enumerate(['word', 'pos', 'count']):
    sheet1.write(0, column, name)
# One data row per token: the word, its POS tag, and that word's corpus frequency
for row in range(row_list):
    sheet1.write(row + 1, 0, list_words[row])
    sheet1.write(row + 1, 1, list_cixing[row])
    sheet1.write(row + 1, 2, word_dict[list_words[row]])
workbook.save('分词_词性_词频.xls')
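As an aside, the same table could be written without xlwt by building a DataFrame directly; a minimal sketch using the lists assembled above (older pandas writes .xls via xlwt, newer versions only write .xlsx):

df = pd.DataFrame({'word': list_words,
                   'pos': list_cixing,
                   'count': [word_dict[w] for w in list_words]})
df.to_excel('分词_词性_词频.xls', index=False)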
# Read the processed file back
unsorted_df=pd.read_excel('分词_词性_词频.xls')
unsorted_df.head(5)

[Screenshot: unsorted_df.head(5)]

# Sort the table by word frequency, descending
sorted_df=unsorted_df.sort_values(by='count',ascending=False)
sorted_df.head(5)

[Screenshot: sorted_df.head(5)]

# Drop duplicate rows, keeping the first occurrence
sorted_df.drop_duplicates(keep='first',inplace=True)
sorted_df

[Screenshot: the deduplicated sorted_df]

# Save the sorted, deduplicated result
sorted_df.to_excel('分词_词性_词频2.xls',index=False)