各种类型的数据结果保存到Excel中

一、list保存为Excel:

import xlrd
import xlwt 
import re
import numpy as np 

def lists_to_excel(listname1,listname2,listname3,listname4,filename):
    """Save four parallel lists as columns 0-3 of a single-sheet workbook.

    Row ``i`` of the sheet holds ``str(listnameN[i])`` in column ``N - 1``.
    The number of rows written is ``len(listname1)``; the other three lists
    are indexed with the same row numbers.

    Args:
        listname1: Values for column 0; its length decides the row count.
        listname2: Values for column 1.
        listname3: Values for column 2.
        listname4: Values for column 3.
        filename: Path handed to ``Workbook.save``.

    Raises:
        IndexError: If any of the other lists is shorter than ``listname1``.
    """
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet(u'sheet1', cell_overwrite_ok=True)
    columns = (listname1, listname2, listname3, listname4)
    for row in range(len(listname1)):
        # Every cell is stored as text, whatever its original type.
        for col, values in enumerate(columns):
            sheet.write(row, col, str(values[row]))
    workbook.save(filename)

lists_to_excel(list1,list2,list3,list4,filename)  # filename: path and name of the Excel file to save; may be absolute or relative.

二、字典数据结构保存到Excel:

import xlwt

def dict_to_excel(dict,filename):
    """Save a mapping as a two-column, single-sheet workbook.

    Each entry becomes one row: column 0 holds ``str(key)`` and column 1
    holds ``int(value)``. The key list and the value list are also printed
    to stdout before the workbook is saved.

    Args:
        dict: Mapping to export; every value must be convertible by ``int()``.
        filename: Path handed to ``Workbook.save``.

    Raises:
        ValueError, TypeError: If a value cannot be converted by ``int()``.
    """
    keys = list(dict.keys())
    values = list(dict.values())
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet(u'sheet1', cell_overwrite_ok=True)
    for row, (key, value) in enumerate(zip(keys, values)):
        sheet.write(row, 0, str(key))
        sheet.write(row, 1, int(value))
    print(keys)
    print(values)
    workbook.save(filename)

# Usage

dict_to_excel(dict,filename)   # filename: path and name of the Excel file to save; may be absolute or relative.

三、计算一个文本或者list中某些字段的频次,返回未排序的结果和按频次排序的结果:

import xlrd
import xlwt 
import re
import numpy as np 

def count_texts(s):
    """Count how often each item occurs in ``s``.

    Args:
        s: Iterable of hashable items (for example a list of strings).

    Returns:
        A ``(count_dict, count_dict1)`` tuple:

        * ``count_dict`` -- dict mapping each item to its number of
          occurrences, in first-seen order.
        * ``count_dict1`` -- list of ``(item, count)`` tuples sorted by count
          in descending order; items with equal counts keep first-seen order.

        Both are empty when ``s`` is empty.
    """
    count_dict = {}
    for word in s:
        count_dict[word] = count_dict.get(word, 0) + 1
    # Sort once, after counting has finished. Sorting a dict's items yields a
    # list of tuples, not a dict.
    count_dict1 = sorted(count_dict.items(), key=lambda x: x[1], reverse=True)
    return count_dict, count_dict1

# Usage

real_count_dict,real_count_dict1 = count_texts(list1)

四、向已有文件追加list中的文本数据:

import xlrd
import xlwt 
import re
import numpy as np 

def text_save(filename,data):
    """Append every item of ``data`` to ``filename``, one item per line.

    Each item is converted with ``str()`` and followed by a newline. The file
    is opened in append mode, so existing content is kept and the file is
    created if it does not exist yet.

    Args:
        filename: Path of the text file to append to.
        data: Iterable of items to write.
    """
    # The context manager closes the file even if str() or write() raises,
    # which the manual open()/close() pair did not guarantee.
    with open(filename, 'a') as out:
        out.writelines(str(item) + '\n' for item in data)

# Usage

filename='./text1.txt'
text_save(filename,data1) # data1 is an existing list; its items are appended to the existing file `filename`, separated by newlines

五、处理如下格式的文本数据:先是一对括号,括号内是有效文本,括号后紧跟对括号内文本的解释。对此类数据,仅提取括号内的有效文本:

import xlrd
import xlwt 
import re
import numpy as np 

def subString(text):
    """Return the concatenation of all text found inside ``(...)`` pairs.

    Characters are copied only while an opening ``(`` has been seen and the
    matching ``)`` has not. The Chinese numerals 一 to 八 are never copied, so
    list markers such as ``(一)`` contribute nothing. Text after an opening
    parenthesis that is never closed is discarded.

    Args:
        text: String (or iterable of single-character strings) to scan.

    Returns:
        One string made of every parenthesised segment, joined without a
        separator.
    """
    skipped = ['一','二','三','四','五','六','七','八']
    segments = []
    current = []
    inside = False
    for ch in text:
        if ch == '(':
            inside = True
            continue
        if ch == ')':
            # Closing a group flushes whatever was collected, even if empty.
            inside = False
            segments.append("".join(current))
            current = []
            continue
        # Skip numerals so that markers like "(一)" do not count as content.
        if inside and ch not in skipped:
            current.append(ch)
    return "".join(segments)

# Usage
text = subString(text)

六、Excel中文本数据的预处理:

import xlrd
import xlwt 
import re
import jieba 
import numpy as np 

def predict(predict_filename):
    """Load, clean and word-segment the texts stored in an Excel workbook.

    The text of every row is read from column index 2 (the third column) of
    the first sheet. For each text the function:

    1. removes enumeration markers (``1、`` to ``15、``, ``1.`` to ``15.``,
       ``一、`` to ``十五、`` and parenthesised ``(一)`` to ``(十五)``) so that
       a multi-item entry reads as one text;
    2. removes ASCII letters, digits and a fixed set of punctuation marks;
    3. segments the remainder with ``jieba.cut`` and joins the tokens with
       single spaces.

    Row/column counts and the final list are printed.

    Args:
        predict_filename: Path of the workbook passed to
            ``xlrd.open_workbook``.

    Returns:
        List with one cleaned, space-separated string per row. (The function
        previously only printed this list; returning it is additive.)
    """
    # Chinese numerals one to fifteen, as used in list markers.
    cn_number = "(?:十[一二三四五]?|[一二三四五六七八九])"
    marker_patterns = [
        # Arabic markers. The dot is inside a character class, so it is a
        # literal "." and no longer swallows an arbitrary character after
        # the digit.
        re.compile(r"(?:1[0-5]|[1-9])[、.]"),
        re.compile(cn_number + "、"),
        # Literal parentheses, ASCII or full-width (U+FF08 / U+FF09).
        # Unescaped "(...)" is a regex group and would delete every bare
        # numeral in the text.
        re.compile(r"[(\uFF08]" + cn_number + r"[)\uFF09]"),
    ]
    # ASCII letters, digits and the punctuation set of the original pattern,
    # written as a raw string with duplicates removed.
    noise_pattern = re.compile(r"[A-Za-z0-9\[\]`~!@#$^&*()=|{}《》':;,.<>/?\\%]")

    # Workbook that holds all texts to be processed.
    all_data = xlrd.open_workbook(predict_filename)
    all_table = all_data.sheet_by_index(0)   # first sheet

    all_nrows = all_table.nrows
    all_ncols = all_table.ncols
    print('文本数据条数',all_nrows)
    print('列数:',all_ncols)

    all_uncleaned_texts = []  # cleaned and segmented texts, one per row
    for row in range(all_nrows):
        # str() keeps non-text cells (e.g. numeric cells) from crashing re.
        text = str(all_table.cell(row, 2).value)
        for pattern in marker_patterns:
            text = pattern.sub("", text)
        text = noise_pattern.sub("", text)
        # jieba word segmentation (accurate mode).
        all_uncleaned_texts.append(' '.join(jieba.cut(text, cut_all=False)))
    print('所有数据中,去除了数字、字母且分词后文本:',all_uncleaned_texts)
    return all_uncleaned_texts

七、由字典的值去获得字典的键:

def get_keys(d, value):
    """Return every key of ``d`` whose value equals ``value``.

    Args:
        d: Mapping to search.
        value: Value to look for (compared with ``==``).

    Returns:
        List of matching keys in the mapping's iteration order; empty if no
        entry matches.
    """
    matches = []
    for key, candidate in d.items():
        if candidate == value:
            matches.append(key)
    return matches
# For each entry of test_predicted, collect the list of label_dict keys whose
# value equals that entry.
# NOTE(review): test_predicted and label_dict are not defined in this snippet;
# presumably predicted label ids and a label-name -> id mapping -- confirm.
text_label_texts = []
for i in range(len(test_predicted)):
    text_label_text = get_keys(label_dict,test_predicted[i])
    text_label_texts.append(text_label_text)

八、中文转相应的Unicode编码

def to_unicode(string):
    r"""Return the Unicode escape notation of every character in ``string``.

    Characters in the Basic Multilingual Plane become ``\uXXXX`` with exactly
    four upper-case hex digits (zero-padded, so ``"a"`` gives ``\u0061``).
    Characters above U+FFFF become ``\UXXXXXXXX`` with eight digits, so the
    result is always an unambiguous Python-style escape.

    Args:
        string: Text to convert.

    Returns:
        The concatenated escapes; an empty string for empty input.
    """
    escapes = []
    for v in string:
        code_point = ord(v)
        if code_point > 0xFFFF:
            # A five- or six-digit "\u" escape would be misread as a
            # four-digit escape followed by literal digits.
            escapes.append('\\U{:08X}'.format(code_point))
        else:
            escapes.append('\\u{:04X}'.format(code_point))
    return ''.join(escapes)
print(to_unicode("作为")) # prints \u4F5C\u4E3A

猜你喜欢

转载自blog.csdn.net/Jasminexjf/article/details/87071249
今日推荐