Foreword
Pandas comes up constantly when processing data at work, so this article records some of the pandas habits I rely on.
1. Merge data
# Read the original data
data1 = pd.read_excel('/home/zhenhengdong/WORk/Classfier/Dates/Original/1.xlsx')
data2 = pd.read_excel('/home/zhenhengdong/WORk/Classfier/Dates/Original/2.xlsx')
data3 = pd.read_excel('/home/zhenhengdong/WORk/Classfier/Dates/Original/3.xlsx')
data4 = pd.read_excel('/home/zhenhengdong/WORk/Classfier/Dates/Original/20230712.xlsx')
# Create an empty frame to hold the merged columns
data = pd.DataFrame()
# Selectively merge the original data, column by column.
# BUG FIX: the original used ignore_index=False, which keeps each file's own
# 0..n row labels, so the concatenated Series carries duplicate labels.
# Assigning such a Series to `data` aligns on those duplicate labels, which
# misaligns rows or raises on reindexing. ignore_index=True gives every
# concatenated column the same clean 0..n-1 index. The copy-pasted concat
# per column is replaced by one loop over the wanted columns.
for column in ['序号', '内容', '一级分类', '二级分类', '反馈类型']:
    data[column] = pd.concat(
        [data4[column], data3[column], data1[column], data2[column]],
        ignore_index=True)
2. Delete data
2.1 Delete row
The index parameter specifies the row to be deleted, and the inplace parameter indicates whether to operate on the original data set.
# Delete rows by their index labels; inplace=True mutates `data`
# directly (the call returns None).
data.drop(index=[0, 4], inplace=True)
2.2 Delete column
The columns parameter specifies the columns to be deleted, and the inplace parameter indicates whether to operate on the original data set.
#根据列名称删除列
# Delete columns by name; inplace=True mutates `data` directly.
data.drop(columns=['class'], inplace=True)
You can also use del to delete.
# `del` removes the column in place as well
del data['class']
2.3 Delete duplicate rows
The subset parameter specifies the columns that need to be checked for duplicates; the default is all columns. The inplace parameter indicates whether to operate on the original dataset. keep='last' means keep the last duplicate row, and keep='first' means keep the first duplicate row.
# Deduplicate on the "gender" column only; keep='last' retains the last
# occurrence of each duplicate group, inplace=True mutates `data`.
data.drop_duplicates(subset="gender", inplace=True,keep='last')
2.4 Delete missing rows
axis: axis. 0 or 'index' means delete by row; 1 or 'columns' means delete by column.
how: filter method. 'any' means a row/column is deleted as soon as it contains at least one null value; 'all' means a row/column is deleted only when all of its values are null.
thresh: Minimum number of non-empty elements. int type, the default is None. If the number of non-empty elements in the row/column is less than this value, delete the row/column.
subset: subset. A list whose elements are row or column indices. If axis=0 or 'index', the elements in the subset are the index of the column; if axis=1 or 'column', the elements in the subset are the index of the row. The sub-area restricted by subset is a condition judgment area for judging whether to delete the row/column.
inplace: Whether to replace in place. Boolean value, default is False. If True, operate on the original DataFrame and return None.
# Signature for reference -- call it as data.dropna(...):
dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)
# NOTE(review): none of the calls below pass inplace=True or assign the
# result, so they do not change `data`; assign the return value to keep it.
# Drop rows that contain any missing value
data.dropna()
# Drop rows whose values are all missing
data.dropna(how='all')
# Keep rows that have at least 2 NON-missing values (thresh counts
# non-null values, not nulls -- the original comment had it backwards)
data.dropna(thresh=2)
# Drop rows with missing values in the given column(s)
data.dropna(subset=['列名'])
2.5 Delete rows by condition
2.5.1 Random deletion of eligible rows
Delete the lines whose feedback type is BUG, and randomly delete 44 lines.
from random import sample

# Collect the index labels of every row whose feedback type is 'BUG问题',
# then randomly pick 44 of them to delete.
bug_row_labels = data[data.反馈类型 == 'BUG问题'].index.tolist()
rows_to_drop = sample(bug_row_labels, k=44)
data = data.drop(index=rows_to_drop)
# Rebuild a clean 0..n-1 index after the deletion
data.reset_index(drop=True, inplace=True)
2.5.2 Delete all items that meet the conditions
# Drop every row whose 反馈类型 equals '其他' (delete all matches)
data = data.drop(index = data[(data.反馈类型 == '其他')].index.tolist())
3. Reset the index
# Rebuild a clean 0..n-1 index; drop=True discards the old index labels
data.reset_index(drop=True, inplace=True)
4. View the number of categories
# Show each (category, count) pair of 联合分类, most frequent first,
# with a running position number.
for position, pair in enumerate(data['联合分类'].value_counts().items()):
    print(position, pair)
# Per-value counts of the 标注类型 column
data['标注类型'].value_counts()
5. Traverse data
Get the data of the specified row through data.iloc[index].
# Collect the 内容 and 标注类型 values row by row.
content_list = []
label_list = []
for index in range(len(data)):
    # fetch the row once instead of calling data.iloc[index] twice
    row = data.iloc[index]
    content_list.append(row['内容'])
    label_list.append(row['标注类型'])
6. New data
# Assemble the extracted samples into a fresh two-column DataFrame
new_data = pd.DataFrame({'content': content_list, 'label': label_list})
7. Statistics and set labels
# Map each label to an integer id, ordered by frequency (most common -> 0)
Feedback_type = {
    label: index
    for index, label in enumerate(new_data['label'].value_counts().keys())
}

def add_Feedback_type_index(Feed_type):
    """Return the integer id assigned to *Feed_type* (None if unknown)."""
    return Feedback_type.get(Feed_type)

# Series.map with a dict is the idiomatic (and faster) equivalent of
# apply(add_Feedback_type_index); unknown labels become NaN either way.
new_data['feedback_type_label'] = new_data['label'].map(Feedback_type)
8. Save the mapping file
# Persist the label->id mapping. ensure_ascii=False writes the non-ASCII
# labels verbatim, so the file must be opened as UTF-8 explicitly -- the
# platform default encoding is not guaranteed to handle them.
with open("Feedback_type.json", "w", encoding="utf-8") as outfile:
    json.dump(Feedback_type, outfile, ensure_ascii=False)
9. Statistics data generation excel
def Statistics(data, filename=r"联合分类.xlsx"):
    """Write (category, count) pairs to an Excel report.

    Args:
        data: iterable of (category, count) pairs, e.g.
            ``df.联合分类.value_counts().items()``.
        filename: output path; defaults to the original hard-coded name
            so existing callers are unaffected.
    """
    keys = []
    values = []
    # split the pairs into two parallel columns
    for key, value in data:
        keys.append(key)
        values.append(value)
    report = pd.DataFrame()
    report['类目'] = keys
    report['数量'] = values
    report.to_excel(excel_writer=filename)
# Export the 联合分类 frequency table to 联合分类.xlsx
Statistics(data.联合分类.value_counts().items())
10. Store data
# Save the merged data to CSV; index=False omits the row-index column
data.to_csv('./Total_data.csv',index = False,encoding='utf8')
11. Data negative sampling
# Collect the 联合分类 categories that occur more than 4000 times
# (the original looped with an unused enumerate index; a comprehension
# over the (category, count) pairs says the same thing directly)
drop_element_list = [
    category
    for category, count in data.联合分类.value_counts().items()
    if count > 4000
]
# Down-sample the over-represented categories
from random import sample

def drop_element(data, drop_element_list, n_drop=1200):
    """Randomly delete ``n_drop`` rows of every listed 联合分类 category.

    Args:
        data: DataFrame with a 联合分类 column; not mutated.
        drop_element_list: category values to down-sample.
        n_drop: rows to remove per category. Default 1200 preserves the
            original hard-coded behavior. Each listed category must have
            at least ``n_drop`` rows, or ``sample`` raises ValueError.

    Returns:
        The down-sampled DataFrame with a fresh 0..n-1 index.
    """
    for element in drop_element_list:
        candidate_rows = data[data['联合分类'] == element].index.tolist()
        rows_to_drop = sample(candidate_rows, k=n_drop)
        data = data.drop(index=rows_to_drop)
        # re-index inside the loop so the labels collected for the next
        # category are positional again
        data.reset_index(drop=True, inplace=True)
    return data
# Apply the down-sampling to the merged data set
data = drop_element(data,drop_element_list)