Foreword
Pandas comes up constantly when processing data at work, so this article records some of the pandas habits I rely on.
1. Merge data
# Read the original data
data1 = pd.read_excel('/home/zhenhengdong/WORk/Classfier/Dates/Original/1.xlsx')
data2 = pd.read_excel('/home/zhenhengdong/WORk/Classfier/Dates/Original/2.xlsx')
data3 = pd.read_excel('/home/zhenhengdong/WORk/Classfier/Dates/Original/3.xlsx')
data4 = pd.read_excel('/home/zhenhengdong/WORk/Classfier/Dates/Original/20230712.xlsx')
# Create an empty frame to hold the merged columns
data = pd.DataFrame()
# Selectively merge the original data, column by column.
# BUG FIX: the original used ignore_index=False, which keeps each file's own
# 0..n row labels, so the concatenated Series carries duplicate labels.
# Assigning such a Series to `data` aligns on those duplicate labels, which
# misaligns rows or raises on reindexing. ignore_index=True gives every
# concatenated column the same clean 0..n-1 index. The copy-pasted concat
# per column is replaced by one loop over the wanted columns.
for column in ['序号', '内容', '一级分类', '二级分类', '反馈类型']:
    data[column] = pd.concat(
        [data4[column], data3[column], data1[column], data2[column]],
        ignore_index=True)
2. Delete data
2.1 Delete row
The index parameter specifies the row to be deleted, and the inplace parameter indicates whether to operate on the original data set.
# Delete rows by their index labels; inplace=True mutates `data`
# directly (the call returns None).
data.drop(index=[0, 4], inplace=True)
2.2 Delete column
The columns parameter specifies the columns to be deleted, and the inplace parameter indicates whether to operate on the original data set.
#根据列名称删除列
# Delete columns by name; inplace=True mutates `data` directly.
data.drop(columns=['class'], inplace=True)
You can also use del to delete.
# `del` removes the column in place as well
del data['class']
2.3 Delete duplicate rows
The subset parameter specifies the columns that need to be checked for duplicates; the default is all columns. The inplace parameter indicates whether to operate on the original dataset. keep='last' means keep the last duplicate row, and keep='first' means keep the first duplicate row.
# Deduplicate on the "gender" column only; keep='last' retains the last
# occurrence of each duplicate group, inplace=True mutates `data`.
data.drop_duplicates(subset="gender", inplace=True,keep='last')
2.4 Delete missing rows
axis: axis. 0 or 'index' means delete by row; 1 or 'columns' means delete by column.
how: filter method. 'any' means a row/column is deleted as soon as it contains at least one null value; 'all' means a row/column is deleted only when all of its values are null.
thresh: Minimum number of non-empty elements. int type, the default is None. If the number of non-empty elements in the row/column is less than this value, delete the row/column.
subset: subset. A list whose elements are row or column indices. If axis=0 or 'index', the elements in the subset are the index of the column; if axis=1 or 'column', the elements in the subset are the index of the row. The sub-area restricted by subset is a condition judgment area for judging whether to delete the row/column.
inplace: Whether to replace in place. Boolean value, default is False. If True, operate on the original DataFrame and return None.
# Signature for reference -- call it as data.dropna(...):
dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)
# NOTE(review): none of the calls below pass inplace=True or assign the
# result, so they do not change `data`; assign the return value to keep it.
# Drop rows that contain any missing value
data.dropna()
# Drop rows whose values are all missing
data.dropna(how='all')
# Keep rows that have at least 2 NON-missing values (thresh counts
# non-null values, not nulls -- the original comment had it backwards)
data.dropna(thresh=2)
# Drop rows with missing values in the given column(s)
data.dropna(subset=['列名'])
2.5 Delete rows by condition
2.5.1 Random deletion of eligible rows
Delete the lines whose feedback type is BUG, and randomly delete 44 lines.
from random import sample

# Collect the index labels of every row whose feedback type is 'BUG问题',
# then randomly pick 44 of them to delete.
bug_row_labels = data[data.反馈类型 == 'BUG问题'].index.tolist()
rows_to_drop = sample(bug_row_labels, k=44)
data = data.drop(index=rows_to_drop)
# Rebuild a clean 0..n-1 index after the deletion
data.reset_index(drop=True, inplace=True)
2.5.2 Delete all items that meet the conditions
# Drop every row whose 反馈类型 equals '其他' (delete all matches)
data = data.drop(index = data[(data.反馈类型 == '其他')].index.tolist())
3. Reset the index
# Rebuild a clean 0..n-1 index; drop=True discards the old index labels
data.reset_index(drop=True, inplace=True)
4. View the number of categories
# Show each (category, count) pair of 联合分类, most frequent first,
# with a running position number.
for position, pair in enumerate(data['联合分类'].value_counts().items()):
    print(position, pair)
# Per-value counts of the 标注类型 column
data['标注类型'].value_counts()
5. Traverse data
Get the data of the specified row through data.iloc[index].
# Collect the 内容 and 标注类型 values row by row.
content_list = []
label_list = []
for index in range(len(data)):
    # fetch the row once instead of calling data.iloc[index] twice
    row = data.iloc[index]
    content_list.append(row['内容'])
    label_list.append(row['标注类型'])
6. New data
# Assemble the extracted samples into a fresh two-column DataFrame
new_data = pd.DataFrame({'content': content_list, 'label': label_list})
7. Statistics and set labels
# Map each label to an integer id, ordered by frequency (most common -> 0)
Feedback_type = {
    label: index
    for index, label in enumerate(new_data['label'].value_counts().keys())
}

def add_Feedback_type_index(Feed_type):
    """Return the integer id assigned to *Feed_type* (None if unknown)."""
    return Feedback_type.get(Feed_type)

# Series.map with a dict is the idiomatic (and faster) equivalent of
# apply(add_Feedback_type_index); unknown labels become NaN either way.
new_data['feedback_type_label'] = new_data['label'].map(Feedback_type)
8. Save the mapping file
# Persist the label->id mapping. ensure_ascii=False writes the non-ASCII
# labels verbatim, so the file must be opened as UTF-8 explicitly -- the
# platform default encoding is not guaranteed to handle them.
with open("Feedback_type.json", "w", encoding="utf-8") as outfile:
    json.dump(Feedback_type, outfile, ensure_ascii=False)
9. Statistics data generation excel
def Statistics(data, filename=r"联合分类.xlsx"):
    """Write (category, count) pairs to an Excel report.

    Args:
        data: iterable of (category, count) pairs, e.g.
            ``df.联合分类.value_counts().items()``.
        filename: output path; defaults to the original hard-coded name
            so existing callers are unaffected.
    """
    keys = []
    values = []
    # split the pairs into two parallel columns
    for key, value in data:
        keys.append(key)
        values.append(value)
    report = pd.DataFrame()
    report['类目'] = keys
    report['数量'] = values
    report.to_excel(excel_writer=filename)
# Export the 联合分类 frequency table to 联合分类.xlsx
Statistics(data.联合分类.value_counts().items())
10. Store data
# Save the merged data to CSV; index=False omits the row-index column
data.to_csv('./Total_data.csv',index = False,encoding='utf8')
11. Data negative sampling
# Collect the 联合分类 categories that occur more than 4000 times
# (the original looped with an unused enumerate index; a comprehension
# over the (category, count) pairs says the same thing directly)
drop_element_list = [
    category
    for category, count in data.联合分类.value_counts().items()
    if count > 4000
]
# Down-sample the over-represented categories
from random import sample

def drop_element(data, drop_element_list, n_drop=1200):
    """Randomly delete ``n_drop`` rows of every listed 联合分类 category.

    Args:
        data: DataFrame with a 联合分类 column; not mutated.
        drop_element_list: category values to down-sample.
        n_drop: rows to remove per category. Default 1200 preserves the
            original hard-coded behavior. Each listed category must have
            at least ``n_drop`` rows, or ``sample`` raises ValueError.

    Returns:
        The down-sampled DataFrame with a fresh 0..n-1 index.
    """
    for element in drop_element_list:
        candidate_rows = data[data['联合分类'] == element].index.tolist()
        rows_to_drop = sample(candidate_rows, k=n_drop)
        data = data.drop(index=rows_to_drop)
        # re-index inside the loop so the labels collected for the next
        # category are positional again
        data.reset_index(drop=True, inplace=True)
    return data
# Apply the down-sampling to the merged data set
data = drop_element(data,drop_element_list)