划分训练、测试数据集,从 DataFrame 中选取固定 id 的行数据并存入 txt 文件

import os, random, shutil

# Make sure the output directory exists, reporting which case applied.
test_path = './test_path/'
if os.path.exists(test_path):
    print('test_path is exit')
else:
    os.makedirs(test_path)
    print('test_path is Ok')


new_train_list_path = './test_path/new_train_list.txt'
old_train_list_path = './train_list.txt'

# Rewrite the original list without the leading directory component:
# every "<dir>/<img> <pid>" line becomes "<img> <pid>" in new_train_list.txt.
# Both files are opened in one `with` statement so they are closed even if a
# malformed line raises part-way through (the input handle was previously
# never closed explicitly).
with open(old_train_list_path, 'r') as f_old_train, \
        open(new_train_list_path, 'w') as f_new_train:
    for line in f_old_train:
        # Skip blank lines (e.g. a trailing newline at the end of the file),
        # which would otherwise fail with IndexError on the split below.
        if not line.strip():
            continue
        fields = line.split(' ')
        img = fields[0].split('/')[1]
        pid = fields[1].rstrip()
        f_new_train.write(img + ' ' + pid + '\n')



## 划分test_split.txt

import pandas as pd
import numpy as np

# Load the space-separated "<img> <pid>" list at new_train_list_path into a
# two-column DataFrame.
new_train_df = pd.read_table(new_train_list_path, sep=' ', header=None)
new_train_df.columns = ['img', 'pid']

# Output paths. These must be assigned before the clean-up below; checking
# them first raised NameError.
finally_train_list = './test_path/finally_train_list.txt'
test_split_100 = './test_path/test_split_1200.txt'

# Remove stale outputs left over from a previous run.
for stale_path in (finally_train_list, test_split_100):
    if os.path.exists(stale_path):
        os.remove(stale_path)

import random

# Draw 1200 distinct pids for the test split. random.sample raises ValueError
# when the data contains fewer than 1200 distinct pids.
all_random_pid = random.sample(list(new_train_df['pid'].unique()), 1200)

# Gather the rows of every sampled pid, keeping the sampling order.
# DataFrame.append was removed in pandas 2.0 (and re-copied the frame on every
# iteration), so the per-pid pieces are concatenated once instead.
test_split_100_df = pd.concat(
    [new_train_df[new_train_df['pid'] == random_pid]
     for random_pid in all_random_pid]
)

# Every row whose pid was not sampled stays in the training set.
new_train_df = new_train_df[~new_train_df['pid'].isin(all_random_pid)]

# Write both splits as CSV (default to_csv settings: header and index included).
test_split_100_df.to_csv(test_split_100)
new_train_df.to_csv(finally_train_list)
# Notebook-style preview of the first rows; has no visible effect in a script.
test_split_100_df.head(10)
    
    
发布了29 篇原创文章 · 获赞 12 · 访问量 1万+

猜你喜欢

转载自blog.csdn.net/c2250645962/article/details/103253996