Use Python to read one column or several columns from a .csv file

1. Merge the feature values from three CSV files into a single file, appending the corresponding label to each row at the same time.

# -*-coding:utf-8 -*-
import csv;
# Class labels; each one is appended as the last column of every row taken
# from the matching source file.
label1 = '1'
label2 = '2'
label3 = '3'
# Header row: ten feature columns followed by the label column.
header = "feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,label" + "\n"
# Source CSV files paired with the label their rows receive, in output order.
sources = [
    ("./dataset/f02.csv", label1),
    ("./dataset/g03.csv", label2),
    ("./dataset/normal05.csv", label3),
]
# Open the output once for the whole merge instead of once per row.
# NOTE: append mode is kept on purpose, so running the script again on an
# existing file adds a second header and a second copy of the rows.
with open("./dataset/dataTime2.csv", 'a') as rfile:
    rfile.write(header)
    for path, label in sources:
        # Text mode is required: in binary mode each line is `bytes` on
        # Python 3 and concatenating it with the `str` label raises TypeError.
        with open(path, 'r') as file:
            for line in file:
                line = line.strip()
                if not line:
                    # A blank line ends the copy of this source file.
                    break
                rfile.write(line + ',' + label + "\n")

 

2. To read a single column from the CSV file, use `csv.DictReader` and collect the values stored under the desired header (here, the column whose header is `label`).

# Path of the labelled CSV file to read.
filename = "./dataset/dataTime2.csv"
list1 = []
with open(filename, 'r') as file:
    # DictReader takes the field names from the first row, so every row is a
    # mapping keyed by column header.
    reader = csv.DictReader(file)
    column = []
    for row in reader:
        # Keep only the value stored under the 'label' header.
        column.append(row['label'])

 

3. To read several columns at once, load the file with pandas; the code below takes the `label` column as `y` and every column except `label` as `x`.

import pandas as pd
# Load the whole CSV file into a DataFrame.
odata = pd.read_csv(filename)
# Target values: the 'label' column.
y = odata['label']
# Feature values: every column except 'label'.
x = odata.drop(columns=['label'])

4. The file can also be read into a list of NumPy arrays (`list[np.array]`), one array per line.

import numpy as np  # needed for np.array below; not imported elsewhere in this snippet

# Path of the labelled CSV file to read.
filename = "./dataset/dataTime2.csv"
# One np.array of string fields per line of the file (the header line included).
list1 = []
with open(filename, 'r') as file:
    # Iterate over the file so that every pass reads the next line. Reading
    # one line up front and looping on it with `while a:` never advances,
    # so the loop would append the same row forever.
    for a in file:
        # Drop the trailing newline and split the line into its comma-separated fields.
        c = np.array(a.strip("\n").split(","))
        list1.append(c)

5. The file can also be loaded as a dataset of TensorFlow tensors.

# -*-coding:utf-8 -*-
import tensorflow as tf
# Read the CSV through a TensorFlow filename queue and collect the decoded
# rows into Python lists (trainx: features, trainy: labels).
# The first line (the header row) must be skipped when reading.
# NOTE(review): this reads "dataTime.csv", while the earlier steps write and
# read "dataTime2.csv" — confirm which file is intended.
# NOTE: `filename` is rebound here from a path string to a filename queue.
filename = tf.train.string_input_producer(["./dataset/dataTime.csv"])
reader = tf.TextLineReader(skip_header_lines=1)
# Each read yields one line of the file as `value`.
key, value = reader.read(filename)
# One entry per CSV column: ten float columns defaulting to 1.0, followed by
# an int32 column with an empty default (presumably marking the label column
# as required — TODO confirm against the tf.decode_csv docs for the TF version in use).
record_defaults = [[1.], [1.], [1.], [1.], [1.], [1.], [1.], [1.], [1.], [1.], tf.constant([], dtype=tf.int32)]
col1, col2, col3, col4, col5, col6, col7, col8, col9, col10, col11= tf.decode_csv(
    value, record_defaults=record_defaults)
# Combine the ten feature columns into one tensor; col11 is kept separate as the label.
features = tf.stack([col1, col2, col3, col4, col5, col6, col7, col8, col9, col10])
with tf.Session() as sess:
  # Start populating the filename queue.
  coord = tf.train.Coordinator()
  threads = tf.train.start_queue_runners(coord=coord)
  trainx = []
  trainy = []
  # The number of rows to fetch is hard-coded to 81000.
  for i in range(81000):
    # Retrieve a single instance:
      example, label = sess.run([features, col11])
      trainx.append(example)
      trainy.append(label)
  # Ask the queue-runner threads to stop and wait for them to finish.
  coord.request_stop()
  coord.join(threads)
# The final length is 81000; each trainx entry holds the 10 features.

References: http://t.csdn.cn/HFTPy

おすすめ

転載: blog.csdn.net/weixin_64338372/article/details/130251739