Encoder+Decoder+LSTM for Image Frame Prediction

Code repository: https://github.com/wdf19961118/LSTM

Problem description:

Here we build an image-sequence predictor based on a convolutional recurrent network. The input is 16 consecutive image frames, each of size (3, 128, 128). A convolutional encoder extracts features from each of the 16 frames, the feature sequence is fed into a recurrent network (LSTM), and a deconvolutional decoder maps the LSTM output back to a tensor of the original image size (3, 128, 128). In other words, the network generates the 17th frame from the previous 16 frames; the actual 17th frame of the sequence serves as the label for computing the loss.
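As an orientation, the tensor shapes flow through the pipeline as follows (B is the batch size; the details match the code below):

# (B, 16, 3, 128, 128)   input: B sequences of 16 frames
#   -> reshape to (B*16, 3, 128, 128), run the CNN encoder
# (B*16, 1024)            one 1024-d feature vector per frame
#   -> reshape to (B, 16, 1024), run the LSTM
# (1, B, 1024)            hn, the LSTM's last hidden state
#   -> project, reshape to (B, 1024, 1, 1), run the deconv decoder
# (B, 3, 128, 128)        the predicted 17th frame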

Data preprocessing:

1. We want a txt file in which each line records the paths of 17 consecutive frames.

2. How do we generate this txt path file?

Dataset layout:

1) The folder names are in numeric order: 0, 1, 2, ...

2) Each folder contains the decomposed frames of one video, named numerically as sketched below.
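For illustration (hypothetical names; the sort key int(x[:-4]) in the code below assumes a four-character extension such as .jpg):

imgsrc/
    0/
        0.jpg
        1.jpg
        2.jpg
        ...
    1/
        0.jpg
        1.jpg
        ...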

3. Code:

import os

root = '/home/lab226/wdf/imgsrc'
fp = open('./img_path.txt', 'w+')
imgfile_list = os.listdir(root)
# sort the video folders by their numeric names
imgfile_list.sort(key=lambda x: int(x))
seqsize = 17
for imgfile in imgfile_list:
    filepath = os.path.join(root, imgfile)
    img_list = os.listdir(filepath)
    # this sort matters: os.listdir returns files in arbitrary order, not the
    # numeric order we need ([:-4] strips the extension before comparing)
    img_list.sort(key=lambda x: int(x[:-4]))
    # sliding window over the frames, stride 8
    # (+1 so the last full window is not skipped)
    for i in range(0, len(img_list) - seqsize + 1, 8):
        for j in range(i, i + seqsize):
            path = os.path.join(filepath, img_list[j])
            if j == i + seqsize - 1:
                fp.write(path + '\n')
            else:
                fp.write(path + ' ')
fp.close()
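Each line of img_path.txt then contains 17 space-separated paths, for example (middle paths elided):

/home/lab226/wdf/imgsrc/0/0.jpg /home/lab226/wdf/imgsrc/0/1.jpg ... /home/lab226/wdf/imgsrc/0/16.jpg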

Data loading:

I wrote my own SeqDataset, overriding the Dataset class's __getitem__() so that each iteration returns 16 consecutive frames plus the 17th frame as the label. Full code below:

import numpy as np
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torchvision.datasets.folder import default_loader

class SeqDataset(Dataset):
    def __init__(self, txt, transform=None, target_transform=None, loader=default_loader):
        fh = open(txt, 'r')
        imgseqs = []
        for line in fh:
            imgseqs.append(line.strip())
        fh.close()
        self.num_samples = len(imgseqs)
        self.imgseqs = imgseqs
        self.transform = transform
        self.target_transform = target_transform
        self.loader = loader

    def __len__(self):
        # required by DataLoader for a map-style dataset
        return self.num_samples

    def __getitem__(self, index):
        # note: the given index is ignored and a random sequence is drawn
        # instead, which makes shuffle=True in the DataLoader redundant
        current_index = np.random.choice(range(0, self.num_samples))
        imgs_path = self.imgseqs[current_index].split()
        # the first 16 paths are the input frames, the last one is the label
        current_imgs_path = imgs_path[:-1]
        current_label_path = imgs_path[-1]
        current_label = self.loader(current_label_path)

        current_imgs = []
        for frame in current_imgs_path:
            img = self.loader(frame)
            if self.transform is not None:
                img = self.transform(img)
            current_imgs.append(img)
        current_label = self.transform(current_label)
        # stack the 16 frames into a (16, 3, 128, 128) array
        batch_cur_imgs = np.stack(current_imgs, axis=0)
        return batch_cur_imgs, current_label


transform_list = [
    transforms.ToTensor()
]

data_transforms = transforms.Compose(transform_list)

# BATCH_SIZE (and SEQ_SIZE below) are constants defined elsewhere in the script
train_data = SeqDataset(txt='./img_path.txt', transform=data_transforms)
train_loader = DataLoader(train_data, shuffle=True, num_workers=20, batch_size=BATCH_SIZE)
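A quick sanity check (a sketch, assuming BATCH_SIZE has been set, e.g. to 4) confirms the shapes the model expects:

inputs, label = next(iter(train_loader))
print(inputs.shape)  # torch.Size([4, 16, 3, 128, 128])
print(label.shape)   # torch.Size([4, 3, 128, 128])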

Model overview:

The model consists of two parts: an Encoder+LSTM and a Decoder.

Full code:

import torch
import torch.nn as nn
from torch.autograd import Variable

class EncoderMUG2d_LSTM(nn.Module):
    def __init__(self, input_nc=3, encode_dim=1024, lstm_hidden_size=1024,
                 seq_len=SEQ_SIZE, num_lstm_layers=1, bidirectional=False):
        super(EncoderMUG2d_LSTM, self).__init__()
        self.seq_len = seq_len
        self.num_directions = 2 if bidirectional else 1
        self.num_lstm_layers = num_lstm_layers
        self.lstm_hidden_size = lstm_hidden_size
        #3*128*128
        self.encoder = nn.Sequential(
            nn.Conv2d(input_nc, 32, 4,2,1), # 32*64*64
            nn.BatchNorm2d(32),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv2d(32, 64, 4, 2, 1), # 64*32*32
            nn.BatchNorm2d(64),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv2d(64, 128, 4, 2, 1), # 128*16*16
            nn.BatchNorm2d(128),
            nn.LeakyReLU(0.2, inplace=True),

            nn.Conv2d(128, 256, 4, 2, 1), # 256*8*8
            nn.BatchNorm2d(256),
            nn.LeakyReLU(0.2, inplace=True),

            nn.Conv2d(256, 512, 4, 2, 1), # 512*4*4
            nn.BatchNorm2d(512),
            nn.LeakyReLU(0.2, inplace=True),

            nn.Conv2d(512, 512, 4, 2, 1),  # 512*2*2 
            nn.BatchNorm2d(512),
            nn.LeakyReLU(0.2, inplace=True),

            nn.Conv2d(512, 1024, 4, 2, 1),  # 1024*1*1
            nn.BatchNorm2d(1024),
            nn.LeakyReLU(0.2, inplace=True),

        )

        self.fc = nn.Linear(1024, encode_dim)
        # match the hidden size and layer count declared in the constructor
        self.lstm = nn.LSTM(encode_dim, lstm_hidden_size, num_layers=num_lstm_layers,
                            batch_first=True, bidirectional=bidirectional)

    def init_hidden(self, x):
        batch_size = x.size(0)
        h = x.data.new(
                self.num_directions * self.num_lstm_layers, batch_size, self.lstm_hidden_size).zero_()
        c = x.data.new(
                self.num_directions * self.num_lstm_layers, batch_size, self.lstm_hidden_size).zero_()
        return Variable(h), Variable(c)


    def forward(self, x):
        #x.shape [batchsize,seqsize,3,128,128]
        B = x.size(0)
        x = x.view(B * SEQ_SIZE, 3, 128, 128) #x.shape[batchsize*seqsize,3,128,128]
        # [batchsize*seqsize, 3, 128, 128] -> [batchsize*seqsize, 1024,1,1]
        x = self.encoder(x)
        #[batchsize * seqsize, 1024, 1, 1]-> [batchsize*seqsize, 1024]
        x = x.view(-1, 1024)
        # [batchsize * seqsize, 1024]
        x = self.fc(x)
        # [batchsize , seqsize ,1024]
        x = x.view(-1, SEQ_SIZE, x.size(1))
        h0, c0 = self.init_hidden(x)
        output, (hn, cn) = self.lstm(x, (h0, c0))
        # hn: [num_layers*num_directions, B, hidden] = [1, B, 1024];
        # the final hidden state summarizes the whole 16-frame sequence
        return hn

class DecoderMUG2d(nn.Module):
    def __init__(self, output_nc=3, encode_dim=1024):  # output size: 3*128*128
        super(DecoderMUG2d, self).__init__()

        self.project = nn.Sequential(
            nn.Linear(encode_dim, 1024*1*1),
            nn.ReLU(inplace=True)
        )
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(1024, 512, 4), # 512*4*4
            nn.BatchNorm2d(512),
            nn.ReLU(True),

            nn.ConvTranspose2d(512, 256, 4, stride=2), # 256*10*10
            nn.BatchNorm2d(256),
            nn.ReLU(True),

            nn.ConvTranspose2d(256, 128, 4), # 128*13*13
            nn.BatchNorm2d(128),
            nn.ReLU(True),

            nn.ConvTranspose2d(128, 64, 4,stride=2),  # 64*28*28
            nn.BatchNorm2d(64),
            nn.ReLU(True),

            nn.ConvTranspose2d(64, 32, 4),  # 32*31*31
            nn.BatchNorm2d(32),
            nn.ReLU(True),

            nn.ConvTranspose2d(32, 16, 4,stride=2),  # 16*64*64
            nn.BatchNorm2d(16),
            nn.ReLU(True),

            nn.ConvTranspose2d(16, output_nc, 4, stride=2, padding=1),  # 3*128*128
            nn.Sigmoid(),
        )
    def forward(self, x):
        x = self.project(x)
        x = x.view(-1, 1024, 1, 1)
        decode = self.decoder(x)
        return decode
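The spatial sizes in the comments above follow PyTorch's ConvTranspose2d output formula (with dilation 1 and no output padding): H_out = (H_in - 1)*stride - 2*padding + kernel_size. For example:

# first layer:  (1 - 1)*1 - 0    + 4 = 4    -> 1024*1*1 becomes 512*4*4
# second layer: (4 - 1)*2 - 0    + 4 = 10   -> 512*4*4  becomes 256*10*10
# last layer:   (64 - 1)*2 - 2*1 + 4 = 128  -> 16*64*64 becomes 3*128*128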

class net(nn.Module):
    def __init__(self):
        super(net,self).__init__()
        self.n1 = EncoderMUG2d_LSTM()
        self.n2 = DecoderMUG2d()

    def forward(self, x):
        output = self.n1(x)
        output = self.n2(output) #B*3*128*128
        return output
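Before training, a quick shape smoke test (a sketch; SEQ_SIZE = 16 as elsewhere in the post) confirms the end-to-end mapping:

model = net()
dummy = torch.randn(2, SEQ_SIZE, 3, 128, 128)  # a fake batch of 2 sequences
out = model(dummy)
print(out.shape)  # torch.Size([2, 3, 128, 128])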

Training:

# assumes elsewhere in the script: import torch.optim as optim,
# from torchvision.utils import save_image, plus the constants
# learning_rate and PATH_SAVE and a to_img() helper for visualization
if __name__ == '__main__':
    model = net()
    if torch.cuda.is_available():
        model.cuda()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    loss_func = nn.MSELoss()

    for epoch in range(10):
        print('epoch {}'.format(epoch + 1))
        for batch_x, batch_y in train_loader:
            inputs, label = Variable(batch_x).cuda(), Variable(batch_y).cuda()
            output = model(inputs)
            # note: MSELoss already averages over all elements, so the extra
            # division by the batch size only rescales the loss
            loss = loss_func(output, label) / label.shape[0]
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print('epoch: {}, Loss: {:.4f}'.format(epoch + 1, loss.data.cpu().numpy()))

        if (epoch + 1) % 5 == 0:  # every 5 epochs, save the decoded frame and the ground truth
            pic = to_img(output.cpu().data)
            img = to_img(label.cpu().data)
            if not os.path.exists('./conv_autoencoder'):
                os.mkdir('./conv_autoencoder')
            save_image(pic, './conv_autoencoder/decode_image_{}.png'.format(epoch + 1))
            save_image(img, './conv_autoencoder/raw_image_{}.png'.format(epoch + 1))

    torch.save(model.state_dict(), PATH_SAVE)
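After training, the saved weights can be used to predict the next frame for a 16-frame sequence. A minimal sketch (reusing PATH_SAVE and train_data from above; map_location handles CPU-only machines):

model = net()
model.load_state_dict(torch.load(PATH_SAVE, map_location='cpu'))
model.eval()
with torch.no_grad():
    frames, _ = train_data[0]                       # (16, 3, 128, 128) numpy array
    frames = torch.from_numpy(frames).unsqueeze(0)  # add a batch dimension
    pred = model(frames)                            # (1, 3, 128, 128)
    save_image(pred, './predicted_frame.png')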


Source: blog.csdn.net/PMPWDF/article/details/101224827