文本分类-IMDB数据集

一、IMDB数据集

1.1、下载数据集

from __future__ import absolute_import, division, print_function
import tensorflow as tf
from tensorflow import keras

import tensorflow_datasets as tfds

import numpy as np


# Load the IMDB reviews dataset (the 'subwords8k' configuration).
imdb_splits, info = tfds.load(
    name='imdb_reviews/subwords8k',
    # Request the train and test splits together; they come back as a tuple.
    split=(tfds.Split.TRAIN, tfds.Split.TEST),
    # Yield (example, label) pairs instead of feature dictionaries.
    as_supervised=True,
    # Also return the dataset's `info` object alongside the data.
    with_info=True,
)
train_data, test_data = imdb_splits

1.2、探索数据集

1.2.1、encoder

# Show the Python type of each object returned by tfds.load.
# (The original author observed
# <class 'tensorflow.python.data.ops.dataset_ops._OptionsDataset'> for train_data.)
for loaded_obj in (train_data, test_data, info):
    print(type(loaded_obj))

# The 'text' feature carries the encoder used to turn strings into integer ids
# and back again.
encoder = info.features['text'].encoder
print(type(encoder))
print('Vocabulary size: {}'.format(encoder.vocab_size))

# Round-trip a sample string: encode it to ids, then decode the ids to text.
sample_word = 'hello tensorflow'
encoded_Arr = encoder.encode(sample_word)
print(encoded_Arr)  # author's sample output: [3618, 222, 943, 2327, 2934]

original_word = encoder.decode(encoded_Arr)
print(original_word)

# Prints True when decoding reproduces the input string exactly.
print(sample_word == original_word)

# Decode each id on its own to show which piece of text it stands for.
for token_id in encoded_Arr:
    print(token_id, '--->', encoder.decode([token_id]))

1.2.2、探索数据内容与格式

# Look at a single (review, label) pair from the training set.
for encoded_review, sentiment_label in train_data.take(1):
    # Each example is a sequence of integer ids representing one movie review.
    # Author's sample output for the first ten ids:
    #   tf.Tensor([ 249    4  277  309  560    6 6639 4574    2   12], shape=(10,), dtype=int64)
    print(encoded_review[:10])
    # Decode the ids back into the review text. Author's sample output begins:
    #   "As a lifelong fan of Dickens, I have invariably been disappointed by
    #   adaptations of his novels.<br /><br />Altho..."
    print(encoder.decode(encoded_review))
    # Label: 0 = negative, 1 = positive.
    print(sentiment_label)
发布了784 篇原创文章 · 获赞 90 · 访问量 44万+

猜你喜欢

转载自 https://blog.csdn.net/wuxintdrh/article/details/103509590
今日推荐