How to configure dataset pipelines with Tensorflow make_csv_dataset for Keras Model
Problem description
I have a structured dataset (CSV feature files) of around 200 GB. I'm using make_csv_dataset to build the input pipelines. Here is my code:
import tensorflow as tf
from tensorflow.keras import layers

def pack_features_vector(features, labels):
    """Pack the features into a single array."""
    features = tf.stack(list(features.values()), axis=1)
    return features, labels

def main():
    defaults = [float()] * len(selected_columns)
    data_set = tf.data.experimental.make_csv_dataset(
        file_pattern="./../path-to-dataset/Train_DS/*/*.csv",
        column_names=all_columns,          # all_columns = ["col1", "col2", ...]
        select_columns=selected_columns,   # selected_columns = a subset of all_columns
        column_defaults=defaults,
        label_name="Target",
        batch_size=1000,
        num_epochs=20,
        num_parallel_reads=50,
        # shuffle_buffer_size=10000,
        ignore_errors=True)
    data_set = data_set.map(pack_features_vector)

    N_VALIDATION = int(1e3)
    N_TRAIN = int(1e4)
    BUFFER_SIZE = int(1e4)
    BATCH_SIZE = 1000
    STEPS_PER_EPOCH = N_TRAIN // BATCH_SIZE

    validate_ds = data_set.take(N_VALIDATION).cache().repeat()
    train_ds = data_set.skip(N_VALIDATION).take(N_TRAIN).cache().repeat()
    # validate_ds = validate_ds.batch(BATCH_SIZE)
    # train_ds = train_ds.batch(BATCH_SIZE)

    model = tf.keras.Sequential([
        layers.Flatten(),
        layers.Dense(256, activation='elu'),
        layers.Dense(256, activation='elu'),
        layers.Dense(128, activation='elu'),
        layers.Dense(64, activation='elu'),
        layers.Dense(32, activation='elu'),
        layers.Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer='adam',
                  loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                  metrics=['accuracy'])

    model.fit(train_ds,
              validation_data=validate_ds,
              validation_steps=1,
              steps_per_epoch=1,
              epochs=20,
              verbose=1)

if __name__ == "__main__":
    main()
    print('Training completed!')
Now, when I execute this code, it completes within a few minutes (I think it is not going through the whole training data) with the following warning:

W tensorflow/core/kernels/data/cache_dataset_ops.cc:798] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset will be discarded. This can happen if you have an input pipeline similar to dataset.cache().take(k).repeat(). You should use dataset.take(k).cache().repeat() instead.

As per this warning, and as training completes in a few minutes, the input pipeline is not configured correctly. Can anyone please guide me how to correct this problem? The GPU of my system is an NVIDIA Quadro RTX 6000 (compute capability 7.5). A solution based on some other function like experimental.CsvDataset would work as well.
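Spelled out as code, the two orderings the warning is contrasting look like this (a minimal sketch only; dataset and k are the placeholders from the warning message, not variables from my snippet):

# Pattern the warning flags: cache() wraps the full dataset, but take(k) stops
# after k elements, so the cache is only partially filled and gets discarded.
bad_ds = dataset.cache().take(k).repeat()

# Recommended ordering: take(k) comes first, so cache() holds exactly the k
# elements that repeat() replays on every epoch.
good_ds = dataset.take(k).cache().repeat()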
Edit

That warning went away by changing the code to avoid any cache:

validate_ds = data_set.take(N_VALIDATION).repeat()
train_ds = data_set.skip(N_VALIDATION).take(N_TRAIN).repeat()

But now the problem is that I'm getting zero accuracy, even on the training data, which I think is a problem with the input pipelines. Here is the output.

Edit2

After some effort, I managed to resolve the known issues by using a slightly lower-level but similar API, CsvDataset. But now I'm getting accuracy = 1.00, which I think is not OK: at the first epoch it is 0.95, and then for the next 19 epochs it is 1.00. Here is my final code:
import pathlib
import tensorflow as tf
from tensorflow.keras import layers

def preprocess(*fields):
    features = tf.stack(fields[:-1])
    # convert Target column values to int to make it work for binary classification
    labels = tf.stack([int(x) for x in fields[-1:]])
    return features, labels  # x, y

def main():
    # selected_columns = ["col1", "col2", ...]
    selected_indices = []
    for selected_column in selected_columns:
        index = all_columns.index(selected_column)
        selected_indices.append(index)
    print("All_columns length" + str(len(all_columns)))
    print("selected_columns length" + str(len(selected_columns)))
    print("selected_indices length" + str(len(selected_indices)))
    print(selected_indices)

    defaults = [float()] * (len(selected_columns))
    # defaults.append(int())
    print("defaults" + str(defaults))
    print("defaults length" + str(len(defaults)))
    FEATURES = len(selected_columns) - 1

    training_csvs = sorted(str(p) for p in pathlib.Path('.').glob("path-to-data/Train_DS/*/*.csv"))
    testing_csvs = sorted(str(p) for p in pathlib.Path('.').glob("path-to-data/Test_DS/*/*.csv"))
    training_csvs
    testing_csvs

    training_dataset = tf.data.experimental.CsvDataset(
        training_csvs,
        record_defaults=defaults,
        compression_type=None,
        buffer_size=None,
        header=True,
        field_delim=',',
        # use_quote_delim=True,
        # na_value="",
        select_cols=selected_indices
    )
    print(type(training_dataset))
    for features in training_dataset.take(1):
        print("Training samples before mapping")
        print(features)

    validate_ds = training_dataset.map(preprocess).take(10).batch(100).repeat()
    train_ds = training_dataset.map(preprocess).skip(10).take(90).batch(100).repeat()
    validate_ds
    train_ds
    for features, labels in train_ds.take(1):
        print("Training samples")
        print(features)
        print(labels)

    testing_dataset = tf.data.experimental.CsvDataset(
        testing_csvs,
        record_defaults=defaults,
        compression_type=None,
        buffer_size=None,
        header=True,
        field_delim=',',
        use_quote_delim=True,
        na_value="",
        select_cols=selected_indices
    )
    print(type(testing_dataset))
    test_ds = testing_dataset.map(preprocess).batch(100).repeat()
    test_ds
    for features, labels in test_ds.take(1):
        print("Testing samples")
        print(features)
        print(labels)

    model = tf.keras.Sequential([
        layers.Dense(256, activation='elu'),
        layers.Dense(128, activation='elu'),
        layers.Dense(64, activation='elu'),
        layers.Dense(1, activation='sigmoid')
    ])

    history = model.compile(optimizer='adam',
                            loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
                            metrics=['accuracy'])

    model.fit(train_ds,
              validation_data=validate_ds,
              validation_steps=20,
              steps_per_epoch=20,
              epochs=20,
              verbose=1)

    loss, accuracy = model.evaluate(test_ds)
    print("Test Accuracy", accuracy)

if __name__ == "__main__":
    main()
    print('Training completed!')
I tried to feed just a few useless features to the model, but it still gives accuracy = 1.00, i.e. 100%. What is going wrong now? Overfitting, etc.?

Recommended answer

In the snippets, you wrote

model.fit(train_ds,
          validation_data=validate_ds,
          validation_steps=1,
          steps_per_epoch=1,
          epochs=20,
          verbose=1)
Is the steps_per_epoch=1 a typo? If not, that would mean you only use one batch per training epoch, which explains the fast training and the low accuracy. validation_steps=1 is also an issue.
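As a rough illustration of that point, steps_per_epoch and validation_steps would normally be derived from the split sizes and the batch size, so that every epoch walks through the whole training split rather than a single batch. This is only a sketch built on the N_TRAIN, N_VALIDATION and BATCH_SIZE constants from the question's first snippet, and it assumes train_ds and validate_ds yield batches of BATCH_SIZE samples:

N_TRAIN = int(1e4)       # training samples, as in the question's constants
N_VALIDATION = int(1e3)  # validation samples
BATCH_SIZE = 1000

# One epoch should cover all training batches, not just one of them.
steps_per_epoch = N_TRAIN // BATCH_SIZE                 # 10 steps per epoch
validation_steps = max(1, N_VALIDATION // BATCH_SIZE)   # at least one full validation pass

model.fit(train_ds,
          validation_data=validate_ds,
          validation_steps=validation_steps,
          steps_per_epoch=steps_per_epoch,
          epochs=20,
          verbose=1)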