louiscklaw
2025-01-31 22:10:02 +08:00
parent 97df42e0d5
commit 2627562070
2852 changed files with 748727 additions and 0 deletions


@@ -0,0 +1,665 @@
"""Tests for Distributed TensorFlow Dataloader."""
import collections
import json
import os
import shutil
import sys
import unittest
import cpuinfo
import numpy as np
import tensorflow as tf
from neural_compressor import data
from neural_compressor.adaptor.tf_utils.util import version1_lt_version2
from neural_compressor.data import DATALOADERS, TRANSFORMS, DataLoader, Datasets
from neural_compressor.utils import logger
from neural_compressor.utils.create_obj_from_config import create_dataloader, create_dataset
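
# Note: the distributed=True checks below are expected to fail fast (TypeError when the
# tf.data pipeline is already batched, EnvironmentError otherwise), presumably because this
# suite runs as a single process without an MPI/Horovod launch. For the non-distributed
# cases, the final-iteration assertions accept either a full batch or the remainder
# (dis_dataset_len % batch_size).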
@unittest.skipIf(version1_lt_version2(tf.version.VERSION, "2.10.0"), "Only test TF 2.10.0 or above")
class TestDistributedTFDataDataloader(unittest.TestCase):
    def setUp(self):
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
        tf.compat.v1.enable_eager_execution()
        self.dataset = tf.data.Dataset.from_tensors((tf.ones([3, 224, 224]), tf.ones([1000]))).repeat(600)
        self.count = 0
        logger.info(f"CPU: {cpuinfo.get_cpu_info()['brand_raw']}")
        logger.info(f"Test: {sys.modules[__name__].__file__}-{self.__class__.__name__}-{self._testMethodName}")

    def tearDown(self):
        logger.info(f"{self._testMethodName} done.\n")

    def check_tf_dataset_with_batch_raise(self, batch_size, last_batch, distributed):
        dataset_with_batch = (
            tf.data.Dataset.from_tensors((tf.ones([3, 224, 224]), tf.ones([1000]))).repeat(600).batch(2)
        )
        dataloader = DATALOADERS["tensorflow"](
            dataset_with_batch, batch_size=batch_size, last_batch=last_batch, distributed=distributed
        )
        for batch in dataloader:
            x, y = batch
            if self.count < len(dataloader) - 1:
                self.assertEqual(len(x), batch_size)
            else:
                self.assertTrue(len(x) == dataloader.dis_dataset_len % batch_size or len(x) == batch_size)
            self.assertEqual(x[0].shape, (3, 224, 224))
            self.assertEqual(x[-1].shape, (3, 224, 224))
            self.assertIsInstance(x, np.ndarray)
            self.count += 1

    def check_distributed_raise(self, batch_size, last_batch, distributed):
        dataloader = DATALOADERS["tensorflow"](
            self.dataset, batch_size=batch_size, last_batch=last_batch, distributed=distributed
        )
        for batch in dataloader:
            x, y = batch
            if self.count < len(dataloader) - 1:
                self.assertEqual(len(x), batch_size)
            else:
                self.assertTrue(len(x) == dataloader.dis_dataset_len % batch_size or len(x) == batch_size)
            self.assertEqual(x[0].shape, (3, 224, 224))
            self.assertEqual(x[-1].shape, (3, 224, 224))
            self.assertIsInstance(x, np.ndarray)
            self.count += 1

    def test_dis_tf_data_dataloader_1(self):
        self.assertRaises(TypeError, self.check_tf_dataset_with_batch_raise, 32, "rollover", True)

    def test_dis_tf_data_dataloader_2(self):
        self.assertRaises(TypeError, self.check_tf_dataset_with_batch_raise, 32, "no_rollover", True)

    def test_dis_tf_data_dataloader_3(self):
        self.assertRaises(TypeError, self.check_tf_dataset_with_batch_raise, 1, "rollover", True)

    def test_dis_tf_data_dataloader_4(self):
        self.assertRaises(TypeError, self.check_tf_dataset_with_batch_raise, 1, "no_rollover", True)

    def test_dis_tf_data_dataloader_5(self):
        self.assertRaises(EnvironmentError, self.check_distributed_raise, 32, "rollover", True)

    def test_dis_tf_data_dataloader_6(self):
        self.assertRaises(EnvironmentError, self.check_distributed_raise, 32, "no_rollover", True)

    def test_dis_tf_data_dataloader_7(self):
        self.assertRaises(EnvironmentError, self.check_distributed_raise, 1, "rollover", True)

    def test_dis_tf_data_dataloader_8(self):
        self.assertRaises(EnvironmentError, self.check_distributed_raise, 1, "no_rollover", True)

    def test_dis_tf_data_dataloader_9(self):
        batch_size = 32
        dataloader = DATALOADERS["tensorflow"](self.dataset, batch_size=batch_size, last_batch="rollover")
        for batch in dataloader:
            x, y = batch
            if self.count < len(dataloader) - 1:
                self.assertEqual(len(x), batch_size)
            else:
                self.assertTrue(len(x) == dataloader.dis_dataset_len % batch_size or len(x) == batch_size)
            self.assertEqual(x[0].shape, (3, 224, 224))
            self.assertEqual(x[-1].shape, (3, 224, 224))
            self.assertIsInstance(x, np.ndarray)
            self.count += 1

    def test_dis_tf_data_dataloader_10(self):
        batch_size = 32
        dataloader = DATALOADERS["tensorflow"](self.dataset, batch_size=batch_size, last_batch="no_rollover")
        for batch in dataloader:
            x, y = batch
            if self.count < len(dataloader) - 1:
                self.assertEqual(len(x), batch_size)
            else:
                self.assertTrue(len(x) == dataloader.dis_dataset_len % batch_size or len(x) == batch_size)
            self.assertEqual(x[0].shape, (3, 224, 224))
            self.assertEqual(x[-1].shape, (3, 224, 224))
            self.assertIsInstance(x, np.ndarray)
            self.count += 1

    def test_dis_tf_data_dataloader_11(self):
        batch_size = 1
        dataloader = DATALOADERS["tensorflow"](self.dataset, batch_size=batch_size, last_batch="rollover")
        for batch in dataloader:
            x, y = batch
            if self.count < len(dataloader) - 1:
                self.assertEqual(len(x), batch_size)
            else:
                self.assertTrue(len(x) == dataloader.dis_dataset_len % batch_size or len(x) == batch_size)
            self.assertEqual(x[0].shape, (3, 224, 224))
            self.assertEqual(x[-1].shape, (3, 224, 224))
            self.assertIsInstance(x, np.ndarray)
            self.count += 1

    def test_dis_tf_data_dataloader_12(self):
        batch_size = 1
        dataloader = DATALOADERS["tensorflow"](self.dataset, batch_size=batch_size, last_batch="no_rollover")
        for batch in dataloader:
            x, y = batch
            if self.count < len(dataloader) - 1:
                self.assertEqual(len(x), batch_size)
            else:
                self.assertTrue(len(x) == dataloader.dis_dataset_len % batch_size or len(x) == batch_size)
            self.assertEqual(x[0].shape, (3, 224, 224))
            self.assertEqual(x[-1].shape, (3, 224, 224))
            self.assertIsInstance(x, np.ndarray)
            self.count += 1

    def test_dis_tf_data_dataloader_13(self):
        batch_size = 600
        dataloader = DATALOADERS["tensorflow"](self.dataset, batch_size=batch_size, last_batch="rollover")
        for batch in dataloader:
            x, y = batch
            if self.count < len(dataloader) - 1:
                self.assertEqual(len(x), batch_size)
            else:
                self.assertTrue(len(x) == dataloader.dis_dataset_len % batch_size or len(x) == batch_size)
            self.assertEqual(x[0].shape, (3, 224, 224))
            self.assertEqual(x[-1].shape, (3, 224, 224))
            self.assertIsInstance(x, np.ndarray)
            self.count += 1

    def test_dis_tf_data_dataloader_14(self):
        batch_size = 600
        dataloader = DATALOADERS["tensorflow"](self.dataset, batch_size=batch_size, last_batch="no_rollover")
        for batch in dataloader:
            x, y = batch
            if self.count < len(dataloader) - 1:
                self.assertEqual(len(x), batch_size)
            else:
                self.assertTrue(len(x) == dataloader.dis_dataset_len % batch_size or len(x) == batch_size)
            self.assertEqual(x[0].shape, (3, 224, 224))
            self.assertEqual(x[-1].shape, (3, 224, 224))
            self.assertIsInstance(x, np.ndarray)
            self.count += 1
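

# The sequential-sampler tests exercise create_dataloader() with the built-in MNIST dataset
# (downloaded to ./minist) and a Resize transform; batches are asserted to come back as
# 24x24 numpy arrays. As above, the distributed variants are expected to raise
# EnvironmentError in a plain single-process run.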
@unittest.skipIf(version1_lt_version2(tf.version.VERSION, "2.10.0"), "Only test TF 2.10.0 or above")
class TestDefaultDataLoaderSequentialSampler(unittest.TestCase):
    @classmethod
    def tearDownClass(cls):
        if os.path.exists("minist"):
            shutil.rmtree("minist")

    def setUp(self):
        self.count = 0
        logger.info(f"CPU: {cpuinfo.get_cpu_info()['brand_raw']}")
        logger.info(f"Test: {sys.modules[__name__].__file__}-{self.__class__.__name__}-{self._testMethodName}")

    def tearDown(self):
        logger.info(f"{self._testMethodName} done.\n")

    def check_get_len_raise(self, batch_size, last_batch, distributed):
        dataloader_args = {
            "batch_size": batch_size,
            "dataset": {"MNIST": {"root": "./minist", "train": False, "download": True}},
            "transform": {"Resize": {"size": 24}},
            "filter": None,
            "last_batch": last_batch,
            "distributed": distributed,
        }
        dataloader = create_dataloader("tensorflow", dataloader_args)
        len_dataloader = len(dataloader)

    def check_distributed_raise(self, batch_size, last_batch, distributed):
        dataloader_args = {
            "batch_size": batch_size,
            "dataset": {"MNIST": {"root": "./minist", "train": False, "download": True}},
            "transform": {"Resize": {"size": 24}},
            "filter": None,
            "last_batch": last_batch,
            "distributed": distributed,
        }
        dataloader = create_dataloader("tensorflow", dataloader_args)
        for batch in dataloader:
            x, y = batch
            if self.count < len(dataloader) - 1:
                self.assertEqual(len(x), batch_size)
            else:
                self.assertTrue(len(x) == dataloader.dis_dataset_len % batch_size or len(x) == batch_size)
            self.assertEqual(x[0].shape, (24, 24))
            self.assertEqual(x[-1].shape, (24, 24))
            self.assertIsInstance(x, np.ndarray)
            self.count += 1

    def test_sequential_sampler1(self):
        self.assertRaises(EnvironmentError, self.check_get_len_raise, 32, "rollover", True)

    def test_sequential_sampler2(self):
        self.assertRaises(EnvironmentError, self.check_get_len_raise, 32, "no_rollover", True)

    def test_sequential_sampler3(self):
        self.assertRaises(EnvironmentError, self.check_get_len_raise, 1, "rollover", True)

    def test_sequential_sampler4(self):
        self.assertRaises(EnvironmentError, self.check_get_len_raise, 1, "no_rollover", True)

    def test_sequential_sampler5(self):
        self.assertRaises(EnvironmentError, self.check_distributed_raise, 32, "rollover", True)

    def test_sequential_sampler6(self):
        self.assertRaises(EnvironmentError, self.check_distributed_raise, 32, "no_rollover", True)

    def test_sequential_sampler7(self):
        self.assertRaises(EnvironmentError, self.check_distributed_raise, 1, "rollover", True)

    def test_sequential_sampler8(self):
        self.assertRaises(EnvironmentError, self.check_distributed_raise, 1, "no_rollover", True)

    def test_sequential_sampler9(self):
        batch_size = 3332
        dataloader_args = {
            "batch_size": batch_size,
            "dataset": {"MNIST": {"root": "./minist", "train": False, "download": True}},
            "transform": {"Resize": {"size": 24}},
            "filter": None,
            "last_batch": "rollover",
        }
        dataloader = create_dataloader("tensorflow", dataloader_args)
        for batch in dataloader:
            x, y = batch
            if self.count < len(dataloader) - 1:
                self.assertEqual(len(x), batch_size)
            else:
                self.assertTrue(len(x) == dataloader.dis_dataset_len % batch_size or len(x) == batch_size)
            self.assertEqual(x[0].shape, (24, 24))
            self.assertEqual(x[-1].shape, (24, 24))
            self.assertIsInstance(x, np.ndarray)
            self.count += 1

    def test_sequential_sampler10(self):
        batch_size = 3332
        dataloader_args = {
            "batch_size": batch_size,
            "dataset": {"MNIST": {"root": "./minist", "train": False, "download": True}},
            "transform": {"Resize": {"size": 24}},
            "filter": None,
            "last_batch": "no_rollover",
        }
        dataloader = create_dataloader("tensorflow", dataloader_args)
        for batch in dataloader:
            x, y = batch
            if self.count < len(dataloader) - 1:
                self.assertEqual(len(x), batch_size)
            else:
                self.assertTrue(len(x) == dataloader.dis_dataset_len % batch_size or len(x) == batch_size)
            self.assertEqual(x[0].shape, (24, 24))
            self.assertEqual(x[-1].shape, (24, 24))
            self.assertIsInstance(x, np.ndarray)
            self.count += 1

    def test_sequential_sampler11(self):
        batch_size = 1
        dataloader_args = {
            "batch_size": batch_size,
            "dataset": {"MNIST": {"root": "./minist", "train": False, "download": True}},
            "transform": {"Resize": {"size": 24}},
            "filter": None,
            "last_batch": "rollover",
        }
        dataloader = create_dataloader("tensorflow", dataloader_args)
        for batch in dataloader:
            x, y = batch
            if self.count < len(dataloader) - 1:
                self.assertEqual(len(x), batch_size)
            else:
                self.assertTrue(len(x) == dataloader.dis_dataset_len % batch_size or len(x) == batch_size)
            self.assertEqual(x[0].shape, (24, 24))
            self.assertEqual(x[-1].shape, (24, 24))
            self.assertIsInstance(x, np.ndarray)
            self.count += 1

    def test_sequential_sampler12(self):
        batch_size = 1
        dataloader_args = {
            "batch_size": batch_size,
            "dataset": {"MNIST": {"root": "./minist", "train": False, "download": True}},
            "transform": {"Resize": {"size": 24}},
            "filter": None,
            "last_batch": "no_rollover",
        }
        dataloader = create_dataloader("tensorflow", dataloader_args)
        for batch in dataloader:
            x, y = batch
            if self.count < len(dataloader) - 1:
                self.assertEqual(len(x), batch_size)
            else:
                self.assertTrue(len(x) == dataloader.dis_dataset_len % batch_size or len(x) == batch_size)
            self.assertEqual(x[0].shape, (24, 24))
            self.assertEqual(x[-1].shape, (24, 24))
            self.assertIsInstance(x, np.ndarray)
            self.count += 1

    def test_sequential_sampler13(self):
        batch_size = 10000
        dataloader_args = {
            "batch_size": batch_size,
            "dataset": {"MNIST": {"root": "./minist", "train": False, "download": True}},
            "transform": {"Resize": {"size": 24}},
            "filter": None,
            "last_batch": "rollover",
        }
        dataloader = create_dataloader("tensorflow", dataloader_args)
        for batch in dataloader:
            x, y = batch
            if self.count < len(dataloader) - 1:
                self.assertEqual(len(x), batch_size)
            else:
                self.assertTrue(len(x) == dataloader.dis_dataset_len % batch_size or len(x) == batch_size)
            self.assertEqual(x[0].shape, (24, 24))
            self.assertEqual(x[-1].shape, (24, 24))
            self.assertIsInstance(x, np.ndarray)
            self.count += 1

    def test_sequential_sampler14(self):
        batch_size = 10000
        dataloader_args = {
            "batch_size": batch_size,
            "dataset": {"MNIST": {"root": "./minist", "train": False, "download": True}},
            "transform": {"Resize": {"size": 24}},
            "filter": None,
            "last_batch": "no_rollover",
        }
        dataloader = create_dataloader("tensorflow", dataloader_args)
        for batch in dataloader:
            x, y = batch
            if self.count < len(dataloader) - 1:
                self.assertEqual(len(x), batch_size)
            else:
                self.assertTrue(len(x) == dataloader.dis_dataset_len % batch_size or len(x) == batch_size)
            self.assertEqual(x[0].shape, (24, 24))
            self.assertEqual(x[-1].shape, (24, 24))
            self.assertIsInstance(x, np.ndarray)
            self.count += 1
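

# The iterable-sampler tests feed a generator-style dataset yielding np.array([1]) through
# np.array([250]). With self.rank = 0 and self.size = 1, the expected last element of batch k
# is k * batch_size * size - size + rank + 1, i.e. simply k * batch_size for a single worker;
# the else branch covers the (possibly shorter) final batch and then breaks out of the loop.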
@unittest.skipIf(version1_lt_version2(tf.version.VERSION, "2.10.0"), "Only test TF 2.10.0 or above")
class TestDefaultDataLoaderIterableSampler(unittest.TestCase):
    class iter_dataset(object):
        def __iter__(self):
            sample_size = 250
            for i in range(1, sample_size + 1):
                yield np.array([i])

    def setUp(self):
        self.rank = 0
        self.size = 1
        self.count = 1
        self.dataset = self.iter_dataset()
        logger.info(f"CPU: {cpuinfo.get_cpu_info()['brand_raw']}")
        logger.info(f"Test: {sys.modules[__name__].__file__}-{self.__class__.__name__}-{self._testMethodName}")

    def tearDown(self):
        logger.info(f"{self._testMethodName} done.\n")

    def check_get_len_raise(self, batch_size, last_batch, distributed):
        dataloader = DATALOADERS["tensorflow"](
            self.dataset, batch_size=batch_size, last_batch=last_batch, distributed=distributed
        )
        len_dataloader = len(dataloader)

    def check_distributed_raise(self, batch_size, last_batch, distributed):
        dataloader = DATALOADERS["tensorflow"](
            self.dataset, batch_size=batch_size, last_batch=last_batch, distributed=distributed
        )
        for batch in dataloader:
            if self.count < len(dataloader):
                self.assertEqual(len(batch), batch_size)
                self.assertEqual(self.count * batch_size * self.size - self.size + self.rank + 1, batch[-1][0])
            else:
                self.assertTrue(len(batch) == dataloader.dis_dataset_len % batch_size or len(batch) == batch_size)
                self.assertEqual(
                    ((self.count - 1) * batch_size + len(batch) - 1) * self.size + self.rank + 1, batch[-1][0]
                )
                break
            self.count += 1

    def test_iterable_sampler1(self):
        self.assertRaises(EnvironmentError, self.check_get_len_raise, 32, "rollover", True)

    def test_iterable_sampler2(self):
        self.assertRaises(EnvironmentError, self.check_get_len_raise, 32, "no_rollover", True)

    def test_iterable_sampler3(self):
        self.assertRaises(EnvironmentError, self.check_get_len_raise, 1, "rollover", True)

    def test_iterable_sampler4(self):
        self.assertRaises(EnvironmentError, self.check_get_len_raise, 1, "no_rollover", True)

    def test_iterable_sampler5(self):
        self.assertRaises(EnvironmentError, self.check_distributed_raise, 32, "rollover", True)

    def test_iterable_sampler6(self):
        self.assertRaises(EnvironmentError, self.check_distributed_raise, 32, "no_rollover", True)

    def test_iterable_sampler7(self):
        self.assertRaises(EnvironmentError, self.check_distributed_raise, 1, "rollover", True)

    def test_iterable_sampler8(self):
        self.assertRaises(EnvironmentError, self.check_distributed_raise, 1, "no_rollover", True)

    def test_iterable_sampler9(self):
        batch_size = 128
        last_batch = "rollover"
        dataloader = DATALOADERS["tensorflow"](self.dataset, batch_size=batch_size, last_batch=last_batch)
        for batch in dataloader:
            if self.count < len(dataloader):
                self.assertEqual(len(batch), batch_size)
                self.assertEqual(self.count * batch_size * self.size - self.size + self.rank + 1, batch[-1][0])
            else:
                self.assertTrue(len(batch) == dataloader.dis_dataset_len % batch_size or len(batch) == batch_size)
                self.assertEqual(
                    ((self.count - 1) * batch_size + len(batch) - 1) * self.size + self.rank + 1, batch[-1][0]
                )
                break
            self.count += 1

    def test_iterable_sampler10(self):
        batch_size = 128
        last_batch = "no_rollover"
        dataloader = DATALOADERS["tensorflow"](self.dataset, batch_size=batch_size, last_batch=last_batch)
        for batch in dataloader:
            if self.count < len(dataloader):
                self.assertEqual(len(batch), batch_size)
                self.assertEqual(self.count * batch_size * self.size - self.size + self.rank + 1, batch[-1][0])
            else:
                self.assertTrue(len(batch) == dataloader.dis_dataset_len % batch_size or len(batch) == batch_size)
                self.assertEqual(
                    ((self.count - 1) * batch_size + len(batch) - 1) * self.size + self.rank + 1, batch[-1][0]
                )
                break
            self.count += 1

    def test_iterable_sampler11(self):
        batch_size = 1
        last_batch = "rollover"
        dataloader = DATALOADERS["tensorflow"](self.dataset, batch_size=batch_size, last_batch=last_batch)
        for batch in dataloader:
            if self.count < len(dataloader):
                self.assertEqual(len(batch), batch_size)
                self.assertEqual(self.count * batch_size * self.size - self.size + self.rank + 1, batch[-1][0])
            else:
                self.assertTrue(len(batch) == dataloader.dis_dataset_len % batch_size or len(batch) == batch_size)
                self.assertEqual(
                    ((self.count - 1) * batch_size + len(batch) - 1) * self.size + self.rank + 1, batch[-1][0]
                )
                break
            self.count += 1

    def test_iterable_sampler12(self):
        batch_size = 1
        last_batch = "no_rollover"
        dataloader = DATALOADERS["tensorflow"](self.dataset, batch_size=batch_size, last_batch=last_batch)
        for batch in dataloader:
            if self.count < len(dataloader):
                self.assertEqual(len(batch), batch_size)
                self.assertEqual(self.count * batch_size * self.size - self.size + self.rank + 1, batch[-1][0])
            else:
                self.assertTrue(len(batch) == dataloader.dis_dataset_len % batch_size or len(batch) == batch_size)
                self.assertEqual(
                    ((self.count - 1) * batch_size + len(batch) - 1) * self.size + self.rank + 1, batch[-1][0]
                )
                break
            self.count += 1

    def test_iterable_sampler13(self):
        batch_size = 1000
        last_batch = "rollover"
        dataloader = DATALOADERS["tensorflow"](self.dataset, batch_size=batch_size, last_batch=last_batch)
        for batch in dataloader:
            if self.count < len(dataloader):
                self.assertEqual(len(batch), batch_size)
                self.assertEqual(self.count * batch_size * self.size - self.size + self.rank + 1, batch[-1][0])
            else:
                self.assertTrue(len(batch) == dataloader.dis_dataset_len % batch_size or len(batch) == batch_size)
                self.assertEqual(
                    ((self.count - 1) * batch_size + len(batch) - 1) * self.size + self.rank + 1, batch[-1][0]
                )
                break
            self.count += 1

    def test_iterable_sampler14(self):
        batch_size = 1000
        last_batch = "no_rollover"
        dataloader = DATALOADERS["tensorflow"](self.dataset, batch_size=batch_size, last_batch=last_batch)
        for batch in dataloader:
            if self.count < len(dataloader):
                self.assertEqual(len(batch), batch_size)
                self.assertEqual(self.count * batch_size * self.size - self.size + self.rank + 1, batch[-1][0])
            else:
                self.assertTrue(len(batch) == dataloader.dis_dataset_len % batch_size or len(batch) == batch_size)
                self.assertEqual(
                    ((self.count - 1) * batch_size + len(batch) - 1) * self.size + self.rank + 1, batch[-1][0]
                )
                break
            self.count += 1
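

# The BERT dataloader tests write a one-example SQuAD-style dev.json plus a single-record
# TFRecord file, then check that the TensorFlow "bert" dataset/dataloader yields the record
# path and batch size as the model input and one label entry per example. distributed=True
# is expected to raise NotImplementedError for this dataset.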
@unittest.skipIf(version1_lt_version2(tf.version.VERSION, "2.10.0"), "Only test TF 2.10.0 or above")
class TestTensorflowBertDataLoader(unittest.TestCase):
    label = [
        {
            "paragraphs0": [
                {
                    "context": "Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season.",
                    "qas": [
                        {
                            "answers": [
                                {"answer_start": 177, "text": "Denver Broncos"},
                                {"answer_start": 177, "text": "Denver Broncos"},
                                {"answer_start": 177, "text": "Denver Broncos"},
                            ],
                            "question": "Which NFL team represented the AFC at Super Bowl 50?",
                            "id": "56be4db0acb8001400a502ec",
                        }
                    ],
                }
            ]
        }
    ]
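    # The constants below appear to mimic a single BERT feature record: 101/102 are the
    # [CLS]/[SEP] token ids in the standard BERT WordPiece vocabulary, input_mask marks
    # real (non-padding) tokens, and segment_ids separates the question from the context.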
    unique_id = 1000000000
    input_ids = [
        101,
        2029,
        5088,
        2136,
        3421,
        1996,
        10511,
        2012,
        3565,
        4605,
        2753,
        1029,
        102,
        3565,
        4605,
        2753,
        1007,
        2005,
        1996,
        2325,
        2161,
        1012,
        1996,
        2137,
        2374,
        3034,
        1006,
    ]
    input_mask = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
    segment_ids = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
    fake_json = json.dumps({"data": label, "version": "1.1"})
    with open("dev.json", "w") as f:
        f.write(fake_json)

    @classmethod
    def tearDownClass(cls):
        os.remove("test.record")
        os.remove("dev.json")

    def setUp(self):
        logger.info(f"CPU: {cpuinfo.get_cpu_info()['brand_raw']}")
        logger.info(f"Test: {sys.modules[__name__].__file__}-{self.__class__.__name__}-{self._testMethodName}")

    def tearDown(self):
        logger.info(f"{self._testMethodName} done.\n")

    def check_not_implement(self, batch_size, distributed):
        with tf.io.TFRecordWriter("./test.record") as writer:
            features = collections.OrderedDict()
            features["unique_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(value=list([self.unique_id])))
            features["input_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(value=list(self.input_ids)))
            features["input_mask"] = tf.train.Feature(int64_list=tf.train.Int64List(value=list(self.input_mask)))
            features["segment_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(value=list(self.segment_ids)))
            tf_example = tf.train.Example(features=tf.train.Features(feature=features))
            writer.write(tf_example.SerializeToString())
        eval_dataset = create_dataset(
            "tensorflow", {"bert": {"root": "test.record", "label_file": "./dev.json"}}, None, None
        )
        dataloader = DATALOADERS["tensorflow"](dataset=eval_dataset, batch_size=batch_size, distributed=distributed)

    def test_tf_bert_dataloader_1(self):
        self.assertRaises(NotImplementedError, self.check_not_implement, 32, True)

    def test_tf_bert_dataloader_2(self):
        batch_size = 128
        with tf.io.TFRecordWriter("./test.record") as writer:
            features = collections.OrderedDict()
            features["unique_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(value=list([self.unique_id])))
            features["input_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(value=list(self.input_ids)))
            features["input_mask"] = tf.train.Feature(int64_list=tf.train.Int64List(value=list(self.input_mask)))
            features["segment_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(value=list(self.segment_ids)))
            tf_example = tf.train.Example(features=tf.train.Features(feature=features))
            writer.write(tf_example.SerializeToString())
        eval_dataset = create_dataset(
            "tensorflow", {"bert": {"root": "test.record", "label_file": "./dev.json"}}, None, None
        )
        dataloader = DATALOADERS["tensorflow"](dataset=eval_dataset, batch_size=batch_size)
        for inputs, labels in dataloader:
            self.assertEqual(inputs[0], "test.record")
            self.assertEqual(inputs[1], batch_size)
            self.assertEqual(len(labels), 1)

    def test_tf_bert_dataloader_3(self):
        batch_size = 1
        with tf.io.TFRecordWriter("./test.record") as writer:
            features = collections.OrderedDict()
            features["unique_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(value=list([self.unique_id])))
            features["input_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(value=list(self.input_ids)))
            features["input_mask"] = tf.train.Feature(int64_list=tf.train.Int64List(value=list(self.input_mask)))
            features["segment_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(value=list(self.segment_ids)))
            tf_example = tf.train.Example(features=tf.train.Features(feature=features))
            writer.write(tf_example.SerializeToString())
        eval_dataset = create_dataset(
            "tensorflow", {"bert": {"root": "test.record", "label_file": "./dev.json"}}, None, None
        )
        dataloader = DATALOADERS["tensorflow"](dataset=eval_dataset, batch_size=batch_size)
        for inputs, labels in dataloader:
            self.assertEqual(inputs[0], "test.record")
            self.assertEqual(inputs[1], batch_size)
            self.assertEqual(len(labels), 1)


if __name__ == "__main__":
    unittest.main()