From 9e158f07636feb5acd024dd6f66ef5fffac72514 Mon Sep 17 00:00:00 2001 From: Z ZH Date: Sat, 14 Dec 2019 04:56:00 +0900 Subject: [PATCH 1/3] fix dtype issue --- matchzoo/dataloader/callbacks/padding.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/matchzoo/dataloader/callbacks/padding.py b/matchzoo/dataloader/callbacks/padding.py index b38ce46..f6df538 100755 --- a/matchzoo/dataloader/callbacks/padding.py +++ b/matchzoo/dataloader/callbacks/padding.py @@ -130,8 +130,8 @@ def on_batch_unpacked(self, x: dict, y: np.ndarray): """Pad `x['text_left']` and `x['text_right]`.""" batch_size = len(x['id_left']) - pad_length_left = max(x['length_left']) - pad_length_right = max(x['length_right']) + pad_length_left = int(max(x['length_left'])) + pad_length_right = int(max(x['length_right'])) if self._with_ngram: ngram_length_left = max([len(w) for k in x['ngram_left'] for w in k]) From 0e5c04e1e948aa9277abd5c85ff99d9950d8527f Mon Sep 17 00:00:00 2001 From: XinyuMa Date: Sat, 26 Sep 2020 17:39:01 +0800 Subject: [PATCH 2/3] Update turorials/ranking/bert.ipynb (#149) --- tutorials/ranking/bert.ipynb | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tutorials/ranking/bert.ipynb b/tutorials/ranking/bert.ipynb index 51a4b10..ecdaaa0 100644 --- a/tutorials/ranking/bert.ipynb +++ b/tutorials/ranking/bert.ipynb @@ -70,9 +70,13 @@ " mode='pair',\n", " num_dup=2,\n", " num_neg=1\n", + " resample=True,\n", + " sort=False,\n", + " batch_size=20,\n", ")\n", "testset = mz.dataloader.Dataset(\n", " data_pack=test_pack_processed\n", + " batch_size=20,\n", ")" ] }, @@ -85,15 +89,11 @@ "padding_callback = mz.models.Bert.get_default_padding_callback()\n", "trainloader = mz.dataloader.DataLoader(\n", " dataset=trainset,\n", - " batch_size=20,\n", " stage='train',\n", - " resample=True,\n", - " sort=False,\n", " callback=padding_callback\n", ")\n", "testloader = mz.dataloader.DataLoader(\n", " dataset=testset,\n", - " batch_size=20,\n", " stage='dev',\n", " callback=padding_callback\n", ")" From 9943969d657e47e869875a19699bdb0f59cd3cec Mon Sep 17 00:00:00 2001 From: mxy Date: Sat, 26 Sep 2020 06:15:59 +0800 Subject: [PATCH 3/3] Split workload when use multiprocessing to loading data. --- matchzoo/dataloader/dataset.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/matchzoo/dataloader/dataset.py b/matchzoo/dataloader/dataset.py index 6c71db7..926fe86 100755 --- a/matchzoo/dataloader/dataset.py +++ b/matchzoo/dataloader/dataset.py @@ -116,8 +116,20 @@ def __iter__(self): """Create a generator that iterate over the Batches.""" if self._resample or self._shuffle: self.on_epoch_end() - for i in range(len(self)): - yield self[i] + + worker_info = data.get_worker_info() + if worker_info is None: + # single-process + for i in range(len(self)): + yield self[i] + else: + # multi-process, split workload across all workers + per_work_load = int(math.ceil(len(self) / float(worker_info.num_workers))) + worker_id = worker_info.id + iter_start = worker_id * per_work_load + iter_end = min(iter_start + per_work_load, len(self)) + for i in range(iter_start, iter_end): + yield self[i] def on_epoch_end(self): """Reorganize the index array if needed."""