import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader, Subset, ConcatDataset
from torchvision import datasets, transforms
import copy
import random

# CIFAR-10 class names, indexed by their integer label (0-9).
items = ('plane', 'car', 'bird', 'cat',
         'deer', 'dog', 'frog', 'horse', 'ship', 'truck')


def CIFAR_dataset(split='train', pad=False, normalize=False, permutation=False,
                  download=True, verbose=True):
    """
    Load the CIFAR-10 training or test dataset.

    Parameters
    ----------
    split : str
        'test' loads the test split; any other value loads the training split.
    pad, permutation : bool
        Currently unused; kept for interface compatibility with the
        companion MNIST loader.
    normalize : bool
        If True, normalize each RGB channel with mean 0.5 and std 0.5
        (maps pixel values into approximately [-1, 1]).
    download : bool
        Download the data to ./data if it is not already present.
    verbose : bool
        Print a one-line summary of the loaded dataset.

    Returns
    -------
    torchvision.datasets.CIFAR10
    """
    dataset_class = datasets.CIFAR10
    transform = [transforms.ToTensor()]
    if normalize:
        # Per-channel normalization; 0.5/0.5 is an approximation rather than
        # the dataset's exact channel statistics.
        transform.append(transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)))
    dataset_transform = transforms.Compose(transform)

    # Load dataset:
    dataset = dataset_class(root='./data',
                            train=(split != 'test'),
                            download=download,
                            transform=dataset_transform)
    if verbose:
        print("CIFAR10 {} dataset consisting of {} samples.".format(split, len(dataset)))

    return dataset


def get_data_loader(dataset, batch_size, cuda=False, drop_last=False, shuffle=False):
    """
    Return a DataLoader for the provided dataset object.

    When `cuda` is True, host memory is pinned so host-to-device copies
    are faster.
    """
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last,
                      **({'num_workers': 0, 'pin_memory': True} if cuda else {}))
+# """ +# label = general_config['label'] +# cuda = general_config['cuda'] +# seed = general_config['seed'] + +# drop_last = data_config['drop_last'] +# shuffle = data_config['shuffle'] +# split_size = data_config['split_size'] +# batch_size_te = data_config['batch_size_te'] + +# # Set random seeds: +# np.random.seed(seed) +# torch.manual_seed(seed) +# random.seed(seed) + +# if verbose: +# print("\n\n " +' Loading test DataLoaders '.center(70, '*')) + +# test_datasets = [] +# print(te_dataset[0][1]) +# for item in items: +# single_digit_indices = [i for i in range(len(te_dataset)) if te_dataset[i][1] == item] +# test_datasets.append(Subset(te_dataset, single_digit_indices)) +# test_datasets.append(te_dataset) + +# test_loaders = [get_data_loader( +# test_dataset, +# batch_size=batch_size_te, +# cuda=cuda, +# drop_last=drop_last, +# shuffle=shuffle) for test_dataset in test_datasets] + +# if verbose: +# print(' Loading test DataLoaders successful '.center(70, '*')) + +# labels_te = [] +# for item in items: +# labels_te.append(item) +# labels_te.append(items) + +# return test_loaders, labels_te + + + +# class dataloaders_0(object): +# """ +# Corresponds to the case: +# All trained data are shuffled and trained in sequential splits. 
+# """ + +# def __init__(self, general_config, data_config, dataset_tr, dataset_te, verbose): + +# self.cuda = general_config['cuda'] +# self.seed = general_config['seed'] + +# self.shuffle = data_config['shuffle'] +# self.batch_size_tr = data_config['batch_size_tr'] +# self.batch_size_te = data_config['batch_size_te'] +# self.split_size = data_config['split_size'] +# if self.split_size > len(dataset_tr): +# raise ValueError("split_size exceeds the size of the initial training dataset.") +# self.normalize = data_config['normalize'] +# self.pad = data_config['pad'] +# self.permutation = data_config['permutation'] +# self.drop_last = data_config['drop_last'] + +# # Set random seeds: +# np.random.seed(self.seed) +# torch.manual_seed(self.seed) +# random.seed(self.seed) + +# self.verbose = verbose +# self.training_loaders, self.n_splits, self.labels_tr, self.size_subsets = self.training_loaders(dataset_tr) +# self.test_loaders, self.labels_te = test_loaders(dataset_te, general_config, data_config, verbose=verbose) +# if verbose: +# print("\nAn overview of the dataloaders for experiments 0:") +# print("Training dataloaders: (all classes: 1), ..., (all classes: {})".format(self.n_splits) + ", each split with {} data.".format(self.split_size)) +# print("Test dataloaders: (plane), (car), ..., (truck), (ALL), each with all the test data available,") +# print("(Approx. 
5000 for each label and 10000 for all labels).") +# print("The actual size of each tr subset:", self.size_subsets) + + +# def training_loaders(self, tr_dataset): + +# if self.verbose: +# print("\n\n " +' Loading Training DataLoader for Experiments 0 '.center(70, '*')) + +# split_size = self.split_size + +# tr_dataset_indices = list(range(len(tr_dataset))) +# if self.shuffle: +# random.shuffle(tr_dataset_indices) + +# subset_datasets = [] +# n_splits = len(tr_dataset) // split_size +# if len(tr_dataset) % split_size != 0: +# n_splits += 1 + +# for i in range(n_splits): +# start = i * split_size +# end = (i + 1) * split_size if i < n_splits - 1 else len(tr_dataset_indices) +# subsets_indices = tr_dataset_indices[start:end] +# subset_datasets.append(Subset(tr_dataset, subsets_indices)) + +# subset_loaders = [get_data_loader( +# subset_dataset, +# batch_size=self.batch_size_tr, +# cuda=self.cuda, +# drop_last=self.drop_last, +# shuffle=self.shuffle) for subset_dataset in subset_datasets] + +# if self.verbose: +# print(' Loading training DataLoaders successful '.center(70, '*')) + +# # labels exist among the tr-datasets: +# labels_tr = [list(items)] * 10 + +# # size of each tr dataset: +# size_subsets = [len(subset_dataset) for subset_dataset in subset_datasets] + +# return subset_loaders, n_splits, labels_tr, size_subsets + + + +# class dataloaders_1(object): +# """ +# Corresponds to the case: +# First train all data from the specified label (0 for instance) +# Then train all shuffled data from remaining labels in sequential splits. 
+# """ + +# def __init__(self, general_config, data_config, dataset_tr, dataset_te, verbose): + +# self.cuda = general_config['cuda'] +# self.seed = general_config['seed'] +# self.label = general_config['label'] + +# self.shuffle = data_config['shuffle'] +# self.batch_size_tr = data_config['batch_size_tr'] +# self.batch_size_te = data_config['batch_size_te'] +# self.split_size = data_config['split_size'] +# if self.split_size > len(dataset_tr): +# raise ValueError("split_size exceeds the size of the initial training dataset.") +# self.normalize = data_config['normalize'] +# self.pad = data_config['pad'] +# self.permutation = data_config['permutation'] +# self.drop_last = data_config['drop_last'] + +# # Set random seeds: +# np.random.seed(self.seed) +# torch.manual_seed(self.seed) +# random.seed(self.seed) + +# self.verbose = verbose +# self.training_loaders, self.n_splits, self.labels_tr, self.size_subsets = self.training_loaders(dataset_tr) +# self.test_loaders, self.labels_te = test_loaders(dataset_te, general_config, data_config, verbose=verbose) +# if verbose: +# print("\nAn overview of the dataloaders for experiments 1:") +# print("Training dataloaders:") +# print("\t({}) with all available data,".format(self.label)) +# print("\t(classes except {}: 1), ..., (classes except {}: {})".format(self.label, self.label, self.n_splits)) +# print("Test dataloaders: (plane), (car), ..., (truck), (ALL), each with all the test data available,") +# print("\t(Approx. 
5000 for each label and 10000 for all labels).") +# print("The actual size of each tr subset:", self.size_subsets) + +# def training_loaders(self, tr_dataset): +# if self.verbose: +# print("\n\n " +' Loading Training DataLoader for Experiments 1 '.center(70, '*')) + +# split_size = self.split_size + +# # Trial one: +# single_indices = [i for i in range(len(tr_dataset)) if tr_dataset[i][1] == self.label] +# # if self.shuffle: +# # random.shuffle(single_indices) +# single_dataset_tr = Subset(tr_dataset, single_indices) + +# # Trial two: +# remaining_indices = set(range(len(tr_dataset))) +# remaining_indices -= set(single_indices) +# remaining_indices = list(remaining_indices) +# if self.shuffle: +# random.shuffle(remaining_indices) + +# subset_datasets = [single_dataset_tr] +# n_splits = len(remaining_indices) // split_size +# if len(tr_dataset) % split_size != 0: +# n_splits += 1 +# for i in range(n_splits): +# start = i * split_size +# end = (i + 1) * split_size if i < n_splits - 1 else len(remaining_indices) +# subsets_indices = remaining_indices[start:end] +# subset_datasets.append(Subset(tr_dataset, subsets_indices)) + +# subset_loaders = [] +# for subset_dataset in subset_datasets: +# subset_loaders.append( +# get_data_loader( +# subset_dataset, +# batch_size=self.batch_size_tr, +# cuda=self.cuda, +# drop_last=self.drop_last, +# shuffle=self.shuffle, +# ) +# ) +# if self.verbose: +# print(' Loading training DataLoaders successful '.center(70, '*')) + +# # labels exist among the tr-datasets: +# labels_tr = [[self.label]] +# labels_tr_splits = list(items) +# labels_tr_splits.remove(self.label) +# labels_tr += [labels_tr_splits] * n_splits + +# # size of each tr dataset: +# size_subsets = [len(subset_dataset) for subset_dataset in subset_datasets] + +# return subset_loaders, n_splits, labels_tr, size_subsets + + + +# class dataloaders_2(object): +# """ +# Corresponds to the case: +# First train all data from the specified label (0 for instance) along with the same 
portion of other labels, +# Then train all remaining shuffled data from remaining labels in sequential splits. +# """ + +# def __init__(self, general_config, data_config, dataset_tr, dataset_te, verbose): + +# self.cuda = general_config['cuda'] +# self.seed = general_config['seed'] +# self.label = general_config['label'] + +# self.shuffle = data_config['shuffle'] +# self.batch_size_tr = data_config['batch_size_tr'] +# self.batch_size_te = data_config['batch_size_te'] +# self.split_size = data_config['split_size'] +# if self.split_size > len(dataset_tr): +# raise ValueError("split_size exceeds the size of the initial training dataset.") +# self.normalize = data_config['normalize'] +# self.pad = data_config['pad'] +# self.permutation = data_config['permutation'] +# self.drop_last = data_config['drop_last'] +# self.size_all_tr_each = data_config['size_all_tr_each'] + +# # Set random seeds: +# np.random.seed(self.seed) +# torch.manual_seed(self.seed) +# random.seed(self.seed) + +# self.verbose = verbose +# self.training_loaders, self.n_splits, self.labels_tr, self.size_subsets = self.training_loaders(dataset_tr) +# self.test_loaders, self.labels_te = test_loaders(dataset_te, general_config, data_config, verbose=verbose) +# if verbose: +# print("\nAn overview of the dataloaders for experiments 2:") +# print("Training dataloaders:") +# print("\t({}) with all available data plus {} samples from each other classes,".format(self.label, self.size_all_tr_each)) +# print("\t(all classes except {}: 1), ..., (all classes except {}: {})".format(self.label, self.label, self.n_splits)) +# print("Test dataloaders: (plane), (car), ..., (truck), (ALL), each with all the test data available,") +# print("\t(Approx. 
6000 for each label and 10000 for all labels).") +# print("The actual size of each tr dataset:", self.size_subsets) + +# def training_loaders(self, tr_dataset): +# if self.verbose: +# print("\n\n " +' Loading Training DataLoader for Experiments 2 '.center(70, '*')) + +# split_size = self.split_size + +# # Trial one: +# digit_subsets = {} # (digit, dataset.Subset) +# for item in items: +# digit_indices = [i for i in range(len(tr_dataset)) if tr_dataset[i][1] == item] +# if item == self.label: +# digit_subset_indices = digit_indices +# else: +# digit_subset_indices = digit_indices[:self.size_all_tr_each] +# digit_subsets[item] = Subset(tr_dataset, digit_subset_indices) +# all_prior_dataset_tr = ConcatDataset([digit_subsets[item] for item in items]) + +# # Trial two: +# remaining_indices = set(range(len(tr_dataset))) +# for item in items: +# remaining_indices -= set(digit_subsets[item].indices) +# remaining_indices = list(remaining_indices) + +# if self.shuffle: +# random.shuffle(remaining_indices) + +# subset_datasets = [all_prior_dataset_tr] +# n_splits = len(remaining_indices) // split_size +# if len(tr_dataset) % split_size != 0: +# n_splits += 1 +# for i in range(n_splits): +# start = i * split_size +# end = (i + 1) * split_size if i < n_splits - 1 else len(remaining_indices) +# subsets_indices = remaining_indices[start:end] +# subset_datasets.append(Subset(tr_dataset, subsets_indices)) + +# subset_loaders = [] +# for subset_dataset in subset_datasets: +# # print(n_splits) +# # print(split_size) +# # print(len(subset_dataset)) +# subset_loaders.append( +# get_data_loader( +# subset_dataset, +# batch_size=self.batch_size_tr, +# cuda=self.cuda, +# drop_last=self.drop_last, +# shuffle=self.shuffle, +# ) +# ) +# if self.verbose: +# print(' Loading training DataLoaders successful '.center(70, '*')) + +# # labels exist among the tr-datasets: +# labels_tr = [list(items)] +# labels_tr_splits = list(items) +# labels_tr_splits.remove(self.label) +# labels_tr += 
[labels_tr_splits] * n_splits + +# # size of each tr dataset: +# size_subsets = [len(subset_dataset) for subset_dataset in subset_datasets] + +# return subset_loaders, n_splits, labels_tr, size_subsets + + +# class dataloaders_3(object): +# """ +# Corresponds to the `relearning` case: +# Similar to experiments 2, but inlude about 50% specified-label data in the first loader only, +# and after some splits (say 4), include the rest 50% of specified-label data by spliting evenly to the rest splits. +# """ +# def __init__(self, general_config, data_config, dataset_tr, dataset_te, verbose): + +# self.cuda = general_config['cuda'] +# self.seed = general_config['seed'] +# self.label = general_config['label'] + +# self.shuffle = data_config['shuffle'] +# self.batch_size_tr = data_config['batch_size_tr'] +# self.batch_size_te = data_config['batch_size_te'] +# self.split_size = data_config['split_size'] +# if self.split_size > len(dataset_tr): +# raise ValueError("split_size exceeds the size of the initial training dataset.") +# self.normalize = data_config['normalize'] +# self.pad = data_config['pad'] +# self.permutation = data_config['permutation'] +# self.drop_last = data_config['drop_last'] +# self.size_all_tr_each = data_config['size_all_tr_each'] +# self.n_relearning = data_config['n_relearning'] # Number of splits for relearning + +# # Set random seeds: +# np.random.seed(self.seed) +# torch.manual_seed(self.seed) +# random.seed(self.seed) + +# self.verbose = verbose +# self.training_loaders, self.n_splits, self.labels_tr, self.size_subsets = self.training_loaders(dataset_tr) +# self.test_loaders, self.labels_te = test_loaders(dataset_te, general_config, data_config, verbose=verbose) +# if verbose: +# print("\nAn overview of the dataloaders for experiments 3:") +# print("Training dataloaders:") +# print("\t({}) with half available data plus {} samples from each other classes,".format(self.label, self.size_all_tr_each)) +# print("\t(classes except {}: 1), ..., (classes 
except {}: {})".format(self.label, self.label, self.n_splits - self.n_relearning)) +# print("\t(all classes: 1), ..., (all classes: {})".format(self.label, self.label, self.n_relearning)) +# print("Test dataloaders: (plane), (car), ..., (truck), (ALL), each with all the test data available,") +# print("\t(Approx. 5000 for each label and 10000 for all labels).") +# print("The actual size of each tr dataset:", self.size_subsets) + + +# def training_loaders(self, tr_dataset): +# if self.verbose: +# print("\n\n " +' Loading Training DataLoader for Experiments 3 '.center(70, '*')) + +# split_size = self.split_size + +# # First have the subset of all the ones with specified label: +# single_indices = [i for i in range(len(tr_dataset)) if tr_dataset[i][1] == self.label] +# # if self.shuffle: +# # random.shuffle(single_indices) +# single_dataset_tr = Subset(tr_dataset, single_indices) + +# single_former_size = len(single_indices) // 2 # nbr of samples with specified label included in the first subset +# single_indices_former = single_indices[:single_former_size] +# single_indices_latter = single_indices[single_former_size:] +# single_latter_size = len(single_indices_latter) // self.n_relearning # nbr of samples with specified label included in the last n_relearning subsets + +# # Trial 1: +# digit_subsets = {} # (digit, dataset.Subset) +# for item in items: +# digit_indices = [i for i in range(len(tr_dataset)) if tr_dataset[i][1] == item] +# if item == self.label: +# digit_subset_indices = single_indices_former +# else: +# digit_subset_indices = digit_indices[:self.size_all_tr_each] +# digit_subsets[item] = Subset(tr_dataset, digit_subset_indices) +# all_prior_dataset_tr = ConcatDataset([digit_subsets[item] for item in items]) + +# # Trial 2: +# remaining_indices = set(range(len(tr_dataset))) +# for item in items: +# remaining_indices -= set(digit_subsets[item].indices) +# remaining_indices -= set(single_indices_latter) +# remaining_indices = list(remaining_indices) + +# 
def test_loaders(te_dataset, general_config, data_config, verbose=True):
    """
    Generate the test DataLoaders shared by all experiment cases.

    Builds one DataLoader per class label (0-9), each restricted to that
    label's test samples, plus one final DataLoader over the whole test set.

    Returns
    -------
    test_loaders : list of DataLoader
        Ten single-label loaders followed by one all-label loader.
    labels_te : list
        labels_te[i] is the label (int) or tuple of labels covered by
        test_loaders[i].
    """
    cuda = general_config['cuda']
    seed = general_config['seed']

    drop_last = data_config['drop_last']
    shuffle = data_config['shuffle']
    batch_size_te = data_config['batch_size_te']

    # Set random seeds so any shuffling below is reproducible:
    np.random.seed(seed)
    torch.manual_seed(seed)
    random.seed(seed)

    if verbose:
        print("\n\n " + ' Loading test DataLoaders '.center(70, '*'))

    test_datasets = []
    for d in range(10):
        # torchvision CIFAR10 targets are plain ints, so direct comparison works.
        single_digit_indices = [i for i in range(len(te_dataset)) if te_dataset[i][1] == d]
        test_datasets.append(Subset(te_dataset, single_digit_indices))
    test_datasets.append(te_dataset)

    test_loaders = [get_data_loader(
        test_dataset,
        batch_size=batch_size_te,
        cuda=cuda,
        drop_last=drop_last,
        shuffle=shuffle) for test_dataset in test_datasets]

    if verbose:
        print(' Loading test DataLoaders successful '.center(70, '*'))

    labels_te = []
    for i in range(10):
        labels_te.append(i)
    labels_te.append(tuple(range(10)))

    return test_loaders, labels_te



class dataloaders_0(object):
    """
    Corresponds to the case:
    All training data are shuffled and trained in sequential splits.
    """

    def __init__(self, general_config, data_config, dataset_tr, dataset_te, verbose):

        self.cuda = general_config['cuda']
        self.seed = general_config['seed']

        self.shuffle = data_config['shuffle']
        self.batch_size_tr = data_config['batch_size_tr']
        self.batch_size_te = data_config['batch_size_te']
        self.split_size = data_config['split_size']
        if self.split_size > len(dataset_tr):
            raise ValueError("split_size exceeds the size of the initial training dataset.")
        self.normalize = data_config['normalize']
        self.pad = data_config['pad']
        self.permutation = data_config['permutation']
        self.drop_last = data_config['drop_last']

        # Set random seeds:
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        random.seed(self.seed)

        self.verbose = verbose
        # NOTE: this assignment replaces the bound method with its returned
        # list of loaders (original design, kept for interface compatibility).
        self.training_loaders, self.n_splits, self.labels_tr, self.size_subsets = self.training_loaders(dataset_tr)
        self.test_loaders, self.labels_te = test_loaders(dataset_te, general_config, data_config, verbose=verbose)
        if verbose:
            print("\nAn overview of the dataloaders for experiments 0:")
            print("Training dataloaders: (0-9: 1), ..., (0-9: {})".format(self.n_splits) + ", each split with {} data.".format(self.split_size))
            print("Test dataloaders: (0), (1), ..., (9), (0-9), each with all the test data available,")
            # CIFAR-10 test set: 1000 images per class, 10000 in total.
            print("(Approx. 1000 for each label and 10000 for all labels).")
            print("The actual size of each tr subset:", self.size_subsets)


    def training_loaders(self, tr_dataset):
        """
        Split the (optionally shuffled) training set into sequential chunks
        of `split_size` samples and wrap each chunk in a DataLoader.

        Returns (loaders, n_splits, labels per split, split sizes).
        """
        if self.verbose:
            print("\n\n " + ' Loading Training DataLoader for Experiments 0 '.center(70, '*'))

        split_size = self.split_size

        tr_dataset_indices = list(range(len(tr_dataset)))
        if self.shuffle:
            random.shuffle(tr_dataset_indices)

        subset_datasets = []
        n_splits = len(tr_dataset) // split_size
        if len(tr_dataset) % split_size != 0:
            n_splits += 1  # a last, smaller split takes the remainder

        for i in range(n_splits):
            start = i * split_size
            end = (i + 1) * split_size if i < n_splits - 1 else len(tr_dataset_indices)
            subsets_indices = tr_dataset_indices[start:end]
            subset_datasets.append(Subset(tr_dataset, subsets_indices))

        subset_loaders = [get_data_loader(
            subset_dataset,
            batch_size=self.batch_size_tr,
            cuda=self.cuda,
            drop_last=self.drop_last,
            shuffle=self.shuffle) for subset_dataset in subset_datasets]

        if self.verbose:
            print(' Loading training DataLoaders successful '.center(70, '*'))

        # Every split can contain every label.
        # Fixed: was hard-coded to 10 entries regardless of n_splits.
        labels_tr = [list(range(10))] * n_splits

        # size of each tr dataset:
        size_subsets = [len(subset_dataset) for subset_dataset in subset_datasets]

        return subset_loaders, n_splits, labels_tr, size_subsets



class dataloaders_1(object):
    """
    Corresponds to the case:
    First train all data from the specified label (0 for instance),
    then train all shuffled data from remaining labels in sequential splits.
    """

    def __init__(self, general_config, data_config, dataset_tr, dataset_te, verbose):

        self.cuda = general_config['cuda']
        self.seed = general_config['seed']
        self.label = general_config['label']

        self.shuffle = data_config['shuffle']
        self.batch_size_tr = data_config['batch_size_tr']
        self.batch_size_te = data_config['batch_size_te']
        self.split_size = data_config['split_size']
        if self.split_size > len(dataset_tr):
            raise ValueError("split_size exceeds the size of the initial training dataset.")
        self.normalize = data_config['normalize']
        self.pad = data_config['pad']
        self.permutation = data_config['permutation']
        self.drop_last = data_config['drop_last']

        # Set random seeds:
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        random.seed(self.seed)

        self.verbose = verbose
        self.training_loaders, self.n_splits, self.labels_tr, self.size_subsets = self.training_loaders(dataset_tr)
        self.test_loaders, self.labels_te = test_loaders(dataset_te, general_config, data_config, verbose=verbose)
        if verbose:
            print("\nAn overview of the dataloaders for experiments 1:")
            print("Training dataloaders:")
            print("\t({}) with all available data,".format(self.label))
            print("\t(0-9 except {}: 1), ..., (0-9 except {}: {})".format(self.label, self.label, self.n_splits))
            print("Test dataloaders: (0), (1), ..., (9), (0-9), each with all the test data available,")
            # CIFAR-10 test set: 1000 images per class, 10000 in total.
            print("\t(Approx. 1000 for each label and 10000 for all labels).")
            print("The actual size of each tr subset:", self.size_subsets)

    def training_loaders(self, tr_dataset):
        """
        Build loader 0 with every sample of self.label, then sequential
        loaders over the shuffled remainder of the training set.

        Returns (loaders, n_splits, labels per split, split sizes); n_splits
        counts only the remainder splits, not loader 0.
        """
        if self.verbose:
            print("\n\n " + ' Loading Training DataLoader for Experiments 1 '.center(70, '*'))

        split_size = self.split_size

        # Trial one: all samples carrying the specified label.
        single_indices = [i for i in range(len(tr_dataset)) if tr_dataset[i][1] == self.label]
        single_dataset_tr = Subset(tr_dataset, single_indices)

        # Trial two: everything else, optionally shuffled, in sequential splits.
        remaining_indices = set(range(len(tr_dataset)))
        remaining_indices -= set(single_indices)
        remaining_indices = list(remaining_indices)
        if self.shuffle:
            random.shuffle(remaining_indices)

        subset_datasets = [single_dataset_tr]
        n_splits = len(remaining_indices) // split_size
        # Fixed: remainder test must use the remaining data, not the full
        # dataset, or n_splits can be off by one.
        if len(remaining_indices) % split_size != 0:
            n_splits += 1
        for i in range(n_splits):
            start = i * split_size
            end = (i + 1) * split_size if i < n_splits - 1 else len(remaining_indices)
            subsets_indices = remaining_indices[start:end]
            subset_datasets.append(Subset(tr_dataset, subsets_indices))

        subset_loaders = []
        for subset_dataset in subset_datasets:
            subset_loaders.append(
                get_data_loader(
                    subset_dataset,
                    batch_size=self.batch_size_tr,
                    cuda=self.cuda,
                    drop_last=self.drop_last,
                    shuffle=self.shuffle,
                )
            )
        if self.verbose:
            print(' Loading training DataLoaders successful '.center(70, '*'))

        # Labels present in each loader: only self.label in loader 0,
        # everything else in the remainder splits.
        labels_tr = [[self.label]]
        labels_tr_splits = list(range(10))
        labels_tr_splits.remove(self.label)
        labels_tr += [labels_tr_splits] * n_splits

        # size of each tr dataset:
        size_subsets = [len(subset_dataset) for subset_dataset in subset_datasets]

        return subset_loaders, n_splits, labels_tr, size_subsets



class dataloaders_2(object):
    """
    Corresponds to the case:
    First train all data from the specified label (0 for instance) along with
    the same portion of other labels, then train all remaining shuffled data
    from remaining labels in sequential splits.
    """

    def __init__(self, general_config, data_config, dataset_tr, dataset_te, verbose):

        self.cuda = general_config['cuda']
        self.seed = general_config['seed']
        self.label = general_config['label']

        self.shuffle = data_config['shuffle']
        self.batch_size_tr = data_config['batch_size_tr']
        self.batch_size_te = data_config['batch_size_te']
        self.split_size = data_config['split_size']
        if self.split_size > len(dataset_tr):
            raise ValueError("split_size exceeds the size of the initial training dataset.")
        self.normalize = data_config['normalize']
        self.pad = data_config['pad']
        self.permutation = data_config['permutation']
        self.drop_last = data_config['drop_last']
        # Number of samples of each *other* label mixed into the first loader.
        self.size_all_tr_each = data_config['size_all_tr_each']

        # Set random seeds:
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        random.seed(self.seed)

        self.verbose = verbose
        self.training_loaders, self.n_splits, self.labels_tr, self.size_subsets = self.training_loaders(dataset_tr)
        self.test_loaders, self.labels_te = test_loaders(dataset_te, general_config, data_config, verbose=verbose)
        if verbose:
            print("\nAn overview of the dataloaders for experiments 2:")
            print("Training dataloaders:")
            print("\t({}) with all available data plus {} samples from each other labels,".format(self.label, self.size_all_tr_each))
            print("\t(0-9 except {}: 1), ..., (0-9 except {}: {})".format(self.label, self.label, self.n_splits))
            print("Test dataloaders: (0), (1), ..., (9), (0-9), each with all the test data available,")
            # CIFAR-10 test set: 1000 images per class, 10000 in total.
            print("\t(Approx. 1000 for each label and 10000 for all labels).")
            print("The actual size of each tr dataset:", self.size_subsets)

    def training_loaders(self, tr_dataset):
        """
        Build loader 0 (all of self.label plus size_all_tr_each samples of
        every other label), then sequential loaders over the remaining data.

        Returns (loaders, n_splits, labels per split, split sizes); n_splits
        counts only the remainder splits, not loader 0.
        """
        if self.verbose:
            print("\n\n " + ' Loading Training DataLoader for Experiments 2 '.center(70, '*'))

        split_size = self.split_size

        # Trial one: the "prior" dataset mixing all labels.
        digit_subsets = {}  # label -> torch.utils.data.Subset
        for digit in range(10):
            digit_indices = [i for i in range(len(tr_dataset)) if tr_dataset[i][1] == digit]
            if digit == self.label:
                digit_subset_indices = digit_indices
            else:
                digit_subset_indices = digit_indices[:self.size_all_tr_each]
            digit_subsets[digit] = Subset(tr_dataset, digit_subset_indices)
        all_prior_dataset_tr = ConcatDataset([digit_subsets[digit] for digit in range(10)])

        # Trial two: everything not used above, in sequential splits.
        remaining_indices = set(range(len(tr_dataset)))
        for digit in range(10):
            remaining_indices -= set(digit_subsets[digit].indices)
        remaining_indices = list(remaining_indices)

        if self.shuffle:
            random.shuffle(remaining_indices)

        subset_datasets = [all_prior_dataset_tr]
        n_splits = len(remaining_indices) // split_size
        # Fixed: remainder test must use the remaining data, not the full
        # dataset, or n_splits can be off by one.
        if len(remaining_indices) % split_size != 0:
            n_splits += 1
        for i in range(n_splits):
            start = i * split_size
            end = (i + 1) * split_size if i < n_splits - 1 else len(remaining_indices)
            subsets_indices = remaining_indices[start:end]
            subset_datasets.append(Subset(tr_dataset, subsets_indices))

        subset_loaders = []
        for subset_dataset in subset_datasets:
            subset_loaders.append(
                get_data_loader(
                    subset_dataset,
                    batch_size=self.batch_size_tr,
                    cuda=self.cuda,
                    drop_last=self.drop_last,
                    shuffle=self.shuffle,
                )
            )
        if self.verbose:
            print(' Loading training DataLoaders successful '.center(70, '*'))

        # Labels present in each loader: all labels in loader 0, everything
        # except self.label in the remainder splits.
        labels_tr = [list(range(10))]
        labels_tr_splits = list(range(10))
        labels_tr_splits.remove(self.label)
        labels_tr += [labels_tr_splits] * n_splits

        # size of each tr dataset:
        size_subsets = [len(subset_dataset) for subset_dataset in subset_datasets]

        return subset_loaders, n_splits, labels_tr, size_subsets


class dataloaders_3(object):
    """
    Corresponds to the `relearning` case:
    Similar to experiments 2, but include about 50% of the specified-label
    data in the first loader only, and re-introduce the remaining 50% by
    splitting it evenly over the last n_relearning splits.
    """
    def __init__(self, general_config, data_config, dataset_tr, dataset_te, verbose):

        self.cuda = general_config['cuda']
        self.seed = general_config['seed']
        self.label = general_config['label']

        self.shuffle = data_config['shuffle']
        self.batch_size_tr = data_config['batch_size_tr']
        self.batch_size_te = data_config['batch_size_te']
        self.split_size = data_config['split_size']
        if self.split_size > len(dataset_tr):
            raise ValueError("split_size exceeds the size of the initial training dataset.")
        self.normalize = data_config['normalize']
        self.pad = data_config['pad']
        self.permutation = data_config['permutation']
        self.drop_last = data_config['drop_last']
        self.size_all_tr_each = data_config['size_all_tr_each']
        self.n_relearning = data_config['n_relearning']  # Number of splits for relearning

        # Set random seeds:
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        random.seed(self.seed)

        self.verbose = verbose
        self.training_loaders, self.n_splits, self.labels_tr, self.size_subsets = self.training_loaders(dataset_tr)
        self.test_loaders, self.labels_te = test_loaders(dataset_te, general_config, data_config, verbose=verbose)
        if verbose:
            print("\nAn overview of the dataloaders for experiments 3:")
            print("Training dataloaders:")
            print("\t({}) with half available data plus {} samples from each other labels,".format(self.label, self.size_all_tr_each))
            print("\t(0-9 except {}: 1), ..., (0-9 except {}: {})".format(self.label, self.label, self.n_splits - self.n_relearning))
            # Fixed: the format call previously passed (label, label,
            # n_relearning) to a single placeholder, printing the label.
            print("\t(0-9: 1), ..., (0-9: {})".format(self.n_relearning))
            print("Test dataloaders: (0), (1), ..., (9), (0-9), each with all the test data available,")
            # CIFAR-10 test set: 1000 images per class, 10000 in total.
            print("\t(Approx. 1000 for each label and 10000 for all labels).")
            print("The actual size of each tr dataset:", self.size_subsets)


    def training_loaders(self, tr_dataset):
        """
        Build the relearning schedule:
        - loader 0: half of self.label's data plus size_all_tr_each samples
          of every other label;
        - middle loaders: sequential splits without self.label;
        - last n_relearning loaders: sequential splits plus an even share of
          the held-back half of self.label's data.

        Returns (loaders, n_splits, labels per split, split sizes); n_splits
        counts only the splits after loader 0.
        """
        if self.verbose:
            print("\n\n " + ' Loading Training DataLoader for Experiments 3 '.center(70, '*'))

        split_size = self.split_size

        # All samples with the specified label, split into a first half
        # (seen up front) and a held-back second half (re-introduced later).
        single_indices = [i for i in range(len(tr_dataset)) if tr_dataset[i][1] == self.label]
        single_former_size = len(single_indices) // 2
        single_indices_former = single_indices[:single_former_size]
        single_indices_latter = single_indices[single_former_size:]
        # Share of held-back samples appended to each relearning split.
        single_latter_size = len(single_indices_latter) // self.n_relearning

        # Trial 1: the "prior" dataset mixing all labels.
        digit_subsets = {}  # label -> torch.utils.data.Subset
        for digit in range(10):
            digit_indices = [i for i in range(len(tr_dataset)) if tr_dataset[i][1] == digit]
            if digit == self.label:
                digit_subset_indices = single_indices_former
            else:
                digit_subset_indices = digit_indices[:self.size_all_tr_each]
            digit_subsets[digit] = Subset(tr_dataset, digit_subset_indices)
        all_prior_dataset_tr = ConcatDataset([digit_subsets[digit] for digit in range(10)])

        # Trial 2: the rest of the data, excluding the held-back half.
        remaining_indices = set(range(len(tr_dataset)))
        for digit in range(10):
            remaining_indices -= set(digit_subsets[digit].indices)
        remaining_indices -= set(single_indices_latter)
        remaining_indices = list(remaining_indices)

        if self.shuffle:
            random.shuffle(remaining_indices)

        subset_datasets = [all_prior_dataset_tr]
        n_splits = len(remaining_indices) // split_size
        # Fixed: remainder test must use the remaining data, not the full
        # dataset, or n_splits can be off by one.
        if len(remaining_indices) % split_size != 0:
            n_splits += 1
        if self.n_relearning > n_splits:
            raise ValueError("The number of splits assigned for relearning of the specified label exceeds the total number of splits.")
        index_start_relearning = n_splits - self.n_relearning

        for i in range(n_splits):
            start = i * split_size
            end = (i + 1) * split_size if i < n_splits - 1 else len(remaining_indices)
            subsets_indices = remaining_indices[start:end]
            subset_datasets.append(Subset(tr_dataset, subsets_indices))

        # Trial 3: distribute the held-back samples over the LAST
        # n_relearning splits.  Fixed three bugs in the original: it looped
        # over range(n_relearning, n_splits) instead of the last n_relearning
        # splits, sliced single_indices_latter with the split index instead
        # of the relearning index (yielding empty/out-of-range slices), and
        # ignored that subset_datasets[0] is the prior dataset.
        for j in range(self.n_relearning):
            start = j * single_latter_size
            end = (j + 1) * single_latter_size if j < self.n_relearning - 1 else len(single_indices_latter)
            relearning_indices = single_indices_latter[start:end]
            split_idx = index_start_relearning + j + 1  # +1: slot 0 is the prior dataset
            subset_datasets[split_idx] = ConcatDataset([subset_datasets[split_idx],
                                                        Subset(tr_dataset, relearning_indices)])

        subset_loaders = []
        for subset_dataset in subset_datasets:
            subset_loaders.append(
                get_data_loader(
                    subset_dataset,
                    batch_size=self.batch_size_tr,
                    cuda=self.cuda,
                    drop_last=self.drop_last,
                    shuffle=self.shuffle,
                )
            )
        if self.verbose:
            print(' Loading training DataLoaders successful '.center(70, '*'))

        # Labels present in each loader.
        labels_tr = [list(range(10))]
        labels_tr_splits = list(range(10))
        labels_tr_splits.remove(self.label)
        labels_tr += [labels_tr_splits] * (n_splits - self.n_relearning)
        labels_tr += [list(range(10))] * self.n_relearning

        # size of each tr dataset:
        size_subsets = [len(subset_dataset) for subset_dataset in subset_datasets]

        return subset_loaders, n_splits, labels_tr, size_subsets

# ---------------------------------------------------------------------------
# NOTE(review): the original patch continued with a second new file,
# concept_formation/vision-experiments/datasets_mnist.py (664 lines), whose
# content is truncated in this chunk and is therefore not reconstructed here.
# ---------------------------------------------------------------------------
torch.utils.data import Dataset, DataLoader, Subset, ConcatDataset +from torchvision import datasets, transforms +import copy +import random + + + +def MNIST_dataset(split='train', pad=False, normalize=False, permutation=False, + download=True, verbose=True): + """ + Load the original MNIST training/test dataset. + """ + + dataset_class = datasets.MNIST + transform = [transforms.ToTensor(), transforms.Pad(2)] if pad else [transforms.ToTensor()] + if normalize: + transform.append(transforms.Normalize((0.1307,), (0.3081,))) # mean and std for all pixels in MNIST + # transform.append(transforms.Normalize((0.5,), (0.5,))) + if permutation: + transform.append(transforms.Lambda(lambda x, p=permutation: permutate_image_pixels(x, p))) + dataset_transform = transforms.Compose(transform) + + # Load dataset: + dataset = dataset_class('./datasets/MNIST', + train=False if split=='test' else True, + download=download, + transform=dataset_transform) + if verbose: + print("MNIST {} dataset consisting of {} samples.".format(split, len(dataset))) + + return dataset + + + +def get_data_loader(dataset, batch_size, cuda=False, drop_last=False, shuffle=False): + """ + Return object for the provided dataset object. + """ + return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, + **({'num_workers': 0, 'pin_memory': True} if cuda else {})) + + +def test_loaders(te_dataset, general_config, data_config, verbose=True): + """ + Generate test DataLoaders for all experiment cases. 
+ """ + label = general_config['label'] + cuda = general_config['cuda'] + seed = general_config['seed'] + + drop_last = data_config['drop_last'] + shuffle = data_config['shuffle'] + split_size = data_config['split_size'] + batch_size_te = data_config['batch_size_te'] + + # Set random seeds: + np.random.seed(seed) + torch.manual_seed(seed) + random.seed(seed) + + if verbose: + print("\n\n " +' Loading test DataLoaders '.center(70, '*')) + + test_datasets = [] + for d in range(10): + single_digit_indices = [i for i in range(len(te_dataset)) if te_dataset[i][1] == d] + test_datasets.append(Subset(te_dataset, single_digit_indices)) + test_datasets.append(te_dataset) + + test_loaders = [get_data_loader( + test_dataset, + batch_size=batch_size_te, + cuda=cuda, + drop_last=drop_last, + shuffle=shuffle) for test_dataset in test_datasets] + + if verbose: + print(' Loading test DataLoaders successful '.center(70, '*')) + + labels_te = [] + for i in range(10): + labels_te.append(i) + labels_te.append(tuple(range(10))) + + return test_loaders, labels_te + + + +class dataloaders_0(object): + """ + Corresponds to the case: + All trained data are shuffled and trained in sequential splits. 
+ """ + + def __init__(self, general_config, data_config, dataset_tr, dataset_te, verbose): + + self.cuda = general_config['cuda'] + self.seed = general_config['seed'] + + self.shuffle = data_config['shuffle'] + self.batch_size_tr = data_config['batch_size_tr'] + self.batch_size_te = data_config['batch_size_te'] + self.split_size = data_config['split_size'] + if self.split_size > len(dataset_tr): + raise ValueError("split_size exceeds the size of the initial training dataset.") + self.normalize = data_config['normalize'] + self.pad = data_config['pad'] + self.permutation = data_config['permutation'] + self.drop_last = data_config['drop_last'] + + # Set random seeds: + np.random.seed(self.seed) + torch.manual_seed(self.seed) + random.seed(self.seed) + + self.verbose = verbose + self.training_loaders, self.n_splits, self.labels_tr, self.size_subsets = self.training_loaders(dataset_tr) + self.test_loaders, self.labels_te = test_loaders(dataset_te, general_config, data_config, verbose=verbose) + if verbose: + print("\nAn overview of the dataloaders for experiments 0:") + print("Training dataloaders: (0-9: 1), ..., (0-9: {})".format(self.n_splits) + ", each split with {} data.".format(self.split_size)) + print("Test dataloaders: (0), (1), ..., (9), (0-9), each with all the test data available,") + print("(Approx. 
6000 for each label and 10000 for all labels).") + print("The actual size of each tr subset:", self.size_subsets) + + + def training_loaders(self, tr_dataset): + + if self.verbose: + print("\n\n " +' Loading Training DataLoader for Experiments 0 '.center(70, '*')) + + split_size = self.split_size + + tr_dataset_indices = list(range(len(tr_dataset))) + if self.shuffle: + random.shuffle(tr_dataset_indices) + + subset_datasets = [] + n_splits = len(tr_dataset) // split_size + if len(tr_dataset) % split_size != 0: + n_splits += 1 + + for i in range(n_splits): + start = i * split_size + end = (i + 1) * split_size if i < n_splits - 1 else len(tr_dataset_indices) + subsets_indices = tr_dataset_indices[start:end] + subset_datasets.append(Subset(tr_dataset, subsets_indices)) + + subset_loaders = [get_data_loader( + subset_dataset, + batch_size=self.batch_size_tr, + cuda=self.cuda, + drop_last=self.drop_last, + shuffle=self.shuffle) for subset_dataset in subset_datasets] + + if self.verbose: + print(' Loading training DataLoaders successful '.center(70, '*')) + + # labels exist among the tr-datasets: + labels_tr = [list(range(10))] * 10 + + # size of each tr dataset: + size_subsets = [len(subset_dataset) for subset_dataset in subset_datasets] + + return subset_loaders, n_splits, labels_tr, size_subsets + + + +class dataloaders_1(object): + """ + Corresponds to the case: + First train all data from the specified label (0 for instance) + Then train all shuffled data from remaining labels in sequential splits. 
+ """ + + def __init__(self, general_config, data_config, dataset_tr, dataset_te, verbose): + + self.cuda = general_config['cuda'] + self.seed = general_config['seed'] + self.label = general_config['label'] + + self.shuffle = data_config['shuffle'] + self.batch_size_tr = data_config['batch_size_tr'] + self.batch_size_te = data_config['batch_size_te'] + self.n_split = data_config['n_split'] + # if self.split_size > len(dataset_tr): + # raise ValueError("split_size exceeds the size of the initial training dataset.") + self.normalize = data_config['normalize'] + self.pad = data_config['pad'] + self.permutation = data_config['permutation'] + self.drop_last = data_config['drop_last'] + + # Set random seeds: + np.random.seed(self.seed) + torch.manual_seed(self.seed) + random.seed(self.seed) + + self.verbose = verbose + self.training_loaders, self.n_rest_split, self.labels_tr, self.size_subsets = self.training_loaders(dataset_tr) + self.test_loaders, self.labels_te = test_loaders(dataset_te, general_config, data_config, verbose=verbose) + if verbose: + print("\nAn overview of the dataloaders for experiments 1:") + print("Training dataloaders:") + print("\t({}) with all available data,".format(self.label)) + print("\t(0-9 except {}: 1), ..., (0-9 except {}: {})".format(self.label, self.label, self.n_rest_split)) + print("Test dataloaders: (0), (1), ..., (9), (0-9), each with all the test data available,") + print("\t(Approx. 
6000 for each label and 10000 for all labels).") + print("The actual size of each tr subset:", self.size_subsets) + + def training_loaders(self, tr_dataset): + if self.verbose: + print("\n\n " +' Loading Training DataLoader for Experiments 1 '.center(70, '*')) + + n_split = self.n_split + + # Trial one: + single_indices = [i for i in range(len(tr_dataset)) if tr_dataset[i][1] == self.label] + # if self.shuffle: + # random.shuffle(single_indices) + single_dataset_tr = Subset(tr_dataset, single_indices) + + # Trial two: + remaining_indices = set(range(len(tr_dataset))) + remaining_indices -= set(single_indices) + remaining_indices = list(remaining_indices) + if self.shuffle: + random.shuffle(remaining_indices) + + subset_datasets = [single_dataset_tr] + n_rest_split = n_split - 1 # nbr of splits for rest labels + split_size = len(remaining_indices) // n_rest_split + # n_splits = len(remaining_indices) // split_size + # if len(tr_dataset) % split_size != 0: + # n_splits += 1 + for i in range(n_rest_split): + start = i * split_size + end = (i + 1) * split_size if i < n_rest_split - 1 else len(remaining_indices) + subsets_indices = remaining_indices[start:end] + subset_datasets.append(Subset(tr_dataset, subsets_indices)) + + subset_loaders = [] + for subset_dataset in subset_datasets: + subset_loaders.append( + get_data_loader( + subset_dataset, + batch_size=self.batch_size_tr, + cuda=self.cuda, + drop_last=self.drop_last, + shuffle=self.shuffle, + ) + ) + if self.verbose: + print(' Loading training DataLoaders successful '.center(70, '*')) + + # labels exist among the tr-datasets: + labels_tr = [[self.label]] + labels_tr_splits = list(range(10)) + labels_tr_splits.remove(self.label) + labels_tr += [labels_tr_splits] * n_rest_split + + # size of each tr dataset: + size_subsets = [len(subset_dataset) for subset_dataset in subset_datasets] + + return subset_loaders, n_rest_split, labels_tr, size_subsets + + + +class dataloaders_2(object): + """ + Corresponds to the case: 
class dataloaders_2(object):
    """
    Corresponds to the case:
    First train all data from the specified label (0 for instance) along with the
    same portion of the other labels, then train all remaining shuffled data from
    the remaining labels in sequential splits.
    """

    def __init__(self, general_config, data_config, dataset_tr, dataset_te, verbose):
        self.cuda = general_config['cuda']
        self.seed = general_config['seed']
        self.label = general_config['label']

        self.shuffle = data_config['shuffle']
        self.batch_size_tr = data_config['batch_size_tr']
        self.batch_size_te = data_config['batch_size_te']
        self.n_split = data_config['n_split']
        self.normalize = data_config['normalize']
        self.pad = data_config['pad']
        self.permutation = data_config['permutation']
        self.drop_last = data_config['drop_last']
        self.size_all_tr_each = data_config['size_all_tr_each']  # per-other-label sample count in the first loader

        # Seed every RNG involved so the split/shuffle layout is reproducible.
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        random.seed(self.seed)

        self.verbose = verbose
        # The method is intentionally shadowed by its result (module convention).
        self.training_loaders, self.n_rest_split, self.labels_tr, self.size_subsets = self.training_loaders(dataset_tr)
        self.test_loaders, self.labels_te = test_loaders(dataset_te, general_config, data_config, verbose=verbose)
        if verbose:
            print("\nAn overview of the dataloaders for experiments 2:")
            print("Training dataloaders:")
            print("\t({}) with all available data plus {} samples from each other labels,".format(self.label, self.size_all_tr_each))
            print("\t(0-9 except {}: 1), ..., (0-9 except {}: {})".format(self.label, self.label, self.n_rest_split))
            print("Test dataloaders: (0), (1), ..., (9), (0-9), each with all the test data available,")
            print("\t(Approx. 6000 for each label and 10000 for all labels).")
            print("The actual size of each tr dataset:", self.size_subsets)

    def training_loaders(self, tr_dataset):
        """First loader mixes all specified-label data with a slice of every other label;
        the rest of the data is split into n_split - 1 sequential loaders."""
        if self.verbose:
            print("\n\n " + ' Loading Training DataLoader for Experiments 2 '.center(70, '*'))

        n_split = self.n_split
        # Robustness fix: n_split - 1 is used as a divisor below; n_split == 1
        # used to crash with an opaque ZeroDivisionError.
        if n_split < 2:
            raise ValueError("n_split must be at least 2: one prior split and at least one rest split.")

        # Loader 1: all specified-label data + size_all_tr_each of every other label.
        digit_subsets = {}  # (digit, dataset.Subset)
        for digit in range(10):
            digit_indices = [i for i in range(len(tr_dataset)) if tr_dataset[i][1] == digit]
            if digit == self.label:
                digit_subset_indices = digit_indices
            else:
                digit_subset_indices = digit_indices[:self.size_all_tr_each]
            digit_subsets[digit] = Subset(tr_dataset, digit_subset_indices)
        all_prior_dataset_tr = ConcatDataset([digit_subsets[digit] for digit in range(10)])

        # Remaining loaders: everything the first loader did not consume.
        remaining_indices = set(range(len(tr_dataset)))
        for digit in range(10):
            remaining_indices -= set(digit_subsets[digit].indices)
        remaining_indices = list(remaining_indices)

        if self.shuffle:
            random.shuffle(remaining_indices)

        subset_datasets = [all_prior_dataset_tr]
        n_rest_split = n_split - 1
        split_size = len(remaining_indices) // n_rest_split
        for i in range(n_rest_split):
            start = i * split_size
            end = (i + 1) * split_size if i < n_rest_split - 1 else len(remaining_indices)
            subset_datasets.append(Subset(tr_dataset, remaining_indices[start:end]))

        subset_loaders = [
            get_data_loader(
                subset_dataset,
                batch_size=self.batch_size_tr,
                cuda=self.cuda,
                drop_last=self.drop_last,
                shuffle=self.shuffle,
            )
            for subset_dataset in subset_datasets
        ]
        if self.verbose:
            print(' Loading training DataLoaders successful '.center(70, '*'))

        # Labels present in each training dataset.
        labels_tr = [list(range(10))]
        labels_tr_splits = [d for d in range(10) if d != self.label]
        labels_tr += [labels_tr_splits] * n_rest_split

        # Size of each training dataset.
        size_subsets = [len(subset_dataset) for subset_dataset in subset_datasets]

        return subset_loaders, n_rest_split, labels_tr, size_subsets
class dataloaders_3(object):
    """
    Corresponds to the `relearning` case:
    Similar to experiments 2, but include about 50% of the specified-label data in
    the first loader only, and split the remaining 50% evenly across the LAST
    `n_relearning` splits.
    """

    def __init__(self, general_config, data_config, dataset_tr, dataset_te, verbose):
        self.cuda = general_config['cuda']
        self.seed = general_config['seed']
        self.label = general_config['label']

        self.shuffle = data_config['shuffle']
        self.batch_size_tr = data_config['batch_size_tr']
        self.batch_size_te = data_config['batch_size_te']
        self.n_split = data_config['n_split']
        self.normalize = data_config['normalize']
        self.pad = data_config['pad']
        self.permutation = data_config['permutation']
        self.drop_last = data_config['drop_last']
        self.size_all_tr_each = data_config['size_all_tr_each']
        self.n_relearning = data_config['n_relearning']  # number of splits used for relearning

        # Seed every RNG involved so the split/shuffle layout is reproducible.
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        random.seed(self.seed)

        self.verbose = verbose
        # The method is intentionally shadowed by its result (module convention).
        self.training_loaders, self.n_rest_split, self.labels_tr, self.size_subsets = self.training_loaders(dataset_tr)
        self.test_loaders, self.labels_te = test_loaders(dataset_te, general_config, data_config, verbose=verbose)
        if verbose:
            print("\nAn overview of the dataloaders for experiments 3:")
            print("Training dataloaders:")
            print("\t({}) with half available data plus {} samples from each other labels,".format(self.label, self.size_all_tr_each))
            print("\t(0-9 except {}: 1), ..., (0-9 except {}: {})".format(self.label, self.label, self.n_rest_split - self.n_relearning))
            # Fixed: the placeholder used to be filled with self.label instead of n_relearning.
            print("\t(0-9: 1), ..., (0-9: {})".format(self.n_relearning))
            print("Test dataloaders: (0), (1), ..., (9), (0-9), each with all the test data available,")
            print("\t(Approx. 6000 for each label and 10000 for all labels).")
            print("The actual size of each tr dataset:", self.size_subsets)

    def training_loaders(self, tr_dataset):
        """Build the training loaders; returns (loaders, n_rest_split, labels_tr, size_subsets)."""
        if self.verbose:
            print("\n\n " + ' Loading Training DataLoader for Experiments 3 '.center(70, '*'))

        n_split = self.n_split
        # Robustness fixes: both quantities below are used as divisors.
        if n_split < 2:
            raise ValueError("n_split must be at least 2: one prior split and at least one rest split.")
        if self.n_relearning < 1:
            raise ValueError("n_relearning must be at least 1.")

        # All indices carrying the specified label.
        single_indices = [i for i in range(len(tr_dataset)) if tr_dataset[i][1] == self.label]

        single_former_size = len(single_indices) // 2  # specified-label samples in the first subset
        single_indices_former = single_indices[:single_former_size]
        single_indices_latter = single_indices[single_former_size:]
        # specified-label samples per relearning subset
        single_latter_size = len(single_indices_latter) // self.n_relearning

        # First loader: half of the specified label + size_all_tr_each of every other label.
        digit_subsets = {}  # (digit, dataset.Subset)
        for digit in range(10):
            digit_indices = [i for i in range(len(tr_dataset)) if tr_dataset[i][1] == digit]
            if digit == self.label:
                digit_subset_indices = single_indices_former
            else:
                digit_subset_indices = digit_indices[:self.size_all_tr_each]
            digit_subsets[digit] = Subset(tr_dataset, digit_subset_indices)
        all_prior_dataset_tr = ConcatDataset([digit_subsets[digit] for digit in range(10)])

        # Everything not used by the first loader and not reserved for relearning.
        remaining_indices = set(range(len(tr_dataset)))
        for digit in range(10):
            remaining_indices -= set(digit_subsets[digit].indices)
        remaining_indices -= set(single_indices_latter)
        remaining_indices = list(remaining_indices)

        if self.shuffle:
            random.shuffle(remaining_indices)

        subset_datasets = [all_prior_dataset_tr]

        n_rest_split = n_split - 1
        split_size = len(remaining_indices) // n_rest_split
        if self.n_relearning > n_rest_split:
            raise ValueError("The number of splits assigned for relearning of the specified label exceeds the total number of rest splits.")
        index_start_relearning = n_split - self.n_relearning

        for i in range(n_rest_split):
            start = i * split_size
            end = (i + 1) * split_size if i < n_rest_split - 1 else len(remaining_indices)
            subset_datasets.append(Subset(tr_dataset, remaining_indices[start:end]))

        # Inject the held-back specified-label data into the last n_relearning subsets.
        for i in range(self.n_relearning):
            start = i * single_latter_size
            end = (i + 1) * single_latter_size if i < self.n_relearning - 1 else len(single_indices_latter)
            relearning_indices = single_indices_latter[start:end]
            subset_datasets[index_start_relearning + i] = ConcatDataset(
                [subset_datasets[index_start_relearning + i], Subset(tr_dataset, relearning_indices)])

        subset_loaders = [
            get_data_loader(
                subset_dataset,
                batch_size=self.batch_size_tr,
                cuda=self.cuda,
                drop_last=self.drop_last,
                shuffle=self.shuffle,
            )
            for subset_dataset in subset_datasets
        ]
        if self.verbose:
            print(' Loading training DataLoaders successful '.center(70, '*'))

        # Labels present in each training dataset.
        labels_tr = [list(range(10))]
        labels_tr_splits = [d for d in range(10) if d != self.label]
        labels_tr += [labels_tr_splits] * (n_rest_split - self.n_relearning)
        labels_tr += [list(range(10))] * self.n_relearning

        # Size of each training dataset.
        size_subsets = [len(subset_dataset) for subset_dataset in subset_datasets]

        return subset_loaders, n_rest_split, labels_tr, size_subsets
class dataloaders_4(object):
    """
    This is a `bonus` one that also corresponds to the `relearning` case:
    Similar to experiments 3, but the splits that receive the held-back
    specified-label data are the MIDDLE splits instead of the last ones.
    """

    def __init__(self, general_config, data_config, dataset_tr, dataset_te, verbose):
        self.cuda = general_config['cuda']
        self.seed = general_config['seed']
        self.label = general_config['label']

        self.shuffle = data_config['shuffle']
        self.batch_size_tr = data_config['batch_size_tr']
        self.batch_size_te = data_config['batch_size_te']
        self.n_split = data_config['n_split']
        self.normalize = data_config['normalize']
        self.pad = data_config['pad']
        self.permutation = data_config['permutation']
        self.drop_last = data_config['drop_last']
        self.size_all_tr_each = data_config['size_all_tr_each']
        self.n_relearning = data_config['n_relearning']  # number of splits for relearning

        # Seed every RNG involved so the split/shuffle layout is reproducible.
        np.random.seed(self.seed)
        torch.manual_seed(self.seed)
        random.seed(self.seed)

        self.verbose = verbose
        # The method is intentionally shadowed by its result (module convention).
        self.training_loaders, self.n_rest_split, self.labels_tr, self.size_subsets = self.training_loaders(dataset_tr)
        self.test_loaders, self.labels_te = test_loaders(dataset_te, general_config, data_config, verbose=verbose)
        if verbose:
            print("\nAn overview of the dataloaders for experiments 4:")
            print("Training dataloaders:")
            print("\t({}) with half available data plus {} samples from each other labels,".format(self.label, self.size_all_tr_each))
            print("\t(0-9 except {}: 1), ..., (0-9 except {}: {})".format(self.label, self.label, self.n_rest_split))
            print("\tWhile the middle {} splits are added with the rest data from label {}.".format(self.n_relearning, self.label))
            print("Test dataloaders: (0), (1), ..., (9), (0-9), each with all the test data available,")
            print("\t(Approx. 6000 for each label and 10000 for all labels).")
            print("The actual size of each tr dataset:", self.size_subsets)

    def training_loaders(self, tr_dataset):
        """Build the training loaders; returns (loaders, n_rest_split, labels_tr, size_subsets)."""
        if self.verbose:
            print("\n\n " + ' Loading Training DataLoader for Experiments 4 '.center(70, '*'))

        n_split = self.n_split
        # Robustness fix: n_relearning is used as a divisor below.
        if self.n_relearning < 1:
            raise ValueError("n_relearning must be at least 1.")
        if self.n_relearning > n_split - 2:
            raise ValueError("The number of splits for relearning is not valid. It should be less than the number of splits - 2 for this experiment set.")

        # All indices carrying the specified label.
        single_indices = [i for i in range(len(tr_dataset)) if tr_dataset[i][1] == self.label]

        single_former_size = len(single_indices) // 2  # specified-label samples in the first subset
        single_indices_former = single_indices[:single_former_size]
        single_indices_latter = single_indices[single_former_size:]
        # specified-label samples per relearning subset
        single_latter_size = len(single_indices_latter) // self.n_relearning

        # Decide which splits are chosen as the relearning ones (centered on the middle).
        middle_split = n_split // 2
        if self.n_relearning % 2 != 0:
            index_start_relearning = int(middle_split - (self.n_relearning // 2))
        else:
            index_start_relearning = int(middle_split - (self.n_relearning / 2 - 1))

        # First loader: half of the specified label + size_all_tr_each of every other label.
        digit_subsets = {}  # (digit, dataset.Subset)
        for digit in range(10):
            digit_indices = [i for i in range(len(tr_dataset)) if tr_dataset[i][1] == digit]
            if digit == self.label:
                digit_subset_indices = single_indices_former
            else:
                digit_subset_indices = digit_indices[:self.size_all_tr_each]
            digit_subsets[digit] = Subset(tr_dataset, digit_subset_indices)
        all_prior_dataset_tr = ConcatDataset([digit_subsets[digit] for digit in range(10)])

        # Everything not used by the first loader and not reserved for relearning.
        remaining_indices = set(range(len(tr_dataset)))
        for digit in range(10):
            remaining_indices -= set(digit_subsets[digit].indices)
        remaining_indices -= set(single_indices_latter)
        remaining_indices = list(remaining_indices)

        if self.shuffle:
            random.shuffle(remaining_indices)

        subset_datasets = [all_prior_dataset_tr]

        n_rest_split = n_split - 1
        split_size = len(remaining_indices) // n_rest_split

        for i in range(n_rest_split):
            start = i * split_size
            end = (i + 1) * split_size if i < n_rest_split - 1 else len(remaining_indices)
            subset_datasets.append(Subset(tr_dataset, remaining_indices[start:end]))

        # Inject the held-back specified-label data into the MIDDLE n_relearning subsets.
        for i in range(self.n_relearning):
            start = i * single_latter_size
            end = (i + 1) * single_latter_size if i < self.n_relearning - 1 else len(single_indices_latter)
            relearning_indices = single_indices_latter[start:end]
            subset_datasets[index_start_relearning + i] = ConcatDataset(
                [subset_datasets[index_start_relearning + i], Subset(tr_dataset, relearning_indices)])

        subset_loaders = [
            get_data_loader(
                subset_dataset,
                batch_size=self.batch_size_tr,
                cuda=self.cuda,
                drop_last=self.drop_last,
                shuffle=self.shuffle,
            )
            for subset_dataset in subset_datasets
        ]
        if self.verbose:
            print(' Loading training DataLoaders successful '.center(70, '*'))

        # Labels present in each training dataset.
        # Fixed: the old code appended the full-label entries at the END, although
        # the relearning data is injected into the MIDDLE splits above, so labels_tr
        # misreported which loaders contain the specified label.
        labels_tr_splits = [d for d in range(10) if d != self.label]
        labels_tr = [list(range(10))]
        for j in range(1, n_rest_split + 1):
            if index_start_relearning <= j < index_start_relearning + self.n_relearning:
                labels_tr.append(list(range(10)))
            else:
                labels_tr.append(labels_tr_splits[:])

        # Size of each training dataset.
        size_subsets = [len(subset_dataset) for subset_dataset in subset_datasets]

        return subset_loaders, n_rest_split, labels_tr, size_subsets
def reconfig(general_config, model_config, data_config):
    """
    Re-initialize model_config and data_config based on the requirements of
    experiments 0: fixed split sizes per dataset, and full-batch CPU settings
    for the cobweb model.
    """
    if data_config['dataset'] == 'mnist':
        data_config['split_size'] = 6000
    else:
        data_config['split_size'] = 5000
    if model_config['type'] == 'cobweb':
        # Fixed: this was `general_config['cuda'] == False`, a no-op comparison
        # that silently left CUDA enabled for the cobweb model.
        general_config['cuda'] = False
        data_config['batch_size_tr'] = 60000
        data_config['batch_size_te'] = 10000


def experiment_0_nn(general_config, model_config, data_config, dataset_tr, dataset_te, verbose):
    """Run experiment 0 with a neural-network model; returns the list of test accuracies."""
    reconfig(general_config, model_config, data_config)

    # Dataloaders:
    dataloaders = dataloaders_0(general_config, data_config, dataset_tr, dataset_te, verbose)
    loaders_tr = dataloaders.training_loaders
    loaders_te = dataloaders.test_loaders

    # Models and optimizers:
    device = torch.device("cuda" if general_config['cuda'] else "cpu")
    model, optimizer = models_nn.build_model(model_config, data_config, device)

    # Store the test accuracies
    test_accs = []

    if verbose:
        print('\n\n' + ' START EXPERIMENTS '.center(70, '~'))
        print("Experiments type: 0")
        print("Experiments description: Train data from all labels with sequential splits.")
        print("Number of Train-test trials:", len(loaders_tr))
        print("Model:", model_config['type'])  # fc or cnn
        print("Seed:", general_config['seed'])
        print("Epochs:", model_config['epoch'])
        print("\nModel overview:")
        print(model)
        print("\nOptimizer:")
        print(optimizer)
        print("\nCUDA is {}used.".format("" if general_config['cuda'] else "NOT "))

    for i in range(len(loaders_tr)):
        if verbose:
            print("\n\n" + " Trial {} ".format(i + 1).center(70, '='))

        for epoch in range(1, model_config['epoch'] + 1):
            if verbose:
                print("\n\n [Epoch {}]".format(epoch))
                print("\n====> Model Training with labels {} <====".format(dataloaders.labels_tr[i]))
            models_nn.train(model, optimizer, loaders_tr[i], epoch, model_config['log_interval'], device)

        # Evaluate on every per-label loader plus the full test set.
        for j in range(len(loaders_te)):
            if verbose:
                print("\n----> Model Testing with labels {} <----".format(dataloaders.labels_te[j]))
            acc = models_nn.test(model, loaders_te[j], device)
            test_accs.append(acc.item())

    print("\n\nThis is the end of the experiments.")
    print("There are {} test accuracy data in total.".format(len(test_accs)))
    return test_accs


def experiment_0_cobweb(general_config, model_config, data_config, dataset_tr, dataset_te, verbose):
    """Run experiment 0 with the cobweb model; returns the list of test accuracies."""
    reconfig(general_config, model_config, data_config)

    # Dataloaders:
    dataloaders = dataloaders_0(general_config, data_config, dataset_tr, dataset_te, verbose)
    loaders_tr = dataloaders.training_loaders
    loaders_te = dataloaders.test_loaders

    # Model: the tree is shaped by an example image from the first loader.
    example_imgs, _ = next(iter(loaders_tr[0]))
    model = CobwebTorchTree(example_imgs.shape[1:])

    # Store the test accuracies
    test_accs = []

    if verbose:
        print('\n\n' + ' START EXPERIMENTS '.center(70, '~'))
        print("Experiments type: 0")
        print("Experiments description: Train data from all labels with sequential splits.")
        print("Number of Train-test trials:", len(loaders_tr))
        print("Model:", model_config['type'])  # cobweb
        print("Seed:", general_config['seed'])
        print("\nCUDA is {}used.".format("" if general_config['cuda'] else "NOT "))

    for i in range(len(loaders_tr)):
        if verbose:
            print("\n\n" + " Trial {} ".format(i + 1).center(70, '='))
            print("\n====> Model Training with labels {} <====".format(dataloaders.labels_tr[i]))
        models_cobweb.train(model, loaders_tr[i])

        for j in range(len(loaders_te)):
            if verbose:
                print("\n----> Model Testing with labels {} <----".format(dataloaders.labels_te[j]))
            acc = models_cobweb.test(model, loaders_te[j])
            print("Test accuracy: {}".format(acc))
            test_accs.append(acc)

    print("\n\nThis is the end of the experiments.")
    print("There are {} test accuracy data in total.".format(len(test_accs)))
    return test_accs
b/concept_formation/vision-experiments/experiment_1.py
@@ -0,0 +1,118 @@
+import torch
+
+from concept_formation.cobweb_torch import CobwebTorchTree
+from datasets_mnist import dataloaders_1
+import models_nn
+import models_cobweb
+
+
+def reconfig(general_config, model_config, data_config):
+    """
+    Re-initialization for model_config and data_config based on requirements of experiments 1.
+    """
+    if data_config['dataset'] == 'mnist':
+        data_config['split_size'] = 6000
+    else:
+        data_config['split_size'] = 5000
+    if model_config['type'] == 'cobweb':
+        # Fix: was `general_config['cuda'] == False` — a comparison, not an
+        # assignment, so cobweb never actually forced CPU mode.
+        general_config['cuda'] = False
+        data_config['batch_size_tr'] = 60000
+        data_config['batch_size_te'] = 10000
+
+
+def experiment_1_nn(general_config, model_config, data_config, dataset_tr, dataset_te, verbose):
+
+    reconfig(general_config, model_config, data_config)
+    label = general_config['label']
+
+    # Dataloaders:
+    dataloaders = dataloaders_1(general_config, data_config, dataset_tr, dataset_te, verbose=verbose)
+    loaders_tr = dataloaders.training_loaders
+    loaders_te = dataloaders.test_loaders
+
+    # Models and optimizers:
+    device = torch.device("cuda" if general_config['cuda'] else "cpu")
+    model, optimizer = models_nn.build_model(model_config, data_config, device)
+
+    # Store the test accuracies
+    test_accs = []
+
+    if verbose:
+        print('\n\n' + ' START EXPERIMENTS '.center(70, '~'))
+        print("Experiments type: 1")
+        print("Experiments description: Train all data from label {} first, then train the rest with sequential splits.".format(label))
+        print("Number of Train-test trials:", len(loaders_tr))
+        print("Model:", model_config['type'])  # fc or cnn
+        print("Seed:", general_config['seed'])
+        print("The selected label:", label)
+        print("Epochs:", model_config['epoch'])
+        print("\nModel overview:")
+        print(model)
+        print("\nOptimizer:")
+        print(optimizer)
+        print("\nCUDA is {}used.".format("" if general_config['cuda'] else "NOT "))
+
+    for i in range(len(loaders_tr)):
+        if verbose:
+            print("\n\n" + " Trial {} 
".format(i+1).center(70, '=')) + + for epoch in range(1, model_config['epoch'] + 1): + if verbose: + print("\n\n [Epoch {}]".format(epoch)) + print("\n====> Model Training with labels {} <====".format(dataloaders.labels_tr[i])) + models_nn.train(model, optimizer, loaders_tr[i], epoch, model_config['log_interval'], device) + + for j in range(len(loaders_te)): + if verbose: + print("\n----> Model Testing with labels {} <----".format(dataloaders.labels_te[j])) + acc = models_nn.test(model, loaders_te[j], device) + test_accs.append(acc.item()) + + print("\n\nThis is the end of the experiments.") + print("There are {} test accuracy data in total.".format(len(test_accs))) + return test_accs + + +def experiment_1_cobweb(general_config, model_config, data_config, dataset_tr, dataset_te, verbose): + + reconfig(general_config, model_config, data_config) + label = general_config['label'] + + # Dataloaders: + dataloaders = dataloaders_1(general_config, data_config, dataset_tr, dataset_te, verbose=verbose) + loaders_tr = dataloaders.training_loaders + loaders_te = dataloaders.test_loaders + + # Models and optimizers: + example_imgs, _ = next(iter(loaders_tr[0])) + model = CobwebTorchTree(example_imgs.shape[1:]) + + # Store the test accuracies + test_accs = [] + + if verbose: + print('\n\n' + ' START EXPERIMENTS '.center(70, '~')) + print("Experiments type: 1") + print("Experiments description: Train all data from label {} first, then train the rest with sequential splits.".format(label)) + print("Number of Train-test trials:", len(loaders_tr)) + print("Model:", model_config['type']) # cobweb + print("Seed:", general_config['seed']) + print("The selected label:", label) + print("\nCUDA is {}used.".format("" if general_config['cuda'] else "NOT ")) + + for i in range(len(loaders_tr)): + if verbose: + print("\n\n" + " Trial {} ".format(i+1).center(70, '=')) + print("\n====> Model Training with labels {} <====".format(dataloaders.labels_tr[i])) + models_cobweb.train(model, 
loaders_tr[i])
+
+        for j in range(len(loaders_te)):
+            if verbose:
+                print("\n----> Model Testing with labels {} <----".format(dataloaders.labels_te[j]))
+            acc = models_cobweb.test(model, loaders_te[j])
+            print("Test accuracy: {}".format(acc))
+            test_accs.append(acc)
+
+    print("\n\nThis is the end of the experiments.")
+    print("There are {} test accuracy data in total.".format(len(test_accs)))
+    return test_accs
diff --git a/concept_formation/vision-experiments/experiment_2.py b/concept_formation/vision-experiments/experiment_2.py
new file mode 100644
index 0000000..e2e506d
--- /dev/null
+++ b/concept_formation/vision-experiments/experiment_2.py
@@ -0,0 +1,124 @@
+import torch
+
+from concept_formation.cobweb_torch import CobwebTorchTree
+from datasets_mnist import dataloaders_2
+import models_nn
+import models_cobweb
+
+
+def reconfig(general_config, model_config, data_config):
+    """
+    Re-initialization for model_config and data_config based on requirements of experiments 2.
+    """
+    if data_config['dataset'] == 'mnist':
+        data_config['split_size'] = 5400
+        data_config['size_all_tr_each'] = 600
+    else:
+        data_config['split_size'] = 4500
+        data_config['size_all_tr_each'] = 500
+    if model_config['type'] == 'cobweb':
+        # Fix: was `general_config['cuda'] == False` — a comparison, not an
+        # assignment, so cobweb never actually forced CPU mode.
+        general_config['cuda'] = False
+        data_config['batch_size_tr'] = 60000
+        data_config['batch_size_te'] = 10000
+
+
+def experiment_2_nn(general_config, model_config, data_config, dataset_tr, dataset_te, verbose):
+
+    reconfig(general_config, model_config, data_config)
+    label = general_config['label']
+    size_all_tr_each = data_config['size_all_tr_each']
+
+    # Dataloaders:
+    dataloaders = dataloaders_2(general_config, data_config, dataset_tr, dataset_te, verbose=verbose)
+    loaders_tr = dataloaders.training_loaders
+    loaders_te = dataloaders.test_loaders
+
+    # Models and optimizers:
+    device = torch.device("cuda" if general_config['cuda'] else "cpu")
+    model, optimizer = models_nn.build_model(model_config, data_config, device)
+
+    # Store the test
accuracies + test_accs = [] + + if verbose: + print('\n\n' + ' START EXPERIMENTS '.center(70, '~')) + print("Experiments type: 2") + print("Experiments description: Train all data from label {} and some data ({}) from each remaining label first," + " then train the rest with sequential splits.".format(label, size_all_tr_each)) + print("Number of Train-test trials:", len(loaders_tr)) + print("Model:", model_config['type']) # fc or cnn + print("Seed:", general_config['seed']) + print("The selected label:", label) + print("Epochs:", model_config['epoch']) + print("\nModel overview:") + print(model) + print("\nOptimizer:") + print(optimizer) + print("\nCUDA is {}used.".format("" if general_config['cuda'] else "NOT ")) + + for i in range(len(loaders_tr)): + if verbose: + print("\n\n" + " Trial {} ".format(i+1).center(70, '=')) + + for epoch in range(1, model_config['epoch'] + 1): + if verbose: + print("\n\n [Epoch {}]".format(epoch)) + print("\n====> Model Training with labels {} <====".format(dataloaders.labels_tr[i])) + models_nn.train(model, optimizer, loaders_tr[i], epoch, model_config['log_interval'], device) + + for j in range(len(loaders_te)): + if verbose: + print("\n----> Model Testing with labels {} <----".format(dataloaders.labels_te[j])) + acc = models_nn.test(model, loaders_te[j], device) + test_accs.append(acc.item()) + + print("\n\nThis is the end of the experiments.") + print("There are {} test accuracy data in total.".format(len(test_accs))) + return test_accs + + +def experiment_2_cobweb(general_config, model_config, data_config, dataset_tr, dataset_te, verbose): + + reconfig(general_config, model_config, data_config) + label = general_config['label'] + size_all_tr_each = data_config['size_all_tr_each'] + + # Dataloaders: + dataloaders = dataloaders_2(general_config, data_config, dataset_tr, dataset_te, verbose=verbose) + loaders_tr = dataloaders.training_loaders + loaders_te = dataloaders.test_loaders + + # Models and optimizers: + example_imgs, _ = 
next(iter(loaders_tr[0])) + model = CobwebTorchTree(example_imgs.shape[1:]) + + # Store the test accuracies + test_accs = [] + + if verbose: + print('\n\n' + ' START EXPERIMENTS '.center(70, '~')) + print("Experiments type: 2") + print("Experiments description: Train all data from label {} and some data ({}) from each remaining label first," + " then train the rest with sequential splits.".format(label, size_all_tr_each)) + print("Number of Train-test trials:", len(loaders_tr)) + print("Model:", model_config['type']) # cobweb + print("Seed:", general_config['seed']) + print("The selected label:", label) + print("\nCUDA is {}used.".format("" if general_config['cuda'] else "NOT ")) + + for i in range(len(loaders_tr)): + if verbose: + print("\n\n" + " Trial {} ".format(i+1).center(70, '=')) + print("\n====> Model Training with labels {} <====".format(dataloaders.labels_tr[i])) + models_cobweb.train(model, loaders_tr[i]) + + for j in range(len(loaders_te)): + if verbose: + print("\n----> Model Testing with labels {} <----".format(dataloaders.labels_te[j])) + acc = models_cobweb.test(model, loaders_te[j]) + print("Test accuracy: {}".format(acc)) + test_accs.append(acc) + + print("\n\nThis is the end of the experiments.") + print("There are {} test accuracy data in total.".format(len(test_accs))) + return test_accs diff --git a/concept_formation/vision-experiments/experiment_3.py b/concept_formation/vision-experiments/experiment_3.py new file mode 100644 index 0000000..2f46f9e --- /dev/null +++ b/concept_formation/vision-experiments/experiment_3.py @@ -0,0 +1,126 @@ +import torch + +from concept_formation.cobweb_torch import CobwebTorchTree +from datasets_mnist import dataloaders_3 +import models_nn +import models_cobweb + + +def reconfig(general_config, model_config, data_config): + """ + Re-initialization for model_config and data_config based on requirements of experiments 0. 
+ """ + if data_config['dataset'] == 'mnist': + data_config['split_size'] = 5400 + data_config['size_all_tr_each'] = 600 + else: + data_config['split_size'] = 4500 + data_config['size_all_tr_each'] = 500 + if model_config['type'] == 'cobweb': + general_config['cuda'] == False + data_config['batch_size_tr'] = 60000 + data_config['batch_size_te'] = 10000 + + +def experiment_3_nn(general_config, model_config, data_config, dataset_tr, dataset_te, verbose): + + reconfig(general_config, model_config, data_config) + label = general_config['label'] + size_all_tr_each = data_config['size_all_tr_each'] + + # Dataloaders: + dataloaders = dataloaders_3(general_config, data_config, dataset_tr, dataset_te, verbose=verbose) + loaders_tr = dataloaders.training_loaders + loaders_te = dataloaders.test_loaders + + # Models and optimizers: + device = torch.device("cuda" if general_config['cuda'] else "cpu") + model, optimizer = models_nn.build_model(model_config, data_config, device) + + # Store the test accuracies + test_accs = [] + + if verbose: + print('\n\n' + ' START EXPERIMENTS '.center(70, '~')) + print("Experiments type: 3") + print("Experiments description: Train half data from label {} and some data ({}) from each remaining label first," + " then train the rest with sequential splits.".format(label, size_all_tr_each)) + print("\tIn the last {} splits, fit in the remaining training data from label {} evenly. 
(ReLearning)".format(dataloaders.n_relearning, label)) + print("Number of Train-test trials:", len(loaders_tr)) + print("Model:", model_config['type']) # fc or cnn + print("Seed:", general_config['seed']) + print("The selected label:", label) + print("Epochs:", model_config['epoch']) + print("\nModel overview:") + print(model) + print("\nOptimizer:") + print(optimizer) + print("\nCUDA is {}used.".format("" if general_config['cuda'] else "NOT ")) + + for i in range(len(loaders_tr)): + if verbose: + print("\n\n" + " Trial {} ".format(i+1).center(70, '=')) + + for epoch in range(1, model_config['epoch'] + 1): + if verbose: + print("\n\n [Epoch {}]".format(epoch)) + print("\n====> Model Training with labels {} <====".format(dataloaders.labels_tr[i])) + models_nn.train(model, optimizer, loaders_tr[i], epoch, model_config['log_interval'], device) + + for j in range(len(loaders_te)): + if verbose: + print("\n----> Model Testing with labels {} <----".format(dataloaders.labels_te[j])) + acc = models_nn.test(model, loaders_te[j], device) + test_accs.append(acc.item()) + + print("\n\nThis is the end of the experiments.") + print("There are {} test accuracy data in total.".format(len(test_accs))) + return test_accs + + +def experiment_3_cobweb(general_config, model_config, data_config, dataset_tr, dataset_te, verbose): + + reconfig(general_config, model_config, data_config) + label = general_config['label'] + size_all_tr_each = data_config['size_all_tr_each'] + + # Dataloaders: + dataloaders = dataloaders_3(general_config, data_config, dataset_tr, dataset_te, verbose=verbose) + loaders_tr = dataloaders.training_loaders + loaders_te = dataloaders.test_loaders + + # Models and optimizers: + example_imgs, _ = next(iter(loaders_tr[0])) + model = CobwebTorchTree(example_imgs.shape[1:]) + + # Store the test accuracies + test_accs = [] + + if verbose: + print('\n\n' + ' START EXPERIMENTS '.center(70, '~')) + print("Experiments type: 3") + print("Experiments description: Train half 
data from label {} and some data ({}) from each remaining label first," + " then train the rest with sequential splits.".format(label, size_all_tr_each)) + print("\tIn the last {} splits, fit in the remaining training data from label {} evenly. (ReLearning)".format(dataloaders.n_relearning, label)) + print("Number of Train-test trials:", len(loaders_tr)) + print("Model:", model_config['type']) # cobweb + print("Seed:", general_config['seed']) + print("The selected label:", label) + print("\nCUDA is {}used.".format("" if general_config['cuda'] else "NOT ")) + + for i in range(len(loaders_tr)): + if verbose: + print("\n\n" + " Trial {} ".format(i+1).center(70, '=')) + print("\n====> Model Training with labels {} <====".format(dataloaders.labels_tr[i])) + models_cobweb.train(model, loaders_tr[i]) + + for j in range(len(loaders_te)): + if verbose: + print("\n----> Model Testing with labels {} <----".format(dataloaders.labels_te[j])) + acc = models_cobweb.test(model, loaders_te[j]) + print("Test accuracy: {}".format(acc)) + test_accs.append(acc) + + print("\n\nThis is the end of the experiments.") + print("There are {} test accuracy data in total.".format(len(test_accs))) + return test_accs diff --git a/concept_formation/vision-experiments/experiment_4.py b/concept_formation/vision-experiments/experiment_4.py new file mode 100644 index 0000000..165bbf5 --- /dev/null +++ b/concept_formation/vision-experiments/experiment_4.py @@ -0,0 +1,126 @@ +import torch + +from concept_formation.cobweb_torch import CobwebTorchTree +from datasets_mnist import dataloaders_4 +import models_nn +import models_cobweb + + +def reconfig(general_config, model_config, data_config): + """ + Re-initialization for model_config and data_config based on requirements of experiments 0. 
+ """ + if data_config['dataset'] == 'mnist': + data_config['split_size'] = 5400 + data_config['size_all_tr_each'] = 600 + else: + data_config['split_size'] = 4500 + data_config['size_all_tr_each'] = 500 + if model_config['type'] == 'cobweb': + general_config['cuda'] == False + data_config['batch_size_tr'] = 60000 + data_config['batch_size_te'] = 10000 + + +def experiment_4_nn(general_config, model_config, data_config, dataset_tr, dataset_te, verbose): + + reconfig(general_config, model_config, data_config) + label = general_config['label'] + size_all_tr_each = data_config['size_all_tr_each'] + + # Dataloaders: + dataloaders = dataloaders_4(general_config, data_config, dataset_tr, dataset_te, verbose=verbose) + loaders_tr = dataloaders.training_loaders + loaders_te = dataloaders.test_loaders + + # Models and optimizers: + device = torch.device("cuda" if general_config['cuda'] else "cpu") + model, optimizer = models_nn.build_model(model_config, data_config, device) + + # Store the test accuracies + test_accs = [] + + if verbose: + print('\n\n' + ' START EXPERIMENTS '.center(70, '~')) + print("Experiments type: 4") + print("Experiments description: Train half data from label {} and some data ({}) from each remaining label first," + " then train the rest with sequential splits.".format(label, size_all_tr_each)) + print("\tIn the middle {} splits, fit in the remaining training data from label {} evenly. 
(ReLearning)".format(dataloaders.n_relearning, label)) + print("Number of Train-test trials:", len(loaders_tr)) + print("Model:", model_config['type']) # fc or cnn + print("Seed:", general_config['seed']) + print("The selected label:", label) + print("Epochs:", model_config['epoch']) + print("\nModel overview:") + print(model) + print("\nOptimizer:") + print(optimizer) + print("\nCUDA is {}used.".format("" if general_config['cuda'] else "NOT ")) + + for i in range(len(loaders_tr)): + if verbose: + print("\n\n" + " Trial {} ".format(i+1).center(70, '=')) + + for epoch in range(1, model_config['epoch'] + 1): + if verbose: + print("\n\n [Epoch {}]".format(epoch)) + print("\n====> Model Training with labels {} <====".format(dataloaders.labels_tr[i])) + models_nn.train(model, optimizer, loaders_tr[i], epoch, model_config['log_interval'], device) + + for j in range(len(loaders_te)): + if verbose: + print("\n----> Model Testing with labels {} <----".format(dataloaders.labels_te[j])) + acc = models_nn.test(model, loaders_te[j], device) + test_accs.append(acc.item()) + + print("\n\nThis is the end of the experiments.") + print("There are {} test accuracy data in total.".format(len(test_accs))) + return test_accs + + +def experiment_4_cobweb(general_config, model_config, data_config, dataset_tr, dataset_te, verbose): + + reconfig(general_config, model_config, data_config) + label = general_config['label'] + size_all_tr_each = data_config['size_all_tr_each'] + + # Dataloaders: + dataloaders = dataloaders_4(general_config, data_config, dataset_tr, dataset_te, verbose=verbose) + loaders_tr = dataloaders.training_loaders + loaders_te = dataloaders.test_loaders + + # Models and optimizers: + example_imgs, _ = next(iter(loaders_tr[0])) + model = CobwebTorchTree(example_imgs.shape[1:]) + + # Store the test accuracies + test_accs = [] + + if verbose: + print('\n\n' + ' START EXPERIMENTS '.center(70, '~')) + print("Experiments type: 4") + print("Experiments description: Train half 
data from label {} and some data ({}) from each remaining label first," + " then train the rest with sequential splits.".format(label, size_all_tr_each)) + print("\tIn the middle {} splits, fit in the remaining training data from label {} evenly. (ReLearning)".format(dataloaders.n_relearning, label)) + print("Number of Train-test trials:", len(loaders_tr)) + print("Model:", model_config['type']) # cobweb + print("Seed:", general_config['seed']) + print("The selected label:", label) + print("\nCUDA is {}used.".format("" if general_config['cuda'] else "NOT ")) + + for i in range(len(loaders_tr)): + if verbose: + print("\n\n" + " Trial {} ".format(i+1).center(70, '=')) + print("\n====> Model Training with labels {} <====".format(dataloaders.labels_tr[i])) + models_cobweb.train(model, loaders_tr[i]) + + for j in range(len(loaders_te)): + if verbose: + print("\n----> Model Testing with labels {} <----".format(dataloaders.labels_te[j])) + acc = models_cobweb.test(model, loaders_te[j]) + print("Test accuracy: {}".format(acc)) + test_accs.append(acc) + + print("\n\nThis is the end of the experiments.") + print("There are {} test accuracy data in total.".format(len(test_accs))) + return test_accs diff --git a/concept_formation/vision-experiments/experiment_5.py b/concept_formation/vision-experiments/experiment_5.py new file mode 100644 index 0000000..65f50d1 --- /dev/null +++ b/concept_formation/vision-experiments/experiment_5.py @@ -0,0 +1,159 @@ +import torch + +from concept_formation.cobweb_torch import CobwebTorchTree +from datasets_mnist import dataloaders_4 +import models_nn +import models_cobweb + + +def reconfig(general_config, model_config, data_config): + """ + Re-initialization for model_config and data_config based on requirements of experiments 0. 
+ """ + if data_config['dataset'] == 'mnist': + data_config['split_size'] = 5400 + data_config['size_all_tr_each'] = 600 + else: + data_config['split_size'] = 4500 + data_config['size_all_tr_each'] = 500 + if model_config['type'] == 'cobweb': + general_config['cuda'] == False + data_config['batch_size_tr'] = 60000 + data_config['batch_size_te'] = 10000 + + +def experiment_5_nn(general_config, model_config, data_config, dataset_tr, dataset_te, verbose): + + reconfig(general_config, model_config, data_config) + label = general_config['label'] + size_all_tr_each = data_config['size_all_tr_each'] + + # Dataloaders: + dataloaders = dataloaders_4(general_config, data_config, dataset_tr, dataset_te, verbose=verbose) + loaders_tr = dataloaders.training_loaders + loaders_te = dataloaders.test_loaders + + # Models and optimizers: + device = torch.device("cuda" if general_config['cuda'] else "cpu") + model, optimizer = models_nn.build_model(model_config, data_config, device) + + # Store the test accuracies + test_accs = [] + + if verbose: + print('\n\n' + ' START EXPERIMENTS '.center(70, '~')) + print("Experiments type: 5") + print("Experiments description: Train half data from label {} and some data ({}) from each remaining label first," + " then train the rest with sequential splits.".format(label, size_all_tr_each)) + print("\tIn the middle {} splits, fit in the remaining training data from label {} evenly. 
(ReLearning)".format(dataloaders.n_relearning, label)) + print("\tAfter all the process, repeated them again.") + print("Number of Train-test trials:", len(loaders_tr)) + print("Model:", model_config['type']) # fc or cnn + print("Seed:", general_config['seed']) + print("The selected label:", label) + print("Epochs:", model_config['epoch']) + print("\nModel overview:") + print(model) + print("\nOptimizer:") + print(optimizer) + print("\nCUDA is {}used.".format("" if general_config['cuda'] else "NOT ")) + + for i in range(len(loaders_tr)): + if verbose: + print("\n\n" + " Trial {} ".format(i+1).center(70, '=')) + + for epoch in range(1, model_config['epoch'] + 1): + if verbose: + print("\n\n [Epoch {}]".format(epoch)) + print("\n====> Model Training with labels {} <====".format(dataloaders.labels_tr[i])) + models_nn.train(model, optimizer, loaders_tr[i], epoch, model_config['log_interval'], device) + + for j in range(len(loaders_te)): + if verbose: + print("\n----> Model Testing with labels {} <----".format(dataloaders.labels_te[j])) + acc = models_nn.test(model, loaders_te[j], device) + test_accs.append(acc.item()) + + # Repeat the process: + for i in range(len(loaders_tr)): + if verbose: + print("\n\n" + " Trial {} ".format(i+len(loaders_tr)+1).center(70, '=')) + + for epoch in range(1, model_config['epoch'] + 1): + if verbose: + print("\n\n [Epoch {}]".format(epoch)) + print("\n====> Model Training with labels {} <====".format(dataloaders.labels_tr[i])) + models_nn.train(model, optimizer, loaders_tr[i], epoch, model_config['log_interval'], device) + + for j in range(len(loaders_te)): + if verbose: + print("\n----> Model Testing with labels {} <----".format(dataloaders.labels_te[j])) + acc = models_nn.test(model, loaders_te[j], device) + test_accs.append(acc.item()) + + print("\n\nThis is the end of the experiments.") + print("There are {} test accuracy data in total.".format(len(test_accs))) + return test_accs + + +def experiment_5_cobweb(general_config, 
model_config, data_config, dataset_tr, dataset_te, verbose): + + reconfig(general_config, model_config, data_config) + label = general_config['label'] + size_all_tr_each = data_config['size_all_tr_each'] + + # Dataloaders: + dataloaders = dataloaders_4(general_config, data_config, dataset_tr, dataset_te, verbose=verbose) + loaders_tr = dataloaders.training_loaders + loaders_te = dataloaders.test_loaders + + # Models and optimizers: + example_imgs, _ = next(iter(loaders_tr[0])) + model = CobwebTorchTree(example_imgs.shape[1:]) + + # Store the test accuracies + test_accs = [] + + if verbose: + print('\n\n' + ' START EXPERIMENTS '.center(70, '~')) + print("Experiments type: 5") + print("Experiments description: Train half data from label {} and some data ({}) from each remaining label first," + " then train the rest with sequential splits.".format(label, size_all_tr_each)) + print("\tIn the middle {} splits, fit in the remaining training data from label {} evenly. (ReLearning)".format(dataloaders.n_relearning, label)) + print("\tAfter all the process, repeated them again.") + print("Number of Train-test trials:", len(loaders_tr)) + print("Model:", model_config['type']) # cobweb + print("Seed:", general_config['seed']) + print("The selected label:", label) + print("\nCUDA is {}used.".format("" if general_config['cuda'] else "NOT ")) + + for i in range(len(loaders_tr)): + if verbose: + print("\n\n" + " Trial {} ".format(i+1).center(70, '=')) + print("\n====> Model Training with labels {} <====".format(dataloaders.labels_tr[i])) + models_cobweb.train(model, loaders_tr[i]) + + for j in range(len(loaders_te)): + if verbose: + print("\n----> Model Testing with labels {} <----".format(dataloaders.labels_te[j])) + acc = models_cobweb.test(model, loaders_te[j]) + print("Test accuracy: {}".format(acc)) + test_accs.append(acc) + + # Repeat the process: + for i in range(len(loaders_tr)): + if verbose: + print("\n\n" + " Trial {} ".format(i+len(loaders_tr)+1).center(70, '=')) + 
print("\n====> Model Training with labels {} <====".format(dataloaders.labels_tr[i])) + models_cobweb.train(model, loaders_tr[i]) + + for j in range(len(loaders_te)): + if verbose: + print("\n----> Model Testing with labels {} <----".format(dataloaders.labels_te[j])) + acc = models_cobweb.test(model, loaders_te[j]) + print("Test accuracy: {}".format(acc)) + test_accs.append(acc) + + print("\n\nThis is the end of the experiments.") + print("There are {} test accuracy data in total.".format(len(test_accs))) + return test_accs diff --git a/concept_formation/vision-experiments/main_cifar.py b/concept_formation/vision-experiments/main_cifar.py new file mode 100644 index 0000000..7c39676 --- /dev/null +++ b/concept_formation/vision-experiments/main_cifar.py @@ -0,0 +1,153 @@ +import os +import pandas as pd +import numpy as np +import time +import torch +from torch import optim +import argparse +from tqdm import tqdm + +from options import general_options, data_options, model_options +from datasets_cifar import CIFAR_dataset +from experiment_0 import experiment_0_nn, experiment_0_cobweb +from experiment_1 import experiment_1_nn, experiment_1_cobweb +from experiment_2 import experiment_2_nn, experiment_2_cobweb +from experiment_3 import experiment_3_nn, experiment_3_cobweb + + +def checkattr(args, attr): + '''Check whether attribute exists, whether it's a boolean and whether its value is True.''' + return hasattr(args, attr) and type(getattr(args, attr))==bool and getattr(args, attr) + + + +def accuracy2csv(general_config, model_config, test_accs): + data = {'test_acc': test_accs} + df = pd.DataFrame(data) + + model = model_config['type'] + seed = general_config['seed'] + label = general_config['label'] + experiment = general_config['type'] + + folder_name = 'test_accs' + if not os.path.exists(folder_name): + os.makedirs(folder_name) + + file_name = 'cifar_e' + str(experiment) + '_' + model + '_s' + str(seed) + '_l' + str(label) + "_accs.csv" + file_path = 
os.path.join(folder_name, file_name)
+    df.to_csv(file_path, index=False)
+
+
+
+def experiments(args):
+
+    verbose = True
+
+    general_config = {
+        'type': args.experiment,
+        'label': args.label,
+        'seed': args.seed,
+        'cuda': False,
+    }
+
+    model_config = {
+        'type': args.model_type,
+        'lr': args.lr,
+        'epoch': args.epoch,
+        'log_interval': args.log_interval,
+        'momentum': args.momentum,
+        'kernel': args.kernel,
+    }
+
+    data_config = {
+        # Basic info of the initial dataset:
+        'dataset': 'cifar',
+        'available_labels': 10,
+        'image_size': 32,
+
+        # Size of each dataset:
+        'size_all_tr_each': args.size_all_tr_each,
+        'split_size': args.split_size,
+        'n_relearning': args.n_relearning,
+
+        # settings:
+        'shuffle': True,
+        'normalize': False,
+        'batch_size_tr': args.batch_size_tr,
+        'batch_size_te': args.batch_size_te,
+        'drop_last': False,
+        'pad': False,
+        'permutation': False,
+    }
+
+    if checkattr(args, 'no_shuffle'):
+        data_config['shuffle'] = False
+    if checkattr(args, 'normalize'):
+        data_config['normalize'] = True
+    if checkattr(args, 'drop_last'):
+        data_config['drop_last'] = True
+    if checkattr(args, 'pad'):
+        data_config['pad'] = True
+    if checkattr(args, 'permutation'):
+        data_config['permutation'] = True
+
+    # Cuda:
+    # Fix: the flag was written into model_config['cuda'], but torch.device
+    # below and every experiment_* function read general_config['cuda']
+    # (initialized False), so CUDA was never enabled and --no_cuda had no effect.
+    if torch.cuda.is_available():
+        general_config['cuda'] = True
+    if checkattr(args, 'no_cuda'):
+        general_config['cuda'] = False
+    device = torch.device("cuda" if general_config['cuda'] else "cpu")
+
+    # Load initial datasets:
+    dataset_tr = CIFAR_dataset(split='train',
+                               pad=data_config['pad'],
+                               normalize=data_config['normalize'],
+                               permutation=data_config['permutation'],
+                               download=True, verbose=verbose)
+    dataset_te = CIFAR_dataset(split='test',
+                               pad=data_config['pad'],
+                               normalize=data_config['normalize'],
+                               permutation=data_config['permutation'],
+                               download=True, verbose=verbose)
+
+    experiment = general_config['type']
+    model_type = model_config['type']
+    label = general_config['label']
+    split_size = data_config['split_size']
+    log_interval
= model_config['log_interval'] + + if experiment == 0: + if model_type == 'cobweb': + test_accs = experiment_0_cobweb(general_config, model_config, data_config, dataset_tr, dataset_te, verbose) + else: + test_accs = experiment_0_nn(general_config, model_config, data_config, dataset_tr, dataset_te, verbose) + if experiment == 1: + if model_type == 'cobweb': + test_accs = experiment_1_cobweb(general_config, model_config, data_config, dataset_tr, dataset_te, verbose) + else: + test_accs = experiment_1_nn(general_config, model_config, data_config, dataset_tr, dataset_te, verbose) + if experiment == 2: + if model_type == 'cobweb': + test_accs = experiment_2_cobweb(general_config, model_config, data_config, dataset_tr, dataset_te, verbose) + else: + test_accs = experiment_2_nn(general_config, model_config, data_config, dataset_tr, dataset_te, verbose) + if experiment == 3: + if model_type == 'cobweb': + test_accs = experiment_3_cobweb(general_config, model_config, data_config, dataset_tr, dataset_te, verbose) + else: + test_accs = experiment_3_nn(general_config, model_config, data_config, dataset_tr, dataset_te, verbose) + accuracy2csv(general_config, model_config, test_accs) + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser("./main_cifar.py", + description="Run sets of experiments based on the MNIST dataset.") + parser = general_options(parser) + parser = data_options(parser) + parser = model_options(parser) + args = parser.parse_args() + + experiments(args) + diff --git a/concept_formation/vision-experiments/main_mnist.py b/concept_formation/vision-experiments/main_mnist.py new file mode 100644 index 0000000..3d85b36 --- /dev/null +++ b/concept_formation/vision-experiments/main_mnist.py @@ -0,0 +1,257 @@ +import os +import pandas as pd +import numpy as np +import time +import torch +from torch import optim +import argparse +from tqdm import tqdm + +from options import general_options, data_options, model_options +from datasets_mnist import 
# --------------------------------------------------------------------------- #
# Experiment driver for the MNIST continual-learning experiments.
# NOTE(review): numpy/pandas/os/torch are imported at the top of this file
# (above this chunk); the heavy project imports (experiment_*, dataset,
# options) are deferred to function scope so importing this module is cheap.
# --------------------------------------------------------------------------- #

# Every training split is evaluated on ten per-label test loaders plus one
# combined "All" loader.
N_LABELS = 10
TESTS_PER_SPLIT = N_LABELS + 1


def checkattr(args, attr):
    """Return True iff *args* has attribute *attr*, it is a bool, and it is True."""
    return (hasattr(args, attr) and isinstance(getattr(args, attr), bool)
            and getattr(args, attr))


def accuracy2csv(general_config, model_config, data_config, test_accs):
    """Write the per-split test accuracies of one run to
    ``test_accs/exp<k>/L<label>_<model>_S<seed>.csv``.

    *test_accs* must hold ``TESTS_PER_SPLIT`` accuracies per training split
    (doubled for experiment 5, which evaluates every split twice).
    """
    model = model_config['type']
    if model == 'cobweb':
        model = 'cobweb4v'
    seed = general_config['seed']
    label = general_config['label']
    experiment = general_config['type']
    n_split = data_config['n_split']

    # Build the TrainSet column: each split tag is repeated once per test set.
    trainset = []
    if experiment == 0:
        for i in range(1, n_split + 1):
            trainset += ["S%d" % i] * TESTS_PER_SPLIT
    elif experiment in (1, 2):
        for i in range(n_split):
            trainset += ["S%d" % i] * TESTS_PER_SPLIT
    elif experiment == 3:
        n_relearning = data_config['n_relearning']
        for i in range(n_split - n_relearning):
            trainset += ["S%d" % i] * TESTS_PER_SPLIT
        for i in range(1, n_relearning + 1):
            trainset += ["R%d" % i] * TESTS_PER_SPLIT
    else:  # experiments 4 and 5: the relearning block sits in the middle
        n_relearning = data_config['n_relearning']
        middle_split = n_split // 2
        if n_relearning % 2 != 0:
            index_start_relearning = middle_split - n_relearning // 2
        else:
            index_start_relearning = int(middle_split - (n_relearning / 2 - 1))
        for i in range(index_start_relearning):
            trainset += ["S%d" % i] * TESTS_PER_SPLIT
        for i in range(n_relearning):
            trainset += ["R%d" % i] * TESTS_PER_SPLIT
        # NOTE(review): the splits AFTER the relearning block are also tagged
        # "R%d" here (kept from the original); "S%d" looks more plausible --
        # confirm against the plotting scripts before changing.
        for i in range(n_split - n_relearning - index_start_relearning):
            trainset += ["R%d" % (i + index_start_relearning)] * TESTS_PER_SPLIT

    # Relabel the test sets so the chosen label becomes "L0" and the other
    # labels keep their original order as L1..L9: labels below the chosen one
    # shift up by one, labels above it keep their index.
    testset = ["L0" if i == label else "L%d" % (i + 1 if i < label else i)
               for i in range(N_LABELS)]
    testset.append("All")
    testset = testset * n_split

    columns = {
        'TrainSet': trainset,
        'TestSet': testset,
        'Model': [model] * len(trainset),
        'Seed': [seed] * len(trainset),
        'Experiment': [experiment] * len(trainset),
    }
    if experiment == 5:
        # Experiment 5 logs two passes over the same splits.
        columns = {key: value * 2 for key, value in columns.items()}
    columns['TestAccuracy'] = test_accs
    df = pd.DataFrame(columns)

    folder_path = os.path.join('test_accs', f'exp{experiment}')
    os.makedirs(folder_path, exist_ok=True)
    file_name = f"L{label}_{model}_S{seed}.csv"
    df.to_csv(os.path.join(folder_path, file_name), index=False)


def experiments(args):
    """Build the config dicts from parsed *args*, run the selected experiment
    with the selected model, and persist the resulting accuracies via
    :func:`accuracy2csv`."""
    # Deferred project imports (keep module import side-effect free).
    from datasets_mnist import MNIST_dataset  # NOTE(review): module name assumed from the datasets_cifar.py sibling -- confirm
    from experiment_0 import experiment_0_nn, experiment_0_cobweb
    from experiment_1 import experiment_1_nn, experiment_1_cobweb
    from experiment_2 import experiment_2_nn, experiment_2_cobweb
    from experiment_3 import experiment_3_nn, experiment_3_cobweb
    from experiment_4 import experiment_4_nn, experiment_4_cobweb
    from experiment_5 import experiment_5_nn, experiment_5_cobweb

    verbose = True

    general_config = {
        'type': args.experiment,
        'label': args.label,
        'seed': args.seed,
        'cuda': False,  # filled in below once availability is known
    }

    model_config = {
        'type': args.model_type,
        'lr': args.lr,
        'epoch': args.epoch,
        'log_interval': args.log_interval,
        'momentum': args.momentum,
        'kernel': args.kernel,
        'n_hidden': args.n_hidden,
        'n_nodes': args.n_nodes,
    }

    data_config = {
        # Basic info of the initial dataset:
        'dataset': 'mnist',
        'available_labels': 10,
        'image_size': 28,
        # Size of each dataset:
        'size_all_tr_each': args.size_all_tr_each,
        'n_split': args.n_split,
        'split_size': args.split_size,
        'n_relearning': args.n_relearning,
        # Settings (boolean CLI flags folded in directly):
        'shuffle': not checkattr(args, 'no_shuffle'),
        'normalize': checkattr(args, 'normalize'),
        'batch_size_tr': args.batch_size_tr,
        'batch_size_te': args.batch_size_te,
        'drop_last': checkattr(args, 'drop_last'),
        'pad': checkattr(args, 'pad'),
        'permutation': checkattr(args, 'permutation'),
    }

    # BUG FIX: the original stored the cuda flag only in model_config but read
    # general_config['cuda'] (always False) when building the device, so CUDA
    # was silently never used.  Compute the flag once and keep both in sync.
    use_cuda = torch.cuda.is_available() and not checkattr(args, 'no_cuda')
    general_config['cuda'] = use_cuda
    model_config['cuda'] = use_cuda

    # Load the initial train/test datasets once; the experiments slice them.
    dataset_tr = MNIST_dataset(split='train',
                               pad=data_config['pad'],
                               normalize=data_config['normalize'],
                               permutation=data_config['permutation'],
                               download=True, verbose=verbose)
    dataset_te = MNIST_dataset(split='test',
                               pad=data_config['pad'],
                               normalize=data_config['normalize'],
                               permutation=data_config['permutation'],
                               download=True, verbose=verbose)

    # Dispatch table (replaces the original six-way if-chain).
    dispatch = {
        0: (experiment_0_cobweb, experiment_0_nn),
        1: (experiment_1_cobweb, experiment_1_nn),
        2: (experiment_2_cobweb, experiment_2_nn),
        3: (experiment_3_cobweb, experiment_3_nn),
        4: (experiment_4_cobweb, experiment_4_nn),
        5: (experiment_5_cobweb, experiment_5_nn),
    }
    cobweb_fn, nn_fn = dispatch[general_config['type']]
    run = cobweb_fn if model_config['type'] == 'cobweb' else nn_fn
    test_accs = run(general_config, model_config, data_config,
                    dataset_tr, dataset_te, verbose)
    accuracy2csv(general_config, model_config, data_config, test_accs)


if __name__ == '__main__':
    import argparse
    from options import general_options, data_options, model_options

    parser = argparse.ArgumentParser(
        "./main_mnist.py",
        description="Run sets of experiments based on the MNIST dataset.")
    parser = general_options(parser)
    parser = data_options(parser)
    parser = model_options(parser)
    experiments(parser.parse_args())
0000000..a2cf8f6 --- /dev/null +++ b/concept_formation/vision-experiments/models_cobweb.py @@ -0,0 +1,25 @@ +from tqdm import tqdm +import torch + + +def train(tree, tr_loader): + imgs, labels = next(iter(tr_loader)) + for i in tqdm(range(imgs.shape[0])): + tree.ifit(imgs[i], labels[i].item()) + +def test(tree, te_loader): + imgs, labels = next(iter(te_loader)) + + correct_prediction = 0 + for i in tqdm(range(imgs.shape[0])): + actual_label = labels[i] + o = tree.categorize(imgs[i]) + out, out_label = o.predict() + + predicted_label = torch.tensor(out_label) + if predicted_label == actual_label: + correct_prediction += 1 + + accuracy = correct_prediction / len(imgs) + return accuracy + \ No newline at end of file diff --git a/concept_formation/vision-experiments/models_nn.py b/concept_formation/vision-experiments/models_nn.py new file mode 100644 index 0000000..01e4d83 --- /dev/null +++ b/concept_formation/vision-experiments/models_nn.py @@ -0,0 +1,127 @@ +import torch +from torch import optim, nn +import torch.nn.functional as F +from tqdm import tqdm + + +class FC_CNN(nn.Module): + def __init__(self, available_labels=10, kernel_size=5): + super(FC_CNN, self).__init__() + self.conv1 = nn.Conv2d(1, 10, kernel_size=kernel_size) + self.conv2 = nn.Conv2d(10, 20, kernel_size=kernel_size) + self.conv2_dropout = nn.Dropout2d() + self.fc1 = nn.Linear(320, 50) + self.fc2 = nn.Linear(50, available_labels) + + def forward(self, x): + x = F.relu(F.max_pool2d(self.conv1(x), 2)) + x = F.relu(F.max_pool2d(self.conv2_dropout(self.conv2(x)), 2)) + x = x.view(-1, 320) + x = F.relu(self.fc1(x)) + x = F.dropout(x, training=self.training) + x = self.fc2(x) + return F.log_softmax(x) + + +class FC(nn.Module): + def __init__(self, n_hidden, n_nodes, available_labels=10, image_size=28): + super(FC, self).__init__() + # self.fc1 = nn.Linear(image_size * image_size, 100) + # self.fc2 = nn.Linear(100, available_labels) + self.n_hidden = n_hidden + self.hidden_layers = 
nn.ModuleList([nn.Linear(image_size * image_size, n_nodes)]) + for _ in range(self.n_hidden): + self.hidden_layers.append(nn.Linear(n_nodes, n_nodes)) + self.fc_out = nn.Linear(n_nodes, 10) + + def forward(self, x): + x = x.view(x.size(0), -1) + # x = self.fc1(x) + # x = F.relu(self.fc1(x)) + # # x = F.dropout(x, training=self.training) + # x = self.fc2(x) + + for layer in self.hidden_layers: + x = torch.relu(layer(x)) + x = self.fc_out(x) + return F.log_softmax(x) + + +def build_model(model_config, data_config, device): + + available_labels = data_config['available_labels'] + image_size = data_config['image_size'] + kernel_size = model_config['kernel'] + lr = model_config['lr'] + momentum = model_config['momentum'] + n_hidden = model_config['n_hidden'] + n_nodes = model_config['n_nodes'] + + if model_config['type'] == 'fc': + model = FC(n_hidden=n_hidden, n_nodes=n_nodes, available_labels=available_labels, image_size=image_size).to(device) + else: + model = FC_CNN(available_labels=available_labels, kernel_size=kernel_size).to(device) + # if cuda: + # model = model.cuda() + optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum) + return model, optimizer + + +def train(model, optimizer, train_loader, epoch, log_interval, device): + + # batch_size = data_config['batch_size_tr'] + # log_interval = model_config['log_interval'] + + model.train() + running_loss = 0.0 + # train_losses = [] + # train_counter = [] + # train_loader = tqdm(train_loader) # include a progress bar + + with tqdm(train_loader, unit='batch') as tepoch: # include a progress bar + tepoch.set_description(f"Epoch {epoch}") + for batch_id, (imgs, labels) in enumerate(train_loader): + optimizer.zero_grad() + outputs = model(imgs) + loss = F.nll_loss(outputs, labels) + loss.backward() + optimizer.step() + if batch_id % log_interval == 0: + print("Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format( + epoch, + batch_id * len(imgs), + len(train_loader.dataset), + 100. 
* batch_id / len(train_loader), + loss.item())) + # train_losses.append(loss.item()) + # train_counter.append((batch_id * batch_size) + ((epoch - 1) * len*(train_loader.dataset))) + # torch.save(model.state_dict(), '/results/model.pth') + # torch.save(optimizer.state_dict(), '/results/optimizer/pth') + running_loss += loss.item() + tepoch.set_postfix(loss=running_loss / len(tepoch)) + + +def test(model, test_loader, device): + # test_loader = tqdm(test_loader) # include a progress bar + model.eval() + # test_losses = [] + test_loss = 0 + correct = 0 + accuracy = 0. + with tqdm(test_loader, unit='batch') as tepoch: # include a progress bar + tepoch.set_description("Testing") + with torch.no_grad(): + for imgs, labels in test_loader: + imgs, labels = imgs.to(device), labels.to(device) + outputs = model(imgs) + test_loss += F.nll_loss(outputs, labels, size_average=False).item() + preds = outputs.data.max(1, keepdim=True)[1] + correct += preds.eq(labels.data.view_as(preds)).sum() + test_loss /= len(test_loader.dataset) + # test_losses.append(test_loss) + accuracy = correct / len(test_loader.dataset) + print('\nTest set: Avg. loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format( + test_loss, correct, len(test_loader.dataset), 100. * accuracy)) + tepoch.set_postfix(accuracy=accuracy.item()) + return accuracy + diff --git a/concept_formation/vision-experiments/options.py b/concept_formation/vision-experiments/options.py new file mode 100644 index 0000000..9f5013d --- /dev/null +++ b/concept_formation/vision-experiments/options.py @@ -0,0 +1,43 @@ +""" +Define options. 
"""
Command-line option groups for the experiment drivers.

Each function receives an ``argparse.ArgumentParser``, registers one group of
arguments on it, and returns the same parser so calls can be chained.
"""


def general_options(parser):
    """Register experiment-selection and hardware options."""
    add = parser.add_argument
    add('--experiment', type=int, default=0, choices=[0, 1, 2, 3, 4, 5],
        help='type of experiments')
    # Accepts either a digit (coerced to int) or a plain string label.
    add('--label', type=lambda x: int(x) if x.isdigit() else x, default=0,
        help='the label to be separated from in the experiments')
    add('--no-cuda', action='store_true',
        help='do NOT use cuda (even if the condition holds)')
    return parser


def data_options(parser):
    """Register dataset-construction and loader options."""
    add = parser.add_argument
    add('--dataset', type=str, default='mnist', choices=['mnist', 'cifar'],
        help='Initial dataset chosen')
    add('--no-shuffle', action='store_true',
        help='do NOT shuffle the data when generating a dataset')
    add('--normalize', action='store_true', help='normalize every data')
    add('--seed', type=int, default=123, help='random seed')
    add('--n_split', type=int, default=10,
        help='the number of training splits in an experiment')
    add('--split-size', type=int, default=6000,
        help='# of training data in every split of remained labels training')
    add('--batch-size-tr', type=int, default=64,
        help='batch size of every training data loader')
    add('--batch-size-te', type=int, default=64,
        help='batch size of every test data loader')
    add('--pad', action='store_true', help='whether pad each image by 2 pixels')
    add('--drop-last', action='store_true',
        help='whether drop the last batch if its size is less than batch size')
    add('--permutation', action='store_true',
        help='whether permute the pixels of all images')
    add('--size-all-tr-each', type=int, default=600,
        help='the size of training data from each label that forms the concatenated training set (which has the size n_label * size_all_tr_each)')
    add('--n-relearning', type=int, default=5,
        help='number of splits that include relearning')
    return parser


def model_options(parser):
    """Register model-architecture and optimizer options."""
    add = parser.add_argument
    add('--model-type', type=str, default='cobweb',
        choices=['cobweb', 'fc', 'fc-cnn'], help='type of model')
    add('--lr', type=float, default=0.01, help='learning rate')
    add('--epoch', type=int, default=1, help='# of epochs of training')
    add('--log-interval', type=int, default=10, help='size of log interval')
    add('--momentum', type=float, default=0.5,
        help='the value of momentum parameter in optimizer SGD')
    add('--kernel', type=int, default=5,
        help='kernel size of the CNN model layers')
    add('--n-hidden', type=int, default=1,
        help='# of hidden layers in fc model')
    add('--n-nodes', type=int, default=100,
        help='# of nodes in each hidden layer in fc model')
    return parser
# exp0.R -- plots for Experiment 0 (all labels trained together, sequential splits).
library(readr)    # read_csv()
library(ggplot2)  # plotting
library(dplyr)    # BUG FIX: %>% and filter() are used below but dplyr was never loaded

exp0 <- read_csv(
  "Documents/GitHub/catastrophic-forgetting-cobweb/experiments/r_plots/exp0.csv",
  col_types = cols(Seed = col_character())
)

# Rename: the combined '0-9' test set becomes 'all'; the '0-9: k' train sets
# become plain split numbers 1..10 (replaces ten copy-pasted assignments).
exp0$TestSet[exp0$TestSet == '0-9'] <- 'all'
exp0$TrainSet <- sub("^0-9: ", "", exp0$TrainSet)
exp0$TrainSet <- factor(exp0$TrainSet, levels = as.character(1:10))
exp0$Model[exp0$Model == "COBWEB/4T"] <- "COBWEB/4V"

digit_palette <- c('0' = 'red', '1' = 'orange', '2' = 'green', '3' = 'blue',
                   '4' = 'pink', '5' = 'yellow', '6' = 'brown', '7' = 'purple',
                   '8' = 'darkgreen', '9' = 'lightblue', 'all' = 'black')

# Reusable layers: 95% quantile ribbon across seeds, a median trend line, and
# a median +/- sd error bar.
ci_ribbon <- stat_summary(
  fun.data = function(y) data.frame(y = mean(y),
                                    ymin = quantile(y, 0.025),
                                    ymax = quantile(y, 0.975)),
  geom = "ribbon", linetype = "blank", size = 1, alpha = 0.5
)
median_trend <- stat_summary(fun.data = function(y) data.frame(y = median(y)),
                             geom = "line", size = 1, alpha = 0.5)
sd_errorbar <- geom_errorbar(
  stat = "summary",
  fun.data = function(y) data.frame(ymin = median(y) - sd(y),
                                    ymax = median(y) + sd(y)),
  width = 0.2
)

# Per-test-set accuracy, faceted by model, ribbon + median line.
p <- ggplot(exp0, aes(x = TrainSet, y = TestAccuracy,
                      color = TestSet, fill = TestSet, group = TestSet)) +
  ci_ribbon + median_trend +
  labs(x = '# of Training Sets', y = 'Test Accuracy',
       title = 'Test Accuracy with Increasing Training Data from all Labels') +
  theme_minimal() +
  facet_wrap(~ Model, ncol = 3) +
  scale_color_manual(values = digit_palette) +
  scale_fill_manual(values = digit_palette) +
  theme(plot.title = element_text(hjust = 0.5))
print(p)

# Same view with error bars instead of ribbons.
p_err <- ggplot(exp0, aes(x = TrainSet, y = TestAccuracy,
                          color = TestSet, fill = TestSet, group = TestSet)) +
  sd_errorbar + median_trend +
  labs(x = '# of Training Sets', y = 'Test Accuracy',
       title = 'Test Accuracy with Increasing Training Data from all Labels') +
  theme_minimal() +
  facet_wrap(~ Model, ncol = 3) +
  scale_color_manual(values = digit_palette) +
  scale_fill_manual(values = digit_palette) +
  theme(plot.title = element_text(hjust = 0.5))
print(p_err)

# Only the chosen label ('0') versus the combined test set.
df_0 <- exp0 %>%
  filter(TestSet %in% c('0', 'all'))
p_0 <- ggplot(df_0, aes(x = TrainSet, y = TestAccuracy,
                        color = TestSet, fill = TestSet, group = TestSet)) +
  sd_errorbar + ci_ribbon + median_trend +
  labs(x = '# of Training Sets', y = 'Test Accuracy',
       title = 'Test Accuracy with Increasing Training Data from all Labels') +
  theme_minimal() +
  facet_wrap(~ Model, ncol = 3) +
  scale_color_manual(values = c('0' = 'red', 'all' = 'blue')) +
  scale_fill_manual(values = c('0' = 'red', 'all' = 'blue')) +
  theme(plot.title = element_text(hjust = 0.5))
print(p_0)

# Combined test set only, compared across models.
# NOTE(review): mean_cl_boot needs the Hmisc package at plot time -- confirm installed.
df_all <- exp0 %>%
  filter(TestSet %in% c('all'))
p_all <- ggplot(df_all, aes(x = TrainSet, y = TestAccuracy,
                            color = Model, fill = Model, group = Model)) +
  geom_errorbar(stat = "summary", fun.data = "mean_cl_boot", width = 0.2) +
  stat_summary(fun.data = "mean_cl_boot", geom = "line", size = 1, alpha = 0.5) +
  labs(x = '# of Training Sets', y = 'Test Accuracy',
       title = 'Test Accuracy of all Labels with Increasing Training Data from all Labels') +
  theme_minimal() +
  scale_color_manual(values = c('COBWEB/4V' = 'red', 'fc' = 'darkgreen', 'fc-CNN' = 'blue')) +
  scale_fill_manual(values = c('COBWEB/4V' = 'red', 'fc' = 'darkgreen', 'fc-CNN' = 'blue')) +
  theme(plot.title = element_text(hjust = 0.5))
print(p_all)
# exp1-2.R -- compare the chosen-label accuracy curves of Experiments 1 and 2.
library(readr)    # BUG FIX: was loaded twice in the original
library(ggplot2)
library(dplyr)    # BUG FIX: %>%, filter, group_by, summarize, mutate, select were used without loading dplyr

# Collapse the nine non-chosen labels into one averaged "Rest" test set and
# return only the rows for the chosen label (relabelled "Chosen").
chosen_rows <- function(df) {
  rest_labels <- paste0("L", 1:9)
  df$TrainSet <- factor(df$TrainSet, levels = paste0("S", 0:9))
  df$TestSet <- factor(df$TestSet, levels = c(paste0("L", 0:9), "All"))

  df_avg <- df %>%
    filter(TestSet %in% rest_labels) %>%
    group_by(TrainSet, Model, Label, Seed) %>%
    summarize(AvgTestAccuracy = mean(TestAccuracy))

  df_filtered <- df %>%
    filter(!TestSet %in% rest_labels)

  df_rest <- df_avg %>%
    mutate(TestSet = "Rest", TestAccuracy = AvgTestAccuracy) %>%
    select(-AvgTestAccuracy)

  out <- rbind(df_rest,
               df_filtered[, c("TrainSet", "Model", "Label", "Seed",
                               "TestSet", "TestAccuracy")])
  out$TestSet[out$TestSet == "L0"] <- "Chosen"
  out$TestSet <- factor(out$TestSet, levels = c("Chosen", "Rest", "All"))
  out %>% filter(TestSet == "Chosen")
}

exp1 <- read_csv("Documents/GitHub/catastrophic-forgetting-cobweb/experiments/r_plots/Data/exp1.csv")
exp1$TrainSet[exp1$TrainSet == "L0"] <- "S0"
exp1$Model[exp1$Model == "COBWEB/4T"] <- "COBWEB/4V"
df_chosen_1 <- chosen_rows(exp1)
df_chosen_1$Model <- paste0(df_chosen_1$Model, ", 1")   # tag the experiment number

exp2 <- read_csv("Documents/GitHub/catastrophic-forgetting-cobweb/experiments/r_plots/Data/exp2.csv")
df_chosen_2 <- chosen_rows(exp2)
df_chosen_2$Model <- paste0(df_chosen_2$Model, ", 2")

df_chosen_1_2 <- rbind(df_chosen_1, df_chosen_2)

# NOTE(review): mean_cl_boot needs the Hmisc package at plot time.
series_palette <- c('COBWEB/4V, 1' = 'red', 'COBWEB/4V, 2' = 'pink',
                    'fc, 1' = 'darkgreen', 'fc, 2' = 'lightgreen',
                    'fc-CNN, 1' = 'blue', 'fc-CNN, 2' = 'lightblue')

p_chosen <- ggplot(df_chosen_1_2,
                   aes(x = TrainSet, y = TestAccuracy,
                       color = Model, fill = Model, group = Model)) +
  geom_errorbar(stat = "summary", fun.data = "mean_cl_boot", width = 0.2) +
  stat_summary(fun.data = "mean_cl_boot", geom = "line", size = 1, alpha = 0.5) +
  labs(x = 'Incoming Training Split', y = 'Test Accuracy',
       title = 'Test Accuracy for Chosen Label with More Pre-defined Training Splits, Experiments 1 & 2') +
  theme_minimal() +
  scale_color_manual(values = series_palette) +
  scale_fill_manual(values = series_palette) +
  theme(plot.title = element_text(hjust = 0.5))
print(p_chosen)
# exp1.R -- Experiment 1: chosen / rest / all accuracy curves per model.
library(readr)    # read_csv()
library(ggplot2)  # plotting
library(dplyr)    # pipeline verbs

exp1 <- read_csv("Documents/GitHub/catastrophic-forgetting-cobweb/experiments/r_plots/Data/exp1.csv")

train_levels <- c('L0', paste0('S', 1:9))
test_levels <- c(paste0('L', 0:9), 'All')
rest_labels <- paste0('L', 1:9)

exp1$TrainSet <- factor(exp1$TrainSet, levels = train_levels)
exp1$TestSet <- factor(exp1$TestSet, levels = test_levels)
exp1$Model[exp1$Model == "COBWEB/4T"] <- "COBWEB/4V"

# Average the nine non-chosen labels into a single "Rest" test set.
df_avg <- exp1 %>%
  filter(TestSet %in% rest_labels) %>%
  group_by(TrainSet, Model, Label, Seed) %>%
  summarize(AvgTestAccuracy = mean(TestAccuracy))

# Keep only the chosen label and "All" from the raw rows.
df_filtered <- exp1 %>%
  filter(!TestSet %in% rest_labels)

df_rest <- df_avg %>%
  mutate(TestSet = "Rest", TestAccuracy = AvgTestAccuracy) %>%
  select(-AvgTestAccuracy)

df <- rbind(df_rest,
            df_filtered[, c("TrainSet", "Model", "Label", "Seed",
                            "TestSet", "TestAccuracy")])
df$TestSet[df$TestSet == "L0"] <- "Chosen"
df$TestSet <- factor(df$TestSet, levels = c('Chosen', 'Rest', 'All'))

# Chosen / Rest / All per model: 95% quantile ribbon plus a median line.
p <- ggplot(df, aes(x = TrainSet, y = TestAccuracy,
                    color = TestSet, fill = TestSet, group = TestSet)) +
  stat_summary(
    fun.data = function(y) data.frame(y = mean(y),
                                      ymin = quantile(y, 0.025),
                                      ymax = quantile(y, 0.975)),
    geom = "ribbon", linetype = "blank", size = 1, alpha = 0.5
  ) +
  stat_summary(fun.data = function(y) data.frame(y = median(y)),
               geom = "line", size = 1, alpha = 0.5) +
  labs(x = 'Incoming Training Split', y = 'Test Accuracy',
       title = 'Test Accuracy with More Pre-defined Training Splits, Experiment 1') +
  theme_minimal() +
  facet_wrap(~ Model, ncol = 3) +
  scale_color_manual(values = c('Chosen' = 'red', 'Rest' = 'darkgreen', 'All' = 'blue')) +
  scale_fill_manual(values = c('Chosen' = 'red', 'Rest' = 'darkgreen', 'All' = 'blue')) +
  theme(plot.title = element_text(hjust = 0.5))
print(p)

# Chosen label only, models compared directly (bootstrap CIs via Hmisc).
df_chosen <- df %>%
  filter(TestSet == 'Chosen')
model_palette <- c('COBWEB/4V' = 'red', 'fc' = 'darkgreen', 'fc-CNN' = 'blue')

p_chosen <- ggplot(df_chosen, aes(x = TrainSet, y = TestAccuracy,
                                  color = Model, fill = Model, group = Model)) +
  geom_errorbar(stat = "summary", fun.data = "mean_cl_boot", width = 0.2) +
  stat_summary(fun.data = "mean_cl_boot", geom = "line", size = 1, alpha = 0.5) +
  labs(x = 'Incoming Training Split', y = 'Test Accuracy',
       title = 'Test Accuracy for Chosen Label with More Pre-defined Training Splits, Experiment 1') +
  theme_minimal() +
  scale_color_manual(values = model_palette) +
  scale_fill_manual(values = model_palette) +
  theme(plot.title = element_text(hjust = 0.5))
print(p_chosen)
# exp3.R -- Experiment 3: chosen-label accuracy across normal (S) and
# relearning (R) training splits.
library(readr)    # BUG FIX: was loaded twice in the original
library(ggplot2)
library(dplyr)    # BUG FIX: %>%, filter, group_by, summarize, mutate, select were used without loading dplyr

exp3 <- read_csv("Documents/GitHub/catastrophic-forgetting-cobweb/experiments/r_plots/Data/exp3.csv")

exp3$TrainSet <- factor(exp3$TrainSet,
                        levels = c(paste0('S', 0:4), paste0('R', 1:5)))
exp3$TestSet <- factor(exp3$TestSet, levels = c(paste0('L', 0:9), 'All'))

rest_labels <- paste0('L', 1:9)

# Average the nine non-chosen labels into one "Rest" test set.
df_avg <- exp3 %>%
  filter(TestSet %in% rest_labels) %>%
  group_by(TrainSet, Model, Label, Seed) %>%
  summarize(AvgTestAccuracy = mean(TestAccuracy))

df_filtered <- exp3 %>%
  filter(!TestSet %in% rest_labels)

df_rest <- df_avg %>%
  mutate(TestSet = "Rest", TestAccuracy = AvgTestAccuracy) %>%
  select(-AvgTestAccuracy)

df <- rbind(df_rest,
            df_filtered[, c("TrainSet", "Model", "Label", "Seed",
                            "TestSet", "TestAccuracy")])
df$TestSet[df$TestSet == "L0"] <- "Chosen"
df$TestSet <- factor(df$TestSet, levels = c('Chosen', 'Rest', 'All'))

df_chosen <- df %>%
  filter(TestSet == 'Chosen')

# NOTE(review): mean_cl_boot needs the Hmisc package at plot time.
model_palette <- c('COBWEB/4V' = 'red', 'fc' = 'darkgreen', 'fc-CNN' = 'blue')

p_chosen <- ggplot(df_chosen, aes(x = TrainSet, y = TestAccuracy,
                                  color = Model, fill = Model, group = Model)) +
  geom_errorbar(stat = "summary", fun.data = "mean_cl_boot", width = 0.2) +
  stat_summary(fun.data = "mean_cl_boot", geom = "line", size = 1, alpha = 0.5) +
  labs(x = 'Incoming Training Split', y = 'Test Accuracy',
       title = 'Test Accuracy for Chosen Label with More Pre-defined Training Splits, Experiment 3') +
  theme_minimal() +
  scale_color_manual(values = model_palette) +
  scale_fill_manual(values = model_palette) +
  theme(plot.title = element_text(hjust = 0.5))
print(p_chosen)