diff --git a/README.md b/README.md index 564fbb2..8662438 100644 --- a/README.md +++ b/README.md @@ -3,11 +3,3 @@ This repository contains code for a method for clustering multivariate time seri The use of the method is not restricted to clinical data. It can generally be used for multivariate time series data. In addition to variational autoencoders with gaussian mixture priors, the code allows to train ordinary variational autoencoders (multivariate gaussian prior) and ordinary autoencoders (without prior), for all available time series models (LSTM, GRUs and Transformers). - -The code was written using - -(1) Python 3.6 and Tensorflow 1.10.1 (directory tensorflow1), and - -(2) Python 3.8 and Tensorflow 2.4.0 (directory tensorflow2). - -Note that only the Tensorflow 2.4.0 version gives the option for training transformer networks in addition to LSTMs/GRUs. \ No newline at end of file diff --git a/VaDER/__init__.py b/VaDER/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/VaDER/__pycache__/__init__.cpython-39.pyc b/VaDER/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000..b44cd02 Binary files /dev/null and b/VaDER/__pycache__/__init__.cpython-39.pyc differ diff --git a/VaDER/__pycache__/layers.cpython-39.pyc b/VaDER/__pycache__/layers.cpython-39.pyc new file mode 100644 index 0000000..82b60bd Binary files /dev/null and b/VaDER/__pycache__/layers.cpython-39.pyc differ diff --git a/VaDER/__pycache__/utils.cpython-39.pyc b/VaDER/__pycache__/utils.cpython-39.pyc new file mode 100644 index 0000000..f2fd3d0 Binary files /dev/null and b/VaDER/__pycache__/utils.cpython-39.pyc differ diff --git a/VaDER/__pycache__/vader.cpython-39.pyc b/VaDER/__pycache__/vader.cpython-39.pyc new file mode 100644 index 0000000..469929d Binary files /dev/null and b/VaDER/__pycache__/vader.cpython-39.pyc differ diff --git a/VaDER/__pycache__/vadermodel.cpython-39.pyc b/VaDER/__pycache__/vadermodel.cpython-39.pyc new file mode 100644 index 0000000..41ea7ae Binary files /dev/null and b/VaDER/__pycache__/vadermodel.cpython-39.pyc differ diff --git a/tensorflow2/vader/layers.py b/VaDER/layers.py similarity index 95% rename from tensorflow2/vader/layers.py rename to VaDER/layers.py index 9061c33..2c6a89d 100644 --- a/tensorflow2/vader/layers.py +++ b/VaDER/layers.py @@ -1,6 +1,15 @@ import tensorflow as tf import numpy as np -from utils import positional_encoding, scaled_dot_product_attention +from scipy.optimize import linear_sum_assignment as linear_assignment +from sklearn import metrics +from scipy.stats import multivariate_normal +import warnings +from sklearn.mixture import GaussianMixture +import tensorflow_addons as tfa +import abc + +from VaDER.utils import get_angles, positional_encoding, create_padding_mask, create_look_ahead_mask, \ + scaled_dot_product_attention, create_masks class ImputationLayer(tf.keras.layers.Layer): def __init__(self, A_init): @@ -112,7 +121,6 @@ def call(self, x, training, mask): return out2 - class TransformerDecoderLayer(tf.keras.layers.Layer): def __init__(self, d_model, num_heads, dff, rate=0.1): super(TransformerDecoderLayer, self).__init__() @@ -146,7 +154,6 @@ def call(self, x, enc_output, training, return out3, attn_weights_block1, attn_weights_block2 - class TransformerEncoder(tf.keras.layers.Layer): def __init__(self, num_layers, D, d_model, num_heads, dff, maximum_position_encoding, rate=0.1): super(TransformerEncoder, self).__init__() @@ -174,7 +181,6 @@ def call(self, x, training, mask): return x - class 
TransformerDecoder(tf.keras.layers.Layer): def __init__(self, num_layers, D, d_model, num_heads, dff, maximum_position_encoding, rate=0.1): super(TransformerDecoder, self).__init__() diff --git a/tensorflow2/vader/utils.py b/VaDER/utils.py similarity index 92% rename from tensorflow2/vader/utils.py rename to VaDER/utils.py index d307314..ef99843 100644 --- a/tensorflow2/vader/utils.py +++ b/VaDER/utils.py @@ -1,5 +1,12 @@ import tensorflow as tf import numpy as np +from scipy.optimize import linear_sum_assignment as linear_assignment +from sklearn import metrics +from scipy.stats import multivariate_normal +import warnings +from sklearn.mixture import GaussianMixture +import tensorflow_addons as tfa +import abc def get_angles(pos, i, d_model): angle_rates = 1 / np.power(10000, (2 * (i//2)) / np.float32(d_model)) @@ -64,7 +71,6 @@ def scaled_dot_product_attention(q, k, v, mask): return output, attention_weights - def create_masks(inp): # does not matter which i we take for inp[:,:,i], so just take i=0 tar = inp[:, :-1, 0] diff --git a/tensorflow2/vader/vader.py b/VaDER/vader.py similarity index 99% rename from tensorflow2/vader/vader.py rename to VaDER/vader.py index fb1e8eb..86e1f65 100644 --- a/tensorflow2/vader/vader.py +++ b/VaDER/vader.py @@ -1,11 +1,14 @@ import tensorflow as tf +import numpy as np from scipy.optimize import linear_sum_assignment as linear_assignment from sklearn import metrics from scipy.stats import multivariate_normal -import numpy as np import warnings from sklearn.mixture import GaussianMixture -from vadermodel import VaderRNN, VaderFFN, VaderTransformer +import tensorflow_addons as tfa +import abc + +from VaDER.vadermodel import VaderModel, VaderRNN, VaderFFN, VaderTransformer class VADER: ''' diff --git a/tensorflow2/vader/vadermodel.py b/VaDER/vadermodel.py similarity index 95% rename from tensorflow2/vader/vadermodel.py rename to VaDER/vadermodel.py index 48d7584..e4c7ec2 100644 --- a/tensorflow2/vader/vadermodel.py +++ b/VaDER/vadermodel.py @@ -1,9 +1,19 @@ import tensorflow as tf +import numpy as np +from scipy.optimize import linear_sum_assignment as linear_assignment +from sklearn import metrics +from scipy.stats import multivariate_normal +import warnings +from sklearn.mixture import GaussianMixture import tensorflow_addons as tfa import abc -import numpy as np -from layers import ImputationLayer, RnnDecodeTransformLayer, GmmLayer, TransformerEncoder, TransformerDecoder -from utils import create_masks + +from VaDER.layers import ImputationLayer, RnnDecodeTransformLayer, GmmLayer, MultiHeadAttention, \ + point_wise_feed_forward_network, TransformerEncoderLayer, TransformerDecoderLayer, \ + TransformerEncoder, TransformerDecoder +from VaDER.utils import get_angles, positional_encoding, create_padding_mask, create_look_ahead_mask, \ + scaled_dot_product_attention, create_masks + class VaderModel(tf.keras.Model): def __init__(self, X, W, D, K, I, cell_type, n_hidden, recurrent, output_activation, cell_params=None): diff --git a/tensorflow2/test_vader.py b/main.py similarity index 96% rename from tensorflow2/test_vader.py rename to main.py index beaa503..0acd8a8 100644 --- a/tensorflow2/test_vader.py +++ b/main.py @@ -1,12 +1,9 @@ import numpy as np import time import os -import sys -import cProfile import tensorflow as tf tf.config.run_functions_eagerly(False) -from tensorflow2.vader.vader import VADER -# from tensorflow2.prog import Transformer +from VaDER.vader import VADER save_path = os.path.join('test_vader', 'vader.ckpt') @@ -14,7 +11,7 @@ # generating 
some simple random data [ns * 2 samples, nt - 1 time points, 2 variables] nt = int(8) -ns = int(5e2 ) +ns = int(5e2) sigma = 0.5 mu1 = -2 mu2 = 2 @@ -60,6 +57,7 @@ # "cell_params". These are hyperparameters to the transformer architecture, and are interpreted as in # https://www.tensorflow.org/tutorials/text/transformer # The use of dropout can be debated, due to the regularizing properties of the variational layer +# Also note that a Transformer is not a great model choice for this data... vader = VADER(X_train=X_train, W_train=W_train, y_train=y_train, save_path=save_path, n_hidden=[12, 2], k=4, learning_rate=1e-3, output_activation=None, recurrent=True, cell_type="Transformer", batch_size=64, cell_params={'d_model': 4, 'num_layers': 1, 'num_heads': 1, 'dff': 16, 'rate': 0.0}) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..9cbbf00 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,6 @@ +numpy==1.21.0 +scikit_learn==1.0.2 +scipy==1.7.3 +setuptools==57.0.0 +tensorflow==2.7.1 +tensorflow_addons==0.16.1 diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..23e757d --- /dev/null +++ b/setup.py @@ -0,0 +1,23 @@ +from setuptools import setup + +setup( + name='VaDER', + version='0.1.0', + author='Johann de Jong', + author_email='johanndejong@gmail.com', + packages=['VaDER'], + # package_dir={'': '.'}, + # scripts=['bin/script1', 'bin/script2'], + url='', + license='LICENSE', + description='Clustering multivariate time series with missing values', + long_description=open('README.md').read(), + install_requires=[ + "numpy >= 1.19.5", + "scikit_learn >= 1.0.2", + "scipy >= 1.7.3", + "setuptools >= 57.0.0", + "tensorflow >= 2.7.1", + "tensorflow_addons >= 0.16.1" + ] +) \ No newline at end of file diff --git a/tensorflow1/requirements.txt b/tensorflow1/requirements.txt deleted file mode 100644 index 7d5dbd0..0000000 --- a/tensorflow1/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -numpy==1.19.2 -scikit-learn==0.23.2 -scipy==1.5.2 -tensorflow==1.15.0 diff --git a/tensorflow1/test_vader.py b/tensorflow1/test_vader.py deleted file mode 100644 index 58de72b..0000000 --- a/tensorflow1/test_vader.py +++ /dev/null @@ -1,92 +0,0 @@ -import numpy as np -import time -import os -import sys -import cProfile -import tensorflow as tf -from tensorflow1.vader.vader import VADER - -save_path = os.path.join('test_vader', 'vader.ckpt') - -np.random.seed(123) - -# generating some simple random data [ns * 2 samples, nt - 1 time points, 2 variables] -nt = int(8) -ns = int(5e2) -sigma = 0.5 -mu1 = -2 -mu2 = 2 -a1 = np.random.normal(mu1, sigma, ns) -a2 = np.random.normal(mu2, sigma, ns) -# one variable with two clusters -X0 = np.outer(a1, 2 * np.append(np.arange(-nt/2, 0), 0.5 * np.arange(0, nt/2)[1:])) -X1 = np.outer(a2, 0.5 * np.append(np.arange(-nt/2, 0), 2 * np.arange(0, nt/2)[1:])) -X_train = np.concatenate((X0, X1), axis=0) -y_train = np.repeat([0, 1], ns) -# add another variable as a random permutation of the first one -# resulting in four clusters in total -ii = np.random.permutation(ns * 2) -X_train = np.stack((X_train, X_train[ii,]), axis=2) -# we now get four clusters in total -y_train = y_train * 2**0 + y_train[ii] * 2**1 - -# randomly re-order the samples -ii = np.random.permutation(ns * 2) -X_train = X_train[ii,:] -y_train = y_train[ii] -# Randomly set 50% of values to missing (0: missing, 1: present) -# Note: All X_train[i,j] for which W_train[i,j] == 0 are treated as missing (i.e. 
their specific value is ignored) -W_train = np.random.choice(2, X_train.shape) - -# normalize (better for fitting) -for i in np.arange(X_train.shape[2]): - X_train[:,:,i] = (X_train[:,:,i] - np.mean(X_train[:,:,i])) / np.std(X_train[:,:,i]) - -# Note: y_train is used purely for monitoring performance when a ground truth clustering is available. -# It can be omitted if no ground truth is available. -vader = VADER(X_train=X_train, W_train=W_train, y_train=y_train, save_path=save_path, n_hidden=[12, 2], k=4, - learning_rate=1e-3, output_activation=None, recurrent=True, batch_size=16) - -# pre-train without latent loss -start = time.time() -vader.pre_fit(n_epoch=50, verbose=True) -# train with latent loss -vader.fit(n_epoch=50, verbose=True) -end = time.time() -print("Elapsed: ", end - start) - -# get the clusters -c = vader.cluster(X_train) -# get the re-constructions -p = vader.predict(X_train) -# compute the loss given the network -l = vader.get_loss(X_train) - -# Run VaDER non-recurrently (ordinary VAE with GM prior) -nt = int(8) -ns = int(2e2) -sigma = np.diag(np.repeat(2, nt)) -mu1 = np.repeat(-1, nt) -mu2 = np.repeat(1, nt) -a1 = np.random.multivariate_normal(mu1, sigma, ns) -a2 = np.random.multivariate_normal(mu2, sigma, ns) -X_train = np.concatenate((a1, a2), axis=0) -y_train = np.repeat([0, 1], ns) -ii = np.random.permutation(ns * 2) -X_train = X_train[ii,:] -y_train = y_train[ii] -# normalize (better for fitting) -X_train = (X_train - np.mean(X_train)) / np.std(X_train) - -vader = VADER(X_train=X_train, y_train=y_train, save_path=save_path, n_hidden=[12, 2], k=2, - learning_rate=1e-3, output_activation=None, recurrent=False, batch_size=16) -# pre-train without latent loss -vader.pre_fit(n_epoch=10, verbose=True) -# train with latent loss -vader.fit(n_epoch=50, verbose=True) -# get the clusters -c = vader.cluster(X_train) -# get the re-constructions -p = vader.predict(X_train) -# compute the loss given the network -l = vader.get_loss(X_train) diff --git a/tensorflow1/vader/__init__.py b/tensorflow1/vader/__init__.py deleted file mode 100644 index 4c57a6d..0000000 --- a/tensorflow1/vader/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -import os; -import sys -sys.path.append(os.path.dirname(os.path.realpath(__file__))) - -if __name__ == '__main__': - pass diff --git a/tensorflow1/vader/layers.py b/tensorflow1/vader/layers.py deleted file mode 100644 index ced13b8..0000000 --- a/tensorflow1/vader/layers.py +++ /dev/null @@ -1,145 +0,0 @@ -import tensorflow as tf -from functools import partial - -def encode(X, D, I, cell_type, n_hidden, recurrent): - if recurrent: # train a recurrent autoencoder - return encode_recurrent(X, D, I, cell_type, n_hidden) - else: # train a non-recurrent autoencoder - return encode_nonrecurrent(X, n_hidden) - -def decode(z, D, I, cell_type, n_hidden, recurrent, output_activation): - if recurrent: # train a recurrent autoencoder - return decode_recurrent(z, D, I, cell_type, n_hidden, output_activation) - else: # train a non-recurrent autoencoder - return decode_nonrecurrent(z, D, n_hidden, output_activation) - -def encode_recurrent(X, D, I, cell_type, n_hidden): - if len(n_hidden) > 1: - return encode_multilayer_recurrent(X, D, I, cell_type, n_hidden) - else: - return encode_monolayer_recurrent(X, D, I, cell_type, n_hidden) - -def decode_recurrent(X, D, I, cell_type, n_hidden, output_activation): - if len(n_hidden) > 1: - return decode_multilayer_recurrent(X, D, I, cell_type, n_hidden, output_activation) - else: - return decode_monolayer_recurrent(X, D, I, 
cell_type, n_hidden, output_activation) - -def encode_monolayer_recurrent(X, D, I, cell_type, n_hidden): - n_hidden = n_hidden[0] - X = [tf.squeeze(t, [1]) for t in tf.split(X, D, 1)] - if cell_type == "LSTM": - encoder = tf.nn.rnn_cell.LSTMCell( - num_units=n_hidden + n_hidden, # one for mu, one for sigma2 - activation=tf.nn.tanh, name="encoder", use_peepholes=True - ) - (_, (_, hidden)) = tf.nn.static_rnn(encoder, X, dtype=tf.float32) - else: - encoder = tf.nn.rnn_cell.GRUCell( - num_units=n_hidden + n_hidden, # one for mu, one for sigma2 - activation=tf.nn.tanh, name="encoder" - ) - (_, hidden) = tf.nn.static_rnn(encoder, X, dtype=tf.float32) - - mu = tf.identity(hidden[:, :n_hidden], name="mu_tilde") - log_sigma2 = tf.identity(hidden[:, n_hidden:], name="log_sigma2_tilde") - return mu, log_sigma2 - - -# encode -def encode_multilayer_recurrent(X, D, I, cell_type, n_hidden): - X = [tf.squeeze(t, [1]) for t in tf.split(X, D, 1)] - if cell_type == "LSTM": - encoder = tf.nn.rnn_cell.LSTMCell( - num_units=n_hidden[0], - activation=tf.nn.tanh, name="encoder", use_peepholes=True - ) - (_, (_, hidden)) = tf.nn.static_rnn(encoder, X, dtype=tf.float32) - else: - encoder = tf.nn.rnn_cell.GRUCell( - num_units=n_hidden[0], - activation=tf.nn.tanh, name="encoder" - ) - (_, hidden) = tf.nn.static_rnn(encoder, X, dtype=tf.float32) - for n in n_hidden[1:-1]: - hidden = my_dense_layer(hidden, n) - mu = tf.identity(my_dense_layer(hidden, n_hidden[-1], activation=None), name="mu_tilde") - log_sigma2 = tf.identity(my_dense_layer(hidden, n_hidden[-1], activation=None), - name="log_sigma2_tilde") - return mu, log_sigma2 - - -# decode -def decode_monolayer_recurrent(z, D, I, cell_type, n_hidden, output_activation): - n_hidden = n_hidden[0] - weight = tf.Variable(tf.truncated_normal([n_hidden, I], dtype=tf.float32), name='weight') - bias = tf.Variable(tf.constant(0.1, shape=[I], dtype=tf.float32), name='bias') - - input = [tf.zeros((tf.shape(z)[0], I), dtype=tf.float32) for _ in range(D)] - if cell_type == "LSTM": - decoder = tf.nn.rnn_cell.LSTMCell(n_hidden, name="decoder", use_peepholes=True) - (output, (_, _)) = tf.nn.static_rnn(decoder, input, initial_state=(tf.zeros(tf.shape(z)), z), dtype=tf.float32) - else: - decoder = tf.nn.rnn_cell.GRUCell(n_hidden, name="decoder") - (output, _) = tf.nn.static_rnn(decoder, input, initial_state=z, dtype=tf.float32) - output = tf.transpose(tf.stack(output), [1, 0, 2]) - weight = tf.tile(tf.expand_dims(weight, 0), [tf.shape(z)[0], 1, 1]) - output = tf.matmul(output, weight) + bias - x_raw = tf.identity(output, name="x_raw") - x = output_activation(x_raw, name="x_output") - - return x, x_raw - -# decode -def decode_multilayer_recurrent(z, D, I, cell_type, n_hidden, output_activation): - n_hidden = n_hidden[::-1] - hidden = z - for n in n_hidden[1:]: - hidden = my_dense_layer(hidden, n) - weight = tf.Variable(tf.truncated_normal([n_hidden[-1], I], dtype=tf.float32), - name='weight') - bias = tf.Variable(tf.constant(0.1, shape=[I], dtype=tf.float32), name='bias') - - input = [tf.zeros((tf.shape(hidden)[0], I), dtype=tf.float32) for _ in range(D)] - if cell_type == "LSTM": - decoder = tf.nn.rnn_cell.LSTMCell(n_hidden[-1], name="decoder", use_peepholes=True) - (output, (_, _)) = tf.nn.static_rnn(decoder, input, initial_state=(tf.zeros(tf.shape(hidden)), hidden), - dtype=tf.float32) - else: - decoder = tf.nn.rnn_cell.GRUCell(n_hidden[-1], name="decoder") - (output, _) = tf.nn.static_rnn(decoder, input, initial_state=hidden, dtype=tf.float32) - - output = 
tf.transpose(tf.stack(output), [1, 0, 2]) - weight = tf.tile(tf.expand_dims(weight, 0), [tf.shape(hidden)[0], 1, 1]) - output = tf.matmul(output, weight) + bias - x_raw = tf.identity(output, name="x_raw") - x = output_activation(x_raw, name="x_output") - return x, x_raw - - -# encode -def encode_nonrecurrent(X, n_hidden): - hidden = X # tf.clip_by_value(X, eps, 1 - eps) - for n in n_hidden[:-1]: - hidden = my_dense_layer(hidden, n) - mu = tf.identity(my_dense_layer(hidden, n_hidden[-1], activation=None), name="mu_tilde") - log_sigma2 = tf.identity(my_dense_layer(hidden, n_hidden[-1], activation=None), name="log_sigma2_tilde") - return mu, log_sigma2 - - -# decode -def decode_nonrecurrent(z, D, n_hidden, output_activation): - hidden = z - for n in (n_hidden[:-1])[::-1]: - hidden = my_dense_layer(hidden, n) - x_raw = tf.identity(my_dense_layer(hidden, D, activation=None), name="x_raw") - x = output_activation(x_raw, name="x_output") # the reconstructions, one for each mixture component - return x, x_raw - -my_dense_layer = partial( - tf.layers.dense, - activation=tf.nn.softplus, # tf.nn.softplus, # tf.nn.elu, - kernel_initializer=None # initializer -) - - diff --git a/tensorflow1/vader/losses.py b/tensorflow1/vader/losses.py deleted file mode 100644 index f9315cf..0000000 --- a/tensorflow1/vader/losses.py +++ /dev/null @@ -1,89 +0,0 @@ -import tensorflow as tf -import numpy as np - -def vader_reconstruction_loss(X, x, x_raw, W, output_activation, D, I, eps=1e-10): - # reconstruction loss: E[log p(x|z)] - # xentropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.clip_by_value(X, eps, 1-eps), logits=x_raw) - # # summing across both dimensions: should I prove this? - # rec_loss = tf.reduce_sum(xentropy, axis=1) - # # average across the samples - # rec_loss = tf.reduce_mean(rec_loss, name="reconstruction_loss") - # - # return rec_loss - # - if (output_activation == tf.nn.sigmoid): - rec_loss = tf.losses.sigmoid_cross_entropy(tf.clip_by_value(X, eps, 1 - eps), x_raw, W) - else: - rec_loss = tf.losses.mean_squared_error(X, x, W) - - # re-scale the loss to the original dims (making sure it balances correctly with the latent loss) - rec_loss = rec_loss * tf.cast(tf.reduce_prod(tf.shape(W)), dtype=tf.float32) / tf.reduce_sum(W) - rec_loss = D * I * rec_loss - - return rec_loss - -def vader_latent_loss(z, mu_c, sigma2_c, phi_c, mu_tilde, log_sigma2_tilde, K, eps=1e-10): - sigma2_tilde = tf.exp(log_sigma2_tilde) - log_sigma2_c = tf.log(eps + sigma2_c) - if K == 1: # ordinary VAE - latent_loss = tf.reduce_mean(input_tensor=0.5 * tf.reduce_sum( - input_tensor=sigma2_tilde + tf.square(mu_tilde) - 1 - log_sigma2_tilde, - axis=1 - )) - else: - log_2pi = tf.log(2 * np.pi) - log_phi_c = tf.log(eps + phi_c) - - # def f(i): - # return - 0.5 * (log_sigma2_c[i] + log_2pi + tf.square(z - mu_c[i]) / sigma2_c[i]) - # log_pdf_z = tf.transpose(a=tf.map_fn(f, np.arange(K), fn_output_signature=tf.float32), perm=[1, 0, 2]) - - N = tf.shape(z)[0] - ii, jj = tf.meshgrid(tf.range(K, dtype=tf.int32), tf.range(N, dtype=tf.int32)) - ii = tf.reshape(ii, [N * K]) - jj = tf.reshape(jj, [N * K]) - lsc_b = tf.gather(log_sigma2_c, ii, axis=0) - mc_b = tf.gather(mu_c, ii, axis=0) - sc_b = tf.gather(sigma2_c, ii, axis=0) - z_b = tf.gather(z, jj, axis=0) - log_pdf_z = - 0.5 * (lsc_b + log_2pi + tf.square(z_b - mc_b) / sc_b) - log_pdf_z = tf.reshape(log_pdf_z, [N, K, tf.shape(z)[1]]) - - log_p = log_phi_c + tf.reduce_sum(input_tensor=log_pdf_z, axis=2) - lse_p = tf.reduce_logsumexp(input_tensor=log_p, keepdims=True, axis=1) - 
log_gamma_c = log_p - lse_p - - gamma_c = tf.exp(log_gamma_c) - - # # latent loss: E[log p(z|c) + log p(c) - log q(z|x) - log q(c|x)] - # term1 = tf.log(eps + sigma2_c) - # f2 = lambda i: sigma2_tilde / (eps + sigma2_c[i]) - # term2 = tf.transpose(a=tf.map_fn(f2, np.arange(K), tf.float32), perm=[1, 0, 2]) - # f3 = lambda i: tf.square(mu_tilde - mu_c[i]) / (eps + sigma2_c[i]) - # term3 = tf.transpose(a=tf.map_fn(f3, np.arange(K), tf.float32), perm=[1, 0, 2]) - - # latent loss: E[log p(z|c) + log p(c) - log q(z|x) - log q(c|x)] - term1 = tf.log(eps + sigma2_c) - N = tf.shape(sigma2_tilde)[0] - ii, jj = tf.meshgrid(tf.range(K, dtype=tf.int32), tf.range(N, dtype=tf.int32)) - ii = tf.reshape(ii, [N * K]) - jj = tf.reshape(jj, [N * K]) - st_b = tf.gather(sigma2_tilde, jj, axis=0) - sc_b = tf.gather(sigma2_c, ii, axis=0) - term2 = tf.reshape(st_b / (eps + sc_b), [N, K, tf.shape(sigma2_tilde)[1]]) - mt_b = tf.gather(mu_tilde, jj, axis=0) - mc_b = tf.gather(mu_c, ii, axis=0) - term3 = tf.reshape(tf.square(mt_b - mc_b) / (eps + sc_b), [N, K, tf.shape(sigma2_tilde)[1]]) - - latent_loss1 = 0.5 * tf.reduce_sum( - input_tensor=gamma_c * tf.reduce_sum(input_tensor=term1 + term2 + term3, axis=2), axis=1) - # latent_loss2 = - tf.reduce_sum(gamma_c * tf.log(eps + phi_c / (eps + gamma_c)), axis=1) - latent_loss2 = - tf.reduce_sum(input_tensor=gamma_c * (log_phi_c - log_gamma_c), axis=1) - latent_loss3 = - 0.5 * tf.reduce_sum(input_tensor=1 + log_sigma2_tilde, axis=1) - # average across the samples - latent_loss1 = tf.reduce_mean(input_tensor=latent_loss1) - latent_loss2 = tf.reduce_mean(input_tensor=latent_loss2) - latent_loss3 = tf.reduce_mean(input_tensor=latent_loss3) - # add the different terms - latent_loss = latent_loss1 + latent_loss2 + latent_loss3 - return latent_loss \ No newline at end of file diff --git a/tensorflow1/vader/vader.py b/tensorflow1/vader/vader.py deleted file mode 100644 index d17d563..0000000 --- a/tensorflow1/vader/vader.py +++ /dev/null @@ -1,787 +0,0 @@ -import tensorflow as tf -from scipy.optimize import linear_sum_assignment as linear_assignment -from scipy.stats import multivariate_normal -import os -import numpy as np -import warnings -from sklearn.mixture import GaussianMixture -from losses import vader_reconstruction_loss, vader_latent_loss -from layers import encode, decode - -class VADER: - ''' - A VADER object represents a (recurrent) (variational) (Gaussian mixture) autoencoder - ''' - def __init__(self, X_train, W_train=None, y_train=None, n_hidden=[12, 2], k=3, groups=None, output_activation=None, - batch_size = 32, learning_rate=1e-3, alpha=1.0, phi=None, cell_type="LSTM", recurrent=True, - save_path=os.path.join('vader', 'vader.ckpt'), eps=1e-10, seed=None, n_thread=0): - ''' - Constructor for class VADER - - Parameters - ---------- - X_train : float - The data to be clustered. Numpy array with dimensions [samples, time points, variables] if recurrent is - True, else [samples, variables]. - W_train : integer - Missingness indicator. Numpy array with same dimensions as X_train. Entries in X_train for which the - corresponding entry in W_train equals 0 are treated as missing. More precisely, their specific numeric - value is completely ignored. If None, then no missingness is assumed. (default: None) - y_train : int - Cluster labels. Numpy array or list of length X_train.shape[0]. y_train is used purely for monitoring - performance when a ground truth clustering is available. It does not affect training, and can be omitted - if no ground truth is available. 
(default: None) - n_hidden : int - The hidden layers. List of length >= 1. Specification of the number of nodes in the hidden layers. For - example, specifying [a, b, c] will lead to an architecture with layer sizes a -> b -> c -> b -> a. - (default: [12, 2]) - k : int - Number of mixture components. (default: 3) - groups : int - Grouping of the input variables as a list of length X.shape[2], with integers {0, 1, 2, ...} denoting - groups; used for weighting proportional to group size. (default: None) - output_activation : str - Output activation function, "sigmoid" for binary output, None for continuous output. (default: None) - batch_size : int - Batch size used for training. (default: 32) - learning_rate : float - Learning rate for training. (default: 1e-3) - alpha : float - Weight of the latent loss, relative to the reconstruction loss. (default: 1.0, i.e. equal weight) - phi : float - Initial values for the mixture component probabilities. List of length k. If None, then initialization - is according to a uniform distribution. (default: None) - cell_type : str - Cell type of the recurrent neural network. Currently only LSTM is supported. (default: "LSTM") - recurrent : bool - Train a recurrent autoencoder, or a non-recurrent autoencoder? (default: True) - save_path : str - Location to store the Tensorflow checkpoint files. (default: os.path.join('vader', 'vader.ckpt')) - eps : float - Small value used for numerical stability in logarithmic computations, divisions, etc. (default: 1e-10) - seed : int - Random seed, to be used for reproducibility. (default: None) - n_thread : int - Number of threads, passed to Tensorflow's intra_op_parallelism_threads and inter_op_parallelism_threads. - (default: 0) - - Attributes - ---------- - X : float - The data to be clustered. Numpy array. - W : int - Missingness indicator. Numpy array of same dimensions as X_train. Entries in X_train for which the - corresponding entry in W_train equals 0 are treated as missing. More precisely, their specific numeric - value is completely ignored. - y : int - Cluster labels. Numpy array or list of length X_train.shape[0]. y_train is used purely for monitoring - performance when a ground truth clustering is available. It does not affect training, and can be omitted - if no ground truth is available. - n_hidden : int - The hidden layers. List of length >= 1. Specification of the number of nodes in the hidden layers. - K : int - Number of mixture components. - groups: int - Grouping of the input variables as a list of length self.X.shape[2], with integers {0, 1, 2, ...} - denoting groups; used for weighting proportional to group size. - G : float - Weights determined by variable groups, as computed from the groups argument. - output_activation : str - Output activation function, "sigmoid" for binary output, None for continuous output. - batch_size : int - Batch size used for training. - learning_rate : float - Learning rate for training. - alpha : float - Weight of the latent loss, relative to the reconstruction loss. - phi : float - Initial values for the mixture component probabilities. List of length k. - cell_type : str - Cell type of the recurrent neural network. Currently only LSTM is supported. - recurrent : bool - Train a recurrent autoencoder, or a non-recurrent autoencoder? - save_path : str - Location to store the Tensorflow checkpoint files. - eps : float - Small value used for numerical stability in logarithmic computations, divisions, etc. - seed : int - Random seed, to be used for reproducibility. 
- n_thread : int - Number of threads, passed to Tensorflow's intra_op_parallelism_threads and inter_op_parallelism_threads. - n_epoch : int - The number of epochs that this VADER object was trained. - loss : float - The current training loss of this VADER object. - n_param : int - The total number of parameters of this VADER object. - latent_loss : float - The current training latent loss of this VADER object. - reconstruction_loss : float - The current training reconstruction loss of this VADER object. - D : int - self.X.shape[1]. The number of time points if self.recurrent is True, otherwise the number of variables. - I : integer - X_train.shape[2]. The number of variables if self.recurrent is True, otherwise not defined. - ''' - - if seed is not None: - np.random.seed(seed) - - self.D = X_train.shape[1] # dimensionality of input/output - self.X = X_train - if W_train is not None: - self.W = W_train - else: - self.W = np.ones(X_train.shape, dtype=np.float32) - if y_train is not None: - self.y = np.asarray(y_train, np.int32) - else: - self.y = y_train - self.save_path = save_path - self.eps = eps - self.alpha = alpha # weight for the latent loss (alpha times the reconstruction loss weight) - self.learning_rate = learning_rate - self.K = k # 10 number of mixture components (clusters) - if groups is not None: - self.groups = np.array(groups, np.int32) - self.G = 1 / np.bincount(groups)[groups] - self.G = self.G / sum(self.G) - self.G = np.broadcast_to(self.G, self.X.shape) - else: - self.groups = np.ones(X_train.shape[-1], np.int32) - self.G = np.ones(X_train.shape, dtype=np.float32) - - self.n_hidden = n_hidden # n_hidden[-1] is dimensions of the mixture distribution (size of hidden layer) - if output_activation is None: - self.output_activation = tf.identity - else: - if output_activation == "sigmoid": - self.output_activation = tf.nn.sigmoid - self.n_hidden = n_hidden - self.seed = seed - self.n_epoch = 0 - self.n_thread = n_thread - self.batch_size = batch_size - self.loss = np.array([]) - self.reconstruction_loss = np.array([]) - self.latent_loss = np.array([]) - self.n_param = None - self.cell_type = cell_type - self.recurrent = recurrent - if self.recurrent: - self.I = X_train.shape[2] # multivariate dimensions - else: - self.I = 1 - - # encoder function - def g(X): - return encode(X, self.D, self.I, self.cell_type, self.n_hidden, self.recurrent) - - # decoder function - def f(z): - return decode(z, self.D, self.I, self.cell_type, self.n_hidden, self.recurrent, self.output_activation) - - # reconstruction loss function - def reconstruction_loss(X, x, x_raw, W): - return vader_reconstruction_loss(X, x, x_raw, W, self.output_activation, self.D, self.I, self.eps) - - # latent loss function - def latent_loss(z, mu_c, sigma2_c, phi_c, mu_tilde, log_sigma2_tilde): - return vader_latent_loss(z, mu_c, sigma2_c, phi_c, mu_tilde, log_sigma2_tilde, self.K, self.eps) - - tf.reset_default_graph() - graph = tf.get_default_graph() - - with graph.as_default(): - if self.seed is not None: - tf.set_random_seed(self.seed) - - if self.recurrent: - X = tf.placeholder(tf.float32, [None, self.D, self.I], name="X_input") - W = tf.placeholder(tf.float32, [None, self.D, self.I], name="W_input") - G = tf.placeholder(tf.float32, [None, self.D, self.I], name="G_input") - A = tf.get_variable("A", dtype=tf.float32, trainable=True, initializer=self._initialize_imputation(self.X, self.W)) - else: - X = tf.placeholder(tf.float32, [None, self.D], name="X_input") - W = tf.placeholder(tf.float32, [None, self.D], 
name="W_input") - G = tf.placeholder(tf.float32, [None, self.D], name="G_input") - A = tf.get_variable("A", dtype=tf.float32, trainable=True, initializer=self._initialize_imputation(self.X, self.W)) - alpha_t = tf.placeholder_with_default(tf.convert_to_tensor(self.alpha), (), name="alpha_input") - mu_c_unscaled = tf.get_variable("mu_c_unscaled", [self.K, self.n_hidden[-1]], dtype=tf.float32, trainable=True) - mu_c = tf.identity(mu_c_unscaled, name="mu_c") - sigma2_c_unscaled = tf.get_variable("sigma2_c_unscaled", shape=[self.K, self.n_hidden[-1]], dtype=tf.float32, trainable=True) - sigma2_c = tf.nn.softplus(sigma2_c_unscaled, name="sigma2_c") - if phi is None: - phi_c_unscaled = tf.get_variable("phi_c_unscaled", shape=[self.K], dtype=tf.float32, trainable=True, initializer=tf.constant_initializer(1)) - else: # set phi_c to some constant provided by the user - phi_c_unscaled = tf.get_variable("phi_c_unscaled", dtype=tf.float32, trainable=False, initializer=tf.log(tf.constant(phi))) - phi_c = tf.nn.softmax(phi_c_unscaled, name="phi_c") - - # encode - # Treat W as an indicator for nonmissingness (1: nonmissing; 0: missing) - if ~np.all(self.W == 1.0) and np.all(np.logical_or(self.W == 0.0, self.W == 1.0)): - XW = tf.multiply(X, W) + tf.multiply(A, (1.0 - W)) - mu_tilde, log_sigma2_tilde = g(XW) - else: - mu_tilde, log_sigma2_tilde = g(X) - - # sample from the mixture component - noise = tf.random_normal(tf.shape(log_sigma2_tilde), dtype=tf.float32) - z = tf.add(mu_tilde, tf.exp(0.5 * log_sigma2_tilde) * noise, name="z") - - # decode - x, x_raw = f(z) - - # calculate the loss - rec_loss = reconstruction_loss(X, x, x_raw, G * W) - rec_loss = tf.identity(rec_loss, name="reconstruction_loss") - - lat_loss = tf.cond( - tf.greater(alpha_t, tf.convert_to_tensor(0.0)), - lambda: tf.multiply(alpha_t, latent_loss(z, mu_c, sigma2_c, phi_c, mu_tilde, log_sigma2_tilde)), # variational - lambda: tf.convert_to_tensor(0.0) # non-variational - ) - lat_loss = tf.identity(lat_loss, name="latent_loss") - - loss = tf.add(rec_loss, lat_loss, name="loss") - - optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate, beta1=0.9, beta2=0.999, name="optimizer") - gradients, variables = zip(*optimizer.compute_gradients(loss)) - gradients, _ = tf.clip_by_global_norm(gradients, 5.0) - training_op = optimizer.apply_gradients(zip(gradients, variables), name="training_op") - - init = tf.global_variables_initializer() - self.n_param = np.sum([np.product([xi.value for xi in x.get_shape()]) for x in tf.trainable_variables()]) - saver = tf.train.Saver() - - with tf.Session(graph=graph, config=tf.ConfigProto( - intra_op_parallelism_threads=self.n_thread, inter_op_parallelism_threads=self.n_thread)) as sess: - init.run() - saver.save(sess, self.save_path) - - def _initialize_imputation(self, X, W): - # average per time point, variable - W_A = np.sum(W, axis=0) - A = np.sum(X * W, axis=0) - A[W_A>0] = A[W_A>0] / W_A[W_A>0] - # if not available, then average across entire variable - if self.recurrent: - for i in np.arange(A.shape[0]): - for j in np.arange(A.shape[1]): - if W_A[i,j] == 0: - A[i,j] = np.sum(X[:,:,j]) / np.sum(W[:,:,j]) - W_A[i,j] = 1 - # if not available, then average across all variables - A[W_A==0] = np.mean(X[W==1]) - return A.astype(np.float32) - # return np.zeros(X.shape[1:], np.float32) - - def _restore_session(self): - tf.reset_default_graph() - sess = tf.InteractiveSession(config=tf.ConfigProto( - intra_op_parallelism_threads=self.n_thread, - inter_op_parallelism_threads=self.n_thread - )) - saver 
= tf.train.import_meta_graph(self.save_path + ".meta") - saver.restore(sess, self.save_path) - graph = tf.get_default_graph() - return sess, saver, graph - - # def _cluster(self, mu_t, mu, sigma2, phi): - # def f(mu_t, mu, sigma2, phi): - # # the covariance matrix is diagonal, so we can just take the product - # p = np.log(self.eps + phi) + np.sum(np.log(self.eps + norm.pdf(mu_t, loc=mu, scale=np.sqrt(sigma2))), axis=1) - # return np.argmax(p) - # return np.array([f(mu_t[i], mu, sigma2, phi) for i in np.arange(mu_t.shape[0])]) - - def _cluster(self, mu_t, mu, sigma2, phi): - # import pickle - # pickle.dump(mu_t, open( "mu_t.pickle", "wb" ) ) - # pickle.dump(mu, open( "mu.pickle", "wb" ) ) - # pickle.dump(sigma2, open( "sigma2.pickle", "wb" ) ) - # pickle.dump(phi, open( "phi.pickle", "wb" ) ) - def f(mu_t, mu, sigma2, phi): - # the covariance matrix is diagonal, so we can just take the product - return np.log(self.eps + phi) + np.log(self.eps + multivariate_normal.pdf(mu_t, mean=mu, cov=np.diag(sigma2))) - p = np.array([f(mu_t, mu[i], sigma2[i], phi[i]) for i in np.arange(mu.shape[0])]) - return np.argmax(p, axis=0) - - def _accuracy(self, y_pred, y_true): - def cluster_acc(Y_pred, Y): - assert Y_pred.size == Y.size - D = max(Y_pred.max(), Y.max()) + 1 - w = np.zeros((D, D), dtype=np.int64) - for i in range(Y_pred.size): - w[Y_pred[i], Y[i]] += 1 - ind = np.transpose(np.asarray(linear_assignment(w.max() - w))) - return sum([w[i, j] for i, j in ind]) * 1.0 / Y_pred.size, np.array(w) - - y_pred = np.array(y_pred, np.int32) - y_true = np.array(y_true, np.int32) - return cluster_acc(y_pred, y_true) - - def _get_vars(self, graph): - training_op = graph.get_operation_by_name("training_op") - loss = graph.get_tensor_by_name("loss:0") - rec_loss = graph.get_tensor_by_name("reconstruction_loss:0") - lat_loss = graph.get_tensor_by_name("latent_loss:0") - X = graph.get_tensor_by_name("X_input:0") - W = graph.get_tensor_by_name("W_input:0") - G = graph.get_tensor_by_name("G_input:0") - alpha_t = graph.get_tensor_by_name("alpha_input:0") - z = graph.get_tensor_by_name("z:0") - x_output = graph.get_tensor_by_name("x_output:0") - sigma2_c = graph.get_tensor_by_name("sigma2_c:0") - mu_c = graph.get_tensor_by_name("mu_c:0") - mu_tilde = graph.get_tensor_by_name("mu_tilde:0") - log_sigma2_tilde = graph.get_tensor_by_name("log_sigma2_tilde:0") - phi_c = graph.get_tensor_by_name("phi_c:0") - return training_op, loss, rec_loss, lat_loss, X, W, G, alpha_t, z, x_output, mu_c, sigma2_c, phi_c, mu_tilde, log_sigma2_tilde - - def _get_batch(self, batch_size): - ii = np.random.choice(np.arange(self.X.shape[0]), batch_size, replace=False) - X_batch = self.X[ii,] - if self.y is not None: - y_batch = self.y[ii] - else: - y_batch = None - W_batch = self.W[ii,] - G_batch = self.G[ii,] - return X_batch, y_batch, W_batch, G_batch - - def _print_progress(self, epoch, sess, graph): - X_batch, y_batch, W_batch, G_batch = self._get_batch(min(10 * self.batch_size, self.X.shape[0])) - training_op, loss, rec_loss, lat_loss, X, W, G, alpha_t, z, x_output, mu_c, sigma2_c, phi_c, mu_tilde, log_sigma2_tilde = self._get_vars( - graph) - loss_val, reconstruction_loss_val, latent_loss_val = sess.run([loss, rec_loss, lat_loss], - feed_dict={X: X_batch, W: W_batch, G: G_batch, alpha_t: self.alpha}) - self.reconstruction_loss = np.append(self.reconstruction_loss, reconstruction_loss_val) - self.latent_loss = np.append(self.latent_loss, latent_loss_val) - self.loss = np.append(self.loss, loss_val) - clusters = 
self._cluster(mu_tilde.eval(feed_dict={X: X_batch, W: W_batch, G: G_batch, alpha_t: self.alpha}), mu_c.eval(), sigma2_c.eval(), - phi_c.eval()) - # if y_batch is not None: - if y_batch is not None: - acc, _ = self._accuracy(clusters, y_batch) - print(epoch, - "tot_loss:", "%.2f" % round(loss_val, 2), - "\trec_loss:", "%.2f" % round(reconstruction_loss_val, 2), - "\tlat_loss:", "%.2f" % round(latent_loss_val, 2), - "\tacc:", "%.2f" % round(acc, 2), - flush=True - ) - else: - print(epoch, - "tot_loss:", "%.2f" % round(loss_val, 2), - "\trec_loss:", "%.2f" % round(reconstruction_loss_val, 2), - "\tlat_loss:", "%.2f" % round(latent_loss_val, 2), - flush=True - ) - return 0 - - def fit(self, n_epoch=10, learning_rate=None, verbose=False): - ''' - Train a VADER object. - - Parameters - ---------- - n_epoch : int - Train n_epoch epochs. (default: 10) - learning_rate: float - Learning rate for this set of epochs (default: learning rate specified at object construction) - (NB: not currently used!) - verbose : bool - Print progress? (default: False) - - Returns - ------- - 0 if successful - ''' - if self.seed is not None: - np.random.seed(self.seed) - - self.n_epoch += n_epoch - - sess, saver, graph = self._restore_session() - - # if learning_rate is not None: - # lr_t = graph.get_tensor_by_name("learning_rate:0") - - writer = tf.summary.FileWriter(self.save_path, sess.graph) - training_op, loss, rec_loss, lat_loss, X, W, G, alpha_t, z, x_output, mu_c, sigma2_c, phi_c, mu_tilde, log_sigma2_tilde = self._get_vars(graph) - X = graph.get_tensor_by_name("X_input:0") - if verbose: - self._print_progress(-1, sess, graph) - for epoch in range(n_epoch): # NOTE: explicitly not self.epoch in case of repeated calls to fit! - n_batches = self.X.shape[0] // self.batch_size - for iteration in range(n_batches): - X_batch, y_batch, W_batch, G_batch = self._get_batch(self.batch_size) - sess.run(training_op, feed_dict={X: X_batch, W: W_batch, G: G_batch, alpha_t: self.alpha}) - if verbose: - self._print_progress(epoch, sess, graph) - # loss_val, reconstruction_loss_val, latent_loss_val = sess.run([loss, rec_loss, lat_loss], feed_dict={X: self.X, W: self.W, G: self.G, alpha_t: self.alpha}) - # self.reconstruction_loss = np.append(self.reconstruction_loss, reconstruction_loss_val) - # self.latent_loss = np.append(self.latent_loss, latent_loss_val) - # self.loss = np.append(self.loss, loss_val) - # if learning_rate is not None: - # self.learning_rate = self_learning_rate - saver.save(sess, self.save_path) - writer.close() - sess.close() - return 0 - - def pre_fit(self, n_epoch=10, GMM_initialize=True, learning_rate=None, verbose=False): - ''' - Pre-train a VADER object using only the latent loss, and initialize the Gaussian mixture parameters using - the resulting latent representation. - - Parameters - ---------- - n_epoch : int - Train n_epoch epochs. (default: 10) - GMM_initialize: bool - Should a GMM be fit on the pre-trained latent layer, in order to initialize the VaDER - mixture component parameters? - learning_rate: float - Learning rate for this set of epochs(default: learning rate specified at object construction) - (NB: not currently used!) - verbose : bool - Print progress? 
(default: False) - - Returns - ------- - 0 if successful - ''' - - # save the alpha - alpha = self.alpha - # pre-train using non-variational AEs - self.alpha = 0.0 - # pre-train - ret = self.fit(n_epoch, learning_rate, verbose) - - if GMM_initialize: - try: - # map to latent - z = self.map_to_latent(self.X, self.W, n_samp=10) - # fit GMM - gmm = GaussianMixture(n_components=self.K, covariance_type="diag", reg_covar=1e-04).fit(z) - # get GMM parameters - phi = np.log(gmm.weights_ + self.eps) # inverse softmax - mu = gmm.means_ - def inverse_softplus(x): - b = x < 1e2 - x[b] = np.log(np.exp(x[b]) - 1.0 + self.eps) - return x - sigma2 = inverse_softplus(gmm.covariances_) - - # initialize mixture components - sess, saver, graph = self._restore_session() - def my_get_variable(varname): - return [v for v in tf.global_variables() if v.name == varname][0] - mu_c_unscaled = my_get_variable("mu_c_unscaled:0") - sess.run(mu_c_unscaled.assign(tf.convert_to_tensor(mu, dtype=tf.float32))) - sigma2_c_unscaled = my_get_variable("sigma2_c_unscaled:0") - sess.run(sigma2_c_unscaled.assign(tf.convert_to_tensor(sigma2, dtype=tf.float32))) - phi_c_unscaled = my_get_variable("phi_c_unscaled:0") - sess.run(phi_c_unscaled.assign(tf.convert_to_tensor(phi, dtype=tf.float32))) - self.gmm = {'mu': mu, 'sigma2': sigma2, 'phi': phi} - saver.save(sess, self.save_path) - sess.close() - except: - warnings.warn("Failed to initialize VaDER with Gaussian mixture") - finally: - pass - # restore the alpha - self.alpha = alpha - return ret - - def map_to_latent(self, X_c, W_c=None, n_samp=1): - ''' - Map an input to its latent representation. - - Parameters - ---------- - X_c : float - The data to be clustered. Numpy array with dimensions [samples, time points, variables] if recurrent is - True, else [samples, variables] - W_c : int - Missingness indicator. Numpy array with same dimensions as X_c. Entries in X_c for which the - corresponding entry in W_train equals 0 are treated as missing. More precisely, their specific numeric - value is completely ignored. If None, then no missingness is assumed. (default: None) - n_samp : int - The number of latent samples to take for each input sample. (default: 1) - - Returns - ------- - numpy array containing the latent representations. - ''' - if W_c is None: - W_c = np.ones(X_c.shape) - G_c = np.ones(X_c.shape) - sess, saver, graph = self._restore_session() - z = graph.get_tensor_by_name("z:0") - mu_tilde = graph.get_tensor_by_name("mu_tilde:0") - X = graph.get_tensor_by_name("X_input:0") - W = graph.get_tensor_by_name("W_input:0") - G = graph.get_tensor_by_name("G_input:0") - latent = np.concatenate([z.eval(feed_dict={X: X_c, W: W_c, G: G_c}) for i in np.arange(n_samp)], axis=0) - sess.close() - return latent - - def get_loss(self, X_c, W_c=None, mu_c=None, sigma2_c=None, phi_c=None): - ''' - Calculate the loss for specific input data and Gaussian mixture parameters. - - Parameters - ---------- - X_c : float - The data to be clustered. Numpy array with dimensions [samples, time points, variables] if recurrent is - True, else [samples, variables] - W_c : int - Missingness indicator. Numpy array with same dimensions as X_c. Entries in X_c for which the - corresponding entry in W_train equals 0 are treated as missing. More precisely, their specific numeric - value is completely ignored. If None, then no missingness is assumed. (default: None) - mu_c : float - The mixture component means, one for each self.K. Numpy array of dimensions [self.K, self.n_hidden[-1]]. 
- If None, then the means of this VADER object are used. (default: None) - sigma2_c : float - The mixture component variances, representing diagonal covariance matrices, one for each self.K. Numpy - array of dimensions [self.K, self.n_hidden[-1]]. If None, then the variances of this VADER object are - used. (default: None) - phi_c : float - The mixture component probabilities. List of length self.K. If None, then the component probabilities of - this VADER object are used. (default: None) - - Returns - ------- - Dictionary with two components, "reconstruction_loss" and "latent_loss". - ''' - if W_c is None: - W_c = np.ones(X_c.shape, dtype=np.float32) - sess, saver, graph = self._restore_session() - rec_loss = graph.get_tensor_by_name("reconstruction_loss:0") - lat_loss = graph.get_tensor_by_name("latent_loss:0") - X = graph.get_tensor_by_name("X_input:0") - W = graph.get_tensor_by_name("W_input:0") - G = graph.get_tensor_by_name("G_input:0") - alpha_t = graph.get_tensor_by_name("alpha_input:0") - - def my_get_variable(varname): - return [v for v in tf.global_variables() if v.name == varname][0] - - mu_c_unscaled = my_get_variable("mu_c_unscaled:0") - if mu_c is not None: - mu_c_new = mu_c - mu_c_old = mu_c_unscaled.eval() - sess.run(mu_c_unscaled.assign(tf.convert_to_tensor(mu_c_new, dtype=tf.float32))) - - sigma2_c_unscaled = my_get_variable("sigma2_c_unscaled:0") - if sigma2_c is not None: - sigma2_c_new = sigma2_c - sigma2_c_old = sigma2_c_unscaled.eval() - sess.run(sigma2_c_unscaled.assign(tf.contrib.distributions.softplus_inverse(tf.convert_to_tensor(sigma2_c_new, dtype=tf.float32)))) - - phi_c_unscaled = my_get_variable("phi_c_unscaled:0") - if phi_c is not None: - phi_c_new = phi_c - phi_c_old = phi_c_unscaled.eval() - sess.run(phi_c_unscaled.assign(tf.convert_to_tensor(phi_c_new, dtype=tf.float32))) - - G_c = 1 / np.bincount(self.groups)[self.groups] - G_c = G_c / sum(G_c) - G_c = np.broadcast_to(G_c, X_c.shape) - - lat_loss = lat_loss.eval(feed_dict={X: X_c, W: W_c, G: G_c, alpha_t: self.alpha}) - rec_loss = rec_loss.eval(feed_dict={X: X_c, W: W_c, G: G_c, alpha_t: self.alpha}) - - sess.close() - return {"reconstruction_loss": rec_loss, "latent_loss": lat_loss} - - def get_imputation_matrix(self): - ''' - Returns - ------- - The imputation matrix. - ''' - sess, saver, graph = self._restore_session() - def my_get_variable(varname): - return [v for v in tf.global_variables() if v.name == varname][0] - A = my_get_variable("A:0").eval() - sess.close() - return A - - def cluster(self, X_c, W_c=None, mu_c=None, sigma2_c=None, phi_c=None): - ''' - Cluster input data using this VADER object. - - Parameters - ---------- - X_c : float - The data to be clustered. Numpy array with dimensions [samples, time points, variables] if recurrent is - True, else [samples, variables] - W_c : int - Missingness indicator. Numpy array with same dimensions as X_c. Entries in X_c for which the - corresponding entry in W_train equals 0 are treated as missing. More precisely, their specific numeric - value is completely ignored. If None, then no missingness is assumed. (default: None) - mu_c : float - The mixture component means, one for each self.K. Numpy array of dimensions [self.K, self.n_hidden[-1]]. - If None, then the means of this VADER object are used. (default: None) - sigma2_c : float - The mixture component variances, representing diagonal covariance matrices, one for each self.K. Numpy - array of dimensions [self.K, self.n_hidden[-1]]. If None, then the variances of this VADER object are - used. 
(default: None) - phi_c : float - The mixture component probabilities. List of length self.K. If None, then the component probabilities of - this VADER object are used. (default: None) - - Returns - ------- - Clusters encoded as integers. - ''' - sess, saver, graph = self._restore_session() - mu_tilde = graph.get_tensor_by_name("mu_tilde:0") - X = graph.get_tensor_by_name("X_input:0") - W = graph.get_tensor_by_name("W_input:0") - G = graph.get_tensor_by_name("G_input:0") - if mu_c is None: - mu_c = graph.get_tensor_by_name("mu_c:0").eval() - if sigma2_c is None: - sigma2_c = graph.get_tensor_by_name("sigma2_c:0").eval() - if phi_c is None: - phi_c = graph.get_tensor_by_name("phi_c:0").eval() - - if W_c is None: - W_c = np.ones(X_c.shape) - G_c = np.ones(X_c.shape) - - clusters = self._cluster(mu_tilde.eval(feed_dict={X: X_c, W: W_c, G: G_c}), mu_c, sigma2_c, phi_c) - sess.close() - return clusters - - def get_clusters(self): - ''' - Get the cluster averages represented by this VADER object. Technically, this maps the latent Gaussian - mixture means to output values using this VADER object. - - Returns - ------- - Cluster averages. - ''' - if self.seed is not None: - np.random.seed(self.seed) - - sess, saver, graph = self._restore_session() - z = graph.get_tensor_by_name("z:0") - x = graph.get_tensor_by_name("x_output:0") - mu_c = graph.get_tensor_by_name("mu_c:0") - - clusters = x.eval(feed_dict={z: mu_c.eval()}) - sess.close() - return clusters - - def get_latent_distribution(self): - ''' - Get the parameters of the Gaussian mixture distribution of this VADER object. - - Returns - ------- - Dictionary with three components, "mu", "sigma2_sq", "phi". - ''' - if self.seed is not None: - np.random.seed(self.seed) - - sess, saver, graph = self._restore_session() - sigma2_c = graph.get_tensor_by_name("sigma2_c:0") - mu_c = graph.get_tensor_by_name("mu_c:0") - phi_c = graph.get_tensor_by_name("phi_c:0") - - res = { - "mu": mu_c.eval(), - "sigma2_sq": sigma2_c.eval(), - "phi": phi_c.eval() - } - - sess.close() - return res - - def generate(self, n): - ''' - Generate random samples from this VADER object. - - n : int - The number of samples to generate. - - Returns - ------- - A dictionary with two components, "clusters" (cluster indicator) and "samples" (the random samples). - ''' - if self.seed is not None: - np.random.seed(self.seed) - - sess, saver, graph = self._restore_session() - - z = graph.get_tensor_by_name("z:0") - x = graph.get_tensor_by_name("x_output:0") - sigma2_c = graph.get_tensor_by_name("sigma2_c:0") - mu_c = graph.get_tensor_by_name("mu_c:0") - phi_c = graph.get_tensor_by_name("phi_c:0") - - if self.recurrent: - gen = np.zeros((n, self.D, self.I), dtype=np.float) - else: - gen = np.zeros((n, self.D), dtype=np.float) - c = np.random.choice(np.arange(self.K), size=n, p=phi_c.eval()) - for k in np.arange(self.K): - ii = np.flatnonzero(c == k) - z_rnd = \ - mu_c.eval()[None, k, :] + \ - np.sqrt(sigma2_c.eval())[None, k, :] * \ - np.random.normal(size=[ii.shape[0], self.n_hidden[-1]]) - # gen[ii,:] = x.eval(feed_dict={z: z_rnd}) - if self.recurrent: - gen[ii,:,:] = x.eval(feed_dict={z: z_rnd}) - else: - gen[ii,:] = x.eval(feed_dict={z: z_rnd}) - sess.close() - gen = { - "clusters": c, - "samples": gen - } - return gen - - def predict(self, X_test, W_test=None): - ''' - Map input data to output (i.e. reconstructed input). - - Parameters - ---------- - X_test : float numpy array - The input data to be mapped. 
- W_test : integer numpy array of same dimensions as X_test - Missingness indicator. Entries in X_test for which the corresponding entry in W_test equals 0 are - treated as missing. More precisely, their specific numeric value is completely ignored. - - Returns - ------- - numpy array with reconstructed input (i.e. the autoencoder output). - ''' - - if W_test is None: - W_test = np.ones(X_test.shape) - - G_test = np.ones(X_test.shape) - - if self.seed is not None: - np.random.seed(self.seed) - - sess, saver, graph = self._restore_session() - - X = graph.get_tensor_by_name("X_input:0") - W = graph.get_tensor_by_name("W_input:0") - G = graph.get_tensor_by_name("G_input:0") - x_output = graph.get_tensor_by_name("x_output:0") - - pred = x_output.eval(feed_dict={X: X_test, W: W_test, G: G_test}) - sess.close() - return pred diff --git a/tensorflow2/requirements.txt b/tensorflow2/requirements.txt deleted file mode 100644 index 8253295..0000000 --- a/tensorflow2/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -numpy==1.19.4 -scikit-learn==0.22.2.post1 -scipy==1.4.1 -tensorflow==2.4.0 -tensorflow-addons==0.12.0 diff --git a/tensorflow2/vader/__init__.py b/tensorflow2/vader/__init__.py deleted file mode 100644 index 4c57a6d..0000000 --- a/tensorflow2/vader/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -import os; -import sys -sys.path.append(os.path.dirname(os.path.realpath(__file__))) - -if __name__ == '__main__': - pass