From ebccefcc6ca4be97b64a8757d94ac5b010e62c45 Mon Sep 17 00:00:00 2001
From: Michael Walter <michael.walter@ist.ac.at>
Date: Wed, 6 Jun 2018 13:24:26 +0200
Subject: [PATCH 1/8] convolution for fixed centered gaussians

---
 dgs/dgs_gauss.h      | 21 +++++++++++
 dgs/dgs_gauss_dp.c   | 83 ++++++++++++++++++++++++++++++++++++++++++++
 tests/test_gauss_z.c | 16 +++++++--
 3 files changed, 118 insertions(+), 2 deletions(-)

diff --git a/dgs/dgs_gauss.h b/dgs/dgs_gauss.h
index 38e090a..aa9d50e 100644
--- a/dgs/dgs_gauss.h
+++ b/dgs/dgs_gauss.h
@@ -124,6 +124,7 @@ typedef enum {
   DGS_DISC_GAUSS_UNIFORM_LOGTABLE  = 0x3, //<call dgs_disc_gauss_mp_call_uniform_logtable
   DGS_DISC_GAUSS_SIGMA2_LOGTABLE   = 0x7, //<call dgs_disc_gauss_mp_call_sigma2_logtable
   DGS_DISC_GAUSS_ALIAS             = 0x8, //<call dgs_disc_gauss_mp_call_alias
+  DGS_DISC_GAUSS_CONVOLUTION       = 0x9, //<call dgs_disc_gauss_mp_call_convolution
 } dgs_disc_gauss_alg_t;
 
 /**
@@ -311,6 +312,14 @@ typedef struct _dgs_disc_gauss_dp_t {
    
   long* alias;
   dgs_bern_dp_t** bias;
+  
+  /**
+   * Base sampler for convolution
+   */
+  struct _dgs_disc_gauss_dp_t* base_sampler;
+  size_t n_coefficients;
+  long* coefficients;
+
 } dgs_disc_gauss_dp_t;
 
 /**
@@ -370,6 +379,18 @@ long dgs_disc_gauss_dp_call_uniform_table_offset(dgs_disc_gauss_dp_t *self);
 
 long dgs_disc_gauss_dp_call_alias(dgs_disc_gauss_dp_t *self);
 
+/**
+   Sample from ``dgs_disc_gauss_dp_t`` by convolution sampling 
+   (base sampler = alias sampling). This can be used to reduce 
+   the memory overhead and setup costs of alias sampling for 
+   wide distributions, at the cost of increasing running time, but
+   is only implemented for centered Gaussians.
+
+   :param self: discrete Gaussian sampler
+ */
+
+long dgs_disc_gauss_dp_call_convolution(dgs_disc_gauss_dp_t *self);
+
 /**
   Sample from ``dgs_disc_gauss_dp_t`` by rejection sampling using the uniform
   distribution replacing all ``exp()`` calls with calls to Bernoulli
diff --git a/dgs/dgs_gauss_dp.c b/dgs/dgs_gauss_dp.c
index d1909d8..e46e7d9 100644
--- a/dgs/dgs_gauss_dp.c
+++ b/dgs/dgs_gauss_dp.c
@@ -213,6 +213,11 @@ dgs_disc_gauss_dp_t *dgs_disc_gauss_dp_init(double sigma, double c, size_t tau,
     self->alias = (long*)malloc(sizeof(long)*self->two_upper_bound_minus_one);
     self->bias = (dgs_bern_dp_t**)malloc(sizeof(dgs_bern_dp_t*)*self->two_upper_bound_minus_one);
     
+    if (!self->alias || !self->bias){
+      dgs_disc_gauss_dp_clear(self);
+      dgs_die("out of memory");
+    }
+    
     // simple robin hood strategy approximates good alias
     // this precomputation takes ~n^2, but could be reduced by 
     // using better data structures to compute min and max 
@@ -234,6 +239,70 @@ dgs_disc_gauss_dp_t *dgs_disc_gauss_dp_init(double sigma, double c, size_t tau,
     break;
   }
   
+  case DGS_DISC_GAUSS_CONVOLUTION: {
+    self->call = dgs_disc_gauss_dp_call_convolution;
+    
+    if (fabs(self->c_r) > DGS_DISC_GAUSS_INTEGER_CUTOFF) {
+      dgs_disc_gauss_dp_clear(self);
+      dgs_die("algorithm DGS_DISC_GAUSS_CONVOLUTION requires c%1 == 0");
+    }
+    
+    double eta = 2;
+    long table_size = 2*ceil(sigma*tau) * (sizeof(dgs_bern_dp_t) + sizeof(long));
+    int recursion_level = 0;
+    double current_sigma = sigma;
+    long z1 = 1;
+    long z2 = 1;
+    
+    // compute recursion level for convolution
+    while (table_size > (DGS_DISC_GAUSS_MAX_TABLE_SIZE_BYTES >> 1)) {
+      recursion_level++;
+      z1 = floor(sqrt(current_sigma/(eta*2)));
+      if (z1 == 0) {
+        dgs_disc_gauss_dp_clear(self);
+        dgs_die("MAX_TABLE_SIZE too small to store alias sampler!");
+      }
+      z2 = (z1 > 1) ? z1 - 1 : 1;
+      current_sigma /= (sqrt(z1*z1 + z2*z2));
+      table_size = 2*ceil(current_sigma*tau) * (sizeof(dgs_bern_dp_t) + sizeof(long));
+    }
+    
+    self->n_coefficients = 1 << recursion_level;
+    self->coefficients = (long*)malloc(sizeof(long)*self->n_coefficients);
+    for (int i = 0; i < self->n_coefficients; ++i) {
+      self->coefficients[i] = 1;
+    }
+    
+    current_sigma = sigma;
+    
+    // redo above computation to store coefficients
+    for (int i = 0; i < recursion_level; ++i) {
+      z1 = floor(sqrt(current_sigma/(eta*2)));
+      z2 = (z1 > 1) ? z1 - 1 : 1;
+      
+      // we unroll the recursion on the coefficients on the fly
+      // so we don't have to use recursion during the call
+      int off = (1 << recursion_level - i - 1);
+      for (int j = 0; j < (1 << i); ++j) {
+        for (int k = 0; k < off;++k) {
+          self->coefficients[2*j*off + k] *= z1;
+        }
+      }
+      
+      for (int j = 0; j < (1 << i); ++j) {
+        for (int k = 0; k < off;++k) {
+          self->coefficients[(2*j + 1)*off + k] *= z2;
+        }
+      }
+      
+      current_sigma /= (sqrt(z1*z1 + z2*z2));
+    }
+    
+    self->base_sampler = dgs_disc_gauss_dp_init(current_sigma, 0, tau, DGS_DISC_GAUSS_ALIAS);
+    
+    break;
+  }
+  
   default:
     dgs_disc_gauss_dp_clear(self);
     dgs_die("unknown algorithm %d", algorithm);
@@ -289,6 +358,14 @@ long dgs_disc_gauss_dp_call_alias(dgs_disc_gauss_dp_t *self) {
   return x + self->c_z - self->upper_bound_minus_one;
 }
 
+long dgs_disc_gauss_dp_call_convolution(dgs_disc_gauss_dp_t *self) {
+  long x = 0;
+  for (int i = 0; i < self->n_coefficients; ++i) {
+    x += self->coefficients[i]*self->base_sampler->call(self->base_sampler);
+  }
+  return x + self->c_z;
+}
+
 long dgs_disc_gauss_dp_call_uniform_logtable(dgs_disc_gauss_dp_t *self) {
   long x;
   do {
@@ -334,6 +411,12 @@ void dgs_disc_gauss_dp_clear(dgs_disc_gauss_dp_t *self) {
     }
     free(self->bias);
   }
+  if (self->base_sampler) {
+    dgs_disc_gauss_dp_clear(self->base_sampler);
+  }
+  if (self->coefficients) {
+    free(self->coefficients);
+  }
   
   free(self);
 }
diff --git a/tests/test_gauss_z.c b/tests/test_gauss_z.c
index 0f44e37..383a66e 100644
--- a/tests/test_gauss_z.c
+++ b/tests/test_gauss_z.c
@@ -129,6 +129,12 @@ int main(int argc, char *argv[]) {
   test_ratios_dp(15.4, 3, DGS_DISC_GAUSS_UNIFORM_LOGTABLE);
   test_ratios_dp(15.4, 3, DGS_DISC_GAUSS_SIGMA2_LOGTABLE);
   test_ratios_dp(15.4, 3, DGS_DISC_GAUSS_ALIAS);
+  
+  // we need some large sigma test cases here, because
+  // everything else would just be retesting alias
+  test_ratios_dp(150, 6, DGS_DISC_GAUSS_CONVOLUTION);
+  test_ratios_dp(1500, 6, DGS_DISC_GAUSS_CONVOLUTION);
+  test_ratios_dp((1<<27), 6, DGS_DISC_GAUSS_CONVOLUTION);
   printf("\n");
 
   printf("# testing [⌊c⌋-⌈στ⌉,…, ⌊c⌋+⌈στ⌉] boundaries #\n");
@@ -155,6 +161,10 @@ int main(int argc, char *argv[]) {
   test_uniform_boundaries_dp( 3.3, 1.0, 1, DGS_DISC_GAUSS_ALIAS);
   test_uniform_boundaries_dp( 2.0, 2.0, 2, DGS_DISC_GAUSS_ALIAS);
   printf("\n");
+  
+  // Testing boundaries for convolution does not work in the 
+  // same way since the boundary constraints are imposed on
+  // the base sampler, but the convolution can exceed them
 
   printf("# testing c is center #\n");
   test_mean_dp( 3.0, 0.0, 6, DGS_DISC_GAUSS_UNIFORM_ONLINE);
@@ -179,7 +189,6 @@ int main(int argc, char *argv[]) {
   test_mean_dp(10.0, 0.0, 6, DGS_DISC_GAUSS_SIGMA2_LOGTABLE);
   test_mean_dp( 3.3, 1.0, 6, DGS_DISC_GAUSS_SIGMA2_LOGTABLE);
   test_mean_dp( 2.0, 2.0, 6, DGS_DISC_GAUSS_SIGMA2_LOGTABLE);
-
   printf("\n");
   
   test_mean_dp( 3.0, 0.0, 6, DGS_DISC_GAUSS_ALIAS);
@@ -187,8 +196,11 @@ int main(int argc, char *argv[]) {
   test_mean_dp( 3.3, 1.0, 6, DGS_DISC_GAUSS_ALIAS);
   test_mean_dp( 2.0, 2.0, 6, DGS_DISC_GAUSS_ALIAS);
   test_mean_dp( 2.0, 1.347, 6, DGS_DISC_GAUSS_ALIAS);
-
   printf("\n");
 
+  // testing mean for convolution also not suitable, since 
+  // it is likely to fail with large sigma (which is the only
+  // type that makes sense for convolution)
+
   return 0;
 }

From 34e6186cae1e39d55178f24db8494623e4641bf3 Mon Sep 17 00:00:00 2001
From: Michael Walter <michael.walter@ist.ac.at>
Date: Wed, 6 Jun 2018 16:09:15 +0200
Subject: [PATCH 2/8] adjust center for convolutions

---
 dgs/dgs_gauss.h      |  8 ++++++--
 dgs/dgs_gauss_dp.c   | 32 ++++++++++++++++++++++++--------
 tests/test_gauss_z.c | 14 +++++++++++---
 3 files changed, 41 insertions(+), 13 deletions(-)

diff --git a/dgs/dgs_gauss.h b/dgs/dgs_gauss.h
index aa9d50e..cd30b4e 100644
--- a/dgs/dgs_gauss.h
+++ b/dgs/dgs_gauss.h
@@ -319,6 +319,11 @@ typedef struct _dgs_disc_gauss_dp_t {
   struct _dgs_disc_gauss_dp_t* base_sampler;
   size_t n_coefficients;
   long* coefficients;
+  
+  /**
+   * Sampler to adjust center in convolution
+   */
+  struct _dgs_disc_gauss_dp_t* coset_sampler;
 
 } dgs_disc_gauss_dp_t;
 
@@ -383,8 +388,7 @@ long dgs_disc_gauss_dp_call_alias(dgs_disc_gauss_dp_t *self);
    Sample from ``dgs_disc_gauss_dp_t`` by convolution sampling 
    (base sampler = alias sampling). This can be used to reduce 
    the memory overhead and setup costs of alias sampling for 
-   wide distributions, at the cost of increasing running time, but
-   is only implemented for centered Gaussians.
+   wide distributions, at the cost of increasing running time.
 
    :param self: discrete Gaussian sampler
  */
diff --git a/dgs/dgs_gauss_dp.c b/dgs/dgs_gauss_dp.c
index e46e7d9..7a7679e 100644
--- a/dgs/dgs_gauss_dp.c
+++ b/dgs/dgs_gauss_dp.c
@@ -242,15 +242,17 @@ dgs_disc_gauss_dp_t *dgs_disc_gauss_dp_init(double sigma, double c, size_t tau,
   case DGS_DISC_GAUSS_CONVOLUTION: {
     self->call = dgs_disc_gauss_dp_call_convolution;
     
-    if (fabs(self->c_r) > DGS_DISC_GAUSS_INTEGER_CUTOFF) {
-      dgs_disc_gauss_dp_clear(self);
-      dgs_die("algorithm DGS_DISC_GAUSS_CONVOLUTION requires c%1 == 0");
+    double eta = 2;
+    double sigma1 = sigma;
+    double coset_sigma = sqrt(2)*eta;
+    if (fabs(self->c_r) > 0) {
+      // we might need to adjust the center
+      sigma1 = sqrt(sigma*sigma - coset_sigma*coset_sigma);
     }
     
-    double eta = 2;
-    long table_size = 2*ceil(sigma*tau) * (sizeof(dgs_bern_dp_t) + sizeof(long));
+    long table_size = 2*ceil(sigma1*tau) * (sizeof(dgs_bern_dp_t) + sizeof(long));
     int recursion_level = 0;
-    double current_sigma = sigma;
+    double current_sigma = sigma1;
     long z1 = 1;
     long z2 = 1;
     
@@ -273,7 +275,9 @@ dgs_disc_gauss_dp_t *dgs_disc_gauss_dp_init(double sigma, double c, size_t tau,
       self->coefficients[i] = 1;
     }
     
-    current_sigma = sigma;
+    // if there is no convolution, we simply forward to alias and
+    // so we won't need adjustment of sigma, even if sampling off center
+    current_sigma = (recursion_level == 0)? sigma : sigma1;
     
     // redo above computation to store coefficients
     for (int i = 0; i < recursion_level; ++i) {
@@ -298,7 +302,14 @@ dgs_disc_gauss_dp_t *dgs_disc_gauss_dp_init(double sigma, double c, size_t tau,
       current_sigma /= (sqrt(z1*z1 + z2*z2));
     }
     
-    self->base_sampler = dgs_disc_gauss_dp_init(current_sigma, 0, tau, DGS_DISC_GAUSS_ALIAS);
+    double base_c = self->c_r;
+    self->coset_sampler = NULL;
+    if (recursion_level > 0 && fabs(self->c_r) > 0) {
+      // we'll need to adjust the center
+      base_c = 0.0;
+      self->coset_sampler = dgs_disc_gauss_dp_init(coset_sigma, self->c_r, tau, DGS_DISC_GAUSS_ALIAS);
+    }
+    self->base_sampler = dgs_disc_gauss_dp_init(current_sigma, base_c, tau, DGS_DISC_GAUSS_ALIAS);
     
     break;
   }
@@ -363,6 +374,11 @@ long dgs_disc_gauss_dp_call_convolution(dgs_disc_gauss_dp_t *self) {
   for (int i = 0; i < self->n_coefficients; ++i) {
     x += self->coefficients[i]*self->base_sampler->call(self->base_sampler);
   }
+  
+  // adjust center if necessary
+  if (self->coset_sampler) {
+    x += self->coset_sampler->call(self->coset_sampler);
+  }
   return x + self->c_z;
 }
 
diff --git a/tests/test_gauss_z.c b/tests/test_gauss_z.c
index 383a66e..85a018d 100644
--- a/tests/test_gauss_z.c
+++ b/tests/test_gauss_z.c
@@ -198,9 +198,17 @@ int main(int argc, char *argv[]) {
   test_mean_dp( 2.0, 1.347, 6, DGS_DISC_GAUSS_ALIAS);
   printf("\n");
 
-  // testing mean for convolution also not suitable, since 
-  // it is likely to fail with large sigma (which is the only
-  // type that makes sense for convolution)
+  // one would hope to test the convolution sampler with larger
+  // sigma, but in that case the mean test is likely to fail
+  // even if the sampler is correct. We still do some tests but this
+  // is mostly to ensure that the alias sampler is instatiated 
+  // correctly
+  test_mean_dp( 3.0, 0.0, 6, DGS_DISC_GAUSS_CONVOLUTION);
+  test_mean_dp(10.0, 0.0, 6, DGS_DISC_GAUSS_CONVOLUTION);
+  test_mean_dp( 3.3, 1.0, 6, DGS_DISC_GAUSS_CONVOLUTION);
+  test_mean_dp( 2.0, 2.0, 6, DGS_DISC_GAUSS_CONVOLUTION);
+  test_mean_dp( 2.0, 1.347, 6, DGS_DISC_GAUSS_CONVOLUTION);
+  printf("\n");
 
   return 0;
 }

From 7188a744ca87cf8e13b6092501310a979b84d1b1 Mon Sep 17 00:00:00 2001
From: Michael Walter <michael.walter@ist.ac.at>
Date: Mon, 11 Jun 2018 17:03:55 +0200
Subject: [PATCH 3/8] gauss convolution in mp

---
 dgs/dgs_gauss.h    |  23 ++++++-
 dgs/dgs_gauss_dp.c |   3 +
 dgs/dgs_gauss_mp.c | 165 ++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 189 insertions(+), 2 deletions(-)

diff --git a/dgs/dgs_gauss.h b/dgs/dgs_gauss.h
index cd30b4e..13f4fc4 100644
--- a/dgs/dgs_gauss.h
+++ b/dgs/dgs_gauss.h
@@ -575,9 +575,20 @@ typedef struct _dgs_disc_gauss_mp_t {
     /**
    * Tables required for alias sampling.
    */
-   
   mpz_t* alias;
   dgs_bern_mp_t** bias;
+  
+  /**
+   * Base sampler for convolution
+   */
+  struct _dgs_disc_gauss_mp_t* base_sampler;
+  size_t n_coefficients;
+  mpz_t* coefficients;
+  
+  /**
+   * Sampler to adjust center in convolution
+   */
+  struct _dgs_disc_gauss_mp_t* coset_sampler;
 
 } dgs_disc_gauss_mp_t;
 
@@ -620,6 +631,16 @@ void dgs_disc_gauss_mp_call_uniform_table_offset(mpz_t rop, dgs_disc_gauss_mp_t
  */
 void dgs_disc_gauss_mp_call_alias(mpz_t rop, dgs_disc_gauss_mp_t *self, gmp_randstate_t state);
 
+/**
+   Sample from ``dgs_disc_gauss_mp_t`` by convolution sampling 
+   (base sampler = alias sampling). This can be used to reduce 
+   the memory overhead and setup costs of alias sampling for 
+   wide distributions, at the cost of increasing running time.
+
+   :param self: discrete Gaussian sampler
+ */
+void dgs_disc_gauss_mp_call_convolution(mpz_t rop, dgs_disc_gauss_mp_t *self, gmp_randstate_t state);
+
 /**
   Sample from ``dgs_disc_gauss_mp_t`` by rejection sampling using the uniform
   distribution replacing all ``exp()`` calls with call to Bernoulli distributions.
diff --git a/dgs/dgs_gauss_dp.c b/dgs/dgs_gauss_dp.c
index 7a7679e..ed75c3b 100644
--- a/dgs/dgs_gauss_dp.c
+++ b/dgs/dgs_gauss_dp.c
@@ -433,6 +433,9 @@ void dgs_disc_gauss_dp_clear(dgs_disc_gauss_dp_t *self) {
   if (self->coefficients) {
     free(self->coefficients);
   }
+  if (self->coset_sampler) {
+    dgs_disc_gauss_dp_clear(self->coset_sampler);
+  }
   
   free(self);
 }
diff --git a/dgs/dgs_gauss_mp.c b/dgs/dgs_gauss_mp.c
index d793977..e2abf84 100644
--- a/dgs/dgs_gauss_mp.c
+++ b/dgs/dgs_gauss_mp.c
@@ -156,7 +156,7 @@ void dgs_disc_gauss_sigma2p_clear(dgs_disc_gauss_sigma2p_t *self) {
 /** GENERAL SIGMA :: INIT **/
 
 static inline void _dgs_disc_gauss_mp_init_f(mpfr_t f, const mpfr_t sigma) {
-  mpfr_init2(f, mpfr_get_prec(sigma));
+  // we are assuming, mpfr_init was called on f already
   mpfr_set(f, sigma, MPFR_RNDN);
   mpfr_sqr(f, f, MPFR_RNDN); // f = σ²
   mpfr_mul_ui(f, f, 2, MPFR_RNDN); // f = 2 σ²
@@ -209,6 +209,7 @@ dgs_disc_gauss_mp_t *dgs_disc_gauss_mp_init(const mpfr_t sigma, const mpfr_t c,
   mpz_init(self->k);
   mpfr_init2(self->y, prec);
   mpfr_init2(self->z, prec);
+  mpfr_init2(self->f, prec);
 
   mpfr_init2(self->sigma, prec);
   mpfr_set(self->sigma, sigma, MPFR_RNDN);
@@ -423,6 +424,137 @@ dgs_disc_gauss_mp_t *dgs_disc_gauss_mp_init(const mpfr_t sigma, const mpfr_t c,
     mpfr_clear(p);
     break;
   }
+  
+  case DGS_DISC_GAUSS_CONVOLUTION: {
+    self->call = dgs_disc_gauss_mp_call_convolution;
+    
+    //~ double sigma1 = sigma;
+    //~ double coset_sigma = sqrt(2)*eta;
+    mpfr_t eta, sigma1, coset_sigma;
+    mpfr_inits2(prec, eta, sigma1, coset_sigma, NULL);
+    
+    //~ double eta = sqrt((p+1)*log(2))/pi;
+    mpfr_const_log2(eta, MPFR_RNDN);
+    mpfr_const_pi(self->y, MPFR_RNDN);
+    mpfr_mul_si(eta, eta, prec + 1, MPFR_RNDN);
+    mpfr_sqrt(eta, eta, MPFR_RNDN);
+    mpfr_div(eta, eta, self->y, MPFR_RNDN);
+    
+    mpfr_log(eta, eta, MPFR_RNDN);
+    
+    mpfr_set(sigma1, sigma, MPFR_RNDN);
+    mpfr_set_si(coset_sigma, 2, MPFR_RNDN);
+    mpfr_sqrt(coset_sigma, coset_sigma, MPFR_RNDN);
+    mpfr_mul(coset_sigma, coset_sigma, eta, MPFR_RNDN);
+    
+    if (!mpfr_zero_p(self->c_r)) {
+      // we might need to adjust the center
+      //~ sigma1 = sqrt(sigma*sigma - coset_sigma*coset_sigma);
+      mpfr_mul(sigma1, sigma1, sigma1, MPFR_RNDN);
+      
+      mpfr_set(self->z, coset_sigma, MPFR_RNDN);
+      mpfr_mul(self->z, self->z, self->z, MPFR_RNDN);
+      
+      mpfr_sub(sigma1, sigma1, self->z, MPFR_RNDN);
+      mpfr_sqrt(sigma1, sigma1, MPFR_RNDN);
+    }
+    
+    long table_size = 2*ceil(mpfr_get_ui(sigma1, MPFR_RNDU)*tau) * (sizeof(dgs_bern_mp_t) + sizeof(mpz_t));
+    int recursion_level = 0;
+    // for computing the recusrion level, we can probably get away with doubles:
+    double current_sigma = mpfr_get_d(sigma1, MPFR_RNDN);
+    long z1 = 1;
+    long z2 = 1;
+    
+    // compute recursion level for convolution
+    while (table_size > (DGS_DISC_GAUSS_MAX_TABLE_SIZE_BYTES >> 1)) {
+      recursion_level++;
+      z1 = floor(sqrt(current_sigma/(mpfr_get_d(eta, MPFR_RNDN)*2)));
+      if (z1 == 0) {
+        dgs_disc_gauss_mp_clear(self);
+        dgs_die("MAX_TABLE_SIZE too small to store alias sampler!");
+      }
+      z2 = (z1 > 1) ? z1 - 1 : 1;
+      current_sigma /= (sqrt(z1*z1 + z2*z2));
+      table_size = 2*ceil(current_sigma*tau) * (sizeof(dgs_bern_mp_t) + sizeof(mpz_t));
+    }
+    
+    self->n_coefficients = 1 << recursion_level;
+    self->coefficients = (mpz_t*)calloc(self->n_coefficients, sizeof(mpz_t));
+    if (!self->coefficients){
+      dgs_disc_gauss_mp_clear(self);
+      dgs_die("out of memory");
+    }
+    for (int i = 0; i < self->n_coefficients; ++i) {
+      //~ self->coefficients[i] = 1;
+      mpz_init(self->coefficients[i]);
+      mpz_set_si(self->coefficients[i], 1);
+    }
+    
+    // if there is no convolution, we simply forward to alias and
+    // so we won't need adjustment of sigma, even if sampling off center
+    //~ current_sigma = (recursion_level == 0)? sigma : sigma1;
+    
+    if (recursion_level == 0) {
+      mpfr_set(self->y, sigma, MPFR_RNDN);
+    } else {
+      mpfr_set(self->y, sigma1, MPFR_RNDN);
+    }
+    
+    // redo above computation to store coefficients
+    for (int i = 0; i < recursion_level; ++i) {
+      //~ z1 = floor(sqrt(current_sigma/(eta*2)));
+      mpfr_set(self->z, self->y, MPFR_RNDN);
+      mpfr_div(self->z, self->z, eta, MPFR_RNDN);
+      mpfr_div_si(self->z, self->z, 2, MPFR_RNDN);
+      mpfr_sqrt(self->z, self->z, MPFR_RNDN);
+      mpfr_get_z(self->x, self->z, MPFR_RNDZ);
+      
+      //~ z2 = (z1 > 1) ? z1 - 1 : 1;
+      int tmp = (mpz_cmp_si(self->x, 1) > 0);
+      mpz_sub_ui(self->x2, self->x, tmp);
+      
+      // we unroll the recursion on the coefficients on the fly
+      // so we don't have to use recursion during the call
+      int off = (1 << recursion_level - i - 1);
+      for (int j = 0; j < (1 << i); ++j) {
+        for (int k = 0; k < off;++k) {
+          //~ self->coefficients[2*j*off + k] *= z1;
+          mpz_mul(self->coefficients[2*j*off + k], self->coefficients[2*j*off + k], self->x);
+        }
+      }
+      
+      for (int j = 0; j < (1 << i); ++j) {
+        for (int k = 0; k < off;++k) {
+          //~ self->coefficients[(2*j + 1)*off + k] *= z2;
+          mpz_mul(self->coefficients[(2*j + 1)*off + k], self->coefficients[(2*j + 1)*off + k], self->x2);
+        }
+      }
+      
+      
+      //~ current_sigma /= (sqrt(z1*z1 + z2*z2));
+      mpz_mul(self->x, self->x, self->x);
+      mpz_mul(self->x2, self->x2, self->x2);
+      mpfr_set_z(self->z, self->x, MPFR_RNDN);
+      mpfr_add_z(self->z, self->z, self->x2, MPFR_RNDN);
+      mpfr_sqrt(self->z, self->z, MPFR_RNDN);
+      mpfr_div(self->y, self->y, self->z, MPFR_RNDN);
+    }
+    
+    //~ double base_c = self->c_r;
+    mpfr_set(self->z, self->c_r, MPFR_RNDN);
+    self->coset_sampler = NULL;
+    //~ if (recursion_level > 0 && fabs(self->c_r) > 0) {
+    if (recursion_level > 0 && !mpfr_zero_p(self->c_r)) {
+      // we'll need to adjust the center
+      //~ base_c = 0.0;
+      mpfr_set_zero(self->z, 0);
+      self->coset_sampler = dgs_disc_gauss_mp_init(coset_sigma, self->c_r, tau, DGS_DISC_GAUSS_ALIAS);
+    }
+    self->base_sampler = dgs_disc_gauss_mp_init(self->y, self->z, tau, DGS_DISC_GAUSS_ALIAS);
+    
+    break;
+  }
 
   default:
     free(self);
@@ -476,6 +608,23 @@ void dgs_disc_gauss_mp_call_alias(mpz_t rop, dgs_disc_gauss_mp_t *self, gmp_rand
   mpz_add(rop, rop, self->c_z);
 }
 
+void dgs_disc_gauss_mp_call_convolution(mpz_t rop, dgs_disc_gauss_mp_t *self, gmp_randstate_t state) {
+  mpz_set_si(rop, 0);
+  for (int i = 0; i < self->n_coefficients; ++i) {
+    self->base_sampler->call(self->x, self->base_sampler, state);
+    mpz_mul(self->x, self->x, self->coefficients[i]);
+    mpz_add(rop, rop, self->x);
+  }
+  
+  // adjust center if necessary
+  if (self->coset_sampler) {
+    self->coset_sampler->call(self->x, self->coset_sampler, state);
+    mpz_add(rop, rop, self->x);
+  }
+  
+  mpz_add(rop, rop, self->c_z);
+}
+
 void dgs_disc_gauss_mp_call_uniform_online(mpz_t rop, dgs_disc_gauss_mp_t *self, gmp_randstate_t state) {
   do {
     mpz_urandomm(self->x, state, self->two_upper_bound_minus_one);
@@ -577,5 +726,19 @@ void dgs_disc_gauss_mp_clear(dgs_disc_gauss_mp_t *self) {
   if (self->two_upper_bound_minus_one)
     mpz_clear(self->two_upper_bound_minus_one);
 
+  if (self->base_sampler) {
+    dgs_disc_gauss_mp_clear(self->base_sampler);
+  }
+  if (self->coefficients) {
+    for (int i = 0; i < self->n_coefficients;++i) {
+      if (self->coefficients[i])
+        mpz_clear(self->coefficients[i]);
+    }
+    free(self->coefficients);
+  }
+  if (self->coset_sampler) {
+    dgs_disc_gauss_mp_clear(self->coset_sampler);
+  }
+  
   free(self);
 }

From 7da12c31c2b85da0212ef5f3a7124ea3b8f8e4e8 Mon Sep 17 00:00:00 2001
From: Michael Walter <michael.walter@ist.ac.at>
Date: Fri, 15 Jun 2018 13:41:29 +0200
Subject: [PATCH 4/8] documentation for (fixed) convolution sampler

---
 README.md                | 16 ++++++++++++++++
 benchmarks/bench.c       |  2 ++
 benchmarks/bench_gauss.c |  1 +
 dgs/dgs_gauss.h          |  9 +++++++++
 4 files changed, 28 insertions(+)

diff --git a/README.md b/README.md
index c9e6928..b4db9b9 100644
--- a/README.md
+++ b/README.md
@@ -68,6 +68,10 @@ provided during initialization. Available algorithms are:
     Setup costs are roughly $σ²$ (as currently implemented) and table sizes linear
     in $σ$, but sampling is then just a randomized lookup. Any real-valued $c$ is 
     accepted.
+  
+  - `DGS_DISC_GAUSS_CONVOLUTION` - Applies the convolution technique to alias
+    sampling in order to reduce memory overhead and setup cost at the cost of
+    running time. This is suitable for large $σ$. Any real-valued $c$ is accepted.
     
   Algorithm 2-4 are described in:
 
@@ -75,6 +79,18 @@ provided during initialization. Available algorithms are:
   Signatures and Bimodal Gaussians*; in Advances in Cryptology – CRYPTO 2013;
   Lecture Notes in Computer Science Volume 8042, 2013, pp 40-56
   [(PDF)](http://www.di.ens.fr/~lyubash/papers/bimodal.pdf)
+  
+  Algorithm 6 is described in:
+  
+  Thomas Pöppelmann, Léo Ducas, Tim Güneysu. *Enhanced Lattice-Based Signatures 
+  on Reconfigurable Hardware*; in Cryptographic Hardware and Embedded 
+  Systems – CHES 2014; Lecture Notes in Computer Science Volume 8731, 
+  2014, pp 353-370 [(PDF)](https://eprint.iacr.org/2014/254.pdf)
+  
+  Daniele Micciancio, Michael Walter. *Gaussian Sampling over the Integers: 
+  Efficient, Generic, Constant-Time*; in Advances in Cryptology – CRYPTO 2017;
+  Lecture Notes in Computer Science Volume 10402, 2017, pp 455-485
+  [(PDF)](https://eprint.iacr.org/2017/259.pdf) 
 
 ### Randomized Rounders
 This type of algorithm is useful to produce samples where the parameters of 
diff --git a/benchmarks/bench.c b/benchmarks/bench.c
index cd977b1..d475d79 100644
--- a/benchmarks/bench.c
+++ b/benchmarks/bench.c
@@ -23,6 +23,8 @@ void print_gauss_z_help(const char *name) {
   printf("      %d -- sample from uniform distribution, exp() calls tabulated\n", DGS_DISC_GAUSS_UNIFORM_TABLE);
   printf("      %d -- sample from uniform distribution, exp() calls as Bernoulli oracles\n", DGS_DISC_GAUSS_UNIFORM_LOGTABLE);
   printf("      %d -- sample from k⋅σ2 distribution, exp() calls as Bernoulli oracles \n", DGS_DISC_GAUSS_SIGMA2_LOGTABLE);
+  printf("      %d -- use alias method (precomputed table) \n", DGS_DISC_GAUSS_ALIAS);
+  printf("      %d -- use convolution technique with alias method \n", DGS_DISC_GAUSS_CONVOLUTION);
   printf(" p -- precision: 0 for double precision, 1 for arbitrary precision\n");
   printf(" n -- number of trials > 0 (default: 100000)\n");
 }
diff --git a/benchmarks/bench_gauss.c b/benchmarks/bench_gauss.c
index b85a443..d4cab3e 100644
--- a/benchmarks/bench_gauss.c
+++ b/benchmarks/bench_gauss.c
@@ -72,6 +72,7 @@ const char *alg_to_str(dgs_disc_gauss_alg_t alg) {
   case DGS_DISC_GAUSS_UNIFORM_LOGTABLE: return "uniform+logtable";
   case DGS_DISC_GAUSS_SIGMA2_LOGTABLE: return "sigma2+logtable";
   case DGS_DISC_GAUSS_ALIAS:           return "alias";
+  case DGS_DISC_GAUSS_CONVOLUTION:           return "convolution";
   default: return "unknown";
   }
 }
diff --git a/dgs/dgs_gauss.h b/dgs/dgs_gauss.h
index 13f4fc4..c12210f 100644
--- a/dgs/dgs_gauss.h
+++ b/dgs/dgs_gauss.h
@@ -31,6 +31,15 @@
     Bernoulli distributions (but no calls to `\exp`). Note that this sampler
     adjusts sigma to match `σ₂·k` for some integer `k`.  Only integer-valued
     `c` are supported.
+   
+  - `DGS_DISC_GAUSS_ALIAS` - uses the [alias method](https://en.wikipedia.org/wiki/Alias_method).
+    Setup costs are roughly $σ²$ (as currently implemented) and table sizes linear
+    in $σ$, but sampling is then just a randomized lookup. Any real-valued $c$ is 
+    accepted.
+  
+  - `DGS_DISC_GAUSS_CONVOLUTION` - Applies the convolution technique to alias
+    sampling in order to reduce memory overhead and setup cost at the cost of
+    running time. This is suitable for large $σ$. Any real-valued $c$ is accepted.
 
   AVAILABLE PRECISIONS:
 

From 993f66a3de1c396c8540d6bb3bb0cbd3c9f540f9 Mon Sep 17 00:00:00 2001
From: Michael Walter <michael.walter@ist.ac.at>
Date: Thu, 21 Jun 2018 11:18:24 +0200
Subject: [PATCH 5/8] convolution rounder in dp

---
 dgs/dgs_gauss_dp.c  |   2 +-
 dgs/dgs_rround.h    |  30 ++++++++++++
 dgs/dgs_rround_dp.c | 109 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 140 insertions(+), 1 deletion(-)

diff --git a/dgs/dgs_gauss_dp.c b/dgs/dgs_gauss_dp.c
index ed75c3b..ebccc2f 100644
--- a/dgs/dgs_gauss_dp.c
+++ b/dgs/dgs_gauss_dp.c
@@ -257,7 +257,7 @@ dgs_disc_gauss_dp_t *dgs_disc_gauss_dp_init(double sigma, double c, size_t tau,
     long z2 = 1;
     
     // compute recursion level for convolution
-    while (table_size > (DGS_DISC_GAUSS_MAX_TABLE_SIZE_BYTES >> 1)) {
+    while (table_size > DGS_DISC_GAUSS_MAX_TABLE_SIZE_BYTES) {
       recursion_level++;
       z1 = floor(sqrt(current_sigma/(eta*2)));
       if (z1 == 0) {
diff --git a/dgs/dgs_rround.h b/dgs/dgs_rround.h
index 40ebb24..967c57b 100644
--- a/dgs/dgs_rround.h
+++ b/dgs/dgs_rround.h
@@ -21,6 +21,9 @@
  - ``DGS_RROUND_KARNEY`` - Use Karney's algorithm. This is better than 
    uniform rejection sampling.
 
+ - ``DGS_RROUND_CONVOLUTION`` - Use convolution to reduce to alias 
+   sampling. 
+
   AVAILABLE PRECISIONS:
 
   - ``mp`` - multi-precision using MPFR, cf. ``dgs_gauss_mp.c``
@@ -54,6 +57,7 @@ typedef enum {
   DGS_RROUND_DEFAULT           = 0x0, //<pick algorithm
   DGS_RROUND_UNIFORM_ONLINE    = 0x1, //<call dgs_disc_gauss_mp_call_uniform_online
   DGS_RROUND_KARNEY            = 0x2, //<call dgs_disc_gauss_mp_call_karney
+  DGS_RROUND_CONVOLUTION       = 0x3, //<call dgs_disc_gauss_mp_call_convolution
 } dgs_rround_alg_t;
 
 struct _dgs_rround_mp_t;
@@ -70,6 +74,12 @@ typedef struct _dgs_rround_dp_t {
 
   dgs_bern_uniform_t *B;
   dgs_bern_dp_t *B_half_exp;
+  
+  dgs_disc_gauss_dp_t **base_samplers;
+  int log_base, digits, flips;
+  double s_bar2, bm_sample;
+  uint64_t mask, pool;
+  
 
   dgs_rround_alg_t algorithm;  //<  which algorithm to use
 
@@ -115,6 +125,13 @@ long dgs_rround_dp_call_uniform_online(dgs_rround_dp_t *self, double sigma, doub
  */
 long dgs_rround_dp_call_karney(dgs_rround_dp_t *self, double sigma, double c);
 
+  /**
+   Sample from ``dgs_rround_dp_t`` using the convolution sampler
+
+   :param self: discrete Gaussian sampler
+ */
+long dgs_rround_dp_call_convolution(dgs_rround_dp_t *self, double sigma, double c);
+
 /**
    Free memory.
 
@@ -230,6 +247,19 @@ void dgs_rround_mp_call_uniform_online(mpz_t rop, dgs_rround_mp_t *self, const m
 
 void dgs_rround_mp_call_karney(mpz_t rop, dgs_rround_mp_t *self, const mpfr_t sigma, const mpfr_t c, gmp_randstate_t state);
 
+/**
+  Sample from ``dgs_rround_mp_t`` using the convolution sampler.
+
+  :param rop: return value
+  :param self: discrete Gaussian rounder
+  :param sigma: noise parameter
+  :param c: center
+  :param state: state
+
+ */
+
+void dgs_rround_mp_call_convolution(mpz_t rop, dgs_rround_mp_t *self, const mpfr_t sigma, const mpfr_t c, gmp_randstate_t state);
+
 
 /**
    Free memory.
diff --git a/dgs/dgs_rround_dp.c b/dgs/dgs_rround_dp.c
index e7c07b0..3b1c2b4 100644
--- a/dgs/dgs_rround_dp.c
+++ b/dgs/dgs_rround_dp.c
@@ -58,6 +58,20 @@ static inline long _dgs_rround_dp_unit_gauss(dgs_rround_dp_t *self) {
   return x;
 }
 
+double _box_muller(dgs_rround_dp_t *self) {
+  if (self->pool) {
+    self->pool--;
+    return self->bm_sample;
+  }
+  
+  double r1 = drand48();
+  double r2 = drand48();
+  
+  double R = sqrt(-2*log(r1));
+  self->bm_sample = R*cos(2*M_PI*r2);
+  self->pool = 1;
+  return R*sin(2*M_PI*r2);
+}
 
 dgs_rround_dp_t *dgs_rround_dp_init(size_t tau, dgs_rround_alg_t algorithm) {
   if (tau == 0)
@@ -87,6 +101,57 @@ dgs_rround_dp_t *dgs_rround_dp_init(size_t tau, dgs_rround_alg_t algorithm) {
     
     break;
   }
+  case DGS_RROUND_CONVOLUTION: {
+    self->call = dgs_rround_dp_call_convolution;
+    
+    self->pool = 0;
+    
+    // we set parameters so that the memory does not exceed 
+    // DGS_DISC_GAUSS_MAX_TABLE_SIZE_BYTES
+    double eta = 2;            // smoothing paramter
+    double base_sigma = 2.5;  // sigma for 2^b base samplers
+    
+    long base_sampler_size = 2*ceil(base_sigma*tau) * (sizeof(dgs_bern_dp_t) + sizeof(long));
+    int base = (DGS_DISC_GAUSS_MAX_TABLE_SIZE_BYTES)/base_sampler_size;
+    self->log_base = 0;
+    while (base >>= 1) { ++self->log_base; } // we want a power of 2
+    base = 1 << self->log_base;
+    self->mask = __DGS_LSB_BITMASK(self->log_base);
+    // we can now actually reduce base_sigma a little to save some 
+    // more memory and increase the range of this function w.r.t. sigma
+    base_sigma = sqrt(((double)(base + 1))/base)*eta; 
+    self->base_samplers = (dgs_disc_gauss_dp_t**)malloc(sizeof(dgs_disc_gauss_dp_t*)*base);
+    if (!self->base_samplers){
+      dgs_rround_dp_clear(self);
+      dgs_die("out of memory");
+    }
+    
+    for (int i = 0; i < base; ++i) {
+      self->base_samplers[i] = dgs_disc_gauss_dp_init(base_sigma, ((double)i)/base, tau, DGS_DISC_GAUSS_ALIAS);
+    }
+    
+    // we assume for the precision we're targeting here using 
+    // gaussian rounding on the first 25 bits is sufficient
+    // do the rest using bernoulli (linear interpolation of gaussian)
+    self->digits = (int) ceil(25.0/self->log_base);
+    
+    // compute rr_sigma2
+    self->s_bar2 = 1;
+    long double t = 1.0/ (base*base);
+    long double s = 1;
+    for (int i = 1; i < self->digits; ++i) {
+        s *= t;
+        self->s_bar2 += s;
+    }
+    self->s_bar2 *= (base_sigma*base_sigma);
+    
+    // we use karney as fallback for small sigma, so we initialize it
+    self->B = dgs_bern_uniform_init(0);
+    self->B_half_exp = dgs_bern_dp_init(exp(-.5));
+    
+    break;
+  }
+  
   default:
     dgs_rround_dp_clear(self);
     dgs_die("unknown algorithm %d", algorithm);
@@ -147,10 +212,54 @@ long dgs_rround_dp_call_karney(dgs_rround_dp_t *self, double sigma, double c) {
   } while (1);
 }
 
+long dgs_rround_dp_call_convolution(dgs_rround_dp_t *self, double sigma, double c) {
+  double sigma2 = sigma*sigma;
+  
+  // we need sigma to be larger than s_bar
+  // otherwise fall back to karney
+  if (self->s_bar2 > sigma2) {
+    return dgs_rround_dp_call_karney(self, sigma, c);
+  }
+  double K = sqrt(sigma2 - self->s_bar2);
+  double xr = _box_muller(self);
+  double c1 = c + K*xr;
+  
+  long c1_z = (long) floor(c1);
+  c1 -= floor(c1); // 0 <= c1 < 1
+  
+  c1 *= (1UL << self->digits*self->log_base);
+  int64_t center = (int64_t) c1;
+  c1 -= center; // 0 <= c1 < 1
+  
+  if (drand48() < c1)
+    ++center;
+  
+  for (int i = 0; i < self->digits; ++i) {
+    long x = self->base_samplers[center & self->mask]->call(self->base_samplers[center & self->mask]);
+    if ( (self->mask & center) > 0 && center < 0)
+            x -= 1;
+    
+    for (int j = 0; j < self->log_base; ++j) {
+      center /= 2;
+    }
+    center += x;
+  }
+  
+  return center + c1_z;
+}
+
 void dgs_rround_dp_clear(dgs_rround_dp_t *self) {
   assert(self != NULL);
   if (self->B) dgs_bern_uniform_clear(self->B);
   if (self->B_half_exp) dgs_bern_dp_clear(self->B_half_exp);
+  if (self->base_samplers) {
+    for (int i = 0; i < (1 << self->log_base); ++i) {
+      if (self->base_samplers[i]) {
+        dgs_disc_gauss_dp_clear(self->base_samplers[i]);
+      }
+    }
+    free(self->base_samplers);
+  }
   
   free(self);
 }

From 9be2c9742456c3b9175a771fc0b484a81ad024d4 Mon Sep 17 00:00:00 2001
From: Michael Walter <michael.walter@ist.ac.at>
Date: Fri, 22 Jun 2018 12:05:21 +0200
Subject: [PATCH 6/8] convolution rounder in mp

---
 dgs/dgs_gauss_mp.c  |  13 +++-
 dgs/dgs_rround.h    |  10 +++
 dgs/dgs_rround_dp.c |   2 +
 dgs/dgs_rround_mp.c | 186 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 207 insertions(+), 4 deletions(-)

diff --git a/dgs/dgs_gauss_mp.c b/dgs/dgs_gauss_mp.c
index e2abf84..b2b8f97 100644
--- a/dgs/dgs_gauss_mp.c
+++ b/dgs/dgs_gauss_mp.c
@@ -407,7 +407,10 @@ dgs_disc_gauss_mp_t *dgs_disc_gauss_mp_init(const mpfr_t sigma, const mpfr_t c,
     
     mpfr_t p;
     mpfr_init2(p, prec);
-    while(mpfr_cmp_d(self->z,0) > 0) {
+    
+    // we must be done after range rounds
+    int n = 0;
+    while(mpfr_cmp_d(self->z,0) > 0 && n < range) {
       high = _dgs_disc_gauss_mp_max_in_rho(self, range);
       mpfr_mul_z(p, self->rho[low], self->two_upper_bound_minus_one, MPFR_RNDN);
       
@@ -419,6 +422,7 @@ dgs_disc_gauss_mp_t *dgs_disc_gauss_mp_init(const mpfr_t sigma, const mpfr_t c,
       
       low = _dgs_disc_gauss_mp_min_in_rho(self, range);
       mpfr_sub(self->z, self->y, self->rho[low], MPFR_RNDD); // z = avg - rho[low]
+      ++n;
     }
     
     mpfr_clear(p);
@@ -440,8 +444,6 @@ dgs_disc_gauss_mp_t *dgs_disc_gauss_mp_init(const mpfr_t sigma, const mpfr_t c,
     mpfr_sqrt(eta, eta, MPFR_RNDN);
     mpfr_div(eta, eta, self->y, MPFR_RNDN);
     
-    mpfr_log(eta, eta, MPFR_RNDN);
-    
     mpfr_set(sigma1, sigma, MPFR_RNDN);
     mpfr_set_si(coset_sigma, 2, MPFR_RNDN);
     mpfr_sqrt(coset_sigma, coset_sigma, MPFR_RNDN);
@@ -461,7 +463,7 @@ dgs_disc_gauss_mp_t *dgs_disc_gauss_mp_init(const mpfr_t sigma, const mpfr_t c,
     
     long table_size = 2*ceil(mpfr_get_ui(sigma1, MPFR_RNDU)*tau) * (sizeof(dgs_bern_mp_t) + sizeof(mpz_t));
     int recursion_level = 0;
-    // for computing the recusrion level, we can probably get away with doubles:
+    // for computing the recursion level, we can probably get away with doubles:
     double current_sigma = mpfr_get_d(sigma1, MPFR_RNDN);
     long z1 = 1;
     long z2 = 1;
@@ -551,8 +553,11 @@ dgs_disc_gauss_mp_t *dgs_disc_gauss_mp_init(const mpfr_t sigma, const mpfr_t c,
       mpfr_set_zero(self->z, 0);
       self->coset_sampler = dgs_disc_gauss_mp_init(coset_sigma, self->c_r, tau, DGS_DISC_GAUSS_ALIAS);
     }
+    
     self->base_sampler = dgs_disc_gauss_mp_init(self->y, self->z, tau, DGS_DISC_GAUSS_ALIAS);
     
+    mpfr_clears(eta, sigma1, coset_sigma, (mpfr_ptr) NULL);
+    
     break;
   }
 
diff --git a/dgs/dgs_rround.h b/dgs/dgs_rround.h
index 967c57b..171f6ab 100644
--- a/dgs/dgs_rround.h
+++ b/dgs/dgs_rround.h
@@ -49,6 +49,10 @@
 
 #include "dgs_gauss.h"
 
+#define DGS_RROUND_SIGMA_LOG2_MAX 30
+#define DGS_RROUND_SIGMA_MAX (1 << DGS_RROUND_SIGMA_LOG2_MAX)
+
+
 /**
    Available Algorithms
 */
@@ -217,6 +221,12 @@ typedef struct _dgs_rround_mp_t {
   mpfr_t y; // space for temporary rational number
   mpfr_t z; // space for temporary rational number
   mpfr_t tmp;
+  
+  dgs_disc_gauss_mp_t *wide_sampler;
+  dgs_disc_gauss_mp_t **base_samplers;
+  int log_base, digits, flips, pool;
+  mpfr_t s_bar2, bm_sample;
+  uint64_t mask;
 } dgs_rround_mp_t;
 
 dgs_rround_mp_t *dgs_rround_mp_init(size_t tau, dgs_rround_alg_t algorithm, mpfr_prec_t prec);
diff --git a/dgs/dgs_rround_dp.c b/dgs/dgs_rround_dp.c
index 3b1c2b4..0a34351 100644
--- a/dgs/dgs_rround_dp.c
+++ b/dgs/dgs_rround_dp.c
@@ -221,6 +221,8 @@ long dgs_rround_dp_call_convolution(dgs_rround_dp_t *self, double sigma, double
     return dgs_rround_dp_call_karney(self, sigma, c);
   }
   double K = sqrt(sigma2 - self->s_bar2);
+  // we use continuous gaussians instead of wide samplers
+  // this is faster and (provably) works the same way
   double xr = _box_muller(self);
   double c1 = c + K*xr;
   
diff --git a/dgs/dgs_rround_mp.c b/dgs/dgs_rround_mp.c
index b9d3898..57761a6 100644
--- a/dgs/dgs_rround_mp.c
+++ b/dgs/dgs_rround_mp.c
@@ -94,6 +94,8 @@ dgs_rround_mp_t *dgs_rround_mp_init(size_t tau, dgs_rround_alg_t algorithm, mpfr
   
   mpfr_init2(self->tmp, prec);
   
+  mpfr_inits2(prec, self->s_bar2, self->bm_sample, (mpfr_ptr)NULL);
+  
   self->tau = tau;
   
   mpz_init(self->sigma_z);
@@ -127,6 +129,99 @@ dgs_rround_mp_t *dgs_rround_mp_init(size_t tau, dgs_rround_alg_t algorithm, mpfr
     
     break;
   }
+  
+  case DGS_RROUND_CONVOLUTION: {
+    self->call = dgs_rround_mp_call_convolution;
+    
+    // we set parameters so that the memory does not exceed 
+    // 2*DGS_DISC_GAUSS_MAX_TABLE_SIZE_BYTES
+    //~ double eta = sqrt((p+1)*log(2))/pi;
+    mpfr_set_ui(self->tmp, DGS_RROUND_SIGMA_MAX, MPFR_RNDN);
+    mpfr_set_d(self->y, 0.0, MPFR_RNDN);
+    self->wide_sampler = dgs_disc_gauss_mp_init(self->tmp, self->y, tau, DGS_DISC_GAUSS_CONVOLUTION);
+    
+    mpfr_t eta, base_sigma;
+    mpfr_inits2(prec, eta, base_sigma, NULL);
+    self->pool = 0;
+    mpfr_const_log2(eta, MPFR_RNDN);
+    mpfr_const_pi(self->y, MPFR_RNDN);
+    mpfr_mul_si(eta, eta, prec + 1, MPFR_RNDN);
+    mpfr_sqrt(eta, eta, MPFR_RNDN);
+    mpfr_div(eta, eta, self->y, MPFR_RNDN);
+    
+    //~ double base_sigma = 2.5;  // sigma for 2^b base samplers
+    mpfr_set_si(base_sigma, 2, MPFR_RNDN);
+    mpfr_sqrt(base_sigma, base_sigma, MPFR_RNDN);
+    mpfr_mul(base_sigma, base_sigma, eta, MPFR_RNDN);
+    
+    long base_sampler_size = 2*ceil(mpfr_get_ui(base_sigma, MPFR_RNDU)*tau) * (sizeof(dgs_bern_mp_t) + sizeof(mpz_t));
+    int base = (DGS_DISC_GAUSS_MAX_TABLE_SIZE_BYTES)/base_sampler_size;
+    self->log_base = 0;
+    while (base >>= 1) { ++self->log_base; } // we want a power of 2
+    base = 1 << self->log_base;
+    self->mask = __DGS_LSB_BITMASK(self->log_base);
+    
+    // we can now actually reduce base_sigma a little to save some 
+    // more memory and increase the range of this function w.r.t. sigma
+    // base_sigma = sqrt(((double)(base + 1))/base)*eta; 
+    mpfr_set_si(base_sigma, base, MPFR_RNDN);
+    mpfr_add_si(base_sigma, base_sigma, 1, MPFR_RNDN);
+    mpfr_div_si(base_sigma, base_sigma, base, MPFR_RNDN);
+    mpfr_sqrt(base_sigma, base_sigma, MPFR_RNDN);
+    mpfr_mul(base_sigma, base_sigma, eta, MPFR_RNDN);
+    
+    self->base_samplers = (dgs_disc_gauss_mp_t**)malloc(sizeof(dgs_disc_gauss_mp_t*)*base);
+    if (!self->base_samplers){
+      dgs_rround_mp_clear(self);
+      dgs_die("out of memory");
+    }
+    
+    for (int i = 0; i < base; ++i) {
+      // center = ((double)i)/base
+      mpfr_set_d(self->tmp, ((double)i)/base, MPFR_RNDN);
+      self->base_samplers[i] = dgs_disc_gauss_mp_init(base_sigma, self->tmp, tau, DGS_DISC_GAUSS_ALIAS);
+    }
+    
+    // round half the precision bits using gaussian rounding
+    // do the rest using bernoulli (linear interpolation of gaussian)
+    self->digits = (int) ceil(((double)prec)/(2*self->log_base));
+    
+    // compute rr_sigma2
+    //~ self->s_bar2 = 1;
+    mpfr_set_si(self->s_bar2, 1, MPFR_RNDN);
+    
+    //~ long double t = 1.0/ (base*base);
+    mpfr_set_si(self->y, base, MPFR_RNDN); // y = t
+    mpfr_mul(self->y, self->y, self->y, MPFR_RNDN);
+    mpfr_si_div(self->y, 1, self->y, MPFR_RNDN);
+    
+    //~ long double s = 1;
+    mpfr_set_si(self->z, 1, MPFR_RNDN); // z = s
+    for (int i = 1; i < self->digits; ++i) {
+        //~ s *= t;
+        mpfr_mul(self->z, self->z, self->y, MPFR_RNDN);
+        //~ self->s_bar2 += s;
+        mpfr_add(self->s_bar2, self->s_bar2, self->z, MPFR_RNDN);
+    }
+    //~ self->s_bar2 *= (base_sigma*base_sigma);
+    mpfr_mul(self->s_bar2, self->s_bar2, base_sigma, MPFR_RNDN);
+    mpfr_mul(self->s_bar2, self->s_bar2, base_sigma, MPFR_RNDN);
+    
+    // we use karney as fallback for small sigma, so we initialize it
+    //~ self->B = dgs_bern_uniform_init(0);
+    //~ self->B_half_exp = dgs_bern_dp_init(exp(-.5));
+    self->B = dgs_bern_uniform_init(0);
+    
+    char half_exp_str[] = "0.60653065971263342360379953499118045344191813548718695568289";
+    mpfr_t half_exp; mpfr_init2(half_exp, prec);
+    mpfr_set_str(half_exp, half_exp_str, 10, MPFR_RNDN);
+    
+    self->B_half_exp = dgs_bern_mp_init(half_exp);
+    
+    mpfr_clears(eta, base_sigma, (mpfr_ptr) NULL);
+    
+    break;
+  }
   default:
     free(self);
     dgs_die("unknown algorithm %d", algorithm);
@@ -224,6 +319,81 @@ void dgs_rround_mp_call_karney(mpz_t rop, dgs_rround_mp_t *self, const mpfr_t si
   } while (1);
 }
 
+void dgs_rround_mp_call_convolution(mpz_t rop, dgs_rround_mp_t *self, const mpfr_t sigma, const mpfr_t c, gmp_randstate_t state) {
+  //~ double sigma2 = sigma*sigma;
+  mpfr_mul(self->y, sigma, sigma, MPFR_RNDN); // sigma2 = self->y
+  
+  // we need sigma to be larger than s_bar and smaller than sigma_max
+  // otherwise fall back to karney
+  //~ if (self->s_bar2 > sigma2) {
+  if (mpfr_cmp(self->s_bar2, self->y) > 0 || mpfr_cmp_ui(sigma, DGS_RROUND_SIGMA_MAX) > 0) {
+    dgs_rround_mp_call_karney(rop, self, sigma, c, state);
+    return;
+  }
+  //~ double K = sqrt(sigma2 - self->s_bar2);
+  mpfr_sub(self->y, self->y, self->s_bar2, MPFR_RNDN);
+  mpfr_sqrt(self->y, self->y, MPFR_RNDN); // self->y = K
+  
+  mpfr_div_2si(self->y, self->y, DGS_RROUND_SIGMA_LOG2_MAX, MPFR_RNDN);
+  
+  // we use a dicreet wide sampler instead of a standard continuous 
+  // gaussian (like in dp) here, because this is faster than mpfr_grandom
+  // once nrandom is available (MPFR 4) it might be better to switch
+  // back to continuous gaussians to adjust width
+  self->wide_sampler->call(self->x, self->wide_sampler, state);
+  
+  mpfr_mul_z(self->y, self->y, self->x, MPFR_RNDN);
+  
+  //~ double c1 = c + K*xr;
+  mpfr_add(self->y, self->y, c, MPFR_RNDN); // c1 = self->y
+  
+  //~ long c1_z = (long) floor(c1);
+  //~ c1 -= floor(c1); // 0 <= c1 < 1
+  mpfr_get_z(self->c_z, self->y, MPFR_RNDD);
+  mpfr_sub_z(self->y, self->y, self->c_z, MPFR_RNDN);
+  assert(mpfr_sgn(self->y) >= 0);
+  assert(mpfr_cmp_ui(self->y, 1) < 1);
+  //~ c1 *= (1UL << self->digits*self->log_base);
+  //~ int64_t center = (int64_t) c1;
+  //~ c1 -= center; // 0 <= c1 < 1
+  mpfr_mul_2ui(self->y, self->y, self->digits*self->log_base, MPFR_RNDN);
+  mpfr_get_z(rop, self->y, MPFR_RNDD);
+  mpfr_sub_z(self->y, self->y, rop, MPFR_RNDN);
+  assert(mpfr_sgn(self->y) >= 0);
+  assert(mpfr_cmp_ui(self->y, 1) < 1);
+  
+  //~ if (drand48() < c1)
+    //~ ++center;
+  mpfr_urandom(self->z, state, MPFR_RNDN);
+  //~ if (mpfr_cmp(self->y, self->z) > 0) {
+  if (mpfr_greater_p(self->y, self->z)) {
+    mpz_add_ui(rop, rop, 1);
+  }
+  
+  long center_lsbs;
+  for (int i = 0; i < self->digits; ++i) {
+    //~ long x = self->base_samplers[center & self->mask]->call(self->base_samplers[center & self->mask]);
+    center_lsbs = mpz_get_si(rop) & self->mask; // mpz_get_si returns the LSBs that fit into signed long int
+    
+    self->base_samplers[center_lsbs]->call(self->x, self->base_samplers[center_lsbs], state);
+    //~ if ( (self->mask & center) > 0 && center < 0)
+            //~ x -= 1;
+    if ( center_lsbs > 0 && mpz_sgn(rop) < 0)
+            mpz_sub_ui(self->x, self->x, 1);
+    
+    //~ for (int j = 0; j < self->log_base; ++j) {
+      //~ center /= 2;
+    //~ }
+    //~ center += x;
+    
+    mpz_tdiv_q_2exp(rop, rop, self->log_base);
+    mpz_add(rop, rop, self->x);
+  }
+  
+  //~ return center + c1_z;
+  mpz_add(rop, rop, self->c_z);
+}
+
 void dgs_rround_mp_clear(dgs_rround_mp_t *self) {
   mpz_clear(self->x);
   mpfr_clear(self->y);
@@ -246,5 +416,21 @@ void dgs_rround_mp_clear(dgs_rround_mp_t *self) {
   if (self->B) dgs_bern_uniform_clear(self->B);
   if (self->B_half_exp) dgs_bern_mp_clear(self->B_half_exp);
 
+  if (self->s_bar2) 
+    mpfr_clear(self->s_bar2);
+  if (self->bm_sample) 
+    mpfr_clear(self->bm_sample);
+  if (self->base_samplers) {
+    for (int i = 0; i < (1 << self->log_base); ++i) {
+      if (self->base_samplers[i]) {
+        dgs_disc_gauss_mp_clear(self->base_samplers[i]);
+      }
+    }
+    free(self->base_samplers);
+  }
+  
+  if (self->wide_sampler)
+    dgs_disc_gauss_mp_clear(self->wide_sampler);
+  
   free(self);
 }

From 7b00dfbd5ed552d24791eb2507981b1532375b4e Mon Sep 17 00:00:00 2001
From: Michael Walter <michael.walter@ist.ac.at>
Date: Fri, 22 Jun 2018 12:07:19 +0200
Subject: [PATCH 7/8] tests for convolution rounder

---
 tests/test_rround_z.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/tests/test_rround_z.c b/tests/test_rround_z.c
index ba8f687..f69b985 100644
--- a/tests/test_rround_z.c
+++ b/tests/test_rround_z.c
@@ -102,6 +102,12 @@ int main(int argc, char *argv[]) {
   test_ratios_dp( 4.0, 3, DGS_RROUND_KARNEY);
   test_ratios_dp(15.4, 3, DGS_RROUND_KARNEY);
   printf("\n");
+  
+  test_ratios_dp( 3.0, 6, DGS_RROUND_CONVOLUTION);
+  test_ratios_dp( 2.0, 6, DGS_RROUND_CONVOLUTION);
+  test_ratios_dp( 4.0, 3, DGS_RROUND_CONVOLUTION);
+  test_ratios_dp(15.4, 3, DGS_RROUND_CONVOLUTION);
+  printf("\n");
 
   printf("# testing [⌊c⌋-⌈στ⌉,…, ⌊c⌋+⌈στ⌉] boundaries #\n");
   test_uniform_boundaries_dp( 3.0, 0.0, 2, DGS_RROUND_UNIFORM_ONLINE);
@@ -122,6 +128,12 @@ int main(int argc, char *argv[]) {
   test_mean_dp( 3.3, 1.0, 6, DGS_RROUND_KARNEY);
   test_mean_dp( 2.0, 1.5, 6, DGS_RROUND_KARNEY);
   printf("\n");
+  
+  test_mean_dp( 3.0, 0.0, 6, DGS_RROUND_CONVOLUTION);
+  test_mean_dp(10.0, 0.0, 6, DGS_RROUND_CONVOLUTION);
+  test_mean_dp( 3.3, 1.0, 6, DGS_RROUND_CONVOLUTION);
+  test_mean_dp( 2.0, 1.5, 6, DGS_RROUND_CONVOLUTION);
+  printf("\n");
 
   return 0;
 }

From 20b07cb25cd0f97f411a584f30c8ea591d867a87 Mon Sep 17 00:00:00 2001
From: Michael Walter <michael.walter@ist.ac.at>
Date: Fri, 22 Jun 2018 12:11:07 +0200
Subject: [PATCH 8/8] added documentation for convolution rounder

---
 README.md | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index b4db9b9..36e7757 100644
--- a/README.md
+++ b/README.md
@@ -104,12 +104,22 @@ Available algorithms are:
   
   - `DGS_RROUND_KARNEY` - Karney's algorithm, similar in spirit to the sampler 
   `DGS_DISC_SIGMA2_LOGTABLE`, but without the need to precompute log tables and
-  without restriction on the center. 
+  without restriction on the center.
+  
+  - `DGS_RROUND_CONVOLUTION` - Reduces the rounding problem to the sampling 
+  problem and invokes alias sampling. 
   
   Karney's algorithm is described in:
   
   Charles F. F. Karney. *Sampling Exactly from the Normal Distribution*; in
   ACM Trans. Mathematical Software 42(1), 3:1-14 (Jan. 2016) [(PDF)](https://arxiv.org/pdf/1303.6257)
+  
+  The convolution approach to randomized rounding is described in
+  
+  Daniele Micciancio, Michael Walter. *Gaussian Sampling over the Integers: 
+  Efficient, Generic, Constant-Time*; in Advances in Cryptology – CRYPTO 2017;
+  Lecture Notes in Computer Science Volume 10402, 2017, pp 455-485
+  [(PDF)](https://eprint.iacr.org/2017/259.pdf) 
 
 ## Precisions ##