From 8b1793c0d18835c11a1a87ad8f0995f7e69df223 Mon Sep 17 00:00:00 2001 From: Charles Bouillaguet Date: Mon, 28 Nov 2022 07:36:45 +0100 Subject: [PATCH 1/5] extract transpose from mzd.c --- m4ri/{mzd.c => transpose.c} | 939 ------------------------------------ m4ri/transpose.h | 35 ++ 2 files changed, 35 insertions(+), 939 deletions(-) rename m4ri/{mzd.c => transpose.c} (52%) create mode 100644 m4ri/transpose.h diff --git a/m4ri/mzd.c b/m4ri/transpose.c similarity index 52% rename from m4ri/mzd.c rename to m4ri/transpose.c index ba04b7c..a381123 100644 --- a/m4ri/mzd.c +++ b/m4ri/transpose.c @@ -19,219 +19,7 @@ * http://www.gnu.org/licenses/ ******************************************************************************/ -#ifdef HAVE_CONFIG_H -#include "config.h" -#endif - -#ifdef __M4RI_HAVE_LIBPNG -#include -#endif - -#include "mmc.h" #include "mzd.h" -#include "parity.h" -#include -#include - -/** - * \brief Cache of mzd_t containers - */ - -typedef struct mzd_t_cache { - mzd_t mzd[64]; /*!< cached matrices */ - struct mzd_t_cache *prev; /*!< previous block */ - struct mzd_t_cache *next; /*!< next block */ - uint64_t used; /*!< bitmasks which matrices in this block are used */ - unsigned char padding[sizeof(mzd_t) - 2 * sizeof(struct mzd_t_cache *) - - sizeof(uint64_t)]; /*!< alignment */ -#ifdef __GNUC__ -} mzd_t_cache_t __attribute__((__aligned__(64))); -#else -} mzd_t_cache_t; -#endif - -#define __M4RI_MZD_T_CACHE_MAX 16 -static mzd_t_cache_t mzd_cache; -static mzd_t_cache_t *current_cache = &mzd_cache; - -static int log2_floor(uint64_t v) { - static uint64_t const b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000, 0xFFFFFFFF00000000}; - static unsigned int const S[] = {1, 2, 4, 8, 16, 32}; - unsigned int r = 0; - for (int i = 5; i >= 0; --i) { - if ((v & b[i])) { - v >>= S[i]; - r |= S[i]; - } - } - return r; -} - -/* - * Return a pointer to a new mzd_t structure. - * The structure will be 64 byte aligned. - * Call mzd_t_free to free the structure for next use. 
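The mzd_t cache above hands out slots by bit twiddling: bit i of `used` marks slot i as taken, and `log2_floor` of the complemented mask picks the highest free slot. A minimal standalone sketch of that allocation idea (not part of the patch; slot count 64 as in the `mzd[64]` array):

    #include <assert.h>
    #include <stdint.h>

    /* Branchless floor(log2(v)), same technique as log2_floor above. */
    static int log2_floor(uint64_t v) {
      static uint64_t const b[] = {0x2, 0xC, 0xF0, 0xFF00, 0xFFFF0000, 0xFFFFFFFF00000000};
      static unsigned int const S[] = {1, 2, 4, 8, 16, 32};
      unsigned int r = 0;
      for (int i = 5; i >= 0; --i) {
        if (v & b[i]) { v >>= S[i]; r |= S[i]; }
      }
      return (int)r;
    }

    int main(void) {
      uint64_t used = 0;                 /* no slot taken yet           */
      int s0 = log2_floor(~used);        /* highest free slot: 63       */
      used |= (uint64_t)1 << s0;
      int s1 = log2_floor(~used);        /* next free slot: 62          */
      used |= (uint64_t)1 << s1;
      used &= ~((uint64_t)1 << s0);      /* release slot 63             */
      assert(s0 == 63 && s1 == 62);
      assert(log2_floor(~used) == 63);   /* slot 63 is handed out again */
      return 0;
    }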
- */ - -static mzd_t *mzd_t_malloc() { -#if __M4RI_ENABLE_MZD_CACHE == 0 - return (mzd_t *)m4ri_mm_malloc(sizeof(mzd_t)); -#else - mzd_t *ret = NULL; - int i = 0; - - if (current_cache->used == (uint64_t)-1) { - mzd_t_cache_t *cache = &mzd_cache; - while (cache && cache->used == (uint64_t)-1) { - current_cache = cache; - cache = cache->next; - i++; - } - if (!cache && i < __M4RI_MZD_T_CACHE_MAX) { - cache = (mzd_t_cache_t *)m4ri_mm_malloc_aligned(sizeof(mzd_t_cache_t), 64); - memset((char *)cache, 0, sizeof(mzd_t_cache_t)); - - cache->prev = current_cache; - current_cache->next = cache; - current_cache = cache; - } else if (!cache && i >= __M4RI_MZD_T_CACHE_MAX) { - /* We have reached the upper limit on the number of caches */ - ret = (mzd_t *)m4ri_mm_malloc(sizeof(mzd_t)); - } else { - current_cache = cache; - } - } - if (ret == NULL) { - int free_entry = log2_floor(~current_cache->used); - current_cache->used |= ((uint64_t)1 << free_entry); - ret = ¤t_cache->mzd[free_entry]; - } - return ret; -#endif //__M4RI_ENABLE_MZD_CACHE -} - -static void mzd_t_free(mzd_t *M) { -#if __M4RI_ENABLE_MZD_CACHE == 0 - m4ri_mm_free(M); -#else - int foundit = 0; - mzd_t_cache_t *cache = &mzd_cache; - while (cache) { - size_t entry = M - cache->mzd; - if (entry < 64) { - cache->used &= ~((uint64_t)1 << entry); - if (cache->used == 0) { - if (cache == &mzd_cache) { - current_cache = cache; - } else { - if (cache == current_cache) { current_cache = cache->prev; } - cache->prev->next = cache->next; - if (cache->next) cache->next->prev = cache->prev; - m4ri_mm_free(cache); - } - } - foundit = 1; - break; - } - cache = cache->next; - } - if (!foundit) { m4ri_mm_free(M); } -#endif //__M4RI_ENABLE_MZD_CACHE -} - -mzd_t *mzd_init(rci_t r, rci_t c) { - assert(sizeof(mzd_t) == 64); - mzd_t *A = mzd_t_malloc(); - A->nrows = r; - A->ncols = c; - A->width = (c + m4ri_radix - 1) / m4ri_radix; - A->rowstride = ((A->width & 1) == 0) ? A->width : A->width + 1; - A->high_bitmask = __M4RI_LEFT_BITMASK(c % m4ri_radix); - A->flags = (A->high_bitmask != m4ri_ffff) ? 
mzd_flag_nonzero_excess : 0; - if (r && c) { - size_t block_words = r * A->rowstride; - A->data = m4ri_mmc_calloc(block_words, sizeof(word)); - } else { - A->data = NULL; - } - return A; -} - -mzd_t *mzd_init_window(mzd_t *M, const rci_t lowr, const rci_t lowc, const rci_t highr, - const rci_t highc) { - assert(lowc % m4ri_radix == 0); - - mzd_t *W = mzd_t_malloc(); - - rci_t nrows = MIN(highr - lowr, M->nrows - lowr); - rci_t ncols = highc - lowc; - W->nrows = nrows; - W->ncols = ncols; - W->rowstride = M->rowstride; - W->width = (ncols + m4ri_radix - 1) / m4ri_radix; - W->high_bitmask = __M4RI_LEFT_BITMASK(ncols % m4ri_radix); - W->flags = mzd_flag_windowed; - if (ncols % m4ri_radix != 0) W->flags |= mzd_flag_nonzero_excess; - W->data = M->data + lowr * M->rowstride + (lowc / m4ri_radix); - __M4RI_DD_MZD(W); - return W; -} - -void mzd_free(mzd_t *A) { - if (!mzd_is_windowed(A)) { - size_t block_words = A->nrows * A->rowstride; - m4ri_mmc_free(A->data, block_words * sizeof(word)); - } - mzd_t_free(A); -} - -void mzd_row_add(mzd_t *M, rci_t sourcerow, rci_t destrow) { - mzd_row_add_offset(M, destrow, sourcerow, 0); -} - -void mzd_row_clear_offset(mzd_t *M, rci_t row, rci_t coloffset) { - wi_t const startblock = coloffset / m4ri_radix; - word temp; - word *truerow = mzd_row(M, row); - /* make sure to start clearing at coloffset */ - if (coloffset % m4ri_radix) { - temp = truerow[startblock]; - temp &= __M4RI_RIGHT_BITMASK(m4ri_radix - coloffset); - } else { - temp = 0; - } - truerow[startblock] = temp; - for (wi_t i = startblock + 1; i < M->width; ++i) { truerow[i] = 0; } - - __M4RI_DD_ROW(M, row); -} - -rci_t mzd_gauss_delayed(mzd_t *M, rci_t startcol, int full) { - rci_t startrow = startcol; - rci_t pivots = 0; - for (rci_t i = startcol; i < M->ncols; ++i) { - for (rci_t j = startrow; j < M->nrows; ++j) { - if (mzd_read_bit(M, j, i)) { - mzd_row_swap(M, startrow, j); - ++pivots; - - for (rci_t ii = full ? 0 : startrow + 1; ii < M->nrows; ++ii) { - if (ii != startrow) { - if (mzd_read_bit(M, ii, i)) { mzd_row_add_offset(M, ii, startrow, i); } - } - } - startrow = startrow + 1; - break; - } - } - } - - __M4RI_DD_MZD(M); - __M4RI_DD_RCI(pivots); - return pivots; -} - -rci_t mzd_echelonize_naive(mzd_t *M, int full) { return mzd_gauss_delayed(M, 0, full); } /** * Transpose a 64 x 64 matrix with width 1. 
@@ -1138,730 +926,3 @@ mzd_t *mzd_transpose(mzd_t *DST, mzd_t const *A) { mzd_free(D); return DST; } - -mzd_t *mzd_mul_naive(mzd_t *C, mzd_t const *A, mzd_t const *B) { - if (C == NULL) { - C = mzd_init(A->nrows, B->ncols); - } else { - if (C->nrows != A->nrows || C->ncols != B->ncols) { - m4ri_die("mzd_mul_naive: Provided return matrix has wrong dimensions.\n"); - } - } - if (B->ncols < m4ri_radix - 10) { /* this cutoff is rather arbitrary */ - mzd_t *BT = mzd_transpose(NULL, B); - _mzd_mul_naive(C, A, BT, 1); - mzd_free(BT); - } else { - _mzd_mul_va(C, A, B, 1); - } - return C; -} - -mzd_t *mzd_addmul_naive(mzd_t *C, mzd_t const *A, mzd_t const *B) { - if (C->nrows != A->nrows || C->ncols != B->ncols) { - m4ri_die("mzd_addmul_naive: Provided return matrix has wrong dimensions.\n"); - } - - if (B->ncols < m4ri_radix - 10) { /* this cutoff is rather arbitrary */ - mzd_t *BT = mzd_transpose(NULL, B); - _mzd_mul_naive(C, A, BT, 0); - mzd_free(BT); - } else { - _mzd_mul_va(C, A, B, 0); - } - return C; -} - -mzd_t *_mzd_mul_naive(mzd_t *C, mzd_t const *A, mzd_t const *B, const int clear) { - wi_t eol; - - if (clear) { - word const mask_end = C->high_bitmask; - /* improves performance on x86_64 but is not cross plattform */ - /* asm __volatile__ (".p2align 4\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop"); */ - for (rci_t i = 0; i < C->nrows; ++i) { - wi_t j = 0; - word *row = mzd_row(C, i); - for (; j < C->width - 1; ++j) { row[j] = 0; } - row[j] &= ~mask_end; - } - } - - if (C->ncols % m4ri_radix) { - eol = (C->width - 1); - } else { - eol = (C->width); - } - - word parity[64]; - for (int i = 0; i < 64; ++i) { parity[i] = 0; } - wi_t const wide = A->width; - int const blocksize = __M4RI_MUL_BLOCKSIZE; - for (rci_t start = 0; start + blocksize <= C->nrows; start += blocksize) { - for (rci_t i = start; i < start + blocksize; ++i) { - word const *a = mzd_row_const(A, i); - word *c = mzd_row(C, i); - for (rci_t j = 0; j < m4ri_radix * eol; j += m4ri_radix) { - for (int k = 0; k < m4ri_radix; ++k) { - word const *b = mzd_row_const(B, j + k); - parity[k] = a[0] & b[0]; - for (wi_t ii = wide - 1; ii >= 1; --ii) parity[k] ^= a[ii] & b[ii]; - } - c[j / m4ri_radix] ^= m4ri_parity64(parity); - } - - if (eol != C->width) { - word const mask_end = C->high_bitmask; - /* improves performance on x86_64 but is not cross plattform */ - /* asm __volatile__ (".p2align 4\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop"); - */ - for (int k = 0; k < (C->ncols % m4ri_radix); ++k) { - word const *b = mzd_row_const(B, m4ri_radix * eol + k); - parity[k] = a[0] & b[0]; - for (wi_t ii = 1; ii < A->width; ++ii) parity[k] ^= a[ii] & b[ii]; - } - c[eol] ^= m4ri_parity64(parity) & mask_end; - } - } - } - - for (rci_t i = C->nrows - (C->nrows % blocksize); i < C->nrows; ++i) { - word const *a = mzd_row_const(A, i); - word *c = mzd_row(C, i); - for (rci_t j = 0; j < m4ri_radix * eol; j += m4ri_radix) { - for (int k = 0; k < m4ri_radix; ++k) { - word const *b = mzd_row_const(B, j + k); - parity[k] = a[0] & b[0]; - for (wi_t ii = wide - 1; ii >= 1; --ii) parity[k] ^= a[ii] & b[ii]; - } - c[j / m4ri_radix] ^= m4ri_parity64(parity); - } - - if (eol != C->width) { - word const mask_end = C->high_bitmask; - /* improves performance on x86_64 but is not cross plattform */ - /* asm __volatile__ (".p2align 4\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop\n\tnop"); */ - for (int k = 0; k < (C->ncols % m4ri_radix); ++k) { - word const *b = mzd_row_const(B, m4ri_radix * eol + k); - parity[k] = a[0] & b[0]; - for (wi_t 
ii = 1; ii < A->width; ++ii) parity[k] ^= a[ii] & b[ii]; - } - c[eol] ^= m4ri_parity64(parity) & mask_end; - } - } - - __M4RI_DD_MZD(C); - return C; -} - -mzd_t *_mzd_mul_va(mzd_t *C, mzd_t const *v, mzd_t const *A, int const clear) { - if (clear) mzd_set_ui(C, 0); - - rci_t const m = v->nrows; - rci_t const n = v->ncols; - - for (rci_t i = 0; i < m; ++i) - for (rci_t j = 0; j < n; ++j) - if (mzd_read_bit(v, i, j)) mzd_combine(C, i, 0, C, i, 0, A, j, 0); - - __M4RI_DD_MZD(C); - return C; -} - -void mzd_randomize(mzd_t *A) { - wi_t const width = A->width - 1; - word const mask_end = A->high_bitmask; - for (rci_t i = 0; i < A->nrows; ++i) { - word *row = mzd_row(A, i); - for (wi_t j = 0; j < width; ++j) row[j] = m4ri_random_word(); - row[width] ^= (row[width] ^ m4ri_random_word()) & mask_end; - } - - __M4RI_DD_MZD(A); -} - -void mzd_randomize_custom(mzd_t *A, m4ri_random_callback rc, void *data) { - wi_t const width = A->width - 1; - word const mask_end = A->high_bitmask; - for (rci_t i = 0; i < A->nrows; ++i) { - word *row = mzd_row(A, i); - for (wi_t j = 0; j < width; ++j) row[j] = rc(data); - row[width] ^= (row[width] ^ rc(data)) & mask_end; - } - - __M4RI_DD_MZD(A); -} - -void mzd_set_ui(mzd_t *A, unsigned int value) { - word const mask_end = A->high_bitmask; - - for (rci_t i = 0; i < A->nrows; ++i) { - word *row = mzd_row(A, i); - for (wi_t j = 0; j < A->width - 1; ++j) row[j] = 0; - row[A->width - 1] &= ~mask_end; - } - - if (value % 2 == 0) { - __M4RI_DD_MZD(A); - return; - } - - rci_t const stop = MIN(A->nrows, A->ncols); - for (rci_t i = 0; i < stop; ++i) { mzd_write_bit(A, i, i, 1); } - - __M4RI_DD_MZD(A); -} - -int mzd_equal(mzd_t const *A, mzd_t const *B) { - if (A->nrows != B->nrows) return FALSE; - if (A->ncols != B->ncols) return FALSE; - if (A == B) return TRUE; - - wi_t Awidth = A->width - 1; - word const mask_end = A->high_bitmask; - - for (rci_t i = 0; i < A->nrows; ++i) { - word const *rowa = mzd_row_const(A, i); - word const *rowb = mzd_row_const(B, i); - for (wi_t j = 0; j < Awidth; ++j) { - if (rowa[j] != rowb[j]) return FALSE; - } - if (((rowa[Awidth] ^ rowb[Awidth]) & mask_end)) return FALSE; - } - return TRUE; -} - -int mzd_cmp(mzd_t const *A, mzd_t const *B) { - if (A->nrows < B->nrows) return -1; - if (B->nrows < A->nrows) return 1; - if (A->ncols < B->ncols) return -1; - if (B->ncols < A->ncols) return 1; - - const word mask_end = A->high_bitmask; - const wi_t n = A->width - 1; - - /* Columns with large index are "larger", but rows with small index - are more important than with large index. 
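The naive multiplication routines above lean on one identity: a dot product over GF(2) is the parity of the bitwise AND of the two bit vectors. A minimal single-dot-product sketch of that idea (not part of the patch); the batched `m4ri_parity64` call in `_mzd_mul_naive` performs the equivalent fold for 64 accumulators at once, producing 64 result bits per output word:

    #include <stdint.h>

    /* GF(2) dot product of two bit vectors stored as `wide` 64-bit words:
     * AND gives the bitwise products, XOR-folding gives their sum mod 2. */
    static int gf2_dot(uint64_t const *a, uint64_t const *b, int wide) {
      uint64_t acc = 0;
      for (int i = 0; i < wide; ++i) acc ^= a[i] & b[i];
      acc ^= acc >> 32;   /* fold 64 bits down to their overall parity */
      acc ^= acc >> 16;
      acc ^= acc >> 8;
      acc ^= acc >> 4;
      acc ^= acc >> 2;
      acc ^= acc >> 1;
      return (int)(acc & 1);
    }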
*/ - - for (rci_t i = 0; i < A->nrows; i++) { - word const *rowa = mzd_row_const(A, i); - word const *rowb = mzd_row_const(B, i); - if ((rowa[n] & mask_end) < (rowb[n] & mask_end)) - return -1; - else if ((rowa[n] & mask_end) > (rowb[n] & mask_end)) - return 1; - - for (wi_t j = n - 1; j >= 0; j--) { - if (rowa[j] < rowb[j]) - return -1; - else if (rowa[j] > rowb[j]) - return 1; - } - } - return 0; -} - -mzd_t *mzd_copy(mzd_t *N, mzd_t const *P) { - if (N == P) return N; - - if (N == NULL) { - N = mzd_init(P->nrows, P->ncols); - } else { - if (N->nrows < P->nrows || N->ncols < P->ncols) - m4ri_die("mzd_copy: Target matrix is too small."); - } - wi_t const wide = P->width - 1; - word mask_end = P->high_bitmask; - for (rci_t i = 0; i < P->nrows; ++i) { - word const *p_truerow = mzd_row_const(P, i); - word *n_truerow = mzd_row(N, i); - for (wi_t j = 0; j < wide; ++j) n_truerow[j] = p_truerow[j]; - n_truerow[wide] = (n_truerow[wide] & ~mask_end) | (p_truerow[wide] & mask_end); - } - __M4RI_DD_MZD(N); - return N; -} - -/* This is sometimes called augment */ -mzd_t *mzd_concat(mzd_t *C, mzd_t const *A, mzd_t const *B) { - if (A->nrows != B->nrows) { m4ri_die("mzd_concat: Bad arguments to concat!\n"); } - - if (C == NULL) { - C = mzd_init(A->nrows, A->ncols + B->ncols); - } else if (C->nrows != A->nrows || C->ncols != (A->ncols + B->ncols)) { - m4ri_die("mzd_concat: C has wrong dimension!\n"); - } - - for (rci_t i = 0; i < A->nrows; ++i) { - word *dst_truerow = mzd_row(C, i); - word const *src_truerow = mzd_row_const(A, i); - for (wi_t j = 0; j < A->width; ++j) { dst_truerow[j] = src_truerow[j]; } - } - - for (rci_t i = 0; i < B->nrows; ++i) { - for (rci_t j = 0; j < B->ncols; ++j) { - mzd_write_bit(C, i, j + A->ncols, mzd_read_bit(B, i, j)); - } - } - - __M4RI_DD_MZD(C); - return C; -} - -mzd_t *mzd_stack(mzd_t *C, mzd_t const *A, mzd_t const *B) { - if (A->ncols != B->ncols) { - m4ri_die("mzd_stack: A->ncols (%d) != B->ncols (%d)!\n", A->ncols, B->ncols); - } - - if (C == NULL) { - C = mzd_init(A->nrows + B->nrows, A->ncols); - } else if (C->nrows != (A->nrows + B->nrows) || C->ncols != A->ncols) { - m4ri_die("mzd_stack: C has wrong dimension!\n"); - } - - for (rci_t i = 0; i < A->nrows; ++i) { - word const *src_truerow = mzd_row_const(A, i); - word *dst_truerow = mzd_row(C, i); - for (wi_t j = 0; j < A->width; ++j) { dst_truerow[j] = src_truerow[j]; } - } - - for (rci_t i = 0; i < B->nrows; ++i) { - word *dst_truerow = mzd_row(C, A->nrows + i); - word const *src_truerow = mzd_row_const(B, i); - for (wi_t j = 0; j < B->width; ++j) { dst_truerow[j] = src_truerow[j]; } - } - - __M4RI_DD_MZD(C); - return C; -} - -mzd_t *mzd_invert_naive(mzd_t *INV, mzd_t const *A, mzd_t const *I) { - mzd_t *H; - - H = mzd_concat(NULL, A, I); - - rci_t x = mzd_echelonize_naive(H, TRUE); - - if (x == 0) { - mzd_free(H); - return NULL; - } - - INV = mzd_submatrix(INV, H, 0, A->ncols, A->nrows, 2 * A->ncols); - - mzd_free(H); - - __M4RI_DD_MZD(INV); - return INV; -} - -mzd_t *mzd_add(mzd_t *ret, mzd_t const *left, mzd_t const *right) { - if (left->nrows != right->nrows || left->ncols != right->ncols) { - m4ri_die("mzd_add: rows and columns must match.\n"); - } - if (ret == NULL) { - ret = mzd_init(left->nrows, left->ncols); - } else if (ret != left) { - if (ret->nrows != left->nrows || ret->ncols != left->ncols) { - m4ri_die("mzd_add: rows and columns of returned matrix must match.\n"); - } - } - return _mzd_add(ret, left, right); -} - -mzd_t *_mzd_add(mzd_t *C, mzd_t const *A, mzd_t const *B) { - rci_t const nrows = 
MIN(MIN(A->nrows, B->nrows), C->nrows); - - if (C == B) { // swap - mzd_t const *tmp = A; - A = B; - B = tmp; - } - - word const mask_end = C->high_bitmask; - - switch (A->width) { - case 0: return C; - case 1: - for (rci_t i = 0; i < nrows; ++i) { - word const * rowa = mzd_row_const(A, i); - word const * rowb = mzd_row_const(B, i); - word * rowc = mzd_row(C, i); - rowc[0] ^= ((rowa[0] ^ rowb[0] ^ rowc[0]) & mask_end); - } - break; - case 2: - for (rci_t i = 0; i < nrows; ++i) { - word const * rowa = mzd_row_const(A, i); - word const * rowb = mzd_row_const(B, i); - word * rowc = mzd_row(C, i); - rowc[0] = rowa[0] ^ rowb[0]; - rowc[1] ^= ((rowa[1] ^ rowb[1] ^ rowc[1]) & mask_end); - } - break; - case 3: - for (rci_t i = 0; i < nrows; ++i) { - word const * rowa = mzd_row_const(A, i); - word const * rowb = mzd_row_const(B, i); - word * rowc = mzd_row(C, i); - rowc[0] = rowa[0] ^ rowb[0]; - rowc[1] = rowa[1] ^ rowb[1]; - rowc[2] ^= ((rowa[2] ^ rowb[2] ^ rowc[2]) & mask_end); - } - break; - case 4: - for (rci_t i = 0; i < nrows; ++i) { - word const * rowa = mzd_row_const(A, i); - word const * rowb = mzd_row_const(B, i); - word * rowc = mzd_row(C, i); - rowc[0] = rowa[0] ^ rowb[0]; - rowc[1] = rowa[1] ^ rowb[1]; - rowc[2] = rowa[2] ^ rowb[2]; - rowc[3] ^= ((rowa[3] ^ rowb[3] ^ rowc[3]) & mask_end); - } - break; - case 5: - for (rci_t i = 0; i < nrows; ++i) { - word const * rowa = mzd_row_const(A, i); - word const * rowb = mzd_row_const(B, i); - word * rowc = mzd_row(C, i); - rowc[0] = rowa[0] ^ rowb[0]; - rowc[1] = rowa[1] ^ rowb[1]; - rowc[2] = rowa[2] ^ rowb[2]; - rowc[3] = rowa[3] ^ rowb[3]; - rowc[4] ^= ((rowa[4] ^ rowb[4] ^ rowc[4]) & mask_end); - } - break; - case 6: - for (rci_t i = 0; i < nrows; ++i) { - word const * rowa = mzd_row_const(A, i); - word const * rowb = mzd_row_const(B, i); - word * rowc = mzd_row(C, i); - rowc[0] = rowa[0] ^ rowb[0]; - rowc[1] = rowa[1] ^ rowb[1]; - rowc[2] = rowa[2] ^ rowb[2]; - rowc[3] = rowa[3] ^ rowb[3]; - rowc[4] = rowa[4] ^ rowb[4]; - rowc[5] ^= ((rowa[5] ^ rowb[5] ^ rowc[5]) & mask_end); - } - break; - case 7: - for (rci_t i = 0; i < nrows; ++i) { - word const * rowa = mzd_row_const(A, i); - word const * rowb = mzd_row_const(B, i); - word * rowc = mzd_row(C, i); - rowc[0] = rowa[0] ^ rowb[0]; - rowc[1] = rowa[1] ^ rowb[1]; - rowc[2] = rowa[2] ^ rowb[2]; - rowc[3] = rowa[3] ^ rowb[3]; - rowc[4] = rowa[4] ^ rowb[4]; - rowc[5] = rowa[5] ^ rowb[5]; - rowc[6] ^= ((rowa[6] ^ rowb[6] ^ rowc[6]) & mask_end); - } - break; - case 8: - for (rci_t i = 0; i < nrows; ++i) { - word const * rowa = mzd_row_const(A, i); - word const * rowb = mzd_row_const(B, i); - word * rowc = mzd_row(C, i); - rowc[0] = rowa[0] ^ rowb[0]; - rowc[1] = rowa[1] ^ rowb[1]; - rowc[2] = rowa[2] ^ rowb[2]; - rowc[3] = rowa[3] ^ rowb[3]; - rowc[4] = rowa[4] ^ rowb[4]; - rowc[5] = rowa[5] ^ rowb[5]; - rowc[6] = rowa[6] ^ rowb[6]; - rowc[7] ^= ((rowa[7] ^ rowb[7] ^ rowc[7]) & mask_end); - } - break; - - default: - for (rci_t i = 0; i < nrows; ++i) { mzd_combine_even(C, i, 0, A, i, 0, B, i, 0); } - } - - __M4RI_DD_MZD(C); - return C; -} - -mzd_t *mzd_submatrix(mzd_t *S, mzd_t const *M, rci_t const startrow, rci_t const startcol, - rci_t const endrow, rci_t const endcol) { - rci_t const nrows = endrow - startrow; - rci_t const ncols = endcol - startcol; - - if (S == NULL) { - S = mzd_init(nrows, ncols); - } else if ((S->nrows < nrows) | (S->ncols < ncols)) { - m4ri_die("mzd_submatrix: got S with dimension %d x %d but expected %d x %d\n", S->nrows, - S->ncols, nrows, ncols); - } - - if (startcol 
% m4ri_radix == 0) { - - wi_t const startword = startcol / m4ri_radix; - /* we start at the beginning of a word */ - if (ncols / m4ri_radix != 0) { - for (rci_t x = startrow, i = 0; i < nrows; ++i, ++x) { - memcpy(mzd_row(S, i), mzd_row_const(M, x) + startword, sizeof(word) * (ncols / m4ri_radix)); - } - } - if (ncols % m4ri_radix) { - word const mask_end = __M4RI_LEFT_BITMASK(ncols % m4ri_radix); - for (rci_t x = startrow, i = 0; i < nrows; ++i, ++x) { - /* process remaining bits */ - word temp = mzd_row_const(M, x)[startword + ncols / m4ri_radix] & mask_end; - mzd_row(S, i)[ncols / m4ri_radix] = temp; - } - } - } else { - wi_t j; - for (rci_t i = 0; i < nrows; i++) { - word *srow = mzd_row(S, i); - for (j = 0; j + m4ri_radix < ncols; j += m4ri_radix) - srow[j / m4ri_radix] = mzd_read_bits(M, startrow + i, startcol + j, m4ri_radix); - srow[j / m4ri_radix] &= ~S->high_bitmask; - srow[j / m4ri_radix] |= - mzd_read_bits(M, startrow + i, startcol + j, ncols - j) & S->high_bitmask; - } - } - __M4RI_DD_MZD(S); - return S; -} - -int mzd_is_zero(mzd_t const *A) { - word status = 0; - word mask_end = A->high_bitmask; - for (rci_t i = 0; i < A->nrows; ++i) { - word const *row = mzd_row_const(A, i); - for (wi_t j = 0; j < A->width - 1; ++j) status |= row[j]; - status |= row[A->width - 1] & mask_end; - if (status) return 0; - } - return !status; -} - -void mzd_copy_row(mzd_t *B, rci_t i, mzd_t const *A, rci_t j) { - assert(B->ncols >= A->ncols); - wi_t const width = MIN(B->width, A->width) - 1; - - word const *a = mzd_row_const(A, j); - word *b = mzd_row(B, i); - - word const mask_end = __M4RI_LEFT_BITMASK(A->ncols % m4ri_radix); - - if (width != 0) { - for (wi_t k = 0; k < width; ++k) b[k] = a[k]; - b[width] = (b[width] & ~mask_end) | (a[width] & mask_end); - - } else { - b[0] = (a[0] & mask_end) | (b[0] & ~mask_end); - } - - __M4RI_DD_ROW(B, i); -} - -int mzd_find_pivot(mzd_t const *A, rci_t start_row, rci_t start_col, rci_t *r, rci_t *c) { - rci_t const nrows = A->nrows; - rci_t const ncols = A->ncols; - word data = 0; - rci_t row_candidate = 0; - if (A->ncols - start_col < m4ri_radix) { - for (rci_t j = start_col; j < A->ncols; j += m4ri_radix) { - int const length = MIN(m4ri_radix, ncols - j); - for (rci_t i = start_row; i < nrows; ++i) { - word const curr_data = mzd_read_bits(A, i, j, length); - if (m4ri_lesser_LSB(curr_data, data)) { - row_candidate = i; - data = curr_data; - } - } - if (data) { - *r = row_candidate; - for (int l = 0; l < length; ++l) { - if (__M4RI_GET_BIT(data, l)) { - *c = j + l; - break; - } - } - __M4RI_DD_RCI(*r); - __M4RI_DD_RCI(*c); - __M4RI_DD_INT(1); - return 1; - } - } - } else { - /* we definitely have more than one word */ - /* handle first word */ - int const bit_offset = (start_col % m4ri_radix); - wi_t const word_offset = start_col / m4ri_radix; - word const mask_begin = __M4RI_RIGHT_BITMASK(m4ri_radix - bit_offset); - for (rci_t i = start_row; i < nrows; ++i) { - word const *row = mzd_row_const(A, i); - word const curr_data = row[word_offset] & mask_begin; - if (m4ri_lesser_LSB(curr_data, data)) { - row_candidate = i; - data = curr_data; - if (__M4RI_GET_BIT(data, bit_offset)) { break; } - } - } - if (data) { - *r = row_candidate; - data >>= bit_offset; - assert(data); - for (int l = 0; l < (m4ri_radix - bit_offset); ++l) { - if (__M4RI_GET_BIT(data, l)) { - *c = start_col + l; - break; - } - } - __M4RI_DD_RCI(*r); - __M4RI_DD_RCI(*c); - __M4RI_DD_INT(1); - return 1; - } - /* handle complete words */ - for (wi_t wi = word_offset + 1; wi < A->width - 1; ++wi) { 
- for (rci_t i = start_row; i < nrows; ++i) { - word const *row = mzd_row_const(A, i); - word const curr_data = row[wi]; - if (m4ri_lesser_LSB(curr_data, data)) { - row_candidate = i; - data = curr_data; - if (__M4RI_GET_BIT(data, 0)) break; - } - } - if (data) { - *r = row_candidate; - for (int l = 0; l < m4ri_radix; ++l) { - if (__M4RI_GET_BIT(data, l)) { - *c = wi * m4ri_radix + l; - break; - } - } - __M4RI_DD_RCI(*r); - __M4RI_DD_RCI(*c); - __M4RI_DD_INT(1); - return 1; - } - } - /* handle last word */ - int const end_offset = (A->ncols % m4ri_radix) ? (A->ncols % m4ri_radix) : m4ri_radix; - word const mask_end = __M4RI_LEFT_BITMASK(end_offset % m4ri_radix); - wi_t wi = A->width - 1; - for (rci_t i = start_row; i < nrows; ++i) { - word const *row = mzd_row_const(A, i); - word const curr_data = row[wi] & mask_end; - if (m4ri_lesser_LSB(curr_data, data)) { - row_candidate = i; - data = curr_data; - if (__M4RI_GET_BIT(data, 0)) break; - } - } - if (data) { - *r = row_candidate; - for (int l = 0; l < end_offset; ++l) { - if (__M4RI_GET_BIT(data, l)) { - *c = wi * m4ri_radix + l; - break; - } - } - __M4RI_DD_RCI(*r); - __M4RI_DD_RCI(*c); - __M4RI_DD_INT(1); - return 1; - } - } - __M4RI_DD_RCI(*r); - __M4RI_DD_RCI(*c); - __M4RI_DD_INT(0); - return 0; -} - -#define MASK(c) (((uint64_t)(-1)) / (__M4RI_TWOPOW(__M4RI_TWOPOW(c)) + 1)) -#define COUNT(x, c) ((x)&MASK(c)) + (((x) >> (__M4RI_TWOPOW(c))) & MASK(c)) - -static inline int m4ri_bitcount(word w) { - uint64_t n = __M4RI_CONVERT_TO_UINT64_T(w); - n = COUNT(n, 0); - n = COUNT(n, 1); - n = COUNT(n, 2); - n = COUNT(n, 3); - n = COUNT(n, 4); - n = COUNT(n, 5); - return (int)n; -} - -double _mzd_density(mzd_t const *A, wi_t res, rci_t r, rci_t c) { - size_t count = 0; - size_t total = 0; - - if (A->width == 1) { - for (rci_t i = r; i < A->nrows; ++i) - for (rci_t j = c; j < A->ncols; ++j) - if (mzd_read_bit(A, i, j)) ++count; - return ((double)count) / (1.0 * A->ncols * A->nrows); - } - - if (res == 0) res = A->width / 100; - if (res < 1) res = 1; - - for (rci_t i = r; i < A->nrows; ++i) { - word const *truerow = mzd_row_const(A, i); - for (rci_t j = c; j < m4ri_radix; ++j) - if (mzd_read_bit(A, i, j)) ++count; - total += m4ri_radix; - - for (wi_t j = MAX(1, c / m4ri_radix); j < A->width - 1; j += res) { - count += m4ri_bitcount(truerow[j]); - total += m4ri_radix; - } - for (int j = 0; j < A->ncols % m4ri_radix; ++j) - if (mzd_read_bit(A, i, m4ri_radix * (A->ncols / m4ri_radix) + j)) ++count; - total += A->ncols % m4ri_radix; - } - - return (double)count / total; -} - -double mzd_density(mzd_t const *A, wi_t res) { return _mzd_density(A, res, 0, 0); } - -rci_t mzd_first_zero_row(mzd_t const *A) { - word const mask_end = __M4RI_LEFT_BITMASK(A->ncols % m4ri_radix); - wi_t const end = A->width - 1; - for (rci_t i = A->nrows - 1; i >= 0; --i) { - word const *row = mzd_row_const(A, i); - word tmp = row[0]; - for (wi_t j = 1; j < end; ++j) tmp |= row[j]; - tmp |= row[end] & mask_end; - if (tmp) { - __M4RI_DD_INT(i + 1); - return i + 1; - } - } - __M4RI_DD_INT(0); - return 0; -} - -mzd_t *mzd_extract_u(mzd_t *U, mzd_t const *A) { - rci_t k = MIN(A->nrows, A->ncols); - if (U != NULL) { assert(U->nrows == k && U->ncols == k); } - U = mzd_submatrix(U, A, 0, 0, k, k); - for (rci_t i = 1; i < U->nrows; i++) { - word *row = mzd_row(U, i); - for (wi_t j = 0; j < i / m4ri_radix; j++) { row[j] = 0; } - if (i % m4ri_radix) mzd_clear_bits(U, i, (i / m4ri_radix) * m4ri_radix, i % m4ri_radix); - } - return U; -} - -mzd_t *mzd_extract_l(mzd_t *L, mzd_t const *A) { - 
rci_t k = MIN(A->nrows, A->ncols); - if (L != NULL) { assert(L->nrows == k && L->ncols == k); } - L = mzd_submatrix(L, A, 0, 0, k, k); - for (rci_t i = 0; i < L->nrows - 1; i++) { - word *row = mzd_row(L, i); - if (m4ri_radix - (i + 1) % m4ri_radix) - mzd_clear_bits(L, i, i + 1, m4ri_radix - (i + 1) % m4ri_radix); - for (wi_t j = (i / m4ri_radix + 1); j < L->width; j++) { row[j] = 0; } - } - return L; -} diff --git a/m4ri/transpose.h b/m4ri/transpose.h new file mode 100644 index 0000000..52e81eb --- /dev/null +++ b/m4ri/transpose.h @@ -0,0 +1,35 @@ +/** + * \file transpose.h + * + * \brief Matrix transpose using a variant of the cache-oblivious algorithm + */ + +#ifndef M4RI_STRASSEN_H +#define M4RI_STRASSEN_H + +/******************************************************************* + * + * M4RI: Linear Algebra over GF(2) + * + * Copyright (C) 2008 Martin Albrecht + * Copyright (C) 2008 Clement Pernet + * + * Distributed under the terms of the GNU General Public License (GPL) + * version 2 or higher. + * + * This code is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * The full text of the GPL is available at: + * + * http://www.gnu.org/licenses/ + * + ********************************************************************/ + +#include + +mzd_t *mzd_transpose(mzd_t *DST, mzd_t const *A); + +#endif // M4RI_TRANSPOSE_H From c5269b16d22563d4c00dcece8bc2387942743e03 Mon Sep 17 00:00:00 2001 From: Charles Bouillaguet Date: Mon, 28 Nov 2022 07:43:35 +0100 Subject: [PATCH 2/5] renaming --- m4ri/{mzd.c => foobar.c} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename m4ri/{mzd.c => foobar.c} (100%) diff --git a/m4ri/mzd.c b/m4ri/foobar.c similarity index 100% rename from m4ri/mzd.c rename to m4ri/foobar.c From 508b8a977aff51ee0c2abceb1ba812a971edbe05 Mon Sep 17 00:00:00 2001 From: Charles Bouillaguet Date: Mon, 28 Nov 2022 08:07:27 +0100 Subject: [PATCH 3/5] finalize splitting transpose from mzd --- m4ri/{foobar.c => mzd.c} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename m4ri/{foobar.c => mzd.c} (100%) diff --git a/m4ri/foobar.c b/m4ri/mzd.c similarity index 100% rename from m4ri/foobar.c rename to m4ri/mzd.c From 1960b8a603c295499ca8d702c503172c026bbd02 Mon Sep 17 00:00:00 2001 From: Charles Bouillaguet Date: Mon, 28 Nov 2022 08:40:31 +0100 Subject: [PATCH 4/5] finishing touches --- Makefile.am | 2 ++ m4ri/transpose.h | 17 +++++++++++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/Makefile.am b/Makefile.am index c648c3b..4070b44 100644 --- a/Makefile.am +++ b/Makefile.am @@ -9,6 +9,7 @@ libm4ri_la_SOURCES = \ m4ri/brilliantrussian.c \ m4ri/misc.c \ m4ri/mzd.c \ + m4ri/transpose.c \ m4ri/graycode.c \ m4ri/strassen.c \ m4ri/mzp.c \ @@ -31,6 +32,7 @@ pkgincludesub_HEADERS = m4ri/m4ri.h \ m4ri/brilliantrussian.h \ m4ri/misc.h \ m4ri/mzd.h \ + m4ri/transpose.h \ m4ri/graycode.h \ m4ri/strassen.h \ m4ri/parity.h \ diff --git a/m4ri/transpose.h b/m4ri/transpose.h index 52e81eb..56215b8 100644 --- a/m4ri/transpose.h +++ b/m4ri/transpose.h @@ -4,8 +4,8 @@ * \brief Matrix transpose using a variant of the cache-oblivious algorithm */ -#ifndef M4RI_STRASSEN_H -#define M4RI_STRASSEN_H +#ifndef M4RI_TRANSPOSE_H +#define M4RI_TRANSPOSE_H /******************************************************************* * @@ -30,6 +30,19 @@ #include +/** + * \brief Transpose a matrix. 
+ * + * This function uses the fact that: +\verbatim + [ A B ]T [AT CT] + [ C D ] = [BT DT] + \endverbatim + * and thus rearranges the blocks recursively. + * + * \param DST Preallocated return matrix, may be NULL for automatic creation. + * \param A Matrix + */ mzd_t *mzd_transpose(mzd_t *DST, mzd_t const *A); #endif // M4RI_TRANSPOSE_H From 86420d8ac98b0cdc2f6a94814c0f877ce5385ff0 Mon Sep 17 00:00:00 2001 From: Charles Bouillaguet Date: Mon, 28 Nov 2022 08:45:08 +0100 Subject: [PATCH 5/5] It seems that I forgot everything --- m4ri/m4ri.h | 1 + m4ri/mzd.c | 908 +--------------------------------------------------- m4ri/mzd.h | 16 - 3 files changed, 3 insertions(+), 922 deletions(-) diff --git a/m4ri/m4ri.h b/m4ri/m4ri.h index eae9700..b18b166 100644 --- a/m4ri/m4ri.h +++ b/m4ri/m4ri.h @@ -67,6 +67,7 @@ extern "C" { #include #include #include +#include #include #include diff --git a/m4ri/mzd.c b/m4ri/mzd.c index ba04b7c..81c9019 100644 --- a/m4ri/mzd.c +++ b/m4ri/mzd.c @@ -30,6 +30,8 @@ #include "mmc.h" #include "mzd.h" #include "parity.h" +#include "transpose.h" + #include #include @@ -233,912 +235,6 @@ rci_t mzd_gauss_delayed(mzd_t *M, rci_t startcol, int full) { rci_t mzd_echelonize_naive(mzd_t *M, int full) { return mzd_gauss_delayed(M, 0, full); } -/** - * Transpose a 64 x 64 matrix with width 1. - * - * \param dst First word of destination matrix. - * \param src First word of source matrix. - * \param rowstride_dst Rowstride of matrix dst. - * \param rowstride_src Rowstride of matrix src. - * - * Rows of both matrices are expected to fit exactly in a word (offset == 0) - * and lay entirely inside a single block. - * - * \note This function also works when dst == src. - */ - -static inline void _mzd_copy_transpose_64x64(word *dst, word const *src, wi_t rowstride_dst, - wi_t rowstride_src) { - /* - * m runs over the values: - * 0x00000000FFFFFFFF - * 0x0000FFFF0000FFFF - * 0x00FF00FF00FF00FF - * 0x0F0F0F0F0F0F0F0F - * 0x3333333333333333 - * 0x5555555555555555, - * alternating j zeroes with j ones. - * - * Assume we have a matrix existing of four jxj matrices ((0,0) is in the top-right corner, - * this is the memory-model view, see the layout on - * http://m4ri.sagemath.org/doxygen/structmzd__t.html): - * ...[A1][B1][A0][B0] - * ...[C1][D1][C0][D0] - * . [A2][B2] - * . [C2][B2] - * . . - * . - * The following calulates the XOR between A and D, - * and subsequently applies that to A and D respectively, - * swapping A and D as a result. - * Therefore wk starts at the first row and then has rowstride - * added j times, running over the rows of A, then skips C - * by adding j * rowstride to continue with the next A below C. - */ - - word m = __M4RI_CONVERT_TO_WORD(0xFFFFFFFF); - wi_t j_rowstride_dst = rowstride_dst * 64; - wi_t j_rowstride_src = rowstride_src * 32; - word *const end = dst + j_rowstride_dst; - // We start with j = 32, and a one-time unrolled loop, where - // we copy from src and write the result to dst, swapping - // the two 32x32 corner matrices. - int j = 32; - j_rowstride_dst >>= 1; - word *RESTRICT wk = dst; - for (word const *RESTRICT wks = src; wk < end; wk += j_rowstride_dst, wks += j_rowstride_src) { - for (int k = 0; k < j; ++k, wk += rowstride_dst, wks += rowstride_src) { - word xor = ((*wks >> j) ^ *(wks + j_rowstride_src)) & m; - *wk = *wks ^ (xor << j); - *(wk + j_rowstride_dst) = *(wks + j_rowstride_src) ^ xor; - } - } - // Next we work in-place in dst and swap the corners of - // each of the last matrices, all in parallel, for all - // remaining values of j. 
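The comment above describes the swap-and-shrink pattern performed by the kernel being moved out of mzd.c. For reference, here is a self-contained in-place 64 x 64 bit transpose built on the same mask-and-XOR butterfly (a sketch, not part of the patch), using the same convention as the surrounding code: bit j of word i is the entry at row i, column j. Six levels (j = 32, 16, ..., 1) suffice because each level halves the block size.

    #include <stdint.h>

    /* In-place transpose of a 64x64 bit matrix stored as 64 words.
     * At level j, the mask m has the low j bits of every 2j-bit group set,
     * and the loop swaps the two off-diagonal j x j sub-blocks inside every
     * aligned 2j x 2j block. */
    static void transpose64(uint64_t a[64]) {
      uint64_t m = 0x00000000FFFFFFFFULL;
      for (int j = 32; j != 0; j >>= 1, m ^= m << j) {
        for (int k = 0; k < 64; k = (k + j + 1) & ~j) {
          uint64_t t = ((a[k] >> j) ^ a[k + j]) & m;
          a[k]     ^= t << j;
          a[k + j] ^= t;
        }
      }
    }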
- m ^= m << 16; - for (j = 16; j != 0; j = j >> 1, m ^= m << j) { - j_rowstride_dst >>= 1; - for (wk = dst; wk < end; wk += j_rowstride_dst) { - for (int k = 0; k < j; ++k, wk += rowstride_dst) { - word xor = ((*wk >> j) ^ *(wk + j_rowstride_dst)) & m; - *wk ^= xor << j; - *(wk + j_rowstride_dst) ^= xor; - } - } - } -} - -/** - * Transpose two 64 x 64 matrix with width 1. - * - * \param dst1 First word of destination matrix 1. - * \param dst2 First word of destination matrix 2. - * \param src1 First word of source matrix 1. - * \param src2 First word of source matrix 2. - * \param rowstride_dst Rowstride of destination matrices. - * \param rowstride_src Rowstride of source matrices. - * - * Rows of all matrices are expected to fit exactly in a word (offset == 0) - * and lay entirely inside a single block. - * - * \note This function also works to transpose in-place. - */ - -static inline void _mzd_copy_transpose_64x64_2(word *RESTRICT dst1, word *RESTRICT dst2, - word const *RESTRICT src1, word const *RESTRICT src2, - wi_t rowstride_dst, wi_t rowstride_src) { - word m = __M4RI_CONVERT_TO_WORD(0xFFFFFFFF); - wi_t j_rowstride_dst = rowstride_dst * 64; - wi_t j_rowstride_src = rowstride_src * 32; - word *const end = dst1 + j_rowstride_dst; - int j = 32; - word *RESTRICT wk[2]; - word const *RESTRICT wks[2]; - word xor [2]; - - j_rowstride_dst >>= 1; - wk[0] = dst1; - wk[1] = dst2; - wks[0] = src1; - wks[1] = src2; - - do { - - for (int k = 0; k < j; ++k) { - xor[0] = ((*wks[0] >> j) ^ *(wks[0] + j_rowstride_src)) & m; - xor[1] = ((*wks[1] >> j) ^ *(wks[1] + j_rowstride_src)) & m; - *wk[0] = *wks[0] ^ (xor[0] << j); - *wk[1] = *wks[1] ^ (xor[1] << j); - *(wk[0] + j_rowstride_dst) = *(wks[0] + j_rowstride_src) ^ xor[0]; - *(wk[1] + j_rowstride_dst) = *(wks[1] + j_rowstride_src) ^ xor[1]; - wk[0] += rowstride_dst; - wk[1] += rowstride_dst; - wks[0] += rowstride_src; - wks[1] += rowstride_src; - } - - wk[0] += j_rowstride_dst; - wk[1] += j_rowstride_dst; - wks[0] += j_rowstride_src; - wks[1] += j_rowstride_src; - - } while (wk[0] < end); - - m ^= m << 16; - for (j = 16; j != 0; j = j >> 1, m ^= m << j) { - - j_rowstride_dst >>= 1; - wk[0] = dst1; - wk[1] = dst2; - - do { - - for (int k = 0; k < j; ++k) { - xor[0] = ((*wk[0] >> j) ^ *(wk[0] + j_rowstride_dst)) & m; - xor[1] = ((*wk[1] >> j) ^ *(wk[1] + j_rowstride_dst)) & m; - *wk[0] ^= xor[0] << j; - *wk[1] ^= xor[1] << j; - *(wk[0] + j_rowstride_dst) ^= xor[0]; - *(wk[1] + j_rowstride_dst) ^= xor[1]; - wk[0] += rowstride_dst; - wk[1] += rowstride_dst; - } - - wk[0] += j_rowstride_dst; - wk[1] += j_rowstride_dst; - - } while (wk[0] < end); - } -} - -static unsigned char log2_ceil_table[64] = { - 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6}; - -static inline int log2_ceil(int n) { return log2_ceil_table[n - 1]; } - -static word const transpose_mask[6] = { - 0x5555555555555555ULL, 0x3333333333333333ULL, 0x0F0F0F0F0F0F0F0FULL, - 0x00FF00FF00FF00FFULL, 0x0000FFFF0000FFFFULL, 0x00000000FFFFFFFFULL, -}; - -/** - * Transpose 64/j matrices of size jxj in parallel. - * - * Where j equals n rounded up to the nearest power of 2. - * The input array t must be of size j (containing the rows i of all matrices in t[i]). - * - * t[0..{j-1}] = [Al]...[A1][A0] - * - * \param t An array of j words. - * \param n The number of rows in each matrix. 
- * - * \return log2(j) - */ - -static inline int _mzd_transpose_Nxjx64(word *RESTRICT t, int n) { - int j = 1; - int mi = 0; // Index into the transpose_mask array. - - while (j < n) // Don't swap with entirely undefined data (where [D] exists entirely of - // non-existant rows). - { - // Swap 64/j matrices of size jxj in 2j rows. Thus, - // <---- one word ---> - // [Al][Bl]...[A0][B0] - // [Cl][Dl]...[C0][D0], where l = 64/j - 1 and each matrix [A], [B] etc is jxj. - // Then swap [A] and [D] in-place. - - // m runs over the values in transpose_mask, so that at all - // times m exists of j zeroes followed by j ones, repeated. - word const m = transpose_mask[mi]; - int k = 0; // Index into t[]. - do { - // Run over all rows of [A] and [D]. - for (int i = 0; i < j; ++i, ++k) { - // t[k] contains row i of all [A], and t[k + j] contains row i of all [D]. Swap them. - word xor = ((t[k] >> j) ^ t[k + j]) & m; - t[k] ^= xor << j; - t[k + j] ^= xor; - } - k += j; // Skip [C]. - } while (k < n); // Stop if we passed all valid input. - - // Double the size of j and repeat this for the next 2j rows until all - // n rows have been swapped (possibly with non-existant rows). - j <<= 1; - ++mi; - } - - return mi; -} - -/** - * Transpose a n x 64 matrix with width 1. - * - * \param dst First word of destination matrix. - * \param src First word of source matrix. - * \param rowstride_dst Rowstride of destination matrix. - * \param rowstride_src Rowstride of source matrix. - * \param n Number of rows in source matrix, must be less than 64. - * - * Rows of all matrices are expected have offset zero - * and lay entirely inside a single block. - * - * \note This function also works to transpose in-place. - */ - -static inline void _mzd_copy_transpose_lt64x64(word *RESTRICT dst, word const *RESTRICT src, - wi_t rowstride_dst, wi_t rowstride_src, int n) { - // Preload the n input rows into level 1, using a minimum of cache lines (compact storage). - word t[64]; - word const *RESTRICT wks = src; - int k; - for (k = 0; k < n; ++k) { - t[k] = *wks; - wks += rowstride_src; - } - // see https://bitbucket.org/malb/m4ri/issues/53 - for (; k < 64; ++k) { t[k] = 0; } - if (n > 32) { - while (k < 64) t[k++] = 0; - _mzd_copy_transpose_64x64(dst, t, rowstride_dst, 1); - return; - } - int log2j = _mzd_transpose_Nxjx64(t, n); - // All output bits are now transposed, but still might need to be shifted in place. - // What we have now is 64/j matrices of size jxj. Thus, - // [Al]...[A1][A0], where l = 64/j - 1. - // while the actual output is: - // [A0] - // [A1] - // ... 
- // [Al] - word const m = __M4RI_LEFT_BITMASK(n); - word *RESTRICT wk = dst; - switch (log2j) { - case 5: { - wi_t const j_rowstride_dst = 32 * rowstride_dst; - for (int k = 0; k < 32; ++k) { - wk[0] = t[k] & m; - wk[j_rowstride_dst] = (t[k] >> 32) & m; - wk += rowstride_dst; - } - break; - } - case 4: { - wi_t const j_rowstride_dst = 16 * rowstride_dst; - for (int k = 0; k < 16; ++k) { - wk[0] = t[k] & m; - wk[j_rowstride_dst] = (t[k] >> 16) & m; - wk[2 * j_rowstride_dst] = (t[k] >> 32) & m; - wk[3 * j_rowstride_dst] = (t[k] >> 48) & m; - wk += rowstride_dst; - } - break; - } - case 3: { - wi_t const j_rowstride_dst = 8 * rowstride_dst; - for (int k = 0; k < 8; ++k) { - wk[0] = t[k] & m; - wk[j_rowstride_dst] = (t[k] >> 8) & m; - wk[2 * j_rowstride_dst] = (t[k] >> 16) & m; - wk[3 * j_rowstride_dst] = (t[k] >> 24) & m; - wk[4 * j_rowstride_dst] = (t[k] >> 32) & m; - wk[5 * j_rowstride_dst] = (t[k] >> 40) & m; - wk[6 * j_rowstride_dst] = (t[k] >> 48) & m; - wk[7 * j_rowstride_dst] = (t[k] >> 56) & m; - wk += rowstride_dst; - } - break; - } - case 2: { - wi_t const j_rowstride_dst = 4 * rowstride_dst; - for (int k = 0; k < 4; ++k) { - word *RESTRICT wk2 = wk; - word tk = t[k]; - for (int i = 0; i < 2; ++i) { - wk2[0] = tk & m; - wk2[j_rowstride_dst] = (tk >> 4) & m; - wk2[2 * j_rowstride_dst] = (tk >> 8) & m; - wk2[3 * j_rowstride_dst] = (tk >> 12) & m; - wk2[4 * j_rowstride_dst] = (tk >> 16) & m; - wk2[5 * j_rowstride_dst] = (tk >> 20) & m; - wk2[6 * j_rowstride_dst] = (tk >> 24) & m; - wk2[7 * j_rowstride_dst] = (tk >> 28) & m; - wk2 += 8 * j_rowstride_dst; - tk >>= 32; - } - wk += rowstride_dst; - } - break; - } - case 1: { - wi_t const j_rowstride_dst = 2 * rowstride_dst; - for (int k = 0; k < 2; ++k) { - word *RESTRICT wk2 = wk; - word tk = t[k]; - for (int i = 0; i < 8; ++i) { - wk2[0] = tk & m; - wk2[j_rowstride_dst] = (tk >> 2) & m; - wk2[2 * j_rowstride_dst] = (tk >> 4) & m; - wk2[3 * j_rowstride_dst] = (tk >> 6) & m; - wk2 += 4 * j_rowstride_dst; - tk >>= 8; - } - wk += rowstride_dst; - } - break; - } - case 0: { - word *RESTRICT wk2 = wk; - word tk = t[0]; - for (int i = 0; i < 16; ++i) { - wk2[0] = tk & m; - wk2[rowstride_dst] = (tk >> 1) & m; - wk2[2 * rowstride_dst] = (tk >> 2) & m; - wk2[3 * rowstride_dst] = (tk >> 3) & m; - wk2 += 4 * rowstride_dst; - tk >>= 4; - } - break; - } - } -} - -/** - * Transpose a 64 x n matrix with width 1. - * - * \param dst First word of destination matrix. - * \param src First word of source matrix. - * \param rowstride_dst Rowstride of destination matrix. - * \param rowstride_src Rowstride of source matrix. - * \param n Number of columns in source matrix, must be less than 64. - * - * Rows of all matrices are expected have offset zero - * and lay entirely inside a single block. - * - * \note This function also works to transpose in-place. 
- */ - -static inline void _mzd_copy_transpose_64xlt64(word *RESTRICT dst, word const *RESTRICT src, - wi_t rowstride_dst, wi_t rowstride_src, int n) { - word t[64]; - int log2j = log2_ceil(n); - word const *RESTRICT wks = src; - switch (log2j) { - case 6: { - _mzd_copy_transpose_64x64(t, src, 1, rowstride_src); - word *RESTRICT wk = dst; - for (int k = 0; k < n; ++k) { - *wk = t[k]; - wk += rowstride_dst; - } - return; - } - case 5: { - wi_t const j_rowstride_src = 32 * rowstride_src; - for (int k = 0; k < 32; ++k) { - t[k] = wks[0] | (wks[j_rowstride_src] << 32); - wks += rowstride_src; - } - break; - } - case 4: { - wi_t const j_rowstride_src = 16 * rowstride_src; - for (int k = 0; k < 16; ++k) { - t[k] = wks[0] | (wks[j_rowstride_src] << 16); - t[k] |= (wks[2 * j_rowstride_src] << 32) | (wks[3 * j_rowstride_src] << 48); - wks += rowstride_src; - } - break; - } - case 3: { - wi_t const j_rowstride_src = 8 * rowstride_src; - word tt; - for (int k = 0; k < 8; ++k) { - tt = wks[0] | (wks[j_rowstride_src] << 8); - t[k] = (wks[2 * j_rowstride_src] << 16) | (wks[3 * j_rowstride_src] << 24); - tt |= (wks[4 * j_rowstride_src] << 32) | (wks[5 * j_rowstride_src] << 40); - t[k] |= (wks[6 * j_rowstride_src] << 48) | (wks[7 * j_rowstride_src] << 56); - wks += rowstride_src; - t[k] |= tt; - } - break; - } - case 2: { - word const *RESTRICT wks2 = wks + 60 * rowstride_src; - t[0] = wks2[0]; - t[1] = wks2[rowstride_src]; - t[2] = wks2[2 * rowstride_src]; - t[3] = wks2[3 * rowstride_src]; - for (int i = 0; i < 15; ++i) { - wks2 -= 4 * rowstride_src; - t[0] <<= 4; - t[1] <<= 4; - t[2] <<= 4; - t[3] <<= 4; - t[0] |= wks2[0]; - t[1] |= wks2[rowstride_src]; - t[2] |= wks2[2 * rowstride_src]; - t[3] |= wks2[3 * rowstride_src]; - } - break; - } - case 1: { - wks += 62 * rowstride_src; - t[0] = wks[0]; - t[1] = wks[rowstride_src]; - for (int i = 0; i < 31; ++i) { - wks -= 2 * rowstride_src; - t[0] <<= 2; - t[1] <<= 2; - t[0] |= wks[0]; - t[1] |= wks[rowstride_src]; - } - break; - } - case 0: { - word tt[2]; - tt[0] = wks[0]; - tt[1] = wks[rowstride_src]; - for (int i = 2; i < 64; i += 2) { - wks += 2 * rowstride_src; - tt[0] |= wks[0] << i; - tt[1] |= wks[rowstride_src] << i; - } - *dst = tt[0] | (tt[1] << 1); - return; - } - } - int j = 1 << log2j; - _mzd_transpose_Nxjx64(t, j); - word *RESTRICT wk = dst; - for (int k = 0; k < n; ++k) { - *wk = t[k]; - wk += rowstride_dst; - } -} - -/** - * Transpose a n x m matrix with width 1, offset 0 and m and n less than or equal 8. - * - * \param dst First word of destination matrix. - * \param src First word of source matrix. - * \param rowstride_dst Rowstride of destination matrix. - * \param rowstride_src Rowstride of source matrix. - * \param n Number of rows in source matrix, must be less than or equal 8. - * \param m Number of columns in source matrix, must be less than or equal 8. - * - * Rows of all matrices are expected to have offset zero - * and lay entirely inside a single block. - * - * \note This function also works to transpose in-place. 
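The small and narrow kernels in this hunk exist to beat an obvious fallback: zero-pad the input to a full 64 x 64 block, run the full butterfly, and keep only the meaningful bits, which is essentially what the le64xle64 helper later in the hunk does. A sketch of that fallback for the n x 64 case (not part of the patch; transpose64 is the 64 x 64 butterfly sketched earlier):

    #include <stdint.h>
    #include <string.h>

    void transpose64(uint64_t a[64]);   /* the 64x64 butterfly sketched earlier */

    /* Transpose an n x 64 matrix (n < 64): zero-pad the missing rows and run
     * the full butterfly; each output word then carries only n meaningful
     * low bits. */
    static void transpose_nx64_padded(uint64_t dst[64], uint64_t const *src, int n) {
      memset(dst, 0, 64 * sizeof(uint64_t));
      memcpy(dst, src, (size_t)n * sizeof(uint64_t));
      transpose64(dst);
    }

For n <= 32, the lt64x64 kernel above reaches the same result with only ceil(log2(n)) butterfly levels plus a scatter, which is what justifies the extra code.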
- */ - -static inline void _mzd_copy_transpose_le8xle8(word *RESTRICT dst, word const *RESTRICT src, - wi_t rowstride_dst, wi_t rowstride_src, int n, int m, - int maxsize) { - int end = maxsize * 7; - word const *RESTRICT wks = src; - word w = *wks; - int shift = 0; - for (int i = 1; i < n; ++i) { - wks += rowstride_src; - shift += 8; - w |= (*wks << shift); - } - word mask = 0x80402010080402ULL; - word w7 = w >> 7; - shift = 7; - --m; - do { - word xor = (w ^ w7) & mask; - mask >>= 8; - w ^= (xor << shift); - shift += 7; - w7 >>= 7; - w ^= xor; - } while (shift < end); - word *RESTRICT wk = dst + m * rowstride_dst; - for (int shift = 8 * m; shift > 0; shift -= 8) { - *wk = (unsigned char)(w >> shift); - wk -= rowstride_dst; - } - *wk = (unsigned char)w; -} - -/** - * Transpose a n x m matrix with width 1, offset 0 and m and n less than or equal 16. - * - * \param dst First word of destination matrix. - * \param src First word of source matrix. - * \param rowstride_dst Rowstride of destination matrix. - * \param rowstride_src Rowstride of source matrix. - * \param n Number of rows in source matrix, must be less than or equal 16. - * \param m Number of columns in source matrix, must be less than or equal 16. - * - * Rows of all matrices are expected to have offset zero - * and lay entirely inside a single block. - * - * \note This function also works to transpose in-place. - */ - -static inline void _mzd_copy_transpose_le16xle16(word *RESTRICT dst, word const *RESTRICT src, - wi_t rowstride_dst, wi_t rowstride_src, int n, - int m, int maxsize) { - int end = maxsize * 3; - word const *RESTRICT wks = src; - word t[4]; - int i = n; - do { - t[0] = wks[0]; - if (--i == 0) { - t[1] = 0; - t[2] = 0; - t[3] = 0; - break; - } - t[1] = wks[rowstride_src]; - if (--i == 0) { - t[2] = 0; - t[3] = 0; - break; - } - t[2] = wks[2 * rowstride_src]; - if (--i == 0) { - t[3] = 0; - break; - } - t[3] = wks[3 * rowstride_src]; - if (--i == 0) break; - wks += 4 * rowstride_src; - for (int shift = 16;; shift += 16) { - t[0] |= (*wks << shift); - if (--i == 0) break; - t[1] |= (wks[rowstride_src] << shift); - if (--i == 0) break; - t[2] |= (wks[2 * rowstride_src] << shift); - if (--i == 0) break; - t[3] |= (wks[3 * rowstride_src] << shift); - if (--i == 0) break; - wks += 4 * rowstride_src; - } - } while (0); - word mask = 0xF0000F0000F0ULL; - int shift = 12; - word xor [4]; - do { - xor[0] = (t[0] ^ (t[0] >> shift)) & mask; - xor[1] = (t[1] ^ (t[1] >> shift)) & mask; - xor[2] = (t[2] ^ (t[2] >> shift)) & mask; - xor[3] = (t[3] ^ (t[3] >> shift)) & mask; - mask >>= 16; - t[0] ^= (xor[0] << shift); - t[1] ^= (xor[1] << shift); - t[2] ^= (xor[2] << shift); - t[3] ^= (xor[3] << shift); - shift += 12; - t[0] ^= xor[0]; - t[1] ^= xor[1]; - t[2] ^= xor[2]; - t[3] ^= xor[3]; - } while (shift < end); - _mzd_transpose_Nxjx64(t, 4); - i = m; - word *RESTRICT wk = dst; - do { - wk[0] = (uint16_t)t[0]; - if (--i == 0) break; - wk[rowstride_dst] = (uint16_t)t[1]; - if (--i == 0) break; - wk[2 * rowstride_dst] = (uint16_t)t[2]; - if (--i == 0) break; - wk[3 * rowstride_dst] = (uint16_t)t[3]; - if (--i == 0) break; - wk += 4 * rowstride_dst; - for (int shift = 16;; shift += 16) { - wk[0] = (uint16_t)(t[0] >> shift); - if (--i == 0) break; - wk[rowstride_dst] = (uint16_t)(t[1] >> shift); - if (--i == 0) break; - wk[2 * rowstride_dst] = (uint16_t)(t[2] >> shift); - if (--i == 0) break; - wk[3 * rowstride_dst] = (uint16_t)(t[3] >> shift); - if (--i == 0) break; - wk += 4 * rowstride_dst; - } - } while (0); -} - -/** - * Transpose 
a n x m matrix with width 1, offset 0 and m and n less than or equal 32. - * - * \param dst First word of destination matrix. - * \param src First word of source matrix. - * \param rowstride_dst Rowstride of destination matrix. - * \param rowstride_src Rowstride of source matrix. - * \param n Number of rows in source matrix, must be less than or equal 32. - * \param m Number of columns in source matrix, must be less than or equal 32. - * - * Rows of all matrices are expected to have offset zero - * and lay entirely inside a single block. - * - * \note This function also works to transpose in-place. - */ - -static inline void _mzd_copy_transpose_le32xle32(word *RESTRICT dst, word const *RESTRICT src, - wi_t rowstride_dst, wi_t rowstride_src, int n, - int m) { - word const *RESTRICT wks = src; - word t[16]; - int i = n; - if (n > 16) { - i -= 16; - for (int j = 0; j < 16; ++j) { - t[j] = *wks; - wks += rowstride_src; - } - int j = 0; - do { - t[j++] |= (*wks << 32); - wks += rowstride_src; - } while (--i); - } else { - int j; - for (j = 0; j < n; ++j) { - t[j] = *wks; - wks += rowstride_src; - } - for (; j < 16; ++j) t[j] = 0; - } - _mzd_transpose_Nxjx64(t, 16); - int one_more = (m & 1); - word *RESTRICT wk = dst; - if (m > 16) { - m -= 16; - for (int j = 0; j < 16; j += 2) { - *wk = (t[j] & 0xFFFF) | ((t[j] >> 16) & 0xFFFF0000); - wk[rowstride_dst] = (t[j + 1] & 0xFFFF) | ((t[j + 1] >> 16) & 0xFFFF0000); - wk += 2 * rowstride_dst; - } - for (int j = 1; j < m; j += 2) { - *wk = ((t[j - 1] >> 16) & 0xFFFF) | ((t[j - 1] >> 32) & 0xFFFF0000); - wk[rowstride_dst] = ((t[j] >> 16) & 0xFFFF) | ((t[j] >> 32) & 0xFFFF0000); - wk += 2 * rowstride_dst; - } - if (one_more) { *wk = ((t[m - 1] >> 16) & 0xFFFF) | ((t[m - 1] >> 32) & 0xFFFF0000); } - } else { - for (int j = 1; j < m; j += 2) { - *wk = (t[j - 1] & 0xFFFF) | ((t[j - 1] >> 16) & 0xFFFF0000); - wk[rowstride_dst] = (t[j] & 0xFFFF) | ((t[j] >> 16) & 0xFFFF0000); - wk += 2 * rowstride_dst; - } - if (one_more) { *wk = (t[m - 1] & 0xFFFF) | ((t[m - 1] >> 16) & 0xFFFF0000); } - } -} - -static inline void _mzd_copy_transpose_le64xle64(word *RESTRICT dst, word const *RESTRICT src, - wi_t rowstride_dst, wi_t rowstride_src, int n, - int m) { - word const *RESTRICT wks = src; - word t[64]; - int k; - for (k = 0; k < n; ++k) { - t[k] = *wks; - wks += rowstride_src; - } - while (k < 64) t[k++] = 0; - _mzd_copy_transpose_64x64(t, t, 1, 1); - word *RESTRICT wk = dst; - for (int k = 0; k < m; ++k) { - *wk = t[k]; - wk += rowstride_dst; - } - return; -} - -static inline void _mzd_copy_transpose_small(word *RESTRICT fwd, word const *RESTRICT fws, - wi_t rowstride_dst, wi_t rowstride_src, rci_t nrows, - rci_t ncols, rci_t maxsize) { - assert(maxsize < 64); - if (maxsize <= 8) { - _mzd_copy_transpose_le8xle8(fwd, fws, rowstride_dst, rowstride_src, nrows, ncols, maxsize); - } else if (maxsize <= 16) { - _mzd_copy_transpose_le16xle16(fwd, fws, rowstride_dst, rowstride_src, nrows, ncols, maxsize); - } else if (maxsize <= 32) { - _mzd_copy_transpose_le32xle32(fwd, fws, rowstride_dst, rowstride_src, nrows, ncols); - } else { - _mzd_copy_transpose_le64xle64(fwd, fws, rowstride_dst, rowstride_src, nrows, ncols); - } -} - - -void _mzd_transpose_base(word *RESTRICT fwd, word const *RESTRICT fws, wi_t rowstride_dst, - wi_t rowstride_src, rci_t nrows, rci_t ncols, rci_t maxsize) { - assert(maxsize >= 64); - // Note that this code is VERY sensitive. ANY change to _mzd_transpose can easily - // reduce the speed for small matrices (up to 64x64) by 5 to 10%. 
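Before the heavily tuned body that follows, it may help to see the tiling it implements in its simplest form: walk the source in 64 x 64 tiles, transpose each tile with the butterfly, and store it at the mirrored position. A stripped-down sketch (not part of the patch) that assumes both dimensions are multiples of 64, non-overlapping buffers, and the transpose64 helper sketched earlier; the real code adds the 64x64 pairing, the ragged-edge kernels, and the in-place handling:

    #include <stddef.h>
    #include <stdint.h>

    void transpose64(uint64_t a[64]);   /* the 64x64 butterfly sketched earlier */

    /* Gather / transpose / scatter over 64x64 tiles.  Row r of a matrix starts
     * at word r * rowstride; word w of a row covers columns 64*w .. 64*w+63. */
    static void transpose_tiled(uint64_t *dst, uint64_t const *src,
                                size_t rowstride_dst, size_t rowstride_src,
                                int nrows, int ncols) {
      for (int i = 0; i < nrows; i += 64) {        /* 64-row band of src     */
        for (int j = 0; j < ncols; j += 64) {      /* 64-column strip of src */
          uint64_t tile[64];
          for (int r = 0; r < 64; ++r)             /* gather tile (i, j)     */
            tile[r] = src[(size_t)(i + r) * rowstride_src + (size_t)(j / 64)];
          transpose64(tile);
          for (int r = 0; r < 64; ++r)             /* scatter to tile (j, i) */
            dst[(size_t)(j + r) * rowstride_dst + (size_t)(i / 64)] = tile[r];
        }
      }
    }

On top of this, the recursive driver later in the hunk (_mzd_transpose_notsmall) keeps the working set cache-sized by cutting the larger dimension at split_round(n, k), the smallest multiple of k not below n/2 (for example split_round(700, 64) = 384), so every cut stays word-aligned.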
- if (nrows >= 64) { - /* - * This is an interesting #if ... - * I recommend to investigate the number of instructions, and the clocks per instruction, - * as function of various sizes of the matrix (most likely especially the number of columns - * (the size of a row) will have influence; also always use multiples of 64 or even 128), - * for both cases below. - * - * To measure this run for example: - * - * ./bench_mzd -m 10 -x 10 -p PAPI_TOT_INS,PAPI_L1_TCM,PAPI_L2_TCM mzd_transpose 32000 32000 - * ./bench_mzd -m 10 -x 100 -p PAPI_TOT_INS,PAPI_L1_TCM,PAPI_L2_TCM mzd_transpose 128 10240 - * etc (increase -x for smaller sizes to get better accuracy). - * - * --Carlo Wood - */ -#if 1 - int js = ncols & nrows & 64; // True if the total number of whole 64x64 matrices is odd. - wi_t const rowstride_64_dst = 64 * rowstride_dst; - word *RESTRICT fwd_current = fwd; - word const *RESTRICT fws_current = fws; - if (js) { - js = 1; - _mzd_copy_transpose_64x64(fwd, fws, rowstride_dst, rowstride_src); - if ((nrows | ncols) == 64) { - return; - } - fwd_current += rowstride_64_dst; - ++fws_current; - } - rci_t const whole_64cols = ncols / 64; - // The use of delayed and even, is to avoid calling _mzd_copy_transpose_64x64_2 twice. - // This way it can be inlined without duplicating the amount of code that has to be loaded. - word *RESTRICT fwd_delayed = NULL; - word const *RESTRICT fws_delayed = NULL; - int even = 0; - while (1) { - for (int j = js; j < whole_64cols; ++j) { - if (!even) { - fwd_delayed = fwd_current; - fws_delayed = fws_current; - } else { - _mzd_copy_transpose_64x64_2(fwd_delayed, fwd_current, fws_delayed, fws_current, - rowstride_dst, rowstride_src); - } - fwd_current += rowstride_64_dst; - ++fws_current; - even = !even; - } - nrows -= 64; - if (ncols % 64) { - _mzd_copy_transpose_64xlt64(fwd + whole_64cols * rowstride_64_dst, fws + whole_64cols, - rowstride_dst, rowstride_src, ncols % 64); - } - fwd += 1; - fws += 64 * rowstride_src; - if (nrows < 64) break; - js = 0; - fws_current = fws; - fwd_current = fwd; - } -#else - // The same as the above, but without using _mzd_copy_transpose_64x64_2. - wi_t const rowstride_64_dst = 64 * DST->rowstride; - rci_t const whole_64cols = ncols / 64; - assert(nrows >= 64); - do { - for (int j = 0; j < whole_64cols; ++j) { - _mzd_copy_transpose_64x64(fwd + j * rowstride_64_dst, fws + j, DST->rowstride, - A->rowstride); - } - nrows -= 64; - if (ncols % 64) { - _mzd_copy_transpose_64xlt64(fwd + whole_64cols * rowstride_64_dst, fws + whole_64cols, - DST->rowstride, A->rowstride, ncols % 64); - } - fwd += 1; - fws += 64 * A->rowstride; - } while (nrows >= 64); -#endif - } - - if (nrows == 0) { - return; - } - - // Transpose the remaining top rows. Now 0 < nrows < 64. - - while (ncols >= 64) { - _mzd_copy_transpose_lt64x64(fwd, fws, rowstride_dst, rowstride_src, nrows); - ncols -= 64; - fwd += 64 * rowstride_dst; - fws += 1; - } - - if (ncols == 0) { - return ; - } - - maxsize = MAX(nrows, ncols); - - // Transpose the remaining corner. Now both 0 < nrows < 64 and 0 < ncols < 64. 
- _mzd_copy_transpose_small(fwd, fws, rowstride_dst, rowstride_src, nrows, ncols, maxsize); -} - -/* return the smallest multiple of k larger than n/2 */ -static inline rci_t split_round(rci_t n, rci_t k) { - rci_t half = n / 2; - return ((half + (k - 1)) / k) * k; -} - -static void _mzd_transpose_notsmall(word *RESTRICT fwd, word const *RESTRICT fws, wi_t rowstride_dst, - wi_t rowstride_src, rci_t nrows, rci_t ncols, rci_t maxsize) { - assert(maxsize >= 64); - - if (maxsize <= 512) { // just one big block - _mzd_transpose_base(fwd, fws, rowstride_dst, rowstride_src, nrows, ncols, maxsize); - } else { - rci_t large_size = split_round(maxsize, (maxsize <= 768) ? 64 : 512); - wi_t offset = large_size / m4ri_radix; - if (nrows >= ncols) { - word const *RESTRICT fws_up = fws; - word const *RESTRICT fws_down = fws + large_size * rowstride_src; - word *RESTRICT fwd_left = fwd; - word *RESTRICT fwd_right = fwd + offset; - rci_t maxsize_up = MAX(large_size, ncols); - rci_t maxsize_down = MAX(nrows - large_size, ncols); - _mzd_transpose_notsmall(fwd_left, fws_up, rowstride_dst, rowstride_src, large_size, ncols, maxsize_up); - _mzd_transpose_notsmall(fwd_right, fws_down, rowstride_dst, rowstride_src, nrows - large_size, ncols, maxsize_down); - } else { - word const *RESTRICT fws_left = fws; - word const *RESTRICT fws_right = fws + offset; - word *RESTRICT fwd_up = fwd; - word *RESTRICT fwd_down = fwd + large_size * rowstride_dst; - rci_t maxsize_left = MAX(nrows, large_size); - rci_t maxsize_right = MAX(nrows, ncols - large_size); - _mzd_transpose_notsmall(fwd_up, fws_left, rowstride_dst, rowstride_src, nrows, large_size, maxsize_left); - _mzd_transpose_notsmall(fwd_down, fws_right, rowstride_dst, rowstride_src, nrows, ncols - large_size, maxsize_right); - } - } -} - -static void _mzd_transpose(word *RESTRICT fwd, word const *RESTRICT fws, wi_t rowstride_dst, - wi_t rowstride_src, rci_t nrows, rci_t ncols, rci_t maxsize) { - // rationale: small blocks corresponds to the word size - // two big blocks fit in L1 cache (512 --> 8KB). - - if (maxsize < 64) { // super-fast path for very small matrices - _mzd_copy_transpose_small(fwd, fws, rowstride_dst, rowstride_src, nrows, ncols, maxsize); - } else { - _mzd_transpose_notsmall(fwd, fws, rowstride_dst, rowstride_src, nrows, ncols, maxsize); - } -} - - - -mzd_t *mzd_transpose(mzd_t *DST, mzd_t const *A) { - if (DST == NULL) { - DST = mzd_init(A->ncols, A->nrows); - } else if (__M4RI_UNLIKELY(DST->nrows != A->ncols || DST->ncols != A->nrows)) { - m4ri_die("mzd_transpose: Wrong size for return matrix.\n"); - } - - if (A->nrows == 0 || A->ncols == 0) - return mzd_copy(DST, A); - - rci_t maxsize = MAX(A->nrows, A->ncols); - if (__M4RI_LIKELY(!mzd_is_dangerous_window(DST))) { - _mzd_transpose(DST->data, A->data, DST->rowstride, A->rowstride, A->nrows, A->ncols, maxsize); - return DST; - } - - mzd_t *D = mzd_init(DST->nrows, DST->ncols); - _mzd_transpose(D->data, A->data, D->rowstride, A->rowstride, A->nrows, A->ncols, maxsize); - mzd_copy(DST, D); - mzd_free(D); - return DST; -} - mzd_t *mzd_mul_naive(mzd_t *C, mzd_t const *A, mzd_t const *B) { if (C == NULL) { C = mzd_init(A->nrows, B->ncols); diff --git a/m4ri/mzd.h b/m4ri/mzd.h index b4a644a..c49d609 100644 --- a/m4ri/mzd.h +++ b/m4ri/mzd.h @@ -594,22 +594,6 @@ static inline void mzd_row_add_offset(mzd_t *M, rci_t dstrow, rci_t srcrow, rci_ void mzd_row_add(mzd_t *M, rci_t const sourcerow, rci_t const destrow); -/** - * \brief Transpose a matrix. 
- * - * This function uses the fact that: -\verbatim - [ A B ]T [AT CT] - [ C D ] = [BT DT] - \endverbatim - * and thus rearranges the blocks recursively. - * - * \param DST Preallocated return matrix, may be NULL for automatic creation. - * \param A Matrix - */ - -mzd_t *mzd_transpose(mzd_t *DST, mzd_t const *A); - /** * \brief Naive cubic matrix multiplication. *
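To close the loop on the refactor: callers are unaffected, because the last patch adds the new header to the includes pulled in by m4ri.h. A minimal usage sketch (assuming an installed library and the usual <m4ri/m4ri.h> entry point):

    #include <assert.h>
    #include <m4ri/m4ri.h>

    int main(void) {
      mzd_t *A  = mzd_init(100, 200);      /* 100 x 200 matrix over GF(2) */
      mzd_randomize(A);
      mzd_t *AT = mzd_transpose(NULL, A);  /* NULL: allocate the result   */
      assert(AT->nrows == 200 && AT->ncols == 100);
      mzd_free(AT);
      mzd_free(A);
      return 0;
    }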