diff --git a/.github/workflows/Build.yml b/.github/workflows/Build.yml
index b0861cce..c9eb720c 100644
--- a/.github/workflows/Build.yml
+++ b/.github/workflows/Build.yml
@@ -18,17 +18,23 @@ jobs:
     # Steps represent a sequence of tasks that will be executed as part of the job
     steps:
     # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
 
     # Runs a single command using the runners shell
     - name: Install requirements
       run: |
         sudo apt-get update
-        sudo apt-get install -y gfortran openmpi-bin libopenmpi-dev
+        sudo apt-get install -y gfortran openmpi-bin libopenmpi-dev cmake
 
     # Runs a set of commands using the runners shell
     - name: Compile
-      run: make BUILD=dev
+      run: |
+        export FC=mpif90
+        cmake -S . -B build -DCMAKE_BUILD_TYPE=dev -DBUILD_TESTING=ON
+        cmake --build build --target decomp2d
 
     - name: Examples
-      run: make examples
+      run: |
+        export FC=mpif90
+        cmake -S . -B build -DCMAKE_BUILD_TYPE=dev -DBUILD_TESTING=ON
+        cmake --build build
diff --git a/.github/workflows/Build_all.yml b/.github/workflows/Build_all.yml
index ba4915ef..d4057275 100644
--- a/.github/workflows/Build_all.yml
+++ b/.github/workflows/Build_all.yml
@@ -18,17 +18,52 @@ jobs:
     # Steps represent a sequence of tasks that will be executed as part of the job
     steps:
     # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
 
     # Runs a single command using the runners shell
     - name: Install requirements
       run: |
         sudo apt-get update
-        sudo apt-get install -y gfortran openmpi-bin libopenmpi-dev
+        sudo apt-get install -y gfortran openmpi-bin libopenmpi-dev cmake
 
     # Runs a set of commands using the runners shell
     - name: Compile
-      run: make BUILD=dev && make examples
+      run: |
+        export FC=mpif90.openmpi
+        cmake -S . -B build -DCMAKE_BUILD_TYPE=Dev -DBUILD_TESTING=ON
+        cmake --build build --verbose
+
+    # Runs a set of commands using the runners shell
+    - name: Single precision
+      run: |
+        export FC=mpif90.openmpi
+        rm -rf ./build
+        cmake -S . -B build -DCMAKE_BUILD_TYPE=Dev -DBUILD_TESTING=ON -DDOUBLE_PRECISION=OFF
+        cmake --build build --verbose
+
+    # Runs a set of commands using the runners shell
+    - name: Double precision with single precision IO
+      run: |
+        export FC=mpif90.openmpi
+        rm -rf ./build
+        cmake -S . -B build -DCMAKE_BUILD_TYPE=Dev -DBUILD_TESTING=ON -DSINGLE_PRECISION_OUTPUT=ON
+        cmake --build build --verbose
+
+    # Runs a set of commands using the runners shell
+    - name: Padded alltoall transpose operations
+      run: |
+        export FC=mpif90.openmpi
+        rm -rf ./build
+        cmake -S . -B build -DCMAKE_BUILD_TYPE=Dev -DBUILD_TESTING=ON -DEVEN=ON
+        cmake --build build --verbose
+
+    # Runs a set of commands using the runners shell
+    - name: Debug flag for halo operations
+      run: |
+        export FC=mpif90.openmpi
+        rm -rf ./build
+        cmake -S . -B build -DCMAKE_BUILD_TYPE=Dev -DBUILD_TESTING=ON -DHALO_DEBUG=ON
+        cmake --build build --verbose
 
   bld_gnu_mpich:
     # The type of runner that the job will run on
@@ -37,45 +72,46 @@ jobs:
     # Steps represent a sequence of tasks that will be executed as part of the job
     steps:
     # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
 
     # Runs a single command using the runners shell
     - name: Install requirements
       run: |
         sudo apt-get update
-        sudo apt-get install -y gfortran mpich libmpich-dev
+        sudo apt-get install -y gfortran mpich libmpich-dev cmake
 
     # Runs a set of commands using the runners shell
     - name: Compile
-      run: make BUILD=dev && make examples
-
-###  bld_nvidia:
-###    # The type of runner that the job will run on
-###    runs-on: ubuntu-latest
-###    env:
-###      TMP: /opt/nvidia/hpc_sdk/Linux_x86_64/22.7/comm_libs/mpi/bin:/opt/nvidia/hpc_sdk/Linux_x86_64/22.7/compilers/bin
-###
-###    # Steps represent a sequence of tasks that will be executed as part of the job
-###    steps:
-###    # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
-###    - uses: actions/checkout@v3
-###
-###    # Add Nvidia HPC SDK
-###    - name: setup repo
-###      run: |
-###        echo 'deb [trusted=yes] https://developer.download.nvidia.com/hpc-sdk/ubuntu/amd64 /' | sudo tee /etc/apt/sources.list.d/nvhpc.list
-###    # Install dependencies
-###    - name: install
-###      run: |
-###        sudo apt-get update  -y
-###        sudo apt-get install -y nvhpc-22-7-cuda-multi
-###
-###    # Runs a set of commands using the runners shell
-###    - name: CPU version
-###      run: |
-###        PATH=$TMP:$PATH make CMP=nvhpc BUILD=debug
-###        PATH=$TMP:$PATH make examples
-###
+      run: |
+        export FC=mpif90.mpich
+        cmake -S . -B build -DCMAKE_BUILD_TYPE=Dev -DBUILD_TESTING=ON
+        cmake --build build --verbose
+
+  bld_nvidia:
+    # The type of runner that the job will run on
+    runs-on: ubuntu-latest
+    env:
+      TMP: /opt/nvidia/hpc_sdk/Linux_x86_64/24.11/comm_libs/mpi/bin:/opt/nvidia/hpc_sdk/Linux_x86_64/24.11/compilers/bin
+    # Steps represent a sequence of tasks that will be executed as part of the job
+    steps:
+    # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
+    - uses: actions/checkout@v4
+
+    # Add Nvidia HPC SDK
+    - name: setup repo
+      run: |
+        echo 'deb [trusted=yes] https://developer.download.nvidia.com/hpc-sdk/ubuntu/amd64 /' | sudo tee /etc/apt/sources.list.d/nvhpc.list
+    # Install dependencies
+    - name: install
+      run: |
+        sudo apt-get update  -y
+        sudo apt-get install -y nvhpc-24-11
+    # Runs a set of commands using the runners shell
+    - name: CPU version
+      run: |
+        PATH=$TMP:$PATH FC=mpif90 cmake -S . -B build -DCMAKE_BUILD_TYPE=Dev -DBUILD_TESTING=ON
+        PATH=$TMP:$PATH cmake --build build --verbose
+
 ###    - name: GPU version
 ###      run: |
 ###        PATH=$TMP:$PATH make clean
@@ -88,7 +124,7 @@ jobs:
     # Steps represent a sequence of tasks that will be executed as part of the job
     steps:
     # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
 
     # Add Intel OneApi
     - name: setup repo
@@ -98,15 +134,25 @@ jobs:
     - name: install
       run: |
         sudo apt-get update
-        sudo apt-get install -y intel-oneapi-common-vars intel-oneapi-compiler-fortran intel-oneapi-mkl intel-oneapi-mkl-devel intel-oneapi-mpi intel-oneapi-mpi-devel
+        sudo apt-get install -y intel-oneapi-common-vars intel-oneapi-compiler-fortran intel-oneapi-mkl intel-oneapi-mkl-devel intel-oneapi-mpi intel-oneapi-mpi-devel cmake
 
     # Runs a set of commands using the runners shell
     - name: Compile
-      run: source /opt/intel/oneapi/setvars.sh && make CMP=intel BUILD=debug && make examples
+      run: |
+        source /opt/intel/oneapi/setvars.sh
+        export FC=mpiifx
+        cmake -S . -B build -DCMAKE_BUILD_TYPE=Dev -DBUILD_TESTING=ON
+        cmake --build build --verbose
 
     # Runs a set of commands using the runners shell
     - name: Compile
-      run: source /opt/intel/oneapi/setvars.sh && make clean && make CMP=intel FFT=mkl BUILD=debug && make examples
+      run: |
+        source /opt/intel/oneapi/setvars.sh
+        export FC=mpiifx
+        export MKL_DIR=${MKLROOT}/lib/cmake/mkl
+        rm -rf ./build
+        cmake -S . -B build -DCMAKE_BUILD_TYPE=Dev -DFFT_Choice=mkl -DBUILD_TESTING=ON
+        cmake --build build --verbose
 
   bld_fftw3:
     # The type of runner that the job will run on
@@ -115,7 +161,7 @@ jobs:
     # Steps represent a sequence of tasks that will be executed as part of the job
     steps:
     # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
 
     # Runs a single command using the runners shell
     - name: Install requirements
@@ -126,15 +172,16 @@ jobs:
     # Runs a set of commands using the runners shell
     - name: Regular FFTW3
       run: |
-        make clean
-        make BUILD=dev FFT=fftw3 FFTW3_PATH_INCLUDE=/usr/include FFTW3_PATH_LIB=/usr/lib/x86_64-linux-gnu
-        make examples
+        export FC=mpif90
+        cmake -S . -B build -DCMAKE_BUILD_TYPE=Dev -DFFT_Choice=fftw -DBUILD_TESTING=ON
+        cmake --build build --verbose
 
     - name: New FFTW3
       run: |
-        make clean
-        make BUILD=dev FFT=fftw3_f03 FFTW3_PATH_INCLUDE=/usr/include FFTW3_PATH_LIB=/usr/lib/x86_64-linux-gnu
-        make examples
+        export FC=mpif90
+        rm -rf ./build
+        cmake -S . -B build -DCMAKE_BUILD_TYPE=Dev -DFFT_Choice=fftw_f03 -DBUILD_TESTING=ON
+        cmake --build build --verbose
 
   bld_caliper:
     # The type of runner that the job will run on
@@ -143,7 +190,7 @@ jobs:
     # Steps represent a sequence of tasks that will be executed as part of the job
     steps:
     # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
-    - uses: actions/checkout@v3
+    - uses: actions/checkout@v4
 
     # Runs a single command using the runners shell
     - name: Install requirements
@@ -154,14 +201,16 @@ jobs:
     # Runs a set of commands using the runners shell
     - name: Compile caliper
       run: |
-        wget --no-check-certificate https://github.com/LLNL/Caliper/archive/refs/tags/v2.8.0.tar.gz
-        tar xzf v2.8.0.tar.gz && cd Caliper-2.8.0
+        wget --no-check-certificate https://github.com/LLNL/Caliper/archive/refs/tags/v2.12.1.tar.gz
+        tar xzf v2.12.1.tar.gz && cd Caliper-2.12.1
         mkdir build && cd build
-        cmake -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DCMAKE_Fortran_COMPILER=gfortran -DCMAKE_INSTALL_PREFIX=../../Caliper-2.8.0_bld -DWITH_FORTRAN=yes -DWITH_MPI=yes ../
+        echo ${PWD}
+        cmake -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DCMAKE_Fortran_COMPILER=gfortran -DCMAKE_INSTALL_PREFIX=${HOME}/Caliper-2.12.1_bld -DWITH_FORTRAN=yes -DWITH_MPI=yes ../
         make -j && make install
         cd ../../
 
     - name: Compile
       run: |
-        make clean
-        make BUILD=dev PROFILER=caliper CALIPER_PATH=./Caliper-2.8.0_bld
+        export caliper_DIR=${HOME}/Caliper-2.12.1_bld/share/cmake/caliper
+        FC=mpif90 cmake -S . -B build -DCMAKE_BUILD_TYPE=Dev -DENABLE_PROFILER=caliper -DBUILD_TESTING=ON
+        cmake --build build --verbose
diff --git a/.github/workflows/RunnerFastCI.yml b/.github/workflows/RunnerFastCI.yml
new file mode 100644
index 00000000..80bce909
--- /dev/null
+++ b/.github/workflows/RunnerFastCI.yml
@@ -0,0 +1,92 @@
+# This is a basic workflow to help you get started with Actions
+name: FastCI4Push
+
+defaults:
+  run:
+    shell: bash 
+
+# Controls when the action will run. Triggers the workflow on pull request
+# events but only for the master branch
+on: [push, fork]
+
+# A workflow run is made up of one or more jobs that can run sequentially or in parallel
+jobs:
+  # This workflow contains a job called "bld_gnu_ompi"
+  Full_CI_workflow:
+    # The type of runner that the job will run on
+    runs-on: [self-hosted, linux, x64]
+
+    # Steps represent a sequence of tasks that will be executed as part of the job
+    steps:
+    # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
+    - uses: actions/checkout@v3
+      with: 
+        clean: true
+
+    # Configure-Build-Run-Run on 4 cores 
+    - name: Compile NHVHPC CPU
+      run: |
+        ls -ltr ~/.bashrc
+        source /etc/profile.d/lmod.sh
+        source ~/.bash_aliases
+        module load nvhpc
+        export FC=mpif90
+        cmake -S . -B build -DBUILD_TESTING=ON
+        cmake -S . -B build -DMPIEXEC_MAX_NUMPROCS=4
+        cmake --build build -j 4
+        cmake --install build 
+        ctest --test-dir build
+        rm -rf build  
+    
+    # Configure-Build-Run 
+    - name: Compile NHVHPC GPU NCCL
+      run: |
+        ls -ltr ~/.bashrc
+        source /etc/profile.d/lmod.sh
+        source ~/.bash_aliases
+        module load nvhpc
+        export FC=mpif90
+        cmake -S . -B build -DBUILD_TESTING=ON -DBUILD_TARGET=gpu -DENABLE_NCCL=yes
+        cmake --build build -j 4
+        cmake --install build 
+        ctest --test-dir build 
+        rm -rf build  
+    
+    # Configure-Build-Run-Run on 4 cores 
+    - name: Compile GNU_Dev openMPI FFTW_F03 ADIOS2 CPU
+      run: |
+        ls -ltr ~/.bashrc
+        source /etc/profile.d/lmod.sh
+        source ~/.bash_aliases
+        module load foss
+        export FC=mpif90
+        export CC=mpicc
+        export CXX=mpicxx
+        cmake -S . -B build -DCMAKE_BUILD_TYPE=Dev -DBUILD_TESTING=ON -DIO_BACKEND=adios2 -Dadios2_DIR=~/GIT/ADIOS2/build/build_foss2023b/opt/lib/cmake/adios2 -DFFT_Choice=fftw_f03
+        cmake -S . -B build -DMPIEXEC_MAX_NUMPROCS=4
+        cmake --build build -j 4
+        cmake --install build 
+        ctest --test-dir build
+        rm -rf build  
+    
+    # Configure-Build-Run-Run on 4 cores 
+    - name: Compile IntelLLVM MKL CPU
+      run: |
+        ls -ltr ~/.bashrc
+        source /etc/profile.d/lmod.sh
+        source ~/.bash_aliases
+        ml tbb
+        ml compiler-rt
+        ml umf
+        ml compiler
+        ml mpi
+        ml mkl
+        export FC=mpiifx
+        export MKL_DIR=${MKLROOT}/lib/cmake/mkl
+        cmake -S . -B build -DCMAKE_BUILD_TYPE=Dev -DBUILD_TESTING=ON -DFFT_Choice=mkl
+        cmake -S . -B build -DMPIEXEC_MAX_NUMPROCS=4
+        cmake --build build -j 4
+        cmake --install build 
+        ctest --test-dir build
+        rm -rf build 
+
diff --git a/.github/workflows/RunnerFullCI.yml b/.github/workflows/RunnerFullCI.yml
new file mode 100644
index 00000000..2ae07096
--- /dev/null
+++ b/.github/workflows/RunnerFullCI.yml
@@ -0,0 +1,236 @@
+# This is a basic workflow to help you get started with Actions
+name: FullCI4PR
+
+defaults:
+  run:
+    shell: bash 
+
+# Controls when the action will run. Triggers the workflow on pull request
+# events but only for the master branch
+on: [pull_request, fork]
+
+# A workflow run is made up of one or more jobs that can run sequentially or in parallel
+jobs:
+  # This workflow contains a job called "bld_gnu_ompi"
+  Full_CI_workflow:
+    # The type of runner that the job will run on
+    runs-on: [self-hosted, linux, x64]
+
+    # Steps represent a sequence of tasks that will be executed as part of the job
+    steps:
+    # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
+    - uses: actions/checkout@v3
+      with: 
+        clean: true
+
+    # Configure-Build-Run-Run on 4 cores 
+    - name: Compile NHVHPC CPU
+      run: |
+        ls -ltr ~/.bashrc
+        source /etc/profile.d/lmod.sh
+        source ~/.bash_aliases
+        module load nvhpc
+        export FC=mpif90
+        cmake -S . -B build -DBUILD_TESTING=ON
+        cmake -S . -B build -DNX=128 -DNY=128 -DNX=128 -DMPIEXEC_MAX_NUMPROCS=4
+        cmake --build build -j 4
+        cmake --install build 
+        ctest --test-dir build
+        rm -rf build  
+    
+    # Run and Compile NHHPC for CPU with ADIOS2
+    # Need to fix issue with DEV build and latest NVHPC
+    # Need to revert to 23.9
+    - name: Compile NHVHPC CPU & ADIOS2
+      run: |
+        ls -ltr ~/.bashrc
+        source /etc/profile.d/lmod.sh
+        source ~/.bash_aliases
+        module load nvhpc/23.9
+        export FC=mpif90
+        export CC=mpicc
+        export CXX=mpicxx
+        cmake -S . -B build -DCMAKE_BUILD_TYPE=Dev -DBUILD_TESTING=ON -DIO_BACKEND=adios2 -Dadios2_DIR=~/GIT/ADIOS2/build/build_nvhpc241/opt/lib/cmake/adios2
+        cmake -S . -B build -DNX=128 -DNY=128 -DNX=128 -DMPIEXEC_MAX_NUMPROCS=4
+        cmake --build build -j 4
+        cmake --install build 
+        ctest --test-dir build
+        rm -rf build  
+    
+    # Configure-Build-Run-Run on 4 cores 
+    - name: Compile NHVHPC GPU cuMPI
+      run: |
+        ls -ltr ~/.bashrc
+        source /etc/profile.d/lmod.sh
+        source ~/.bash_aliases
+        module load nvhpc
+        export FC=mpif90
+        cmake -S . -B build -DBUILD_TESTING=ON -DBUILD_TARGET=gpu -DENABLE_NCCL=no
+        cmake -S . -B build -DNX=128 -DNY=128 -DNX=128
+        cmake --build build -j 4
+        cmake --install build 
+        ctest --test-dir build
+        rm -rf build  
+    
+    # Configure-Build-Run 
+    - name: Compile NHVHPC GPU NCCL
+      run: |
+        ls -ltr ~/.bashrc
+        source /etc/profile.d/lmod.sh
+        source ~/.bash_aliases
+        module load nvhpc
+        export FC=mpif90
+        cmake -S . -B build -DBUILD_TESTING=ON -DBUILD_TARGET=gpu -DENABLE_NCCL=yes
+        cmake -S . -B build -DNX=128 -DNY=128 -DNX=128
+        cmake --build build -j 4
+        cmake --install build 
+        ctest --test-dir build
+        rm -rf build  
+    
+    # Configure-Build-Run 
+    - name: Compile NHVHPC GPU NCCL MemManag
+      run: |
+        ls -ltr ~/.bashrc
+        source /etc/profile.d/lmod.sh
+        source ~/.bash_aliases
+        module load nvhpc
+        export FC=mpif90
+        cmake -S . -B build -DBUILD_TESTING=ON -DBUILD_TARGET=gpu -DENABLE_NCCL=yes -DENABLE_MANAGED=yes
+        cmake -S . -B build -DNX=128 -DNY=128 -DNX=128
+        cmake --build build -j 4
+        cmake --install build 
+        ctest --test-dir build
+        rm -rf build  
+
+    # Configure-Build-Run-Run on 4 cores 
+    - name: Compile GNU openMPI FFTW CPU
+      run: |
+        ls -ltr ~/.bashrc
+        source /etc/profile.d/lmod.sh
+        source ~/.bash_aliases
+        module load foss
+        export FC=mpif90
+        cmake -S . -B build -DBUILD_TESTING=ON -DFFT_Choice=fftw
+        cmake -S . -B build -DNX=128 -DNY=128 -DNX=128 -DMPIEXEC_MAX_NUMPROCS=4
+        cmake --build build -j 4
+        cmake --install build 
+        ctest --test-dir build
+        rm -rf build  
+
+    # Configure-Build-Run-Run on 4 cores 
+    - name: Compile GNU openMPI FFTW_F03 CPU
+      run: |
+        ls -ltr ~/.bashrc
+        source /etc/profile.d/lmod.sh
+        source ~/.bash_aliases
+        module load foss
+        export FC=mpif90
+        cmake -S . -B build -DBUILD_TESTING=ON -DFFT_Choice=fftw_f03
+        cmake -S . -B build -DNX=128 -DNY=128 -DNX=128 -DMPIEXEC_MAX_NUMPROCS=4
+        cmake --build build -j 4
+        cmake --install build 
+        ctest --test-dir build
+        rm -rf build
+
+    # Configure-Build-Run-Run on 4 cores 
+    - name: Compile GNU openMPI FFTW_F03 Caliper CPU
+      run: |
+        ls -ltr ~/.bashrc
+        source /etc/profile.d/lmod.sh
+        source ~/.bash_aliases
+        module load foss
+        export FC=mpif90
+        export CC=mpicc
+        export CXX=mpicxx
+        export caliper_DIR=~/GIT/caliper_github/build_gnu/opt/share/cmake/caliper/
+        #cmake -S . -B build -DBUILD_TESTING=ON -DFFT_Choice=fftw_f03 -DENABLE_PROFILER=caliper
+        cmake -S . -B build -DBUILD_TESTING=ON -DFFT_Choice=fftw_f03 
+        cmake -S . -B build -DNX=128 -DNY=128 -DNX=128 -DMPIEXEC_MAX_NUMPROCS=4
+        cmake --build build -j 4
+        cmake --install build 
+        ctest --test-dir build
+        rm -rf build  
+    
+    # Configure-Build-Run-Run on 4 cores 
+    - name: Compile GNU_Dev openMPI FFTW_F03 ADIOS2 CPU
+      run: |
+        ls -ltr ~/.bashrc
+        source /etc/profile.d/lmod.sh
+        source ~/.bash_aliases
+        module load foss
+        export FC=mpif90
+        export CC=mpicc
+        export CXX=mpicxx
+        cmake -S . -B build -DCMAKE_BUILD_TYPE=Dev -DBUILD_TESTING=ON -DIO_BACKEND=adios2 -Dadios2_DIR=~/GIT/ADIOS2/build/build_foss2023b/opt/lib/cmake/adios2 -DFFT_Choice=fftw_f03
+        cmake -S . -B build -DNX=128 -DNY=128 -DNX=128 -DMPIEXEC_MAX_NUMPROCS=4
+        cmake --build build -j 4
+        cmake --install build 
+        ctest --test-dir build
+        rm -rf build  
+    
+    # Configure-Build-Run-Run on 4 cores 
+    - name: Compile Intel MKL CPU
+      run: |
+        ls -ltr ~/.bashrc
+        source /etc/profile.d/lmod.sh
+        source ~/.bash_aliases
+        module avail 
+        ml tbb
+        ml compiler-rt
+        ml umf
+        ml compiler
+        ml mpi
+        ml mkl
+        ml ifort
+        export FC=mpiifort
+        export MKL_DIR=${MKLROOT}/lib/cmake/mkl
+        cmake -S . -B build -DCMAKE_BUILD_TYPE=Dev -DBUILD_TESTING=ON -DFFT_Choice=mkl
+        cmake -S . -B build -DNX=128 -DNY=128 -DNX=128 -DMPIEXEC_MAX_NUMPROCS=4
+        cmake --build build -j 4
+        cmake --install build 
+        ctest --test-dir build
+        rm -rf build  
+    
+    # Configure-Build-Run-Run on 4 cores 
+    - name: Compile IntelLLVM MKL CPU
+      run: |
+        ls -ltr ~/.bashrc
+        source /etc/profile.d/lmod.sh
+        source ~/.bash_aliases
+        ml tbb
+        ml compiler-rt
+        ml umf
+        ml compiler
+        ml mpi
+        ml mkl
+        export export FC=mpiifx
+        export MKL_DIR=${MKLROOT}/lib/cmake/mkl
+        cmake -S . -B build -DCMAKE_BUILD_TYPE=Dev -DBUILD_TESTING=ON -DFFT_Choice=mkl
+        cmake -S . -B build -DNX=128 -DNY=128 -DNX=128 -DMPIEXEC_MAX_NUMPROCS=4
+        cmake --build build -j 4
+        cmake --install build 
+        ctest --test-dir build
+        rm -rf build 
+
+    # Configure-Build-Run-Run on 4 cores 
+    - name: Compile IntelLLVM MKL ADIOS2 CPU
+      run: |
+        ls -ltr ~/.bashrc
+        source /etc/profile.d/lmod.sh
+        source ~/.bash_aliases
+        ml tbb
+        ml compiler-rt
+        ml umf
+        ml compiler
+        ml mpi
+        ml mkl
+        export FC=mpiifx
+        export CXX=mpiicpx
+        export CC=mpiicx
+        export MKL_DIR=${MKLROOT}/lib/cmake/mkl
+        cmake -S . -B build -DCMAKE_BUILD_TYPE=Dev -DBUILD_TESTING=ON -DIO_BACKEND=adios2 -Dadios2_DIR=~/GIT/ADIOS2/build/build_intelLLVM/opt/lib/cmake/adios2 -DFFT_Choice=mkl
+        cmake -S . -B build -DCMAKE_BUILD_TYPE=Dev -DFFT_Choice=mkl
+        cmake --build build -j 4
+        cmake --install build 
+        ctest --test-dir build
+        rm -rf build  
diff --git a/.gitignore b/.gitignore
index 8bc244b8..743fe033 100644
--- a/.gitignore
+++ b/.gitignore
@@ -32,7 +32,7 @@
 *.app
 
 # Cmake
-build
+build*
 
 # Log files
 *.log
@@ -45,4 +45,7 @@ Makefile.settings
 
 # emacs
 .dir-locals.el
-TAGS
\ No newline at end of file
+TAGS
+
+# vim
+*.swp
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 948f2b20..c74156f3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,75 +1,113 @@
-cmake_minimum_required(VERSION 3.9)
+cmake_minimum_required(VERSION 3.20)                        
+cmake_policy(SET CMP0074 NEW)
 
-project(2decomp-fft Fortran)
+project(decomp2d
+  LANGUAGES Fortran)
+set(version 2.0.4)
+enable_testing()
+if (IO_BACKEND MATCHES "adios2")
+  # Can be useful to also activate CXX, sometimes is needed by packages
+  enable_language(C CXX)
+endif (IO_BACKEND MATCHES "adios2")
 
-if (NOT CMAKE_BUILD_TYPE)
-    message(STATUS "No build type selected, default to Release")
-    set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Build type (default Release)" FORCE)
-endif()
+set(BUILD_TARGET "mpi" CACHE STRING "Target for acceleration (mpi (default) or gpu)")
+set_property(CACHE BUILD_TARGET PROPERTY STRINGS mpi gpu)
 
-find_package(MPI REQUIRED)
-
-if(CMAKE_Fortran_COMPILER_ID MATCHES "GNU")
-    set(CMAKE_Fortran_FLAGS "-g -fbacktrace -fdefault-real-8 -ffree-line-length-none -ffpe-trap=invalid,zero,overflow -fallow-argument-mismatch -cpp")
-    set(CMAKE_Fortran_FLAGS_DEBUG "-O0 -finit-real=nan -fcheck=all -Wextra -Wconversion -pedantic")
-    set(CMAKE_Fortran_FLAGS_RELEASE "-O3")
-elseif(CMAKE_Fortran_COMPILER_ID MATCHES "Intel")
-    # FIXME: `-heap-arrays 10` required as using Intel Fortran
-    # means they are stored on the stack (by default), whereas GNU Fortran stores
-    # them on the heap (https://github.com/uDALES/u-dales/issues/13).
-    set(CMAKE_Fortran_FLAGS "-g -traceback -r8 -fpe0 -heap-arrays 10")
-    set(CMAKE_Fortran_FLAGS_DEBUG "-O0 -init=snan -CB -check all")
-    set(CMAKE_Fortran_FLAGS_RELEASE "-O3")
-elseif(CMAKE_Fortran_COMPILER_ID MATCHES "Cray")
-    # https://pubs.cray.com/content/S-3901/8.5/cray-fortran-reference-manual-85
-    set(CMAKE_Fortran_FLAGS "-eF -N 1023 -M 296")
-    set(CMAKE_Fortran_FLAGS_DEBUG "-G2 -en")
-    set(CMAKE_Fortran_FLAGS_RELEASE "-g -O3")
-else()
-    message(FATAL_ERROR "Only GNU, Intel, and Cray Fortran compilers are supported")
-endif()
+if (BUILD_TARGET MATCHES "gpu")
+  option(ENABLE_OPENACC "Allow user to activate/deactivate OpenACC support" ON)
+  option(ENABLE_CUDA "Allow user to activate/deactivate CUDA support" ON)
+  option(ENABLE_MANAGED "Allow user to activate/deactivate automatic memory managment from NVHPC" OFF)
+  option(ENABLE_NCCL "Allow user to activate/deactivate Collective Comunication NCCL" OFF)
 
-file(GLOB files_decomp src/factor.f90 src/decomp_2d.f90 src/log.f90 src/glassman.f90)
+  if (ENABLE_CUDA)
+    message(STATUS "Before enable CUDA")
+    enable_language(CUDA)
+    message(STATUS "After enable CUDA")
+  endif()
 
-#if(${FFT_Choice} MATCHES "generic")
-#  file(GLOB files_fft src/fft_generic.f90)
-#endif(${FFT_Choice} MATCHES "generic")
+endif(BUILD_TARGET MATCHES "gpu")
 
-if(FFTW_FOUND)
-  message (STATUS "Compiling using FFTW3")
-  file(GLOB files_fft src/fft_fftw3.f90)
-else(FFTW_FOUND)
-  message (STATUS "Compiling using Generic FFT")
-   file(GLOB files_fft src/fft_generic.f90)
-endif(FFTW_FOUND)
+set (ENABLE_PROFILER "OFF" CACHE STRING "Activate/select the profiler")
+set_property(CACHE ENABLE_PROFILER PROPERTY STRINGS OFF caliper)
+option(ENABLE_INPLACE "Enable in-place operation of FFT" OFF)
+option(HALO_DEBUG "Enable halo debugging options" OFF)
+option(EVEN "Padded alltoall transpose operations" OFF)
+option(BUILD_SHARED_LIBS "Set ON to build a shared library" OFF)
+set (IO_BACKEND "mpi" CACHE STRING "Default IO backend (mpi (default) or adios2)")
+set_property(CACHE IO_BACKEND PROPERTY STRINGS mpi adios2)
 
-option(DOUBLE_PRECISION "Build 2decomp_fft with double precision" ON)
-if (DOUBLE_PRECISION)
-  add_definitions("-DDOUBLE_PREC")
-endif()
+set(AUTHOR "Stefano Rolfo;Charles Moulinec;Paul Bartholomew")
+set(AUTHOR_DETAILS "stefano.rolfo@stfc.ac.uk;charles.moulinec@stfc.ac.uk;p.bartholomew@epcc.ed.ac.uk")
+set(DESCRIPTION "Building 2decomp&fft using cmake")
+
+message(STATUS "building ${PROJECT_NAME}")
 
-option(SINGLE_PRECISION_OUTPUT "Build 2decomp_fft with output in single precision" OFF)
-if (SINGLE_PRECISION_OUTPUT)
-  add_definitions("-DSAVE_SINGLE")
+include(GNUInstallDirs)
+set(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/${CMAKE_INSTALL_LIBDIR})
+set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/${CMAKE_INSTALL_BINDIR})
+set(CMAKE_Fortran_MODULE_DIRECTORY ${PROJECT_BINARY_DIR}/${CMAKE_INSTALL_INCLUDEDIR})
+if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
+  set(CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/opt" CACHE PATH "..." FORCE)
 endif()
 
-set(SRCFILES ${files_decomp} ${files_fft})
+# Add support for CMAKE_DEPENDENT_OPTION
+INCLUDE(CMakeDependentOption)
+INCLUDE(CMakeParseArguments)
+
+# make sure that the default is a RELEASE
+if (NOT CMAKE_BUILD_TYPE)
+  set (CMAKE_BUILD_TYPE RELEASE CACHE STRING
+      "Choose the type of build, options are: None Dev Debug Release."
+      FORCE)
+endif (NOT CMAKE_BUILD_TYPE)
+
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake" "${CMAKE_SOURCE_DIR}/cmake/compilers" "${CMAKE_SOURCE_DIR}/cmake/fft")
+
+if (BUILD_TARGET MATCHES "gpu")
+  include(D2D_GPU)
+endif (BUILD_TARGET MATCHES "gpu")
+
+include(D2D_MPI)
+
+# FFT options
+if (BUILD_TARGET MATCHES "gpu")
+  set(FFT_Choice "cufft" CACHE STRING "FFT for XCompact3d project (with GPU cufft is the default)")
+else ()
+  set(FFT_Choice "generic" CACHE STRING "FFT for XCompact3d project (generic is the default)")
+endif (BUILD_TARGET MATCHES "gpu")
+set_property(CACHE FFT_Choice PROPERTY STRINGS generic fftw fftw_f03 mkl cufft)
+
+# Set now compilers flags
+include(D2D_Compilers)
+include(D2D_Profilers)
+include(fft)
 
-add_library(2decomp-fft STATIC ${SRCFILES})
-if (MPI_FOUND)
-  target_link_libraries(2decomp-fft PRIVATE MPI::MPI_Fortran)
-endif (MPI_FOUND)
+# Create a static library for the fft
+add_subdirectory(src)
 
-if(FFTW_FOUND)
-  target_include_directories(2decomp-fft PRIVATE ${FFTW_INCLUDE_DIRS})
-  #target_include_directories(2decomp-fft PRIVATE ${PKG_FFTW_LIBRARY_DIRS})
-endif(FFTW_FOUND)
+# Add tests
+option(BUILD_TESTING "Build with tests" OFF)
+if (${BUILD_TESTING})
+  add_subdirectory(examples)
+endif (${BUILD_TESTING})
 
-#install(TARGETS decomp2d
-#        RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
-#        LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
-#        ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
-#)
+# Add a prettify target
+add_custom_target(format sh ${CMAKE_SOURCE_DIR}/scripts/format.sh
+  WORKING_DIRECTORY ${CMAKE_SOURCE_DIR})
 
-#add_subdirectory(examples)
-#include_directories(examples)
+# # Create an example dir with all input.i3d example files
+# option(BUILD_TESTING "Build with tests" ON)
+# set(test_dir "${PROJECT_BINARY_DIR}/Test")
+# message(STATUS "Before test main ${test_dir}")
+# if (${BUILD_TESTING})
+#   file(MAKE_DIRECTORY ${test_dir})
+#   include(CTest)
+#   message(STATUS "MPI INCL ALSO FOUND: ${MPI_INCLUDE_PATH}")
+#   message(STATUS "MPI EXEC: ${MPIEXEC_EXECUTABLE}")
+#   file(MAKE_DIRECTORY ${test_dir}/App)
+#   add_test(NAME TestAPP COMMAND ${MPIEXEC_EXECUTABLE} -n ${MPIEXEC_MAX_NUMPROCS} ${CMAKE_INSTALL_PREFIX}/bin/xcompact3d WORKING_DIRECTORY ${test_dir}/App)
+#   #if (PFUNIT_FOUND)
+#   #  file(MAKE_DIRECTORY ${test_dir}/Verif_x_00)
+#   #  add_test(NAME Test_dx_00 COMMAND ${MPIEXEC_EXECUTABLE} -n ${MPIEXEC_MAX_NUMPROCS} ${CMAKE_INSTALL_PREFIX}/bin/verif_x_00 WORKING_DIRECTORY ${test_dir}/Verif_x_00)
+#   #endif (PFUNIT_FOUND)
+# endif()
diff --git a/CONTRIBUTORS b/CONTRIBUTORS
new file mode 100644
index 00000000..1135ecdc
--- /dev/null
+++ b/CONTRIBUTORS
@@ -0,0 +1,10 @@
+Alphabetic order:
+
+Paul Bartholomew (EPCC, UK)
+Cedric Flageul (Poitiers, France)
+Sylvain Laizet (ICL, UK)
+Thibault Lestang (ICL, UK)
+Ning Li (NAG, UK)
+Charles Moulinec (STFC, UK)
+Stefano Rolfo (STFC, UK)
+Filippo Spiga (NVIDIA, UK)
diff --git a/Contribute.md b/Contribute.md
new file mode 100644
index 00000000..e189fa6e
--- /dev/null
+++ b/Contribute.md
@@ -0,0 +1,57 @@
+# Contributing
+
+1. You want to contribute but have no idea how ? Please refer to the [Get started](#get-started) section.
+2. You have identified a bug ? Please refer to the [Bug](#bug) section.
+3. You have improved a part of the code or have developed a new functionality ? Please refer to the [advanced](#advanced-contribution) section.
+
+## Get started
+
+The recommended strategy to contribute is to start with a [discussion](https://github.com/2decomp-fft/2decomp-fft/discussions) or to pick an existing issue.
+To modify or experiment with the code, fork the 2DECOMP&FFT github repository and commit changes in a dedicated branch of your fork.
+When the modification is ready for review, one can open a pull request as described in the [advanced](#advanced-contribution) section below.
+
+## Bug
+
+It appears that you have identified a bug in the 2DECOMP&FFT library.
+If you are not sure this is really a bug in the library, you should go to the [discussions](https://github.com/2decomp-fft/2decomp-fft/discussions) section and open a new discussion.
+Otherwise, follow the steps below.
+
+Firstly, try to reproduce the error with a debug build of the library, a small problem size and a small number of MPI ranks.
+It makes bug-hunting much easier.
+Unfortunately, this is not always possible.
+Please note that for a debug build, the log contains all the environment variables.
+Use it to hunt the bug but think twice before sharing it as it can expose sensitive and personal information.
+At least, please try to reproduce the bug on another machine with another compiler.
+
+Secondly, if you have modified the source code of the 2DECOMP&FFT library, you must reproduce the bug without the modifications in 2DECOMP&FFT.
+The development team can only provide support for sections of code available in the present repository.
+
+Thirdly, you must provide a minimal working example.
+The program using 2DECOMP&FFT and exposing the bug should be relatively small.
+The development team will not provide support if the program exposing the bug is very long.
+The programs available in the examples section are a good starting point for a minimal working example, ideally you could contribute the minimal working example to the existing examples.
+
+At this stage, you probably did your best to simplify the problem at hand.
+Open an issue and select the bug report template.
+Provide a meaningful title, do your best to complete all the sections of the template and provide the version of the compiler, the version of the MPI / FFT library, ...
+If you think you have a fix for the bug, please expose it inside the issue.
+It is recommended to wait for feedback before opening a pull request.
+
+## Advanced contribution
+
+One should read this section before opening a pull request.
+To fix a bug, please open an issue and use the bug report template first.
+To improve a part of the code or develop a new functionality, please open an issue and use the feature request template first.
+If you are not sure about your contribution, open a [discussion](https://github.com/2decomp-fft/2decomp-fft/discussions) first.
+This helps raise awareness of the work and prevents repeated effort from occurring.
+
+The code in a pull-request should be formatted using the `fprettify` program.
+See the code sample in the `scripts` folder.
+
+Pull requests must be focused, small, coherent and have a detailed description.
+The longer the pull request, the harder the review.
+Please empathise with your fellow contributors who are going to spend time reviewing your code.
+
+As long as the pull request is open for discussion and not ready for merging, convert it to draft.
+Whenever it is ready for merging, convert it to a regular pull request.
+Please note that reviewers might push modifications directly to your branch or request changes.
diff --git a/HOWTO.md b/HOWTO.md
new file mode 100644
index 00000000..2f188e05
--- /dev/null
+++ b/HOWTO.md
@@ -0,0 +1,147 @@
+# How to use 2DECOMP&FFT
+This document presents the main features of 2DECOMP&FFT library. 
+Detailed instructions on how to use the 2DECOMP&FFT library can be found
+[here](https://2decomp-fft.github.io/).
+
+The 2D Pencil Decomposition API is defined with three Fortran module which should be used by applications as:
+```
+  use decomp_2d_constants
+  use decomp_2d_mpi
+  use decomp_2d
+```
+where ``use decomp_2d_constants`` defines all the parameters, ``use decomp_2d_mpi`` introduces all the MPI 
+related interfaces and ``use decomp_2d`` contains the main decomposition and transposition APIs. The library is initialized using
+```
+  call decomp_2d_init(nx, ny, nz, p_row, p_col)
+```
+where ``nx``, ``ny`` and ``nz`` are the spatial dimensions of the problem, to be distributed over
+a 2D processor grid :math:`p_row \times p_col`.
+Note that none of the dimensions need to be divisible by ``p_row`` or ``p_col`` however a load imbalance will occur if not.
+In case of ``p_row=p_col=0`` an automatic decomposition is selected among all possible combination available.
+A key element of this library is a set of communication routines that actually perform the data transpositions.
+As mentioned, one needs to perform 4 global transpositions to go through all 3 pencil orientations
+(i.e. one has to go from x-pencils to y-pencils to z-pencils to y-pencils to x-pencils)
+Correspondingly, the library provides 4 communication subroutines:
+```
+  call transpose_x_to_y(var_in,var_out)
+  call transpose_y_to_z(var_in,var_out)
+  call transpose_z_to_y(var_in,var_out)
+  call transpose_y_to_x(var_in,var_out)
+```
+The input array ``var_in`` and output array ``var_out`` are defined by the code using the library
+and contain distributed data for the correct pencil orientations.
+
+Note that the library is written using Fortran's generic interface so different data types are supported
+without user input. That means in and out above can be either real or complex arrays,
+the latter being useful for applications involving 3D Fast Fourier Transforms.
+Finally, before exit, applications should clean up the memory by:
+```
+  call decomp_2d_finalize
+```
+Detailed information about the decomposition API are available [here](https://2decomp-fft.github.io/pages/api_domain.html) 
+### Use of the FFT module
+To use the FFT programming interface, first of all, one additional Fortran module is needed:
+```
+  use decomp_2d_fft
+```
+Then one needs to initialise the FFT interface using
+```
+  call decomp_2d_fft_init(pencil, n1, n2, n3)
+```
+where ``pencil=PHYSICAL_IN_X`` or ``PHYSICAL_IN_Z`` and ``n1, n2, n3`` is an arbitrary problem size
+that can be different from :math:`nx\times ny\times nz`.
+For complex-to-complex (c2c) FFTs, the user interface is:
+```
+  call decomp_2d_fft_3d(input, output, direction)
+```
+where ``direction`` can be either ``DECOMP_2D_FFT_FORWARD == -1`` for forward transforms, 
+or ``DECOMP_2D_FFT_BACKWARD == 1`` for backward transforms.
+We recommend using the ``DECOMP_2D_FFT_XXX`` variables, rather than literal ``1`` or ``-1``,
+to avoid potential issues if these values change in future versions.
+The input array (``input``) and the output one (``output``) are both complex
+and have to be either a X-pencil/Z-pencil combination or vice-versa.
+The interface for real-to-complex and complex-to-real transform is
+```
+  call decomp_2d_fft_3d(input, output)
+```
+If the ``input`` data are real type a forward transform is assumed obtaining a complex ``output``.
+Similarly a backward FFT is computed if ``input`` is a complex array and ``output`` a real array.
+Finally, to release the memory used by the FFT interface:
+```
+  call decomp_2d_fft_finalize
+```
+Detailed information about the FFT API are available [here](https://2decomp-fft.github.io/pages/api_fft.html) 
+### Use of the IO module
+All the I/O functions have been packed in a Fortran module:
+```
+  use decomp_2d_io
+```
+To write a single three-dimensional array to a file, the following call to a subroutine can be used:
+```
+  call decomp_2d_write_one(ipencil,var,directory,filename,icoarse,io_name)
+```
+where ``ipencil`` describes how the data is distributed (valid values are: 1 for X-pencil; 2 for
+Y-pencil and 3 for Z-pencil); ``var`` is the data array to be written on disk, which can be either real or
+complex; ``directory`` is the path to where I/O should be written; ``filename`` is the name of the
+file to be written; ``icoarse`` indicates whether the I/O should be coarsened (valid values are: 0
+for no; 1 for the ``nstat`` and 2 for the ``nvisu`` coarsenings); ``io_name`` is the name of the I/O
+group to be used. When using ADIOS2 write operations are deferred by default, this means that before the
+end of the step the data stored in ``var`` may not have been written. Overwriting ``var``, for example
+when used as a temporary variable, would cause the output data to become corrupted. In such situations
+``decomp_2d_write_one`` accepts an optional argument ``opt_deferred_writes`` (default ``.true.``) which
+when set to ``.false.`` causes the data to be flushed immediately.
+The last argument ``io_name`` is a string used to group I/O operations together, and for the ADIOS2 backend
+allows for the runtime control of I/O through the file ``adios2_config.xml``, there must be a matching IO
+handle to specify the I/O engine - see the examples under ``examples/io_test/`` for how this works.
+
+There are two ways of writing multiple variables to a single file which may
+be used for check-pointing purposes, for example. The newer interface is described first and allows
+codes to use the ADIOS2 and MPI-IO backends, the original interface is supported for backwards
+compatibility.
+
+When ``decomp_2d_write_one`` is called, the ``directory`` and ``io_name`` are combined to check
+whether a particular output location is already opened, if not then a new file will be opened and
+written to - this is the "standard" use.  If, however, a file is opened first then the call to
+``decomp_2d_write_one`` will append to the current file, resulting in a single file with multiple
+fields.  Once the check-pointing is complete the file can then be closed.
+
+The original interface for writing multiple variables to a file, only
+supported by the MPI-IO backend, takes the following form:
+```
+   call decomp_2d_write_var(fh,disp,ipencil,var)
+```
+where ``fh`` is a MPI-IO file handle provided by the application (file opened using ``MPI_FILE_OPEN``);
+``ipencil`` describes the distribution of the input data (valid values are: 1 for X-pencil; 2 for
+Y-pencil and 3 for Z-pencil); ``disp`` (meaning displacement) is a variable of ``kind MPI_OFFSET_KIND``
+and of ``intent INOUT``. 
+Detailed information about the IO API are available [here](https://2decomp-fft.github.io/pages/api_io.html) 
+#### Initialising the IO module
+Before you can perform I/O you must initialise the IO module by calling
+```
+call decomp_2d_io_init()
+```
+somewhere near the start of the program (and before beginning any I/O operations).
+The IO module supports multiple handles (see above discussion of ``io_name``), these are initialised by calling
+```
+call decomp_2d_init_io(io_name)
+...
+```
+#### Registering variables for ADIOS2
+The ADIOS2 backend needs information about the variables it is going to write, during initialisation of I/O operations
+each variable must be registered by calling
+```
+call decomp_2d_register_variable(io_name,filename,ipencil,icoarse,iplanes,kind) 
+```
+where the arguments are described as above.
+The argument ``iplanes`` is used to declare writing planes from a field - pass ``0`` for 3-D field output and the ``kind``
+argument is the argument used when declaring the variable, e.g. ``real(kind)``.
+### Use of the halo exchange
+The halo-cell support API provides data structures and nearest-neighbour communication routines 
+that support explicit message passing between neighbouring pencils. 
+```
+  call update_halo(var, var_halo, level)
+```
+Here the first parameter ``var``, a 3D input array, contains the normal pencil-distributed data as defined by the decomposition library. 
+After the subroutine call, the second parameter ``var_halo`` returns all original data plus halo data from the neighbouring processes.
+The third parameter level defines how many layers of overlapping are required. 
+Detailed information about the halo module are available [here](https://2decomp-fft.github.io/pages/api_halo.html) 
diff --git a/INSTALL.md b/INSTALL.md
new file mode 100644
index 00000000..70af86f5
--- /dev/null
+++ b/INSTALL.md
@@ -0,0 +1,338 @@
+# Building, installing and linking 2DECOMP&FFT
+
+The library 2decomp is a Fortran library compatible with the Fortran 2008 standard.
+It requires a MPI library compatible with MPI-2.0 with extended Fortran support.
+The following [optional libraries](#optional-dependencies) can be used :
+
+- ADIOS2, version 2.9.0 was tested
+- FFTW3, version 3.3.10 was tested
+- Intel oneMKL (oneAPI Math Kernel Library), version 2023.0.0 was tested
+- Nvidia GPU-related libraries, NVHPC version 22.7 and CUDA version 11.8 were tested
+- Caliper, version 2.9.1 was tested
+
+## Building
+
+The build system is driven by `cmake`. It is good practice to directly point to the 
+MPI Fortran wrapper that you would like to use to guarantee consistency between Fortran compiler and MPI. 
+This can be done by setting the default Fortran environmental variable 
+```
+$ export FC=my_mpif90
+```
+To generate the build system run
+```
+$ cmake -S $path_to_sources -B $path_to_build_directory -DOPTION1 -DOPTION2 ...
+```
+If the directory does not exist it will be generated and it will contain the configuration files. 
+The configuration can be further
+edited by using the `ccmake` utility as
+```
+$ ccmake $path_to_build_directory
+```
+and editing as desired, variables that are likely of interest are: `CMAKE_BUILD_TYPE` and `FFT_Choice`;
+additional variables can be shown by entering "advanced mode" by pressing `t`.
+By default a `RELEASE` build will built, other options for `CMAKE_BUILD_TYPE` are `DEBUG` and `DEV` which
+turn on debugging flags and additionally try to catch coding errors at compile time, respectively.
+The behaviour of debug and development versions of the library can be changed before the
+initialization using the variable ``decomp_debug`` or the environment variable ``DECOMP_2D_DEBUG``.
+The value provided with the environment variable must be a positive integer below 9999.
+
+Two `BUILD_TARGETS` are available namely `mpi` and `gpu`.  For the `mpi` target no additional 
+options should be required. whereas for `gpu` extra options are necessary at the configure stage. 
+Please see section [GPU Compilation](#gpu-compilation)
+
+Once the build system has been configured, you can build 2DECOMP&FFT` by running
+```
+$ cmake --build $path_to_build_directory -j <nproc>
+```
+appending `-v` will display additional information about the build, such as compiler flags.
+
+After building the library can be tested. Please see section [Testing and examples](#testing-and-examples)
+
+Options can be added to change the level of verbosity. Finally, the build library can be installed by running 
+```
+$ cmake --install $path_to_build_directory
+```
+The default location for `libdecomp2d.a` is `$path_to_build_directory/opt/lib` 
+or `$path_to_build_directory/opt/lib64` unless the variable `CMAKE_INSTALL_PREFIX` is modified.
+The module files generated by the build process will similarly be installed to 
+`$path_to_build_directory/opt/install`, 
+users of the library should add this to the include paths for their program.
+
+As indicated above, by default a static `libdecomp2d.a` will be compiled, 
+if desired a shared library can be built by setting `BUILD_SHARED_LIBS=ON` either on the command line:
+```
+$ cmake -S $path_to_sources -B $path_to_build_directory -DBUILD_SHARED_LIBS=ON
+```
+or by editing the configuration using `ccmake`.
+This might be useful for a centralised install supporting multiple users that is upgraded over time.
+
+Occasionally a clean build is required, this can be performed by running
+```
+$ cmake --build $path_to_build_directory --target clean
+```
+
+## GPU compilation
+
+The library can perform multi GPU offoloading using the NVHPC compiler suite for NVIDIA hardware. 
+The implementation is based on CUDA-aware MPI and NVIDIA Collective Communication Library (NCCL).
+The FFT is based on cuFFT. 
+
+To properly configure for GPU build the following needs to be used 
+```
+$ cmake -S $path_to_sources -B $path_to_build_directory -DBUILD_TARGET=gpu
+```
+Note, further configuration can be performed using `ccmake`, however the initial configuration of GPU builds must include the `-DBUILD_TARGET=gpu` flag as shown above.
+
+By default CUDA aware MPI will be used together with `cuFFT` for the FFT library. The configure will automatically look for the GPU architecture available on the system. If you are building on a HPC system please use a computing node for the installation. Useful variables to be added are 
+
+ - `-DENABLE_NCCL=yes` to activate the NCCL collectives
+ - `-DENABLE_MANAGED=yes` to activate the automatic memory management form the NVHPC compiler
+If you are getting the following error
+```
+-- The CUDA compiler identification is unknown  
+CMake Error at /usr/share/cmake/Modules/CMakeDetermineCUDACompiler.cmake:633 (message):  
+Failed to detect a default CUDA architecture. 
+```
+It is possible that your default C compiler is too recent and not supported by `nvcc` . You might be able to solve the issue by adding 
+ - `-DCMAKE_CUDA_HOST_COMPILER=$supported_gcc`
+ 
+ At the moment the supported CUDA host compilers are `gcc11` and earlier. 
+
+## Linking from external codes
+
+### Codes using Makefiles
+
+When building a code that links 2DECOMP&FFT using a Makefile you will need to add the include and link paths as appropriate (`inlude/` and `lib/` under the installation directory, respectively).
+```
+DECOMP_ROOT = /path/to/2decomp-fft
+DECOMP_BUILD_DIR = $(DECOMP_ROOT)/build
+DECOMP_INSTALL_DIR ?= $(DECOMP_BUILD_DIR)/opt # Use default unless set by user
+
+INC += -I$(DECOMP_INSTALL_DIR)/include
+
+# Users build/link targets
+LIBS = -L$(DECOMP_INSTALL_DIR)/lib64 -L$(DECOMP_INSTALL_DIR)/lib -ldecomp2d
+
+OBJ = my_exec.o
+
+my_exec: $(OBJ)
+	$(F90) -o $@ $(OBJ) $(LIBS)
+
+```
+In case 2DECOMP&FFT has been compiled with an external FFT, such as FFTW3, `LIBS` 
+should also contain the following 
+```
+FFTW3_PATH=/my_path_to_FFTW/lib
+LIBFFT=-L$(FFTW3_PATH) -lfftw3 -lfftw3f
+LIBS += $(LIBFFT)
+```
+In case of 2DECOMP&FFT compiled for GPU with NVHPC, linking against cuFFT is mandatory 
+```
+LIBS += -cudalib=cufft
+```
+In case of NCCL the following is required 
+```
+LIBS += -cudalib=cufft,nccl 
+```
+It is also possible to drive the build and installation of 2decomp-fft from a Makefile such as in the following example code
+```
+FC = mpif90
+BUILD = Release
+
+DECOMP_ROOT = /path/to/2decomp-fft
+DECOMP_BUILD_DIR = $(DECOMP_ROOT)/build
+DECOMP_INSTALL_DIR ?= $(DECOMP_BUILD_DIR)/opt # Use default unless set by user
+
+INC += -I$(DECOMP_INSTALL_DIR)/include
+
+# Users build/link targets
+LIBS = -L$(DECOMP_INSTALL_DIR)/lib64 -L$(DECOMP_INSTALL_DIR)/lib -ldecomp2d
+
+# Building libdecomp.a
+$(DECOMP_INSTALL_DIR)/lib/libdecomp.a:
+	FC=$(FC) cmake -S $(DECOMP_ROOT) -B $(DECOMP_BUILD_DIR) -DCMAKE_BUILD_TYPE=$(BUILD) -DCMAKE_INSTALL_PREFIX=$(DECOMP_INSTALL_DIR)
+	cmake --build $(DECOMP_BUILD_DIR) --target decomp2d
+	cmake --build $(DECOMP_BUILD_DIR) --target install
+
+# Clean libdecomp.a
+clean-decomp:
+	cmake --build $(DECOMP_BUILD_DIR) --target clean
+	rm -f $(DECOMP_INSTALL_DIR)/lib/libdecomp.a
+```
+
+## Profiling
+
+Profiling can be activated via `cmake` configuration, 
+the recommended approach is to run the initial configuration as follows:
+```
+$ export caliper_DIR=/path/to/caliper/install/share/cmake/caliper
+$ export CXX=mpicxx
+$ cmake -S $path_to_sources -B $path_to_build_directory -DENABLE_PROFILER=caliper
+```
+where `ENABLE_PROFILER` is set to the profiling tool desired, currently supported values are: `caliper`.
+Note that when using `caliper` a C++ compiler is required as indicated in the above command line.
+
+## Miscellaneous
+
+### Compiling with Intel oneAPI
+
+In order to compile with the MKL libraries the following environmental variable needs to be set up
+```
+$ export MKL_DIR=${MKLROOT}/lib/cmake/mkl
+```
+and select the MKL backend by setting `FFT_Choice=mkl`.
+
+To use the new IntelLLVM compiler, up until the 2023 version, specify it as the Fortran compiler using
+```
+export export FC="mpiifort -fc=ifx"
+```
+and when building with ADIOS2 support you must also specify the `C` and `CXX` compilers
+```
+export CXX="mpiicpc -cxx=icpx"
+export CC="mpiicc -cc=icx"
+```
+From the 2024 version new MPI wrapper are available as `mpiifx`, `mpiicx` and `mpiicpx`. 
+
+### List of preprocessor variables
+
+#### ADIOS2
+
+This variable is automatically added in builds with the adios2 IO backend.
+
+#### DEBUG
+
+This variable is automatically added in debug and dev builds. Extra information is printed when it is present.
+
+#### DOUBLE_PREC
+
+When this variable is not present, the library uses single precision. When it is present, the library uses double precision. This preprocessor variable is driven by the CMake on/off variable `DOUBLE_PRECISION`.
+
+#### SAVE_SINGLE
+
+This variable is valid for double precision builds only. When it is present, snapshots are written in single precision. This preprocessor variable is driven by the CMake on/off variable `SINGLE_PRECISION_OUTPUT`.
+
+#### PROFILER
+
+This variable is automatically added when selecting the profiler. It activates the profiling sections of the code.
+
+#### EVEN
+
+This preprocessor variable is not valid for GPU builds. It leads to padded alltoall operations. This preprocessor variable is driven by the CMake on/off variable `EVEN`.
+
+#### OVERWRITE
+
+This variable leads to overwrite the input array when computing FFT. The support of this flag does not always correspond to in-place transforms, depending on the FFT backend selected, as described above. This preprocessor variable is driven by the CMake on/off variable `ENABLE_INPLACE`.
+
+#### HALO_DEBUG
+
+This variable is used to debug the halo operations. This preprocessor variable is driven by the CMake on/off variable `HALO_DEBUG`.
+
+#### HALO_GLOBAL
+
+This variable is used in the example used to test the halo operations. If the variable is defined, the test are using arrays defined with the key `opt_global` set to `.true.`. Otherwise, the arrays are defined with the key set to `.false.`.
+
+#### _GPU
+
+This variable is automatically added in GPU builds.
+
+#### _NCCL
+
+This variable is valid only for GPU builds. The NVIDIA Collective Communication Library (NCCL) implements multi-GPU and multi-node communication primitives optimized for NVIDIA GPUs and Networking.
+
+## Optional dependencies
+
+### ADIOS2
+
+The library [adios2](https://adios2.readthedocs.io/en/latest/) can be used as a backend for IO. The version 2.9.0 was tested, is supported and can be downloaded [here](https://github.com/ornladios/ADIOS2/archive/refs/tags/v2.9.0.tar.gz). Below are build instructions for the library. However, it is recommended to use the one provided by the administrators of the computing centre if available.
+
+```
+$ wget https://github.com/ornladios/ADIOS2/archive/refs/tags/v2.9.0.tar.gz
+$ tar xzf v2.9.0.tar.gz
+$ mkdir 2.9.0_tmp && cd 2.9.0_tmp
+$ CC=mpicc CXX=mpicxx FC=mpif90 cmake -S ../ADIOS2-2.9.0 -DCMAKE_INSTALL_PREFIX=../2.9.0_bld
+$ make -j
+$ make -j test
+$ make -j install
+```
+
+To build 2DECOMP&FFT with the adios2 IO backend, one can provide the package configuration for adios2 
+in the `PKG_CONFIG_PATH` environment variable, 
+this should be found under `/path/to/adios2/install/lib/cmake/adios2`. 
+One can also provide the option `-Dadios2_DIR=/path/to/adios2/install/lib/cmake/adios2`. 
+Then either specify on the command line when configuring the build
+```
+$ cmake -S . -B ./build -DIO_BACKEND=adios2 -Dadios2_DIR=/path/to/adios2/install/lib/cmake/adios2
+```
+or modify the build configuration using `ccmake`. Please note that the support for ADIOS2 is not complete. Currently, for a given IO operation, when the ADIOS2 backend is not supported, the MPI backend is used.
+
+### FFTW
+
+The library [fftw](http://www.fftw.org/index.html) can be used as a backend for the FFT engine. 
+The version 3.3.10 was tested, is supported and can be downloaded 
+[here](http://www.fftw.org/download.html). 
+Please note that one should build fftw and decomp2d against the same compilers. 
+For build instructions, please check [here](http://www.fftw.org/fftw3_doc/Installation-on-Unix.html). 
+Below is a suggestion for the compilation of the library in double precision 
+(add `--enable-single` for a single precision build):
+
+```
+$ wget http://www.fftw.org/fftw-3.3.10.tar.gz
+$ tar xzf fftw-3.3.10.tar.gz
+$ mkdir fftw-3.3.10_tmp && cd fftw-3.3.10_tmp
+$ ../fftw-3.3.10/configure --prefix=xxxxxxx/fftw3/fftw-3.3.10_bld --enable-shared
+$ make -j
+$ make -j check
+$ make install
+```
+Please note that the resulting build is not compatible with `cmake` 
+(https://github.com/FFTW/fftw3/issues/130). 
+As a workaround, one can open the file 
+`/path/to/fftw3/install/lib/cmake/fftw3/FFTW3Config.cmake` 
+and comment the line
+```
+include ("${CMAKE_CURRENT_LIST_DIR}/FFTW3LibraryDepends.cmake")
+```
+
+To build `2decomp&fft` against fftw3, one can provide the package configuration for fftw3 
+in the `PKG_CONFIG_PATH` environment variable, 
+this should be found under `/path/to/fftw3/install/lib/pkgconfig`. 
+One can also provide the option `-DFFTW_ROOT=/path/to/fftw3/install`. 
+Then either specify on the command line when configuring the build
+```
+$ cmake -S . -B build -DFFT_Choice=<fftw|fftw_f03> -DFFTW_ROOT=/path/to/fftw3/install
+```
+or modify the build configuration using `ccmake`.
+
+Note the legacy `fftw` interface lacks interface definitions 
+and will fail when stricter compilation flags are used (e.g. when `-DCMAKE_BUILD_TYPE=Dev`) 
+for this it is recommended to use `fftw_f03` which provides proper interfaces.
+
+### Caliper
+
+The library [caliper](https://github.com/LLNL/Caliper) can be used to profile the execution of the code. 
+The version 2.9.1 was tested and is supported, version 2.8.0 has also been tested 
+and is still expected to work. 
+Please note that one must build caliper and decomp2d against the same C/C++/Fortran 
+compilers and MPI libray. 
+For build instructions, please check 
+[here](https://github.com/LLNL/Caliper#building-and-installing) 
+and [here](https://software.llnl.gov/Caliper/CaliperBasics.html#build-and-install). 
+Below is a suggestion for the compilation of the library using the GNU compilers:
+
+```
+$ git clone https://github.com/LLNL/Caliper.git caliper_github
+$ cd caliper_github
+$ git checkout v2.9.1
+$ mkdir build && cd build
+$ cmake -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DCMAKE_Fortran_COMPILER=gfortran -DCMAKE_INSTALL_PREFIX=../../caliper_build_2.9.1 -DWITH_FORTRAN=yes -DWITH_MPI=yes -DBUILD_TESTING=yes ../
+$ make -j
+$ make test
+$ make install
+```
+
+After installing Caliper ensure to set `caliper_DIR=/path/to/caliper/install/share/cmake/caliper`.
+Following this the `2decomp-fft` build can be configured to use Caliper profiling as
+```
+$ cmake -S . -B -DENABLE_PROFILER=caliper
+```
+or by modifying the configuration to set `ENABLE_PROFILER=caliper` via `ccmake`.
diff --git a/LICENSE b/LICENSE
index 0a58604f..17de2d2a 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
 BSD 3-Clause License
 
-Copyright (c) 2011-2021, The Numerical Algorithms Group (NAG)
+Copyright (c) 2011-2021, Ning Li, The Numerical Algorithms Group (NAG)
 Copyright (c) 2022-    , Xcompact3d
 All rights reserved.
 
diff --git a/Makefile b/Makefile
deleted file mode 100644
index 29817754..00000000
--- a/Makefile
+++ /dev/null
@@ -1,162 +0,0 @@
-#=======================================================================
-# Makefile for 2DECOMP&FFT
-#=======================================================================
-# Choose pre-processing options
-#   -DDOUBLE_PREC - use double-precision
-#   -DSAVE_SINGLE - Save 3D data in single-precision
-#   -DDEBG        - debugging
-# generate a Git version string
-GIT_VERSION := $(shell git describe --tag --long --always)
-
-DEFS = -DDOUBLE_PREC -DVERSION=\"$(GIT_VERSION)\"
-
-LCL = local# local,lad,sdu,archer
-CMP = gcc# intel,gcc,nagfor,cray,nvhpc
-FFT ?= generic# fftw3,fftw3_f03,generic,mkl,cufft
-PARAMOD = mpi # multicore,gpu
-PROFILER ?= none# none, caliper
-
-BUILD ?= # debug can be used with gcc
-FCFLAGS ?= # user can set default compiler flags
-LDFLAGS ?= # user can set default linker flags
-FFLAGS = $(FCFLAGS)
-LFLAGS = $(LDFLAGS)
-MODFLAG = -J 
-
-LIBDECOMP = decomp2d
-
-AR = ar
-LIBOPT = rcs
-
-#######CMP settings###########
-CMPINC = Makefile.compilers
-include $(CMPINC)
-
-### List of files for the main code
-SRCDECOMP = factor.f90 decomp_2d.f90 log.f90 io.f90
-
-#######FFT settings##########
-ifeq ($(FFT),fftw3)
-  FFTW3_PATH ?= /usr
-  FFTW3_PATH_INCLUDE ?= $(FFTW3_PATH)/include
-  FFTW3_PATH_LIB ?= $(FFTW3_PATH)/lib
-  INC=-I$(FFTW3_PATH_INCLUDE)
-  LIBFFT=-L$(FFTW3_PATH_LIB) -lfftw3 -lfftw3f
-else ifeq ($(FFT),fftw3_f03)
-  FFTW3_PATH ?= /usr
-  FFTW3_PATH_INCLUDE ?= $(FFTW3_PATH)/include
-  FFTW3_PATH_LIB ?= $(FFTW3_PATH)/lib
-  INC=-I$(FFTW3_PATH_INCLUDE)
-  LIBFFT=-L$(FFTW3_PATH_LIB) -lfftw3 -lfftw3f
-else ifeq ($(FFT),generic)
-  SRCDECOMP += ./glassman.f90
-  INC=
-  LIBFFT=
-else ifeq ($(FFT),mkl)
-  SRCDECOMP += $(MKLROOT)/include/mkl_dfti.f90
-  LIBFFT=-Wl,--start-group $(MKLROOT)/lib/intel64/libmkl_intel_lp64.a $(MKLROOT)/lib/intel64/libmkl_sequential.a $(MKLROOT)/lib/intel64/libmkl_core.a -Wl,--end-group -lpthread
-  INC=-I$(MKLROOT)/include
-else ifeq ($(FFT),cufft)
-  CUFFT_PATH ?= $(NVHPC)/Linux_x86_64/$(EBVERSIONNVHPC)/compilers
-  INC=-I$(CUFFT_PATH)/include
-  #LIBFFT=-L$(CUFFT_PATH)/lib64 -Mcudalib=cufft 
-endif
-
-### IO Options ###
-LIBIO :=
-OPTIO :=
-INCIO :=
-ADIOS2DIR :=
-ifeq ($(IO),adios2)
-  ifeq ($(ADIOS2DIR),)
-    $(error Set ADIOS2DIR=/path/to/adios2/install/)
-  endif
-  OPTIO := -DADIOS2 $(OPT)
-  INCIO := $(INC) $(shell $(ADIOS2DIR)/bin/adios2-config --fortran-flags) #$(patsubst $(shell $(ADIOS2DIR)/bin/adios2-config --fortran-libs),,$(shell $(ADIOS2DIR)/bin/adios2-config -f))
-  LIBIO := $(shell $(ADIOS2DIR)/bin/adios2-config --fortran-libs)
-endif
-
-### Add the profiler if needed
-ifneq ($(PROFILER),none)
-  DEFS += -DPROFILER
-endif
-ifeq ($(PROFILER),caliper)
-  CALIPER_PATH ?= xxxxxxxxx/caliper/caliper_2.8.0
-  SRCDECOMP := $(SRCDECOMP) profiler_caliper.f90
-  INC := $(INC) -I$(CALIPER_PATH)/include/caliper/fortran
-  LFLAGS := $(LFLAGS) -L$(CALIPER_PATH)/lib -lcaliper
-endif
-
-#######OPTIONS settings###########
-OPT =
-LINKOPT = $(FFLAGS)
-#-----------------------------------------------------------------------
-# Normally no need to change anything below
-
-OBJDIR = obj
-SRCDIR = src
-DECOMPINC = mod
-FFLAGS += $(MODFLAG)$(DECOMPINC) -I$(DECOMPINC)
-
-SRCDECOMP := $(SRCDECOMP) fft_$(FFT).f90
-SRCDECOMP_ = $(patsubst %.f90,$(SRCDIR)/%.f90,$(filter-out %/mkl_dfti.f90,$(SRCDECOMP)))
-SRCDECOMP_ += $(filter %/mkl_dfti.f90,$(SRCDECOMP))
-OBJDECOMP_MKL_ = $(patsubst $(MKLROOT)/include/%.f90,$(OBJDIR)/%.f90,$(filter %/mkl_dfti.f90,$(SRCDECOMP_)))
-OBJDECOMP_MKL = $(OBJDECOMP_MKL_:%.f90=%.o)
-OBJDECOMP = $(SRCDECOMP_:$(SRCDIR)/%.f90=$(OBJDIR)/%.o)
-
-OPT += $(OPTIO)
-INC += $(INCIO)
-
--include Makefile.settings
-
-all: $(DECOMPINC) $(OBJDIR) $(LIBDECOMP)
-
-$(DECOMPINC):
-	mkdir $(DECOMPINC)
-
-$(LIBDECOMP) : Makefile.settings lib$(LIBDECOMP).a
-
-lib$(LIBDECOMP).a: $(OBJDECOMP_MKL) $(OBJDECOMP)
-	$(AR) $(LIBOPT) $@ $^
-
-$(OBJDIR):
-	mkdir $(OBJDIR)
-
-$(OBJDECOMP) : $(OBJDIR)/%.o : $(SRCDIR)/%.f90
-	$(FC) $(FFLAGS) $(OPT) $(DEFS) $(INC) -c $< -o $@
-
-$(OBJDECOMP_MKL) : $(OBJDIR)/%.o : $(MKLROOT)/include/%.f90
-	$(FC) $(FFLAGS) $(OPT) $(DEFS) $(INC) -c $(MKLROOT)/include/mkl_dfti.f90 -o $(OBJDIR)/mkl_dfti.o
-
-examples: $(LIBDECOMP)
-	$(MAKE) -C examples
-
-.PHONY: check
-
-check: examples
-	$(MAKE) -C examples $@
-
-.PHONY: clean
-
-clean: clean-examples
-	rm -f $(OBJDIR)/*.o $(DECOMPINC)/*.mod $(DECOMPINC)/*.smod lib$(LIBDECOMP).a
-	rm -f ./*.o ./*.mod ./*.smod # Ensure old files are removed
-	rm -f Makefile.settings
-
-clean-examples:
-	$(MAKE) -C examples clean
-
-.PHONY: Makefile.settings
-
-Makefile.settings:
-	echo "FC = $(FC)" > $@
-	echo "FFLAGS = $(FFLAGS)" >> $@
-	echo "OPT = $(OPT)" >> $@
-	echo "DEFS = $(DEFS)" >> $@
-	echo "INC = $(INC)" >> $@
-	echo "LIBOPT = $(LIBOPT)" >> $@
-	echo "LIBFFT = ${LIBFFT}" >> $@
-	echo "LFLAGS = $(LFLAGS)" >> $@
-
-export
diff --git a/Makefile.compilers b/Makefile.compilers
deleted file mode 100644
index 7d0a2bda..00000000
--- a/Makefile.compilers
+++ /dev/null
@@ -1,118 +0,0 @@
-#=======================================================================
-# Makefile for 2DECOMP&FFT compilers
-#=======================================================================
-
-DEBUG_BUILD =
-ifeq ($(BUILD),debug)
-  DEBUG_BUILD = yes
-endif
-ifeq ($(BUILD),dev)
-  DEBUG_BUILD = yes
-endif
-
-ifeq ($(CMP),intel)
-  FC = mpiifort
-
-  FFLAGS += -fpp -std08
-  ifeq ($(DEBUG_BUILD),yes)
-    DEFS += -DDEBUG
-    FFLAGS += -g -O0 -debug extended -traceback
-  else
-    FFLAGS += -O3 -mavx2 -march=core-avx2 -mtune=core-avx2
-  endif
-
-  ifeq ($(BUILD),dev)
-    # Add additional, stricter flags
-    FFLAGS += -warn all,noexternal
-  endif
-
-  MODFLAG = -module # Explicit space
-else ifeq ($(CMP),gcc)
-  FC = mpif90
-  FFLAGS += -cpp -std=f2008
-  ifeq "$(shell expr `gfortran -dumpversion | cut -f1 -d.` \>= 10)" "1"
-    FFLAGS += -fallow-argument-mismatch
-  endif
-  ifeq ($(DEBUG_BUILD),yes)
-    DEFS += -DDEBUG
-    FFLAGS += -g3 -Og
-    FFLAGS += -ffpe-trap=invalid,zero -fcheck=all -fimplicit-none
-  else
-    FFLAGS += -O3 -march=native
-  endif
-
-  ifeq ($(findstring openmpi,$(shell $(FC) -show | grep openmpi)),openmpi)
-    MPI_FAMILY = openmpi
-  else
-    MPI_FAMILY = mpich # Probably
-  endif
-
-  ifeq ($(BUILD),dev)
-    # Add additional, stricter flags
-    FFLAGS += -Wall -Wpedantic
-    ifneq ($(FFT),fftw3)
-      # FFTW3 (legacy) doesn't have interfaces
-      ifeq ($(MPI_FAMILY),openmpi)
-        # MPICH doesn't export interfaces...
-        FFLAGS += -Wimplicit-procedure -Wimplicit-interface
-      endif
-    endif
-    FFLAGS += -Wno-unused-function
-    FFLAGS += -Werror -Wno-integer-division
-  endif
-
-else ifeq ($(CMP),nagfor)
-  FC = mpinagfor
-  FFLAGS += -fpp
-else ifeq ($(CMP),cray)
-  FC = ftn
-  FFLAGS += -eF
-  ifeq ($(DEBUG_BUILD),yes)
-    DEFS += -DDEBUG
-    FFLAGS += -G2
-    FFLAGS += -en
-  else
-    FFLAGS += -g -O3
-  endif
-  FFLAGS += -N1023 -M 296
-else ifeq ($(CMP),nvhpc)
-  FC = mpif90
-  MODFLAG = -module # Explicit space
-  FFLAGS += -cpp
-  ifeq ($(PARAMOD),multicore)
-     FFLAGS += -O3 -Minfo=accel -stdpar -acc -target=multicore
-     LFLAGS += -acc -lnvhpcwrapnvtx
-  else ifeq ($(PARAMOD),gpu)
-     CCXY=80
-     NCCL=no     
-     FFLAGS += -D_GPU
-     ifeq ($(NCCL),yes)
-       FFLAGS += -D_NCCL
-     endif
-     FFLAGS += -Mfree -Kieee -Minfo=accel,stdpar -stdpar=gpu -gpu=cc${CCXY},managed,lineinfo -acc -target=gpu -traceback -O3 -DUSE_CUDA -cuda 
-     ifeq ($(NCCL),yes)
-       FFLAGS += -cudalib=cufft,nccl
-     else
-       FFLAGS += -cudalib=cufft
-     endif
-     #FFLAGS += -D_GPU -Mfree -Kieee -Minfo=accel,stdpar -stdpar=gpu -gpu=cc80,managed,lineinfo -acc -target=gpu -traceback -O3 -DUSE_CUDA -cuda -cudalib=cufft
-     #FFLAGS += -Mfree -Kieee -Minfo=accel,stdpar -stdpar=gpu -gpu=cc80,managed,lineinfo -acc -target=gpu -traceback -O3 -DUSE_CUDA -cuda -cudalib=cufft
-     LFLAGS += -acc -lnvhpcwrapnvtx
-  else
-    ifeq ($(DEBUG_BUILD),yes)
-      DEFS += -DDEBUG
-      FFLAGS += -g -O0
-      FFLAGS += -traceback
-      FFLAGS += -Mbounds -Mchkptr
-      FFLAGS += -Ktrap=fp # Trap floating-point errors
-    else
-      FFLAGS += -O3 -march=native
-    endif
-  endif
-  #FFLAGS += -cpp -O3 -Minfo=accel -stdpar -acc -target=multicore
-  #FFLAGS = -cpp -Mfree -Kieee -Minfo=accel -g -acc -target=gpu -fast -O3 -Minstrument
-endif
-
-# Local Variables:
-# mode: makefile
-# End:
diff --git a/README.md b/README.md
index 43a9115a..5e1210c4 100644
--- a/README.md
+++ b/README.md
@@ -1,96 +1,160 @@
-# 2decomp-fft
+# 2DECOMP&FFT
+
+This README contains basic instructions for building and installing the 2DECOMP&FFT library, more
+detailed instructions about installation and linking to the library within an external project 
+can be found in the [install section](INSTALL.md).
+Please have a look at [HOWTO.md](HOWTO.md) and at the examples [examples](examples/README.md) for how to use the library with your application
 
 ## Building
 
-Different compilers can be set by specifying `CMP`, e.g. `make CMP=intel`
-to build with Intel compilers, see `Makefile` for options.
+The build system is driven by `cmake`. It is good practice to directly point to the MPI Fortran wrapper that you would like to use to guarantee consistency between Fortran compiler and MPI. This can be done by setting the default Fortran environmental variable 
+```
+$ export FC=my_mpif90
+```
+The build system can then be generated by running
+```
+$ cmake -S $path_to_sources -B $path_to_build_directory -DOPTION1 -DOPTION2 ...
+```
+for many users a configuration line
+```
+$ cmake -S . -B build
+```
+run from the 2DECOMP&FFT root directory will be sufficient.
+If the build directory does not exist it will be generated and it will contain the configuration files.
+By default a ``RELEASE`` build will built for CPU using MPI and the ``generic`` FFT backend included 
+with 2DECOMP&FFT, please see [INSTALL.md](INSTALL.md) for instructions on changing the build, including debugging builds, building for GPUs and selecting external FFT libraries.
 
-By default an optimised library will be built, debugging versions of the
-library can be built with `make BUILD=debug`, a development version which 
-additionally sets compile time flags to catch coding errors can be built 
-with `make BUILD=dev` (GNU compilers only currently). The behavior of debug
-and development versions of the library can be changed before the initialization
-using the variable ``decomp_debug`` or the environment variable ``DECOMP_2D_DEBUG``.
-The value provided with the environment variable must be a positive integer below 9999.
+Once the build system has been configured, you can build 2DECOMP&FFT by running
+```
+$ cmake --build $path_to_build_directory -j <nproc>
+```
+appending `-v` will display additional information about the build, such as compiler flags.
 
-On each build of the library (`make`, `make all`) a temporary file `Makefile.settings` with
-all current options (`FFLAGS`, `DEFS`, etc.) will be created, and included
-on subsequent invocations, the user therefore does not need to keep
-specifying options between builds.
+After building the library can be tested. Please see section [Testing and examples](#testing-and-examples)
 
-To perform a clean build run `make clean` first, this will delete all
-output files, including `Makefile.settings`.
+Finally, the build library can be installed by running 
+```
+$ cmake --install $path_to_build_directory
+```
+The default location for `libdecomp2d.a` is `$path_to_build_directory/opt/lib`or  `$path_to_build_directory/opt/lib64` unless the variable `CMAKE_INSTALL_PREFIX` is modified.
+The module files generated by the build process will similarly be installed to `$path_to_build_directory/opt/install`, users of the library should add this to the include paths for their program.
+
+Occasionally a clean build is required, this can be performed by running
+```
+$ cmake --build $path_to_build_directory --target clean
+```
+
+### GPU compilation
 
-On each build of the library (`make`, `make all`) a temporary file `Makefile.settings` with
-all current options (`FFLAGS`, `DEFS`, etc.) will be created, and included
-on subsequent invocations, the user therefore does not need to keep
-specifying options between builds.
+The library can perform multi GPU offoloading using the NVHPC compiler suite for NVIDIA hardware. 
+The implementation is based on CUDA-aware MPI and NVIDIA Collective Communication Library (NCCL).
+The FFT is based on cuFFT.
 
-To perform a clean build run `make clean` first, this will delete all
-output files, including `Makefile.settings`.
+For details of how to configure 2DECOMP&FFT for GPU offload, see the GPU compilation section in
+[INSTALL.md](INSTALL.md).
 
 ## Testing and examples
 
-Various example code to exercise 2decomp functionality can be found under ``examples/``
-and can be built from the top-level directory by executing
+By default building of the tests is deactivated. 
+To activate the testing the option `-DBUILD_TESTING=ON` can be added or 
+alternativey the option can be activated in the GUI interface `ccmake`.
+After building the library can be tested by running
 ```
-make check
+$ ctest --test-dir $path_to_build_directory
 ```
-which will (re)build 2decomp&fft as necessary.
+which uses the `ctest` utility. By default tests are performed in serial, 
+but more than 1 rank can be used by setting `MPIEXEC_MAX_NUMPROCS` under `ccmake` utility.
+It is also possible to specify the decomposition by setting 
+`PROW` and `PCOL` parameters at the configure stage or using `ccmake`. 
+During the configure stage users should ensure that the number of MPI tasks `MPIEXEC_MAX_NUMPROCS` 
+is equal to the product of PROW times PCOL. 
+Mesh resolution can also be imposed using the parameters `NX`, `NY` and `NZ`. 
+
+For the GPU implementation please be aware that it is based on a single MPI rank per GPU. 
+Therefore, to test multiple GPUs, use the maximum number of available GPUs 
+on the system/node and not the maximum number of MPI tasks. 
 
-**TODO** Convert examples to tests and automate running them
-
-## GPU compilation
-
-The library can perform multi GPU offoloading using the NVHPC compiler suite for NVIDIA hardware. 
-The implementation is based on CUDA-aware MPI and NVIDIA Collective Communication Library (NCCL).
-The FFT is based on cuFFT. 
-To compile the library for GPU it is possible to execute the following
-```
-make CMP=nvhpc FFT=cufft PARAMOD=gpu CUFFT_PATH=PATH_TO_NVHPC/Vers/Linux_x86_64/Vers/compilers/ 
-``` 
-The `Makefile` will look for the relative libraries (NVCC, cuFFT, etc) under the `${CUFFT_PATH}/include`
-NCCL is not activated by default. If NCCL is installed/required use `NCCL=yes`. 
-The current implementation relays also on opeanACC
-and on automatic optimization of `do concurrent` loops.
-By default the compute architecture for the GPU is 80 (i.e. Ampere), to change it use `CCXY=XY` 
- 
 ## Profiling
 
-Profiling can be activated in the Makefile. Set the variable `PROFILER` to one of the supported profilers (only `caliper` currently). If using `caliper`, provide the installation path in the variable `CALIPER_PATH`. When the profiling is active, one can tune it before calling `decomp_2d_init` using the subroutine `decomp_profiler_prep`. The input argument for this subroutine is a logical array of size 4. Each input allow activation / deactivation of the profiling as follows :
+The 2DECOMP&FFT library has integrated profiling support via external libraries, see the Profiling
+section of [INSTALL.md](INSTALL.md) for instructions on configuring a profiling build.
+Currently, support for profiling is provided by the `caliper` library.
+
+When the profiling is active, one can tune it before calling `decomp_2d_init` using the subroutine
+`decomp_profiler_prep`. 
+The input argument for this subroutine is a logical array of size 4. 
+Each input allow activation / deactivation of the profiling as follows :
 
 1. Profile transpose operations (default : true)
-2. Profile IO operations (default : true))
+2. Profile IO operations (default : true)
 3. Profile FFT operations (default : true)
 4. Profile decomp_2d init / fin subroutines (default : true)
 
-## Optional dependencies
-
-### FFTW
-
-The library [fftw](http://www.fftw.org/index.html) can be used as a backend for the FFT engine. The version 3.3.10 was tested, is supported and can be downloaded [here](http://www.fftw.org/download.html). Please note that one should build fftw and decomp2d against the same compilers. For build instructions, please check [here](http://www.fftw.org/fftw3_doc/Installation-on-Unix.html). Below is a suggestion for the compilation of the library in double precision (add `--enable-single` for a single precision build):
+## FFT backends
 
+The library provides a built-in FFT engine and supports various FFT backends : 
+FFTW, Intel oneMKL, Nvidia cuFFT. 
+The FFT engine selected during compilation is available through the variable `D2D_FFT_BACKEND` 
+defined in the module `decomp_2d_fft`. The expected value is defined by the integer constants
 ```
-wget http://www.fftw.org/fftw-3.3.10.tar.gz
-tar xzf fftw-3.3.10.tar.gz
-mkdir fftw-3.3.10_tmp && cd fftw-3.3.10_tmp
-../fftw-3.3.10/configure --prefix=xxxxxxx/fftw3/fftw-3.3.10_bld --enable-shared
-make -j
-make -j check
-make install
+integer, parameter, public :: D2D_FFT_BACKEND_GENERIC = 0   ! Built-in engine
+integer, parameter, public :: D2D_FFT_BACKEND_FFTW3 = 1     ! FFTW
+integer, parameter, public :: D2D_FFT_BACKEND_FFTW3_F03 = 2 ! FFTW (Fortran 2003)
+integer, parameter, public :: D2D_FFT_BACKEND_MKL = 3       ! Intel oneMKL
+integer, parameter, public :: D2D_FFT_BACKEND_CUFFT = 4     ! Nvidia cuFFT
 ```
+exported by the module `decomp_2d_constants`.
+The external code can use the named variables to check the FFT backend used in a given build.
+
+### OVERWRITE flag
+
+- The generic backend supports the OVERWRITE flag but it can not perform in-place transforms
+- The FFTW3 and FFTW3_F03 backends support the OVERWRITE flag and can perform in-place complex 1D fft
+- The oneMKL backend supports the OVERWRITE flag and can perform in-place complex 1D fft
+- The cuFFT backend supports the OVERWRITE flag and can perform in-place complex 1D fft
 
-### Caliper
+## Miscellaneous
 
-The library [caliper](https://github.com/LLNL/Caliper) can be used to profile the execution of the code. The version 2.8.0 was tested and is supported. Please note that one must build caliper and decomp2d against the same C/C++/Fortran compilers and MPI libray. For build instructions, please check [here](https://github.com/LLNL/Caliper#building-and-installing) and [here](https://software.llnl.gov/Caliper/CaliperBasics.html#build-and-install). Below is a suggestion for the compilation of the library using the GNU compilers:
+### Print the log to a file or to stdout
 
+Before calling `decomp_2d_init`, the external code can modify the variable `decomp_log` 
+to change the output for the log. The expected value is defined by the integer constants
 ```
-git clone https://github.com/LLNL/Caliper.git caliper_github
-cd caliper_github
-git checkout v2.8.0
-mkdir build && cd build
-cmake -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DCMAKE_Fortran_COMPILER=gfortran -DCMAKE_INSTALL_PREFIX=../../caliper_build_2.8.0 -DWITH_FORTRAN=yes -DWITH_MPI=yes -DBUILD_TESTING=yes ../
-make -j
-make test
-make install
+integer, parameter, public :: D2D_LOG_QUIET = 0       ! No logging output
+integer, parameter, public :: D2D_LOG_STDOUT = 1      ! Root rank logs output to stdout
+integer, parameter, public :: D2D_LOG_TOFILE = 2      ! Root rank logs output to the file "decomp_2d_setup.log"
+integer, parameter, public :: D2D_LOG_TOFILE_FULL = 3 ! All ranks log output to a dedicated file
 ```
+exported by the `decomp_2d_constants` module.
+Although their values are shown here, users should not rely on these and are recommended to prefer to use the named variables `D2D_LOG_QUIET`, etc. instead.
+The default value used is `D2D_LOG_TOFILE` for the default build and `D2D_LOG_TOFILE_FULL` for a debug build.
+
+### Change the debug level for debug builds
+
+Before calling `decomp_2d_init`, the external code can modify the variable `decomp_debug` 
+to change the debug level. The user can also modify this variable using the environment 
+variable `DECOMP_2D_DEBUG`. Please note that the environment variable is processed only for debug builds.
+The expected value for the variable `decomp_debug` is some integer between 0 and 6, bounds included.
+
+### Code formatting
+
+The code is formatted using the `fprettify` program (available via `pip`), 
+to ensure consistency of use there is a script file `scripts/format.sh` 
+which will run `fprettify` across the 2decomp&fft source, 
+you can also use the `format` build target to run the script.
+It is recommended that you should format the code before making a pull request.
+
+### Versioning
+
+The development of 2DECOMP&FFT occurs on Github, with release versions on the `main` branch.
+New features will be implemented on the `dev` branch 
+and merged into `main` once a new release
+is ready.
+For example, starting from `v2.0.0` the `main` branch will only be updated to receive fixes giving
+`v2.0.1`, etc. until the next release (either `v2.1.0` or `v3.0.0` depending on the magnitude of the
+change is ready).
+
+### Contributing
+
+If you would like to contribute to the development of the 2DECOMP&FFT library or report a bug please refer to 
+the [Contributing section](Contribute.md)
diff --git a/cmake/D2D_Compilers.cmake b/cmake/D2D_Compilers.cmake
new file mode 100644
index 00000000..101651fd
--- /dev/null
+++ b/cmake/D2D_Compilers.cmake
@@ -0,0 +1,117 @@
+# Compilers CMakeLists
+
+set(Fortran_COMPILER_NAME ${CMAKE_Fortran_COMPILER_ID} )
+message(STATUS "COMP ID ${Fortran_COMPILER_NAME}")
+message(STATUS "Fortran compiler name ${Fortran_COMPILER_NAME}")
+message(STATUS "Fortran compiler version ${CMAKE_Fortran_COMPILER_VERSION}")
+
+
+
+if (Fortran_COMPILER_NAME MATCHES "GNU")
+  # gfortran
+  message(STATUS "Setting gfortran flags")
+  include(D2D_flags_gnu)
+  #set(D2D_FFLAGS "-cpp -std=f2008 -ffree-line-length-none")
+  #if (CMAKE_Fortran_COMPILER_VERSION GREATER_EQUAL "10")
+  #  message(STATUS "Set New Fortran basic flags")
+  #  set(D2D_FFLAGS "${D2D_FFLAGS} -fallow-argument-mismatch")
+  #endif (CMAKE_Fortran_COMPILER_VERSION GREATER_EQUAL "10")
+  #set(D2D_FFLAGS_RELEASE "-O3 -march=native")
+  #set(D2D_FFLAGS_DEBUG   "-DDEBUG -g3 -Og -ffpe-trap=invalid,zero -fcheck=all -fimplicit-none")
+  #set(D2D_FFLAGS_DEV     "${D2D_FFLAGS_DEBUG} -Wall -Wpedantic -Wno-unused-function -Werror -Wno-integer-division")
+elseif (Fortran_COMPILER_NAME MATCHES "Intel")
+  message(STATUS "Setting ifort flags")
+  include(D2D_flags_intel)
+elseif (Fortran_COMPILER_NAME MATCHES "NAG")
+  message(STATUS "Setting nagfor flags")
+  include(D2D_flags_nag)
+elseif (Fortran_COMPILER_NAME MATCHES "Cray")
+  message(STATUS "Setting cray fortran flags")
+  include(D2D_flags_cray)
+elseif (Fortran_COMPILER_NAME MATCHES "NVHPC")
+  message(STATUS "Setting NVHPC fortran flags")
+  include(D2D_flags_nvidia)
+# elseif (Fortran_COMPILER_NAME MATCHES "Flang")
+#   message(STATUS "Setting Flang flags")
+#   set(CMAKE_Fortran_FLAGS "-cpp -std=f2008" CACHE STRING
+# 	  "Baseline FFLAGS"
+# 	  FORCE)
+elseif (Fortran_COMPILER_NAME MATCHES "Fujitsu")
+  message(STATUS "Setting Fujitsu fortran flags")
+  include(D2D_flags_fujitsu)
+elseif (Fortran_COMPILER_NAME MATCHES "Flang")
+  message(STATUS "Setting Flang flags")
+  include(D2D_flags_flang)
+else (Fortran_COMPILER_NAME MATCHES "GNU")
+  message ("CMAKE_Fortran_COMPILER full path: " ${CMAKE_Fortran_COMPILER})
+  message ("Fortran compiler: " ${Fortran_COMPILER_NAME})
+  message ("No optimized Fortran compiler flags are known, we just try -O2...")
+  set(D2D_FFLAGS_RELEASE "-O2")
+  set(D2D_FFLAGS_DEBUG   "-O0 -g")
+endif (Fortran_COMPILER_NAME MATCHES "GNU")
+
+if (NOT FLAGS_SET)
+  set(CMAKE_Fortran_FLAGS ${D2D_FFLAGS} CACHE STRING 
+	"Base FFLAGS for build" FORCE)
+  set(CMAKE_Fortran_FLAGS_RELEASE ${D2D_FFLAGS_RELEASE} CACHE STRING
+  	"Additional FFLAGS for Release (optimised) build" FORCE)
+  set(CMAKE_Fortran_FLAGS_DEBUG ${D2D_FFLAGS_DEBUG} CACHE STRING
+  	"Additional FFLAGS for Debug build" FORCE)
+  set(CMAKE_Fortran_FLAGS_DEV ${D2D_FFLAGS_DEV} CACHE STRING
+  	"Additional FFLAGS for Dev build" FORCE)
+  # Add profiler
+  #if (ENABLE_PROFILER)
+  #  set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${D2D_EXE_LINKER_FLAGS}" CACHE STRING 
+  #          "Add profiler to exe" FORCE)
+  #endif(ENABLE_PROFILER)
+
+  set(FLAGS_SET 1 CACHE INTERNAL "Flags are set")
+endif()
+
+if (CMAKE_BUILD_TYPE MATCHES "DEBUG")
+  add_definitions("-DDEBUG")
+endif (CMAKE_BUILD_TYPE MATCHES "DEBUG")
+
+if (ENABLE_INPLACE)
+  add_definitions("-DOVERWRITE")
+endif ()
+
+# Padded MPI alltoall transpose operations (invalid for GPU)
+if (EVEN)
+  if (BUILD_TARGET MATCHES "gpu")
+    message(FATAL_ERROR "The GPU build is not compatible with padded alltoall")
+  endif ()
+  add_definitions("-DEVEN")
+endif ()
+
+execute_process(
+  COMMAND git describe --tag --long --always
+  WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+  OUTPUT_VARIABLE GIT_VERSION
+  OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+add_definitions("-DVERSION=\"${GIT_VERSION}\"")
+option(DOUBLE_PRECISION "Build Xcompact with double precision" ON)
+if (DOUBLE_PRECISION)
+  add_definitions("-DDOUBLE_PREC")
+endif()
+
+option(SINGLE_PRECISION_OUTPUT "Build XCompact with output in single precision" OFF)
+if (SINGLE_PRECISION_OUTPUT)
+  add_definitions("-DSAVE_SINGLE")
+endif()
+
+if (IO_BACKEND MATCHES "mpi")
+  message(STATUS "Using mpi (default) IO backend")
+elseif (IO_BACKEND MATCHES "adios2")
+  message(STATUS "Using ADIOS2 IO backend")
+  find_package(adios2 REQUIRED)
+  if (NOT ADIOS2_HAVE_MPI)
+    message(FATAL_ERROR "MPI support is missing in the provided ADIOS2 build")
+  endif (NOT ADIOS2_HAVE_MPI)
+  if (NOT ADIOS2_HAVE_Fortran)
+    message(FATAL_ERROR "Fortran support is missing in the provided ADIOS2 build")
+  endif (NOT ADIOS2_HAVE_Fortran)
+else (IO_BACKEND MATCHES "mpi")
+  message(FATAL_ERROR "Invalid value for CMake variable IO_BACKEND")
+endif (IO_BACKEND MATCHES "mpi")
diff --git a/cmake/D2D_GPU.cmake b/cmake/D2D_GPU.cmake
new file mode 100644
index 00000000..e173a8c2
--- /dev/null
+++ b/cmake/D2D_GPU.cmake
@@ -0,0 +1,46 @@
+# GPU CMakeLists
+message(STATUS "Check GPU")
+
+if (ENABLE_OPENACC)
+  include(FindOpenACC)
+  if(OpenACC_Fortran_FOUND)
+    message(STATUS "OpenACC for Fotran Compiler Found, version ${OpenACC_Fortran_VERSION_MAJOR}.${OpenACC_Fortran_VERSION_MINOR}")
+  else()
+    message(ERROR_CRITICAL "No OpenACC support detected")
+  endif()
+endif()
+
+if (ENABLE_CUDA)
+  find_package(CUDAToolkit REQUIRED)
+  if (NOT SET_CUDA_ARCH)
+    set(SET_CUDA_ARCH 1 CACHE INTERNAL "Set CUDA Architecture" FORCE)
+    set(CUDA_ARCH_TEST 70 )
+    if(${CMAKE_VERSION} VERSION_LESS_EQUAL "3.13.4")
+      cuda_select_nvcc_arch_flags(ARCH_FLAGS "Auto") # optional argument for arch to add
+      message(STATUS "ARCH_FLAGS = ${ARCH_FLAGS}")
+      string(REPLACE "-gencode;" "--generate-code=" ARCH_FLAGS "${ARCH_FLAGS}")
+      string(APPEND CMAKE_CUDA_FLAGS "${ARCH_FLAGS}")
+      message(STATUS "ARCH_FLAGS WITH CUDA = ${ARCH_FLAGS}")
+    else()
+      include(FindCUDA/select_compute_arch)
+      CUDA_DETECT_INSTALLED_GPUS(INSTALLED_GPU_CCS_1)
+      string(STRIP "${INSTALLED_GPU_CCS_1}" INSTALLED_GPU_CCS_2)
+      string(REPLACE " " ";" INSTALLED_GPU_CCS_3 "${INSTALLED_GPU_CCS_2}")
+      string(REPLACE "." "" CUDA_ARCH_LIST "${INSTALLED_GPU_CCS_3}")
+      SET(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH_LIST})
+      set_property(GLOBAL PROPERTY CUDA_ARCHITECTURES "${CUDA_ARCH_LIST}")
+      message(STATUS "CUDA_ARCHITECTURES ${CUDA_ARCH_LIST}")
+      list(GET CUDA_ARCH_LIST 0 CUDA_ARCH_AUTO)
+      message(STATUS "CUDA_ARCH_AUTO ${CUDA_ARCH_AUTO}")
+    endif()
+    if(${CUDA_ARCH_AUTO} GREATER ${CUDA_ARCH_TEST})
+      set(CUDA_ARCH_COMP ${CUDA_ARCH_AUTO} CACHE STRING "Set CUDA Computing Architecture")
+    else()
+      set(CUDA_ARCH_COMP ${CUDA_ARCH_TEST} CACHE STRING "Set CUDA Computing Architecture")
+    endif()
+  else()
+    set(CUDA_ARCH_COMP ${SET_CUDA_ARCH})
+  endif()
+  message(STATUS "CUDA_COMP ${CUDA_ARCH_COMP}")
+endif()
+
diff --git a/cmake/D2D_MPI.cmake b/cmake/D2D_MPI.cmake
new file mode 100644
index 00000000..d2ac8060
--- /dev/null
+++ b/cmake/D2D_MPI.cmake
@@ -0,0 +1,72 @@
+# MPI CMakeLists
+find_package(MPI REQUIRED COMPONENTS Fortran)
+set(D2D_MPI_FAMILY "Unknown")
+
+# adios2 IO backend requires C and C++ MPI components
+if (IO_BACKEND MATCHES "adios2")
+    find_package(MPI REQUIRED COMPONENTS C CXX)
+endif (IO_BACKEND MATCHES "adios2")
+
+# Stop if there is no MPI_Fortran_Compiler
+if (MPI_Fortran_COMPILER)
+    message(STATUS "MPI_Fortran_COMPILER found: ${MPI_Fortran_COMPILER}")
+    message(STATUS "MPI_VERSION found: ${MPI_VERSION}")
+    # Try to guess the MPI type to adapt compilation flags if necessary
+    string(FIND "${MPI_Fortran_COMPILER}" "mpich" pos)
+    if(pos GREATER_EQUAL "0")
+      set(D2D_MPI_FAMILY "MPICH")
+      message(STATUS "MPI is MPICH type")
+    endif()
+    string(FIND "${MPI_Fortran_COMPILER}" "openmpi" pos)
+    if(pos GREATER_EQUAL "0")
+      set(D2D_MPI_FAMILY "OMPI")
+      message(STATUS "MPI is openMPI type")
+    endif()
+
+    if (${D2D_MPI_FAMILY} STREQUAL "Unknown")
+      execute_process(COMMAND ${MPI_Fortran_COMPILER} "-show"
+	OUTPUT_VARIABLE mpi_show
+	ERROR_QUIET)
+      
+      string(FIND "${mpi_show}" "openmpi" pos)
+      if(pos GREATER_EQUAL "0")
+	set(D2D_MPI_FAMILY "OMPI")
+      endif()
+
+      string(FIND "${mpi_show}" "mpich" pos)
+      if(pos GREATER_EQUAL "0")
+	set(D2D_MPI_FAMILY "MPICH")
+      endif()
+    endif()
+
+    message(STATUS "MPI Compiler family: ${D2D_MPI_FAMILY}")
+else (MPI_Fortran_COMPILER)
+    message(SEND_ERROR "This application cannot compile without MPI")
+endif(MPI_Fortran_COMPILER)
+# Warning if Include are not found => can be fixed with more recent cmake version
+if (MPI_FOUND)
+  message(STATUS "MPI FOUND: ${MPI_FOUND}")
+  include_directories(SYSTEM ${MPI_INCLUDE_PATH})
+  message(STATUS "MPI INCL ALSO FOUND: ${MPI_INCLUDE_PATH}")
+  if (NOT MPI_NUMPROCS_SET)
+    # Save the Maximim number of MPI ranks on the system
+    set(MAX_NUMPROCS ${MPIEXEC_MAX_NUMPROCS} CACHE STRING "SAVE NRANKS MAX" FORCE)
+    message(STATUS "Reset the number of ranks to 1")
+    set(MPIEXEC_MAX_NUMPROCS "1" CACHE STRING
+        "Set the initial value to 1 rank" FORCE)
+    set(MPI_NUMPROCS_SET 1 CACHE INTERNAL "MPI Ranks set" FORCE)
+    # Force the mpirun to be coherent with the mpifortran
+    string(REGEX REPLACE "mpif90" "mpirun" PATH_TO_MPIRUN "${MPI_Fortran_COMPILER}")
+    string(REPLACE "mpiifort" "mpirun" PATH_TO_MPIRUN "${PATH_TO_MPIRUN}")
+    string(REPLACE "mpiifx" "mpirun" PATH_TO_MPIRUN "${PATH_TO_MPIRUN}")
+    message(STATUS "Path to mpirun ${PATH_TO_MPIRUN}")
+    set(MPIEXEC_EXECUTABLE "${PATH_TO_MPIRUN}" CACHE STRING
+        "Force MPIRUN to be consistent with MPI_Fortran_COMPILER" FORCE)
+  endif()
+else (MPI_FOUND)
+  message(STATUS "NO MPI include have been found. The executable won't be targeted with MPI include")
+  message(STATUS "Code will compile but performaces can be compromised")
+  message(STATUS "Using a CMake vers > 3.10 should solve the problem")
+  message(STATUS "Alternatively use ccmake to manually set the include if available")
+endif (MPI_FOUND)
+
diff --git a/cmake/D2D_Profilers.cmake b/cmake/D2D_Profilers.cmake
new file mode 100644
index 00000000..fcf73f1d
--- /dev/null
+++ b/cmake/D2D_Profilers.cmake
@@ -0,0 +1,10 @@
+# Profilers CMakeLists
+
+if (ENABLE_PROFILER)
+
+  if (ENABLE_PROFILER MATCHES "caliper")
+    enable_language(CXX)
+    find_package(caliper REQUIRED)
+  endif()
+  
+endif()
diff --git a/cmake/compilers/D2D_flags_cray.cmake b/cmake/compilers/D2D_flags_cray.cmake
new file mode 100644
index 00000000..26dbf71c
--- /dev/null
+++ b/cmake/compilers/D2D_flags_cray.cmake
@@ -0,0 +1,5 @@
+#Compilers Flags for Cray
+
+set(D2D_FFLAGS "-eF -g -N 1023 -M878")
+set(D2D_FFLAGS_RELEASE "-O3")
+set(D2D_FFLAGS_DEBUG   "-O0 -g")
diff --git a/cmake/compilers/D2D_flags_flang.cmake b/cmake/compilers/D2D_flags_flang.cmake
new file mode 100644
index 00000000..e50be9b4
--- /dev/null
+++ b/cmake/compilers/D2D_flags_flang.cmake
@@ -0,0 +1,5 @@
+# Compilers Flags for AOCC
+set(D2D_FFLAGS "-cpp -g")
+set(D2D_FFLAGS_RELEASE "-O3")
+set(D2D_FFLAGS_DEBUG   "-g -O0 -DDEBUG")
+set(D2D_FFLAGS_DEV     "${D2D_FFLAGS_DEBUG}")
diff --git a/cmake/compilers/D2D_flags_fujitsu.cmake b/cmake/compilers/D2D_flags_fujitsu.cmake
new file mode 100644
index 00000000..5d1675de
--- /dev/null
+++ b/cmake/compilers/D2D_flags_fujitsu.cmake
@@ -0,0 +1,5 @@
+# Compilers flags for Fujitsu
+
+  set(D2D_FFLAGS "-Cpp")
+  set(D2D_FFLAGS_RELEASE "-O3")
+  set(D2D_FFLAGS_DEBUG   "-O0")
diff --git a/cmake/compilers/D2D_flags_gnu.cmake b/cmake/compilers/D2D_flags_gnu.cmake
new file mode 100644
index 00000000..9e0ba0f2
--- /dev/null
+++ b/cmake/compilers/D2D_flags_gnu.cmake
@@ -0,0 +1,19 @@
+# Flags for GNU compiler
+set(D2D_FFLAGS "-cpp -std=f2008 -ffree-line-length-none")
+if (CMAKE_Fortran_COMPILER_VERSION GREATER_EQUAL "10")
+  message(STATUS "Set New Fortran basic flags")
+  set(D2D_FFLAGS "${D2D_FFLAGS} -fallow-argument-mismatch")
+  set(D2D_GNU10 TRUE)
+else (CMAKE_Fortran_COMPILER_VERSION GREATER_EQUAL "10")
+  set(D2D_GNU10 FALSE)
+endif (CMAKE_Fortran_COMPILER_VERSION GREATER_EQUAL "10")
+set(D2D_FFLAGS_RELEASE "-O3 -march=native")
+set(D2D_FFLAGS_DEBUG   "-DDEBUG -g3 -Og -ffpe-trap=invalid,zero -fcheck=all -fimplicit-none")
+if (((${D2D_MPI_FAMILY} STREQUAL "MPICH") OR (${D2D_MPI_FAMILY} STREQUAL "Unknown")) AND D2D_GNU10)
+  set(D2D_FFLAGS_DEV     "${D2D_FFLAGS_DEBUG} -Wall -Wno-unused-function -Wno-integer-division")
+else()
+  set(D2D_FFLAGS_DEV     "${D2D_FFLAGS_DEBUG} -Wall -Wpedantic -Wno-unused-function -Werror -Wno-integer-division")
+endif()
+if ((${D2D_MPI_FAMILY} STREQUAL "OMPI") AND (${FFT_Choice} MATCHES "generic"))
+  set(D2D_FFLAGS_DEV     "${D2D_FFLAGS_DEV} -Wimplicit-procedure -Wimplicit-interface")
+endif()
diff --git a/cmake/compilers/D2D_flags_intel.cmake b/cmake/compilers/D2D_flags_intel.cmake
new file mode 100644
index 00000000..598a7a86
--- /dev/null
+++ b/cmake/compilers/D2D_flags_intel.cmake
@@ -0,0 +1,14 @@
+# Compilers Flags for Intel
+# Check is the compiler is the new ifx based on LLVM or the old ifort
+if (Fortran_COMPILER_NAME MATCHES "IntelLLVM")
+  set(D2D_FFLAGS "-fpp -std08 -safe-cray-ptr -g -traceback")
+  set(D2D_FFLAGS_RELEASE "-O3")
+  #set(D2D_FFLAGS "-fpp -std08 -xHost -heaparrays -safe-cray-ptr -g -traceback")
+  #set(D2D_FFLAGS_RELEASE "-O3 -ipo")
+else (Fortran_COMPILER_NAME MATCHES "IntelLLVM")
+  #set(CMAKE_Fortran_FLAGS "-cpp xSSE4.2 -axAVX,CORE-AVX-I,CORE-AVX2 -ipo -fp-model fast=2 -mcmodel=large -safe-cray-ptr")
+  set(D2D_FFLAGS "-fpp -std08 -xHost -heaparrays -safe-cray-ptr -g -traceback")
+  set(D2D_FFLAGS_RELEASE "-O3 -ipo")
+endif (Fortran_COMPILER_NAME MATCHES "IntelLLVM")
+set(D2D_FFLAGS_DEBUG   "-g -O0 -debug extended -traceback -DDEBUG")
+set(D2D_FFLAGS_DEV     "${D2D_FFLAGS_DEBUG} -warn all,noexternal")
diff --git a/cmake/compilers/D2D_flags_nag.cmake b/cmake/compilers/D2D_flags_nag.cmake
new file mode 100644
index 00000000..313361af
--- /dev/null
+++ b/cmake/compilers/D2D_flags_nag.cmake
@@ -0,0 +1,5 @@
+# Compiler F;ags for NAG
+
+set(D2D_FFLAGS "-fpp")
+set(D2D_FFLAGS_RELEASE "-O3")
+set(D2D_FFLAGS_DEBUG   "-O0 -g")
diff --git a/cmake/compilers/D2D_flags_nvidia.cmake b/cmake/compilers/D2D_flags_nvidia.cmake
new file mode 100644
index 00000000..53e7b87a
--- /dev/null
+++ b/cmake/compilers/D2D_flags_nvidia.cmake
@@ -0,0 +1,38 @@
+#Compilers Flags for NVIDIA
+
+set(D2D_FFLAGS "-cpp -Mfree -Kieee")
+set(D2D_FFLAGS_RELEASE "-O3")
+set(D2D_FFLAGS_DEBUG   "-O0 -g -traceback -Mbounds -Mchkptr -Ktrap=fp")
+set(D2D_FFLAGS_DEV     "${D2D_FFLAGS_DEBUG}")
+if (BUILD_TARGET MATCHES "mpi")
+  set(D2D_FFLAGS_RELEASE "${D2D_FFLAGS_RELEASE} -fast -march=native")
+endif (BUILD_TARGET MATCHES "mpi")
+
+if (BUILD_TARGET MATCHES "gpu")
+  set(D2D_FFLAGS "${D2D_FFLAGS} -Minfo=accel -target=gpu")
+  add_definitions("-D_GPU")
+  if (ENABLE_OPENACC)
+    set(D2D_FFLAGS "${D2D_FFLAGS} -acc")
+  endif()
+  if (ENABLE_CUDA)
+    add_definitions("-DUSE_CUDA")
+    set(D2D_FFLAGS "${D2D_FFLAGS} -cuda")
+    # Add Compute Capabilities and memory managemnt 
+    if (ENABLE_MANAGED)
+	    set(D2D_FFLAGS "${D2D_FFLAGS} -gpu=cc${CUDA_ARCH_COMP},managed,lineinfo")
+    else (ENABLE_MANAGED)
+	    set(D2D_FFLAGS "${D2D_FFLAGS} -gpu=cc${CUDA_ARCH_COMP},lineinfo")
+    endif(ENABLE_MANAGED)
+    # Add NCCL cuFFT
+    if (ENABLE_NCCL)
+      add_definitions("-D_NCCL")
+      set(D2D_FFLAGS "${D2D_FFLAGS} -cudalib=nccl,cufft")
+    else(ENABLE_NCCL)
+      set(D2D_FFLAGS "${D2D_FFLAGS} -cudalib=cufft")
+    endif(ENABLE_NCCL)
+  endif(ENABLE_CUDA)
+  # Add profiler
+  #if (ENABLE_PROFILER)
+  #  set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -lnvhpcwrapnvtx")
+  #endif(ENABLE_PROFILER)
+endif (BUILD_TARGET MATCHES "gpu")
diff --git a/cmake/decomp2d-build_config.cmake.in b/cmake/decomp2d-build_config.cmake.in
new file mode 100644
index 00000000..8c4daec3
--- /dev/null
+++ b/cmake/decomp2d-build_config.cmake.in
@@ -0,0 +1,3 @@
+include("@PROJECT_BINARY_DIR@/decomp2d-targets.cmake")
+
+set(decomp2d_INCLUDE_DIRS "@CMAKE_Fortran_MODULE_DIRECTORY@")
diff --git a/cmake/decomp2d-config-version.cmake.in b/cmake/decomp2d-config-version.cmake.in
new file mode 100644
index 00000000..95bb9ec2
--- /dev/null
+++ b/cmake/decomp2d-config-version.cmake.in
@@ -0,0 +1,5 @@
+#
+# Versioning file for 2decomp&fft
+#
+
+set(PACKAGE_VERSION "@version@")
diff --git a/cmake/decomp2d-config.cmake.in b/cmake/decomp2d-config.cmake.in
new file mode 100644
index 00000000..f519615d
--- /dev/null
+++ b/cmake/decomp2d-config.cmake.in
@@ -0,0 +1,20 @@
+#
+# 2decomp CMake configuration.
+# This is used by other packages to configure themselves against 2decomp&fft.
+#
+
+# Compute installation prefix relative to this file
+get_filename_component(_dir "${CMAKE_CURRENT_LIST_FILE}" PATH)
+get_filename_component(_prefix "${_dir}/../.." ABSOLUTE)
+
+# Import the targets
+if (EXISTS ${_prefix}/lib )
+  message(STATUS "Found decomp2d under lib")
+  include("${_prefix}/lib/decomp2d-targets.cmake")
+else()
+  message(STATUS "Not Found decomp2d under lib, we'll use lib64")
+  include("${_prefix}/lib64/decomp2d-targets.cmake")
+endif()
+
+set(2decomp_INCLUDE_DIR "${_prefix}/include")
+
diff --git a/cmake/fft/downloadFindFFTW.cmake.in b/cmake/fft/downloadFindFFTW.cmake.in
new file mode 100644
index 00000000..28b2b4bb
--- /dev/null
+++ b/cmake/fft/downloadFindFFTW.cmake.in
@@ -0,0 +1,47 @@
+# downloadFindFFTW.cmake.in
+#
+# From the README at https://github.com/egpbos/findFFTW
+#
+# Copyright (c) 2015, Wenzel Jakob; 2017, Patrick Bos
+# All rights reserved.
+# 
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# 
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+# 
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# 
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+# 
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+cmake_minimum_required(VERSION 3.0.2)
+
+project(findFFTW-download NONE)
+
+include(ExternalProject)
+
+ExternalProject_Add(findFFTW_download
+    GIT_REPOSITORY    "https://github.com/egpbos/findfftw.git"
+    CONFIGURE_COMMAND ""
+    BUILD_COMMAND     ""
+    INSTALL_COMMAND   ""
+    TEST_COMMAND      ""
+    SOURCE_DIR        "${CMAKE_CURRENT_BINARY_DIR}/findFFTW-src"
+    BINARY_DIR        ""
+    INSTALL_DIR       ""
+)
diff --git a/cmake/fft/fft.cmake b/cmake/fft/fft.cmake
new file mode 100644
index 00000000..dace35eb
--- /dev/null
+++ b/cmake/fft/fft.cmake
@@ -0,0 +1,71 @@
+# FFT CMakeLists
+
+# Look for fftw if required
+# The download findFFTW code is based on the README at
+# https://github.com/egpbos/findFFTW
+#
+# Copyright (c) 2015, Wenzel Jakob; 2017, Patrick Bos
+# All rights reserved.
+# 
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+# 
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+# 
+# * Redistributions in binary form must reproduce the above copyright notice,
+#   this list of conditions and the following disclaimer in the documentation
+#   and/or other materials provided with the distribution.
+# 
+# * Neither the name of the copyright holder nor the names of its
+#   contributors may be used to endorse or promote products derived from
+#   this software without specific prior written permission.
+# 
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+message(STATUS "SET UP FFT")
+if(${FFT_Choice} MATCHES "fftw")
+  configure_file(${CMAKE_SOURCE_DIR}/cmake/fft/downloadFindFFTW.cmake.in findFFTW-download/CMakeLists.txt)
+  execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
+          RESULT_VARIABLE result
+          WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/findFFTW-download )
+  if(result)
+      message(FATAL_ERROR "CMake step for findFFTW failed: ${result}")
+      else()
+      message("CMake step for findFFTW completed (${result}).")
+  endif()
+  execute_process(COMMAND ${CMAKE_COMMAND} --build .
+          RESULT_VARIABLE result
+          WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/findFFTW-download )
+  if(result)
+      message(FATAL_ERROR "Build step for findFFTW failed: ${result}")
+  endif()
+  
+  set(findFFTW_DIR ${CMAKE_CURRENT_BINARY_DIR}/findFFTW-src)
+  
+  set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${findFFTW_DIR}")
+  find_package(FFTW)
+  message(STATUS "FFTW_FOUND     : ${FFTW_FOUND}")
+  message(STATUS "FFTW_LIBRARIES : ${FFTW_LIBRARIES}")
+  message(STATUS "FFTW_INCLUDE   : ${FFTW_INCLUDE_DIRS}")
+
+  #add_definitions("-lfftw3 -lfftw3f")
+
+elseif(${FFT_Choice} MATCHES "mkl")
+  set(MKL_INTERFACE "lp64")
+  set(MKL_THREADING "sequential")
+  find_package(MKL CONFIG REQUIRED)
+elseif(${FFT_Choice} MATCHES "cufft")
+  message(STATUS "Enable cuFFT")
+  if (ENABLE_CUDA)
+    set(CUFFT_FOUND TRUE)
+  endif()
+endif(${FFT_Choice} MATCHES "fftw")
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
new file mode 100644
index 00000000..4ae36431
--- /dev/null
+++ b/examples/CMakeLists.txt
@@ -0,0 +1,18 @@
+# Set the decomposition and resolution
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/examples/cmake")
+include(D2D_Set_Decomposition)
+include(D2D_Set_Resolution)
+# Create a folder directory for testing
+set(test_dir "${PROJECT_BINARY_DIR}/RunTests")
+file(MAKE_DIRECTORY ${test_dir})
+# add subdirectories
+add_subdirectory(init_test)
+add_subdirectory(test2d)
+add_subdirectory(fft_physical_x)
+add_subdirectory(fft_physical_z)
+add_subdirectory(halo_test)
+add_subdirectory(io_test)
+add_subdirectory(grad3d)
+
+# Set real/complex tests
+#set(COMPLEX_TESTS "OFF" CACHE STRING "Enables complex numbers for tests that support it")
diff --git a/examples/Makefile b/examples/Makefile
deleted file mode 100644
index 195a6e4a..00000000
--- a/examples/Makefile
+++ /dev/null
@@ -1,32 +0,0 @@
-.PHONY: test2d fft_physical_z fft_physical_x halo_test io_test 
-
-# Just build the examples
-examples: test2d fft_physical_z fft_physical_x halo_test io_test
-	@echo "Built the examples"
-
-test2d:
-	$(MAKE) -C $@ $@
-fft_physical_z:
-	$(MAKE) -C $@ $@
-fft_physical_x:
-	$(MAKE) -C $@ $@
-halo_test:
-	$(MAKE) -C $@ $@
-io_test:
-	$(MAKE) -C $@ all
-
-check:
-	cd test2d; $(MAKE) $@
-	cd fft_physical_z; $(MAKE) $@
-	cd fft_physical_x; $(MAKE) $@
-	cd halo_test; $(MAKE) $@
-	cd io_test; $(MAKE) $@
-
-clean:
-	cd test2d; $(MAKE) $@
-	cd fft_physical_z; $(MAKE) $@
-	cd fft_physical_x; $(MAKE) $@
-	cd halo_test; $(MAKE) $@
-	cd io_test; $(MAKE) $@
-
-export
diff --git a/examples/README.md b/examples/README.md
index 48ebdea2..b9ae57ce 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -1,23 +1,11 @@
-Examples
-========
+# Examples
 
-* test2d       - to test the base 2D pencil decomposition module 
+- [Initialization test](init_test)         - to test the initialisation of the DECOMP2D&FFT library
+- [Test decomposition](test2d)             - various tests for the 2D pencil decomposition module and timing 
+- [Test FFT from X pencil](fft_physical_x) - various tests for the FFT starting from the ``X`` direction 
+- [Test FFT from Z pencil](fft_physical_z) - various tests for the FFT starting from the ``Z`` direction 
+- [Test HALO exchange](halo_test)          - to test the halo-cell exchange capability
+- [Test I/O features](io_test)             - various tests for the I/O module
+- [Example gradient of a scalar](grad3d)   - example of how to compute the gradient of a scalar field
 
-* fft_test_c2c - to test the complex-to-complex FFTs 
-
-* fft_test_r2c - to test the real-to-complex/complex-to-real FFTs
-
-* timing       - to benchmark the FFT library
-
-* halo_test    - to test the halo-cell exchange code
-
-* io_test      - to test various IO functions
-
-* p3dfft       - to crosscheck the library against P3DFFT
-
-* non_blocking - to test the idea of overlap communication and computation
-
-* tecplot_view - to generate Tecplot visualisation of the decomposition
-
-
-Some examples may require external libraries to be built first. Refer to the README files for each example for details.
+Refer to the README files for each example for details.
diff --git a/examples/cmake/D2D_Set_Decomposition.cmake b/examples/cmake/D2D_Set_Decomposition.cmake
new file mode 100644
index 00000000..3e577658
--- /dev/null
+++ b/examples/cmake/D2D_Set_Decomposition.cmake
@@ -0,0 +1,23 @@
+# PROW/PCOL options
+# Tests that accept PROW/PCOL as arguments will use these values.
+# To simplify out of the box testing, default to 0.
+set(PROW 0 CACHE STRING
+  "Number of processor rows - PROWxPCOL=NP must be satisfied, 0 for autotuning")
+set(PCOL 0 CACHE STRING
+  "Number of processor rows - PROWxPCOL=NP must be satisfied, 0 for autotuning")
+# In case decomposition is imposed force number of MPI task to be consistent
+math(EXPR NUMPROCS "${PROW} * ${PCOL}")
+message(STATUS "Computed NRANK: ${NUMPROCS} Max avail ${MAX_NUMPROCS}")
+set(ADD_DECOMP_TO_EXAMPLE FALSE)
+if (NUMPROCS GREATER "0")
+  if (NUMPROCS LESS "${MAX_NUMPROCS}")
+    message(STATUS "Decomposion has been imposed to ${PROW}X${PCOL}: number of MPI tasks to be used is imposed to ${NUMPROCS}")
+    set(MPIEXEC_MAX_NUMPROCS "${NUMPROCS}" CACHE STRING
+            "Force N to be p_row*p_col" FORCE)
+  else ()
+    message(STATUS "The decomposition ${PROW}x${PCOL} cannot be run. Only ${MAX_NUMPROCS} are available. Default testing is performed")
+  endif()
+endif ()
+#string(JOIN " " TEST_ARGUMENTS "${PROW}" "${PCOL}")
+set(TEST_ARGUMENTS "${PROW}" "${PCOL}")
+message(STATUS "Test argument string ${TEST_ARGUMENTS}")
diff --git a/examples/cmake/D2D_Set_Resolution.cmake b/examples/cmake/D2D_Set_Resolution.cmake
new file mode 100644
index 00000000..dae567c1
--- /dev/null
+++ b/examples/cmake/D2D_Set_Resolution.cmake
@@ -0,0 +1,11 @@
+# Set the resolution in each direction
+set(NX 0 CACHE STRING "Number of points in X direction, 0 for default resolution")
+set(NY 0 CACHE STRING "Number of points in Y direction, 0 for default resolution")
+set(NZ 0 CACHE STRING "Number of points in Z direction, 0 for default resolution")
+math(EXPR NPOINTS "${NX} * ${NY} * ${NZ}")
+message(STATUS "Total number of points: ${NPOINTS}")
+if (NPOINTS GREATER "0")
+  list(APPEND TEST_ARGUMENTS "${NX}")
+  list(APPEND TEST_ARGUMENTS "${NY}")
+  list(APPEND TEST_ARGUMENTS "${NZ}")
+endif()
diff --git a/examples/fft_physical_x/.gitignore b/examples/fft_physical_x/.gitignore
index d69230b2..91b1e5ef 100644
--- a/examples/fft_physical_x/.gitignore
+++ b/examples/fft_physical_x/.gitignore
@@ -1 +1,4 @@
 fft_physical_x
+fft_grid_x
+fft_c2c_x
+fft_r2c_x
diff --git a/examples/fft_physical_x/CMakeLists.txt b/examples/fft_physical_x/CMakeLists.txt
new file mode 100644
index 00000000..ca6c65c2
--- /dev/null
+++ b/examples/fft_physical_x/CMakeLists.txt
@@ -0,0 +1,45 @@
+file(GLOB files_fft_c2c fft_c2c_x.f90)
+file(GLOB files_fft_r2c fft_r2c_x.f90)
+file(GLOB files_fft_grid fft_grid_x.f90)
+
+include_directories(${CMAKE_SOURCE_DIR}/src)
+
+add_executable(fft_c2c_x ${files_fft_c2c})
+add_executable(fft_r2c_x ${files_fft_r2c})
+add_executable(fft_grid_x ${files_fft_grid})
+
+target_link_libraries(fft_c2c_x PRIVATE decomp2d)
+target_link_libraries(fft_r2c_x PRIVATE decomp2d)
+target_link_libraries(fft_grid_x PRIVATE decomp2d)
+
+# Run the test(s)
+set(run_dir "${test_dir}/fft_c2c_x")
+message(STATUS "Example dir ${run_dir}")
+file(MAKE_DIRECTORY ${run_dir})
+if (BUILD_TARGET MATCHES "gpu")
+  file(COPY bind.sh DESTINATION ${run_dir})
+  add_test(NAME fft_c2c_x COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} ./bind.sh $<TARGET_FILE:fft_c2c_x> ${TEST_ARGUMENTS} WORKING_DIRECTORY ${run_dir})
+else ()  
+  add_test(NAME fft_c2c_x COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} $<TARGET_FILE:fft_c2c_x> ${TEST_ARGUMENTS} WORKING_DIRECTORY ${run_dir})
+endif()
+#
+set(run_dir "${test_dir}/fft_r2c_x")
+message(STATUS "Example dir ${run_dir}")
+file(MAKE_DIRECTORY ${run_dir})
+if (BUILD_TARGET MATCHES "gpu")
+  file(COPY bind.sh DESTINATION ${run_dir})
+
+  add_test(NAME fft_r2c_x COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} ./bind.sh $<TARGET_FILE:fft_r2c_x> ${TEST_ARGUMENTS} WORKING_DIRECTORY ${run_dir})
+else ()  
+  add_test(NAME fft_r2c_x COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} $<TARGET_FILE:fft_r2c_x> ${TEST_ARGUMENTS} WORKING_DIRECTORY ${run_dir})
+endif()
+#
+set(run_dir "${test_dir}/fft_grid_x")
+message(STATUS "Example dir ${run_dir}")
+file(MAKE_DIRECTORY ${run_dir})
+if (BUILD_TARGET MATCHES "gpu")
+  file(COPY bind.sh DESTINATION ${run_dir})
+  add_test(NAME fft_grid_x COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} ./bind.sh  $<TARGET_FILE:fft_grid_x> ${TEST_ARGUMENTS} WORKING_DIRECTORY ${run_dir})
+else ()  
+  add_test(NAME fft_grid_x COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} $<TARGET_FILE:fft_grid_x> ${TEST_ARGUMENTS} WORKING_DIRECTORY ${run_dir})
+endif()
diff --git a/examples/fft_physical_x/Makefile b/examples/fft_physical_x/Makefile
deleted file mode 100644
index 746c1c63..00000000
--- a/examples/fft_physical_x/Makefile
+++ /dev/null
@@ -1,31 +0,0 @@
-#include ../../src/Makefile.inc
-
-#INCLUDE = -I../../include
-FFLAGS := $(subst $(MODFLAG),$(MODFLAG)../../,$(FFLAGS))
-FFLAGS := $(patsubst -I%,-I../../%,$(FFLAGS))
-LIBS = -L../../ -l$(LIBDECOMP) $(LIBFFT) $(LFLAGS)
-
-OBJ = fft_physical_x.o
-
-NP ?= 1
-MPIRUN ?= mpirun
-
-fft_physical_x: $(OBJ)
-	$(FC) $(FFLAGS) $(OPT) $(DEFS) $(INC) -o $@ $(OBJ) $(LIBS)
-
-ifeq ($(PARAMOD),gpu)
-check:
-	$(MPIRUN) -n $(NP) ./bind.sh ./fft_physical_x
-else
-check:
-	$(MPIRUN) -n $(NP) ./fft_physical_x
-endif
-
-mem_leak:
-	valgrind --leak-check=full --show-leak-kinds=all $(MPIRUN) -n 1 ./fft_physical_x 1 1
-
-clean:
-	rm -f *.o fft_physical_x *.log
-
-%.o : %.f90
-	$(FC) $(FFLAGS) $(OPT) $(DEFS) $(INC) -c $< -o $@
diff --git a/examples/fft_physical_x/README b/examples/fft_physical_x/README
deleted file mode 100644
index 01fa4dc7..00000000
--- a/examples/fft_physical_x/README
+++ /dev/null
@@ -1,13 +0,0 @@
-fft_physical_x
-------
-
-This program can be used to test the fft trasform using X as starting physical dimension. 
-Both c2c and r2c/c2r transforms are testes.
-The results should recover the input data up to machine accuracy
-after a forward and a backward transform and proper normalisation.
-The test automatically resize the problem depending on the number of MPI processes in use
-
-What to expect:
-- The timing results 
-- The error reported should be around machine accuracy (~ 10^-6 for single
-  precision and 10^-15 for double)
diff --git a/examples/fft_physical_x/README.md b/examples/fft_physical_x/README.md
new file mode 100644
index 00000000..732ea5d3
--- /dev/null
+++ b/examples/fft_physical_x/README.md
@@ -0,0 +1,33 @@
+# Test FFT for X pencil decomposition 
+
+List of the tests:
+- [fft_c2c_x](fft_c2c_x.f90): Test Complex to Complex FFT transform; 
+- [fft_r2c_x](fft_r2c_x.f90): Test Real to Complex FFT transform;
+- [fft_grid_x](fft_grid_x.f90): Test Real to Complex transform of a different grid than the one used 
+                                for the initialization. 
+
+
+These programs can be used to test the FFT transform using X-pencils as starting domain decomposition. 
+Both c2c (fft_c2c_x) and r2c/c2r (fft_r2c_x) transforms are tested.
+The case fft_grod uses a different resolution from the one used for the initialization. 
+The results should recover the input data up to machine accuracy
+after a forward and a backward transform and appropriate normalisation.
+The test automatically resize the problem depending on the number of MPI processes in use
+
+What to input: The program takes max 6 inputs as : 
+
+1. p_row [optional]
+1. p_col [optional] 
+1. nx    [optional]
+1. ny    [optional]
+1. nz    [optional]
+1. nt    [optional]
+
+If the decomposition is imposed both (1) and (2) are necessary. 
+If the resolution is imposed (1-5) are necessary
+
+What to expect:
+- The timing results 
+- The error reported should be around machine accuracy (~ 10^-6 for single
+  precision and 10^-15 for double)
+- In case of the GENERIC FFT expect an increase in the order of the error
diff --git a/examples/fft_physical_x/fft_c2c_x.f90 b/examples/fft_physical_x/fft_c2c_x.f90
new file mode 100644
index 00000000..e838d7b1
--- /dev/null
+++ b/examples/fft_physical_x/fft_c2c_x.f90
@@ -0,0 +1,220 @@
+!! SPDX-License-Identifier: BSD-3-Clause
+program fft_c2c_x
+
+   use decomp_2d
+   use decomp_2d_fft
+   use decomp_2d_constants
+   use decomp_2d_mpi
+   use MPI
+#if defined(_GPU)
+   use cudafor
+   use cufft
+   use openacc
+#endif
+
+   implicit none
+
+   integer, parameter :: nx_base = 17, ny_base = 13, nz_base = 11
+   integer :: nx, ny, nz
+   integer :: p_row = 0, p_col = 0
+   integer :: resize_domain
+   integer :: nranks_tot
+   integer :: nargin, arg, FNLength, status, DecInd
+   character(len=80) :: InputFN
+
+   integer :: ntest = 10  ! repeat test this times
+
+   type(decomp_info), pointer :: ph => null()
+   complex(mytype), allocatable, dimension(:, :, :) :: in, out
+
+   real(mytype) :: dr, di, error, err_all
+   integer :: ierror, i, j, k, m
+   integer :: xst1, xst2, xst3
+   integer :: xen1, xen2, xen3
+   double precision :: n1, flops, t1, t2, t3, t4
+#ifdef DOUBLE_PREC
+   real(mytype), parameter :: error_precision = 1.e-12_mytype
+#else
+   real(mytype), parameter :: error_precision = 5.e-6_mytype
+#endif
+
+   call MPI_INIT(ierror)
+   ! To resize the domain we need to know global number of ranks
+   ! This operation is also done as part of decomp_2d_init
+   call MPI_COMM_SIZE(MPI_COMM_WORLD, nranks_tot, ierror)
+   resize_domain = int(nranks_tot / 4) + 1
+   nx = nx_base * resize_domain
+   ny = ny_base * resize_domain
+   nz = nz_base * resize_domain
+   ! Now we can check if user put some inputs
+   ! Handle input file like a boss -- GD
+   nargin = command_argument_count()
+   if ((nargin == 0) .or. (nargin == 2) .or. (nargin == 5) .or. (nargin == 6)) then
+      do arg = 1, nargin
+         call get_command_argument(arg, InputFN, FNLength, status)
+         read (InputFN, *, iostat=status) DecInd
+         if (arg == 1) then
+            p_row = DecInd
+         elseif (arg == 2) then
+            p_col = DecInd
+         elseif (arg == 3) then
+            nx = DecInd
+         elseif (arg == 4) then
+            ny = DecInd
+         elseif (arg == 5) then
+            nz = DecInd
+         elseif (arg == 6) then
+            ntest = DecInd
+         end if
+      end do
+   else
+      ! nrank not yet computed we need to avoid write
+      ! for every rank
+      call MPI_COMM_RANK(MPI_COMM_WORLD, nrank, ierror)
+      if (nrank == 0) then
+         print *, "This Test takes no inputs or 2 inputs as"
+         print *, "  1) p_row (default=0)"
+         print *, "  2) p_col (default=0)"
+         print *, "or 5-6 inputs as"
+         print *, "  1) p_row (default=0)"
+         print *, "  2) p_col (default=0)"
+         print *, "  3) nx "
+         print *, "  4) ny "
+         print *, "  5) nz "
+         print *, "  6) n iterations (optional)"
+         print *, "Number of inputs is not correct and the defult settings"
+         print *, "will be used"
+      end if
+   end if
+
+   call decomp_2d_init(nx, ny, nz, p_row, p_col)
+
+   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+   ! Test the c2c interface
+   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+   call decomp_2d_fft_init(PHYSICAL_IN_X) ! force the default x pencil
+   ph => decomp_2d_fft_get_ph()
+   !  input is X-pencil data
+   ! output is Z-pencil data
+   call alloc_x(in, ph, opt_global=.true.)
+   call alloc_z(out, ph, opt_global=.true.)
+   xst1 = xstart(1); xen1 = xend(1)
+   xst2 = xstart(2); xen2 = xend(2)
+   xst3 = xstart(3); xen3 = xend(3)
+   ! initilise input
+   do k = xst3, xen3
+      do j = xst2, xen2
+         do i = xst1, xen1
+            dr = real(i, mytype) / real(nx, mytype) * real(j, mytype) &
+                 / real(ny, mytype) * real(k, mytype) / real(nz, mytype)
+            di = dr
+            in(i, j, k) = cmplx(dr, di, mytype)
+         end do
+      end do
+   end do
+
+   !$acc data copyin(in) copy(out)
+   ! First iterations out of the counting loop
+   t1 = MPI_WTIME()
+   call decomp_2d_fft_3d(in, out, DECOMP_2D_FFT_FORWARD)
+   t2 = MPI_WTIME() - t1
+   t3 = MPI_WTIME()
+   call decomp_2d_fft_3d(out, in, DECOMP_2D_FFT_BACKWARD)
+   t4 = MPI_WTIME() - t3
+   call MPI_ALLREDUCE(t2, t1, 1, MPI_DOUBLE_PRECISION, MPI_SUM, &
+                      MPI_COMM_WORLD, ierror)
+   t1 = t1 / dble(nproc)
+   call MPI_ALLREDUCE(t4, t3, 1, MPI_DOUBLE_PRECISION, MPI_SUM, &
+                      MPI_COMM_WORLD, ierror)
+   t3 = t3 / dble(nproc)
+   if (nrank == 0) then
+      write (*, *) '===== c2c interface ====='
+      write (*, *) 'First iteration with dedicated timer'
+      write (*, *) '     time (sec): ', t1, t3
+      write (*, *) ''
+   end if
+   ! Init the time
+   t2 = 0.d0
+   t4 = 0.d0
+   !$acc kernels
+   in = in / (real(nx, mytype) * real(ny, mytype) * real(nz, mytype))
+   !$acc end kernels
+   do m = 1, ntest
+
+      ! forward FFT
+      t1 = MPI_WTIME()
+      call decomp_2d_fft_3d(in, out, DECOMP_2D_FFT_FORWARD)
+      t2 = t2 + MPI_WTIME() - t1
+
+      ! inverse FFT
+      t3 = MPI_WTIME()
+      call decomp_2d_fft_3d(out, in, DECOMP_2D_FFT_BACKWARD)
+      t4 = t4 + MPI_WTIME() - t3
+
+      ! normalisation - note 2DECOMP&FFT doesn't normalise
+      !$acc kernels
+      in = in / (real(nx, mytype) * real(ny, mytype) * real(nz, mytype))
+      !$acc end kernels
+
+   end do
+#if defined(_GPU)
+   ierror = cudaDeviceSynchronize()
+#endif
+
+   call MPI_ALLREDUCE(t2, t1, 1, MPI_DOUBLE_PRECISION, MPI_SUM, &
+                      MPI_COMM_WORLD, ierror)
+   t1 = t1 / dble(nproc) / dble(ntest)
+   call MPI_ALLREDUCE(t4, t3, 1, MPI_DOUBLE_PRECISION, MPI_SUM, &
+                      MPI_COMM_WORLD, ierror)
+   t3 = t3 / dble(nproc) / dble(ntest)
+
+   ! checking accuracy
+   error = 0._mytype
+   !$acc parallel loop default(present) reduction(+:error)
+   do k = xst3, xen3
+      do j = xst2, xen2
+         do i = xst1, xen1
+            dr = real(i, mytype) / real(nx, mytype) * real(j, mytype) &
+                 / real(ny, mytype) * real(k, mytype) / real(nz, mytype)
+            di = dr
+            dr = dr - real(in(i, j, k), mytype)
+            di = di - aimag(in(i, j, k))
+            error = error + sqrt(dr * dr + di * di)
+         end do
+      end do
+   end do
+   !$acc end loop
+   call MPI_ALLREDUCE(error, err_all, 1, real_type, MPI_SUM, MPI_COMM_WORLD, ierror)
+   err_all = err_all / (real(nx, mytype) * real(ny, mytype) * real(nz, mytype))
+
+   if (err_all > error_precision) then
+      if (nrank == 0) write (*, *) 'error / mesh point: ', err_all
+      call decomp_2d_abort(1, "error for the FFT too large")
+   end if
+
+   if (nrank == 0) then
+      write (*, *) 'error / mesh point: ', err_all
+      write (*, *) 'Avg time (sec): ', t1, t3
+      n1 = real(nx) * real(ny) * real(nz)
+      n1 = n1**(1.d0 / 3.d0)
+      ! 5n*log(n) flops per 1D FFT of size n using Cooley-Tukey algorithm
+      flops = 5.d0 * n1 * log(n1) / log(2.d0)
+      ! 3 sets of 1D FFTs for 3 directions, each having n^2 1D FFTs
+      flops = flops * 3.d0 * n1**2
+      flops = 2.d0 * flops / (t1 + t3)
+      write (*, *) 'GFLOPS : ', flops / 1000.d0**3
+      write (*, *) '   '
+      write (*, *) 'fft_c2c_x completed '
+      write (*, *) '   '
+   end if
+   !$acc end data
+
+   deallocate (in, out)
+   nullify (ph)
+   call decomp_2d_fft_finalize
+   call decomp_2d_finalize
+   call MPI_FINALIZE(ierror)
+
+end program fft_c2c_x
+
diff --git a/examples/fft_physical_x/fft_grid_x.f90 b/examples/fft_physical_x/fft_grid_x.f90
new file mode 100644
index 00000000..15c1431c
--- /dev/null
+++ b/examples/fft_physical_x/fft_grid_x.f90
@@ -0,0 +1,221 @@
+!! SPDX-License-Identifier: BSD-3-Clause
+program fft_physical_x
+
+   use decomp_2d
+   use decomp_2d_fft
+   use decomp_2d_constants
+   use decomp_2d_mpi
+   use MPI
+#if defined(_GPU)
+   use cudafor
+   use cufft
+   use openacc
+#endif
+
+   implicit none
+
+   integer, parameter :: nx_base = 17, ny_base = 13, nz_base = 11
+   integer :: nx, ny, nz
+   integer :: p_row = 0, p_col = 0
+   integer :: resize_domain
+   integer :: nranks_tot
+   integer :: nargin, arg, FNLength, status, DecInd
+   character(len=80) :: InputFN
+
+   integer :: ntest = 10  ! repeat test this times
+
+   type(decomp_info), pointer :: ph => null()
+   complex(mytype), allocatable, dimension(:, :, :) :: in, out
+
+   real(mytype) :: dr, di, error, err_all
+   integer :: ierror, i, j, k, m
+   integer :: xst1, xst2, xst3
+   integer :: xen1, xen2, xen3
+   double precision :: n1, flops, t1, t2, t3, t4
+#ifdef DOUBLE_PREC
+   real(mytype), parameter :: error_precision = 1.e-12_mytype
+#else
+   real(mytype), parameter :: error_precision = 5.e-6_mytype
+#endif
+
+   call MPI_INIT(ierror)
+   ! To resize the domain we need to know global number of ranks
+   ! This operation is also done as part of decomp_2d_init
+   call MPI_COMM_SIZE(MPI_COMM_WORLD, nranks_tot, ierror)
+   resize_domain = int(nranks_tot / 4) + 1
+   nx = nx_base * resize_domain
+   ny = ny_base * resize_domain
+   nz = nz_base * resize_domain
+   ! Now we can check if user put some inputs
+   ! Handle input file like a boss -- GD
+   nargin = command_argument_count()
+   if ((nargin == 0) .or. (nargin == 2) .or. (nargin == 5) .or. (nargin == 6)) then
+      do arg = 1, nargin
+         call get_command_argument(arg, InputFN, FNLength, status)
+         read (InputFN, *, iostat=status) DecInd
+         if (arg == 1) then
+            p_row = DecInd
+         elseif (arg == 2) then
+            p_col = DecInd
+         elseif (arg == 3) then
+            nx = DecInd
+         elseif (arg == 4) then
+            ny = DecInd
+         elseif (arg == 5) then
+            nz = DecInd
+         elseif (arg == 6) then
+            ntest = DecInd
+         end if
+      end do
+   else
+      ! nrank not yet computed we need to avoid write
+      ! for every rank
+      call MPI_COMM_RANK(MPI_COMM_WORLD, nrank, ierror)
+      if (nrank == 0) then
+         print *, "This Test takes no inputs or 2 inputs as"
+         print *, "  1) p_row (default=0)"
+         print *, "  2) p_col (default=0)"
+         print *, "or 5-6 inputs as"
+         print *, "  1) p_row (default=0)"
+         print *, "  2) p_col (default=0)"
+         print *, "  3) nx "
+         print *, "  4) ny "
+         print *, "  5) nz "
+         print *, "  6) n iterations (optional)"
+         print *, "Number of inputs is not correct and the defult settings"
+         print *, "will be used"
+      end if
+   end if
+   call decomp_2d_init(nx + 1, ny + 1, nz + 1, p_row, p_col)
+
+   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+   ! Test the c2c interface
+   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+   call decomp_2d_fft_init(PHYSICAL_IN_X, nx, ny, nz) ! force the default x pencil
+   ph => decomp_2d_fft_get_ph()
+   !  input is X-pencil data
+   ! output is Z-pencil data
+   call alloc_x(in, ph, opt_global=.true.)
+   call alloc_z(out, ph, opt_global=.true.)
+   ! Convert pointers to loops start/end to scalar
+   ! This is define loop on GPUs
+   xst1 = ph%xst(1); xen1 = ph%xen(1); 
+   xst2 = ph%xst(2); xen2 = ph%xen(2); 
+   xst3 = ph%xst(3); xen3 = ph%xen(3); 
+   ! initilise input
+   do k = xst3, xen3
+      do j = xst2, xen2
+         do i = xst1, xen1
+            dr = real(i, mytype) / real(nx, mytype) * real(j, mytype) &
+                 / real(ny, mytype) * real(k, mytype) / real(nz, mytype)
+            di = dr
+            in(i, j, k) = cmplx(dr, di, mytype)
+         end do
+      end do
+   end do
+
+   !$acc data copyin(in) copy(out)
+   ! First iterations out of the counting loop
+   t1 = MPI_WTIME()
+   call decomp_2d_fft_3d(in, out, DECOMP_2D_FFT_FORWARD)
+   t2 = MPI_WTIME() - t1
+   t3 = MPI_WTIME()
+   call decomp_2d_fft_3d(out, in, DECOMP_2D_FFT_BACKWARD)
+   t4 = MPI_WTIME() - t3
+   call MPI_ALLREDUCE(t2, t1, 1, MPI_DOUBLE_PRECISION, MPI_SUM, &
+                      MPI_COMM_WORLD, ierror)
+   t1 = t1 / dble(nproc)
+   call MPI_ALLREDUCE(t4, t3, 1, MPI_DOUBLE_PRECISION, MPI_SUM, &
+                      MPI_COMM_WORLD, ierror)
+   t3 = t3 / dble(nproc)
+   if (nrank == 0) then
+      write (*, *) '===== c2c interface ====='
+      write (*, *) 'First iteration with dedicated timer'
+      write (*, *) '     time (sec): ', t1, t3
+      write (*, *) ''
+   end if
+   ! Init the time
+   t2 = 0.d0
+   t4 = 0.d0
+   !$acc kernels
+   in = in / (real(nx, mytype) * real(ny, mytype) * real(nz, mytype))
+   !$acc end kernels
+   do m = 1, ntest
+
+      ! forward FFT
+      t1 = MPI_WTIME()
+      call decomp_2d_fft_3d(in, out, DECOMP_2D_FFT_FORWARD)
+      t2 = t2 + MPI_WTIME() - t1
+
+      ! inverse FFT
+      t3 = MPI_WTIME()
+      call decomp_2d_fft_3d(out, in, DECOMP_2D_FFT_BACKWARD)
+      t4 = t4 + MPI_WTIME() - t3
+
+      ! normalisation - note 2DECOMP&FFT doesn't normalise
+      !$acc kernels
+      in = in / (real(nx, mytype) * real(ny, mytype) * real(nz, mytype))
+      !$acc end kernels
+
+   end do
+#if defined(_GPU)
+   ierror = cudaDeviceSynchronize()
+#endif
+
+   call MPI_ALLREDUCE(t2, t1, 1, MPI_DOUBLE_PRECISION, MPI_SUM, &
+                      MPI_COMM_WORLD, ierror)
+   t1 = t1 / dble(nproc) / dble(ntest)
+   call MPI_ALLREDUCE(t4, t3, 1, MPI_DOUBLE_PRECISION, MPI_SUM, &
+                      MPI_COMM_WORLD, ierror)
+   t3 = t3 / dble(nproc) / dble(ntest)
+
+   ! checking accuracy
+   error = 0._mytype
+   !$acc parallel loop default(present) reduction(+:error)
+   do k = xst3, xen3
+      do j = xst2, xen2
+         do i = xst1, xen1
+            dr = real(i, mytype) / real(nx, mytype) * real(j, mytype) &
+                 / real(ny, mytype) * real(k, mytype) / real(nz, mytype)
+            di = dr
+            dr = dr - real(in(i, j, k), mytype)
+            di = di - aimag(in(i, j, k))
+            error = error + sqrt(dr * dr + di * di)
+         end do
+      end do
+   end do
+   !$acc end loop
+   call MPI_ALLREDUCE(error, err_all, 1, real_type, MPI_SUM, MPI_COMM_WORLD, ierror)
+   err_all = err_all / (real(nx, mytype) * real(ny, mytype) * real(nz, mytype))
+
+   if (err_all > error_precision) then
+      if (nrank == 0) write (*, *) 'error / mesh point: ', err_all
+      call decomp_2d_abort(1, "error for the FFT too large")
+   end if
+
+   if (nrank == 0) then
+      write (*, *) 'error / mesh point: ', err_all
+      write (*, *) 'Avg time (sec): ', t1, t3
+      n1 = real(nx) * real(ny) * real(nz)
+      n1 = n1**(1.d0 / 3.d0)
+      ! 5n*log(n) flops per 1D FFT of size n using Cooley-Tukey algorithm
+      flops = 5.d0 * n1 * log(n1) / log(2.d0)
+      ! 3 sets of 1D FFTs for 3 directions, each having n^2 1D FFTs
+      flops = flops * 3.d0 * n1**2
+      flops = 2.d0 * flops / (t1 + t3)
+      write (*, *) 'GFLOPS : ', flops / 1000.d0**3
+      write (*, *) '   '
+      write (*, *) 'fft_physical_x completed '
+      write (*, *) '   '
+   end if
+   !$acc end data
+
+   deallocate (in, out)
+   nullify (ph)
+   call decomp_2d_fft_finalize
+   call decomp_2d_finalize
+   call MPI_FINALIZE(ierror)
+
+end program fft_physical_x
+
diff --git a/examples/fft_physical_x/fft_physical_x.f90 b/examples/fft_physical_x/fft_physical_x.f90
deleted file mode 100644
index 9a999e95..00000000
--- a/examples/fft_physical_x/fft_physical_x.f90
+++ /dev/null
@@ -1,207 +0,0 @@
-program fft_physical_x
-
-  use decomp_2d
-  use decomp_2d_fft
-  use MPI
-#if defined(_GPU) 
-  use cudafor
-  use cufft
-  use openacc 
-#endif
-
-  implicit none
-  
-  integer, parameter :: nx_base=17, ny_base=13, nz_base=11
-  integer :: nx, ny, nz
-  integer :: p_row=0, p_col=0
-  integer :: resize_domain
-  integer :: nranks_tot
-  
-  integer, parameter :: ntest = 10  ! repeat test this times
-  
-  complex(mytype), allocatable, dimension(:,:,:) :: in, out
-  real(mytype), allocatable, dimension(:,:,:) :: in_r
-  
-  integer, dimension(3) :: fft_start, fft_end, fft_size
-  
-  real(mytype) :: dr,di, error, err_all, n1,flops
-  integer :: ierror, i,j,k,m
-  real(mytype) :: t1, t2, t3 ,t4
-  
-  call MPI_INIT(ierror)
-  ! To resize the domain we need to know global number of ranks
-  ! This operation is also done as part of decomp_2d_init
-  call MPI_COMM_SIZE(MPI_COMM_WORLD, nranks_tot, ierror)
-  resize_domain = int(nranks_tot/4)+1
-  nx = nx_base*resize_domain
-  ny = ny_base*resize_domain
-  nz = nz_base*resize_domain
-  call decomp_2d_init(nx,ny,nz,p_row,p_col)
-
-  !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Test the c2c interface
-  !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-
-  call decomp_2d_fft_init(PHYSICAL_IN_X) ! force the default x pencil
-
-  !  input is X-pencil data
-  ! output is Z-pencil data
-  allocate (in(xstart(1):xend(1),xstart(2):xend(2),xstart(3):xend(3)))
-  allocate (out(zstart(1):zend(1),zstart(2):zend(2),zstart(3):zend(3)))
-  ! initilise input
-  do k=xstart(3),xend(3)
-     do j=xstart(2),xend(2)
-        do i=xstart(1),xend(1)
-           dr = real(i,mytype)/real(nx,mytype)*real(j,mytype) &
-                /real(ny,mytype)*real(k,mytype)/real(nz,mytype)
-           di = dr
-           in(i,j,k) = cmplx(dr,di,mytype)
-        end do
-     end do
-  end do
-  
-  t2 = 0._mytype
-  t4 = 0._mytype
-  do m=1,ntest
-     
-     ! forward FFT
-     t1 = MPI_WTIME()
-     call decomp_2d_fft_3d(in, out, DECOMP_2D_FFT_FORWARD)
-     t2 = t2 + MPI_WTIME() - t1
-
-     ! inverse FFT
-     t3 = MPI_WTIME()
-     call decomp_2d_fft_3d(out, in, DECOMP_2D_FFT_BACKWARD)
-     t4 = t4 + MPI_WTIME() - t3
-  
-     ! normalisation - note 2DECOMP&FFT doesn't normalise
-     !$acc kernels
-     in = in / real(nx,mytype) / real(ny,mytype) /real(nz,mytype)
-     !$acc end kernels
-
-  end do
-#if defined(_GPU)
-  ierror = cudaDeviceSynchronize()
-#endif
-  
-  call MPI_ALLREDUCE(t2,t1,1,real_type,MPI_SUM, &
-       MPI_COMM_WORLD,ierror)
-  t1 = t1 / real(nproc,mytype)
-  call MPI_ALLREDUCE(t4,t3,1,real_type,MPI_SUM, &
-       MPI_COMM_WORLD,ierror)
-  t3 = t3 / real(nproc,mytype)
-  
-  ! checking accuracy
-  error = 0._mytype
-  do k=xstart(3),xend(3)
-     do j=xstart(2),xend(2)
-        do i=xstart(1),xend(1)
-           dr = real(i,mytype)/real(nx,mytype)*real(j,mytype) &
-                /real(ny,mytype)*real(k,mytype)/real(nz,mytype)
-           di = dr
-           dr = dr - real(in(i,j,k),mytype)
-           di = di - aimag(in(i,j,k))
-           error = error + sqrt(dr*dr + di*di)
-        end do
-     end do
-  end do
-  call MPI_ALLREDUCE(error,err_all,1,real_type,MPI_SUM,MPI_COMM_WORLD,ierror)
-  err_all = err_all / real(nx,mytype) / real(ny,mytype) / real(nz,mytype)
-
-  if (nrank==0) then
-     write(*,*) '===== c2c interface ====='
-     write(*,*) 'error / mesh point: ', err_all 
-     write(*,*) 'time (sec): ', t1,t3
-     n1 = real(nx,mytype) * real(ny,mytype) * real(nz,mytype)
-     n1 = n1 ** (1._mytype/3._mytype)
-     ! 5n*log(n) flops per 1D FFT of size n using Cooley-Tukey algorithm
-     flops = 5._mytype * n1 * log(n1) / log(2.0_mytype)
-     ! 3 sets of 1D FFTs for 3 directions, each having n^2 1D FFTs
-     flops = flops * 3._mytype * n1**2  
-     flops = 2._mytype * flops / ((t1+t3)/real(NTEST,mytype))
-     write(*,*) 'GFLOPS : ', flops / 1000._mytype**3
-  end if
-  
-  deallocate(in,out)
-  call decomp_2d_fft_finalize
-
-
-  !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Test the r2c/c2r interface
-  !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  call decomp_2d_fft_init(PHYSICAL_IN_X)
-  
-  allocate (in_r(xstart(1):xend(1),xstart(2):xend(2),xstart(3):xend(3)))
-  call decomp_2d_fft_get_size(fft_start,fft_end,fft_size)
-  allocate (out(fft_start(1):fft_end(1), &
-                fft_start(2):fft_end(2), &
-                fft_start(3):fft_end(3)))
-  
-  ! initilise input
-  do k=xstart(3),xend(3)
-     do j=xstart(2),xend(2)
-        do i=xstart(1),xend(1)
-           in_r(i,j,k) = real(i,mytype)/real(nx,mytype)*real(j,mytype) &
-                /real(ny,mytype)*real(k,mytype)/real(nz,mytype)
-        end do
-     end do
-  end do
-  
-  t2 = 0._mytype
-  t4 = 0._mytype
-  do m=1,ntest
-  
-     ! 3D r2c FFT
-     t1 = MPI_WTIME()
-     call decomp_2d_fft_3d(in_r, out)
-     t2 = t2 + MPI_WTIME() - t1
-  
-     ! 3D inverse FFT
-     t3 = MPI_WTIME()
-     call decomp_2d_fft_3d(out, in_r)
-     t4 = t4 + MPI_WTIME() - t3
-  
-     ! normalisation - note 2DECOMP&FFT doesn't normalise
-     !$acc kernels
-     in_r = in_r / (real(nx,mytype)*real(ny,mytype)*real(nz,mytype))
-     !$acc end kernels
-
-  end do
-#if defined(_GPU)
-  ierror = cudaDeviceSynchronize()
-#endif 
-
-  call MPI_ALLREDUCE(t2,t1,1,real_type,MPI_SUM, &
-       MPI_COMM_WORLD,ierror)
-  t1 = t1 / real(nproc,mytype)
-  call MPI_ALLREDUCE(t4,t3,1,real_type,MPI_SUM, &
-       MPI_COMM_WORLD,ierror)
-  t3 = t3 / real(nproc,mytype)
-  
-  ! checking accuracy
-  error = 0._mytype
-  do k=xstart(3),xend(3)
-     do j=xstart(2),xend(2)
-        do i=xstart(1),xend(1)
-           dr = real(i,mytype)/real(nx,mytype)*real(j,mytype) &
-                /real(ny,mytype)*real(k,mytype)/real(nz,mytype)
-           error = error + abs(in_r(i,j,k)-dr)
-        end do
-     end do
-  end do
-  call MPI_ALLREDUCE(error,err_all,1,real_type,MPI_SUM,MPI_COMM_WORLD,ierror)
-  err_all = err_all / real(nx,mytype) / real(ny,mytype) / real(nz,mytype)
-  
-  if (nrank==0) then
-     write(*,*) '===== r2c/c2r interface ====='
-     write(*,*) 'error / mesh point: ', err_all
-     write(*,*) 'time (sec): ', t1,t3
-  end if
-  
-  deallocate(in_r,out)
-  call decomp_2d_fft_finalize
-  call decomp_2d_finalize
-  call MPI_FINALIZE(ierror)
-
-end program fft_physical_x
-
diff --git a/examples/fft_physical_x/fft_r2c_x.f90 b/examples/fft_physical_x/fft_r2c_x.f90
new file mode 100644
index 00000000..8866b835
--- /dev/null
+++ b/examples/fft_physical_x/fft_r2c_x.f90
@@ -0,0 +1,211 @@
+!! SPDX-License-Identifier: BSD-3-Clause
+program fft_r2c_x
+
+   use decomp_2d
+   use decomp_2d_fft
+   use decomp_2d_constants
+   use decomp_2d_mpi
+   use MPI
+#if defined(_GPU)
+   use cudafor
+   use cufft
+   use openacc
+#endif
+
+   implicit none
+
+   integer, parameter :: nx_base = 17, ny_base = 13, nz_base = 11
+   integer :: nx, ny, nz
+   integer :: p_row = 0, p_col = 0
+   integer :: resize_domain
+   integer :: nranks_tot
+   integer :: nargin, arg, FNLength, status, DecInd
+   character(len=80) :: InputFN
+
+   integer :: ntest = 10  ! repeat test this times
+
+   type(decomp_info), pointer :: ph => null(), sp => null()
+   complex(mytype), allocatable, dimension(:, :, :) :: out
+   real(mytype), allocatable, dimension(:, :, :) :: in_r
+
+   real(mytype) :: dr, error, err_all
+   integer :: ierror, i, j, k, m
+   integer :: xst1, xst2, xst3
+   integer :: xen1, xen2, xen3
+   double precision :: t1, t2, t3, t4
+#ifdef DOUBLE_PREC
+   real(mytype), parameter :: error_precision = 1.e-12_mytype
+#else
+   real(mytype), parameter :: error_precision = 5.e-6_mytype
+#endif
+
+   call MPI_INIT(ierror)
+   ! To resize the domain we need to know global number of ranks
+   ! This operation is also done as part of decomp_2d_init
+   call MPI_COMM_SIZE(MPI_COMM_WORLD, nranks_tot, ierror)
+   resize_domain = int(nranks_tot / 4) + 1
+   nx = nx_base * resize_domain
+   ny = ny_base * resize_domain
+   nz = nz_base * resize_domain
+   ! Now we can check if user put some inputs
+   ! Handle input file like a boss -- GD
+   nargin = command_argument_count()
+   if ((nargin == 0) .or. (nargin == 2) .or. (nargin == 5) .or. (nargin == 6)) then
+      do arg = 1, nargin
+         call get_command_argument(arg, InputFN, FNLength, status)
+         read (InputFN, *, iostat=status) DecInd
+         if (arg == 1) then
+            p_row = DecInd
+         elseif (arg == 2) then
+            p_col = DecInd
+         elseif (arg == 3) then
+            nx = DecInd
+         elseif (arg == 4) then
+            ny = DecInd
+         elseif (arg == 5) then
+            nz = DecInd
+         elseif (arg == 6) then
+            ntest = DecInd
+         end if
+      end do
+   else
+      ! nrank not yet computed we need to avoid write
+      ! for every rank
+      call MPI_COMM_RANK(MPI_COMM_WORLD, nrank, ierror)
+      if (nrank == 0) then
+         print *, "This Test takes no inputs or 2 inputs as"
+         print *, "  1) p_row (default=0)"
+         print *, "  2) p_col (default=0)"
+         print *, "or 5-6 inputs as"
+         print *, "  1) p_row (default=0)"
+         print *, "  2) p_col (default=0)"
+         print *, "  3) nx "
+         print *, "  4) ny "
+         print *, "  5) nz "
+         print *, "  6) n iterations (optional)"
+         print *, "Number of inputs is not correct and the defult settings"
+         print *, "will be used"
+      end if
+   end if
+
+   call decomp_2d_init(nx, ny, nz, p_row, p_col)
+
+  !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+   ! Test the r2c/c2r interface
+  !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+   call decomp_2d_fft_init(PHYSICAL_IN_X)
+
+   ph => decomp_2d_fft_get_ph()
+   sp => decomp_2d_fft_get_sp()
+   !  input is X-pencil data
+   ! output is Z-pencil data
+   call alloc_x(in_r, ph, opt_global=.true.)
+   call alloc_z(out, sp, opt_global=.true.)
+   xst1 = xstart(1); xen1 = xend(1)
+   xst2 = xstart(2); xen2 = xend(2)
+   xst3 = xstart(3); xen3 = xend(3)
+   ! initilise input
+   do k = xst3, xen3
+      do j = xst2, xen2
+         do i = xst1, xen1
+            in_r(i, j, k) = real(i, mytype) / real(nx, mytype) * real(j, mytype) &
+                            / real(ny, mytype) * real(k, mytype) / real(nz, mytype)
+         end do
+      end do
+   end do
+
+   !$acc data copyin(in_r) copy(out)
+   ! First iterations out of the counting loop
+   t1 = MPI_WTIME()
+   call decomp_2d_fft_3d(in_r, out)
+   t2 = MPI_WTIME() - t1
+   t3 = MPI_WTIME()
+   call decomp_2d_fft_3d(out, in_r)
+   t4 = MPI_WTIME() - t3
+   call MPI_ALLREDUCE(t2, t1, 1, MPI_DOUBLE_PRECISION, MPI_SUM, &
+                      MPI_COMM_WORLD, ierror)
+   t1 = t1 / dble(nproc)
+   call MPI_ALLREDUCE(t4, t3, 1, MPI_DOUBLE_PRECISION, MPI_SUM, &
+                      MPI_COMM_WORLD, ierror)
+   t3 = t3 / dble(nproc)
+   if (nrank == 0) then
+      write (*, *) '===== r2c/c2r interface ====='
+      write (*, *) 'First iteration with dedicated timer'
+      write (*, *) '     time (sec): ', t1, t3
+      write (*, *) ''
+   end if
+   ! Init the time
+   t2 = 0.d0
+   t4 = 0.d0
+   !$acc kernels
+   in_r = in_r / (real(nx, mytype) * real(ny, mytype) * real(nz, mytype))
+   !$acc end kernels
+   do m = 1, ntest
+
+      ! 3D r2c FFT
+      t1 = MPI_WTIME()
+      call decomp_2d_fft_3d(in_r, out)
+      t2 = t2 + MPI_WTIME() - t1
+
+      ! 3D inverse FFT
+      t3 = MPI_WTIME()
+      call decomp_2d_fft_3d(out, in_r)
+      t4 = t4 + MPI_WTIME() - t3
+
+      ! normalisation - note 2DECOMP&FFT doesn't normalise
+      !$acc kernels
+      in_r = in_r / (real(nx, mytype) * real(ny, mytype) * real(nz, mytype))
+      !$acc end kernels
+
+   end do
+#if defined(_GPU)
+   ierror = cudaDeviceSynchronize()
+#endif
+
+   call MPI_ALLREDUCE(t2, t1, 1, MPI_DOUBLE_PRECISION, MPI_SUM, &
+                      MPI_COMM_WORLD, ierror)
+   t1 = t1 / dble(nproc) / dble(ntest)
+   call MPI_ALLREDUCE(t4, t3, 1, MPI_DOUBLE_PRECISION, MPI_SUM, &
+                      MPI_COMM_WORLD, ierror)
+   t3 = t3 / dble(nproc) / dble(ntest)
+
+   ! checking accuracy
+   error = 0._mytype
+   !$acc parallel loop default(present) reduction(+:error)
+   do k = xst3, xen3
+      do j = xst2, xen2
+         do i = xst1, xen1
+            dr = real(i, mytype) / real(nx, mytype) * real(j, mytype) &
+                 / real(ny, mytype) * real(k, mytype) / real(nz, mytype)
+            error = error + abs(in_r(i, j, k) - dr)
+         end do
+      end do
+   end do
+   !$acc end loop
+
+   call MPI_ALLREDUCE(error, err_all, 1, real_type, MPI_SUM, MPI_COMM_WORLD, ierror)
+   err_all = err_all / (real(nx, mytype) * real(ny, mytype) * real(nz, mytype))
+
+   if (err_all > error_precision) then
+      if (nrank == 0) write (*, *) 'error / mesh point: ', err_all
+      call decomp_2d_abort(1, "error for the FFT too large")
+   end if
+
+   if (nrank == 0) then
+      write (*, *) 'error / mesh point: ', err_all
+      write (*, *) 'time (sec): ', t1, t3
+      write (*, *) '   '
+      write (*, *) 'fft_r2c_x completed '
+      write (*, *) '   '
+   end if
+   !$acc end data
+
+   deallocate (in_r, out)
+   nullify (ph)
+   nullify (sp)
+   call decomp_2d_fft_finalize
+   call decomp_2d_finalize
+   call MPI_FINALIZE(ierror)
+
+end program fft_r2c_x
+
diff --git a/examples/fft_physical_z/.gitignore b/examples/fft_physical_z/.gitignore
index 26337569..229609ed 100644
--- a/examples/fft_physical_z/.gitignore
+++ b/examples/fft_physical_z/.gitignore
@@ -1 +1,3 @@
 fft_physical_z
+fft_c2c_z
+fft_r2c_z
diff --git a/examples/fft_physical_z/CMakeLists.txt b/examples/fft_physical_z/CMakeLists.txt
new file mode 100644
index 00000000..05e79264
--- /dev/null
+++ b/examples/fft_physical_z/CMakeLists.txt
@@ -0,0 +1,31 @@
+file(GLOB files_fft_c2c fft_c2c_z.f90)
+file(GLOB files_fft_r2c fft_r2c_z.f90)
+
+include_directories(${CMAKE_SOURCE_DIR}/src)
+
+add_executable(fft_c2c_z ${files_fft_c2c})
+add_executable(fft_r2c_z ${files_fft_r2c})
+
+target_link_libraries(fft_c2c_z PRIVATE decomp2d)
+target_link_libraries(fft_r2c_z PRIVATE decomp2d)
+
+# Run the test(s)
+set(run_dir "${test_dir}/fft_c2c_z")
+message(STATUS "Example dir ${run_dir}")
+file(MAKE_DIRECTORY ${run_dir})
+if (BUILD_TARGET MATCHES "gpu")
+  file(COPY bind.sh DESTINATION ${run_dir})
+  add_test(NAME fft_c2c_z COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} ./bind.sh $<TARGET_FILE:fft_c2c_z> ${TEST_ARGUMENTS} WORKING_DIRECTORY ${run_dir})
+else ()  
+  add_test(NAME fft_c2c_z COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} $<TARGET_FILE:fft_c2c_z> ${TEST_ARGUMENTS} WORKING_DIRECTORY ${run_dir})
+endif ()
+# 
+set(run_dir "${test_dir}/fft_r2c_z")
+message(STATUS "Example dir ${run_dir}")
+file(MAKE_DIRECTORY ${run_dir})
+if (BUILD_TARGET MATCHES "gpu")
+  file(COPY bind.sh DESTINATION ${run_dir})
+  add_test(NAME fft_r2c_z COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} ./bind.sh $<TARGET_FILE:fft_r2c_z> ${TEST_ARGUMENTS} WORKING_DIRECTORY ${run_dir})
+else ()  
+  add_test(NAME fft_r2c_z COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} $<TARGET_FILE:fft_r2c_z> ${TEST_ARGUMENTS} WORKING_DIRECTORY ${run_dir})
+endif ()
diff --git a/examples/fft_physical_z/Makefile b/examples/fft_physical_z/Makefile
deleted file mode 100644
index 396ff848..00000000
--- a/examples/fft_physical_z/Makefile
+++ /dev/null
@@ -1,31 +0,0 @@
-#include ../../src/Makefile.inc
-
-#INCLUDE = -I../../include
-FFLAGS := $(subst $(MODFLAG),$(MODFLAG)../../,$(FFLAGS))
-FFLAGS := $(patsubst -I%,-I../../%,$(FFLAGS))
-LIBS = -L../../ -l$(LIBDECOMP) $(LIBFFT) $(LFLAGS)
-
-OBJ = fft_physical_z.o
-
-NP ?= 1
-MPIRUN ?= mpirun
-
-fft_physical_z: $(OBJ)
-	$(FC) $(FFLAGS) $(OPT) $(DEFS) $(INC) -o $@ $(OBJ) $(LIBS)
-
-ifeq ($(PARAMOD),gpu)
-check:
-	$(MPIRUN) -n $(NP) ./bind.sh ./fft_physical_z
-else
-check:
-	$(MPIRUN) -n $(NP) ./fft_physical_z
-endif
-
-mem_leak:
-	valgrind --leak-check=full --show-leak-kinds=all $(MPIRUN) -n 1 ./fft_physical_z 1 1
-
-clean:
-	rm -f *.o fft_physical_z *.log
-
-%.o : %.f90
-	$(FC) $(FFLAGS) $(OPT) $(DEFS) $(INC) -c $< -o $@
diff --git a/examples/fft_physical_z/README b/examples/fft_physical_z/README
deleted file mode 100644
index e726d909..00000000
--- a/examples/fft_physical_z/README
+++ /dev/null
@@ -1,13 +0,0 @@
-fft_physical_z
-------
-
-This program can be used to test the fft trasform using Z as starting physical dimension. 
-Both c2c and r2c/c2r transforms are testes.
-The results should recover the input data up to machine accuracy
-after a forward and a backward transform and proper normalisation.
-The test automatically resize the problem depending on the number of MPI processes in use
-
-What to expect:
-- The timing results 
-- The error reported should be around machine accuracy (~ 10^-6 for single
-  precision and 10^-15 for double)
diff --git a/examples/fft_physical_z/README.md b/examples/fft_physical_z/README.md
new file mode 100644
index 00000000..ec569db3
--- /dev/null
+++ b/examples/fft_physical_z/README.md
@@ -0,0 +1,30 @@
+# Test FFT for Z pencil decomposition 
+
+List of the tests:
+- [fft_c2c_z](fft_c2c_z.f90): Test Complex to Complex FFT transform; 
+- [fft_r2c_x](fft_r2c_z.f90): Test Real to Complex FFT transform;
+
+
+These programs can be used to test the FFT transform using Z-pencils as starting domain decomposition. 
+Both c2c (fft_c2c_z) and r2c/c2r (fft_r2c_z) transforms are tested.
+The results should recover the input data up to machine accuracy
+after a forward and a backward transform and appropriate normalisation.
+The test automatically resize the problem depending on the number of MPI processes in use
+
+What to input: The program takes max 6 inputs as : 
+
+1. p_row [optional]
+1. p_col [optional] 
+1. nx    [optional]
+1. ny    [optional]
+1. nz    [optional]
+1. nt    [optional]
+
+If the decomposition is imposed both (1) and (2) are necessary. 
+If the resolution is imposed (1-5) are necessary
+
+What to expect:
+- The timing results 
+- The error reported should be around machine accuracy (~ 10^-6 for single
+  precision and 10^-15 for double)
+- In case of the GENERIC FFT expect an increase in the order of the error
diff --git a/examples/fft_physical_z/fft_c2c_z.f90 b/examples/fft_physical_z/fft_c2c_z.f90
new file mode 100644
index 00000000..84f6e887
--- /dev/null
+++ b/examples/fft_physical_z/fft_c2c_z.f90
@@ -0,0 +1,219 @@
+!! SPDX-License-Identifier: BSD-3-Clause
+program fft_c2c_z
+
+   use decomp_2d
+   use decomp_2d_fft
+   use decomp_2d_constants
+   use decomp_2d_mpi
+   use MPI
+#if defined(_GPU)
+   use cudafor
+   use cufft
+   use openacc
+#endif
+
+   implicit none
+
+   integer, parameter :: nx_base = 17, ny_base = 13, nz_base = 11
+   integer :: nx, ny, nz
+   integer :: p_row = 0, p_col = 0
+   integer :: resize_domain
+   integer :: nranks_tot
+   integer :: nargin, arg, FNLength, status, DecInd
+   character(len=80) :: InputFN
+
+   integer :: ntest = 10  ! repeat test this times
+
+   type(decomp_info), pointer :: ph => null()
+   complex(mytype), allocatable, dimension(:, :, :) :: in, out
+
+   real(mytype) :: dr, di, error, err_all
+   integer :: ierror, i, j, k, m
+   integer :: zst1, zst2, zst3
+   integer :: zen1, zen2, zen3
+   double precision :: n1, flops, t1, t2, t3, t4
+#ifdef DOUBLE_PREC
+   real(mytype), parameter :: error_precision = 1.e-12_mytype
+#else
+   real(mytype), parameter :: error_precision = 5.e-6_mytype
+#endif
+
+   call MPI_INIT(ierror)
+   ! To resize the domain we need to know global number of ranks
+   ! This operation is also done as part of decomp_2d_init
+   call MPI_COMM_SIZE(MPI_COMM_WORLD, nranks_tot, ierror)
+   resize_domain = int(nranks_tot / 4) + 1
+   nx = nx_base * resize_domain
+   ny = ny_base * resize_domain
+   nz = nz_base * resize_domain
+   ! Now we can check if user put some inputs
+   ! Handle input file like a boss -- GD
+   nargin = command_argument_count()
+   if ((nargin == 0) .or. (nargin == 2) .or. (nargin == 5) .or. (nargin == 6)) then
+      do arg = 1, nargin
+         call get_command_argument(arg, InputFN, FNLength, status)
+         read (InputFN, *, iostat=status) DecInd
+         if (arg == 1) then
+            p_row = DecInd
+         elseif (arg == 2) then
+            p_col = DecInd
+         elseif (arg == 3) then
+            nx = DecInd
+         elseif (arg == 4) then
+            ny = DecInd
+         elseif (arg == 5) then
+            nz = DecInd
+         elseif (arg == 6) then
+            ntest = DecInd
+         end if
+      end do
+   else
+      ! nrank not yet computed we need to avoid write
+      ! for every rank
+      call MPI_COMM_RANK(MPI_COMM_WORLD, nrank, ierror)
+      if (nrank == 0) then
+         print *, "This Test takes no inputs or 2 inputs as"
+         print *, "  1) p_row (default=0)"
+         print *, "  2) p_col (default=0)"
+         print *, "or 5-6 inputs as"
+         print *, "  1) p_row (default=0)"
+         print *, "  2) p_col (default=0)"
+         print *, "  3) nx "
+         print *, "  4) ny "
+         print *, "  5) nz "
+         print *, "  6) n iterations (optional)"
+         print *, "Number of inputs is not correct and the defult settings"
+         print *, "will be used"
+      end if
+   end if
+
+   call decomp_2d_init(nx, ny, nz, p_row, p_col)
+
+   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+   ! Test the c2c interface
+   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+   call decomp_2d_fft_init(PHYSICAL_IN_Z) ! non-default Z-pencil input
+
+   ph => decomp_2d_fft_get_ph()
+   !  input is Z-pencil data
+   ! output is X-pencil data
+   call alloc_z(in, ph, opt_global=.true.)
+   call alloc_x(out, ph, opt_global=.true.)
+   zst1 = zstart(1); zen1 = zend(1)
+   zst2 = zstart(2); zen2 = zend(2)
+   zst3 = zstart(3); zen3 = zend(3)
+
+   ! initilise input
+   do k = zst3, zen3
+      do j = zst2, zen2
+         do i = zst1, zen1
+            dr = real(i, mytype) / real(nx, mytype) * real(j, mytype) &
+                 / real(ny, mytype) * real(k, mytype) / real(nz, mytype)
+            di = dr
+            in(i, j, k) = cmplx(dr, di, mytype)
+         end do
+      end do
+   end do
+
+   !$acc data copyin(in) copy(out)
+   ! First iterations out of the counting loop
+   t1 = MPI_WTIME()
+   call decomp_2d_fft_3d(in, out, DECOMP_2D_FFT_FORWARD)
+   t2 = MPI_WTIME() - t1
+   t3 = MPI_WTIME()
+   call decomp_2d_fft_3d(out, in, DECOMP_2D_FFT_BACKWARD)
+   t4 = MPI_WTIME() - t3
+   call MPI_ALLREDUCE(t2, t1, 1, MPI_DOUBLE_PRECISION, MPI_SUM, &
+                      MPI_COMM_WORLD, ierror)
+   t1 = t1 / dble(nproc)
+   call MPI_ALLREDUCE(t4, t3, 1, MPI_DOUBLE_PRECISION, MPI_SUM, &
+                      MPI_COMM_WORLD, ierror)
+   t3 = t3 / dble(nproc)
+   if (nrank == 0) then
+      write (*, *) '===== c2c interface ====='
+      write (*, *) 'First iteration with dedicated timer'
+      write (*, *) '     time (sec): ', t1, t3
+      write (*, *) ''
+   end if
+   ! Init the time
+   t2 = 0.d0
+   t4 = 0.d0
+   !$acc kernels
+   in = in / (real(nx, mytype) * real(ny, mytype) * real(nz, mytype))
+   !$acc end kernels
+   do m = 1, ntest
+
+      ! forward FFT
+      t1 = MPI_WTIME()
+      call decomp_2d_fft_3d(in, out, DECOMP_2D_FFT_FORWARD)
+      t2 = t2 + MPI_WTIME() - t1
+
+      ! inverse FFT
+      t3 = MPI_WTIME()
+      call decomp_2d_fft_3d(out, in, DECOMP_2D_FFT_BACKWARD)
+      t4 = t4 + MPI_WTIME() - t3
+
+      ! normalisation - note 2DECOMP&FFT doesn't normalise
+      !$acc kernels
+      in = in / (real(nx, mytype) * real(ny, mytype) * real(nz, mytype))
+      !$acc end kernels
+
+   end do
+#if defined(_GPU)
+   ierror = cudaDeviceSynchronize()
+#endif
+
+   call MPI_ALLREDUCE(t2, t1, 1, MPI_DOUBLE_PRECISION, MPI_SUM, &
+                      MPI_COMM_WORLD, ierror)
+   t1 = t1 / dble(nproc) / dble(ntest)
+   call MPI_ALLREDUCE(t4, t3, 1, MPI_DOUBLE_PRECISION, MPI_SUM, &
+                      MPI_COMM_WORLD, ierror)
+   t3 = t3 / dble(nproc) / dble(ntest)
+
+   ! checking accuracy
+   error = 0._mytype
+   !$acc parallel loop default(present) reduction(+:error)
+   do k = zst3, zen3
+      do j = zst2, zen2
+         do i = zst1, zen1
+            dr = real(i, mytype) / real(nx, mytype) * real(j, mytype) &
+                 / real(ny, mytype) * real(k, mytype) / real(nz, mytype)
+            di = dr
+            dr = dr - real(in(i, j, k), mytype)
+            di = di - aimag(in(i, j, k))
+            error = error + sqrt(dr * dr + di * di)
+         end do
+      end do
+   end do
+   !$acc end loop
+   call MPI_ALLREDUCE(error, err_all, 1, real_type, MPI_SUM, MPI_COMM_WORLD, ierror)
+   err_all = err_all / (real(nx, mytype) * real(ny, mytype) * real(nz, mytype))
+
+   if (err_all > error_precision) then
+      if (nrank == 0) write (*, *) 'error / mesh point: ', err_all
+      call decomp_2d_abort(1, "error for the FFT too large")
+   end if
+
+   if (nrank == 0) then
+      write (*, *) 'error / mesh point: ', err_all
+      write (*, *) 'Avg time (sec): ', t1, t3
+      n1 = real(nx) * real(ny) * real(nz)
+      n1 = n1**(1.d0 / 3.d0)
+      ! 5n*log(n) flops per 1D FFT of size n using Cooley-Tukey algorithm
+      flops = 5.d0 * n1 * log(n1) / log(2.d0)
+      ! 3 sets of 1D FFTs for 3 directions, each having n^2 1D FFTs
+      flops = flops * 3.d0 * n1**2
+      flops = 2.d0 * flops / (t1 + t3)
+      write (*, *) 'GFLOPS : ', flops / 1000.d0**3
+      write (*, *) '   '
+      write (*, *) 'fft_c2c_z completed '
+   end if
+   !$acc end data
+
+   deallocate (in, out)
+   nullify (ph)
+   call decomp_2d_fft_finalize
+   call decomp_2d_finalize
+   call MPI_FINALIZE(ierror)
+
+end program fft_c2c_z
diff --git a/examples/fft_physical_z/fft_physical_z.f90 b/examples/fft_physical_z/fft_physical_z.f90
deleted file mode 100644
index 5610b8e9..00000000
--- a/examples/fft_physical_z/fft_physical_z.f90
+++ /dev/null
@@ -1,208 +0,0 @@
-program fft_physical_z
-
-  use decomp_2d
-  use decomp_2d_fft
-  use MPI
-#if defined(_GPU) 
-  use cudafor
-  use cufft
-  use openacc 
-#endif
-
-  implicit none
-  
-  !integer, parameter :: nx_base=4, ny_base=2, nz_base=3
-  integer, parameter :: nx_base=17, ny_base=13, nz_base=11
-  integer :: nx, ny, nz
-  integer :: p_row=0, p_col=0
-  integer :: resize_domain
-  integer :: nranks_tot
-  
-  integer, parameter :: ntest = 10  ! repeat test this times
-  
-  complex(mytype), allocatable, dimension(:,:,:) :: in, out
-  real(mytype), allocatable, dimension(:,:,:) :: in_r
-
-  integer, dimension(3) :: fft_start, fft_end, fft_size
-  
-  real(mytype) :: dr,di, error, err_all, n1,flops
-  integer :: ierror, i,j,k,m
-  real(mytype) :: t1, t2, t3 ,t4
-  
-  call MPI_INIT(ierror)
-  ! To resize the domain we need to know global number of ranks
-  ! This operation is also done as part of decomp_2d_init
-  call MPI_COMM_SIZE(MPI_COMM_WORLD, nranks_tot, ierror)
-  resize_domain = int(nranks_tot/4)+1
-  nx = nx_base*resize_domain
-  ny = ny_base*resize_domain
-  nz = nz_base*resize_domain
-  call decomp_2d_init(nx,ny,nz,p_row,p_col)
-
-  !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Test the c2c interface
-  !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  call decomp_2d_fft_init(PHYSICAL_IN_Z) ! non-default Z-pencil input
-  !  input is Z-pencil data
-  ! output is X-pencil data
-  allocate (in(zstart(1):zend(1),zstart(2):zend(2),zstart(3):zend(3)))
-  allocate (out(xstart(1):xend(1),xstart(2):xend(2),xstart(3):xend(3)))
-  ! initilise input
-  do k=zstart(3),zend(3)
-     do j=zstart(2),zend(2)
-        do i=zstart(1),zend(1)
-           dr = real(i,mytype)/real(nx,mytype)*real(j,mytype) &
-                /real(ny,mytype)*real(k,mytype)/real(nz,mytype)
-           di = dr
-           in(i,j,k) = cmplx(dr,di,mytype)
-        end do
-     end do
-  end do
-  
-  t2 = 0._mytype
-  t4 = 0._mytype
-  do m=1,ntest
-     
-     ! forward FFT
-     t1 = MPI_WTIME()
-     call decomp_2d_fft_3d(in, out, DECOMP_2D_FFT_FORWARD)
-     t2 = t2 + MPI_WTIME() - t1
-
-     ! inverse FFT
-     t3 = MPI_WTIME()
-     call decomp_2d_fft_3d(out, in, DECOMP_2D_FFT_BACKWARD)
-     t4 = t4 + MPI_WTIME() - t3
-  
-     ! normalisation - note 2DECOMP&FFT doesn't normalise
-     !$acc kernels
-     in = in / real(nx,mytype) / real(ny,mytype) /real(nz,mytype)
-     !$acc end kernels
-
-  end do
-#if defined(_GPU)
-  ierror = cudaDeviceSynchronize()
-#endif
-  
-
-  call MPI_ALLREDUCE(t2,t1,1,real_type,MPI_SUM, &
-       MPI_COMM_WORLD,ierror)
-  t1 = t1 / real(nproc,mytype)
-  call MPI_ALLREDUCE(t4,t3,1,real_type,MPI_SUM, &
-       MPI_COMM_WORLD,ierror)
-  t3 = t3 / real(nproc,mytype)
-  
-  ! checking accuracy
-  error = 0._mytype
-  do k=zstart(3),zend(3)
-     do j=zstart(2),zend(2)
-        do i=zstart(1),zend(1)
-           dr = real(i,mytype)/real(nx,mytype)*real(j,mytype) &
-                /real(ny,mytype)*real(k,mytype)/real(nz,mytype)
-           di = dr
-           dr = dr - real(in(i,j,k),mytype)
-           di = di - aimag(in(i,j,k))
-           error = error + sqrt(dr*dr + di*di)
-        end do
-     end do
-  end do
-  call MPI_ALLREDUCE(error,err_all,1,real_type,MPI_SUM,MPI_COMM_WORLD,ierror)
-  err_all = err_all / real(nx,mytype) / real(ny,mytype) / real(nz,mytype)
-
-  if (nrank==0) then
-     write(*,*) '===== c2c interface ====='
-     write(*,*) 'error / mesh point: ', err_all 
-     write(*,*) 'time (sec): ', t1,t3
-     n1 = real(nx,mytype) * real(ny,mytype) * real(nz,mytype)
-     n1 = n1 ** (1._mytype/3._mytype)
-     ! 5n*log(n) flops per 1D FFT of size n using Cooley-Tukey algorithm
-     flops = 5._mytype * n1 * log(n1) / log(2.0_mytype)
-     ! 3 sets of 1D FFTs for 3 directions, each having n^2 1D FFTs
-     flops = flops * 3._mytype * n1**2  
-     flops = 2._mytype * flops / ((t1+t3)/real(NTEST,mytype))
-     write(*,*) 'GFLOPS : ', flops / 1000._mytype**3
-  end if
-  
-  deallocate(in,out)
-  call decomp_2d_fft_finalize
-
-
-  !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Test the r2c/c2r interface
-  !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  call decomp_2d_fft_init(PHYSICAL_IN_Z) ! non-default Z-pencil input
-  
-  allocate (in_r(zstart(1):zend(1),zstart(2):zend(2),zstart(3):zend(3)))
-  call decomp_2d_fft_get_size(fft_start,fft_end,fft_size)
-  allocate (out(fft_start(1):fft_end(1), &
-                fft_start(2):fft_end(2), &
-                fft_start(3):fft_end(3)))
-  
-  ! initilise input
-  do k=zstart(3),zend(3)
-     do j=zstart(2),zend(2)
-        do i=zstart(1),zend(1)
-           in_r(i,j,k) = real(i,mytype)/real(nx,mytype)*real(j,mytype) &
-                /real(ny,mytype)*real(k,mytype)/real(nz,mytype)
-        end do
-     end do
-  end do
-  
-  t2 = 0._mytype
-  t4 = 0._mytype
-  do m=1,ntest
-     
-     ! 3D r2c FFT
-     t1 = MPI_WTIME()
-     call decomp_2d_fft_3d(in_r, out)
-     t2 = t2 + MPI_WTIME() - t1
-  
-     ! 3D inverse FFT
-     t3 = MPI_WTIME()
-     call decomp_2d_fft_3d(out, in_r)
-     t4 = t4 + MPI_WTIME() - t3
-  
-     !$acc kernels
-     in_r = in_r / real(nx,mytype) / real(ny,mytype) /real(nz,mytype)
-     !$acc end kernels     
-
-  end do
-#if defined(_GPU)
-  ierror = cudaDeviceSynchronize()
-#endif 
-
-  call MPI_ALLREDUCE(t2,t1,1,real_type,MPI_SUM, &
-       MPI_COMM_WORLD,ierror)
-  t1 = t1 / real(nproc,mytype)
-  call MPI_ALLREDUCE(t4,t3,1,real_type,MPI_SUM, &
-       MPI_COMM_WORLD,ierror)
-  t3 = t3 / real(nproc,mytype)
-  
-  ! checking accuracy
-  error = 0._mytype
-  do k=zstart(3),zend(3)
-     do j=zstart(2),zend(2)
-        do i=zstart(1),zend(1)
-           dr = real(i,mytype)/real(nx,mytype)*real(j,mytype) &
-                /real(ny,mytype)*real(k,mytype)/real(nz,mytype)
-           error = error + abs(in_r(i,j,k)-dr)
-           !write(*,10) nrank,k,j,i,dr,in_r(i,j,k)
-        end do
-     end do
-  end do
-!10 format('in_r final ', I2,1x,I2,1x,I2,1x,I2,1x,F12.6,1x,F12.6)
-
-  call MPI_ALLREDUCE(error,err_all,1,real_type,MPI_SUM,MPI_COMM_WORLD,ierror)
-  err_all = err_all / real(nx,mytype) / real(ny,mytype) / real(nz,mytype)
-  
-  if (nrank==0) then
-     write(*,*) '===== r2c/c2r interface ====='
-     write(*,*) 'error / mesh point: ', err_all
-     write(*,*) 'time (sec): ', t1,t3
-  end if
-  
-  deallocate(in_r,out)
-  call decomp_2d_fft_finalize
-  call decomp_2d_finalize
-  call MPI_FINALIZE(ierror)
-
-end program fft_physical_z
diff --git a/examples/fft_physical_z/fft_r2c_z.f90 b/examples/fft_physical_z/fft_r2c_z.f90
new file mode 100644
index 00000000..371fe42e
--- /dev/null
+++ b/examples/fft_physical_z/fft_r2c_z.f90
@@ -0,0 +1,211 @@
+!! SPDX-License-Identifier: BSD-3-Clause
+program fft_r2c_z
+
+   use decomp_2d
+   use decomp_2d_fft
+   use decomp_2d_constants
+   use decomp_2d_mpi
+   use MPI
+#if defined(_GPU)
+   use cudafor
+   use cufft
+   use openacc
+#endif
+
+   implicit none
+
+   integer, parameter :: nx_base = 17, ny_base = 13, nz_base = 11
+   integer :: nx, ny, nz
+   integer :: p_row = 0, p_col = 0
+   integer :: resize_domain
+   integer :: nranks_tot
+   integer :: nargin, arg, FNLength, status, DecInd
+   character(len=80) :: InputFN
+
+   integer :: ntest = 10  ! repeat test this times
+
+   type(decomp_info), pointer :: ph => null(), sp => null()
+   complex(mytype), allocatable, dimension(:, :, :) :: out
+   real(mytype), allocatable, dimension(:, :, :) :: in_r
+
+   real(mytype) :: dr, error, err_all
+   integer :: ierror, i, j, k, m
+   integer :: zst1, zst2, zst3
+   integer :: zen1, zen2, zen3
+   double precision :: t1, t2, t3, t4
+#ifdef DOUBLE_PREC
+   real(mytype), parameter :: error_precision = 1.e-12_mytype
+#else
+   real(mytype), parameter :: error_precision = 5.e-6_mytype
+#endif
+
+   call MPI_INIT(ierror)
+   ! To resize the domain we need to know global number of ranks
+   ! This operation is also done as part of decomp_2d_init
+   call MPI_COMM_SIZE(MPI_COMM_WORLD, nranks_tot, ierror)
+   resize_domain = int(nranks_tot / 4) + 1
+   nx = nx_base * resize_domain
+   ny = ny_base * resize_domain
+   nz = nz_base * resize_domain
+   ! Now we can check if user put some inputs
+   ! Handle input file like a boss -- GD
+   nargin = command_argument_count()
+   if ((nargin == 0) .or. (nargin == 2) .or. (nargin == 5) .or. (nargin == 6)) then
+      do arg = 1, nargin
+         call get_command_argument(arg, InputFN, FNLength, status)
+         read (InputFN, *, iostat=status) DecInd
+         if (arg == 1) then
+            p_row = DecInd
+         elseif (arg == 2) then
+            p_col = DecInd
+         elseif (arg == 3) then
+            nx = DecInd
+         elseif (arg == 4) then
+            ny = DecInd
+         elseif (arg == 5) then
+            nz = DecInd
+         elseif (arg == 6) then
+            ntest = DecInd
+         end if
+      end do
+   else
+      ! nrank not yet computed we need to avoid write
+      ! for every rank
+      call MPI_COMM_RANK(MPI_COMM_WORLD, nrank, ierror)
+      if (nrank == 0) then
+         print *, "This Test takes no inputs or 2 inputs as"
+         print *, "  1) p_row (default=0)"
+         print *, "  2) p_col (default=0)"
+         print *, "or 5-6 inputs as"
+         print *, "  1) p_row (default=0)"
+         print *, "  2) p_col (default=0)"
+         print *, "  3) nx "
+         print *, "  4) ny "
+         print *, "  5) nz "
+         print *, "  6) n iterations (optional)"
+         print *, "Number of inputs is not correct and the defult settings"
+         print *, "will be used"
+      end if
+   end if
+
+   call decomp_2d_init(nx, ny, nz, p_row, p_col)
+
+   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+   ! Test the r2c/c2r interface
+   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+   call decomp_2d_fft_init(PHYSICAL_IN_Z) ! non-default Z-pencil input
+
+   ph => decomp_2d_fft_get_ph()
+   sp => decomp_2d_fft_get_sp()
+   !  input is Z-pencil data
+   ! output is X-pencil data
+   call alloc_z(in_r, ph, opt_global=.true.)
+   call alloc_x(out, sp, opt_global=.true.)
+   zst1 = zstart(1); zen1 = zend(1)
+   zst2 = zstart(2); zen2 = zend(2)
+   zst3 = zstart(3); zen3 = zend(3)
+
+   ! initilise input
+   do k = zst3, zen3
+      do j = zst2, zen2
+         do i = zst1, zen1
+            in_r(i, j, k) = real(i, mytype) / real(nx, mytype) * real(j, mytype) &
+                            / real(ny, mytype) * real(k, mytype) / real(nz, mytype)
+         end do
+      end do
+   end do
+
+   !$acc data copyin(in_r) copy(out)
+   ! First iterations out of the counting loop
+   t1 = MPI_WTIME()
+   call decomp_2d_fft_3d(in_r, out)
+   t2 = MPI_WTIME() - t1
+   t3 = MPI_WTIME()
+   call decomp_2d_fft_3d(out, in_r)
+   t4 = MPI_WTIME() - t3
+   call MPI_ALLREDUCE(t2, t1, 1, MPI_DOUBLE_PRECISION, MPI_SUM, &
+                      MPI_COMM_WORLD, ierror)
+   t1 = t1 / dble(nproc)
+   call MPI_ALLREDUCE(t4, t3, 1, MPI_DOUBLE_PRECISION, MPI_SUM, &
+                      MPI_COMM_WORLD, ierror)
+   t3 = t3 / dble(nproc)
+   if (nrank == 0) then
+      write (*, *) '===== r2c/c2r interface ====='
+      write (*, *) 'First iteration with dedicated timer'
+      write (*, *) '     time (sec): ', t1, t3
+      write (*, *) ''
+   end if
+   ! Init the time
+   t2 = 0.d0
+   t4 = 0.d0
+   !$acc kernels
+   in_r = in_r / (real(nx, mytype) * real(ny, mytype) * real(nz, mytype))
+   !$acc end kernels
+   do m = 1, ntest
+
+      ! 3D r2c FFT
+      t1 = MPI_WTIME()
+      call decomp_2d_fft_3d(in_r, out)
+      t2 = t2 + MPI_WTIME() - t1
+
+      ! 3D inverse FFT
+      t3 = MPI_WTIME()
+      call decomp_2d_fft_3d(out, in_r)
+      t4 = t4 + MPI_WTIME() - t3
+
+      ! normalisation - note 2DECOMP&FFT doesn't normalise
+      !$acc kernels
+      in_r = in_r / (real(nx, mytype) * real(ny, mytype) * real(nz, mytype))
+      !$acc end kernels
+
+   end do
+#if defined(_GPU)
+   ierror = cudaDeviceSynchronize()
+#endif
+
+   call MPI_ALLREDUCE(t2, t1, 1, MPI_DOUBLE_PRECISION, MPI_SUM, &
+                      MPI_COMM_WORLD, ierror)
+   t1 = t1 / dble(nproc) / dble(ntest)
+   call MPI_ALLREDUCE(t4, t3, 1, MPI_DOUBLE_PRECISION, MPI_SUM, &
+                      MPI_COMM_WORLD, ierror)
+   t3 = t3 / dble(nproc) / dble(ntest)
+
+   ! checking accuracy
+   error = 0._mytype
+   !$acc parallel loop default(present) reduction(+:error)
+   do k = zst3, zen3
+      do j = zst2, zen2
+         do i = zst1, zen1
+            dr = real(i, mytype) / real(nx, mytype) * real(j, mytype) &
+                 / real(ny, mytype) * real(k, mytype) / real(nz, mytype)
+            error = error + abs(in_r(i, j, k) - dr)
+         end do
+      end do
+   end do
+   !$acc end loop
+
+   call MPI_ALLREDUCE(error, err_all, 1, real_type, MPI_SUM, MPI_COMM_WORLD, ierror)
+   err_all = err_all / (real(nx, mytype) * real(ny, mytype) * real(nz, mytype))
+
+   if (err_all > error_precision) then
+      if (nrank == 0) write (*, *) 'error / mesh point: ', err_all
+      call decomp_2d_abort(1, "error for the FFT too large")
+   end if
+
+   if (nrank == 0) then
+      write (*, *) 'error / mesh point: ', err_all
+      write (*, *) 'time (sec): ', t1, t3
+      write (*, *) '   '
+      write (*, *) 'fft_r2c_z completed '
+      write (*, *) '   '
+   end if
+   !$acc end data
+
+   deallocate (in_r, out)
+   nullify (ph)
+   nullify (sp)
+   call decomp_2d_fft_finalize
+   call decomp_2d_finalize
+   call MPI_FINALIZE(ierror)
+
+end program fft_r2c_z
diff --git a/examples/grad3d/CMakeLists.txt b/examples/grad3d/CMakeLists.txt
new file mode 100644
index 00000000..98a09de5
--- /dev/null
+++ b/examples/grad3d/CMakeLists.txt
@@ -0,0 +1,17 @@
+file(GLOB files_test grad3d.f90)
+include_directories(${CMAKE_SOURCE_DIR}/src)
+
+add_executable(grad3d ${files_test})
+target_link_libraries(grad3d PRIVATE decomp2d)
+
+# Run the test(s)
+set(run_dir "${test_dir}/grad3d")
+file (COPY "${CMAKE_SOURCE_DIR}/examples/grad3d/adios2_config.xml" DESTINATION ${run_dir})
+message(STATUS "Example dir ${run_dir}")
+file(MAKE_DIRECTORY ${run_dir})
+if (BUILD_TARGET MATCHES "gpu")
+  file(COPY bind.sh DESTINATION ${run_dir})
+  add_test(NAME grad3d COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} ./bind.sh $<TARGET_FILE:grad3d> ${TEST_ARGUMENTS} WORKING_DIRECTORY ${run_dir})
+else ()  
+  add_test(NAME grad3d COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} $<TARGET_FILE:grad3d> ${TEST_ARGUMENTS} WORKING_DIRECTORY ${run_dir})
+endif ()
diff --git a/examples/grad3d/README.md b/examples/grad3d/README.md
new file mode 100644
index 00000000..470c3057
--- /dev/null
+++ b/examples/grad3d/README.md
@@ -0,0 +1,24 @@
+# Gradient example
+
+List of the tests:
+- [grad3d](grad3d.f90): Example to compute the gradient of a field. 
+
+This example demonstrates the use 2DECOMP&FFT library to compute the gradient
+of a field using an explicit second order finite difference scheme. 
+The purpose is to show how to use the transpose operations to allow explicit calculation
+of the gradient in all 3 directions. The results are written to a file and the function 
+is periodic over the interval [0-1] 
+
+What to input: The program takes max 5 inputs as: 
+
+1. p_row [optional]
+1. p_col [optional] 
+1. nx    [optional]
+1. ny    [optional]
+1. nz    [optional]
+
+If the decomposition is imposed both (1) and (2) are necessary. 
+If the resolution is imposed (1-5) are necessary.
+
+What to expect: the output is the original function and the gradient in the 3 direction. 
+                The program will also give the total error in L2 norm comparing with the anytical solution. 
diff --git a/examples/grad3d/adios2_config.xml b/examples/grad3d/adios2_config.xml
new file mode 100644
index 00000000..ffbcb2db
--- /dev/null
+++ b/examples/grad3d/adios2_config.xml
@@ -0,0 +1,11 @@
+<?xml version="1.0"?>
+<adios-config>
+  <io name="grad-io">
+    <engine type="BP4">
+    </engine>
+    <transport type="File">
+      <parameter key="Library" value="fstream"/>
+      <parameter key="ProfileUnits" value="Milliseconds"/>
+    </transport>
+  </io>
+</adios-config>
diff --git a/examples/grad3d/bind.sh b/examples/grad3d/bind.sh
new file mode 100755
index 00000000..38d4fedf
--- /dev/null
+++ b/examples/grad3d/bind.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+export LOCAL_RANK=${OMPI_COMM_WORLD_LOCAL_RANK}
+export CUDA_VISIBLE_DEVICES=${LOCAL_RANK}
+
+echo "[LOG] local rank $LOCAL_RANK: bind to $CUDA_VISIBLE_DEVICES"
+echo ""
+
+$* 
+
diff --git a/examples/grad3d/grad3d.f90 b/examples/grad3d/grad3d.f90
new file mode 100644
index 00000000..adf7feb0
--- /dev/null
+++ b/examples/grad3d/grad3d.f90
@@ -0,0 +1,636 @@
+!! SPDX-License-Identifier: BSD-3-Clause
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+! This example calculates the gradient of a periodic field using global
+! transposition
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+program grad3d
+
+   use mpi
+
+   use decomp_2d
+   use decomp_2d_constants
+   use decomp_2d_mpi
+   use MPI
+#if defined(_GPU)
+   use cudafor
+   use openacc
+#endif
+
+   implicit none
+
+   integer, parameter :: nx_base = 64, ny_base = 64, nz_base = 64
+   integer :: nx, ny, nz
+   integer :: p_row = 0, p_col = 0
+   integer :: resize_domain
+   integer :: nranks_tot
+   integer :: nargin, arg, FNLength, status, DecInd
+   integer :: ierror
+   logical :: all_pass
+   character(len=80) :: InputFN
+
+   real(mytype), parameter :: lx = 1.0_mytype
+   real(mytype), parameter :: ly = 1.0_mytype
+   real(mytype), parameter :: lz = 1.0_mytype
+
+   real(mytype) :: dx, dy, dz
+   real(mytype) :: error_ref
+
+   real(mytype), allocatable, dimension(:, :, :) :: phi1, phi2, phi3
+   real(mytype), allocatable, dimension(:, :, :) :: dphiX, dphiY, dphiz
+   real(mytype), allocatable, dimension(:, :, :) :: wk2, wk3
+
+   call MPI_INIT(ierror)
+   ! To resize the domain we need to know global number of ranks
+   ! This operation is also done as part of decomp_2d_init
+   call MPI_COMM_SIZE(MPI_COMM_WORLD, nranks_tot, ierror)
+   resize_domain = int(nranks_tot / 4) + 1
+   nx = nx_base * resize_domain
+   ny = ny_base * resize_domain
+   nz = nz_base * resize_domain
+   ! Now we can check if user put some inputs
+   ! Handle input file like a boss -- GD
+   nargin = command_argument_count()
+   if ((nargin == 0) .or. (nargin == 2) .or. (nargin == 5)) then
+      do arg = 1, nargin
+         call get_command_argument(arg, InputFN, FNLength, status)
+         read (InputFN, *, iostat=status) DecInd
+         if (arg == 1) then
+            p_row = DecInd
+         elseif (arg == 2) then
+            p_col = DecInd
+         elseif (arg == 3) then
+            nx = DecInd
+         elseif (arg == 4) then
+            ny = DecInd
+         elseif (arg == 5) then
+            nz = DecInd
+         end if
+      end do
+   else
+      ! nrank not yet computed we need to avoid write
+      ! for every rank
+      call MPI_COMM_RANK(MPI_COMM_WORLD, nrank, ierror)
+      if (nrank == 0) then
+         print *, "This Test takes no inputs or 2 inputs as"
+         print *, "  1) p_row (default=0)"
+         print *, "  2) p_col (default=0)"
+         print *, "or 5 inputs as"
+         print *, "  1) p_row (default=0)"
+         print *, "  2) p_col (default=0)"
+         print *, "  3) nx "
+         print *, "  4) ny "
+         print *, "  5) nz "
+         print *, "Number of inputs is not correct and the defult settings"
+         print *, "will be used"
+      end if
+   end if
+   call decomp_2d_init(nx, ny, nz, p_row, p_col)
+
+   dx = lx / real(nx, mytype)
+   dy = ly / real(ny, mytype)
+   dz = lz / real(nz, mytype)
+
+   if (nrank == 0) then
+      write (*, *) '-----------------------------------------------'
+      write (*, *) "Mesh Resolution ", nx, ny, nz
+      write (*, *) '-----------------------------------------------'
+   end if
+
+   call allocate_var()
+   !$acc data create(phi2,phi3,wk2,wk3) copy(phi1,dphiX, dphiY, dphiZ)
+   call init_phi()
+   call compute_grad()
+   call test_derX(dphiX)
+   call test_derY(dphiY)
+   call test_derZ(dphiZ)
+   call write_data()
+   !$acc end data
+
+   if (nrank == 0) then
+      write (*, *) '-----------------------------------------------'
+      write (*, *) "End GRAD calculation check all pass: ", all_pass
+      write (*, *) '==============================================='
+   end if
+
+   call finalize()
+
+   call decomp_2d_finalize
+
+   if (.not. all_pass) call decomp_2d_abort(1, "Error in grad3d")
+
+   call MPI_FINALIZE(ierror)
+
+contains
+
+   !=====================================================================
+   ! Initialize
+   !=====================================================================
+   subroutine allocate_var()
+
+      use decomp_2d
+
+      implicit none
+
+      logical, parameter :: glob = .false.
+
+      ! Allocate main variables in X-pencil
+      call alloc_x(phi1, opt_global=glob)
+      call alloc_x(dphiX, opt_global=glob)
+      call alloc_x(dphiY, opt_global=glob)
+      call alloc_x(dphiZ, opt_global=glob)
+
+      ! Working array used more than once
+      call alloc_y(phi2, opt_global=glob)
+      call alloc_y(wk2, opt_global=glob)
+
+      call alloc_z(phi3, opt_global=glob)
+      call alloc_z(wk3, opt_global=glob)
+
+      all_pass = .true.
+
+   end subroutine allocate_var
+
+   !=====================================================================
+   ! Initialize the scalar field
+   !=====================================================================
+   subroutine init_phi()
+
+      implicit none
+
+      integer :: i, j, k
+      real(mytype), parameter :: twopi = 2._mytype * acos(-1._mytype)
+      real(mytype) :: x, y, z
+      integer :: xe1, xe2, xe3
+      integer :: xs1, xs2, xs3
+
+      xe1 = xsize(1)
+      xe2 = xsize(2)
+      xe3 = xsize(3)
+      xs1 = xstart(1)
+      xs2 = xstart(2)
+      xs3 = xstart(3)
+
+      ! Scalar field
+      !$acc kernels default(present)
+      do concurrent(k=1:xe3, j=1:xe2, i=1:xe1)
+         z = (k + xs3 - 2) * dz
+         y = (j + xs2 - 2) * dy
+         x = (i + xs1 - 2) * dx
+         phi1(i, j, k) = -2._mytype * cos(twopi * (x / lx)) * cos(twopi * (y / ly)) * sin(twopi * (z / lz))
+      end do
+      !$acc end kernels
+
+   end subroutine init_phi
+   !=====================================================================
+   ! Finalize with deallocation of arrays
+   !=====================================================================
+   subroutine finalize()
+
+      implicit none
+
+      deallocate (phi1, phi2, phi3)
+      deallocate (dphiX, dphiY, dphiZ)
+      deallocate (wk2, wk3)
+
+   end subroutine finalize
+   !=====================================================================
+   ! Calculate gradient using global transposition
+   !=====================================================================
+   subroutine compute_grad()
+
+      implicit none
+
+      ! Compute X derivative
+      call derx(dphiX, phi1, dx, xsize(1), xsize(2), xsize(3))
+
+      ! Compute Y derivative
+      call transpose_x_to_y(phi1, phi2)
+      call dery(wk2, phi2, dy, ysize(1), ysize(2), ysize(3))
+      call transpose_y_to_x(wk2, dphiY)
+
+      ! Compute Z derivative
+      call transpose_y_to_z(phi2, phi3)
+      call derz(wk3, phi3, dz, zsize(1), zsize(2), zsize(3))
+      call transpose_z_to_y(wk3, wk2)
+      call transpose_y_to_x(wk2, dphiZ)
+
+   end subroutine compute_grad
+   !=====================================================================
+   ! Calculate gradient in X (data in X-pencil)
+   !=====================================================================
+   subroutine derx(df, ff, delta, nx, ny, nz)
+
+      implicit none
+
+      ! Arguments
+      integer, intent(in) :: nx, ny, nz
+      real(mytype), intent(in) :: delta
+      real(mytype), intent(out), dimension(nx, ny, nz) :: df
+      real(mytype), intent(in), dimension(nx, ny, nz) :: ff
+
+      ! Local variables
+      integer :: i, j, k
+      real(mytype) :: coeff = 0.5_mytype
+
+      coeff = coeff / delta
+
+      !$acc kernels default(present)
+      do concurrent(k=1:nz, j=1:ny)
+         df(1, j, k) = coeff * (ff(2, j, k) - ff(nx, j, k))
+         do concurrent(i=2:nx - 1)
+            df(i, j, k) = coeff * (ff(i + 1, j, k) - ff(i - 1, j, k))
+         end do
+         df(nx, j, k) = coeff * (ff(1, j, k) - ff(nx - 1, j, k))
+      end do
+      !$acc end kernels
+
+   end subroutine derx
+   !=====================================================================
+   ! Calculate gradient in Y (data in Y-pencil)
+   !=====================================================================
+   subroutine dery(df, ff, delta, nx, ny, nz)
+
+      implicit none
+
+      ! Arguments
+      integer, intent(in) :: nx, ny, nz
+      real(mytype), intent(in) :: delta
+      real(mytype), intent(out), dimension(nx, ny, nz) :: df
+      real(mytype), intent(in), dimension(nx, ny, nz) :: ff
+
+      ! Local variables
+      integer :: i, j, k
+      real(mytype) :: coeff = 0.5_mytype
+
+      coeff = coeff / delta
+
+      !$acc kernels default(present)
+      do concurrent(k=1:nz)
+         do concurrent(i=1:nx)
+            df(i, 1, k) = coeff * (ff(i, 2, k) - ff(i, ny, k))
+         end do
+         do concurrent(j=2:ny - 1, i=1:nx)
+            df(i, j, k) = coeff * (ff(i, j + 1, k) - ff(i, j - 1, k))
+         end do
+         do concurrent(i=1:nx)
+            df(i, ny, k) = coeff * (ff(i, 1, k) - ff(i, ny - 1, k))
+         end do
+      end do
+      !$acc end kernels
+
+   end subroutine dery
+   !=====================================================================
+   ! Calculate gradient in Z (data in Z-pencil)
+   !=====================================================================
+   subroutine derz(df, ff, delta, nx, ny, nz)
+
+      implicit none
+
+      ! Arguments
+      integer, intent(in) :: nx, ny, nz
+      real(mytype), intent(in) :: delta
+      real(mytype), intent(out), dimension(nx, ny, nz) :: df
+      real(mytype), intent(in), dimension(nx, ny, nz) :: ff
+
+      ! Local variables
+      integer :: i, j, k
+      real(mytype) :: coeff = 0.5_mytype
+
+      coeff = coeff / delta
+
+      !$acc kernels default(present)
+      do concurrent(j=1:ny, i=1:nx)
+         df(i, j, 1) = coeff * (ff(i, j, 2) - ff(i, j, nz))
+      end do
+      do concurrent(k=2:nz - 1, j=1:ny, i=1:nx)
+         df(i, j, k) = coeff * (ff(i, j, k + 1) - ff(i, j, k - 1))
+      end do
+      do concurrent(j=1:ny, i=1:nx)
+         df(i, j, nz) = coeff * (ff(i, j, 1) - ff(i, j, nz - 1))
+      end do
+      !$acc end kernels
+
+   end subroutine derz
+   !=====================================================================
+   ! Test derivatives against analytical solution (data in X-pencil)
+   !=====================================================================
+   subroutine test_derX(df)
+
+      implicit none
+      ! Arguments
+      real(mytype), intent(in), dimension(xsize(1), xsize(2), xsize(3)) :: df
+
+      integer :: i, j, k
+      real(mytype), parameter :: twopi = 2._mytype * acos(-1._mytype)
+      real(mytype) :: x, y, z
+      real(mytype) :: dphi, dphi_num
+      real(mytype) :: error = 0._mytype
+      real(mytype) :: err_all = 0._mytype
+      real(mytype) :: dphi2 = 0._mytype
+      real(mytype) :: sum_dphi2 = 0._mytype
+      integer :: xe1, xe2, xe3
+      integer :: xs1, xs2, xs3
+
+      xe1 = xsize(1)
+      xe2 = xsize(2)
+      xe3 = xsize(3)
+      xs1 = xstart(1)
+      xs2 = xstart(2)
+      xs3 = xstart(3)
+
+      ! Compute the error against analytical solution
+      !$acc parallel loop default(present) reduction(+:error)
+      do k = 1, xe3
+         do j = 1, xe2
+            do i = 1, xe1
+               z = (k + xs3 - 2) * dz
+               y = (j + xs2 - 2) * dy
+               x = (i + xs1 - 2) * dx
+               dphi = -2._mytype * (twopi / lx) &
+                      * sin(twopi * (x / lx)) * cos(twopi * (y / ly)) * sin(twopi * (z / lz))
+               dphi2 = dphi2 + dphi * dphi
+               dphi_num = df(i, j, k)
+               error = error + (dphi - dphi_num) * (dphi - dphi_num)
+            end do
+         end do
+      end do
+      !$acc end loop
+      call MPI_ALLREDUCE(error, err_all, 1, real_type, MPI_SUM, MPI_COMM_WORLD, ierror)
+      call MPI_ALLREDUCE(dphi2, sum_dphi2, 1, real_type, MPI_SUM, MPI_COMM_WORLD, ierror)
+      err_all = sqrt(err_all / sum_dphi2) / (real(nx, mytype) * real(ny, mytype) * real(nz, mytype))
+      error_ref = err_all
+
+      if (nrank == 0) then
+         write (*, *) 'DX error / mesh point: ', err_all
+      end if
+
+   end subroutine test_derX
+
+   !=====================================================================
+   ! Test derivatives against analytical solution (data in Y-pencil)
+   !=====================================================================
+   subroutine test_derY(df)
+
+      implicit none
+      ! Arguments
+      real(mytype), intent(in), dimension(xsize(1), xsize(2), xsize(3)) :: df
+
+      integer :: i, j, k
+      real(mytype), parameter :: twopi = 2._mytype * acos(-1._mytype)
+      real(mytype) :: x, y, z
+      real(mytype) :: dphi, dphi_num
+      real(mytype) :: error = 0._mytype
+      real(mytype) :: err_all = 0._mytype
+      real(mytype) :: dphi2 = 0._mytype
+      real(mytype) :: sum_dphi2 = 0._mytype
+      integer :: xe1, xe2, xe3
+      integer :: xs1, xs2, xs3
+
+      xe1 = xsize(1)
+      xe2 = xsize(2)
+      xe3 = xsize(3)
+      xs1 = xstart(1)
+      xs2 = xstart(2)
+      xs3 = xstart(3)
+
+      ! Compute the error against analytical solution
+      !$acc parallel loop default(present) reduction(+:error)
+      do k = 1, xe3
+         do j = 1, xe2
+            do i = 1, xe1
+               z = (k + xs3 - 2) * dz
+               y = (j + xs2 - 2) * dy
+               x = (i + xs1 - 2) * dx
+               dphi = -2._mytype * (twopi / ly) &
+                      * cos(twopi * (x / lx)) * sin(twopi * (y / ly)) * sin(twopi * (z / lz))
+               dphi2 = dphi2 + dphi * dphi
+               dphi_num = df(i, j, k)
+               error = error + (dphi - dphi_num) * (dphi - dphi_num)
+            end do
+         end do
+      end do
+      !$acc end loop
+      call MPI_ALLREDUCE(error, err_all, 1, real_type, MPI_SUM, MPI_COMM_WORLD, ierror)
+      call MPI_ALLREDUCE(dphi2, sum_dphi2, 1, real_type, MPI_SUM, MPI_COMM_WORLD, ierror)
+      err_all = sqrt(err_all / sum_dphi2) / (real(nx, mytype) * real(ny, mytype) * real(nz, mytype))
+
+      if (nrank == 0) then
+         write (*, *) 'DY error / mesh point: ', err_all
+      end if
+
+      if (abs(err_all - error_ref) > 1.0e-5_mytype) all_pass = .false.
+
+   end subroutine test_derY
+
+   !=====================================================================
+   ! Test derivatives against analytical solution (data in Z-pencil)
+   !=====================================================================
+   subroutine test_derZ(df)
+
+      implicit none
+      ! Arguments
+      real(mytype), intent(in), dimension(xsize(1), xsize(2), xsize(3)) :: df
+
+      integer :: i, j, k
+      real(mytype), parameter :: twopi = 2._mytype * acos(-1._mytype)
+      real(mytype) :: x, y, z
+      real(mytype) :: dphi, dphi_num
+      real(mytype) :: error = 0._mytype
+      real(mytype) :: err_all = 0._mytype
+      real(mytype) :: dphi2 = 0._mytype
+      real(mytype) :: sum_dphi2 = 0._mytype
+      integer :: xe1, xe2, xe3
+      integer :: xs1, xs2, xs3
+
+      xe1 = xsize(1)
+      xe2 = xsize(2)
+      xe3 = xsize(3)
+      xs1 = xstart(1)
+      xs2 = xstart(2)
+      xs3 = xstart(3)
+
+      ! Compute the error against analytical solution
+      !$acc parallel loop default(present) reduction(+:error)
+      do k = 1, xe3
+         do j = 1, xe2
+            do i = 1, xe1
+               z = (k + xs3 - 2) * dz
+               y = (j + xs2 - 2) * dy
+               x = (i + xs1 - 2) * dx
+               dphi = 2._mytype * (twopi / lz) &
+                      * cos(twopi * (x / lx)) * cos(twopi * (y / ly)) * cos(twopi * (z / lz))
+               dphi2 = dphi2 + dphi * dphi
+               dphi_num = df(i, j, k)
+               error = error + (dphi - dphi_num) * (dphi - dphi_num)
+            end do
+         end do
+      end do
+      !$acc end loop
+      call MPI_ALLREDUCE(error, err_all, 1, real_type, MPI_SUM, MPI_COMM_WORLD, ierror)
+      call MPI_ALLREDUCE(dphi2, sum_dphi2, 1, real_type, MPI_SUM, MPI_COMM_WORLD, ierror)
+      err_all = sqrt(err_all / sum_dphi2) / (real(nx, mytype) * real(ny, mytype) * real(nz, mytype))
+
+      if (nrank == 0) then
+         write (*, *) 'DZ error / mesh point: ', err_all
+      end if
+
+      if (abs(err_all - error_ref) > 1.0e-5_mytype) all_pass = .false.
+
+   end subroutine test_derZ
+   !=====================================================================
+   ! Write of the results (all data are in X-pencil)
+   !=====================================================================
+   subroutine write_data()
+
+      use decomp_2d_io
+
+      implicit none
+
+      character(len=*), parameter :: io_name = "grad-io"
+#ifndef ADIOS2
+      logical :: dir_exists
+#endif
+
+      !$acc update self (phi1)
+      !$acc update self (dphiX)
+      !$acc update self (dphiY)
+      !$acc update self (dphiZ)
+#ifndef ADIOS2
+      if (nrank == 0) then
+         inquire (file="out", exist=dir_exists)
+         if (.not. dir_exists) then
+            call execute_command_line("mkdir out 2> /dev/null")
+         end if
+      end if
+#endif
+
+      call decomp_2d_io_init()
+      call decomp_2d_init_io(io_name)
+
+      call decomp_2d_register_variable(io_name, "phi1.dat", 1, 0, 0, mytype)
+      call decomp_2d_register_variable(io_name, "dphiX.dat", 1, 0, 0, mytype)
+      call decomp_2d_register_variable(io_name, "dphiY.dat", 1, 0, 0, mytype)
+      call decomp_2d_register_variable(io_name, "dphiZ.dat", 1, 0, 0, mytype)
+
+      ! Standard I/O pattern - file per field
+#ifdef ADIOS2
+      call decomp_2d_open_io(io_name, "out", decomp_2d_write_mode)
+      call decomp_2d_start_io(io_name, "out")
+#endif
+      call decomp_2d_write_one(1, phi1, 'out', 'phi1.dat', 0, io_name)
+      call decomp_2d_write_one(1, dphiX, 'out', 'dphiX.dat', 0, io_name)
+      call decomp_2d_write_one(1, dphiY, 'out', 'dphiY.dat', 0, io_name)
+      call decomp_2d_write_one(1, dphiZ, 'out', 'dphiZ.dat', 0, io_name)
+#ifdef ADIOS2
+      call decomp_2d_end_io(io_name, "out")
+      call decomp_2d_close_io(io_name, "out")
+#else
+      call write_xdmf()
+#endif
+
+   end subroutine write_data
+   !=====================================================================
+   ! Write of the xdmf file to visualise in paraview
+   !=====================================================================
+   subroutine write_xdmf()
+      ! This subroutine is based on the xdmf writers in Xcompact3d.
+      ! Copyright (c) 2012-2022, Xcompact3d
+      ! SPDX-License-Identifier: BSD 3-Clause
+
+      integer :: ioxdmf
+
+      character(len=:), allocatable :: fmt
+
+      integer :: precision
+      integer, parameter :: output2D = 0 ! Which plane to write in 2D (0 for 3D)
+
+      integer :: varctr
+      character(len=16) :: filename
+      character(len=5) :: varname
+      if (nrank == 0) then
+         OPEN (newunit=ioxdmf, file="./out.xdmf")
+
+         write (ioxdmf, '(A22)') '<?xml version="1.0" ?>'
+         write (ioxdmf, *) '<!DOCTYPE Xdmf SYSTEM "Xdmf.dtd" []>'
+         write (ioxdmf, *) '<Xdmf xmlns:xi="http://www.w3.org/2001/XInclude" Version="2.0">'
+         write (ioxdmf, *) '<Domain>'
+
+         write (ioxdmf, '(A)') '    <Topology name="topo" TopologyType="3DCoRectMesh"'
+         fmt = "(A, I0, A, I0, A, I0, A)"
+         write (ioxdmf, fmt) '        Dimensions="', nz, " ", ny, " ", nx, '">'
+         write (ioxdmf, '(A)') '    </Topology>'
+
+         write (ioxdmf, *) '    <Geometry name="geo" Type="ORIGIN_DXDYDZ">'
+         write (ioxdmf, *) '        <!-- Origin -->'
+         write (ioxdmf, *) '        <DataItem Format="XML" Dimensions="3">'
+         write (ioxdmf, *) '          0.0 0.0 0.0'
+         write (ioxdmf, *) '        </DataItem>'
+         write (ioxdmf, *) '        <!-- DxDyDz -->'
+         write (ioxdmf, *) '        <DataItem Format="XML" Dimensions="3">'
+         if (mytype == kind(0.0d0)) then
+            fmt = "(A, E24.17, A, E24.17, A, E24.17)"
+         else
+            fmt = "(A, E16.9, A, E16.9, A, E16.9)"
+         end if
+         write (ioxdmf, fmt) '        ', 1.0_mytype, " ", 1.0_mytype, " ", 1.0_mytype
+         write (ioxdmf, *) '        </DataItem>'
+         write (ioxdmf, *) '    </Geometry>'
+
+         write (ioxdmf, *) '   <Grid Name="1" GridType="Uniform">'
+         write (ioxdmf, *) '       <Topology Reference="/Xdmf/Domain/Topology[1]"/>'
+         write (ioxdmf, *) '       <Geometry Reference="/Xdmf/Domain/Geometry[1]"/>'
+         do varctr = 1, 4
+            select case (varctr)
+            case (1)
+               write (varname, "(A)") "phi1"
+               write (filename, '(A)') "./out/phi1.dat"
+            case (2)
+               write (varname, "(A)") "dphiX"
+               write (filename, '(A)') "./out/dphiX.dat"
+            case (3)
+               write (varname, "(A)") "dphiY"
+               write (filename, '(A)') "./out/dphiY.dat"
+            case (4)
+               write (varname, "(A)") "dphiZ"
+               write (filename, '(A)') "./out/dphiZ.dat"
+            end select
+            write (ioxdmf, *) '       <Attribute Name="'//trim(varname)//'" Center="Node">'
+#ifndef ADIOS2
+            write (ioxdmf, *) '          <DataItem Format="Binary"'
+#else
+            write (ioxdmf, *) '          <DataItem Format="HDF"'
+#endif
+
+#ifdef DOUBLE_PREC
+            print *, "Double precision build"
+#ifdef SAVE_SINGLE
+            if (output2D == 0) then
+               precision = 4
+            else
+               precision = 8
+            end if
+#else
+            precision = 8
+#endif
+#else
+            precision = 4
+#endif
+            write (ioxdmf, "(A,I0,A)") '            DataType="Float" Precision="', precision, '" Endian="little" Seek="0"'
+
+            fmt = "(A, I0, A, I0, A, I0, A)"
+            write (ioxdmf, fmt) '            Dimensions="', nz, " ", ny, " ", nx, '">'
+
+            write (ioxdmf, *) '              '//trim(filename)
+
+            write (ioxdmf, *) '           </DataItem>'
+            write (ioxdmf, *) '        </Attribute>'
+         end do
+         write (ioxdmf, '(/)')
+         write (ioxdmf, *) '    </Grid>'
+         write (ioxdmf, *) '</Domain>'
+         write (ioxdmf, '(A7)') '</Xdmf>'
+         close (ioxdmf)
+      end if
+
+   end subroutine write_xdmf
+end program grad3d
diff --git a/examples/halo_test/CMakeLists.txt b/examples/halo_test/CMakeLists.txt
new file mode 100644
index 00000000..b534612a
--- /dev/null
+++ b/examples/halo_test/CMakeLists.txt
@@ -0,0 +1,16 @@
+file(GLOB files_test halo_test.f90)
+include_directories(${CMAKE_SOURCE_DIR}/src)
+
+add_executable(halo_test ${files_test})
+target_link_libraries(halo_test PRIVATE decomp2d)
+
+# Run the test(s)
+set(run_dir "${test_dir}/halo_test")
+message(STATUS "Example dir ${run_dir}")
+file(MAKE_DIRECTORY ${run_dir})
+if (BUILD_TARGET MATCHES "gpu")
+  file(COPY bind.sh DESTINATION ${run_dir})
+  add_test(NAME halo_test COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} ./bind.sh $<TARGET_FILE:halo_test> ${TEST_ARGUMENTS} WORKING_DIRECTORY ${run_dir})
+else ()  
+  add_test(NAME halo_test COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} $<TARGET_FILE:halo_test> ${TEST_ARGUMENTS} WORKING_DIRECTORY ${run_dir})
+endif ()
diff --git a/examples/halo_test/Makefile b/examples/halo_test/Makefile
deleted file mode 100644
index 72c28439..00000000
--- a/examples/halo_test/Makefile
+++ /dev/null
@@ -1,28 +0,0 @@
-#include ../../src/Makefile.inc
-
-#INCLUDE = -I../../include
-FFLAGS := $(subst $(MODFLAG),$(MODFLAG)../../,$(FFLAGS))
-FFLAGS := $(patsubst -I%,-I../../%,$(FFLAGS))
-LIBS = -L../../ -l$(LIBDECOMP) $(LFLAGS)
-
-OBJ = halo_test.o
-
-NP ?= 1
-MPIRUN ?= mpirun
-
-halo_test: $(OBJ)
-	$(FC) $(FFLAGS) $(OPT) $(DEFS) $(INC) -o $@ $(OBJ) $(LIBS)
-
-ifeq ($(PARAMOD),gpu)
-check:
-	$(MPIRUN) -n $(NP) ./bind.sh ./halo_test
-else
-check:
-	$(MPIRUN) -n $(NP) ./halo_test
-endif
-
-clean:
-	rm -f *.o halo_test *.log
-
-%.o : %.f90
-	$(FC) $(FFLAGS) $(OPT) $(DEFS) $(INC) -c $< -o $@
diff --git a/examples/halo_test/README b/examples/halo_test/README.md
similarity index 50%
rename from examples/halo_test/README
rename to examples/halo_test/README.md
index 8fb08e25..27c910b3 100644
--- a/examples/halo_test/README
+++ b/examples/halo_test/README.md
@@ -1,15 +1,25 @@
-halo_test
----------
+# Halo test
+
+List of the tests:
+- [HALO test](halo_test.f90): Test for the halo exchange capability of the library. 
 
 This example demonstrates the use of the halo-cell support API. It calculates
-the divergency of an arbitrary field, which contains evaluation of spatial
+the divergence of an arbitrary field, which contains evaluation of spatial
 derivatives in all three dimensions. The calculation was first implemented via
 the global transposition routines, then via halo-cell exchanges. Identical
 results are to be expected regardless of the communication algorithm. The 
 computation is based on an explicit finite difference method so clearly using 
 the halo-cell support API is more efficient.
 
-To run: use 12 MPI processes.
+What to input: The program takes max 5 inputs as: 
+
+1. p_row [optional]
+1. p_col [optional] 
+1. nx    [optional]
+1. ny    [optional]
+1. nz    [optional]
+
+If the decomposition is imposed both (1) and (2) are necessary. 
+If the resolution is imposed (1-5) are necessary.
 
-What to expect: the output using different communication algorithms should be 
-exactly the same.
+What to expect: the output using different communication algorithms should be the same up to machine precision. 
diff --git a/examples/halo_test/halo_test.f90 b/examples/halo_test/halo_test.f90
index 9f52a2bd..9e5c6fdf 100644
--- a/examples/halo_test/halo_test.f90
+++ b/examples/halo_test/halo_test.f90
@@ -1,3 +1,4 @@
+!! SPDX-License-Identifier: BSD-3-Clause
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 ! This example calculates the divergence of a random field using
 !   (1) global transposition
@@ -6,495 +7,620 @@
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 program halo_test
 
-  use mpi
+   use mpi
 
-  use decomp_2d
-
-  implicit none
-
-  integer, parameter :: nx=171, ny=132, nz=113
-  integer :: p_row=0, p_col=0
-
-  real(mytype), allocatable, dimension(:,:,:) :: u1, u2, u3
-  real(mytype), allocatable, dimension(:,:,:) :: v1, v2, v3
-  real(mytype), allocatable, dimension(:,:,:) :: w1, w2, w3
-  real(mytype), allocatable, dimension(:,:,:) :: wk2, wk3
-  real(mytype), allocatable, dimension(:,:,:) :: uh, vh, wh
-  real(mytype), allocatable, dimension(:,:,:) :: div1, div2, div3, div4
-
-  integer :: i,j,k, ierror, n
-
-  integer, allocatable, dimension(:) :: seed
-
-  real(mytype) :: err
-  integer :: xlast, ylast, zlast
-
-  integer :: nx_expected, ny_expected, nz_expected
-  
-  logical :: passing, all_pass
-
-  call MPI_INIT(ierror)
-  call decomp_2d_init(nx,ny,nz,p_row,p_col)
-
-  xlast = xsize(1) - 1
-  if (xend(2) == ny) then
-     ylast = xsize(2) - 1
-  else
-     ylast = xsize(2)
-  end if
-  if (xend(3) == nz) then
-     zlast = xsize(3) - 1
-  else
-     zlast = xsize(3)
-  end if
-
-  call initialise()
-  call test_div_transpose()
-  call test_div_haloX()
-  call test_div_haloY()
-  call test_div_haloZ()
+   use decomp_2d
+   use decomp_2d_constants
+   use decomp_2d_mpi
+   use MPI
+#if defined(_GPU)
+   use cudafor
+   use openacc
+#endif
 
-  if (nrank == 0) then
-     write(*,*) '-----------------------------------------------'
-     write(*,*) "All pass: ", all_pass
-     write(*,*) '==============================================='
-  end if
+   implicit none
+
+   integer, parameter :: nx_base = 65, ny_base = 48, nz_base = 33
+   integer :: nx, ny, nz
+   integer :: p_row = 0, p_col = 0
+   integer :: resize_domain
+   integer :: nranks_tot
+   integer :: nargin, arg, FNLength, status, DecInd
+   character(len=80) :: InputFN
+
+   real(mytype), allocatable, dimension(:, :, :) :: u1, v1, w1
+   real(mytype), allocatable, dimension(:, :, :) :: u2, v2, w2
+   real(mytype), allocatable, dimension(:, :, :) :: u3, v3, w3
+   real(mytype), allocatable, dimension(:, :, :) :: div, div1, div2, div3, wk2, wk3
+
+   integer :: i, j, k, ierror, n
+
+   integer, allocatable, dimension(:) :: seed
+
+   integer :: xlast, ylast, zlast
+
+   integer :: nx_expected, ny_expected, nz_expected
+
+   logical :: passing, all_pass
+
+   call MPI_INIT(ierror)
+   ! To resize the domain we need to know global number of ranks
+   ! This operation is also done as part of decomp_2d_init
+   call MPI_COMM_SIZE(MPI_COMM_WORLD, nranks_tot, ierror)
+   resize_domain = int(nranks_tot / 4) + 1
+   nx = nx_base * resize_domain
+   ny = ny_base * resize_domain
+   nz = nz_base * resize_domain
+   ! Now we can check if user put some inputs
+   ! Handle input file like a boss -- GD
+   nargin = command_argument_count()
+   if ((nargin == 0) .or. (nargin == 2) .or. (nargin == 5)) then
+      do arg = 1, nargin
+         call get_command_argument(arg, InputFN, FNLength, status)
+         read (InputFN, *, iostat=status) DecInd
+         if (arg == 1) then
+            p_row = DecInd
+         elseif (arg == 2) then
+            p_col = DecInd
+         elseif (arg == 3) then
+            nx = DecInd
+         elseif (arg == 4) then
+            ny = DecInd
+         elseif (arg == 5) then
+            nz = DecInd
+         end if
+      end do
+   else
+      ! nrank not yet computed we need to avoid write
+      ! for every rank
+      call MPI_COMM_RANK(MPI_COMM_WORLD, nrank, ierror)
+      if (nrank == 0) then
+         print *, "This Test takes no inputs or 2 inputs as"
+         print *, "  1) p_row (default=0)"
+         print *, "  2) p_col (default=0)"
+         print *, "or 5 inputs as"
+         print *, "  1) p_row (default=0)"
+         print *, "  2) p_col (default=0)"
+         print *, "  3) nx "
+         print *, "  4) ny "
+         print *, "  5) nz "
+         print *, "Number of inputs is not correct and the defult settings"
+         print *, "will be used"
+      end if
+   end if
+   call decomp_2d_init(nx, ny, nz, p_row, p_col)
+
+   xlast = xsize(1) - 1
+   if (xend(2) == ny) then
+      ylast = xsize(2) - 1
+   else
+      ylast = xsize(2)
+   end if
+   if (xend(3) == nz) then
+      zlast = xsize(3) - 1
+   else
+      zlast = xsize(3)
+   end if
+
+   call initialise()
+   !$acc data copyin(u1,v1,w1) create(u2,v2,w2,u3,v3,w3,wk2,wk3) copy(div)
+   call test_div_transpose()
+   call test_div_haloX()
+   call test_div_haloY()
+   call test_div_haloZ()
+   !$acc end data
+
+   if (nrank == 0) then
+      write (*, *) '-----------------------------------------------'
+      write (*, *) "All pass: ", all_pass
+      write (*, *) '==============================================='
+   end if
+
+   call finalize()
+
+   call decomp_2d_finalize
+
+   if (.not. all_pass) call decomp_2d_abort(1, "Error in halo_test")
+
+   call MPI_FINALIZE(ierror)
 
-  deallocate(u1,v1,w1,u2,v2,w2,u3,v3,w3)
-  deallocate(div1,div2,div3,div4)
+contains
 
-  call decomp_2d_finalize 
+   !=====================================================================
+   ! Initialize
+   !=====================================================================
+   subroutine initialise()
 
-  if (.not. all_pass) call decomp_2d_abort(1, "Error in halo_test")
+      use decomp_2d
 
-  call MPI_FINALIZE(ierror)
+      implicit none
 
-contains
+#ifdef HALO_GLOBAL
+      logical, parameter :: global = .true.
+#else
+      logical, parameter :: global = .false.
+#endif
 
-  subroutine initialise()
+      ! initialise u,v,w with random numbers in X-pencil
+      call alloc_x(u1, opt_global=global)
+      call alloc_x(v1, opt_global=global)
+      call alloc_x(w1, opt_global=global)
+      call alloc_x(div, opt_global=global)
+      call alloc_x(div1, opt_global=global)
+      call alloc_x(div2, opt_global=global)
+      call alloc_x(div3, opt_global=global)
+
+      call random_seed(size=n)
+      allocate (seed(n))
+      seed = nrank + 1
+      call random_seed(put=seed)
+      call random_number(u1)
+      call random_number(v1)
+      call random_number(w1)
+
+      ! Working array used more than once
+      call alloc_y(u2, opt_global=global)
+      call alloc_y(v2, opt_global=global)
+      call alloc_y(w2, opt_global=global)
+      call alloc_y(wk2, opt_global=global)
+
+      call alloc_z(u3, opt_global=global)
+      call alloc_z(v3, opt_global=global)
+      call alloc_z(w3, opt_global=global)
+      call alloc_z(wk3, opt_global=global)
+
+      all_pass = .true.
+
+   end subroutine initialise
+
+   !=====================================================================
+   ! Finalize with deallocation of arrays
+   !=====================================================================
+   subroutine finalize()
+
+      implicit none
+
+      deallocate (u1, v1, w1)
+      deallocate (u2, v2, w2)
+      deallocate (u3, v3, w3)
+      deallocate (wk2, wk3)
+
+   end subroutine finalize
+   !=====================================================================
+   ! Calculate divergence using global transposition
+   !=====================================================================
+   subroutine test_div_transpose()
+
+      implicit none
 
-    ! initialise u,v,w with random numbers in X-pencil
 #ifdef HALO_GLOBAL
-    allocate(u1(xstart(1):xend(1), xstart(2):xend(2), xstart(3):xend(3)))
-    allocate(v1(xstart(1):xend(1), xstart(2):xend(2), xstart(3):xend(3)))
-    allocate(w1(xstart(1):xend(1), xstart(2):xend(2), xstart(3):xend(3)))
+      logical, parameter :: global = .true.
 #else
-    allocate(u1(xsize(1), xsize(2), xsize(3)))
-    allocate(v1(xsize(1), xsize(2), xsize(3)))
-    allocate(w1(xsize(1), xsize(2), xsize(3)))
+      logical, parameter :: global = .false.
 #endif
+      integer :: ifirst, ilast ! I loop start/end
+      integer :: jfirst, jlast ! J loop start/end
+      integer :: kfirst, klast ! K loop start/end
 
-    call random_seed(size = n)
-    allocate(seed(n))
-    seed = nrank+1
-    call random_seed(put=seed)
-    call random_number(u1)
-    call random_number(v1)
-    call random_number(w1)
+      ! du/dx calculated on X-pencil
+#ifdef HALO_GLOBAL
+      kfirst = xstart(3); klast = xend(3)
+      jfirst = xstart(2); jlast = xend(2)
+#else
+      kfirst = 1; klast = xsize(3)
+      jfirst = 1; jlast = xsize(2)
+#endif
+      ifirst = 2; ilast = xsize(1) - 1
+
+      !$acc kernels default(present)
+      div(:, :, :) = 0.0_mytype
+      !$acc end kernels
+      !$acc kernels default(present)
+      do k = kfirst, klast
+         do j = jfirst, jlast
+            do i = ifirst, ilast
+               div(i, j, k) = u1(i + 1, j, k) - u1(i - 1, j, k)
+            end do
+         end do
+      end do
+      !$acc end kernels
+
+      ! dv/dy calculated on Y-pencil
+#ifdef HALO_GLOBAL
+      kfirst = ystart(3); klast = yend(3)
+      ifirst = ystart(1); ilast = yend(1)
+#else
+      kfirst = 1; klast = ysize(3)
+      ifirst = 1; ilast = ysize(1)
+#endif
+      jfirst = 2; jlast = ysize(2) - 1
+
+      call transpose_x_to_y(v1, v2)
+      call transpose_x_to_y(div, wk2)
+
+      !$acc kernels default(present)
+      do k = kfirst, klast
+         do j = jfirst, jlast
+            do i = ifirst, ilast
+               wk2(i, j, k) = wk2(i, j, k) + v2(i, j + 1, k) - v2(i, j - 1, k)
+            end do
+         end do
+      end do
+      !$acc end kernels
+
+      ! dw/dz calculated on Z-pencil
+#ifdef HALO_GLOBAL
+      jfirst = zstart(2); jlast = zend(2)
+      ifirst = zstart(1); ilast = zend(1)
+#else
+      jfirst = 1; jlast = zsize(2)
+      ifirst = 1; ilast = zsize(1)
+#endif
+      kfirst = 2; klast = zsize(3) - 1
+
+      call transpose_x_to_y(w1, w2)
+      call transpose_y_to_z(w2, w3)
+      call transpose_y_to_z(wk2, wk3)
+
+      !$acc kernels default(present)
+      do k = kfirst, klast
+         do j = jfirst, jlast
+            do i = ifirst, ilast
+               wk3(i, j, k) = wk3(i, j, k) + w3(i, j, k + 1) - w3(i, j, k - 1)
+            end do
+         end do
+      end do
+      !$acc end kernels
+
+      ! result in X-pencil
+      call transpose_z_to_y(wk3, wk2)
+      call transpose_y_to_x(wk2, div)
 
-    all_pass = .true.
+#ifdef DEBUG
+      if (nrank == 0) then
+         write (*, *) 'Calculated via global transposition'
+         !$acc update self(div)
+         write (*, *) (div(i, i, i), i=2, 13)
+      end if
+#endif
 
-  end subroutine initialise
+   end subroutine test_div_transpose
 
-  !=====================================================================
-  ! Calculate divergence using global transposition
-  !=====================================================================
-  subroutine test_div_transpose()
+   !=====================================================================
+   ! Calculate divergence using halo-cell exchange (data in X-pencil)
+   !=====================================================================
+   subroutine test_div_haloX()
 
-    integer :: i1, in ! I loop start/end
-    integer :: j1, jn ! J loop start/end
-    integer :: k1, kn ! K loop start/end
+      implicit none
 
-    ! du/dx calculated on X-pencil
-#ifdef HALO_GLOBAL
-    allocate(div1(xstart(1):xend(1), xstart(2):xend(2), xstart(3):xend(3)))
-    k1 = xstart(3); kn = xend(3)
-    j1 = xstart(2); jn = xend(2)
-#else
-    allocate(div1(xsize(1), xsize(2), xsize(3)))
-    k1 = 1; kn = xsize(3)
-    j1 = 1; jn = xsize(2)
+      real(mytype), allocatable, dimension(:, :, :) :: div1
+      real(mytype), allocatable, dimension(:, :, :) :: vh, wh
+#if defined(_GPU)
+      attributes(device) :: vh, wh
 #endif
-    i1 = 2; in = xsize(1) - 1
-
-    div1 = 0.0_mytype
-    do k=k1,kn
-       do j=j1,jn
-          do i=i1,in
-             div1(i,j,k) = u1(i+1,j,k)-u1(i-1,j,k)
-          end do
-       end do
-    end do
-
-    ! dv/dy calculated on Y-pencil
+
 #ifdef HALO_GLOBAL
-    allocate(v2(ystart(1):yend(1), ystart(2):yend(2), ystart(3):yend(3)))
-    allocate(wk2(ystart(1):yend(1), ystart(2):yend(2), ystart(3):yend(3)))
-    k1 = ystart(3); kn = yend(3)
-    i1 = ystart(1); in = yend(1)
+      logical, parameter :: global = .true.
 #else
-    allocate(v2(ysize(1), ysize(2), ysize(3)))
-    allocate(wk2(ysize(1), ysize(2), ysize(3)))
-    k1 = 1; kn = ysize(3)
-    i1 = 1; in = ysize(1)
+      logical, parameter :: global = .false.
 #endif
-    j1 = 2; jn = ysize(2) - 1
+      integer :: ifirst, ilast ! I loop start/end
+      integer :: jfirst, jlast ! J loop start/end
+      integer :: kfirst, klast ! K loop start/end
 
-    call transpose_x_to_y(v1,v2)
-    call transpose_x_to_y(div1,wk2)
+      call alloc_x(div1, opt_global=global)
 
-    do k=k1,kn
-       do j=j1,jn
-          do i=i1,in
-             wk2(i,j,k) = wk2(i,j,k) + v2(i,j+1,k)-v2(i,j-1,k)
-          end do
-       end do
-    end do
+      ! Expected sizes
+      nx_expected = nx
+      ny_expected = xsize(2) + 2
+      nz_expected = xsize(3) + 2
 
-    ! dw/dz calculated on Z-pencil
+      ! Only global arrays defined in initialise needs to be ported
+      ! Halo array are allocated in both host and device in update_halo
+      ! Halo arrays are just removed before being deallocated
 #ifdef HALO_GLOBAL
-    allocate(w2(ystart(1):yend(1), ystart(2):yend(2), ystart(3):yend(3)))
-    allocate(w3(zstart(1):zend(1), zstart(2):zend(2), zstart(3):zend(3)))
-    allocate(wk3(zstart(1):zend(1), zstart(2):zend(2), zstart(3):zend(3)))
-    j1 = zstart(2); jn = zend(2)
-    i1 = zstart(1); in = zend(1)
+      call update_halo(v1, vh, 1, opt_global=.true., opt_pencil=1)
+      call update_halo(w1, wh, 1, opt_global=.true., opt_pencil=1)
+
+      kfirst = xstart(3); klast = xend(3)
+      jfirst = xstart(2); jlast = xend(2)
 #else
-    allocate(w2(ysize(1), ysize(2), ysize(3)))
-    allocate(w3(zsize(1), zsize(2), zsize(3)))
-    allocate(wk3(zsize(1), zsize(2), zsize(3)))
-    j1 = 1; jn = zsize(2)
-    i1 = 1; in = zsize(1)
+      call update_halo(v1, vh, 1, opt_pencil=1)
+      call update_halo(w1, wh, 1, opt_pencil=1)
+
+      kfirst = 1; klast = xsize(3)
+      jfirst = 1; jlast = xsize(2)
 #endif
-    k1 = 2; kn = zsize(3) - 1
-    
-    call transpose_x_to_y(w1,w2)
-    call transpose_y_to_z(w2,w3)
-    call transpose_y_to_z(wk2,wk3)
-
-    do k=k1,kn
-       do j=j1,jn
-          do i=i1,in
-             wk3(i,j,k) = wk3(i,j,k) + w3(i,j,k+1)-w3(i,j,k-1)
-          end do
-       end do
-    end do
-
-    ! result in X-pencil
-    call transpose_z_to_y(wk3,wk2)
-    call transpose_y_to_x(wk2,div1)
-
-    if (nrank==0) then
-       write(*,*) 'Calculated via global transposition'
-#ifdef DEBUG
-       write(*,*) (div1(i,i,i), i=2,13)
+      ifirst = 2; ilast = xsize(1) - 1
+
+      call test_halo_size(vh, nx_expected, ny_expected, nz_expected, "X:v")
+      call test_halo_size(wh, nx_expected, ny_expected, nz_expected, "X:w")
+
+      !$acc data copy(div1)
+      !$acc kernels default(present)
+      div1(:, :, :) = 0._mytype
+      !$acc end kernels
+      !$acc kernels default(present)
+      do k = kfirst, klast
+         do j = jfirst, jlast
+            do i = ifirst, ilast
+               div1(i, j, k) = (u1(i + 1, j, k) - u1(i - 1, j, k)) &
+                               + (vh(i, j + 1, k) - vh(i, j - 1, k)) &
+                               + (wh(i, j, k + 1) - wh(i, j, k - 1))
+            end do
+         end do
+      end do
+      !$acc end kernels
+
+      ! Compute error
+      call check_err(div1, div, "X")
+      !$acc end data
+
+      deallocate (vh, wh, div1)
+
+   end subroutine test_div_haloX
+
+   !=====================================================================
+   ! Calculate divergence using halo-cell exchange (data in Y-pencil)
+   !=====================================================================
+   subroutine test_div_haloY()
+
+      implicit none
+
+      real(mytype), allocatable, dimension(:, :, :) :: div2
+      real(mytype), allocatable, dimension(:, :, :) :: uh, wh
+#if defined(_GPU)
+      attributes(device) :: uh, wh
 #endif
-    end if
-
-    deallocate(v2,w2,w3,wk2,wk3)
-
-  end subroutine test_div_transpose
-
-  !=====================================================================
-  ! Calculate divergence using halo-cell exchange (data in X-pencil)
-  !=====================================================================
-  subroutine test_div_haloX()
-
-    integer :: i1, in ! I loop start/end
-    integer :: j1, jn ! J loop start/end
-    integer :: k1, kn ! K loop start/end
-    
-    ! Expected sizes
-    nx_expected = nx
-    ny_expected = xsize(2) + 2
-    nz_expected = xsize(3) + 2
-    
-#ifdef HALO_GLOBAL
-    allocate(div2(xstart(1):xend(1), xstart(2):xend(2), xstart(3):xend(3)))
-    call update_halo(v1,vh,1,opt_global=.true.,opt_pencil=1)
-    call update_halo(w1,wh,1,opt_global=.true.,opt_pencil=1)
-    
-    k1 = xstart(3); kn = xend(3)
-    j1 = xstart(2); jn = xend(2)
-#else
-    allocate(div2(xsize(1), xsize(2), xsize(3)))
-    call update_halo(v1,vh,1,opt_pencil=1)
-    call update_halo(w1,wh,1,opt_pencil=1)
 
-    k1 = 1; kn = xsize(3)
-    j1 = 1; jn = xsize(2)
-#endif
-    i1 = 2; in = xsize(1) - 1
-
-    call test_halo_size(vh, nx_expected, ny_expected, nz_expected, "X:v")
-    call test_halo_size(wh, nx_expected, ny_expected, nz_expected, "X:w")
-    
-    div2 = 0.0_mytype
-    do k=k1,kn
-       do j=j1,jn
-          do i=i1,in
-             div2(i,j,k) = (u1(i+1,j,k)-u1(i-1,j,k)) &
-                  + (vh(i,j+1,k)-vh(i,j-1,k)) &
-                  + (wh(i,j,k+1)-wh(i,j,k-1))
-          end do
-       end do
-    end do
-
-    ! Compute error
-    call check_err(div2, "X")
-           
-    deallocate(vh,wh)
-
-  end subroutine test_div_haloX
-
-  !=====================================================================
-  ! Calculate divergence using halo-cell exchange (data in Y-pencil)
-  !=====================================================================
-  subroutine test_div_haloY()
-
-    integer :: i1, in ! I loop start/end
-    integer :: j1, jn ! J loop start/end
-    integer :: k1, kn ! K loop start/end
-
-    ! Expected sizes
-    nx_expected = ysize(1) + 2
-    ny_expected = ny
-    nz_expected = ysize(3) + 2
-  
 #ifdef HALO_GLOBAL
-    allocate(u2(ystart(1):yend(1), ystart(2):yend(2), ystart(3):yend(3)))
-    allocate(v2(ystart(1):yend(1), ystart(2):yend(2), ystart(3):yend(3)))
-    allocate(w2(ystart(1):yend(1), ystart(2):yend(2), ystart(3):yend(3)))
-    allocate(div3(xstart(1):xend(1), xstart(2):xend(2), xstart(3):xend(3)))
-    allocate(wk2(ystart(1):yend(1), ystart(2):yend(2), ystart(3):yend(3)))
+      logical, parameter :: global = .true.
 #else
-    allocate(u2(ysize(1), ysize(2), ysize(3)))
-    allocate(v2(ysize(1), ysize(2), ysize(3)))
-    allocate(w2(ysize(1), ysize(2), ysize(3)))
-    allocate(div3(xsize(1), xsize(2), xsize(3)))
-    allocate(wk2(ysize(1), ysize(2), ysize(3)))
+      logical, parameter :: global = .false.
 #endif
-    call transpose_x_to_y(u1,u2)
-    call transpose_x_to_y(v1,v2)
-    call transpose_x_to_y(w1,w2)
+      integer :: ifirst, ilast ! I loop start/end
+      integer :: jfirst, jlast ! J loop start/end
+      integer :: kfirst, klast ! K loop start/end
+
+      call alloc_x(div2, opt_global=global)
+
+      ! Expected sizes
+      nx_expected = ysize(1) + 2
+      ny_expected = ny
+      nz_expected = ysize(3) + 2
+
+      call transpose_x_to_y(u1, u2)
+      call transpose_x_to_y(v1, v2)
+      call transpose_x_to_y(w1, w2)
 
-    ! du/dx
+      ! du/dx
 #ifdef HALO_GLOBAL
-    call update_halo(u2,uh,1,opt_global=.true.,opt_pencil=2)
-    call update_halo(w2,wh,1,opt_global=.true.,opt_pencil=2)
-    k1 = ystart(3); kn = yend(3)
-    i1 = ystart(1); in = yend(1)
+      call update_halo(u2, uh, 1, opt_global=.true., opt_pencil=2)
+      call update_halo(w2, wh, 1, opt_global=.true., opt_pencil=2)
+      kfirst = ystart(3); klast = yend(3)
+      ifirst = ystart(1); ilast = yend(1)
 #else
-    call update_halo(u2,uh,1,opt_pencil=2)
-    call update_halo(w2,wh,1,opt_pencil=2)
-    k1 = 1; kn = ysize(3)
-    i1 = 1; in = ysize(1)
+      call update_halo(u2, uh, 1, opt_pencil=2)
+      call update_halo(w2, wh, 1, opt_pencil=2)
+      kfirst = 1; klast = ysize(3)
+      ifirst = 1; ilast = ysize(1)
 #endif
-    j1 = 2; jn = ysize(2) - 1
+      jfirst = 2; jlast = ysize(2) - 1
 
-    call test_halo_size(uh, nx_expected, ny_expected, nz_expected, "Y:u")
-    call test_halo_size(wh, nx_expected, ny_expected, nz_expected, "Y:w")
+      call test_halo_size(uh, nx_expected, ny_expected, nz_expected, "Y:u")
+      call test_halo_size(wh, nx_expected, ny_expected, nz_expected, "Y:w")
 
-    do k=k1,kn
-       do j=j1,jn
-          do i=i1,in
-             wk2(i,j,k) = (uh(i+1,j,k)-uh(i-1,j,k)) &
-                  + (v2(i,j+1,k)-v2(i,j-1,k)) &
-                  + (wh(i,j,k+1)-wh(i,j,k-1))
-          end do
-       end do
-    end do
+      !$acc data copy(div2)
+      !$acc kernels default(present)
+      do k = kfirst, klast
+         do j = jfirst, jlast
+            do i = ifirst, ilast
+               wk2(i, j, k) = (uh(i + 1, j, k) - uh(i - 1, j, k)) &
+                              + (v2(i, j + 1, k) - v2(i, j - 1, k)) &
+                              + (wh(i, j, k + 1) - wh(i, j, k - 1))
+            end do
+         end do
+      end do
+      !$acc end kernels
 
-    call transpose_y_to_x(wk2,div3)
+      call transpose_y_to_x(wk2, div2)
 
-    ! Compute error
-    call check_err(div3, "Y")
+      ! Compute error
+      call check_err(div2, div, "Y")
+      !$acc end data
 
-    deallocate(uh,wh,wk2)
+      deallocate (uh, wh, div2)
 
-  end subroutine test_div_haloY
+   end subroutine test_div_haloY
 
-  !=====================================================================
-  ! Calculate divergence using halo-cell exchange (data in Z-pencil)
-  !=====================================================================
-  subroutine test_div_haloZ()
+   !=====================================================================
+   ! Calculate divergence using halo-cell exchange (data in Z-pencil)
+   !=====================================================================
+   subroutine test_div_haloZ()
 
-  ! Expected sizes
-  nx_expected = zsize(1) + 2
-  ny_expected = zsize(2) + 2
-  nz_expected = nz
+      implicit none
 
-#ifdef HALO_GLOBAL
-  allocate(u3(zstart(1):zend(1), zstart(2):zend(2), zstart(3):zend(3)))
-  allocate(v3(zstart(1):zend(1), zstart(2):zend(2), zstart(3):zend(3)))
-  allocate(w3(zstart(1):zend(1), zstart(2):zend(2), zstart(3):zend(3)))
-#else
-  allocate(u3(zsize(1), zsize(2), zsize(3)))
-  allocate(v3(zsize(1), zsize(2), zsize(3)))
-  allocate(w3(zsize(1), zsize(2), zsize(3)))
-#endif
-  call transpose_y_to_z(u2,u3)
-  call transpose_y_to_z(v2,v3)
-  call transpose_y_to_z(w2,w3)
+      real(mytype), allocatable, dimension(:, :, :) :: div3
 
-#ifdef HALO_GLOBAL
-  allocate(div4(xstart(1):xend(1), xstart(2):xend(2), xstart(3):xend(3)))
-  allocate(wk2(ystart(1):yend(1), ystart(2):yend(2), ystart(3):yend(3)))
-  allocate(wk3(zstart(1):zend(1), zstart(2):zend(2), zstart(3):zend(3)))
-#else
-  allocate(div4(xsize(1), xsize(2), xsize(3)))
-  allocate(wk2(ysize(1), ysize(2), ysize(3)))
-  allocate(wk3(zsize(1), zsize(2), zsize(3)))
+      real(mytype), allocatable, dimension(:, :, :) :: uh, vh
+#if defined(_GPU)
+      attributes(device) :: vh, uh
 #endif
 
-  ! du/dx
 #ifdef HALO_GLOBAL
-  call update_halo(u3,uh,1,opt_global=.true.,opt_pencil=3)
+      logical, parameter :: global = .true.
 #else
-  call update_halo(u3,uh,1,opt_pencil=3)
+      logical, parameter :: global = .false.
 #endif
+      integer :: ifirst, ilast ! I loop start/end
+      integer :: jfirst, jlast ! J loop start/end
+      integer :: kfirst, klast ! K loop start/end
 
-  call test_halo_size(uh, nx_expected, ny_expected, nz_expected, "Z:u")
-  
-#ifdef HALO_GLOBAL
-  do j=zstart(2),zend(2)
-     do i=zstart(1),zend(1)
-#else
-  do j=1,zsize(2)
-     do i=1,zsize(1)
-#endif
-        do k=2,zsize(3)-1
-           wk3(i,j,k) = uh(i+1,j,k)-uh(i-1,j,k)
-        end do
-     end do
-  end do
+      call alloc_x(div3, opt_global=global)
 
-  ! dv/dy
-#ifdef HALO_GLOBAL
-  call update_halo(v3,vh,1,opt_global=.true.,opt_pencil=3)
-#else
-  call update_halo(v3,vh,1,opt_pencil=3)
-#endif
+      ! Expected sizes
+      nx_expected = zsize(1) + 2
+      ny_expected = zsize(2) + 2
+      nz_expected = nz
 
-  call test_halo_size(vh, nx_expected, ny_expected, nz_expected, "Z:v")
-  
-#ifdef HALO_GLOBAL
-  do j=zstart(2),zend(2)
-     do i=zstart(1),zend(1)
-#else
-  do j=1,zsize(2)
-     do i=1,zsize(1)
-#endif
-        do k=2,zsize(3)-1
-           wk3(i,j,k) = wk3(i,j,k) + vh(i,j+1,k)-vh(i,j-1,k)
-        end do
-     end do
-  end do
+      call transpose_y_to_z(u2, u3)
+      call transpose_y_to_z(v2, v3)
+      call transpose_y_to_z(w2, w3)
 
-  ! dw/dz
+      ! du/dx
 #ifdef HALO_GLOBAL
-  do j=zstart(2),zend(2)
-     do i=zstart(1),zend(1)
+      call update_halo(u3, uh, 1, opt_global=.true., opt_pencil=3)
+      call update_halo(v3, vh, 1, opt_global=.true., opt_pencil=3)
+      ifirst = zstart(1); ilast = zend(1)
+      jfirst = zstart(2); jlast = zend(2)
 #else
-  do j=1,zsize(2)
-     do i=1,zsize(1)
+      call update_halo(u3, uh, 1, opt_pencil=3)
+      call update_halo(v3, vh, 1, opt_pencil=3)
+      ifirst = 1; ilast = zsize(1)
+      jfirst = 1; jlast = zsize(2)
+#endif
+      kfirst = 2; klast = zsize(3) - 1
+
+      call test_halo_size(uh, nx_expected, ny_expected, nz_expected, "Z:u")
+      call test_halo_size(vh, nx_expected, ny_expected, nz_expected, "Z:v")
+
+      !$acc data copy(div3)
+      !$acc kernels default(present)
+      do j = jfirst, jlast
+         do i = ifirst, ilast
+            do k = kfirst, klast
+               wk3(i, j, k) = uh(i + 1, j, k) - uh(i - 1, j, k) &
+                              + vh(i, j + 1, k) - vh(i, j - 1, k) &
+                              + w3(i, j, k + 1) - w3(i, j, k - 1)
+            end do
+         end do
+      end do
+      !$acc end kernels
+
+      call transpose_z_to_y(wk3, wk2)
+      call transpose_y_to_x(wk2, div3)
+
+      ! Compute error
+      call check_err(div3, div, "Z")
+      !$acc end data
+
+      deallocate (uh, vh, div3)
+   end subroutine test_div_haloZ
+   !=====================================================================
+   ! Check the difference between halo and transpose divergence
+   !=====================================================================
+   subroutine check_err(divh, divref, pencil)
+
+      implicit none
+
+      real(mytype), dimension(:, :, :), intent(in) :: divh
+      real(mytype), dimension(:, :, :), intent(in) :: divref
+      character(len=*), intent(in) :: pencil
+      real(mytype), dimension(:, :, :), allocatable :: tmp
+      real(mytype) :: divmag, error
+#if defined(_GPU)
+      attributes(device) :: tmp
 #endif
-        do k=2,zsize(3)-1
-           wk3(i,j,k) = wk3(i,j,k) + w3(i,j,k+1)-w3(i,j,k-1)
-        end do
-     end do
-  end do
-
-  call transpose_z_to_y(wk3,wk2)
-  call transpose_y_to_x(wk2,div4)
-
-  ! Compute error
-  call check_err(div4, "Z")
-
-  deallocate(uh,vh,wk2,wk3)
-  end subroutine test_div_haloZ
-
-  subroutine check_err(divh, pencil)
-
-    real(mytype), dimension(:,:,:), intent(in) :: divh
-    character(len=*), intent(in) :: pencil
-
-    real(mytype), dimension(:,:,:), allocatable :: tmp
-
-    real(mytype) :: divmag
-
-    ! XXX: The Intel compiler SEGFAULTs if the array difference is computed inplace
-    !      i.e. mag(divh(2:xlast,2:ylast,2:zlast) - div1(2:xlast,2:ylast,2:zlast))
-    !      causes a SEGFAULT. Explicitly computing the difference in a temporary
-    !      array seems to be OK.
-    allocate(tmp(size(divh, 1), size(divh, 2), size(divh, 3)))
-    tmp(2:xlast,2:ylast,2:zlast) = divh(2:xlast,2:ylast,2:zlast) - div1(2:xlast,2:ylast,2:zlast)
-    err = mag(tmp(2:xlast,2:ylast,2:zlast))
-    deallocate(tmp)
-    divmag = mag(div1(2:xlast,2:ylast,2:zlast))
-    if (err < epsilon(divmag) * divmag) then
-       passing = .true.
-    else
-       passing = .false.
-    end if
-    all_pass = all_pass .and. passing
-
-    if (nrank==0) then
-       write(*,*) '-----------------------------------------------'
-       write(*,*) 'Calculated via halo exchange (data in '//pencil//'-pencil)'
+      ! XXX: The Intel compiler SEGFAULTs if the array difference is computed inplace
+      !      i.e. mag(divh(2:xlast,2:ylast,2:zlast) - div1(2:xlast,2:ylast,2:zlast))
+      !      causes a SEGFAULT. Explicitly computing the difference in a temporary
+      !      array seems to be OK
+      allocate (tmp(size(divh, 1), size(divh, 2), size(divh, 3)))
+
+      !$acc kernels default(present)
+      tmp(2:xlast, 2:ylast, 2:zlast) = divh(2:xlast, 2:ylast, 2:zlast) - divref(2:xlast, 2:ylast, 2:zlast)
+      !$acc end kernels
+      error = mag(tmp)
+      !$acc kernels default(present)
+      tmp(2:xlast, 2:ylast, 2:zlast) = divref(2:xlast, 2:ylast, 2:zlast)
+      !$acc end kernels
+      divmag = mag(tmp)
+
+      if (error < real(2.0, mytype) * epsilon(divmag) * divmag) then
+         passing = .true.
+      else
+         passing = .false.
+      end if
+      all_pass = all_pass .and. passing
+
+      if (nrank == 0) then
+         write (*, *) '-----------------------------------------------'
+         write (*, *) 'Calculated via halo exchange (data in '//pencil//'-pencil)'
 #ifdef DEBUG
-       write(*,*) (divh(i,i,i), i=2,13)
+         write (*, *) (divh(i, i, i), i=2, 13)
+#endif
+         write (*, *) 'Error: ', error, '; Relative: ', error / divmag
+         write (*, *) 'Pass: ', passing
+      end if
+      deallocate (tmp)
+
+   end subroutine check_err
+   !=====================================================================
+   ! Compute the magnitude af the ayyays
+   !=====================================================================
+   real(mytype) function mag(a)
+
+      implicit none
+
+      real(mytype), dimension(:, :, :), intent(in) :: a
+#if defined(_GPU)
+      attributes(device) :: a
 #endif
-       write(*,*) 'Error: ', err, '; Relative: ', err / divmag
-       write(*,*) 'Pass: ', passing
-    end if
-
-  end subroutine check_err
-
-  real(mytype) function mag(a)
-
-    real(mytype), dimension(:,:,:), intent(in) :: a
-    
-    real(mytype) :: lmag, gmag
-
-    lmag = sum(a(:,:,:)**2)
-    call MPI_Allreduce(lmag, gmag, 1, real_type, MPI_SUM, MPI_COMM_WORLD, ierror)
-    if (ierror /= 0) then
-       call decomp_2d_abort(__FILE__, __LINE__, ierror, &
-            "halo_test::mag::MPI_Allreduce")
-    endif
-
-    mag = sqrt(gmag / (nx - 2) / (ny - 2) / (nz - 2))
 
-  end function mag
+      real(mytype) :: lmag, gmag
+
+      lmag = 0._mytype
+      !$acc parallel loop default(present) collapse(3) reduction(+:lmag)
+      do k = 2, zlast
+         do j = 2, ylast
+            do i = 2, xlast
+               lmag = lmag + a(i, j, k)**2
+            end do
+         end do
+      end do
+      !$acc end parallel
+
+      call MPI_Allreduce(lmag, gmag, 1, real_type, MPI_SUM, MPI_COMM_WORLD, ierror)
+      if (ierror /= 0) then
+         call decomp_2d_abort(__FILE__, __LINE__, ierror, &
+                              "halo_test::mag::MPI_Allreduce")
+      end if
+
+      mag = sqrt(gmag / (nx - 2) / (ny - 2) / (nz - 2))
+
+   end function mag
+   !=====================================================================
+   ! Check the dimensions of the halo arrays are the one expected
+   !=====================================================================
+   subroutine test_halo_size(arrh, nx_expected, ny_expected, nz_expected, tag)
+
+      real(mytype), dimension(:, :, :), intent(in) :: arrh
+#if defined(_GPU)
+      attributes(device) :: arrh
+#endif
+      integer, intent(in) :: nx_expected, ny_expected, nz_expected
+      character(len=*), intent(in) :: tag
 
-  subroutine test_halo_size(arrh, nx_expected, ny_expected, nz_expected, tag)
+      integer :: nx, ny, nz
 
-    real(mytype), dimension(:,:,:), intent(in) :: arrh
-    integer, intent(in) :: nx_expected, ny_expected, nz_expected
-    character(len=*), intent(in) :: tag
+      character(len=128) :: rank_lbl
 
-    integer :: nx, ny, nz
+      nx = size(arrh, 1)
+      ny = size(arrh, 2)
+      nz = size(arrh, 3)
 
-    character(len=128) :: rank_lbl
+      write (rank_lbl, "(A,I0,A)") "Rank", nrank, ":"
 
-    nx = size(arrh, 1)
-    ny = size(arrh, 2)
-    nz = size(arrh, 3)
+      if ((nx /= nx_expected) .or. &
+          (ny /= ny_expected) .or. &
+          (nz /= nz_expected)) then
+         write (*, *) trim(rank_lbl), " ", tag, ":ERROR: halo size"
+         write (*, *) trim(rank_lbl), " ", "+ Expected: ", nx_expected, " ", ny_expected, " ", nz_expected, " "
+         write (*, *) trim(rank_lbl), " ", "+ Got:      ", nx, " ", ny, " ", nz, " "
 
-    write(rank_lbl, "(A,I0,A)") "Rank", nrank, ":"
+         all_pass = .false.
+      end if
 
-    if ((nx /= nx_expected) .or. &
-         (ny /= ny_expected) .or. &
-         (nz /= nz_expected)) then
-       write(*,*) trim(rank_lbl), " ", tag, ":ERROR: halo size"
-       write(*,*) trim(rank_lbl), " ", "+ Expected: ", nx_expected, " ", ny_expected, " ", nz_expected, " "
-       write(*,*) trim(rank_lbl), " ", "+ Got:      ", nx, " ", ny, " ", nz, " "
+   end subroutine test_halo_size
 
-       all_pass = .false.
-    else
-       write(*,*) trim(rank_lbl), " ", tag, ":PASS"
-    end if
-    
-  end subroutine test_halo_size
-  
 end program halo_test
diff --git a/examples/init_test/.gitignore b/examples/init_test/.gitignore
new file mode 100644
index 00000000..a00b8f97
--- /dev/null
+++ b/examples/init_test/.gitignore
@@ -0,0 +1 @@
+init_test
\ No newline at end of file
diff --git a/examples/init_test/CMakeLists.txt b/examples/init_test/CMakeLists.txt
new file mode 100644
index 00000000..dea5ad7f
--- /dev/null
+++ b/examples/init_test/CMakeLists.txt
@@ -0,0 +1,16 @@
+file(GLOB files_test init_test.f90)
+include_directories(${CMAKE_SOURCE_DIR}/src)
+
+add_executable(init_test ${files_test})
+target_link_libraries(init_test PRIVATE decomp2d)
+
+# Run the test(s)
+set(run_dir "${test_dir}/init_test")
+message(STATUS "Example dir ${run_dir}")
+file(MAKE_DIRECTORY ${run_dir})
+if (BUILD_TARGET MATCHES "gpu")
+  file(COPY bind.sh DESTINATION ${run_dir})
+  add_test(NAME init_test COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} ./bind.sh $<TARGET_FILE:init_test> ${TEST_ARGUMENTS} WORKING_DIRECTORY ${run_dir})
+else ()
+  add_test(NAME init_test COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} $<TARGET_FILE:init_test> ${TEST_ARGUMENTS} WORKING_DIRECTORY ${run_dir})
+endif()
diff --git a/examples/init_test/README.md b/examples/init_test/README.md
new file mode 100644
index 00000000..466c9ed1
--- /dev/null
+++ b/examples/init_test/README.md
@@ -0,0 +1,22 @@
+# Initialization test
+
+List of the tests:
+- [Initialization test](init_test.f90): Test for initialization process. 
+
+This example demonstrates the initialisation of DECOMP2D&FFT library and tests 
+that the size of the mesh in every direction is as expected
+
+
+What to input: The program takes max 5 inputs as: 
+
+1. p_row [optional]
+1. p_col [optional] 
+1. nx    [optional]
+1. ny    [optional]
+1. nz    [optional]
+
+If the decomposition is imposed both (1) and (2) are necessary. 
+If the resolution is imposed (1-5) are necessary
+
+What to expect: Success/error message if initialization process has been complted or 
+                errors have been encountered.
diff --git a/examples/init_test/bind.sh b/examples/init_test/bind.sh
new file mode 100755
index 00000000..38d4fedf
--- /dev/null
+++ b/examples/init_test/bind.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+
+export LOCAL_RANK=${OMPI_COMM_WORLD_LOCAL_RANK}
+export CUDA_VISIBLE_DEVICES=${LOCAL_RANK}
+
+echo "[LOG] local rank $LOCAL_RANK: bind to $CUDA_VISIBLE_DEVICES"
+echo ""
+
+$* 
+
diff --git a/examples/init_test/init_test.f90 b/examples/init_test/init_test.f90
new file mode 100644
index 00000000..9f620015
--- /dev/null
+++ b/examples/init_test/init_test.f90
@@ -0,0 +1,129 @@
+!! SPDX-License-Identifier: BSD-3-Clause
+!!!=====================================================
+!!!! init_test.f90
+!!!  Tests initialising the 2decomp&fft library.
+!!!=====================================================
+
+program init_test
+
+   use MPI
+   use decomp_2d_constants
+   use decomp_2d_mpi
+   use decomp_2d
+#if defined(_GPU)
+   use cudafor
+   use openacc
+#endif
+
+   implicit none
+
+   integer, parameter :: nx_base = 17, ny_base = 13, nz_base = 11
+   integer :: nx, ny, nz
+   integer :: p_row = 0, p_col = 0
+   integer :: resize_domain
+   integer :: nranks_tot
+   integer :: nargin, arg, FNLength, status, DecInd
+   character(len=80) :: InputFN
+   integer :: nexpect
+   integer :: ierror
+
+   call MPI_Init(ierror)
+
+   ! To resize the domain we need to know global number of ranks
+   ! This operation is also done as part of decomp_2d_init
+   call MPI_COMM_SIZE(MPI_COMM_WORLD, nranks_tot, ierror)
+   resize_domain = int(nranks_tot / 4) + 1
+   nx = nx_base * resize_domain
+   ny = ny_base * resize_domain
+   nz = nz_base * resize_domain
+   ! Now we can check if user put some inputs
+   ! Handle input file like a boss -- GD
+   nargin = command_argument_count()
+   if ((nargin == 0) .or. (nargin == 2) .or. (nargin == 5)) then
+      do arg = 1, nargin
+         call get_command_argument(arg, InputFN, FNLength, status)
+         read (InputFN, *, iostat=status) DecInd
+         if (arg == 1) then
+            p_row = DecInd
+         elseif (arg == 2) then
+            p_col = DecInd
+         elseif (arg == 3) then
+            nx = DecInd
+         elseif (arg == 4) then
+            ny = DecInd
+         elseif (arg == 5) then
+            nz = DecInd
+         end if
+      end do
+   else
+      ! nrank not yet computed we need to avoid write
+      ! for every rank
+      call MPI_COMM_RANK(MPI_COMM_WORLD, nrank, ierror)
+      if (nrank == 0) then
+         print *, "This Test takes no inputs or 2 inputs as"
+         print *, "  1) p_row (default=0)"
+         print *, "  2) p_col (default=0)"
+         print *, "or 5 inputs as"
+         print *, "  1) p_row (default=0)"
+         print *, "  2) p_col (default=0)"
+         print *, "  3) nx "
+         print *, "  4) ny "
+         print *, "  5) nz "
+         print *, "Number of inputs is not correct and the defult settings"
+         print *, "will be used"
+      end if
+   end if
+   nexpect = nx * ny * nz
+   call run(p_row, p_col)
+
+   call MPI_Finalize(ierror)
+
+contains
+
+   subroutine run(p_row, p_col)
+
+      integer, intent(inout) :: p_row, p_col
+
+      call decomp_2d_init(nx, ny, nz, p_row, p_col)
+
+      call check_axis("X")
+      call check_axis("Y")
+      call check_axis("Z")
+
+      call decomp_2d_finalize()
+
+   end subroutine run
+
+   subroutine check_axis(axis)
+
+      character(len=*), intent(in) :: axis
+
+      integer :: suml
+      integer :: sumg
+      integer, dimension(3) :: sizes
+
+      if (axis == "X") then
+         sizes = xsize
+      else if (axis == "Y") then
+         sizes = ysize
+      else if (axis == "Z") then
+         sizes = zsize
+      else
+         sizes = 0
+         if (nrank == 0) print *, "ERROR: unknown axis requested!"
+         stop 1
+      end if
+
+      suml = product(sizes)
+      call MPI_Allreduce(suml, sumg, 1, MPI_INTEGER, MPI_SUM, MPI_COMM_WORLD, ierror)
+
+      if (sumg /= nexpect) then
+         if (nrank == 0) print *, "ERROR: got ", sumg, " nodes, expected ", nexpect
+         stop 1
+      else
+         if (nrank == 0) print *, "Init Test pass for axis ", axis
+      end if
+
+   end subroutine check_axis
+
+end program init_test
diff --git a/examples/io_test/CMakeLists.txt b/examples/io_test/CMakeLists.txt
new file mode 100644
index 00000000..4bada169
--- /dev/null
+++ b/examples/io_test/CMakeLists.txt
@@ -0,0 +1,95 @@
+file(GLOB files_test io_test.f90)
+file(GLOB files_read io_read.f90)
+file(GLOB files_tmp_test io_tmp_test.f90)
+file(GLOB files_var_test io_var_test.f90)
+file(GLOB files_plane_test io_plane_test.f90)
+file(GLOB files_bench io_bench.f90)
+file(GLOB files_visu io_visu.f90)
+
+include_directories(${CMAKE_SOURCE_DIR}/src)
+
+add_executable(io_test ${files_test})
+add_executable(io_tmp_test ${files_tmp_test})
+add_executable(io_read ${files_read})
+add_executable(io_var_test ${files_var_test})
+add_executable(io_plane_test ${files_plane_test})
+add_executable(io_bench ${files_bench})
+add_executable(io_visu ${files_visu})
+
+target_link_libraries(io_test PRIVATE decomp2d)
+target_link_libraries(io_tmp_test PRIVATE decomp2d)
+target_link_libraries(io_read PRIVATE decomp2d)
+target_link_libraries(io_var_test PRIVATE decomp2d)
+target_link_libraries(io_plane_test PRIVATE decomp2d)
+target_link_libraries(io_bench PRIVATE decomp2d)
+target_link_libraries(io_visu PRIVATE decomp2d)
+
+# Run the test(s)
+# Note visu is not a test - it is an example to show/create visu files
+set(run_dir "${test_dir}/io_test")
+file (COPY "${CMAKE_SOURCE_DIR}/examples/io_test/adios2_config.xml" DESTINATION ${run_dir})
+message(STATUS "Example dir ${run_dir}")
+file(MAKE_DIRECTORY ${run_dir})
+if (BUILD_TARGET MATCHES "gpu")
+  file(COPY bind.sh DESTINATION ${run_dir})
+  add_test(NAME io_test COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} ./bind.sh $<TARGET_FILE:io_test> ${TEST_ARGUMENTS} WORKING_DIRECTORY ${run_dir})
+  add_test(NAME io_read COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} ./bind.sh $<TARGET_FILE:io_read> ${TEST_ARGUMENTS} WORKING_DIRECTORY ${run_dir})
+else ()
+  add_test(NAME io_test COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} $<TARGET_FILE:io_test> ${TEST_ARGUMENTS} WORKING_DIRECTORY ${run_dir})
+  add_test(NAME io_read COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} $<TARGET_FILE:io_read> ${TEST_ARGUMENTS} WORKING_DIRECTORY ${run_dir})
+endif ()
+# Run the test(s)
+set(run_dir "${test_dir}/io_tmp_test")
+file (COPY "${CMAKE_SOURCE_DIR}/examples/io_test/adios2_config.xml" DESTINATION ${run_dir})
+message(STATUS "Example dir ${run_dir}")
+file(MAKE_DIRECTORY ${run_dir})
+if (BUILD_TARGET MATCHES "gpu")
+  file(COPY bind.sh DESTINATION ${run_dir})
+  add_test(NAME io_tmp_test COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} ./bind.sh $<TARGET_FILE:io_tmp_test> ${TEST_ARGUMENTS} WORKING_DIRECTORY ${run_dir})
+else ()
+  add_test(NAME io_tmp_test COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} $<TARGET_FILE:io_tmp_test> ${TEST_ARGUMENTS} WORKING_DIRECTORY ${run_dir})
+endif ()
+# Run the test(s)
+set(run_dir "${test_dir}/io_var_test")
+file (COPY "${CMAKE_SOURCE_DIR}/examples/io_test/adios2_config.xml" DESTINATION ${run_dir})
+message(STATUS "Example dir ${run_dir}")
+file(MAKE_DIRECTORY ${run_dir})
+if (BUILD_TARGET MATCHES "gpu")
+  file(COPY bind.sh DESTINATION ${run_dir})
+  add_test(NAME io_var_test COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} ./bind.sh $<TARGET_FILE:io_var_test> ${TEST_ARGUMENTS} WORKING_DIRECTORY ${run_dir})
+else ()
+  add_test(NAME io_var_test COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} $<TARGET_FILE:io_var_test> ${TEST_ARGUMENTS} WORKING_DIRECTORY ${run_dir})
+endif ()
+# Run the test(s)
+set(run_dir "${test_dir}/io_plane_test")
+file (COPY "${CMAKE_SOURCE_DIR}/examples/io_test/adios2_config.xml" DESTINATION ${run_dir})
+message(STATUS "Example dir ${run_dir}")
+file(MAKE_DIRECTORY ${run_dir})
+if (BUILD_TARGET MATCHES "gpu")
+  file(COPY bind.sh DESTINATION ${run_dir})
+  add_test(NAME io_plane_test COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} ./bind.sh $<TARGET_FILE:io_plane_test> ${TEST_ARGUMENTS} WORKING_DIRECTORY ${run_dir})
+else ()
+  add_test(NAME io_plane_test COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} $<TARGET_FILE:io_plane_test> ${TEST_ARGUMENTS} WORKING_DIRECTORY ${run_dir})
+endif ()
+# Run the test(s)
+set(run_dir "${test_dir}/io_bench")
+file (COPY "${CMAKE_SOURCE_DIR}/examples/io_test/adios2_config.xml" DESTINATION ${run_dir})
+message(STATUS "Example dir ${run_dir}")
+file(MAKE_DIRECTORY ${run_dir})
+if (BUILD_TARGET MATCHES "gpu")
+  file(COPY bind.sh DESTINATION ${run_dir})
+  add_test(NAME io_bench COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} ./bind.sh $<TARGET_FILE:io_bench> ${TEST_ARGUMENTS} WORKING_DIRECTORY ${run_dir})
+else ()
+  add_test(NAME io_bench COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} $<TARGET_FILE:io_bench> ${TEST_ARGUMENTS} WORKING_DIRECTORY ${run_dir})
+endif ()
+# Run the test(s)
+set(run_dir "${test_dir}/io_visu")
+file (COPY "${CMAKE_SOURCE_DIR}/examples/io_test/adios2_config.xml" DESTINATION ${run_dir})
+message(STATUS "Example dir ${run_dir}")
+file(MAKE_DIRECTORY ${run_dir})
+if (BUILD_TARGET MATCHES "gpu")
+  file(COPY bind.sh DESTINATION ${run_dir})
+  add_test(NAME io_visu COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} ./bind.sh $<TARGET_FILE:io_visu> ${TEST_ARGUMENTS} WORKING_DIRECTORY ${run_dir})
+else ()
+  add_test(NAME io_visu COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} $<TARGET_FILE:io_visu> ${TEST_ARGUMENTS} WORKING_DIRECTORY ${run_dir})
+endif ()
diff --git a/examples/io_test/Makefile b/examples/io_test/Makefile
deleted file mode 100644
index 26a7ab61..00000000
--- a/examples/io_test/Makefile
+++ /dev/null
@@ -1,56 +0,0 @@
-#include ../../src/Makefile.inc
-
-#INCLUDE = -I../../include
-FFLAGS := $(subst $(MODFLAG),$(MODFLAG)../../,$(FFLAGS))
-FFLAGS := $(patsubst -I%,-I../../%,$(FFLAGS))
-LIBS = -L../../ -l$(LIBDECOMP) $(LFLAGS)
-
-NP ?= 1
-MPIRUN ?= mpirun
-NROW ?= 0
-NCOL ?= 0
-
-all: $(DECOMPINC) io_test io_read io_var_test io_plane_test io_bench
-
-$(DECOMPINC):
-	mkdir $@
-
-io_test: io_test.o
-	$(FC) $(FFLAGS) $(OPT) $(DEFS) $(INC) -o $@ $< $(OBJ) $(LIBS)
-
-io_read: io_read.o
-	$(FC) $(FFLAGS) $(OPT) $(DEFS) $(INC) -o $@ $< $(OBJ) $(LIBS)
-
-io_var_test: io_var_test.o
-	$(FC) $(FFLAGS) $(OPT) $(DEFS) $(INC) -o $@ $< $(OBJ) $(LIBS)
-
-io_plane_test: io_plane_test.o
-	$(FC) $(FFLAGS) $(OPT) $(DEFS) $(INC) -o $@ $< $(OBJ) $(LIBS)
-
-io_bench: io_bench.o
-	$(FC) $(FFLAGS) $(OPT) $(DEFS) $(INC) -o $@ $< $(OBJ) $(LIBS)
-
-ifeq ($(PARAMOD),gpu)
-check:
-	$(MPIRUN) -n $(NP) ./bind.sh ./io_test
-	$(MPIRUN) -n $(NP) ./bind.sh ./io_read
-	$(MPIRUN) -n $(NP) ./bind.sh ./io_var_test $(NROW) $(NCOL)
-	$(MPIRUN) -n $(NP) ./bind.sh ./io_plane_test
-	$(MPIRUN) -n $(NP) ./bind.sh ./io_bench
-else
-check:
-	$(MPIRUN) -n $(NP) ./io_test
-	$(MPIRUN) -n $(NP) ./io_read
-	$(MPIRUN) -n $(NP) ./io_var_test $(NROW) $(NCOL)
-	$(MPIRUN) -n $(NP) ./io_plane_test
-	$(MPIRUN) -n $(NP) ./io_bench
-endif
-
-clean:
-	rm -f *.o io_test io_read io_var_test io_plane_test io_bench
-
-realclean: clean
-	rm -f *.dat io_var_data.* *.log
-
-%.o : %.f90
-	$(FC) $(FFLAGS) $(OPT) $(DEFS) $(INC) -c $< -o $@
diff --git a/examples/io_test/README.md b/examples/io_test/README.md
new file mode 100644
index 00000000..637e5064
--- /dev/null
+++ b/examples/io_test/README.md
@@ -0,0 +1,28 @@
+# IO tests
+
+These examples demostrate the use of the different I/O features of DECOMP2D&FFT
+using both MPI I/O and ADIOS2. The tests performed are the following: 
+- [Write Test](io_test.f90): write the files u1.dat, u2.dat and u3.dat defined in X, Y and Z pencils respectively
+                             and write these files in a combined single file (used for checkpointing)
+                             The program also checks that the written files are correct;
+- [Read Test](io_read.f90): read files u1.dat, u2.dat and u3.dat which were written by io_test and check that they are the expected ones;
+- [Write Real/Complex variables](io_var_test.f90): test the writing of real and complex data for scalar and 3D arrays 
+                                                   and for different resolutions;
+- [Write 2D planes files](io_plane_test.f90): test the writing of 2D planes in different direction;  
+- [Timing IO](io_bench.f90): timing the writing on disk of a 3D array.
+
+
+What to input: The program takes max 5 inputs as : 
+
+1. p_row [optional]
+1. p_col [optional] 
+1. nx    [optional]
+1. ny    [optional]
+1. nz    [optional]
+
+If the decomposition is imposed both (1) and (2) are necessary. 
+If the resolution is imposed (1-5) are necessary.
+
+What to expect: 
+- All programs print out a success message otherwise an error message with location of the error;
+- io_bench gives also the timing of the writing on disk.
diff --git a/examples/io_test/adios2_config.xml b/examples/io_test/adios2_config.xml
new file mode 100644
index 00000000..69867f4f
--- /dev/null
+++ b/examples/io_test/adios2_config.xml
@@ -0,0 +1,27 @@
+<?xml version="1.0"?>
+<adios-config>
+  <io name="test-io">
+    <engine type="BP4">
+    </engine>
+    <transport type="File">
+      <parameter key="Library" value="fstream"/>
+      <parameter key="ProfileUnits" value="Milliseconds"/>
+    </transport>
+  </io>
+  <io name="restart-io">
+    <engine type="BP4">
+    </engine>
+    <transport type="File">
+      <parameter key="Library" value="fstream"/>
+      <parameter key="ProfileUnits" value="Milliseconds"/>
+    </transport>
+  </io>
+  <io name="visu-io">
+    <engine type="BP4">
+    </engine>
+    <transport type="File">
+      <parameter key="Library" value="fstream"/>
+      <parameter key="ProfileUnits" value="Milliseconds"/>
+    </transport>
+  </io>
+</adios-config>
diff --git a/examples/io_test/io_bench.f90 b/examples/io_test/io_bench.f90
index a8e05390..a44a2e4a 100644
--- a/examples/io_test/io_bench.f90
+++ b/examples/io_test/io_bench.f90
@@ -1,36 +1,96 @@
+!! SPDX-License-Identifier: BSD-3-Clause
 program io_bench
 
-  use decomp_2d
-  use decomp_2d_io
-  use MPI
+   use decomp_2d_constants
+   use decomp_2d_mpi
+   use decomp_2d
+   use decomp_2d_io
+   use MPI
+#if defined(_GPU)
+   use cudafor
+   use openacc
+#endif
 
-  implicit none
-  
-  integer, parameter :: nx=100, ny=100, nz=100
-  integer :: p_row=0, p_col=0
+   implicit none
 
-  real(mytype), allocatable, dimension(:,:,:) :: u1
-  
-  double precision :: t1, t2
-  integer :: ierror
+   integer, parameter :: nx_base = 17, ny_base = 13, nz_base = 11
+   integer :: nx, ny, nz
+   integer :: p_row = 0, p_col = 0
+   integer :: resize_domain
+   integer :: nranks_tot
+   integer :: nargin, arg, FNLength, status, DecInd
+   character(len=80) :: InputFN
 
-  call MPI_INIT(ierror)
-  call MPI_COMM_SIZE(MPI_COMM_WORLD, nproc, ierror)
-  call MPI_COMM_RANK(MPI_COMM_WORLD, nrank, ierror)
-  call decomp_2d_init(nx,ny,nz,p_row,p_col)
+   real(mytype), allocatable, dimension(:, :, :) :: u1
 
-  allocate(u1(xstart(1):xend(1), xstart(2):xend(2), xstart(3):xend(3)))
-  call random_number(u1)
+   double precision :: t1, t2
+   integer :: ierror
 
-  t1 = MPI_WTIME()
-  call decomp_2d_write_one(1,u1,'io.dat')
-  t2 = MPI_WTIME()
+   call MPI_INIT(ierror)
+   ! To resize the domain we need to know global number of ranks
+   ! This operation is also done as part of decomp_2d_init
+   call MPI_COMM_SIZE(MPI_COMM_WORLD, nranks_tot, ierror)
+   resize_domain = int(nranks_tot / 4) + 1
+   nx = nx_base * resize_domain
+   ny = ny_base * resize_domain
+   nz = nz_base * resize_domain
+   ! Now we can check if user put some inputs
+   ! Handle input file like a boss -- GD
+   nargin = command_argument_count()
+   if ((nargin == 0) .or. (nargin == 2) .or. (nargin == 5)) then
+      do arg = 1, nargin
+         call get_command_argument(arg, InputFN, FNLength, status)
+         read (InputFN, *, iostat=status) DecInd
+         if (arg == 1) then
+            p_row = DecInd
+         elseif (arg == 2) then
+            p_col = DecInd
+         elseif (arg == 3) then
+            nx = DecInd
+         elseif (arg == 4) then
+            ny = DecInd
+         elseif (arg == 5) then
+            nz = DecInd
+         end if
+      end do
+   else
+      ! nrank not yet computed we need to avoid write
+      ! for every rank
+      call MPI_COMM_RANK(MPI_COMM_WORLD, nrank, ierror)
+      if (nrank == 0) then
+         print *, "This Test takes no inputs or 2 inputs as"
+         print *, "  1) p_row (default=0)"
+         print *, "  2) p_col (default=0)"
+         print *, "or 5 inputs as"
+         print *, "  1) p_row (default=0)"
+         print *, "  2) p_col (default=0)"
+         print *, "  3) nx "
+         print *, "  4) ny "
+         print *, "  5) nz "
+         print *, "Number of inputs is not correct and the defult settings"
+         print *, "will be used"
+      end if
+   end if
 
-  if (nrank==0) write(*,*) 'I/O time: ', t2-t1
+   call decomp_2d_init(nx, ny, nz, p_row, p_col)
 
-  call decomp_2d_finalize 
-  call MPI_FINALIZE(ierror)
-  deallocate(u1)
+   call alloc_x(u1, opt_global=.true.)
+   call random_number(u1)
+
+   t1 = MPI_WTIME()
+   call decomp_2d_write_one(1, u1, 'io.dat')
+   t2 = MPI_WTIME()
+
+   if (nrank == 0) then
+      write (*, *) 'I/O time: ', t2 - t1
+      write (*, *) '   '
+      write (*, *) 'IO_bench completed '
+      write (*, *) '   '
+   end if
+
+   deallocate (u1)
+   call decomp_2d_finalize
+   call MPI_FINALIZE(ierror)
 
 end program io_bench
-  
+
diff --git a/examples/io_test/io_plane_test.f90 b/examples/io_test/io_plane_test.f90
index 31f3d366..ce0165fe 100644
--- a/examples/io_test/io_plane_test.f90
+++ b/examples/io_test/io_plane_test.f90
@@ -1,122 +1,249 @@
+!! SPDX-License-Identifier: BSD-3-Clause
+
+#define unused(x) associate(tmp => x); end associate
+
+!!
+!! FIXME The issue below is specific to GPU and should be discussed in a dedicated github issue
+!!
+!! NB in case of GPU only the writing in the aligned pencil (i.e. X for a 1 array) is performed.
+!! IO subrotines needs update for non managed GPU case
+!!
 program io_plane_test
 
-  use mpi
-
-  use decomp_2d
-  use decomp_2d_io
-
-  implicit none
-
-  integer, parameter :: nx=17, ny=13, nz=11
-  integer :: p_row=0, p_col=0
-
-  real(mytype), dimension(nx,ny,nz) :: data1
-  real(mytype), allocatable, dimension(:,:,:) :: u1, u2, u3
-
-  real(mytype), allocatable, dimension(:,:,:) :: work
-  
-  integer :: i,j,k, m, ierror, iol
-
-  call MPI_INIT(ierror)
-  call MPI_COMM_SIZE(MPI_COMM_WORLD, nproc, ierror)
-  call MPI_COMM_RANK(MPI_COMM_WORLD, nrank, ierror)
-  call decomp_2d_init(nx,ny,nz,p_row,p_col)
-  
-  ! ***** global data *****
-  m = 1
-  do k=1,nz
-     do j=1,ny
-        do i=1,nx
-           data1(i,j,k) = real(m,mytype)
-           m = m+1
-        end do
-     end do
-  end do
-
-  allocate(u1(xstart(1):xend(1), xstart(2):xend(2), xstart(3):xend(3)))
-  allocate(u2(ystart(1):yend(1), ystart(2):yend(2), ystart(3):yend(3)))
-  allocate(u3(zstart(1):zend(1), zstart(2):zend(2), zstart(3):zend(3)))
-
-  ! original X-pensil based data 
-  do k=xstart(3),xend(3)
-    do j=xstart(2),xend(2)
-      do i=xstart(1),xend(1)
-        u1(i,j,k) = data1(i,j,k)
+   use mpi
+   use decomp_2d_constants
+   use decomp_2d_mpi
+   use decomp_2d
+   use decomp_2d_io
+#if defined(_GPU)
+   use cudafor
+   use openacc
+#endif
+
+   implicit none
+
+   integer, parameter :: nx_base = 17, ny_base = 13, nz_base = 11
+   integer :: nx, ny, nz
+   integer :: p_row = 0, p_col = 0
+   integer :: resize_domain
+   integer :: nranks_tot
+   integer :: nargin, arg, FNLength, status, DecInd
+   character(len=80) :: InputFN
+
+   real(mytype), allocatable, dimension(:, :, :) :: data1
+   real(mytype), allocatable, dimension(:, :, :) :: u1, u2, u3
+
+   real(mytype), allocatable, dimension(:, :, :) :: work
+
+   integer :: i, j, k, m, ierror, iol
+   integer :: xst1, xst2, xst3
+   integer :: xen1, xen2, xen3
+   logical :: found
+
+   character(len=*), parameter :: io_name = "test-io"
+
+   call MPI_INIT(ierror)
+   ! To resize the domain we need to know global number of ranks
+   ! This operation is also done as part of decomp_2d_init
+   call MPI_COMM_SIZE(MPI_COMM_WORLD, nranks_tot, ierror)
+   resize_domain = int(nranks_tot / 4) + 1
+   nx = nx_base * resize_domain
+   ny = ny_base * resize_domain
+   nz = nz_base * resize_domain
+   ! Now we can check if user put some inputs
+   ! Handle input file like a boss -- GD
+   nargin = command_argument_count()
+   if ((nargin == 0) .or. (nargin == 2) .or. (nargin == 5)) then
+      do arg = 1, nargin
+         call get_command_argument(arg, InputFN, FNLength, status)
+         read (InputFN, *, iostat=status) DecInd
+         if (arg == 1) then
+            p_row = DecInd
+         elseif (arg == 2) then
+            p_col = DecInd
+         elseif (arg == 3) then
+            nx = DecInd
+         elseif (arg == 4) then
+            ny = DecInd
+         elseif (arg == 5) then
+            nz = DecInd
+         end if
+      end do
+   else
+      ! nrank not yet computed we need to avoid write
+      ! for every rank
+      call MPI_COMM_RANK(MPI_COMM_WORLD, nrank, ierror)
+      if (nrank == 0) then
+         print *, "This Test takes no inputs or 2 inputs as"
+         print *, "  1) p_row (default=0)"
+         print *, "  2) p_col (default=0)"
+         print *, "or 5 inputs as"
+         print *, "  1) p_row (default=0)"
+         print *, "  2) p_col (default=0)"
+         print *, "  3) nx "
+         print *, "  4) ny "
+         print *, "  5) nz "
+         print *, "Number of inputs is not correct and the defult settings"
+         print *, "will be used"
+      end if
+   end if
+
+   call decomp_2d_init(nx, ny, nz, p_row, p_col)
+
+   call decomp_2d_io_init()
+   call decomp_2d_init_io(io_name)
+   call decomp_2d_register_variable(io_name, "x_pencil-x_plane.dat", 1, 0, 1, mytype)
+   call decomp_2d_register_variable(io_name, "x_pencil-y_plane.dat", 1, 0, 2, mytype)
+   call decomp_2d_register_variable(io_name, "x_pencil-z_plane.dat", 1, 0, 3, mytype)
+   call decomp_2d_register_variable(io_name, "y_pencil-x_plane.dat", 2, 0, 1, mytype)
+   call decomp_2d_register_variable(io_name, "y_pencil-y_plane.dat", 2, 0, 2, mytype)
+   call decomp_2d_register_variable(io_name, "y_pencil-z_plane.dat", 2, 0, 3, mytype)
+   call decomp_2d_register_variable(io_name, "z_pencil-x_plane.dat", 3, 0, 1, mytype)
+   call decomp_2d_register_variable(io_name, "z_pencil-y_plane.dat", 3, 0, 2, mytype)
+   call decomp_2d_register_variable(io_name, "z_pencil-z_plane.dat", 3, 0, 3, mytype)
+
+   ! ***** global data *****
+   allocate (data1(nx, ny, nz))
+   m = 1
+   do k = 1, nz
+      do j = 1, ny
+         do i = 1, nx
+            data1(i, j, k) = real(m, mytype)
+            m = m + 1
+         end do
       end do
-    end do
-  end do
-  call decomp_2d_write_plane(1,u1,1,nx/2,'.','x_pencil-x_plane.dat','test')
-  call decomp_2d_write_plane(1,u1,2,ny/2,'.','x_pencil-y_plane.dat','test')
-  call decomp_2d_write_plane(1,u1,3,nz/2,'.','x_pencil-z_plane.dat','test')
-
-  ! Y-pencil data
-  call transpose_x_to_y(u1,u2)
-  call decomp_2d_write_plane(2,u2,1,nx/2,'.','y_pencil-x_plane.dat','test')
-  call decomp_2d_write_plane(2,u2,2,ny/2,'.','y_pencil-y_plane.dat','test')
-  call decomp_2d_write_plane(2,u2,3,nz/2,'.','y_pencil-z_plane.dat','test')
-
-  ! Z-pencil data
-  call transpose_y_to_z(u2,u3)
-  call decomp_2d_write_plane(3,u3,1,nx/2,'.','z_pencil-x_plane.dat','test')
-  call decomp_2d_write_plane(3,u3,2,ny/2,'.','z_pencil-y_plane.dat','test')
-  call decomp_2d_write_plane(3,u3,3,nz/2,'.','z_pencil-z_plane.dat','test')
-
-  ! Attemp to read the files
-  if (nrank==0) then
-     inquire(iolength=iol) data1(1,1,1)
-
-     ! X-plane
-     allocate(work(1,ny,nz))
-     open(10, FILE='x_pencil-x_plane.dat', FORM='unformatted', &
-          ACCESS='DIRECT', RECL=iol)
-     m=1
-     do k=1,nz
-        do j=1,ny
-           read(10,rec=m) work(1,j,k)
-           m=m+1
-        end do
-     end do
-     write(*,*) ' '
-     write(*,'(15I5)') int(work)
-     close(10)
-     deallocate(work)
-
-     ! Y-plane
-     allocate(work(nx,1,nz))
-     open(10, FILE='x_pencil-y_plane.dat', FORM='unformatted', &
-          ACCESS='DIRECT', RECL=iol)
-     m=1
-     do k=1,nz
-        do i=1,nx
-           read(10,rec=m) work(i,1,k)
-           m=m+1
-        end do
-     end do
-     write(*,*) ' '
-     write(*,'(15I5)') int(work)
-     close(10)
-     deallocate(work)
-
-     ! Z-plane
-     allocate(work(nx,ny,1))
-     open(10, FILE='x_pencil-z_plane.dat', FORM='unformatted', &
-          ACCESS='DIRECT', RECL=iol)
-     m=1
-     do j=1,ny
-        do i=1,nx
-           read(10,rec=m) work(i,j,1)
-           m=m+1
-        end do
-     end do
-     write(*,*) ' '
-     write(*,'(15I5)') int(work)
-     close(10)
-     deallocate(work)
-
-  end if
-
-  call decomp_2d_finalize 
-  call MPI_FINALIZE(ierror)
-  deallocate(u1,u2,u3)
-  
+   end do
+
+   call alloc_x(u1, opt_global=.true.)
+   call alloc_y(u2, opt_global=.true.)
+   call alloc_z(u3, opt_global=.true.)
+
+   ! For GPU we port the global data create the different pencil arrays
+   ! Move back to host the arrays for writing on disk
+   xst1 = xstart(1); xen1 = xend(1)
+   xst2 = xstart(2); xen2 = xend(2)
+   xst3 = xstart(3); xen3 = xend(3)
+
+   !$acc data copyin(data1) copy(u1,u2,u3)
+   ! original X-pensil based data
+   !$acc parallel loop default(present)
+   do k = xst3, xen3
+      do j = xst2, xen2
+         do i = xst1, xen1
+            u1(i, j, k) = data1(i, j, k)
+         end do
+      end do
+   end do
+   !$acc end loop
+   call transpose_x_to_y(u1, u2)
+   call transpose_y_to_z(u2, u3)
+   !$acc update self(u1)
+   !$acc update self(u2)
+   !$acc update self(u3)
+   !$acc end data
+   ! X-pencil data
+#ifdef ADIOS2
+   call decomp_2d_open_io(io_name, "out", decomp_2d_write_mode)
+   call decomp_2d_start_io(io_name, "out")
+#endif
+   call decomp_2d_write_plane(1, u1, 1, nx / 2, 'out', 'x_pencil-x_plane.dat', io_name)
+   call decomp_2d_write_plane(1, u1, 2, ny / 2, 'out', 'x_pencil-y_plane.dat', io_name)
+   call decomp_2d_write_plane(1, u1, 3, nz / 2, 'out', 'x_pencil-z_plane.dat', io_name)
+   ! Y-pencil data
+   call decomp_2d_write_plane(2, u2, 2, ny / 2, 'out', 'y_pencil-y_plane.dat', io_name)
+   call decomp_2d_write_plane(2, u2, 1, nx / 2, 'out', 'y_pencil-x_plane.dat', io_name)
+   call decomp_2d_write_plane(2, u2, 3, nz / 2, 'out', 'y_pencil-z_plane.dat', io_name)
+   ! Z-pencil data
+   call decomp_2d_write_plane(3, u3, 1, nx / 2, 'out', 'z_pencil-x_plane.dat', io_name)
+   call decomp_2d_write_plane(3, u3, 2, ny / 2, 'out', 'z_pencil-y_plane.dat', io_name)
+   call decomp_2d_write_plane(3, u3, 3, nz / 2, 'out', 'z_pencil-z_plane.dat', io_name)
+#ifdef ADIOS2
+   call decomp_2d_end_io(io_name, "out")
+   call decomp_2d_close_io(io_name, "out")
+#endif
+
+#ifndef ADIOS2
+   ! Attemp to read the files
+   if (nrank == 0) then
+      inquire (iolength=iol) data1(1, 1, 1)
+
+      ! X-plane
+      inquire (file='./out/x_pencil-x_plane.dat', exist=found)
+      if (found) then
+         allocate (work(1, ny, nz))
+         open (10, FILE='./out/x_pencil-x_plane.dat', FORM='unformatted', &
+               ACCESS='DIRECT', RECL=iol)
+         m = 1
+         do k = 1, nz
+            do j = 1, ny
+               read (10, rec=m) work(1, j, k)
+               m = m + 1
+            end do
+         end do
+!        write(*,*) ' '
+!        write(*,'(15I5)') int(work)
+         close (10)
+         deallocate (work)
+         write (*, *) 'passed self test x-plane'
+      else
+         write (*, *) "Warning : x_pencil-x_plane.dat is missing"
+      end if
+
+      ! Y-plane
+      inquire (file='./out/x_pencil-y_plane.dat', exist=found)
+      if (found) then
+         allocate (work(nx, 1, nz))
+         open (10, FILE='./out/x_pencil-y_plane.dat', FORM='unformatted', &
+               ACCESS='DIRECT', RECL=iol)
+         m = 1
+         do k = 1, nz
+            do i = 1, nx
+               read (10, rec=m) work(i, 1, k)
+               m = m + 1
+            end do
+         end do
+!        write(*,*) ' '
+!        write(*,'(15I5)') int(work)
+         close (10)
+         deallocate (work)
+         write (*, *) 'passed self test y-plane'
+      else
+         write (*, *) 'Warning : x_pencil-y_plane.dat is missing'
+      end if
+
+      ! Z-plane
+      inquire (file='./out/x_pencil-z_plane.dat', exist=found)
+      if (found) then
+         allocate (work(nx, ny, 1))
+         open (10, FILE='./out/x_pencil-z_plane.dat', FORM='unformatted', &
+               ACCESS='DIRECT', RECL=iol)
+         m = 1
+         do j = 1, ny
+            do i = 1, nx
+               read (10, rec=m) work(i, j, 1)
+               m = m + 1
+            end do
+         end do
+!        write(*,*) ' '
+!        write(*,'(15I5)') int(work)
+         close (10)
+         deallocate (work)
+         write (*, *) 'passed self test z-plane'
+      else
+         write (*, *) 'Warning : x_pencil-z_plane.dat is missing'
+      end if
+
+   end if
+#else
+   ! Avoid unused variables
+   unused(found)
+   unused(work)
+   unused(iol)
+#endif
+
+   deallocate (u1, u2, u3)
+   deallocate (data1)
+   call decomp_2d_finalize
+   call MPI_FINALIZE(ierror)
+
 end program io_plane_test
diff --git a/examples/io_test/io_read.f90 b/examples/io_test/io_read.f90
index 38c9cb85..ea25d2e1 100644
--- a/examples/io_test/io_read.f90
+++ b/examples/io_test/io_read.f90
@@ -1,86 +1,172 @@
 program io_read
 
-  use mpi
-
-  use decomp_2d
-  use decomp_2d_io
+   use mpi
+   use decomp_2d_constants
+   use decomp_2d_mpi
+   use decomp_2d
+   use decomp_2d_io
+#if defined(_GPU)
+   use cudafor
+   use openacc
+#endif
 
-  implicit none
+   implicit none
 
-  integer, parameter :: nx=17, ny=13, nz=11
-  ! use different number of processes
-  integer :: p_row=0, p_col=0
+   integer, parameter :: nx_base = 17, ny_base = 13, nz_base = 11
+   integer :: nx, ny, nz
+   integer :: p_row = 0, p_col = 0
+   integer :: resize_domain
+   integer :: nranks_tot
+   integer :: nargin, arg, FNLength, status, DecInd
+   character(len=80) :: InputFN
 
 #ifdef COMPLEX_TEST
-  complex(mytype), dimension(nx,ny,nz) :: data1
+   complex(mytype), allocatable, dimension(:, :, :) :: data1
 
-  complex(mytype), allocatable, dimension(:,:,:) :: u1b, u2b, u3b
+   complex(mytype), allocatable, dimension(:, :, :) :: u1b, u2b, u3b
 #else
-  real(mytype), dimension(nx,ny,nz) :: data1
+   real(mytype), allocatable, dimension(:, :, :) :: data1
+
+   real(mytype), allocatable, dimension(:, :, :) :: u1b, u2b, u3b
+#endif
+
+   real(mytype), parameter :: eps = 1.0E-7_mytype
+
+   character(len=*), parameter :: io_name = "test-io"
+#ifndef ADIOS2
+   logical ::file_exists1, file_exists2, file_exists3
+#endif
+
+   integer :: i, j, k, m, ierror
+
+   integer, parameter :: output2D = 0 ! Which plane to write in 2D (0 for 3D)
+
+   call MPI_INIT(ierror)
+   ! To resize the domain we need to know global number of ranks
+   ! This operation is also done as part of decomp_2d_init
+   call MPI_COMM_SIZE(MPI_COMM_WORLD, nranks_tot, ierror)
+   resize_domain = int(nranks_tot / 4) + 1
+   nx = nx_base * resize_domain
+   ny = ny_base * resize_domain
+   nz = nz_base * resize_domain
+   ! Now we can check if user put some inputs
+   ! Handle input file like a boss -- GD
+   nargin = command_argument_count()
+   if ((nargin == 0) .or. (nargin == 2) .or. (nargin == 5)) then
+      do arg = 1, nargin
+         call get_command_argument(arg, InputFN, FNLength, status)
+         read (InputFN, *, iostat=status) DecInd
+         if (arg == 1) then
+            p_row = DecInd
+         elseif (arg == 2) then
+            p_col = DecInd
+         elseif (arg == 3) then
+            nx = DecInd
+         elseif (arg == 4) then
+            ny = DecInd
+         elseif (arg == 5) then
+            nz = DecInd
+         end if
+      end do
+   else
+      ! nrank not yet computed we need to avoid write
+      ! for every rank
+      call MPI_COMM_RANK(MPI_COMM_WORLD, nrank, ierror)
+      if (nrank == 0) then
+         print *, "This Test takes no inputs or 2 inputs as"
+         print *, "  1) p_row (default=0)"
+         print *, "  2) p_col (default=0)"
+         print *, "or 5 inputs as"
+         print *, "  1) p_row (default=0)"
+         print *, "  2) p_col (default=0)"
+         print *, "  3) nx "
+         print *, "  4) ny "
+         print *, "  5) nz "
+         print *, "Number of inputs is not correct and the defult settings"
+         print *, "will be used"
+      end if
+   end if
+
+   call decomp_2d_init(nx, ny, nz, p_row, p_col)
 
-  real(mytype), allocatable, dimension(:,:,:) :: u1b, u2b, u3b
+#ifndef ADIOS2
+   if (nrank == 0) then
+      inquire (file="out/u1.dat", exist=file_exists1)
+      inquire (file="out/u2.dat", exist=file_exists2)
+      inquire (file="out/u3.dat", exist=file_exists3)
+      if (.not. (file_exists1 .and. file_exists2 .and. file_exists3)) then
+         call decomp_2d_abort(1, "Error, data 'out/u<1,2,3>.dat' must exist before running io_read test case!")
+      end if
+   end if
 #endif
 
-  real(mytype), parameter :: eps = 1.0E-7_mytype
- 
-  integer :: i,j,k, m, ierror
-  
-  call MPI_INIT(ierror)
-  call MPI_COMM_SIZE(MPI_COMM_WORLD, nproc, ierror)
-  call MPI_COMM_RANK(MPI_COMM_WORLD, nrank, ierror)
-  call decomp_2d_init(nx,ny,nz,p_row,p_col)
-
-  ! ***** global data *****
-  m = 1
-  do k=1,nz
-     do j=1,ny
-        do i=1,nx
+   call decomp_2d_io_init()
+   call decomp_2d_init_io(io_name)
+   call decomp_2d_register_variable(io_name, "u1.dat", 1, 0, output2D, mytype)
+   call decomp_2d_register_variable(io_name, "u2.dat", 2, 0, output2D, mytype)
+   call decomp_2d_register_variable(io_name, "u3.dat", 3, 0, output2D, mytype)
+
+   ! ***** global data *****
+   allocate (data1(nx, ny, nz))
+   m = 1
+   do k = 1, nz
+      do j = 1, ny
+         do i = 1, nx
 #ifdef COMPLEX_TEST
-           data1(i,j,k) = cmplx(real(m,mytype), real(nx*ny*nz-m,mytype)) 
+            data1(i, j, k) = cmplx(real(m, mytype), real(nx * ny * nz - m, mytype))
 #else
-           data1(i,j,k) = real(m,mytype)
+            data1(i, j, k) = real(m, mytype)
+#endif
+            m = m + 1
+         end do
+      end do
+   end do
+
+   call alloc_x(u1b, opt_global=.true.)
+   call alloc_y(u2b, opt_global=.true.)
+   call alloc_z(u3b, opt_global=.true.)
+
+   ! read back to different arrays
+#ifdef ADIOS2
+   call decomp_2d_open_io(io_name, "out", decomp_2d_read_mode)
+   call decomp_2d_start_io(io_name, "out")
+#endif
+   call decomp_2d_read_one(1, u1b, 'out', 'u1.dat', io_name, reduce_prec=.false.)
+   call decomp_2d_read_one(2, u2b, 'out', 'u2.dat', io_name, reduce_prec=.false.)
+   call decomp_2d_read_one(3, u3b, 'out', 'u3.dat', io_name, reduce_prec=.false.)
+#ifdef ADIOS2
+   call decomp_2d_end_io(io_name, "out")
+   call decomp_2d_close_io(io_name, "out")
 #endif
-           m = m+1
-        end do
-     end do
-  end do
-
-  allocate(u1b(xstart(1):xend(1), xstart(2):xend(2), xstart(3):xend(3)))
-  allocate(u2b(ystart(1):yend(1), ystart(2):yend(2), ystart(3):yend(3)))
-  allocate(u3b(zstart(1):zend(1), zstart(2):zend(2), zstart(3):zend(3)))
-
-  ! read back to different arrays
-  call decomp_2d_read_one(1,u1b,'.','u1.dat','test',reduce_prec=.false.)
-  call decomp_2d_read_one(2,u2b,'.','u2.dat','test',reduce_prec=.false.)
-  call decomp_2d_read_one(3,u3b,'.','u3.dat','test',reduce_prec=.false.)
-
-  ! Check against the global data array
-  do k=xstart(3),xend(3)
-    do j=xstart(2),xend(2)
-      do i=xstart(1),xend(1)
-        if (abs((data1(i,j,k)-u1b(i,j,k))) > eps) stop 4
+
+   ! Check against the global data array
+   do k = xstart(3), xend(3)
+      do j = xstart(2), xend(2)
+         do i = xstart(1), xend(1)
+            if (abs((data1(i, j, k) - u1b(i, j, k))) > eps) stop 4
+         end do
       end do
-    end do
-  end do
+   end do
 
-  do k=ystart(3),yend(3)
-    do j=ystart(2),yend(2)
-      do i=ystart(1),yend(1)
-        if (abs((data1(i,j,k)-u2b(i,j,k))) > eps) stop 5
+   do k = ystart(3), yend(3)
+      do j = ystart(2), yend(2)
+         do i = ystart(1), yend(1)
+            if (abs((data1(i, j, k) - u2b(i, j, k))) > eps) stop 5
+         end do
       end do
-    end do
-  end do
-  
-  do k=zstart(3),zend(3)
-    do j=zstart(2),zend(2)
-      do i=zstart(1),zend(1)
-        if (abs((data1(i,j,k)-u3b(i,j,k))) > eps) stop 6
+   end do
+
+   do k = zstart(3), zend(3)
+      do j = zstart(2), zend(2)
+         do i = zstart(1), zend(1)
+            if (abs((data1(i, j, k) - u3b(i, j, k))) > eps) stop 6
+         end do
       end do
-    end do
-  end do
+   end do
 
-  call decomp_2d_finalize 
-  call MPI_FINALIZE(ierror)
-  deallocate(u1b,u2b,u3b)
+   deallocate (u1b, u2b, u3b)
+   deallocate (data1)
+   call decomp_2d_finalize
+   call MPI_FINALIZE(ierror)
 
 end program io_read
diff --git a/examples/io_test/io_test.f90 b/examples/io_test/io_test.f90
index b38aa2a0..04852799 100644
--- a/examples/io_test/io_test.f90
+++ b/examples/io_test/io_test.f90
@@ -1,133 +1,301 @@
+!! SPDX-License-Identifier: BSD-3-Clause
 program io_test
 
-  use mpi
+   use mpi
+   use decomp_2d_constants
+   use decomp_2d_mpi
+   use decomp_2d
+   use decomp_2d_io
+#if defined(_GPU)
+   use cudafor
+   use openacc
+#endif
 
-  use decomp_2d
-  use decomp_2d_io
+   implicit none
 
-  implicit none
+   integer, parameter :: nx_base = 17, ny_base = 13, nz_base = 11
+   integer :: nx, ny, nz
+   integer :: p_row = 0, p_col = 0
+   integer :: resize_domain
+   integer :: nranks_tot
+   integer :: nargin, arg, FNLength, status, DecInd
+   character(len=80) :: InputFN
 
-  integer, parameter :: nx=17, ny=13, nz=11
-  integer :: p_row=0, p_col=0
+   integer :: ierr
 
 #ifdef COMPLEX_TEST
-  complex(mytype), dimension(nx,ny,nz) :: data1
+   complex(mytype), allocatable, dimension(:, :, :) :: data1
 
-  complex(mytype), allocatable, dimension(:,:,:) :: u1, u2, u3
-  complex(mytype), allocatable, dimension(:,:,:) :: u1b, u2b, u3b
+   complex(mytype), allocatable, dimension(:, :, :) :: u1, u2, u3
+   complex(mytype), allocatable, dimension(:, :, :) :: u1b, u2b, u3b
 #else
-  real(mytype), dimension(nx,ny,nz) :: data1
+   real(mytype), allocatable, dimension(:, :, :) :: data1
+
+   real(mytype), allocatable, dimension(:, :, :) :: u1, u2, u3
+   real(mytype), allocatable, dimension(:, :, :) :: u1b, u2b, u3b
+#endif
+
+   real(mytype), parameter :: eps = 1.0E-7_mytype
+
+   character(len=*), parameter :: io_name = "test-io"
+   character(len=*), parameter :: io_restart = "restart-io"
 
-  real(mytype), allocatable, dimension(:,:,:) :: u1, u2, u3
-  real(mytype), allocatable, dimension(:,:,:) :: u1b, u2b, u3b
+   integer :: i, j, k, m, ierror
+   integer :: xst1, xst2, xst3
+   integer :: xen1, xen2, xen3
+
+#ifndef ADIOS2
+   logical :: dir_exists
 #endif
 
-  real(mytype), parameter :: eps = 1.0E-7_mytype
- 
-  integer :: i,j,k, m, ierror
-  
-  call MPI_INIT(ierror)
-  call decomp_2d_init(nx,ny,nz,p_row,p_col)
-
-  ! ***** global data *****
-  m = 1
-  do k=1,nz
-     do j=1,ny
-        do i=1,nx
+   integer, parameter :: output2D = 0 ! Which plane to write in 2D (0 for 3D)
+
+   call MPI_INIT(ierror)
+   ! To resize the domain we need to know global number of ranks
+   ! This operation is also done as part of decomp_2d_init
+   call MPI_COMM_SIZE(MPI_COMM_WORLD, nranks_tot, ierror)
+   resize_domain = int(nranks_tot / 4) + 1
+   nx = nx_base * resize_domain
+   ny = ny_base * resize_domain
+   nz = nz_base * resize_domain
+   ! Now we can check if user put some inputs
+   ! Handle input file like a boss -- GD
+   nargin = command_argument_count()
+   if ((nargin == 0) .or. (nargin == 2) .or. (nargin == 5)) then
+      do arg = 1, nargin
+         call get_command_argument(arg, InputFN, FNLength, status)
+         read (InputFN, *, iostat=status) DecInd
+         if (arg == 1) then
+            p_row = DecInd
+         elseif (arg == 2) then
+            p_col = DecInd
+         elseif (arg == 3) then
+            nx = DecInd
+         elseif (arg == 4) then
+            ny = DecInd
+         elseif (arg == 5) then
+            nz = DecInd
+         end if
+      end do
+   else
+      ! nrank not yet computed we need to avoid write
+      ! for every rank
+      call MPI_COMM_RANK(MPI_COMM_WORLD, nrank, ierror)
+      if (nrank == 0) then
+         print *, "This Test takes no inputs or 2 inputs as"
+         print *, "  1) p_row (default=0)"
+         print *, "  2) p_col (default=0)"
+         print *, "or 5 inputs as"
+         print *, "  1) p_row (default=0)"
+         print *, "  2) p_col (default=0)"
+         print *, "  3) nx "
+         print *, "  4) ny "
+         print *, "  5) nz "
+         print *, "Number of inputs is not correct and the defult settings"
+         print *, "will be used"
+      end if
+   end if
+
+   call decomp_2d_init(nx, ny, nz, p_row, p_col)
+
+   call decomp_2d_io_init()
+   call decomp_2d_init_io(io_name)
+   call decomp_2d_register_variable(io_name, "u1.dat", 1, 0, output2D, mytype)
+   call decomp_2d_register_variable(io_name, "u2.dat", 2, 0, output2D, mytype)
+   call decomp_2d_register_variable(io_name, "u3.dat", 3, 0, output2D, mytype)
+   call decomp_2d_init_io(io_restart)
+   call decomp_2d_register_variable(io_restart, "u1.dat", 1, 0, output2D, mytype)
+   call decomp_2d_register_variable(io_restart, "u2.dat", 2, 0, output2D, mytype)
+   call decomp_2d_register_variable(io_restart, "u3.dat", 3, 0, output2D, mytype)
+
+   ! ***** global data *****
+   allocate (data1(nx, ny, nz))
+   m = 1
+   do k = 1, nz
+      do j = 1, ny
+         do i = 1, nx
 #ifdef COMPLEX_TEST
-           data1(i,j,k) = cmplx(real(m,mytype), real(nx*ny*nz-m,mytype)) 
+            data1(i, j, k) = cmplx(real(m, mytype), real(nx * ny * nz - m, mytype))
 #else
-           data1(i,j,k) = real(m,mytype)
+            data1(i, j, k) = real(m, mytype)
 #endif
-           m = m+1
-        end do
-     end do
-  end do
-
-  allocate(u1(xstart(1):xend(1), xstart(2):xend(2), xstart(3):xend(3)))
-  allocate(u2(ystart(1):yend(1), ystart(2):yend(2), ystart(3):yend(3)))
-  allocate(u3(zstart(1):zend(1), zstart(2):zend(2), zstart(3):zend(3)))
-
-  allocate(u1b(xstart(1):xend(1), xstart(2):xend(2), xstart(3):xend(3)))
-  allocate(u2b(ystart(1):yend(1), ystart(2):yend(2), ystart(3):yend(3)))
-  allocate(u3b(zstart(1):zend(1), zstart(2):zend(2), zstart(3):zend(3)))
-
-  ! original x-pencil based data 
-  do k=xstart(3),xend(3)
-    do j=xstart(2),xend(2)
-      do i=xstart(1),xend(1)
-        u1(i,j,k) = data1(i,j,k)
+            m = m + 1
+         end do
+      end do
+   end do
+
+   call alloc_x(u1, opt_global=.true.)
+   call alloc_y(u2, opt_global=.true.)
+   call alloc_z(u3, opt_global=.true.)
+
+   call alloc_x(u1b, opt_global=.true.)
+   call alloc_y(u2b, opt_global=.true.)
+   call alloc_z(u3b, opt_global=.true.)
+
+   xst1 = xstart(1); xen1 = xend(1)
+   xst2 = xstart(2); xen2 = xend(2)
+   xst3 = xstart(3); xen3 = xend(3)
+   ! original x-pencil based data
+   !$acc data copyin(data1) copy(u1,u2,u3)
+   !$acc parallel loop default(present)
+   do k = xst3, xen3
+      do j = xst2, xen2
+         do i = xst1, xen1
+            u1(i, j, k) = data1(i, j, k)
+         end do
       end do
-    end do
-  end do
-
-  ! transpose
-  call transpose_x_to_y(u1,u2)
-  call transpose_y_to_z(u2,u3)
-
-  ! write to disk
-  call decomp_2d_write_one(1,u1,'.','u1.dat',0,'test')
-  call decomp_2d_write_one(2,u2,'.','u2.dat',0,'test')
-  call decomp_2d_write_one(3,u3,'.','u3.dat',0,'test')
-
-  ! read back to different arrays
-  call decomp_2d_read_one(1,u1b,'.','u1.dat','test',reduce_prec=.false.)
-  call decomp_2d_read_one(2,u2b,'.','u2.dat','test',reduce_prec=.false.)
-  call decomp_2d_read_one(3,u3b,'.','u3.dat','test',reduce_prec=.false.)
-
-  ! compare  
-  do k=xstart(3),xend(3)
-    do j=xstart(2),xend(2)
-      do i=xstart(1),xend(1)
-        if (abs((u1(i,j,k)-u1b(i,j,k))) > eps) stop 1
+   end do
+   !$acc end loop
+
+   ! transpose
+   call transpose_x_to_y(u1, u2)
+   call transpose_y_to_z(u2, u3)
+   !$acc update self(u1)
+   !$acc update self(u2)
+   !$acc update self(u3)
+   !$acc end data
+
+   ! write to disk
+#ifndef ADIOS2
+   if (nrank == 0) then
+      inquire (file="out", exist=dir_exists)
+      if (.not. dir_exists) then
+         call execute_command_line("mkdir out 2> /dev/null")
+      end if
+   end if
+#endif
+
+   ! Standard I/O pattern - file per field
+#ifdef ADIOS2
+   call decomp_2d_open_io(io_name, "out", decomp_2d_write_mode)
+   call decomp_2d_start_io(io_name, "out")
+#endif
+   call decomp_2d_write_one(1, u1, 'out', 'u1.dat', 0, io_name)
+   call decomp_2d_write_one(2, u2, 'out', 'u2.dat', 0, io_name)
+   call decomp_2d_write_one(3, u3, 'out', 'u3.dat', 0, io_name)
+#ifdef ADIOS2
+   call decomp_2d_end_io(io_name, "out")
+   call decomp_2d_close_io(io_name, "out")
+#endif
+
+   ! read back to different arrays
+#ifdef ADIOS2
+   call decomp_2d_open_io(io_name, "out", decomp_2d_read_mode)
+   call decomp_2d_start_io(io_name, "out")
+#endif
+   call decomp_2d_read_one(1, u1b, 'out', 'u1.dat', io_name, reduce_prec=.false.)
+   call decomp_2d_read_one(2, u2b, 'out', 'u2.dat', io_name, reduce_prec=.false.)
+   call decomp_2d_read_one(3, u3b, 'out', 'u3.dat', io_name, reduce_prec=.false.)
+#ifdef ADIOS2
+   call decomp_2d_end_io(io_name, "out")
+   call decomp_2d_close_io(io_name, "out")
+#endif
+
+   ! compare
+   call check("file per field")
+
+   ! Checkpoint I/O pattern - multiple fields per file
+   call decomp_2d_open_io(io_name, "checkpoint", decomp_2d_write_mode)
+   call decomp_2d_start_io(io_name, "checkpoint")
+   call decomp_2d_write_one(1, u1, 'checkpoint', 'u1.dat', 0, io_name)
+   call decomp_2d_write_one(2, u2, 'checkpoint', 'u2.dat', 0, io_name)
+   call decomp_2d_write_one(3, u3, 'checkpoint', 'u3.dat', 0, io_name)
+   call decomp_2d_end_io(io_name, "checkpoint")
+   call decomp_2d_close_io(io_name, "checkpoint")
+
+   call MPI_Barrier(MPI_COMM_WORLD, ierr)
+
+   ! read back to different arrays
+   u1b = 0; u2b = 0; u3b = 0
+   call decomp_2d_open_io(io_restart, "checkpoint", decomp_2d_read_mode)
+   call decomp_2d_start_io(io_restart, "checkpoint")
+   call decomp_2d_read_one(1, u1b, 'checkpoint', 'u1.dat', io_restart, reduce_prec=.false.)
+   call decomp_2d_read_one(2, u2b, 'checkpoint', 'u2.dat', io_restart, reduce_prec=.false.)
+   call decomp_2d_read_one(3, u3b, 'checkpoint', 'u3.dat', io_restart, reduce_prec=.false.)
+   call decomp_2d_end_io(io_restart, "checkpoint")
+   call decomp_2d_close_io(io_restart, "checkpoint")
+
+   call MPI_Barrier(MPI_COMM_WORLD, ierr)
+
+   ! compare
+   call check("one file, multiple fields")
+
+   deallocate (u1, u2, u3)
+   deallocate (u1b, u2b, u3b)
+   deallocate (data1)
+   call decomp_2d_finalize
+   call MPI_FINALIZE(ierror)
+
+contains
+
+   subroutine check(stage)
+
+      character(len=*), intent(in) :: stage
+
+      integer :: ierr
+
+      if (nrank == 0) then
+         print *, "Checking "//stage
+      end if
+      call MPI_Barrier(MPI_COMM_WORLD, ierr)
+
+      do k = xstart(3), xend(3)
+         do j = xstart(2), xend(2)
+            do i = xstart(1), xend(1)
+               if (abs((u1(i, j, k) - u1b(i, j, k))) > eps) then
+                  print *, u1(i, j, k), u1b(i, j, k)
+                  stop 1
+               end if
+            end do
+         end do
       end do
-    end do
-  end do
 
-  do k=ystart(3),yend(3)
-    do j=ystart(2),yend(2)
-      do i=ystart(1),yend(1)
-        if (abs((u2(i,j,k)-u2b(i,j,k))) > eps) stop 2
+      do k = ystart(3), yend(3)
+         do j = ystart(2), yend(2)
+            do i = ystart(1), yend(1)
+               if (abs((u2(i, j, k) - u2b(i, j, k))) > eps) stop 2
+            end do
+         end do
       end do
-    end do
-  end do
 
-  do k=zstart(3),zend(3)
-    do j=zstart(2),zend(2)
-      do i=zstart(1),zend(1)
-        if (abs((u3(i,j,k)-u3b(i,j,k))) > eps) stop 3
+      do k = zstart(3), zend(3)
+         do j = zstart(2), zend(2)
+            do i = zstart(1), zend(1)
+               if (abs((u3(i, j, k) - u3b(i, j, k))) > eps) stop 3
+            end do
+         end do
       end do
-    end do
-  end do
-
-  ! Also check against the global data array
-  do k=xstart(3),xend(3)
-    do j=xstart(2),xend(2)
-      do i=xstart(1),xend(1)
-        if (abs(data1(i,j,k)-u1b(i,j,k)) > eps) stop 4
+
+      ! Also check against the global data array
+      do k = xstart(3), xend(3)
+         do j = xstart(2), xend(2)
+            do i = xstart(1), xend(1)
+               if (abs(data1(i, j, k) - u1b(i, j, k)) > eps) stop 4
+            end do
+         end do
       end do
-    end do
-  end do
 
-  do k=ystart(3),yend(3)
-    do j=ystart(2),yend(2)
-      do i=ystart(1),yend(1)
-        if (abs((data1(i,j,k)-u2b(i,j,k))) > eps) stop 5
+      do k = ystart(3), yend(3)
+         do j = ystart(2), yend(2)
+            do i = ystart(1), yend(1)
+               if (abs((data1(i, j, k) - u2b(i, j, k))) > eps) stop 5
+            end do
+         end do
       end do
-    end do
-  end do
-  
-  do k=zstart(3),zend(3)
-    do j=zstart(2),zend(2)
-      do i=zstart(1),zend(1)
-        if (abs((data1(i,j,k)-u3b(i,j,k))) > eps) stop 6
+
+      do k = zstart(3), zend(3)
+         do j = zstart(2), zend(2)
+            do i = zstart(1), zend(1)
+               if (abs((data1(i, j, k) - u3b(i, j, k))) > eps) stop 6
+            end do
+         end do
       end do
-    end do
-  end do
 
-  call decomp_2d_finalize 
-  call MPI_FINALIZE(ierror)
-  deallocate(u1,u2,u3)
-  deallocate(u1b,u2b,u3b)
+      call MPI_Barrier(MPI_COMM_WORLD, ierr)
+      if (nrank == 0) then
+         print *, "Checking "//stage//" pass!"
+      end if
+
+   end subroutine check
 
 end program io_test
diff --git a/examples/io_test/io_tmp_test.f90 b/examples/io_test/io_tmp_test.f90
new file mode 100644
index 00000000..86e9a0d9
--- /dev/null
+++ b/examples/io_test/io_tmp_test.f90
@@ -0,0 +1,390 @@
+!!! io_tmp_test.f90
+!!
+!! Tests writing from temporary arrays using the flush functionality, only relevant for ADIOS2.
+!!
+!! SPDX-License-Identifier: BSD-3-Clause
+program io_test
+
+   use mpi
+   use decomp_2d_constants
+   use decomp_2d_mpi
+   use decomp_2d
+   use decomp_2d_io
+#if defined(_GPU)
+   use cudafor
+   use openacc
+#endif
+
+   implicit none
+
+   integer, parameter :: nx_base = 17, ny_base = 13, nz_base = 11
+   integer :: nx, ny, nz
+   integer :: p_row = 0, p_col = 0
+   integer :: resize_domain
+   integer :: nranks_tot
+   integer :: nargin, arg, FNLength, status, DecInd
+   character(len=80) :: InputFN
+
+   integer :: ierr
+
+#ifdef COMPLEX_TEST
+   complex(mytype), allocatable, dimension(:, :, :) :: data1
+
+   complex(mytype), allocatable, dimension(:, :, :) :: u1, u2, u3
+   complex(mytype), allocatable, dimension(:, :, :) :: u1b, u2b, u3b
+   complex(mytype), allocatable, dimension(:, :, :) :: v1, v2, v3
+   complex(mytype), allocatable, dimension(:, :, :) :: v1b, v2b, v3b
+
+   complex(mytype), allocatable, dimension(:, :, :) :: tmp1, tmp2, tmp3
+#else
+   real(mytype), allocatable, dimension(:, :, :) :: data1
+
+   real(mytype), allocatable, dimension(:, :, :) :: u1, u2, u3
+   real(mytype), allocatable, dimension(:, :, :) :: u1b, u2b, u3b
+   real(mytype), allocatable, dimension(:, :, :) :: v1, v2, v3
+   real(mytype), allocatable, dimension(:, :, :) :: v1b, v2b, v3b
+
+   real(mytype), allocatable, dimension(:, :, :) :: tmp1, tmp2, tmp3
+#endif
+
+   real(mytype), parameter :: eps = 1.0E-7_mytype
+
+   character(len=*), parameter :: io_name = "test-io"
+   character(len=*), parameter :: io_restart = "restart-io"
+
+   integer :: i, j, k, m, ierror
+   integer :: xst1, xst2, xst3
+   integer :: xen1, xen2, xen3
+
+#ifndef ADIOS2
+   logical :: dir_exists
+#endif
+
+   integer, parameter :: output2D = 0 ! Which plane to write in 2D (0 for 3D)
+
+   call MPI_INIT(ierror)
+   ! To resize the domain we need to know global number of ranks
+   ! This operation is also done as part of decomp_2d_init
+   call MPI_COMM_SIZE(MPI_COMM_WORLD, nranks_tot, ierror)
+   resize_domain = int(nranks_tot / 4) + 1
+   nx = nx_base * resize_domain
+   ny = ny_base * resize_domain
+   nz = nz_base * resize_domain
+   ! Now we can check if user put some inputs
+   ! Handle input file like a boss -- GD
+   nargin = command_argument_count()
+   if ((nargin == 0) .or. (nargin == 2) .or. (nargin == 5)) then
+      do arg = 1, nargin
+         call get_command_argument(arg, InputFN, FNLength, status)
+         read (InputFN, *, iostat=status) DecInd
+         if (arg == 1) then
+            p_row = DecInd
+         elseif (arg == 2) then
+            p_col = DecInd
+         elseif (arg == 3) then
+            nx = DecInd
+         elseif (arg == 4) then
+            ny = DecInd
+         elseif (arg == 5) then
+            nz = DecInd
+         end if
+      end do
+   else
+      ! nrank not yet computed we need to avoid write
+      ! for every rank
+      call MPI_COMM_RANK(MPI_COMM_WORLD, nrank, ierror)
+      if (nrank == 0) then
+         print *, "This Test takes no inputs or 2 inputs as"
+         print *, "  1) p_row (default=0)"
+         print *, "  2) p_col (default=0)"
+         print *, "or 5 inputs as"
+         print *, "  1) p_row (default=0)"
+         print *, "  2) p_col (default=0)"
+         print *, "  3) nx "
+         print *, "  4) ny "
+         print *, "  5) nz "
+         print *, "Number of inputs is not correct and the defult settings"
+         print *, "will be used"
+      end if
+   end if
+
+   call decomp_2d_init(nx, ny, nz, p_row, p_col)
+
+   call decomp_2d_io_init()
+   call decomp_2d_init_io(io_name)
+   call decomp_2d_register_variable(io_name, "u1.dat", 1, 0, output2D, mytype)
+   call decomp_2d_register_variable(io_name, "u2.dat", 2, 0, output2D, mytype)
+   call decomp_2d_register_variable(io_name, "u3.dat", 3, 0, output2D, mytype)
+   call decomp_2d_register_variable(io_name, "v1.dat", 1, 0, output2D, mytype)
+   call decomp_2d_register_variable(io_name, "v2.dat", 2, 0, output2D, mytype)
+   call decomp_2d_register_variable(io_name, "v3.dat", 3, 0, output2D, mytype)
+   call decomp_2d_init_io(io_restart)
+   call decomp_2d_register_variable(io_restart, "u1.dat", 1, 0, output2D, mytype)
+   call decomp_2d_register_variable(io_restart, "u2.dat", 2, 0, output2D, mytype)
+   call decomp_2d_register_variable(io_restart, "u3.dat", 3, 0, output2D, mytype)
+   call decomp_2d_register_variable(io_restart, "v1.dat", 1, 0, output2D, mytype)
+   call decomp_2d_register_variable(io_restart, "v2.dat", 2, 0, output2D, mytype)
+   call decomp_2d_register_variable(io_restart, "v3.dat", 3, 0, output2D, mytype)
+
+   ! ***** global data *****
+   allocate (data1(nx, ny, nz))
+   m = 1
+   do k = 1, nz
+      do j = 1, ny
+         do i = 1, nx
+#ifdef COMPLEX_TEST
+            data1(i, j, k) = cmplx(real(m, mytype), real(nx * ny * nz - m, mytype))
+#else
+            data1(i, j, k) = real(m, mytype)
+#endif
+            m = m + 1
+         end do
+      end do
+   end do
+
+   call alloc_x(u1, opt_global=.true.)
+   call alloc_y(u2, opt_global=.true.)
+   call alloc_z(u3, opt_global=.true.)
+
+   call alloc_x(u1b, opt_global=.true.)
+   call alloc_y(u2b, opt_global=.true.)
+   call alloc_z(u3b, opt_global=.true.)
+
+   call alloc_x(v1, opt_global=.true.)
+   call alloc_y(v2, opt_global=.true.)
+   call alloc_z(v3, opt_global=.true.)
+
+   call alloc_x(v1b, opt_global=.true.)
+   call alloc_y(v2b, opt_global=.true.)
+   call alloc_z(v3b, opt_global=.true.)
+
+   call alloc_x(tmp1, opt_global=.true.)
+   call alloc_y(tmp2, opt_global=.true.)
+   call alloc_z(tmp3, opt_global=.true.)
+
+   xst1 = xstart(1); xen1 = xend(1)
+   xst2 = xstart(2); xen2 = xend(2)
+   xst3 = xstart(3); xen3 = xend(3)
+   ! original x-pencil based data
+   !$acc data copyin(data1) copy(u1,u2,u3,v1,v2,v3)
+   !$acc parallel loop default(present)
+   do k = xst3, xen3
+      do j = xst2, xen2
+         do i = xst1, xen1
+            u1(i, j, k) = data1(i, j, k)
+            v1(i, j, k) = 10 * data1(i, j, k)
+         end do
+      end do
+   end do
+   !$acc end loop
+
+   ! transpose
+   call transpose_x_to_y(u1, u2)
+   call transpose_y_to_z(u2, u3)
+   call transpose_x_to_y(v1, v2)
+   call transpose_y_to_z(v2, v3)
+   !$acc update self(u1)
+   !$acc update self(u2)
+   !$acc update self(u3)
+   !$acc update self(v1)
+   !$acc update self(v2)
+   !$acc update self(v3)
+   !$acc end data
+
+   ! write to disk
+#ifndef ADIOS2
+   if (nrank == 0) then
+      inquire (file="out", exist=dir_exists)
+      if (.not. dir_exists) then
+         call execute_command_line("mkdir out 2> /dev/null")
+      end if
+   end if
+#endif
+
+   ! Standard I/O pattern - file per field
+#ifdef ADIOS2
+   call decomp_2d_open_io(io_name, "out", decomp_2d_write_mode)
+   call decomp_2d_start_io(io_name, "out")
+#endif
+   ! Copy data to temporary memory and write from the temporary memory
+
+   tmp1(:, :, :) = u1(:, :, :)
+   call decomp_2d_write_one(1, tmp1, 'out', 'u1.dat', 0, io_name, opt_deferred_writes=.false.)
+   tmp1(:, :, :) = v1(:, :, :)
+   call decomp_2d_write_one(1, tmp1, 'out', 'v1.dat', 0, io_name, opt_deferred_writes=.false.)
+
+   tmp2(:, :, :) = u2(:, :, :)
+   call decomp_2d_write_one(2, tmp2, 'out', 'u2.dat', 0, io_name, opt_deferred_writes=.false.)
+   tmp2(:, :, :) = v2(:, :, :)
+   call decomp_2d_write_one(2, tmp2, 'out', 'v2.dat', 0, io_name, opt_deferred_writes=.false.)
+
+   tmp3(:, :, :) = u3(:, :, :)
+   call decomp_2d_write_one(3, tmp3, 'out', 'u3.dat', 0, io_name, opt_deferred_writes=.false.)
+   tmp3(:, :, :) = v3(:, :, :)
+   call decomp_2d_write_one(3, tmp3, 'out', 'v3.dat', 0, io_name, opt_deferred_writes=.false.)
+#ifdef ADIOS2
+   call decomp_2d_end_io(io_name, "out")
+   call decomp_2d_close_io(io_name, "out")
+#endif
+
+   ! read back to different arrays
+#ifdef ADIOS2
+   call decomp_2d_open_io(io_name, "out", decomp_2d_read_mode)
+   call decomp_2d_start_io(io_name, "out")
+#endif
+   call decomp_2d_read_one(1, u1b, 'out', 'u1.dat', io_name, reduce_prec=.false.)
+   call decomp_2d_read_one(2, u2b, 'out', 'u2.dat', io_name, reduce_prec=.false.)
+   call decomp_2d_read_one(3, u3b, 'out', 'u3.dat', io_name, reduce_prec=.false.)
+
+   call decomp_2d_read_one(1, v1b, 'out', 'v1.dat', io_name, reduce_prec=.false.)
+   call decomp_2d_read_one(2, v2b, 'out', 'v2.dat', io_name, reduce_prec=.false.)
+   call decomp_2d_read_one(3, v3b, 'out', 'v3.dat', io_name, reduce_prec=.false.)
+#ifdef ADIOS2
+   call decomp_2d_end_io(io_name, "out")
+   call decomp_2d_close_io(io_name, "out")
+#endif
+
+   ! compare
+   call check("file per field")
+
+   ! Checkpoint I/O pattern - multiple fields per file
+   call decomp_2d_open_io(io_name, "checkpoint", decomp_2d_write_mode)
+   call decomp_2d_start_io(io_name, "checkpoint")
+
+   tmp1(:, :, :) = u1(:, :, :)
+   call decomp_2d_write_one(1, tmp1, 'checkpoint', 'u1.dat', 0, io_name, opt_deferred_writes=.false.)
+   tmp1(:, :, :) = v1(:, :, :)
+   call decomp_2d_write_one(1, tmp1, 'checkpoint', 'v1.dat', 0, io_name, opt_deferred_writes=.false.)
+
+   tmp2(:, :, :) = u2(:, :, :)
+   call decomp_2d_write_one(2, tmp2, 'checkpoint', 'u2.dat', 0, io_name, opt_deferred_writes=.false.)
+   tmp2(:, :, :) = v2(:, :, :)
+   call decomp_2d_write_one(2, tmp2, 'checkpoint', 'v2.dat', 0, io_name, opt_deferred_writes=.false.)
+
+   tmp3(:, :, :) = u3(:, :, :)
+   call decomp_2d_write_one(3, tmp3, 'checkpoint', 'u3.dat', 0, io_name, opt_deferred_writes=.false.)
+   tmp3(:, :, :) = v3(:, :, :)
+   call decomp_2d_write_one(3, tmp3, 'checkpoint', 'v3.dat', 0, io_name, opt_deferred_writes=.false.)
+
+   call decomp_2d_end_io(io_name, "checkpoint")
+   call decomp_2d_close_io(io_name, "checkpoint")
+
+   call MPI_Barrier(MPI_COMM_WORLD, ierr)
+
+   ! read back to different arrays
+   ! XXX: For the MPI-IO backend the order of reading must match the order of writing!
+   u1b = 0; u2b = 0; u3b = 0
+   v1b = 0; v2b = 0; v3b = 0
+   call decomp_2d_open_io(io_restart, "checkpoint", decomp_2d_read_mode)
+   call decomp_2d_start_io(io_restart, "checkpoint")
+
+   call decomp_2d_read_one(1, u1b, 'checkpoint', 'u1.dat', io_restart, reduce_prec=.false.)
+   call decomp_2d_read_one(1, v1b, 'checkpoint', 'v1.dat', io_restart, reduce_prec=.false.)
+
+   call decomp_2d_read_one(2, u2b, 'checkpoint', 'u2.dat', io_restart, reduce_prec=.false.)
+   call decomp_2d_read_one(2, v2b, 'checkpoint', 'v2.dat', io_restart, reduce_prec=.false.)
+
+   call decomp_2d_read_one(3, u3b, 'checkpoint', 'u3.dat', io_restart, reduce_prec=.false.)
+   call decomp_2d_read_one(3, v3b, 'checkpoint', 'v3.dat', io_restart, reduce_prec=.false.)
+
+   call decomp_2d_end_io(io_restart, "checkpoint")
+   call decomp_2d_close_io(io_restart, "checkpoint")
+
+   call MPI_Barrier(MPI_COMM_WORLD, ierr)
+
+   ! compare
+   call check("one file, multiple fields")
+
+   deallocate (u1, u2, u3)
+   deallocate (u1b, u2b, u3b)
+   deallocate (v1, v2, v3)
+   deallocate (v1b, v2b, v3b)
+   deallocate (data1)
+   call decomp_2d_finalize
+   call MPI_FINALIZE(ierror)
+
+contains
+
+   subroutine check(stage)
+
+      character(len=*), intent(in) :: stage
+
+      integer :: ierr
+
+      if (nrank == 0) then
+         print *, "Checking "//stage
+      end if
+      call MPI_Barrier(MPI_COMM_WORLD, ierr)
+
+      do k = xstart(3), xend(3)
+         do j = xstart(2), xend(2)
+            do i = xstart(1), xend(1)
+               if (abs((u1(i, j, k) - u1b(i, j, k))) > eps) then
+                  print *, u1(i, j, k), u1b(i, j, k)
+                  stop 1
+               end if
+               if (abs((v1(i, j, k) - v1b(i, j, k))) > eps) then
+                  print *, v1(i, j, k), v1b(i, j, k)
+                  stop 11
+               end if
+            end do
+         end do
+      end do
+
+      do k = ystart(3), yend(3)
+         do j = ystart(2), yend(2)
+            do i = ystart(1), yend(1)
+               if (abs((u2(i, j, k) - u2b(i, j, k))) > eps) then
+                  print *, u2(i, j, k), u2b(i, j, k)
+                  stop 2
+               end if
+               if (abs((v2(i, j, k) - v2b(i, j, k))) > eps) stop 22
+            end do
+         end do
+      end do
+
+      do k = zstart(3), zend(3)
+         do j = zstart(2), zend(2)
+            do i = zstart(1), zend(1)
+               if (abs((u3(i, j, k) - u3b(i, j, k))) > eps) stop 3
+               if (abs((v3(i, j, k) - v3b(i, j, k))) > eps) stop 33
+            end do
+         end do
+      end do
+
+      ! Also check against the global data array
+      do k = xstart(3), xend(3)
+         do j = xstart(2), xend(2)
+            do i = xstart(1), xend(1)
+               if (abs(data1(i, j, k) - u1b(i, j, k)) > eps) stop 4
+               if (abs(10 * data1(i, j, k) - v1b(i, j, k)) > eps) stop 44
+            end do
+         end do
+      end do
+
+      do k = ystart(3), yend(3)
+         do j = ystart(2), yend(2)
+            do i = ystart(1), yend(1)
+               if (abs((data1(i, j, k) - u2b(i, j, k))) > eps) stop 5
+               if (abs((10 * data1(i, j, k) - v2b(i, j, k))) > eps) stop 55
+            end do
+         end do
+      end do
+
+      do k = zstart(3), zend(3)
+         do j = zstart(2), zend(2)
+            do i = zstart(1), zend(1)
+               if (abs((data1(i, j, k) - u3b(i, j, k))) > eps) stop 6
+               if (abs((10 * data1(i, j, k) - v3b(i, j, k))) > eps) stop 66
+            end do
+         end do
+      end do
+
+      call MPI_Barrier(MPI_COMM_WORLD, ierr)
+      if (nrank == 0) then
+         print *, "Checking "//stage//" pass!"
+      end if
+
+   end subroutine check
+
+end program io_test
diff --git a/examples/io_test/io_var_test.f90 b/examples/io_test/io_var_test.f90
index 2d25dcf9..f63d2aa4 100644
--- a/examples/io_test/io_var_test.f90
+++ b/examples/io_test/io_var_test.f90
@@ -1,285 +1,341 @@
+!! SPDX-License-Identifier: BSD-3-Clause
 ! Sample application to test the read/write_var sets of routines
 ! in the IO library
 
 program io_var_test
 
-  use decomp_2d
-  use decomp_2d_io
-  use MPI
-
-  implicit none
-
-  integer, parameter :: nx=17, ny=13, nz=11
-  integer :: p_row, p_col
-
-  real(mytype), parameter :: eps = 1.0E-7
-
-  ! for global data
-  real(mytype), dimension(nx,ny,nz) :: data1
-  real(mytype), allocatable, dimension(:,:,:) :: data1_large
-  complex(mytype), dimension(nx,ny,nz) :: cdata1
-
-  ! for distributed data
-  real(mytype), allocatable, dimension(:,:,:) :: u1, u2, u3
-  real(mytype), allocatable, dimension(:,:,:) :: u1l, u2l, u3l
-  complex(mytype), allocatable, dimension(:,:,:) :: cu1, cu2, cu3
-
-  ! another copy
-  real(mytype), allocatable, dimension(:,:,:) :: u1_b, u2_b, u3_b
-  real(mytype), allocatable, dimension(:,:,:) :: u1l_b, u2l_b, u3l_b
-  complex(mytype), allocatable, dimension(:,:,:) :: cu1_b, cu2_b, cu3_b
-
-  real(mytype), allocatable, dimension(:) :: tmp
-  complex(mytype), allocatable, dimension(:) :: ctmp
-  integer, allocatable, dimension(:) :: itmp
-
-  TYPE(DECOMP_INFO) :: large
-
-  integer :: i,j,k, m, ierror, fh
-  character(len=15) :: filename, arg
-  integer (kind=MPI_OFFSET_KIND) :: filesize, disp
-
-  allocate(data1_large(nx*2,ny*2,nz*2))
-
-  call MPI_INIT(ierror)
-  call MPI_COMM_SIZE(MPI_COMM_WORLD, nproc, ierror)
-  call MPI_COMM_RANK(MPI_COMM_WORLD, nrank, ierror)
-
-  ! Defaults
-  p_row = 0
-  p_col = 0
-
-  ! Read commandline input
-  i = command_argument_count()
-  if (i/=2) then
-     call MPI_ABORT(MPI_COMM_WORLD, 1, ierror)
-  else
-     call get_command_argument(1, arg)
-     read(arg, '(I10)') i
-     p_row = i
-     call get_command_argument(2, arg)
-     read(arg, '(I10)') i
-     p_col = i
-  end if
-  
-  call decomp_2d_init(nx,ny,nz,p_row,p_col)
-
-  ! also create a data set over a large domain
-  call decomp_info_init(nx*2, ny*2, nz*2, large)
-
-  ! initialise global data
-  m = 1
-  do k=1,nz
-     do j=1,ny
-        do i=1,nx
-           data1(i,j,k) = real(m,mytype)
-           cdata1(i,j,k) = cmplx(real(m,mytype),real(m,mytype), kind=mytype)
-           m = m+1
-        end do
-     end do
-  end do
-
-  m = 1
-  do k=1,nz*2
-     do j=1,ny*2
-        do i=1,nx*2
-           data1_large(i,j,k) = real(m,mytype)
-           m = m+1
-        end do
-     end do
-  end do
-
-  ! allocate memory
-  allocate(u1(xstart(1):xend(1), xstart(2):xend(2), xstart(3):xend(3)))
-  allocate(u2(ystart(1):yend(1), ystart(2):yend(2), ystart(3):yend(3)))
-  allocate(u3(zstart(1):zend(1), zstart(2):zend(2), zstart(3):zend(3)))
-  allocate(u1_b(xstart(1):xend(1), xstart(2):xend(2), xstart(3):xend(3)))
-  allocate(u2_b(ystart(1):yend(1), ystart(2):yend(2), ystart(3):yend(3)))
-  allocate(u3_b(zstart(1):zend(1), zstart(2):zend(2), zstart(3):zend(3)))
-
-  allocate(cu1(xstart(1):xend(1), xstart(2):xend(2), xstart(3):xend(3)))
-  allocate(cu2(ystart(1):yend(1), ystart(2):yend(2), ystart(3):yend(3)))
-  allocate(cu3(zstart(1):zend(1), zstart(2):zend(2), zstart(3):zend(3)))
-  allocate(cu1_b(xstart(1):xend(1), xstart(2):xend(2), xstart(3):xend(3)))
-  allocate(cu2_b(ystart(1):yend(1), ystart(2):yend(2), ystart(3):yend(3)))
-  allocate(cu3_b(zstart(1):zend(1), zstart(2):zend(2), zstart(3):zend(3)))
-
-  allocate(u1l(large%xst(1):large%xen(1), large%xst(2):large%xen(2), &
-       large%xst(3):large%xen(3)))
-  allocate(u2l(large%yst(1):large%yen(1), large%yst(2):large%yen(2), &
-       large%yst(3):large%yen(3)))
-  allocate(u3l(large%zst(1):large%zen(1), large%zst(2):large%zen(2), &
-       large%zst(3):large%zen(3)))
-  allocate(u1l_b(large%xst(1):large%xen(1), large%xst(2):large%xen(2), &
-       large%xst(3):large%xen(3)))
-  allocate(u2l_b(large%yst(1):large%yen(1), large%yst(2):large%yen(2), &
-       large%yst(3):large%yen(3)))
-  allocate(u3l_b(large%zst(1):large%zen(1), large%zst(2):large%zen(2), &
-       large%zst(3):large%zen(3)))
-
-  ! distribute the data
-  do k=xstart(3),xend(3)
-     do j=xstart(2),xend(2)
-        do i=xstart(1),xend(1)
-           u1(i,j,k) = data1(i,j,k)
-           cu1(i,j,k) = cdata1(i,j,k)
-        end do
-     end do
-  end do
-  do k=large%xst(3),large%xen(3)
-     do j=large%xst(2),large%xen(2)
-        do i=large%xst(1),large%xen(1)
-           u1l(i,j,k) = data1_large(i,j,k)
-        end do
-     end do
-  end do
-
-  ! transpose
-  call transpose_x_to_y(u1,u2)
-  call transpose_y_to_z(u2,u3)
-  call transpose_x_to_y(u1l,u2l,large)
-  call transpose_y_to_z(u2l,u3l,large)
-  call transpose_x_to_y(cu1,cu2)
-  call transpose_y_to_z(cu2,cu3)
-
-  ! open file for IO
-  write(filename,'(A,I3.3)') 'io_var_data.', nproc
-  call MPI_FILE_OPEN(MPI_COMM_WORLD, filename, &
-       MPI_MODE_CREATE+MPI_MODE_WRONLY, MPI_INFO_NULL, &
-       fh, ierror)
-  filesize = 0_MPI_OFFSET_KIND
-  call MPI_FILE_SET_SIZE(fh,filesize,ierror)  ! guarantee overwriting
-  disp = 0_MPI_OFFSET_KIND
-
-  ! test writing scalar data
-  allocate(tmp(2))
-  tmp(1) = 1._mytype
-  tmp(2) = 2._mytype
-  allocate(ctmp(3))
-  ctmp(1) = cmplx(1.0,1.0,mytype)
-  ctmp(2) = cmplx(2.0,2.0,mytype)
-  ctmp(3) = cmplx(3.0,3.0,mytype)
-  allocate(itmp(3))
-  call decomp_2d_write_scalar(fh,disp,2,tmp)
-  call decomp_2d_write_scalar(fh,disp,3,ctmp)
-  call decomp_2d_write_scalar(fh,disp,3,(/nx,ny,nz/))
-  
-  ! test the IO routines by writing all data to disk
-  call decomp_2d_write_var(fh,disp,1,u1)
-  call decomp_2d_write_var(fh,disp,2,u2)
-  call decomp_2d_write_var(fh,disp,3,u3)
-  call decomp_2d_write_var(fh,disp,1,u1l,large)
-  call decomp_2d_write_var(fh,disp,2,u2l,large)
-  call decomp_2d_write_var(fh,disp,3,u3l,large)
-  call decomp_2d_write_var(fh,disp,1,cu1)
-  call decomp_2d_write_var(fh,disp,2,cu2)
-  call decomp_2d_write_var(fh,disp,3,cu3)
-
-  call MPI_FILE_CLOSE(fh,ierror)
-
-  if (nrank==0) write(*,*) 'disp=',disp
-
-  ! read data back in from file
-  call MPI_FILE_OPEN(MPI_COMM_WORLD, filename, &
-       MPI_MODE_RDONLY, MPI_INFO_NULL, &
-       fh, ierror)
-  ! skip the scalars (2 real, 3 cmplx, 3 int)
+   use mpi
+   use decomp_2d_constants
+   use decomp_2d_mpi
+   use decomp_2d
+   use decomp_2d_io
+#if defined(_GPU)
+   use cudafor
+   use openacc
+#endif
+
+   implicit none
+
+   integer, parameter :: nx_base = 17, ny_base = 13, nz_base = 11
+   integer :: nx, ny, nz
+   integer :: p_row = 0, p_col = 0
+   integer :: resize_domain
+   integer :: nranks_tot
+   integer :: nargin, arg, FNLength, status, DecInd
+   character(len=80) :: InputFN
+
+   real(mytype), parameter :: eps = 1.0E-7
+
+   ! for global data
+   real(mytype), allocatable, dimension(:, :, :) :: data1
+   real(mytype), allocatable, dimension(:, :, :) :: data1_large
+   complex(mytype), allocatable, dimension(:, :, :) :: cdata1
+
+   ! for distributed data
+   real(mytype), allocatable, dimension(:, :, :) :: u1, u2, u3
+   real(mytype), allocatable, dimension(:, :, :) :: u1l, u2l, u3l
+   complex(mytype), allocatable, dimension(:, :, :) :: cu1, cu2, cu3
+
+   ! another copy
+   real(mytype), allocatable, dimension(:, :, :) :: u1_b, u2_b, u3_b
+   real(mytype), allocatable, dimension(:, :, :) :: u1l_b, u2l_b, u3l_b
+   complex(mytype), allocatable, dimension(:, :, :) :: cu1_b, cu2_b, cu3_b
+
+   real(mytype), allocatable, dimension(:) :: tmp
+   complex(mytype), allocatable, dimension(:) :: ctmp
+   integer, allocatable, dimension(:) :: itmp
+
+   integer :: xst1, xst2, xst3
+   integer :: xen1, xen2, xen3
+
+   TYPE(DECOMP_INFO) :: large
+
+   integer :: i, j, k, m, ierror, fh
+   character(len=15) :: filename
+   integer(kind=MPI_OFFSET_KIND) :: filesize, disp
+
+   call MPI_INIT(ierror)
+   ! To resize the domain we need to know global number of ranks
+   ! This operation is also done as part of decomp_2d_init
+   call MPI_COMM_SIZE(MPI_COMM_WORLD, nranks_tot, ierror)
+   resize_domain = int(nranks_tot / 4) + 1
+   nx = nx_base * resize_domain
+   ny = ny_base * resize_domain
+   nz = nz_base * resize_domain
+   ! Now we can check if user put some inputs
+   ! Handle input file like a boss -- GD
+   nargin = command_argument_count()
+   if ((nargin == 0) .or. (nargin == 2) .or. (nargin == 5)) then
+      do arg = 1, nargin
+         call get_command_argument(arg, InputFN, FNLength, status)
+         read (InputFN, *, iostat=status) DecInd
+         if (arg == 1) then
+            p_row = DecInd
+         elseif (arg == 2) then
+            p_col = DecInd
+         elseif (arg == 3) then
+            nx = DecInd
+         elseif (arg == 4) then
+            ny = DecInd
+         elseif (arg == 5) then
+            nz = DecInd
+         end if
+      end do
+   else
+      ! nrank not yet computed we need to avoid write
+      ! for every rank
+      call MPI_COMM_RANK(MPI_COMM_WORLD, nrank, ierror)
+      if (nrank == 0) then
+         print *, "This Test takes no inputs or 2 inputs as"
+         print *, "  1) p_row (default=0)"
+         print *, "  2) p_col (default=0)"
+         print *, "or 5 inputs as"
+         print *, "  1) p_row (default=0)"
+         print *, "  2) p_col (default=0)"
+         print *, "  3) nx "
+         print *, "  4) ny "
+         print *, "  5) nz "
+         print *, "Number of inputs is not correct and the defult settings"
+         print *, "will be used"
+      end if
+   end if
+
+   call decomp_2d_init(nx, ny, nz, p_row, p_col)
+
+   ! also create a data set over a large domain
+   call decomp_info_init(nx * 2, ny * 2, nz * 2, large)
+
+   ! initialise global data
+   allocate (data1(nx, ny, nz))
+   allocate (cdata1(nx, ny, nz))
+   allocate (data1_large(nx * 2, ny * 2, nz * 2))
+   m = 1
+   do k = 1, nz
+      do j = 1, ny
+         do i = 1, nx
+            data1(i, j, k) = real(m, mytype)
+            cdata1(i, j, k) = cmplx(real(m, mytype), real(m, mytype), kind=mytype)
+            m = m + 1
+         end do
+      end do
+   end do
+
+   m = 1
+   do k = 1, nz * 2
+      do j = 1, ny * 2
+         do i = 1, nx * 2
+            data1_large(i, j, k) = real(m, mytype)
+            m = m + 1
+         end do
+      end do
+   end do
+
+   ! allocate memory
+   call alloc_x(u1, opt_global=.true.)
+   call alloc_y(u2, opt_global=.true.)
+   call alloc_z(u3, opt_global=.true.)
+   call alloc_x(u1_b, opt_global=.true.)
+   call alloc_y(u2_b, opt_global=.true.)
+   call alloc_z(u3_b, opt_global=.true.)
+
+   call alloc_x(cu1, opt_global=.true.)
+   call alloc_y(cu2, opt_global=.true.)
+   call alloc_z(cu3, opt_global=.true.)
+   call alloc_x(cu1_b, opt_global=.true.)
+   call alloc_y(cu2_b, opt_global=.true.)
+   call alloc_z(cu3_b, opt_global=.true.)
+
+   call alloc_x(u1l, large, opt_global=.true.)
+   call alloc_y(u2l, large, opt_global=.true.)
+   call alloc_z(u3l, large, opt_global=.true.)
+   call alloc_x(u1l_b, large, opt_global=.true.)
+   call alloc_y(u2l_b, large, opt_global=.true.)
+   call alloc_z(u3l_b, large, opt_global=.true.)
+
+   ! distribute the data
+   !$acc data copyin(data1,cdata1,data1_large) copy(u1,u2,u3,u1l,u2l,u3l,cu1,cu2,cu3)
+   xst1 = xstart(1); xen1 = xend(1)
+   xst2 = xstart(2); xen2 = xend(2)
+   xst3 = xstart(3); xen3 = xend(3)
+   !$acc parallel loop default(present)
+   do k = xst3, xen3
+      do j = xst2, xen2
+         do i = xst1, xen1
+            u1(i, j, k) = data1(i, j, k)
+            cu1(i, j, k) = cdata1(i, j, k)
+         end do
+      end do
+   end do
+   !$acc end loop
+   xst1 = large%xst(1); xen1 = large%xen(1)
+   xst2 = large%xst(2); xen2 = large%xen(2)
+   xst3 = large%xst(3); xen3 = large%xen(3)
+   !$acc parallel loop default(present)
+   do k = xst3, xen3
+      do j = xst2, xen2
+         do i = xst1, xen1
+            u1l(i, j, k) = data1_large(i, j, k)
+         end do
+      end do
+   end do
+   !$acc end loop
+
+   ! transpose
+   call transpose_x_to_y(u1, u2)
+   call transpose_y_to_z(u2, u3)
+   call transpose_x_to_y(u1l, u2l, large)
+   call transpose_y_to_z(u2l, u3l, large)
+   call transpose_x_to_y(cu1, cu2)
+   call transpose_y_to_z(cu2, cu3)
+   !$acc update self (u1)
+   !$acc update self (u2)
+   !$acc update self (u3)
+   !$acc update self (u1l)
+   !$acc update self (u2l)
+   !$acc update self (u3l)
+   !$acc update self (cu1)
+   !$acc update self (cu2)
+   !$acc update self (cu3)
+   !$acc end data
+
+   ! open file for IO
+   write (filename, '(A,I3.3)') 'io_var_data.', nproc
+   call MPI_FILE_OPEN(MPI_COMM_WORLD, filename, &
+                      MPI_MODE_CREATE + MPI_MODE_WRONLY, MPI_INFO_NULL, &
+                      fh, ierror)
+   filesize = 0_MPI_OFFSET_KIND
+   call MPI_FILE_SET_SIZE(fh, filesize, ierror)  ! guarantee overwriting
+   disp = 0_MPI_OFFSET_KIND
+
+   ! test writing scalar data
+   allocate (tmp(2))
+   tmp(1) = 1._mytype
+   tmp(2) = 2._mytype
+   allocate (ctmp(3))
+   ctmp(1) = cmplx(1.0, 1.0, mytype)
+   ctmp(2) = cmplx(2.0, 2.0, mytype)
+   ctmp(3) = cmplx(3.0, 3.0, mytype)
+   allocate (itmp(3))
+   call decomp_2d_write_scalar(fh, disp, 2, tmp)
+   call decomp_2d_write_scalar(fh, disp, 3, ctmp)
+   call decomp_2d_write_scalar(fh, disp, 3, (/nx, ny, nz/))
+
+   ! test the IO routines by writing all data to disk
+   call decomp_2d_write_var(fh, disp, 1, u1)
+   call decomp_2d_write_var(fh, disp, 2, u2)
+   call decomp_2d_write_var(fh, disp, 3, u3)
+   call decomp_2d_write_var(fh, disp, 1, u1l, large)
+   call decomp_2d_write_var(fh, disp, 2, u2l, large)
+   call decomp_2d_write_var(fh, disp, 3, u3l, large)
+   call decomp_2d_write_var(fh, disp, 1, cu1)
+   call decomp_2d_write_var(fh, disp, 2, cu2)
+   call decomp_2d_write_var(fh, disp, 3, cu3)
+
+   call MPI_FILE_CLOSE(fh, ierror)
+
+   if (nrank == 0) write (*, *) 'disp=', disp
+
+   ! read data back in from file
+   call MPI_FILE_OPEN(MPI_COMM_WORLD, filename, &
+                      MPI_MODE_RDONLY, MPI_INFO_NULL, &
+                      fh, ierror)
+   ! skip the scalars (2 real, 3 cmplx, 3 int)
 #ifdef DOUBLE_PREC
-  ! if double precision: 2*8+3*8*2+3*4
-  disp = 76_MPI_OFFSET_KIND
+   ! if double precision: 2*8+3*8*2+3*4
+   disp = 76_MPI_OFFSET_KIND
 #else
-  ! if single precision: 2*4+3*4*2+3*4
-  disp = 44_MPI_OFFSET_KIND
+   ! if single precision: 2*4+3*4*2+3*4
+   disp = 44_MPI_OFFSET_KIND
 #endif
-  
-  call decomp_2d_read_var(fh,disp,1,u1_b)
-  call decomp_2d_read_var(fh,disp,2,u2_b)
-  call decomp_2d_read_var(fh,disp,3,u3_b)
-  call decomp_2d_read_var(fh,disp,1,u1l_b,large)
-  call decomp_2d_read_var(fh,disp,2,u2l_b,large)
-  call decomp_2d_read_var(fh,disp,3,u3l_b,large)
-  call decomp_2d_read_var(fh,disp,1,cu1_b)
-  call decomp_2d_read_var(fh,disp,2,cu2_b)
-  call decomp_2d_read_var(fh,disp,3,cu3_b)
-
-  disp = 0_MPI_OFFSET_KIND
-  call decomp_2d_read_scalar(fh,disp,2,tmp)
-  call decomp_2d_read_scalar(fh,disp,3,ctmp)
-  call decomp_2d_read_scalar(fh,disp,3,itmp)
-  if (nrank==0) then
-     write(*,'(2F8.3)') tmp
-     write(*,20) ctmp
-20   format(3(:,'(',F5.2,',',F5.2,')'))
-     write(*,'(A,3I5)') 'nx,ny,nz', itmp
-  end if
-
-  call MPI_FILE_CLOSE(fh,ierror)
-  deallocate(tmp, ctmp, itmp)
-
-  ! validate the data
-  do k=xstart(3),xend(3)
-     do j=xstart(2),xend(2)
-        do i=xstart(1),xend(1)
-           if (abs(u1(i,j,k)-u1_b(i,j,k)) > eps) stop 1
-           if (abs(cu1(i,j,k)-cu1_b(i,j,k)) > eps) stop 2
-        end do
-     end do
-  end do
-
-  do k=ystart(3),yend(3)
-     do j=ystart(2),yend(2)
-        do i=ystart(1),yend(1)
-           if (abs(u2(i,j,k)-u2_b(i,j,k)) > eps) stop 3
-           if (abs(cu2(i,j,k)-cu2_b(i,j,k)) > eps) stop 4
-        end do
-     end do
-  end do
-
-  do k=zstart(3),zend(3)
-     do j=zstart(2),zend(2)
-        do i=zstart(1),zend(1)
-           if (abs(u3(i,j,k)-u3_b(i,j,k)) > eps) stop 5
-           if (abs(cu3(i,j,k)-cu3_b(i,j,k)) > eps) stop 6
-        end do
-     end do
-  end do
-
-  do k=large%xst(3),large%xen(3)
-     do j=large%xst(2),large%xen(2)
-        do i=large%xst(1),large%xen(1)
-           if (abs(u1l(i,j,k)-u1l_b(i,j,k)) > eps) stop 7
-        end do
-     end do
-  end do
-
-  do k=large%yst(3),large%yen(3)
-     do j=large%yst(2),large%yen(2)
-        do i=large%yst(1),large%yen(1)
-           if (abs(u2l(i,j,k)-u2l_b(i,j,k)) > eps) stop 8
-        end do
-     end do
-  end do
-
-  do k=large%zst(3),large%zen(3)
-     do j=large%zst(2),large%zen(2)
-        do i=large%zst(1),large%zen(1)
-           if (abs(u3l(i,j,k)-u3l_b(i,j,k)) > eps) stop 9
-        end do
-     end do
-  end do
-
-  if (nrank==0) write(*,*) 'passed self test'
-
-  ! clean up
-  call decomp_info_finalize(large)
-  call decomp_2d_finalize
-  call MPI_FINALIZE(ierror)
-  deallocate(u1,u2,u3)
-  deallocate(u1l,u2l,u3l)
-  deallocate(cu1,cu2,cu3)
-  deallocate(u1_b,u2_b,u3_b)
-  deallocate(u1l_b,u2l_b,u3l_b)
-  deallocate(cu1_b,cu2_b,cu3_b)
-  deallocate(data1_large)
-  
+
+   call decomp_2d_read_var(fh, disp, 1, u1_b)
+   call decomp_2d_read_var(fh, disp, 2, u2_b)
+   call decomp_2d_read_var(fh, disp, 3, u3_b)
+   call decomp_2d_read_var(fh, disp, 1, u1l_b, large)
+   call decomp_2d_read_var(fh, disp, 2, u2l_b, large)
+   call decomp_2d_read_var(fh, disp, 3, u3l_b, large)
+   call decomp_2d_read_var(fh, disp, 1, cu1_b)
+   call decomp_2d_read_var(fh, disp, 2, cu2_b)
+   call decomp_2d_read_var(fh, disp, 3, cu3_b)
+
+   disp = 0_MPI_OFFSET_KIND
+   call decomp_2d_read_scalar(fh, disp, 2, tmp)
+   call decomp_2d_read_scalar(fh, disp, 3, ctmp)
+   call decomp_2d_read_scalar(fh, disp, 3, itmp)
+   if (nrank == 0) then
+      write (*, '(2F8.3)') tmp
+      write (*, 20) ctmp
+20    format(3(:, '(', F5.2, ',', F5.2, ')'))
+      write (*, '(A,3I5)') 'nx,ny,nz', itmp
+   end if
+
+   call MPI_FILE_CLOSE(fh, ierror)
+   deallocate (tmp, ctmp, itmp)
+
+   ! validate the data
+   do k = xstart(3), xend(3)
+      do j = xstart(2), xend(2)
+         do i = xstart(1), xend(1)
+            if (abs(u1(i, j, k) - u1_b(i, j, k)) > eps) stop 1
+            if (abs(cu1(i, j, k) - cu1_b(i, j, k)) > eps) stop 2
+         end do
+      end do
+   end do
+
+   do k = ystart(3), yend(3)
+      do j = ystart(2), yend(2)
+         do i = ystart(1), yend(1)
+            if (abs(u2(i, j, k) - u2_b(i, j, k)) > eps) stop 3
+            if (abs(cu2(i, j, k) - cu2_b(i, j, k)) > eps) stop 4
+         end do
+      end do
+   end do
+
+   do k = zstart(3), zend(3)
+      do j = zstart(2), zend(2)
+         do i = zstart(1), zend(1)
+            if (abs(u3(i, j, k) - u3_b(i, j, k)) > eps) stop 5
+            if (abs(cu3(i, j, k) - cu3_b(i, j, k)) > eps) stop 6
+         end do
+      end do
+   end do
+
+   do k = large%xst(3), large%xen(3)
+      do j = large%xst(2), large%xen(2)
+         do i = large%xst(1), large%xen(1)
+            if (abs(u1l(i, j, k) - u1l_b(i, j, k)) > eps) stop 7
+         end do
+      end do
+   end do
+
+   do k = large%yst(3), large%yen(3)
+      do j = large%yst(2), large%yen(2)
+         do i = large%yst(1), large%yen(1)
+            if (abs(u2l(i, j, k) - u2l_b(i, j, k)) > eps) stop 8
+         end do
+      end do
+   end do
+
+   do k = large%zst(3), large%zen(3)
+      do j = large%zst(2), large%zen(2)
+         do i = large%zst(1), large%zen(1)
+            if (abs(u3l(i, j, k) - u3l_b(i, j, k)) > eps) stop 9
+         end do
+      end do
+   end do
+
+   if (nrank == 0) write (*, *) 'passed self test'
+
+   ! clean up
+   deallocate (u1, u2, u3)
+   deallocate (u1l, u2l, u3l)
+   deallocate (cu1, cu2, cu3)
+   deallocate (u1_b, u2_b, u3_b)
+   deallocate (u1l_b, u2l_b, u3l_b)
+   deallocate (cu1_b, cu2_b, cu3_b)
+   deallocate (data1, cdata1, data1_large)
+   call decomp_info_finalize(large)
+   call decomp_2d_finalize
+   call MPI_FINALIZE(ierror)
+
 end program io_var_test
diff --git a/examples/io_test/io_visu.f90 b/examples/io_test/io_visu.f90
new file mode 100644
index 00000000..d88c3e9e
--- /dev/null
+++ b/examples/io_test/io_visu.f90
@@ -0,0 +1,293 @@
+!!! SPDX-License-Identifier: BSD-3-Clause
+!!!
+!!! Example code to demonstrate visualisation of fields.
+
+program visu
+
+   use mpi
+
+   use decomp_2d
+   use decomp_2d_mpi
+   use decomp_2d_constants
+   use decomp_2d_io
+
+   implicit none
+
+   integer, parameter :: nx_base = 17, ny_base = 13, nz_base = 11
+   integer :: nx, ny, nz
+   integer :: p_row = 0, p_col = 0
+
+   real(mytype), allocatable, dimension(:, :, :) :: u1, u2, u3
+
+   integer, parameter :: output2D = 0 ! Which plane to write in 2D (0 for 3D)
+   character(len=*), parameter :: io_name = "visu-io"
+#ifndef ADIOS2
+   logical :: dir_exists
+#endif
+
+   integer :: nargin, arg, FNLength, status, DecInd
+
+   integer :: i, j, k
+
+   integer :: ierr
+
+   call init_example()
+   call init_data()
+
+   call write_data()
+   call write_visu()
+
+   call fin()
+
+contains
+
+   subroutine init_example()
+
+      integer :: resize_domain
+      integer :: nranks_tot
+      character(len=80) :: InputFN
+
+      call MPI_INIT(ierr)
+      ! To resize the domain we need to know global number of ranks
+      ! This operation is also done as part of decomp_2d_init
+      call MPI_COMM_SIZE(MPI_COMM_WORLD, nranks_tot, ierr)
+      resize_domain = int(nranks_tot / 4) + 1
+      nx = nx_base * resize_domain
+      ny = ny_base * resize_domain
+      nz = nz_base * resize_domain
+      ! Now we can check if user put some inputs
+      ! Handle input file like a boss -- GD
+      nargin = command_argument_count()
+      if ((nargin == 0) .or. (nargin == 2) .or. (nargin == 5)) then
+         do arg = 1, nargin
+            call get_command_argument(arg, InputFN, FNLength, status)
+            read (InputFN, *, iostat=status) DecInd
+            if (arg == 1) then
+               p_row = DecInd
+            elseif (arg == 2) then
+               p_col = DecInd
+            elseif (arg == 3) then
+               nx = DecInd
+            elseif (arg == 4) then
+               ny = DecInd
+            elseif (arg == 5) then
+               nz = DecInd
+            end if
+         end do
+      else
+         ! nrank not yet computed we need to avoid write
+         ! for every rank
+         call MPI_COMM_RANK(MPI_COMM_WORLD, nrank, ierr)
+         if (nrank == 0) then
+            print *, "This Test takes no inputs or 2 inputs as"
+            print *, "  1) p_row (default=0)"
+            print *, "  2) p_col (default=0)"
+            print *, "or 5 inputs as"
+            print *, "  1) p_row (default=0)"
+            print *, "  2) p_col (default=0)"
+            print *, "  3) nx "
+            print *, "  4) ny "
+            print *, "  5) nz "
+            print *, "Number of inputs is not correct and the defult settings"
+            print *, "will be used"
+         end if
+      end if
+
+      call decomp_2d_init(nx, ny, nz, p_row, p_col)
+
+      call decomp_2d_io_init()
+      call decomp_2d_init_io(io_name)
+      call decomp_2d_register_variable(io_name, "u1.dat", 1, 0, output2D, mytype)
+      call decomp_2d_register_variable(io_name, "u2.dat", 2, 0, output2D, mytype)
+      call decomp_2d_register_variable(io_name, "u3.dat", 3, 0, output2D, mytype)
+
+   end subroutine init_example
+
+   subroutine init_data()
+
+      integer :: xen1, xen2, xen3
+      integer :: yen1, yen2, yen3
+      integer :: zen1, zen2, zen3
+
+      call alloc_x(u1)
+      call alloc_y(u2)
+      call alloc_z(u3)
+      xen1 = xsize(1)
+      xen2 = xsize(2)
+      xen3 = xsize(3)
+      yen1 = ysize(1)
+      yen2 = ysize(2)
+      yen3 = ysize(3)
+      zen1 = zsize(1)
+      zen2 = zsize(2)
+      zen3 = zsize(3)
+
+      ! distribute the data
+      !$acc data copy(u1,u2,u3)
+
+      !$acc parallel loop default(present)
+      do k = 1, xen3
+         do j = 1, xen2
+            do i = 1, xen1
+               u1(i, j, k) = real(100 + nrank, mytype)
+            end do
+         end do
+      end do
+      !$acc end loop
+      !$acc parallel loop default(present)
+      do k = 1, yen3
+         do j = 1, yen2
+            do i = 1, yen1
+               u2(i, j, k) = real(100 + nrank, mytype)
+            end do
+         end do
+      end do
+      !$acc end loop
+      !$acc parallel loop default(present)
+      do k = 1, zen3
+         do j = 1, zen2
+            do i = 1, zen1
+               u3(i, j, k) = real(100 + nrank, mytype)
+            end do
+         end do
+      end do
+      !$acc end loop
+      !$acc update self (u1)
+      !$acc update self (u2)
+      !$acc update self (u3)
+      !$acc end data
+
+   end subroutine init_data
+
+   subroutine write_data()
+
+    !! Write arrays + visu data from orientations 1, 2 and 3
+#ifndef ADIOS2
+      if (nrank == 0) then
+         inquire (file="out", exist=dir_exists)
+         if (.not. dir_exists) then
+            call execute_command_line("mkdir out 2> /dev/null")
+         end if
+      end if
+#endif
+
+      ! Standard I/O pattern - file per field
+#ifdef ADIOS2
+      call decomp_2d_open_io(io_name, "out", decomp_2d_write_mode)
+      call decomp_2d_start_io(io_name, "out")
+#endif
+
+      call decomp_2d_write_one(1, u1, 'out', 'u1.dat', 0, io_name)
+      call decomp_2d_write_one(2, u2, 'out', 'u2.dat', 0, io_name)
+      call decomp_2d_write_one(3, u3, 'out', 'u3.dat', 0, io_name)
+
+#ifdef ADIOS2
+      call decomp_2d_end_io(io_name, "out")
+      call decomp_2d_close_io(io_name, "out")
+#endif
+
+   end subroutine write_data
+
+   subroutine write_visu()
+      ! This subroutine is based on the xdmf writers in Xcompact3d.
+      ! Copyright (c) 2012-2022, Xcompact3d
+      ! SPDX-License-Identifier: BSD 3-Clause
+
+      integer :: ioxdmf
+
+      character(len=:), allocatable :: fmt
+
+      integer :: precision
+
+      integer :: varctr
+      character(len=16) :: filename
+      character(len=2) :: varname
+
+      if (nrank == 0) then
+         OPEN (newunit=ioxdmf, file="./out.xdmf")
+
+         write (ioxdmf, '(A22)') '<?xml version="1.0" ?>'
+         write (ioxdmf, *) '<!DOCTYPE Xdmf SYSTEM "Xdmf.dtd" []>'
+         write (ioxdmf, *) '<Xdmf xmlns:xi="http://www.w3.org/2001/XInclude" Version="2.0">'
+         write (ioxdmf, *) '<Domain>'
+
+         write (ioxdmf, '(A)') '    <Topology name="topo" TopologyType="3DCoRectMesh"'
+         fmt = "(A, I0, A, I0, A, I0, A)"
+         write (ioxdmf, fmt) '        Dimensions="', nz, " ", ny, " ", nx, '">'
+         write (ioxdmf, '(A)') '    </Topology>'
+
+         write (ioxdmf, *) '    <Geometry name="geo" Type="ORIGIN_DXDYDZ">'
+         write (ioxdmf, *) '        <!-- Origin -->'
+         write (ioxdmf, *) '        <DataItem Format="XML" Dimensions="3">'
+         write (ioxdmf, *) '          0.0 0.0 0.0'
+         write (ioxdmf, *) '        </DataItem>'
+         write (ioxdmf, *) '        <!-- DxDyDz -->'
+         write (ioxdmf, *) '        <DataItem Format="XML" Dimensions="3">'
+         if (mytype == kind(0.0d0)) then
+            fmt = "(A, E24.17, A, E24.17, A, E24.17)"
+         else
+            fmt = "(A, E16.9, A, E16.9, A, E16.9)"
+         end if
+         write (ioxdmf, fmt) '        ', 1.0_mytype, " ", 1.0_mytype, " ", 1.0_mytype
+         write (ioxdmf, *) '        </DataItem>'
+         write (ioxdmf, *) '    </Geometry>'
+
+         write (ioxdmf, *) '   <Grid Name="1" GridType="Uniform">'
+         write (ioxdmf, *) '       <Topology Reference="/Xdmf/Domain/Topology[1]"/>'
+         write (ioxdmf, *) '       <Geometry Reference="/Xdmf/Domain/Geometry[1]"/>'
+
+         do varctr = 1, 3
+            write (varname, "(A, I0)") "u", varctr
+            write (filename, '(A, I0, A)') "./out/u", varctr, ".dat"
+            write (ioxdmf, *) '       <Attribute Name="'//trim(varname)//'" Center="Node">'
+#ifndef ADIOS2
+            write (ioxdmf, *) '          <DataItem Format="Binary"'
+#else
+            write (ioxdmf, *) '          <DataItem Format="HDF"'
+#endif
+
+#ifdef DOUBLE_PREC
+            print *, "Double precision build"
+#ifdef SAVE_SINGLE
+            if (output2D == 0) then
+               precision = 4
+            else
+               precision = 8
+            end if
+#else
+            precision = 8
+#endif
+#else
+            precision = 4
+#endif
+            write (ioxdmf, "(A,I0,A)") '            DataType="Float" Precision="', precision, '" Endian="little" Seek="0"'
+
+            fmt = "(A, I0, A, I0, A, I0, A)"
+            write (ioxdmf, fmt) '            Dimensions="', nz, " ", ny, " ", nx, '">'
+
+            write (ioxdmf, *) '              '//trim(filename)
+
+            write (ioxdmf, *) '           </DataItem>'
+            write (ioxdmf, *) '        </Attribute>'
+         end do
+
+         write (ioxdmf, '(/)')
+         write (ioxdmf, *) '    </Grid>'
+         write (ioxdmf, *) '</Domain>'
+         write (ioxdmf, '(A7)') '</Xdmf>'
+         close (ioxdmf)
+      end if
+
+   end subroutine write_visu
+
+   subroutine fin()
+
+      integer :: ierr
+
+      call decomp_2d_finalize
+      call MPI_FINALIZE(ierr)
+
+   end subroutine fin
+
+end program visu
+
diff --git a/examples/test2d/CMakeLists.txt b/examples/test2d/CMakeLists.txt
new file mode 100644
index 00000000..cf9e3d7b
--- /dev/null
+++ b/examples/test2d/CMakeLists.txt
@@ -0,0 +1,46 @@
+file(GLOB files_test2d test2d.f90)
+file(GLOB files_timing2d_real timing2d_real.f90)
+file(GLOB files_timing2d_complex timing2d_complex.f90)
+
+include_directories(${CMAKE_SOURCE_DIR}/src)
+
+add_executable(test2d ${files_test2d})
+add_executable(timing2d_real ${files_timing2d_real})
+add_executable(timing2d_complex ${files_timing2d_complex})
+
+target_link_libraries(test2d PRIVATE decomp2d)
+target_link_libraries(timing2d_real PRIVATE decomp2d)
+target_link_libraries(timing2d_complex PRIVATE decomp2d)
+
+# Run the test(s)
+set(run_dir "${test_dir}/test2d")
+message(STATUS "Example dir ${run_dir}")
+file(MAKE_DIRECTORY ${run_dir})
+if (BUILD_TARGET MATCHES "gpu")
+  file(COPY bind.sh DESTINATION ${run_dir})
+  add_test(NAME test2d COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} ./bind.sh $<TARGET_FILE:test2d> ${TEST_ARGUMENTS} WORKING_DIRECTORY ${run_dir})
+else ()
+  add_test(NAME test2d COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} $<TARGET_FILE:test2d> ${TEST_ARGUMENTS} WORKING_DIRECTORY ${run_dir})
+endif ()
+
+# Run the test(s)
+set(run_dir "${test_dir}/timing2d_real")
+message(STATUS "Example dir ${run_dir}")
+file(MAKE_DIRECTORY ${run_dir})
+if (BUILD_TARGET MATCHES "gpu")
+  file(COPY bind.sh DESTINATION ${run_dir})
+  add_test(NAME timing2d_real COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} ./bind.sh $<TARGET_FILE:timing2d_real> ${TEST_ARGUMENTS} WORKING_DIRECTORY ${run_dir})
+else ()
+  add_test(NAME timing2d_real COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} $<TARGET_FILE:timing2d_real> ${TEST_ARGUMENTS} WORKING_DIRECTORY ${run_dir})
+endif ()
+
+# Run the test(s)
+set(run_dir "${test_dir}/timing2d_complex")
+message(STATUS "Example dir ${run_dir}")
+file(MAKE_DIRECTORY ${run_dir})
+if (BUILD_TARGET MATCHES "gpu")
+  file(COPY bind.sh DESTINATION ${run_dir})
+  add_test(NAME timing2d_complex COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} ./bind.sh $<TARGET_FILE:timing2d_complex> ${TEST_ARGUMENTS} WORKING_DIRECTORY ${run_dir})
+else ()
+  add_test(NAME timing2d_complex COMMAND ${MPIEXEC_EXECUTABLE} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} $<TARGET_FILE:timing2d_complex> ${TEST_ARGUMENTS} WORKING_DIRECTORY ${run_dir})
+endif ()
diff --git a/examples/test2d/Makefile b/examples/test2d/Makefile
deleted file mode 100644
index 4b798294..00000000
--- a/examples/test2d/Makefile
+++ /dev/null
@@ -1,30 +0,0 @@
-#include ../../src/Makefile.inc
-
-
-#INCLUDE = -I../../include
-FFLAGS := $(subst $(MODFLAG),$(MODFLAG)../../,$(FFLAGS))
-FFLAGS := $(patsubst -I%,-I../../%,$(FFLAGS))
-LIBS = -L../../ -l$(LIBDECOMP) $(LFLAGS)
-
-OBJ = test2d.o
-
-NP ?= 1
-MPIRUN ?= mpirun
-
-test2d: $(OBJ)
-	$(FC) $(FFLAGS) $(OPT) $(DEFS) $(INC) -o $@ $(OBJ) $(LIBS)
-
-ifeq ($(PARAMOD),gpu)
-check:
-	$(MPIRUN) -n $(NP) ./bind.sh ./test2d
-else
-check:
-	$(MPIRUN) -n $(NP) ./test2d
-endif
-
-
-clean:
-	rm -f *.o test2d u*.dat *.log
-
-%.o : %.f90
-	$(FC) $(FFLAGS) $(OPT) $(DEFS) $(INC) -c $< -o $@
diff --git a/examples/test2d/README b/examples/test2d/README
deleted file mode 100644
index 1bf62819..00000000
--- a/examples/test2d/README
+++ /dev/null
@@ -1,13 +0,0 @@
-test2d
-------
-
-This program is to validate the 2D pencil decomposition library. It transposes
-a set of data into different storage formats so that all communication routines
-are tested. The test code also demonstrates to use of the I/O library.
-
-To run: use 12 MPI processes.
-
-What to expect: many files are written to disk. Regardless of the pencil-
-orientation of the distributed data, files written collectively by all MPI
-processes should contain identical information. For example, u1.dat, u2.dat,
-u3.dat, u2b.dat and u1b.dat should be all identical.
diff --git a/examples/test2d/README.md b/examples/test2d/README.md
new file mode 100644
index 00000000..6ce544f0
--- /dev/null
+++ b/examples/test2d/README.md
@@ -0,0 +1,29 @@
+# Test and timing of the transpose operations 
+
+List of the tests:
+- [Test transpose operations](test2d.f90): Test the transpose operation for a 3D real array; 
+- [Time transpose real](timing2d_real.f90): Time to solution for the transpose operation for a 3D real array; 
+- [Time transpose complex](timing2d_complex.f90): Time to solution for the transpose operation for a 3D complex array.
+
+The program test2d is to validate the 2D pencil decomposition library. It transposes
+a set of data into different storage formats so that all communication routines
+are tested.
+The programs timing2d_real and timing2d_complex perform the same tests several times 
+and are used to report timing and performances for real and complex transforms. 
+
+What to input: The program takes max 6 inputs as : 
+
+1. p_row [optional]
+1. p_col [optional] 
+1. nx    [optional]
+1. ny    [optional]
+1. nz    [optional]
+1. nt    [optional]
+
+If the decomposition is imposed both (1) and (2) are necessary. 
+If the resolution is imposed (1-5) are necessary
+
+What to expect: 
+- For test2d the output is a success message or an error message with the direction of swapping;
+- For the timing, beside the error/success message timing for all transpostions
+  (X->Y, Y->Z, Z->Y, Y->X) together with the sum are reported. 
diff --git a/examples/test2d/test2d.f90 b/examples/test2d/test2d.f90
index 9c19841a..948330cc 100644
--- a/examples/test2d/test2d.f90
+++ b/examples/test2d/test2d.f90
@@ -1,166 +1,242 @@
+!! SPDX-License-Identifier: BSD-3-Clause
 program test2d
- 
-  use mpi
-
-  use decomp_2d
-  ! use decomp_2d_io
-
-  implicit none
-
-  integer, parameter :: nx=17, ny=13, nz=11
-  integer :: p_row=0, p_col=0
-
-  real(mytype), dimension(nx,ny,nz) :: data1
-  
-  real(mytype), allocatable, dimension(:,:,:) :: u1, u2, u3
-  
-  integer :: i,j,k, m, ierror
-  logical :: error_flag
-
-  ! Init
-  error_flag = .false.
-  call MPI_INIT(ierror)
-  if (ierror /= 0) call decomp_2d_abort(ierror, "MPI_INIT")
-  call decomp_2d_init(nx,ny,nz,p_row,p_col)
-
-  ! ***** global data *****
-  m = 1
-  do k=1,nz
-     do j=1,ny
-        do i=1,nx
-           data1(i,j,k) = float(m)
-           m = m+1
-        end do
-     end do
-  end do
-
-  !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Testing the swap routines
-  !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  
-  !allocate(u1(xstart(1):xend(1), xstart(2):xend(2), xstart(3):xend(3)))
-  !allocate(u2(ystart(1):yend(1), ystart(2):yend(2), ystart(3):yend(3)))
-  !allocate(u3(zstart(1):zend(1), zstart(2):zend(2), zstart(3):zend(3)))
-  call alloc_x(u1, opt_global=.true.)
-  call alloc_y(u2, opt_global=.true.)
-  call alloc_z(u3, opt_global=.true.)
- 
-  ! original x-pensil based data 
-  do k=xstart(3),xend(3)
-    do j=xstart(2),xend(2)
-      do i=xstart(1),xend(1)
-        u1(i,j,k) = data1(i,j,k)
+
+   use mpi
+   use decomp_2d
+   use decomp_2d_constants
+   use decomp_2d_mpi
+#if defined(_GPU)
+   use cudafor
+   use openacc
+#endif
+
+   implicit none
+
+   integer, parameter :: nx_base = 17, ny_base = 13, nz_base = 11
+   integer :: nx, ny, nz
+   integer :: p_row = 0, p_col = 0
+   integer :: resize_domain
+   integer :: nranks_tot
+   integer :: nargin, arg, FNLength, status, DecInd
+   character(len=80) :: InputFN
+
+   real(mytype), allocatable, dimension(:, :, :) :: u1, u2, u3
+
+   integer :: i, j, k, ierror
+   integer :: xst1, xst2, xst3
+   integer :: xen1, xen2, xen3
+   integer :: yst1, yst2, yst3
+   integer :: yen1, yen2, yen3
+   integer :: zst1, zst2, zst3
+   integer :: zen1, zen2, zen3
+   logical :: error_flag
+   real(mytype) :: m
+
+   ! Init
+   error_flag = .false.
+   call MPI_INIT(ierror)
+   if (ierror /= 0) call decomp_2d_abort(ierror, "MPI_INIT")
+   ! To resize the domain we need to know global number of ranks
+   ! This operation is also done as part of decomp_2d_init
+   call MPI_COMM_SIZE(MPI_COMM_WORLD, nranks_tot, ierror)
+   resize_domain = int(nranks_tot / 4) + 1
+   nx = nx_base * resize_domain
+   ny = ny_base * resize_domain
+   nz = nz_base * resize_domain
+   ! Now we can check if user put some inputs
+   ! Handle input file like a boss -- GD
+   nargin = command_argument_count()
+   if ((nargin == 0) .or. (nargin == 2) .or. (nargin == 5)) then
+      do arg = 1, nargin
+         call get_command_argument(arg, InputFN, FNLength, status)
+         read (InputFN, *, iostat=status) DecInd
+         if (arg == 1) then
+            p_row = DecInd
+         elseif (arg == 2) then
+            p_col = DecInd
+         elseif (arg == 3) then
+            nx = DecInd
+         elseif (arg == 4) then
+            ny = DecInd
+         elseif (arg == 5) then
+            nz = DecInd
+         end if
+      end do
+   else
+      ! nrank not yet computed we need to avoid write
+      ! for every rank
+      call MPI_COMM_RANK(MPI_COMM_WORLD, nrank, ierror)
+      if (nrank == 0) then
+         print *, "This Test takes no inputs or 2 inputs as"
+         print *, "  1) p_row (default=0)"
+         print *, "  2) p_col (default=0)"
+         print *, "or 5 inputs as"
+         print *, "  1) p_row (default=0)"
+         print *, "  2) p_col (default=0)"
+         print *, "  3) nx "
+         print *, "  4) ny "
+         print *, "  5) nz "
+         print *, "Number of inputs is not correct and the defult settings"
+         print *, "will be used"
+      end if
+   end if
+   call decomp_2d_init(nx, ny, nz, p_row, p_col)
+
+   ! Fill the local index
+   xst1 = xstart(1); xen1 = xend(1)
+   xst2 = xstart(2); xen2 = xend(2)
+   xst3 = xstart(3); xen3 = xend(3)
+   yst1 = ystart(1); yen1 = yend(1)
+   yst2 = ystart(2); yen2 = yend(2)
+   yst3 = ystart(3); yen3 = yend(3)
+   zst1 = zstart(1); zen1 = zend(1)
+   zst2 = zstart(2); zen2 = zend(2)
+   zst3 = zstart(3); zen3 = zend(3)
+
+   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+   ! Testing the swap routines
+   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+   call alloc_x(u1, opt_global=.true.)
+   call alloc_y(u2, opt_global=.true.)
+   call alloc_z(u3, opt_global=.true.)
+
+   !$acc data copy(u1,u2,u3)
+   ! original x-pensil based data
+   !$acc parallel loop default(present) collapse(3) private(m)
+   do k = xst3, xen3
+      do j = xst2, xen2
+         do i = xst1, xen1
+            m = real(i + (j - 1) * nx + (k - 1) * nx * ny, mytype)
+            u1(i, j, k) = m
+         end do
       end do
-    end do
-  end do
+   end do
+   !$acc end loop
 
 10 format(15I5)
 
 #ifdef DEBUG
-  if (nrank==0) then 
-     write(*,*) 'Numbers held on Rank 0'
-     write(*,*) ' '
-     write(*,*) 'X-pencil'
-     write(*,10) int(u1)
-  end if
+   if (nrank == 0) then
+      !$acc update self(u1)
+      write (*, *) 'Numbers held on Rank 0'
+      write (*, *) ' '
+      write (*, *) 'X-pencil'
+      write (*, 10) int(u1)
+   end if
 #endif
 
-  ! call decomp_2d_write_one(1,u1,'u1.dat')
+   ! call decomp_2d_write_one(1,u1,'u1.dat')
 
   !!!!!!!!!!!!!!!!!!!!!!!
-  ! x-pensil ==> y-pensil
-  call transpose_x_to_y(u1,u2)
+   ! x-pensil ==> y-pensil
+   call transpose_x_to_y(u1, u2)
 
 #ifdef DEBUG
-  if (nrank==0) then
-     write(*,*) ' '
-     write(*,*) 'Y-pencil'
-     write(*,10) int(u2)
-  end if
+   if (nrank == 0) then
+      !$acc update self(u2)
+      write (*, *) ' '
+      write (*, *) 'Y-pencil'
+      write (*, 10) int(u2)
+   end if
 #endif
 
-  ! call decomp_2d_write_one(2,u2,'u2.dat')
-  ! 'u1.dat' and 'u2.dat' should be identical byte-by-byte 
-
-  ! also check the transposition this way
-  do k=ystart(3),yend(3)
-    do j=ystart(2),yend(2)
-      do i=ystart(1),yend(1)
-        if (abs(u2(i,j,k)-data1(i,j,k)).gt.0) error_flag = .true.
+   ! call decomp_2d_write_one(2,u2,'u2.dat')
+   ! 'u1.dat' and 'u2.dat' should be identical byte-by-byte
+
+   ! also check the transposition this way
+   !$acc parallel loop default(present) collapse(3) private(m) reduction(.or.:error_flag)
+   do k = yst3, yen3
+      do j = yst2, yen2
+         do i = yst1, yen1
+            m = real(i + (j - 1) * nx + (k - 1) * nx * ny, mytype)
+            if (abs(u2(i, j, k) - m) > 0) error_flag = .true.
+         end do
       end do
-    end do
-  end do
-  call MPI_ALLREDUCE(MPI_IN_PLACE, error_flag, 1, MPI_LOGICAL, MPI_LOR, MPI_COMM_WORLD, ierror)
-  if (ierror /= 0) call decomp_2d_abort(ierror, "MPI_ALLREDUCE")
-  if (error_flag) call decomp_2d_abort(1, "error swaping x->y")
+   end do
+   !$acc end loop
+   call MPI_ALLREDUCE(MPI_IN_PLACE, error_flag, 1, MPI_LOGICAL, MPI_LOR, MPI_COMM_WORLD, ierror)
+   if (ierror /= 0) call decomp_2d_abort(ierror, "MPI_ALLREDUCE")
+   if (error_flag) call decomp_2d_abort(1, "error swaping x->y")
 
   !!!!!!!!!!!!!!!!!!!!!!!
-  ! y-pensil ==> z-pensil
-  call transpose_y_to_z(u2,u3)
+   ! y-pensil ==> z-pensil
+   call transpose_y_to_z(u2, u3)
 
 #ifdef DEBUG
-  if (nrank==0) then
-     write(*,*) ' '
-     write(*,*) 'Z-pencil'
-     write(*,10) int(u3)
-  end if
+   if (nrank == 0) then
+      !$acc update self(u3)
+      write (*, *) ' '
+      write (*, *) 'Z-pencil'
+      write (*, 10) int(u3)
+   end if
 #endif
 
-  ! call decomp_2d_write_one(3,u3,'u3.dat')
-  ! 'u1.dat','u2.dat' and 'u3.dat' should be identical
+   ! call decomp_2d_write_one(3,u3,'u3.dat')
+   ! 'u1.dat','u2.dat' and 'u3.dat' should be identical
 
-  do k=zstart(3),zend(3)
-    do j=zstart(2),zend(2)
-      do i=zstart(1),zend(1)
-        if (abs(u3(i,j,k)-data1(i,j,k)).gt.0) error_flag = .true.
+   !$acc parallel loop default(present) collapse(3) private(m) reduction(.or.:error_flag)
+   do k = zst3, zen3
+      do j = zst2, zen2
+         do i = zst1, zen1
+            m = real(i + (j - 1) * nx + (k - 1) * nx * ny, mytype)
+            if (abs(u3(i, j, k) - m) > 0) error_flag = .true.
+         end do
       end do
-    end do
-  end do
-  call MPI_ALLREDUCE(MPI_IN_PLACE, error_flag, 1, MPI_LOGICAL, MPI_LOR, MPI_COMM_WORLD, ierror)
-  if (ierror /= 0) call decomp_2d_abort(ierror, "MPI_ALLREDUCE")
-  if (error_flag) call decomp_2d_abort(2, "error swaping y->z")
+   end do
+   !$acc end loop
+   call MPI_ALLREDUCE(MPI_IN_PLACE, error_flag, 1, MPI_LOGICAL, MPI_LOR, MPI_COMM_WORLD, ierror)
+   if (ierror /= 0) call decomp_2d_abort(ierror, "MPI_ALLREDUCE")
+   if (error_flag) call decomp_2d_abort(2, "error swaping y->z")
 
   !!!!!!!!!!!!!!!!!!!!!!!
-  ! z-pensil ==> y-pensil
-  call transpose_z_to_y(u3,u2)
-  ! call decomp_2d_write_one(2,u2,'u2b.dat')
-
-  do k=ystart(3),yend(3)
-    do j=ystart(2),yend(2)
-      do i=ystart(1),yend(1)
-        if (abs(u2(i,j,k)-data1(i,j,k)).gt.0) error_flag = .true.
+   ! z-pensil ==> y-pensil
+   call transpose_z_to_y(u3, u2)
+   ! call decomp_2d_write_one(2,u2,'u2b.dat')
+
+   !$acc parallel loop default(present) collapse(3) private(m) reduction(.or.:error_flag)
+   do k = yst3, yen3
+      do j = yst2, yen2
+         do i = yst1, yen1
+            m = real(i + (j - 1) * nx + (k - 1) * nx * ny, mytype)
+            if (abs(u2(i, j, k) - m) > 0) error_flag = .true.
+         end do
       end do
-    end do
-  end do
-  call MPI_ALLREDUCE(MPI_IN_PLACE, error_flag, 1, MPI_LOGICAL, MPI_LOR, MPI_COMM_WORLD, ierror)
-  if (ierror /= 0) call decomp_2d_abort(ierror, "MPI_ALLREDUCE")
-  if (error_flag) call decomp_2d_abort(3, "error swaping z->y")
-
-  !!!!!!!!!!!!!!!!!!!!!!!
-  ! y-pensil ==> x-pensil
-  call transpose_y_to_x(u2,u1)
-  ! call decomp_2d_write_one(1,u1,'u1b.dat')
-
-  do k=xstart(3),xend(3)
-    do j=xstart(2),xend(2)
-      do i=xstart(1),xend(1)
-        if (abs(u1(i,j,k)-data1(i,j,k)).gt.0) error_flag = .true.
+   end do
+   !$acc end loop
+   call MPI_ALLREDUCE(MPI_IN_PLACE, error_flag, 1, MPI_LOGICAL, MPI_LOR, MPI_COMM_WORLD, ierror)
+   if (ierror /= 0) call decomp_2d_abort(ierror, "MPI_ALLREDUCE")
+   if (error_flag) call decomp_2d_abort(3, "error swaping z->y")
+
+   !!!!!!!!!!!!!!!!!!!!!!!
+   ! y-pensil ==> x-pensil
+   call transpose_y_to_x(u2, u1)
+   ! call decomp_2d_write_one(1,u1,'u1b.dat')
+
+   !$acc parallel loop default(present) collapse(3) private(m) reduction(.or.:error_flag)
+   do k = xst3, xen3
+      do j = xst2, xen2
+         do i = xst1, xen1
+            m = real(i + (j - 1) * nx + (k - 1) * nx * ny, mytype)
+            if (abs(u1(i, j, k) - m) > 0) error_flag = .true.
+         end do
       end do
-    end do
-  end do
-  call MPI_ALLREDUCE(MPI_IN_PLACE, error_flag, 1, MPI_LOGICAL, MPI_LOR, MPI_COMM_WORLD, ierror)
-  if (ierror /= 0) call decomp_2d_abort(ierror, "MPI_ALLREDUCE")
-  if (error_flag) call decomp_2d_abort(4, "error swaping y->x")
-
-  if (nrank == 0) then
-    write(*,*) " "
-    write(*,*) "test2d completed"
-    write(*,*) " "
-  endif
-
-  call decomp_2d_finalize 
-  call MPI_FINALIZE(ierror)
-  deallocate(u1,u2,u3)
+   end do
+   !$acc end loop
+   call MPI_ALLREDUCE(MPI_IN_PLACE, error_flag, 1, MPI_LOGICAL, MPI_LOR, MPI_COMM_WORLD, ierror)
+   if (ierror /= 0) call decomp_2d_abort(ierror, "MPI_ALLREDUCE")
+   if (error_flag) call decomp_2d_abort(4, "error swaping y->x")
+
+   if (nrank == 0) then
+      write (*, *) " "
+      write (*, *) "test2d completed"
+      write (*, *) " "
+   end if
+
+   call decomp_2d_finalize
+   call MPI_FINALIZE(ierror)
+   !$acc end data
+   deallocate (u1, u2, u3)
 
 end program test2d
 
diff --git a/examples/test2d/timing2d_complex.f90 b/examples/test2d/timing2d_complex.f90
new file mode 100644
index 00000000..4916fbcb
--- /dev/null
+++ b/examples/test2d/timing2d_complex.f90
@@ -0,0 +1,306 @@
+!! SPDX-License-Identifier: BSD-3-Clause
+program timing2d_complex
+
+   use mpi
+   use decomp_2d
+   use decomp_2d_constants
+   use decomp_2d_mpi
+#if defined(_GPU)
+   use cudafor
+   use openacc
+#endif
+
+   implicit none
+
+   integer, parameter :: nx_base = 17, ny_base = 13, nz_base = 11
+   integer :: nx, ny, nz
+   integer :: p_row = 0, p_col = 0
+   integer :: resize_domain
+   integer :: nranks_tot
+   integer :: nargin, arg, FNLength, status, DecInd
+   character(len=80) :: InputFN
+
+   complex(mytype), allocatable, dimension(:, :, :) :: u1, u2, u3
+
+   integer :: i, j, k, ierror
+   integer :: xst1, xst2, xst3
+   integer :: xen1, xen2, xen3
+   integer :: yst1, yst2, yst3
+   integer :: yen1, yen2, yen3
+   integer :: zst1, zst2, zst3
+   integer :: zen1, zen2, zen3
+   logical :: error_flag
+   real(mytype) :: m
+   complex(mytype) :: cm
+
+   double precision :: t1, t2, t3, t4, t5, t6, t7, t8
+   integer :: iter, niter = 10
+
+   ! Init
+   error_flag = .false.
+   call MPI_INIT(ierror)
+   if (ierror /= 0) call decomp_2d_abort(ierror, "MPI_INIT")
+   ! To resize the domain we need to know global number of ranks
+   ! This operation is also done as part of decomp_2d_init
+   call MPI_COMM_SIZE(MPI_COMM_WORLD, nranks_tot, ierror)
+   resize_domain = int(nranks_tot / 4) + 1
+   nx = nx_base * resize_domain
+   ny = ny_base * resize_domain
+   nz = nz_base * resize_domain
+   ! Now we can check if user put some inputs
+   ! Handle input file like a boss -- GD
+   nargin = command_argument_count()
+   if ((nargin == 0) .or. (nargin == 2) .or. (nargin == 5) .or. (nargin == 6)) then
+      do arg = 1, nargin
+         call get_command_argument(arg, InputFN, FNLength, status)
+         read (InputFN, *, iostat=status) DecInd
+         if (arg == 1) then
+            p_row = DecInd
+         elseif (arg == 2) then
+            p_col = DecInd
+         elseif (arg == 3) then
+            nx = DecInd
+         elseif (arg == 4) then
+            ny = DecInd
+         elseif (arg == 5) then
+            nz = DecInd
+         elseif (arg == 6) then
+            niter = DecInd
+         end if
+      end do
+   else
+      ! nrank not yet computed we need to avoid write
+      ! for every rank
+      call MPI_COMM_RANK(MPI_COMM_WORLD, nrank, ierror)
+      if (nrank == 0) then
+         print *, "This Test takes no inputs or 2 inputs as"
+         print *, "  1) p_row (default=0)"
+         print *, "  2) p_col (default=0)"
+         print *, "or 5-6 inputs as"
+         print *, "  1) p_row (default=0)"
+         print *, "  2) p_col (default=0)"
+         print *, "  3) nx "
+         print *, "  4) ny "
+         print *, "  5) nz "
+         print *, "  6) n iterations (optional)"
+         print *, "Number of inputs is not correct and the defult settings"
+         print *, "will be used"
+      end if
+   end if
+   call decomp_2d_init(nx, ny, nz, p_row, p_col)
+
+   !! ***** global data *****
+   !allocate(data1(nx,ny,nz))
+   !m = 1
+   !do k = 1, nz
+   !   do j = 1, ny
+   !      do i = 1, nx
+   !         data1(i, j, k) = cmplx(float(m),float(m-1))
+   !         m = m + 1
+   !      end do
+   !   end do
+   !end do
+
+   ! Fill the local index
+   xst1 = xstart(1); xen1 = xend(1)
+   xst2 = xstart(2); xen2 = xend(2)
+   xst3 = xstart(3); xen3 = xend(3)
+   yst1 = ystart(1); yen1 = yend(1)
+   yst2 = ystart(2); yen2 = yend(2)
+   yst3 = ystart(3); yen3 = yend(3)
+   zst1 = zstart(1); zen1 = zend(1)
+   zst2 = zstart(2); zen2 = zend(2)
+   zst3 = zstart(3); zen3 = zend(3)
+
+   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+   ! Testing the swap routines
+   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+   call alloc_x(u1, opt_global=.true.)
+   call alloc_y(u2, opt_global=.true.)
+   call alloc_z(u3, opt_global=.true.)
+
+   !$acc data copy(u1,u2,u3)
+   ! original x-pensil based data
+   !$acc parallel loop default(present) private(m)
+   do k = xst3, xen3
+      do j = xst2, xen2
+         do i = xst1, xen1
+            m = real(i + (j - 1) * nx + (k - 1) * nx * ny, mytype)
+            u1(i, j, k) = cmplx(m, real(m - 1, mytype), kind=mytype)
+         end do
+      end do
+   end do
+   !$acc end loop
+
+10 format(15I5)
+
+#ifdef DEBUG
+   if (nrank == 0) then
+      !$acc update self(u1)
+      write (*, *) 'Numbers held on Rank 0'
+      write (*, *) ' '
+      write (*, *) 'X-pencil'
+      write (*, 10) int(u1)
+   end if
+#endif
+
+   ! call decomp_2d_write_one(1,u1,'u1.dat')
+
+   t1 = MPI_WTIME()
+   call transpose_x_to_y(u1, u2)
+   call transpose_y_to_z(u2, u3)
+   call transpose_z_to_y(u3, u2)
+   call transpose_y_to_x(u2, u1)
+   t2 = MPI_WTIME() - t1
+   call MPI_ALLREDUCE(t2, t1, 1, MPI_DOUBLE_PRECISION, MPI_SUM, &
+                      MPI_COMM_WORLD, ierror)
+   t1 = t1 / dble(nproc)
+   ! Init the total times
+   t2 = 0.d0
+   t4 = 0.d0
+   t6 = 0.d0
+   t8 = 0.d0
+   if (nrank == 0) then
+      write (*, *) 'Tot time it 0 ', t1
+   end if
+   do iter = 1, niter
+      !!!!!!!!!!!!!!!!!!!!!!!
+      ! x-pensil ==> y-pensil
+      t1 = MPI_WTIME()
+      call transpose_x_to_y(u1, u2)
+      t2 = t2 + MPI_WTIME() - t1
+
+#ifdef DEBUG
+      if (nrank == 0) then
+         !$acc update self(u2)
+         write (*, *) ' '
+         write (*, *) 'Y-pencil'
+         write (*, 10) int(u2)
+      end if
+#endif
+
+      ! also check the transposition this way
+      !$acc parallel loop default(present) collapse(3) private(m,cm) reduction(.or.:error_flag)
+      do k = yst3, yen3
+         do j = yst2, yen2
+            do i = yst1, yen1
+               m = real(i + (j - 1) * nx + (k - 1) * nx * ny, mytype)
+               cm = cmplx(m, real(m - 1, mytype), kind=mytype)
+               if (abs(u2(i, j, k) - cm) > 0) error_flag = .true.
+            end do
+         end do
+      end do
+      !$acc end loop
+      call MPI_ALLREDUCE(MPI_IN_PLACE, error_flag, 1, MPI_LOGICAL, MPI_LOR, MPI_COMM_WORLD, ierror)
+      if (ierror /= 0) call decomp_2d_abort(ierror, "MPI_ALLREDUCE")
+      if (error_flag) call decomp_2d_abort(1, "error swaping x->y")
+
+      !!!!!!!!!!!!!!!!!!!!!!!
+      ! y-pensil ==> z-pensil
+      t3 = MPI_WTIME()
+      call transpose_y_to_z(u2, u3)
+      t4 = t4 + MPI_WTIME() - t3
+
+#ifdef DEBUG
+      if (nrank == 0) then
+         !$acc update self(u3)
+         write (*, *) ' '
+         write (*, *) 'Z-pencil'
+         write (*, 10) int(u3)
+      end if
+#endif
+
+      !$acc parallel loop default(present) collapse(3) private(m,cm) reduction(.or.:error_flag)
+      do k = zst3, zen3
+         do j = zst2, zen2
+            do i = zst1, zen1
+               m = real(i + (j - 1) * nx + (k - 1) * nx * ny, mytype)
+               cm = cmplx(m, real(m - 1, mytype), kind=mytype)
+               if (abs(u3(i, j, k) - cm) > 0) error_flag = .true.
+            end do
+         end do
+      end do
+      !$acc end loop
+      call MPI_ALLREDUCE(MPI_IN_PLACE, error_flag, 1, MPI_LOGICAL, MPI_LOR, MPI_COMM_WORLD, ierror)
+      if (ierror /= 0) call decomp_2d_abort(ierror, "MPI_ALLREDUCE")
+      if (error_flag) call decomp_2d_abort(2, "error swaping y->z")
+
+      !!!!!!!!!!!!!!!!!!!!!!!
+      ! z-pensil ==> y-pensil
+      t5 = MPI_WTIME()
+      call transpose_z_to_y(u3, u2)
+      t6 = t6 + MPI_WTIME() - t5
+
+      !$acc parallel loop default(present) collapse(3) private(m,cm) reduction(.or.:error_flag)
+      do k = yst3, yen3
+         do j = yst2, yen2
+            do i = yst1, yen1
+               m = real(i + (j - 1) * nx + (k - 1) * nx * ny, mytype)
+               cm = cmplx(m, real(m - 1, mytype), kind=mytype)
+               if (abs(u2(i, j, k) - cm) > 0) error_flag = .true.
+            end do
+         end do
+      end do
+      !$acc end loop
+      call MPI_ALLREDUCE(MPI_IN_PLACE, error_flag, 1, MPI_LOGICAL, MPI_LOR, MPI_COMM_WORLD, ierror)
+      if (ierror /= 0) call decomp_2d_abort(ierror, "MPI_ALLREDUCE")
+      if (error_flag) call decomp_2d_abort(3, "error swaping z->y")
+
+      !!!!!!!!!!!!!!!!!!!!!!!
+      ! y-pensil ==> x-pensil
+      t7 = MPI_WTIME()
+      call transpose_y_to_x(u2, u1)
+      t8 = t8 + MPI_WTIME() - t7
+
+      !$acc parallel loop default(present) collapse(3) private(m,cm) reduction(.or.:error_flag)
+      do k = xst3, xen3
+         do j = xst2, xen2
+            do i = xst1, xen1
+               m = real(i + (j - 1) * nx + (k - 1) * nx * ny, mytype)
+               cm = cmplx(m, real(m - 1, mytype), kind=mytype)
+               if (abs(u1(i, j, k) - cm) > 0) error_flag = .true.
+            end do
+         end do
+      end do
+      !$acc end loop
+      call MPI_ALLREDUCE(MPI_IN_PLACE, error_flag, 1, MPI_LOGICAL, MPI_LOR, MPI_COMM_WORLD, ierror)
+      if (ierror /= 0) call decomp_2d_abort(ierror, "MPI_ALLREDUCE")
+      if (error_flag) call decomp_2d_abort(4, "error swaping y->x")
+
+   end do
+
+   call MPI_ALLREDUCE(t2, t1, 1, MPI_DOUBLE_PRECISION, MPI_SUM, &
+                      MPI_COMM_WORLD, ierror)
+   t1 = t1 / dble(nproc) / dble(niter)
+   call MPI_ALLREDUCE(t4, t3, 1, MPI_DOUBLE_PRECISION, MPI_SUM, &
+                      MPI_COMM_WORLD, ierror)
+   t3 = t3 / dble(nproc) / dble(niter)
+   call MPI_ALLREDUCE(t6, t5, 1, MPI_DOUBLE_PRECISION, MPI_SUM, &
+                      MPI_COMM_WORLD, ierror)
+   t5 = t5 / dble(nproc) / dble(niter)
+   call MPI_ALLREDUCE(t8, t7, 1, MPI_DOUBLE_PRECISION, MPI_SUM, &
+                      MPI_COMM_WORLD, ierror)
+   t7 = t7 / dble(nproc) / dble(niter)
+   t8 = t1 + t3 + t5 + t7
+   if (nrank == 0) then
+      write (*, *) 'Avg Time X->Y ', t1
+      write (*, *) 'Avg Time Y->Z ', t3
+      write (*, *) 'Avg Time Z->Y ', t5
+      write (*, *) 'Avg Time Y->X ', t7
+      write (*, *) 'Avg Time TOT  ', t8
+   end if
+
+   if (nrank == 0) then
+      write (*, *) " "
+      write (*, *) "Complex transpose completed"
+      write (*, *) " "
+   end if
+
+   call decomp_2d_finalize
+   call MPI_FINALIZE(ierror)
+   !$acc end data
+   deallocate (u1, u2, u3)
+
+end program timing2d_complex
+
diff --git a/examples/test2d/timing2d_real.f90 b/examples/test2d/timing2d_real.f90
new file mode 100644
index 00000000..adb8f1e4
--- /dev/null
+++ b/examples/test2d/timing2d_real.f90
@@ -0,0 +1,298 @@
+!! SPDX-License-Identifier: BSD-3-Clause
+program timing2d_real
+
+   use mpi
+   use decomp_2d
+   use decomp_2d_constants
+   use decomp_2d_mpi
+#if defined(_GPU)
+   use cudafor
+   use openacc
+#endif
+
+   implicit none
+
+   integer, parameter :: nx_base = 17, ny_base = 13, nz_base = 11
+   integer :: nx, ny, nz
+   integer :: p_row = 0, p_col = 0
+   integer :: resize_domain
+   integer :: nranks_tot
+   integer :: nargin, arg, FNLength, status, DecInd
+   character(len=80) :: InputFN
+
+   real(mytype), allocatable, dimension(:, :, :) :: u1, u2, u3
+
+   integer :: i, j, k, ierror
+   integer :: xst1, xst2, xst3
+   integer :: xen1, xen2, xen3
+   integer :: yst1, yst2, yst3
+   integer :: yen1, yen2, yen3
+   integer :: zst1, zst2, zst3
+   integer :: zen1, zen2, zen3
+   logical :: error_flag
+   real(mytype) :: m
+
+   double precision :: t1, t2, t3, t4, t5, t6, t7, t8
+   integer :: iter, niter = 10
+
+   ! Init
+   error_flag = .false.
+   call MPI_INIT(ierror)
+   if (ierror /= 0) call decomp_2d_abort(ierror, "MPI_INIT")
+   ! To resize the domain we need to know global number of ranks
+   ! This operation is also done as part of decomp_2d_init
+   call MPI_COMM_SIZE(MPI_COMM_WORLD, nranks_tot, ierror)
+   resize_domain = int(nranks_tot / 4) + 1
+   nx = nx_base * resize_domain
+   ny = ny_base * resize_domain
+   nz = nz_base * resize_domain
+   ! Now we can check if user put some inputs
+   ! Handle input file like a boss -- GD
+   nargin = command_argument_count()
+   if ((nargin == 0) .or. (nargin == 2) .or. (nargin == 5) .or. (nargin == 6)) then
+      do arg = 1, nargin
+         call get_command_argument(arg, InputFN, FNLength, status)
+         read (InputFN, *, iostat=status) DecInd
+         if (arg == 1) then
+            p_row = DecInd
+         elseif (arg == 2) then
+            p_col = DecInd
+         elseif (arg == 3) then
+            nx = DecInd
+         elseif (arg == 4) then
+            ny = DecInd
+         elseif (arg == 5) then
+            nz = DecInd
+         elseif (arg == 6) then
+            niter = DecInd
+         end if
+      end do
+   else
+      ! nrank not yet computed we need to avoid write
+      ! for every rank
+      call MPI_COMM_RANK(MPI_COMM_WORLD, nrank, ierror)
+      if (nrank == 0) then
+         print *, "This Test takes no inputs or 2 inputs as"
+         print *, "  1) p_row (default=0)"
+         print *, "  2) p_col (default=0)"
+         print *, "or 5-6 inputs as"
+         print *, "  1) p_row (default=0)"
+         print *, "  2) p_col (default=0)"
+         print *, "  3) nx "
+         print *, "  4) ny "
+         print *, "  5) nz "
+         print *, "  6) n iterations (optional)"
+         print *, "Number of inputs is not correct and the defult settings"
+         print *, "will be used"
+      end if
+   end if
+   call decomp_2d_init(nx, ny, nz, p_row, p_col)
+
+   !! ***** global data *****
+   !allocate(data1(nx,ny,nz))
+   !m = 1
+   !do k = 1, nz
+   !   do j = 1, ny
+   !      do i = 1, nx
+   !         data1(i, j, k) = float(m)
+   !         m = m + 1
+   !      end do
+   !   end do
+   !end do
+
+   ! Fill the local index
+   xst1 = xstart(1); xen1 = xend(1)
+   xst2 = xstart(2); xen2 = xend(2)
+   xst3 = xstart(3); xen3 = xend(3)
+   yst1 = ystart(1); yen1 = yend(1)
+   yst2 = ystart(2); yen2 = yend(2)
+   yst3 = ystart(3); yen3 = yend(3)
+   zst1 = zstart(1); zen1 = zend(1)
+   zst2 = zstart(2); zen2 = zend(2)
+   zst3 = zstart(3); zen3 = zend(3)
+
+   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+   ! Testing the swap routines
+   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+
+   call alloc_x(u1, opt_global=.true.)
+   call alloc_y(u2, opt_global=.true.)
+   call alloc_z(u3, opt_global=.true.)
+
+   !$acc data copy(u1,u2,u3)
+   ! original x-pensil based data
+   !$acc parallel loop default(present) private(m)
+   do k = xst3, xen3
+      do j = xst2, xen2
+         do i = xst1, xen1
+            m = real(i + (j - 1) * nx + (k - 1) * nx * ny, mytype)
+            u1(i, j, k) = m
+         end do
+      end do
+   end do
+   !$acc end loop
+
+10 format(15I5)
+
+#ifdef DEBUG
+   if (nrank == 0) then
+      !$acc update self(u1)
+      write (*, *) 'Numbers held on Rank 0'
+      write (*, *) ' '
+      write (*, *) 'X-pencil'
+      write (*, 10) int(u1)
+   end if
+#endif
+
+   t1 = MPI_WTIME()
+   call transpose_x_to_y(u1, u2)
+   call transpose_y_to_z(u2, u3)
+   call transpose_z_to_y(u3, u2)
+   call transpose_y_to_x(u2, u1)
+   t2 = MPI_WTIME() - t1
+   call MPI_ALLREDUCE(t2, t1, 1, MPI_DOUBLE_PRECISION, MPI_SUM, &
+                      MPI_COMM_WORLD, ierror)
+   t1 = t1 / dble(nproc)
+   ! Init the total times
+   t2 = 0.d0
+   t4 = 0.d0
+   t6 = 0.d0
+   t8 = 0.d0
+   if (nrank == 0) then
+      write (*, *) 'Tot time it 0 ', t1
+   end if
+   do iter = 1, niter
+      !!!!!!!!!!!!!!!!!!!!!!!
+      ! x-pensil ==> y-pensil
+      t1 = MPI_WTIME()
+      call transpose_x_to_y(u1, u2)
+      t2 = t2 + MPI_WTIME() - t1
+
+#ifdef DEBUG
+      if (nrank == 0) then
+         !$acc update self(u2)
+         write (*, *) ' '
+         write (*, *) 'Y-pencil'
+         write (*, 10) int(u2)
+      end if
+#endif
+
+      ! also check the transposition this way
+      !$acc parallel loop default(present) collapse(3) private(m) reduction(.or.:error_flag)
+      do k = yst3, yen3
+         do j = yst2, yen2
+            do i = yst1, yen1
+               m = real(i + (j - 1) * nx + (k - 1) * nx * ny, mytype)
+               if (abs(u2(i, j, k) - m) > 0) error_flag = .true.
+            end do
+         end do
+      end do
+      !$acc end loop
+      call MPI_ALLREDUCE(MPI_IN_PLACE, error_flag, 1, MPI_LOGICAL, MPI_LOR, MPI_COMM_WORLD, ierror)
+      if (ierror /= 0) call decomp_2d_abort(ierror, "MPI_ALLREDUCE")
+      if (error_flag) call decomp_2d_abort(1, "error swaping x->y")
+
+      !!!!!!!!!!!!!!!!!!!!!!!
+      ! y-pensil ==> z-pensil
+      t3 = MPI_WTIME()
+      call transpose_y_to_z(u2, u3)
+      t4 = t4 + MPI_WTIME() - t3
+
+#ifdef DEBUG
+      if (nrank == 0) then
+         !$acc update self(u3)
+         write (*, *) ' '
+         write (*, *) 'Z-pencil'
+         write (*, 10) int(u3)
+      end if
+#endif
+
+      !$acc parallel loop default(present) collapse(3) private(m) reduction(.or.:error_flag)
+      do k = zst3, zen3
+         do j = zst2, zen2
+            do i = zst1, zen1
+               m = real(i + (j - 1) * nx + (k - 1) * nx * ny, mytype)
+               if (abs(u3(i, j, k) - m) > 0) error_flag = .true.
+            end do
+         end do
+      end do
+      !$acc end loop
+      call MPI_ALLREDUCE(MPI_IN_PLACE, error_flag, 1, MPI_LOGICAL, MPI_LOR, MPI_COMM_WORLD, ierror)
+      if (ierror /= 0) call decomp_2d_abort(ierror, "MPI_ALLREDUCE")
+      if (error_flag) call decomp_2d_abort(2, "error swaping y->z")
+
+      !!!!!!!!!!!!!!!!!!!!!!!
+      ! z-pensil ==> y-pensil
+      t5 = MPI_WTIME()
+      call transpose_z_to_y(u3, u2)
+      t6 = t6 + MPI_WTIME() - t5
+
+      !$acc parallel loop default(present) collapse(3) private(m) reduction(.or.:error_flag)
+      do k = yst3, yen3
+         do j = yst2, yen2
+            do i = yst1, yen1
+               m = real(i + (j - 1) * nx + (k - 1) * nx * ny, mytype)
+               if (abs(u2(i, j, k) - m) > 0) error_flag = .true.
+            end do
+         end do
+      end do
+      !$acc end loop
+      call MPI_ALLREDUCE(MPI_IN_PLACE, error_flag, 1, MPI_LOGICAL, MPI_LOR, MPI_COMM_WORLD, ierror)
+      if (ierror /= 0) call decomp_2d_abort(ierror, "MPI_ALLREDUCE")
+      if (error_flag) call decomp_2d_abort(3, "error swaping z->y")
+
+      !!!!!!!!!!!!!!!!!!!!!!!
+      ! y-pensil ==> x-pensil
+      t7 = MPI_WTIME()
+      call transpose_y_to_x(u2, u1)
+      t8 = t8 + MPI_WTIME() - t7
+
+      !$acc parallel loop default(present) collapse(3) private(m) reduction(.or.:error_flag)
+      do k = xst3, xen3
+         do j = xst2, xen2
+            do i = xst1, xen1
+               m = real(i + (j - 1) * nx + (k - 1) * nx * ny, mytype)
+               if (abs(u1(i, j, k) - m) > 0) error_flag = .true.
+            end do
+         end do
+      end do
+      !$acc end loop
+      call MPI_ALLREDUCE(MPI_IN_PLACE, error_flag, 1, MPI_LOGICAL, MPI_LOR, MPI_COMM_WORLD, ierror)
+      if (ierror /= 0) call decomp_2d_abort(ierror, "MPI_ALLREDUCE")
+      if (error_flag) call decomp_2d_abort(4, "error swaping y->x")
+   end do
+
+   call MPI_ALLREDUCE(t2, t1, 1, MPI_DOUBLE_PRECISION, MPI_SUM, &
+                      MPI_COMM_WORLD, ierror)
+   t1 = t1 / dble(nproc) / dble(niter)
+   call MPI_ALLREDUCE(t4, t3, 1, MPI_DOUBLE_PRECISION, MPI_SUM, &
+                      MPI_COMM_WORLD, ierror)
+   t3 = t3 / dble(nproc) / dble(niter)
+   call MPI_ALLREDUCE(t6, t5, 1, MPI_DOUBLE_PRECISION, MPI_SUM, &
+                      MPI_COMM_WORLD, ierror)
+   t5 = t5 / dble(nproc) / dble(niter)
+   call MPI_ALLREDUCE(t8, t7, 1, MPI_DOUBLE_PRECISION, MPI_SUM, &
+                      MPI_COMM_WORLD, ierror)
+   t7 = t7 / dble(nproc) / dble(niter)
+   t8 = t1 + t3 + t5 + t7
+   if (nrank == 0) then
+      write (*, *) 'Avg Time X->Y ', t1
+      write (*, *) 'Avg Time Y->Z ', t3
+      write (*, *) 'Avg Time Z->Y ', t5
+      write (*, *) 'Avg Time Y->X ', t7
+      write (*, *) 'Avg Time TOT  ', t8
+   end if
+
+   if (nrank == 0) then
+      write (*, *) " "
+      write (*, *) "Real transpose completed"
+      write (*, *) " "
+   end if
+
+   call decomp_2d_finalize
+   call MPI_FINALIZE(ierror)
+   !$acc end data
+   deallocate (u1, u2, u3)
+
+end program timing2d_real
+
diff --git a/scripts/.fprettifyrc b/scripts/.fprettifyrc
new file mode 100644
index 00000000..80316756
--- /dev/null
+++ b/scripts/.fprettifyrc
@@ -0,0 +1,5 @@
+# Configuration options for fprettify
+
+whitespace = 3             # Whitespace preset - operators, print/read, *, /, +, -
+enable-replacements = true # Replaces relational operators, e.g. .lt. -> <
+c-relations = true         # Use C-style relational operators, e.g. <=
diff --git a/scripts/format.sh b/scripts/format.sh
new file mode 100644
index 00000000..50658aaf
--- /dev/null
+++ b/scripts/format.sh
@@ -0,0 +1,12 @@
+#!/usr/bin/env sh
+#
+# format.sh
+#
+# Runs the prettifier over 2decomp sources.
+
+# Find 2decomp root
+SCRDIR=$( dirname -- "$0"; )
+D2DDIR=${SCRDIR}/../
+
+# Run fprettify with config file scripts/.fprettifyrc
+fprettify -c ${SCRDIR}/.fprettifyrc ${D2DDIR}/src/*f90 ${D2DDIR}/examples/*/*f90
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
new file mode 100644
index 00000000..2e80684a
--- /dev/null
+++ b/src/CMakeLists.txt
@@ -0,0 +1,122 @@
+file(GLOB files_decomp factor.f90
+	               decomp_2d_constants.f90
+		       decomp_2d_mpi.f90
+	               decomp_2d.f90
+		       log.f90
+		       io.f90)
+include_directories(${CMAKE_SOURCE_DIR}/src)
+
+if(${BUILD_TARGET} MATCHES "gpu")
+  list(APPEND files_decomp decomp_2d_cumpi.f90)
+  if(ENABLE_NCCL)
+    list(APPEND files_decomp decomp_2d_nccl.f90)
+  endif()
+endif(${BUILD_TARGET} MATCHES "gpu")
+
+if(FFTW_FOUND)
+  if (${FFT_Choice} MATCHES "fftw_f03")
+    message (STATUS "Compiling using FFTW3 F2003 interface")
+    file(GLOB files_fft fft_fftw3_f03.f90)
+  else()
+    message (STATUS "Compiling using FFTW3 legacy interface")
+    file(GLOB files_fft fft_fftw3.f90)
+  endif()
+elseif(MKL_FOUND)
+  message (STATUS "Compiling using MKL")
+  file(GLOB files_fft $ENV{MKLROOT}/include/mkl_dfti.f90 fft_mkl.f90)
+elseif(CUFFT_FOUND)
+  message (STATUS "Compiling using cuFFT")
+  file(GLOB files_fft fft_cufft.f90)
+else(FFTW_FOUND)
+  message (STATUS "Compiling using Generic FFT")
+  file(GLOB files_fft glassman.f90 fft_generic.f90)
+endif(FFTW_FOUND)
+list(APPEND files_fft fft_log.f90)
+
+if (ENABLE_PROFILER)
+  file(GLOB prof_files profiler_${ENABLE_PROFILER}.f90)
+endif()
+
+set(SRCFILES ${files_decomp} ${files_fft} ${prof_files})
+
+add_library(decomp2d ${SRCFILES})
+target_include_directories(decomp2d PUBLIC
+  $<BUILD_INTERFACE:${CMAKE_Fortran_MODULE_DIRECTORY}>
+  $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
+if (MPI_FOUND)
+  target_link_libraries(decomp2d PRIVATE MPI::MPI_Fortran)
+endif (MPI_FOUND)
+
+if(FFTW_FOUND)
+  if (FFTW_DOUBLE_LIB_FOUND)
+    message(STATUS "FFTW_DOUBLE     : ${FFTW_DOUBLE_LIB}")
+    target_link_libraries(decomp2d PRIVATE ${FFTW_DOUBLE_LIB})
+  endif (FFTW_DOUBLE_LIB_FOUND)
+  if (FFTW_FLOAT_LIB_FOUND)
+    message(STATUS "FFTW_SINGLE     : ${FFTW_FLOAT_LIB}")
+    target_link_libraries(decomp2d PRIVATE ${FFTW_FLOAT_LIB})
+  endif (FFTW_FLOAT_LIB_FOUND)
+  if (NOT (FFTW_DOUBLE_LIB_FOUND OR FFTW_FLOAT_LIB_FOUND))
+    message(FATAL_ERROR "CMake step for FFTW failed")
+  endif (NOT (FFTW_DOUBLE_LIB_FOUND OR FFTW_FLOAT_LIB_FOUND))
+  target_include_directories(decomp2d PRIVATE ${FFTW_INCLUDE_DIRS})
+elseif(MKL_FOUND)
+  target_compile_options(decomp2d PUBLIC $<TARGET_PROPERTY:MKL::MKL,INTERFACE_COMPILE_OPTIONS>)
+  target_include_directories(decomp2d PUBLIC $<TARGET_PROPERTY:MKL::MKL,INTERFACE_INCLUDE_DIRECTORIES>)
+  target_link_libraries(decomp2d PUBLIC $<LINK_ONLY:MKL::MKL>)
+endif(FFTW_FOUND)
+
+if (ENABLE_PROFILER)
+  target_compile_definitions(decomp2d PUBLIC -DPROFILER)
+
+  if (caliper_FOUND)
+    target_include_directories(decomp2d PUBLIC ${caliper_INCLUDE_DIR}/caliper/fortran)
+    target_link_libraries(decomp2d PRIVATE caliper)
+  endif()
+endif()
+
+if (HALO_DEBUG)
+  target_compile_definitions(decomp2d PUBLIC -DHALO_DEBUG)
+endif()
+
+if (IO_BACKEND MATCHES "adios2")
+  target_compile_definitions(decomp2d PUBLIC -DADIOS2)
+  target_link_libraries(decomp2d PUBLIC adios2::fortran_mpi adios2::fortran)
+endif (IO_BACKEND MATCHES "adios2")
+
+install(TARGETS decomp2d
+  EXPORT decomp2d-targets
+  RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+  LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+  ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR})
+
+install(DIRECTORY ${CMAKE_Fortran_MODULE_DIRECTORY}/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
+  FILES_MATCHING PATTERN "*.mod")
+install(DIRECTORY ${CMAKE_Fortran_MODULE_DIRECTORY}/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
+  FILES_MATCHING PATTERN "*.smod")
+      
+## Packaging
+install(EXPORT decomp2d-targets
+  DESTINATION ${CMAKE_INSTALL_LIBDIR})
+
+configure_file(
+  ${CMAKE_SOURCE_DIR}/cmake/decomp2d-config.cmake.in
+  ${PROJECT_BINARY_DIR}/cmake/decomp2d-config.cmake @ONLY
+)
+configure_file(
+  ${CMAKE_SOURCE_DIR}/cmake/decomp2d-config-version.cmake.in
+  ${PROJECT_BINARY_DIR}/cmake/decomp2d-config-version.cmake
+)
+
+install(FILES
+  ${PROJECT_BINARY_DIR}/cmake/decomp2d-config.cmake
+  ${PROJECT_BINARY_DIR}/cmake/decomp2d-config-version.cmake
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}/decomp2d)
+
+# Make 2decomp usable from build tree.
+export(TARGETS decomp2d
+  FILE ${PROJECT_BINARY_DIR}/decomp2d-targets.cmake)
+configure_file(
+  ${CMAKE_SOURCE_DIR}/cmake/decomp2d-build_config.cmake.in
+  ${PROJECT_BINARY_DIR}/decomp2d-config.cmake @ONLY
+)
diff --git a/src/alloc.f90 b/src/alloc.f90
index 867147ca..24d36990 100644
--- a/src/alloc.f90
+++ b/src/alloc.f90
@@ -1,14 +1,4 @@
-!=======================================================================
-! This is part of the 2DECOMP&FFT library
-!
-! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil)
-! decomposition. It also implements a highly scalable distributed
-! three-dimensional Fast Fourier Transform (FFT).
-!
-! Copyright (C) 2009-2021 Ning Li, the Numerical Algorithms Group (NAG)
-!
-!=======================================================================
-
+!! SPDX-License-Identifier: BSD-3-Clause
   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
   ! Utility routine to help allocate 3D arrays
   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
@@ -16,353 +6,396 @@
   ! X-pencil real arrays
   subroutine alloc_x_real_short(var, opt_xlevel, opt_global)
 
-    implicit none
+     implicit none
 
-    real(mytype), allocatable, dimension(:,:,:) :: var
-    logical, intent(IN), optional :: opt_global
-    integer, dimension(3), intent(IN), optional :: opt_xlevel
-    integer, dimension(3) :: xlevel
+     real(mytype), allocatable, dimension(:, :, :) :: var
+     logical, intent(IN), optional :: opt_global
+     integer, dimension(3), intent(IN), optional :: opt_xlevel
+     integer, dimension(3) :: xlevel
 
-    if (present(opt_xlevel)) then
-       xlevel = opt_xlevel
-    else
-       xlevel = decomp_main%xlevel
-    end if
+     if (present(opt_xlevel)) then
+        xlevel = opt_xlevel
+     else
+        xlevel = decomp_main%xlevel
+     end if
 
-    call alloc_x(var, decomp_main, xlevel, opt_global)
+     call alloc_x(var, decomp_main, xlevel, opt_global)
 
   end subroutine alloc_x_real_short
 
-  subroutine alloc_x_real(var, decomp, xlevel, opt_global)
-
-    implicit none
-
-    real(mytype), allocatable, dimension(:,:,:) :: var
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
-    logical, intent(IN), optional :: opt_global
-
-    logical :: global
-    integer :: alloc_stat, errorcode
-    integer, dimension(3) :: xlevel
-
-    if (present(opt_global)) then
-       global = opt_global
-    else
-       global = .false.
-    end if
-
-    if (global) then
-       allocate(var(decomp%xst(1)-xlevel(1):decomp%xen(1)+xlevel(1), &
-            decomp%xst(2)-xlevel(2):decomp%xen(2)+xlevel(2), &
-            decomp%xst(3)-xlevel(3):decomp%xen(3)+xlevel(3)), &
-            stat=alloc_stat)
-    else
-       allocate(var(1-xlevel(1):decomp%xsz(1)+xlevel(1), &
-            1-xlevel(2):decomp%xsz(2)+xlevel(2), &
-            1-xlevel(3):decomp%xsz(3)+xlevel(3)), &
-            stat=alloc_stat)
-    end if
-
-    if (alloc_stat /= 0) then
-       errorcode = 8
-       call decomp_2d_abort(errorcode, &
-            'Memory allocation failed when creating new arrays')
-    end if
-
-    return
+  subroutine alloc_x_real(var, decomp, opt_xlevel, opt_global)
+
+     implicit none
+
+     real(mytype), allocatable, dimension(:, :, :) :: var
+     TYPE(DECOMP_INFO), intent(IN) :: decomp
+     integer, dimension(3), intent(in), optional :: opt_xlevel
+     logical, intent(IN), optional :: opt_global
+
+     logical :: global
+     integer :: alloc_stat, errorcode
+
+     integer, dimension(3) :: xlevel
+
+     if (present(opt_global)) then
+        global = opt_global
+     else
+        global = .false.
+     end if
+
+     if (present(opt_xlevel)) then
+        xlevel = opt_xlevel
+     else
+        xlevel = decomp_main%xlevel
+     end if
+
+     if (global) then
+        allocate (var(decomp%xst(1) - xlevel(1):decomp%xen(1) + xlevel(1), &
+                      decomp%xst(2) - xlevel(2):decomp%xen(2) + xlevel(2), &
+                      decomp%xst(3) - xlevel(3):decomp%xen(3) + xlevel(3)), &
+                  stat=alloc_stat)
+     else
+        allocate (var(1 - xlevel(1):decomp%xsz(1) + xlevel(1), &
+                      1 - xlevel(2):decomp%xsz(2) + xlevel(2), &
+                      1 - xlevel(3):decomp%xsz(3) + xlevel(3)), &
+                  stat=alloc_stat)
+     end if
+
+     if (alloc_stat /= 0) then
+        errorcode = 8
+        call decomp_2d_abort(errorcode, &
+                             'Memory allocation failed when creating new arrays')
+     end if
+
+     return
   end subroutine alloc_x_real
 
   ! X-pencil complex arrays
   subroutine alloc_x_complex_short(var, opt_xlevel, opt_global)
 
-    implicit none
+     implicit none
 
-    complex(mytype), allocatable, dimension(:,:,:) :: var
-    logical, intent(IN), optional :: opt_global
-    integer, dimension(3), intent(IN), optional :: opt_xlevel
-    integer, dimension(3) :: xlevel
+     complex(mytype), allocatable, dimension(:, :, :) :: var
+     logical, intent(IN), optional :: opt_global
+     integer, dimension(3), intent(IN), optional :: opt_xlevel
+     integer, dimension(3) :: xlevel
 
-    if (present(opt_xlevel)) then
-       xlevel = opt_xlevel
-    else
-       xlevel = decomp_main%xlevel
-    end if
+     if (present(opt_xlevel)) then
+        xlevel = opt_xlevel
+     else
+        xlevel = decomp_main%xlevel
+     end if
 
-    call alloc_x(var, decomp_main, xlevel, opt_global)
+     call alloc_x(var, decomp_main, xlevel, opt_global)
 
   end subroutine alloc_x_complex_short
 
-  subroutine alloc_x_complex(var, decomp, xlevel, opt_global)
-
-    implicit none
-
-    complex(mytype), allocatable, dimension(:,:,:) :: var
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
-    logical, intent(IN), optional :: opt_global
-
-    logical :: global
-    integer :: alloc_stat, errorcode
-    integer, dimension(3) :: xlevel
-
-    if (present(opt_global)) then
-       global = opt_global
-    else
-       global = .false.
-    end if
-
-    if (global) then
-       allocate(var(decomp%xst(1)-xlevel(1):decomp%xen(1)+xlevel(1), &
-            decomp%xst(2)-xlevel(2):decomp%xen(2)+xlevel(2), &
-            decomp%xst(3)-xlevel(3):decomp%xen(3)+xlevel(3)), &
-            stat=alloc_stat)
-    else
-       allocate(var(1-xlevel(1):decomp%xsz(1)+xlevel(1), &
-            1-xlevel(2):decomp%xsz(2)+xlevel(2), &
-            1-xlevel(3):decomp%xsz(3)+xlevel(3)), &
-            stat=alloc_stat)
-    end if
-
-    if (alloc_stat /= 0) then
-       errorcode = 8
-       call decomp_2d_abort(errorcode, &
-            'Memory allocation failed when creating new arrays')
-    end if
-
-    return
+  subroutine alloc_x_complex(var, decomp, opt_xlevel, opt_global)
+
+     implicit none
+
+     complex(mytype), allocatable, dimension(:, :, :) :: var
+     TYPE(DECOMP_INFO), intent(IN) :: decomp
+     integer, dimension(3), intent(in), optional :: opt_xlevel
+     logical, intent(IN), optional :: opt_global
+
+     logical :: global
+     integer :: alloc_stat, errorcode
+     integer, dimension(3) :: xlevel
+
+     if (present(opt_global)) then
+        global = opt_global
+     else
+        global = .false.
+     end if
+
+     if (present(opt_xlevel)) then
+        xlevel = opt_xlevel
+     else
+        xlevel = decomp_main%xlevel
+     end if
+
+     if (global) then
+        allocate (var(decomp%xst(1) - xlevel(1):decomp%xen(1) + xlevel(1), &
+                      decomp%xst(2) - xlevel(2):decomp%xen(2) + xlevel(2), &
+                      decomp%xst(3) - xlevel(3):decomp%xen(3) + xlevel(3)), &
+                  stat=alloc_stat)
+     else
+        allocate (var(1 - xlevel(1):decomp%xsz(1) + xlevel(1), &
+                      1 - xlevel(2):decomp%xsz(2) + xlevel(2), &
+                      1 - xlevel(3):decomp%xsz(3) + xlevel(3)), &
+                  stat=alloc_stat)
+     end if
+
+     if (alloc_stat /= 0) then
+        errorcode = 8
+        call decomp_2d_abort(errorcode, &
+                             'Memory allocation failed when creating new arrays')
+     end if
+
+     return
   end subroutine alloc_x_complex
 
   ! Y-pencil real arrays
   subroutine alloc_y_real_short(var, opt_ylevel, opt_global)
 
-    implicit none
+     implicit none
 
-    real(mytype), allocatable, dimension(:,:,:) :: var
-    logical, intent(IN), optional :: opt_global
-    integer, dimension(3), intent(IN), optional :: opt_ylevel
-    integer, dimension(3) :: ylevel
+     real(mytype), allocatable, dimension(:, :, :) :: var
+     logical, intent(IN), optional :: opt_global
+     integer, dimension(3), intent(IN), optional :: opt_ylevel
+     integer, dimension(3) :: ylevel
 
-    if (present(opt_ylevel)) then
-       ylevel = opt_ylevel
-    else
-       ylevel = decomp_main%ylevel
-    end if
+     if (present(opt_ylevel)) then
+        ylevel = opt_ylevel
+     else
+        ylevel = decomp_main%ylevel
+     end if
 
-    call alloc_y(var, decomp_main, ylevel, opt_global)
+     call alloc_y(var, decomp_main, ylevel, opt_global)
 
   end subroutine alloc_y_real_short
 
-  subroutine alloc_y_real(var, decomp, ylevel, opt_global)
-
-    implicit none
-
-    real(mytype), allocatable, dimension(:,:,:) :: var
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
-    logical, intent(IN), optional :: opt_global
-
-    logical :: global
-    integer :: alloc_stat, errorcode
-    integer, dimension(3) :: ylevel
-
-    if (present(opt_global)) then
-       global = opt_global
-    else
-       global = .false.
-    end if
-
-    if (global) then
-       allocate(var(decomp%yst(1)-ylevel(1):decomp%yen(1)+ylevel(1), &
-            decomp%yst(2)-ylevel(2):decomp%yen(2)+ylevel(2), &
-            decomp%yst(3)-ylevel(3):decomp%yen(3)+ylevel(3)), &
-            stat=alloc_stat)
-    else
-       allocate(var(1-ylevel(1):decomp%ysz(1)+ylevel(1), &
-            1-ylevel(2):decomp%ysz(2)+ylevel(2), &
-            1-ylevel(3):decomp%ysz(3)+ylevel(3)), &
-            stat=alloc_stat)
-    end if
-
-    if (alloc_stat /= 0) then
-       errorcode = 8
-       call decomp_2d_abort(errorcode, &
-            'Memory allocation failed when creating new arrays')
-    end if
-
-    return
+  subroutine alloc_y_real(var, decomp, opt_ylevel, opt_global)
+
+     implicit none
+
+     real(mytype), allocatable, dimension(:, :, :) :: var
+     TYPE(DECOMP_INFO), intent(IN) :: decomp
+     integer, dimension(3), intent(IN), optional :: opt_ylevel
+     logical, intent(IN), optional :: opt_global
+
+     logical :: global
+     integer :: alloc_stat, errorcode
+     integer, dimension(3) :: ylevel
+
+     if (present(opt_global)) then
+        global = opt_global
+     else
+        global = .false.
+     end if
+
+     if (present(opt_ylevel)) then
+        ylevel = opt_ylevel
+     else
+        ylevel = decomp_main%ylevel
+     end if
+
+     if (global) then
+        allocate (var(decomp%yst(1) - ylevel(1):decomp%yen(1) + ylevel(1), &
+                      decomp%yst(2) - ylevel(2):decomp%yen(2) + ylevel(2), &
+                      decomp%yst(3) - ylevel(3):decomp%yen(3) + ylevel(3)), &
+                  stat=alloc_stat)
+     else
+        allocate (var(1 - ylevel(1):decomp%ysz(1) + ylevel(1), &
+                      1 - ylevel(2):decomp%ysz(2) + ylevel(2), &
+                      1 - ylevel(3):decomp%ysz(3) + ylevel(3)), &
+                  stat=alloc_stat)
+     end if
+
+     if (alloc_stat /= 0) then
+        errorcode = 8
+        call decomp_2d_abort(errorcode, &
+                             'Memory allocation failed when creating new arrays')
+     end if
+
+     return
   end subroutine alloc_y_real
 
   ! Y-pencil complex arrays
   subroutine alloc_y_complex_short(var, opt_ylevel, opt_global)
 
-    implicit none
+     implicit none
 
-    complex(mytype), allocatable, dimension(:,:,:) :: var
-    logical, intent(IN), optional :: opt_global
-    integer, dimension(3), intent(IN), optional :: opt_ylevel
-    integer, dimension(3) :: ylevel
+     complex(mytype), allocatable, dimension(:, :, :) :: var
+     logical, intent(IN), optional :: opt_global
+     integer, dimension(3), intent(IN), optional :: opt_ylevel
+     integer, dimension(3) :: ylevel
 
-    if (present(opt_ylevel)) then
-       ylevel = opt_ylevel
-    else
-       ylevel = decomp_main%ylevel
-    end if
+     if (present(opt_ylevel)) then
+        ylevel = opt_ylevel
+     else
+        ylevel = decomp_main%ylevel
+     end if
 
-    call alloc_y(var, decomp_main, ylevel, opt_global)
+     call alloc_y(var, decomp_main, ylevel, opt_global)
 
   end subroutine alloc_y_complex_short
 
-  subroutine alloc_y_complex(var, decomp, ylevel, opt_global)
-
-    implicit none
-
-    complex(mytype), allocatable, dimension(:,:,:) :: var
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
-    logical, intent(IN), optional :: opt_global
-
-    logical :: global
-    integer :: alloc_stat, errorcode
-    integer, dimension(3) :: ylevel
-
-    if (present(opt_global)) then
-       global = opt_global
-    else
-       global = .false.
-    end if
-
-    if (global) then
-       allocate(var(decomp%yst(1)-ylevel(1):decomp%yen(1)+ylevel(1), &
-            decomp%yst(2)-ylevel(2):decomp%yen(2)+ylevel(2), &
-            decomp%yst(3)-ylevel(3):decomp%yen(3)+ylevel(3)), &
-            stat=alloc_stat)
-    else
-       allocate(var(1-ylevel(1):decomp%ysz(1)+ylevel(1), &
-            1-ylevel(2):decomp%ysz(2)+ylevel(2), &
-            1-ylevel(3):decomp%ysz(3)+ylevel(3)), &
-            stat=alloc_stat)
-    end if
-
-    if (alloc_stat /= 0) then
-       errorcode = 8
-       call decomp_2d_abort(errorcode, &
-            'Memory allocation failed when creating new arrays')
-    end if
-
-    return
+  subroutine alloc_y_complex(var, decomp, opt_ylevel, opt_global)
+
+     implicit none
+
+     complex(mytype), allocatable, dimension(:, :, :) :: var
+     TYPE(DECOMP_INFO), intent(IN) :: decomp
+     integer, dimension(3), intent(IN), optional :: opt_ylevel
+     logical, intent(IN), optional :: opt_global
+
+     logical :: global
+     integer :: alloc_stat, errorcode
+     integer, dimension(3) :: ylevel
+
+     if (present(opt_global)) then
+        global = opt_global
+     else
+        global = .false.
+     end if
+
+     if (present(opt_ylevel)) then
+        ylevel = opt_ylevel
+     else
+        ylevel = decomp_main%ylevel
+     end if
+
+     if (global) then
+        allocate (var(decomp%yst(1) - ylevel(1):decomp%yen(1) + ylevel(1), &
+                      decomp%yst(2) - ylevel(2):decomp%yen(2) + ylevel(2), &
+                      decomp%yst(3) - ylevel(3):decomp%yen(3) + ylevel(3)), &
+                  stat=alloc_stat)
+     else
+        allocate (var(1 - ylevel(1):decomp%ysz(1) + ylevel(1), &
+                      1 - ylevel(2):decomp%ysz(2) + ylevel(2), &
+                      1 - ylevel(3):decomp%ysz(3) + ylevel(3)), &
+                  stat=alloc_stat)
+     end if
+
+     if (alloc_stat /= 0) then
+        errorcode = 8
+        call decomp_2d_abort(errorcode, &
+                             'Memory allocation failed when creating new arrays')
+     end if
+
+     return
   end subroutine alloc_y_complex
 
   ! Z-pencil real arrays
   subroutine alloc_z_real_short(var, opt_zlevel, opt_global)
 
-    implicit none
+     implicit none
 
-    real(mytype), allocatable, dimension(:,:,:) :: var
-    logical, intent(IN), optional :: opt_global
-    integer, dimension(3), intent(IN), optional :: opt_zlevel
-    integer, dimension(3) :: zlevel
+     real(mytype), allocatable, dimension(:, :, :) :: var
+     logical, intent(IN), optional :: opt_global
+     integer, dimension(3), intent(IN), optional :: opt_zlevel
+     integer, dimension(3) :: zlevel
 
-    if (present(opt_zlevel)) then
-       zlevel = opt_zlevel
-    else
-       zlevel = decomp_main%zlevel
-    end if
+     if (present(opt_zlevel)) then
+        zlevel = opt_zlevel
+     else
+        zlevel = decomp_main%zlevel
+     end if
 
-    call alloc_z(var, decomp_main, zlevel, opt_global)
+     call alloc_z(var, decomp_main, zlevel, opt_global)
 
   end subroutine alloc_z_real_short
 
-  subroutine alloc_z_real(var, decomp, zlevel, opt_global)
-
-    implicit none
-
-    real(mytype), allocatable, dimension(:,:,:) :: var
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
-    logical, intent(IN), optional :: opt_global
-
-    logical :: global
-    integer :: alloc_stat, errorcode
-    integer, dimension(3) :: zlevel
-
-    if (present(opt_global)) then
-       global = opt_global
-    else
-       global = .false.
-    end if
-
-    if (global) then
-       allocate(var(decomp%zst(1)-zlevel(1):decomp%zen(1)+zlevel(1), &
-            decomp%zst(2)-zlevel(2):decomp%zen(2)+zlevel(2), &
-            decomp%zst(3)-zlevel(3):decomp%zen(3)+zlevel(3)), &
-            stat=alloc_stat)
-    else
-       allocate(var(1-zlevel(1):decomp%zsz(1)+zlevel(1), &
-            1-zlevel(2):decomp%zsz(2)+zlevel(2), &
-            1-zlevel(3):decomp%zsz(3)+zlevel(3)), &
-            stat=alloc_stat)
-    end if
-
-    if (alloc_stat /= 0) then
-       errorcode = 8
-       call decomp_2d_abort(errorcode, &
-            'Memory allocation failed when creating new arrays')
-    end if
-
-    return
+  subroutine alloc_z_real(var, decomp, opt_zlevel, opt_global)
+
+     implicit none
+
+     real(mytype), allocatable, dimension(:, :, :) :: var
+     TYPE(DECOMP_INFO), intent(IN) :: decomp
+     logical, intent(IN), optional :: opt_global
+     integer, dimension(3), intent(IN), optional :: opt_zlevel
+
+     logical :: global
+     integer :: alloc_stat, errorcode
+     integer, dimension(3) :: zlevel
+
+     if (present(opt_global)) then
+        global = opt_global
+     else
+        global = .false.
+     end if
+
+     if (present(opt_zlevel)) then
+        zlevel = opt_zlevel
+     else
+        zlevel = decomp_main%zlevel
+     end if
+
+     if (global) then
+        allocate (var(decomp%zst(1) - zlevel(1):decomp%zen(1) + zlevel(1), &
+                      decomp%zst(2) - zlevel(2):decomp%zen(2) + zlevel(2), &
+                      decomp%zst(3) - zlevel(3):decomp%zen(3) + zlevel(3)), &
+                  stat=alloc_stat)
+     else
+        allocate (var(1 - zlevel(1):decomp%zsz(1) + zlevel(1), &
+                      1 - zlevel(2):decomp%zsz(2) + zlevel(2), &
+                      1 - zlevel(3):decomp%zsz(3) + zlevel(3)), &
+                  stat=alloc_stat)
+     end if
+
+     if (alloc_stat /= 0) then
+        errorcode = 8
+        call decomp_2d_abort(errorcode, &
+                             'Memory allocation failed when creating new arrays')
+     end if
+
+     return
   end subroutine alloc_z_real
 
   ! Z-pencil complex arrays
   subroutine alloc_z_complex_short(var, opt_zlevel, opt_global)
 
-    implicit none
+     implicit none
 
-    complex(mytype), allocatable, dimension(:,:,:) :: var
-    logical, intent(IN), optional :: opt_global
-    integer, dimension(3), intent(IN), optional :: opt_zlevel
-    integer, dimension(3) :: zlevel
+     complex(mytype), allocatable, dimension(:, :, :) :: var
+     logical, intent(IN), optional :: opt_global
+     integer, dimension(3), intent(IN), optional :: opt_zlevel
+     integer, dimension(3) :: zlevel
 
-    if (present(opt_zlevel)) then
-       zlevel = opt_zlevel
-    else
-       zlevel = decomp_main%zlevel
-    end if
+     if (present(opt_zlevel)) then
+        zlevel = opt_zlevel
+     else
+        zlevel = decomp_main%zlevel
+     end if
 
-    call alloc_z(var, decomp_main, zlevel, opt_global)
+     call alloc_z(var, decomp_main, zlevel, opt_global)
 
   end subroutine alloc_z_complex_short
 
-  subroutine alloc_z_complex(var, decomp, zlevel, opt_global)
-
-    implicit none
-
-    complex(mytype), allocatable, dimension(:,:,:) :: var
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
-    logical, intent(IN), optional :: opt_global
-
-    logical :: global
-    integer :: alloc_stat, errorcode
-    integer, dimension(3) :: zlevel
-
-    if (present(opt_global)) then
-       global = opt_global
-    else
-       global = .false.
-    end if
-
-    if (global) then
-       allocate(var(decomp%zst(1)-zlevel(1):decomp%zen(1)+zlevel(1), &
-            decomp%zst(2)-zlevel(2):decomp%zen(2)+zlevel(2), &
-            decomp%zst(3)-zlevel(3):decomp%zen(3)+zlevel(3)), &
-            stat=alloc_stat)
-    else
-       allocate(var(1-zlevel(1):decomp%zsz(1)+zlevel(1), &
-            1-zlevel(2):decomp%zsz(2)+zlevel(2), &
-            1-zlevel(3):decomp%zsz(3)+zlevel(3)), &
-            stat=alloc_stat)
-    end if
-
-    if (alloc_stat /= 0) then
-       errorcode = 8
-       call decomp_2d_abort(errorcode, &
-            'Memory allocation failed when creating new arrays')
-    end if
-
-    return
+  subroutine alloc_z_complex(var, decomp, opt_zlevel, opt_global)
+
+     implicit none
+
+     complex(mytype), allocatable, dimension(:, :, :) :: var
+     TYPE(DECOMP_INFO), intent(IN) :: decomp
+     integer, dimension(3), intent(IN), optional :: opt_zlevel
+     logical, intent(IN), optional :: opt_global
+
+     logical :: global
+     integer :: alloc_stat, errorcode
+     integer, dimension(3) :: zlevel
+
+     if (present(opt_global)) then
+        global = opt_global
+     else
+        global = .false.
+     end if
+
+     if (present(opt_zlevel)) then
+        zlevel = opt_zlevel
+     else
+        zlevel = decomp_main%zlevel
+     end if
+
+     if (global) then
+        allocate (var(decomp%zst(1) - zlevel(1):decomp%zen(1) + zlevel(1), &
+                      decomp%zst(2) - zlevel(2):decomp%zen(2) + zlevel(2), &
+                      decomp%zst(3) - zlevel(3):decomp%zen(3) + zlevel(3)), &
+                  stat=alloc_stat)
+     else
+        allocate (var(1 - zlevel(1):decomp%zsz(1) + zlevel(1), &
+                      1 - zlevel(2):decomp%zsz(2) + zlevel(2), &
+                      1 - zlevel(3):decomp%zsz(3) + zlevel(3)), &
+                  stat=alloc_stat)
+     end if
+
+     if (alloc_stat /= 0) then
+        errorcode = 8
+        call decomp_2d_abort(errorcode, &
+                             'Memory allocation failed when creating new arrays')
+     end if
+
+     return
   end subroutine alloc_z_complex
diff --git a/src/decomp_2d.f90 b/src/decomp_2d.f90
index 1db7e452..4ca28b7f 100644
--- a/src/decomp_2d.f90
+++ b/src/decomp_2d.f90
@@ -1,374 +1,292 @@
-!=======================================================================
-! This is part of the 2DECOMP&FFT library
-!
-! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil)
-! decomposition. It also implements a highly scalable distributed
-! three-dimensional Fast Fourier Transform (FFT).
-!
-! Copyright (C) 2009-2012 Ning Li, the Numerical Algorithms Group (NAG)
-! Copyright (C) 2021               the University of Edinburgh (UoE)
-!
-!=======================================================================
+!! SPDX-License-Identifier: BSD-3-Clause
 
 ! This is the main 2D pencil decomposition module
 
 module decomp_2d
 
-  use MPI
-  use, intrinsic :: iso_fortran_env, only : real32, real64
-  use factor
+   use MPI
+   use, intrinsic :: iso_fortran_env, only: real32, real64
+   use factor
+   use decomp_2d_constants
+   use decomp_2d_mpi
 #if defined(_GPU)
-  use cudafor
+   use cudafor
+   use decomp_2d_cumpi
 #if defined(_NCCL)
-  use nccl
+   use nccl
+   use decomp_2d_nccl
 #endif
 #endif
 
-  implicit none
-
-  private        ! Make everything private unless declared public
-
-#ifdef DOUBLE_PREC
-  integer, parameter, public :: mytype = KIND(0._real64)
-  integer, parameter, public :: real_type = MPI_DOUBLE_PRECISION
-  integer, parameter, public :: real2_type = MPI_2DOUBLE_PRECISION
-  integer, parameter, public :: complex_type = MPI_DOUBLE_COMPLEX
-#ifdef SAVE_SINGLE
-  integer, parameter, public :: mytype_single = KIND(0._real32)
-  integer, parameter, public :: real_type_single = MPI_REAL
-#else
-  integer, parameter, public :: mytype_single = KIND(0._real64)
-  integer, parameter, public :: real_type_single = MPI_DOUBLE_PRECISION
-#endif
+   implicit none
+
+   private        ! Make everything private unless declared public
+
+   ! some key global variables
+   integer, save, public :: nx_global, ny_global, nz_global  ! global size
+
+   ! parameters for 2D Cartesian topology
+   integer, save, dimension(2) :: dims, coord
+   integer, save, public :: DECOMP_2D_COMM_CART_X = MPI_COMM_NULL
+   integer, save, public :: DECOMP_2D_COMM_CART_Y = MPI_COMM_NULL
+   integer, save, public :: DECOMP_2D_COMM_CART_Z = MPI_COMM_NULL
+   integer, save :: DECOMP_2D_COMM_ROW = MPI_COMM_NULL
+   integer, save :: DECOMP_2D_COMM_COL = MPI_COMM_NULL
+
+   ! define neighboring blocks (to be used in halo-cell support)
+   !  first dimension 1=X-pencil, 2=Y-pencil, 3=Z-pencil
+   ! second dimension 1=east, 2=west, 3=north, 4=south, 5=top, 6=bottom
+   integer, save, dimension(3, 6) :: neighbour
+
+   ! flags for periodic condition in three dimensions
+   logical, save :: periodic_x, periodic_y, periodic_z
+
+   !
+   ! Output for the log can be changed by the external code before calling decomp_2d_init
+   !
+   !    0 => No log output
+   !    1 => Master rank log output to stdout
+   !    2 => Master rank log output to the file "decomp_2d_setup.log"
+   !    3 => All ranks log output to a dedicated file
+   !
+   ! The default value is 2 (3 for debug builds)
+   !
+#ifdef DEBUG
+   integer, public, save :: decomp_log = D2D_LOG_TOFILE_FULL
 #else
-  integer, parameter, public :: mytype = KIND(0._real32)
-  integer, parameter, public :: real_type = MPI_REAL
-  integer, parameter, public :: real2_type = MPI_2REAL
-  integer, parameter, public :: complex_type = MPI_COMPLEX
-  integer, parameter, public :: mytype_single = KIND(0._real32)
-  integer, parameter, public :: real_type_single = MPI_REAL
+   integer, public, save :: decomp_log = D2D_LOG_TOFILE
 #endif
 
-  integer, save, public :: mytype_bytes
-
-  ! some key global variables
-  integer, save, public :: nx_global, ny_global, nz_global  ! global size
-
-  integer, save, public :: nrank = -1 ! local MPI rank
-  integer, save, public :: nproc = -1 ! total number of processors
-  integer, save, public :: decomp_2d_comm = MPI_COMM_NULL ! MPI communicator
-
-  ! parameters for 2D Cartesian topology
-  integer, save, dimension(2) :: dims, coord
-  integer, save, public :: DECOMP_2D_COMM_CART_X = MPI_COMM_NULL
-  integer, save, public :: DECOMP_2D_COMM_CART_Y = MPI_COMM_NULL
-  integer, save, public :: DECOMP_2D_COMM_CART_Z = MPI_COMM_NULL
-  integer, save :: DECOMP_2D_COMM_ROW = MPI_COMM_NULL
-  integer, save :: DECOMP_2D_COMM_COL = MPI_COMM_NULL
-
-  ! define neighboring blocks (to be used in halo-cell support)
-  !  first dimension 1=X-pencil, 2=Y-pencil, 3=Z-pencil
-  ! second dimension 1=east, 2=west, 3=north, 4=south, 5=top, 6=bottom
-  integer, save, dimension(3,6) :: neighbour
-
-  ! flags for periodic condition in three dimensions
-  logical, save :: periodic_x, periodic_y, periodic_z
-
-  !
-  ! Debug level can be changed by the external code before calling decomp_2d_init
-  !
-  ! The environment variable "DECOMP_2D_DEBUG" can be used to change the debug level
-  !
-  ! Debug checks are performed only when the preprocessor variable DEBUG is defined
-  !
-  enum, bind(c)
-     enumerator :: D2D_DEBUG_LEVEL_OFF = 0
-     enumerator :: D2D_DEBUG_LEVEL_CRITICAL = 1
-     enumerator :: D2D_DEBUG_LEVEL_ERROR = 2
-     enumerator :: D2D_DEBUG_LEVEL_WARN = 3
-     enumerator :: D2D_DEBUG_LEVEL_INFO = 4
-     enumerator :: D2D_DEBUG_LEVEL_DEBUG = 5
-     enumerator :: D2D_DEBUG_LEVEL_TRACE = 6
-  end enum
+   !
+   ! Debug level can be changed by the external code before calling decomp_2d_init
+   !
+   ! The environment variable "DECOMP_2D_DEBUG" can be used to change the debug level
+   !
+   ! Debug checks are performed only when the preprocessor variable DEBUG is defined
+   !
 #ifdef DEBUG
-  integer(kind(D2D_DEBUG_LEVEL_OFF)), public, save :: decomp_debug = D2D_DEBUG_LEVEL_INFO
+   integer(kind(D2D_DEBUG_LEVEL_OFF)), public, save :: decomp_debug = D2D_DEBUG_LEVEL_INFO
 #else
-  integer(kind(D2D_DEBUG_LEVEL_OFF)), public, save :: decomp_debug = D2D_DEBUG_LEVEL_OFF
+   integer(kind(D2D_DEBUG_LEVEL_OFF)), public, save :: decomp_debug = D2D_DEBUG_LEVEL_OFF
 #endif
 
-#if defined(_GPU)
-#if defined(_NCCL)
-  integer, save :: row_rank, col_rank
-#endif
-#endif
+   ! derived type to store decomposition info for a given global data size
+   TYPE, public :: DECOMP_INFO
+      ! staring/ending index and size of data held by current processor
+      integer, dimension(3) :: xst, xen, xsz  ! x-pencil
+      integer, dimension(3) :: yst, yen, ysz  ! y-pencil
+      integer, dimension(3) :: zst, zen, zsz  ! z-pencil
 
-#ifdef SHM
-  ! derived type to store shared-memory info
-  TYPE, public :: SMP_INFO
-     integer MPI_COMM          ! SMP associated with this communicator
-     integer NODE_ME           ! rank in this communicator
-     integer NCPU              ! size of this communicator
-     integer SMP_COMM          ! communicator for SMP-node masters
-     integer CORE_COMM         ! communicator for cores on SMP-node
-     integer SMP_ME            ! SMP-node id starting from 1 ... NSMP
-     integer NSMP              ! number of SMP-nodes in this communicator
-     integer CORE_ME           ! core id starting from 1 ... NCORE
-     integer NCORE             ! number of cores on this SMP-node
-     integer MAXCORE           ! maximum no. cores on any SMP-node
-     integer N_SND             ! size of SMP shared memory buffer
-     integer N_RCV             ! size of SMP shared memory buffer
-     integer(8) SND_P          ! SNDBUF address (cray pointer), for real
-     integer(8) RCV_P          ! RCVBUF address (cray pointer), for real
-     integer(8) SND_P_c        ! for complex
-     integer(8) RCV_P_c        ! for complex
-  END TYPE SMP_INFO
-#endif
+      ! in addition to local information, processors also need to know
+      ! some global information for global communications to work
+
+      ! how each dimension is distributed along pencils
+      integer, allocatable, dimension(:) :: &
+         x1dist, y1dist, y2dist, z2dist
+
+      ! send/receive buffer counts and displacements for MPI_ALLTOALLV
+      integer, allocatable, dimension(:) :: &
+         x1cnts, y1cnts, y2cnts, z2cnts
+      integer, allocatable, dimension(:) :: &
+         x1disp, y1disp, y2disp, z2disp
 
-  ! derived type to store decomposition info for a given global data size
-  TYPE, public :: DECOMP_INFO
-     ! staring/ending index and size of data held by current processor
-     integer, dimension(3) :: xst, xen, xsz  ! x-pencil
-     integer, dimension(3) :: yst, yen, ysz  ! y-pencil
-     integer, dimension(3) :: zst, zen, zsz  ! z-pencil
-
-     ! in addition to local information, processors also need to know
-     ! some global information for global communications to work
-
-     ! how each dimension is distributed along pencils
-     integer, allocatable, dimension(:) :: &
-          x1dist, y1dist, y2dist, z2dist
-
-     ! send/receive buffer counts and displacements for MPI_ALLTOALLV
-     integer, allocatable, dimension(:) :: &
-          x1cnts, y1cnts, y2cnts, z2cnts
-     integer, allocatable, dimension(:) :: &
-          x1disp, y1disp, y2disp, z2disp
-
-     ! buffer counts for MPI_ALLTOALL: either for evenly distributed data
-     ! or for padded-alltoall
-     integer :: x1count, y1count, y2count, z2count
-
-     ! evenly distributed data
-     logical :: even
-
-     ! number of halo cells in each direction (index 1,2,3), for each pencil (x,y,z)
-     integer, dimension(3) :: xlevel, ylevel, zlevel
-
-#ifdef SHM
-     ! For shared-memory implementation
-
-     ! one instance of this derived type for each communicator
-     ! shared moemory info, such as which MPI rank belongs to which node
-     TYPE(SMP_INFO) :: ROW_INFO, COL_INFO
-
-     ! shared send/recv buffers for ALLTOALLV
-     integer, allocatable, dimension(:) :: x1cnts_s, y1cnts_s, &
-          y2cnts_s, z2cnts_s
-     integer, allocatable, dimension(:) :: x1disp_s, y1disp_s, &
-          y2disp_s, z2disp_s
-     ! A copy of original buffer displacement (will be overwriten)
-     integer, allocatable, dimension(:) :: x1disp_o, y1disp_o, &
-          y2disp_o, z2disp_o
+#ifdef EVEN
+      ! buffer counts for MPI_ALLTOALL for padded-alltoall
+      integer :: x1count, y1count, y2count, z2count
+      ! evenly distributed data
+      logical :: even
 #endif
-  END TYPE DECOMP_INFO
 
-  ! main (default) decomposition information for global size nx*ny*nz
-  TYPE(DECOMP_INFO), target, save, public :: decomp_main
-  ! FIXME The extra decomp_info objects should be defined in the external code, not here
-  !       Currently keeping them to avoid breaking external codes
-  TYPE(DECOMP_INFO), save, public :: phG,ph1,ph2,ph3,ph4
+      ! number of halo cells in each direction (index 1,2,3), for each pencil (x,y,z)
+      integer, dimension(3) :: xlevel, ylevel, zlevel
 
-  ! staring/ending index and size of data held by current processor
-  ! duplicate 'decomp_main', needed by apps to define data structure
-  integer, save, dimension(3), public :: xstart, xend, xsize  ! x-pencil
-  integer, save, dimension(3), public :: ystart, yend, ysize  ! y-pencil
-  integer, save, dimension(3), public :: zstart, zend, zsize  ! z-pencil
+   END TYPE DECOMP_INFO
 
-  ! These are the buffers used by MPI_ALLTOALL(V) calls
-  integer, save :: decomp_buf_size = 0
-  real(mytype),    allocatable, dimension(:) :: work1_r, work2_r
-  complex(mytype), allocatable, dimension(:) :: work1_c, work2_c
+   ! main (default) decomposition information for global size nx*ny*nz
+   TYPE(DECOMP_INFO), target, save, public :: decomp_main
+   ! FIXME The extra decomp_info objects should be defined in the external code, not here
+   !       Currently keeping them to avoid breaking external codes
+   TYPE(DECOMP_INFO), save, public :: phG, ph1, ph2, ph3, ph4
 
-#if defined(_GPU)
-  real(mytype), allocatable, dimension(:), device :: work1_r_d, work2_r_d
-  complex(mytype), allocatable, dimension(:), device :: work1_c_d, work2_c_d
+   ! staring/ending index and size of data held by current processor
+   ! duplicate 'decomp_main', needed by apps to define data structure
+   integer, save, dimension(3), public :: xstart, xend, xsize  ! x-pencil
+   integer, save, dimension(3), public :: ystart, yend, ysize  ! y-pencil
+   integer, save, dimension(3), public :: zstart, zend, zsize  ! z-pencil
+
+   ! These are the buffers used by MPI_ALLTOALL(V) calls
+   integer, save :: decomp_buf_size = 0
+   ! Shared real/complex buffers
+   real(mytype), target, allocatable, dimension(:) :: work1, work2
+   ! Real/complex pointers to buffers
+   real(mytype), pointer, contiguous, dimension(:) :: work1_r, work2_r
+   complex(mytype), pointer, contiguous, dimension(:) :: work1_c, work2_c
 
-#if defined(_NCCL)
-  integer col_comm_size, row_comm_size
-  integer, allocatable, dimension(:) :: local_to_global_col, local_to_global_row
-  type(ncclUniqueId) :: nccl_uid_2decomp
-  type(ncclComm) :: nccl_comm_2decomp
-  integer(kind=cuda_stream_kind) :: cuda_stream_2decomp
-#endif
-#endif
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! To define smaller arrays using every several mesh points
-  integer, save, dimension(3), public :: xszS,yszS,zszS,xstS,ystS,zstS,xenS,yenS,zenS
-  integer, save, dimension(3), public :: xszV,yszV,zszV,xstV,ystV,zstV,xenV,yenV,zenV
-  integer, save, dimension(3), public :: xszP,yszP,zszP,xstP,ystP,zstP,xenP,yenP,zenP
-  logical, save :: coarse_mesh_starts_from_1
-  integer, save :: iskipS, jskipS, kskipS
-  integer, save :: iskipV, jskipV, kskipV
-  integer, save :: iskipP, jskipP, kskipP
-
-  !
-  ! Profiler section
-  !
-  ! Integer to select the profiling tool
-  !    0 => no profiling, default
-  !    1 => Caliper (https://github.com/LLNL/Caliper)
-  !
-  enum, bind(c)
-    enumerator :: decomp_profiler_none = 0
-    enumerator :: decomp_profiler_caliper = 1
-  end enum
-  integer(kind(decomp_profiler_none)), save, public :: decomp_profiler = decomp_profiler_none
-  ! Default : profile everything
-  logical, save, public :: decomp_profiler_transpose = .true.
-  logical, save, public :: decomp_profiler_io = .true.
-  logical, save, public :: decomp_profiler_fft = .true.
-  logical, save, public :: decomp_profiler_d2d = .true.
-
-  ! public user routines
-  public :: decomp_2d_init, decomp_2d_finalize, &
-       transpose_x_to_y, transpose_y_to_z, &
-       transpose_z_to_y, transpose_y_to_x, &
-       decomp_info_init, decomp_info_finalize, partition, &
-       decomp_info_print, decomp_profiler_prep, &
-       decomp_profiler_start, decomp_profiler_end, &
-       init_coarser_mesh_statS,fine_to_coarseS,&
-       init_coarser_mesh_statV,fine_to_coarseV,&
-       init_coarser_mesh_statP,fine_to_coarseP,&
-       alloc_x, alloc_y, alloc_z, &
-       update_halo, decomp_2d_abort, &
-       decomp_2d_warning, get_decomp_info, &
-       decomp_mpi_comm_free, get_decomp_dims, &
-       exchange_halo_x, exchange_halo_y, exchange_halo_z
+   ! To define smaller arrays using every several mesh points
+   integer, save, dimension(3), public :: xszS, yszS, zszS, xstS, ystS, zstS, xenS, yenS, zenS
+   integer, save, dimension(3), public :: xszV, yszV, zszV, xstV, ystV, zstV, xenV, yenV, zenV
+   integer, save, dimension(3), public :: xszP, yszP, zszP, xstP, ystP, zstP, xenP, yenP, zenP
+   logical, save :: coarse_mesh_starts_from_1
+   integer, save :: iskipS, jskipS, kskipS
+   integer, save :: iskipV, jskipV, kskipV
+   integer, save :: iskipP, jskipP, kskipP
+
+   !
+   ! Profiler section
+   !
+   ! Integer to select the profiling tool
+   !    0 => no profiling, default
+   !    1 => Caliper (https://github.com/LLNL/Caliper)
+   !
+   integer(kind(decomp_profiler_none)), save, public :: decomp_profiler = decomp_profiler_none
+   ! Default : profile everything
+   logical, save, public :: decomp_profiler_transpose = .true.
+   logical, save, public :: decomp_profiler_io = .true.
+   logical, save, public :: decomp_profiler_fft = .true.
+   logical, save, public :: decomp_profiler_d2d = .true.
+
+   ! public user routines
+   public :: decomp_2d_init, decomp_2d_finalize, &
+             transpose_x_to_y, transpose_y_to_z, &
+             transpose_z_to_y, transpose_y_to_x, &
+             decomp_info_init, decomp_info_finalize, partition, &
+             decomp_info_print, decomp_profiler_prep, &
+             decomp_profiler_start, decomp_profiler_end, &
+             init_coarser_mesh_statS, fine_to_coarseS, &
+             init_coarser_mesh_statV, fine_to_coarseV, &
+             init_coarser_mesh_statP, fine_to_coarseP, &
+             alloc_x, alloc_y, alloc_z, &
+             update_halo, &
+             exchange_halo_x, exchange_halo_y, exchange_halo_z, &
+             get_decomp_info, &
+             get_decomp_dims, &
+             d2d_listing_get_unit, d2d_listing_close_unit
 
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! These are routines to perform global data transpositions
-  !
-  !   Four combinations are available, enough to cover all situations
-  !    - transpose_x_to_y (X-pencil --> Y-pencil)
-  !    - transpose_y_to_z (Y-pencil --> Z-pencil)
-  !    - transpose_z_to_y (Z-pencil --> Y-pencil)
-  !    - transpose_y_to_x (Y-pencil --> X-pencil)
-  !
-  !   Generic interface provided here to support multiple data types
-  !    - real and complex types supported through generic interface
-  !    - single/double precision supported through pre-processing
-  !       * see 'mytype' variable at the beginning
-  !    - an optional argument can be supplied to transpose data whose
-  !      global size is not the default nx*ny*nz
-  !       * as the case in fft r2c/c2r interface
+   ! These are routines to perform global data transpositions
+   !
+   !   Four combinations are available, enough to cover all situations
+   !    - transpose_x_to_y (X-pencil --> Y-pencil)
+   !    - transpose_y_to_z (Y-pencil --> Z-pencil)
+   !    - transpose_z_to_y (Z-pencil --> Y-pencil)
+   !    - transpose_y_to_x (Y-pencil --> X-pencil)
+   !
+   !   Generic interface provided here to support multiple data types
+   !    - real and complex types supported through generic interface
+   !    - single/double precision supported through pre-processing
+   !       * see 'mytype' variable at the beginning
+   !    - an optional argument can be supplied to transpose data whose
+   !      global size is not the default nx*ny*nz
+   !       * as the case in fft r2c/c2r interface
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 
-  interface decomp_2d_init
-     module procedure decomp_2d_init_ref
-  end interface decomp_2d_init
-
-  interface decomp_2d_finalize
-     module procedure decomp_2d_finalize_ref
-  end interface decomp_2d_finalize
-
-  interface transpose_x_to_y
-     module procedure transpose_x_to_y_real
-     module procedure transpose_x_to_y_real_short
-     module procedure transpose_x_to_y_complex
-     module procedure transpose_x_to_y_complex_short
-  end interface transpose_x_to_y
-
-  interface transpose_y_to_z
-     module procedure transpose_y_to_z_real
-     module procedure transpose_y_to_z_real_short
-     module procedure transpose_y_to_z_complex
-     module procedure transpose_y_to_z_complex_short
-  end interface transpose_y_to_z
-
-  interface transpose_z_to_y
-     module procedure transpose_z_to_y_real
-     module procedure transpose_z_to_y_real_short
-     module procedure transpose_z_to_y_complex
-     module procedure transpose_z_to_y_complex_short
-  end interface transpose_z_to_y
-
-  interface transpose_y_to_x
-     module procedure transpose_y_to_x_real
-     module procedure transpose_y_to_x_real_short
-     module procedure transpose_y_to_x_complex
-     module procedure transpose_y_to_x_complex_short
-  end interface transpose_y_to_x
-
-  interface update_halo
-     module procedure update_halo_real
-     module procedure update_halo_real_short
-     module procedure update_halo_complex
-     module procedure update_halo_complex_short
-  end interface update_halo
-
-  interface exchange_halo_x
-     module procedure exchange_halo_x_real
-     module procedure exchange_halo_x_complex
-  end interface exchange_halo_x
-
-  interface exchange_halo_y
+   interface decomp_2d_init
+      module procedure decomp_2d_init_ref
+   end interface decomp_2d_init
+
+   interface decomp_2d_finalize
+      module procedure decomp_2d_finalize_ref
+   end interface decomp_2d_finalize
+
+   interface transpose_x_to_y
+      module procedure transpose_x_to_y_real_long
+      module procedure transpose_x_to_y_real_short
+      module procedure transpose_x_to_y_complex_long
+      module procedure transpose_x_to_y_complex_short
+   end interface transpose_x_to_y
+
+   interface transpose_y_to_z
+      module procedure transpose_y_to_z_real_long
+      module procedure transpose_y_to_z_real_short
+      module procedure transpose_y_to_z_complex_long
+      module procedure transpose_y_to_z_complex_short
+   end interface transpose_y_to_z
+
+   interface transpose_z_to_y
+      module procedure transpose_z_to_y_real_long
+      module procedure transpose_z_to_y_real_short
+      module procedure transpose_z_to_y_complex_long
+      module procedure transpose_z_to_y_complex_short
+   end interface transpose_z_to_y
+
+   interface transpose_y_to_x
+      module procedure transpose_y_to_x_real_long
+      module procedure transpose_y_to_x_real_short
+      module procedure transpose_y_to_x_complex_long
+      module procedure transpose_y_to_x_complex_short
+   end interface transpose_y_to_x
+
+   interface exchange_halo_x
+      module procedure exchange_halo_x_real
+      module procedure exchange_halo_x_complex
+      module procedure exchange_halo_x_real_short
+      module procedure exchange_halo_x_complex_short
+   end interface exchange_halo_x
+
+   interface exchange_halo_y
       module procedure exchange_halo_y_real
       module procedure exchange_halo_y_complex
-  end interface exchange_halo_y
+      module procedure exchange_halo_y_real_short
+      module procedure exchange_halo_y_complex_short
+   end interface exchange_halo_y
 
-  interface exchange_halo_z
+   interface exchange_halo_z
       module procedure exchange_halo_z_real
       module procedure exchange_halo_z_complex
-  end interface exchange_halo_z
-
-  interface alloc_x
-     module procedure alloc_x_real
-     module procedure alloc_x_real_short
-     module procedure alloc_x_complex
-     module procedure alloc_x_complex_short
-  end interface alloc_x
-
-  interface alloc_y
-     module procedure alloc_y_real
-     module procedure alloc_y_real_short
-     module procedure alloc_y_complex
-     module procedure alloc_y_complex_short
-  end interface alloc_y
-
-  interface alloc_z
-     module procedure alloc_z_real
-     module procedure alloc_z_real_short
-     module procedure alloc_z_complex
-     module procedure alloc_z_complex_short
-  end interface alloc_z
-
-  interface decomp_2d_abort
-     module procedure decomp_2d_abort_basic
-     module procedure decomp_2d_abort_file_line
-#if defined(_GPU) && defined(_NCCL)
-     module procedure decomp_2d_abort_nccl_basic
-     module procedure decomp_2d_abort_nccl_file_line
-#endif
-  end interface decomp_2d_abort
-
-  interface decomp_2d_warning
-     module procedure decomp_2d_warning_basic
-     module procedure decomp_2d_warning_file_line
-  end interface decomp_2d_warning
-
-  interface
+      module procedure exchange_halo_z_real_short
+      module procedure exchange_halo_z_complex_short
+   end interface exchange_halo_z
+
+   interface update_halo
+      module procedure update_halo_real
+      module procedure update_halo_real_short
+      module procedure update_halo_complex
+      module procedure update_halo_complex_short
+   end interface update_halo
+
+   interface alloc_x
+      module procedure alloc_x_real
+      module procedure alloc_x_real_short
+      module procedure alloc_x_complex
+      module procedure alloc_x_complex_short
+   end interface alloc_x
+
+   interface alloc_y
+      module procedure alloc_y_real
+      module procedure alloc_y_real_short
+      module procedure alloc_y_complex
+      module procedure alloc_y_complex_short
+   end interface alloc_y
+
+   interface alloc_z
+      module procedure alloc_z_real
+      module procedure alloc_z_real_short
+      module procedure alloc_z_complex
+      module procedure alloc_z_complex_short
+   end interface alloc_z
+
+   interface
+
+      module function d2d_listing_get_unit()
+         integer :: d2d_listing_get_unit
+      end function d2d_listing_get_unit
+
+      module subroutine d2d_listing_close_unit(io_unit)
+         integer, intent(in) :: io_unit
+      end subroutine d2d_listing_close_unit
 
-     module subroutine d2d_listing(given_io_unit)
-        integer, intent(in), optional :: given_io_unit
-     end subroutine d2d_listing
+      module subroutine d2d_listing(given_io_unit)
+         integer, intent(in), optional :: given_io_unit
+      end subroutine d2d_listing
 
-     module subroutine decomp_info_print(d2d, io_unit, d2dname)
-        type(decomp_info), intent(in) :: d2d
-        integer, intent(in) :: io_unit
-        character(len=*), intent(in) :: d2dname
-     end subroutine decomp_info_print
+      module subroutine decomp_info_print(d2d, io_unit, d2dname)
+         type(decomp_info), intent(in) :: d2d
+         integer, intent(in) :: io_unit
+         character(len=*), intent(in) :: d2dname
+      end subroutine decomp_info_print
 
-  end interface
+   end interface
 
    ! Generic interface to initialize the profiler
    interface decomp_profiler_init
@@ -414,1230 +332,822 @@ end subroutine decomp_profiler_end_char
 
 #include "decomp_2d_init_fin.f90"
 
-  !
-  ! Small wrapper to free a MPI communicator
-  !
-  subroutine decomp_mpi_comm_free(mpi_comm)
-
-    implicit none
-
-    integer, intent(inout) :: mpi_comm
-    integer :: ierror
-
-    ! Return if no MPI comm to free
-    if (mpi_comm == MPI_COMM_NULL) return
-
-    ! Free the provided MPI communicator
-    call MPI_COMM_FREE(mpi_comm, ierror)
-    if (ierror /= 0) call decomp_2d_warning(__FILE__, __LINE__, ierror, "MPI_COMM_FREE")
-    mpi_comm = MPI_COMM_NULL
-
-  end subroutine decomp_mpi_comm_free
-
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Return the default decomposition object
+   ! Return the default decomposition object
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 
-  ! FIXME avoid a copy and return a pointer to decomp_main
-  ! TODO list the external codes using this subroutine
-  subroutine get_decomp_info(decomp)
+   ! FIXME avoid a copy and return a pointer to decomp_main
+   ! TODO list the external codes using this subroutine
+   subroutine get_decomp_info(decomp)
 
-    implicit none
+      implicit none
 
-    ! FIXME TYPE(DECOMP_INFO), pointer :: decomp
-    TYPE(DECOMP_INFO), intent(OUT) :: decomp
+      ! FIXME TYPE(DECOMP_INFO), pointer :: decomp
+      TYPE(DECOMP_INFO), intent(OUT) :: decomp
 
-    ! FIXME decomp => decomp_main
-    decomp = decomp_main
+      ! FIXME decomp => decomp_main
+      decomp = decomp_main
 
-    return
-  end subroutine get_decomp_info
+      return
+   end subroutine get_decomp_info
 
-  !
-  ! Return the 2D processor grid
-  !
-  function get_decomp_dims()
+   !
+   ! Return the 2D processor grid
+   !
+   function get_decomp_dims()
 
-    implicit none
+      implicit none
 
-    integer, dimension(2) :: get_decomp_dims
+      integer, dimension(2) :: get_decomp_dims
 
-    get_decomp_dims = dims
+      get_decomp_dims = dims
 
-  end function get_decomp_dims
+   end function get_decomp_dims
 
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Advanced Interface allowing applications to define globle domain of
-  ! any size, distribute it, and then transpose data among pencils.
-  !  - generate 2D decomposition details as defined in DECOMP_INFO
-  !  - the default global data size is nx*ny*nz
-  !  - a different global size nx/2+1,ny,nz is used in FFT r2c/c2r
-  !  - multiple global sizes can co-exist in one application, each
-  !    using its own DECOMP_INFO object
+   ! Advanced Interface allowing applications to define globle domain of
+   ! any size, distribute it, and then transpose data among pencils.
+   !  - generate 2D decomposition details as defined in DECOMP_INFO
+   !  - the default global data size is nx*ny*nz
+   !  - a different global size nx/2+1,ny,nz is used in FFT r2c/c2r
+   !  - multiple global sizes can co-exist in one application, each
+   !    using its own DECOMP_INFO object
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine decomp_info_init(nx,ny,nz,decomp)
-
-    implicit none
-
-    integer, intent(IN) :: nx,ny,nz
-    TYPE(DECOMP_INFO), intent(INOUT) :: decomp
-
-    integer :: buf_size, status, errorcode
-
-    ! verify the global size can actually be distributed as pencils
-    if (nx_global<dims(1) .or. ny_global<dims(1) .or. ny_global<dims(2) .or. nz_global<dims(2)) then
-       errorcode = 6
-       call decomp_2d_abort(__FILE__, __LINE__, errorcode, &
-            'Invalid 2D processor grid. ' // &
-            'Make sure that min(nx,ny) >= p_row and ' // &
-            'min(ny,nz) >= p_col')
-    end if
-
-    if (mod(nx,dims(1))==0 .and. mod(ny,dims(1))==0 .and. &
-         mod(ny,dims(2))==0 .and. mod(nz,dims(2))==0) then
-       decomp%even = .true.
-    else
-       decomp%even = .false.
-    end if
-
-    ! distribute mesh points
-    allocate(decomp%x1dist(0:dims(1)-1),decomp%y1dist(0:dims(1)-1), &
-         decomp%y2dist(0:dims(2)-1),decomp%z2dist(0:dims(2)-1))
-    call get_dist(nx,ny,nz,decomp)
-
-    ! generate partition information - starting/ending index etc.
-    call partition(nx, ny, nz, (/ 1,2,3 /), &
-         decomp%xst, decomp%xen, decomp%xsz)
-    call partition(nx, ny, nz, (/ 2,1,3 /), &
-         decomp%yst, decomp%yen, decomp%ysz)
-    call partition(nx, ny, nz, (/ 2,3,1 /), &
-         decomp%zst, decomp%zen, decomp%zsz)
-
-    ! set halo levels to zero by default
-    decomp%xlevel = 0
-    decomp%ylevel = 0
-    decomp%zlevel = 0
-
-    ! prepare send/receive buffer displacement and count for ALLTOALL(V)
-    allocate(decomp%x1cnts(0:dims(1)-1),decomp%y1cnts(0:dims(1)-1), &
-         decomp%y2cnts(0:dims(2)-1),decomp%z2cnts(0:dims(2)-1))
-    allocate(decomp%x1disp(0:dims(1)-1),decomp%y1disp(0:dims(1)-1), &
-         decomp%y2disp(0:dims(2)-1),decomp%z2disp(0:dims(2)-1))
-    call prepare_buffer(decomp)
-
-#ifdef SHM
-    ! prepare shared-memory information if required
-    call decomp_info_init_shm(decomp)
-#endif
+   subroutine decomp_info_init(nx, ny, nz, decomp)
+
+      use, intrinsic:: iso_c_binding, only: c_f_pointer, c_loc
+
+      implicit none
+
+      integer, intent(IN) :: nx, ny, nz
+      TYPE(DECOMP_INFO), intent(INOUT) :: decomp
+
+      integer :: buf_size, status, errorcode
+
+      ! verify the global size can actually be distributed as pencils
+      if (nx_global < dims(1) .or. ny_global < dims(1) .or. ny_global < dims(2) .or. nz_global < dims(2)) then
+         errorcode = 6
+         call decomp_2d_abort(__FILE__, __LINE__, errorcode, &
+                              'Invalid 2D processor grid. '// &
+                              'Make sure that min(nx,ny) >= p_row and '// &
+                              'min(ny,nz) >= p_col')
+      end if
+
+      ! distribute mesh points
+      allocate (decomp%x1dist(0:dims(1) - 1), decomp%y1dist(0:dims(1) - 1), &
+                decomp%y2dist(0:dims(2) - 1), decomp%z2dist(0:dims(2) - 1))
+      call get_dist(nx, ny, nz, decomp)
+
+      ! generate partition information - starting/ending index etc.
+      call partition(nx, ny, nz, (/1, 2, 3/), &
+                     decomp%xst, decomp%xen, decomp%xsz)
+      call partition(nx, ny, nz, (/2, 1, 3/), &
+                     decomp%yst, decomp%yen, decomp%ysz)
+      call partition(nx, ny, nz, (/2, 3, 1/), &
+                     decomp%zst, decomp%zen, decomp%zsz)
+
+      ! set halo levels to zero by default
+      decomp%xlevel = 0
+      decomp%ylevel = 0
+      decomp%zlevel = 0
+
+      ! prepare send/receive buffer displacement and count for ALLTOALL(V)
+      allocate (decomp%x1cnts(0:dims(1) - 1), decomp%y1cnts(0:dims(1) - 1), &
+                decomp%y2cnts(0:dims(2) - 1), decomp%z2cnts(0:dims(2) - 1))
+      allocate (decomp%x1disp(0:dims(1) - 1), decomp%y1disp(0:dims(1) - 1), &
+                decomp%y2disp(0:dims(2) - 1), decomp%z2disp(0:dims(2) - 1))
+      call prepare_buffer(decomp)
+
+      ! allocate memory for the MPI_ALLTOALL(V) buffers
+      ! define the buffers globally for performance reason
+
+      buf_size = max(decomp%xsz(1) * decomp%xsz(2) * decomp%xsz(3), &
+                     max(decomp%ysz(1) * decomp%ysz(2) * decomp%ysz(3), &
+                         decomp%zsz(1) * decomp%zsz(2) * decomp%zsz(3)))
 
-    ! allocate memory for the MPI_ALLTOALL(V) buffers
-    ! define the buffers globally for performance reason
-
-    buf_size = max(decomp%xsz(1)*decomp%xsz(2)*decomp%xsz(3), &
-         max(decomp%ysz(1)*decomp%ysz(2)*decomp%ysz(3), &
-         decomp%zsz(1)*decomp%zsz(2)*decomp%zsz(3)) )
 #ifdef EVEN
-    ! padded alltoall optimisation may need larger buffer space
-    buf_size = max(buf_size, &
-         max(decomp%x1count*dims(1),decomp%y2count*dims(2)) )
+      ! padded alltoall optimisation may need larger buffer space
+      buf_size = max(buf_size, &
+                     max(decomp%x1count * dims(1), decomp%y2count * dims(2)))
+      ! evenly distributed data ?
+      if (mod(nx, dims(1)) == 0 .and. mod(ny, dims(1)) == 0 .and. &
+          mod(ny, dims(2)) == 0 .and. mod(nz, dims(2)) == 0) then
+         decomp%even = .true.
+      else
+         decomp%even = .false.
+      end if
 #endif
 
-    ! check if additional memory is required
-    ! *** TODO: consider how to share the real/complex buffers
-    if (buf_size > decomp_buf_size) then
-       decomp_buf_size = buf_size
+      ! check if additional memory is required
+      if (buf_size > decomp_buf_size) then
+         decomp_buf_size = buf_size
 #if defined(_GPU)
-       if (allocated(work1_r_d)) deallocate(work1_r_d)
-       if (allocated(work2_r_d)) deallocate(work2_r_d)
-       if (allocated(work1_c_d)) deallocate(work1_c_d)
-       if (allocated(work2_c_d)) deallocate(work2_c_d)
-       allocate(work1_r_d(buf_size), STAT=status)
-       if (status /= 0) then
-          errorcode = 2
-          call decomp_2d_abort(__FILE__, __LINE__, errorcode, &
-               'Out of memory when allocating 2DECOMP workspace')
-       end if
-       allocate(work1_c_d(buf_size), STAT=status)
-       if (status /= 0) then
-          errorcode = 2
-          call decomp_2d_abort(__FILE__, __LINE__, errorcode, &
-               'Out of memory when allocating 2DECOMP workspace')
-       end if
-       allocate(work2_r_d(buf_size), STAT=status)
-       if (status /= 0) then
-          errorcode = 2
-          call decomp_2d_abort(__FILE__, __LINE__, errorcode, &
-               'Out of memory when allocating 2DECOMP workspace')
-       end if
-       allocate(work2_c_d(buf_size), STAT=status)
-       if (status /= 0) then
-          errorcode = 2
-          call decomp_2d_abort(__FILE__, __LINE__, errorcode, &
-               'Out of memory when allocating 2DECOMP workspace')
-       end if
+         call decomp_2d_cumpi_init(buf_size)
 #endif
-       if (allocated(work1_r)) deallocate(work1_r)
-       if (allocated(work2_r)) deallocate(work2_r)
-       if (allocated(work1_c)) deallocate(work1_c)
-       if (allocated(work2_c)) deallocate(work2_c)
-       allocate(work1_r(buf_size), STAT=status)
-       if (status /= 0) then
-          errorcode = 2
-          call decomp_2d_abort(__FILE__, __LINE__, errorcode, &
-               'Out of memory when allocating 2DECOMP workspace')
-       end if
-       allocate(work2_r(buf_size), STAT=status)
-       if (status /= 0) then
-          errorcode = 2
-          call decomp_2d_abort(__FILE__, __LINE__, errorcode, &
-               'Out of memory when allocating 2DECOMP workspace')
-       end if
-       allocate(work1_c(buf_size), STAT=status)
-       if (status /= 0) then
-          errorcode = 2
-          call decomp_2d_abort(__FILE__, __LINE__, errorcode, &
-               'Out of memory when allocating 2DECOMP workspace')
-       end if
-       allocate(work2_c(buf_size), STAT=status)
-       if (status /= 0) then
-          errorcode = 2
-          call decomp_2d_abort(__FILE__, __LINE__, errorcode, &
-               'Out of memory when allocating 2DECOMP workspace')
-       end if
-    end if
-
-    return
-  end subroutine decomp_info_init
-
+         if (associated(work1_r)) nullify (work1_r)
+         if (associated(work2_r)) nullify (work2_r)
+         if (associated(work1_c)) nullify (work1_c)
+         if (associated(work2_c)) nullify (work2_c)
+         if (allocated(work1)) deallocate (work1)
+         if (allocated(work2)) deallocate (work2)
+         allocate (work1(2 * buf_size), STAT=status)
+         if (status /= 0) then
+            errorcode = 2
+            call decomp_2d_abort(__FILE__, __LINE__, errorcode, &
+                                 'Out of memory when allocating 2DECOMP workspace')
+         end if
+         allocate (work2(2 * buf_size), STAT=status)
+         if (status /= 0) then
+            errorcode = 2
+            call decomp_2d_abort(__FILE__, __LINE__, errorcode, &
+                                 'Out of memory when allocating 2DECOMP workspace')
+         end if
+         call c_f_pointer(c_loc(work1), work1_r, [buf_size])
+         call c_f_pointer(c_loc(work2), work2_r, [buf_size])
+         call c_f_pointer(c_loc(work1), work1_c, [buf_size])
+         call c_f_pointer(c_loc(work2), work2_c, [buf_size])
+      end if
+
+   end subroutine decomp_info_init
 
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Release memory associated with a DECOMP_INFO object
+   ! Release memory associated with a DECOMP_INFO object
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine decomp_info_finalize(decomp)
-
-    implicit none
-
-    TYPE(DECOMP_INFO), intent(INOUT) :: decomp
-
-    if (allocated(decomp%x1dist)) deallocate(decomp%x1dist)
-    if (allocated(decomp%y1dist)) deallocate(decomp%y1dist)
-    if (allocated(decomp%y2dist)) deallocate(decomp%y2dist)
-    if (allocated(decomp%z2dist)) deallocate(decomp%z2dist)
-    if (allocated(decomp%x1cnts)) deallocate(decomp%x1cnts)
-    if (allocated(decomp%y1cnts)) deallocate(decomp%y1cnts)
-    if (allocated(decomp%y2cnts)) deallocate(decomp%y2cnts)
-    if (allocated(decomp%z2cnts)) deallocate(decomp%z2cnts)
-    if (allocated(decomp%x1disp)) deallocate(decomp%x1disp)
-    if (allocated(decomp%y1disp)) deallocate(decomp%y1disp)
-    if (allocated(decomp%y2disp)) deallocate(decomp%y2disp)
-    if (allocated(decomp%z2disp)) deallocate(decomp%z2disp)
-
-#ifdef SHM
-    if (allocated(decomp%x1disp_o)) deallocate(decomp%x1disp_o)
-    if (allocated(decomp%y1disp_o)) deallocate(decomp%y1disp_o)
-    if (allocated(decomp%y2disp_o)) deallocate(decomp%y2disp_o)
-    if (allocated(decomp%z2disp_o)) deallocate(decomp%z2disp_o)
-    if (allocated(decomp%x1cnts_s)) deallocate(decomp%x1cnts_s)
-    if (allocated(decomp%y1cnts_s)) deallocate(decomp%y1cnts_s)
-    if (allocated(decomp%y2cnts_s)) deallocate(decomp%y2cnts_s)
-    if (allocated(decomp%z2cnts_s)) deallocate(decomp%z2cnts_s)
-    if (allocated(decomp%x1disp_s)) deallocate(decomp%x1disp_s)
-    if (allocated(decomp%y1disp_s)) deallocate(decomp%y1disp_s)
-    if (allocated(decomp%y2disp_s)) deallocate(decomp%y2disp_s)
-    if (allocated(decomp%z2disp_s)) deallocate(decomp%z2disp_s)
-#endif
+   subroutine decomp_info_finalize(decomp)
+
+      implicit none
 
-    return
-  end subroutine decomp_info_finalize
+      TYPE(DECOMP_INFO), intent(INOUT) :: decomp
 
+      if (allocated(decomp%x1dist)) deallocate (decomp%x1dist)
+      if (allocated(decomp%y1dist)) deallocate (decomp%y1dist)
+      if (allocated(decomp%y2dist)) deallocate (decomp%y2dist)
+      if (allocated(decomp%z2dist)) deallocate (decomp%z2dist)
+      if (allocated(decomp%x1cnts)) deallocate (decomp%x1cnts)
+      if (allocated(decomp%y1cnts)) deallocate (decomp%y1cnts)
+      if (allocated(decomp%y2cnts)) deallocate (decomp%y2cnts)
+      if (allocated(decomp%z2cnts)) deallocate (decomp%z2cnts)
+      if (allocated(decomp%x1disp)) deallocate (decomp%x1disp)
+      if (allocated(decomp%y1disp)) deallocate (decomp%y1disp)
+      if (allocated(decomp%y2disp)) deallocate (decomp%y2disp)
+      if (allocated(decomp%z2disp)) deallocate (decomp%z2disp)
+
+      return
+   end subroutine decomp_info_finalize
 
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Coarser mesh support for statistic
+   ! Coarser mesh support for statistic
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine init_coarser_mesh_statS(i_skip,j_skip,k_skip,from1)
-
-    implicit none
-
-    integer, intent(IN) :: i_skip,j_skip,k_skip
-    logical, intent(IN) :: from1  ! .true.  - save 1,n+1,2n+1...
-    ! .false. - save n,2n,3n...
-
-    integer, dimension(3) :: skip
-    integer :: i
-
-    coarse_mesh_starts_from_1 = from1
-    iskipS = i_skip
-    jskipS = j_skip
-    kskipS = k_skip
-
-    skip(1)=iskipS
-    skip(2)=jskipS
-    skip(3)=kskipS
-
-    do i=1,3
-       if (from1) then
-          xstS(i) = (xstart(i)+skip(i)-1)/skip(i)
-          if (mod(xstart(i)+skip(i)-1,skip(i))/=0) xstS(i)=xstS(i)+1
-          xenS(i) = (xend(i)+skip(i)-1)/skip(i)
-       else
-          xstS(i) = xstart(i)/skip(i)
-          if (mod(xstart(i),skip(i))/=0) xstS(i)=xstS(i)+1
-          xenS(i) = xend(i)/skip(i)
-       end if
-       xszS(i) = xenS(i)-xstS(i)+1
-    end do
-
-    do i=1,3
-       if (from1) then
-          ystS(i) = (ystart(i)+skip(i)-1)/skip(i)
-          if (mod(ystart(i)+skip(i)-1,skip(i))/=0) ystS(i)=ystS(i)+1
-          yenS(i) = (yend(i)+skip(i)-1)/skip(i)
-       else
-          ystS(i) = ystart(i)/skip(i)
-          if (mod(ystart(i),skip(i))/=0) ystS(i)=ystS(i)+1
-          yenS(i) = yend(i)/skip(i)
-       end if
-       yszS(i) = yenS(i)-ystS(i)+1
-    end do
-
-    do i=1,3
-       if (from1) then
-          zstS(i) = (zstart(i)+skip(i)-1)/skip(i)
-          if (mod(zstart(i)+skip(i)-1,skip(i))/=0) zstS(i)=zstS(i)+1
-          zenS(i) = (zend(i)+skip(i)-1)/skip(i)
-       else
-          zstS(i) = zstart(i)/skip(i)
-          if (mod(zstart(i),skip(i))/=0) zstS(i)=zstS(i)+1
-          zenS(i) = zend(i)/skip(i)
-       end if
-       zszS(i) = zenS(i)-zstS(i)+1
-    end do
-
-    return
-  end subroutine init_coarser_mesh_statS
+   subroutine init_coarser_mesh_statS(i_skip, j_skip, k_skip, from1)
+
+      implicit none
+
+      integer, intent(IN) :: i_skip, j_skip, k_skip
+      logical, intent(IN) :: from1  ! .true.  - save 1,n+1,2n+1...
+      ! .false. - save n,2n,3n...
+
+      integer, dimension(3) :: skip
+      integer :: i
+
+      coarse_mesh_starts_from_1 = from1
+      iskipS = i_skip
+      jskipS = j_skip
+      kskipS = k_skip
+
+      skip(1) = iskipS
+      skip(2) = jskipS
+      skip(3) = kskipS
+
+      do i = 1, 3
+         if (from1) then
+            xstS(i) = (xstart(i) + skip(i) - 1) / skip(i)
+            if (mod(xstart(i) + skip(i) - 1, skip(i)) /= 0) xstS(i) = xstS(i) + 1
+            xenS(i) = (xend(i) + skip(i) - 1) / skip(i)
+         else
+            xstS(i) = xstart(i) / skip(i)
+            if (mod(xstart(i), skip(i)) /= 0) xstS(i) = xstS(i) + 1
+            xenS(i) = xend(i) / skip(i)
+         end if
+         xszS(i) = xenS(i) - xstS(i) + 1
+      end do
+
+      do i = 1, 3
+         if (from1) then
+            ystS(i) = (ystart(i) + skip(i) - 1) / skip(i)
+            if (mod(ystart(i) + skip(i) - 1, skip(i)) /= 0) ystS(i) = ystS(i) + 1
+            yenS(i) = (yend(i) + skip(i) - 1) / skip(i)
+         else
+            ystS(i) = ystart(i) / skip(i)
+            if (mod(ystart(i), skip(i)) /= 0) ystS(i) = ystS(i) + 1
+            yenS(i) = yend(i) / skip(i)
+         end if
+         yszS(i) = yenS(i) - ystS(i) + 1
+      end do
+
+      do i = 1, 3
+         if (from1) then
+            zstS(i) = (zstart(i) + skip(i) - 1) / skip(i)
+            if (mod(zstart(i) + skip(i) - 1, skip(i)) /= 0) zstS(i) = zstS(i) + 1
+            zenS(i) = (zend(i) + skip(i) - 1) / skip(i)
+         else
+            zstS(i) = zstart(i) / skip(i)
+            if (mod(zstart(i), skip(i)) /= 0) zstS(i) = zstS(i) + 1
+            zenS(i) = zend(i) / skip(i)
+         end if
+         zszS(i) = zenS(i) - zstS(i) + 1
+      end do
+
+      return
+   end subroutine init_coarser_mesh_statS
 
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Coarser mesh support for visualization
+   ! Coarser mesh support for visualization
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine init_coarser_mesh_statV(i_skip,j_skip,k_skip,from1)
-
-    implicit none
-
-    integer, intent(IN) :: i_skip,j_skip,k_skip
-    logical, intent(IN) :: from1  ! .true.  - save 1,n+1,2n+1...
-    ! .false. - save n,2n,3n...
-
-    integer, dimension(3) :: skip
-    integer :: i
-
-    coarse_mesh_starts_from_1 = from1
-    iskipV = i_skip
-    jskipV = j_skip
-    kskipV = k_skip
-
-    skip(1)=iskipV
-    skip(2)=jskipV
-    skip(3)=kskipV
-
-    do i=1,3
-       if (from1) then
-          xstV(i) = (xstart(i)+skip(i)-1)/skip(i)
-          if (mod(xstart(i)+skip(i)-1,skip(i))/=0) xstV(i)=xstV(i)+1
-          xenV(i) = (xend(i)+skip(i)-1)/skip(i)
-       else
-          xstV(i) = xstart(i)/skip(i)
-          if (mod(xstart(i),skip(i))/=0) xstV(i)=xstV(i)+1
-          xenV(i) = xend(i)/skip(i)
-       end if
-       xszV(i) = xenV(i)-xstV(i)+1
-    end do
-
-    do i=1,3
-       if (from1) then
-          ystV(i) = (ystart(i)+skip(i)-1)/skip(i)
-          if (mod(ystart(i)+skip(i)-1,skip(i))/=0) ystV(i)=ystV(i)+1
-          yenV(i) = (yend(i)+skip(i)-1)/skip(i)
-       else
-          ystV(i) = ystart(i)/skip(i)
-          if (mod(ystart(i),skip(i))/=0) ystV(i)=ystV(i)+1
-          yenV(i) = yend(i)/skip(i)
-       end if
-       yszV(i) = yenV(i)-ystV(i)+1
-    end do
-
-    do i=1,3
-       if (from1) then
-          zstV(i) = (zstart(i)+skip(i)-1)/skip(i)
-          if (mod(zstart(i)+skip(i)-1,skip(i))/=0) zstV(i)=zstV(i)+1
-          zenV(i) = (zend(i)+skip(i)-1)/skip(i)
-       else
-          zstV(i) = zstart(i)/skip(i)
-          if (mod(zstart(i),skip(i))/=0) zstV(i)=zstV(i)+1
-          zenV(i) = zend(i)/skip(i)
-       end if
-       zszV(i) = zenV(i)-zstV(i)+1
-    end do
-
-    return
-  end subroutine init_coarser_mesh_statV
+   subroutine init_coarser_mesh_statV(i_skip, j_skip, k_skip, from1)
+
+      implicit none
+
+      integer, intent(IN) :: i_skip, j_skip, k_skip
+      logical, intent(IN) :: from1  ! .true.  - save 1,n+1,2n+1...
+      ! .false. - save n,2n,3n...
+
+      integer, dimension(3) :: skip
+      integer :: i
+
+      coarse_mesh_starts_from_1 = from1
+      iskipV = i_skip
+      jskipV = j_skip
+      kskipV = k_skip
+
+      skip(1) = iskipV
+      skip(2) = jskipV
+      skip(3) = kskipV
+
+      do i = 1, 3
+         if (from1) then
+            xstV(i) = (xstart(i) + skip(i) - 1) / skip(i)
+            if (mod(xstart(i) + skip(i) - 1, skip(i)) /= 0) xstV(i) = xstV(i) + 1
+            xenV(i) = (xend(i) + skip(i) - 1) / skip(i)
+         else
+            xstV(i) = xstart(i) / skip(i)
+            if (mod(xstart(i), skip(i)) /= 0) xstV(i) = xstV(i) + 1
+            xenV(i) = xend(i) / skip(i)
+         end if
+         xszV(i) = xenV(i) - xstV(i) + 1
+      end do
+
+      do i = 1, 3
+         if (from1) then
+            ystV(i) = (ystart(i) + skip(i) - 1) / skip(i)
+            if (mod(ystart(i) + skip(i) - 1, skip(i)) /= 0) ystV(i) = ystV(i) + 1
+            yenV(i) = (yend(i) + skip(i) - 1) / skip(i)
+         else
+            ystV(i) = ystart(i) / skip(i)
+            if (mod(ystart(i), skip(i)) /= 0) ystV(i) = ystV(i) + 1
+            yenV(i) = yend(i) / skip(i)
+         end if
+         yszV(i) = yenV(i) - ystV(i) + 1
+      end do
+
+      do i = 1, 3
+         if (from1) then
+            zstV(i) = (zstart(i) + skip(i) - 1) / skip(i)
+            if (mod(zstart(i) + skip(i) - 1, skip(i)) /= 0) zstV(i) = zstV(i) + 1
+            zenV(i) = (zend(i) + skip(i) - 1) / skip(i)
+         else
+            zstV(i) = zstart(i) / skip(i)
+            if (mod(zstart(i), skip(i)) /= 0) zstV(i) = zstV(i) + 1
+            zenV(i) = zend(i) / skip(i)
+         end if
+         zszV(i) = zenV(i) - zstV(i) + 1
+      end do
+
+      return
+   end subroutine init_coarser_mesh_statV
 
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Coarser mesh support for probe
+   ! Coarser mesh support for probe
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine init_coarser_mesh_statP(i_skip,j_skip,k_skip,from1)
-
-    implicit none
-
-    integer, intent(IN) :: i_skip,j_skip,k_skip
-    logical, intent(IN) :: from1  ! .true.  - save 1,n+1,2n+1...
-    ! .false. - save n,2n,3n...
-
-    integer, dimension(3) :: skip
-    integer :: i
-
-    coarse_mesh_starts_from_1 = from1
-    iskipP = i_skip
-    jskipP = j_skip
-    kskipP = k_skip
-
-    skip(1)=iskipP
-    skip(2)=jskipP
-    skip(3)=kskipP
-
-    do i=1,3
-       if (from1) then
-          xstP(i) = (xstart(i)+skip(i)-1)/skip(i)
-          if (mod(xstart(i)+skip(i)-1,skip(i))/=0) xstP(i)=xstP(i)+1
-          xenP(i) = (xend(i)+skip(i)-1)/skip(i)
-       else
-          xstP(i) = xstart(i)/skip(i)
-          if (mod(xstart(i),skip(i))/=0) xstP(i)=xstP(i)+1
-          xenP(i) = xend(i)/skip(i)
-       end if
-       xszP(i) = xenP(i)-xstP(i)+1
-    end do
-
-    do i=1,3
-       if (from1) then
-          ystP(i) = (ystart(i)+skip(i)-1)/skip(i)
-          if (mod(ystart(i)+skip(i)-1,skip(i))/=0) ystP(i)=ystP(i)+1
-          yenP(i) = (yend(i)+skip(i)-1)/skip(i)
-       else
-          ystP(i) = ystart(i)/skip(i)
-          if (mod(ystart(i),skip(i))/=0) ystP(i)=ystP(i)+1
-          yenP(i) = yend(i)/skip(i)
-       end if
-       yszP(i) = yenP(i)-ystP(i)+1
-    end do
-
-    do i=1,3
-       if (from1) then
-          zstP(i) = (zstart(i)+skip(i)-1)/skip(i)
-          if (mod(zstart(i)+skip(i)-1,skip(i))/=0) zstP(i)=zstP(i)+1
-          zenP(i) = (zend(i)+skip(i)-1)/skip(i)
-       else
-          zstP(i) = zstart(i)/skip(i)
-          if (mod(zstart(i),skip(i))/=0) zstP(i)=zstP(i)+1
-          zenP(i) = zend(i)/skip(i)
-       end if
-       zszP(i) = zenP(i)-zstP(i)+1
-    end do
-
-    return
-  end subroutine init_coarser_mesh_statP
-
-  ! Copy data from a fine-resolution array to a coarse one for statistic
-  subroutine fine_to_coarseS(ipencil,var_fine,var_coarse)
-
-    implicit none
-
-    real(mytype), dimension(:,:,:) :: var_fine
-    real(mytype), dimension(:,:,:) :: var_coarse
-    integer, intent(IN) :: ipencil
-
-    real(mytype), allocatable, dimension(:,:,:) :: wk, wk2
-    integer :: i,j,k
-
-    if (ipencil==1) then
-       allocate(wk(xstS(1):xenS(1),xstS(2):xenS(2),xstS(3):xenS(3)))
-       allocate(wk2(xstart(1):xend(1),xstart(2):xend(2),xstart(3):xend(3)))
-       wk2=var_fine
-       if (coarse_mesh_starts_from_1) then
-          do k=xstS(3),xenS(3)
-             do j=xstS(2),xenS(2)
-                do i=xstS(1),xenS(1)
-                   wk(i,j,k) = wk2((i-1)*iskipS+1,(j-1)*jskipS+1,(k-1)*kskipS+1)
-                end do
-             end do
-          end do
-       else
-          do k=xstS(3),xenS(3)
-             do j=xstS(2),xenS(2)
-                do i=xstS(1),xenS(1)
-                   wk(i,j,k) = wk2(i*iskipS,j*jskipS,k*kskipS)
-                end do
-             end do
-          end do
-       end if
-       var_coarse=wk
-    else if (ipencil==2) then
-       allocate(wk(ystS(1):yenS(1),ystS(2):yenS(2),ystS(3):yenS(3)))
-       allocate(wk2(ystart(1):yend(1),ystart(2):yend(2),ystart(3):yend(3)))
-       wk2=var_fine
-       if (coarse_mesh_starts_from_1) then
-          do k=ystS(3),yenS(3)
-             do j=ystS(2),yenS(2)
-                do i=ystS(1),yenS(1)
-                   wk(i,j,k) = wk2((i-1)*iskipS+1,(j-1)*jskipS+1,(k-1)*kskipS+1)
-                end do
-             end do
-          end do
-       else
-          do k=ystS(3),yenS(3)
-             do j=ystS(2),yenS(2)
-                do i=ystS(1),yenS(1)
-                   wk(i,j,k) = wk2(i*iskipS,j*jskipS,k*kskipS)
-                end do
-             end do
-          end do
-       end if
-       var_coarse=wk
-    else if (ipencil==3) then
-       allocate(wk(zstS(1):zenS(1),zstS(2):zenS(2),zstS(3):zenS(3)))
-       allocate(wk2(zstart(1):zend(1),zstart(2):zend(2),zstart(3):zend(3)))
-       wk2=var_fine
-       if (coarse_mesh_starts_from_1) then
-          do k=zstS(3),zenS(3)
-             do j=zstS(2),zenS(2)
-                do i=zstS(1),zenS(1)
-                   wk(i,j,k) = wk2((i-1)*iskipS+1,(j-1)*jskipS+1,(k-1)*kskipS+1)
-                end do
-             end do
-          end do
-       else
-          do k=zstS(3),zenS(3)
-             do j=zstS(2),zenS(2)
-                do i=zstS(1),zenS(1)
-                   wk(i,j,k) = wk2(i*iskipS,j*jskipS,k*kskipS)
-                end do
-             end do
-          end do
-       end if
-       var_coarse=wk
-    end if
-
-    deallocate(wk,wk2)
-
-    return
-  end subroutine fine_to_coarseS
-
-  ! Copy data from a fine-resolution array to a coarse one for visualization
-  subroutine fine_to_coarseV(ipencil,var_fine,var_coarse)
-
-    implicit none
-
-    real(mytype), dimension(:,:,:) :: var_fine
-    real(mytype), dimension(:,:,:) :: var_coarse
-    integer, intent(IN) :: ipencil
-
-    real(mytype), allocatable, dimension(:,:,:) :: wk, wk2
-    integer :: i,j,k
-
-    if (ipencil==1) then
-       allocate(wk(xstV(1):xenV(1),xstV(2):xenV(2),xstV(3):xenV(3)))
-       allocate(wk2(xstart(1):xend(1),xstart(2):xend(2),xstart(3):xend(3)))
-       wk2=var_fine
-       if (coarse_mesh_starts_from_1) then
-          do k=xstV(3),xenV(3)
-             do j=xstV(2),xenV(2)
-                do i=xstV(1),xenV(1)
-                   wk(i,j,k) = wk2((i-1)*iskipV+1,(j-1)*jskipV+1,(k-1)*kskipV+1)
-                end do
-             end do
-          end do
-       else
-          do k=xstV(3),xenV(3)
-             do j=xstV(2),xenV(2)
-                do i=xstV(1),xenV(1)
-                   wk(i,j,k) = wk2(i*iskipV,j*jskipV,k*kskipV)
-                end do
-             end do
-          end do
-       end if
-       var_coarse=wk
-    else if (ipencil==2) then
-       allocate(wk(ystV(1):yenV(1),ystV(2):yenV(2),ystV(3):yenV(3)))
-       allocate(wk2(ystart(1):yend(1),ystart(2):yend(2),ystart(3):yend(3)))
-       wk2=var_fine
-       if (coarse_mesh_starts_from_1) then
-          do k=ystV(3),yenV(3)
-             do j=ystV(2),yenV(2)
-                do i=ystV(1),yenV(1)
-                   wk(i,j,k) = wk2((i-1)*iskipV+1,(j-1)*jskipV+1,(k-1)*kskipV+1)
-                end do
-             end do
-          end do
-       else
-          do k=ystV(3),yenV(3)
-             do j=ystV(2),yenV(2)
-                do i=ystV(1),yenV(1)
-                   wk(i,j,k) = wk2(i*iskipV,j*jskipV,k*kskipV)
-                end do
-             end do
-          end do
-       end if
-       var_coarse=wk
-    else if (ipencil==3) then
-       allocate(wk(zstV(1):zenV(1),zstV(2):zenV(2),zstV(3):zenV(3)))
-       allocate(wk2(zstart(1):zend(1),zstart(2):zend(2),zstart(3):zend(3)))
-       wk2=var_fine
-       if (coarse_mesh_starts_from_1) then
-          do k=zstV(3),zenV(3)
-             do j=zstV(2),zenV(2)
-                do i=zstV(1),zenV(1)
-                   wk(i,j,k) = wk2((i-1)*iskipV+1,(j-1)*jskipV+1,(k-1)*kskipV+1)
-                end do
-             end do
-          end do
-       else
-          do k=zstV(3),zenV(3)
-             do j=zstV(2),zenV(2)
-                do i=zstV(1),zenV(1)
-                   wk(i,j,k) = wk2(i*iskipV,j*jskipV,k*kskipV)
-                end do
-             end do
-          end do
-       end if
-       var_coarse=wk
-    end if
-
-    deallocate(wk,wk2)
-
-    return
-  end subroutine fine_to_coarseV
-
-  ! Copy data from a fine-resolution array to a coarse one for probe
-  subroutine fine_to_coarseP(ipencil,var_fine,var_coarse)
-
-    implicit none
-
-    real(mytype), dimension(:,:,:) :: var_fine
-    real(mytype), dimension(:,:,:) :: var_coarse
-    integer, intent(IN) :: ipencil
-
-    real(mytype), allocatable, dimension(:,:,:) :: wk, wk2
-    integer :: i,j,k
-
-    if (ipencil==1) then
-       allocate(wk(xstP(1):xenP(1),xstP(2):xenP(2),xstP(3):xenP(3)))
-       allocate(wk2(xstart(1):xend(1),xstart(2):xend(2),xstart(3):xend(3)))
-       wk2=var_fine
-       if (coarse_mesh_starts_from_1) then
-          do k=xstP(3),xenP(3)
-             do j=xstP(2),xenP(2)
-                do i=xstP(1),xenP(1)
-                   wk(i,j,k) = wk2((i-1)*iskipP+1,(j-1)*jskipP+1,(k-1)*kskipP+1)
-                end do
-             end do
-          end do
-       else
-          do k=xstP(3),xenP(3)
-             do j=xstP(2),xenP(2)
-                do i=xstP(1),xenP(1)
-                   wk(i,j,k) = wk2(i*iskipP,j*jskipP,k*kskipP)
-                end do
-             end do
-          end do
-       end if
-       var_coarse=wk
-    else if (ipencil==2) then
-       allocate(wk(ystP(1):yenP(1),ystP(2):yenP(2),ystP(3):yenP(3)))
-       allocate(wk2(ystart(1):yend(1),ystart(2):yend(2),ystart(3):yend(3)))
-       wk2=var_fine
-       if (coarse_mesh_starts_from_1) then
-          do k=ystP(3),yenP(3)
-             do j=ystP(2),yenP(2)
-                do i=ystP(1),yenP(1)
-                   wk(i,j,k) = wk2((i-1)*iskipP+1,(j-1)*jskipP+1,(k-1)*kskipP+1)
-                end do
-             end do
-          end do
-       else
-          do k=ystP(3),yenP(3)
-             do j=ystP(2),yenP(2)
-                do i=ystP(1),yenP(1)
-                   wk(i,j,k) = wk2(i*iskipP,j*jskipP,k*kskipP)
-                end do
-             end do
-          end do
-       end if
-       var_coarse=wk
-    else if (ipencil==3) then
-       allocate(wk(zstP(1):zenP(1),zstP(2):zenP(2),zstP(3):zenP(3)))
-       allocate(wk2(zstart(1):zend(1),zstart(2):zend(2),zstart(3):zend(3)))
-       wk2=var_fine
-       if (coarse_mesh_starts_from_1) then
-          do k=zstP(3),zenP(3)
-             do j=zstP(2),zenP(2)
-                do i=zstP(1),zenP(1)
-                   wk(i,j,k) = wk2((i-1)*iskipP+1,(j-1)*jskipP+1,(k-1)*kskipP+1)
-                end do
-             end do
-          end do
-       else
-          do k=zstP(3),zenP(3)
-             do j=zstP(2),zenP(2)
-                do i=zstP(1),zenP(1)
-                   wk(i,j,k) = wk2(i*iskipP,j*jskipP,k*kskipP)
-                end do
-             end do
-          end do
-       end if
-       var_coarse=wk
-    end if
-
-    deallocate(wk,wk2)
-
-    return
-  end subroutine fine_to_coarseP
-
+   subroutine init_coarser_mesh_statP(i_skip, j_skip, k_skip, from1)
+
+      implicit none
+
+      integer, intent(IN) :: i_skip, j_skip, k_skip
+      logical, intent(IN) :: from1  ! .true.  - save 1,n+1,2n+1...
+      ! .false. - save n,2n,3n...
+
+      integer, dimension(3) :: skip
+      integer :: i
+
+      coarse_mesh_starts_from_1 = from1
+      iskipP = i_skip
+      jskipP = j_skip
+      kskipP = k_skip
+
+      skip(1) = iskipP
+      skip(2) = jskipP
+      skip(3) = kskipP
+
+      do i = 1, 3
+         if (from1) then
+            xstP(i) = (xstart(i) + skip(i) - 1) / skip(i)
+            if (mod(xstart(i) + skip(i) - 1, skip(i)) /= 0) xstP(i) = xstP(i) + 1
+            xenP(i) = (xend(i) + skip(i) - 1) / skip(i)
+         else
+            xstP(i) = xstart(i) / skip(i)
+            if (mod(xstart(i), skip(i)) /= 0) xstP(i) = xstP(i) + 1
+            xenP(i) = xend(i) / skip(i)
+         end if
+         xszP(i) = xenP(i) - xstP(i) + 1
+      end do
+
+      do i = 1, 3
+         if (from1) then
+            ystP(i) = (ystart(i) + skip(i) - 1) / skip(i)
+            if (mod(ystart(i) + skip(i) - 1, skip(i)) /= 0) ystP(i) = ystP(i) + 1
+            yenP(i) = (yend(i) + skip(i) - 1) / skip(i)
+         else
+            ystP(i) = ystart(i) / skip(i)
+            if (mod(ystart(i), skip(i)) /= 0) ystP(i) = ystP(i) + 1
+            yenP(i) = yend(i) / skip(i)
+         end if
+         yszP(i) = yenP(i) - ystP(i) + 1
+      end do
+
+      do i = 1, 3
+         if (from1) then
+            zstP(i) = (zstart(i) + skip(i) - 1) / skip(i)
+            if (mod(zstart(i) + skip(i) - 1, skip(i)) /= 0) zstP(i) = zstP(i) + 1
+            zenP(i) = (zend(i) + skip(i) - 1) / skip(i)
+         else
+            zstP(i) = zstart(i) / skip(i)
+            if (mod(zstart(i), skip(i)) /= 0) zstP(i) = zstP(i) + 1
+            zenP(i) = zend(i) / skip(i)
+         end if
+         zszP(i) = zenP(i) - zstP(i) + 1
+      end do
+
+      return
+   end subroutine init_coarser_mesh_statP
+
+   ! Copy data from a fine-resolution array to a coarse one for statistic
+   subroutine fine_to_coarseS(ipencil, var_fine, var_coarse)
+
+      implicit none
+
+      real(mytype), dimension(:, :, :) :: var_fine
+      real(mytype), dimension(:, :, :) :: var_coarse
+      integer, intent(IN) :: ipencil
+
+      real(mytype), allocatable, dimension(:, :, :) :: wk, wk2
+      integer :: i, j, k
+
+      if (ipencil == 1) then
+         allocate (wk(xstS(1):xenS(1), xstS(2):xenS(2), xstS(3):xenS(3)))
+         allocate (wk2(xstart(1):xend(1), xstart(2):xend(2), xstart(3):xend(3)))
+         wk2 = var_fine
+         if (coarse_mesh_starts_from_1) then
+            do k = xstS(3), xenS(3)
+               do j = xstS(2), xenS(2)
+                  do i = xstS(1), xenS(1)
+                     wk(i, j, k) = wk2((i - 1) * iskipS + 1, (j - 1) * jskipS + 1, (k - 1) * kskipS + 1)
+                  end do
+               end do
+            end do
+         else
+            do k = xstS(3), xenS(3)
+               do j = xstS(2), xenS(2)
+                  do i = xstS(1), xenS(1)
+                     wk(i, j, k) = wk2(i * iskipS, j * jskipS, k * kskipS)
+                  end do
+               end do
+            end do
+         end if
+         var_coarse = wk
+      else if (ipencil == 2) then
+         allocate (wk(ystS(1):yenS(1), ystS(2):yenS(2), ystS(3):yenS(3)))
+         allocate (wk2(ystart(1):yend(1), ystart(2):yend(2), ystart(3):yend(3)))
+         wk2 = var_fine
+         if (coarse_mesh_starts_from_1) then
+            do k = ystS(3), yenS(3)
+               do j = ystS(2), yenS(2)
+                  do i = ystS(1), yenS(1)
+                     wk(i, j, k) = wk2((i - 1) * iskipS + 1, (j - 1) * jskipS + 1, (k - 1) * kskipS + 1)
+                  end do
+               end do
+            end do
+         else
+            do k = ystS(3), yenS(3)
+               do j = ystS(2), yenS(2)
+                  do i = ystS(1), yenS(1)
+                     wk(i, j, k) = wk2(i * iskipS, j * jskipS, k * kskipS)
+                  end do
+               end do
+            end do
+         end if
+         var_coarse = wk
+      else if (ipencil == 3) then
+         allocate (wk(zstS(1):zenS(1), zstS(2):zenS(2), zstS(3):zenS(3)))
+         allocate (wk2(zstart(1):zend(1), zstart(2):zend(2), zstart(3):zend(3)))
+         wk2 = var_fine
+         if (coarse_mesh_starts_from_1) then
+            do k = zstS(3), zenS(3)
+               do j = zstS(2), zenS(2)
+                  do i = zstS(1), zenS(1)
+                     wk(i, j, k) = wk2((i - 1) * iskipS + 1, (j - 1) * jskipS + 1, (k - 1) * kskipS + 1)
+                  end do
+               end do
+            end do
+         else
+            do k = zstS(3), zenS(3)
+               do j = zstS(2), zenS(2)
+                  do i = zstS(1), zenS(1)
+                     wk(i, j, k) = wk2(i * iskipS, j * jskipS, k * kskipS)
+                  end do
+               end do
+            end do
+         end if
+         var_coarse = wk
+      end if
+
+      deallocate (wk, wk2)
+
+      return
+   end subroutine fine_to_coarseS
+
+   ! Copy data from a fine-resolution array to a coarse one for visualization
+   subroutine fine_to_coarseV(ipencil, var_fine, var_coarse)
+
+      implicit none
+
+      real(mytype), dimension(:, :, :) :: var_fine
+      real(mytype), dimension(:, :, :) :: var_coarse
+      integer, intent(IN) :: ipencil
+
+      real(mytype), allocatable, dimension(:, :, :) :: wk, wk2
+      integer :: i, j, k
+
+      if (ipencil == 1) then
+         allocate (wk(xstV(1):xenV(1), xstV(2):xenV(2), xstV(3):xenV(3)))
+         allocate (wk2(xstart(1):xend(1), xstart(2):xend(2), xstart(3):xend(3)))
+         wk2 = var_fine
+         if (coarse_mesh_starts_from_1) then
+            do k = xstV(3), xenV(3)
+               do j = xstV(2), xenV(2)
+                  do i = xstV(1), xenV(1)
+                     wk(i, j, k) = wk2((i - 1) * iskipV + 1, (j - 1) * jskipV + 1, (k - 1) * kskipV + 1)
+                  end do
+               end do
+            end do
+         else
+            do k = xstV(3), xenV(3)
+               do j = xstV(2), xenV(2)
+                  do i = xstV(1), xenV(1)
+                     wk(i, j, k) = wk2(i * iskipV, j * jskipV, k * kskipV)
+                  end do
+               end do
+            end do
+         end if
+         var_coarse = wk
+      else if (ipencil == 2) then
+         allocate (wk(ystV(1):yenV(1), ystV(2):yenV(2), ystV(3):yenV(3)))
+         allocate (wk2(ystart(1):yend(1), ystart(2):yend(2), ystart(3):yend(3)))
+         wk2 = var_fine
+         if (coarse_mesh_starts_from_1) then
+            do k = ystV(3), yenV(3)
+               do j = ystV(2), yenV(2)
+                  do i = ystV(1), yenV(1)
+                     wk(i, j, k) = wk2((i - 1) * iskipV + 1, (j - 1) * jskipV + 1, (k - 1) * kskipV + 1)
+                  end do
+               end do
+            end do
+         else
+            do k = ystV(3), yenV(3)
+               do j = ystV(2), yenV(2)
+                  do i = ystV(1), yenV(1)
+                     wk(i, j, k) = wk2(i * iskipV, j * jskipV, k * kskipV)
+                  end do
+               end do
+            end do
+         end if
+         var_coarse = wk
+      else if (ipencil == 3) then
+         allocate (wk(zstV(1):zenV(1), zstV(2):zenV(2), zstV(3):zenV(3)))
+         allocate (wk2(zstart(1):zend(1), zstart(2):zend(2), zstart(3):zend(3)))
+         wk2 = var_fine
+         if (coarse_mesh_starts_from_1) then
+            do k = zstV(3), zenV(3)
+               do j = zstV(2), zenV(2)
+                  do i = zstV(1), zenV(1)
+                     wk(i, j, k) = wk2((i - 1) * iskipV + 1, (j - 1) * jskipV + 1, (k - 1) * kskipV + 1)
+                  end do
+               end do
+            end do
+         else
+            do k = zstV(3), zenV(3)
+               do j = zstV(2), zenV(2)
+                  do i = zstV(1), zenV(1)
+                     wk(i, j, k) = wk2(i * iskipV, j * jskipV, k * kskipV)
+                  end do
+               end do
+            end do
+         end if
+         var_coarse = wk
+      end if
+
+      deallocate (wk, wk2)
+
+      return
+   end subroutine fine_to_coarseV
+
+   ! Copy data from a fine-resolution array to a coarse one for probe
+   subroutine fine_to_coarseP(ipencil, var_fine, var_coarse)
+
+      implicit none
+
+      real(mytype), dimension(:, :, :) :: var_fine
+      real(mytype), dimension(:, :, :) :: var_coarse
+      integer, intent(IN) :: ipencil
+
+      real(mytype), allocatable, dimension(:, :, :) :: wk, wk2
+      integer :: i, j, k
+
+      if (ipencil == 1) then
+         allocate (wk(xstP(1):xenP(1), xstP(2):xenP(2), xstP(3):xenP(3)))
+         allocate (wk2(xstart(1):xend(1), xstart(2):xend(2), xstart(3):xend(3)))
+         wk2 = var_fine
+         if (coarse_mesh_starts_from_1) then
+            do k = xstP(3), xenP(3)
+               do j = xstP(2), xenP(2)
+                  do i = xstP(1), xenP(1)
+                     wk(i, j, k) = wk2((i - 1) * iskipP + 1, (j - 1) * jskipP + 1, (k - 1) * kskipP + 1)
+                  end do
+               end do
+            end do
+         else
+            do k = xstP(3), xenP(3)
+               do j = xstP(2), xenP(2)
+                  do i = xstP(1), xenP(1)
+                     wk(i, j, k) = wk2(i * iskipP, j * jskipP, k * kskipP)
+                  end do
+               end do
+            end do
+         end if
+         var_coarse = wk
+      else if (ipencil == 2) then
+         allocate (wk(ystP(1):yenP(1), ystP(2):yenP(2), ystP(3):yenP(3)))
+         allocate (wk2(ystart(1):yend(1), ystart(2):yend(2), ystart(3):yend(3)))
+         wk2 = var_fine
+         if (coarse_mesh_starts_from_1) then
+            do k = ystP(3), yenP(3)
+               do j = ystP(2), yenP(2)
+                  do i = ystP(1), yenP(1)
+                     wk(i, j, k) = wk2((i - 1) * iskipP + 1, (j - 1) * jskipP + 1, (k - 1) * kskipP + 1)
+                  end do
+               end do
+            end do
+         else
+            do k = ystP(3), yenP(3)
+               do j = ystP(2), yenP(2)
+                  do i = ystP(1), yenP(1)
+                     wk(i, j, k) = wk2(i * iskipP, j * jskipP, k * kskipP)
+                  end do
+               end do
+            end do
+         end if
+         var_coarse = wk
+      else if (ipencil == 3) then
+         allocate (wk(zstP(1):zenP(1), zstP(2):zenP(2), zstP(3):zenP(3)))
+         allocate (wk2(zstart(1):zend(1), zstart(2):zend(2), zstart(3):zend(3)))
+         wk2 = var_fine
+         if (coarse_mesh_starts_from_1) then
+            do k = zstP(3), zenP(3)
+               do j = zstP(2), zenP(2)
+                  do i = zstP(1), zenP(1)
+                     wk(i, j, k) = wk2((i - 1) * iskipP + 1, (j - 1) * jskipP + 1, (k - 1) * kskipP + 1)
+                  end do
+               end do
+            end do
+         else
+            do k = zstP(3), zenP(3)
+               do j = zstP(2), zenP(2)
+                  do i = zstP(1), zenP(1)
+                     wk(i, j, k) = wk2(i * iskipP, j * jskipP, k * kskipP)
+                  end do
+               end do
+            end do
+         end if
+         var_coarse = wk
+      end if
+
+      deallocate (wk, wk2)
+
+      return
+   end subroutine fine_to_coarseP
 
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Find sub-domain information held by current processor
-  !   INPUT:
-  !     nx, ny, nz - global data dimension
-  !     pdim(3)    - number of processor grid in each dimension,
-  !                  valid values: 1 - distibute locally;
-  !                                2 - distribute across p_row;
-  !                                3 - distribute across p_col
-  !   OUTPUT:
-  !     lstart(3)  - starting index
-  !     lend(3)    - ending index
-  !     lsize(3)   - size of the sub-block (redundant)
+   ! Find sub-domain information held by current processor
+   !   INPUT:
+   !     nx, ny, nz - global data dimension
+   !     pdim(3)    - number of processor grid in each dimension,
+   !                  valid values: 1 - distibute locally;
+   !                                2 - distribute across p_row;
+   !                                3 - distribute across p_col
+   !   OUTPUT:
+   !     lstart(3)  - starting index
+   !     lend(3)    - ending index
+   !     lsize(3)   - size of the sub-block (redundant)
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine partition(nx, ny, nz, pdim, lstart, lend, lsize)
-
-    implicit none
-
-    integer, intent(IN) :: nx, ny, nz
-    integer, dimension(3), intent(IN) :: pdim
-    integer, dimension(3), intent(OUT) :: lstart, lend, lsize
-
-    integer, allocatable, dimension(:) :: st,en,sz
-    integer :: i, gsize
-
-    do i = 1, 3
-
-       if (i==1) then
-          gsize = nx
-       else if (i==2) then
-          gsize = ny
-       else if (i==3) then
-          gsize = nz
-       end if
-
-       if (pdim(i) == 1) then        ! all local
-          lstart(i) = 1
-          lend(i)   = gsize
-          lsize(i)  = gsize
-       elseif (pdim(i) == 2) then    ! distribute across dims(1)
-          allocate(st(0:dims(1)-1))
-          allocate(en(0:dims(1)-1))
-          allocate(sz(0:dims(1)-1))
-          call distribute(gsize,dims(1),st,en,sz)
-          lstart(i) = st(coord(1))
-          lend(i)   = en(coord(1))
-          lsize(i)  = sz(coord(1))
-          deallocate(st,en,sz)
-       elseif (pdim(i) == 3) then    ! distribute across dims(2)
-          allocate(st(0:dims(2)-1))
-          allocate(en(0:dims(2)-1))
-          allocate(sz(0:dims(2)-1))
-          call distribute(gsize,dims(2),st,en,sz)
-          lstart(i) = st(coord(2))
-          lend(i)   = en(coord(2))
-          lsize(i)  = sz(coord(2))
-          deallocate(st,en,sz)
-       end if
-
-    end do
-    return
-
-  end subroutine partition
+   subroutine partition(nx, ny, nz, pdim, lstart, lend, lsize)
+
+      implicit none
+
+      integer, intent(IN) :: nx, ny, nz
+      integer, dimension(3), intent(IN) :: pdim
+      integer, dimension(3), intent(OUT) :: lstart, lend, lsize
+
+      integer, allocatable, dimension(:) :: st, en, sz
+      integer :: i, gsize
+
+      do i = 1, 3
+
+         if (i == 1) then
+            gsize = nx
+         else if (i == 2) then
+            gsize = ny
+         else if (i == 3) then
+            gsize = nz
+         end if
+
+         if (pdim(i) == 1) then        ! all local
+            lstart(i) = 1
+            lend(i) = gsize
+            lsize(i) = gsize
+         elseif (pdim(i) == 2) then    ! distribute across dims(1)
+            allocate (st(0:dims(1) - 1))
+            allocate (en(0:dims(1) - 1))
+            allocate (sz(0:dims(1) - 1))
+            call distribute(gsize, dims(1), st, en, sz)
+            lstart(i) = st(coord(1))
+            lend(i) = en(coord(1))
+            lsize(i) = sz(coord(1))
+            deallocate (st, en, sz)
+         elseif (pdim(i) == 3) then    ! distribute across dims(2)
+            allocate (st(0:dims(2) - 1))
+            allocate (en(0:dims(2) - 1))
+            allocate (sz(0:dims(2) - 1))
+            call distribute(gsize, dims(2), st, en, sz)
+            lstart(i) = st(coord(2))
+            lend(i) = en(coord(2))
+            lsize(i) = sz(coord(2))
+            deallocate (st, en, sz)
+         end if
+
+      end do
+      return
+
+   end subroutine partition
 
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  !   - distibutes grid points in one dimension
-  !   - handles uneven distribution properly
+   !   - distibutes grid points in one dimension
+   !   - handles uneven distribution properly
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine distribute(data1,proc,st,en,sz)
-
-    implicit none
-    ! data1 -- data size in any dimension to be partitioned
-    ! proc  -- number of processors in that dimension
-    ! st    -- array of starting index
-    ! en    -- array of ending index
-    ! sz    -- array of local size  (redundent)
-    integer data1,proc,st(0:proc-1),en(0:proc-1),sz(0:proc-1)
-    integer i,size1,nl,nu
-
-    size1=data1/proc
-    nu = data1 - size1 * proc
-    nl = proc - nu
-    st(0) = 1
-    sz(0) = size1
-    en(0) = size1
-    do i=1,nl-1
-       st(i) = st(i-1) + size1
-       sz(i) = size1
-       en(i) = en(i-1) + size1
-    end do
-    size1 = size1 + 1
-    do i=nl,proc-1
-       st(i) = en(i-1) + 1
-       sz(i) = size1
-       en(i) = en(i-1) + size1
-    end do
-    en(proc-1)= data1
-    sz(proc-1)= data1-st(proc-1)+1
-
-    return
-  end subroutine distribute
+   subroutine distribute(data1, proc, st, en, sz)
+
+      implicit none
+      ! data1 -- data size in any dimension to be partitioned
+      ! proc  -- number of processors in that dimension
+      ! st    -- array of starting index
+      ! en    -- array of ending index
+      ! sz    -- array of local size  (redundent)
+      integer data1, proc, st(0:proc - 1), en(0:proc - 1), sz(0:proc - 1)
+      integer i, size1, nl, nu
+
+      size1 = data1 / proc
+      nu = data1 - size1 * proc
+      nl = proc - nu
+      st(0) = 1
+      sz(0) = size1
+      en(0) = size1
+      do i = 1, nl - 1
+         st(i) = st(i - 1) + size1
+         sz(i) = size1
+         en(i) = en(i - 1) + size1
+      end do
+      size1 = size1 + 1
+      do i = nl, proc - 1
+         st(i) = en(i - 1) + 1
+         sz(i) = size1
+         en(i) = en(i - 1) + size1
+      end do
+      en(proc - 1) = data1
+      sz(proc - 1) = data1 - st(proc - 1) + 1
+
+      return
+   end subroutine distribute
 
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  !  Define how each dimension is distributed across processors
-  !    e.g. 17 meshes across 4 processor would be distibuted as (4,4,4,5)
-  !    such global information is required locally at MPI_ALLTOALLV time
+   !  Define how each dimension is distributed across processors
+   !    e.g. 17 meshes across 4 processor would be distibuted as (4,4,4,5)
+   !    such global information is required locally at MPI_ALLTOALLV time
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine get_dist(nx,ny,nz,decomp)
+   subroutine get_dist(nx, ny, nz, decomp)
 
-    implicit none
+      implicit none
 
-    integer, intent(IN) :: nx, ny, nz
-    TYPE(DECOMP_INFO), intent(INOUT) :: decomp
-    integer, allocatable, dimension(:) :: st,en
+      integer, intent(IN) :: nx, ny, nz
+      TYPE(DECOMP_INFO), intent(INOUT) :: decomp
+      integer, allocatable, dimension(:) :: st, en
 
-    allocate(st(0:dims(1)-1))
-    allocate(en(0:dims(1)-1))
-    call distribute(nx,dims(1),st,en,decomp%x1dist)
-    call distribute(ny,dims(1),st,en,decomp%y1dist)
-    deallocate(st,en)
+      allocate (st(0:dims(1) - 1))
+      allocate (en(0:dims(1) - 1))
+      call distribute(nx, dims(1), st, en, decomp%x1dist)
+      call distribute(ny, dims(1), st, en, decomp%y1dist)
+      deallocate (st, en)
 
-    allocate(st(0:dims(2)-1))
-    allocate(en(0:dims(2)-1))
-    call distribute(ny,dims(2),st,en,decomp%y2dist)
-    call distribute(nz,dims(2),st,en,decomp%z2dist)
-    deallocate(st,en)
+      allocate (st(0:dims(2) - 1))
+      allocate (en(0:dims(2) - 1))
+      call distribute(ny, dims(2), st, en, decomp%y2dist)
+      call distribute(nz, dims(2), st, en, decomp%z2dist)
+      deallocate (st, en)
 
-    return
-  end subroutine get_dist
+      return
+   end subroutine get_dist
 
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Prepare the send / receive buffers for MPI_ALLTOALLV communications
+   ! Prepare the send / receive buffers for MPI_ALLTOALLV communications
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine prepare_buffer(decomp)
-
-    implicit none
-
-    TYPE(DECOMP_INFO), intent(INOUT) :: decomp
-
-    integer :: i
-
-    !LG : AJOUTS "bidons" pour eviter un plantage en -O3 avec gcc9.3
-    !       * la fonction sortait des valeurs 'aleatoires'
-    !         et le calcul plantait dans MPI_ALLTOALLV
-    !       * pas de plantage en O2
-
-    if (nrank==0) then
-       open(newunit=i, file='temp.dat', form='unformatted')
-       write(i) decomp%x1dist,decomp%y1dist,decomp%y2dist,decomp%z2dist, &
-                decomp%xsz,decomp%ysz,decomp%zsz
-       close(i, status='delete')
-    endif
-
-    ! MPI_ALLTOALLV buffer information
-
-    do i=0, dims(1)-1
-       decomp%x1cnts(i) = decomp%x1dist(i)*decomp%xsz(2)*decomp%xsz(3)
-       decomp%y1cnts(i) = decomp%ysz(1)*decomp%y1dist(i)*decomp%ysz(3)
-       if (i==0) then
-          decomp%x1disp(i) = 0  ! displacement is 0-based index
-          decomp%y1disp(i) = 0
-       else
-          decomp%x1disp(i) = decomp%x1disp(i-1) + decomp%x1cnts(i-1)
-          decomp%y1disp(i) = decomp%y1disp(i-1) + decomp%y1cnts(i-1)
-       end if
-    end do
-
-    do i=0, dims(2)-1
-       decomp%y2cnts(i) = decomp%ysz(1)*decomp%y2dist(i)*decomp%ysz(3)
-       decomp%z2cnts(i) = decomp%zsz(1)*decomp%zsz(2)*decomp%z2dist(i)
-       if (i==0) then
-          decomp%y2disp(i) = 0  ! displacement is 0-based index
-          decomp%z2disp(i) = 0
-       else
-          decomp%y2disp(i) = decomp%y2disp(i-1) + decomp%y2cnts(i-1)
-          decomp%z2disp(i) = decomp%z2disp(i-1) + decomp%z2cnts(i-1)
-       end if
-    end do
-
-    ! MPI_ALLTOALL buffer information
-
-    ! For evenly distributed data, following is an easier implementation.
-    ! But it should be covered by the more general formulation below.
-    !decomp%x1count = decomp%xsz(1)*decomp%xsz(2)*decomp%xsz(3)/dims(1)
-    !decomp%y1count = decomp%ysz(1)*decomp%ysz(2)*decomp%ysz(3)/dims(1)
-    !decomp%y2count = decomp%ysz(1)*decomp%ysz(2)*decomp%ysz(3)/dims(2)
-    !decomp%z2count = decomp%zsz(1)*decomp%zsz(2)*decomp%zsz(3)/dims(2)
-
-    ! For unevenly distributed data, pad smaller messages. Note the
-    ! last blocks along pencils always get assigned more mesh points
-    ! for X <=> Y transposes
-    decomp%x1count = decomp%x1dist(dims(1)-1) * &
-         decomp%y1dist(dims(1)-1) * decomp%xsz(3)
-    decomp%y1count = decomp%x1count
-    ! for Y <=> Z transposes
-    decomp%y2count = decomp%y2dist(dims(2)-1) * &
-         decomp%z2dist(dims(2)-1) * decomp%zsz(1)
-    decomp%z2count = decomp%y2count
-
-    return
-  end subroutine prepare_buffer
-
-#ifdef SHM
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  !  Generate shared-memory information
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine decomp_info_init_shm(decomp)
-
-    implicit none
-
-    TYPE(DECOMP_INFO), intent(INOUT) :: decomp
-
-    ! a copy of old displacement array (will be overwritten by shm code)
-    allocate(decomp%x1disp_o(0:dims(1)-1),decomp%y1disp_o(0:dims(1)-1), &
-         decomp%y2disp_o(0:dims(2)-1),decomp%z2disp_o(0:dims(2)-1))
-    decomp%x1disp_o = decomp%x1disp
-    decomp%y1disp_o = decomp%y1disp
-    decomp%y2disp_o = decomp%y2disp
-    decomp%z2disp_o = decomp%z2disp
-
-    call prepare_shared_buffer(decomp%ROW_INFO,DECOMP_2D_COMM_ROW,decomp)
-    call prepare_shared_buffer(decomp%COL_INFO,DECOMP_2D_COMM_COL,decomp)
-
-    return
-  end subroutine decomp_info_init_shm
-
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! For shared-memory implementation, prepare send/recv shared buffer
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine prepare_shared_buffer(C,MPI_COMM,decomp)
-
-    implicit none
-
-    TYPE(SMP_INFO) :: C
-    INTEGER :: MPI_COMM
-    TYPE(DECOMP_INFO) :: decomp
-
-    INTEGER, ALLOCATABLE :: KTBL(:,:),NARY(:,:),KTBLALL(:,:)
-    INTEGER MYSMP, MYCORE, COLOR
-
-    integer :: ierror
-
-    C%MPI_COMM = MPI_COMM
-    CALL MPI_COMM_SIZE(MPI_COMM,C%NCPU,ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_COMM_SIZE")
-    CALL MPI_COMM_RANK(MPI_COMM,C%NODE_ME,ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_CART_RANK")
-    C%SMP_COMM  = MPI_COMM_NULL
-    C%CORE_COMM = MPI_COMM_NULL
-    C%SMP_ME= 0
-    C%NCORE = 0
-    C%CORE_ME = 0
-    C%MAXCORE = 0
-    C%NSMP  = 0
-    C%N_SND = 0
-    C%N_RCV = 0
-    C%SND_P = 0
-    C%RCV_P = 0
-    C%SND_P_c = 0
-    C%RCV_P_c = 0
-
-    ! get smp-node map for this communicator and set up smp communicators
-    CALL GET_SMP_MAP(C%MPI_COMM, C%NSMP, MYSMP, &
-         C%NCORE, MYCORE, C%MAXCORE)
-    C%SMP_ME = MYSMP + 1
-    C%CORE_ME = MYCORE + 1
-    ! - set up inter/intra smp-node communicators
-    COLOR = MYCORE
-    IF (COLOR.GT.0) COLOR = MPI_UNDEFINED
-    CALL MPI_Comm_split(C%MPI_COMM, COLOR, MYSMP, C%SMP_COMM, ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_COMM_SPLIT")
-    CALL MPI_Comm_split(C%MPI_COMM, MYSMP, MYCORE, C%CORE_COMM, ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_COMM_SPLIT")
-    ! - allocate work space
-    ALLOCATE(KTBL(C%MAXCORE,C%NSMP),NARY(C%NCPU,C%NCORE))
-    ALLOCATE(KTBLALL(C%MAXCORE,C%NSMP))
-    ! - set up smp-node/core to node_me lookup table
-    KTBL = 0
-    KTBL(C%CORE_ME,C%SMP_ME) = C%NODE_ME + 1
-    CALL MPI_ALLREDUCE(KTBL,KTBLALL,C%NSMP*C%MAXCORE,MPI_INTEGER, &
-         MPI_SUM,MPI_COMM,ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLREDUCE")
-    KTBL=KTBLALL
-    !  IF (SUM(KTBL) /= C%NCPU*(C%NCPU+1)/2) &
-    !       CALL MPI_ABORT(...
-
-    ! compute offsets in shared SNDBUF and RCVBUF
-    CALL MAPSET_SMPSHM(C, KTBL, NARY, decomp)
-
-    DEALLOCATE(KTBL,NARY)
-
-    return
-  end subroutine prepare_shared_buffer
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Use Ian Bush's FreeIPC to generate shared-memory information
-  !  - system independent solution
-  !  - replacing David Tanqueray's implementation in alloc_shm.c
-  !    (old C code renamed to get_smp_map2)
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine get_smp_map(comm, nnodes, my_node, ncores, my_core, maxcor)
-
-    use FIPC_module
-
-    implicit none
-
-    integer, intent(IN) :: comm
-    integer, intent(OUT) :: nnodes, my_node, ncores, my_core, maxcor
-
-    integer :: intra_comm, extra_comm
-    integer :: ierror
-
-    call FIPC_init(comm, ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "FIPC_init")
-
-    ! intra_comm: communicator for processes on this shared memory node
-    ! extra_comm: communicator for all rank 0 on each shared memory node
-    call FIPC_ctxt_intra_comm(FIPC_ctxt_world, intra_comm, ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "FIPC_ctxt_intra_comm")
-    call FIPC_ctxt_extra_comm(FIPC_ctxt_world, extra_comm, ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "FIPC_ctxt_extra_comm")
-
-    call MPI_COMM_SIZE(intra_comm,  ncores, ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_COMM_SIZE")
-    call MPI_COMM_RANK(intra_comm, my_core, ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_COMM_RANK")
-
-    ! only rank 0 on each shared memory node member of extra_comm
-    ! for others extra_comm = MPI_COMM_NULL
-    if (extra_comm /= MPI_COMM_NULL) then
-       call MPI_COMM_SIZE(extra_comm,  nnodes, ierror)
-       if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_COMM_SIZE")
-       call MPI_COMM_RANK(extra_comm, my_node, ierror)
-       if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_COMM_RANK")
-    end if
-
-    ! other ranks share the same information as their leaders
-    call MPI_BCAST( nnodes, 1, MPI_INTEGER, 0, intra_comm, ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_BCAST")
-    call MPI_BCAST(my_node, 1, MPI_INTEGER, 0, intra_comm, ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_BCAST")
-
-    ! maxcor
-    call MPI_ALLREDUCE(ncores, maxcor, 1, MPI_INTEGER, MPI_MAX, &
-         decomp_2d_comm, ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLREDUCE")
-
-    call FIPC_finalize(ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "FIPC_finalize")
-
-    return
-
-  end subroutine get_smp_map
-
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Set up smp-node based shared memory maps
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  SUBROUTINE MAPSET_SMPSHM(C, KTBL, NARY, decomp)
-
-    IMPLICIT NONE
-
-    TYPE (SMP_INFO) C
-    INTEGER KTBL(C%MAXCORE,C%NSMP)
-    INTEGER NARY(C%NCPU,C%NCORE)
-    TYPE (DECOMP_INFO) :: decomp
-
-    INTEGER i, j, k, l, N, PTR, BSIZ, ierror, status, seed
-    character*16 s
-
-    BSIZ = C%N_SND
-
-    ! a - SNDBUF
-    IF (C%MPI_COMM==DECOMP_2D_COMM_COL) THEN
-       ALLOCATE(decomp%x1cnts_s(C%NSMP),decomp%x1disp_s(C%NSMP+1), &
-            stat=status)
-       CALL MPI_Allgather(decomp%x1cnts, C%NCPU, MPI_INTEGER, &
-            NARY, C%NCPU, MPI_INTEGER, C%CORE_COMM, ierror)
-       if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLGATHER")
-       PTR = 0
-       DO i=1,C%NSMP
-          decomp%x1disp_s(i) = PTR
-          N = 0
-          DO j=1,C%MAXCORE
-             k = KTBL(j,i)
-             IF (k > 0) then
-                DO l=1,C%NCORE
-                   IF (l == C%CORE_ME) decomp%x1disp_o(k-1) = PTR
-                   N = N + NARY(k,l)
-                   PTR = PTR + NARY(k,l)
-                END DO
-             END IF
-          END DO
-          decomp%x1cnts_s(i) = N
-       END DO
-       decomp%x1disp_s(C%NSMP+1) = PTR
-       IF (PTR > BSIZ) BSIZ = PTR
-
-    ELSE IF (C%MPI_COMM==DECOMP_2D_COMM_ROW) THEN
-       ALLOCATE(decomp%y2cnts_s(C%NSMP),decomp%y2disp_s(C%NSMP+1), &
-            stat=status)
-       CALL MPI_Allgather(decomp%y2cnts, C%NCPU, MPI_INTEGER, &
-            NARY, C%NCPU, MPI_INTEGER, C%CORE_COMM, ierror)
-       if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLGATHER")
-       PTR = 0
-       DO i=1,C%NSMP
-          decomp%y2disp_s(i) = PTR
-          N = 0
-          DO j=1,C%MAXCORE
-             k = KTBL(j,i)
-             IF (k > 0) then
-                DO l=1,C%NCORE
-                   IF (l == C%CORE_ME) decomp%y2disp_o(k-1) = PTR
-                   N = N + NARY(k,l)
-                   PTR = PTR + NARY(k,l)
-                END DO
-             END IF
-          END DO
-          decomp%y2cnts_s(i) = N
-       END DO
-       decomp%y2disp_s(C%NSMP+1) = PTR
-       IF (PTR > BSIZ) BSIZ = PTR
-    END IF
-
-    ! b - RCVBUF
-
-    IF (C%MPI_COMM==DECOMP_2D_COMM_COL) THEN
-       ALLOCATE(decomp%y1cnts_s(C%NSMP),decomp%y1disp_s(C%NSMP+1), &
-            stat=status)
-       CALL MPI_Allgather(decomp%y1cnts, C%NCPU, MPI_INTEGER, &
-            NARY, C%NCPU, MPI_INTEGER, C%CORE_COMM, ierror)
-       if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLGATHER")
-       PTR = 0
-       DO i=1,C%NSMP
-          decomp%y1disp_s(i) = PTR
-          N=0
-          DO j=1,C%NCORE
-             DO l=1,C%MAXCORE
-                k = KTBL(l,i)
-                IF (k > 0) then
-                   IF (j == C%CORE_ME) decomp%y1disp_o(k-1) = PTR
-                   N = N + NARY(k,j)
-                   PTR = PTR + NARY(k,j)
-                END IF
-             END DO
-          END DO
-          decomp%y1cnts_s(i) = N
-       END DO
-       decomp%y1disp_s(C%NSMP+1) = PTR
-       IF (PTR > BSIZ) BSIZ = PTR
-
-    ELSE IF (C%MPI_COMM==DECOMP_2D_COMM_ROW) THEN
-       ALLOCATE(decomp%z2cnts_s(C%NSMP),decomp%z2disp_s(C%NSMP+1), &
-            stat=status)
-       CALL MPI_Allgather(decomp%z2cnts, C%NCPU, MPI_INTEGER, &
-            NARY, C%NCPU, MPI_INTEGER, C%CORE_COMM, ierror)
-       if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLGATHER")
-       PTR = 0
-       DO i=1,C%NSMP
-          decomp%z2disp_s(i) = PTR
-          N=0
-          DO j=1,C%NCORE
-             DO l=1,C%MAXCORE
-                k = KTBL(l,i)
-                IF (k > 0) then
-                   IF (j == C%CORE_ME) decomp%z2disp_o(k-1) = PTR
-                   N = N + NARY(k,j)
-                   PTR = PTR + NARY(k,j)
-                END IF
-             END DO
-          END DO
-          decomp%z2cnts_s(i) = N
-       END DO
-       decomp%z2disp_s(C%NSMP+1) = PTR
-       IF (PTR > BSIZ) BSIZ = PTR
-
-    END IF
-
-    ! check buffer size and (re)-allocate buffer space if necessary
-    IF (BSIZ > C%N_SND) then
-       IF (C%SND_P /= 0) CALL DEALLOC_SHM(C%SND_P, C%CORE_COMM)
-       ! make sure each rank has unique keys to get shared memory
-       !IF (C%MPI_COMM==DECOMP_2D_COMM_COL) THEN
-       !   seed = nrank+nproc*0+1 ! has to be non-zero
-       !ELSE IF (C%MPI_COMM==DECOMP_2D_COMM_ROW) THEN
-       !   seed = nrank+nproc*1+1
-       !END IF
-       status = 1
-       !CALL ALLOC_SHM(C%SND_P, BSIZ, real_type, C%CORE_COMM, status, &
-       !     seed)
-       CALL ALLOC_SHM(C%SND_P, BSIZ, real_type, C%CORE_COMM, status)
-       C%N_SND = BSIZ
-
-       IF (C%RCV_P /= 0) CALL DEALLOC_SHM(C%RCV_P, C%CORE_COMM)
-       status = 1
-       CALL ALLOC_SHM(C%RCV_P, BSIZ, real_type, C%CORE_COMM, status)
-       C%N_RCV = BSIZ
-
-       IF (C%SND_P_c /= 0) CALL DEALLOC_SHM(C%SND_P_c, C%CORE_COMM)
-       status = 1
-       CALL ALLOC_SHM(C%SND_P_c, BSIZ, complex_type, C%CORE_COMM, status)
-       C%N_SND = BSIZ
-
-       IF (C%RCV_P_c /= 0) CALL DEALLOC_SHM(C%RCV_P_c, C%CORE_COMM)
-       status = 1
-       CALL ALLOC_SHM(C%RCV_P_c, BSIZ, complex_type, C%CORE_COMM, status)
-       C%N_RCV = BSIZ
-
-
-    END IF
-
-    RETURN
-  END SUBROUTINE MAPSET_SMPSHM
-
-#endif
-
-
-#ifdef OCC
-  ! For non-blocking communication code, progress the comminication stack
-  subroutine transpose_test(handle)
-
-    implicit none
-
-    integer :: handle, ierror
-
-    call NBC_TEST(handle,ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "NBC_TEST")
-
-    return
-  end subroutine transpose_test
+   subroutine prepare_buffer(decomp)
+
+      implicit none
+
+      TYPE(DECOMP_INFO), intent(INOUT) :: decomp
+
+      integer :: i
+
+      !LG : AJOUTS "bidons" pour eviter un plantage en -O3 avec gcc9.3
+      !       * la fonction sortait des valeurs 'aleatoires'
+      !         et le calcul plantait dans MPI_ALLTOALLV
+      !       * pas de plantage en O2
+
+      if (nrank == 0) then
+         open (newunit=i, file='temp.dat', form='unformatted')
+         write (i) decomp%x1dist, decomp%y1dist, decomp%y2dist, decomp%z2dist, &
+            decomp%xsz, decomp%ysz, decomp%zsz
+         close (i, status='delete')
+      end if
+
+      ! MPI_ALLTOALLV buffer information
+
+      do i = 0, dims(1) - 1
+         decomp%x1cnts(i) = decomp%x1dist(i) * decomp%xsz(2) * decomp%xsz(3)
+         decomp%y1cnts(i) = decomp%ysz(1) * decomp%y1dist(i) * decomp%ysz(3)
+         if (i == 0) then
+            decomp%x1disp(i) = 0  ! displacement is 0-based index
+            decomp%y1disp(i) = 0
+         else
+            decomp%x1disp(i) = decomp%x1disp(i - 1) + decomp%x1cnts(i - 1)
+            decomp%y1disp(i) = decomp%y1disp(i - 1) + decomp%y1cnts(i - 1)
+         end if
+      end do
+
+      do i = 0, dims(2) - 1
+         decomp%y2cnts(i) = decomp%ysz(1) * decomp%y2dist(i) * decomp%ysz(3)
+         decomp%z2cnts(i) = decomp%zsz(1) * decomp%zsz(2) * decomp%z2dist(i)
+         if (i == 0) then
+            decomp%y2disp(i) = 0  ! displacement is 0-based index
+            decomp%z2disp(i) = 0
+         else
+            decomp%y2disp(i) = decomp%y2disp(i - 1) + decomp%y2cnts(i - 1)
+            decomp%z2disp(i) = decomp%z2disp(i - 1) + decomp%z2cnts(i - 1)
+         end if
+      end do
+
+      ! MPI_ALLTOALL buffer information
+#ifdef EVEN
+      ! For evenly distributed data, following is an easier implementation.
+      ! But it should be covered by the more general formulation below.
+      !decomp%x1count = decomp%xsz(1)*decomp%xsz(2)*decomp%xsz(3)/dims(1)
+      !decomp%y1count = decomp%ysz(1)*decomp%ysz(2)*decomp%ysz(3)/dims(1)
+      !decomp%y2count = decomp%ysz(1)*decomp%ysz(2)*decomp%ysz(3)/dims(2)
+      !decomp%z2count = decomp%zsz(1)*decomp%zsz(2)*decomp%zsz(3)/dims(2)
+
+      ! For unevenly distributed data, pad smaller messages. Note the
+      ! last blocks along pencils always get assigned more mesh points
+      ! for X <=> Y transposes
+      decomp%x1count = decomp%x1dist(dims(1) - 1) * &
+                       decomp%y1dist(dims(1) - 1) * decomp%xsz(3)
+      decomp%y1count = decomp%x1count
+      ! for Y <=> Z transposes
+      decomp%y2count = decomp%y2dist(dims(2) - 1) * &
+                       decomp%z2dist(dims(2) - 1) * decomp%zsz(1)
+      decomp%z2count = decomp%y2count
 #endif
 
+   end subroutine prepare_buffer
 
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Transposition routines
+   ! Transposition routines
   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 #include "transpose_x_to_y.f90"
 #include "transpose_y_to_z.f90"
@@ -1645,173 +1155,12 @@ end subroutine transpose_test
 #include "transpose_y_to_x.f90"
 
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Halo cell support
+   ! Halo cell support
   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 #include "halo.f90"
 
-
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Error handling
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine decomp_2d_abort_basic(errorcode, msg)
-
-    use iso_fortran_env, only : error_unit
-
-    implicit none
-
-    integer, intent(IN) :: errorcode
-    character(len=*), intent(IN) :: msg
-
-    integer :: ierror
-
-    if (nrank==0) then
-       write(*,*) '2DECOMP&FFT ERROR - errorcode: ', errorcode
-       write(*,*) 'ERROR MESSAGE: ' // msg
-       write(error_unit,*) '2DECOMP&FFT ERROR - errorcode: ', errorcode
-       write(error_unit,*) 'ERROR MESSAGE: ' // msg
-    end if
-    call MPI_ABORT(decomp_2d_comm,errorcode,ierror)
-
-  end subroutine decomp_2d_abort_basic
-
-  subroutine decomp_2d_abort_file_line(file, line, errorcode, msg)
-
-    use iso_fortran_env, only : error_unit
-
-    implicit none
-
-    integer, intent(IN) :: errorcode, line
-    character(len=*), intent(IN) :: msg, file
-
-    integer :: ierror
-
-    if (nrank==0) then
-       write(*,*) '2DECOMP&FFT ERROR'
-       write(*,*) '  errorcode:     ', errorcode
-       write(*,*) '  error in file  ' // file
-       write(*,*) '           line  ', line
-       write(*,*) '  error message: ' // msg
-       write(error_unit,*) '2DECOMP&FFT ERROR'
-       write(error_unit,*) '  errorcode:     ', errorcode
-       write(error_unit,*) '  error in file  ' // file
-       write(error_unit,*) '           line  ', line
-       write(error_unit,*) '  error message: ' // msg
-    end if
-    call MPI_ABORT(decomp_2d_comm,errorcode,ierror)
-
-  end subroutine decomp_2d_abort_file_line
-
-#if defined(_GPU) && defined(_NCCL)
-  !
-  ! This is based on the file "nccl.h" in nvhpc 22.1
-  !
-  function _ncclresult_to_integer(errorcode)
-
-    implicit none
-
-    type(ncclresult), intent(IN) :: errorcode
-    integer :: _ncclresult_to_integer
-
-    if (errorcode == ncclSuccess) then
-        _ncclresult_to_integer = 0
-    elseif (errorcode == ncclUnhandledCudaError) then
-        _ncclresult_to_integer = 1
-    elseif (errorcode == ncclSystemError) then
-        _ncclresult_to_integer = 2
-    elseif (errorcode == ncclInternalError) then
-        _ncclresult_to_integer = 3
-    elseif (errorcode == ncclInvalidArgument) then
-        _ncclresult_to_integer = 4
-    elseif (errorcode == ncclInvalidUsage) then
-        _ncclresult_to_integer = 5
-    elseif (errorcode == ncclNumResults) then
-        _ncclresult_to_integer = 6
-    else
-      _ncclresult_to_integer = -1
-      call decomp_2d_warning(__FILE__, __LINE__, _ncclresult_to_integer, &
-                             "NCCL error handling needs some update")
-    end if
-
-  end function _ncclresult_to_integer
-
-  !
-  ! Small wrapper for basic NCCL errors
-  !
-  subroutine decomp_2d_abort_nccl_basic(errorcode, msg)
-
-    implicit none
-
-    type(ncclresult), intent(IN) :: errorcode
-    character(len=*), intent(IN) :: msg
-
-    call decomp_2d_abort(_ncclresult_to_integer(errorcode), &
-                         msg // " " // ncclGetErrorString(errorcode))
-
-  end subroutine decomp_2d_abort_nccl_basic
-
-  !
-  ! Small wrapper for NCCL errors
-  !
-  subroutine decomp_2d_abort_nccl_file_line(file, line, errorcode, msg)
-
-    implicit none
-
-    type(ncclresult), intent(IN) :: errorcode
-    integer, intent(in) :: line
-    character(len=*), intent(IN) :: msg, file
-
-    call decomp_2d_abort(file, &
-                         line, &
-                         _ncclresult_to_integer(errorcode), &
-                         msg // " " // ncclGetErrorString(errorcode))
-
-  end subroutine decomp_2d_abort_nccl_file_line
-#endif
-
-  subroutine decomp_2d_warning_basic(errorcode, msg)
-
-    use iso_fortran_env, only : error_unit
-
-    implicit none
-
-    integer, intent(IN) :: errorcode
-    character(len=*), intent(IN) :: msg
-
-    if (nrank==0) then
-       write(*,*) '2DECOMP&FFT WARNING - errorcode: ', errorcode
-       write(*,*) 'ERROR MESSAGE: ' // msg
-       write(error_unit,*) '2DECOMP&FFT WARNING - errorcode: ', errorcode
-       write(error_unit,*) 'ERROR MESSAGE: ' // msg
-    end if
-
-  end subroutine decomp_2d_warning_basic
-
-  subroutine decomp_2d_warning_file_line(file, line, errorcode, msg)
-
-    use iso_fortran_env, only : error_unit
-
-    implicit none
-
-    integer, intent(IN) :: errorcode, line
-    character(len=*), intent(IN) :: msg, file
-
-    if (nrank==0) then
-       write(*,*) '2DECOMP&FFT WARNING'
-       write(*,*) '  errorcode:     ', errorcode
-       write(*,*) '  error in file  ' // file
-       write(*,*) '           line  ', line
-       write(*,*) '  error message: ' // msg
-       write(error_unit,*) '2DECOMP&FFT WARNING'
-       write(error_unit,*) '  errorcode:     ', errorcode
-       write(error_unit,*) '  error in file  ' // file
-       write(error_unit,*) '           line  ', line
-       write(error_unit,*) '  error message: ' // msg
-    end if
-
-  end subroutine decomp_2d_warning_file_line
-
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Utility routines to help allocate 3D arrays
+   ! Utility routines to help allocate 3D arrays
   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 #include "alloc.f90"
 
diff --git a/src/decomp_2d_constants.f90 b/src/decomp_2d_constants.f90
new file mode 100644
index 00000000..6023ee62
--- /dev/null
+++ b/src/decomp_2d_constants.f90
@@ -0,0 +1,109 @@
+!! SPDX-License-Identifier: BSD-3-Clause
+
+! Constants for the 2decomp&fft library
+
+module decomp_2d_constants
+
+   use mpi
+   use, intrinsic :: iso_fortran_env, only: real32, real64
+#if defined(_GPU) && defined(_NCCL)
+   use nccl
+#endif
+
+   implicit none
+
+   !private
+
+#ifdef DOUBLE_PREC
+   integer, parameter, public :: mytype = KIND(0._real64)
+   integer, parameter, public :: real_type = MPI_DOUBLE_PRECISION
+   integer, parameter, public :: real2_type = MPI_2DOUBLE_PRECISION
+   integer, parameter, public :: complex_type = MPI_DOUBLE_COMPLEX
+#ifdef SAVE_SINGLE
+   integer, parameter, public :: mytype_single = KIND(0._real32)
+   integer, parameter, public :: real_type_single = MPI_REAL
+#else
+   integer, parameter, public :: mytype_single = KIND(0._real64)
+   integer, parameter, public :: real_type_single = MPI_DOUBLE_PRECISION
+#endif
+#else
+   integer, parameter, public :: mytype = KIND(0._real32)
+   integer, parameter, public :: real_type = MPI_REAL
+   integer, parameter, public :: real2_type = MPI_2REAL
+   integer, parameter, public :: complex_type = MPI_COMPLEX
+   integer, parameter, public :: mytype_single = KIND(0._real32)
+   integer, parameter, public :: real_type_single = MPI_REAL
+#endif
+
+   !
+   ! Output for the log can be changed by the external code before calling decomp_2d_init
+   !
+   !    0 => No log output
+   !    1 => Master rank log output to stdout
+   !    2 => Master rank log output to the file "decomp_2d_setup.log"
+   !    3 => All ranks log output to a dedicated file
+   !
+   ! The default value is 2 (3 for debug builds)
+   !
+   integer, parameter, public :: D2D_LOG_QUIET = 0
+   integer, parameter, public :: D2D_LOG_STDOUT = 1
+   integer, parameter, public :: D2D_LOG_TOFILE = 2
+   integer, parameter, public :: D2D_LOG_TOFILE_FULL = 3
+
+   !
+   ! Debug level can be changed by the external code before calling decomp_2d_init
+   !
+   ! The environment variable "DECOMP_2D_DEBUG" can be used to change the debug level
+   !
+   ! Debug checks are performed only when the preprocessor variable DEBUG is defined
+   !
+   enum, bind(c)
+      enumerator :: D2D_DEBUG_LEVEL_OFF = 0
+      enumerator :: D2D_DEBUG_LEVEL_CRITICAL = 1
+      enumerator :: D2D_DEBUG_LEVEL_ERROR = 2
+      enumerator :: D2D_DEBUG_LEVEL_WARN = 3
+      enumerator :: D2D_DEBUG_LEVEL_INFO = 4
+      enumerator :: D2D_DEBUG_LEVEL_DEBUG = 5
+      enumerator :: D2D_DEBUG_LEVEL_TRACE = 6
+   end enum
+
+   !
+   ! Profiler section
+   !
+   ! Integer to select the profiling tool
+   !    0 => no profiling, default
+   !    1 => Caliper (https://github.com/LLNL/Caliper)
+   !
+   integer, parameter, public :: DECOMP_PROFILER_NONE = 0
+   integer, parameter, public :: DECOMP_PROFILER_CALIPER = 1
+
+   !
+   ! Supported FFT backends
+   !
+   integer, parameter, public :: D2D_FFT_BACKEND_GENERIC = 0
+   integer, parameter, public :: D2D_FFT_BACKEND_FFTW3 = 1
+   integer, parameter, public :: D2D_FFT_BACKEND_FFTW3_F03 = 2
+   integer, parameter, public :: D2D_FFT_BACKEND_MKL = 3
+   integer, parameter, public :: D2D_FFT_BACKEND_CUFFT = 4
+
+   !
+   ! Complex-to-complex FFT can be forward or backward
+   !
+   integer, parameter, public :: DECOMP_2D_FFT_FORWARD = -1
+   integer, parameter, public :: DECOMP_2D_FFT_BACKWARD = 1
+
+   !
+   ! Input / output of the FFT are distributed along different pencils
+   !
+   integer, parameter, public :: PHYSICAL_IN_X = 1 ! Forward is input in X, output in Z
+   integer, parameter, public :: PHYSICAL_IN_Z = 3 ! Forward is input in Z, output in X
+
+   !
+   ! Major and minor version number
+   !
+   integer, parameter :: D2D_MAJOR = 2
+   integer, parameter :: D2D_MINOR = 0
+   logical, parameter :: D2D_RELEASE = .true.
+
+end module decomp_2d_constants
+
diff --git a/src/decomp_2d_cumpi.f90 b/src/decomp_2d_cumpi.f90
new file mode 100644
index 00000000..45917cca
--- /dev/null
+++ b/src/decomp_2d_cumpi.f90
@@ -0,0 +1,74 @@
+!! SPDX-License-Identifier: BSD-3-Clause
+
+! Module for the cuda aware MPI
+
+module decomp_2d_cumpi
+
+   use decomp_2d_constants
+   use decomp_2d_mpi
+
+   implicit none
+
+   private        ! Make everything private unless declared public
+
+   ! Device working arrays
+   real(mytype), allocatable, dimension(:), device, public:: work1_r_d, work2_r_d
+   complex(mytype), allocatable, dimension(:), device, public :: work1_c_d, work2_c_d
+
+   public :: decomp_2d_cumpi_init, &
+             decomp_2d_cumpi_fin
+
+contains
+   !
+   ! init of the arrays
+   !
+   subroutine decomp_2d_cumpi_init(buf_size)
+
+      implicit none
+
+      integer, intent(in) :: buf_size
+      integer :: status, errorcode
+
+      if (allocated(work1_r_d)) deallocate (work1_r_d)
+      if (allocated(work2_r_d)) deallocate (work2_r_d)
+      if (allocated(work1_c_d)) deallocate (work1_c_d)
+      if (allocated(work2_c_d)) deallocate (work2_c_d)
+      allocate (work1_r_d(buf_size), STAT=status)
+      if (status /= 0) then
+         errorcode = 2
+         call decomp_2d_abort(__FILE__, __LINE__, errorcode, &
+                              'Out of memory when allocating 2DECOMP workspace')
+      end if
+      allocate (work1_c_d(buf_size), STAT=status)
+      if (status /= 0) then
+         errorcode = 2
+         call decomp_2d_abort(__FILE__, __LINE__, errorcode, &
+                              'Out of memory when allocating 2DECOMP workspace')
+      end if
+      allocate (work2_r_d(buf_size), STAT=status)
+      if (status /= 0) then
+         errorcode = 2
+         call decomp_2d_abort(__FILE__, __LINE__, errorcode, &
+                              'Out of memory when allocating 2DECOMP workspace')
+      end if
+      allocate (work2_c_d(buf_size), STAT=status)
+      if (status /= 0) then
+         errorcode = 2
+         call decomp_2d_abort(__FILE__, __LINE__, errorcode, &
+                              'Out of memory when allocating 2DECOMP workspace')
+      end if
+
+   end subroutine decomp_2d_cumpi_init
+   !
+   ! init of the arrays
+   !
+   subroutine decomp_2d_cumpi_fin
+
+      implicit none
+
+      deallocate (work1_r_d, work2_r_d, work1_c_d, work2_c_d)
+
+   end subroutine decomp_2d_cumpi_fin
+
+end module decomp_2d_cumpi
+
diff --git a/src/decomp_2d_init_fin.f90 b/src/decomp_2d_init_fin.f90
index fda17fd0..71ace9e8 100644
--- a/src/decomp_2d_init_fin.f90
+++ b/src/decomp_2d_init_fin.f90
@@ -1,289 +1,209 @@
-!!!=======================================================================
-!!! This is part of the 2DECOMP&FFT library
-!!! 
-!!! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
-!!! decomposition. It also implements a highly scalable distributed
-!!! three-dimensional Fast Fourier Transform (FFT).
-!!!
-!!! Copyright (C) 2009-2012 Ning Li, the Numerical Algorithms Group (NAG)
-!!! Copyright (C) 2022-              the Xcompact3d developers
-!!!
-!!!=======================================================================
+!! SPDX-License-Identifier: BSD-3-Clause
 
   !======================================================================
   ! Routine to be called by applications to initialise this library
   !   INPUT:
   !     nx, ny, nz   - global data dimension
   !     p_row, p_col - 2D processor grid
+  !     periodic_bc  - optional, periodicity flag for halo operations
+  !     comm         - optional, MPI communicator, default MPI_COMM_WORLD
   !   OUTPUT:
   !     all internal data structures initialised properly
   !     library ready to use
   !======================================================================
-  subroutine decomp_2d_init_ref(nx,ny,nz,p_row,p_col,periodic_bc,comm)
+  subroutine decomp_2d_init_ref(nx, ny, nz, p_row, p_col, periodic_bc, comm)
 
-    use mpi
-    use iso_fortran_env, only : output_unit
+     use mpi
+     use iso_fortran_env, only: output_unit
 
-    implicit none
+     implicit none
 
-    integer, intent(IN) :: nx,ny,nz
-    integer, intent(INOUT) :: p_row,p_col
-    logical, dimension(3), intent(IN), optional :: periodic_bc
-    integer, intent(in), optional :: comm
+     integer, intent(IN) :: nx, ny, nz
+     integer, intent(INOUT) :: p_row, p_col
+     logical, dimension(3), intent(IN), optional :: periodic_bc
+     integer, intent(in), optional :: comm
 
-    integer :: errorcode, ierror, row, col, iounit
-    logical, dimension(2) :: periodic
-#if defined(_GPU) && defined(_NCCL)
-    integer :: cuda_stat
-    type(ncclResult) :: nccl_stat
-#endif
-#ifdef DEBUG
-    character(len=7) fname ! Sufficient for up to O(1M) ranks
-#endif
+     integer :: errorcode, ierror, row, col, iounit
+     logical, dimension(2) :: periodic
 
 #ifdef PROFILER
-    ! Prepare the profiler if it was not already prepared
-    if (decomp_profiler.eq.decomp_profiler_none) call decomp_profiler_prep()
-    ! Start the profiler
-    call decomp_profiler_init()
-    ! Start the timer for decomp_2d_init
-    if (decomp_profiler_d2d) call decomp_profiler_start("decomp_2d_init")
+     ! Prepare the profiler if it was not already prepared
+     if (decomp_profiler == decomp_profiler_none) call decomp_profiler_prep()
+     ! Start the profiler
+     call decomp_profiler_init()
+     ! Start the timer for decomp_2d_init
+     if (decomp_profiler_d2d) call decomp_profiler_start("decomp_2d_init")
 #endif
 
-    ! Use the provided MPI communicator if present
-    if (present(comm)) then
-       decomp_2d_comm = comm
-    else
-       decomp_2d_comm = MPI_COMM_WORLD
-    endif
-
-    ! If the external code has not set nrank and nproc
-    if (nrank == -1) then
-       call MPI_COMM_RANK(decomp_2d_comm, nrank, ierror)
-       if (ierror /= 0) call decomp_2d_abort(__FILE__, &
-                                             __LINE__, &
-                                             ierror, &
-                                             "MPI_COMM_RANK")
-    endif
-    if (nproc == -1) then
-       call MPI_COMM_SIZE(decomp_2d_comm, nproc, ierror)
-       if (ierror /= 0) call decomp_2d_abort(__FILE__, &
-                                             __LINE__, &
-                                             ierror, &
-                                             "MPI_COMM_SIZE")
-    endif
+     call decomp_2d_mpi_init(comm)
+
+     if (nx <= 0) call decomp_2d_abort(__FILE__, __LINE__, nx, "Invalid value for nx")
+     if (ny <= 0) call decomp_2d_abort(__FILE__, __LINE__, ny, "Invalid value for ny")
+     if (nz <= 0) call decomp_2d_abort(__FILE__, __LINE__, nz, "Invalid value for nz")
+
 #ifdef DEBUG
-    ! Check if a modification of the debug level is needed
-    call decomp_2d_debug()
+     ! Check if a modification of the debug level is needed
+     call decomp_2d_debug()
 #endif
 
-    nx_global = nx
-    ny_global = ny
-    nz_global = nz
-
-    if (present(periodic_bc)) then
-       periodic_x = periodic_bc(1)
-       periodic_y = periodic_bc(2)
-       periodic_z = periodic_bc(3)
-    else
-       periodic_x = .false.
-       periodic_y = .false.
-       periodic_z = .false.
-    end if
-
-    if (p_row==0 .and. p_col==0) then
-       ! determine the best 2D processor grid
-       call best_2d_grid(nproc, row, col)
-       p_row = row
-       p_col = col
-    else
-       if (nproc /= p_row*p_col) then
-          errorcode = 1
-          call decomp_2d_abort(__FILE__, __LINE__, errorcode, &
-               'Invalid 2D processor grid - nproc /= p_row*p_col')
-       else
-          row = p_row
-          col = p_col
-       end if
-    end if
-
-    ! Create 2D Catersian topology
-    ! Note that in order to support periodic B.C. in the halo-cell code,
-    ! need to create multiple topology objects: DECOMP_2D_COMM_CART_?,
-    ! corresponding to three pencil orientations. They contain almost
-    ! identical topological information but allow different combinations
-    ! of periodic conditions.
-    dims(1) = row
-    dims(2) = col
-    periodic(1) = periodic_y
-    periodic(2) = periodic_z
-    call MPI_CART_CREATE(decomp_2d_comm,2,dims,periodic, &
-         .false., &  ! do not reorder rank
-         DECOMP_2D_COMM_CART_X, ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_CART_CREATE")
-    periodic(1) = periodic_x
-    periodic(2) = periodic_z
-    call MPI_CART_CREATE(decomp_2d_comm,2,dims,periodic, &
-         .false., DECOMP_2D_COMM_CART_Y, ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_CART_CREATE")
-    periodic(1) = periodic_x
-    periodic(2) = periodic_y
-    call MPI_CART_CREATE(decomp_2d_comm,2,dims,periodic, &
-         .false., DECOMP_2D_COMM_CART_Z, ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_CART_CREATE")
-
-    call MPI_CART_COORDS(DECOMP_2D_COMM_CART_X,nrank,2,coord,ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_CART_COORDS")
-
-    ! derive communicators defining sub-groups for ALLTOALL(V)
-    call MPI_CART_SUB(DECOMP_2D_COMM_CART_X,(/.true.,.false./), &
-         DECOMP_2D_COMM_COL,ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_CART_SUB")
-    call MPI_CART_SUB(DECOMP_2D_COMM_CART_X,(/.false.,.true./), &
-         DECOMP_2D_COMM_ROW,ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_CART_SUB")
-
-    ! gather information for halo-cell support code
-    call init_neighbour
-
-    ! actually generate all 2D decomposition information
-    call decomp_info_init(nx,ny,nz,decomp_main)
-
-    ! make a copy of the decomposition information associated with the
-    ! default global size in these global variables so applications can
-    ! use them to create data structures 
-    xstart = decomp_main%xst
-    ystart = decomp_main%yst
-    zstart = decomp_main%zst
-    xend   = decomp_main%xen
-    yend   = decomp_main%yen
-    zend   = decomp_main%zen
-    xsize  = decomp_main%xsz
-    ysize  = decomp_main%ysz
-    zsize  = decomp_main%zsz
-
-    ! determine the number of bytes per float number
-    ! do not use 'mytype' which is compiler dependent
-    ! also possible to use inquire(iolength=...) 
-    call MPI_TYPE_SIZE(real_type,mytype_bytes,ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_SIZE")
+     nx_global = nx
+     ny_global = ny
+     nz_global = nz
+
+     if (present(periodic_bc)) then
+        periodic_x = periodic_bc(1)
+        periodic_y = periodic_bc(2)
+        periodic_z = periodic_bc(3)
+     else
+        periodic_x = .false.
+        periodic_y = .false.
+        periodic_z = .false.
+     end if
+
+     if (p_row <= 0 .or. p_col <= 0) then
+        ! determine the best 2D processor grid
+        call best_2d_grid(nproc, row, col)
+        p_row = row
+        p_col = col
+     else
+        if (nproc /= p_row * p_col) then
+           errorcode = 1
+           call decomp_2d_abort(__FILE__, __LINE__, errorcode, &
+                                'Invalid 2D processor grid - nproc /= p_row*p_col')
+        else
+           row = p_row
+           col = p_col
+        end if
+     end if
+
+     ! Create 2D Catersian topology
+     ! Note that in order to support periodic B.C. in the halo-cell code,
+     ! need to create multiple topology objects: DECOMP_2D_COMM_CART_?,
+     ! corresponding to three pencil orientations. They contain almost
+     ! identical topological information but allow different combinations
+     ! of periodic conditions.
+     dims(1) = row
+     dims(2) = col
+     periodic(1) = periodic_y
+     periodic(2) = periodic_z
+     call MPI_CART_CREATE(decomp_2d_comm, 2, dims, periodic, &
+                          .false., &  ! do not reorder rank
+                          DECOMP_2D_COMM_CART_X, ierror)
+     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_CART_CREATE")
+     periodic(1) = periodic_x
+     periodic(2) = periodic_z
+     call MPI_CART_CREATE(decomp_2d_comm, 2, dims, periodic, &
+                          .false., DECOMP_2D_COMM_CART_Y, ierror)
+     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_CART_CREATE")
+     periodic(1) = periodic_x
+     periodic(2) = periodic_y
+     call MPI_CART_CREATE(decomp_2d_comm, 2, dims, periodic, &
+                          .false., DECOMP_2D_COMM_CART_Z, ierror)
+     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_CART_CREATE")
+
+     call MPI_CART_COORDS(DECOMP_2D_COMM_CART_X, nrank, 2, coord, ierror)
+     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_CART_COORDS")
+
+     ! derive communicators defining sub-groups for ALLTOALL(V)
+     call MPI_CART_SUB(DECOMP_2D_COMM_CART_X, (/.true., .false./), &
+                       DECOMP_2D_COMM_COL, ierror)
+     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_CART_SUB")
+     call MPI_CART_SUB(DECOMP_2D_COMM_CART_X, (/.false., .true./), &
+                       DECOMP_2D_COMM_ROW, ierror)
+     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_CART_SUB")
+
+     ! gather information for halo-cell support code
+     call init_neighbour
+
+     ! actually generate all 2D decomposition information
+     call decomp_info_init(nx, ny, nz, decomp_main)
+
+     ! make a copy of the decomposition information associated with the
+     ! default global size in these global variables so applications can
+     ! use them to create data structures
+     xstart = decomp_main%xst
+     ystart = decomp_main%yst
+     zstart = decomp_main%zst
+     xend = decomp_main%xen
+     yend = decomp_main%yen
+     zend = decomp_main%zen
+     xsize = decomp_main%xsz
+     ysize = decomp_main%ysz
+     zsize = decomp_main%zsz
+
+     ! determine the number of bytes per float number
+     ! do not use 'mytype' which is compiler dependent
+     ! also possible to use inquire(iolength=...)
+     call MPI_TYPE_SIZE(real_type, mytype_bytes, ierror)
+     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_SIZE")
 
 #ifdef EVEN
-    if (nrank==0) write(*,*) 'Padded ALLTOALL optimisation on'
-#endif 
-
-#if defined(_GPU)
-#if defined(_NCCL)
-    call MPI_COMM_RANK(DECOMP_2D_COMM_COL,col_rank,ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_COMM_RANK")
-    call MPI_COMM_RANK(DECOMP_2D_COMM_ROW,row_rank,ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_COMM_RANK")
-    call MPI_COMM_SIZE(DECOMP_2D_COMM_COL,col_comm_size,ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_COMM_SIZE")
-    call MPI_COMM_SIZE(DECOMP_2D_COMM_ROW,row_comm_size,ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_COMM_SIZE")
-
-    allocate(local_to_global_col(col_comm_size), local_to_global_row(row_comm_size))
-    
-    local_to_global_col(:) = 0
-    local_to_global_row(:) = 0
-    local_to_global_col(col_rank+1) = nrank
-    local_to_global_row(row_rank+1) = nrank
-    
-    call mpi_allreduce(MPI_IN_PLACE,local_to_global_col,col_comm_size,MPI_INTEGER,MPI_SUM,DECOMP_2D_COMM_COL,ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLREDUCE")
-    call mpi_allreduce(MPI_IN_PLACE,local_to_global_row,row_comm_size,MPI_INTEGER,MPI_SUM,DECOMP_2D_COMM_ROW,ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLREDUCE")
-
-    if (nrank .eq. 0) then
-       nccl_stat = ncclGetUniqueId(nccl_uid_2decomp)
-    end if
-    call MPI_Bcast(nccl_uid_2decomp, int(sizeof(ncclUniqueId)), MPI_BYTE, 0, decomp_2d_comm, ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_BCAST")
-
-    nccl_stat = ncclCommInitRank(nccl_comm_2decomp, nproc, nccl_uid_2decomp, nrank)
-    cuda_stat = cudaStreamCreate(cuda_stream_2decomp)
-#endif
+     if (nrank == 0) write (*, *) 'Padded ALLTOALL optimisation on'
 #endif
 
-    !
-    ! Select the IO unit for decomp_2d setup
-    !
-#ifdef DEBUG
-    write(fname, "(I0)") nrank ! Adapt to magnitude of nrank
-    open(newunit=iounit, file='decomp_2d_setup_'//trim(fname)//'.log', iostat=ierror)
-#else
-    if (nrank == 0) then
-       open(newunit=iounit, file="decomp_2d_setup.log", iostat=ierror)
-    else
-       iounit = output_unit
-       ierror = 0
-    endif
+#if defined(_GPU) && defined(_NCCL)
+     call decomp_2d_nccl_init(DECOMP_2D_COMM_COL, DECOMP_2D_COMM_ROW)
 #endif
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "Could not open log file")
-    !
-    ! Print the decomp_2d setup
-    !
-    call d2d_listing(iounit)
-    !
-    ! Close the IO unit if it was not stdout
-    !
-    if (iounit /= output_unit) then
-       close(iounit, iostat=ierror)
-       if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "Could not close log file")
-    endif
+
+     !
+     ! Get the IO unit for decomp_2d setup
+     !
+     iounit = d2d_listing_get_unit()
+     !
+     ! Print the decomp_2d setup
+     !
+     call d2d_listing(iounit)
 
 #ifdef PROFILER
-    ! Stop the timer for decomp_2d_init
-    if (decomp_profiler_d2d) call decomp_profiler_end("decomp_2d_init")
+     ! Stop the timer for decomp_2d_init
+     if (decomp_profiler_d2d) call decomp_profiler_end("decomp_2d_init")
 #endif
 
-    return
+     return
   end subroutine decomp_2d_init_ref
 
   !======================================================================
   ! Routine to be called by applications to clean things up
   !======================================================================
   subroutine decomp_2d_finalize_ref
-    
-    implicit none
-#if defined(_GPU) && defined(_NCCL)
-    type(ncclResult) :: nccl_stat
-#endif
-    
- 
-#ifdef PROFILER
-    if (decomp_profiler_d2d) call decomp_profiler_start("decomp_2d_fin")
-#endif
 
-    call decomp_mpi_comm_free(DECOMP_2D_COMM_ROW)
-    call decomp_mpi_comm_free(DECOMP_2D_COMM_COL)
-    call decomp_mpi_comm_free(DECOMP_2D_COMM_CART_X)
-    call decomp_mpi_comm_free(DECOMP_2D_COMM_CART_Y)
-    call decomp_mpi_comm_free(DECOMP_2D_COMM_CART_Z)
+     implicit none
 
-    call decomp_info_finalize(decomp_main)
+#ifdef PROFILER
+     if (decomp_profiler_d2d) call decomp_profiler_start("decomp_2d_fin")
+#endif
 
-    decomp_buf_size = 0
-    deallocate(work1_r, work2_r, work1_c, work2_c)
+     call decomp_2d_mpi_comm_free(DECOMP_2D_COMM_ROW)
+     call decomp_2d_mpi_comm_free(DECOMP_2D_COMM_COL)
+     call decomp_2d_mpi_comm_free(DECOMP_2D_COMM_CART_X)
+     call decomp_2d_mpi_comm_free(DECOMP_2D_COMM_CART_Y)
+     call decomp_2d_mpi_comm_free(DECOMP_2D_COMM_CART_Z)
+
+     call decomp_info_finalize(decomp_main)
+
+     decomp_buf_size = 0
+     if (associated(work1_r)) nullify (work1_r)
+     if (associated(work2_r)) nullify (work2_r)
+     if (associated(work1_c)) nullify (work1_c)
+     if (associated(work2_c)) nullify (work2_c)
+     if (allocated(work1)) deallocate (work1)
+     if (allocated(work2)) deallocate (work2)
 #if defined(_GPU)
-    deallocate(work1_r_d, work2_r_d, work1_c_d, work2_c_d)
-
+     call decomp_2d_cumpi_fin()
 #if defined(_NCCL)
-    nccl_stat = ncclCommDestroy(nccl_comm_2decomp)
+     call decomp_2d_nccl_fin()
 #endif
 #endif
 
-    nrank = -1
-    nproc = -1
+     call decomp_2d_mpi_fin()
 
 #ifdef PROFILER
-    if (decomp_profiler_d2d) call decomp_profiler_end("decomp_2d_fin")
-    ! Finalize the profiler
-    call decomp_profiler_fin()
+     if (decomp_profiler_d2d) call decomp_profiler_end("decomp_2d_fin")
+     ! Finalize the profiler
+     call decomp_profiler_fin()
 #endif
 
-    return
+     return
   end subroutine decomp_2d_finalize_ref
 
 #ifdef DEBUG
@@ -294,33 +214,33 @@ end subroutine decomp_2d_finalize_ref
   !
   subroutine decomp_2d_debug
 
-    implicit none
+     implicit none
 
-    integer :: ierror
-    character(len=4) :: val
-    character(len=*), parameter :: varname = "DECOMP_2D_DEBUG"
+     integer :: ierror
+     character(len=4) :: val
+     character(len=*), parameter :: varname = "DECOMP_2D_DEBUG"
 
-    ! Read the variable
-    call get_environment_variable(varname, value=val, status=ierror)
+     ! Read the variable
+     call get_environment_variable(varname, value=val, status=ierror)
 
-    ! Return if no variable, or no support for env. variable
-    if (ierror >= 1) return
+     ! Return if no variable, or no support for env. variable
+     if (ierror >= 1) return
 
-    ! Minor error, print warning and return
-    if (ierror /= 0) then
-       call decomp_2d_warning(__FILE__, &
-                              __LINE__, &
-                              ierror, &
-                              "Error when reading DECOMP_2D_DEBUG : "//val)
-       return
-    endif
+     ! Minor error, print warning and return
+     if (ierror /= 0) then
+        call decomp_2d_warning(__FILE__, &
+                               __LINE__, &
+                               ierror, &
+                               "Error when reading DECOMP_2D_DEBUG : "//val)
+        return
+     end if
 
-    ! Conversion to integer if possible
-    read(val, '(i4)', iostat=ierror) decomp_debug
-    if (ierror /= 0) call decomp_2d_warning(__FILE__, &
-                                            __LINE__, &
-                                            ierror, &
-                                            "Error when reading DECOMP_2D_DEBUG : "//val)
+     ! Conversion to integer if possible
+     read (val, '(i4)', iostat=ierror) decomp_debug
+     if (ierror /= 0) then
+        call decomp_2d_warning(__FILE__, __LINE__, ierror, &
+                               "Error when reading DECOMP_2D_DEBUG : "//val)
+     end if
 
   end subroutine decomp_2d_debug
 #endif
@@ -330,34 +250,34 @@ end subroutine decomp_2d_debug
   !---------------------------------------------------------------------
   subroutine best_2d_grid(iproc, best_p_row, best_p_col)
 
-    implicit none
+     implicit none
+
+     integer, intent(IN) :: iproc
+     integer, intent(OUT) :: best_p_row, best_p_col
 
-    integer, intent(IN) :: iproc
-    integer, intent(OUT) :: best_p_row, best_p_col
+     integer, allocatable, dimension(:) :: factors
+     integer :: nfact, i, col, i_best
 
-    integer, allocatable, dimension(:) :: factors
-    integer :: nfact, i, col, i_best
+     if (nrank == 0) write (*, *) 'In auto-tuning mode......'
 
-    if (nrank==0) write(*,*) 'In auto-tuning mode......'
+     i = int(sqrt(real(iproc))) + 10  ! enough space to save all factors
+     allocate (factors(i))
+     call findfactor(iproc, factors, nfact)
+     if (nrank == 0) write (*, *) 'factors: ', (factors(i), i=1, nfact)
 
-    i = int(sqrt(real(iproc))) + 10  ! enough space to save all factors 
-    allocate(factors(i))
-    call findfactor(iproc, factors, nfact)
-    if (nrank==0) write(*,*) 'factors: ', (factors(i), i=1,nfact)
+     i_best = nfact / 2 + 1
+     col = factors(i_best)
 
-    i_best=nfact/2+1
-    col=factors(i_best)
+     best_p_col = col
+     best_p_row = iproc / col
+     if (nrank == 0) print *, 'p_row x p_col', best_p_row, best_p_col
+     if ((best_p_col == 1) .and. (nrank == 0)) then
+        print *, 'WARNING: current 2D DECOMP set-up might not work'
+     end if
 
-    best_p_col = col
-    best_p_row=iproc/col
-    if (nrank==0) print *,'p_row x p_col', best_p_row, best_p_col
-    if ((best_p_col==1).and.(nrank==0)) then
-       print *,'WARNING: current 2D DECOMP set-up might not work'
-    endif
-    
-    deallocate(factors)
+     deallocate (factors)
 
-    return
+     return
   end subroutine best_2d_grid
 
   !---------------------------------------------------------------------
@@ -367,36 +287,36 @@ end subroutine best_2d_grid
   !---------------------------------------------------------------------
   subroutine init_neighbour
 
-    integer :: ierror
-
-    ! For X-pencil
-    neighbour(1,1) = MPI_PROC_NULL               ! east
-    neighbour(1,2) = MPI_PROC_NULL               ! west
-    call MPI_CART_SHIFT(DECOMP_2D_COMM_CART_X, 0, 1, &
-         neighbour(1,4), neighbour(1,3), ierror) ! north & south
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_CART_SHIFT")
-    call MPI_CART_SHIFT(DECOMP_2D_COMM_CART_X, 1, 1, &
-         neighbour(1,6), neighbour(1,5), ierror) ! top & bottom
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_CART_SHIFT")
-
-    ! For Y-pencil
-    call MPI_CART_SHIFT(DECOMP_2D_COMM_CART_Y, 0, 1, &
-         neighbour(2,2), neighbour(2,1), ierror) ! east & west
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_CART_SHIFT")
-    neighbour(2,3) = MPI_PROC_NULL               ! north
-    neighbour(2,4) = MPI_PROC_NULL               ! south
-    call MPI_CART_SHIFT(DECOMP_2D_COMM_CART_Y, 1, 1, &
-         neighbour(2,6), neighbour(2,5), ierror) ! top & bottom
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_CART_SHIFT")
-
-    ! For Z-pencil
-    call MPI_CART_SHIFT(DECOMP_2D_COMM_CART_Z, 0, 1, &
-         neighbour(3,2), neighbour(3,1), ierror) ! east & west
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_CART_SHIFT")
-    call MPI_CART_SHIFT(DECOMP_2D_COMM_CART_Z, 1, 1, &
-         neighbour(3,4), neighbour(3,3), ierror) ! north & south
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_CART_SHIFT")
-    neighbour(3,5) = MPI_PROC_NULL               ! top
-    neighbour(3,6) = MPI_PROC_NULL               ! bottom
-    return
+     integer :: ierror
+
+     ! For X-pencil
+     neighbour(1, 1) = MPI_PROC_NULL               ! east
+     neighbour(1, 2) = MPI_PROC_NULL               ! west
+     call MPI_CART_SHIFT(DECOMP_2D_COMM_CART_X, 0, 1, &
+                         neighbour(1, 4), neighbour(1, 3), ierror) ! north & south
+     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_CART_SHIFT")
+     call MPI_CART_SHIFT(DECOMP_2D_COMM_CART_X, 1, 1, &
+                         neighbour(1, 6), neighbour(1, 5), ierror) ! top & bottom
+     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_CART_SHIFT")
+
+     ! For Y-pencil
+     call MPI_CART_SHIFT(DECOMP_2D_COMM_CART_Y, 0, 1, &
+                         neighbour(2, 2), neighbour(2, 1), ierror) ! east & west
+     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_CART_SHIFT")
+     neighbour(2, 3) = MPI_PROC_NULL               ! north
+     neighbour(2, 4) = MPI_PROC_NULL               ! south
+     call MPI_CART_SHIFT(DECOMP_2D_COMM_CART_Y, 1, 1, &
+                         neighbour(2, 6), neighbour(2, 5), ierror) ! top & bottom
+     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_CART_SHIFT")
+
+     ! For Z-pencil
+     call MPI_CART_SHIFT(DECOMP_2D_COMM_CART_Z, 0, 1, &
+                         neighbour(3, 2), neighbour(3, 1), ierror) ! east & west
+     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_CART_SHIFT")
+     call MPI_CART_SHIFT(DECOMP_2D_COMM_CART_Z, 1, 1, &
+                         neighbour(3, 4), neighbour(3, 3), ierror) ! north & south
+     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_CART_SHIFT")
+     neighbour(3, 5) = MPI_PROC_NULL               ! top
+     neighbour(3, 6) = MPI_PROC_NULL               ! bottom
+     return
   end subroutine init_neighbour
diff --git a/src/decomp_2d_mpi.f90 b/src/decomp_2d_mpi.f90
new file mode 100644
index 00000000..df4531c9
--- /dev/null
+++ b/src/decomp_2d_mpi.f90
@@ -0,0 +1,268 @@
+!! SPDX-License-Identifier: BSD-3-Clause
+
+! MPI module for 2decomp&fft library
+
+module decomp_2d_mpi
+
+   use MPI
+   use decomp_2d_constants
+#if defined(_GPU) && defined(_NCCL)
+   use nccl
+#endif
+
+   implicit none
+
+   integer, save, public :: mytype_bytes
+   integer, save, public :: nrank = -1 ! local MPI rank
+   integer, save, public :: nproc = -1 ! total number of processors
+   integer, save, public :: decomp_2d_comm = MPI_COMM_NULL ! MPI communicator
+
+   public :: decomp_2d_mpi_init, &
+             decomp_2d_mpi_fin, &
+             decomp_2d_mpi_comm_free, &
+             decomp_2d_abort, &
+             decomp_2d_warning
+
+   interface decomp_2d_abort
+      module procedure decomp_2d_abort_basic
+      module procedure decomp_2d_abort_file_line
+#if defined(_GPU) && defined(_NCCL)
+      module procedure decomp_2d_abort_nccl_basic
+      module procedure decomp_2d_abort_nccl_file_line
+#endif
+   end interface decomp_2d_abort
+
+   interface decomp_2d_warning
+      module procedure decomp_2d_warning_basic
+      module procedure decomp_2d_warning_file_line
+   end interface decomp_2d_warning
+
+contains
+
+   !
+   ! Initialize the MPI module
+   !
+   subroutine decomp_2d_mpi_init(comm)
+
+      implicit none
+
+      integer, intent(in), optional :: comm
+
+      integer :: ierror
+
+      ! Use the provided MPI communicator if present
+      if (present(comm)) then
+         decomp_2d_comm = comm
+      else
+         decomp_2d_comm = MPI_COMM_WORLD
+      end if
+
+      ! If the external code has not set nrank and nproc
+      if (nrank == -1) then
+         call MPI_COMM_RANK(decomp_2d_comm, nrank, ierror)
+         if (ierror /= 0) then
+            call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_COMM_RANK")
+         end if
+      end if
+      if (nproc == -1) then
+         call MPI_COMM_SIZE(decomp_2d_comm, nproc, ierror)
+         if (ierror /= 0) then
+            call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_COMM_SIZE")
+         end if
+      end if
+
+      if (MPI_SUCCESS /= 0) then
+         call decomp_2d_abort(__FILE__, __LINE__, MPI_SUCCESS, "MPI error check is broken")
+      end if
+
+   end subroutine decomp_2d_mpi_init
+
+   !
+   ! Finalize the MPI module
+   !
+   subroutine decomp_2d_mpi_fin
+
+      implicit none
+
+      nrank = -1
+      nproc = -1
+
+   end subroutine decomp_2d_mpi_fin
+
+   !
+   ! Small wrapper to free a MPI communicator
+   !
+   subroutine decomp_2d_mpi_comm_free(mpi_comm)
+
+      implicit none
+
+      integer, intent(inout) :: mpi_comm
+      integer :: ierror
+
+      ! Return if no MPI comm to free
+      if (mpi_comm == MPI_COMM_NULL) return
+
+      ! Free the provided MPI communicator
+      call MPI_COMM_FREE(mpi_comm, ierror)
+      if (ierror /= 0) call decomp_2d_warning(__FILE__, __LINE__, ierror, "MPI_COMM_FREE")
+      mpi_comm = MPI_COMM_NULL
+
+   end subroutine decomp_2d_mpi_comm_free
+
+   subroutine decomp_2d_abort_basic(errorcode, msg)
+
+      use iso_fortran_env, only: error_unit
+
+      implicit none
+
+      integer, intent(IN) :: errorcode
+      character(len=*), intent(IN) :: msg
+
+      integer :: ierror
+
+      if (nrank == 0) then
+         write (*, *) '2DECOMP&FFT ERROR - errorcode: ', errorcode
+         write (*, *) 'ERROR MESSAGE: '//msg
+         write (error_unit, *) '2DECOMP&FFT ERROR - errorcode: ', errorcode
+         write (error_unit, *) 'ERROR MESSAGE: '//msg
+      end if
+      call MPI_ABORT(decomp_2d_comm, errorcode, ierror)
+
+   end subroutine decomp_2d_abort_basic
+
+   subroutine decomp_2d_abort_file_line(file, line, errorcode, msg)
+
+      use iso_fortran_env, only: error_unit
+
+      implicit none
+
+      integer, intent(IN) :: errorcode, line
+      character(len=*), intent(IN) :: msg, file
+
+      integer :: ierror
+
+      if (nrank == 0) then
+         write (*, *) '2DECOMP&FFT ERROR'
+         write (*, *) '  errorcode:     ', errorcode
+         write (*, *) '  error in file  '//file
+         write (*, *) '           line  ', line
+         write (*, *) '  error message: '//msg
+         write (error_unit, *) '2DECOMP&FFT ERROR'
+         write (error_unit, *) '  errorcode:     ', errorcode
+         write (error_unit, *) '  error in file  '//file
+         write (error_unit, *) '           line  ', line
+         write (error_unit, *) '  error message: '//msg
+      end if
+      call MPI_ABORT(decomp_2d_comm, errorcode, ierror)
+
+   end subroutine decomp_2d_abort_file_line
+
+#if defined(_GPU) && defined(_NCCL)
+   !
+   ! This is based on the file "nccl.h" in nvhpc 22.1
+   !
+   function _ncclresult_to_integer(errorcode)
+
+      implicit none
+
+      type(ncclresult), intent(IN) :: errorcode
+      integer :: _ncclresult_to_integer
+
+      if (errorcode == ncclSuccess) then
+         _ncclresult_to_integer = 0
+      elseif (errorcode == ncclUnhandledCudaError) then
+         _ncclresult_to_integer = 1
+      elseif (errorcode == ncclSystemError) then
+         _ncclresult_to_integer = 2
+      elseif (errorcode == ncclInternalError) then
+         _ncclresult_to_integer = 3
+      elseif (errorcode == ncclInvalidArgument) then
+         _ncclresult_to_integer = 4
+      elseif (errorcode == ncclInvalidUsage) then
+         _ncclresult_to_integer = 5
+      elseif (errorcode == ncclNumResults) then
+         _ncclresult_to_integer = 6
+      else
+         _ncclresult_to_integer = -1
+         call decomp_2d_warning(__FILE__, __LINE__, _ncclresult_to_integer, &
+                                "NCCL error handling needs some update")
+      end if
+
+   end function _ncclresult_to_integer
+   !
+   ! Small wrapper for basic NCCL errors
+   !
+   subroutine decomp_2d_abort_nccl_basic(errorcode, msg)
+
+      implicit none
+
+      type(ncclresult), intent(IN) :: errorcode
+      character(len=*), intent(IN) :: msg
+
+      call decomp_2d_abort(_ncclresult_to_integer(errorcode), &
+                           msg//" "//ncclGetErrorString(errorcode))
+
+   end subroutine decomp_2d_abort_nccl_basic
+
+   !
+   ! Small wrapper for NCCL errors
+   !
+   subroutine decomp_2d_abort_nccl_file_line(file, line, errorcode, msg)
+
+      implicit none
+
+      type(ncclresult), intent(IN) :: errorcode
+      integer, intent(in) :: line
+      character(len=*), intent(IN) :: msg, file
+
+      call decomp_2d_abort(file, &
+                           line, &
+                           _ncclresult_to_integer(errorcode), &
+                           msg//" "//ncclGetErrorString(errorcode))
+
+   end subroutine decomp_2d_abort_nccl_file_line
+#endif
+
+   subroutine decomp_2d_warning_basic(errorcode, msg)
+
+      use iso_fortran_env, only: error_unit
+
+      implicit none
+
+      integer, intent(IN) :: errorcode
+      character(len=*), intent(IN) :: msg
+
+      if (nrank == 0) then
+         write (*, *) '2DECOMP&FFT WARNING - errorcode: ', errorcode
+         write (*, *) 'ERROR MESSAGE: '//msg
+         write (error_unit, *) '2DECOMP&FFT WARNING - errorcode: ', errorcode
+         write (error_unit, *) 'ERROR MESSAGE: '//msg
+      end if
+
+   end subroutine decomp_2d_warning_basic
+
+   subroutine decomp_2d_warning_file_line(file, line, errorcode, msg)
+
+      use iso_fortran_env, only: error_unit
+
+      implicit none
+
+      integer, intent(IN) :: errorcode, line
+      character(len=*), intent(IN) :: msg, file
+
+      if (nrank == 0) then
+         write (*, *) '2DECOMP&FFT WARNING'
+         write (*, *) '  errorcode:     ', errorcode
+         write (*, *) '  error in file  '//file
+         write (*, *) '           line  ', line
+         write (*, *) '  error message: '//msg
+         write (error_unit, *) '2DECOMP&FFT WARNING'
+         write (error_unit, *) '  errorcode:     ', errorcode
+         write (error_unit, *) '  error in file  '//file
+         write (error_unit, *) '           line  ', line
+         write (error_unit, *) '  error message: '//msg
+      end if
+
+   end subroutine decomp_2d_warning_file_line
+
+end module decomp_2d_mpi
diff --git a/src/decomp_2d_nccl.f90 b/src/decomp_2d_nccl.f90
new file mode 100644
index 00000000..620e83ef
--- /dev/null
+++ b/src/decomp_2d_nccl.f90
@@ -0,0 +1,307 @@
+!! SPDX-License-Identifier: BSD-3-Clause
+
+! Module for the cuda aware MPI
+
+module decomp_2d_nccl
+
+   use mpi
+   use decomp_2d_constants
+   use decomp_2d_mpi
+   use decomp_2d_cumpi
+   use cudafor 
+   use nccl
+
+   implicit none
+
+   private        ! Make everything private unless declared public
+
+#ifdef DOUBLE_PREC
+   type(ncclDataType), parameter, public :: ncclType = ncclDouble
+#else
+   type(ncclDataType), parameter, public :: ncclType = ncclFloat
+#endif
+
+   integer, save, public :: row_rank, col_rank
+
+   integer, save, public :: col_comm_size, row_comm_size
+   integer, allocatable, dimension(:), save, public :: local_to_global_col, local_to_global_row
+   type(ncclUniqueId), save, public :: nccl_uid_2decomp
+   type(ncclComm), save, public :: nccl_comm_2decomp
+   integer(kind=cuda_stream_kind), save, public :: cuda_stream_2decomp
+
+   public :: decomp_2d_nccl_init, &
+             decomp_2d_nccl_fin, &
+             decomp_2d_nccl_send_recv_col, &
+             decomp_2d_nccl_send_recv_row
+
+   interface decomp_2d_nccl_send_recv_col
+      module procedure decomp_2d_nccl_send_recv_real_col
+      module procedure decomp_2d_nccl_send_recv_cmplx_col
+   end interface decomp_2d_nccl_send_recv_col
+
+   interface decomp_2d_nccl_send_recv_row
+      module procedure decomp_2d_nccl_send_recv_real_row
+      module procedure decomp_2d_nccl_send_recv_cmplx_row
+   end interface decomp_2d_nccl_send_recv_row
+
+contains
+   !
+   ! init of the arrays
+   !
+   subroutine decomp_2d_nccl_init(COMM_COL, COMM_ROW)
+
+      implicit none
+
+      integer, intent(in) :: COMM_COL, COMM_ROW
+      integer             :: ierror
+      integer :: cuda_stat
+      type(ncclResult) :: nccl_stat
+
+      call MPI_COMM_RANK(COMM_COL, col_rank, ierror)
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_COMM_RANK")
+      call MPI_COMM_RANK(COMM_ROW, row_rank, ierror)
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_COMM_RANK")
+      call MPI_COMM_SIZE(COMM_COL, col_comm_size, ierror)
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_COMM_SIZE")
+      call MPI_COMM_SIZE(COMM_ROW, row_comm_size, ierror)
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_COMM_SIZE")
+
+      allocate (local_to_global_col(col_comm_size), local_to_global_row(row_comm_size))
+
+      local_to_global_col(:) = 0
+      local_to_global_row(:) = 0
+      local_to_global_col(col_rank + 1) = nrank
+      local_to_global_row(row_rank + 1) = nrank
+
+      call mpi_allreduce(MPI_IN_PLACE, local_to_global_col, col_comm_size, MPI_INTEGER, MPI_SUM, COMM_COL, ierror)
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLREDUCE")
+      call mpi_allreduce(MPI_IN_PLACE, local_to_global_row, row_comm_size, MPI_INTEGER, MPI_SUM, COMM_ROW, ierror)
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLREDUCE")
+
+      if (nrank == 0) then
+         nccl_stat = ncclGetUniqueId(nccl_uid_2decomp)
+         if (nccl_stat /= ncclSuccess) call decomp_2d_abort(__FILE__, __LINE__, nccl_stat, "ncclGetUniqueId")
+      end if
+      call MPI_Bcast(nccl_uid_2decomp, int(sizeof(ncclUniqueId)), MPI_BYTE, 0, decomp_2d_comm, ierror)
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_BCAST")
+
+      nccl_stat = ncclCommInitRank(nccl_comm_2decomp, nproc, nccl_uid_2decomp, nrank)
+      if (nccl_stat /= ncclSuccess) call decomp_2d_abort(__FILE__, __LINE__, nccl_stat, "ncclCommInitRank")
+      cuda_stat = cudaStreamCreate(cuda_stream_2decomp)
+      if (cuda_stat /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "cudaStreamCreate")
+
+   end subroutine decomp_2d_nccl_init
+
+   !
+   ! Finalize the module (release nccl communicator)
+   !
+   subroutine decomp_2d_nccl_fin()
+
+      implicit none
+
+      integer :: cuda_stat
+      type(ncclResult) :: nccl_stat
+
+      nccl_stat = ncclCommDestroy(nccl_comm_2decomp)
+      if (nccl_stat /= ncclSuccess) call decomp_2d_abort(__FILE__, __LINE__, nccl_stat, "ncclCommDestroy")
+      cuda_stat = cudaStreamDestroy(cuda_stream_2decomp)
+      if (cuda_stat /= 0) call decomp_2d_abort(__FILE__, __LINE__, cuda_stat, "cudaStreamDestroy")
+
+   end subroutine decomp_2d_nccl_fin
+   !
+   ! Send-Recv Real Col
+   !
+   subroutine decomp_2d_nccl_send_recv_real_col(dst_d, &
+                                                src_d, &
+                                                disp_s, &
+                                                cnts_s, &
+                                                disp_r, &
+                                                cnts_r, &
+                                                dime)
+
+      implicit none
+
+      integer, intent(in) :: dime
+      real(mytype), dimension(:), intent(in), device :: src_d
+      real(mytype), dimension(:), intent(out), device :: dst_d
+      integer, dimension(0:dime - 1), intent(in) :: disp_s, cnts_s
+      integer, dimension(0:dime - 1), intent(in) :: disp_r, cnts_r
+
+      integer :: col_rank_id, cuda_stat
+      type(ncclResult) :: nccl_stat
+
+      nccl_stat = ncclGroupStart()
+      if (nccl_stat /= ncclSuccess) call decomp_2d_abort(__FILE__, __LINE__, nccl_stat, "ncclGroupStart")
+      do col_rank_id = 0, (col_comm_size - 1)
+         nccl_stat = ncclSend(src_d(disp_s(col_rank_id) + 1), cnts_s(col_rank_id), &
+                              ncclType, local_to_global_col(col_rank_id + 1), nccl_comm_2decomp, cuda_stream_2decomp)
+         if (nccl_stat /= ncclSuccess) call decomp_2d_abort(__FILE__, __LINE__, nccl_stat, "ncclSend")
+         nccl_stat = ncclRecv(dst_d(disp_r(col_rank_id) + 1), cnts_r(col_rank_id), &
+                              ncclType, local_to_global_col(col_rank_id + 1), nccl_comm_2decomp, cuda_stream_2decomp)
+         if (nccl_stat /= ncclSuccess) call decomp_2d_abort(__FILE__, __LINE__, nccl_stat, "ncclRecv")
+      end do
+      nccl_stat = ncclGroupEnd()
+      if (nccl_stat /= ncclSuccess) call decomp_2d_abort(__FILE__, __LINE__, nccl_stat, "ncclGroupEnd")
+      cuda_stat = cudaStreamSynchronize(cuda_stream_2decomp)
+      if (cuda_stat /= 0) call decomp_2d_abort(__FILE__, __LINE__, cuda_stat, "cudaStreamSynchronize")
+
+   end subroutine decomp_2d_nccl_send_recv_real_col
+   !
+   ! Send-Recv complex
+   !
+   subroutine decomp_2d_nccl_send_recv_cmplx_col(dst_d, &
+                                                 src_d, &
+                                                 disp_s, &
+                                                 cnts_s, &
+                                                 disp_r, &
+                                                 cnts_r, &
+                                                 dime, &
+                                                 buf_size)
+
+      implicit none
+
+      integer, intent(in) :: dime, buf_size
+      complex(mytype), dimension(buf_size), intent(in), device :: src_d
+      complex(mytype), dimension(buf_size), intent(out), device :: dst_d
+      integer, dimension(0:dime - 1), intent(in) :: disp_s, cnts_s
+      integer, dimension(0:dime - 1), intent(in) :: disp_r, cnts_r
+
+      integer :: ii
+
+      ! Send-Recv Real part
+      !$acc kernels default(present)
+      do ii = 1, buf_size
+         work1_r_d(ii) = real(src_d(ii), mytype)
+      end do
+      !$acc end kernels
+      call decomp_2d_nccl_send_recv_col(work2_r_d, &
+                                        work1_r_d, &
+                                        disp_s, &
+                                        cnts_s, &
+                                        disp_r, &
+                                        cnts_r, &
+                                        dime)
+      !$acc kernels default(present)
+      do ii = 1, buf_size
+         dst_d(ii) = cmplx(work2_r_d(ii), 0._mytype, mytype)
+      end do
+      !$acc end kernels
+      ! Send-Recv Immaginary Part
+      !$acc kernels default(present)
+      do ii = 1, buf_size
+         work1_r_d(ii) = aimag(src_d(ii))
+      end do
+      !$acc end kernels
+      call decomp_2d_nccl_send_recv_col(work2_r_d, &
+                                        work1_r_d, &
+                                        disp_s, &
+                                        cnts_s, &
+                                        disp_r, &
+                                        cnts_r, &
+                                        dime)
+      !$acc kernels default(present)
+      do ii = 1, buf_size
+         dst_d(ii) = cmplx(dst_d(ii), work2_r_d(ii), mytype)
+      end do
+      !$acc end kernels
+   end subroutine decomp_2d_nccl_send_recv_cmplx_col
+   !
+   ! Send-Recv Real Row
+   !
+   subroutine decomp_2d_nccl_send_recv_real_row(dst_d, &
+                                                src_d, &
+                                                disp_s, &
+                                                cnts_s, &
+                                                disp_r, &
+                                                cnts_r, &
+                                                dime)
+
+      implicit none
+
+      integer, intent(in) :: dime
+      real(mytype), dimension(:), intent(in), device :: src_d
+      real(mytype), dimension(:), intent(out), device :: dst_d
+      integer, dimension(0:dime - 1), intent(in) :: disp_s, cnts_s
+      integer, dimension(0:dime - 1), intent(in) :: disp_r, cnts_r
+
+      integer :: row_rank_id, cuda_stat
+      type(ncclResult) :: nccl_stat
+
+      nccl_stat = ncclGroupStart()
+      if (nccl_stat /= ncclSuccess) call decomp_2d_abort(__FILE__, __LINE__, nccl_stat, "ncclGroupStart")
+      do row_rank_id = 0, (row_comm_size - 1)
+         nccl_stat = ncclSend(src_d(disp_s(row_rank_id) + 1), cnts_s(row_rank_id), &
+                              ncclType, local_to_global_row(row_rank_id + 1), nccl_comm_2decomp, cuda_stream_2decomp)
+         if (nccl_stat /= ncclSuccess) call decomp_2d_abort(__FILE__, __LINE__, nccl_stat, "ncclSend")
+         nccl_stat = ncclRecv(dst_d(disp_r(row_rank_id) + 1), cnts_r(row_rank_id), &
+                              ncclType, local_to_global_row(row_rank_id + 1), nccl_comm_2decomp, cuda_stream_2decomp)
+         if (nccl_stat /= ncclSuccess) call decomp_2d_abort(__FILE__, __LINE__, nccl_stat, "ncclRecv")
+      end do
+      nccl_stat = ncclGroupEnd()
+      if (nccl_stat /= ncclSuccess) call decomp_2d_abort(__FILE__, __LINE__, nccl_stat, "ncclGroupEnd")
+      cuda_stat = cudaStreamSynchronize(cuda_stream_2decomp)
+      if (cuda_stat /= 0) call decomp_2d_abort(__FILE__, __LINE__, cuda_stat, "cudaStreamSynchronize")
+
+   end subroutine decomp_2d_nccl_send_recv_real_row
+   !
+   ! Send-Recv complex
+   !
+   subroutine decomp_2d_nccl_send_recv_cmplx_row(dst_d, &
+                                                 src_d, &
+                                                 disp_s, &
+                                                 cnts_s, &
+                                                 disp_r, &
+                                                 cnts_r, &
+                                                 dime, &
+                                                 buf_size)
+
+      implicit none
+
+      integer, intent(in) :: dime, buf_size
+      complex(mytype), dimension(buf_size), intent(in), device :: src_d
+      complex(mytype), dimension(buf_size), intent(out), device :: dst_d
+      integer, dimension(0:dime - 1), intent(in) :: disp_s, cnts_s
+      integer, dimension(0:dime - 1), intent(in) :: disp_r, cnts_r
+
+      integer :: ii
+
+      ! Send-Recv Real part
+      !$acc kernels default(present)
+      do ii = 1, buf_size
+         work1_r_d(ii) = real(src_d(ii), mytype)
+      end do
+      !$acc end kernels
+      call decomp_2d_nccl_send_recv_row(work2_r_d, &
+                                        work1_r_d, &
+                                        disp_s, &
+                                        cnts_s, &
+                                        disp_r, &
+                                        cnts_r, &
+                                        dime)
+      !$acc kernels default(present)
+      do ii = 1, buf_size
+         dst_d(ii) = cmplx(work2_r_d(ii), 0._mytype, mytype)
+      end do
+      !$acc end kernels
+      ! Send-Recv Immaginary Part
+      !$acc kernels default(present)
+      do ii = 1, buf_size
+         work1_r_d(ii) = aimag(src_d(ii))
+      end do
+      !$acc end kernels
+      call decomp_2d_nccl_send_recv_row(work2_r_d, &
+                                        work1_r_d, &
+                                        disp_s, &
+                                        cnts_s, &
+                                        disp_r, &
+                                        cnts_r, &
+                                        dime)
+      !$acc kernels default(present)
+      do ii = 1, buf_size
+         dst_d(ii) = cmplx(dst_d(ii), work2_r_d(ii), mytype)
+      end do
+      !$acc end kernels
+   end subroutine decomp_2d_nccl_send_recv_cmplx_row
+
+end module decomp_2d_nccl
+
diff --git a/src/factor.f90 b/src/factor.f90
index 73d96e15..e067c5a1 100644
--- a/src/factor.f90
+++ b/src/factor.f90
@@ -1,92 +1,84 @@
-!!!=======================================================================
-!!! This is part of the 2DECOMP&FFT library
-!!! 
-!!! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
-!!! decomposition. It also implements a highly scalable distributed
-!!! three-dimensional Fast Fourier Transform (FFT).
-!!!
-!!! Copyright (C) 2009-2021 Ning Li, the Numerical Algorithms Group (NAG)
-!!! Copyright (C) 2022-              the Xcompact3d developers
-!!!
-!!!=======================================================================
+!! SPDX-License-Identifier: BSD-3-Clause
 
 !!! A few utility routines to find factors of integer numbers
 module factor
 
-  implicit none
+   implicit none
 
-  private
+   private
+
+   public :: findfactor
+   public :: primefactors
 
-  public :: findfactor
-  public :: primefactors
-  
 contains
-  
-  subroutine findfactor(num, factors, nfact)
-    
-    implicit none
-
-    integer, intent(IN) :: num
-    integer, intent(OUT), dimension(*) :: factors
-    integer, intent(OUT) :: nfact
-    integer :: i, m
-
-    ! find the factors <= sqrt(num)
-    m = int(sqrt(real(num)))
-    nfact = 1
-    do i=1,m
-       if (num/i*i == num) then
-          factors(nfact) = i
-          nfact = nfact + 1
-       end if
-    end do
-    nfact = nfact - 1
-
-    ! derive those > sqrt(num)
-    if (factors(nfact)**2/=num) then
-       do i=nfact+1, 2*nfact
-          factors(i) = num / factors(2*nfact-i+1)
-       end do
-       nfact = nfact * 2
-    else
-       do i=nfact+1, 2*nfact-1
-          factors(i) = num / factors(2*nfact-i)
-       end do
-       nfact = nfact * 2 - 1
-    endif
-       
-    return
-
-  end subroutine findfactor
-
-  subroutine primefactors(num, factors, nfact)
-
-    implicit none
-  
-    integer, intent(IN) :: num
-    integer, intent(OUT), dimension(*) :: factors
-    integer, intent(INOUT) :: nfact
-
-    integer :: i, n
-    
-    i = 2  
-    nfact = 1
-    n = num 
-    do
-       if (mod(n,i) == 0) then
-          factors(nfact) = i
-          nfact = nfact + 1
-          n = n / i
-       else
-          i = i + 1
-       end if
-       if (n == 1) then
-          nfact = nfact - 1
-          exit
-       end if
-    end do
-    
-    return
-
-  end subroutine primefactors
+
+   subroutine findfactor(num, factors, nfact)
+
+      implicit none
+
+      integer, intent(IN) :: num
+      integer, intent(OUT), dimension(*) :: factors
+      integer, intent(OUT) :: nfact
+      integer :: i, m
+
+      ! find the factors <= sqrt(num)
+      ! Cast the int as double to make sure of the correct result of sqrt
+      ! IntelLLVM got an issue with 1.0 but not with 1.d0
+      m = int(sqrt(num * 1.d0))
+      nfact = 1
+      do i = 1, m
+         if (num / i * i == num) then
+            factors(nfact) = i
+            nfact = nfact + 1
+         end if
+      end do
+      nfact = nfact - 1
+
+      ! derive those > sqrt(num)
+      if (factors(nfact)**2 /= num) then
+         do i = nfact + 1, 2 * nfact
+            factors(i) = num / factors(2 * nfact - i + 1)
+         end do
+         nfact = nfact * 2
+      else
+         do i = nfact + 1, 2 * nfact - 1
+            factors(i) = num / factors(2 * nfact - i)
+         end do
+         nfact = nfact * 2 - 1
+      end if
+
+      return
+
+   end subroutine findfactor
+
+   subroutine primefactors(num, factors, nfact)
+
+      implicit none
+
+      integer, intent(IN) :: num
+      integer, intent(OUT), dimension(*) :: factors
+      integer, intent(INOUT) :: nfact
+
+      integer :: i, n
+
+      i = 2
+      nfact = 1
+      n = num
+      do
+         if (mod(n, i) == 0) then
+            factors(nfact) = i
+            nfact = nfact + 1
+            n = n / i
+         else
+            i = i + 1
+         end if
+         if (n == 1) then
+            nfact = nfact - 1
+            exit
+         end if
+      end do
+
+      return
+
+   end subroutine primefactors
 end module
diff --git a/src/fft_common.f90 b/src/fft_common.f90
index 62fef974..12c6650d 100644
--- a/src/fft_common.f90
+++ b/src/fft_common.f90
@@ -1,27 +1,11 @@
-!=======================================================================
-! This is part of the 2DECOMP&FFT library
-! 
-! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
-! decomposition. It also implements a highly scalable distributed
-! three-dimensional Fast Fourier Transform (FFT).
-!
-! Copyright (C) 2009-2011 Ning Li, the Numerical Algorithms Group (NAG)
-!
-!=======================================================================
+!! SPDX-License-Identifier: BSD-3-Clause
 
 ! This file contains common code shared by all FFT engines
 
-integer, parameter, public :: DECOMP_2D_FFT_FORWARD = -1
-integer, parameter, public :: DECOMP_2D_FFT_BACKWARD = 1
-
-! Physical space data can be stored in either X-pencil or Z-pencil
-integer, parameter, public :: PHYSICAL_IN_X = 1
-integer, parameter, public :: PHYSICAL_IN_Z = 3 
-
 integer, save :: format                 ! input X-pencil or Z-pencil
 
 ! The libary can only be initialised once
-logical, save :: initialised = .false. 
+logical, save :: initialised = .false.
 
 ! Global size of the FFT
 integer, save :: nx_fft, ny_fft, nz_fft
@@ -31,85 +15,97 @@
 integer, save, dimension(2) :: dims
 
 ! Decomposition objects
-TYPE(DECOMP_INFO), pointer, save :: ph=>null()  ! physical space
-TYPE(DECOMP_INFO), save :: sp  ! spectral space
+TYPE(DECOMP_INFO), pointer, save :: ph => null()  ! physical space
+TYPE(DECOMP_INFO), target, save :: ph_target  ! ph => ph_target or ph => decomp_main
+TYPE(DECOMP_INFO), target, save :: sp  ! spectral space
 
 ! Workspace to store the intermediate Y-pencil data
-! *** TODO: investigate how to use only one workspace array
-complex(mytype), allocatable, dimension(:,:,:) :: wk2_c2c, wk2_r2c
-complex(mytype), allocatable, dimension(:,:,:) :: wk13
+complex(mytype), allocatable, target, dimension(:, :, :) :: wk2_c2c
+complex(mytype), contiguous, pointer, dimension(:, :, :) :: wk2_r2c
+! Workspace for r2c and c2r transforms
+! FIXME could be removed using in-place r2c and c2r ?
+complex(mytype), allocatable, dimension(:, :, :) :: wk13
 
 public :: decomp_2d_fft_init, decomp_2d_fft_3d, &
-decomp_2d_fft_finalize, decomp_2d_fft_get_size
+          decomp_2d_fft_finalize, decomp_2d_fft_get_size, &
+          decomp_2d_fft_get_ph, decomp_2d_fft_get_sp
 
 ! Declare generic interfaces to handle different inputs
 
 interface decomp_2d_fft_init
-module procedure fft_init_noarg
-module procedure fft_init_arg
-module procedure fft_init_general
+   module procedure fft_init_noarg
+   module procedure fft_init_arg
+   module procedure fft_init_general
 end interface
 
 interface decomp_2d_fft_3d
-module procedure fft_3d_c2c
-module procedure fft_3d_r2c
-module procedure fft_3d_c2r
+   module procedure fft_3d_c2c
+   module procedure fft_3d_r2c
+   module procedure fft_3d_c2r
 end interface
 
+interface
+   module subroutine decomp_2d_fft_log(backend)
+      character(len=*), intent(in) :: backend
+   end subroutine decomp_2d_fft_log
+end interface
 
 contains
 
-
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 ! Initialise the FFT module
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 subroutine fft_init_noarg
 
-implicit none
+   implicit none
 
-call fft_init_arg(PHYSICAL_IN_X)  ! default input is X-pencil data
+   call fft_init_arg(PHYSICAL_IN_X)  ! default input is X-pencil data
 
-return
+   return
 end subroutine fft_init_noarg
 
 subroutine fft_init_arg(pencil)     ! allow to handle Z-pencil input
 
-implicit none
+   implicit none
 
-integer, intent(IN) :: pencil
+   integer, intent(IN) :: pencil
 
-call fft_init_general(pencil, nx_global, ny_global, nz_global)
+   call fft_init_general(pencil, nx_global, ny_global, nz_global)
 
-return
+   return
 end subroutine fft_init_arg
 
 ! Initialise the FFT library to perform arbitrary size transforms
 subroutine fft_init_general(pencil, nx, ny, nz)
 
-implicit none
+   implicit none
 
-integer, intent(IN) :: pencil
-integer, intent(IN) :: nx, ny, nz
+   integer, intent(IN) :: pencil
+   integer, intent(IN) :: nx, ny, nz
 
-integer :: status, errorcode
+   integer :: status, errorcode
 
 #ifdef PROFILER
-if (decomp_profiler_fft) call decomp_profiler_start("fft_init")
+   if (decomp_profiler_fft) call decomp_profiler_start("fft_init")
 #endif
 
-if (initialised) then
-errorcode = 4
-call decomp_2d_abort(__FILE__, __LINE__, errorcode, &
-'FFT library should only be initialised once')
-end if
-
-format = pencil
-nx_fft = nx
-ny_fft = ny
-nz_fft = nz
+   ! Safety checks
+   if (initialised) then
+      errorcode = 4
+      call decomp_2d_abort(__FILE__, __LINE__, errorcode, &
+                           'FFT library should only be initialised once')
+   end if
+   if (nx <= 0) call decomp_2d_abort(__FILE__, __LINE__, nx, "Invalid value for nx")
+   if (ny <= 0) call decomp_2d_abort(__FILE__, __LINE__, ny, "Invalid value for ny")
+   if (nz <= 0) call decomp_2d_abort(__FILE__, __LINE__, nz, "Invalid value for nz")
+
+   format = pencil
+   nx_fft = nx
+   ny_fft = ny
+   nz_fft = nz
 
 ! determine the processor grid in use
-dims = get_decomp_dims()
+   dims = get_decomp_dims()
 
 ! for c2r/r2c interface:
 ! if in physical space, a real array is of size: nx*ny*nz
@@ -117,95 +113,154 @@ subroutine fft_init_general(pencil, nx, ny, nz)
 !         (nx/2+1)*ny*nz, if PHYSICAL_IN_X
 !      or nx*ny*(nz/2+1), if PHYSICAL_IN_Z
 
-if (nx_fft==nx_global.and.ny_fft==ny_global.and.nz_fft==nz_global) then
-ph=>decomp_main
-else
-call decomp_info_init(nx, ny, nz, ph)
-endif
-if (format==PHYSICAL_IN_X) then
-call decomp_info_init(nx/2+1, ny, nz, sp)
-else if (format==PHYSICAL_IN_Z) then
-call decomp_info_init(nx, ny, nz/2+1, sp)
-end if
-
-allocate(wk2_c2c(ph%ysz(1),ph%ysz(2),ph%ysz(3)), STAT=status)
-allocate(wk2_r2c(sp%ysz(1),sp%ysz(2),sp%ysz(3)), STAT=status)
-if (format==PHYSICAL_IN_X) then
-allocate(wk13(sp%xsz(1),sp%xsz(2),sp%xsz(3)), STAT=status)
-else if (format==PHYSICAL_IN_Z) then
-allocate(wk13(sp%zsz(1),sp%zsz(2),sp%zsz(3)), STAT=status)
-end if
-if (status /= 0) then
-errorcode = 3
-call decomp_2d_abort(__FILE__, __LINE__, errorcode, &
-'Out of memory when initialising FFT')
-end if
-
-call init_fft_engine
-
-initialised = .true.
+   if (nx_fft == nx_global .and. ny_fft == ny_global .and. nz_fft == nz_global) then
+      ph => decomp_main
+   else
+      call decomp_info_init(nx, ny, nz, ph_target)
+      ph => ph_target
+   end if
+   if (format == PHYSICAL_IN_X) then
+      call decomp_info_init(nx / 2 + 1, ny, nz, sp)
+   else if (format == PHYSICAL_IN_Z) then
+      call decomp_info_init(nx, ny, nz / 2 + 1, sp)
+   else
+      call decomp_2d_abort(__FILE__, __LINE__, format, "Invalid value for format")
+   end if
+
+   !
+   ! Allocate the workspace for intermediate y-pencil data
+   ! The largest memory block needed is the one for c2c transforms
+   !
+   allocate (wk2_c2c(ph%ysz(1), ph%ysz(2), ph%ysz(3)), STAT=status)
+   if (status /= 0) then
+      errorcode = 3
+      call decomp_2d_abort(__FILE__, __LINE__, errorcode, &
+                           'Out of memory when initialising FFT')
+   end if
+   !
+   ! A smaller memory block is needed for r2c and c2r transforms
+   ! wk2_c2c and wk2_r2c start at the same memory location
+   !
+   !    Size of wk2_c2c : ph%ysz(1), ph%ysz(2), ph%ysz(3)
+   !    Size of wk2_r2c : sp%ysz(1), sp%ysz(2), sp%ysz(3)
+   !
+   call c_f_pointer(c_loc(wk2_c2c), wk2_r2c, sp%ysz)
+   !
+   ! Allocate the workspace for r2c and c2r transforms
+   !
+   ! wk13 can not be easily fused with wk2_*2c due to statements such as
+   ! transpose_y_to_x(wk2_r2c, wk13, sp)
+   ! transpose_y_to_z(wk2_r2c, wk13, sp)
+   !
+   if (format == PHYSICAL_IN_X) then
+      allocate (wk13(sp%xsz(1), sp%xsz(2), sp%xsz(3)), STAT=status)
+   else if (format == PHYSICAL_IN_Z) then
+      allocate (wk13(sp%zsz(1), sp%zsz(2), sp%zsz(3)), STAT=status)
+   end if
+   if (status /= 0) then
+      errorcode = 3
+      call decomp_2d_abort(__FILE__, __LINE__, errorcode, &
+                           'Out of memory when initialising FFT')
+   end if
+
+   call init_fft_engine
+
+   initialised = .true.
 
 #ifdef PROFILER
-if (decomp_profiler_fft) call decomp_profiler_end("fft_init")
+   if (decomp_profiler_fft) call decomp_profiler_end("fft_init")
 #endif
 
-return
+   return
 end subroutine fft_init_general
 
-
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 ! Final clean up
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 subroutine decomp_2d_fft_finalize
 
-implicit none
+   implicit none
 
 #ifdef PROFILER
-if (decomp_profiler_fft) call decomp_profiler_start("fft_fin")
+   if (decomp_profiler_fft) call decomp_profiler_start("fft_fin")
 #endif
 
-if (nx_fft==nx_global.and.ny_fft==ny_global.and.nz_fft==nz_global) then
-nullify(ph)
-else
-call decomp_info_finalize(ph)
-endif
-call decomp_info_finalize(sp)
+   if (nx_fft /= nx_global .or. ny_fft /= ny_global .or. nz_fft /= nz_global) then
+      call decomp_info_finalize(ph_target)
+   end if
+   nullify (ph)
+   call decomp_info_finalize(sp)
 
-if (allocated(wk2_c2c)) deallocate(wk2_c2c)
-if (allocated(wk2_r2c)) deallocate(wk2_r2c)
-if (allocated(wk13)) deallocate(wk13)
+   if (allocated(wk2_c2c)) deallocate (wk2_c2c)
+   if (associated(wk2_r2c)) nullify (wk2_r2c)
+   if (allocated(wk13)) deallocate (wk13)
 
-call finalize_fft_engine
+   call finalize_fft_engine
 
-initialised = .false.
+   initialised = .false.
 
 #ifdef PROFILER
-if (decomp_profiler_fft) call decomp_profiler_end("fft_fin")
+   if (decomp_profiler_fft) call decomp_profiler_end("fft_fin")
 #endif
 
-return
+   return
 end subroutine decomp_2d_fft_finalize
 
-
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-! Return the size, starting/ending index of the distributed array 
+! Return the size, starting/ending index of the distributed array
 !  whose global size is (nx/2+1)*ny*nz, for defining data structures
 !  in r2c and c2r interfaces
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 subroutine decomp_2d_fft_get_size(istart, iend, isize)
 
-implicit none
-integer, dimension(3), intent(OUT) :: istart, iend, isize
+   implicit none
+   integer, dimension(3), intent(OUT) :: istart, iend, isize
 
-if (format==PHYSICAL_IN_X) then
-istart = sp%zst
-iend   = sp%zen
-isize  = sp%zsz
-else if (format==PHYSICAL_IN_Z) then
-istart = sp%xst
-iend   = sp%xen
-isize  = sp%xsz
-end if
+   if (format == PHYSICAL_IN_X) then
+      istart = sp%zst
+      iend = sp%zen
+      isize = sp%zsz
+   else if (format == PHYSICAL_IN_Z) then
+      istart = sp%xst
+      iend = sp%xen
+      isize = sp%xsz
+   end if
 
-return
+   return
 end subroutine decomp_2d_fft_get_size
+
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+! Return a pointer to the decomp_info object ph
+!
+! The caller should not apply decomp_info_finalize on the pointer
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+function decomp_2d_fft_get_ph()
+
+   implicit none
+
+   type(decomp_info), pointer :: decomp_2d_fft_get_ph
+
+   if (.not. associated(ph)) then
+      call decomp_2d_abort(__FILE__, __LINE__, -1, 'FFT library must be initialised first')
+   end if
+   decomp_2d_fft_get_ph => ph
+
+end function decomp_2d_fft_get_ph
+
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+! Return a pointer to the decomp_info object sp
+!
+! The caller should not apply decomp_info_finalize on the pointer
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+function decomp_2d_fft_get_sp()
+
+   implicit none
+
+   type(decomp_info), pointer :: decomp_2d_fft_get_sp
+
+   if (.not. associated(ph)) then
+      call decomp_2d_abort(__FILE__, __LINE__, -1, 'FFT library must be initialised first')
+   end if
+   decomp_2d_fft_get_sp => sp
+
+end function decomp_2d_fft_get_sp
diff --git a/src/fft_common_3d.f90 b/src/fft_common_3d.f90
index 3c0abc1b..28c4ec0d 100644
--- a/src/fft_common_3d.f90
+++ b/src/fft_common_3d.f90
@@ -1,299 +1,286 @@
-!=======================================================================
-! This is part of the 2DECOMP&FFT library
-! 
-! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
-! decomposition. It also implements a highly scalable distributed
-! three-dimensional Fast Fourier Transform (FFT).
-!
-! Copyright (C) 2009-2011 Ning Li, the Numerical Algorithms Group (NAG)
-!
-!=======================================================================
+!! SPDX-License-Identifier: BSD-3-Clause
 
 ! This file contains 3D c2c/r2c/c2r transform subroutines which are
-! identical for several FFT engines 
+! identical for several FFT engines
 
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 ! 3D FFT - complex to complex
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 subroutine fft_3d_c2c(in, out, isign)
 
-implicit none
+   implicit none
 
-complex(mytype), dimension(:,:,:), intent(INOUT) :: in
-complex(mytype), dimension(:,:,:), intent(OUT) :: out
-integer, intent(IN) :: isign
-
-integer :: i, j, k
+   complex(mytype), dimension(:, :, :), intent(INOUT) :: in
+   complex(mytype), dimension(:, :, :), intent(OUT) :: out
+   integer, intent(IN) :: isign
 
 #ifndef OVERWRITE
-complex(mytype), allocatable, dimension(:,:,:) :: wk1
+   integer :: i, j, k
+   complex(mytype), allocatable, dimension(:, :, :) :: wk1
 #endif
 
 #ifdef PROFILER
-if (decomp_profiler_fft) call decomp_profiler_start("fft_c2c")
+   if (decomp_profiler_fft) call decomp_profiler_start("fft_c2c")
 #endif
 
-if (format==PHYSICAL_IN_X .AND. isign==DECOMP_2D_FFT_FORWARD .OR.  &
-format==PHYSICAL_IN_Z .AND. isign==DECOMP_2D_FFT_BACKWARD) then
+   if (format == PHYSICAL_IN_X .AND. isign == DECOMP_2D_FFT_FORWARD .OR. &
+       format == PHYSICAL_IN_Z .AND. isign == DECOMP_2D_FFT_BACKWARD) then
 
 ! ===== 1D FFTs in X =====
 #ifdef OVERWRITE
-call c2c_1m_x(in,isign,ph)
+      call c2c_1m_x(in, isign, ph)
 #else
-allocate (wk1(ph%xsz(1),ph%xsz(2),ph%xsz(3)))
-do concurrent (k=1:ph%xsz(3), j=1:ph%xsz(2), i=1:ph%xsz(1))
-wk1(i,j,k) = in(i,j,k)
-end do
-call c2c_1m_x(wk1,isign,ph)
+      allocate (wk1(ph%xsz(1), ph%xsz(2), ph%xsz(3)))
+      do concurrent(k=1:ph%xsz(3), j=1:ph%xsz(2), i=1:ph%xsz(1))
+         wk1(i, j, k) = in(i, j, k)
+      end do
+      call c2c_1m_x(wk1, isign, ph)
 #endif
 
 ! ===== Swap X --> Y; 1D FFTs in Y =====
 
-if (dims(1)>1) then
+      if (dims(1) > 1) then
 #ifdef OVERWRITE
-call transpose_x_to_y(in,wk2_c2c,ph)
+         call transpose_x_to_y(in, wk2_c2c, ph)
 #else
-call transpose_x_to_y(wk1,wk2_c2c,ph)
+         call transpose_x_to_y(wk1, wk2_c2c, ph)
 #endif
-call c2c_1m_y(wk2_c2c,isign,ph)
-else
+         call c2c_1m_y(wk2_c2c, isign, ph)
+      else
 #ifdef OVERWRITE
-call c2c_1m_y(in,isign,ph)
+         call c2c_1m_y(in, isign, ph)
 #else
-call c2c_1m_y(wk1,isign,ph)
+         call c2c_1m_y(wk1, isign, ph)
 #endif
-end if
+      end if
 
 ! ===== Swap Y --> Z; 1D FFTs in Z =====
-if (dims(1)>1) then
-call transpose_y_to_z(wk2_c2c,out,ph)
-else
+      if (dims(1) > 1) then
+         call transpose_y_to_z(wk2_c2c, out, ph)
+      else
 #ifdef OVERWRITE
-call transpose_y_to_z(in,out,ph)
+         call transpose_y_to_z(in, out, ph)
 #else
-call transpose_y_to_z(wk1,out,ph)
+         call transpose_y_to_z(wk1, out, ph)
 #endif
-end if
-call c2c_1m_z(out,isign,ph)
+      end if
+      call c2c_1m_z(out, isign, ph)
 
-else if (format==PHYSICAL_IN_X .AND. isign==DECOMP_2D_FFT_BACKWARD &
-.OR. & 
-format==PHYSICAL_IN_Z .AND. isign==DECOMP_2D_FFT_FORWARD) then
+   else if (format == PHYSICAL_IN_X .AND. isign == DECOMP_2D_FFT_BACKWARD &
+            .OR. &
+            format == PHYSICAL_IN_Z .AND. isign == DECOMP_2D_FFT_FORWARD) then
 
 ! ===== 1D FFTs in Z =====
 #ifdef OVERWRITE
-call c2c_1m_z(in,isign,ph)
+      call c2c_1m_z(in, isign, ph)
 #else
-allocate (wk1(ph%zsz(1),ph%zsz(2),ph%zsz(3)))
-do concurrent (k=1:ph%zsz(3), j=1:ph%zsz(2), i=1:ph%zsz(1))
-wk1(i,j,k) = in(i,j,k)
-end do
-call c2c_1m_z(wk1,isign,ph)
+      allocate (wk1(ph%zsz(1), ph%zsz(2), ph%zsz(3)))
+      do concurrent(k=1:ph%zsz(3), j=1:ph%zsz(2), i=1:ph%zsz(1))
+         wk1(i, j, k) = in(i, j, k)
+      end do
+      call c2c_1m_z(wk1, isign, ph)
 #endif
 
 ! ===== Swap Z --> Y; 1D FFTs in Y =====
-if (dims(1)>1) then
+      if (dims(1) > 1) then
 #ifdef OVERWRITE
-call transpose_z_to_y(in,wk2_c2c,ph)
+         call transpose_z_to_y(in, wk2_c2c, ph)
 #else
-call transpose_z_to_y(wk1,wk2_c2c,ph)
+         call transpose_z_to_y(wk1, wk2_c2c, ph)
 #endif
-call c2c_1m_y(wk2_c2c,isign,ph)
-else  ! out==wk2_c2c if 1D decomposition
+         call c2c_1m_y(wk2_c2c, isign, ph)
+      else  ! out==wk2_c2c if 1D decomposition
 #ifdef OVERWRITE
-call transpose_z_to_y(in,out,ph)
+         call transpose_z_to_y(in, out, ph)
 #else
-call transpose_z_to_y(wk1,out,ph)
+         call transpose_z_to_y(wk1, out, ph)
 #endif
-call c2c_1m_y(out,isign,ph)
-end if
+         call c2c_1m_y(out, isign, ph)
+      end if
 
 ! ===== Swap Y --> X; 1D FFTs in X =====
-if (dims(1)>1) then
-call transpose_y_to_x(wk2_c2c,out,ph)
-end if
-call c2c_1m_x(out,isign,ph)
+      if (dims(1) > 1) then
+         call transpose_y_to_x(wk2_c2c, out, ph)
+      end if
+      call c2c_1m_x(out, isign, ph)
 
-end if
+   end if
 
 #ifndef OVERWRITE
 ! Free memory
-if (allocated(wk1)) deallocate(wk1)
+   if (allocated(wk1)) deallocate (wk1)
 #endif
 
 #ifdef PROFILER
-if (decomp_profiler_fft) call decomp_profiler_end("fft_c2c")
+   if (decomp_profiler_fft) call decomp_profiler_end("fft_c2c")
 #endif
 
-return
+   return
 end subroutine fft_3d_c2c
 
-
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 ! 3D forward FFT - real to complex
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 subroutine fft_3d_r2c(in_r, out_c)
 
-implicit none
+   implicit none
 
-real(mytype), dimension(:,:,:), intent(IN) :: in_r
-complex(mytype), dimension(:,:,:), intent(OUT) :: out_c
+   real(mytype), dimension(:, :, :), intent(IN) :: in_r
+   complex(mytype), dimension(:, :, :), intent(OUT) :: out_c
 
 #ifdef PROFILER
-if (decomp_profiler_fft) call decomp_profiler_start("fft_r2c")
+   if (decomp_profiler_fft) call decomp_profiler_start("fft_r2c")
 #endif
 
-if (format==PHYSICAL_IN_X) then
+   if (format == PHYSICAL_IN_X) then
 
 ! ===== 1D FFTs in X =====
-call r2c_1m_x(in_r,wk13)
+      call r2c_1m_x(in_r, wk13)
 
 ! ===== Swap X --> Y; 1D FFTs in Y =====
-if (dims(1)>1) then
-call transpose_x_to_y(wk13,wk2_r2c,sp)
-call c2c_1m_y(wk2_r2c,-1,sp)
-else
-call c2c_1m_y(wk13,-1,sp)
-end if
+      if (dims(1) > 1) then
+         call transpose_x_to_y(wk13, wk2_r2c, sp)
+         call c2c_1m_y(wk2_r2c, -1, sp)
+      else
+         call c2c_1m_y(wk13, -1, sp)
+      end if
 
 ! ===== Swap Y --> Z; 1D FFTs in Z =====
-if (dims(1)>1) then
-call transpose_y_to_z(wk2_r2c,out_c,sp)
-else
-call transpose_y_to_z(wk13,out_c,sp)
-end if
-call c2c_1m_z(out_c,-1,sp)
+      if (dims(1) > 1) then
+         call transpose_y_to_z(wk2_r2c, out_c, sp)
+      else
+         call transpose_y_to_z(wk13, out_c, sp)
+      end if
+      call c2c_1m_z(out_c, -1, sp)
 
-else if (format==PHYSICAL_IN_Z) then
+   else if (format == PHYSICAL_IN_Z) then
 
 ! ===== 1D FFTs in Z =====
-call r2c_1m_z(in_r,wk13)
+      call r2c_1m_z(in_r, wk13)
 
 ! ===== Swap Z --> Y; 1D FFTs in Y =====
-if (dims(1)>1) then
-call transpose_z_to_y(wk13,wk2_r2c,sp)
-call c2c_1m_y(wk2_r2c,-1,sp)
-else  ! out_c==wk2_r2c if 1D decomposition
-call transpose_z_to_y(wk13,out_c,sp)
-call c2c_1m_y(out_c,-1,sp)
-end if
+      if (dims(1) > 1) then
+         call transpose_z_to_y(wk13, wk2_r2c, sp)
+         call c2c_1m_y(wk2_r2c, -1, sp)
+      else  ! out_c==wk2_r2c if 1D decomposition
+         call transpose_z_to_y(wk13, out_c, sp)
+         call c2c_1m_y(out_c, -1, sp)
+      end if
 
 ! ===== Swap Y --> X; 1D FFTs in X =====
-if (dims(1)>1) then
-call transpose_y_to_x(wk2_r2c,out_c,sp)
-end if
-call c2c_1m_x(out_c,-1,sp)
+      if (dims(1) > 1) then
+         call transpose_y_to_x(wk2_r2c, out_c, sp)
+      end if
+      call c2c_1m_x(out_c, -1, sp)
 
-end if
+   end if
 
 #ifdef PROFILER
-if (decomp_profiler_fft) call decomp_profiler_end("fft_r2c")
+   if (decomp_profiler_fft) call decomp_profiler_end("fft_r2c")
 #endif
 
-return
+   return
 end subroutine fft_3d_r2c
 
-
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 ! 3D inverse FFT - complex to real
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 subroutine fft_3d_c2r(in_c, out_r)
 
-implicit none
-
-complex(mytype), dimension(:,:,:), intent(INOUT) :: in_c
-real(mytype), dimension(:,:,:), intent(OUT) :: out_r
+   implicit none
 
-integer :: i, j, k
+   complex(mytype), dimension(:, :, :), intent(INOUT) :: in_c
+   real(mytype), dimension(:, :, :), intent(OUT) :: out_r
 
 #ifndef OVERWRITE
-complex(mytype), allocatable, dimension(:,:,:) :: wk1
+   integer :: i, j, k
+   complex(mytype), allocatable, dimension(:, :, :) :: wk1
 #endif
 
 #ifdef PROFILER
-if (decomp_profiler_fft) call decomp_profiler_start("fft_c2r")
+   if (decomp_profiler_fft) call decomp_profiler_start("fft_c2r")
 #endif
 
-if (format==PHYSICAL_IN_X) then
+   if (format == PHYSICAL_IN_X) then
 
 ! ===== 1D FFTs in Z =====
 #ifdef OVERWRITE
-call c2c_1m_z(in_c,1,sp)       
+      call c2c_1m_z(in_c, 1, sp)
 #else
-allocate(wk1(sp%zsz(1),sp%zsz(2),sp%zsz(3)))
-do concurrent (k=1:sp%zsz(3), j=1:sp%zsz(2), i=1:sp%zsz(1))
-wk1(i,j,k) = in_c(i,j,k)
-end do
-call c2c_1m_z(wk1,1,sp)
+      allocate (wk1(sp%zsz(1), sp%zsz(2), sp%zsz(3)))
+      do concurrent(k=1:sp%zsz(3), j=1:sp%zsz(2), i=1:sp%zsz(1))
+         wk1(i, j, k) = in_c(i, j, k)
+      end do
+      call c2c_1m_z(wk1, 1, sp)
 #endif
 
 ! ===== Swap Z --> Y; 1D FFTs in Y =====
 #ifdef OVERWRITE
-call transpose_z_to_y(in_c,wk2_r2c,sp)
+      call transpose_z_to_y(in_c, wk2_r2c, sp)
 #else
-call transpose_z_to_y(wk1,wk2_r2c,sp)
+      call transpose_z_to_y(wk1, wk2_r2c, sp)
 #endif
-call c2c_1m_y(wk2_r2c,1,sp)
+      call c2c_1m_y(wk2_r2c, 1, sp)
 
 ! ===== Swap Y --> X; 1D FFTs in X =====
-if (dims(1)>1) then
-call transpose_y_to_x(wk2_r2c,wk13,sp)
-call c2r_1m_x(wk13,out_r)
-else
-call c2r_1m_x(wk2_r2c,out_r)
-end if
+      if (dims(1) > 1) then
+         call transpose_y_to_x(wk2_r2c, wk13, sp)
+         call c2r_1m_x(wk13, out_r)
+      else
+         call c2r_1m_x(wk2_r2c, out_r)
+      end if
 
-else if (format==PHYSICAL_IN_Z) then
+   else if (format == PHYSICAL_IN_Z) then
 
 ! ===== 1D FFTs in X =====
 #ifdef OVERWRITE
-call c2c_1m_x(in_c,1,sp)
+      call c2c_1m_x(in_c, 1, sp)
 #else
-allocate(wk1(sp%xsz(1),sp%xsz(2),sp%xsz(3)))
-do concurrent (k=1:sp%xsz(3), j=1:sp%xsz(2), i=1:sp%xsz(1))
-wk1(i,j,k) = in_c(i,j,k)
-end do
-call c2c_1m_x(wk1,1,sp)
+      allocate (wk1(sp%xsz(1), sp%xsz(2), sp%xsz(3)))
+      do concurrent(k=1:sp%xsz(3), j=1:sp%xsz(2), i=1:sp%xsz(1))
+         wk1(i, j, k) = in_c(i, j, k)
+      end do
+      call c2c_1m_x(wk1, 1, sp)
 #endif
 
 ! ===== Swap X --> Y; 1D FFTs in Y =====
-if (dims(1)>1) then
+      if (dims(1) > 1) then
 #ifdef OVERWRITE
-call transpose_x_to_y(in_c,wk2_r2c,sp)
+         call transpose_x_to_y(in_c, wk2_r2c, sp)
 #else
-call transpose_x_to_y(wk1,wk2_r2c,sp)
+         call transpose_x_to_y(wk1, wk2_r2c, sp)
 #endif
-call c2c_1m_y(wk2_r2c,1,sp)
-else  ! in_c==wk2_r2c if 1D decomposition
+         call c2c_1m_y(wk2_r2c, 1, sp)
+      else  ! in_c==wk2_r2c if 1D decomposition
 #ifdef OVERWRITE
-call c2c_1m_y(in_c,1,sp)
+         call c2c_1m_y(in_c, 1, sp)
 #else
-call c2c_1m_y(wk1,1,sp)
+         call c2c_1m_y(wk1, 1, sp)
 #endif
-end if
+      end if
 
 ! ===== Swap Y --> Z; 1D FFTs in Z =====
-if (dims(1)>1) then
-call transpose_y_to_z(wk2_r2c,wk13,sp)
-else
+      if (dims(1) > 1) then
+         call transpose_y_to_z(wk2_r2c, wk13, sp)
+      else
 #ifdef OVERWRITE
-call transpose_y_to_z(in_c,wk13,sp)
+         call transpose_y_to_z(in_c, wk13, sp)
 #else
-call transpose_y_to_z(wk1,wk13,sp)
+         call transpose_y_to_z(wk1, wk13, sp)
 #endif
-end if
-call c2r_1m_z(wk13,out_r)
+      end if
+      call c2r_1m_z(wk13, out_r)
 
-end if
+   end if
 
 #ifndef OVERWRITE
 ! Free memory
-if (allocated(wk1)) deallocate(wk1)
+   if (allocated(wk1)) deallocate (wk1)
 #endif
 
 #ifdef PROFILER
-if (decomp_profiler_fft) call decomp_profiler_end("fft_c2r")
+   if (decomp_profiler_fft) call decomp_profiler_end("fft_c2r")
 #endif
 
-return
+   return
 end subroutine fft_3d_c2r
diff --git a/src/fft_cufft.f90 b/src/fft_cufft.f90
index 0622f958..8a2d84eb 100644
--- a/src/fft_cufft.f90
+++ b/src/fft_cufft.f90
@@ -1,1044 +1,1031 @@
-!=======================================================================
-! This is part of the 2DECOMP&FFT library
-! 
-! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
-! decomposition. It also implements a highly scalable distributed
-! three-dimensional Fast Fourier Transform (FFT).
-!
-! Copyright (C) 2009-2011 Ning Li, the Numerical Algorithms Group (NAG)
-!
-!=======================================================================
+!! SPDX-License-Identifier: BSD-3-Clause
 
 ! This is the FFTW (version 3.x) implementation of the FFT library
 
 module decomp_2d_fft
 
-  use decomp_2d  ! 2D decomposition module
-  use iso_c_binding
-  use cudafor
-  use cufft
+   use decomp_2d_constants
+   use decomp_2d_mpi
+   use decomp_2d  ! 2D decomposition module
+   use iso_c_binding
+   use cudafor
+   use cufft
 
-  implicit none
+   implicit none
 
-  private        ! Make everything private unless declared public
+   private        ! Make everything private unless declared public
 
-  ! engine-specific global variables
-  ! integer, save :: plan_type = FFTW_MEASURE
+   ! engine-specific global variables
+   ! integer, save :: plan_type = FFTW_MEASURE
 
-  ! FFTW plans
-  ! j=1,2,3 corresponds to the 1D FFTs in X,Y,Z direction, respectively
-  ! For c2c transforms: 
-  !     use plan(-1,j) for  forward transform; 
-  !     use plan( 1,j) for backward transform;
-  ! For r2c/c2r transforms:
-  !     use plan(0,j) for r2c transforms;
-  !     use plan(2,j) for c2r transforms;
-  integer*4, save :: plan(-1:2,3)
-  complex*8, device, allocatable, dimension(:) :: cufft_workspace
+   ! FFTW plans
+   ! j=1,2,3 corresponds to the 1D FFTs in X,Y,Z direction, respectively
+   ! For c2c transforms:
+   !     use plan(-1,j) for  forward transform;
+   !     use plan( 1,j) for backward transform;
+   ! For r2c/c2r transforms:
+   !     use plan(0,j) for r2c transforms;
+   !     use plan(2,j) for c2r transforms;
+   integer*4, save :: plan(-1:2, 3)
+   complex*8, device, allocatable, dimension(:) :: cufft_workspace
 
-  ! common code used for all engines, including global variables, 
-  ! generic interface definitions and several subroutines
+   integer, parameter, public :: D2D_FFT_BACKEND = D2D_FFT_BACKEND_CUFFT
+
+   ! common code used for all engines, including global variables,
+   ! generic interface definitions and several subroutines
 #include "fft_common.f90"
 
-  ! Return a cuFFT plan for multiple 1D FFTs in X direction: C2C case
-  subroutine c2c_1m_x_plan(plan1, decomp, cufft_type, worksize)
-    
-    implicit none
-
-    integer*4, intent(OUT) :: plan1
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
-    integer, intent(IN) :: cufft_type
-
-    integer :: istat
-    integer(int_ptr_kind()), intent(out) :: worksize
-    integer, pointer :: null_fptr
-    call c_f_pointer( c_null_ptr, null_fptr )
-   
-    istat = cufftCreate(plan1)
-    if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftCreate")
-    istat = cufftSetAutoAllocation(plan1,0)
-    if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftSetAutoAllocation")
-    istat = cufftMakePlanMany(plan1,1,decomp%xsz(1),     &
-                                      decomp%xsz(1),1,decomp%xsz(1),     &
-                                      decomp%xsz(1),1,decomp%xsz(1),     &
-                                      cufft_type,decomp%xsz(2)*decomp%xsz(3),worksize)
-    if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftMakePlanMany")
-
-  end subroutine c2c_1m_x_plan
-
-  ! Return a cuFFT plan for multiple 1D FFTs in Y direction: C2C case
-  subroutine c2c_1m_y_plan(plan1, decomp, cufft_type,worksize)
-
-    implicit none
-
-    integer*4, intent(OUT) :: plan1
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
-    integer, intent(IN) :: cufft_type
-
-    ! Due to memory pattern of 3D arrays, 1D FFTs along Y have to be
-    ! done one Z-plane at a time. So plan for 2D data sets here.
-    integer :: istat
-    integer(int_ptr_kind()), intent(out) :: worksize
-    integer, pointer :: null_fptr
-    call c_f_pointer( c_null_ptr, null_fptr )
-   
-    istat = cufftCreate(plan1)
-    if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftCreate")
-    istat = cufftSetAutoAllocation(plan1,0)
-    if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftSetAutoAllocation")
-    istat = cufftMakePlanMany(plan1,1,decomp%ysz(2),     &
-                                      decomp%ysz(2),decomp%ysz(1),1,     &
-                                      decomp%ysz(2),decomp%ysz(1),1,     &
-                                      cufft_type,decomp%ysz(1),worksize)
-    if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftMakePlanMany")
-
-  end subroutine c2c_1m_y_plan
-
-  ! Return a cuFFT plan for multiple 1D FFTs in Z direction: C2C case
-  subroutine c2c_1m_z_plan(plan1, decomp, cufft_type,worksize)
-
-    implicit none
-
-    integer*4, intent(OUT) :: plan1
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
-    integer, intent(IN) :: cufft_type
-
-    integer :: istat
-    integer(int_ptr_kind()), intent(out) :: worksize
-    integer, pointer :: null_fptr
-    call c_f_pointer( c_null_ptr, null_fptr )
-
-    istat = cufftCreate(plan1)
-    if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftCreate")
-    istat = cufftSetAutoAllocation(plan1,0)
-    if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftSetAutoAllocation")
-    istat = cufftMakePlanMany(plan1,1,decomp%zsz(3),                           &
-                                      decomp%zsz(3),decomp%zsz(1)*decomp%zsz(2),1,     &
-                                      decomp%zsz(3),decomp%zsz(1)*decomp%zsz(2),1,     &
-                                      cufft_type,decomp%zsz(1)*decomp%zsz(2),worksize)
-    if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftMakePlanMany")
-
-  end subroutine c2c_1m_z_plan
-
-  ! Return a cuFFT plan for multiple 1D FFTs in X direction: R2C case
-  subroutine r2c_1m_x_plan(plan1, decomp_ph, decomp_sp, cufft_type, worksize)
-
-    implicit none
-
-    integer*4, intent(OUT) :: plan1
-    TYPE(DECOMP_INFO), intent(IN) :: decomp_ph
-    TYPE(DECOMP_INFO), intent(IN) :: decomp_sp
-    integer, intent(IN) :: cufft_type
-
-    integer :: istat
-    integer(int_ptr_kind()), intent(out) :: worksize
-    integer, pointer :: null_fptr
-    call c_f_pointer( c_null_ptr, null_fptr )
-
-    istat = cufftCreate(plan1)
-    if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftCreate")
-    istat = cufftSetAutoAllocation(plan1,0)
-    if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftSetAutoAllocation")
-    istat = cufftMakePlanMany(plan1,1,decomp_ph%xsz(1),     &
-                                      decomp_ph%xsz(1),1,decomp_ph%xsz(1),     &
-                                      decomp_sp%xsz(1),1,decomp_sp%xsz(1),     &
-                                      cufft_type,decomp_ph%xsz(2)*decomp_ph%xsz(3),worksize)
-    if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftMakePlanMany")
-
-  end subroutine r2c_1m_x_plan
-
-  ! Return a cuFFT plan for multiple 1D FFTs in X direction: C2R case
-  subroutine c2r_1m_x_plan(plan1, decomp_sp, decomp_ph, cufft_type, worksize)
-
-    implicit none
-
-    integer*4, intent(OUT) :: plan1
-    TYPE(DECOMP_INFO), intent(IN) :: decomp_ph
-    TYPE(DECOMP_INFO), intent(IN) :: decomp_sp
-    integer, intent(IN) :: cufft_type
-
-    integer :: istat
-    integer(int_ptr_kind()), intent(out) :: worksize
-    integer, pointer :: null_fptr
-    call c_f_pointer( c_null_ptr, null_fptr )
-
-    istat = cufftCreate(plan1)
-    if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftCreate")
-    istat = cufftSetAutoAllocation(plan1,0)
-    if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftSetAutoAllocation")
-    istat = cufftMakePlanMany(plan1,1,decomp_ph%xsz(1),     &
-                                      decomp_sp%xsz(1),1,decomp_sp%xsz(1),     &
-                                      decomp_ph%xsz(1),1,decomp_ph%xsz(1),     &
-                                      cufft_type,decomp_ph%xsz(2)*decomp_ph%xsz(3),worksize)
-    if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftMakePlanMany")
-
-  end subroutine c2r_1m_x_plan
-
-  ! Return a cuFFT plan for multiple 1D FFTs in X direction: R2C case
-  subroutine r2c_1m_z_plan(plan1, decomp_ph, decomp_sp, cufft_type, worksize)
-
-    implicit none
-
-    integer*4, intent(OUT) :: plan1
-    TYPE(DECOMP_INFO), intent(IN) :: decomp_ph
-    TYPE(DECOMP_INFO), intent(IN) :: decomp_sp
-    integer, intent(IN) :: cufft_type
-
-    integer :: istat
-    integer(int_ptr_kind()), intent(out) :: worksize
-    integer, pointer :: null_fptr
-    call c_f_pointer( c_null_ptr, null_fptr )
-
-    istat = cufftCreate(plan1)
-    if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftCreate")
-    istat = cufftSetAutoAllocation(plan1,0)
-    if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftSetAutoAllocation")
-    istat = cufftMakePlanMany(plan1,1,decomp_ph%zsz(3),                           &
-                                      decomp_ph%zsz(3),decomp_ph%zsz(1)*decomp_ph%zsz(2),1,     &
-                                      decomp_sp%zsz(3),decomp_sp%zsz(1)*decomp_sp%zsz(2),1,     &
-                                      cufft_type,decomp_ph%zsz(1)*decomp_ph%zsz(2),worksize)
-    if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftMakePlanMany")
-
-  end subroutine r2c_1m_z_plan
-
-  ! Return a cuFFT plan for multiple 1D FFTs in X direction: C2R case
-  subroutine c2r_1m_z_plan(plan1, decomp_sp, decomp_ph, cufft_type, worksize)
-
-    implicit none
-
-    integer*4, intent(OUT) :: plan1
-    TYPE(DECOMP_INFO), intent(IN) :: decomp_ph
-    TYPE(DECOMP_INFO), intent(IN) :: decomp_sp
-    integer, intent(IN) :: cufft_type
-
-    integer :: istat
-    integer(int_ptr_kind()), intent(out) :: worksize
-    integer, pointer :: null_fptr
-    call c_f_pointer( c_null_ptr, null_fptr )
-
-    istat = cufftCreate(plan1)
-    if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftCreate")
-    istat = cufftSetAutoAllocation(plan1,0)
-    if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftSetAutoAllocation")
-    istat = cufftMakePlanMany(plan1,1,decomp_ph%zsz(3),                           &
-                                      decomp_sp%zsz(3),decomp_sp%zsz(1)*decomp_sp%zsz(2),1,     &
-                                      decomp_ph%zsz(3),decomp_ph%zsz(1)*decomp_ph%zsz(2),1,     &
-                                      cufft_type,decomp_ph%zsz(1)*decomp_ph%zsz(2),worksize)
-    if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftMakePlanMany")
-
-  end subroutine c2r_1m_z_plan
+   ! Return a cuFFT plan for multiple 1D FFTs in X direction: C2C case
+   subroutine c2c_1m_x_plan(plan1, decomp, cufft_type, worksize)
+
+      implicit none
+
+      integer*4, intent(OUT) :: plan1
+      TYPE(DECOMP_INFO), intent(IN) :: decomp
+      integer, intent(IN) :: cufft_type
+
+      integer :: istat
+      integer(int_ptr_kind()), intent(out) :: worksize
+      integer, pointer :: null_fptr
+      call c_f_pointer(c_null_ptr, null_fptr)
+
+      istat = cufftCreate(plan1)
+      if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftCreate")
+      istat = cufftSetAutoAllocation(plan1, 0)
+      if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftSetAutoAllocation")
+      istat = cufftMakePlanMany(plan1, 1, decomp%xsz(1), &
+                                decomp%xsz(1), 1, decomp%xsz(1), &
+                                decomp%xsz(1), 1, decomp%xsz(1), &
+                                cufft_type, decomp%xsz(2) * decomp%xsz(3), worksize)
+      if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftMakePlanMany")
+
+   end subroutine c2c_1m_x_plan
+
+   ! Return a cuFFT plan for multiple 1D FFTs in Y direction: C2C case
+   subroutine c2c_1m_y_plan(plan1, decomp, cufft_type, worksize)
+
+      implicit none
+
+      integer*4, intent(OUT) :: plan1
+      TYPE(DECOMP_INFO), intent(IN) :: decomp
+      integer, intent(IN) :: cufft_type
+
+      ! Due to memory pattern of 3D arrays, 1D FFTs along Y have to be
+      ! done one Z-plane at a time. So plan for 2D data sets here.
+      integer :: istat
+      integer(int_ptr_kind()), intent(out) :: worksize
+      integer, pointer :: null_fptr
+      call c_f_pointer(c_null_ptr, null_fptr)
+
+      istat = cufftCreate(plan1)
+      if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftCreate")
+      istat = cufftSetAutoAllocation(plan1, 0)
+      if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftSetAutoAllocation")
+      istat = cufftMakePlanMany(plan1, 1, decomp%ysz(2), &
+                                decomp%ysz(2), decomp%ysz(1), 1, &
+                                decomp%ysz(2), decomp%ysz(1), 1, &
+                                cufft_type, decomp%ysz(1), worksize)
+      if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftMakePlanMany")
+
+   end subroutine c2c_1m_y_plan
+
+   ! Return a cuFFT plan for multiple 1D FFTs in Z direction: C2C case
+   subroutine c2c_1m_z_plan(plan1, decomp, cufft_type, worksize)
+
+      implicit none
+
+      integer*4, intent(OUT) :: plan1
+      TYPE(DECOMP_INFO), intent(IN) :: decomp
+      integer, intent(IN) :: cufft_type
+
+      integer :: istat
+      integer(int_ptr_kind()), intent(out) :: worksize
+      integer, pointer :: null_fptr
+      call c_f_pointer(c_null_ptr, null_fptr)
+
+      istat = cufftCreate(plan1)
+      if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftCreate")
+      istat = cufftSetAutoAllocation(plan1, 0)
+      if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftSetAutoAllocation")
+      istat = cufftMakePlanMany(plan1, 1, decomp%zsz(3), &
+                                decomp%zsz(3), decomp%zsz(1) * decomp%zsz(2), 1, &
+                                decomp%zsz(3), decomp%zsz(1) * decomp%zsz(2), 1, &
+                                cufft_type, decomp%zsz(1) * decomp%zsz(2), worksize)
+      if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftMakePlanMany")
+
+   end subroutine c2c_1m_z_plan
+
+   ! Return a cuFFT plan for multiple 1D FFTs in X direction: R2C case
+   subroutine r2c_1m_x_plan(plan1, decomp_ph, decomp_sp, cufft_type, worksize)
+
+      implicit none
+
+      integer*4, intent(OUT) :: plan1
+      TYPE(DECOMP_INFO), intent(IN) :: decomp_ph
+      TYPE(DECOMP_INFO), intent(IN) :: decomp_sp
+      integer, intent(IN) :: cufft_type
+
+      integer :: istat
+      integer(int_ptr_kind()), intent(out) :: worksize
+      integer, pointer :: null_fptr
+      call c_f_pointer(c_null_ptr, null_fptr)
+
+      istat = cufftCreate(plan1)
+      if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftCreate")
+      istat = cufftSetAutoAllocation(plan1, 0)
+      if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftSetAutoAllocation")
+      istat = cufftMakePlanMany(plan1, 1, decomp_ph%xsz(1), &
+                                decomp_ph%xsz(1), 1, decomp_ph%xsz(1), &
+                                decomp_sp%xsz(1), 1, decomp_sp%xsz(1), &
+                                cufft_type, decomp_ph%xsz(2) * decomp_ph%xsz(3), worksize)
+      if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftMakePlanMany")
+
+   end subroutine r2c_1m_x_plan
+
+   ! Return a cuFFT plan for multiple 1D FFTs in X direction: C2R case
+   subroutine c2r_1m_x_plan(plan1, decomp_sp, decomp_ph, cufft_type, worksize)
+
+      implicit none
+
+      integer*4, intent(OUT) :: plan1
+      TYPE(DECOMP_INFO), intent(IN) :: decomp_ph
+      TYPE(DECOMP_INFO), intent(IN) :: decomp_sp
+      integer, intent(IN) :: cufft_type
+
+      integer :: istat
+      integer(int_ptr_kind()), intent(out) :: worksize
+      integer, pointer :: null_fptr
+      call c_f_pointer(c_null_ptr, null_fptr)
+
+      istat = cufftCreate(plan1)
+      if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftCreate")
+      istat = cufftSetAutoAllocation(plan1, 0)
+      if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftSetAutoAllocation")
+      istat = cufftMakePlanMany(plan1, 1, decomp_ph%xsz(1), &
+                                decomp_sp%xsz(1), 1, decomp_sp%xsz(1), &
+                                decomp_ph%xsz(1), 1, decomp_ph%xsz(1), &
+                                cufft_type, decomp_ph%xsz(2) * decomp_ph%xsz(3), worksize)
+      if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftMakePlanMany")
+
+   end subroutine c2r_1m_x_plan
+
+   ! Return a cuFFT plan for multiple 1D FFTs in X direction: R2C case
+   subroutine r2c_1m_z_plan(plan1, decomp_ph, decomp_sp, cufft_type, worksize)
+
+      implicit none
+
+      integer*4, intent(OUT) :: plan1
+      TYPE(DECOMP_INFO), intent(IN) :: decomp_ph
+      TYPE(DECOMP_INFO), intent(IN) :: decomp_sp
+      integer, intent(IN) :: cufft_type
+
+      integer :: istat
+      integer(int_ptr_kind()), intent(out) :: worksize
+      integer, pointer :: null_fptr
+      call c_f_pointer(c_null_ptr, null_fptr)
+
+      istat = cufftCreate(plan1)
+      if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftCreate")
+      istat = cufftSetAutoAllocation(plan1, 0)
+      if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftSetAutoAllocation")
+      istat = cufftMakePlanMany(plan1, 1, decomp_ph%zsz(3), &
+                                decomp_ph%zsz(3), decomp_ph%zsz(1) * decomp_ph%zsz(2), 1, &
+                                decomp_sp%zsz(3), decomp_sp%zsz(1) * decomp_sp%zsz(2), 1, &
+                                cufft_type, decomp_ph%zsz(1) * decomp_ph%zsz(2), worksize)
+      if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftMakePlanMany")
+
+   end subroutine r2c_1m_z_plan
+
+   ! Return a cuFFT plan for multiple 1D FFTs in X direction: C2R case
+   subroutine c2r_1m_z_plan(plan1, decomp_sp, decomp_ph, cufft_type, worksize)
+
+      implicit none
+
+      integer*4, intent(OUT) :: plan1
+      TYPE(DECOMP_INFO), intent(IN) :: decomp_ph
+      TYPE(DECOMP_INFO), intent(IN) :: decomp_sp
+      integer, intent(IN) :: cufft_type
+
+      integer :: istat
+      integer(int_ptr_kind()), intent(out) :: worksize
+      integer, pointer :: null_fptr
+      call c_f_pointer(c_null_ptr, null_fptr)
+
+      istat = cufftCreate(plan1)
+      if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftCreate")
+      istat = cufftSetAutoAllocation(plan1, 0)
+      if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftSetAutoAllocation")
+      istat = cufftMakePlanMany(plan1, 1, decomp_ph%zsz(3), &
+                                decomp_sp%zsz(3), decomp_sp%zsz(1) * decomp_sp%zsz(2), 1, &
+                                decomp_ph%zsz(3), decomp_ph%zsz(1) * decomp_ph%zsz(2), 1, &
+                                cufft_type, decomp_ph%zsz(1) * decomp_ph%zsz(2), worksize)
+      if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftMakePlanMany")
+
+   end subroutine c2r_1m_z_plan
 
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  !  This routine performs one-time initialisations for the FFT engine
+   !  This routine performs one-time initialisations for the FFT engine
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine init_fft_engine
+   subroutine init_fft_engine
+
+      implicit none
 
-    implicit none
+      !integer*4 :: cufft_ws, ws
+      integer(int_ptr_kind()) :: cufft_ws, ws
+      integer :: i, j, istat
 
-    !integer*4 :: cufft_ws, ws
-    integer(int_ptr_kind()) :: cufft_ws, ws
-    integer   :: i, j, istat
+      call decomp_2d_fft_log("cuFFT")
 
-    if (nrank==0) then
-       write(*,*) ' '
-       write(*,*) '***** Using the New cuFFT engine *****'
-       write(*,*) ' '
-    end if
-    
-     cufft_ws = 0 
+      cufft_ws = 0
 #ifdef DOUBLE_PREC
-    if (format == PHYSICAL_IN_X) then
-       ! For C2C transforms
-       call c2c_1m_x_plan(plan(-1,1), ph, CUFFT_Z2Z,ws)
-       cufft_ws = max (cufft_ws,ws)
-       call c2c_1m_y_plan(plan(-1,2), ph, CUFFT_Z2Z,ws)
-       cufft_ws = max (cufft_ws,ws)
-       call c2c_1m_z_plan(plan(-1,3), ph, CUFFT_Z2Z,ws)
-       cufft_ws = max (cufft_ws,ws)
-       call c2c_1m_z_plan(plan( 1,3), ph, CUFFT_Z2Z,ws)
-       cufft_ws = max (cufft_ws,ws)
-       call c2c_1m_y_plan(plan( 1,2), ph, CUFFT_Z2Z,ws)
-       cufft_ws = max (cufft_ws,ws)
-       call c2c_1m_x_plan(plan( 1,1), ph, CUFFT_Z2Z,ws)
-       cufft_ws = max (cufft_ws,ws)
-       ! For R2C/C2R tranforms
-       call r2c_1m_x_plan(plan(0,1), ph, sp, CUFFT_D2Z,ws)
-       cufft_ws = max (cufft_ws,ws)
-       call c2c_1m_y_plan(plan(0,2), sp, CUFFT_Z2Z,ws)
-       cufft_ws = max (cufft_ws,ws)
-       call c2c_1m_z_plan(plan(0,3), sp, CUFFT_Z2Z,ws)
-       cufft_ws = max (cufft_ws,ws)
-       call c2c_1m_z_plan(plan(2,3), sp, CUFFT_Z2Z,ws)
-       cufft_ws = max (cufft_ws,ws)
-       call c2c_1m_y_plan(plan(2,2), sp, CUFFT_Z2Z,ws)
-       cufft_ws = max (cufft_ws,ws)
-       call c2r_1m_x_plan(plan(2,1), sp, ph, CUFFT_Z2D,ws)
-       cufft_ws = max (cufft_ws,ws)
-
-    else if (format == PHYSICAL_IN_Z) then
-
-       ! For C2C transforms
-       write(*,*) 'Create the plans'
-       call c2c_1m_z_plan(plan(-1,3), ph, CUFFT_Z2Z,ws)
-       cufft_ws = max (cufft_ws,ws)
-       call c2c_1m_y_plan(plan(-1,2), ph, CUFFT_Z2Z,ws)
-       cufft_ws = max (cufft_ws,ws)
-       call c2c_1m_x_plan(plan(-1,1), ph, CUFFT_Z2Z,ws)
-       cufft_ws = max (cufft_ws,ws)
-       call c2c_1m_x_plan(plan( 1,1), ph, CUFFT_Z2Z,ws)
-       cufft_ws = max (cufft_ws,ws)
-       call c2c_1m_y_plan(plan( 1,2), ph, CUFFT_Z2Z,ws)
-       cufft_ws = max (cufft_ws,ws)
-       call c2c_1m_z_plan(plan( 1,3), ph, CUFFT_Z2Z,ws)
-       cufft_ws = max (cufft_ws,ws)
-
-       ! For R2C/C2R tranforms
-       call r2c_1m_z_plan(plan(0,3), ph, sp, CUFFT_D2Z,ws)
-       cufft_ws = max (cufft_ws,ws)
-       call c2c_1m_y_plan(plan(0,2), sp, CUFFT_Z2Z,ws)
-       cufft_ws = max (cufft_ws,ws)
-       call c2c_1m_x_plan(plan(0,1), sp, CUFFT_Z2Z,ws)
-       cufft_ws = max (cufft_ws,ws)
-       call c2c_1m_x_plan(plan(2,1), sp, CUFFT_Z2Z,ws)
-       cufft_ws = max (cufft_ws,ws)
-       call c2c_1m_y_plan(plan(2,2), sp, CUFFT_Z2Z,ws)
-       cufft_ws = max (cufft_ws,ws)
-       call c2r_1m_z_plan(plan(2,3), sp, ph, CUFFT_Z2D,ws)
-       cufft_ws = max (cufft_ws,ws)
-
-    end if
+      if (format == PHYSICAL_IN_X) then
+         ! For C2C transforms
+         call c2c_1m_x_plan(plan(-1, 1), ph, CUFFT_Z2Z, ws)
+         cufft_ws = max(cufft_ws, ws)
+         call c2c_1m_y_plan(plan(-1, 2), ph, CUFFT_Z2Z, ws)
+         cufft_ws = max(cufft_ws, ws)
+         call c2c_1m_z_plan(plan(-1, 3), ph, CUFFT_Z2Z, ws)
+         cufft_ws = max(cufft_ws, ws)
+         call c2c_1m_z_plan(plan(1, 3), ph, CUFFT_Z2Z, ws)
+         cufft_ws = max(cufft_ws, ws)
+         call c2c_1m_y_plan(plan(1, 2), ph, CUFFT_Z2Z, ws)
+         cufft_ws = max(cufft_ws, ws)
+         call c2c_1m_x_plan(plan(1, 1), ph, CUFFT_Z2Z, ws)
+         cufft_ws = max(cufft_ws, ws)
+         ! For R2C/C2R tranforms
+         call r2c_1m_x_plan(plan(0, 1), ph, sp, CUFFT_D2Z, ws)
+         cufft_ws = max(cufft_ws, ws)
+         call c2c_1m_y_plan(plan(0, 2), sp, CUFFT_Z2Z, ws)
+         cufft_ws = max(cufft_ws, ws)
+         call c2c_1m_z_plan(plan(0, 3), sp, CUFFT_Z2Z, ws)
+         cufft_ws = max(cufft_ws, ws)
+         call c2c_1m_z_plan(plan(2, 3), sp, CUFFT_Z2Z, ws)
+         cufft_ws = max(cufft_ws, ws)
+         call c2c_1m_y_plan(plan(2, 2), sp, CUFFT_Z2Z, ws)
+         cufft_ws = max(cufft_ws, ws)
+         call c2r_1m_x_plan(plan(2, 1), sp, ph, CUFFT_Z2D, ws)
+         cufft_ws = max(cufft_ws, ws)
+
+      else if (format == PHYSICAL_IN_Z) then
+
+         ! For C2C transforms
+         call c2c_1m_z_plan(plan(-1, 3), ph, CUFFT_Z2Z, ws)
+         cufft_ws = max(cufft_ws, ws)
+         call c2c_1m_y_plan(plan(-1, 2), ph, CUFFT_Z2Z, ws)
+         cufft_ws = max(cufft_ws, ws)
+         call c2c_1m_x_plan(plan(-1, 1), ph, CUFFT_Z2Z, ws)
+         cufft_ws = max(cufft_ws, ws)
+         call c2c_1m_x_plan(plan(1, 1), ph, CUFFT_Z2Z, ws)
+         cufft_ws = max(cufft_ws, ws)
+         call c2c_1m_y_plan(plan(1, 2), ph, CUFFT_Z2Z, ws)
+         cufft_ws = max(cufft_ws, ws)
+         call c2c_1m_z_plan(plan(1, 3), ph, CUFFT_Z2Z, ws)
+         cufft_ws = max(cufft_ws, ws)
+
+         ! For R2C/C2R tranforms
+         call r2c_1m_z_plan(plan(0, 3), ph, sp, CUFFT_D2Z, ws)
+         cufft_ws = max(cufft_ws, ws)
+         call c2c_1m_y_plan(plan(0, 2), sp, CUFFT_Z2Z, ws)
+         cufft_ws = max(cufft_ws, ws)
+         call c2c_1m_x_plan(plan(0, 1), sp, CUFFT_Z2Z, ws)
+         cufft_ws = max(cufft_ws, ws)
+         call c2c_1m_x_plan(plan(2, 1), sp, CUFFT_Z2Z, ws)
+         cufft_ws = max(cufft_ws, ws)
+         call c2c_1m_y_plan(plan(2, 2), sp, CUFFT_Z2Z, ws)
+         cufft_ws = max(cufft_ws, ws)
+         call c2r_1m_z_plan(plan(2, 3), sp, ph, CUFFT_Z2D, ws)
+         cufft_ws = max(cufft_ws, ws)
+
+      end if
 #else
-    if (format == PHYSICAL_IN_X) then
-       ! For C2C transforms
-       call c2c_1m_x_plan(plan(-1,1), ph, CUFFT_C2C,ws)
-       cufft_ws = max (cufft_ws,ws)
-       call c2c_1m_y_plan(plan(-1,2), ph, CUFFT_C2C,ws)
-       cufft_ws = max (cufft_ws,ws)
-       call c2c_1m_z_plan(plan(-1,3), ph, CUFFT_C2C,ws)
-       cufft_ws = max (cufft_ws,ws)
-       call c2c_1m_z_plan(plan( 1,3), ph, CUFFT_C2C,ws)
-       cufft_ws = max (cufft_ws,ws)
-       call c2c_1m_y_plan(plan( 1,2), ph, CUFFT_C2C,ws)
-       cufft_ws = max (cufft_ws,ws)
-       call c2c_1m_x_plan(plan( 1,1), ph, CUFFT_C2C,ws)
-       cufft_ws = max (cufft_ws,ws)
-       ! For R2C/C2R tranforms
-       call r2c_1m_x_plan(plan(0,1), ph, sp, CUFFT_R2C,ws)
-       cufft_ws = max (cufft_ws,ws)
-       call c2c_1m_y_plan(plan(0,2), sp, CUFFT_C2C,ws)
-       cufft_ws = max (cufft_ws,ws)
-       call c2c_1m_z_plan(plan(0,3), sp, CUFFT_C2C,ws)
-       cufft_ws = max (cufft_ws,ws)
-       call c2c_1m_z_plan(plan(2,3), sp, CUFFT_C2C,ws)
-       cufft_ws = max (cufft_ws,ws)
-       call c2c_1m_y_plan(plan(2,2), sp, CUFFT_C2C,ws)
-       cufft_ws = max (cufft_ws,ws)
-       call c2r_1m_x_plan(plan(2,1), sp, ph, CUFFT_C2R,ws)
-       cufft_ws = max (cufft_ws,ws)
-
-    else if (format == PHYSICAL_IN_Z) then
-
-       ! For C2C transforms
-       write(*,*) 'Create the plans on rank ', nproc
-       call c2c_1m_z_plan(plan(-1,3), ph, CUFFT_C2C,ws)
-       cufft_ws = max (cufft_ws,ws)
-       call c2c_1m_y_plan(plan(-1,2), ph, CUFFT_C2C,ws)
-       cufft_ws = max (cufft_ws,ws)
-       call c2c_1m_x_plan(plan(-1,1), ph, CUFFT_C2C,ws)
-       cufft_ws = max (cufft_ws,ws)
-       call c2c_1m_x_plan(plan( 1,1), ph, CUFFT_C2C,ws)
-       cufft_ws = max (cufft_ws,ws)
-       call c2c_1m_y_plan(plan( 1,2), ph, CUFFT_C2C,ws)
-       cufft_ws = max (cufft_ws,ws)
-       call c2c_1m_z_plan(plan( 1,3), ph, CUFFT_C2C,ws)
-       cufft_ws = max (cufft_ws,ws)
-
-       ! For R2C/C2R tranforms
-       call r2c_1m_z_plan(plan(0,3), ph, sp, CUFFT_R2C,ws)
-       cufft_ws = max (cufft_ws,ws)
-       call c2c_1m_y_plan(plan(0,2), sp, CUFFT_C2C,ws)
-       cufft_ws = max (cufft_ws,ws)
-       call c2c_1m_x_plan(plan(0,1), sp, CUFFT_C2C,ws)
-       cufft_ws = max (cufft_ws,ws)
-       call c2c_1m_x_plan(plan(2,1), sp, CUFFT_C2C,ws)
-       cufft_ws = max (cufft_ws,ws)
-       call c2c_1m_y_plan(plan(2,2), sp, CUFFT_C2C,ws)
-       cufft_ws = max (cufft_ws,ws)
-       call c2r_1m_z_plan(plan(2,3), sp, ph, CUFFT_C2R,ws)
-       cufft_ws = max (cufft_ws,ws)
-
-    end if
+      if (format == PHYSICAL_IN_X) then
+         ! For C2C transforms
+         call c2c_1m_x_plan(plan(-1, 1), ph, CUFFT_C2C, ws)
+         cufft_ws = max(cufft_ws, ws)
+         call c2c_1m_y_plan(plan(-1, 2), ph, CUFFT_C2C, ws)
+         cufft_ws = max(cufft_ws, ws)
+         call c2c_1m_z_plan(plan(-1, 3), ph, CUFFT_C2C, ws)
+         cufft_ws = max(cufft_ws, ws)
+         call c2c_1m_z_plan(plan(1, 3), ph, CUFFT_C2C, ws)
+         cufft_ws = max(cufft_ws, ws)
+         call c2c_1m_y_plan(plan(1, 2), ph, CUFFT_C2C, ws)
+         cufft_ws = max(cufft_ws, ws)
+         call c2c_1m_x_plan(plan(1, 1), ph, CUFFT_C2C, ws)
+         cufft_ws = max(cufft_ws, ws)
+         ! For R2C/C2R tranforms
+         call r2c_1m_x_plan(plan(0, 1), ph, sp, CUFFT_R2C, ws)
+         cufft_ws = max(cufft_ws, ws)
+         call c2c_1m_y_plan(plan(0, 2), sp, CUFFT_C2C, ws)
+         cufft_ws = max(cufft_ws, ws)
+         call c2c_1m_z_plan(plan(0, 3), sp, CUFFT_C2C, ws)
+         cufft_ws = max(cufft_ws, ws)
+         call c2c_1m_z_plan(plan(2, 3), sp, CUFFT_C2C, ws)
+         cufft_ws = max(cufft_ws, ws)
+         call c2c_1m_y_plan(plan(2, 2), sp, CUFFT_C2C, ws)
+         cufft_ws = max(cufft_ws, ws)
+         call c2r_1m_x_plan(plan(2, 1), sp, ph, CUFFT_C2R, ws)
+         cufft_ws = max(cufft_ws, ws)
+
+      else if (format == PHYSICAL_IN_Z) then
+
+         ! For C2C transforms
+         call c2c_1m_z_plan(plan(-1, 3), ph, CUFFT_C2C, ws)
+         cufft_ws = max(cufft_ws, ws)
+         call c2c_1m_y_plan(plan(-1, 2), ph, CUFFT_C2C, ws)
+         cufft_ws = max(cufft_ws, ws)
+         call c2c_1m_x_plan(plan(-1, 1), ph, CUFFT_C2C, ws)
+         cufft_ws = max(cufft_ws, ws)
+         call c2c_1m_x_plan(plan(1, 1), ph, CUFFT_C2C, ws)
+         cufft_ws = max(cufft_ws, ws)
+         call c2c_1m_y_plan(plan(1, 2), ph, CUFFT_C2C, ws)
+         cufft_ws = max(cufft_ws, ws)
+         call c2c_1m_z_plan(plan(1, 3), ph, CUFFT_C2C, ws)
+         cufft_ws = max(cufft_ws, ws)
+
+         ! For R2C/C2R tranforms
+         call r2c_1m_z_plan(plan(0, 3), ph, sp, CUFFT_R2C, ws)
+         cufft_ws = max(cufft_ws, ws)
+         call c2c_1m_y_plan(plan(0, 2), sp, CUFFT_C2C, ws)
+         cufft_ws = max(cufft_ws, ws)
+         call c2c_1m_x_plan(plan(0, 1), sp, CUFFT_C2C, ws)
+         cufft_ws = max(cufft_ws, ws)
+         call c2c_1m_x_plan(plan(2, 1), sp, CUFFT_C2C, ws)
+         cufft_ws = max(cufft_ws, ws)
+         call c2c_1m_y_plan(plan(2, 2), sp, CUFFT_C2C, ws)
+         cufft_ws = max(cufft_ws, ws)
+         call c2r_1m_z_plan(plan(2, 3), sp, ph, CUFFT_C2R, ws)
+         cufft_ws = max(cufft_ws, ws)
+
+      end if
 #endif
-    allocate(cufft_workspace(cufft_ws))
-    do j=1,3
-       do i=-1,2
-         istat = cufftSetWorkArea(plan(i,j),cufft_workspace)
-         if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftSetWorkArea")
-      enddo
-    enddo
-
-  end subroutine init_fft_engine
+      cufft_ws = cufft_ws / sizeof(1._mytype)
+      allocate (cufft_workspace(cufft_ws))
+      do j = 1, 3
+         do i = -1, 2
+            istat = cufftSetWorkArea(plan(i, j), cufft_workspace)
+            if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftSetWorkArea")
+         end do
+      end do
 
+   end subroutine init_fft_engine
 
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  !  This routine performs one-time finalisations for the FFT engine
+   !  This routine performs one-time finalisations for the FFT engine
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine finalize_fft_engine
-
-    implicit none
+   subroutine finalize_fft_engine
 
-    integer :: i,j, istat
+      implicit none
 
-    do j=1,3
-       do i=-1,2
-          istat = cufftDestroy(plan(i,j))
-          if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftDestroy")
-       end do
-    end do
+      integer :: i, j, istat
 
-  end subroutine finalize_fft_engine
+      do j = 1, 3
+         do i = -1, 2
+            istat = cufftDestroy(plan(i, j))
+            if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftDestroy")
+         end do
+      end do
 
+   end subroutine finalize_fft_engine
 
-  ! Following routines calculate multiple one-dimensional FFTs to form 
-  ! the basis of three-dimensional FFTs.
+   ! Following routines calculate multiple one-dimensional FFTs to form
+   ! the basis of three-dimensional FFTs.
 
-  ! c2c transform, multiple 1D FFTs in x direction
-  subroutine c2c_1m_x(inout, isign, plan1)
+   ! c2c transform, multiple 1D FFTs in x direction
+   subroutine c2c_1m_x(inout, isign, plan1)
 
-    implicit none
+      implicit none
 
-    complex(mytype), dimension(:,:,:), intent(INOUT) :: inout
-    integer, intent(IN) :: isign
-    integer*4, intent(IN) :: plan1
+      complex(mytype), dimension(:, :, :), intent(INOUT) :: inout
+      integer, intent(IN) :: isign
+      integer*4, intent(IN) :: plan1
 
-    complex(mytype), dimension(:,:,:), allocatable :: output
-    integer :: istat
+      integer :: istat
 
-    allocate(output,mold=inout)
 #ifdef DOUBLE_PREC
-    !$acc host_data use_device(inout,output)
-    istat = cufftExecZ2Z(plan1, inout, output,isign)
-    !$acc end host_data
+      !$acc host_data use_device(inout)
+      istat = cufftExecZ2Z(plan1, inout, inout, isign)
+      !$acc end host_data
 #else
-    !$acc host_data use_device(inout,output)
-    istat = cufftExecC2C(plan1, inout, output,isign)
-    !$acc end host_data
+      !$acc host_data use_device(inout)
+      istat = cufftExecC2C(plan1, inout, inout, isign)
+      !$acc end host_data
 #endif
-    if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftExecC2C/Z2Z")
-    !$acc kernels
-    inout = output
-    !$acc end kernels 
-    deallocate(output)
+      if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftExecC2C/Z2Z")
 
-  end subroutine c2c_1m_x
+   end subroutine c2c_1m_x
 
+   ! c2c transform, multiple 1D FFTs in y direction
+   subroutine c2c_1m_y(inout, isign, plan1)
 
-  ! c2c transform, multiple 1D FFTs in y direction
-  subroutine c2c_1m_y(inout, isign, plan1)
+      implicit none
 
-    implicit none
+      complex(mytype), dimension(:, :, :), intent(INOUT) :: inout
+      integer, intent(IN) :: isign
+      integer*4, intent(IN) :: plan1
 
-    complex(mytype), dimension(:,:,:), intent(INOUT) :: inout
-    integer, intent(IN) :: isign
-    integer*4, intent(IN) :: plan1
+      integer :: s3, k, istat
 
-    complex(mytype), dimension(:,:,:), allocatable :: output
-    integer :: s3, k, istat
-    
-    allocate(output,mold=inout)
-    ! transform on one Z-plane at a time
-    s3 = size(inout,3)
-    do k=1,s3
+      ! transform on one Z-plane at a time
+      s3 = size(inout, 3)
+      do k = 1, s3
 #ifdef DOUBLE_PREC
-       !$acc host_data use_device(inout,output)
-       istat = cufftExecZ2Z(plan1, inout(:,:,k), output(:,:,k),isign)
-       !$acc end host_data
+         !$acc host_data use_device(inout)
+         istat = cufftExecZ2Z(plan1, inout(:, :, k), inout(:, :, k), isign)
+         !$acc end host_data
 #else
-       !$acc host_data use_device(inout,output)
-       istat = cufftExecC2C(plan1, inout(:,:,k), output(:,:,k),isign)
-       !$acc end host_data
+         !$acc host_data use_device(inout)
+         istat = cufftExecC2C(plan1, inout(:, :, k), inout(:, :, k), isign)
+         !$acc end host_data
 #endif
-       if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftExecC2C/Z2Z")
-    enddo
-    !$acc kernels
-    inout = output
-    !$acc end kernels 
-    deallocate(output)
+         if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftExecC2C/Z2Z")
+      end do
 
-  end subroutine c2c_1m_y
+   end subroutine c2c_1m_y
 
-  ! c2c transform, multiple 1D FFTs in z direction
-  subroutine c2c_1m_z(inout, isign, plan1)
+   ! c2c transform, multiple 1D FFTs in z direction
+   subroutine c2c_1m_z(inout, isign, plan1)
 
-    implicit none
+      implicit none
 
-    complex(mytype), dimension(:,:,:), intent(INOUT) :: inout
-    integer, intent(IN) :: isign
-    integer*4, intent(IN) :: plan1
-    
-    complex(mytype), dimension(:,:,:), allocatable :: output
-    integer :: istat
-
-    allocate(output,mold=inout)
+      complex(mytype), dimension(:, :, :), intent(INOUT) :: inout
+      integer, intent(IN) :: isign
+      integer*4, intent(IN) :: plan1
 
+      integer :: istat
 
 #ifdef DOUBLE_PREC
-    !$acc host_data use_device(inout,output)
-    istat = cufftExecZ2Z(plan1, inout, output,isign)
-    !$acc end host_data
+      !$acc host_data use_device(inout)
+      istat = cufftExecZ2Z(plan1, inout, inout, isign)
+      !$acc end host_data
 #else
-    !$acc host_data use_device(inout,output)
-    istat = cufftExecC2C(plan1, inout, output,isign)
-    !$acc end host_data
+      !$acc host_data use_device(inout)
+      istat = cufftExecC2C(plan1, inout, inout, isign)
+      !$acc end host_data
 #endif
-    if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftExecC2C/Z2Z")
-    !$acc kernels
-    inout = output
-    !$acc end kernels
-    deallocate(output)
-
-  end subroutine c2c_1m_z
+      if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftExecC2C/Z2Z")
 
-  ! r2c transform, multiple 1D FFTs in x direction
-  subroutine r2c_1m_x(input, output)
+   end subroutine c2c_1m_z
 
-    implicit none
+   ! r2c transform, multiple 1D FFTs in x direction
+   subroutine r2c_1m_x(input, output)
 
-    real(mytype), dimension(:,:,:), intent(IN)  ::  input
-    complex(mytype), dimension(:,:,:), intent(OUT) :: output
-    integer :: istat
+      implicit none
 
+      real(mytype), dimension(:, :, :), intent(IN)  ::  input
+      complex(mytype), dimension(:, :, :), intent(OUT) :: output
+      integer :: istat
 
 #ifdef DOUBLE_PREC
-    !$acc host_data use_device(input,output)
-    istat =  cufftExecD2Z(plan(0,1), input, output)
-    !$acc end host_data
+      !$acc host_data use_device(input,output)
+      istat = cufftExecD2Z(plan(0, 1), input, output)
+      !$acc end host_data
 #else
-    !$acc host_data use_device(input,output)
-    istat =  cufftExecR2C(plan(0,1), input, output)
-    !$acc end host_data
-#endif    
-    if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftExecR2C/D2Z")
+      !$acc host_data use_device(input,output)
+      istat = cufftExecR2C(plan(0, 1), input, output)
+      !$acc end host_data
+#endif
+      if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftExecR2C/D2Z")
 
-  end subroutine r2c_1m_x
+   end subroutine r2c_1m_x
 
-  ! r2c transform, multiple 1D FFTs in z direction
-  subroutine r2c_1m_z(input, output)
+   ! r2c transform, multiple 1D FFTs in z direction
+   subroutine r2c_1m_z(input, output)
 
-    implicit none
+      implicit none
 
-    real(mytype), dimension(:,:,:), intent(IN)     ::  input
-    complex(mytype), dimension(:,:,:), intent(OUT) :: output
+      real(mytype), dimension(:, :, :), intent(IN)     ::  input
+      complex(mytype), dimension(:, :, :), intent(OUT) :: output
 
-    integer :: istat
+      integer :: istat
 
 #ifdef DOUBLE_PREC
-    !$acc host_data use_device(input,output)
-    istat = cufftExecD2Z(plan(0,3), input, output)
-    !$acc end host_data
+      !$acc host_data use_device(input,output)
+      istat = cufftExecD2Z(plan(0, 3), input, output)
+      !$acc end host_data
 #else
-    !$acc host_data use_device(input,output)
-    istat = cufftExecR2C(plan(0,3), input, output)
-    !$acc end host_data
+      !$acc host_data use_device(input,output)
+      istat = cufftExecR2C(plan(0, 3), input, output)
+      !$acc end host_data
 #endif
-    if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftExecR2C/D2Z")
+      if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftExecR2C/D2Z")
 
-  end subroutine r2c_1m_z
+   end subroutine r2c_1m_z
 
-  ! c2r transform, multiple 1D FFTs in x direction
-  subroutine c2r_1m_x(input, output)
+   ! c2r transform, multiple 1D FFTs in x direction
+   subroutine c2r_1m_x(input, output)
 
-    implicit none
+      implicit none
 
-    complex(mytype), dimension(:,:,:), intent(IN)  ::  input
-    real(mytype), dimension(:,:,:), intent(OUT)    :: output
+      complex(mytype), dimension(:, :, :), intent(IN)  ::  input
+      real(mytype), dimension(:, :, :), intent(OUT)    :: output
 
-    integer :: istat
+      integer :: istat
 
 #ifdef DOUBLE_PREC
-    !$acc host_data use_device(input,output)
-    istat = cufftExecZ2D(plan(2,1), input, output)
-    !$acc end host_data
+      !$acc host_data use_device(input,output)
+      istat = cufftExecZ2D(plan(2, 1), input, output)
+      !$acc end host_data
 #else
-    !$acc host_data use_device(input,output)
-    istat = cufftExecC2R(plan(2,1), input, output)
-    !$acc end host_data
+      !$acc host_data use_device(input,output)
+      istat = cufftExecC2R(plan(2, 1), input, output)
+      !$acc end host_data
 #endif
-    if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftExecC2R/Z2D")
+      if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftExecC2R/Z2D")
 
-  end subroutine c2r_1m_x
+   end subroutine c2r_1m_x
 
-  ! c2r transform, multiple 1D FFTs in z direction
-  subroutine c2r_1m_z(input, output)
+   ! c2r transform, multiple 1D FFTs in z direction
+   subroutine c2r_1m_z(input, output)
 
-    implicit none
+      implicit none
 
-    complex(mytype), dimension(:,:,:), intent(IN) :: input
-    real(mytype), dimension(:,:,:), intent(OUT) :: output
+      complex(mytype), dimension(:, :, :), intent(IN) :: input
+      real(mytype), dimension(:, :, :), intent(OUT) :: output
 
-    integer :: istat
+      integer :: istat
 
 #ifdef DOUBLE_PREC
-    !$acc host_data use_device(input,output)
-    istat = cufftExecZ2D(plan(2,3), input, output)
-    !$acc end host_data
+      !$acc host_data use_device(input,output)
+      istat = cufftExecZ2D(plan(2, 3), input, output)
+      !$acc end host_data
 #else
-    !$acc host_data use_device(input,output)
-    istat = cufftExecC2R(plan(2,3), input, output)
-    !$acc end host_data
+      !$acc host_data use_device(input,output)
+      istat = cufftExecC2R(plan(2, 3), input, output)
+      !$acc end host_data
 #endif
-    if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftExecC2R/Z2D")
-
-  end subroutine c2r_1m_z
-
+      if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cufftExecC2R/Z2D")
 
+   end subroutine c2r_1m_z
 
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! 3D FFT - complex to complex
+   ! 3D FFT - complex to complex
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine fft_3d_c2c(in, out, isign)
+   subroutine fft_3d_c2c(in, out, isign)
 
-    implicit none
+      implicit none
 
-    complex(mytype), dimension(:,:,:), intent(INOUT) :: in
-    complex(mytype), dimension(:,:,:), intent(OUT) :: out
-    integer, intent(IN) :: isign
+      complex(mytype), dimension(:, :, :), intent(INOUT) :: in
+      complex(mytype), dimension(:, :, :), intent(OUT) :: out
+      integer, intent(IN) :: isign
 
 #ifndef OVERWRITE
-    complex(mytype), allocatable, dimension(:,:,:) :: wk1
+      complex(mytype), allocatable, dimension(:, :, :) :: wk1
 #endif
 
 #ifdef PROFILER
-    if (decomp_profiler_fft) call decomp_profiler_start("fft_c2c")
+      if (decomp_profiler_fft) call decomp_profiler_start("fft_c2c")
 #endif
 
-    if (format==PHYSICAL_IN_X .AND. isign==DECOMP_2D_FFT_FORWARD .OR.  &
-         format==PHYSICAL_IN_Z .AND. isign==DECOMP_2D_FFT_BACKWARD) then
+      !$acc data create(wk2_c2c) present(in,out)
+      if (format == PHYSICAL_IN_X .AND. isign == DECOMP_2D_FFT_FORWARD .OR. &
+          format == PHYSICAL_IN_Z .AND. isign == DECOMP_2D_FFT_BACKWARD) then
 
-       ! ===== 1D FFTs in X =====
+         ! ===== 1D FFTs in X =====
 #ifdef OVERWRITE
-       call c2c_1m_x(in,isign,plan(isign,1))
+         call c2c_1m_x(in, isign, plan(isign, 1))
 #else
-       allocate (wk1(ph%xsz(1),ph%xsz(2),ph%xsz(3)))
-       wk1 = in
-       call c2c_1m_x(wk1,isign,plan(isign,1))
+         allocate (wk1(ph%xsz(1), ph%xsz(2), ph%xsz(3)))
+         !$acc enter data create(wk1) async
+         !$acc wait
+         !$acc kernels default(present)
+         wk1 = in
+         !$acc end kernels
+         call c2c_1m_x(wk1, isign, plan(isign, 1))
 #endif
 
-       ! ===== Swap X --> Y; 1D FFTs in Y =====
+         ! ===== Swap X --> Y; 1D FFTs in Y =====
 
-       if (dims(1)>1) then
+         if (dims(1) > 1) then
 #ifdef OVERWRITE
-          call transpose_x_to_y(in,wk2_c2c,ph)
+            call transpose_x_to_y(in, wk2_c2c, ph)
 #else
-          call transpose_x_to_y(wk1,wk2_c2c,ph)
+            call transpose_x_to_y(wk1, wk2_c2c, ph)
 #endif
-          call c2c_1m_y(wk2_c2c,isign,plan(isign,2))
-       else
+            call c2c_1m_y(wk2_c2c, isign, plan(isign, 2))
+         else
 #ifdef OVERWRITE
-          call c2c_1m_y(in,isign,plan(isign,2))
+            call c2c_1m_y(in, isign, plan(isign, 2))
 #else
-          call c2c_1m_y(wk1,isign,plan(isign,2))
+            call c2c_1m_y(wk1, isign, plan(isign, 2))
 #endif
-       end if
+         end if
 
-       ! ===== Swap Y --> Z; 1D FFTs in Z =====
-       if (dims(1)>1) then
-          call transpose_y_to_z(wk2_c2c,out,ph)
-       else
+         ! ===== Swap Y --> Z; 1D FFTs in Z =====
+         if (dims(1) > 1) then
+            call transpose_y_to_z(wk2_c2c, out, ph)
+         else
 #ifdef OVERWRITE
-          call transpose_y_to_z(in,out,ph)
+            call transpose_y_to_z(in, out, ph)
 #else
-          call transpose_y_to_z(wk1,out,ph)
+            call transpose_y_to_z(wk1, out, ph)
 #endif
-       end if
-       call c2c_1m_z(out,isign,plan(isign,3))
+         end if
+         call c2c_1m_z(out, isign, plan(isign, 3))
 
-    else if (format==PHYSICAL_IN_X .AND. isign==DECOMP_2D_FFT_BACKWARD &
-         .OR. & 
-         format==PHYSICAL_IN_Z .AND. isign==DECOMP_2D_FFT_FORWARD) then
+      else if (format == PHYSICAL_IN_X .AND. isign == DECOMP_2D_FFT_BACKWARD &
+               .OR. &
+               format == PHYSICAL_IN_Z .AND. isign == DECOMP_2D_FFT_FORWARD) then
 
-       ! ===== 1D FFTs in Z =====
+         ! ===== 1D FFTs in Z =====
 #ifdef OVERWRITE
-       call c2c_1m_z(in,isign,plan(isign,3))
+         call c2c_1m_z(in, isign, plan(isign, 3))
 #else
-       allocate (wk1(ph%zsz(1),ph%zsz(2),ph%zsz(3)))
-       wk1 = in
-       call c2c_1m_z(wk1,isign,plan(isign,3))
+         allocate (wk1(ph%zsz(1), ph%zsz(2), ph%zsz(3)))
+         !$acc enter data create(wk1) async
+         !$acc wait
+         !$acc kernels default(present)
+         wk1 = in
+         !$acc end kernels
+         call c2c_1m_z(wk1, isign, plan(isign, 3))
 #endif
 
-       ! ===== Swap Z --> Y; 1D FFTs in Y =====
-       if (dims(1)>1) then
+         ! ===== Swap Z --> Y; 1D FFTs in Y =====
+         if (dims(1) > 1) then
 #ifdef OVERWRITE
-          call transpose_z_to_y(in,wk2_c2c,ph)
+            call transpose_z_to_y(in, wk2_c2c, ph)
 #else
-          call transpose_z_to_y(wk1,wk2_c2c,ph)
+            call transpose_z_to_y(wk1, wk2_c2c, ph)
 #endif
-          call c2c_1m_y(wk2_c2c,isign,plan(isign,2))
-       else  ! out==wk2_c2c if 1D decomposition
+            call c2c_1m_y(wk2_c2c, isign, plan(isign, 2))
+         else  ! out==wk2_c2c if 1D decomposition
 #ifdef OVERWRITE
-          call transpose_z_to_y(in,out,ph)
+            call transpose_z_to_y(in, out, ph)
 #else
-          call transpose_z_to_y(wk1,out,ph)
+            call transpose_z_to_y(wk1, out, ph)
 #endif
-          call c2c_1m_y(out,isign,plan(isign,2))
-       end if
+            call c2c_1m_y(out, isign, plan(isign, 2))
+         end if
 
-       ! ===== Swap Y --> X; 1D FFTs in X =====
-       if (dims(1)>1) then
-          call transpose_y_to_x(wk2_c2c,out,ph)
-       end if
-       call c2c_1m_x(out,isign,plan(isign,1))
+         ! ===== Swap Y --> X; 1D FFTs in X =====
+         if (dims(1) > 1) then
+            call transpose_y_to_x(wk2_c2c, out, ph)
+         end if
+         call c2c_1m_x(out, isign, plan(isign, 1))
 
-    end if
+      end if
 
 #ifndef OVERWRITE
-    deallocate (wk1)
+      !$acc exit data delete(wk1) async
+      !$acc wait
+      deallocate (wk1)
 #endif
+      !$acc end data
 
 #ifdef PROFILER
-    if (decomp_profiler_fft) call decomp_profiler_end("fft_c2c")
+      if (decomp_profiler_fft) call decomp_profiler_end("fft_c2c")
 #endif
 
-  end subroutine fft_3d_c2c
-
+   end subroutine fft_3d_c2c
 
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! 3D forward FFT - real to complex
+   ! 3D forward FFT - real to complex
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine fft_3d_r2c(in_r, out_c)
-    !use nvtx
-    implicit none
+   subroutine fft_3d_r2c(in_r, out_c)
+      !use nvtx
+      implicit none
 
-    real(mytype), dimension(:,:,:), intent(IN) :: in_r
-    complex(mytype), dimension(:,:,:), intent(OUT) :: out_c
-    integer :: i, j ,k
-    integer, dimension(3) :: dim3d
+      real(mytype), dimension(:, :, :), intent(IN) :: in_r
+      complex(mytype), dimension(:, :, :), intent(OUT) :: out_c
+      integer :: i, j, k
+      integer, dimension(3) :: dim3d
 
 #ifdef PROFILER
-    if (decomp_profiler_fft) call decomp_profiler_start("fft_r2c")
+      if (decomp_profiler_fft) call decomp_profiler_start("fft_r2c")
 #endif
 
-    if (format==PHYSICAL_IN_X) then
+      !$acc data create(wk13,wk2_r2c) present(in_r,out_c)
+      if (format == PHYSICAL_IN_X) then
 
-       ! ===== 1D FFTs in X =====
-       call r2c_1m_x(in_r,wk13)
+         ! ===== 1D FFTs in X =====
+         call r2c_1m_x(in_r, wk13)
 
-       ! ===== Swap X --> Y; 1D FFTs in Y =====
-       if (dims(1)>1) then
-          call transpose_x_to_y(wk13,wk2_r2c,sp)
-          call c2c_1m_y(wk2_r2c,-1,plan(0,2))
-       else
-          call c2c_1m_y(wk13,-1,plan(0,2))
-       end if
+         ! ===== Swap X --> Y; 1D FFTs in Y =====
+         if (dims(1) > 1) then
+            call transpose_x_to_y(wk13, wk2_r2c, sp)
+            call c2c_1m_y(wk2_r2c, -1, plan(0, 2))
+         else
+            call c2c_1m_y(wk13, -1, plan(0, 2))
+         end if
 
-       ! ===== Swap Y --> Z; 1D FFTs in Z =====
-       if (dims(1)>1) then
-          call transpose_y_to_z(wk2_r2c,out_c,sp)
-       else
-          call transpose_y_to_z(wk13,out_c,sp)
-       end if
-       call c2c_1m_z(out_c,-1,plan(0,3))
+         ! ===== Swap Y --> Z; 1D FFTs in Z =====
+         if (dims(1) > 1) then
+            call transpose_y_to_z(wk2_r2c, out_c, sp)
+         else
+            call transpose_y_to_z(wk13, out_c, sp)
+         end if
+         call c2c_1m_z(out_c, -1, plan(0, 3))
 
-    else if (format==PHYSICAL_IN_Z) then
+      else if (format == PHYSICAL_IN_Z) then
 
-       ! ===== 1D FFTs in Z =====
-       !call nvtxStartRange("Z r2c_1m_z")
+         ! ===== 1D FFTs in Z =====
+         !call nvtxStartRange("Z r2c_1m_z")
 #ifdef DEBUG
-       dim3d = shape(in_r)
-       do k = 1, dim3d(3),dim3d(3)/8
-         do j = 1, dim3d(2),dim3d(2)/8
-            do i = 1, dim3d(1),dim3d(1)/8
-                  print "(i3,1x,i3,1x,i3,1x,e12.5,1x,e12.5)", i, j, k, real(in_r(i,j,k))
+         dim3d = shape(in_r)
+         do k = 1, dim3d(3), dim3d(3) / 8
+            do j = 1, dim3d(2), dim3d(2) / 8
+               do i = 1, dim3d(1), dim3d(1) / 8
+                  print "(i3,1x,i3,1x,i3,1x,e12.5,1x,e12.5)", i, j, k, real(in_r(i, j, k))
+               end do
             end do
          end do
-       end do
 #endif
-       call r2c_1m_z(in_r,wk13)
-
-       ! ===== Swap Z --> Y; 1D FFTs in Y =====
-       if (dims(1)>1) then
-          !call nvtxStartRange("Z1 transpose_z_to_y")
-          call transpose_z_to_y(wk13,wk2_r2c,sp)
-          !call nvtxEndRange
-          !call nvtxStartRange("Z1 c2c_1m_y")
-          call c2c_1m_y(wk2_r2c,-1,plan(0,2))
-          !call nvtxEndRange
+         call r2c_1m_z(in_r, wk13)
+
+         ! ===== Swap Z --> Y; 1D FFTs in Y =====
+         if (dims(1) > 1) then
+            !call nvtxStartRange("Z1 transpose_z_to_y")
+            call transpose_z_to_y(wk13, wk2_r2c, sp)
+            !call nvtxEndRange
+            !call nvtxStartRange("Z1 c2c_1m_y")
+            call c2c_1m_y(wk2_r2c, -1, plan(0, 2))
+            !call nvtxEndRange
 #ifdef DEBUG
-          write(*,*) 'c2c_1m_y'
-          dim3d = shape(wk2_r2c)
-          do k = 1, dim3d(3),dim3d(3)/8
-            do j = 1, dim3d(2),dim3d(2)/8
-               do i = 1, dim3d(1),dim3d(1)/8
-                     print "(i3,1x,i3,1x,i3,1x,e12.5,1x,e12.5)", i, j, k, real(wk2_r2c(i,j,k)),&
-                                aimag(wk2_r2c(i,j,k))
+            write (*, *) 'c2c_1m_y'
+            dim3d = shape(wk2_r2c)
+            do k = 1, dim3d(3), dim3d(3) / 8
+               do j = 1, dim3d(2), dim3d(2) / 8
+                  do i = 1, dim3d(1), dim3d(1) / 8
+                     print "(i3,1x,i3,1x,i3,1x,e12.5,1x,e12.5)", i, j, k, real(wk2_r2c(i, j, k)), &
+                        aimag(wk2_r2c(i, j, k))
+                  end do
                end do
             end do
-          end do
-          write(*,*)
-          write(*,*)
+            write (*, *)
+            write (*, *)
 #endif
-       else  ! out_c==wk2_r2c if 1D decomposition
-          !call nvtxStartRange("Z transpose_z_to_y")
-          call transpose_z_to_y(wk13,out_c,sp)
-          !call nvtxEndRange
-          !call nvtxStartRange("Z c2c_1m_y")
-          call c2c_1m_y(out_c,-1,plan(0,2))
-          !call nvtxEndRange
+         else  ! out_c==wk2_r2c if 1D decomposition
+            !call nvtxStartRange("Z transpose_z_to_y")
+            call transpose_z_to_y(wk13, out_c, sp)
+            !call nvtxEndRange
+            !call nvtxStartRange("Z c2c_1m_y")
+            call c2c_1m_y(out_c, -1, plan(0, 2))
+            !call nvtxEndRange
 #ifdef DEBUG
-          write(*,*) 'c2c_1m_y2'
-          dim3d = shape(out_c)
-          do k = 1, dim3d(3),dim3d(3)/8
-            do j = 1, dim3d(2),dim3d(2)/8
-               do i = 1, dim3d(1),dim3d(1)/8
-                     print "(i3,1x,i3,1x,i3,1x,e12.5,1x,e12.5)", i, j, k, real(out_c(i,j,k)),&
-                                aimag(out_c(i,j,k))
+            write (*, *) 'c2c_1m_y2'
+            dim3d = shape(out_c)
+            do k = 1, dim3d(3), dim3d(3) / 8
+               do j = 1, dim3d(2), dim3d(2) / 8
+                  do i = 1, dim3d(1), dim3d(1) / 8
+                     print "(i3,1x,i3,1x,i3,1x,e12.5,1x,e12.5)", i, j, k, real(out_c(i, j, k)), &
+                        aimag(out_c(i, j, k))
+                  end do
                end do
             end do
-          end do
-          write(*,*)
-          write(*,*)
+            write (*, *)
+            write (*, *)
 #endif
-       end if
-
-       ! ===== Swap Y --> X; 1D FFTs in X =====
-       if (dims(1)>1) then
-          !call nvtxStartRange("Z1 transpose_y_to_x")
-          call transpose_y_to_x(wk2_r2c,out_c,sp)
-          !call nvtxEndRange
-       end if
-       !call nvtxStartRange("c2c_1m_x")
-       call c2c_1m_x(out_c,-1,plan(0,1))
-       !call nvtxEndRange
+         end if
+
+         ! ===== Swap Y --> X; 1D FFTs in X =====
+         if (dims(1) > 1) then
+            !call nvtxStartRange("Z1 transpose_y_to_x")
+            call transpose_y_to_x(wk2_r2c, out_c, sp)
+            !call nvtxEndRange
+         end if
+         !call nvtxStartRange("c2c_1m_x")
+         call c2c_1m_x(out_c, -1, plan(0, 1))
+         !call nvtxEndRange
 #ifdef DEBUG
-       write(*,*) 'c2c_1m_x'
-       dim3d = shape(out_c)
-       do k = 1, dim3d(3),dim3d(3)/8
-         do j = 1, dim3d(2),dim3d(2)/8
-            do i = 1, dim3d(1),dim3d(1)/8
-                  print "(i3,1x,i3,1x,i3,1x,e12.5,1x,e12.5)", i, j, k, real(out_c(i,j,k)),&
-                             aimag(out_c(i,j,k))
+         write (*, *) 'c2c_1m_x'
+         dim3d = shape(out_c)
+         do k = 1, dim3d(3), dim3d(3) / 8
+            do j = 1, dim3d(2), dim3d(2) / 8
+               do i = 1, dim3d(1), dim3d(1) / 8
+                  print "(i3,1x,i3,1x,i3,1x,e12.5,1x,e12.5)", i, j, k, real(out_c(i, j, k)), &
+                     aimag(out_c(i, j, k))
+               end do
             end do
          end do
-       end do
-       write(*,*)
-       write(*,*)
+         write (*, *)
+         write (*, *)
 #endif
-    end if
+      end if
+      !$acc end data
 
 #ifdef PROFILER
-    if (decomp_profiler_fft) call decomp_profiler_end("fft_r2c")
+      if (decomp_profiler_fft) call decomp_profiler_end("fft_r2c")
 #endif
 
-  end subroutine fft_3d_r2c
-
+   end subroutine fft_3d_r2c
 
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! 3D inverse FFT - complex to real
+   ! 3D inverse FFT - complex to real
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine fft_3d_c2r(in_c, out_r)
+   subroutine fft_3d_c2r(in_c, out_r)
 
-    implicit none
+      implicit none
 
-    complex(mytype), dimension(:,:,:), intent(INOUT) :: in_c
-    real(mytype), dimension(:,:,:), intent(OUT) :: out_r
-    integer :: i,j,k
-    integer, dimension(3) :: dim3d
+      complex(mytype), dimension(:, :, :), intent(INOUT) :: in_c
+      real(mytype), dimension(:, :, :), intent(OUT) :: out_r
+      integer :: i, j, k
+      integer, dimension(3) :: dim3d
 #ifndef OVERWRITE
-    complex(mytype), allocatable, dimension(:,:,:) :: wk1
+      complex(mytype), allocatable, dimension(:, :, :) :: wk1
 #endif
 
 #ifdef PROFILER
-    if (decomp_profiler_fft) call decomp_profiler_start("fft_c2r")
+      if (decomp_profiler_fft) call decomp_profiler_start("fft_c2r")
 #endif
 
-    if (format==PHYSICAL_IN_X) then
+      !$acc data create(wk2_r2c,wk13) present(in_c,out_r)
+      if (format == PHYSICAL_IN_X) then
 
-       ! ===== 1D FFTs in Z =====
+         ! ===== 1D FFTs in Z =====
 #ifdef OVERWRITE
-       call c2c_1m_z(in_c,1,plan(2,3))       
+         call c2c_1m_z(in_c, 1, plan(2, 3))
 #else
-       allocate (wk1(sp%zsz(1),sp%zsz(2),sp%zsz(3)))
-       wk1 = in_c
-       call c2c_1m_z(wk1,1,plan(2,3))
+         allocate (wk1(sp%zsz(1), sp%zsz(2), sp%zsz(3)))
+         !$acc enter data create(wk1) async
+         !$acc wait
+         !$acc kernels default(present)
+         wk1 = in_c
+         !$acc end kernels
+         call c2c_1m_z(wk1, 1, plan(2, 3))
 #endif
 
-       ! ===== Swap Z --> Y; 1D FFTs in Y =====
+         ! ===== Swap Z --> Y; 1D FFTs in Y =====
 #ifdef OVERWRITE
-       call transpose_z_to_y(in_c,wk2_r2c,sp)
+         call transpose_z_to_y(in_c, wk2_r2c, sp)
 #else
-       call transpose_z_to_y(wk1,wk2_r2c,sp)
+         call transpose_z_to_y(wk1, wk2_r2c, sp)
 #endif
-       call c2c_1m_y(wk2_r2c,1,plan(2,2))
+         call c2c_1m_y(wk2_r2c, 1, plan(2, 2))
 
-       ! ===== Swap Y --> X; 1D FFTs in X =====
-       if (dims(1)>1) then
-          call transpose_y_to_x(wk2_r2c,wk13,sp)
-          call c2r_1m_x(wk13,out_r)
-       else
-          call c2r_1m_x(wk2_r2c,out_r)
-       end if
+         ! ===== Swap Y --> X; 1D FFTs in X =====
+         if (dims(1) > 1) then
+            call transpose_y_to_x(wk2_r2c, wk13, sp)
+            call c2r_1m_x(wk13, out_r)
+         else
+            call c2r_1m_x(wk2_r2c, out_r)
+         end if
 
-    else if (format==PHYSICAL_IN_Z) then
+      else if (format == PHYSICAL_IN_Z) then
 #ifdef DEBUG
-       write(*,*) 'Back Init c2c_1m_x line 788'
-       dim3d = shape(in_c)
-       do k = 1, dim3d(3),dim3d(3)/8
-         do j = 1, dim3d(2),dim3d(2)/8
-            do i = 1, dim3d(1),dim3d(1)/8
-                  print "(i3,1x,i3,1x,i3,1x,e12.5,1x,e12.5)", i, j, k, real(in_c(i,j,k)),&
-                             aimag(in_c(i,j,k))
+         write (*, *) 'Back Init c2c_1m_x line 788'
+         dim3d = shape(in_c)
+         do k = 1, dim3d(3), dim3d(3) / 8
+            do j = 1, dim3d(2), dim3d(2) / 8
+               do i = 1, dim3d(1), dim3d(1) / 8
+                  print "(i3,1x,i3,1x,i3,1x,e12.5,1x,e12.5)", i, j, k, real(in_c(i, j, k)), &
+                     aimag(in_c(i, j, k))
+               end do
             end do
          end do
-       end do
-       write(*,*)
-       write(*,*)
+         write (*, *)
+         write (*, *)
 #endif
-       ! ===== 1D FFTs in X =====
+         ! ===== 1D FFTs in X =====
 #ifdef OVERWRITE
-       call c2c_1m_x(in_c,1,plan(2,1))
+         call c2c_1m_x(in_c, 1, plan(2, 1))
 #ifdef DEBUG
-       write(*,*) 'Back c2c_1m_x overwrite line 804'
-       dim3d = shape(in_c)
-       do k = 1, dim3d(3),dim3d(3)/8
-         do j = 1, dim3d(2),dim3d(2)/8
-            do i = 1, dim3d(1),dim3d(1)/8
-                  print "(i3,1x,i3,1x,i3,1x,e12.5,1x,e12.5)", i, j, k, real(in_c(i,j,k)),&
-                             aimag(in_c(i,j,k))
+         write (*, *) 'Back c2c_1m_x overwrite line 804'
+         dim3d = shape(in_c)
+         do k = 1, dim3d(3), dim3d(3) / 8
+            do j = 1, dim3d(2), dim3d(2) / 8
+               do i = 1, dim3d(1), dim3d(1) / 8
+                  print "(i3,1x,i3,1x,i3,1x,e12.5,1x,e12.5)", i, j, k, real(in_c(i, j, k)), &
+                     aimag(in_c(i, j, k))
+               end do
             end do
          end do
-       end do
-       write(*,*)
-       write(*,*)
+         write (*, *)
+         write (*, *)
 #endif
 #else
-       allocate(wk1(sp%xsz(1),sp%xsz(2),sp%xsz(3)))
-       wk1 = in_c
-       call c2c_1m_x(wk1,1,plan(2,1))
+         allocate (wk1(sp%xsz(1), sp%xsz(2), sp%xsz(3)))
+         !$acc enter data create(wk1) async
+         !$acc wait
+         !$acc kernels default(present)
+         wk1 = in_c
+         !$acc end kernels
+         call c2c_1m_x(wk1, 1, plan(2, 1))
 #ifdef DEBUG
-       write(*,*) 'Back2 c2c_1m_x line 821'
-       dim3d = shape(wk1)
-       do k = 1, dim3d(3),dim3d(1)/8
-         do j = 1, dim3d(2),dim3d(2)/8
-            do i = 1, dim3d(1),dim3d(1)/8
-                  print "(i3,1x,i3,1x,i3,1x,e12.5,1x,e12.5)", i, j, k, real(wk1(i,j,k)),&
-                             aimag(wk1(i,j,k))
+         write (*, *) 'Back2 c2c_1m_x line 821'
+         dim3d = shape(wk1)
+         do k = 1, dim3d(3), dim3d(1) / 8
+            do j = 1, dim3d(2), dim3d(2) / 8
+               do i = 1, dim3d(1), dim3d(1) / 8
+                  print "(i3,1x,i3,1x,i3,1x,e12.5,1x,e12.5)", i, j, k, real(wk1(i, j, k)), &
+                     aimag(wk1(i, j, k))
+               end do
             end do
          end do
-       end do
-       write(*,*)
-       write(*,*)
+         write (*, *)
+         write (*, *)
 #endif
 #endif
 
-       ! ===== Swap X --> Y; 1D FFTs in Y =====
-       if (dims(1)>1) then
+         ! ===== Swap X --> Y; 1D FFTs in Y =====
+         if (dims(1) > 1) then
 #ifdef OVERWRITE
-          call transpose_x_to_y(in_c,wk2_r2c,sp)
+            call transpose_x_to_y(in_c, wk2_r2c, sp)
 #else
-          call transpose_x_to_y(wk1,wk2_r2c,sp)
+            call transpose_x_to_y(wk1, wk2_r2c, sp)
 #endif
-          call c2c_1m_y(wk2_r2c,1,plan(2,2))
+            call c2c_1m_y(wk2_r2c, 1, plan(2, 2))
 #ifdef DEBUG
-          write(*,*) 'Back c2c_1m_y line 844'
-          dim3d = shape(wk2_r2c)
-          do k = 1, dim3d(3),dim3d(3)/8
-            do j = 1, dim3d(2),dim3d(2)/8
-               do i = 1, dim3d(1),dim3d(1)/8
-                     print "(i3,1x,i3,1x,i3,1x,e12.5,1x,e12.5)", i, j, k, real(wk2_r2c(i,j,k)),&
-                                aimag(wk2_r2c(i,j,k))
+            write (*, *) 'Back c2c_1m_y line 844'
+            dim3d = shape(wk2_r2c)
+            do k = 1, dim3d(3), dim3d(3) / 8
+               do j = 1, dim3d(2), dim3d(2) / 8
+                  do i = 1, dim3d(1), dim3d(1) / 8
+                     print "(i3,1x,i3,1x,i3,1x,e12.5,1x,e12.5)", i, j, k, real(wk2_r2c(i, j, k)), &
+                        aimag(wk2_r2c(i, j, k))
+                  end do
                end do
             end do
-          end do
-          write(*,*)
-          write(*,*)
+            write (*, *)
+            write (*, *)
 #endif
-       else  ! in_c==wk2_r2c if 1D decomposition
+         else  ! in_c==wk2_r2c if 1D decomposition
 #ifdef OVERWRITE
-          call c2c_1m_y(in_c,1,plan(2,2))
+            call c2c_1m_y(in_c, 1, plan(2, 2))
 #ifdef DEBUG
-          write(*,*) 'Back2 c2c_1m_y line 860'
-          dim3d = shape(in_c)
-          do k = 1, dim3d(3),dim3d(3)/8
-            do j = 1, dim3d(2),dim3d(2)/8
-               do i = 1, dim3d(1),dim3d(1)/8
-                     print "(i3,1x,i3,1x,i3,1x,e12.5,1x,e12.5)", i, j, k, real(in_c(i,j,k)),&
-                                aimag(in_c(i,j,k))
+            write (*, *) 'Back2 c2c_1m_y line 860'
+            dim3d = shape(in_c)
+            do k = 1, dim3d(3), dim3d(3) / 8
+               do j = 1, dim3d(2), dim3d(2) / 8
+                  do i = 1, dim3d(1), dim3d(1) / 8
+                     print "(i3,1x,i3,1x,i3,1x,e12.5,1x,e12.5)", i, j, k, real(in_c(i, j, k)), &
+                        aimag(in_c(i, j, k))
+                  end do
                end do
             end do
-          end do
-          write(*,*)
-          write(*,*)
+            write (*, *)
+            write (*, *)
 #endif
 #else
-          call c2c_1m_y(wk1,1,plan(2,2))
+            call c2c_1m_y(wk1, 1, plan(2, 2))
 #ifdef DEBUG
-          write(*,*) 'Back3 c2c_1m_y line 875'
-          dim3d = shape(wk1)
-          do k = 1, dim3d(3),dim3d(3)/8
-            do j = 1, dim3d(2),dim3d(2)/8
-               do i = 1, dim3d(1),dim3d(1)/8
-                     print "(i3,1x,i3,1x,i3,1x,e12.5,1x,e12.5)", i, j, k, real(wk1(i,j,k)),&
-                                aimag(wk1(i,j,k))
+            write (*, *) 'Back3 c2c_1m_y line 875'
+            dim3d = shape(wk1)
+            do k = 1, dim3d(3), dim3d(3) / 8
+               do j = 1, dim3d(2), dim3d(2) / 8
+                  do i = 1, dim3d(1), dim3d(1) / 8
+                     print "(i3,1x,i3,1x,i3,1x,e12.5,1x,e12.5)", i, j, k, real(wk1(i, j, k)), &
+                        aimag(wk1(i, j, k))
+                  end do
                end do
             end do
-          end do
-          write(*,*)
-          write(*,*)
+            write (*, *)
+            write (*, *)
 #endif
 #endif
-       end if
+         end if
 
-       ! ===== Swap Y --> Z; 1D FFTs in Z =====
-       if (dims(1)>1) then
-          call transpose_y_to_z(wk2_r2c,wk13,sp)
-       else
+         ! ===== Swap Y --> Z; 1D FFTs in Z =====
+         if (dims(1) > 1) then
+            call transpose_y_to_z(wk2_r2c, wk13, sp)
+         else
 #ifdef OVERWRITE
-          call transpose_y_to_z(in_c,wk13,sp)
+            call transpose_y_to_z(in_c, wk13, sp)
 #else
-          call transpose_y_to_z(wk1,wk13,sp)
+            call transpose_y_to_z(wk1, wk13, sp)
 #endif
-       end if
+         end if
 #ifdef DEBUG
-       write(*,*) 'Back2 after tr_y2z'
-       dim3d = shape(wk13)
-       do k = 1, dim3d(3),dim3d(3)/8
-         do j = 1, dim3d(2),dim3d(2)/8
-            do i = 1, dim3d(1),dim3d(1)/8
-              print "(i3,1x,i3,1x,i3,1x,e12.5,1x,e12.5)", i, j, k, real(wk13(i,j,k)),&
-                               aimag(wk13(i,j,k))
+         write (*, *) 'Back2 after tr_y2z'
+         dim3d = shape(wk13)
+         do k = 1, dim3d(3), dim3d(3) / 8
+            do j = 1, dim3d(2), dim3d(2) / 8
+               do i = 1, dim3d(1), dim3d(1) / 8
+                  print "(i3,1x,i3,1x,i3,1x,e12.5,1x,e12.5)", i, j, k, real(wk13(i, j, k)), &
+                     aimag(wk13(i, j, k))
+               end do
             end do
          end do
-       end do
-       write(*,*)
-       write(*,*)
+         write (*, *)
+         write (*, *)
 #endif
-       call c2r_1m_z(wk13,out_r)
+         call c2r_1m_z(wk13, out_r)
 #ifdef DEBUG
-       write(*,*) 'Back2 c2r_1m_z out_r line 902'
-       dim3d = shape(out_r)
-       do k = 1, dim3d(3),dim3d(3)/8
-         do j = 1, dim3d(2),dim3d(2)/8
-            do i = 1, dim3d(1),dim3d(1)/8
-                  print "(i3,1x,i3,1x,i3,1x,e12.5,1x,e12.5)", i, j, k, real(out_r(i,j,k))
+         write (*, *) 'Back2 c2r_1m_z out_r line 902'
+         dim3d = shape(out_r)
+         do k = 1, dim3d(3), dim3d(3) / 8
+            do j = 1, dim3d(2), dim3d(2) / 8
+               do i = 1, dim3d(1), dim3d(1) / 8
+                  print "(i3,1x,i3,1x,i3,1x,e12.5,1x,e12.5)", i, j, k, real(out_r(i, j, k))
+               end do
             end do
          end do
-       end do
-       write(*,*)
-       write(*,*)
+         write (*, *)
+         write (*, *)
 #endif
 
-    end if
+      end if
 
 #ifndef OVERWRITE
-    deallocate (wk1)
+      !$acc exit data delete(wk1) async
+      !$acc wait
+      deallocate (wk1)
 #endif
+      !$acc end data
 
 #ifdef PROFILER
-    if (decomp_profiler_fft) call decomp_profiler_end("fft_c2r")
+      if (decomp_profiler_fft) call decomp_profiler_end("fft_c2r")
 #endif
 
-  end subroutine fft_3d_c2r
-
+   end subroutine fft_3d_c2r
 
 end module decomp_2d_fft
 
diff --git a/src/fft_fftw3.f90 b/src/fft_fftw3.f90
index 3ee08276..9c1284ef 100644
--- a/src/fft_fftw3.f90
+++ b/src/fft_fftw3.f90
@@ -1,746 +1,708 @@
-!=======================================================================
-! This is part of the 2DECOMP&FFT library
-!
-! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil)
-! decomposition. It also implements a highly scalable distributed
-! three-dimensional Fast Fourier Transform (FFT).
-!
-! Copyright (C) 2009-2011 Ning Li, the Numerical Algorithms Group (NAG)
-!
-!=======================================================================
+!! SPDX-License-Identifier: BSD-3-Clause
 
 ! This is the FFTW (version 3.x) implementation of the FFT library
 
 module decomp_2d_fft
 
-  use decomp_2d  ! 2D decomposition module
-  use iso_c_binding
+   use decomp_2d_constants
+   use decomp_2d_mpi
+   use decomp_2d  ! 2D decomposition module
+   use iso_c_binding
 
-  implicit none
+   implicit none
 
-  include "fftw3.f"
+   include "fftw3.f"
 
-  private        ! Make everything private unless declared public
+   private        ! Make everything private unless declared public
 
-  ! engine-specific global variables
-  integer, save :: plan_type = FFTW_MEASURE
+   ! engine-specific global variables
+   integer, save :: plan_type = FFTW_MEASURE
 
-  ! FFTW plans
-  ! j=1,2,3 corresponds to the 1D FFTs in X,Y,Z direction, respectively
-  ! For c2c transforms:
-  !     use plan(-1,j) for  forward transform;
-  !     use plan( 1,j) for backward transform;
-  ! For r2c/c2r transforms:
-  !     use plan(0,j) for r2c transforms;
-  !     use plan(2,j) for c2r transforms;
-  type(C_PTR), save, public :: plan(-1:2,3)
+   ! FFTW plans
+   ! j=1,2,3 corresponds to the 1D FFTs in X,Y,Z direction, respectively
+   ! For c2c transforms:
+   !     use plan(-1,j) for  forward transform;
+   !     use plan( 1,j) for backward transform;
+   ! For r2c/c2r transforms:
+   !     use plan(0,j) for r2c transforms;
+   !     use plan(2,j) for c2r transforms;
+   type(C_PTR), save :: plan(-1:2, 3)
 
-  ! This is defined in fftw3.f03 but not in fftw3.f
-  interface
-    subroutine fftw_cleanup() bind(C, name='fftw_cleanup')
-      import
-    end subroutine fftw_cleanup
-  end interface
+   ! This is defined in fftw3.f03 but not in fftw3.f
+   interface
+      subroutine fftw_cleanup() bind(C, name='fftw_cleanup')
+         import
+      end subroutine fftw_cleanup
+   end interface
 
-  public :: r2c_1m_x, c2c_1m_y, c2r_1m_x
-  ! common code used for all engines, including global variables,
-  ! generic interface definitions and several subroutines
+   integer, parameter, public :: D2D_FFT_BACKEND = D2D_FFT_BACKEND_FFTW3
+
+   ! common code used for all engines, including global variables,
+   ! generic interface definitions and several subroutines
 #include "fft_common.f90"
 
-  ! Return a FFTW3 plan for multiple 1D c2c FFTs in X direction
-  subroutine c2c_1m_x_plan(plan1, decomp, isign)
+   ! Return a FFTW3 plan for multiple 1D c2c FFTs in X direction
+   subroutine c2c_1m_x_plan(plan1, decomp, isign)
 
-    implicit none
+      implicit none
 
-    type(C_PTR), intent(OUT) :: plan1
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
-    integer, intent(IN) :: isign
+      type(C_PTR), intent(OUT) :: plan1
+      TYPE(DECOMP_INFO), intent(IN) :: decomp
+      integer, intent(IN) :: isign
 
-    complex(mytype), allocatable, dimension(:,:,:) :: a1
+      complex(mytype), allocatable, dimension(:, :, :) :: a1
 
-    allocate(a1(decomp%xsz(1),decomp%xsz(2),decomp%xsz(3)))
+      allocate (a1(decomp%xsz(1), decomp%xsz(2), decomp%xsz(3)))
 
 #ifdef DOUBLE_PREC
-    call dfftw_plan_many_dft(plan1, 1, decomp%xsz(1), &
-         decomp%xsz(2)*decomp%xsz(3), a1, decomp%xsz(1), 1, &
-         decomp%xsz(1), a1, decomp%xsz(1), 1, decomp%xsz(1), &
-         isign, plan_type)
+      call dfftw_plan_many_dft(plan1, 1, decomp%xsz(1), &
+                               decomp%xsz(2) * decomp%xsz(3), a1, decomp%xsz(1), 1, &
+                               decomp%xsz(1), a1, decomp%xsz(1), 1, decomp%xsz(1), &
+                               isign, plan_type)
 #else
-    call sfftw_plan_many_dft(plan1, 1, decomp%xsz(1), &
-         decomp%xsz(2)*decomp%xsz(3), a1, decomp%xsz(1), 1, &
-         decomp%xsz(1), a1, decomp%xsz(1), 1, decomp%xsz(1), &
-         isign, plan_type)
+      call sfftw_plan_many_dft(plan1, 1, decomp%xsz(1), &
+                               decomp%xsz(2) * decomp%xsz(3), a1, decomp%xsz(1), 1, &
+                               decomp%xsz(1), a1, decomp%xsz(1), 1, decomp%xsz(1), &
+                               isign, plan_type)
 #endif
 
-    deallocate(a1)
+      deallocate (a1)
 
-    return
-  end subroutine c2c_1m_x_plan
+      return
+   end subroutine c2c_1m_x_plan
 
-  ! Return a FFTW3 plan for multiple 1D c2c FFTs in Y direction
-  subroutine c2c_1m_y_plan(plan1, decomp, isign)
+   ! Return a FFTW3 plan for multiple 1D c2c FFTs in Y direction
+   subroutine c2c_1m_y_plan(plan1, decomp, isign)
 
-    implicit none
+      implicit none
 
-    type(C_PTR), intent(OUT) :: plan1
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
-    integer, intent(IN) :: isign
+      type(C_PTR), intent(OUT) :: plan1
+      TYPE(DECOMP_INFO), intent(IN) :: decomp
+      integer, intent(IN) :: isign
 
-    complex(mytype), allocatable, dimension(:,:) :: a1
+      complex(mytype), allocatable, dimension(:, :) :: a1
 
-    ! Due to memory pattern of 3D arrays, 1D FFTs along Y have to be
-    ! done one Z-plane at a time. So plan for 2D data sets here.
+      ! Due to memory pattern of 3D arrays, 1D FFTs along Y have to be
+      ! done one Z-plane at a time. So plan for 2D data sets here.
 
-    allocate(a1(decomp%ysz(1),decomp%ysz(2)))
+      allocate (a1(decomp%ysz(1), decomp%ysz(2)))
 
 #ifdef DOUBLE_PREC
-    call dfftw_plan_many_dft(plan1, 1, decomp%ysz(2), decomp%ysz(1), &
-         a1, decomp%ysz(2), decomp%ysz(1), 1, a1, decomp%ysz(2), &
-         decomp%ysz(1), 1, isign, plan_type)
+      call dfftw_plan_many_dft(plan1, 1, decomp%ysz(2), decomp%ysz(1), &
+                               a1, decomp%ysz(2), decomp%ysz(1), 1, a1, decomp%ysz(2), &
+                               decomp%ysz(1), 1, isign, plan_type)
 #else
-    call sfftw_plan_many_dft(plan1, 1, decomp%ysz(2), decomp%ysz(1), &
-         a1, decomp%ysz(2), decomp%ysz(1), 1, a1, decomp%ysz(2), &
-         decomp%ysz(1), 1, isign, plan_type)
+      call sfftw_plan_many_dft(plan1, 1, decomp%ysz(2), decomp%ysz(1), &
+                               a1, decomp%ysz(2), decomp%ysz(1), 1, a1, decomp%ysz(2), &
+                               decomp%ysz(1), 1, isign, plan_type)
 #endif
 
-    deallocate(a1)
-
-    return
-  end subroutine c2c_1m_y_plan
+      deallocate (a1)
 
+      return
+   end subroutine c2c_1m_y_plan
 
-  ! Return a FFTW3 plan for multiple 1D c2c FFTs in Z direction
-  subroutine c2c_1m_z_plan(plan1, decomp, isign)
+   ! Return a FFTW3 plan for multiple 1D c2c FFTs in Z direction
+   subroutine c2c_1m_z_plan(plan1, decomp, isign)
 
-    implicit none
+      implicit none
 
-    type(C_PTR), intent(OUT) :: plan1
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
-    integer, intent(IN) :: isign
+      type(C_PTR), intent(OUT) :: plan1
+      TYPE(DECOMP_INFO), intent(IN) :: decomp
+      integer, intent(IN) :: isign
 
-    complex(mytype), allocatable, dimension(:,:,:) :: a1
+      complex(mytype), allocatable, dimension(:, :, :) :: a1
 
-    allocate(a1(decomp%zsz(1),decomp%zsz(2),decomp%zsz(3)))
+      allocate (a1(decomp%zsz(1), decomp%zsz(2), decomp%zsz(3)))
 
 #ifdef DOUBLE_PREC
-    call dfftw_plan_many_dft(plan1, 1, decomp%zsz(3), &
-         decomp%zsz(1)*decomp%zsz(2), a1, decomp%zsz(3), &
-         decomp%zsz(1)*decomp%zsz(2), 1, a1, decomp%zsz(3), &
-         decomp%zsz(1)*decomp%zsz(2), 1, isign, plan_type)
+      call dfftw_plan_many_dft(plan1, 1, decomp%zsz(3), &
+                               decomp%zsz(1) * decomp%zsz(2), a1, decomp%zsz(3), &
+                               decomp%zsz(1) * decomp%zsz(2), 1, a1, decomp%zsz(3), &
+                               decomp%zsz(1) * decomp%zsz(2), 1, isign, plan_type)
 #else
-    call sfftw_plan_many_dft(plan1, 1, decomp%zsz(3), &
-         decomp%zsz(1)*decomp%zsz(2), a1, decomp%zsz(3), &
-         decomp%zsz(1)*decomp%zsz(2), 1, a1, decomp%zsz(3), &
-         decomp%zsz(1)*decomp%zsz(2), 1, isign, plan_type)
+      call sfftw_plan_many_dft(plan1, 1, decomp%zsz(3), &
+                               decomp%zsz(1) * decomp%zsz(2), a1, decomp%zsz(3), &
+                               decomp%zsz(1) * decomp%zsz(2), 1, a1, decomp%zsz(3), &
+                               decomp%zsz(1) * decomp%zsz(2), 1, isign, plan_type)
 #endif
 
-    deallocate(a1)
+      deallocate (a1)
 
-    return
-  end subroutine c2c_1m_z_plan
+      return
+   end subroutine c2c_1m_z_plan
 
+   ! Return a FFTW3 plan for multiple 1D r2c FFTs in X direction
+   subroutine r2c_1m_x_plan(plan1, decomp_ph, decomp_sp)
 
-  ! Return a FFTW3 plan for multiple 1D r2c FFTs in X direction
-  subroutine r2c_1m_x_plan(plan1, decomp_ph, decomp_sp)
+      implicit none
 
-    implicit none
+      type(C_PTR), intent(OUT) :: plan1
+      TYPE(DECOMP_INFO), intent(IN) :: decomp_ph
+      TYPE(DECOMP_INFO), intent(IN) :: decomp_sp
 
-    type(C_PTR), intent(OUT) :: plan1
-    TYPE(DECOMP_INFO), intent(IN) :: decomp_ph
-    TYPE(DECOMP_INFO), intent(IN) :: decomp_sp
+      real(mytype), allocatable, dimension(:, :, :) :: a1
+      complex(mytype), allocatable, dimension(:, :, :) :: a2
 
-    real(mytype), allocatable, dimension(:,:,:) :: a1
-    complex(mytype), allocatable, dimension(:,:,:) :: a2
-
-    allocate(a1(decomp_ph%xsz(1),decomp_ph%xsz(2),decomp_ph%xsz(3)))
-    allocate(a2(decomp_sp%xsz(1),decomp_sp%xsz(2),decomp_sp%xsz(3)))
+      allocate (a1(decomp_ph%xsz(1), decomp_ph%xsz(2), decomp_ph%xsz(3)))
+      allocate (a2(decomp_sp%xsz(1), decomp_sp%xsz(2), decomp_sp%xsz(3)))
 #ifdef DOUBLE_PREC
-    call dfftw_plan_many_dft_r2c(plan1, 1, decomp_ph%xsz(1), &
-         decomp_ph%xsz(2)*decomp_ph%xsz(3), a1, decomp_ph%xsz(1), 1, &
-         decomp_ph%xsz(1), a2, decomp_sp%xsz(1), 1, decomp_sp%xsz(1), &
-         plan_type)
+      call dfftw_plan_many_dft_r2c(plan1, 1, decomp_ph%xsz(1), &
+                                   decomp_ph%xsz(2) * decomp_ph%xsz(3), a1, decomp_ph%xsz(1), 1, &
+                                   decomp_ph%xsz(1), a2, decomp_sp%xsz(1), 1, decomp_sp%xsz(1), &
+                                   plan_type)
 #else
-    call sfftw_plan_many_dft_r2c(plan1, 1, decomp_ph%xsz(1), &
-         decomp_ph%xsz(2)*decomp_ph%xsz(3), a1, decomp_ph%xsz(1), 1, &
-         decomp_ph%xsz(1), a2, decomp_sp%xsz(1), 1, decomp_sp%xsz(1), &
-         plan_type)
+      call sfftw_plan_many_dft_r2c(plan1, 1, decomp_ph%xsz(1), &
+                                   decomp_ph%xsz(2) * decomp_ph%xsz(3), a1, decomp_ph%xsz(1), 1, &
+                                   decomp_ph%xsz(1), a2, decomp_sp%xsz(1), 1, decomp_sp%xsz(1), &
+                                   plan_type)
 #endif
-    deallocate(a1,a2)
-
-    return
-  end subroutine r2c_1m_x_plan
+      deallocate (a1, a2)
 
+      return
+   end subroutine r2c_1m_x_plan
 
-  ! Return a FFTW3 plan for multiple 1D c2r FFTs in X direction
-  subroutine c2r_1m_x_plan(plan1, decomp_sp, decomp_ph)
+   ! Return a FFTW3 plan for multiple 1D c2r FFTs in X direction
+   subroutine c2r_1m_x_plan(plan1, decomp_sp, decomp_ph)
 
-    implicit none
+      implicit none
 
-    type(C_PTR), intent(OUT) :: plan1
-    TYPE(DECOMP_INFO), intent(IN) :: decomp_sp
-    TYPE(DECOMP_INFO), intent(IN) :: decomp_ph
+      type(C_PTR), intent(OUT) :: plan1
+      TYPE(DECOMP_INFO), intent(IN) :: decomp_sp
+      TYPE(DECOMP_INFO), intent(IN) :: decomp_ph
 
-    complex(mytype), allocatable, dimension(:,:,:) :: a1
-    real(mytype), allocatable, dimension(:,:,:) :: a2
+      complex(mytype), allocatable, dimension(:, :, :) :: a1
+      real(mytype), allocatable, dimension(:, :, :) :: a2
 
-    allocate(a1(decomp_sp%xsz(1),decomp_sp%xsz(2),decomp_sp%xsz(3)))
-    allocate(a2(decomp_ph%xsz(1),decomp_ph%xsz(2),decomp_ph%xsz(3)))
+      allocate (a1(decomp_sp%xsz(1), decomp_sp%xsz(2), decomp_sp%xsz(3)))
+      allocate (a2(decomp_ph%xsz(1), decomp_ph%xsz(2), decomp_ph%xsz(3)))
 #ifdef DOUBLE_PREC
-    call dfftw_plan_many_dft_c2r(plan1, 1, decomp_ph%xsz(1), &
-         decomp_ph%xsz(2)*decomp_ph%xsz(3), a1, decomp_sp%xsz(1), 1, &
-         decomp_sp%xsz(1), a2, decomp_ph%xsz(1), 1, decomp_ph%xsz(1), &
-         plan_type)
+      call dfftw_plan_many_dft_c2r(plan1, 1, decomp_ph%xsz(1), &
+                                   decomp_ph%xsz(2) * decomp_ph%xsz(3), a1, decomp_sp%xsz(1), 1, &
+                                   decomp_sp%xsz(1), a2, decomp_ph%xsz(1), 1, decomp_ph%xsz(1), &
+                                   plan_type)
 #else
-    call sfftw_plan_many_dft_c2r(plan1, 1, decomp_ph%xsz(1), &
-         decomp_ph%xsz(2)*decomp_ph%xsz(3), a1, decomp_sp%xsz(1), 1, &
-         decomp_sp%xsz(1), a2, decomp_ph%xsz(1), 1, decomp_ph%xsz(1), &
-         plan_type)
+      call sfftw_plan_many_dft_c2r(plan1, 1, decomp_ph%xsz(1), &
+                                   decomp_ph%xsz(2) * decomp_ph%xsz(3), a1, decomp_sp%xsz(1), 1, &
+                                   decomp_sp%xsz(1), a2, decomp_ph%xsz(1), 1, decomp_ph%xsz(1), &
+                                   plan_type)
 #endif
-    deallocate(a1,a2)
+      deallocate (a1, a2)
 
-    return
-  end subroutine c2r_1m_x_plan
+      return
+   end subroutine c2r_1m_x_plan
 
+   ! Return a FFTW3 plan for multiple 1D r2c FFTs in Z direction
+   subroutine r2c_1m_z_plan(plan1, decomp_ph, decomp_sp)
 
-  ! Return a FFTW3 plan for multiple 1D r2c FFTs in Z direction
-  subroutine r2c_1m_z_plan(plan1, decomp_ph, decomp_sp)
+      implicit none
 
-    implicit none
+      type(C_PTR), intent(OUT) :: plan1
+      TYPE(DECOMP_INFO), intent(IN) :: decomp_ph
+      TYPE(DECOMP_INFO), intent(IN) :: decomp_sp
 
-    type(C_PTR), intent(OUT) :: plan1
-    TYPE(DECOMP_INFO), intent(IN) :: decomp_ph
-    TYPE(DECOMP_INFO), intent(IN) :: decomp_sp
+      real(mytype), allocatable, dimension(:, :, :) :: a1
+      complex(mytype), allocatable, dimension(:, :, :) :: a2
 
-    real(mytype), allocatable, dimension(:,:,:) :: a1
-    complex(mytype), allocatable, dimension(:,:,:) :: a2
-
-    allocate(a1(decomp_ph%zsz(1),decomp_ph%zsz(2),decomp_ph%zsz(3)))
-    allocate(a2(decomp_sp%zsz(1),decomp_sp%zsz(2),decomp_sp%zsz(3)))
+      allocate (a1(decomp_ph%zsz(1), decomp_ph%zsz(2), decomp_ph%zsz(3)))
+      allocate (a2(decomp_sp%zsz(1), decomp_sp%zsz(2), decomp_sp%zsz(3)))
 #ifdef DOUBLE_PREC
-    call dfftw_plan_many_dft_r2c(plan1, 1, decomp_ph%zsz(3), &
-         decomp_ph%zsz(1)*decomp_ph%zsz(2), a1, decomp_ph%zsz(3), &
-         decomp_ph%zsz(1)*decomp_ph%zsz(2), 1, a2, decomp_sp%zsz(3), &
-         decomp_sp%zsz(1)*decomp_sp%zsz(2), 1, plan_type)
+      call dfftw_plan_many_dft_r2c(plan1, 1, decomp_ph%zsz(3), &
+                                   decomp_ph%zsz(1) * decomp_ph%zsz(2), a1, decomp_ph%zsz(3), &
+                                   decomp_ph%zsz(1) * decomp_ph%zsz(2), 1, a2, decomp_sp%zsz(3), &
+                                   decomp_sp%zsz(1) * decomp_sp%zsz(2), 1, plan_type)
 #else
-    call sfftw_plan_many_dft_r2c(plan1, 1, decomp_ph%zsz(3), &
-         decomp_ph%zsz(1)*decomp_ph%zsz(2), a1, decomp_ph%zsz(3), &
-         decomp_ph%zsz(1)*decomp_ph%zsz(2), 1, a2, decomp_sp%zsz(3), &
-         decomp_sp%zsz(1)*decomp_sp%zsz(2), 1, plan_type)
+      call sfftw_plan_many_dft_r2c(plan1, 1, decomp_ph%zsz(3), &
+                                   decomp_ph%zsz(1) * decomp_ph%zsz(2), a1, decomp_ph%zsz(3), &
+                                   decomp_ph%zsz(1) * decomp_ph%zsz(2), 1, a2, decomp_sp%zsz(3), &
+                                   decomp_sp%zsz(1) * decomp_sp%zsz(2), 1, plan_type)
 #endif
-    deallocate(a1,a2)
-
-    return
-  end subroutine r2c_1m_z_plan
+      deallocate (a1, a2)
 
+      return
+   end subroutine r2c_1m_z_plan
 
-  ! Return a FFTW3 plan for multiple 1D c2r FFTs in Z direction
-  subroutine c2r_1m_z_plan(plan1, decomp_sp, decomp_ph)
+   ! Return a FFTW3 plan for multiple 1D c2r FFTs in Z direction
+   subroutine c2r_1m_z_plan(plan1, decomp_sp, decomp_ph)
 
-    implicit none
+      implicit none
 
-    type(C_PTR), intent(OUT) :: plan1
-    TYPE(DECOMP_INFO), intent(IN) :: decomp_sp
-    TYPE(DECOMP_INFO), intent(IN) :: decomp_ph
+      type(C_PTR), intent(OUT) :: plan1
+      TYPE(DECOMP_INFO), intent(IN) :: decomp_sp
+      TYPE(DECOMP_INFO), intent(IN) :: decomp_ph
 
-    complex(mytype), allocatable, dimension(:,:,:) :: a1
-    real(mytype), allocatable, dimension(:,:,:) :: a2
+      complex(mytype), allocatable, dimension(:, :, :) :: a1
+      real(mytype), allocatable, dimension(:, :, :) :: a2
 
-    allocate(a1(decomp_sp%zsz(1),decomp_sp%zsz(2),decomp_sp%zsz(3)))
-    allocate(a2(decomp_ph%zsz(1),decomp_ph%zsz(2),decomp_ph%zsz(3)))
+      allocate (a1(decomp_sp%zsz(1), decomp_sp%zsz(2), decomp_sp%zsz(3)))
+      allocate (a2(decomp_ph%zsz(1), decomp_ph%zsz(2), decomp_ph%zsz(3)))
 
 #ifdef DOUBLE_PREC
-    call dfftw_plan_many_dft_c2r(plan1, 1, decomp_ph%zsz(3), &
-         decomp_ph%zsz(1)*decomp_ph%zsz(2), a1, decomp_sp%zsz(3), &
-         decomp_sp%zsz(1)*decomp_sp%zsz(2), 1, a2, decomp_ph%zsz(3), &
-         decomp_ph%zsz(1)*decomp_ph%zsz(2), 1, plan_type)
+      call dfftw_plan_many_dft_c2r(plan1, 1, decomp_ph%zsz(3), &
+                                   decomp_ph%zsz(1) * decomp_ph%zsz(2), a1, decomp_sp%zsz(3), &
+                                   decomp_sp%zsz(1) * decomp_sp%zsz(2), 1, a2, decomp_ph%zsz(3), &
+                                   decomp_ph%zsz(1) * decomp_ph%zsz(2), 1, plan_type)
 #else
-    call sfftw_plan_many_dft_c2r(plan1, 1, decomp_ph%zsz(3), &
-         decomp_ph%zsz(1)*decomp_ph%zsz(2), a1, decomp_sp%zsz(3), &
-         decomp_sp%zsz(1)*decomp_sp%zsz(2), 1, a2, decomp_ph%zsz(3), &
-         decomp_ph%zsz(1)*decomp_ph%zsz(2), 1, plan_type)
+      call sfftw_plan_many_dft_c2r(plan1, 1, decomp_ph%zsz(3), &
+                                   decomp_ph%zsz(1) * decomp_ph%zsz(2), a1, decomp_sp%zsz(3), &
+                                   decomp_sp%zsz(1) * decomp_sp%zsz(2), 1, a2, decomp_ph%zsz(3), &
+                                   decomp_ph%zsz(1) * decomp_ph%zsz(2), 1, plan_type)
 #endif
-    deallocate(a1,a2)
-
-    return
-  end subroutine c2r_1m_z_plan
+      deallocate (a1, a2)
 
+      return
+   end subroutine c2r_1m_z_plan
 
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  !  This routine performs one-time initialisations for the FFT engine
+   !  This routine performs one-time initialisations for the FFT engine
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine init_fft_engine
-
-    implicit none
-
-    if (nrank==0) then
-       write(*,*) ' '
-       write(*,*) '***** Using the FFTW (version 3.x) engine *****'
-       write(*,*) ' '
-    end if
-
-    if (format == PHYSICAL_IN_X) then
-
-       ! For C2C transforms
-       call c2c_1m_x_plan(plan(-1,1), ph, FFTW_FORWARD )
-       call c2c_1m_y_plan(plan(-1,2), ph, FFTW_FORWARD )
-       call c2c_1m_z_plan(plan(-1,3), ph, FFTW_FORWARD )
-       call c2c_1m_z_plan(plan( 1,3), ph, FFTW_BACKWARD)
-       call c2c_1m_y_plan(plan( 1,2), ph, FFTW_BACKWARD)
-       call c2c_1m_x_plan(plan( 1,1), ph, FFTW_BACKWARD)
-
-       ! For R2C/C2R tranforms
-       call r2c_1m_x_plan(plan(0,1), ph, sp)
-       call c2c_1m_y_plan(plan(0,2), sp, FFTW_FORWARD )
-       call c2c_1m_z_plan(plan(0,3), sp, FFTW_FORWARD )
-       call c2c_1m_z_plan(plan(2,3), sp, FFTW_BACKWARD)
-       call c2c_1m_y_plan(plan(2,2), sp, FFTW_BACKWARD)
-       call c2r_1m_x_plan(plan(2,1), sp, ph)
-
-    else if (format == PHYSICAL_IN_Z) then
-
-       ! For C2C transforms
-       call c2c_1m_z_plan(plan(-1,3), ph, FFTW_FORWARD )
-       call c2c_1m_y_plan(plan(-1,2), ph, FFTW_FORWARD )
-       call c2c_1m_x_plan(plan(-1,1), ph, FFTW_FORWARD )
-       call c2c_1m_x_plan(plan( 1,1), ph, FFTW_BACKWARD)
-       call c2c_1m_y_plan(plan( 1,2), ph, FFTW_BACKWARD)
-       call c2c_1m_z_plan(plan( 1,3), ph, FFTW_BACKWARD)
-
-       ! For R2C/C2R tranforms
-       call r2c_1m_z_plan(plan(0,3), ph, sp)
-       call c2c_1m_y_plan(plan(0,2), sp, FFTW_FORWARD )
-       call c2c_1m_x_plan(plan(0,1), sp, FFTW_FORWARD )
-       call c2c_1m_x_plan(plan(2,1), sp, FFTW_BACKWARD)
-       call c2c_1m_y_plan(plan(2,2), sp, FFTW_BACKWARD)
-       call c2r_1m_z_plan(plan(2,3), sp, ph)
-
-    end if
-
-    return
-  end subroutine init_fft_engine
+   subroutine init_fft_engine
 
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  !  This routine performs one-time finalisations for the FFT engine
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine finalize_fft_engine
+      implicit none
 
-    implicit none
+      call decomp_2d_fft_log("FFTW (version 3.x)")
 
-    integer :: i,j
+      if (format == PHYSICAL_IN_X) then
 
-    do j=1,3
-       do i=-1,2
-#ifdef DOUBLE_PREC
-          call dfftw_destroy_plan(plan(i,j))
-#else
-          call sfftw_destroy_plan(plan(i,j))
-#endif
-       end do
-    end do
+         ! For C2C transforms
+         call c2c_1m_x_plan(plan(-1, 1), ph, FFTW_FORWARD)
+         call c2c_1m_y_plan(plan(-1, 2), ph, FFTW_FORWARD)
+         call c2c_1m_z_plan(plan(-1, 3), ph, FFTW_FORWARD)
+         call c2c_1m_z_plan(plan(1, 3), ph, FFTW_BACKWARD)
+         call c2c_1m_y_plan(plan(1, 2), ph, FFTW_BACKWARD)
+         call c2c_1m_x_plan(plan(1, 1), ph, FFTW_BACKWARD)
 
-    call fftw_cleanup()
+         ! For R2C/C2R tranforms
+         call r2c_1m_x_plan(plan(0, 1), ph, sp)
+         call c2c_1m_y_plan(plan(0, 2), sp, FFTW_FORWARD)
+         call c2c_1m_z_plan(plan(0, 3), sp, FFTW_FORWARD)
+         call c2c_1m_z_plan(plan(2, 3), sp, FFTW_BACKWARD)
+         call c2c_1m_y_plan(plan(2, 2), sp, FFTW_BACKWARD)
+         call c2r_1m_x_plan(plan(2, 1), sp, ph)
 
-    return
-  end subroutine finalize_fft_engine
+      else if (format == PHYSICAL_IN_Z) then
 
+         ! For C2C transforms
+         call c2c_1m_z_plan(plan(-1, 3), ph, FFTW_FORWARD)
+         call c2c_1m_y_plan(plan(-1, 2), ph, FFTW_FORWARD)
+         call c2c_1m_x_plan(plan(-1, 1), ph, FFTW_FORWARD)
+         call c2c_1m_x_plan(plan(1, 1), ph, FFTW_BACKWARD)
+         call c2c_1m_y_plan(plan(1, 2), ph, FFTW_BACKWARD)
+         call c2c_1m_z_plan(plan(1, 3), ph, FFTW_BACKWARD)
 
-  ! Following routines calculate multiple one-dimensional FFTs to form
-  ! the basis of three-dimensional FFTs.
+         ! For R2C/C2R tranforms
+         call r2c_1m_z_plan(plan(0, 3), ph, sp)
+         call c2c_1m_y_plan(plan(0, 2), sp, FFTW_FORWARD)
+         call c2c_1m_x_plan(plan(0, 1), sp, FFTW_FORWARD)
+         call c2c_1m_x_plan(plan(2, 1), sp, FFTW_BACKWARD)
+         call c2c_1m_y_plan(plan(2, 2), sp, FFTW_BACKWARD)
+         call c2r_1m_z_plan(plan(2, 3), sp, ph)
 
-  ! c2c transform, multiple 1D FFTs in x direction
-  subroutine c2c_1m_x(inout, isign, plan1)
+      end if
 
-    implicit none
+      return
+   end subroutine init_fft_engine
 
-    complex(mytype), dimension(:,:,:), intent(INOUT) :: inout
-    integer, intent(IN) :: isign
-    type(C_PTR), intent(IN) :: plan1
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+   !  This routine performs one-time finalisations for the FFT engine
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+   subroutine finalize_fft_engine
 
-    integer :: foo
+      implicit none
 
-    foo = isign ! Silence unused dummy argument
+      integer :: i, j
 
+      do j = 1, 3
+         do i = -1, 2
 #ifdef DOUBLE_PREC
-    call dfftw_execute_dft(plan1, inout, inout)
+            call dfftw_destroy_plan(plan(i, j))
 #else
-    call sfftw_execute_dft(plan1, inout, inout)
+            call sfftw_destroy_plan(plan(i, j))
 #endif
+         end do
+      end do
 
-    return
-  end subroutine c2c_1m_x
+      call fftw_cleanup()
 
+      return
+   end subroutine finalize_fft_engine
 
-  ! c2c transform, multiple 1D FFTs in y direction
-  subroutine c2c_1m_y(inout, isign, plan1)
+   ! Following routines calculate multiple one-dimensional FFTs to form
+   ! the basis of three-dimensional FFTs.
 
-    implicit none
+   ! c2c transform, multiple 1D FFTs in x direction
+   subroutine c2c_1m_x(inout, plan1)
 
-    complex(mytype), dimension(:,:,:), intent(INOUT) :: inout
-    integer, intent(IN) :: isign
-    type(C_PTR), intent(IN) :: plan1
+      implicit none
 
-    integer :: k, s3
+      complex(mytype), dimension(:, :, :), intent(INOUT) :: inout
+      type(C_PTR), intent(IN) :: plan1
 
-    integer :: foo
-
-    foo = isign ! Silence unused dummy argument
-
-    ! transform on one Z-plane at a time
-    s3 = size(inout,3)
-    do k=1,s3
 #ifdef DOUBLE_PREC
-       call dfftw_execute_dft(plan1, inout(:,:,k), inout(:,:,k))
+      call dfftw_execute_dft(plan1, inout, inout)
 #else
-       call sfftw_execute_dft(plan1, inout(:,:,k), inout(:,:,k))
+      call sfftw_execute_dft(plan1, inout, inout)
 #endif
-    end do
 
-    return
-  end subroutine c2c_1m_y
+      return
+   end subroutine c2c_1m_x
 
-  ! c2c transform, multiple 1D FFTs in z direction
-  subroutine c2c_1m_z(inout, isign, plan1)
+   ! c2c transform, multiple 1D FFTs in y direction
+   subroutine c2c_1m_y(inout, plan1)
 
-    implicit none
+      implicit none
 
-    complex(mytype), dimension(:,:,:), intent(INOUT) :: inout
-    integer, intent(IN) :: isign
-    type(C_PTR), intent(IN) :: plan1
+      complex(mytype), dimension(:, :, :), intent(INOUT) :: inout
+      type(C_PTR), intent(IN) :: plan1
 
-    integer :: foo
-
-    foo = isign ! Silence unused dummy argument
+      integer :: k, s3
 
+      ! transform on one Z-plane at a time
+      s3 = size(inout, 3)
+      do k = 1, s3
 #ifdef DOUBLE_PREC
-    call dfftw_execute_dft(plan1, inout, inout)
+         call dfftw_execute_dft(plan1, inout(:, :, k), inout(:, :, k))
 #else
-    call sfftw_execute_dft(plan1, inout, inout)
+         call sfftw_execute_dft(plan1, inout(:, :, k), inout(:, :, k))
 #endif
+      end do
 
-    return
-  end subroutine c2c_1m_z
+      return
+   end subroutine c2c_1m_y
 
-  ! r2c transform, multiple 1D FFTs in x direction
-  subroutine r2c_1m_x(input, output)
+   ! c2c transform, multiple 1D FFTs in z direction
+   subroutine c2c_1m_z(inout, plan1)
 
-    implicit none
+      implicit none
 
-    real(mytype), dimension(:,:,:), intent(IN)  ::  input
-    complex(mytype), dimension(:,:,:), intent(OUT) :: output
+      complex(mytype), dimension(:, :, :), intent(INOUT) :: inout
+      type(C_PTR), intent(IN) :: plan1
 
 #ifdef DOUBLE_PREC
-    call dfftw_execute_dft_r2c(plan(0,1), input, output)
+      call dfftw_execute_dft(plan1, inout, inout)
 #else
-    call sfftw_execute_dft_r2c(plan(0,1), input, output)
+      call sfftw_execute_dft(plan1, inout, inout)
 #endif
 
-    return
-
-  end subroutine r2c_1m_x
+      return
+   end subroutine c2c_1m_z
 
-  ! r2c transform, multiple 1D FFTs in z direction
-  subroutine r2c_1m_z(input, output)
+   ! r2c transform, multiple 1D FFTs in x direction
+   subroutine r2c_1m_x(input, output)
 
-    implicit none
+      implicit none
 
-    real(mytype), dimension(:,:,:), intent(IN)  ::  input
-    complex(mytype), dimension(:,:,:), intent(OUT) :: output
+      real(mytype), dimension(:, :, :), intent(INOUT)  ::  input
+      complex(mytype), dimension(:, :, :), intent(OUT) :: output
 
 #ifdef DOUBLE_PREC
-    call dfftw_execute_dft_r2c(plan(0,3), input, output)
+      call dfftw_execute_dft_r2c(plan(0, 1), input, output)
 #else
-    call sfftw_execute_dft_r2c(plan(0,3), input, output)
+      call sfftw_execute_dft_r2c(plan(0, 1), input, output)
 #endif
 
-    return
+      return
 
-  end subroutine r2c_1m_z
+   end subroutine r2c_1m_x
 
-  ! c2r transform, multiple 1D FFTs in x direction
-  subroutine c2r_1m_x(input, output)
+   ! r2c transform, multiple 1D FFTs in z direction
+   subroutine r2c_1m_z(input, output)
 
-    implicit none
+      implicit none
 
-    complex(mytype), dimension(:,:,:), intent(IN)  ::  input
-    real(mytype), dimension(:,:,:), intent(OUT) :: output
+      real(mytype), dimension(:, :, :), intent(INOUT)  ::  input
+      complex(mytype), dimension(:, :, :), intent(OUT) :: output
 
 #ifdef DOUBLE_PREC
-    call dfftw_execute_dft_c2r(plan(2,1), input, output)
+      call dfftw_execute_dft_r2c(plan(0, 3), input, output)
 #else
-    call sfftw_execute_dft_c2r(plan(2,1), input, output)
+      call sfftw_execute_dft_r2c(plan(0, 3), input, output)
 #endif
 
-    return
+      return
 
-  end subroutine c2r_1m_x
+   end subroutine r2c_1m_z
 
-  ! c2r transform, multiple 1D FFTs in z direction
-  subroutine c2r_1m_z(input, output)
+   ! c2r transform, multiple 1D FFTs in x direction
+   subroutine c2r_1m_x(input, output)
 
-    implicit none
+      implicit none
 
-    complex(mytype), dimension(:,:,:), intent(IN) :: input
-    real(mytype), dimension(:,:,:), intent(OUT) :: output
+      complex(mytype), dimension(:, :, :), intent(INOUT)  ::  input
+      real(mytype), dimension(:, :, :), intent(OUT) :: output
 
 #ifdef DOUBLE_PREC
-    call dfftw_execute_dft_c2r(plan(2,3), input, output)
+      call dfftw_execute_dft_c2r(plan(2, 1), input, output)
 #else
-    call sfftw_execute_dft_c2r(plan(2,3), input, output)
+      call sfftw_execute_dft_c2r(plan(2, 1), input, output)
 #endif
 
-    return
+      return
+
+   end subroutine c2r_1m_x
+
+   ! c2r transform, multiple 1D FFTs in z direction
+   subroutine c2r_1m_z(input, output)
+
+      implicit none
 
-  end subroutine c2r_1m_z
+      complex(mytype), dimension(:, :, :), intent(INOUT) :: input
+      real(mytype), dimension(:, :, :), intent(OUT) :: output
 
+#ifdef DOUBLE_PREC
+      call dfftw_execute_dft_c2r(plan(2, 3), input, output)
+#else
+      call sfftw_execute_dft_c2r(plan(2, 3), input, output)
+#endif
+
+      return
 
+   end subroutine c2r_1m_z
 
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! 3D FFT - complex to complex
+   ! 3D FFT - complex to complex
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine fft_3d_c2c(in, out, isign)
+   subroutine fft_3d_c2c(in, out, isign)
 
-    implicit none
+      implicit none
 
-    complex(mytype), dimension(:,:,:), intent(INOUT) :: in
-    complex(mytype), dimension(:,:,:), intent(OUT) :: out
-    integer, intent(IN) :: isign
+      complex(mytype), dimension(:, :, :), intent(INOUT) :: in
+      complex(mytype), dimension(:, :, :), intent(OUT) :: out
+      integer, intent(IN) :: isign
 
 #ifndef OVERWRITE
-    complex(mytype), allocatable, dimension(:,:,:) :: wk1
+      complex(mytype), allocatable, dimension(:, :, :) :: wk1
 #endif
 
-    if (format==PHYSICAL_IN_X .AND. isign==DECOMP_2D_FFT_FORWARD .OR.  &
-         format==PHYSICAL_IN_Z .AND. isign==DECOMP_2D_FFT_BACKWARD) then
+      if (format == PHYSICAL_IN_X .AND. isign == DECOMP_2D_FFT_FORWARD .OR. &
+          format == PHYSICAL_IN_Z .AND. isign == DECOMP_2D_FFT_BACKWARD) then
 
-       ! ===== 1D FFTs in X =====
+         ! ===== 1D FFTs in X =====
 #ifdef OVERWRITE
-       call c2c_1m_x(in,isign,plan(isign,1))
+         call c2c_1m_x(in, plan(isign, 1))
 #else
-       allocate (wk1(ph%xsz(1),ph%xsz(2),ph%xsz(3)))
-       wk1 = in
-       call c2c_1m_x(wk1,isign,plan(isign,1))
+         allocate (wk1(ph%xsz(1), ph%xsz(2), ph%xsz(3)))
+         wk1 = in
+         call c2c_1m_x(wk1, plan(isign, 1))
 #endif
 
-       ! ===== Swap X --> Y; 1D FFTs in Y =====
+         ! ===== Swap X --> Y; 1D FFTs in Y =====
 
-       if (dims(1)>1) then
+         if (dims(1) > 1) then
 #ifdef OVERWRITE
-          call transpose_x_to_y(in,wk2_c2c,ph)
+            call transpose_x_to_y(in, wk2_c2c, ph)
 #else
-          call transpose_x_to_y(wk1,wk2_c2c,ph)
+            call transpose_x_to_y(wk1, wk2_c2c, ph)
 #endif
-          call c2c_1m_y(wk2_c2c,isign,plan(isign,2))
-       else
+            call c2c_1m_y(wk2_c2c, plan(isign, 2))
+         else
 #ifdef OVERWRITE
-          call c2c_1m_y(in,isign,plan(isign,2))
+            call c2c_1m_y(in, plan(isign, 2))
 #else
-          call c2c_1m_y(wk1,isign,plan(isign,2))
+            call c2c_1m_y(wk1, plan(isign, 2))
 #endif
-       end if
+         end if
 
-       ! ===== Swap Y --> Z; 1D FFTs in Z =====
-       if (dims(1)>1) then
-          call transpose_y_to_z(wk2_c2c,out,ph)
-       else
+         ! ===== Swap Y --> Z; 1D FFTs in Z =====
+         if (dims(1) > 1) then
+            call transpose_y_to_z(wk2_c2c, out, ph)
+         else
 #ifdef OVERWRITE
-          call transpose_y_to_z(in,out,ph)
+            call transpose_y_to_z(in, out, ph)
 #else
-          call transpose_y_to_z(wk1,out,ph)
+            call transpose_y_to_z(wk1, out, ph)
 #endif
-       end if
-       call c2c_1m_z(out,isign,plan(isign,3))
+         end if
+         call c2c_1m_z(out, plan(isign, 3))
 
-    else if (format==PHYSICAL_IN_X .AND. isign==DECOMP_2D_FFT_BACKWARD &
-         .OR. &
-         format==PHYSICAL_IN_Z .AND. isign==DECOMP_2D_FFT_FORWARD) then
+      else if (format == PHYSICAL_IN_X .AND. isign == DECOMP_2D_FFT_BACKWARD &
+               .OR. &
+               format == PHYSICAL_IN_Z .AND. isign == DECOMP_2D_FFT_FORWARD) then
 
-       ! ===== 1D FFTs in Z =====
+         ! ===== 1D FFTs in Z =====
 #ifdef OVERWRITE
-       call c2c_1m_z(in,isign,plan(isign,3))
+         call c2c_1m_z(in, plan(isign, 3))
 #else
-       allocate (wk1(ph%zsz(1),ph%zsz(2),ph%zsz(3)))
-       wk1 = in
-       call c2c_1m_z(wk1,isign,plan(isign,3))
+         allocate (wk1(ph%zsz(1), ph%zsz(2), ph%zsz(3)))
+         wk1 = in
+         call c2c_1m_z(wk1, plan(isign, 3))
 #endif
 
-       ! ===== Swap Z --> Y; 1D FFTs in Y =====
-       if (dims(1)>1) then
+         ! ===== Swap Z --> Y; 1D FFTs in Y =====
+         if (dims(1) > 1) then
 #ifdef OVERWRITE
-          call transpose_z_to_y(in,wk2_c2c,ph)
+            call transpose_z_to_y(in, wk2_c2c, ph)
 #else
-          call transpose_z_to_y(wk1,wk2_c2c,ph)
+            call transpose_z_to_y(wk1, wk2_c2c, ph)
 #endif
-          call c2c_1m_y(wk2_c2c,isign,plan(isign,2))
-       else  ! out==wk2_c2c if 1D decomposition
+            call c2c_1m_y(wk2_c2c, plan(isign, 2))
+         else  ! out==wk2_c2c if 1D decomposition
 #ifdef OVERWRITE
-          call transpose_z_to_y(in,out,ph)
+            call transpose_z_to_y(in, out, ph)
 #else
-          call transpose_z_to_y(wk1,out,ph)
+            call transpose_z_to_y(wk1, out, ph)
 #endif
-          call c2c_1m_y(out,isign,plan(isign,2))
-       end if
+            call c2c_1m_y(out, plan(isign, 2))
+         end if
 
-       ! ===== Swap Y --> X; 1D FFTs in X =====
-       if (dims(1)>1) then
-          call transpose_y_to_x(wk2_c2c,out,ph)
-       end if
-       call c2c_1m_x(out,isign,plan(isign,1))
+         ! ===== Swap Y --> X; 1D FFTs in X =====
+         if (dims(1) > 1) then
+            call transpose_y_to_x(wk2_c2c, out, ph)
+         end if
+         call c2c_1m_x(out, plan(isign, 1))
 
-    end if
+      end if
 
 #ifndef OVERWRITE
-    deallocate (wk1)
+      deallocate (wk1)
 #endif
 
-    return
-  end subroutine fft_3d_c2c
-
+      return
+   end subroutine fft_3d_c2c
 
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! 3D forward FFT - real to complex
+   ! 3D forward FFT - real to complex
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine fft_3d_r2c(in_r, out_c)
-
-    implicit none
+   subroutine fft_3d_r2c(in_r, out_c)
 
-    real(mytype), dimension(:,:,:), intent(IN) :: in_r
-    complex(mytype), dimension(:,:,:), intent(OUT) :: out_c
+      implicit none
 
-    if (format==PHYSICAL_IN_X) then
+      real(mytype), dimension(:, :, :), intent(INOUT) :: in_r
+      complex(mytype), dimension(:, :, :), intent(OUT) :: out_c
 
-       ! ===== 1D FFTs in X =====
-       call r2c_1m_x(in_r,wk13)
+      if (format == PHYSICAL_IN_X) then
 
-       ! ===== Swap X --> Y; 1D FFTs in Y =====
-       if (dims(1)>1) then
-          call transpose_x_to_y(wk13,wk2_r2c,sp)
-          call c2c_1m_y(wk2_r2c,-1,plan(0,2))
-       else
-          call c2c_1m_y(wk13,-1,plan(0,2))
-       end if
+         ! ===== 1D FFTs in X =====
+         call r2c_1m_x(in_r, wk13)
 
-       ! ===== Swap Y --> Z; 1D FFTs in Z =====
-       if (dims(1)>1) then
-          call transpose_y_to_z(wk2_r2c,out_c,sp)
-       else
-          call transpose_y_to_z(wk13,out_c,sp)
-       end if
-       call c2c_1m_z(out_c,-1,plan(0,3))
+         ! ===== Swap X --> Y; 1D FFTs in Y =====
+         if (dims(1) > 1) then
+            call transpose_x_to_y(wk13, wk2_r2c, sp)
+            call c2c_1m_y(wk2_r2c, plan(0, 2))
+         else
+            call c2c_1m_y(wk13, plan(0, 2))
+         end if
 
-    else if (format==PHYSICAL_IN_Z) then
+         ! ===== Swap Y --> Z; 1D FFTs in Z =====
+         if (dims(1) > 1) then
+            call transpose_y_to_z(wk2_r2c, out_c, sp)
+         else
+            call transpose_y_to_z(wk13, out_c, sp)
+         end if
+         call c2c_1m_z(out_c, plan(0, 3))
 
-       ! ===== 1D FFTs in Z =====
-       call r2c_1m_z(in_r,wk13)
+      else if (format == PHYSICAL_IN_Z) then
 
-       ! ===== Swap Z --> Y; 1D FFTs in Y =====
-       if (dims(1)>1) then
-          call transpose_z_to_y(wk13,wk2_r2c,sp)
-          call c2c_1m_y(wk2_r2c,-1,plan(0,2))
-       else  ! out_c==wk2_r2c if 1D decomposition
-          call transpose_z_to_y(wk13,out_c,sp)
-          call c2c_1m_y(out_c,-1,plan(0,2))
-       end if
+         ! ===== 1D FFTs in Z =====
+         call r2c_1m_z(in_r, wk13)
 
-       ! ===== Swap Y --> X; 1D FFTs in X =====
-       if (dims(1)>1) then
-          call transpose_y_to_x(wk2_r2c,out_c,sp)
-       end if
-       call c2c_1m_x(out_c,-1,plan(0,1))
+         ! ===== Swap Z --> Y; 1D FFTs in Y =====
+         if (dims(1) > 1) then
+            call transpose_z_to_y(wk13, wk2_r2c, sp)
+            call c2c_1m_y(wk2_r2c, plan(0, 2))
+         else  ! out_c==wk2_r2c if 1D decomposition
+            call transpose_z_to_y(wk13, out_c, sp)
+            call c2c_1m_y(out_c, plan(0, 2))
+         end if
 
-    end if
+         ! ===== Swap Y --> X; 1D FFTs in X =====
+         if (dims(1) > 1) then
+            call transpose_y_to_x(wk2_r2c, out_c, sp)
+         end if
+         call c2c_1m_x(out_c, plan(0, 1))
 
-    return
-  end subroutine fft_3d_r2c
+      end if
 
+      return
+   end subroutine fft_3d_r2c
 
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! 3D inverse FFT - complex to real
+   ! 3D inverse FFT - complex to real
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine fft_3d_c2r(in_c, out_r)
+   subroutine fft_3d_c2r(in_c, out_r)
 
-    implicit none
+      implicit none
 
-    complex(mytype), dimension(:,:,:), intent(INOUT) :: in_c
-    real(mytype), dimension(:,:,:), intent(OUT) :: out_r
+      complex(mytype), dimension(:, :, :), intent(INOUT) :: in_c
+      real(mytype), dimension(:, :, :), intent(OUT) :: out_r
 
 #ifndef OVERWRITE
-    complex(mytype), allocatable, dimension(:,:,:) :: wk1
+      complex(mytype), allocatable, dimension(:, :, :) :: wk1
 #endif
 
-    if (format==PHYSICAL_IN_X) then
+      if (format == PHYSICAL_IN_X) then
 
-       ! ===== 1D FFTs in Z =====
+         ! ===== 1D FFTs in Z =====
 #ifdef OVERWRITE
-       call c2c_1m_z(in_c,1,plan(2,3))
+         call c2c_1m_z(in_c, plan(2, 3))
 #else
-       allocate (wk1(sp%zsz(1),sp%zsz(2),sp%zsz(3)))
-       wk1 = in_c
-       call c2c_1m_z(wk1,1,plan(2,3))
+         allocate (wk1(sp%zsz(1), sp%zsz(2), sp%zsz(3)))
+         wk1 = in_c
+         call c2c_1m_z(wk1, plan(2, 3))
 #endif
 
-       ! ===== Swap Z --> Y; 1D FFTs in Y =====
+         ! ===== Swap Z --> Y; 1D FFTs in Y =====
 #ifdef OVERWRITE
-       call transpose_z_to_y(in_c,wk2_r2c,sp)
+         call transpose_z_to_y(in_c, wk2_r2c, sp)
 #else
-       call transpose_z_to_y(wk1,wk2_r2c,sp)
+         call transpose_z_to_y(wk1, wk2_r2c, sp)
 #endif
-       call c2c_1m_y(wk2_r2c,1,plan(2,2))
+         call c2c_1m_y(wk2_r2c, plan(2, 2))
 
-       ! ===== Swap Y --> X; 1D FFTs in X =====
-       if (dims(1)>1) then
-          call transpose_y_to_x(wk2_r2c,wk13,sp)
-          call c2r_1m_x(wk13,out_r)
-       else
-          call c2r_1m_x(wk2_r2c,out_r)
-       end if
+         ! ===== Swap Y --> X; 1D FFTs in X =====
+         if (dims(1) > 1) then
+            call transpose_y_to_x(wk2_r2c, wk13, sp)
+            call c2r_1m_x(wk13, out_r)
+         else
+            call c2r_1m_x(wk2_r2c, out_r)
+         end if
 
-    else if (format==PHYSICAL_IN_Z) then
+      else if (format == PHYSICAL_IN_Z) then
 
-       ! ===== 1D FFTs in X =====
+         ! ===== 1D FFTs in X =====
 #ifdef OVERWRITE
-       call c2c_1m_x(in_c,1,plan(2,1))
+         call c2c_1m_x(in_c, plan(2, 1))
 #else
-       allocate(wk1(sp%xsz(1),sp%xsz(2),sp%xsz(3)))
-       wk1 = in_c
-       call c2c_1m_x(wk1,1,plan(2,1))
+         allocate (wk1(sp%xsz(1), sp%xsz(2), sp%xsz(3)))
+         wk1 = in_c
+         call c2c_1m_x(wk1, plan(2, 1))
 #endif
 
-       ! ===== Swap X --> Y; 1D FFTs in Y =====
-       if (dims(1)>1) then
+         ! ===== Swap X --> Y; 1D FFTs in Y =====
+         if (dims(1) > 1) then
 #ifdef OVERWRITE
-          call transpose_x_to_y(in_c,wk2_r2c,sp)
+            call transpose_x_to_y(in_c, wk2_r2c, sp)
 #else
-          call transpose_x_to_y(wk1,wk2_r2c,sp)
+            call transpose_x_to_y(wk1, wk2_r2c, sp)
 #endif
-          call c2c_1m_y(wk2_r2c,1,plan(2,2))
-       else  ! in_c==wk2_r2c if 1D decomposition
+            call c2c_1m_y(wk2_r2c, plan(2, 2))
+         else  ! in_c==wk2_r2c if 1D decomposition
 #ifdef OVERWRITE
-          call c2c_1m_y(in_c,1,plan(2,2))
+            call c2c_1m_y(in_c, plan(2, 2))
 #else
-          call c2c_1m_y(wk1,1,plan(2,2))
+            call c2c_1m_y(wk1, plan(2, 2))
 #endif
-       end if
+         end if
 
-       ! ===== Swap Y --> Z; 1D FFTs in Z =====
-       if (dims(1)>1) then
-          call transpose_y_to_z(wk2_r2c,wk13,sp)
-       else
+         ! ===== Swap Y --> Z; 1D FFTs in Z =====
+         if (dims(1) > 1) then
+            call transpose_y_to_z(wk2_r2c, wk13, sp)
+         else
 #ifdef OVERWRITE
-          call transpose_y_to_z(in_c,wk13,sp)
+            call transpose_y_to_z(in_c, wk13, sp)
 #else
-          call transpose_y_to_z(wk1,wk13,sp)
+            call transpose_y_to_z(wk1, wk13, sp)
 #endif
-       end if
-       call c2r_1m_z(wk13,out_r)
+         end if
+         call c2r_1m_z(wk13, out_r)
 
-    end if
+      end if
 
 #ifndef OVERWRITE
-    deallocate (wk1)
+      deallocate (wk1)
 #endif
 
-    return
-  end subroutine fft_3d_c2r
-
+      return
+   end subroutine fft_3d_c2r
 
 end module decomp_2d_fft
diff --git a/src/fft_fftw3_f03.f90 b/src/fft_fftw3_f03.f90
index 6dab0f44..5057cc0d 100644
--- a/src/fft_fftw3_f03.f90
+++ b/src/fft_fftw3_f03.f90
@@ -1,1052 +1,1075 @@
-!=======================================================================
-! This is part of the 2DECOMP&FFT library
-! 
-! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
-! decomposition. It also implements a highly scalable distributed
-! three-dimensional Fast Fourier Transform (FFT).
-!
-! Copyright (C) 2009-2012 Ning Li, the Numerical Algorithms Group (NAG)
-!
-!=======================================================================
-
-! This is the FFTW implementation of the FFT library using 
+!! SPDX-License-Identifier: BSD-3-Clause
+
+! This is the FFTW implementation of the FFT library using
 ! the Fortran 2003 interface introduced in FFTW 3.3-beta1
 
 module decomp_2d_fft
 
-  use decomp_2d  ! 2D decomposition module
-  use, intrinsic :: iso_c_binding
-  
-  implicit none
-
-  include "fftw3.f03"
-  
-  private        ! Make everything private unless declared public
-
-  ! engine-specific global variables
-  integer, save :: plan_type = FFTW_MEASURE
-
-  ! FFTW plans
-  ! j=1,2,3 corresponds to the 1D FFTs in X,Y,Z direction, respectively
-  ! For c2c transforms: 
-  !     use plan(-1,j) for  forward transform; 
-  !     use plan( 1,j) for backward transform;
-  ! For r2c/c2r transforms:
-  !     use plan(0,j) for r2c transforms;
-  !     use plan(2,j) for c2r transforms;
-  type(C_PTR), save :: plan(-1:2,3)
-
-  integer, parameter, public :: DECOMP_2D_FFT_FORWARD = -1
-  integer, parameter, public :: DECOMP_2D_FFT_BACKWARD = 1
-  
-  ! Physical space data can be stored in either X-pencil or Z-pencil
-  integer, parameter, public :: PHYSICAL_IN_X = 1
-  integer, parameter, public :: PHYSICAL_IN_Z = 3 
-
-  integer, save :: format                 ! input X-pencil or Z-pencil
-  
-  ! The libary can only be initialised once
-  logical, save :: initialised = .false. 
-
-  ! Global size of the FFT
-  integer, save :: nx_fft, ny_fft, nz_fft
-
-  ! 2D processor grid
-  ! FIXME this is already available in the module decomp_2d
-  integer, save, dimension(2) :: dims
-
-  ! Decomposition objects
-  TYPE(DECOMP_INFO), pointer, save :: ph=>null()  ! physical space
-  TYPE(DECOMP_INFO), save :: sp  ! spectral space
-
-  ! Workspace to store the intermediate Y-pencil data
-  ! *** TODO: investigate how to use only one workspace array
-  complex(mytype), pointer :: wk2_c2c(:,:,:), wk2_r2c(:,:,:), wk13(:,:,:)
-  type(C_PTR) :: wk2_c2c_p, wk2_r2c_p, wk13_p
-
-  public :: decomp_2d_fft_init, decomp_2d_fft_3d, &
-       decomp_2d_fft_finalize, decomp_2d_fft_get_size
-  
-  ! Declare generic interfaces to handle different inputs
-  
-  interface decomp_2d_fft_init
-     module procedure fft_init_noarg
-     module procedure fft_init_arg
-     module procedure fft_init_general
-  end interface
-  
-  interface decomp_2d_fft_3d
-     module procedure fft_3d_c2c
-     module procedure fft_3d_r2c
-     module procedure fft_3d_c2r
-  end interface
-
-  
+   use decomp_2d_constants
+   use decomp_2d_mpi
+   use decomp_2d  ! 2D decomposition module
+   use, intrinsic :: iso_c_binding
+
+   implicit none
+
+   include "fftw3.f03"
+
+   private        ! Make everything private unless declared public
+
+   ! engine-specific global variables
+   integer, save :: plan_type = FFTW_MEASURE
+
+   ! FFTW plans
+   ! j=1,2,3 corresponds to the 1D FFTs in X,Y,Z direction, respectively
+   ! For c2c transforms:
+   !     use plan(-1,j) for  forward transform;
+   !     use plan( 1,j) for backward transform;
+   ! For r2c/c2r transforms:
+   !     use plan(0,j) for r2c transforms;
+   !     use plan(2,j) for c2r transforms;
+   type(C_PTR), save :: plan(-1:2, 3)
+
+   integer, parameter, public :: D2D_FFT_BACKEND = D2D_FFT_BACKEND_FFTW3_F03
+
+   integer, save :: format                 ! input X-pencil or Z-pencil
+
+   ! The libary can only be initialised once
+   logical, save :: initialised = .false.
+
+   ! Global size of the FFT
+   integer, save :: nx_fft, ny_fft, nz_fft
+
+   ! 2D processor grid
+   ! FIXME this is already available in the module decomp_2d
+   integer, save, dimension(2) :: dims
+
+   ! Decomposition objects
+   TYPE(DECOMP_INFO), pointer, save :: ph => null()  ! physical space
+   TYPE(DECOMP_INFO), target, save :: ph_target ! ph => ph_target or ph => decomp_main
+   TYPE(DECOMP_INFO), target, save :: sp  ! spectral space
+
+   ! Workspace to store the intermediate Y-pencil data
+   ! *** TODO: investigate how to use only one workspace array
+   complex(mytype), contiguous, pointer :: wk2_c2c(:, :, :), wk2_r2c(:, :, :), wk13(:, :, :)
+   type(C_PTR) :: wk2_c2c_p, wk13_p
+
+   public :: decomp_2d_fft_init, decomp_2d_fft_3d, &
+             decomp_2d_fft_finalize, decomp_2d_fft_get_size, &
+             decomp_2d_fft_get_ph, decomp_2d_fft_get_sp
+
+   ! Declare generic interfaces to handle different inputs
+
+   interface decomp_2d_fft_init
+      module procedure fft_init_noarg
+      module procedure fft_init_arg
+      module procedure fft_init_general
+   end interface
+
+   interface decomp_2d_fft_3d
+      module procedure fft_3d_c2c
+      module procedure fft_3d_r2c
+      module procedure fft_3d_c2r
+   end interface
+
+   interface
+      module subroutine decomp_2d_fft_log(backend)
+         character(len=*), intent(in) :: backend
+      end subroutine decomp_2d_fft_log
+   end interface
+
 contains
-  
-  
+
   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Initialise the FFT module
+   ! Initialise the FFT module
   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine fft_init_noarg
-    
-    implicit none
-    
-    call fft_init_arg(PHYSICAL_IN_X)  ! default input is X-pencil data
-    
-    return
-  end subroutine fft_init_noarg
+   subroutine fft_init_noarg
+
+      implicit none
+
+      call fft_init_arg(PHYSICAL_IN_X)  ! default input is X-pencil data
 
-  subroutine fft_init_arg(pencil)     ! allow to handle Z-pencil input
+      return
+   end subroutine fft_init_noarg
 
-    implicit none
+   subroutine fft_init_arg(pencil)     ! allow to handle Z-pencil input
 
-    integer, intent(IN) :: pencil
+      implicit none
 
-    call fft_init_general(pencil, nx_global, ny_global, nz_global)
+      integer, intent(IN) :: pencil
 
-    return
-  end subroutine fft_init_arg
+      call fft_init_general(pencil, nx_global, ny_global, nz_global)
 
-  ! Initialise the FFT library to perform arbitrary size transforms
-  subroutine fft_init_general(pencil, nx, ny, nz)
+      return
+   end subroutine fft_init_arg
 
-    implicit none
+   ! Initialise the FFT library to perform arbitrary size transforms
+   subroutine fft_init_general(pencil, nx, ny, nz)
 
-    integer, intent(IN) :: pencil
-    integer, intent(IN) :: nx, ny, nz
+      implicit none
 
-    integer :: errorcode
-    integer(C_SIZE_T) :: sz
+      integer, intent(IN) :: pencil
+      integer, intent(IN) :: nx, ny, nz
+
+      integer :: errorcode
+      integer(C_SIZE_T) :: sz
 
 #ifdef PROFILER
-    if (decomp_profiler_fft) call decomp_profiler_start("fft_init")
+      if (decomp_profiler_fft) call decomp_profiler_start("fft_init")
 #endif
 
-    if (initialised) then
-       errorcode = 4
-       call decomp_2d_abort(errorcode, &
-            'FFT library should only be initialised once')
-    end if
-    
-    format = pencil
-    nx_fft = nx
-    ny_fft = ny
-    nz_fft = nz
-
-    ! determine the processor grid in use
-    dims = get_decomp_dims()
-
-    ! for c2r/r2c interface:
-    ! if in physical space, a real array is of size: nx*ny*nz
-    ! in spectral space, the complex array is of size:
-    !         (nx/2+1)*ny*nz, if PHYSICAL_IN_X
-    !      or nx*ny*(nz/2+1), if PHYSICAL_IN_Z
-
-    if (nx_fft==nx_global.and.ny_fft==ny_global.and.nz_fft==nz_global) then
-       ph => decomp_main
-    else
-       call decomp_info_init(nx, ny, nz, ph)
-    endif
-    if (format==PHYSICAL_IN_X) then
-       call decomp_info_init(nx/2+1, ny, nz, sp)
-    else if (format==PHYSICAL_IN_Z) then
-       call decomp_info_init(nx, ny, nz/2+1, sp)
-    end if
-
-    sz = ph%ysz(1)*ph%ysz(2)*ph%ysz(3)
-    wk2_c2c_p = fftw_alloc_complex(sz)
-    call c_f_pointer(wk2_c2c_p,wk2_c2c,[ph%ysz(1),ph%ysz(2),ph%ysz(3)])
-
-    sz = sp%ysz(1)*sp%ysz(2)*sp%ysz(3)
-    wk2_r2c_p = fftw_alloc_complex(sz)
-    call c_f_pointer(wk2_r2c_p,wk2_r2c,[sp%ysz(1),sp%ysz(2),sp%ysz(3)])
-
-
-    if (format==PHYSICAL_IN_X) then
-       sz = sp%xsz(1)*sp%xsz(2)*sp%xsz(3)
-       wk13_p = fftw_alloc_complex(sz)
-       call c_f_pointer(wk13_p,wk13,[sp%xsz(1),sp%xsz(2),sp%xsz(3)])
-    else if (format==PHYSICAL_IN_Z) then
-       sz = sp%zsz(1)*sp%zsz(2)*sp%zsz(3)
-       wk13_p = fftw_alloc_complex(sz)
-       call c_f_pointer(wk13_p,wk13,[sp%zsz(1),sp%zsz(2),sp%zsz(3)])
-    end if
-
-    call init_fft_engine
-    
-    initialised = .true.
+      ! Safety checks
+      if (initialised) then
+         errorcode = 4
+         call decomp_2d_abort(errorcode, &
+                              'FFT library should only be initialised once')
+      end if
+      if (nx <= 0) call decomp_2d_abort(__FILE__, __LINE__, nx, "Invalid value for nx")
+      if (ny <= 0) call decomp_2d_abort(__FILE__, __LINE__, ny, "Invalid value for ny")
+      if (nz <= 0) call decomp_2d_abort(__FILE__, __LINE__, nz, "Invalid value for nz")
+
+      format = pencil
+      nx_fft = nx
+      ny_fft = ny
+      nz_fft = nz
+
+      ! determine the processor grid in use
+      dims = get_decomp_dims()
+
+      ! for c2r/r2c interface:
+      ! if in physical space, a real array is of size: nx*ny*nz
+      ! in spectral space, the complex array is of size:
+      !         (nx/2+1)*ny*nz, if PHYSICAL_IN_X
+      !      or nx*ny*(nz/2+1), if PHYSICAL_IN_Z
+
+      if (nx_fft == nx_global .and. ny_fft == ny_global .and. nz_fft == nz_global) then
+         ph => decomp_main
+      else
+         call decomp_info_init(nx, ny, nz, ph_target)
+         ph => ph_target
+      end if
+      if (format == PHYSICAL_IN_X) then
+         call decomp_info_init(nx / 2 + 1, ny, nz, sp)
+      else if (format == PHYSICAL_IN_Z) then
+         call decomp_info_init(nx, ny, nz / 2 + 1, sp)
+      else
+         call decomp_2d_abort(__FILE__, __LINE__, format, "Invalid value for format")
+      end if
+
+      !
+      ! Allocate the workspace fo intermediate y-pencil data
+      ! The largest memory block needed is the one for c2c transforms
+      !
+      sz = ph%ysz(1) * ph%ysz(2) * ph%ysz(3)
+      wk2_c2c_p = fftw_alloc_complex(sz)
+      call c_f_pointer(wk2_c2c_p, wk2_c2c, [ph%ysz(1), ph%ysz(2), ph%ysz(3)])
+      !
+      ! A smaller memory block is needed for r2c and c2r transforms
+      ! wk2_c2c and wk2_r2c start at the same location
+      !
+      !    Size of wk2_c2c : ph%ysz(1), ph%ysz(2), ph%ysz(3)
+      !    Size of wk2_r2c : sp%ysz(1), sp%ysz(2), sp%ysz(3)
+      !
+      call c_f_pointer(wk2_c2c_p, wk2_r2c, [sp%ysz(1), sp%ysz(2), sp%ysz(3)])
+      !
+      ! Allocate the workspace for r2c and c2r transforms
+      !
+      ! wk13 can not be easily fused with wk2_*2c due to statements such as
+      ! transpose_y_to_x(wk2_r2c, wk13, sp)
+      ! transpose_y_to_z(wk2_r2c, wk13, sp)
+      !
+      if (format == PHYSICAL_IN_X) then
+         sz = sp%xsz(1) * sp%xsz(2) * sp%xsz(3)
+         wk13_p = fftw_alloc_complex(sz)
+         call c_f_pointer(wk13_p, wk13, [sp%xsz(1), sp%xsz(2), sp%xsz(3)])
+      else if (format == PHYSICAL_IN_Z) then
+         sz = sp%zsz(1) * sp%zsz(2) * sp%zsz(3)
+         wk13_p = fftw_alloc_complex(sz)
+         call c_f_pointer(wk13_p, wk13, [sp%zsz(1), sp%zsz(2), sp%zsz(3)])
+      end if
+
+      call init_fft_engine
+
+      initialised = .true.
 
 #ifdef PROFILER
-    if (decomp_profiler_fft) call decomp_profiler_end("fft_init")
+      if (decomp_profiler_fft) call decomp_profiler_end("fft_init")
 #endif
 
-    return
-  end subroutine fft_init_general
+      return
+   end subroutine fft_init_general
 
-  
   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Final clean up
+   ! Final clean up
   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine decomp_2d_fft_finalize
-    
-    implicit none
+   subroutine decomp_2d_fft_finalize
+
+      implicit none
 
 #ifdef PROFILER
-    if (decomp_profiler_fft) call decomp_profiler_start("fft_fin")
+      if (decomp_profiler_fft) call decomp_profiler_start("fft_fin")
 #endif
 
-    if (nx_fft==nx_global.and.ny_fft==ny_global.and.nz_fft==nz_global) then
-      nullify(ph)
-    else
-      call decomp_info_finalize(ph)
-    endif
-    call decomp_info_finalize(sp)
+      if (nx_fft /= nx_global .or. ny_fft /= ny_global .or. nz_fft /= nz_global) then
+         call decomp_info_finalize(ph_target)
+      end if
+      nullify (ph)
+      call decomp_info_finalize(sp)
 
-    call fftw_free(wk2_c2c_p)
-    call fftw_free(wk2_r2c_p)
-    call fftw_free(wk13_p)
+      call fftw_free(wk2_c2c_p)
+      nullify (wk2_c2c)
+      nullify (wk2_r2c)
+      call fftw_free(wk13_p)
+      nullify (wk13)
 
-    call finalize_fft_engine
+      call finalize_fft_engine
 
-    initialised = .false.
+      initialised = .false.
 
 #ifdef PROFILER
-    if (decomp_profiler_fft) call decomp_profiler_end("fft_fin")
+      if (decomp_profiler_fft) call decomp_profiler_end("fft_fin")
 #endif
 
-    return
-  end subroutine decomp_2d_fft_finalize
+      return
+   end subroutine decomp_2d_fft_finalize
+
+  !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+   ! Return the size, starting/ending index of the distributed array
+   !  whose global size is (nx/2+1)*ny*nz, for defining data structures
+   !  in r2c and c2r interfaces
+  !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+   subroutine decomp_2d_fft_get_size(istart, iend, isize)
+
+      implicit none
+      integer, dimension(3), intent(OUT) :: istart, iend, isize
 
+      if (format == PHYSICAL_IN_X) then
+         istart = sp%zst
+         iend = sp%zen
+         isize = sp%zsz
+      else if (format == PHYSICAL_IN_Z) then
+         istart = sp%xst
+         iend = sp%xen
+         isize = sp%xsz
+      end if
+
+      return
+   end subroutine decomp_2d_fft_get_size
 
   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Return the size, starting/ending index of the distributed array 
-  !  whose global size is (nx/2+1)*ny*nz, for defining data structures
-  !  in r2c and c2r interfaces
+   ! Return a pointer to the decomp_info object ph
+   !
+   ! The caller should not apply decomp_info_finalize on the pointer
   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine decomp_2d_fft_get_size(istart, iend, isize)
-    
-    implicit none
-    integer, dimension(3), intent(OUT) :: istart, iend, isize
-    
-    if (format==PHYSICAL_IN_X) then
-       istart = sp%zst
-       iend   = sp%zen
-       isize  = sp%zsz
-    else if (format==PHYSICAL_IN_Z) then
-       istart = sp%xst
-       iend   = sp%xen
-       isize  = sp%xsz
-    end if
-    
-    return
-  end subroutine decomp_2d_fft_get_size
-
-
-  ! Return a FFTW3 plan for multiple 1D c2c FFTs in X direction
-  subroutine c2c_1m_x_plan(plan1, decomp, isign)
-
-    implicit none
-
-    type(C_PTR) :: plan1
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
-    integer, intent(IN) :: isign
+   function decomp_2d_fft_get_ph()
+
+      implicit none
+
+      type(decomp_info), pointer :: decomp_2d_fft_get_ph
+
+      if (.not. associated(ph)) then
+         call decomp_2d_abort(__FILE__, __LINE__, -1, 'FFT library must be initialised first')
+      end if
+      decomp_2d_fft_get_ph => ph
+
+   end function decomp_2d_fft_get_ph
+
+  !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+   ! Return a pointer to the decomp_info object sp
+   !
+   ! The caller should not apply decomp_info_finalize on the pointer
+  !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+   function decomp_2d_fft_get_sp()
+
+      implicit none
+
+      type(decomp_info), pointer :: decomp_2d_fft_get_sp
+
+      if (.not. associated(ph)) then
+         call decomp_2d_abort(__FILE__, __LINE__, -1, 'FFT library must be initialised first')
+      end if
+      decomp_2d_fft_get_sp => sp
+
+   end function decomp_2d_fft_get_sp
+
+   ! Return a FFTW3 plan for multiple 1D c2c FFTs in X direction
+   subroutine c2c_1m_x_plan(plan1, decomp, isign)
+
+      implicit none
+
+      type(C_PTR) :: plan1
+      TYPE(DECOMP_INFO), intent(IN) :: decomp
+      integer, intent(IN) :: isign
 
 #ifdef DOUBLE_PREC
-    complex(C_DOUBLE_COMPLEX), pointer :: a1(:,:,:)
-    complex(C_DOUBLE_COMPLEX), pointer :: a1o(:,:,:)
+      complex(C_DOUBLE_COMPLEX), pointer :: a1(:, :, :)
+      complex(C_DOUBLE_COMPLEX), pointer :: a1o(:, :, :)
 #else
-    complex(C_FLOAT_COMPLEX), pointer :: a1(:,:,:)
-    complex(C_FLOAT_COMPLEX), pointer :: a1o(:,:,:)
+      complex(C_FLOAT_COMPLEX), pointer :: a1(:, :, :)
+      complex(C_FLOAT_COMPLEX), pointer :: a1o(:, :, :)
 #endif
-    type(C_PTR) :: a1_p
-    integer(C_SIZE_T) :: sz
+      type(C_PTR) :: a1_p
+      integer(C_SIZE_T) :: sz
 
-    sz = decomp%xsz(1)*decomp%xsz(2)*decomp%xsz(3)
-    a1_p = fftw_alloc_complex(sz)
-    call c_f_pointer(a1_p,a1,[decomp%xsz(1),decomp%xsz(2),decomp%xsz(3)])
-    call c_f_pointer(a1_p,a1o,[decomp%xsz(1),decomp%xsz(2),decomp%xsz(3)])
+      sz = decomp%xsz(1) * decomp%xsz(2) * decomp%xsz(3)
+      a1_p = fftw_alloc_complex(sz)
+      call c_f_pointer(a1_p, a1, [decomp%xsz(1), decomp%xsz(2), decomp%xsz(3)])
+      call c_f_pointer(a1_p, a1o, [decomp%xsz(1), decomp%xsz(2), decomp%xsz(3)])
 
 #ifdef DOUBLE_PREC
-    plan1 = fftw_plan_many_dft(1, decomp%xsz(1), &
-         decomp%xsz(2)*decomp%xsz(3), a1, decomp%xsz(1), 1, &
-         decomp%xsz(1), a1o, decomp%xsz(1), 1, decomp%xsz(1), &
-         isign, plan_type)
+      plan1 = fftw_plan_many_dft(1, decomp%xsz(1), &
+                                 decomp%xsz(2) * decomp%xsz(3), a1, decomp%xsz(1), 1, &
+                                 decomp%xsz(1), a1o, decomp%xsz(1), 1, decomp%xsz(1), &
+                                 isign, plan_type)
 #else
-    plan1 = fftwf_plan_many_dft(1, decomp%xsz(1), &
-         decomp%xsz(2)*decomp%xsz(3), a1, decomp%xsz(1), 1, &
-         decomp%xsz(1), a1o, decomp%xsz(1), 1, decomp%xsz(1), &
-         isign, plan_type)
+      plan1 = fftwf_plan_many_dft(1, decomp%xsz(1), &
+                                  decomp%xsz(2) * decomp%xsz(3), a1, decomp%xsz(1), 1, &
+                                  decomp%xsz(1), a1o, decomp%xsz(1), 1, decomp%xsz(1), &
+                                  isign, plan_type)
 #endif
 
-    call fftw_free(a1_p)
+      call fftw_free(a1_p)
 
-    return
-  end subroutine c2c_1m_x_plan
+      return
+   end subroutine c2c_1m_x_plan
 
-  ! Return a FFTW3 plan for multiple 1D c2c FFTs in Y direction
-  subroutine c2c_1m_y_plan(plan1, decomp, isign)
+   ! Return a FFTW3 plan for multiple 1D c2c FFTs in Y direction
+   subroutine c2c_1m_y_plan(plan1, decomp, isign)
 
-    implicit none
+      implicit none
 
-    type(C_PTR) :: plan1
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
-    integer, intent(IN) :: isign
+      type(C_PTR) :: plan1
+      TYPE(DECOMP_INFO), intent(IN) :: decomp
+      integer, intent(IN) :: isign
 
 #ifdef DOUBLE_PREC
-    complex(C_DOUBLE_COMPLEX), pointer :: a1(:,:)
-    complex(C_DOUBLE_COMPLEX), pointer :: a1o(:,:)
+      complex(C_DOUBLE_COMPLEX), pointer :: a1(:, :)
+      complex(C_DOUBLE_COMPLEX), pointer :: a1o(:, :)
 #else
-    complex(C_FLOAT_COMPLEX), pointer :: a1(:,:)
-    complex(C_FLOAT_COMPLEX), pointer :: a1o(:,:)
+      complex(C_FLOAT_COMPLEX), pointer :: a1(:, :)
+      complex(C_FLOAT_COMPLEX), pointer :: a1o(:, :)
 #endif
-    type(C_PTR) :: a1_p
-    integer(C_SIZE_T) :: sz
+      type(C_PTR) :: a1_p
+      integer(C_SIZE_T) :: sz
 
-    ! Due to memory pattern of 3D arrays, 1D FFTs along Y have to be
-    ! done one Z-plane at a time. So plan for 2D data sets here.
-    sz = decomp%ysz(1)*decomp%ysz(2)
-    a1_p = fftw_alloc_complex(sz)
-    call c_f_pointer(a1_p,a1,[decomp%ysz(1),decomp%ysz(2)])
-    call c_f_pointer(a1_p,a1o,[decomp%ysz(1),decomp%ysz(2)])
+      ! Due to memory pattern of 3D arrays, 1D FFTs along Y have to be
+      ! done one Z-plane at a time. So plan for 2D data sets here.
+      sz = decomp%ysz(1) * decomp%ysz(2)
+      a1_p = fftw_alloc_complex(sz)
+      call c_f_pointer(a1_p, a1, [decomp%ysz(1), decomp%ysz(2)])
+      call c_f_pointer(a1_p, a1o, [decomp%ysz(1), decomp%ysz(2)])
 
 #ifdef DOUBLE_PREC
-    plan1 =  fftw_plan_many_dft(1, decomp%ysz(2), decomp%ysz(1), &
-         a1, decomp%ysz(2), decomp%ysz(1), 1, a1o, decomp%ysz(2), &
-         decomp%ysz(1), 1, isign, plan_type)
+      plan1 = fftw_plan_many_dft(1, decomp%ysz(2), decomp%ysz(1), &
+                                 a1, decomp%ysz(2), decomp%ysz(1), 1, a1o, decomp%ysz(2), &
+                                 decomp%ysz(1), 1, isign, plan_type)
 #else
-    plan1 = fftwf_plan_many_dft(1, decomp%ysz(2), decomp%ysz(1), &
-         a1, decomp%ysz(2), decomp%ysz(1), 1, a1o, decomp%ysz(2), &
-         decomp%ysz(1), 1, isign, plan_type)
+      plan1 = fftwf_plan_many_dft(1, decomp%ysz(2), decomp%ysz(1), &
+                                  a1, decomp%ysz(2), decomp%ysz(1), 1, a1o, decomp%ysz(2), &
+                                  decomp%ysz(1), 1, isign, plan_type)
 #endif
 
-    call fftw_free(a1_p)
+      call fftw_free(a1_p)
 
-    return
-  end subroutine c2c_1m_y_plan
+      return
+   end subroutine c2c_1m_y_plan
 
-  ! Return a FFTW3 plan for multiple 1D c2c FFTs in Z direction
-  subroutine c2c_1m_z_plan(plan1, decomp, isign)
+   ! Return a FFTW3 plan for multiple 1D c2c FFTs in Z direction
+   subroutine c2c_1m_z_plan(plan1, decomp, isign)
 
-    implicit none
+      implicit none
 
-    type(C_PTR) :: plan1
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
-    integer, intent(IN) :: isign
+      type(C_PTR) :: plan1
+      TYPE(DECOMP_INFO), intent(IN) :: decomp
+      integer, intent(IN) :: isign
 
 #ifdef DOUBLE_PREC
-    complex(C_DOUBLE_COMPLEX), pointer :: a1(:,:,:)
-    complex(C_DOUBLE_COMPLEX), pointer :: a1o(:,:,:)
+      complex(C_DOUBLE_COMPLEX), pointer :: a1(:, :, :)
+      complex(C_DOUBLE_COMPLEX), pointer :: a1o(:, :, :)
 #else
-    complex(C_FLOAT_COMPLEX), pointer :: a1(:,:,:)
-    complex(C_FLOAT_COMPLEX), pointer :: a1o(:,:,:)
+      complex(C_FLOAT_COMPLEX), pointer :: a1(:, :, :)
+      complex(C_FLOAT_COMPLEX), pointer :: a1o(:, :, :)
 #endif
-    type(C_PTR) :: a1_p
-    integer(C_SIZE_T) :: sz
+      type(C_PTR) :: a1_p
+      integer(C_SIZE_T) :: sz
 
-    sz = decomp%zsz(1)*decomp%zsz(2)*decomp%zsz(3)
-    a1_p = fftw_alloc_complex(sz)
-    call c_f_pointer(a1_p,a1,[decomp%zsz(1),decomp%zsz(2),decomp%zsz(3)])
-    call c_f_pointer(a1_p,a1o,[decomp%zsz(1),decomp%zsz(2),decomp%zsz(3)])
+      sz = decomp%zsz(1) * decomp%zsz(2) * decomp%zsz(3)
+      a1_p = fftw_alloc_complex(sz)
+      call c_f_pointer(a1_p, a1, [decomp%zsz(1), decomp%zsz(2), decomp%zsz(3)])
+      call c_f_pointer(a1_p, a1o, [decomp%zsz(1), decomp%zsz(2), decomp%zsz(3)])
 
 #ifdef DOUBLE_PREC
-    plan1 =  fftw_plan_many_dft(1, decomp%zsz(3), &
-         decomp%zsz(1)*decomp%zsz(2), a1, decomp%zsz(3), &
-         decomp%zsz(1)*decomp%zsz(2), 1, a1o, decomp%zsz(3), &
-         decomp%zsz(1)*decomp%zsz(2), 1, isign, plan_type)
+      plan1 = fftw_plan_many_dft(1, decomp%zsz(3), &
+                                 decomp%zsz(1) * decomp%zsz(2), a1, decomp%zsz(3), &
+                                 decomp%zsz(1) * decomp%zsz(2), 1, a1o, decomp%zsz(3), &
+                                 decomp%zsz(1) * decomp%zsz(2), 1, isign, plan_type)
 #else
-    plan1 = fftwf_plan_many_dft(1, decomp%zsz(3), &
-         decomp%zsz(1)*decomp%zsz(2), a1, decomp%zsz(3), &
-         decomp%zsz(1)*decomp%zsz(2), 1, a1o, decomp%zsz(3), &
-         decomp%zsz(1)*decomp%zsz(2), 1, isign, plan_type)
+      plan1 = fftwf_plan_many_dft(1, decomp%zsz(3), &
+                                  decomp%zsz(1) * decomp%zsz(2), a1, decomp%zsz(3), &
+                                  decomp%zsz(1) * decomp%zsz(2), 1, a1o, decomp%zsz(3), &
+                                  decomp%zsz(1) * decomp%zsz(2), 1, isign, plan_type)
 #endif
 
-    call fftw_free(a1_p)
+      call fftw_free(a1_p)
 
-    return
-  end subroutine c2c_1m_z_plan
+      return
+   end subroutine c2c_1m_z_plan
 
-  ! Return a FFTW3 plan for multiple 1D r2c FFTs in X direction
-  subroutine r2c_1m_x_plan(plan1, decomp_ph, decomp_sp)
+   ! Return a FFTW3 plan for multiple 1D r2c FFTs in X direction
+   subroutine r2c_1m_x_plan(plan1, decomp_ph, decomp_sp)
 
-    implicit none
+      implicit none
 
-    type(C_PTR) :: plan1
-    TYPE(DECOMP_INFO), intent(IN) :: decomp_ph
-    TYPE(DECOMP_INFO), intent(IN) :: decomp_sp
+      type(C_PTR) :: plan1
+      TYPE(DECOMP_INFO), intent(IN) :: decomp_ph
+      TYPE(DECOMP_INFO), intent(IN) :: decomp_sp
 
-    real(mytype), pointer :: a1(:,:,:)
-    complex(mytype), pointer :: a2(:,:,:)
-    type(C_PTR) :: a1_p, a2_p
-    integer(C_SIZE_T) :: sz
+      real(mytype), pointer :: a1(:, :, :)
+      complex(mytype), pointer :: a2(:, :, :)
+      type(C_PTR) :: a1_p, a2_p
+      integer(C_SIZE_T) :: sz
 
-    sz = decomp_ph%xsz(1)*decomp_ph%xsz(2)*decomp_ph%xsz(3)
-    a1_p = fftw_alloc_real(sz)
-    call c_f_pointer(a1_p,a1, &
-         [decomp_ph%xsz(1),decomp_ph%xsz(2),decomp_ph%xsz(3)])
-    sz = decomp_sp%xsz(1)*decomp_sp%xsz(2)*decomp_sp%xsz(3)
-    a2_p = fftw_alloc_complex(sz)
-    call c_f_pointer(a2_p,a2, &
-         [decomp_sp%xsz(1),decomp_sp%xsz(2),decomp_sp%xsz(3)])
+      sz = decomp_ph%xsz(1) * decomp_ph%xsz(2) * decomp_ph%xsz(3)
+      a1_p = fftw_alloc_real(sz)
+      call c_f_pointer(a1_p, a1, &
+                       [decomp_ph%xsz(1), decomp_ph%xsz(2), decomp_ph%xsz(3)])
+      sz = decomp_sp%xsz(1) * decomp_sp%xsz(2) * decomp_sp%xsz(3)
+      a2_p = fftw_alloc_complex(sz)
+      call c_f_pointer(a2_p, a2, &
+                       [decomp_sp%xsz(1), decomp_sp%xsz(2), decomp_sp%xsz(3)])
 
 #ifdef DOUBLE_PREC
-    plan1 =  fftw_plan_many_dft_r2c(1, decomp_ph%xsz(1), &
-         decomp_ph%xsz(2)*decomp_ph%xsz(3), a1, decomp_ph%xsz(1), 1, &
-         decomp_ph%xsz(1), a2, decomp_sp%xsz(1), 1, decomp_sp%xsz(1), &
-         plan_type)
+      plan1 = fftw_plan_many_dft_r2c(1, decomp_ph%xsz(1), &
+                                     decomp_ph%xsz(2) * decomp_ph%xsz(3), a1, decomp_ph%xsz(1), 1, &
+                                     decomp_ph%xsz(1), a2, decomp_sp%xsz(1), 1, decomp_sp%xsz(1), &
+                                     plan_type)
 #else
-    plan1 = fftwf_plan_many_dft_r2c(1, decomp_ph%xsz(1), &
-         decomp_ph%xsz(2)*decomp_ph%xsz(3), a1, decomp_ph%xsz(1), 1, &
-         decomp_ph%xsz(1), a2, decomp_sp%xsz(1), 1, decomp_sp%xsz(1), &
-         plan_type)
+      plan1 = fftwf_plan_many_dft_r2c(1, decomp_ph%xsz(1), &
+                                      decomp_ph%xsz(2) * decomp_ph%xsz(3), a1, decomp_ph%xsz(1), 1, &
+                                      decomp_ph%xsz(1), a2, decomp_sp%xsz(1), 1, decomp_sp%xsz(1), &
+                                      plan_type)
 #endif
 
-    call fftw_free(a1_p)
-    call fftw_free(a2_p)
+      call fftw_free(a1_p)
+      call fftw_free(a2_p)
 
-    return
-  end subroutine r2c_1m_x_plan
+      return
+   end subroutine r2c_1m_x_plan
 
-  ! Return a FFTW3 plan for multiple 1D c2r FFTs in X direction
-  subroutine c2r_1m_x_plan(plan1, decomp_sp, decomp_ph)
+   ! Return a FFTW3 plan for multiple 1D c2r FFTs in X direction
+   subroutine c2r_1m_x_plan(plan1, decomp_sp, decomp_ph)
 
-    implicit none
+      implicit none
 
-    type(C_PTR) :: plan1
-    TYPE(DECOMP_INFO), intent(IN) :: decomp_sp
-    TYPE(DECOMP_INFO), intent(IN) :: decomp_ph
+      type(C_PTR) :: plan1
+      TYPE(DECOMP_INFO), intent(IN) :: decomp_sp
+      TYPE(DECOMP_INFO), intent(IN) :: decomp_ph
 
-    complex(mytype), pointer :: a1(:,:,:)
-    real(mytype), pointer :: a2(:,:,:)
-    type(C_PTR) :: a1_p, a2_p
-    integer(C_SIZE_T) :: sz
+      complex(mytype), pointer :: a1(:, :, :)
+      real(mytype), pointer :: a2(:, :, :)
+      type(C_PTR) :: a1_p, a2_p
+      integer(C_SIZE_T) :: sz
 
-    sz = decomp_sp%xsz(1)*decomp_sp%xsz(2)*decomp_sp%xsz(3)
-    a1_p = fftw_alloc_complex(sz)
-    call c_f_pointer(a1_p,a1, &
-         [decomp_sp%xsz(1),decomp_sp%xsz(2),decomp_sp%xsz(3)])
-    sz = decomp_ph%xsz(1)*decomp_ph%xsz(2)*decomp_ph%xsz(3)
-    a2_p = fftw_alloc_real(sz)
-    call c_f_pointer(a2_p,a2, &
-         [decomp_ph%xsz(1),decomp_ph%xsz(2),decomp_ph%xsz(3)])
+      sz = decomp_sp%xsz(1) * decomp_sp%xsz(2) * decomp_sp%xsz(3)
+      a1_p = fftw_alloc_complex(sz)
+      call c_f_pointer(a1_p, a1, &
+                       [decomp_sp%xsz(1), decomp_sp%xsz(2), decomp_sp%xsz(3)])
+      sz = decomp_ph%xsz(1) * decomp_ph%xsz(2) * decomp_ph%xsz(3)
+      a2_p = fftw_alloc_real(sz)
+      call c_f_pointer(a2_p, a2, &
+                       [decomp_ph%xsz(1), decomp_ph%xsz(2), decomp_ph%xsz(3)])
 
 #ifdef DOUBLE_PREC
-    plan1 =  fftw_plan_many_dft_c2r(1, decomp_ph%xsz(1), &
-         decomp_ph%xsz(2)*decomp_ph%xsz(3), a1, decomp_sp%xsz(1), 1, &
-         decomp_sp%xsz(1), a2, decomp_ph%xsz(1), 1, decomp_ph%xsz(1), &
-         plan_type)
+      plan1 = fftw_plan_many_dft_c2r(1, decomp_ph%xsz(1), &
+                                     decomp_ph%xsz(2) * decomp_ph%xsz(3), a1, decomp_sp%xsz(1), 1, &
+                                     decomp_sp%xsz(1), a2, decomp_ph%xsz(1), 1, decomp_ph%xsz(1), &
+                                     plan_type)
 #else
-    plan1 = fftwf_plan_many_dft_c2r(1, decomp_ph%xsz(1), &
-         decomp_ph%xsz(2)*decomp_ph%xsz(3), a1, decomp_sp%xsz(1), 1, &
-         decomp_sp%xsz(1), a2, decomp_ph%xsz(1), 1, decomp_ph%xsz(1), &
-         plan_type)
+      plan1 = fftwf_plan_many_dft_c2r(1, decomp_ph%xsz(1), &
+                                      decomp_ph%xsz(2) * decomp_ph%xsz(3), a1, decomp_sp%xsz(1), 1, &
+                                      decomp_sp%xsz(1), a2, decomp_ph%xsz(1), 1, decomp_ph%xsz(1), &
+                                      plan_type)
 #endif
 
-    call fftw_free(a1_p)
-    call fftw_free(a2_p)
+      call fftw_free(a1_p)
+      call fftw_free(a2_p)
 
-    return
-  end subroutine c2r_1m_x_plan
+      return
+   end subroutine c2r_1m_x_plan
 
-  ! Return a FFTW3 plan for multiple 1D r2c FFTs in Z direction
-  subroutine r2c_1m_z_plan(plan1, decomp_ph, decomp_sp)
+   ! Return a FFTW3 plan for multiple 1D r2c FFTs in Z direction
+   subroutine r2c_1m_z_plan(plan1, decomp_ph, decomp_sp)
 
-    implicit none
+      implicit none
 
-    type(C_PTR) :: plan1
-    TYPE(DECOMP_INFO), intent(IN) :: decomp_ph
-    TYPE(DECOMP_INFO), intent(IN) :: decomp_sp
+      type(C_PTR) :: plan1
+      TYPE(DECOMP_INFO), intent(IN) :: decomp_ph
+      TYPE(DECOMP_INFO), intent(IN) :: decomp_sp
 
-    real(mytype), pointer :: a1(:,:,:)
-    complex(mytype), pointer :: a2(:,:,:)
-    type(C_PTR) :: a1_p, a2_p
-    integer(C_SIZE_T) :: sz
+      real(mytype), pointer :: a1(:, :, :)
+      complex(mytype), pointer :: a2(:, :, :)
+      type(C_PTR) :: a1_p, a2_p
+      integer(C_SIZE_T) :: sz
 
-    sz = decomp_ph%zsz(1)*decomp_ph%zsz(2)*decomp_ph%zsz(3)
-    a1_p = fftw_alloc_real(sz)
-    call c_f_pointer(a1_p,a1, &
-         [decomp_ph%zsz(1),decomp_ph%zsz(2),decomp_ph%zsz(3)])
-    sz = decomp_sp%zsz(1)*decomp_sp%zsz(2)*decomp_sp%zsz(3)
-    a2_p = fftw_alloc_complex(sz)
-    call c_f_pointer(a2_p,a2, &
-         [decomp_sp%zsz(1),decomp_sp%zsz(2),decomp_sp%zsz(3)])
+      sz = decomp_ph%zsz(1) * decomp_ph%zsz(2) * decomp_ph%zsz(3)
+      a1_p = fftw_alloc_real(sz)
+      call c_f_pointer(a1_p, a1, &
+                       [decomp_ph%zsz(1), decomp_ph%zsz(2), decomp_ph%zsz(3)])
+      sz = decomp_sp%zsz(1) * decomp_sp%zsz(2) * decomp_sp%zsz(3)
+      a2_p = fftw_alloc_complex(sz)
+      call c_f_pointer(a2_p, a2, &
+                       [decomp_sp%zsz(1), decomp_sp%zsz(2), decomp_sp%zsz(3)])
 
 #ifdef DOUBLE_PREC
-    plan1 =  fftw_plan_many_dft_r2c(1, decomp_ph%zsz(3), &
-         decomp_ph%zsz(1)*decomp_ph%zsz(2), a1, decomp_ph%zsz(3), &
-         decomp_ph%zsz(1)*decomp_ph%zsz(2), 1, a2, decomp_sp%zsz(3), &
-         decomp_sp%zsz(1)*decomp_sp%zsz(2), 1, plan_type)
+      plan1 = fftw_plan_many_dft_r2c(1, decomp_ph%zsz(3), &
+                                     decomp_ph%zsz(1) * decomp_ph%zsz(2), a1, decomp_ph%zsz(3), &
+                                     decomp_ph%zsz(1) * decomp_ph%zsz(2), 1, a2, decomp_sp%zsz(3), &
+                                     decomp_sp%zsz(1) * decomp_sp%zsz(2), 1, plan_type)
 #else
-    plan1 = fftwf_plan_many_dft_r2c(1, decomp_ph%zsz(3), &
-         decomp_ph%zsz(1)*decomp_ph%zsz(2), a1, decomp_ph%zsz(3), &
-         decomp_ph%zsz(1)*decomp_ph%zsz(2), 1, a2, decomp_sp%zsz(3), &
-         decomp_sp%zsz(1)*decomp_sp%zsz(2), 1, plan_type)
+      plan1 = fftwf_plan_many_dft_r2c(1, decomp_ph%zsz(3), &
+                                      decomp_ph%zsz(1) * decomp_ph%zsz(2), a1, decomp_ph%zsz(3), &
+                                      decomp_ph%zsz(1) * decomp_ph%zsz(2), 1, a2, decomp_sp%zsz(3), &
+                                      decomp_sp%zsz(1) * decomp_sp%zsz(2), 1, plan_type)
 #endif
 
-    call fftw_free(a1_p)
-    call fftw_free(a2_p)
+      call fftw_free(a1_p)
+      call fftw_free(a2_p)
 
-    return
-  end subroutine r2c_1m_z_plan
+      return
+   end subroutine r2c_1m_z_plan
 
-  ! Return a FFTW3 plan for multiple 1D c2r FFTs in Z direction
-  subroutine c2r_1m_z_plan(plan1, decomp_sp, decomp_ph)
+   ! Return a FFTW3 plan for multiple 1D c2r FFTs in Z direction
+   subroutine c2r_1m_z_plan(plan1, decomp_sp, decomp_ph)
 
-    implicit none
+      implicit none
 
-    type(C_PTR) :: plan1
-    TYPE(DECOMP_INFO), intent(IN) :: decomp_sp
-    TYPE(DECOMP_INFO), intent(IN) :: decomp_ph
+      type(C_PTR) :: plan1
+      TYPE(DECOMP_INFO), intent(IN) :: decomp_sp
+      TYPE(DECOMP_INFO), intent(IN) :: decomp_ph
 
-    complex(mytype), pointer :: a1(:,:,:)
-    real(mytype), pointer :: a2(:,:,:)
-    type(C_PTR) :: a1_p, a2_p
-    integer(C_SIZE_T) :: sz
+      complex(mytype), pointer :: a1(:, :, :)
+      real(mytype), pointer :: a2(:, :, :)
+      type(C_PTR) :: a1_p, a2_p
+      integer(C_SIZE_T) :: sz
 
-    sz = decomp_sp%zsz(1)*decomp_sp%zsz(2)*decomp_sp%zsz(3)
-    a1_p = fftw_alloc_complex(sz)
-    call c_f_pointer(a1_p,a1, &
-         [decomp_sp%zsz(1),decomp_sp%zsz(2),decomp_sp%zsz(3)])
-    sz = decomp_ph%zsz(1)*decomp_ph%zsz(2)*decomp_ph%zsz(3)
-    a2_p = fftw_alloc_real(sz)
-    call c_f_pointer(a2_p,a2, &
-         [decomp_ph%zsz(1),decomp_ph%zsz(2),decomp_ph%zsz(3)])
+      sz = decomp_sp%zsz(1) * decomp_sp%zsz(2) * decomp_sp%zsz(3)
+      a1_p = fftw_alloc_complex(sz)
+      call c_f_pointer(a1_p, a1, &
+                       [decomp_sp%zsz(1), decomp_sp%zsz(2), decomp_sp%zsz(3)])
+      sz = decomp_ph%zsz(1) * decomp_ph%zsz(2) * decomp_ph%zsz(3)
+      a2_p = fftw_alloc_real(sz)
+      call c_f_pointer(a2_p, a2, &
+                       [decomp_ph%zsz(1), decomp_ph%zsz(2), decomp_ph%zsz(3)])
 
 #ifdef DOUBLE_PREC
-    plan1 =  fftw_plan_many_dft_c2r(1, decomp_ph%zsz(3), &
-         decomp_ph%zsz(1)*decomp_ph%zsz(2), a1, decomp_sp%zsz(3), &
-         decomp_sp%zsz(1)*decomp_sp%zsz(2), 1, a2, decomp_ph%zsz(3), &
-         decomp_ph%zsz(1)*decomp_ph%zsz(2), 1, plan_type)
+      plan1 = fftw_plan_many_dft_c2r(1, decomp_ph%zsz(3), &
+                                     decomp_ph%zsz(1) * decomp_ph%zsz(2), a1, decomp_sp%zsz(3), &
+                                     decomp_sp%zsz(1) * decomp_sp%zsz(2), 1, a2, decomp_ph%zsz(3), &
+                                     decomp_ph%zsz(1) * decomp_ph%zsz(2), 1, plan_type)
 #else
-    plan1 = fftwf_plan_many_dft_c2r(1, decomp_ph%zsz(3), &
-         decomp_ph%zsz(1)*decomp_ph%zsz(2), a1, decomp_sp%zsz(3), &
-         decomp_sp%zsz(1)*decomp_sp%zsz(2), 1, a2, decomp_ph%zsz(3), &
-         decomp_ph%zsz(1)*decomp_ph%zsz(2), 1, plan_type)
+      plan1 = fftwf_plan_many_dft_c2r(1, decomp_ph%zsz(3), &
+                                      decomp_ph%zsz(1) * decomp_ph%zsz(2), a1, decomp_sp%zsz(3), &
+                                      decomp_sp%zsz(1) * decomp_sp%zsz(2), 1, a2, decomp_ph%zsz(3), &
+                                      decomp_ph%zsz(1) * decomp_ph%zsz(2), 1, plan_type)
 #endif
 
-    call fftw_free(a1_p)
-    call fftw_free(a2_p)
-
-    return
-  end subroutine c2r_1m_z_plan
+      call fftw_free(a1_p)
+      call fftw_free(a2_p)
 
+      return
+   end subroutine c2r_1m_z_plan
 
   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  !  This routine performs one-time initialisations for the FFT engine
+   !  This routine performs one-time initialisations for the FFT engine
   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine init_fft_engine
-
-    implicit none
-
-    if (nrank==0) then
-       write(*,*) ' '
-       write(*,*) '***** Using the FFTW (F2003 interface) engine *****'
-       write(*,*) ' '
-    end if
-
-    if (format == PHYSICAL_IN_X) then
-
-       ! For C2C transforms
-       call c2c_1m_x_plan(plan(-1,1), ph, FFTW_FORWARD )
-       call c2c_1m_y_plan(plan(-1,2), ph, FFTW_FORWARD )
-       call c2c_1m_z_plan(plan(-1,3), ph, FFTW_FORWARD )
-       call c2c_1m_z_plan(plan( 1,3), ph, FFTW_BACKWARD)
-       call c2c_1m_y_plan(plan( 1,2), ph, FFTW_BACKWARD)
-       call c2c_1m_x_plan(plan( 1,1), ph, FFTW_BACKWARD)
-       
-       ! For R2C/C2R tranforms
-       call r2c_1m_x_plan(plan(0,1), ph, sp)
-       call c2c_1m_y_plan(plan(0,2), sp, FFTW_FORWARD )
-       call c2c_1m_z_plan(plan(0,3), sp, FFTW_FORWARD )
-       call c2c_1m_z_plan(plan(2,3), sp, FFTW_BACKWARD)
-       call c2c_1m_y_plan(plan(2,2), sp, FFTW_BACKWARD)
-       call c2r_1m_x_plan(plan(2,1), sp, ph)
-
-    else if (format == PHYSICAL_IN_Z) then
-
-       ! For C2C transforms
-       call c2c_1m_z_plan(plan(-1,3), ph, FFTW_FORWARD )
-       call c2c_1m_y_plan(plan(-1,2), ph, FFTW_FORWARD ) 
-       call c2c_1m_x_plan(plan(-1,1), ph, FFTW_FORWARD )
-       call c2c_1m_x_plan(plan( 1,1), ph, FFTW_BACKWARD)
-       call c2c_1m_y_plan(plan( 1,2), ph, FFTW_BACKWARD)
-       call c2c_1m_z_plan(plan( 1,3), ph, FFTW_BACKWARD)
-       
-       ! For R2C/C2R tranforms
-       call r2c_1m_z_plan(plan(0,3), ph, sp)
-       call c2c_1m_y_plan(plan(0,2), sp, FFTW_FORWARD )
-       call c2c_1m_x_plan(plan(0,1), sp, FFTW_FORWARD )
-       call c2c_1m_x_plan(plan(2,1), sp, FFTW_BACKWARD)
-       call c2c_1m_y_plan(plan(2,2), sp, FFTW_BACKWARD)
-       call c2r_1m_z_plan(plan(2,3), sp, ph)
-       
-    end if
-
-    return
-  end subroutine init_fft_engine
+   subroutine init_fft_engine
+
+      implicit none
+
+      call decomp_2d_fft_log("FFTW (F2003 interface)")
+
+      if (format == PHYSICAL_IN_X) then
+
+         ! For C2C transforms
+         call c2c_1m_x_plan(plan(-1, 1), ph, FFTW_FORWARD)
+         call c2c_1m_y_plan(plan(-1, 2), ph, FFTW_FORWARD)
+         call c2c_1m_z_plan(plan(-1, 3), ph, FFTW_FORWARD)
+         call c2c_1m_z_plan(plan(1, 3), ph, FFTW_BACKWARD)
+         call c2c_1m_y_plan(plan(1, 2), ph, FFTW_BACKWARD)
+         call c2c_1m_x_plan(plan(1, 1), ph, FFTW_BACKWARD)
+
+         ! For R2C/C2R tranforms
+         call r2c_1m_x_plan(plan(0, 1), ph, sp)
+         call c2c_1m_y_plan(plan(0, 2), sp, FFTW_FORWARD)
+         call c2c_1m_z_plan(plan(0, 3), sp, FFTW_FORWARD)
+         call c2c_1m_z_plan(plan(2, 3), sp, FFTW_BACKWARD)
+         call c2c_1m_y_plan(plan(2, 2), sp, FFTW_BACKWARD)
+         call c2r_1m_x_plan(plan(2, 1), sp, ph)
+
+      else if (format == PHYSICAL_IN_Z) then
+
+         ! For C2C transforms
+         call c2c_1m_z_plan(plan(-1, 3), ph, FFTW_FORWARD)
+         call c2c_1m_y_plan(plan(-1, 2), ph, FFTW_FORWARD)
+         call c2c_1m_x_plan(plan(-1, 1), ph, FFTW_FORWARD)
+         call c2c_1m_x_plan(plan(1, 1), ph, FFTW_BACKWARD)
+         call c2c_1m_y_plan(plan(1, 2), ph, FFTW_BACKWARD)
+         call c2c_1m_z_plan(plan(1, 3), ph, FFTW_BACKWARD)
+
+         ! For R2C/C2R tranforms
+         call r2c_1m_z_plan(plan(0, 3), ph, sp)
+         call c2c_1m_y_plan(plan(0, 2), sp, FFTW_FORWARD)
+         call c2c_1m_x_plan(plan(0, 1), sp, FFTW_FORWARD)
+         call c2c_1m_x_plan(plan(2, 1), sp, FFTW_BACKWARD)
+         call c2c_1m_y_plan(plan(2, 2), sp, FFTW_BACKWARD)
+         call c2r_1m_z_plan(plan(2, 3), sp, ph)
+
+      end if
+
+      return
+   end subroutine init_fft_engine
 
   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  !  This routine performs one-time finalisations for the FFT engine
+   !  This routine performs one-time finalisations for the FFT engine
   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine finalize_fft_engine
+   subroutine finalize_fft_engine
+
+      implicit none
 
-    implicit none
+      integer :: i, j
 
-    integer :: i,j
-    
-    do j=1,3
-       do i=-1,2
+      do j = 1, 3
+         do i = -1, 2
 #ifdef DOUBLE_PREC
-          call fftw_destroy_plan(plan(i,j))
+            call fftw_destroy_plan(plan(i, j))
 #else
-          call fftwf_destroy_plan(plan(i,j))
+            call fftwf_destroy_plan(plan(i, j))
 #endif
-       end do
-    end do
+         end do
+      end do
 
-    call fftw_cleanup()
+      call fftw_cleanup()
 
-    return
-  end subroutine finalize_fft_engine
+      return
+   end subroutine finalize_fft_engine
 
+   ! Following routines calculate multiple one-dimensional FFTs to form
+   ! the basis of three-dimensional FFTs.
 
-  ! Following routines calculate multiple one-dimensional FFTs to form 
-  ! the basis of three-dimensional FFTs.
+   ! c2c transform, multiple 1D FFTs in x direction
+   subroutine c2c_1m_x(inout, plan1)
 
-  ! c2c transform, multiple 1D FFTs in x direction
-  subroutine c2c_1m_x(inout, isign, plan1)
+      implicit none
 
-    implicit none
+      complex(mytype), dimension(:, :, :), intent(INOUT) :: inout
+      type(C_PTR) :: plan1
 
-    complex(mytype), dimension(:,:,:), intent(INOUT) :: inout
-    integer, intent(IN) :: isign
-    type(C_PTR) :: plan1
-
-    integer :: foo
-
-    foo = isign ! Silence unused dummy argument
-    
 #ifdef DOUBLE_PREC
-    call fftw_execute_dft(plan1, inout, inout)
+      call fftw_execute_dft(plan1, inout, inout)
 #else
-    call fftwf_execute_dft(plan1, inout, inout)
+      call fftwf_execute_dft(plan1, inout, inout)
 #endif
 
-    return
-  end subroutine c2c_1m_x
-
+      return
+   end subroutine c2c_1m_x
 
-  ! c2c transform, multiple 1D FFTs in y direction
-  subroutine c2c_1m_y(inout, isign, plan1)
+   ! c2c transform, multiple 1D FFTs in y direction
+   subroutine c2c_1m_y(inout, plan1)
 
-    implicit none
+      implicit none
 
-    complex(mytype), dimension(:,:,:), intent(INOUT) :: inout
-    integer, intent(IN) :: isign
-    type(C_PTR) :: plan1
+      complex(mytype), dimension(:, :, :), intent(INOUT) :: inout
+      type(C_PTR) :: plan1
 
-    integer :: k, s3
+      integer :: k, s3
 
-    integer :: foo
+      s3 = size(inout, 3)
 
-    foo = isign ! Silence unused dummy argument
-    
-    s3 = size(inout,3)
-
-    do k=1,s3  ! transform on one Z-plane at a time
+      do k = 1, s3  ! transform on one Z-plane at a time
 #ifdef DOUBLE_PREC
-       call fftw_execute_dft(plan1, inout(:,:,k), inout(:,:,k))
+         call fftw_execute_dft(plan1, inout(:, :, k), inout(:, :, k))
 #else
-       call fftwf_execute_dft(plan1, inout(:,:,k), inout(:,:,k))
+         call fftwf_execute_dft(plan1, inout(:, :, k), inout(:, :, k))
 #endif
-    end do
-
-    return
-  end subroutine c2c_1m_y
+      end do
 
-  ! c2c transform, multiple 1D FFTs in z direction
-  subroutine c2c_1m_z(inout, isign, plan1)
+      return
+   end subroutine c2c_1m_y
 
-    implicit none
+   ! c2c transform, multiple 1D FFTs in z direction
+   subroutine c2c_1m_z(inout, plan1)
 
-    complex(mytype), dimension(:,:,:), intent(INOUT) :: inout
-    integer, intent(IN) :: isign
-    type(C_PTR) :: plan1
+      implicit none
 
-    integer :: foo
+      complex(mytype), dimension(:, :, :), intent(INOUT) :: inout
+      type(C_PTR) :: plan1
 
-    foo = isign ! Silence unused dummy argument
-    
 #ifdef DOUBLE_PREC
-       call fftw_execute_dft(plan1, inout, inout)
+      call fftw_execute_dft(plan1, inout, inout)
 #else
-       call fftwf_execute_dft(plan1, inout, inout)
+      call fftwf_execute_dft(plan1, inout, inout)
 #endif
 
-    return
-  end subroutine c2c_1m_z
+      return
+   end subroutine c2c_1m_z
 
-  ! r2c transform, multiple 1D FFTs in x direction
-  subroutine r2c_1m_x(input, output)
+   ! r2c transform, multiple 1D FFTs in x direction
+   subroutine r2c_1m_x(input, output)
 
-    implicit none
+      implicit none
 
-    real(mytype), dimension(:,:,:), intent(INOUT)  ::  input
-    complex(mytype), dimension(:,:,:), intent(OUT) :: output
+      real(mytype), dimension(:, :, :), intent(INOUT)  ::  input
+      complex(mytype), dimension(:, :, :), intent(OUT) :: output
 
 #ifdef DOUBLE_PREC
-    call fftw_execute_dft_r2c(plan(0,1), input, output)
+      call fftw_execute_dft_r2c(plan(0, 1), input, output)
 #else
-    call fftwf_execute_dft_r2c(plan(0,1), input, output)
-#endif    
+      call fftwf_execute_dft_r2c(plan(0, 1), input, output)
+#endif
 
-    return
+      return
 
-  end subroutine r2c_1m_x
+   end subroutine r2c_1m_x
 
-  ! r2c transform, multiple 1D FFTs in z direction
-  subroutine r2c_1m_z(input, output)
+   ! r2c transform, multiple 1D FFTs in z direction
+   subroutine r2c_1m_z(input, output)
 
-    implicit none
+      implicit none
 
-    real(mytype), dimension(:,:,:), intent(INOUT)  ::  input
-    complex(mytype), dimension(:,:,:), intent(OUT) :: output
+      real(mytype), dimension(:, :, :), intent(INOUT)  ::  input
+      complex(mytype), dimension(:, :, :), intent(OUT) :: output
 
 #ifdef DOUBLE_PREC
-    call fftw_execute_dft_r2c(plan(0,3), input, output)
+      call fftw_execute_dft_r2c(plan(0, 3), input, output)
 #else
-    call fftwf_execute_dft_r2c(plan(0,3), input, output)
+      call fftwf_execute_dft_r2c(plan(0, 3), input, output)
 #endif
 
-    return
+      return
 
-  end subroutine r2c_1m_z
+   end subroutine r2c_1m_z
 
-  ! c2r transform, multiple 1D FFTs in x direction
-  subroutine c2r_1m_x(input, output)
+   ! c2r transform, multiple 1D FFTs in x direction
+   subroutine c2r_1m_x(input, output)
 
-    implicit none
+      implicit none
 
-    complex(mytype), dimension(:,:,:), intent(INOUT)  ::  input
-    real(mytype), dimension(:,:,:), intent(OUT) :: output
+      complex(mytype), dimension(:, :, :), intent(INOUT)  ::  input
+      real(mytype), dimension(:, :, :), intent(OUT) :: output
 
 #ifdef DOUBLE_PREC
-    call fftw_execute_dft_c2r(plan(2,1), input, output)
+      call fftw_execute_dft_c2r(plan(2, 1), input, output)
 #else
-    call fftwf_execute_dft_c2r(plan(2,1), input, output)
+      call fftwf_execute_dft_c2r(plan(2, 1), input, output)
 #endif
 
-    return
+      return
 
-  end subroutine c2r_1m_x
+   end subroutine c2r_1m_x
 
-  ! c2r transform, multiple 1D FFTs in z direction
-  subroutine c2r_1m_z(input, output)
+   ! c2r transform, multiple 1D FFTs in z direction
+   subroutine c2r_1m_z(input, output)
 
-    implicit none
+      implicit none
 
-    complex(mytype), dimension(:,:,:), intent(INOUT) :: input
-    real(mytype), dimension(:,:,:), intent(OUT) :: output
+      complex(mytype), dimension(:, :, :), intent(INOUT) :: input
+      real(mytype), dimension(:, :, :), intent(OUT) :: output
 
 #ifdef DOUBLE_PREC
-    call fftw_execute_dft_c2r(plan(2,3), input, output)
+      call fftw_execute_dft_c2r(plan(2, 3), input, output)
 #else
-    call fftwf_execute_dft_c2r(plan(2,3), input, output)
-#endif    
-
-    return
-
-  end subroutine c2r_1m_z
+      call fftwf_execute_dft_c2r(plan(2, 3), input, output)
+#endif
 
+      return
 
+   end subroutine c2r_1m_z
 
   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! 3D FFT - complex to complex
+   ! 3D FFT - complex to complex
   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine fft_3d_c2c(in, out, isign)
-    
-    implicit none
-    
-    complex(mytype), dimension(:,:,:), intent(INOUT) :: in
-    complex(mytype), dimension(:,:,:), intent(OUT) :: out
-    integer, intent(IN) :: isign
+   subroutine fft_3d_c2c(in, out, isign)
+
+      implicit none
+
+      complex(mytype), dimension(:, :, :), intent(INOUT) :: in
+      complex(mytype), dimension(:, :, :), intent(OUT) :: out
+      integer, intent(IN) :: isign
 
 #ifndef OVERWRITE
-    complex(mytype), pointer :: wk1(:,:,:)
-    integer(C_SIZE_T) :: sz
-    type(C_PTR) :: wk1_p
+      complex(mytype), pointer :: wk1(:, :, :)
+      integer(C_SIZE_T) :: sz
+      type(C_PTR) :: wk1_p
 
-    wk1_p = c_null_ptr ! Initialise to NULL pointer
+      wk1_p = c_null_ptr ! Initialise to NULL pointer
 #endif
 
 #ifdef PROFILER
-    if (decomp_profiler_fft) call decomp_profiler_start("fft_c2c")
+      if (decomp_profiler_fft) call decomp_profiler_start("fft_c2c")
 #endif
 
-    if (format==PHYSICAL_IN_X .AND. isign==DECOMP_2D_FFT_FORWARD .OR.  &
-         format==PHYSICAL_IN_Z .AND. isign==DECOMP_2D_FFT_BACKWARD) then
-       
-       ! ===== 1D FFTs in X =====
+      if (format == PHYSICAL_IN_X .AND. isign == DECOMP_2D_FFT_FORWARD .OR. &
+          format == PHYSICAL_IN_Z .AND. isign == DECOMP_2D_FFT_BACKWARD) then
+
+         ! ===== 1D FFTs in X =====
 #ifdef OVERWRITE
-       call c2c_1m_x(in,isign,plan(isign,1))
+         call c2c_1m_x(in, plan(isign, 1))
 #else
-       sz = ph%xsz(1)*ph%xsz(2)*ph%xsz(3)
-       wk1_p = fftw_alloc_complex(sz)
-       call c_f_pointer(wk1_p, wk1, [ph%xsz(1),ph%xsz(2),ph%xsz(3)])
-       wk1 = in
-       call c2c_1m_x(wk1,isign,plan(isign,1))
+         sz = ph%xsz(1) * ph%xsz(2) * ph%xsz(3)
+         wk1_p = fftw_alloc_complex(sz)
+         call c_f_pointer(wk1_p, wk1, [ph%xsz(1), ph%xsz(2), ph%xsz(3)])
+         wk1 = in
+         call c2c_1m_x(wk1, plan(isign, 1))
 #endif
 
-       ! ===== Swap X --> Y; 1D FFTs in Y =====
+         ! ===== Swap X --> Y; 1D FFTs in Y =====
 
-       if (dims(1)>1) then
+         if (dims(1) > 1) then
 #ifdef OVERWRITE
-          call transpose_x_to_y(in,wk2_c2c,ph)
+            call transpose_x_to_y(in, wk2_c2c, ph)
 #else
-          call transpose_x_to_y(wk1,wk2_c2c,ph)
+            call transpose_x_to_y(wk1, wk2_c2c, ph)
 #endif
-          call c2c_1m_y(wk2_c2c,isign,plan(isign,2))
-       else
+            call c2c_1m_y(wk2_c2c, plan(isign, 2))
+         else
 #ifdef OVERWRITE
-          call c2c_1m_y(in,isign,plan(isign,2))
+            call c2c_1m_y(in, plan(isign, 2))
 #else
-          call c2c_1m_y(wk1,isign,plan(isign,2))
+            call c2c_1m_y(wk1, plan(isign, 2))
 #endif
-       end if
+         end if
 
-       ! ===== Swap Y --> Z; 1D FFTs in Z =====
-       if (dims(1)>1) then
-          call transpose_y_to_z(wk2_c2c,out,ph)
-       else
+         ! ===== Swap Y --> Z; 1D FFTs in Z =====
+         if (dims(1) > 1) then
+            call transpose_y_to_z(wk2_c2c, out, ph)
+         else
 #ifdef OVERWRITE
-          call transpose_y_to_z(in,out,ph)
+            call transpose_y_to_z(in, out, ph)
 #else
-          call transpose_y_to_z(wk1,out,ph)
+            call transpose_y_to_z(wk1, out, ph)
 #endif
-       end if
-       call c2c_1m_z(out,isign,plan(isign,3))
+         end if
+         call c2c_1m_z(out, plan(isign, 3))
 
-    else if (format==PHYSICAL_IN_X .AND. isign==DECOMP_2D_FFT_BACKWARD &
-         .OR. & 
-         format==PHYSICAL_IN_Z .AND. isign==DECOMP_2D_FFT_FORWARD) then
+      else if (format == PHYSICAL_IN_X .AND. isign == DECOMP_2D_FFT_BACKWARD &
+               .OR. &
+               format == PHYSICAL_IN_Z .AND. isign == DECOMP_2D_FFT_FORWARD) then
 
-       ! ===== 1D FFTs in Z =====
+         ! ===== 1D FFTs in Z =====
 #ifdef OVERWRITE
-       call c2c_1m_z(in,isign,plan(isign,3))
+         call c2c_1m_z(in, plan(isign, 3))
 #else
-       sz = ph%zsz(1)*ph%zsz(2)*ph%zsz(3)
-       wk1_p = fftw_alloc_complex(sz)
-       call c_f_pointer(wk1_p, wk1, [ph%zsz(1),ph%zsz(2),ph%zsz(3)])
-       wk1 = in
-       call c2c_1m_z(wk1,isign,plan(isign,3))
+         sz = ph%zsz(1) * ph%zsz(2) * ph%zsz(3)
+         wk1_p = fftw_alloc_complex(sz)
+         call c_f_pointer(wk1_p, wk1, [ph%zsz(1), ph%zsz(2), ph%zsz(3)])
+         wk1 = in
+         call c2c_1m_z(wk1, plan(isign, 3))
 #endif
 
-       ! ===== Swap Z --> Y; 1D FFTs in Y =====
-       if (dims(1)>1) then
+         ! ===== Swap Z --> Y; 1D FFTs in Y =====
+         if (dims(1) > 1) then
 #ifdef OVERWRITE
-          call transpose_z_to_y(in,wk2_c2c,ph)
+            call transpose_z_to_y(in, wk2_c2c, ph)
 #else
-          call transpose_z_to_y(wk1,wk2_c2c,ph)
+            call transpose_z_to_y(wk1, wk2_c2c, ph)
 #endif
-          call c2c_1m_y(wk2_c2c,isign,plan(isign,2))
-       else  ! out==wk2_c2c if 1D decomposition
+            call c2c_1m_y(wk2_c2c, plan(isign, 2))
+         else  ! out==wk2_c2c if 1D decomposition
 #ifdef OVERWRITE
-          call transpose_z_to_y(in,out,ph)
+            call transpose_z_to_y(in, out, ph)
 #else
-          call transpose_z_to_y(wk1,out,ph)
+            call transpose_z_to_y(wk1, out, ph)
 #endif
-          call c2c_1m_y(out,isign,plan(isign,2))
-       end if
+            call c2c_1m_y(out, plan(isign, 2))
+         end if
 
-       ! ===== Swap Y --> X; 1D FFTs in X =====
-       if (dims(1)>1) then
-          call transpose_y_to_x(wk2_c2c,out,ph)
-       end if
-       call c2c_1m_x(out,isign,plan(isign,1))
-       
-    end if
+         ! ===== Swap Y --> X; 1D FFTs in X =====
+         if (dims(1) > 1) then
+            call transpose_y_to_x(wk2_c2c, out, ph)
+         end if
+         call c2c_1m_x(out, plan(isign, 1))
+
+      end if
 
 #ifndef OVERWRITE
-    call fftw_free(wk1_p)
+      call fftw_free(wk1_p)
+      nullify (wk1)
 #endif
 
 #ifdef PROFILER
-    if (decomp_profiler_fft) call decomp_profiler_end("fft_c2c")
+      if (decomp_profiler_fft) call decomp_profiler_end("fft_c2c")
 #endif
 
-    return
-  end subroutine fft_3d_c2c
+      return
+   end subroutine fft_3d_c2c
 
-  
   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! 3D forward FFT - real to complex
+   ! 3D forward FFT - real to complex
   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine fft_3d_r2c(in_r, out_c)
-    
-    implicit none
-    
-    real(mytype), dimension(:,:,:), intent(INOUT) :: in_r
-    complex(mytype), dimension(:,:,:), intent(OUT) :: out_c
+   subroutine fft_3d_r2c(in_r, out_c)
+
+      implicit none
+
+      real(mytype), dimension(:, :, :), intent(INOUT) :: in_r
+      complex(mytype), dimension(:, :, :), intent(OUT) :: out_c
 
 #ifdef PROFILER
-    if (decomp_profiler_fft) call decomp_profiler_start("fft_r2c")
+      if (decomp_profiler_fft) call decomp_profiler_start("fft_r2c")
 #endif
 
-    if (format==PHYSICAL_IN_X) then
-
-       ! ===== 1D FFTs in X =====
-       call r2c_1m_x(in_r,wk13)
-
-       ! ===== Swap X --> Y; 1D FFTs in Y =====
-       if (dims(1)>1) then
-          call transpose_x_to_y(wk13,wk2_r2c,sp)
-          call c2c_1m_y(wk2_r2c,-1,plan(0,2))
-       else
-          call c2c_1m_y(wk13,-1,plan(0,2))
-       end if
-
-       ! ===== Swap Y --> Z; 1D FFTs in Z =====
-       if (dims(1)>1) then
-          call transpose_y_to_z(wk2_r2c,out_c,sp)
-       else
-          call transpose_y_to_z(wk13,out_c,sp)
-       end if
-       call c2c_1m_z(out_c,-1,plan(0,3))
-                
-    else if (format==PHYSICAL_IN_Z) then
-
-       ! ===== 1D FFTs in Z =====
-       call r2c_1m_z(in_r,wk13)
-
-       ! ===== Swap Z --> Y; 1D FFTs in Y =====
-       if (dims(1)>1) then
-          call transpose_z_to_y(wk13,wk2_r2c,sp)
-          call c2c_1m_y(wk2_r2c,-1,plan(0,2))
-       else  ! out_c==wk2_r2c if 1D decomposition
-          call transpose_z_to_y(wk13,out_c,sp)
-          call c2c_1m_y(out_c,-1,plan(0,2))
-       end if
-
-       ! ===== Swap Y --> X; 1D FFTs in X =====
-       if (dims(1)>1) then
-          call transpose_y_to_x(wk2_r2c,out_c,sp)
-       end if
-       call c2c_1m_x(out_c,-1,plan(0,1))
-
-    end if
+      if (format == PHYSICAL_IN_X) then
+
+         ! ===== 1D FFTs in X =====
+         call r2c_1m_x(in_r, wk13)
+
+         ! ===== Swap X --> Y; 1D FFTs in Y =====
+         if (dims(1) > 1) then
+            call transpose_x_to_y(wk13, wk2_r2c, sp)
+            call c2c_1m_y(wk2_r2c, plan(0, 2))
+         else
+            call c2c_1m_y(wk13, plan(0, 2))
+         end if
+
+         ! ===== Swap Y --> Z; 1D FFTs in Z =====
+         if (dims(1) > 1) then
+            call transpose_y_to_z(wk2_r2c, out_c, sp)
+         else
+            call transpose_y_to_z(wk13, out_c, sp)
+         end if
+         call c2c_1m_z(out_c, plan(0, 3))
+
+      else if (format == PHYSICAL_IN_Z) then
+
+         ! ===== 1D FFTs in Z =====
+         call r2c_1m_z(in_r, wk13)
+
+         ! ===== Swap Z --> Y; 1D FFTs in Y =====
+         if (dims(1) > 1) then
+            call transpose_z_to_y(wk13, wk2_r2c, sp)
+            call c2c_1m_y(wk2_r2c, plan(0, 2))
+         else  ! out_c==wk2_r2c if 1D decomposition
+            call transpose_z_to_y(wk13, out_c, sp)
+            call c2c_1m_y(out_c, plan(0, 2))
+         end if
+
+         ! ===== Swap Y --> X; 1D FFTs in X =====
+         if (dims(1) > 1) then
+            call transpose_y_to_x(wk2_r2c, out_c, sp)
+         end if
+         call c2c_1m_x(out_c, plan(0, 1))
+
+      end if
 
 #ifdef PROFILER
-    if (decomp_profiler_fft) call decomp_profiler_end("fft_r2c")
+      if (decomp_profiler_fft) call decomp_profiler_end("fft_r2c")
 #endif
 
-    return
-  end subroutine fft_3d_r2c
-  
-  
+      return
+   end subroutine fft_3d_r2c
+
   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! 3D inverse FFT - complex to real
+   ! 3D inverse FFT - complex to real
   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine fft_3d_c2r(in_c, out_r)
-    
-    implicit none
-    
-    complex(mytype), dimension(:,:,:), intent(INOUT) :: in_c
-    real(mytype), dimension(:,:,:), intent(OUT) :: out_r
+   subroutine fft_3d_c2r(in_c, out_r)
+
+      implicit none
+
+      complex(mytype), dimension(:, :, :), intent(INOUT) :: in_c
+      real(mytype), dimension(:, :, :), intent(OUT) :: out_r
 
 #ifndef OVERWRITE
-    complex(mytype), pointer :: wk1(:,:,:)
-    integer(C_SIZE_T) :: sz
-    type(C_PTR) :: wk1_p
+      complex(mytype), pointer :: wk1(:, :, :)
+      integer(C_SIZE_T) :: sz
+      type(C_PTR) :: wk1_p
 
-    wk1_p = c_null_ptr ! Initialise to NULL pointer
+      wk1_p = c_null_ptr ! Initialise to NULL pointer
 #endif
 
 #ifdef PROFILER
-    if (decomp_profiler_fft) call decomp_profiler_start("fft_c2r")
+      if (decomp_profiler_fft) call decomp_profiler_start("fft_c2r")
 #endif
 
-    if (format==PHYSICAL_IN_X) then
+      if (format == PHYSICAL_IN_X) then
 
-       ! ===== 1D FFTs in Z =====
+         ! ===== 1D FFTs in Z =====
 #ifdef OVERWRITE
-       call c2c_1m_z(in_c,1,plan(2,3))       
+         call c2c_1m_z(in_c, plan(2, 3))
 #else
-       sz = sp%zsz(1)*sp%zsz(2)*sp%zsz(3)
-       wk1_p = fftw_alloc_complex(sz)
-       call c_f_pointer(wk1_p, wk1, [sp%zsz(1),sp%zsz(2),sp%zsz(3)])
-       wk1 = in_c
-       call c2c_1m_z(wk1,1,plan(2,3))
+         sz = sp%zsz(1) * sp%zsz(2) * sp%zsz(3)
+         wk1_p = fftw_alloc_complex(sz)
+         call c_f_pointer(wk1_p, wk1, [sp%zsz(1), sp%zsz(2), sp%zsz(3)])
+         wk1 = in_c
+         call c2c_1m_z(wk1, plan(2, 3))
 #endif
 
-       ! ===== Swap Z --> Y; 1D FFTs in Y =====
+         ! ===== Swap Z --> Y; 1D FFTs in Y =====
 #ifdef OVERWRITE
-       call transpose_z_to_y(in_c,wk2_r2c,sp)
+         call transpose_z_to_y(in_c, wk2_r2c, sp)
 #else
-       call transpose_z_to_y(wk1,wk2_r2c,sp)
+         call transpose_z_to_y(wk1, wk2_r2c, sp)
 #endif
-       call c2c_1m_y(wk2_r2c,1,plan(2,2))
+         call c2c_1m_y(wk2_r2c, plan(2, 2))
 
-       ! ===== Swap Y --> X; 1D FFTs in X =====
-       if (dims(1)>1) then
-          call transpose_y_to_x(wk2_r2c,wk13,sp)
-          call c2r_1m_x(wk13,out_r)
-       else
-          call c2r_1m_x(wk2_r2c,out_r)
-       end if
+         ! ===== Swap Y --> X; 1D FFTs in X =====
+         if (dims(1) > 1) then
+            call transpose_y_to_x(wk2_r2c, wk13, sp)
+            call c2r_1m_x(wk13, out_r)
+         else
+            call c2r_1m_x(wk2_r2c, out_r)
+         end if
 
-    else if (format==PHYSICAL_IN_Z) then
+      else if (format == PHYSICAL_IN_Z) then
 
-       ! ===== 1D FFTs in X =====
+         ! ===== 1D FFTs in X =====
 #ifdef OVERWRITE
-       call c2c_1m_x(in_c,1,plan(2,1))
+         call c2c_1m_x(in_c, plan(2, 1))
 #else
-       sz = sp%xsz(1)*sp%xsz(2)*sp%xsz(3)
-       wk1_p = fftw_alloc_complex(sz)
-       call c_f_pointer(wk1_p, wk1, [sp%xsz(1),sp%xsz(2),sp%xsz(3)])
-       wk1 = in_c
-       call c2c_1m_x(wk1,1,plan(2,1))
+         sz = sp%xsz(1) * sp%xsz(2) * sp%xsz(3)
+         wk1_p = fftw_alloc_complex(sz)
+         call c_f_pointer(wk1_p, wk1, [sp%xsz(1), sp%xsz(2), sp%xsz(3)])
+         wk1 = in_c
+         call c2c_1m_x(wk1, plan(2, 1))
 #endif
 
-       ! ===== Swap X --> Y; 1D FFTs in Y =====
-       if (dims(1)>1) then
+         ! ===== Swap X --> Y; 1D FFTs in Y =====
+         if (dims(1) > 1) then
 #ifdef OVERWRITE
-          call transpose_x_to_y(in_c,wk2_r2c,sp)
+            call transpose_x_to_y(in_c, wk2_r2c, sp)
 #else
-          call transpose_x_to_y(wk1,wk2_r2c,sp)
+            call transpose_x_to_y(wk1, wk2_r2c, sp)
 #endif
-          call c2c_1m_y(wk2_r2c,1,plan(2,2))
-       else  ! in_c==wk2_r2c if 1D decomposition
+            call c2c_1m_y(wk2_r2c, plan(2, 2))
+         else  ! in_c==wk2_r2c if 1D decomposition
 #ifdef OVERWRITE
-          call c2c_1m_y(in_c,1,plan(2,2))
+            call c2c_1m_y(in_c, plan(2, 2))
 #else
-          call c2c_1m_y(wk1,1,plan(2,2))
+            call c2c_1m_y(wk1, plan(2, 2))
 #endif
-       end if
+         end if
 
-       ! ===== Swap Y --> Z; 1D FFTs in Z =====
-       if (dims(1)>1) then
-          call transpose_y_to_z(wk2_r2c,wk13,sp)
-       else
+         ! ===== Swap Y --> Z; 1D FFTs in Z =====
+         if (dims(1) > 1) then
+            call transpose_y_to_z(wk2_r2c, wk13, sp)
+         else
 #ifdef OVERWRITE
-          call transpose_y_to_z(in_c,wk13,sp)
+            call transpose_y_to_z(in_c, wk13, sp)
 #else
-          call transpose_y_to_z(wk1,wk13,sp)
+            call transpose_y_to_z(wk1, wk13, sp)
 #endif
-       end if
-       call c2r_1m_z(wk13,out_r)
+         end if
+         call c2r_1m_z(wk13, out_r)
 
-    end if
+      end if
 
 #ifndef OVERWRITE
-    call fftw_free(wk1_p)
+      call fftw_free(wk1_p)
+      nullify (wk1)
 #endif
 
 #ifdef PROFILER
-    if (decomp_profiler_fft) call decomp_profiler_end("fft_c2r")
+      if (decomp_profiler_fft) call decomp_profiler_end("fft_c2r")
 #endif
 
-    return
-  end subroutine fft_3d_c2r
+      return
+   end subroutine fft_3d_c2r
 
-  
 end module decomp_2d_fft
diff --git a/src/fft_generic.f90 b/src/fft_generic.f90
index d66425bd..6d9e18cb 100644
--- a/src/fft_generic.f90
+++ b/src/fft_generic.f90
@@ -1,332 +1,319 @@
-!=======================================================================
-! This is part of the 2DECOMP&FFT library
-! 
-! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
-! decomposition. It also implements a highly scalable distributed
-! three-dimensional Fast Fourier Transform (FFT).
-!
-! Copyright (C) 2009-2011 Ning Li, the Numerical Algorithms Group (NAG)
-!
-!=======================================================================
+!! SPDX-License-Identifier: BSD-3-Clause
 
 ! This is the 'generic' implementation of the FFT library
 
 module decomp_2d_fft
 
-  use decomp_2d  ! 2D decomposition module
-  use glassman
+   use iso_c_binding, only: c_f_pointer, c_loc
+   use decomp_2d_constants
+   use decomp_2d_mpi
+   use decomp_2d  ! 2D decomposition module
+   use glassman
 
-  implicit none
+   implicit none
 
-  private        ! Make everything private unless declared public
+   private        ! Make everything private unless declared public
 
-  ! engine-specific global variables
-  complex(mytype), allocatable, dimension(:) :: buf, scratch
+   ! engine-specific global variables
+   complex(mytype), allocatable, dimension(:) :: buf, scratch
 
-  ! common code used for all engines, including global variables, 
-  ! generic interface definitions and several subroutines
+   integer, parameter, public :: D2D_FFT_BACKEND = D2D_FFT_BACKEND_GENERIC
+
+   ! common code used for all engines, including global variables,
+   ! generic interface definitions and several subroutines
 #include "fft_common.f90"
 
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  !  This routine performs one-time initialisations for the FFT engine
+   !  This routine performs one-time initialisations for the FFT engine
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine init_fft_engine
-
-    implicit none
+   subroutine init_fft_engine
 
-    integer :: cbuf_size
+      implicit none
 
-    if (nrank==0) then
-       write(*,*) ' '
-       write(*,*) '***** Using the generic FFT engine *****'
-       write(*,*) ' '
-    end if
+      integer :: cbuf_size
 
-    cbuf_size = max(ph%xsz(1), ph%ysz(2))
-    cbuf_size = max(cbuf_size, ph%zsz(3))
-    allocate(buf(cbuf_size))
-    allocate(scratch(cbuf_size))
+      call decomp_2d_fft_log("generic")
 
-    return
-  end subroutine init_fft_engine
+      cbuf_size = max(ph%xsz(1), ph%ysz(2))
+      cbuf_size = max(cbuf_size, ph%zsz(3))
+      allocate (buf(cbuf_size))
+      allocate (scratch(cbuf_size))
 
+   end subroutine init_fft_engine
 
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  !  This routine performs one-time finalisations for the FFT engine
+   !  This routine performs one-time finalisations for the FFT engine
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine finalize_fft_engine
-
-    implicit none
+   subroutine finalize_fft_engine
 
-    if (allocated(buf)) deallocate(buf)
-    if (allocated(scratch)) deallocate(scratch)
+      implicit none
 
-    return
-  end subroutine finalize_fft_engine
+      if (allocated(buf)) deallocate (buf)
+      if (allocated(scratch)) deallocate (scratch)
 
+      return
+   end subroutine finalize_fft_engine
 
-  ! Following routines calculate multiple one-dimensional FFTs to form 
-  ! the basis of three-dimensional FFTs.
+   ! Following routines calculate multiple one-dimensional FFTs to form
+   ! the basis of three-dimensional FFTs.
 
-  ! c2c transform, multiple 1D FFTs in x direction
-  subroutine c2c_1m_x(inout, isign, decomp)
+   ! c2c transform, multiple 1D FFTs in x direction
+   subroutine c2c_1m_x(inout, isign, decomp)
 
-    !$acc routine(spcfft) seq
+      !$acc routine(spcfft) seq
 
-    implicit none
+      implicit none
 
-    complex(mytype), dimension(:,:,:), intent(INOUT) :: inout
-    integer, intent(IN) :: isign
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
+      complex(mytype), dimension(:, :, :), intent(INOUT) :: inout
+      integer, intent(IN) :: isign
+      TYPE(DECOMP_INFO), intent(IN) :: decomp
 
-    integer :: i,j,k
+      integer :: i, j, k
 
-    !$acc parallel loop gang vector collapse(2) private(buf, scratch)
-    do k=1,decomp%xsz(3)
-       do j=1,decomp%xsz(2)
-          do i=1,decomp%xsz(1)
-             buf(i) = inout(i,j,k)
-          end do
-          call spcfft(buf,decomp%xsz(1),isign,scratch)
-          do i=1,decomp%xsz(1)
-             inout(i,j,k) = buf(i)
-          end do
-       end do
-    end do
-    !$acc end parallel loop
+      !$acc parallel loop gang vector collapse(2) private(buf, scratch)
+      do k = 1, decomp%xsz(3)
+         do j = 1, decomp%xsz(2)
+            do i = 1, decomp%xsz(1)
+               buf(i) = inout(i, j, k)
+            end do
+            call spcfft(buf, decomp%xsz(1), isign, scratch)
+            do i = 1, decomp%xsz(1)
+               inout(i, j, k) = buf(i)
+            end do
+         end do
+      end do
+      !$acc end parallel loop
 
-    return
+      return
 
-  end subroutine c2c_1m_x
+   end subroutine c2c_1m_x
 
-  ! c2c transform, multiple 1D FFTs in y direction
-  subroutine c2c_1m_y(inout, isign, decomp)
+   ! c2c transform, multiple 1D FFTs in y direction
+   subroutine c2c_1m_y(inout, isign, decomp)
 
-    !$acc routine(spcfft) seq
+      !$acc routine(spcfft) seq
 
-    implicit none
+      implicit none
 
-    complex(mytype), dimension(:,:,:), intent(INOUT) :: inout
-    integer, intent(IN) :: isign
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
+      complex(mytype), dimension(:, :, :), intent(INOUT) :: inout
+      integer, intent(IN) :: isign
+      TYPE(DECOMP_INFO), intent(IN) :: decomp
 
-    integer :: i,j,k
+      integer :: i, j, k
 
-    !$acc parallel loop gang vector collapse(2) private(buf, scratch)
-    do k=1,decomp%ysz(3)
-       do i=1,decomp%ysz(1)
-          do j=1,decomp%ysz(2)
-             buf(j) = inout(i,j,k)
-          end do
-          call spcfft(buf,decomp%ysz(2),isign,scratch)
-          do j=1,decomp%ysz(2)
-             inout(i,j,k) = buf(j)
-          end do
-       end do
-    end do
-    !$acc end parallel loop
+      !$acc parallel loop gang vector collapse(2) private(buf, scratch)
+      do k = 1, decomp%ysz(3)
+         do i = 1, decomp%ysz(1)
+            do j = 1, decomp%ysz(2)
+               buf(j) = inout(i, j, k)
+            end do
+            call spcfft(buf, decomp%ysz(2), isign, scratch)
+            do j = 1, decomp%ysz(2)
+               inout(i, j, k) = buf(j)
+            end do
+         end do
+      end do
+      !$acc end parallel loop
 
-    return
+      return
 
-  end subroutine c2c_1m_y
+   end subroutine c2c_1m_y
 
-  ! c2c transform, multiple 1D FFTs in z direction
-  subroutine c2c_1m_z(inout, isign, decomp)
+   ! c2c transform, multiple 1D FFTs in z direction
+   subroutine c2c_1m_z(inout, isign, decomp)
 
-    !$acc routine(spcfft) seq
+      !$acc routine(spcfft) seq
+
+      implicit none
 
-    implicit none
+      complex(mytype), dimension(:, :, :), intent(INOUT) :: inout
+      integer, intent(IN) :: isign
+      TYPE(DECOMP_INFO), intent(IN) :: decomp
 
-    complex(mytype), dimension(:,:,:), intent(INOUT) :: inout
-    integer, intent(IN) :: isign
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
+      integer :: i, j, k
 
-    integer :: i,j,k
+      !$acc parallel loop gang vector collapse(2) private(buf, scratch)
+      do j = 1, decomp%zsz(2)
+         do i = 1, decomp%zsz(1)
+            do k = 1, decomp%zsz(3)
+               buf(k) = inout(i, j, k)
+            end do
+            call spcfft(buf, decomp%zsz(3), isign, scratch)
+            do k = 1, decomp%zsz(3)
+               inout(i, j, k) = buf(k)
+            end do
+         end do
+      end do
+      !$acc end parallel loop
 
-    !$acc parallel loop gang vector collapse(2) private(buf, scratch)
-    do j=1,decomp%zsz(2)
-       do i=1,decomp%zsz(1)
-          do k=1,decomp%zsz(3)
-             buf(k) = inout(i,j,k)
-          end do
-          call spcfft(buf,decomp%zsz(3),isign,scratch)
-          do k=1,decomp%zsz(3)
-             inout(i,j,k) = buf(k)
-          end do
-       end do
-    end do
-    !$acc end parallel loop
+      return
 
-    return
+   end subroutine c2c_1m_z
 
-  end subroutine c2c_1m_z
-
-  ! r2c transform, multiple 1D FFTs in x direction
-  subroutine r2c_1m_x(input, output)
-
-    !$acc routine(spcfft) seq
-
-    implicit none
-
-    real(mytype), dimension(:,:,:), intent(IN)  ::  input
-    complex(mytype), dimension(:,:,:), intent(OUT) :: output
-
-    integer :: i,j,k, s1,s2,s3, d1
-
-    s1 = size(input,1)
-    s2 = size(input,2)
-    s3 = size(input,3)
-    d1 = size(output,1)
-
-    !$acc parallel loop gang vector collapse(2) private(buf, scratch)
-    do k=1,s3
-       do j=1,s2
-          ! Glassman's FFT is c2c only, 
-          ! needing some pre- and post-processing for r2c
-          ! pack real input in complex storage
-          do i=1,s1
-             buf(i) = cmplx(input(i,j,k),0._mytype, kind=mytype)
-          end do
-          call spcfft(buf,s1,-1,scratch)
-          ! note d1 ~ s1/2+1
-          ! simply drop the redundant part of the complex output
-          do i=1,d1
-             output(i,j,k) = buf(i)
-          end do
-       end do
-    end do
-    !$acc end parallel loop
-
-    return
-
-  end subroutine r2c_1m_x
-
-  ! r2c transform, multiple 1D FFTs in z direction
-  subroutine r2c_1m_z(input, output)
-
-    !$acc routine(spcfft) seq
-
-    implicit none
-
-    real(mytype), dimension(:,:,:), intent(IN)  ::  input
-    complex(mytype), dimension(:,:,:), intent(OUT) :: output
-
-    integer :: i,j,k, s1,s2,s3, d3
-
-    s1 = size(input,1)
-    s2 = size(input,2)
-    s3 = size(input,3)
-    d3 = size(output,3)
-
-    !$acc parallel loop gang vector collapse(2) private(buf, scratch)
-    do j=1,s2
-       do i=1,s1
-          ! Glassman's FFT is c2c only, 
-          ! needing some pre- and post-processing for r2c
-          ! pack real input in complex storage
-          do k=1,s3
-             buf(k) = cmplx(input(i,j,k),0._mytype, kind=mytype)
-          end do
-          call spcfft(buf,s3,-1,scratch)
-          ! note d3 ~ s3/2+1
-          ! simply drop the redundant part of the complex output
-          do k=1,d3
-             output(i,j,k) = buf(k)
-          end do
-       end do
-    end do
-    !$acc end parallel loop
-
-    return
-
-  end subroutine r2c_1m_z
-
-  ! c2r transform, multiple 1D FFTs in x direction
-  subroutine c2r_1m_x(input, output)
-
-    !$acc routine(spcfft) seq
-
-    implicit none
-
-    complex(mytype), dimension(:,:,:), intent(IN)  ::  input
-    real(mytype), dimension(:,:,:), intent(OUT) :: output
-
-    integer :: i,j,k, d1,d2,d3
-
-    d1 = size(output,1)
-    d2 = size(output,2)
-    d3 = size(output,3)
-
-    !$acc parallel loop gang vector collapse(2) private(buf, scratch)
-    do k=1,d3
-       do j=1,d2
-          ! Glassman's FFT is c2c only, 
-          ! needing some pre- and post-processing for c2r
-          do i=1,d1/2+1
-             buf(i) = input(i,j,k)
-          end do
-          ! expanding to a full-size complex array
-          ! For odd N, the storage is:
-          !  1, 2, ...... N/2+1   integer division rounded down
-          !     N, ...... N/2+2   => a(i) is conjugate of a(N+2-i)
-          ! For even N, the storage is:
-          !  1, 2, ...... N/2  , N/2+1
-          !     N, ...... N/2+2  again a(i) conjugate of a(N+2-i)
-          do i=d1/2+2,d1
-             buf(i) =  conjg(buf(d1+2-i))
-          end do
-          call spcfft(buf,d1,1,scratch)
-          do i=1,d1
-             ! simply drop imaginary part
-             output(i,j,k) = real(buf(i), kind=mytype)
-          end do
-       end do
-    end do
-    !$acc end parallel loop
-
-    return
-
-  end subroutine c2r_1m_x
-
-  ! c2r transform, multiple 1D FFTs in z direction
-  subroutine c2r_1m_z(input, output)
-
-    !$acc routine(spcfft) seq
-
-    implicit none
-
-    complex(mytype), dimension(:,:,:), intent(IN)  ::  input
-    real(mytype), dimension(:,:,:), intent(OUT) :: output
-
-    integer :: i,j,k, d1,d2,d3
-
-    d1 = size(output,1)
-    d2 = size(output,2)
-    d3 = size(output,3)
-
-    !$acc parallel loop gang vector collapse(2) private(buf, scratch)
-    do j=1,d2
-       do i=1,d1
-          do k=1,d3/2+1
-             buf(k) = input(i,j,k)
-          end do
-          do k=d3/2+2,d3
-             buf(k) =  conjg(buf(d3+2-k))
-          end do
-          call spcfft(buf,d3,1,scratch)
-          do k=1,d3
-             output(i,j,k) = real(buf(k), kind=mytype)
-          end do
-       end do
-    end do
-    !$acc end parallel loop
-
-    return
-
-  end subroutine c2r_1m_z
+   ! r2c transform, multiple 1D FFTs in x direction
+   subroutine r2c_1m_x(input, output)
 
+      !$acc routine(spcfft) seq
+
+      implicit none
+
+      real(mytype), dimension(:, :, :), intent(IN)  ::  input
+      complex(mytype), dimension(:, :, :), intent(OUT) :: output
+
+      integer :: i, j, k, s1, s2, s3, d1
+
+      s1 = size(input, 1)
+      s2 = size(input, 2)
+      s3 = size(input, 3)
+      d1 = size(output, 1)
+
+      !$acc parallel loop gang vector collapse(2) private(buf, scratch)
+      do k = 1, s3
+         do j = 1, s2
+            ! Glassman's FFT is c2c only,
+            ! needing some pre- and post-processing for r2c
+            ! pack real input in complex storage
+            do i = 1, s1
+               buf(i) = cmplx(input(i, j, k), 0._mytype, kind=mytype)
+            end do
+            call spcfft(buf, s1, -1, scratch)
+            ! note d1 ~ s1/2+1
+            ! simply drop the redundant part of the complex output
+            do i = 1, d1
+               output(i, j, k) = buf(i)
+            end do
+         end do
+      end do
+      !$acc end parallel loop
+
+      return
+
+   end subroutine r2c_1m_x
+
+   ! r2c transform, multiple 1D FFTs in z direction
+   subroutine r2c_1m_z(input, output)
+
+      !$acc routine(spcfft) seq
+
+      implicit none
+
+      real(mytype), dimension(:, :, :), intent(IN)  ::  input
+      complex(mytype), dimension(:, :, :), intent(OUT) :: output
+
+      integer :: i, j, k, s1, s2, s3, d3
+
+      s1 = size(input, 1)
+      s2 = size(input, 2)
+      s3 = size(input, 3)
+      d3 = size(output, 3)
+
+      !$acc parallel loop gang vector collapse(2) private(buf, scratch)
+      do j = 1, s2
+         do i = 1, s1
+            ! Glassman's FFT is c2c only,
+            ! needing some pre- and post-processing for r2c
+            ! pack real input in complex storage
+            do k = 1, s3
+               buf(k) = cmplx(input(i, j, k), 0._mytype, kind=mytype)
+            end do
+            call spcfft(buf, s3, -1, scratch)
+            ! note d3 ~ s3/2+1
+            ! simply drop the redundant part of the complex output
+            do k = 1, d3
+               output(i, j, k) = buf(k)
+            end do
+         end do
+      end do
+      !$acc end parallel loop
+
+      return
+
+   end subroutine r2c_1m_z
+
+   ! c2r transform, multiple 1D FFTs in x direction
+   subroutine c2r_1m_x(input, output)
+
+      !$acc routine(spcfft) seq
+
+      implicit none
+
+      complex(mytype), dimension(:, :, :), intent(IN)  ::  input
+      real(mytype), dimension(:, :, :), intent(OUT) :: output
+
+      integer :: i, j, k, d1, d2, d3
+
+      d1 = size(output, 1)
+      d2 = size(output, 2)
+      d3 = size(output, 3)
+
+      !$acc parallel loop gang vector collapse(2) private(buf, scratch)
+      do k = 1, d3
+         do j = 1, d2
+            ! Glassman's FFT is c2c only,
+            ! needing some pre- and post-processing for c2r
+            do i = 1, d1 / 2 + 1
+               buf(i) = input(i, j, k)
+            end do
+            ! expanding to a full-size complex array
+            ! For odd N, the storage is:
+            !  1, 2, ...... N/2+1   integer division rounded down
+            !     N, ...... N/2+2   => a(i) is conjugate of a(N+2-i)
+            ! For even N, the storage is:
+            !  1, 2, ...... N/2  , N/2+1
+            !     N, ...... N/2+2  again a(i) conjugate of a(N+2-i)
+            do i = d1 / 2 + 2, d1
+               buf(i) = conjg(buf(d1 + 2 - i))
+            end do
+            call spcfft(buf, d1, 1, scratch)
+            do i = 1, d1
+               ! simply drop imaginary part
+               output(i, j, k) = real(buf(i), kind=mytype)
+            end do
+         end do
+      end do
+      !$acc end parallel loop
+
+      return
+
+   end subroutine c2r_1m_x
+
+   ! c2r transform, multiple 1D FFTs in z direction
+   subroutine c2r_1m_z(input, output)
+
+      !$acc routine(spcfft) seq
+
+      implicit none
+
+      complex(mytype), dimension(:, :, :), intent(IN)  ::  input
+      real(mytype), dimension(:, :, :), intent(OUT) :: output
+
+      integer :: i, j, k, d1, d2, d3
+
+      d1 = size(output, 1)
+      d2 = size(output, 2)
+      d3 = size(output, 3)
+
+      !$acc parallel loop gang vector collapse(2) private(buf, scratch)
+      do j = 1, d2
+         do i = 1, d1
+            do k = 1, d3 / 2 + 1
+               buf(k) = input(i, j, k)
+            end do
+            do k = d3 / 2 + 2, d3
+               buf(k) = conjg(buf(d3 + 2 - k))
+            end do
+            call spcfft(buf, d3, 1, scratch)
+            do k = 1, d3
+               output(i, j, k) = real(buf(k), kind=mytype)
+            end do
+         end do
+      end do
+      !$acc end parallel loop
+
+      return
+
+   end subroutine c2r_1m_z
 
 #include "fft_common_3d.f90"
 
-
 end module decomp_2d_fft
diff --git a/src/fft_log.f90 b/src/fft_log.f90
new file mode 100644
index 00000000..059bb258
--- /dev/null
+++ b/src/fft_log.f90
@@ -0,0 +1,63 @@
+!! SPDX-License-Identifier: BSD-3-Clause
+
+submodule(decomp_2d_fft) d2d_fft_log
+
+   use decomp_2d_constants
+   use decomp_2d_mpi
+   use decomp_2d
+
+   implicit none
+
+contains
+
+   !
+   ! Log subroutine for the decomp_2d_fft module
+   !
+   module subroutine decomp_2d_fft_log(backend)
+
+      implicit none
+
+      ! Argument
+      character(len=*), intent(in) :: backend
+
+      ! Local variable
+      integer :: io_unit
+
+      if ((decomp_log == D2D_LOG_STDOUT .and. nrank == 0) .or. &
+          (decomp_log == D2D_LOG_TOFILE .and. nrank == 0) .or. &
+          (decomp_log == D2D_LOG_TOFILE_FULL)) then
+         io_unit = d2d_listing_get_unit()
+         write (io_unit, *) ''
+         write (io_unit, *) '***** Using the '//trim(backend)//' FFT engine *****'
+         write (io_unit, *) ''
+         write (io_unit, *) 'Id of the backend : ', D2D_FFT_BACKEND
+         if (format == PHYSICAL_IN_X) then
+            write (io_unit, *) 'Format : Physical in x'
+         else
+            write (io_unit, *) 'Format : Physical in z'
+         end if
+         write (io_unit, *) ''
+         if (nx_fft == nx_global .and. ny_fft == ny_global .and. nz_fft == nz_global) then
+            write (io_unit, *) 'decomp_info object ph is a pointer to decomp_main'
+         end if
+         call decomp_info_print(decomp_2d_fft_get_ph(), io_unit, "ph")
+         call decomp_info_print(decomp_2d_fft_get_sp(), io_unit, "sp")
+         write (io_unit, *) ''
+#ifdef OVERWRITE
+         if (D2D_FFT_BACKEND == D2D_FFT_BACKEND_GENERIC) then
+            write (io_unit, *) 'OVERWRITE is supported but transforms are not performed in-place'
+            write (io_unit, *) ''
+         else if (D2D_FFT_BACKEND == D2D_FFT_BACKEND_CUFFT .or. &
+                  D2D_FFT_BACKEND == D2D_FFT_BACKEND_FFTW3 .or. &
+                  D2D_FFT_BACKEND == D2D_FFT_BACKEND_FFTW3_F03 .or. &
+                  D2D_FFT_BACKEND == D2D_FFT_BACKEND_MKL) then
+            write (io_unit, *) 'OVERWRITE is supported but in-place transforms is limited to complex transforms'
+            write (io_unit, *) ''
+         end if
+#endif
+         call d2d_listing_close_unit(io_unit)
+      end if
+
+   end subroutine decomp_2d_fft_log
+
+end submodule d2d_fft_log
diff --git a/src/fft_mkl.f90 b/src/fft_mkl.f90
index b50120aa..da064e46 100644
--- a/src/fft_mkl.f90
+++ b/src/fft_mkl.f90
@@ -1,678 +1,819 @@
-!=======================================================================
-! This is part of the 2DECOMP&FFT library
-! 
-! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
-! decomposition. It also implements a highly scalable distributed
-! three-dimensional Fast Fourier Transform (FFT).
-!
-! Copyright (C) 2009-2012 Ning Li, the Numerical Algorithms Group (NAG)
-!
-!=======================================================================
+!! SPDX-License-Identifier: BSD-3-Clause
 
 ! This is the Intel MKL implementation of the FFT library
 
 module decomp_2d_fft
 
-  use decomp_2d  ! 2D decomposition module
-  use MKL_DFTI   ! MKL FFT module
+   use iso_c_binding, only: c_f_pointer, c_loc
+   use decomp_2d_constants
+   use decomp_2d_mpi
+   use decomp_2d  ! 2D decomposition module
+   use MKL_DFTI   ! MKL FFT module
 
-  implicit none
+   implicit none
 
-  private        ! Make everything private unless declared public
+   private        ! Make everything private unless declared public
 
-  ! engine-specific global variables
+   ! engine-specific global variables
 
-  ! Descriptors for MKL FFT, one for each set of 1D FFTs
-  !  for c2c transforms
-  type(DFTI_DESCRIPTOR), pointer :: c2c_x, c2c_y, c2c_z     
-  !  for r2c/c2r transforms, PHYSICAL_IN_X
-  type(DFTI_DESCRIPTOR), pointer :: r2c_x, c2c_y2, c2c_z2, c2r_x
-  !  for r2c/c2r transforms, PHYSICAL_IN_Z
-  type(DFTI_DESCRIPTOR), pointer :: r2c_z, c2c_x2, c2r_z
+   ! Descriptors for MKL FFT, one for each set of 1D FFTs
+   !  for c2c transforms
+   type(DFTI_DESCRIPTOR), pointer :: c2c_x, c2c_y, c2c_z
+   !  for r2c/c2r transforms, PHYSICAL_IN_X
+   type(DFTI_DESCRIPTOR), pointer :: r2c_x, c2c_y2, c2c_z2, c2r_x
+   !  for r2c/c2r transforms, PHYSICAL_IN_Z
+   type(DFTI_DESCRIPTOR), pointer :: r2c_z, c2c_x2, c2r_z
 
-  ! common code used for all engines, including global variables, 
-  ! generic interface definitions and several subroutines
+   integer, parameter, public :: D2D_FFT_BACKEND = D2D_FFT_BACKEND_MKL
+
+   ! common code used for all engines, including global variables,
+   ! generic interface definitions and several subroutines
 #include "fft_common.f90"
 
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  !  This routine performs one-time initialisations for the FFT engine
+   !  This routine performs one-time initialisations for the FFT engine
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine init_fft_engine
-
-    implicit none
+   subroutine init_fft_engine
 
-    if (nrank==0) then
-       write(*,*) ' '
-       write(*,*) '***** Using the MKL engine *****'
-       write(*,*) ' '
-    end if
-#ifdef OVERWRITE
-    call decomp_2d_warning(0, "MKL FFT engine does not support the OVERWRITE flag.")
-#endif
+      implicit none
 
-    ! For C2C transforms
-    call c2c_1m_x_plan(c2c_x, ph)
-    call c2c_1m_y_plan(c2c_y, ph)
-    call c2c_1m_z_plan(c2c_z, ph)
+      call decomp_2d_fft_log("MKL")
 
-    ! For R2C/C2R tranfroms with physical space in X-pencil
-    if (format == PHYSICAL_IN_X) then
-       call r2c_1m_x_plan(r2c_x, ph, sp, -1)
-       call c2c_1m_y_plan(c2c_y2, sp)
-       call c2c_1m_z_plan(c2c_z2, sp)
-       call r2c_1m_x_plan(c2r_x, ph, sp,  1)
+      ! For C2C transforms
+      call c2c_1m_x_plan(c2c_x, ph)
+      call c2c_1m_y_plan(c2c_y, ph)
+      call c2c_1m_z_plan(c2c_z, ph)
 
-       ! For R2C/C2R tranfroms with physical space in Z-pencil
-    else if (format == PHYSICAL_IN_Z) then
-       call r2c_1m_z_plan(r2c_z, ph, sp, -1)
-       call c2c_1m_y_plan(c2c_y2, sp)
-       call c2c_1m_x_plan(c2c_x2, sp)
-       call r2c_1m_z_plan(c2r_z, ph, sp,  1)
-    end if
+      ! For R2C/C2R tranfroms with physical space in X-pencil
+      if (format == PHYSICAL_IN_X) then
+         call r2c_1m_x_plan(r2c_x, ph, sp, -1)
+         call c2c_1m_y_plan(c2c_y2, sp)
+         call c2c_1m_z_plan(c2c_z2, sp)
+         call r2c_1m_x_plan(c2r_x, ph, sp, 1)
 
-    return
-  end subroutine init_fft_engine
+         ! For R2C/C2R tranfroms with physical space in Z-pencil
+      else if (format == PHYSICAL_IN_Z) then
+         call r2c_1m_z_plan(r2c_z, ph, sp, -1)
+         call c2c_1m_y_plan(c2c_y2, sp)
+         call c2c_1m_x_plan(c2c_x2, sp)
+         call r2c_1m_z_plan(c2r_z, ph, sp, 1)
+      end if
 
+      return
+   end subroutine init_fft_engine
 
-  ! Return an MKL plan for multiple 1D c2c FFTs in X direction
-  subroutine c2c_1m_x_plan(desc, decomp)
+   ! Return an MKL plan for multiple 1D c2c FFTs in X direction
+   subroutine c2c_1m_x_plan(desc, decomp)
 
-    implicit none
+      implicit none
 
-    type(DFTI_DESCRIPTOR), pointer :: desc
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
+      type(DFTI_DESCRIPTOR), pointer :: desc
+      TYPE(DECOMP_INFO), intent(IN) :: decomp
 
-    integer :: status
+      integer :: status
 
 #ifdef DOUBLE_PREC
-    status = DftiCreateDescriptor(desc, DFTI_DOUBLE, &
-         DFTI_COMPLEX, 1, decomp%xsz(1))
-#else       
-    status = DftiCreateDescriptor(desc, DFTI_SINGLE, &
-         DFTI_COMPLEX, 1, decomp%xsz(1))
+      status = DftiCreateDescriptor(desc, DFTI_DOUBLE, &
+                                    DFTI_COMPLEX, 1, decomp%xsz(1))
+#else
+      status = DftiCreateDescriptor(desc, DFTI_SINGLE, &
+                                    DFTI_COMPLEX, 1, decomp%xsz(1))
+#endif
+      if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiCreateDescriptor")
+      status = DftiSetValue(desc, DFTI_NUMBER_OF_TRANSFORMS, &
+                            decomp%xsz(2) * decomp%xsz(3))
+      if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
+#ifdef OVERWRITE
+      status = DftiSetValue(desc, DFTI_PLACEMENT, DFTI_INPLACE)
+#else
+      status = DftiSetValue(desc, DFTI_PLACEMENT, DFTI_NOT_INPLACE)
 #endif
-    if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiCreateDescriptor")
-    status = DftiSetValue(desc, DFTI_NUMBER_OF_TRANSFORMS, &
-         decomp%xsz(2)*decomp%xsz(3))
-    if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
-    status = DftiSetValue(desc, DFTI_PLACEMENT, DFTI_NOT_INPLACE)
-    if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
-    status = DftiSetValue(desc, DFTI_INPUT_DISTANCE, decomp%xsz(1))
-    if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
-    status = DftiSetValue(desc, DFTI_OUTPUT_DISTANCE, decomp%xsz(1))
-    if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
-    status = DftiCommitDescriptor(desc)
-    if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiCommitDescriptor")
+      if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
+      status = DftiSetValue(desc, DFTI_INPUT_DISTANCE, decomp%xsz(1))
+      if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
+      status = DftiSetValue(desc, DFTI_OUTPUT_DISTANCE, decomp%xsz(1))
+      if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
+      status = DftiCommitDescriptor(desc)
+      if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiCommitDescriptor")
 
-    return
-  end subroutine c2c_1m_x_plan
+      return
+   end subroutine c2c_1m_x_plan
 
-  ! Return an MKL plan for multiple 1D c2c FFTs in Y direction
-  subroutine c2c_1m_y_plan(desc, decomp)
+   ! Return an MKL plan for multiple 1D c2c FFTs in Y direction
+   subroutine c2c_1m_y_plan(desc, decomp)
 
-    implicit none
+      implicit none
 
-    type(DFTI_DESCRIPTOR), pointer :: desc
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
+      type(DFTI_DESCRIPTOR), pointer :: desc
+      TYPE(DECOMP_INFO), intent(IN) :: decomp
 
-    integer :: status, strides(2)
+      integer :: status, strides(2)
 
-    ! Due to memory pattern of 3D arrays, 1D FFTs along Y have to be
-    ! done one Z-plane at a time. So plan for 2D data sets here.
+      ! Due to memory pattern of 3D arrays, 1D FFTs along Y have to be
+      ! done one Z-plane at a time. So plan for 2D data sets here.
 
 #ifdef DOUBLE_PREC
-    status = DftiCreateDescriptor(desc, DFTI_DOUBLE, &
-         DFTI_COMPLEX, 1, decomp%ysz(2))
-#else
-    status = DftiCreateDescriptor(desc, DFTI_SINGLE, &
-         DFTI_COMPLEX, 1, decomp%ysz(2))
-#endif
-    if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiCreateDescriptor")
-    status = DftiSetValue(desc, DFTI_PLACEMENT, DFTI_NOT_INPLACE)
-    if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
-    status = DftiSetValue(desc, DFTI_NUMBER_OF_TRANSFORMS, decomp%ysz(1))
-    if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
-    status = DftiSetValue(desc, DFTI_INPUT_DISTANCE, 1)
-    if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
-    status = DftiSetValue(desc, DFTI_OUTPUT_DISTANCE, 1)
-    if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
-    strides(1) = 0
-    strides(2) = decomp%ysz(1)
-    status = DftiSetValue(desc, DFTI_INPUT_STRIDES, strides)
-    if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
-    status = DftiSetValue(desc, DFTI_OUTPUT_STRIDES, strides)
-    if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
-    status = DftiCommitDescriptor(desc)
-    if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiCommitDescriptor")
-
-    return
-  end subroutine c2c_1m_y_plan
-
-  ! Return an MKL plan for multiple 1D c2c FFTs in Z direction
-  subroutine c2c_1m_z_plan(desc, decomp)
-
-    implicit none
-
-    type(DFTI_DESCRIPTOR), pointer :: desc
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
-
-    integer :: status, strides(2)
+      status = DftiCreateDescriptor(desc, DFTI_DOUBLE, &
+                                    DFTI_COMPLEX, 1, decomp%ysz(2))
+#else
+      status = DftiCreateDescriptor(desc, DFTI_SINGLE, &
+                                    DFTI_COMPLEX, 1, decomp%ysz(2))
+#endif
+      if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiCreateDescriptor")
+#ifdef OVERWRITE
+      status = DftiSetValue(desc, DFTI_PLACEMENT, DFTI_INPLACE)
+#else
+      status = DftiSetValue(desc, DFTI_PLACEMENT, DFTI_NOT_INPLACE)
+#endif
+      if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
+      status = DftiSetValue(desc, DFTI_NUMBER_OF_TRANSFORMS, decomp%ysz(1))
+      if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
+      status = DftiSetValue(desc, DFTI_INPUT_DISTANCE, 1)
+      if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
+      status = DftiSetValue(desc, DFTI_OUTPUT_DISTANCE, 1)
+      if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
+      strides(1) = 0
+      strides(2) = decomp%ysz(1)
+      status = DftiSetValue(desc, DFTI_INPUT_STRIDES, strides)
+      if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
+      status = DftiSetValue(desc, DFTI_OUTPUT_STRIDES, strides)
+      if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
+      status = DftiCommitDescriptor(desc)
+      if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiCommitDescriptor")
+
+      return
+   end subroutine c2c_1m_y_plan
+
+   ! Return an MKL plan for multiple 1D c2c FFTs in Z direction
+   subroutine c2c_1m_z_plan(desc, decomp)
+
+      implicit none
+
+      type(DFTI_DESCRIPTOR), pointer :: desc
+      TYPE(DECOMP_INFO), intent(IN) :: decomp
+
+      integer :: status, strides(2)
 
 #ifdef DOUBLE_PREC
-    status = DftiCreateDescriptor(desc, DFTI_DOUBLE, &
-         DFTI_COMPLEX, 1, decomp%zsz(3))
-#else
-    status = DftiCreateDescriptor(desc, DFTI_SINGLE, &
-         DFTI_COMPLEX, 1, decomp%zsz(3))
-#endif
-    if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiCreateDescriptor")
-    status = DftiSetValue(desc, DFTI_PLACEMENT, DFTI_NOT_INPLACE)
-    if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
-    status = DftiSetValue(desc, DFTI_NUMBER_OF_TRANSFORMS, &
-         decomp%zsz(1)*decomp%zsz(2))
-    if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
-    status = DftiSetValue(desc, DFTI_INPUT_DISTANCE, 1)
-    if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
-    status = DftiSetValue(desc, DFTI_OUTPUT_DISTANCE, 1)
-    if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
-    strides(1) = 0
-    strides(2) = decomp%zsz(1)*decomp%zsz(2)
-    status = DftiSetValue(desc, DFTI_INPUT_STRIDES, strides)
-    if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
-    status = DftiSetValue(desc, DFTI_OUTPUT_STRIDES, strides)
-    if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
-    status = DftiCommitDescriptor(desc)
-    if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiCommitDescriptor")
-
-    return
-  end subroutine c2c_1m_z_plan
-
-  ! Return an MKL plan for multiple 1D r2c FFTs in X direction
-  subroutine r2c_1m_x_plan(desc, decomp_ph, decomp_sp, direction)
-
-    implicit none
-
-    type(DFTI_DESCRIPTOR), pointer :: desc
-    TYPE(DECOMP_INFO), intent(IN) :: decomp_ph, decomp_sp
-    integer, intent(IN) :: direction ! (-1=r2c; 1=c2r)
-
-    integer :: status
-
-    ! c2r and r2c plans are almost the same, just swap input/output
+      status = DftiCreateDescriptor(desc, DFTI_DOUBLE, &
+                                    DFTI_COMPLEX, 1, decomp%zsz(3))
+#else
+      status = DftiCreateDescriptor(desc, DFTI_SINGLE, &
+                                    DFTI_COMPLEX, 1, decomp%zsz(3))
+#endif
+      if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiCreateDescriptor")
+#ifdef OVERWRITE
+      status = DftiSetValue(desc, DFTI_PLACEMENT, DFTI_INPLACE)
+#else
+      status = DftiSetValue(desc, DFTI_PLACEMENT, DFTI_NOT_INPLACE)
+#endif
+      if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
+      status = DftiSetValue(desc, DFTI_NUMBER_OF_TRANSFORMS, &
+                            decomp%zsz(1) * decomp%zsz(2))
+      if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
+      status = DftiSetValue(desc, DFTI_INPUT_DISTANCE, 1)
+      if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
+      status = DftiSetValue(desc, DFTI_OUTPUT_DISTANCE, 1)
+      if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
+      strides(1) = 0
+      strides(2) = decomp%zsz(1) * decomp%zsz(2)
+      status = DftiSetValue(desc, DFTI_INPUT_STRIDES, strides)
+      if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
+      status = DftiSetValue(desc, DFTI_OUTPUT_STRIDES, strides)
+      if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
+      status = DftiCommitDescriptor(desc)
+      if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiCommitDescriptor")
+
+      return
+   end subroutine c2c_1m_z_plan
+
+   ! Return an MKL plan for multiple 1D r2c FFTs in X direction
+   subroutine r2c_1m_x_plan(desc, decomp_ph, decomp_sp, direction)
+
+      implicit none
+
+      type(DFTI_DESCRIPTOR), pointer :: desc
+      TYPE(DECOMP_INFO), intent(IN) :: decomp_ph, decomp_sp
+      integer, intent(IN) :: direction ! (-1=r2c; 1=c2r)
+
+      integer :: status
+
+      ! c2r and r2c plans are almost the same, just swap input/output
 
 #ifdef DOUBLE_PREC
-    status = DftiCreateDescriptor(desc, DFTI_DOUBLE, &
-         DFTI_REAL, 1, decomp_ph%xsz(1))
-#else
-    status = DftiCreateDescriptor(desc, DFTI_SINGLE, &
-         DFTI_REAL, 1, decomp_ph%xsz(1))
-#endif
-    if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiCreateDescriptor")
-    status = DftiSetValue(desc, DFTI_NUMBER_OF_TRANSFORMS, &
-         decomp_ph%xsz(2)*decomp_ph%xsz(3))
-    if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
-    status = DftiSetValue(desc, DFTI_CONJUGATE_EVEN_STORAGE, &
-         DFTI_COMPLEX_COMPLEX)
-    if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
-    status = DftiSetValue(desc, DFTI_PLACEMENT, DFTI_NOT_INPLACE)
-    if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
-    if (direction == -1) then  ! r2c
-       status = DftiSetValue(desc, DFTI_INPUT_DISTANCE, &
-            decomp_ph%xsz(1))
-       if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
-       status = DftiSetValue(desc, DFTI_OUTPUT_DISTANCE, &
-            decomp_sp%xsz(1))
-       if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
-    else if (direction == 1) then  ! c2r
-       status = DftiSetValue(desc, DFTI_INPUT_DISTANCE, &
-            decomp_sp%xsz(1))
-       if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
-       status = DftiSetValue(desc, DFTI_OUTPUT_DISTANCE, &
-            decomp_ph%xsz(1))
-       if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
-    end if
-    status = DftiCommitDescriptor(desc)
-    if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiCommitDescriptor")
-
-    return
-  end subroutine r2c_1m_x_plan
-
-  ! Return an MKL plan for multiple 1D r2c FFTs in Z direction
-  subroutine r2c_1m_z_plan(desc, decomp_ph, decomp_sp, direction)
-
-    implicit none
-
-    type(DFTI_DESCRIPTOR), pointer :: desc
-    TYPE(DECOMP_INFO), intent(IN) :: decomp_ph, decomp_sp
-    integer, intent(IN) :: direction ! (-1=r2c; 1=c2r)
-
-    integer :: status, strides(2)
-
-    ! c2r and r2c plans are almost the same, just swap input/output
+      status = DftiCreateDescriptor(desc, DFTI_DOUBLE, &
+                                    DFTI_REAL, 1, decomp_ph%xsz(1))
+#else
+      status = DftiCreateDescriptor(desc, DFTI_SINGLE, &
+                                    DFTI_REAL, 1, decomp_ph%xsz(1))
+#endif
+      if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiCreateDescriptor")
+      status = DftiSetValue(desc, DFTI_NUMBER_OF_TRANSFORMS, &
+                            decomp_ph%xsz(2) * decomp_ph%xsz(3))
+      if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
+      status = DftiSetValue(desc, DFTI_CONJUGATE_EVEN_STORAGE, &
+                            DFTI_COMPLEX_COMPLEX)
+      if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
+      status = DftiSetValue(desc, DFTI_PLACEMENT, DFTI_NOT_INPLACE)
+      if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
+      if (direction == -1) then  ! r2c
+         status = DftiSetValue(desc, DFTI_INPUT_DISTANCE, &
+                               decomp_ph%xsz(1))
+         if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
+         status = DftiSetValue(desc, DFTI_OUTPUT_DISTANCE, &
+                               decomp_sp%xsz(1))
+         if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
+      else if (direction == 1) then  ! c2r
+         status = DftiSetValue(desc, DFTI_INPUT_DISTANCE, &
+                               decomp_sp%xsz(1))
+         if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
+         status = DftiSetValue(desc, DFTI_OUTPUT_DISTANCE, &
+                               decomp_ph%xsz(1))
+         if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
+      end if
+      status = DftiCommitDescriptor(desc)
+      if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiCommitDescriptor")
+
+      return
+   end subroutine r2c_1m_x_plan
+
+   ! Return an MKL plan for multiple 1D r2c FFTs in Z direction
+   subroutine r2c_1m_z_plan(desc, decomp_ph, decomp_sp, direction)
+
+      implicit none
+
+      type(DFTI_DESCRIPTOR), pointer :: desc
+      TYPE(DECOMP_INFO), intent(IN) :: decomp_ph, decomp_sp
+      integer, intent(IN) :: direction ! (-1=r2c; 1=c2r)
+
+      integer :: status, strides(2)
+
+      ! c2r and r2c plans are almost the same, just swap input/output
 
 #ifdef DOUBLE_PREC
-    status = DftiCreateDescriptor(desc, DFTI_DOUBLE, &
-         DFTI_REAL, 1, decomp_ph%zsz(3))
-#else
-    status = DftiCreateDescriptor(desc, DFTI_SINGLE, &
-         DFTI_REAL, 1, decomp_ph%zsz(3))
-#endif
-    if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiCreateDescriptor")
-    status = DftiSetValue(desc, DFTI_NUMBER_OF_TRANSFORMS, &
-         decomp_ph%zsz(1)*decomp_ph%zsz(2))
-    if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
-    status = DftiSetValue(desc, DFTI_CONJUGATE_EVEN_STORAGE, &
-         DFTI_COMPLEX_COMPLEX)
-    if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
-    status = DftiSetValue(desc, DFTI_PLACEMENT, DFTI_NOT_INPLACE)
-    if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
-    status = DftiSetValue(desc, DFTI_INPUT_DISTANCE, 1)
-    if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
-    status = DftiSetValue(desc, DFTI_OUTPUT_DISTANCE, 1)
-    if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
-    strides(1) = 0
-    strides(2) = decomp_ph%zsz(1)*decomp_ph%zsz(2)
-    if (direction == -1) then
-       status = DftiSetValue(desc, DFTI_INPUT_STRIDES, strides)
-    else if (direction == 1) then
-       status = DftiSetValue(desc, DFTI_OUTPUT_STRIDES, strides)
-    end if
-    if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
-    strides(2) = decomp_sp%zsz(1)*decomp_sp%zsz(2)
-    if (direction == -1) then
-       status = DftiSetValue(desc, DFTI_OUTPUT_STRIDES, strides)
-    else if (direction == 1) then
-       status = DftiSetValue(desc, DFTI_INPUT_STRIDES, strides)
-    end if
-    if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
-    status = DftiCommitDescriptor(desc)
-    if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiCommitDescriptor")
-
-    return
-  end subroutine r2c_1m_z_plan
-
+      status = DftiCreateDescriptor(desc, DFTI_DOUBLE, &
+                                    DFTI_REAL, 1, decomp_ph%zsz(3))
+#else
+      status = DftiCreateDescriptor(desc, DFTI_SINGLE, &
+                                    DFTI_REAL, 1, decomp_ph%zsz(3))
+#endif
+      if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiCreateDescriptor")
+      status = DftiSetValue(desc, DFTI_NUMBER_OF_TRANSFORMS, &
+                            decomp_ph%zsz(1) * decomp_ph%zsz(2))
+      if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
+      status = DftiSetValue(desc, DFTI_CONJUGATE_EVEN_STORAGE, &
+                            DFTI_COMPLEX_COMPLEX)
+      if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
+      status = DftiSetValue(desc, DFTI_PLACEMENT, DFTI_NOT_INPLACE)
+      if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
+      status = DftiSetValue(desc, DFTI_INPUT_DISTANCE, 1)
+      if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
+      status = DftiSetValue(desc, DFTI_OUTPUT_DISTANCE, 1)
+      if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
+      strides(1) = 0
+      strides(2) = decomp_ph%zsz(1) * decomp_ph%zsz(2)
+      if (direction == -1) then
+         status = DftiSetValue(desc, DFTI_INPUT_STRIDES, strides)
+      else if (direction == 1) then
+         status = DftiSetValue(desc, DFTI_OUTPUT_STRIDES, strides)
+      end if
+      if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
+      strides(2) = decomp_sp%zsz(1) * decomp_sp%zsz(2)
+      if (direction == -1) then
+         status = DftiSetValue(desc, DFTI_OUTPUT_STRIDES, strides)
+      else if (direction == 1) then
+         status = DftiSetValue(desc, DFTI_INPUT_STRIDES, strides)
+      end if
+      if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiSetValue")
+      status = DftiCommitDescriptor(desc)
+      if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiCommitDescriptor")
+
+      return
+   end subroutine r2c_1m_z_plan
 
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  !  This routine performs one-time finalisations for the FFT engine
+   !  This routine performs one-time finalisations for the FFT engine
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine finalize_fft_engine
-
-    implicit none
-
-    integer :: status
-
-    status = DftiFreeDescriptor(c2c_x)
-    if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiFreeDescriptor")
-    status = DftiFreeDescriptor(c2c_y)
-    if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiFreeDescriptor")
-    status = DftiFreeDescriptor(c2c_z)
-    if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiFreeDescriptor")
-    if (format==PHYSICAL_IN_X) then
-       status = DftiFreeDescriptor(r2c_x)
-       if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiFreeDescriptor")
-       status = DftiFreeDescriptor(c2c_z2)
-       if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiFreeDescriptor")
-       status = DftiFreeDescriptor(c2r_x)
-       if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiFreeDescriptor")
-    else if (format==PHYSICAL_IN_Z) then
-       status = DftiFreeDescriptor(r2c_z)
-       if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiFreeDescriptor")
-       status = DftiFreeDescriptor(c2c_x2)
-       if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiFreeDescriptor")
-       status = DftiFreeDescriptor(c2r_z)
-       if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiFreeDescriptor")
-    end if
-    status = DftiFreeDescriptor(c2c_y2)
-    if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiFreeDescriptor")
-
-    return
-  end subroutine finalize_fft_engine
-
+   subroutine finalize_fft_engine
+
+      implicit none
+
+      integer :: status
+
+      status = DftiFreeDescriptor(c2c_x)
+      if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiFreeDescriptor")
+      status = DftiFreeDescriptor(c2c_y)
+      if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiFreeDescriptor")
+      status = DftiFreeDescriptor(c2c_z)
+      if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiFreeDescriptor")
+      if (format == PHYSICAL_IN_X) then
+         status = DftiFreeDescriptor(r2c_x)
+         if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiFreeDescriptor")
+         status = DftiFreeDescriptor(c2c_z2)
+         if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiFreeDescriptor")
+         status = DftiFreeDescriptor(c2r_x)
+         if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiFreeDescriptor")
+      else if (format == PHYSICAL_IN_Z) then
+         status = DftiFreeDescriptor(r2c_z)
+         if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiFreeDescriptor")
+         status = DftiFreeDescriptor(c2c_x2)
+         if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiFreeDescriptor")
+         status = DftiFreeDescriptor(c2r_z)
+         if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiFreeDescriptor")
+      end if
+      status = DftiFreeDescriptor(c2c_y2)
+      if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "DftiFreeDescriptor")
+
+      return
+   end subroutine finalize_fft_engine
 
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! 3D FFT - complex to complex
+   ! 3D FFT - complex to complex
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine fft_3d_c2c(in, out, isign)
+   subroutine fft_3d_c2c(in, out, isign)
 
-    implicit none
+      implicit none
 
-    complex(mytype), dimension(:,:,:), intent(IN) :: in
-    complex(mytype), dimension(:,:,:), intent(OUT) :: out
-    integer, intent(IN) :: isign   
+      complex(mytype), dimension(:, :, :), intent(IN) :: in
+      complex(mytype), dimension(:, :, :), intent(OUT) :: out
+      integer, intent(IN) :: isign
 
-    complex(mytype), allocatable, dimension(:,:,:) :: wk1,wk2,wk2b,wk3
-    integer :: k, status
+#ifndef OVERWRITE
+      complex(mytype), allocatable, dimension(:, :, :) :: wk1, wk2b, wk3
+#endif
+      integer :: k, status
 
 #ifdef PROFILER
-if (decomp_profiler_fft) call decomp_profiler_start("fft_c2c")
-#endif
-
-    if (format==PHYSICAL_IN_X .AND. isign==DECOMP_2D_FFT_FORWARD .OR.  &
-         format==PHYSICAL_IN_Z .AND. isign==DECOMP_2D_FFT_BACKWARD) then
-
-       ! ===== 1D FFTs in X =====
-       allocate (wk1(ph%xsz(1),ph%xsz(2),ph%xsz(3)))
-       !       if (isign==DECOMP_2D_FFT_FORWARD) then
-       !          status = DftiComputeForward(c2c_x, in(:,1,1), wk1(:,1,1))
-       !       else if (isign==DECOMP_2D_FFT_BACKWARD) then
-       !          status = DftiComputeBackward(c2c_x, in(:,1,1), wk1(:,1,1))
-       !       end if
-       status = wrapper_c2c(c2c_x, in, wk1, isign)
-       if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "wrapper_c2c")
-
-       ! ===== Swap X --> Y =====
-       allocate (wk2(ph%ysz(1),ph%ysz(2),ph%ysz(3)))
-       call transpose_x_to_y(wk1,wk2,ph)
-
-       ! ===== 1D FFTs in Y =====
-       allocate (wk2b(ph%ysz(1),ph%ysz(2),ph%ysz(3))) 
-       do k=1,ph%xsz(3) ! one Z-plane at a time
-          !          if (isign==DECOMP_2D_FFT_FORWARD) then
-          !             status = DftiComputeForward(c2c_y, wk2(:,1,k), wk2b(:,1,k))
-          !          else if (isign==DECOMP_2D_FFT_BACKWARD) then
-          !             status = DftiComputeBackward(c2c_y, wk2(:,1,k), wk2b(:,1,k))
-          !          end if
-          status = wrapper_c2c(c2c_y, wk2(1,1,k), wk2b(1,1,k), isign)
-          if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "wrapper_c2c")
-       end do
-
-       ! ===== Swap Y --> Z =====
-       allocate (wk3(ph%zsz(1),ph%zsz(2),ph%zsz(3)))
-       call transpose_y_to_z(wk2b,wk3,ph)
-
-       ! ===== 1D FFTs in Z =====
-       !       if (isign==DECOMP_2D_FFT_FORWARD) then
-       !          status = DftiComputeForward(c2c_z, wk3(:,1,1), out(:,1,1))
-       !       else if (isign==DECOMP_2D_FFT_BACKWARD) then
-       !          status = DftiComputeBackward(c2c_z, wk3(:,1,1), out(:,1,1))
-       !       end if
-       status = wrapper_c2c(c2c_z, wk3, out, isign)
-       if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "wrapper_c2c")
-
-    else if (format==PHYSICAL_IN_X .AND. isign==DECOMP_2D_FFT_BACKWARD &
-         .OR. & 
-         format==PHYSICAL_IN_Z .AND. isign==DECOMP_2D_FFT_FORWARD) then
-
-       ! ===== 1D FFTs in Z =====
-       allocate (wk1(ph%zsz(1),ph%zsz(2),ph%zsz(3)))
-       !       if (isign==DECOMP_2D_FFT_FORWARD) then
-       !          status = DftiComputeForward(c2c_z, in(:,1,1), wk1(:,1,1))
-       !       else if (isign==DECOMP_2D_FFT_BACKWARD) then
-       !          status = DftiComputeBackward(c2c_z, in(:,1,1), wk1(:,1,1))
-       !       end if
-       status = wrapper_c2c(c2c_z, in, wk1, isign)
-       if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "wrapper_c2c")
-
-       ! ===== Swap Z --> Y =====
-       allocate (wk2(ph%ysz(1),ph%ysz(2),ph%ysz(3)))
-       call transpose_z_to_y(wk1,wk2,ph)
-
-       ! ===== 1D FFTs in Y =====
-       allocate (wk2b(ph%ysz(1),ph%ysz(2),ph%ysz(3)))
-       do k=1,ph%xsz(3) ! one Z-plane at a time
-          !          if (isign==DECOMP_2D_FFT_FORWARD) then
-          !             status = DftiComputeForward(c2c_y, wk2(:,1,k), wk2b(:,1,k))
-          !          else if (isign==DECOMP_2D_FFT_BACKWARD) then
-          !             status = DftiComputeBackward(c2c_y, wk2(:,1,k), wk2b(:,1,k))
-          !          end if
-          status = wrapper_c2c(c2c_y, wk2(1,1,k), wk2b(1,1,k), isign)
-          if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "wrapper_c2c")
-       end do
-
-       ! ===== Swap Y --> X =====
-       allocate (wk3(ph%xsz(1),ph%xsz(2),ph%xsz(3)))
-       call transpose_y_to_x(wk2b,wk3,ph)
-
-       ! ===== 1D FFTs in X =====
-       !       if (isign==DECOMP_2D_FFT_FORWARD) then
-       !          status = DftiComputeForward(c2c_x, wk3(:,1,1), out(:,1,1))
-       !       else if (isign==DECOMP_2D_FFT_BACKWARD) then
-       !          status = DftiComputeBackward(c2c_x, wk3(:,1,1), out(:,1,1))
-       !       end if
-       status = wrapper_c2c(c2c_x, wk3, out, isign)
-       if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "wrapper_c2c")
-
-    end if
+      if (decomp_profiler_fft) call decomp_profiler_start("fft_c2c")
+#endif
 
-#ifdef PROFILER
-if (decomp_profiler_fft) call decomp_profiler_end("fft_c2c")
+      if (format == PHYSICAL_IN_X .AND. isign == DECOMP_2D_FFT_FORWARD .OR. &
+          format == PHYSICAL_IN_Z .AND. isign == DECOMP_2D_FFT_BACKWARD) then
+
+         ! ===== 1D FFTs in X =====
+         !       if (isign==DECOMP_2D_FFT_FORWARD) then
+         !          status = DftiComputeForward(c2c_x, in(:,1,1), wk1(:,1,1))
+         !       else if (isign==DECOMP_2D_FFT_BACKWARD) then
+         !          status = DftiComputeBackward(c2c_x, in(:,1,1), wk1(:,1,1))
+         !       end if
+#ifdef OVERWRITE
+         status = wrapper_c2c_inplace(c2c_x, in, isign)
+#else
+         allocate (wk1(ph%xsz(1), ph%xsz(2), ph%xsz(3)))
+         status = wrapper_c2c(c2c_x, in, wk1, isign)
+#endif
+         if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "wrapper_c2c")
+
+         ! ===== Swap X --> Y =====
+#ifdef OVERWRITE
+         call transpose_x_to_y(in, wk2_c2c, ph)
+#else
+         call transpose_x_to_y(wk1, wk2_c2c, ph)
+#endif
+
+         ! ===== 1D FFTs in Y =====
+#ifdef OVERWRITE
+         do k = 1, ph%ysz(3)
+            status = wrapper_c2c_inplace(c2c_y, wk2_c2c(1, 1, k), isign)
+            if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "wrapper_c2c")
+         end do
+#else
+         allocate (wk2b(ph%ysz(1), ph%ysz(2), ph%ysz(3)))
+         do k = 1, ph%ysz(3) ! one Z-plane at a time
+            !          if (isign==DECOMP_2D_FFT_FORWARD) then
+            !             status = DftiComputeForward(c2c_y, wk2(:,1,k), wk2b(:,1,k))
+            !          else if (isign==DECOMP_2D_FFT_BACKWARD) then
+            !             status = DftiComputeBackward(c2c_y, wk2(:,1,k), wk2b(:,1,k))
+            !          end if
+            status = wrapper_c2c(c2c_y, wk2_c2c(1, 1, k), wk2b(1, 1, k), isign)
+            if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "wrapper_c2c")
+         end do
+#endif
+
+         ! ===== Swap Y --> Z =====
+#ifdef OVERWRITE
+         call transpose_y_to_z(wk2_c2c, out, ph)
+#else
+         allocate (wk3(ph%zsz(1), ph%zsz(2), ph%zsz(3)))
+         call transpose_y_to_z(wk2b, wk3, ph)
+#endif
+
+         ! ===== 1D FFTs in Z =====
+         !       if (isign==DECOMP_2D_FFT_FORWARD) then
+         !          status = DftiComputeForward(c2c_z, wk3(:,1,1), out(:,1,1))
+         !       else if (isign==DECOMP_2D_FFT_BACKWARD) then
+         !          status = DftiComputeBackward(c2c_z, wk3(:,1,1), out(:,1,1))
+         !       end if
+#ifdef OVERWRITE
+         status = wrapper_c2c_inplace(c2c_z, out, isign)
+#else
+         status = wrapper_c2c(c2c_z, wk3, out, isign)
+#endif
+         if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "wrapper_c2c")
+
+      else if (format == PHYSICAL_IN_X .AND. isign == DECOMP_2D_FFT_BACKWARD &
+               .OR. &
+               format == PHYSICAL_IN_Z .AND. isign == DECOMP_2D_FFT_FORWARD) then
+
+         ! ===== 1D FFTs in Z =====
+         !       if (isign==DECOMP_2D_FFT_FORWARD) then
+         !          status = DftiComputeForward(c2c_z, in(:,1,1), wk1(:,1,1))
+         !       else if (isign==DECOMP_2D_FFT_BACKWARD) then
+         !          status = DftiComputeBackward(c2c_z, in(:,1,1), wk1(:,1,1))
+         !       end if
+#ifdef OVERWRITE
+         status = wrapper_c2c_inplace(c2c_z, in, isign)
+#else
+         allocate (wk1(ph%zsz(1), ph%zsz(2), ph%zsz(3)))
+         status = wrapper_c2c(c2c_z, in, wk1, isign)
+#endif
+         if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "wrapper_c2c")
+
+         ! ===== Swap Z --> Y =====
+#ifdef OVERWRITE
+         call transpose_z_to_y(in, wk2_c2c, ph)
+#else
+         call transpose_z_to_y(wk1, wk2_c2c, ph)
+#endif
+
+         ! ===== 1D FFTs in Y =====
+#ifdef OVERWRITE
+         do k = 1, ph%ysz(3)
+            status = wrapper_c2c_inplace(c2c_y, wk2_c2c(1, 1, k), isign)
+            if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "wrapper_c2c")
+         end do
+#else
+         allocate (wk2b(ph%ysz(1), ph%ysz(2), ph%ysz(3)))
+         do k = 1, ph%ysz(3) ! one Z-plane at a time
+            !          if (isign==DECOMP_2D_FFT_FORWARD) then
+            !             status = DftiComputeForward(c2c_y, wk2(:,1,k), wk2b(:,1,k))
+            !          else if (isign==DECOMP_2D_FFT_BACKWARD) then
+            !             status = DftiComputeBackward(c2c_y, wk2(:,1,k), wk2b(:,1,k))
+            !          end if
+            status = wrapper_c2c(c2c_y, wk2_c2c(1, 1, k), wk2b(1, 1, k), isign)
+            if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "wrapper_c2c")
+         end do
+#endif
+
+         ! ===== Swap Y --> X =====
+#ifdef OVERWRITE
+         call transpose_y_to_x(wk2_c2c, out, ph)
+#else
+         allocate (wk3(ph%xsz(1), ph%xsz(2), ph%xsz(3)))
+         call transpose_y_to_x(wk2b, wk3, ph)
+#endif
+
+         ! ===== 1D FFTs in X =====
+         !       if (isign==DECOMP_2D_FFT_FORWARD) then
+         !          status = DftiComputeForward(c2c_x, wk3(:,1,1), out(:,1,1))
+         !       else if (isign==DECOMP_2D_FFT_BACKWARD) then
+         !          status = DftiComputeBackward(c2c_x, wk3(:,1,1), out(:,1,1))
+         !       end if
+#ifdef OVERWRITE
+         status = wrapper_c2c_inplace(c2c_x, out, isign)
+#else
+         status = wrapper_c2c(c2c_x, wk3, out, isign)
+#endif
+         if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "wrapper_c2c")
+
+      end if
+
+      ! Free memory
+#ifndef OVERWRITE
+      deallocate (wk1, wk2b, wk3)
 #endif
 
-    return
-  end subroutine fft_3d_c2c
+#ifdef PROFILER
+      if (decomp_profiler_fft) call decomp_profiler_end("fft_c2c")
+#endif
 
+      return
+   end subroutine fft_3d_c2c
 
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! 3D forward FFT - real to complex
+   ! 3D forward FFT - real to complex
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine fft_3d_r2c(in_r, out_c)
+   subroutine fft_3d_r2c(in_r, out_c)
 
-    implicit none
+      implicit none
 
-    real(mytype), dimension(:,:,:), intent(IN) :: in_r
-    complex(mytype), dimension(:,:,:), intent(OUT) :: out_c
+      real(mytype), dimension(:, :, :), intent(IN) :: in_r
+      complex(mytype), dimension(:, :, :), intent(OUT) :: out_c
 
-    complex(mytype), allocatable, dimension(:,:,:) :: wk1,wk2,wk2b,wk3
-    integer :: k, status, isign
+#ifndef OVERWRITE
+      complex(mytype), allocatable, dimension(:, :, :) :: wk2b, wk3
+#endif
+      integer :: k, status, isign
 
 #ifdef PROFILER
-if (decomp_profiler_fft) call decomp_profiler_start("fft_r2c")
+      if (decomp_profiler_fft) call decomp_profiler_start("fft_r2c")
 #endif
 
-    isign = DECOMP_2D_FFT_FORWARD
+      isign = DECOMP_2D_FFT_FORWARD
+
+      if (format == PHYSICAL_IN_X) then
 
-    if (format==PHYSICAL_IN_X) then
+         ! ===== 1D FFTs in X =====
+         !       status = DftiComputeForward(r2c_x, in_r(:,1,1), wk1(:,1,1))
+         status = wrapper_r2c(r2c_x, in_r, wk13)
+         if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "wrapper_r2c")
 
-       ! ===== 1D FFTs in X =====
-       allocate(wk1(sp%xsz(1),sp%xsz(2),sp%xsz(3)))
-       !       status = DftiComputeForward(r2c_x, in_r(:,1,1), wk1(:,1,1))       
-       status = wrapper_r2c(r2c_x, in_r, wk1)
-       if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "wrapper_r2c")
+         ! ===== Swap X --> Y =====
+         call transpose_x_to_y(wk13, wk2_r2c, sp)
 
-       ! ===== Swap X --> Y =====
-       allocate (wk2(sp%ysz(1),sp%ysz(2),sp%ysz(3)))
-       call transpose_x_to_y(wk1,wk2,sp)
+         ! ===== 1D FFTs in Y =====
+#ifdef OVERWRITE
+         do k = 1, sp%ysz(3)
+            status = wrapper_c2c_inplace(c2c_y2, wk2_r2c(:, :, k), isign)
+            if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "wrapper_c2c")
+         end do
+#else
+         allocate (wk2b(sp%ysz(1), sp%ysz(2), sp%ysz(3)))
+         do k = 1, sp%ysz(3)
+            !          status = DftiComputeForward(c2c_y2, wk2(:,1,k), wk2b(:,1,k))
+            status = wrapper_c2c(c2c_y2, wk2_r2c(:, :, k), wk2b(1, 1, k), isign)
+            if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "wrapper_c2c")
+         end do
+#endif
 
-       ! ===== 1D FFTs in Y =====
-       allocate (wk2b(sp%ysz(1),sp%ysz(2),sp%ysz(3)))
-       do k=1,sp%ysz(3)
-          !          status = DftiComputeForward(c2c_y2, wk2(:,1,k), wk2b(:,1,k))
-          status = wrapper_c2c(c2c_y2, wk2(1,1,k), wk2b(1,1,k), isign)
-          if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "wrapper_c2c")
-       end do
+         ! ===== Swap Y --> Z =====
+#ifdef OVERWRITE
+         call transpose_y_to_z(wk2_r2c, out_c, sp)
+#else
+         allocate (wk3(sp%zsz(1), sp%zsz(2), sp%zsz(3)))
+         call transpose_y_to_z(wk2b, wk3, sp)
+#endif
 
-       ! ===== Swap Y --> Z =====
-       allocate (wk3(sp%zsz(1),sp%zsz(2),sp%zsz(3)))
-       call transpose_y_to_z(wk2b,wk3,sp)
+         ! ===== 1D FFTs in Z =====
+         !       status = DftiComputeForward(c2c_z2, wk3(:,1,1), out_c(:,1,1))
+#ifdef OVERWRITE
+         status = wrapper_c2c_inplace(c2c_z2, out_c, isign)
+#else
+         status = wrapper_c2c(c2c_z2, wk3, out_c, isign)
+#endif
+         if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "wrapper_c2c")
 
-       ! ===== 1D FFTs in Z =====
-       !       status = DftiComputeForward(c2c_z2, wk3(:,1,1), out_c(:,1,1))
-       status = wrapper_c2c(c2c_z2, wk3, out_c, isign)
-       if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "wrapper_c2c")
+      else if (format == PHYSICAL_IN_Z) then
 
-    else if (format==PHYSICAL_IN_Z) then
+         ! ===== 1D FFTs in Z =====
+         !       status = DftiComputeForward(r2c_z, in_r(:,1,1), wk1(:,1,1))
+         status = wrapper_r2c(r2c_z, in_r, wk13)
+         if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "wrapper_r2c")
 
-       ! ===== 1D FFTs in Z =====
-       allocate(wk1(sp%zsz(1),sp%zsz(2),sp%zsz(3)))
-       !       status = DftiComputeForward(r2c_z, in_r(:,1,1), wk1(:,1,1))
-       status = wrapper_r2c(r2c_z, in_r, wk1)
-       if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "wrapper_r2c")
+         ! ===== Swap Z --> Y =====
+         call transpose_z_to_y(wk13, wk2_r2c, sp)
 
-       ! ===== Swap Z --> Y =====
-       allocate (wk2(sp%ysz(1),sp%ysz(2),sp%ysz(3)))
-       call transpose_z_to_y(wk1,wk2,sp)
+         ! ===== 1D FFTs in Y =====
+#ifdef OVERWRITE
+         do k = 1, sp%ysz(3)
+            status = wrapper_c2c_inplace(c2c_y2, wk2_r2c(:, :, k), isign)
+            if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "wrapper_c2c")
+         end do
+#else
+         allocate (wk2b(sp%ysz(1), sp%ysz(2), sp%ysz(3)))
+         do k = 1, sp%ysz(3)
+            !          status = DftiComputeForward(c2c_y2, wk2(:,1,k), wk2b(:,1,k))
+            status = wrapper_c2c(c2c_y2, wk2_r2c(:, :, k), wk2b(1, 1, k), isign)
+            if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "wrapper_c2c")
+         end do
+#endif
 
-       ! ===== 1D FFTs in Y =====
-       allocate (wk2b(sp%ysz(1),sp%ysz(2),sp%ysz(3)))
-       do k=1,sp%ysz(3)
-          !          status = DftiComputeForward(c2c_y2, wk2(:,1,k), wk2b(:,1,k))
-          status = wrapper_c2c(c2c_y2, wk2(1,1,k), wk2b(1,1,k), isign)
-          if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "wrapper_c2c")
-       end do
+         ! ===== Swap Y --> X =====
+#ifdef OVERWRITE
+         call transpose_y_to_x(wk2_r2c, out_c, sp)
+#else
+         allocate (wk3(sp%xsz(1), sp%xsz(2), sp%xsz(3)))
+         call transpose_y_to_x(wk2b, wk3, sp)
+#endif
 
-       ! ===== Swap Y --> X =====
-       allocate (wk3(sp%xsz(1),sp%xsz(2),sp%xsz(3)))
-       call transpose_y_to_x(wk2b,wk3,sp)
+         ! ===== 1D FFTs in X =====
+         !       status = DftiComputeForward(c2c_x2, wk3(:,1,1), out_c(:,1,1))
+#ifdef OVERWRITE
+         status = wrapper_c2c_inplace(c2c_x2, out_c, isign)
+#else
+         status = wrapper_c2c(c2c_x2, wk3, out_c, isign)
+#endif
+         if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "wrapper_c2c")
 
-       ! ===== 1D FFTs in X =====
-       !       status = DftiComputeForward(c2c_x2, wk3(:,1,1), out_c(:,1,1))
-       status = wrapper_c2c(c2c_x2, wk3, out_c, isign)
-       if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "wrapper_c2c")
+      end if
 
-    end if
+      ! Free memory
+#ifndef OVERWRITE
+      deallocate (wk2b, wk3)
+#endif
 
 #ifdef PROFILER
-if (decomp_profiler_fft) call decomp_profiler_end("fft_r2c")
+      if (decomp_profiler_fft) call decomp_profiler_end("fft_r2c")
 #endif
 
-    return
-  end subroutine fft_3d_r2c
-
+      return
+   end subroutine fft_3d_r2c
 
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! 3D inverse FFT - complex to real
+   ! 3D inverse FFT - complex to real
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine fft_3d_c2r(in_c, out_r)
+   subroutine fft_3d_c2r(in_c, out_r)
 
-    implicit none
+      implicit none
 
-    complex(mytype), dimension(:,:,:), intent(IN) :: in_c
-    real(mytype), dimension(:,:,:), intent(OUT) :: out_r
+      complex(mytype), dimension(:, :, :), intent(IN) :: in_c
+      real(mytype), dimension(:, :, :), intent(OUT) :: out_r
 
-    complex(mytype), allocatable, dimension(:,:,:) :: wk1,wk2,wk2b,wk3
-    integer :: k, status, isign
+#ifndef OVERWRITE
+      complex(mytype), allocatable, dimension(:, :, :) :: wk1, wk2b
+#endif
+      integer :: k, status, isign
 
 #ifdef PROFILER
-if (decomp_profiler_fft) call decomp_profiler_start("fft_c2r")
+      if (decomp_profiler_fft) call decomp_profiler_start("fft_c2r")
 #endif
 
-    isign = DECOMP_2D_FFT_BACKWARD
+      isign = DECOMP_2D_FFT_BACKWARD
+
+      if (format == PHYSICAL_IN_X) then
 
-    if (format==PHYSICAL_IN_X) then
+         ! ===== 1D FFTs in Z =====
+#ifdef OVERWRITE
+         status = wrapper_c2c_inplace(c2c_z2, in_c, isign)
+#else
+         allocate (wk1(sp%zsz(1), sp%zsz(2), sp%zsz(3)))
+         !       status = DftiComputeBackward(c2c_z2, in_c(:,1,1), wk1(:,1,1))
+         status = wrapper_c2c(c2c_z2, in_c, wk1, isign)
+#endif
+         if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "wrapper_c2c")
 
-       ! ===== 1D FFTs in Z =====
-       allocate (wk1(sp%zsz(1),sp%zsz(2),sp%zsz(3)))
-       !       status = DftiComputeBackward(c2c_z2, in_c(:,1,1), wk1(:,1,1))
-       status = wrapper_c2c(c2c_z2, in_c, wk1, isign)
-       if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "wrapper_c2c")
+         ! ===== Swap Z --> Y =====
+#ifdef OVERWRITE
+         call transpose_z_to_y(in_c, wk2_r2c, sp)
+#else
+         call transpose_z_to_y(wk1, wk2_r2c, sp)
+#endif
 
-       ! ===== Swap Z --> Y =====
-       allocate (wk2(sp%ysz(1),sp%ysz(2),sp%ysz(3)))
-       call transpose_z_to_y(wk1,wk2,sp)
+         ! ===== 1D FFTs in Y =====
+#ifdef OVERWRITE
+         do k = 1, sp%ysz(3)
+            status = wrapper_c2c_inplace(c2c_y2, wk2_r2c(:, :, k), isign)
+            if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "wrapper_c2c")
+         end do
+#else
+         allocate (wk2b(sp%ysz(1), sp%ysz(2), sp%ysz(3)))
+         do k = 1, sp%ysz(3)
+            !          status = DftiComputeBackward(c2c_y2, wk2(:,1,k), wk2b(:,1,k))
+            status = wrapper_c2c(c2c_y2, wk2_r2c(:, :, k), wk2b(1, 1, k), isign)
+            if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "wrapper_c2c")
+         end do
+#endif
 
-       ! ===== 1D FFTs in Y =====
-       allocate (wk2b(sp%ysz(1),sp%ysz(2),sp%ysz(3)))
-       do k=1,sp%ysz(3)
-          !          status = DftiComputeBackward(c2c_y2, wk2(:,1,k), wk2b(:,1,k))
-          status = wrapper_c2c(c2c_y2, wk2(1,1,k), wk2b(1,1,k), isign)
-          if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "wrapper_c2c")
-       end do
+         ! ===== Swap Y --> X =====
+#ifdef OVERWRITE
+         call transpose_y_to_x(wk2_r2c, wk13, sp)
+#else
+         call transpose_y_to_x(wk2b, wk13, sp)
+#endif
 
-       ! ===== Swap Y --> X =====
-       allocate (wk3(sp%xsz(1),sp%xsz(2),sp%xsz(3)))
-       call transpose_y_to_x(wk2b,wk3,sp)
+         ! ===== 1D FFTs in X =====
+         !       status = DftiComputeBackward(c2r_x, wk3(:,1,1), out_r(:,1,1))
+         status = wrapper_c2r(c2r_x, wk13, out_r)
+         if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "wrapper_c2r")
 
-       ! ===== 1D FFTs in X =====
-       !       status = DftiComputeBackward(c2r_x, wk3(:,1,1), out_r(:,1,1))
-       status =  wrapper_c2r(c2r_x, wk3, out_r)
-       if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "wrapper_c2r")
+      else if (format == PHYSICAL_IN_Z) then
 
-    else if (format==PHYSICAL_IN_Z) then
+         ! ===== 1D FFTs in X =====
+#ifdef OVERWRITE
+         status = wrapper_c2c_inplace(c2c_x2, in_c, isign)
+#else
+         allocate (wk1(sp%xsz(1), sp%xsz(2), sp%xsz(3)))
+         !       status = DftiComputeBackward(c2c_x2, in_c(:,1,1), wk1(:,1,1))
+         status = wrapper_c2c(c2c_x2, in_c, wk1, isign)
+#endif
+         if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "wrapper_c2c")
 
-       ! ===== 1D FFTs in X =====
-       allocate(wk1(sp%xsz(1),sp%xsz(2),sp%xsz(3)))
-       !       status = DftiComputeBackward(c2c_x2, in_c(:,1,1), wk1(:,1,1))
-       status = wrapper_c2c(c2c_x2, in_c, wk1, isign)
-       if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "wrapper_c2c")
+         ! ===== Swap X --> Y =====
+#ifdef OVERWRITE
+         call transpose_x_to_y(in_c, wk2_r2c, sp)
+#else
+         call transpose_x_to_y(wk1, wk2_r2c, sp)
+#endif
 
-       ! ===== Swap X --> Y =====
-       allocate (wk2(sp%ysz(1),sp%ysz(2),sp%ysz(3)))
-       call transpose_x_to_y(wk1,wk2,sp)
+         ! ===== 1D FFTs in Y =====
+#ifdef OVERWRITE
+         do k = 1, sp%ysz(3)
+            status = wrapper_c2c_inplace(c2c_y2, wk2_r2c(:, :, k), isign)
+            if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "wrapper_c2c")
+         end do
+#else
+         allocate (wk2b(sp%ysz(1), sp%ysz(2), sp%ysz(3)))
+         do k = 1, sp%ysz(3)
+            !          status = DftiComputeBackward(c2c_y2, wk2(:,1,k), wk2b(:,1,k))
+            status = wrapper_c2c(c2c_y2, wk2_r2c(:, :, k), wk2b(1, 1, k), isign)
+            if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "wrapper_c2c")
+         end do
+#endif
 
-       ! ===== 1D FFTs in Y =====
-       allocate (wk2b(sp%ysz(1),sp%ysz(2),sp%ysz(3)))
-       do k=1,sp%ysz(3)
-          !          status = DftiComputeBackward(c2c_y2, wk2(:,1,k), wk2b(:,1,k))
-          status = wrapper_c2c(c2c_y2, wk2(1,1,k), wk2b(1,1,k), isign)
-          if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "wrapper_c2c")
-       end do
+         ! ===== Swap Y --> Z =====
+#ifdef OVERWRITE
+         call transpose_y_to_z(wk2_r2c, wk13, sp)
+#else
+         call transpose_y_to_z(wk2b, wk13, sp)
+#endif
 
-       ! ===== Swap Y --> Z =====
-       allocate (wk3(sp%zsz(1),sp%zsz(2),sp%zsz(3)))
-       call transpose_y_to_z(wk2b,wk3,sp)
+         ! ===== 1D FFTs in Z =====
+         !       status = DftiComputeBackward(c2r_z, wk3(:,1,1), out_r(:,1,1))
+         status = wrapper_c2r(c2r_z, wk13, out_r)
+         if (status /= 0) call decomp_2d_abort(__FILE__, __LINE__, status, "wrapper_c2r")
 
-       ! ===== 1D FFTs in Z =====
-       !       status = DftiComputeBackward(c2r_z, wk3(:,1,1), out_r(:,1,1))
-       status =  wrapper_c2r(c2r_z, wk3, out_r)
-       if (status/=0) call decomp_2d_abort(__FILE__, __LINE__, status, "wrapper_c2r")
+      end if
 
-    end if
+      ! Free memory
+#ifndef OVERWRITE
+      deallocate (wk1, wk2b)
+#endif
 
 #ifdef PROFILER
-if (decomp_profiler_fft) call decomp_profiler_end("fft_c2r")
+      if (decomp_profiler_fft) call decomp_profiler_end("fft_c2r")
 #endif
 
-    return
-  end subroutine fft_3d_c2r
-
+      return
+   end subroutine fft_3d_c2r
 
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Wrapper functions so that one can pass 3D arrays to DftiCompute
-  !  -- MKL accepts only 1D arrays as input/output for its multi-
-  !     dimensional FFTs.
-  !  -- Using EQUIVALENCE as suggested by MKL documents is impossible 
-  !     for allocated arrays, not to mention bad coding style
-  !  -- All code commented out above may well work but not safe. There 
-  !     is no guarantee that compiler wouldn't make copies of 1D arrays
-  !     (which would contain only one slice of the original 3D data) 
-  !     rather than referring to the same memory address, i.e. 3D array
-  !     A and 1D array A(:,1,1) may refer to different memory location.
-  !  -- Using the following wrappers is safe and standard conforming.
+   ! Wrapper functions so that one can pass 3D arrays to DftiCompute
+   !  -- MKL accepts only 1D arrays as input/output for its multi-
+   !     dimensional FFTs.
+   !  -- Using EQUIVALENCE as suggested by MKL documents is impossible
+   !     for allocated arrays, not to mention bad coding style
+   !  -- All code commented out above may well work but not safe. There
+   !     is no guarantee that compiler wouldn't make copies of 1D arrays
+   !     (which would contain only one slice of the original 3D data)
+   !     rather than referring to the same memory address, i.e. 3D array
+   !     A and 1D array A(:,1,1) may refer to different memory location.
+   !  -- Using the following wrappers is safe and standard conforming.
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 
-  integer function wrapper_c2c(desc, in, out, isign)
+   integer function wrapper_c2c(desc, in, out, isign)
 
-    implicit none
+      implicit none
 
-    type(DFTI_DESCRIPTOR), pointer :: desc
-    complex(mytype), dimension(*) :: in, out
-    integer :: isign, status
+      type(DFTI_DESCRIPTOR), pointer :: desc
+      complex(mytype), dimension(*) :: in, out
+      integer :: isign, status
 
-    if (isign == DECOMP_2D_FFT_FORWARD) then
-       status = DftiComputeForward(desc, in, out)
-    else if (isign == DECOMP_2D_FFT_BACKWARD) then
-       status = DftiComputeBackward(desc, in, out)
-    end if
+      if (isign == DECOMP_2D_FFT_FORWARD) then
+         status = DftiComputeForward(desc, in, out)
+      else if (isign == DECOMP_2D_FFT_BACKWARD) then
+         status = DftiComputeBackward(desc, in, out)
+      end if
 
-    wrapper_c2c = status
+      wrapper_c2c = status
 
-    return
-  end function wrapper_c2c
+      return
+   end function wrapper_c2c
+
+#ifdef OVERWRITE
+   integer function wrapper_c2c_inplace(desc, inout, isign)
+
+      implicit none
+
+      type(DFTI_DESCRIPTOR), pointer :: desc
+      complex(mytype), dimension(*) :: inout
+      integer :: isign, status
+
+      if (isign == DECOMP_2D_FFT_FORWARD) then
+         status = DftiComputeForward(desc, inout)
+      else if (isign == DECOMP_2D_FFT_BACKWARD) then
+         status = DftiComputeBackward(desc, inout)
+      end if
+
+      wrapper_c2c_inplace = status
+
+      return
+   end function wrapper_c2c_inplace
+#endif
 
-  integer function wrapper_r2c(desc, in, out)
+   integer function wrapper_r2c(desc, in, out)
 
-    implicit none
+      implicit none
 
-    type(DFTI_DESCRIPTOR), pointer :: desc
-    real(mytype), dimension(*) :: in 
-    complex(mytype), dimension(*) :: out
+      type(DFTI_DESCRIPTOR), pointer :: desc
+      real(mytype), dimension(*) :: in
+      complex(mytype), dimension(*) :: out
 
-    wrapper_r2c = DftiComputeForward(desc, in, out)
+      wrapper_r2c = DftiComputeForward(desc, in, out)
 
-    return
-  end function wrapper_r2c
+      return
+   end function wrapper_r2c
 
-  integer function wrapper_c2r(desc, in, out)
+   integer function wrapper_c2r(desc, in, out)
 
-    implicit none
+      implicit none
 
-    type(DFTI_DESCRIPTOR), pointer :: desc
-    complex(mytype), dimension(*) :: in 
-    real(mytype), dimension(*) :: out
+      type(DFTI_DESCRIPTOR), pointer :: desc
+      complex(mytype), dimension(*) :: in
+      real(mytype), dimension(*) :: out
 
-    wrapper_c2r = DftiComputeBackward(desc, in, out)
+      wrapper_c2r = DftiComputeBackward(desc, in, out)
 
-    return
-  end function wrapper_c2r
+      return
+   end function wrapper_c2r
 
 end module decomp_2d_fft
diff --git a/src/glassman.f90 b/src/glassman.f90
index 89fcf41e..8fca08fe 100644
--- a/src/glassman.f90
+++ b/src/glassman.f90
@@ -1,192 +1,180 @@
-!=======================================================================
-! This is part of the 2DECOMP&FFT library
-! 
-! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
-! decomposition. It also implements a highly scalable distributed
-! three-dimensional Fast Fourier Transform (FFT).
-!
-! Copyright (C) 2009-2011 Ning Li, the Numerical Algorithms Group (NAG)
-!
-!=======================================================================
-
-! This module contains a few 'generic' FFT routines, making the 
+!! SPDX-License-Identifier: BSD-3-Clause
+
+! This module contains a few 'generic' FFT routines, making the
 ! 2DECOMP&FFT library not dependent on any external libraries
 
 module glassman
 
-  use decomp_2d, only : mytype
+   use decomp_2d_constants, only: mytype
 
-  implicit none
+   implicit none
 
 contains
 
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Following is a FFT implementation based on algorithm proposed by
-  ! Glassman, a general FFT algorithm supporting arbitrary input length.
-  !
-  ! W. E. Ferguson, Jr., "A simple derivation of Glassman general-n fast
-  ! Fourier transform," Comput. and Math. with Appls., vol. 8, no. 6, pp.
-  ! 401-411, 1982.
-  !
-  ! Original implemtation online at http://www.jjj.de/fft/fftpage.html
-  !
-  ! Updated  
-  !  -  to handle double-precision as well
-  !  -  unnecessary scaling code removed
+   ! Following is a FFT implementation based on algorithm proposed by
+   ! Glassman, a general FFT algorithm supporting arbitrary input length.
+   !
+   ! W. E. Ferguson, Jr., "A simple derivation of Glassman general-n fast
+   ! Fourier transform," Comput. and Math. with Appls., vol. 8, no. 6, pp.
+   ! 401-411, 1982.
+   !
+   ! Original implemtation online at http://www.jjj.de/fft/fftpage.html
+   !
+   ! Updated
+   !  -  to handle double-precision as well
+   !  -  unnecessary scaling code removed
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 
-  PURE SUBROUTINE SPCFFT(U,N,ISIGN,WORK)
-
-    !$acc routine seq
-
-    IMPLICIT NONE
-
-    LOGICAL :: INU
-    INTEGER :: A,B,C,I
-    INTEGER, INTENT(IN) :: N, ISIGN
-    COMPLEX(mytype), INTENT(INOUT) :: U(*),WORK(*)
-
-    A = 1
-    B = N
-    C = 1
-    INU = .TRUE.
-
-    DO WHILE ( B .GT. 1 )
-       A = C * A
-       C = 2
-       DO WHILE ( MOD(B,C) .NE. 0 )
-          C = C + 1
-       END DO
-       B = B / C
-       IF ( INU ) THEN
-          CALL SPCPFT (A,B,C,U,WORK,ISIGN)
-       ELSE
-          CALL SPCPFT (A,B,C,WORK,U,ISIGN)
-       END IF
-       INU = ( .NOT. INU )
-    END DO
-
-    IF ( .NOT. INU ) THEN
-       DO I = 1, N
-          U(I) = WORK(I)
-       END DO
-    END IF
-
-    RETURN
-  END SUBROUTINE SPCFFT
-
-
-  PURE SUBROUTINE SPCPFT( A, B, C, UIN, UOUT, ISIGN )
-
-    !$acc routine seq
-
-    IMPLICIT NONE
-
-    INTEGER, INTENT(IN), VALUE :: A, B, C, ISIGN
-    INTEGER :: IA,IB,IC,JCR,JC
-
-    DOUBLE PRECISION :: ANGLE
-
-    COMPLEX(mytype), INTENT(IN) :: UIN(B,C,A)
-    COMPLEX(mytype), INTENT(OUT) :: UOUT(B,A,C)
-    COMPLEX(mytype) :: DELTA,OMEGA,SUM
-
-    ANGLE = 6.28318530717958_mytype / REAL( A * C, kind=mytype )
-    OMEGA = CMPLX( 1.0, 0.0, kind=mytype )
-
-    IF( ISIGN .EQ. 1 ) THEN
-       DELTA = CMPLX( COS(ANGLE), SIN(ANGLE), kind=mytype )
-    ELSE
-       DELTA = CMPLX( COS(ANGLE), -SIN(ANGLE), kind=mytype )
-    END IF
-
-    DO IC = 1, C
-       DO IA = 1, A
-          DO IB = 1, B
-             SUM = UIN( IB, C, IA )
-             DO JCR = 2, C
-                JC = C + 1 - JCR
-                SUM = UIN( IB, JC, IA ) + OMEGA * SUM
-             END DO
-             UOUT( IB, IA, IC ) = SUM
-          END DO
-          OMEGA = DELTA * OMEGA
-       END DO
-    END DO
-
-    RETURN
-  END SUBROUTINE SPCPFT
-
+   PURE SUBROUTINE SPCFFT(U, N, ISIGN, WORK)
+
+      !$acc routine seq
+
+      IMPLICIT NONE
+
+      LOGICAL :: INU
+      INTEGER :: A, B, C, I
+      INTEGER, INTENT(IN) :: N, ISIGN
+      COMPLEX(mytype), INTENT(INOUT) :: U(*), WORK(*)
+
+      A = 1
+      B = N
+      C = 1
+      INU = .TRUE.
+
+      DO WHILE (B > 1)
+         A = C * A
+         C = 2
+         DO WHILE (MOD(B, C) /= 0)
+            C = C + 1
+         END DO
+         B = B / C
+         IF (INU) THEN
+            CALL SPCPFT(A, B, C, U, WORK, ISIGN)
+         ELSE
+            CALL SPCPFT(A, B, C, WORK, U, ISIGN)
+         END IF
+         INU = (.NOT. INU)
+      END DO
+
+      IF (.NOT. INU) THEN
+         DO I = 1, N
+            U(I) = WORK(I)
+         END DO
+      END IF
+
+      RETURN
+   END SUBROUTINE SPCFFT
+
+   PURE SUBROUTINE SPCPFT(A, B, C, UIN, UOUT, ISIGN)
+
+      !$acc routine seq
+
+      IMPLICIT NONE
+
+      INTEGER, INTENT(IN), VALUE :: A, B, C, ISIGN
+      INTEGER :: IA, IB, IC, JCR, JC
+
+      DOUBLE PRECISION :: ANGLE
+
+      COMPLEX(mytype), INTENT(IN) :: UIN(B, C, A)
+      COMPLEX(mytype), INTENT(OUT) :: UOUT(B, A, C)
+      COMPLEX(mytype) :: DELTA, OMEGA, SUM
+
+      ANGLE = 6.28318530717958_mytype / REAL(A * C, kind=mytype)
+      OMEGA = CMPLX(1.0, 0.0, kind=mytype)
+
+      IF (ISIGN == 1) THEN
+         DELTA = CMPLX(COS(ANGLE), SIN(ANGLE), kind=mytype)
+      ELSE
+         DELTA = CMPLX(COS(ANGLE), -SIN(ANGLE), kind=mytype)
+      END IF
+
+      DO IC = 1, C
+         DO IA = 1, A
+            DO IB = 1, B
+               SUM = UIN(IB, C, IA)
+               DO JCR = 2, C
+                  JC = C + 1 - JCR
+                  SUM = UIN(IB, JC, IA) + OMEGA * SUM
+               END DO
+               UOUT(IB, IA, IC) = SUM
+            END DO
+            OMEGA = DELTA * OMEGA
+         END DO
+      END DO
+
+      RETURN
+   END SUBROUTINE SPCPFT
 
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! A 3D real-to-complex routine implemented using the 1D FFT above
-  !   Input:   nx*ny*nz real numbers
-  !   Output:  (nx/2+1)*ny*nz complex numbers
-  ! Just like big FFT libraries (such as FFTW) do
+   ! A 3D real-to-complex routine implemented using the 1D FFT above
+   !   Input:   nx*ny*nz real numbers
+   !   Output:  (nx/2+1)*ny*nz complex numbers
+   ! Just like big FFT libraries (such as FFTW) do
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine glassman_3d_r2c(in_r,nx,ny,nz,out_c)
-
-    implicit none
-
-    integer, intent(IN) :: nx,ny,nz
-    real(mytype), dimension(nx,ny,nz) :: in_r
-    complex(mytype), dimension(nx/2+1,ny,nz) :: out_c
-
-    complex(mytype), allocatable, dimension(:) :: buf, scratch
-    integer :: maxsize, i,j,k
-
-    maxsize = max(nx, max(ny,nz))
-    allocate(buf(maxsize))
-    allocate(scratch(maxsize))
-
-    ! ===== 1D FFTs in X =====
-    do k=1,nz
-       do j=1,ny
-          ! Glassman's 1D FFT is c2c only, 
-          ! needing some pre- and post-processing for r2c
-          ! pack real input in complex storage
-          do i=1,nx
-             buf(i) = cmplx(in_r(i,j,k),0._mytype, kind=mytype)
-          end do
-          call spcfft(buf,nx,-1,scratch)
-          ! simply drop the redundant part of the complex output
-          do i=1,nx/2+1
-             out_c(i,j,k) = buf(i)
-          end do
-       end do
-    end do
-
-    ! ===== 1D FFTs in Y =====
-    do k=1,nz
-       do i=1,nx/2+1
-          do j=1,ny
-             buf(j) = out_c(i,j,k)
-          end do
-          call spcfft(buf,ny,-1,scratch)
-          do j=1,ny
-             out_c(i,j,k) = buf(j)
-          end do
-       end do
-    end do
-
-    ! ===== 1D FFTs in Z =====
-    do j=1,ny
-       do i=1,nx/2+1
-          do k=1,nz
-             buf(k) = out_c(i,j,k)
-          end do
-          call spcfft(buf,nz,-1,scratch)
-          do k=1,nz
-             out_c(i,j,k) = buf(k)
-          end do
-       end do
-    end do
-
-    deallocate(buf,scratch)
-
-    return
-  end subroutine glassman_3d_r2c
-
+   subroutine glassman_3d_r2c(in_r, nx, ny, nz, out_c)
+
+      implicit none
+
+      integer, intent(IN) :: nx, ny, nz
+      real(mytype), dimension(nx, ny, nz) :: in_r
+      complex(mytype), dimension(nx/2 + 1, ny, nz) :: out_c
+
+      complex(mytype), allocatable, dimension(:) :: buf, scratch
+      integer :: maxsize, i, j, k
+
+      maxsize = max(nx, max(ny, nz))
+      allocate (buf(maxsize))
+      allocate (scratch(maxsize))
+
+      ! ===== 1D FFTs in X =====
+      do k = 1, nz
+         do j = 1, ny
+            ! Glassman's 1D FFT is c2c only,
+            ! needing some pre- and post-processing for r2c
+            ! pack real input in complex storage
+            do i = 1, nx
+               buf(i) = cmplx(in_r(i, j, k), 0._mytype, kind=mytype)
+            end do
+            call spcfft(buf, nx, -1, scratch)
+            ! simply drop the redundant part of the complex output
+            do i = 1, nx / 2 + 1
+               out_c(i, j, k) = buf(i)
+            end do
+         end do
+      end do
+
+      ! ===== 1D FFTs in Y =====
+      do k = 1, nz
+         do i = 1, nx / 2 + 1
+            do j = 1, ny
+               buf(j) = out_c(i, j, k)
+            end do
+            call spcfft(buf, ny, -1, scratch)
+            do j = 1, ny
+               out_c(i, j, k) = buf(j)
+            end do
+         end do
+      end do
+
+      ! ===== 1D FFTs in Z =====
+      do j = 1, ny
+         do i = 1, nx / 2 + 1
+            do k = 1, nz
+               buf(k) = out_c(i, j, k)
+            end do
+            call spcfft(buf, nz, -1, scratch)
+            do k = 1, nz
+               out_c(i, j, k) = buf(k)
+            end do
+         end do
+      end do
+
+      deallocate (buf, scratch)
+
+      return
+   end subroutine glassman_3d_r2c
 
 end module glassman
 
diff --git a/src/halo.f90 b/src/halo.f90
index 6939f9bd..ccb98594 100644
--- a/src/halo.f90
+++ b/src/halo.f90
@@ -1,270 +1,352 @@
-!=======================================================================
-! This is part of the 2DECOMP&FFT library
-!
-! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil)
-! decomposition. It also implements a highly scalable distributed
-! three-dimensional Fast Fourier Transform (FFT).
-!
-! Copyright (C) 2009-2021 Ning Li, the Numerical Algorithms Group (NAG)
-!
-!=======================================================================
+!! SPDX-License-Identifier: BSD-3-Clause
 
   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
   ! Halo cell support for neighbouring pencils to exchange data
   !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
   subroutine update_halo_real_short(in, out, level, opt_global, opt_pencil)
 
-    implicit none
+     implicit none
 
-    integer, intent(IN) :: level      ! levels of halo cells required
-    real(mytype), dimension(:,:,:), intent(IN) :: in
-    real(mytype), allocatable, dimension(:,:,:), intent(OUT) :: out
-    logical, optional :: opt_global
-    integer, intent(in), optional :: opt_pencil
+     integer, intent(IN) :: level      ! levels of halo cells required
+     real(mytype), dimension(:, :, :), intent(IN) :: in
+     real(mytype), allocatable, dimension(:, :, :), intent(OUT) :: out
+#if defined(_GPU)
+     attributes(device) :: out
+#endif
+     logical, optional :: opt_global
+     integer, intent(in), optional :: opt_pencil
 
-    call update_halo(in, out, level, decomp_main, opt_global, opt_pencil)
+     call update_halo(in, out, level, decomp_main, opt_global, opt_pencil)
 
   end subroutine update_halo_real_short
 
   subroutine update_halo_real(in, out, level, decomp, opt_global, opt_pencil)
 
-    implicit none
+     implicit none
 
-    integer, intent(IN) :: level      ! levels of halo cells required
-    real(mytype), dimension(:,:,:), intent(IN) :: in
-    real(mytype), allocatable, dimension(:,:,:), intent(OUT) :: out
-    TYPE(DECOMP_INFO), intent(in) :: decomp
-    logical, optional :: opt_global
-    integer, intent(in), optional :: opt_pencil
+     integer, intent(IN) :: level      ! levels of halo cells required
+     real(mytype), dimension(:, :, :), intent(IN) :: in
+     real(mytype), allocatable, dimension(:, :, :), intent(OUT) :: out
+#if defined(_GPU)
+     attributes(device) :: out
+#endif
+     TYPE(DECOMP_INFO), intent(in) :: decomp
+     logical, optional :: opt_global
+     integer, intent(in), optional :: opt_pencil
 
-    logical :: global
+     logical :: global
 
-    ! starting/ending index of array with halo cells
-    integer :: xs, ys, zs, xe, ye, ze
+     ! starting/ending index of array with halo cells
+     integer :: xs, ys, zs, xe, ye, ze
+     ! additional start end
+     integer :: ist, ien, jst, jen, kst, ken
 
-    integer :: i, j, k, s1, s2, s3, ierror
-    integer :: data_type
+     integer :: i, j, k, s1, s2, s3
+     integer :: data_type
 
-    integer :: icount, ilength, ijump
-    integer :: halo12, halo21, halo31, halo32
-    integer, dimension(4) :: requests
-    integer, dimension(MPI_STATUS_SIZE,4) :: status
-    integer :: tag_e, tag_w, tag_n, tag_s, tag_t, tag_b
+     integer :: ipencil
+     logical, save :: first_call_x = .true., first_call_y = .true., first_call_z = .true.
 
-    integer :: ipencil
-    logical, save :: first_call_x = .true., first_call_y = .true., first_call_z = .true.
-
-    data_type = real_type
+     data_type = real_type
 
 #include "halo_common.f90"
 
-    return
+     return
   end subroutine update_halo_real
 
   subroutine update_halo_complex_short(in, out, level, opt_global, opt_pencil)
 
-    implicit none
+     implicit none
 
-    integer, intent(IN) :: level      ! levels of halo cells required
-    complex(mytype), dimension(:,:,:), intent(IN) :: in
-    complex(mytype), allocatable, dimension(:,:,:), intent(OUT) :: out
-    logical, optional :: opt_global
-    integer, intent(in), optional :: opt_pencil
+     integer, intent(IN) :: level      ! levels of halo cells required
+     complex(mytype), dimension(:, :, :), intent(IN) :: in
+     complex(mytype), allocatable, dimension(:, :, :), intent(OUT) :: out
+#if defined(_GPU)
+     attributes(device) :: out
+#endif
+     logical, optional :: opt_global
+     integer, intent(in), optional :: opt_pencil
 
-    call update_halo(in, out, level, decomp_main, opt_global, opt_pencil)
+     call update_halo(in, out, level, decomp_main, opt_global, opt_pencil)
 
   end subroutine update_halo_complex_short
 
   subroutine update_halo_complex(in, out, level, decomp, opt_global, opt_pencil)
 
-    implicit none
-
-    integer, intent(IN) :: level      ! levels of halo cells required
-    complex(mytype), dimension(:,:,:), intent(IN) :: in
-    complex(mytype), allocatable, dimension(:,:,:), intent(OUT) :: out
-    TYPE(DECOMP_INFO), intent(in) :: decomp
-    logical, optional :: opt_global
-    integer, intent(in), optional :: opt_pencil
+     implicit none
 
-    logical :: global
+     integer, intent(IN) :: level      ! levels of halo cells required
+     complex(mytype), dimension(:, :, :), intent(IN) :: in
+     complex(mytype), allocatable, dimension(:, :, :), intent(OUT) :: out
+#if defined(_GPU)
+     attributes(device) :: out
+#endif
+     TYPE(DECOMP_INFO), intent(in) :: decomp
+     logical, optional :: opt_global
+     integer, intent(in), optional :: opt_pencil
 
-    ! starting/ending index of array with halo cells
-    integer :: xs, ys, zs, xe, ye, ze
+     logical :: global
 
-    integer :: i, j, k, s1, s2, s3, ierror
-    integer :: data_type
+     ! starting/ending index of array with halo cells
+     integer :: xs, ys, zs, xe, ye, ze
+     ! additional start end
+     integer :: ist, ien, jst, jen, kst, ken
 
-    integer :: icount, ilength, ijump
-    integer :: halo12, halo21, halo31, halo32
-    integer, dimension(4) :: requests
-    integer, dimension(MPI_STATUS_SIZE,4) :: status
-    integer :: tag_e, tag_w, tag_n, tag_s, tag_t, tag_b
+     integer :: i, j, k, s1, s2, s3
+     integer :: data_type
 
-    integer :: ipencil
-    logical, save :: first_call_x = .true., first_call_y = .true., first_call_z = .true.
+     integer :: ipencil
+     logical, save :: first_call_x = .true., first_call_y = .true., first_call_z = .true.
 
-    data_type = complex_type
+     data_type = complex_type
 
 #include "halo_common.f90"
 
-    return
+     return
   end subroutine update_halo_complex
 
+  subroutine exchange_halo_x_real_short(inout, opt_xlevel)
+
+     implicit none
+
+     real(mytype), dimension(:, :, :), intent(INOUT) :: inout
+     integer, dimension(3), optional :: opt_xlevel
+#if defined(_GPU)
+     attributes(device) :: inout
+#endif
+
+     call exchange_halo_x_real(inout, decomp_main, opt_xlevel)
+
+  end subroutine exchange_halo_x_real_short
 
-  subroutine exchange_halo_x_real(inout, opt_decomp, opt_xlevel)
+  subroutine exchange_halo_x_real(inout, decomp, opt_xlevel)
 
-    implicit none
+     implicit none
 
-    real(mytype), dimension(:,:,:), intent(INOUT) :: inout
-    TYPE(DECOMP_INFO), optional :: opt_decomp
-    integer, dimension(3), optional :: opt_xlevel
+     real(mytype), dimension(:, :, :), intent(INOUT) :: inout
+     TYPE(DECOMP_INFO), intent(in) :: decomp
+     integer, dimension(3), optional :: opt_xlevel
+#if defined(_GPU)
+     attributes(device) :: inout
+#endif
 
-    TYPE(DECOMP_INFO) :: decomp
-    integer :: level_x, level_y, level_z
-    integer :: i, j, k, ierror
-    integer :: icount, ilength, ijump
-    integer :: halo12, halo21, halo31, halo32
-    integer, dimension(4) :: requests
-    integer, dimension(MPI_STATUS_SIZE,4) :: status
-    integer :: tag_e, tag_w, tag_n, tag_s, tag_t, tag_b
-    integer :: data_type
-    integer :: xs, ys, zs, ye, ze, s1, s2, s3
+     integer :: level_x, level_y, level_z
+     integer :: ierror
+     integer :: icount, ilength, ijump
+     integer :: halo12
+     integer, dimension(4) :: requests
+     integer, dimension(MPI_STATUS_SIZE, 4) :: status
+     integer :: tag_n, tag_s, tag_t, tag_b
+     integer :: data_type
+     integer :: xs, ys, zs, ye, ze, s1, s2, s3
 
-    data_type = real_type
+     data_type = real_type
 
 #include "halo_comm_x.f90"
 
-    return
+     return
   end subroutine exchange_halo_x_real
 
+  subroutine exchange_halo_x_complex_short(inout, opt_xlevel)
 
-  subroutine exchange_halo_x_complex(inout, opt_decomp, opt_xlevel)
+     implicit none
 
-    implicit none
+     complex(mytype), dimension(:, :, :), intent(INOUT) :: inout
+     integer, dimension(3), optional :: opt_xlevel
+#if defined(_GPU)
+     attributes(device) :: inout
+#endif
 
-    complex(mytype), dimension(:,:,:), intent(INOUT) :: inout
-    TYPE(DECOMP_INFO), optional :: opt_decomp
-    integer, dimension(3), optional :: opt_xlevel
+     call exchange_halo_x_complex(inout, decomp_main, opt_xlevel)
 
-    TYPE(DECOMP_INFO) :: decomp
-    integer :: level_x, level_y, level_z
-    integer :: i, j, k, ierror
-    integer :: icount, ilength, ijump
-    integer :: halo12, halo21, halo31, halo32
-    integer, dimension(4) :: requests
-    integer, dimension(MPI_STATUS_SIZE,4) :: status
-    integer :: tag_e, tag_w, tag_n, tag_s, tag_t, tag_b
-    integer :: data_type
-    integer :: xs, ys, zs, ye, ze, s1, s2, s3
+  end subroutine exchange_halo_x_complex_short
 
-    data_type = complex_type
+  subroutine exchange_halo_x_complex(inout, decomp, opt_xlevel)
+
+     implicit none
+
+     complex(mytype), dimension(:, :, :), intent(INOUT) :: inout
+     TYPE(DECOMP_INFO), intent(in) :: decomp
+     integer, dimension(3), optional :: opt_xlevel
+#if defined(_GPU)
+     attributes(device) :: inout
+#endif
+
+     integer :: level_x, level_y, level_z
+     integer :: ierror
+     integer :: icount, ilength, ijump
+     integer :: halo12
+     integer, dimension(4) :: requests
+     integer, dimension(MPI_STATUS_SIZE, 4) :: status
+     integer :: tag_n, tag_s, tag_t, tag_b
+     integer :: data_type
+     integer :: xs, ys, zs, ye, ze, s1, s2, s3
+
+     data_type = complex_type
 
 #include "halo_comm_x.f90"
 
-    return
+     return
   end subroutine exchange_halo_x_complex
 
+  subroutine exchange_halo_y_real_short(inout, opt_ylevel)
+
+     implicit none
+
+     real(mytype), dimension(:, :, :), intent(INOUT) :: inout
+     integer, dimension(3), optional :: opt_ylevel
+#if defined(_GPU)
+     attributes(device) :: inout
+#endif
 
-  subroutine exchange_halo_y_real(inout, opt_decomp, opt_ylevel)
+     call exchange_halo_y_real(inout, decomp_main, opt_ylevel)
 
-    implicit none
+  end subroutine exchange_halo_y_real_short
 
-    real(mytype), dimension(:,:,:), intent(INOUT) :: inout
-    TYPE(DECOMP_INFO), optional :: opt_decomp
-    integer, dimension(3), optional :: opt_ylevel
+  subroutine exchange_halo_y_real(inout, decomp, opt_ylevel)
 
-    TYPE(DECOMP_INFO) :: decomp
-    integer :: level_x, level_y, level_z
-    integer :: i, j, k, ierror
-    integer :: icount, ilength, ijump
-    integer :: halo12, halo21, halo31, halo32
-    integer, dimension(4) :: requests
-    integer, dimension(MPI_STATUS_SIZE,4) :: status
-    integer :: tag_e, tag_w, tag_n, tag_s, tag_t, tag_b
-    integer :: data_type
-    integer :: xs, ys, zs, xe, ze, s1, s2, s3
+     implicit none
 
-    data_type = real_type
+     real(mytype), dimension(:, :, :), intent(INOUT) :: inout
+     TYPE(DECOMP_INFO), intent(in) :: decomp
+     integer, dimension(3), optional :: opt_ylevel
+#if defined(_GPU)
+     attributes(device) :: inout
+#endif
+
+     integer :: level_x, level_y, level_z
+     integer :: ierror
+     integer :: icount, ilength, ijump
+     integer :: halo21
+     integer, dimension(4) :: requests
+     integer, dimension(MPI_STATUS_SIZE, 4) :: status
+     integer :: tag_e, tag_w, tag_t, tag_b
+     integer :: data_type
+     integer :: xs, ys, zs, xe, ze, s1, s2, s3
+
+     data_type = real_type
 
 #include "halo_comm_y.f90"
 
-    return
+     return
   end subroutine exchange_halo_y_real
 
+  subroutine exchange_halo_y_complex_short(inout, opt_ylevel)
 
-  subroutine exchange_halo_y_complex(inout, opt_decomp, opt_ylevel)
+     implicit none
 
-    implicit none
+     complex(mytype), dimension(:, :, :), intent(INOUT) :: inout
+     integer, dimension(3), optional :: opt_ylevel
+#if defined(_GPU)
+     attributes(device) :: inout
+#endif
 
-    complex(mytype), dimension(:,:,:), intent(INOUT) :: inout
-    TYPE(DECOMP_INFO), optional :: opt_decomp
-    integer, dimension(3), optional :: opt_ylevel
+     call exchange_halo_y_complex(inout, decomp_main, opt_ylevel)
 
-    TYPE(DECOMP_INFO) :: decomp
-    integer :: level_x, level_y, level_z
-    integer :: i, j, k, ierror
-    integer :: icount, ilength, ijump
-    integer :: halo12, halo21, halo31, halo32
-    integer, dimension(4) :: requests
-    integer, dimension(MPI_STATUS_SIZE,4) :: status
-    integer :: tag_e, tag_w, tag_n, tag_s, tag_t, tag_b
-    integer :: data_type
-    integer :: xs, ys, zs, xe, ze, s1, s2, s3
+  end subroutine exchange_halo_y_complex_short
 
-    data_type = complex_type
+  subroutine exchange_halo_y_complex(inout, decomp, opt_ylevel)
 
-#include "halo_comm_y.f90"
+     implicit none
 
-    return
-  end subroutine exchange_halo_y_complex
+     complex(mytype), dimension(:, :, :), intent(INOUT) :: inout
+     TYPE(DECOMP_INFO), intent(in) :: decomp
+     integer, dimension(3), optional :: opt_ylevel
+#if defined(_GPU)
+     attributes(device) :: inout
+#endif
 
+     integer :: level_x, level_y, level_z
+     integer :: ierror
+     integer :: icount, ilength, ijump
+     integer :: halo21
+     integer, dimension(4) :: requests
+     integer, dimension(MPI_STATUS_SIZE, 4) :: status
+     integer :: tag_e, tag_w, tag_t, tag_b
+     integer :: data_type
+     integer :: xs, ys, zs, xe, ze, s1, s2, s3
 
-  subroutine exchange_halo_z_real(inout, opt_decomp, opt_zlevel)
-    implicit none
-    real(mytype), dimension(:,:,:), intent(INOUT) :: inout
-    TYPE(DECOMP_INFO), optional :: opt_decomp
-    integer, dimension(3), optional :: opt_zlevel
+     data_type = complex_type
 
-    TYPE(DECOMP_INFO) :: decomp
-    integer :: level_x, level_y, level_z
-    integer :: i, j, k, ierror
-    integer :: icount, ilength, ijump
-    integer :: halo12, halo21, halo31, halo32
-    integer, dimension(4) :: requests
-    integer, dimension(MPI_STATUS_SIZE,4) :: status
-    integer :: tag_e, tag_w, tag_n, tag_s, tag_t, tag_b
-    integer :: data_type
-    integer :: xs, ys, zs, xe, ye, s1, s2, s3
+#include "halo_comm_y.f90"
 
-    data_type = real_type
+     return
+  end subroutine exchange_halo_y_complex
+
+  subroutine exchange_halo_z_real_short(inout, opt_zlevel)
+     implicit none
+     real(mytype), dimension(:, :, :), intent(INOUT) :: inout
+     integer, dimension(3), optional :: opt_zlevel
+#if defined(_GPU)
+     attributes(device) :: inout
+#endif
+
+     call exchange_halo_z_real(inout, decomp_main, opt_zlevel)
+  end subroutine exchange_halo_z_real_short
+
+  subroutine exchange_halo_z_real(inout, decomp, opt_zlevel)
+     implicit none
+     real(mytype), dimension(:, :, :), intent(INOUT) :: inout
+     TYPE(DECOMP_INFO), intent(in) :: decomp
+     integer, dimension(3), optional :: opt_zlevel
+#if defined(_GPU)
+     attributes(device) :: inout
+#endif
+
+     integer :: level_x, level_y, level_z
+     integer :: ierror
+     integer :: icount, ilength, ijump
+     integer :: halo31, halo32
+     integer, dimension(4) :: requests
+     integer, dimension(MPI_STATUS_SIZE, 4) :: status
+     integer :: tag_e, tag_w, tag_n, tag_s
+     integer :: data_type
+     integer :: xs, ys, zs, xe, ye, s1, s2, s3
+
+     data_type = real_type
 
 #include "halo_comm_z.f90"
 
-    return
+     return
   end subroutine exchange_halo_z_real
 
-  subroutine exchange_halo_z_complex(inout, opt_decomp, opt_zlevel)
+  subroutine exchange_halo_z_complex_short(inout, opt_zlevel)
+
+     implicit none
+
+     complex(mytype), dimension(:, :, :), intent(INOUT) :: inout
+     integer, dimension(3), optional :: opt_zlevel
+#if defined(_GPU)
+     attributes(device) :: inout
+#endif
+
+     call exchange_halo_z_complex(inout, decomp_main, opt_zlevel)
+  end subroutine exchange_halo_z_complex_short
+
+  subroutine exchange_halo_z_complex(inout, decomp, opt_zlevel)
 
-    implicit none
+     implicit none
 
-    complex(mytype), dimension(:,:,:), intent(INOUT) :: inout
-    TYPE(DECOMP_INFO), optional :: opt_decomp
-    integer, dimension(3), optional :: opt_zlevel
+     complex(mytype), dimension(:, :, :), intent(INOUT) :: inout
+     TYPE(DECOMP_INFO), intent(in) :: decomp
+     integer, dimension(3), optional :: opt_zlevel
+#if defined(_GPU)
+     attributes(device) :: inout
+#endif
 
-    TYPE(DECOMP_INFO) :: decomp
-    integer :: level_x, level_y, level_z
-    integer :: i, j, k, ierror
-    integer :: icount, ilength, ijump
-    integer :: halo12, halo21, halo31, halo32
-    integer, dimension(4) :: requests
-    integer, dimension(MPI_STATUS_SIZE,4) :: status
-    integer :: tag_e, tag_w, tag_n, tag_s, tag_t, tag_b
-    integer :: data_type
-    integer :: xs, ys, zs, xe, ye, s1, s2, s3
+     integer :: level_x, level_y, level_z
+     integer :: ierror
+     integer :: icount, ilength, ijump
+     integer :: halo31, halo32
+     integer, dimension(4) :: requests
+     integer, dimension(MPI_STATUS_SIZE, 4) :: status
+     integer :: tag_e, tag_w, tag_n, tag_s
+     integer :: data_type
+     integer :: xs, ys, zs, xe, ye, s1, s2, s3
 
-    data_type = complex_type
+     data_type = complex_type
 
 #include "halo_comm_z.f90"
 
-    return
+     return
   end subroutine exchange_halo_z_complex
diff --git a/src/halo_comm_x.f90 b/src/halo_comm_x.f90
index b2d5d568..0ba475f1 100644
--- a/src/halo_comm_x.f90
+++ b/src/halo_comm_x.f90
@@ -12,20 +12,14 @@
 ! This file contain common code to be included by subroutines
 ! 'exchange_halo_x_...' in halo.f90
 
-    if (present(opt_decomp)) then
-      decomp = opt_decomp
-    else
-      decomp = decomp_main
-    end if
-
     if (present(opt_xlevel)) then
-      level_x = opt_xlevel(1)
-      level_y = opt_xlevel(2)
-      level_z = opt_xlevel(3)
+       level_x = opt_xlevel(1)
+       level_y = opt_xlevel(2)
+       level_z = opt_xlevel(3)
     else
-      level_x = decomp%xlevel(1)
-      level_y = decomp%xlevel(2)
-      level_z = decomp%xlevel(3)
+       level_x = decomp%xlevel(1)
+       level_y = decomp%xlevel(2)
+       level_z = decomp%xlevel(3)
     end if
 
     s1 = decomp%xsz(1)
@@ -34,9 +28,9 @@
 
     xs = 1 + level_x
     ys = 1
-    ye = s2 + 2*level_y
+    ye = s2 + 2 * level_y
     zs = 1
-    ze = s3 + 2*level_z
+    ze = s3 + 2 * level_z
 
     ! if (decomp%halos_for_pencil) then
     !   ! don't communicate lower halo (west boundary)
@@ -47,14 +41,14 @@
     ! end if
 
 #ifdef HALO_DEBUG
-    if (nrank==4) then
-      write(*,*) 'X-pencil input'
-      write(*,*) '=============='
-      write(*,*) 'Data on a y-z plane is shown'
-      write(*,*) 'Before halo exchange'
-      do j=ye,ys,-1
-        write(*,'(10F4.0)') (inout(1,j,k),k=zs,ze)
-      end do
+    if (nrank == 4) then
+       write (*, *) 'X-pencil input'
+       write (*, *) '=============='
+       write (*, *) 'Data on a y-z plane is shown'
+       write (*, *) 'Before halo exchange'
+       do j = ye, ys, -1
+          write (*, '(10F4.0)') (inout(1, j, k), k=zs, ze)
+       end do
     end if
 #endif
 
@@ -63,49 +57,49 @@
 
     ! *** north/south ***
     tag_s = coord(1)
-    if (coord(1)==dims(1)-1 .AND. periodic_y) then
-      tag_n = 0
+    if (coord(1) == dims(1) - 1 .AND. periodic_y) then
+       tag_n = 0
     else
-      tag_n = coord(1) + 1
+       tag_n = coord(1) + 1
     end if
-    icount = s3 + 2*level_z
+    icount = s3 + 2 * level_z
     ilength = level_y * s1
-    ijump = s1 * (s2 + 2*level_y)
+    ijump = s1 * (s2 + 2 * level_y)
     call MPI_TYPE_VECTOR(icount, ilength, ijump, &
-      data_type, halo12, ierror)
+                         data_type, halo12, ierror)
     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_VECTOR")
     call MPI_TYPE_COMMIT(halo12, ierror)
     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_COMMIT")
     ! receive from south
-    call MPI_IRECV(inout(xs,ys,zs), 1, halo12, &
-      neighbour(1,4), tag_s, DECOMP_2D_COMM_CART_X, &
-      requests(1), ierror)
+    call MPI_IRECV(inout(xs, ys, zs), 1, halo12, &
+                   neighbour(1, 4), tag_s, DECOMP_2D_COMM_CART_X, &
+                   requests(1), ierror)
     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_IRECV")
     ! receive from north
-    call MPI_IRECV(inout(xs,ye-level_y+1,zs), 1, halo12, &
-      neighbour(1,3), tag_n, DECOMP_2D_COMM_CART_X, &
-      requests(2), ierror)
+    call MPI_IRECV(inout(xs, ye - level_y + 1, zs), 1, halo12, &
+                   neighbour(1, 3), tag_n, DECOMP_2D_COMM_CART_X, &
+                   requests(2), ierror)
     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_IRECV")
     ! send to south
-    call MPI_ISSEND(inout(xs,ys+level_y,zs), 1, halo12, &
-      neighbour(1,4), tag_s, DECOMP_2D_COMM_CART_X, &
-      requests(3), ierror)
+    call MPI_ISSEND(inout(xs, ys + level_y, zs), 1, halo12, &
+                    neighbour(1, 4), tag_s, DECOMP_2D_COMM_CART_X, &
+                    requests(3), ierror)
     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ISSEND")
     ! send to north
-    call MPI_ISSEND(inout(xs,ye-level_y-level_y+1,zs), 1, halo12, &
-      neighbour(1,3), tag_n, DECOMP_2D_COMM_CART_X, &
-      requests(4), ierror)
+    call MPI_ISSEND(inout(xs, ye - level_y - level_y + 1, zs), 1, halo12, &
+                    neighbour(1, 3), tag_n, DECOMP_2D_COMM_CART_X, &
+                    requests(4), ierror)
     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ISSEND")
     call MPI_WAITALL(4, requests, status, ierror)
     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_WAITALL")
     call MPI_TYPE_FREE(halo12, ierror)
     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_FREE")
 #ifdef HALO_DEBUG
-    if (nrank==4) then
-      write(*,*) 'After exchange in Y'
-      do j=ye,ys,-1
-        write(*,'(10F4.0)') (inout(1,j,k),k=zs,ze)
-      end do
+    if (nrank == 4) then
+       write (*, *) 'After exchange in Y'
+       do j = ye, ys, -1
+          write (*, '(10F4.0)') (inout(1, j, k), k=zs, ze)
+       end do
     end if
 #endif
 
@@ -114,40 +108,40 @@
     ! all contiguous in memory, which can be sent/received using
     ! MPI directly
     tag_b = coord(2)
-    if (coord(2)==dims(2)-1 .AND. periodic_z) then
-      tag_t = 0
+    if (coord(2) == dims(2) - 1 .AND. periodic_z) then
+       tag_t = 0
     else
-      tag_t = coord(2) + 1
+       tag_t = coord(2) + 1
     end if
-    icount = (s1 * (s2 + 2*level_y)) * level_z
+    icount = (s1 * (s2 + 2 * level_y)) * level_z
     ! receive from bottom
-    call MPI_IRECV(inout(xs,ys,zs), icount, data_type, &
-      neighbour(1,6), tag_b, DECOMP_2D_COMM_CART_X, &
-      requests(1), ierror)
+    call MPI_IRECV(inout(xs, ys, zs), icount, data_type, &
+                   neighbour(1, 6), tag_b, DECOMP_2D_COMM_CART_X, &
+                   requests(1), ierror)
     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_IRECV")
     ! receive from top
-    call MPI_IRECV(inout(xs,ys,ze-level_z+1), icount, data_type, &
-      neighbour(1,5), tag_t, DECOMP_2D_COMM_CART_X, &
-      requests(2), ierror)
+    call MPI_IRECV(inout(xs, ys, ze - level_z + 1), icount, data_type, &
+                   neighbour(1, 5), tag_t, DECOMP_2D_COMM_CART_X, &
+                   requests(2), ierror)
     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_IRECV")
     ! send to bottom
-    call MPI_ISSEND(inout(xs,ys,zs+level_z), icount, data_type, &
-      neighbour(1,6), tag_b, DECOMP_2D_COMM_CART_X, &
-      requests(3), ierror)
+    call MPI_ISSEND(inout(xs, ys, zs + level_z), icount, data_type, &
+                    neighbour(1, 6), tag_b, DECOMP_2D_COMM_CART_X, &
+                    requests(3), ierror)
     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ISSEND")
     ! send to top
-    call MPI_ISSEND(inout(xs,ys,ze-level_z-level_z+1), icount, data_type, &
-      neighbour(1,5), tag_t, DECOMP_2D_COMM_CART_X, &
-      requests(4), ierror)
+    call MPI_ISSEND(inout(xs, ys, ze - level_z - level_z + 1), icount, data_type, &
+                    neighbour(1, 5), tag_t, DECOMP_2D_COMM_CART_X, &
+                    requests(4), ierror)
     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ISSEND")
     call MPI_WAITALL(4, requests, status, ierror)
     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_WAITALL")
 
 #ifdef HALO_DEBUG
-    if (nrank==4) then
-      write(*,*) 'After exchange in Z'
-      do j=ye,ys,-1
-        write(*,'(10F4.0)') (inout(1,j,k),k=zs,ze)
-      end do
+    if (nrank == 4) then
+       write (*, *) 'After exchange in Z'
+       do j = ye, ys, -1
+          write (*, '(10F4.0)') (inout(1, j, k), k=zs, ze)
+       end do
     end if
 #endif
diff --git a/src/halo_comm_y.f90 b/src/halo_comm_y.f90
index 38ab6b8a..7a6c2ac2 100644
--- a/src/halo_comm_y.f90
+++ b/src/halo_comm_y.f90
@@ -12,20 +12,14 @@
 ! This file contain common code to be included by subroutines
 ! 'exchange_halo_y_...' in halo.f90
 
-    if (present(opt_decomp)) then
-      decomp = opt_decomp
-    else
-      decomp = decomp_main
-    end if
-
     if (present(opt_ylevel)) then
-      level_x = opt_ylevel(1)
-      level_y = opt_ylevel(2)
-      level_z = opt_ylevel(3)
+       level_x = opt_ylevel(1)
+       level_y = opt_ylevel(2)
+       level_z = opt_ylevel(3)
     else
-      level_x = decomp%ylevel(1)
-      level_y = decomp%ylevel(2)
-      level_z = decomp%ylevel(3)
+       level_x = decomp%ylevel(1)
+       level_y = decomp%ylevel(2)
+       level_z = decomp%ylevel(3)
     end if
 
     s1 = decomp%ysz(1)
@@ -33,10 +27,10 @@
     s3 = decomp%ysz(3)
 
     xs = 1
-    xe = s1 + 2*level_x
+    xe = s1 + 2 * level_x
     ys = 1 + level_y
     zs = 1
-    ze = s3 + 2*level_z
+    ze = s3 + 2 * level_z
 
     ! if (decomp%halos_for_pencil) then
     !   ! don't communicate lower halo (south boundary)
@@ -47,62 +41,62 @@
     ! end if
 
 #ifdef HALO_DEBUG
-    if (nrank==4) then
-      write(*,*) 'Y-pencil input'
-      write(*,*) '=============='
-      write(*,*) 'Data on a x-z plane is shown'
-      write(*,*) 'Before halo exchange'
-      do i=xe,xs,-1
-        write(*,'(10F4.0)') (inout(i,1,k),k=zs,ze)
-      end do
+    if (nrank == 4) then
+       write (*, *) 'Y-pencil input'
+       write (*, *) '=============='
+       write (*, *) 'Data on a x-z plane is shown'
+       write (*, *) 'Before halo exchange'
+       do i = xe, xs, -1
+          write (*, '(10F4.0)') (inout(i, 1, k), k=zs, ze)
+       end do
     end if
 #endif
 
     ! *** east/west ***
     tag_w = coord(1)
-    if (coord(1)==dims(1)-1 .AND. periodic_x) then
-      tag_e = 0
+    if (coord(1) == dims(1) - 1 .AND. periodic_x) then
+       tag_e = 0
     else
-      tag_e = coord(1) + 1
+       tag_e = coord(1) + 1
     end if
-    icount = s2 * (s3 + 2*level_z)
+    icount = s2 * (s3 + 2 * level_z)
     ilength = level_x
-    ijump = s1 + 2*level_x
+    ijump = s1 + 2 * level_x
     call MPI_TYPE_VECTOR(icount, ilength, ijump, &
-      data_type, halo21, ierror)
+                         data_type, halo21, ierror)
     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_VECTOR")
     call MPI_TYPE_COMMIT(halo21, ierror)
     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_COMMIT")
     ! receive from west
-    call MPI_IRECV(inout(xs,ys,zs), 1, halo21, &
-      neighbour(2,2), tag_w, DECOMP_2D_COMM_CART_Y, &
-      requests(1), ierror)
+    call MPI_IRECV(inout(xs, ys, zs), 1, halo21, &
+                   neighbour(2, 2), tag_w, DECOMP_2D_COMM_CART_Y, &
+                   requests(1), ierror)
     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_IRECV")
     ! receive from east
-    call MPI_IRECV(inout(xe-level_x+1,ys,zs), 1, halo21, &
-      neighbour(2,1), tag_e, DECOMP_2D_COMM_CART_Y, &
-      requests(2), ierror)
+    call MPI_IRECV(inout(xe - level_x + 1, ys, zs), 1, halo21, &
+                   neighbour(2, 1), tag_e, DECOMP_2D_COMM_CART_Y, &
+                   requests(2), ierror)
     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_IRECV")
     ! send to west
-    call MPI_ISSEND(inout(xs+level_x,ys,zs), 1, halo21, &
-      neighbour(2,2), tag_w, DECOMP_2D_COMM_CART_Y, &
-      requests(3), ierror)
+    call MPI_ISSEND(inout(xs + level_x, ys, zs), 1, halo21, &
+                    neighbour(2, 2), tag_w, DECOMP_2D_COMM_CART_Y, &
+                    requests(3), ierror)
     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ISSEND")
     ! send to east
-    call MPI_ISSEND(inout(xe-level_x-level_x+1,ys,zs), 1, halo21, &
-      neighbour(2,1), tag_e, DECOMP_2D_COMM_CART_Y, &
-      requests(4), ierror)
+    call MPI_ISSEND(inout(xe - level_x - level_x + 1, ys, zs), 1, halo21, &
+                    neighbour(2, 1), tag_e, DECOMP_2D_COMM_CART_Y, &
+                    requests(4), ierror)
     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ISSEND")
     call MPI_WAITALL(4, requests, status, ierror)
     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_WAITALL")
     call MPI_TYPE_FREE(halo21, ierror)
     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_FREE")
 #ifdef HALO_DEBUG
-    if (nrank==4) then
-    write(*,*) 'After exchange in X'
-    do i=xe,xs,-1
-      write(*,'(10F4.0)') (inout(i,1,k),k=zs,ze)
-    end do
+    if (nrank == 4) then
+       write (*, *) 'After exchange in X'
+       do i = xe, xs, -1
+          write (*, '(10F4.0)') (inout(i, 1, k), k=zs, ze)
+       end do
     end if
 #endif
 
@@ -114,39 +108,39 @@
     ! all contiguous in memory, which can be sent/received using
     ! MPI directly
     tag_b = coord(2)
-    if (coord(2)==dims(2)-1 .AND. periodic_z) then
-      tag_t = 0
+    if (coord(2) == dims(2) - 1 .AND. periodic_z) then
+       tag_t = 0
     else
-      tag_t = coord(2) + 1
+       tag_t = coord(2) + 1
     end if
-    icount = (s2 * (s1 + 2*level_x)) * level_z
+    icount = (s2 * (s1 + 2 * level_x)) * level_z
     ! receive from bottom
-    call MPI_IRECV(inout(xs,ys,zs), icount, data_type, &
-      neighbour(2,6), tag_b, DECOMP_2D_COMM_CART_Y, &
-      requests(1), ierror)
+    call MPI_IRECV(inout(xs, ys, zs), icount, data_type, &
+                   neighbour(2, 6), tag_b, DECOMP_2D_COMM_CART_Y, &
+                   requests(1), ierror)
     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_IRECV")
     ! receive from top
-    call MPI_IRECV(inout(xs,ys,ze-level_z+1), icount, data_type, &
-      neighbour(2,5), tag_t, DECOMP_2D_COMM_CART_Y, &
-      requests(2), ierror)
+    call MPI_IRECV(inout(xs, ys, ze - level_z + 1), icount, data_type, &
+                   neighbour(2, 5), tag_t, DECOMP_2D_COMM_CART_Y, &
+                   requests(2), ierror)
     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_IRECV")
     ! send to bottom
-    call MPI_ISSEND(inout(xs,ys,zs+level_z), icount, data_type, &
-      neighbour(2,6), tag_b, DECOMP_2D_COMM_CART_Y, &
-      requests(3), ierror)
+    call MPI_ISSEND(inout(xs, ys, zs + level_z), icount, data_type, &
+                    neighbour(2, 6), tag_b, DECOMP_2D_COMM_CART_Y, &
+                    requests(3), ierror)
     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ISSEND")
     ! send to top
-    call MPI_ISSEND(inout(xs,ys,ze-level_z-level_z+1), icount, data_type, &
-      neighbour(2,5), tag_t, DECOMP_2D_COMM_CART_Y, &
-      requests(4), ierror)
+    call MPI_ISSEND(inout(xs, ys, ze - level_z - level_z + 1), icount, data_type, &
+                    neighbour(2, 5), tag_t, DECOMP_2D_COMM_CART_Y, &
+                    requests(4), ierror)
     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ISSEND")
     call MPI_WAITALL(4, requests, status, ierror)
     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_WAITALL")
 #ifdef HALO_DEBUG
-    if (nrank==4) then
-      write(*,*) 'After exchange in Z'
-      do i=xe,xs,-1
-        write(*,'(10F4.0)') (inout(i,1,k),k=zs,ze)
-      end do
+    if (nrank == 4) then
+       write (*, *) 'After exchange in Z'
+       do i = xe, xs, -1
+          write (*, '(10F4.0)') (inout(i, 1, k), k=zs, ze)
+       end do
     end if
 #endif
diff --git a/src/halo_comm_z.f90 b/src/halo_comm_z.f90
index 78524f43..78f3d1d2 100644
--- a/src/halo_comm_z.f90
+++ b/src/halo_comm_z.f90
@@ -12,21 +12,15 @@
 ! This file contain common code to be included by subroutines
 ! 'exchange_halo_z_...' in halo.f90
 
-    if (present(opt_decomp)) then
-      decomp = opt_decomp
-    else
-      decomp = decomp_main
-    end if
-
     if (present(opt_zlevel)) then ! assume same level for all directions
-      level_x = opt_zlevel(1)
-      level_y = opt_zlevel(2)
-      level_z = opt_zlevel(3)
+       level_x = opt_zlevel(1)
+       level_y = opt_zlevel(2)
+       level_z = opt_zlevel(3)
     else
-      level_x = decomp%zlevel(1)
-      level_y = decomp%zlevel(2)
-      level_z = decomp%zlevel(3)
-      ! add checks to make sure level_x and level_y are sensible values (positive integer)?
+       level_x = decomp%zlevel(1)
+       level_y = decomp%zlevel(2)
+       level_z = decomp%zlevel(3)
+       ! add checks to make sure level_x and level_y are sensible values (positive integer)?
     end if
 
     s1 = decomp%zsz(1)
@@ -34,9 +28,9 @@
     s3 = decomp%zsz(3)
 
     xs = 1
-    xe = s1 + 2*level_x
+    xe = s1 + 2 * level_x
     ys = 1
-    ye = s2 + 2*level_y
+    ye = s2 + 2 * level_y
     !zs = 1 + level_z
     zs = 1
 
@@ -49,51 +43,51 @@
     ! end if
 
 #ifdef HALO_DEBUG
-    if (nrank==4) then
-      write(*,*) 'Z-pencil input'
-      write(*,*) '=============='
-      write(*,*) 'Data on a x-y plane is shown'
-      write(*,*) 'Before halo exchange'
-      do i=xe,xs,-1
-        write(*,'(10F4.0)') (inout(i,j,1),j=ys,ye)
-      end do
+    if (nrank == 4) then
+       write (*, *) 'Z-pencil input'
+       write (*, *) '=============='
+       write (*, *) 'Data on a x-y plane is shown'
+       write (*, *) 'Before halo exchange'
+       do i = xe, xs, -1
+          write (*, '(10F4.0)') (inout(i, j, 1), j=ys, ye)
+       end do
     end if
 #endif
     ! *** east/west ***
     tag_w = coord(1)
-    if (coord(1)==dims(1)-1 .AND. periodic_x) then
-      tag_e = 0
+    if (coord(1) == dims(1) - 1 .AND. periodic_x) then
+       tag_e = 0
     else
-      tag_e = coord(1) + 1
+       tag_e = coord(1) + 1
     end if
     !icount = (s2 + 2*level_y) * s3
-    icount = (s2 + 2*level_y) * (s3 + 2*level_z)
+    icount = (s2 + 2 * level_y) * (s3 + 2 * level_z)
     ilength = level_x
-    ijump = s1 + 2*level_x
+    ijump = s1 + 2 * level_x
     call MPI_TYPE_VECTOR(icount, ilength, ijump, &
-        data_type, halo31, ierror)
+                         data_type, halo31, ierror)
     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_VECTOR")
     call MPI_TYPE_COMMIT(halo31, ierror)
     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_COMMIT")
     ! receive from west
-    call MPI_IRECV(inout(xs,ys,zs), 1, halo31, &
-        neighbour(3,2), tag_w, DECOMP_2D_COMM_CART_Z, &
-        requests(1), ierror)
+    call MPI_IRECV(inout(xs, ys, zs), 1, halo31, &
+                   neighbour(3, 2), tag_w, DECOMP_2D_COMM_CART_Z, &
+                   requests(1), ierror)
     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_IRECV")
     ! receive from east
-    call MPI_IRECV(inout(xe-level_x+1,ys,zs), 1, halo31, &
-        neighbour(3,1), tag_e, DECOMP_2D_COMM_CART_Z, &
-        requests(2), ierror)
+    call MPI_IRECV(inout(xe - level_x + 1, ys, zs), 1, halo31, &
+                   neighbour(3, 1), tag_e, DECOMP_2D_COMM_CART_Z, &
+                   requests(2), ierror)
     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_IRECV")
     ! send to west
-    call MPI_ISSEND(inout(xs+level_x,ys,zs), 1, halo31, &
-        neighbour(3,2), tag_w, DECOMP_2D_COMM_CART_Z, &
-        requests(3), ierror)
+    call MPI_ISSEND(inout(xs + level_x, ys, zs), 1, halo31, &
+                    neighbour(3, 2), tag_w, DECOMP_2D_COMM_CART_Z, &
+                    requests(3), ierror)
     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ISSEND")
     ! send to east
-    call MPI_ISSEND(inout(xe-level_x-level_x+1,ys,zs), 1, halo31, &
-        neighbour(3,1), tag_e, DECOMP_2D_COMM_CART_Z, &
-        requests(4), ierror)
+    call MPI_ISSEND(inout(xe - level_x - level_x + 1, ys, zs), 1, halo31, &
+                    neighbour(3, 1), tag_e, DECOMP_2D_COMM_CART_Z, &
+                    requests(4), ierror)
     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ISSEND")
     call MPI_WAITALL(4, requests, status, ierror)
     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_WAITALL")
@@ -101,60 +95,60 @@
     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_FREE")
 
 #ifdef HALO_DEBUG
-    if (nrank==4) then
-      write(*,*) 'After exchange in X'
-      do i=xe,xs,-1
-        write(*,'(10F4.0)') (inout(i,j,1),j=ys,ye)
-      end do
+    if (nrank == 4) then
+       write (*, *) 'After exchange in X'
+       do i = xe, xs, -1
+          write (*, '(10F4.0)') (inout(i, j, 1), j=ys, ye)
+       end do
     end if
 #endif
 
     ! *** north/south ***
     tag_s = coord(2)
-    if (coord(2)==dims(2)-1 .AND. periodic_y) then
-      tag_n = 0
+    if (coord(2) == dims(2) - 1 .AND. periodic_y) then
+       tag_n = 0
     else
-      tag_n = coord(2) + 1
+       tag_n = coord(2) + 1
     end if
     !icount = s3
-    icount = s3 + (2*level_z)
-    ilength = level_y * (s1 + 2*level_x)
-    ijump = (s1 + 2*level_x) * (s2 + 2*level_y)
+    icount = s3 + (2 * level_z)
+    ilength = level_y * (s1 + 2 * level_x)
+    ijump = (s1 + 2 * level_x) * (s2 + 2 * level_y)
     call MPI_TYPE_VECTOR(icount, ilength, ijump, &
-        data_type, halo32, ierror)
+                         data_type, halo32, ierror)
     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_VECTOR")
     call MPI_TYPE_COMMIT(halo32, ierror)
     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_COMMIT")
     ! receive from south
-    call MPI_IRECV(inout(xs,ys,zs), 1, halo32, &
-        neighbour(3,4), tag_s, DECOMP_2D_COMM_CART_Z, &
-        requests(1), ierror)
+    call MPI_IRECV(inout(xs, ys, zs), 1, halo32, &
+                   neighbour(3, 4), tag_s, DECOMP_2D_COMM_CART_Z, &
+                   requests(1), ierror)
     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_IRECV")
     ! receive from north
-    call MPI_IRECV(inout(xs,ye-level_y+1,zs), 1, halo32, &
-        neighbour(3,3), tag_n, DECOMP_2D_COMM_CART_Z, &
-        requests(2), ierror)
+    call MPI_IRECV(inout(xs, ye - level_y + 1, zs), 1, halo32, &
+                   neighbour(3, 3), tag_n, DECOMP_2D_COMM_CART_Z, &
+                   requests(2), ierror)
     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_IRECV")
     ! send to south
-    call MPI_ISSEND(inout(xs,ys+level_y,zs), 1, halo32, &
-        neighbour(3,4), tag_s, DECOMP_2D_COMM_CART_Z, &
-        requests(3), ierror)
+    call MPI_ISSEND(inout(xs, ys + level_y, zs), 1, halo32, &
+                    neighbour(3, 4), tag_s, DECOMP_2D_COMM_CART_Z, &
+                    requests(3), ierror)
     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ISSEND")
     ! send to north
-    call MPI_ISSEND(inout(xs,ye-level_y-level_y+1,zs), 1, halo32, &
-        neighbour(3,3), tag_n, DECOMP_2D_COMM_CART_Z, &
-        requests(4), ierror)
+    call MPI_ISSEND(inout(xs, ye - level_y - level_y + 1, zs), 1, halo32, &
+                    neighbour(3, 3), tag_n, DECOMP_2D_COMM_CART_Z, &
+                    requests(4), ierror)
     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ISSEND")
     call MPI_WAITALL(4, requests, status, ierror)
     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_WAITALL")
     call MPI_TYPE_FREE(halo32, ierror)
     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_FREE")
 #ifdef HALO_DEBUG
-    if (nrank==4) then
-      write(*,*) 'After exchange in Y'
-      do i=xe,xs,-1
-        write(*,'(10F4.0)') (inout(i,j,1),j=ys,ye)
-      end do
+    if (nrank == 4) then
+       write (*, *) 'After exchange in Y'
+       do i = xe, xs, -1
+          write (*, '(10F4.0)') (inout(i, j, 1), j=ys, ye)
+       end do
     end if
 #endif
 
diff --git a/src/halo_common.f90 b/src/halo_common.f90
index 61d340d2..9053b958 100644
--- a/src/halo_common.f90
+++ b/src/halo_common.f90
@@ -1,13 +1,4 @@
-!=======================================================================
-! This is part of the 2DECOMP&FFT library
-!
-! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil)
-! decomposition. It also implements a highly scalable distributed
-! three-dimensional Fast Fourier Transform (FFT).
-!
-! Copyright (C) 2009-2021 Ning Li, the Numerical Algorithms Group (NAG)
-!
-!=======================================================================
+!! SPDX-License-Identifier: BSD-3-Clause
 
 ! This file contain common code to be included by subroutines
 ! 'update_halo_...' in halo.f90
@@ -18,12 +9,16 @@
        global = .false.
     end if
 
-    s1 = size(in,1)
-    s2 = size(in,2)
-    s3 = size(in,3)
+    s1 = size(in, 1)
+    s2 = size(in, 2)
+    s3 = size(in, 3)
 
     if (present(opt_pencil)) then
        ipencil = opt_pencil
+       ! Check for invalid argument
+       if ((ipencil < 1) .or. (ipencil > 3)) then
+          call decomp_2d_abort(__FILE__, __LINE__, ipencil, "Invalid pencil for halo exchange, should be in range 1<=pencil<=3")
+       end if
     else
        ! Historic/default behaviour
        if (s1 == decomp%xsz(1)) then
@@ -31,21 +26,21 @@
           if (first_call_x) then
              first_call_x = .false.
              call decomp_2d_warning(__FILE__, __LINE__, &
-                  0, "Deprecated interface - calling halo in X without explicit pencil")
+                                    0, "Deprecated interface - calling halo in X without explicit pencil")
           end if
        else if (s2 == decomp%ysz(2)) then
           ipencil = 2
           if (first_call_y) then
              first_call_y = .false.
              call decomp_2d_warning(__FILE__, __LINE__, &
-                  0, "Deprecated interface - calling halo in Y without explicit pencil")
+                                    0, "Deprecated interface - calling halo in Y without explicit pencil")
           end if
        else if (s3 == decomp%zsz(3)) then
           ipencil = 3
           if (first_call_z) then
              first_call_z = .false.
              call decomp_2d_warning(__FILE__, __LINE__, &
-                  0, "Deprecated interface - calling halo in Z without explicit pencil")
+                                    0, "Deprecated interface - calling halo in Z without explicit pencil")
           end if
        else
           ipencil = 0
@@ -111,56 +106,73 @@
        zs = 1; ze = 1
 
        call decomp_2d_abort(__FILE__, __LINE__, 10, &
-            'Invalid data passed to update_halo')
+                            'Invalid data passed to update_halo')
     end if
-    allocate(out(xs:xe, ys:ye, zs:ze))
-!    out = -1.0_mytype ! fill the halo for debugging
+    allocate (out(xs:xe, ys:ye, zs:ze))
+
+    !    out = -1.0_mytype ! fill the halo for debugging
 
+    !$acc enter data create(requests,neighbour)
     ! copy input data to output
     if (global) then
        ! using global coordinate
        ! note the input array passed in always has index starting from 1
        ! need to work out the corresponding global index
        if (ipencil == 1) then
-          do k=decomp%xst(3),decomp%xen(3)
-             do j=decomp%xst(2),decomp%xen(2)
-                do i=1,s1  ! x all local
-                   out(i,j,k) = in(i,j-decomp%xst(2)+1,k-decomp%xst(3)+1)
+          kst = decomp%xst(3); ken = decomp%xen(3)
+          jst = decomp%xst(2); jen = decomp%xen(2)
+          !$acc kernels default(present)
+          do k = kst, ken
+             do j = jst, jen
+                do i = 1, s1  ! x all local
+                   out(i, j, k) = in(i, j - decomp%xst(2) + 1, k - decomp%xst(3) + 1)
                 end do
              end do
           end do
+          !$acc end kernels
        else if (ipencil == 2) then
-          do k=decomp%yst(3),decomp%yen(3)
-             do j=1,s2  ! y all local
-                do i=decomp%yst(1),decomp%yen(1)
-                   out(i,j,k) = in(i-decomp%yst(1)+1,j,k-decomp%yst(3)+1)
+          kst = decomp%yst(3); ken = decomp%yen(3)
+          ist = decomp%yst(1); ien = decomp%yen(1)
+          !$acc kernels default(present)
+          do k = kst, ken
+             do j = 1, s2  ! y all local
+                do i = ist, ien
+                   out(i, j, k) = in(i - decomp%yst(1) + 1, j, k - decomp%yst(3) + 1)
                 end do
              end do
           end do
+          !$acc end kernels
        else if (ipencil == 3) then
-          do k=1,s3  ! z all local
-             do j=decomp%zst(2),decomp%zen(2)
-                do i=decomp%zst(1),decomp%zen(1)
-                   out(i,j,k) = in(i-decomp%zst(1)+1,j-decomp%zst(2)+1,k)
+          jst = decomp%zst(2); jen = decomp%zen(2)
+          ist = decomp%zst(1); ien = decomp%zen(1)
+          !$acc kernels default(present)
+          do k = 1, s3  ! z all local
+             do j = jst, jen
+                do i = ist, ien
+                   out(i, j, k) = in(i - decomp%zst(1) + 1, j - decomp%zst(2) + 1, k)
                 end do
              end do
           end do
+          !$acc end kernels
        end if
     else
        ! not using global coordinate
-       do k=1,s3
-          do j=1,s2
-             do i=1,s1
-                out(i,j,k) = in(i,j,k)
+       !$acc kernels default(present)
+       do k = 1, s3
+          do j = 1, s2
+             do i = 1, s1
+                out(i, j, k) = in(i, j, k)
              end do
           end do
        end do
+       !$acc end kernels
+       !!! istat = cudaMemcpy(out,in,s1*s2*s3,cudaMemcpyDeviceToDevice)
     end if
 
     if (ipencil == 1) then
-      call exchange_halo_x(out,opt_xlevel=(/0,level,level/))
+       call exchange_halo_x(out, opt_xlevel=(/0, level, level/))
     else if (ipencil == 2) then
-      call exchange_halo_y(out,opt_ylevel=(/level,0,level/))
+       call exchange_halo_y(out, opt_ylevel=(/level, 0, level/))
     else if (ipencil == 3) then
-      call exchange_halo_z(out,opt_zlevel=(/level,level,0/))
+       call exchange_halo_z(out, opt_zlevel=(/level, level, 0/))
     end if
diff --git a/src/io.f90 b/src/io.f90
index 160b4257..a8152449 100644
--- a/src/io.f90
+++ b/src/io.f90
@@ -1,2037 +1,2046 @@
-!=======================================================================
-! This is part of the 2DECOMP&FFT library
-! 
-! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
-! decomposition. It also implements a highly scalable distributed
-! three-dimensional Fast Fourier Transform (FFT).
-!
-! Copyright (C) 2009-2013 Ning Li, the Numerical Algorithms Group (NAG)
-! Copyright (C) 2021               the University of Edinburgh (UoE)
-!
-!=======================================================================
+!! SPDX-License-Identifier: BSD-3-Clause
 
 ! This module provides parallel IO facilities for applications based on
 ! 2D decomposition.
 
 module decomp_2d_io
 
-  use decomp_2d
-  use MPI
-#ifdef T3PIO
-  use t3pio
-#endif
+   use decomp_2d
+   use decomp_2d_constants
+   use decomp_2d_mpi
+   use MPI
 
 #ifdef ADIOS2
-  use adios2
+   use adios2
 #endif
 
-  implicit none
+   implicit none
 
-  integer, parameter, public :: decomp_2d_write_mode = 1, decomp_2d_read_mode = 2, &
-       decomp_2d_append_mode = 3
-  integer, parameter :: MAX_IOH = 10 ! How many live IO things should we handle?
-  character(len=*), parameter :: io_sep = "::"
-  integer, save :: nreg_io = 0
+   integer, parameter, public :: decomp_2d_write_mode = 1, decomp_2d_read_mode = 2, &
+                                 decomp_2d_append_mode = 3
+   integer, parameter :: MAX_IOH = 10 ! How many live IO things should we handle?
+   character(len=*), parameter :: io_sep = "::"
+   integer, save :: nreg_io = 0
 #ifndef ADIOS2
-  integer, dimension(MAX_IOH), save :: fh_registry
-  logical, dimension(MAX_IOH), target, save :: fh_live
-  character(len=1024), dimension(MAX_IOH), target, save :: fh_names
-  integer(kind=MPI_OFFSET_KIND), dimension(MAX_IOH), save :: fh_disp
+   integer, dimension(MAX_IOH), save :: fh_registry
+   logical, dimension(MAX_IOH), target, save :: fh_live
+   character(len=1024), dimension(MAX_IOH), target, save :: fh_names
+   integer(kind=MPI_OFFSET_KIND), dimension(MAX_IOH), save :: fh_disp
 #else
-  type(adios2_adios) :: adios
-  character(len=1024), dimension(MAX_IOH), target, save :: engine_names
-  logical, dimension(MAX_IOH), target, save :: engine_live
-  type(adios2_engine), dimension(MAX_IOH), save :: engine_registry
-#endif
-  
-  private        ! Make everything private unless declared public
-
-  public :: decomp_2d_write_one, decomp_2d_read_one, &
-       decomp_2d_write_var, decomp_2d_read_var, &
-       decomp_2d_write_scalar, decomp_2d_read_scalar, &
-       decomp_2d_write_plane, decomp_2d_write_every, &
-       decomp_2d_write_subdomain, &
-       decomp_2d_write_outflow, decomp_2d_read_inflow, &
-       decomp_2d_io_init, decomp_2d_io_finalise, & ! XXX: initialise/finalise 2decomp&fft IO module
-       decomp_2d_init_io, & ! XXX: initialise an io process - awful naming
-       decomp_2d_register_variable, &
-       decomp_2d_open_io, decomp_2d_close_io, &
-       decomp_2d_start_io, decomp_2d_end_io, &
-       gen_iodir_name
-
-  ! Generic interface to handle multiple data types
-
-  interface decomp_2d_write_one
-     module procedure write_one_real
-     module procedure write_one_complex
-     module procedure mpiio_write_real_coarse
-     module procedure mpiio_write_real_probe
-  end interface decomp_2d_write_one
-
-  interface decomp_2d_read_one
-     module procedure read_one_real
-     module procedure read_one_complex
-  end interface decomp_2d_read_one
-
-  interface decomp_2d_write_var
-     module procedure write_var_real
-     module procedure write_var_complex
-  end interface decomp_2d_write_var
-
-  interface decomp_2d_read_var
-     module procedure read_var_real
-     module procedure read_var_complex
-  end interface decomp_2d_read_var
-
-  interface decomp_2d_write_scalar
-     module procedure write_scalar_real
-     module procedure write_scalar_complex
-     module procedure write_scalar_integer
-     module procedure write_scalar_logical
-  end interface decomp_2d_write_scalar
-
-  interface decomp_2d_read_scalar
-     module procedure read_scalar_real
-     module procedure read_scalar_complex
-     module procedure read_scalar_integer
-     module procedure read_scalar_logical
-  end interface decomp_2d_read_scalar
-
-  interface decomp_2d_write_plane
-     module procedure write_plane_3d_real
-     module procedure write_plane_3d_complex
-     !     module procedure write_plane_2d
-  end interface decomp_2d_write_plane
-
-  interface decomp_2d_write_every
-     module procedure write_every_real
-     module procedure write_every_complex
-  end interface decomp_2d_write_every
-
-  interface decomp_2d_write_subdomain
-     module procedure write_subdomain
-  end interface decomp_2d_write_subdomain
-
-  interface decomp_2d_write_outflow
-     module procedure write_outflow
-  end interface decomp_2d_write_outflow
-
-  interface decomp_2d_read_inflow
-     module procedure read_inflow
-  end interface decomp_2d_read_inflow
+   type(adios2_adios) :: adios
+   character(len=1024), dimension(MAX_IOH), target, save :: engine_names
+   logical, dimension(MAX_IOH), target, save :: engine_live
+   type(adios2_engine), dimension(MAX_IOH), save :: engine_registry
+#endif
+
+   private        ! Make everything private unless declared public
+
+   public :: decomp_2d_write_one, decomp_2d_read_one, &
+             decomp_2d_write_var, decomp_2d_read_var, &
+             decomp_2d_write_scalar, decomp_2d_read_scalar, &
+             decomp_2d_write_plane, decomp_2d_write_every, &
+             decomp_2d_write_subdomain, &
+             decomp_2d_write_outflow, decomp_2d_read_inflow, &
+             decomp_2d_io_init, decomp_2d_io_finalise, & ! XXX: initialise/finalise 2decomp&fft IO module
+             decomp_2d_init_io, & ! XXX: initialise an io process - awful naming
+             decomp_2d_register_variable, &
+             decomp_2d_open_io, decomp_2d_close_io, &
+             decomp_2d_start_io, decomp_2d_end_io, &
+             gen_iodir_name
+
+   ! Generic interface to handle multiple data types
+
+   interface decomp_2d_write_one
+      module procedure write_one_real
+      module procedure write_one_complex
+      module procedure mpiio_write_real_coarse
+      module procedure mpiio_write_real_probe
+   end interface decomp_2d_write_one
+
+   interface decomp_2d_read_one
+      module procedure read_one_real
+      module procedure read_one_complex
+   end interface decomp_2d_read_one
+
+   interface decomp_2d_write_var
+      module procedure write_var_real
+      module procedure write_var_complex
+   end interface decomp_2d_write_var
+
+   interface decomp_2d_read_var
+      module procedure read_var_real
+      module procedure read_var_complex
+   end interface decomp_2d_read_var
+
+   interface decomp_2d_write_scalar
+      module procedure write_scalar_real
+      module procedure write_scalar_complex
+      module procedure write_scalar_integer
+      module procedure write_scalar_logical
+   end interface decomp_2d_write_scalar
+
+   interface decomp_2d_read_scalar
+      module procedure read_scalar_real
+      module procedure read_scalar_complex
+      module procedure read_scalar_integer
+      module procedure read_scalar_logical
+   end interface decomp_2d_read_scalar
+
+   interface decomp_2d_write_plane
+      module procedure write_plane_3d_real
+      module procedure write_plane_3d_complex
+      !     module procedure write_plane_2d
+   end interface decomp_2d_write_plane
+
+   interface decomp_2d_write_every
+      module procedure write_every_real
+      module procedure write_every_complex
+   end interface decomp_2d_write_every
+
+   interface decomp_2d_write_subdomain
+      module procedure write_subdomain
+   end interface decomp_2d_write_subdomain
+
+   interface decomp_2d_write_outflow
+      module procedure write_outflow
+   end interface decomp_2d_write_outflow
+
+   interface decomp_2d_read_inflow
+      module procedure read_inflow
+   end interface decomp_2d_read_inflow
 
 contains
 
-  subroutine decomp_2d_io_init()
+   subroutine decomp_2d_io_init()
 
 #ifdef ADIOS2
-  integer :: ierror
-  logical :: adios2_debug_mode
-  character(len=80) :: config_file="adios2_config.xml"
+      integer :: ierror
+      character(len=80) :: config_file = "adios2_config.xml"
 #endif
 
 #ifdef ADIOS2
 #ifdef PROFILER
-  if (decomp_profiler_io) call decomp_profiler_start("io_init")
+      if (decomp_profiler_io) call decomp_profiler_start("io_init")
 #endif
 
-  !! TODO: make this a runtime-option
-  adios2_debug_mode = .true.
-
-  call adios2_init(adios, trim(config_file), decomp_2d_comm, adios2_debug_mode, ierror)
-  if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, &
-     "Error initialising ADIOS2 - is "//trim(config_file)//" present and valid?")
-
-  engine_live(:) = .false.
+      call adios2_init(adios, trim(config_file), decomp_2d_comm, ierror)
+      if (ierror /= 0) then
+         call decomp_2d_abort(__FILE__, __LINE__, ierror, &
+                              "Error initialising ADIOS2 - is "//trim(config_file)//" present and valid?")
+      end if
+      engine_live(:) = .false.
 
 #ifdef PROFILER
-  if (decomp_profiler_io) call decomp_profiler_end("io_init")
+      if (decomp_profiler_io) call decomp_profiler_end("io_init")
 #endif
 #endif
-  
-  end subroutine decomp_2d_io_init
-  subroutine decomp_2d_io_finalise()
+
+   end subroutine decomp_2d_io_init
+   subroutine decomp_2d_io_finalise()
 
 #ifdef ADIOS2
-    use adios2
+      use adios2
 #endif
 
-    implicit none
+      implicit none
 
 #ifdef ADIOS2
-    integer :: ierror
+      integer :: ierror
 #endif
-    
+
 #ifdef ADIOS2
 #ifdef PROFILER
-    if (decomp_profiler_io) call decomp_profiler_start("io_fin")
+      if (decomp_profiler_io) call decomp_profiler_start("io_fin")
 #endif
-    call adios2_finalize(adios, ierror)
-    if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, &
-       "adios2_finalize")
+      call adios2_finalize(adios, ierror)
+      if (ierror /= 0) then
+         call decomp_2d_abort(__FILE__, __LINE__, ierror, "adios2_finalize")
+      end if
 #ifdef PROFILER
-    if (decomp_profiler_io) call decomp_profiler_end("io_fin")
+      if (decomp_profiler_io) call decomp_profiler_end("io_fin")
 #endif
 #endif
 
-  end subroutine decomp_2d_io_finalise
-  
+   end subroutine decomp_2d_io_finalise
+
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Using MPI-IO library to write a single 3D array to a file
+   ! Using MPI-IO library to write a single 3D array to a file
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine write_one_real(ipencil,var,filename,opt_decomp)
+   subroutine write_one_real(ipencil, var, filename, opt_decomp)
 
-    implicit none
+      implicit none
 
-    integer, intent(IN) :: ipencil
-    real(mytype), dimension(:,:,:), intent(IN) :: var
-    character(len=*), intent(IN) :: filename
-    TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
+      integer, intent(IN) :: ipencil
+      real(mytype), contiguous, dimension(:, :, :), intent(IN) :: var
+      character(len=*), intent(IN) :: filename
+      TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
 
-    TYPE(DECOMP_INFO) :: decomp
-    integer(kind=MPI_OFFSET_KIND) :: filesize, disp
-    integer, dimension(3) :: sizes, subsizes, starts
-    integer :: ierror, newtype, fh, data_type, info, gs
+      TYPE(DECOMP_INFO) :: decomp
+      integer(kind=MPI_OFFSET_KIND) :: filesize, disp
+      integer, dimension(3) :: sizes, subsizes, starts
+      integer :: ierror, newtype, fh, data_type
 
 #ifdef PROFILER
-  if (decomp_profiler_io) call decomp_profiler_start("io_write_one_real")
+      if (decomp_profiler_io) call decomp_profiler_start("io_write_one_real")
 #endif
 
-    data_type = real_type
+      data_type = real_type
 
 #include "io_write_one.inc"
 
 #ifdef PROFILER
-  if (decomp_profiler_io) call decomp_profiler_end("io_write_one_real")
+      if (decomp_profiler_io) call decomp_profiler_end("io_write_one_real")
 #endif
 
-    return
-  end subroutine write_one_real
+      return
+   end subroutine write_one_real
 
-  subroutine write_one_complex(ipencil,var,filename,opt_decomp)
+   subroutine write_one_complex(ipencil, var, filename, opt_decomp)
 
-    implicit none
+      implicit none
 
-    integer, intent(IN) :: ipencil
-    complex(mytype), dimension(:,:,:), intent(IN) :: var
-    character(len=*), intent(IN) :: filename
-    TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
+      integer, intent(IN) :: ipencil
+      complex(mytype), contiguous, dimension(:, :, :), intent(IN) :: var
+      character(len=*), intent(IN) :: filename
+      TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
 
-    TYPE(DECOMP_INFO) :: decomp
-    integer(kind=MPI_OFFSET_KIND) :: filesize, disp
-    integer, dimension(3) :: sizes, subsizes, starts
-    integer :: ierror, newtype, fh, data_type, info, gs
+      TYPE(DECOMP_INFO) :: decomp
+      integer(kind=MPI_OFFSET_KIND) :: filesize, disp
+      integer, dimension(3) :: sizes, subsizes, starts
+      integer :: ierror, newtype, fh, data_type
 
 #ifdef PROFILER
-  if (decomp_profiler_io) call decomp_profiler_start("io_write_one_cplx")
+      if (decomp_profiler_io) call decomp_profiler_start("io_write_one_cplx")
 #endif
 
-    data_type = complex_type
+      data_type = complex_type
 
 #include "io_write_one.inc"
 
 #ifdef PROFILER
-  if (decomp_profiler_io) call decomp_profiler_end("io_write_one_cplx")
+      if (decomp_profiler_io) call decomp_profiler_end("io_write_one_cplx")
 #endif
 
-    return
-  end subroutine write_one_complex
-
+      return
+   end subroutine write_one_complex
 
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Using MPI-IO library to read from a file a single 3D array
+   ! Using MPI-IO library to read from a file a single 3D array
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine read_one_real(ipencil,var,dirname,varname,io_name,opt_decomp,reduce_prec)
+   subroutine read_one_real(ipencil, var, dirname, varname, io_name, opt_decomp, reduce_prec)
 
-    implicit none
+      implicit none
 
-    integer, intent(IN) :: ipencil
-    real(mytype), dimension(:,:,:), intent(INOUT) :: var
-    character(len=*), intent(IN) :: varname, dirname, io_name
-    TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
-    logical, intent(in), optional :: reduce_prec
+      integer, intent(IN) :: ipencil
+      real(mytype), contiguous, dimension(:, :, :), intent(INOUT) :: var
+      character(len=*), intent(IN) :: varname, dirname, io_name
+      TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
+      logical, intent(in), optional :: reduce_prec
 
-    logical :: read_reduce_prec
-    
-    integer :: idx
+      logical :: read_reduce_prec
+
+      integer :: idx
 #ifndef ADIOS2
-    TYPE(DECOMP_INFO) :: decomp
-    integer, dimension(3) :: sizes, subsizes, starts
-    real(mytype_single), allocatable, dimension(:,:,:) :: varsingle
-    integer :: data_type
-    logical :: dir_exists
-    integer :: disp_bytes
-    integer :: ierror, newtype
-    character(len=:), allocatable :: full_io_name
-    logical :: opened_new
+      TYPE(DECOMP_INFO) :: decomp
+      integer, dimension(3) :: sizes, subsizes, starts
+      real(mytype_single), allocatable, dimension(:, :, :) :: varsingle
+      integer :: data_type
+      logical :: dir_exists
+      integer :: disp_bytes
+      integer :: ierror, newtype
+      character(len=:), allocatable :: full_io_name
+      logical :: opened_new
 #endif
 
 #ifdef PROFILER
-    if (decomp_profiler_io) call decomp_profiler_start("io_read_one_real")
+      if (decomp_profiler_io) call decomp_profiler_start("io_read_one_real")
 #endif
-    
-    read_reduce_prec = .true.
-    
-    idx = get_io_idx(io_name, dirname)
+
+      read_reduce_prec = .true.
+
+      idx = get_io_idx(io_name, dirname)
 #ifndef ADIOS2
-    opened_new = .false.
-    if (idx .lt. 1) then
-       ! Check file exists
-       full_io_name = dirname//"/"//varname
-       if (nrank==0) then
-          inquire(file=full_io_name, exist=dir_exists)
-          if (.not.dir_exists) then
-             print *, "ERROR: cannot read from", full_io_name, " directory doesn't exist!"
-             stop
-          end if
-       end if
-       
-       call decomp_2d_open_io(io_name, full_io_name, decomp_2d_read_mode)
-       idx = get_io_idx(io_name, full_io_name)
-       opened_new = .true.
-    else
-       full_io_name = "" ! Ensure string is not unset
-    end if
-    
-    if (present(reduce_prec)) then
-       if (.not. reduce_prec) then
-          read_reduce_prec = .false.
-       end if
-    end if
-    if (read_reduce_prec) then
-       data_type = real_type_single
-    else
-       data_type = real_type
-    end if
-    call MPI_TYPE_SIZE(data_type,disp_bytes,ierror)
-    if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_SIZE")
-    
-    if (present(opt_decomp)) then
-       decomp = opt_decomp
-    else
-       call get_decomp_info(decomp)
-    end if
-
-    ! determine subarray parameters
-    sizes(1) = decomp%xsz(1)
-    sizes(2) = decomp%ysz(2)
-    sizes(3) = decomp%zsz(3)
-
-    if (ipencil == 1) then
-       subsizes(1) = decomp%xsz(1)
-       subsizes(2) = decomp%xsz(2)
-       subsizes(3) = decomp%xsz(3)
-       starts(1) = decomp%xst(1)-1  ! 0-based index
-       starts(2) = decomp%xst(2)-1
-       starts(3) = decomp%xst(3)-1
-    else if (ipencil == 2) then
-       subsizes(1) = decomp%ysz(1)
-       subsizes(2) = decomp%ysz(2)
-       subsizes(3) = decomp%ysz(3)
-       starts(1) = decomp%yst(1)-1
-       starts(2) = decomp%yst(2)-1
-       starts(3) = decomp%yst(3)-1
-    else if (ipencil == 3) then
-       subsizes(1) = decomp%zsz(1)
-       subsizes(2) = decomp%zsz(2)
-       subsizes(3) = decomp%zsz(3)
-       starts(1) = decomp%zst(1)-1
-       starts(2) = decomp%zst(2)-1
-       starts(3) = decomp%zst(3)-1
-    endif
-
-    associate(fh => fh_registry(idx), &
-         disp => fh_disp(idx))
-      call MPI_TYPE_CREATE_SUBARRAY(3, sizes, subsizes, starts,  &
-           MPI_ORDER_FORTRAN, data_type, newtype, ierror)
-      if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_CREATE_SUBARRAY")
-      call MPI_TYPE_COMMIT(newtype,ierror)
-      if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_COMMIT")
-      call MPI_FILE_SET_VIEW(fh,disp,data_type, &
-           newtype,'native',MPI_INFO_NULL,ierror)
-      if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_SET_VIEW")
+      opened_new = .false.
+      if (idx < 1) then
+         ! Check file exists
+         full_io_name = trim(dirname)//"/"//trim(varname)
+         if (nrank == 0) then
+            inquire (file=full_io_name, exist=dir_exists)
+            if (.not. dir_exists) then
+               print *, "ERROR: cannot read from", full_io_name, " directory doesn't exist!"
+               stop
+            end if
+         end if
+
+         call decomp_2d_open_io(io_name, full_io_name, decomp_2d_read_mode)
+         idx = get_io_idx(io_name, full_io_name)
+         opened_new = .true.
+      else
+         full_io_name = "" ! Ensure string is not unset
+      end if
+
+      if (present(reduce_prec)) then
+         if (.not. reduce_prec) then
+            read_reduce_prec = .false.
+         end if
+      end if
       if (read_reduce_prec) then
-         allocate (varsingle(xstV(1):xenV(1),xstV(2):xenV(2),xstV(3):xenV(3)))
-         call MPI_FILE_READ_ALL(fh, varsingle, &
-              subsizes(1)*subsizes(2)*subsizes(3), &
-              data_type, MPI_STATUS_IGNORE, ierror)
-         if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_READ_ALL")
-         var = real(varsingle,mytype)
-         deallocate(varsingle)
+         data_type = real_type_single
+      else
+         data_type = real_type
+      end if
+      call MPI_TYPE_SIZE(data_type, disp_bytes, ierror)
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_SIZE")
+
+      if (present(opt_decomp)) then
+         decomp = opt_decomp
+      else
+         call get_decomp_info(decomp)
+      end if
+
+      ! determine subarray parameters
+      sizes(1) = decomp%xsz(1)
+      sizes(2) = decomp%ysz(2)
+      sizes(3) = decomp%zsz(3)
+
+      if (ipencil == 1) then
+         subsizes(1) = decomp%xsz(1)
+         subsizes(2) = decomp%xsz(2)
+         subsizes(3) = decomp%xsz(3)
+         starts(1) = decomp%xst(1) - 1  ! 0-based index
+         starts(2) = decomp%xst(2) - 1
+         starts(3) = decomp%xst(3) - 1
+      else if (ipencil == 2) then
+         subsizes(1) = decomp%ysz(1)
+         subsizes(2) = decomp%ysz(2)
+         subsizes(3) = decomp%ysz(3)
+         starts(1) = decomp%yst(1) - 1
+         starts(2) = decomp%yst(2) - 1
+         starts(3) = decomp%yst(3) - 1
+      else if (ipencil == 3) then
+         subsizes(1) = decomp%zsz(1)
+         subsizes(2) = decomp%zsz(2)
+         subsizes(3) = decomp%zsz(3)
+         starts(1) = decomp%zst(1) - 1
+         starts(2) = decomp%zst(2) - 1
+         starts(3) = decomp%zst(3) - 1
       else
-         call MPI_FILE_READ_ALL(fh, var, &
-              subsizes(1)*subsizes(2)*subsizes(3), &
-              data_type, MPI_STATUS_IGNORE, ierror)
-         if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_READ_ALL")
-      endif
-      call MPI_TYPE_FREE(newtype,ierror)
-      if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_FREE")
-
-      disp = disp + sizes(1) * sizes(2) * sizes(3) * disp_bytes
-    end associate
-
-    if (opened_new) then
-       call decomp_2d_close_io(io_name, full_io_name)
-       deallocate(full_io_name)
-    end if
+         call decomp_2d_abort(-1, "IO/read_one_real : Wrong value for ipencil")
+      end if
+
+      associate (fh => fh_registry(idx), &
+                 disp => fh_disp(idx))
+         call MPI_TYPE_CREATE_SUBARRAY(3, sizes, subsizes, starts, &
+                                       MPI_ORDER_FORTRAN, data_type, newtype, ierror)
+         if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_CREATE_SUBARRAY")
+         call MPI_TYPE_COMMIT(newtype, ierror)
+         if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_COMMIT")
+         call MPI_FILE_SET_VIEW(fh, disp, data_type, &
+                                newtype, 'native', MPI_INFO_NULL, ierror)
+         if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_SET_VIEW")
+         if (read_reduce_prec) then
+            allocate (varsingle(starts(1):(starts(1)+subsizes(1)), &
+                                starts(2):(starts(2)+subsizes(2)), &
+                                starts(3):(starts(3)+subsizes(3))))
+            call MPI_FILE_READ_ALL(fh, varsingle, &
+                                   subsizes(1) * subsizes(2) * subsizes(3), &
+                                   data_type, MPI_STATUS_IGNORE, ierror)
+            if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_READ_ALL")
+            var = real(varsingle, mytype)
+            deallocate (varsingle)
+         else
+            call MPI_FILE_READ_ALL(fh, var, &
+                                   subsizes(1) * subsizes(2) * subsizes(3), &
+                                   data_type, MPI_STATUS_IGNORE, ierror)
+            if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_READ_ALL")
+         end if
+         call MPI_TYPE_FREE(newtype, ierror)
+         if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_FREE")
+
+         disp = disp + int(sizes(1), kind=MPI_OFFSET_KIND) &
+                * int(sizes(2), kind=MPI_OFFSET_KIND) &
+                * int(sizes(3), kind=MPI_OFFSET_KIND) &
+                * int(disp_bytes, kind=MPI_OFFSET_KIND)
+      end associate
+
+      if (opened_new) then
+         call decomp_2d_close_io(io_name, full_io_name)
+         deallocate (full_io_name)
+      end if
 #else
-    call adios2_read_one_real(var, dirname, varname, io_name)
+      call adios2_read_one_real(var, dirname, varname, io_name)
 
-    associate(pncl => ipencil, opdcmp => opt_decomp, rdprec => reduce_prec) ! Silence unused arguments
-    end associate
+      associate (pncl => ipencil, opdcmp => opt_decomp, rdprec => reduce_prec) ! Silence unused arguments
+      end associate
 #endif
 
 #ifdef PROFILER
-    if (decomp_profiler_io) call decomp_profiler_end("io_read_one_real")
+      if (decomp_profiler_io) call decomp_profiler_end("io_read_one_real")
 #endif
 
-  end subroutine read_one_real
-
+   end subroutine read_one_real
 
-  subroutine read_one_complex(ipencil,var,filename,opt_decomp)
+   subroutine read_one_complex(ipencil, var, filename, opt_decomp)
 
-    implicit none
+      implicit none
 
-    integer, intent(IN) :: ipencil
-    complex(mytype), dimension(:,:,:), intent(INOUT) :: var
-    character(len=*), intent(IN) :: filename
-    TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
+      integer, intent(IN) :: ipencil
+      complex(mytype), contiguous, dimension(:, :, :), intent(INOUT) :: var
+      character(len=*), intent(IN) :: filename
+      TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
 
-    TYPE(DECOMP_INFO) :: decomp
-    integer(kind=MPI_OFFSET_KIND) :: disp
-    integer, dimension(3) :: sizes, subsizes, starts
-    integer :: ierror, newtype, fh, data_type
+      TYPE(DECOMP_INFO) :: decomp
+      integer(kind=MPI_OFFSET_KIND) :: disp
+      integer, dimension(3) :: sizes, subsizes, starts
+      integer :: ierror, newtype, fh, data_type
 
 #ifdef PROFILER
-    if (decomp_profiler_io) call decomp_profiler_start("io_read_one_cplx")
+      if (decomp_profiler_io) call decomp_profiler_start("io_read_one_cplx")
 #endif
 
-    data_type = complex_type
+      data_type = complex_type
 
 #include "io_read_one.inc"
 
 #ifdef PROFILER
-    if (decomp_profiler_io) call decomp_profiler_end("io_read_one_cplx")
+      if (decomp_profiler_io) call decomp_profiler_end("io_read_one_cplx")
 #endif
 
-  end subroutine read_one_complex
+   end subroutine read_one_complex
 
 #ifdef ADIOS2
-  subroutine adios2_read_one_real(var,engine_name,varname,io_name)
+   subroutine adios2_read_one_real(var, engine_name, varname, io_name)
 
-    implicit none
+      implicit none
 
-    character(len=*), intent(in) :: engine_name
-    character(len=*), intent(in) :: io_name
-    character*(*), intent(in) :: varname
-    real(mytype), dimension(:,:,:), intent(out) :: var
+      character(len=*), intent(in) :: engine_name
+      character(len=*), intent(in) :: io_name
+      character(len=*), intent(in) :: varname
+      real(mytype), contiguous, dimension(:, :, :), intent(out) :: var
 
-    integer :: ierror
-    type(adios2_io) :: io_handle
-    type(adios2_variable) :: var_handle
-    integer :: idx
+      integer :: ierror
+      type(adios2_io) :: io_handle
+      type(adios2_variable) :: var_handle
+      integer :: idx
 
-    integer(kind=8) :: nsteps
-    integer(kind=8) :: curstep
+      integer(kind=8) :: nsteps
+      integer(kind=8) :: curstep
 
 #ifdef PROFILER
-    if (decomp_profiler_io) call decomp_profiler_start("adios2_read_one_real")
-#endif
-
-    call adios2_at_io(io_handle, adios, io_name, ierror)
-    if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "adios2_at_io "//trim(io_name))
-    call adios2_inquire_variable(var_handle, io_handle, varname, ierror)
-    if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "adios2_inquire_variable "//trim(varname))
-    if (.not.var_handle % valid) call decomp_2d_abort(__FILE__, __LINE__, -1, &
-       "ERROR: trying to read variable without registering first! "//trim(varname))
-
-    call adios2_variable_steps(nsteps, var_handle, ierror)
-    if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "adios2_variable_steps")
-    print *, "AVAILABLE steps for ", nsteps
-    
-    print *, "IO_NAME: ", io_name
-    print *, "ENGINE_NAME: ", engine_name
-    print *, "VAR_NAME: ", varname
-    idx = get_io_idx(io_name, engine_name)
-    print *, idx
-    call adios2_get(engine_registry(idx), var_handle, var, adios2_mode_deferred, ierror)
-    if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "adios2_get")
-
-    print *, "MAX: ", maxval(var)
-    
-    call adios2_current_step(curstep, engine_registry(idx), ierror)
-    if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "adios2_current_step")
-    print *, "Current step: ", curstep
+      if (decomp_profiler_io) call decomp_profiler_start("adios2_read_one_real")
+#endif
+
+      call adios2_at_io(io_handle, adios, io_name, ierror)
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "adios2_at_io "//trim(io_name))
+      call adios2_inquire_variable(var_handle, io_handle, varname, ierror)
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "adios2_inquire_variable "//trim(varname))
+      if (.not. var_handle%valid) then
+         call decomp_2d_abort(__FILE__, __LINE__, -1, &
+                              "ERROR: trying to read variable without registering first! "//trim(varname))
+      end if
+
+      call adios2_variable_steps(nsteps, var_handle, ierror)
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "adios2_variable_steps")
+      ! print *, "AVAILABLE steps for ", nsteps
+
+      ! print *, "IO_NAME: ", io_name
+      ! print *, "ENGINE_NAME: ", engine_name
+      ! print *, "VAR_NAME: ", varname
+      idx = get_io_idx(io_name, engine_name)
+      ! print *, idx
+      call adios2_get(engine_registry(idx), var_handle, var, adios2_mode_deferred, ierror)
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "adios2_get")
+
+      ! print *, "MAX: ", maxval(var)
+
+      call adios2_current_step(curstep, engine_registry(idx), ierror)
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "adios2_current_step")
+      ! print *, "Current step: ", curstep
 
 #ifdef PROFILER
-    if (decomp_profiler_io) call decomp_profiler_end("adios2_read_one_real")
+      if (decomp_profiler_io) call decomp_profiler_end("adios2_read_one_real")
 #endif
-    
-  end subroutine adios2_read_one_real
+
+   end subroutine adios2_read_one_real
 #endif
-  
+
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Write a 3D array as part of a big MPI-IO file, starting from 
-  !  displacement 'disp'; 'disp' will be updated after the writing
-  !  operation to prepare the writing of next chunk of data.
+   ! Write a 3D array as part of a big MPI-IO file, starting from
+   !  displacement 'disp'; 'disp' will be updated after the writing
+   !  operation to prepare the writing of next chunk of data.
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine write_var_real(fh,disp,ipencil,var,opt_decomp)
+   subroutine write_var_real(fh, disp, ipencil, var, opt_decomp)
 
-    implicit none
+      implicit none
 
-    integer, intent(IN) :: fh
-    integer(KIND=MPI_OFFSET_KIND), intent(INOUT) :: disp
-    integer, intent(IN) :: ipencil
-    real(mytype), dimension(:,:,:), intent(IN) :: var
-    TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
+      integer, intent(IN) :: fh
+      integer(KIND=MPI_OFFSET_KIND), intent(INOUT) :: disp
+      integer, intent(IN) :: ipencil
+      real(mytype), contiguous, dimension(:, :, :), intent(IN) :: var
+      TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
 
-    TYPE(DECOMP_INFO) :: decomp
-    integer, dimension(3) :: sizes, subsizes, starts
-    integer :: ierror, newtype, data_type
+      TYPE(DECOMP_INFO) :: decomp
+      integer, dimension(3) :: sizes, subsizes, starts
+      integer :: ierror, newtype, data_type
 
-    data_type = real_type
+      data_type = real_type
 
 #include "io_write_var.inc"
 
-    return
-  end subroutine write_var_real
+      return
+   end subroutine write_var_real
 
-  subroutine write_var_complex(fh,disp,ipencil,var,opt_decomp)
+   subroutine write_var_complex(fh, disp, ipencil, var, opt_decomp)
 
-    implicit none
+      implicit none
 
-    integer, intent(IN) :: fh
-    integer(KIND=MPI_OFFSET_KIND), intent(INOUT) :: disp
-    integer, intent(IN) :: ipencil
-    complex(mytype), dimension(:,:,:), intent(IN) :: var
-    TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
+      integer, intent(IN) :: fh
+      integer(KIND=MPI_OFFSET_KIND), intent(INOUT) :: disp
+      integer, intent(IN) :: ipencil
+      complex(mytype), contiguous, dimension(:, :, :), intent(IN) :: var
+      TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
 
-    TYPE(DECOMP_INFO) :: decomp
-    integer, dimension(3) :: sizes, subsizes, starts
-    integer :: ierror, newtype, data_type
+      TYPE(DECOMP_INFO) :: decomp
+      integer, dimension(3) :: sizes, subsizes, starts
+      integer :: ierror, newtype, data_type
 
-    data_type = complex_type
+      data_type = complex_type
 
 #include "io_write_var.inc"
 
-    return
-  end subroutine write_var_complex
-
+      return
+   end subroutine write_var_complex
 
-  subroutine write_outflow(dirname,varname,ntimesteps,var,io_name,opt_decomp)
+   subroutine write_outflow(dirname, varname, ntimesteps, var, io_name, opt_decomp)
 
-    implicit none
+      implicit none
 
-    character(len=*), intent(in) :: dirname, varname, io_name
-    integer, intent(IN) :: ntimesteps 
-    real(mytype), dimension(:,:,:), intent(IN) :: var
-    TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
+      character(len=*), intent(in) :: dirname, varname, io_name
+      integer, intent(IN) :: ntimesteps
+      real(mytype), contiguous, dimension(:, :, :), intent(IN) :: var
+      TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
 
-    TYPE(DECOMP_INFO) :: decomp
-    integer, dimension(3) :: sizes, subsizes, starts
-    integer :: ierror, data_type
-    integer :: idx
+      TYPE(DECOMP_INFO) :: decomp
+      integer, dimension(3) :: sizes, subsizes, starts
+      integer :: ierror, data_type
+      integer :: idx
 #ifdef ADIOS2
-    type(adios2_io) :: io_handle
-    type(adios2_variable) :: var_handle
+      type(adios2_io) :: io_handle
+      type(adios2_variable) :: var_handle
 #else
-    integer :: newtype
+      integer :: newtype
 #endif
 
 #ifdef PROFILER
-    if (decomp_profiler_io) call decomp_profiler_start("io_write_outflow")
+      if (decomp_profiler_io) call decomp_profiler_start("io_write_outflow")
 #endif
 
-    data_type = real_type
+      data_type = real_type
 
 #include "io_write_outflow.f90"
 
 #ifdef PROFILER
-    if (decomp_profiler_io) call decomp_profiler_end("io_write_outflow")
+      if (decomp_profiler_io) call decomp_profiler_end("io_write_outflow")
 #endif
 
-  end subroutine write_outflow
-
+   end subroutine write_outflow
 
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Read a 3D array as part of a big MPI-IO file, starting from 
-  !  displacement 'disp'; 'disp' will be updated after the reading
-  !  operation to prepare the reading of next chunk of data.
+   ! Read a 3D array as part of a big MPI-IO file, starting from
+   !  displacement 'disp'; 'disp' will be updated after the reading
+   !  operation to prepare the reading of next chunk of data.
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine read_var_real(fh,disp,ipencil,var,opt_decomp)
+   subroutine read_var_real(fh, disp, ipencil, var, opt_decomp)
 
-    implicit none
+      implicit none
 
-    integer, intent(IN) :: fh
-    integer(KIND=MPI_OFFSET_KIND), intent(INOUT) :: disp
-    integer, intent(IN) :: ipencil
-    real(mytype), dimension(:,:,:), intent(INOUT) :: var
-    TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
+      integer, intent(IN) :: fh
+      integer(KIND=MPI_OFFSET_KIND), intent(INOUT) :: disp
+      integer, intent(IN) :: ipencil
+      real(mytype), contiguous, dimension(:, :, :), intent(INOUT) :: var
+      TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
 
-    TYPE(DECOMP_INFO) :: decomp
-    integer, dimension(3) :: sizes, subsizes, starts
-    integer :: ierror, newtype, data_type
+      TYPE(DECOMP_INFO) :: decomp
+      integer, dimension(3) :: sizes, subsizes, starts
+      integer :: ierror, newtype, data_type
 
-    data_type = real_type
+      data_type = real_type
 
 #include "io_read_var.inc"
 
-    return
-  end subroutine read_var_real
-
+      return
+   end subroutine read_var_real
 
-  subroutine read_var_complex(fh,disp,ipencil,var,opt_decomp)
+   subroutine read_var_complex(fh, disp, ipencil, var, opt_decomp)
 
-    implicit none
+      implicit none
 
-    integer, intent(IN) :: fh
-    integer(KIND=MPI_OFFSET_KIND), intent(INOUT) :: disp
-    integer, intent(IN) :: ipencil
-    complex(mytype), dimension(:,:,:), intent(INOUT) :: var
-    TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
+      integer, intent(IN) :: fh
+      integer(KIND=MPI_OFFSET_KIND), intent(INOUT) :: disp
+      integer, intent(IN) :: ipencil
+      complex(mytype), contiguous, dimension(:, :, :), intent(INOUT) :: var
+      TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
 
-    TYPE(DECOMP_INFO) :: decomp
-    integer, dimension(3) :: sizes, subsizes, starts
-    integer :: ierror, newtype, data_type
+      TYPE(DECOMP_INFO) :: decomp
+      integer, dimension(3) :: sizes, subsizes, starts
+      integer :: ierror, newtype, data_type
 
-    data_type = complex_type
+      data_type = complex_type
 
 #include "io_read_var.inc"
 
-    return
-  end subroutine read_var_complex
-
+      return
+   end subroutine read_var_complex
 
-  subroutine read_inflow(dirname,varname,ntimesteps,var,io_name,opt_decomp)
+   subroutine read_inflow(dirname, varname, ntimesteps, var, io_name, opt_decomp)
 
-    implicit none
+      implicit none
 
-    character(len=*), intent(in) :: dirname, varname, io_name
-    integer, intent(IN) :: ntimesteps
-    real(mytype), dimension(:,:,:), intent(INOUT) :: var
-    TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
+      character(len=*), intent(in) :: dirname, varname, io_name
+      integer, intent(IN) :: ntimesteps
+      real(mytype), contiguous, dimension(:, :, :), intent(INOUT) :: var
+      TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
 
-    TYPE(DECOMP_INFO) :: decomp
-    integer, dimension(3) :: sizes, subsizes, starts
-    integer :: ierror, data_type
-    integer :: idx
+      TYPE(DECOMP_INFO) :: decomp
+      integer, dimension(3) :: sizes, subsizes, starts
+      integer :: ierror, data_type
+      integer :: idx
 #ifdef ADIOS2
-    type(adios2_io) :: io_handle
-    type(adios2_variable) :: var_handle
+      type(adios2_io) :: io_handle
+      type(adios2_variable) :: var_handle
 #else
-    integer :: newtype
+      integer :: newtype
 #endif
 
 #ifdef PROFILER
-    if (decomp_profiler_io) call decomp_profiler_start("io_read_inflow")
+      if (decomp_profiler_io) call decomp_profiler_start("io_read_inflow")
 #endif
 
-    data_type = real_type
+      data_type = real_type
 
 #include "io_read_inflow.f90"
 
 #ifdef PROFILER
-    if (decomp_profiler_io) call decomp_profiler_end("io_read_inflow")
+      if (decomp_profiler_io) call decomp_profiler_end("io_read_inflow")
 #endif
 
-  end subroutine read_inflow
+   end subroutine read_inflow
 
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Write scalar variables as part of a big MPI-IO file, starting from 
-  !  displacement 'disp'; 'disp' will be updated after the reading
-  !  operation to prepare the reading of next chunk of data.
+   ! Write scalar variables as part of a big MPI-IO file, starting from
+   !  displacement 'disp'; 'disp' will be updated after the reading
+   !  operation to prepare the reading of next chunk of data.
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine write_scalar_real(fh,disp,n,var)
+   subroutine write_scalar_real(fh, disp, n, var)
 
-    implicit none
+      implicit none
 
-    integer, intent(IN) :: fh             ! file handle
-    integer(KIND=MPI_OFFSET_KIND), &
+      integer, intent(IN) :: fh             ! file handle
+      integer(KIND=MPI_OFFSET_KIND), &
          intent(INOUT) :: disp            ! displacement
-    integer, intent(IN) :: n              ! number of scalars
-    real(mytype), dimension(n), &
+      integer, intent(IN) :: n              ! number of scalars
+      real(mytype), dimension(n), &
          intent(IN) :: var                ! array of scalars
 
-    integer :: m, ierror
-
-    call MPI_FILE_SET_VIEW(fh,disp,real_type, &
-         real_type,'native',MPI_INFO_NULL,ierror)
-    if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_SET_VIEW")
-    if (nrank==0) then
-       m = n ! only one rank needs to write
-    else
-       m = 0
-    end if
-    call MPI_FILE_WRITE_ALL(fh, var, m, real_type, &
-         MPI_STATUS_IGNORE, ierror)
-    if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_WRITE_ALL")
-    disp = disp + n*mytype_bytes
-
-    return
-  end subroutine write_scalar_real
-
-
-  subroutine write_scalar_complex(fh,disp,n,var)
-
-    implicit none
-
-    integer, intent(IN) :: fh
-    integer(KIND=MPI_OFFSET_KIND), intent(INOUT) :: disp
-    integer, intent(IN) :: n
-    complex(mytype), dimension(n), intent(IN) :: var
-
-    integer :: m, ierror
-
-    call MPI_FILE_SET_VIEW(fh,disp,complex_type, &
-         complex_type,'native',MPI_INFO_NULL,ierror)
-    if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_SET_VIEW")
-    if (nrank==0) then
-       m = n
-    else
-       m = 0
-    end if
-    call MPI_FILE_WRITE_ALL(fh, var, m, complex_type, &
-         MPI_STATUS_IGNORE, ierror)
-    if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_WRITE_ALL")
-    disp = disp + n*mytype_bytes*2
-
-    return
-  end subroutine write_scalar_complex
-
-
-  subroutine write_scalar_integer(fh,disp,n,var)
-
-    implicit none
-
-    integer, intent(IN) :: fh
-    integer(KIND=MPI_OFFSET_KIND), intent(INOUT) :: disp
-    integer, intent(IN) :: n
-    integer, dimension(n), intent(IN) :: var
-
-    integer :: m, ierror
-
-    call MPI_FILE_SET_VIEW(fh,disp,MPI_INTEGER, &
-         MPI_INTEGER,'native',MPI_INFO_NULL,ierror)
-    if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_SET_VIEW")
-    if (nrank==0) then
-       m = n
-    else
-       m = 0
-    end if
-    call MPI_FILE_WRITE_ALL(fh, var, m, MPI_INTEGER, &
-         MPI_STATUS_IGNORE, ierror)
-    if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_WRITE_ALL")
-    call MPI_TYPE_SIZE(MPI_INTEGER,m,ierror)
-    if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_SIZE")
-    disp = disp + n*m
-
-    return
-  end subroutine write_scalar_integer
-
-
-  subroutine write_scalar_logical(fh,disp,n,var)
-
-    implicit none
-
-    integer, intent(IN) :: fh
-    integer(KIND=MPI_OFFSET_KIND), intent(INOUT) :: disp
-    integer, intent(IN) :: n
-    logical, dimension(n), intent(IN) :: var
-
-    integer :: m, ierror
-
-    call MPI_FILE_SET_VIEW(fh,disp,MPI_LOGICAL, &
-         MPI_LOGICAL,'native',MPI_INFO_NULL,ierror)
-    if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_SET_VIEW")
-    if (nrank==0) then
-       m = n
-    else
-       m = 0
-    end if
-    call MPI_FILE_WRITE_ALL(fh, var, m, MPI_LOGICAL, &
-         MPI_STATUS_IGNORE, ierror)
-    if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_WRITE_ALL")
-    call MPI_TYPE_SIZE(MPI_LOGICAL,m,ierror)
-    if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_SIZE")
-    disp = disp + n*m
-
-    return
-  end subroutine write_scalar_logical
+      integer :: m, ierror
+
+      call MPI_FILE_SET_VIEW(fh, disp, real_type, &
+                             real_type, 'native', MPI_INFO_NULL, ierror)
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_SET_VIEW")
+      if (nrank == 0) then
+         m = n ! only one rank needs to write
+      else
+         m = 0
+      end if
+      call MPI_FILE_WRITE_ALL(fh, var, m, real_type, &
+                              MPI_STATUS_IGNORE, ierror)
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_WRITE_ALL")
+      disp = disp + int(n, kind=MPI_OFFSET_KIND) &
+             * int(mytype_bytes, kind=MPI_OFFSET_KIND)
+
+      return
+   end subroutine write_scalar_real
+
+   subroutine write_scalar_complex(fh, disp, n, var)
+
+      implicit none
 
+      integer, intent(IN) :: fh
+      integer(KIND=MPI_OFFSET_KIND), intent(INOUT) :: disp
+      integer, intent(IN) :: n
+      complex(mytype), dimension(n), intent(IN) :: var
+
+      integer :: m, ierror
+
+      call MPI_FILE_SET_VIEW(fh, disp, complex_type, &
+                             complex_type, 'native', MPI_INFO_NULL, ierror)
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_SET_VIEW")
+      if (nrank == 0) then
+         m = n
+      else
+         m = 0
+      end if
+      call MPI_FILE_WRITE_ALL(fh, var, m, complex_type, &
+                              MPI_STATUS_IGNORE, ierror)
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_WRITE_ALL")
+      disp = disp + int(n, kind=MPI_OFFSET_KIND) &
+             * int(mytype_bytes, kind=MPI_OFFSET_KIND) &
+             * 2_MPI_OFFSET_KIND
+
+      return
+   end subroutine write_scalar_complex
+
+   subroutine write_scalar_integer(fh, disp, n, var)
+
+      implicit none
+
+      integer, intent(IN) :: fh
+      integer(KIND=MPI_OFFSET_KIND), intent(INOUT) :: disp
+      integer, intent(IN) :: n
+      integer, dimension(n), intent(IN) :: var
+
+      integer :: m, ierror
+
+      call MPI_FILE_SET_VIEW(fh, disp, MPI_INTEGER, &
+                             MPI_INTEGER, 'native', MPI_INFO_NULL, ierror)
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_SET_VIEW")
+      if (nrank == 0) then
+         m = n
+      else
+         m = 0
+      end if
+      call MPI_FILE_WRITE_ALL(fh, var, m, MPI_INTEGER, &
+                              MPI_STATUS_IGNORE, ierror)
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_WRITE_ALL")
+      call MPI_TYPE_SIZE(MPI_INTEGER, m, ierror)
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_SIZE")
+      disp = disp + int(n, kind=MPI_OFFSET_KIND) &
+             * int(m, kind=MPI_OFFSET_KIND)
+
+      return
+   end subroutine write_scalar_integer
+
+   subroutine write_scalar_logical(fh, disp, n, var)
+
+      implicit none
+
+      integer, intent(IN) :: fh
+      integer(KIND=MPI_OFFSET_KIND), intent(INOUT) :: disp
+      integer, intent(IN) :: n
+      logical, dimension(n), intent(IN) :: var
+
+      integer :: m, ierror
+
+      call MPI_FILE_SET_VIEW(fh, disp, MPI_LOGICAL, &
+                             MPI_LOGICAL, 'native', MPI_INFO_NULL, ierror)
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_SET_VIEW")
+      if (nrank == 0) then
+         m = n
+      else
+         m = 0
+      end if
+      call MPI_FILE_WRITE_ALL(fh, var, m, MPI_LOGICAL, &
+                              MPI_STATUS_IGNORE, ierror)
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_WRITE_ALL")
+      call MPI_TYPE_SIZE(MPI_LOGICAL, m, ierror)
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_SIZE")
+      disp = disp + int(n, kind=MPI_OFFSET_KIND) &
+             * int(m, kind=MPI_OFFSET_KIND)
+
+      return
+   end subroutine write_scalar_logical
 
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Read scalar variables as part of a big MPI-IO file, starting from 
-  !  displacement 'disp'; 'disp' will be updated after the reading
-  !  operation to prepare the reading of next chunk of data.
+   ! Read scalar variables as part of a big MPI-IO file, starting from
+   !  displacement 'disp'; 'disp' will be updated after the reading
+   !  operation to prepare the reading of next chunk of data.
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine read_scalar_real(fh,disp,n,var)
+   subroutine read_scalar_real(fh, disp, n, var)
 
-    implicit none
+      implicit none
 
-    integer, intent(IN) :: fh             ! file handle
-    integer(KIND=MPI_OFFSET_KIND), &
+      integer, intent(IN) :: fh             ! file handle
+      integer(KIND=MPI_OFFSET_KIND), &
          intent(INOUT) :: disp            ! displacement
-    integer, intent(IN) :: n              ! number of scalars
-    real(mytype), dimension(n), &
+      integer, intent(IN) :: n              ! number of scalars
+      real(mytype), dimension(n), &
          intent(INOUT) :: var             ! array of scalars
 
-    integer :: ierror
+      integer :: ierror
 
-    call MPI_FILE_SET_VIEW(fh,disp,real_type, &
-         real_type,'native',MPI_INFO_NULL,ierror)
-    if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_SET_VIEW")
-    call MPI_FILE_READ_ALL(fh, var, n, real_type, &
-         MPI_STATUS_IGNORE, ierror)
-    if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_READ_ALL")
-    disp = disp + n*mytype_bytes
+      call MPI_FILE_SET_VIEW(fh, disp, real_type, &
+                             real_type, 'native', MPI_INFO_NULL, ierror)
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_SET_VIEW")
+      call MPI_FILE_READ_ALL(fh, var, n, real_type, &
+                             MPI_STATUS_IGNORE, ierror)
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_READ_ALL")
+      disp = disp + int(n, kind=MPI_OFFSET_KIND) &
+             * int(mytype_bytes, kind=MPI_OFFSET_KIND)
 
-    return
-  end subroutine read_scalar_real
+      return
+   end subroutine read_scalar_real
 
+   subroutine read_scalar_complex(fh, disp, n, var)
 
-  subroutine read_scalar_complex(fh,disp,n,var)
+      implicit none
 
-    implicit none
+      integer, intent(IN) :: fh
+      integer(KIND=MPI_OFFSET_KIND), intent(INOUT) :: disp
+      integer, intent(IN) :: n
+      complex(mytype), dimension(n), intent(INOUT) :: var
 
-    integer, intent(IN) :: fh
-    integer(KIND=MPI_OFFSET_KIND), intent(INOUT) :: disp
-    integer, intent(IN) :: n
-    complex(mytype), dimension(n), intent(INOUT) :: var
+      integer :: ierror
 
-    integer :: ierror
+      call MPI_FILE_SET_VIEW(fh, disp, complex_type, &
+                             complex_type, 'native', MPI_INFO_NULL, ierror)
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_SET_VIEW")
+      call MPI_FILE_READ_ALL(fh, var, n, complex_type, &
+                             MPI_STATUS_IGNORE, ierror)
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_READ_ALL")
+      disp = disp + int(n, kind=MPI_OFFSET_KIND) &
+             * int(mytype_bytes, kind=MPI_OFFSET_KIND) &
+             * 2_MPI_OFFSET_KIND
 
-    call MPI_FILE_SET_VIEW(fh,disp,complex_type, &
-         complex_type,'native',MPI_INFO_NULL,ierror)
-    if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_SET_VIEW")
-    call MPI_FILE_READ_ALL(fh, var, n, complex_type, &
-         MPI_STATUS_IGNORE, ierror)
-    if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_READ_ALL")
-    disp = disp + n*mytype_bytes*2
+      return
+   end subroutine read_scalar_complex
 
-    return
-  end subroutine read_scalar_complex
+   subroutine read_scalar_integer(fh, disp, n, var)
 
+      implicit none
 
-  subroutine read_scalar_integer(fh,disp,n,var)
+      integer, intent(IN) :: fh
+      integer(KIND=MPI_OFFSET_KIND), intent(INOUT) :: disp
+      integer, intent(IN) :: n
+      integer, dimension(n), intent(INOUT) :: var
 
-    implicit none
+      integer :: m, ierror
 
-    integer, intent(IN) :: fh
-    integer(KIND=MPI_OFFSET_KIND), intent(INOUT) :: disp
-    integer, intent(IN) :: n
-    integer, dimension(n), intent(INOUT) :: var
+      call MPI_FILE_SET_VIEW(fh, disp, MPI_INTEGER, &
+                             MPI_INTEGER, 'native', MPI_INFO_NULL, ierror)
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_SET_VIEW")
+      call MPI_FILE_READ_ALL(fh, var, n, MPI_INTEGER, &
+                             MPI_STATUS_IGNORE, ierror)
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_READ_ALL")
+      call MPI_TYPE_SIZE(MPI_INTEGER, m, ierror)
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_SIZE")
+      disp = disp + int(n, kind=MPI_OFFSET_KIND) &
+             * int(m, kind=MPI_OFFSET_KIND)
 
-    integer :: m, ierror
+      return
+   end subroutine read_scalar_integer
 
-    call MPI_FILE_SET_VIEW(fh,disp,MPI_INTEGER, &
-         MPI_INTEGER,'native',MPI_INFO_NULL,ierror)
-    if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_SET_VIEW")
-    call MPI_FILE_READ_ALL(fh, var, n, MPI_INTEGER, &
-         MPI_STATUS_IGNORE, ierror)
-    if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_READ_ALL")
-    call MPI_TYPE_SIZE(MPI_INTEGER,m,ierror)
-    if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_SIZE")
-    disp = disp + n*m
+   subroutine read_scalar_logical(fh, disp, n, var)
 
-    return
-  end subroutine read_scalar_integer
+      implicit none
 
+      integer, intent(IN) :: fh
+      integer(KIND=MPI_OFFSET_KIND), intent(INOUT) :: disp
+      integer, intent(IN) :: n
+      logical, dimension(n), intent(INOUT) :: var
 
-  subroutine read_scalar_logical(fh,disp,n,var)
+      integer :: m, ierror
 
-    implicit none
+      call MPI_FILE_SET_VIEW(fh, disp, MPI_LOGICAL, &
+                             MPI_LOGICAL, 'native', MPI_INFO_NULL, ierror)
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_SET_VIEW")
+      call MPI_FILE_READ_ALL(fh, var, n, MPI_LOGICAL, &
+                             MPI_STATUS_IGNORE, ierror)
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_READ_ALL")
+      call MPI_TYPE_SIZE(MPI_LOGICAL, m, ierror)
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_SIZE")
+      disp = disp + int(n, kind=MPI_OFFSET_KIND) &
+             * int(m, kind=MPI_OFFSET_KIND)
 
-    integer, intent(IN) :: fh
-    integer(KIND=MPI_OFFSET_KIND), intent(INOUT) :: disp
-    integer, intent(IN) :: n
-    logical, dimension(n), intent(INOUT) :: var
+      return
+   end subroutine read_scalar_logical
 
-    integer :: m, ierror
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+   ! Write a 2D slice of the 3D data to a file
+!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+   subroutine plane_extents(sizes, subsizes, starts, iplane, opt_decomp, opt_nplanes)
 
-    call MPI_FILE_SET_VIEW(fh,disp,MPI_LOGICAL, &
-         MPI_LOGICAL,'native',MPI_INFO_NULL,ierror)
-    if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_SET_VIEW")
-    call MPI_FILE_READ_ALL(fh, var, n, MPI_LOGICAL, &
-         MPI_STATUS_IGNORE, ierror)
-    if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_READ_ALL")
-    call MPI_TYPE_SIZE(MPI_LOGICAL,m,ierror)
-    if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_SIZE")
-    disp = disp + n*m
+      integer, intent(in) :: iplane
+      type(decomp_info), intent(in), optional :: opt_decomp
+      integer, intent(in), optional :: opt_nplanes
 
-    return
-  end subroutine read_scalar_logical
+      integer, dimension(3), intent(out) :: sizes, subsizes, starts
 
+      integer :: nplanes
+      type(decomp_info) :: decomp
 
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Write a 2D slice of the 3D data to a file
-!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine plane_extents (sizes, subsizes, starts, iplane, opt_decomp, opt_nplanes)
-
-    integer, intent(in) :: iplane
-    type(decomp_info), intent(in), optional :: opt_decomp
-    integer, intent(in), optional :: opt_nplanes
-    
-    integer, dimension(3), intent(out) :: sizes, subsizes, starts
-
-    integer :: nplanes
-    type(decomp_info) :: decomp
-
-    if (present(opt_decomp)) then
-       decomp = opt_decomp
-    else
-       call get_decomp_info(decomp)
-    end if
-
-    if (present(opt_nplanes)) then
-       nplanes = opt_nplanes
-    else
-       nplanes = 1
-    end if
-    
-    if (iplane == 1) then
-       sizes(1) = nplanes
-       sizes(2) = decomp%ysz(2)
-       sizes(3) = decomp%zsz(3)
-       subsizes(1) = nplanes
-       subsizes(2) = decomp%xsz(2)
-       subsizes(3) = decomp%xsz(3)
-       starts(1) = 0
-       starts(2) = decomp%xst(2)-1
-       starts(3) = decomp%xst(3)-1
-    else if (iplane == 2) then
-       sizes(1) = decomp%xsz(1)
-       sizes(2) = nplanes
-       sizes(3) = decomp%zsz(3)
-       subsizes(1) = decomp%ysz(1)
-       subsizes(2) = nplanes
-       subsizes(3) = decomp%ysz(3)
-       starts(1) = decomp%yst(1)-1
-       starts(2) = 0
-       starts(3) = decomp%yst(3)-1
-    else if (iplane == 3) then
-       sizes(1) = decomp%xsz(1)
-       sizes(2) = decomp%ysz(2)
-       sizes(3) = nplanes
-       subsizes(1) = decomp%zsz(1)
-       subsizes(2) = decomp%zsz(2)
-       subsizes(3) = nplanes
-       starts(1) = decomp%zst(1)-1
-       starts(2) = decomp%zst(2)-1
-       starts(3) = 0
-    else
-       print *, "Can't work with plane ", iplane
-       stop
-    endif
-    
-  end subroutine plane_extents
-
-  subroutine write_plane_3d_real(ipencil,var,iplane,n,dirname,varname,io_name, &
-       opt_decomp)
-
-    implicit none
-
-    integer, intent(IN) :: ipencil !(x-pencil=1; y-pencil=2; z-pencil=3)
-    real(mytype), dimension(:,:,:), intent(IN) :: var
-    integer, intent(IN) :: iplane !(x-plane=1; y-plane=2; z-plane=3)
-    integer, intent(IN) :: n ! which plane to write (global coordinate)
-    character(len=*), intent(IN) :: dirname,varname,io_name
-    TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
-
-    real(mytype), allocatable, dimension(:,:,:) :: wk, wk2
-    real(mytype), allocatable, dimension(:,:,:) :: wk2d
-    TYPE(DECOMP_INFO) :: decomp
-    integer :: i,j,k, ierror, data_type
-
-    logical :: opened_new
-    integer :: idx
+      if (present(opt_decomp)) then
+         decomp = opt_decomp
+      else
+         call get_decomp_info(decomp)
+      end if
+
+      if (present(opt_nplanes)) then
+         nplanes = opt_nplanes
+      else
+         nplanes = 1
+      end if
+
+      if (iplane == 1) then
+         sizes(1) = nplanes
+         sizes(2) = decomp%ysz(2)
+         sizes(3) = decomp%zsz(3)
+         subsizes(1) = nplanes
+         subsizes(2) = decomp%xsz(2)
+         subsizes(3) = decomp%xsz(3)
+         starts(1) = 0
+         starts(2) = decomp%xst(2) - 1
+         starts(3) = decomp%xst(3) - 1
+      else if (iplane == 2) then
+         sizes(1) = decomp%xsz(1)
+         sizes(2) = nplanes
+         sizes(3) = decomp%zsz(3)
+         subsizes(1) = decomp%ysz(1)
+         subsizes(2) = nplanes
+         subsizes(3) = decomp%ysz(3)
+         starts(1) = decomp%yst(1) - 1
+         starts(2) = 0
+         starts(3) = decomp%yst(3) - 1
+      else if (iplane == 3) then
+         sizes(1) = decomp%xsz(1)
+         sizes(2) = decomp%ysz(2)
+         sizes(3) = nplanes
+         subsizes(1) = decomp%zsz(1)
+         subsizes(2) = decomp%zsz(2)
+         subsizes(3) = nplanes
+         starts(1) = decomp%zst(1) - 1
+         starts(2) = decomp%zst(2) - 1
+         starts(3) = 0
+      else
+         print *, "Can't work with plane ", iplane
+         stop
+      end if
+
+   end subroutine plane_extents
+
+   subroutine write_plane_3d_real(ipencil, var, iplane, n, dirname, varname, io_name, &
+                                  opt_decomp)
+
+      implicit none
+
+      integer, intent(IN) :: ipencil !(x-pencil=1; y-pencil=2; z-pencil=3)
+      real(mytype), contiguous, dimension(:, :, :), intent(IN) :: var
+      integer, intent(IN) :: iplane !(x-plane=1; y-plane=2; z-plane=3)
+      integer, intent(IN) :: n ! which plane to write (global coordinate)
+      character(len=*), intent(IN) :: dirname, varname, io_name
+      TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
+
+      real(mytype), allocatable, dimension(:, :, :) :: wk, wk2
+      real(mytype), allocatable, dimension(:, :, :) :: wk2d
+      TYPE(DECOMP_INFO) :: decomp
+      integer :: i, j, k, ierror, data_type
+
+      logical :: opened_new
+      integer :: idx
 #ifdef ADIOS2
-    type(adios2_io) :: io_handle
-    type(adios2_variable) :: var_handle
+      type(adios2_io) :: io_handle
+      type(adios2_variable) :: var_handle
 #else
-    integer, dimension(3) :: sizes, subsizes, starts
-    logical :: dir_exists
-    character(len=:), allocatable :: full_io_name
-    integer :: newtype
+      integer, dimension(3) :: sizes, subsizes, starts
+      logical :: dir_exists
+      character(len=:), allocatable :: full_io_name
+      integer :: newtype
 #endif
 
 #ifdef PROFILER
-    if (decomp_profiler_io) call decomp_profiler_start("io_write_plane_3d_real")
+      if (decomp_profiler_io) call decomp_profiler_start("io_write_plane_3d_real")
 #endif
 
-    data_type = real_type
+      data_type = real_type
 
 #include "io_write_plane.inc"
 
 #ifdef PROFILER
-    if (decomp_profiler_io) call decomp_profiler_end("io_write_plane_3d_real")
+      if (decomp_profiler_io) call decomp_profiler_end("io_write_plane_3d_real")
 #endif
 
-  end subroutine write_plane_3d_real
+   end subroutine write_plane_3d_real
 
+   subroutine write_plane_3d_complex(ipencil, var, iplane, n, &
+                                     dirname, varname, io_name, opt_decomp)
 
-  subroutine write_plane_3d_complex(ipencil,var,iplane,n, &
-       dirname,varname,io_name,opt_decomp)
+      implicit none
 
-    implicit none
+      integer, intent(IN) :: ipencil !(x-pencil=1; y-pencil=2; z-pencil=3)
+      complex(mytype), contiguous, dimension(:, :, :), intent(IN) :: var
+      integer, intent(IN) :: iplane !(x-plane=1; y-plane=2; z-plane=3)
+      integer, intent(IN) :: n ! which plane to write (global coordinate)
+      character(len=*), intent(IN) :: dirname, varname, io_name
+      TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
 
-    integer, intent(IN) :: ipencil !(x-pencil=1; y-pencil=2; z-pencil=3)
-    complex(mytype), dimension(:,:,:), intent(IN) :: var
-    integer, intent(IN) :: iplane !(x-plane=1; y-plane=2; z-plane=3)
-    integer, intent(IN) :: n ! which plane to write (global coordinate)
-    character(len=*), intent(IN) :: dirname,varname,io_name
-    TYPE(DECOMP_INFO), intent(IN), optional :: opt_decomp
-
-    complex(mytype), allocatable, dimension(:,:,:) :: wk, wk2
-    complex(mytype), allocatable, dimension(:,:,:) :: wk2d
-    TYPE(DECOMP_INFO) :: decomp
-    integer :: i,j,k, ierror, data_type
-    logical :: opened_new
-    integer :: idx
+      complex(mytype), allocatable, dimension(:, :, :) :: wk, wk2
+      complex(mytype), allocatable, dimension(:, :, :) :: wk2d
+      TYPE(DECOMP_INFO) :: decomp
+      integer :: i, j, k, ierror, data_type
+      logical :: opened_new
+      integer :: idx
 #ifdef ADIOS2
-    type(adios2_io) :: io_handle
-    type(adios2_variable) :: var_handle
+      type(adios2_io) :: io_handle
+      type(adios2_variable) :: var_handle
 #else
-    integer, dimension(3) :: sizes, subsizes, starts
-    logical :: dir_exists
-    character(len=:), allocatable :: full_io_name
-    integer :: newtype
+      integer, dimension(3) :: sizes, subsizes, starts
+      logical :: dir_exists
+      character(len=:), allocatable :: full_io_name
+      integer :: newtype
 #endif
 
 #ifdef PROFILER
-    if (decomp_profiler_io) call decomp_profiler_start("io_write_plane_3d_cplx")
+      if (decomp_profiler_io) call decomp_profiler_start("io_write_plane_3d_cplx")
 #endif
 
-    data_type = complex_type
+      data_type = complex_type
 
 #include "io_write_plane.inc"
 
 #ifdef PROFILER
-    if (decomp_profiler_io) call decomp_profiler_end("io_write_plane_3d_cplx")
+      if (decomp_profiler_io) call decomp_profiler_end("io_write_plane_3d_cplx")
 #endif
 
-  end subroutine write_plane_3d_complex
-
+   end subroutine write_plane_3d_complex
 
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Write a 2D array to a file
+   ! Write a 2D array to a file
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  !************** TO DO ***************
-  !* Consider handling distributed 2D data set
-  !  subroutine write_plane_2d(ipencil,var,filename)
-  !    integer, intent(IN) :: ipencil !(x-pencil=1; y-pencil=2; z-pencil=3)
-  !    real(mytype), dimension(:,:), intent(IN) :: var ! 2D array
-  !    character(len=*), intent(IN) :: filename
-  !
-  !    if (ipencil==1) then
-  !       ! var should be defined as var(xsize(2)
-  !
-  !    else if (ipencil==2) then
-  !
-  !    else if (ipencil==3) then
-  !
-  !    end if
-  !
-  !    return
-  !  end subroutine write_plane_2d
-
+   !************** TO DO ***************
+   !* Consider handling distributed 2D data set
+   !  subroutine write_plane_2d(ipencil,var,filename)
+   !    integer, intent(IN) :: ipencil !(x-pencil=1; y-pencil=2; z-pencil=3)
+   !    real(mytype), dimension(:,:), intent(IN) :: var ! 2D array
+   !    character(len=*), intent(IN) :: filename
+   !
+   !    if (ipencil==1) then
+   !       ! var should be defined as var(xsize(2)
+   !
+   !    else if (ipencil==2) then
+   !
+   !    else if (ipencil==3) then
+   !
+   !    end if
+   !
+   !    return
+   !  end subroutine write_plane_2d
 
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Write 3D array data for every specified mesh point
+   ! Write 3D array data for every specified mesh point
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine write_every_real(ipencil,var,iskip,jskip,kskip, &
-       filename, from1)
+   subroutine write_every_real(ipencil, var, iskip, jskip, kskip, &
+                               filename, from1)
 
-    implicit none
+      implicit none
 
-    integer, intent(IN) :: ipencil !(x-pencil=1; y-pencil=2; z-pencil=3)
-    real(mytype), dimension(:,:,:), intent(IN) :: var
-    integer, intent(IN) :: iskip,jskip,kskip 
-    character(len=*), intent(IN) :: filename
-    logical, intent(IN) :: from1  ! .true.  - save 1,n+1,2n+1...
-    ! .false. - save n,2n,3n...
+      integer, intent(IN) :: ipencil !(x-pencil=1; y-pencil=2; z-pencil=3)
+      real(mytype), contiguous, dimension(:, :, :), intent(IN) :: var
+      integer, intent(IN) :: iskip, jskip, kskip
+      character(len=*), intent(IN) :: filename
+      logical, intent(IN) :: from1  ! .true.  - save 1,n+1,2n+1...
+      ! .false. - save n,2n,3n...
 
-    real(mytype), allocatable, dimension(:,:,:) :: wk, wk2
-    integer(kind=MPI_OFFSET_KIND) :: filesize, disp
-    integer, dimension(3) :: sizes, subsizes, starts
-    integer :: i,j,k, ierror, newtype, fh, key,color,newcomm, data_type
-    integer, dimension(3) :: xsz,ysz,zsz,xst,yst,zst,xen,yen,zen,skip
+      real(mytype), allocatable, dimension(:, :, :) :: wk, wk2
+      integer(kind=MPI_OFFSET_KIND) :: filesize, disp
+      integer, dimension(3) :: sizes, subsizes, starts
+      integer :: i, j, k, ierror, newtype, fh, key, color, newcomm, data_type
+      integer, dimension(3) :: xsz, ysz, zsz, xst, yst, zst, xen, yen, zen, skip
 
 #ifdef PROFILER
-    if (decomp_profiler_io) call decomp_profiler_start("io_write_every_real")
+      if (decomp_profiler_io) call decomp_profiler_start("io_write_every_real")
 #endif
 
-    data_type = real_type
+      data_type = real_type
 
 #include "io_write_every.inc"
 
 #ifdef PROFILER
-    if (decomp_profiler_io) call decomp_profiler_end("io_write_every_real")
+      if (decomp_profiler_io) call decomp_profiler_end("io_write_every_real")
 #endif
 
-  end subroutine write_every_real
+   end subroutine write_every_real
 
+   subroutine write_every_complex(ipencil, var, iskip, jskip, kskip, &
+                                  filename, from1)
 
-  subroutine write_every_complex(ipencil,var,iskip,jskip,kskip, &
-       filename, from1)
+      implicit none
 
-    implicit none
+      integer, intent(IN) :: ipencil !(x-pencil=1; y-pencil=2; z-pencil=3)
+      complex(mytype), contiguous, dimension(:, :, :), intent(IN) :: var
+      integer, intent(IN) :: iskip, jskip, kskip
+      character(len=*), intent(IN) :: filename
+      logical, intent(IN) :: from1  ! .true.  - save 1,n+1,2n+1...
+      ! .false. - save n,2n,3n...
 
-    integer, intent(IN) :: ipencil !(x-pencil=1; y-pencil=2; z-pencil=3)
-    complex(mytype), dimension(:,:,:), intent(IN) :: var
-    integer, intent(IN) :: iskip,jskip,kskip 
-    character(len=*), intent(IN) :: filename
-    logical, intent(IN) :: from1  ! .true.  - save 1,n+1,2n+1...
-    ! .false. - save n,2n,3n...
-
-    complex(mytype), allocatable, dimension(:,:,:) :: wk, wk2
-    integer(kind=MPI_OFFSET_KIND) :: filesize, disp
-    integer, dimension(3) :: sizes, subsizes, starts
-    integer :: i,j,k, ierror, newtype, fh, key,color,newcomm, data_type
-    integer, dimension(3) :: xsz,ysz,zsz,xst,yst,zst,xen,yen,zen,skip
+      complex(mytype), allocatable, dimension(:, :, :) :: wk, wk2
+      integer(kind=MPI_OFFSET_KIND) :: filesize, disp
+      integer, dimension(3) :: sizes, subsizes, starts
+      integer :: i, j, k, ierror, newtype, fh, key, color, newcomm, data_type
+      integer, dimension(3) :: xsz, ysz, zsz, xst, yst, zst, xen, yen, zen, skip
 
 #ifdef PROFILER
-    if (decomp_profiler_io) call decomp_profiler_start("io_write_every_cplx")
+      if (decomp_profiler_io) call decomp_profiler_start("io_write_every_cplx")
 #endif
 
-    data_type = complex_type
+      data_type = complex_type
 
 #include "io_write_every.inc"
 
 #ifdef PROFILER
-    if (decomp_profiler_io) call decomp_profiler_end("io_write_every_cplx")
-#endif
-
-  end subroutine write_every_complex
-
-  subroutine coarse_extents(ipencil, icoarse, sizes, subsizes, starts, opt_decomp)
-
-    implicit none
-    
-    integer, intent(IN) :: ipencil !(x-pencil=1; y-pencil=2; z-pencil=3)
-    integer, intent(IN) :: icoarse !(nstat=1; nvisu=2)
-    type(decomp_info), intent(in), optional :: opt_decomp
-
-    integer, dimension(3) :: sizes, subsizes, starts
-    type(decomp_info) :: decomp
-
-    if ((icoarse.lt.0).or.(icoarse.gt.2)) then
-       call decomp_2d_abort(__FILE__, __LINE__, icoarse, "Error invalid value of icoarse")
-    endif
-    if ((ipencil.lt.1).or.(ipencil.gt.3)) then
-       call decomp_2d_abort(__FILE__, __LINE__, ipencil, "Error invalid value of ipencil ")
-    endif
-
-    if (icoarse==0) then
-       !! Use full fields
-       
-       if (present(opt_decomp)) then
-          decomp = opt_decomp
-       else
-          call get_decomp_info(decomp)
-       endif
-
-       sizes(1) = decomp%xsz(1)
-       sizes(2) = decomp%ysz(2)
-       sizes(3) = decomp%zsz(3)
-
-       if (ipencil == 1) then
-          subsizes(1:3) = decomp%xsz(1:3)
-          starts(1:3) = decomp%xst(1:3) - 1
-       elseif (ipencil == 2) then
-          subsizes(1:3) = decomp%ysz(1:3)
-          starts(1:3) = decomp%yst(1:3) - 1
-       elseif (ipencil == 3) then
-          subsizes(1:3) = decomp%zsz(1:3)
-          starts(1:3) = decomp%zst(1:3) - 1
-       endif
-    elseif (icoarse==1) then
-       sizes(1) = xszS(1)
-       sizes(2) = yszS(2)
-       sizes(3) = zszS(3)
-
-       if (ipencil == 1) then
-          subsizes(1) = xszS(1)
-          subsizes(2) = xszS(2)
-          subsizes(3) = xszS(3)
-          starts(1) = xstS(1)-1  ! 0-based index
-          starts(2) = xstS(2)-1
-          starts(3) = xstS(3)-1
-       else if (ipencil == 2) then
-          subsizes(1) = yszS(1)
-          subsizes(2) = yszS(2)
-          subsizes(3) = yszS(3)
-          starts(1) = ystS(1)-1
-          starts(2) = ystS(2)-1
-          starts(3) = ystS(3)-1
-       else if (ipencil == 3) then
-          subsizes(1) = zszS(1)
-          subsizes(2) = zszS(2)
-          subsizes(3) = zszS(3)
-          starts(1) = zstS(1)-1
-          starts(2) = zstS(2)-1
-          starts(3) = zstS(3)-1
-       endif
-    elseif (icoarse==2) then
-       sizes(1) = xszV(1)
-       sizes(2) = yszV(2)
-       sizes(3) = zszV(3)
-
-       if (ipencil == 1) then
-          subsizes(1) = xszV(1)
-          subsizes(2) = xszV(2)
-          subsizes(3) = xszV(3)
-          starts(1) = xstV(1)-1  ! 0-based index
-          starts(2) = xstV(2)-1
-          starts(3) = xstV(3)-1
-       else if (ipencil == 2) then
-          subsizes(1) = yszV(1)
-          subsizes(2) = yszV(2)
-          subsizes(3) = yszV(3)
-          starts(1) = ystV(1)-1
-          starts(2) = ystV(2)-1
-          starts(3) = ystV(3)-1
-       else if (ipencil == 3) then
-          subsizes(1) = zszV(1)
-          subsizes(2) = zszV(2)
-          subsizes(3) = zszV(3)
-          starts(1) = zstV(1)-1
-          starts(2) = zstV(2)-1
-          starts(3) = zstV(3)-1
-       endif
-    endif
-    
-  end subroutine coarse_extents
-
-  subroutine mpiio_write_real_coarse(ipencil,var,dirname,varname,icoarse,io_name,opt_decomp,reduce_prec,opt_deferred_writes)
-
-    ! USE param
-    ! USE variables
-
-    implicit none
-
-    integer, intent(IN) :: ipencil !(x-pencil=1; y-pencil=2; z-pencil=3)
-    integer, intent(IN) :: icoarse !(nstat=1; nvisu=2)
-    real(mytype), dimension(:,:,:), intent(IN) :: var
-    character(len=*), intent(in) :: dirname, varname, io_name
-    type(decomp_info), intent(in), optional :: opt_decomp
-    logical, intent(in), optional :: reduce_prec
-    logical, intent(in), optional :: opt_deferred_writes
-
-    logical :: write_reduce_prec
-    logical :: deferred_writes
-    
-    integer :: ierror
-    integer :: idx
-    logical :: opened_new
+      if (decomp_profiler_io) call decomp_profiler_end("io_write_every_cplx")
+#endif
+
+   end subroutine write_every_complex
+
+   subroutine coarse_extents(ipencil, icoarse, sizes, subsizes, starts, opt_decomp)
+
+      implicit none
+
+      integer, intent(IN) :: ipencil !(x-pencil=1; y-pencil=2; z-pencil=3)
+      integer, intent(IN) :: icoarse !(nstat=1; nvisu=2)
+      type(decomp_info), intent(in), optional :: opt_decomp
+
+      integer, dimension(3) :: sizes, subsizes, starts
+      type(decomp_info) :: decomp
+
+      if ((icoarse < 0) .or. (icoarse > 2)) then
+         call decomp_2d_abort(__FILE__, __LINE__, icoarse, "Error invalid value of icoarse")
+      end if
+      if ((ipencil < 1) .or. (ipencil > 3)) then
+         call decomp_2d_abort(__FILE__, __LINE__, ipencil, "Error invalid value of ipencil ")
+      end if
+
+      if (icoarse == 0) then
+         ! Use full fields
+
+         if (present(opt_decomp)) then
+            decomp = opt_decomp
+         else
+            call get_decomp_info(decomp)
+         end if
+
+         sizes(1) = decomp%xsz(1)
+         sizes(2) = decomp%ysz(2)
+         sizes(3) = decomp%zsz(3)
+
+         if (ipencil == 1) then
+            subsizes(1:3) = decomp%xsz(1:3)
+            starts(1:3) = decomp%xst(1:3) - 1
+         elseif (ipencil == 2) then
+            subsizes(1:3) = decomp%ysz(1:3)
+            starts(1:3) = decomp%yst(1:3) - 1
+         elseif (ipencil == 3) then
+            subsizes(1:3) = decomp%zsz(1:3)
+            starts(1:3) = decomp%zst(1:3) - 1
+         else
+            call decomp_2d_abort(-1, "IO/coarse_extents : Wrong value for ipencil")
+         end if
+      elseif (icoarse == 1) then
+         sizes(1) = xszS(1)
+         sizes(2) = yszS(2)
+         sizes(3) = zszS(3)
+
+         if (ipencil == 1) then
+            subsizes(1) = xszS(1)
+            subsizes(2) = xszS(2)
+            subsizes(3) = xszS(3)
+            starts(1) = xstS(1) - 1  ! 0-based index
+            starts(2) = xstS(2) - 1
+            starts(3) = xstS(3) - 1
+         else if (ipencil == 2) then
+            subsizes(1) = yszS(1)
+            subsizes(2) = yszS(2)
+            subsizes(3) = yszS(3)
+            starts(1) = ystS(1) - 1
+            starts(2) = ystS(2) - 1
+            starts(3) = ystS(3) - 1
+         else if (ipencil == 3) then
+            subsizes(1) = zszS(1)
+            subsizes(2) = zszS(2)
+            subsizes(3) = zszS(3)
+            starts(1) = zstS(1) - 1
+            starts(2) = zstS(2) - 1
+            starts(3) = zstS(3) - 1
+         else
+            call decomp_2d_abort(-1, "IO/coarse_extents : Wrong value for ipencil")
+         end if
+      elseif (icoarse == 2) then
+         sizes(1) = xszV(1)
+         sizes(2) = yszV(2)
+         sizes(3) = zszV(3)
+
+         if (ipencil == 1) then
+            subsizes(1) = xszV(1)
+            subsizes(2) = xszV(2)
+            subsizes(3) = xszV(3)
+            starts(1) = xstV(1) - 1  ! 0-based index
+            starts(2) = xstV(2) - 1
+            starts(3) = xstV(3) - 1
+         else if (ipencil == 2) then
+            subsizes(1) = yszV(1)
+            subsizes(2) = yszV(2)
+            subsizes(3) = yszV(3)
+            starts(1) = ystV(1) - 1
+            starts(2) = ystV(2) - 1
+            starts(3) = ystV(3) - 1
+         else if (ipencil == 3) then
+            subsizes(1) = zszV(1)
+            subsizes(2) = zszV(2)
+            subsizes(3) = zszV(3)
+            starts(1) = zstV(1) - 1
+            starts(2) = zstV(2) - 1
+            starts(3) = zstV(3) - 1
+         else
+            call decomp_2d_abort(-1, "IO/coarse_extents : Wrong value for ipencil")
+         end if
+      end if
+
+   end subroutine coarse_extents
+
+   subroutine mpiio_write_real_coarse(ipencil, var, dirname, varname, icoarse, io_name, &
+                                      opt_decomp, reduce_prec, opt_deferred_writes)
+
+      ! USE param
+      ! USE variables
+
+      implicit none
+
+      integer, intent(IN) :: ipencil !(x-pencil=1; y-pencil=2; z-pencil=3)
+      integer, intent(IN) :: icoarse !(nstat=1; nvisu=2)
+      real(mytype), contiguous, dimension(:, :, :), intent(IN) :: var
+      character(len=*), intent(in) :: dirname, varname, io_name
+      type(decomp_info), intent(in), optional :: opt_decomp
+      logical, intent(in), optional :: reduce_prec
+      logical, intent(in), optional :: opt_deferred_writes
+
+      logical :: write_reduce_prec
+      logical :: deferred_writes
+
+      integer :: ierror
+      integer :: idx
+      logical :: opened_new
 #ifdef ADIOS2
-    type(adios2_io) :: io_handle
-    type(adios2_variable) :: var_handle
-    integer :: write_mode
+      type(adios2_io) :: io_handle
+      type(adios2_variable) :: var_handle
+      integer :: write_mode
 #else
-    real(mytype_single), allocatable, dimension(:,:,:) :: varsingle
-    integer, dimension(3) :: sizes, subsizes, starts
-    integer :: newtype
-    logical :: dir_exists
-    integer :: disp_bytes
-    character(len=:), allocatable :: full_io_name
+      real(mytype_single), allocatable, dimension(:, :, :) :: varsingle
+      integer, dimension(3) :: sizes, subsizes, starts
+      integer :: newtype
+      logical :: dir_exists
+      integer :: disp_bytes
+      character(len=:), allocatable :: full_io_name
 #endif
 
 #ifdef PROFILER
-    if (decomp_profiler_io) call decomp_profiler_start("mpiio_write_real_coarse")
-#endif
-
-    !! Set defaults
-    write_reduce_prec = .true.
-    if (present(opt_deferred_writes)) then
-       deferred_writes = opt_deferred_writes
-    else
-       deferred_writes = .true.
-    end if
-    
-    opened_new = .false.
-    idx = get_io_idx(io_name, dirname)
+      if (decomp_profiler_io) call decomp_profiler_start("mpiio_write_real_coarse")
+#endif
+
+      ! Set defaults
+      write_reduce_prec = .true.
+      if (present(opt_deferred_writes)) then
+         deferred_writes = opt_deferred_writes
+      else
+         deferred_writes = .true.
+      end if
+
+      opened_new = .false.
+      idx = get_io_idx(io_name, dirname)
 #ifndef ADIOS2
-    if (present(reduce_prec)) then
-       if (.not. reduce_prec) then
-          write_reduce_prec = .false.
-       end if
-    end if
-    if (write_reduce_prec) then
-       call MPI_TYPE_SIZE(real_type_single,disp_bytes,ierror)
-    else
-       call MPI_TYPE_SIZE(real_type,disp_bytes,ierror)
-    end if
-    if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_SIZE")
-    
-    !! Use original MPIIO writers
-    if (present(opt_decomp)) then
-       call coarse_extents(ipencil, icoarse, sizes, subsizes, starts, opt_decomp)
-    else
-       call coarse_extents(ipencil, icoarse, sizes, subsizes, starts)
-    end if
-    if (write_reduce_prec) then
-       allocate (varsingle(xstV(1):xenV(1),xstV(2):xenV(2),xstV(3):xenV(3)))
-       varsingle=real(var, mytype_single)
-    end if
-
-    if (write_reduce_prec) then
-       call MPI_TYPE_CREATE_SUBARRAY(3, sizes, subsizes, starts,  &
-            MPI_ORDER_FORTRAN, real_type_single, newtype, ierror)
-    else
-       call MPI_TYPE_CREATE_SUBARRAY(3, sizes, subsizes, starts,  &
-            MPI_ORDER_FORTRAN, real_type, newtype, ierror)
-    end if
-    if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_CREATE_SUBARRAY")
-    call MPI_TYPE_COMMIT(newtype,ierror)
-    if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_COMMIT")
-
-    if (idx .lt. 1) then
-       ! Create folder if needed
-       if (nrank==0) then
-          inquire(file=dirname, exist=dir_exists)
-          if (.not.dir_exists) then
-             call execute_command_line("mkdir "//dirname//" 2> /dev/null", wait=.true.)
-          end if
-       end if
-       full_io_name = dirname//"/"//varname
-       call decomp_2d_open_io(io_name, full_io_name, decomp_2d_write_mode)
-       idx = get_io_idx(io_name, full_io_name)
-       opened_new = .true.
-    else
-       full_io_name = "" ! Ensure string is set
-    end if
-
-    if (write_reduce_prec) then
-       call MPI_FILE_SET_VIEW(fh_registry(idx),fh_disp(idx),real_type_single, &
-            newtype,'native',MPI_INFO_NULL,ierror)
-       if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_SET_VIEW")
-       call MPI_FILE_WRITE_ALL(fh_registry(idx), varsingle, &
-            subsizes(1)*subsizes(2)*subsizes(3), &
-            real_type_single, MPI_STATUS_IGNORE, ierror)
-    else
-       call MPI_FILE_SET_VIEW(fh_registry(idx),fh_disp(idx),real_type, &
-            newtype,'native',MPI_INFO_NULL,ierror)
-       if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_SET_VIEW")
-       call MPI_FILE_WRITE_ALL(fh_registry(idx), var, &
-            subsizes(1)*subsizes(2)*subsizes(3), &
-            real_type, MPI_STATUS_IGNORE, ierror)
-    end if
-    if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_WRITE_ALL")
-    
-    fh_disp(idx) = fh_disp(idx) + sizes(1) * sizes(2) * sizes(3) * disp_bytes
-    
-    if (opened_new) then
-       call decomp_2d_close_io(io_name, full_io_name)
-       deallocate(full_io_name)
-    end if
-
-    call MPI_TYPE_FREE(newtype,ierror)
-    if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_FREE")
-    if (write_reduce_prec) then
-       deallocate(varsingle)
-    end if
+      if (present(reduce_prec)) then
+         if (.not. reduce_prec) then
+            write_reduce_prec = .false.
+         end if
+      end if
+      if (write_reduce_prec) then
+         call MPI_TYPE_SIZE(real_type_single, disp_bytes, ierror)
+      else
+         call MPI_TYPE_SIZE(real_type, disp_bytes, ierror)
+      end if
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_SIZE")
+
+      ! Use original MPIIO writers
+      if (present(opt_decomp)) then
+         call coarse_extents(ipencil, icoarse, sizes, subsizes, starts, opt_decomp)
+      else
+         call coarse_extents(ipencil, icoarse, sizes, subsizes, starts)
+      end if
+      if (write_reduce_prec) then
+         allocate (varsingle(starts(1):(starts(1)+subsizes(1)), &
+                             starts(2):(starts(2)+subsizes(2)), &
+                             starts(3):(starts(3)+subsizes(3))))
+         varsingle = real(var, mytype_single)
+      end if
+
+      if (write_reduce_prec) then
+         call MPI_TYPE_CREATE_SUBARRAY(3, sizes, subsizes, starts, &
+                                       MPI_ORDER_FORTRAN, real_type_single, newtype, ierror)
+      else
+         call MPI_TYPE_CREATE_SUBARRAY(3, sizes, subsizes, starts, &
+                                       MPI_ORDER_FORTRAN, real_type, newtype, ierror)
+      end if
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_CREATE_SUBARRAY")
+      call MPI_TYPE_COMMIT(newtype, ierror)
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_COMMIT")
+
+      if (idx < 1) then
+         ! Create folder if needed
+         if (nrank == 0) then
+            inquire (file=dirname, exist=dir_exists)
+            if (.not. dir_exists) then
+               call execute_command_line("mkdir "//dirname//" 2> /dev/null", wait=.true.)
+            end if
+         end if
+         full_io_name = trim(dirname)//"/"//trim(varname)
+         call decomp_2d_open_io(io_name, full_io_name, decomp_2d_write_mode)
+         idx = get_io_idx(io_name, full_io_name)
+         opened_new = .true.
+      else
+         full_io_name = "" ! Ensure string is set
+      end if
+
+      if (write_reduce_prec) then
+         call MPI_FILE_SET_VIEW(fh_registry(idx), fh_disp(idx), real_type_single, &
+                                newtype, 'native', MPI_INFO_NULL, ierror)
+         if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_SET_VIEW")
+         call MPI_FILE_WRITE_ALL(fh_registry(idx), varsingle, &
+                                 subsizes(1) * subsizes(2) * subsizes(3), &
+                                 real_type_single, MPI_STATUS_IGNORE, ierror)
+      else
+         call MPI_FILE_SET_VIEW(fh_registry(idx), fh_disp(idx), real_type, &
+                                newtype, 'native', MPI_INFO_NULL, ierror)
+         if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_SET_VIEW")
+         call MPI_FILE_WRITE_ALL(fh_registry(idx), var, &
+                                 subsizes(1) * subsizes(2) * subsizes(3), &
+                                 real_type, MPI_STATUS_IGNORE, ierror)
+      end if
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_WRITE_ALL")
+
+      fh_disp(idx) = fh_disp(idx) + int(sizes(1), kind=MPI_OFFSET_KIND) &
+                     * int(sizes(2), kind=MPI_OFFSET_KIND) &
+                     * int(sizes(3), kind=MPI_OFFSET_KIND) &
+                     * int(disp_bytes, kind=MPI_OFFSET_KIND)
+
+      if (opened_new) then
+         call decomp_2d_close_io(io_name, full_io_name)
+         deallocate (full_io_name)
+      end if
+
+      call MPI_TYPE_FREE(newtype, ierror)
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_FREE")
+      if (write_reduce_prec) then
+         deallocate (varsingle)
+      end if
 #else
-    if (.not. engine_live(idx)) then
-       call decomp_2d_abort(__FILE__, __LINE__, -1, "ERROR: Engine is not live!")
-    end if
-    
-    call adios2_at_io(io_handle, adios, io_name, ierror)
-    if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "adios2_at_io "//trim(io_name))
-    call adios2_inquire_variable(var_handle, io_handle, varname, ierror)
-    if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "adios2_inquire_variable "//trim(varname))
-    if (.not.var_handle % valid) call decomp_2d_abort(__FILE__, __LINE__, -1, &
-       "ERROR: trying to write variable before registering! "//trim(varname))
-
-    if (idx .lt. 1) call decomp_2d_abort(__FILE__, __LINE__, idx, &
-       "You haven't opened "//trim(io_name)//":"//trim(dirname))
-
-    if (deferred_writes) then
-       write_mode = adios2_mode_deferred
-    else
-       write_mode = adios2_mode_sync
-    end if
-
-    if (engine_registry(idx)%valid) then
-       call adios2_put(engine_registry(idx), var_handle, var, write_mode, ierror)
-       if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "adios2_put")
-    else
-       call decomp_2d_abort(__FILE__, __LINE__, -1, &
-          "ERROR: decomp2d thinks engine is live, but adios2 engine object is not valid")
-    end if
-
-    associate(crs => icoarse, pncl => ipencil, opdcmp => opt_decomp, rdprec => reduce_prec) ! Silence unused arguments
-    end associate
+      if (.not. engine_live(idx)) then
+         call decomp_2d_abort(__FILE__, __LINE__, -1, "ERROR: Engine is not live!")
+      end if
+
+      call adios2_at_io(io_handle, adios, io_name, ierror)
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "adios2_at_io "//trim(io_name))
+      call adios2_inquire_variable(var_handle, io_handle, varname, ierror)
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "adios2_inquire_variable "//trim(varname))
+      if (.not. var_handle%valid) then
+         call decomp_2d_abort(__FILE__, __LINE__, -1, &
+                              "ERROR: trying to write variable before registering! "//trim(varname))
+      end if
+
+      if (idx < 1) then
+         call decomp_2d_abort(__FILE__, __LINE__, idx, &
+                              "You haven't opened "//trim(io_name)//":"//trim(dirname))
+      end if
+
+      if (deferred_writes) then
+         write_mode = adios2_mode_deferred
+      else
+         write_mode = adios2_mode_sync
+      end if
+
+      if (engine_registry(idx)%valid) then
+         call adios2_put(engine_registry(idx), var_handle, var, write_mode, ierror)
+         if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "adios2_put")
+      else
+         call decomp_2d_abort(__FILE__, __LINE__, -1, &
+                              "ERROR: decomp2d thinks engine is live, but adios2 engine object is not valid")
+      end if
+
+      associate (crs => icoarse, pncl => ipencil, opdcmp => opt_decomp, rdprec => reduce_prec) ! Silence unused arguments
+      end associate
 #endif
 
 #ifdef PROFILER
-    if (decomp_profiler_io) call decomp_profiler_end("mpiio_write_real_coarse")
+      if (decomp_profiler_io) call decomp_profiler_end("mpiio_write_real_coarse")
 #endif
 
-  end subroutine mpiio_write_real_coarse
+   end subroutine mpiio_write_real_coarse
 
-  subroutine decomp_2d_register_variable(io_name, varname, ipencil, icoarse, iplane, type, opt_decomp, opt_nplanes)
+   subroutine decomp_2d_register_variable(io_name, varname, ipencil, icoarse, iplane, type, opt_decomp, opt_nplanes)
 
-    use, intrinsic :: iso_fortran_env, only : real32, real64
+      use, intrinsic :: iso_fortran_env, only: real32, real64
 
-    implicit none
+      implicit none
 
-    integer, intent(IN) :: ipencil !(x-pencil=1; y-pencil=2; z-pencil=3)
-    integer, intent(IN) :: icoarse !(nstat=1; nvisu=2)
-    character(len=*), intent(in) :: io_name
-    integer, intent(in) :: type
-    integer, intent(in) :: iplane
-    type(decomp_info), intent(in), optional :: opt_decomp
-    integer, intent(in), optional :: opt_nplanes
+      integer, intent(IN) :: ipencil !(x-pencil=1; y-pencil=2; z-pencil=3)
+      integer, intent(IN) :: icoarse !(nstat=1; nvisu=2)
+      character(len=*), intent(in) :: io_name
+      integer, intent(in) :: type
+      integer, intent(in) :: iplane
+      type(decomp_info), intent(in), optional :: opt_decomp
+      integer, intent(in), optional :: opt_nplanes
 
-    integer :: nplanes
-    character*(*), intent(in) :: varname
+      integer :: nplanes
+      character(len=*), intent(in) :: varname
 #ifdef ADIOS2
-    integer, dimension(3) :: sizes, subsizes, starts
-    type(adios2_io) :: io_handle
-    type(adios2_variable) :: var_handle
-    integer, parameter :: ndims = 3
-    logical, parameter :: adios2_constant_dims = .true.
-    integer :: data_type
-    integer :: ierror
-
-    if (iplane .eq. 0) then
-       if (present(opt_decomp)) then
-          call coarse_extents(ipencil, icoarse, sizes, subsizes, starts, opt_decomp)
-       else
-          call coarse_extents(ipencil, icoarse, sizes, subsizes, starts)
-       endif
-    else
-       if (present(opt_nplanes)) then
-          nplanes = opt_nplanes
-       else
-          nplanes = 1
-       end if
-       if (present(opt_decomp)) then
-          call plane_extents(sizes, subsizes, starts, iplane, opt_decomp, opt_nplanes=nplanes)
-       else
-          call plane_extents(sizes, subsizes, starts, iplane, opt_nplanes=nplanes)
-       endif
-    end if
-    
-    ! Check if variable already exists, if not create it
-    call adios2_at_io(io_handle, adios, io_name, ierror)
-    if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "adios2_at_io "//trim(io_name))
-    if (io_handle%valid) then
-       call adios2_inquire_variable(var_handle, io_handle, varname, ierror)
-       if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "adios2_inquire_variable "//trim(varname))
-       if (.not.var_handle % valid) then
-          !! New variable
-          if (nrank .eq. 0) then
-             print *, "Registering variable for IO: ", varname
-          endif
-
-          ! Need to set the ADIOS2 data type
-          if (type.eq.kind(0._real64)) then
-             !! Double
-             data_type = adios2_type_dp
-          else if (type.eq.kind(0._real32)) then
-             !! Single
-             data_type = adios2_type_real
-          else
-             call decomp_2d_abort(__FILE__, __LINE__, -1, "Trying to write unknown data type!")
-          endif
-
-          call adios2_define_variable(var_handle, io_handle, varname, data_type, &
-               ndims, int(sizes, kind=8), int(starts, kind=8), int(subsizes, kind=8), &
-               adios2_constant_dims, ierror)
-          if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, &
-             "adios2_define_variable, ERROR registering variable "//trim(varname))
-       endif
-    else
-       call decomp_2d_abort(__FILE__, __LINE__, -1, "trying to register variable with invalid IO!")
-    end if
+      integer, dimension(3) :: sizes, subsizes, starts
+      type(adios2_io) :: io_handle
+      type(adios2_variable) :: var_handle
+      integer, parameter :: ndims = 3
+      logical, parameter :: adios2_constant_dims = .true.
+      integer :: data_type
+      integer :: ierror
+
+      if (iplane == 0) then
+         if (present(opt_decomp)) then
+            call coarse_extents(ipencil, icoarse, sizes, subsizes, starts, opt_decomp)
+         else
+            call coarse_extents(ipencil, icoarse, sizes, subsizes, starts)
+         end if
+      else
+         if (present(opt_nplanes)) then
+            nplanes = opt_nplanes
+         else
+            nplanes = 1
+         end if
+         if (present(opt_decomp)) then
+            call plane_extents(sizes, subsizes, starts, iplane, opt_decomp, opt_nplanes=nplanes)
+         else
+            call plane_extents(sizes, subsizes, starts, iplane, opt_nplanes=nplanes)
+         end if
+      end if
+
+      ! Check if variable already exists, if not create it
+      call adios2_at_io(io_handle, adios, io_name, ierror)
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "adios2_at_io "//trim(io_name))
+      if (io_handle%valid) then
+         call adios2_inquire_variable(var_handle, io_handle, varname, ierror)
+         if (.not. var_handle%valid) then
+            ! New variable
+            if (nrank == 0) then
+               print *, "Registering variable for IO: ", varname
+            end if
+
+            ! Need to set the ADIOS2 data type
+            if (type == kind(0._real64)) then
+               ! Double
+               data_type = adios2_type_dp
+            else if (type == kind(0._real32)) then
+               ! Single
+               data_type = adios2_type_real
+            else
+               call decomp_2d_abort(__FILE__, __LINE__, -1, "Trying to write unknown data type!")
+            end if
+
+            call adios2_define_variable(var_handle, io_handle, varname, data_type, &
+                                        ndims, int(sizes, kind=8), int(starts, kind=8), int(subsizes, kind=8), &
+                                        adios2_constant_dims, ierror)
+            if (ierror /= 0) then
+               call decomp_2d_abort(__FILE__, __LINE__, ierror, &
+                                    "adios2_define_variable, ERROR registering variable "//trim(varname))
+            end if
+         else
+            ! This probably can't happen, however if the inquiry to a variable returns a NULL
+            ! pointer an exception is returned in ierr. As the point is to check the existence of
+            ! the variable we are already checking if it is valid or not.
+            if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "adios2_inquire_variable "//trim(varname))
+         end if
+      else
+         call decomp_2d_abort(__FILE__, __LINE__, -1, "trying to register variable with invalid IO!")
+      end if
 #else
-    nplanes = 1 ! Silence unused variable
-    associate(crs => icoarse, nm =>io_name, pncl => ipencil, pln => iplane, &
-         opdcmp => opt_decomp, opnpl => opt_nplanes, tp => type, &
-         vnm => varname) ! Silence unused dummy argument
-    end associate
+      nplanes = 1 ! Silence unused variable
+      associate (crs => icoarse, nm => io_name, pncl => ipencil, pln => iplane, &
+                 opdcmp => opt_decomp, opnpl => opt_nplanes, tp => type, &
+                 vnm => varname) ! Silence unused dummy argument
+      end associate
 #endif
-    
-  end subroutine decomp_2d_register_variable
-  
-  subroutine mpiio_write_real_probe(ipencil,var,filename,nlength)
 
-    ! USE param
-    ! USE variables
+   end subroutine decomp_2d_register_variable
 
-    implicit none
+   subroutine mpiio_write_real_probe(ipencil, var, filename, nlength)
 
-    integer, intent(IN) :: ipencil !(x-pencil=1; y-pencil=2; z-pencil=3)
-    integer, intent(in) :: nlength
-    real(mytype), dimension(:,:,:,:), intent(IN) :: var
+      ! USE param
+      ! USE variables
 
-    character(len=*) :: filename
+      implicit none
 
-    integer (kind=MPI_OFFSET_KIND) :: filesize, disp
-    integer, dimension(4) :: sizes, subsizes, starts
-    integer :: ierror, newtype, fh
+      integer, intent(IN) :: ipencil !(x-pencil=1; y-pencil=2; z-pencil=3)
+      integer, intent(in) :: nlength
+      real(mytype), contiguous, dimension(:, :, :, :), intent(IN) :: var
+
+      character(len=*) :: filename
+
+      integer(kind=MPI_OFFSET_KIND) :: filesize, disp
+      integer, dimension(4) :: sizes, subsizes, starts
+      integer :: ierror, newtype, fh
 
 #ifdef PROFILER
-    if (decomp_profiler_io) call decomp_profiler_start("mpiio_write_real_probe")
-#endif
-
-    sizes(1) = xszP(1)
-    sizes(2) = yszP(2)
-    sizes(3) = zszP(3)
-    sizes(4) = nlength
-    if (ipencil == 1) then
-       subsizes(1) = xszP(1)
-       subsizes(2) = xszP(2)
-       subsizes(3) = xszP(3)
-       subsizes(4) = nlength
-       starts(1) = xstP(1)-1  ! 0-based index
-       starts(2) = xstP(2)-1
-       starts(3) = xstP(3)-1
-       starts(4) = 0
-    else if (ipencil == 2) then
-       subsizes(1) = yszP(1)
-       subsizes(2) = yszP(2)
-       subsizes(3) = yszP(3)
-       starts(1) = ystP(1)-1
-       starts(2) = ystP(2)-1
-       starts(3) = ystP(3)-1
-    else if (ipencil == 3) then
-       subsizes(1) = zszP(1)
-       subsizes(2) = zszP(2)
-       subsizes(3) = zszP(3)
-       starts(1) = zstP(1)-1
-       starts(2) = zstP(2)-1
-       starts(3) = zstP(3)-1
-    endif
-    !   print *,nrank,starts(1),starts(2),starts(3),starts(4)
-    call MPI_TYPE_CREATE_SUBARRAY(4, sizes, subsizes, starts,  &
-         MPI_ORDER_FORTRAN, real_type, newtype, ierror)
-    if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_CREATE_SUBARRAY")
-    call MPI_TYPE_COMMIT(newtype,ierror)
-    if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_COMMIT")
-    call MPI_FILE_OPEN(decomp_2d_comm, filename, &
-         MPI_MODE_CREATE+MPI_MODE_WRONLY, MPI_INFO_NULL, &
-         fh, ierror)
-    if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_OPEN")
-    filesize = 0_MPI_OFFSET_KIND
-    call MPI_FILE_SET_SIZE(fh,filesize,ierror)  ! guarantee overwriting
-    if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_SET_SIZE")
-    disp = 0_MPI_OFFSET_KIND
-    call MPI_FILE_SET_VIEW(fh,disp,real_type, &
-         newtype,'native',MPI_INFO_NULL,ierror)
-    if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_SET_VIEW")
-    call MPI_FILE_WRITE_ALL(fh, var, &
-         subsizes(1)*subsizes(2)*subsizes(3)*subsizes(4), &
-         real_type, MPI_STATUS_IGNORE, ierror)
-    if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_WRITE_ALL")
-    call MPI_FILE_CLOSE(fh,ierror)
-    if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_CLOSE")
-    call MPI_TYPE_FREE(newtype,ierror)
-    if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_FREE")
+      if (decomp_profiler_io) call decomp_profiler_start("mpiio_write_real_probe")
+#endif
+
+      sizes(1) = xszP(1)
+      sizes(2) = yszP(2)
+      sizes(3) = zszP(3)
+      sizes(4) = nlength
+      if (ipencil == 1) then
+         subsizes(1) = xszP(1)
+         subsizes(2) = xszP(2)
+         subsizes(3) = xszP(3)
+         subsizes(4) = nlength
+         starts(1) = xstP(1) - 1  ! 0-based index
+         starts(2) = xstP(2) - 1
+         starts(3) = xstP(3) - 1
+         starts(4) = 0
+      else if (ipencil == 2) then
+         subsizes(1) = yszP(1)
+         subsizes(2) = yszP(2)
+         subsizes(3) = yszP(3)
+         starts(1) = ystP(1) - 1
+         starts(2) = ystP(2) - 1
+         starts(3) = ystP(3) - 1
+      else if (ipencil == 3) then
+         subsizes(1) = zszP(1)
+         subsizes(2) = zszP(2)
+         subsizes(3) = zszP(3)
+         starts(1) = zstP(1) - 1
+         starts(2) = zstP(2) - 1
+         starts(3) = zstP(3) - 1
+      end if
+      !   print *,nrank,starts(1),starts(2),starts(3),starts(4)
+      call MPI_TYPE_CREATE_SUBARRAY(4, sizes, subsizes, starts, &
+                                    MPI_ORDER_FORTRAN, real_type, newtype, ierror)
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_CREATE_SUBARRAY")
+      call MPI_TYPE_COMMIT(newtype, ierror)
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_COMMIT")
+      call MPI_FILE_OPEN(decomp_2d_comm, filename, &
+                         MPI_MODE_CREATE + MPI_MODE_WRONLY, MPI_INFO_NULL, &
+                         fh, ierror)
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_OPEN")
+      filesize = 0_MPI_OFFSET_KIND
+      call MPI_FILE_SET_SIZE(fh, filesize, ierror)  ! guarantee overwriting
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_SET_SIZE")
+      disp = 0_MPI_OFFSET_KIND
+      call MPI_FILE_SET_VIEW(fh, disp, real_type, &
+                             newtype, 'native', MPI_INFO_NULL, ierror)
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_SET_VIEW")
+      call MPI_FILE_WRITE_ALL(fh, var, &
+                              subsizes(1) * subsizes(2) * subsizes(3) * subsizes(4), &
+                              real_type, MPI_STATUS_IGNORE, ierror)
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_WRITE_ALL")
+      call MPI_FILE_CLOSE(fh, ierror)
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_CLOSE")
+      call MPI_TYPE_FREE(newtype, ierror)
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_FREE")
 
 #ifdef PROFILER
-    if (decomp_profiler_io) call decomp_profiler_end("mpiio_write_real_probe")
+      if (decomp_profiler_io) call decomp_profiler_end("mpiio_write_real_probe")
 #endif
 
-  end subroutine mpiio_write_real_probe
+   end subroutine mpiio_write_real_probe
 
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  ! Write a 3D data set covering a smaller sub-domain only
+   ! Write a 3D data set covering a smaller sub-domain only
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
-  subroutine write_subdomain(ipencil,var,is,ie,js,je,ks,ke,filename)
+   subroutine write_subdomain(ipencil, var, is, ie, js, je, ks, ke, filename)
 
-    implicit none
+      implicit none
 
-    integer, intent(IN) :: ipencil !(x-pencil=1; y-pencil=2; z-pencil=3)
-    real(mytype), dimension(:,:,:), intent(IN) :: var
-    integer, intent(IN) :: is, ie, js, je, ks, ke
-    character(len=*), intent(IN) :: filename
+      integer, intent(IN) :: ipencil !(x-pencil=1; y-pencil=2; z-pencil=3)
+      real(mytype), contiguous, dimension(:, :, :), intent(IN) :: var
+      integer, intent(IN) :: is, ie, js, je, ks, ke
+      character(len=*), intent(IN) :: filename
 
-    real(mytype), allocatable, dimension(:,:,:) :: wk, wk2
-    integer(kind=MPI_OFFSET_KIND) :: filesize, disp
-    integer, dimension(3) :: sizes, subsizes, starts
-    integer :: color, key, errorcode, newcomm, ierror
-    integer :: newtype, fh, data_type, i, j, k
-    integer :: i1, i2, j1, j2, k1, k2
+      real(mytype), allocatable, dimension(:, :, :) :: wk, wk2
+      integer(kind=MPI_OFFSET_KIND) :: filesize, disp
+      integer, dimension(3) :: sizes, subsizes, starts
+      integer :: color, key, errorcode, newcomm, ierror
+      integer :: newtype, fh, data_type, i, j, k
+      integer :: i1, i2, j1, j2, k1, k2
 
 #ifdef PROFILER
-    if (decomp_profiler_io) call decomp_profiler_start("io_write_subdomain")
-#endif
-
-    data_type = real_type
-
-    ! validate the input paramters
-    if (is<1 .OR. ie>nx_global .OR. js<1 .OR. je>ny_global .OR. &
-         ks<1 .OR. ke>nz_global) then
-       errorcode = 10
-       call decomp_2d_abort(errorcode, &
-            'Invalid subdomain specified in I/O')
-    end if
-
-    ! create a communicator for all those MPI ranks containing the subdomain
-    color = 1
-    key = 1
-    if (ipencil==1) then
-       if (xstart(1)>ie .OR. xend(1)<is .OR. xstart(2)>je .OR. xend(2)<js &
-            .OR. xstart(3)>ke .OR. xend(3)<ks) then
-          color = 2
-       end if
-    else if (ipencil==2) then
-       if (ystart(1)>ie .OR. yend(1)<is .OR. ystart(2)>je .OR. yend(2)<js &
-            .OR. ystart(3)>ke .OR. yend(3)<ks) then
-          color = 2
-       end if
-    else if (ipencil==3) then
-       if (zstart(1)>ie .OR. zend(1)<is .OR. zstart(2)>je .OR. zend(2)<js &
-            .OR. zstart(3)>ke .OR. zend(3)<ks) then
-          color = 2
-       end if
-    end if
-    call MPI_COMM_SPLIT(decomp_2d_comm,color,key,newcomm,ierror)
-    if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_COMM_SPLIT")
-
-    if (color==1) then ! only ranks in this group do IO collectively
-
-       ! generate MPI-IO subarray information
-
-       ! global size of the sub-domain to write
-       sizes(1) = ie - is + 1
-       sizes(2) = je - js + 1
-       sizes(3) = ke - ks + 1
-
-       ! 'subsizes' & 'starts' as required by MPI_TYPE_CREATE_SUBARRAY
-       ! note the special code whe subdomain only occupy part of the pencil
-       if (ipencil==1) then
-
-          subsizes(1) = xsize(1)
-          starts(1) = xstart(1) - is
-          if (xend(1)>ie .AND. xstart(1)<is) then
-             subsizes(1) = ie - is + 1
-             starts(1) = 0
-          else if (xstart(1)<is) then
-             subsizes(1) = xend(1) - is + 1
-             starts(1) = 0
-          else if (xend(1)>ie) then
-             subsizes(1) = ie - xstart(1) + 1
-          end if
-          subsizes(2) = xsize(2)
-          starts(2) = xstart(2) - js
-          if (xend(2)>je .AND. xstart(2)<js) then
-             subsizes(2) = je - js + 1
-             starts(2) = 0
-          else if (xstart(2)<js) then
-             subsizes(2) = xend(2) - js + 1
-             starts(2) = 0
-          else if (xend(2)>je) then
-             subsizes(2) = je - xstart(2) + 1
-          end if
-          subsizes(3) = xsize(3)
-          starts(3) = xstart(3) - ks
-          if (xend(3)>ke .AND. xstart(3)<ks) then
-             subsizes(3) = ke - ks + 1
-             starts(3) = 0
-          else if (xstart(3)<ks) then
-             subsizes(3) = xend(3) - ks + 1
-             starts(3) = 0
-          else if (xend(3)>ke) then
-             subsizes(3) = ke - xstart(3) + 1
-          end if
-
-       else if (ipencil==2) then
-
-          ! TODO 
-
-       else if (ipencil==3) then
-
-          ! TODO
-
-       end if
-
-
-       ! copy data from orginal to a temp array
-       ! pay attention to blocks only partially cover the sub-domain
-       if (ipencil==1) then
-
-          if (xend(1)>ie .AND. xstart(1)<is) then
-             i1 = is
-             i2 = ie
-          else if (xend(1)>ie) then
-             i1 = xstart(1)
-             i2 = ie
-          else if (xstart(1)<is) then
-             i1 = is
-             i2 = xend(1)
-          else
-             i1 = xstart(1)
-             i2 = xend(1)
-          end if
-
-          if (xend(2)>je .AND. xstart(2)<js) then
-             j1 = js
-             j2 = je
-          else if (xend(2)>je) then
-             j1 = xstart(2)
-             j2 = je
-          else if (xstart(2)<js) then
-             j1 = js
-             j2 = xend(2)
-          else
-             j1 = xstart(2)
-             j2 = xend(2)
-          end if
-
-          if (xend(3)>ke .AND. xstart(3)<ks) then
-             k1 = ks
-             k2 = ke
-          else if (xend(3)>ke) then
-             k1 = xstart(3)
-             k2 = ke
-          else if (xstart(3)<ks) then
-             k1 = ks
-             k2 = xend(3)
-          else
-             k1 = xstart(3)
-             k2 = xend(3)
-          end if
-
-          allocate(wk(i1:i2, j1:j2, k1:k2))
-          allocate(wk2(xstart(1):xend(1),xstart(2):xend(2),xstart(3):xend(3)))
-          wk2 = var
-          do k=k1,k2
-             do j=j1,j2
-                do i=i1,i2
-                   wk(i,j,k) = wk2(i,j,k)
-                end do
-             end do
-          end do
-
-       else if (ipencil==2) then
-
-          ! TODO
-
-       else if (ipencil==3) then
-
-          ! TODO
-
-       end if
-
-       deallocate(wk2)
-
-       ! MPI-IO
-       call MPI_TYPE_CREATE_SUBARRAY(3, sizes, subsizes, starts,  &
-            MPI_ORDER_FORTRAN, data_type, newtype, ierror)
-       if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_CREATE_SUBARRAY")
-       call MPI_TYPE_COMMIT(newtype,ierror)
-       if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_COMMIT")
-       call MPI_FILE_OPEN(newcomm, filename, &
-            MPI_MODE_CREATE+MPI_MODE_WRONLY, MPI_INFO_NULL, &
-            fh, ierror)
-       if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_OPEN")
-       filesize = 0_MPI_OFFSET_KIND
-       call MPI_FILE_SET_SIZE(fh,filesize,ierror)  ! guarantee overwriting
-       if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_SET_SIZE")
-       disp = 0_MPI_OFFSET_KIND
-       call MPI_FILE_SET_VIEW(fh,disp,data_type, &
-            newtype,'native',MPI_INFO_NULL,ierror)
-       if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_SET_VIEW")
-       call MPI_FILE_WRITE_ALL(fh, wk, &
-            subsizes(1)*subsizes(2)*subsizes(3), &
-            data_type, MPI_STATUS_IGNORE, ierror)
-       if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_WRITE_ALL")
-       call MPI_FILE_CLOSE(fh,ierror)
-       if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_CLOSE")
-       call MPI_TYPE_FREE(newtype,ierror)
-       if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_FREE")
-
-       deallocate(wk)
-
-    end if
-
-    call decomp_mpi_comm_free(newcomm)
+      if (decomp_profiler_io) call decomp_profiler_start("io_write_subdomain")
+#endif
+
+      data_type = real_type
+
+      ! validate the input paramters
+      if (is < 1 .OR. ie > nx_global .OR. js < 1 .OR. je > ny_global .OR. &
+          ks < 1 .OR. ke > nz_global) then
+         errorcode = 10
+         call decomp_2d_abort(errorcode, &
+                              'Invalid subdomain specified in I/O')
+      end if
+
+      ! create a communicator for all those MPI ranks containing the subdomain
+      color = 1
+      key = 1
+      if (ipencil == 1) then
+         if (xstart(1) > ie .OR. xend(1) < is .OR. xstart(2) > je .OR. xend(2) < js &
+             .OR. xstart(3) > ke .OR. xend(3) < ks) then
+            color = 2
+         end if
+      else if (ipencil == 2) then
+         if (ystart(1) > ie .OR. yend(1) < is .OR. ystart(2) > je .OR. yend(2) < js &
+             .OR. ystart(3) > ke .OR. yend(3) < ks) then
+            color = 2
+         end if
+      else if (ipencil == 3) then
+         if (zstart(1) > ie .OR. zend(1) < is .OR. zstart(2) > je .OR. zend(2) < js &
+             .OR. zstart(3) > ke .OR. zend(3) < ks) then
+            color = 2
+         end if
+      end if
+      call MPI_COMM_SPLIT(decomp_2d_comm, color, key, newcomm, ierror)
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_COMM_SPLIT")
+
+      if (color == 1) then ! only ranks in this group do IO collectively
+
+         ! generate MPI-IO subarray information
+
+         ! global size of the sub-domain to write
+         sizes(1) = ie - is + 1
+         sizes(2) = je - js + 1
+         sizes(3) = ke - ks + 1
+
+         ! 'subsizes' & 'starts' as required by MPI_TYPE_CREATE_SUBARRAY
+         ! note the special code whe subdomain only occupy part of the pencil
+         if (ipencil == 1) then
+
+            subsizes(1) = xsize(1)
+            starts(1) = xstart(1) - is
+            if (xend(1) > ie .AND. xstart(1) < is) then
+               subsizes(1) = ie - is + 1
+               starts(1) = 0
+            else if (xstart(1) < is) then
+               subsizes(1) = xend(1) - is + 1
+               starts(1) = 0
+            else if (xend(1) > ie) then
+               subsizes(1) = ie - xstart(1) + 1
+            end if
+            subsizes(2) = xsize(2)
+            starts(2) = xstart(2) - js
+            if (xend(2) > je .AND. xstart(2) < js) then
+               subsizes(2) = je - js + 1
+               starts(2) = 0
+            else if (xstart(2) < js) then
+               subsizes(2) = xend(2) - js + 1
+               starts(2) = 0
+            else if (xend(2) > je) then
+               subsizes(2) = je - xstart(2) + 1
+            end if
+            subsizes(3) = xsize(3)
+            starts(3) = xstart(3) - ks
+            if (xend(3) > ke .AND. xstart(3) < ks) then
+               subsizes(3) = ke - ks + 1
+               starts(3) = 0
+            else if (xstart(3) < ks) then
+               subsizes(3) = xend(3) - ks + 1
+               starts(3) = 0
+            else if (xend(3) > ke) then
+               subsizes(3) = ke - xstart(3) + 1
+            end if
+
+         else if (ipencil == 2) then
+
+            ! TODO
+
+         else if (ipencil == 3) then
+
+            ! TODO
+
+         end if
+
+         ! copy data from orginal to a temp array
+         ! pay attention to blocks only partially cover the sub-domain
+         if (ipencil == 1) then
+
+            if (xend(1) > ie .AND. xstart(1) < is) then
+               i1 = is
+               i2 = ie
+            else if (xend(1) > ie) then
+               i1 = xstart(1)
+               i2 = ie
+            else if (xstart(1) < is) then
+               i1 = is
+               i2 = xend(1)
+            else
+               i1 = xstart(1)
+               i2 = xend(1)
+            end if
+
+            if (xend(2) > je .AND. xstart(2) < js) then
+               j1 = js
+               j2 = je
+            else if (xend(2) > je) then
+               j1 = xstart(2)
+               j2 = je
+            else if (xstart(2) < js) then
+               j1 = js
+               j2 = xend(2)
+            else
+               j1 = xstart(2)
+               j2 = xend(2)
+            end if
+
+            if (xend(3) > ke .AND. xstart(3) < ks) then
+               k1 = ks
+               k2 = ke
+            else if (xend(3) > ke) then
+               k1 = xstart(3)
+               k2 = ke
+            else if (xstart(3) < ks) then
+               k1 = ks
+               k2 = xend(3)
+            else
+               k1 = xstart(3)
+               k2 = xend(3)
+            end if
+
+            allocate (wk(i1:i2, j1:j2, k1:k2))
+            allocate (wk2(xstart(1):xend(1), xstart(2):xend(2), xstart(3):xend(3)))
+            wk2 = var
+            do k = k1, k2
+               do j = j1, j2
+                  do i = i1, i2
+                     wk(i, j, k) = wk2(i, j, k)
+                  end do
+               end do
+            end do
+
+         else if (ipencil == 2) then
+
+            ! TODO
+
+         else if (ipencil == 3) then
+
+            ! TODO
+
+         end if
+
+         deallocate (wk2)
+
+         ! MPI-IO
+         call MPI_TYPE_CREATE_SUBARRAY(3, sizes, subsizes, starts, &
+                                       MPI_ORDER_FORTRAN, data_type, newtype, ierror)
+         if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_CREATE_SUBARRAY")
+         call MPI_TYPE_COMMIT(newtype, ierror)
+         if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_COMMIT")
+         call MPI_FILE_OPEN(newcomm, filename, &
+                            MPI_MODE_CREATE + MPI_MODE_WRONLY, MPI_INFO_NULL, &
+                            fh, ierror)
+         if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_OPEN")
+         filesize = 0_MPI_OFFSET_KIND
+         call MPI_FILE_SET_SIZE(fh, filesize, ierror)  ! guarantee overwriting
+         if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_SET_SIZE")
+         disp = 0_MPI_OFFSET_KIND
+         call MPI_FILE_SET_VIEW(fh, disp, data_type, &
+                                newtype, 'native', MPI_INFO_NULL, ierror)
+         if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_SET_VIEW")
+         call MPI_FILE_WRITE_ALL(fh, wk, &
+                                 subsizes(1) * subsizes(2) * subsizes(3), &
+                                 data_type, MPI_STATUS_IGNORE, ierror)
+         if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_WRITE_ALL")
+         call MPI_FILE_CLOSE(fh, ierror)
+         if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_CLOSE")
+         call MPI_TYPE_FREE(newtype, ierror)
+         if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_FREE")
+
+         deallocate (wk)
+
+      end if
+
+      call decomp_2d_mpi_comm_free(newcomm)
 
 #ifdef PROFILER
-    if (decomp_profiler_io) call decomp_profiler_end("io_write_subdomain")
+      if (decomp_profiler_io) call decomp_profiler_end("io_write_subdomain")
 #endif
 
-  end subroutine write_subdomain
+   end subroutine write_subdomain
 
-  subroutine decomp_2d_init_io(io_name)
+   subroutine decomp_2d_init_io(io_name)
 
-    implicit none
+      implicit none
 
-    character(len=*), intent(in) :: io_name
+      character(len=*), intent(in) :: io_name
 #ifdef ADIOS2
-    integer :: ierror
-    type(adios2_io) :: io
+      integer :: ierror
+      type(adios2_io) :: io
 #endif
-    
-    if (nrank .eq. 0) then
-       print *, "Initialising IO for ", io_name
-    end if
+
+      if (nrank == 0) then
+         print *, "Initialising IO for ", io_name
+      end if
 
 #ifdef ADIOS2
-    if (adios%valid) then
-       call adios2_declare_io(io, adios, io_name, ierror)
-       if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "adios2_declare_io "//trim(io_name))
-    else
-       call decomp_2d_abort(__FILE__, __LINE__, -1, "couldn't declare IO - adios object not valid")
-    end if
-#endif
-    
-  end subroutine decomp_2d_init_io
-  
-  subroutine decomp_2d_open_io(io_name, io_dir, mode)
-
-    implicit none
-
-    character(len=*), intent(in) :: io_name, io_dir
-    integer, intent(in) :: mode
-
-    logical, dimension(:), pointer :: live_ptrh
-    character(len=1024), dimension(:), pointer :: names_ptr
-    character(len=(len(io_name)+len(io_sep)+len(io_dir))) :: full_name
-    
-    integer :: idx, ierror
-    integer :: access_mode
+      if (adios%valid) then
+         call adios2_declare_io(io, adios, io_name, ierror)
+         if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "adios2_declare_io "//trim(io_name))
+      else
+         call decomp_2d_abort(__FILE__, __LINE__, -1, "couldn't declare IO - adios object not valid")
+      end if
+#endif
+
+   end subroutine decomp_2d_init_io
+
+   subroutine decomp_2d_open_io(io_name, io_dir, mode)
+
+      implicit none
+
+      character(len=*), intent(in) :: io_name, io_dir
+      integer, intent(in) :: mode
+
+      logical, dimension(:), pointer :: live_ptrh
+      character(len=1024), dimension(:), pointer :: names_ptr
+      character(len=(len(io_name) + len(io_sep) + len(io_dir))) :: full_name
+
+      integer :: idx, ierror
+      integer :: access_mode
 #ifndef ADIOS2
-    integer(MPI_OFFSET_KIND) :: filesize
+      integer(MPI_OFFSET_KIND) :: filesize
 #else
-    type(adios2_io) :: io
+      type(adios2_io) :: io
 #endif
 
 #ifdef PROFILER
-    if (decomp_profiler_io) call decomp_profiler_start("io_open_close")
+      if (decomp_profiler_io) call decomp_profiler_start("io_open_close")
 #endif
 
 #ifndef ADIOS2
-    live_ptrh => fh_live
-    names_ptr => fh_names
+      live_ptrh => fh_live
+      names_ptr => fh_names
 #else
-    live_ptrh => engine_live
-    names_ptr => engine_names
-#endif
-    
-    idx = get_io_idx(io_name, io_dir)
-    if (idx .lt. 1) then
-       !! New io destination
-       if (nreg_io .lt. MAX_IOH) then
-          nreg_io = nreg_io + 1
-          do idx = 1, MAX_IOH
-             if (.not. live_ptrh(idx)) then
-                live_ptrh(idx) = .true.
-                exit
-             end if
-          end do
-          
-          full_name = io_name//io_sep//io_dir
-          names_ptr(idx) = full_name
-
-          if (mode .eq. decomp_2d_write_mode) then
-             !! Setup writers
+      live_ptrh => engine_live
+      names_ptr => engine_names
+#endif
+
+      idx = get_io_idx(io_name, io_dir)
+      if (idx < 1) then
+         ! New io destination
+         if (nreg_io < MAX_IOH) then
+            nreg_io = nreg_io + 1
+            do idx = 1, MAX_IOH
+               if (.not. live_ptrh(idx)) then
+                  live_ptrh(idx) = .true.
+                  exit
+               end if
+            end do
+
+            full_name = io_name//io_sep//io_dir
+            names_ptr(idx) = full_name
+
+            if (mode == decomp_2d_write_mode) then
+               ! Setup writers
 #ifndef ADIOS2
-             filesize = 0_MPI_OFFSET_KIND
-             fh_disp(idx) = 0_MPI_OFFSET_KIND
-             access_mode = MPI_MODE_CREATE + MPI_MODE_WRONLY
+               filesize = 0_MPI_OFFSET_KIND
+               fh_disp(idx) = 0_MPI_OFFSET_KIND
+               access_mode = MPI_MODE_CREATE + MPI_MODE_WRONLY
 #else
-             access_mode = adios2_mode_write
+               access_mode = adios2_mode_write
 #endif
-          else if (mode .eq. decomp_2d_read_mode) then
-             !! Setup readers
+            else if (mode == decomp_2d_read_mode) then
+               ! Setup readers
 #ifndef ADIOS2
-             fh_disp(idx) = 0_MPI_OFFSET_KIND
-             access_mode = MPI_MODE_RDONLY
+               fh_disp(idx) = 0_MPI_OFFSET_KIND
+               access_mode = MPI_MODE_RDONLY
 #else
-             access_mode = adios2_mode_read
+               access_mode = adios2_mode_read
 #endif
-          else if (mode .eq. decomp_2d_append_mode) then
+            else if (mode == decomp_2d_append_mode) then
 #ifndef ADIOS2
-             filesize = 0_MPI_OFFSET_KIND
-             fh_disp(idx) = 0_MPI_OFFSET_KIND
-             access_mode = MPI_MODE_CREATE + MPI_MODE_WRONLY
+               filesize = 0_MPI_OFFSET_KIND
+               fh_disp(idx) = 0_MPI_OFFSET_KIND
+               access_mode = MPI_MODE_CREATE + MPI_MODE_WRONLY
 #else
-             access_mode = adios2_mode_append
+               access_mode = adios2_mode_append
 #endif
-          else
-             print *, "ERROR: Unknown mode!"
-             stop
-          endif
+            else
+               print *, "ERROR: Unknown mode!"
+               stop
+            end if
 
-          !! Open IO
+            ! Open IO
 #ifndef ADIOS2
-          call MPI_FILE_OPEN(decomp_2d_comm, io_dir, &
-               access_mode, MPI_INFO_NULL, &
-               fh_registry(idx), ierror)
-          if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_OPEN")
-          if (mode .eq. decomp_2d_write_mode) then
-             !! Guarantee overwriting
-             call MPI_FILE_SET_SIZE(fh_registry(idx), filesize, ierror)
-             if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_SET_SIZE")
-          end if
+            call MPI_FILE_OPEN(decomp_2d_comm, io_dir, &
+                               access_mode, MPI_INFO_NULL, &
+                               fh_registry(idx), ierror)
+            if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_OPEN")
+            if (mode == decomp_2d_write_mode) then
+               ! Guarantee overwriting
+               call MPI_FILE_SET_SIZE(fh_registry(idx), filesize, ierror)
+               if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_SET_SIZE")
+            end if
 #else
-          call adios2_at_io(io, adios, io_name, ierror)
-          if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "adios2_at_io "//trim(io_name))
-          if (io%valid) then
-             call adios2_open(engine_registry(idx), io, trim(gen_iodir_name(io_dir, io_name)), access_mode, ierror)
-             if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "ERROR opening engine!")
-          else
-             call decomp_2d_abort(__FILE__, __LINE__, -1, "Couldn't find IO handle")
-          end if
-#endif
-       end if
-    end if
+            call adios2_at_io(io, adios, io_name, ierror)
+            if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "adios2_at_io "//trim(io_name))
+            if (io%valid) then
+               call adios2_open(engine_registry(idx), io, trim(gen_iodir_name(io_dir, io_name)), access_mode, ierror)
+               if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "ERROR opening engine!")
+            else
+               call decomp_2d_abort(__FILE__, __LINE__, -1, "Couldn't find IO handle")
+            end if
+#endif
+         end if
+      end if
+
+   end subroutine decomp_2d_open_io
 
-  end subroutine decomp_2d_open_io
+   subroutine decomp_2d_close_io(io_name, io_dir)
 
-  subroutine decomp_2d_close_io(io_name, io_dir)
+      implicit none
 
-    implicit none
+      character(len=*), intent(in) :: io_name, io_dir
 
-    character(len=*), intent(in) :: io_name, io_dir
-    
-    character(len=1024), dimension(:), pointer :: names_ptr
-    logical, dimension(:), pointer :: live_ptrh
-    integer :: idx, ierror
+      character(len=1024), dimension(:), pointer :: names_ptr
+      logical, dimension(:), pointer :: live_ptrh
+      integer :: idx, ierror
 
-    idx = get_io_idx(io_name, io_dir)
+      idx = get_io_idx(io_name, io_dir)
 #ifndef ADIOS2
-    names_ptr => fh_names
-    live_ptrh => fh_live
-    call MPI_FILE_CLOSE(fh_registry(idx), ierror)
-    if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_CLOSE")
+      names_ptr => fh_names
+      live_ptrh => fh_live
+      call MPI_FILE_CLOSE(fh_registry(idx), ierror)
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_CLOSE")
 #else
-    names_ptr => engine_names
-    live_ptrh => engine_live
-    call adios2_close(engine_registry(idx), ierror)
-    if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "adios2_close")
+      names_ptr => engine_names
+      live_ptrh => engine_live
+      call adios2_close(engine_registry(idx), ierror)
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "adios2_close")
 #endif
-    names_ptr(idx) = ""
-    live_ptrh(idx) = .false.
-    nreg_io = nreg_io - 1
+      names_ptr(idx) = ""
+      live_ptrh(idx) = .false.
+      nreg_io = nreg_io - 1
 
 #ifdef PROFILER
-    if (decomp_profiler_io) call decomp_profiler_end("io_open_close")
+      if (decomp_profiler_io) call decomp_profiler_end("io_open_close")
 #endif
 
-  end subroutine decomp_2d_close_io
+   end subroutine decomp_2d_close_io
 
-  subroutine decomp_2d_start_io(io_name, io_dir)
+   subroutine decomp_2d_start_io(io_name, io_dir)
 
-    implicit none
+      implicit none
 
-    character(len=*), intent(in) :: io_name, io_dir
+      character(len=*), intent(in) :: io_name, io_dir
 #ifdef ADIOS2
-    integer :: idx, ierror
-
-    idx = get_io_idx(io_name, io_dir)
-    associate(engine => engine_registry(idx))
-      if (engine%valid) then
-         call adios2_begin_step(engine, ierror)
-         if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "adios2_begin_step")
-      else
-         call decomp_2d_abort(__FILE__, __LINE__, -1, "trying to begin step with invalid engine")
-      end if
-    end associate
+      integer :: idx, ierror
+
+      idx = get_io_idx(io_name, io_dir)
+      associate (engine => engine_registry(idx))
+         if (engine%valid) then
+            call adios2_begin_step(engine, ierror)
+            if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "adios2_begin_step")
+         else
+            call decomp_2d_abort(__FILE__, __LINE__, -1, "trying to begin step with invalid engine")
+         end if
+      end associate
 #else
-    associate(nm => io_name, dr => io_dir) ! Silence unused dummy argument
-    end associate
+      associate (nm => io_name, dr => io_dir) ! Silence unused dummy argument
+      end associate
 #endif
-    
-  end subroutine decomp_2d_start_io
 
-  subroutine decomp_2d_end_io(io_name, io_dir)
+   end subroutine decomp_2d_start_io
 
-    implicit none
+   subroutine decomp_2d_end_io(io_name, io_dir)
 
-    character(len=*), intent(in) :: io_name, io_dir
-#ifdef ADIOS2
-    integer :: idx, ierror
+      implicit none
 
-    idx = get_io_idx(io_name, io_dir)
-    associate(engine => engine_registry(idx))
-      if (engine%valid) then
-         call adios2_end_step(engine, ierror)
-         if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "adios2_end_step")
-      else
-         call decomp_2d_abort(__FILE__, __LINE__, -1, "trying to end step with invalid engine")
-      end if
-    end associate
+      character(len=*), intent(in) :: io_name, io_dir
+#ifdef ADIOS2
+      integer :: idx, ierror
+
+      idx = get_io_idx(io_name, io_dir)
+      associate (engine => engine_registry(idx))
+         if (engine%valid) then
+            call adios2_end_step(engine, ierror)
+            if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "adios2_end_step")
+         else
+            call decomp_2d_abort(__FILE__, __LINE__, -1, "trying to end step with invalid engine")
+         end if
+      end associate
 #else
-    associate(nm => io_name, dr => io_dir) ! Silence unused dummy argument
-    end associate
+      associate (nm => io_name, dr => io_dir) ! Silence unused dummy argument
+      end associate
 #endif
 
-  end subroutine decomp_2d_end_io
-  
-  integer function get_io_idx(io_name, engine_name)
+   end subroutine decomp_2d_end_io
 
-    implicit none
+   integer function get_io_idx(io_name, engine_name)
 
-    character(len=*), intent(in) :: io_name
-    character(len=*), intent(in) :: engine_name
+      implicit none
 
-    character(len=(len(io_name)+len(io_sep)+len(engine_name))) :: full_name
-    integer :: idx
-    logical :: found
+      character(len=*), intent(in) :: io_name
+      character(len=*), intent(in) :: engine_name
 
-    character(len=1024), dimension(:), pointer :: names_ptr
+      character(len=(len(io_name) + len(io_sep) + len(engine_name))) :: full_name
+      integer :: idx
+      logical :: found
+
+      character(len=1024), dimension(:), pointer :: names_ptr
 
 #ifndef ADIOS2
-    names_ptr => fh_names
+      names_ptr => fh_names
 #else
-    names_ptr => engine_names
+      names_ptr => engine_names
 #endif
 
-    full_name = io_name//io_sep//engine_name
-    
-    found = .false.
-    do idx = 1, MAX_IOH
-       if (names_ptr(idx) .eq. full_name) then
-          found = .true.
-          exit
-       end if
-    end do
+      full_name = io_name//io_sep//engine_name
+
+      found = .false.
+      do idx = 1, MAX_IOH
+         if (names_ptr(idx) == full_name) then
+            found = .true.
+            exit
+         end if
+      end do
 
-    if (.not. found) then
-       idx = -1
-    end if
+      if (.not. found) then
+         idx = -1
+      end if
 
-    get_io_idx = idx
-    
-  end function get_io_idx
+      get_io_idx = idx
 
-  function gen_iodir_name(io_dir, io_name)
+   end function get_io_idx
 
-    character(len=*), intent(in) :: io_dir, io_name
-    character(len=(len(io_dir) + 5)) :: gen_iodir_name
+   function gen_iodir_name(io_dir, io_name)
+
+      character(len=*), intent(in) :: io_dir, io_name
+      character(len=(len(io_dir) + 5)) :: gen_iodir_name
 #ifdef ADIOS2
-    integer :: ierror
-    type(adios2_io) :: io
-    character(len=5) :: ext
+      integer :: ierror
+      type(adios2_io) :: io
+      character(len=5) :: ext
 #endif
 
 #ifndef ADIOS2
-    associate(nm => io_name) ! Silence unused dummy argument
-    end associate
-    write(gen_iodir_name, "(A)") io_dir
+      associate (nm => io_name) ! Silence unused dummy argument
+      end associate
+      write (gen_iodir_name, "(A)") io_dir
 #else
-    call adios2_at_io(io, adios, io_name, ierror)
-    if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "adios2_at_io "//trim(io_name))
-    if (io%engine_type .eq. "BP4") then
-       ext = ".bp4"
-    else if (io%engine_type .eq. "HDF5") then
-       ext = ".hdf5"
-    else if (io%engine_type .eq. "SST") then
-       ext = ""
-    else
-       print *, "ERROR: Unkown engine type! ", io%engine_type
-       print *, "-  IO: ", io_name
-       print *, "- DIR:", io_dir
-       stop
-    endif
-    write(gen_iodir_name, "(A,A)") io_dir, trim(ext)
-#endif
-    
-  end function gen_iodir_name
+      call adios2_at_io(io, adios, io_name, ierror)
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "adios2_at_io "//trim(io_name))
+      if (io%engine_type == "BP4") then
+         ext = ".bp4"
+      else if (io%engine_type == "HDF5") then
+         ext = ".hdf5"
+      else if (io%engine_type == "SST") then
+         ext = ""
+      else
+         print *, "ERROR: Unkown engine type! ", io%engine_type
+         print *, "-  IO: ", io_name
+         print *, "- DIR:", io_dir
+         stop
+      end if
+      write (gen_iodir_name, "(A,A)") io_dir, trim(ext)
+#endif
+
+   end function gen_iodir_name
 
 end module decomp_2d_io
diff --git a/src/io_read_inflow.f90 b/src/io_read_inflow.f90
index 8acec522..19e3a05c 100644
--- a/src/io_read_inflow.f90
+++ b/src/io_read_inflow.f90
@@ -1,20 +1,11 @@
-!=======================================================================
-! This is part of the 2DECOMP&FFT library
-! 
-! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
-! decomposition. It also implements a highly scalable distributed
-! three-dimensional Fast Fourier Transform (FFT).
-!
-! Copyright (C) 2009-2011 Ning Li, the Numerical Algorithms Group (NAG)
-!
-!=======================================================================
+!! SPDX-License-Identifier: BSD-3-Clause
 
-! This file contain common code to be included by subroutines 
+! This file contain common code to be included by subroutines
 ! 'read_var_...' in io.f90
 
-! Using MPI-IO to read a distributed 3D variable from a file. File 
-! operations (open/close) need to be done in calling application. This 
-! allows multiple variables to be read from a single file. Together 
+! Using MPI-IO to read a distributed 3D variable from a file. File
+! operations (open/close) need to be done in calling application. This
+! allows multiple variables to be read from a single file. Together
 ! with the corresponding write operation, this is the perfect solution
 ! for applications to perform restart/checkpointing.
 
@@ -25,58 +16,64 @@
 end if
 
 ! Create file type and set file view
-sizes(1) = ntimesteps 
+sizes(1) = ntimesteps
 sizes(2) = decomp%ysz(2)
 sizes(3) = decomp%zsz(3)
 subsizes(1) = ntimesteps
 subsizes(2) = decomp%xsz(2)
 subsizes(3) = decomp%xsz(3)
 starts(1) = 0  ! 0-based index
-starts(2) = decomp%xst(2)-1
-starts(3) = decomp%xst(3)-1
+starts(2) = decomp%xst(2) - 1
+starts(3) = decomp%xst(3) - 1
 
 idx = get_io_idx(io_name, dirname)
 
 #ifndef ADIOS2
 !! Use default MPIIO
-associate(fh=>fh_registry(idx), &
-  disp => fh_disp(idx))
-  call MPI_TYPE_CREATE_SUBARRAY(3, sizes, subsizes, starts,  &
-       MPI_ORDER_FORTRAN, data_type, newtype, ierror)
-  if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_CREATE_SUBARRAY")
-  call MPI_TYPE_COMMIT(newtype,ierror)
-  if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_COMMIT")
-  call MPI_FILE_SET_VIEW(fh,disp,data_type, &
-       newtype,'native',MPI_INFO_NULL,ierror)
-  if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_SET_VIEW")
-  call MPI_FILE_READ_ALL(fh, var, &
-       subsizes(1)*subsizes(2)*subsizes(3), &
-       data_type, MPI_STATUS_IGNORE, ierror)
-  if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_READ_ALL")
-  call MPI_TYPE_FREE(newtype,ierror)
-  if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_FREE")
+associate (fh => fh_registry(idx), &
+           disp => fh_disp(idx))
+   call MPI_TYPE_CREATE_SUBARRAY(3, sizes, subsizes, starts, &
+                                 MPI_ORDER_FORTRAN, data_type, newtype, ierror)
+   if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_CREATE_SUBARRAY")
+   call MPI_TYPE_COMMIT(newtype, ierror)
+   if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_COMMIT")
+   call MPI_FILE_SET_VIEW(fh, disp, data_type, &
+                          newtype, 'native', MPI_INFO_NULL, ierror)
+   if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_SET_VIEW")
+   call MPI_FILE_READ_ALL(fh, var, &
+                          subsizes(1) * subsizes(2) * subsizes(3), &
+                          data_type, MPI_STATUS_IGNORE, ierror)
+   if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_READ_ALL")
+   call MPI_TYPE_FREE(newtype, ierror)
+   if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_FREE")
 
-  ! update displacement for the next read operation
-  disp = disp + sizes(1)*sizes(2)*sizes(3)*mytype_bytes
-  if (data_type == complex_type) then
-     disp = disp + sizes(1)*sizes(2)*sizes(3)*mytype_bytes
-  end if
+   ! update displacement for the next read operation
+   disp = disp + int(sizes(1), kind=MPI_OFFSET_KIND) &
+          * int(sizes(2), kind=MPI_OFFSET_KIND) &
+          * int(sizes(3), kind=MPI_OFFSET_KIND) &
+          * int(mytype_bytes, kind=MPI_OFFSET_KIND)
+   if (data_type == complex_type) disp = disp + int(sizes(1), kind=MPI_OFFSET_KIND) &
+                                         * int(sizes(2), kind=MPI_OFFSET_KIND) &
+                                         * int(sizes(3), kind=MPI_OFFSET_KIND) &
+                                         * int(mytype_bytes, kind=MPI_OFFSET_KIND)
 end associate
 
-associate(vnm => varname) ! Silence unused argument
+associate (vnm => varname) ! Silence unused argument
 end associate
 
 #else
 !! Use ADIOS2
 call adios2_at_io(io_handle, adios, io_name, ierror)
-if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "adios2_at_io "//trim(io_name))
+if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "adios2_at_io "//trim(io_name))
 call adios2_inquire_variable(var_handle, io_handle, varname, ierror)
-if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "adios2_inquire_variable "//trim(varname))
-if (.not.var_handle % valid) call decomp_2d_abort(__FILE__, __LINE__, -1, &
-   "trying to write variable before registering! "//trim(varname))
+if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "adios2_inquire_variable "//trim(varname))
+if (.not. var_handle%valid) then
+   call decomp_2d_abort(__FILE__, __LINE__, -1, &
+                        "trying to write variable before registering!  "//trim(varname))
+end if
 
 !! Note - need to use sync mode as we are using a view into the array - unsure how this works with deferred writes
 ! call adios2_set_step_selection(var_handle, int(0, kind=8), int(1, kind=8), ierror)
 call adios2_get(engine_registry(idx), var_handle, var, adios2_mode_sync, ierror)
-if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "adios2_get")
+if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "adios2_get")
 #endif
diff --git a/src/io_read_one.inc b/src/io_read_one.inc
index 211cbc2c..0063f377 100644
--- a/src/io_read_one.inc
+++ b/src/io_read_one.inc
@@ -1,16 +1,7 @@
-!                          -*- mode: f90 -*-                            
-!=======================================================================
-! This is part of the 2DECOMP&FFT library
-! 
-! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
-! decomposition. It also implements a highly scalable distributed
-! three-dimensional Fast Fourier Transform (FFT).
-!
-! Copyright (C) 2009-2011 Ning Li, the Numerical Algorithms Group (NAG)
-!
-!=======================================================================
+!                          -*- mode: f90 -*-
+!! SPDX-License-Identifier: BSD-3-Clause
 
-! This file contain common code to be included by subroutines 
+! This file contain common code to be included by subroutines
 ! 'mpiio_read_one_...' in io.f90
 
 ! Using MPI-IO to write a distributed 3D array into a file
@@ -30,43 +21,43 @@ if (ipencil == 1) then
    subsizes(1) = decomp%xsz(1)
    subsizes(2) = decomp%xsz(2)
    subsizes(3) = decomp%xsz(3)
-   starts(1) = decomp%xst(1)-1  ! 0-based index
-   starts(2) = decomp%xst(2)-1
-   starts(3) = decomp%xst(3)-1
+   starts(1) = decomp%xst(1) - 1  ! 0-based index
+   starts(2) = decomp%xst(2) - 1
+   starts(3) = decomp%xst(3) - 1
 else if (ipencil == 2) then
    subsizes(1) = decomp%ysz(1)
    subsizes(2) = decomp%ysz(2)
    subsizes(3) = decomp%ysz(3)
-   starts(1) = decomp%yst(1)-1
-   starts(2) = decomp%yst(2)-1
-   starts(3) = decomp%yst(3)-1
+   starts(1) = decomp%yst(1) - 1
+   starts(2) = decomp%yst(2) - 1
+   starts(3) = decomp%yst(3) - 1
 else if (ipencil == 3) then
    subsizes(1) = decomp%zsz(1)
    subsizes(2) = decomp%zsz(2)
    subsizes(3) = decomp%zsz(3)
-   starts(1) = decomp%zst(1)-1
-   starts(2) = decomp%zst(2)-1
-   starts(3) = decomp%zst(3)-1
-endif
+   starts(1) = decomp%zst(1) - 1
+   starts(2) = decomp%zst(2) - 1
+   starts(3) = decomp%zst(3) - 1
+end if
 
-call MPI_TYPE_CREATE_SUBARRAY(3, sizes, subsizes, starts,  &
-     MPI_ORDER_FORTRAN, data_type, newtype, ierror)
-if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_CREATE_SUBARRAY")
-call MPI_TYPE_COMMIT(newtype,ierror)
-if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_COMMIT")
+call MPI_TYPE_CREATE_SUBARRAY(3, sizes, subsizes, starts, &
+                              MPI_ORDER_FORTRAN, data_type, newtype, ierror)
+if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_CREATE_SUBARRAY")
+call MPI_TYPE_COMMIT(newtype, ierror)
+if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_COMMIT")
 call MPI_FILE_OPEN(decomp_2d_comm, filename, &
-     MPI_MODE_RDONLY, MPI_INFO_NULL, &
-     fh, ierror)
-if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_OPEN")
+                   MPI_MODE_RDONLY, MPI_INFO_NULL, &
+                   fh, ierror)
+if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_OPEN")
 disp = 0_MPI_OFFSET_KIND
-call MPI_FILE_SET_VIEW(fh,disp,data_type, &
-     newtype,'native',MPI_INFO_NULL,ierror)
-if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_SET_VIEW")
+call MPI_FILE_SET_VIEW(fh, disp, data_type, &
+                       newtype, 'native', MPI_INFO_NULL, ierror)
+if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_SET_VIEW")
 call MPI_FILE_READ_ALL(fh, var, &
-     subsizes(1)*subsizes(2)*subsizes(3), &
-     data_type, MPI_STATUS_IGNORE, ierror)
-if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_READ_ALL")
-call MPI_FILE_CLOSE(fh,ierror)
-if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_CLOSE")
-call MPI_TYPE_FREE(newtype,ierror)
-if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_FREE")
+                       subsizes(1)*subsizes(2)*subsizes(3), &
+                       data_type, MPI_STATUS_IGNORE, ierror)
+if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_READ_ALL")
+call MPI_FILE_CLOSE(fh, ierror)
+if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_CLOSE")
+call MPI_TYPE_FREE(newtype, ierror)
+if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_FREE")
diff --git a/src/io_read_var.inc b/src/io_read_var.inc
index 0bc0bf1d..c72d09a7 100644
--- a/src/io_read_var.inc
+++ b/src/io_read_var.inc
@@ -1,21 +1,12 @@
-!                          -*- mode: f90 -*-                            
-!=======================================================================
-! This is part of the 2DECOMP&FFT library
-! 
-! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
-! decomposition. It also implements a highly scalable distributed
-! three-dimensional Fast Fourier Transform (FFT).
-!
-! Copyright (C) 2009-2011 Ning Li, the Numerical Algorithms Group (NAG)
-!
-!=======================================================================
+!                          -*- mode: f90 -*-
+!! SPDX-License-Identifier: BSD-3-Clause
 
-! This file contain common code to be included by subroutines 
+! This file contain common code to be included by subroutines
 ! 'read_var_...' in io.f90
 
-! Using MPI-IO to read a distributed 3D variable from a file. File 
-! operations (open/close) need to be done in calling application. This 
-! allows multiple variables to be read from a single file. Together 
+! Using MPI-IO to read a distributed 3D variable from a file. File
+! operations (open/close) need to be done in calling application. This
+! allows multiple variables to be read from a single file. Together
 ! with the corresponding write operation, this is the perfect solution
 ! for applications to perform restart/checkpointing.
 
@@ -33,42 +24,46 @@ if (ipencil == 1) then
    subsizes(1) = decomp%xsz(1)
    subsizes(2) = decomp%xsz(2)
    subsizes(3) = decomp%xsz(3)
-   starts(1) = decomp%xst(1)-1  ! 0-based index
-   starts(2) = decomp%xst(2)-1
-   starts(3) = decomp%xst(3)-1
+   starts(1) = decomp%xst(1) - 1  ! 0-based index
+   starts(2) = decomp%xst(2) - 1
+   starts(3) = decomp%xst(3) - 1
 else if (ipencil == 2) then
    subsizes(1) = decomp%ysz(1)
    subsizes(2) = decomp%ysz(2)
    subsizes(3) = decomp%ysz(3)
-   starts(1) = decomp%yst(1)-1
-   starts(2) = decomp%yst(2)-1
-   starts(3) = decomp%yst(3)-1
+   starts(1) = decomp%yst(1) - 1
+   starts(2) = decomp%yst(2) - 1
+   starts(3) = decomp%yst(3) - 1
 else if (ipencil == 3) then
    subsizes(1) = decomp%zsz(1)
    subsizes(2) = decomp%zsz(2)
    subsizes(3) = decomp%zsz(3)
-   starts(1) = decomp%zst(1)-1
-   starts(2) = decomp%zst(2)-1
-   starts(3) = decomp%zst(3)-1
-endif
+   starts(1) = decomp%zst(1) - 1
+   starts(2) = decomp%zst(2) - 1
+   starts(3) = decomp%zst(3) - 1
+end if
 
-call MPI_TYPE_CREATE_SUBARRAY(3, sizes, subsizes, starts,  &
-     MPI_ORDER_FORTRAN, data_type, newtype, ierror)
-if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_CREATE_SUBARRAY")
-call MPI_TYPE_COMMIT(newtype,ierror)
-if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_COMMIT")
-call MPI_FILE_SET_VIEW(fh,disp,data_type, &
-     newtype,'native',MPI_INFO_NULL,ierror)
-if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_SET_VIEW")
+call MPI_TYPE_CREATE_SUBARRAY(3, sizes, subsizes, starts, &
+                              MPI_ORDER_FORTRAN, data_type, newtype, ierror)
+if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_CREATE_SUBARRAY")
+call MPI_TYPE_COMMIT(newtype, ierror)
+if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_COMMIT")
+call MPI_FILE_SET_VIEW(fh, disp, data_type, &
+                       newtype, 'native', MPI_INFO_NULL, ierror)
+if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_SET_VIEW")
 call MPI_FILE_READ_ALL(fh, var, &
-     subsizes(1)*subsizes(2)*subsizes(3), &
-     data_type, MPI_STATUS_IGNORE, ierror)
-if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_READ_ALL")
-call MPI_TYPE_FREE(newtype,ierror)
-if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_FREE")
+                       subsizes(1)*subsizes(2)*subsizes(3), &
+                       data_type, MPI_STATUS_IGNORE, ierror)
+if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_READ_ALL")
+call MPI_TYPE_FREE(newtype, ierror)
+if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_FREE")
 
 ! update displacement for the next read operation
-disp = disp + sizes(1)*sizes(2)*sizes(3)*mytype_bytes
-if (data_type == complex_type) then
-   disp = disp + sizes(1)*sizes(2)*sizes(3)*mytype_bytes
-end if
+disp = disp + int(sizes(1), kind=MPI_OFFSET_KIND) &
+       *int(sizes(2), kind=MPI_OFFSET_KIND) &
+       *int(sizes(3), kind=MPI_OFFSET_KIND) &
+       *int(mytype_bytes, kind=MPI_OFFSET_KIND)
+if (data_type == complex_type) disp = disp + int(sizes(1), kind=MPI_OFFSET_KIND) &
+                                      *int(sizes(2), kind=MPI_OFFSET_KIND) &
+                                      *int(sizes(3), kind=MPI_OFFSET_KIND) &
+                                      *int(mytype_bytes, kind=MPI_OFFSET_KIND)
diff --git a/src/io_write_every.inc b/src/io_write_every.inc
index cbf98b7a..4fa4b61a 100644
--- a/src/io_write_every.inc
+++ b/src/io_write_every.inc
@@ -1,21 +1,12 @@
-!                          -*- mode: f90 -*-                            
-!=======================================================================
-! This is part of the 2DECOMP&FFT library
-! 
-! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
-! decomposition. It also implements a highly scalable distributed
-! three-dimensional Fast Fourier Transform (FFT).
-!
-! Copyright (C) 2009-2011 Ning Li, the Numerical Algorithms Group (NAG)
-!
-!=======================================================================
-
-! This file contain common code to be included by subroutines 
+!                          -*- mode: f90 -*-
+!! SPDX-License-Identifier: BSD-3-Clause
+
+! This file contain common code to be included by subroutines
 ! 'write_every_...' in io.f90
 
 ! To write every few points of a 3D array to a file
 
-! work out the distribution parameters, which may be different from 
+! work out the distribution parameters, which may be different from
 ! the default distribution used by the decomposition library
 !  For exmample if nx=17 and p_row=4
 !    distribution is: 4 4 4 5
@@ -38,199 +29,199 @@
 !             4th block (13-17) contains then 12th & 15th point
 !    giving a 1 2 2 1 distribution
 
-skip(1)=iskip
-skip(2)=jskip
-skip(3)=kskip
+skip(1) = iskip
+skip(2) = jskip
+skip(3) = kskip
 
-do i=1,3
+do i = 1, 3
    if (from1) then
-      xst(i) = (xstart(i)+skip(i)-1)/skip(i)
-      if (mod(xstart(i)+skip(i)-1,skip(i))/=0) xst(i)=xst(i)+1
-      xen(i) = (xend(i)+skip(i)-1)/skip(i)
+      xst(i) = (xstart(i) + skip(i) - 1)/skip(i)
+      if (mod(xstart(i) + skip(i) - 1, skip(i)) /= 0) xst(i) = xst(i) + 1
+      xen(i) = (xend(i) + skip(i) - 1)/skip(i)
    else
       xst(i) = xstart(i)/skip(i)
-      if (mod(xstart(i),skip(i))/=0) xst(i)=xst(i)+1
+      if (mod(xstart(i), skip(i)) /= 0) xst(i) = xst(i) + 1
       xen(i) = xend(i)/skip(i)
    end if
-   xsz(i) = xen(i)-xst(i)+1
+   xsz(i) = xen(i) - xst(i) + 1
 end do
 
-do i=1,3
+do i = 1, 3
    if (from1) then
-      yst(i) = (ystart(i)+skip(i)-1)/skip(i)
-      if (mod(ystart(i)+skip(i)-1,skip(i))/=0) yst(i)=yst(i)+1
-      yen(i) = (yend(i)+skip(i)-1)/skip(i)
+      yst(i) = (ystart(i) + skip(i) - 1)/skip(i)
+      if (mod(ystart(i) + skip(i) - 1, skip(i)) /= 0) yst(i) = yst(i) + 1
+      yen(i) = (yend(i) + skip(i) - 1)/skip(i)
    else
       yst(i) = ystart(i)/skip(i)
-      if (mod(ystart(i),skip(i))/=0) yst(i)=yst(i)+1
+      if (mod(ystart(i), skip(i)) /= 0) yst(i) = yst(i) + 1
       yen(i) = yend(i)/skip(i)
    end if
-   ysz(i) = yen(i)-yst(i)+1
+   ysz(i) = yen(i) - yst(i) + 1
 end do
 
-do i=1,3
+do i = 1, 3
    if (from1) then
-      zst(i) = (zstart(i)+skip(i)-1)/skip(i)
-      if (mod(zstart(i)+skip(i)-1,skip(i))/=0) zst(i)=zst(i)+1
-      zen(i) = (zend(i)+skip(i)-1)/skip(i)
+      zst(i) = (zstart(i) + skip(i) - 1)/skip(i)
+      if (mod(zstart(i) + skip(i) - 1, skip(i)) /= 0) zst(i) = zst(i) + 1
+      zen(i) = (zend(i) + skip(i) - 1)/skip(i)
    else
       zst(i) = zstart(i)/skip(i)
-      if (mod(zstart(i),skip(i))/=0) zst(i)=zst(i)+1
+      if (mod(zstart(i), skip(i)) /= 0) zst(i) = zst(i) + 1
       zen(i) = zend(i)/skip(i)
    end if
-   zsz(i) = zen(i)-zst(i)+1
+   zsz(i) = zen(i) - zst(i) + 1
 end do
 
-! if 'skip' value is large it is possible that some ranks do not 
-! contain any points to be written. Subarray constructor requires 
+! if 'skip' value is large it is possible that some ranks do not
+! contain any points to be written. Subarray constructor requires
 ! nonzero size so it is not possible to use decomp_2d_comm for IO.
 ! Create a sub communicator for this...
 color = 1
 key = 0  ! rank order doesn't matter
-if (ipencil==1) then
-   if (xsz(1)==0 .or. xsz(2)==0 .or. xsz(3)==0) then
+if (ipencil == 1) then
+   if (xsz(1) == 0 .or. xsz(2) == 0 .or. xsz(3) == 0) then
       color = 2
    end if
-else if (ipencil==2) then
-   if (ysz(1)==0 .or. ysz(2)==0 .or. ysz(3)==0) then
+else if (ipencil == 2) then
+   if (ysz(1) == 0 .or. ysz(2) == 0 .or. ysz(3) == 0) then
       color = 2
    end if
-else if (ipencil==3) then
-   if (zsz(1)==0 .or. zsz(2)==0 .or. zsz(3)==0) then
+else if (ipencil == 3) then
+   if (zsz(1) == 0 .or. zsz(2) == 0 .or. zsz(3) == 0) then
       color = 2
    end if
 end if
-call MPI_COMM_SPLIT(decomp_2d_comm,color,key,newcomm,ierror)
-if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_COMM_SPLIT")
+call MPI_COMM_SPLIT(decomp_2d_comm, color, key, newcomm, ierror)
+if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_COMM_SPLIT")
 
-if (color==1) then ! only ranks in this group do IO collectively
+if (color == 1) then ! only ranks in this group do IO collectively
 
    ! generate subarray information
    sizes(1) = xsz(1)
    sizes(2) = ysz(2)
    sizes(3) = zsz(3)
-   if (ipencil==1) then
+   if (ipencil == 1) then
       subsizes(1) = xsz(1)
       subsizes(2) = xsz(2)
       subsizes(3) = xsz(3)
-      starts(1) = xst(1)-1
-      starts(2) = xst(2)-1
-      starts(3) = xst(3)-1
-   else if (ipencil==2) then
+      starts(1) = xst(1) - 1
+      starts(2) = xst(2) - 1
+      starts(3) = xst(3) - 1
+   else if (ipencil == 2) then
       subsizes(1) = ysz(1)
       subsizes(2) = ysz(2)
       subsizes(3) = ysz(3)
-      starts(1) = yst(1)-1
-      starts(2) = yst(2)-1
-      starts(3) = yst(3)-1
-   else if (ipencil==3) then
+      starts(1) = yst(1) - 1
+      starts(2) = yst(2) - 1
+      starts(3) = yst(3) - 1
+   else if (ipencil == 3) then
       subsizes(1) = zsz(1)
       subsizes(2) = zsz(2)
       subsizes(3) = zsz(3)
-      starts(1) = zst(1)-1
-      starts(2) = zst(2)-1
-      starts(3) = zst(3)-1
+      starts(1) = zst(1) - 1
+      starts(2) = zst(2) - 1
+      starts(3) = zst(3) - 1
    end if
 
    ! copy data from original array
-   ! needs a copy of original array in global coordinate 
-   if (ipencil==1) then
-      allocate(wk(xst(1):xen(1),xst(2):xen(2),xst(3):xen(3)))
-      allocate(wk2(xstart(1):xend(1),xstart(2):xend(2),xstart(3):xend(3)))
-      wk2=var
+   ! needs a copy of original array in global coordinate
+   if (ipencil == 1) then
+      allocate (wk(xst(1):xen(1), xst(2):xen(2), xst(3):xen(3)))
+      allocate (wk2(xstart(1):xend(1), xstart(2):xend(2), xstart(3):xend(3)))
+      wk2 = var
       if (from1) then
-         do k=xst(3),xen(3)
-            do j=xst(2),xen(2)
-               do i=xst(1),xen(1)
-                  wk(i,j,k) = wk2((i-1)*iskip+1,(j-1)*jskip+1,(k-1)*kskip+1)
+         do k = xst(3), xen(3)
+            do j = xst(2), xen(2)
+               do i = xst(1), xen(1)
+                  wk(i, j, k) = wk2((i - 1)*iskip + 1, (j - 1)*jskip + 1, (k - 1)*kskip + 1)
                end do
             end do
          end do
       else
-         do k=xst(3),xen(3)
-            do j=xst(2),xen(2)
-               do i=xst(1),xen(1)
-                  wk(i,j,k) = wk2(i*iskip,j*jskip,k*kskip)
+         do k = xst(3), xen(3)
+            do j = xst(2), xen(2)
+               do i = xst(1), xen(1)
+                  wk(i, j, k) = wk2(i*iskip, j*jskip, k*kskip)
                end do
             end do
          end do
       end if
-   else if (ipencil==2) then
-      allocate(wk(yst(1):yen(1),yst(2):yen(2),yst(3):yen(3)))
-      allocate(wk2(ystart(1):yend(1),ystart(2):yend(2),ystart(3):yend(3)))
-      wk2=var
+   else if (ipencil == 2) then
+      allocate (wk(yst(1):yen(1), yst(2):yen(2), yst(3):yen(3)))
+      allocate (wk2(ystart(1):yend(1), ystart(2):yend(2), ystart(3):yend(3)))
+      wk2 = var
       if (from1) then
-         do k=yst(3),yen(3)
-            do j=yst(2),yen(2)
-               do i=yst(1),yen(1)
-                  wk(i,j,k) = wk2((i-1)*iskip+1,(j-1)*jskip+1,(k-1)*kskip+1)
+         do k = yst(3), yen(3)
+            do j = yst(2), yen(2)
+               do i = yst(1), yen(1)
+                  wk(i, j, k) = wk2((i - 1)*iskip + 1, (j - 1)*jskip + 1, (k - 1)*kskip + 1)
                end do
             end do
          end do
       else
-         do k=yst(3),yen(3)
-            do j=yst(2),yen(2)
-               do i=yst(1),yen(1)
-                  wk(i,j,k) = wk2(i*iskip,j*jskip,k*kskip)
+         do k = yst(3), yen(3)
+            do j = yst(2), yen(2)
+               do i = yst(1), yen(1)
+                  wk(i, j, k) = wk2(i*iskip, j*jskip, k*kskip)
                end do
             end do
          end do
       end if
-   else if (ipencil==3) then
-      allocate(wk(zst(1):zen(1),zst(2):zen(2),zst(3):zen(3)))
-      allocate(wk2(zstart(1):zend(1),zstart(2):zend(2),zstart(3):zend(3)))
-      wk2=var
+   else if (ipencil == 3) then
+      allocate (wk(zst(1):zen(1), zst(2):zen(2), zst(3):zen(3)))
+      allocate (wk2(zstart(1):zend(1), zstart(2):zend(2), zstart(3):zend(3)))
+      wk2 = var
       if (from1) then
-         do k=zst(3),zen(3)
-            do j=zst(2),zen(2)
-               do i=zst(1),zen(1)
-                  wk(i,j,k) = wk2((i-1)*iskip+1,(j-1)*jskip+1,(k-1)*kskip+1)
+         do k = zst(3), zen(3)
+            do j = zst(2), zen(2)
+               do i = zst(1), zen(1)
+                  wk(i, j, k) = wk2((i - 1)*iskip + 1, (j - 1)*jskip + 1, (k - 1)*kskip + 1)
                end do
             end do
          end do
       else
-         do k=zst(3),zen(3)
-            do j=zst(2),zen(2)
-               do i=zst(1),zen(1)
-                  wk(i,j,k) = wk2(i*iskip,j*jskip,k*kskip)
+         do k = zst(3), zen(3)
+            do j = zst(2), zen(2)
+               do i = zst(1), zen(1)
+                  wk(i, j, k) = wk2(i*iskip, j*jskip, k*kskip)
                end do
             end do
          end do
       end if
    end if
-   deallocate(wk2)
+   deallocate (wk2)
 
    ! MPI-IO
-   call MPI_TYPE_CREATE_SUBARRAY(3, sizes, subsizes, starts,  &
-        MPI_ORDER_FORTRAN, data_type, newtype, ierror)
-   if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_CREATE_SUBARRAY")
-   call MPI_TYPE_COMMIT(newtype,ierror)
-   if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_COMMIT")
+   call MPI_TYPE_CREATE_SUBARRAY(3, sizes, subsizes, starts, &
+                                 MPI_ORDER_FORTRAN, data_type, newtype, ierror)
+   if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_CREATE_SUBARRAY")
+   call MPI_TYPE_COMMIT(newtype, ierror)
+   if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_COMMIT")
    call MPI_FILE_OPEN(newcomm, filename, &
-        MPI_MODE_CREATE+MPI_MODE_WRONLY, MPI_INFO_NULL, &
-        fh, ierror)
-   if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_OPEN")
+                      MPI_MODE_CREATE + MPI_MODE_WRONLY, MPI_INFO_NULL, &
+                      fh, ierror)
+   if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_OPEN")
    filesize = 0_MPI_OFFSET_KIND
-   call MPI_FILE_SET_SIZE(fh,filesize,ierror)  ! guarantee overwriting
-   if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_SET_SIZE")
+   call MPI_FILE_SET_SIZE(fh, filesize, ierror)  ! guarantee overwriting
+   if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_SET_SIZE")
    disp = 0_MPI_OFFSET_KIND
-   call MPI_FILE_SET_VIEW(fh,disp,data_type, &
-        newtype,'native',MPI_INFO_NULL,ierror)
-   if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_SET_VIEW")
+   call MPI_FILE_SET_VIEW(fh, disp, data_type, &
+                          newtype, 'native', MPI_INFO_NULL, ierror)
+   if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_SET_VIEW")
    call MPI_FILE_WRITE_ALL(fh, wk, &
-        subsizes(1)*subsizes(2)*subsizes(3), &
-        data_type, MPI_STATUS_IGNORE, ierror)
-   if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_WRITE_ALL")
-   call MPI_FILE_CLOSE(fh,ierror)
-   if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_CLOSE")
-   call MPI_TYPE_FREE(newtype,ierror)
-   if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_FREE")
+                           subsizes(1)*subsizes(2)*subsizes(3), &
+                           data_type, MPI_STATUS_IGNORE, ierror)
+   if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_WRITE_ALL")
+   call MPI_FILE_CLOSE(fh, ierror)
+   if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_CLOSE")
+   call MPI_TYPE_FREE(newtype, ierror)
+   if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_FREE")
 
-   deallocate(wk)
+   deallocate (wk)
 
 end if ! color==1
 
-call decomp_mpi_comm_free(newcomm)
+call decomp_2d_mpi_comm_free(newcomm)
 
 call MPI_BARRIER(decomp_2d_comm, ierror)
-if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_BARRIER")
+if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_BARRIER")
diff --git a/src/io_write_one.inc b/src/io_write_one.inc
index 29facac5..85cb8399 100644
--- a/src/io_write_one.inc
+++ b/src/io_write_one.inc
@@ -1,17 +1,7 @@
-!                          -*- mode: f90 -*-                            
-!=======================================================================
-! This is part of the 2DECOMP&FFT library
-! 
-! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
-! decomposition. It also implements a highly scalable distributed
-! three-dimensional Fast Fourier Transform (FFT).
-!
-! Copyright (C) 2009-2012 Ning Li, the Numerical Algorithms Group (NAG) 
-! Copyright (C) 2021               the University of Edinburgh (UoE)
-!
-!=======================================================================
+!                          -*- mode: f90 -*-
+!! SPDX-License-Identifier: BSD-3-Clause
 
-! This file contain common code to be included by subroutines 
+! This file contain common code to be included by subroutines
 ! 'mpiio_write_one_...' in io.f90
 
 ! Using MPI-IO to write a distributed 3D array into a file
@@ -31,67 +21,46 @@ if (ipencil == 1) then
    subsizes(1) = decomp%xsz(1)
    subsizes(2) = decomp%xsz(2)
    subsizes(3) = decomp%xsz(3)
-   starts(1) = decomp%xst(1)-1  ! 0-based index
-   starts(2) = decomp%xst(2)-1
-   starts(3) = decomp%xst(3)-1
+   starts(1) = decomp%xst(1) - 1  ! 0-based index
+   starts(2) = decomp%xst(2) - 1
+   starts(3) = decomp%xst(3) - 1
 else if (ipencil == 2) then
    subsizes(1) = decomp%ysz(1)
    subsizes(2) = decomp%ysz(2)
    subsizes(3) = decomp%ysz(3)
-   starts(1) = decomp%yst(1)-1
-   starts(2) = decomp%yst(2)-1
-   starts(3) = decomp%yst(3)-1
+   starts(1) = decomp%yst(1) - 1
+   starts(2) = decomp%yst(2) - 1
+   starts(3) = decomp%yst(3) - 1
 else if (ipencil == 3) then
    subsizes(1) = decomp%zsz(1)
    subsizes(2) = decomp%zsz(2)
    subsizes(3) = decomp%zsz(3)
-   starts(1) = decomp%zst(1)-1
-   starts(2) = decomp%zst(2)-1
-   starts(3) = decomp%zst(3)-1
-endif
-
-#ifdef T3PIO
-call MPI_INFO_CREATE(info, ierror)
-if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_INFO_CREATE")
-gs = ceiling(real(sizes(1),mytype)*real(sizes(2),mytype)* &
-     real(sizes(3),mytype)/1024./1024.)
-call t3pio_set_info(decomp_2d_comm, info, "./", ierror, &
-     GLOBAL_SIZE=gs, factor=1)
-#else
-gs = 1 ! Silence unused variable
-info = 1 ! Silence unused variable
-#endif
+   starts(1) = decomp%zst(1) - 1
+   starts(2) = decomp%zst(2) - 1
+   starts(3) = decomp%zst(3) - 1
+end if
 
-call MPI_TYPE_CREATE_SUBARRAY(3, sizes, subsizes, starts,  &
-     MPI_ORDER_FORTRAN, data_type, newtype, ierror)
-if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_CREATE_SUBARRAY")
-call MPI_TYPE_COMMIT(newtype,ierror)
-if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_COMMIT")
-#ifdef T3PIO
-call MPI_FILE_OPEN(decomp_2d_comm, filename, &
-     MPI_MODE_CREATE+MPI_MODE_WRONLY, info, fh, ierror)
-#else
+call MPI_TYPE_CREATE_SUBARRAY(3, sizes, subsizes, starts, &
+                              MPI_ORDER_FORTRAN, data_type, newtype, ierror)
+if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_CREATE_SUBARRAY")
+call MPI_TYPE_COMMIT(newtype, ierror)
+if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_COMMIT")
 call MPI_FILE_OPEN(decomp_2d_comm, filename, &
-     MPI_MODE_CREATE+MPI_MODE_WRONLY, MPI_INFO_NULL, &
-     fh, ierror)
-#endif
-if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_OPEN")
+                   MPI_MODE_CREATE + MPI_MODE_WRONLY, MPI_INFO_NULL, &
+                   fh, ierror)
+if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_OPEN")
 filesize = 0_MPI_OFFSET_KIND
-call MPI_FILE_SET_SIZE(fh,filesize,ierror)  ! guarantee overwriting
-if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_SET_SIZE")
+call MPI_FILE_SET_SIZE(fh, filesize, ierror)  ! guarantee overwriting
+if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_SET_SIZE")
 disp = 0_MPI_OFFSET_KIND
-call MPI_FILE_SET_VIEW(fh,disp,data_type, &
-     newtype,'native',MPI_INFO_NULL,ierror)
-if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_SET_VIEW")
+call MPI_FILE_SET_VIEW(fh, disp, data_type, &
+                       newtype, 'native', MPI_INFO_NULL, ierror)
+if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_SET_VIEW")
 call MPI_FILE_WRITE_ALL(fh, var, &
-     subsizes(1)*subsizes(2)*subsizes(3), &
-     data_type, MPI_STATUS_IGNORE, ierror)
-if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_WRITE_ALL")
-call MPI_FILE_CLOSE(fh,ierror)
-if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_CLOSE")
-call MPI_TYPE_FREE(newtype,ierror)
-if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_FREE")
-#ifdef T3PIO
-call MPI_INFO_FREE(info,ierror)
-if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_INFO_FREE")
-#endif
+                        subsizes(1)*subsizes(2)*subsizes(3), &
+                        data_type, MPI_STATUS_IGNORE, ierror)
+if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_WRITE_ALL")
+call MPI_FILE_CLOSE(fh, ierror)
+if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_CLOSE")
+call MPI_TYPE_FREE(newtype, ierror)
+if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_FREE")
diff --git a/src/io_write_outflow.f90 b/src/io_write_outflow.f90
index fbd845a2..7b680170 100644
--- a/src/io_write_outflow.f90
+++ b/src/io_write_outflow.f90
@@ -1,20 +1,11 @@
-!=======================================================================
-! This is part of the 2DECOMP&FFT library
-! 
-! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
-! decomposition. It also implements a highly scalable distributed
-! three-dimensional Fast Fourier Transform (FFT).
-!
-! Copyright (C) 2009-2011 Ning Li, the Numerical Algorithms Group (NAG)
-!
-!=======================================================================
+!! SPDX-License-Identifier: BSD-3-Clause
 
-! This file contain common code to be included by subroutines 
+! This file contain common code to be included by subroutines
 ! 'write_var_...' in io.f90
 
-! Using MPI-IO to write a distributed 3D variable to a file. File 
+! Using MPI-IO to write a distributed 3D variable to a file. File
 ! operations (open/close) need to be done in calling application. This
-! allows multiple variables to be written to a single file. Together 
+! allows multiple variables to be written to a single file. Together
 ! with the corresponding read operation, this is the perfect solution
 ! for applications to perform restart/checkpointing.
 
@@ -25,58 +16,62 @@
 end if
 
 ! Create file type and set file view
-sizes(1) = ntimesteps 
+sizes(1) = ntimesteps
 sizes(2) = decomp%ysz(2)
 sizes(3) = decomp%zsz(3)
-subsizes(1) = ntimesteps 
+subsizes(1) = ntimesteps
 subsizes(2) = decomp%xsz(2)
 subsizes(3) = decomp%xsz(3)
 starts(1) = 0  ! 0-based index
-starts(2) = decomp%xst(2)-1
-starts(3) = decomp%xst(3)-1
+starts(2) = decomp%xst(2) - 1
+starts(3) = decomp%xst(3) - 1
 
 idx = get_io_idx(io_name, dirname)
 
 #ifndef ADIOS2
 !! Use default MPIIO
-associate(fh=>fh_registry(idx), &
-     disp=>fh_disp(idx))
-  call MPI_TYPE_CREATE_SUBARRAY(3, sizes, subsizes, starts,  &
-       MPI_ORDER_FORTRAN, data_type, newtype, ierror)
-  if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_CREATE_SUBARRAY")
-  call MPI_TYPE_COMMIT(newtype,ierror)
-  if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_COMMIT")
-  call MPI_FILE_SET_VIEW(fh,disp,data_type, &
-       newtype,'native',MPI_INFO_NULL,ierror)
-  if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_SET_VIEW")
-  call MPI_FILE_WRITE_ALL(fh, var, &
-       subsizes(1)*subsizes(2)*subsizes(3), &
-       data_type, MPI_STATUS_IGNORE, ierror)
-  if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_WRITE_ALL")
-  call MPI_TYPE_FREE(newtype,ierror)
-  if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_FREE")
+associate (fh => fh_registry(idx), &
+           disp => fh_disp(idx))
+   call MPI_TYPE_CREATE_SUBARRAY(3, sizes, subsizes, starts, &
+                                 MPI_ORDER_FORTRAN, data_type, newtype, ierror)
+   if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_CREATE_SUBARRAY")
+   call MPI_TYPE_COMMIT(newtype, ierror)
+   if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_COMMIT")
+   call MPI_FILE_SET_VIEW(fh, disp, data_type, &
+                          newtype, 'native', MPI_INFO_NULL, ierror)
+   if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_SET_VIEW")
+   call MPI_FILE_WRITE_ALL(fh, var, &
+                           subsizes(1) * subsizes(2) * subsizes(3), &
+                           data_type, MPI_STATUS_IGNORE, ierror)
+   if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_WRITE_ALL")
+   call MPI_TYPE_FREE(newtype, ierror)
+   if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_FREE")
 
-  ! update displacement for the next write operation
-  disp = disp + sizes(1)*sizes(2)*sizes(3)*mytype_bytes
-  if (data_type == complex_type) then
-     disp = disp + sizes(1)*sizes(2)*sizes(3)*mytype_bytes
-  end if
+   ! update displacement for the next write operation
+   disp = disp + int(sizes(1), kind=MPI_OFFSET_KIND) &
+          * int(sizes(2), kind=MPI_OFFSET_KIND) &
+          * int(sizes(3), kind=MPI_OFFSET_KIND) &
+          * int(mytype_bytes, kind=MPI_OFFSET_KIND)
+   if (data_type == complex_type) disp = disp + int(sizes(1), kind=MPI_OFFSET_KIND) &
+                                         * int(sizes(2), kind=MPI_OFFSET_KIND) &
+                                         * int(sizes(3), kind=MPI_OFFSET_KIND) &
+                                         * int(mytype_bytes, kind=MPI_OFFSET_KIND)
 end associate
 
-associate(vnm => varname) ! Silence unused dummy argument
+associate (vnm => varname) ! Silence unused dummy argument
 end associate
 
 #else
 !! Use ADIOS2
 call adios2_at_io(io_handle, adios, io_name, ierror)
-if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "adios2_at_io "//trim(io_name))
+if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "adios2_at_io "//trim(io_name))
 call adios2_inquire_variable(var_handle, io_handle, varname, ierror)
-if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "adios2_inquire_variable "//trim(varname))
-if (.not.var_handle % valid) then
+if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "adios2_inquire_variable "//trim(varname))
+if (.not. var_handle%valid) then
    call decomp_2d_abort(__FILE__, __LINE__, -1, "trying to write variable before registering!"//trim(varname))
-endif
+end if
 
 !! Note - need to use sync mode as we are using a view into the array - unsure how this works with deferred writes
 call adios2_put(engine_registry(idx), var_handle, var, adios2_mode_sync, ierror)
-if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "adios2_put")
+if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "adios2_put")
 #endif
diff --git a/src/io_write_plane.inc b/src/io_write_plane.inc
index 088fe097..69b51bd7 100644
--- a/src/io_write_plane.inc
+++ b/src/io_write_plane.inc
@@ -1,16 +1,7 @@
-!                          -*- mode: f90 -*-                            
-!=======================================================================
-! This is part of the 2DECOMP&FFT library
-! 
-! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
-! decomposition. It also implements a highly scalable distributed
-! three-dimensional Fast Fourier Transform (FFT).
-!
-! Copyright (C) 2009-2011 Ning Li, the Numerical Algorithms Group (NAG)
-!
-!=======================================================================
+!                          -*- mode: f90 -*-
+!! SPDX-License-Identifier: BSD-3-Clause
 
-! This file contain common code to be included by subroutines 
+! This file contain common code to be included by subroutines
 ! 'mpiio_write_plane_3d_...' in io.f90
 
 ! It is much easier to implement if all mpi ranks participate I/O.
@@ -22,83 +13,94 @@ else
    call get_decomp_info(decomp)
 end if
 
+! The current GPU build will not perform transpose operations when writing planes
+#if defined(_GPU)
+if (ipencil /= iplane) then
+   call decomp_2d_warning(__FILE__, &
+                          __LINE__, &
+                          ipencil, &
+                          "GPU build does not allow transpose operations when writing planes")
+   return
+endif
+#endif
+
 opened_new = .false.
 
-if (iplane==1) then
-   allocate(wk(decomp%xsz(1),decomp%xsz(2),decomp%xsz(3)))
-   if (ipencil==1) then
+if (iplane == 1) then
+   allocate (wk(decomp%xsz(1), decomp%xsz(2), decomp%xsz(3)))
+   if (ipencil == 1) then
       wk = var
-   else if (ipencil==2) then
-      call transpose_y_to_x(var,wk,decomp)
-   else if (ipencil==3) then
-      allocate(wk2(decomp%ysz(1),decomp%ysz(2),decomp%ysz(3)))
-      call transpose_z_to_y(var,wk2,decomp)
-      call transpose_y_to_x(wk2,wk,decomp)
-      deallocate(wk2)
+   else if (ipencil == 2) then
+      call transpose_y_to_x(var, wk, decomp)
+   else if (ipencil == 3) then
+      allocate (wk2(decomp%ysz(1), decomp%ysz(2), decomp%ysz(3)))
+      call transpose_z_to_y(var, wk2, decomp)
+      call transpose_y_to_x(wk2, wk, decomp)
+      deallocate (wk2)
    end if
-   allocate(wk2d(1,decomp%xsz(2),decomp%xsz(3)))
-   if (n.ge.1) then
-      do k=1,decomp%xsz(3)
-         do j=1,decomp%xsz(2)
-            wk2d(1,j,k)=wk(n,j,k)
+   allocate (wk2d(1, decomp%xsz(2), decomp%xsz(3)))
+   if (n >= 1) then
+      do k = 1, decomp%xsz(3)
+         do j = 1, decomp%xsz(2)
+            wk2d(1, j, k) = wk(n, j, k)
          end do
       end do
    else
-      do k=1,decomp%xsz(3)
-         do j=1,decomp%xsz(2)
-            wk2d(1,j,k)=sum(wk(:,j,k))/real(decomp%xsz(1),kind=mytype)
+      do k = 1, decomp%xsz(3)
+         do j = 1, decomp%xsz(2)
+            wk2d(1, j, k) = sum(wk(:, j, k))/real(decomp%xsz(1), kind=mytype)
          end do
       end do
-   endif
-else if (iplane==2) then
-   allocate(wk(decomp%ysz(1),decomp%ysz(2),decomp%ysz(3)))
-   if (ipencil==1) then
-      call transpose_x_to_y(var,wk,decomp)
-   else if (ipencil==2) then
+   end if
+else if (iplane == 2) then
+   allocate (wk(decomp%ysz(1), decomp%ysz(2), decomp%ysz(3)))
+   if (ipencil == 1) then
+      call transpose_x_to_y(var, wk, decomp)
+   else if (ipencil == 2) then
       wk = var
-   else if (ipencil==3) then
-      call transpose_z_to_y(var,wk,decomp)
+   else if (ipencil == 3) then
+      call transpose_z_to_y(var, wk, decomp)
    end if
-   allocate(wk2d(decomp%ysz(1),1,decomp%ysz(3)))
-   if (n.ge.1) then
-      do k=1,decomp%ysz(3)
-         do i=1,decomp%ysz(1)
-            wk2d(i,1,k)=wk(i,n,k)
+   allocate (wk2d(decomp%ysz(1), 1, decomp%ysz(3)))
+   if (n >= 1) then
+      do k = 1, decomp%ysz(3)
+         do i = 1, decomp%ysz(1)
+            wk2d(i, 1, k) = wk(i, n, k)
          end do
       end do
    else
-      do k=1,decomp%ysz(3)
-         do i=1,decomp%ysz(1)
-            wk2d(i,1,k)=sum(wk(i,:,k))/real(decomp%ysz(2),kind=mytype)
+      do k = 1, decomp%ysz(3)
+         do i = 1, decomp%ysz(1)
+            wk2d(i, 1, k) = sum(wk(i, :, k))/real(decomp%ysz(2), kind=mytype)
          end do
       end do
-   endif
-else if (iplane==3) then
-   allocate(wk(decomp%zsz(1),decomp%zsz(2),decomp%zsz(3)))
-   if (ipencil==1) then
-      allocate(wk2(decomp%ysz(1),decomp%ysz(2),decomp%ysz(3)))
-      call transpose_x_to_y(var,wk2,decomp)
-      call transpose_y_to_z(wk2,wk,decomp)
-      deallocate(wk2)
-   else if (ipencil==2) then
-      call transpose_y_to_z(var,wk,decomp)
-   else if (ipencil==3) then
+   end if
+else if (iplane == 3) then
+   allocate (wk(decomp%zsz(1), decomp%zsz(2), decomp%zsz(3)))
+   if (ipencil == 1) then
+      allocate (wk2(decomp%ysz(1), decomp%ysz(2), decomp%ysz(3)))
+      call transpose_x_to_y(var, wk2, decomp)
+      call transpose_y_to_z(wk2, wk, decomp)
+      deallocate (wk2)
+   else if (ipencil == 2) then
+      call transpose_y_to_z(var, wk, decomp)
+   else if (ipencil == 3) then
       wk = var
    end if
-   allocate(wk2d(decomp%zsz(1),decomp%zsz(2),1))
-   if (n.ge.1) then
-      do j=1,decomp%zsz(2)
-         do i=1,decomp%zsz(1) 
-            wk2d(i,j,1)=wk(i,j,n)
+   allocate (wk2d(decomp%zsz(1), decomp%zsz(2), 1))
+   if (n >= 1) then
+      do j = 1, decomp%zsz(2)
+         do i = 1, decomp%zsz(1)
+            wk2d(i, j, 1) = wk(i, j, n)
          end do
       end do
    else
-      do j=1,decomp%zsz(2)
-         do i=1,decomp%zsz(1)
-            wk2d(i,j,1)=sum(wk(i,j,:))/real(decomp%zsz(3),kind=mytype)
+      do j = 1, decomp%zsz(2)
+         do i = 1, decomp%zsz(1)
+            wk2d(i, j, 1) = sum(wk(i, j, :))/real(decomp%zsz(3), kind=mytype)
          end do
       end do
-   endif
+   end if
 end if
 
 idx = get_io_idx(io_name, dirname)
@@ -106,15 +108,15 @@ idx = get_io_idx(io_name, dirname)
 #ifndef ADIOS2
 !! Use default MPIIO writers
 
-if (idx .lt. 1) then
+if (idx < 1) then
    ! Create folder if needed
-   if (nrank==0) then
-      inquire(file=dirname, exist=dir_exists)
-      if (.not.dir_exists) then
+   if (nrank == 0) then
+      inquire (file=dirname, exist=dir_exists)
+      if (.not. dir_exists) then
          call execute_command_line("mkdir "//dirname//" 2> /dev/null", wait=.true.)
       end if
    end if
-   allocate(character(len(trim(dirname)) + 1 + len(trim(varname))) :: full_io_name)
+   allocate (character(len(trim(dirname)) + 1 + len(trim(varname))) :: full_io_name)
    full_io_name = dirname//"/"//varname
    call decomp_2d_open_io(io_name, full_io_name, decomp_2d_write_mode)
    idx = get_io_idx(io_name, full_io_name)
@@ -122,40 +124,42 @@ if (idx .lt. 1) then
 end if
 
 call plane_extents(sizes, subsizes, starts, iplane, decomp)
-associate(fh=>fh_registry(idx), &
-     disp=>fh_disp(idx))
-  call MPI_TYPE_CREATE_SUBARRAY(3, sizes, subsizes, starts,  &
-       MPI_ORDER_FORTRAN, data_type, newtype, ierror)
-  if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_CREATE_SUBARRAY")
-  call MPI_TYPE_COMMIT(newtype,ierror)
-  if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_COMMIT")
-  call MPI_FILE_SET_VIEW(fh,disp,data_type, &
-       newtype,'native',MPI_INFO_NULL,ierror)
-  if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_SET_VIEW")
-  call MPI_FILE_WRITE_ALL(fh, wk2d, &
-       subsizes(1)*subsizes(2)*subsizes(3), &
-       data_type, MPI_STATUS_IGNORE, ierror)
-  if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_WRITE_ALL")
-  call MPI_TYPE_FREE(newtype,ierror)
-  if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_FREE")
+associate (fh => fh_registry(idx), &
+           disp => fh_disp(idx))
+   call MPI_TYPE_CREATE_SUBARRAY(3, sizes, subsizes, starts, &
+                                 MPI_ORDER_FORTRAN, data_type, newtype, ierror)
+   if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_CREATE_SUBARRAY")
+   call MPI_TYPE_COMMIT(newtype, ierror)
+   if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_COMMIT")
+   call MPI_FILE_SET_VIEW(fh, disp, data_type, &
+                          newtype, 'native', MPI_INFO_NULL, ierror)
+   if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_SET_VIEW")
+   call MPI_FILE_WRITE_ALL(fh, wk2d, &
+                           subsizes(1)*subsizes(2)*subsizes(3), &
+                           data_type, MPI_STATUS_IGNORE, ierror)
+   if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_WRITE_ALL")
+   call MPI_TYPE_FREE(newtype, ierror)
+   if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_FREE")
 end associate
-    
+
 if (opened_new) then
    call decomp_2d_close_io(io_name, full_io_name)
-   deallocate(full_io_name)
+   deallocate (full_io_name)
 end if
 #else
 !! Write using ADIOS2
 call adios2_at_io(io_handle, adios, io_name, ierror)
-if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "adios2_at_io "//trim(io_name))
+if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "adios2_at_io "//trim(io_name))
 call adios2_inquire_variable(var_handle, io_handle, varname, ierror)
-if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "adios2_inquire_variable "//trim(varname))
-if (.not.var_handle % valid) call decomp_2d_abort(__FILE__, __LINE__, -1, &
-   "trying to write variable before registering! "//trim(varname))
+if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "adios2_inquire_variable "//trim(varname))
+if (.not. var_handle%valid) then
+   call decomp_2d_abort(__FILE__, __LINE__, -1, &
+        "trying to write variable before registering!  "//trim(varname))
+end if
 
 !! Note - need to use sync mode as the array for the output plane gets reused.
 call adios2_put(engine_registry(idx), var_handle, wk2d, adios2_mode_sync, ierror)
-if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "adios2_put")
+if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "adios2_put")
 #endif
 
-deallocate(wk,wk2d)
+deallocate (wk, wk2d)
diff --git a/src/io_write_var.inc b/src/io_write_var.inc
index b497e589..d6eef576 100644
--- a/src/io_write_var.inc
+++ b/src/io_write_var.inc
@@ -1,21 +1,12 @@
-!                          -*- mode: f90 -*-                            
-!=======================================================================
-! This is part of the 2DECOMP&FFT library
-! 
-! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
-! decomposition. It also implements a highly scalable distributed
-! three-dimensional Fast Fourier Transform (FFT).
-!
-! Copyright (C) 2009-2011 Ning Li, the Numerical Algorithms Group (NAG)
-!
-!=======================================================================
+!                          -*- mode: f90 -*-
+!! SPDX-License-Identifier: BSD-3-Clause
 
-! This file contain common code to be included by subroutines 
+! This file contain common code to be included by subroutines
 ! 'write_var_...' in io.f90
 
-! Using MPI-IO to write a distributed 3D variable to a file. File 
+! Using MPI-IO to write a distributed 3D variable to a file. File
 ! operations (open/close) need to be done in calling application. This
-! allows multiple variables to be written to a single file. Together 
+! allows multiple variables to be written to a single file. Together
 ! with the corresponding read operation, this is the perfect solution
 ! for applications to perform restart/checkpointing.
 
@@ -33,42 +24,46 @@ if (ipencil == 1) then
    subsizes(1) = decomp%xsz(1)
    subsizes(2) = decomp%xsz(2)
    subsizes(3) = decomp%xsz(3)
-   starts(1) = decomp%xst(1)-1  ! 0-based index
-   starts(2) = decomp%xst(2)-1
-   starts(3) = decomp%xst(3)-1
+   starts(1) = decomp%xst(1) - 1  ! 0-based index
+   starts(2) = decomp%xst(2) - 1
+   starts(3) = decomp%xst(3) - 1
 else if (ipencil == 2) then
    subsizes(1) = decomp%ysz(1)
    subsizes(2) = decomp%ysz(2)
    subsizes(3) = decomp%ysz(3)
-   starts(1) = decomp%yst(1)-1
-   starts(2) = decomp%yst(2)-1
-   starts(3) = decomp%yst(3)-1
+   starts(1) = decomp%yst(1) - 1
+   starts(2) = decomp%yst(2) - 1
+   starts(3) = decomp%yst(3) - 1
 else if (ipencil == 3) then
    subsizes(1) = decomp%zsz(1)
    subsizes(2) = decomp%zsz(2)
    subsizes(3) = decomp%zsz(3)
-   starts(1) = decomp%zst(1)-1
-   starts(2) = decomp%zst(2)-1
-   starts(3) = decomp%zst(3)-1
-endif
+   starts(1) = decomp%zst(1) - 1
+   starts(2) = decomp%zst(2) - 1
+   starts(3) = decomp%zst(3) - 1
+end if
 
-call MPI_TYPE_CREATE_SUBARRAY(3, sizes, subsizes, starts,  &
-     MPI_ORDER_FORTRAN, data_type, newtype, ierror)
-if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_CREATE_SUBARRAY")
-call MPI_TYPE_COMMIT(newtype,ierror)
-if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_COMMIT")
-call MPI_FILE_SET_VIEW(fh,disp,data_type, &
-     newtype,'native',MPI_INFO_NULL,ierror)
-if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_SET_VIEW")
+call MPI_TYPE_CREATE_SUBARRAY(3, sizes, subsizes, starts, &
+                              MPI_ORDER_FORTRAN, data_type, newtype, ierror)
+if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_CREATE_SUBARRAY")
+call MPI_TYPE_COMMIT(newtype, ierror)
+if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_COMMIT")
+call MPI_FILE_SET_VIEW(fh, disp, data_type, &
+                       newtype, 'native', MPI_INFO_NULL, ierror)
+if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_SET_VIEW")
 call MPI_FILE_WRITE_ALL(fh, var, &
-     subsizes(1)*subsizes(2)*subsizes(3), &
-     data_type, MPI_STATUS_IGNORE, ierror)
-if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_WRITE_ALL")
-call MPI_TYPE_FREE(newtype,ierror)
-if (ierror.ne.0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_FREE")
+                        subsizes(1)*subsizes(2)*subsizes(3), &
+                        data_type, MPI_STATUS_IGNORE, ierror)
+if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_FILE_WRITE_ALL")
+call MPI_TYPE_FREE(newtype, ierror)
+if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_TYPE_FREE")
 
 ! update displacement for the next write operation
-disp = disp + sizes(1)*sizes(2)*sizes(3)*mytype_bytes
-if (data_type == complex_type) then
-   disp = disp + sizes(1)*sizes(2)*sizes(3)*mytype_bytes
-end if
+disp = disp + int(sizes(1), kind=MPI_OFFSET_KIND) &
+       *int(sizes(2), kind=MPI_OFFSET_KIND) &
+       *int(sizes(3), kind=MPI_OFFSET_KIND) &
+       *int(mytype_bytes, kind=MPI_OFFSET_KIND)
+if (data_type == complex_type) disp = disp + int(sizes(1), kind=MPI_OFFSET_KIND) &
+                                      *int(sizes(2), kind=MPI_OFFSET_KIND) &
+                                      *int(sizes(3), kind=MPI_OFFSET_KIND) &
+                                      *int(mytype_bytes, kind=MPI_OFFSET_KIND)
diff --git a/src/log.f90 b/src/log.f90
index 3581ea7c..ae121eca 100644
--- a/src/log.f90
+++ b/src/log.f90
@@ -1,309 +1,295 @@
-!=======================================================================
-! This is part of the 2DECOMP&FFT library
-! 
-! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
-! decomposition. It also implements a highly scalable distributed
-! three-dimensional Fast Fourier Transform (FFT).
-!
-! Copyright (C) 2009-2012 Ning Li, the Numerical Algorithms Group (NAG)
-! Copyright (C) 2021               the University of Edinburgh (UoE)
-!
-!=======================================================================
-
-submodule (decomp_2d) d2d_log
-
-  implicit none
-
-  contains
-
-  !
-  ! Print some information about decomp_2d
-  !
-  module subroutine d2d_listing(given_io_unit)
-
-    use iso_fortran_env, only : output_unit, compiler_version, compiler_options
-
-    implicit none
-
-    ! Argument
-    integer, intent(in), optional :: given_io_unit
-
-    ! Local variable
-    integer :: io_unit
-    integer :: version, subversion, ierror
+!! SPDX-License-Identifier: BSD-3-Clause
+
+submodule(decomp_2d) d2d_log
+
+   use decomp_2d_constants
+   use decomp_2d_mpi
+
+   implicit none
+
+contains
+
+   !
+   ! Get the IO unit for the log
+   !
+   module function d2d_listing_get_unit()
+
+      use iso_fortran_env, only: output_unit
+
+      implicit none
+
+      ! Output
+      integer :: d2d_listing_get_unit
+
+      ! Local variables
+      logical :: found
+      integer :: ierror
+      character(len=7) fname ! Sufficient for up to O(1M) ranks
+
+      if (decomp_log == D2D_LOG_TOFILE_FULL) then
+         write (fname, "(I0)") nrank ! Adapt to magnitude of nrank
+         inquire (file='decomp_2d_setup_'//trim(fname)//'.log', &
+                  exist=found)
+         if (found) then
+            open (newunit=d2d_listing_get_unit, &
+                  file='decomp_2d_setup_'//trim(fname)//'.log', &
+                  status="old", &
+                  position="append", &
+                  iostat=ierror)
+         else
+            open (newunit=d2d_listing_get_unit, &
+                  file='decomp_2d_setup_'//trim(fname)//'.log', &
+                  status="new", &
+                  iostat=ierror)
+         end if
+      elseif (nrank == 0 .and. decomp_log == D2D_LOG_TOFILE) then
+         inquire (file="decomp_2d_setup.log", &
+                  exist=found)
+         if (found) then
+            open (newunit=d2d_listing_get_unit, &
+                  file="decomp_2d_setup.log", &
+                  status="old", &
+                  position="append", &
+                  iostat=ierror)
+         else
+            open (newunit=d2d_listing_get_unit, &
+                  file="decomp_2d_setup.log", &
+                  status="new", &
+                  iostat=ierror)
+         end if
+      else
+         d2d_listing_get_unit = output_unit
+         ierror = 0
+      end if
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "Could not open log file")
+
+   end function d2d_listing_get_unit
+
+   !
+   ! Close the IO unit for the log if needed
+   !
+   module subroutine d2d_listing_close_unit(io_unit)
+
+      use iso_fortran_env, only: output_unit
+
+      implicit none
+
+      ! Input
+      integer, intent(in) :: io_unit
+
+      ! Local variables
+      integer :: ierror
+
+      !
+      ! Close the IO unit if it was not stdout
+      !
+      if (io_unit /= output_unit) then
+         close (io_unit, iostat=ierror)
+         if (ierror /= 0) call decomp_2d_abort(ierror, "Could not close log file")
+      end if
+
+   end subroutine d2d_listing_close_unit
+
+   !
+   ! Print some information about decomp_2d
+   !
+   module subroutine d2d_listing(given_io_unit)
+
+      use iso_fortran_env, only: output_unit, compiler_version, compiler_options
+
+      implicit none
+
+      ! Argument
+      integer, intent(in), optional :: given_io_unit
+
+      ! Local variable
+      integer :: io_unit
+      integer :: version, subversion, ierror
 #ifdef DEBUG
-    character(len=64) :: fname
-#endif
-
-    !
-    ! Default : only rank 0 will print a listing
-    !
-    ! In DEBUG mode, all ranks will print a listing
-    !
-#ifndef DEBUG
-    if (nrank /= 0) return
+      character(len=512) :: fname
 #endif
 
-    ! If no IO unit provided, use stdout
-    if (present(given_io_unit)) then
-       io_unit = given_io_unit
-    else
-       io_unit = output_unit
-    endif
-
-    ! Header
-    write (io_unit, *) '==========================================================='
-    write (io_unit, *) '=================== Decomp2D - log ========================'
-    write (io_unit, *) '==========================================================='
-
-    ! Git hash if available
+      ! Output log if needed
+      if (decomp_log == D2D_LOG_QUIET) return
+      if (decomp_log == D2D_LOG_STDOUT .and. nrank /= 0) return
+      if (decomp_log == D2D_LOG_TOFILE .and. nrank /= 0) return
+
+      ! If no IO unit provided, use stdout
+      if (present(given_io_unit)) then
+         io_unit = given_io_unit
+      else
+         io_unit = output_unit
+      end if
+
+      ! Header
+      write (io_unit, *) '==========================================================='
+      write (io_unit, *) '=================== Decomp2D - log ========================'
+      write (io_unit, *) '==========================================================='
+
+      ! Major and minor version number
+      if (D2D_RELEASE) then
+         write (io_unit, "(A,I0,A,I0)") ' Release ', D2D_MAJOR, '.', D2D_MINOR
+      else
+         write (io_unit, "(A,I0,A,I0,A)") ' Release ', D2D_MAJOR, '.', D2D_MINOR, '.alpha'
+      end if
+
+      ! Git hash if available
 #if defined(VERSION)
-    write (io_unit, *) 'Git version        : ', VERSION
+      write (io_unit, *) 'Git version        : ', VERSION
 #else
-    write (io_unit, *) 'Git version        : unknown'
+      write (io_unit, *) 'Git version        : unknown'
 #endif
 
-    ! Basic info
+      ! Basic info
 #ifdef DEBUG
-    if (decomp_debug >= D2D_DEBUG_LEVEL_INFO) &
-       write (io_unit, *) 'I am mpi rank ', nrank
+      if (decomp_debug >= D2D_DEBUG_LEVEL_INFO) &
+         write (io_unit, *) 'I am mpi rank ', nrank
 #endif
-    write (io_unit, *) 'Total ranks ', nproc
-    write (io_unit, *) 'Global data size : ', nx_global, ny_global, nz_global
-    write (io_unit, *) 'p_row, p_col : ', dims(1), dims(2)
-    write (io_unit, *) 'Periodicity : ', periodic_x, periodic_y, periodic_z
-    write (io_unit, *) 'Number of bytes / float number : ', mytype_bytes
-    write (io_unit, *) '==========================================================='
-
-    ! Show detected flags, compiler options, version of the MPI library
-    write (io_unit, *) 'Compile flags detected :'
+      write (io_unit, *) 'Total ranks ', nproc
+      write (io_unit, *) 'Global data size : ', nx_global, ny_global, nz_global
+      write (io_unit, *) 'p_row, p_col : ', dims(1), dims(2)
+      write (io_unit, *) 'Periodicity : ', periodic_x, periodic_y, periodic_z
+      write (io_unit, *) 'Number of bytes / float number : ', mytype_bytes
+      write (io_unit, *) '==========================================================='
+
+      ! Show detected flags, compiler options, version of the MPI library
+      write (io_unit, *) 'Compile flags detected :'
 #ifdef DOUBLE_PREC
 #ifdef SAVE_SINGLE
-    write (io_unit, *) 'Numerical precision: Double, saving in single'
+      write (io_unit, *) 'Numerical precision: Double, saving in single'
 #else
-    write (io_unit, *) 'Numerical precision: Double'
+      write (io_unit, *) 'Numerical precision: Double'
 #endif
 #else
-    write (io_unit, *) 'Numerical precision: Single'
+      write (io_unit, *) 'Numerical precision: Single'
 #endif
-    write (io_unit, *) 'Compiled with ', compiler_version()
-    write (io_unit, *) 'Compiler options : ', compiler_options()
-    call MPI_Get_version(version, subversion, ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_Get_version")
-    write (io_unit, '(" Version of the MPI library : ",I0,".",I0)') version, subversion
+      write (io_unit, *) 'Compiled with ', compiler_version()
+      write (io_unit, *) 'Compiler options : ', compiler_options()
+      call MPI_Get_version(version, subversion, ierror)
+      if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_Get_version")
+      write (io_unit, '(" Version of the MPI library : ",I0,".",I0)') version, subversion
 #ifdef DEBUG
-    write (io_unit, *) 'Compile flag DEBUG detected'
-    write (io_unit, *) '   debug level : ', decomp_debug
+      write (io_unit, *) 'Compile flag DEBUG detected'
+      write (io_unit, *) '   debug level : ', decomp_debug
 #endif
 #ifdef PROFILER
-    write (io_unit, *) 'Compile flag PROFILER detected'
-#endif
-#ifdef SHM
-    write (io_unit, *) 'Compile flag SHM detected'
+      write (io_unit, *) 'Compile flag PROFILER detected'
 #endif
 #ifdef EVEN
-    write (io_unit, *) 'Compile flag EVEN detected'
-#endif
-#ifdef OCC
-    write (io_unit, *) 'Compile flag OCC detected'
+      write (io_unit, *) 'Compile flag EVEN detected'
 #endif
 #ifdef OVERWRITE
-    write (io_unit, *) 'Compile flag OVERWRITE detected'
+      write (io_unit, *) 'Compile flag OVERWRITE detected'
 #endif
 #ifdef HALO_DEBUG
-    write (io_unit, *) 'Compile flag HALO_DEBUG detected'
-#endif
-#ifdef SHM_DEBUG
-    write (io_unit, *) 'Compile flag SHM_DEBUG detected'
+      write (io_unit, *) 'Compile flag HALO_DEBUG detected'
 #endif
 #ifdef _GPU
-    write (io_unit, *) 'Compile flag _GPU detected'
+      write (io_unit, *) 'Compile flag _GPU detected'
 #endif
 #ifdef _NCCL
-    write (io_unit, *) 'Compile flag _NCCL detected'
+      write (io_unit, *) 'Compile flag _NCCL detected'
 #endif
-    write (io_unit, *) '==========================================================='
-    write (io_unit, *) 'Profiler id : ', decomp_profiler
+      write (io_unit, *) '==========================================================='
+      write (io_unit, *) 'Profiler id : ', decomp_profiler
 #ifdef PROFILER
-    call decomp_profiler_log(io_unit)
-    write(io_unit, *) "   Profiling transpose : ", decomp_profiler_transpose
-    write(io_unit, *) "   Profiling IO : ", decomp_profiler_io
-    write(io_unit, *) "   Profiling FFT : ", decomp_profiler_fft
-    write(io_unit, *) "   Profiling decomp_2d : ", decomp_profiler_d2d
+      call decomp_profiler_log(io_unit)
+      write (io_unit, *) "   Profiling transpose : ", decomp_profiler_transpose
+      write (io_unit, *) "   Profiling IO : ", decomp_profiler_io
+      write (io_unit, *) "   Profiling FFT : ", decomp_profiler_fft
+      write (io_unit, *) "   Profiling decomp_2d : ", decomp_profiler_d2d
 #endif
-    write (io_unit, *) '==========================================================='
-    ! Info about each decomp_info object
-    call decomp_info_print(decomp_main, io_unit, "decomp_main")
-    call decomp_info_print(phG, io_unit, "phG")
-    call decomp_info_print(ph1, io_unit, "ph1")
-    call decomp_info_print(ph2, io_unit, "ph2")
-    call decomp_info_print(ph3, io_unit, "ph3")
-    call decomp_info_print(ph4, io_unit, "ph4")
-#ifdef SHM_DEBUG
-    write (io_unit, *) '==========================================================='
-    call print_smp(io_unit)
-#endif
-    write (io_unit, *) '==========================================================='
-    write (io_unit, *) '==========================================================='
+      write (io_unit, *) '==========================================================='
+      ! Info about each decomp_info object
+      call decomp_info_print(decomp_main, io_unit, "decomp_main")
+      call decomp_info_print(phG, io_unit, "phG")
+      call decomp_info_print(ph1, io_unit, "ph1")
+      call decomp_info_print(ph2, io_unit, "ph2")
+      call decomp_info_print(ph3, io_unit, "ph3")
+      call decomp_info_print(ph4, io_unit, "ph4")
+      write (io_unit, *) '==========================================================='
+      write (io_unit, *) '==========================================================='
+
 #ifdef DEBUG
-    !
-    ! In DEBUG mode, rank 0 will also print environment variables
-    !
-    ! At high debug level, all ranks will print env. variables
-    !
-    ! The system call, if writing to a file, is not blocking if supported
-    !
-    if (nrank == 0 .or. decomp_debug >= D2D_DEBUG_LEVEL_INFO) then
-       write (io_unit, *) '============== Environment variables ======================'
-       write (io_unit, *) '==========================================================='
-       write (io_unit, *) '==========================================================='
-       if (io_unit == output_unit ) then
-          call execute_command_line("env", wait = .true.)
-       else
-          inquire(unit = io_unit, name = fname, iostat = ierror)
-          if (ierror /= 0) call decomp_2d_abort(__FILE__, &
-                                                __LINE__, &
-                                                ierror, &
-                                                "No name for the log file")
-          call execute_command_line("env >> "//trim(fname), wait = .false.)
-       endif
-    endif
+      !
+      ! In DEBUG mode, rank 0 will also print environment variables
+      !
+      ! At high debug level, all ranks will print env. variables
+      !
+      ! The system call, if writing to a file, is not blocking if supported
+      !
+      if (nrank == 0 .or. decomp_debug >= D2D_DEBUG_LEVEL_INFO) then
+         write (io_unit, *) '============== Environment variables ======================'
+         write (io_unit, *) '==========================================================='
+         write (io_unit, *) '==========================================================='
+         if (io_unit == output_unit) then
+            call execute_command_line("env", wait=.true.)
+         else
+            inquire (unit=io_unit, name=fname, iostat=ierror)
+            if (ierror /= 0) then
+               call decomp_2d_abort(__FILE__, __LINE__, ierror, "No name for the log file")
+            end if
+            ! Close the IO unit to print the environment variables
+            call d2d_listing_close_unit(io_unit)
+            call execute_command_line("env >> "//trim(fname), wait=.true.)
+         end if
+      end if
+#else
+      ! Close the IO unit if needed
+      call d2d_listing_close_unit(io_unit)
 #endif
 
-  end subroutine d2d_listing
+   end subroutine d2d_listing
 
-  !
-  ! Print some information about given decomp_info object
-  !
-  module subroutine decomp_info_print(d2d, io_unit, d2dname)
+   !
+   ! Print some information about given decomp_info object
+   !
+   module subroutine decomp_info_print(d2d, io_unit, d2dname)
 
-    implicit none
+      implicit none
 
-    ! Arguments
-    type(decomp_info), intent(in) :: d2d
-    integer, intent(in) :: io_unit
-    character(len=*), intent(in) :: d2dname
+      ! Arguments
+      type(decomp_info), intent(in) :: d2d
+      integer, intent(in) :: io_unit
+      character(len=*), intent(in) :: d2dname
 
-    ! Nothing to print if not initialized
-    if (.not.allocated(d2d%x1dist)) then
-      write (io_unit, *) 'Uninitialized decomp_info ', d2dname
-      return
-    endif
+      ! Nothing to print if not initialized
+      if (.not. allocated(d2d%x1dist)) then
+         write (io_unit, *) 'Uninitialized decomp_info ', d2dname
+         return
+      end if
 
-    !
-    ! If DEBUG mode, print everything
-    ! Otherwise, print only global size
-    !
-    write (io_unit, *) 'Decomp_info : ', d2dname
-    write (io_unit, *) '   Global size : ', d2d%xsz(1), d2d%ysz(2), d2d%zsz(3)
+      !
+      ! If DEBUG mode, print everything
+      ! Otherwise, print only global size
+      !
+      write (io_unit, *) 'Decomp_info : ', d2dname
+      write (io_unit, *) '   Global size : ', d2d%xsz(1), d2d%ysz(2), d2d%zsz(3)
 #ifdef DEBUG
-    write (io_unit, *) '   xsz, xst, xen : ', d2d%xsz, d2d%xst, d2d%xen
-    write (io_unit, *) '   ysz, yst, yen : ', d2d%ysz, d2d%yst, d2d%yen
-    write (io_unit, *) '   zsz, zst, zen : ', d2d%zsz, d2d%zst, d2d%zen
-    write (io_unit, *) '   x1dist : ', d2d%x1dist
-    write (io_unit, *) '   y1dist : ', d2d%y1dist
-    write (io_unit, *) '   y2dist : ', d2d%y2dist
-    write (io_unit, *) '   z2dist : ', d2d%z2dist
-    write (io_unit, *) '   x1cnts : ', d2d%x1cnts
-    write (io_unit, *) '   y1cnts : ', d2d%y1cnts
-    write (io_unit, *) '   y2cnts : ', d2d%y2cnts
-    write (io_unit, *) '   z2cnts : ', d2d%z2cnts
-    write (io_unit, *) '   x1disp : ', d2d%x1disp
-    write (io_unit, *) '   y1disp : ', d2d%y1disp
-    write (io_unit, *) '   y2disp : ', d2d%y2disp
-    write (io_unit, *) '   z2disp : ', d2d%z2disp
-    write (io_unit, *) '   x1count : ', d2d%x1count
-    write (io_unit, *) '   y1count : ', d2d%y1count
-    write (io_unit, *) '   y2count : ', d2d%y2count
-    write (io_unit, *) '   z2count : ', d2d%z2count
-    write (io_unit, *) '   even : ', d2d%even
-#ifdef SHM
-    write (io_unit, *) '   listing of the SHM part is not yet implemented'
+      write (io_unit, *) '   xsz, xst, xen : ', d2d%xsz, d2d%xst, d2d%xen
+      write (io_unit, *) '   ysz, yst, yen : ', d2d%ysz, d2d%yst, d2d%yen
+      write (io_unit, *) '   zsz, zst, zen : ', d2d%zsz, d2d%zst, d2d%zen
+      write (io_unit, *) '   x1dist : ', d2d%x1dist
+      write (io_unit, *) '   y1dist : ', d2d%y1dist
+      write (io_unit, *) '   y2dist : ', d2d%y2dist
+      write (io_unit, *) '   z2dist : ', d2d%z2dist
+      write (io_unit, *) '   x1cnts : ', d2d%x1cnts
+      write (io_unit, *) '   y1cnts : ', d2d%y1cnts
+      write (io_unit, *) '   y2cnts : ', d2d%y2cnts
+      write (io_unit, *) '   z2cnts : ', d2d%z2cnts
+      write (io_unit, *) '   x1disp : ', d2d%x1disp
+      write (io_unit, *) '   y1disp : ', d2d%y1disp
+      write (io_unit, *) '   y2disp : ', d2d%y2disp
+      write (io_unit, *) '   z2disp : ', d2d%z2disp
+#ifdef EVEN
+      write (io_unit, *) '   x1count : ', d2d%x1count
+      write (io_unit, *) '   y1count : ', d2d%y1count
+      write (io_unit, *) '   y2count : ', d2d%y2count
+      write (io_unit, *) '   z2count : ', d2d%z2count
+      write (io_unit, *) '   even : ', d2d%even
 #endif
 #endif
 
-  end subroutine decomp_info_print
-
-#ifdef SHM_DEBUG
-
-  ! For debugging, print the shared-memory structure
-  module subroutine print_smp(io_unit)
-
-    implicit none
-    
-    ! Argument
-    integer, intent(in) :: io_unit
-
-    ! print out shared-memory information
-    write(io_unit,*)'I am mpi rank ', nrank, 'Total ranks ', nproc
-    write(io_unit,*)' '
-    write(io_unit,*)'Global data size:'
-    write(io_unit,*)'nx*ny*nz', nx,ny,nz
-    write(io_unit,*)' '
-    write(io_unit,*)'2D processor grid:'
-    write(io_unit,*)'p_row*p_col:', dims(1), dims(2)
-    write(io_unit,*)' '
-    write(io_unit,*)'Portion of global data held locally:'
-    write(io_unit,*)'xsize:',xsize
-    write(io_unit,*)'ysize:',ysize
-    write(io_unit,*)'zsize:',zsize
-    write(io_unit,*)' '
-    write(io_unit,*)'How pensils are to be divided and sent in alltoallv:'
-    write(io_unit,*)'x1dist:',decomp_main%x1dist
-    write(io_unit,*)'y1dist:',decomp_main%y1dist
-    write(io_unit,*)'y2dist:',decomp_main%y2dist
-    write(io_unit,*)'z2dist:',decomp_main%z2dist
-    write(io_unit,*)' '
-    write(io_unit,*)'######Shared buffer set up after this point######'
-    write(io_unit,*)' '
-    write(io_unit,*) 'col communicator detais:'
-    call print_smp_info(decomp_main%COL_INFO, io_unit)
-    write(io_unit,*)' '
-    write(io_unit,*) 'row communicator detais:'
-    call print_smp_info(decomp_main%ROW_INFO; io_unit)
-    write(io_unit,*)' '
-    write(io_unit,*)'Buffer count and displacement of per-core buffers'
-    write(io_unit,*)'x1cnts:',decomp_main%x1cnts
-    write(io_unit,*)'y1cnts:',decomp_main%y1cnts
-    write(io_unit,*)'y2cnts:',decomp_main%y2cnts
-    write(io_unit,*)'z2cnts:',decomp_main%z2cnts
-    write(io_unit,*)'x1disp:',decomp_main%x1disp
-    write(io_unit,*)'y1disp:',decomp_main%y1disp
-    write(io_unit,*)'y2disp:',decomp_main%y2disp
-    write(io_unit,*)'z2disp:',decomp_main%z2disp
-    write(io_unit,*)' '
-    write(io_unit,*)'Buffer count and displacement of shared buffers'
-    write(io_unit,*)'x1cnts:',decomp_main%x1cnts_s
-    write(io_unit,*)'y1cnts:',decomp_main%y1cnts_s
-    write(io_unit,*)'y2cnts:',decomp_main%y2cnts_s
-    write(io_unit,*)'z2cnts:',decomp_main%z2cnts_s
-    write(io_unit,*)'x1disp:',decomp_main%x1disp_s
-    write(io_unit,*)'y1disp:',decomp_main%y1disp_s
-    write(io_unit,*)'y2disp:',decomp_main%y2disp_s
-    write(io_unit,*)'z2disp:',decomp_main%z2disp_s
-
-  end subroutine print_smp
-
-  ! For debugging, print the shared-memory structure
-  module subroutine print_smp_info(s, io_unit)
-
-    implicit none
-
-    ! Argument
-    TYPE(SMP_INFO), intent(in) :: s
-    integer, intent(in) :: io_unit
-
-    write(io_unit,*) 'size of current communicator:', s%NCPU
-    write(io_unit,*) 'rank in current communicator:', s%NODE_ME
-    write(io_unit,*) 'number of SMP-nodes in this communicator:', s%NSMP
-    write(io_unit,*) 'SMP-node id (1 ~ NSMP):', s%SMP_ME
-    write(io_unit,*) 'NCORE - number of cores on this SMP-node', s%NCORE
-    write(io_unit,*) 'core id (1 ~ NCORE):', s%CORE_ME
-    write(io_unit,*) 'maximum no. cores on any SMP-node:', s%MAXCORE
-    write(io_unit,*) 'size of SMP shared memory SND buffer:', s%N_SND
-    write(io_unit,*) 'size of SMP shared memory RCV buffer:', s%N_RCV
-
-  end subroutine print_smp_info
-#endif
+   end subroutine decomp_info_print
 
 end submodule d2d_log
diff --git a/src/profiler_caliper.f90 b/src/profiler_caliper.f90
index 5c5a1fa2..944129fb 100644
--- a/src/profiler_caliper.f90
+++ b/src/profiler_caliper.f90
@@ -1,14 +1,4 @@
-!=======================================================================
-! This is part of the 2DECOMP&FFT library
-!
-! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil)
-! decomposition. It also implements a highly scalable distributed
-! three-dimensional Fast Fourier Transform (FFT).
-!
-! Copyright (C) 2009-2012 Ning Li, the Numerical Algorithms Group (NAG)
-! Copyright (C) 2021               the University of Edinburgh (UoE)
-!
-!=======================================================================
+!! SPDX-License-Identifier: BSD-3-Clause
 
 !
 ! Submodule for the caliper profiler
diff --git a/src/transpose_x_to_y.f90 b/src/transpose_x_to_y.f90
index c5d83fa4..efc59538 100644
--- a/src/transpose_x_to_y.f90
+++ b/src/transpose_x_to_y.f90
@@ -1,492 +1,447 @@
-!=======================================================================
-! This is part of the 2DECOMP&FFT library
-! 
-! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
-! decomposition. It also implements a highly scalable distributed
-! three-dimensional Fast Fourier Transform (FFT).
-!
-! Copyright (C) 2009-2021 Ning Li, the Numerical Algorithms Group (NAG)
-!
-!=======================================================================
+!! SPDX-License-Identifier: BSD-3-Clause
 
 ! This file contains the routines that transpose data from X to Y pencil
 
   subroutine transpose_x_to_y_real_short(src, dst)
 
-    implicit none
+     implicit none
 
-    real(mytype), dimension(:,:,:), intent(IN) :: src
-    real(mytype), dimension(:,:,:), intent(OUT) :: dst
+     real(mytype), dimension(:, :, :), intent(IN) :: src
+     real(mytype), dimension(:, :, :), intent(OUT) :: dst
 
-    call transpose_x_to_y(src, dst, decomp_main)
+     call transpose_x_to_y(src, dst, decomp_main)
 
   end subroutine transpose_x_to_y_real_short
 
-  subroutine transpose_x_to_y_real(src, dst, decomp)
-
-    implicit none
-    
-    Real(mytype), dimension(:,:,:), intent(IN) :: src
-    real(mytype), dimension(:,:,:), intent(OUT) :: dst
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
+  subroutine transpose_x_to_y_real_long(src, dst, decomp)
 
+     implicit none
 
-#if defined(_GPU) && defined(_NCCL)
-    integer :: col_rank_id, cuda_stat
-    type(ncclResult) :: nccl_stat
+     real(mytype), dimension(:, :, :), intent(IN) :: src
+     real(mytype), dimension(:, :, :), intent(OUT) :: dst
+     TYPE(DECOMP_INFO), intent(IN) :: decomp
+#if defined(_GPU)
+     integer :: istat, nsize
 #endif
 
-#ifdef SHM
-    real(mytype) :: work1(*), work2(*)
-    POINTER  (work1_p, work1), (work2_p, work2)  ! Cray pointers
+     if (dims(1) == 1) then
+#if defined(_GPU)
+        nsize = product(decomp%xsz)
+        !$acc host_data use_device(src,dst)
+        istat = cudaMemcpy(dst, src, nsize, cudaMemcpyDeviceToDevice)
+        !$acc end host_data
+        if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cudaMemcpy")
+#else
+        dst = src
 #endif
-    
-    integer :: s1,s2,s3,d1,d2,d3
-    integer :: ierror
+     else
+        call transpose_x_to_y_real(src, dst, decomp)
+     end if
+
+  end subroutine transpose_x_to_y_real_long
+
+  subroutine transpose_x_to_y_real(src, dst, decomp)
+
+     implicit none
+
+     Real(mytype), dimension(:, :, :), intent(IN) :: src
+     real(mytype), dimension(:, :, :), intent(OUT) :: dst
+     TYPE(DECOMP_INFO), intent(IN) :: decomp
+
+     integer :: s1, s2, s3, d1, d2, d3
+     integer :: ierror
 
 #ifdef PROFILER
-    if (decomp_profiler_transpose) call decomp_profiler_start("transp_x_y_r")
+     if (decomp_profiler_transpose) call decomp_profiler_start("transp_x_y_r")
 #endif
 
-    s1 = SIZE(src,1)
-    s2 = SIZE(src,2)
-    s3 = SIZE(src,3)
-    d1 = SIZE(dst,1)
-    d2 = SIZE(dst,2)
-    d3 = SIZE(dst,3)
-
-    ! rearrange source array as send buffer
-#ifdef SHM
-    work1_p = decomp%COL_INFO%SND_P
-    call mem_split_xy_real(src, s1, s2, s3, work1, dims(1), &
-         decomp%x1dist, decomp)
-#else
-  
+     s1 = SIZE(src, 1)
+     s2 = SIZE(src, 2)
+     s3 = SIZE(src, 3)
+     d1 = SIZE(dst, 1)
+     d2 = SIZE(dst, 2)
+     d3 = SIZE(dst, 3)
+
+     ! rearrange source array as send buffer
 #if defined(_GPU)
-    call mem_split_xy_real(src, s1, s2, s3, work1_r_d, dims(1), &
-         decomp%x1dist, decomp)
+     call mem_split_xy_real(src, s1, s2, s3, work1_r_d, dims(1), &
+                            decomp%x1dist, decomp)
 #else
-    call mem_split_xy_real(src, s1, s2, s3, work1_r, dims(1), &
-         decomp%x1dist, decomp)
+     call mem_split_xy_real(src, s1, s2, s3, work1_r, dims(1), &
+                            decomp%x1dist, decomp)
 #endif
 
-#endif
-
-    ! define receive buffer
-#ifdef SHM
-    work2_p = decomp%COL_INFO%RCV_P
-    call MPI_BARRIER(decomp%COL_INFO%CORE_COMM, ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_BARRIER")
-#endif
-    
-    ! transpose using MPI_ALLTOALL(V)
-#ifdef SHM
-    if (decomp%COL_INFO%CORE_ME==1) THEN
-       call MPI_ALLTOALLV(work1, decomp%x1cnts_s, decomp%x1disp_s, &
-            real_type, work2, decomp%y1cnts_s, decomp%y1disp_s, &
-            real_type, decomp%COL_INFO%SMP_COMM, ierror)
-       if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALLV")
-    end if
-#else
+     ! define receive buffer
+     ! transpose using MPI_ALLTOALL(V)
 #ifdef EVEN
-    call MPI_ALLTOALL(work1_r, decomp%x1count, &
-         real_type, work2_r, decomp%y1count, &
-         real_type, DECOMP_2D_COMM_COL, ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALL")
+     call MPI_ALLTOALL(work1_r, decomp%x1count, &
+                       real_type, work2_r, decomp%y1count, &
+                       real_type, DECOMP_2D_COMM_COL, ierror)
+     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALL")
 #else
 
 #if defined(_GPU)
 #if defined(_NCCL)
-    nccl_stat = ncclGroupStart()
-    if (nccl_stat /= ncclSuccess) call decomp_2d_abort(__FILE__, __LINE__, nccl_stat, "ncclGroupStart")
-    do col_rank_id = 0, (col_comm_size - 1)
-        nccl_stat = ncclSend(work1_r_d( decomp%x1disp(col_rank_id)+1 ), decomp%x1cnts(col_rank_id), &
-          ncclDouble, local_to_global_col(col_rank_id+1), nccl_comm_2decomp, cuda_stream_2decomp)
-        if (nccl_stat /= ncclSuccess) call decomp_2d_abort(__FILE__, __LINE__, nccl_stat, "ncclSend")
-        nccl_stat = ncclRecv(work2_r_d( decomp%y1disp(col_rank_id)+1 ), decomp%y1cnts(col_rank_id), &
-          ncclDouble, local_to_global_col(col_rank_id+1), nccl_comm_2decomp, cuda_stream_2decomp)
-        if (nccl_stat /= ncclSuccess) call decomp_2d_abort(__FILE__, __LINE__, nccl_stat, "ncclRecv")
-    end do
-    nccl_stat = ncclGroupEnd()
-    if (nccl_stat /= ncclSuccess) call decomp_2d_abort(__FILE__, __LINE__, nccl_stat, "ncclGroupEnd")
-    cuda_stat = cudaStreamSynchronize(cuda_stream_2decomp)
-    if (cuda_stat /= 0) call decomp_2d_abort(__FILE__, __LINE__, cuda_stat, "cudaStreamSynchronize")
+     call decomp_2d_nccl_send_recv_col(work2_r_d, &
+                                       work1_r_d, &
+                                       decomp%x1disp, &
+                                       decomp%x1cnts, &
+                                       decomp%y1disp, &
+                                       decomp%y1cnts, &
+                                       dims(1))
 #else
-    call MPI_ALLTOALLV(work1_r_d, decomp%x1cnts, decomp%x1disp, &
-         real_type, work2_r_d, decomp%y1cnts, decomp%y1disp, &
-         real_type, DECOMP_2D_COMM_COL, ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALLV")
+     call MPI_ALLTOALLV(work1_r_d, decomp%x1cnts, decomp%x1disp, &
+                        real_type, work2_r_d, decomp%y1cnts, decomp%y1disp, &
+                        real_type, DECOMP_2D_COMM_COL, ierror)
+     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALLV")
 #endif
 #else
-    call MPI_ALLTOALLV(work1_r, decomp%x1cnts, decomp%x1disp, &
-         real_type, work2_r, decomp%y1cnts, decomp%y1disp, &
-         real_type, DECOMP_2D_COMM_COL, ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALLV")
+     call MPI_ALLTOALLV(work1_r, decomp%x1cnts, decomp%x1disp, &
+                        real_type, work2_r, decomp%y1cnts, decomp%y1disp, &
+                        real_type, DECOMP_2D_COMM_COL, ierror)
+     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALLV")
 #endif
 
-#endif
 #endif
 
-    ! rearrange receive buffer
-#ifdef SHM
-    call MPI_BARRIER(decomp%COL_INFO%CORE_COMM, ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_BARRIER")
-    call mem_merge_xy_real(work2, d1, d2, d3, dst, dims(1), &
-         decomp%y1dist, decomp)
-#else
-
+     ! rearrange receive buffer
 #if defined(_GPU)
-    call mem_merge_xy_real(work2_r_d, d1, d2, d3, dst, dims(1), &
-         decomp%y1dist, decomp)
+     call mem_merge_xy_real(work2_r_d, d1, d2, d3, dst, dims(1), &
+                            decomp%y1dist, decomp)
 #else
-    call mem_merge_xy_real(work2_r, d1, d2, d3, dst, dims(1), &
-         decomp%y1dist, decomp)
-#endif
-
+     call mem_merge_xy_real(work2_r, d1, d2, d3, dst, dims(1), &
+                            decomp%y1dist, decomp)
 #endif
 
 #ifdef PROFILER
-    if (decomp_profiler_transpose) call decomp_profiler_end("transp_x_y_r")
+     if (decomp_profiler_transpose) call decomp_profiler_end("transp_x_y_r")
 #endif
 
-    return
+     return
   end subroutine transpose_x_to_y_real
 
-
   subroutine transpose_x_to_y_complex_short(src, dst)
 
-    implicit none
+     implicit none
 
-    complex(mytype), dimension(:,:,:), intent(IN) :: src
-    complex(mytype), dimension(:,:,:), intent(OUT) :: dst
+     complex(mytype), dimension(:, :, :), intent(IN) :: src
+     complex(mytype), dimension(:, :, :), intent(OUT) :: dst
 
-    call transpose_x_to_y(src, dst, decomp_main)
+     call transpose_x_to_y(src, dst, decomp_main)
 
   end subroutine transpose_x_to_y_complex_short
 
-  subroutine transpose_x_to_y_complex(src, dst, decomp)
+  subroutine transpose_x_to_y_complex_long(src, dst, decomp)
 
-    implicit none
-    
-    complex(mytype), dimension(:,:,:), intent(IN) :: src
-    complex(mytype), dimension(:,:,:), intent(OUT) :: dst
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
+     implicit none
 
-#ifdef SHM
-    complex(mytype) :: work1(*), work2(*)
-    POINTER  (work1_p, work1), (work2_p, work2)  ! Cray pointers
+     complex(mytype), dimension(:, :, :), intent(IN) :: src
+     complex(mytype), dimension(:, :, :), intent(OUT) :: dst
+     TYPE(DECOMP_INFO), intent(IN) :: decomp
+#if defined(_GPU)
+     integer :: istat, nsize
 #endif
-    
-    integer :: s1,s2,s3,d1,d2,d3
-    integer :: ierror
+     if (dims(1) == 1) then
+#if defined(_GPU)
+        nsize = product(decomp%xsz)
+        !$acc host_data use_device(src,dst)
+        istat = cudaMemcpy(dst, src, nsize, cudaMemcpyDeviceToDevice)
+        !$acc end host_data
+        if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cudaMemcpy")
+#else
+        dst = src
+#endif
+     else
+        call transpose_x_to_y_complex(src, dst, decomp)
+     end if
+
+  end subroutine transpose_x_to_y_complex_long
+
+  subroutine transpose_x_to_y_complex(src, dst, decomp)
+
+     implicit none
+
+     complex(mytype), dimension(:, :, :), intent(IN) :: src
+     complex(mytype), dimension(:, :, :), intent(OUT) :: dst
+     TYPE(DECOMP_INFO), intent(IN) :: decomp
+
+     integer :: s1, s2, s3, d1, d2, d3
+     integer :: ierror
 
 #ifdef PROFILER
-    if (decomp_profiler_transpose) call decomp_profiler_start("transp_x_y_c")
+     if (decomp_profiler_transpose) call decomp_profiler_start("transp_x_y_c")
 #endif
 
-    s1 = SIZE(src,1)
-    s2 = SIZE(src,2)
-    s3 = SIZE(src,3)
-    d1 = SIZE(dst,1)
-    d2 = SIZE(dst,2)
-    d3 = SIZE(dst,3)
-    
-    ! rearrange source array as send buffer
-#ifdef SHM
-    work1_p = decomp%COL_INFO%SND_P_c
-    call mem_split_xy_complex(src, s1, s2, s3, work1, dims(1), &
-         decomp%x1dist, decomp)
-#else
+     s1 = SIZE(src, 1)
+     s2 = SIZE(src, 2)
+     s3 = SIZE(src, 3)
+     d1 = SIZE(dst, 1)
+     d2 = SIZE(dst, 2)
+     d3 = SIZE(dst, 3)
 
+     ! rearrange source array as send buffer
 #if defined(_GPU)
-    call mem_split_xy_complex(src, s1, s2, s3, work1_c_d, dims(1), &
-         decomp%x1dist, decomp)
+     call mem_split_xy_complex(src, s1, s2, s3, work1_c_d, dims(1), &
+                               decomp%x1dist, decomp)
 #else
-    call mem_split_xy_complex(src, s1, s2, s3, work1_c, dims(1), &
-         decomp%x1dist, decomp)
+     call mem_split_xy_complex(src, s1, s2, s3, work1_c, dims(1), &
+                               decomp%x1dist, decomp)
 #endif
 
-#endif
-    
-    ! define receive buffer
-#ifdef SHM
-    work2_p = decomp%COL_INFO%RCV_P_c
-    call MPI_BARRIER(decomp%COL_INFO%CORE_COMM, ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_BARRIER")
-#endif
-    
-    ! transpose using MPI_ALLTOALL(V)
-#ifdef SHM
-    if (decomp%COL_INFO%CORE_ME==1) THEN
-       call MPI_ALLTOALLV(work1, decomp%x1cnts_s, decomp%x1disp_s, &
-            complex_type, work2, decomp%y1cnts_s, decomp%y1disp_s, &
-            complex_type, decomp%COL_INFO%SMP_COMM, ierror)
-       if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALLV")
-    end if
-#else
+     ! define receive buffer
+     ! transpose using MPI_ALLTOALL(V)
 #ifdef EVEN
-    call MPI_ALLTOALL(work1_c, decomp%x1count, &
-         complex_type, work2_c, decomp%y1count, &
-         complex_type, DECOMP_2D_COMM_COL, ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALL")
+     call MPI_ALLTOALL(work1_c, decomp%x1count, &
+                       complex_type, work2_c, decomp%y1count, &
+                       complex_type, DECOMP_2D_COMM_COL, ierror)
+     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALL")
 #else
 
 #if defined(_GPU)
-    call MPI_ALLTOALLV(work1_c_d, decomp%x1cnts, decomp%x1disp, &
-         complex_type, work2_c_d, decomp%y1cnts, decomp%y1disp, &
-         complex_type, DECOMP_2D_COMM_COL, ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALLV")
+#if defined(_NCCL)
+     call decomp_2d_nccl_send_recv_col(work2_c_d, &
+                                       work1_c_d, &
+                                       decomp%x1disp, &
+                                       decomp%x1cnts, &
+                                       decomp%y1disp, &
+                                       decomp%y1cnts, &
+                                       dims(1), &
+                                       decomp_buf_size)
 #else
-    call MPI_ALLTOALLV(work1_c, decomp%x1cnts, decomp%x1disp, &
-         complex_type, work2_c, decomp%y1cnts, decomp%y1disp, &
-         complex_type, DECOMP_2D_COMM_COL, ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALLV")
-#endif
-
+     call MPI_ALLTOALLV(work1_c_d, decomp%x1cnts, decomp%x1disp, &
+                        complex_type, work2_c_d, decomp%y1cnts, decomp%y1disp, &
+                        complex_type, DECOMP_2D_COMM_COL, ierror)
+     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALLV")
 #endif
+#else
+     call MPI_ALLTOALLV(work1_c, decomp%x1cnts, decomp%x1disp, &
+                        complex_type, work2_c, decomp%y1cnts, decomp%y1disp, &
+                        complex_type, DECOMP_2D_COMM_COL, ierror)
+     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALLV")
 #endif
 
-    ! rearrange receive buffer
-#ifdef SHM
-    call MPI_BARRIER(decomp%COL_INFO%CORE_COMM, ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_BARRIER")
-    call mem_merge_xy_complex(work2, d1, d2, d3, dst, dims(1), &
-         decomp%y1dist, decomp)
-#else
+#endif
 
+     ! rearrange receive buffer
 #if defined(_GPU)
-    call mem_merge_xy_complex(work2_c_d, d1, d2, d3, dst, dims(1), &
-         decomp%y1dist, decomp)
+     call mem_merge_xy_complex(work2_c_d, d1, d2, d3, dst, dims(1), &
+                               decomp%y1dist, decomp)
 #else
-    call mem_merge_xy_complex(work2_c, d1, d2, d3, dst, dims(1), &
-         decomp%y1dist, decomp)
-#endif
-
+     call mem_merge_xy_complex(work2_c, d1, d2, d3, dst, dims(1), &
+                               decomp%y1dist, decomp)
 #endif
 
 #ifdef PROFILER
-    if (decomp_profiler_transpose) call decomp_profiler_end("transp_x_y_c")
+     if (decomp_profiler_transpose) call decomp_profiler_end("transp_x_y_c")
 #endif
 
-    return
+     return
   end subroutine transpose_x_to_y_complex
 
+  subroutine mem_split_xy_real(in, n1, n2, n3, out, iproc, dist, decomp)
 
-  subroutine mem_split_xy_real(in,n1,n2,n3,out,iproc,dist,decomp)
-
-    implicit none
+     implicit none
 
-    integer, intent(IN) :: n1,n2,n3
-    real(mytype), dimension(n1,n2,n3), intent(IN) :: in
-    real(mytype), dimension(*), intent(OUT) :: out
-    integer, intent(IN) :: iproc
-    integer, dimension(0:iproc-1), intent(IN) :: dist
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
+     integer, intent(IN) :: n1, n2, n3
+     real(mytype), dimension(n1, n2, n3), intent(IN) :: in
+     real(mytype), dimension(*), intent(OUT) :: out
+     integer, intent(IN) :: iproc
+     integer, dimension(0:iproc - 1), intent(IN) :: dist
+     TYPE(DECOMP_INFO), intent(IN) :: decomp
 #if defined(_GPU)
-    attributes(device) :: out
-    integer :: istat
+     attributes(device) :: out
+     integer :: istat
 #endif
-    
-    integer :: i,j,k, m,i1,i2,pos
-
-    do m=0,iproc-1
-       if (m==0) then 
-          i1 = 1
-          i2 = dist(0)
-       else
-          i1 = i2+1
-          i2 = i1+dist(m)-1
-       end if
-
-#ifdef SHM
-       pos = decomp%x1disp_o(m) + 1
-#else
+
+     integer :: i, j, k, m, i1, i2, pos
+
+     do m = 0, iproc - 1
+        if (m == 0) then
+           i1 = 1
+           i2 = dist(0)
+        else
+           i1 = i2 + 1
+           i2 = i1 + dist(m) - 1
+        end if
+
 #ifdef EVEN
-       pos = m * decomp%x1count + 1
+        pos = m * decomp%x1count + 1
 #else
-       pos = decomp%x1disp(m) + 1
-#endif
+        pos = decomp%x1disp(m) + 1
 #endif
 
 #if defined(_GPU)
-       istat = cudaMemcpy2D( out(pos), i2-i1+1, in(i1,1,1), n1, i2-i1+1, n2*n3, cudaMemcpyDeviceToDevice )
-       if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cudaMemcpy2D")
+        !$acc host_data use_device(in)
+        istat = cudaMemcpy2D(out(pos), i2 - i1 + 1, in(i1, 1, 1), n1, i2 - i1 + 1, n2 * n3, cudaMemcpyDeviceToDevice)
+        !$acc end host_data
+        if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cudaMemcpy2D")
 #else
-       do k=1,n3
-          do j=1,n2
-             do i=i1,i2
-                out(pos) = in(i,j,k)
-                pos = pos + 1
-             end do
-          end do
-       end do
-#endif
-    end do
-
-    return
+        do k = 1, n3
+           do j = 1, n2
+              do i = i1, i2
+                 out(pos) = in(i, j, k)
+                 pos = pos + 1
+              end do
+           end do
+        end do
+#endif
+     end do
+
+     return
   end subroutine mem_split_xy_real
 
+  subroutine mem_split_xy_complex(in, n1, n2, n3, out, iproc, dist, decomp)
 
-  subroutine mem_split_xy_complex(in,n1,n2,n3,out,iproc,dist,decomp)
-
-    implicit none
+     implicit none
 
-    integer, intent(IN) :: n1,n2,n3
-    complex(mytype), dimension(n1,n2,n3), intent(IN) :: in
-    complex(mytype), dimension(*), intent(OUT) :: out
-    integer, intent(IN) :: iproc
-    integer, dimension(0:iproc-1), intent(IN) :: dist
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
+     integer, intent(IN) :: n1, n2, n3
+     complex(mytype), dimension(n1, n2, n3), intent(IN) :: in
+     complex(mytype), dimension(*), intent(OUT) :: out
+     integer, intent(IN) :: iproc
+     integer, dimension(0:iproc - 1), intent(IN) :: dist
+     TYPE(DECOMP_INFO), intent(IN) :: decomp
 #if defined(_GPU)
-    attributes(device) :: out
-    integer :: istat
+     attributes(device) :: out
+     integer :: istat
 #endif
 
-    integer :: i,j,k, m,i1,i2,pos
+     integer :: i, j, k, m, i1, i2, pos
 
-    do m=0,iproc-1
-       if (m==0) then 
-          i1 = 1
-          i2 = dist(0)
-       else
-          i1 = i2+1
-          i2 = i1+dist(m)-1
-       end if
+     do m = 0, iproc - 1
+        if (m == 0) then
+           i1 = 1
+           i2 = dist(0)
+        else
+           i1 = i2 + 1
+           i2 = i1 + dist(m) - 1
+        end if
 
-#ifdef SHM
-       pos = decomp%x1disp_o(m) + 1
-#else
 #ifdef EVEN
-       pos = m * decomp%x1count + 1
+        pos = m * decomp%x1count + 1
 #else
-       pos = decomp%x1disp(m) + 1
-#endif
+        pos = decomp%x1disp(m) + 1
 #endif
 
 #if defined(_GPU)
-       istat = cudaMemcpy2D( out(pos), i2-i1+1, in(i1,1,1), n1, i2-i1+1, n2*n3, cudaMemcpyDeviceToDevice )
-       if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cudaMemcpy2D")
+        !$acc host_data use_device(in)
+        istat = cudaMemcpy2D(out(pos), i2 - i1 + 1, in(i1, 1, 1), n1, i2 - i1 + 1, n2 * n3, cudaMemcpyDeviceToDevice)
+        !$acc end host_data
+        if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cudaMemcpy2D")
 #else
-       do k=1,n3
-          do j=1,n2
-             do i=i1,i2
-                out(pos) = in(i,j,k)
-                pos = pos + 1
-             end do
-          end do
-       end do
-#endif
-    end do
-
-    return
+        do k = 1, n3
+           do j = 1, n2
+              do i = i1, i2
+                 out(pos) = in(i, j, k)
+                 pos = pos + 1
+              end do
+           end do
+        end do
+#endif
+     end do
+
+     return
   end subroutine mem_split_xy_complex
 
+  subroutine mem_merge_xy_real(in, n1, n2, n3, out, iproc, dist, decomp)
 
-  subroutine mem_merge_xy_real(in,n1,n2,n3,out,iproc,dist,decomp)
+     implicit none
 
-    implicit none
-    
-    integer, intent(IN) :: n1,n2,n3
-    real(mytype), dimension(*), intent(IN) :: in
-    real(mytype), dimension(n1,n2,n3), intent(OUT) :: out
-    integer, intent(IN) :: iproc
-    integer, dimension(0:iproc-1), intent(IN) :: dist
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
+     integer, intent(IN) :: n1, n2, n3
+     real(mytype), dimension(*), intent(IN) :: in
+     real(mytype), dimension(n1, n2, n3), intent(OUT) :: out
+     integer, intent(IN) :: iproc
+     integer, dimension(0:iproc - 1), intent(IN) :: dist
+     TYPE(DECOMP_INFO), intent(IN) :: decomp
 #if defined(_GPU)
-    attributes(device) :: in
-    integer :: istat
+     attributes(device) :: in
+     integer :: istat
 #endif
-        
-    integer :: i,j,k, m,i1,i2, pos
-
-    do m=0,iproc-1
-       if (m==0) then
-          i1 = 1
-          i2 = dist(0)
-       else
-          i1 = i2+1
-          i2 = i1+dist(m)-1
-       end if
-
-#ifdef SHM
-       pos = decomp%y1disp_o(m) + 1
-#else
+
+     integer :: i, j, k, m, i1, i2, pos
+
+     do m = 0, iproc - 1
+        if (m == 0) then
+           i1 = 1
+           i2 = dist(0)
+        else
+           i1 = i2 + 1
+           i2 = i1 + dist(m) - 1
+        end if
+
 #ifdef EVEN
-       pos = m * decomp%y1count + 1
+        pos = m * decomp%y1count + 1
 #else
-       pos = decomp%y1disp(m) + 1
-#endif
+        pos = decomp%y1disp(m) + 1
 #endif
 
 #if defined(_GPU)
-       istat = cudaMemcpy2D( out(1,i1,1), n1*n2, in(pos), n1*(i2-i1+1), n1*(i2-i1+1), n3, cudaMemcpyDeviceToDevice )
-       if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cudaMemcpy2D")
+        !$acc host_data use_device(out)
+        istat = cudaMemcpy2D(out(1, i1, 1), n1 * n2, in(pos), n1 * (i2 - i1 + 1), n1 * (i2 - i1 + 1), n3, cudaMemcpyDeviceToDevice)
+        !$acc end host_data
+        if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cudaMemcpy2D")
 #else
-       do k=1,n3
-          do j=i1,i2
-             do i=1,n1
-                out(i,j,k) = in(pos)
-                pos = pos + 1
-             end do
-          end do
-       end do
-#endif
-    end do
-
-    return
+        do k = 1, n3
+           do j = i1, i2
+              do i = 1, n1
+                 out(i, j, k) = in(pos)
+                 pos = pos + 1
+              end do
+           end do
+        end do
+#endif
+     end do
+
+     return
   end subroutine mem_merge_xy_real
 
+  subroutine mem_merge_xy_complex(in, n1, n2, n3, out, iproc, dist, decomp)
 
-  subroutine mem_merge_xy_complex(in,n1,n2,n3,out,iproc,dist,decomp)
+     implicit none
 
-    implicit none
-    
-    integer, intent(IN) :: n1,n2,n3
-    complex(mytype), dimension(*), intent(IN) :: in
-    complex(mytype), dimension(n1,n2,n3), intent(OUT) :: out
-    integer, intent(IN) :: iproc
-    integer, dimension(0:iproc-1), intent(IN) :: dist
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
+     integer, intent(IN) :: n1, n2, n3
+     complex(mytype), dimension(*), intent(IN) :: in
+     complex(mytype), dimension(n1, n2, n3), intent(OUT) :: out
+     integer, intent(IN) :: iproc
+     integer, dimension(0:iproc - 1), intent(IN) :: dist
+     TYPE(DECOMP_INFO), intent(IN) :: decomp
 #if defined(_GPU)
-    attributes(device) :: in
-    integer :: istat
+     attributes(device) :: in
+     integer :: istat
 #endif
 
-    integer :: i,j,k, m,i1,i2, pos
+     integer :: i, j, k, m, i1, i2, pos
 
-    do m=0,iproc-1
-       if (m==0) then
-          i1 = 1
-          i2 = dist(0)
-       else
-          i1 = i2+1
-          i2 = i1+dist(m)-1
-       end if
+     do m = 0, iproc - 1
+        if (m == 0) then
+           i1 = 1
+           i2 = dist(0)
+        else
+           i1 = i2 + 1
+           i2 = i1 + dist(m) - 1
+        end if
 
-#ifdef SHM
-       pos = decomp%y1disp_o(m) + 1
-#else
 #ifdef EVEN
-       pos = m * decomp%y1count + 1
+        pos = m * decomp%y1count + 1
 #else
-       pos = decomp%y1disp(m) + 1
-#endif
+        pos = decomp%y1disp(m) + 1
 #endif
 
 #if defined(_GPU)
-       istat = cudaMemcpy2D( out(1,i1,1), n1*n2, in(pos), n1*(i2-i1+1), n1*(i2-i1+1), n3, cudaMemcpyDeviceToDevice )
-       if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cudaMemcpy2D")
+        !$acc host_data use_device(out)
+        istat = cudaMemcpy2D(out(1, i1, 1), n1 * n2, in(pos), n1 * (i2 - i1 + 1), n1 * (i2 - i1 + 1), n3, cudaMemcpyDeviceToDevice)
+        !$acc end host_data
+        if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cudaMemcpy2D")
 #else
-       do k=1,n3
-          do j=i1,i2
-             do i=1,n1
-                out(i,j,k) = in(pos)
-                pos = pos + 1
-             end do
-          end do
-       end do
-#endif
-    end do
-
-    return
+        do k = 1, n3
+           do j = i1, i2
+              do i = 1, n1
+                 out(i, j, k) = in(pos)
+                 pos = pos + 1
+              end do
+           end do
+        end do
+#endif
+     end do
+
+     return
   end subroutine mem_merge_xy_complex
diff --git a/src/transpose_y_to_x.f90 b/src/transpose_y_to_x.f90
index 1e7c62c2..e5fa2a16 100644
--- a/src/transpose_y_to_x.f90
+++ b/src/transpose_y_to_x.f90
@@ -1,491 +1,447 @@
-!=======================================================================
-! This is part of the 2DECOMP&FFT library
-! 
-! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
-! decomposition. It also implements a highly scalable distributed
-! three-dimensional Fast Fourier Transform (FFT).
-!
-! Copyright (C) 2009-2021 Ning Li, the Numerical Algorithms Group (NAG)
-!
-!=======================================================================
+!! SPDX-License-Identifier: BSD-3-Clause
 
 ! This file contains the routines that transpose data from Y to X pencil
 
   subroutine transpose_y_to_x_real_short(src, dst)
 
-    implicit none
+     implicit none
 
-    real(mytype), dimension(:,:,:), intent(IN) :: src
-    real(mytype), dimension(:,:,:), intent(OUT) :: dst
+     real(mytype), dimension(:, :, :), intent(IN) :: src
+     real(mytype), dimension(:, :, :), intent(OUT) :: dst
 
-    call transpose_y_to_x(src, dst, decomp_main)
+     call transpose_y_to_x(src, dst, decomp_main)
 
   end subroutine transpose_y_to_x_real_short
 
-  subroutine transpose_y_to_x_real(src, dst, decomp)
+  subroutine transpose_y_to_x_real_long(src, dst, decomp)
 
-    implicit none
-    
-    real(mytype), dimension(:,:,:), intent(IN) :: src
-    real(mytype), dimension(:,:,:), intent(OUT) :: dst
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
+     implicit none
 
-#if defined(_GPU) && defined(_NCCL)
-    type(ncclResult) :: nccl_stat
-    integer :: col_rank_id, cuda_stat
+     real(mytype), dimension(:, :, :), intent(IN) :: src
+     real(mytype), dimension(:, :, :), intent(OUT) :: dst
+     TYPE(DECOMP_INFO), intent(IN) :: decomp
+#if defined(_GPU)
+     integer :: istat, nsize
 #endif
 
-#ifdef SHM
-    real(mytype) :: work1(*), work2(*)
-    POINTER  (work1_p, work1), (work2_p, work2)  ! Cray pointers
+     if (dims(1) == 1) then
+#if defined(_GPU)
+        nsize = product(decomp%ysz)
+        !$acc host_data use_device(src,dst)
+        istat = cudaMemcpy(dst, src, nsize, cudaMemcpyDeviceToDevice)
+        !$acc end host_data
+        if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cudaMemcpy")
+#else
+        dst = src
 #endif
-    
-    integer :: s1,s2,s3,d1,d2,d3
-    integer :: ierror
+     else
+        call transpose_y_to_x_real(src, dst, decomp)
+     end if
+
+  end subroutine transpose_y_to_x_real_long
+
+  subroutine transpose_y_to_x_real(src, dst, decomp)
+
+     implicit none
+
+     real(mytype), dimension(:, :, :), intent(IN) :: src
+     real(mytype), dimension(:, :, :), intent(OUT) :: dst
+     TYPE(DECOMP_INFO), intent(IN) :: decomp
+
+     integer :: s1, s2, s3, d1, d2, d3
+     integer :: ierror
 
 #ifdef PROFILER
-    if (decomp_profiler_transpose) call decomp_profiler_start("transp_y_x_r")
+     if (decomp_profiler_transpose) call decomp_profiler_start("transp_y_x_r")
 #endif
 
-    s1 = SIZE(src,1)
-    s2 = SIZE(src,2)
-    s3 = SIZE(src,3)
-    d1 = SIZE(dst,1)
-    d2 = SIZE(dst,2)
-    d3 = SIZE(dst,3)
-
-    ! rearrange source array as send buffer
-#ifdef SHM
-    work1_p = decomp%COL_INFO%SND_P
-    call mem_split_yx_real(src, s1, s2, s3, work1, dims(1), &
-         decomp%y1dist, decomp)
-#else
+     s1 = SIZE(src, 1)
+     s2 = SIZE(src, 2)
+     s3 = SIZE(src, 3)
+     d1 = SIZE(dst, 1)
+     d2 = SIZE(dst, 2)
+     d3 = SIZE(dst, 3)
 
+     ! rearrange source array as send buffer
 #if defined(_GPU)
-    call mem_split_yx_real(src, s1, s2, s3, work1_r_d, dims(1), &
-         decomp%y1dist, decomp)
+     call mem_split_yx_real(src, s1, s2, s3, work1_r_d, dims(1), &
+                            decomp%y1dist, decomp)
 #else
-    call mem_split_yx_real(src, s1, s2, s3, work1_r, dims(1), &
-         decomp%y1dist, decomp)
-#endif
-
+     call mem_split_yx_real(src, s1, s2, s3, work1_r, dims(1), &
+                            decomp%y1dist, decomp)
 #endif
 
-    ! define receive buffer
-#ifdef SHM
-    work2_p = decomp%COL_INFO%RCV_P
-    call MPI_BARRIER(decomp%COL_INFO%CORE_COMM, ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_BARRIER")
-#endif
-    
-    ! transpose using MPI_ALLTOALL(V)
-#ifdef SHM
-    if (decomp%COL_INFO%CORE_ME==1) THEN
-       call MPI_ALLTOALLV(work1, decomp%y1cnts_s, decomp%y1disp_s, &
-            real_type, work2, decomp%x1cnts_s, decomp%x1disp_s, &
-            real_type, decomp%COL_INFO%SMP_COMM, ierror)
-       if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALLV")
-    end if
-#else
+     ! define receive buffer
+     ! transpose using MPI_ALLTOALL(V)
 #ifdef EVEN
-    call MPI_ALLTOALL(work1_r, decomp%y1count, &
-         real_type, work2_r, decomp%x1count, &
-         real_type, DECOMP_2D_COMM_COL, ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALL")
+     call MPI_ALLTOALL(work1_r, decomp%y1count, &
+                       real_type, work2_r, decomp%x1count, &
+                       real_type, DECOMP_2D_COMM_COL, ierror)
+     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALL")
 #else
 
 #if defined(_GPU)
 #if defined(_NCCL)
-    nccl_stat = ncclGroupStart()
-    if (nccl_stat /= ncclSuccess) call decomp_2d_abort(__FILE__, __LINE__, nccl_stat, "ncclGroupStart")
-    do col_rank_id = 0, (col_comm_size - 1)
-     nccl_stat = ncclSend(work1_r_d( decomp%y1disp(col_rank_id)+1 ), decomp%y1cnts(col_rank_id), &
-       ncclDouble, local_to_global_col(col_rank_id+1), nccl_comm_2decomp, cuda_stream_2decomp)
-     if (nccl_stat /= ncclSuccess) call decomp_2d_abort(__FILE__, __LINE__, nccl_stat, "ncclSend")
-     nccl_stat = ncclRecv(work2_r_d( decomp%x1disp(col_rank_id)+1 ), decomp%x1cnts(col_rank_id), &
-       ncclDouble, local_to_global_col(col_rank_id+1), nccl_comm_2decomp, cuda_stream_2decomp)
-     if (nccl_stat /= ncclSuccess) call decomp_2d_abort(__FILE__, __LINE__, nccl_stat, "ncclRecv")
-    end do
-    nccl_stat = ncclGroupEnd()
-    if (nccl_stat /= ncclSuccess) call decomp_2d_abort(__FILE__, __LINE__, nccl_stat, "ncclGroupEnd")
-    cuda_stat = cudaStreamSynchronize(cuda_stream_2decomp)
-    if (cuda_stat /= 0) call decomp_2d_abort(__FILE__, __LINE__, cuda_stat, "cudaStreamSynchronize")
+     call decomp_2d_nccl_send_recv_col(work2_r_d, &
+                                       work1_r_d, &
+                                       decomp%y1disp, &
+                                       decomp%y1cnts, &
+                                       decomp%x1disp, &
+                                       decomp%x1cnts, &
+                                       dims(1))
 #else
-    call MPI_ALLTOALLV(work1_r_d, decomp%y1cnts, decomp%y1disp, &
-         real_type, work2_r_d, decomp%x1cnts, decomp%x1disp, &
-         real_type, DECOMP_2D_COMM_COL, ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALLV")
+     call MPI_ALLTOALLV(work1_r_d, decomp%y1cnts, decomp%y1disp, &
+                        real_type, work2_r_d, decomp%x1cnts, decomp%x1disp, &
+                        real_type, DECOMP_2D_COMM_COL, ierror)
+     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALLV")
 #endif
 #else
-    call MPI_ALLTOALLV(work1_r, decomp%y1cnts, decomp%y1disp, &
-         real_type, work2_r, decomp%x1cnts, decomp%x1disp, &
-         real_type, DECOMP_2D_COMM_COL, ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALLV")
+     call MPI_ALLTOALLV(work1_r, decomp%y1cnts, decomp%y1disp, &
+                        real_type, work2_r, decomp%x1cnts, decomp%x1disp, &
+                        real_type, DECOMP_2D_COMM_COL, ierror)
+     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALLV")
 #endif
 
-#endif
 #endif
 
-    ! rearrange receive buffer
-#ifdef SHM
-    call MPI_BARRIER(decomp%COL_INFO%CORE_COMM, ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_BARRIER")
-    call mem_merge_yx_real(work2, d1, d2, d3, dst, dims(1), &
-         decomp%x1dist, decomp)
-#else
-
+     ! rearrange receive buffer
 #if defined(_GPU)
-    call mem_merge_yx_real(work2_r_d, d1, d2, d3, dst, dims(1), &
-         decomp%x1dist, decomp)
+     call mem_merge_yx_real(work2_r_d, d1, d2, d3, dst, dims(1), &
+                            decomp%x1dist, decomp)
 #else
-    call mem_merge_yx_real(work2_r, d1, d2, d3, dst, dims(1), &
-         decomp%x1dist, decomp)
-#endif
-
+     call mem_merge_yx_real(work2_r, d1, d2, d3, dst, dims(1), &
+                            decomp%x1dist, decomp)
 #endif
 
 #ifdef PROFILER
-    if (decomp_profiler_transpose) call decomp_profiler_end("transp_y_x_r")
+     if (decomp_profiler_transpose) call decomp_profiler_end("transp_y_x_r")
 #endif
 
-    return
+     return
   end subroutine transpose_y_to_x_real
 
-
   subroutine transpose_y_to_x_complex_short(src, dst)
 
-    implicit none
+     implicit none
 
-    complex(mytype), dimension(:,:,:), intent(IN) :: src
-    complex(mytype), dimension(:,:,:), intent(OUT) :: dst
+     complex(mytype), dimension(:, :, :), intent(IN) :: src
+     complex(mytype), dimension(:, :, :), intent(OUT) :: dst
 
-    call transpose_y_to_x(src, dst, decomp_main)
+     call transpose_y_to_x(src, dst, decomp_main)
 
   end subroutine transpose_y_to_x_complex_short
 
-  subroutine transpose_y_to_x_complex(src, dst, decomp)
+  subroutine transpose_y_to_x_complex_long(src, dst, decomp)
 
-    implicit none
-    
-    complex(mytype), dimension(:,:,:), intent(IN) :: src
-    complex(mytype), dimension(:,:,:), intent(OUT) :: dst
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
+     implicit none
 
-#ifdef SHM
-    complex(mytype) :: work1(*), work2(*)
-    POINTER  (work1_p, work1), (work2_p, work2)  ! Cray pointers
+     complex(mytype), dimension(:, :, :), intent(IN) :: src
+     complex(mytype), dimension(:, :, :), intent(OUT) :: dst
+     TYPE(DECOMP_INFO), intent(IN) :: decomp
+#if defined(_GPU)
+     integer :: istat, nsize
 #endif
-    
-    integer :: s1,s2,s3,d1,d2,d3
-    integer :: ierror
+     if (dims(1) == 1) then
+#if defined(_GPU)
+        nsize = product(decomp%ysz)
+        !$acc host_data use_device(src,dst)
+        istat = cudaMemcpy(dst, src, nsize, cudaMemcpyDeviceToDevice)
+        !$acc end host_data
+        if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cudaMemcpy")
+#else
+        dst = src
+#endif
+     else
+        call transpose_y_to_x_complex(src, dst, decomp)
+     end if
+
+  end subroutine transpose_y_to_x_complex_long
+
+  subroutine transpose_y_to_x_complex(src, dst, decomp)
+
+     implicit none
+
+     complex(mytype), dimension(:, :, :), intent(IN) :: src
+     complex(mytype), dimension(:, :, :), intent(OUT) :: dst
+     TYPE(DECOMP_INFO), intent(IN) :: decomp
+
+     integer :: s1, s2, s3, d1, d2, d3
+     integer :: ierror
 
 #ifdef PROFILER
-    if (decomp_profiler_transpose) call decomp_profiler_start("transp_y_x_c")
+     if (decomp_profiler_transpose) call decomp_profiler_start("transp_y_x_c")
 #endif
 
-    s1 = SIZE(src,1)
-    s2 = SIZE(src,2)
-    s3 = SIZE(src,3)
-    d1 = SIZE(dst,1)
-    d2 = SIZE(dst,2)
-    d3 = SIZE(dst,3)
-    
-    ! rearrange source array as send buffer
-#ifdef SHM
-    work1_p = decomp%COL_INFO%SND_P_c
-    call mem_split_yx_complex(src, s1, s2, s3, work1, dims(1), &
-         decomp%y1dist, decomp)
-#else
+     s1 = SIZE(src, 1)
+     s2 = SIZE(src, 2)
+     s3 = SIZE(src, 3)
+     d1 = SIZE(dst, 1)
+     d2 = SIZE(dst, 2)
+     d3 = SIZE(dst, 3)
 
+     ! rearrange source array as send buffer
 #if defined(_GPU)
-    call mem_split_yx_complex(src, s1, s2, s3, work1_c_d, dims(1), &
-         decomp%y1dist, decomp)
+     call mem_split_yx_complex(src, s1, s2, s3, work1_c_d, dims(1), &
+                               decomp%y1dist, decomp)
 #else
-    call mem_split_yx_complex(src, s1, s2, s3, work1_c, dims(1), &
-         decomp%y1dist, decomp)
+     call mem_split_yx_complex(src, s1, s2, s3, work1_c, dims(1), &
+                               decomp%y1dist, decomp)
 #endif
 
-#endif
-    
-    ! define receive buffer
-#ifdef SHM
-    work2_p = decomp%COL_INFO%RCV_P_c
-    call MPI_BARRIER(decomp%COL_INFO%CORE_COMM, ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_BARRIER")
-#endif
-    
-    ! transpose using MPI_ALLTOALL(V)
-#ifdef SHM
-    if (decomp%COL_INFO%CORE_ME==1) THEN
-       call MPI_ALLTOALLV(work1, decomp%y1cnts_s, decomp%y1disp_s, &
-            complex_type, work2, decomp%x1cnts_s, decomp%x1disp_s, &
-            complex_type, decomp%COL_INFO%SMP_COMM, ierror)
-       if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALLV")
-    end if
-#else
+     ! define receive buffer
+     ! transpose using MPI_ALLTOALL(V)
 #ifdef EVEN
-    call MPI_ALLTOALL(work1_c, decomp%y1count, &
-         complex_type, work2_c, decomp%x1count, &
-         complex_type, DECOMP_2D_COMM_COL, ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALL")
+     call MPI_ALLTOALL(work1_c, decomp%y1count, &
+                       complex_type, work2_c, decomp%x1count, &
+                       complex_type, DECOMP_2D_COMM_COL, ierror)
+     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALL")
 #else
 
 #if defined(_GPU)
-    call MPI_ALLTOALLV(work1_c_d, decomp%y1cnts, decomp%y1disp, &
-         complex_type, work2_c_d, decomp%x1cnts, decomp%x1disp, &
-         complex_type, DECOMP_2D_COMM_COL, ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALLV")
+#if defined(_NCCL)
+     call decomp_2d_nccl_send_recv_col(work2_c_d, &
+                                       work1_c_d, &
+                                       decomp%y1disp, &
+                                       decomp%y1cnts, &
+                                       decomp%x1disp, &
+                                       decomp%x1cnts, &
+                                       dims(1), &
+                                       decomp_buf_size)
 #else
-    call MPI_ALLTOALLV(work1_c, decomp%y1cnts, decomp%y1disp, &
-         complex_type, work2_c, decomp%x1cnts, decomp%x1disp, &
-         complex_type, DECOMP_2D_COMM_COL, ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALLV")
-#endif
-
+     call MPI_ALLTOALLV(work1_c_d, decomp%y1cnts, decomp%y1disp, &
+                        complex_type, work2_c_d, decomp%x1cnts, decomp%x1disp, &
+                        complex_type, DECOMP_2D_COMM_COL, ierror)
+     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALLV")
 #endif
+#else
+     call MPI_ALLTOALLV(work1_c, decomp%y1cnts, decomp%y1disp, &
+                        complex_type, work2_c, decomp%x1cnts, decomp%x1disp, &
+                        complex_type, DECOMP_2D_COMM_COL, ierror)
+     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALLV")
 #endif
 
-    ! rearrange receive buffer
-#ifdef SHM
-    call MPI_BARRIER(decomp%COL_INFO%CORE_COMM, ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_BARRIER")
-    call mem_merge_yx_complex(work2, d1, d2, d3, dst, dims(1), &
-         decomp%x1dist, decomp)
-#else
+#endif
 
+     ! rearrange receive buffer
 #if defined(_GPU)
-    call mem_merge_yx_complex(work2_c_d, d1, d2, d3, dst, dims(1), &
-         decomp%x1dist, decomp)
+     call mem_merge_yx_complex(work2_c_d, d1, d2, d3, dst, dims(1), &
+                               decomp%x1dist, decomp)
 #else
-    call mem_merge_yx_complex(work2_c, d1, d2, d3, dst, dims(1), &
-         decomp%x1dist, decomp)
-#endif
-
+     call mem_merge_yx_complex(work2_c, d1, d2, d3, dst, dims(1), &
+                               decomp%x1dist, decomp)
 #endif
 
 #ifdef PROFILER
-    if (decomp_profiler_transpose) call decomp_profiler_end("transp_y_x_c")
+     if (decomp_profiler_transpose) call decomp_profiler_end("transp_y_x_c")
 #endif
 
-    return
+     return
   end subroutine transpose_y_to_x_complex
 
+  subroutine mem_split_yx_real(in, n1, n2, n3, out, iproc, dist, decomp)
 
-  subroutine mem_split_yx_real(in,n1,n2,n3,out,iproc,dist,decomp)
-
-    implicit none
+     implicit none
 
-    integer, intent(IN) :: n1,n2,n3
-    real(mytype), dimension(n1,n2,n3), intent(IN) :: in
-    real(mytype), dimension(*), intent(OUT) :: out
-    integer, intent(IN) :: iproc
-    integer, dimension(0:iproc-1), intent(IN) :: dist
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
+     integer, intent(IN) :: n1, n2, n3
+     real(mytype), dimension(n1, n2, n3), intent(IN) :: in
+     real(mytype), dimension(*), intent(OUT) :: out
+     integer, intent(IN) :: iproc
+     integer, dimension(0:iproc - 1), intent(IN) :: dist
+     TYPE(DECOMP_INFO), intent(IN) :: decomp
 #if defined(_GPU)
-    attributes(device) :: out
-    integer :: istat
+     attributes(device) :: out
+     integer :: istat
 #endif
 
-    integer :: i,j,k, m,i1,i2,pos
+     integer :: i, j, k, m, i1, i2, pos
 
-    do m=0,iproc-1
-       if (m==0) then 
-          i1 = 1
-          i2 = dist(0)
-       else
-          i1 = i2+1
-          i2 = i1+dist(m)-1
-       end if
+     do m = 0, iproc - 1
+        if (m == 0) then
+           i1 = 1
+           i2 = dist(0)
+        else
+           i1 = i2 + 1
+           i2 = i1 + dist(m) - 1
+        end if
 
-#ifdef SHM
-       pos = decomp%y1disp_o(m) + 1
-#else
 #ifdef EVEN
-       pos = m * decomp%y1count + 1
+        pos = m * decomp%y1count + 1
 #else
-       pos = decomp%y1disp(m) + 1
-#endif
+        pos = decomp%y1disp(m) + 1
 #endif
 
 #if defined(_GPU)
-       istat = cudaMemcpy2D( out(pos), n1*(i2-i1+1), in(1,i1,1), n1*n2, n1*(i2-i1+1), n3, cudaMemcpyDeviceToDevice )
-       if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cudaMemcpy2D")
+        !$acc host_data use_device(in)
+        istat = cudaMemcpy2D(out(pos), n1 * (i2 - i1 + 1), in(1, i1, 1), n1 * n2, n1 * (i2 - i1 + 1), n3, cudaMemcpyDeviceToDevice)
+        !$acc end host_data
+        if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cudaMemcpy2D")
 #else
-       do k=1,n3
-          do j=i1,i2
-             do i=1,n1
-                out(pos) = in(i,j,k)
-                pos = pos + 1
-             end do
-          end do
-       end do
-#endif
-    end do
-
-    return
+        do k = 1, n3
+           do j = i1, i2
+              do i = 1, n1
+                 out(pos) = in(i, j, k)
+                 pos = pos + 1
+              end do
+           end do
+        end do
+#endif
+     end do
+
+     return
   end subroutine mem_split_yx_real
 
+  subroutine mem_split_yx_complex(in, n1, n2, n3, out, iproc, dist, decomp)
 
-  subroutine mem_split_yx_complex(in,n1,n2,n3,out,iproc,dist,decomp)
+     implicit none
 
-    implicit none
-
-    integer, intent(IN) :: n1,n2,n3
-    complex(mytype), dimension(n1,n2,n3), intent(IN) :: in
-    complex(mytype), dimension(*), intent(OUT) :: out
-    integer, intent(IN) :: iproc
-    integer, dimension(0:iproc-1), intent(IN) :: dist
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
+     integer, intent(IN) :: n1, n2, n3
+     complex(mytype), dimension(n1, n2, n3), intent(IN) :: in
+     complex(mytype), dimension(*), intent(OUT) :: out
+     integer, intent(IN) :: iproc
+     integer, dimension(0:iproc - 1), intent(IN) :: dist
+     TYPE(DECOMP_INFO), intent(IN) :: decomp
 #if defined(_GPU)
-    attributes(device) :: out
-    integer :: istat
+     attributes(device) :: out
+     integer :: istat
 #endif
 
-    integer :: i,j,k, m,i1,i2,pos
+     integer :: i, j, k, m, i1, i2, pos
 
-    do m=0,iproc-1
-       if (m==0) then 
-          i1 = 1
-          i2 = dist(0)
-       else
-          i1 = i2+1
-          i2 = i1+dist(m)-1
-       end if
+     do m = 0, iproc - 1
+        if (m == 0) then
+           i1 = 1
+           i2 = dist(0)
+        else
+           i1 = i2 + 1
+           i2 = i1 + dist(m) - 1
+        end if
 
-#ifdef SHM
-       pos = decomp%y1disp_o(m) + 1
-#else
 #ifdef EVEN
-       pos = m * decomp%y1count + 1
+        pos = m * decomp%y1count + 1
 #else
-       pos = decomp%y1disp(m) + 1
-#endif
+        pos = decomp%y1disp(m) + 1
 #endif
 
 #if defined(_GPU)
-       istat = cudaMemcpy2D( out(pos), n1*(i2-i1+1), in(1,i1,1), n1*n2, n1*(i2-i1+1), n3, cudaMemcpyDeviceToDevice )
-       if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cudaMemcpy2D")
+        !$acc host_data use_device(in)
+        istat = cudaMemcpy2D(out(pos), n1 * (i2 - i1 + 1), in(1, i1, 1), n1 * n2, n1 * (i2 - i1 + 1), n3, cudaMemcpyDeviceToDevice)
+        !$acc end host_data
+        if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cudaMemcpy2D")
 #else
-       do k=1,n3
-          do j=i1,i2
-             do i=1,n1
-                out(pos) = in(i,j,k)
-                pos = pos + 1
-             end do
-          end do
-       end do
-#endif
-    end do
-
-    return
+        do k = 1, n3
+           do j = i1, i2
+              do i = 1, n1
+                 out(pos) = in(i, j, k)
+                 pos = pos + 1
+              end do
+           end do
+        end do
+#endif
+     end do
+
+     return
   end subroutine mem_split_yx_complex
 
+  subroutine mem_merge_yx_real(in, n1, n2, n3, out, iproc, dist, decomp)
 
-  subroutine mem_merge_yx_real(in,n1,n2,n3,out,iproc,dist,decomp)
+     implicit none
 
-    implicit none
-    
-    integer, intent(IN) :: n1,n2,n3
-    real(mytype), dimension(*), intent(IN) :: in
-    real(mytype), dimension(n1,n2,n3), intent(OUT) :: out
-    integer, intent(IN) :: iproc
-    integer, dimension(0:iproc-1), intent(IN) :: dist
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
+     integer, intent(IN) :: n1, n2, n3
+     real(mytype), dimension(*), intent(IN) :: in
+     real(mytype), dimension(n1, n2, n3), intent(OUT) :: out
+     integer, intent(IN) :: iproc
+     integer, dimension(0:iproc - 1), intent(IN) :: dist
+     TYPE(DECOMP_INFO), intent(IN) :: decomp
 #if defined(_GPU)
-    attributes(device) :: in
-    integer :: istat
+     attributes(device) :: in
+     integer :: istat
 #endif
 
-    integer :: i,j,k, m,i1,i2, pos
+     integer :: i, j, k, m, i1, i2, pos
 
-    do m=0,iproc-1
-       if (m==0) then
-          i1 = 1
-          i2 = dist(0)
-       else
-          i1 = i2+1
-          i2 = i1+dist(m)-1
-       end if
+     do m = 0, iproc - 1
+        if (m == 0) then
+           i1 = 1
+           i2 = dist(0)
+        else
+           i1 = i2 + 1
+           i2 = i1 + dist(m) - 1
+        end if
 
-#ifdef SHM
-       pos = decomp%x1disp_o(m) + 1
-#else
 #ifdef EVEN
-       pos = m * decomp%x1count + 1
+        pos = m * decomp%x1count + 1
 #else
-       pos = decomp%x1disp(m) + 1
-#endif
+        pos = decomp%x1disp(m) + 1
 #endif
 
 #if defined(_GPU)
-       istat = cudaMemcpy2D( out(i1,1,1), n1, in(pos), i2-i1+1, i2-i1+1, n2*n3, cudaMemcpyDeviceToDevice )
-       if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cudaMemcpy2D")
+        !$acc host_data use_device(out)
+        istat = cudaMemcpy2D(out(i1, 1, 1), n1, in(pos), i2 - i1 + 1, i2 - i1 + 1, n2 * n3, cudaMemcpyDeviceToDevice)
+        !$acc end host_data
+        if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cudaMemcpy2D")
 #else
-       do k=1,n3
-          do j=1,n2
-             do i=i1,i2
-                out(i,j,k) = in(pos)
-                pos = pos + 1
-             end do
-          end do
-       end do
-#endif
-    end do
-
-    return
+        do k = 1, n3
+           do j = 1, n2
+              do i = i1, i2
+                 out(i, j, k) = in(pos)
+                 pos = pos + 1
+              end do
+           end do
+        end do
+#endif
+     end do
+
+     return
   end subroutine mem_merge_yx_real
 
+  subroutine mem_merge_yx_complex(in, n1, n2, n3, out, iproc, dist, decomp)
 
-  subroutine mem_merge_yx_complex(in,n1,n2,n3,out,iproc,dist,decomp)
+     implicit none
 
-    implicit none
-    
-    integer, intent(IN) :: n1,n2,n3
-    complex(mytype), dimension(*), intent(IN) :: in
-    complex(mytype), dimension(n1,n2,n3), intent(OUT) :: out
-    integer, intent(IN) :: iproc
-    integer, dimension(0:iproc-1), intent(IN) :: dist
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
+     integer, intent(IN) :: n1, n2, n3
+     complex(mytype), dimension(*), intent(IN) :: in
+     complex(mytype), dimension(n1, n2, n3), intent(OUT) :: out
+     integer, intent(IN) :: iproc
+     integer, dimension(0:iproc - 1), intent(IN) :: dist
+     TYPE(DECOMP_INFO), intent(IN) :: decomp
 #if defined(_GPU)
-    attributes(device) :: in
-    integer :: istat
+     attributes(device) :: in
+     integer :: istat
 #endif
 
-    integer :: i,j,k, m,i1,i2, pos
+     integer :: i, j, k, m, i1, i2, pos
 
-    do m=0,iproc-1
-       if (m==0) then
-          i1 = 1
-          i2 = dist(0)
-       else
-          i1 = i2+1
-          i2 = i1+dist(m)-1
-       end if
+     do m = 0, iproc - 1
+        if (m == 0) then
+           i1 = 1
+           i2 = dist(0)
+        else
+           i1 = i2 + 1
+           i2 = i1 + dist(m) - 1
+        end if
 
-#ifdef SHM
-       pos = decomp%x1disp_o(m) + 1
-#else
 #ifdef EVEN
-       pos = m * decomp%x1count + 1
+        pos = m * decomp%x1count + 1
 #else
-       pos = decomp%x1disp(m) + 1
-#endif
+        pos = decomp%x1disp(m) + 1
 #endif
 
 #if defined(_GPU)
-       istat = cudaMemcpy2D( out(i1,1,1), n1, in(pos), i2-i1+1, i2-i1+1, n2*n3, cudaMemcpyDeviceToDevice )
-       if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cudaMemcpy2D")
+        !$acc host_data use_device(out)
+        istat = cudaMemcpy2D(out(i1, 1, 1), n1, in(pos), i2 - i1 + 1, i2 - i1 + 1, n2 * n3, cudaMemcpyDeviceToDevice)
+        !$acc end host_data
+        if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cudaMemcpy2D")
 #else
-       do k=1,n3
-          do j=1,n2
-             do i=i1,i2
-                out(i,j,k) = in(pos)
-                pos = pos + 1
-             end do
-          end do
-       end do
-#endif
-    end do
-
-    return
+        do k = 1, n3
+           do j = 1, n2
+              do i = i1, i2
+                 out(i, j, k) = in(pos)
+                 pos = pos + 1
+              end do
+           end do
+        end do
+#endif
+     end do
+
+     return
   end subroutine mem_merge_yx_complex
diff --git a/src/transpose_y_to_z.f90 b/src/transpose_y_to_z.f90
index 70f427cf..4d212f4f 100644
--- a/src/transpose_y_to_z.f90
+++ b/src/transpose_y_to_z.f90
@@ -1,500 +1,466 @@
-!=======================================================================
-! This is part of the 2DECOMP&FFT library
-! 
-! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
-! decomposition. It also implements a highly scalable distributed
-! three-dimensional Fast Fourier Transform (FFT).
-!
-! Copyright (C) 2009-2021 Ning Li, the Numerical Algorithms Group (NAG)
-!
-!=======================================================================
+!! SPDX-License-Identifier: BSD-3-Clause
 
 ! This file contains the routines that transpose data from Y to Z pencil
 
   subroutine transpose_y_to_z_real_short(src, dst)
 
-    implicit none
+     implicit none
 
-    real(mytype), dimension(:,:,:), intent(IN) :: src
-    real(mytype), dimension(:,:,:), intent(OUT) :: dst
+     real(mytype), dimension(:, :, :), intent(IN) :: src
+     real(mytype), dimension(:, :, :), intent(OUT) :: dst
 
-    call transpose_y_to_z(src, dst, decomp_main)
+     call transpose_y_to_z(src, dst, decomp_main)
 
   end subroutine transpose_y_to_z_real_short
 
-  subroutine transpose_y_to_z_real(src, dst, decomp)
+  subroutine transpose_y_to_z_real_long(src, dst, decomp)
 
-    implicit none
-    
-    real(mytype), dimension(:,:,:), intent(IN) :: src
-    real(mytype), dimension(:,:,:), intent(OUT) :: dst
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
+     implicit none
 
+     real(mytype), dimension(:, :, :), intent(IN) :: src
+     real(mytype), dimension(:, :, :), intent(OUT) :: dst
+     TYPE(DECOMP_INFO), intent(IN) :: decomp
 #if defined(_GPU)
-#if defined(_NCCL)
-    type(ncclResult) :: nccl_stat
-    integer :: row_rank_id, cuda_stat
+     integer :: istat, nsize
 #endif
-    integer :: istat
+
+     if (dims(2) == 1) then
+#if defined(_GPU)
+        nsize = product(decomp%ysz)
+        !$acc host_data use_device(src,dst)
+        istat = cudaMemcpy(dst, src, nsize, cudaMemcpyDeviceToDevice)
+        !$acc end host_data
+        if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cudaMemcpy")
+#else
+        dst = src
 #endif
+     else
+        call transpose_y_to_z_real(src, dst, decomp)
+     end if
+
+  end subroutine transpose_y_to_z_real_long
+
+  subroutine transpose_y_to_z_real(src, dst, decomp)
+
+     implicit none
 
-#ifdef SHM
-    real(mytype) :: work1(*), work2(*)
-    POINTER  (work1_p, work1), (work2_p, work2)  ! Cray pointers
+     real(mytype), dimension(:, :, :), intent(IN) :: src
+     real(mytype), dimension(:, :, :), intent(OUT) :: dst
+     TYPE(DECOMP_INFO), intent(IN) :: decomp
+
+#if defined(_GPU)
+     integer :: istat
 #endif
-    
-    integer :: s1,s2,s3,d1,d2,d3
-    integer :: ierror
+
+     integer :: s1, s2, s3, d1, d2, d3
+     integer :: ierror
 
 #ifdef PROFILER
-    if (decomp_profiler_transpose) call decomp_profiler_start("transp_y_z_r")
+     if (decomp_profiler_transpose) call decomp_profiler_start("transp_y_z_r")
 #endif
 
-    s1 = SIZE(src,1)
-    s2 = SIZE(src,2)
-    s3 = SIZE(src,3)
-    d1 = SIZE(dst,1)
-    d2 = SIZE(dst,2)
-    d3 = SIZE(dst,3)
-
-    ! rearrange source array as send buffer
-#ifdef SHM
-    work1_p = decomp%ROW_INFO%SND_P
-    call mem_split_yz_real(src, s1, s2, s3, work1, dims(2), &
-         decomp%y2dist, decomp)
-#else
+     s1 = SIZE(src, 1)
+     s2 = SIZE(src, 2)
+     s3 = SIZE(src, 3)
+     d1 = SIZE(dst, 1)
+     d2 = SIZE(dst, 2)
+     d3 = SIZE(dst, 3)
 
+     ! rearrange source array as send buffer
 #if defined(_GPU)
-    call mem_split_yz_real(src, s1, s2, s3, work1_r_d, dims(2), &
-         decomp%y2dist, decomp)
+     call mem_split_yz_real(src, s1, s2, s3, work1_r_d, dims(2), &
+                            decomp%y2dist, decomp)
 #else
-    call mem_split_yz_real(src, s1, s2, s3, work1_r, dims(2), &
-         decomp%y2dist, decomp)
+     call mem_split_yz_real(src, s1, s2, s3, work1_r, dims(2), &
+                            decomp%y2dist, decomp)
 #endif
 
-#endif
-
-    ! define receive buffer
-#ifdef SHM
-    work2_p = decomp%ROW_INFO%RCV_P
-    call MPI_BARRIER(decomp%ROW_INFO%CORE_COMM, ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_BARRIER")
-#endif
-    
-#ifdef SHM
-    if (decomp%ROW_INFO%CORE_ME==1) THEN
-       call MPI_ALLTOALLV(work1, decomp%y2cnts_s, decomp%y2disp_s, &
-            real_type, work2, decomp%z2cnts_s, decomp%z2disp_s, &
-            real_type, decomp%ROW_INFO%SMP_COMM, ierror)
-       if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALLV")
-    end if
-#else
+     ! define receive buffer
 #ifdef EVEN
-    if (decomp%even) then
-       call MPI_ALLTOALL(work1_r, decomp%y2count, &
-            real_type, dst, decomp%z2count, &
-            real_type, DECOMP_2D_COMM_ROW, ierror)
-       if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALLV")
-    else
-       call MPI_ALLTOALL(work1_r, decomp%y2count, &
-            real_type, work2_r, decomp%z2count, &
-            real_type, DECOMP_2D_COMM_ROW, ierror)
-       if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALL")
-    end if
+     if (decomp%even) then
+        call MPI_ALLTOALL(work1_r, decomp%y2count, &
+                          real_type, dst, decomp%z2count, &
+                          real_type, DECOMP_2D_COMM_ROW, ierror)
+        if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALLV")
+     else
+        call MPI_ALLTOALL(work1_r, decomp%y2count, &
+                          real_type, work2_r, decomp%z2count, &
+                          real_type, DECOMP_2D_COMM_ROW, ierror)
+        if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALL")
+     end if
 #else
 
 #if defined(_GPU)
 #if defined(_NCCL)
-    nccl_stat = ncclGroupStart()
-    if (nccl_stat /= ncclSuccess) call decomp_2d_abort(__FILE__, __LINE__, nccl_stat, "ncclGroupStart")
-    do row_rank_id = 0, (row_comm_size - 1)
-        nccl_stat = ncclSend(work1_r_d( decomp%y2disp(row_rank_id)+1 ), decomp%y2cnts(row_rank_id), &
-          ncclDouble, local_to_global_row(row_rank_id+1), nccl_comm_2decomp, cuda_stream_2decomp)
-        if (nccl_stat /= ncclSuccess) call decomp_2d_abort(__FILE__, __LINE__, nccl_stat, "ncclSend")
-        nccl_stat = ncclRecv(work2_r_d( decomp%z2disp(row_rank_id)+1 ), decomp%z2cnts(row_rank_id), &
-          ncclDouble, local_to_global_row(row_rank_id+1), nccl_comm_2decomp, cuda_stream_2decomp)
-        if (nccl_stat /= ncclSuccess) call decomp_2d_abort(__FILE__, __LINE__, nccl_stat, "ncclRecv")
-    end do
-    nccl_stat = ncclGroupEnd()
-    if (nccl_stat /= ncclSuccess) call decomp_2d_abort(__FILE__, __LINE__, nccl_stat, "ncclGroupEnd")
-    cuda_stat = cudaStreamSynchronize(cuda_stream_2decomp)
-    if (cuda_stat /= 0) call decomp_2d_abort(__FILE__, __LINE__, cuda_stat, "cudaStreamSynchronize")
+     call decomp_2d_nccl_send_recv_row(work2_r_d, &
+                                       work1_r_d, &
+                                       decomp%y2disp, &
+                                       decomp%y2cnts, &
+                                       decomp%z2disp, &
+                                       decomp%z2cnts, &
+                                       dims(2))
 #else
-    call MPI_ALLTOALLV(work1_r_d, decomp%y2cnts, decomp%y2disp, &
-         real_type, work2_r_d, decomp%z2cnts, decomp%z2disp, &
-         real_type, DECOMP_2D_COMM_ROW, ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALLV")
+     call MPI_ALLTOALLV(work1_r_d, decomp%y2cnts, decomp%y2disp, &
+                        real_type, work2_r_d, decomp%z2cnts, decomp%z2disp, &
+                        real_type, DECOMP_2D_COMM_ROW, ierror)
+     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALLV")
 #endif
 #else
-    call MPI_ALLTOALLV(work1_r, decomp%y2cnts, decomp%y2disp, &
-         real_type, dst, decomp%z2cnts, decomp%z2disp, &
-         real_type, DECOMP_2D_COMM_ROW, ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALLV")
+     call MPI_ALLTOALLV(work1_r, decomp%y2cnts, decomp%y2disp, &
+                        real_type, dst, decomp%z2cnts, decomp%z2disp, &
+                        real_type, DECOMP_2D_COMM_ROW, ierror)
+     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALLV")
 #endif
 
-#endif
 #endif
 
-    ! rearrange receive buffer
-#ifdef SHM
-    call MPI_BARRIER(decomp%ROW_INFO%CORE_COMM, ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_BARRIER")
-    call mem_merge_yz_real(work2, d1, d2, d3, dst, dims(2), &
-         decomp%z2dist, decomp)
-#else
+     ! rearrange receive buffer
 #ifdef EVEN
-    if (.not. decomp%even) then
-       call mem_merge_yz_real(work2_r, d1, d2, d3, dst, dims(2), &
-         decomp%z2dist, decomp)
-    end if
+     if (.not. decomp%even) then
+        call mem_merge_yz_real(work2_r, d1, d2, d3, dst, dims(2), &
+                               decomp%z2dist, decomp)
+     end if
 #else
-    ! note the receive buffer is already in natural (i,j,k) order
-    ! so no merge operation needed
+     ! note the receive buffer is already in natural (i,j,k) order
+     ! so no merge operation needed
 
 #if defined(_GPU)
-    istat = cudaMemcpy( dst, work2_r_d, d1*d2*d3, cudaMemcpyDeviceToDevice )
+     !If one of the array in cuda call is not device we need to add acc host_data
+     !$acc host_data use_device(dst)
+     istat = cudaMemcpy(dst, work2_r_d, d1 * d2 * d3, cudaMemcpyDeviceToDevice)
+     !$acc end host_data
+     if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cudaMemcpy2D")
 #endif
 
-#endif
 #endif
 
 #ifdef PROFILER
-    if (decomp_profiler_transpose) call decomp_profiler_end("transp_y_z_r")
+     if (decomp_profiler_transpose) call decomp_profiler_end("transp_y_z_r")
 #endif
 
-    return
+     return
   end subroutine transpose_y_to_z_real
 
-
   subroutine transpose_y_to_z_complex_short(src, dst)
 
-    implicit none
+     implicit none
 
-    complex(mytype), dimension(:,:,:), intent(IN) :: src
-    complex(mytype), dimension(:,:,:), intent(OUT) :: dst
+     complex(mytype), dimension(:, :, :), intent(IN) :: src
+     complex(mytype), dimension(:, :, :), intent(OUT) :: dst
 
-    call transpose_y_to_z(src, dst, decomp_main)
+     call transpose_y_to_z(src, dst, decomp_main)
 
   end subroutine transpose_y_to_z_complex_short
 
-  subroutine transpose_y_to_z_complex(src, dst, decomp)
+  subroutine transpose_y_to_z_complex_long(src, dst, decomp)
+
+     implicit none
 
-    implicit none
-    
-    complex(mytype), dimension(:,:,:), intent(IN) :: src
-    complex(mytype), dimension(:,:,:), intent(OUT) :: dst
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
+     complex(mytype), dimension(:, :, :), intent(IN) :: src
+     complex(mytype), dimension(:, :, :), intent(OUT) :: dst
+     TYPE(DECOMP_INFO), intent(IN) :: decomp
+#if defined(_GPU)
+     integer :: istat, nsize
+#endif
 
+     if (dims(2) == 1) then
 #if defined(_GPU)
-    integer :: istat
+        nsize = product(decomp%ysz)
+        !$acc host_data use_device(src,dst)
+        istat = cudaMemcpy(dst, src, nsize, cudaMemcpyDeviceToDevice)
+        !$acc end host_data
+        if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cudaMemcpy")
+#else
+        dst = src
 #endif
+     else
+        call transpose_y_to_z_complex(src, dst, decomp)
+     end if
+
+  end subroutine transpose_y_to_z_complex_long
+
+  subroutine transpose_y_to_z_complex(src, dst, decomp)
 
-#ifdef SHM
-    complex(mytype) :: work1(*), work2(*)
-    POINTER  (work1_p, work1), (work2_p, work2)  ! Cray pointers
+     implicit none
+
+     complex(mytype), dimension(:, :, :), intent(IN) :: src
+     complex(mytype), dimension(:, :, :), intent(OUT) :: dst
+     TYPE(DECOMP_INFO), intent(IN) :: decomp
+
+#if defined(_GPU)
+     integer :: istat
 #endif
-    
-    integer :: s1,s2,s3,d1,d2,d3
-    integer :: ierror
+
+     integer :: s1, s2, s3, d1, d2, d3
+     integer :: ierror
 
 #ifdef PROFILER
-    if (decomp_profiler_transpose) call decomp_profiler_start("transp_y_z_c")
+     if (decomp_profiler_transpose) call decomp_profiler_start("transp_y_z_c")
 #endif
 
-    s1 = SIZE(src,1)
-    s2 = SIZE(src,2)
-    s3 = SIZE(src,3)
-    d1 = SIZE(dst,1)
-    d2 = SIZE(dst,2)
-    d3 = SIZE(dst,3)
-    
-    ! rearrange source array as send buffer
-#ifdef SHM
-    work1_p = decomp%ROW_INFO%SND_P_c
-    call mem_split_yz_complex(src, s1, s2, s3, work1, dims(2), &
-         decomp%y2dist, decomp)
-#else
+     s1 = SIZE(src, 1)
+     s2 = SIZE(src, 2)
+     s3 = SIZE(src, 3)
+     d1 = SIZE(dst, 1)
+     d2 = SIZE(dst, 2)
+     d3 = SIZE(dst, 3)
 
+     ! rearrange source array as send buffer
 #if defined(_GPU)
-    call mem_split_yz_complex(src, s1, s2, s3, work1_c_d, dims(2), &
-         decomp%y2dist, decomp)
+     call mem_split_yz_complex(src, s1, s2, s3, work1_c_d, dims(2), &
+                               decomp%y2dist, decomp)
 #else
-    call mem_split_yz_complex(src, s1, s2, s3, work1_c, dims(2), &
-         decomp%y2dist, decomp)
+     call mem_split_yz_complex(src, s1, s2, s3, work1_c, dims(2), &
+                               decomp%y2dist, decomp)
 #endif
 
-#endif
-    
-    ! define receive buffer
-#ifdef SHM
-    work2_p = decomp%ROW_INFO%RCV_P_c
-    call MPI_BARRIER(decomp%ROW_INFO%CORE_COMM, ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_BARRIER")
-#endif
-    
-#ifdef SHM
-    if (decomp%ROW_INFO%CORE_ME==1) THEN
-       call MPI_ALLTOALLV(work1, decomp%y2cnts_s, decomp%y2disp_s, &
-            complex_type, work2, decomp%z2cnts_s, decomp%z2disp_s, &
-            complex_type, decomp%ROW_INFO%SMP_COMM, ierror)
-       if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALLV")
-    end if
-#else
+     ! define receive buffer
 #ifdef EVEN
-    if (decomp%even) then
-       call MPI_ALLTOALL(work1_c, decomp%y2count, &
-            complex_type, dst, decomp%z2count, &
-            complex_type, DECOMP_2D_COMM_ROW, ierror)
-    else
-       call MPI_ALLTOALL(work1_c, decomp%y2count, &
-            complex_type, work2_c, decomp%z2count, &
-            complex_type, DECOMP_2D_COMM_ROW, ierror)
-    end if
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALL")
+     if (decomp%even) then
+        call MPI_ALLTOALL(work1_c, decomp%y2count, &
+                          complex_type, dst, decomp%z2count, &
+                          complex_type, DECOMP_2D_COMM_ROW, ierror)
+     else
+        call MPI_ALLTOALL(work1_c, decomp%y2count, &
+                          complex_type, work2_c, decomp%z2count, &
+                          complex_type, DECOMP_2D_COMM_ROW, ierror)
+     end if
+     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALL")
 #else
 
 #if defined(_GPU)
-    call MPI_ALLTOALLV(work1_c_d, decomp%y2cnts, decomp%y2disp, &
-         complex_type, work2_c_d, decomp%z2cnts, decomp%z2disp, &
-         complex_type, DECOMP_2D_COMM_ROW, ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALLV")
+#if defined(_NCCL)
+     call decomp_2d_nccl_send_recv_row(work2_c_d, &
+                                       work1_c_d, &
+                                       decomp%y2disp, &
+                                       decomp%y2cnts, &
+                                       decomp%z2disp, &
+                                       decomp%z2cnts, &
+                                       dims(2), &
+                                       decomp_buf_size)
 #else
-    call MPI_ALLTOALLV(work1_c, decomp%y2cnts, decomp%y2disp, &
-         complex_type, dst, decomp%z2cnts, decomp%z2disp, &
-         complex_type, DECOMP_2D_COMM_ROW, ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALLV")
+     call MPI_ALLTOALLV(work1_c_d, decomp%y2cnts, decomp%y2disp, &
+                        complex_type, work2_c_d, decomp%z2cnts, decomp%z2disp, &
+                        complex_type, DECOMP_2D_COMM_ROW, ierror)
+     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALLV")
 #endif
-
+#else
+     call MPI_ALLTOALLV(work1_c, decomp%y2cnts, decomp%y2disp, &
+                        complex_type, dst, decomp%z2cnts, decomp%z2disp, &
+                        complex_type, DECOMP_2D_COMM_ROW, ierror)
+     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALLV")
 #endif
+
 #endif
 
-    ! rearrange receive buffer
-#ifdef SHM
-    call MPI_BARRIER(decomp%ROW_INFO%CORE_COMM, ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_BARRIER")
-    call mem_merge_yz_complex(work2, d1, d2, d3, dst, dims(2), &
-         decomp%z2dist, decomp)
-#else
+     ! rearrange receive buffer
 #ifdef EVEN
-    if (.not. decomp%even) then
-       call mem_merge_yz_complex(work2_c, d1, d2, d3, dst, dims(2), &
-         decomp%z2dist, decomp)
-    end if
+     if (.not. decomp%even) then
+        call mem_merge_yz_complex(work2_c, d1, d2, d3, dst, dims(2), &
+                                  decomp%z2dist, decomp)
+     end if
 #else
-    ! note the receive buffer is already in natural (i,j,k) order
-    ! so no merge operation needed
+     ! note the receive buffer is already in natural (i,j,k) order
+     ! so no merge operation needed
 
 #if defined(_GPU)
-    istat = cudaMemcpy( dst, work2_c_d, d1*d2*d3, cudaMemcpyDeviceToDevice )
+     !$acc host_data use_device(dst)
+     istat = cudaMemcpy(dst, work2_c_d, d1 * d2 * d3, cudaMemcpyDeviceToDevice)
+     !$acc end host_data
+     if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cudaMemcpy2D")
 #endif
 
-#endif
 #endif
 
 #ifdef PROFILER
-    if (decomp_profiler_transpose) call decomp_profiler_end("transp_y_z_c")
+     if (decomp_profiler_transpose) call decomp_profiler_end("transp_y_z_c")
 #endif
 
-    return
+     return
   end subroutine transpose_y_to_z_complex
 
-  subroutine mem_split_yz_real(in,n1,n2,n3,out,iproc,dist,decomp)
+  subroutine mem_split_yz_real(in, n1, n2, n3, out, iproc, dist, decomp)
 
-    implicit none
+     implicit none
 
-    integer, intent(IN) :: n1,n2,n3
-    real(mytype), dimension(n1,n2,n3), intent(IN) :: in
-    real(mytype), dimension(*), intent(OUT) :: out
-    integer, intent(IN) :: iproc
-    integer, dimension(0:iproc-1), intent(IN) :: dist
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
+     integer, intent(IN) :: n1, n2, n3
+     real(mytype), dimension(n1, n2, n3), intent(IN) :: in
+     real(mytype), dimension(*), intent(OUT) :: out
+     integer, intent(IN) :: iproc
+     integer, dimension(0:iproc - 1), intent(IN) :: dist
+     TYPE(DECOMP_INFO), intent(IN) :: decomp
 #if defined(_GPU)
-    attributes(device) :: out
-    integer :: istat
+     attributes(device) :: out
+     integer :: istat
 #endif
 
-    integer :: i,j,k, m,i1,i2,pos
+     integer :: i, j, k, m, i1, i2, pos
 
-    do m=0,iproc-1
-       if (m==0) then 
-          i1 = 1
-          i2 = dist(0)
-       else
-          i1 = i2+1
-          i2 = i1+dist(m)-1
-       end if
+     do m = 0, iproc - 1
+        if (m == 0) then
+           i1 = 1
+           i2 = dist(0)
+        else
+           i1 = i2 + 1
+           i2 = i1 + dist(m) - 1
+        end if
 
-#ifdef SHM
-       pos = decomp%y2disp_o(m) + 1
-#else
 #ifdef EVEN
-       pos = m * decomp%y2count + 1
+        pos = m * decomp%y2count + 1
 #else
-       pos = decomp%y2disp(m) + 1
-#endif
+        pos = decomp%y2disp(m) + 1
 #endif
 
 #if defined(_GPU)
-       istat = cudaMemcpy2D( out(pos), n1*(i2-i1+1), in(1,i1,1), n1*n2, n1*(i2-i1+1), n3, cudaMemcpyDeviceToDevice )
-       if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cudaMemcpy2D")
+        !$acc host_data use_device(in)
+        istat = cudaMemcpy2D(out(pos), n1 * (i2 - i1 + 1), in(1, i1, 1), n1 * n2, n1 * (i2 - i1 + 1), n3, cudaMemcpyDeviceToDevice)
+        !$acc end host_data
+        if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cudaMemcpy2D")
 #else
-       do k=1,n3
-          do j=i1,i2
-             do i=1,n1
-                out(pos) = in(i,j,k)
-                pos = pos + 1
-             end do
-          end do
-       end do
-#endif
-    end do
-
-    return
+        do k = 1, n3
+           do j = i1, i2
+              do i = 1, n1
+                 out(pos) = in(i, j, k)
+                 pos = pos + 1
+              end do
+           end do
+        end do
+#endif
+     end do
+
+     return
   end subroutine mem_split_yz_real
 
+  subroutine mem_split_yz_complex(in, n1, n2, n3, out, iproc, dist, decomp)
 
-  subroutine mem_split_yz_complex(in,n1,n2,n3,out,iproc,dist,decomp)
-
-    implicit none
+     implicit none
 
-    integer, intent(IN) :: n1,n2,n3
-    complex(mytype), dimension(n1,n2,n3), intent(IN) :: in
-    complex(mytype), dimension(*), intent(OUT) :: out
-    integer, intent(IN) :: iproc
-    integer, dimension(0:iproc-1), intent(IN) :: dist
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
+     integer, intent(IN) :: n1, n2, n3
+     complex(mytype), dimension(n1, n2, n3), intent(IN) :: in
+     complex(mytype), dimension(*), intent(OUT) :: out
+     integer, intent(IN) :: iproc
+     integer, dimension(0:iproc - 1), intent(IN) :: dist
+     TYPE(DECOMP_INFO), intent(IN) :: decomp
 #if defined(_GPU)
-    attributes(device) :: out
-    integer :: istat
+     attributes(device) :: out
+     integer :: istat
 #endif
 
-    integer :: i,j,k, m,i1,i2,pos
+     integer :: i, j, k, m, i1, i2, pos
 
-    do m=0,iproc-1
-       if (m==0) then 
-          i1 = 1
-          i2 = dist(0)
-       else
-          i1 = i2+1
-          i2 = i1+dist(m)-1
-       end if
+     do m = 0, iproc - 1
+        if (m == 0) then
+           i1 = 1
+           i2 = dist(0)
+        else
+           i1 = i2 + 1
+           i2 = i1 + dist(m) - 1
+        end if
 
-#ifdef SHM
-       pos = decomp%y2disp_o(m) + 1
-#else
 #ifdef EVEN
-       pos = m * decomp%y2count + 1
+        pos = m * decomp%y2count + 1
 #else
-       pos = decomp%y2disp(m) + 1
-#endif
+        pos = decomp%y2disp(m) + 1
 #endif
 
 #if defined(_GPU)
-       istat = cudaMemcpy2D( out(pos), n1*(i2-i1+1), in(1,i1,1), n1*n2, n1*(i2-i1+1), n3, cudaMemcpyDeviceToDevice )
-       if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cudaMemcpy2D")
+        !$acc host_data use_device(in)
+        istat = cudaMemcpy2D(out(pos), n1 * (i2 - i1 + 1), in(1, i1, 1), n1 * n2, n1 * (i2 - i1 + 1), n3, cudaMemcpyDeviceToDevice)
+        !$acc end host_data
+        if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cudaMemcpy2D")
 #else
-       do k=1,n3
-          do j=i1,i2
-             do i=1,n1
-                out(pos) = in(i,j,k)
-                pos = pos + 1
-             end do
-          end do
-       end do
-#endif
-    end do
-
-    return
+        do k = 1, n3
+           do j = i1, i2
+              do i = 1, n1
+                 out(pos) = in(i, j, k)
+                 pos = pos + 1
+              end do
+           end do
+        end do
+#endif
+     end do
+
+     return
   end subroutine mem_split_yz_complex
 
+  subroutine mem_merge_yz_real(in, n1, n2, n3, out, iproc, dist, decomp)
+
+     implicit none
+
+     integer, intent(IN) :: n1, n2, n3
+     real(mytype), dimension(*), intent(IN) :: in
+     real(mytype), dimension(n1, n2, n3), intent(OUT) :: out
+     integer, intent(IN) :: iproc
+     integer, dimension(0:iproc - 1), intent(IN) :: dist
+     TYPE(DECOMP_INFO), intent(IN) :: decomp
+
+     integer :: i, j, k, m, i1, i2, pos
+
+     do m = 0, iproc - 1
+        if (m == 0) then
+           i1 = 1
+           i2 = dist(0)
+        else
+           i1 = i2 + 1
+           i2 = i1 + dist(m) - 1
+        end if
 
-  subroutine mem_merge_yz_real(in,n1,n2,n3,out,iproc,dist,decomp)
-
-    implicit none
-    
-    integer, intent(IN) :: n1,n2,n3
-    real(mytype), dimension(*), intent(IN) :: in
-    real(mytype), dimension(n1,n2,n3), intent(OUT) :: out
-    integer, intent(IN) :: iproc
-    integer, dimension(0:iproc-1), intent(IN) :: dist
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
-    
-    integer :: i,j,k, m,i1,i2, pos
-
-    do m=0,iproc-1
-       if (m==0) then
-          i1 = 1
-          i2 = dist(0)
-       else
-          i1 = i2+1
-          i2 = i1+dist(m)-1
-       end if
-
-#ifdef SHM
-       pos = decomp%z2disp_o(m) + 1
-#else
 #ifdef EVEN
-       pos = m * decomp%z2count + 1
+        pos = m * decomp%z2count + 1
 #else
-       pos = decomp%z2disp(m) + 1
-#endif
+        pos = decomp%z2disp(m) + 1
 #endif
 
-       do k=i1,i2
-          do j=1,n2
-             do i=1,n1
-                out(i,j,k) = in(pos)
-                pos = pos + 1
-             end do
-          end do
-       end do
-    end do
-
-    return
+        do k = i1, i2
+           do j = 1, n2
+              do i = 1, n1
+                 out(i, j, k) = in(pos)
+                 pos = pos + 1
+              end do
+           end do
+        end do
+     end do
+
+     return
   end subroutine mem_merge_yz_real
 
+  subroutine mem_merge_yz_complex(in, n1, n2, n3, out, iproc, dist, decomp)
+
+     implicit none
+
+     integer, intent(IN) :: n1, n2, n3
+     complex(mytype), dimension(*), intent(IN) :: in
+     complex(mytype), dimension(n1, n2, n3), intent(OUT) :: out
+     integer, intent(IN) :: iproc
+     integer, dimension(0:iproc - 1), intent(IN) :: dist
+     TYPE(DECOMP_INFO), intent(IN) :: decomp
+
+     integer :: i, j, k, m, i1, i2, pos
+
+     do m = 0, iproc - 1
+        if (m == 0) then
+           i1 = 1
+           i2 = dist(0)
+        else
+           i1 = i2 + 1
+           i2 = i1 + dist(m) - 1
+        end if
 
-  subroutine mem_merge_yz_complex(in,n1,n2,n3,out,iproc,dist,decomp)
-    
-    implicit none
-    
-    integer, intent(IN) :: n1,n2,n3
-    complex(mytype), dimension(*), intent(IN) :: in
-    complex(mytype), dimension(n1,n2,n3), intent(OUT) :: out
-    integer, intent(IN) :: iproc
-    integer, dimension(0:iproc-1), intent(IN) :: dist
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
-    
-    integer :: i,j,k, m,i1,i2, pos
-
-    do m=0,iproc-1
-       if (m==0) then
-          i1 = 1
-          i2 = dist(0)
-       else
-          i1 = i2+1
-          i2 = i1+dist(m)-1
-       end if
-
-#ifdef SHM
-       pos = decomp%z2disp_o(m) + 1
-#else
 #ifdef EVEN
-       pos = m * decomp%z2count + 1
+        pos = m * decomp%z2count + 1
 #else
-       pos = decomp%z2disp(m) + 1
-#endif
+        pos = decomp%z2disp(m) + 1
 #endif
 
-       do k=i1,i2
-          do j=1,n2
-             do i=1,n1
-                out(i,j,k) = in(pos)
-                pos = pos + 1
-             end do
-          end do
-       end do
-    end do
-
-    return
+        do k = i1, i2
+           do j = 1, n2
+              do i = 1, n1
+                 out(i, j, k) = in(pos)
+                 pos = pos + 1
+              end do
+           end do
+        end do
+     end do
+
+     return
   end subroutine mem_merge_yz_complex
diff --git a/src/transpose_z_to_y.f90 b/src/transpose_z_to_y.f90
index 33030a7d..6bfb36b2 100644
--- a/src/transpose_z_to_y.f90
+++ b/src/transpose_z_to_y.f90
@@ -1,502 +1,466 @@
-!=======================================================================
-! This is part of the 2DECOMP&FFT library
-! 
-! 2DECOMP&FFT is a software framework for general-purpose 2D (pencil) 
-! decomposition. It also implements a highly scalable distributed
-! three-dimensional Fast Fourier Transform (FFT).
-!
-! Copyright (C) 2009-2021 Ning Li, the Numerical Algorithms Group (NAG)
-!
-!=======================================================================
+!! SPDX-License-Identifier: BSD-3-Clause
 
 ! This file contains the routines that transpose data from Z to Y pencil
 
   subroutine transpose_z_to_y_real_short(src, dst)
 
-    implicit none
+     implicit none
 
-    real(mytype), dimension(:,:,:), intent(IN) :: src
-    real(mytype), dimension(:,:,:), intent(OUT) :: dst
+     real(mytype), dimension(:, :, :), intent(IN) :: src
+     real(mytype), dimension(:, :, :), intent(OUT) :: dst
 
-    call transpose_z_to_y(src, dst, decomp_main)
+     call transpose_z_to_y(src, dst, decomp_main)
 
   end subroutine transpose_z_to_y_real_short
 
-  subroutine transpose_z_to_y_real(src, dst, decomp)
+  subroutine transpose_z_to_y_real_long(src, dst, decomp)
 
-    implicit none
-    
-    real(mytype), dimension(:,:,:), intent(IN) :: src
-    real(mytype), dimension(:,:,:), intent(OUT) :: dst
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
+     implicit none
 
+     real(mytype), dimension(:, :, :), intent(IN) :: src
+     real(mytype), dimension(:, :, :), intent(OUT) :: dst
+     TYPE(DECOMP_INFO), intent(IN) :: decomp
 #if defined(_GPU)
-#if defined(_NCCL)
-    type(ncclResult) :: nccl_stat
-    integer :: row_rank_id, cuda_stat
+     integer :: istat, nsize
 #endif
-    integer :: istat
+
+     if (dims(2) == 1) then
+#if defined(_GPU)
+        nsize = product(decomp%zsz)
+        !$acc host_data use_device(src,dst)
+        istat = cudaMemcpy(dst, src, nsize, cudaMemcpyDeviceToDevice)
+        !$acc end host_data
+        if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cudaMemcpy")
+#else
+        dst = src
 #endif
+     else
+        call transpose_z_to_y_real(src, dst, decomp)
+     end if
+
+  end subroutine transpose_z_to_y_real_long
+
+  subroutine transpose_z_to_y_real(src, dst, decomp)
+
+     implicit none
 
-#ifdef SHM
-    real(mytype) :: work1(*), work2(*)
-    POINTER  (work1_p, work1), (work2_p, work2)  ! Cray pointers
+     real(mytype), dimension(:, :, :), intent(IN) :: src
+     real(mytype), dimension(:, :, :), intent(OUT) :: dst
+     TYPE(DECOMP_INFO), intent(IN) :: decomp
+
+#if defined(_GPU)
+     integer :: istat
 #endif
-    
-    integer :: s1,s2,s3,d1,d2,d3
-    integer :: ierror
+
+     integer :: s1, s2, s3, d1, d2, d3
+     integer :: ierror
 
 #ifdef PROFILER
-    if (decomp_profiler_transpose) call decomp_profiler_start("transp_z_y_r")
+     if (decomp_profiler_transpose) call decomp_profiler_start("transp_z_y_r")
 #endif
 
-    s1 = SIZE(src,1)
-    s2 = SIZE(src,2)
-    s3 = SIZE(src,3)
-    d1 = SIZE(dst,1)
-    d2 = SIZE(dst,2)
-    d3 = SIZE(dst,3)
-
-    ! rearrange source array as send buffer
-#ifdef SHM
-    work1_p = decomp%ROW_INFO%SND_P
-    call mem_split_zy_real(src, s1, s2, s3, work1, dims(2), &
-         decomp%z2dist, decomp)
-#else
+     s1 = SIZE(src, 1)
+     s2 = SIZE(src, 2)
+     s3 = SIZE(src, 3)
+     d1 = SIZE(dst, 1)
+     d2 = SIZE(dst, 2)
+     d3 = SIZE(dst, 3)
+
+     ! rearrange source array as send buffer
 #ifdef EVEN
-    if (.not. decomp%even) then
-       call mem_split_zy_real(src, s1, s2, s3, work1_r, dims(2), &
-            decomp%z2dist, decomp)
-    end if
+     if (.not. decomp%even) then
+        call mem_split_zy_real(src, s1, s2, s3, work1_r, dims(2), &
+                               decomp%z2dist, decomp)
+     end if
 #else
-    ! note the src array is suitable to be a send buffer
-    ! so no split operation needed
+     ! note the src array is suitable to be a send buffer
+     ! so no split operation needed
 
 #if defined(_GPU)
-    istat = cudaMemcpy( work1_r_d, src, s1*s2*s3 )
+     !$acc host_data use_device(src)
+     istat = cudaMemcpy(work1_r_d, src, s1 * s2 * s3, cudaMemcpyDeviceToDevice)
+     !$acc end host_data
+     if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cudaMemcpy2D")
 #endif
 
 #endif
-#endif
-    
-    ! define receive buffer
-#ifdef SHM
-    work2_p = decomp%ROW_INFO%RCV_P
-    call MPI_BARRIER(decomp%ROW_INFO%CORE_COMM, ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_BARRIER")
-#endif
-    
-#ifdef SHM
-    if (decomp%ROW_INFO%CORE_ME==1) THEN
-       call MPI_ALLTOALLV(work1, decomp%z2cnts_s, decomp%z2disp_s, &
-            real_type, work2, decomp%y2cnts_s, decomp%y2disp_s, &
-            real_type, decomp%ROW_INFO%SMP_COMM, ierror)
-       if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALLV")
-    end if
-#else
+
+     ! define receive buffer
 #ifdef EVEN
-    if (decomp%even) then
-       call MPI_ALLTOALL(src, decomp%z2count, &
-            real_type, work2_r, decomp%y2count, &
-            real_type, DECOMP_2D_COMM_ROW, ierror)
-    else
-       call MPI_ALLTOALL(work1_r, decomp%z2count, &
-            real_type, work2_r, decomp%y2count, &
-            real_type, DECOMP_2D_COMM_ROW, ierror)
-    end if
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALL")
+     if (decomp%even) then
+        call MPI_ALLTOALL(src, decomp%z2count, &
+                          real_type, work2_r, decomp%y2count, &
+                          real_type, DECOMP_2D_COMM_ROW, ierror)
+     else
+        call MPI_ALLTOALL(work1_r, decomp%z2count, &
+                          real_type, work2_r, decomp%y2count, &
+                          real_type, DECOMP_2D_COMM_ROW, ierror)
+     end if
+     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALL")
 #else
 
 #if defined(_GPU)
 #if defined(_NCCL)
-    nccl_stat = ncclGroupStart()
-    if (nccl_stat /= ncclSuccess) call decomp_2d_abort(__FILE__, __LINE__, nccl_stat, "ncclGroupStart")
-    do row_rank_id = 0, (row_comm_size - 1)
-        nccl_stat = ncclSend(work1_r_d( decomp%z2disp(row_rank_id)+1 ), decomp%z2cnts(row_rank_id), &
-          ncclDouble, local_to_global_row(row_rank_id+1), nccl_comm_2decomp, cuda_stream_2decomp)
-        if (nccl_stat /= ncclSuccess) call decomp_2d_abort(__FILE__, __LINE__, nccl_stat, "ncclSend")
-        nccl_stat = ncclRecv(work2_r_d( decomp%y2disp(row_rank_id)+1 ), decomp%y2cnts(row_rank_id), &
-          ncclDouble, local_to_global_row(row_rank_id+1), nccl_comm_2decomp, cuda_stream_2decomp)
-        if (nccl_stat /= ncclSuccess) call decomp_2d_abort(__FILE__, __LINE__, nccl_stat, "ncclRecv")
-    end do
-    nccl_stat = ncclGroupEnd()
-    if (nccl_stat /= ncclSuccess) call decomp_2d_abort(__FILE__, __LINE__, nccl_stat, "ncclGroupEnd")
-    cuda_stat = cudaStreamSynchronize(cuda_stream_2decomp)
-    if (cuda_stat /= 0) call decomp_2d_abort(__FILE__, __LINE__, cuda_stat, "cudaStreamSynchronize")
+     call decomp_2d_nccl_send_recv_row(work2_r_d, &
+                                       work1_r_d, &
+                                       decomp%z2disp, &
+                                       decomp%z2cnts, &
+                                       decomp%y2disp, &
+                                       decomp%y2cnts, &
+                                       dims(2))
 #else
-    call MPI_ALLTOALLV(work1_r_d, decomp%z2cnts, decomp%z2disp, &
-         real_type, work2_r_d, decomp%y2cnts, decomp%y2disp, &
-         real_type, DECOMP_2D_COMM_ROW, ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALLV")
+     call MPI_ALLTOALLV(work1_r_d, decomp%z2cnts, decomp%z2disp, &
+                        real_type, work2_r_d, decomp%y2cnts, decomp%y2disp, &
+                        real_type, DECOMP_2D_COMM_ROW, ierror)
+     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALLV")
 #endif
 #else
-    call MPI_ALLTOALLV(src, decomp%z2cnts, decomp%z2disp, &
-         real_type, work2_r, decomp%y2cnts, decomp%y2disp, &
-         real_type, DECOMP_2D_COMM_ROW, ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALLV")
+     call MPI_ALLTOALLV(src, decomp%z2cnts, decomp%z2disp, &
+                        real_type, work2_r, decomp%y2cnts, decomp%y2disp, &
+                        real_type, DECOMP_2D_COMM_ROW, ierror)
+     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALLV")
 #endif
 #endif
 
-    ! rearrange receive buffer
-#ifdef SHM
-    call MPI_BARRIER(decomp%ROW_INFO%CORE_COMM, ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_BARRIER")
-    call mem_merge_zy_real(work2, d1, d2, d3, dst, dims(2), &
-         decomp%y2dist, decomp)
-#else
-
+     ! rearrange receive buffer
 #if defined(_GPU)
-    call mem_merge_zy_real(work2_r_d, d1, d2, d3, dst, dims(2), &
-         decomp%y2dist, decomp)
+     call mem_merge_zy_real(work2_r_d, d1, d2, d3, dst, dims(2), &
+                            decomp%y2dist, decomp)
 #else
-    call mem_merge_zy_real(work2_r, d1, d2, d3, dst, dims(2), &
-         decomp%y2dist, decomp)
-#endif
-
+     call mem_merge_zy_real(work2_r, d1, d2, d3, dst, dims(2), &
+                            decomp%y2dist, decomp)
 #endif
 
 #ifdef PROFILER
-    if (decomp_profiler_transpose) call decomp_profiler_end("transp_z_y_r")
+     if (decomp_profiler_transpose) call decomp_profiler_end("transp_z_y_r")
 #endif
 
-    return
+     return
   end subroutine transpose_z_to_y_real
 
-
   subroutine transpose_z_to_y_complex_short(src, dst)
 
-    implicit none
+     implicit none
 
-    complex(mytype), dimension(:,:,:), intent(IN) :: src
-    complex(mytype), dimension(:,:,:), intent(OUT) :: dst
+     complex(mytype), dimension(:, :, :), intent(IN) :: src
+     complex(mytype), dimension(:, :, :), intent(OUT) :: dst
 
-    call transpose_z_to_y(src, dst, decomp_main)
+     call transpose_z_to_y(src, dst, decomp_main)
 
   end subroutine transpose_z_to_y_complex_short
 
-  subroutine transpose_z_to_y_complex(src, dst, decomp)
+  subroutine transpose_z_to_y_complex_long(src, dst, decomp)
 
-    implicit none
-    
-    complex(mytype), dimension(:,:,:), intent(IN) :: src
-    complex(mytype), dimension(:,:,:), intent(OUT) :: dst
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
+     implicit none
 
+     complex(mytype), dimension(:, :, :), intent(IN) :: src
+     complex(mytype), dimension(:, :, :), intent(OUT) :: dst
+     TYPE(DECOMP_INFO), intent(IN) :: decomp
 #if defined(_GPU)
-    integer :: istat
+     integer :: istat, nsize
 #endif
 
-#ifdef SHM
-    complex(mytype) :: work1(*), work2(*)
-    POINTER  (work1_p, work1), (work2_p, work2)  ! Cray pointers
+     if (dims(2) == 1) then
+#if defined(_GPU)
+        nsize = product(decomp%zsz)
+        !$acc host_data use_device(src,dst)
+        istat = cudaMemcpy(dst, src, nsize, cudaMemcpyDeviceToDevice)
+        !$acc end host_data
+        if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cudaMemcpy")
+#else
+        dst = src
 #endif
-    
-    integer :: s1,s2,s3,d1,d2,d3
-    integer :: ierror
+     else
+        call transpose_z_to_y_complex(src, dst, decomp)
+     end if
+
+  end subroutine transpose_z_to_y_complex_long
+
+  subroutine transpose_z_to_y_complex(src, dst, decomp)
+
+     implicit none
+
+     complex(mytype), dimension(:, :, :), intent(IN) :: src
+     complex(mytype), dimension(:, :, :), intent(OUT) :: dst
+     TYPE(DECOMP_INFO), intent(IN) :: decomp
+
+#if defined(_GPU)
+     integer :: istat
+#endif
+
+     integer :: s1, s2, s3, d1, d2, d3
+     integer :: ierror
 
 #ifdef PROFILER
-    if (decomp_profiler_transpose) call decomp_profiler_start("transp_z_y_c")
+     if (decomp_profiler_transpose) call decomp_profiler_start("transp_z_y_c")
 #endif
 
-    s1 = SIZE(src,1)
-    s2 = SIZE(src,2)
-    s3 = SIZE(src,3)
-    d1 = SIZE(dst,1)
-    d2 = SIZE(dst,2)
-    d3 = SIZE(dst,3)
-    
-    ! rearrange source array as send buffer
-#ifdef SHM
-    work1_p = decomp%ROW_INFO%SND_P_c
-    call mem_split_zy_complex(src, s1, s2, s3, work1, dims(2), &
-         decomp%z2dist, decomp)
-#else
+     s1 = SIZE(src, 1)
+     s2 = SIZE(src, 2)
+     s3 = SIZE(src, 3)
+     d1 = SIZE(dst, 1)
+     d2 = SIZE(dst, 2)
+     d3 = SIZE(dst, 3)
+
+     ! rearrange source array as send buffer
 #ifdef EVEN
-    if (.not. decomp%even) then
-       call mem_split_zy_complex(src, s1, s2, s3, work1_c, dims(2), &
-            decomp%z2dist, decomp)
-    end if
+     if (.not. decomp%even) then
+        call mem_split_zy_complex(src, s1, s2, s3, work1_c, dims(2), &
+                                  decomp%z2dist, decomp)
+     end if
 #else
-    ! note the src array is suitable to be a send buffer
-    ! so no split operation needed
+     ! note the src array is suitable to be a send buffer
+     ! so no split operation needed
 
 #if defined(_GPU)
-    istat = cudaMemcpy( work1_c_d, src, s1*s2*s3 )
+     !$acc host_data use_device(src)
+     istat = cudaMemcpy(work1_c_d, src, s1 * s2 * s3, cudaMemcpyDeviceToDevice)
+     !$acc end host_data
+     if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cudaMemcpy2D")
 #endif
 
 #endif
-#endif
-    
-    ! define receive buffer
-#ifdef SHM
-    work2_p = decomp%ROW_INFO%RCV_P_c
-    call MPI_BARRIER(decomp%ROW_INFO%CORE_COMM, ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_BARRIER")
-#endif
-    
-#ifdef SHM
-    if (decomp%ROW_INFO%CORE_ME==1) THEN
-       call MPI_ALLTOALLV(work1, decomp%z2cnts_s, decomp%z2disp_s, &
-            complex_type, work2, decomp%y2cnts_s, decomp%y2disp_s, &
-            complex_type, decomp%ROW_INFO%SMP_COMM, ierror)
-       if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALLV")
-    end if
-#else
+
+     ! define receive buffer
 #ifdef EVEN
-    if (decomp%even) then
-       call MPI_ALLTOALL(src, decomp%z2count, &
-            complex_type, work2_c, decomp%y2count, &
-            complex_type, DECOMP_2D_COMM_ROW, ierror)
-    else
-       call MPI_ALLTOALL(work1_c, decomp%z2count, &
-            complex_type, work2_c, decomp%y2count, &
-            complex_type, DECOMP_2D_COMM_ROW, ierror)
-    end if
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALL")
+     if (decomp%even) then
+        call MPI_ALLTOALL(src, decomp%z2count, &
+                          complex_type, work2_c, decomp%y2count, &
+                          complex_type, DECOMP_2D_COMM_ROW, ierror)
+     else
+        call MPI_ALLTOALL(work1_c, decomp%z2count, &
+                          complex_type, work2_c, decomp%y2count, &
+                          complex_type, DECOMP_2D_COMM_ROW, ierror)
+     end if
+     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALL")
 #else
 
 #if defined(_GPU)
-    call MPI_ALLTOALLV(work1_c_d, decomp%z2cnts, decomp%z2disp, &
-         complex_type, work2_c_d, decomp%y2cnts, decomp%y2disp, &
-         complex_type, DECOMP_2D_COMM_ROW, ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALLV")
+#if defined(_NCCL)
+     call decomp_2d_nccl_send_recv_row(work2_c_d, &
+                                       work1_c_d, &
+                                       decomp%z2disp, &
+                                       decomp%z2cnts, &
+                                       decomp%y2disp, &
+                                       decomp%y2cnts, &
+                                       dims(2), &
+                                       decomp_buf_size)
 #else
-    call MPI_ALLTOALLV(src, decomp%z2cnts, decomp%z2disp, &
-         complex_type, work2_c, decomp%y2cnts, decomp%y2disp, &
-         complex_type, DECOMP_2D_COMM_ROW, ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALLV")
+     call MPI_ALLTOALLV(work1_c_d, decomp%z2cnts, decomp%z2disp, &
+                        complex_type, work2_c_d, decomp%y2cnts, decomp%y2disp, &
+                        complex_type, DECOMP_2D_COMM_ROW, ierror)
+     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALLV")
 #endif
+#else
+     call MPI_ALLTOALLV(src, decomp%z2cnts, decomp%z2disp, &
+                        complex_type, work2_c, decomp%y2cnts, decomp%y2disp, &
+                        complex_type, DECOMP_2D_COMM_ROW, ierror)
+     if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_ALLTOALLV")
 #endif
 
-#endif
 #endif
 
-    ! rearrange receive buffer
-#ifdef SHM
-    call MPI_BARRIER(decomp%ROW_INFO%CORE_COMM, ierror)
-    if (ierror /= 0) call decomp_2d_abort(__FILE__, __LINE__, ierror, "MPI_BARRIER")
-    call mem_merge_zy_complex(work2, d1, d2, d3, dst, dims(2), &
-         decomp%y2dist, decomp)
-#else
-
+     ! rearrange receive buffer
 #if defined(_GPU)
-    call mem_merge_zy_complex(work2_c_d, d1, d2, d3, dst, dims(2), &
-         decomp%y2dist, decomp)
+     call mem_merge_zy_complex(work2_c_d, d1, d2, d3, dst, dims(2), &
+                               decomp%y2dist, decomp)
 #else
-    call mem_merge_zy_complex(work2_c, d1, d2, d3, dst, dims(2), &
-         decomp%y2dist, decomp)
-#endif
-
+     call mem_merge_zy_complex(work2_c, d1, d2, d3, dst, dims(2), &
+                               decomp%y2dist, decomp)
 #endif
 
 #ifdef PROFILER
-    if (decomp_profiler_transpose) call decomp_profiler_end("transp_z_y_c")
+     if (decomp_profiler_transpose) call decomp_profiler_end("transp_z_y_c")
 #endif
 
-    return
+     return
   end subroutine transpose_z_to_y_complex
 
-
   ! pack/unpack ALLTOALL(V) buffers
 
-  subroutine mem_split_zy_real(in,n1,n2,n3,out,iproc,dist,decomp)
-
-    implicit none
-
-    integer, intent(IN) :: n1,n2,n3
-    real(mytype), dimension(n1,n2,n3), intent(IN) :: in
-    real(mytype), dimension(*), intent(OUT) :: out
-    integer, intent(IN) :: iproc
-    integer, dimension(0:iproc-1), intent(IN) :: dist
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
-    
-    integer :: i,j,k, m,i1,i2,pos
-
-    do m=0,iproc-1
-       if (m==0) then 
-          i1 = 1
-          i2 = dist(0)
-       else
-          i1 = i2+1
-          i2 = i1+dist(m)-1
-       end if
-
-#ifdef SHM
-       pos = decomp%z2disp_o(m) + 1
-#else
+  subroutine mem_split_zy_real(in, n1, n2, n3, out, iproc, dist, decomp)
+
+     implicit none
+
+     integer, intent(IN) :: n1, n2, n3
+     real(mytype), dimension(n1, n2, n3), intent(IN) :: in
+     real(mytype), dimension(*), intent(OUT) :: out
+     integer, intent(IN) :: iproc
+     integer, dimension(0:iproc - 1), intent(IN) :: dist
+     TYPE(DECOMP_INFO), intent(IN) :: decomp
+
+     integer :: i, j, k, m, i1, i2, pos
+
+     do m = 0, iproc - 1
+        if (m == 0) then
+           i1 = 1
+           i2 = dist(0)
+        else
+           i1 = i2 + 1
+           i2 = i1 + dist(m) - 1
+        end if
+
 #ifdef EVEN
-       pos = m * decomp%z2count + 1
+        pos = m * decomp%z2count + 1
 #else
-       pos = decomp%z2disp(m) + 1
-#endif
+        pos = decomp%z2disp(m) + 1
 #endif
 
-       do k=i1,i2
-          do j=1,n2
-             do i=1,n1
-                out(pos) = in(i,j,k)
-                pos = pos + 1
-             end do
-          end do
-       end do
-    end do
-
-    return
+        do k = i1, i2
+           do j = 1, n2
+              do i = 1, n1
+                 out(pos) = in(i, j, k)
+                 pos = pos + 1
+              end do
+           end do
+        end do
+     end do
+
+     return
   end subroutine mem_split_zy_real
 
+  subroutine mem_split_zy_complex(in, n1, n2, n3, out, iproc, dist, decomp)
 
-  subroutine mem_split_zy_complex(in,n1,n2,n3,out,iproc,dist,decomp)
+     implicit none
 
-    implicit none
+     integer, intent(IN) :: n1, n2, n3
+     complex(mytype), dimension(n1, n2, n3), intent(IN) :: in
+     complex(mytype), dimension(*), intent(OUT) :: out
+     integer, intent(IN) :: iproc
+     integer, dimension(0:iproc - 1), intent(IN) :: dist
+     TYPE(DECOMP_INFO), intent(IN) :: decomp
 
-    integer, intent(IN) :: n1,n2,n3
-    complex(mytype), dimension(n1,n2,n3), intent(IN) :: in
-    complex(mytype), dimension(*), intent(OUT) :: out
-    integer, intent(IN) :: iproc
-    integer, dimension(0:iproc-1), intent(IN) :: dist
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
-    
-    integer :: i,j,k, m,i1,i2,pos
+     integer :: i, j, k, m, i1, i2, pos
 
-    do m=0,iproc-1
-       if (m==0) then 
-          i1 = 1
-          i2 = dist(0)
-       else
-          i1 = i2+1
-          i2 = i1+dist(m)-1
-       end if
+     do m = 0, iproc - 1
+        if (m == 0) then
+           i1 = 1
+           i2 = dist(0)
+        else
+           i1 = i2 + 1
+           i2 = i1 + dist(m) - 1
+        end if
 
-#ifdef SHM
-       pos = decomp%z2disp_o(m) + 1
-#else
 #ifdef EVEN
-       pos = m * decomp%z2count + 1
+        pos = m * decomp%z2count + 1
 #else
-       pos = decomp%z2disp(m) + 1
-#endif
+        pos = decomp%z2disp(m) + 1
 #endif
 
-       do k=i1,i2
-          do j=1,n2
-             do i=1,n1
-                out(pos) = in(i,j,k)
-                pos = pos + 1
-             end do
-          end do
-       end do
-    end do
-
-    return
+        do k = i1, i2
+           do j = 1, n2
+              do i = 1, n1
+                 out(pos) = in(i, j, k)
+                 pos = pos + 1
+              end do
+           end do
+        end do
+     end do
+
+     return
   end subroutine mem_split_zy_complex
 
+  subroutine mem_merge_zy_real(in, n1, n2, n3, out, iproc, dist, decomp)
 
-  subroutine mem_merge_zy_real(in,n1,n2,n3,out,iproc,dist,decomp)
+     implicit none
 
-    implicit none
-    
-    integer, intent(IN) :: n1,n2,n3
-    real(mytype), dimension(*), intent(IN) :: in
-    real(mytype), dimension(n1,n2,n3), intent(OUT) :: out
-    integer, intent(IN) :: iproc
-    integer, dimension(0:iproc-1), intent(IN) :: dist
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
+     integer, intent(IN) :: n1, n2, n3
+     real(mytype), dimension(*), intent(IN) :: in
+     real(mytype), dimension(n1, n2, n3), intent(OUT) :: out
+     integer, intent(IN) :: iproc
+     integer, dimension(0:iproc - 1), intent(IN) :: dist
+     TYPE(DECOMP_INFO), intent(IN) :: decomp
 #if defined(_GPU)
-    attributes(device) :: in
-    integer :: istat
+     attributes(device) :: in
+     integer :: istat
 #endif
 
-    integer :: i,j,k, m,i1,i2, pos
+     integer :: i, j, k, m, i1, i2, pos
 
-    do m=0,iproc-1
-       if (m==0) then
-          i1 = 1
-          i2 = dist(0)
-       else
-          i1 = i2+1
-          i2 = i1+dist(m)-1
-       end if
+     do m = 0, iproc - 1
+        if (m == 0) then
+           i1 = 1
+           i2 = dist(0)
+        else
+           i1 = i2 + 1
+           i2 = i1 + dist(m) - 1
+        end if
 
-#ifdef SHM
-       pos = decomp%y2disp_o(m) + 1
-#else
 #ifdef EVEN
-       pos = m * decomp%y2count + 1
+        pos = m * decomp%y2count + 1
 #else
-       pos = decomp%y2disp(m) + 1
-#endif
+        pos = decomp%y2disp(m) + 1
 #endif
 
 #if defined(_GPU)
-       istat = cudaMemcpy2D( out(1,i1,1), n1*n2, in(pos), n1*(i2-i1+1), n1*(i2-i1+1), n3, cudaMemcpyDeviceToDevice )
-       if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cudaMemcpy2D")
+        !$acc host_data use_device(out)
+        istat = cudaMemcpy2D(out(1, i1, 1), n1 * n2, in(pos), n1 * (i2 - i1 + 1), n1 * (i2 - i1 + 1), n3, cudaMemcpyDeviceToDevice)
+        !$acc end host_data
+        if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cudaMemcpy2D")
 #else
-       do k=1,n3
-          do j=i1,i2
-             do i=1,n1
-                out(i,j,k) = in(pos)
-                pos = pos + 1
-             end do
-          end do
-       end do
-#endif
-    end do
-
-    return
+        do k = 1, n3
+           do j = i1, i2
+              do i = 1, n1
+                 out(i, j, k) = in(pos)
+                 pos = pos + 1
+              end do
+           end do
+        end do
+#endif
+     end do
+
+     return
   end subroutine mem_merge_zy_real
 
+  subroutine mem_merge_zy_complex(in, n1, n2, n3, out, iproc, dist, decomp)
+
+     implicit none
+
+     integer, intent(IN) :: n1, n2, n3
+     complex(mytype), dimension(*), intent(IN) :: in
+     complex(mytype), dimension(n1, n2, n3), intent(OUT) :: out
+     integer, intent(IN) :: iproc
+     integer, dimension(0:iproc - 1), intent(IN) :: dist
+     TYPE(DECOMP_INFO), intent(IN) :: decomp
 
-  subroutine mem_merge_zy_complex(in,n1,n2,n3,out,iproc,dist,decomp)
-    
-    implicit none
-    
-    integer, intent(IN) :: n1,n2,n3
-    complex(mytype), dimension(*), intent(IN) :: in
-    complex(mytype), dimension(n1,n2,n3), intent(OUT) :: out
-    integer, intent(IN) :: iproc
-    integer, dimension(0:iproc-1), intent(IN) :: dist
-    TYPE(DECOMP_INFO), intent(IN) :: decomp
-    
 #if defined(_GPU)
-    attributes(device) :: in
-    integer :: istat
+     attributes(device) :: in
+     integer :: istat
 #endif
 
-    integer :: i,j,k, m,i1,i2, pos
+     integer :: i, j, k, m, i1, i2, pos
 
-    do m=0,iproc-1
-       if (m==0) then
-          i1 = 1
-          i2 = dist(0)
-       else
-          i1 = i2+1
-          i2 = i1+dist(m)-1
-       end if
+     do m = 0, iproc - 1
+        if (m == 0) then
+           i1 = 1
+           i2 = dist(0)
+        else
+           i1 = i2 + 1
+           i2 = i1 + dist(m) - 1
+        end if
 
-#ifdef SHM
-       pos = decomp%y2disp_o(m) + 1
-#else
 #ifdef EVEN
-       pos = m * decomp%y2count + 1
+        pos = m * decomp%y2count + 1
 #else
-       pos = decomp%y2disp(m) + 1
-#endif
+        pos = decomp%y2disp(m) + 1
 #endif
 
 #if defined(_GPU)
-       istat = cudaMemcpy2D( out(1,i1,1), n1*n2, in(pos), n1*(i2-i1+1), n1*(i2-i1+1), n3, cudaMemcpyDeviceToDevice )
-       if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cudaMemcpy2D")
+        !$acc host_data use_device(out)
+        istat = cudaMemcpy2D(out(1, i1, 1), n1 * n2, in(pos), n1 * (i2 - i1 + 1), n1 * (i2 - i1 + 1), n3, cudaMemcpyDeviceToDevice)
+        !$acc end host_data
+        if (istat /= 0) call decomp_2d_abort(__FILE__, __LINE__, istat, "cudaMemcpy2D")
 #else
-       do k=1,n3
-          do j=i1,i2
-             do i=1,n1
-                out(i,j,k) = in(pos)
-                pos = pos + 1
-             end do
-          end do
-       end do
-#endif
-    end do
-
-    return
+        do k = 1, n3
+           do j = i1, i2
+              do i = 1, n1
+                 out(i, j, k) = in(pos)
+                 pos = pos + 1
+              end do
+           end do
+        end do
+#endif
+     end do
+
+     return
   end subroutine mem_merge_zy_complex