From aa6de2bba300fa704fc09b245dd68d8018d8df3e Mon Sep 17 00:00:00 2001
From: Abishek Gopal <agopal@ucar.edu>
Date: Mon, 12 May 2025 23:08:59 -0600
Subject: [PATCH] Enable GPU execution of loops in atm_srk3 involving module
 level variables

This commit ports the loops in the mpas_atm_time_integration and mpas_atm_core
modules, which initialize the garbage cells of module level variables belonging
to the mpas_atm_time_integration module, to OpenACC in preparation for the
consolidating all data transfers between host and device to before and after
each dynamics call.

In order to do this, we also declare the allocatable module level variables in
this scope using the OpenACC declare create statement, which instructs the
nvhpc compiler to automatically create and delete the variable whenever it
encounters an allocate or deallocate statement, respectively. This commit also
removes these variables from manual data movement statements as required.

This commit also introduces integer loop bounds, so as to dereference scalar
integer pointers which the OpenACC parallel regions do not correctly copy to
device memory.
---
 .../dynamics/mpas_atm_time_integration.F      | 174 +++++++++++-------
 src/core_atmosphere/mpas_atm_core.F           |  31 +++-
 2 files changed, 130 insertions(+), 75 deletions(-)

diff --git a/src/core_atmosphere/dynamics/mpas_atm_time_integration.F b/src/core_atmosphere/dynamics/mpas_atm_time_integration.F
index fa41f66418..e3fe7f8a02 100644
--- a/src/core_atmosphere/dynamics/mpas_atm_time_integration.F
+++ b/src/core_atmosphere/dynamics/mpas_atm_time_integration.F
@@ -65,9 +65,13 @@ end subroutine halo_exchange_routine
 !   real (kind=RKIND), allocatable, dimension(:,:) :: delsq_circulation    ! no longer used -> removed 
    real (kind=RKIND), allocatable, dimension(:,:) :: delsq_vorticity
    real (kind=RKIND), allocatable, dimension(:,:) :: dpdz
+   !$acc declare create(qtot)
+   !$acc declare create(delsq_theta, delsq_w, delsq_divergence)
+   !$acc declare create(delsq_u, delsq_vorticity, dpdz)
 
    ! Used in atm_advance_scalars
    real (kind=RKIND), dimension(:,:,:), allocatable :: horiz_flux_array
+   !$acc declare create(horiz_flux_array)
 
    ! Used in atm_advance_scalars_mono
    real (kind=RKIND), dimension(:,:), allocatable :: scalar_old_arr, scalar_new_arr
@@ -77,6 +81,10 @@ end subroutine halo_exchange_routine
    real (kind=RKIND), dimension(:,:), allocatable :: flux_tmp_arr
    real (kind=RKIND), dimension(:,:), allocatable :: wdtn_arr
    real (kind=RKIND), dimension(:,:), allocatable :: rho_zz_int
+   !$acc declare create(scalar_old_arr, scalar_new_arr)
+   !$acc declare create(s_max_arr, s_min_arr)
+   !$acc declare create(flux_array, flux_upwind_tmp_arr)
+   !$acc declare create(flux_tmp_arr, wdtn_arr)
 
    real (kind=RKIND), dimension(:,:), allocatable :: ru_driving_tend ! regional_MPAS addition 
    real (kind=RKIND), dimension(:,:), allocatable :: rt_driving_tend ! regional_MPAS addition 
@@ -100,6 +108,7 @@ end subroutine halo_exchange_routine
    ! Used in compute_solve_diagnostics
    real (kind=RKIND), allocatable, dimension(:,:) :: ke_vertex
    real (kind=RKIND), allocatable, dimension(:,:) :: ke_edge
+   !$acc declare create(ke_vertex, ke_edge)
 
    type (MPAS_Clock_type), pointer, private :: clock
    type (block_type), pointer, private :: block
@@ -842,9 +851,10 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group)
       character (len=StrKIND), pointer :: config_microp_scheme
       character (len=StrKIND), pointer :: config_convection_scheme
 
-      integer, pointer :: num_scalars, index_qv, nCells, nCellsSolve, nVertices, nVerticesSolve
-      integer, pointer :: nEdges_ptr, nEdgesSolve_ptr, nVertLevels_ptr
-      integer :: nEdges, nEdgesSolve, nVertLevels
+      integer, pointer :: index_qv, nCellsSolve, nVertices_ptr, nVerticesSolve
+      integer, pointer :: nCells_ptr, nEdges_ptr, nEdgesSolve_ptr
+      integer, pointer :: nVertLevels_ptr, num_scalars_ptr
+      integer :: nCells, nEdges, nVertices, nEdgesSolve, nVertLevels, num_scalars
 
       character(len=StrKIND), pointer :: config_IAU_option
 
@@ -907,10 +917,10 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group)
       ! Retrieve dimensions
       ! Note: nCellsSolve and nVerticesSolve are not currently used in this function
       !
-      call mpas_pool_get_dimension(mesh, 'nCells', nCells)
+      call mpas_pool_get_dimension(mesh, 'nCells', nCells_ptr)
       call mpas_pool_get_dimension(mesh, 'nCellsSolve', nCellsSolve)
       call mpas_pool_get_dimension(mesh, 'nEdges', nEdges_ptr)
-      call mpas_pool_get_dimension(mesh, 'nVertices', nVertices)
+      call mpas_pool_get_dimension(mesh, 'nVertices', nVertices_ptr)
       call mpas_pool_get_dimension(mesh, 'nVertLevels', nVertLevels_ptr)
 
       !call mpas_pool_get_dimension(mesh, 'nCellsSolve', nCellsSolve)
@@ -921,6 +931,8 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group)
       ! bounds rather than pointers to integers, as the former are implicitly
       ! copied to the device
       nEdges = nEdges_ptr
+      nCells = nCells_ptr
+      nVertices = nVertices_ptr
       nEdgesSolve = nEdgesSolve_ptr
       nVertLevels = nVertLevels_ptr
 
@@ -945,13 +957,19 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group)
 #ifdef DO_PHYSICS
       call mpas_pool_get_dimension(state, 'index_qv', index_qv)
 #endif
-      call mpas_pool_get_dimension(state, 'num_scalars', num_scalars)
+      call mpas_pool_get_dimension(state, 'num_scalars', num_scalars_ptr)
+      num_scalars = num_scalars_ptr
 
       !
       ! allocate storage for physics tendency save
       !
       allocate(qtot(nVertLevels,nCells+1))
-      qtot(:,nCells+1) = 0.0_RKIND
+      !$acc parallel default(present)
+      !$acc loop vector
+      do k = 1, nVertLevels
+         qtot(k,nCells+1) = 0.0_RKIND
+      end do
+      !$acc end parallel
 
 #ifndef MPAS_CAM_DYCORE
       call mpas_pool_get_field(tend_physics, 'tend_rtheta_physics', tend_rtheta_physicsField)
@@ -1130,19 +1148,25 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group)
             call mpas_timer_start('atm_compute_dyn_tend')
    
             allocate(delsq_theta(nVertLevels,nCells+1))
-            delsq_theta(:,nCells+1) = 0.0_RKIND
             allocate(delsq_w(nVertLevels,nCells+1))
-            delsq_w(:,nCells+1) = 0.0_RKIND
-!!            allocate(qtot(nVertLevels,nCells+1))  ! initializing this earlier in solution sequence
+            !!  allocate(qtot(nVertLevels,nCells+1))  ! initializing this earlier in solution sequence
             allocate(delsq_divergence(nVertLevels,nCells+1))
-            delsq_divergence(:,nCells+1) = 0.0_RKIND
             allocate(delsq_u(nVertLevels,nEdges+1))
-            delsq_u(:,nEdges+1) = 0.0_RKIND
-!!            allocate(delsq_circulation(nVertLevels,nVertices+1))  ! no longer used -> removed 
+            !!  allocate(delsq_circulation(nVertLevels,nVertices+1))  ! no longer used -> removed
             allocate(delsq_vorticity(nVertLevels,nVertices+1))
-            delsq_vorticity(:,nVertices+1) = 0.0_RKIND
             allocate(dpdz(nVertLevels,nCells+1))
-            dpdz(:,nCells+1) = 0.0_RKIND
+
+            !$acc parallel default(present)
+            !$acc loop vector
+            do k = 1, nVertLevels
+               delsq_theta(k,nCells+1) = 0.0_RKIND
+               delsq_w(k,nCells+1) = 0.0_RKIND
+               delsq_divergence(k,nCells+1) = 0.0_RKIND
+               delsq_u(k,nEdges+1) = 0.0_RKIND
+               delsq_vorticity(k,nVertices+1) = 0.0_RKIND
+               dpdz(k,nCells+1) = 0.0_RKIND
+            end do
+            !$acc end parallel
 
 !$OMP PARALLEL DO
             do thread=1,nThreads
@@ -1411,9 +1435,15 @@ subroutine atm_srk3(domain, dt, itimestep, exchange_halo_group)
             call mpas_timer_start('atm_compute_solve_diagnostics')
 
             allocate(ke_vertex(nVertLevels,nVertices+1))
-            ke_vertex(:,nVertices+1) = 0.0_RKIND
             allocate(ke_edge(nVertLevels,nEdges+1))
-            ke_edge(:,nEdges+1) = 0.0_RKIND
+
+            !$acc parallel default(present)
+            !$acc loop vector
+            do k = 1, nVertLevels
+               ke_vertex(k,nVertices+1) = 0.0_RKIND
+               ke_edge(k,nEdges+1) = 0.0_RKIND
+            end do
+            !$acc end parallel
    
 !$OMP PARALLEL DO
             do thread=1,nThreads
@@ -1745,10 +1775,12 @@ subroutine advance_scalars(field_name, domain, rk_step, rk_timestep, config_mono
       type (mpas_pool_type), pointer :: mesh
       type (mpas_pool_type), pointer :: halo_scratch
 
-      integer, pointer :: nCells
-      integer, pointer :: nEdges
-      integer, pointer :: nVertLevels
-      integer, pointer :: num_scalars
+      integer, pointer :: nCells_ptr
+      integer, pointer :: nEdges_ptr
+      integer, pointer :: nVertLevels_ptr
+      integer, pointer :: num_scalars_ptr
+      integer :: nCells, nEdges, nVertLevels, num_scalars
+      integer :: iScalar, k
 
       integer, pointer :: nThreads
       integer, dimension(:), pointer :: cellThreadStart
@@ -1771,10 +1803,10 @@ subroutine advance_scalars(field_name, domain, rk_step, rk_timestep, config_mono
       call mpas_pool_get_subpool(block % structs, 'mesh', mesh)
       call mpas_pool_get_subpool(block % structs, 'halo_scratch', halo_scratch)
 
-      call mpas_pool_get_dimension(mesh, 'nCells', nCells)
-      call mpas_pool_get_dimension(mesh, 'nEdges', nEdges)
-      call mpas_pool_get_dimension(mesh, 'nVertLevels', nVertLevels)
-      call mpas_pool_get_dimension(state, 'num_'//trim(field_name), num_scalars)
+      call mpas_pool_get_dimension(mesh, 'nCells', nCells_ptr)
+      call mpas_pool_get_dimension(mesh, 'nEdges', nEdges_ptr)
+      call mpas_pool_get_dimension(mesh, 'nVertLevels', nVertLevels_ptr)
+      call mpas_pool_get_dimension(state, 'num_'//trim(field_name), num_scalars_ptr)
 
       call mpas_pool_get_dimension(block % dimensions, 'nThreads', nThreads)
 
@@ -1786,18 +1818,35 @@ subroutine advance_scalars(field_name, domain, rk_step, rk_timestep, config_mono
       call mpas_pool_get_dimension(block % dimensions, 'edgeThreadStart', edgeThreadStart)
       call mpas_pool_get_dimension(block % dimensions, 'edgeThreadEnd', edgeThreadEnd)
 
+      nCells = nCells_ptr
+      nEdges = nEdges_ptr
+      nVertLevels = nVertLevels_ptr
+      num_scalars = num_scalars_ptr
+
       allocate(scalar_old_arr(nVertLevels,nCells+1))
-      scalar_old_arr(:,nCells+1) = 0.0_RKIND
       allocate(scalar_new_arr(nVertLevels,nCells+1))
-      scalar_new_arr(:,nCells+1) = 0.0_RKIND
       allocate(s_max_arr(nVertLevels,nCells+1))
-      s_max_arr(:,nCells+1) = 0.0_RKIND
       allocate(s_min_arr(nVertLevels,nCells+1))
-      s_min_arr(:,nCells+1) = 0.0_RKIND
       allocate(flux_array(nVertLevels,nEdges+1))
-      flux_array(:,nEdges+1) = 0.0_RKIND
+      !$acc parallel default(present)
+      !$acc loop vector
+      do k = 1, nVertLevels
+         scalar_old_arr(k,nCells+1) = 0.0_RKIND
+         scalar_new_arr(k,nCells+1) = 0.0_RKIND
+         s_max_arr(k,nCells+1) = 0.0_RKIND
+         s_min_arr(k,nCells+1) = 0.0_RKIND
+         flux_array(k,nEdges+1) = 0.0_RKIND
+      end do
+      !$acc end parallel
+
       allocate(wdtn_arr(nVertLevels+1,nCells+1))
-      wdtn_arr(:,nCells+1) = 0.0_RKIND
+      !$acc parallel default(present)
+      !$acc loop vector
+      do k = 1, nVertLevels+1
+         wdtn_arr(k,nCells+1) = 0.0_RKIND
+      end do
+      !$acc end parallel
+
       if (config_split_dynamics_transport) then
          allocate(rho_zz_int(nVertLevels,nCells+1))
          rho_zz_int(:,nCells+1) = 0.0_RKIND
@@ -1806,12 +1855,24 @@ subroutine advance_scalars(field_name, domain, rk_step, rk_timestep, config_mono
       end if
       if (rk_step < 3 .or. (.not. config_monotonic .and. .not. config_positive_definite)) then
          allocate(horiz_flux_array(num_scalars,nVertLevels,nEdges+1))
-         horiz_flux_array(:,:,nEdges+1) = 0.0_RKIND
+         !$acc parallel default(present)
+         !$acc loop gang vector collapse(2)
+         do k = 1, nVertLevels
+            do iScalar = 1, num_scalars
+               horiz_flux_array(iScalar,k,nEdges+1) = 0.0_RKIND
+            end do
+         end do
+         !$acc end parallel
       else
          allocate(flux_upwind_tmp_arr(nVertLevels,nEdges+1))
-         flux_upwind_tmp_arr(:,nEdges+1) = 0.0_RKIND
          allocate(flux_tmp_arr(nVertLevels,nEdges+1))
-         flux_tmp_arr(:,nEdges+1) = 0.0_RKIND
+         !$acc parallel default(present)
+         !$acc loop vector
+         do k = 1, nVertLevels
+            flux_upwind_tmp_arr(k,nEdges+1) = 0.0_RKIND
+            flux_tmp_arr(k,nEdges+1) = 0.0_RKIND
+         end do
+         !$acc end parallel
       end if
 
       !
@@ -2024,7 +2085,7 @@ subroutine atm_compute_moist_coefficients( dims, state, diag, mesh, &
       moist_end = moist_end_ptr
 
       MPAS_ACC_TIMER_START('atm_compute_moist_coefficients [ACC_data_xfer]')
-      !$acc enter data create(qtot, cqw, cqu) & 
+      !$acc enter data create(cqw, cqu) & 
       !$acc            copyin(scalars)
       MPAS_ACC_TIMER_STOP('atm_compute_moist_coefficients [ACC_data_xfer]')
 
@@ -2077,7 +2138,7 @@ subroutine atm_compute_moist_coefficients( dims, state, diag, mesh, &
       !$acc end parallel
 
       MPAS_ACC_TIMER_START('atm_compute_moist_coefficients [ACC_data_xfer]')
-      !$acc exit data copyout(cqw, cqu, qtot) &
+      !$acc exit data copyout(cqw, cqu) &
       !$acc           delete(scalars)
       MPAS_ACC_TIMER_STOP('atm_compute_moist_coefficients [ACC_data_xfer]')
 
@@ -2212,7 +2273,7 @@ subroutine atm_compute_vert_imp_coefs_work(nCells, moist_start, moist_end, dts,
       real (kind=RKIND), dimension( nVertLevels ) :: b_tri, c_tri
 
       MPAS_ACC_TIMER_START('atm_compute_vert_imp_coefs_work [ACC_data_xfer]')
-      !$acc enter data copyin(cqw, p, t, qtot, rb, rtb, rt, pb)
+      !$acc enter data copyin(cqw, p, t, rb, rtb, rt, pb)
       !$acc enter data create(cofrz, cofwr, cofwz, coftz, cofwt, a_tri, b_tri, &
       !$acc                   c_tri, alpha_tri, gamma_tri)
       MPAS_ACC_TIMER_STOP('atm_compute_vert_imp_coefs_work [ACC_data_xfer]')
@@ -2298,7 +2359,7 @@ subroutine atm_compute_vert_imp_coefs_work(nCells, moist_start, moist_end, dts,
       MPAS_ACC_TIMER_START('atm_compute_vert_imp_coefs_work [ACC_data_xfer]')
       !$acc exit data copyout(cofrz, cofwr, cofwz, coftz, cofwt, a_tri, b_tri, &
       !$acc                   c_tri, alpha_tri, gamma_tri)
-      !$acc exit data delete(cqw, p, t, qtot, rb, rtb, rt, pb)
+      !$acc exit data delete(cqw, p, t, rb, rtb, rt, pb)
       MPAS_ACC_TIMER_STOP('atm_compute_vert_imp_coefs_work [ACC_data_xfer]')
 
    end subroutine atm_compute_vert_imp_coefs_work
@@ -3600,7 +3661,6 @@ subroutine atm_advance_scalars_work(nCells, num_scalars, dt, &
 
 
       MPAS_ACC_TIMER_START('atm_advance_scalars [ACC_data_xfer]')
-      !$acc enter data create(horiz_flux_arr)
       !$acc enter data copyin(uhAvg, scalar_new)
       MPAS_ACC_TIMER_STOP('atm_advance_scalars [ACC_data_xfer]')
 
@@ -3704,7 +3764,7 @@ subroutine atm_advance_scalars_work(nCells, num_scalars, dt, &
       !$acc enter data copyin(scalar_tend_save)
 #endif
       !$acc enter data copyin(scalar_old, fnm, fnp, rdnw, wwAvg, rho_zz_old, rho_zz_new)
-      !$acc enter data create(scalar_tend_column, wdtn)
+      !$acc enter data create(scalar_tend_column)
       MPAS_ACC_TIMER_STOP('atm_advance_scalars [ACC_data_xfer]')
 
       !$acc parallel wait
@@ -3787,8 +3847,8 @@ subroutine atm_advance_scalars_work(nCells, num_scalars, dt, &
 
       MPAS_ACC_TIMER_START('atm_advance_scalars [ACC_data_xfer]')
       !$acc exit data copyout(scalar_new)
-      !$acc exit data delete(scalar_tend_column, wdtn, uhAvg, wwAvg, scalar_old, fnm, fnp, &
-      !$acc                  rdnw, rho_zz_old, rho_zz_new, horiz_flux_arr, scalar_tend_save)
+      !$acc exit data delete(scalar_tend_column, uhAvg, wwAvg, scalar_old, fnm, fnp, &
+      !$acc                  rdnw, rho_zz_old, rho_zz_new, scalar_tend_save)
       MPAS_ACC_TIMER_STOP('atm_advance_scalars [ACC_data_xfer]')
 
    end subroutine atm_advance_scalars_work
@@ -4153,8 +4213,7 @@ subroutine atm_advance_scalars_mono_work(field_name, block, state, nCells, nEdge
          !$acc enter data copyin(rho_zz_new)
       end if
       !$acc enter data copyin(scalars_new, fnm, fnp)
-      !$acc enter data create(scalar_old, scalar_new, scale_arr, s_min, s_max, &
-      !$acc                   flux_arr, flux_tmp, flux_upwind_tmp, wdtn)
+      !$acc enter data create(scale_arr)
       MPAS_ACC_TIMER_STOP('atm_advance_scalars_mono [ACC_data_xfer]')
 
       do iScalar = 1, num_scalars
@@ -4665,8 +4724,7 @@ subroutine atm_advance_scalars_mono_work(field_name, block, state, nCells, nEdge
          !$acc exit data delete(rho_zz_new)
       end if
       !$acc exit data copyout(scalars_new)
-      !$acc exit data delete(scalars_old, scalar_old, scalar_new, scale_arr, s_min, s_max, &
-      !$acc                  rho_zz_old, flux_arr, flux_tmp, flux_upwind_tmp, wdtn, wwAvg, &
+      !$acc exit data delete(scalars_old, scale_arr, rho_zz_old, wwAvg, &
       !$acc                  uhAvg, fnm, fnp, rdnw)
 
       !$acc end data
@@ -5111,20 +5169,16 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm
 
          !$acc enter data create(kdiff)
          !$acc enter data copyin(tend_rho_physics)
-         !$acc enter data copyin(rb, qtot, rr_save)
+         !$acc enter data copyin(rb, rr_save)
          !$acc enter data copyin(divergence, vorticity)
-         !$acc enter data create(delsq_u)
          !$acc enter data copyin(v)
-         !$acc enter data create(delsq_vorticity, delsq_divergence)
          !$acc enter data copyin(u_init, v_init)
-         !$acc enter data create(delsq_w)
       else
          !$acc enter data copyin(tend_w_euler)
          !$acc enter data copyin(tend_u_euler)
          !$acc enter data copyin(tend_theta_euler)
          !$acc enter data copyin(tend_rho)
       end if
-      !$acc enter data create(dpdz)
       !$acc enter data create(tend_u)
       !$acc enter data copyin(cqu, pp, u, w, pv_edge, rho_edge, ke)
       !$acc enter data create(h_divergence)
@@ -5136,7 +5190,6 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm
       !$acc enter data create(tend_theta)
       !$acc enter data copyin(theta_m)
       !$acc enter data copyin(ru_save, theta_m_save)
-      !$acc enter data create(delsq_theta)
       !$acc enter data copyin(cqw)
       !$acc enter data copyin(tend_rtheta_physics)
       !$acc enter data copyin(rw_save, rt_diabatic_tend)
@@ -6152,20 +6205,16 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm
 
          !$acc exit data delete(kdiff)
          !$acc exit data delete(tend_rho_physics)
-         !$acc exit data delete(rb, qtot, rr_save)
+         !$acc exit data delete(rb, rr_save)
          !$acc exit data delete(divergence, vorticity)
-         !$acc exit data copyout(delsq_u)
          !$acc exit data delete(v)
-         !$acc exit data delete(delsq_vorticity, delsq_divergence)
          !$acc exit data delete(u_init, v_init)
-         !$acc exit data copyout(delsq_w)
       else
          !$acc exit data delete(tend_w_euler)
          !$acc exit data delete(tend_u_euler)
          !$acc exit data delete(tend_theta_euler)
          !$acc exit data delete(tend_rho)
       end if
-      !$acc exit data delete(dpdz)
       !$acc exit data copyout(tend_u)
       !$acc exit data delete(cqu, pp, u, w, pv_edge, rho_edge, ke)
       !$acc exit data copyout(h_divergence)
@@ -6177,7 +6226,6 @@ subroutine atm_compute_dyn_tend_work(nCells, nEdges, nVertices, nVertLevels_dumm
       !$acc exit data copyout(tend_theta)
       !$acc exit data delete(theta_m)
       !$acc exit data delete(ru_save, theta_m_save)
-      !$acc exit data delete(delsq_theta)
       !$acc exit data delete(cqw)
       !$acc exit data delete(tend_rtheta_physics)
       !$acc exit data delete(rw_save, rt_diabatic_tend)
@@ -6372,7 +6420,7 @@ subroutine atm_compute_solve_diagnostics_work(nCells, nEdges, nVertices, &
       ! Compute height on cell edges at velocity locations
       !
       MPAS_ACC_TIMER_START('atm_compute_solve_diagnostics [ACC_data_xfer]')
-      !$acc enter data create(h_edge,ke_edge,vorticity,divergence)
+      !$acc enter data create(h_edge,vorticity,divergence)
       MPAS_ACC_TIMER_STOP('atm_compute_solve_diagnostics [ACC_data_xfer]')
       !$acc parallel default(present)
       !$acc loop gang
@@ -6487,9 +6535,6 @@ subroutine atm_compute_solve_diagnostics_work(nCells, nEdges, nVertices, &
 
 
       if (hollingsworth) then
-         MPAS_ACC_TIMER_START('atm_compute_solve_diagnostics [ACC_data_xfer]')
-         !$acc enter data create(ke_vertex)
-         MPAS_ACC_TIMER_STOP('atm_compute_solve_diagnostics [ACC_data_xfer]')
 
          ! Compute ke at cell vertices - AG's new KE construction, part 1
          ! *** approximation here because we don't have inner triangle areas
@@ -6547,9 +6592,6 @@ subroutine atm_compute_solve_diagnostics_work(nCells, nEdges, nVertices, &
          end do
          !$acc end parallel
 
-         MPAS_ACC_TIMER_START('atm_compute_solve_diagnostics [ACC_data_xfer]')
-         !$acc exit data delete(ke_vertex)
-         MPAS_ACC_TIMER_STOP('atm_compute_solve_diagnostics [ACC_data_xfer]')
       end if
 
       !
@@ -6720,7 +6762,7 @@ subroutine atm_compute_solve_diagnostics_work(nCells, nEdges, nVertices, &
       !$acc                  verticesOnEdge, &
       !$acc                  fVertex,invDvEdge,invDcEdge)
       !$acc exit data delete(u,h)
-      !$acc exit data copyout(h_edge,ke_edge,vorticity,divergence, &
+      !$acc exit data copyout(h_edge,vorticity,divergence, &
       !$acc                   ke, &
       !$acc                   v, &
       !$acc                   pv_vertex, &
diff --git a/src/core_atmosphere/mpas_atm_core.F b/src/core_atmosphere/mpas_atm_core.F
index d948c81604..77f6b521fb 100644
--- a/src/core_atmosphere/mpas_atm_core.F
+++ b/src/core_atmosphere/mpas_atm_core.F
@@ -397,7 +397,7 @@ subroutine atm_mpas_init_block(dminfo, stream_manager, block, mesh, dt)
       type (mpas_pool_type), pointer :: atm_input
       type (mpas_pool_type), pointer :: output_noahmp
 
-      integer :: iCell,iEdge,iVertex
+      integer :: iCell,iEdge,iVertex,k
       
       real (kind=RKIND), dimension(:,:), pointer :: u, uReconstructX, uReconstructY, uReconstructZ, uReconstructZonal, uReconstructMeridional
       real (kind=RKIND), dimension(:), pointer :: meshScalingDel2, meshScalingDel4
@@ -405,7 +405,8 @@ subroutine atm_mpas_init_block(dminfo, stream_manager, block, mesh, dt)
       real (kind=RKIND), dimension(:), pointer :: dvEdge, invDvEdge
       real (kind=RKIND), dimension(:), pointer :: dcEdge, invDcEdge
       real (kind=RKIND), dimension(:), pointer :: areaTriangle, invAreaTriangle
-      integer, pointer :: nCells, nEdges, nVertices, nVertLevels, nEdgesSolve
+      integer, pointer :: nCells_ptr, nEdges_ptr, nVertices_ptr, nVertLevels_ptr, nEdgesSolve
+      integer :: nCells, nEdges, nVertices, nVertLevels
       integer :: thread
       character(len=StrKIND), pointer :: mminlu
 
@@ -444,9 +445,13 @@ subroutine atm_mpas_init_block(dminfo, stream_manager, block, mesh, dt)
       call mpas_pool_get_array(mesh, 'areaTriangle', areaTriangle)
       call mpas_pool_get_array(mesh, 'invAreaTriangle', invAreaTriangle)
       
-      call mpas_pool_get_dimension(mesh, 'nCells', nCells)      
-      call mpas_pool_get_dimension(mesh, 'nEdges', nEdges)
-      call mpas_pool_get_dimension(mesh, 'nVertices', nVertices)
+      call mpas_pool_get_dimension(mesh, 'nCells', nCells_ptr)      
+      call mpas_pool_get_dimension(mesh, 'nEdges', nEdges_ptr)
+      call mpas_pool_get_dimension(mesh, 'nVertices', nVertices_ptr)
+
+      nCells = nCells_ptr
+      nEdges = nEdges_ptr
+      nVertices = nVertices_ptr
  
       do iCell=1,nCells 
          invAreaCell(iCell) = 1.0_RKIND / areaCell(iCell)
@@ -471,13 +476,21 @@ subroutine atm_mpas_init_block(dminfo, stream_manager, block, mesh, dt)
 
       call atm_couple_coef_3rd_order(mesh, block % configs)
       
-      call mpas_pool_get_dimension(state, 'nVertices', nVertices)
-      call mpas_pool_get_dimension(state, 'nVertLevels', nVertLevels)
+      call mpas_pool_get_dimension(state, 'nVertices', nVertices_ptr)
+      call mpas_pool_get_dimension(state, 'nVertLevels', nVertLevels_ptr)
+
+      nVertices = nVertices_ptr
+      nVertLevels = nVertLevels_ptr
 
       allocate(ke_vertex(nVertLevels,nVertices+1))  ! ke_vertex is a module variable defined in mpas_atm_time_integration.F
-      ke_vertex(:,nVertices+1) = 0.0_RKIND
       allocate(ke_edge(nVertLevels,nEdges+1))       ! ke_edge is a module variable defined in mpas_atm_time_integration.F
-      ke_edge(:,nEdges+1) = 0.0_RKIND
+      !$acc parallel default(present)
+      !$acc loop vector
+      do k = 1, nVertLevels
+         ke_vertex(k,nVertices+1) = 0.0_RKIND
+         ke_edge(k,nEdges+1) = 0.0_RKIND
+      end do
+      !$acc end parallel
 
       call mpas_pool_get_dimension(block % dimensions, 'nThreads', nThreads)