From 88272f024c1886e70c5a9b230a8add15c0f9de80 Mon Sep 17 00:00:00 2001 From: "G. Dylan Dickerson" Date: Thu, 6 Feb 2025 20:04:14 -0700 Subject: [PATCH 1/2] Prepare for OpenACC porting in mpas_reconstruct_2d Add nVertLevels, derefernce integer pointers to loop bounds so they transfer to the GPU correctly, and make loops in vertical dimension explicit for OpenACC parallel loop directives. Also ensure that, after initialization finishes, the invariant fields used in this routine will be on the device. --- .../dynamics/mpas_atm_time_integration.F | 24 +++++++ src/operators/mpas_vector_reconstruction.F | 65 +++++++++++++------ 2 files changed, 68 insertions(+), 21 deletions(-) diff --git a/src/core_atmosphere/dynamics/mpas_atm_time_integration.F b/src/core_atmosphere/dynamics/mpas_atm_time_integration.F index 5ea2ca1154..32a0fe212e 100644 --- a/src/core_atmosphere/dynamics/mpas_atm_time_integration.F +++ b/src/core_atmosphere/dynamics/mpas_atm_time_integration.F @@ -243,6 +243,9 @@ subroutine mpas_atm_dynamics_init(domain) real (kind=RKIND), dimension(:,:), pointer :: zxu real (kind=RKIND), dimension(:,:), pointer :: dss real (kind=RKIND), dimension(:), pointer :: specZoneMaskCell + real (kind=RKIND), dimension(:), pointer :: latCell + real (kind=RKIND), dimension(:), pointer :: lonCell + real (kind=RKIND), dimension(:,:,:), pointer :: coeffs_reconstruct #endif @@ -395,6 +398,15 @@ subroutine mpas_atm_dynamics_init(domain) call mpas_pool_get_array(mesh, 'specZoneMaskCell', specZoneMaskCell) !$acc enter data copyin(specZoneMaskCell) + + call mpas_pool_get_array(mesh, 'latCell', latCell) + !$acc enter data copyin(latCell) + + call mpas_pool_get_array(mesh, 'lonCell', lonCell) + !$acc enter data copyin(lonCell) + + call mpas_pool_get_array(mesh, 'coeffs_reconstruct', coeffs_reconstruct) + !$acc enter data copyin(coeffs_reconstruct) #endif end subroutine mpas_atm_dynamics_init @@ -474,6 +486,9 @@ subroutine mpas_atm_dynamics_finalize(domain) real (kind=RKIND), dimension(:,:), pointer :: zxu real (kind=RKIND), dimension(:,:), pointer :: dss real (kind=RKIND), dimension(:), pointer :: specZoneMaskCell + real (kind=RKIND), dimension(:), pointer :: latCell + real (kind=RKIND), dimension(:), pointer :: lonCell + real (kind=RKIND), dimension(:,:,:), pointer :: coeffs_reconstruct #endif @@ -626,6 +641,15 @@ subroutine mpas_atm_dynamics_finalize(domain) call mpas_pool_get_array(mesh, 'specZoneMaskCell', specZoneMaskCell) !$acc exit data delete(specZoneMaskCell) + + call mpas_pool_get_array(mesh, 'latCell', latCell) + !$acc exit data delete(latCell) + + call mpas_pool_get_array(mesh, 'lonCell', lonCell) + !$acc exit data delete(lonCell) + + call mpas_pool_get_array(mesh, 'coeffs_reconstruct', coeffs_reconstruct) + !$acc exit data delete(coeffs_reconstruct) #endif end subroutine mpas_atm_dynamics_finalize diff --git a/src/operators/mpas_vector_reconstruction.F b/src/operators/mpas_vector_reconstruction.F index 789ba50c1e..6772dbc8ae 100644 --- a/src/operators/mpas_vector_reconstruction.F +++ b/src/operators/mpas_vector_reconstruction.F @@ -24,6 +24,16 @@ module mpas_vector_reconstruction use mpas_rbf_interpolation use mpas_vector_operations +#ifdef MPAS_OPENACC + ! For use in regions ported with OpenACC to track in-function transfers + use mpas_timer, only : mpas_timer_start, mpas_timer_stop +#define MPAS_ACC_TIMER_START(X) call mpas_timer_start(X) +#define MPAS_ACC_TIMER_STOP(X) call mpas_timer_stop(X) +#else +#define MPAS_ACC_TIMER_START(X) +#define MPAS_ACC_TIMER_STOP(X) +#endif + implicit none public :: mpas_init_reconstruct, mpas_reconstruct @@ -207,10 +217,11 @@ subroutine mpas_reconstruct_2d(meshPool, u, uReconstructX, uReconstructY, uRecon ! temporary arrays needed in the compute procedure logical :: includeHalosLocal - integer, pointer :: nCells + integer, pointer :: nCells_ptr, nVertLevels_ptr + integer :: nCells, nVertLevels integer, dimension(:,:), pointer :: edgesOnCell integer, dimension(:), pointer :: nEdgesOnCell - integer :: iCell,iEdge, i + integer :: iCell,iEdge, i, k real(kind=RKIND), dimension(:), pointer :: latCell, lonCell real (kind=RKIND), dimension(:,:,:), pointer :: coeffs_reconstruct @@ -233,10 +244,14 @@ subroutine mpas_reconstruct_2d(meshPool, u, uReconstructX, uReconstructY, uRecon call mpas_pool_get_array(meshPool, 'edgesOnCell', edgesOnCell) if ( includeHalosLocal ) then - call mpas_pool_get_dimension(meshPool, 'nCells', nCells) + call mpas_pool_get_dimension(meshPool, 'nCells', nCells_ptr) else - call mpas_pool_get_dimension(meshPool, 'nCellsSolve', nCells) + call mpas_pool_get_dimension(meshPool, 'nCellsSolve', nCells_ptr) end if + call mpas_pool_get_dimension(meshPool, 'nVertLevels', nVertLevels_ptr) + ! Dereference scalar (single-value) pointers to ensure OpenACC copies the value pointed to implicitly + nCells = nCells_ptr + nVertLevels = nVertLevels_ptr call mpas_pool_get_array(meshPool, 'latCell', latCell) call mpas_pool_get_array(meshPool, 'lonCell', lonCell) @@ -247,20 +262,24 @@ subroutine mpas_reconstruct_2d(meshPool, u, uReconstructX, uReconstructY, uRecon !$omp do schedule(runtime) do iCell = 1, nCells ! initialize the reconstructed vectors - uReconstructX(:,iCell) = 0.0 - uReconstructY(:,iCell) = 0.0 - uReconstructZ(:,iCell) = 0.0 + do k = 1, nVertLevels + uReconstructX(k,iCell) = 0.0 + uReconstructY(k,iCell) = 0.0 + uReconstructZ(k,iCell) = 0.0 + end do ! a more efficient reconstruction where rbf_values*matrix_reconstruct ! has been precomputed in coeffs_reconstruct - do i=1,nEdgesOnCell(iCell) + do i = 1, nEdgesOnCell(iCell) iEdge = edgesOnCell(i,iCell) - uReconstructX(:,iCell) = uReconstructX(:,iCell) & - + coeffs_reconstruct(1,i,iCell) * u(:,iEdge) - uReconstructY(:,iCell) = uReconstructY(:,iCell) & - + coeffs_reconstruct(2,i,iCell) * u(:,iEdge) - uReconstructZ(:,iCell) = uReconstructZ(:,iCell) & - + coeffs_reconstruct(3,i,iCell) * u(:,iEdge) + do k = 1, nVertLevels + uReconstructX(k,iCell) = uReconstructX(k,iCell) & + + coeffs_reconstruct(1,i,iCell) * u(k,iEdge) + uReconstructY(k,iCell) = uReconstructY(k,iCell) & + + coeffs_reconstruct(2,i,iCell) * u(k,iEdge) + uReconstructZ(k,iCell) = uReconstructZ(k,iCell) & + + coeffs_reconstruct(3,i,iCell) * u(k,iEdge) + end do enddo enddo ! iCell @@ -275,18 +294,22 @@ subroutine mpas_reconstruct_2d(meshPool, u, uReconstructX, uReconstructY, uRecon slat = sin(latCell(iCell)) clon = cos(lonCell(iCell)) slon = sin(lonCell(iCell)) - uReconstructZonal(:,iCell) = -uReconstructX(:,iCell)*slon + & - uReconstructY(:,iCell)*clon - uReconstructMeridional(:,iCell) = -(uReconstructX(:,iCell)*clon & - + uReconstructY(:,iCell)*slon)*slat & - + uReconstructZ(:,iCell)*clat + do k = 1, nVertLevels + uReconstructZonal(k,iCell) = -uReconstructX(k,iCell)*slon + & + uReconstructY(k,iCell)*clon + uReconstructMeridional(k,iCell) = -(uReconstructX(k,iCell)*clon & + + uReconstructY(k,iCell)*slon)*slat & + + uReconstructZ(k,iCell)*clat + end do end do !$omp end do else !$omp do schedule(runtime) do iCell = 1, nCells - uReconstructZonal (:,iCell) = uReconstructX(:,iCell) - uReconstructMeridional(:,iCell) = uReconstructY(:,iCell) + do k = 1, nVertLevels + uReconstructZonal (k,iCell) = uReconstructX(k,iCell) + uReconstructMeridional(k,iCell) = uReconstructY(k,iCell) + end do end do !$omp end do end if From 607d8586741a0ff7c6fc0eac6432d9141fe54242 Mon Sep 17 00:00:00 2001 From: "G. Dylan Dickerson" Date: Thu, 13 Feb 2025 20:21:59 -0700 Subject: [PATCH 2/2] Initial OpenACC port of mpas_vector_reconstruct_2d Ensures the data needed for the mpas_reconstruct_2d routine has been fetched onto the device (GPU) at the beginning and end of the routine. The time for these transfers are captured in a new timer 'mpas_reconstruct_2d [ACC_data_xfer]'. This is enforced by the default(present) clauses. NOTE: coeffs_reconstruct, nEdgesOnCell, edgesOnCell, latCell, and lonCell are also fetched in mpas_reconstruct_2d. This is because this routine is called before these variables would be uploaded to the device during mpas_atm_dynamics_init as part of atmosphere_core initialization. The copyins will not execute anymore once the model starts timestepping and the OpenACC runtime sees the variables are present on the device. --- src/operators/mpas_vector_reconstruction.F | 32 ++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/src/operators/mpas_vector_reconstruction.F b/src/operators/mpas_vector_reconstruction.F index 6772dbc8ae..605da9cd6d 100644 --- a/src/operators/mpas_vector_reconstruction.F +++ b/src/operators/mpas_vector_reconstruction.F @@ -258,10 +258,23 @@ subroutine mpas_reconstruct_2d(meshPool, u, uReconstructX, uReconstructY, uRecon call mpas_pool_get_config(meshPool, 'on_a_sphere', on_a_sphere) + MPAS_ACC_TIMER_START('mpas_reconstruct_2d [ACC_data_xfer]') + ! Only use sections needed, nCells may be all cells or only non-halo cells + !$acc enter data copyin(coeffs_reconstruct(:,:,1:nCells),nEdgesOnCell(1:nCells), & + !$acc edgesOnCell(:,1:nCells),latCell(1:nCells),lonCell(1:nCells)) + !$acc enter data copyin(u(:,:)) + !$acc enter data create(uReconstructX(:,1:nCells),uReconstructY(:,1:nCells), & + !$acc uReconstructZ(:,1:nCells),uReconstructZonal(:,1:nCells), & + !$acc uReconstructMeridional(:,1:nCells)) + MPAS_ACC_TIMER_STOP('mpas_reconstruct_2d [ACC_data_xfer]') + ! loop over cell centers !$omp do schedule(runtime) + !$acc parallel default(present) + !$acc loop gang do iCell = 1, nCells ! initialize the reconstructed vectors + !$acc loop vector do k = 1, nVertLevels uReconstructX(k,iCell) = 0.0 uReconstructY(k,iCell) = 0.0 @@ -270,8 +283,10 @@ subroutine mpas_reconstruct_2d(meshPool, u, uReconstructX, uReconstructY, uRecon ! a more efficient reconstruction where rbf_values*matrix_reconstruct ! has been precomputed in coeffs_reconstruct + !$acc loop seq do i = 1, nEdgesOnCell(iCell) iEdge = edgesOnCell(i,iCell) + !$acc loop vector do k = 1, nVertLevels uReconstructX(k,iCell) = uReconstructX(k,iCell) & + coeffs_reconstruct(1,i,iCell) * u(k,iEdge) @@ -283,17 +298,21 @@ subroutine mpas_reconstruct_2d(meshPool, u, uReconstructX, uReconstructY, uRecon enddo enddo ! iCell + !$acc end parallel !$omp end do call mpas_threading_barrier() if (on_a_sphere) then !$omp do schedule(runtime) + !$acc parallel default(present) + !$acc loop gang do iCell = 1, nCells clat = cos(latCell(iCell)) slat = sin(latCell(iCell)) clon = cos(lonCell(iCell)) slon = sin(lonCell(iCell)) + !$acc loop vector do k = 1, nVertLevels uReconstructZonal(k,iCell) = -uReconstructX(k,iCell)*slon + & uReconstructY(k,iCell)*clon @@ -302,18 +321,31 @@ subroutine mpas_reconstruct_2d(meshPool, u, uReconstructX, uReconstructY, uRecon + uReconstructZ(k,iCell)*clat end do end do + !$acc end parallel !$omp end do else !$omp do schedule(runtime) + !$acc parallel default(present) + !$acc loop gang vector collapse(2) do iCell = 1, nCells do k = 1, nVertLevels uReconstructZonal (k,iCell) = uReconstructX(k,iCell) uReconstructMeridional(k,iCell) = uReconstructY(k,iCell) end do end do + !$acc end parallel !$omp end do end if + MPAS_ACC_TIMER_START('mpas_reconstruct_2d [ACC_data_xfer]') + !$acc exit data delete(coeffs_reconstruct(:,:,1:nCells),nEdgesOnCell(1:nCells), & + !$acc edgesOnCell(:,1:nCells),latCell(1:nCells),lonCell(1:nCells)) + !$acc exit data delete(u(:,:)) + !$acc exit data copyout(uReconstructX(:,1:nCells),uReconstructY(:,1:nCells), & + !$acc uReconstructZ(:,1:nCells), uReconstructZonal(:,1:nCells), & + !$acc uReconstructMeridional(:,1:nCells)) + MPAS_ACC_TIMER_STOP('mpas_reconstruct_2d [ACC_data_xfer]') + end subroutine mpas_reconstruct_2d!}}}