From 3f350c9f4c50077f7fdd29f36ab380ca3de314fe Mon Sep 17 00:00:00 2001 From: Mike Wall Date: Tue, 10 Mar 2026 11:59:22 -0600 Subject: [PATCH 1/5] Tidy up graph update before offload work --- examples/gpmdk/src/gpmdcov_part.F90 | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/examples/gpmdk/src/gpmdcov_part.F90 b/examples/gpmdk/src/gpmdcov_part.F90 index 1c54707..67c66f7 100644 --- a/examples/gpmdk/src/gpmdcov_part.F90 +++ b/examples/gpmdk/src/gpmdcov_part.F90 @@ -16,11 +16,11 @@ subroutine gpmdcov_Part(ipreMD) use gpmdcov_allocation_mod implicit none integer, allocatable :: graph_h(:,:) - integer, allocatable, save :: graph_p(:,:), graph_p_flat(:) + integer, allocatable, save :: graph_p(:,:) integer, allocatable, save :: graph_p_old(:,:) integer, allocatable, save :: G_added(:,:), G_removed(:,:) integer, allocatable, save :: N_added(:), N_removed(:), NNZ1(:), NNZ2(:), NNZ_updated(:) - logical, allocatable, save :: v(:), v_check(:) + logical, allocatable, save :: v(:) integer :: n_atoms, max_updates, k, ktot_a, ktot_r real(dp) :: mls_ii real, allocatable :: onesMat(:,:) @@ -69,7 +69,6 @@ subroutine gpmdcov_Part(ipreMD) allocate(NNZ2(n_atoms)) allocate(NNZ_updated(n_atoms)) allocate(v(n_atoms)) - allocate(v_check(n_atoms)) endif #endif !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! @@ -117,7 +116,6 @@ subroutine gpmdcov_Part(ipreMD) mls_i = mls() ! call gpmdcov_mat2VectInt(graph_p,auxVectInt,sy%nats,myMdim) - if(.not.allocated(graph_p_flat))allocate(graph_p_flat(myMdim*sy%nats)) parteach_offset = 0 if(gsp2%partition_type=="Box")then parteach_offset = 1 @@ -126,9 +124,6 @@ subroutine gpmdcov_Part(ipreMD) if (getNRanks() > 1) then call prg_barrierParallel if((gsp2%parteach == 1) .or. (mod(mdstep,gsp2%parteach)==parteach_offset) .or. 
(mdstep <= 1))then - !graph_p_flat = RESHAPE(graph_p,shape(graph_p_flat)) - !call prg_sumIntReduceN(graph_p_flat, size(graph_p_flat)) - !graph_p = RESHAPE(graph_p_flat,shape(graph_p)) write(*,*)"DEBUG: Doing full graph reduction at mdstep ",mdstep if(any(graph_p.gt.sy%nats))then write(*,*)"DEBUG: GPMDCOV_PART before reduction: graph_p has elems > nats" From a751440f7f02adee3f125b19e49546c39faec4cc Mon Sep 17 00:00:00 2001 From: Mike Wall Date: Tue, 10 Mar 2026 12:37:42 -0600 Subject: [PATCH 2/5] Numerically correct offloaded graph update --- examples/gpmdk/src/gpmdcov_part.F90 | 88 +++++++++++++++++++++++++---- 1 file changed, 78 insertions(+), 10 deletions(-) diff --git a/examples/gpmdk/src/gpmdcov_part.F90 b/examples/gpmdk/src/gpmdcov_part.F90 index 67c66f7..6709fe2 100644 --- a/examples/gpmdk/src/gpmdcov_part.F90 +++ b/examples/gpmdk/src/gpmdcov_part.F90 @@ -20,7 +20,6 @@ subroutine gpmdcov_Part(ipreMD) integer, allocatable, save :: graph_p_old(:,:) integer, allocatable, save :: G_added(:,:), G_removed(:,:) integer, allocatable, save :: N_added(:), N_removed(:), NNZ1(:), NNZ2(:), NNZ_updated(:) - logical, allocatable, save :: v(:) integer :: n_atoms, max_updates, k, ktot_a, ktot_r real(dp) :: mls_ii real, allocatable :: onesMat(:,:) @@ -31,7 +30,10 @@ subroutine gpmdcov_Part(ipreMD) integer :: coreHaloP1, coreP1 integer :: myMdim,parteach_offset logical :: check_chi,check_graph,graphs_end - + logical, allocatable, save :: v(:) +#ifdef USE_OFFLOAD + logical, allocatable, save :: vacc(:,:) +#endif if(gsp2%mdim < 0)then myMdim = sy%nats elseif(gsp2%mdim > sy%nats)then @@ -54,7 +56,6 @@ subroutine gpmdcov_Part(ipreMD) #ifdef DO_MPI n_atoms = sy%nats - max_updates = gpmdt%max_updates if(.not.allocated(graph_p_old))then allocate(graph_p(myMdim,n_atoms)) @@ -69,6 +70,12 @@ subroutine gpmdcov_Part(ipreMD) allocate(NNZ2(n_atoms)) allocate(NNZ_updated(n_atoms)) allocate(v(n_atoms)) +#ifdef USE_OFFLOAD + allocate(vacc(n_atoms,n_atoms)) + !$acc enter data copyin(graph_p(:,:),graph_p_old(:,:)) & + !$acc create(G_added(:,:),G_removed(:,:),N_added(:),N_removed(:)) & + !$acc create(NNZ1(:),NNZ2(:),NNZ_updated(:),vacc(:,:)) +#endif endif #endif !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! @@ -133,19 +140,28 @@ subroutine gpmdcov_Part(ipreMD) write(*,*)"DEBUG: GPMDCOV_PART after reduction: graph_p has elems > nats" endif graph_p_old = graph_p + NNZ1 = count(graph_p_old.ne.0,DIM=1) +#ifdef USE_OFFLOAD + !$acc update device(graph_p_old(:,:),NNZ1(:)) +#endif else #ifdef USE_NVTX call gpmdStartRange("Fast graph update",2) #endif write(*,*)"DEBUG: Doing graph update reduction at mdstep ",mdstep - - ktot_a = 0 NNZ1 = count(graph_p_old.ne.0,DIM=1) NNZ2 = count(graph_p.ne.0,DIM=1) + +#ifdef USE_OFFLOAD + !$acc update device(NNZ1(:)) +#endif + + !Added edges + ktot_a = 0 !$omp parallel do do i = 1,n_atoms - G_added(:,i) = 0 + G_added(:,i) = 0 enddo N_added = 0 v = .false. @@ -176,8 +192,6 @@ subroutine gpmdcov_Part(ipreMD) do j = 1,NNZ2(i) v(graph_p(j,i)) = .false. end do - ! v(graph_p_old(1:NNZ1(i),i)) = .false. - ! v(graph_p(1:NNZ2(i),i)) = .false. end do enddo ! Removed edges @@ -215,8 +229,6 @@ subroutine gpmdcov_Part(ipreMD) do j = 1,NNZ2(i) v(graph_p(j,i)) = .false. enddo - ! v(graph_p_old(1:NNZ1(i),i)) = .false. - ! v(graph_p(1:NNZ2(i),i)) = .false. end do enddo ! % Check NNZ_Updated: NNZ_Updated = NNZ1 + N_Added - N_Removed @@ -232,6 +244,60 @@ subroutine gpmdcov_Part(ipreMD) ! 
%% Use G_removed and G_added to update from G1 to G2 call prg_sumIntReduceN(G_added,n_atoms*max_updates) call prg_sumIntReduceN(G_removed,n_atoms*max_updates) +#ifdef USE_OFFLOAD + NNZ_updated = 0 + !$acc update device(G_added(:,:),G_removed(:,:)) & + !$acc device(N_added(:),N_removed(:),NNZ_updated(:)) & + !$acc device(NNZ2(:)) + + !$acc parallel loop gang present(vacc) + do i = 1,n_atoms + !$acc loop vector + do j = 1,n_atoms + vacc(j,i) = .false. + enddo + enddo + + !$acc parallel loop gang & + !$acc private(i,j,k) & + !$acc present(vacc) & + !$acc present(G_added,G_removed,N_added,N_removed) & + !$acc present(graph_p_old,graph_p,NNZ1,NNZ2,NNZ_updated) + do i = 1,n_atoms + !$acc loop vector + do j = 1,NNZ1(i) + vacc(graph_p_old(j,i),i) = .true. + end do + !$acc loop vector + do j = 1,N_removed(i) + vacc(G_removed(j,i),i) = .false. ! % Remove edges + end do + k = 0 +!!$omp loop reduction(+:k) + !$acc loop vector + do j = 1,mymdim + graph_p(j,i) = 0 + enddo + do j = 1,NNZ1(i) + if (vacc(graph_p_old(j,i),i) .eqv. .true.)then ! % Account only for the remaining edges + k = k + 1; + graph_p(k,i) = graph_p_old(j,i); + end if + end do + NNZ_updated(i) = k + N_added(i) + !$acc loop vector + do j = k+1,NNZ_updated(i) + graph_p(j,i) = G_added(j-k,i) ! Add new edges at the end + end do + k = max(NNZ1(i),NNZ2(i)) + !NNZ1(i) = k + !$acc loop vector + do j = 1,k + graph_p_old(j,i) = graph_p(j,i) + enddo + end do + !$acc update self(graph_p(:,:),graph_p_old(:,:),NNZ_updated(:)) +#else NNZ_updated = 0 v = .false. !$omp parallel do & @@ -267,12 +333,14 @@ subroutine gpmdcov_Part(ipreMD) graph_p(j,i) = G_added(j-k,i) ! Add new edges at the end end do k = max(NNZ1(i),NNZ2(i)) + !NNZ1(i) = NNZ2(i) !$omp loop do j = 1,k graph_p_old(j,i) = graph_p(j,i) enddo end do !$omp end parallel do +#endif else write(*,*)"GPMDCOV_PART: WARNING: Number of changes exceeds max_updates. System might be unstable. Doing full reduction." call prg_sumIntReduceN(graph_p, myMdim*sy%nats) From 9f5ca04715d17e85bb371cfff07dda574c26a214 Mon Sep 17 00:00:00 2001 From: Mike Wall Date: Tue, 10 Mar 2026 22:42:35 -0600 Subject: [PATCH 3/5] A bit more threading --- examples/gpmdk/src/gpmdcov_part.F90 | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/examples/gpmdk/src/gpmdcov_part.F90 b/examples/gpmdk/src/gpmdcov_part.F90 index 6709fe2..8d76fcc 100644 --- a/examples/gpmdk/src/gpmdcov_part.F90 +++ b/examples/gpmdk/src/gpmdcov_part.F90 @@ -150,8 +150,13 @@ subroutine gpmdcov_Part(ipreMD) #endif write(*,*)"DEBUG: Doing graph update reduction at mdstep ",mdstep - NNZ1 = count(graph_p_old.ne.0,DIM=1) - NNZ2 = count(graph_p.ne.0,DIM=1) + !$omp parallel do shared(graph_p_old,graph_p,n_atoms) + do i =1,n_atoms + NNZ1(i) = count(graph_p_old(:,i).ne.0) + NNZ2(i) = count(graph_p(:,i).ne.0) + enddo + ! NNZ1 = count(graph_p_old.ne.0,DIM=1) + ! NNZ2 = count(graph_p.ne.0,DIM=1) #ifdef USE_OFFLOAD !$acc update device(NNZ1(:)) @@ -290,7 +295,7 @@ subroutine gpmdcov_Part(ipreMD) graph_p(j,i) = G_added(j-k,i) ! 
Add new edges at the end end do k = max(NNZ1(i),NNZ2(i)) - !NNZ1(i) = k + !NNZ1(i) = NNZ2(i) !$acc loop vector do j = 1,k graph_p_old(j,i) = graph_p(j,i) From b908699aa5fd01560e32d02178a8915676147066 Mon Sep 17 00:00:00 2001 From: Mike Wall Date: Sun, 15 Mar 2026 17:58:12 -0600 Subject: [PATCH 4/5] Revert to calling prg_get_hscf_v2 (not offloaded) in the rankN update --- examples/gpmdk/src/gpmdcov_kernel.F90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/gpmdk/src/gpmdcov_kernel.F90 b/examples/gpmdk/src/gpmdcov_kernel.F90 index 7d88e92..b1c4b04 100644 --- a/examples/gpmdk/src/gpmdcov_kernel.F90 +++ b/examples/gpmdk/src/gpmdcov_kernel.F90 @@ -1149,7 +1149,7 @@ subroutine gpmdcov_rankN_update_byParts(myqn,myn,mysyprt,mysyprtk,maxRanks,KK0Re call gpmdStartRange("prg_get_hscf",7) #endif !ptaux_bml corresponds to H0 which is 0 in this case. - call prg_get_hscf(ptaux_bml,mysyprt(ipt)%estr%over,ptham_bml,mysyprt(ipt)%spindex,& + call prg_get_hscf_v2(ptaux_bml,mysyprt(ipt)%estr%over,ptham_bml,mysyprt(ipt)%spindex,& mysyprt(ipt)%estr%hindex,tb%hubbardu,ptnet_charge,& ptcoul_pot_r,ptcoul_pot_k,norbs,lt%threshold) #ifdef USE_NVTX From 81c3e1db2e7a432a60b989377331addd5c070590 Mon Sep 17 00:00:00 2001 From: Mike Wall Date: Sun, 15 Mar 2026 19:58:45 -0600 Subject: [PATCH 5/5] Restore prg_get_hscf for offload build --- examples/gpmdk/src/gpmdcov_kernel.F90 | 6 ++++++ examples/gpmdk/src/gpmdcov_writeout.F90 | 2 -- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/examples/gpmdk/src/gpmdcov_kernel.F90 b/examples/gpmdk/src/gpmdcov_kernel.F90 index b1c4b04..f53f0bd 100644 --- a/examples/gpmdk/src/gpmdcov_kernel.F90 +++ b/examples/gpmdk/src/gpmdcov_kernel.F90 @@ -1149,9 +1149,15 @@ subroutine gpmdcov_rankN_update_byParts(myqn,myn,mysyprt,mysyprtk,maxRanks,KK0Re call gpmdStartRange("prg_get_hscf",7) #endif !ptaux_bml corresponds to H0 which is 0 in this case. +#ifdef USE_OFFLOAD + call prg_get_hscf(ptaux_bml,mysyprt(ipt)%estr%over,ptham_bml,mysyprt(ipt)%spindex,& + mysyprt(ipt)%estr%hindex,tb%hubbardu,ptnet_charge,& + ptcoul_pot_r,ptcoul_pot_k,norbs,lt%threshold) +#else call prg_get_hscf_v2(ptaux_bml,mysyprt(ipt)%estr%over,ptham_bml,mysyprt(ipt)%spindex,& mysyprt(ipt)%estr%hindex,tb%hubbardu,ptnet_charge,& ptcoul_pot_r,ptcoul_pot_k,norbs,lt%threshold) +#endif #ifdef USE_NVTX call gpmdEndRange #endif diff --git a/examples/gpmdk/src/gpmdcov_writeout.F90 b/examples/gpmdk/src/gpmdcov_writeout.F90 index 4a11bb9..bacb8b3 100644 --- a/examples/gpmdk/src/gpmdcov_writeout.F90 +++ b/examples/gpmdk/src/gpmdcov_writeout.F90 @@ -281,8 +281,6 @@ end function cudaMemGetInfo endif endif endif -#else - print *, message//": No GPU memory report available without USE_NVTX build" #endif end subroutine gpmdcov_msMemGPU
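
For reference, below is a minimal, self-contained sketch of the incremental graph update that patches 1-3 move toward offload. It is not code from the patches: the names old_list, new_list, added, removed, mask and the neighbor values are placeholders chosen for illustration; in gpmdcov_Part the same steps run column-by-column over graph_p_old/graph_p, the per-atom added and removed edge lists (G_added/G_removed) are what gets summed across ranks with prg_sumIntReduceN, and the full graph is only reduced every gsp2%parteach steps.

program graph_update_sketch
  implicit none
  integer, parameter :: mdim = 8          ! max neighbors per atom (myMdim in the routine)
  integer :: old_list(mdim), new_list(mdim), rebuilt(mdim)
  integer :: added(mdim), removed(mdim)
  logical :: mask(16)                     ! scratch per-vertex flags (v / vacc in the routine)
  integer :: n_old, n_new, n_add, n_rem, j, k

  ! Old and new neighbor lists for one atom, zero-padded like graph_p_old/graph_p columns.
  old_list = 0; new_list = 0
  old_list(1:4) = (/ 2, 5, 7, 9 /)
  new_list(1:4) = (/ 2, 5, 9, 11 /)
  n_old = count(old_list /= 0)            ! NNZ1(i) in the routine
  n_new = count(new_list /= 0)            ! NNZ2(i) in the routine

  ! Added edges: present in the new list but not in the old one.
  mask = .false.
  mask(old_list(1:n_old)) = .true.
  n_add = 0
  do j = 1, n_new
    if (.not. mask(new_list(j))) then
      n_add = n_add + 1
      added(n_add) = new_list(j)
    end if
  end do
  mask(old_list(1:n_old)) = .false.

  ! Removed edges: present in the old list but not in the new one.
  mask(new_list(1:n_new)) = .true.
  n_rem = 0
  do j = 1, n_old
    if (.not. mask(old_list(j))) then
      n_rem = n_rem + 1
      removed(n_rem) = old_list(j)
    end if
  end do
  mask(new_list(1:n_new)) = .false.

  ! (At this point the real code MPI-reduces the added/removed lists; only the
  !  small update arrays travel over the network, not the whole graph.)

  ! Apply the update: keep surviving old edges, then append the added ones.
  mask(old_list(1:n_old)) = .true.
  mask(removed(1:n_rem))  = .false.
  rebuilt = 0
  k = 0
  do j = 1, n_old
    if (mask(old_list(j))) then
      k = k + 1
      rebuilt(k) = old_list(j)
    end if
  end do
  rebuilt(k+1:k+n_add) = added(1:n_add)   ! NNZ_updated(i) = k + N_added(i)

  write(*,'(a,8i4)') 'updated list:', rebuilt   ! expect: 2 5 9 11 0 0 0 0
end program graph_update_sketch

On the USE_OFFLOAD path introduced in patch 2, the same per-atom diff-and-apply runs inside !$acc parallel loops: the arrays live on the device after the enter data directive, only the reduced G_added/G_removed and the counters are pushed with update device, and the 1D scratch mask v(:) is replaced by a per-atom column of vacc(:,:) so that gangs working on different atoms do not race on a shared flag array.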