From 5539588e49d14a3daaee623f02e007c2738576d6 Mon Sep 17 00:00:00 2001 From: Joseph Zhang Date: Fri, 13 Dec 2024 10:19:02 -0500 Subject: [PATCH] Added safety check to ensure bcast is done correctly under SH_MEM_COMM --- cmake/SCHISM.local.build | 2 +- src/Core/schism_msgp.F90 | 36 +++++++++++++++++++++++++----------- src/Readme.beta_notes | 2 +- 3 files changed, 27 insertions(+), 13 deletions(-) diff --git a/cmake/SCHISM.local.build b/cmake/SCHISM.local.build index 1af0d79b..63c31c2d 100644 --- a/cmake/SCHISM.local.build +++ b/cmake/SCHISM.local.build @@ -13,7 +13,7 @@ ##################################################################### #Leave this on set(BLD_STANDALONE ON CACHE BOOLEAN "SCHISM standalone") -set(SH_MEM_COMM OFF CACHE BOOLEAN "Use shared memory communicator") +set(SH_MEM_COMM ON CACHE BOOLEAN "Use shared memory communicator") #Default is NO_PARMETIS=OFF, i.e. use ParMETIS set(NO_PARMETIS OFF CACHE BOOLEAN "Turn off ParMETIS") diff --git a/src/Core/schism_msgp.F90 b/src/Core/schism_msgp.F90 index 14214485..047fb576 100644 --- a/src/Core/schism_msgp.F90 +++ b/src/Core/schism_msgp.F90 @@ -103,7 +103,7 @@ module schism_msgp TYPE(C_PTR),public,save :: c_window_ptr ! real(4),public,save,pointer :: ath3(:,:,:,:) integer,public,save :: disp_unit ! displacement stride in shared window -#endif ! SH_MEM_COMM +#endif /*SH_MEM_COMM*/ !----------------------------------------------------------------------------- ! Private data @@ -394,7 +394,7 @@ subroutine parallel_init(communicator) implicit none integer, optional :: communicator - integer :: comm2,nproc3,myrank3,nproc_compute + integer :: comm2,nproc3,myrank3,nproc_compute,itmp,itmp2 if (present(communicator)) then @@ -433,15 +433,6 @@ subroutine parallel_init(communicator) CALL MPI_Comm_size(comm2, nproc3, ierr) CALL MPI_Comm_rank(comm2, myrank3,ierr) -#ifdef SH_MEM_COMM -! Set up by-node shared-memory communicator for compute nodes - if(task_id==1) then - CALL MPI_Comm_split_type(comm2,MPI_COMM_TYPE_SHARED,0,MPI_INFO_NULL,comm_node,ierr) - CALL MPI_Comm_size(comm_node, nproc_node, ierr) - CALL MPI_Comm_rank(comm_node, myrank_node,ierr) - endif -#endif ! SH_MEM_COMM - if(task_id==1) then !compute comm=comm2 nproc=nproc3 @@ -454,6 +445,29 @@ subroutine parallel_init(communicator) ! print*, 'Scribes:',myrank_schism,nproc_schism,myrank3,nproc3,nproc_compute,nscribes,task_id endif +#ifdef SH_MEM_COMM +! Set up by-node shared-memory communicator for compute nodes + if(task_id==1) then + CALL MPI_Comm_split_type(comm,MPI_COMM_TYPE_SHARED,0,MPI_INFO_NULL,comm_node,ierr) + CALL MPI_Comm_size(comm_node, nproc_node, ierr) + CALL MPI_Comm_rank(comm_node, myrank_node,ierr) + + !Ensure that myrank_node=0 includes myrank=0 for some bcast (read in by + !myrank_node=0 and then bcast from myrank=0 in comm) + if(myrank_node==0.and.myrank==0) then + itmp=1 !true + else + itmp=0 + endif + !comm_node seems to cause error, maybe due to non-unique ranks? + call mpi_allreduce(itmp,itmp2,1,itype,MPI_SUM,comm,ierr) + if(itmp2==0) then + write(*,*)'Ranks:',myrank_node,myrank,itmp,itmp2 + call parallel_abort('MSGP: myrank=0 is not the head process in share mem') + endif + endif !task_id +#endif /*SH_MEM_COMM*/ + end subroutine parallel_init diff --git a/src/Readme.beta_notes b/src/Readme.beta_notes index d0751e23..9eb580dc 100644 --- a/src/Readme.beta_notes +++ b/src/Readme.beta_notes @@ -616,7 +616,7 @@ git versions: 161. `ca44ba0` (1 May 2024): fixed bugs in vegetation&marsh module (uninited sav_h0 etc) 162. `cfb58a5` [5 July 2024] replace fatal errors with warning in btrack (no intersecting edges). These are exaordinary exceptions that occur rarely. 163. `2f88fc5` (7 Aug 2024): changed horizontal diffusion method to filter; -164. `c441ace` (12 Dec 2024): added share memory communicator for efficient bcast; +164. `c441ace` (12 Dec 2024): added share memory communicator for efficient bcast for if_source/=0; ================================================================================================ (D) Auto-test history: