subroutine store_whole_core_space(rep)
use FciMCData, only: CoreSpaceTag
use MemoryManager, only: LogMemAlloc
use Parallel_neci, only: MPIAllGatherV, iProcIndex_inter, iProcIndex_intra, &
mpi_comm_inter, mpi_comm_intra, nNodes, NodeLengths, &
iNodeIndex, MPIAllGather
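! Collect the complete core space, currently distributed across all processes,
! into rep%core_space, which lives in an MPI shared-memory window so that only
! one copy is held per node. Each process contributes the determinants stored
! in SpawnedParts(:, 1:rep%determ_sizes(iProcIndex)).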
type(core_space_t), intent(inout) :: rep
integer(MPIArg) :: MPIerr
integer :: ierr
character(len=*), parameter :: t_r = "store_whole_core_space"
integer(MPIArg), allocatable :: sizes_this_node(:)
integer(MPIArg) :: total_size_this_node
integer(MPIArg), allocatable :: node_offsets(:), sizes_per_node(:)
integer(MPIArg) :: proc_offset
integer(MPIArg) :: core_width
integer :: i
integer(MPIArg) :: node_size, num_nodes
integer(MPIArg) :: global_offset
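! Determine the number of processes on this node and the number of nodes.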
call mpi_comm_size(mpi_comm_intra, node_size, MPIerr)
call mpi_comm_size(mpi_comm_inter, num_nodes, MPIerr)
allocate(sizes_this_node(0:node_size - 1))
allocate(node_offsets(0:num_nodes - 1))
allocate(sizes_per_node(0:num_nodes - 1))
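! Allocate storage for the full core space, of shape (1+NIfTot, determ_space_size),
! in an MPI shared-memory window so that all processes on a node share one copy.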
call shared_allocate_mpi(rep%core_space_win, rep%core_space_direct, &
[int(1 + NIfTot, int64), int(rep%determ_space_size, int64)], ierr)
! Convert from 1-based first dimension to 0-based first dimension as used in iluts
rep%core_space(0:, 1:) => rep%core_space_direct(1:, 1:)
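! Log the allocation with the memory manager.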
call LogMemAlloc('core_space', maxval(rep%determ_sizes) * (NIfTot + 1), 8, t_r, &
rep%CoreSpaceTag, err=ierr)
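! Only one process per node needs to zero the shared array.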
if (iProcIndex_intra == 0) rep%core_space = 0_n_int
! Write the core space held on this node into core_space.
! First, gather the number of core determinants held by each process on this node.
call MPI_AllGather(rep%determ_sizes(iProcIndex), 1, MPI_INTEGER, &
sizes_this_node, 1, MPI_INTEGER, mpi_comm_intra, MPIerr)
! Get the intra-node offset
proc_offset = 0
do i = 1, iProcIndex_intra
proc_offset = proc_offset + sizes_this_node(i - 1)
end do
! Sum the size on this node
total_size_this_node = sum(sizes_this_node)
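! Gather the per-node totals across all nodes.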
call MPI_AllGather(total_size_this_node, 1, MPI_INTEGER, sizes_per_node, 1, MPI_INTEGER, &
mpi_comm_inter, MPIerr)
! Get the inter-node offset
node_offsets(0) = 0
do i = 1, num_nodes - 1
node_offsets(i) = node_offsets(i - 1) + sizes_per_node(i - 1)
end do
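! Starting position of this process's section in the full core space.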
global_offset = node_offsets(iProcIndex_inter) + proc_offset
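! Synchronise the shared window across the node before each process writes its slice.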
call MPI_Win_Sync(rep%core_space_win, MPIerr)
call MPI_Barrier(mpi_comm_intra, MPIerr)
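! Copy this process's core determinants into its slice of the shared array.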
rep%core_space(0:NIfTot, (global_offset + 1): &
(global_offset + rep%determ_sizes(iProcIndex))) = &
SpawnedParts(0:NIfTot, 1:rep%determ_sizes(iProcIndex))
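! Make the intra-node writes visible to all processes before the inter-node gather.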
call MPI_Win_Sync(rep%core_space_win, MPIerr)
call MPI_Barrier(mpi_comm_intra, MPIerr)
call MPI_Win_Sync(rep%core_space_win, MPIerr)
! Multiply by the message width (1 + NIfTot) to convert determinant counts into element counts
core_width = int(size(rep%core_space, dim=1), MPIArg)
node_offsets = node_offsets * core_width
total_size_this_node = total_size_this_node * core_width
sizes_per_node = sizes_per_node * core_width
! Give explicit limits for the SpawnedParts slice, as NIfTot is not
! necessarily equal to NIfBCast (it may be longer).
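! Gather the core space between nodes; MPI_IN_PLACE indicates that this node's
! section is already stored at its own displacement in rep%core_space.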
call MPI_AllGatherV(MPI_IN_PLACE, total_size_this_node, MPI_INTEGER8, rep%core_space, &
sizes_per_node, node_offsets, MPI_INTEGER8, mpi_comm_inter, MPIerr)
! And sync the shared window
call MPI_Win_Sync(rep%core_space_win, MPIerr)
call MPI_Barrier(mpi_comm_intra, MPIerr)
! Communicate the index in the full vector at which each processor's section
! begins, relative to the start of the vector (i.e. the displs array used in
! MPI routines).
call MPIAllGather(global_offset, rep%determ_displs, ierr)
deallocate(sizes_per_node)
deallocate(node_offsets)
deallocate(sizes_this_node)
end subroutine store_whole_core_space
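
! Minimal usage sketch (illustrative only; the surrounding calling code is an
! assumption, not taken from this module): each process first places its local
! core determinants in SpawnedParts(:, 1:rep%determ_sizes(iProcIndex)), then all
! processes call
!     call store_whole_core_space(rep)
! after which rep%core_space holds the complete core space on every node and
! rep%determ_displs gives each processor's starting offset within it.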