subroutine generate_space_most_populated(target_space_size, tApproxSpace, &
nApproxSpace, ilut_list, space_size, run, opt_source, opt_source_size, t_opt_fast_core)
! In: target_space_size - The number of determinants to attempt to keep
! from if less determinants are present then use all of them.
! In: tApproxSpace - If true then only find *approximately* the best
! space, to save memory (although in many cases it will end up
! finding the best space).
! In: nApproxSpace - factor that defines how many states are kept on each
! process if tApproxSpace is true, 1 =< nApproxSpace =< nProcessors.
! The larger nApproxSpace, the more memory is consumed and the slower
! (but more accurate) the semi-stochastic initialisation is.
! In/Out: ilut_list - List of determinants generated.
! In/Out: space_size - Number of determinants in the generated space.
! If ilut_list is not empty on input and you want to keep
! the states already in it, then on input space_size should
! be equal to the number of states to be kept in ilut_list,
! and new states will be added in from space_size+1.
! Otherwise, space_size must equal 0 on input.
! On output space_size will equal the total number of
! generated plus what space_size was on input.
integer, intent(in) :: target_space_size, nApproxSpace
integer(n_int), intent(in), optional :: opt_source_size
integer(n_int), intent(in), optional :: opt_source(0:, :)
logical, intent(in) :: tApproxSpace
integer(n_int), intent(inout) :: ilut_list(0:, :)
integer, intent(inout) :: space_size
integer, intent(in) :: run
logical, intent(in), optional :: t_opt_fast_core
real(dp), allocatable, dimension(:) :: amps_this_proc, amps_all_procs
real(dp) :: real_sign(lenof_sign)
integer(MPIArg) :: length_this_proc, total_length
integer(MPIArg) :: lengths(0:nProcessors - 1), disps(0:nProcessors - 1)
integer(n_int) :: temp_ilut(0:NIfTot)
real(dp) :: core_sum, all_core_sum
integer(n_int), dimension(:, :), allocatable :: largest_states
integer, allocatable, dimension(:) :: indices_to_keep
integer :: j, ierr, ind, n_pops_keep, n_states_this_proc
integer(int64) :: i
integer(TagIntType) :: TagA, TagB, TagC, TagD
character(len=*), parameter :: this_routine = "generate_space_most_populated"
integer :: nzero_dets
real(dp) :: max_vals(0:nProcessors - 1), min_vals(0:nProcessors - 1), max_sign, min_sign
integer(int64) :: source_size
integer(n_int), allocatable :: loc_source(:, :)
integer(MPIARg) :: max_size
logical :: t_use_fast_pops_core
if (present(opt_source)) then
ASSERT(present(opt_source_size))
source_size = int(opt_source_size, int64)
allocate(loc_source(0:niftot, 1:source_size), &
source=opt_source(0:niftot, 1:source_size))
! loc_source => opt_source
else
source_size = TotWalkers
allocate(loc_source(0:niftot, 1:source_size), &
source=CurrentDets(0:niftot, 1:source_size))
! loc_source => CurrentDets
end if
def_default(t_use_fast_pops_core, t_opt_fast_core, .false.)
n_pops_keep = target_space_size
! Quickly loop through and find the number of determinants with
! zero sign.
nzero_dets = 0
do i = 1, source_size
call extract_sign(loc_source(:, i), real_sign)
if (core_space_weight(real_sign, run) < 1.e-8_dp) &
nzero_dets = nzero_dets + 1
end do
if (tApproxSpace .and. nProcessors > nApproxSpace) then
! Look at nApproxSpace times the number of states that we expect to keep on
! this process. This is done instead of sending the best
! target_space_size states to all processes, which is often
! overkill and uses up too much memory.
max_size = ceiling(real(nApproxSpace * n_pops_keep) / real(nProcessors), MPIArg)
else
max_size = int(n_pops_keep, MPIArg)
end if
length_this_proc = min(max_size, int(source_size - nzero_dets, MPIArg))
call MPIAllGather(length_this_proc, lengths, ierr)
total_length = sum(lengths)
if (total_length < n_pops_keep) then
call warning_neci(this_routine, "The number of states in the walker list is less &
&than the number you requested. All states &
&will be used.")
n_pops_keep = total_length
end if
! Allocate necessary arrays and log the memory used.
allocate(amps_this_proc(length_this_proc), stat=ierr)
call LogMemAlloc("amps_this_proc", int(length_this_proc), 8, this_routine, TagA, ierr)
allocate(amps_all_procs(total_length), stat=ierr)
call LogMemAlloc("amps_all_procs", int(total_length), 8, this_routine, TagB, ierr)
allocate(indices_to_keep(n_pops_keep), stat=ierr)
call LogMemAlloc("indices_to_keep", n_pops_keep, sizeof_int, this_routine, TagC, ierr)
allocate(largest_states(0:NIfTot, length_this_proc), stat=ierr)
call LogMemAlloc("largest_states", int(length_this_proc) * (NIfTot + 1), &
size_n_int, this_routine, TagD, ierr)
disps(0) = 0_MPIArg
do i = 1, nProcessors - 1
disps(i) = disps(i - 1) + lengths(i - 1)
end do
! Return the most populated states in source on *this* processor.
call proc_most_populated_states(int(length_this_proc), run, largest_states)
do i = 1, length_this_proc
! Store the real amplitudes in their real form.
call extract_sign(largest_states(:, i), real_sign)
! We are interested in the absolute values of the ampltiudes.
amps_this_proc(i) = core_space_weight(real_sign, run)
end do
if (t_use_fast_pops_core) then
if (length_this_proc > 0) then
min_sign = amps_this_proc(1)
max_sign = amps_this_proc(ubound(amps_this_proc, dim=1))
else
min_sign = 0.0
max_sign = 0.0
end if
! Make the max/min values available on all procs
call MPIAllGather(min_sign, min_vals, ierr)
call MPIAllGather(max_sign, max_vals, ierr)
call return_proc_share(n_pops_keep, min_vals, max_vals, int(lengths), &
amps_this_proc, n_states_this_proc)
else
! Now we want to combine all the most populated states from each processor to find
! how many states to keep from each processor.
! Take the top length_this_proc states from each processor.
call MPIAllGatherV(amps_this_proc(1:length_this_proc), amps_all_procs(1:total_length), lengths, disps)
! This routine returns indices_to_keep, which will store the indices in amps_all_procs
! of those amplitudes which are among the n_pops_keep largest (but not sorted).
call return_largest_indices(n_pops_keep, int(total_length), amps_all_procs, indices_to_keep)
n_states_this_proc = 0
do i = 1, n_pops_keep
ind = indices_to_keep(i)
! Find the processor label for this state. The states of the ampltidues in
! amps_all_procs are together in blocks, in order of the corresponding processor
! label. Hence, if a state has index <= lengths(0) then it is on processor 0.
do j = 0, nProcessors - 1
ind = ind - lengths(j)
if (ind <= 0) then
! j now gives the processor label. If the state is on *this* processor,
! update n_states_this_proc.
if (j == iProcIndex) n_states_this_proc = n_states_this_proc + 1
exit
end if
end do
end do
end if
! Add the states to the ilut_list array.
temp_ilut = 0_n_int
core_sum = 0.0_dp
do i = 1, n_states_this_proc
! The states in largest_states are sorted from smallest to largest.
temp_ilut(0:NIfTot) = largest_states(0:NIfTot, length_this_proc - i + 1)
call add_state_to_space(temp_ilut, ilut_list, space_size)
core_sum = core_sum + amps_this_proc(length_this_proc - i + 1)
end do
call MPISum(core_sum, all_core_sum)
write(stdout, *) "Total core population", all_core_sum
deallocate(amps_this_proc)
call LogMemDealloc(this_routine, TagA, ierr)
deallocate(amps_all_procs)
call LogMemDealloc(this_routine, TagB, ierr)
deallocate(indices_to_keep)
call LogMemDealloc(this_routine, TagC, ierr)
deallocate(largest_states)
call LogMemDealloc(this_routine, TagD, ierr)
end subroutine generate_space_most_populated