subroutine init_trial_wf(trial_in, nexcit_calc, nexcit_keep, replica_pairs)
use DetBitOps, only: ilut_lt, ilut_gt
use enumerate_excitations, only: generate_connected_space
use FciMCData, only: trial_space, trial_space_size, con_space, con_space_size, trial_wfs, tot_trial_space_size
use FciMCData, only: trial_energies, ConTag, ConVecTag, TempTag, TrialTag, TrialWFTag
use FciMCData, only: TrialTempTag, ConTempTag, OccTrialTag, Trial_Init_Time
use FciMCData, only: OccConTag, CurrentTrialTag, current_trial_amps
use FciMCData, only: MaxWalkersPart, tTrialHash, tIncCancelledInitEnergy
use FciMCData, only: con_space_vecs, ntrial_excits, trial_numerator, trial_denom
use FciMCData, only: tot_trial_numerator, tot_trial_denom, HashIndex
use FciMCData, only: tot_init_trial_numerator, tot_init_trial_denom
use FciMCData, only: init_trial_numerator, init_trial_denom
use initial_trial_states, only: calc_trial_states_lanczos, calc_trial_states_qmc, calc_trial_states_direct
use LoggingData, only: tWriteTrial, tCompareTrialAmps
use MemoryManager, only: LogMemAlloc, LogMemDealloc
use MPI_wrapper, only: root
use ras_data, only: trial_ras
use searching, only: remove_repeated_states
use sort_mod, only: sort
use SystemData, only: tAllSymSectors
type(subspace_in) :: trial_in
integer, intent(in) :: nexcit_calc, nexcit_keep
logical, intent(in) :: replica_pairs
integer :: i, ierr, num_states_on_proc, con_space_size_old
integer :: excit, tot_con_space_size
integer :: con_counts(0:nProcessors - 1)
integer :: min_elem, max_elem, num_elem
integer(MPIArg) :: trial_counts(0:nProcessors - 1), trial_displs(0:nProcessors - 1)
integer(MPIArg) :: con_sendcounts(0:nProcessors - 1), con_recvcounts(0:nProcessors - 1)
integer(MPIArg) :: con_senddispls(0:nProcessors - 1), con_recvdispls(0:nProcessors - 1)
integer(n_int), allocatable, dimension(:, :) :: temp_space
HElement_t(dp), allocatable :: trial_wfs_all_procs(:, :), temp_wfs(:, :)
real(dp) :: temp_energies(nexcit_calc)
character(len=*), parameter :: t_r = "init_trial_wf"
! Perform checks.
if (tIncCancelledInitEnergy .and. (.not. tTrialHash)) &
call stop_all(t_r, "The inc-cancelled-init-energy option cannot be used with the &
&trial-bin-search option.")
if (.not. tUseRealCoeffs) call stop_all(t_r, "To use a trial wavefunction you must also &
&use real coefficients.")
if (nexcit_keep > nexcit_calc) call stop_all(t_r, "The number of required trial wave functions &
&is more than the number that has been requested to be calculated.")
call set_timer(Trial_Init_Time)
write(stdout, '(/,11("="),1X,"Trial wavefunction initialisation",1X,10("="))')
ntrial_excits = nexcit_keep
! Simply allocate the trial vector to have up to 1 million elements for now...
allocate(trial_space(0:NIfTot, 1000000), stat=ierr)
call LogMemAlloc('trial_space', 1000000 * (NIfTot + 1), size_n_int, t_r, TrialTag, ierr)
allocate(trial_energies(nexcit_keep))
trial_energies = 0.0_dp
trial_space = 0_n_int
write(stdout, '("Generating the trial space...")'); call neci_flush(stdout)
if (qmc_trial_wf) then
#ifdef CMPLX_
call stop_all(t_r, "QMC trial state initiation not supported for complex wavefunctions.")
#else
call calc_trial_states_qmc(trial_in, nexcit_keep, CurrentDets, HashIndex, replica_pairs, &
trial_space_size, trial_space, trial_wfs, trial_counts, trial_displs)
#endif
else
if (allocated(trial_est_reorder)) then
call calc_trial_states_lanczos(trial_in, nexcit_calc, trial_space_size, trial_space, temp_wfs, &
temp_energies, trial_counts, trial_displs, trial_est_reorder)
else
call calc_trial_states_lanczos(trial_in, nexcit_calc, trial_space_size, trial_space, temp_wfs, &
temp_energies, trial_counts, trial_displs)
end if
end if
write(stdout, '("Size of trial space on this processor:",1X,i8)') trial_space_size; call neci_flush(stdout)
if (.not. qmc_trial_wf) then
! Allocate the array to hold the final trial wave functions which we
! decide to keep, in the correct order.
allocate(trial_wfs(nexcit_keep, trial_space_size), stat=ierr)
if (ierr /= 0) call stop_all(t_r, "Error allocating trial_wfs.")
! Go through each replica and find which trial state matches it best.
if (nexcit_calc > 1) then
call assign_trial_states(replica_pairs, CurrentDets, HashIndex, trial_space, temp_wfs, &
trial_wfs, temp_energies, trial_energies)
else
trial_wfs = temp_wfs
trial_energies = temp_energies
root_print "energy eigenvalue(s): ", trial_energies(1:nexcit_keep)
end if
deallocate(temp_wfs, stat=ierr)
if (ierr /= 0) call stop_all(t_r, "Error deallocating temp_wfs.")
end if
! At this point, each processor has only those states which reside on them, and
! have only counted those states. Send all states to all processors for the next bit.
tot_trial_space_size = int(sum(trial_counts))
write(stdout, '("Total size of the trial space:",1X,i8)') tot_trial_space_size; call neci_flush(stdout)
! Use SpawnedParts as temporary space:
call MPIAllGatherV(trial_space(0:NIfTot, 1:trial_space_size), &
SpawnedParts(0:NIfTot, 1:tot_trial_space_size), trial_counts, trial_displs)
call sort(SpawnedParts(0:NIfTot, 1:tot_trial_space_size), ilut_lt, ilut_gt)
call assign_elements_on_procs(tot_trial_space_size, min_elem, max_elem, num_elem)
! set the size of the entries in con_ht
#ifdef CMPLX_
NConEntry = nifd + 2 * nexcit_keep
#else
NConEntry = nifd + nexcit_keep
#endif
if (num_elem > 0) then
! Find the states connected to the trial space. This typically takes a long time, so
! it is done in parallel by letting each processor find the states connected to a
! portion of the trial space.
write(stdout, '("Calculating the number of states in the connected space...")'); call neci_flush(stdout)
call generate_connected_space(num_elem, SpawnedParts(0:NIfTot, min_elem:max_elem), con_space_size)
write(stdout, '("Attempting to allocate con_space. Size =",1X,F12.3,1X,"Mb")') &
real(con_space_size, dp) * (NIfTot + 1.0_dp) * 7.629392e-06_dp; call neci_flush(stdout)
allocate(con_space(0:NIfTot, con_space_size), stat=ierr)
call LogMemAlloc('con_space', con_space_size * (NIfTot + 1), size_n_int, t_r, ConTag, ierr)
con_space = 0_n_int
write(stdout, '("States found on this processor, including repeats:",1X,i8)') con_space_size
write(stdout, '("Generating and storing the connected space...")'); call neci_flush(stdout)
call generate_connected_space(num_elem, SpawnedParts(0:NIfTot, min_elem:max_elem), &
con_space_size, con_space)
write(stdout, '("Removing repeated states and sorting by processor...")'); call neci_flush(stdout)
call remove_repeated_states(con_space, con_space_size)
call sort_space_by_proc(con_space(:, 1:con_space_size), con_space_size, con_sendcounts)
else
con_space_size = 0
con_sendcounts = 0
allocate(con_space(0, 0), stat=ierr)
write(stdout, '("This processor will not search for connected states.")'); call neci_flush(stdout)
!Although the size is zero, we should allocate it, because the rest of the code use it.
!Otherwise, we get segmentation fault later.
allocate(con_space(0:NIfTot, con_space_size), stat=ierr)
end if
write(stdout, '("Performing MPI communication of connected states...")'); call neci_flush(stdout)
! Send the connected states to their processors.
! con_sendcounts holds the number of states to send to other processors from this one.
! con_recvcounts will hold the number of states to be sent to this processor from the others.
call MPIAlltoAll(con_sendcounts, 1, con_recvcounts, 1, ierr)
con_space_size_old = con_space_size
con_space_size = sum(con_recvcounts)
! The displacements necessary for mpi_alltoall.
con_sendcounts = con_sendcounts * int(NIfTot + 1, MPIArg)
con_recvcounts = con_recvcounts * int(NIfTot + 1, MPIArg)
con_senddispls(0) = 0
con_recvdispls(0) = 0
do i = 1, nProcessors - 1
con_senddispls(i) = con_senddispls(i - 1) + con_sendcounts(i - 1)
con_recvdispls(i) = con_recvdispls(i - 1) + con_recvcounts(i - 1)
end do
write(stdout, '("Attempting to allocate temp_space. Size =",1X,F12.3,1X,"Mb")') &
real(con_space_size, dp) * (NIfTot + 1.0_dp) * 7.629392e-06_dp; call neci_flush(stdout)
allocate(temp_space(0:NIfTot, con_space_size), stat=ierr)
call LogMemAlloc('temp_space', con_space_size * (NIfTot + 1), size_n_int, t_r, TempTag, ierr)
call MPIAlltoAllV(con_space(:, 1:con_space_size_old), con_sendcounts, con_senddispls, &
temp_space(:, 1:con_space_size), con_recvcounts, con_recvdispls, ierr)
if (allocated(con_space)) then
deallocate(con_space, stat=ierr)
call LogMemDealloc(t_r, ConTag, ierr)
end if
write(stdout, '("Attempting to allocate con_space. Size =",1X,F12.3,1X,"Mb")') &
real(con_space_size, dp) * (NIfTot + 1.0_dp) * 7.629392e-06_dp; call neci_flush(stdout)
allocate(con_space(0:NIfTot, 1:con_space_size), stat=ierr)
call LogMemAlloc('con_space', con_space_size * (NIfTot + 1), size_n_int, t_r, ConTag, ierr)
con_space = temp_space
deallocate(temp_space, stat=ierr)
call LogMemDealloc(t_r, TempTag, ierr)
! Finished sending to states to their processors.
! This will also sort the connected space.
call remove_repeated_states(con_space, con_space_size)
! Remove states in the connected space which are also in the trial space.
! We don't use this anymore, but may want to try using it again at
! some point, so just leave it commented out.
! call remove_list1_states_from_list2(SpawnedParts, con_space, tot_trial_space_size, con_space_size)
call MPISumAll(con_space_size, tot_con_space_size)
! get the sizes of the connected/trial space on the other procs, for
! estimating the max size of the buffer
call MPIAllGather(con_space_size, con_counts, ierr)
! allocate buffer for communication of con_ht
! it is normally also allocated upon initialization, so deallocate
! the dummy version
if (allocated(con_send_buf)) deallocate(con_send_buf)
! the most we can communicate at once is the full size of the largest
! trial/connected space on a single proc
allocate(con_send_buf(0:NConEntry, max(maxval(con_counts), maxval(trial_counts))))
write(stdout, '("Total size of connected space:",1X,i10)') tot_con_space_size
write(stdout, '("Size of connected space on this processor:",1X,i10)') con_space_size
call neci_flush(stdout)
! Create the trial wavefunction from all processors, on all processors.
allocate(trial_wfs_all_procs(nexcit_keep, tot_trial_space_size), stat=ierr)
call MPIAllGatherV(trial_wfs, trial_wfs_all_procs, trial_counts, trial_displs)
call sort_space_by_proc(SpawnedParts(0:NIfTot, 1:tot_trial_space_size), tot_trial_space_size, trial_counts)
write(stdout, '("Generating the vector \sum_j H_{ij} \psi^T_j...")'); call neci_flush(stdout)
allocate(con_space_vecs(nexcit_keep, con_space_size), stat=ierr)
call LogMemAlloc('con_space_vecs', con_space_size, 8, t_r, ConVecTag, ierr)
call generate_connected_space_vector(SpawnedParts, trial_wfs_all_procs, con_space, con_space_vecs)
call MPIBarrier(ierr)
if (tWriteTrial) call write_trial_space()
if (tCompareTrialAmps) call update_compare_trial_file(.true.)
allocate(current_trial_amps(nexcit_keep, MaxWalkersPart), stat=ierr)
call LogMemAlloc('current_trial_amps', nexcit_keep * MaxWalkersPart, 8, t_r, CurrentTrialTag, ierr)
call init_current_trial_amps()
if (tTrialHash) call create_trial_hashtables(nexcit_keep)
! Set these to zero, to prevent junk being printed in the initial report.
trial_numerator = 0.0_dp
tot_trial_numerator = 0.0_dp
trial_denom = 0.0_dp
tot_trial_denom = 1.0_dp
init_trial_numerator = 0.0_dp
tot_init_trial_numerator = 0.0_dp
init_trial_denom = 0.0_dp
tot_init_trial_denom = 0.0_dp
call halt_timer(Trial_Init_Time)
if (.not. qmc_trial_wf) then
root_print "Energy eigenvalue(s) of the trial space:", trial_energies(1:nexcit_keep)
end if
root_print "Trial wavefunction initialisation complete."
root_print "Total time (seconds) taken for trial wavefunction initialisation:", &
get_total_time(Trial_Init_Time)
call neci_flush(stdout)
if (tAS_TrialOffset) then
call set_AS_TrialOffset(nexcit_keep, replica_pairs)
end if
end subroutine init_trial_wf