init_trial_wf Subroutine

public subroutine init_trial_wf(trial_in, nexcit_calc, nexcit_keep, replica_pairs)

Arguments

Type IntentOptional Attributes Name
type(subspace_in) :: trial_in
integer, intent(in) :: nexcit_calc
integer, intent(in) :: nexcit_keep
logical, intent(in) :: replica_pairs

Contents

Source Code


Source Code

    subroutine init_trial_wf(trial_in, nexcit_calc, nexcit_keep, replica_pairs)

        use DetBitOps, only: ilut_lt, ilut_gt
        use enumerate_excitations, only: generate_connected_space
        use FciMCData, only: trial_space, trial_space_size, con_space, con_space_size, trial_wfs, tot_trial_space_size
        use FciMCData, only: trial_energies, ConTag, ConVecTag, TempTag, TrialTag, TrialWFTag
        use FciMCData, only: TrialTempTag, ConTempTag, OccTrialTag, Trial_Init_Time
        use FciMCData, only: OccConTag, CurrentTrialTag, current_trial_amps
        use FciMCData, only: MaxWalkersPart, tTrialHash, tIncCancelledInitEnergy
        use FciMCData, only: con_space_vecs, ntrial_excits, trial_numerator, trial_denom
        use FciMCData, only: tot_trial_numerator, tot_trial_denom, HashIndex
        use FciMCData, only: tot_init_trial_numerator, tot_init_trial_denom
        use FciMCData, only: init_trial_numerator, init_trial_denom
        use initial_trial_states, only: calc_trial_states_lanczos, calc_trial_states_qmc, calc_trial_states_direct
        use LoggingData, only: tWriteTrial, tCompareTrialAmps
        use MemoryManager, only: LogMemAlloc, LogMemDealloc
        use MPI_wrapper, only: root
        use ras_data, only: trial_ras
        use searching, only: remove_repeated_states
        use sort_mod, only: sort
        use SystemData, only: tAllSymSectors

        type(subspace_in) :: trial_in
        integer, intent(in) :: nexcit_calc, nexcit_keep
        logical, intent(in) :: replica_pairs

        integer :: i, ierr, num_states_on_proc, con_space_size_old
        integer :: excit, tot_con_space_size
        integer :: con_counts(0:nProcessors - 1)
        integer :: min_elem, max_elem, num_elem
        integer(MPIArg) :: trial_counts(0:nProcessors - 1), trial_displs(0:nProcessors - 1)
        integer(MPIArg) :: con_sendcounts(0:nProcessors - 1), con_recvcounts(0:nProcessors - 1)
        integer(MPIArg) :: con_senddispls(0:nProcessors - 1), con_recvdispls(0:nProcessors - 1)
        integer(n_int), allocatable, dimension(:, :) :: temp_space
        HElement_t(dp), allocatable :: trial_wfs_all_procs(:, :), temp_wfs(:, :)
        real(dp) :: temp_energies(nexcit_calc)
        character(len=*), parameter :: t_r = "init_trial_wf"

        ! Perform checks.
        if (tIncCancelledInitEnergy .and. (.not. tTrialHash)) &
            call stop_all(t_r, "The inc-cancelled-init-energy option cannot be used with the &
                               &trial-bin-search option.")
        if (.not. tUseRealCoeffs) call stop_all(t_r, "To use a trial wavefunction you must also &
            &use real coefficients.")
        if (nexcit_keep > nexcit_calc) call stop_all(t_r, "The number of required trial wave functions &
                               &is more than the number that has been requested to be calculated.")

        call set_timer(Trial_Init_Time)

        write(stdout, '(/,11("="),1X,"Trial wavefunction initialisation",1X,10("="))')

        ntrial_excits = nexcit_keep

        ! Simply allocate the trial vector to have up to 1 million elements for now...
        allocate(trial_space(0:NIfTot, 1000000), stat=ierr)
        call LogMemAlloc('trial_space', 1000000 * (NIfTot + 1), size_n_int, t_r, TrialTag, ierr)

        allocate(trial_energies(nexcit_keep))

        trial_energies = 0.0_dp
        trial_space = 0_n_int

        write(stdout, '("Generating the trial space...")'); call neci_flush(stdout)

        if (qmc_trial_wf) then
#ifdef CMPLX_
            call stop_all(t_r, "QMC trial state initiation not supported for complex wavefunctions.")
#else
            call calc_trial_states_qmc(trial_in, nexcit_keep, CurrentDets, HashIndex, replica_pairs, &
                                       trial_space_size, trial_space, trial_wfs, trial_counts, trial_displs)
#endif
        else
            if (allocated(trial_est_reorder)) then
                call calc_trial_states_lanczos(trial_in, nexcit_calc, trial_space_size, trial_space, temp_wfs, &
                                               temp_energies, trial_counts, trial_displs, trial_est_reorder)
            else
                call calc_trial_states_lanczos(trial_in, nexcit_calc, trial_space_size, trial_space, temp_wfs, &
                                               temp_energies, trial_counts, trial_displs)
            end if
        end if

        write(stdout, '("Size of trial space on this processor:",1X,i8)') trial_space_size; call neci_flush(stdout)

        if (.not. qmc_trial_wf) then
            ! Allocate the array to hold the final trial wave functions which we
            ! decide to keep, in the correct order.
            allocate(trial_wfs(nexcit_keep, trial_space_size), stat=ierr)
            if (ierr /= 0) call stop_all(t_r, "Error allocating trial_wfs.")
            ! Go through each replica and find which trial state matches it best.
            if (nexcit_calc > 1) then
                call assign_trial_states(replica_pairs, CurrentDets, HashIndex, trial_space, temp_wfs, &
                                         trial_wfs, temp_energies, trial_energies)
            else
                trial_wfs = temp_wfs
                trial_energies = temp_energies
                root_print "energy eigenvalue(s): ", trial_energies(1:nexcit_keep)
            end if
            deallocate(temp_wfs, stat=ierr)
            if (ierr /= 0) call stop_all(t_r, "Error deallocating temp_wfs.")
        end if

        ! At this point, each processor has only those states which reside on them, and
        ! have only counted those states. Send all states to all processors for the next bit.
        tot_trial_space_size = int(sum(trial_counts))
        write(stdout, '("Total size of the trial space:",1X,i8)') tot_trial_space_size; call neci_flush(stdout)

        ! Use SpawnedParts as temporary space:
        call MPIAllGatherV(trial_space(0:NIfTot, 1:trial_space_size), &
                           SpawnedParts(0:NIfTot, 1:tot_trial_space_size), trial_counts, trial_displs)

        call sort(SpawnedParts(0:NIfTot, 1:tot_trial_space_size), ilut_lt, ilut_gt)

        call assign_elements_on_procs(tot_trial_space_size, min_elem, max_elem, num_elem)

        ! set the size of the entries in con_ht
#ifdef CMPLX_
        NConEntry = nifd + 2 * nexcit_keep
#else
        NConEntry = nifd + nexcit_keep
#endif

        if (num_elem > 0) then

            ! Find the states connected to the trial space. This typically takes a long time, so
            ! it is done in parallel by letting each processor find the states connected to a
            ! portion of the trial space.
            write(stdout, '("Calculating the number of states in the connected space...")'); call neci_flush(stdout)

            call generate_connected_space(num_elem, SpawnedParts(0:NIfTot, min_elem:max_elem), con_space_size)

            write(stdout, '("Attempting to allocate con_space. Size =",1X,F12.3,1X,"Mb")') &
                real(con_space_size, dp) * (NIfTot + 1.0_dp) * 7.629392e-06_dp; call neci_flush(stdout)
            allocate(con_space(0:NIfTot, con_space_size), stat=ierr)
            call LogMemAlloc('con_space', con_space_size * (NIfTot + 1), size_n_int, t_r, ConTag, ierr)
            con_space = 0_n_int

            write(stdout, '("States found on this processor, including repeats:",1X,i8)') con_space_size

            write(stdout, '("Generating and storing the connected space...")'); call neci_flush(stdout)

            call generate_connected_space(num_elem, SpawnedParts(0:NIfTot, min_elem:max_elem), &
                                          con_space_size, con_space)

            write(stdout, '("Removing repeated states and sorting by processor...")'); call neci_flush(stdout)

            call remove_repeated_states(con_space, con_space_size)

            call sort_space_by_proc(con_space(:, 1:con_space_size), con_space_size, con_sendcounts)

        else
            con_space_size = 0
            con_sendcounts = 0
            allocate(con_space(0, 0), stat=ierr)
            write(stdout, '("This processor will not search for connected states.")'); call neci_flush(stdout)
            !Although the size is zero, we should allocate it, because the rest of the code use it.
            !Otherwise, we get segmentation fault later.
            allocate(con_space(0:NIfTot, con_space_size), stat=ierr)
        end if

        write(stdout, '("Performing MPI communication of connected states...")'); call neci_flush(stdout)

        ! Send the connected states to their processors.
        ! con_sendcounts holds the number of states to send to other processors from this one.
        ! con_recvcounts will hold the number of states to be sent to this processor from the others.
        call MPIAlltoAll(con_sendcounts, 1, con_recvcounts, 1, ierr)
        con_space_size_old = con_space_size
        con_space_size = sum(con_recvcounts)
        ! The displacements necessary for mpi_alltoall.
        con_sendcounts = con_sendcounts * int(NIfTot + 1, MPIArg)
        con_recvcounts = con_recvcounts * int(NIfTot + 1, MPIArg)
        con_senddispls(0) = 0
        con_recvdispls(0) = 0
        do i = 1, nProcessors - 1
            con_senddispls(i) = con_senddispls(i - 1) + con_sendcounts(i - 1)
            con_recvdispls(i) = con_recvdispls(i - 1) + con_recvcounts(i - 1)
        end do

        write(stdout, '("Attempting to allocate temp_space. Size =",1X,F12.3,1X,"Mb")') &
            real(con_space_size, dp) * (NIfTot + 1.0_dp) * 7.629392e-06_dp; call neci_flush(stdout)
        allocate(temp_space(0:NIfTot, con_space_size), stat=ierr)
        call LogMemAlloc('temp_space', con_space_size * (NIfTot + 1), size_n_int, t_r, TempTag, ierr)

        call MPIAlltoAllV(con_space(:, 1:con_space_size_old), con_sendcounts, con_senddispls, &
                          temp_space(:, 1:con_space_size), con_recvcounts, con_recvdispls, ierr)

        if (allocated(con_space)) then
            deallocate(con_space, stat=ierr)
            call LogMemDealloc(t_r, ConTag, ierr)
        end if
        write(stdout, '("Attempting to allocate con_space. Size =",1X,F12.3,1X,"Mb")') &
            real(con_space_size, dp) * (NIfTot + 1.0_dp) * 7.629392e-06_dp; call neci_flush(stdout)
        allocate(con_space(0:NIfTot, 1:con_space_size), stat=ierr)
        call LogMemAlloc('con_space', con_space_size * (NIfTot + 1), size_n_int, t_r, ConTag, ierr)
        con_space = temp_space
        deallocate(temp_space, stat=ierr)
        call LogMemDealloc(t_r, TempTag, ierr)
        ! Finished sending to states to their processors.

        ! This will also sort the connected space.
        call remove_repeated_states(con_space, con_space_size)

        ! Remove states in the connected space which are also in the trial space.
        ! We don't use this anymore, but may want to try using it again at
        ! some point, so just leave it commented out.
!        call remove_list1_states_from_list2(SpawnedParts, con_space, tot_trial_space_size, con_space_size)

        call MPISumAll(con_space_size, tot_con_space_size)
        ! get the sizes of the connected/trial space on the other procs, for
        ! estimating the max size of the buffer
        call MPIAllGather(con_space_size, con_counts, ierr)
        ! allocate buffer for communication of con_ht
        ! it is normally also allocated upon initialization, so deallocate
        ! the dummy version
        if (allocated(con_send_buf)) deallocate(con_send_buf)
        ! the most we can communicate at once is the full size of the largest
        ! trial/connected space on a single proc
        allocate(con_send_buf(0:NConEntry, max(maxval(con_counts), maxval(trial_counts))))

        write(stdout, '("Total size of connected space:",1X,i10)') tot_con_space_size
        write(stdout, '("Size of connected space on this processor:",1X,i10)') con_space_size
        call neci_flush(stdout)

        ! Create the trial wavefunction from all processors, on all processors.
        allocate(trial_wfs_all_procs(nexcit_keep, tot_trial_space_size), stat=ierr)
        call MPIAllGatherV(trial_wfs, trial_wfs_all_procs, trial_counts, trial_displs)

        call sort_space_by_proc(SpawnedParts(0:NIfTot, 1:tot_trial_space_size), tot_trial_space_size, trial_counts)

        write(stdout, '("Generating the vector \sum_j H_{ij} \psi^T_j...")'); call neci_flush(stdout)
        allocate(con_space_vecs(nexcit_keep, con_space_size), stat=ierr)
        call LogMemAlloc('con_space_vecs', con_space_size, 8, t_r, ConVecTag, ierr)
        call generate_connected_space_vector(SpawnedParts, trial_wfs_all_procs, con_space, con_space_vecs)

        call MPIBarrier(ierr)

        if (tWriteTrial) call write_trial_space()
        if (tCompareTrialAmps) call update_compare_trial_file(.true.)

        allocate(current_trial_amps(nexcit_keep, MaxWalkersPart), stat=ierr)
        call LogMemAlloc('current_trial_amps', nexcit_keep * MaxWalkersPart, 8, t_r, CurrentTrialTag, ierr)
        call init_current_trial_amps()

        if (tTrialHash) call create_trial_hashtables(nexcit_keep)

        ! Set these to zero, to prevent junk being printed in the initial report.
        trial_numerator = 0.0_dp
        tot_trial_numerator = 0.0_dp
        trial_denom = 0.0_dp
        tot_trial_denom = 1.0_dp

        init_trial_numerator = 0.0_dp
        tot_init_trial_numerator = 0.0_dp
        init_trial_denom = 0.0_dp
        tot_init_trial_denom = 0.0_dp

        call halt_timer(Trial_Init_Time)

        if (.not. qmc_trial_wf) then
            root_print "Energy eigenvalue(s) of the trial space:", trial_energies(1:nexcit_keep)
        end if
        root_print "Trial wavefunction initialisation complete."
        root_print "Total time (seconds) taken for trial wavefunction initialisation:", &
            get_total_time(Trial_Init_Time)
        call neci_flush(stdout)

        if (tAS_TrialOffset) then
            call set_AS_TrialOffset(nexcit_keep, replica_pairs)
        end if

    end subroutine init_trial_wf