FciMCPar Subroutine

public subroutine FciMCPar(energy_final_output)


Type IntentOptional Attributes Name
real(kind=dp), intent(out), allocatable :: energy_final_output(:)


Source Code

Source Code

    subroutine FciMCPar(energy_final_output)

        use rdm_data, only: rdm_estimates, two_rdm_main, two_rdm_recv, two_rdm_recv_2
        use rdm_data, only: two_rdm_spawn, one_rdms, rdm_definitions, en_pert_main
        use rdm_estimators, only: calc_2rdm_estimates_wrapper, write_rdm_estimates
        use rdm_data_utils, only: clear_rdm_list_t, add_rdm_1_to_rdm_2
        USE MolproPlugin, only: MolproPluginResult

        real(dp), intent(out), allocatable :: energy_final_output(:)

        integer :: iroot, isymh
        real(dp) :: Weight, Energyxw, BestEnergy
        INTEGER :: error, irdm
        LOGICAL :: TIncrement, tWritePopsFound, tSingBiasChange, tPrintWarn
        REAL(dp) :: s_start, s_end, tstart(2), tend(2), totaltime
        real(dp) :: TotalTime8
        real(dp) :: Inpair(2), Outpair(2)
        integer, dimension(lenof_sign) :: tmp_sgn
        integer :: tmp_int(lenof_sign), i, istart, iRDMSamplingIter
        real(dp) :: grow_rate, EnergyDiff, Norm_2RDM
        TYPE(BasisFn) RefSym
        real(dp) :: mean_ProjE_re, mean_ProjE_im, mean_Shift
        real(dp) :: ProjE_Err_re, ProjE_Err_im, Shift_Err
        logical :: tNoProjEValue, tNoShiftValue
        real(dp) :: BestErr
        real(dp) :: start_time, stop_time
        logical :: tStartedFromCoreGround
        real(dp), dimension(100) :: lt_sum, lt_max

        real(dp):: lt_imb, lt_imb_cycle
        integer:: rest, err, allErr

        real(dp) :: CurrentSign(lenof_sign)
        integer :: j
        integer :: pops_iter, nJ(nel)

        HElement_t(dp):: InstE(inum_runs)
        HElement_t(dp):: AccumE(inum_runs)
        integer :: ExcitLevel

        logical :: t_comm_done, tScheduledLoadBalance
        integer :: run

        ! Procedure pointer temporaries
        procedure(generate_excitation_t), pointer :: ge_tmp
        procedure(get_spawn_helement_t), pointer :: gs_tmp
        procedure(attempt_die_t), pointer :: ad_tmp

        character(*), parameter :: this_routine = 'FciMCPar'
        character(6), parameter :: excit_descriptor(0:3) = &
                                   (/"IC0   ", "single", "double", "triple"/)

        integer :: tmp_det(nel)

        if (tJustBlocking) then
            ! Just reblock the current data, and do not perform an fcimc calculation.
            write(stdout, "(A)") "Skipping FCIQMC calculation and simply reblocking previous output"
            call Standalone_Errors()
        end if
        err = 0
        ProjE_Err_re = 1.0_dp
        ProjE_Err_im = 1.0_dp
        shift_err = 1.0_dp

        TDebug = .false.  ! Set debugging flag

        if (t_store_ci_coeff .and. n_store_ci_level > 3) then
            call stop_all(this_routine,'!ERROR! CI COEFFICIENTS collection not implemented for &
                                        &excitation levels higher than 3')
        else if (t_store_ci_coeff .and. tHPHF) then
            call stop_all(this_routine,'!ERROR! CI COEFFICIENTS collection not working with HPHF')

        ! This is set here not in SetupParameters, as otherwise it would be
        ! wiped just when we need it!
        tPopsAlreadyRead = .false.

        call SetupParameters()
        call init_fcimc_fn_pointers()
        call InitFCIMCCalcPar()

        if (tLogGreensfunction .and. .not. t_real_time_fciqmc) then
            call init_overlap_buffers()
        end if

        if (t_new_real_space_hubbard) then
            call init_real_space_hubbard()
        end if
        if (t_tJ_model) then
            if (tGUGA) then
                call init_guga_tj_model()
                call init_tJ_model()
            end if
        end if
        if (t_heisenberg_model) then
            if (tGUGA) then
                call init_guga_heisenberg_model()
                call init_heisenberg_model()
            end if
        end if
        ! try to call this earlier..
        ! just do it twice for now..
        if (t_k_space_hubbard) then
            call init_k_space_hubbard()
        end if

#ifdef DEBUG_
        call decode_bit_det(tmp_det, ilutHF)
        write(stdout, *) "HF: ", tmp_det
        call decode_bit_det(tmp_det, ilutHF_true)
        write(stdout, *) "HF_true: ", tmp_det
        call decode_bit_det(tmp_det, ilutRef(:, 1))
        write(stdout, *) "Ref: ", tmp_det
        write(stdout, *) "ProjEDet: ", ProjEDet

        ! Attach signal handlers to give a more graceful death-mannerism
        call init_signals()

        ! We want to do some population checking before we run any iterations.
        ! In the normal case this is run between iterations, but it is
        ! helpful to do it here.
        call population_check()

        if (n_int /= int64) then
            call stop_all('setup parameters', 'Use of realcoefficients requires 64 bit integers.')
        end if

        if (tDetermProj) then
            ! If performing a deterministic projection instead of an FCIQMC calc:
            if (tDetermProjApproxHamil) then
                call perform_determ_proj_approx_ham()
                call perform_determ_proj()
            end if
        else if (tFTLM) then
            ! If performing a finite-temperature Lanczos method job instead of FCIQMC:
            call perform_ftlm()
        else if (tSpecLanc) then
            call perform_spectral_lanczos()
        else if (tExactSpec) then
            call get_exact_spectrum()
        else if (tExactDiagAllSym) then
            call perform_exact_diag_all_symmetry()
        end if

        ! Initial output
        if (tFCIMCStats2) then
            call write_fcimcstats2(iter_data_fciqmc, initial=.true.)
            call write_fcimcstats2(iter_data_fciqmc)
            call WriteFciMCStatsHeader()
            ! Prepend a # to the initial status line so analysis doesn't pick
            ! up repetitions in the FCIMCStats or INITIATORStats files from
            ! restarts.
            !write(stdout,'("#")', advance='no')
            if (iProcIndex == root) then
                write(fcimcstats_unit, '("#")', advance='no')
                if (inum_runs == 2) &
                    write(fcimcstats_unit2, '("#")', advance='no')
                write(initiatorstats_unit, '("#")', advance='no')
                if (tLogEXLEVELStats) &
                    write(EXLEVELStats_unit, '("#")', advance='no')
            end if
            call WriteFCIMCStats()
        end if

        if (t_measure_local_spin) then
            call write_local_spin_stats(initial = .true.)
            call write_local_spin_stats()
        end if
        ! double occupancy:
        if (t_calc_double_occ) then
            call write_double_occ_stats(initial=.true.)
            call write_double_occ_stats()

            if (t_spin_measurements) then

                call write_spin_diff_stats(initial=.true.)
                call write_spin_diff_stats()

                call write_spat_doub_occ_stats(initial=.true.)
                call write_spat_doub_occ_stats()
            end if
        end if
        ! Put a barrier here so all processes synchronise before we begin.
        call MPIBarrier(error)

        ! Start MC simulation...
        ! If tIncrument is true, it means that when it comes out of the loop,
        ! it wants to subtract 1 from the iteration count to get the true
        ! number of iterations
        tIncrement = .true.
        Iter = 1
        iRDMSamplingIter = 1    !For how many iterations have we accumulated the RDM

        SumSigns = 0.0_dp
        SumSpawns = 0.0_dp

        ! In we go - start the timer for scaling curve!
        start_time = neci_etime(tstart)
        lt_imb = 0.
        lt_max = 0.
        lt_sum = 0.
        ! For calculations with only few iterations
        lt_arr = 0.
        lt_imb_cycle = 0.

        main_iteration_loop: do while (.true.)
            if (TestMCExit(Iter, iRDMSamplingIter)) then
                ! The popsfile requires the right total walker number, so
                ! update it (TotParts is updated in the annihilation step)
                call MPISumAll(TotParts, AllTotParts)
            end if

            ! start logging average spawns once we enter the variable shift mode
            if (.not. any(tSinglePartPhase) .and. tActivateLAS) tLogAverageSpawns = .true.

            IFDEBUG(FCIMCDebug, 2) write(stdout, *) 'Iter', iter

            if (iProcIndex == root) s_start = neci_etime(tstart)

            ! Update the semistochastic space if requested
            if (tSemiStochastic .and. tDynamicCoreSpace .and. &
                mod(iter - semistochStartIter, &
                    coreSpaceUpdateCycle) == 0) then
                call refresh_semistochastic_space()
                write(stdout, *) "Refereshing semistochastic space at iteration ", iter
            end if

            ! Is this an iteration where semi-stochastic is turned on?
            if (semistoch_shift_iter /= 0 .and. .not. any(tSinglePartPhase)) then
                if ((Iter - maxval(VaryShiftIter)) == semistoch_shift_iter + 1) then
                    tSemiStochastic = .true.
                    call init_semi_stochastic(ss_space_in, tStartedFromCoreGround)
                    if (tStartedFromCoreGround) call set_initial_run_references()
                    ! Count iterations for corespace updates from here
                    semistochStartIter = iter
                    ! and switch how iterations for SI updates are counted
                    SIUpdateOffset = semistochStartIter
                end if
            end if

            ! Update the trial wavefunction if requested
            if (tTrialWavefunction .and. tDynamicTrial .and. &
                mod(iter - trial_shift_iter, trialSpaceUpdateCycle) == 0) then
                if (tPairedReplicas) then
                    call refresh_trial_wf(trial_space_in, ntrial_ex_calc, &
                                          inum_runs.div.2, .true.)
                    call refresh_trial_wf(trial_space_in, ntrial_ex_calc, &
                                          inum_runs, .false.)
                end if
                write(stdout, *) "Refreshing trial wavefunction at iteration ", iter
            end if

            if (((Iter - maxval(VaryShiftIter)) == allDoubsInitsDelay + 1 &
                 .and. all(.not. tSinglePartPhase))) then
                ! Start the all-doubs-initiator procedure
                if (tDelayAllDoubsInits) call enable_adi()
                ! If desired, we now set up the references for the purpose of the
                ! all-doubs-initiators
                if (tDelayGetRefs) then
                    ! Re-initialize the reference space
                    call setup_reference_space(.true.)
                end if
            end if

            ! turn on double occ measurement after equilibration
            if (equi_iter_double_occ /= 0 .and. all(.not. tSinglePartPhase)) then
                if ((iter - maxval(VaryShiftIter)) == equi_iter_double_occ + 1) then
                    t_calc_double_occ_av = .true.
                end if
            end if

            ! [W.D]
            ! option to enable back-spawning after a certain number of iterations
            ! but this should be independent if the shift is varied already (or?)
            if (back_spawn_delay /= 0 .and. iter == back_spawn_delay + 1) then
                if (t_back_spawn_flex_option) then
                    t_back_spawn_flex = .true.
                else if (t_back_spawn_option) then
                    t_back_spawn = .true.
                end if
                call init_back_spawn()
            end if

            if (t_cc_amplitudes .and. cc_delay /= 0 .and. all(.not. tSinglePartPhase)) then
                if ((iter - maxval(VaryShiftIter)) == cc_delay + 1) then
                    ! for now just test if it works
                    call init_cc_amplitudes()
                end if
            end if

            ! Is this an iteration where trial-wavefunction estimators are
            ! turned on?
            if (tStartTrialLater .and. all(.not. tSinglePartPhase)) then
                if ((Iter - maxval(VaryShiftIter)) == trial_shift_iter + 1) then
                    tTrialWavefunction = .true.

                    if (tPairedReplicas) then
                        call init_trial_wf(trial_space_in, ntrial_ex_calc, inum_runs.div.2, .true.)
                        call init_trial_wf(trial_space_in, ntrial_ex_calc, inum_runs, .false.)
                    end if
                end if
            end if

            if (equi_iter_double_occ /= 0 .and. all(.not. tSinglePartPhase)) then
                if ((iter - maxval(VaryShiftIter)) == equi_iter_double_occ + 1) then
                    t_calc_double_occ_av = .true.
                end if
            end if

            if (tRDMonFly .and. (.not. tFillingExplicRDMonFly) &
                & .and. (.not. tFillingStochRDMonFly)) call check_start_rdm()

            if (tContTimeFCIMC) then
                call iterate_cont_time(iter_data_fciqmc)
                call PerformFciMCycPar(iter_data_fciqmc, err)
                if (err /= 0) call stop_all(this_routine, 'Failure from PerformFciMCycPar')
            end if

            if (tAccumPops .and. iter + PreviousCycles >= iAccumPopsIter) then
                if (.not. tAccumPopsActive) then
                    tAccumPopsActive = .true.
                    write(stdout, *) "Starting to accumulate populations ..."
                end if

                call update_pops_sum_all(TotWalkers, iter + PreviousCycles)
                iAccumPopsCounter = iAccumPopsCounter + 1

                ! The currentdets is almost full, we should start removing
                ! dets which have been empty long enough
                if (iAccumPopsExpireIters > 0 .and. TotWalkers > AccumPopsExpirePercent * real(MaxWalkersPart, dp)) then
                    do j = 1, int(TotWalkers)
                        ! The loop is over empty dets only
                        call extract_sign(CurrentDets(:, j), CurrentSign)
                        if (.not. IsUnoccDet(CurrentSign)) cycle

                        ! Keep semi-stochastic dets
                        if (check_determ_flag(CurrentDets(:, j))) cycle

                        ! Keep up to double excitations
                        ExcitLevel = FindBitExcitLevel(iLutHF, CurrentDets(:, j))
                        if (ExcitLevel <= 2) cycle

                        ! Check if the det has already been removed
                        if (test_flag(CurrentDets(:, j), flag_removed)) cycle

                        ! Now if it has been empty for a long time, remove it
                        pops_iter = INT(get_pops_iter(j))
                        if (iter + PreviousCycles - pops_iter > iAccumPopsExpireIters) then
                            call decode_bit_det(nJ, CurrentDets(:, j))
                            call RemoveHashDet(HashIndex, nJ, j)
                        end if
                    end do
                end if
            end if

            ! if the iteration failed, stop the calculation now
            if (err /= 0) then
                call stop_all(this_routine, 'if the iteration failed, stop the calculation now')
            end if

            if (iProcIndex == root) then
                s_end = neci_etime(tend)
                IterTime = real(IterTime + (s_end - s_start), kind=sp)
            end if

            if (loadBalanceInterval > 0) then
                tScheduledLoadBalance = mod(iter, loadBalanceInterval) == 0
                tScheduledLoadBalance = .false.
            end if

            ! Add some load balancing magic!
            if (tLoadBalanceBlocks .and. (tScheduledLoadBalance .or. &
                                          (mod(iter, lb_measure_cycle) == 1 .and. loadBalanceInterval == 0)) .and. &
                .not. tSemiStochastic .and. .not. tFillingStochRDMOnFly) then
                ! Use the ratio of time lost due to load imbalance as an estimtor
                ! whether load balancing should be used
                if (iter > lb_measure_cycle) then
                    lt_imb_cycle = lt_imb_cycle / sum(lt_sum)
                    lt_imb_cycle = 0.0
                end if
                if (need_load_balancing(lt_imb_cycle) .or. tScheduledLoadBalance) then
                    call adjust_load_balance(iter_data_fciqmc)
                end if
            end if

            if (SIUpdateInterval > 0) then
                ! Regular update of the superinitiators. Use with care as it
                ! is still rather expensive if secondary superinitiators are used
                if (mod(iter - SIUpdateOffset, SIUpdateInterval) == 0) then
                    ! the reference population needs some time to equilibrate
                    ! hence, nRefs cannot be updated that often
                    if (mod(iter, nRefUpdateInterval) == 0) call adjust_nRefs()
                    call update_reference_space(tReadPops .or. all(.not. tSinglePartPhase))
                end if
            end if

            ! as alternative to the update cycle bound output, a specified output interval
            ! may be given that is not correlated with the update cycle
            ! -> do the output independently, only requires communication + write
            ! An output frequency below 1 means no output
            t_comm_done = .false.
            if (StepsPrint > 0 .and. .not. tCoupleCycleOutput) then
                if (mod(Iter, StepsPrint) == 0) then
                    ! just perform a communication + output, without update
                    call iteration_output_wrapper(iter_data_fciqmc, TotParts, tPairedReplicas)
                    ! mark that the communication has been done
                    t_comm_done = .true.
                end if
            end if

            if (mod(Iter, StepsSft) == 0) then

                ! Has there been a particle bloom this update cycle? Loop
                ! through the spawned particle types, and output details.
                ! If outputting only the biggest blooms to date, keep
                ! track of that.
                if (iProcIndex == Root) then
                    istart = 1
                    ! if (tSpinProjDets) istart = 0
                    do i = istart, max_ex_level
                        if (bloom_count(i) /= 0) then
                            if (.not. tMaxBloom .or. &
                                bloom_sizes(i) > bloom_max(i)) then
                                bloom_max(i) = bloom_sizes(i)

                                write(stderr, bloom_warn_string) &
                                    trim(excit_descriptor(i)), bloom_sizes(i), &

                                if (tUEG2) &
                                    call stop_all(this_routine, "Should never &
                                                 &bloom in UEG2")

                            end if
                        end if
                    end do
                end if
                ! Zero the accumulators
                bloom_sizes = 0
                bloom_count = 0
                ! Calculate the a new value for the shift (amongst other
                ! things). Generally, collate information from all processors,
                ! update statistics and output them to the user.
                call set_timer(Stats_Comms_Time)
                call calculate_new_shift_wrapper(iter_data_fciqmc, TotParts, &
                                                 tPairedReplicas, t_comm_req=.not. t_comm_done)
                ! If the output is in sync, do a non-communicated output
                if (tCoupleCycleOutput) call iteration_output_wrapper(iter_data_fciqmc, TotParts, &
                                                                      tPairedReplicas, t_comm_req=.false.)
                call halt_timer(Stats_Comms_Time)

                if (t_measure_local_spin) then
                    call write_local_spin_stats()
                end if
                ! in calculate_new_shift_wrapper output is plotted too!
                ! so for now do it here for double occupancy
                if (t_calc_double_occ) then
                    call write_double_occ_stats()
                    if (t_spin_measurements) then
                        call write_spin_diff_stats()
                        call write_spat_doub_occ_stats()
                    end if
                end if

                if (tRestart) cycle

                IF ((tTruncCAS .or. tTruncSpace .or. tTruncInitiator) .and. (Iter > iFullSpaceIter) &
                    .and. (iFullSpaceIter /= 0)) THEN
!Test if we want to expand to the full space if an EXPANDSPACE variable has been set
                    IF (tHistSpawn .or. tCalcFCIMCPsi) THEN
                        IF (iProcIndex == 0) write(stdout, *) "Unable to expand space since histgramming the wavefunction..."
                        ICILevel = 0
                        tTruncSpace = .false.
                        tTruncCAS = .false.
                        IF (tTruncInitiator) tTruncInitiator = .false.
                        IF (iProcIndex == 0) THEN
                            write(stdout, *) "Expanding to the full space on iteration ", Iter + PreviousCycles
                        end if
                    end if
                end if

                if (iProcIndex == root) TotalTime8 = real(s_end - s_global_start, dp)

                call MPIBCast(TotalTime8)    !TotalTime is local - broadcast to all procs

!This routine will check for a CHANGEVARS file and change the parameters of the calculation accordingly.
                CALL ChangeVars(tSingBiasChange, tWritePopsFound)
                IF (tSoftExitFound) THEN
                    !Now changed such that we do one more block of iterations and then exit, to allow for proper
                    !inclusion of all the RDM elements
                    NMCyc = Iter + StepsSft
                    tSoftExitFound = .false.

                end if
                IF (tTimeExit .and. (TotalTime8 >= MaxTimeExit)) THEN
                    !Is it time to exit yet?
                    write(stdout, "(A,F8.2,A)") "Time limit reached for simulation of: ", MaxTimeExit / 60.0_dp, " minutes - exiting..."
                    NMCyc = Iter + StepsSft
                    ! Set this to false so that this if statement won't be entered next time.
                    tTimeExit = .false.
                end if
                IF (iExitWalkers /= -1_int64 .and. sum(AllTotParts) > iExitWalkers) THEN
                    !Exit criterion based on total walker number met.
                    write(stdout, "(A,I15)") "Total walker population exceeds that given by &
                        &EXITWALKERS criteria - exiting...", sum(AllTotParts)
                    tIncrement = .false.
                end if
                IF (tWritePopsFound) THEN
!We have explicitly asked to write out the POPSFILE from the CHANGEVARS file.
                    CALL WriteToPopsfileParOneArr(CurrentDets, TotWalkers)
                end if
                IF (tSingBiasChange) THEN
                    CALL CalcApproxpDoubles()
                end if

                if ((PopsfileTimer > 0.0_dp) .and. ((iPopsTimers * PopsfileTimer) < (TotalTime8 / 3600.0_dp))) then
                    !Write out a POPSFILE every PopsfileTimer hours
                    if (iProcIndex == Root) then
                        CALL RENAME('popsfile.h5', 'popsfile.h5.bk')
                        CALL RENAME('POPSFILEBIN', 'POPSFILEBIN.bk')
                        CALL RENAME('POPSFILEHEAD', 'POPSFILEHEAD.bk')
                        write(stdout, "(A,F7.3,A)") "Writing out a popsfile after ", iPopsTimers * PopsfileTimer, " hours..."
                    end if
                    call WriteToPopsfileParOneArr(CurrentDets, TotWalkers)
                    iPopsTimers = iPopsTimers + 1
                    if (iProcIndex == Root) then
                        s_end = neci_etime(tend)
                        write(stdout, "(A,F7.3,A)") "Time taken to write out POPSFILE: ", real(s_end, dp) - TotalTime8, " seconds."
                    end if
                end if
                if (tHistExcitToFrom) &
                    call write_zero_hist_excit_tofrom()

            end if   !Endif end of update cycle

            IF (TPopsFile .and. (.not. tPrintPopsDefault) .and. (mod(Iter, iWritePopsEvery) == 0)) THEN
                ! differentiate between normal routine and the real-time
                ! preperation
                if (t_prepare_real_time) then
                    if (cnt_real_time_copies < n_real_time_copies) then
                        CALL WriteToPopsfileParOneArr(CurrentDets, TotWalkers)
                        cnt_real_time_copies = cnt_real_time_copies + 1
                        ! if the number of wanted copies is reached exit the
                        ! calculation. REMINDER: at the end of the calc. an
                        ! additional popsfile is printed -> so start count at 1
                        ! todo: cleanly exit calc. here!
                        NMCyc = Iter + StepsSft
                        t_prepare_real_time = .false.
                        ! do want to finish all the rdm stuff to go on, but
                        ! do not want anymore popsfile to be printed except
                        ! at the very end..
                        tPrintPopsDefault = .true.
                    end if
                    !This will write out the POPSFILE if wanted
                    CALL WriteToPopsfileParOneArr(CurrentDets, TotWalkers)
                end if
            end if

            if (TPopsFile .and. tHDF5TruncPopsWrite .and. iHDF5TruncPopsIter > 0) then
                if (mod(Iter, iHDF5TruncPopsIter) == 0) then
                    call write_popsfile_hdf5(iHDF5TruncPopsEx, .true.)
                end if
            end if
            IF (tHistSpawn .and. (mod(Iter, iWriteHistEvery) == 0) .and. (.not. tRDMonFly)) THEN
                CALL WriteHistogram()
            end if

            if (t_store_ci_coeff .and. all(.not. tSinglePartPhase) .and. iter >= NMCyc-n_iter_ci_coeff+1) then
                if (t_start_ci_coeff) write(stdout,'(A45,I9)') 'START CI COEFFICIENTS COLLECTION at iteration',iter
                t_start_ci_coeff = .false.
                call store_ci_coeff()
            else if (t_store_ci_coeff .and. iter == NMCyc ) then
                t_store_ci_coeff = .false.
                write(stdout,*) ''
                write(stdout,*) '***CI COEFFICIENTS COLLECTION HAS NOT OCCURRED: NMCyc too small***'
                write(stdout,*) ''
            end if

            ! accumulate the rdm correction due to adaptive shift
            if (tAdaptiveShift .and. all(.not. tSinglePartPhase)) call UpdateRDMCorrectionTerm()

            if (tRDMonFly .and. all(.not. tSinglePartPhase)) then
                ! If we wish to calculate the energy, have started accumulating the RDMs,
                ! and this is an iteration where the energy should be calculated, do so.
                if (print_2rdm_est .and. ((Iter - maxval(VaryShiftIter)) > IterRDMonFly) &
                    .and. (mod(Iter + PreviousCycles - IterRDMStart + 1, RDMEnergyIter) == 0)) then

                    ! rezero the count of how many iterations we have been averaging over
                    ThisRDMIter = 0.0_dp
                    if (tOutputInitsRDM) then
                        call calc_2rdm_estimates_wrapper(rdm_inits_defs, inits_estimates, &
                                                         two_rdm_inits, en_pert_main)
                    end if
                    if (tInitsRDMRef .and. tNonInitsForRDMs) then
                        ! add the initiator-only rdm of this cycle to the main rdm (rescaled
                        ! with the correction factor)
                        call add_rdm_1_to_rdm_2(two_rdm_inits, two_rdm_main, RDMCorrectionFactor)
                        ! get the inits-only energy
                        call calc_2rdm_estimates_wrapper(rdm_inits_defs, inits_estimates, &
                                                         two_rdm_inits, en_pert_main)
                        tSetupInitsEst = .true.
                        ! and reset the initiator-only rdm
                        call clear_rdm_list_t(two_rdm_inits)
                    end if
                    call calc_2rdm_estimates_wrapper(rdm_definitions, rdm_estimates, two_rdm_main, en_pert_main)

                    if (iProcIndex == 0) then
                        if (.not. tInitsRDMRef .or. tSetupInitsEst) &
                            call write_rdm_estimates(rdm_definitions, rdm_estimates, .false., print_2rdm_est, .false.)
                        if (tOutputInitsRDM) call write_rdm_estimates(rdm_inits_defs, &
                                                                      inits_estimates, .false., print_2rdm_est, .true.)
                    end if

                    if (tEN2) then
                        ! If calculating the Epstein-Nesbet perturbation, reset the
                        ! array and hash table where contributions are accumulated.
                        en_pert_main%ndets = 0
                        call clear_hash_table(en_pert_main%hash_table)
                    end if
                end if

            end if

            if (tChangeVarsRDM) then
                ! Decided during the CHANGEVARS that the RDMs should be calculated.
                call init_rdms(rdm_definitions%nrdms_standard, rdm_definitions%nrdms_transition)
                tRDMonFly = .true.
                tChangeVarsRDM = .false.
            end if

            if (tDiagWalkerSubspace .and. (mod(Iter, iDiagSubspaceIter) == 0)) then
                ! Diagonalise a subspace consisting of the occupied determinants
                call DiagWalkerSubspace()
            end if

            ! If requested and on a correct iteration, update the COMPARETRIAL file.
            if (tCompareTrialAmps) then
                ASSERT(compare_amps_period /= 0)
                if (mod(Iter, compare_amps_period) == 0) then
                    call update_compare_trial_file(.false.)
                end if
            end if

            ! Compute the time lost due to load imbalance - aggregation done for 100 iterations
            ! at a time to avoid unnecessary synchronisation points
            if (mod(Iter, lb_measure_cycle) == 0) then
                call MPIAllReduce(lt_arr, MPI_SUM, lt_sum)
                call MPIAllReduce(lt_arr, MPI_MAX, lt_max)
                lt_imb_cycle = sum(lt_max - lt_sum / nProcessors)
                lt_imb = lt_imb + lt_imb_cycle
            end if

            Iter = Iter + 1
            if (tFillingStochRDMonFly) iRDMSamplingIter = iRDMSamplingIter + 1

        end do main_iteration_loop

        ! Final output is always enabled
        tSuppressSIOutput = .false.

        ! We are at the end - get the stop-time. Output the timing details
        stop_time = neci_etime(tend)
        write(stdout, *) '- - - - - - - - - - - - - - - - - - - - - - - -'
        write(stdout, *) 'Total loop-time: ', stop_time - start_time

        !add load imbalance from remaining iterations (if any)
        rest = mod(Iter - 1, lb_measure_cycle)
        if (rest > 0) then
            call MPIReduce(lt_arr, MPI_SUM, lt_sum)
            call MPIReduce(lt_arr, MPI_MAX, lt_max)
            lt_imb = lt_imb + sum(lt_max(1:rest) - lt_sum(1:rest) / nProcessors)
        end if
        if (iProcIndex == 0) write(stdout, *) 'Time lost due to load imbalance: ', lt_imb
        write(stdout, *) '- - - - - - - - - - - - - - - - - - - - - - - -'

        if (t_store_ci_coeff) then
            call output_ci_coeff()
        end if

        if (allocated(input_tau_search_method)) then
            if (t_print_frq_histograms .and. input_tau_search_method == possible_tau_search_methods%HISTOGRAMMING) then
                call print_frequency_histograms()
            end if
        end if

        if (t_cc_amplitudes .and. t_plot_cc_amplitudes) then
            call print_cc_amplitudes()
        end if

        if (tFValEnergyHist) call print_fval_energy_hist(FvalEnergyHist_EnergyBins, FvalEnergyHist_FValBins)
        if (tFValPopHist) call print_fval_pop_hist(FvalPopHist_PopBins, FvalPopHist_FValBins)

        ! Remove the signal handlers now that there is no way for the
        ! soft-exit part to work
        call clear_signals()

        ! Reduce the iteration count fro the POPSFILE since it is incremented
        ! upon leaving the loop (If done naturally).
        IF (TIncrement) Iter = Iter - 1
        IF (TPopsFile) THEN
            CALL WriteToPopsfileParOneArr(CurrentDets, TotWalkers)

            if (tHDF5TruncPopsWrite) then
                ! If we have already written a file in the last iteration,
                ! we should not write it again
                if (iHDF5TruncPopsIter == 0 .or. (mod(Iter, iHDF5TruncPopsIter) /= 0)) then
                    call write_popsfile_hdf5(iHDF5TruncPopsEx)
                    write(stdout, *)
                    write(stdout, *) "============== Writing Truncated HDF5 popsfile =============="
                    write(stdout, *) "Unnecessary duplication of truncated popsfile is avoided."
                    write(stdout, *) "It has already been written in the last iteration."
                end if
            end if

            if (tPopsProjE) then
                call calc_proje(InstE, AccumE)
                write(stdout, *)
                write(stdout, *) 'Instantaneous projected energy of popsfile:', InstE + Hii
                if (tAccumPopsActive) &
                    write(stdout, *) 'Accumulated projected energy of popsfile:', AccumE + Hii
            end if
        end if

        IF (tCalcFCIMCPsi) THEN
!This routine will actually only print the matrix if tPrintFCIMCPsi is on
            CALL PrintFCIMCPsi()

            IF (tFindCINatOrbs) THEN
                ! This routine takes the wavefunction Psi, calculates the one
                ! electron density matrix, and rotates the HF orbitals to
                ! produce a new ROFCIDUMP file.
                CALL RotateOrbs()
                CALL MPIBarrier(error)
            end if
        end if

        ! If requested, write the most populated states in CurrentDets to a
        ! CORESPACE file, for use in future semi-stochastic calculations.
        if (tWriteCoreEnd) call write_most_pop_core_at_end(write_end_core_size)

        IF (tHistSpawn) CALL WriteHistogram()

        IF (tHistEnergies) CALL WriteHistogramEnergies()

        IF (tPrintOrbOcc) THEN
            CALL PrintOrbOccs(OrbOccs)
        end if

        if (t_measure_local_spin) then
            call finalize_local_spin_measurement()
        end if

        if (t_calc_double_occ) then
            ! also output the final estimates from the summed up
            ! variable:
            if (iProcIndex == root) then
                print *, " ===== "
                print *, " Double occupancy from direct measurement: ", &
                    sum_double_occ / (sum_norm_psi_squared * real(StepsSft, dp))
                print *, " ===== "
            end if
            if (t_spin_measurements) then
                call finalize_double_occ_and_spin_diff()
            end if
        end if

        if (tFillingStochRDMonFly .or. tFillingExplicRDMonFly) then
            call finalise_rdms(rdm_definitions, one_rdms, two_rdm_main, two_rdm_recv, &
                               two_rdm_recv_2, en_pert_main, two_rdm_spawn, rdm_estimates, &
            ! if available, also output the initiator-rdms
            if (tInitsRDM) call finalise_rdms(rdm_inits_defs, inits_one_rdms, two_rdm_inits, &
                                              two_rdm_recv, two_rdm_recv_2, en_pert_main, two_rdm_inits_spawn, inits_estimates, &
        end if
        if (tAdaptiveShift) then
            write(stdout, *) "Prefactor of RDM correction due to adaptive shift", RDMCorrectionFactor
        end if

        call PrintHighPops()

        if (tSemiStochastic .and. t_print_core_vec) then
            call print_determ_vec()
            if (tFillingStochRDMonFly) then
                call print_determ_vec_av()
            end if
        end if

        if (t_symmetry_analysis) then
            call analyze_wavefunction_symmetry()
        end if

        !Close open files.
        IF (iProcIndex == Root) THEN
            if (inum_runs == 2) close(fcimcstats_unit2)
            IF (tTruncInitiator) close(initiatorstats_unit)
            IF (tLogComplexPops) close(complexstats_unit)
            if (tWritePopsNorm) close(pops_norm_unit)
            if (tLogEXLEVELStats) close(EXLEVELStats_unit)
        end if
        IF (TDebug) close(11)

        call finalize_tau()

        if (tHistSpawn) then
        end if
        if (tDiagWalkerSubspace) then
        end if

        if (tReplicaEstimates .and. iProcIndex == 0) then
        end if

        ! Print out some load balancing stats nicely to end.
        ! n.B. TotWalkers is the number of determinants. (Horrible naming indeed).
        !       TotParts is the number of walkers.
        if (iProcIndex == Root) write(stdout, *) ! linebreak
        if (iProcIndex == Root) then
            write(stdout, '(/,1X,a55)') 'Load balancing information based on the last iteration:'
        end if
        if (iProcIndex == Root) write(stdout, *) ! linebreak
            INTEGER(int64) :: MaxWalkers, MinWalkers

            CALL MPIReduce(TotWalkers, MPI_MAX, MaxWalkers)
            CALL MPIReduce(TotWalkers, MPI_MIN, MinWalkers)
            CALL MPIAllReduce(TotWalkers, MPI_SUM, AllTotWalkers)
            if (iProcIndex == Root) then
                write(stdout, '(1X,a35,1X,f18.10)') 'Mean number of determinants/process:', &
                        real(AllTotWalkers, dp) / real(nNodes, dp)
                write(stdout, '(1X,a34,1X,i18)') 'Min number of determinants/process:', MinWalkers
                write(stdout, '(1X,a34,1X,i18,/)') 'Max number of determinants/process:', MaxWalkers
            end if
        end block
        if (iProcIndex == Root) write(stdout, *) ! linebreak
            real(dp) :: total_n_walkers, min_n_walkers, max_n_walkers

            CALL MPIReduce(sum(abs(TotParts)), MPI_MAX, max_n_walkers)
            CALL MPIReduce(sum(abs(TotParts)), MPI_MIN, min_n_walkers)
            CALL MPIReduce(sum(abs(TotParts)), MPI_SUM, total_n_walkers)
            if (iProcIndex == Root) then
                write(stdout, '(/,1X,a55)') 'Load balancing information based on the last iteration:'
                write(stdout, '(1X,a35,1X,f18.10)') 'Mean number of walkers/process:', &
                        total_n_walkers / real(nNodes, dp)
                write(stdout, '(1X,a34,1X,f18.5)') 'Min number of walkers/process:', min_n_walkers
                write(stdout, '(1X,a34,1X,f18.5,/)') 'Max number of walkers/process:', max_n_walkers
            end if
        end block
        if (iProcIndex == Root) write(stdout, *) ! linebreak

        ! Automatic error analysis.
        call error_analysis(tSinglePartPhase(1), iBlockingIter(1), mean_ProjE_re, ProjE_Err_re, &
                            mean_ProjE_im, ProjE_Err_im, mean_Shift, Shift_Err, tNoProjEValue, tNoShiftValue)

        call MPIBCast(ProjectionE)
        call MPIBCast(mean_ProjE_re)
        call MPIBCast(ProjE_Err_re)
        call MPIBCast(mean_ProjE_im)
        call MPIBCast(ProjE_Err_im)
        call MPIBCast(mean_Shift)
        call MPIBCast(Shift_Err)
        call MPIBCast(tNoProjEValue)
        call MPIBCast(tNoShiftValue)

        if (tTrialWavefunction) then
            energy_final_output = tot_trial_numerator / tot_trial_denom
            energy_final_output = ProjectionE + Hii
        end if

        ! get the value of OutputHii - the offset used for the projected energy
        ! (not necessarily the one used for the Shift!)
        call getProjEOffset()
        iroot = 1
        CALL GetSym(ProjEDet(:, 1), NEl, G1, NBasisMax, RefSym)
        isymh = int(RefSym%Sym%S) + 1
        write(stdout, '('' Current reference energy'',T52,F19.12)') OutputHii
        if (tNoProjEValue) then
            write(stdout, '('' Projected correlation energy'',T52,F19.12)') real(ProjectionE(1), dp)
            write(stdout, "(A)") " No automatic errorbar obtained for projected energy"
            write(stdout, '('' Projected correlation energy'',T52,F19.12)') mean_ProjE_re
            write(stdout, '('' Estimated error in Projected correlation energy'',T52,F19.12)') ProjE_Err_re
            if (lenof_sign == 2 .and. inum_runs == 1) then
                write(stdout, '('' Projected imaginary energy'',T52,F19.12)') mean_ProjE_im
                write(stdout, '('' Estimated error in Projected imaginary energy'',T52,F19.12)') ProjE_Err_im
            end if
        end if
        if (.not. tNoShiftValue) then
            write(stdout, '('' Shift correlation energy'',T52,F19.12)') mean_Shift
            write(stdout, '('' Estimated error in shift correlation energy'',T52,F19.12)') shift_err
            write(stdout, "(A)") " No reliable averaged shift correlation energy could be obtained automatically"
        end if
        if ((.not. tNoProjEValue) .and. (.not. tNoShiftValue)) then
            !Do shift and projected energy agree?
            write(stdout, "(A)")
            EnergyDiff = abs(mean_Shift - mean_ProjE_re)
            if (EnergyDiff <= sqrt(shift_err**2 + ProjE_Err_re**2)) then
                write(stdout, "(A,F15.8)") " Projected and shift energy estimates agree " &
                    & //"within errorbars: EDiff = ", EnergyDiff
            else if (EnergyDiff <= sqrt((max(shift_err, ProjE_Err_re) * 2)**2 + min(shift_err, ProjE_Err_re)**2)) then
                write(stdout, "(A,F15.8)") " Projected and shift energy estimates agree to within " &
                    & //"two sigma of largest error: EDiff = ", EnergyDiff
                write(stdout, "(A,F15.8)") " Projected and shift energy estimates do not agree to " &
                    & //"within approximate errorbars: EDiff = ", EnergyDiff
            end if
            if (ProjE_Err_re < shift_err) then
                BestEnergy = mean_ProjE_re + OutputHii
                BestErr = ProjE_Err_re
                BestEnergy = mean_shift + Hii
                BestErr = shift_err
            end if
        else if (tNoShiftValue .and. (.not. tNoProjEValue)) then
            BestEnergy = mean_ProjE_re + OutputHii
            BestErr = ProjE_Err_re
        else if (tNoProjEValue .and. (.not. tNoShiftValue)) then
            BestEnergy = mean_shift + Hii
            BestErr = shift_err
            BestEnergy = ProjectionE(1) + OutputHii
            BestErr = 0.0_dp
        end if
        write(stdout, "(A)")
        if (tNoProjEValue) then
            write(stdout, "(A,F20.8)") " Total projected energy ", real(ProjectionE(1), dp) + OutputHii
            write(stdout, "(A,F20.8,A,G15.6)") " Total projected energy ", &
                mean_ProjE_re + OutputHii, " +/- ", ProjE_Err_re
        end if
        if (.not. tNoShiftValue) then
            write(stdout, "(A,F20.8,A,G15.6)") " Total shift energy     ", &
                mean_shift + Hii, " +/- ", shift_err
        end if

        ! Output replica_estimates data for the test suite
        if (tReplicaEstimates) then
            write(stdout, '(/,1x,"THE FOLLOWING IS FOR TEST SUITE USE ONLY!",/)')
            do i = 1, replica_est_len
                write(stdout, '(1x,"REPLICA ESTIMATES FOR STATE",1x,'//int_fmt(i)//',":",)') i

                write(stdout, '(1x,"Variational energy from replica_estimates:",1x,es20.13)') &
                    var_e_num_all(i) / rep_est_overlap_all(i)
                write(stdout, '(1x,"Energy squared from replica_estimates:",1x,es20.13)') &
                    e_squared_num_all(i) / rep_est_overlap_all(i)
                if (tEN2Init) then
                    write(stdout, '(1x,"EN2 estimate from replica_estimates:",1x,es20.13)') &
                        en2_pert_all(i) / rep_est_overlap_all(i)
                end if
                if (tEN2Rigorous) then
                    write(stdout, '(1x,"EN2 New estimate from replica_estimates:",1x,es20.13)') &
                        en2_new_all(i) / rep_est_overlap_all(i)
                end if
                write(stdout, '(1x,"Preconditioned energy from replica_estimates:",1x,es20.13,/)') &
                    precond_e_num_all(i) / precond_denom_all(i)
            end do
        end if

        CALL MolproPluginResult('ENERGY', [BestEnergy])
        CALL MolproPluginResult('FCIQMC_ERR', [min(ProjE_Err_re, shift_err)])
        write(stdout, "(/)")

        ! Deallocate memory
        call DeallocFCIMCMemPar()

    end subroutine FciMCPar