tau_search_hist.F90 Source File


Contents

Source Code


Source Code

#include "macros.h"

module tau_search_hist

    use SystemData, only: tGen_4ind_weighted, AB_hole_pairs, par_hole_pairs, tHub, &
                          tGen_4ind_reverse, nOccAlpha, nOccBeta, tUEG, tGen_4ind_2, &
                          nBasis, tGen_sym_guga_mol, &
                          tReal, t_k_space_hubbard, t_trans_corr_2body, &
                          t_trans_corr, t_new_real_space_hubbard, t_3_body_excits, &
                          t_trans_corr_hop, tGUGA, tgen_guga_crude, t_mixed_hubbard, &
                          t_olle_hubbard, t_mol_3_body, t_exclude_3_body_excits, &
                          tGAS, t_fci_pchb_excitgen

    use CalcData, only: tTruncInitiator, InitiatorWalkNo, &
                        t_truncate_spawns, t_mix_ratios, mix_ratio, matele_cutoff, &
                        t_consider_par_bias

    use FciMCData, only: tRestart, pSingles, pDoubles, pParallel

    use Parallel_neci, only: MPIAllReduce, MPI_MIN, MPI_MAX, MPI_SUM, MPIAllLORLogical, &
                             MPISumAll, MPISUM, mpireduce

    use MPI_wrapper, only: iprocindex, root

    use constants, only: dp, EPS, stdout, maxExcit, int64

    use tau_main, only: tau, min_tau, max_tau, possible_tau_search_methods, &
                    tau_search_method, input_tau_search_method, &
                    assign_value_to_tau, max_death_cpt, MaxWalkerBloom

    use MemoryManager, only: LogMemAlloc, LogMemDealloc, TagIntType

    use procedure_pointers, only: get_umat_el
    use UMatCache, only: gtid, UMat2d
    use util_mod, only: operator(.isclose.), near_zero, clamp, stop_all
    use LoggingData, only: t_log_ija, ija_bins_sing, all_ija_bins_sing, ija_thresh, &
                           ija_bins_para, all_ija_bins, ija_bins_anti, &
                           ija_orbs_sing, all_ija_orbs_sing, &
                           ija_orbs_para, all_ija_orbs, ija_orbs_anti
    use tc_three_body_data, only: pTriples, lMatEps

    implicit none
    private
    public :: finalize_hist_tau_search, fill_frequency_histogram_4ind, &
              fill_frequency_histogram_sd, &
              fill_frequency_histogram, init_hist_tau_search, update_tau_hist, &
              print_frequency_histograms

    public :: t_fill_frequency_hists, t_test_hist_tau

    public :: frq_ratio_cutoff, max_frequency_bound, n_frequency_bins


    ! variables which i might have to define differently:
    logical :: consider_par_bias
    integer :: n_opp, n_par
    ! do i have to define this here or in the CalcData:??
    integer :: cnt_sing_hist, cnt_doub_hist, cnt_opp_hist, cnt_par_hist, cnt_trip_hist
    integer :: above_max_singles, above_max_para, above_max_anti, above_max_doubles
    integer :: above_max_triples
    integer :: below_thresh_singles, below_thresh_para, below_thresh_anti, &
               below_thresh_doubles, below_thresh_triples
    logical :: enough_sing_hist, enough_doub_hist, enough_par_hist, enough_opp_hist
    logical :: enough_trip_hist
    ! store the necessary quantities here and not in CalcData!
    real(dp) :: frq_step_size = 0.1_dp
    real(dp) :: max_frequency_bound = 10000.0_dp
    integer :: n_frequency_bins = 100000 ! optional input to adjust the number of bins


! use also an input dependent ratio cutoff for the time-step adaptation
    real(dp) :: frq_ratio_cutoff = 0.999_dp

    ! i need bin arrays for all types of possible spawns:
    integer(int64), allocatable :: &
        frequency_bins_singles(:), frequency_bins_para(:), &
        frequency_bins_anti(:), frequency_bins_doubles(:), &
        frequency_bins(:), frequency_bins_triples(:)

    integer(TagIntType) :: mem_tag_histograms = 0

    ! also start to keep track of the aborted excitations, having 0 matrix
    ! element although they have been chosen as excitations, for the
    ! different type of excitations!
    ! do i need to communicate this?
    ! if i have to communicate this i just have to do it at the end..
    ! so i do not need to keep track of this during simulation
    ! this can easily exceed 2**31 -> needs 8-byte integer
    integer(int64) :: zero_singles, zero_para, zero_anti, zero_doubles, zero_triples

    ! also do keep track of the maximum H_ij/pgen ratios here too, just to
    ! be able to efficiently compare it with the old implementation!
    real(dp) :: gamma_sing, gamma_doub, gamma_opp, gamma_par, gamma_trip
    real(dp) :: min_sing, min_doub, min_opp, min_par, min_trip

    real(dp), parameter :: thresh = 1.0e-6_dp


    logical :: t_fill_frequency_hists = .false.

    logical :: t_test_hist_tau = .false.
contains

    subroutine optimize_hubbard_time_step()
        ! routine to set the optimal time-step for the hubbard model,
        ! where the pgens and matrix elements are set
        use SystemData, only: uhub, bhub, nel, omega, treal

        real(dp) :: p_elec, p_hole, time_step, mat_ele, death_prob

        ! first of all write down the notes here..

        ! the first implementation should just use the non-optimized values
        ! from the current implementation..
        ! afterwards i want to optimize especially the real-space hubbard
        ! implementation, since this is done really inefficient right now!
        ! although it is just wrong how it is done currently in the
        ! real-space hubbard model.. damn..
        ! are my results still valid??

        if (tReal) then
            ! in the real-space hubbard model the electron is picked
            ! uniformly
            p_elec = 1.0_dp / real(nel, dp)
            ! and the hole should be picked from the neighbors:
!             p_hole = 1.0_dp / (2.0_dp * real(dims, dp))

            p_hole = min(1.0_dp / real(nBasis / 2 - nOccAlpha, dp), &
                         1.0_dp / real(nBasis / 2 - nOccBeta, dp))

            ! and the matrix element is always just -t
            mat_ele = bhub

            ! so the off-diagonal time-step should be
            time_step = p_elec * p_hole / abs(mat_ele)

            ! but one has to also consider the diagonal part for the
            ! death probability (to limit it to < 1)
            ! there can be at most half the number of electrons double
            ! occupied sites!
            death_prob = Uhub * nel / 2.0_dp

            print *, "optimized time-step for real-space hubbard: ", time_step
            if (t_trans_corr_2body .or. t_trans_corr) then
                print *, "BUT: transcorrelated Hamiltonian used! "
                print *, " so matrix elements are not uniform anymore!"
            end if

        else
            ! in the momentum space hubbard the electrons fix the the holes
            ! to be picked! atleast if one is picked the 2nd hole is also
            ! chosen!

            ! for the 2-body transcorrelation we have to consider the
            ! different types of excitations and find the minimum tau
            ! for it..
            ! but also the matrix elements are not uniform.. so one
            ! would need to find the worst H_ij/pgen ratio, which is
            ! no feasible here.. thats actually what the tau-search is
            ! doing..

            p_elec = 2.0_dp / real(nOccAlpha * nOccBeta, dp)

            ! and the holes are all the remaining possible ones
            p_hole = 1.0_dp / real(nbasis - nel, dp)

            ! the matrix element is always U (or U/2 ?? ) check!
            mat_ele = uhub / omega

            time_step = p_elec * p_hole / abs(mat_ele)

            ! the diagonal element is -2t cos(k_vec)
            death_prob = 2.0_dp * abs(bhub)

            print *, "optimized time-step for the momentum space hubbard: ", time_step
            if (t_trans_corr_2body .or. t_trans_corr) then
                print *, "BUT: transcorrelated Hamiltonian used! "
                print *, " so matrix elements and pgens are not uniform anymore!"
                print *, " thus TAU is just a rough estimate! "
            end if

        end if

        ! with this stuff i can make the optimal time-step!
        ! but for the real-space hubbard i have to first implement the
        ! better choosing of only neighboring holes!
        if (tau > time_step) then
            print *, "initial guessed or input-provided time-step too large!"
        else
            print *, "initial guessed or input-provided time-step too small!"
        end if
        call assign_value_to_tau(clamp(0.1_dp * time_step, min_tau, max_tau), 'Initial guess hubbard')
        print *, "setting time-step to 0.1 * optimal: ", tau
        print *, "and turning tau-search OFF!"

        tau_search_method = possible_tau_search_methods%OFF
        t_fill_frequency_hists = .false.
        t_test_hist_tau = .false.

    end subroutine optimize_hubbard_time_step

    subroutine init_hist_tau_search
        ! split the new and old tau-search routines up, to no mix up
        ! too much stuff
        character(*), parameter :: this_routine = "init_hist_tau_search"

        integer :: ierr

        if (.not. input_tau_search_method == possible_tau_search_methods%HISTOGRAMMING ) then
            write(stdout, '(A)') 'init_hist_tau_search called but &
                &HISTOGRAMMING was not given as tau-search method'
            call stop_all(this_routine, 'Inconsistent input')
        end if

        ! if no truncating spawns is chosen warn here againg that that might
        ! cause problems!
        if (.not. t_truncate_spawns) then
            write(stdout, '("WARNING: NO spawn truncation chosen with keyword: &
                &truncate-spawns [float] in input. this might cause &
                &bloom problems with histogramming tau-search! BE CAUTIOUS!")')
        end if

        if (tHub) then
            ! for the transcorrelated hamiltonian we need to re-enable the
            ! Histogramming tau-search!
            if (.not.(t_trans_corr .or. t_trans_corr_2body)) then
                call optimize_hubbard_time_step()
                return
            end if
        end if

        ! print out the standard quantities:
        root_print "Setup of the Histogramming tau-search: "
        root_print "  Integration cut-off: ", frq_ratio_cutoff
        root_print "  Number of bins: ", n_frequency_bins
        root_print "  Max. ratio: ", max_frequency_bound

        ! do the initialization of the frequency analysis here..
        ! i think otherwise it is not done on all the nodes..
        ! don't need to check if t_frequency_analysis here anymore
        ! determine the global and fixed step-size quantitiy!
        frq_step_size = max_frequency_bound / real(n_frequency_bins, dp)

        root_print "  Bin-width: ", frq_step_size

        ! and do the rest of the initialisation:

        ! Are we considering parallel-spin bias in the doubles?
        ! Do this logic here, so that if we add opposite spin bias to more
        ! excitation generators, then there is only one place that this logic
        ! needs to be updated!
        if (tGen_4ind_weighted .or. tGen_4ind_2) then
            consider_par_bias = .true.
        else if (tGen_4ind_reverse) then
            consider_par_bias = .true.
            n_opp = AB_hole_pairs
            n_par = par_hole_pairs
        else if (t_k_space_hubbard .and. t_trans_corr_2body) then
            ! for the 2-body transcorrelated k-space hubbard we also have
            ! possible parallel excitations now. and to make the tau-search
            ! working we need to set this to true ofc:
            consider_par_bias = .true.
        else if (t_fci_pchb_excitgen .and. .not. tGUGA) then
            ! The default pchb excitgen also uses parallel biases
            consider_par_bias = .true.
        else if (tGAS) then
            consider_par_bias = .true.
        else
            consider_par_bias = .false.
        end if

        ! If there are only a few electrons in the system, then this has
        ! impacts for the choices that can be made.
        if (nOccAlpha == 0 .or. nOccBeta == 0) then
            ! do i really need a histogramming tau-search in this case??
            ! come on..
            call stop_all(this_routine, &
                          "Do you really need a tau-search for such a small system?")
            consider_par_bias = .false.
            pParallel = 1.0_dp
        end if
        if (nOccAlpha == 1 .and. nOccBeta == 1) then
            consider_par_bias = .false.
            call stop_all(this_routine, &
                          "Do you really need a tau-search for such a small system?")
            pParallel = 0.0_dp
        end if

        if (t_mixed_hubbard .or. t_olle_hubbard) then
            pParallel = 0.0_dp
        end if

        t_consider_par_bias = consider_par_bias

        cnt_sing_hist = 0
        enough_sing_hist = .false.
        above_max_singles = 0
        below_thresh_singles = 0
        gamma_sing = 0.0_dp
        min_sing = huge(0.0_dp)
        zero_singles = 0

        cnt_doub_hist = 0
        enough_doub_hist = .false.
        above_max_doubles = 0
        below_thresh_doubles = 0
        zero_doubles = 0
        gamma_doub = 0.0_dp
        min_doub = huge(0.0_dp)

        ! triples data
        cnt_trip_hist = 0
        enough_trip_hist = .false.
        gamma_trip = 0.0_dp
        min_trip = huge(0.0_dp)
        above_max_triples = 0
        below_thresh_triples = 0

        ! dependent if we use pParallel or not init specific hists
        if (consider_par_bias) then

            cnt_par_hist = 0
            cnt_opp_hist = 0

            enough_par_hist = .false.
            enough_opp_hist = .false.

            above_max_para = 0
            below_thresh_para = 0
            above_max_anti = 0
            below_thresh_anti = 0

            zero_para = 0
            zero_anti = 0

            gamma_par = 0.0_dp
            gamma_opp = 0.0_dp
            min_opp = huge(0.0_dp)
            min_par = huge(0.0_dp)

            allocate(frequency_bins_anti(n_frequency_bins), &
                     frequency_bins_para(n_frequency_bins), &
                     frequency_bins_singles(n_frequency_bins), &
                     stat=ierr, source=0_int64)

            call LogMemAlloc('frequency_bins', n_frequency_bins * 3, int(sizeof(frequency_bins_anti(1))), &
                             this_routine, mem_tag_histograms, ierr)

        else if (tGen_sym_guga_mol .or. &
                 (tgen_guga_crude .and. .not. &
                  (t_new_real_space_hubbard .or. t_k_space_hubbard))) then

            ! i always use the singles histogram dont I? i think so..
            allocate(frequency_bins_singles(n_frequency_bins), &
                     frequency_bins_doubles(n_frequency_bins), source=0_int64)

        else
            ! just to be save also use my new flags..
            if (tHub .or. tUEG .or. &
                (t_k_space_hubbard .and. .not. t_trans_corr_2body) .or. &
                (t_new_real_space_hubbard .and. .not. t_trans_corr_hop)) then
                ! only one histogram is used!
                allocate(frequency_bins(n_frequency_bins), stat=ierr, source=0_int64)

                call LogMemAlloc('frequency_bins', n_frequency_bins, int(sizeof(frequency_bins(1))), &
                                 this_routine, mem_tag_histograms, ierr)

            else

                ! i always use the singles histogram dont I? i think so..
                allocate(frequency_bins_singles(n_frequency_bins), &
                         frequency_bins_doubles(n_frequency_bins), source=0_int64, stat=ierr)
                call LogMemAlloc('frequency_bins', n_frequency_bins * 2, int(sizeof(frequency_bins(1))), &
                                 this_routine, mem_tag_histograms, ierr)
            end if
        end if

        if (t_mol_3_body) then
            ! when doing integral-based triples, we are here
            allocate(frequency_bins_triples(n_frequency_bins), source=0_int64)
        end if

        if (t_log_ija) then
            ! allocate the new bins (although i am not sure i should do this
            ! here..
            ! change the logging of these functions to sepereate between the
            ! different sorts of excitations and log them through spatial orbitals!
            allocate(ija_bins_sing(nBasis / 2), &
                    ija_bins_para(nBasis / 2, nBasis / 2, nBasis / 2), &
                    ija_bins_anti(nBasis / 2, nBasis / 2, nBasis / 2), &
                    ija_orbs_sing(nBasis / 2), &
                    ija_orbs_para(nBasis / 2, nBasis / 2, nBasis / 2), &
                    ija_orbs_anti(nBasis / 2, nBasis / 2, nBasis / 2), &
                source=0)

            root_print "Logging dead-end (a|ij) excitations below threshold: ", ija_thresh

        end if
    end subroutine init_hist_tau_search

    subroutine update_tau_hist()
        ! split up the new tau-update routine from the old one!
        character(*), parameter :: this_routine = "update_tau_hist"

        real(dp) :: psingles_new, tau_new, mpi_tmp, tau_death, pParallel_new, pTriples_new
        real(dp) :: ratio_singles, ratio_anti, ratio_para, ratio_doubles, ratio_triples, ratio
        logical :: mpi_ltmp

        ASSERT(tau_search_method == possible_tau_search_methods%HISTOGRAMMING)

        ! What needs doing depends on the number of parametrs that are being
        ! updated.
        call MPIAllLORLogical(enough_sing_hist, mpi_ltmp)
        enough_sing_hist = mpi_ltmp
        call MPIAllLORLogical(enough_doub_hist, mpi_ltmp)
        enough_doub_hist = mpi_ltmp
        if (t_mol_3_body) then
            call MPIAllLORLogical(enough_trip_hist, mpi_ltmp)
            enough_trip_hist = mpi_ltmp
        end if

        ! singles is always used..
        ! thats not quite right.. for the hubbard/UEG case it is not..
        if (tUEG .or. tHub .or. &
            (t_new_real_space_hubbard .and. .not. t_trans_corr_hop) .or. &
            (t_k_space_hubbard .and. .not. t_trans_corr_2body)) then
            ! also use my new flags and exclude the 2-body transcorrelation
            ! in the k-space hubbard due to triple excitations and
            ! parallel doubles

            ! here i could implement the summing in the case of Hubbard
            ! and UEG models.. although I could "just" implement the
            ! optimal time-step in the case of Hubbard models!
            ! for UEG not i guess..
            call integrate_frequency_histogram_spec(frequency_bins, ratio)

            if (ratio < 0.0_dp) then
                ! TODO: and also print out in this case!
                ! this means i had an int overflow and should stop the tau-searching
                root_print "The single excitation histogram is full!"
                root_print "stop the hist_tau_search with last time-step: ", tau
                tau_search_method = possible_tau_search_methods%OFF

                return
            end if

            if (enough_doub_hist) then
                ! in this case the enough_doub_hist flag is (mis)used to
                ! indicate enough spawning events!
                ! the doubles flag is also used for single excitations in the
                ! real-space hubbard! be careful
                tau_new = MaxWalkerBloom / ratio

                ! and use the mixing now:
                if (t_mix_ratios) then
                    tau_new = (1.0_dp - mix_ratio) * tau + mix_ratio * tau_new
                end if

            else
                ! what do i do in the case if not enough spawns..
                ! nothing i guess..
                tau_new = tau
            end if

        else

            ! NOTE: in the case of the 2-body transcorrelated k-space hubbard
            ! the singles histogram is used to store the triples!
            call integrate_frequency_histogram_spec(frequency_bins_singles, &
                                                    ratio_singles)

            ! for analyis print this also out:
            ! if i have a integer overflow i should probably deal here with it..
            if (ratio_singles < 0.0_dp) then
                ! TODO: and also print out in this case!
                ! this means i had an int overflow and should stop the tau-searching
                root_print "The single excitation histogram is full!"
                root_print "stop the hist_tau_search with last time-step: ", tau
                root_print "and pSingles and pDoubles:", pSingles, pDoubles
                tau_search_method = possible_tau_search_methods%OFF

                return
            end if

            ! for tc, also consider the triples
            if (t_mol_3_body) then
                call integrate_frequency_histogram_spec(frequency_bins_triples, ratio_triples)
                ! overflow error -> disable hist_tau_search
                if (ratio_triples < 0.0_dp) then
                    tau_search_method = possible_tau_search_methods%OFF
                    return
                end if
            else
                ratio_triples = 0.0
            end if

            if (consider_par_bias) then

                ! sum up all 3 ratios: single, parallel and anti-parallel
                call integrate_frequency_histogram_spec(frequency_bins_para, &
                                                        ratio_para)

                if (ratio_para < 0.0_dp) then
                    root_print "The parallel excitation histogram is full!"
                    root_print "stop the hist_tau_search with last time-step: ", tau
                    root_print "and pSingles and pParallel: ", pSingles, pParallel

                    tau_search_method = possible_tau_search_methods%OFF
                    return
                end if

                call integrate_frequency_histogram_spec(frequency_bins_anti, &
                                                        ratio_anti)

                if (ratio_anti < 0.0_dp) then
                    root_print "The anti-parallel excitation histogram is full!"
                    root_print "stop the hist_tau_search with last time-step: ", tau
                    root_print "and pSingles and pParallel: ", pSingles, pParallel

                    tau_search_method = possible_tau_search_methods%OFF
                    return
                end if

                ! to compare the influences on the time-step:
                ! change that the ratios are already stored in an unbiased
                ! way, so no feedback happens!

                ! also calculate new time-step through this method and
                ! check the difference to the old method
                if (enough_sing_hist .and. enough_doub_hist) then
                    ! puh.. for the mixing i am not sure how to exactly do
                    ! that.. since all of the ratios depend on each other..
                    ! test that!
                    pparallel_new = ratio_para / (ratio_anti + ratio_para)
                    if (t_mix_ratios) then
                        pparallel_new = (1.0_dp - mix_ratio) * pParallel + &
                                        mix_ratio * pparallel_new
                    end if

                    psingles_new = ratio_singles * pparallel_new / &
                                   (ratio_para + ratio_singles * pparallel_new)

                    if (t_mix_ratios) then
                        psingles_new = (1.0_dp - mix_ratio) * psingles + &
                                       mix_ratio * psingles_new
                    end if

                    tau_new = psingles_new * MaxWalkerBloom / ratio_singles

                    if (t_mix_ratios) then
                        tau_new = (1.0_dp - mix_ratio) * tau + mix_ratio * tau_new
                    end if

                    if (psingles_new > 1e-5_dp .and. &
                        psingles_new < (1.0_dp - 1e-5_dp)) then

                        if (abs(psingles - psingles_new) / psingles > 0.0001_dp) then
                            root_print "Updating singles/doubles bias. pSingles = ", psingles_new
                            root_print " pDoubles = ", 1.0_dp - psingles_new
                        end if

                        pSingles = psingles_new
                        pDoubles = 1.0_dp - pSingles
                    end if

                    ! although checking for enough doubles is probably more efficient
                    ! than always checking the reals below..
                    if (pParallel_new > 1e-4_dp .and. pParallel_new < (1.0_dp - 1e-4_dp)) then
                        ! enough_doub implies that both enough_opp and enough_par are
                        ! true.. so this if statement makes no sense
                        ! and otherwise pParallel_new is the same as before
                        if (abs(pparallel_new - pParallel) / pParallel > 0.0001_dp) then
                            root_print "Updating parallel-spin bias; new pParallel = ", pParallel_new
                        end if
                        ! in this new implementation the weighting make pParallel
                        ! smaller and smaller.. so also limit it to some lower bound
                        pParallel = pParallel_new
                    end if

                else
                    pparallel_new = pParallel
                    psingles_new = pSingles
                    ! im not sure if this possible division by 0 is cool...
                    ! and if this is somehow cool in the histogramming
                    ! approach.. this again is some kind of worst case
                    ! adaptation.. hm.. todo
                    if (abs(ratio_singles) > EPS .or. abs(ratio_para) > EPS &
                        .or. abs(ratio_anti) > EPS) then
                        tau_new = MaxWalkerBloom * min( &
                                  pSingles / max(EPS, ratio_singles), &
                                  pDoubles * pParallel / max(EPS, ratio_para), &
                                  pDoubles * (1.0_dp - pParallel) / max(EPS, ratio_anti))
                    else
                        tau_new = tau
                    end if

                end if

            else
                ! here i only use doubles for now
                call integrate_frequency_histogram_spec(frequency_bins_doubles, &
                                                        ratio_doubles)

                if (ratio_doubles < 0.0_dp) then
                    root_print "The double excitation histogram is full!"
                    root_print "stop the hist_tau_search with last time-step: ", tau
                    root_print "and pSingles and pDoubles: ", pSingles, pDoubles

                    tau_search_method = possible_tau_search_methods%OFF
                    return
                end if

                ! to compare the influences on the time-step:
                ! change that so it does store the ratio unbiased

                if (enough_sing_hist .and. enough_doub_hist) then

                    psingles_new = ratio_singles / (ratio_doubles + ratio_singles)

                    if (t_mix_ratios) then
                        psingles_new = (1.0_dp - mix_ratio) * pSingles + &
                                       mix_ratio * psingles_new
                    end if

                    tau_new = MaxWalkerBloom / (ratio_doubles + ratio_singles + ratio_triples)

                    if (t_mix_ratios) then
                        tau_new = (1.0_dp - mix_ratio) * tau + mix_ratio * tau_new
                    end if

                    if (psingles_new > 1e-5_dp .and. &
                        psingles_new < (1.0_dp - 1e-5_dp)) then

                        if (abs(psingles - psingles_new) / psingles > 0.0001_dp) then
                            root_print "Updating singles/doubles bias. pSingles = ", psingles_new
                            root_print " pDoubles = ", 1.0_dp - psingles_new
                        end if

                        pSingles = psingles_new
                        pDoubles = 1.0_dp - pSingles
                    end if

                    ! update pTriples analogously - note that as pTriples already modifies
                    ! the effective pSingles/pDoubles, they do not have to be updated
                    ! with ratio_triples
                    if (t_mol_3_body .and. enough_doub_hist) then
                        pTriples_new = ratio_triples / (ratio_doubles + ratio_singles + &
                                                        ratio_triples)

                        ! update pTriples if it has a reasonable value
                        if (.not. t_exclude_3_body_excits .and. pTriples_new > 1e-5_dp &
                            .and. pTriples_new < (1.0_dp - 1e-5_dp)) then

                            if (abs(pTriples_new - pTriples) > 0.001_dp) then
                                root_print "Updating triples bias. pTriples = ", pTriples_new
                            end if
                            pTriples = pTriples_new
                        end if
                    end if

                else
                    psingles_new = pSingles
                    if (abs(ratio_singles) > EPS .or. abs(ratio_doubles) > EPS) then
                        tau_new = MaxWalkerBloom * &
                                  min(pSingles / max(EPS, ratio_singles), pDoubles / max(EPS, ratio_doubles))
                    else
                        tau_new = tau
                    end if
                end if
            end if
        end if

        ! to deatch check again and finally update time-step
        ! The range of tau is restricted by particle death. It MUST be <=
        ! the value obtained to restrict the maximum death-factor to 1.0.
        call MPIAllReduce(max_death_cpt, MPI_MAX, mpi_tmp)
        max_death_cpt = mpi_tmp
        ! again, only count deaths if any occured
        if(abs(max_death_cpt) > EPS) then
            tau_death = 1.0_dp / max_death_cpt
            if (tau_death < tau_new) then
                if (tau_death < min_tau) then
                    root_print "time-step reduced, due to death events! reset min_tau to:", tau_death
                    min_tau = tau_death
                end if
                tau_new = tau_death
            end if
        end if

        ! If the calculated tau is less than the current tau, we should ALWAYS
        ! update it. Once we have a reasonable sample of excitations, then we
        ! can permit tau to increase if we have started too low.

        ! make the right if-statements here.. and remember enough_doub_hist is
        ! used for singles in the case of the real-space transcorrelated hubbard!
        tau_new = clamp(tau_new, min_tau, max_tau)
        if (tau_new < tau .or. &
            (tUEG .or. tHub .or. enough_sing_hist .or. &
             (t_k_space_hubbard .and. .not. t_trans_corr_2body) .and. enough_doub_hist) .or. &
            (t_new_real_space_hubbard .and. (enough_doub_hist .and. &
                                             (.not. t_trans_corr_hop .or. enough_sing_hist)))) then
            ! remove this constriction to just work in the transcorrelated
            ! case and let the user decide!

            ! Make the final tau smaller than tau_new by a small amount
            ! so that we don't get spawns exactly equal to the
            ! initiator threshold, but slightly below it instead.
            ! does this make sense in the new implmentation? this way
            ! i will always decrease the time-step even if its not necessary..
            call assign_value_to_tau(tau_new, 'Histogramming tau search')
        end if

    end subroutine update_tau_hist

    subroutine fill_frequency_histogram_4ind(mat_ele, pgen, ic, t_parallel, ex)
        ! this is the specific routine to fill up the frequency histograms
        ! for the 4ind-weighted excitation generators, which use pParallel too
        ! (do they always by default?? check that! todo!
        ! i just realised, due to the linear and ordered bins, i actually dont
        ! need to binary search in them but can determine the index just
        ! through the ratio and frequency step size
        ! i want to stop filling the histograms after a certain number of
        ! excitations is stored. 1.: since i dont want integer overflows and
        ! i think it is unnecessary to store it as an 64bit integer array
        ! and 2.: the distribution shouldn't change too much after a
        ! certain point..
        ! but this would also mean, that the tau-update would not give
        ! any new values after i dont change the histograms anymore
        ! so i could stop the tau-adaptation after that point too..
        ! so maybe first experiment a bit with 64bit array to see if
        ! the time does change more after the 32 bit limit is reached
        ! and then change it back to 32 bits..
        real(dp), intent(in) :: mat_ele, pgen
        integer, intent(in) :: ic
        logical, intent(in) :: t_parallel
        integer, intent(in), optional :: ex(2, ic)
        character(*), parameter :: this_routine = "fill_frequency_histogram_4ind"
        ! i think in my histogramming tau-search i have to increase this
        ! threshold by a LOT to avoid fluctuating behavior at the
        ! beginning of the calculation
        integer, parameter :: cnt_threshold = 50

        real(dp) :: ratio
        integer :: ind
        integer :: indi, indj, inda, indb

#ifdef DEBUG_
        if (pgen < EPS) then
            print *, "pgen: ", pgen
            print *, "matrix element: ", mat_ele
        end if
#endif

        if (pgen < EPS) return

        ASSERT(pgen > 0.0_dp)
        ASSERT(ic == 1 .or. ic == 2 .or. (ic == 3 .and. t_3_body_excits))

        ! i can't make this exception here without changing alot in the
        ! other parts of the code.. and i have to talk to ali first about
        ! that
        ! but with my cutoff change i should change this here to keep
        ! track of the excitations above matele cutoff or?? hm..
        ! and in the rest of the code i have to abort these excitations really!
        ! talk to ali about that!
        if (mat_ele < matele_cutoff) then
            ! but i still should keep track of these events!
            select case (ic)
            case (1)
                ! what if i get an int overflow here? should i also
                ! stop histogramming?
                zero_singles = zero_singles + 1

            case (2)
                if (t_parallel) then
                    zero_para = zero_para + 1
                else
                    zero_anti = zero_anti + 1
                end if
            case (3)
                ! adapt this to possible triples now
                ASSERT(t_3_body_excits)
                ! but still re-use the singles counters in this case
                zero_singles = zero_singles + 1

            end select

            return
        end if

        if (mat_ele < thresh) then
            ! maybe it would be better to measure if the ratio is
            ! below the thresh
            select case (ic)
            case (1)
                below_thresh_singles = below_thresh_singles + 1

            case (2)
                if (t_parallel) then
                    below_thresh_para = below_thresh_para + 1
                else
                    below_thresh_anti = below_thresh_anti + 1
                end if

            case (3)
                ASSERT(t_3_body_excits)
                below_thresh_singles = below_thresh_singles + 1

            end select
        end if

        ratio = mat_ele / pgen

        ! then i have to decide which histogram to fill

        if (ic == 1 .or. (ic == 3 .and. t_3_body_excits)) then

            ! if i ignore the ratios above the upper limit i also can
            ! only count these excitations if they do not get ignored..

            ! i have to change the way how we store those excitations in the
            ! histograms! i have to unbias it against the psingles, etc.
            ! quantities, so the histograms do not have feedback!

            ! i have to ensure pSingles is also set to 1.0_dp - pDoubles
            ! in the transcorr hubbard case..
            ratio = ratio * pSingles

            if (ratio < max_frequency_bound) then

                ! also keep track of the number of done excitations
                if (.not. enough_sing_hist) then
                    cnt_sing_hist = cnt_sing_hist + 1
                    if (cnt_sing_hist > cnt_threshold) enough_sing_hist = .true.
                end if

                ! find where to put the ratio
                ! since ordered and linear bin bounds i can just divide..
                ! i just hope there are no numerical errors creeping in ..
                ! if the ratio happens to be exactly the upper bound, it
                ! still has to be put in the last bin or? yes i guess..
                ! so take the minimum of the ind and the max bin index
                ind = int(ratio / frq_step_size) + 1
                frequency_bins_singles(ind) = frequency_bins_singles(ind) + 1

                ! for now also test if the actually ratio is below the
                ! threshold, although.. i already now the minumum ratio,
                ! which is not so small.. just too many of those happen!
                ! we have to avoid that!
            else
                ! store the number of excitation which exceed the upper limit!
                above_max_singles = above_max_singles + 1
                print *, "Warning: single excitation H_ij/pgen above max_frequency_bound!"
                print *, " H_ij: ", mat_ele, ", pgen: ", pgen, ", pSingles: ", pSingles
                print *, "ex-maxtrix: ", get_src(ex), " -> ", get_tgt(ex)
                print *, " H_ij/pgen: ", ratio, " ; bound: ", max_frequency_bound
                print *, " Consider increasing the bound!"
#ifdef DEBUG_
                indi = gtid(ex(1, 1))
                indj = gtid(ex(1, 2))
                inda = gtid(ex(2, 1))
                indb = gtid(ex(2, 2))
                print *, "umat (ij|ab) ", get_umat_el(indi, indj, inda, indb)
                print *, "umat (ij|ba) ", get_umat_el(indi, indj, indb, inda)
                print *, "diff: ", abs(get_umat_el(indi, indj, inda, indb) - &
                                       get_umat_el(indi, indj, indb, inda))
                print *, "(ia|ia): ", abs(get_umat_el(indi, inda, indi, inda))
                print *, "(ja|ja): ", abs(get_umat_el(indj, inda, indj, inda))
                print *, "(ib|ib): ", abs(get_umat_el(indi, indb, indi, indb))
                print *, "(jb|jb): ", abs(get_umat_el(indj, indb, indj, indb))
                print *, "******************"
#endif
            end if

            ! also start to store the maximum values anyway..
            if (ratio > gamma_sing) gamma_sing = ratio
            ! also start to store the smallest allowed ratio:
            if (ratio < min_sing) min_sing = ratio

        else
            ! check if parallel or anti-parallel
            if (t_parallel) then

                ratio = ratio * (pDoubles * pParallel)
#ifdef DEBUG_
                ! analyse the really low and really high ratios:
                if (ratio < 0.001_dp) then
                    print *, "******************"
                    print *, "parallel excitation:"
                    print *, "ratio: ", ratio
                    print *, "mat_ele: ", mat_ele
                    print *, "pgen: ", pgen
                    print *, "ex-maxtrix: ", get_src(ex), " -> ", get_tgt(ex)
                    indi = gtid(ex(1, 1))
                    indj = gtid(ex(1, 2))
                    inda = gtid(ex(2, 1))
                    indb = gtid(ex(2, 2))
                    print *, "umat (ij|ab) ", get_umat_el(indi, indj, inda, indb)
                    print *, "umat (ij|ba) ", get_umat_el(indi, indj, indb, inda)
                    print *, "diff: ", abs(get_umat_el(indi, indj, inda, indb) - &
                                           get_umat_el(indi, indj, indb, inda))
                    print *, "(ia|ia): ", abs(get_umat_el(indi, inda, indi, inda))
                    print *, "(ja|ja): ", abs(get_umat_el(indj, inda, indj, inda))
                    print *, "(ib|ib): ", abs(get_umat_el(indi, indb, indi, indb))
                    print *, "(jb|jb): ", abs(get_umat_el(indj, indb, indj, indb))
                    print *, "******************"

                end if
#endif
                if (ratio < max_frequency_bound) then
                    if (.not. enough_par_hist) then
                        cnt_par_hist = cnt_par_hist + 1
                        if (cnt_par_hist > 2 * cnt_threshold) enough_par_hist = .true.
                    end if

                    ind = int(ratio / frq_step_size) + 1

                    frequency_bins_para(ind) = frequency_bins_para(ind) + 1
                else
                    above_max_para = above_max_para + 1
#ifdef DEBUG_
                    print *, "mat_ele: ", mat_ele
                    print *, "pgen: ", pgen
                    print *, "ex-maxtrix: ", get_src(ex), " -> ", get_tgt(ex)
                    indi = gtid(ex(1, 1))
                    indj = gtid(ex(1, 2))
                    inda = gtid(ex(2, 1))
                    indb = gtid(ex(2, 2))
                    print *, "umat (ij|ab) ", get_umat_el(indi, indj, inda, indb)
                    print *, "umat (ij|ba) ", get_umat_el(indi, indj, indb, inda)
                    print *, "diff: ", abs(get_umat_el(indi, indj, inda, indb) - &
                                           get_umat_el(indi, indj, indb, inda))
                    print *, "(ia|ia): ", abs(get_umat_el(indi, inda, indi, inda))
                    print *, "(ja|ja): ", abs(get_umat_el(indj, inda, indj, inda))
                    print *, "(ib|ib): ", abs(get_umat_el(indi, indb, indi, indb))
                    print *, "(jb|jb): ", abs(get_umat_el(indj, indb, indj, indb))

                    print *, "******************"
#endif
                end if

                if (ratio > gamma_par) gamma_par = ratio
                if (ratio < min_par) min_par = ratio
            else

                ratio = ratio * (pDoubles * (1.0_dp - pParallel))
                ! analyse the really low and really high ratios:
#ifdef DEBUG_
                if (ratio < 0.001_dp) then
                    print *, "******************"
                    print *, "anti-parallel excitation:"
                    print *, "ratio: ", ratio
                    print *, "mat_ele: ", mat_ele
                    print *, "pgen: ", pgen
                    print *, "ex-maxtrix: ", get_src(ex), " -> ", get_tgt(ex)
                    indi = gtid(ex(1, 1))
                    indj = gtid(ex(1, 2))
                    inda = gtid(ex(2, 1))
                    indb = gtid(ex(2, 2))
                    print *, "umat (ij|ab) ", get_umat_el(indi, indj, inda, indb)
                    print *, "umat (ij|ba) ", get_umat_el(indi, indj, indb, inda)
                    print *, "diff: ", abs(get_umat_el(indi, indj, inda, indb) - &
                                           get_umat_el(indi, indj, indb, inda))
                    print *, "(ia|ia): ", abs(get_umat_el(indi, inda, indi, inda))
                    print *, "(ja|ja): ", abs(get_umat_el(indj, inda, indj, inda))
                    print *, "(ib|ib): ", abs(get_umat_el(indi, indb, indi, indb))
                    print *, "(jb|jb): ", abs(get_umat_el(indj, indb, indj, indb))
                    print *, "******************"
                end if
#endif

                if (ratio < max_frequency_bound) then
                    if (.not. enough_opp_hist) then
                        cnt_opp_hist = cnt_opp_hist + 1
                        if (cnt_opp_hist > cnt_threshold) enough_opp_hist = .true.
                    end if

                    ind = int(ratio / frq_step_size) + 1

                    frequency_bins_anti(ind) = frequency_bins_anti(ind) + 1

                else
                    above_max_anti = above_max_anti + 1
#ifdef DEBUG_
                    print *, "mat_ele: ", mat_ele
                    print *, "pgen: ", pgen
                    print *, "ex-maxtrix: ", get_src(ex), " -> ", get_tgt(ex)
                    indi = gtid(ex(1, 1))
                    indj = gtid(ex(1, 2))
                    inda = gtid(ex(2, 1))
                    indb = gtid(ex(2, 2))
                    print *, "umat (ij|ab) ", get_umat_el(indi, indj, inda, indb)
                    print *, "umat (ij|ba) ", get_umat_el(indi, indj, indb, inda)
                    print *, "diff: ", abs(get_umat_el(indi, indj, inda, indb) - &
                                           get_umat_el(indi, indj, indb, inda))
                    print *, "(ia|ia): ", abs(get_umat_el(indi, inda, indi, inda))
                    print *, "(ja|ja): ", abs(get_umat_el(indj, inda, indj, inda))
                    print *, "(ib|ib): ", abs(get_umat_el(indi, indb, indi, indb))
                    print *, "(jb|jb): ", abs(get_umat_el(indj, indb, indj, indb))
                    print *, "******************"

#endif
                end if

                if (ratio > gamma_opp) gamma_opp = ratio
                if (ratio < min_opp) min_opp = ratio

            end if
            ! combine par and opp into enough_doub
            if (enough_par_hist .and. enough_opp_hist) enough_doub_hist = .true.

        end if

    end subroutine fill_frequency_histogram_4ind

    subroutine fill_frequency_histogram_sd(mat_ele, pgen, ic)
        ! this is the routine, which for now i can use in the symmetry
        ! adapted GUGA code, where i only distinguish between single and
        ! double excitation for now..
        ! if i adapt that to the already used method in nosym_guga to
        ! distinguish between more excitaiton i have to write an additional
        ! subroutine which deals with that
        ! and this can also be used in the case where we dont differentiate
        ! between parallel or antiparallel double excitations in the old
        ! excitation generators
        real(dp), intent(in) :: mat_ele, pgen
        integer, intent(in) :: ic
        character(*), parameter :: this_routine = "fill_frequency_histogram_sd"
        integer, parameter :: cnt_threshold = 50

        real(dp) :: ratio
        integer :: ind

        ASSERT(pgen > EPS)
        ASSERT(ic == 1 .or. ic == 2 .or. ic == 3)

        if (mat_ele < matele_cutoff) then
            select case (ic)
            case (1)
                zero_singles = zero_singles + 1
            case (2)
                zero_doubles = zero_doubles + 1
            case (3)
                if (mat_ele < lMatEps) zero_triples = zero_triples + 1
            end select
            return
        end if

        if (mat_ele < thresh) then
            select case (ic)
            case (1)
                below_thresh_singles = below_thresh_singles + 1
            case (2)
                below_thresh_doubles = below_thresh_doubles + 1
            case (3)
                below_thresh_triples = below_thresh_triples + 1
            end select
        end if

        if (pgen < EPS) return

        ratio = mat_ele / pgen

        if (t_mol_3_body .and. ic < 3) ratio = ratio * (1.0 - pTriples)

        if (ic == 1) then

            ! change to unbias the stored ratios:
            ratio = ratio * pSingles

            if (ratio < max_frequency_bound) then

                if (.not. enough_sing_hist) then
                    cnt_sing_hist = cnt_sing_hist + 1
                    if (cnt_sing_hist > cnt_threshold) enough_sing_hist = .true.
                end if

                ind = int(ratio / frq_step_size) + 1

                frequency_bins_singles(ind) = frequency_bins_singles(ind) + 1

            else
                above_max_singles = above_max_singles + 1
                print *, "Warning: single excitation H_ij/pgen above max_frequency_bound!"
                print *, " H_ij/pgen: ", ratio, " ; bound: ", max_frequency_bound
                print *, " Consider increasing the bound!"
            end if

            if (ratio > gamma_sing) gamma_sing = ratio
            if (ratio < min_sing) min_sing = ratio

        else if (ic == 2) then

            ratio = ratio * pDoubles

            if (ratio < max_frequency_bound) then

                if (.not. enough_doub_hist) then
                    cnt_doub_hist = cnt_doub_hist + 1
                    if (cnt_doub_hist > cnt_threshold) enough_doub_hist = .true.
                end if

                ind = int(ratio / frq_step_size) + 1

                frequency_bins_doubles(ind) = frequency_bins_doubles(ind) + 1

            else
                above_max_doubles = above_max_doubles + 1
                print *, "Warning: double excitation H_ij/pgen above max_frequency_bound!"
                print *, " H_ij/pgen: ", ratio, " ; bound: ", max_frequency_bound
                print *, " Consider increasing the bound!"
            end if

            if (ratio > gamma_doub) gamma_doub = ratio
            if (ratio < min_doub) min_doub = ratio

        else

            ratio = ratio * pTriples
            ! check if ratio is within range
            if (ratio < max_frequency_bound) then

                ! check if enough triple spawns have been logged
                if (.not. enough_trip_hist) then
                    cnt_trip_hist = cnt_trip_hist + 1
                    if (cnt_trip_hist > cnt_threshold) enough_trip_hist = .true.
                end if

                ! get the corresponding bin in the histogram
                ind = int(ratio / frq_step_size) + 1
                frequency_bins_triples(ind) = frequency_bins_triples(ind) + 1

            else
                ! the ratio is not within range, log this event
                above_max_triples = above_max_triples + 1
            end if

            ! update largest/smallest ratio for triples
            if (ratio > gamma_trip) gamma_trip = ratio
            if (ratio < min_trip) min_trip = ratio

        end if

    end subroutine fill_frequency_histogram_sd

    subroutine fill_frequency_histogram(mat_ele, pgen)
        ! routine to accumulate the H_ij/pgen ration into the frequency bins
        ! keep parallelism in mind, especially if we have to adjust the
        ! bin list and boundary list on the fly.. this has to happen on
        ! all the processors then or? or can i just do it in the end of a loop
        ! adapt this function now, to discriminate between single and double
        ! excitations, and it pParallel is used, which can be determined
        ! through the excitation matrix and the 2 involved determinants
        ! (or even the starting determinant) and fill up to 3 frequency
        ! histograms .. this also means that we have to reset them maybe
        ! after pSingles or pParallel have been changed since the pgens
        ! and so the H_ij/pgen ratios change (or? can i modify that somehow
        ! on the fly?) think about that later and with ali..
        ! and for the guga stuff, if i finally implement similar pgens like
        ! pParallel, as it is already done in the nosym_guga case, i have
        ! to use even more histograms, but that should be fine.
        real(dp), intent(in) :: mat_ele, pgen
        character(*), parameter :: this_routine = "fill_frequency_histogram"
        integer, parameter :: cnt_threshold = 50

        real(dp) :: ratio
        integer :: ind
        ! first have to take correct matrix element, dependent if we use
        ! complex code or not, or no: for now just input the absolute value
        ! of H_ij, so its always a real then..
        ! nah.. it puts in 0 mat_eles too.. so just return if 0 mat_ele
        ASSERT(pgen > EPS)

        ! if the matrix element is 0, no excitation will or would be done
        ! and if the pgen is 0 i also shouldnt be here i guess.. so assert that
        if (mat_ele < matele_cutoff) then

            ! misuse doubles for the counting here
            zero_doubles = zero_doubles + 1

            return
        end if

        if (pgen < EPS) return

        ! then i have to first check if i have to make the histogram bigger...
        ratio = mat_ele / pgen

        if (ratio < max_frequency_bound) then
            ! the ratio fits in to the bins now i just have to find the
            ! correct one
            if (.not. enough_doub_hist) then
                cnt_doub_hist = cnt_doub_hist + 1
                if (cnt_doub_hist > cnt_threshold) enough_doub_hist = .true.
            end if

            ind = int(ratio / frq_step_size) + 1

            ! increase counter
            frequency_bins(ind) = frequency_bins(ind) + 1

        else
            above_max_doubles = above_max_doubles + 1
        end if

        if (ratio > gamma_doub) gamma_doub = ratio
        if (ratio < min_doub) min_doub = ratio

    end subroutine fill_frequency_histogram

    ! also provide the printing routines here:
    subroutine print_frequency_histograms
        ! try to write one general one and not multiple as in my GUGA branch
        use constants, only: int64
        use util_mod, only: get_free_unit, get_unique_filename
        use MPI_wrapper, only: root

        character(255) :: filename, exname
        integer(int64) :: all_frequency_bins_spec(n_frequency_bins)
        integer(int64) :: all_frequency_bins(n_frequency_bins)
        integer :: iunit, i
        ! sashas tip: why do i not just use a real as summation?
        real(dp) :: sum_all
        integer :: tmp_int, j, k
        integer(int64) :: tmp_int_64
        real(dp) :: cnt, threshold
        real(dp) :: max_tmp, min_tmp
        real(dp) :: temp_bins(n_frequency_bins)

        all_frequency_bins = 0

        ! also do additional output here.. to get information about the
        ! number of ratios above the threshold and about the number of
        ! zero excitations and stuff

        ! maybe first check if we have only singles or only doubles like in
        ! the real-space or momentum space hubbard:
        if (tHub .or. tUEG .or. &
            (t_new_real_space_hubbard .and. .not. t_trans_corr_hop) .or. &
            (t_k_space_hubbard .and. .not. t_3_body_excits)) then
            ! we only need to print one frequency_histogram:

            ! i need to change the rest of the code to use this
            ! frequency_bins histogram in this case! TODO
            call MPIAllReduce(frequency_bins, MPI_SUM, all_frequency_bins)

            max_tmp = 0.0_dp
            min_tmp = huge(0.0_dp)

            call mpiallreduce(gamma_doub, MPI_MAX, max_tmp)
            call MPIAllReduce(min_doub, MPI_MIN, min_tmp)

            if (iProcIndex == root) then

                ! also outout the obtained ratio integration here for now!
                temp_bins = real(all_frequency_bins, dp)
                sum_all = sum(temp_bins)
                threshold = frq_ratio_cutoff * sum_all

                iunit = get_free_unit()
                call get_unique_filename('frequency_histogram', .true., &
                                         .true., 1, filename)
                open(iunit, file=filename, status='unknown')

                cnt = 0.0_dp

                write(stdout, *) "writing frequency histogram..."
                do i = 1, n_frequency_bins
                    if (all_frequency_bins(i) == 0) cycle
                    write(iunit, "(f16.7)", advance='no') frq_step_size * i
                    write(iunit, "(i12)") all_frequency_bins(i)
                    if (cnt < threshold) then
                        cnt = cnt + temp_bins(i)
                        j = i
                    end if
                    ! also print only as far as necessary until largest
                    ! H_ij/pgen ratio
                    if (frq_step_size * i > max_tmp) exit
                end do
                close(iunit)
                write(stdout, *) "Done!"

            end if

            tmp_int_64 = 0
            call mpisum(zero_doubles, tmp_int_64)

            if (iProcIndex == root) then
                write(stdout, *) "Number of zero-valued excitations: ", tmp_int_64
                write(stdout, *) "Number of valid excitations: ", sum_all
                write(stdout, *) "ratio of zero-valued excitations: ", &
                    real(tmp_int_64, dp) / sum_all
                ! i guess i should also output the number of excitations
                ! above the threshold!
                ! this is not really working..
                ! because sasha had these cases
            end if

            tmp_int = 0
            call mpisum(above_max_doubles, tmp_int)

            if (iProcIndex == root) then
                write(stdout, *) "Number of excitations above threshold: ", tmp_int
                write(stdout, *) "ratio of excitations above threshold: ", &
                    real(tmp_int, dp) / sum_all

                ! also output the obtained integrated threshold and the
                ! maximum values H_ij/pgen ratio for this type of excitation:
                write(stdout, *) "integrated H_ij/pgen ratio: ", j * frq_step_size
                write(stdout, *) "for ", frq_ratio_cutoff, " percent coverage!"

                ! also  output the maximum H_ij/pgen ratio.. and maybe the
                ! improvement of the integrated tau search?
                write(stdout, *) "maximum H_ij/pgen ratio: ", max_tmp
                write(stdout, *) "maximum/integrated ratio: ", max_tmp / (j * frq_step_size)
                write(stdout, *) "minimum H_ij/pgen ratio: ", min_tmp

            end if
        else

            if (iProcIndex == root) all_frequency_bins = 0
            ! in the other cases we definetly have singles and more
            ! first the singles:

            if (iProcIndex == root) then
                ! change the name in case of the 2-body transcorrelated k-space hubbard
                if (t_3_body_excits .and. .not. t_mol_3_body) then
                    call get_unique_filename('frequency_histogram_triples', .true., &
                                             .true., 1, filename)
                else
                    call get_unique_filename('frequency_histogram_singles', .true., &
                                             .true., 1, filename)
                end if
            end if
            exname = "singles"
            call output_histogram(exname, frequency_bins_singles, gamma_sing, &
                                  min_sing, zero_singles, above_max_singles, below_thresh_singles)

            all_frequency_bins_spec = 0

            ! output the three-body histogram (if existing)
            if (t_mol_3_body) then
                call get_unique_filename('frequency_histogram_triples', .true., &
                                         .true., 1, filename)
                ! output the triples in the same fashion as the singles
                exname = "triples"
                call output_histogram(exname, frequency_bins_triples, gamma_trip, &
                                      min_trip, zero_triples, above_max_triples, below_thresh_triples)

                all_frequency_bins_spec = 0
            end if

            ! do the cases where there is antiparallel or parallel
            if (t_consider_par_bias) then

                ! then the parallel:
                call MPIAllReduce(frequency_bins_para, MPI_SUM, all_frequency_bins_spec)

                max_tmp = 0.0_dp
                min_tmp = huge(0.0_dp)
                call mpiallreduce(gamma_par, MPI_MAX, max_tmp)
                call MPIAllReduce(min_par, MPI_MIN, min_tmp)

                if (iProcIndex == root) then

                    temp_bins = real(all_frequency_bins_spec, dp)
                    sum_all = sum(temp_bins)

                    threshold = frq_ratio_cutoff * sum_all

                    iunit = get_free_unit()
                    call get_unique_filename('frequency_histogram_para', .true., &
                                             .true., 1, filename)
                    open(iunit, file=filename, status='unknown')

                    cnt = 0.0_dp

                    write(stdout, *) "writing parallel frequency histogram..."
                    do i = 1, n_frequency_bins
                        if (all_frequency_bins_spec(i) == 0) cycle
                        write(iunit, "(f16.7)", advance='no') frq_step_size * i
                        write(iunit, "(i12)") all_frequency_bins_spec(i)
                        if (cnt < threshold) then
                            cnt = cnt + temp_bins(i)
                            j = i
                        end if
                        if (frq_step_size * i > max_tmp) exit
                    end do
                    close(iunit)
                    write(stdout, *) "Done!"

                end if

                tmp_int_64 = 0
                call MPISum(zero_para, tmp_int_64)

                if (iProcIndex == root) then
                    write(stdout, *) "Number of zero-valued parallel excitations: ", tmp_int_64
                    write(stdout, *) "Number of valid parallel excitations: ", sum_all
                    write(stdout, *) "ratio of zero-valued parallel excitations: ", &
                        real(tmp_int_64, dp) / sum_all
                end if

                tmp_int = 0
                call mpisum(above_max_para, tmp_int)

                if (iProcIndex == root) then
                    write(stdout, *) "Number of parallel excitations above threshold: ", tmp_int
                    write(stdout, *) "ratio of parallel excitations above threshold: ", &
                        real(tmp_int, dp) / sum_all
                end if

                tmp_int = 0
                call MPISum(below_thresh_para, tmp_int)

                if (iProcIndex == root) then
                    write(stdout, *) "Number of parallel excitations below threshold: ", tmp_int
                    write(stdout, *) "ratio of parallel excitations below threshold: ", &
                        real(tmp_int, dp) / sum_all

                    write(stdout, *) "integrated parallel H_ij/pgen ratio: ", j * frq_step_size
                    write(stdout, *) "for ", frq_ratio_cutoff, " percent coverage!"

                    write(stdout, *) "maximum parallel H_ij/pgen ratio: ", max_tmp
                    write(stdout, *) "maximum/integrated parallel ratio: ", &
                        max_tmp / (j * frq_step_size)
                    write(stdout, *) "minimum parallel H_ij/pgen ratio: ", min_tmp

                    ! and add them up for the final normed one
                    all_frequency_bins = all_frequency_bins + all_frequency_bins_spec

                end if

                all_frequency_bins_spec = 0
                ! then anti:
                call MPIAllReduce(frequency_bins_anti, MPI_SUM, all_frequency_bins_spec)

                max_tmp = 0.0_dp
                min_tmp = huge(0.0_dp)
                call mpiallreduce(gamma_opp, MPI_MAX, max_tmp)
                call MPIAllReduce(min_opp, MPI_MIN, min_tmp)

                if (iProcIndex == root) then

                    temp_bins = real(all_frequency_bins_spec, dp)
                    sum_all = sum(temp_bins)
                    threshold = frq_ratio_cutoff * sum_all

                    iunit = get_free_unit()
                    call get_unique_filename('frequency_histogram_anti', .true., &
                                             .true., 1, filename)
                    open(iunit, file=filename, status='unknown')

                    cnt = 0.0_dp

                    write(stdout, *) "writing anti-parallel frequency histogram..."
                    do i = 1, n_frequency_bins
                        if (all_frequency_bins_spec(i) == 0) cycle
                        write(iunit, "(f16.7)", advance='no') frq_step_size * i
                        write(iunit, "(i12)") all_frequency_bins_spec(i)
                        if (cnt < threshold) then
                            cnt = cnt + temp_bins(i)
                            j = i
                        end if
                        if (frq_step_size * i > max_tmp) exit
                    end do
                    close(iunit)
                    write(stdout, *) "Done!"
                end if

                tmp_int_64 = 0
                call MPISum(zero_anti, tmp_int_64)

                if (iProcIndex == root) then
                    write(stdout, *) "Number of zero-valued anti-parallel excitations: ", tmp_int_64
                    write(stdout, *) "Number of valid anti-parallel excitations: ", sum_all
                    write(stdout, *) "ratio of zero-valued anti-parallel excitations: ", &
                        real(tmp_int_64, dp) / sum_all
                end if

                tmp_int = 0
                call mpisum(above_max_anti, tmp_int)

                if (iProcIndex == root) then
                    write(stdout, *) "Number of anti-parallel excitations above threshold: ", &
                        tmp_int
                    write(stdout, *) "ratio of anti-parallel excitations above threshold: ", &
                        real(tmp_int, dp) / sum_all
                end if

                tmp_int = 0
                call mpisum(below_thresh_anti, tmp_int)

                if (iProcIndex == root) then
                    write(stdout, *) "Number of anti-parallel excitations below threshold: ", &
                        tmp_int
                    write(stdout, *) "ratio of anti-parallel excitations below threshold: ", &
                        real(tmp_int, dp) / sum_all

                    write(stdout, *) "integrated anti-parallel H_ij/pgen ratio: ", j * frq_step_size
                    write(stdout, *) "for ", frq_ratio_cutoff, " percent coverage!"

                    write(stdout, *) "maximum anti-parallel H_ij/pgen ratio: ", max_tmp
                    write(stdout, *) "maximum/integrated anti-parallel ratio: ", &
                        max_tmp / (j * frq_step_size)
                    write(stdout, *) "minimum anti-parallel H_ij/pgen ratio: ", min_tmp

                    ! and add them up for the final normed one
                    all_frequency_bins = all_frequency_bins + all_frequency_bins_spec

                end if

                all_frequency_bins_spec = 0

            else
                ! we only have additional doubles:
                call MPIAllReduce(frequency_bins_doubles, MPI_SUM, all_frequency_bins_spec)

                max_tmp = 0.0_dp
                min_tmp = huge(0.0_dp)
                call mpiallreduce(gamma_doub, MPI_MAX, max_tmp)
                call MPIAllReduce(min_doub, MPI_MIN, min_tmp)

                if (iProcIndex == root) then

                    temp_bins = real(all_frequency_bins_spec, dp)
                    sum_all = sum(temp_bins)
                    threshold = frq_step_size * sum_all

                    iunit = get_free_unit()
                    call get_unique_filename('frequency_histogram_doubles', .true., &
                                             .true., 1, filename)
                    open(iunit, file=filename, status='unknown')

                    cnt = 0.0_dp

                    write(stdout, *) "writing doubles frequency histogram..."
                    do i = 1, n_frequency_bins
                        if (all_frequency_bins_spec(i) == 0) cycle
                        write(iunit, "(f16.7)", advance='no') frq_step_size * i
                        write(iunit, "(i12)") all_frequency_bins_spec(i)
                        if (cnt < threshold) then
                            cnt = cnt + temp_bins(i)
                            j = i
                        end if
                        if (frq_step_size * i > max_tmp) exit
                    end do
                    close(iunit)
                    write(stdout, *) "Done!"
                end if

                tmp_int_64 = 0
                call MPISUM(zero_doubles, tmp_int_64)

                if (iprocindex == root) then
                    write(stdout, *) "Number of zero-valued double excitations: ", tmp_int_64
                    write(stdout, *) "Number of valid double excitations: ", sum_all
                    write(stdout, *) "ratio of zero-valued double excitations: ", &
                        real(tmp_int_64, dp) / sum_all
                end if

                tmp_int = 0
                call mpisum(above_max_doubles, tmp_int)

                if (iprocindex == root) then
                    write(stdout, *) "Number of excitations above threshold: ", tmp_int
                    write(stdout, *) "ratio of excitations above threshold: ", &
                        real(tmp_int, dp) / sum_all

                end if

                tmp_int = 0
                call MPISUM(below_thresh_doubles, tmp_int)

                if (iprocindex == root) then
                    write(stdout, *) "Number of excitations below threshold: ", tmp_int
                    write(stdout, *) "ratio of excitations below threshold: ", &
                        real(tmp_int, dp) / sum_all

                    write(stdout, *) "integrated doubles H_ij/pgen ratio: ", j * frq_step_size
                    write(stdout, *) "for ", frq_ratio_cutoff, " percent coverage!"

                    write(stdout, *) "maximum doubles H_ij/pgen ratio: ", max_tmp
                    write(stdout, *) "maximum/integrated doubles ratio: ", max_tmp / (j * frq_step_size)
                    write(stdout, *) "minimum doubles H_ij/pgen ratio: ", min_tmp
                    ! and add them up for the final normed one
                    all_frequency_bins = all_frequency_bins + all_frequency_bins_spec

                end if
            end if
        end if

        ! and the norm then is the same for all cases:

        if (iprocindex == root) then
            ! also print out a normed version for comparison
            temp_bins = real(all_frequency_bins, dp)
            sum_all = sum(temp_bins)

            ! check if integer overflow:
            if (.not. sum_all < 0.0_dp) then

                iunit = get_free_unit()
                call get_unique_filename('frequency_histogram_normed', .true., &
                                         .true., 1, filename)
                open(iunit, file=filename, status='unknown')

                do i = 1, n_frequency_bins
                    ! only print if above threshold
                    if (temp_bins(i) / sum_all < EPS) cycle
                    write(iunit, "(f16.7)", advance='no') frq_step_size * i
                    write(iunit, "(f16.7)") temp_bins(i) / sum_all
                end do

                close(iunit)
            else
                write(stdout, *) "Integer overflow in normed frequency histogram!"
                write(stdout, *) "DO NOT PRINT IT!"
            end if
        end if

        ! why am i misusing the hist-tau-search also for these method?
        ! because i want to have even more info on the dead-end excitations!
        if (t_log_ija) then
            ! i hope this mpiallreduce works..
            allocate(all_ija_bins_sing(nBasis / 2))
            all_ija_bins_sing = 0
            call MPIAllReduce(ija_bins_sing, MPI_SUM, all_ija_bins_sing)

            allocate(all_ija_orbs_sing(nBasis / 2))
            all_ija_orbs_sing = 0
            call MPIAllReduce(ija_orbs_sing, MPI_MAX, all_ija_orbs_sing)

            if (iprocindex == root) then
                iunit = get_free_unit()
                call get_unique_filename('ija_bins_sing', .true., .true., 1, filename)

                open(iunit, file=filename, status='unknown')

                do i = 1, nBasis / 2
                    if (all_ija_bins_sing(i) == 0) cycle
                    write(iunit, '(i12, i6, i12)') all_ija_bins_sing(i), i, all_ija_orbs_sing(i)
                end do
                close(iunit)
            end if

            deallocate(all_ija_bins_sing)
            deallocate(all_ija_orbs_sing)
            deallocate(ija_bins_sing)
            deallocate(ija_orbs_sing)

            allocate(all_ija_bins(nBasis / 2, nBasis / 2, nBasis / 2))
            allocate(all_ija_orbs(nBasis / 2, nBasis / 2, nBasis / 2))
            all_ija_bins = 0
            all_ija_orbs = 0

            do i = 1, nBasis / 2
                call MPIAllReduce(ija_bins_para(i, :, :), MPI_SUM, all_ija_bins(i, :, :))
                ! can i also communicate the number of symmetry allowed orbitals?
                ! and can i store it in spatial orbital information?
                ! i could make seperate ones for parallel and anti-parallel
                ! excitations.. and also for singles or?
                call MPIAllReduce(ija_orbs_para(i, :, :), MPI_MAX, all_ija_orbs(i, :, :))
            end do

            if (iprocindex == root) then
                iunit = get_free_unit()
                call get_unique_filename('ija_bins_para', .true., .true., 1, filename)
                open(iunit, file=filename, status='unknown')

                ! i know it is slower but loop over first row to get it
                ! nicely ordered in the output
                ! and change the number of occurences to the first line
                ! so it can be easily sorted with bash..
                do i = 1, nBasis / 2
                    do j = 1, nbasis / 2
                        do k = 1, nBasis / 2
                            if (all_ija_bins(i, j, k) == 0) cycle
                            write(iunit, '(i12,i6, 2i3, i12)') all_ija_bins(i, j, k), i, j, k, all_ija_orbs(i, j, k)
                        end do
                    end do
                end do

                close(iunit)

            end if

            deallocate(ija_bins_para)
            deallocate(ija_orbs_para)

            all_ija_bins = 0
            all_ija_orbs = 0

            do i = 1, nBasis / 2
                call MPIAllReduce(ija_bins_anti(i, :, :), MPI_SUM, all_ija_bins(i, :, :))
                ! can i also communicate the number of symmetry allowed orbitals?
                ! and can i store it in spatial orbital information?
                ! i could make seperate ones for parallel and anti-parallel
                ! excitations.. and also for singles or?
                call MPIAllReduce(ija_orbs_anti(i, :, :), MPI_MAX, all_ija_orbs(i, :, :))
            end do

            if (iprocindex == root) then
                iunit = get_free_unit()
                call get_unique_filename('ija_bins_anti', .true., .true., 1, filename)
                open(iunit, file=filename, status='unknown')

                do i = 1, nBasis / 2
                    do j = 1, nbasis / 2
                        do k = 1, nBasis / 2
                            if (all_ija_bins(i, j, k) == 0) cycle
                            write(iunit, '(i12, i6, 2i3, i12)') all_ija_bins(i, j, k), i, j, k, all_ija_orbs(i, j, k)
                        end do
                    end do
                end do

                close(iunit)
            end if

            deallocate(ija_bins_anti)
            deallocate(ija_orbs_anti)

        end if

    contains

        subroutine output_histogram(histname, frequency_bins_loc, gamma_loc, min_loc, &
                                    zero_loc, above_max_loc, below_thresh_loc)
            implicit none
            character(255), intent(in) :: histname
            integer(int64), intent(in) :: frequency_bins_loc(:)
            real(dp), intent(in) :: gamma_loc, min_loc
            integer(int64), intent(in) :: zero_loc
            integer, intent(in) :: above_max_loc, below_thresh_loc

            all_frequency_bins_spec = 0
            call MPIAllReduce(frequency_bins_loc, MPI_SUM, all_frequency_bins_spec)

            max_tmp = 0.0_dp
            min_tmp = huge(0.0_dp)

            call mpiallreduce(gamma_loc, MPI_MAX, max_tmp)
            call MPIAllReduce(min_loc, MPI_MIN, min_tmp)

            if (iProcIndex == root) then
                temp_bins = real(all_frequency_bins_spec, dp)
                sum_all = sum(temp_bins)
                threshold = frq_ratio_cutoff * sum_all

                iunit = get_free_unit()
                write(stdout, *) "writing "//trim(histname)//" frequency histogram..."
                open(iunit, file=filename, status='unknown')
                cnt = 0.0_dp
                ! also output here the integrated ratio!
                do i = 1, n_frequency_bins
                    if (all_frequency_bins_spec(i) == 0) cycle
                    write(iunit, "(f16.7)", advance='no') frq_step_size * i
                    write(iunit, "(i12)") all_frequency_bins_spec(i)
                    if (cnt < threshold) then
                        !                         cnt = cnt + all_frequency_bins_spec(i)
                        cnt = cnt + temp_bins(i)
                        j = i
                    end if
                    if (frq_step_size * i > max_tmp) exit
                end do
                close(iunit)
                write(stdout, *) "Done!"
            end if

            tmp_int_64 = 0
            call mpisum(zero_loc, tmp_int_64)

            if (iProcIndex == root) then
                write(stdout, *) "Number of zero-valued "//trim(histname)//" excitations: ", tmp_int_64
                ! maybe also check the number of valid excitations
                write(stdout, *) "Number of valid "//trim(histname)//" excitations: ", sum_all
                if (abs(sum_all) > 0._dp) write(stdout, *) "ratio of zero-valued "//trim(histname)//" excitations: ", &
                    real(tmp_int_64, dp) / sum_all

            end if

            tmp_int = 0
            call mpisum(above_max_loc, tmp_int)

            if (iProcIndex == root) then
                write(stdout, *) "Number of "//trim(histname)//" excitations above threshold: ", tmp_int
                if (abs(sum_all) > 0._dp) write(stdout, *) "ratio of "//trim(histname)//" excitations above threshold: ", &
                    real(tmp_int, dp) / sum_all

            end if

            tmp_int = 0
            call MPISum(below_thresh_loc, tmp_int)

            if (iProcIndex == root) then
                write(stdout, *) "Number of "//trim(histname)//" excitations below threshold: ", tmp_int
                if (abs(sum_all) > 0._dp) write(stdout, *) "ratio of "//trim(histname)//" excitations below threshold: ", &
                    real(tmp_int, dp) / sum_all

                write(stdout, *) "integrated "//trim(histname)//" H_ij/pgen ratio: ", j * frq_step_size
                write(stdout, *) "for ", frq_ratio_cutoff, " percent coverage!"

                write(stdout, *) "maximum "//trim(histname)//" H_ij/pgen ratio: ", max_tmp
                write(stdout, *) ""//trim(histname)//" maximum/integrated ratio: ", max_tmp / (j * frq_step_size)

                write(stdout, *) "minimum "//trim(histname)//" H_ij/pgen ratio: ", min_tmp

                ! and add them up for the final normed one
                all_frequency_bins = all_frequency_bins + all_frequency_bins_spec

            end if

        end subroutine output_histogram

    end subroutine print_frequency_histograms

    subroutine finalize_hist_tau_search
        character(*), parameter :: this_routine = "finalize_hist_tau_search"

        call LogMemDealloc(this_routine, mem_tag_histograms)
        if (allocated(frequency_bins)) deallocate(frequency_bins)
        if (allocated(frequency_bins_singles)) deallocate(frequency_bins_singles)
        if (allocated(frequency_bins_doubles)) deallocate(frequency_bins_doubles)
        if (allocated(frequency_bins_para)) deallocate(frequency_bins_para)
        if (allocated(frequency_bins_anti)) deallocate(frequency_bins_anti)
        if (allocated(frequency_bins_triples)) deallocate(frequency_bins_triples)

    end subroutine finalize_hist_tau_search

    subroutine integrate_frequency_histogram_spec(spec_frequency_bins, ratio)
        ! specific histogram integration routine which sums up the inputted
        ! frequency_bins
        integer(int64), intent(in) :: spec_frequency_bins(n_frequency_bins)
        real(dp), intent(out) :: ratio
        character(*), parameter :: this_routine = "integrate_frequency_histogram_spec"

        integer(int64) :: all_frequency_bins(n_frequency_bins)
        integer(int64) :: i, threshold
        integer(int64) :: n_elements, cnt
        real(dp) :: test_ratio, all_test_ratio
        logical :: mpi_ltmp

        ! test a change to the tau-search by now integrating on each
        ! processor seperately and communicate the maximas
        if (t_test_hist_tau) then
            test_ratio = 0.0_dp
            n_elements = sum(spec_frequency_bins)
            if (n_elements == 0) then
                test_ratio = 0.0_dp

            else if (n_elements < 0) then
                test_ratio = -1.0_dp
                ! if any of the frequency_ratios is full i guess i should
                ! also end the histogramming tau-search or?
                ! yes i have to communicate that.. or else it gets
                ! fucked up..

                t_fill_frequency_hists = .false.

            else

                threshold = int(frq_ratio_cutoff * n_elements, kind=int64)
                cnt = 0_int64
                i = 0_int64
                do while (cnt < threshold)
                    i = i + 1
                    cnt = cnt + spec_frequency_bins(i)
                end do

                test_ratio = i * frq_step_size

            end if

            ! how do i best deal with the mpi communication.
            ! i could use a mpialllor on (.not. t_fill_frequency_hists) to
            ! check if one of them is false on any processor..
            call MPIAllLORLogical(.not. t_fill_frequency_hists, mpi_ltmp)
            if (mpi_ltmp) then
                ! then i know one of the frequency histograms is full.. so
                ! stop on all nodes!
                t_fill_frequency_hists = .false.
                ratio = -1.0_dp
                return
            else
                all_test_ratio = 0.0_dp
                call MPIAllReduce(test_ratio, MPI_MAX, all_test_ratio)

                ratio = all_test_ratio
            end if

            return
        end if

        ! MPI communicate
        all_frequency_bins = 0
        call MPIAllReduce(spec_frequency_bins, MPI_SUM, all_frequency_bins)

        n_elements = sum(all_frequency_bins)

        ! have to check if no elements are yet stored into the histogram!
        if (n_elements == 0) then
            ratio = 0.0_dp
            return

        else if (n_elements < 0) then
            ! i reached an integer overflow.. and should stop histogramming
            ! this also means i should make an additional flag for only
            ! the histogramming option without the tau-search to it
            ! so i can also stop just the histogramming after an int
            ! overflow in the histograms
            ! TODO: in this case i also have to decide if i want to print
            ! it at this moment.. or maybe still at the end of the
            ! calculation.. but yes, maybe i want to, by default, always
            ! print them to be able to continue from a certain setting
            call stop_all(this_routine, "Overflow reached")
            ratio = -1.0_dp
            t_fill_frequency_hists = .false.
            return
        end if

        threshold = int(frq_ratio_cutoff * n_elements, kind=int64)

        cnt = 0
        i = 0
        do while (cnt < threshold)
            i = i + 1
            cnt = cnt + all_frequency_bins(i)
        end do

        ratio = i * frq_step_size

    end subroutine integrate_frequency_histogram_spec

end module