initial_trial_states.F90 Source File

Source Code

#include "macros.h"

module initial_trial_states
    use semi_stoch_procs, only: GLOBAL_RUN
    use semi_stoch_gen, only: generate_ras, generate_cas, generate_fci_core, &
        generate_space_from_file, generate_sing_doub_guga, &
        generate_sing_doub_determinants, &
        generate_optimised_space, generate_space_most_populated, &
        add_state_to_space, generate_using_mp1_criterion
    use hphf_integrals, only: hphf_diag_helement, hphf_off_diag_helement
    use gndts_mod, only: gndts_all_sym_this_proc
    use bit_rep_data
    use constants
    use kp_fciqmc_data_mod
    use SystemData, only: t_non_hermitian_2_body
    use core_space_util, only: cs_replicas
    use FciMCData, only: core_run
    use SystemData, only: tGUGA, tHPHF
    use determinants, only: get_helement
    use guga_bitRepOps, only: CSF_Info_t
#ifdef CMPLX_
    use blas_interface_mod, only: zheev
#else
    use matrix_util, only: eig, print_matrix
#endif
    use util_mod, only: operator(.div.), stop_all
    use parallel_neci, only: MPIAllGather, MPIBarrier

    better_implicit_none

    ! If the total trial space is smaller than this parameter, abort the
    ! parallel Lanczos and use calc_trial_states_direct instead.
    integer, parameter :: lanczos_space_size_cutoff = 2000

contains

    subroutine calc_trial_states_lanczos(space_in, nexcit, ndets_this_proc, trial_iluts, evecs_this_proc, evals, &
                                         space_sizes, space_displs, reorder)
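        ! Generate the requested trial space on each processor, build the
        ! parallel sparse Hamiltonian over it, obtain nexcit eigenpairs with
        ! the Lanczos solver, and scatter the eigenvector components from the
        ! root process back to the processors holding the corresponding
        ! determinants. If the total space is smaller than
        ! lanczos_space_size_cutoff (and t_force_lanczos is not set), fall
        ! back to calc_trial_states_direct.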

        use bit_reps, only: decode_bit_det
        use CalcData, only: subspace_in, t_force_lanczos
        use DetBitOps, only: ilut_lt, ilut_gt
        use FciMCData, only: ilutHF, CurrentDets, TotWalkers
        use lanczos_wrapper, only: frsblk_wrapper
        use Parallel_neci, only: MPIScatterV, MPIGatherV, MPIBCast, MPIArg, iProcIndex
        use Parallel_neci, only: nProcessors
        use MPI_wrapper, only: root
        use sort_mod, only: sort
        use SystemData, only: nel, tAllSymSectors, tGUGA
        use sparse_arrays, only: calculate_sparse_ham_par, sparse_ham

        use hamiltonian_linalg, only: sparse_hamil_type, parallel_sparse_hamil_type

        use lanczos_general, only: LanczosCalcType, perform_lanczos, DestroyLanczosCalc

        type(subspace_in) :: space_in
        integer, intent(in) :: nexcit
        integer, intent(out) :: ndets_this_proc
        integer(n_int), intent(out) :: trial_iluts(0:, :)
        HElement_t(dp), allocatable, intent(out) :: evecs_this_proc(:, :)
        real(dp), intent(out) :: evals(:)
        integer(MPIArg), intent(out) :: space_sizes(0:nProcessors - 1), space_displs(0:nProcessors - 1)
        integer, optional, intent(in) :: reorder(nexcit)

        integer(n_int), allocatable :: ilut_list(:, :)
        integer, allocatable :: det_list(:, :)
        integer :: i, max_elem_ind(1), ierr
        integer :: temp_reorder(nexcit)
        integer(MPIArg) :: ndets_all_procs, ndets_this_proc_mpi
        integer(MPIArg) :: sndcnts(0:nProcessors - 1), displs(0:nProcessors - 1)
        integer(MPIArg) :: rcvcnts
        integer, allocatable :: evec_abs(:)
        HElement_t(dp), allocatable :: evecs(:, :), evecs_transpose(:, :)
        character(len=*), parameter :: this_routine = "calc_trial_states_lanczos"

        type(LanczosCalcType) :: lanczosCalc

        ndets_this_proc = 0
        trial_iluts = 0_n_int

        ! Abort for trial-space options that are not supported with GUGA.
        if (tGUGA .and. (space_in%tCAS .or. space_in%tRAS .or. space_in%tFCI)) then
            call stop_all(this_routine, "non-supported trial space for GUGA!")
        end if

        write(stdout, *) " Initialising wavefunctions by the Lanczos algorithm"

        ! Choose the correct generating routine.
        if (space_in%tHF) call add_state_to_space(ilutHF, trial_iluts, ndets_this_proc)
        if (space_in%tPops) call generate_space_most_populated(space_in%npops, &
                                                     space_in%tApproxSpace, space_in%nApproxSpace, trial_iluts, ndets_this_proc, GLOBAL_RUN)
        if (space_in%tRead) call generate_space_from_file(space_in%read_filename, trial_iluts, ndets_this_proc)
        if (space_in%tDoubles) then
            if (tGUGA) then
                call generate_sing_doub_guga(trial_iluts, ndets_this_proc, .false.)
            else
                call generate_sing_doub_determinants(trial_iluts, ndets_this_proc, .false.)
            end if
        end if
        if (space_in%tCAS) call generate_cas(space_in%occ_cas, space_in%virt_cas, trial_iluts, ndets_this_proc)
        if (space_in%tRAS) call generate_ras(space_in%ras, trial_iluts, ndets_this_proc)
        if (space_in%tOptimised) call generate_optimised_space(space_in%opt_data, space_in%tLimitSpace, &
                                                               trial_iluts, ndets_this_proc, space_in%max_size)
        if (space_in%tMP1) call generate_using_mp1_criterion(space_in%mp1_ndets, trial_iluts, ndets_this_proc)
        if (space_in%tFCI) then
            if (tAllSymSectors) then
                call gndts_all_sym_this_proc(trial_iluts, .true., ndets_this_proc)
            else
                call generate_fci_core(trial_iluts, ndets_this_proc)
            end if
        end if

        if (.not. (space_in%tPops .or. space_in%tRead .or. space_in%tDoubles .or. space_in%tCAS .or. &
                   space_in%tRAS .or. space_in%tOptimised .or. space_in%tMP1 .or. space_in%tFCI)) then
            call stop_all(this_routine, "A space for the trial functions was not chosen.")
        end if

        ndets_this_proc_mpi = int(ndets_this_proc, MPIArg)
        call MPIAllGather(ndets_this_proc_mpi, space_sizes, ierr)
        ndets_all_procs = sum(space_sizes)

        if (ndets_all_procs < lanczos_space_size_cutoff .and. .not. t_force_lanczos) then
            write(stdout, *) " Aborting Lanczos and initialising trial states with direct diagonalisation"
            call calc_trial_states_direct(space_in, nexcit, ndets_this_proc, trial_iluts, evecs_this_proc, evals, &
                                          space_sizes, space_displs, reorder)
            return
        end if

        if (ndets_all_procs < nexcit) call stop_all(this_routine, "The number of excited states that you have asked &
            &for is larger than the size of the trial space used to create the excited states. Since this &
            &routine generates trial states that are orthogonal, this is not possible.")

        space_displs(0) = 0_MPIArg
        do i = 1, nProcessors - 1
            space_displs(i) = space_displs(i - 1) + space_sizes(i - 1)
        end do

        call sort(trial_iluts(:, 1:ndets_this_proc), ilut_lt, ilut_gt)

        if (iProcIndex == root) then
            allocate(ilut_list(0:NIfTot, ndets_all_procs))
        else
            ! On these other processes ilut_list and evecs_transpose are not
            ! needed, but we need them to be allocated for the MPI wrapper
            ! function to work, so just allocate them to be small.
            allocate(ilut_list(1, 1))
            allocate(evecs_transpose(1, 1))
        end if

        call MPIGatherV(trial_iluts(:, 1:space_sizes(iProcIndex)), ilut_list, &
                        space_sizes, space_displs, ierr)

        call calculate_sparse_ham_par(space_sizes, trial_iluts, .true.)

        call MPIBarrier(ierr)

        if (iProcIndex == root) then
            allocate(det_list(nel, ndets_all_procs))

            do i = 1, ndets_all_procs
                call decode_bit_det(det_list(:, i), ilut_list(:, i))
            end do

            deallocate(ilut_list)

            allocate(evecs(ndets_all_procs, nexcit), stat=ierr)
            if (ierr /= 0) call stop_all(this_routine, "Error allocating eigenvectors array.")
            evecs = 0.0_dp

            allocate(evec_abs(ndets_all_procs), stat=ierr)
            if (ierr /= 0) call stop_all(this_routine, "Error allocating evec_abs array.")
            evec_abs = 0
        end if

        ! Perform the Lanczos procedure in parallel.
        if (t_non_hermitian_2_body) then
            call stop_all(this_routine, &
                          "perform_lanczos not implemented for non-hermitian Hamiltonians!")
        end if

        call perform_lanczos(lanczosCalc, det_list, nexcit, parallel_sparse_hamil_type, .true.)

        if (iProcIndex == root) then
            evals = lanczosCalc%eigenvalues(1:nexcit)
            evecs = lanczosCalc%eigenvectors(1:lanczosCalc%super%space_size, 1:nexcit)

            ! For consistency between compilers, enforce a rule for the sign of
            ! the eigenvector. To do this, make sure that the largest component
            ! of each vector is positive. The largest component is found using
            ! maxloc. If there are multiple determinants with the same weight
            ! then the FORTRAN standard says that the first such component will
            ! be used, so this will hopefully be consistent across compilers.
            ! To avoid numerical errors between compilers for such elements, we
            ! also round the array components to integers. Because evecs will
            ! be normalised, also multiply by 100000 before rounding.
            do i = 1, nexcit
                ! First find the maximum element.
                evec_abs = nint(100000.0_dp * abs(evecs(:, i)))
                max_elem_ind = maxloc(evec_abs)
                if (real(evecs(max_elem_ind(1), i), dp) < 0.0_dp) then
                    evecs(:, i) = -evecs(:, i)
                end if
            end do

            deallocate(det_list)
            deallocate(evec_abs)

            ! Reorder the trial states, if the user has asked for this to be done.
            if (present(reorder)) then
                temp_reorder = reorder
                call sort(temp_reorder, evals)
                temp_reorder = reorder
                call sort(temp_reorder, evecs)
            end if

            ! Unfortunately to perform the MPIScatterV call we need the transpose
            ! of the eigenvector array.
            safe_malloc(evecs_transpose, (nexcit, ndets_all_procs))
            evecs_transpose = transpose(evecs)
        else
            safe_free(ilut_list)
        end if

        call MPIBCast(evals, size(evals), root)

        ndets_this_proc_mpi = space_sizes(iProcIndex)
        ! The number of elements to send and receive in the MPI call, and the
        ! displacements.
        sndcnts = space_sizes * int(nexcit, MPIArg)
        rcvcnts = ndets_this_proc_mpi * int(nexcit, MPIArg)
        displs = space_displs * int(nexcit, MPIArg)

        ! Send the components to the correct processors using the following
        ! array as temporary space.
        allocate(evecs_this_proc(nexcit, ndets_this_proc))
        call MPIScatterV(evecs_transpose, sndcnts, displs, evecs_this_proc, rcvcnts, ierr)
        if (ierr /= 0) call stop_all(this_routine, "Error in MPIScatterV call.")

        ! Clean up.
        if (iProcIndex == root) deallocate(evecs)
        safe_free(evecs_transpose)
        call DestroyLanczosCalc(lanczosCalc)
        safe_free(sparse_ham)

    end subroutine calc_trial_states_lanczos

    subroutine calc_trial_states_direct(space_in, nexcit, ndets_this_proc, trial_iluts, evecs_this_proc, evals, &
                                        space_sizes, space_displs, reorder)
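        ! As calc_trial_states_lanczos, but the trial-space Hamiltonian is
        ! diagonalised explicitly on the root process: with zheev for complex
        ! builds, with a dense eig call for non-hermitian real builds, and
        ! with frsblk_wrapper otherwise. The first nexcit eigenvectors are
        ! then scattered back to the processors holding the corresponding
        ! determinants.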

        use bit_reps, only: decode_bit_det
        use CalcData, only: subspace_in
        use DetBitOps, only: ilut_lt, ilut_gt
        use FciMCData, only: ilutHF, CurrentDets, TotWalkers
        use Parallel_neci, only: MPIScatterV, MPIGatherV, MPIBCast, MPIArg, iProcIndex
        use Parallel_neci, only: nProcessors
        use MPI_wrapper, only: root
        use semi_stoch_gen
        use sort_mod, only: sort
        use SystemData, only: nel, tAllSymSectors
        use lanczos_wrapper, only: frsblk_wrapper
        use guga_matrixElements, only: calc_guga_matrix_element
        use guga_data, only: ExcitationInformation_t

        type(subspace_in) :: space_in
        integer, intent(in) :: nexcit
        integer, intent(out) :: ndets_this_proc
        integer(n_int), intent(out) :: trial_iluts(0:, :)
        HElement_t(dp), allocatable, intent(out) :: evecs_this_proc(:, :)
        real(dp), intent(out) :: evals(:)
        integer(MPIArg), intent(out) :: space_sizes(0:nProcessors - 1), space_displs(0:nProcessors - 1)
        integer, optional, intent(in) :: reorder(nexcit)

        integer(n_int), allocatable :: ilut_list(:, :)
        integer, allocatable :: det_list(:, :)
        integer :: i, j, max_elem_ind(1), ierr, info, ndets_int
        integer :: temp_reorder(nexcit)
        integer(MPIArg) :: ndets_all_procs, ndets_this_proc_mpi
        integer(MPIArg) :: sndcnts(0:nProcessors - 1), displs(0:nProcessors - 1)
        integer(MPIArg) :: rcvcnts
        integer, allocatable :: evec_abs(:)
        HElement_t(dp), allocatable :: H_tmp(:, :), evecs_all(:, :)
        HElement_t(dp), allocatable :: evecs(:, :), evecs_transpose(:, :)
        HElement_t(dp), allocatable :: work(:)
        real(dp), allocatable :: evals_all(:), rwork(:)
        character(len=*), parameter :: this_routine = "calc_trial_states_direct"
        type(ExcitationInformation_t) :: excitInfo

        ndets_this_proc = 0
        trial_iluts = 0_n_int

        ! Choose the correct generating routine.
        if (space_in%tHF) call add_state_to_space(ilutHF, trial_iluts, ndets_this_proc)
        if (space_in%tPops) call generate_space_most_populated(space_in%npops, &
                                                     space_in%tApproxSpace, space_in%nApproxSpace, trial_iluts, ndets_this_proc, GLOBAL_RUN)
        if (space_in%tRead) call generate_space_from_file(space_in%read_filename, trial_iluts, ndets_this_proc)
        if (space_in%tDoubles) then
            if (tGUGA) then
                call generate_sing_doub_guga(trial_iluts, ndets_this_proc, .false.)
            else
                call generate_sing_doub_determinants(trial_iluts, ndets_this_proc, .false.)
            end if
        end if
        if (space_in%tCAS) call generate_cas(space_in%occ_cas, space_in%virt_cas, trial_iluts, ndets_this_proc)
        if (space_in%tRAS) call generate_ras(space_in%ras, trial_iluts, ndets_this_proc)
        if (space_in%tOptimised) call generate_optimised_space(space_in%opt_data, space_in%tLimitSpace, &
                                                               trial_iluts, ndets_this_proc, space_in%max_size)
        if (space_in%tMP1) call generate_using_mp1_criterion(space_in%mp1_ndets, trial_iluts, ndets_this_proc)
        if (space_in%tFCI) then
            if (tAllSymSectors) then
                call gndts_all_sym_this_proc(trial_iluts, .true., ndets_this_proc)
            else
                call generate_fci_core(trial_iluts, ndets_this_proc)
            end if
        end if

        if (.not. (space_in%tPops .or. space_in%tRead .or. space_in%tDoubles .or. space_in%tCAS .or. &
                   space_in%tRAS .or. space_in%tOptimised .or. space_in%tMP1 .or. space_in%tFCI)) then
            call stop_all(this_routine, "A space for the trial functions was not chosen.")
        end if

        ndets_this_proc_mpi = int(ndets_this_proc, MPIArg)
        call MPIAllGather(ndets_this_proc_mpi, space_sizes, ierr)
        ndets_all_procs = sum(space_sizes)

        if (ndets_all_procs < nexcit) call stop_all(this_routine, "The number of excited states that you have asked &
            &for is larger than the size of the trial space used to create the excited states. Since this &
            &routine generates trial states that are orthogonal, this is not possible.")

        space_displs(0) = 0_MPIArg
        do i = 1, nProcessors - 1
            space_displs(i) = space_displs(i - 1) + space_sizes(i - 1)
        end do

        ! [W.D. 15.5.2017:]
        ! Does the sort behave differently depending on the compiler,
        ! since the reference outputs differ between compilers?

        call sort(trial_iluts(:, 1:ndets_this_proc), ilut_lt, ilut_gt)

        if (iProcIndex == root) then
            allocate(ilut_list(0:NIfTot, ndets_all_procs))
        else
            ! On these other processes ilut_list and evecs_transpose are not
            ! needed, but we need them to be allocated for the MPI wrapper
            ! function to work, so just allocate them to be small.
            allocate(ilut_list(0:niftot, 1))
            allocate(evecs_transpose(1, 1))
        end if

        call MPIGatherV(trial_iluts(0:niftot, 1:space_sizes(iProcIndex)), ilut_list(0:NIfTot, :), &
                        space_sizes, space_displs, ierr)

        ! Only perform the diagonalisation on the root process.
        if (iProcIndex == root) then

            allocate(det_list(nel, ndets_all_procs))

            do i = 1, ndets_all_procs
                call decode_bit_det(det_list(:, i), ilut_list(:, i))
            end do

            allocate(evecs(ndets_all_procs, nexcit), stat=ierr)
            if (ierr /= 0) call stop_all(this_routine, "Error allocating eigenvectors array.")
            evecs = 0.0_dp

            allocate(evec_abs(ndets_all_procs), stat=ierr)
            if (ierr /= 0) call stop_all(this_routine, "Error allocating evec_abs array.")
            evec_abs = 0

            ! [W.D.]
            ! This is where the change from the previous implementation comes
            ! in: previously frsblk_wrapper was called here. Try to bring that
            ! back? Yes, that was the problem, so bring it back, at least for
            ! non-complex problems.

            ! Perform a direct diagonalisation in the trial space.

#ifdef CMPLX_
            ! First build the Hamiltonian matrix.
            ndets_int = int(ndets_all_procs)
            allocate(H_tmp(ndets_all_procs, ndets_all_procs), stat=ierr)
            if (ierr /= 0) call stop_all(this_routine, "Error allocating H_tmp array")
            H_tmp = 0.0_dp

            do i = 1, ndets_all_procs
                ! diagonal elements
                if (tHPHF) then
                    H_tmp(i, i) = hphf_diag_helement(det_list(:, i), ilut_list(:, i))
                else
                    H_tmp(i, i) = get_helement(det_list(:, i), det_list(:, i), 0)
                end if
                ! off diagonal elements
                block
                    type(CSF_Info_t) :: csf_i
                    if (tGUGA) csf_i = CSF_Info_t(ilut_list(:, i))
                    do j = 1, i - 1
                        if (tHPHF) then
                            H_tmp(i, j) = hphf_off_diag_helement(det_list(:, i), det_list(:, j), ilut_list(:, i), &
                                                                 ilut_list(:, j))
                        else if (tGUGA) then
                            call calc_guga_matrix_element(ilut_list(:, i), csf_i, &
                                                          ilut_list(:, j), CSF_Info_t(ilut_list(:, j)), excitInfo, H_tmp(j, i), .true.)
                        else
                            H_tmp(i, j) = get_helement(det_list(:, i), det_list(:, j), ilut_list(:, i), ilut_list(:, j))
                        end if
                    end do
                end block
            end do
            ! The H matrix is ready. Now diagonalise it.
            allocate(work(3 * ndets_all_procs), stat=ierr)
            work = 0.0_dp
            allocate(evals_all(ndets_all_procs), stat=ierr)
            evals_all = 0.0_dp

            allocate(rwork(3 * ndets_all_procs), stat=ierr)
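            ! zheev('V', 'L', ...) references only the lower triangle filled
            ! above, returns the eigenvalues in ascending order in evals_all
            ! and overwrites H_tmp with the corresponding orthonormal
            ! eigenvectors.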
            call zheev('V', 'L', ndets_int, H_tmp, ndets_int, evals_all, work, 3 * ndets_int, rwork, info)
            deallocate(rwork)
            ! Copy the first nexcit eigenvectors (columns of H_tmp) into evecs
            ! and keep only the first nexcit entries of evals_all.
            do i = 1, nexcit
                evals(i) = evals_all(i)
                do j = 1, ndets_all_procs
                    evecs(j, i) = H_tmp(j, i)
                end do
            end do
            deallocate(evals_all)
            deallocate(work)

            deallocate(H_tmp)
            deallocate(ilut_list)
#else

            ! Should we switch here if the Hamiltonian is not hermitian?
            if (t_non_hermitian_2_body) then
                ASSERT(.not. tGUGA)
                ndets_int = int(ndets_all_procs)
                allocate(H_tmp(ndets_all_procs, ndets_all_procs), stat=ierr)
                if (ierr /= 0) call stop_all(this_routine, "Error allocating H_tmp array")
                H_tmp = 0.0_dp

                do i = 1, ndets_all_procs
                    ! diagonal elements
                    if (tHPHF) then
                        H_tmp(i, i) = hphf_diag_helement(det_list(:, i), ilut_list(:, i))
                    else
                        H_tmp(i, i) = get_helement(det_list(:, i), det_list(:, i), 0)
                    end if
                    ! off diagonal elements
                    ! We have to loop over all determinants now (the full
                    ! matrix, since the Hamiltonian is non-hermitian).
                    do j = 1, ndets_all_procs
                        if (i == j) cycle
                        if (tHPHF) then
                            H_tmp(j, i) = hphf_off_diag_helement(det_list(:, i), &
                                                                 det_list(:, j), ilut_list(:, i), ilut_list(:, j))
                        else
                            H_tmp(j, i) = get_helement(det_list(:, i), &
                                                       det_list(:, j), ilut_list(:, i), ilut_list(:, j))
                        end if
                    end do
                end do

                allocate(evals_all(ndets_all_procs), stat=ierr)
                evals_all = 0.0_dp
                allocate(evecs_all(ndets_all_procs, ndets_all_procs), stat=ierr)
                evecs_all = 0.0_dp

                ! I think we need the left eigenvectors for the trial
                ! projection if the Hamiltonian is non-hermitian.. apparently
                ! not.. maybe not with the (j,i) convention above.. confusing
                call eig(H_tmp, evals_all, evecs_all, .false.)

                evals = evals_all(1:nexcit)
                evecs = evecs_all(:, 1:nexcit)

                deallocate(H_tmp)
                deallocate(evecs_all)

            else
                call frsblk_wrapper(det_list, int(ndets_all_procs), &
                                    nexcit, evals, evecs)
            end if

#endif
            ! For consistency between compilers, enforce a rule for the sign of
            ! the eigenvector. To do this, make sure that the largest component
            ! of each vector is positive. The largest component is found using
            ! maxloc. If there are multiple determinants with the same weight
            ! then the FORTRAN standard says that the first such component will
            ! be used, so this will hopefully be consistent across compilers.
            ! To avoid numerical errors between compilers for such elements, we
            ! also round the array components to integers. Because evecs will
            ! be normalised, also multiply by 100000 before rounding.
            do i = 1, nexcit
                ! First find the maximum element.
                evec_abs = nint(100000.0_dp * abs(evecs(:, i)))
                max_elem_ind = maxloc(evec_abs)
                if (Real(evecs(max_elem_ind(1), i), dp) < 0.0_dp) then
                    evecs(:, i) = -evecs(:, i)
                end if
            end do

            deallocate(det_list)
            deallocate(evec_abs)

            ! Reorder the trial states, if the user has asked for this to be done.
            if (present(reorder)) then
                temp_reorder = reorder
                call sort(temp_reorder, evals)
                temp_reorder = reorder
                call sort(temp_reorder, evecs)
            end if

            ! Unfortunately to perform the MPIScatterV call we need the transpose
            ! of the eigenvector array.
            allocate(evecs_transpose(nexcit, ndets_all_procs), stat=ierr)
            if (ierr /= 0) call stop_all(this_routine, "Error allocating transposed eigenvectors array.")
            evecs_transpose = transpose(evecs)
        else
            deallocate(ilut_list)
        end if

        call MPIBCast(evals, size(evals), root)

        ndets_this_proc_mpi = space_sizes(iProcIndex)
        ! The number of elements to send and receive in the MPI call, and the
        ! displacements.
        sndcnts = space_sizes * int(nexcit, MPIArg)
        rcvcnts = ndets_this_proc_mpi * int(nexcit, MPIArg)
        displs = space_displs * int(nexcit, MPIArg)

        ! Send the components to the correct processors using the following
        ! array as temporary space.

        allocate(evecs_this_proc(nexcit, ndets_this_proc), stat=ierr)
        call MPIScatterV(evecs_transpose, sndcnts, displs, evecs_this_proc, rcvcnts, ierr)
        if (ierr /= 0) call stop_all(this_routine, "Error in MPIScatterV call.")

        ! Clean up.
        if (iProcIndex == root) deallocate(evecs)
        deallocate(evecs_transpose)

    end subroutine calc_trial_states_direct

    subroutine calc_trial_states_qmc(space_in, nexcit, qmc_iluts, qmc_ht, paired_replicas, ndets_this_proc, &
                                     trial_iluts, evecs_this_proc, space_sizes, space_displs)
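        ! Generate the requested trial space, but take the trial amplitudes
        ! from the current QMC walker populations (qmc_iluts with hash table
        ! qmc_ht) via get_qmc_trial_weights instead of diagonalising.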

        use CalcData, only: subspace_in
        use DetBitOps, only: ilut_lt, ilut_gt
        use FciMCData, only: ilutHF, CurrentDets, TotWalkers
        use Parallel_neci, only: nProcessors
        use semi_stoch_gen
        use sort_mod, only: sort
        use SystemData, only: tAllSymSectors

        type(subspace_in) :: space_in
        integer, intent(in) :: nexcit
        integer(n_int), intent(in) :: qmc_iluts(0:, :)
        type(ll_node), pointer, intent(inout) :: qmc_ht(:)
        logical, intent(in) :: paired_replicas
        integer, intent(out) :: ndets_this_proc
        integer(n_int), intent(out) :: trial_iluts(0:, :)
        real(dp), allocatable, intent(out) :: evecs_this_proc(:, :)
        integer(MPIArg), intent(out) :: space_sizes(0:nProcessors - 1), space_displs(0:nProcessors - 1)

        integer(n_int), allocatable :: ilut_list(:, :)
        integer, allocatable :: det_list(:, :)
        integer :: i, j, ierr
        integer(MPIArg) :: ndets_all_procs, ndets_this_proc_mpi
        character(len=*), parameter :: this_routine = "calc_trial_states_qmc"

        if (paired_replicas) then
            ASSERT(nexcit == (lenof_sign.div.2))
        else
            ASSERT(nexcit == lenof_sign)
        end if

        ndets_this_proc = 0
        trial_iluts = 0_n_int

        ! Choose the correct generating routine.
        if (space_in%tHF) call add_state_to_space(ilutHF, trial_iluts, ndets_this_proc)
        if (space_in%tPops) call generate_space_most_populated(space_in%npops, &
                                                     space_in%tApproxSpace, space_in%nApproxSpace, trial_iluts, ndets_this_proc, GLOBAL_RUN)
        if (space_in%tRead) call generate_space_from_file(space_in%read_filename, trial_iluts, ndets_this_proc)
        if (space_in%tDoubles) then
            if (tGUGA) then
                call generate_sing_doub_guga(trial_iluts, ndets_this_proc, .false.)
            else
                call generate_sing_doub_determinants(trial_iluts, ndets_this_proc, .false.)
            end if
        end if
        if (space_in%tCAS) call generate_cas(space_in%occ_cas, space_in%virt_cas, trial_iluts, ndets_this_proc)
        if (space_in%tRAS) call generate_ras(space_in%ras, trial_iluts, ndets_this_proc)
        if (space_in%tOptimised) call generate_optimised_space(space_in%opt_data, space_in%tLimitSpace, &
                                                               trial_iluts, ndets_this_proc, space_in%max_size)
        if (space_in%tMP1) call generate_using_mp1_criterion(space_in%mp1_ndets, trial_iluts, ndets_this_proc)
        if (space_in%tFCI) then
            if (tAllSymSectors) then
                call gndts_all_sym_this_proc(trial_iluts, .true., ndets_this_proc)
            else
                call generate_fci_core(trial_iluts, ndets_this_proc)
            end if
        end if

        if (.not. (space_in%tPops .or. space_in%tRead .or. space_in%tDoubles .or. space_in%tCAS .or. &
                   space_in%tRAS .or. space_in%tOptimised .or. space_in%tMP1 .or. space_in%tFCI)) then
            call stop_all(this_routine, "A space for the trial functions was not chosen.")
        end if

        ndets_this_proc_mpi = int(ndets_this_proc, MPIArg)
        call MPIAllGather(ndets_this_proc_mpi, space_sizes, ierr)
        ndets_all_procs = sum(space_sizes)

        space_displs(0) = 0_MPIArg
        do i = 1, nProcessors - 1
            space_displs(i) = space_displs(i - 1) + space_sizes(i - 1)
        end do

        call sort(trial_iluts(:, 1:ndets_this_proc), ilut_lt, ilut_gt)

        allocate(evecs_this_proc(nexcit, ndets_this_proc), stat=ierr)

        call get_qmc_trial_weights(trial_iluts, ndets_this_proc, qmc_iluts, qmc_ht, nexcit, paired_replicas, evecs_this_proc)

    end subroutine calc_trial_states_qmc

    subroutine get_qmc_trial_weights(trial_iluts, ntrial, qmc_iluts, qmc_ht, nexcit, paired_replicas, evecs_this_proc)
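        ! Look up each trial determinant in the QMC hash table and use its
        ! walker sign as the trial amplitude, averaging over each replica
        ! pair if paired_replicas is set; determinants not found in the QMC
        ! list get amplitude zero. Finally normalise each trial vector with
        ! its norm summed over all processors.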

        use bit_reps, only: decode_bit_det
        use hash, only: hash_table_lookup
        use Parallel_neci, only: MPISumAll
        use SystemData, only: nel

        integer(n_int), intent(in) :: trial_iluts(0:, :)
        integer, intent(in) :: ntrial
        integer(n_int), intent(in) :: qmc_iluts(0:, :)
        type(ll_node), pointer, intent(inout) :: qmc_ht(:)
        integer, intent(in) :: nexcit
        logical, intent(in) :: paired_replicas
        real(dp), intent(out) :: evecs_this_proc(:, :)

        integer :: i, j, ind, hash_val
        integer :: nI(nel)
        real(dp) :: qmc_sign(lenof_sign), trial_sign(nexcit)
        real(dp) :: norm(nexcit), tot_norm(nexcit)
        logical :: found
        character(*), parameter :: this_routine = 'get_qmc_trial_weights'

        if (paired_replicas) then
            ASSERT(nexcit == (lenof_sign.div.2))
        else
            ASSERT(nexcit == lenof_sign)
        end if

        norm = 0.0_dp

        ! Loop over all trial states.
        do i = 1, ntrial
            ! Check the QMC hash table to see if this state exists in the
            ! QMC list.
            call decode_bit_det(nI, trial_iluts(:, i))
            call hash_table_lookup(nI, trial_iluts(:, i), nifd, qmc_ht, &
                                   qmc_iluts, ind, hash_val, found)
            if (found) then
                call extract_sign(qmc_iluts(:, ind), qmc_sign)

                ! If using paired replicas then average the sign on each pair.
                if (paired_replicas) then
                    do j = 2, lenof_sign, 2
                        trial_sign(j / 2) = sum(qmc_sign(j - 1:j) / 2.0_dp)
                    end do
                else
                    trial_sign = qmc_sign
                end if

                norm = norm + trial_sign**2
                evecs_this_proc(:, i) = trial_sign
            else
                evecs_this_proc(:, i) = 0.0_dp
            end if
        end do

        call MPISumAll(norm, tot_norm)

        do i = 1, ntrial
            evecs_this_proc(:, i) = evecs_this_proc(:, i) / sqrt(tot_norm)
        end do

    end subroutine get_qmc_trial_weights

    subroutine set_trial_populations(nexcit, ndets_this_proc, trial_vecs)
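        ! Rescale each trial vector so that its total absolute population,
        ! summed over all processors, equals InitialPart (if
        ! tStartSinglePart) or InitWalkers otherwise.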

        use CalcData, only: InitialPart, InitWalkers, tStartSinglePart
        use Parallel_neci, only: MPISumAll

        integer, intent(in) :: nexcit, ndets_this_proc
        HElement_t(dp), intent(inout) :: trial_vecs(:, :)

        real(dp) :: eigenvec_pop, tot_eigenvec_pop
        integer :: i, j

        ! We need to normalise all of the vectors to have the correct number of
        ! walkers.

        do j = 1, nexcit
            eigenvec_pop = 0.0_dp
            do i = 1, ndets_this_proc
                eigenvec_pop = eigenvec_pop + abs(trial_vecs(j, i))
            end do

            call MPISumAll(eigenvec_pop, tot_eigenvec_pop)

            if (tStartSinglePart) then
                trial_vecs(j, :) = trial_vecs(j, :) * InitialPart / tot_eigenvec_pop
            else
                trial_vecs(j, :) = trial_vecs(j, :) * InitWalkers / tot_eigenvec_pop
            end if
        end do

    end subroutine set_trial_populations

    subroutine set_trial_states(ndets_this_proc, init_vecs, trial_iluts, &
                                semistoch_started, paired_replicas)
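        ! Copy the trial determinants and amplitudes into CurrentDets,
        ! rebuild the walker hash table, re-add core-space determinants if a
        ! semi-stochastic calculation has already started, refresh the
        ! current trial amplitudes if a trial wavefunction is in use, and
        ! recompute the stored diagonal Hamiltonian elements and the global
        ! statistics.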

        use bit_reps, only: encode_sign
        use CalcData, only: tSemiStochastic, tTrialWavefunction
        use FciMCData, only: CurrentDets, TotWalkers, HashIndex, nWalkerHashes
        use replica_data, only: set_initial_global_data
        use hash, only: clear_hash_table, fill_in_hash_table
        use semi_stoch_procs, only: fill_in_diag_helements, copy_core_dets_to_spawnedparts
        use semi_stoch_procs, only: add_core_states_currentdet_hash, reinit_current_trial_amps

        integer, intent(in) :: ndets_this_proc
        HElement_t(dp), intent(in) :: init_vecs(:, :)
        integer(n_int), intent(in) :: trial_iluts(0:, :)
        logical, intent(in) :: semistoch_started
        logical, intent(in), optional :: paired_replicas

        real(dp) :: real_sign(lenof_sign)
        integer :: i, j
        logical :: paired_reps_local

        if (present(paired_replicas)) then
            paired_reps_local = paired_replicas
        else
            paired_reps_local = .true.
        end if

        ! Now copy the amplitudes across to the CurrentDets array:
        ! First, get the correct states in CurrentDets.
        CurrentDets(0:NIfTot, 1:ndets_this_proc) = 0_n_int
        CurrentDets(0:nifd, 1:ndets_this_proc) = trial_iluts(0:nifd, 1:ndets_this_proc)

        ! Set signs.
        do i = 1, ndets_this_proc
            ! Construct the sign array to be encoded.
            if (paired_reps_local) then
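                ! Both replicas in a pair receive the same amplitude.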
                do j = 2, lenof_sign, 2
                    real_sign(j - 1:j) = init_vecs(j / 2, i)
                end do
            else
                do j = 1, inum_runs
                    real_sign(min_part_type(j):max_part_type(j)) = h_to_array(init_vecs(j, i))
                end do
            end if
            call encode_sign(CurrentDets(:, i), real_sign)
        end do

        TotWalkers = int(ndets_this_proc, int64)

        ! Reset and fill in the hash table.
        call clear_hash_table(HashIndex)
        call fill_in_hash_table(HashIndex, nWalkerHashes, CurrentDets, ndets_this_proc, .true.)

        if (tSemiStochastic .and. semistoch_started) then
            ! core_space stores all core determinants from all processors. Move those on this
            ! processor to SpawnedParts, which add_core_states_currentdet_hash uses.
            call copy_core_dets_to_spawnedparts(cs_replicas(core_run))
            ! Any core space determinants which are not already in CurrentDets will be added
            ! by this routine.
            call add_core_states_currentdet_hash(core_run)
        end if

        if (tTrialWavefunction) call reinit_current_trial_amps()

        ! Calculate and store the diagonal elements of the Hamiltonian for
        ! determinants in CurrentDets.
        call fill_in_diag_helements()

        call set_initial_global_data(TotWalkers, CurrentDets)

    end subroutine set_trial_states

end module initial_trial_states