#include "macros.h" ! Some general procedures, created for the semi-stochastic (and trial wavefunction) code. ! Some are just used for initialisation and others in the main FCIQMC loop. module semi_stoch_procs use bit_rep_data, only: flag_deterministic, NIfD, NIfTot, test_flag, & test_flag_multi, IlutBits, extract_sign, & flag_connected, flag_trial use bit_reps, only: decode_bit_det, get_initiator_flag_by_run, & set_flag, encode_sign use calcrho_mod, only: igetexcitlevel use mpi use CalcData use constants use util_mod, only: stop_all, neci_flush, warning_neci use orb_idx_mod, only: SpinOrbIdx_t use FciMCData, only: SpawnedParts, TotWalkers, CurrentDets, & MaxSpawned, ilutRef, & t_global_core_space, core_run, Hii use core_space_util, only: core_space_t, cs_replicas, min_pt, max_pt, & deallocate_sparse_ham use Parallel_neci, only: iProcIndex, nProcessors, MPIArg use SystemData, only: nel, tHPHF, tGUGA, t_non_hermitian_2_body use procedure_pointers, only: shiftScaleFunction use hphf_integrals, only: hphf_diag_helement, hphf_off_diag_helement use Determinants, only: get_helement use sparse_arrays, only: approx_ham use procedure_pointers, only: shiftFactorFunction use timing_neci use hamiltonian_linalg, only: parallel_sparse_hamil_type use davidson_neci, only: DavidsonCalcType, perform_davidson, DestroyDavidsonCalc use FciMCData, only: DavidsonTag use MemoryManager, only: LogMemAlloc, LogMemDealloc, TagIntType use Parallel_neci, only: MPIScatterV use MPI_wrapper, only: root #ifndef IFORT_ use MPI_wrapper, only: MPI_IN_PLACE, MPI_INTEGER, MPI_INTEGER8, & MPI_COMM_SIZE, MPI_Win_Sync, MPI_Barrier, MPI_2DOUBLE_PRECISION, & MPI_MAXLOC #endif use sparse_arrays, only: sparse_ham, hamil_diag, HDiagTag use shared_rhash, only: shared_rhash_t, shared_rht_lookup use sparse_arrays, only: SparseHamilTags, allocate_sparse_ham_row use matrix_util, only: print_matrix, eig use adi_data, only: tSignedRepAv use global_det_data, only: set_supergroup_idx use gasci_supergroup_index, only: lookup_supergroup_indexer use LoggingData, only: tAccumPopsActive use gdata_io, only: gdata_io_t use LoggingData, only: t_print_core_info use shared_memory_mpi, only: shared_allocate_mpi, shared_deallocate_mpi use guga_bitrepops, only: fill_csf_i, CSF_Info_t use util_mod, only: get_free_unit use DeterminantData, only: write_det use matel_getter, only: get_diagonal_matel, get_off_diagonal_matel use tau_main, only: tau better_implicit_none ! Distinguishing value for 'use all runs' integer, parameter :: GLOBAL_RUN = -45 contains subroutine determ_projection() ! This subroutine gathers together partial_determ_vecs from each processor so ! that the full vector for the whole deterministic space is stored on each processor. ! It then performs the deterministic multiplication of the projector on this full vector. use FciMCData, only: SemiStoch_Comms_Time use FciMCData, only: SemiStoch_Multiply_Time use Parallel_neci, only: MPIBarrier, MPIAllGatherV use DetBitOps, only: DetBitEQ integer :: run integer :: i, j, part_type, c_run integer :: ierr integer(MPIArg) :: MPIerr real(dp) :: scaledDiagSft(inum_runs) do run = 1, size(cs_replicas) associate(rep => cs_replicas(run)) call MPIBarrier(ierr) call set_timer(SemiStoch_Comms_Time) call MPIAllGatherV(rep%partial_determ_vecs, rep%full_determ_vecs, & rep%determ_sizes, rep%determ_displs) call halt_timer(SemiStoch_Comms_Time) call set_timer(SemiStoch_Multiply_Time) if (rep%determ_sizes(iProcIndex) >= 1) then ! For the moment, we're only adding in these contributions when we need the energy ! This will need refinement if we want to continue with the option of inst vs true full RDMs ! (as in another CMO branch). ! Perform the multiplication. rep%partial_determ_vecs = 0.0_dp #ifdef CMPLX_ block integer :: r_pt, i_pt do i = 1, rep%determ_sizes(iProcIndex) do j = 1, rep%sparse_core_ham(i)%num_elements do r_pt = rep%min_part(), rep%max_part(), 2 i_pt = r_pt + 1 rep%partial_determ_vecs(r_pt, i) = rep%partial_determ_vecs(r_pt, i) & - Real(rep%sparse_core_ham(i)%elements(j)) & * rep%full_determ_vecs(r_pt, rep%sparse_core_ham(i)%positions(j)) & + Aimag(rep%sparse_core_ham(i)%elements(j)) & * rep%full_determ_vecs(i_pt, rep%sparse_core_ham(i)%positions(j)) rep%partial_determ_vecs(i_pt, i) = rep%partial_determ_vecs(i_pt, i) & - Aimag(rep%sparse_core_ham(i)%elements(j)) & * rep%full_determ_vecs(r_pt, rep%sparse_core_ham(i)%positions(j)) & - Real(rep%sparse_core_ham(i)%elements(j)) & * rep%full_determ_vecs(i_pt, rep%sparse_core_ham(i)%positions(j)) end do end do end do end block #else do i = 1, rep%determ_sizes(iProcIndex) do j = 1, rep%sparse_core_ham(i)%num_elements rep%partial_determ_vecs(:, i) = rep%partial_determ_vecs(:, i) & - rep%sparse_core_ham(i)%elements(j) & * rep%full_determ_vecs(:, rep%sparse_core_ham(i)%positions(j)) end do end do #endif ! Now add shift*full_determ_vecs to account for the shift, not stored in ! sparse_core_ham. #ifdef CMPLX_ do i = 1, rep%determ_sizes(iProcIndex) ! Only scale the shift for the corespace when the option is set if (tCoreAdaptiveShift .and. tAdaptiveShift) then ! scale the shift using the abs of this run's complex coefficient scaledDiagSft(run) = & shiftFactorFunction( & rep%indices_of_determ_states(i), run, & sqrt(rep%full_determ_vecs(& min_pt, i + rep%determ_displs(iProcIndex))**2 + & rep%full_determ_vecs(& max_pt, i + rep%determ_displs(iProcIndex))**2)) & * DiagSft(run) else scaledDiagSft = DiagSft end if do part_type = 1, size(rep%partial_determ_vecs, dim=1) ! Convert the index along partial_determ_vecs into a part_type rep%partial_determ_vecs(part_type, i) = & rep%partial_determ_vecs(part_type, i) + & scaledDiagSft(part_type_to_run(rep%min_part() + part_type - 1)) & * rep%full_determ_vecs(part_type, i + rep%determ_displs(iProcIndex)) end do end do #else do i = 1, rep%determ_sizes(iProcIndex) ! Only scale the shift for the corespace when the option is set if (tCoreAdaptiveShift .and. tAdaptiveShift) then ! Here, translate between positins in the full_determ_vecs ! and replicas if (rep%t_global) then c_run = run else c_run = 1 end if ! get the re-scaled shift accounting for undersampling error scaledDiagSft(run) = DiagSft(run) * shiftFactorFunction( & rep%indices_of_determ_states(i), run, & abs(rep%full_determ_vecs(c_run, i + rep%determ_displs(iProcIndex)))) else scaledDiagSft = DiagSft end if rep%partial_determ_vecs(:, i) = rep%partial_determ_vecs(:, i) & + scaledDiagSft(run) * rep%full_determ_vecs(:, i + rep%determ_displs(iProcIndex)) end do #endif ! Now multiply the vector by tau to get the final projected vector. rep%partial_determ_vecs = rep%partial_determ_vecs * tau do i = 1, rep%determ_sizes(iProcIndex) if (tSkipRef(run) .and. DetBitEQ(CurrentDets(:, rep%indices_of_determ_states(i)), & iLutRef(:, run), nIfD)) then rep%partial_determ_vecs(:, i) = 0.0_dp end if end do end if end associate call halt_timer(SemiStoch_Multiply_Time) end do end subroutine determ_projection subroutine determ_projection_kp_hamil(partial_vecs, full_vecs, rep) use FciMCData, only: SemiStoch_Comms_Time, SemiStoch_Multiply_Time use Parallel_neci, only: MPIBarrier, MPIAllGatherV real(dp), allocatable, intent(inout) :: partial_vecs(:, :) real(dp), allocatable, intent(inout) :: full_vecs(:, :) type(core_space_t), intent(inout) :: rep integer :: i, j, ierr, run call MPIBarrier(ierr) call set_timer(SemiStoch_Comms_Time) call MPIAllGatherV(partial_vecs, full_vecs, rep%determ_sizes, rep%determ_displs) call halt_timer(SemiStoch_Comms_Time) call MPIBarrier(ierr) call set_timer(SemiStoch_Multiply_Time) if (rep%determ_sizes(iProcIndex) >= 1) then ! Start with this because sparse_core_hamil has Hii taken off, but actually we ! don't want the projected Hamiltonian to be relative to the HF determinant. partial_vecs = Hii * full_vecs(:, rep%determ_displs(iProcIndex) + 1: & rep%determ_displs(iProcIndex) + rep%determ_sizes(iProcIndex)) do i = 1, rep%determ_sizes(iProcIndex) do j = 1, rep%sparse_core_ham(i)%num_elements partial_vecs(:, i) = partial_vecs(:, i) & + rep%sparse_core_ham(i)%elements(j) & * full_vecs(:, rep%sparse_core_ham(i)%positions(j)) end do end do end if call halt_timer(SemiStoch_Multiply_Time) end subroutine determ_projection_kp_hamil subroutine determ_projection_no_death() ! This subroutine gathers together partial_determ_vecs from each processor so ! that the full vector for the whole deterministic space is stored on each processor. ! It then performs the deterministic multiplication of the projector on this full vector. use DetBitOps, only: DetBitEQ use FciMCData, only: SemiStoch_Comms_Time use FciMCData, only: SemiStoch_Multiply_Time use Parallel_neci, only: MPIBarrier, MPIAllGatherV integer :: i, j, ierr, run, part_type do run = 1, size(cs_replicas) associate(rep => cs_replicas(run)) call MPIBarrier(ierr) call set_timer(SemiStoch_Comms_Time) call MPIAllGatherV(rep%partial_determ_vecs, rep%full_determ_vecs, & rep%determ_sizes, rep%determ_displs) call halt_timer(SemiStoch_Comms_Time) call set_timer(SemiStoch_Multiply_Time) if (rep%determ_sizes(iProcIndex) >= 1) then ! Perform the multiplication. rep%partial_determ_vecs = 0.0_dp #ifdef CMPLX_ block integer :: r_pt, i_pt do i = 1, rep%determ_sizes(iProcIndex) do j = 1, rep%sparse_core_ham(i)%num_elements do r_pt = rep%min_part(), rep%max_part(), 2 i_pt = r_pt + 1 rep%partial_determ_vecs(r_pt, i) = rep%partial_determ_vecs(r_pt, i) & - Real(rep%sparse_core_ham(i)%elements(j)) & * rep%full_determ_vecs(r_pt, rep%sparse_core_ham(i)%positions(j)) & + Aimag(rep%sparse_core_ham(i)%elements(j)) & * rep%full_determ_vecs(i_pt, rep%sparse_core_ham(i)%positions(j)) rep%partial_determ_vecs(i_pt, i) = rep%partial_determ_vecs(i_pt, i) & - Aimag(rep%sparse_core_ham(i)%elements(j)) & * rep%full_determ_vecs(r_pt, rep%sparse_core_ham(i)%positions(j)) & - Real(rep%sparse_core_ham(i)%elements(j)) & * rep%full_determ_vecs(i_pt, rep%sparse_core_ham(i)%positions(j)) end do end do end do end block #else do i = 1, rep%determ_sizes(iProcIndex) do j = 1, rep%sparse_core_ham(i)%num_elements rep%partial_determ_vecs(:, i) = rep%partial_determ_vecs(:, i) & - rep%sparse_core_ham(i)%elements(j) & * rep%full_determ_vecs(:, rep%sparse_core_ham(i)%positions(j)) end do end do #endif ! Remove contribution from the diagonal elements, since the ! propagator has no diagonal. do i = 1, rep%determ_sizes(iProcIndex) rep%partial_determ_vecs(:, i) = rep%partial_determ_vecs(:, i) & + rep%core_ham_diag(i) & * rep%full_determ_vecs(:, i + rep%determ_displs(iProcIndex)) end do ! Now multiply the vector by tau to get the final projected vector. rep%partial_determ_vecs = & rep%partial_determ_vecs * tau do i = 1, rep%determ_sizes(iProcIndex) if (tSkipRef(run) .and. DetBitEQ(CurrentDets(:, rep%indices_of_determ_states(i)), & iLutRef(:, run), nIfD)) then rep%partial_determ_vecs(:, i) = 0.0_dp end if end do end if call halt_timer(SemiStoch_Multiply_Time) end associate end do end subroutine determ_projection_no_death subroutine determ_proj_approx() ! This subroutine gathers together partial_determ_vecs from each processor so ! that the full vector for the whole deterministic space is stored on each processor. ! It then performs the deterministic multiplication of the projector on this full vector. use FciMCData, only: SemiStoch_Comms_Time use FciMCData, only: SemiStoch_Multiply_Time use Parallel_neci, only: MPIBarrier, MPIAllGatherV use DetBitOps, only: DetBitEQ integer :: i, j, ierr, run, part_type, c_run character(*), parameter :: t_r = "determ_proj_approx" if (.not. t_global_core_space) then call stop_all(t_r, "Cannot do approximate projection with core-space replicas") end if associate(rep => cs_replicas(core_run)) call MPIBarrier(ierr) call set_timer(SemiStoch_Comms_Time) call MPIAllGatherV(rep%partial_determ_vecs, rep%full_determ_vecs, & rep%determ_sizes, rep%determ_displs) call halt_timer(SemiStoch_Comms_Time) call set_timer(SemiStoch_Multiply_Time) if (rep%determ_sizes(iProcIndex) >= 1) then ! For the moment, we're only adding in these contributions when we need the energy ! This will need refinement if we want to continue with the option of inst vs true full RDMs ! (as in another CMO branch). ! Perform the multiplication. rep%partial_determ_vecs = 0.0_dp #ifdef CMPLX_ do i = 1, rep%determ_sizes(iProcIndex) do j = 1, approx_ham(i)%num_elements rep%partial_determ_vecs(min_pt, i) = rep%partial_determ_vecs(min_pt, i) & - Real(approx_ham(i)%elements(j)) & * rep%full_determ_vecs(min_pt, approx_ham(i)%positions(j)) & + Aimag(approx_ham(i)%elements(j)) & * rep%full_determ_vecs(max_pt, approx_ham(i)%positions(j)) rep%partial_determ_vecs(max_pt, i) = rep%partial_determ_vecs(max_pt, i) & - Aimag(approx_ham(i)%elements(j)) & * rep%full_determ_vecs(min_pt, approx_ham(i)%positions(j)) & - Real(approx_ham(i)%elements(j)) & * rep%full_determ_vecs(max_pt, approx_ham(i)%positions(j)) end do end do #else do i = 1, rep%determ_sizes(iProcIndex) do j = 1, approx_ham(i)%num_elements rep%partial_determ_vecs(:, i) = rep%partial_determ_vecs(:, i) & - approx_ham(i)%elements(j) * rep%full_determ_vecs(:, approx_ham(i)%positions(j)) end do end do #endif ! Now add shift*full_determ_vecs to account for the shift, not stored in ! approx_ham. #ifdef CMPLX_ do i = 1, rep%determ_sizes(iProcIndex) do part_type = 1, lenof_sign rep%partial_determ_vecs(part_type, i) = rep%partial_determ_vecs(part_type, i) & + DiagSft(run) * rep%full_determ_vecs(part_type, i + rep%determ_displs(iProcIndex)) end do end do #else do i = 1, rep%determ_sizes(iProcIndex) rep%partial_determ_vecs(:, i) = rep%partial_determ_vecs(:, i) & + DiagSft * rep%full_determ_vecs(:, i + rep%determ_displs(iProcIndex)) end do #endif ! Now multiply the vector by tau to get the final projected vector. rep%partial_determ_vecs = rep%partial_determ_vecs * tau do i = 1, rep%determ_sizes(iProcIndex) do part_type = 1, rep_size if (tSkipRef(run) .and. DetBitEQ(CurrentDets(:, rep%indices_of_determ_states(i)), & iLutRef(:, run), nIfD)) then rep%partial_determ_vecs(part_type, i) = 0.0_dp end if end do end do end if call halt_timer(SemiStoch_Multiply_Time) end associate end subroutine determ_proj_approx subroutine average_determ_vector() use FciMCData, only: Iter, IterRDMStart use FciMCData, only: PreviousCycles use LoggingData, only: RDMEnergyIter real(dp) :: iter_curr, iter_start_av integer :: run ! If this condition is met then RDM energies were added in on the ! previous iteration. We now want to start a new averaging block so ! that the same contributions aren't added in again later. if (mod(Iter + PreviousCycles - IterRDMStart, RDMEnergyIter) == 0) then do run = 1, size(cs_replicas) cs_replicas(run)%full_determ_vecs_av = 0.0_dp end do write(stdout, *) "Reset fdv av at iteration ", iter end if ! The current iteration, converted to a double precision real. iter_curr = real(Iter + PreviousCycles, dp) ! The iteration that this averaging block started on. iter_start_av = real(RDMEnergyIter * ((Iter + PreviousCycles - IterRDMStart) & / RDMEnergyIter) + IterRDMStart, dp) ! Add in the current deterministic vector to the running average. do run = 1, size(cs_replicas) associate(rep => cs_replicas(run)) rep%full_determ_vecs_av = (((iter_curr - iter_start_av) & * rep%full_determ_vecs_av) + rep%full_determ_vecs) / & (iter_curr - iter_start_av + 1.0_dp) end associate end do end subroutine average_determ_vector subroutine print_determ_vec_av(run) integer, intent(in), optional :: run integer :: run_, iunit, i def_default(run_, run, 1) iunit = get_free_unit() open(iunit, file = 'determ_vecs_av', status = 'replace') associate ( rep => cs_replicas(run_)) do i = 1, rep%determ_space_size write(iunit, *) rep%full_determ_vecs_av(1, i) end do end associate close(iunit) end subroutine print_determ_vec_av subroutine print_determ_vec(run) integer, intent(in), optional :: run integer :: run_, iunit, i def_default(run_, run, 1) iunit = get_free_unit() open(iunit, file = 'determ_vecs', status = 'replace') associate ( rep => cs_replicas(run_)) do i = 1, rep%determ_space_size write(iunit, *) rep%full_determ_vecs(1, i) end do end associate close(iunit) end subroutine print_determ_vec !> Check whether an ilut belongs to the core space !> @param[in] ilut ilut we want to check !> @param[in] nI determinant corresponding to this ilut. Redundant, but is passed !! for performance reasons (decoding is expensive and we likely !! already know nI at this point) !> @return t_core true if and only if ilut is in the core space function is_core_state(ilut, nI, run_) result(t_core) use hash, only: FindWalkerHash integer(n_int), intent(in) :: ilut(0:NIfTot) integer, intent(in) :: nI(:) integer, intent(in), optional :: run_ integer :: i, run logical :: t_core def_default(run, run_, core_run) if (t_global_core_space) run = core_run associate(rep => cs_replicas(run)) call shared_rht_lookup(rep%core_ht, ilut, nI, rep%core_space, i, t_core) end associate end function is_core_state !> Check where an ilut is in the core space !> @param[in] ilut ilut we want to check !> @param[in] nI determinant corresponding to this ilut. Redundant, but is passed !! for performance reasons (decoding is expensive and we likely !! already know nI at this point) !> @return pos position of ilut in the core space, 0 if ilut is not in the core space function core_space_pos(ilut, nI, run_) result(pos) integer(n_int), intent(in) :: ilut(0:NIfTot) integer, intent(in) :: nI(:) integer, intent(in), optional :: run_ character(len=*), parameter :: t_r = "core_space_pos" integer :: pos, run logical :: t_core def_default(run, run_, core_run) if (t_global_core_space) run = core_run associate(rep => cs_replicas(run)) call shared_rht_lookup(rep%core_ht, ilut, nI, rep%core_space, pos, t_core) end associate if (pos == 0) then call stop_all(t_r, "State not found in core hash table.") end if end function core_space_pos function check_determ_flag(ilut, run_) result(core_state) integer(n_int), intent(in) :: ilut(0:NIfTot) integer, intent(in), optional :: run_ logical :: core_state integer :: run def_default(run, run_, GLOBAL_RUN) if (t_global_core_space) run = core_run if (tSemiStochastic) then if (run == GLOBAL_RUN) then core_state = test_flag_multi(ilut, flag_deterministic) else core_state = test_flag(ilut, flag_deterministic(run)) end if else core_state = .false. end if end function check_determ_flag subroutine recalc_core_hamil_diag(old_Hii, new_Hii) real(dp) :: old_Hii, new_Hii real(dp) :: Hii_shift integer :: i, j, run ! Only attempt this if we have already performed the semi-stochastic ! initialisation, in which case determ_sizes will have been allocated. do run = 1, size(cs_replicas) associate(rep => cs_replicas(run)) if (allocated(rep%determ_sizes)) then write(stdout, '(a56)') "Recalculating diagonal elements of the core Hamiltonian." Hii_shift = old_Hii - new_Hii do i = 1, rep%determ_sizes(iProcIndex) do j = 1, rep%sparse_core_ham(i)%num_elements if (rep%sparse_core_ham(i)%positions(j) == & i + rep%determ_displs(iProcIndex)) then rep%sparse_core_ham(i)%elements(j) = & rep%sparse_core_ham(i)%elements(j) + Hii_shift end if end do end do rep%core_ham_diag = rep%core_ham_diag + Hii_shift end if end associate end do end subroutine recalc_core_hamil_diag subroutine generate_core_connections(rep) use DetBitOps, only: FindBitExcitLevel, GetBitExcitation use Parallel_neci, only: MPIAllGatherV type(core_space_t), intent(inout) :: rep integer :: i, j, ic, counter, ierr integer :: Ex(2, nel) logical :: tSign integer(n_int), allocatable, dimension(:, :) :: temp_store integer(TagIntType) :: TempStoreTag character(len=*), parameter :: t_r = "calculate_det_hamiltonian_sparse" allocate(rep%core_connections(rep%determ_sizes(iProcIndex))) allocate(temp_store(0:NIfTot, rep%determ_space_size), stat=ierr) call LogMemAlloc('temp_store', maxval(rep%determ_sizes) * (NIfTot + 1), 8, t_r, & TempStoreTag, ierr) ! Stick together the deterministic states from all processors, on all processors. call MPIAllGatherV(SpawnedParts(0:NIfTot, 1:rep%determ_sizes(iProcIndex)), temp_store, & rep%determ_sizes, rep%determ_displs) ! Over all core states on this processor. do i = 1, rep%determ_sizes(iProcIndex) ! The number of non-zero elements in this array will be almost the same as in ! the core Hamiltonian array, except the diagonal element is not considered, ! so there will actually be one less. allocate(rep%core_connections(i)%elements(rep%sparse_core_ham(i)%num_elements - 1)) allocate(rep%core_connections(i)%positions(rep%sparse_core_ham(i)%num_elements - 1)) ! The total number of non-zero elements in row i. ! thats a problem for the RDM sampling! I need not only ! connected states by the Hamiltonian.. because RDMs are more general... rep%core_connections(i)%num_elements = rep%sparse_core_ham(i)%num_elements - 1 counter = 0 do j = 1, rep%sparse_core_ham(i)%num_elements ! If not the diagonal element. if (rep%sparse_core_ham(i)%positions(j) /= i + rep%determ_displs(iProcIndex)) then Ex = 0 Ex(1, 1) = nel counter = counter + 1 ! The positions of the non-zero and non-diagonal elements in this row i. rep%core_connections(i)%positions(counter) = rep%sparse_core_ham(i)%positions(j) ! for the GUGA implementation this has to be changed in the ! future. but since this routine is only called if we calc. ! RDMs on the fly, i can postpone that until then.. todo ic = FindBitExcitLevel(SpawnedParts(:, i), & temp_store(:, rep%sparse_core_ham(i)%positions(j))) call GetBitExcitation(SpawnedParts(0:NIfD, i), temp_store(0:NIfD, & rep%sparse_core_ham(i)%positions(j)), Ex, tSign) if (tSign) then ! Odd number of permutations. Minus the excitation level. rep%core_connections(i)%elements(counter) = -ic else ! Even number of permutations. The excitation level. rep%core_connections(i)%elements(counter) = ic end if end if end do end do deallocate(temp_store, stat=ierr) call LogMemDealloc(t_r, TempStoreTag, ierr) end subroutine generate_core_connections subroutine store_whole_core_space(rep) use FciMCData, only: CoreSpaceTag use MemoryManager, only: LogMemAlloc use Parallel_neci, only: MPIAllGatherV, iProcIndex_inter, iProcIndex_intra, & mpi_comm_inter, mpi_comm_intra, nNodes, NodeLengths, & iNodeIndex, MPIAllGather type(core_space_t), intent(inout) :: rep integer(MPIArg) :: MPIerr integer :: ierr character(len=*), parameter :: t_r = "store_whole_core_space" integer(MPIArg), allocatable :: sizes_this_node(:) integer(MPIArg) :: total_size_this_node integer(MPIArg), allocatable :: node_offsets(:), sizes_per_node(:) integer(MPIArg) :: proc_offset integer(MPIArg) :: core_width integer :: i integer(MPIArg) :: node_size, num_nodes integer(MPIArg) :: global_offset call mpi_comm_size(mpi_comm_intra, node_size, MPIerr) call mpi_comm_size(mpi_comm_inter, num_nodes, MPIerr) allocate(sizes_this_node(0:node_size - 1)) allocate(node_offsets(0:num_nodes - 1)) allocate(sizes_per_node(0:num_nodes - 1)) call shared_allocate_mpi(rep%core_space_win, rep%core_space_direct, & [int(1 + NIfTot, int64), int(rep%determ_space_size, int64)], ierr) ! Convert from 1-based first dimension to 0-based first dimension as used in iluts rep%core_space(0:, 1:) => rep%core_space_direct(1:, 1:) call LogMemAlloc('core_space', maxval(rep%determ_sizes) * (NIfTot + 1), 8, t_r, & rep%CoreSpaceTag, err=ierr) if (iProcIndex_intra == 0) rep%core_space = 0_n_int ! Write the core-space on this node into core_space call MPI_AllGather(rep%determ_sizes(iProcIndex), 1, MPI_INTEGER, & sizes_this_node, 1, MPI_INTEGER, mpi_comm_intra, MPIerr) ! Get the intra-node offset proc_offset = 0 do i = 1, iProcIndex_intra proc_offset = proc_offset + sizes_this_node(i - 1) end do ! Sum the size on this node total_size_this_node = sum(sizes_this_node) call MPI_AllGather(total_size_this_node, 1, MPI_INTEGER, sizes_per_node, 1, MPI_INTEGER, & mpi_comm_inter, MPIerr) ! Get the inter-node offset node_offsets(0) = 0 do i = 1, num_nodes - 1 node_offsets(i) = node_offsets(i - 1) + sizes_per_node(i - 1) end do global_offset = node_offsets(iProcIndex_inter) + proc_offset call MPI_Win_Sync(rep%core_space_win, MPIerr) call MPI_Barrier(mpi_comm_intra, MPIerr) rep%core_space(0:NIfTot, (global_offset + 1): & (global_offset + rep%determ_sizes(iProcIndex))) = & SpawnedParts(0:NIfTot, 1:rep%determ_sizes(iProcIndex)) call MPI_Win_Sync(rep%core_space_win, MPIerr) call MPI_Barrier(mpi_comm_intra, MPIerr) call MPI_Win_Sync(rep%core_space_win, MPIerr) ! Multiply with message width (1+NIfTot) core_width = int(size(rep%core_space, dim=1), MPIArg) node_offsets = node_offsets * core_width total_size_this_node = total_size_this_node * core_width sizes_per_node = sizes_per_node * core_width ! Give explicit limits for SpawnedParts slice, as NIfTot is not nesc. ! equal to NIfBCast. (It may be longer) call MPI_AllGatherV(MPI_IN_PLACE, total_size_this_node, MPI_INTEGER8, rep%core_space, & sizes_per_node, node_offsets, MPI_INTEGER8, mpi_comm_inter, MPIerr) ! And sync the shared window call MPI_Win_Sync(rep%core_space_win, MPIerr) call MPI_Barrier(mpi_comm_intra, MPIerr) ! Communicate the indices in the full vector at which the various processors take over, relative ! to the first index position in the vector (i.e. the array displs in MPI routines). call MPIAllGather(global_offset, rep%determ_displs, ierr) deallocate(sizes_per_node) deallocate(node_offsets) deallocate(sizes_this_node) end subroutine store_whole_core_space subroutine remove_high_energy_orbs(ilut_list, num_states, target_num_states, tParallel) use Parallel_neci, only: MPISumAll use sort_mod, only: sort use SystemData, only: nBasis, BRR, Arr integer, intent(inout) :: num_states integer(n_int), intent(inout) :: ilut_list(0:NIfTot, 1:num_states) integer, intent(in) :: target_num_states logical, intent(in) :: tParallel integer, allocatable, dimension(:) :: orbitals_rmvd integer :: i, j, orb, num_orbs_rmvd integer :: bit, elem integer :: states_rmvd_this_proc, counter integer(MPIArg) :: tot_num_states, states_rmvd_all_procs logical :: occupied states_rmvd_this_proc = 0 num_orbs_rmvd = 0 if (tParallel) then call MPISumAll(int(num_states, MPIArg), tot_num_states) else tot_num_states = int(num_states, MPIArg) end if if (tot_num_states <= target_num_states) return write(stdout, '(a32)') "Removing high energy orbitals..." ! Loop through all orbitals, from highest energy to lowest. do i = nBasis, 1, -1 num_orbs_rmvd = nBasis - i + 1 orb = BRR(i) bit = mod(orb - 1, bits_n_int) elem = (orb - 1 - bit) / bits_n_int ! Loop through all states and remove those states with orbital orb ! occupied. do j = 1, num_states occupied = btest(ilut_list(elem, j), bit) if (occupied) then ilut_list(:, j) = 0_n_int states_rmvd_this_proc = states_rmvd_this_proc + 1 end if end do if (tParallel) then call MPISumAll(int(states_rmvd_this_proc, MPIArg), states_rmvd_all_procs) else states_rmvd_all_procs = int(states_rmvd_this_proc, MPIArg) end if ! If there are degenerate orbitals, then cycle to remove the ! degenerate orbitals too, before giving the program chance to quit. if (i > 1) then ! If the orbitals energies are the same: if (abs(Arr(i, 1) - Arr((i - 1), 1)) < 1.0e-12_dp) cycle end if if (tot_num_states - states_rmvd_all_procs <= target_num_states) exit end do ! Loop through the list and shuffle states down to fill in the gaps ! created above. counter = 0 do i = 1, num_states ! If the state wasn't set to 0: if (.not. all(ilut_list(:, i) == 0_n_int)) then counter = counter + 1 ilut_list(:, counter) = ilut_list(:, i) end if end do if (counter /= num_states - states_rmvd_this_proc) & call stop_all("remove_high_energy_orbs", & "Wrong number of states found.") num_states = counter ! Finally, output information: write(stdout, '(a36)', advance='no') "The following orbitals were removed:" allocate(orbitals_rmvd(num_orbs_rmvd)) orbitals_rmvd = BRR(nBasis - num_orbs_rmvd + 1:nBasis) call sort(orbitals_rmvd) do i = 1, num_orbs_rmvd write(stdout, '(i5)', advance='no') orbitals_rmvd(i) if (i == num_orbs_rmvd) write(stdout, '()', advance='yes') end do deallocate(orbitals_rmvd) write(stdout, '(i6,1x,a29)') states_rmvd_all_procs, "states were removed in total." write(stdout, '(i6,1x,a17)') tot_num_states - states_rmvd_all_procs, "states were kept." call neci_flush(stdout) end subroutine remove_high_energy_orbs subroutine sort_states_by_energy(ilut_list, num_states, tSortDoubles) ! Note: If requested, keep all doubles at the top, then sort by energy. use DetBitOps, only: FindBitExcitLevel use FciMCData, only: ilutHF use sort_mod, only: sort integer, intent(in) :: num_states integer(n_int), intent(inout) :: ilut_list(0:NIfTot, 1:num_states) logical, intent(in) :: tSortDoubles integer(n_int) :: temp_ilut(0:NIfTot) integer :: nI(nel) integer :: i, excit_level, num_sing_doub, block_size real(dp) :: energies(num_states) do i = 1, num_states call decode_bit_det(nI, ilut_list(:, i)) if (tHPHF) then energies(i) = hphf_diag_helement(nI, ilut_list(:, i)) else energies(i) = get_helement(nI, nI, 0) end if end do ! Sort the states in order of energy, from smallest to largest. call sort(energies, ilut_list(0:NIfTot, 1:num_states)) ! If requested, sort singles and doubles to the top, keeping the rest of ! the ordering the same. if (tSortDoubles) then num_sing_doub = 0 block_size = 0 do i = 1, num_states ! for GUGA this would have to be changed, but apparently this ! function is never called in the rest of the code so ! ignore it for now.. excit_level = FindBitExcitLevel(ilut_list(:, i), ilutHF) if (excit_level <= 2) then num_sing_doub = num_sing_doub + 1 temp_ilut = ilut_list(:, i) ! Move block of non-singles or doubles down one in ilut_list. ilut_list(:, num_sing_doub + 1:num_sing_doub + block_size) = & ilut_list(:, num_sing_doub:num_sing_doub + block_size - 1) ! Then move the single or double just found into the space ! created above this block. ilut_list(:, num_sing_doub) = temp_ilut else block_size = block_size + 1 end if end do end if end subroutine sort_states_by_energy subroutine sort_space_by_proc(ilut_list, ilut_list_size, num_states_procs) ! And also output the number of states on each processor in the space. use load_balance_calcnodes, only: DetermineDetNode integer, intent(in) :: ilut_list_size integer(n_int), intent(inout) :: ilut_list(0:, :) integer(MPIArg), intent(out) :: num_states_procs(0:nProcessors - 1) integer(n_int), allocatable, dimension(:, :) :: temp_list integer, allocatable, dimension(:) :: proc_list integer :: nI(nel) integer :: i, ierr integer :: width, max_ind integer :: counter(0:nProcessors - 1) integer(TagIntType) :: TempConTag, ProcListTag character(len=*), parameter :: t_r = "sort_space_by_proc" width = int(size(ilut_list, 1), MPIArg) max_ind = width - 1 allocate(proc_list(ilut_list_size), stat=ierr) call LogMemAlloc('proc_list', ilut_list_size, sizeof_int, t_r, ProcListTag, ierr) allocate(temp_list(0:max_ind, ilut_list_size), stat=ierr) call LogMemAlloc('temp_list', ilut_list_size * width, size_n_int, t_r, & TempConTag, ierr) num_states_procs = 0 ! Create a list, proc_list, with the processor numbers of the corresponding iluts. do i = 1, ilut_list_size call decode_bit_det(nI, ilut_list(0:NIfTot, i)) proc_list(i) = DetermineDetNode(nel, nI, 0) num_states_procs(proc_list(i)) = int(num_states_procs(proc_list(i)) + 1, MPIArg) end do counter(0) = 0 do i = 1, nProcessors - 1 counter(i) = counter(i - 1) + num_states_procs(i - 1) end do do i = 1, ilut_list_size counter(proc_list(i)) = counter(proc_list(i)) + 1 temp_list(0:NIfTot, counter(proc_list(i))) = ilut_list(0:NIfTot, i) end do ilut_list(:, 1:ilut_list_size) = temp_list(:, 1:ilut_list_size) deallocate(temp_list, stat=ierr) deallocate(proc_list, stat=ierr) call LogMemDealloc(t_r, TempConTag, ierr) call LogMemDealloc(t_r, ProcListTag, ierr) end subroutine sort_space_by_proc subroutine fill_in_diag_helements() use FciMCData, only: Hii use global_det_data, only: set_det_diagH, set_det_offdiagH integer :: i integer :: nI(nel) real(dp) :: tmpH HElement_t(dp) :: tmpHoff real(dp) :: sgn(lenof_sign) do i = 1, int(TotWalkers) call decode_bit_det(nI, CurrentDets(:, i)) tmpH = get_diagonal_matel(nI, CurrentDets(:, i)) - Hii tmpHoff = get_off_diagonal_matel(nI, CurrentDets(:, i)) call set_det_diagh(i, tmpH) call set_det_offdiagh(i, tmpHoff) end do end subroutine fill_in_diag_helements subroutine write_core_space(rep) use Parallel_neci, only: MPIBarrier type(core_space_t), intent(in) :: rep integer :: i, k, iunit, ierr write(stdout, '(a35)') "Writing the core space to a file..." iunit = get_free_unit() ! Only let the root process write the states. if (iProcIndex == root) then open(iunit, file='CORESPACE', status='replace') do i = 1, rep%determ_space_size do k = 0, nifd write(iunit, '(i24)', advance='no') rep%core_space(k, i) end do write(iunit, '()') end do call neci_flush(iunit) close(iunit) end if call MPIBarrier(ierr) end subroutine write_core_space subroutine add_core_states_currentdets(run) ! And if the state is already present, simply set its flag. ! Also sort the states afterwards. use DetBitOps, only: ilut_lt, ilut_gt, DetBitLT use searching, only: BinSearchParts use sort_mod, only: sort integer, intent(in) :: run integer :: i, comp, MinInd, PartInd, nwalkers logical :: tSuccess associate(rep => cs_replicas(run)) MinInd = 1 nwalkers = int(TotWalkers) do i = 1, rep%determ_sizes(iProcIndex) if (nwalkers > 0) then ! If there is only one state in CurrentDets to check then BinSearchParts doesn't ! return the desired value for PartInd, so do this separately... if (MinInd == nwalkers) then comp = DetBitLT(CurrentDets(:, MinInd), SpawnedParts(0:NIfTot, i), nifd) if (comp == 0) then tSuccess = .true. PartInd = MinInd else if (comp == 1) then tSuccess = .false. PartInd = MinInd else if (comp == -1) then tSuccess = .false. PartInd = MinInd - 1 end if else call BinSearchParts(SpawnedParts(:, i), MinInd, nwalkers, PartInd, tSuccess) end if else tSuccess = .false. PartInd = 0 end if if (tSuccess) then call set_flag(CurrentDets(:, PartInd), flag_deterministic(run)) if (tTruncInitiator .and. t_core_inits) then call set_flag(CurrentDets(:, PartInd), get_initiator_flag_by_run(run)) end if MinInd = PartInd else ! Move all states below PartInd down one and insert the new state in the slot. CurrentDets(:, PartInd + 2:nwalkers + 1) = CurrentDets(:, PartInd + 1:nwalkers) CurrentDets(:, PartInd + 1) = SpawnedParts(0:NIfTot, i) nwalkers = nwalkers + 1 MinInd = PartInd + 1 end if end do end associate call sort(CurrentDets(:, 1:nwalkers), ilut_lt, ilut_gt) TotWalkers = int(nwalkers, int64) end subroutine add_core_states_currentdets subroutine add_core_states_currentdet_hash(run) ! This routine adds the core states in SpawnedParts into CurrentDets. For all ! such states already in CurrentDets, we want to keep the amplitude (which ! may have come from a popsfile). ! This routine is for when the hashed walker main list. In this case, ! as all core states are always kept in the list, it is beneficial to keep ! them at the top always. So, in this routine, we move the non-core states ! in CurrentDets to the end and add the new core states in the gaps. ! WARNING: If there are any determinants in CurrentDets on input which are ! unoccupied then, for this function to work correctly, the determiant ! *must* have an entry in the hash table. Otherwise, these determinants ! will end up being repeated in CurrentDets is they are core determinants. ! This isn't ideal because when the FCIQMC calculation starts, such ! unoccupied determinants should *not* be in the hash table. During this ! routine, such determinants will be removed from the hash table and so ! on output, everything will be fine and ready for the FCIQMC calculation ! to start. use FciMCData, only: ll_node, HashIndex, nWalkerHashes use hash, only: clear_hash_table, FindWalkerHash use DetBitOps, only: tAccumEmptyDet integer, intent(in) :: run integer :: i, hash_val, PartInd, nwalkers, i_non_core integer :: nI(nel) real(dp) :: walker_sign(lenof_sign) type(ll_node), pointer :: temp_node logical :: tSuccess integer :: ierr character(*), parameter :: this_routine = 'add_core_states_currentdet' real(dp), allocatable :: gdata_buf(:, :) type(gdata_io_t) :: reorder_handler nwalkers = int(TotWalkers) associate(rep => cs_replicas(run)) ! Test that SpawnedParts is going to be big enough if (rep%determ_sizes(iProcIndex) > MaxSpawned) then #ifdef DEBUG_ write(stderr, *) 'Spawned parts array will not be big enough for & &Semi-Stochastic initialisation' write(stderr, *) 'Please increase MEMORYFACSPAWN' #else write(stderr, *) 'Spawned parts array will not be big enough for & &Semi-Stochastic initialisation on task ', iProcIndex write(stderr, *) 'Please increase MEMORYFACSPAWN' #endif call stop_all(this_routine, "Insufficient memory assigned") end if call reorder_handler%init_gdata_io(tAutoAdaptiveShift, tScaleBlooms, tAccumPopsActive, & 2 * inum_runs, 1, lenof_sign + 1) ! we need to reorder the adaptive shift data, too ! the maximally required buffer size is the current size of the ! determinant list plus the size of the semi-stochastic space (in case ! all core-dets are new) allocate(gdata_buf(reorder_handler%entry_size(), (nwalkers + rep%determ_sizes(iProcIndex))), & stat=ierr) if (ierr /= 0) call stop_all(this_routine, & "Failed to allocate buffer for global det data") ! First find which CurrentDet states are in the core space. ! The warning above refers to this bit of code: If a core determinant is not in the ! hash table then they won't be found here and the deterministic flag won't be set! do i = 1, rep%determ_sizes(iProcIndex) tSuccess = .false. call decode_bit_det(nI, SpawnedParts(:, i)) hash_val = FindWalkerHash(nI, nWalkerHashes) temp_node => HashIndex(hash_val) if (temp_node%ind /= 0) then do while (associated(temp_node)) if (all(SpawnedParts(0:nifd, i) == CurrentDets(0:nifd, temp_node%ind))) then tSuccess = .true. PartInd = temp_node%ind exit end if temp_node => temp_node%next end do end if nullify (temp_node) ! Core state i is in CurrentDets. if (tSuccess) then call set_flag(CurrentDets(:, PartInd), flag_deterministic(run)) ! Copy the amplitude of the state across to SpawnedParts. call extract_sign(CurrentDets(:, PartInd), walker_sign) call encode_sign(SpawnedParts(:, i), walker_sign) ! Add up the already set flags to those to be set SpawnedParts(IlutBits%ind_flag, i) = ior(CurrentDets(IlutBits%ind_flag, PartInd), & SpawnedParts(IlutBits%ind_flag, i)) ! Cache the accumulated global det data call reorder_handler%write_gdata(gdata_buf, 1, PartInd, i) else ! This will be a new state added to CurrentDets. nwalkers = nwalkers + 1 ! no auto-adaptive shift data available gdata_buf(:, i) = 0.0_dp end if end do ! Next loop through CurrentDets and move all non-core states to after the last ! core state slot in SpawnedParts. i_non_core = rep%determ_sizes(iProcIndex) do i = 1, int(TotWalkers) if (.not. check_determ_flag(CurrentDets(:, i), run)) then i_non_core = i_non_core + 1 ! Add a quick test in, to ensure that we don't overflow the ! spawned parts array... if (i_non_core > MaxSpawned) then #ifdef DEBUG_ write(stderr, *) 'Spawned parts array too small for & &semi-stochastic initialisation' write(stderr, *) 'Please increase MEMORYFACSPAWN' #else write(stderr, *) 'Spawned parts array too small for & &semi-stochastic initialisation on task ', iProcIndex write(stderr, *) 'Please increase MEMORYFACSPAWN' #endif call stop_all(this_routine, 'Insufficient memory assigned') end if SpawnedParts(0:NIfTot, i_non_core) = CurrentDets(0:NIfTot, i) call reorder_handler%write_gdata(gdata_buf, 1, i, i_non_core) end if end do ! Now copy all the core states in SpawnedParts into CurrentDets. ! Note that the amplitude in CurrentDets was copied across, so this is fine. do i = 1, nwalkers CurrentDets(0:NifTot, i) = SpawnedParts(0:NIfTot, i) end do ! Re-assign the reordered global det data cached in gdata_buf call reorder_handler%read_gdata(gdata_buf, nwalkers) ! After reordering the dets, we have to reset all supergroup ! idx in the global_det_data if (associated(lookup_supergroup_indexer)) then do i = 1, nwalkers call decode_bit_det(nI, CurrentDets(:, i)) call set_supergroup_idx(& i, lookup_supergroup_indexer%idx_nI(nI)) end do end if call clear_hash_table(HashIndex) ! Finally, add the indices back into the hash index array. do i = 1, nwalkers call extract_sign(CurrentDets(:, i), walker_sign) ! Don't add the determinant to the hash table if its unoccupied and not ! in the core space and not accumulated. if (IsUnoccDet(walker_sign) .and. (.not. check_determ_flag(CurrentDets(:, i))) & .and. .not. tAccumEmptyDet(CurrentDets(:, i))) cycle call decode_bit_det(nI, CurrentDets(:, i)) hash_val = FindWalkerHash(nI, nWalkerHashes) temp_node => HashIndex(hash_val) ! If the first element in the list has not been used. if (temp_node%ind == 0) then temp_node%ind = i else do while (associated(temp_node%next)) temp_node => temp_node%next end do allocate(temp_node%next) nullify (temp_node%next%next) temp_node%next%ind = i end if nullify (temp_node) ! These core states will always stay in the same position. if (i <= rep%determ_sizes(iProcIndex)) rep%indices_of_determ_states(i) = i end do end associate TotWalkers = int(nwalkers, int64) end subroutine add_core_states_currentdet_hash subroutine proc_most_populated_states(n_keep, run, & largest_walkers, opt_source, opt_source_size, norm) ! Return the most populated states in CurrentDets on *this* processor only. ! Also return the norm of these states, if requested. use DetBitOps, only: sign_lt, sign_gt use sort_mod, only: sort integer(int64), intent(in), optional :: opt_source_size integer(n_int), intent(in), optional :: opt_source(0:, 1:) integer, intent(in) :: n_keep, run integer(n_int), intent(out) :: largest_walkers(0:NIfTot, n_keep) real(dp), intent(out), optional :: norm integer :: i, j, smallest_pos, part_type real(dp) :: smallest_sign, sign_curr_real real(dp), dimension(lenof_sign) :: sign_curr, low_sign character(*), parameter :: this_routine = "return_most_populated_states" integer(n_int), allocatable :: loc_source(:, :) integer(int64) :: source_size if (present(opt_source)) then ASSERT(present(opt_source_size)) source_size = opt_source_size ! ask Kai if I have to allocate allocate(loc_source(0:niftot, 1:source_size), & source=opt_source(0:NIfTot, 1:source_size)) ! loc_source => opt_source else source_size = TotWalkers allocate(loc_source(0:niftot, 1:source_size), & source=CurrentDets(0:NIfTot, 1:source_size)) ! loc_source => CurrentDets end if largest_walkers = 0_n_int smallest_sign = 0.0_dp smallest_pos = 1 if (present(norm)) norm = 0.0_dp ! Run through all walkers on process. do i = 1, int(source_size) call extract_sign(loc_source(:, i), sign_curr) sign_curr_real = core_space_weight(sign_curr, run) if (present(norm)) norm = norm + (sign_curr_real**2.0) ! Is this determinant more populated than the smallest? First in ! the list is always the smallest. if (sign_curr_real > smallest_sign) then largest_walkers(:, smallest_pos) = loc_source(:, i) ! Instead of resorting, just find new smallest sign and position. call extract_sign(largest_walkers(:, 1), low_sign) smallest_sign = core_space_weight(low_sign, run) smallest_pos = 1 do j = 2, n_keep call extract_sign(largest_walkers(:, j), low_sign) sign_curr_real = core_space_weight(low_sign, run) if (sign_curr_real < smallest_sign .or. all(largest_walkers(:, j) == 0_n_int)) then smallest_pos = j smallest_sign = sign_curr_real end if end do end if end do call sort(largest_walkers(:, 1:n_keep), sign_lt_run, sign_gt_run) contains pure function sign_lt_run(ilutI, ilutJ) result(bLt) ! This is a comparison function between two bit strings of length ! 0:NIfTot, and will return true if absolute value of the sign of ! ilutI is less than ilutJ integer(n_int), intent(in) :: iLutI(0:), iLutJ(0:) logical :: bLt real(dp) :: SignI(lenof_sign), SignJ(lenof_sign) call extract_sign(ilutI, SignI) call extract_sign(ilutJ, SignJ) bLt = core_space_weight(signI, run) < core_space_weight(signJ, run) end function sign_lt_run pure function sign_gt_run(ilutI, ilutJ) result(bGt) ! This is a comparison function between two bit strings of length ! 0:NIfTot, and will return true if the abs sign of ilutI is greater ! than ilutJ integer(n_int), intent(in) :: iLutI(0:), iLutJ(0:) logical :: bGt real(dp) :: SignI(lenof_sign), SignJ(lenof_sign) call extract_sign(ilutI, SignI) call extract_sign(ilutJ, SignJ) bGt = core_space_weight(signI, run) > core_space_weight(signJ, run) end function sign_gt_run end subroutine proc_most_populated_states !> @brief !> Return the most populated states over all processors. !> !> @details !> Reducing version of `proc_most_populated_states`, which works per process. !> Return as many states as the size of largest_walkers. !> Returns the norm as well, if requested. !> @param[out] largest_walkers, Array of most `n_keep` most populated states. subroutine global_most_populated_states(n_keep, run, largest_walkers, norm, rank_of_largest, hdiag_largest) use Parallel_neci, only: MPISumAll, MPIAllReduceDatatype, MPIBCast integer, intent(in) :: n_keep, run integer(n_int), intent(out) :: largest_walkers(0:NIfTot, n_keep) real(dp), intent(out), optional :: norm integer, intent(out), optional :: rank_of_largest(n_keep) real(dp), intent(inout), optional :: hdiag_largest(n_keep) character(*), parameter :: this_routine = 'global_most_populated_states' integer(n_int), allocatable :: proc_largest_walkers(:, :) integer :: sort_size, sort_size_delta, sort_max_delta real(dp) :: proc_norm integer :: bcast_size integer(n_int), allocatable :: allproc_ilut_list(:,:,:) real(dp), allocatable :: allproc_sign_list(:,:) real(dp), allocatable :: allproc_hdiag_list(:,:) integer :: allproc_nlist(0:nProcessors-1) integer(n_int) :: ilut(0:NIfTot) integer :: i, ilist, iproc, proc_idet, imax, nI(nel), nlist real(dp) :: det_sign, curr_sign(lenof_sign), hdiag, cmax ! Initialize output. largest_walkers = 0_n_int if (present(hdiag_largest)) hdiag_largest = 0_dp if (present(rank_of_largest)) rank_of_largest = 0 if (n_keep<1) return ! Determine sensible sort size and increment. sort_max_delta = 0 bcast_size = 1 if (nProcessors<2) then ! compute target sort size (fixed from here on out) sort_size = n_keep else ! Rough estimate (typical population fluctuations, decent load balancing). sort_max_delta = ceiling( 2.d0*sqrt( real(n_keep,dp) / real(nProcessors,dp) ) ) sort_size = ceiling( ( real(n_keep,dp) / real(nProcessors,dp) ) ) + sort_max_delta bcast_size = ceiling( ( real(n_keep,dp) / real(nProcessors,dp) ) ) ! target broadcast size endif sort_size = int(min(TotWalkers, int(n_keep,int64), int(sort_size,int64))) ! allocation size ! Allocate initial sort vector. allocate(proc_largest_walkers(0:NIfTot, sort_size), source=0_n_int) ! Perform first partial sort, and get norm if required. call proc_most_populated_states( & sort_size, run, proc_largest_walkers, CurrentDets, TotWalkers, proc_norm) if (present(norm)) then call MpiSumAll(proc_norm, norm) norm = sqrt(norm) end if ! Allocate shared det lists. allocate( allproc_ilut_list(0:NIfTot, bcast_size, 0:nProcessors-1), & allproc_sign_list(bcast_size, 0:nProcessors-1), & allproc_hdiag_list(bcast_size, 0:nProcessors-1) ) ! Loop over determinants to put in list. proc_idet = sort_size + 1 ! index of last det in this process' sorted list added to bcast list allproc_nlist = 0 ! size of bcast list on all processes main_det_loop: do i = 1, n_keep need_prepare_bcast: if (allproc_nlist(iProcIndex)==0) then ! Prepare list of walkers on this process to braodcast. potential_resort_loop: do ! loop over potential sort operations ! Figure out this process' buffer size - but keep in "nlist" not "allproc_nlist" ! so we can use the latter in logic below. nlist = bcast_size ! Fill this process' buffer. NB, (1:nlist) goes from smallest to largest coeff. do ilist = nlist, 1, -1 ! Find next determinant on this process. do proc_idet = proc_idet - 1 if (proc_idet<1) exit if (any(proc_largest_walkers(:, proc_idet) /= 0)) exit end do ! Exit loop if we ran out. if (proc_idet<1) exit ! Get the relevant determinant properties. ilut = proc_largest_walkers(:, proc_idet) call extract_sign(proc_largest_walkers(:, proc_idet), curr_sign) #ifdef CMPLX_ det_sign = sqrt(sum(abs(curr_sign(1::2)))**2 & + sum(abs(curr_sign(2::2)))**2) #else det_sign = sum(real(abs(curr_sign), dp)) #endif if (present(hdiag_largest)) then ! Get hdiag if requested. call decode_bit_det(nI, ilut) if (tHPHF) then hdiag = hphf_diag_helement(nI, ilut) else hdiag = get_helement(nI, nI, 0, ilut, ilut) end if end if ! Store the relevant determinant properties. allproc_ilut_list(:, ilist, iProcIndex) = ilut allproc_sign_list(ilist, iProcIndex) = det_sign if (present(hdiag_largest)) then allproc_hdiag_list(ilist, iProcIndex) = hdiag endif end do ! ilist if (ilist>0) then ! We have had to abort, so adjust nlist and shift arrays. nlist = nlist - ilist if (nlist>0) then allproc_ilut_list(:, 1:nlist, iProcIndex) = & allproc_ilut_list(:, ilist+1:nlist+ilist, iProcIndex) allproc_sign_list(1:nlist, iProcIndex) = & allproc_sign_list(ilist+1:nlist+ilist, iProcIndex) if (present(hdiag_largest)) then allproc_hdiag_list(1:nlist, iProcIndex) = & allproc_hdiag_list(ilist+1:nlist+ilist, iProcIndex) end if end if end if if (nlist>0) exit ! We have an empty broadcast list, so attempt a sort and go again. ! FIXME - By construction of the "sort" procedure we need to re-sort the start of ! the list again! To fix this we need to work with the full vector of ! partially sorted indices not a partial vector of partially sorted elements. ! FIXME - Re-sorts can be triggered at different times on different processes, which ! could make things really slow. sort_size_delta = sort_size + sort_max_delta sort_size_delta = int( min(int(sort_size_delta,int64), int(n_keep-sort_size,int64), TotWalkers-int(sort_size,int64) ) ) if (sort_size_delta<1) exit if (allocated(proc_largest_walkers)) deallocate(proc_largest_walkers) sort_size = sort_size + sort_size_delta allocate (proc_largest_walkers(0:NIfTot, 1:sort_size), source=0_n_int) call proc_most_populated_states( & sort_size, run, proc_largest_walkers, CurrentDets, TotWalkers) proc_idet = sort_size_delta + 1 ! index of last det in this process' sorted list added to bcast list end do potential_resort_loop end if need_prepare_bcast ! Perform any outstanding broadcasts. do iproc = 0, nProcessors-1 if (allproc_nlist(iproc)==0) then if (iProcIndex==iproc) allproc_nlist(iproc) = nlist ! now we update this call MPIBCast(allproc_nlist(iproc), 1, iproc) if (allproc_nlist(iproc)==0) then allproc_nlist(iproc) = -1 ! flag that there are no more dets on process else ! Broadcast. call MPIBCast(allproc_ilut_list(0:NIfTot, 1:allproc_nlist(iproc), iproc), & (NIfTot+1)*allproc_nlist(iproc), iproc) call MPIBCast(allproc_sign_list(1:allproc_nlist(iproc), iproc), & allproc_nlist(iproc), iproc) if (present(hdiag_largest)) then call MPIBCast(allproc_hdiag_list(1:allproc_nlist(iproc), iproc), & allproc_nlist(iproc), iproc) end if end if end if end do ! iproc ! Safety - if there is nothing left to do, exit loop. if (all(allproc_nlist<0)) exit ! Find next largest "sign" among all processes. cmax = 0_dp do iproc = 0, nProcessors-1 if (allproc_nlist(iproc)<1) cycle if (allproc_sign_list(allproc_nlist(iproc), iproc)>cmax) then imax = iproc cmax = allproc_sign_list(allproc_nlist(iproc), iproc) end if end do ! iproc ! Copy to output. iproc = imax ilist = allproc_nlist(iproc) largest_walkers(0:NIfTot, i) = allproc_ilut_list(0:NIfTot, ilist, iproc) if (present(hdiag_largest)) hdiag_largest(i) = allproc_hdiag_list(ilist, iproc) if (present(rank_of_largest)) rank_of_largest(i) = iproc ! Update counter. allproc_nlist(iproc) = allproc_nlist(iproc) - 1 end do main_det_loop end subroutine !> Weight function for picking the most populated states. Trivial in !! single run mode, but multiple options exist in mneci pure function core_space_weight(sign_curr, run) result(sign_curr_real) real(dp), intent(in) :: sign_curr(lenof_sign) integer, intent(in) :: run real(dp) :: sign_curr_real #ifdef CMPLX_ sign_curr_real = sqrt(sum(abs(sign_curr(1::2)))**2 + sum(abs(sign_curr(2::2)))**2) unused_var(run) #else if (tSignedRepAv) then sign_curr_real = real(abs(sum(sign_curr)), dp) else if (run == GLOBAL_RUN) then sign_curr_real = real(sum(abs(sign_curr)), dp) else sign_curr_real = mag_of_run(sign_curr, run) end if end if #endif end function core_space_weight !> Specialized routine that returns the number of determinants that are going into !! the core-space on this processor. Once the leading determinants have been obtained on each !! processor, this function requires the minimal and maximal populations among these for all !! processors (requires previous MPI_Gather), then the procedure to determine the core-space !! size on this processor is as follows: !! 1) Get the minimum of the maximal populations, then count the number of determinants !! above this population. This count is then broadcasted to the other procs, and the !! total number of determinants above the smallest maximum is determined. If it is smaller !! than the core-space size, these determinants are put into the core-space, else !! we repeat with the second smallest of the maximal populations, and so on. !! 2) Get the maximum of the minimal populations, then count the number of determinants below !! this population. This count is then broadcasted to the other procs, and the total number !! of determinants below the largest minimum is determined. If the number of determinants !! that are remaining (i.e. larger than the largest minimum and smaller than the smallest !! maximum) is sufficient to fill up the core-space (in particular, the smallest maximum !! has to be bigger than the largest minimum), the small determinants are discarded. Else, !! we repeat this with the second largest minimum, and so on. !! 3) From the remaining determinants, each processor contributes a share that equals to the !! ratio of the remaining determinants on this proc to the total remaining determinants !> @param[in] n_keep core-space size !> @param[in] min_vals minimal population of the canditates per processor !> @param[in] max_vals maximal population of the canditates per processor !> @param[in] lengths number of candidates per processor !> @param[in] list candidates on this processor !> @param[out] n_dets_this_proc number of core-space determinants on this processor subroutine return_proc_share(n_keep, min_vals, max_vals, lengths, list, n_dets_this_proc) use util_mod, only: binary_search_first_ge real(dp), intent(inout) :: max_vals(0:nProcessors - 1), min_vals(0:nProcessors - 1) real(dp), intent(in) :: list(:) integer, intent(in) :: n_keep, lengths(0:nProcessors - 1) integer, intent(out) :: n_dets_this_proc integer :: sum_max, sum_min real(dp) :: min_max, max_min integer :: dets_left, pool_left real(dp) :: total_pool, ip_ratio integer :: n_max(0:nProcessors - 1), n_min(0:nProcessors - 1) integer :: missing, n_full, i dets_left = -1 ! Increase the cutoff until our selection is small enough do while (dets_left < 0) call get_pp_ex(min_max, n_max, sum_max, max_vals) ! Number of determinants left when keeping the maximal ones dets_left = n_keep - sum_max end do ! Definitely take these determinants n_dets_this_proc = n_max(iProcIndex) max_min = min_max + 1 total_pool = -1 ! Reduce the cutoff until we are below the min_max do while (max_min > min_max .or. total_pool < dets_left) call get_pp_ex(max_min, n_min, sum_min, min_vals, t_max=.true.) ! Size of the pool left when not keeping the minimal ones ( has to be at least ! big enough to fill the core-space) total_pool = sum(lengths) - sum_max - sum_min end do ! If the corespace consists of all chosen determinants, the remaining pool might be 0 ! -> no further action, take all determinants if (total_pool > 0) then ! Number of available dets on this proc after removing min/max pool_left = lengths(iProcIndex) - n_max(iProcIndex) - n_min(iProcIndex) ! Ratio of available dets on this proc vs. in totap ip_ratio = pool_left / real(total_pool, dp) ! If any further dets have to be picked, get them from all procs weighted with the pool sizes n_dets_this_proc = n_dets_this_proc + int(ip_ratio * dets_left) end if contains subroutine get_pp_ex(ex, n_ex, sum_ex, vals, t_max) use Parallel_neci, only: MPIAllGather real(dp), intent(out) :: ex integer, intent(out) :: n_ex(0:nProcessors - 1), sum_ex real(dp), intent(inout) :: vals(0:nProcessors - 1) logical, intent(in), optional :: t_max integer :: ex_ind, n_ex_loc integer :: ierr real(dp) :: pre logical :: t_max_ def_default(t_max_, t_max, .false.) if (t_max_) then pre = -1.0 else pre = 1.0 end if ! Get the smallest value of the per-proc max ex_ind = minloc(pre * vals, dim=1) - 1 ex = vals(ex_ind) ! Invalidate this value, such that the next call finds the second smallest value and so on vals(ex_ind) = pre * sum(abs(vals)) ! Now, get the location of the first element above the extremum if (size(list) > 0) then n_ex_loc = binary_search_first_ge(list, ex) else ! it might be possible that a proc is empty (has no candidates) ! in this case, never find anything above/below global extremal values n_ex_loc = -1 end if ! If no such element exists, return 0 on this proc if (n_ex_loc < 0) then n_ex_loc = 0 else if (t_max_) then ! From the position, get the number of elements below the extremum (for max_min) n_ex_loc = n_ex_loc - 1 else ! Or above the extremum (for min_max) n_ex_loc = lengths(iProcIndex) - n_ex_loc + 1 end if call MPIAllGather(n_ex_loc, n_ex, ierr) ! Check if the maximum pop is already sufficient sum_ex = sum(n_ex) end subroutine get_pp_ex end subroutine return_proc_share subroutine return_largest_indices(n_keep, list_size, list, largest_indices) ! Return the indices of the largest elements in list. integer, intent(in) :: n_keep, list_size real(dp), intent(in) :: list(list_size) integer, intent(out) :: largest_indices(n_keep) integer :: i, j, ind, smallest_pos real(dp) :: smallest_sign, sign_curr, sign_curr_abs, low_sign largest_indices = 0 smallest_sign = 0.0_dp smallest_pos = 1 do i = 1, list_size sign_curr = list(i) sign_curr_abs = abs(sign_curr) if (sign_curr_abs > smallest_sign) then largest_indices(smallest_pos) = i low_sign = list(largest_indices(1)) smallest_sign = abs(low_sign) smallest_pos = 1 do j = 2, n_keep ind = largest_indices(j) if (ind == 0) then low_sign = 0.0_dp else low_sign = list(ind) end if sign_curr_abs = abs(low_sign) if (sign_curr_abs < smallest_sign .or. largest_indices(j) == 0) then smallest_pos = j smallest_sign = sign_curr_abs end if end do end if end do end subroutine return_largest_indices subroutine start_walkers_from_core_ground(tPrintInfo, run) use davidson_semistoch, only: davidson_ss, perform_davidson_ss, destroy_davidson_ss use Parallel_neci, only: MPISumAll logical, intent(in) :: tPrintInfo integer, intent(in) :: run integer :: nI(nel) integer :: i, counter, ierr real(dp) :: eigenvec_pop, eigenvec_pop_tot, pop_sign(lenof_sign) integer(n_int) :: tmp character(len=*), parameter :: t_r = "start_walkers_from_core_ground" type(davidson_ss) :: dc if (tPrintInfo) then write(stdout, '(a69)') "Using the deterministic ground state as initial walker configuration." write(stdout, '(a34)') "Performing Davidson calculation..." call neci_flush(stdout) end if ! Call the Davidson routine to find the ground state of the core space. call perform_davidson_ss(dc, .true., run) if (tPrintInfo) then write(stdout, '(a30)') "Davidson calculation complete." write(stdout, '("Deterministic total energy:",1X,f15.10)') dc%davidson_eigenvalue + Hii call neci_flush(stdout) end if associate(rep => cs_replicas(run)) ! We need to normalise this vector to have the correct 'number of walkers'. eigenvec_pop = 0.0_dp do i = 1, rep%determ_sizes(iProcIndex) eigenvec_pop = eigenvec_pop + abs(dc%davidson_eigenvector(i)) end do call MPISumAll(eigenvec_pop, eigenvec_pop_tot) if (tStartSinglePart) then dc%davidson_eigenvector = dc%davidson_eigenvector & * InitialPart / eigenvec_pop_tot else dc%davidson_eigenvector = dc%davidson_eigenvector & * InitWalkers / eigenvec_pop_tot end if ! Then copy these amplitudes across to the corresponding states in CurrentDets. counter = 0 do i = 1, int(TotWalkers) if (check_determ_flag(CurrentDets(:, i), run)) then counter = counter + 1 pop_sign = dc%davidson_eigenvector(counter) call decode_bit_det(nI, CurrentDets(:, i)) tmp = transfer(pop_sign(rep%min_part():rep%max_part()), tmp) CurrentDets(IlutBits%ind_pop + rep%min_part() - 1:IlutBits%ind_pop + rep%max_part() - 1, i) = tmp end if end do end associate call destroy_davidson_ss(dc) end subroutine start_walkers_from_core_ground subroutine start_walkers_from_core_ground_nonhermit(tPrintInfo, run) logical, intent(in) :: tPrintInfo integer, intent(in) :: run integer :: i, counter, ierr integer :: nI(nel) real(dp), allocatable :: e_values(:) HElement_t(dp), allocatable :: e_vectors(:, :) integer(n_int) :: tmp real(dp) :: eigenvec_pop, pop_sign(lenof_sign) character(len=*), parameter :: t_r = "start_walkers_from_core_ground_nonhermit" if (tPrintInfo) then write(stdout, '(a69)') "Using the deterministic ground state as initial walker configuration." write(stdout, '(a53)') "Performing diagonalization of non-Hermitian matrix..." call neci_flush(stdout) end if associate(rep => cs_replicas(run)) ! Call the non-Hermitian diagonalizer to find the ground state of the core space. call diagonalize_core_non_hermitian(e_values, e_vectors, rep) if (tPrintInfo) then write(stdout, '("Energies of the deterministic subspace:")') write(stdout, *) e_values(1:rep%determ_space_size) call neci_flush(stdout) end if ! We need to normalise this vector to have the correct 'number of walkers'. eigenvec_pop = 0.0_dp do i = 1, rep%determ_space_size eigenvec_pop = eigenvec_pop + abs(e_vectors(i, 1)) end do if (tStartSinglePart) then e_vectors(:, 1) = e_vectors(:, 1) * InitialPart / eigenvec_pop else e_vectors(:, 1) = e_vectors(:, 1) * InitWalkers / eigenvec_pop end if write(stdout, *) 'The ground state vector:' write(stdout, *) e_vectors(:, 1) ! Then copy these amplitudes across to the corresponding states in CurrentDets. counter = 0 do i = 1, iProcIndex counter = counter + rep%determ_sizes(i - 1) end do do i = 1, rep%determ_space_size !int(TotWalkers) if (check_determ_flag(CurrentDets(:, i), run)) then counter = counter + 1 pop_sign = e_vectors(counter, 1) tmp = transfer(pop_sign(min_pt:max_pt), tmp) CurrentDets(IlutBits%ind_pop + min_part_type(run) - 1:IlutBits%ind_pop + max_part_type(run) - 1, i) = tmp end if end do end associate deallocate(e_values, e_vectors) end subroutine start_walkers_from_core_ground_nonhermit subroutine diagonalize_core(e_value, e_vector, rep) real(dp), intent(out) :: e_value HElement_t(dp), intent(out), allocatable :: e_vector(:) type(core_space_t), intent(in) :: rep type(DavidsonCalcType) :: davidsonCalc integer :: ierr character(*), parameter :: t_r = "diagonalize_core" call create_sparse_ham_from_core(rep) ! Call the Davidson routine to find the ground state of the core space. call perform_davidson(davidsonCalc, parallel_sparse_hamil_type, .true.) e_value = davidsonCalc%davidson_eigenvalue allocate(e_vector(rep%determ_space_size)) e_vector = davidsonCalc%davidson_eigenvector write(stdout, '(a30)') "Davidson calculation complete." write(stdout, '("Deterministic total energy:",1X,f15.10)') & e_value + Hii call neci_flush(stdout) call DestroyDavidsonCalc(davidsonCalc) ! call LogMemDealloc(t_r, DavidsonTag, ierr) deallocate(hamil_diag, stat=ierr) call LogMemDealloc(t_r, HDiagTag, ierr) call deallocate_sparse_ham(sparse_ham, SparseHamilTags) end subroutine diagonalize_core subroutine create_sparse_ham_from_core(rep) type(core_space_t), intent(in) :: rep character(*), parameter :: t_r = "create_sparse_ham_from_core" integer :: ierr, i ! Create the arrays used by the Davidson routine. ! First, the whole Hamiltonian in sparse form. if (allocated(sparse_ham)) deallocate(sparse_ham) if (allocated(SparseHamilTags)) deallocate(SparseHamilTags) allocate(sparse_ham(rep%determ_sizes(iProcIndex))) allocate(SparseHamilTags(2, rep%determ_sizes(iProcIndex))) do i = 1, rep%determ_sizes(iProcIndex) call allocate_sparse_ham_row(sparse_ham, i, & rep%sparse_core_ham(i)%num_elements, "sparse_ham", SparseHamilTags(:, i)) sparse_ham(i)%elements = rep%sparse_core_ham(i)%elements sparse_ham(i)%positions = rep%sparse_core_ham(i)%positions sparse_ham(i)%num_elements = rep%sparse_core_ham(i)%num_elements end do ! Next create the diagonal used by Davidson by copying the core one. if (allocated(hamil_diag)) deallocate(hamil_diag) allocate(hamil_diag(rep%determ_sizes(iProcIndex)), stat=ierr) call LogMemAlloc('hamil_diag', int(rep%determ_sizes(iProcIndex)), & 8, t_r, HDiagTag, ierr) hamil_diag = rep%core_ham_diag end subroutine create_sparse_ham_from_core subroutine diagonalize_core_non_hermitian(e_values, e_vectors, rep) type(core_space_t), intent(in) :: rep real(dp), allocatable, intent(out) :: e_values(:) HElement_t(dp), allocatable :: e_vectors(:, :) HElement_t(dp), allocatable :: full_H(:, :) integer i, nI(nel), space_size ! if the Hamiltonian is non-hermitian we cannot use the ! standard Lanzcos or Davidson routines. so: ! build the full Hamiltonian call calc_determin_hamil_full(full_H, rep) if (t_print_core_info) then root_print "The determinants are" root_print "semistochastic basis:" if_root do i = 1, rep%determ_space_size call decode_bit_det(nI, rep%core_space(:, i)) print *, nI end do end_if_root root_print "deterministic hamiltonian:" if_root call print_matrix(full_H) end_if_root end if allocate(e_values(size(full_H, 1))) allocate(e_vectors(size(full_H, 1), size(full_H, 1))) e_values = 0.0_dp e_vectors = 0.0_dp call eig(full_H, e_values, e_vectors) ! maybe we also want to start from a different eigenvector in ! this case? this would be practial for the hubbard problem case.. root_print "Full diagonalisation for non-hermitian Hamiltonian completed!" end subroutine diagonalize_core_non_hermitian subroutine print_basis(rep) type(core_space_t), intent(in) :: rep integer :: iunit, i, nI(nel) iunit = get_free_unit() open(iunit, file = 'semistoch-basis', status = 'replace') do i = 1, rep%determ_space_size call decode_bit_det(nI, rep%core_space(:, i)) call write_det(iunit, nI, .true.) end do close(iunit) end subroutine print_basis subroutine print_hamiltonian(rep) type(core_space_t), intent(in) :: rep HElement_t(dp), allocatable :: full_H(:, :) integer :: iunit call calc_determin_hamil_full(full_H, rep) iunit = get_free_unit() open(iunit, file = 'semistoch-hamil', status = 'replace') call print_matrix(full_H, iunit) close(iunit) end subroutine print_hamiltonian subroutine calc_determin_hamil_full(hamil, rep) use guga_data, only: ExcitationInformation_t use guga_matrixElements, only: calc_guga_matrix_element type(core_space_t) :: rep type(CSF_Info_t) :: csf_i, csf_j type(ExcitationInformation_t) :: excitInfo HElement_t(dp), allocatable, intent(out) :: hamil(:, :) integer :: i, j, nI(nel), nJ(nel) allocate(hamil(rep%determ_space_size, rep%determ_space_size)) hamil = h_cast(0.0_dp) do i = 1, rep%determ_space_size call decode_bit_det(nI, rep%core_space(:, i)) if (tGUGA) csf_i = CSF_Info_t(rep%core_space(0:nifd, i)) if (tHPHF) then hamil(i, i) = hphf_diag_helement(nI, rep%core_space(:, i)) else hamil(i, i) = get_helement(nI, nI, 0) end if do j = 1, rep%determ_space_size if (i == j) cycle call decode_bit_det(nJ, rep%core_space(:, j)) if (tGUGA) csf_j = CSF_Info_t(rep%core_space(:, j)) if (tHPHF) then hamil(i, j) = hphf_off_diag_helement(nI, nJ, & rep%core_space(:, i), rep%core_space(:, j)) else if (tGUGA) then call calc_guga_matrix_element(& rep%core_space(:, i), csf_i, & rep%core_space(:, j), csf_j, excitInfo, hamil(i, j), .true.) else hamil(i, j) = get_helement(nI, nJ, rep%core_space(:, i), & rep%core_space(:, j)) end if end do end do end subroutine calc_determin_hamil_full subroutine copy_core_dets_to_spawnedparts(rep) ! This routine will copy all the core determinants *ON THIS PROCESS ! ONLY* to the SpawnedParts array. use load_balance_calcnodes, only: DetermineDetNode type(core_space_t), intent(in) :: rep integer :: i, ncore, proc integer :: nI(nel) character(len=*), parameter :: t_r = "copy_core_dets_to_spawnedparts" ncore = 0 SpawnedParts = 0_n_int do i = 1, rep%determ_space_size call decode_bit_det(nI, rep%core_space(:, i)) proc = DetermineDetNode(nel, nI, 0) if (proc == iProcIndex) then ncore = ncore + 1 SpawnedParts(0:NIfTot, ncore) = rep%core_space(:, i) end if end do if (ncore /= rep%determ_sizes(iProcIndex)) call stop_all(t_r, "The number of & &core determinants counted is less than was previously counted.") end subroutine copy_core_dets_to_spawnedparts subroutine return_mp1_amp_and_mp2_energy(nI, ilut, ex, tParity, amp, energy_contrib) ! For a given determinant (input as nI), find the amplitude of it in the MP1 wavefunction. ! Also return the contribution from this determinant in the MP2 energy. ! To use this routine, generate an excitation from the Hartree-Fock determinant using the ! GenExcitations3 routine. This will return nI, ex and tParity which can be input into this ! routine. use Determinants, only: GetH0Element3, GetH0Element4 use FciMCData, only: ilutHF, HFDet, Fii use SystemData, only: tUEG use SystemData, only: tGUGA use guga_matrixElements, only: calcDiagMatEleGUGA_nI, calc_guga_matrix_element use guga_data, only: ExcitationInformation_t integer, intent(in) :: nI(nel) integer(n_int), intent(in) :: ilut(0:NIfTot) integer, intent(in) :: ex(2, maxExcit) logical, intent(in) :: tParity real(dp), intent(out) :: amp, energy_contrib integer :: ic HElement_t(dp) :: hel, H0tmp, denom type(ExcitationInformation_t) :: excitInfo amp = 0.0_dp energy_contrib = 0.0_dp if (ex(1, 2) == 0) then ic = 1 else ic = 2 end if if (tHPHF) then ! Assume since we are using HPHF that the alpha and ! beta orbitals of the same spatial orbital have the same ! fock energies, so can consider either. hel = hphf_off_diag_helement(HFDet, nI, iLutHF, ilut) else if (tGUGA) then call calc_guga_matrix_element(& ilut, CSF_Info_t(ilut), ilutHF, CSF_Info_t(ilutHF), excitInfo, hel, .true.) else hel = get_helement(HFDet, nI, ic, ex, tParity) end if if (tUEG) then ! This will calculate the MP2 energies without having to use the fock eigenvalues. ! This is done via the diagonal determinant hamiltonian energies. H0tmp = getH0Element4(nI, HFDet) else if (tGUGA) then ! do i have a routine to calculate the diagonal and double ! contributions for GUGA csfs? yes! H0tmp = calcDiagMatEleGUGA_nI(nI) else H0tmp = getH0Element3(nI) end if ! If the relevant excitation from the Hartree-Fock takes electrons from orbitals ! (i,j) to (a,b), then denom will be equal to ! \epsilon_a + \epsilon_b - \epsilon_i - \epsilon_j ! as required in the denominator of the MP1 amplitude and MP2 energy. denom = Fii - H0tmp if (.not. (abs(denom) > 0.0_dp)) then call warning_neci("return_mp1_amp_and_mp2_energy", & "One of the determinants under consideration for the MP1 wave function is degenerate & &with the Hartree-Fock determinant. Degenerate perturbation theory has not been & &considered, but the amplitude of this determinant will be returned as huge(0.0_dp) & &so that it should be included in the space.") amp = huge(0.0_dp) else amp = hel / denom energy_contrib = (hel**2) / denom end if end subroutine return_mp1_amp_and_mp2_energy subroutine reinit_current_trial_amps() ! Recreate current trial amps, without using arrays such as trial_space ! and trial_wfs, which are deallocated after the first init_trial_wf ! call. use FciMCData, only: CurrentDets, TotWalkers, tTrialHash, current_trial_amps, ntrial_excits use searching, only: hash_search_trial, bin_search_trial use SystemData, only: nel integer(int64) :: i integer :: nI(nel) HElement_t(dp) :: trial_amps(ntrial_excits) logical :: tTrial, tCon ! Don't do anything if this is called before the trial wave function ! initialisation. if (.not. allocated(current_trial_amps)) return current_trial_amps = 0.0_dp do i = 1, TotWalkers if (tTrialHash) then call decode_bit_det(nI, CurrentDets(:, i)) call hash_search_trial(CurrentDets(:, i), nI, trial_amps, tTrial, tCon) else call bin_search_trial(CurrentDets(:, i), trial_amps, tTrial, tCon) end if ! Set the appropraite flag (if any). Unset flags which aren't ! appropriate, just in case. if (tTrial) then call set_flag(CurrentDets(:, i), flag_trial, .true.) call set_flag(CurrentDets(:, i), flag_connected, .false.) else if (tCon) then call set_flag(CurrentDets(:, i), flag_trial, .false.) call set_flag(CurrentDets(:, i), flag_connected, .true.) else call set_flag(CurrentDets(:, i), flag_trial, .false.) call set_flag(CurrentDets(:, i), flag_connected, .false.) end if ! Set the amplitude (which may be zero). current_trial_amps(:, i) = trial_amps end do end subroutine reinit_current_trial_amps subroutine end_semistoch() use FciMCData, only: HamTag use MemoryManager, only: LogMemDealloc character(len=*), parameter :: t_r = "end_semistoch" integer :: ierr integer :: run do run = 1, size(cs_replicas) call cs_replicas(run)%dealloc() end do deallocate(cs_replicas) end subroutine end_semistoch end module semi_stoch_procs