subroutine calc_trial_states_direct(space_in, nexcit, ndets_this_proc, trial_iluts, evecs_this_proc, evals, &
space_sizes, space_displs, reorder)
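! Generate a trial space of determinants according to the options in
! space_in, gather it onto the root process, diagonalise the Hamiltonian
! directly within that space, and scatter the components of the lowest
! nexcit eigenvectors back to the processes that hold the corresponding
! determinants. On exit, trial_iluts and evecs_this_proc hold this
! process's determinants and eigenvector components, evals holds the
! trial eigenvalues, and space_sizes/space_displs hold the per-process
! space sizes and displacements. The optional reorder array permutes the
! returned states.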
use bit_reps, only: decode_bit_det
use CalcData, only: subspace_in
use DetBitOps, only: ilut_lt, ilut_gt
use FciMCData, only: ilutHF, CurrentDets, TotWalkers
use Parallel_neci, only: MPIScatterV, MPIGatherV, MPIAllGather, MPIBCast, MPIArg, iProcIndex
use Parallel_neci, only: nProcessors
use MPI_wrapper, only: root
use semi_stoch_gen
use sort_mod, only: sort
use SystemData, only: nel, tAllSymSectors
use lanczos_wrapper, only: frsblk_wrapper
use guga_matrixElements, only: calc_guga_matrix_element
use guga_data, only: ExcitationInformation_t
type(subspace_in) :: space_in
integer, intent(in) :: nexcit
integer, intent(out) :: ndets_this_proc
integer(n_int), intent(out) :: trial_iluts(0:, :)
HElement_t(dp), allocatable, intent(out) :: evecs_this_proc(:, :)
real(dp), intent(out) :: evals(:)
integer(MPIArg), intent(out) :: space_sizes(0:nProcessors - 1), space_displs(0:nProcessors - 1)
integer, optional, intent(in) :: reorder(nexcit)
integer(n_int), allocatable :: ilut_list(:, :)
integer, allocatable :: det_list(:, :)
integer :: i, j, max_elem_ind(1), ierr, info, ndets_int
integer :: temp_reorder(nexcit)
integer(MPIArg) :: ndets_all_procs, ndets_this_proc_mpi
integer(MPIArg) :: sndcnts(0:nProcessors - 1), displs(0:nProcessors - 1)
integer(MPIArg) :: rcvcnts
integer, allocatable :: evec_abs(:)
HElement_t(dp), allocatable :: H_tmp(:, :), evecs_all(:, :)
HElement_t(dp), allocatable :: evecs(:, :), evecs_transpose(:, :)
HElement_t(dp), allocatable :: work(:)
real(dp), allocatable :: evals_all(:), rwork(:)
character(len=*), parameter :: this_routine = "calc_trial_states_direct"
type(ExcitationInformation_t) :: excitInfo
ndets_this_proc = 0
trial_iluts = 0_n_int
! Choose the correct generating routine.
if (space_in%tHF) call add_state_to_space(ilutHF, trial_iluts, ndets_this_proc)
if (space_in%tPops) call generate_space_most_populated(space_in%npops, &
space_in%tApproxSpace, space_in%nApproxSpace, trial_iluts, ndets_this_proc, GLOBAL_RUN)
if (space_in%tRead) call generate_space_from_file(space_in%read_filename, trial_iluts, ndets_this_proc)
if (space_in%tDoubles) then
if (tGUGA) then
call generate_sing_doub_guga(trial_iluts, ndets_this_proc, .false.)
else
call generate_sing_doub_determinants(trial_iluts, ndets_this_proc, .false.)
end if
end if
if (space_in%tCAS) call generate_cas(space_in%occ_cas, space_in%virt_cas, trial_iluts, ndets_this_proc)
if (space_in%tRAS) call generate_ras(space_in%ras, trial_iluts, ndets_this_proc)
if (space_in%tOptimised) call generate_optimised_space(space_in%opt_data, space_in%tLimitSpace, &
trial_iluts, ndets_this_proc, space_in%max_size)
if (space_in%tMP1) call generate_using_mp1_criterion(space_in%mp1_ndets, trial_iluts, ndets_this_proc)
if (space_in%tFCI) then
if (tAllSymSectors) then
call gndts_all_sym_this_proc(trial_iluts, .true., ndets_this_proc)
else
call generate_fci_core(trial_iluts, ndets_this_proc)
end if
end if
if (.not. (space_in%tPops .or. space_in%tRead .or. space_in%tDoubles .or. space_in%tCAS .or. &
space_in%tRAS .or. space_in%tOptimised .or. space_in%tMP1 .or. space_in%tFCI)) then
call stop_all(this_routine, "A space for the trial functions was not chosen.")
end if
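! Gather the number of trial-space determinants found on every process,
! then compute the total space size and the displacement of each
! process's block within the full, gathered space.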
ndets_this_proc_mpi = int(ndets_this_proc, MPIArg)
call MPIAllGather(ndets_this_proc_mpi, space_sizes, ierr)
ndets_all_procs = sum(space_sizes)
if (ndets_all_procs < nexcit) call stop_all(this_routine, "The number of excited states that you have asked &
&for is larger than the size of the trial space used to create the excited states. Since this &
&routine generates trial states that are orthogonal, this is not possible.")
space_displs(0) = 0_MPIArg
do i = 1, nProcessors - 1
space_displs(i) = space_displs(i - 1) + space_sizes(i - 1)
end do
! [W.D. 15.5.2017:]
! Does the sort behave differently depending on the compiler? Different
! compilers have produced different reference results here.
call sort(trial_iluts(:, 1:ndets_this_proc), ilut_lt, ilut_gt)
if (iProcIndex == root) then
allocate(ilut_list(0:NIfTot, ndets_all_procs))
else
! On these other processes ilut_list and evecs_transpose are not
! needed, but we need them to be allocated for the MPI wrapper
! function to work, so just allocate them to be small.
allocate(ilut_list(0:niftot, 1))
allocate(evecs_transpose(1, 1))
end if
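! Gather the locally sorted trial determinants from all processes into
! ilut_list on the root process.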
call MPIGatherV(trial_iluts(0:niftot, 1:space_sizes(iProcIndex)), ilut_list(0:NIfTot, :), &
space_sizes, space_displs, ierr)
! Only perform the diagonalisation on the root process.
if (iProcIndex == root) then
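! Decode the gathered bit-string representations into explicit lists of
! occupied orbitals, as required by the matrix element routines.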
allocate(det_list(nel, ndets_all_procs))
do i = 1, ndets_all_procs
call decode_bit_det(det_list(:, i), ilut_list(:, i))
end do
allocate(evecs(ndets_all_procs, nexcit), stat=ierr)
if (ierr /= 0) call stop_all(this_routine, "Error allocating eigenvectors array.")
evecs = 0.0_dp
allocate(evec_abs(ndets_all_procs), stat=ierr)
if (ierr /= 0) call stop_all(this_routine, "Error allocating evec_abs array.")
evec_abs = 0
! [W.D.] This is where the implementation changed relative to the
! previous one: frsblk_wrapper used to be called here. Bringing it back
! (at least for non-complex builds) fixed the problem.
! Perform a direct diagonalisation in the trial space.
#ifdef CMPLX_
! First, build the Hamiltonian matrix in the trial space.
ndets_int = int(ndets_all_procs)
allocate(H_tmp(ndets_all_procs, ndets_all_procs), stat=ierr)
if (ierr /= 0) call stop_all(this_routine, "Error allocating H_tmp array")
H_tmp = 0.0_dp
do i = 1, ndets_all_procs
! diagonal elements
if (tHPHF) then
H_tmp(i, i) = hphf_diag_helement(det_list(:, i), ilut_list(:, i))
else
H_tmp(i, i) = get_helement(det_list(:, i), det_list(:, i), 0)
end if
! off diagonal elements
block
type(CSF_Info_t) :: csf_i
if (tGUGA) csf_i = CSF_Info_t(ilut_list(:, i))
do j = 1, i - 1
if (tHPHF) then
H_tmp(i, j) = hphf_off_diag_helement(det_list(:, i), det_list(:, j), ilut_list(:, i), &
ilut_list(:, j))
else if (tGUGA) then
call calc_guga_matrix_element(ilut_list(:, i), csf_i, &
ilut_list(:, j), CSF_Info_t(ilut_list(:, j)), excitInfo, H_tmp(j, i), .true.)
else
H_tmp(i, j) = get_helement(det_list(:, i), det_list(:, j), ilut_list(:, i), ilut_list(:, j))
end if
end do
end block
end do
! The Hamiltonian matrix is ready; now diagonalise it with LAPACK's zheev.
allocate(work(3 * ndets_all_procs), stat=ierr)
work = 0.0_dp
allocate(evals_all(ndets_all_procs), stat=ierr)
evals_all = 0.0_dp
allocate(rwork(3 * ndets_all_procs), stat=ierr)
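! zheev('V', 'L') diagonalises the lower triangle filled above; it needs
! lwork >= 2*n-1 and rwork of size >= 3*n-2, so the 3*n workspaces
! allocated above are sufficient. On exit, H_tmp holds the eigenvectors
! and evals_all the eigenvalues in ascending order.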
call zheev('V', 'L', ndets_int, H_tmp, ndets_int, evals_all, work, 3 * ndets_int, rwork, info)
deallocate(rwork)
! zheev overwrites H_tmp with the eigenvectors; copy the lowest nexcit
! of them into evecs and keep the corresponding entries of evals_all.
do i = 1, nexcit
evals(i) = evals_all(i)
do j = 1, ndets_all_procs
evecs(j, i) = H_tmp(j, i)
end do
end do
deallocate(evals_all)
deallocate(work)
deallocate(H_tmp)
deallocate(ilut_list)
#else
! If the Hamiltonian is non-Hermitian, switch to a dense, general
! (non-symmetric) diagonalisation.
if (t_non_hermitian_2_body) then
ASSERT(.not. tGUGA)
ndets_int = int(ndets_all_procs)
allocate(H_tmp(ndets_all_procs, ndets_all_procs), stat=ierr)
if (ierr /= 0) call stop_all(this_routine, "Error allocating H_tmp array")
H_tmp = 0.0_dp
do i = 1, ndets_all_procs
! diagonal elements
if (tHPHF) then
H_tmp(i, i) = hphf_diag_helement(det_list(:, i), ilut_list(:, i))
else
H_tmp(i, i) = get_helement(det_list(:, i), det_list(:, i), 0)
end if
! Off-diagonal elements: the matrix is not symmetric, so loop over all
! determinants rather than only the lower triangle.
do j = 1, ndets_all_procs
if (i == j) cycle
if (tHPHF) then
H_tmp(j, i) = hphf_off_diag_helement(det_list(:, i), &
det_list(:, j), ilut_list(:, i), ilut_list(:, j))
else
H_tmp(j, i) = get_helement(det_list(:, i), &
det_list(:, j), ilut_list(:, i), ilut_list(:, j))
end if
end do
end do
allocate(evals_all(ndets_all_procs), stat=ierr)
evals_all = 0.0_dp
allocate(evecs_all(ndets_all_procs, ndets_all_procs), stat=ierr)
evecs_all = 0.0_dp
! [W.D.] One might expect the left eigenvectors to be needed for the
! trial projection when the Hamiltonian is non-Hermitian, but apparently
! the right ones suffice, possibly because of the (j, i) filling
! convention used above.
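! Dense diagonalisation of the full non-Hermitian trial-space
! Hamiltonian; only the first nexcit eigenpairs returned are kept below.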
call eig(H_tmp, evals_all, evecs_all, .false.)
evals = evals_all(1:nexcit)
evecs = evecs_all(:, 1:nexcit)
deallocate(H_tmp)
deallocate(evecs_all)
else
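! Hermitian case: use the iterative solver from lanczos_wrapper to
! obtain the nexcit trial eigenpairs directly from the determinant list.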
call frsblk_wrapper(det_list, int(ndets_all_procs), &
nexcit, evals, evecs)
end if
#endif
! For consistency between compilers, enforce a rule for the sign of
! each eigenvector: make sure that its largest component is positive.
! The largest component is found using maxloc. If there are multiple
! determinants with the same weight, the Fortran standard says that the
! first such component will be used, so this should be consistent
! across compilers. To avoid numerical differences between compilers
! for such elements, we also round the array components to integers;
! because evecs is normalised, multiply by 100000 before rounding.
do i = 1, nexcit
! First find the maximum element.
evec_abs = nint(100000.0_dp * abs(evecs(:, i)))
max_elem_ind = maxloc(evec_abs)
if (Real(evecs(max_elem_ind(1), i), dp) < 0.0_dp) then
evecs(:, i) = -evecs(:, i)
end if
end do
deallocate(det_list)
deallocate(evec_abs)
! Reorder the trial states, if the user has asked for this to be done.
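! temp_reorder is sorted in place together with its companion array, so
! it is reset from reorder before each of the two paired sorts; this way
! evals and evecs receive the same permutation.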
if (present(reorder)) then
temp_reorder = reorder
call sort(temp_reorder, evals)
temp_reorder = reorder
call sort(temp_reorder, evecs)
end if
! Unfortunately, to scatter contiguous per-process blocks with
! MPIScatterV we need the transpose of the eigenvector array.
allocate(evecs_transpose(nexcit, ndets_all_procs), stat=ierr)
if (ierr /= 0) call stop_all(this_routine, "Error allocating transposed eigenvectors array.")
evecs_transpose = transpose(evecs)
else
deallocate(ilut_list)
end if
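! Broadcast the eigenvalues from the root process to all other processes.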
call MPIBCast(evals, size(evals), root)
ndets_this_proc_mpi = space_sizes(iProcIndex)
! The number of elements to send and receive in the MPI call, and the
! displacements.
sndcnts = space_sizes * int(nexcit, MPIArg)
rcvcnts = ndets_this_proc_mpi * int(nexcit, MPIArg)
displs = space_displs * int(nexcit, MPIArg)
! Scatter the eigenvector components to the processes that own the
! corresponding determinants, receiving them into evecs_this_proc.
allocate(evecs_this_proc(nexcit, ndets_this_proc), stat=ierr)
call MPIScatterV(evecs_transpose, sndcnts, displs, evecs_this_proc, rcvcnts, ierr)
if (ierr /= 0) call stop_all(this_routine, "Error in MPIScatterV call.")
! Clean up.
if (iProcIndex == root) deallocate(evecs)
deallocate(evecs_transpose)
end subroutine calc_trial_states_direct
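
! A minimal usage sketch for this routine (illustrative only; the
! caller-side names trial_space_in, n_states, ndets_local, trial_evecs,
! trial_evals, sizes, displs and max_trial_size are hypothetical, and
! the caller is assumed to size the ilut storage appropriately):
!
!   type(subspace_in) :: trial_space_in
!   integer, parameter :: n_states = 3
!   integer :: ndets_local
!   integer(n_int), allocatable :: trial_iluts(:, :)
!   HElement_t(dp), allocatable :: trial_evecs(:, :)
!   real(dp) :: trial_evals(n_states)
!   integer(MPIArg) :: sizes(0:nProcessors - 1), displs(0:nProcessors - 1)
!
!   allocate(trial_iluts(0:NIfTot, max_trial_size))
!   trial_space_in%tDoubles = .true.
!   call calc_trial_states_direct(trial_space_in, n_states, ndets_local, &
!                                 trial_iluts, trial_evecs, trial_evals, &
!                                 sizes, displs)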