sparse_arrays.F90 Source File


Contents

Source Code


Source Code

#include "macros.h"

! This module contains a type and routines for defining and creating a sparse Hamiltonian.
! The type created to store this information is sparse_matrix_real. For an N-by-N matrix,
! one creates a 1d array of type sparse_matrix_real. Each element of this array stores the
! information on one single row of the matrix - the positions of the non-zero elements
! and their values, in the same order. This completley defines the matrix.

! Note that in some parts of NECI, the sparse nature is used in a different form (i.e. in
! the Lanczos code).

module sparse_arrays

    use bit_rep_data, only: NIfTot, NIfD, nifguga
    use bit_reps, only: decode_bit_det
    use CalcData, only: tReadPops
    use constants
    use DetBitOps, only: DetBitEq, CountBits, TestClosedShellDet
    use Determinants, only: get_helement
    use FciMCData, only: SpawnedParts, Hii
    use core_space_util, only: cs_replicas, sparse_matrix_real, sparse_matrix_int, &
                               core_space_t
    use hphf_integrals, only: hphf_diag_helement, hphf_off_diag_helement, &
                              hphf_off_diag_helement_opt
    use MemoryManager, only: TagIntType, LogMemAlloc, LogMemDealloc
    use Parallel_neci, only: iProcIndex, nProcessors, MPIBarrier, MPIAllGatherV
    use SystemData, only: tHPHF, nel
    use global_det_data, only: set_det_diagH, set_det_offdiagH
    use shared_rhash, only: shared_rhash_t
    use SystemData, only: tGUGA
    use guga_excitations, only: actHamiltonian
    use guga_matrixElements, only: calc_guga_matrix_element
    use guga_bitRepOps, only: convert_ilut_toGUGA, extract_h_element, &
                              CSF_Info_t
    use util_mod, only: binary_search_ilut, near_zero
    use guga_data, only: tag_excitations, ExcitationInformation_t
    use guga_matrixElements, only: calcDiagMatEleGuga_nI
    use matel_getter, only: get_diagonal_matel, get_off_diagonal_matel
    use util_mod, only: stop_all
    use basic_float_math, only: conjgt

    implicit none

    type trial_hashtable
        ! All the states with this hash value.
        integer(n_int), allocatable, dimension(:, :) :: states
        ! The number of clashes for ths hash value.
        integer :: nclash
    end type trial_hashtable

    type core_hashtable
        ! The indices of states with this hash table.
        integer, allocatable, dimension(:) :: ind
        ! The number of clashes for this hash value.
        integer :: nclash
    end type core_hashtable

    type(sparse_matrix_real), allocatable, dimension(:) :: sparse_ham
    integer(TagIntType), allocatable, dimension(:, :) :: SparseHamilTags

    ! For quick access it is often useful to have just the diagonal elements. Note,
    ! however, that they *are* stored in sparse_ham too.
    real(dp), allocatable, dimension(:) :: hamil_diag
    integer(TagIntType) :: HDiagTag

    type(trial_hashtable), allocatable, dimension(:) :: trial_ht
    type(trial_hashtable), allocatable, dimension(:) :: con_ht

    ! --- For when using the determ-proj-approx-hamil option -----
    type(shared_rhash_t) :: var_ht
    type(sparse_matrix_real), allocatable, dimension(:) :: approx_ham

contains

    subroutine calculate_sparse_hamiltonian_non_hermitian(num_states, ilut_list)
        ! same routine as below, but for non-hermitian Hamiltonians in the
        ! case of transcorrelation
        integer, intent(in) :: num_states
        integer(n_int), intent(in) :: ilut_list(0:NIfTot, num_states)
        integer :: i, j, counter, ierr
        integer :: nI(nel), nJ(nel)
        HElement_t(dp), allocatable, dimension(:) :: hamiltonian_row
        integer, allocatable, dimension(:) :: sparse_diag_positions, sparse_row_sizes, indices
        integer(TagIntType) :: HRTag, SRTag, SDTag, ITag
        character(len=*), parameter :: t_r = "calculate_sparse_hamiltonian_non_hermitian"

        allocate(sparse_ham(num_states))
        allocate(SparseHamilTags(2, num_states))
        allocate(hamiltonian_row(num_states), stat=ierr)
        call LogMemAlloc('hamiltonian_row', num_states, 8, t_r, HRTag, ierr)
        allocate(hamil_diag(num_states), stat=ierr)
        call LogMemAlloc('hamil_diag', num_states, 8, t_r, HDiagTag, ierr)
        allocate(sparse_row_sizes(num_states), stat=ierr)
        call LogMemAlloc('sparse_row_sizes', num_states, sizeof_int, t_r, SRTag, ierr)

        ! Set each element to one to count the diagonal elements straight away.
        sparse_row_sizes = 1

        if (tGUGA) then
            call stop_all(t_r, "modify get_helement for GUGA!")
        end if

        do i = 1, num_states

            hamiltonian_row = 0.0_dp

            call decode_bit_det(nI, ilut_list(:, i))

            ! we have to loop over everything in case on non-hermiticity
            do j = 1, num_states

                call decode_bit_det(nJ, ilut_list(:, j))
                if (i == j) then
                    if (tHPHF) then
                        hamiltonian_row(i) = hphf_diag_helement(nI, ilut_list(:, i))
                    else
                        hamiltonian_row(i) = get_helement(nI, nI, 0)
                    end if
                    hamil_diag(i) = hamiltonian_row(i)
                else
                    if (tHPHF) then
                        !TODO: do i need <I|H|J> or <J|H|I>?
                        hamiltonian_row(j) = hphf_off_diag_helement( &
                                             nI, nJ, ilut_list(:, i), ilut_list(:, j))
                    else
                        hamiltonian_row(j) = get_helement( &
                                             nI, nJ, ilut_list(:, i), ilut_list(:, j))
                    end if
                    if (abs(hamiltonian_row(j)) > EPS) then
                        ! i think in the non-hermitian i only need to update
                        ! one of the counters..
                        sparse_row_sizes(i) = sparse_row_sizes(i) + 1
                    end if
                end if
            end do

            call allocate_sparse_ham_row(sparse_ham, i, sparse_row_sizes(i), &
                                         "sparse_ham", SparseHamilTags(:, i))

            sparse_ham(i)%elements = 0.0_dp
            sparse_ham(i)%positions = 0
            sparse_ham(i)%num_elements = sparse_row_sizes(i)

            ! now fill in the elements, all of them
            counter = 1
            do j = 1, num_states
                if (abs(hamiltonian_row(j)) > EPS) then
                    sparse_ham(i)%positions(counter) = j
                    sparse_ham(i)%elements(counter) = hamiltonian_row(j)
                    counter = counter + 1
                end if
            end do

        end do

        deallocate(hamiltonian_row, stat=ierr)
        call LogMemDealloc(t_r, HRTag, ierr)
        deallocate(sparse_row_sizes, stat=ierr)
        call LogMemDealloc(t_r, SRTag, ierr)

    end subroutine calculate_sparse_hamiltonian_non_hermitian

    subroutine calculate_sparse_hamiltonian(num_states, ilut_list)

        integer, intent(in) :: num_states
        integer(n_int), intent(in) :: ilut_list(0:NIfTot, num_states)
        integer :: i, j, counter, ierr
        integer :: nI(nel), nJ(nel)
        HElement_t(dp), allocatable, dimension(:) :: hamiltonian_row
        integer, allocatable, dimension(:) :: sparse_diag_positions, sparse_row_sizes, indices
        integer(TagIntType) :: HRTag, SRTag, SDTag, ITag
        character(len=*), parameter :: t_r = "calculate_sparse_hamiltonian"

        integer :: pos, nexcits
        integer(n_int) :: ilutG(0:nifguga)
        integer(n_int), pointer :: excitations(:, :)
        type(ExcitationInformation_t) :: excitInfo
        type(CSF_Info_t) :: csf_i, csf_j

        allocate(sparse_ham(num_states))
        allocate(SparseHamilTags(2, num_states))
        allocate(hamiltonian_row(num_states), stat=ierr)
        call LogMemAlloc('hamiltonian_row', num_states, 8, t_r, HRTag, ierr)
        allocate(hamil_diag(num_states), stat=ierr)
        call LogMemAlloc('hamil_diag', num_states, 8, t_r, HDiagTag, ierr)
        allocate(sparse_row_sizes(num_states), stat=ierr)
        call LogMemAlloc('sparse_row_sizes', num_states, sizeof_int, t_r, SRTag, ierr)
        allocate(sparse_diag_positions(num_states), stat=ierr)
        call LogMemAlloc('sparse_diag_positions', num_states, sizeof_int, t_r, SDTag, ierr)
        allocate(indices(num_states), stat=ierr)
        call LogMemAlloc('indices', num_states, sizeof_int, t_r, ITag, ierr)

        ! Set each element to one to count the diagonal elements straight away.
        sparse_row_sizes = 1

        ! also need to change this routine here for the guga implementation

        do i = 1, num_states

            hamiltonian_row = 0.0_dp

            call decode_bit_det(nI, ilut_list(:, i))

            ! sparse_diag_positions(i) stores the number of non-zero elements in row i of the
            ! Hamiltonian, up to and including the diagonal element.
            ! sparse_row_sizes(i) stores this number currently as all non-zero elements before
            ! the diagonal have been counted (as the Hamiltonian is symmetric).
            sparse_diag_positions(i) = sparse_row_sizes(i)

            if (tGUGA) csf_i = CSF_Info_t(ilut_list(0:nifd, i))

            do j = i, num_states

                call decode_bit_det(nJ, ilut_list(:, j))
                if (tGUGA) csf_j = CSF_Info_t(ilut_list(:, j))

                ! If on the diagonal of the Hamiltonian.
                if (i == j) then
                    if (tHPHF) then
                        hamiltonian_row(j) = hphf_diag_helement(nI, ilut_list(:, i))
                    else if (tGUGA) then
                        hamiltonian_row(j) = calcDiagMatEleGuga_nI(nI)
                    else
                        hamiltonian_row(j) = get_helement(nI, nJ, 0)
                    end if
                    hamil_diag(j) = hamiltonian_row(j)
                else
                    if (tHPHF) then
                        hamiltonian_row(j) = hphf_off_diag_helement(nI, nJ, ilut_list(:, i), &
                                                                    ilut_list(:, j))
                    else if (tGUGA) then
                        call calc_guga_matrix_element(ilut_list(:, i), csf_i, ilut_list(:, j), csf_j, &
                                                      excitInfo, hamiltonian_row(j), .true.)
#ifdef CMPLX_
                        hamiltonian_row(j) = conjg(hamiltonian_row(j))
#endif
                        ! call calc_guga_matrix_element(ilut_list(:,j), ilut_list(:,i), &
                        !         excitInfo, hamiltonian_row(j), .true., 2)
                    else
                        hamiltonian_row(j) = get_helement(nI, nJ, ilut_list(:, i), &
                                                          ilut_list(:, j))
                    end if
                    if (abs(hamiltonian_row(j)) > 0.0_dp) then
                        ! If element is nonzero, update the following sizes.
                        sparse_row_sizes(i) = sparse_row_sizes(i) + 1
                        sparse_row_sizes(j) = sparse_row_sizes(j) + 1
                    end if
                end if
            end do
            ! Now we know the number of non-zero elements in this row of the Hamiltonian, so allocate it.
            call allocate_sparse_ham_row(sparse_ham, i, sparse_row_sizes(i), "sparse_ham", SparseHamilTags(:, i))

            sparse_ham(i)%elements = 0.0_dp
            sparse_ham(i)%positions = 0
            sparse_ham(i)%num_elements = sparse_row_sizes(i)

            ! Now fill in all matrix elements beyond and including the diagonal, as these are
            ! stored in hamiltonian_row.
            counter = sparse_diag_positions(i)
            do j = i, num_states
                if (abs(hamiltonian_row(j)) > 0.0_dp) then
                    sparse_ham(i)%positions(counter) = j
                    sparse_ham(i)%elements(counter) = hamiltonian_row(j)
                    counter = counter + 1
                end if
            end do
        end do

        ! At this point, sparse_ham has been allocated with the correct sizes, but only the
        ! matrix elements above and including the diagonal have been filled in. Now we must
        ! fill in the other elements. To do this, cycle through every element above the
        ! diagonal and fill in every corresponding element below it:
        indices = 1
        do i = 1, num_states
            do j = sparse_diag_positions(i) + 1, sparse_row_sizes(i)

                sparse_ham(sparse_ham(i)%positions(j))%&
                    &positions(indices(sparse_ham(i)%positions(j))) = i
                sparse_ham(sparse_ham(i)%positions(j))%&
                    &elements(indices(sparse_ham(i)%positions(j))) = h_conjg(sparse_ham(i)%elements(j))

                indices(sparse_ham(i)%positions(j)) = indices(sparse_ham(i)%positions(j)) + 1

            end do
        end do

        deallocate(hamiltonian_row, stat=ierr)
        call LogMemDealloc(t_r, HRTag, ierr)
        deallocate(sparse_row_sizes, stat=ierr)
        call LogMemDealloc(t_r, SRTag, ierr)
        deallocate(sparse_diag_positions, stat=ierr)
        call LogMemDealloc(t_r, SDTag, ierr)
        deallocate(indices, stat=ierr)
        call LogMemDealloc(t_r, ITag, ierr)

    end subroutine calculate_sparse_hamiltonian

    subroutine calculate_sparse_ham_par(num_states, ilut_list, tPrintInfo)

        integer(MPIArg), intent(in) :: num_states(0:nProcessors - 1)
        integer(n_int), intent(in) :: ilut_list(0:NIfTot, num_states(iProcIndex))
        logical, intent(in) :: tPrintInfo
        integer(MPIArg) :: disps(0:nProcessors - 1)
        integer :: i, j, row_size, counter, num_states_tot, ierr, bytes_required
        integer :: nI(nel), nJ(nel)
        integer(n_int), allocatable, dimension(:, :) :: temp_store
        integer(TagIntType) :: TempStoreTag, HRTag, SDTag
        HElement_t(dp), allocatable, dimension(:) :: hamiltonian_row
        character(len=*), parameter :: t_r = "calculate_sparse_ham_par"

        integer :: pos, nexcits
        integer(n_int) :: ilutG(0:nifguga)
        integer(n_int), pointer :: excitations(:, :)
        type(ExcitationInformation_t) :: excitInfo
        type(CSF_Info_t) :: csf_i, csf_j

        num_states_tot = int(sum(num_states))
        disps(0) = 0
        do i = 1, nProcessors - 1
            disps(i) = disps(i - 1) + num_states(i - 1)
        end do

        safe_realloc(sparse_ham, (num_states(iProcIndex)))
        safe_realloc(SparseHamilTags, (2, num_states(iProcIndex)))
        safe_realloc(hamiltonian_row, (num_states_tot))
        call LogMemAlloc('hamiltonian_row', num_states_tot, 8, t_r, HRTag)
        safe_realloc(hamil_diag, (num_states(iProcIndex)))
        call LogMemAlloc('hamil_diag', int(num_states(iProcIndex)), 8, t_r, HDiagTag)
        safe_realloc(temp_store, (0:NIfTot, num_states_tot))
        call LogMemAlloc('temp_store', num_states_tot * (NIfTot + 1), 8, t_r, TempStoreTag)

        ! Stick together the determinants from all processors, on all processors.
        call MPIAllGatherV(ilut_list(:, 1:num_states(iProcIndex)), temp_store, num_states, disps)

        ! Loop over all determinants on this processor.
        do i = 1, num_states(iProcIndex)

            call decode_bit_det(nI, ilut_list(:, i))

            row_size = 0
            hamiltonian_row = 0.0_dp
            ! Loop over all determinants on all processors.

            if (tGUGA) csf_i = CSF_Info_t(ilut_list(0:nifd, i))

            do j = 1, num_states_tot

                call decode_bit_det(nJ, temp_store(:, j))
                if (tGUGA) csf_j = CSF_Info_t(temp_store(:, j))

                ! If on the diagonal of the Hamiltonian.
                if (DetBitEq(ilut_list(:, i), temp_store(:, j), nifd)) then
                    if (tHPHF) then
                        hamiltonian_row(j) = hphf_diag_helement(nI, ilut_list(:, i))
                    else if (tGUGA) then
                        hamiltonian_row(j) = calcDiagMatEleGuga_nI(nI)
                    else
                        hamiltonian_row(j) = get_helement(nI, nJ, 0)
                    end if
                    hamil_diag(i) = hamiltonian_row(j)
                    ! Always include the diagonal elements.
                    row_size = row_size + 1
                else
                    if (tHPHF) then
                        hamiltonian_row(j) = hphf_off_diag_helement(nI, nJ, ilut_list(:, i), temp_store(:, j))
                    else if (tGUGA) then
                        call calc_guga_matrix_element(&
                            ilut_list(:, i), csf_i, temp_store(:, j), &
                            csf_j, excitInfo, hamiltonian_row(j), .true.)
#ifdef CMPLX_
                        hamiltonian_row(j) = conjg(hamiltonian_row(j))
#endif
                        ! call calc_guga_matrix_element(temp_store(:,j), ilut_list(:,i), &
                        !     excitInfo, hamiltonian_row(j), .true., 2)
                    else
                        hamiltonian_row(j) = get_helement(nI, nJ, ilut_list(:, i), temp_store(:, j))
                    end if
                    if (abs(hamiltonian_row(j)) > 0.0_dp) row_size = row_size + 1
                end if
            end do

            if (tPrintInfo) then
                if (i == 1) then
                    bytes_required = row_size * (8 + sizeof_int)
                    write(stdout, '(1x,a43)') "About to allocate first row of Hamiltonian."
                    write(stdout, '(1x,a40,1x,i8)') "The memory (bytes) required for this is:", bytes_required
                    write(stdout, '(1x,a71,1x,i7)') "The total number of determinants (and hence rows) on this processor is:", &
                        num_states(iProcIndex)
                    write(stdout, '(1x,a58,1x,i7)') "The total number of determinants across all processors is:", num_states_tot
                    write(stdout, '(1x,a77,1x,i7)') "It is therefore expected that the total memory (MB) required will be roughly:", &
                        num_states_tot * bytes_required / 1000000
                else if (mod(i, 1000) == 0) then
                    write(stdout, '(1x,a23,1x,i7)') "Finished computing row:", i
                end if
            end if

            ! Now we know the number of non-zero elements in this row of the Hamiltonian, so allocate it.
            call allocate_sparse_ham_row(sparse_ham, i, row_size, "sparse_ham", SparseHamilTags(:, i))

            sparse_ham(i)%elements = 0.0_dp
            sparse_ham(i)%positions = 0
            sparse_ham(i)%num_elements = row_size

            counter = 1
            do j = 1, num_states_tot
                ! If non-zero or a diagonal element.
                if (abs(hamiltonian_row(j)) > 0.0_dp .or. (j == i + disps(iProcIndex))) then
                    sparse_ham(i)%positions(counter) = j
                    sparse_ham(i)%elements(counter) = hamiltonian_row(j)
                    counter = counter + 1
                end if
                if (counter == row_size + 1) exit
            end do

        end do

        call MPIBarrier(ierr)

        deallocate(temp_store, stat=ierr)
        call LogMemDealloc(t_r, TempStoreTag, ierr)
        deallocate(hamiltonian_row, stat=ierr)
        call LogMemDealloc(t_r, HRTag, ierr)

    end subroutine calculate_sparse_ham_par

    subroutine calc_determ_hamil_sparse(rep)

        type(core_space_t), intent(inout) :: rep
        integer :: i, j, row_size, counter, ierr
        integer :: nI(nel), nJ(nel)
        integer(n_int), allocatable, dimension(:, :) :: temp_store
        integer(TagIntType) :: HRTag, TempStoreTag
        HElement_t(dp), allocatable, dimension(:) :: hamiltonian_row

        integer :: pos, nExcit
        integer(n_int) :: ilutG(0:nifguga)
        integer(n_int), pointer :: excitations(:, :)
        type(ExcitationInformation_t) :: excitInfo

        character(len=*), parameter :: this_routine = "calc_determ_hamil_sparse"

        integer(n_int) :: tmp(0:NIfD), ilutI_tmp(0:NIfTot), ilutJ_tmp(0:niftot)
        integer :: IC, nI_tmp(nel), nJ_tmp(nel)
        integer(n_int) :: ilutI(0:niftot), ilutJ(0:niftot)
        type(CSF_Info_t) :: csf_i, csf_j
        HElement_t(dp) :: tmp_mat, tmp_mat_2, HOffDiag

        allocate(rep%sparse_core_ham(rep%determ_sizes(iProcIndex)), stat=ierr)
        allocate(rep%SparseCoreHamilTags(2, rep%determ_sizes(iProcIndex)))
        allocate(hamiltonian_row(rep%determ_space_size), stat=ierr)
        call LogMemAlloc('hamiltonian_row', int(rep%determ_space_size), 8, this_routine, HRTag, ierr)
        allocate(rep%core_ham_diag(rep%determ_sizes(iProcIndex)), stat=ierr)
        allocate(temp_store(0:NIfTot, rep%determ_space_size), stat=ierr)
        call LogMemAlloc('temp_store', rep%determ_space_size * (NIfTot + 1), 8, this_routine, TempStoreTag, ierr)

        ! Stick together the deterministic states from all processors, on
        ! all processors.
        ! n.b. Explicitly use 0:NIfTot, as NIfTot may not equal NIfBCast
        call MPIAllGatherV(SpawnedParts(0:NIfTot, 1:rep%determ_sizes(iProcIndex)), &
                           temp_store(0:niftot, 1:), rep%determ_sizes, rep%determ_displs)


        ! Loop over all deterministic states on this processor.
        do i = 1, rep%determ_sizes(iProcIndex)

            ilutI = SpawnedParts(0:niftot, i)
            call decode_bit_det(nI, IlutI)
            if (tGUGA) csf_i = CSF_Info_t(ilutI)

            row_size = 0
            hamiltonian_row = 0.0_dp
            ! Loop over all deterministic states.
            do j = 1, rep%determ_space_size

                ilutJ = temp_store(:, j)
                call decode_bit_det(nJ, ilutJ)
                if (tGUGA) csf_j = CSF_Info_t(ilutJ)

                if(t_evolve_adjoint(rep%first_run())) then
                    nI_tmp = nJ
                    nJ_tmp = nI
                    ilutI_tmp = ilutJ
                    ilutJ_tmp = ilutI
                else
                    nI_tmp = nI
                    nJ_tmp = nJ
                    ilutI_tmp = ilutI
                    ilutJ_tmp = ilutJ
                end if

                ! If on the diagonal of the Hamiltonian.
                if (DetBitEq(IlutI, ilutJ_tmp, nifd)) then
                    hamiltonian_row(j) = get_diagonal_matel(nI_tmp, IlutI_tmp) - Hii
                    rep%core_ham_diag(i) = hamiltonian_row(j)
                    ! We calculate and store the diagonal matrix element at
                    ! this point for later access.
                    if (.not. tReadPops) then
                        call set_det_diagH(i, Real(hamiltonian_row(j), dp))
                        HOffDiag = get_off_diagonal_matel(nI_tmp, ilutI_tmp)
                        call set_det_offdiagH(i, HOffDiag)
                    end if
                    ! Always include the diagonal elements.
                    row_size = row_size + 1
                else
                    if (tHPHF) then
                        hamiltonian_row(j) = hphf_off_diag_helement(nI_tmp, nJ_tmp, IlutI_tmp, ilutJ_tmp)
                    else if (tGUGA) then
                        call calc_guga_matrix_element(&
                                ilutI_tmp, csf_i, ilutJ_tmp, csf_j, excitInfo, tmp_mat, .true.)
#ifdef DEBUG_
                        call calc_guga_matrix_element(&
                                ilutI_tmp,  csf_i, ilutJ_tmp, csf_j, excitInfo, tmp_mat_2, .true.)
                        if (.not. near_zero(tmp_mat - tmp_mat_2)) then
                            call stop_all(this_routine, "type 1 and 2 do not agree!")
                        end if
                        call calc_guga_matrix_element(&
                                ilutJ_tmp, csf_j, ilutI_tmp, csf_i, excitInfo, tmp_mat_2, .true.)
                        if (.not. near_zero(tmp_mat - conjgt(tmp_mat_2))) then
                            call stop_all(this_routine, "not hermititan!")
                        end if
#endif

#ifdef CMPLX_
                        hamiltonian_row(j) = conjg(tmp_mat)
#else
                        hamiltonian_row(j) = tmp_mat
#endif
                    else

                        tmp = ieor(IlutI_tmp(0:NIfD), ilutJ_tmp(0:NIfD))
                        tmp = iand(IlutI_tmp(0:NIfD), tmp)
                        IC = CountBits(tmp, NIfD)

                        if (IC <= maxExcit) then
!                             hamiltonian_row(j) = get_helement(nI_tmp, nJ_tmp, IC, ilutI_tmp, ilutJ_tmp)
                            hamiltonian_row(j) = get_helement(nJ_tmp, nI_tmp, IC, ilutJ_tmp, ilutI_tmp)
                        end if
                    end if
                    if (abs(hamiltonian_row(j)) > 0.0_dp) row_size = row_size + 1
                end if
            end do
            ! Now we know the number of non-zero elements in this row of the Hamiltonian, so allocate it.
            call allocate_sparse_ham_row(rep%sparse_core_ham, i, row_size, "sparse_core_ham", rep%SparseCoreHamilTags(:, i))

            rep%sparse_core_ham(i)%elements = 0.0_dp
            rep%sparse_core_ham(i)%positions = 0
            rep%sparse_core_ham(i)%num_elements = row_size

            counter = 1
            do j = 1, rep%determ_space_size
                ! If non-zero or a diagonal element.
                if (abs(hamiltonian_row(j)) > 0.0_dp .or. (j == i + rep%determ_displs(iProcIndex))) then
                    rep%sparse_core_ham(i)%positions(counter) = j
                    rep%sparse_core_ham(i)%elements(counter) = hamiltonian_row(j)
                    counter = counter + 1
                end if
                if (counter == row_size + 1) exit
            end do

        end do

        ! Don't time the mpi_barrier call, because if we did then we wouldn't
        ! be able separate out some of the core Hamiltonian creation time from
        ! the MPIBarrier calls in the main loop.
        call MPIBarrier(ierr, tTimeIn=.false.)

        deallocate(temp_store, stat=ierr)
        call LogMemDealloc(this_routine, TempStoreTag, ierr)
        deallocate(hamiltonian_row, stat=ierr)
        call LogMemDealloc(this_routine, HRTag, ierr)

    end subroutine calc_determ_hamil_sparse

    subroutine calc_determ_hamil_sparse_hphf(rep)
        type(core_space_t), intent(inout) :: rep
        integer :: i, j, row_size, counter, ierr
        integer :: nI(nel), nJ(nel)
        integer(n_int), allocatable, dimension(:, :) :: temp_store
        integer, allocatable :: temp_store_nI(:, :)
        integer(TagIntType) :: HRTag, TempStoreTag
        HElement_t(dp), allocatable, dimension(:) :: hamiltonian_row
        HElement_t(dp) :: HOffDiag
        character(len=*), parameter :: t_r = "calc_determ_hamil_sparse_hphf"

        integer(n_int) :: tmp(0:NIfD)
        integer :: IC
        logical :: CS_I
        logical, allocatable :: cs(:)

        allocate(rep%sparse_core_ham(rep%determ_sizes(iProcIndex)), stat=ierr)
        allocate(rep%SparseCoreHamilTags(2, rep%determ_sizes(iProcIndex)))
        allocate(hamiltonian_row(rep%determ_space_size), stat=ierr)
        call LogMemAlloc('hamiltonian_row', int(rep%determ_space_size), 8, t_r, HRTag, ierr)
        allocate(rep%core_ham_diag(rep%determ_sizes(iProcIndex)), stat=ierr)
        allocate(temp_store(0:NIfTot, rep%determ_space_size), stat=ierr)
        call LogMemAlloc('temp_store', rep%determ_space_size * (NIfTot + 1), 8, t_r, TempStoreTag, ierr)
        allocate(temp_store_nI(nel, rep%determ_space_size), stat=ierr)
        allocate(cs(rep%determ_space_size), stat=ierr)

        ! Stick together the deterministic states from all processors, on
        ! all processors.
        ! n.b. Explicitly use 0:NIfTot, as NIfTot may not equal NIfBCast
        call MPIAllGatherV(SpawnedParts(0:NIfTot, 1:rep%determ_sizes(iProcIndex)), &
                           temp_store, rep%determ_sizes, rep%determ_displs)

        do i = 1, rep%determ_space_size
            call decode_bit_det(temp_store_nI(:, i), temp_store(:, i))
            cs(i) = TestClosedShellDet(temp_store(:, i))
        end do

        ! Loop over all deterministic states on this processor.
        do i = 1, rep%determ_sizes(iProcIndex)

            !call decode_bit_det(nI, SpawnedParts(:, i))
            nI = temp_store_nI(:, i + rep%determ_displs(iProcIndex))

            row_size = 0
            hamiltonian_row = 0.0_dp

            CS_I = cs(i + rep%determ_displs(iProcIndex))

            ! Loop over all deterministic states.
            do j = 1, rep%determ_space_size

                !call decode_bit_det(nJ, temp_store(:,j))
                nJ = temp_store_nI(:, j)

                ! If on the diagonal of the Hamiltonian.
                if (j == i + rep%determ_displs(iProcIndex)) then
                    hamiltonian_row(j) = get_diagonal_matel(nI, SpawnedParts(:, i)) - Hii
                    rep%core_ham_diag(i) = hamiltonian_row(j)
                    ! We calculate and store the diagonal matrix element at
                    ! this point for later access.
                    if (.not. tReadPops) then
                        call set_det_diagH(i, real(hamiltonian_row(j), dp))
                        HOffDiag = get_off_diagonal_matel(nI, SpawnedParts(:, i))
                        call set_det_offdiagH(i, HOffDiag)
                    end if
                    ! Always include the diagonal elements.
                    row_size = row_size + 1
                else
                    tmp = ieor(SpawnedParts(0:NIfD, i), temp_store(0:NIfD, j))
                    tmp = iand(SpawnedParts(0:NIfD, i), tmp)
                    IC = CountBits(tmp, NIfD)

                    if (IC <= maxExcit .or. ((.not. CS_I) .and. (.not. cs(j)))) then
                        if(t_evolve_adjoint(rep%first_run())) then
                            hamiltonian_row(j) = hphf_off_diag_helement_opt(nJ, temp_store(:, j), SpawnedParts(:, i), IC, cs(j), CS_I)
                        else
                            hamiltonian_row(j) = hphf_off_diag_helement_opt(nI, SpawnedParts(:, i), temp_store(:, j), IC, CS_I, cs(j))
                        endif
                        if (abs(hamiltonian_row(j)) > 0.0_dp) row_size = row_size + 1
                    end if
                end if

            end do

            ! Now we know the number of non-zero elements in this row of the Hamiltonian, so allocate it.
            call allocate_sparse_ham_row(rep%sparse_core_ham, i, row_size, "sparse_core_ham", rep%SparseCoreHamilTags(:, i))

            rep%sparse_core_ham(i)%elements = 0.0_dp
            rep%sparse_core_ham(i)%positions = 0
            rep%sparse_core_ham(i)%num_elements = row_size

            counter = 1
            do j = 1, rep%determ_space_size
                ! If non-zero or a diagonal element.
                if (abs(hamiltonian_row(j)) > 0.0_dp .or. (j == i + rep%determ_displs(iProcIndex))) then
                    rep%sparse_core_ham(i)%positions(counter) = j
                    rep%sparse_core_ham(i)%elements(counter) = hamiltonian_row(j)
                    counter = counter + 1
                end if
                if (counter == row_size + 1) exit
            end do

        end do

        ! Don't time the mpi_barrier call, because if we did then we wouldn't
        ! be able separate out some of the core Hamiltonian creation time from
        ! the MPIBarrier calls in the main loop.
        call MPIBarrier(ierr, tTimeIn=.false.)

        deallocate(temp_store, stat=ierr)
        call LogMemDealloc(t_r, TempStoreTag, ierr)
        deallocate(hamiltonian_row, stat=ierr)
        call LogMemDealloc(t_r, HRTag, ierr)
        deallocate(temp_store_nI, stat=ierr)
        deallocate(cs, stat=ierr)

    end subroutine calc_determ_hamil_sparse_hphf

    subroutine calc_approx_hamil_sparse_hphf(rep)
        type(core_space_t), intent(in) :: rep
        integer :: i, j, row_size, counter, ierr
        integer :: nI(nel), nJ(nel)
        integer, allocatable :: temp_store_nI(:, :)
        HElement_t(dp), allocatable, dimension(:) :: hamiltonian_row
        HElement_t(dp) :: HOffDiag
        character(len=*), parameter :: t_r = "calc_approx_hamil_sparse_hphf"

        integer(n_int) :: tmp(0:NIfD)
        integer :: IC
        logical :: CS_I, var_state_i, var_state_j
        logical, allocatable :: cs(:)

        allocate(approx_ham(rep%determ_sizes(iProcIndex)), stat=ierr)
        allocate(hamiltonian_row(rep%determ_space_size), stat=ierr)
        allocate(temp_store_nI(nel, rep%determ_space_size), stat=ierr)
        allocate(cs(rep%determ_space_size), stat=ierr)

        do i = 1, rep%determ_space_size
            call decode_bit_det(temp_store_nI(:, i), rep%core_space(:, i))
            cs(i) = TestClosedShellDet(rep%core_space(:, i))
        end do

        ! Loop over all deterministic states on this processor.
        do i = 1, rep%determ_sizes(iProcIndex)

            !call decode_bit_det(nI, SpawnedParts(:, i))
            nI = temp_store_nI(:, i + rep%determ_displs(iProcIndex))

            var_state_i = is_var_state(rep%core_space(:, i + rep%determ_displs(iProcIndex)), nI)

            row_size = 0
            hamiltonian_row = 0.0_dp

            CS_I = cs(i + rep%determ_displs(iProcIndex))

            ! Loop over all deterministic states.
            do j = 1, rep%determ_space_size

                !call decode_bit_det(nJ, core_space(:,j))
                nJ = temp_store_nI(:, j)

                ! If on the diagonal of the Hamiltonian.
                if (j == i + rep%determ_displs(iProcIndex)) then
                    hamiltonian_row(j) = get_diagonal_matel(nI, rep%core_space(:, i + rep%determ_displs(iProcIndex))) - Hii
                    !core_ham_diag(i) = hamiltonian_row(j)
                    ! We calculate and store the diagonal matrix element at
                    ! this point for later access.
                    if (.not. tReadPops) then
                        call set_det_diagH(i, real(hamiltonian_row(j), dp))
                        HOffDiag = get_off_diagonal_matel ( nI, rep%core_space(:, i + rep%determ_displs(iProcIndex) ) )
                        call set_det_offdiagH(i, HOffDiag)
                    end if
                    ! Always include the diagonal elements.
                    row_size = row_size + 1
                else
                    var_state_j = is_var_state(rep%core_space(:, j), nJ)

                    ! Only add a matrix element if both states are variational
                    ! states, or if one of them is (the rectangular portion of
                    ! H connected var_space to the space of connections to it).
                    if (var_state_i .or. var_state_j) then
                        tmp = ieor(rep%core_space(0:NIfD, i + rep%determ_displs(iProcIndex)), rep%core_space(0:NIfD, j))
                        tmp = iand(rep%core_space(0:NIfD, i + rep%determ_displs(iProcIndex)), tmp)
                        IC = CountBits(tmp, NIfD)

                        if (IC <= maxExcit .or. ((.not. CS_I) .and. (.not. cs(j)))) then
                            if(t_evolve_adjoint(rep%first_run())) then
                                hamiltonian_row(j) = hphf_off_diag_helement_opt(nJ, &
                                    rep%core_space(:, j), &
                                    rep%core_space(:, i + rep%determ_displs(iProcIndex)), &
                                    IC, cs(j), CS_I)
                            else
                                hamiltonian_row(j) = hphf_off_diag_helement_opt(nI, &
                                    rep%core_space(:, i + rep%determ_displs(iProcIndex)), &
                                    rep%core_space(:, j), IC, CS_I, cs(j))
                            endif
                            if (abs(hamiltonian_row(j)) > 0.0_dp) row_size = row_size + 1
                        end if
                    end if
                end if

            end do

            ! Now we know the number of non-zero elements in this row of the Hamiltonian, so allocate it.
            allocate(approx_ham(i)%elements(row_size), stat=ierr)
            allocate(approx_ham(i)%positions(row_size), stat=ierr)

            approx_ham(i)%elements = 0.0_dp
            approx_ham(i)%positions = 0
            approx_ham(i)%num_elements = row_size

            counter = 1
            do j = 1, rep%determ_space_size
                ! If non-zero or a diagonal element.
                if (abs(hamiltonian_row(j)) > 0.0_dp .or. (j == i + rep%determ_displs(iProcIndex))) then
                    approx_ham(i)%positions(counter) = j
                    approx_ham(i)%elements(counter) = hamiltonian_row(j)
                    counter = counter + 1
                end if
                if (counter == row_size + 1) exit
            end do

        end do

        ! Don't time the mpi_barrier call, because if we did then we wouldn't
        ! be able separate out some of the core Hamiltonian creation time from
        ! the MPIBarrier calls in the main loop.
        call MPIBarrier(ierr, tTimeIn=.false.)

        deallocate(hamiltonian_row, stat=ierr)
        deallocate(temp_store_nI, stat=ierr)
        deallocate(cs, stat=ierr)

    end subroutine calc_approx_hamil_sparse_hphf

    subroutine allocate_sparse_ham_row(sparse_matrix, row, sparse_row_size, sparse_matrix_name, sparse_tags)

        ! Allocate a single row and add it to the memory manager.

        type(sparse_matrix_real), intent(inout) :: sparse_matrix(:)
        integer, intent(in) :: row, sparse_row_size
        character(len=*), intent(in) :: sparse_matrix_name
        integer(TagIntType), intent(inout) :: sparse_tags(2)
        integer :: ierr
        character(len=1024) :: string_row
        character(len=1024) :: var_name
        character(len=*), parameter :: t_r = "allocate_sparse_ham_row"

        write(string_row, '(I10)') row

        var_name = trim(sparse_matrix_name)//"_"//trim(string_row)//"_elements"
        allocate(sparse_matrix(row)%elements(sparse_row_size), stat=ierr)
        call LogMemAlloc(var_name, sparse_row_size, 8, t_r, sparse_tags(1), ierr)

        var_name = trim(sparse_matrix_name)//"_"//trim(string_row)//"_positions"
        allocate(sparse_matrix(row)%positions(sparse_row_size), stat=ierr)
        call LogMemAlloc(var_name, sparse_row_size, sizeof_int, t_r, sparse_tags(2), ierr)

    end subroutine allocate_sparse_ham_row

    subroutine deallocate_core_hashtable(ht)

        type(core_hashtable), intent(inout), allocatable :: ht(:)

        integer :: i, ierr

        if (allocated(ht)) then
            do i = 1, size(ht)
                if (allocated(ht(i)%ind)) then
                    deallocate(ht(i)%ind, stat=ierr)
                    if (ierr /= 0) write(stdout, '("Error when deallocating core hashtable ind array:",1X,i8)') ierr
                end if
            end do

            deallocate(ht, stat=ierr)
            if (ierr /= 0) write(stdout, '("Error when deallocating core hashtable:",1X,i8)') ierr
        end if

    end subroutine deallocate_core_hashtable

    subroutine deallocate_trial_hashtable(ht)

        type(trial_hashtable), intent(inout), allocatable :: ht(:)

        integer :: i, ierr

        if (allocated(ht)) then
            do i = 1, size(ht)
                if (allocated(ht(i)%states)) then
                    deallocate(ht(i)%states, stat=ierr)
                    if (ierr /= 0) write(stdout, '("Error when deallocating trial hashtable states array:",1X,i8)') ierr
                end if
            end do

            deallocate(ht, stat=ierr)
            if (ierr /= 0) write(stdout, '("Error when deallocating core hashtable:",1X,i8)') ierr
        end if

    end subroutine deallocate_trial_hashtable

    function is_var_state(ilut, nI) result(var_state)

        use FciMCData, only: var_space, var_space_size_int
        use hash, only: FindWalkerHash

        integer(n_int), intent(in) :: ilut(0:NIfTot)
        integer, intent(in) :: nI(:)
        integer(int64) :: hash_val
        integer(int64) :: pos
        logical :: var_state

        hash_val = FindWalkerHash(nI, var_space_size_int)

        call var_ht%callback_lookup(hash_val, pos, var_state, loc_verify)
    contains

        function loc_verify(ind) result(match)
            integer(int64), intent(in) :: ind
            logical :: match

            match = all(ilut(0:nifd) == var_space(0:nifd, ind))

        end function loc_verify
    end function is_var_state

    elemental function t_evolve_adjoint(run) result(transp)
        use FciMCData, only: t_adjoint_replicas
        integer, intent(in) :: run

        logical :: transp

        transp = t_adjoint_replicas .and. ( mod(run,2)==0 )
    end function t_evolve_adjoint


end module sparse_arrays