read_walkers Subroutine

private subroutine read_walkers(parent, dets, CurrWalkers)

Uses

Arguments

Type IntentOptional Attributes Name
integer(kind=hid_t), intent(in) :: parent
integer(kind=n_int), intent(out) :: dets(:,:)
integer(kind=int64), intent(out) :: CurrWalkers

Contents

Source Code


Source Code

    subroutine read_walkers(parent, dets, CurrWalkers)

        use CalcData, only: pops_norm

        ! This is the routine that has complexity!!!
        !
        ! We read chunks of the walkers datasets on _all_ of the processors,
        ! and communicate the walkers to the correct node after each of the
        ! blocks. This is essentially a giant annihilation step
        !
        ! We used the spawnedparts arrays as a buffer. These are split up per
        ! processor in the same way as normally used for annihilation. We
        ! keep looping over however much we can fit in this array unil
        ! we are done.
        !
        ! --> This is quite complicated, but should equally be quite efficient

        integer(hid_t), intent(in) :: parent
        integer(n_int), intent(out) :: dets(:, :)
        integer(int64), intent(out) :: CurrWalkers
        character(*), parameter :: t_r = 'read_walkers'

        integer :: proc, nreceived
        integer(hid_t) :: grp_id, calc_grp_id
        integer(hdf_err) :: err
        integer(hid_t) :: ds_sgns, ds_ilut, ds_gdata
        integer(int64) :: nread_walkers
        integer :: ierr

        integer(int32) :: bit_rep_width
        integer(hsize_t) :: all_count, block_size, counts(0:nProcessors - 1)
        integer(hsize_t) :: offsets(0:nProcessors - 1)
        integer(hsize_t) :: block_start, block_end, last_part
        integer(hsize_t) :: this_block_size
        real(dp), allocatable :: pops_num_parts(:), pops_norm_sqr(:)
        real(dp) :: norm(lenof_sign), parts(lenof_sign)
        logical :: running, any_running
        integer(hsize_t), dimension(:, :), allocatable :: temp_ilut, temp_sgns
        integer(hsize_t), dimension(:, :), allocatable :: gdata_buf
        integer :: temp_ilut_tag, temp_sgns_tag, rest
        integer(int32) :: read_lenof_sign
        type(gdata_io_t) :: gdata_read_handler
        integer :: gdata_size, tmp_fvals_size, tmp_apvals_size
        logical :: t_read_gdata
        logical(hdf_log) :: t_exist_gdata

        ! TODO:
        ! - Read into a relatively small buffer. Make this such that all the
        !   particles could be broadcast to a single processor and everything
        !   would be fine (this is the only way to _ensure_ no overflows).
        ! - Loop over these read walkers, decode and determine which proc
        !   they are for
        ! - MPIAllToAllV to the correct processor
        ! - Ensure that these end up in the correct place in the main list,
        !   and that we don't overflow anywhere!
        ! - Writing and reading of flags, etc.
        !    -- For flags, include attribute info with which bit is which
        !       flag for supportability.

        call h5gopen_f(parent, nm_wfn_grp, grp_id, err)

        ! TODO: Make sure that this plays nicely with 32bit!!!
        if (.not. build_64bit) &
            call stop_all(t_r, "Needs manual, careful testing and adjustment")

        ! Get attributes about the sizes of stuff in the popsfile.
        ! ========

        ! We can only read in bit representations that are the right size!
        call read_int32_attribute(grp_id, nm_rep_width, bit_rep_width)
        if (bit_rep_width /= NIfD + 1) &
            call stop_all(t_r, "Mismatched bit representations")

        call read_int32_attribute(grp_id, nm_sgn_len, read_lenof_sign)
        ! as lenof_sign is of type int, do not force tmp_lenof_sign to be int32
        tmp_lenof_sign = int(read_lenof_sign)
        ! assign the tmp_inum_runs accordingly
#ifdef CMPLX_
        tmp_inum_runs = tmp_lenof_sign.div.2
#else
        tmp_inum_runs = tmp_lenof_sign
#endif

        ! read in if auto-adaptive shift was used
        ! even though this is technically calcdata, we already need it
        ! here, so we read it in alongside the walkers
        ! (calcdata has to be read in after the walkers, ugh)
        call read_log_scalar(grp_id, nm_tauto, tPopAutoAdaptiveShift, &
                             default=.false._int32, required=.false.)
        call read_log_scalar(grp_id, nm_sc_blooms, tPopScaleBlooms, &
                             default=.false._int32, required=.false.)

        ! Same goes for accum-pops and the corresponding values
        call read_log_scalar(grp_id, nm_tAP, tPopAccumPops, &
                             default=.false._int32, required=.false.)

        ! these variables are for consistency-checks
        allocate(pops_norm_sqr(tmp_lenof_sign), stat=ierr)
        allocate(pops_num_parts(tmp_lenof_sign), stat=ierr)

        call read_int64_attribute(grp_id, nm_num_dets, all_count, &
                                  required=.true.)
        call read_dp_1d_attribute(grp_id, nm_norm_sqr, pops_norm_sqr, &
                                  required=.true.)
        call read_dp_1d_attribute(grp_id, nm_num_parts, pops_num_parts, &
                                  required=.true.)

        if (lenof_sign /= tmp_lenof_sign) then
            ! currently only for real population
#ifndef CMPLX_
            ! resize the attributes
            call resize_attribute(pops_norm_sqr, lenof_sign)
            call resize_attribute(pops_num_parts, lenof_sign)
            ! notify
            write(stdout, *) "WARNING: Popsfile and input lenof_sign mismatch. Cloning replicas"
#else
            call stop_all(t_r, "Mismatched sign length")
#endif
        end if

        write(stdout, *)
        write(stdout, *) 'Reading in ', all_count, ' determinants'

        ! How many particles should each processor read in (try and distribute
        ! this as uniformly as possible. Also calculate the associated data
        ! offsets
        counts = all_count / nProcessors
        rest = int(mod(all_count, nProcessors))
        if (rest > 0) counts(0:rest - 1) = counts(0:rest - 1) + 1

        if (sum(counts) /= all_count .or. any(counts < 0)) &
            call stop_all(t_r, "Invalid particles counts")
        offsets(0) = 0
        do proc = 1, nProcessors - 1
            offsets(proc) = offsets(proc - 1) + counts(proc - 1)
        end do

        write(stdout, *) 'Reading in ', counts(iProcIndex), &
            ' determinants on this process ...'

        ! TODO: Split this up into more manageable chunks!

        ! Open the relevant datasets
        call h5dopen_f(grp_id, nm_ilut, ds_ilut, err)
        call h5dopen_f(grp_id, nm_sgns, ds_sgns, err)

        ! size of the ms data read in
        tmp_fvals_size = 2 * tmp_inum_runs

        tmp_apvals_size = tmp_lenof_sign + 1

        ! Only active the accumlation of populations if the popsfile has
        ! apvals and and accum-pops is specified with a proper iteration
        tAccumPopsActive = .false.
        if (tAccumPops .and. tPopAccumPops) then
            ! Read previous iteration data.
            ! It is read again in read_calc_data, but we need it now
            call h5gopen_f(parent, nm_calc_grp, calc_grp_id, err)
            call read_int64_scalar(calc_grp_id, nm_iters, PreviousCycles, &
                                   default=0_int64, required=.false.)
            call h5gclose_f(calc_grp_id, err)

            if (iAccumPopsIter <= PreviousCycles) then
                tAccumPopsActive = .true.
                call read_int64_scalar(grp_id, nm_accum_counter, iAccumPopsCounter, &
                                       default=0_int64, required=.false.)
                write(stdout, *) "Accumulated populations are found. Accumulation will continue."
            else
                write(stdout, *) "Accumulated populations are found, but will be discarded."
                write(stdout, *) "Accumulation will restart at iteration: ", iAccumPopsIter
            end if
        end if

        ! create an io handler for the data that is actually in the file
        ! (can have different lenof_sign and options than the one we will be using)
        call gdata_read_handler%init_gdata_io( &
            tPopAutoAdaptiveShift, tPopScaleBlooms, tPopAccumPops, &
            tmp_fvals_size, max_ratio_size, tmp_apvals_size)
        gdata_size = gdata_read_handler%entry_size()

        t_read_gdata = gdata_read_handler%t_io()
        if (t_read_gdata) then
            call h5lexists_f(grp_id, nm_gdata, t_exist_gdata, err)
            if (t_exist_gdata) then
                call h5dopen_f(grp_id, nm_gdata, ds_gdata, err)
            else
                !This is for backword-compatibility.
                !gdata group had another earlier name.
                call h5dopen_f(grp_id, nm_gdata_old, ds_gdata, err)
            end if
        end if

        ! Check that these datasets look like we expect them to.
        call check_dataset_params(ds_ilut, nm_ilut, 8_hsize_t, H5T_INTEGER_F, &
                                  [int(bit_rep_width, hsize_t), all_count])
        call check_dataset_params(ds_sgns, nm_sgns, 8_hsize_t, H5T_FLOAT_F, &
                                  [int(tmp_lenof_sign, hsize_t), all_count])
        if (t_read_gdata) then
            if (t_exist_gdata) then
                call check_dataset_params(ds_gdata, nm_gdata, 8_hsize_t, H5T_FLOAT_F, &
                                          [int(gdata_size, hsize_t), all_count])
            else
                !This is for backword-compatibility.
                !gdata group had another earlier name.
                call check_dataset_params(ds_gdata, nm_gdata_old, 8_hsize_t, H5T_FLOAT_F, &
                                          [int(gdata_size, hsize_t), all_count])
            end if
        end if

        !limit the buffer size per MPI task to 50MB or MaxSpawned entries
        block_size = 50000000 / (bit_rep_width * lenof_sign) / sizeof(SpawnedParts(1, 1))
        block_size = min(block_size, MaxSpawned)
#if 0
        do proc = 0, nProcessors - 2
            block_size = min(block_size, &
                             InitialSpawnedSlots(proc + 1) - InitialSpawnedSlots(proc))
        end do
        block_size = min(block_size, &
                         MaxSpawned - InitialSpawnedSlots(nProcessors - 1))
#endif

        ! Initialise relevant counters
        CurrWalkers = 0
        nread_walkers = 0
        pops_norm = 0
        norm = 0
        parts = 0

        ! Note that reading in the HDF5 library is zero based, not one based
        ! TODO: We need to keep looping until all the processes are done, not
        !       just the one!
        block_start = offsets(iProcIndex)
        if (iProcIndex == nProcessors - 1) then
            last_part = all_count - 1
        else
            last_part = offsets(iProcIndex + 1) - 1
        end if
        block_end = min(block_start + block_size - 1, last_part)
        this_block_size = block_end - block_start + 1

        running = .true.
        any_running = .true.

        allocate(temp_ilut(int(bit_rep_width), int(this_block_size)), stat=ierr)
        call LogMemAlloc('temp_ilut', size(temp_ilut), int(sizeof(temp_ilut(1, 1))), 'read_walkers', temp_ilut_tag, ierr)

        allocate(temp_sgns(int(tmp_lenof_sign), int(this_block_size)), stat=ierr)
        call LogMemAlloc('temp_sgns', size(temp_sgns), lenof_sign, 'read_walkers', temp_sgns_tag, ierr)

        allocate(gdata_buf(int(gdata_size), int(this_block_size)), stat=ierr)

        do while (any_running)

            ! If this is the last block, its size will differ from the biggest
            ! one allowed.
            if (running) then
                this_block_size = block_end - block_start + 1
            else
                this_block_size = 0
            end if

            ! if we resized the sign, we need to go back to the original buffer size now
            if (tmp_lenof_sign /= lenof_sign) then
                deallocate(temp_sgns)
                allocate(temp_sgns(int(tmp_lenof_sign), int(this_block_size)), stat=ierr)
                deallocate(gdata_buf)
                allocate(gdata_buf(int(gdata_size), int(this_block_size)), stat=ierr)
                call gdata_read_handler%init_gdata_io( &
                    tPopAutoAdaptiveShift, tPopScaleBlooms, tPopAccumPops, &
                    tmp_fvals_size, max_ratio_size, tmp_apvals_size)
            end if

            call read_walker_block_buff(ds_ilut, ds_sgns, ds_gdata, block_start, &
                                        this_block_size, bit_rep_width, temp_ilut, temp_sgns, &
                                        gdata_buf)

            if (tmp_lenof_sign /= lenof_sign) then
                call clone_signs(temp_sgns, tmp_lenof_sign, lenof_sign, this_block_size)
                ! resize the fvals in the same manner
                if (t_read_gdata) then
                    call gdata_read_handler%clone_gdata( &
                        gdata_buf, tmp_fvals_size, fvals_size, tmp_apvals_size, &
                        apvals_size, this_block_size)
                end if
            end if

            ! distribution and storage of the data is done with the read_handler, which
            ! has the memory layout for the current calculation
            call distribute_and_add_walkers(this_block_size, gdata_read_handler, &
                                            temp_ilut, temp_sgns, gdata_buf, &
                                            dets, nreceived, CurrWalkers, norm, parts)

            nread_walkers = nread_walkers + nreceived

            ! And update for the next block
            if (running) then
                block_start = block_end + 1
                block_end = min(block_start + block_size - 1, last_part)

                if (block_start > last_part) running = .false.
            end if

            ! Test if _all_ of the processes have finished. If they have
            ! not then
            call MPIAllLORLogical(running, any_running)

        end do

        deallocate(gdata_buf)
        deallocate(temp_ilut, temp_sgns)
        call LogMemDeAlloc('read_walkers', temp_ilut_tag)
        call LogMemDeAlloc('read_walkers', temp_sgns_tag)

        if (t_read_gdata) call h5dclose_f(ds_gdata, err)
        call h5dclose_f(ds_sgns, err)
        call h5dclose_f(ds_ilut, err)
        call h5gclose_f(grp_id, err)

        ! Do some checking
        call check_read_particles(nread_walkers, norm, parts, all_count, &
                                  pops_num_parts, pops_norm_sqr)

        deallocate(pops_num_parts)
        deallocate(pops_norm_sqr)

        write(stdout, *) "... done"
        write(stdout, *)

    end subroutine read_walkers