subroutine read_walkers(parent, dets, CurrWalkers)
use CalcData, only: pops_norm
! This is the routine that has complexity!!!
!
! We read chunks of the walkers datasets on _all_ of the processors,
! and communicate the walkers to the correct node after each of the
! blocks. This is essentially a giant annihilation step
!
! We used the spawnedparts arrays as a buffer. These are split up per
! processor in the same way as normally used for annihilation. We
! keep looping over however much we can fit in this array unil
! we are done.
!
! --> This is quite complicated, but should equally be quite efficient
integer(hid_t), intent(in) :: parent
integer(n_int), intent(out) :: dets(:, :)
integer(int64), intent(out) :: CurrWalkers
character(*), parameter :: t_r = 'read_walkers'
integer :: proc, nreceived
integer(hid_t) :: grp_id, calc_grp_id
integer(hdf_err) :: err
integer(hid_t) :: ds_sgns, ds_ilut, ds_gdata
integer(int64) :: nread_walkers
integer :: ierr
integer(int32) :: bit_rep_width
integer(hsize_t) :: all_count, block_size, counts(0:nProcessors - 1)
integer(hsize_t) :: offsets(0:nProcessors - 1)
integer(hsize_t) :: block_start, block_end, last_part
integer(hsize_t) :: this_block_size
real(dp), allocatable :: pops_num_parts(:), pops_norm_sqr(:)
real(dp) :: norm(lenof_sign), parts(lenof_sign)
logical :: running, any_running
integer(hsize_t), dimension(:, :), allocatable :: temp_ilut, temp_sgns
integer(hsize_t), dimension(:, :), allocatable :: gdata_buf
integer :: temp_ilut_tag, temp_sgns_tag, rest
integer(int32) :: read_lenof_sign
type(gdata_io_t) :: gdata_read_handler
integer :: gdata_size, tmp_fvals_size, tmp_apvals_size
logical :: t_read_gdata
logical(hdf_log) :: t_exist_gdata
! TODO:
! - Read into a relatively small buffer. Make this such that all the
! particles could be broadcast to a single processor and everything
! would be fine (this is the only way to _ensure_ no overflows).
! - Loop over these read walkers, decode and determine which proc
! they are for
! - MPIAllToAllV to the correct processor
! - Ensure that these end up in the correct place in the main list,
! and that we don't overflow anywhere!
! - Writing and reading of flags, etc.
! -- For flags, include attribute info with which bit is which
! flag for supportability.
call h5gopen_f(parent, nm_wfn_grp, grp_id, err)
! TODO: Make sure that this plays nicely with 32bit!!!
if (.not. build_64bit) &
call stop_all(t_r, "Needs manual, careful testing and adjustment")
! Get attributes about the sizes of stuff in the popsfile.
! ========
! We can only read in bit representations that are the right size!
call read_int32_attribute(grp_id, nm_rep_width, bit_rep_width)
if (bit_rep_width /= NIfD + 1) &
call stop_all(t_r, "Mismatched bit representations")
call read_int32_attribute(grp_id, nm_sgn_len, read_lenof_sign)
! as lenof_sign is of type int, do not force tmp_lenof_sign to be int32
tmp_lenof_sign = int(read_lenof_sign)
! assign the tmp_inum_runs accordingly
#ifdef CMPLX_
tmp_inum_runs = tmp_lenof_sign.div.2
#else
tmp_inum_runs = tmp_lenof_sign
#endif
! read in if auto-adaptive shift was used
! even though this is technically calcdata, we already need it
! here, so we read it in alongside the walkers
! (calcdata has to be read in after the walkers, ugh)
call read_log_scalar(grp_id, nm_tauto, tPopAutoAdaptiveShift, &
default=.false._int32, required=.false.)
call read_log_scalar(grp_id, nm_sc_blooms, tPopScaleBlooms, &
default=.false._int32, required=.false.)
! Same goes for accum-pops and the corresponding values
call read_log_scalar(grp_id, nm_tAP, tPopAccumPops, &
default=.false._int32, required=.false.)
! these variables are for consistency-checks
allocate(pops_norm_sqr(tmp_lenof_sign), stat=ierr)
allocate(pops_num_parts(tmp_lenof_sign), stat=ierr)
call read_int64_attribute(grp_id, nm_num_dets, all_count, &
required=.true.)
call read_dp_1d_attribute(grp_id, nm_norm_sqr, pops_norm_sqr, &
required=.true.)
call read_dp_1d_attribute(grp_id, nm_num_parts, pops_num_parts, &
required=.true.)
if (lenof_sign /= tmp_lenof_sign) then
! currently only for real population
#ifndef CMPLX_
! resize the attributes
call resize_attribute(pops_norm_sqr, lenof_sign)
call resize_attribute(pops_num_parts, lenof_sign)
! notify
write(stdout, *) "WARNING: Popsfile and input lenof_sign mismatch. Cloning replicas"
#else
call stop_all(t_r, "Mismatched sign length")
#endif
end if
write(stdout, *)
write(stdout, *) 'Reading in ', all_count, ' determinants'
! How many particles should each processor read in (try and distribute
! this as uniformly as possible. Also calculate the associated data
! offsets
counts = all_count / nProcessors
rest = int(mod(all_count, nProcessors))
if (rest > 0) counts(0:rest - 1) = counts(0:rest - 1) + 1
if (sum(counts) /= all_count .or. any(counts < 0)) &
call stop_all(t_r, "Invalid particles counts")
offsets(0) = 0
do proc = 1, nProcessors - 1
offsets(proc) = offsets(proc - 1) + counts(proc - 1)
end do
write(stdout, *) 'Reading in ', counts(iProcIndex), &
' determinants on this process ...'
! TODO: Split this up into more manageable chunks!
! Open the relevant datasets
call h5dopen_f(grp_id, nm_ilut, ds_ilut, err)
call h5dopen_f(grp_id, nm_sgns, ds_sgns, err)
! size of the ms data read in
tmp_fvals_size = 2 * tmp_inum_runs
tmp_apvals_size = tmp_lenof_sign + 1
! Only active the accumlation of populations if the popsfile has
! apvals and and accum-pops is specified with a proper iteration
tAccumPopsActive = .false.
if (tAccumPops .and. tPopAccumPops) then
! Read previous iteration data.
! It is read again in read_calc_data, but we need it now
call h5gopen_f(parent, nm_calc_grp, calc_grp_id, err)
call read_int64_scalar(calc_grp_id, nm_iters, PreviousCycles, &
default=0_int64, required=.false.)
call h5gclose_f(calc_grp_id, err)
if (iAccumPopsIter <= PreviousCycles) then
tAccumPopsActive = .true.
call read_int64_scalar(grp_id, nm_accum_counter, iAccumPopsCounter, &
default=0_int64, required=.false.)
write(stdout, *) "Accumulated populations are found. Accumulation will continue."
else
write(stdout, *) "Accumulated populations are found, but will be discarded."
write(stdout, *) "Accumulation will restart at iteration: ", iAccumPopsIter
end if
end if
! create an io handler for the data that is actually in the file
! (can have different lenof_sign and options than the one we will be using)
call gdata_read_handler%init_gdata_io( &
tPopAutoAdaptiveShift, tPopScaleBlooms, tPopAccumPops, &
tmp_fvals_size, max_ratio_size, tmp_apvals_size)
gdata_size = gdata_read_handler%entry_size()
t_read_gdata = gdata_read_handler%t_io()
if (t_read_gdata) then
call h5lexists_f(grp_id, nm_gdata, t_exist_gdata, err)
if (t_exist_gdata) then
call h5dopen_f(grp_id, nm_gdata, ds_gdata, err)
else
!This is for backword-compatibility.
!gdata group had another earlier name.
call h5dopen_f(grp_id, nm_gdata_old, ds_gdata, err)
end if
end if
! Check that these datasets look like we expect them to.
call check_dataset_params(ds_ilut, nm_ilut, 8_hsize_t, H5T_INTEGER_F, &
[int(bit_rep_width, hsize_t), all_count])
call check_dataset_params(ds_sgns, nm_sgns, 8_hsize_t, H5T_FLOAT_F, &
[int(tmp_lenof_sign, hsize_t), all_count])
if (t_read_gdata) then
if (t_exist_gdata) then
call check_dataset_params(ds_gdata, nm_gdata, 8_hsize_t, H5T_FLOAT_F, &
[int(gdata_size, hsize_t), all_count])
else
!This is for backword-compatibility.
!gdata group had another earlier name.
call check_dataset_params(ds_gdata, nm_gdata_old, 8_hsize_t, H5T_FLOAT_F, &
[int(gdata_size, hsize_t), all_count])
end if
end if
!limit the buffer size per MPI task to 50MB or MaxSpawned entries
block_size = 50000000 / (bit_rep_width * lenof_sign) / sizeof(SpawnedParts(1, 1))
block_size = min(block_size, MaxSpawned)
#if 0
do proc = 0, nProcessors - 2
block_size = min(block_size, &
InitialSpawnedSlots(proc + 1) - InitialSpawnedSlots(proc))
end do
block_size = min(block_size, &
MaxSpawned - InitialSpawnedSlots(nProcessors - 1))
#endif
! Initialise relevant counters
CurrWalkers = 0
nread_walkers = 0
pops_norm = 0
norm = 0
parts = 0
! Note that reading in the HDF5 library is zero based, not one based
! TODO: We need to keep looping until all the processes are done, not
! just the one!
block_start = offsets(iProcIndex)
if (iProcIndex == nProcessors - 1) then
last_part = all_count - 1
else
last_part = offsets(iProcIndex + 1) - 1
end if
block_end = min(block_start + block_size - 1, last_part)
this_block_size = block_end - block_start + 1
running = .true.
any_running = .true.
allocate(temp_ilut(int(bit_rep_width), int(this_block_size)), stat=ierr)
call LogMemAlloc('temp_ilut', size(temp_ilut), int(sizeof(temp_ilut(1, 1))), 'read_walkers', temp_ilut_tag, ierr)
allocate(temp_sgns(int(tmp_lenof_sign), int(this_block_size)), stat=ierr)
call LogMemAlloc('temp_sgns', size(temp_sgns), lenof_sign, 'read_walkers', temp_sgns_tag, ierr)
allocate(gdata_buf(int(gdata_size), int(this_block_size)), stat=ierr)
do while (any_running)
! If this is the last block, its size will differ from the biggest
! one allowed.
if (running) then
this_block_size = block_end - block_start + 1
else
this_block_size = 0
end if
! if we resized the sign, we need to go back to the original buffer size now
if (tmp_lenof_sign /= lenof_sign) then
deallocate(temp_sgns)
allocate(temp_sgns(int(tmp_lenof_sign), int(this_block_size)), stat=ierr)
deallocate(gdata_buf)
allocate(gdata_buf(int(gdata_size), int(this_block_size)), stat=ierr)
call gdata_read_handler%init_gdata_io( &
tPopAutoAdaptiveShift, tPopScaleBlooms, tPopAccumPops, &
tmp_fvals_size, max_ratio_size, tmp_apvals_size)
end if
call read_walker_block_buff(ds_ilut, ds_sgns, ds_gdata, block_start, &
this_block_size, bit_rep_width, temp_ilut, temp_sgns, &
gdata_buf)
if (tmp_lenof_sign /= lenof_sign) then
call clone_signs(temp_sgns, tmp_lenof_sign, lenof_sign, this_block_size)
! resize the fvals in the same manner
if (t_read_gdata) then
call gdata_read_handler%clone_gdata( &
gdata_buf, tmp_fvals_size, fvals_size, tmp_apvals_size, &
apvals_size, this_block_size)
end if
end if
! distribution and storage of the data is done with the read_handler, which
! has the memory layout for the current calculation
call distribute_and_add_walkers(this_block_size, gdata_read_handler, &
temp_ilut, temp_sgns, gdata_buf, &
dets, nreceived, CurrWalkers, norm, parts)
nread_walkers = nread_walkers + nreceived
! And update for the next block
if (running) then
block_start = block_end + 1
block_end = min(block_start + block_size - 1, last_part)
if (block_start > last_part) running = .false.
end if
! Test if _all_ of the processes have finished. If they have
! not then
call MPIAllLORLogical(running, any_running)
end do
deallocate(gdata_buf)
deallocate(temp_ilut, temp_sgns)
call LogMemDeAlloc('read_walkers', temp_ilut_tag)
call LogMemDeAlloc('read_walkers', temp_sgns_tag)
if (t_read_gdata) call h5dclose_f(ds_gdata, err)
call h5dclose_f(ds_sgns, err)
call h5dclose_f(ds_ilut, err)
call h5gclose_f(grp_id, err)
! Do some checking
call check_read_particles(nread_walkers, norm, parts, all_count, &
pops_num_parts, pops_norm_sqr)
deallocate(pops_num_parts)
deallocate(pops_norm_sqr)
write(stdout, *) "... done"
write(stdout, *)
end subroutine read_walkers