#include "macros.h" module hdf5_popsfile ! Read and write POPSFILES using hdf5 as the data format ! ! Data structure: ! ! --> Don't assume that only walker-data is going to be relevant ! --> Create groups appropriately. ! --> Note that we use collective writing, so we write all data from all ! nodes (except where explicitly managed), and the HDF library ! ensures the writes happen in a sensible way. !namelist /POPSHEAD/ Pop64Bit,PopHPHF,PopLz,PopNEl, & ! PopCyc,PopNIfFlag,PopNIfTot, & ! PopTau,PopiBlockingIter,PopRandomHash& ! PopNNodes, PopWalkersOnNodes, & ! PopMultiSumNoatHF, PopMultiSumENum, PopBalanceBlocks ! ! A: vcs_ver - The SHA ID of the git commit ! A: compiled_at - The time of code compilation ! A: compiled_on - The date of code compilation ! A: date - Date/time of calculation stored in popsfile ! A: seq_no - The sequence number of the calculation (i.e. ! how many restarts have there been). ! A: config - The build configuration ! A: status - (opt) indicates if local changes beyond SHA ID ! ! /system/ - Details of the system that is being restarted ! ! /calculation/ - Details of the calculation ! /random_hash/ - Random values used in the orbital mapping ! /tau_search/ - Values used in timestep optimisation ! /gamma_sing/ ! /gamma_doub/ ! /gamma_trip/ ! /gamma_opp/ ! /gamma_par/ ! /enough_sing/ ! /enough_doub/ ! /enough_trip/ ! /enough_opp/ ! /enough_par/ ! /cnt_sing/ ! /cnt_doub/ ! /cnt_trip/ ! /cnt_opp/ ! /cnt_par/ ! /max_death_cpt/ ! ! /psingles/ - And the values which have been optimised ! /pdoubles/ ! /ptriples/ ! /pparallel/ ! /tau/ ! /accumulators/ - Accumulated (output) data ! /sum_no_ref/ ! /sum_enum/ ! /completed_iters/ - How many iterations have already been completed ! /tot_imag_time/ - Total amount of imaginary time completed ! /shift/ - The diagshift value (absolute, invarient to a ! change of reference) ! ! /wavefunction/ - Details of a determinental Hilbert space ! A: width - Width of the bit-rep in 64-bit integers ! A: num_dets - Number of determinants in the file ! A: lenof_sign - Number of elements in the sign data ! A: norm_sqr - sum(sgn**2) for each sign element ! A: num_parts - sum(abs(sgn)) for each sign element ! ! /ilut/ - The bit representations of the determinants ! 
    !     /sgns/ - The occupation of the determinants

    use MPI_wrapper, only: CommGlobal, mpiInfoNull
    use Parallel_neci, only: iProcIndex, MPIBarrier, MPIBCast, stop_all, MPISumAll, &
                             MPIAllLORLogical, MPIAllGather, MPIArg, MPIAlltoAllV, &
                             MPIAlltoAll, nProcessors, MPI_MAX, MPIAllReduce, MPISum
    use error_handling_neci, only: neci_flush
    use constants, only: dp, stdout, n_int, int32, int64, hdf_log, hdf_err, lenof_sign, &
                         inum_runs, build_64bit
    use util_mod, only: near_zero, operator(.div.), get_unique_filename
    use CalcData, only: tAutoAdaptiveShift, tScaleBlooms, pSinglesIn, pDoublesIn, pTriplesIn
    use LoggingData, only: tPopAutoAdaptiveShift, tPopScaleBlooms, tAccumPops, tAccumPopsActive, &
                           iAccumPopsIter, iAccumPopsCounter, tReduceHDF5Pops, &
                           HDF5PopsMin, iHDF5PopsMinEx, tPopAccumPops
    use global_det_data, only: max_ratio_size, fvals_size, apvals_size
    use FcimcData, only: SpawnedParts2, AllTotParts, MaxSpawned, PreviousCycles, tSinglePartPhase, &
                         SpawnedParts
    use MemoryManager, only: LogMemAlloc, LogMemDeAlloc
#ifdef USE_HDF_
    use hdf5, only: h5pcreate_f, h5gopen_f, h5gcreate_f, h5dopen_f, H5P_FILE_ACCESS_F, &
                    H5F_ACC_TRUNC_F, H5F_ACC_RDONLY_F, h5fcreate_f, hid_t, hsize_t, h5garbage_collect_f, &
                    h5close_f, h5pclose_f, h5pset_fapl_mpio_f, h5open_f, &
                    h5fclose_f, h5gclose_f, h5dclose_f, h5fopen_f, H5T_INTEGER_F, &
                    H5T_FLOAT_F, H5_REAL_KIND, h5kind_to_type, h5lexists_f, H5_INTEGER_KIND
    use gdata_io, only: gdata_io_t, clone_signs, resize_attribute
    use hdf5_util, only: write_int32_attribute, write_int64_attribute, write_string_attribute, &
                         write_log_scalar, write_dp_scalar, read_string_attribute, &
                         write_int64_scalar, write_dp_1d_dataset, read_dp_scalar, &
                         read_dp_1d_dataset, read_log_scalar, read_int64_scalar, &
                         write_dp_1d_attribute, read_dp_1d_attribute, read_int64_attribute, &
                         check_dataset_params, read_int32_attribute, write_2d_multi_arr_chunk_buff, &
                         read_2d_multi_chunk, tmp_lenof_sign, tmp_inum_runs, read_cplx_1d_dataset, &
                         write_cplx_1d_dataset
#endif
    use tau_main, only: tau_search_method, input_tau_search_method, &
                        possible_tau_search_methods, tau_start_val, possible_tau_start, &
                        max_death_cpt, min_tau, max_tau, tau, assign_value_to_tau
    use tau_search_hist, only: finalize_hist_tau_search
    use tau_search_conventional, only: tau_search_stats
    use fortran_strings, only: str
    use bit_rep_data, only: NIfD, NIfTot, IlutBits, extract_sign
    use bit_reps, only: decode_bit_det

    implicit none

    private
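    ! Illustrative usage (a sketch; the real call sites live in the FCIQMC
    ! driver code, and the array name below is just an example):
    !
    !     call write_popsfile_hdf5()                    ! full popsfile
    !     call write_popsfile_hdf5(MaxEx=2)             ! truncated at doubles
    !     CurrWalkers = read_popsfile_hdf5(CurrentDets) ! restart read
    !
    ! where CurrentDets is a (0:NIfTot, :) walker array of the kind used by
    ! write_walkers below.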
    ! Constants for naming various sections
    character(*), parameter :: &
        nm_vcs_ver = 'vcs_ver', &
        nm_comp_time = 'compiled_at', &
        nm_comp_date = 'compiled_on', &
        nm_date = 'date', &
        nm_seq_no = 'seq_no', &
        nm_comp_config = 'config', &
        nm_comp_status = 'status', &
        nm_calc_grp = 'calculation', &
        nm_random_hash = 'random_hash', &
        nm_iters = 'completed_iters', &
        nm_tot_imag = 'tot_imag_time', &
        nm_shift = 'shift', &
        nm_tAuto = 'tAutoAdaptiveShift', &
        nm_tAP = 'tAccumPops', &
        nm_accum_counter = 'iAccumPopsCounter', &
        nm_sc_blooms = 'tScaleBlooms', &
        nm_tau_grp = 'tau_search', &
        nm_gam_sing = 'gamma_sing', &
        nm_gam_doub = 'gamma_doub', &
        nm_gam_trip = 'gamma_trip', &
        nm_gam_opp = 'gamma_opp', &
        nm_gam_par = 'gamma_par', &
        nm_en_sing = 'enough_sing', &
        nm_en_doub = 'enough_doub', &
        nm_en_trip = 'enough_trip', &
        nm_en_opp = 'enough_opp', &
        nm_en_par = 'enough_par', &
        nm_cnt_sing = 'cnt_sing', &
        nm_cnt_doub = 'cnt_doub', &
        nm_cnt_trip = 'cnt_trip', &
        nm_cnt_opp = 'cnt_opp', &
        nm_cnt_par = 'cnt_par', &
        nm_max_death = 'max_death', &
        nm_psingles = 'psingles', &
        nm_pdoubles = 'pdoubles', &
        nm_ptriples = 'ptriples', &
        nm_pparallel = 'pparallel', &
        nm_tau = 'tau', &
        ! [W.D.]:
        ! can i just add another entry without breaking anything?
        nm_hist_tau = 'hist_tau_search', &
        nm_acc_grp = 'accumulators', &
        nm_sum_no_ref = 'sum_no_ref', &
        nm_sum_enum = 'sum_enum', &
        nm_wfn_grp = 'wavefunction', &
        nm_rep_width = 'width', &
        nm_sgn_len = 'lenof_sign', &
        nm_num_dets = 'num_dets', &
        nm_ilut = 'ilut', &
        nm_sgns = 'sgns', &
        nm_norm_sqr = 'norm_sqr', &
        nm_num_parts = 'num_parts', &
        nm_gdata = 'gdata', &
        nm_gdata_old = 'fvals'

#ifdef USE_HDF_
    integer(n_int), dimension(:, :), allocatable :: receivebuff
    integer :: receivebuff_tag
#endif

    public :: write_popsfile_hdf5, read_popsfile_hdf5
    public :: add_pops_norm_contrib

contains

    subroutine write_popsfile_hdf5(MaxEx, IterSuffix)

        use CalcData, only: iPopsFileNoWrite
        use LoggingData, only: tIncrementPops
        use FciMCData, only: Iter, PreviousCycles

        ! TODO:
        !  1) Deal with multiple filenames
        !  2) Deal with build configurations without HDF5
        !  3) Deal with HDF5 build configurations without MPIO
        !  4) Should we in some way make incrementpops default?

        integer, intent(in), optional :: MaxEx
        logical, intent(in), optional :: IterSuffix

#ifndef USE_HDF_
        character(*), parameter :: this_routine = 'write_popsfile_hdf5'
#endif
#ifdef USE_HDF_
        integer(hid_t) :: plist_id, file_id
        integer(hdf_err) :: err
        integer :: mpi_err
        character(255) :: filename
        character(64) :: stem
        character(4) :: MaxExStr
        character(32) :: IterStr

        ! Get a unique filename for this popsfile. This needs to be done on
        ! the head node to avoid collisions.
        if (iProcIndex == 0) then
            if (present(MaxEx)) then
                write(MaxExStr, '(I0)') MaxEx
                stem = 'popsfile_trunc'//MaxExStr
            else
                stem = 'popsfile'
            end if
            ! n.b. nested test: Fortran does not guarantee short-circuit
            ! evaluation, so IterSuffix must not be referenced in the same
            ! condition as present(IterSuffix).
            if (present(IterSuffix)) then
                if (IterSuffix) then
                    write(IterStr, '(I0)') (Iter + PreviousCycles)
                    stem = trim(stem)//'_'//IterStr
                end if
            end if
            call get_unique_filename(trim(stem), &
                                     tIncrementPops, .true., iPopsFileNoWrite, &
                                     filename, ext='.h5')
        end if
        call MPIBCast(filename)

        write(stdout, *)
        if (present(MaxEx)) then
            write(stdout, *) "========= Writing Truncated HDF5 popsfile ========="
        else
            write(stdout, *) "============== Writing HDF5 popsfile =============="
        end if
        write(stdout, *) "File name: ", trim(filename)

        ! Initialise the hdf5 fortran interface
        call h5open_f(err)

        ! Set up a property list to ensure file handling across all nodes.
        ! TODO: Check if we should be using a more specific communicator
        call h5pcreate_f(H5P_FILE_ACCESS_F, plist_id, err)
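        ! n.b. the property list created above requests MPI-IO file access
        ! (set up just below), so that file creation and the dataset writes
        ! are performed collectively across all ranks of CommGlobal rather
        ! than through a single writer.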
        ! call h5pset_fapl_mpio_f(plist_id, MPI_COMM_WORLD, MPI_INFO_NULL, err)
        call h5pset_fapl_mpio_f(plist_id, CommGlobal, mpiInfoNull, err)

        ! TODO: Do sensible file handling here...
        call h5fcreate_f(filename, H5F_ACC_TRUNC_F, file_id, err, &
                         access_prp=plist_id)
        call h5pclose_f(plist_id, err)

        write(stdout, *) "writing metadata"
        call write_metadata(file_id)
        write(stdout, *) "writing calc_data"
        call write_calc_data(file_id)
        call MPIBarrier(mpi_err)
        if (present(MaxEx)) then
            write(stdout, *) "writing walkers up to excitation level: ", MaxExStr
        else
            write(stdout, *) "writing walkers"
        end if
        call write_walkers(file_id, MaxEx)
        call MPIBarrier(mpi_err)
        write(stdout, *) "closing popsfile"

        ! And we are done!
        call h5fclose_f(file_id, err)
        call h5close_f(err)
        call h5garbage_collect_f(err)
        call MPIBarrier(mpi_err)
        write(stdout, *) "popsfile write successful"
#else
        call stop_all(this_routine, 'HDF5 support not enabled at compile time')
        unused_var(MaxEx)
        unused_var(IterSuffix)
        unused_var(tIncrementPops)
        unused_var(iPopsFileNoWrite)
        unused_var(Iter)
        unused_var(PreviousCycles)
#endif

    end subroutine write_popsfile_hdf5

    function read_popsfile_hdf5(dets) result(CurrWalkers)

#ifdef USE_HDF_
        use CalcData, only: iPopsFileNoRead
        use LoggingData, only: tIncrementPops
#endif

        ! Read a popsfile in, prior to running a new calculation
        ! TODO: Integrate with CheckPopsParams
        !
        ! n.b. This reads into the specified array, to allow use of popsfiles
        !      for initialising perturbations, etc.

        integer(n_int), intent(out) :: dets(:, :)
        integer(int64) :: CurrWalkers
        character(*), parameter :: t_r = 'read_popsfile_hdf5'

#ifdef USE_HDF_
        integer(hid_t) :: file_id, plist_id
        integer(hdf_err) :: err
        integer :: mpi_err
        character(255) :: filename

        ! Get the name for the popsfile to read in
        if (iProcIndex == 0) &
            call get_unique_filename('popsfile', tIncrementPops, .false., &
                                     iPopsFileNoRead, filename, ext='.h5')
        call MPIBcast(filename)

        write(stdout, *)
        write(stdout, *) "========== Reading in from HDF5 popsfile =========="
        write(stdout, *) 'File name: ', trim(filename)

        ! Initialise the hdf5 fortran interface
        call h5open_f(err)

        ! Set up a property list to ensure file handling across all nodes.
        ! TODO: Check if we should be using a more specific communicator
        call h5pcreate_f(H5P_FILE_ACCESS_F, plist_id, err)
        ! call h5pset_fapl_mpio_f(plist_id, MPI_COMM_WORLD, MPI_INFO_NULL, err)
        call h5pset_fapl_mpio_f(plist_id, CommGlobal, mpiInfoNull, err)

        ! Open the popsfile
        call h5fopen_f(filename, H5F_ACC_RDONLY_F, file_id, err, &
                       access_prp=plist_id)
        if (err /= 0) call stop_all(t_r, "No popsfile.h5 found")

        call read_metadata(file_id)
        call read_walkers(file_id, dets, CurrWalkers)
        call read_calc_data(file_id)

        ! And we are done
        call h5pclose_f(plist_id, err)
        call h5fclose_f(file_id, err)
        call h5close_f(err)
        call neci_flush(stdout)
        call MPIBarrier(mpi_err)
#else
        CurrWalkers = 0
        call stop_all(t_r, 'HDF5 support not enabled at compile time')
        ! just takes care of compiler warnings, this code is actually unreachable
        dets = 0
#endif

    end function

#ifdef USE_HDF_

    subroutine write_metadata(parent)

        use CalcData, only: calc_seq_no

        ! Output macroscopic metadata applicable to a restart file, which may
        ! be used for establishing the provenance of calculations, etc.

        integer(hid_t), intent(in) :: parent
        character(19) :: date_str
        integer :: date_values(8)
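        ! The attributes written below (commit SHA, build configuration and
        ! date, calculation date, sequence number) record where a popsfile
        ! came from. date_str is formatted as 'YYYY-MM-DD hh:mm:ss', e.g.
        ! '2024-05-17 09:30:12' (illustrative value).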
        ! TODO: Run by
        call write_string_attribute(parent, nm_vcs_ver, _VCS_VER)
        call write_string_attribute(parent, nm_comp_date, __DATE__)
        call write_string_attribute(parent, nm_comp_time, __TIME__)
        call write_string_attribute(parent, nm_comp_config, _CONFIG)
#ifdef _WORKING_DIR_CHANGES
        call write_string_attribute(parent, nm_comp_status, &
                                    "Working directory contains local changes")
#endif

        ! How many calculations have been run to get to this popsfile point?
        call write_int32_attribute(parent, nm_seq_no, int(calc_seq_no, int32))

        ! When are we running this?
        call date_and_time(values=date_values)
        write(date_str, '(i4.4,"-",i2.2,"-",i2.2," ",i2.2,":",i2.2,":",i2.2)') &
            date_values(1:3), date_values(5:7)
        call write_string_attribute(parent, nm_date, date_str)

    end subroutine

    subroutine read_metadata(parent)

        use CalcData, only: calc_seq_no

        ! Read in the macroscopic metadata applicable to the restart file.

        integer(hid_t), intent(in) :: parent
        logical :: exists
        character(100) :: str_buf

        write(stdout, *) 'Previous calculation'

        call read_string_attribute(parent, nm_date, str_buf, exists)
        if (exists) write(stdout, *) 'Date: ', trim(str_buf)

        call read_int32_attribute(parent, nm_seq_no, calc_seq_no, &
                                  default=1_int32)
        write(stdout, *) 'Sequence no.:', calc_seq_no

        ! Output nice details for usability
        call read_string_attribute(parent, nm_vcs_ver, str_buf, exists)
        if (exists) write(stdout, *) 'VCS ver: ', trim(str_buf)
        call read_string_attribute(parent, nm_comp_config, str_buf, exists)
        if (exists) write(stdout, *) 'Build config: ', trim(str_buf)
        call read_string_attribute(parent, nm_comp_status, str_buf, exists)
        if (exists) write(stdout, *) 'Build status: ', trim(str_buf)
        call read_string_attribute(parent, nm_comp_date, str_buf, exists)
        if (exists) write(stdout, *) 'Build date: ', trim(str_buf)
        call read_string_attribute(parent, nm_comp_time, str_buf, exists)
        if (exists) write(stdout, *) 'Build time: ', trim(str_buf)

        ! Update values for the new calculation
        calc_seq_no = calc_seq_no + 1

    end subroutine

    subroutine write_calc_data(parent)

        use FciMCData, only: Iter, PreviousCycles, TotImagTime, Hii
        use CalcData, only: DiagSft

        integer(hid_t), intent(in) :: parent
        integer(hid_t) :: calc_grp
        integer(hdf_err) :: err

        ! Firstly create the group for storing calculation-related data
        call h5gcreate_f(parent, nm_calc_grp, calc_grp, err)

        call MPIBcast(PreviousCycles)
        call write_int64_scalar(calc_grp, nm_iters, Iter + PreviousCycles)
        call write_dp_scalar(calc_grp, nm_tot_imag, TotImagTime)
        call write_dp_1d_dataset(calc_grp, nm_shift, DiagSft + Hii)

        ! Output the values used for tau optimisation. Only output non-zero
        ! (i.e. used) values.
        call write_tau_opt(calc_grp)

        ! Output accumulator data
        call write_accumulator_data(calc_grp)

        ! Clear stuff up
        call h5gclose_f(calc_grp, err)

    end subroutine

    subroutine write_tau_opt(parent)

        use FciMCData, only: pSingles, pDoubles, pParallel
        use tc_three_body_data, only: pTriples

        integer(hid_t), intent(in) :: parent
        integer(hid_t) :: tau_grp
        integer(hdf_err) :: err
        real(dp) :: max_gam_sing, max_gam_doub, max_gam_trip, max_gam_opp, max_gam_par
        real(dp) :: max_max_death_cpt
        logical :: all_en_sing, all_en_doub, all_en_trip, all_en_opp, all_en_par
        integer :: max_cnt_sing, max_cnt_doub, max_cnt_trip, max_cnt_opp, max_cnt_par
        real(dp) :: all_pdoub, all_psing, all_ppar, all_tau, all_ptrip

        ! Create the group
        call h5gcreate_f(parent, nm_tau_grp, tau_grp, err)

        ! We want to use the maximised values across all the processors
        ! (there is nothing ensuring that all the processors are adjusted to
        ! the same values...)
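        ! For example, gamma_sing is stored as its MPI_MAX reduction over all
        ! processors, and the enough_* flags as a logical OR, so one processor
        ! having gathered enough statistics counts as globally sufficient.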
        call MPIAllReduce(tau_search_stats%gamma_sing, MPI_MAX, max_gam_sing)
        call MPIAllReduce(tau_search_stats%gamma_doub, MPI_MAX, max_gam_doub)
        call MPIAllReduce(tau_search_stats%gamma_trip, MPI_MAX, max_gam_trip)
        call MPIAllReduce(tau_search_stats%gamma_opp, MPI_MAX, max_gam_opp)
        call MPIAllReduce(tau_search_stats%gamma_par, MPI_MAX, max_gam_par)
        call MPIAllReduce(max_death_cpt, MPI_MAX, max_max_death_cpt)
        call MPIAllLORLogical(tau_search_stats%enough_sing, all_en_sing)
        call MPIAllLORLogical(tau_search_stats%enough_doub, all_en_doub)
        call MPIAllLORLogical(tau_search_stats%enough_trip, all_en_trip)
        call MPIAllLORLogical(tau_search_stats%enough_opp, all_en_opp)
        call MPIAllLORLogical(tau_search_stats%enough_par, all_en_par)
        call MPIAllReduce(tau_search_stats%cnt_sing, MPI_MAX, max_cnt_sing)
        call MPIAllReduce(tau_search_stats%cnt_doub, MPI_MAX, max_cnt_doub)
        call MPIAllReduce(tau_search_stats%cnt_trip, MPI_MAX, max_cnt_trip)
        call MPIAllReduce(tau_search_stats%cnt_opp, MPI_MAX, max_cnt_opp)
        call MPIAllReduce(tau_search_stats%cnt_par, MPI_MAX, max_cnt_par)

        if (.not. near_zero(max_gam_sing)) &
            call write_dp_scalar(tau_grp, nm_gam_sing, max_gam_sing)
        if (.not. near_zero(max_gam_doub)) &
            call write_dp_scalar(tau_grp, nm_gam_doub, max_gam_doub)
        if (.not. near_zero(max_gam_trip)) &
            call write_dp_scalar(tau_grp, nm_gam_trip, max_gam_trip)
        if (.not. near_zero(max_gam_opp)) &
            call write_dp_scalar(tau_grp, nm_gam_opp, max_gam_opp)
        if (.not. near_zero(max_gam_par)) &
            call write_dp_scalar(tau_grp, nm_gam_par, max_gam_par)
        if (.not. near_zero(max_max_death_cpt)) &
            call write_dp_scalar(tau_grp, nm_max_death, max_max_death_cpt)

        if (all_en_sing) &
            call write_log_scalar(tau_grp, nm_en_sing, all_en_sing)
        if (all_en_doub) &
            call write_log_scalar(tau_grp, nm_en_doub, all_en_doub)
        if (all_en_trip) &
            call write_log_scalar(tau_grp, nm_en_trip, all_en_trip)
        if (all_en_opp) &
            call write_log_scalar(tau_grp, nm_en_opp, all_en_opp)
        if (all_en_par) &
            call write_log_scalar(tau_grp, nm_en_par, all_en_par)

        if (max_cnt_sing /= 0) &
            call write_int64_scalar(tau_grp, nm_cnt_sing, max_cnt_sing)
        if (max_cnt_doub /= 0) &
            call write_int64_scalar(tau_grp, nm_cnt_doub, max_cnt_doub)
        if (max_cnt_trip /= 0) &
            call write_int64_scalar(tau_grp, nm_cnt_trip, max_cnt_trip)
        if (max_cnt_opp /= 0) &
            call write_int64_scalar(tau_grp, nm_cnt_opp, max_cnt_opp)
        if (max_cnt_par /= 0) &
            call write_int64_scalar(tau_grp, nm_cnt_par, max_cnt_par)

        ! Use the probability values from the head node
        all_psing = pSingles; all_pdoub = pDoubles; all_ptrip = pTriples; all_ppar = pParallel
        all_tau = tau
        call MPIBcast(all_psing)
        call MPIBcast(all_pdoub)
        call MPIBCast(all_ptrip)
        call MPIBcast(all_ppar)
        call MPIBcast(all_tau)
        call write_dp_scalar(tau_grp, nm_psingles, all_psing)
        call write_dp_scalar(tau_grp, nm_pdoubles, all_pdoub)
        call write_dp_scalar(tau_grp, nm_ptriples, all_ptrip)
        call write_dp_scalar(tau_grp, nm_pparallel, all_ppar)
        call write_dp_scalar(tau_grp, nm_tau, all_tau)

        ! [W.D.]:
        ! for the new hist-tau search i essentially only need to indicate
        ! that a histogramming tau-search was used:
        if (allocated(input_tau_search_method)) then
            if (input_tau_search_method == possible_tau_search_methods%HISTOGRAMMING) then
                call write_log_scalar(tau_grp, nm_hist_tau, .true.)
            end if
        end if

        ! Clear up
        call h5gclose_f(tau_grp, err)

    end subroutine write_tau_opt

    subroutine write_accumulator_data(parent)

        use FciMCData, only: AllSumNoatHF, AllSumENum

        integer(hid_t), intent(in) :: parent
        integer(hid_t) :: acc_grp
        integer(hdf_err) :: err
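        ! n.b. AllSumENum is complex-valued in CMPLX_ builds, hence the
        ! write_cplx_1d_dataset branch below; AllSumNoatHF is written as
        ! real in either case.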
        ! Create group
        call h5gcreate_f(parent, nm_acc_grp, acc_grp, err)

        ! Write the energy accumulator values
        ! (n.b. ensure values on all procs)
        call MPIBcast(AllSumENum)
        call MPIBcast(AllSumNoatHF)
#ifdef CMPLX_
        call write_cplx_1d_dataset(acc_grp, nm_sum_enum, AllSumENum)
#else
        call write_dp_1d_dataset(acc_grp, nm_sum_enum, AllSumENum)
#endif
        call write_dp_1d_dataset(acc_grp, nm_sum_no_ref, AllSumNoatHF)

        ! Clear up
        call h5gclose_f(acc_grp, err)

    end subroutine

    subroutine read_calc_data(parent)

        use FciMCData, only: PreviousCycles, Hii, TotImagTime, &
                             pSingles, pDoubles, pParallel
        use CalcData, only: DiagSft, tWalkContGrow, hdf5_diagsft
        use tc_three_body_data, only: pTriples, tReadPTriples

        integer(hid_t), intent(in) :: parent
        integer(hid_t) :: grp_id
        integer(hdf_err) :: err
        logical :: exists
        debug_function_name("read_calc_data")

        call h5gopen_f(parent, nm_calc_grp, grp_id, err)

        ! Previous iteration data.
        call read_int64_scalar(grp_id, nm_iters, PreviousCycles, &
                               default=0_int64, exists=exists)
        if (exists) &
            write(stdout, *) 'Completed iterations: ', PreviousCycles

        call read_dp_scalar(grp_id, nm_tot_imag, TotImagTime, default=0.0_dp, &
                            exists=exists)
        if (exists) &
            write(stdout, *) 'Resuming calculation after ', TotImagTime, ' a.u.'

        ! Read in the diagsft. Note that it uses the absolute value (to be
        ! independent of choice of reference), so we must subtract out the
        ! reference
        ! TODO: Do scale up from 1 --> 2 runs for RDMs
        if (.not. tWalkContGrow) then
            call read_dp_1d_dataset(grp_id, nm_shift, DiagSft, required=.true.)
            DiagSft = DiagSft - Hii
            tSinglePartPhase = (abs(DiagSft(1)) < 1.0e-6)
            write(stdout, *) 'Initial shift: ', DiagSft
        else
            tSinglePartPhase = .true.
            ! if the number of runs changed, the shift also has a different size;
            ! i still want to capture the diagshift in a temporary variable
            ! at least
            call read_dp_1d_dataset(grp_id, nm_shift, hdf5_diagsft, required=.true.)
            hdf5_diagsft = hdf5_diagsft - Hii
        end if

        ! [W.D.]:
        ! i think i also want to read in pSingles etc. even if we do not
        ! tau-search anymore in a restarted run..
        ! but i guess i have to be careful to set the appropriate
        ! default, if no tau-search was used and then is restarted..
        ! well, even if the tau-search is not turned on, the
        ! values are written anyway.. so i can also read them in, but
        ! not use the read-in tau, but the one specified in the input!
        ! except when hist_tau was used: then we want to use the
        ! one in the popsfile all the time
        call read_tau_opt(grp_id)

        ! and also output some info:
        write(stdout, *) "read-in tau optimization data: "
        write(stdout, *) "time-step: ", tau
        write(stdout, *) "pSingles: ", pSingles
        write(stdout, *) "pDoubles: ", pDoubles
        if (tReadPTriples) write(stdout, *) "pTriples: ", pTriples
        write(stdout, *) "pParallel: ", pParallel

        if (tau_search_method == possible_tau_search_methods%CONVENTIONAL) then
            write(stdout, *) "continuing conventional tau-search!"
        else if (tau_search_method == possible_tau_search_methods%HISTOGRAMMING) then
            write(stdout, *) "continuing histogramming tau-search!"
        else
            ASSERT(tau_search_method == possible_tau_search_methods%OFF)
            write(stdout, *) "Do not continue tau-search!"
        end if

        call read_accumulator_data(grp_id)

        ! TODO: Read nbasis, nel, ms2, etc.
        !       --> Check that these values haven't changed from the
        !           previous run

        call h5gclose_f(grp_id, err)

    end subroutine

    subroutine read_tau_opt(parent)

        use FciMCData, only: pSingles, pDoubles, pParallel
        use tc_three_body_data, only: pTriples, tReadPTriples

        ! Read accumulator values for the timestep optimisation
        ! TODO: Add an option to reset these values...

        integer(hid_t), intent(in) :: parent
        integer(hid_t) :: grp_id
        integer(hdf_err) :: err
        logical :: ppar_set, tau_set, hist_tau, temp_previous
        character(*), parameter :: this_routine = "read_tau_opt"
        real(dp) :: temp_tau

        call h5gopen_f(parent, nm_tau_grp, grp_id, err)

        ! These are all optional things to have in the popsfile. If they don't
        ! exist, then they will be left unchanged.
        call read_dp_scalar(grp_id, nm_gam_sing, tau_search_stats%gamma_sing)
        call read_dp_scalar(grp_id, nm_gam_doub, tau_search_stats%gamma_doub)
        call read_dp_scalar(grp_id, nm_gam_trip, tau_search_stats%gamma_trip)
        call read_dp_scalar(grp_id, nm_gam_opp, tau_search_stats%gamma_opp)
        call read_dp_scalar(grp_id, nm_gam_par, tau_search_stats%gamma_par)
        call read_dp_scalar(grp_id, nm_max_death, max_death_cpt)
        call read_log_scalar(grp_id, nm_en_sing, tau_search_stats%enough_sing)
        call read_log_scalar(grp_id, nm_en_doub, tau_search_stats%enough_doub)
        call read_log_scalar(grp_id, nm_en_trip, tau_search_stats%enough_trip)
        call read_log_scalar(grp_id, nm_en_opp, tau_search_stats%enough_opp)
        call read_log_scalar(grp_id, nm_en_par, tau_search_stats%enough_par)
        call read_int64_scalar(grp_id, nm_cnt_sing, tau_search_stats%cnt_sing)
        call read_int64_scalar(grp_id, nm_cnt_doub, tau_search_stats%cnt_doub)
        call read_int64_scalar(grp_id, nm_cnt_trip, tau_search_stats%cnt_trip)
        call read_int64_scalar(grp_id, nm_cnt_opp, tau_search_stats%cnt_opp)
        call read_int64_scalar(grp_id, nm_cnt_par, tau_search_stats%cnt_par)

        if (allocated(pSinglesIn) .or. allocated(pDoublesIn)) then
            write(stdout, *) "pSingles or pDoubles specified in input file, which take precedence"
        else
            call read_dp_scalar(grp_id, nm_psingles, pSingles)
            call read_dp_scalar(grp_id, nm_pdoubles, pDoubles)
        end if

        if (allocated(pTriplesIn)) then
            write(stdout, *) "pTriples specified in input file, which takes precedence"
        else
            call read_dp_scalar(grp_id, nm_ptriples, pTriples, exists=tReadPTriples)
        end if

        call read_dp_scalar(grp_id, nm_pparallel, pParallel, exists=ppar_set)

        ! here i want to make the distinction if we want to tau-search
        ! or not
        call read_log_scalar(grp_id, nm_hist_tau, temp_previous, exists=hist_tau)
        call read_dp_scalar(grp_id, nm_tau, temp_tau, exists=tau_set)

        call h5gclose_f(grp_id, err)

        if (tau_start_val == possible_tau_start%from_popsfile) then
            if (tau_set) then
                if (temp_tau < min_tau .or. temp_tau > max_tau) then
                    call stop_all(this_routine, "The read tau "//str(temp_tau, after_comma=5)// &
                                  " lies outside the allowed range [min_tau, max_tau]")
                end if
                call assign_value_to_tau(temp_tau, 'Initialization from HDF5 file.')
            else
                call stop_all(this_routine, 'Time-step does not exist in Popsfile')
            end if
        end if

    end subroutine

    subroutine read_accumulator_data(parent)

        use FciMCData, only: AllSumNoatHF, AllSumENum

        integer(hid_t), intent(in) :: parent
        integer(hid_t) :: grp_id
        integer(hdf_err) :: err

        call h5gopen_f(parent, nm_acc_grp, grp_id, err)

        call read_dp_1d_dataset(grp_id, nm_sum_no_ref, AllSumNoatHF, &
                                required=.true.)
#ifdef CMPLX_
        call read_cplx_1d_dataset(grp_id, nm_sum_enum, AllSumENum, &
                                  required=.true.)
#else
        call read_dp_1d_dataset(grp_id, nm_sum_enum, AllSumENum, &
                                required=.true.)
#endif

        call h5gclose_f(grp_id, err)

    end subroutine

    subroutine write_walkers(parent, MaxEx)

        use FciMCData, only: CurrentDets, TotWalkers, iLutHF
        use DetBitOps, only: FindBitExcitLevel, tAccumEmptyDet
        use global_det_data, only: writeFValsAsInt, writeAPValsAsInt

        ! Output the wavefunction information to the relevant groups in the
        ! wavefunction.
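        ! A determinant is skipped by the copy loop below if it is unoccupied
        ! (unless its population is being accumulated), if it lies above MaxEx
        ! when writing a truncated popsfile, or, with tReduceHDF5Pops, if it
        ! is both weakly populated and highly excited.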
        integer(hid_t), intent(in) :: parent
        integer, intent(in), optional :: MaxEx
        character(*), parameter :: t_r = 'write_walkers'

        integer(hid_t) :: wfn_grp_id
        integer(hdf_err) :: err
        integer(hsize_t) :: counts(0:nProcessors - 1)
        integer(hsize_t) :: all_count
        integer(int32) :: bit_rep_width
        integer(hsize_t) :: write_offset(2)
        real(dp) :: all_parts(lenof_sign), all_norm_sqr(lenof_sign)
        integer :: ierr
        integer(n_int), allocatable :: gdata_buf(:, :)
        integer :: ExcitLevel
        integer(hsize_t) :: printed_count
        integer(kind=n_int), allocatable, target :: TmpVecDets(:, :)
        integer(kind=n_int), pointer :: PrintedDets(:, :)
        integer :: i
        real(dp) :: CurrentSign(lenof_sign), printed_tot_parts(lenof_sign), &
                    printed_norm_sqr(inum_runs)
        type(gdata_io_t) :: gdata_write_handler
        integer :: gdata_size
#ifdef CMPLX_
        integer :: run
#endif

        ! We do not want to print all dets. At least empty dets should be
        ! skipped. Let us find out which ones are to be printed, and copy them.
        allocate(TmpVecDets(0:NIfTot, TotWalkers))
        PrintedDets => TmpVecDets
        printed_count = 0
        printed_norm_sqr = 0
        printed_tot_parts = 0

        call gdata_write_handler%init_gdata_io(tAutoAdaptiveShift, &
                                               tScaleBlooms, tAccumPopsActive, fvals_size, max_ratio_size, lenof_sign + 1)
        gdata_size = gdata_write_handler%entry_size()
        if (gdata_size > 0) allocate(gdata_buf(gdata_size, TotWalkers))

        do i = 1, int(TotWalkers)
            call extract_sign(CurrentDets(:, i), CurrentSign)

            ! Skip empty determinants (unless we are accumulating its population)
            if (IsUnoccDet(CurrentSign) .and. .not. tAccumEmptyDet(CurrentDets(:, i))) cycle

            ExcitLevel = FindBitExcitLevel(iLutHF, CurrentDets(:, i))

            ! If we are printing truncated popsfiles, skip over large excitations
            ! (n.b. nested test: no short-circuit guarantee in Fortran)
            if (present(MaxEx)) then
                if (ExcitLevel > MaxEx) cycle
            end if

            ! If we are reducing the size of popsfiles, skip dets with low
            ! population and high excitations
            if (tReduceHDF5Pops) then
                if (all(abs(CurrentSign) <= HDF5PopsMin) .and. ExcitLevel > iHDF5PopsMinEx) cycle
            end if

            printed_count = printed_count + 1

            ! Fill in det
            PrintedDets(:, printed_count) = CurrentDets(:, i)

            ! Fill in global data
            if (gdata_size > 0) then
                ! get the statistics of THIS processor
                call gdata_write_handler%write_gdata_hdf5(gdata_buf(:, printed_count), i)
            end if

            ! Fill in stats
            printed_tot_parts = printed_tot_parts + abs(CurrentSign)
#ifdef CMPLX_
            do run = 1, inum_runs
                printed_norm_sqr(run) = printed_norm_sqr(run) + &
                                        sum(CurrentSign(min_part_type(run):max_part_type(run))**2)
            end do
#else
            printed_norm_sqr = printed_norm_sqr + CurrentSign**2
#endif
        end do

        ! TODO: Add a (slower) fallback routine for weird cases, odd HDF libs

        ! Firstly create the group for storing wavefunction info
        call h5gcreate_f(parent, nm_wfn_grp, wfn_grp_id, err)

        ! TODO: Refactor these chunks into their own little subroutines.

        ! We fix the format of the binary file. Thus if we are on a 32-bit
        ! build, we need to convert the data into 64-bit compatible chunks.
        if (build_64bit) then
            bit_rep_width = NIfD + 1
        else
            bit_rep_width = 2 * (NIfD + 1)
            call stop_all(t_r, "Needs manual, careful, testing")
        end if

        ! How many occupied determinants are there on each of the processors
        call MPIAllGather(printed_count, counts, ierr)
        all_count = sum(counts)
        write_offset = [0_hsize_t, sum(counts(0:iProcIndex - 1))]

        ! Output the bit-representation data
        call write_int32_attribute(wfn_grp_id, nm_rep_width, bit_rep_width)
        call write_int32_attribute(wfn_grp_id, nm_sgn_len, &
                                   int(lenof_sign, int32))
        call write_int64_attribute(wfn_grp_id, nm_num_dets, all_count)

        ! TODO: Check these values. May need to sum them explicitly
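        ! Worked example of the slab layout above (illustrative numbers):
        ! with three ranks holding counts = [100, 80, 120], all_count = 300;
        ! rank 1 writes its contiguous slab at row offset
        ! sum(counts(0:0)) = 100, i.e. write_offset = [0, 100].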
        ! denote if auto-adaptive shift was used
        call write_log_scalar(wfn_grp_id, nm_tAuto, tAutoAdaptiveShift)
        call write_log_scalar(wfn_grp_id, nm_sc_blooms, tScaleBlooms)

        ! denote if accum-pops will be written
        call write_log_scalar(wfn_grp_id, nm_tAP, tAccumPopsActive)
        if (tAccumPopsActive) then
            ! Write the number of iterations during which the populations
            ! have been accumulated
            call write_int64_scalar(wfn_grp_id, nm_accum_counter, iAccumPopsCounter)
        end if

        ! Accumulated values only valid on head node. collate_iter_data
        ! has not yet run.
        call MPISumAll(printed_norm_sqr, all_norm_sqr)
        call MPISumAll(printed_tot_parts, all_parts)
        call write_dp_1d_attribute(wfn_grp_id, nm_norm_sqr, all_norm_sqr)
        call write_dp_1d_attribute(wfn_grp_id, nm_num_parts, all_parts)

        ! we do an explicitly buffered write to avoid performance problems
        ! with complicated hyperslabs + collective buffering

        ! Write out the determinant bit-representations
        call write_2d_multi_arr_chunk_buff( &
            wfn_grp_id, nm_ilut, h5kind_to_type(int64, H5_INTEGER_KIND), &
            PrintedDets, &
            [int(NIfD + 1, hsize_t), int(printed_count, hsize_t)], & ! dims
            [0_hsize_t, 0_hsize_t], &                                ! offset
            [int(NIfD + 1, hsize_t), all_count], &                   ! all dims
            [0_hsize_t, sum(counts(0:iProcIndex - 1))] &             ! output offset
        )

        call write_2d_multi_arr_chunk_buff( &
            wfn_grp_id, nm_sgns, h5kind_to_type(dp, H5_REAL_KIND), &
            PrintedDets, &
            [int(lenof_sign, hsize_t), int(printed_count, hsize_t)], & ! dims
            [int(IlutBits%ind_pop, hsize_t), 0_hsize_t], &             ! offset
            [int(lenof_sign, hsize_t), all_count], &                   ! all dims
            [0_hsize_t, sum(counts(0:iProcIndex - 1))] &               ! output offset
        )
        deallocate(TmpVecDets)

        if (gdata_size > 0) then
            call write_2d_multi_arr_chunk_buff( &
                wfn_grp_id, nm_gdata, h5kind_to_type(dp, H5_REAL_KIND), gdata_buf, &
                [int(gdata_size, hsize_t), int(printed_count, hsize_t)], & ! dims
                [0_hsize_t, 0_hsize_t], &
                [int(gdata_size, hsize_t), all_count], &
                [0_hsize_t, sum(counts(0:iProcIndex - 1))] &
            )
            deallocate(gdata_buf)
        end if

        ! And we are done
        call h5gclose_f(wfn_grp_id, err)

    end subroutine write_walkers

    subroutine read_walkers(parent, dets, CurrWalkers)

        use CalcData, only: pops_norm

        ! This is the routine that has complexity!!!
        !
        ! We read chunks of the walkers datasets on _all_ of the processors,
        ! and communicate the walkers to the correct node after each of the
        ! blocks. This is essentially a giant annihilation step.
        !
        ! We use the spawnedparts arrays as a buffer. These are split up per
        ! processor in the same way as normally used for annihilation. We
        ! keep looping over however much we can fit in this array until
        ! we are done.
        !
        ! --> This is quite complicated, but should equally be quite efficient

        integer(hid_t), intent(in) :: parent
        integer(n_int), intent(out) :: dets(:, :)
        integer(int64), intent(out) :: CurrWalkers
        character(*), parameter :: t_r = 'read_walkers'

        integer :: proc, nreceived
        integer(hid_t) :: grp_id, calc_grp_id
        integer(hdf_err) :: err
        integer(hid_t) :: ds_sgns, ds_ilut, ds_gdata
        integer(int64) :: nread_walkers
        integer :: ierr
        integer(int32) :: bit_rep_width
        integer(hsize_t) :: all_count, block_size, counts(0:nProcessors - 1)
        integer(hsize_t) :: offsets(0:nProcessors - 1)
        integer(hsize_t) :: block_start, block_end, last_part
        integer(hsize_t) :: this_block_size
        real(dp), allocatable :: pops_num_parts(:), pops_norm_sqr(:)
        real(dp) :: norm(lenof_sign), parts(lenof_sign)
        logical :: running, any_running
        integer(hsize_t), dimension(:, :), allocatable :: temp_ilut, temp_sgns
        integer(hsize_t), dimension(:, :), allocatable :: gdata_buf
        integer :: temp_ilut_tag, temp_sgns_tag, rest
        integer(int32) :: read_lenof_sign
        type(gdata_io_t) :: gdata_read_handler
        integer :: gdata_size, tmp_fvals_size, tmp_apvals_size
        logical :: t_read_gdata
        logical(hdf_log) :: t_exist_gdata

        ! TODO:
        ! - Read into a relatively small buffer. Make this such that all the
        !   particles could be broadcast to a single processor and everything
        !   would be fine (this is the only way to _ensure_ no overflows).
        ! - Loop over these read walkers, decode and determine which proc
        !   they are for
        ! - MPIAllToAllV to the correct processor
        ! - Ensure that these end up in the correct place in the main list,
        !   and that we don't overflow anywhere!
        ! - Writing and reading of flags, etc.
        !   -- For flags, include attribute info with which bit is which
        !      flag for supportability.

        call h5gopen_f(parent, nm_wfn_grp, grp_id, err)

        ! TODO: Make sure that this plays nicely with 32bit!!!
        if (.not. build_64bit) &
            call stop_all(t_r, "Needs manual, careful testing and adjustment")

        ! Get attributes about the sizes of stuff in the popsfile.
        ! ========
        ! We can only read in bit representations that are the right size!
        call read_int32_attribute(grp_id, nm_rep_width, bit_rep_width)
        if (bit_rep_width /= NIfD + 1) &
            call stop_all(t_r, "Mismatched bit representations")

        call read_int32_attribute(grp_id, nm_sgn_len, read_lenof_sign)
        ! as lenof_sign is of type int, do not force tmp_lenof_sign to be int32
        tmp_lenof_sign = int(read_lenof_sign)
        ! assign the tmp_inum_runs accordingly
#ifdef CMPLX_
        tmp_inum_runs = tmp_lenof_sign .div. 2
#else
        tmp_inum_runs = tmp_lenof_sign
#endif

        ! read in if auto-adaptive shift was used
        ! even though this is technically calcdata, we already need it
        ! here, so we read it in alongside the walkers
        ! (calcdata has to be read in after the walkers, ugh)
        call read_log_scalar(grp_id, nm_tAuto, tPopAutoAdaptiveShift, &
                             default=.false._int32, required=.false.)
        call read_log_scalar(grp_id, nm_sc_blooms, tPopScaleBlooms, &
                             default=.false._int32, required=.false.)
        ! Same goes for accum-pops and the corresponding values
        call read_log_scalar(grp_id, nm_tAP, tPopAccumPops, &
                             default=.false._int32, required=.false.)

        ! these variables are for consistency-checks
        allocate(pops_norm_sqr(tmp_lenof_sign), stat=ierr)
        allocate(pops_num_parts(tmp_lenof_sign), stat=ierr)

        call read_int64_attribute(grp_id, nm_num_dets, all_count, &
                                  required=.true.)
        call read_dp_1d_attribute(grp_id, nm_norm_sqr, pops_norm_sqr, &
                                  required=.true.)
        call read_dp_1d_attribute(grp_id, nm_num_parts, pops_num_parts, &
                                  required=.true.)

        if (lenof_sign /= tmp_lenof_sign) then
            ! currently only for real population
#ifndef CMPLX_
            ! resize the attributes
            call resize_attribute(pops_norm_sqr, lenof_sign)
            call resize_attribute(pops_num_parts, lenof_sign)
            ! notify
            write(stdout, *) "WARNING: Popsfile and input lenof_sign mismatch. Cloning replicas"
#else
            call stop_all(t_r, "Mismatched sign length")
#endif
        end if

        write(stdout, *)
        write(stdout, *) 'Reading in ', all_count, ' determinants'

        ! How many particles should each processor read in? (try and
        ! distribute this as uniformly as possible). Also calculate the
        ! associated data offsets
        counts = all_count / nProcessors
        rest = int(mod(all_count, nProcessors))
        if (rest > 0) counts(0:rest - 1) = counts(0:rest - 1) + 1
        if (sum(counts) /= all_count .or. any(counts < 0)) &
            call stop_all(t_r, "Invalid particle counts")

        offsets(0) = 0
        do proc = 1, nProcessors - 1
            offsets(proc) = offsets(proc - 1) + counts(proc - 1)
        end do

        write(stdout, *) 'Reading in ', counts(iProcIndex), &
            ' determinants on this process ...'

        ! TODO: Split this up into more manageable chunks!

        ! Open the relevant datasets
        call h5dopen_f(grp_id, nm_ilut, ds_ilut, err)
        call h5dopen_f(grp_id, nm_sgns, ds_sgns, err)

        ! size of the ms data read in
        tmp_fvals_size = 2 * tmp_inum_runs
        tmp_apvals_size = tmp_lenof_sign + 1

        ! Only activate the accumulation of populations if the popsfile has
        ! apvals, and accum-pops is specified with a proper iteration
        tAccumPopsActive = .false.
        if (tAccumPops .and. tPopAccumPops) then
            ! Read previous iteration data.
            ! It is read again in read_calc_data, but we need it now
            call h5gopen_f(parent, nm_calc_grp, calc_grp_id, err)
            call read_int64_scalar(calc_grp_id, nm_iters, PreviousCycles, &
                                   default=0_int64, required=.false.)
            call h5gclose_f(calc_grp_id, err)

            if (iAccumPopsIter <= PreviousCycles) then
                tAccumPopsActive = .true.
                call read_int64_scalar(grp_id, nm_accum_counter, iAccumPopsCounter, &
                                       default=0_int64, required=.false.)
                write(stdout, *) "Accumulated populations are found. Accumulation will continue."
            else
                write(stdout, *) "Accumulated populations are found, but will be discarded."
                write(stdout, *) "Accumulation will restart at iteration: ", iAccumPopsIter
            end if
        end if

        ! create an io handler for the data that is actually in the file
        ! (can have different lenof_sign and options than the one we will be using)
        call gdata_read_handler%init_gdata_io( &
            tPopAutoAdaptiveShift, tPopScaleBlooms, tPopAccumPops, &
            tmp_fvals_size, max_ratio_size, tmp_apvals_size)
        gdata_size = gdata_read_handler%entry_size()
        t_read_gdata = gdata_read_handler%t_io()

        if (t_read_gdata) then
            call h5lexists_f(grp_id, nm_gdata, t_exist_gdata, err)
            if (t_exist_gdata) then
                call h5dopen_f(grp_id, nm_gdata, ds_gdata, err)
            else
                ! This is for backward compatibility:
                ! the gdata group had another, earlier name.
                call h5dopen_f(grp_id, nm_gdata_old, ds_gdata, err)
            end if
        end if

        ! Check that these datasets look like we expect them to.
        call check_dataset_params(ds_ilut, nm_ilut, 8_hsize_t, H5T_INTEGER_F, &
                                  [int(bit_rep_width, hsize_t), all_count])
        call check_dataset_params(ds_sgns, nm_sgns, 8_hsize_t, H5T_FLOAT_F, &
                                  [int(tmp_lenof_sign, hsize_t), all_count])
        if (t_read_gdata) then
            if (t_exist_gdata) then
                call check_dataset_params(ds_gdata, nm_gdata, 8_hsize_t, H5T_FLOAT_F, &
                                          [int(gdata_size, hsize_t), all_count])
            else
                ! This is for backward compatibility:
                ! the gdata group had another, earlier name.
                call check_dataset_params(ds_gdata, nm_gdata_old, 8_hsize_t, H5T_FLOAT_F, &
                                          [int(gdata_size, hsize_t), all_count])
            end if
        end if

        ! limit the buffer size per MPI task to 50MB or MaxSpawned entries
        block_size = 50000000 / (bit_rep_width * lenof_sign) / sizeof(SpawnedParts(1, 1))
        block_size = min(block_size, MaxSpawned)
#if 0
        do proc = 0, nProcessors - 2
            block_size = min(block_size, &
                             InitialSpawnedSlots(proc + 1) - InitialSpawnedSlots(proc))
        end do
        block_size = min(block_size, &
                         MaxSpawned - InitialSpawnedSlots(nProcessors - 1))
#endif

        ! Initialise relevant counters
        CurrWalkers = 0
        nread_walkers = 0
        pops_norm = 0
        norm = 0
        parts = 0

        ! Note that indexing in the HDF5 library is zero-based, not one-based
        ! TODO: We need to keep looping until all the processes are done, not
        !       just the one!
        block_start = offsets(iProcIndex)
        if (iProcIndex == nProcessors - 1) then
            last_part = all_count - 1
        else
            last_part = offsets(iProcIndex + 1) - 1
        end if
        block_end = min(block_start + block_size - 1, last_part)
        this_block_size = block_end - block_start + 1
        running = .true.
        any_running = .true.

        allocate(temp_ilut(int(bit_rep_width), int(this_block_size)), stat=ierr)
        call LogMemAlloc('temp_ilut', size(temp_ilut), int(sizeof(temp_ilut(1, 1))), 'read_walkers', temp_ilut_tag, ierr)
        allocate(temp_sgns(int(tmp_lenof_sign), int(this_block_size)), stat=ierr)
        call LogMemAlloc('temp_sgns', size(temp_sgns), lenof_sign, 'read_walkers', temp_sgns_tag, ierr)
        allocate(gdata_buf(int(gdata_size), int(this_block_size)), stat=ierr)

        do while (any_running)

            ! If this is the last block, its size will differ from the biggest
            ! one allowed.
            if (running) then
                this_block_size = block_end - block_start + 1
            else
                this_block_size = 0
            end if

            ! if we resized the sign, we need to go back to the original
            ! buffer size now
            if (tmp_lenof_sign /= lenof_sign) then
                deallocate(temp_sgns)
                allocate(temp_sgns(int(tmp_lenof_sign), int(this_block_size)), stat=ierr)
                deallocate(gdata_buf)
                allocate(gdata_buf(int(gdata_size), int(this_block_size)), stat=ierr)
                call gdata_read_handler%init_gdata_io( &
                    tPopAutoAdaptiveShift, tPopScaleBlooms, tPopAccumPops, &
                    tmp_fvals_size, max_ratio_size, tmp_apvals_size)
            end if

            call read_walker_block_buff(ds_ilut, ds_sgns, ds_gdata, block_start, &
                                        this_block_size, bit_rep_width, temp_ilut, temp_sgns, &
                                        gdata_buf)

            if (tmp_lenof_sign /= lenof_sign) then
                call clone_signs(temp_sgns, tmp_lenof_sign, lenof_sign, this_block_size)
                ! resize the fvals in the same manner
                if (t_read_gdata) then
                    call gdata_read_handler%clone_gdata( &
                        gdata_buf, tmp_fvals_size, fvals_size, tmp_apvals_size, &
                        apvals_size, this_block_size)
                end if
            end if

            ! distribution and storage of the data is done with the
            ! read_handler, which has the memory layout for the current
            ! calculation
            call distribute_and_add_walkers(this_block_size, gdata_read_handler, &
                                            temp_ilut, temp_sgns, gdata_buf, &
                                            dets, nreceived, CurrWalkers, norm, parts)

            nread_walkers = nread_walkers + nreceived

            ! And update for the next block
            if (running) then
                block_start = block_end + 1
                block_end = min(block_start + block_size - 1, last_part)
                if (block_start > last_part) running = .false.
            end if

            ! Test if _all_ of the processes have finished. If they have
            ! not, then loop again.
            call MPIAllLORLogical(running, any_running)
        end do

        deallocate(gdata_buf)
        deallocate(temp_ilut, temp_sgns)
        call LogMemDeAlloc('read_walkers', temp_ilut_tag)
        call LogMemDeAlloc('read_walkers', temp_sgns_tag)

        if (t_read_gdata) call h5dclose_f(ds_gdata, err)
        call h5dclose_f(ds_sgns, err)
        call h5dclose_f(ds_ilut, err)
        call h5gclose_f(grp_id, err)
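        ! Sizing recap for the block loop above (illustrative numbers): with
        ! bit_rep_width = 2, lenof_sign = 1 and 8-byte elements, the 50 MB cap
        ! gives block_size ~= 50e6 / (2 * 1) / 8 ~= 3.1e6 determinants per
        ! read, further limited by MaxSpawned.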
        ! Do some checking
        call check_read_particles(nread_walkers, norm, parts, all_count, &
                                  pops_num_parts, pops_norm_sqr)

        deallocate(pops_num_parts)
        deallocate(pops_norm_sqr)

        write(stdout, *) "... done"
        write(stdout, *)

    end subroutine read_walkers

    subroutine read_walker_block_buff(ds_ilut, ds_sgns, ds_gdata, block_start, block_size, &
                                      bit_rep_width, temp_ilut, temp_sgns, gdata_buf)

        ! Read a block of walkers into the temporary buffers temp_ilut,
        ! temp_sgns and gdata_buf.
        !
        ! N.B. This routine is quite sensitive to the particular structure
        !      of the bit representations determined in BitReps.F90
        !
        ! --> It would also be possible to read into scratch arrays, and then
        !     do some transferring.

        integer(hid_t), intent(in) :: ds_ilut, ds_sgns, ds_gdata
        integer(hsize_t), intent(in) :: block_start, block_size
        integer(int32), intent(in) :: bit_rep_width
        integer(hsize_t), dimension(:, :) :: temp_ilut, temp_sgns
        integer(hsize_t), dimension(:, :) :: gdata_buf
        integer :: gdata_size

#ifdef INT64_
        call read_2d_multi_chunk( &
            ds_ilut, temp_ilut, h5kind_to_type(int64, H5_INTEGER_KIND), &
            [int(bit_rep_width, hsize_t), block_size], &
            [0_hsize_t, block_start], &
            [0_hsize_t, 0_hsize_t])

        call read_2d_multi_chunk( &
            ds_sgns, temp_sgns, h5kind_to_type(dp, H5_REAL_KIND), &
            [int(tmp_lenof_sign, hsize_t), block_size], &
            [0_hsize_t, block_start], &
            [0_hsize_t, 0_hsize_t])

        gdata_size = size(gdata_buf, dim=1)
        if (gdata_size > 0) then
            call read_2d_multi_chunk( &
                ds_gdata, gdata_buf, h5kind_to_type(dp, H5_REAL_KIND), &
                [int(gdata_size, hsize_t), block_size], &
                [0_hsize_t, block_start], &
                [0_hsize_t, 0_hsize_t])
        end if
#else
        call stop_all("read_walker_block", "32-64bit conversion not yet implemented")
#endif
        ! TODO: Flags here!!!

    end subroutine read_walker_block_buff

    subroutine distribute_and_add_walkers(block_size, gdata_read_handler, &
                                          temp_ilut, temp_sgns, gdata_buf, &
                                          dets, nreceived, CurrWalkers, norm, parts)

        use FciMCData, only: MaxSpawned

        implicit none

        integer(n_int), intent(out) :: dets(:, :)
        integer(int64), intent(inout) :: CurrWalkers
        integer(hsize_t) :: block_size
        type(gdata_io_t), intent(in) :: gdata_read_handler
        integer :: nreceived
        real(dp), intent(inout) :: norm(lenof_sign), parts(lenof_sign)
        integer(hsize_t) :: temp_ilut(:, :), temp_sgns(:, :), gdata_buf(:, :)

        integer(MPIArg) :: sendcount(0:nProcessors - 1)
        integer :: nlocal
        integer :: gdata_size
        integer :: ierr
        integer(hsize_t), allocatable :: gdata_comm(:, :), gdata_loc(:, :)
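        ! Flow of this routine: walkers already residing on this processor
        ! bypass MPI entirely (the localfirst branch below) and are added
        ! directly from SpawnedParts2; only the remainder is exchanged via
        ! communicate_read_walkers_buff.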
        ! n.b. set nlocal at run time: an initialiser in the declaration
        ! would imply the SAVE attribute.
        nlocal = 0

        ! allocate the buffers for the gdata
        gdata_size = size(gdata_buf, dim=1)
        allocate(gdata_comm(gdata_size, MaxSpawned), stat=ierr)
        allocate(gdata_loc(gdata_size, MaxSpawned), stat=ierr)
        call assign_dets_to_procs_buff(block_size, temp_ilut, temp_sgns, &
                                       gdata_buf, gdata_loc, gdata_comm, sendcount)

        ! add elements that are on the right processor already
#define localfirst
!#undef localfirst
#ifdef localfirst
        nlocal = sendcount(iProcIndex)
        call add_new_parts(dets, nlocal, CurrWalkers, norm, parts, gdata_loc, gdata_read_handler)
        sendcount(iProcIndex) = 0
#endif
        ! communicate the remaining elements
        nreceived = communicate_read_walkers_buff(sendcount, gdata_comm, gdata_loc)
        call add_new_parts(dets, nreceived, CurrWalkers, norm, parts, gdata_loc, gdata_read_handler)
        if (allocated(receivebuff)) then
            deallocate(receivebuff)
            call LogMemDeAlloc('distribute_and_add', receivebuff_tag)
        end if
        nreceived = nreceived + nlocal

        if (allocated(gdata_loc)) deallocate(gdata_loc)
        if (allocated(gdata_comm)) deallocate(gdata_comm)

    end subroutine distribute_and_add_walkers

    subroutine assign_dets_to_procs_buff(block_size, temp_ilut, temp_sgns, &
                                         gdata_buf, gdata_loc, gdata_comm, sendcount)

        use load_balance_calcnodes, only: DetermineDetNode
        use FciMCData, only: SpawnedParts2, SpawnedParts
        use DeterminantData, only: write_det
        use SystemData, only: nel

        integer(hsize_t), intent(in) :: block_size
        integer(hsize_t), dimension(:, :) :: temp_ilut, temp_sgns, gdata_buf, gdata_comm
        integer(hsize_t), dimension(:, :) :: gdata_loc
        integer(hsize_t) :: onepart(0:IlutBits%len_bcast)
        integer :: det(nel), p, j, proc, sizeilut, targetproc(block_size)
        integer(MPIArg) :: sendcount(0:nProcessors - 1)
        integer :: index, index2
        logical :: t_read_gdata

        sizeilut = size(temp_ilut, 1)
        t_read_gdata = size(gdata_buf, dim=1) > 0

        ! Iterate through walkers in temp_ilut + temp_sgns and determine the
        ! target processor.
        onepart = 0
        sendcount = 0
        do j = 1, int(block_size)
            onepart(0:sizeilut - 1) = temp_ilut(:, j)
            onepart(sizeilut:sizeilut + int(lenof_sign) - 1) = temp_sgns(:, j)
            ! Which processor does this determinant live on?
            call decode_bit_det(det, onepart)
            proc = DetermineDetNode(nel, det, 0)
            targetproc(j) = proc
            sendcount(proc) = sendcount(proc) + 1
        end do

        ! Write the elements to SpawnedParts in the correct order for sending
        index = 1
        index2 = 1
        do p = 0, nProcessors - 1
#ifdef localfirst
            if (p == iProcIndex) then
#else
            if (.false.) then
#endif
                ! elements that don't have to be communicated are written to
                ! SpawnedParts2
                do j = 1, int(block_size)
                    if (targetproc(j) == p) then
                        onepart(0:sizeilut - 1) = temp_ilut(:, j)
                        onepart(sizeilut:sizeilut + int(lenof_sign) - 1) = temp_sgns(:, j)
                        SpawnedParts2(:, index2) = onepart
                        if (t_read_gdata) &
                            gdata_loc(:, index2) = gdata_buf(:, j)
                        index2 = index2 + 1
                    end if
                end do
            else
                ! elements that have to be sent to other procs are written to
                ! SpawnedParts
                do j = 1, int(block_size)
                    if (targetproc(j) == p) then
                        onepart(0:sizeilut - 1) = temp_ilut(:, j)
                        onepart(sizeilut:sizeilut + int(lenof_sign) - 1) = temp_sgns(:, j)
                        SpawnedParts(:, index) = onepart
                        if (t_read_gdata) &
                            gdata_comm(:, index) = gdata_buf(:, j)
                        index = index + 1
                    end if
                end do
            end if
        end do

    end subroutine assign_dets_to_procs_buff

    function communicate_read_walkers_buff(sendcounts, gdata_comm, &
                                           gdata_loc) result(num_received)

        integer(MPIArg), intent(in) :: sendcounts(0:nProcessors - 1)
        integer(hsize_t), intent(inout) :: gdata_comm(:, :)
        integer(hsize_t), allocatable, intent(inout) :: gdata_loc(:, :)
        integer :: num_received

        integer(int64) :: lnum_received
        integer(MPIArg) :: recvcounts(0:nProcessors - 1), recvcountsScaled(0:nProcessors - 1)
        integer(MPIArg) :: disps(0:nProcessors - 1), recvdisps(0:nProcessors - 1)
        integer(MPIArg) :: dispsScaled(0:nProcessors - 1), recvdispsScaled(0:nProcessors - 1)
        integer(MPIArg) :: sendcountsScaled(0:nProcessors - 1)
        integer :: j, ierr, gdata_size

        ! offsets for data to the different procs
        disps(0) = 0
        do j = 1, nProcessors - 1
            disps(j) = disps(j - 1) + sendcounts(j - 1)
        end do

        ! Communicate the number of particles that need to go to each proc
        call MPIAllToAll(sendcounts, 1, recvcounts, 1, ierr)

        ! We want the data to be contiguous after the move. So calculate the
        ! offsets
        recvdisps(0) = 0
        do j = 1, nProcessors - 1
            recvdisps(j) = recvdisps(j - 1) + recvcounts(j - 1)
        end do
        num_received = recvdisps(nProcessors - 1) + recvcounts(nProcessors - 1)
        lnum_received = recvdisps(nProcessors - 1) + recvcounts(nProcessors - 1)
        gdata_size = size(gdata_loc, 1)

        ! Adjust offsets so that they match the size of the array
        call scaleCounts(size(SpawnedParts, 1))

        if (num_received > size(SpawnedParts2, 2)) then
            ! there could in principle be a memory problem because we are not
            ! limiting the size of receivebuff.
            write(stdout, *) 'Allocating additional buffer for communication on Processor ', iProcIndex, 'with ', &
                num_received * size(SpawnedParts, 1) * sizeof(SpawnedParts(1, 1)) / 1000000, 'MB'
            allocate(receivebuff(size(SpawnedParts, 1), num_received))
            call LogMemAlloc('receivebuff', size(receivebuff), int(sizeof(receivebuff(1, 1))), &
                             'communicate_read_walkers', receivebuff_tag, ierr)
            call MPIAllToAllV(SpawnedParts, sendcountsScaled, dispsScaled, receivebuff, &
                              recvcountsScaled, recvdispsScaled, ierr)
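            ! n.b. the counts and displacements used above were scaled by the
            ! row size via scaleCounts, since MPIAllToAllV counts individual
            ! elements rather than columns: sending 5 dets with
            ! size(SpawnedParts, 1) = 10 means a send count of 50
            ! (illustrative numbers).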
            ! gdata communication for auto-adaptive shift mode
            if (gdata_size > 0) then
                if (allocated(gdata_loc)) deallocate(gdata_loc)
                allocate(gdata_loc(gdata_size, num_received))
            end if
        else
            call MPIAllToAllV(SpawnedParts, sendcountsScaled, dispsScaled, SpawnedParts2, &
                              recvcountsScaled, recvdispsScaled, ierr)
        end if
        call scaleCounts(gdata_size)
        if (gdata_size > 0) then
            call MPIAllToAllV(gdata_comm, sendcountsScaled, dispsScaled, gdata_loc, &
                              recvcountsScaled, recvdispsScaled, ierr)
        end if

    contains

        subroutine scaleCounts(argSize)
            implicit none
            integer, intent(in) :: argSize

            recvcountsScaled = recvcounts * argSize
            recvdispsScaled = recvdisps * argSize
            sendcountsScaled = sendcounts * argSize
            dispsScaled = disps * argSize
        end subroutine scaleCounts

    end function communicate_read_walkers_buff

    subroutine add_new_parts(dets, nreceived, CurrWalkers, norm, parts, &
                             gdata_write, gdata_read_handler)

        use CalcData, only: iWeightPopRead

        ! Integrate the just-read block of walkers into the main list.

        integer(n_int), intent(inout) :: dets(:, :)
        integer(hsize_t), intent(in) :: gdata_write(:, :)
        integer, intent(in) :: nreceived
        integer(int64), intent(inout) :: CurrWalkers
        real(dp), intent(inout) :: norm(lenof_sign), parts(lenof_sign)
        type(gdata_io_t), intent(in) :: gdata_read_handler
        integer(int64) :: j
        real(dp) :: sgn(lenof_sign)

        ! TODO: inum_runs == 2, PopNIfSgn == 1
        if (allocated(receivebuff)) then
            do j = 1, nreceived
                ! Check that the site is occupied, and passes the relevant
                ! thresholds, before adding it to the system.
                ! However, when reading accumulated populations (APVals), we
                ! want to add even unoccupied sites that have accumulated
                ! values
                call extract_sign(receivebuff(:, j), sgn)
                if ((any(abs(sgn) >= iWeightPopRead) .and. .not. IsUnoccDet(sgn)) .or. &
                    tAccumPopsActive) then
                    ! Add this site to the main list
                    CurrWalkers = CurrWalkers + 1
                    dets(:, CurrWalkers) = receivebuff(:, j)
                    call add_pops_norm_contrib(dets(:, CurrWalkers))
                    norm = norm + sgn**2
                    parts = parts + abs(sgn)
                    call gdata_read_handler%read_gdata_hdf5(gdata_write(:, j), int(CurrWalkers))
                end if
            end do
        else
            do j = 1, nreceived
                ! Check that the site is occupied, and passes the relevant
                ! thresholds, before adding it to the system.
                ! However, when reading accumulated populations (APVals), we
                ! want to add even unoccupied sites that have accumulated
                ! values
                call extract_sign(SpawnedParts2(:, j), sgn)
                if ((any(abs(sgn) >= iWeightPopRead) .and. .not. IsUnoccDet(sgn)) .or. &
                    tAccumPopsActive) then
                    ! Add this site to the main list
                    CurrWalkers = CurrWalkers + 1
                    dets(:, CurrWalkers) = SpawnedParts2(:, j)
                    call add_pops_norm_contrib(dets(:, CurrWalkers))
                    norm = norm + sgn**2
                    parts = parts + abs(sgn)
                    call gdata_read_handler%read_gdata_hdf5(gdata_write(:, j), int(CurrWalkers))
                end if
            end do
        end if

        ! TODO: Add check that we have read in the correct number of parts

    end subroutine

    subroutine check_read_particles(nread_walkers, norm, parts, &
                                    pops_det_count, pops_num_parts, &
                                    pops_norm_sqr)

        ! Check that the values received in these routines are valid
        !
        ! nread_walkers  - Number of determinant/walker lines read (this proc)
        ! norm           - Norm (this proc)
        ! parts          - Coefficient weight (this proc)
        ! pops_det_count - Total dets in popsfile (from header)
        ! pops_num_parts - Total particle weight (from header)
        ! pops_norm_sqr  - Total norm of wavefunction (from header)

        integer(int64), intent(in) :: nread_walkers, pops_det_count
        real(dp), intent(in) :: pops_num_parts(lenof_sign)
        real(dp), intent(in) :: pops_norm_sqr(lenof_sign)
        real(dp), intent(in) :: norm(lenof_sign), parts(lenof_sign)
        character(*), parameter :: t_r = 'check_read_particles'

        integer(int64) :: all_read_walkers
        real(dp) :: all_norm(lenof_sign), all_parts(lenof_sign)

        ! Have all the sites been correctly read in from the file?
        ! n.b. CurrWalkers may not equal pops_det_count, as any unoccupied
        !      sites, or sites below the specified threshold, will have been
        !      dropped.
        call MPISum(nread_walkers, all_read_walkers)
        if (iProcIndex == 0 .and. all_read_walkers /= pops_det_count) then
            write(stdout, *) 'Determinants in popsfile header: ', pops_det_count
            write(stdout, *) 'Determinants read in: ', all_read_walkers
            call stop_all(t_r, 'Particle number mismatch')
        end if

        ! Is the total number of walkers (sum of the sign values) correct?
        !
        ! This is relative to the total number, to account for relative errors
        call MPISumAll(parts, all_parts)
        if (any(abs(all_parts - pops_num_parts) > (pops_num_parts * 1.0e-10_dp))) then
            write(stdout, *) 'popsfile particles: ', pops_num_parts
            write(stdout, *) 'read particles: ', all_parts
            call stop_all(t_r, 'Incorrect particle weight read from popsfile')
        end if

        ! Is the total norm of the wavefunction correct?
        ! The test condition is rather lax, as the hugely differing magnitudes
        ! of the total numbers on each processor, combined with the varying
        ! summation orders, can lead to larger errors here
        !
        ! Same relative error test as before
        call MPISumAll(norm, all_norm)
        if (any(abs(all_norm - pops_norm_sqr) > (pops_norm_sqr * 1.0e-10_dp))) then
            write(stdout, *) 'popsfile norm**2: ', pops_norm_sqr
            write(stdout, *) 'read norm**2: ', all_norm
            call stop_all(t_r, 'Wavefunction norm incorrect')
        end if

        ! If the absolute sum, and the sum of the squares, is correct, we can
        ! be fairly confident that they have all been read in!...

        ! [W.D.]
        ! on behalf of sasha: bring back the feature that turns off the walker
        ! growth, even if walkcontgrow was set unintentionally, when the
        ! number of read-in walkers already exceeds or is close to the target
        ! number of walkers.
        ! so i guess it is enough to set the global AllTotParts here
        AllTotParts = all_parts

    end subroutine

#endif

    !
    ! This is only here for dependency circuit breaking
    subroutine add_pops_norm_contrib(ilut)

        use CalcData, only: pops_norm

        integer(n_int), intent(in) :: ilut(0:NIfTot)
        real(dp) :: real_sign(lenof_sign)

        call extract_sign(ilut, real_sign)
#ifdef DOUBLERUN_
        pops_norm = pops_norm + real_sign(1) * real_sign(2)
#elif defined(CMPLX_)
        pops_norm = pops_norm + real_sign(1)**2 + real_sign(2)**2
#else
        pops_norm = pops_norm + real_sign(1) * real_sign(1)
#endif

    end subroutine add_pops_norm_contrib

end module