Allow to restart job with more CPUs than the restart files.
- now, if we want to restart a job and use more processors than we used while creating restart files, all processors read and recreate meta block structure, but only N previously used processors will read data block; as soon as the job is restarted, we call redistribute_blocks() in order to get equal number of data blocks on each processor;
This commit is contained in:
parent
75c01885a3
commit
81ba4935d2
@ -40,7 +40,8 @@ program amun
|
||||
#endif /* FORCE */
|
||||
use integrals, only : init_integrals, clear_integrals, store_integrals
|
||||
use io , only : nfile, write_data, write_restart_data, restart_job
|
||||
use mesh , only : init_mesh, generate_mesh, store_mesh_stats, clear_mesh
|
||||
use mesh , only : init_mesh, generate_mesh, store_mesh_stats, clear_mesh &
|
||||
, redistribute_blocks
|
||||
use mpitools , only : ncpu, ncpus, init_mpi, clear_mpi, is_master, mfindmaxi
|
||||
use random , only : init_generator
|
||||
use timer , only : init_timers, start_timer, stop_timer, get_timer &
|
||||
@ -194,6 +195,11 @@ program amun
|
||||
!
|
||||
call restart_job()
|
||||
|
||||
! redistribute blocks between processors in case the number of processors has
|
||||
! changed
|
||||
!
|
||||
call redistribute_blocks()
|
||||
|
||||
! find new time step
|
||||
!
|
||||
call find_new_timestep()
|
||||
|
44
src/io.F90
44
src/io.F90
@ -344,7 +344,7 @@ module io
|
||||
use hdf5 , only : H5F_ACC_RDONLY_F
|
||||
use hdf5 , only : h5open_f, h5close_f, h5fis_hdf5_f, h5fopen_f &
|
||||
, h5fclose_f
|
||||
use mpitools, only : ncpu
|
||||
use mpitools, only : ncpus, ncpu
|
||||
|
||||
! declare variables
|
||||
!
|
||||
@ -354,7 +354,7 @@ module io
|
||||
!
|
||||
character(len=64) :: fl
|
||||
integer(hid_t) :: fid
|
||||
integer :: err
|
||||
integer :: err, lcpus, lcpu
|
||||
logical :: info
|
||||
!
|
||||
!-------------------------------------------------------------------------------
|
||||
@ -367,9 +367,30 @@ module io
|
||||
!
|
||||
if (err .ge. 0) then
|
||||
|
||||
! find the number of processors counting all restart files
|
||||
!
|
||||
lcpus = 0
|
||||
info = .true.
|
||||
do while(info)
|
||||
lcpus = lcpus + 1
|
||||
write (fl,'("r",i6.6,"_",i5.5,a3)') nrest, lcpus, '.h5'
|
||||
inquire(file = fl, exist = info)
|
||||
end do
|
||||
|
||||
! if the number of processors is larger then the number of files, use the last
|
||||
! file for the remaining processors
|
||||
!
|
||||
lcpu = ncpu
|
||||
if (lcpus .lt. ncpus) then
|
||||
lcpu = min(lcpus - 1, ncpu)
|
||||
end if
|
||||
if (lcpus .gt. ncpus) then
|
||||
call print_error("io::read_data_h5", "This is not supported yet!")
|
||||
end if
|
||||
|
||||
! prepare the filename
|
||||
!
|
||||
write (fl,'("r",i6.6,"_",i5.5,a3)') nrest, ncpu, '.h5'
|
||||
write (fl,'("r",i6.6,"_",i5.5,a3)') nrest, lcpu, '.h5'
|
||||
|
||||
! check if the HDF5 file exists
|
||||
!
|
||||
@ -403,7 +424,7 @@ module io
|
||||
|
||||
! read data blocks
|
||||
!
|
||||
call read_datablocks_h5(fid)
|
||||
if (lcpu .eq. ncpu) call read_datablocks_h5(fid)
|
||||
|
||||
! deallocate the array of block pointers
|
||||
!
|
||||
@ -674,7 +695,7 @@ module io
|
||||
integer(kind=4) :: dm(3)
|
||||
integer :: err, i, l
|
||||
integer :: nattrs, lndims, llast_id, lmblocks, ldblocks &
|
||||
, lnleafs, lncells, lnghost, lnseeds, lmaxlev
|
||||
, lnleafs, lncells, lnghost, lnseeds, lmaxlev, lncpu
|
||||
|
||||
! local pointers
|
||||
!
|
||||
@ -740,6 +761,8 @@ module io
|
||||
else
|
||||
fcor = 2**(maxlev - lmaxlev)
|
||||
end if
|
||||
case('ncpu')
|
||||
call read_attribute_integer_h5(aid, aname, lncpu)
|
||||
case('last_id')
|
||||
call read_attribute_integer_h5(aid, aname, llast_id)
|
||||
case('mblocks')
|
||||
@ -856,9 +879,13 @@ module io
|
||||
|
||||
! allocate all datablocks
|
||||
!
|
||||
do l = 1, ldblocks
|
||||
call append_datablock(pdata)
|
||||
end do
|
||||
if (lncpu .eq. ncpu) then
|
||||
do l = 1, ldblocks
|
||||
call append_datablock(pdata)
|
||||
end do
|
||||
else
|
||||
ldblocks = 0
|
||||
end if
|
||||
|
||||
! check if the number of created datablocks is equal to the ldblocks
|
||||
!
|
||||
@ -1838,7 +1865,6 @@ module io
|
||||
|
||||
if (lea(l) .eq. 1) call metablock_set_leaf(pmeta)
|
||||
|
||||
|
||||
l = l + 1
|
||||
pmeta => pmeta%next
|
||||
end do
|
||||
|
Loading…
x
Reference in New Issue
Block a user