Allow restarting a job with more CPUs than there are restart files.
- Now, if we want to restart a job using more processors than were used when the restart files were created, all processors read and recreate the meta block structure, but only the N previously used processors read the data blocks; as soon as the job is restarted, we call redistribute_blocks() in order to get an equal number of data blocks on each processor.
This commit is contained in:
parent
75c01885a3
commit
81ba4935d2
@ -40,7 +40,8 @@ program amun
|
|||||||
#endif /* FORCE */
|
#endif /* FORCE */
|
||||||
use integrals, only : init_integrals, clear_integrals, store_integrals
|
use integrals, only : init_integrals, clear_integrals, store_integrals
|
||||||
use io , only : nfile, write_data, write_restart_data, restart_job
|
use io , only : nfile, write_data, write_restart_data, restart_job
|
||||||
use mesh , only : init_mesh, generate_mesh, store_mesh_stats, clear_mesh
|
use mesh , only : init_mesh, generate_mesh, store_mesh_stats, clear_mesh &
|
||||||
|
, redistribute_blocks
|
||||||
use mpitools , only : ncpu, ncpus, init_mpi, clear_mpi, is_master, mfindmaxi
|
use mpitools , only : ncpu, ncpus, init_mpi, clear_mpi, is_master, mfindmaxi
|
||||||
use random , only : init_generator
|
use random , only : init_generator
|
||||||
use timer , only : init_timers, start_timer, stop_timer, get_timer &
|
use timer , only : init_timers, start_timer, stop_timer, get_timer &
|
||||||
@ -194,6 +195,11 @@ program amun
|
|||||||
!
|
!
|
||||||
call restart_job()
|
call restart_job()
|
||||||
|
|
||||||
|
! redistribute blocks between processors in case the number of processors has
|
||||||
|
! changed
|
||||||
|
!
|
||||||
|
call redistribute_blocks()
|
||||||
|
|
||||||
! find new time step
|
! find new time step
|
||||||
!
|
!
|
||||||
call find_new_timestep()
|
call find_new_timestep()
|
||||||
|
44
src/io.F90
44
src/io.F90
@ -344,7 +344,7 @@ module io
|
|||||||
use hdf5 , only : H5F_ACC_RDONLY_F
|
use hdf5 , only : H5F_ACC_RDONLY_F
|
||||||
use hdf5 , only : h5open_f, h5close_f, h5fis_hdf5_f, h5fopen_f &
|
use hdf5 , only : h5open_f, h5close_f, h5fis_hdf5_f, h5fopen_f &
|
||||||
, h5fclose_f
|
, h5fclose_f
|
||||||
use mpitools, only : ncpu
|
use mpitools, only : ncpus, ncpu
|
||||||
|
|
||||||
! declare variables
|
! declare variables
|
||||||
!
|
!
|
||||||
@ -354,7 +354,7 @@ module io
|
|||||||
!
|
!
|
||||||
character(len=64) :: fl
|
character(len=64) :: fl
|
||||||
integer(hid_t) :: fid
|
integer(hid_t) :: fid
|
||||||
integer :: err
|
integer :: err, lcpus, lcpu
|
||||||
logical :: info
|
logical :: info
|
||||||
!
|
!
|
||||||
!-------------------------------------------------------------------------------
|
!-------------------------------------------------------------------------------
|
||||||
@ -367,9 +367,30 @@ module io
|
|||||||
!
|
!
|
||||||
if (err .ge. 0) then
|
if (err .ge. 0) then
|
||||||
|
|
||||||
|
! find the number of processors counting all restart files
|
||||||
|
!
|
||||||
|
lcpus = 0
|
||||||
|
info = .true.
|
||||||
|
do while(info)
|
||||||
|
lcpus = lcpus + 1
|
||||||
|
write (fl,'("r",i6.6,"_",i5.5,a3)') nrest, lcpus, '.h5'
|
||||||
|
inquire(file = fl, exist = info)
|
||||||
|
end do
|
||||||
|
|
||||||
|
! if the number of processors is larger than the number of files, use the last
|
||||||
|
! file for the remaining processors
|
||||||
|
!
|
||||||
|
lcpu = ncpu
|
||||||
|
if (lcpus .lt. ncpus) then
|
||||||
|
lcpu = min(lcpus - 1, ncpu)
|
||||||
|
end if
|
||||||
|
if (lcpus .gt. ncpus) then
|
||||||
|
call print_error("io::read_data_h5", "This is not supported yet!")
|
||||||
|
end if
|
||||||
|
|
||||||
! prepare the filename
|
! prepare the filename
|
||||||
!
|
!
|
||||||
write (fl,'("r",i6.6,"_",i5.5,a3)') nrest, ncpu, '.h5'
|
write (fl,'("r",i6.6,"_",i5.5,a3)') nrest, lcpu, '.h5'
|
||||||
|
|
||||||
! check if the HDF5 file exists
|
! check if the HDF5 file exists
|
||||||
!
|
!
|
||||||
@ -403,7 +424,7 @@ module io
|
|||||||
|
|
||||||
! read data blocks
|
! read data blocks
|
||||||
!
|
!
|
||||||
call read_datablocks_h5(fid)
|
if (lcpu .eq. ncpu) call read_datablocks_h5(fid)
|
||||||
|
|
||||||
! deallocate the array of block pointers
|
! deallocate the array of block pointers
|
||||||
!
|
!
|
||||||
@ -674,7 +695,7 @@ module io
|
|||||||
integer(kind=4) :: dm(3)
|
integer(kind=4) :: dm(3)
|
||||||
integer :: err, i, l
|
integer :: err, i, l
|
||||||
integer :: nattrs, lndims, llast_id, lmblocks, ldblocks &
|
integer :: nattrs, lndims, llast_id, lmblocks, ldblocks &
|
||||||
, lnleafs, lncells, lnghost, lnseeds, lmaxlev
|
, lnleafs, lncells, lnghost, lnseeds, lmaxlev, lncpu
|
||||||
|
|
||||||
! local pointers
|
! local pointers
|
||||||
!
|
!
|
||||||
@ -740,6 +761,8 @@ module io
|
|||||||
else
|
else
|
||||||
fcor = 2**(maxlev - lmaxlev)
|
fcor = 2**(maxlev - lmaxlev)
|
||||||
end if
|
end if
|
||||||
|
case('ncpu')
|
||||||
|
call read_attribute_integer_h5(aid, aname, lncpu)
|
||||||
case('last_id')
|
case('last_id')
|
||||||
call read_attribute_integer_h5(aid, aname, llast_id)
|
call read_attribute_integer_h5(aid, aname, llast_id)
|
||||||
case('mblocks')
|
case('mblocks')
|
||||||
@ -856,9 +879,13 @@ module io
|
|||||||
|
|
||||||
! allocate all datablocks
|
! allocate all datablocks
|
||||||
!
|
!
|
||||||
do l = 1, ldblocks
|
if (lncpu .eq. ncpu) then
|
||||||
call append_datablock(pdata)
|
do l = 1, ldblocks
|
||||||
end do
|
call append_datablock(pdata)
|
||||||
|
end do
|
||||||
|
else
|
||||||
|
ldblocks = 0
|
||||||
|
end if
|
||||||
|
|
||||||
! check if the number of created datablocks is equal to the ldblocks
|
! check if the number of created datablocks is equal to the ldblocks
|
||||||
!
|
!
|
||||||
@ -1838,7 +1865,6 @@ module io
|
|||||||
|
|
||||||
if (lea(l) .eq. 1) call metablock_set_leaf(pmeta)
|
if (lea(l) .eq. 1) call metablock_set_leaf(pmeta)
|
||||||
|
|
||||||
|
|
||||||
l = l + 1
|
l = l + 1
|
||||||
pmeta => pmeta%next
|
pmeta => pmeta%next
|
||||||
end do
|
end do
|
||||||
|
Loading…
x
Reference in New Issue
Block a user