Below is the same code written in Fortran-90.
The wall-clock time is measured using the intrinsic subroutine
system_clock. As I have already remarked, the number of
ticks returned by this subroutine is reset to 0 every midnight. In order to avoid a catastrophe at midnight, we save the clock
value returned by the first call to system_clock in clock0. On all consecutive calls we allways check if the returned value is
less than clock0, which it will be if the clock has reset in the meantime. If we observe such an event, we add clock_max to
clock and use the result in our computations. Assuming that your job will not block the queue for more than 24 hours that
should work just fine, otherwise additional day counters would have to be included in the logic of the program.
This is basically the only difference between our C and our Fortran-90 versions of the program.
I have made a more extensive use of the cpp preprocessor in this code. All major constants have been defined using the
#ifndef .. #endif clauses at the beginning of the listing. This way their values can be altered from the command line,
using the -D switch, while generating the Fortran-90 code with
gcc -E -P -C.
The logic of the do loop differs slightly from the logic of the
do loop in C, because the while condition is tested at the
beginning of the loop. However, the default values of
quit_time and finished are such that the loop will be always
executed at least once. So, in effect, things should work here exactly as in our C example. The initialisation of quit_time to 1
also ensures that if timing has not been requested by the user, the job will continue running, until finished. The default
values of timing, and finished have the same effect as in the C example.
#ifndef STDOUT
# define STDOUT 6
#endif
#ifndef RESTART_FILE
# define RESTART_FILE 21
#endif
#ifndef LAST_N
# define LAST_N 30
#endif
#ifndef SAFETY_MARGIN
# define SAFETY_MARGIN 10
#endif
#ifndef SHORT_STRING_LEN
# define SHORT_STRING_LEN 64
#endif
#ifndef LONG_STRING_LEN
# define LONG_STRING_LEN 512
#endif
program rts
! R)estore T)ime S)ave
#ifdef XLF
use xlfutility
#endif
character (len=SHORT_STRING_LEN) restart, restart_name, old_restart_name
character (len=LONG_STRING_LEN) command
integer n, restart_file, status
parameter (restart_file = RESTART_FILE)
! Variables for timing
integer t0, t1, t2, loop_time, time_left, time_limit, quit_time, &
count0, count, count_rate, count_max, safety_margin
character (len=SHORT_STRING_LEN) time_limit_string
logical timing
data quit_time /1/, timing /.true./, safety_margin /SAFETY_MARGIN/
! Variables for finishing the task
integer last_n
logical finished
data finished /.false./, last_n /LAST_N/
! Look up the clock at the beginning of the run
call system_clock (count0, count_rate, count_max)
t0 = count0 / count_rate
! Check how much time we have for this job
call getenv ('RSAVE_TIME_LIMIT', time_limit_string)
if (len_trim (time_limit_string) .eq. 0) then
write (STDOUT, '(1x, 1a)') 'Unlimited time for this job.'
timing = .false.
else
read (time_limit_string, '(1i7)', iostat=status, err=130, end=130) &
time_limit
write (STDOUT, '(1x, 1a, 1i7, 1a)') 'Time for this job limited to ', &
time_limit, ' seconds'
end if
! Is this a continued job or a new one?
call getenv ('RSAVE_RESTART', restart)
if (len_trim(restart) .eq. 0) then
write (STDOUT, '(1x, 1a)') 'Starting a new run'
n = 0
else
call getenv ('RSAVE_CHECKFILE', restart_name)
if (len_trim(restart_name) .eq. 0) then
write (STDOUT, '(1x, 1a)') &
'Error: no checkpoint file for the restart job'
stop 1
else
write (STDOUT, '(1x, 2a)') 'Restarting the job from ', restart_name
open (unit=restart_file, iostat=status, err=100, file=restart_name, &
status='old', action='read')
read (restart_file, '(1i7)', iostat=status, err=110, end=110) n
close (restart_file)
end if
end if
! This is our computation part
write (STDOUT, '(1x, 1a, 1i7)') 'n = ', n
write (STDOUT, '(9x, 1a)') 'computing ... '
#ifdef XLF
call flush_ (STDOUT)
#endif
do while ((quit_time .gt. 0) .and. (.not. finished))
if (timing) then
call system_clock(count=count)
if (count .lt. count0) count = count + count_max
t1 = count / count_rate
end if
#ifdef XLF
call sleep_ (5)
#endif
n = n + 1
! Check if the whole simulation has been finished:
! this is our ``convergence'' criterion
if (n > last_n) finished = .true.
! Check if we still have enough time for the next loop
if (timing) then
call system_clock(count=count)
if (count .lt. count0) count = count + count_max
t2 = count / count_rate
loop_time = t2 - t1
time_left = time_limit - (t2 - t0)
quit_time = time_left - loop_time - safety_margin
write (STDOUT, '(16x, 1a, 1i7, 1a, 1i7, 1a)') &
'n = ', n, ' time left = ', time_left, ' seconds'
if ((quit_time .le. 0) .and. (.not. finished)) &
write (STDOUT, '(16x, 1a)') 'Run out of time, exiting ... '
end if
end do
write (STDOUT, '(9x, 1a)') 'done.'
write (STDOUT, '(1x, 1a, 1i7)') 'n = ', n
! And now we save the result on a new checkpoint file, saving
! the old one under a new name if need be.
call getenv ('RSAVE_CHECKFILE', restart_name)
if (len_trim(restart_name) .eq. 0) then
write (STDOUT, '(1x, 1a)') 'Checkpointing not requested, exiting ... '
stop 0
else
if (.not. (len_trim(restart) .eq. 0)) then
old_restart_name = restart_name (1:len_trim(restart_name)) // '.old'
write (STDOUT, '(1x, 2a)') 'Renaming the old restart file to ', &
old_restart_name
command = 'mv' // ' ' // restart_name // ' ' // old_restart_name
call system (command)
end if
write (STDOUT, '(1x, 2a)') 'Saving data on ', restart_name
open (unit=restart_file, iostat=status, err=120, file=restart_name, &
status='replace', action='write')
write (restart_file, '(1i7)') n
close (restart_file)
if (.not. finished) then
write (STDOUT, '(1x, 1a)') 'CONTINUE'
else
write (STDOUT, '(1x, 1a)') 'FINISHED'
end if
end if
stop 0
! error handlers
! error while opening the checkpoint file for reading
100 write (STDOUT, '(1x, 3a)') 'Error: while opening ', restart_name, &
' for reading'
write (STDOUT, '(8x, 1a, 1i7)') 'iostat = ', status
stop 2
! error while trying to read input file
110 write (STDOUT, '(1x, 2a)') 'Error: while reading from ', restart_name
write (STDOUT, '(8x, 1a, 1i7)') 'iostat = ', status
stop 3
! error while opening the checkpoint file for writing
120 write (STDOUT, '(1x, 3a)') 'Error: while opening ', restart_name, &
' for writing'
write (STDOUT, '(8x, 1a, 1i7)') 'iostat = ', status
stop 5
! error while trying to read from time_limit_string
130 write (STDOUT, '(1x, 1a)') 'Error: bad format of RSAVE_TIME_LIMIT'
write (STDOUT, '(8x, 1a, 1i7)') 'iostat = ', status
stop 6
end program rts
The program can be compiled as follows:
gustav@sp19:../LoadLeveler 14:45:36 !573 $ gcc -E -P -C -DXLF rts.cpp > rts.f gustav@sp19:../LoadLeveler 14:46:01 !574 $ xlf90 -o rts rts.f ** rts === End of Compilation 1 === 1501-510 Compilation successful for file rts.f. gustav@sp19:../LoadLeveler 14:46:07 !575 $
And here is how I've run it. Observe another subtle difference between our C and Fortran-90 examples: when the Fortran program exits, apart from writing CONTINUE or FINISHED it also writes STOP 0. If our LoadLeveler script was to inspect only the last line of the log file for the word CONTINUE, we would have missed it in this case. So, instead, the script will grep through the whole file. Of course, this assumes that a new log file will be created each time.
gustav@sp19:../LoadLeveler 14:46:19 !576 $ env | grep RSAVE
RSAVE_TIME_LIMIT=30
RSAVE_CHECKFILE=rts.dat
RSAVE_RESTART=yes
gustav@sp19:../LoadLeveler 14:46:36 !577 $ unset RSAVE_RESTART
gustav@sp19:../LoadLeveler 14:46:42 !578 $ ./rts
Time for this job limited to 30 seconds
Starting a new run
n = 0
computing ...
n = 1 time left = 25 seconds
n = 2 time left = 20 seconds
n = 3 time left = 15 seconds
Run out of time, exiting ...
done.
n = 3
Saving data on rts.dat
CONTINUE
STOP 0
gustav@sp19:../LoadLeveler 14:47:01 !579 $ export RSAVE_RESTART="yes"
gustav@sp19:../LoadLeveler 14:47:24 !580 $ ./rts
Time for this job limited to 30 seconds
Restarting the job from rts.dat
n = 3
computing ...
n = 4 time left = 25 seconds
n = 5 time left = 20 seconds
n = 6 time left = 15 seconds
Run out of time, exiting ...
done.
n = 6
Renaming the old restart file to rts.dat.old
Saving data on rts.dat
CONTINUE
STOP 0
gustav@sp19:../LoadLeveler 14:47:42 !581 $
...
gustav@sp19:../LoadLeveler 14:52:46 !588 $ ./rts
Time for this job limited to 30 seconds
Restarting the job from rts.dat
n = 27
computing ...
n = 28 time left = 25 seconds
n = 29 time left = 20 seconds
n = 30 time left = 15 seconds
Run out of time, exiting ...
done.
n = 30
Renaming the old restart file to rts.dat.old
Saving data on rts.dat
CONTINUE
STOP 0
gustav@sp19:../LoadLeveler 14:53:15 !589 $ ./rts
Time for this job limited to 30 seconds
Restarting the job from rts.dat
n = 30
computing ...
n = 31 time left = 25 seconds
done.
n = 31
Renaming the old restart file to rts.dat.old
Saving data on rts.dat
FINISHED
STOP 0
gustav@sp19:../LoadLeveler 14:53:31 !590 $