#!/bin/ksh
#------------------------------------------------------------------#
# This is a script for running PVMe jobs under the "LoadLeveler"   #
# batch queuing system at NIST.                                    #
#------------------------------------------------------------------#
# Please report any difficulties you have in running this script   #
# to karin@cam.nist.gov                                            #
#------------------------------------------------------------------#
#
# Variables read from the environment (none of them is set here):
#   NPROCS        argument handed to the PVMe daemon (number of nodes)
#   CLUSTER       reported in the log as the job ID
#   PROGRAM       executable that is copied to the nodes and started
#   NODEPROGRAM   copied to the nodes together with PROGRAM
#   INPUT_FILES   additional files copied to every node
#   DEBUG_FILES   additional files copied to every node
#   INPUT         file connected to the program's standard input;
#                 defaults to /dev/null when unset or empty
#   ARGUMENTS     command-line arguments passed to PROGRAM
#   DEBUG         optional command prefix used when PROGRAM is run on
#                 the current node
#   USERSETUP     optional executable run once after the daemon is up
#   USERCLEAN     optional executable run once before final clean-up
#   PVM           console command that receives "halt"; defaults to pvm
#
# Exit status: 1 if the user name, the session directory or the PVMe
# daemon cannot be set up, or if an interrupt is trapped; 0 otherwise.

#------------------------------------------------------------------#
# First, set some variables that will be used to manage jobs/files:
#------------------------------------------------------------------#
export CMD="${0##*/}"

USER=$(whoami)
if [ -z "$USER" ]; then
  # TMPDIR is derived from USER and is removed recursively at the end,
  # so an empty user name must never get that far.
  echo "$CMD: cannot determine the user name. Exiting..." >&2
  exit 1
fi
export USER
export TMPDIR="/tmp/$USER"

# Look up the *current* user's passwd entry (anchored on the login-name
# field) so that MYSHELL really is this user's login shell.
MYUID=$(grep "^${USER}:" /etc/passwd | awk -F: '{print $3}')
MYSHELL=$(grep "^${USER}:" /etc/passwd | awk -F: '{print $7}')
export MYUID MYSHELL

HOST=$(hostname)
export HOST="${HOST%%.*}"     # short host name, domain stripped
export INITIALDIR=$(pwd)

# Without an explicit INPUT the program simply reads from /dev/null.
INPUT=${INPUT:-/dev/null}

# PROGRAM may carry a directory part, but cp/rcp place only its last
# component into TMPDIR, so that is the name to execute from there.
program_name=${PROGRAM##*/}

# This umask needed to give access to PVMLIST - if user's umask is set
# to 077, this file can't be read, and the script fails.
umask 022

#------------------------------------------------------------------#
# Report to standard out and err which host is running the job:
#------------------------------------------------------------------#
date
date >&2
echo "$CMD: Running $CMD on node $HOST (Job ID: $CLUSTER)"
echo "$CMD: Running $CMD on node $HOST (Job ID: $CLUSTER)" >&2

#------------------------------------------------------------------#
# The SESSIONDIR variable is to allow for overlapping PVMe
# sessions.  That is, the nodelist files will remain separate, thus
# eliminating potential conflict if a user has multiple jobs
# running (on different sets of nodes).
#------------------------------------------------------------------#
SESSIONDIR="/tmp/$USER/$$"
# -p creates /tmp/$USER as well when it does not exist yet.
if [ ! -d "${SESSIONDIR}" ]; then
  mkdir -p "${SESSIONDIR}"
  if [ $? -ne 0 ]; then
    echo "$CMD: cannot create ${SESSIONDIR}. Exiting..." >&2
    exit 1
  fi
fi

#------------------------------#
# PVMe environment set-up:
#------------------------------#
# Set necessary environment variables:
export PVMDPATH=/usr/lpp/pvm3
export PVMEPATH="${TMPDIR}"
export PVMD="${PVMDPATH}/pvmd3e"
export PVMLIST="${SESSIONDIR}/pvmnodelist"
export NODELIST="$PVMLIST"
export PVMHFN=".pvmdname.$$"

# Use trap command before starting the PVMe daemon to handle
# trapping interrupts and take actions on cleaning up
# hopelessly lost PVMe daemon and user processes by calling
# pvmtidy utility.
# PROGRAM and NODEPROGRAM stay unquoted so that an empty value adds
# no argument.
trap 'pvmtidy ${PVMLIST} -u ${PROGRAM} ${NODEPROGRAM}; exit 1 ' 1 2 3 14 15

#---------------------------------------------------#
# start_pvmd
#   Launch the PVMe daemon in the background from SESSIONDIR, give
#   it 10 seconds, and then wait until ${HOME}/${PVMHFN} is readable.
# Globals:  reads SESSIONDIR PVMD NPROCS HOME PVMHFN CMD;
#           writes PVMD_PID
# Returns:  0 once the daemon file is readable,
#           1 if no background job is left after the 10 seconds.
#---------------------------------------------------#
start_pvmd() {
  # "&&": do not start the daemon from the wrong directory if the cd
  # fails.  NPROCS is unquoted so an unset value passes no argument.
  (cd "${SESSIONDIR}" && "$PVMD" $NPROCS) &
  sleep 10
  PVMD_PID=$(jobs -p)
  if [ -z "$PVMD_PID" ]; then
    return 1
  fi
  echo "$CMD: started $PVMD demon process_id=$PVMD_PID" >&2
  echo "$CMD: waiting for $PVMD ready" >&2
  # NOTE(review): this wait has no upper bound; if the daemon dies
  # before writing the file the job hangs here until it is killed.
  while [ ! -r "${HOME}/${PVMHFN}" ]; do
    sleep 1
  done
  echo "$CMD: ${HOME}/${PVMHFN} file generated... " >&2
  return 0
}

#---------------------------------------------------#
# Start the PVM daemon across the node partition:
#---------------------------------------------------#
echo "$CMD: starting $PVMD daemon on $NPROCS nodes" >&2
start_pvmd
if [ $? -ne 0 ]; then
  # Will reach this part if the daemon could not be started.
  # Try to reset PVM by calling PVMD with the -r option...
  echo "$CMD: Error: $PVMD was not started" >&2
  echo "$CMD: Running $PVMD with the reset option..." >&2
  "$PVMD" -r "$PVMLIST"
  # Then, try to start PVMD again..
  start_pvmd
  if [ $? -ne 0 ]; then
    echo "$CMD: Failed a second time. Exiting..." >&2
    echo "$CMD: Failed to start PVM. Exiting..."
    exit 1
  fi
fi

# Successfully started PVM;
# Record to standard out the nodes used (first column of the node list):
hostlist=$(awk '{print $1}' "$PVMLIST")
export hostlist
echo "Node pool:\n$hostlist"

#----------------------------------------------#
# Invoke a user setup script if available:
#----------------------------------------------#
# Runs exactly once.
if [ -n "${USERSETUP}" ] && [ -x "${USERSETUP}" ]; then
  echo "$CMD Invoking user set-up procedure ${USERSETUP}"
  "${USERSETUP}"
fi

#---------------------------------------------------------#
# Copy any necessary files to the local node directory:
#---------------------------------------------------------#
echo "$CMD: Copying files to node temporary directories (${TMPDIR})" >&2
echo "$CMD: Files to copy: " >&2
echo "${PROGRAM} ${NODEPROGRAM} ${INPUT_FILES} ${DEBUG_FILES}" >&2
# hostlist and the file variables are whitespace-separated lists and
# are split into words on purpose.
for host in $hostlist; do
  # mkdir fails harmlessly when the directory already exists.
  rsh "${host}" mkdir "${TMPDIR}" > /dev/null 2>&1
  rcp ${PROGRAM} ${NODEPROGRAM} ${INPUT_FILES} ${DEBUG_FILES} "${host}:${TMPDIR}"
done

#--------------------------------------------------------------------#
# If current script is not running on one of the POE pool nodes
# -- as could potentially happen if you specify 'parallelmachines'
# to be less than the allotted number of nodes for your class --
# then start the program on one of pool nodes:
#--------------------------------------------------------------------#
echo "$CMD: Checking for $HOST in the current partition..." >&2
grep "${HOST}" "${NODELIST}" > /dev/null 2>&1
if [ $? -ne 0 ]; then
  # Take the first host named in the node list.
  read -r RHOST REST < "${NODELIST}"
  echo "$CMD: Read host from hostlist -- Will execute on $RHOST" >&2

  if [ "${INPUT}" = "/dev/null" ]; then
    echo "$CMD: No standard input, will not copy /dev/null." >&2
  else
    rcp "${INPUT}" "${RHOST}:${TMPDIR}"
    # rcp stores the file under its base name.
    INPUT="${TMPDIR}/${INPUT##*/}"
  fi

  echo "$CMD: Starting $PROGRAM $ARGUMENTS < ${INPUT} on $RHOST:$TMPDIR" >&2
  echo "\n\n<<<<<<<<<<<<<< Start of program output >>>>>>>>>>>>>>\n\n"
  # The remote command is interpreted by the user's login shell, so
  # PVMHFN has to be exported in that shell's syntax.  Bourne-family
  # shells get the portable "VAR=value; export VAR" form; every other
  # value of MYSHELL keeps the csh form.
  case "$MYSHELL" in
    */sh | */ksh | */ksh[0-9]* | */bash)
      remote_env="PVMHFN=${PVMHFN}; export PVMHFN" ;;
    *)
      remote_env="setenv PVMHFN ${PVMHFN}" ;;
  esac
  rsh "$RHOST" "${remote_env}; cd ${TMPDIR}; ${TMPDIR}/${program_name} ${ARGUMENTS} < ${INPUT}; exit"
  echo "\n\n<<<<<<<<<<<<<< End of program output >>>>>>>>>>>>>>\n\n"
else
  #------------------------------------------------------------#
  # Otherwise, execute the program on the current node:
  #------------------------------------------------------------#
  # Quoted so that an empty PROGRAM is reported as "not found".
  if [ -x "${PROGRAM}" ]; then
    if [ "${INPUT}" = "/dev/null" ]; then
      echo "$CMD: No standard input, will not copy /dev/null." >&2
    else
      cp "${INPUT}" "${TMPDIR}"
      # cp stores the file under its base name.
      INPUT="${TMPDIR}/${INPUT##*/}"
    fi
    echo "$CMD: Starting $PROGRAM $ARGUMENTS < ${INPUT} on $HOST" >&2
    cd "${TMPDIR}"
    echo "\n\n<<<<<<<<<<<<<< Start of program output >>>>>>>>>>>>>>\n\n"
    # DEBUG and ARGUMENTS are split into words on purpose.
    ${DEBUG} "${TMPDIR}/${program_name}" $ARGUMENTS < "${INPUT}"
    echo "\n\n<<<<<<<<<<<<<< End of program output >>>>>>>>>>>>>>\n\n"
  else
    print -u2 "$CMD: error $PROGRAM not found"
  fi
fi

#--------------------------------------#
# Before exiting, HALT the daemon:
#--------------------------------------#
which pvm >&2
# When PVM is not set, fall back to the pvm found on PATH (the command
# the line above reports) instead of piping into an empty command.
echo halt | ${PVM:-pvm}
echo

#--------------------------------#
# Clean up node directories:
#--------------------------------#
# "-n" is required: with USERCLEAN unset a bare "[ -x ]" is true.
if [ -n "${USERCLEAN}" ] && [ -x "${USERCLEAN}" ]; then
  echo "$CMD Invoking user clean-up procedure ${USERCLEAN}"
  "${USERCLEAN}"
fi

#------------------------------------------------------------#
# Run pvmtidy utility at the end of the PVMe runstream to
# cleanup hopelessly lost PVMe daemon and user processes
# if there are any.
#------------------------------------------------------------#
echo "\npvmtidy $PVMLIST -u $PROGRAM $NODEPROGRAM" >&2
pvmtidy "$PVMLIST" -u $PROGRAM $NODEPROGRAM >&2
rm -f "$HOME/$PVMHFN" >&2

#------------------------------------------------------------#
# CD out of the directory where files will be removed:
#------------------------------------------------------------#
cd "$HOME"
echo "$CMD: Cleaning tmp directories on the nodes..." >&2
# NOTE(review): this removes all of /tmp/$USER, including the
# SESSIONDIR of any other job of the same user that is still running
# on these nodes - confirm that this is intended.
# ${TMPDIR:?} aborts instead of expanding to an empty path.
for host in $hostlist; do
  rsh "${host}" rm -fr "${TMPDIR:?}"
done
# and, in case current node wasn't in hostlist:
rm -rf "${TMPDIR:?}"
echo "$CMD: Done." >&2

#--------------------------------------#
# End clean-up; Exit shell program:
#--------------------------------------#
exit 0