diff --git a/scripts/enqueue b/scripts/enqueue index f47e8b9..217cd0c 100755 --- a/scripts/enqueue +++ b/scripts/enqueue @@ -9,6 +9,116 @@ fi # 'if anything at all happens try and shut down our job cleanly' approach SIGNALS="INT TERM HUP QUIT KILL PIPE" + +# 1 2 3 4 5 6 7 8 9 +# system retry_period total_retries key no_lock_mods completion completion_timeout errortxt logfile keep_alive linux dtbflag file_count files + +EnqueueOneSystem() { + + system="$1" + retry_period="$2" + total_retries="$3" + key="$4" + no_lock_mods="$5" + completion="$6" + completion_timeout="$7" + errortxt="$8" + logfile="$9" + shift 9 + keep_alive="$1" + linux="$2" + dtbflag="$3" + file_count="$4" + files="$5" + + + if [ ! -z "${linux}" ]; then + if ! SystemBootsLinuxPXE "${system}" ; then + echo "System ${system} cannot boot Linux over the network." + exit 1 + fi + fi + + if [ ! -z "$dtbflag" ]; then + if ! SystemAcceptsDtb "${system}" ; then + echo "System ${system} does not accept a dtb" + exit 1 + fi + fi + + if [ ! -z "$dtbflag" ] && [ -z "$linux" ]; then + echo "A dtb may only be supplied when booting a linux image over the network." + exit 1 + fi + + # Check that the number of files specified is correct + if [ "$interact" != "-r" ] ; then + if ! SystemCorrectNumberOfFiles ${linux} "${system}" "${file_count}"; then + echo "Wrong number of files specified for system ${system}" + exit 1 + fi + fi + + if ${no_lock_mods} ; then + if ! LockIOwn "${system}"; then + echo "Failed to run because you said you own the lock (-n flag), but you don't!" + LockSystemPrintInfo "${system}" + return 1 + fi + lockkey=$(LockGetKey "${system}") + if [ "${lockkey}" != "${key}" ] + then + echo "Failed to run because you said you own the lock (-n flag), but you gave the wrong key!" 
+ return 2 + fi + + else + echo "Acquiring lock for ${system}" + + # Print out potentially useful information to the user about + # the current status of the lock + LockSystemPrintInfo "${system}" + + # We can set up the trap early as we check if we actually own the + # lock before releasing it + trap "UnlockSystem \"${system}\" 0 \"${key}\"; exit 1" ${SIGNALS} + + if ! LockSystem "${system}" "${retry_period}" "${total_retries}" "${key}"; then + # Locking failed; reset the trap + # The outer Enqueue function will handle reporting the lock failure + trap "exit 1" ${SIGNALS} + return 1 + fi + fi + + echo "Lock acquired, we are allowed to run" + local ret=0 + + if [ "${interact}" = "-r" ]; then + echo "This is a reservation. You now own ${system} and" + echo "can do what you want. Press ctrl+d or enter here when done" + read line + echo "Attempting to power off machine now that you are done" + SystemPowerOff "${system}" + else + # Run the image. 'files' is deliberately not in quotes so the multiple + # files expands to multiple arguments + # 'dtbflag' is deliberately not in quotes so it expands to multiple + # parameters '-b' and 'my.dtb' + # 'linux' is deliberately not in quotes so it will not create an empty + # parameter if the flag is not set + SystemRunImage "${system}" "${completion}" "${completion_timeout}" "${errortxt}" "${logfile}" "${keep_alive}" ${linux} ${dtbflag} $files + ret=$? + fi + + if ! ${no_lock_mods} ; then + UnlockSystem "${system}" 0 "${key}" + trap "exit 1" ${SIGNALS} + fi + + exit $ret +} + EnqueueUsage() { echo "$0 run -r|-c -l logfile -s system [-w retry-time] [-t retry-count] [-n] [-a] [-d timeout] [-e ] [-k ] [-L] [-b ] -f file1 -f file2 .. -f filen" echo @@ -159,97 +269,75 @@ Enqueue() { exit 1 fi + + local ret=0 + # Verify the requested system exists - if ! IsSystemValid "${system}" ; then - # If it's not a system, is it a pool? - if ! IsPoolValid "${system}" ; then - echo "System or pool '$system' does not exist. 
Valid systems are" - SystemList - echo "Valid pools are" - PoolList - exit 1 + if IsSystemValid "${system}" ; then + + # Everything is quoted so we'll pass on the exact same number of parameters every time + if EnqueueOneSystem "${system}" "${retry_period}" "${total_retries}" "${key}" "${no_lock_mods}" "${completion}" "${completion_timeout}" "${errortxt}" "${logfile}" "${keep_alive}" "${linux}" "${dtbflag}" "${file_count}" "${files}" ; then + ret=$? + exit "${ret}" fi - # Select a system from the pool + + # If we get here, the enqueuing failed + echo "Failed to acquire lock for system (${system})" + exit 2 + + # If it's not a system, is it a pool? + elif IsPoolValid "${system}" ; then pool=${system} - system=$(GetRandomSystemFromPool_"${pool}") - echo "Selecting system '${system}' from pool '${pool}'" - fi - if [ ! -z "${linux}" ]; then - if ! SystemBootsLinuxPXE "${system}" ; then - echo "System cannot boot Linux over the network." - exit 1 - fi - fi + # When attempting to lock a pool, we use the retry time and retry count + # given by the user for how often we attempt locking on the whole pool, + # not for lock attempts on a specific system. + # Each time the user-configured period elapses, we try (but do not wait + # on) locking each system in the pool. - if [ ! -z "$dtbflag" ]; then - if ! SystemAcceptsDtb "${system}" ; then - echo "System does not accept a dtb" - exit 1 - fi - fi + lock_tries=0 - if [ ! -z "$dtbflag" ] && [ -z "$linux" ]; then - echo "A dtb may only be supplied when booting a linux image over the network." - exit 1 - fi + # Need to use a different name so as not to collide with other local variables + loop_retries="$total_retries" + loop_period="$retry_period" - # Check that the number of files specified is correct - if [ "$interact" != "-r" ] ; then - if ! 
SystemCorrectNumberOfFiles ${linux} "${system}" "${file_count}"; then - echo "Wrong number of files specified for system ${system}" - exit 1 - fi - fi + # Loop until total_retries + 1 because we want to make the initial attempt, + # and then retry the correct number of times + while [ "${loop_retries}" -eq -1 -o "${lock_tries}" -lt "$((${loop_retries}+1))" ] ; do - if ${no_lock_mods} ; then - if ! LockIOwn "${system}"; then - echo "Failed to run because you said you own the lock (-n flag), but you don't!" - LockSystemPrintInfo "${system}" - exit 1 - fi - else - echo "Acquiring lock for ${system}" + # Loop through and try all systems in the pool. + for system in $(Systems_"${pool}"); do - # Print out potentially useful information to the user about - # the current status of the lock - LockSystemPrintInfo "${system}" + echo "Attempting system ${system} in pool ${pool}" - # We can setup the trap early as we check if we actually own the - # the lock before releasing it - trap "UnlockSystem \"${system}\" 0 \"${key}\"; exit 1" ${SIGNALS} + # Note we do not retry and give a small retry period (which is unused) + # We only want to attempt locking, not wait on it + if EnqueueOneSystem "${system}" "1" "0" "${key}" "${no_lock_mods}" "${completion}" "${completion_timeout}" "${errortxt}" "${logfile}" "${keep_alive}" "${linux}" "${dtbflag}" "${file_count}" "${files}" ; then + # Running succeeded on a board + ret=$? + exit "${ret}" + fi - if ! LockSystem "${system}" "${retry_period}" "${total_retries}" "${key}"; then - echo "Failed to acquire lock for system (${system})" - exit 2 - fi - fi + done - echo "Lock acquired, we are allowed to run" - local ret=0 + echo "Done attempting system ${system} in pool ${pool}" - if [ "${interact}" = "-r" ]; then - echo "This is a reservation. You now own ${system} and" - echo "can do what you want. 
Press ctrl+d or enter here when done" - read line - echo "Attempting to power off machine now that you are done" - SystemPowerOff "${system}" - else - # Run the image. 'files' is deliberately not in quotes so the multiple - # files expands to multiple arguments - # 'dtbflag' is deliberately not in quotes so it expands to multiple - # parameters '-b' and 'my.dtb' - # 'linux' is deliberately not in quotes so it will not create an empty - # parameter if the flag is not set - SystemRunImage "${system}" "${completion}" "${completion_timeout}" "${errortxt}" "${logfile}" "${keep_alive}" ${linux} ${dtbflag} $files - ret=$? - fi + lock_tries=$(($lock_tries+1)) - if ! ${no_lock_mods} ; then - UnlockSystem "${system}" 0 "${key}" - trap "exit 1" ${SIGNALS} - fi + sleep "${loop_period}" - exit $ret + done + + # If we get here, we've exhausted the specified number of retries + echo "Failed to acquire lock for any system in pool (${pool})" + exit 2 + else + + echo "System or pool '$system' does not exist. 
Valid systems are" + SystemList + echo "Valid pools are" + PoolList + exit 1 + fi } diff --git a/scripts/lock b/scripts/lock index 063a38e..30f9b4a 100755 --- a/scripts/lock +++ b/scripts/lock @@ -97,7 +97,7 @@ LockSystem() { total_retries="$3" key="$4" newtimeout="$5" - timeout="" + oldtimeout="" owner=$(LockOwner "${system}") # Error checking - the same person can't relock the same lock @@ -120,21 +120,21 @@ LockSystem() { then # Ensure timeout is either an integer or ignored # to avoid lockfile syntax errors - timeout="$(LockTimeout $system)" || timeout="" - case $timeout in + oldtimeout="$(LockTimeout $system)" || oldtimeout="" + case $oldtimeout in ''|*[!0-9]*) # no valid timeout - timeout="" + oldtimeout="" ;; *) # valid timeout - timeout="-l ${timeout}" + oldtimeout="-l ${oldtimeout}" ;; esac fi lockname="$(LockName ${system})" keyname="$(KeyName ${system})" - RemoteCommand "umask 0111 && lockfile -'${retry_period}' $timeout -r '${total_retries}' '$lockname' && rm -f '$keyname' && printf '${key}' > '$keyname' && chmod a-w,g+r '$keyname' && chmod u+w $lockname && printf '$newtimeout' > $lockname && chmod a-w $lockname" + RemoteCommand "umask 0111 && lockfile -'${retry_period}' $oldtimeout -r '${total_retries}' '$lockname' && rm -f '$keyname' && printf '${key}' > '$keyname' && chmod a-w,g+r '$keyname' && chmod u+w $lockname && printf '$newtimeout' > $lockname && chmod a-w $lockname" [ "$?" 
-ne 0 ] && return 3 @@ -190,7 +190,7 @@ UserLockUsage() { echo " -wait SYSTEM Acquire the lock for the specified SYSTEM" echo " -cancel SYSTEM Cancel '-wait' processes on the server that are waiting for specified SYSTEM and key" echo " -w TIME Number of seconds to wait between each attempt to acquire the lock (default 8)" - echo " -t RETRIES Number of retries to preform for acquiring the lock (default -1)" + echo " -t RETRIES Number of retries to perform for acquiring the lock (default -1)" echo " -f Forcefully releases a lock even if you are not the owner" echo " -k LOCK_KEY Set a key inside the lock" echo " -T timeout Allow lock to be reclaimed after timeout seconds" @@ -282,47 +282,121 @@ UserLock() { *);; esac - IsSystemValid "${system}" - if [ $? -ne 0 ]; then - # If it's not a system, is it a pool? - IsPoolValid "${system}" - if [ $? -ne 0 ] ; then - echo "System or pool '$system' does not exist. Valid systems are" - SystemList - echo "Valid pools are" - PoolList - exit 1 - fi - # Select a system from the pool + if IsSystemValid "${system}" ; then + + case "${action}" in + -info) + LockSystemPrintInfo "${system}" + ;; + -mr-info) + LockSystemDumpInfo "${system}" + ;; + -signal) + UnlockSystem "${system}" "${force}" "${key}" + ;; + -wait) + if ! LockSystem "${system}" "${retry_period}" "${total_retries}" "${key}" $timeout; then + echo "Failed to acquire lock for system (${system})" + exit 2 + fi + ;; + -cancel) + CancelWait "${system}" "${key}" + ;; + *) + echo "Unknown usage" + UserLockUsage + exit 1 + ;; + esac + + elif IsPoolValid "${system}" ; then pool=${system} - system=$(GetRandomSystemFromPool_"${pool}") - echo "${system}" - fi - case "${action}" in - -info) - LockSystemPrintInfo "${system}" - ;; - -mr-info) - LockSystemDumpInfo "${system}" - ;; - -signal) - UnlockSystem "${system}" "${force}" "${key}" - ;; - -wait) - if ! 
LockSystem "${system}" "${retry_period}" "${total_retries}" "${key}" $timeout; then - echo "Failed to acquire lock for system (${system})" + case "${action}" in + -info) + for system in $(Systems_"${pool}"); do + LockSystemPrintInfo "${system}" + done + ;; + -mr-info) + for system in $(Systems_"${pool}"); do + LockSystemDumpInfo "${system}" + done + ;; + -signal) + for system in $(Systems_"${pool}"); do + UnlockSystem "${system}" "${force}" "${key}" + done + ;; + -wait) + + # When attempting to lock a pool, we use the retry time and retry count + # given by the user for how often we attempt locking on the whole pool, + # not for lock attempts on a specific system. + # Each time the user-configured period elapses, we try (but do not wait + # on) locking each system in the pool. + + lock_tries=0 + local ret=0 + + # Need to use a different name so as not to collide with other local variables + loop_retries="$total_retries" + loop_period="$retry_period" + + # Loop until total_retries + 1 because we want to make the initial attempt, + # and then retry the correct number of times + while [ "${loop_retries}" -eq -1 ] || [ "${lock_tries}" -lt "$((${loop_retries}+1))" ] ; do + + # Loop through and try all systems in the pool. + for system in $(Systems_"${pool}"); do + + echo "Attempting system ${system} in pool ${pool}" + + # Note we do not retry the individual lock attempt, + # and give only a small retry period (which is unused) + # We only want to attempt locking, not wait on it + if LockSystem "${system}" "1" "0" "${key}" $timeout ; then + # Locking succeeded on a board + # We need to tell the user which one + echo "${system}" + ret=$? 
+ exit "${ret}" + fi + done + + echo "Done attempting system ${system} in pool ${pool}" + + lock_tries=$(($lock_tries+1)) + + sleep "${loop_period}" + + done + + # If we reach this point, we've exhausted all retry attempts + echo "Failed to acquire lock for any system in pool (${pool})" exit 2 - fi - ;; - -cancel) - CancelWait "${system}" "${key}" - ;; - *) - echo "Unknown usage" - UserLockUsage - exit 1 - ;; - esac + + ;; + -cancel) + for system in $(Systems_"${pool}"); do + CancelWait "${system}" "${key}" + done + ;; + *) + echo "Unknown usage" + UserLockUsage + exit 1 + ;; + esac + + else + echo "System or pool '$system' does not exist. Valid systems are" + SystemList + echo "Valid pools are" + PoolList + exit 1 + fi + exit 0 }