diff --git a/oar/tools/oar_phoenix.py b/oar/tools/oar_phoenix.py index 7dd79955..ec599aec 100755 --- a/oar/tools/oar_phoenix.py +++ b/oar/tools/oar_phoenix.py @@ -35,7 +35,7 @@ PHOENIX_MAX_REBOOTS = 20 # Timeout (s) for unix commands -PHOENIX_CMD_TIMEOUT = 15 +PHOENIX_CMD_TIMEOUT = 60 # Properties of the broken nodes (SQL where clause) PHOENIX_BROKEN_NODES = "state='Suspected' and network_address NOT IN (SELECT distinct(network_address) FROM resources where resource_id IN (SELECT resource_id FROM assigned_resources WHERE assigned_resource_index = 'CURRENT')) and network_address not like 'luke%' and network_address != 'dahu33' and network_address not like 'bigfoot%'" @@ -66,6 +66,7 @@ def send_cmd(cmd): else: logfile.write(f"{current_time} - Command executed, no output\n") except subprocess.TimeoutExpired: + logfile.write(f"{current_time} - Command timed out!\n") process.kill() return "Timed out!" except Exception as e: @@ -125,20 +126,22 @@ def get_nodes_to_hard_reboot(db, broken_nodes): # Soft reboot nodes def soft_reboot_nodes(db, nodes): - with open(f"{PHOENIX_LOGDIR}/oar_phoenix.log", "a") as logfile: + logfile=open(f"{PHOENIX_LOGDIR}/oar_phoenix.log", "a") current_time = time.strftime("%Y-%m-%d %H:%M:%S") for node in nodes: logfile.write(f"{current_time} - Soft rebooting the broken node {node}\n") + logfile.close() cmd = PHOENIX_SOFT_REBOOTCMD.replace("{NODENAME}", node) db[node] = {'soft_reboot': time.time()} send_cmd(cmd) # Hard reboot nodes def hard_reboot_nodes(db, nodes): - with open(f"{PHOENIX_LOGDIR}/oar_phoenix.log", "a") as logfile: + logfile=open(f"{PHOENIX_LOGDIR}/oar_phoenix.log", "a") current_time = time.strftime("%Y-%m-%d %H:%M:%S") for node in nodes: logfile.write(f"{current_time} - Hard rebooting the broken node {node}\n") + logfile.close() cmd = PHOENIX_HARD_REBOOTCMD.replace("{NODENAME}", node) del db[node] db[node] = {'hard_reboot': time.time()}