You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
o3de/Tools/LyTestTools/ly_test_tools/environment/watchdog.py

244 lines
11 KiB
Python

"""
Copyright (c) Contributors to the Open 3D Engine Project.
For complete copyright and license terms please see the LICENSE at the root of this distribution.
SPDX-License-Identifier: Apache-2.0 OR MIT
The Watchdog parent class that spawns a separate thread to wait for a specific condition. If that condition is
fulfilled, then it can raise an exception or log an error.
"""
import threading
import logging
import os
import re
import psutil
import time
import ly_test_tools.environment.process_utils as process_utils
from ly_test_tools import WINDOWS
logger = logging.getLogger(__name__)
class WatchdogError(Exception):
""" Indicates that a Watchdog met its exception condition """
class Watchdog(object):
DEFAULT_JOIN_TIMEOUT = 10 # seconds
def __init__(self, bool_fn, interval=1, raise_on_condition=True, name='ly_test_watchdog', error_message=''):
# type: (func, int, bool, str, str) -> Watchdog
"""
A Watchdog object that takes in a boolean function. It spawns a thread that loops over the boolean function
until it returns True. If the boolean function returns True, then a flag will be set and an exception
will be raised when stop() is called.
:param bool_fn: A function that must return a boolean. It will be ran by the spawned thread until it's True
:param interval: The interval (in seconds) for how frequently the bool_fn is called on the thread.
:param raise_on_condition: If True, raises an exception when bool_fn returns True. If False, logs an error message
when bool_fn returns True.
:param name: The name of the thread.
:param error_message: The error message to log when bool_fn returns True. Defaults to printing the watchdog name
and function name.
"""
self.caught_failure = False
self.name = name
self._bool_fn = bool_fn
self._interval = interval
self._raise_on_condition = raise_on_condition
default_error_message = f'Watchdog: {name} caught an unexpected condition from function: {bool_fn.__name__}()'
self._error_message = error_message if error_message else default_error_message
self._shutdown = threading.Event()
self._watchdog_thread = threading.Thread(target=self._watchdog, name=name)
def start(self):
# type: () -> None
"""
Starts the watchdog's thread which spawns it and begins executing its target function. Also clears the thread's
shutdown Event. Clears the caught_failure attribute if the thread has been restarted.
:return: None
"""
self._shutdown.clear()
self._watchdog_thread.start()
self.caught_failure = False
def stop(self, join_timeout=DEFAULT_JOIN_TIMEOUT):
# type: () -> None
"""
Stops the watchdog's thread if it's executing by enabling its shutdown Event. If the target function's condition
was found, then it will either raise an exception or log an error message.
:param join_timeout: The timeout to wait for the watchdog thread to join.
:return: None
"""
# Set the Event attribute so that the thread stops running
self._shutdown.set()
# Join the watchdog thread back to primary thread
self._watchdog_thread.join(timeout=join_timeout)
if self.is_alive():
# thread has timed out because it's still alive
logger.error(f'Thread: {self.name} timed out when calling join()')
# No further action taken if nothing was caught
if not self.caught_failure:
return
# either raise an exception or log an error
if self._raise_on_condition:
raise WatchdogError(self._error_message)
logger.error(self._error_message)
def is_alive(self):
# type: () -> bool
"""
The thread is killed when it times out or its target returns True. Timed out threads are considered not alive.
:return: Returns True if the thread is alive, else False
"""
return self._watchdog_thread.is_alive()
def _watchdog(self):
# type: () -> None
"""
The main function of the watchdog thread. It will repeatedly call its target function until the target function
returns True, in which it will set the self.caught_failure attribute to True.
:return: None
"""
while True:
# check to see if the thread is shut down
if self._shutdown.wait(timeout=self._interval):
logger.info(f"Shutting down watchdog: {self.name}")
return
# call the target function and see if it returned True
if self._bool_fn():
self.caught_failure = True
return
class ProcessUnresponsiveWatchdog(Watchdog):
def __init__(self, process_id, interval=1, raise_on_condition=True, name='process_watchdog', error_message='',
unresponsive_timeout_seconds=30):
# type: (int, int, bool, str, str, int) -> ProcessUnresponsiveWatchdog
"""
Watches a process ID and reports if it is unresponsive for a given timeout. If multiple processes need to be
watched, then multiple watchdogs should be instantiated.
Note: This is for windows OS only.
:param process_id: The process id to watch
:param interval: The interval (in seconds) for how frequently the bool_fn is called on the thread.
:param raise_on_condition: If True, raises an exception when bool_fn returns True. If False, logs an error
message when bool_fn returns True.
:param name: The name of the thread.
:param error_message: The error message when bool_fn returns True. Defaults to the watchdog name and pid
:param unresponsive_timeout_seconds: How long the process needs to be unresponsive for in order for the watchdog
to report (in seconds).
"""
if not WINDOWS:
raise (NotImplementedError, "Process watchdog is only implemented on Windows.")
self._unresponsive_timeout = unresponsive_timeout_seconds
self._calculated_timeout_point = None
self._pid = process_id
self._process_name = psutil.Process(self._pid).name()
self._default_error_message = f"Process watchdog has found that process: {self._process_name} with pid: " \
f"{self._pid} was unresponsive during the test run. Investigate process " \
f"{self._process_name} for failures."
if not error_message:
error_message = self._default_error_message
super(ProcessUnresponsiveWatchdog, self).__init__(bool_fn=self._process_not_responding, interval=interval,
raise_on_condition=raise_on_condition, name=name,
error_message=error_message)
def _process_not_responding(self):
# type: () -> bool
"""
Checks to see if the process has been unresponsive longer than self._unresponsive_timeout. Once the process is
found to be unresponsive, it will keep track of how long it has been unresponsive for.
The cmd returns a string of not responding tasks in the following format:
"b'\r\n
Image Name PID Session Name Session# Mem Usage\r\n
========================= ======== ================ =========== ============\r\n
foo.exe 00000 Console 1 0 K\r\n'"
:return: True if the process has been unresponsive for self._unresponsive_timeout seconds, else False
"""
_PROCESS_OUTPUT_ENCODING = 'utf-8'
cmd = 'tasklist /FI "PID eq %d" /FI "STATUS eq not responding"' % self._pid
# searches for a process name and pid with white space in between them and at the end
regex = f'{self._process_name}\s+{self._pid}\s'
status = process_utils.check_output(cmd)
if re.search(regex, status):
if self._calculated_timeout_point is None:
# First instance of process unresponsive
self._calculated_timeout_point = time.time() + self._unresponsive_timeout
elif time.time() >= self._calculated_timeout_point:
return True
else:
# Process is responsive
self._calculated_timeout_point = None
return False
def get_pid(self):
# type: () -> int
"""
Get the pid that the watchdog is watching.
:return: The process id of the watchdog
"""
return self._pid
class CrashLogWatchdog(Watchdog):
def __init__(self, log_path, interval=1, raise_on_condition=True, name='crash_log_watchdog', error_message=''):
# type: (str, int, bool, str, str) -> CrashLogWatchdog
"""
A watchdog that watches if a file gets created and reports if it finds the file. The watchdog will check to see
if the file already exists and removes it before starting to watch.
:param log_path: The absolute path of the log to watch
:param interval: The interval (in seconds) for how frequently the bool_fn is called on the thread.
:param raise_on_condition: If True, raises an exception when bool_fn returns True. If False, logs an error message
when bool_fn returns True.
:param name: The name of the thread.
:param error_message: The error message to log when bool_fn returns True. Defaults to printing the watchdog name
"""
self._log_path = log_path
def crash_exists():
return os.path.exists(log_path)
if not error_message:
error_message = f"Crash log watchdog has detected an error log found at {log_path} during the test run. \
Investigate the process that creates the error log for failures."
if os.path.exists(log_path):
logger.info(f"Removing existing {log_path} when initializing crash log watchdog.")
os.remove(log_path)
super(CrashLogWatchdog, self).__init__(bool_fn=crash_exists, interval=interval,
raise_on_condition=raise_on_condition,
name=name, error_message=error_message)
def stop(self):
# type: () -> None
"""
Prints the crash log if able when stopping the watchdog.
:return: None
"""
header = "================= Crash Log Print =================\n"
if self.caught_failure:
print(header)
with open(self._log_path, "r") as crash_log:
for line in crash_log:
print(line.strip('\n'))
print("=" * len(header) + "\n")
super(CrashLogWatchdog, self).stop()