110 lines
3.7 KiB
Python
110 lines
3.7 KiB
Python
## @package timeout_guard
|
|
# Module caffe2.python.timeout_guard
|
|
|
|
|
|
|
|
|
|
|
|
import contextlib
|
|
import threading
|
|
import os
|
|
import time
|
|
import signal
|
|
import logging
|
|
from future.utils import viewitems
|
|
|
|
|
|
'''
|
|
Sometimes CUDA devices can get stuck, 'deadlock'. In this case it is often
|
|
better just to kill the process automatically. Use this guard to set a
|
|
maximum timespan for a python call, such as RunNet(). If it does not complete
|
|
in time, the process is killed.
|
|
|
|
Example usage:
|
|
with timeout_guard.CompleteInTimeOrDie(10.0):
|
|
core.RunNet(...)
|
|
'''
|
|
|
|
|
|
class WatcherThread(threading.Thread):
    '''
    Daemon thread that kills the current process when a deadline passes.

    The thread waits on ``condition`` until either ``completed`` becomes
    true or ``timeout_secs`` seconds have elapsed since ``run`` started.
    On timeout it logs an error, prints the stack of every thread, sends
    SIGINT to its own process and, if the process is still alive 10
    seconds later, calls ``os.abort()``.

    Attributes:
        timeout_secs: number of seconds to wait before killing the process.
        completed: set to True by the owner to cancel the watcher.
        condition: ``threading.Condition`` the owner notifies after
            setting ``completed`` so the watcher wakes up early.
        caller_thread: the thread that constructed this watcher.
    '''

    def __init__(self, timeout_secs):
        threading.Thread.__init__(self)
        self.timeout_secs = timeout_secs
        self.completed = False
        self.condition = threading.Condition()
        # Daemon, so a pending watcher never keeps the interpreter alive.
        self.daemon = True
        self.caller_thread = threading.current_thread()

    @staticmethod
    def _format_stacks(only_thread_id=None):
        '''
        Return a printable dump of the current Python thread stacks.

        Args:
            only_thread_id: when not None, only the thread with this
                ident is included; otherwise every thread is dumped.
        '''
        # Imported locally: the module's top-level imports do not
        # include these two modules.
        import sys
        import traceback

        lines = []
        for thread_id, frame in sys._current_frames().items():
            if only_thread_id is not None and thread_id != only_thread_id:
                continue
            lines.append("\n# ThreadID: %s" % thread_id)
            for filename, lineno, name, line in traceback.extract_stack(frame):
                lines.append(
                    'File: "%s", line %d, in %s' % (filename, lineno, name)
                )
                if line:
                    lines.append(" %s" % (line.strip()))
        return "\n".join(lines)

    def run(self):
        '''Wait for completion or the deadline; on timeout kill the process.'''
        started = time.time()
        with self.condition:
            while not self.completed:
                # Read the clock once per iteration so the value that is
                # tested is the same value that is passed to wait().
                remaining = self.timeout_secs - (time.time() - started)
                if remaining <= 0:
                    break
                self.condition.wait(remaining)

        if self.completed:
            return

        log = logging.getLogger("timeout_guard")
        log.error("Call did not finish in time. Timeout:{}s PID: {}".format(
            self.timeout_secs,
            os.getpid(),
        ))

        # First try dying cleanly, but in 10 secs, exit properly
        def forcequit():
            time.sleep(10.0)
            log.info("Prepared output, dumping threads. ")
            print("Caller thread was: {}".format(self.caller_thread))
            print("-----After force------")
            # Only the caller's stack is dumped at this stage.
            print(self._format_stacks(self.caller_thread.ident))
            log.error("Process did not terminate cleanly in 10 s, forcing")
            os.abort()

        forcet = threading.Thread(target=forcequit, args=())
        forcet.daemon = True
        forcet.start()

        print("Caller thread was: {}".format(self.caller_thread))
        print("-----Before forcing------")
        print(self._format_stacks())
        # Polite attempt first: SIGINT to our own process.
        os.kill(os.getpid(), signal.SIGINT)
|
|
|
|
|
|
@contextlib.contextmanager
def CompleteInTimeOrDie(timeout_secs):
    '''
    Context manager that kills the process if its body runs too long.

    A WatcherThread is started on entry. When the body finishes, whether
    normally or by raising, the watcher is marked completed and woken up
    so that it exits without killing the process.

    Args:
        timeout_secs: maximum number of seconds the body may take.
    '''
    watcher = WatcherThread(timeout_secs)
    watcher.start()
    try:
        yield
    finally:
        # Must run even when the body raises: otherwise the watcher stays
        # armed and kills the process after the timeout although the
        # guarded call has already returned control to the caller.
        with watcher.condition:
            watcher.completed = True
            watcher.condition.notify()
|
|
|
|
|
|
def EuthanizeIfNecessary(timeout_secs=120):
    '''
    Arm a kill timer for a process that tends to hang while shutting down.

    Starts a WatcherThread that nothing marks as completed, so the process
    is killed if it is still running after timeout_secs seconds.
    '''
    WatcherThread(timeout_secs).start()
|