Merge pull request #114 from billsacks/fix_large_output_hang

Fix hangs when a subcommand has a lot of output. Rework execute_subprocess timeout handling to prevent hangs The previous implementation, relying on the lower-level subprocess.Popen, was hanging when trying to get the status of a git repository with many changed files (see #113). It seems that the problem was that subprocess.Popen buffers output, and we would need to write some extra code to periodically clear the output buffer. This seemed tricky to get right, so I have backed up to using the higher-level subprocess.check_output, as we did prior to #81. I am using a new method to detect hangs and print a useful message (which was the point of PR #81), based on threading.Timer. I can't figure out a way to kill the subprocess when it appears to be hanging, but at least we can notify the user of this possible problem. This commit brings this part of the code back to how it was prior to d6423c6, except now we have a hang-detection timer. As a side-benefit, it appears that this change speeds up the unit and system tests, based on a few trial runs before and after this change. (This is probably due to the removal of the 0.02 sec delay associated with the previous subprocess.Popen implementation.) User interface changes?: No Fixes #113 (Timeout when getting status from a git repository with many changes) Testing: test removed: none unit tests: pass system tests: pass manual testing: (1) Tested the problem reported in #113 by putting a ton of changes in components/clm (changing permissions on all files, and adding about 7000 files in a testmods directory). The original hung, this version succeeded quickly. (2) Tested the problem reported in #79 and fixed in #81 (because this PR undoes the changes from #81): I tested with authentication required (on hobart, by pointing to the NGEET fates repo): I changed the timeout to 10 seconds, ran 'manage_externals/checkout_externals clm', and reviewed the printed message. It looks like this, after about 10 seconds: Checking out externals: fates, Username for 'https://github.com': Command 'git clone --quiet https://github.com/NGEET/fates fates' from directory /scratch/cluster/sacks/cesm/components/clm/src has taken 10 seconds. It may be hanging. The command will continue to run, but you may want to abort manage_externals with ^C and investigate. A possible cause of hangs is when svn or git require authentication to access a private repository. On some systems, svn and git requests for authentication information will not be displayed to the user. In this case, the program will appear to hang. Ensure you can run svn and git manually and access all repositories without entering your authentication information.
ESMCI · Aug 21, 2018 · fc5acda · fc5acda
2 parents b0b23a6 + aa2eb71
commit fc5acda
Show file tree

Hide file tree

Showing 5 changed files with 59 additions and 94 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -17,6 +17,7 @@ matrix:
       # NOTE(bja, 2017-11) update is slow, 2.7.12 installed by default, good enough!
       # - brew update
       # - brew outdated python2 || brew upgrade python2
+      - pip install virtualenv
       - virtualenv env -p python2
       - source env/bin/activate
 install:
@@ -29,4 +30,3 @@ script:
 after_success:
   - cd test; make coverage
   - cd test; coveralls
-
diff --git a/manic/__init__.py b/manic/__init__.py
@@ -1,7 +1,7 @@
 """Public API for the manage_externals library
 """
 
-import manic.checkout as checkout
+from manic import checkout
 from manic.utils import printlog
 
 __all__ = [

diff --git a/manic/repository_git.py b/manic/repository_git.py
@@ -56,7 +56,8 @@ def checkout(self, base_dir_path, repo_dir_name, verbosity):
         """
         repo_dir_path = os.path.join(base_dir_path, repo_dir_name)
         repo_dir_exists = os.path.exists(repo_dir_path)
-        if (repo_dir_exists and not os.listdir(repo_dir_path)) or not repo_dir_exists:
+        if (repo_dir_exists and not os.listdir(
+                repo_dir_path)) or not repo_dir_exists:
             self._clone_repo(base_dir_path, repo_dir_name, verbosity)
         self._checkout_ref(repo_dir_path, verbosity)
 

diff --git a/manic/utils.py b/manic/utils.py
@@ -12,7 +12,7 @@
 import os
 import subprocess
 import sys
-import time
+from threading import Timer
 
 from .global_constants import LOCAL_PATH_INDICATOR
 
@@ -65,13 +65,15 @@ def last_n_lines(the_string, n_lines, truncation_message=None):
 
     lines = the_string.splitlines(True)
     if len(lines) <= n_lines:
-        return the_string
+        return_val = the_string
     else:
         lines_subset = lines[-n_lines:]
         str_truncated = ''.join(lines_subset)
         if truncation_message:
             str_truncated = truncation_message + '\n' + str_truncated
-        return str_truncated
+        return_val = str_truncated
+
+    return return_val
 
 
 def indent_string(the_string, indent_level):
@@ -119,9 +121,9 @@ def str_to_bool(bool_str):
     """
     value = None
     str_lower = bool_str.lower()
-    if (str_lower == 'true') or (str_lower == 't'):
+    if str_lower in ('true', 't'):
         value = True
-    elif (str_lower == 'false') or (str_lower == 'f'):
+    elif str_lower in ('false', 'f'):
         value = False
     if value is None:
         msg = ('ERROR: invalid boolean string value "{0}". '
@@ -199,80 +201,30 @@ def expand_local_url(url, field):
 # subprocess
 #
 # ---------------------------------------------------------------------
-_TIMEOUT_MSG = """ Timout errors typically occur when svn or git requires
-authentication to access a private repository. On some systems, svn
-and git requests for authentication information will not be displayed
-to the user.  In this case, the program will appear to hang and
-generate a timeout error. Ensure you can run svn and git manually and
-access all repositories without entering your authentication
-information."""
-
-_TIMEOUT_SEC = 300
-_POLL_DELTA_SEC = 0.02
-
-
-def _poll_subprocess(commands, status_to_caller, output_to_caller,
-                     timeout_sec=_TIMEOUT_SEC):
-    """Create a subprocess and poll the process until complete.
 
-    Impose a timeout limit and checkout process output for known
-    conditions that require user interaction.
+# Give the user a helpful message if we detect that a command seems to
+# be hanging.
+_HANGING_SEC = 300
 
-    NOTE: the timeout_delta has significant impact on run time. If it
-    is too long, and the many quick local subprocess calls will
-    drastically increase the run time, especially in tests.
 
-    NOTE: This function is broken out into for ease of
-    understanding. It does no error checking. It should only be called
-    from execute_subprocess, never directly.
+def _hanging_msg(working_directory, command):
+    print("""
 
-    """
-    logging.info(' '.join(commands))
-    output = []
-    start = time.time()
-
-    proc = subprocess.Popen(commands,
-                            shell=False,
-                            stdout=subprocess.PIPE,
-                            stderr=subprocess.STDOUT,
-                            universal_newlines=True)
-    while proc.poll() is None:
-        time.sleep(_POLL_DELTA_SEC)
-        if time.time() - start > timeout_sec:
-            proc.kill()
-            time.sleep(_POLL_DELTA_SEC * 5)
-            msg = ("subprocess call to  '{0}' has exceeded timeout limit of "
-                   "{1} seconds.\n{2}".format(commands[0], timeout_sec,
-                                              _TIMEOUT_MSG))
-            fatal_error(msg)
-    finish = time.time()
+Command '{command}'
+from directory {working_directory}
+has taken {hanging_sec} seconds. It may be hanging.
 
-    run_time_msg = "run time : {0:.2f} seconds".format(finish - start)
-    logging.info(run_time_msg)
-    output = proc.stdout.read()
-    log_process_output(output)
-    status = proc.returncode
+The command will continue to run, but you may want to abort
+manage_externals with ^C and investigate. A possible cause of hangs is
+when svn or git require authentication to access a private
+repository. On some systems, svn and git requests for authentication
+information will not be displayed to the user. In this case, the program
+will appear to hang. Ensure you can run svn and git manually and access
+all repositories without entering your authentication information.
 
-    # NOTE(bja, 2018-03) need to cleanup open files. In python3 use
-    # "with subprocess.Popen(...) as proc:", but that is not available
-    # with python2 unless we create a context manager.
-    proc.stdout.close()
-
-    if status != 0:
-        raise subprocess.CalledProcessError(returncode=status,
-                                            cmd=commands,
-                                            output=output)
-
-    if status_to_caller and output_to_caller:
-        ret_value = (status, output)
-    elif status_to_caller:
-        ret_value = status
-    elif output_to_caller:
-        ret_value = output
-    else:
-        ret_value = None
-
-    return ret_value
+""".format(command=command,
+           working_directory=working_directory,
+           hanging_sec=_HANGING_SEC))
 
 
 def execute_subprocess(commands, status_to_caller=False,
@@ -288,19 +240,24 @@ def execute_subprocess(commands, status_to_caller=False,
     return code, otherwise execute_subprocess treats non-zero return
     status as an error and raises an exception.
 
-    NOTE(bja, 2018-03) we are polling the running process to avoid
-    having it hang indefinitely if there is input that we don't
-    detect. Some large checkouts are multiple minutes long. For now we
-    are setting the timeout interval to five minutes.
-
     """
-    msg = 'In directory: {0}\nexecute_subprocess running command:'.format(
-        os.getcwd())
+    cwd = os.getcwd()
+    msg = 'In directory: {0}\nexecute_subprocess running command:'.format(cwd)
     logging.info(msg)
-    logging.info(commands)
+    commands_str = ' '.join(commands)
+    logging.info(commands_str)
+    return_to_caller = status_to_caller or output_to_caller
+    status = -1
+    output = ''
+    hanging_timer = Timer(_HANGING_SEC, _hanging_msg,
+                          kwargs={"working_directory": cwd,
+                                  "command": commands_str})
+    hanging_timer.start()
     try:
-        ret_value = _poll_subprocess(
-            commands, status_to_caller, output_to_caller)
+        output = subprocess.check_output(commands, stderr=subprocess.STDOUT,
+                                         universal_newlines=True)
+        log_process_output(output)
+        status = 0
     except OSError as error:
         msg = failed_command_msg(
             'Command execution failed. Does the executable exist?',
@@ -319,20 +276,27 @@ def execute_subprocess(commands, status_to_caller=False,
         # simple status check. If returning, it is the callers
         # responsibility determine if an error occurred and handle it
         # appropriately.
-        if status_to_caller and output_to_caller:
-            ret_value = (error.returncode, error.output)
-        elif status_to_caller:
-            ret_value = error.returncode
-        elif output_to_caller:
-            ret_value = error.output
-        else:
+        if not return_to_caller:
             msg_context = ('Process did not run successfully; '
                            'returned status {0}'.format(error.returncode))
             msg = failed_command_msg(msg_context, commands,
                                      output=error.output)
             logging.error(error)
+            logging.error(msg)
             log_process_output(error.output)
             fatal_error(msg)
+        status = error.returncode
+    finally:
+        hanging_timer.cancel()
+
+    if status_to_caller and output_to_caller:
+        ret_value = (status, output)
+    elif status_to_caller:
+        ret_value = status
+    elif output_to_caller:
+        ret_value = output
+    else:
+        ret_value = None
 
     return ret_value
 

diff --git a/test/.pylint.rc b/test/.pylint.rc
@@ -50,7 +50,7 @@ confidence=
 # --enable=similarities". If you want to run only the classes checker, but have
 # no Warning level messages displayed, use"--disable=all --enable=classes
 # --disable=W"
-disable=bad-continuation
+disable=bad-continuation,useless-object-inheritance
 
 
 # Enable the message, report, category or checker with the given id(s). You can