Skip to content

Commit

Permalink
Troubleshoot blackbox crash test final verification hang (#13070)
Browse files Browse the repository at this point in the history
Summary:
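Add a timeout to the final verification step of the blackbox crash test (the new `verify_timeout` parameter, set to 1200 seconds), and print the db_stress stack trace with `pstack` when that timeout is hit. The crash test occasionally hangs in the verification step, and the stack trace will help debug those hangs.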

Pull Request resolved: #13070

Reviewed By: hx235

Differential Revision: D64414461

Pulled By: anand1976

fbshipit-source-id: 4629aac01fbe6c788665beddc66280ba446aadbe
  • Loading branch information
anand1976 authored and facebook-github-bot committed Oct 15, 2024
1 parent cbebbad commit 2abbb02
Showing 1 changed file with 8 additions and 2 deletions.
10 changes: 8 additions & 2 deletions tools/db_crashtest.py
Original file line number Diff line number Diff line change
Expand Up @@ -443,6 +443,8 @@ def is_direct_io_supported(dbname):
"duration": 6000,
# time for one db_stress instance to run
"interval": 120,
# timeout (in seconds) for the final verification step
"verify_timeout": 1200,
# since we will be killing anyway, use large value for ops_per_thread
"ops_per_thread": 100000000,
"reopen": 0,
Expand Down Expand Up @@ -1047,6 +1049,7 @@ def gen_cmd(params, unknown_params):
"cleanup_cmd",
"skip_tmpdir_check",
"print_stderr_separately",
"verify_timeout",
}
and v is not None
]
Expand All @@ -1055,16 +1058,19 @@ def gen_cmd(params, unknown_params):
return cmd


def execute_cmd(cmd, timeout=None):
def execute_cmd(cmd, timeout=None, timeout_pstack=False):
child = subprocess.Popen(cmd, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
print("Running db_stress with pid=%d: %s\n\n" % (child.pid, " ".join(cmd)))
pid = child.pid

try:
outs, errs = child.communicate(timeout=timeout)
hit_timeout = False
print("WARNING: db_stress ended before kill: exitcode=%d\n" % child.returncode)
except subprocess.TimeoutExpired:
hit_timeout = True
if timeout_pstack:
os.system("pstack %d" % pid)
child.kill()
print("KILLED %d\n" % child.pid)
outs, errs = child.communicate()
Expand Down Expand Up @@ -1139,7 +1145,7 @@ def blackbox_crash_main(args, unknown_args):
cmd = gen_cmd(
dict(list(cmd_params.items()) + list({"db": dbname}.items())), unknown_args
)
hit_timeout, retcode, outs, errs = execute_cmd(cmd)
hit_timeout, retcode, outs, errs = execute_cmd(cmd, cmd_params["verify_timeout"], True)

# For the final run
print_output_and_exit_on_error(outs, errs, args.print_stderr_separately)
Expand Down

0 comments on commit 2abbb02

Please sign in to comment.