From 6955a5c2a0abf38adace2553f21ea072416b1f61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabiano=20Fid=C3=AAncio?= Date: Wed, 7 Dec 2016 21:36:56 +0100 Subject: [PATCH 160/160] WATCHDOG: Avoid non async-signal-safe from the signal_handler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While debugging rhbz#1396912 a deadlock on sssd_be was noticed[0] and it's been caused by the use of non async-signal-safe functions from the signal_handler (please, see man 7 signal for more info about which are the async-signal-safe functions that can be used). In order to work this situation around a pipe has been added to the watchdog_ctx structure and, in case of clock screw, a single byte is written to this pipe (which is an async-signal-safe operation) and the logic currently done by the timer handler to reset the watchdog will be done inside the fd handler in a safe way. With this patch we ended up losing some debug messages as orderly_shutdown() has been replaced by kill(-getpgrp(), SIGTERM) (or _exit(1) considering the cases where setting up the process group during the server_setup() has failed). Personally I don't think is worth the trouble to try to log those messages properly in this specific case. It's really worth to mention that a proper fix the clock screw situation should be implemented on samba's side, by having tevent using monotonic (or boottime) clock. [0]: [root@dusan ~]# pstack 17922 #0 __lll_lock_wait_private () at ../nptl/sysdeps/unix/sysv/linux/x86_64/lowlevellock.S:95 #1 0x00007fe707d04f93 in _L_lock_14932 () from /lib64/libc.so.6 #2 0x00007fe707d02013 in __GI___libc_malloc (bytes=140630248638304, bytes@entry=15) at malloc.c:2891 #3 0x00007fe707d0888a in __GI___strdup (s=0x7fe707dff4f7 "/etc/localtime") at strdup.c:42 #4 0x00007fe707d31b61 in tzset_internal (always=, explicit=explicit@entry=1) at tzset.c:438 #5 0x00007fe707d32523 in __tz_convert (timer=timer@entry=0x7ffcd5d2b090, use_localtime=use_localtime@entry=1, tp=tp@entry=0x7fe708041d40 <_tmbuf>) at tzset.c:621 #6 0x00007fe707d30521 in __GI_localtime (t=t@entry=0x7ffcd5d2b090) at localtime.c:42 #7 0x00007fe70886c7b0 in sss_vdebug_fn (file=, line=, function=0x7fe70bff27f0 <__FUNCTION__.9379> "watchdog_handler", level=16, flags=flags@entry=0, format=format@entry=0x7fe70bff2760 "Watchdog timer overflow, killing process!\n", ap=ap@entry=0x7ffcd5d2b130) at src/util/debug.c:248 #8 0x00007fe70886c995 in sss_debug_fn (file=file@entry=0x7fe70bff263b "src/util/util_watchdog.c", line=line@entry=82, function=function@entry=0x7fe70bff27f0 <__FUNCTION__.9379> "watchdog_handler", level=level@entry=16, format=format@entry=0x7fe70bff2760 "Watchdog timer overflow, killing process!\n") at src/util/debug.c:284 #9 0x00007fe70bfdb409 in watchdog_handler (sig=) at src/util/util_watchdog.c:81 #10 #11 0x00007fe707cff664 in _int_malloc (av=av@entry=0x7fe70803c760 , bytes=bytes@entry=151) at malloc.c:3494 #12 0x00007fe707d01fbc in __GI___libc_malloc (bytes=bytes@entry=151) at malloc.c:2893 #13 0x00007fe708450749 in __talloc_with_prefix (prefix_len=0, size=55, context=0x7fe718373210) at ../talloc.c:668 #14 __talloc (size=55, context=0x7fe718373210) at ../talloc.c:708 #15 _talloc_named_const (name=0x7fe70bb7015d "../common/ldb_pack.c:425", size=55, context=0x7fe718373210) at ../talloc.c:865 #16 talloc_named_const (context=, size=size@entry=55, name=name@entry=0x7fe70bb7015d "../common/ldb_pack.c:425") at ../talloc.c:1606 #17 0x00007fe70bb61803 in ldb_unpack_data_only_attr_list (ldb=ldb@entry=0x7fe70e4d52c0, data=data@entry=0x7ffcd5d2b990, message=0x7fe7184aa1e0, list=list@entry=0x0, list_size=list_size@entry=0, nb_elements_in_db=nb_elements_in_db@entry=0x0) at ../common/ldb_pack.c:425 #18 0x00007fe70bb61a7d in ldb_unpack_data (ldb=ldb@entry=0x7fe70e4d52c0, data=data@entry=0x7ffcd5d2b990, message=) at ../common/ldb_pack.c:470 #19 0x00007fe6fdc29b46 in ltdb_parse_data_unpack (key=..., data=..., private_data=0x7ffcd5d2ba70) at ../ldb_tdb/ldb_search.c:249 #20 0x00007fe70a5e0a24 in tdb_parse_data (tdb=tdb@entry=0x7fe70e4eaa10, key=..., offset=15619748, len=414772, parser=parser@entry=0x7fe6fdc29b10 , private_data=private_data@entry=0x7ffcd5d2ba70) at ../common/io.c:637 #21 0x00007fe70a5dc1fc in tdb_parse_record (tdb=0x7fe70e4eaa10, key=..., parser=parser@entry=0x7fe6fdc29b10 , private_data=private_data@entry=0x7ffcd5d2ba70) at ../common/tdb.c:253 #22 0x00007fe6fdc29e7b in ltdb_search_dn1 (module=module@entry=0x7fe70e4eab50, dn=dn@entry=0x7fe7183c4940, msg=msg@entry=0x7fe7184aa1e0) at ../ldb_tdb/ldb_search.c:287 #23 0x00007fe6fdc2acbb in ltdb_dn_list_load (module=module@entry=0x7fe70e4eab50, dn=dn@entry=0x7fe7183c4940, list=list@entry=0x7fe7183c3a30) at ../ldb_tdb/ldb_index.c:181 #24 0x00007fe6fdc2bbbb in ltdb_index_add1 (module=module@entry=0x7fe70e4eab50, dn=dn@entry=0x7fe7183bf3e0 "name=testuser7045@domain.com,cn=users,cn=DOMAIN.COM,cn=sysdb", v_idx=v_idx@entry=0, el=, el=) at ../ldb_tdb/ldb_index.c:1134 #25 0x00007fe6fdc2c62c in ltdb_index_add_el (el=0x7fe7184aa3e0, dn=0x7fe7183bf3e0 "name=testuser7045@domain.com,cn=users,cn=DOMAIN.COM,cn=sysdb", module=0x7fe70e4eab50) at ../ldb_tdb/ldb_index.c:1180 #26 ltdb_index_add_element (module=module@entry=0x7fe70e4eab50, dn=, el=el@entry=0x7fe7184aa3e0) at ../ldb_tdb/ldb_index.c:1290 #27 0x00007fe6fdc290bb in ltdb_modify_internal (module=module@entry=0x7fe70e4eab50, msg=0x7fe7183bf0c0, req=req@entry=0x7fe7183bdc10) at ../ldb_tdb/ldb_tdb.c:903 #28 0x00007fe6fdc2958a in ltdb_modify (ctx=0x7fe7183c2950, ctx=0x7fe7183c2950) at ../ldb_tdb/ldb_tdb.c:998 #29 ltdb_callback (ev=, te=, t=..., private_data=) at ../ldb_tdb/ldb_tdb.c:1380 #30 0x00007fe708664b4f in tevent_common_loop_timer_delay (ev=ev@entry=0x7fe70e4d2890) at ../tevent_timed.c:341 #31 0x00007fe708665b5a in epoll_event_loop_once (ev=0x7fe70e4d2890, location=) at ../tevent_epoll.c:911 #32 0x00007fe708664257 in std_event_loop_once (ev=0x7fe70e4d2890, location=0x7fe70bb72ec5 "../common/ldb.c:631") at ../tevent_standard.c:114 #33 0x00007fe70866040d in _tevent_loop_once (ev=ev@entry=0x7fe70e4d2890, location=location@entry=0x7fe70bb72ec5 "../common/ldb.c:631") at ../tevent.c:533 #34 0x00007fe70bb6bc4f in ldb_wait (handle=0x7fe7183c4530, type=) at ../common/ldb.c:631 #35 0x00007fe70bb6c793 in ldb_autotransaction_request (ldb=0x7fe70e4d52c0, req=0x7fe7183bdc10) at ../common/ldb.c:573 #36 0x00007fe70bb6d263 in ldb_modify (ldb=ldb@entry=0x7fe70e4d52c0, message=) at ../common/ldb.c:1655 #37 0x00007fe70bfa2ab5 in sysdb_set_cache_entry_attr (ldb=0x7fe70e4d52c0, entry_dn=entry_dn@entry=0x7fe7183c4760, attrs=attrs@entry=0x7fe7183bf680, mod_op=mod_op@entry=2) at src/db/sysdb_ops.c:1159 #38 0x00007fe70bfa304d in sysdb_rep_ts_entry_attr (sysdb=0x7fe70e4eadd0, attrs=0x7fe7183bf680, entry_dn=0x7fe7183c4760) at src/db/sysdb_ops.c:1218 #39 sysdb_set_ts_entry_attr (sysdb=sysdb@entry=0x7fe70e4eadd0, entry_dn=entry_dn@entry=0x7fe7183c4760, attrs=attrs@entry=0x7fe7183bb840, mod_op=mod_op@entry=2) at src/db/sysdb_ops.c:1248 #40 0x00007fe70bfa4aa9 in sysdb_set_entry_attr (sysdb=0x7fe70e4eadd0, entry_dn=0x7fe7183c4760, attrs=attrs@entry=0x7fe7183bb840, mod_op=mod_op@entry=2) at src/db/sysdb_ops.c:1199 #41 0x00007fe70bfa4b5f in sysdb_set_user_attr (domain=domain@entry=0x7fe70e4d62f0, name=name@entry=0x7fe7183c01f0 "testuser7045@domain.com", attrs=attrs@entry=0x7fe7183bb840, mod_op=mod_op@entry=2) at src/db/sysdb_ops.c:1285 #42 0x00007fe70bfa58c3 in sysdb_add_user (domain=domain@entry=0x7fe70e4d62f0, name=name@entry=0x7fe7183c01f0 "testuser7045@domain.com", uid=uid@entry=1415408147, gid=, gid@entry=1415400513, gecos=gecos@entry=0x7fe710465d00 "Test User7045", homedir=homedir@entry=0x0, shell=shell@entry=0x0, orig_dn=orig_dn@entry=0x7fe710465940 "CN=Test User7045,OU=Sales,DC=DOMAIN,DC=COM", attrs=attrs@entry=0x7fe7183bb840, cache_timeout=cache_timeout@entry=5400, now=now@entry=1481105315) at src/db/sysdb_ops.c:1928 #43 0x00007fe70bfab271 in sysdb_store_new_user (now=1481105315, cache_timeout=5400, attrs=0x7fe7183bb840, orig_dn=0x7fe710465940 "CN=Test User7045,OU=Sales,DC=DOMAIN,DC=COM", shell=0x0, homedir=0x0, gecos=0x7fe710465d00 "Test User7045", gid=1415400513, uid=1415408147, name=0x7fe7183c01f0 "testuser7045@domain.com", domain=0x7fe70e4d62f0) at src/db/sysdb_ops.c:2549 #44 sysdb_store_user (domain=domain@entry=0x7fe70e4d62f0, name=0x7fe7183c01f0 "testuser7045@domain.com", pwd=pwd@entry=0x0, uid=1415408147, gid=1415400513, gecos=gecos@entry=0x7fe710465d00 "Test User7045", homedir=homedir@entry=0x0, shell=shell@entry=0x0, orig_dn=orig_dn@entry=0x7fe710465940 "CN=Test User7045,OU=Sales,DC=DOMAIN,DC=COM", attrs=attrs@entry=0x7fe7183bb840, remove_attrs=0x7fe7183c08a0, cache_timeout=cache_timeout@entry=5400, now=now@entry=1481105315) at src/db/sysdb_ops.c:2499 #45 0x00007fe6fba0d9f9 in sdap_save_user (memctx=memctx@entry=0x7fe70e544ee0, opts=opts@entry=0x7fe70e518400, dom=dom@entry=0x7fe70e4d62f0, attrs=, _usn_value=_usn_value@entry=0x7ffcd5d2c260, now=now@entry=1481105315) at src/providers/ldap/sdap_async_users.c:509 #46 0x00007fe6fba0df9a in sdap_save_users (memctx=memctx@entry=0x7fe70e544e40, sysdb=0x7fe70e4eadd0, dom=0x7fe70e4d62f0, opts=0x7fe70e518400, users=, num_users=10006, _usn_value=_usn_value@entry=0x7fe70e544e60) at src/providers/ldap/sdap_async_users.c:572 #47 0x00007fe6fba0e460 in sdap_get_users_done (subreq=) at src/providers/ldap/sdap_async_users.c:938 #48 0x00007fe6fba0c9d5 in sdap_search_user_process (subreq=0x0) at src/providers/ldap/sdap_async_users.c:814 #49 0x00007fe6fba07379 in generic_ext_search_handler (subreq=0x0, opts=) at src/providers/ldap/sdap_async.c:1689 #50 0x00007fe6fba0991b in sdap_get_generic_op_finished (op=, reply=, error=, pvt=) at src/providers/ldap/sdap_async.c:1621 #51 0x00007fe6fba083cd in sdap_process_message (ev=, sh=, msg=0x7fe70e5f9ce0) at src/providers/ldap/sdap_async.c:353 #52 sdap_process_result (ev=, pvt=) at src/providers/ldap/sdap_async.c:197 #53 0x00007fe708664b4f in tevent_common_loop_timer_delay (ev=ev@entry=0x7fe70e4cbc30) at ../tevent_timed.c:341 #54 0x00007fe708665b5a in epoll_event_loop_once (ev=0x7fe70e4cbc30, location=) at ../tevent_epoll.c:911 #55 0x00007fe708664257 in std_event_loop_once (ev=0x7fe70e4cbc30, location=0x7fe70bfee8e7 "src/util/server.c:702") at ../tevent_standard.c:114 #56 0x00007fe70866040d in _tevent_loop_once (ev=ev@entry=0x7fe70e4cbc30, location=location@entry=0x7fe70bfee8e7 "src/util/server.c:702") at ../tevent.c:533 #57 0x00007fe7086605ab in tevent_common_loop_wait (ev=0x7fe70e4cbc30, location=0x7fe70bfee8e7 "src/util/server.c:702") at ../tevent.c:637 #58 0x00007fe7086641f7 in std_event_loop_wait (ev=0x7fe70e4cbc30, location=0x7fe70bfee8e7 "src/util/server.c:702") at ../tevent_standard.c:140 #59 0x00007fe70bfd1993 in server_loop (main_ctx=0x7fe70e4cd080) at src/util/server.c:702 #60 0x00007fe70c84cb82 in main (argc=8, argv=) at src/providers/data_provider_be.c:587 Resolves: https://fedorahosted.org/sssd/ticket/3266 Signed-off-by: Fabiano Fidêncio Reviewed-by: Pavel Březina Reviewed-by: Jakub Hrozek Reviewed-by: Simo Sorce (cherry picked from commit e6a5f8c58539fc31fd81fac89cfc85703b4250ea) (cherry picked from commit 0606a71b698c4acf954ba7284e62acbd0aa5e52d) --- src/util/util_watchdog.c | 118 ++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 96 insertions(+), 22 deletions(-) diff --git a/src/util/util_watchdog.c b/src/util/util_watchdog.c index c184fbd759bdbca4a9eae379ff0d87e2d1628470..0df85b71c0b078dc2dfd23a39df82d1a10bad7d6 100644 --- a/src/util/util_watchdog.c +++ b/src/util/util_watchdog.c @@ -23,6 +23,7 @@ #define WATCHDOG_DEF_INTERVAL 10 #define WATCHDOG_MAX_TICKS 3 +#define DEFAULT_BUFFER_SIZE 4096 /* this is intentionally a global variable */ struct watchdog_ctx { @@ -35,32 +36,27 @@ struct watchdog_ctx { struct tevent_context *ev; int input_interval; time_t timestamp; + struct tevent_fd *tfd; + int pipefd[2]; } watchdog_ctx; -static bool watchdog_detect_timeshift(void) +static void watchdog_detect_timeshift(void) { time_t prev_time; time_t cur_time; - errno_t ret; prev_time = watchdog_ctx.timestamp; cur_time = watchdog_ctx.timestamp = time(NULL); if (cur_time < prev_time) { /* Time shift detected. We need to restart watchdog. */ - DEBUG(SSSDBG_IMPORTANT_INFO, "Time shift detected, " - "restarting watchdog!\n"); - teardown_watchdog(); - ret = setup_watchdog(watchdog_ctx.ev, watchdog_ctx.input_interval); - if (ret != EOK) { - DEBUG(SSSDBG_FATAL_FAILURE, "Unable to restart watchdog " - "[%d]: %s\n", ret, sss_strerror(ret)); - orderly_shutdown(1); + if (write(watchdog_ctx.pipefd[1], "1", 1) != 1) { + if (getpid() == getpgrp()) { + kill(-getpgrp(), SIGTERM); + } else { + _exit(1); + } } - - return true; } - - return false; } /* the watchdog is purposefully *not* handled by the tevent @@ -70,17 +66,16 @@ static bool watchdog_detect_timeshift(void) * signals either */ static void watchdog_handler(int sig) { - /* Do not count ticks if time shift was detected - * since watchdog was restarted. */ - if (watchdog_detect_timeshift()) { - return; - } + + watchdog_detect_timeshift(); /* if a pre-defined number of ticks passed by kills itself */ if (__sync_add_and_fetch(&watchdog_ctx.ticks, 1) > WATCHDOG_MAX_TICKS) { - DEBUG(SSSDBG_FATAL_FAILURE, - "Watchdog timer overflow, killing process!\n"); - orderly_shutdown(1); + if (getpid() == getpgrp()) { + kill(-getpgrp(), SIGTERM); + } else { + _exit(1); + } } } @@ -109,10 +104,67 @@ static void watchdog_event_handler(struct tevent_context *ev, } } +static errno_t watchdog_fd_recv_data(int fd) +{ + ssize_t len; + char buffer[DEFAULT_BUFFER_SIZE]; + errno_t ret; + + errno = 0; + len = read(fd, buffer, DEFAULT_BUFFER_SIZE); + if (len == -1) { + if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR) { + return EAGAIN; + } else { + ret = errno; + DEBUG(SSSDBG_CRIT_FAILURE, + "write failed [%d]: %s\n", ret, strerror(ret)); + return ret; + } + } + + return EOK; +} + +static void watchdog_fd_read_handler(struct tevent_context *ev, + struct tevent_fd *fde, + uint16_t flags, + void *data) +{ + errno_t ret; + + ret = watchdog_fd_recv_data(watchdog_ctx.pipefd[0]); + switch(ret) { + case EAGAIN: + DEBUG(SSSDBG_TRACE_ALL, + "Interrupted before any data could be read, retry later.\n"); + return; + case EOK: + /* all fine */ + break; + default: + DEBUG(SSSDBG_FATAL_FAILURE, + "Failed to receive data [%d]: %s. " + "orderly_shutdown() will be called.\n", ret, strerror(ret)); + orderly_shutdown(1); + } + + DEBUG(SSSDBG_IMPORTANT_INFO, "Time shift detected, " + "restarting watchdog!\n"); + teardown_watchdog(); + ret = setup_watchdog(watchdog_ctx.ev, watchdog_ctx.input_interval); + if (ret != EOK) { + DEBUG(SSSDBG_FATAL_FAILURE, "Unable to restart watchdog " + "[%d]: %s\n", ret, sss_strerror(ret)); + orderly_shutdown(1); + } +} + int setup_watchdog(struct tevent_context *ev, int interval) { struct sigevent sev; struct itimerspec its; + struct tevent_fd *tfd; int signum = SIGRTMIN; int ret; @@ -142,6 +194,21 @@ int setup_watchdog(struct tevent_context *ev, int interval) watchdog_ctx.input_interval = interval; watchdog_ctx.timestamp = time(NULL); + ret = pipe(watchdog_ctx.pipefd); + if (ret == -1) { + ret = errno; + DEBUG(SSSDBG_FATAL_FAILURE, + "pipe failed [%d] [%s].\n", ret, strerror(ret)); + return ret; + } + + sss_fd_nonblocking(watchdog_ctx.pipefd[0]); + sss_fd_nonblocking(watchdog_ctx.pipefd[1]); + + tfd = tevent_add_fd(ev, (TALLOC_CTX *)ev, watchdog_ctx.pipefd[0], + TEVENT_FD_READ, watchdog_fd_read_handler, NULL); + watchdog_ctx.tfd = tfd; + /* Start the timer */ /* we give 1 second head start to the watchdog event */ its.it_value.tv_sec = interval + 1; @@ -178,6 +245,13 @@ void teardown_watchdog(void) ret, strerror(ret)); } + /* Free the tevent_fd */ + talloc_zfree(watchdog_ctx.tfd); + + /* Close the pipefds */ + PIPE_FD_CLOSE(watchdog_ctx.pipefd[0]); + PIPE_FD_CLOSE(watchdog_ctx.pipefd[1]); + /* and kill the watchdog event */ talloc_free(watchdog_ctx.te); } -- 2.9.3