Commit 9fa9df98 authored by Tomas Krizek's avatar Tomas Krizek

Merge branch 'watchdog' into 'master'

watchdog rewrite

See merge request knot/knot-resolver!878
parents cc8c78aa 4b37dd1b
Pipeline #53897 failed with stages
in 16 minutes and 47 seconds
......@@ -9,11 +9,6 @@ lua_config = configuration_data()
lua_config.set('keyfile_default', keyfile_default)
lua_config.set('etc_dir', etc_dir)
lua_config.set('unmanaged', managed_ta ? 'false' : 'true')
if libsystemd.found() and libsystemd.version().version_compare('>=183')
lua_config.set('sd_watchdog', 'modules.load(\'sd_watchdog\')')
else
lua_config.set('sd_watchdog', '')
endif
trust_anchors = configure_file(
input: 'trust_anchors.lua.in',
......
......@@ -415,7 +415,6 @@ setfenv(0, _G)
-- Load default modules
trust_anchors = require('trust_anchors')
@sd_watchdog@
modules.load('ta_update')
modules.load('ta_signal_query')
modules.load('policy')
......@@ -425,6 +424,7 @@ modules.load('detect_time_jump')
modules.load('ta_sentinel')
modules.load('edns_keepalive')
modules.load('refuse_nord')
modules.load('watchdog')
-- Load keyfile_default
trust_anchors.add_file('@keyfile_default@', @unmanaged@)
......
......@@ -28,6 +28,7 @@ usr/lib/knot-resolver/kres_modules/ta_sentinel.lua
usr/lib/knot-resolver/kres_modules/ta_signal_query.lua
usr/lib/knot-resolver/kres_modules/ta_update.lua
usr/lib/knot-resolver/kres_modules/view.lua
usr/lib/knot-resolver/kres_modules/watchdog.lua
usr/lib/knot-resolver/kres_modules/workarounds.lua
usr/sbin/kresc
usr/sbin/kresd
......
......@@ -284,6 +284,7 @@ systemctl daemon-reload
%{_libdir}/knot-resolver/kres_modules/ta_signal_query.lua
%{_libdir}/knot-resolver/kres_modules/ta_update.lua
%{_libdir}/knot-resolver/kres_modules/view.lua
%{_libdir}/knot-resolver/kres_modules/watchdog.lua
%{_libdir}/knot-resolver/kres_modules/workarounds.lua
%{_mandir}/man8/kresd.8.gz
......
......@@ -37,3 +37,4 @@ Modules
.. include:: ../modules/edns_keepalive/README.rst
.. include:: ../modules/experimental_dot_auth/README.rst
.. include:: ../modules/refuse_nord/README.rst
.. include:: ../modules/watchdog/README.rst
......@@ -15,6 +15,7 @@ lua_mod_src = [ # add lua modules without separate meson.build
files('ta_sentinel/ta_sentinel.lua'),
files('ta_signal_query/ta_signal_query.lua'),
files('ta_update/ta_update.lua'),
files('watchdog/watchdog.lua'),
files('workarounds/workarounds.lua'),
]
......@@ -52,9 +53,6 @@ subdir('policy')
subdir('refuse_nord')
subdir('stats')
subdir('view')
if libsystemd.found() and libsystemd.version().version_compare('>=183')
subdir('sd_watchdog')
endif
# install lua modules
foreach mod : lua_mod_src
......
.. _mod-sd_watchdog:
Systemd watchdog
----------------
This module is loaded by default when compiled with systemd. It enables the use
of the systemd watchdog to restart the process in case it stops responding. The
upstream systemd unit files are configured to use this feature, which is turned
on with the ``WatchdogSec=`` directive in the service file.
# C module: sd_watchdog
# Builds sd_watchdog.c into a shared module and installs it into modules_dir.
sd_watchdog_src = files([
'sd_watchdog.c',
])
# Append this module's sources to c_src_lint, which is defined outside this
# file (presumably the list of C sources handed to the linters - confirm).
c_src_lint += sd_watchdog_src
# name_prefix is set to the empty string, so the build output is named after
# the module itself rather than with the platform's default library prefix.
sd_watchdog_mod = shared_module(
'sd_watchdog',
sd_watchdog_src,
include_directories: mod_inc_dir,
name_prefix: '',
install: true,
install_dir: modules_dir,
)
/* Copyright (C) Knot Resolver contributors. Licensed under GNU GPLv3 or
* (at your option) any later version. See COPYING for text of the license.
*
* sd_watchdog module implements support for systemd watchdog supervision */
#include <systemd/sd-daemon.h>
#include <uv.h>
#include "lib/module.h"
/* Per-module state of the systemd watchdog keepalive; allocated in
 * sd_watchdog_init() and stored in module->data. */
struct watchdog_config {
/* true when sd_watchdog_enabled() returned a positive value */
bool enabled;
/* watchdog timeout as reported by sd_watchdog_enabled(), in microseconds */
uint64_t timeout_usec;
/* libuv timer that periodically runs keepalive_ping() */
uv_timer_t timer;
};
/* libuv timer callback: report to systemd that the process is still alive. */
static void keepalive_ping(uv_timer_t *timer)
{
	/* The timer handle carries nothing this callback needs. */
	(void)timer;

	/* NOTE: in the future, some sanity checks could be used here.
	 * The result is discarded on purpose: it is generally recommended
	 * to ignore the return value of this call. */
	(void)sd_notify(0, "WATCHDOG=1");
}
KR_EXPORT
int sd_watchdog_init(struct kr_module *module)
{
struct watchdog_config *conf = calloc(1, sizeof(*conf));
if (!conf) {
return kr_error(ENOMEM);
}
module->data = conf;
/* Check if watchdog is enabled */
int ret = sd_watchdog_enabled(1, &conf->timeout_usec);
if (ret < 0) {
kr_log_error("[sd_watchdog] error: %s\n", strerror(abs(ret)));
return kr_error(ret);
}
conf->enabled = ret > 0;
if (!conf->enabled) {
kr_log_verbose("[sd_watchdog] disabled (not required)\n");
return kr_ok();
}
uint64_t delay_ms = (conf->timeout_usec / 1000) / 2;
if (delay_ms == 0) {
kr_log_error("[sd_watchdog] error: WatchdogSec= must be at least 2ms!\n");
return kr_error(ENOTSUP);
}
uv_loop_t *loop = uv_default_loop();
uv_timer_init(loop, &conf->timer);
ret = uv_timer_start(&conf->timer, keepalive_ping, delay_ms, delay_ms);
if (ret != 0) {
kr_log_error("[sd_watchdog] error: failed to start uv_timer: %s\n",
uv_strerror(ret));
conf->timer.loop = NULL;
return kr_error(ret);
}
kr_log_verbose("[sd_watchdog] enabled (repeat: %"PRIu64" ms, timeout: %"PRIu64" ms)\n",
delay_ms, conf->timeout_usec / 1000);
return kr_ok();
}
/* Module destructor: stop the keepalive timer (if it was started) and
 * release the module state. Always returns kr_ok(). */
KR_EXPORT
int sd_watchdog_deinit(struct kr_module *module)
{
	struct watchdog_config *cfg = module->data;
	const bool timer_running =
		(cfg != NULL) && (cfg->timer.loop == uv_default_loop());

	if (!timer_running) {
		/* watchdog might be just disabled */
		free(cfg);
		return kr_ok();
	}

	/* normal state */
	const int rc = uv_timer_stop(&cfg->timer);
	if (rc != 0) {
		kr_log_error("[sd_watchdog] error: failed to stop uv_timer: %s\n",
				uv_strerror(rc));
	}

	/* We have a problem: UV timer can't be closed immediately,
	 * but as soon as we return from _deinit(), we get dlclose()
	 * so no function from this module may be usable anymore.
	 * Hence the close callback is kr_uv_free_cb, which lives outside
	 * this module; the state is handed to it through the handle's data
	 * pointer (presumably it frees handle->data - confirm). */
	cfg->timer.data = cfg;
	uv_close((uv_handle_t *)&cfg->timer, kr_uv_free_cb);
	return kr_ok();
}
KR_MODULE_EXPORT(sd_watchdog)
.. _mod-watchdog:
Watchdog
--------
This module cooperates with Systemd watchdog to restart the process in case
the internal event loop gets stuck. The upstream Systemd unit files are configured
to use this feature, which is turned on with the ``WatchdogSec=`` directive
in the service file.
As an optional feature, this module can also send an internal DNS query to check that the resolver
answers correctly. To use this feature, you must configure the DNS name and type to query for:
.. code-block:: lua
watchdog.config({ qname = 'nic.cz.', qtype = kres.type.A })
Every query sent by the watchdog must result in an answer with
RCODE = NOERROR or NXDOMAIN. Any other result will terminate the resolver
(with exit code 69) to allow the supervisor process to do cleanup and restart
the resolver.
It is recommended to use a name with a very short TTL to make sure the watchdog
is testing all parts of the resolver and not only its cache. Obviously, this check
makes sense only when used with very reliable domains; otherwise a failure
on the authoritative side will shut down the resolver!
``WatchdogSec=`` specifies the deadline after which the supervisor kills the process
unless it has received a keepalive notification. Watchdog queries are executed every ``WatchdogSec / 2`` seconds.
This implies that **half** of the ``WatchdogSec=`` interval must be long enough for
a normal DNS query to succeed, so do not forget to add two or three seconds
for random network timeouts etc.
The module is loaded by default. If you'd like to disable it you can unload it:
.. code-block:: lua
modules.unload('watchdog')
Beware that unloading the module without disabling the watchdog feature in the supervisor
will lead to an infinite restart loop.
local ffi = require('ffi')
-- Declare the two libsystemd functions this module calls through ffi.C.
ffi.cdef([[
int sd_watchdog_enabled(int unset_environment, uint64_t *usec);
int sd_notify(int unset_environment, const char *state);
]])
-- Module table; returned at the end of the file.
local watchdog = {}
-- Internal state (interval, qname, qtype, event and the ok/fail callbacks).
-- It is also what watchdog.config() returns to the caller.
local private = {}
--- Send the watchdog keepalive notification to systemd.
-- Installed as private.ok_callback by watchdog.init().
local function sd_signal_ok()
	local state = 'WATCHDOG=1'
	ffi.C.sd_notify(0, state)
end
--- Failure handler: log the reason and terminate the process so that
-- the supervisor can restart it.
function private.fail_callback()
	local exit_code = 69 -- unclean exit code = EX_UNAVAILABLE
	log('[watchdog] TERMINATING resolver, supervisor is expected to restart it')
	os.exit(exit_code)
end
-- logging
--- Build the request initializer that captures the query trace log.
-- @param logbuf table; every traced message is appended to it as one string
-- @return function(req) that installs an FFI trace callback as req.trace_log
local function add_tracer(logbuf)
	return function (req)
		local tracer = function (qry, src, msg)
			local request_id = (qry and qry.request and qry.request.uid) or 0
			local query_id = (qry and qry.uid) or 0
			logbuf[#logbuf + 1] = string.format("[%05u.%02u][%s] %s",
				request_id, query_id, ffi.string(src), ffi.string(msg))
			if not verbose() then
				return
			end
			-- without this the message would be missing in the verbose log
			ffi.C.kr_log_qverbose_impl(qry, src, msg)
		end
		-- the callback object is released again in check_answer()
		req.trace_log = ffi.cast('trace_log_f', tracer)
	end
end
--- Build the completion callback for one watchdog query.
-- @param logbuf table of trace lines collected by the tracer from add_tracer()
-- @return function(pkt, req): calls private.ok_callback() when the answer
--   RCODE is NOERROR or NXDOMAIN; otherwise logs the collected trace and
--   the answer, then calls private.fail_callback()
local function check_answer(logbuf)
	return function (pkt, req)
		-- release the FFI callback object stored in req.trace_log
		req.trace_log:free()
		local rcode = pkt:rcode()
		if rcode == kres.rcode.NOERROR or rcode == kres.rcode.NXDOMAIN then
			private.ok_callback()
			return
		end
		log('[watchdog] watchdog query returned unexpected answer! query verbose log:')
		-- log() is used with a format string everywhere in this file, so the
		-- collected trace must be passed as an argument: a literal '%' in it
		-- would otherwise be interpreted as a format directive and could
		-- raise an error before fail_callback() is reached.
		log('%s', table.concat(logbuf, ''))
		log('[watchdog] problematic answer:\n%s', pkt)
		-- failure! quit immediately to allow process supervisor to restart us
		private.fail_callback()
	end
end
-- stored in private so that timer() looks the factory up at call time
private.check_answer_callback = check_answer
--- Periodic watchdog tick (scheduled by watchdog.config()).
-- Without a configured query it reports success straight away; otherwise it
-- starts a traced DNS query whose outcome is judged by the callback from
-- private.check_answer_callback.
local function timer()
	local qname, qtype = private.qname, private.qtype
	if not (qname and qtype) then
		private.ok_callback()
		return
	end

	-- fire watchdog query
	if verbose() then
		log('[watchdog] starting watchdog query %s %s', qname, qtype)
	end
	local logbuf = {}
	resolve(qname,
		qtype,
		kres.class.IN,
		{'TRACE'},
		private.check_answer_callback(logbuf),
		add_tracer(logbuf))
end
--- Read or change the watchdog configuration.
-- @param cfg nil to only read the current state, or a table with optional
--   fields: interval (ms, defaults to the previous value or 10000),
--   qname (nil disables DNS queries) and qtype (defaults to kres.type.A)
-- @return the internal state table
-- Raises an error when the resulting interval is not a number >= 1.
function watchdog.config(cfg)
	if not cfg then
		-- read only
		return private
	end

	local requested = cfg.interval or private.interval or 10000
	local interval_ms = tonumber(requested)
	if not interval_ms or interval_ms < 1 then
		error('[watchdog] interval must be >= 1 ms')
	end
	private.interval = interval_ms

	-- qname = nil will disable DNS queries
	private.qname = cfg.qname
	private.qtype = cfg.qtype or kres.type.A

	-- restart timers: cancel the old recurrent event, then schedule a new one
	watchdog.deinit()
	private.event = event.recurrent(private.interval, timer)
	return private
end
--- Module constructor: automatically enable the watchdog if it is
-- configured in systemd.
-- Does nothing (apart from an optional verbose message) when the systemd
-- library symbols cannot be resolved or when systemd reports the watchdog
-- as disabled. Otherwise schedules keepalives at half of the systemd timeout.
-- Raises an error when the module is already running, when
-- sd_watchdog_enabled() fails, or (via watchdog.config) when the derived
-- interval is below 1 ms.
function watchdog.init()
	if private.event then
		error('[watchdog] module is already loaded')
	end
	local timeoutptr = ffi.new('uint64_t[1]')
	-- pcall guards the ffi.C symbol lookup, which fails without libsystemd
	local systemd_present, ret = pcall(function() return ffi.C.sd_watchdog_enabled(0, timeoutptr) end)
	if not systemd_present then
		if verbose() then
			log('[watchdog] systemd library not detected')
		end
		return
	end
	private.ok_callback = sd_signal_ok
	if ret < 0 then
		-- error() takes (message, level) and performs no formatting, so the
		-- message has to be built with string.format first
		error(string.format('[watchdog] %s',
			ffi.string(ffi.C.knot_strerror(math.abs(ret)))))
	elseif ret == 0 then
		if verbose() then
			log('[watchdog] disabled in systemd (WatchdogSec= not specified)')
		end
		return
	end
	local timeout = tonumber(timeoutptr[0]) / 1000 -- convert to ms
	local interval = timeout / 2 -- halve interval to make sure we are never late
	if interval < 1 then
		-- only logged here; watchdog.config() below rejects the interval
		log('[watchdog] error: WatchdogSec= must be at least 2ms! (got %d usec)',
			tonumber(timeoutptr[0]))
	end
	watchdog.config({ interval = interval })
	if verbose() then
		log('[watchdog] systemd watchdog enabled (check interval: %s ms, timeout: %s ms)',
			private.interval, timeout)
	end
end
--- Module destructor: cancel the recurrent watchdog event, if one is scheduled.
-- Also used by watchdog.config() before it schedules a new event.
function watchdog.deinit()
	local scheduled = private.event
	if not scheduled then
		return
	end
	event.cancel(scheduled)
	private.event = nil
end
return watchdog
......@@ -12,6 +12,7 @@ ExecStart=@sbin_dir@/kresd --config=@etc_dir@/kresd.conf
User=@user@
TimeoutStopSec=10s
WatchdogSec=10s
RestartForceExitStatus=69
Restart=on-abnormal
LimitNOFILE=1048576
Sockets=kresd.socket
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment