Commit efc08a64 authored by Vladimír Čunát's avatar Vladimír Čunát

cache: improve out-of-disk condition

When a suspicious SIGBUS happens, print a helpful error and try to remove
the cache, so that the service might work again if auto-restarted.
Theoretically we could longjmp() out of the SIGBUS handler,
but that would be rather messy, so let the process die.
parent 3c064390
Knot Resolver 3.x.y (2018-mm-dd)
================================
Improvements
------------
- cache: handle out-of-space SIGBUS slightly better (#197)
Knot Resolver 3.0.0 (2018-08-20)
================================
......
......@@ -15,21 +15,23 @@
*/
#include <arpa/inet.h>
#include <assert.h>
#include <getopt.h>
#include <libgen.h>
#include <signal.h>
#include <stdlib.h>
#include <string.h>
#include <getopt.h>
#include <libgen.h>
#include <unistd.h>
#include <uv.h>
#include <assert.h>
#include <contrib/cleanup.h>
#include <contrib/ucw/mempool.h>
#include <contrib/ccan/asprintf/asprintf.h>
#include <libknot/error.h>
#ifdef HAS_SYSTEMD
#include <systemd/sd-daemon.h>
#endif
#include <libknot/error.h>
#include <contrib/cleanup.h>
#include <contrib/ucw/mempool.h>
#include <contrib/ccan/asprintf/asprintf.h>
#include "lib/defines.h"
#include "lib/resolve.h"
#include "lib/dnssec.h"
......@@ -300,6 +302,38 @@ static void signal_handler(uv_signal_t *handle, int signum)
uv_signal_stop(handle);
}
/**
 * SIGBUS -> attempt to remove the overflowing cache file and exit.
 *
 * Prints a diagnostic to stderr, and if the signal looks like the typical
 * out-of-disk case (si_code == BUS_ADRERR) and a cache path is registered in
 * kr_cache_emergency_file_to_remove, unlinks that file.  The handler never
 * returns; it always terminates the process via _exit().
 *
 * @param sig      signal number; used to compute the exit status
 * @param siginfo  signal details; only si_code is inspected
 * @param ptr      unused (required by the SA_SIGINFO handler signature)
 */
static void sigbus_handler(int sig, siginfo_t *siginfo, void *ptr)
{
	(void)ptr;

	/* We can't safely assume that printf-like functions work, but write() is OK.
	 * See POSIX for the safe functions, e.g. 2017 version just above this link:
	 * http://pubs.opengroup.org/onlinepubs/9699919799/functions/V2_chap02.html#tag_15_04_04
	 *
	 * The argument must be a real char array (not a pointer), so that sizeof
	 * gives its length; "- 1" avoids emitting the terminating NUL byte.
	 */
	#define WRITE_ERR(err_charray) \
		(void)write(STDERR_FILENO, (err_charray), sizeof(err_charray) - 1)

	static const char msg_typical[] =
		"\nSIGBUS received; this is most likely due to filling up the filesystem where cache resides.\n";
	static const char msg_unknown[] = "\nSIGBUS received, cause unknown.\n";
	static const char msg_deleted[] = "Cache file deleted.\n";
	static const char msg_del_fail[] = "Cache file deletion failed.\n";
	static const char msg_final[] = "kresd can not recover reliably by itself, exiting.\n";

	if (siginfo->si_code != BUS_ADRERR) {
		/* Not the pattern we expect from an exhausted filesystem:
		 * leave the cache file alone. */
		WRITE_ERR(msg_unknown);
		goto end;
	}

	WRITE_ERR(msg_typical);
	if (!kr_cache_emergency_file_to_remove) {
		/* No cache path registered, so there is nothing to delete. */
		goto end;
	}
	if (unlink(kr_cache_emergency_file_to_remove)) {
		WRITE_ERR(msg_del_fail);
	} else {
		WRITE_ERR(msg_deleted);
	}

end:
	WRITE_ERR(msg_final);
	/* NOTE(review): shells conventionally report signal deaths as 128 + sig;
	 * the existing 128 - sig is kept to preserve the current exit status. */
	_exit(128 - sig); /*< regular return from OS-raised SIGBUS can't work anyway */
	#undef WRITE_ERR
}
/** Split away port from the address. */
static const char *set_addr(char *addr, int *port)
{
......@@ -700,28 +734,36 @@ int main(int argc, char **argv)
goto cleanup;
}
/* Workaround for https://github.com/libuv/libuv/issues/45
* (Write after ECONNRESET crash.) */
if (ret == 0 && signal(SIGPIPE, SIG_IGN) == SIG_ERR) {
kr_log_error("[system] can't block SIGPIPE signal: %s\n",
strerror(errno));
ret = EXIT_FAILURE;
}
/* Catch some signals. */
if (ret != 0) {
loop = uv_default_loop();
uv_signal_t sigint, sigterm;
if (true) ret = uv_signal_init(loop, &sigint);
if (!ret) ret = uv_signal_init(loop, &sigterm);
if (!ret) ret = uv_signal_start(&sigint, signal_handler, SIGINT);
if (!ret) ret = uv_signal_start(&sigterm, signal_handler, SIGTERM);
/* Block SIGPIPE; see https://github.com/libuv/libuv/issues/45 */
if (!ret && signal(SIGPIPE, SIG_IGN) == SIG_ERR) ret = errno;
if (!ret) {
/* Catching SIGBUS via uv_signal_* can't work; see:
* https://github.com/libuv/libuv/pull/1987 */
struct sigaction sa;
memset(&sa, 0, sizeof(sa));
sa.sa_sigaction = sigbus_handler;
sa.sa_flags = SA_SIGINFO;
if (sigaction(SIGBUS, &sa, NULL)) {
ret = errno;
}
}
if (ret) {
kr_log_error("[system] failed to set up signal handlers: %s\n",
strerror(abs(errno)));
ret = EXIT_FAILURE;
goto cleanup;
}
engine_set_moduledir(&engine, args.moduledir);
/* Block signals. */
loop = uv_default_loop();
uv_signal_t sigint, sigterm;
uv_signal_init(loop, &sigint);
uv_signal_init(loop, &sigterm);
uv_signal_start(&sigint, signal_handler, SIGINT);
uv_signal_start(&sigterm, signal_handler, SIGTERM);
/* Start the scripting engine */
engine_set_moduledir(&engine, args.moduledir);
worker->loop = loop;
loop->data = worker;
......
......@@ -130,10 +130,20 @@ int kr_cache_open(struct kr_cache *cache, const struct kr_cdb_api *api, struct k
cache->ttl_max = KR_CACHE_DEFAULT_TTL_MAX;
/* Check cache ABI version */
kr_cache_make_checkpoint(cache);
(void) assert_right_version(cache);
(void)assert_right_version(cache);
char *fpath;
ret = asprintf(&fpath, "%s/data.mdb", opts->path);
if (ret > 0) {
kr_cache_emergency_file_to_remove = fpath;
} else {
assert(false); /* non-critical, but still */
}
return 0;
}
/* Heap-allocated path of the cache data file ("<path>/data.mdb"), or NULL when
 * none is registered.  Set by kr_cache_open(), freed and reset to NULL by
 * kr_cache_close(). */
const char *kr_cache_emergency_file_to_remove = NULL;
/* Non-zero when the cache pointer itself, its api and its db are all non-NULL. */
#define cache_isvalid(cache) ((cache) && (cache)->api && (cache)->db)
......@@ -143,6 +153,8 @@ void kr_cache_close(struct kr_cache *cache)
cache_op(cache, close);
cache->db = NULL;
}
free(/*const-cast*/(char*)kr_cache_emergency_file_to_remove);
kr_cache_emergency_file_to_remove = NULL;
}
int kr_cache_sync(struct kr_cache *cache)
......
......@@ -65,6 +65,12 @@ struct kr_cache
KR_EXPORT
int kr_cache_open(struct kr_cache *cache, const struct kr_cdb_api *api, struct kr_cdb_opts *opts, knot_mm_t *mm);
/**
 * Path to cache file to remove on critical out-of-space error. (do NOT modify it)
 *
 * The pointer is NULL when no path is registered.  kr_cache_open() stores
 * a freshly allocated "<opts->path>/data.mdb" string here, and kr_cache_close()
 * frees it and resets the pointer to NULL.
 */
KR_EXPORT extern
const char *kr_cache_emergency_file_to_remove;
/**
* Close persistent cache.
* @note This doesn't clear the data, just closes the connection to the database.
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment