/*  Copyright (C) 2014-2017 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */

#include <uv.h>
#include <lua.h>
#include <libknot/packet/pkt.h>
#include <libknot/descriptor.h>
#include <contrib/ucw/lib.h>
#include <contrib/ucw/mempool.h>
#include <contrib/wire.h>
#if defined(__GLIBC__) && defined(_GNU_SOURCE)
#include <malloc.h>
#endif
#include <assert.h>
#include <sys/types.h>
#include <unistd.h>
#include <gnutls/gnutls.h>
#include "lib/utils.h"
#include "lib/layer.h"
#include "daemon/worker.h"
#include "daemon/bindings.h"
#include "daemon/engine.h"
#include "daemon/io.h"
#include "daemon/tls.h"
#include "daemon/zimport.h"
#include "daemon/session.h"

#define VERBOSE_MSG(qry, fmt...) QRVERBOSE(qry, "wrkr", fmt)

/** Client request state. */
struct request_ctx
{
	struct kr_request req;
	struct {
		union inaddr addr;
		union inaddr dst_addr;
		/* uv_handle_t *handle; */

		/** NULL if the request didn't come over the network. */
		struct session *session;
	} source;
	struct worker_ctx *worker;
	qr_tasklist_t tasks;
};

/** Query resolution task. */
struct qr_task
{
	struct request_ctx *ctx;
	knot_pkt_t *pktbuf;
	qr_tasklist_t waiting;
	uv_handle_t *pending[MAX_PENDING];
	uint16_t pending_count;
	uint16_t addrlist_count;
	uint16_t addrlist_turn;
	uint16_t timeouts;
	uint16_t iter_count;
	uint16_t bytes_remaining;
	struct sockaddr *addrlist;
	uint32_t refs;
	bool finished : 1;
	bool leading  : 1;
};


/* Convenience macros */
#define qr_task_ref(task) \
	do { ++(task)->refs; } while(0)
#define qr_task_unref(task) \
	do { if (task && --(task)->refs == 0) { qr_task_free(task); } } while (0)
#define qr_valid_handle(task, checked) \
	(!uv_is_closing((checked)) || (task)->ctx->source.session->handle == (checked))

/** @internal Get the key for a TCP session.
 *  @note kr_straddr() returns a pointer to a static string.
 */
#define tcpsess_key(addr) kr_straddr(addr)

/* Forward decls */
static void qr_task_free(struct qr_task *task);
static int qr_task_step(struct qr_task *task,
			const struct sockaddr *packet_source,
			knot_pkt_t *packet);
static int qr_task_send(struct qr_task *task, uv_handle_t *handle,
			struct sockaddr *addr, knot_pkt_t *pkt);
static int qr_task_finalize(struct qr_task *task, int state);
static void qr_task_complete(struct qr_task *task);
static struct session* worker_find_tcp_connected(struct worker_ctx *worker,
						 const struct sockaddr *addr);
static int worker_add_tcp_waiting(struct worker_ctx *worker,
				  const struct sockaddr *addr,
				  struct session *session);
static int worker_del_tcp_waiting(struct worker_ctx *worker,
				  const struct sockaddr *addr);
static struct session* worker_find_tcp_waiting(struct worker_ctx *worker,
					       const struct sockaddr *addr);
static void on_session_idle_timeout(uv_timer_t *timer);
static void on_tcp_connect_timeout(uv_timer_t *timer);
static void on_tcp_watchdog_timeout(uv_timer_t *timer);

/** @internal Get singleton worker. */
static inline struct worker_ctx *get_worker(void)
{
	return uv_default_loop()->data;
}

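/** @internal Borrow an I/O handle structure from the worker's freelist, or allocate a fresh one. */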
static inline void *iohandle_borrow(struct worker_ctx *worker)
{
	void *h = NULL;

	const size_t size = sizeof(uv_handles_t);
	if (worker->pool_iohandles.len > 0) {
		h = array_tail(worker->pool_iohandles);
		array_pop(worker->pool_iohandles);
		kr_asan_unpoison(h, size);
	} else {
		h = malloc(size);
	}

	return h;
}

static inline void iohandle_release(struct worker_ctx *worker, void *h)
{
	assert(h);

	if (worker->pool_iohandles.len < MP_FREELIST_SIZE) {
		array_push(worker->pool_iohandles, h);
		kr_asan_poison(h, sizeof(uv_handles_t));
	} else {
		free(h);
	}
}

void *worker_iohandle_borrow(struct worker_ctx *worker)
{
	return iohandle_borrow(worker);
}

void worker_iohandle_release(struct worker_ctx *worker, void *h)
{
	iohandle_release(worker, h);
}

static inline void *iorequest_borrow(struct worker_ctx *worker)
{
	void *r = NULL;

	const size_t size = sizeof(uv_reqs_t);
	if (worker->pool_ioreqs.len > 0) {
		r = array_tail(worker->pool_ioreqs);
		array_pop(worker->pool_ioreqs);
		kr_asan_unpoison(r, size);
	} else {
		r = malloc(size);
	}

	return r;
}

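/** @internal Return an I/O request structure to the worker's freelist, or free it when the freelist is full. */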
static inline void iorequest_release(struct worker_ctx *worker, void *r)
{
	assert(r);

	if (worker->pool_ioreqs.len < MP_FREELIST_SIZE) {
		array_push(worker->pool_ioreqs, r);
		kr_asan_poison(r, sizeof(uv_reqs_t));
	} else {
		free(r);
	}
}


/*! @internal Create a UDP/TCP handle for an outgoing AF_INET* connection.
 *  socktype is SOCK_* */
static uv_handle_t *ioreq_spawn(struct qr_task *task, int socktype, sa_family_t family)
{
	bool precond = (socktype == SOCK_DGRAM || socktype == SOCK_STREAM)
			&& (family == AF_INET  || family == AF_INET6);
	if (!precond) {
		/* assert(false); see #245 */
		kr_log_verbose("[work] ioreq_spawn: pre-condition failed\n");
		return NULL;
	}

	if (task->pending_count >= MAX_PENDING) {
		return NULL;
	}
	/* Create connection for iterative query */
	struct worker_ctx *worker = task->ctx->worker;
	void *h = iohandle_borrow(worker);
	uv_handle_t *handle = (uv_handle_t *)h;
	if (!handle) {
		return NULL;
	}
	int ret = io_create(worker->loop, handle, socktype, family);
	if (ret) {
		if (ret == UV_EMFILE) {
			worker->too_many_open = true;
			worker->rconcurrent_highwatermark = worker->stats.rconcurrent;
		}
		iohandle_release(worker, h);
		return NULL;
	}

	/* Bind to outgoing address, according to IP v4/v6. */
	union inaddr *addr;
	if (family == AF_INET) {
		addr = (union inaddr *)&worker->out_addr4;
	} else {
		addr = (union inaddr *)&worker->out_addr6;
	}
	if (addr->ip.sa_family != AF_UNSPEC) {
		assert(addr->ip.sa_family == family);
		if (socktype == SOCK_DGRAM) {
			uv_udp_t *udp = (uv_udp_t *)handle;
			ret = uv_udp_bind(udp, &addr->ip, 0);
		} else if (socktype == SOCK_STREAM){
			uv_tcp_t *tcp = (uv_tcp_t *)handle;
			ret = uv_tcp_bind(tcp, &addr->ip, 0);
		}
	}

	/* Set current handle as a subrequest type. */
	struct session *session = handle->data;
	if (ret == 0) {
		session_set_outgoing(session, true);
		ret = session_tasklist_add(session, task);
	}
	if (ret < 0) {
		io_deinit(handle);
		iohandle_release(worker, h);
		return NULL;
	}
	/* Connect or issue query datagram */
	task->pending[task->pending_count] = handle;
	task->pending_count += 1;
	return handle;
}

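/** @internal Stop an outgoing UDP subrequest: stop its timer, detach the task and close the session. */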
static void ioreq_kill_udp(uv_handle_t *req, struct qr_task *task)
{
	assert(req);
	struct session *s = req->data;
	assert(session_is_outgoing(s));
	if (session_is_closing(s)) {
		return;
	}
	uv_timer_t *t = session_get_timer(s);
	uv_timer_stop(t);
	session_tasklist_del(s, task);
	assert(session_tasklist_is_empty(s));
	session_close(s);
}

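/** @internal Detach the task from an outgoing TCP session; keep an idle connected
 *  session alive for reuse until a timeout, otherwise close it. */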
static void ioreq_kill_tcp(uv_handle_t *req, struct qr_task *task)
{
	assert(req);
	struct session *s = req->data;
	assert(session_is_outgoing(s));
	if (session_is_closing(s)) {
		return;
	}

	session_waitinglist_del(s, task);
	session_tasklist_del(s, task);

	int res = 0;

	const struct sockaddr *peer = session_get_peer(s);
	if (peer->sa_family != AF_UNSPEC && session_is_empty(s) && !session_is_closing(s)) {
		assert(peer->sa_family == AF_INET || peer->sa_family == AF_INET6);
		res = 1;
		if (session_is_connected(s)) {
			/* This is an outbound TCP connection that can be reused.
			 * Close it after a timeout. */
			uv_timer_t *t = session_get_timer(s);
			t->data = s;
			uv_timer_stop(t);
			res = uv_timer_start(t, on_session_idle_timeout,
					     KR_CONN_RTT_MAX, 0);
		}
	}

	if (res != 0) {
		/* if any errors, close the session immediately */
		session_close(s);
	}
}

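/** @internal Kill all pending outgoing I/O (UDP and TCP) of the task. */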
static void ioreq_kill_pending(struct qr_task *task)
{
	for (uint16_t i = 0; i < task->pending_count; ++i) {
		if (task->pending[i]->type == UV_UDP) {
			ioreq_kill_udp(task->pending[i], task);
		} else if (task->pending[i]->type == UV_TCP) {
			ioreq_kill_tcp(task->pending[i], task);
		} else {
			assert(false);
		}
	}
	task->pending_count = 0;
}

/** @cond This memory layout is internal to mempool.c, use only for debugging. */
#if defined(__SANITIZE_ADDRESS__)
struct mempool_chunk {
  struct mempool_chunk *next;
  size_t size;
};
static void mp_poison(struct mempool *mp, bool poison)
{
	if (!poison) { /* @note mempool is part of the first chunk, unpoison it first */
		kr_asan_unpoison(mp, sizeof(*mp));
	}
	struct mempool_chunk *chunk = mp->state.last[0];
	void *chunk_off = (void *)chunk - chunk->size;
	if (poison) {
		kr_asan_poison(chunk_off, chunk->size);
	} else {
		kr_asan_unpoison(chunk_off, chunk->size);
	}
}
#else
#define mp_poison(mp, enable)
#endif
/** @endcond */

/** Get a mempool.  (Recycle if possible.)  */
static inline struct mempool *pool_borrow(struct worker_ctx *worker)
{
	struct mempool *mp = NULL;
	if (worker->pool_mp.len > 0) {
		mp = array_tail(worker->pool_mp);
		array_pop(worker->pool_mp);
		mp_poison(mp, 0);
	} else { /* No mempool on the freelist, create a new one */
		mp = mp_new (4 * CPU_PAGE_SIZE);
	}
	return mp;
}

/** Return a mempool.  (Cache them up to some count.) */
static inline void pool_release(struct worker_ctx *worker, struct mempool *mp)
{
	if (worker->pool_mp.len < MP_FREELIST_SIZE) {
		mp_flush(mp);
		array_push(worker->pool_mp, mp);
		mp_poison(mp, 1);
	} else {
		mp_delete(mp);
	}
}

/** Create a key for an outgoing subrequest: qname, qclass, qtype.
 * @param dst Destination buffer for the key; MUST be SUBREQ_KEY_LEN bytes or larger.
 * @return key length if successful or an error
 */
static const size_t SUBREQ_KEY_LEN = KR_RRKEY_LEN;
static int subreq_key(char *dst, knot_pkt_t *pkt)
{
	assert(pkt);
	return kr_rrkey(dst, knot_pkt_qclass(pkt), knot_pkt_qname(pkt),
			knot_pkt_qtype(pkt), knot_pkt_qtype(pkt));
}

/** Create and initialize a request_ctx (on a fresh mempool).
 *
 * handle and addr point to the source of the request; both are NULL
 * if the request didn't come from the network.
 */
static struct request_ctx *request_create(struct worker_ctx *worker,
					  uv_handle_t *handle,
					  const struct sockaddr *addr)
{
	knot_mm_t pool = {
		.ctx = pool_borrow(worker),
		.alloc = (knot_mm_alloc_t) mp_alloc
	};

	/* Create request context */
	struct request_ctx *ctx = mm_alloc(&pool, sizeof(*ctx));
	if (!ctx) {
		pool_release(worker, pool.ctx);
		return NULL;
	}

	memset(ctx, 0, sizeof(*ctx));

	/* TODO Relocate pool to struct request */
	ctx->worker = worker;
	array_init(ctx->tasks);
	struct session *s = handle ? handle->data : NULL;
	if (s) {
		assert(session_is_outgoing(s) == false);
	}
	ctx->source.session = s;

	struct kr_request *req = &ctx->req;
	req->pool = pool;
	req->vars_ref = LUA_NOREF;

	/* Remember query source addr */
	if (!addr || (addr->sa_family != AF_INET && addr->sa_family != AF_INET6)) {
		ctx->source.addr.ip.sa_family = AF_UNSPEC;
	} else {
		size_t addr_len = sizeof(struct sockaddr_in);
		if (addr->sa_family == AF_INET6)
			addr_len = sizeof(struct sockaddr_in6);
		memcpy(&ctx->source.addr.ip, addr, addr_len);
		ctx->req.qsource.addr = &ctx->source.addr.ip;
	}

	worker->stats.rconcurrent += 1;

	if (!handle) {
		return ctx;
	}

	/* Remember the destination address. */
	int addr_len = sizeof(ctx->source.dst_addr);
	struct sockaddr *dst_addr = &ctx->source.dst_addr.ip;
	ctx->source.dst_addr.ip.sa_family = AF_UNSPEC;
	if (handle->type == UV_UDP) {
		if (uv_udp_getsockname((uv_udp_t *)handle, dst_addr, &addr_len) == 0) {
			req->qsource.dst_addr = dst_addr;
		}
		req->qsource.tcp = false;
	} else if (handle->type == UV_TCP) {
		if (uv_tcp_getsockname((uv_tcp_t *)handle, dst_addr, &addr_len) == 0) {
			req->qsource.dst_addr = dst_addr;
		}
		req->qsource.tcp = true;
	}

	return ctx;
}

/** More initialization, related to the particular incoming query/packet. */
static int request_start(struct request_ctx *ctx, knot_pkt_t *query)
{
	assert(query && ctx);
	size_t answer_max = KNOT_WIRE_MIN_PKTSIZE;
	struct kr_request *req = &ctx->req;

	/* source.session can be NULL if the request was generated by kresd itself */
	struct session *s = ctx->source.session;
	if (!s || session_get_handle(s)->type == UV_TCP) {
		answer_max = KNOT_WIRE_MAX_PKTSIZE;
	} else if (knot_pkt_has_edns(query)) { /* EDNS */
		answer_max = MAX(knot_edns_get_payload(query->opt_rr),
				 KNOT_WIRE_MIN_PKTSIZE);
	}
	req->qsource.size = query->size;

	req->answer = knot_pkt_new(NULL, answer_max, &req->pool);
	if (!req->answer) {
		return kr_error(ENOMEM);
	}

	/* Remember query source TSIG key */
	if (query->tsig_rr) {
		req->qsource.key = knot_rrset_copy(query->tsig_rr, &req->pool);
	}

	/* Remember query source EDNS data */
	if (query->opt_rr) {
		req->qsource.opt = knot_rrset_copy(query->opt_rr, &req->pool);
	}
	/* Start resolution */
	struct worker_ctx *worker = ctx->worker;
	struct engine *engine = worker->engine;
	kr_resolve_begin(req, &engine->resolver, req->answer);
	worker->stats.queries += 1;
	/* Throttle outbound queries only under high pressure */
	if (worker->stats.concurrent < QUERY_RATE_THRESHOLD) {
		req->options.NO_THROTTLE = true;
	}
	return kr_ok();
}

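/** Release a request context: drop its Lua vars reference, return its mempool and update stats. */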
static void request_free(struct request_ctx *ctx)
{
	struct worker_ctx *worker = ctx->worker;
	/* Dereference the Lua vars table if it exists */
	if (ctx->req.vars_ref != LUA_NOREF) {
		lua_State *L = worker->engine->L;
		/* Get worker variables table */
		lua_rawgeti(L, LUA_REGISTRYINDEX, worker->vars_table_ref);
		/* Get next free element (position 0) and store it under current reference (forming a list) */
		lua_rawgeti(L, -1, 0);
		lua_rawseti(L, -2, ctx->req.vars_ref);
		/* Set current reference as the next free element */
		lua_pushinteger(L, ctx->req.vars_ref);
		lua_rawseti(L, -2, 0);
		lua_pop(L, 1);
		ctx->req.vars_ref = LUA_NOREF;
	}
	/* Return mempool to ring or free it if it's full */
	pool_release(worker, ctx->req.pool.ctx);
	/* @note The 'ctx' is invalidated from now on. */
	/* Decommit memory every once in a while */
	static int mp_delete_count = 0;
	if (++mp_delete_count == 100000) {
		lua_gc(worker->engine->L, LUA_GCCOLLECT, 0);
#if defined(__GLIBC__) && defined(_GNU_SOURCE)
		malloc_trim(0);
#endif
		mp_delete_count = 0;
	}
	worker->stats.rconcurrent -= 1;
}

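/** Attach a task to the request context (no-op if already attached); takes a reference on success. */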
static int request_add_tasks(struct request_ctx *ctx, struct qr_task *task)
{
	for (int i = 0; i < ctx->tasks.len; ++i) {
		if (ctx->tasks.at[i] == task) {
			return i;
		}
	}
	int ret = array_push(ctx->tasks, task);
	if (ret >= 0) {
		qr_task_ref(task);
	}
	return ret;
}

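/** Detach a task from the request context and drop its reference. */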
static int request_del_tasks(struct request_ctx *ctx, struct qr_task *task)
{
	int ret = kr_error(ENOENT);
	for (int i = 0; i < ctx->tasks.len; ++i) {
		if (ctx->tasks.at[i] == task) {
			array_del(ctx->tasks, i);
			qr_task_unref(task);
			ret = kr_ok();
			break;
		}
	}
	return ret;
}

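/** Create a resolution task (and its packet buffer) on the request's mempool. */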
static struct qr_task *qr_task_create(struct request_ctx *ctx)
{
	/* How much can client handle? */
	struct engine *engine = ctx->worker->engine;
	size_t pktbuf_max = KR_EDNS_PAYLOAD;
	if (engine->resolver.opt_rr) {
		pktbuf_max = MAX(knot_edns_get_payload(engine->resolver.opt_rr),
				 pktbuf_max);
	}

	/* Create resolution task */
	struct qr_task *task = mm_alloc(&ctx->req.pool, sizeof(*task));
	if (!task) {
		return NULL;
	}
	memset(task, 0, sizeof(*task)); /* avoid accidentally uninitialized fields */

	/* Create packet buffers for answer and subrequests */
	knot_pkt_t *pktbuf = knot_pkt_new(NULL, pktbuf_max, &ctx->req.pool);
	if (!pktbuf) {
		mm_free(&ctx->req.pool, task);
		return NULL;
	}
	pktbuf->size = 0;

	task->ctx = ctx;
	task->pktbuf = pktbuf;
	array_init(task->waiting);
	task->refs = 0;
	int ret = request_add_tasks(ctx, task);
	if (ret < 0) {
		mm_free(&ctx->req.pool, task);
		mm_free(&ctx->req.pool, pktbuf);
		return NULL;
	}
	ctx->worker->stats.concurrent += 1;
	return task;
}

/* This is called when the task refcount reaches zero; free its memory. */
static void qr_task_free(struct qr_task *task)
{
	struct request_ctx *ctx = task->ctx;

	assert(ctx);

	/* Process outbound session. */
	struct session *s = ctx->source.session;
	struct worker_ctx *worker = ctx->worker;

	/* Process source session. */
	if (s && session_tasklist_get_len(s) < worker->tcp_pipeline_max/2 &&
	    !session_is_closing(s) && !session_is_throttled(s)) {
		uv_handle_t *handle = session_get_handle(s);
		/* Start reading again if the session is throttled and
		 * the number of outgoing requests is below watermark. */
		if (handle) {
			io_start_read(handle);
			session_set_throttled(s, false);
		}
	}

	if (ctx->tasks.len == 0) {
		array_clear(ctx->tasks);
		request_free(ctx);
	}

	/* Update stats */
	worker->stats.concurrent -= 1;
}

static void qr_task_complete(struct qr_task *task)
{
	struct request_ctx *ctx = task->ctx;

	/* Kill pending I/O requests */
	ioreq_kill_pending(task);
	assert(task->waiting.len == 0);
	assert(task->leading == false);

	struct session *s = ctx->source.session;
	if (s) {
		assert(!session_is_outgoing(s) && session_waitinglist_is_empty(s));
		session_tasklist_del(s, task);
	}

	/* Release primary reference to task. */
	request_del_tasks(ctx, task);
}

/* This is called when we send a subrequest or an answer. */
static int qr_task_on_send(struct qr_task *task, uv_handle_t *handle, int status)
{
	if (task->finished) {
		assert(task->leading == false);
		qr_task_complete(task);
		if (!handle || handle->type != UV_TCP) {
			return status;
		}
		struct session* s = handle->data;
		assert(s);
		if (!session_is_outgoing(s) || session_waitinglist_is_empty(s)) {
			return status;
		}
	}

	if (handle) {
		struct session* s = handle->data;
		bool outgoing = session_is_outgoing(s);
		if (!outgoing) {
			struct session* source_s = task->ctx->source.session;
			if (source_s) {
				assert (session_get_handle(source_s) == handle);
			}
		}
		if (handle->type == UV_TCP && outgoing && !session_waitinglist_is_empty(s)) {
			session_waitinglist_del(s, task);
			if (session_is_closing(s)) {
				return status;
			}
			/* Finalize the task if there were any errors.
			 * We can't re-add it to the end of the waiting list for retrying,
			 * since that may lead to an endless loop in some circumstances
			 * (for instance: tls; send->tls_push->too many non-critical errors->
			 * on_send with nonzero status->re-add to waiting->send->etc). */
			if (status != 0) {
				if (outgoing) {
					qr_task_finalize(task, KR_STATE_FAIL);
				} else {
					assert(task->ctx->source.session == s);
					task->ctx->source.session = NULL;
				}
				session_tasklist_del(s, task);
			}
			struct qr_task *waiting_task = session_waitinglist_get_first(s);
			if (waiting_task) {
				struct sockaddr *peer = session_get_peer(s);
				knot_pkt_t *pkt = waiting_task->pktbuf;
				int ret = qr_task_send(waiting_task, handle, peer, pkt);
				if (ret != kr_ok()) {
					session_tasks_finalize(s, KR_STATE_FAIL);
					session_close(s);
					return status;
				}
			}
		}
		if (!session_is_closing(s)) {
			io_start_read(handle); /* Start reading new query */
		}
	}
	return status;
}

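/** Completion callback for outgoing UDP sends. */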
static void on_send(uv_udp_send_t *req, int status)
{
	uv_handle_t *handle = (uv_handle_t *)(req->handle);
	uv_loop_t *loop = handle->loop;
	struct worker_ctx *worker = loop->data;
	assert(worker == get_worker());
	struct qr_task *task = req->data;
	qr_task_on_send(task, handle, status);
	qr_task_unref(task);
	iorequest_release(worker, req);
}

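/** Completion callback for TCP/TLS stream writes. */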
static void on_task_write(uv_write_t *req, int status)
{
	uv_handle_t *handle = (uv_handle_t *)(req->handle);
	uv_loop_t *loop = handle->loop;
	struct worker_ctx *worker = loop->data;
	assert(worker == get_worker());
	struct qr_task *task = req->data;
	qr_task_on_send(task, handle, status);
	qr_task_unref(task);
	iorequest_release(worker, req);
}

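/** Send a packet to the given address over the handle: a UDP datagram, TCP with a
 *  two-octet length prefix, or TLS; outgoing queries are finalized via kr_resolve_checkout() first. */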
static int qr_task_send(struct qr_task *task, uv_handle_t *handle,
			struct sockaddr *addr, knot_pkt_t *pkt)
{
	if (!handle) {
		return qr_task_on_send(task, handle, kr_error(EIO));
	}

	int ret = 0;
	struct request_ctx *ctx = task->ctx;
	struct worker_ctx *worker = ctx->worker;
	struct kr_request *req = &ctx->req;
	void *ioreq = iorequest_borrow(worker);
	if (!ioreq) {
		return qr_task_on_send(task, handle, kr_error(ENOMEM));
	}
	if (knot_wire_get_qr(pkt->wire) == 0) {
		/*
		 * Query must be finalised using destination address before
		 * sending.
		 *
		 * Libuv does not offer a convenient way to obtain a source
		 * IP address from a UDP handle that has been initialised using
		 * uv_udp_init(). The uv_udp_getsockname() fails because of the
		 * lazy socket initialisation.
		 *
		 * @note -- A solution might be opening a separate socket and
		 * trying to obtain the IP address from it.
		 */
		ret = kr_resolve_checkout(req, NULL, addr,
		                          handle->type == UV_UDP ? SOCK_DGRAM : SOCK_STREAM,
		                          pkt);
		if (ret != 0) {
			iorequest_release(worker, ioreq);
			return ret;
		}
	}

	/* Pending ioreq on current task */
	qr_task_ref(task);

	/* Send using given protocol */
	struct session *session = handle->data;
	assert(!session_is_closing(session));
	if (session_has_tls(session)) {
		uv_write_t *write_req = (uv_write_t *)ioreq;
		write_req->data = task;
		ret = tls_write(write_req, handle, pkt, &on_task_write);
	} else if (handle->type == UV_UDP) {
		uv_udp_send_t *send_req = (uv_udp_send_t *)ioreq;
		uv_buf_t buf = { (char *)pkt->wire, pkt->size };
		send_req->data = task;
		ret = uv_udp_send(send_req, (uv_udp_t *)handle, &buf, 1, addr, &on_send);
	} else if (handle->type == UV_TCP) {
		uv_write_t *write_req = (uv_write_t *)ioreq;
		uint16_t pkt_size = htons(pkt->size);
		uv_buf_t buf[2] = {
			{ (char *)&pkt_size, sizeof(pkt_size) },
			{ (char *)pkt->wire, pkt->size }
		};
		write_req->data = task;
		ret = uv_write(write_req, (uv_stream_t *)handle, buf, 2, &on_task_write);
	} else {
		assert(false);
	}

	if (ret == 0) {
		if (worker->too_many_open &&
		    worker->stats.rconcurrent <
			worker->rconcurrent_highwatermark - 10) {
			worker->too_many_open = false;
		}
	} else {
		iorequest_release(worker, ioreq);
		qr_task_unref(task);
		if (ret == UV_EMFILE) {
			worker->too_many_open = true;
			worker->rconcurrent_highwatermark = worker->stats.rconcurrent;
		}
	}

	/* Update statistics */
	if (ctx->source.session &&
	    handle != session_get_handle(ctx->source.session) &&
	    addr) {
		if (session_has_tls(session))
			worker->stats.tls += 1;
		else if (handle->type == UV_UDP)
			worker->stats.udp += 1;
		else
			worker->stats.tcp += 1;

		if (addr->sa_family == AF_INET6)
			worker->stats.ipv6 += 1;
		else if (addr->sa_family == AF_INET)
			worker->stats.ipv4 += 1;
	}

	return ret;
}

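/** Send the first task from the session's waiting list to the session's peer. */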
static int session_next_waiting_send(struct session *session)
{
	int ret = kr_ok();
	if (!session_waitinglist_is_empty(session)) {
		struct sockaddr *peer = session_get_peer(session);
		struct qr_task *task = session_waitinglist_get_first(session);
		uv_handle_t *handle = session_get_handle(session);
		ret = qr_task_send(task, handle, peer, task->pktbuf);
	}
	return ret;
}

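/** TLS client handshake completion callback for outgoing sessions: on success, cache the
 *  TLS session data for resumption and start sending the waiting tasks. */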
static int session_tls_hs_cb(struct session *session, int status)
{
846 847 848 849 850 851
	assert(session_is_outgoing(session));
	uv_handle_t *handle = session_get_handle(session);
	uv_loop_t *loop = handle->loop;
	struct worker_ctx *worker = loop->data;
	struct sockaddr *peer = session_get_peer(session);
	int deletion_res = worker_del_tcp_waiting(worker, peer);
	int ret = kr_ok();

	if (status) {
		kr_nsrep_update_rtt(NULL, peer, KR_NS_DEAD,
				    worker->engine->resolver.cache_rtt,
				    KR_NS_UPDATE_NORESET);
		return ret;
	}

	/* handshake was completed successfully */
	struct tls_client_ctx_t *tls_client_ctx = session_tls_get_client_ctx(session);
	struct tls_client_paramlist_entry *tls_params = tls_client_ctx->params;
	gnutls_session_t tls_session = tls_client_ctx->c.tls_session;
	if (gnutls_session_is_resumed(tls_session) != 0) {
		kr_log_verbose("[tls_client] TLS session has resumed\n");
	} else {
		kr_log_verbose("[tls_client] TLS session has not resumed\n");
		/* session wasn't resumed, delete old session data ... */
		if (tls_params->session_data.data != NULL) {
			gnutls_free(tls_params->session_data.data);
			tls_params->session_data.data = NULL;
			tls_params->session_data.size = 0;
		}
		/* ... and get the new session data */
		gnutls_datum_t tls_session_data = { NULL, 0 };
		ret = gnutls_session_get_data2(tls_session, &tls_session_data);
		if (ret == 0) {
			tls_params->session_data = tls_session_data;
		}
	}

	ret = worker_add_tcp_connected(worker, peer, session);
	if (deletion_res == kr_ok() && ret == kr_ok()) {
		ret = session_next_waiting_send(session);
	} else {
		ret = kr_error(EINVAL);
	}

	if (ret != kr_ok()) {
		/* Something went wrong.
		 * Session isn't in the list of waiting sessions,
		 * or addition to the list of connected sessions failed,
		 * or write to upstream failed. */
		worker_del_tcp_connected(worker, peer);
		session_waitinglist_finalize(session, KR_STATE_FAIL);
		assert(session_tasklist_is_empty(session));
		session_close(session);
	} else {
		uv_timer_t *t = session_get_timer(session);
		uv_timer_stop(t);
		t->data = session;
		session_timer_start(session, on_tcp_watchdog_timeout,
				    MAX_TCP_INACTIVITY, MAX_TCP_INACTIVITY);
	}
	return kr_ok();
}


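/** Get the last query planned for the task's request, or NULL. */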
static struct kr_query *task_get_last_pending_query(struct qr_task *task)
{
	if (!task || task->ctx->req.rplan.pending.len == 0) {
		return NULL;
	}

	return array_tail(task->ctx->req.rplan.pending);
}


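/** Connection callback for outgoing TCP (and TLS) connections. */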
static void on_connect(uv_connect_t *req, int status)
{
	struct worker_ctx *worker = get_worker();
	uv_stream_t *handle = req->handle;
	struct session *session = handle->data;
	struct sockaddr *peer = session_get_peer(session);

	assert(session_is_outgoing(session));

	if (status == UV_ECANCELED) {
		worker_del_tcp_waiting(worker, peer);
		assert(session_is_empty(session) && session_is_closing(session));
		iorequest_release(worker, req);
		return;
	}

	if (session_is_closing(session)) {
		worker_del_tcp_waiting(worker, peer);
		assert(session_is_empty(session));
		iorequest_release(worker, req);
		return;
	}

	uv_timer_t *t = session_get_timer(session);
	uv_timer_stop(t);

	if (status != 0) {
		worker_del_tcp_waiting(worker, peer);
		session_waitinglist_retry(session, false);
		assert(session_tasklist_is_empty(session));
		iorequest_release(worker, req);
		session_close(session);
		return;
	}

	if (!session_has_tls(session)) {
		/* With TLS the session is still waiting for the handshake;
		 * otherwise remove it from the waiting list. */
		if (worker_del_tcp_waiting(worker, peer) != 0) {
			/* The session isn't in the list of waiting queries;
			 * something went wrong. */
			session_waitinglist_finalize(session, KR_STATE_FAIL);
			assert(session_tasklist_is_empty(session));
			iorequest_release(worker, req);
			session_close(session);
			return;
		}
	}

	struct qr_task *task = session_waitinglist_get_first(session);
	struct kr_query *qry = task_get_last_pending_query(task);
	WITH_VERBOSE (qry) {
		struct sockaddr *peer = session_get_peer(session);
		char peer_str[INET6_ADDRSTRLEN];
		inet_ntop(peer->sa_family, kr_inaddr(peer), peer_str, sizeof(peer_str));
		VERBOSE_MSG(qry, "=> connected to '%s'\n", peer_str);
	}

	session_set_connected(session, true);

	int ret = kr_ok();
	if (session_has_tls(session)) {
		struct tls_client_ctx_t *tls_ctx = session_tls_get_client_ctx(session);
		ret = tls_client_connect_start(tls_ctx, session, session_tls_hs_cb);
		if (ret == kr_error(EAGAIN)) {
			iorequest_release(worker, req);
			session_start_read(session);
			session_timer_start(session, on_tcp_watchdog_timeout,
					    MAX_TCP_INACTIVITY, MAX_TCP_INACTIVITY);
			return;
		}
	} else {
		worker_add_tcp_connected(worker, peer, session);
	}

	if (ret == kr_ok()) {
		ret = session_next_waiting_send(session);
		if (ret == kr_ok()) {
			session_timer_start(session, on_tcp_watchdog_timeout,
					    MAX_TCP_INACTIVITY, MAX_TCP_INACTIVITY);
			iorequest_release(worker, req);
			return;
		}
	}

	session_waitinglist_finalize(session, KR_STATE_FAIL);
	assert(session_tasklist_is_empty(session));
	iorequest_release(worker, req);
	session_close(session);
}

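/** An outgoing TCP connection attempt timed out: penalize the server and retry the waiting tasks. */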
static void on_tcp_connect_timeout(uv_timer_t *timer)
{
	struct session *session = timer->data;

	uv_timer_stop(timer);
	struct worker_ctx *worker = get_worker();

	assert (session_waitinglist_get_len(session) == session_tasklist_get_len(session));

	struct sockaddr *peer = session_get_peer(session);
	worker_del_tcp_waiting(worker, peer);

	struct qr_task *task = session_waitinglist_get_first(session);
	struct kr_query *qry = task_get_last_pending_query(task);
	WITH_VERBOSE (qry) {
		char peer_str[INET6_ADDRSTRLEN];
		inet_ntop(peer->sa_family, kr_inaddr(peer), peer_str, sizeof(peer_str));
		VERBOSE_MSG(qry, "=> connection to '%s' failed\n", peer_str);
	}

	kr_nsrep_update_rtt(NULL, peer, KR_NS_DEAD,
			    worker->engine->resolver.cache_rtt,
			    KR_NS_UPDATE_NORESET);

	worker->stats.timeout += session_waitinglist_get_len(session);
	session_waitinglist_retry(session, true);
	assert (session_tasklist_is_empty(session));
	session_close(session);
}

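/** An outgoing TCP connection was inactive for too long: fail its tasks and close it. */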
static void on_tcp_watchdog_timeout(uv_timer_t *timer)
{
	struct session *session = timer->data;
	struct worker_ctx *worker =  timer->loop->data;
	struct sockaddr *peer = session_get_peer(session);

	assert(session_is_outgoing(session));

	uv_timer_stop(timer);

	if (session_has_tls(session)) {
		worker_del_tcp_waiting(worker, peer);
	}

	worker_del_tcp_connected(worker, peer);
	worker->stats.timeout += session_waitinglist_get_len(session);
	session_waitinglist_finalize(session, KR_STATE_FAIL);
	worker->stats.timeout += session_tasklist_get_len(session);
	session_tasklist_finalize(session, KR_STATE_FAIL);
	session_close(session);
}

/* This is called when I/O times out. */
static void on_udp_timeout(uv_timer_t *timer)
{
	struct session *session = timer->data;
	assert(session_get_handle(session)->data == session);
	assert(session_tasklist_get_len(session) == 1);
	assert(session_waitinglist_is_empty(session));

	uv_timer_stop(timer);

	/* Penalize all tried nameservers with a timeout. */
	struct qr_task *task = session_tasklist_get_first(session);
	struct worker_ctx *worker = task->ctx->worker;
	if (task->leading && task->pending_count > 0) {
		struct kr_query *qry = array_tail(task->ctx->req.rplan.pending);
		struct sockaddr_in6 *addrlist = (struct sockaddr_in6 *)task->addrlist;
		for (uint16_t i = 0; i < MIN(task->pending_count, task->addrlist_count); ++i) {
			struct sockaddr *choice = (struct sockaddr *)(&addrlist[i]);
			WITH_VERBOSE(qry) {
				char addr_str[INET6_ADDRSTRLEN];
				inet_ntop(choice->sa_family, kr_inaddr(choice), addr_str, sizeof(addr_str));
				VERBOSE_MSG(qry, "=> server: '%s' flagged as 'bad'\n", addr_str);
			}
			kr_nsrep_update_rtt(&qry->ns, choice, KR_NS_DEAD,
					    worker->engine->resolver.cache_rtt,
					    KR_NS_UPDATE_NORESET);
		}
	}
	task->timeouts += 1;
	worker->stats.timeout += 1;
	qr_task_step(task, NULL, NULL);
}

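/** A kept-alive outgoing TCP session has been idle for too long: close it. */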
static void on_session_idle_timeout(uv_timer_t *timer)
{
	struct session *s = timer->data;
	assert(s);
	uv_timer_stop(timer);
	if (session_is_closing(s)) {
		return;
	}
	/* The session was not used before the timer expired;
	 * remove it from the connection list and close it.
	 */
	assert(session_is_empty(s));
	session_close(s);
}

static uv_handle_t *retransmit(struct qr_task *task)
{
	uv_handle_t *ret = NULL;
	if (task && task->addrlist && task->addrlist_count > 0) {
		struct sockaddr_in6 *choice = &((struct sockaddr_in6 *)task->addrlist)[task->addrlist_turn];
		if (!choice) {
			return ret;
		}
		ret = ioreq_spawn(task, SOCK_DGRAM, choice->sin6_family);
		if (!ret) {
			return ret;
		}
		struct sockaddr *addr = (struct sockaddr *)choice;
		struct session *session = ret->data;
		struct sockaddr *peer = session_get_peer(session);
		assert (peer->sa_family == AF_UNSPEC && session_is_outgoing(session));
		memcpy(peer, addr, kr_sockaddr_len(addr));
		if (qr_task_send(task, ret, (struct sockaddr *)choice,
				 task->pktbuf) == 0) {
			task->addrlist_turn = (task->addrlist_turn + 1) %
					      task->addrlist_count; /* Round robin */
		}
	}
	return ret;
}

static void on_retransmit(uv_timer_t *req)
{
	struct session *session = req->data;
	assert(session_tasklist_get_len(session) == 1);

	uv_timer_stop(req);
	struct qr_task *task = session_tasklist_get_first(session);
	if (retransmit(task) == NULL) {
		/* Not possible to spawn a request; start the timeout timer with the remaining deadline. */
		uint64_t timeout = KR_CONN_RTT_MAX - task->pending_count * KR_CONN_RETRY;
		uv_timer_start(req, on_udp_timeout, timeout, 0);
	} else {
		uv_timer_start(req, on_retransmit, KR_CONN_RETRY, 0);
	}
}

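/** Finalize an outgoing subrequest: kill its pending I/O and, if it was a leader,
 *  remove it from the deduplication table and step the waiting follower tasks. */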
static void subreq_finalize(struct qr_task *task, const struct sockaddr *packet_source, knot_pkt_t *pkt)
{
	/* Kill pending I/O requests. */
	ioreq_kill_pending(task);
	/* Clear from outgoing table. */
	if (!task->leading)
		return;
	char key[SUBREQ_KEY_LEN];
	const int klen = subreq_key(key, task->pktbuf);
	if (klen > 0) {
		void *val_deleted;
		int ret = trie_del(task->ctx->worker->subreq_out, key, klen, &val_deleted);
		assert(ret == KNOT_EOK && val_deleted == task); (void)ret;
	}
	/* Notify waiting tasks. */
	struct kr_query *leader_qry = array_tail(task->ctx->req.rplan.pending);
	for (size_t i = task->waiting.len; i > 0; i--) {
		struct qr_task *follower = task->waiting.at[i - 1];
		/* Reuse MSGID and 0x20 secret */
		if (follower->ctx->req.rplan.pending.len > 0) {
			struct kr_query *qry = array_tail(follower->ctx->req.rplan.pending);
			qry->id = leader_qry->id;
			qry->secret = leader_qry->secret;
			leader_qry->secret = 0; /* Next will be already decoded */
		}
		qr_task_step(follower, packet_source, pkt);
		qr_task_unref(follower);
	}
	task->waiting.len = 0;
	task->leading = false;
}

static void subreq_lead(struct qr_task *task)
{
	assert(task);