nsrep.c 16.2 KB
Newer Older
1
/*  Copyright (C) 2014-2017 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
2 3 4 5 6 7 8 9 10 11 12 13

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
14
    along with this program.  If not, see <https://www.gnu.org/licenses/>.
15 16
 */

Marek Vavruša's avatar
Marek Vavruša committed
17
#include <assert.h>
18 19 20
#include <sys/socket.h>
#include <netinet/in.h>
#include <netdb.h>
Marek Vavruša's avatar
Marek Vavruša committed
21

22 23
#include <arpa/inet.h>

24
#include "lib/nsrep.h"
25
#include "lib/rplan.h"
26
#include "lib/resolve.h"
Marek Vavruša's avatar
Marek Vavruša committed
27
#include "lib/defines.h"
28
#include "lib/generic/pack.h"
29
#include "contrib/ucw/lib.h"
30

31
/** Some built-in unfairness ... */
32
#ifndef FAVOUR_IPV6
33
#define FAVOUR_IPV6 20 /* 20ms bonus for v6 */
34
#endif
35

36
/** @internal Macro to set address structure. */
37
#define ADDR_SET(sa, family, addr, len, port) do {\
38 39
    	memcpy(&sa ## _addr, (addr), (len)); \
    	sa ## _family = (family); \
40
	sa ## _port = htons(port); \
41 42 43
} while (0)

/** Update nameserver representation with current name/address pair. */
44
static void update_nsrep(struct kr_nsrep *ns, size_t pos, uint8_t *addr, size_t addr_len, int port)
45 46
{
	if (addr == NULL) {
47
		ns->addr[pos].ip.sa_family = AF_UNSPEC;
48 49 50
		return;
	}

51 52 53
	/* Rotate previous addresses to the right. */
	memmove(ns->addr + pos + 1, ns->addr + pos, (KR_NSREP_MAXADDR - pos - 1) * sizeof(ns->addr[0]));

54
	switch(addr_len) {
55
	case sizeof(struct in_addr):
56
		ADDR_SET(ns->addr[pos].ip4.sin, AF_INET, addr, addr_len, port); break;
57
	case sizeof(struct in6_addr):
58
		ADDR_SET(ns->addr[pos].ip6.sin6, AF_INET6, addr, addr_len, port); break;
59 60 61 62
	default: assert(0); break;
	}
}

63 64
static void update_nsrep_set(struct kr_nsrep *ns, const knot_dname_t *name, uint8_t *addr[], unsigned score)
{
65 66 67 68 69
	/* NSLIST is not empty, empty NS cannot be a leader. */
	if (!addr[0] && ns->addr[0].ip.sa_family != AF_UNSPEC) {
		return;
	}
	/* Set new NS leader */
70 71 72
	ns->name = name;
	ns->score = score;
	for (size_t i = 0; i < KR_NSREP_MAXADDR; ++i) {
73 74 75
		if (addr[i]) {
			void *addr_val = pack_obj_val(addr[i]);
			size_t len = pack_obj_len(addr[i]);
76
			update_nsrep(ns, i, addr_val, len, KR_DNS_PORT);
77
		} else {
78
			break;
79
		}
80 81 82
	}
}

83 84
#undef ADDR_SET

85 86
/**
 * \param addr_set pack with one IP address per element */
87
static unsigned eval_addr_set(const pack_t *addr_set, struct kr_context *ctx,
88
			      struct kr_qflags opts, unsigned score, uint8_t *addr[])
89
{
90
	kr_nsrep_rtt_lru_t *rtt_cache = ctx->cache_rtt;
91
	kr_nsrep_rtt_lru_entry_t *rtt_cache_entry_ptr[KR_NSREP_MAXADDR] = { NULL, };
Grigorii Demidov's avatar
Grigorii Demidov committed
92
	assert (KR_NSREP_MAXADDR >= 2);
93 94 95
	unsigned rtt_cache_entry_score[KR_NSREP_MAXADDR] = { score, KR_NS_MAX_SCORE + 1, };
	uint64_t now = kr_now();

96
	/* Name server is better candidate if it has address record. */
97 98
	for (uint8_t *it = pack_head(*addr_set); it != pack_tail(*addr_set);
						it = pack_obj_next(it)) {
99 100
		void *val = pack_obj_val(it);
		size_t len = pack_obj_len(it);
101 102 103 104
		unsigned favour = 0;
		bool is_valid = false;
		/* Check if the address isn't disabled. */
		if (len == sizeof(struct in6_addr)) {
105
			is_valid = !(opts.NO_IPV6);
106
			favour = FAVOUR_IPV6;
107
		} else if (len == sizeof(struct in_addr)) {
108
			is_valid = !(opts.NO_IPV4);
109 110 111
		} else {
			assert(!EINVAL);
			is_valid = false;
112
		}
113 114

		if (!is_valid) {
115
			continue;
116 117
		}

118
		/* Get score for the current address. */
119 120 121 122 123 124 125
		kr_nsrep_rtt_lru_entry_t *cached = rtt_cache ?
						   lru_get_try(rtt_cache, val, len) :
						   NULL;
		unsigned cur_addr_score = KR_NS_GLUED;
		if (cached) {
			cur_addr_score = cached->score;
			if (cached->score >= KR_NS_TIMEOUT) {
126 127
				/* If NS once was marked as "timeouted",
				 * it won't participate in NS elections
128
				 * at least ctx->cache_rtt_tout_retry_interval milliseconds. */
129 130 131
				uint64_t elapsed = now - cached->tout_timestamp;
				elapsed = elapsed > UINT_MAX ? UINT_MAX : elapsed;
				if (elapsed > ctx->cache_rtt_tout_retry_interval) {
132 133 134
					/* Select this NS for probing in this particular query,
					 * but don't change the cached score.
					 * For other queries this NS will remain "timeouted". */
135
					cur_addr_score = KR_NS_LONG - 1;
136 137
				}
			}
138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154
		}

		for (size_t i = 0; i < KR_NSREP_MAXADDR; ++i) {
			if (cur_addr_score >= KR_NS_TIMEOUT) {
				/* We can't use favour here.
				 * If all of the conditions below are true
				 *
				 * rtt_cache_entry_score[i] < KR_NS_TIMEOUT
				 * rtt_cache_entry_score[i] + favour > KR_NS_TIMEOUT
				 * cur_addr_score < rtt_cache_entry_score[i] + favour
				 *
				 * we will prefer "certainly dead" cur_addr_score
				 * instead of "almost dead, but alive" rtt_cache_entry_score[i]
				 */
				if (cur_addr_score >= rtt_cache_entry_score[i]) {
					continue;
				}
155 156 157
			}
			if (cur_addr_score >= KR_NS_TIMEOUT
			    || cur_addr_score < rtt_cache_entry_score[i] + favour) {
158
				/* Shake down previous contenders */
159 160 161 162 163 164 165 166 167
				for (size_t j = KR_NSREP_MAXADDR - 1; j > i; --j) {
					addr[j] = addr[j - 1];
					rtt_cache_entry_ptr[j] = rtt_cache_entry_ptr[j - 1];
					rtt_cache_entry_score[j] = rtt_cache_entry_score[j - 1];
				}
				addr[i] = it;
				rtt_cache_entry_score[i] = cur_addr_score;
				rtt_cache_entry_ptr[i] = cached;
				break;
168
			}
169 170
		}
	}
171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203

	/* At this point, rtt_cache_entry_ptr contains up to KR_NSREP_MAXADDR
	 * pointers to the rtt cache entries with the best scores for the given addr_set.
	 * Check if there are timeouted NS. */

	for (size_t i = 0; i < KR_NSREP_MAXADDR; ++i) {
		if (rtt_cache_entry_ptr[i] == NULL)
			continue;
		if (rtt_cache_entry_ptr[i]->score < KR_NS_TIMEOUT)
			continue;

		uint64_t elapsed = now - rtt_cache_entry_ptr[i]->tout_timestamp;
		elapsed = elapsed > UINT_MAX ? UINT_MAX : elapsed;
		if (elapsed <= ctx->cache_rtt_tout_retry_interval)
			continue;

		/* rtt_cache_entry_ptr[i] points to "timeouted" rtt cache entry.
		 * The period of the ban on participation in elections has expired. */

		if (VERBOSE_STATUS) {
			void *val = pack_obj_val(addr[i]);
			size_t len = pack_obj_len(addr[i]);
			char sa_str[INET6_ADDRSTRLEN];
			int af = (len == sizeof(struct in6_addr)) ? AF_INET6 : AF_INET;
			inet_ntop(af, val, sa_str, sizeof(sa_str));
			kr_log_verbose("[     ][nsre] probing timeouted NS: %s, score %i\n",
				       sa_str, rtt_cache_entry_ptr[i]->score);
		}

		rtt_cache_entry_ptr[i]->tout_timestamp = now;
	}

	return rtt_cache_entry_score[0];
204 205
}

206
static int eval_nsrep(const knot_dname_t *owner, const pack_t *addr_set, struct kr_query *qry)
207
{
208 209
	struct kr_nsrep *ns = &qry->ns;
	struct kr_context *ctx = ns->ctx;
210
	unsigned score = KR_NS_MAX_SCORE;
211
	unsigned reputation = 0;
212
	uint8_t *addr_choice[KR_NSREP_MAXADDR] = { NULL, };
213

214 215
	/* Fetch NS reputation */
	if (ctx->cache_rep) {
216 217
		unsigned *cached = lru_get_try(ctx->cache_rep, (const char *)owner,
					       knot_dname_size(owner));
218 219 220 221 222
		if (cached) {
			reputation = *cached;
		}
	}

223 224 225
	/* Favour nameservers with unknown addresses to probe them,
	 * otherwise discover the current best address for the NS. */
	if (addr_set->len == 0) {
226
		score = KR_NS_UNKNOWN;
227 228 229
		/* If the server doesn't have IPv6, give it disadvantage. */
		if (reputation & KR_NS_NOIP6) {
			score += FAVOUR_IPV6;
230
			/* If the server is unknown but has rep record, treat it as timeouted */
231
			if (reputation & KR_NS_NOIP4) {
232 233
				score = KR_NS_UNKNOWN;
				/* Try to start with clean slate */
234
				if (!(qry->flags.NO_IPV6)) {
235 236
					reputation &= ~KR_NS_NOIP6;
				}
237
				if (!(qry->flags.NO_IPV4)) {
238 239
					reputation &= ~KR_NS_NOIP4;
				}
240
			}
241
		}
242
	} else {
243
		score = eval_addr_set(addr_set, ctx, qry->flags, score, addr_choice);
244 245
	}

246 247 248
	/* Probabilistic bee foraging strategy (naive).
	 * The fastest NS is preferred by workers until it is depleted (timeouts or degrades),
	 * at the same time long distance scouts probe other sources (low probability).
249 250 251 252 253 254
	 * Servers on TIMEOUT will not have probed at all.
	 * Servers with score above KR_NS_LONG will have periodically removed from
	 * reputation cache, so that kresd can reprobe them. */
	if (score >= KR_NS_TIMEOUT) {
		return kr_ok();
	} else if (score <= ns->score &&
255
	   (score < KR_NS_LONG  || qry->flags.NO_THROTTLE)) {
256
		update_nsrep_set(ns, owner, addr_choice, score);
257
		ns->reputation = reputation;
258 259
	} else if (kr_rand_coin(1, 10) &&
		   !kr_rand_coin(score, KR_NS_MAX_SCORE)) {
260 261
		/* With 10% chance probe server with a probability
		 * given by its RTT / MAX_RTT. */
262
		update_nsrep_set(ns, owner, addr_choice, score);
263 264 265 266 267
		ns->reputation = reputation;
		return 1; /* Stop evaluation */
	} else if (ns->score > KR_NS_MAX_SCORE) {
		/* Check if any server was already selected.
		 * If no, pick current server and continue evaluation. */
268
		update_nsrep_set(ns, owner, addr_choice, score);
269
		ns->reputation = reputation;
270 271 272 273 274
	}

	return kr_ok();
}

275
int kr_nsrep_set(struct kr_query *qry, size_t index, const struct sockaddr *sock)
276
{
277
	if (!qry) {
278 279
		return kr_error(EINVAL);
	}
280 281 282
	if (index >= KR_NSREP_MAXADDR) {
		return kr_error(ENOSPC);
	}
283 284

	if (!sock) {
285
		qry->ns.name = (const uint8_t *)"";
286 287 288 289 290 291
		qry->ns.addr[index].ip.sa_family = AF_UNSPEC;
		return kr_ok();
	}

	switch (sock->sa_family) {
	case AF_INET:
292 293 294
		if (qry->flags.NO_IPV4) {
			return kr_error(ENOENT);
		}
295 296 297
		qry->ns.addr[index].ip4 = *(const struct sockaddr_in *)sock;
		break;
	case AF_INET6:
298 299 300
		if (qry->flags.NO_IPV6) {
			return kr_error(ENOENT);
		}
301 302 303 304 305 306 307
		qry->ns.addr[index].ip6 = *(const struct sockaddr_in6 *)sock;
		break;
	default:
		qry->ns.addr[index].ip.sa_family = AF_UNSPEC;
		return kr_error(EINVAL);
	}

308 309 310 311 312 313 314
	qry->ns.name = (const uint8_t *)"";
	/* Reset score on first entry */
	if (index == 0) {
		qry->ns.score = KR_NS_UNKNOWN;
		qry->ns.reputation = 0;
	}

315
	/* Retrieve RTT from cache */
316
	struct kr_context *ctx = qry->ns.ctx;
317
	kr_nsrep_rtt_lru_entry_t *rtt_cache_entry = ctx
318 319
		? lru_get_try(ctx->cache_rtt, kr_inaddr(sock), kr_family_len(sock->sa_family))
		: NULL;
320 321
	if (rtt_cache_entry) {
		qry->ns.score = MIN(qry->ns.score, rtt_cache_entry->score);
322
	}
323

324 325 326
	return kr_ok();
}

327 328
#define ELECT_INIT(ns, ctx_) do { \
	(ns)->ctx = (ctx_); \
329
	(ns)->addr[0].ip.sa_family = AF_UNSPEC; \
330 331 332 333
	(ns)->reputation = 0; \
	(ns)->score = KR_NS_MAX_SCORE + 1; \
} while (0)

334
int kr_nsrep_elect(struct kr_query *qry, struct kr_context *ctx)
335
{
336
	if (!qry || !ctx) {
337
		//assert(!EINVAL);
338 339 340 341
		return kr_error(EINVAL);
	}

	struct kr_nsrep *ns = &qry->ns;
342
	ELECT_INIT(ns, ctx);
343 344 345 346 347 348 349 350 351 352 353 354

	int ret = kr_ok();
	trie_it_t *it;
	for (it = trie_it_begin(qry->zone_cut.nsset); !trie_it_finished(it);
							trie_it_next(it)) {
		ret = eval_nsrep(/* we trust it's a correct dname */
				 (const knot_dname_t *)trie_it_key(it, NULL),
				 (const pack_t *)*trie_it_val(it), qry);
		if (ret) break;
	}
	trie_it_free(it);

355 356 357 358 359 360
	if (qry->ns.score <= KR_NS_MAX_SCORE && qry->ns.score >= KR_NS_LONG) {
		/* This is a low-reliability probe,
		 * go with TCP to get ICMP reachability check. */
		qry->flags.TCP = true;
	}
	return ret;
361
}
362

363 364 365
int kr_nsrep_elect_addr(struct kr_query *qry, struct kr_context *ctx)
{
	if (!qry || !ctx) {
366
		//assert(!EINVAL);
367 368 369 370 371 372
		return kr_error(EINVAL);
	}

	/* Get address list for this NS */
	struct kr_nsrep *ns = &qry->ns;
	ELECT_INIT(ns, ctx);
373
	pack_t *addr_set = kr_zonecut_find(&qry->zone_cut, ns->name);
374 375 376 377
	if (!addr_set) {
		return kr_error(ENOENT);
	}
	/* Evaluate addr list */
378
	uint8_t *addr_choice[KR_NSREP_MAXADDR] = { NULL, };
379
	unsigned score = eval_addr_set(addr_set, ctx, qry->flags, ns->score, addr_choice);
380
	update_nsrep_set(ns, ns->name, addr_choice, score);
381 382 383 384 385
	return kr_ok();
}

#undef ELECT_INIT

386
int kr_nsrep_update_rtt(struct kr_nsrep *ns, const struct sockaddr *addr,
387
			unsigned score, kr_nsrep_rtt_lru_t *cache, int umode)
388
{
389
	if (!cache || umode > KR_NS_MAX || umode < 0) {
390 391 392
		return kr_error(EINVAL);
	}

393 394 395 396 397
	/* Get `addr`, and later its raw string. */
	if (addr) {
		/* Caller provided specific address, OK. */
	} else if (ns != NULL) {
		addr = &ns->addr[0].ip;
398
	} else {
399 400 401 402 403 404 405
		assert(false && "kr_nsrep_update_rtt: don't know what address to update");
		return kr_error(EINVAL);
	}
	const char *addr_in = kr_inaddr(addr);
	size_t addr_len = kr_inaddr_len(addr);
	if (!addr_in || addr_len <= 0) {
		assert(false && "kr_nsrep_update_rtt: incorrect address");
406
		return kr_error(EINVAL);
407
	}
408

409
	bool is_new_entry = false;
410 411
	kr_nsrep_rtt_lru_entry_t  *cur = lru_get_new(cache, addr_in, addr_len,
						     (&is_new_entry));
412
	if (!cur) {
413
		return kr_ok();
414
	}
415 416
	if (score <= KR_NS_GLUED) {
		score = KR_NS_GLUED + 1;
417
	}
418 419 420 421
	/* If there's nothing to update, we reset it unless KR_NS_UPDATE_NORESET
	 * mode was requested.  New items are zeroed by LRU automatically. */
	if (is_new_entry && umode != KR_NS_UPDATE_NORESET) {
		umode = KR_NS_RESET;
422
	}
423
	unsigned new_score = 0;
424 425
	/* Update score, by default smooth over last two measurements. */
	switch (umode) {
426 427
	case KR_NS_UPDATE:
	case KR_NS_UPDATE_NORESET:
428
		new_score = (cur->score + score) / 2; break;
429
	case KR_NS_RESET:  new_score = score; break;
430 431
	case KR_NS_ADD:    new_score = MIN(KR_NS_MAX_SCORE - 1, cur->score + score); break;
	case KR_NS_MAX:    new_score = MAX(cur->score, score); break;
432
	default:           return kr_error(EINVAL);
433
	}
434 435 436 437
	/* Score limits */
	if (new_score > KR_NS_MAX_SCORE) {
		new_score = KR_NS_MAX_SCORE;
	}
438 439 440 441 442
	if (new_score >= KR_NS_TIMEOUT && cur->score < KR_NS_TIMEOUT) {
		/* Set the timestamp only when NS became "timeouted" */
		cur->tout_timestamp = kr_now();
	}
	cur->score = new_score;
443 444
	return kr_ok();
}
445 446 447 448 449 450 451 452 453 454

int kr_nsrep_update_rep(struct kr_nsrep *ns, unsigned reputation, kr_nsrep_lru_t *cache)
{
	if (!ns || !cache ) {
		return kr_error(EINVAL);
	}

	/* Store in the struct */
	ns->reputation = reputation;
	/* Store reputation in the LRU cache */
455 456
	unsigned *cur = lru_get_new(cache, (const char *)ns->name,
				    knot_dname_size(ns->name), NULL);
457 458
	if (cur) {
		*cur = reputation;
459 460
	}
	return kr_ok();
461
}
462 463 464 465 466 467 468 469 470 471 472 473 474 475

int kr_nsrep_copy_set(struct kr_nsrep *dst, const struct kr_nsrep *src)
{
	if (!dst || !src ) {
		return kr_error(EINVAL);
	}

	memcpy(dst, src, sizeof(struct kr_nsrep));
	dst->name = (const uint8_t *)"";
	dst->score = KR_NS_UNKNOWN;
	dst->reputation = 0;

	return kr_ok();
}
476

477
int kr_nsrep_sort(struct kr_nsrep *ns, struct kr_context *ctx)
478
{
479
	if (!ns || !ctx) {
480 481 482
		assert(false);
		return kr_error(EINVAL);
	}
483

484 485 486 487 488
	kr_nsrep_rtt_lru_t *rtt_cache = ctx->cache_rtt;

	ns->reputation = 0;
	ns->score = KR_NS_MAX_SCORE + 1;

489 490 491 492
	if (ns->addr[0].ip.sa_family == AF_UNSPEC) {
		return kr_error(EINVAL);
	}

493 494 495 496
	/* Compute the scores.  Unfortunately there's no space for scores
	 * along the addresses. */
	unsigned scores[KR_NSREP_MAXADDR];
	int i;
497
	bool timeouted_address_is_already_selected = false;
498 499 500 501 502
	for (i = 0; i < KR_NSREP_MAXADDR; ++i) {
		const struct sockaddr *sa = &ns->addr[i].ip;
		if (sa->sa_family == AF_UNSPEC) {
			break;
		}
503 504 505 506
		kr_nsrep_rtt_lru_entry_t *rtt_cache_entry = lru_get_try(rtt_cache,
									kr_inaddr(sa),
									kr_family_len(sa->sa_family));
		if (!rtt_cache_entry) {
507
			scores[i] = 1; /* prefer unknown to probe RTT */
508 509 510 511 512 513 514 515
		} else if (rtt_cache_entry->score < KR_NS_FWD_TIMEOUT) {
			/* some probability to bump bad ones up for re-probe */
			scores[i] = rtt_cache_entry->score;
			/* The lower the rtt, the more likely it will be selected. */
			if (!kr_rand_coin(rtt_cache_entry->score, KR_NS_FWD_TIMEOUT)) {
				scores[i] = 1;
			}
		} else {
516 517 518 519 520 521 522 523 524 525
			uint64_t now = kr_now();
			uint64_t elapsed = now - rtt_cache_entry->tout_timestamp;
			scores[i] = KR_NS_MAX_SCORE + 1;
			elapsed = elapsed > UINT_MAX ? UINT_MAX : elapsed;
			if (elapsed > ctx->cache_rtt_tout_retry_interval &&
			    !timeouted_address_is_already_selected) {
				scores[i] = 1;
				rtt_cache_entry->tout_timestamp = now;
				timeouted_address_is_already_selected = true;
			}
526
		}
527 528 529 530 531 532

		/* Give advantage to IPv6. */
		if (scores[i] <= KR_NS_MAX_SCORE && sa->sa_family == AF_INET) {
			scores[i] += FAVOUR_IPV6;
		}

533
		if (VERBOSE_STATUS) {
534
			kr_log_verbose("[     ][nsre] score %d for %s;\t cached RTT: %d\n",
535
					scores[i], kr_straddr(sa),
536
					rtt_cache_entry ? rtt_cache_entry->score : -1);
537
		}
538 539
	}

540 541 542 543 544 545 546 547
	/* Select-sort the addresses. */
	const int count = i;
	for (i = 0; i < count - 1; ++i) {
		/* find min from i onwards */
		int min_i = i;
		for (int j = i + 1; j < count; ++j) {
			if (scores[j] < scores[min_i]) {
				min_i = j;
548 549
			}
		}
550 551 552 553 554 555
		/* swap the indices */
		if (min_i != i) {
			SWAP(scores[min_i], scores[i]);
			SWAP(ns->addr[min_i], ns->addr[i]);
		}
	}
556

557 558 559 560 561
	if (count > 0) {
		ns->score = scores[0];
		ns->reputation = 0;
	}

562 563
	return kr_ok();
}