rrl.c 14.5 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
/*  Copyright (C) 2011 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

17
#include <config.h>
18
#include <time.h>
19 20
#include <unistd.h>
#include <sys/types.h>
21
#include <sys/socket.h>
22
#include <assert.h>
23 24

#include "knot/server/rrl.h"
25
#include "knot/knot.h"
26
#include "libknot/consts.h"
27
#include "libknot/packet/wire.h"
28
#include "common/hattrie/murmurhash3.h"
29
#include "libknot/dnssec/random.h"
30
#include "common/descriptor.h"
31
#include "common/errors.h"
32
#include "knot/zone/zone.h"
33

34 35
/* Hopscotch defines. */
#define HOP_LEN (sizeof(unsigned)*8)
36 37 38
/* Limits */
#define RRL_CLSBLK_MAXLEN (4 + 8 + 1 + 256)
/* CIDR block prefix lengths for v4/v6 */
39 40
#define RRL_V4_PREFIX ((uint32_t)0x00ffffff)         /* /24 */
#define RRL_V6_PREFIX ((uint64_t)0x00ffffffffffffff) /* /56 */
41 42
/* Defaults */
#define RRL_DEFAULT_RATE 100
43
#define RRL_CAPACITY 4 /* N seconds. */
44
#define RRL_SSTART 2 /* 1/Nth of the rate for slow start */
45
#define RRL_PSIZE_LARGE 1024
46 47
/* Enable RRL logging. */
#define RRL_ENABLE_LOG
48 49 50 51 52 53 54

/* Classification */
enum {
	CLS_NULL     = 0 << 0, /* Empty bucket. */
	CLS_NORMAL   = 1 << 0, /* Normal response. */
	CLS_ERROR    = 1 << 1, /* Error response. */
	CLS_NXDOMAIN = 1 << 2, /* NXDOMAIN (special case of error). */
55
	CLS_EMPTY    = 1 << 3, /* Empty response. */
56
	CLS_LARGE    = 1 << 4, /* Response size over threshold (1024k). */
57 58 59
	CLS_WILDCARD = 1 << 5, /* Wildcard query. */
	CLS_ANY      = 1 << 6, /* ANY query (spec. class). */
	CLS_DNSSEC   = 1 << 7  /* DNSSEC related RR query (spec. class) */
60 61
};

62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79
/* Classification string. */
const error_table_t rrl_clsstr_tbl[] = {
        {CLS_NORMAL,  "POSITIVE" },
        {CLS_ERROR,   "ERROR" },
        {CLS_NXDOMAIN,"NXDOMAIN"},
        {CLS_EMPTY,   "EMPTY"},
        {CLS_LARGE,   "LARGE"},
        {CLS_WILDCARD,"WILDCARD"},
        {CLS_ANY,     "ANY"},
        {CLS_DNSSEC,  "DNSSEC"},
        {CLS_NULL,    "NULL"},
        {CLS_NULL,    NULL}
};
static inline const char *rrl_clsstr(int code)
{
	return error_to_str(rrl_clsstr_tbl, code);
}

80 81 82 83 84 85 86 87 88
/* Bucket flags. */
enum {
	RRL_BF_NULL   = 0 << 0, /* No flags. */
	RRL_BF_SSTART = 1 << 0, /* Bucket in slow-start after collision. */
	RRL_BF_ELIMIT = 1 << 1  /* Bucket is rate-limited. */
};

static uint8_t rrl_clsid(rrl_req_t *p)
{
89 90
	/* Check error code */
	int ret = CLS_NULL;
91
	switch (knot_wire_get_rcode(p->w)) {
92 93 94 95 96 97 98 99 100
	case KNOT_RCODE_NOERROR: ret = CLS_NORMAL; break;
	case KNOT_RCODE_NXDOMAIN: return CLS_NXDOMAIN; break;
	default: return CLS_ERROR; break;
	}

	/* Check if answered from a qname */
	if (ret == CLS_NORMAL && p->flags & KNOT_PF_WILDCARD) {
		return CLS_WILDCARD;
	}
Jan Včelák's avatar
Jan Včelák committed
101

102
	/* Check query type for spec. classes. */
103
	if (p->query) {
104
		switch(knot_pkt_qtype(p->query)) {
105 106 107 108 109 110 111 112 113 114 115 116
		case KNOT_RRTYPE_ANY:      /* ANY spec. class */
			return CLS_ANY;
			break;
		case KNOT_RRTYPE_DNSKEY:
		case KNOT_RRTYPE_RRSIG:
		case KNOT_RRTYPE_DS:      /* DNSSEC-related RR class. */
			return CLS_DNSSEC;
			break;
		default:
			break;
		}
	}
117

118
	/* Check packet size for threshold. */
119
	if (p->len >= RRL_PSIZE_LARGE) {
120 121 122 123
		return CLS_LARGE;
	}

	/* Check ancount */
124
	if (knot_wire_get_ancount(p->w) == 0) {
125 126 127 128
		return CLS_EMPTY;
	}

	return ret;
129 130
}

131
static int rrl_clsname(char *dst, size_t maxlen, uint8_t cls,
132
                       rrl_req_t *req, const zone_t *zone)
133
{
134 135 136 137
	/* Fallback zone (for errors etc.) */
	const knot_dname_t *dn = (const knot_dname_t*)"\x00";

	/* Found associated zone. */
138 139 140
	if (zone != NULL) {
		dn = zone->name;
	}
141

142 143 144 145 146 147 148
	switch (cls) {
	case CLS_ERROR:    /* Could be a non-existent zone or garbage. */
	case CLS_NXDOMAIN: /* Queries to non-existent names in zone. */
	case CLS_WILDCARD: /* Queries to names covered by a wildcard. */
		dbg_rrl_verb("%s: using zone/fallback name\n", __func__);
		break;
	default:
149 150
		/* Use QNAME */
		if (req->query)
151
			dn = knot_pkt_qname(req->query);
152 153
		break;
	}
Jan Včelák's avatar
Jan Včelák committed
154

155
	/* Write to wire */
156
	return knot_dname_to_wire((uint8_t *)dst, dn, maxlen);
157 158
}

159
static int rrl_classify(char *dst, size_t maxlen, const struct sockaddr_storage *a,
160
                        rrl_req_t *p, const zone_t *z, uint32_t seed)
161
{
162
	if (!dst || !p || !a || maxlen == 0) {
163 164
		return KNOT_EINVAL;
	}
Jan Včelák's avatar
Jan Včelák committed
165

166 167 168
	/* Class */
	uint8_t cls = rrl_clsid(p);
	*dst = cls;
169
	int blklen = sizeof(cls);
Jan Včelák's avatar
Jan Včelák committed
170

171
	/* Address (in network byteorder, adjust masks). */
172
	uint64_t nb = 0;
173 174 175 176 177 178
	if (a->ss_family == AF_INET6) {
		struct sockaddr_in6 *ipv6 = (struct sockaddr_in6 *)a;
		nb = *((uint64_t*)(&ipv6->sin6_addr)) & RRL_V6_PREFIX;
	} else {
		struct sockaddr_in *ipv4 = (struct sockaddr_in *)a;
		nb = ((uint32_t)ipv4->sin_addr.s_addr) & RRL_V4_PREFIX;
179
	}
180
	if (blklen + sizeof(nb) > maxlen) return KNOT_ESPACE;
181 182 183
	memcpy(dst + blklen, (void*)&nb, sizeof(nb));
	blklen += sizeof(nb);

184
	/* Name */
185 186
	uint16_t *nlen = (uint16_t*)(dst + blklen);
	blklen += sizeof(uint16_t);
187
	int len = rrl_clsname(dst + blklen, maxlen - blklen, cls, p, z);
188 189
	if (len < 0)
		return len;
190
	*nlen = len;
191
	blklen += len;
Jan Včelák's avatar
Jan Včelák committed
192

193
	/* Seed. */
194 195 196
	if (blklen + sizeof(seed) > maxlen) return KNOT_ESPACE;
	if (memcpy(dst + blklen, (void*)&seed, sizeof(seed)) == 0) {
		blklen += sizeof(seed);
197
	}
Jan Včelák's avatar
Jan Včelák committed
198

199 200 201
	return blklen;
}

202 203 204 205
static int bucket_free(rrl_item_t *b, uint32_t now) {
	return b->cls == CLS_NULL || (b->time + 1 < now);
}

206
static int bucket_match(rrl_item_t *b, rrl_item_t *m)
207
{
208 209 210
	return b->cls    == m->cls &&
	       b->netblk == m->netblk &&
	       b->qname  == m->qname;
211
}
212

213
static int find_free(rrl_table_t *t, unsigned i, uint32_t now)
214 215 216 217 218 219 220
{
	rrl_item_t *np = t->arr + t->size;
	rrl_item_t *b = NULL;
	for (b = t->arr + i; b != np; ++b) {
		if (bucket_free(b, now)) {
			return b - (t->arr + i);
		}
221
	}
222 223 224 225
	np = t->arr + i;
	for (b = t->arr; b != np; ++b) {
		if (bucket_free(b, now)) {
			return (b - t->arr) + (t->size - i);
226
		}
227
	}
Jan Včelák's avatar
Jan Včelák committed
228

229 230 231
	/* this happens if table is full... force vacate current elm */
	dbg_rrl("%s: out of free buckets, freeing bucket %u\n", __func__, i);
	return i;
232
}
233

234 235 236 237 238 239 240 241 242 243 244 245 246
static inline unsigned find_match(rrl_table_t *t, uint32_t id, rrl_item_t *m)
{
	unsigned f = 0;
	unsigned d = 0;
	unsigned match = t->arr[id].hop;
	while (match != 0) {
		d = __builtin_ctz(match);
		f = (id + d) % t->size;
		if (bucket_match(t->arr + f, m)) {
			return d;
		} else {
			match &= ~(1<<d); /* clear potential match */
		}
247
	}
248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267

	return HOP_LEN + 1;
}

static inline unsigned reduce_dist(rrl_table_t *t, unsigned id, unsigned d, unsigned *f)
{
	unsigned rd = HOP_LEN - 1;
	while (rd > 0) {
		unsigned s = (t->size + *f - rd) % t->size; /* bucket to be vacated */
		unsigned o = __builtin_ctz(t->arr[s].hop); /* offset of first valid bucket */
		if (t->arr[s].hop != 0 && o < rd) {        /* only offsets in <s, f> are interesting */
			unsigned e = (s + o) % t->size;    /* this item will be displaced to [f] */
			unsigned keep_hop = t->arr[*f].hop; /* unpredictable padding */
			memcpy(t->arr + *f, t->arr + e, sizeof(rrl_item_t));
			t->arr[*f].hop = keep_hop;
			t->arr[e].cls = CLS_NULL;
			t->arr[s].hop &= ~(1<<o);
			t->arr[s].hop |= 1<<rd;
			*f = e;
			return d - (rd - o);
268
		}
269
		--rd;
270
	}
Jan Včelák's avatar
Jan Včelák committed
271

272 273 274 275 276
	assert(rd == 0); /* this happens with p=1/fact(HOP_LEN) */
	*f = id;
	d = 0; /* force vacate initial element */
	dbg_rrl("%s: no potential relocation, freeing bucket %u\n", __func__, id);
	return d;
277 278
}

279
static void rrl_log_state(const struct sockaddr_storage *ss, uint16_t flags, uint8_t cls)
280 281
{
#ifdef RRL_ENABLE_LOG
282 283 284
	char addr_str[SOCKADDR_STRLEN] = {0};
	sockaddr_tostr(ss, addr_str, sizeof(addr_str));

285 286 287 288
	const char *what = "leaves";
	if (flags & RRL_BF_ELIMIT) {
		what = "enters";
	}
Jan Včelák's avatar
Jan Včelák committed
289

290
	log_server_notice("Address '%s' %s rate-limiting (class '%s').\n",
291
	                  addr_str, what, rrl_clsstr(cls));
292 293 294
#endif
}

295 296
rrl_table_t *rrl_create(size_t size)
{
297 298 299
	if (size == 0) {
		return NULL;
	}
Jan Včelák's avatar
Jan Včelák committed
300

301 302 303
	const size_t tbl_len = sizeof(rrl_table_t) + size * sizeof(rrl_item_t);
	rrl_table_t *t = malloc(tbl_len);
	if (!t) return NULL;
304
	memset(t, 0, sizeof(rrl_table_t));
305
	t->size = size;
306
	rrl_reseed(t);
307
	dbg_rrl("%s: created table size '%zu'\n", __func__, t->size);
308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324
	return t;
}

uint32_t rrl_setrate(rrl_table_t *rrl, uint32_t rate)
{
	if (!rrl) return 0;
	uint32_t old = rrl->rate;
	rrl->rate = rate;
	return old;
}

uint32_t rrl_rate(rrl_table_t *rrl)
{
	if (!rrl) return 0;
	return rrl->rate;
}

325
int rrl_setlocks(rrl_table_t *rrl, unsigned granularity)
326 327 328
{
	if (!rrl) return KNOT_EINVAL;
	assert(!rrl->lk); /* Cannot change while locks are used. */
329
	assert(granularity <= rrl->size / 10); /* Due to int. division err. */
Jan Včelák's avatar
Jan Včelák committed
330

331 332 333
	if (pthread_mutex_init(&rrl->ll, NULL) < 0) {
		return KNOT_ENOMEM;
	}
Jan Včelák's avatar
Jan Včelák committed
334

335
	/* Alloc new locks. */
336
	rrl->lk = malloc(granularity * sizeof(pthread_mutex_t));
337
	if (!rrl->lk) return KNOT_ENOMEM;
338
	memset(rrl->lk, 0, granularity * sizeof(pthread_mutex_t));
Jan Včelák's avatar
Jan Včelák committed
339

340 341
	/* Initialize. */
	for (size_t i = 0; i < granularity; ++i) {
342
		if (pthread_mutex_init(rrl->lk + i, NULL) < 0) break;
343 344
		++rrl->lk_count;
	}
345

346 347 348
	/* Incomplete initialization */
	if (rrl->lk_count != granularity) {
		for (size_t i = 0; i < rrl->lk_count; ++i) {
349
			pthread_mutex_destroy(rrl->lk + i);
350 351 352 353 354 355
		}
		free(rrl->lk);
		rrl->lk_count = 0;
		dbg_rrl("%s: failed to init locks\n", __func__);
		return KNOT_ERROR;
	}
Jan Včelák's avatar
Jan Včelák committed
356

357
	dbg_rrl("%s: set granularity to '%u'\n", __func__, granularity);
358 359 360
	return KNOT_EOK;
}

361
rrl_item_t* rrl_hash(rrl_table_t *t, const struct sockaddr_storage *a, rrl_req_t *p,
362
                     const zone_t *zone, uint32_t stamp, int *lock)
363 364 365 366 367 368
{
	char buf[RRL_CLSBLK_MAXLEN];
	int len = rrl_classify(buf, sizeof(buf), a, p, zone, t->seed);
	if (len < 0) {
		return NULL;
	}
Jan Včelák's avatar
Jan Včelák committed
369

370
	uint32_t id = hash(buf, len) % t->size;
Jan Včelák's avatar
Jan Včelák committed
371

372 373 374 375 376 377 378 379 380 381 382 383 384 385
	/* Lock for lookup. */
	pthread_mutex_lock(&t->ll);

	/* Find an exact match in <id, id + HOP_LEN). */
	uint16_t *qname = (uint16_t*)(buf + sizeof(uint8_t) + sizeof(uint64_t));
	rrl_item_t match = {
	        0, *((uint64_t*)(buf + 1)), t->rate,    /* hop, netblk, ntok */
	        buf[0], RRL_BF_NULL,                    /* cls, flags */
	        hash((char*)(qname + 1), *qname), stamp /* qname, time*/
	};

	unsigned d = find_match(t, id, &match);
	if (d > HOP_LEN) { /* not an exact match, find free element [f] */
		d = find_free(t, id, stamp);
386
	}
Jan Včelák's avatar
Jan Včelák committed
387

388 389
	/* Reduce distance to fit <id, id + HOP_LEN) */
	unsigned f = (id + d) % t->size;
390
	while (d >= HOP_LEN) {
391
		d = reduce_dist(t, id, d, &f);
392
	}
Jan Včelák's avatar
Jan Včelák committed
393

394 395 396 397 398
	/* Assign granular lock and unlock lookup. */
	*lock = f % t->lk_count;
	rrl_lock(t, *lock);
	pthread_mutex_unlock(&t->ll);

399 400 401 402
	/* found free elm 'k' which is in <id, id + HOP_LEN) */
	t->arr[id].hop |= (1 << d);
	rrl_item_t* b = t->arr + f;
	assert(f == (id+d) % t->size);
403
	dbg_rrl("%s: classified pkt as %4x '%u+%u' bucket=%p \n", __func__, f, id, d, b);
404 405

	/* Inspect bucket state. */
406
	unsigned hop = b->hop;
407
	if (b->cls == CLS_NULL) {
408 409
		memcpy(b, &match, sizeof(rrl_item_t));
		b->hop = hop;
410 411
	}
	/* Check for collisions. */
412 413
	if (!bucket_match(b, &match)) {
		dbg_rrl("%s: collision in bucket '%4x'\n", __func__, id);
414
		if (!(b->flags & RRL_BF_SSTART)) {
415 416
			memcpy(b, &match, sizeof(rrl_item_t));
			b->hop = hop;
417 418
			b->ntok = t->rate + t->rate / RRL_SSTART;
			b->flags |= RRL_BF_SSTART;
419
			dbg_rrl("%s: bucket '%4x' slow-start\n", __func__, id);
420 421
		}
	}
Jan Včelák's avatar
Jan Včelák committed
422

423 424 425
	return b;
}

426
int rrl_query(rrl_table_t *rrl, const struct sockaddr_storage *a, rrl_req_t *req,
427
              const zone_t *zone)
428
{
429
	if (!rrl || !req || !a) return KNOT_EINVAL;
Jan Včelák's avatar
Jan Včelák committed
430

431 432
	/* Calculate hash and fetch */
	int ret = KNOT_EOK;
433
	int lock = -1;
434
	uint32_t now = time(NULL);
435
	rrl_item_t *b = rrl_hash(rrl, a, req, zone, now, &lock);
436 437
	if (!b) {
		dbg_rrl("%s: failed to compute bucket from packet\n", __func__);
438
		if (lock > -1) rrl_unlock(rrl, lock);
439 440
		return KNOT_ERROR;
	}
Jan Včelák's avatar
Jan Včelák committed
441

442 443 444 445 446
	/* Calculate rate for dT */
	uint32_t dt = now - b->time;
	if (dt > RRL_CAPACITY) {
		dt = RRL_CAPACITY;
	}
447 448
	/* Visit bucket. */
	b->time = now;
449
	dbg_rrl("%s: bucket=0x%x tokens=%hu flags=%x dt=%u\n",
450
	        __func__, (unsigned)(b - rrl->arr), b->ntok, b->flags, dt);
451
	if (dt > 0) { /* Window moved. */
452 453 454 455 456 457

		/* Check state change. */
		if ((b->ntok > 0 || dt > 1) && (b->flags & RRL_BF_ELIMIT)) {
			b->flags &= ~RRL_BF_ELIMIT;
			rrl_log_state(a, b->flags, b->cls);
		}
Jan Včelák's avatar
Jan Včelák committed
458

459 460
		/* Add new tokens. */
		uint32_t dn = rrl->rate * dt;
461 462
		if (b->flags & RRL_BF_SSTART) { /* Bucket in slow-start. */
			b->flags &= ~RRL_BF_SSTART;
463
			dbg_rrl("%s: bucket '0x%x' slow-start finished\n",
464
			        __func__, (unsigned)(b - rrl->arr));
465
		}
466 467 468
		b->ntok += dn;
		if (b->ntok > RRL_CAPACITY * rrl->rate) {
			b->ntok = RRL_CAPACITY * rrl->rate;
469
		}
470
	}
Jan Včelák's avatar
Jan Včelák committed
471

472 473 474 475
	/* Last item taken. */
	if (b->ntok == 1 && !(b->flags & RRL_BF_ELIMIT)) {
		b->flags |= RRL_BF_ELIMIT;
		rrl_log_state(a, b->flags, b->cls);
476
	}
477 478 479 480 481 482 483

	/* Decay current bucket. */
	if (b->ntok > 0) {
		--b->ntok;
	} else if (b->ntok == 0) {
		ret = KNOT_ELIMIT;
	}
Jan Včelák's avatar
Jan Včelák committed
484

485
	if (lock > -1) rrl_unlock(rrl, lock);
486 487 488
	return ret;
}

489 490 491 492 493 494 495
bool rrl_slip_roll(int n_slip)
{
	/* Now n_slip means every Nth answer slips.
	 * That represents a chance of 1/N that answer slips.
	 * Therefore, on average, from 100 answers 100/N will slip. */
	int threshold = RRL_SLIP_MAX / n_slip;
	int roll = knot_random_uint16_t() % RRL_SLIP_MAX;
496
	return (roll < threshold);
497 498
}

499 500
int rrl_destroy(rrl_table_t *rrl)
{
501 502
	if (rrl) {
		dbg_rrl("%s: freeing table %p\n", __func__, rrl);
503
		if (rrl->lk_count > 0) pthread_mutex_destroy(&rrl->ll);
504
		for (size_t i = 0; i < rrl->lk_count; ++i) {
505
			pthread_mutex_destroy(rrl->lk + i);
506 507 508
		}
		free(rrl->lk);
	}
Jan Včelák's avatar
Jan Včelák committed
509

510 511 512
	free(rrl);
	return KNOT_EOK;
}
513 514 515 516 517 518 519 520 521 522

int rrl_reseed(rrl_table_t *rrl)
{
	/* Lock entire table. */
	if (rrl->lk_count > 0) {
		pthread_mutex_lock(&rrl->ll);
		for (unsigned i = 0; i < rrl->lk_count; ++i) {
			rrl_lock(rrl, i);
		}
	}
Jan Včelák's avatar
Jan Včelák committed
523

524
	memset(rrl->arr, 0, rrl->size * sizeof(rrl_item_t));
525
	rrl->seed = knot_random_uint32_t();
526
	dbg_rrl("%s: reseed to '%u'\n", __func__, rrl->seed);
Jan Včelák's avatar
Jan Včelák committed
527

528 529 530 531 532 533
	if (rrl->lk_count > 0) {
		for (unsigned i = 0; i < rrl->lk_count; ++i) {
			rrl_unlock(rrl, i);
		}
		pthread_mutex_unlock(&rrl->ll);
	}
Jan Včelák's avatar
Jan Včelák committed
534

535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556
	return KNOT_EOK;
}

int rrl_lock(rrl_table_t *t, int lk_id)
{
	assert(lk_id > -1);
	dbg_rrl_verb("%s: locking id '%d'\n", __func__, lk_id);
	if (pthread_mutex_lock(t->lk + lk_id) != 0) {
		return KNOT_ERROR;
	}
	return KNOT_EOK;
}

int rrl_unlock(rrl_table_t *t, int lk_id)
{
	assert(lk_id > -1);
	dbg_rrl_verb("%s: unlocking id '%d'\n", __func__, lk_id);
	if (pthread_mutex_unlock(t->lk + lk_id)!= 0) {
		return KNOT_ERROR;
	}
	return KNOT_EOK;
}