/*  Copyright (C) 2017 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

17
#include <unistd.h>
Marek Vavrusa's avatar
Marek Vavrusa committed
18
#include <fcntl.h>
19
#include <errno.h>
20 21 22
#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>
23 24
#include <netinet/tcp.h>
#include <netinet/in.h>
25
#include <stdio.h>
26
#include <stdlib.h>
27
#include <urcu.h>
28 29 30
#ifdef HAVE_SYS_UIO_H			// struct iovec (OpenBSD)
#include <sys/uio.h>
#endif // HAVE_SYS_UIO_H
31 32 33
#ifdef HAVE_CAP_NG_H
#include <cap-ng.h>
#endif /* HAVE_CAP_NG_H */
34

35
#include "dnssec/random.h"
36
#include "knot/server/server.h"
Daniel Salzman's avatar
Daniel Salzman committed
37
#include "knot/server/tcp-handler.h"
38
#include "knot/common/fdset.h"
39
#include "knot/common/log.h"
Daniel Salzman's avatar
Daniel Salzman committed
40
#include "knot/nameserver/process_query.h"
41
#include "knot/query/layer.h"
42
#include "contrib/macros.h"
Daniel Salzman's avatar
Daniel Salzman committed
43
#include "contrib/mempattern.h"
44 45
#include "contrib/net.h"
#include "contrib/sockaddr.h"
46
#include "contrib/time.h"
47
#include "contrib/ucw/mempool.h"
Marek Vavrusa's avatar
Marek Vavrusa committed
48

49
/*! \brief TCP context data. */
50
typedef struct tcp_context {
Jan Včelák's avatar
Jan Včelák committed
51 52 53 54 55 56 57 58
	knot_layer_t layer;              /*!< Query processing layer. */
	server_t *server;                /*!< Name server structure. */
	struct iovec iov[2];             /*!< TX/RX buffers. */
	unsigned client_threshold;       /*!< Index of first TCP client. */
	struct timespec last_poll_time;  /*!< Time of the last socket poll. */
	struct timespec throttle_end;    /*!< End of accept() throttling. */
	fdset_t set;                     /*!< Set of server/client sockets. */
	unsigned thread_id;              /*!< Thread identifier. */
59
} tcp_context_t;
60

/*
 * Accept throttling limits (seconds).
 */
#define TCP_THROTTLE_LO 0 /*!< Minimum recovery time on errors, in seconds; base added to the random part. */
#define TCP_THROTTLE_HI 2 /*!< Maximum recovery time on errors, in seconds; used as the modulus of the random part, so the random part stays below it. */
66 67

/*! \brief Calculate TCP throttle time (random). */
68
static inline int tcp_throttle(void) {
69
	return TCP_THROTTLE_LO + (dnssec_random_uint16_t() % TCP_THROTTLE_HI);
70
}
71

72
/*! \brief Sweep TCP connection. */
73
static enum fdset_sweep_state tcp_sweep(fdset_t *set, int i, void *data)
74
{
75
	UNUSED(data);
76 77
	assert(set && i < set->n && i >= 0);
	int fd = set->pfd[i].fd;
Jan Včelák's avatar
Jan Včelák committed
78

79
	/* Best-effort, name and shame. */
80 81
	struct sockaddr_storage ss;
	socklen_t len = sizeof(struct sockaddr_storage);
82 83
	if (getpeername(fd, (struct sockaddr*)&ss, &len) == 0) {
		char addr_str[SOCKADDR_STRLEN] = {0};
84
		sockaddr_tostr(addr_str, sizeof(addr_str), (struct sockaddr *)&ss);
85
		log_notice("TCP, terminated inactive client, address %s", addr_str);
86
	}
87

88
	close(fd);
89

90
	return FDSET_SWEEP;
91 92
}

93 94 95 96 97 98 99 100 101 102
/*!
 * \brief Check whether the answering loop should call produce again.
 *
 * \param state  Current state of the processing layer.
 *
 * \return True for KNOT_STATE_PRODUCE and KNOT_STATE_FAIL, false otherwise.
 */
static bool tcp_active_state(int state)
{
	switch (state) {
	case KNOT_STATE_PRODUCE:
	case KNOT_STATE_FAIL:
		return true;
	default:
		return false;
	}
}

/*!
 * \brief Check whether a produced answer may be sent to the client.
 *
 * \param state  State of the processing layer after produce.
 *
 * \return False for KNOT_STATE_FAIL and KNOT_STATE_NOOP, true otherwise.
 */
static bool tcp_send_state(int state)
{
	const bool suppressed = (state == KNOT_STATE_FAIL ||
	                         state == KNOT_STATE_NOOP);

	return !suppressed;
}

103 104 105
/*!
 * \brief TCP event handler function.
 */
106
static int tcp_handle(tcp_context_t *tcp, int fd,
107
                      struct iovec *rx, struct iovec *tx)
108
{
109
	/* Create query processing parameter. */
110
	struct sockaddr_storage ss = { 0 };
111
	knotd_qdata_params_t params = {
112 113 114 115 116 117
		.remote = &ss,
		.socket = fd,
		.server = tcp->server,
		.thread_id = tcp->thread_id
	};

118 119 120
	rx->iov_len = KNOT_WIRE_MAX_PKTSIZE;
	tx->iov_len = KNOT_WIRE_MAX_PKTSIZE;

121 122 123 124 125 126 127
	/* Receive peer name. */
	socklen_t addrlen = sizeof(struct sockaddr_storage);
	if (getpeername(fd, (struct sockaddr *)&ss, &addrlen) < 0) {
		;
	}

	/* Timeout. */
128
	rcu_read_lock();
Filip Siroky's avatar
Filip Siroky committed
129
	int timeout = 1000 * conf()->cache.srv_tcp_reply_timeout;
130
	rcu_read_unlock();
131

132
	/* Receive data. */
133
	int ret = net_dns_tcp_recv(fd, rx->iov_base, rx->iov_len, timeout);
134 135
	if (ret <= 0) {
		if (ret == KNOT_EAGAIN) {
136
			char addr_str[SOCKADDR_STRLEN] = {0};
137
			sockaddr_tostr(addr_str, sizeof(addr_str), (struct sockaddr *)&ss);
138
			log_warning("TCP, connection timed out, address %s",
139
			            addr_str);
140
		}
Marek Vavrusa's avatar
Marek Vavrusa committed
141
		return KNOT_ECONNREFUSED;
142 143
	} else {
		rx->iov_len = ret;
144
	}
145

146
	/* Initialize processing layer. */
147
	knot_layer_begin(&tcp->layer, &params);
148 149

	/* Create packets. */
150 151
	knot_pkt_t *ans = knot_pkt_new(tx->iov_base, tx->iov_len, tcp->layer.mm);
	knot_pkt_t *query = knot_pkt_new(rx->iov_base, rx->iov_len, tcp->layer.mm);
152

153
	/* Input packet. */
154
	(void) knot_pkt_parse(query, 0);
155
	knot_layer_consume(&tcp->layer, query);
156

157
	/* Resolve until NOOP or finished. */
158
	ret = KNOT_EOK;
159 160
	while (tcp_active_state(tcp->layer.state)) {
		knot_layer_produce(&tcp->layer, ans);
161
		/* Send, if response generation passed and wasn't ignored. */
162
		if (ans->size > 0 && tcp_send_state(tcp->layer.state)) {
163
			if (net_dns_tcp_send(fd, ans->wire, ans->size, timeout) != ans->size) {
164
				ret = KNOT_ECONNREFUSED;
165
				break;
166
			}
Marek Vavrusa's avatar
Marek Vavrusa committed
167 168 169
		}
	}

Marek Vavrusa's avatar
Marek Vavrusa committed
170
	/* Reset after processing. */
171
	knot_layer_finish(&tcp->layer);
172 173 174 175

	/* Cleanup. */
	knot_pkt_free(&query);
	knot_pkt_free(&ans);
Marek Vavrusa's avatar
Marek Vavrusa committed
176

177
	return ret;
178 179
}

180
int tcp_accept(int fd)
181
{
182
	/* Accept incoming connection. */
183
	int incoming = net_accept(fd, NULL);
184

185
	/* Evaluate connection. */
186
	if (incoming >= 0) {
187 188
#ifdef SO_RCVTIMEO
		struct timeval tv;
189
		rcu_read_lock();
Filip Siroky's avatar
Filip Siroky committed
190
		tv.tv_sec = conf()->cache.srv_tcp_idle_timeout;
191
		rcu_read_unlock();
192
		tv.tv_usec = 0;
193
		if (setsockopt(incoming, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) < 0) {
194 195
			log_warning("TCP, failed to set up watchdog timer"
			            ", fd %d", incoming);
196
		}
197
#endif
Marek Vavrusa's avatar
Marek Vavrusa committed
198
	}
199

200
	return incoming;
201 202
}

203
/*!
 * \brief Accept a client on a listening socket and add it to the set.
 *
 * The new entry gets the configured handshake timeout as its watchdog.
 *
 * \param tcp  TCP context.
 * \param i    Index of the listening socket within tcp->set.
 *
 * \return KNOT_EOK on success, the negative result of tcp_accept() if the
 *         accept failed, or the negative result of fdset_add() if the client
 *         could not be stored (the client socket is closed in that case).
 */
static int tcp_event_accept(tcp_context_t *tcp, unsigned i)
{
	/* Accept client. */
	int client = tcp_accept(tcp->set.pfd[i].fd);
	if (client < 0) {
		return client;
	}

	/* Assign to fdset. */
	int slot = fdset_add(&tcp->set, client, POLLIN, NULL);
	if (slot < 0) {
		close(client);
		return slot; /* Contains errno. */
	}

	/* Update watchdog timer. */
	rcu_read_lock();
	const int hshake_timeout = conf()->cache.srv_tcp_hshake_timeout;
	fdset_set_watchdog(&tcp->set, slot, hshake_timeout);
	rcu_read_unlock();

	return KNOT_EOK;
}
Jan Včelák's avatar
Jan Včelák committed
227

228 229 230
/*!
 * \brief Serve one request on a client socket.
 *
 * \param tcp  TCP context.
 * \param i    Index of the client socket within tcp->set.
 *
 * \return Result of tcp_handle(); on KNOT_EOK the entry's watchdog is
 *         re-armed with the configured idle timeout.
 */
static int tcp_event_serve(tcp_context_t *tcp, unsigned i)
{
	struct iovec *rx = &tcp->iov[0];
	struct iovec *tx = &tcp->iov[1];

	int ret = tcp_handle(tcp, tcp->set.pfd[i].fd, rx, tx);

	/* Flush per-query memory. */
	mp_flush(tcp->layer.mm->ctx);

	if (ret != KNOT_EOK) {
		return ret;
	}

	/* Update socket activity timer. */
	rcu_read_lock();
	const int idle_timeout = conf()->cache.srv_tcp_idle_timeout;
	fdset_set_watchdog(&tcp->set, i, idle_timeout);
	rcu_read_unlock();

	return KNOT_EOK;
}
Jan Včelák's avatar
Jan Včelák committed
246

247 248 249 250 251
/*!
 * \brief Poll the socket set once and dispatch the reported events.
 *
 * Entries below tcp->client_threshold are listening (master) sockets, the
 * remaining entries are client connections. Clients that report an error or
 * hang-up, or whose request fails, are removed from the set and closed.
 *
 * \param tcp  TCP context.
 *
 * \return Number of reported events left undispatched, or the non-positive
 *         result of poll() if no event was reported.
 */
static int tcp_wait_for_events(tcp_context_t *tcp)
{
	fdset_t *fds = &tcp->set;

	/* Wait for events, at most for one sweep interval. */
	int pending = poll(fds->pfd, fds->n, TCP_SWEEP_INTERVAL * 1000);

	/* Mark the time of last poll call. */
	tcp->last_poll_time = time_now();

	/* Accepting stays suspended until the throttle period is over. */
	bool is_throttled = true;
	if (tcp->last_poll_time.tv_sec >= tcp->throttle_end.tv_sec) {
		/* Configuration limit, infer maximal pool size. */
		rcu_read_lock();
		int clients = conf()->cache.srv_max_tcp_clients;
		unsigned max_per_set = MAX(clients / conf_tcp_threads(conf()), 1);
		rcu_read_unlock();
		/* Subtract master sockets check limits. */
		is_throttled = (fds->n - tcp->client_threshold) >= max_per_set;
	}

	/* Process events. */
	unsigned idx = 0;
	while (pending > 0 && idx < fds->n) {
		const int fd = fds->pfd[idx].fd;
		const short revents = fds->pfd[idx].revents;
		const bool is_client = (idx >= tcp->client_threshold);
		bool should_close = false;

		if (revents & (POLLERR|POLLHUP|POLLNVAL)) {
			/* Failed master sockets are kept in the set. */
			should_close = is_client;
			--pending;
		} else if (revents & (POLLIN)) {
			if (is_client) {
				/* Client sockets */
				should_close = (tcp_event_serve(tcp, idx) != KNOT_EOK);
			} else if (!is_throttled &&
			           tcp_event_accept(tcp, idx) == KNOT_EBUSY) {
				/* Master sockets: back off for a random period. */
				tcp->throttle_end = time_now();
				tcp->throttle_end.tv_sec += tcp_throttle();
			}
			--pending;
		}

		/* Evaluate: after a removal the same index is examined again. */
		if (should_close) {
			fdset_remove(fds, idx);
			close(fd);
		} else {
			++idx;
		}
	}

	return pending;
}
301

302
/*!
 * \brief TCP worker thread entry point.
 *
 * Builds a per-thread answering context and then loops: when ServerReload is
 * flagged, the client connections are dropped and the interface set is
 * re-read; sockets are polled and served; inactive clients are swept once per
 * TCP_SWEEP_INTERVAL. The loop ends when the thread is cancelled or when a
 * reload leaves no interfaces.
 *
 * \param thread  Thread descriptor; thread->data must point to the I/O handler.
 *
 * \retval KNOT_EOK     on regular termination.
 * \retval KNOT_EINVAL  if the thread or its data is missing.
 * \retval KNOT_ENOMEM  if the RX/TX buffers cannot be allocated.
 */
int tcp_master(dthread_t *thread)
{
	if (!thread || !thread->data) {
		return KNOT_EINVAL;
	}

	iohandler_t *handler = (iohandler_t *)thread->data;
	unsigned *iostate = &handler->thread_state[dt_get_id(thread)];

	int ret = KNOT_EOK;
	ref_t *ref = NULL;
	tcp_context_t tcp;
	memset(&tcp, 0, sizeof(tcp_context_t));

	/* Create big enough memory cushion. */
	knot_mm_t mm = { 0 };
	mm_ctx_mempool(&mm, 16 * MM_DEFAULT_BLKSIZE);

	/* Create TCP answering context. */
	tcp.server = handler->server;
	tcp.thread_id = handler->thread_id[dt_get_id(thread)];
	knot_layer_init(&tcp.layer, &mm, process_query_layer());

	/* Prepare structures for bound sockets. */
	conf_val_t val = conf_get(conf(), C_SRV, C_LISTEN);
	fdset_init(&tcp.set, conf_val_count(&val) + CONF_XFERS);

	/* Create iovec abstraction. */
	for (unsigned i = 0; i < 2; ++i) {
		tcp.iov[i].iov_len = KNOT_WIRE_MAX_PKTSIZE;
		tcp.iov[i].iov_base = malloc(tcp.iov[i].iov_len);
		if (tcp.iov[i].iov_base == NULL) {
			ret = KNOT_ENOMEM;
			goto finish;
		}
	}

	/* Initialize sweep interval. */
	struct timespec next_sweep = time_now();
	next_sweep.tv_sec += TCP_SWEEP_INTERVAL;

	for (;;) {
		/* Check handler state. */
		if (unlikely(*iostate & ServerReload)) {
			*iostate &= ~ServerReload;

			/* Cancel client connections. */
			for (unsigned i = tcp.client_threshold; i < tcp.set.n; ++i) {
				close(tcp.set.pfd[i].fd);
			}

			ref_release(ref);
			ref = server_set_ifaces(handler->server, &tcp.set, IO_TCP, tcp.thread_id);
			if (tcp.set.n == 0) {
				break; /* Terminate on zero interfaces. */
			}

			/* Everything in the fresh set is a listening socket. */
			tcp.client_threshold = tcp.set.n;
		}

		/* Check for cancellation. */
		if (dt_is_cancelled(thread)) {
			break;
		}

		/* Serve client requests. */
		tcp_wait_for_events(&tcp);

		/* Sweep inactive clients. */
		if (tcp.last_poll_time.tv_sec >= next_sweep.tv_sec) {
			fdset_sweep(&tcp.set, &tcp_sweep, NULL);
			next_sweep = time_now();
			next_sweep.tv_sec += TCP_SWEEP_INTERVAL;
		}
	}

finish:
	/* Close client connections still held by this thread, so they do not
	 * leak once the set is cleared. Listening sockets (below the
	 * threshold) are left alone. This is a no-op when the set is empty.
	 * NOTE(review): assumes fdset_clear() does not close descriptors, as
	 * the explicit close() calls elsewhere in this file suggest. */
	for (unsigned i = tcp.client_threshold; i < tcp.set.n; ++i) {
		close(tcp.set.pfd[i].fd);
	}

	free(tcp.iov[0].iov_base);
	free(tcp.iov[1].iov_base);
	mp_delete(mm.ctx);
	fdset_clear(&tcp.set);
	ref_release(ref);

	return ret;
}