io.c 47.9 KB
Newer Older
1 2 3
/*
 *	BIRD Internet Routing Daemon -- Unix I/O
 *
4
 *	(c) 1998--2004 Martin Mares <mj@ucw.cz>
5
 *      (c) 2004       Ondrej Filip <feela@network.cz>
6 7 8 9
 *
 *	Can be freely distributed and used under the terms of the GNU GPL.
 */

10 11
/* Unfortunately, some glibc versions hide parts of RFC 3542 API
   if _GNU_SOURCE is not defined. */
Ondřej Zajíček's avatar
Ondřej Zajíček committed
12 13 14
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif
15

16 17
#include <stdio.h>
#include <stdlib.h>
18
#include <time.h>
19 20 21
#include <sys/time.h>
#include <sys/types.h>
#include <sys/socket.h>
22
#include <sys/uio.h>
23
#include <sys/un.h>
24
#include <poll.h>
25
#include <unistd.h>
26
#include <fcntl.h>
27
#include <errno.h>
28
#include <net/if.h>
Ondřej Zajíček's avatar
Ondřej Zajíček committed
29
#include <netinet/in.h>
30 31
#include <netinet/tcp.h>
#include <netinet/udp.h>
32
#include <netinet/icmp6.h>
33 34 35 36 37

#include "nest/bird.h"
#include "lib/lists.h"
#include "lib/resource.h"
#include "lib/socket.h"
38
#include "lib/event.h"
39
#include "lib/timer.h"
40
#include "lib/string.h"
41
#include "nest/iface.h"
42
#include "conf/conf.h"
43

44 45
#include "sysdep/unix/unix.h"
#include CONFIG_INCLUDE_SYSIO_H
46

47
/* Maximum number of calls of tx handler for one socket in one
48
 * poll iteration. Should be small enough to not monopolize CPU by
49 50 51 52
 * one protocol instance.
 */
#define MAX_STEPS 4

53
/* Maximum number of calls of rx handler for all sockets in one poll
54 55 56 57
   iteration. RX callbacks are often much more costly so we limit
   this to get small latencies */
#define MAX_RX_STEPS 4

58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86
/*
 *	Tracked Files
 */

/* Resource record for a stdio stream opened by tracked_fopen(). */
struct rfile {
  resource r;			/* Resource header; rf_free()/rf_dump() cast a resource pointer back to struct rfile */
  FILE *f;			/* The wrapped stream, closed by rf_free() */
};

/* Free hook of the tracked-file resource class: closes the wrapped stream. */
static void
rf_free(resource *r)
{
  struct rfile *rf = (struct rfile *) r;
  FILE *stream = rf->f;

  fclose(stream);
}

/* Dump hook of the tracked-file resource class: prints the FILE pointer. */
static void
rf_dump(resource *r)
{
  debug("(FILE *%p)\n", ((struct rfile *) r)->f);
}

static struct resclass rf_class = {
  "FILE",
  sizeof(struct rfile),
  rf_free,
87
  rf_dump,
88
  NULL,
89
  NULL
90 91 92
};

void *
93
tracked_fopen(pool *p, char *name, char *mode)
94 95 96 97 98 99 100 101 102 103 104
{
  FILE *f = fopen(name, mode);

  if (f)
    {
      struct rfile *r = ralloc(p, &rf_class);
      r->f = f;
    }
  return f;
}

105

106 107 108 109
/*
 *	Time clock
 */

110 111
btime boot_time;

112 113 114 115 116 117 118 119 120 121 122 123 124
void
times_init(struct timeloop *loop)
{
  struct timespec ts;
  int rv;

  rv = clock_gettime(CLOCK_MONOTONIC, &ts);
  if (rv < 0)
    die("Monotonic clock is missing");

  if ((ts.tv_sec < 0) || (((s64) ts.tv_sec) > ((s64) 1 << 40)))
    log(L_WARN "Monotonic clock is crazy");

125
  loop->last_time = ts.tv_sec S + ts.tv_nsec NS;
126 127 128 129 130 131 132 133 134 135 136 137 138
  loop->real_time = 0;
}

void
times_update(struct timeloop *loop)
{
  struct timespec ts;
  int rv;

  rv = clock_gettime(CLOCK_MONOTONIC, &ts);
  if (rv < 0)
    die("clock_gettime: %m");

139
  btime new_time = ts.tv_sec S + ts.tv_nsec NS;
140 141 142 143 144 145 146 147

  if (new_time < loop->last_time)
    log(L_ERR "Monotonic clock is broken");

  loop->last_time = new_time;
  loop->real_time = 0;
}

148 149 150 151 152 153 154 155 156 157
void
times_update_real_time(struct timeloop *loop)
{
  struct timespec ts;
  int rv;

  rv = clock_gettime(CLOCK_REALTIME, &ts);
  if (rv < 0)
    die("clock_gettime: %m");

158
  loop->real_time = ts.tv_sec S + ts.tv_nsec NS;
159 160
}

161

162 163 164 165 166 167 168 169 170 171
/**
 * DOC: Sockets
 *
 * Socket resources represent network connections. Their data structure (&socket)
 * contains a lot of fields defining the exact type of the socket, the local and
 * remote addresses and ports, pointers to socket buffers and finally pointers to
 * hook functions to be called when new data have arrived to the receive buffer
 * (@rx_hook), when the contents of the transmit buffer have been transmitted
 * (@tx_hook) and when an error or connection close occurs (@err_hook).
 *
172
 * Freeing of sockets from inside socket hooks is perfectly safe.
173 174
 */

175 176 177 178
#ifndef SOL_IP
#define SOL_IP IPPROTO_IP
#endif

179 180 181 182
#ifndef SOL_IPV6
#define SOL_IPV6 IPPROTO_IPV6
#endif

183 184 185 186 187
#ifndef SOL_ICMPV6
#define SOL_ICMPV6 IPPROTO_ICMPV6
#endif


188 189 190
/*
 *	Sockaddr helper functions
 */
191

192
/* Size of the sockaddr structure for @af; anything but AF_INET is treated as IPv6. */
static inline int UNUSED sockaddr_length(int af)
{
  if (af == AF_INET)
    return sizeof(struct sockaddr_in);

  return sizeof(struct sockaddr_in6);
}

static inline void
196
sockaddr_fill4(struct sockaddr_in *sa, ip_addr a, uint port)
197
{
198
  memset(sa, 0, sizeof(struct sockaddr_in));
199
#ifdef HAVE_STRUCT_SOCKADDR_SA_LEN
200 201 202 203 204
  sa->sin_len = sizeof(struct sockaddr_in);
#endif
  sa->sin_family = AF_INET;
  sa->sin_port = htons(port);
  sa->sin_addr = ipa_to_in4(a);
205
}
206

207 208
/*
 * Fill @sa as an IPv6 socket address with address @a and port @port (host
 * byte order). The scope id is set to the index of @ifa only when @ifa is
 * given and @a is link-local; otherwise it stays zero.
 */
static inline void
sockaddr_fill6(struct sockaddr_in6 *sa, ip_addr a, struct iface *ifa, uint port)
{
  memset(sa, 0, sizeof(*sa));
#ifdef SIN6_LEN
  sa->sin6_len = sizeof(*sa);
#endif
  sa->sin6_family = AF_INET6;
  sa->sin6_port = htons(port);
  sa->sin6_flowinfo = 0;
  sa->sin6_addr = ipa_to_in6(a);

  if (ifa && ipa_is_link_local(a))
    sa->sin6_scope_id = ifa->index;
}
222

223 224
/*
 * Fill @sa for address family @af from address @a, interface @ifa and
 * port @port. Calls bug() for any family other than AF_INET / AF_INET6.
 */
void
sockaddr_fill(sockaddr *sa, int af, ip_addr a, struct iface *ifa, uint port)
{
  switch (af)
  {
  case AF_INET:
    sockaddr_fill4((struct sockaddr_in *) sa, a, port);
    break;

  case AF_INET6:
    sockaddr_fill6((struct sockaddr_in6 *) sa, a, ifa, port);
    break;

  default:
    bug("Unknown AF");
  }
}

234
static inline void
235
sockaddr_read4(struct sockaddr_in *sa, ip_addr *a, uint *port)
236
{
237 238
  *port = ntohs(sa->sin_port);
  *a = ipa_from_in4(sa->sin_addr);
239 240
}

241 242
/*
 * Extract the address and port (converted to host byte order) from an IPv6
 * socket address. When @ifa is non-NULL and the address is link-local, *@ifa
 * is set to the interface found for the scope id; otherwise *@ifa is left
 * untouched.
 */
static inline void
sockaddr_read6(struct sockaddr_in6 *sa, ip_addr *a, struct iface **ifa, uint *port)
{
  *port = ntohs(sa->sin6_port);
  *a = ipa_from_in6(sa->sin6_addr);

  if (ifa && ipa_is_link_local(*a))
    *ifa = if_find_by_index(sa->sin6_scope_id);
}

251 252
/*
 * Decode @sa, which is expected to be of family @af, into @a, @ifa and @port.
 *
 * Returns 0 on success. Returns -1 when the stored family differs from @af
 * or @af is neither AF_INET nor AF_INET6; in that case *@a is set to
 * IPA_NONE and *@port to 0.
 */
int
sockaddr_read(sockaddr *sa, int af, ip_addr *a, struct iface **ifa, uint *port)
{
  if (sa->sa.sa_family == af)
  {
    if (af == AF_INET)
    {
      sockaddr_read4((struct sockaddr_in *) sa, a, port);
      return 0;
    }

    if (af == AF_INET6)
    {
      sockaddr_read6((struct sockaddr_in6 *) sa, a, ifa, port);
      return 0;
    }
  }

  /* Family mismatch or unsupported family */
  *a = IPA_NONE;
  *port = 0;
  return -1;
}


273 274 275
/*
 *	IPv6 multicast syscalls
 */
276

277
/* Fortunately standardized in RFC 3493 */
278

279 280
/* Initializer for struct ipv6_mreq: group @maddr on interface @ifa (pointer to struct iface) */
#define INIT_MREQ6(maddr,ifa) \
  { .ipv6mr_multiaddr = ipa_to_in6(maddr), .ipv6mr_interface = (ifa)->index }
281

282 283
/*
 * Configure IPv6 multicast transmission on @s: outgoing interface taken
 * from s->iface, hop limit from s->ttl, and multicast loopback switched off.
 * Returns 0 on success; failures are reported through ERR().
 */
static inline int
sk_setup_multicast6(sock *s)
{
  int ifindex = s->iface->index;
  int hops = s->ttl;
  int loop = 0;

  if (setsockopt(s->fd, SOL_IPV6, IPV6_MULTICAST_IF, &ifindex, sizeof(ifindex)) < 0)
    ERR("IPV6_MULTICAST_IF");

  if (setsockopt(s->fd, SOL_IPV6, IPV6_MULTICAST_HOPS, &hops, sizeof(hops)) < 0)
    ERR("IPV6_MULTICAST_HOPS");

  if (setsockopt(s->fd, SOL_IPV6, IPV6_MULTICAST_LOOP, &loop, sizeof(loop)) < 0)
    ERR("IPV6_MULTICAST_LOOP");

  return 0;
}

301 302
/*
 * Join IPv6 multicast group @maddr on the interface of @s.
 * Returns 0 on success; failures are reported through ERR().
 */
static inline int
sk_join_group6(sock *s, ip_addr maddr)
{
  struct ipv6_mreq mr = INIT_MREQ6(maddr, s->iface);

  if (setsockopt(s->fd, SOL_IPV6, IPV6_JOIN_GROUP, &mr, sizeof(mr)) < 0)
    ERR("IPV6_JOIN_GROUP");

  return 0;
}

312 313
/*
 * Leave IPv6 multicast group @maddr on the interface of @s.
 * Returns 0 on success; failures are reported through ERR().
 */
static inline int
sk_leave_group6(sock *s, ip_addr maddr)
{
  struct ipv6_mreq mr = INIT_MREQ6(maddr, s->iface);

  if (setsockopt(s->fd, SOL_IPV6, IPV6_LEAVE_GROUP, &mr, sizeof(mr)) < 0)
    ERR("IPV6_LEAVE_GROUP");

  return 0;
}
322

323

324 325 326
/*
 *	IPv6 packet control messages
 */
327

328
/* Also standardized, in RFC 3542 */
329

330 331 332 333 334 335 336 337 338
/*
 * RFC 2292 uses IPV6_PKTINFO for both the socket option and the cmsg
 * type, RFC 3542 changed the socket option to IPV6_RECVPKTINFO. If we
 * don't have IPV6_RECVPKTINFO we suppose the OS implements the older
 * RFC and we use IPV6_PKTINFO.
 */
#ifndef IPV6_RECVPKTINFO
#define IPV6_RECVPKTINFO IPV6_PKTINFO
#endif
339 340 341 342 343 344
/*
 * Same goes for IPV6_HOPLIMIT -> IPV6_RECVHOPLIMIT.
 */
#ifndef IPV6_RECVHOPLIMIT
#define IPV6_RECVHOPLIMIT IPV6_HOPLIMIT
#endif
345

346

347 348
#define CMSG6_SPACE_PKTINFO CMSG_SPACE(sizeof(struct in6_pktinfo))
#define CMSG6_SPACE_TTL CMSG_SPACE(sizeof(int))
349

350 351 352 353
/*
 * Enable the IPV6_RECVPKTINFO option on @s (packet info control messages
 * on receive). Returns 0 on success; failures are reported through ERR().
 */
static inline int
sk_request_cmsg6_pktinfo(sock *s)
{
  int on = 1;

  if (setsockopt(s->fd, SOL_IPV6, IPV6_RECVPKTINFO, &on, sizeof(on)) < 0)
    ERR("IPV6_RECVPKTINFO");

  return 0;
}

361 362
/*
 * Enable the IPV6_RECVHOPLIMIT option on @s (hop limit control messages
 * on receive). Returns 0 on success; failures are reported through ERR().
 */
static inline int
sk_request_cmsg6_ttl(sock *s)
{
  int on = 1;

  if (setsockopt(s->fd, SOL_IPV6, IPV6_RECVHOPLIMIT, &on, sizeof(on)) < 0)
    ERR("IPV6_RECVHOPLIMIT");

  return 0;
}
371

372 373 374 375
/*
 * If @cm is an IPV6_PKTINFO control message, store the address and the
 * interface index it carries into s->laddr / s->lifindex. Other message
 * types are ignored. The cmsg level is not checked here.
 */
static inline void
sk_process_cmsg6_pktinfo(sock *s, struct cmsghdr *cm)
{
  if (cm->cmsg_type != IPV6_PKTINFO)
    return;

  /* The cmsg payload need not be aligned for the struct, so copy it out */
  struct in6_pktinfo pi;
  memcpy(&pi, CMSG_DATA(cm), sizeof(pi));

  s->laddr = ipa_from_in6(pi.ipi6_addr);
  s->lifindex = pi.ipi6_ifindex;
}
382

383 384 385 386 387
/*
 * If @cm is an IPV6_HOPLIMIT control message, store the hop limit it
 * carries into s->rcv_ttl. Other message types are ignored. The cmsg level
 * is not checked here.
 */
static inline void
sk_process_cmsg6_ttl(sock *s, struct cmsghdr *cm)
{
  if (cm->cmsg_type != IPV6_HOPLIMIT)
    return;

  /* The cmsg payload need not be aligned for int, so copy it out */
  int hops;
  memcpy(&hops, CMSG_DATA(cm), sizeof(hops));

  s->rcv_ttl = hops;
}

390 391
/*
 * Attach an IPV6_PKTINFO control message to @msg, built in @cbuf.
 * The message carries s->saddr as the source address and the index of
 * s->iface (0 when no interface is set). On return msg->msg_controllen
 * covers exactly that one control message.
 *
 * NOTE(review): @cbuflen is presumably at least CMSG6_SPACE_PKTINFO; this
 * is not checked here - confirm against callers.
 */
static inline void
sk_prepare_cmsgs6(sock *s, struct msghdr *msg, void *cbuf, size_t cbuflen)
{
  struct cmsghdr *cm;
  struct in6_pktinfo pi;
  int controllen = 0;

  /* CMSG_FIRSTHDR() needs the full buffer size to be set first */
  msg->msg_control = cbuf;
  msg->msg_controllen = cbuflen;

  cm = CMSG_FIRSTHDR(msg);
  cm->cmsg_level = SOL_IPV6;
  cm->cmsg_type = IPV6_PKTINFO;
  cm->cmsg_len = CMSG_LEN(sizeof(pi));
  controllen += CMSG_SPACE(sizeof(pi));

  /* Build the payload locally and copy it in; CMSG_DATA() need not be aligned */
  pi.ipi6_ifindex = s->iface ? s->iface->index : 0;
  pi.ipi6_addr = ipa_to_in6(s->saddr);
  memcpy(CMSG_DATA(cm), &pi, sizeof(pi));

  msg->msg_controllen = controllen;
}
412

413

414 415 416 417 418 419
/*
 *	Miscellaneous socket syscalls
 */

/* Set the IPv4 TTL of @s to @ttl. Returns 0 on success; failures are reported through ERR(). */
static inline int
sk_set_ttl4(sock *s, int ttl)
{
  if (setsockopt(s->fd, SOL_IP, IP_TTL, &ttl, sizeof(ttl)) < 0)
    ERR("IP_TTL");

  return 0;
}

427 428 429 430 431
/* Set the IPv6 unicast hop limit of @s to @ttl. Returns 0 on success; failures are reported through ERR(). */
static inline int
sk_set_ttl6(sock *s, int ttl)
{
  if (setsockopt(s->fd, SOL_IPV6, IPV6_UNICAST_HOPS, &ttl, sizeof(ttl)) < 0)
    ERR("IPV6_UNICAST_HOPS");

  return 0;
}

/* Set the IPv4 ToS of @s to @tos. Returns 0 on success; failures are reported through ERR(). */
static inline int
sk_set_tos4(sock *s, int tos)
{
  if (setsockopt(s->fd, SOL_IP, IP_TOS, &tos, sizeof(tos)) < 0)
    ERR("IP_TOS");

  return 0;
}
444

445 446 447 448 449
/* Set the IPv6 traffic class of @s to @tos. Returns 0 on success; failures are reported through ERR(). */
static inline int
sk_set_tos6(sock *s, int tos)
{
  if (setsockopt(s->fd, SOL_IPV6, IPV6_TCLASS, &tos, sizeof(tos)) < 0)
    ERR("IPV6_TCLASS");

  return 0;
}
453

454
static inline int
455
sk_set_high_port(sock *s UNUSED)
456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479
{
  /* Port range setting is optional, ignore it if not supported */

#ifdef IP_PORTRANGE
  if (sk_is_ipv4(s))
  {
    int range = IP_PORTRANGE_HIGH;
    if (setsockopt(s->fd, SOL_IP, IP_PORTRANGE, &range, sizeof(range)) < 0)
      ERR("IP_PORTRANGE");
  }
#endif

#ifdef IPV6_PORTRANGE
  if (sk_is_ipv6(s))
  {
    int range = IPV6_PORTRANGE_HIGH;
    if (setsockopt(s->fd, SOL_IPV6, IPV6_PORTRANGE, &range, sizeof(range)) < 0)
      ERR("IPV6_PORTRANGE");
  }
#endif

  return 0;
}

480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502
/*
 * Skip the IPv4 header at the start of @pkt.
 *
 * @len holds the packet length on entry and, on success, the payload
 * length on return. Returns a pointer just past the header, or NULL when
 * the packet is shorter than 20 bytes, is not IP version 4, or its header
 * length field is below 20 or beyond the packet length (then *@len is
 * left unchanged).
 */
static inline byte *
sk_skip_ip_header(byte *pkt, int *len)
{
  int total = *len;

  if (total < 20)
    return NULL;

  /* High nibble of the first byte is the IP version */
  if ((pkt[0] & 0xf0) != 0x40)
    return NULL;

  /* Low nibble is the header length in 32-bit words */
  int hdr = (pkt[0] & 0x0f) * 4;
  if (hdr < 20)
    return NULL;
  if (hdr > total)
    return NULL;

  *len = total - hdr;
  return pkt + hdr;
}

/*
 * Return the start of received data in the receive buffer of @s. For IPv4
 * SK_IP sockets the IP header is skipped first and *@len reduced
 * accordingly (NULL is returned when the header is invalid); for all other
 * sockets the buffer start is returned and *@len is not touched.
 */
byte *
sk_rx_buffer(sock *s, int *len)
{
  int raw_ipv4 = sk_is_ipv4(s) && (s->type == SK_IP);

  if (!raw_ipv4)
    return s->rbuf;

  return sk_skip_ip_header(s->rbuf, len);
}

503

504 505 506
/*
 *	Public socket functions
 */
507

508 509 510 511 512 513 514 515 516
/**
 * sk_setup_multicast - enable multicast for given socket
 * @s: socket
 *
 * Prepare transmission of multicast packets for given datagram socket.
 * The socket must have defined @iface.
 *
 * Result: 0 for success, -1 for an error.
 */
517

518 519 520 521
/* Dispatch to the IPv4 or IPv6 multicast setup; asserts that @s has an interface. */
int
sk_setup_multicast(sock *s)
{
  ASSERT(s->iface);

  return sk_is_ipv4(s) ? sk_setup_multicast4(s) : sk_setup_multicast6(s);
}
528

529 530 531 532 533 534 535 536 537 538
/**
 * sk_join_group - join multicast group for given socket
 * @s: socket
 * @maddr: multicast address
 *
 * Join multicast group for given datagram socket and associated interface.
 * The socket must have defined @iface.
 *
 * Result: 0 for success, -1 for an error.
 */
539

540 541 542 543 544 545 546 547
/* Dispatch to the IPv4 or IPv6 group join according to the socket's address family. */
int
sk_join_group(sock *s, ip_addr maddr)
{
  return sk_is_ipv4(s) ? sk_join_group4(s, maddr) : sk_join_group6(s, maddr);
}
548

549 550 551 552 553 554 555 556 557 558
/**
 * sk_leave_group - leave multicast group for given socket
 * @s: socket
 * @maddr: multicast address
 *
 * Leave multicast group for given datagram socket and associated interface.
 * The socket must have defined @iface.
 *
 * Result: 0 for success, -1 for an error.
 */
559

560 561 562 563 564 565 566
/* Dispatch to the IPv4 or IPv6 group leave according to the socket's address family. */
int
sk_leave_group(sock *s, ip_addr maddr)
{
  return sk_is_ipv4(s) ? sk_leave_group4(s, maddr) : sk_leave_group6(s, maddr);
}

569
/**
570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592
 * sk_setup_broadcast - enable broadcast for given socket
 * @s: socket
 *
 * Allow reception and transmission of broadcast packets for given datagram
 * socket. The socket must have defined @iface. For transmission, packets should
 * be send to @brd address of @iface.
 *
 * Result: 0 for success, -1 for an error.
 */

/* Switch on SO_BROADCAST for @s. Returns 0 on success; failures are reported through ERR(). */
int
sk_setup_broadcast(sock *s)
{
  int enable = 1;

  if (setsockopt(s->fd, SOL_SOCKET, SO_BROADCAST, &enable, sizeof(enable)) < 0)
    ERR("SO_BROADCAST");

  return 0;
}

/**
 * sk_set_ttl - set transmit TTL for given socket
593 594 595
 * @s: socket
 * @ttl: TTL value
 *
596 597
 * Set TTL for already opened connections when TTL was not set before. Useful
 * for accepted connections when different ones should have different TTL.
598 599 600 601 602 603 604 605 606
 *
 * Result: 0 for success, -1 for an error.
 */

/*
 * Record @ttl in the socket and apply it through the IPv4 or IPv6 setter.
 * s->ttl is updated even when the setter then fails.
 */
int
sk_set_ttl(sock *s, int ttl)
{
  s->ttl = ttl;

  return sk_is_ipv4(s) ? sk_set_ttl4(s, ttl) : sk_set_ttl6(s, ttl);
}

613
/**
614
 * sk_set_min_ttl - set minimal accepted TTL for given socket
615 616 617
 * @s: socket
 * @ttl: TTL value
 *
618 619
 * Set minimal accepted TTL for given socket. Can be used for TTL security
 * implementations.
620 621 622 623 624 625 626
 *
 * Result: 0 for success, -1 for an error.
 */

/* Dispatch to the IPv4 or IPv6 minimal-TTL setter according to the socket's address family. */
int
sk_set_min_ttl(sock *s, int ttl)
{
  return sk_is_ipv4(s) ? sk_set_min_ttl4(s, ttl) : sk_set_min_ttl6(s, ttl);
}
632

633
#if 0
634
/**
635
 * sk_set_md5_auth - add / remove MD5 security association for given socket
636
 * @s: socket
637 638
 * @local: IP address of local side
 * @remote: IP address of remote side
639
 * @ifa: Interface for link-local IP address
640 641
 * @passwd: Password used for MD5 authentication
 * @setkey: Update also system SA/SP database
642
 *
643 644 645 646
 * In TCP MD5 handling code in kernel, there is a set of security associations
 * used for choosing password and other authentication parameters according to
 * the local and remote address. This function is useful for listening socket,
 * for active sockets it may be enough to set s->password field.
647 648 649 650
 *
 * When called with passwd != NULL, the new pair is added,
 * When called with passwd == NULL, the existing pair is removed.
 *
651 652 653 654 655 656 657
 * Note that while in Linux, the MD5 SAs are specific to socket, in BSD they are
 * stored in global SA/SP database (but the behavior also must be enabled on
 * per-socket basis). In case of multiple sockets to the same neighbor, the
 * socket-specific state must be configured for each socket while global state
 * just once per src-dst pair. The @setkey argument controls whether the global
 * state (SA/SP database) is also updated.
 *
658 659 660 661
 * Result: 0 for success, -1 for an error.
 */

int
662
sk_set_md5_auth(sock *s, ip_addr local, ip_addr remote, struct iface *ifa, char *passwd, int setkey)
663 664
{ DUMMY; }
#endif
665

666 667 668 669 670 671 672 673 674 675 676 677
/**
 * sk_set_ipv6_checksum - specify IPv6 checksum offset for given socket
 * @s: socket
 * @offset: offset
 *
 * Specify IPv6 checksum field offset for given raw IPv6 socket. After that, the
 * kernel will automatically fill it for outgoing packets and check it for
 * incoming packets. Should not be used on ICMPv6 sockets, where the position is
 * known to the kernel.
 *
 * Result: 0 for success, -1 for an error.
 */
678

679 680 681
/* Set the IPV6_CHECKSUM option of @s to @offset. Returns 0 on success; failures are reported through ERR(). */
int
sk_set_ipv6_checksum(sock *s, int offset)
{
  if (setsockopt(s->fd, SOL_IPV6, IPV6_CHECKSUM, &offset, sizeof(offset)) < 0)
    ERR("IPV6_CHECKSUM");

  return 0;
}

688
int
689
sk_set_icmp6_filter(sock *s, int p1, int p2)
690 691 692 693 694 695 696 697
{
  /* a bit of lame interface, but it is here only for Radv */
  struct icmp6_filter f;

  ICMP6_FILTER_SETBLOCKALL(&f);
  ICMP6_FILTER_SETPASS(p1, &f);
  ICMP6_FILTER_SETPASS(p2, &f);

698
  if (setsockopt(s->fd, SOL_ICMPV6, ICMP6_FILTER, &f, sizeof(f)) < 0)
699
    ERR("ICMP6_FILTER");
700 701 702 703

  return 0;
}

704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753
/*
 * Log the error recorded in s->err at L_ERR level, prefixed with @p.
 * NOTE(review): the %#m conversion presumably appends the errno text -
 * confirm against the log() format implementation.
 */
void
sk_log_error(sock *s, const char *p)
{
  log(L_ERR "%s: Socket error: %s%#m", p, s->err);
}


/*
 *	Actual struct birdsock code
 */

static list sock_list;
static struct birdsock *current_sock;
static struct birdsock *stored_sock;

/*
 * Return the socket that follows @s in its list, or NULL when the node
 * after @s has no successor of its own (presumably the list tail sentinel).
 */
static inline sock *
sk_next(sock *s)
{
  if (s->n.next->next)
    return SKIP_BACK(sock, n, s->n.next);

  return NULL;
}

/*
 * Allocate the receive and transmit buffers of @s where none is set and the
 * respective size is non-zero, then reset the read and write positions to
 * the buffer starts.
 */
static void
sk_alloc_bufs(sock *s)
{
  /* Receive side */
  if (s->rbsize && !s->rbuf)
  {
    s->rbuf_alloc = xmalloc(s->rbsize);
    s->rbuf = s->rbuf_alloc;
  }
  s->rpos = s->rbuf;

  /* Transmit side */
  if (s->tbsize && !s->tbuf)
  {
    s->tbuf_alloc = xmalloc(s->tbsize);
    s->tbuf = s->tbuf_alloc;
  }
  s->ttx = s->tbuf;
  s->tpos = s->tbuf;
}

/*
 * Release the buffers this socket allocated itself (rbuf_alloc / tbuf_alloc)
 * and clear the matching buffer pointers. A side without an allocated
 * buffer is left as it is.
 */
static void
sk_free_bufs(sock *s)
{
  if (s->rbuf_alloc != NULL)
  {
    xfree(s->rbuf_alloc);
    s->rbuf_alloc = NULL;
    s->rbuf = NULL;
  }

  if (s->tbuf_alloc != NULL)
  {
    xfree(s->tbuf_alloc);
    s->tbuf_alloc = NULL;
    s->tbuf = NULL;
  }
}

754
#ifdef HAVE_LIBSSH
755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779
/*
 * Tear down the SSH state attached to @s: detach it from the socket, then
 * close (if still open) and free the channel, and disconnect and free the
 * session. Does nothing when the socket has no SSH state.
 */
static void
sk_ssh_free(sock *s)
{
  struct ssh_sock *ssh = s->ssh;

  if (!ssh)
    return;

  /* Detach first so the socket no longer refers to the state being torn down */
  s->ssh = NULL;

  if (ssh->channel != NULL)
  {
    if (ssh_channel_is_open(ssh->channel))
      ssh_channel_close(ssh->channel);

    ssh_channel_free(ssh->channel);
    ssh->channel = NULL;
  }

  if (ssh->session != NULL)
  {
    ssh_disconnect(ssh->session);
    ssh_free(ssh->session);
    ssh->session = NULL;
  }
}
780
#endif
781

782 783 784 785 786 787 788
/*
 * Free hook of the socket resource class.
 *
 * Releases the buffers and (for SSH sockets) the SSH state. If the socket
 * holds a descriptor, it is also unlinked from the socket list (unless it
 * is an SKF_THREAD socket), the descriptor is closed for non-SSH sockets,
 * and s->fd is reset to -1.
 */
static void
sk_free(resource *r)
{
  sock *s = (sock *) r;

  sk_free_bufs(s);

#ifdef HAVE_LIBSSH
  if (s->type == SK_SSH || s->type == SK_SSH_ACTIVE)
    sk_ssh_free(s);
#endif

  if (s->fd < 0)
    return;

  /* FIXME: we should call sk_stop() for SKF_THREAD sockets */
  if (!(s->flags & SKF_THREAD))
  {
    /* Keep the list cursors valid when the socket they point at goes away */
    if (s == current_sock)
      current_sock = sk_next(s);
    if (s == stored_sock)
      stored_sock = sk_next(s);
    rem_node(&s->n);
  }

  /* SSH sockets are not closed here */
  if (s->type != SK_SSH && s->type != SK_SSH_ACTIVE)
    close(s->fd);

  s->fd = -1;
}

/*
 * Change the receive buffer size of @s to @val. The old buffer is freed
 * and a fresh one allocated, so buffered data is discarded and the read
 * position restarts at the buffer start. Only valid while the socket uses
 * its own allocated receive buffer (asserted).
 */
void
sk_set_rbsize(sock *s, uint val)
{
  ASSERT(s->rbuf_alloc == s->rbuf);

  if (val == s->rbsize)
    return;

  s->rbsize = val;

  xfree(s->rbuf_alloc);
  s->rbuf_alloc = xmalloc(val);
  s->rbuf = s->rbuf_alloc;
  s->rpos = s->rbuf_alloc;
}

/*
 * Change the transmit buffer size of @s to @val, keeping the buffer
 * contents (via xrealloc()) and the tpos / ttx positions relative to the
 * buffer start. Only valid while the socket uses its own allocated transmit
 * buffer (asserted).
 */
void
sk_set_tbsize(sock *s, uint val)
{
  ASSERT(s->tbuf_alloc == s->tbuf);

  if (s->tbsize == val)
    return;

  /*
   * Take the offsets before reallocating: once xrealloc() has moved the
   * block, the old pointer value must not be used, not even for arithmetic.
   */
  size_t tpos_off = s->tpos - s->tbuf;
  size_t ttx_off = s->ttx - s->tbuf;

  s->tbsize = val;
  s->tbuf = s->tbuf_alloc = xrealloc(s->tbuf_alloc, val);
  s->tpos = s->tbuf + tpos_off;
  s->ttx  = s->tbuf + ttx_off;
}

/*
 * Point the transmit buffer of @s at @tbuf, or back at the socket's own
 * allocated buffer when @tbuf is NULL, and reset both transmit positions
 * to its start.
 */
void
sk_set_tbuf(sock *s, void *tbuf)
{
  if (tbuf)
    s->tbuf = tbuf;
  else
    s->tbuf = s->tbuf_alloc;

  s->tpos = s->tbuf;
  s->ttx = s->tbuf;
}

/*
 * Drop the socket's own buffers and allocate them again according to the
 * current rbsize / tbsize; read and write positions restart at the buffer
 * starts (see sk_alloc_bufs()).
 */
void
sk_reallocate(sock *s)
{
  sk_free_bufs(s);
  sk_alloc_bufs(s);
}

/*
 * Dump hook of the socket resource class: prints the socket type name,
 * user data pointer, addresses, ports, ToS, TTL and interface name.
 */
static void
sk_dump(resource *r)
{
  sock *s = (sock *) r;
  static char *sk_type_names[] = { "TCP<", "TCP>", "TCP", "UDP", NULL, "IP", NULL, "MAGIC", "UNIX<", "UNIX", "SSH>", "SSH", "DEL!" };
  uint type_count = sizeof(sk_type_names) / sizeof(sk_type_names[0]);

  /* Never index past the name table, whatever the type field holds */
  char *type_name = ((uint) s->type < type_count) ? sk_type_names[s->type] : "?";

  debug("(%s, ud=%p, sa=%I, sp=%d, da=%I, dp=%d, tos=%d, ttl=%d, if=%s)\n",
	type_name,
	s->data,
	s->saddr,
	s->sport,
	s->daddr,
	s->dport,
	s->tos,
	s->ttl,
	s->iface ? s->iface->name : "none");
}

/*
 * Resource class describing sockets. The initializer is positional; the
 * last two slots are left unset (NULL).
 */
static struct resclass sk_class = {
  "Socket",			/* Class name */
  sizeof(sock),			/* Size of one resource record */
  sk_free,			/* Releases buffers, SSH state and the descriptor */
  sk_dump,			/* Prints a one-line socket description */
  NULL,
  NULL
};

/**
 * sk_new - create a socket
 * @p: pool
 *
 * This function creates a new socket resource. If you want to use it,
 * you need to fill in all the required fields of the structure and
 * call sk_open() to do the actual opening of the socket.
 *
 * The real function name is sock_new(), sk_new() is a macro wrapper
 * to avoid collision with OpenSSL.
 */
sock *
sock_new(pool *p)
{
  sock *sk = ralloc(p, &sk_class);

  sk->pool = p;
  sk->fd = -1;			/* Not opened yet */

  /* -1 marks ToS, priority and TTL as unset (sk_setup() applies only values >= 0) */
  sk->tos = -1;
  sk->priority = -1;
  sk->ttl = -1;

  /* saddr and daddr are not touched here; they stay as ralloc() returned them */
  return sk;
}

/*
 * Apply the generic socket options to an already created descriptor s->fd.
 *
 * Does nothing for SK_SSH_ACTIVE sockets. Otherwise the descriptor is made
 * non-blocking; sockets without an address family (s->af == 0) stop there.
 * For IP sockets it then binds to the VRF / interface device where
 * supported, requests the control messages selected by s->flags, disables
 * MTU discovery for UDP and raw IP sockets, and applies TTL, ToS and
 * priority when they are set (>= 0).
 *
 * Returns 0 on success, -1 on failure; direct setsockopt()/fcntl()
 * failures are reported through ERR().
 */
static int
sk_setup(sock *s)
{
  int y = 1;
  int fd = s->fd;

  if (s->type == SK_SSH_ACTIVE)
    return 0;

  if (fcntl(fd, F_SETFL, O_NONBLOCK) < 0)
    ERR("O_NONBLOCK");

  if (!s->af)
    return 0;

  /* A source address without explicit bind is requested per packet */
  if (ipa_nonzero(s->saddr) && !(s->flags & SKF_BIND))
    s->flags |= SKF_PKTINFO;

#ifdef CONFIG_USE_HDRINCL
  if (sk_is_ipv4(s) && (s->type == SK_IP) && (s->flags & SKF_PKTINFO))
  {
    s->flags &= ~SKF_PKTINFO;
    s->flags |= SKF_HDRINCL;
    if (setsockopt(fd, SOL_IP, IP_HDRINCL, &y, sizeof(y)) < 0)
      ERR("IP_HDRINCL");
  }
#endif

  if (s->vrf && !s->iface)
  {
    /* Bind socket to associated VRF interface.
       This is Linux-specific, but so is SO_BINDTODEVICE. */
#ifdef SO_BINDTODEVICE
    struct ifreq ifr = {};
    /* Bounded copy; ifr is zeroed, so the name stays NUL-terminated */
    strncpy(ifr.ifr_name, s->vrf->name, sizeof(ifr.ifr_name) - 1);
    if (setsockopt(s->fd, SOL_SOCKET, SO_BINDTODEVICE, &ifr, sizeof(ifr)) < 0)
      ERR("SO_BINDTODEVICE");
#endif
  }

  if (s->iface)
  {
#ifdef SO_BINDTODEVICE
    struct ifreq ifr = {};
    /* Bounded copy; ifr is zeroed, so the name stays NUL-terminated */
    strncpy(ifr.ifr_name, s->iface->name, sizeof(ifr.ifr_name) - 1);
    if (setsockopt(s->fd, SOL_SOCKET, SO_BINDTODEVICE, &ifr, sizeof(ifr)) < 0)
      ERR("SO_BINDTODEVICE");
#endif

#ifdef CONFIG_UNIX_DONTROUTE
    if (setsockopt(s->fd, SOL_SOCKET, SO_DONTROUTE, &y, sizeof(y)) < 0)
      ERR("SO_DONTROUTE");
#endif
  }

  if (sk_is_ipv4(s))
  {
    if (s->flags & SKF_LADDR_RX)
      if (sk_request_cmsg4_pktinfo(s) < 0)
	return -1;

    if (s->flags & SKF_TTL_RX)
      if (sk_request_cmsg4_ttl(s) < 0)
	return -1;

    if ((s->type == SK_UDP) || (s->type == SK_IP))
      if (sk_disable_mtu_disc4(s) < 0)
	return -1;

    if (s->ttl >= 0)
      if (sk_set_ttl4(s, s->ttl) < 0)
	return -1;

    if (s->tos >= 0)
      if (sk_set_tos4(s, s->tos) < 0)
	return -1;
  }

  if (sk_is_ipv6(s))
  {
    if ((s->type == SK_TCP_PASSIVE) || (s->type == SK_TCP_ACTIVE) || (s->type == SK_UDP))
      if (setsockopt(fd, SOL_IPV6, IPV6_V6ONLY, &y, sizeof(y)) < 0)
	ERR("IPV6_V6ONLY");

    if (s->flags & SKF_LADDR_RX)
      if (sk_request_cmsg6_pktinfo(s) < 0)
	return -1;

    if (s->flags & SKF_TTL_RX)
      if (sk_request_cmsg6_ttl(s) < 0)
	return -1;

    if ((s->type == SK_UDP) || (s->type == SK_IP))
      if (sk_disable_mtu_disc6(s) < 0)
	return -1;

    if (s->ttl >= 0)
      if (sk_set_ttl6(s, s->ttl) < 0)
	return -1;

    if (s->tos >= 0)
      if (sk_set_tos6(s, s->tos) < 0)
	return -1;
  }

  /* Must be after sk_set_tos4() as setting ToS on Linux also mangles priority */
  if (s->priority >= 0)
    if (sk_set_priority(s, s->priority) < 0)
      return -1;

  return 0;
}

1019 1020
/* Append @s to the end of the global socket list. */
static void
sk_insert(sock *s)
{
  add_tail(&sock_list, &s->n);
}

1025
static void
1026 1027
sk_tcp_connected(sock *s)
{
1028 1029 1030 1031
  sockaddr sa;
  int sa_len = sizeof(sa);

  if ((getsockname(s->fd, &sa.sa, &sa_len) < 0) ||
1032
      (sockaddr_read(&sa, s->af, &s->saddr, &s->iface, &s->sport) < 0))
1033
    log(L_WARN "SOCK: Cannot get local IP address for TCP>");
1034

1035 1036
  s->type = SK_TCP;
  sk_alloc_bufs(s);
1037
  s->tx_hook(s);
1038 1039
}

1040
#ifdef HAVE_LIBSSH
1041 1042 1043 1044 1045 1046 1047
/*
 * Finish an SSH connection: mark the socket as an established SK_SSH
 * socket, allocate its buffers and call the tx hook.
 */
static void
sk_ssh_connected(sock *s)
{
  /* sk_alloc_bufs() does not look at the type, so the order of these two is free */
  s->type = SK_SSH;
  sk_alloc_bufs(s);

  s->tx_hook(s);
}
1048
#endif
1049

1050
static int
1051
sk_passive_connected(sock *s, int type)
1052
{
1053 1054 1055
  sockaddr loc_sa, rem_sa;
  int loc_sa_len = sizeof(loc_sa);
  int rem_sa_len = sizeof(rem_sa);
1056

1057 1058 1059 1060
  int fd = accept(s->fd, ((type == SK_TCP) ? &rem_sa.sa : NULL), &rem_sa_len);
  if (fd < 0)
  {
    if ((errno != EINTR) && (errno != EAGAIN))
1061
      s->err_hook(s, errno);
1062 1063 1064 1065 1066
    return 0;
  }

  sock *t = sk_new(s->pool);
  t->type = type;
1067
  t->af = s->af;
1068
  t->fd = fd;
1069 1070 1071 1072 1073 1074 1075 1076
  t->ttl = s->ttl;
  t->tos = s->tos;
  t->rbsize = s->rbsize;
  t->tbsize = s->tbsize;

  if (type == SK_TCP)
  {
    if ((getsockname(fd, &loc_sa.sa, &loc_sa_len) < 0) ||
1077
	(sockaddr_read(&loc_sa, s->af, &t->saddr, &t->iface, &t->sport) < 0))
1078 1079
      log(L_WARN "SOCK: Cannot get local IP address for TCP<");

1080
    if (sockaddr_read(&rem_sa, s->af, &t->daddr, &t->iface, &t->dport) < 0)
1081 1082 1083 1084 1085 1086 1087 1088 1089
      log(L_WARN "SOCK: Cannot get remote IP address for TCP<");
  }

  if (sk_setup(t) < 0)
  {
    /* FIXME: Call err_hook instead ? */
    log(L_ERR "SOCK: Incoming connection: %s%#m", t->err);

    /* FIXME: handle it better in rfree() */
1090
    close(t->fd);
1091 1092 1093 1094 1095 1096 1097 1098 1099
    t->fd = -1;
    rfree(t);
    return 1;
  }

  sk_insert(t);
  sk_alloc_bufs(t);
  s->rx_hook(t, 0);
  return 1;
1100 1101
}

1102
#ifdef HAVE_LIBSSH
1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118
/*
 * Return SSH_OK or SSH_AGAIN or SSH_ERROR
 */
/*
 * Drive the SSH connection state machine of @s as far as it can go.
 *
 * The switch is entered at the state stored in s->ssh->state and each case
 * deliberately falls through into the next one; a case records its own
 * state before starting a step that may have to be repeated, so that a
 * later call resumes there.
 *
 * Returns SSH_OK when the connection is established, SSH_AGAIN when a step
 * has to be repeated later, and SSH_ERROR on failure (including a failed
 * server identity check).
 */
static int
sk_ssh_connect(sock *s)
{
  s->fd = ssh_get_fd(s->ssh->session);

  /* Big fall thru automata */
  switch (s->ssh->state)
  {
  case SK_SSH_CONNECT:
  {
    switch (ssh_connect(s->ssh->session))
    {
    case SSH_AGAIN:
      /* A quick look into libSSH shows that ssh_get_fd() should return non-(-1)
       * after SSH_AGAIN is returned by ssh_connect(). This is however nowhere
       * documented but our code relies on that.
       */
      return SSH_AGAIN;

    case SSH_OK:
      break;

    default:
      return SSH_ERROR;
    }
  }
  /* fallthrough */

  case SK_SSH_SERVER_KNOWN:
  {
    s->ssh->state = SK_SSH_SERVER_KNOWN;

    /* The identity check runs only when a known-hosts file is configured */
    if (s->ssh->server_hostkey_path)
    {
      int server_identity_is_ok = 1;

      /* Check server identity */
      switch (ssh_is_server_known(s->ssh->session))
      {
#define LOG_WARN_ABOUT_SSH_SERVER_VALIDATION(s,msg,args...) log(L_WARN "SSH Identity %s@%s:%u: " msg, (s)->ssh->username, (s)->host, (s)->dport, ## args);
      case SSH_SERVER_KNOWN_OK:
	/* The server is known and has not changed. */
	break;

      case SSH_SERVER_NOT_KNOWN:
	/* Only warned about; the connection proceeds */
	LOG_WARN_ABOUT_SSH_SERVER_VALIDATION(s, "The server is unknown, its public key was not found in the known host file %s", s->ssh->server_hostkey_path);
	break;

      case SSH_SERVER_KNOWN_CHANGED:
	LOG_WARN_ABOUT_SSH_SERVER_VALIDATION(s, "The server key has changed. Either you are under attack or the administrator changed the key.");
	server_identity_is_ok = 0;
	break;

      case SSH_SERVER_FILE_NOT_FOUND:
	LOG_WARN_ABOUT_SSH_SERVER_VALIDATION(s, "The known host file %s does not exist", s->ssh->server_hostkey_path);
	server_identity_is_ok = 0;
	break;

      case SSH_SERVER_ERROR:
	LOG_WARN_ABOUT_SSH_SERVER_VALIDATION(s, "Some error happened");
	server_identity_is_ok = 0;
	break;

      case SSH_SERVER_FOUND_OTHER:
	LOG_WARN_ABOUT_SSH_SERVER_VALIDATION(s, "The server gave use a key of a type while we had an other type recorded. " \
					     "It is a possible attack.");
	server_identity_is_ok = 0;
	break;
      }

      if (!server_identity_is_ok)
	return SSH_ERROR;
    }
  }
  /* fallthrough */

  case SK_SSH_USERAUTH:
  {
    s->ssh->state = SK_SSH_USERAUTH;
    switch (ssh_userauth_publickey_auto(s->ssh->session, NULL, NULL))
    {
    case SSH_AUTH_AGAIN:
      return SSH_AGAIN;

    case SSH_AUTH_SUCCESS:
      break;

    default:
      return SSH_ERROR;
    }
  }
  /* fallthrough */

  case SK_SSH_CHANNEL:
  {
    s->ssh->state = SK_SSH_CHANNEL;
    s->ssh->channel = ssh_channel_new(s->ssh->session);
    if (s->ssh->channel == NULL)
      return SSH_ERROR;
  }
  /* fallthrough */

  case SK_SSH_SESSION:
  {
    s->ssh->state = SK_SSH_SESSION;
    switch (ssh_channel_open_session(s->ssh->channel))
    {
    case SSH_AGAIN:
      return SSH_AGAIN;

    case SSH_OK:
      break;

    default:
      return SSH_ERROR;
    }
  }
  /* fallthrough */

  case SK_SSH_SUBSYSTEM:
  {
    s->ssh->state = SK_SSH_SUBSYSTEM;
    if (s->ssh->subsystem)
    {
      switch (ssh_channel_request_subsystem(s->ssh->channel, s->ssh->subsystem))
      {
      case SSH_AGAIN:
	return SSH_AGAIN;

      case SSH_OK:
	break;

      default:
	return SSH_ERROR;
      }
    }
  }
  /* fallthrough */

  case SK_SSH_ESTABLISHED:
    s->ssh->state = SK_SSH_ESTABLISHED;
  }

  return SSH_OK;
}

/*
 * sk_open_ssh - create a libssh session for socket @s and start connecting
 *
 * The session is configured from @s (host, destination port) and from
 * @s->ssh (user name, optional known-hosts file, optional client identity),
 * switched to non-blocking mode and handed to sk_ssh_connect().
 *
 * Return file descriptor number if success
 * Return -1 if failed
 */
static int
sk_open_ssh(sock *s)
{
  if (!s->ssh)
    bug("sk_open() sock->ssh is not allocated");

  ssh_session session = ssh_new();
  if (session == NULL)
    ERR2("Cannot create a ssh session");
  s->ssh->session = session;

  /* Keep libssh's own logging silent */
  const int log_level = SSH_LOG_NOLOG;
  ssh_options_set(session, SSH_OPTIONS_LOG_VERBOSITY, &log_level);

  /* Remote endpoint and login name */
  ssh_options_set(session, SSH_OPTIONS_HOST, s->host);
  ssh_options_set(session, SSH_OPTIONS_PORT, &(s->dport));
  /* TODO: Add SSH_OPTIONS_BINDADDR */
  ssh_options_set(session, SSH_OPTIONS_USER, s->ssh->username);

  /* Optional key material, only when configured */
  if (s->ssh->server_hostkey_path)
    ssh_options_set(session, SSH_OPTIONS_KNOWNHOSTS, s->ssh->server_hostkey_path);

  if (s->ssh->client_privkey_path)
    ssh_options_set(session, SSH_OPTIONS_IDENTITY, s->ssh->client_privkey_path);

  ssh_set_blocking(session, 0);

  /* SSH_AGAIN (and any other value) needs no action here */
  int rv = sk_ssh_connect(s);
  if (rv == SSH_OK)
    sk_ssh_connected(s);
  else if (rv == SSH_ERROR)
    ERR2(ssh_get_error(session));

  return ssh_get_fd(session);

 err:
  return -1;
}
#endif

/**
 * sk_open - open a socket
 * @s: socket
 *
 * This function takes a socket resource created by sk_new() and
 * initialized by the user and binds a corresponding network connection
 * to it.
 *
 * Result: 0 for success, -1 for an error.
 */
1307 1308 1309
int
sk_open(sock *s)
{
1310
  int af = AF_UNSPEC;
1311
  int fd = -1;
1312 1313 1314 1315
  int do_bind = 0;
  int bind_port = 0;
  ip_addr bind_addr = IPA_NONE;
  sockaddr sa;
1316

1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349
  if (s->type <= SK_IP)
  {
    /*
     * For TCP/IP sockets, Address family (IPv4 or IPv6) can be specified either
     * explicitly (SK_IPV4 or SK_IPV6) or implicitly (based on saddr, daddr).
     * But the specifications have to be consistent.
     */

    switch (s->subtype)
    {
    case 0:
      ASSERT(ipa_zero(s->saddr) || ipa_zero(s->daddr) ||
	     (ipa_is_ip4(s->saddr) == ipa_is_ip4(s->daddr)));
      af = (ipa_is_ip4(s->saddr) || ipa_is_ip4(s->daddr)) ? AF_INET : AF_INET6;
      break;

    case SK_IPV4:
      ASSERT(ipa_zero(s->saddr) || ipa_is_ip4(s->saddr));
      ASSERT(ipa_zero(s->daddr) || ipa_is_ip4(s->daddr));
      af = AF_INET;
      break;

    case SK_IPV6:
      ASSERT(ipa_zero(s->saddr) || !ipa_is_ip4(s->saddr));
      ASSERT(ipa_zero(s->daddr) || !ipa_is_ip4(s->daddr));
      af = AF_INET6;
      break;

    default:
      bug("Invalid subtype %d", s->subtype);
    }
  }

1350
  switch (s->type)
1351 1352 1353 1354 1355
  {
  case SK_TCP_ACTIVE:
    s->ttx = "";			/* Force s->ttx != s->tpos */
    /* Fall thru */
  case SK_TCP_PASSIVE:
1356
    fd = socket(af, SOCK_STREAM, IPPROTO_TCP);
1357 1358 1359 1360
    bind_port = s->sport;
    bind_addr = s->saddr;
    do_bind = bind_port || ipa_nonzero(bind_addr);
    break;
1361

1362
#ifdef HAVE_LIBSSH
1363 1364 1365 1366
  case SK_SSH_ACTIVE:
    s->ttx = "";			/* Force s->ttx != s->tpos */
    fd = sk_open_ssh(s);
    break;
1367
#endif
1368

1369
  case SK_UDP:
1370
    fd = socket(af, SOCK_DGRAM, IPPROTO_UDP);
1371 1372 1373 1374 1375 1376
    bind_port = s->sport;
    bind_addr = (s->flags & SKF_BIND) ? s->saddr : IPA_NONE;
    do_bind = 1;
    break;

  case SK_IP:
1377
    fd = socket(af, SOCK_RAW, s->dport);
1378 1379 1380 1381 1382 1383
    bind_port = 0;
    bind_addr = (s->flags & SKF_BIND) ? s->saddr : IPA_NONE;
    do_bind = ipa_nonzero(bind_addr);
    break;

  case SK_MAGIC:
1384
    af = 0;
1385 1386 1387 1388 1389 1390 1391
    fd = s->fd;
    break;

  default:
    bug("sk_open() called for invalid sock type %d", s->type);
  }

1392
  if (fd < 0)
1393 1394
    ERR("socket");

1395
  s->af = af;
1396 1397
  s->fd = fd;

1398 1399
  if (sk_setup(s) < 0)
    goto err;
1400

1401
  if (do_bind)
1402 1403
  {
    if (bind_port)
1404
    {
1405 1406 1407 1408
      int y = 1;

      if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &y, sizeof(y)) < 0)
	ERR2("SO_REUSEADDR");
1409

1410
#ifdef CONFIG_NO_IFACE_BIND
1411 1412 1413 1414 1415 1416
      /* Workaround missing ability to bind to an iface */
      if ((s->type == SK_UDP) && s->iface && ipa_zero(bind_addr))
      {
	if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &y, sizeof(y)) < 0)
	  ERR2("SO_REUSEPORT");
      }
1417
#endif
1418
    }
1419 1420 1421 1422
    else
      if (s->flags & SKF_HIGH_PORT)
	if (sk_set_high_port(s) < 0)
	  log(L_WARN "Socket error: %s%#m", s->err);
1423

1424
    sockaddr_fill(&sa, s->af, bind_addr, s->iface, bind_port);
1425 1426 1427
    if (bind(fd, &sa.sa, SA_LEN(sa)) < 0)
      ERR2("bind");
  }
1428 1429

  if (s->password)
1430
    if (sk_set_md5_auth(s, s->saddr, s->daddr, s->iface, s->password, 0) < 0)
1431
      goto err;
1432

1433
  switch (s->type)
1434 1435
  {
  case SK_TCP_ACTIVE:
1436
    sockaddr_fill(&sa, s->af, s->daddr, s->iface, s->dport);
1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448
    if (connect(fd, &sa.sa, SA_LEN(sa)) >= 0)
      sk_tcp_connected(s);
    else if (errno != EINTR && errno != EAGAIN && errno != EINPROGRESS &&
	     errno != ECONNREFUSED && errno != EHOSTUNREACH && errno != ENETUNREACH)
      ERR2("connect");
    break;

  case SK_TCP_PASSIVE:
    if (listen(fd, 8) < 0)
      ERR2("listen");
    break;

1449
  case SK_SSH_ACTIVE:
1450 1451 1452 1453 1454 1455
  case SK_MAGIC:
    break;

  default:
    sk_alloc_bufs(s);
  }
1456

1457 1458
  if (!(s->flags & SKF_THREAD))
    sk_insert(s);
1459

1460 1461
  return 0;

1462
err:
1463 1464 1465 1466 1467
  close(fd);
  s->fd = -1;
  return -1;
}

1468
int
1469 1470 1471
sk_open_unix(sock *s, char *name)
{
  struct sockaddr_un sa;
1472 1473 1474
  int fd;

  /* We are sloppy during error (leak fd and not set s->err), but we die anyway */
1475 1476 1477

  fd = socket(AF_UNIX, SOCK_STREAM, 0);
  if (fd < 0)
1478 1479 1480 1481
    return -1;

  if (fcntl(fd, F_SETFL, O_NONBLOCK) < 0)
    return -1;
1482

1483
  /* Path length checked in test_old_bird() */
1484
  sa.sun_family = AF_UNIX;
1485
  strcpy(sa.sun_path, name);
1486

1487
  if (bind(fd, (struct sockaddr *) &sa, SUN_LEN(&sa)) < 0)
1488 1489 1490 1491 1492 1493
    return -1;

  if (listen(fd, 8) < 0)
    return -1;

  s->fd = fd;
1494
  sk_insert(s);
1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527
  return 0;
}


/*
 * Sizes of the on-stack control-message buffers used by sk_recvmsg() and
 * sk_sendmsg(). Each is the larger of the IPv4 and IPv6 requirement, so one
 * buffer fits either address family: RX takes pktinfo plus TTL, TX takes
 * pktinfo only.
 */
#define CMSG_RX_SPACE MAX(CMSG4_SPACE_PKTINFO+CMSG4_SPACE_TTL, \
			  CMSG6_SPACE_PKTINFO+CMSG6_SPACE_TTL)
#define CMSG_TX_SPACE MAX(CMSG4_SPACE_PKTINFO,CMSG6_SPACE_PKTINFO)

/*
 * Fill the control-message part of @msg for sending on @s, using @cbuf
 * (@cbuflen bytes) as storage. Dispatches to the IPv4 or the IPv6 variant
 * according to sk_is_ipv4().
 */
static void
sk_prepare_cmsgs(sock *s, struct msghdr *msg, void *cbuf, size_t cbuflen)
{
  if (!sk_is_ipv4(s))
  {
    sk_prepare_cmsgs6(s, msg, cbuf, cbuflen);
    return;
  }

  sk_prepare_cmsgs4(s, msg, cbuf, cbuflen);
}

/*
 * Extract per-packet information from the control messages of a received
 * @msg into @s. The fields laddr, lifindex and rcv_ttl are first reset to
 * "unknown" (IPA_NONE, 0, -1); the pktinfo and TTL handlers matching the
 * socket's address family are then run on every control message.
 */
static void
sk_process_cmsgs(sock *s, struct msghdr *msg)
{
  struct cmsghdr *cm;

  s->laddr = IPA_NONE;
  s->lifindex = 0;
  s->rcv_ttl = -1;

  for (cm = CMSG_FIRSTHDR(msg); cm != NULL; cm = CMSG_NXTHDR(msg, cm))
  {
    if ((cm->cmsg_level == SOL_IP) && sk_is_ipv4(s))
    {
      sk_process_cmsg4_pktinfo(s, cm);
      sk_process_cmsg4_ttl(s, cm);
    }

    if ((cm->cmsg_level == SOL_IPV6) && sk_is_ipv6(s))
    {
      sk_process_cmsg6_pktinfo(s, cm);
      sk_process_cmsg6_ttl(s, cm);
    }
  }
}

1537 1538 1539 1540 1541 1542

static inline int
sk_sendmsg(sock *s)
{
  struct iovec iov = {s->tbuf, s->tpos - s->tbuf};
  byte cmsg_buf[CMSG_TX_SPACE];
1543
  sockaddr dst;
1544

1545
  sockaddr_fill(&dst, s->af, s->daddr, s->iface, s->dport);
1546 1547

  struct msghdr msg = {
1548 1549
    .msg_name = &dst.sa,
    .msg_namelen = SA_LEN(dst),
1550 1551 1552 1553 1554 1555 1556 1557 1558 1559
    .msg_iov = &iov,
    .msg_iovlen = 1
  };

#ifdef CONFIG_USE_HDRINCL
  byte hdr[20];
  struct iovec iov2[2] = { {hdr, 20}, iov };

  if (s->flags & SKF_HDRINCL)
  {
1560
    sk_prepare_ip_header(s, hdr, iov.iov_len);
1561 1562 1563 1564 1565 1566
    msg.msg_iov = iov2;
    msg.msg_iovlen = 2;
  }
#endif

  if (s->flags & SKF_PKTINFO)
1567
    sk_prepare_cmsgs(s, &msg, cmsg_buf, sizeof(cmsg_buf));
1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579

  return sendmsg(s->fd, &msg, 0);
}

/*
 * Receive one message on @s into rbuf (at most rbsize bytes) using recvmsg().
 *
 * On success the sender is stored in faddr/fport, the control messages are
 * processed by sk_process_cmsgs(), and SKF_TRUNCATED in s->flags is set or
 * cleared according to MSG_TRUNC.
 *
 * Returns the result of recvmsg(); a negative value is returned unchanged
 * and @s is not updated in that case.
 */
static inline int
sk_recvmsg(sock *s)
{
  struct iovec iov = {s->rbuf, s->rbsize};
  byte cmsg_buf[CMSG_RX_SPACE];
  sockaddr src;

  struct msghdr msg = {
    .msg_name = &src.sa,
    .msg_namelen = sizeof(src),	/* Capacity of src; recvmsg() stores the actual length */
    .msg_iov = &iov,
    .msg_iovlen = 1,
    .msg_control = cmsg_buf,
    .msg_controllen = sizeof(cmsg_buf),
    .msg_flags = 0
  };

  int rv = recvmsg(s->fd, &msg, 0);
  if (rv < 0)
    return rv;

  //ifdef IPV4
  //  if (cf_type == SK_IP)
  //    rv = ipv4_skip_header(pbuf, rv);
  //endif

  sockaddr_read(&src, s->af, &s->faddr, NULL, &s->fport);
  sk_process_cmsgs(s, &msg);

  if (msg.msg_flags & MSG_TRUNC)
    s->flags |= SKF_TRUNCATED;
  else
    s->flags &= ~SKF_TRUNCATED;

  return rv;
}


1610 1611
static inline void reset_tx_buffer(sock *s) { s->ttx = s->tpos = s->tbuf; }

1612 1613 1614 1615 1616 1617
static int
sk_maybe_write(sock *s)
{
  int e;

  switch (s->type)