bgp.c 51.5 KB
Newer Older
1 2 3 4
/*
 *	BIRD -- The Border Gateway Protocol
 *
 *	(c) 2000 Martin Mares <mj@ucw.cz>
Ondřej Zajíček's avatar
Ondřej Zajíček committed
5 6
 *	(c) 2008--2016 Ondrej Zajicek <santiago@crfreenet.org>
 *	(c) 2008--2016 CZ.NIC z.s.p.o.
7 8 9 10
 *
 *	Can be freely distributed and used under the terms of the GNU GPL.
 */

Martin Mareš's avatar
Martin Mareš committed
11 12 13
/**
 * DOC: Border Gateway Protocol
 *
Ondřej Zajíček's avatar
Ondřej Zajíček committed
14 15
 * The BGP protocol is implemented in three parts: |bgp.c| which takes care of
 * the connection and most of the interface with BIRD core, |packets.c| handling
Martin Mareš's avatar
Martin Mareš committed
16 17 18
 * both incoming and outgoing BGP packets and |attrs.c| containing functions for
 * manipulation with BGP attribute lists.
 *
Ondřej Zajíček's avatar
Ondřej Zajíček committed
19 20 21 22 23 24
 * As opposed to the other existing routing daemons, BIRD has a sophisticated
 * core architecture which is able to keep all the information needed by BGP in
 * the primary routing table, therefore no complex data structures like a
 * central BGP table are needed. This increases memory footprint of a BGP router
 * with many connections, but not too much and, which is more important, it
 * makes BGP much easier to implement.
Martin Mareš's avatar
Martin Mareš committed
25
 *
Ondřej Zajíček's avatar
Ondřej Zajíček committed
26 27 28 29 30 31
 * Each instance of BGP (corresponding to a single BGP peer) is described by a
 * &bgp_proto structure to which are attached individual connections represented
 * by &bgp_connection (usually, there exists only one connection, but during BGP
 * session setup, there can be more of them). The connections are handled
 * according to the BGP state machine defined in the RFC with all the timers and
 * all the parameters configurable.
Martin Mareš's avatar
Martin Mareš committed
32
 *
Ondřej Zajíček's avatar
Ondřej Zajíček committed
33 34 35 36
 * In incoming direction, we listen on the connection's socket and each time we
 * receive some input, we pass it to bgp_rx(). It decodes packet headers and the
 * markers and passes complete packets to bgp_rx_packet() which distributes the
 * packet according to its type.
Martin Mareš's avatar
Martin Mareš committed
37
 *
Ondřej Zajíček's avatar
Ondřej Zajíček committed
38 39 40 41 42 43 44 45 46 47 48 49 50 51 52
 * In outgoing direction, we gather all the routing updates and sort them to
 * buckets (&bgp_bucket) according to their attributes (we keep a hash table for
 * fast comparison of &rta's and a &fib which helps us to find if we already
 * have another route for the same destination queued for sending, so that we
 * can replace it with the new one immediately instead of sending both
 * updates). There also exists a special bucket holding all the route
 * withdrawals which cannot be queued anywhere else as they don't have any
 * attributes. If we have any packet to send (due to either new routes or the
 * connection tracking code wanting to send an Open, Keepalive or Notification
 * message), we call bgp_schedule_packet() which sets the corresponding bit in a
 * @packets_to_send bit field in &bgp_conn and as soon as the transmit socket
 * buffer becomes empty, we call bgp_fire_tx(). It inspects state of all the
 * packet type bits and calls the corresponding bgp_create_xx() functions,
 * eventually rescheduling the same packet type if we have more data of the same
 * type to send.
Martin Mareš's avatar
Martin Mareš committed
53
 *
Ondřej Zajíček's avatar
Ondřej Zajíček committed
54 55 56 57 58 59
 * The processing of attributes consists of two functions: bgp_decode_attrs()
 * for checking of the attribute blocks and translating them to the language of
 * BIRD's extended attributes and bgp_encode_attrs() which does the
 * converse. Both functions are built around a @bgp_attr_table array describing
 * all important characteristics of all known attributes.  Unknown transitive
 * attributes are attached to the route as %EAF_TYPE_OPAQUE byte streams.
60 61 62 63 64 65 66 67 68 69
 *
 * BGP protocol implements graceful restart in both restarting (local restart)
 * and receiving (neighbor restart) roles. The first is handled mostly by the
 * graceful restart code in the nest, BGP protocol just handles capabilities,
 * sets @gr_wait and locks graceful restart until end-of-RIB mark is received.
 * The second is implemented by internal restart of the BGP state to %BS_IDLE
 * and protocol state to %PS_START, but keeping the protocol up from the core
 * point of view and therefore maintaining received routes. Routing table
 * refresh cycle (rt_refresh_begin(), rt_refresh_end()) is used for removing
 * stale routes after reestablishment of BGP session during graceful restart.
Martin Mareš's avatar
Martin Mareš committed
70 71
 */

72
#undef LOCAL_DEBUG
73 74 75 76 77

#include "nest/bird.h"
#include "nest/iface.h"
#include "nest/protocol.h"
#include "nest/route.h"
78
#include "nest/cli.h"
79
#include "nest/locks.h"
80
#include "conf/conf.h"
81
#include "lib/socket.h"
82
#include "lib/resource.h"
Martin Mareš's avatar
Martin Mareš committed
83
#include "lib/string.h"
84 85 86

#include "bgp.h"

Ondřej Zajíček's avatar
Ondřej Zajíček committed
87

88
struct linpool *bgp_linpool;		/* Global temporary pool */
Ondřej Zajíček's avatar
Ondřej Zajíček committed
89 90
static list bgp_sockets;		/* Global list of listening sockets */

91 92

static void bgp_connect(struct bgp_proto *p);
93
static void bgp_active(struct bgp_proto *p);
94
static void bgp_update_bfd(struct bgp_proto *p, int use_bfd);
95

Ondřej Zajíček's avatar
Ondřej Zajíček committed
96 97
static int bgp_incoming_connection(sock *sk, uint dummy UNUSED);
static void bgp_listen_sock_err(sock *sk UNUSED, int err);
98

Ondřej Zajíček's avatar
Ondřej Zajíček committed
99 100 101 102
/**
 * bgp_open - open a BGP instance
 * @p: BGP instance
 *
Ondřej Zajíček's avatar
Ondřej Zajíček committed
103 104 105 106
 * This function allocates and configures shared BGP resources, mainly listening
 * sockets. Should be called as the last step during initialization (when lock
 * is acquired and neighbor is ready). When error, caller should change state to
 * PS_DOWN and return immediately.
Ondřej Zajíček's avatar
Ondřej Zajíček committed
107 108 109 110
 */
static int
bgp_open(struct bgp_proto *p)
{
Ondřej Zajíček's avatar
Ondřej Zajíček committed
111 112 113 114 115 116 117 118 119
  struct bgp_socket *bs = NULL;
  struct iface *ifa = p->cf->strict_bind ? p->cf->iface : NULL;
  ip_addr addr = p->cf->strict_bind ? p->cf->local_ip :
    (ipa_is_ip4(p->cf->remote_ip) ? IPA_NONE4 : IPA_NONE6);
  uint port = p->cf->local_port;

  /* FIXME: Add some global init? */
  if (!bgp_linpool)
    init_list(&bgp_sockets);
120

Ondřej Zajíček's avatar
Ondřej Zajíček committed
121
  /* We assume that cf->iface is defined iff cf->local_ip is link-local */
Ondřej Zajíček's avatar
Ondřej Zajíček committed
122

Ondřej Zajíček's avatar
Ondřej Zajíček committed
123 124
  WALK_LIST(bs, bgp_sockets)
    if (ipa_equal(bs->sk->saddr, addr) && (bs->sk->iface == ifa) && (bs->sk->sport == port))
125
    {
Ondřej Zajíček's avatar
Ondřej Zajíček committed
126 127 128
      bs->uc++;
      p->sock = bs;
      return 0;
129 130
    }

Ondřej Zajíček's avatar
Ondřej Zajíček committed
131 132 133 134 135 136 137 138 139 140 141 142 143 144
  sock *sk = sk_new(proto_pool);
  sk->type = SK_TCP_PASSIVE;
  sk->ttl = 255;
  sk->saddr = addr;
  sk->sport = port;
  sk->flags = 0;
  sk->tos = IP_PREC_INTERNET_CONTROL;
  sk->rbsize = BGP_RX_BUFFER_SIZE;
  sk->tbsize = BGP_TX_BUFFER_SIZE;
  sk->rx_hook = bgp_incoming_connection;
  sk->err_hook = bgp_listen_sock_err;

  if (sk_open(sk) < 0)
    goto err;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
145

Ondřej Zajíček's avatar
Ondřej Zajíček committed
146 147 148 149
  bs = mb_allocz(proto_pool, sizeof(struct bgp_socket));
  bs->sk = sk;
  bs->uc = 1;
  p->sock = bs;
150

Ondřej Zajíček's avatar
Ondřej Zajíček committed
151 152 153 154
  add_tail(&bgp_sockets, &bs->n);

  if (!bgp_linpool)
    bgp_linpool = lp_new(proto_pool, 4080);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
155 156

  return 0;
157 158

err:
Ondřej Zajíček's avatar
Ondřej Zajíček committed
159 160 161
  sk_log_error(sk, p->p.name);
  log(L_ERR "%s: Cannot open listening socket", p->p.name);
  rfree(sk);
162
  return -1;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
163 164
}

Ondřej Zajíček's avatar
Ondřej Zajíček committed
165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220
/**
 * bgp_close - close a BGP instance
 * @p: BGP instance
 *
 * This function drops the reference of @p to its shared listening socket.
 * When the last user is gone, the socket is freed and removed from the global
 * list; when that list becomes empty, the global temporary pool is freed too.
 */
static void
bgp_close(struct bgp_proto *p)
{
  struct bgp_socket *ls = p->sock;

  ASSERT(ls && ls->uc);

  /* Somebody else still uses the listening socket */
  ls->uc--;
  if (ls->uc)
    return;

  rfree(ls->sk);
  rem_node(&ls->n);
  mb_free(ls);

  /* That was the last listening socket, release the global pool as well */
  if (EMPTY_LIST(bgp_sockets))
  {
    rfree(bgp_linpool);
    bgp_linpool = NULL;
  }
}

static inline int
bgp_setup_auth(struct bgp_proto *p, int enable)
{
  if (p->cf->password)
  {
    int rv = sk_set_md5_auth(p->sock->sk,
			     p->cf->local_ip, p->cf->remote_ip, p->cf->iface,
			     enable ? p->cf->password : NULL, p->cf->setkey);

    if (rv < 0)
      sk_log_error(p->sock->sk, p->p.name);

    return rv;
  }
  else
    return 0;
}

/* Return the channel of @p whose AFI equals @afi, or NULL if there is none */
static inline struct bgp_channel *
bgp_find_channel(struct bgp_proto *p, u32 afi)
{
  struct bgp_channel *ch;

  WALK_LIST(ch, p->p.channels)
  {
    if (ch->afi == afi)
      return ch;
  }

  /* The list was exhausted without a match */
  return NULL;
}

221 222 223 224
static void
bgp_startup(struct bgp_proto *p)
{
  BGP_TRACE(D_EVENTS, "Started");
Ondřej Zajíček's avatar
Ondřej Zajíček committed
225
  p->start_state = BSS_CONNECT;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
226 227 228

  if (!p->cf->passive)
    bgp_active(p);
229 230 231 232 233 234 235 236 237 238 239 240
}

/* Timer hook proceeding with the startup; the timer data is the BGP instance */
static void
bgp_startup_timeout(timer *t)
{
  struct bgp_proto *p = t->data;

  bgp_startup(p);
}


static void
bgp_initiate(struct bgp_proto *p)
{
Ondřej Zajíček's avatar
Ondřej Zajíček committed
241 242 243 244 245 246 247
  int err_val;

  if (bgp_open(p) < 0)
  { err_val = BEM_NO_SOCKET; goto err1; }

  if (bgp_setup_auth(p, 1) < 0)
  { err_val = BEM_INVALID_MD5; goto err2; }
248

249 250 251
  if (p->cf->bfd)
    bgp_update_bfd(p, p->cf->bfd);

252
  if (p->startup_delay)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
253 254 255 256 257
  {
    p->start_state = BSS_DELAY;
    BGP_TRACE(D_EVENTS, "Startup delayed by %d seconds due to errors", p->startup_delay);
    bgp_start_timer(p->startup_timer, p->startup_delay);
  }
258 259 260
  else
    bgp_startup(p);

Ondřej Zajíček's avatar
Ondřej Zajíček committed
261
  return;
262

Ondřej Zajíček's avatar
Ondřej Zajíček committed
263 264 265 266 267 268
err2:
  bgp_close(p);
err1:
  p->p.disabled = 1;
  bgp_store_error(p, NULL, BE_MISC, err_val);
  proto_notify_state(&p->p, PS_DOWN);
269

Ondřej Zajíček's avatar
Ondřej Zajíček committed
270
  return;
271 272
}

Martin Mareš's avatar
Martin Mareš committed
273 274 275 276 277
/**
 * bgp_start_timer - start a BGP timer
 * @t: timer
 * @value: time to fire (0 to disable the timer)
 *
Ondřej Zajíček's avatar
Ondřej Zajíček committed
278 279 280
 * This functions calls tm_start() on @t with time @value and the amount of
 * randomization suggested by the BGP standard. Please use it for all BGP
 * timers.
Martin Mareš's avatar
Martin Mareš committed
281
 */
282
void
283 284
bgp_start_timer(timer *t, int value)
{
285
  if (value)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
286 287 288 289 290
  {
    /* The randomization procedure is specified in RFC 1771: 9.2.3.3 */
    t->randomize = value / 4;
    tm_start(t, value - t->randomize);
  }
291 292 293 294
  else
    tm_stop(t);
}

Martin Mareš's avatar
Martin Mareš committed
295 296 297 298
/**
 * bgp_close_conn - close a BGP connection
 * @conn: connection to close
 *
Ondřej Zajíček's avatar
Ondřej Zajíček committed
299 300
 * This function takes a connection described by the &bgp_conn structure, closes
 * its socket and frees all resources associated with it.
Martin Mareš's avatar
Martin Mareš committed
301
 */
302 303 304
void
bgp_close_conn(struct bgp_conn *conn)
{
305
  // struct bgp_proto *p = conn->bgp;
306 307 308

  DBG("BGP: Closing connection\n");
  conn->packets_to_send = 0;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
309 310 311
  conn->channels_to_send = 0;
  rfree(conn->connect_timer);
  conn->connect_timer = NULL;
312 313 314 315
  rfree(conn->keepalive_timer);
  conn->keepalive_timer = NULL;
  rfree(conn->hold_timer);
  conn->hold_timer = NULL;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
316 317
  rfree(conn->tx_ev);
  conn->tx_ev = NULL;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
318 319 320 321 322 323 324
  rfree(conn->sk);
  conn->sk = NULL;

  mb_free(conn->local_caps);
  conn->local_caps = NULL;
  mb_free(conn->remote_caps);
  conn->remote_caps = NULL;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
325 326 327 328 329 330 331
}


/**
 * bgp_update_startup_delay - update a startup delay
 * @p: BGP instance
 *
Ondřej Zajíček's avatar
Ondřej Zajíček committed
332 333 334
 * This function updates a startup delay that is used to postpone next BGP
 * connect. It also handles disable_after_error and might stop BGP instance
 * when error happened and disable_after_error is on.
Ondřej Zajíček's avatar
Ondřej Zajíček committed
335 336 337 338
 *
 * It should be called when BGP protocol error happened.
 */
void
339
bgp_update_startup_delay(struct bgp_proto *p)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
340 341 342
{
  struct bgp_config *cf = p->cf;

343
  DBG("BGP: Updating startup delay\n");
Ondřej Zajíček's avatar
Ondřej Zajíček committed
344

345
  if (p->last_proto_error && ((now - p->last_proto_error) >= (int) cf->error_amnesia_time))
346 347
    p->startup_delay = 0;

Ondřej Zajíček's avatar
Ondřej Zajíček committed
348 349 350
  p->last_proto_error = now;

  if (cf->disable_after_error)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
351 352 353 354 355
  {
    p->startup_delay = 0;
    p->p.disabled = 1;
    return;
  }
Ondřej Zajíček's avatar
Ondřej Zajíček committed
356 357 358 359

  if (!p->startup_delay)
    p->startup_delay = cf->error_delay_time_min;
  else
360
    p->startup_delay = MIN(2 * p->startup_delay, cf->error_delay_time_max);
361 362
}

Ondřej Zajíček's avatar
Ondřej Zajíček committed
363
static void
Ondřej Zajíček's avatar
Ondřej Zajíček committed
364
bgp_graceful_close_conn(struct bgp_conn *conn, uint subcode)
365
{
Ondřej Zajíček's avatar
Ondřej Zajíček committed
366
  switch (conn->state)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385
  {
  case BS_IDLE:
  case BS_CLOSE:
    return;

  case BS_CONNECT:
  case BS_ACTIVE:
    bgp_conn_enter_idle_state(conn);
    return;

  case BS_OPENSENT:
  case BS_OPENCONFIRM:
  case BS_ESTABLISHED:
    bgp_error(conn, 6, subcode, NULL, 0);
    return;

  default:
    bug("bgp_graceful_close_conn: Unknown state %d", conn->state);
  }
386 387
}

Ondřej Zajíček's avatar
Ondřej Zajíček committed
388 389 390 391
static void
bgp_down(struct bgp_proto *p)
{
  if (p->start_state > BSS_PREPARE)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
392 393 394 395
  {
    bgp_setup_auth(p, 0);
    bgp_close(p);
  }
Ondřej Zajíček's avatar
Ondřej Zajíček committed
396

397
  BGP_TRACE(D_EVENTS, "Down");
Ondřej Zajíček's avatar
Ondřej Zajíček committed
398 399 400 401 402 403 404 405 406
  proto_notify_state(&p->p, PS_DOWN);
}

static void
bgp_decision(void *vp)
{
  struct bgp_proto *p = vp;

  DBG("BGP: Decision start\n");
Ondřej Zajíček's avatar
Ondřej Zajíček committed
407 408 409 410
  if ((p->p.proto_state == PS_START) &&
      (p->outgoing_conn.state == BS_IDLE) &&
      (p->incoming_conn.state != BS_OPENCONFIRM) &&
      !p->cf->passive)
411
    bgp_active(p);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
412

Ondřej Zajíček's avatar
Ondřej Zajíček committed
413 414 415
  if ((p->p.proto_state == PS_STOP) &&
      (p->outgoing_conn.state == BS_IDLE) &&
      (p->incoming_conn.state == BS_IDLE))
Ondřej Zajíček's avatar
Ondřej Zajíček committed
416 417 418
    bgp_down(p);
}

419
void
Ondřej Zajíček's avatar
Ondřej Zajíček committed
420
bgp_stop(struct bgp_proto *p, uint subcode)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
421 422
{
  proto_notify_state(&p->p, PS_STOP);
423 424
  bgp_graceful_close_conn(&p->outgoing_conn, subcode);
  bgp_graceful_close_conn(&p->incoming_conn, subcode);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
425 426 427
  ev_schedule(p->event);
}

428
static inline void
Ondřej Zajíček's avatar
Ondřej Zajíček committed
429
bgp_conn_set_state(struct bgp_conn *conn, uint new_state)
430 431 432 433 434 435 436 437 438 439 440 441 442 443
{
  if (conn->bgp->p.mrtdump & MD_STATES)
    mrt_dump_bgp_state_change(conn, conn->state, new_state);

  conn->state = new_state;
}

/* Switch @conn to the OpenConfirm state */
void
bgp_conn_enter_openconfirm_state(struct bgp_conn *conn)
{
  /* Only the state changes here; most of the work is done in bgp_rx_open() */
  bgp_conn_set_state(conn, BS_OPENCONFIRM);
}

Ondřej Zajíček's avatar
Ondřej Zajíček committed
444 445
static const struct bgp_af_caps dummy_af_caps = { };

Ondřej Zajíček's avatar
Ondřej Zajíček committed
446 447 448 449
void
bgp_conn_enter_established_state(struct bgp_conn *conn)
{
  struct bgp_proto *p = conn->bgp;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
450 451 452
  struct bgp_caps *local = conn->local_caps;
  struct bgp_caps *peer = conn->remote_caps;
  struct bgp_channel *c;
453

Ondřej Zajíček's avatar
Ondřej Zajíček committed
454 455
  BGP_TRACE(D_EVENTS, "BGP session established");

456 457
  /* For multi-hop BGP sessions */
  if (ipa_zero(p->source_addr))
458
    p->source_addr = conn->sk->saddr;
459

460 461
  conn->sk->fast_rx = 0;

Ondřej Zajíček's avatar
Ondřej Zajíček committed
462 463 464
  p->conn = conn;
  p->last_error_class = 0;
  p->last_error_code = 0;
465

Ondřej Zajíček's avatar
Ondřej Zajíček committed
466 467 468 469
  p->as4_session = conn->as4_session;

  p->route_refresh = peer->route_refresh;
  p->enhanced_refresh = local->enhanced_refresh && peer->enhanced_refresh;
470

Ondřej Zajíček's avatar
Ondřej Zajíček committed
471 472
  /* Whether we may handle possible GR of peer (it has some AF GR-able) */
  p->gr_ready = 0;	/* Updated later */
473

Ondřej Zajíček's avatar
Ondřej Zajíček committed
474 475
  /* Whether peer is ready to handle our GR recovery */
  int peer_gr_ready = peer->gr_aware && !(peer->gr_flags & BGP_GRF_RESTART);
476

Ondřej Zajíček's avatar
Ondřej Zajíček committed
477
  if (p->gr_active_num)
478 479
    tm_stop(p->gr_timer);

Ondřej Zajíček's avatar
Ondřej Zajíček committed
480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506
  /* Number of active channels */
  int num = 0;

  WALK_LIST(c, p->p.channels)
  {
    const struct bgp_af_caps *loc = bgp_find_af_caps(local, c->afi);
    const struct bgp_af_caps *rem = bgp_find_af_caps(peer,  c->afi);

    /* Ignore AFIs that were not announced in multiprotocol capability */
    if (!loc || !loc->ready)
      loc = &dummy_af_caps;

    if (!rem || !rem->ready)
      rem = &dummy_af_caps;

    int active = loc->ready && rem->ready;
    c->c.disabled = !active;
    c->c.reloadable = p->route_refresh;

    c->index = active ? num++ : 0;

    c->feed_state = BFS_NONE;
    c->load_state = BFS_NONE;

    /* Channels where peer may do GR */
    c->gr_ready = active && local->gr_aware && rem->gr_able;
    p->gr_ready = p->gr_ready || c->gr_ready;
507

Ondřej Zajíček's avatar
Ondřej Zajíček committed
508 509 510
    /* Channels not able to recover gracefully */
    if (p->p.gr_recovery && (!active || !peer_gr_ready))
      channel_graceful_restart_unlock(&c->c);
511

Ondřej Zajíček's avatar
Ondřej Zajíček committed
512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545
    /* Channels waiting for local convergence */
    if (p->p.gr_recovery && loc->gr_able && peer_gr_ready)
      c->c.gr_wait = 1;

    /* Channels where peer is not able to recover gracefully */
    if (c->gr_active && ! (c->gr_ready && (rem->gr_af_flags & BGP_GRF_FORWARDING)))
      bgp_graceful_restart_done(c);

    /* GR capability implies that neighbor will send End-of-RIB */
    if (peer->gr_aware)
      c->load_state = BFS_LOADING;

    c->add_path_rx = (loc->add_path & BGP_ADD_PATH_RX) && (rem->add_path & BGP_ADD_PATH_TX);
    c->add_path_tx = (loc->add_path & BGP_ADD_PATH_TX) && (rem->add_path & BGP_ADD_PATH_RX);

    // XXXX reset back to non-ANY?
    if (c->add_path_tx)
      c->c.ra_mode = RA_ANY;
  }

  p->afi_map = mb_alloc(p->p.pool, num * sizeof(u32));
  p->channel_map = mb_alloc(p->p.pool, num * sizeof(void *));
  p->channel_count = num;

  WALK_LIST(c, p->p.channels)
  {
    if (c->c.disabled)
      continue;

    p->afi_map[c->index] = c->afi;
    p->channel_map[c->index] = c;
  }

  /* proto_notify_state() will likely call bgp_feed_begin(), setting c->feed_state */
546

547
  bgp_conn_set_state(conn, BS_ESTABLISHED);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
548 549 550 551 552 553 554 555 556 557
  proto_notify_state(&p->p, PS_UP);
}

/*
 * The established connection of @p is gone: forget it and, if the protocol is
 * still up, begin stopping it.
 */
static void
bgp_conn_leave_established_state(struct bgp_proto *p)
{
  BGP_TRACE(D_EVENTS, "BGP session closed");
  p->conn = NULL;

  if (p->p.proto_state != PS_UP)
    return;

  bgp_stop(p, 0);
}

void
bgp_conn_enter_close_state(struct bgp_conn *conn)
{
  struct bgp_proto *p = conn->bgp;
  int os = conn->state;

567
  bgp_conn_set_state(conn, BS_CLOSE);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
568 569 570
  tm_stop(conn->keepalive_timer);
  conn->sk->rx_hook = NULL;

571 572 573
  /* Timeout for CLOSE state, if we cannot send notification soon then we just hangup */
  bgp_start_timer(conn->hold_timer, 10);

Ondřej Zajíček's avatar
Ondřej Zajíček committed
574 575 576 577 578 579 580 581 582 583 584
  if (os == BS_ESTABLISHED)
    bgp_conn_leave_established_state(p);
}

void
bgp_conn_enter_idle_state(struct bgp_conn *conn)
{
  struct bgp_proto *p = conn->bgp;
  int os = conn->state;

  bgp_close_conn(conn);
585
  bgp_conn_set_state(conn, BS_IDLE);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
586 587 588 589 590 591
  ev_schedule(p->event);

  if (os == BS_ESTABLISHED)
    bgp_conn_leave_established_state(p);
}

592 593 594 595 596 597 598 599 600 601 602
/**
 * bgp_handle_graceful_restart - handle detected BGP graceful restart
 * @p: BGP instance
 *
 * This function is called when a BGP graceful restart of the neighbor is
 * detected (when the TCP connection fails or when a new TCP connection
 * appears). The function activates processing of the restart - starts routing
 * table refresh cycle and activates BGP restart timer. The protocol state goes
 * back to %PS_START, but changing BGP state back to %BS_IDLE is left for the
 * caller.
 */
603 604 605 606 607 608
void
bgp_handle_graceful_restart(struct bgp_proto *p)
{
  ASSERT(p->conn && (p->conn->state == BS_ESTABLISHED) && p->gr_ready);

  BGP_TRACE(D_EVENTS, "Neighbor graceful restart detected%s",
Ondřej Zajíček's avatar
Ondřej Zajíček committed
609 610 611
	    p->gr_active_num ? " - already pending" : "");

  p->gr_active_num = 0;
612

Ondřej Zajíček's avatar
Ondřej Zajíček committed
613 614 615 616 617 618 619
  struct bgp_channel *c;
  WALK_LIST(c, p->p.channels)
  {
    if (c->gr_ready)
    {
      if (c->gr_active)
	rt_refresh_end(c->c.table, &c->c);
620

Ondřej Zajíček's avatar
Ondřej Zajíček committed
621 622 623 624 625 626 627 628 629 630 631 632 633 634
      c->gr_active = 1;
      p->gr_active_num++;
      rt_refresh_begin(c->c.table, &c->c);
    }
    else
    {
      /* Just flush the routes */
      rt_refresh_begin(c->c.table, &c->c);
      rt_refresh_end(c->c.table, &c->c);
    }
  }

  proto_notify_state(&p->p, PS_START);
  bgp_start_timer(p->gr_timer, p->conn->local_caps->gr_time);
635 636
}

637 638
/**
 * bgp_graceful_restart_done - finish active BGP graceful restart
Ondřej Zajíček's avatar
Ondřej Zajíček committed
639
 * @c: BGP channel
640 641
 *
 * This function is called when the active BGP graceful restart of the neighbor
Ondřej Zajíček's avatar
Ondřej Zajíček committed
642 643 644 645
 * should be finished for channel @c - either successfully (the neighbor sends
 * all paths and reports end-of-RIB for given AFI/SAFI on the new session) or
 * unsuccessfully (the neighbor does not support BGP graceful restart on the new
 * session). The function ends the routing table refresh cycle.
646
 */
647
void
Ondřej Zajíček's avatar
Ondřej Zajíček committed
648
bgp_graceful_restart_done(struct bgp_channel *c)
649
{
Ondřej Zajíček's avatar
Ondřej Zajíček committed
650 651 652 653 654 655 656 657 658 659
  struct bgp_proto *p = (void *) c->c.proto;

  ASSERT(c->gr_active);
  c->gr_active = 0;
  p->gr_active_num--;

  if (!p->gr_active_num)
    BGP_TRACE(D_EVENTS, "Neighbor graceful restart done");

  rt_refresh_end(c->c.table, &c->c);
660 661
}

662 663 664 665 666 667 668 669 670
/**
 * bgp_graceful_restart_timeout - timeout of graceful restart 'restart timer'
 * @t: timer
 *
 * This function is a timeout hook for @gr_timer, implementing BGP restart time
 * limit for reestablisment of the BGP session after the graceful restart. When
 * fired, we just proceed with the usual protocol restart.
 */

671 672 673 674 675 676 677 678 679
static void
bgp_graceful_restart_timeout(timer *t)
{
  struct bgp_proto *p = t->data;

  BGP_TRACE(D_EVENTS, "Neighbor graceful restart timeout");
  bgp_stop(p, 0);
}

680 681 682

/**
 * bgp_refresh_begin - start incoming enhanced route refresh sequence
Ondřej Zajíček's avatar
Ondřej Zajíček committed
683
 * @c: BGP channel
684 685 686 687 688 689 690 691
 *
 * This function is called when an incoming enhanced route refresh sequence is
 * started by the neighbor, demarcated by the BoRR packet. The function updates
 * the load state and starts the routing table refresh cycle. Note that graceful
 * restart also uses routing table refresh cycle, but RFC 7313 and load states
 * ensure that these two sequences do not overlap.
 */
void
Ondřej Zajíček's avatar
Ondřej Zajíček committed
692
bgp_refresh_begin(struct bgp_channel *c)
693
{
Ondřej Zajíček's avatar
Ondřej Zajíček committed
694 695 696 697
  struct bgp_proto *p = (void *) c->c.proto;

  if (c->load_state == BFS_LOADING)
  { log(L_WARN "%s: BEGIN-OF-RR received before END-OF-RIB, ignoring", p->p.name); return; }
698

Ondřej Zajíček's avatar
Ondřej Zajíček committed
699 700
  c->load_state = BFS_REFRESHING;
  rt_refresh_begin(c->c.table, &c->c);
701 702 703 704
}

/**
 * bgp_refresh_end - finish incoming enhanced route refresh sequence
Ondřej Zajíček's avatar
Ondřej Zajíček committed
705
 * @c: BGP channel
706 707 708 709 710 711 712
 *
 * This function is called when an incoming enhanced route refresh sequence is
 * finished by the neighbor, demarcated by the EoRR packet. The function updates
 * the load state and ends the routing table refresh cycle. Routes not received
 * during the sequence are removed by the nest.
 */
void
Ondřej Zajíček's avatar
Ondřej Zajíček committed
713
bgp_refresh_end(struct bgp_channel *c)
714
{
Ondřej Zajíček's avatar
Ondřej Zajíček committed
715
  struct bgp_proto *p = (void *) c->c.proto;
716

Ondřej Zajíček's avatar
Ondřej Zajíček committed
717 718 719 720 721
  if (c->load_state != BFS_REFRESHING)
  { log(L_WARN "%s: END-OF-RR received without prior BEGIN-OF-RR, ignoring", p->p.name); return; }

  c->load_state = BFS_NONE;
  rt_refresh_end(c->c.table, &c->c);
722 723 724
}


725 726 727 728 729
static void
bgp_send_open(struct bgp_conn *conn)
{
  DBG("BGP: Sending open\n");
  conn->sk->rx_hook = bgp_rx;
730
  conn->sk->tx_hook = bgp_tx;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
731 732
  tm_stop(conn->connect_timer);
  bgp_schedule_packet(conn, NULL, PKT_OPEN);
733
  bgp_conn_set_state(conn, BS_OPENSENT);
734
  bgp_start_timer(conn->hold_timer, conn->bgp->cf->initial_hold_time);
735 736
}

737 738
/* Socket hook logging that the connection is set up and sending OPEN on it */
static void
bgp_connected(sock *sk)
{
  struct bgp_conn *conn = sk->data;
  struct bgp_proto *p = conn->bgp;

  BGP_TRACE(D_EVENTS, "Connected");

  bgp_send_open(conn);
}

/*
 * Hook of the connect timer. While the protocol is starting, the current
 * connection attempt is dropped and a new one is initiated; otherwise the
 * connection just enters the idle state.
 */
static void
bgp_connect_timeout(timer *t)
{
  struct bgp_conn *conn = t->data;
  struct bgp_proto *p = conn->bgp;

  DBG("BGP: connect_timeout\n");

  if (p->p.proto_state != PS_START)
  {
    bgp_conn_enter_idle_state(conn);
    return;
  }

  bgp_close_conn(conn);
  bgp_connect(p);
}

static void
764
bgp_sock_err(sock *sk, int err)
765 766
{
  struct bgp_conn *conn = sk->data;
Martin Mareš's avatar
Martin Mareš committed
767
  struct bgp_proto *p = conn->bgp;
768

769 770 771 772 773 774 775 776 777
  /*
   * This error hook may be called either asynchronously from main
   * loop, or synchronously from sk_send().  But sk_send() is called
   * only from bgp_tx() and bgp_kick_tx(), which are both called
   * asynchronously from main loop. Moreover, they end if err hook is
   * called. Therefore, we could suppose that it is always called
   * asynchronously.
   */

Ondřej Zajíček's avatar
Ondřej Zajíček committed
778 779
  bgp_store_error(p, conn, BE_SOCKET, err);

780 781 782 783
  if (err)
    BGP_TRACE(D_EVENTS, "Connection lost (%M)", err);
  else
    BGP_TRACE(D_EVENTS, "Connection closed");
Ondřej Zajíček's avatar
Ondřej Zajíček committed
784

785 786 787
  if ((conn->state == BS_ESTABLISHED) && p->gr_ready)
    bgp_handle_graceful_restart(p);

Ondřej Zajíček's avatar
Ondřej Zajíček committed
788
  bgp_conn_enter_idle_state(conn);
789 790
}

791 792 793 794
/*
 * Hook of the hold timer. In the Close state it just hangs the connection
 * up. Otherwise, when there is unprocessed input on the socket, the timer is
 * restarted; when there is none, the error with code 4 is raised by
 * bgp_error().
 */
static void
bgp_hold_timeout(timer *t)
{
  struct bgp_conn *conn = t->data;
  struct bgp_proto *p = conn->bgp;

  DBG("BGP: Hold timeout\n");

  /* We are already closing the connection - just do hangup */
  if (conn->state == BS_CLOSE)
  {
    BGP_TRACE(D_EVENTS, "Connection stalled");
    bgp_conn_enter_idle_state(conn);
    return;
  }

  /* Nothing waits in the input queue, the peer is really silent */
  if (sk_rx_ready(conn->sk) <= 0)
  {
    bgp_error(conn, 4, 0, NULL, 0);
    return;
  }

  /* If there is something in input queue, we are probably congested
     and perhaps just not processed BGP packets in time. */
  bgp_start_timer(conn->hold_timer, 10);
}

/*
 * Hook of the keepalive timer: schedule a KEEPALIVE packet and, if the TX
 * event is already pending, run it right away.
 */
static void
bgp_keepalive_timeout(timer *t)
{
  struct bgp_conn *conn = t->data;

  DBG("BGP: Keepalive timer\n");

  bgp_schedule_packet(conn, NULL, PKT_KEEPALIVE);

  /* Kick TX a bit faster */
  if (ev_active(conn->tx_ev))
    ev_run(conn->tx_ev);
}

829
static void
830
bgp_setup_conn(struct bgp_proto *p, struct bgp_conn *conn)
831
{
832
  conn->sk = NULL;
833
  conn->bgp = p;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
834

835
  conn->packets_to_send = 0;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
836 837 838 839 840 841 842
  conn->channels_to_send = 0;
  conn->last_channel = 0;
  conn->last_channel_count = 0;

  conn->connect_timer	= tm_new_set(p->p.pool, bgp_connect_timeout,	conn, 0, 0);
  conn->hold_timer 	= tm_new_set(p->p.pool, bgp_hold_timeout,	conn, 0, 0);
  conn->keepalive_timer	= tm_new_set(p->p.pool, bgp_keepalive_timeout,	conn, 0, 0);
843

Ondřej Zajíček's avatar
Ondřej Zajíček committed
844 845 846
  conn->tx_ev = ev_new(p->p.pool);
  conn->tx_ev->hook = bgp_kick_tx;
  conn->tx_ev->data = conn;
847 848
}

849
static void
850
bgp_setup_sk(struct bgp_conn *conn, sock *s)
851 852 853
{
  s->data = conn;
  s->err_hook = bgp_sock_err;
854
  s->fast_rx = 1;
855 856 857
  conn->sk = s;
}

Ondřej Zajíček's avatar
Ondřej Zajíček committed
858
static void
859
bgp_active(struct bgp_proto *p)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
860
{
861
  int delay = MAX(1, p->cf->connect_delay_time);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
862 863 864 865
  struct bgp_conn *conn = &p->outgoing_conn;

  BGP_TRACE(D_EVENTS, "Connect delayed by %d seconds", delay);
  bgp_setup_conn(p, conn);
866
  bgp_conn_set_state(conn, BS_ACTIVE);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
867
  bgp_start_timer(conn->connect_timer, delay);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
868 869
}

Martin Mareš's avatar
Martin Mareš committed
870 871 872 873 874 875 876 877
/**
 * bgp_connect - initiate an outgoing connection
 * @p: BGP instance
 *
 * The bgp_connect() function creates a new &bgp_conn and initiates
 * a TCP connection to the peer. The rest of connection setup is governed
 * by the BGP state machine as described in the standard.
 */
878 879 880
static void
bgp_connect(struct bgp_proto *p)	/* Enter Connect state and start establishing connection */
{
881
  struct bgp_conn *conn = &p->outgoing_conn;
882
  int hops = p->cf->multihop ? : 1;
883 884

  DBG("BGP: Connecting\n");
Ondřej Zajíček's avatar
Ondřej Zajíček committed
885
  sock *s = sk_new(p->p.pool);
886
  s->type = SK_TCP_ACTIVE;
887
  s->saddr = p->source_addr;
888
  s->daddr = p->cf->remote_ip;
889
  s->dport = p->cf->remote_port;
890
  s->iface = p->neigh ? p->neigh->iface : NULL;
891
  s->ttl = p->cf->ttl_security ? 255 : hops;
892 893
  s->rbsize = p->cf->enable_extended_messages ? BGP_RX_BUFFER_EXT_SIZE : BGP_RX_BUFFER_SIZE;
  s->tbsize = p->cf->enable_extended_messages ? BGP_TX_BUFFER_EXT_SIZE : BGP_TX_BUFFER_SIZE;
894 895 896
  s->tos = IP_PREC_INTERNET_CONTROL;
  s->password = p->cf->password;
  s->tx_hook = bgp_connected;
897
  BGP_TRACE(D_EVENTS, "Connecting to %I%J from local address %I%J", s->daddr, p->cf->iface,
898
	    s->saddr, ipa_is_link_local(s->saddr) ? s->iface : NULL);
899
  bgp_setup_conn(p, conn);
900
  bgp_setup_sk(conn, s);
901
  bgp_conn_set_state(conn, BS_CONNECT);
902 903

  if (sk_open(s) < 0)
904
    goto err;
905 906 907 908

  /* Set minimal receive TTL if needed */
  if (p->cf->ttl_security)
    if (sk_set_min_ttl(s, 256 - hops) < 0)
909
      goto err;
910

911
  DBG("BGP: Waiting for connect success\n");
Ondřej Zajíček's avatar
Ondřej Zajíček committed
912
  bgp_start_timer(conn->connect_timer, p->cf->connect_retry_time);
913 914
  return;

Ondřej Zajíček's avatar
Ondřej Zajíček committed
915
err:
916 917 918
  sk_log_error(s, p->p.name);
  bgp_sock_err(s, 0);
  return;
919 920
}

921 922 923 924 925 926 927 928
/**
 * bgp_find_proto - find existing proto for incoming connection
 * @sk: TCP socket
 *
 * Walks the global protocol list and returns the first BGP instance whose
 * configuration matches the socket: the remote address equals the socket's
 * destination address, the configured interface matches when that address is
 * link-local, the local address is either unset or equal to the socket's
 * source address, and the local port equals the socket's source port.
 * Returns %NULL when no instance matches.
 */
static struct bgp_proto *
bgp_find_proto(sock *sk)
{
  struct bgp_proto *p;

  /* The protocol type is tested first, so BGP-specific fields are read only
     for BGP instances */
  WALK_LIST(p, proto_list)
    if ((p->p.proto == &proto_bgp) &&
	ipa_equal(p->cf->remote_ip, sk->daddr) &&
	(!ipa_is_link_local(sk->daddr) || (p->cf->iface == sk->iface)) &&
	(ipa_zero(p->cf->local_ip) || ipa_equal(p->cf->local_ip, sk->saddr)) &&
	(p->cf->local_port == sk->sport))
      return p;

  return NULL;
}

Martin Mareš's avatar
Martin Mareš committed
942 943 944 945 946 947 948 949 950 951 952 953
/**
 * bgp_incoming_connection - handle an incoming connection
 * @sk: TCP socket
 * @dummy: unused
 *
 * This function serves as a socket hook for accepting of new BGP
 * connections. It searches a BGP instance corresponding to the peer
 * which has connected and if such an instance exists, it creates a
 * &bgp_conn structure, attaches it to the instance and either sends
 * an Open message or (if there already is an active connection) it
 * closes the new connection by sending a Notification message.
 */
954
static int
955
bgp_incoming_connection(sock *sk, uint dummy UNUSED)
956
{
957 958
  struct bgp_proto *p;
  int acc, hops;
959

960
  DBG("BGP: Incoming connection from %I port %d\n", sk->daddr, sk->dport);
961 962
  p = bgp_find_proto(sk);
  if (!p)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
963 964 965 966 967 968
  {
    log(L_WARN "BGP: Unexpected connect from unknown address %I%J (port %d)",
	sk->daddr, ipa_is_link_local(sk->daddr) ? sk->iface : NULL, sk->dport);
    rfree(sk);
    return 0;
  }
969

970 971 972 973 974 975 976
  /*
   * BIRD should keep multiple incoming connections in OpenSent state (for
   * details RFC 4271 8.2.1 par 3), but it keeps just one. Duplicate incoming
   * connections are rejected istead. The exception is the case where an
   * incoming connection triggers a graceful restart.
   */

977 978
  acc = (p->p.proto_state == PS_START || p->p.proto_state == PS_UP) &&
    (p->start_state >= BSS_CONNECT) && (!p->incoming_conn.sk);
979

980
  if (p->conn && (p->conn->state == BS_ESTABLISHED) && p->gr_ready)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
981 982 983 984 985 986 987 988 989 990
  {
    bgp_store_error(p, NULL, BE_MISC, BEM_GRACEFUL_RESTART);
    bgp_handle_graceful_restart(p);
    bgp_conn_enter_idle_state(p->conn);
    acc = 1;

    /* There might be separate incoming connection in OpenSent state */
    if (p->incoming_conn.state > BS_ACTIVE)
      bgp_close_conn(&p->incoming_conn);
  }
991 992 993 994 995 996

  BGP_TRACE(D_EVENTS, "Incoming connection from %I%J (port %d) %s",
	    sk->daddr, ipa_is_link_local(sk->daddr) ? sk->iface : NULL,
	    sk->dport, acc ? "accepted" : "rejected");

  if (!acc)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
997 998 999 1000
  {
    rfree(sk);
    return 0;
  }
1001 1002 1003 1004 1005 1006 1007 1008 1009 1010

  hops = p->cf->multihop ? : 1;

  if (sk_set_ttl(sk, p->cf->ttl_security ? 255 : hops) < 0)
    goto err;

  if (p->cf->ttl_security)
    if (sk_set_min_ttl(sk, 256 - hops) < 0)
      goto err;

1011
  if (p->cf->enable_extended_messages)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1012 1013 1014 1015 1016
  {
    sk->rbsize = BGP_RX_BUFFER_EXT_SIZE;
    sk->tbsize = BGP_TX_BUFFER_EXT_SIZE;
    sk_reallocate(sk);
  }
1017

1018 1019 1020 1021 1022 1023 1024 1025
  bgp_setup_conn(p, &p->incoming_conn);
  bgp_setup_sk(&p->incoming_conn, sk);
  bgp_send_open(&p->incoming_conn);
  return 0;

err:
  sk_log_error(sk, p->p.name);
  log(L_ERR "%s: Incoming connection aborted", p->p.name);
1026 1027 1028 1029
  rfree(sk);
  return 0;
}

1030
static void
1031
bgp_listen_sock_err(sock *sk UNUSED, int err)
1032 1033 1034 1035
{
  if (err == ECONNABORTED)
    log(L_WARN "BGP: Incoming connection aborted");
  else
1036
    log(L_ERR "BGP: Error on listening socket: %M", err);
1037 1038
}

1039 1040 1041
static void
bgp_start_neighbor(struct bgp_proto *p)
{
1042 1043 1044
  /* Called only for single-hop BGP sessions */

  if (ipa_zero(p->source_addr))
1045
    p->source_addr = p->neigh->ifa->ip;
1046

Ondřej Zajíček's avatar
Ondřej Zajíček committed
1047 1048 1049
  if (ipa_is_link_local(p->source_addr))
    p->link_addr = p->source_addr;
  else
1050
  {
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1051
    /* Find some link-local address for given iface */
1052
    struct ifa *a;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1053
    p->link_addr = IPA_NONE;
1054 1055
    WALK_LIST(a, p->neigh->iface->addrs)
      if (a->scope == SCOPE_LINK)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1056 1057 1058 1059
      {
	p->link_addr = a->ip;
	break;
      }
1060

Ondřej Zajíček's avatar
Ondřej Zajíček committed
1061
    DBG("%s: Selected link-local address %I\n", p->p.name, p->link_addr);
1062
  }
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1063

1064
  bgp_initiate(p);
1065 1066 1067 1068 1069 1070
}

/**
 * bgp_neigh_notify - neighbor state change hook
 * @n: neighbor entry that has changed
 *
 * Notifications for other neighbor entries than the one used by the instance
 * and notifications received while the protocol is down or stopping are
 * ignored. Otherwise:
 *
 * - if the neighbor is not reachable (non-positive scope), or link checking
 *   is enabled and the interface has no link, a session that has already left
 *   the prepare phase is stopped;
 * - if the neighbor is usable and the instance is still in the prepare phase,
 *   the session is started by bgp_start_neighbor().
 */
static void
bgp_neigh_notify(neighbor *n)
{
  struct bgp_proto *p = (struct bgp_proto *) n->proto;
  int ps = p->p.proto_state;

  if (n != p->neigh)
    return;

  if ((ps == PS_DOWN) || (ps == PS_STOP))
    return;

  /* Still waiting for the neighbor, no session was started yet */
  int prepare = (ps == PS_START) && (p->start_state == BSS_PREPARE);

  if (n->scope <= 0)
  {
    if (!prepare)
    {
      BGP_TRACE(D_EVENTS, "Neighbor lost");
      bgp_store_error(p, NULL, BE_MISC, BEM_NEIGHBOR_LOST);
      /* Perhaps also run bgp_update_startup_delay(p)? */
      bgp_stop(p, 0);
    }
  }
  else if (p->cf->check_link && !(n->iface->flags & IF_LINK_UP))
  {
    if (!prepare)
    {
      BGP_TRACE(D_EVENTS, "Link down");
      bgp_store_error(p, NULL, BE_MISC, BEM_LINK_DOWN);
      if (ps == PS_UP)
	bgp_update_startup_delay(p);
      bgp_stop(p, 0);
    }
  }
  else
  {
    if (prepare)
    {
      BGP_TRACE(D_EVENTS, "Neighbor ready");
      bgp_start_neighbor(p);
    }
  }
}

1112 1113 1114 1115 1116 1117 1118
static void
bgp_bfd_notify(struct bfd_request *req)
{
  struct bgp_proto *p = req->data;
  int ps = p->p.proto_state;

  if (req->down && ((ps == PS_START) || (ps == PS_UP)))
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1119 1120 1121 1122 1123 1124 1125
  {
    BGP_TRACE(D_EVENTS, "BFD session down");
    bgp_store_error(p, NULL, BE_MISC, BEM_BFD_DOWN);
    if (ps == PS_UP)
      bgp_update_startup_delay(p);
    bgp_stop(p, 0);
  }
1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136
}

/**
 * bgp_update_bfd - request or release the BFD session of an instance
 * @p: BGP instance
 * @use_bfd: nonzero if a BFD session should exist
 *
 * Requests a BFD session towards the configured remote address when
 * @use_bfd is set and there is no request yet, and frees an existing request
 * when @use_bfd is zero. For multihop sessions no interface is passed to
 * the request; otherwise the neighbor's interface is used.
 */
static void
bgp_update_bfd(struct bgp_proto *p, int use_bfd)
{
  if (use_bfd && !p->bfd_req)
    p->bfd_req = bfd_request_session(p->p.pool, p->cf->remote_ip, p->source_addr,
				     p->cf->multihop ? NULL : p->neigh->iface,
				     bgp_bfd_notify, p);

  if (!use_bfd && p->bfd_req)
  {
    rfree(p->bfd_req);
    p->bfd_req = NULL;
  }
}

Ondřej Zajíček's avatar
Ondřej Zajíček committed
1143 1144
static void
bgp_reload_routes(struct channel *C)
1145
{
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1146 1147
  struct bgp_proto *p = (void *) C->proto;
  struct bgp_channel *c = (void *) C;
1148

Ondřej Zajíček's avatar
Ondřej Zajíček committed
1149 1150 1151
  ASSERT(p->conn && p->route_refresh);

  bgp_schedule_packet(p->conn, c, PKT_ROUTE_REFRESH);
1152 1153
}

1154
static void
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1155
bgp_feed_begin(struct channel *C, int initial)
1156
{
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1157 1158
  struct bgp_proto *p = (void *) C->proto;
  struct bgp_channel *c = (void *) C;
1159 1160 1161

  /* This should not happen */
  if (!p->conn)
1162 1163
    return;

1164
  if (initial && p->cf->gr_mode)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1165
    c->feed_state = BFS_LOADING;
1166 1167

  /* It is refeed and both sides support enhanced route refresh */
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1168 1169 1170 1171 1172
  if (!initial && p->enhanced_refresh)
  {
    /* BoRR must not be sent before End-of-RIB */
    if (c->feed_state == BFS_LOADING || c->feed_state == BFS_LOADED)
      return;
1173

Ondřej Zajíček's avatar
Ondřej Zajíček committed
1174 1175 1176
    c->feed_state = BFS_REFRESHING;
    bgp_schedule_packet(p->conn, c, PKT_BEGIN_REFRESH);
  }
1177 1178 1179
}

static void
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1180
bgp_feed_end(struct channel *C)
1181
{
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1182 1183
  struct bgp_proto *p = (void *) C->proto;
  struct bgp_channel *c = (void *) C;
1184 1185 1186 1187 1188 1189

  /* This should not happen */
  if (!p->conn)
    return;

  /* Non-demarcated feed ended, nothing to do */
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1190
  if (c->feed_state == BFS_NONE)
1191 1192 1193
    return;

  /* Schedule End-of-RIB packet */
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1194 1195
  if (c->feed_state == BFS_LOADING)
    c->feed_state = BFS_LOADED;
1196 1197

  /* Schedule EoRR packet */
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1198 1199
  if (c->feed_state == BFS_REFRESHING)
    c->feed_state = BFS_REFRESHED;
1200 1201

  /* Kick TX hook */
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1202
  bgp_schedule_packet(p->conn, c, PKT_UPDATE);
1203 1204
}

1205

1206 1207 1208 1209 1210 1211
static void
bgp_start_locked(struct object_lock *lock)
{
  struct bgp_proto *p = lock->data;
  struct bgp_config *cf = p->cf;

Ondřej Zajíček's avatar
Ondřej Zajíček committed
1212
  if (p->p.proto_state != PS_START)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1213 1214 1215 1216
  {
    DBG("BGP: Got lock in different state %d\n", p->p.proto_state);
    return;
  }
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1217

1218
  DBG("BGP: Got lock\n");
1219

1220
  if (cf->multihop)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1221 1222 1223 1224 1225
  {
    /* Multi-hop sessions do not use neighbor entries */
    bgp_initiate(p);
    return;
  }
1226

1227 1228
  neighbor *n = neigh_find2(&p->p, &cf->remote_ip, cf->iface, NEF_STICKY);
  if (!n)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1229 1230 1231 1232 1233 1234 1235 1236
  {
    log(L_ERR "%s: Invalid remote address %I%J", p->p.name, cf->remote_ip, cf->iface);
    /* As we do not start yet, we can just disable protocol */
    p->p.disabled = 1;
    bgp_store_error(p, NULL, BE_MISC, BEM_INVALID_NEXT_HOP);
    proto_notify_state(&p->p, PS_DOWN);
    return;
  }
1237 1238 1239 1240

  p->neigh = n;

  if (n->scope <= 0)
1241
    BGP_TRACE(D_EVENTS, "Waiting for %I%J to become my neighbor", cf->remote_ip, cf->iface);
1242 1243 1244 1245
  else if (p->cf->check_link && !(n->iface->flags & IF_LINK_UP))
    BGP_TRACE(D_EVENTS, "Waiting for link on %s", n->iface->name);
  else
    bgp_start_neighbor(p);
1246 1247
}

1248 1249 1250
static int
bgp_start(struct proto *P)
{
1251 1252 1253
  struct bgp_proto *p = (struct bgp_proto *) P;
  struct object_lock *lock;

1254
  DBG("BGP: Startup.\n");
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1255
  p->start_state = BSS_PREPARE;
1256 1257
  p->outgoing_conn.state = BS_IDLE;
  p->incoming_conn.state = BS_IDLE;
1258
  p->neigh = NULL;
1259
  p->bfd_req = NULL;
1260
  p->gr_ready = 0;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1261
  p->gr_active_num = 0;
1262

Ondřej Zajíček's avatar
Ondřej Zajíček committed
1263 1264 1265
  p->event = ev_new(p->p.pool);
  p->event->hook = bgp_decision;
  p->event->data = p;
1266

1267 1268 1269 1270
  p->startup_timer = tm_new(p->p.pool);
  p->startup_timer->hook = bgp_startup_timeout;
  p->startup_timer->data = p;

1271 1272 1273 1274
  p->gr_timer = tm_new(p->p.pool);
  p->gr_timer->hook = bgp_graceful_restart_timeout;
  p->gr_timer->data = p;

1275 1276 1277 1278
  p->local_id = proto_get_router_id(P->cf);
  if (p->rr_client)
    p->rr_cluster_id = p->cf->rr_cluster_id ? p->cf->rr_cluster_id : p->local_id;

1279
  p->remote_id = 0;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1280
  p->source_addr = p->cf->local_ip;
1281

Ondřej Zajíček's avatar
Ondřej Zajíček committed
1282
  /* XXXX */
1283
  if (p->p.gr_recovery && p->cf->gr_mode)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1284 1285 1286 1287 1288
  {
    struct bgp_channel *c;
    WALK_LIST(c, p->p.channels)
      channel_graceful_restart_lock(&c->c);
  }
1289

1290
  /*
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1291 1292
   * Before attempting to create the connection, we need to lock the port,
   * so that we are the only instance attempting to talk with that neighbor.
1293 1294 1295 1296
   */

  lock = p->lock = olock_new(P->pool);
  lock->addr = p->cf->remote_ip;
1297
  lock->port = p->cf->remote_port;
1298
  lock->iface = p->cf->iface;
1299 1300 1301 1302
  lock->type = OBJLOCK_TCP;
  lock->hook = bgp_start_locked;
  lock->data = p;
  olock_acquire(lock);
1303

1304
  return PS_START;
1305 1306
}

1307 1308
/* Defined outside of this file; bgp_shutdown() reads it to choose between
   the restart and the shutdown variant of route limit handling */
extern int proto_restart;

1309 1310 1311
static int
bgp_shutdown(struct proto *P)
{
1312
  struct bgp_proto *p = (struct bgp_proto *) P;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1313
  uint subcode = 0;
1314

Martin Mareš's avatar
Martin Mareš committed
1315
  BGP_TRACE(D_EVENTS, "Shutdown requested");
1316

1317
  switch (P->down_code)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354
  {
  case PDC_CF_REMOVE:
  case PDC_CF_DISABLE:
    subcode = 3; // Errcode 6, 3 - peer de-configured
    break;

  case PDC_CF_RESTART:
    subcode = 6; // Errcode 6, 6 - other configuration change
    break;

  case PDC_CMD_DISABLE:
  case PDC_CMD_SHUTDOWN:
    subcode = 2; // Errcode 6, 2 - administrative shutdown
    break;

  case PDC_CMD_RESTART:
    subcode = 4; // Errcode 6, 4 - administrative reset
    break;

  case PDC_RX_LIMIT_HIT:
  case PDC_IN_LIMIT_HIT:
    subcode = 1; // Errcode 6, 1 - max number of prefixes reached
    /* log message for compatibility */
    log(L_WARN "%s: Route limit exceeded, shutting down", p->p.name);
    goto limit;

  case PDC_OUT_LIMIT_HIT:
    subcode = proto_restart ? 4 : 2; // Administrative reset or shutdown

  limit:
    bgp_store_error(p, NULL, BE_AUTO_DOWN, BEA_ROUTE_LIMIT_EXCEEDED);
    if (proto_restart)
      bgp_update_startup_delay(p);
    else
      p->startup_delay = 0;
    goto done;
  }
1355

1356
  bgp_store_error(p, NULL, BE_MAN_DOWN, 0);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1357
  p->startup_delay = 0;
1358

Ondřej Zajíček's avatar
Ondřej Zajíček committed
1359
done:
1360
  bgp_stop(p, subcode);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1361
  return p->p.proto_state;
1362 1363
}

1364
static struct proto *
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1365
bgp_init(struct proto_config *CF)
1366
{
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1367
  struct proto *P = proto_new(CF);
1368
  struct bgp_proto *p = (struct bgp_proto *) P;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1369
  struct bgp_config *cf = (struct bgp_config *) CF;
1370 1371 1372 1373

  P->rt_notify = bgp_rt_notify;
  P->import_control = bgp_import_control;
  P->neigh_notify = bgp_neigh_notify;
1374
  P->reload_routes = bgp_reload_routes;
1375 1376
  P->feed_begin = bgp_feed_begin;
  P->feed_end = bgp_feed_end;
1377
  P->rte_better = bgp_rte_better;
1378
  P->rte_mergable = bgp_rte_mergable;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397
  P->rte_recalculate = cf->deterministic_med ? bgp_rte_recalculate : NULL;

  p->cf = cf;
  p->local_as = cf->local_as;
  p->remote_as = cf->remote_as;
  p->public_as = cf->local_as;
  p->is_internal = (cf->local_as == cf->remote_as);
  p->is_interior = p->is_internal || cf->confederation_member;
  p->rs_client = cf->rs_client;
  p->rr_client = cf->rr_client;

  /* Confederation ID is used for truly external peers */
  if (cf->confederation && !p->is_interior)
    p->public_as = cf->confederation;

  /* Add all channels */
  struct bgp_channel_config *cc;
  WALK_LIST(cc, CF->channels)
    proto_add_channel(P, &cc->c);
1398

1399 1400 1401
  return P;
}

Ondřej Zajíček's avatar
Ondřej Zajíček committed
1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482
/* Return the explicitly configured IGP table of the channel config, or the
   table of the channel itself when none is configured */
static inline rtable *
get_igp_table(struct bgp_channel_config *cf)
{
  if (cf->igp_table)
    return cf->igp_table->table;

  return cf->c.table->table;
}

/**
 * bgp_channel_init - init hook of a BGP channel
 * @C: channel being initialized
 * @CF: its configuration (a &bgp_channel_config)
 *
 * Copies the configuration pointer and AFI to the channel, looks up the
 * address family descriptor and the IGP table and selects the route
 * announcement mode: %RA_ACCEPTED for secondary channels, %RA_OPTIMAL
 * otherwise.
 */
static void
bgp_channel_init(struct channel *C, struct channel_config *CF)
{
  struct bgp_channel_config *bcf = (void *) CF;
  struct bgp_channel *bc = (void *) C;

  if (bcf->secondary)
    C->ra_mode = RA_ACCEPTED;
  else
    C->ra_mode = RA_OPTIMAL;

  bc->cf = bcf;
  bc->afi = bcf->afi;
  bc->desc = bgp_get_af_desc(bc->afi);
  bc->igp_table = get_igp_table(bcf);
}

/**
 * bgp_channel_start - start hook of a BGP channel
 * @C: channel being started
 *
 * Locks the IGP table, initializes the bucket and prefix tables and selects
 * the next hop address and the link-local address of the channel. Warnings
 * are logged when an expected address is missing. Always returns 0.
 */
static int
bgp_channel_start(struct channel *C)
{
  struct bgp_channel *c = (void *) C;
  struct bgp_proto *p = (void *) C->proto;
  ip_addr src = p->source_addr;

  rt_lock_table(c->igp_table);

  c->pool = p->p.pool; // XXXX
  bgp_init_bucket_table(c);
  bgp_init_prefix_table(c);

  c->packets_to_send = 0;
  c->link_addr = IPA_NONE;
  c->next_hop_addr = c->cf->next_hop_addr;

  /* Without a configured next hop, try the source address: any IPv4 address
     for an IPv4 channel, a non-link-local IPv6 address for an IPv6 channel */
  if (ipa_zero(c->next_hop_addr) &&
      ((bgp_channel_is_ipv4(c) && ipa_is_ip4(src)) ||
       (bgp_channel_is_ipv6(c) && ipa_is_ip6(src) && !ipa_is_link_local(src))))
    c->next_hop_addr = src;

  /* IPv6 single-hop BGP also needs the link-local address */
  if (bgp_channel_is_ipv6(c) && p->neigh)
  {
    c->link_addr = p->link_addr;

    if (ipa_zero(p->link_addr))
      log(L_WARN "%s: Missing link-local address", p->p.name);
  }

  /* A missing next hop is acceptable only with a link-local source address */
  if (ipa_zero(c->next_hop_addr) && !ipa_is_link_local(src))
    log(L_WARN "%s: Missing next hop address", p->p.name);

  return 0; /* XXXX: Currently undefined */
}

/**
 * bgp_channel_shutdown - shutdown hook of a BGP channel
 * @C: channel being shut down
 *
 * Resets the next hop and link-local addresses of the channel to %IPA_NONE.
 */
static void
bgp_channel_shutdown(struct channel *C)
{
  struct bgp_channel *bc = (void *) C;

  /* XXXX: cleanup bucket and prefix tables */

  bc->link_addr = IPA_NONE;
  bc->next_hop_addr = IPA_NONE;
}

static void
bgp_channel_cleanup(struct channel *C)
{
  struct bgp_channel *c = (void *) C;

  rt_unlock_table(c->igp_table);
}
1483 1484

void
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1485
bgp_postconfig(struct proto_config *CF)
1486
{
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1487 1488