bgp.c 46.8 KB
Newer Older
1 2 3 4 5 6 7 8
/*
 *	BIRD -- The Border Gateway Protocol
 *
 *	(c) 2000 Martin Mares <mj@ucw.cz>
 *
 *	Can be freely distributed and used under the terms of the GNU GPL.
 */

Martin Mareš's avatar
Martin Mareš committed
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
/**
 * DOC: Border Gateway Protocol
 *
 * The BGP protocol is implemented in three parts: |bgp.c| which takes care of the
 * connection and most of the interface with BIRD core, |packets.c| handling
 * both incoming and outgoing BGP packets and |attrs.c| containing functions for
 * manipulation with BGP attribute lists.
 *
 * As opposed to the other existing routing daemons, BIRD has a sophisticated core
 * architecture which is able to keep all the information needed by BGP in the
 * primary routing table, therefore no complex data structures like a central
 * BGP table are needed. This increases memory footprint of a BGP router with
 * many connections, but not too much and, which is more important, it makes
 * BGP much easier to implement.
 *
Martin Mareš's avatar
Martin Mareš committed
24
 * Each instance of BGP (corresponding to a single BGP peer) is described by a &bgp_proto
Martin Mareš's avatar
Martin Mareš committed
25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41
 * structure to which are attached individual connections represented by &bgp_connection
 * (usually, there exists only one connection, but during BGP session setup, there
 * can be more of them). The connections are handled according to the BGP state machine
 * defined in the RFC with all the timers and all the parameters configurable.
 *
 * In incoming direction, we listen on the connection's socket and each time we receive
 * some input, we pass it to bgp_rx(). It decodes packet headers and the markers and
 * passes complete packets to bgp_rx_packet() which distributes the packet according
 * to its type.
 *
 * In outgoing direction, we gather all the routing updates and sort them to buckets
 * (&bgp_bucket) according to their attributes (we keep a hash table for fast comparison
 * of &rta's and a &fib which helps us to find if we already have another route for
 * the same destination queued for sending, so that we can replace it with the new one
 * immediately instead of sending both updates). There also exists a special bucket holding
 * all the route withdrawals which cannot be queued anywhere else as they don't have any
 * attributes. If we have any packet to send (due to either new routes or the connection
Martin Mareš's avatar
Martin Mareš committed
42
 * tracking code wanting to send an Open, Keepalive or Notification message), we call
Martin Mareš's avatar
Martin Mareš committed
43 44 45 46 47 48 49 50 51 52 53
 * bgp_schedule_packet() which sets the corresponding bit in a @packets_to_send
 * bit field in &bgp_conn and as soon as the transmit socket buffer becomes empty,
 * we call bgp_fire_tx(). It inspects state of all the packet type bits and calls
 * the corresponding bgp_create_xx() functions, eventually rescheduling the same packet
 * type if we have more data of the same type to send.
 *
 * The processing of attributes consists of two functions: bgp_decode_attrs() for checking
 * of the attribute blocks and translating them to the language of BIRD's extended attributes
 * and bgp_encode_attrs() which does the converse. Both functions are built around a
 * @bgp_attr_table array describing all important characteristics of all known attributes.
 * Unknown transitive attributes are attached to the route as %EAF_TYPE_OPAQUE byte streams.
54 55 56 57 58 59 60 61 62 63
 *
 * BGP protocol implements graceful restart in both restarting (local restart)
 * and receiving (neighbor restart) roles. The first is handled mostly by the
 * graceful restart code in the nest, BGP protocol just handles capabilities,
 * sets @gr_wait and locks graceful restart until end-of-RIB mark is received.
 * The second is implemented by internal restart of the BGP state to %BS_IDLE
 * and protocol state to %PS_START, but keeping the protocol up from the core
 * point of view and therefore maintaining received routes. Routing table
 * refresh cycle (rt_refresh_begin(), rt_refresh_end()) is used for removing
 * stale routes after reestablishment of BGP session during graceful restart.
Martin Mareš's avatar
Martin Mareš committed
64 65
 */

66
#undef LOCAL_DEBUG
67 68 69 70 71

#include "nest/bird.h"
#include "nest/iface.h"
#include "nest/protocol.h"
#include "nest/route.h"
72
#include "nest/cli.h"
73
#include "nest/locks.h"
74
#include "conf/conf.h"
75
#include "lib/socket.h"
76
#include "lib/resource.h"
Martin Mareš's avatar
Martin Mareš committed
77
#include "lib/string.h"
78 79 80

#include "bgp.h"

Ondřej Zajíček's avatar
Ondřej Zajíček committed
81

82
struct linpool *bgp_linpool;		/* Global temporary pool */
83 84 85
static sock *bgp_listen_sk;		/* Global listening socket */
static int bgp_counter;			/* Number of protocol instances using the listening socket */

Ondřej Zajíček's avatar
Ondřej Zajíček committed
86
static void bgp_close(struct bgp_proto *p, int apply_md5);
87
static void bgp_connect(struct bgp_proto *p);
88
static void bgp_active(struct bgp_proto *p);
89
static sock *bgp_setup_listen_sk(ip_addr addr, unsigned port, u32 flags);
90
static void bgp_update_bfd(struct bgp_proto *p, int use_bfd);
91

92

Ondřej Zajíček's avatar
Ondřej Zajíček committed
93 94 95 96 97 98 99 100 101 102 103 104 105
/**
 * bgp_open - open a BGP instance
 * @p: BGP instance
 *
 * This function allocates and configures shared BGP resources.
 * Should be called as the last step during initialization
 * (when lock is acquired and neighbor is ready).
 * When error, state changed to PS_DOWN, -1 is returned and caller
 * should return immediately.
 */
static int
bgp_open(struct bgp_proto *p)
{
106
  struct config *cfg = p->cf->c.global;
107 108
  int errcode;

Ondřej Zajíček's avatar
Ondřej Zajíček committed
109
  if (!bgp_listen_sk)
110
    bgp_listen_sk = bgp_setup_listen_sk(cfg->listen_bgp_addr, cfg->listen_bgp_port, cfg->listen_bgp_flags);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
111

112 113
  if (!bgp_listen_sk)
    {
114 115
      errcode = BEM_NO_SOCKET;
      goto err;
116 117
    }

Ondřej Zajíček's avatar
Ondřej Zajíček committed
118 119 120
  if (!bgp_linpool)
    bgp_linpool = lp_new(&root_pool, 4080);

121 122
  bgp_counter++;

Ondřej Zajíček's avatar
Ondřej Zajíček committed
123
  if (p->cf->password)
124 125
    if (sk_set_md5_auth(bgp_listen_sk, p->cf->source_addr, p->cf->remote_ip,
			p->cf->iface, p->cf->password, p->cf->setkey) < 0)
126 127 128 129 130 131
      {
	sk_log_error(bgp_listen_sk, p->p.name);
	bgp_close(p, 0);
	errcode = BEM_INVALID_MD5;
	goto err;
      }
Ondřej Zajíček's avatar
Ondřej Zajíček committed
132 133

  return 0;
134 135 136 137 138 139

err:
  p->p.disabled = 1;
  bgp_store_error(p, NULL, BE_MISC, errcode);
  proto_notify_state(&p->p, PS_DOWN);
  return -1;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
140 141
}

142 143 144 145 146
/*
 * Begin regular operation of the instance: pick the start state according to
 * whether capabilities are configured and, unless the instance is passive,
 * prepare the outgoing connection.
 */
static void
bgp_startup(struct bgp_proto *p)
{
  BGP_TRACE(D_EVENTS, "Started");

  if (p->cf->capabilities)
    p->start_state = BSS_CONNECT;
  else
    p->start_state = BSS_CONNECT_NOCAP;

  /* A passive instance does not open connections on its own */
  if (p->cf->passive)
    return;

  bgp_active(p);
}

/* Timer hook: the timer's data is the BGP instance that should start now. */
static void
bgp_startup_timeout(timer *t)
{
  struct bgp_proto *p = t->data;

  bgp_startup(p);
}


static void
bgp_initiate(struct bgp_proto *p)
{
162 163 164 165
  int rv = bgp_open(p);
  if (rv < 0)
    return;

166 167 168
  if (p->cf->bfd)
    bgp_update_bfd(p, p->cf->bfd);

169 170
  if (p->startup_delay)
    {
171
      p->start_state = BSS_DELAY;
172
      BGP_TRACE(D_EVENTS, "Startup delayed by %d seconds due to errors", p->startup_delay);
173 174 175 176 177 178
      bgp_start_timer(p->startup_timer, p->startup_delay);
    }
  else
    bgp_startup(p);
}

Ondřej Zajíček's avatar
Ondřej Zajíček committed
179 180 181 182 183 184 185 186 187
/**
 * bgp_close - close a BGP instance
 * @p: BGP instance
 * @apply_md5: 0 to disable unsetting MD5 auth
 *
 * This function frees and deconfigures shared BGP resources.
 * @apply_md5 is set to 0 when bgp_close is called as a cleanup
 * from failed bgp_open().
 */
188
static void
Ondřej Zajíček's avatar
Ondřej Zajíček committed
189
bgp_close(struct bgp_proto *p, int apply_md5)
190 191 192
{
  ASSERT(bgp_counter);
  bgp_counter--;
193

Ondřej Zajíček's avatar
Ondřej Zajíček committed
194
  if (p->cf->password && apply_md5)
195 196
    if (sk_set_md5_auth(bgp_listen_sk, p->cf->source_addr, p->cf->remote_ip,
			p->cf->iface, NULL, p->cf->setkey) < 0)
197
      sk_log_error(bgp_listen_sk, p->p.name);
198

199 200 201 202
  if (!bgp_counter)
    {
      rfree(bgp_listen_sk);
      bgp_listen_sk = NULL;
203 204
      rfree(bgp_linpool);
      bgp_linpool = NULL;
205 206 207
    }
}

Martin Mareš's avatar
Martin Mareš committed
208 209 210 211 212 213 214 215 216
/**
 * bgp_start_timer - start a BGP timer
 * @t: timer
 * @value: time to fire (0 to disable the timer)
 *
 * This functions calls tm_start() on @t with time @value and the
 * amount of randomization suggested by the BGP standard. Please use
 * it for all BGP timers.
 */
217
void
218 219
bgp_start_timer(timer *t, int value)
{
220
  if (value)
221 222 223 224 225
    {
      /* The randomization procedure is specified in RFC 1771: 9.2.3.3 */
      t->randomize = value / 4;
      tm_start(t, value - t->randomize);
    }
226 227 228 229
  else
    tm_stop(t);
}

Martin Mareš's avatar
Martin Mareš committed
230 231 232 233 234 235 236
/**
 * bgp_close_conn - close a BGP connection
 * @conn: connection to close
 *
 * This function takes a connection described by the &bgp_conn structure,
 * closes its socket and frees all resources associated with it.
 */
237 238 239
void
bgp_close_conn(struct bgp_conn *conn)
{
240
  // struct bgp_proto *p = conn->bgp;
241 242 243 244 245 246 247 248 249

  DBG("BGP: Closing connection\n");
  conn->packets_to_send = 0;
  rfree(conn->connect_retry_timer);
  conn->connect_retry_timer = NULL;
  rfree(conn->keepalive_timer);
  conn->keepalive_timer = NULL;
  rfree(conn->hold_timer);
  conn->hold_timer = NULL;
250
  rfree(conn->sk);
251
  conn->sk = NULL;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267
  rfree(conn->tx_ev);
  conn->tx_ev = NULL;
}


/**
 * bgp_update_startup_delay - update a startup delay
 * @p: BGP instance
 *
 * This function updates a startup delay that is used to postpone next BGP connect.
 * It also handles disable_after_error and might stop BGP instance when error
 * happened and disable_after_error is on.
 *
 * It should be called when BGP protocol error happened.
 */
void
268
bgp_update_startup_delay(struct bgp_proto *p)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
269 270 271
{
  struct bgp_config *cf = p->cf;

272
  DBG("BGP: Updating startup delay\n");
Ondřej Zajíček's avatar
Ondřej Zajíček committed
273

274
  if (p->last_proto_error && ((now - p->last_proto_error) >= (int) cf->error_amnesia_time))
275 276
    p->startup_delay = 0;

Ondřej Zajíček's avatar
Ondřej Zajíček committed
277 278 279 280 281 282 283
  p->last_proto_error = now;

  if (cf->disable_after_error)
    {
      p->startup_delay = 0;
      p->p.disabled = 1;
      return;
284
    }
Ondřej Zajíček's avatar
Ondřej Zajíček committed
285 286 287 288

  if (!p->startup_delay)
    p->startup_delay = cf->error_delay_time_min;
  else
289
    p->startup_delay = MIN(2 * p->startup_delay, cf->error_delay_time_max);
290 291
}

Ondřej Zajíček's avatar
Ondřej Zajíček committed
292
static void
293
bgp_graceful_close_conn(struct bgp_conn *conn, uint subcode, byte *data, uint len)
294
{
Ondřej Zajíček's avatar
Ondřej Zajíček committed
295
  switch (conn->state)
296 297
    {
    case BS_IDLE:
Ondřej Zajíček's avatar
Ondřej Zajíček committed
298 299
    case BS_CLOSE:
      return;
300 301
    case BS_CONNECT:
    case BS_ACTIVE:
Ondřej Zajíček's avatar
Ondřej Zajíček committed
302 303
      bgp_conn_enter_idle_state(conn);
      return;
304 305 306
    case BS_OPENSENT:
    case BS_OPENCONFIRM:
    case BS_ESTABLISHED:
307
      bgp_error(conn, 6, subcode, data, len);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
308
      return;
309
    default:
Ondřej Zajíček's avatar
Ondřej Zajíček committed
310
      bug("bgp_graceful_close_conn: Unknown state %d", conn->state);
311 312 313
    }
}

Ondřej Zajíček's avatar
Ondřej Zajíček committed
314 315 316 317 318 319
static void
bgp_down(struct bgp_proto *p)
{
  if (p->start_state > BSS_PREPARE)
    bgp_close(p, 1);

320
  BGP_TRACE(D_EVENTS, "Down");
Ondřej Zajíček's avatar
Ondřej Zajíček committed
321 322 323 324 325 326 327 328 329 330
  proto_notify_state(&p->p, PS_DOWN);
}

static void
bgp_decision(void *vp)
{
  struct bgp_proto *p = vp;

  DBG("BGP: Decision start\n");
  if ((p->p.proto_state == PS_START)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
331
      && (p->outgoing_conn.state == BS_IDLE)
332
      && (p->incoming_conn.state != BS_OPENCONFIRM)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
333
      && (!p->cf->passive))
334
    bgp_active(p);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
335 336 337 338 339 340 341

  if ((p->p.proto_state == PS_STOP)
      && (p->outgoing_conn.state == BS_IDLE)
      && (p->incoming_conn.state == BS_IDLE))
    bgp_down(p);
}

342
void
343
bgp_stop(struct bgp_proto *p, uint subcode, byte *data, uint len)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
344 345
{
  proto_notify_state(&p->p, PS_STOP);
346 347
  bgp_graceful_close_conn(&p->outgoing_conn, subcode, data, len);
  bgp_graceful_close_conn(&p->incoming_conn, subcode, data, len);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
348 349 350
  ev_schedule(p->event);
}

351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366
/*
 * Change the state of @conn to @new_state. The transition is written to the
 * MRT dump first when dumping of state changes is enabled for the protocol.
 */
static inline void
bgp_conn_set_state(struct bgp_conn *conn, unsigned new_state)
{
  int dump_states = conn->bgp->p.mrtdump & MD_STATES;

  if (dump_states)
    mrt_dump_bgp_state_change(conn, conn->state, new_state);

  conn->state = new_state;
}

/* Move @conn to the OpenConfirm state. */
void
bgp_conn_enter_openconfirm_state(struct bgp_conn *conn)
{
  /* Only the state changes here; the real work is done in bgp_rx_open() */
  bgp_conn_set_state(conn, BS_OPENCONFIRM);
}

Ondřej Zajíček's avatar
Ondřej Zajíček committed
367 368 369 370
void
bgp_conn_enter_established_state(struct bgp_conn *conn)
{
  struct bgp_proto *p = conn->bgp;
371

Ondřej Zajíček's avatar
Ondřej Zajíček committed
372 373 374
  BGP_TRACE(D_EVENTS, "BGP session established");
  DBG("BGP: UP!!!\n");

375 376
  /* For multi-hop BGP sessions */
  if (ipa_zero(p->source_addr))
377
    p->source_addr = conn->sk->saddr;
378

379 380
  conn->sk->fast_rx = 0;

Ondřej Zajíček's avatar
Ondřej Zajíček committed
381 382 383
  p->conn = conn;
  p->last_error_class = 0;
  p->last_error_code = 0;
384 385
  p->feed_state = BFS_NONE;
  p->load_state = BFS_NONE;
386 387 388
  bgp_init_bucket_table(p);
  bgp_init_prefix_table(p, 8);

389 390 391 392 393 394 395 396
  int peer_gr_ready = conn->peer_gr_aware && !(conn->peer_gr_flags & BGP_GRF_RESTART);

  if (p->p.gr_recovery && !peer_gr_ready)
    proto_graceful_restart_unlock(&p->p);

  if (p->p.gr_recovery && (p->cf->gr_mode == BGP_GR_ABLE) && peer_gr_ready)
    p->p.gr_wait = 1;

397
  if (p->gr_active == BGP_GRS_ACTIVE)
398 399
    tm_stop(p->gr_timer);

400 401 402 403 404 405 406 407
  /* Check F-bit for regular graceful restart */
  if ((p->gr_active == BGP_GRS_ACTIVE) &&
      (!conn->peer_gr_able || !(conn->peer_gr_aflags & BGP_GRF_FORWARDING)))
    bgp_graceful_restart_done(p);

  /* Check F-bit for long-lived graceful restart */
  if (((p->gr_active == BGP_GRS_LLGR_1) || (p->gr_active == BGP_GRS_LLGR_2)) &&
      (!conn->peer_llgr_able || !(conn->peer_llgr_aflags & BGP_LLGRF_FORWARDING)))
408 409
    bgp_graceful_restart_done(p);

410 411 412 413 414 415
  /* GR capability implies that neighbor will send End-of-RIB */
  if (conn->peer_gr_aware)
    p->load_state = BFS_LOADING;

  /* proto_notify_state() will likely call bgp_feed_begin(), setting p->feed_state */

416
  bgp_conn_set_state(conn, BS_ESTABLISHED);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
417 418 419 420 421 422 423 424 425
  proto_notify_state(&p->p, PS_UP);
}

static void
bgp_conn_leave_established_state(struct bgp_proto *p)
{
  BGP_TRACE(D_EVENTS, "BGP session closed");
  p->conn = NULL;

426 427 428
  bgp_free_prefix_table(p);
  bgp_free_bucket_table(p);

Ondřej Zajíček's avatar
Ondřej Zajíček committed
429
  if (p->p.proto_state == PS_UP)
430
    bgp_stop(p, 0, NULL, 0);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
431 432 433 434 435 436 437 438
}

void
bgp_conn_enter_close_state(struct bgp_conn *conn)
{
  struct bgp_proto *p = conn->bgp;
  int os = conn->state;

439
  bgp_conn_set_state(conn, BS_CLOSE);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
440 441 442
  tm_stop(conn->keepalive_timer);
  conn->sk->rx_hook = NULL;

443 444 445
  /* Timeout for CLOSE state, if we cannot send notification soon then we just hangup */
  bgp_start_timer(conn->hold_timer, 10);

Ondřej Zajíček's avatar
Ondřej Zajíček committed
446 447 448 449 450 451 452 453 454 455 456
  if (os == BS_ESTABLISHED)
    bgp_conn_leave_established_state(p);
}

void
bgp_conn_enter_idle_state(struct bgp_conn *conn)
{
  struct bgp_proto *p = conn->bgp;
  int os = conn->state;

  bgp_close_conn(conn);
457
  bgp_conn_set_state(conn, BS_IDLE);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
458 459 460 461 462 463
  ev_schedule(p->event);

  if (os == BS_ESTABLISHED)
    bgp_conn_leave_established_state(p);
}

464 465 466 467 468 469 470 471 472 473 474
/**
 * bgp_handle_graceful_restart - handle detected BGP graceful restart
 * @p: BGP instance
 *
 * This function is called when a BGP graceful restart of the neighbor is
 * detected (when the TCP connection fails or when a new TCP connection
 * appears). The function activates processing of the restart - starts routing
 * table refresh cycle and activates BGP restart timer. The protocol state goes
 * back to %PS_START, but changing BGP state back to %BS_IDLE is left for the
 * caller.
 */
475 476 477 478 479 480 481 482 483
void
bgp_handle_graceful_restart(struct bgp_proto *p)
{
  ASSERT(p->conn && (p->conn->state == BS_ESTABLISHED) && p->gr_ready);

  BGP_TRACE(D_EVENTS, "Neighbor graceful restart detected%s",
	    p->gr_active ? " - already pending" : "");
  proto_notify_state(&p->p, PS_START);

484 485 486
  switch (p->gr_active)
  {
  case BGP_GRS_ACTIVE:
487
    rt_refresh_end(p->p.main_ahook->table, p->p.main_ahook);
488 489 490 491 492 493 494 495 496 497 498
    break;

  case BGP_GRS_LLGR_1:
    rt_refresh_begin(p->p.main_ahook->table, p->p.main_ahook);
    return;

  case BGP_GRS_LLGR_2:
    rt_refresh_begin(p->p.main_ahook->table, p->p.main_ahook);
    rt_modify_stale(p->p.main_ahook->table, p->p.main_ahook);
    return;
  }
499

500 501 502
  p->stale_time = p->cf->llgr_mode ? p->conn->peer_llgr_time : 0;
  p->gr_active = !p->stale_time ? BGP_GRS_ACTIVE : BGP_GRS_LLGR_1;
  tm_start(p->gr_timer, p->conn->peer_gr_time);
503 504 505
  rt_refresh_begin(p->p.main_ahook->table, p->p.main_ahook);
}

506 507 508 509 510 511 512 513 514 515
/**
 * bgp_graceful_restart_done - finish active BGP graceful restart
 * @p: BGP instance
 *
 * This function is called when the active BGP graceful restart of the neighbor
 * should be finished - either successfully (the neighbor sends all paths and
 * reports end-of-RIB on the new session) or unsuccessfully (the neighbor does
 * not support BGP graceful restart on the new session). The function ends
 * routing table refresh cycle and stops BGP restart timer.
 */
516 517 518 519 520 521 522 523 524
void
bgp_graceful_restart_done(struct bgp_proto *p)
{
  BGP_TRACE(D_EVENTS, "Neighbor graceful restart done");
  p->gr_active = 0;
  tm_stop(p->gr_timer);
  rt_refresh_end(p->p.main_ahook->table, p->p.main_ahook);
}

525 526 527 528 529 530 531 532 533
/**
 * bgp_graceful_restart_timeout - timeout of graceful restart 'restart timer'
 * @t: timer
 *
 * This function is a timeout hook for @gr_timer, implementing BGP restart time
 * limit for reestablisment of the BGP session after the graceful restart. When
 * fired, we just proceed with the usual protocol restart.
 */

534 535 536 537 538
static void
bgp_graceful_restart_timeout(timer *t)
{
  struct bgp_proto *p = t->data;

539 540 541 542 543 544 545 546 547 548 549 550 551
  switch (p->gr_active)
  {
  case BGP_GRS_ACTIVE:
    BGP_TRACE(D_EVENTS, "Neighbor graceful restart timeout");
    bgp_stop(p, 0, NULL, 0);
    return;

  case BGP_GRS_LLGR_1:
    BGP_TRACE(D_EVENTS, "Neighbor graceful restart timeout");
    p->gr_active = BGP_GRS_LLGR_2;
    tm_start(p->gr_timer, p->stale_time);
    rt_modify_stale(p->p.main_ahook->table, p->p.main_ahook);
    return;
552

553 554 555 556 557 558 559
  case BGP_GRS_LLGR_2:
    BGP_TRACE(D_EVENTS, "Long-lived graceful restart timeout");
    p->gr_active = 0;
    rt_refresh_end(p->p.main_ahook->table, p->p.main_ahook);
    return;
  }
}
560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600

/**
 * bgp_refresh_begin - start incoming enhanced route refresh sequence
 * @p: BGP instance
 *
 * This function is called when an incoming enhanced route refresh sequence is
 * started by the neighbor, demarcated by the BoRR packet. Unless the initial
 * load is still in progress (in which case a warning is logged and the packet
 * is ignored), it switches the load state to refreshing and starts the
 * routing table refresh cycle. Note that graceful restart also uses routing
 * table refresh cycle, but RFC 7313 and load states ensure that these two
 * sequences do not overlap.
 */
void
bgp_refresh_begin(struct bgp_proto *p)
{
  if (p->load_state == BFS_LOADING)
  {
    log(L_WARN "%s: BEGIN-OF-RR received before END-OF-RIB, ignoring", p->p.name);
    return;
  }

  p->load_state = BFS_REFRESHING;
  rt_refresh_begin(p->p.main_ahook->table, p->p.main_ahook);
}

/**
 * bgp_refresh_end - finish incoming enhanced route refresh sequence
 * @p: BGP instance
 *
 * This function is called when an incoming enhanced route refresh sequence is
 * finished by the neighbor, demarcated by the EoRR packet. If no refresh
 * sequence is in progress, a warning is logged and the packet is ignored.
 * Otherwise the load state is reset and the routing table refresh cycle is
 * ended. Routes not received during the sequence are removed by the nest.
 */
void
bgp_refresh_end(struct bgp_proto *p)
{
  if (p->load_state != BFS_REFRESHING)
  {
    log(L_WARN "%s: END-OF-RR received without prior BEGIN-OF-RR, ignoring", p->p.name);
    return;
  }

  p->load_state = BFS_NONE;
  rt_refresh_end(p->p.main_ahook->table, p->p.main_ahook);
}


601 602 603
static void
bgp_send_open(struct bgp_conn *conn)
{
604
  conn->start_state = conn->bgp->start_state;
605 606

  // Default values, possibly changed by receiving capabilities.
607
  conn->advertised_as = 0;
608 609 610
  conn->peer_refresh_support = 0;
  conn->peer_as4_support = 0;
  conn->peer_add_path = 0;
611
  conn->peer_enhanced_refresh_support = 0;
612 613 614 615 616
  conn->peer_gr_aware = 0;
  conn->peer_gr_able = 0;
  conn->peer_gr_time = 0;
  conn->peer_gr_flags = 0;
  conn->peer_gr_aflags = 0;
617 618 619 620
  conn->peer_llgr_aware = 0;
  conn->peer_llgr_able = 0;
  conn->peer_llgr_time = 0;
  conn->peer_llgr_aflags = 0;
621
  conn->peer_ext_messages_support = 0;
622

623 624
  DBG("BGP: Sending open\n");
  conn->sk->rx_hook = bgp_rx;
625
  conn->sk->tx_hook = bgp_tx;
626
  tm_stop(conn->connect_retry_timer);
627
  bgp_schedule_packet(conn, PKT_OPEN);
628
  bgp_conn_set_state(conn, BS_OPENSENT);
629
  bgp_start_timer(conn->hold_timer, conn->bgp->cf->initial_hold_time);
630 631
}

632 633
/*
 * Socket hook installed by bgp_connect() as the transmit hook of the outgoing
 * socket; it starts the Open exchange on the connection the socket belongs to.
 */
static void
bgp_connected(sock *sk)
{
  struct bgp_conn *conn = sk->data;
  struct bgp_proto *p = conn->bgp;	/* Used by BGP_TRACE() */

  BGP_TRACE(D_EVENTS, "Connected");

  bgp_send_open(conn);
}

/*
 * Hook of the connect retry timer. While the protocol is starting, the
 * connection resources are freed and a new connection attempt is made;
 * otherwise the connection goes to the Idle state.
 */
static void
bgp_connect_timeout(timer *t)
{
  struct bgp_conn *conn = t->data;
  struct bgp_proto *p = conn->bgp;

  DBG("BGP: connect_timeout\n");

  if (p->p.proto_state != PS_START)
  {
    bgp_conn_enter_idle_state(conn);
    return;
  }

  bgp_close_conn(conn);
  bgp_connect(p);
}

static void
659
bgp_sock_err(sock *sk, int err)
660 661
{
  struct bgp_conn *conn = sk->data;
Martin Mareš's avatar
Martin Mareš committed
662
  struct bgp_proto *p = conn->bgp;
663

664 665 666 667 668 669 670 671 672
  /*
   * This error hook may be called either asynchronously from main
   * loop, or synchronously from sk_send().  But sk_send() is called
   * only from bgp_tx() and bgp_kick_tx(), which are both called
   * asynchronously from main loop. Moreover, they end if err hook is
   * called. Therefore, we could suppose that it is always called
   * asynchronously.
   */

Ondřej Zajíček's avatar
Ondřej Zajíček committed
673 674
  bgp_store_error(p, conn, BE_SOCKET, err);

675 676 677 678
  if (err)
    BGP_TRACE(D_EVENTS, "Connection lost (%M)", err);
  else
    BGP_TRACE(D_EVENTS, "Connection closed");
Ondřej Zajíček's avatar
Ondřej Zajíček committed
679

680 681 682
  if ((conn->state == BS_ESTABLISHED) && p->gr_ready)
    bgp_handle_graceful_restart(p);

Ondřej Zajíček's avatar
Ondřej Zajíček committed
683
  bgp_conn_enter_idle_state(conn);
684 685
}

686 687 688 689
/*
 * Hook of the hold timer. In the Close state the connection is just hung up.
 * Otherwise the timer is rearmed when there is unprocessed input, a graceful
 * restart is started when the session is established, graceful restart is
 * ready and the peer is capable of long-lived graceful restart, and in all
 * remaining cases the connection is closed with error code 4.
 */
static void
bgp_hold_timeout(timer *t)
{
  struct bgp_conn *conn = t->data;
  struct bgp_proto *p = conn->bgp;

  DBG("BGP: Hold timeout\n");

  /* We are already closing the connection - just do hangup */
  if (conn->state == BS_CLOSE)
  {
    BGP_TRACE(D_EVENTS, "Connection stalled");
    bgp_conn_enter_idle_state(conn);
    return;
  }

  /* Pending input means we are probably congested and just have not
     processed BGP packets in time, so give it some more time. */
  if (sk_rx_ready(conn->sk) > 0)
  {
    bgp_start_timer(conn->hold_timer, 10);
    return;
  }

  if ((conn->state == BS_ESTABLISHED) && p->gr_ready && conn->peer_llgr_able)
  {
    BGP_TRACE(D_EVENTS, "Hold timer expired");
    bgp_handle_graceful_restart(p);
    bgp_conn_enter_idle_state(conn);
    return;
  }

  bgp_error(conn, 4, 0, NULL, 0);
}

/*
 * Hook of the keepalive timer: schedule a Keepalive packet and, when the
 * transmit event is active, run it right away.
 */
static void
bgp_keepalive_timeout(timer *t)
{
  struct bgp_conn *conn = t->data;

  DBG("BGP: Keepalive timer\n");
  bgp_schedule_packet(conn, PKT_KEEPALIVE);

  if (!ev_active(conn->tx_ev))
    return;

  /* Kick TX a bit faster */
  ev_run(conn->tx_ev);
}

730
static void
731
bgp_setup_conn(struct bgp_proto *p, struct bgp_conn *conn)
732 733 734
{
  timer *t;

735
  conn->sk = NULL;
736
  conn->bgp = p;
737
  conn->packets_to_send = 0;
738 739 740

  t = conn->connect_retry_timer = tm_new(p->p.pool);
  t->hook = bgp_connect_timeout;
741 742
  t->data = conn;
  t = conn->hold_timer = tm_new(p->p.pool);
743
  t->hook = bgp_hold_timeout;
744 745
  t->data = conn;
  t = conn->keepalive_timer = tm_new(p->p.pool);
746
  t->hook = bgp_keepalive_timeout;
747
  t->data = conn;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
748 749 750
  conn->tx_ev = ev_new(p->p.pool);
  conn->tx_ev->hook = bgp_kick_tx;
  conn->tx_ev->data = conn;
751 752
}

753
static void
754
bgp_setup_sk(struct bgp_conn *conn, sock *s)
755 756 757
{
  s->data = conn;
  s->err_hook = bgp_sock_err;
758
  s->fast_rx = 1;
759 760 761
  conn->sk = s;
}

Ondřej Zajíček's avatar
Ondřej Zajíček committed
762
static void
763
bgp_active(struct bgp_proto *p)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
764
{
765
  int delay = MAX(1, p->cf->connect_delay_time);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
766 767 768 769
  struct bgp_conn *conn = &p->outgoing_conn;

  BGP_TRACE(D_EVENTS, "Connect delayed by %d seconds", delay);
  bgp_setup_conn(p, conn);
770
  bgp_conn_set_state(conn, BS_ACTIVE);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
771 772 773
  bgp_start_timer(conn->connect_retry_timer, delay);
}

Martin Mareš's avatar
Martin Mareš committed
774 775 776 777 778 779 780 781
/**
 * bgp_connect - initiate an outgoing connection
 * @p: BGP instance
 *
 * The bgp_connect() function creates a new &bgp_conn and initiates
 * a TCP connection to the peer. The rest of connection setup is governed
 * by the BGP state machine as described in the standard.
 */
782 783 784 785
static void
bgp_connect(struct bgp_proto *p)	/* Enter Connect state and start establishing connection */
{
  sock *s;
786
  struct bgp_conn *conn = &p->outgoing_conn;
787
  int hops = p->cf->multihop ? : 1;
788 789 790 791

  DBG("BGP: Connecting\n");
  s = sk_new(p->p.pool);
  s->type = SK_TCP_ACTIVE;
792
  s->saddr = p->source_addr;
793
  s->daddr = p->cf->remote_ip;
794
  s->dport = p->cf->remote_port;
795
  s->iface = p->neigh ? p->neigh->iface : NULL;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
796
  s->vrf = p->p.vrf;
797
  s->ttl = p->cf->ttl_security ? 255 : hops;
798 799
  s->rbsize = p->cf->enable_extended_messages ? BGP_RX_BUFFER_EXT_SIZE : BGP_RX_BUFFER_SIZE;
  s->tbsize = p->cf->enable_extended_messages ? BGP_TX_BUFFER_EXT_SIZE : BGP_TX_BUFFER_SIZE;
800 801 802
  s->tos = IP_PREC_INTERNET_CONTROL;
  s->password = p->cf->password;
  s->tx_hook = bgp_connected;
803
  BGP_TRACE(D_EVENTS, "Connecting to %I%J from local address %I%J", s->daddr, p->cf->iface,
804
	    s->saddr, ipa_is_link_local(s->saddr) ? s->iface : NULL);
805
  bgp_setup_conn(p, conn);
806
  bgp_setup_sk(conn, s);
807
  bgp_conn_set_state(conn, BS_CONNECT);
808 809

  if (sk_open(s) < 0)
810
    goto err;
811 812 813 814

  /* Set minimal receive TTL if needed */
  if (p->cf->ttl_security)
    if (sk_set_min_ttl(s, 256 - hops) < 0)
815
      goto err;
816

817 818
  DBG("BGP: Waiting for connect success\n");
  bgp_start_timer(conn->connect_retry_timer, p->cf->connect_retry_time);
819 820 821 822 823 824
  return;

 err:
  sk_log_error(s, p->p.name);
  bgp_sock_err(s, 0);
  return;
825 826
}

827 828 829 830 831 832 833 834 835 836 837 838 839 840 841
/**
 * bgp_find_proto - find existing proto for incoming connection
 * @sk: TCP socket
 *
 * Walk the protocols of the current configuration and return the first
 * running BGP instance whose configured remote address equals the remote
 * address of @sk and whose configured interface, if any, is the interface of
 * @sk. Returns %NULL when there is no such instance.
 */
static struct bgp_proto *
bgp_find_proto(sock *sk)
{
  struct proto_config *pc;

  WALK_LIST(pc, config->protos)
  {
    /* Only BGP protocols that have an instance are of interest */
    if ((pc->protocol == &proto_bgp) && pc->proto)
    {
      struct bgp_proto *p = (struct bgp_proto *) pc->proto;
      int addr_ok = ipa_equal(p->cf->remote_ip, sk->daddr);
      int iface_ok = !p->cf->iface || (p->cf->iface == sk->iface);

      if (addr_ok && iface_ok)
        return p;
    }
  }

  return NULL;
}

Martin Mareš's avatar
Martin Mareš committed
849 850 851 852 853 854 855 856 857 858 859 860
/**
 * bgp_incoming_connection - handle an incoming connection
 * @sk: TCP socket
 * @dummy: unused
 *
 * This function serves as a socket hook for accepting of new BGP
 * connections. It searches a BGP instance corresponding to the peer
 * which has connected and if such an instance exists, it creates a
 * &bgp_conn structure, attaches it to the instance and either sends
 * an Open message or (if there already is an active connection) it
 * closes the new connection by sending a Notification message.
 */
861
static int
862
bgp_incoming_connection(sock *sk, uint dummy UNUSED)
863
{
864 865
  struct bgp_proto *p;
  int acc, hops;
866

867
  DBG("BGP: Incoming connection from %I port %d\n", sk->daddr, sk->dport);
868 869 870 871 872 873 874 875 876
  p = bgp_find_proto(sk);
  if (!p)
    {
      log(L_WARN "BGP: Unexpected connect from unknown address %I%J (port %d)",
	  sk->daddr, ipa_is_link_local(sk->daddr) ? sk->iface : NULL, sk->dport);
      rfree(sk);
      return 0;
    }

877 878 879 880 881 882 883
  /*
   * BIRD should keep multiple incoming connections in OpenSent state (for
   * details RFC 4271 8.2.1 par 3), but it keeps just one. Duplicate incoming
   * connections are rejected istead. The exception is the case where an
   * incoming connection triggers a graceful restart.
   */

884 885
  acc = (p->p.proto_state == PS_START || p->p.proto_state == PS_UP) &&
    (p->start_state >= BSS_CONNECT) && (!p->incoming_conn.sk);
886

887 888 889 890 891 892
  if (p->conn && (p->conn->state == BS_ESTABLISHED) && p->gr_ready)
    {
      bgp_store_error(p, NULL, BE_MISC, BEM_GRACEFUL_RESTART);
      bgp_handle_graceful_restart(p);
      bgp_conn_enter_idle_state(p->conn);
      acc = 1;
893 894 895 896

      /* There might be separate incoming connection in OpenSent state */
      if (p->incoming_conn.state > BS_ACTIVE)
	bgp_close_conn(&p->incoming_conn);
897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917
    }

  BGP_TRACE(D_EVENTS, "Incoming connection from %I%J (port %d) %s",
	    sk->daddr, ipa_is_link_local(sk->daddr) ? sk->iface : NULL,
	    sk->dport, acc ? "accepted" : "rejected");

  if (!acc)
    {
      rfree(sk);
      return 0;
    }

  hops = p->cf->multihop ? : 1;

  if (sk_set_ttl(sk, p->cf->ttl_security ? 255 : hops) < 0)
    goto err;

  if (p->cf->ttl_security)
    if (sk_set_min_ttl(sk, 256 - hops) < 0)
      goto err;

918 919 920 921 922 923 924
  if (p->cf->enable_extended_messages)
    {
      sk->rbsize = BGP_RX_BUFFER_EXT_SIZE;
      sk->tbsize = BGP_TX_BUFFER_EXT_SIZE;
      sk_reallocate(sk);
    }

925 926 927 928 929 930 931 932
  bgp_setup_conn(p, &p->incoming_conn);
  bgp_setup_sk(&p->incoming_conn, sk);
  bgp_send_open(&p->incoming_conn);
  return 0;

err:
  sk_log_error(sk, p->p.name);
  log(L_ERR "%s: Incoming connection aborted", p->p.name);
933 934 935 936
  rfree(sk);
  return 0;
}

937
static void
938
bgp_listen_sock_err(sock *sk UNUSED, int err)
939 940 941 942
{
  if (err == ECONNABORTED)
    log(L_WARN "BGP: Incoming connection aborted");
  else
943
    log(L_ERR "BGP: Error on listening socket: %M", err);
944 945
}

Ondřej Zajíček's avatar
Ondřej Zajíček committed
946
static sock *
947
bgp_setup_listen_sk(ip_addr addr, unsigned port, u32 flags)
948
{
Ondřej Zajíček's avatar
Ondřej Zajíček committed
949
  sock *s = sk_new(&root_pool);
950
  DBG("BGP: Creating listening socket\n");
Ondřej Zajíček's avatar
Ondřej Zajíček committed
951
  s->type = SK_TCP_PASSIVE;
952
  s->ttl = 255;
953 954
  s->saddr = addr;
  s->sport = port ? port : BGP_PORT;
955
  s->flags = flags ? 0 : SKF_V6ONLY;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
956 957 958 959
  s->tos = IP_PREC_INTERNET_CONTROL;
  s->rbsize = BGP_RX_BUFFER_SIZE;
  s->tbsize = BGP_TX_BUFFER_SIZE;
  s->rx_hook = bgp_incoming_connection;
960
  s->err_hook = bgp_listen_sock_err;
961 962

  if (sk_open(s) < 0)
963
    goto err;
964 965

  return s;
966 967 968 969 970 971

 err:
  sk_log_error(s, "BGP");
  log(L_ERR "BGP: Cannot open listening socket");
  rfree(s);
  return NULL;
972 973 974 975 976
}

static void
bgp_start_neighbor(struct bgp_proto *p)
{
977 978 979
  /* Called only for single-hop BGP sessions */

  if (ipa_zero(p->source_addr))
980
    p->source_addr = p->neigh->ifa->ip;
981

982 983 984
#ifdef IPV6
  {
    struct ifa *a;
985
    p->local_link = IPA_NONE;
986 987 988 989 990 991
    WALK_LIST(a, p->neigh->iface->addrs)
      if (a->scope == SCOPE_LINK)
        {
	  p->local_link = a->ip;
	  break;
	}
992 993 994 995

    if (! ipa_nonzero(p->local_link))
      log(L_WARN "%s: Missing link local address on interface %s", p->p.name,  p->neigh->iface->name);

996 997 998
    DBG("BGP: Selected link-level address %I\n", p->local_link);
  }
#endif
Ondřej Zajíček's avatar
Ondřej Zajíček committed
999

1000
  bgp_initiate(p);
1001 1002 1003 1004 1005 1006
}

/*
 * Neighbor notification hook (installed as P->neigh_notify in bgp_init()).
 *
 * Notifications for a neighbor other than p->neigh, or arriving while the
 * protocol is in PS_DOWN / PS_STOP, are ignored. Otherwise:
 *  - neighbor scope <= 0: stop the session, unless still in BSS_PREPARE
 *  - check_link is set and the interface lacks IF_LINK_UP: likewise
 *  - else: start the session if it was waiting in BSS_PREPARE
 */
static void
bgp_neigh_notify(neighbor *n)
{
  struct bgp_proto *p = (struct bgp_proto *) n->proto;
  int state = p->p.proto_state;

  if (n != p->neigh)
    return;

  if ((state == PS_DOWN) || (state == PS_STOP))
    return;

  /* Still waiting for the neighbor, no session has been started yet */
  int preparing = (state == PS_START) && (p->start_state == BSS_PREPARE);

  if (n->scope <= 0)
  {
    if (preparing)
      return;

    BGP_TRACE(D_EVENTS, "Neighbor lost");
    bgp_store_error(p, NULL, BE_MISC, BEM_NEIGHBOR_LOST);
    /* Perhaps also run bgp_update_startup_delay(p)? */
    bgp_stop(p, 0, NULL, 0);
    return;
  }

  if (p->cf->check_link && !(n->iface->flags & IF_LINK_UP))
  {
    if (preparing)
      return;

    BGP_TRACE(D_EVENTS, "Link down");
    bgp_store_error(p, NULL, BE_MISC, BEM_LINK_DOWN);
    if (state == PS_UP)
      bgp_update_startup_delay(p);
    bgp_stop(p, 0, NULL, 0);
    return;
  }

  /* Neighbor is usable */
  if (preparing)
  {
    BGP_TRACE(D_EVENTS, "Neighbor ready");
    bgp_start_neighbor(p);
  }
}

1048 1049 1050 1051 1052 1053 1054
static void
bgp_bfd_notify(struct bfd_request *req)
{
  struct bgp_proto *p = req->data;
  int ps = p->p.proto_state;

  if (req->down && ((ps == PS_START) || (ps == PS_UP)))
1055 1056
  {
    BGP_TRACE(D_EVENTS, "BFD session down");
1057
    bgp_store_error(p, NULL, BE_MISC, BEM_BFD_DOWN);
1058

1059
    if (p->cf->bfd == BGP_BFD_GRACEFUL)
1060 1061
    {
      /* Trigger graceful restart */
1062 1063 1064 1065 1066 1067 1068 1069
      if (p->conn && (p->conn->state == BS_ESTABLISHED) && p->gr_ready)
	bgp_handle_graceful_restart(p);

      if (p->incoming_conn.state > BS_IDLE)
	bgp_conn_enter_idle_state(&p->incoming_conn);

      if (p->outgoing_conn.state > BS_IDLE)
	bgp_conn_enter_idle_state(&p->outgoing_conn);
1070 1071
    }
    else
1072
    {
1073
      /* Trigger session down */
1074 1075
      if (ps == PS_UP)
	bgp_update_startup_delay(p);
1076
      bgp_stop(p, 0, NULL, 0);
1077
    }
1078
  }
1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095
}

/*
 * Make the presence of the BFD session request match @use_bfd: request a
 * session when BFD is wanted and none exists yet, free the request when
 * BFD is not wanted but a request exists.
 */
static void
bgp_update_bfd(struct bgp_proto *p, int use_bfd)
{
  if (use_bfd)
  {
    if (p->bfd_req)
      return;

    /* Multihop sessions are not bound to the neighbor's interface */
    p->bfd_req = bfd_request_session(p->p.pool, p->cf->remote_ip, p->source_addr,
                                     p->cf->multihop ? NULL : p->neigh->iface,
                                     bgp_bfd_notify, p);
    return;
  }

  if (p->bfd_req)
  {
    rfree(p->bfd_req);
    p->bfd_req = NULL;
  }
}

1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106
/*
 * reload_routes hook: schedule a PKT_ROUTE_REFRESH packet on the current
 * connection. Returns 1 when the packet was scheduled, 0 when there is no
 * connection or its peer_refresh_support flag is not set.
 */
static int
bgp_reload_routes(struct proto *P)
{
  struct bgp_proto *p = (struct bgp_proto *) P;

  if (p->conn && p->conn->peer_refresh_support)
  {
    bgp_schedule_packet(p->conn, PKT_ROUTE_REFRESH);
    return 1;
  }

  return 0;
}

1107
static void
1108
bgp_feed_begin(struct proto *P, int initial)
1109 1110
{
  struct bgp_proto *p = (struct bgp_proto *) P;
1111 1112 1113

  /* This should not happen */
  if (!p->conn)
1114 1115
    return;

1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153
  if (initial && p->cf->gr_mode)
    p->feed_state = BFS_LOADING;

  /* It is refeed and both sides support enhanced route refresh */
  if (!initial && p->cf->enable_refresh &&
      p->conn->peer_enhanced_refresh_support)
    {
      /* BoRR must not be sent before End-of-RIB */
      if (p->feed_state == BFS_LOADING || p->feed_state == BFS_LOADED)
	return;

      p->feed_state = BFS_REFRESHING;
      bgp_schedule_packet(p->conn, PKT_BEGIN_REFRESH);
    }
}

static void
bgp_feed_end(struct proto *P)
{
  struct bgp_proto *p = (struct bgp_proto *) P;

  /* This should not happen */
  if (!p->conn)
    return;

  /* Non-demarcated feed ended, nothing to do */
  if (p->feed_state == BFS_NONE)
    return;

  /* Schedule End-of-RIB packet */
  if (p->feed_state == BFS_LOADING)
    p->feed_state = BFS_LOADED;

  /* Schedule EoRR packet */
  if (p->feed_state == BFS_REFRESHING)
    p->feed_state = BFS_REFRESHED;

  /* Kick TX hook */
1154 1155 1156
  bgp_schedule_packet(p->conn, PKT_UPDATE);
}

1157

1158 1159 1160 1161 1162 1163
static void
bgp_start_locked(struct object_lock *lock)
{
  struct bgp_proto *p = lock->data;
  struct bgp_config *cf = p->cf;

Ondřej Zajíček's avatar
Ondřej Zajíček committed
1164 1165 1166
  if (p->p.proto_state != PS_START)
    {
      DBG("BGP: Got lock in different state %d\n", p->p.proto_state);
1167
      return;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1168 1169
    }

1170
  DBG("BGP: Got lock\n");
1171

1172
  if (cf->multihop)
1173
    {
1174 1175 1176
      /* Multi-hop sessions do not use neighbor entries */
      bgp_initiate(p);
      return;
1177 1178
    }

1179 1180
  neighbor *n = neigh_find2(&p->p, &cf->remote_ip, cf->iface, NEF_STICKY);
  if (!n)
1181
    {
1182
      log(L_ERR "%s: Invalid remote address %I%J", p->p.name, cf->remote_ip, cf->iface);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1183
      /* As we do not start yet, we can just disable protocol */
1184
      p->p.disabled = 1;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1185
      bgp_store_error(p, NULL, BE_MISC, BEM_INVALID_NEXT_HOP);
1186
      proto_notify_state(&p->p, PS_DOWN);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1187
      return;
1188
    }
1189 1190 1191 1192

  p->neigh = n;

  if (n->scope <= 0)
1193
    BGP_TRACE(D_EVENTS, "Waiting for %I%J to become my neighbor", cf->remote_ip, cf->iface);
1194 1195 1196 1197
  else if (p->cf->check_link && !(n->iface->flags & IF_LINK_UP))
    BGP_TRACE(D_EVENTS, "Waiting for link on %s", n->iface->name);
  else
    bgp_start_neighbor(p);
1198 1199
}

1200 1201 1202
static int
bgp_start(struct proto *P)
{
1203 1204 1205
  struct bgp_proto *p = (struct bgp_proto *) P;
  struct object_lock *lock;

1206
  DBG("BGP: Startup.\n");
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1207
  p->start_state = BSS_PREPARE;
1208 1209
  p->outgoing_conn.state = BS_IDLE;
  p->incoming_conn.state = BS_IDLE;
1210
  p->neigh = NULL;
1211
  p->bfd_req = NULL;
1212 1213
  p->gr_ready = 0;
  p->gr_active = 0;
1214

1215 1216
  rt_lock_table(p->igp_table);

Ondřej Zajíček's avatar
Ondřej Zajíček committed
1217 1218 1219
  p->event = ev_new(p->p.pool);
  p->event->hook = bgp_decision;
  p->event->data = p;
1220

1221 1222 1223 1224
  p->startup_timer = tm_new(p->p.pool);
  p->startup_timer->hook = bgp_startup_timeout;
  p->startup_timer->data = p;

1225 1226 1227 1228
  p->gr_timer = tm_new(p->p.pool);
  p->gr_timer->hook = bgp_graceful_restart_timeout;
  p->gr_timer->data = p;

1229 1230 1231 1232
  p->local_id = proto_get_router_id(P->cf);
  if (p->rr_client)
    p->rr_cluster_id = p->cf->rr_cluster_id ? p->cf->rr_cluster_id : p->local_id;

1233 1234 1235
  p->remote_id = 0;
  p->source_addr = p->cf->source_addr;

1236
  if (p->p.gr_recovery && p->cf->gr_mode)
1237 1238
    proto_graceful_restart_lock(P);

1239 1240 1241 1242 1243 1244 1245 1246
  /*
   *  Before attempting to create the connection, we need to lock the
   *  port, so that are sure we're the only instance attempting to talk
   *  with that neighbor.
   */

  lock = p->lock = olock_new(P->pool);
  lock->addr = p->cf->remote_ip;
1247
  lock->port = p->cf->remote_port;
1248
  lock->iface = p->cf->iface;
1249
  lock->vrf = p->cf->iface ? NULL : p->p.vrf;
1250 1251 1252 1253
  lock->type = OBJLOCK_TCP;
  lock->hook = bgp_start_locked;
  lock->data = p;
  olock_acquire(lock);
1254

1255
  return PS_START;
1256 1257
}

1258 1259
extern int proto_restart;

1260 1261 1262
static int
bgp_shutdown(struct proto *P)
{
1263
  struct bgp_proto *p = (struct bgp_proto *) P;
1264 1265 1266 1267 1268
  uint subcode = 0;

  char *message = NULL;
  byte *data = NULL;
  uint len = 0;
1269

Martin Mareš's avatar
Martin Mareš committed
1270
  BGP_TRACE(D_EVENTS, "Shutdown requested");
1271

1272
  switch (P->down_code)
1273
    {
1274 1275 1276 1277 1278 1279 1280 1281 1282 1283
    case PDC_CF_REMOVE:
    case PDC_CF_DISABLE:
      subcode = 3; // Errcode 6, 3 - peer de-configured
      break;

    case PDC_CF_RESTART:
      subcode = 6; // Errcode 6, 6 - other configuration change
      break;

    case PDC_CMD_DISABLE:
1284
    case PDC_CMD_SHUTDOWN:
1285
      subcode = 2; // Errcode 6, 2 - administrative shutdown
1286
      message = P->message;
1287 1288 1289 1290
      break;

    case PDC_CMD_RESTART:
      subcode = 4; // Errcode 6, 4 - administrative reset
1291
      message = P->message;
1292 1293
      break;

1294
    case PDC_RX_LIMIT_HIT:
1295 1296
    case PDC_IN_LIMIT_HIT:
      subcode = 1; // Errcode 6, 1 - max number of prefixes reached
1297
      /* log message for compatibility */
1298
      log(L_WARN "%s: Route limit exceeded, shutting down", p->p.name);
1299 1300 1301 1302
      goto limit;

    case PDC_OUT_LIMIT_HIT:
      subcode = proto_restart ? 4 : 2; // Administrative reset or shutdown
1303

1304
    limit:
1305
      bgp_store_error(p, NULL, BE_AUTO_DOWN, BEA_ROUTE_LIMIT_EXCEEDED);
1306
      if (proto_restart)
1307
	bgp_update_startup_delay(p);
1308
      else
1309 1310
	p->startup_delay = 0;
      goto done;
1311 1312
    }

1313
  bgp_store_error(p, NULL, BE_MAN_DOWN, 0);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1314
  p->startup_delay = 0;
1315

1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331
  /* RFC 8203 - shutdown communication */
  if (message)
  {
    uint msg_len = strlen(message);
    msg_len = MIN(msg_len, 128);

    /* Buffer will be freed automatically by protocol shutdown */
    data = mb_alloc(p->p.pool, msg_len + 1);
    len = msg_len + 1;

    data[0] = msg_len;
    memcpy(data+1, message, msg_len);
  }

done:
  bgp_stop(p, subcode, data, len);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1332
  return p->p.proto_state;
1333 1334
}

1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347
/* Cleanup hook: unlock the IGP table that bgp_start() locked. */
static void
bgp_cleanup(struct proto *P)
{
  rt_unlock_table(((struct bgp_proto *) P)->igp_table);
}

/*
 * Return the routing table used as IGP table for @cf: the explicitly
 * configured one if there is any, the table of the protocol otherwise.
 */
static rtable *
get_igp_table(struct bgp_config *cf)
{
  if (cf->igp_table)
    return cf->igp_table->table;

  return cf->c.table->table;
}

1348 1349 1350 1351
static struct proto *
bgp_init(struct proto_config *C)
{
  struct proto *P = proto_new(C, sizeof(struct bgp_proto));
1352
  struct bgp_config *c = (struct bgp_config *) C;
1353 1354
  struct bgp_proto *p = (struct bgp_proto *) P;

1355
  P->accept_ra_types = c->secondary ? RA_ACCEPTED : RA_OPTIMAL;
1356 1357 1358
  P->rt_notify = bgp_rt_notify;
  P->import_control = bgp_import_control;
  P->neigh_notify = bgp_neigh_notify;
1359
  P->reload_routes = bgp_reload_routes;
1360 1361
  P->feed_begin = bgp_feed_begin;
  P->feed_end = bgp_feed_end;
1362
  P->rte_better = bgp_rte_better;
1363
  P->rte_mergable = bgp_rte_mergable;
1364
  P->rte_recalculate = c->deterministic_med ? bgp_rte_recalculate : NULL;
1365
  P->rte_modify = bgp_rte_modify_stale;
1366

1367 1368 1369 1370
  p->cf = c;
  p->local_as = c->local_as;
  p->remote_as = c->remote_as;
  p->is_internal = (c->local_as == c->remote_as);
1371 1372
  p->rs_client = c->rs_client;
  p->rr_client = c->rr_client;
1373
  p->igp_table = get_igp_table(c);
1374

1375 1376 1377
  return P;
}

1378 1379 1380 1381 1382 1383 1384 1385 1386 1387

void
bgp_check_config(struct bgp_config *c)
{
  int internal = (c->local_as == c->remote_as);

  /* Do not check templates at all */
  if (c->c.class == SYM_TEMPLATE)
    return;

1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400

  /* EBGP direct by default, IBGP multihop by default */
  if (c->multihop < 0)
    c->multihop = internal ? 64 : 0;

  /* Different default for gw_mode */
  if (!c->gw_mode)
    c->gw_mode = c->multihop ? GW_RECURSIVE : GW_DIRECT;

  /* Different default based on rs_client */
  if (!c->missing_lladdr)
    c->missing_lladdr = c->rs_client ? MLL_IGNORE : MLL_SELF;

1401 1402 1403 1404
  /* LLGR mode default based on GR mode */
  if (c->llgr_mode < 0)
    c->llgr_mode = c->gr_mode ? BGP_LLGR_AWARE : 0;

1405 1406 1407 1408 1409
  /* Disable after error incompatible with restart limit action */
  if (c->c.in_limit && (c->c.in_limit->action == PLA_RESTART) && c->disable_after_error)
    c->c.in_limit->action = PLA_DISABLE;


1410 1411 1412
  if (!c->local_as)
    cf_error("Local AS number must be set");

1413
  if (ipa_zero(c->remote_ip))
1414 1415
    cf_error("Neighbor must be configured");

1416 1417 1418
  if (!c->remote_as)
    cf_error("Remote AS number must be set");

1419 1420
  if (ipa_is_link_local(c->remote_ip) && !c->iface)
    cf_error("Link-local neighbor address requires specified interface");
1421

1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433
  if (!(c->capabilities && c->enable_as4) && (c->remote_as > 0xFFFF))
    cf_error("Neighbor AS number out of range (AS4 not available)");

  if (!internal && c->rr_client)
    cf_error("Only internal neighbor can be RR client");

  if (internal && c->rs_client)
    cf_error("Only external neighbor can be RS client");

  if (c->multihop && (c->gw_mode == GW_DIRECT))
    cf_error("Multihop BGP cannot use direct gateway mode");

1434 1435
  if (c->multihop && (ipa_is_link_local(c->remote_ip) ||
		      ipa_is_link_local(c->source_addr)))
1436 1437
    cf_error("Multihop BGP cannot be used with link-local addresses");

1438 1439 1440
  if (c->multihop && c->iface)
    cf_error("Multihop BGP cannot be bound to interface");

1441 1442 1443
  if (c->multihop && c->check_link)
    cf_error("Multihop BGP cannot depend on link state");

1444 1445 1446
  if (c->multihop && c->bfd && ipa_zero(c->source_addr))
    cf_error("Multihop BGP with BFD requires specified source address");

1447 1448 1449 1450 1451 1452 1453 1454
  if ((c->gw_mode == GW_RECURSIVE) && c->c.table->sorted)
    cf_error("BGP in recursive mode prohibits sorted table");

  if (c->deterministic_med && c->c.table->sorted)
    cf_error("BGP with deterministic MED prohibits sorted table");

  if (c->secondary && !c->c.table->sorted)
    cf_error("BGP with secondary option requires sorted table");
1455 1456 1457

  if (!c->gr_mode && c->llgr_mode)
    cf_error("Long-lived graceful restart requires basic graceful restart");
1458 1459 1460 1461 1462 1463 1464 1465 1466
}

static int
bgp_reconfigure(struct proto *P, struct proto_config *C)
{
  struct bgp_config *new = (struct bgp_config *) C;
  struct bgp_proto *p = (struct bgp_proto *) P;
  struct bgp_config *old = p->cf;

1467 1468 1469
  if (proto_get_router_id(C) != p->local_id)
    return 0;

1470 1471 1472 1473 1474 1475 1476 1477
  int same = !memcmp(((byte *) old) + sizeof(struct proto_config),
		     ((byte *) new) + sizeof(struct proto_config),
		     // password item is last and must be checked separately
		     OFFSETOF(struct bgp_config, password) - sizeof(struct proto_config))
    && ((!old->password && !new->password)
	|| (old->password && new->password && !strcmp(old->password, new->password)))
    && (get_igp_table(old) == get_igp_table(new));

1478 1479 1480
  if (same && (p->start_state > BSS_PREPARE))
    bgp_update_bfd(p, new->bfd);

1481 1482 1483 1484 1485 1486 1487