bgp.c 44.5 KB
Newer Older
1 2 3 4 5 6 7 8
/*
 *	BIRD -- The Border Gateway Protocol
 *
 *	(c) 2000 Martin Mares <mj@ucw.cz>
 *
 *	Can be freely distributed and used under the terms of the GNU GPL.
 */

Martin Mareš's avatar
Martin Mareš committed
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
/**
 * DOC: Border Gateway Protocol
 *
 * The BGP protocol is implemented in three parts: |bgp.c| which takes care of the
 * connection and most of the interface with BIRD core, |packets.c| handling
 * both incoming and outgoing BGP packets and |attrs.c| containing functions for
 * manipulation with BGP attribute lists.
 *
 * As opposed to the other existing routing daemons, BIRD has a sophisticated core
 * architecture which is able to keep all the information needed by BGP in the
 * primary routing table, therefore no complex data structures like a central
 * BGP table are needed. This increases memory footprint of a BGP router with
 * many connections, but not too much and, which is more important, it makes
 * BGP much easier to implement.
 *
Martin Mareš's avatar
Martin Mareš committed
24
 * Each instance of BGP (corresponding to a single BGP peer) is described by a &bgp_proto
Martin Mareš's avatar
Martin Mareš committed
25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41
 * structure to which are attached individual connections represented by &bgp_connection
 * (usually, there exists only one connection, but during BGP session setup, there
 * can be more of them). The connections are handled according to the BGP state machine
 * defined in the RFC with all the timers and all the parameters configurable.
 *
 * In incoming direction, we listen on the connection's socket and each time we receive
 * some input, we pass it to bgp_rx(). It decodes packet headers and the markers and
 * passes complete packets to bgp_rx_packet() which distributes the packet according
 * to its type.
 *
 * In outgoing direction, we gather all the routing updates and sort them to buckets
 * (&bgp_bucket) according to their attributes (we keep a hash table for fast comparison
 * of &rta's and a &fib which helps us to find if we already have another route for
 * the same destination queued for sending, so that we can replace it with the new one
 * immediately instead of sending both updates). There also exists a special bucket holding
 * all the route withdrawals which cannot be queued anywhere else as they don't have any
 * attributes. If we have any packet to send (due to either new routes or the connection
Martin Mareš's avatar
Martin Mareš committed
42
 * tracking code wanting to send an Open, Keepalive or Notification message), we call
Martin Mareš's avatar
Martin Mareš committed
43 44 45 46 47 48 49 50 51 52 53
 * bgp_schedule_packet() which sets the corresponding bit in a @packet_to_send
 * bit field in &bgp_conn and as soon as the transmit socket buffer becomes empty,
 * we call bgp_fire_tx(). It inspects state of all the packet type bits and calls
 * the corresponding bgp_create_xx() functions, eventually rescheduling the same packet
 * type if we have more data of the same type to send.
 *
 * The processing of attributes consists of two functions: bgp_decode_attrs() for checking
 * of the attribute blocks and translating them to the language of BIRD's extended attributes
 * and bgp_encode_attrs() which does the converse. Both functions are built around a
 * @bgp_attr_table array describing all important characteristics of all known attributes.
 * Unknown transitive attributes are attached to the route as %EAF_TYPE_OPAQUE byte streams.
54 55 56 57 58 59 60 61 62 63
 *
 * BGP protocol implements graceful restart in both restarting (local restart)
 * and receiving (neighbor restart) roles. The first is handled mostly by the
 * graceful restart code in the nest, BGP protocol just handles capabilities,
 * sets @gr_wait and locks graceful restart until end-of-RIB mark is received.
 * The second is implemented by internal restart of the BGP state to %BS_IDLE
 * and protocol state to %PS_START, but keeping the protocol up from the core
 * point of view and therefore maintaining received routes. Routing table
 * refresh cycle (rt_refresh_begin(), rt_refresh_end()) is used for removing
 * stale routes after reestablishment of BGP session during graceful restart.
Martin Mareš's avatar
Martin Mareš committed
64 65
 */

66
#undef LOCAL_DEBUG
67 68 69 70 71

#include "nest/bird.h"
#include "nest/iface.h"
#include "nest/protocol.h"
#include "nest/route.h"
72
#include "nest/cli.h"
73
#include "nest/locks.h"
74
#include "conf/conf.h"
75
#include "lib/socket.h"
76
#include "lib/resource.h"
Martin Mareš's avatar
Martin Mareš committed
77
#include "lib/string.h"
78 79 80

#include "bgp.h"

Ondřej Zajíček's avatar
Ondřej Zajíček committed
81

82
struct linpool *bgp_linpool;		/* Global temporary pool */
83 84 85
static sock *bgp_listen_sk;		/* Global listening socket */
static int bgp_counter;			/* Number of protocol instances using the listening socket */

Ondřej Zajíček's avatar
Ondřej Zajíček committed
86
static void bgp_close(struct bgp_proto *p, int apply_md5);
87
static void bgp_connect(struct bgp_proto *p);
88
static void bgp_active(struct bgp_proto *p);
89
static sock *bgp_setup_listen_sk(ip_addr addr, unsigned port, u32 flags);
90
static void bgp_update_bfd(struct bgp_proto *p, int use_bfd);
91

92

Ondřej Zajíček's avatar
Ondřej Zajíček committed
93 94 95 96 97 98 99 100 101 102 103 104 105
/**
 * bgp_open - open a BGP instance
 * @p: BGP instance
 *
 * This function allocates and configures shared BGP resources.
 * Should be called as the last step during initialization
 * (when lock is acquired and neighbor is ready).
 * When error, state changed to PS_DOWN, -1 is returned and caller
 * should return immediately.
 */
static int
bgp_open(struct bgp_proto *p)
{
106
  struct config *cfg = p->cf->c.global;
107 108
  int errcode;

Ondřej Zajíček's avatar
Ondřej Zajíček committed
109
  if (!bgp_listen_sk)
110
    bgp_listen_sk = bgp_setup_listen_sk(cfg->listen_bgp_addr, cfg->listen_bgp_port, cfg->listen_bgp_flags);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
111

112 113
  if (!bgp_listen_sk)
    {
114 115
      errcode = BEM_NO_SOCKET;
      goto err;
116 117
    }

Ondřej Zajíček's avatar
Ondřej Zajíček committed
118 119 120
  if (!bgp_linpool)
    bgp_linpool = lp_new(&root_pool, 4080);

121 122
  bgp_counter++;

Ondřej Zajíček's avatar
Ondřej Zajíček committed
123
  if (p->cf->password)
124 125
    if (sk_set_md5_auth(bgp_listen_sk, p->cf->source_addr, p->cf->remote_ip,
			p->cf->iface, p->cf->password, p->cf->setkey) < 0)
126 127 128 129 130 131
      {
	sk_log_error(bgp_listen_sk, p->p.name);
	bgp_close(p, 0);
	errcode = BEM_INVALID_MD5;
	goto err;
      }
Ondřej Zajíček's avatar
Ondřej Zajíček committed
132 133

  return 0;
134 135 136 137 138 139

err:
  p->p.disabled = 1;
  bgp_store_error(p, NULL, BE_MISC, errcode);
  proto_notify_state(&p->p, PS_DOWN);
  return -1;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
140 141
}

142 143 144 145 146
/*
 * Start the instance: pick the start state according to whether capabilities
 * are enabled in the configuration and, unless the instance is passive,
 * begin with the outgoing connection.
 */
static void
bgp_startup(struct bgp_proto *p)
{
  BGP_TRACE(D_EVENTS, "Started");

  if (p->cf->capabilities)
    p->start_state = BSS_CONNECT;
  else
    p->start_state = BSS_CONNECT_NOCAP;

  if (p->cf->passive)
    return;

  bgp_active(p);
}

/*
 * Timer hook: start the BGP instance stored in the timer's data.
 * NOTE(review): presumably the hook of the startup timer armed in
 * bgp_initiate() -- the hook assignment is not in this part of the file.
 */
static void
bgp_startup_timeout(timer *t)
{
  struct bgp_proto *p = t->data;

  bgp_startup(p);
}


static void
bgp_initiate(struct bgp_proto *p)
{
162 163 164 165
  int rv = bgp_open(p);
  if (rv < 0)
    return;

166 167 168
  if (p->cf->bfd)
    bgp_update_bfd(p, p->cf->bfd);

169 170
  if (p->startup_delay)
    {
171
      p->start_state = BSS_DELAY;
172
      BGP_TRACE(D_EVENTS, "Startup delayed by %d seconds due to errors", p->startup_delay);
173 174 175 176 177 178
      bgp_start_timer(p->startup_timer, p->startup_delay);
    }
  else
    bgp_startup(p);
}

Ondřej Zajíček's avatar
Ondřej Zajíček committed
179 180 181 182 183 184 185 186 187
/**
 * bgp_close - close a BGP instance
 * @p: BGP instance
 * @apply_md5: 0 to disable unsetting MD5 auth
 *
 * This function frees and deconfigures shared BGP resources.
 * @apply_md5 is set to 0 when bgp_close is called as a cleanup
 * from failed bgp_open().
 */
188
static void
Ondřej Zajíček's avatar
Ondřej Zajíček committed
189
bgp_close(struct bgp_proto *p, int apply_md5)
190 191 192
{
  ASSERT(bgp_counter);
  bgp_counter--;
193

Ondřej Zajíček's avatar
Ondřej Zajíček committed
194
  if (p->cf->password && apply_md5)
195 196
    if (sk_set_md5_auth(bgp_listen_sk, p->cf->source_addr, p->cf->remote_ip,
			p->cf->iface, NULL, p->cf->setkey) < 0)
197
      sk_log_error(bgp_listen_sk, p->p.name);
198

199 200 201 202
  if (!bgp_counter)
    {
      rfree(bgp_listen_sk);
      bgp_listen_sk = NULL;
203 204
      rfree(bgp_linpool);
      bgp_linpool = NULL;
205 206 207
    }
}

Martin Mareš's avatar
Martin Mareš committed
208 209 210 211 212 213 214 215 216
/**
 * bgp_start_timer - start a BGP timer
 * @t: timer
 * @value: time to fire (0 to disable the timer)
 *
 * This functions calls tm_start() on @t with time @value and the
 * amount of randomization suggested by the BGP standard. Please use
 * it for all BGP timers.
 */
217
void
218 219
bgp_start_timer(timer *t, int value)
{
220
  if (value)
221 222 223 224 225
    {
      /* The randomization procedure is specified in RFC 1771: 9.2.3.3 */
      t->randomize = value / 4;
      tm_start(t, value - t->randomize);
    }
226 227 228 229
  else
    tm_stop(t);
}

Martin Mareš's avatar
Martin Mareš committed
230 231 232 233 234 235 236
/**
 * bgp_close_conn - close a BGP connection
 * @conn: connection to close
 *
 * This function takes a connection described by the &bgp_conn structure,
 * closes its socket and frees all resources associated with it.
 */
237 238 239
void
bgp_close_conn(struct bgp_conn *conn)
{
240
  // struct bgp_proto *p = conn->bgp;
241 242 243 244 245 246 247 248 249

  DBG("BGP: Closing connection\n");
  conn->packets_to_send = 0;
  rfree(conn->connect_retry_timer);
  conn->connect_retry_timer = NULL;
  rfree(conn->keepalive_timer);
  conn->keepalive_timer = NULL;
  rfree(conn->hold_timer);
  conn->hold_timer = NULL;
250
  rfree(conn->sk);
251
  conn->sk = NULL;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267
  rfree(conn->tx_ev);
  conn->tx_ev = NULL;
}


/**
 * bgp_update_startup_delay - update a startup delay
 * @p: BGP instance
 *
 * This function updates a startup delay that is used to postpone next BGP connect.
 * It also handles disable_after_error and might stop BGP instance when error
 * happened and disable_after_error is on.
 *
 * It should be called when BGP protocol error happened.
 */
void
268
bgp_update_startup_delay(struct bgp_proto *p)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
269 270 271
{
  struct bgp_config *cf = p->cf;

272
  DBG("BGP: Updating startup delay\n");
Ondřej Zajíček's avatar
Ondřej Zajíček committed
273

274
  if (p->last_proto_error && ((now - p->last_proto_error) >= (int) cf->error_amnesia_time))
275 276
    p->startup_delay = 0;

Ondřej Zajíček's avatar
Ondřej Zajíček committed
277 278 279 280 281 282 283
  p->last_proto_error = now;

  if (cf->disable_after_error)
    {
      p->startup_delay = 0;
      p->p.disabled = 1;
      return;
284
    }
Ondřej Zajíček's avatar
Ondřej Zajíček committed
285 286 287 288

  if (!p->startup_delay)
    p->startup_delay = cf->error_delay_time_min;
  else
289
    p->startup_delay = MIN(2 * p->startup_delay, cf->error_delay_time_max);
290 291
}

Ondřej Zajíček's avatar
Ondřej Zajíček committed
292
static void
293
bgp_graceful_close_conn(struct bgp_conn *conn, uint subcode, byte *data, uint len)
294
{
Ondřej Zajíček's avatar
Ondřej Zajíček committed
295
  switch (conn->state)
296 297
    {
    case BS_IDLE:
Ondřej Zajíček's avatar
Ondřej Zajíček committed
298 299
    case BS_CLOSE:
      return;
300 301
    case BS_CONNECT:
    case BS_ACTIVE:
Ondřej Zajíček's avatar
Ondřej Zajíček committed
302 303
      bgp_conn_enter_idle_state(conn);
      return;
304 305 306
    case BS_OPENSENT:
    case BS_OPENCONFIRM:
    case BS_ESTABLISHED:
307
      bgp_error(conn, 6, subcode, data, len);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
308
      return;
309
    default:
Ondřej Zajíček's avatar
Ondřej Zajíček committed
310
      bug("bgp_graceful_close_conn: Unknown state %d", conn->state);
311 312 313
    }
}

Ondřej Zajíček's avatar
Ondřej Zajíček committed
314 315 316 317 318 319
static void
bgp_down(struct bgp_proto *p)
{
  if (p->start_state > BSS_PREPARE)
    bgp_close(p, 1);

320
  BGP_TRACE(D_EVENTS, "Down");
Ondřej Zajíček's avatar
Ondřej Zajíček committed
321 322 323 324 325 326 327 328 329 330
  proto_notify_state(&p->p, PS_DOWN);
}

static void
bgp_decision(void *vp)
{
  struct bgp_proto *p = vp;

  DBG("BGP: Decision start\n");
  if ((p->p.proto_state == PS_START)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
331
      && (p->outgoing_conn.state == BS_IDLE)
332
      && (p->incoming_conn.state != BS_OPENCONFIRM)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
333
      && (!p->cf->passive))
334
    bgp_active(p);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
335 336 337 338 339 340 341

  if ((p->p.proto_state == PS_STOP)
      && (p->outgoing_conn.state == BS_IDLE)
      && (p->incoming_conn.state == BS_IDLE))
    bgp_down(p);
}

342
void
343
bgp_stop(struct bgp_proto *p, uint subcode, byte *data, uint len)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
344 345
{
  proto_notify_state(&p->p, PS_STOP);
346 347
  bgp_graceful_close_conn(&p->outgoing_conn, subcode, data, len);
  bgp_graceful_close_conn(&p->incoming_conn, subcode, data, len);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
348 349 350
  ev_schedule(p->event);
}

351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366
/*
 * Change the BGP state of a connection. When dumping of state changes is
 * enabled for the protocol (MD_STATES), the transition is dumped first.
 */
static inline void
bgp_conn_set_state(struct bgp_conn *conn, unsigned new_state)
{
  if ((conn->bgp->p.mrtdump & MD_STATES) != 0)
    {
      mrt_dump_bgp_state_change(conn, conn->state, new_state);
    }

  conn->state = new_state;
}

/* Move the connection to the OpenConfirm state */
void
bgp_conn_enter_openconfirm_state(struct bgp_conn *conn)
{
  /* Only the state changes here; most of the work is done in bgp_rx_open(). */
  bgp_conn_set_state(conn, BS_OPENCONFIRM);
}

Ondřej Zajíček's avatar
Ondřej Zajíček committed
367 368 369 370
void
bgp_conn_enter_established_state(struct bgp_conn *conn)
{
  struct bgp_proto *p = conn->bgp;
371

Ondřej Zajíček's avatar
Ondřej Zajíček committed
372 373 374
  BGP_TRACE(D_EVENTS, "BGP session established");
  DBG("BGP: UP!!!\n");

375 376
  /* For multi-hop BGP sessions */
  if (ipa_zero(p->source_addr))
377
    p->source_addr = conn->sk->saddr;
378

379 380
  conn->sk->fast_rx = 0;

Ondřej Zajíček's avatar
Ondřej Zajíček committed
381 382 383
  p->conn = conn;
  p->last_error_class = 0;
  p->last_error_code = 0;
384 385
  p->feed_state = BFS_NONE;
  p->load_state = BFS_NONE;
386 387 388
  bgp_init_bucket_table(p);
  bgp_init_prefix_table(p, 8);

389 390 391 392 393 394 395 396 397 398 399 400 401 402
  int peer_gr_ready = conn->peer_gr_aware && !(conn->peer_gr_flags & BGP_GRF_RESTART);

  if (p->p.gr_recovery && !peer_gr_ready)
    proto_graceful_restart_unlock(&p->p);

  if (p->p.gr_recovery && (p->cf->gr_mode == BGP_GR_ABLE) && peer_gr_ready)
    p->p.gr_wait = 1;

  if (p->gr_active)
    tm_stop(p->gr_timer);

  if (p->gr_active && (!conn->peer_gr_able || !(conn->peer_gr_aflags & BGP_GRF_FORWARDING)))
    bgp_graceful_restart_done(p);

403 404 405 406 407 408
  /* GR capability implies that neighbor will send End-of-RIB */
  if (conn->peer_gr_aware)
    p->load_state = BFS_LOADING;

  /* proto_notify_state() will likely call bgp_feed_begin(), setting p->feed_state */

409
  bgp_conn_set_state(conn, BS_ESTABLISHED);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
410 411 412 413 414 415 416 417 418
  proto_notify_state(&p->p, PS_UP);
}

static void
bgp_conn_leave_established_state(struct bgp_proto *p)
{
  BGP_TRACE(D_EVENTS, "BGP session closed");
  p->conn = NULL;

419 420 421
  bgp_free_prefix_table(p);
  bgp_free_bucket_table(p);

Ondřej Zajíček's avatar
Ondřej Zajíček committed
422
  if (p->p.proto_state == PS_UP)
423
    bgp_stop(p, 0, NULL, 0);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
424 425 426 427 428 429 430 431
}

void
bgp_conn_enter_close_state(struct bgp_conn *conn)
{
  struct bgp_proto *p = conn->bgp;
  int os = conn->state;

432
  bgp_conn_set_state(conn, BS_CLOSE);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
433 434 435
  tm_stop(conn->keepalive_timer);
  conn->sk->rx_hook = NULL;

436 437 438
  /* Timeout for CLOSE state, if we cannot send notification soon then we just hangup */
  bgp_start_timer(conn->hold_timer, 10);

Ondřej Zajíček's avatar
Ondřej Zajíček committed
439 440 441 442 443 444 445 446 447 448 449
  if (os == BS_ESTABLISHED)
    bgp_conn_leave_established_state(p);
}

void
bgp_conn_enter_idle_state(struct bgp_conn *conn)
{
  struct bgp_proto *p = conn->bgp;
  int os = conn->state;

  bgp_close_conn(conn);
450
  bgp_conn_set_state(conn, BS_IDLE);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
451 452 453 454 455 456
  ev_schedule(p->event);

  if (os == BS_ESTABLISHED)
    bgp_conn_leave_established_state(p);
}

457 458 459 460 461 462 463 464 465 466 467
/**
 * bgp_handle_graceful_restart - handle detected BGP graceful restart
 * @p: BGP instance
 *
 * This function is called when a BGP graceful restart of the neighbor is
 * detected (when the TCP connection fails or when a new TCP connection
 * appears). The function activates processing of the restart - starts routing
 * table refresh cycle and activates BGP restart timer. The protocol state goes
 * back to %PS_START, but changing BGP state back to %BS_IDLE is left for the
 * caller.
 */
468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484
void
bgp_handle_graceful_restart(struct bgp_proto *p)
{
  ASSERT(p->conn && (p->conn->state == BS_ESTABLISHED) && p->gr_ready);

  BGP_TRACE(D_EVENTS, "Neighbor graceful restart detected%s",
	    p->gr_active ? " - already pending" : "");
  proto_notify_state(&p->p, PS_START);

  if (p->gr_active)
    rt_refresh_end(p->p.main_ahook->table, p->p.main_ahook);

  p->gr_active = 1;
  bgp_start_timer(p->gr_timer, p->conn->peer_gr_time);
  rt_refresh_begin(p->p.main_ahook->table, p->p.main_ahook);
}

485 486 487 488 489 490 491 492 493 494
/**
 * bgp_graceful_restart_done - finish active BGP graceful restart
 * @p: BGP instance
 *
 * This function is called when the active BGP graceful restart of the neighbor
 * should be finished - either successfully (the neighbor sends all paths and
 * reports end-of-RIB on the new session) or unsuccessfully (the neighbor does
 * not support BGP graceful restart on the new session). The function ends
 * routing table refresh cycle and stops BGP restart timer.
 */
495 496 497 498 499 500 501 502 503
void
bgp_graceful_restart_done(struct bgp_proto *p)
{
  BGP_TRACE(D_EVENTS, "Neighbor graceful restart done");
  p->gr_active = 0;
  tm_stop(p->gr_timer);
  rt_refresh_end(p->p.main_ahook->table, p->p.main_ahook);
}

504 505 506 507 508 509 510 511 512
/**
 * bgp_graceful_restart_timeout - timeout of graceful restart 'restart timer'
 * @t: timer
 *
 * This function is a timeout hook for @gr_timer, implementing BGP restart time
 * limit for reestablisment of the BGP session after the graceful restart. When
 * fired, we just proceed with the usual protocol restart.
 */

513 514 515 516 517 518
static void
bgp_graceful_restart_timeout(timer *t)
{
  struct bgp_proto *p = t->data;

  BGP_TRACE(D_EVENTS, "Neighbor graceful restart timeout");
519
  bgp_stop(p, 0, NULL, 0);
520 521
}

522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562

/**
 * bgp_refresh_begin - start incoming enhanced route refresh sequence
 * @p: BGP instance
 *
 * This function is called when an incoming enhanced route refresh sequence is
 * started by the neighbor, demarcated by the BoRR packet. Unless the initial
 * load is still in progress (in which case the request is logged and
 * ignored), the function switches the load state to refreshing and starts
 * the routing table refresh cycle. Note that graceful restart also uses
 * routing table refresh cycle, but RFC 7313 and load states ensure that these
 * two sequences do not overlap.
 */
void
bgp_refresh_begin(struct bgp_proto *p)
{
  if (p->load_state == BFS_LOADING)
    {
      log(L_WARN "%s: BEGIN-OF-RR received before END-OF-RIB, ignoring", p->p.name);
      return;
    }

  p->load_state = BFS_REFRESHING;
  rt_refresh_begin(p->p.main_ahook->table, p->p.main_ahook);
}

/**
 * bgp_refresh_end - finish incoming enhanced route refresh sequence
 * @p: BGP instance
 *
 * This function is called when an incoming enhanced route refresh sequence is
 * finished by the neighbor, demarcated by the EoRR packet. Unless no refresh
 * sequence is in progress (in which case the request is logged and ignored),
 * the function resets the load state and ends the routing table refresh
 * cycle. Routes not received during the sequence are removed by the nest.
 */
void
bgp_refresh_end(struct bgp_proto *p)
{
  if (p->load_state != BFS_REFRESHING)
    {
      log(L_WARN "%s: END-OF-RR received without prior BEGIN-OF-RR, ignoring", p->p.name);
      return;
    }

  p->load_state = BFS_NONE;
  rt_refresh_end(p->p.main_ahook->table, p->p.main_ahook);
}


563 564 565
static void
bgp_send_open(struct bgp_conn *conn)
{
566
  conn->start_state = conn->bgp->start_state;
567 568

  // Default values, possibly changed by receiving capabilities.
569
  conn->advertised_as = 0;
570 571 572
  conn->peer_refresh_support = 0;
  conn->peer_as4_support = 0;
  conn->peer_add_path = 0;
573
  conn->peer_enhanced_refresh_support = 0;
574 575 576 577 578
  conn->peer_gr_aware = 0;
  conn->peer_gr_able = 0;
  conn->peer_gr_time = 0;
  conn->peer_gr_flags = 0;
  conn->peer_gr_aflags = 0;
579
  conn->peer_ext_messages_support = 0;
580

581 582
  DBG("BGP: Sending open\n");
  conn->sk->rx_hook = bgp_rx;
583
  conn->sk->tx_hook = bgp_tx;
584
  tm_stop(conn->connect_retry_timer);
585
  bgp_schedule_packet(conn, PKT_OPEN);
586
  bgp_conn_set_state(conn, BS_OPENSENT);
587
  bgp_start_timer(conn->hold_timer, conn->bgp->cf->initial_hold_time);
588 589
}

590 591
/*
 * Socket hook installed by bgp_connect() as the transmit hook of the
 * outgoing socket: log the event and continue with the Open exchange.
 */
static void
bgp_connected(sock *sk)
{
  struct bgp_conn *conn = sk->data;
  struct bgp_proto *p = conn->bgp;	/* Used by BGP_TRACE() */

  BGP_TRACE(D_EVENTS, "Connected");

  bgp_send_open(conn);
}

/*
 * Hook of the connect retry timer. While the protocol is starting, the
 * connection is torn down and a new connect attempt is made; otherwise
 * the connection just becomes idle.
 */
static void
bgp_connect_timeout(timer *t)
{
  struct bgp_conn *conn = t->data;
  struct bgp_proto *p = conn->bgp;

  DBG("BGP: connect_timeout\n");

  if (p->p.proto_state != PS_START)
    {
      bgp_conn_enter_idle_state(conn);
      return;
    }

  bgp_close_conn(conn);
  bgp_connect(p);
}

static void
617
bgp_sock_err(sock *sk, int err)
618 619
{
  struct bgp_conn *conn = sk->data;
Martin Mareš's avatar
Martin Mareš committed
620
  struct bgp_proto *p = conn->bgp;
621

622 623 624 625 626 627 628 629 630
  /*
   * This error hook may be called either asynchronously from main
   * loop, or synchronously from sk_send().  But sk_send() is called
   * only from bgp_tx() and bgp_kick_tx(), which are both called
   * asynchronously from main loop. Moreover, they end if err hook is
   * called. Therefore, we could suppose that it is always called
   * asynchronously.
   */

Ondřej Zajíček's avatar
Ondřej Zajíček committed
631 632
  bgp_store_error(p, conn, BE_SOCKET, err);

633 634 635 636
  if (err)
    BGP_TRACE(D_EVENTS, "Connection lost (%M)", err);
  else
    BGP_TRACE(D_EVENTS, "Connection closed");
Ondřej Zajíček's avatar
Ondřej Zajíček committed
637

638 639 640
  if ((conn->state == BS_ESTABLISHED) && p->gr_ready)
    bgp_handle_graceful_restart(p);

Ondřej Zajíček's avatar
Ondřej Zajíček committed
641
  bgp_conn_enter_idle_state(conn);
642 643
}

644 645 646 647
/*
 * Hook of the hold timer. In the Close state it serves as the hangup
 * timeout; otherwise the session is terminated by bgp_error() with error
 * code 4 (Hold Timer Expired in RFC 4271), unless there is unprocessed
 * input waiting on the socket.
 */
static void
bgp_hold_timeout(timer *t)
{
  struct bgp_conn *conn = t->data;
  struct bgp_proto *p = conn->bgp;

  DBG("BGP: Hold timeout\n");

  /* We are already closing the connection - just do hangup */
  if (conn->state == BS_CLOSE)
    {
      BGP_TRACE(D_EVENTS, "Connection stalled");
      bgp_conn_enter_idle_state(conn);
      return;
    }

  /* If there is something in input queue, we are probably congested
     and perhaps just not processed BGP packets in time. */
  if (sk_rx_ready(conn->sk) > 0)
    {
      bgp_start_timer(conn->hold_timer, 10);
      return;
    }

  bgp_error(conn, 4, 0, NULL, 0);
}

/*
 * Hook of the keepalive timer: schedule a Keepalive packet and, when the
 * transmit event is already pending, run it right away.
 */
static void
bgp_keepalive_timeout(timer *t)
{
  struct bgp_conn *conn = t->data;

  DBG("BGP: Keepalive timer\n");

  bgp_schedule_packet(conn, PKT_KEEPALIVE);

  /* Kick TX a bit faster */
  if (ev_active(conn->tx_ev))
    {
      ev_run(conn->tx_ev);
    }
}

682
static void
683
bgp_setup_conn(struct bgp_proto *p, struct bgp_conn *conn)
684 685 686
{
  timer *t;

687
  conn->sk = NULL;
688
  conn->bgp = p;
689
  conn->packets_to_send = 0;
690 691 692

  t = conn->connect_retry_timer = tm_new(p->p.pool);
  t->hook = bgp_connect_timeout;
693 694
  t->data = conn;
  t = conn->hold_timer = tm_new(p->p.pool);
695
  t->hook = bgp_hold_timeout;
696 697
  t->data = conn;
  t = conn->keepalive_timer = tm_new(p->p.pool);
698
  t->hook = bgp_keepalive_timeout;
699
  t->data = conn;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
700 701 702
  conn->tx_ev = ev_new(p->p.pool);
  conn->tx_ev->hook = bgp_kick_tx;
  conn->tx_ev->data = conn;
703 704
}

705
static void
706
bgp_setup_sk(struct bgp_conn *conn, sock *s)
707 708 709
{
  s->data = conn;
  s->err_hook = bgp_sock_err;
710
  s->fast_rx = 1;
711 712 713
  conn->sk = s;
}

Ondřej Zajíček's avatar
Ondřej Zajíček committed
714
static void
715
bgp_active(struct bgp_proto *p)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
716
{
717
  int delay = MAX(1, p->cf->connect_delay_time);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
718 719 720 721
  struct bgp_conn *conn = &p->outgoing_conn;

  BGP_TRACE(D_EVENTS, "Connect delayed by %d seconds", delay);
  bgp_setup_conn(p, conn);
722
  bgp_conn_set_state(conn, BS_ACTIVE);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
723 724 725
  bgp_start_timer(conn->connect_retry_timer, delay);
}

Martin Mareš's avatar
Martin Mareš committed
726 727 728 729 730 731 732 733
/**
 * bgp_connect - initiate an outgoing connection
 * @p: BGP instance
 *
 * The bgp_connect() function creates a new &bgp_conn and initiates
 * a TCP connection to the peer. The rest of connection setup is governed
 * by the BGP state machine as described in the standard.
 */
734 735 736 737
static void
bgp_connect(struct bgp_proto *p)	/* Enter Connect state and start establishing connection */
{
  sock *s;
738
  struct bgp_conn *conn = &p->outgoing_conn;
739
  int hops = p->cf->multihop ? : 1;
740 741 742 743

  DBG("BGP: Connecting\n");
  s = sk_new(p->p.pool);
  s->type = SK_TCP_ACTIVE;
744
  s->saddr = p->source_addr;
745
  s->daddr = p->cf->remote_ip;
746
  s->dport = p->cf->remote_port;
747
  s->iface = p->neigh ? p->neigh->iface : NULL;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
748
  s->vrf = p->p.vrf;
749
  s->ttl = p->cf->ttl_security ? 255 : hops;
750 751
  s->rbsize = p->cf->enable_extended_messages ? BGP_RX_BUFFER_EXT_SIZE : BGP_RX_BUFFER_SIZE;
  s->tbsize = p->cf->enable_extended_messages ? BGP_TX_BUFFER_EXT_SIZE : BGP_TX_BUFFER_SIZE;
752 753 754
  s->tos = IP_PREC_INTERNET_CONTROL;
  s->password = p->cf->password;
  s->tx_hook = bgp_connected;
755
  BGP_TRACE(D_EVENTS, "Connecting to %I%J from local address %I%J", s->daddr, p->cf->iface,
756
	    s->saddr, ipa_is_link_local(s->saddr) ? s->iface : NULL);
757
  bgp_setup_conn(p, conn);
758
  bgp_setup_sk(conn, s);
759
  bgp_conn_set_state(conn, BS_CONNECT);
760 761

  if (sk_open(s) < 0)
762
    goto err;
763 764 765 766

  /* Set minimal receive TTL if needed */
  if (p->cf->ttl_security)
    if (sk_set_min_ttl(s, 256 - hops) < 0)
767
      goto err;
768

769 770
  DBG("BGP: Waiting for connect success\n");
  bgp_start_timer(conn->connect_retry_timer, p->cf->connect_retry_time);
771 772 773 774 775 776
  return;

 err:
  sk_log_error(s, p->p.name);
  bgp_sock_err(s, 0);
  return;
777 778
}

779 780 781 782 783 784 785 786 787 788 789 790 791 792 793
/**
 * bgp_find_proto - find existing proto for incoming connection
 * @sk: TCP socket
 *
 * Walks all configured protocols and returns the first running BGP instance
 * whose configured remote address equals the remote address of @sk and
 * whose configured interface, if any, is the interface of @sk. Returns
 * %NULL when there is no such instance.
 */
static struct bgp_proto *
bgp_find_proto(sock *sk)
{
  struct proto_config *pc;

  WALK_LIST(pc, config->protos)
    {
      struct bgp_proto *p;

      /* Only BGP protocols that have an instance attached are of interest */
      if ((pc->protocol != &proto_bgp) || !pc->proto)
        continue;

      p = (struct bgp_proto *) pc->proto;

      if (!ipa_equal(p->cf->remote_ip, sk->daddr))
        continue;

      if (p->cf->iface && (p->cf->iface != sk->iface))
        continue;

      return p;
    }

  return NULL;
}

Martin Mareš's avatar
Martin Mareš committed
801 802 803 804 805 806 807 808 809 810 811 812
/**
 * bgp_incoming_connection - handle an incoming connection
 * @sk: TCP socket
 * @dummy: unused
 *
 * This function serves as a socket hook for accepting of new BGP
 * connections. It searches a BGP instance corresponding to the peer
 * which has connected and if such an instance exists, it creates a
 * &bgp_conn structure, attaches it to the instance and either sends
 * an Open message or (if there already is an active connection) it
 * closes the new connection by sending a Notification message.
 */
813
static int
814
bgp_incoming_connection(sock *sk, uint dummy UNUSED)
815
{
816 817
  struct bgp_proto *p;
  int acc, hops;
818

819
  DBG("BGP: Incoming connection from %I port %d\n", sk->daddr, sk->dport);
820 821 822 823 824 825 826 827 828
  p = bgp_find_proto(sk);
  if (!p)
    {
      log(L_WARN "BGP: Unexpected connect from unknown address %I%J (port %d)",
	  sk->daddr, ipa_is_link_local(sk->daddr) ? sk->iface : NULL, sk->dport);
      rfree(sk);
      return 0;
    }

829 830 831 832 833 834 835
  /*
   * BIRD should keep multiple incoming connections in OpenSent state (for
   * details RFC 4271 8.2.1 par 3), but it keeps just one. Duplicate incoming
   * connections are rejected istead. The exception is the case where an
   * incoming connection triggers a graceful restart.
   */

836 837
  acc = (p->p.proto_state == PS_START || p->p.proto_state == PS_UP) &&
    (p->start_state >= BSS_CONNECT) && (!p->incoming_conn.sk);
838

839 840 841 842 843 844
  if (p->conn && (p->conn->state == BS_ESTABLISHED) && p->gr_ready)
    {
      bgp_store_error(p, NULL, BE_MISC, BEM_GRACEFUL_RESTART);
      bgp_handle_graceful_restart(p);
      bgp_conn_enter_idle_state(p->conn);
      acc = 1;
845 846 847 848

      /* There might be separate incoming connection in OpenSent state */
      if (p->incoming_conn.state > BS_ACTIVE)
	bgp_close_conn(&p->incoming_conn);
849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869
    }

  BGP_TRACE(D_EVENTS, "Incoming connection from %I%J (port %d) %s",
	    sk->daddr, ipa_is_link_local(sk->daddr) ? sk->iface : NULL,
	    sk->dport, acc ? "accepted" : "rejected");

  if (!acc)
    {
      rfree(sk);
      return 0;
    }

  hops = p->cf->multihop ? : 1;

  if (sk_set_ttl(sk, p->cf->ttl_security ? 255 : hops) < 0)
    goto err;

  if (p->cf->ttl_security)
    if (sk_set_min_ttl(sk, 256 - hops) < 0)
      goto err;

870 871 872 873 874 875 876
  if (p->cf->enable_extended_messages)
    {
      sk->rbsize = BGP_RX_BUFFER_EXT_SIZE;
      sk->tbsize = BGP_TX_BUFFER_EXT_SIZE;
      sk_reallocate(sk);
    }

877 878 879 880 881 882 883 884
  bgp_setup_conn(p, &p->incoming_conn);
  bgp_setup_sk(&p->incoming_conn, sk);
  bgp_send_open(&p->incoming_conn);
  return 0;

err:
  sk_log_error(sk, p->p.name);
  log(L_ERR "%s: Incoming connection aborted", p->p.name);
885 886 887 888
  rfree(sk);
  return 0;
}

889
static void
890
bgp_listen_sock_err(sock *sk UNUSED, int err)
891 892 893 894
{
  if (err == ECONNABORTED)
    log(L_WARN "BGP: Incoming connection aborted");
  else
895
    log(L_ERR "BGP: Error on listening socket: %M", err);
896 897
}

Ondřej Zajíček's avatar
Ondřej Zajíček committed
898
static sock *
899
bgp_setup_listen_sk(ip_addr addr, unsigned port, u32 flags)
900
{
Ondřej Zajíček's avatar
Ondřej Zajíček committed
901
  sock *s = sk_new(&root_pool);
902
  DBG("BGP: Creating listening socket\n");
Ondřej Zajíček's avatar
Ondřej Zajíček committed
903
  s->type = SK_TCP_PASSIVE;
904
  s->ttl = 255;
905 906
  s->saddr = addr;
  s->sport = port ? port : BGP_PORT;
907
  s->flags = flags ? 0 : SKF_V6ONLY;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
908 909 910 911
  s->tos = IP_PREC_INTERNET_CONTROL;
  s->rbsize = BGP_RX_BUFFER_SIZE;
  s->tbsize = BGP_TX_BUFFER_SIZE;
  s->rx_hook = bgp_incoming_connection;
912
  s->err_hook = bgp_listen_sock_err;
913 914

  if (sk_open(s) < 0)
915
    goto err;
916 917

  return s;
918 919 920 921 922 923

 err:
  sk_log_error(s, "BGP");
  log(L_ERR "BGP: Cannot open listening socket");
  rfree(s);
  return NULL;
924 925 926 927 928
}

static void
bgp_start_neighbor(struct bgp_proto *p)
{
929 930 931
  /* Called only for single-hop BGP sessions */

  if (ipa_zero(p->source_addr))
932
    p->source_addr = p->neigh->ifa->ip;
933

934 935 936
#ifdef IPV6
  {
    struct ifa *a;
937
    p->local_link = IPA_NONE;
938 939 940 941 942 943
    WALK_LIST(a, p->neigh->iface->addrs)
      if (a->scope == SCOPE_LINK)
        {
	  p->local_link = a->ip;
	  break;
	}
944 945 946 947

    if (! ipa_nonzero(p->local_link))
      log(L_WARN "%s: Missing link local address on interface %s", p->p.name,  p->neigh->iface->name);

948 949 950
    DBG("BGP: Selected link-level address %I\n", p->local_link);
  }
#endif
Ondřej Zajíček's avatar
Ondřej Zajíček committed
951

952
  bgp_initiate(p);
953 954 955 956 957 958
}

/*
 * Neighbor notification hook (installed as P->neigh_notify in bgp_init()).
 *
 * @n: neighbor entry whose state changed
 *
 * Notifications for entries other than p->neigh, or arriving while the
 * protocol is in PS_DOWN or PS_STOP, are ignored. Otherwise:
 *  - neighbor out of scope: stop the session (unless still preparing),
 *  - link down while check_link is configured: stop the session (unless still
 *    preparing), updating the startup delay first if the protocol was up,
 *  - neighbor reachable while still preparing: start the session.
 */
static void
bgp_neigh_notify(neighbor *n)
{
  struct bgp_proto *p = (struct bgp_proto *) n->proto;
  int ps = p->p.proto_state;

  if (n != p->neigh)
    return;

  if ((ps == PS_DOWN) || (ps == PS_STOP))
    return;

  /* True while we are still waiting for the neighbor before the first start */
  int prepare = (ps == PS_START) && (p->start_state == BSS_PREPARE);

  if (n->scope <= 0)
    {
      if (!prepare)
        {
          BGP_TRACE(D_EVENTS, "Neighbor lost");
          bgp_store_error(p, NULL, BE_MISC, BEM_NEIGHBOR_LOST);
          /* Perhaps also run bgp_update_startup_delay(p)? */
          bgp_stop(p, 0, NULL, 0);
        }
    }
  else if (p->cf->check_link && !(n->iface->flags & IF_LINK_UP))
    {
      if (!prepare)
        {
          BGP_TRACE(D_EVENTS, "Link down");
          bgp_store_error(p, NULL, BE_MISC, BEM_LINK_DOWN);
          if (ps == PS_UP)
            bgp_update_startup_delay(p);
          bgp_stop(p, 0, NULL, 0);
        }
    }
  else
    {
      if (prepare)
        {
          BGP_TRACE(D_EVENTS, "Neighbor ready");
          bgp_start_neighbor(p);
        }
    }
}

1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011
/*
 * BFD notification hook (passed to bfd_request_session() by bgp_update_bfd()).
 *
 * @req: BFD request whose state changed; req->data is the BGP instance
 *
 * When the BFD session is reported down while the protocol is in PS_START or
 * PS_UP, the error is recorded and the BGP session is stopped. The startup
 * delay is updated only if the protocol was actually up.
 */
static void
bgp_bfd_notify(struct bfd_request *req)
{
  struct bgp_proto *p = req->data;
  int ps = p->p.proto_state;

  if (req->down && ((ps == PS_START) || (ps == PS_UP)))
    {
      BGP_TRACE(D_EVENTS, "BFD session down");
      bgp_store_error(p, NULL, BE_MISC, BEM_BFD_DOWN);
      if (ps == PS_UP)
        bgp_update_startup_delay(p);
      bgp_stop(p, 0, NULL, 0);
    }
}

/*
 * Bring the BFD request of a BGP instance in line with the wanted setting.
 *
 * @p: BGP instance
 * @use_bfd: non-zero if a BFD session should exist
 *
 * A request is created when wanted and missing, and released when present
 * but no longer wanted. Multihop sessions request BFD without an interface.
 */
static void
bgp_update_bfd(struct bgp_proto *p, int use_bfd)
{
  if (use_bfd)
    {
      /* Wanted: create the request unless we already hold one */
      if (!p->bfd_req)
        p->bfd_req = bfd_request_session(p->p.pool, p->cf->remote_ip, p->source_addr,
                                         p->cf->multihop ? NULL : p->neigh->iface,
                                         bgp_bfd_notify, p);
    }
  else if (p->bfd_req)
    {
      /* Not wanted any more: drop the existing request */
      rfree(p->bfd_req);
      p->bfd_req = NULL;
    }
}

1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041
static int
bgp_reload_routes(struct proto *P)
{
  struct bgp_proto *p = (struct bgp_proto *) P;
  if (!p->conn || !p->conn->peer_refresh_support)
    return 0;

  bgp_schedule_packet(p->conn, PKT_ROUTE_REFRESH);
  return 1;
}

1042
static void
1043
bgp_feed_begin(struct proto *P, int initial)
1044 1045
{
  struct bgp_proto *p = (struct bgp_proto *) P;
1046 1047 1048

  /* This should not happen */
  if (!p->conn)
1049 1050
    return;

1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088
  if (initial && p->cf->gr_mode)
    p->feed_state = BFS_LOADING;

  /* It is refeed and both sides support enhanced route refresh */
  if (!initial && p->cf->enable_refresh &&
      p->conn->peer_enhanced_refresh_support)
    {
      /* BoRR must not be sent before End-of-RIB */
      if (p->feed_state == BFS_LOADING || p->feed_state == BFS_LOADED)
	return;

      p->feed_state = BFS_REFRESHING;
      bgp_schedule_packet(p->conn, PKT_BEGIN_REFRESH);
    }
}

static void
bgp_feed_end(struct proto *P)
{
  struct bgp_proto *p = (struct bgp_proto *) P;

  /* This should not happen */
  if (!p->conn)
    return;

  /* Non-demarcated feed ended, nothing to do */
  if (p->feed_state == BFS_NONE)
    return;

  /* Schedule End-of-RIB packet */
  if (p->feed_state == BFS_LOADING)
    p->feed_state = BFS_LOADED;

  /* Schedule EoRR packet */
  if (p->feed_state == BFS_REFRESHING)
    p->feed_state = BFS_REFRESHED;

  /* Kick TX hook */
1089 1090 1091
  bgp_schedule_packet(p->conn, PKT_UPDATE);
}

1092

1093 1094 1095 1096 1097 1098
static void
bgp_start_locked(struct object_lock *lock)
{
  struct bgp_proto *p = lock->data;
  struct bgp_config *cf = p->cf;

Ondřej Zajíček's avatar
Ondřej Zajíček committed
1099 1100 1101
  if (p->p.proto_state != PS_START)
    {
      DBG("BGP: Got lock in different state %d\n", p->p.proto_state);
1102
      return;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1103 1104
    }

1105
  DBG("BGP: Got lock\n");
1106

1107
  if (cf->multihop)
1108
    {
1109 1110 1111
      /* Multi-hop sessions do not use neighbor entries */
      bgp_initiate(p);
      return;
1112 1113
    }

1114 1115
  neighbor *n = neigh_find2(&p->p, &cf->remote_ip, cf->iface, NEF_STICKY);
  if (!n)
1116
    {
1117
      log(L_ERR "%s: Invalid remote address %I%J", p->p.name, cf->remote_ip, cf->iface);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1118
      /* As we do not start yet, we can just disable protocol */
1119
      p->p.disabled = 1;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1120
      bgp_store_error(p, NULL, BE_MISC, BEM_INVALID_NEXT_HOP);
1121
      proto_notify_state(&p->p, PS_DOWN);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1122
      return;
1123
    }
1124 1125 1126 1127

  p->neigh = n;

  if (n->scope <= 0)
1128
    BGP_TRACE(D_EVENTS, "Waiting for %I%J to become my neighbor", cf->remote_ip, cf->iface);
1129 1130 1131 1132
  else if (p->cf->check_link && !(n->iface->flags & IF_LINK_UP))
    BGP_TRACE(D_EVENTS, "Waiting for link on %s", n->iface->name);
  else
    bgp_start_neighbor(p);
1133 1134
}

1135 1136 1137
static int
bgp_start(struct proto *P)
{
1138 1139 1140
  struct bgp_proto *p = (struct bgp_proto *) P;
  struct object_lock *lock;

1141
  DBG("BGP: Startup.\n");
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1142
  p->start_state = BSS_PREPARE;
1143 1144
  p->outgoing_conn.state = BS_IDLE;
  p->incoming_conn.state = BS_IDLE;
1145
  p->neigh = NULL;
1146
  p->bfd_req = NULL;
1147 1148
  p->gr_ready = 0;
  p->gr_active = 0;
1149

1150 1151
  rt_lock_table(p->igp_table);

Ondřej Zajíček's avatar
Ondřej Zajíček committed
1152 1153 1154
  p->event = ev_new(p->p.pool);
  p->event->hook = bgp_decision;
  p->event->data = p;
1155

1156 1157 1158 1159
  p->startup_timer = tm_new(p->p.pool);
  p->startup_timer->hook = bgp_startup_timeout;
  p->startup_timer->data = p;

1160 1161 1162 1163
  p->gr_timer = tm_new(p->p.pool);
  p->gr_timer->hook = bgp_graceful_restart_timeout;
  p->gr_timer->data = p;

1164 1165 1166 1167
  p->local_id = proto_get_router_id(P->cf);
  if (p->rr_client)
    p->rr_cluster_id = p->cf->rr_cluster_id ? p->cf->rr_cluster_id : p->local_id;

1168 1169 1170
  p->remote_id = 0;
  p->source_addr = p->cf->source_addr;

1171
  if (p->p.gr_recovery && p->cf->gr_mode)
1172 1173
    proto_graceful_restart_lock(P);

1174 1175 1176 1177 1178 1179 1180 1181
  /*
   *  Before attempting to create the connection, we need to lock the
   *  port, so that are sure we're the only instance attempting to talk
   *  with that neighbor.
   */

  lock = p->lock = olock_new(P->pool);
  lock->addr = p->cf->remote_ip;
1182
  lock->port = p->cf->remote_port;
1183
  lock->iface = p->cf->iface;
1184
  lock->vrf = p->cf->iface ? NULL : p->p.vrf;
1185 1186 1187 1188
  lock->type = OBJLOCK_TCP;
  lock->hook = bgp_start_locked;
  lock->data = p;
  olock_acquire(lock);
1189

1190
  return PS_START;
1191 1192
}

1193 1194
/* Defined outside this file; bgp_shutdown() reads it to tell a restart from a plain shutdown */
extern int proto_restart;

1195 1196 1197
static int
bgp_shutdown(struct proto *P)
{
1198
  struct bgp_proto *p = (struct bgp_proto *) P;
1199 1200 1201 1202 1203
  uint subcode = 0;

  char *message = NULL;
  byte *data = NULL;
  uint len = 0;
1204

Martin Mareš's avatar
Martin Mareš committed
1205
  BGP_TRACE(D_EVENTS, "Shutdown requested");
1206

1207
  switch (P->down_code)
1208
    {
1209 1210 1211 1212 1213 1214 1215 1216 1217 1218
    case PDC_CF_REMOVE:
    case PDC_CF_DISABLE:
      subcode = 3; // Errcode 6, 3 - peer de-configured
      break;

    case PDC_CF_RESTART:
      subcode = 6; // Errcode 6, 6 - other configuration change
      break;

    case PDC_CMD_DISABLE:
1219
    case PDC_CMD_SHUTDOWN:
1220
      subcode = 2; // Errcode 6, 2 - administrative shutdown
1221
      message = P->message;
1222 1223 1224 1225
      break;

    case PDC_CMD_RESTART:
      subcode = 4; // Errcode 6, 4 - administrative reset
1226
      message = P->message;
1227 1228
      break;

1229
    case PDC_RX_LIMIT_HIT:
1230 1231
    case PDC_IN_LIMIT_HIT:
      subcode = 1; // Errcode 6, 1 - max number of prefixes reached
1232
      /* log message for compatibility */
1233
      log(L_WARN "%s: Route limit exceeded, shutting down", p->p.name);
1234 1235 1236 1237
      goto limit;

    case PDC_OUT_LIMIT_HIT:
      subcode = proto_restart ? 4 : 2; // Administrative reset or shutdown
1238

1239
    limit:
1240
      bgp_store_error(p, NULL, BE_AUTO_DOWN, BEA_ROUTE_LIMIT_EXCEEDED);
1241
      if (proto_restart)
1242
	bgp_update_startup_delay(p);
1243
      else
1244 1245
	p->startup_delay = 0;
      goto done;
1246 1247
    }

1248
  bgp_store_error(p, NULL, BE_MAN_DOWN, 0);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1249
  p->startup_delay = 0;
1250

1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266
  /* RFC 8203 - shutdown communication */
  if (message)
  {
    uint msg_len = strlen(message);
    msg_len = MIN(msg_len, 128);

    /* Buffer will be freed automatically by protocol shutdown */
    data = mb_alloc(p->p.pool, msg_len + 1);
    len = msg_len + 1;

    data[0] = msg_len;
    memcpy(data+1, message, msg_len);
  }

done:
  bgp_stop(p, subcode, data, len);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1267
  return p->p.proto_state;
1268 1269
}

1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282
/*
 * Protocol cleanup hook: drop the reference to the IGP table that
 * bgp_start() took with rt_lock_table().
 */
static void
bgp_cleanup(struct proto *P)
{
  rt_unlock_table(((struct bgp_proto *) P)->igp_table);
}

/*
 * Pick the IGP table for a BGP configuration: the explicitly configured one
 * if present, otherwise the table the protocol itself is attached to.
 */
static rtable *
get_igp_table(struct bgp_config *cf)
{
  if (cf->igp_table)
    return cf->igp_table->table;

  return cf->c.table->table;
}

1283 1284 1285 1286
static struct proto *
bgp_init(struct proto_config *C)
{
  struct proto *P = proto_new(C, sizeof(struct bgp_proto));
1287
  struct bgp_config *c = (struct bgp_config *) C;
1288 1289
  struct bgp_proto *p = (struct bgp_proto *) P;

1290
  P->accept_ra_types = c->secondary ? RA_ACCEPTED : RA_OPTIMAL;
1291 1292 1293
  P->rt_notify = bgp_rt_notify;
  P->import_control = bgp_import_control;
  P->neigh_notify = bgp_neigh_notify;
1294
  P->reload_routes = bgp_reload_routes;
1295 1296
  P->feed_begin = bgp_feed_begin;
  P->feed_end = bgp_feed_end;
1297
  P->rte_better = bgp_rte_better;
1298
  P->rte_mergable = bgp_rte_mergable;
1299
  P->rte_recalculate = c->deterministic_med ? bgp_rte_recalculate : NULL;
1300

1301 1302 1303 1304
  p->cf = c;
  p->local_as = c->local_as;
  p->remote_as = c->remote_as;
  p->is_internal = (c->local_as == c->remote_as);
1305 1306
  p->rs_client = c->rs_client;
  p->rr_client = c->rr_client;
1307
  p->igp_table = get_igp_table(c);
1308

1309 1310 1311
  return P;
}

1312 1313 1314 1315 1316 1317 1318 1319 1320 1321

/*
 * Fill in context-dependent defaults of a BGP configuration and validate it.
 *
 * @c: BGP configuration to check (modified in place)
 *
 * Templates are not checked at all. For real protocols, defaults that depend
 * on other options (multihop, gw_mode, missing_lladdr, import limit action)
 * are resolved first; every inconsistency found afterwards is reported
 * through cf_error().
 */
void
bgp_check_config(struct bgp_config *c)
{
  int internal = (c->local_as == c->remote_as);

  /* Do not check templates at all */
  if (c->c.class == SYM_TEMPLATE)
    return;


  /* EBGP direct by default, IBGP multihop by default */
  if (c->multihop < 0)
    c->multihop = internal ? 64 : 0;

  /* Different default for gw_mode */
  if (!c->gw_mode)
    c->gw_mode = c->multihop ? GW_RECURSIVE : GW_DIRECT;

  /* Different default based on rs_client */
  if (!c->missing_lladdr)
    c->missing_lladdr = c->rs_client ? MLL_IGNORE : MLL_SELF;

  /* Disable after error incompatible with restart limit action */
  if (c->c.in_limit && (c->c.in_limit->action == PLA_RESTART) && c->disable_after_error)
    c->c.in_limit->action = PLA_DISABLE;


  if (!c->local_as)
    cf_error("Local AS number must be set");

  if (ipa_zero(c->remote_ip))
    cf_error("Neighbor must be configured");

  if (!c->remote_as)
    cf_error("Remote AS number must be set");

  if (ipa_is_link_local(c->remote_ip) && !c->iface)
    cf_error("Link-local neighbor address requires specified interface");

  /* AS numbers above 16 bits need both capabilities and AS4 enabled */
  if (!(c->capabilities && c->enable_as4) && (c->remote_as > 0xFFFF))
    cf_error("Neighbor AS number out of range (AS4 not available)");

  if (!internal && c->rr_client)
    cf_error("Only internal neighbor can be RR client");

  if (internal && c->rs_client)
    cf_error("Only external neighbor can be RS client");

  if (c->multihop && (c->gw_mode == GW_DIRECT))
    cf_error("Multihop BGP cannot use direct gateway mode");

  if (c->multihop && (ipa_is_link_local(c->remote_ip) ||
                      ipa_is_link_local(c->source_addr)))
    cf_error("Multihop BGP cannot be used with link-local addresses");

  if (c->multihop && c->iface)
    cf_error("Multihop BGP cannot be bound to interface");

  if (c->multihop && c->check_link)
    cf_error("Multihop BGP cannot depend on link state");

  if (c->multihop && c->bfd && ipa_zero(c->source_addr))
    cf_error("Multihop BGP with BFD requires specified source address");

  if ((c->gw_mode == GW_RECURSIVE) && c->c.table->sorted)
    cf_error("BGP in recursive mode prohibits sorted table");

  if (c->deterministic_med && c->c.table->sorted)
    cf_error("BGP with deterministic MED prohibits sorted table");

  if (c->secondary && !c->c.table->sorted)
    cf_error("BGP with secondary option requires sorted table");
}

static int
bgp_reconfigure(struct proto *P, struct proto_config *C)
{
  struct bgp_config *new = (struct bgp_config *) C;
  struct bgp_proto *p = (struct bgp_proto *) P;
  struct bgp_config *old = p->cf;

1394 1395 1396
  if (proto_get_router_id(C) != p->local_id)
    return 0;

1397 1398 1399 1400 1401 1402 1403 1404
  int same = !memcmp(((byte *) old) + sizeof(struct proto_config),
		     ((byte *) new) + sizeof(struct proto_config),
		     // password item is last and must be checked separately
		     OFFSETOF(struct bgp_config, password) - sizeof(struct proto_config))
    && ((!old->password && !new->password)
	|| (old->password && new->password && !strcmp(old->password, new->password)))
    && (get_igp_table(old) == get_igp_table(new));

1405 1406 1407
  if (same && (p->start_state > BSS_PREPARE))
    bgp_update_bfd(p, new->bfd);

1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422
  /* We should update our copy of configuration ptr as old configuration will be freed */
  if (same)
    p->cf = new;

  return same;
}

/*
 * Copy the BGP-specific part of a configuration from @src to @dest.
 * The copy is shallow and delegated to proto_copy_rest().
 */
static void
bgp_copy_config(struct proto_config *dest, struct proto_config *src)
{
  proto_copy_rest(dest, src, sizeof(struct bgp_config));
}


Martin Mareš's avatar
Martin Mareš committed
1423 1424 1425 1426
/**
 * bgp_error - report a protocol error
 * @c: connection
 * @code: error code (according to the RFC)
1427
 * @subcode: error sub-code
Martin Mareš's avatar
Martin Mareš committed
1428 1429 1430 1431
 * @data: data to be passed in the Notification message
 * @len: length of the data
 *
 * bgp_error() sends a notification packet to tell the other side that a protocol
1432
 * error has occurred (including the data considered erroneous if possible) and
Martin Mareš's avatar
Martin Mareš committed
1433 1434
 * closes the connection.
 */
1435
void
1436
bgp_error(struct bgp_conn *c, unsigned code, unsigned subcode, byte *data, int len)
1437
{
1438 1439
  struct bgp_proto *p = c->bgp;

Ondřej Zajíček's avatar
Ondřej Zajíček committed
1440
  if (c->state == BS_CLOSE)
1441
    return;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1442

1443 1444
  bgp_log_error(p, BE_BGP_TX, "Error", code, subcode, data, (len > 0) ? len : -len);
  bgp_store_error(p, c, BE_BGP_TX, (code << 16) | subcode);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1445 1446
  bgp_conn_enter_close_state(c);

1447 1448
  c->notify_code = code;
  c->notify_subcode = subcode;
1449 1450
  c->notify_data = data;
  c->notify_size = (len > 0) ? len : 0;
1451
  bgp_schedule_packet(c, PKT_NOTIFICATION);
1452 1453 1454 1455

  if (code != 6)
    {
      bgp_update_startup_delay(p);
1456
      bgp_stop(p, 0, NULL, 0);
1457
    }
1458 1459
}

Ondřej Zajíček's avatar
Ondřej Zajíček committed
1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487
/**
 * bgp_store_error - store last error for status report
 * @p: BGP instance
 * @c: connection the error belongs to (may be %NULL)
 * @class: error class (BE_xxx constants)
 * @code: error code (class specific)
 *
 * bgp_store_error() decides whether the given error is interesting enough
 * and, if so, stores it in the last_error variables of @p.
 */
void
bgp_store_error(struct bgp_proto *p, struct bgp_conn *c, u8 class, u32 code)
{
  int ps = p->p.proto_state;

  /* While stopping, keep the error that caused the transition to PS_STOP */
  if (ps == PS_STOP)
    return;

  /* While up, errors on a connection other than the primary one are ignored */
  if ((ps == PS_UP) && c && (c != p->conn))
    return;

  p->last_error_code = code;
  p->last_error_class = class;
}

/*
 * Human-readable names used in status reports.
 * NOTE(review): presumably indexed by the BS_*, BE_*, BEM_* and BEA_*
 * constants respectively - confirm against bgp.h before reordering.
 */
static char *bgp_state_names[] = { "Idle", "Connect", "Active", "OpenSent", "OpenConfirm", "Established", "Close" };
static char *bgp_err_classes[] = { "", "Error: ", "Socket: ", "Received: ", "BGP Error: ", "Automatic shutdown: ", ""};
static char *bgp_misc_errors[] = { "", "Neighbor lost", "Invalid next hop", "Kernel MD5 auth failed", "No listening socket", "Link down", "BFD session down", "Graceful restart"};
static char *bgp_auto_errors[] = { "", "Route limit exceeded"};
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1491

1492 1493
static const char *
bgp_last_errmsg(struct bgp_proto *p)
1494
{
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1495