bgp.c 58.4 KB
Newer Older
1 2 3 4
/*
 *	BIRD -- The Border Gateway Protocol
 *
 *	(c) 2000 Martin Mares <mj@ucw.cz>
Ondřej Zajíček's avatar
Ondřej Zajíček committed
5 6
 *	(c) 2008--2016 Ondrej Zajicek <santiago@crfreenet.org>
 *	(c) 2008--2016 CZ.NIC z.s.p.o.
7 8 9 10
 *
 *	Can be freely distributed and used under the terms of the GNU GPL.
 */

Martin Mareš's avatar
Martin Mareš committed
11 12 13
/**
 * DOC: Border Gateway Protocol
 *
Ondřej Zajíček's avatar
Ondřej Zajíček committed
14 15
 * The BGP protocol is implemented in three parts: |bgp.c| which takes care of
 * the connection and most of the interface with BIRD core, |packets.c| handling
Martin Mareš's avatar
Martin Mareš committed
16 17 18
 * both incoming and outgoing BGP packets and |attrs.c| containing functions for
 * manipulation with BGP attribute lists.
 *
Ondřej Zajíček's avatar
Ondřej Zajíček committed
19 20 21 22 23 24
 * As opposed to the other existing routing daemons, BIRD has a sophisticated
 * core architecture which is able to keep all the information needed by BGP in
 * the primary routing table, therefore no complex data structures like a
 * central BGP table are needed. This increases memory footprint of a BGP router
 * with many connections, but not too much and, which is more important, it
 * makes BGP much easier to implement.
Martin Mareš's avatar
Martin Mareš committed
25
 *
Ondřej Zajíček's avatar
Ondřej Zajíček committed
26 27 28 29 30 31
 * Each instance of BGP (corresponding to a single BGP peer) is described by a
 * &bgp_proto structure to which are attached individual connections represented
 * by &bgp_connection (usually, there exists only one connection, but during BGP
 * session setup, there can be more of them). The connections are handled
 * according to the BGP state machine defined in the RFC with all the timers and
 * all the parameters configurable.
Martin Mareš's avatar
Martin Mareš committed
32
 *
Ondřej Zajíček's avatar
Ondřej Zajíček committed
33 34 35 36
 * In incoming direction, we listen on the connection's socket and each time we
 * receive some input, we pass it to bgp_rx(). It decodes packet headers and the
 * markers and passes complete packets to bgp_rx_packet() which distributes the
 * packet according to its type.
Martin Mareš's avatar
Martin Mareš committed
37
 *
Ondřej Zajíček's avatar
Ondřej Zajíček committed
38 39 40 41 42 43 44 45 46 47 48 49 50 51 52
 * In outgoing direction, we gather all the routing updates and sort them to
 * buckets (&bgp_bucket) according to their attributes (we keep a hash table for
 * fast comparison of &rta's and a &fib which helps us to find if we already
 * have another route for the same destination queued for sending, so that we
 * can replace it with the new one immediately instead of sending both
 * updates). There also exists a special bucket holding all the route
 * withdrawals which cannot be queued anywhere else as they don't have any
 * attributes. If we have any packet to send (due to either new routes or the
 * connection tracking code wanting to send an Open, Keepalive or Notification
 * message), we call bgp_schedule_packet() which sets the corresponding bit in a
 * @packet_to_send bit field in &bgp_conn and as soon as the transmit socket
 * buffer becomes empty, we call bgp_fire_tx(). It inspects state of all the
 * packet type bits and calls the corresponding bgp_create_xx() functions,
 * eventually rescheduling the same packet type if we have more data of the same
 * type to send.
Martin Mareš's avatar
Martin Mareš committed
53
 *
Ondřej Zajíček's avatar
Ondřej Zajíček committed
54 55 56 57 58 59
 * The processing of attributes consists of two functions: bgp_decode_attrs()
 * for checking of the attribute blocks and translating them to the language of
 * BIRD's extended attributes and bgp_encode_attrs() which does the
 * converse. Both functions are built around a @bgp_attr_table array describing
 * all important characteristics of all known attributes.  Unknown transitive
 * attributes are attached to the route as %EAF_TYPE_OPAQUE byte streams.
60 61 62 63 64 65 66 67 68 69
 *
 * BGP protocol implements graceful restart in both restarting (local restart)
 * and receiving (neighbor restart) roles. The first is handled mostly by the
 * graceful restart code in the nest, BGP protocol just handles capabilities,
 * sets @gr_wait and locks graceful restart until end-of-RIB mark is received.
 * The second is implemented by internal restart of the BGP state to %BS_IDLE
 * and protocol state to %PS_START, but keeping the protocol up from the core
 * point of view and therefore maintaining received routes. Routing table
 * refresh cycle (rt_refresh_begin(), rt_refresh_end()) is used for removing
 * stale routes after reestablishment of BGP session during graceful restart.
70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
 *
 * Supported standards:
 * <itemize>
 * <item> <rfc id="4271"> - Border Gateway Protocol 4 (BGP)
 * <item> <rfc id="1997"> - BGP Communities Attribute
 * <item> <rfc id="2385"> - Protection of BGP Sessions via TCP MD5 Signature
 * <item> <rfc id="2545"> - Use of BGP Multiprotocol Extensions for IPv6
 * <item> <rfc id="2918"> - Route Refresh Capability
 * <item> <rfc id="3107"> - Carrying Label Information in BGP
 * <item> <rfc id="4360"> - BGP Extended Communities Attribute
 * <item> <rfc id="4364"> - BGP/MPLS IPv4 Virtual Private Networks
 * <item> <rfc id="4456"> - BGP Route Reflection
 * <item> <rfc id="4486"> - Subcodes for BGP Cease Notification Message
 * <item> <rfc id="4659"> - BGP/MPLS IPv6 Virtual Private Networks
 * <item> <rfc id="4724"> - Graceful Restart Mechanism for BGP
 * <item> <rfc id="4760"> - Multiprotocol extensions for BGP
 * <item> <rfc id="4798"> - Connecting IPv6 Islands over IPv4 MPLS
 * <item> <rfc id="5065"> - AS confederations for BGP
 * <item> <rfc id="5082"> - Generalized TTL Security Mechanism
 * <item> <rfc id="5492"> - Capabilities Advertisement with BGP
 * <item> <rfc id="5549"> - Advertising IPv4 NLRI with an IPv6 Next Hop
 * <item> <rfc id="5575"> - Dissemination of Flow Specification Rules
 * <item> <rfc id="5668"> - 4-Octet AS Specific BGP Extended Community
 * <item> <rfc id="6286"> - AS-Wide Unique BGP Identifier
 * <item> <rfc id="6608"> - Subcodes for BGP Finite State Machine Error
 * <item> <rfc id="6793"> - BGP Support for 4-Octet AS Numbers
 * <item> <rfc id="7313"> - Enhanced Route Refresh Capability for BGP
 * <item> <rfc id="7606"> - Revised Error Handling for BGP UPDATE Messages
 * <item> <rfc id="7911"> - Advertisement of Multiple Paths in BGP
 * <item> <rfc id="7947"> - Internet Exchange BGP Route Server
 * <item> <rfc id="8092"> - BGP Large Communities Attribute
Ondřej Zajíček's avatar
Ondřej Zajíček committed
101
 * <item> <rfc id="8203"> - BGP Administrative Shutdown Communication
102
 * <item> <rfc id="8212"> - Default EBGP Route Propagation Behavior without Policies
103 104
 * </itemize>
*/
Martin Mareš's avatar
Martin Mareš committed
105

106
#undef LOCAL_DEBUG
107

108 109
#include <stdlib.h>

110 111 112 113
#include "nest/bird.h"
#include "nest/iface.h"
#include "nest/protocol.h"
#include "nest/route.h"
114
#include "nest/cli.h"
115
#include "nest/locks.h"
116
#include "conf/conf.h"
117
#include "filter/filter.h"
118
#include "lib/socket.h"
119
#include "lib/resource.h"
Martin Mareš's avatar
Martin Mareš committed
120
#include "lib/string.h"
121 122 123

#include "bgp.h"

Ondřej Zajíček's avatar
Ondřej Zajíček committed
124

125
struct linpool *bgp_linpool;		/* Global temporary pool */
126
struct linpool *bgp_linpool2;		/* Global temporary pool for bgp_rt_notify() */
Ondřej Zajíček's avatar
Ondřej Zajíček committed
127 128
static list bgp_sockets;		/* Global list of listening sockets */

129 130

static void bgp_connect(struct bgp_proto *p);
131
static void bgp_active(struct bgp_proto *p);
132
static void bgp_update_bfd(struct bgp_proto *p, int use_bfd);
133

Ondřej Zajíček's avatar
Ondřej Zajíček committed
134 135
static int bgp_incoming_connection(sock *sk, uint dummy UNUSED);
static void bgp_listen_sock_err(sock *sk UNUSED, int err);
136

Ondřej Zajíček's avatar
Ondřej Zajíček committed
137 138 139 140
/**
 * bgp_open - open a BGP instance
 * @p: BGP instance
 *
Ondřej Zajíček's avatar
Ondřej Zajíček committed
141 142 143 144
 * This function allocates and configures shared BGP resources, mainly listening
 * sockets. Should be called as the last step during initialization (when lock
 * is acquired and neighbor is ready). When error, caller should change state to
 * PS_DOWN and return immediately.
Ondřej Zajíček's avatar
Ondřej Zajíček committed
145 146 147 148
 */
static int
bgp_open(struct bgp_proto *p)
{
Ondřej Zajíček's avatar
Ondřej Zajíček committed
149 150 151 152 153 154 155 156 157
  struct bgp_socket *bs = NULL;
  struct iface *ifa = p->cf->strict_bind ? p->cf->iface : NULL;
  ip_addr addr = p->cf->strict_bind ? p->cf->local_ip :
    (ipa_is_ip4(p->cf->remote_ip) ? IPA_NONE4 : IPA_NONE6);
  uint port = p->cf->local_port;

  /* FIXME: Add some global init? */
  if (!bgp_linpool)
    init_list(&bgp_sockets);
158

Ondřej Zajíček's avatar
Ondřej Zajíček committed
159
  /* We assume that cf->iface is defined iff cf->local_ip is link-local */
Ondřej Zajíček's avatar
Ondřej Zajíček committed
160

Ondřej Zajíček's avatar
Ondřej Zajíček committed
161 162
  WALK_LIST(bs, bgp_sockets)
    if (ipa_equal(bs->sk->saddr, addr) && (bs->sk->iface == ifa) && (bs->sk->sport == port))
163
    {
Ondřej Zajíček's avatar
Ondřej Zajíček committed
164 165 166
      bs->uc++;
      p->sock = bs;
      return 0;
167 168
    }

Ondřej Zajíček's avatar
Ondřej Zajíček committed
169 170 171 172 173 174 175 176 177 178 179 180 181 182
  sock *sk = sk_new(proto_pool);
  sk->type = SK_TCP_PASSIVE;
  sk->ttl = 255;
  sk->saddr = addr;
  sk->sport = port;
  sk->flags = 0;
  sk->tos = IP_PREC_INTERNET_CONTROL;
  sk->rbsize = BGP_RX_BUFFER_SIZE;
  sk->tbsize = BGP_TX_BUFFER_SIZE;
  sk->rx_hook = bgp_incoming_connection;
  sk->err_hook = bgp_listen_sock_err;

  if (sk_open(sk) < 0)
    goto err;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
183

Ondřej Zajíček's avatar
Ondřej Zajíček committed
184 185 186 187
  bs = mb_allocz(proto_pool, sizeof(struct bgp_socket));
  bs->sk = sk;
  bs->uc = 1;
  p->sock = bs;
188

Ondřej Zajíček's avatar
Ondřej Zajíček committed
189 190 191
  add_tail(&bgp_sockets, &bs->n);

  if (!bgp_linpool)
192
  {
193 194
    bgp_linpool  = lp_new_default(proto_pool);
    bgp_linpool2 = lp_new_default(proto_pool);
195
  }
Ondřej Zajíček's avatar
Ondřej Zajíček committed
196 197

  return 0;
198 199

err:
Ondřej Zajíček's avatar
Ondřej Zajíček committed
200 201 202
  sk_log_error(sk, p->p.name);
  log(L_ERR "%s: Cannot open listening socket", p->p.name);
  rfree(sk);
203
  return -1;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
204 205
}

Ondřej Zajíček's avatar
Ondřej Zajíček committed
206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230
/**
 * bgp_close - close a BGP instance
 * @p: BGP instance
 *
 * This function frees and deconfigures shared BGP resources. It drops one
 * reference to the listening socket used by @p; the socket is freed when its
 * use count reaches zero, and the global linpools are freed once no listening
 * socket remains.
 */
static void
bgp_close(struct bgp_proto *p)
{
  struct bgp_socket *bs = p->sock;

  ASSERT(bs && bs->uc);

  /* Still used by another instance */
  if (--bs->uc)
    return;

  rfree(bs->sk);
  rem_node(&bs->n);
  mb_free(bs);

  /* Other listening sockets still exist, keep the global pools */
  if (!EMPTY_LIST(bgp_sockets))
    return;

  rfree(bgp_linpool);
  bgp_linpool = NULL;

  rfree(bgp_linpool2);
  bgp_linpool2 = NULL;
}

static inline int
bgp_setup_auth(struct bgp_proto *p, int enable)
{
  if (p->cf->password)
  {
    int rv = sk_set_md5_auth(p->sock->sk,
			     p->cf->local_ip, p->cf->remote_ip, p->cf->iface,
			     enable ? p->cf->password : NULL, p->cf->setkey);

    if (rv < 0)
      sk_log_error(p->sock->sk, p->p.name);

    return rv;
  }
  else
    return 0;
}

/* Return the channel of @p whose AFI value equals @afi, or NULL if there is none */
static inline struct bgp_channel *
bgp_find_channel(struct bgp_proto *p, u32 afi)
{
  struct bgp_channel *ch;

  WALK_LIST(ch, p->p.channels)
  {
    if (ch->afi != afi)
      continue;

    return ch;
  }

  return NULL;
}

265 266 267 268
static void
bgp_startup(struct bgp_proto *p)
{
  BGP_TRACE(D_EVENTS, "Started");
Ondřej Zajíček's avatar
Ondřej Zajíček committed
269
  p->start_state = BSS_CONNECT;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
270 271 272

  if (!p->cf->passive)
    bgp_active(p);
273 274 275 276 277 278 279 280 281 282 283 284
}

/* Timer hook: the timer's data pointer carries the BGP instance to start */
static void
bgp_startup_timeout(timer *t)
{
  struct bgp_proto *p = t->data;

  bgp_startup(p);
}


static void
bgp_initiate(struct bgp_proto *p)
{
Ondřej Zajíček's avatar
Ondřej Zajíček committed
285 286 287 288 289 290 291
  int err_val;

  if (bgp_open(p) < 0)
  { err_val = BEM_NO_SOCKET; goto err1; }

  if (bgp_setup_auth(p, 1) < 0)
  { err_val = BEM_INVALID_MD5; goto err2; }
292

293 294 295
  if (p->cf->bfd)
    bgp_update_bfd(p, p->cf->bfd);

296
  if (p->startup_delay)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
297 298 299 300 301
  {
    p->start_state = BSS_DELAY;
    BGP_TRACE(D_EVENTS, "Startup delayed by %d seconds due to errors", p->startup_delay);
    bgp_start_timer(p->startup_timer, p->startup_delay);
  }
302 303 304
  else
    bgp_startup(p);

Ondřej Zajíček's avatar
Ondřej Zajíček committed
305
  return;
306

Ondřej Zajíček's avatar
Ondřej Zajíček committed
307 308 309 310 311 312
err2:
  bgp_close(p);
err1:
  p->p.disabled = 1;
  bgp_store_error(p, NULL, BE_MISC, err_val);
  proto_notify_state(&p->p, PS_DOWN);
313

Ondřej Zajíček's avatar
Ondřej Zajíček committed
314
  return;
315 316
}

Martin Mareš's avatar
Martin Mareš committed
317 318 319
/**
 * bgp_start_timer - start a BGP timer
 * @t: timer
320
 * @value: time (in seconds) to fire (0 to disable the timer)
Martin Mareš's avatar
Martin Mareš committed
321
 *
Ondřej Zajíček's avatar
Ondřej Zajíček committed
322 323 324
 * This functions calls tm_start() on @t with time @value and the amount of
 * randomization suggested by the BGP standard. Please use it for all BGP
 * timers.
Martin Mareš's avatar
Martin Mareš committed
325
 */
326
void
327
bgp_start_timer(timer *t, uint value)
328
{
329
  if (value)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
330
  {
331 332 333
    /* The randomization procedure is specified in RFC 4271 section 10 */
    btime time = value S;
    btime randomize = random() % ((time / 4) + 1);
334
    tm_start(t, time - randomize);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
335
  }
336
  else
337
    tm_stop(t);
338 339
}

Martin Mareš's avatar
Martin Mareš committed
340 341 342 343
/**
 * bgp_close_conn - close a BGP connection
 * @conn: connection to close
 *
Ondřej Zajíček's avatar
Ondřej Zajíček committed
344 345
 * This function takes a connection described by the &bgp_conn structure, closes
 * its socket and frees all resources associated with it.
Martin Mareš's avatar
Martin Mareš committed
346
 */
347 348 349
void
bgp_close_conn(struct bgp_conn *conn)
{
350
  // struct bgp_proto *p = conn->bgp;
351 352 353

  DBG("BGP: Closing connection\n");
  conn->packets_to_send = 0;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
354 355 356
  conn->channels_to_send = 0;
  rfree(conn->connect_timer);
  conn->connect_timer = NULL;
357 358 359 360
  rfree(conn->keepalive_timer);
  conn->keepalive_timer = NULL;
  rfree(conn->hold_timer);
  conn->hold_timer = NULL;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
361 362
  rfree(conn->tx_ev);
  conn->tx_ev = NULL;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
363 364 365 366 367 368 369
  rfree(conn->sk);
  conn->sk = NULL;

  mb_free(conn->local_caps);
  conn->local_caps = NULL;
  mb_free(conn->remote_caps);
  conn->remote_caps = NULL;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
370 371 372 373 374 375 376
}


/**
 * bgp_update_startup_delay - update a startup delay
 * @p: BGP instance
 *
Ondřej Zajíček's avatar
Ondřej Zajíček committed
377 378 379
 * This function updates a startup delay that is used to postpone next BGP
 * connect. It also handles disable_after_error and might stop BGP instance
 * when error happened and disable_after_error is on.
Ondřej Zajíček's avatar
Ondřej Zajíček committed
380 381 382 383
 *
 * It should be called when BGP protocol error happened.
 */
void
384
bgp_update_startup_delay(struct bgp_proto *p)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
385 386 387
{
  struct bgp_config *cf = p->cf;

388
  DBG("BGP: Updating startup delay\n");
Ondřej Zajíček's avatar
Ondřej Zajíček committed
389

390
  if (p->last_proto_error && ((current_time() - p->last_proto_error) >= cf->error_amnesia_time S))
391 392
    p->startup_delay = 0;

393
  p->last_proto_error = current_time();
Ondřej Zajíček's avatar
Ondřej Zajíček committed
394 395

  if (cf->disable_after_error)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
396 397 398 399 400
  {
    p->startup_delay = 0;
    p->p.disabled = 1;
    return;
  }
Ondřej Zajíček's avatar
Ondřej Zajíček committed
401 402 403 404

  if (!p->startup_delay)
    p->startup_delay = cf->error_delay_time_min;
  else
405
    p->startup_delay = MIN(2 * p->startup_delay, cf->error_delay_time_max);
406 407
}

Ondřej Zajíček's avatar
Ondřej Zajíček committed
408
static void
409
bgp_graceful_close_conn(struct bgp_conn *conn, uint subcode, byte *data, uint len)
410
{
Ondřej Zajíček's avatar
Ondřej Zajíček committed
411
  switch (conn->state)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
412 413 414 415 416 417 418 419 420 421 422 423 424
  {
  case BS_IDLE:
  case BS_CLOSE:
    return;

  case BS_CONNECT:
  case BS_ACTIVE:
    bgp_conn_enter_idle_state(conn);
    return;

  case BS_OPENSENT:
  case BS_OPENCONFIRM:
  case BS_ESTABLISHED:
425
    bgp_error(conn, 6, subcode, data, len);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
426 427 428 429 430
    return;

  default:
    bug("bgp_graceful_close_conn: Unknown state %d", conn->state);
  }
431 432
}

Ondřej Zajíček's avatar
Ondřej Zajíček committed
433 434 435 436
static void
bgp_down(struct bgp_proto *p)
{
  if (p->start_state > BSS_PREPARE)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
437 438 439 440
  {
    bgp_setup_auth(p, 0);
    bgp_close(p);
  }
Ondřej Zajíček's avatar
Ondřej Zajíček committed
441

442
  BGP_TRACE(D_EVENTS, "Down");
Ondřej Zajíček's avatar
Ondřej Zajíček committed
443 444 445 446 447 448 449 450 451
  proto_notify_state(&p->p, PS_DOWN);
}

static void
bgp_decision(void *vp)
{
  struct bgp_proto *p = vp;

  DBG("BGP: Decision start\n");
Ondřej Zajíček's avatar
Ondřej Zajíček committed
452 453 454 455
  if ((p->p.proto_state == PS_START) &&
      (p->outgoing_conn.state == BS_IDLE) &&
      (p->incoming_conn.state != BS_OPENCONFIRM) &&
      !p->cf->passive)
456
    bgp_active(p);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
457

Ondřej Zajíček's avatar
Ondřej Zajíček committed
458 459 460
  if ((p->p.proto_state == PS_STOP) &&
      (p->outgoing_conn.state == BS_IDLE) &&
      (p->incoming_conn.state == BS_IDLE))
Ondřej Zajíček's avatar
Ondřej Zajíček committed
461 462 463
    bgp_down(p);
}

464
void
465
bgp_stop(struct bgp_proto *p, uint subcode, byte *data, uint len)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
466 467
{
  proto_notify_state(&p->p, PS_STOP);
468 469
  bgp_graceful_close_conn(&p->outgoing_conn, subcode, data, len);
  bgp_graceful_close_conn(&p->incoming_conn, subcode, data, len);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
470 471 472
  ev_schedule(p->event);
}

473
static inline void
Ondřej Zajíček's avatar
Ondřej Zajíček committed
474
bgp_conn_set_state(struct bgp_conn *conn, uint new_state)
475 476 477 478 479 480 481 482 483 484 485 486 487 488
{
  if (conn->bgp->p.mrtdump & MD_STATES)
    mrt_dump_bgp_state_change(conn, conn->state, new_state);

  conn->state = new_state;
}

/*
 * Switch @conn to the OpenConfirm state. Only the state change happens here;
 * really, most of the work is done in bgp_rx_open().
 */
void
bgp_conn_enter_openconfirm_state(struct bgp_conn *conn)
{
  bgp_conn_set_state(conn, BS_OPENCONFIRM);
}

Ondřej Zajíček's avatar
Ondřej Zajíček committed
489 490
static const struct bgp_af_caps dummy_af_caps = { };

Ondřej Zajíček's avatar
Ondřej Zajíček committed
491 492 493 494
void
bgp_conn_enter_established_state(struct bgp_conn *conn)
{
  struct bgp_proto *p = conn->bgp;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
495 496 497
  struct bgp_caps *local = conn->local_caps;
  struct bgp_caps *peer = conn->remote_caps;
  struct bgp_channel *c;
498

Ondřej Zajíček's avatar
Ondřej Zajíček committed
499 500
  BGP_TRACE(D_EVENTS, "BGP session established");

501 502
  /* For multi-hop BGP sessions */
  if (ipa_zero(p->source_addr))
503
    p->source_addr = conn->sk->saddr;
504

505 506
  conn->sk->fast_rx = 0;

Ondřej Zajíček's avatar
Ondřej Zajíček committed
507 508 509
  p->conn = conn;
  p->last_error_class = 0;
  p->last_error_code = 0;
510

Ondřej Zajíček's avatar
Ondřej Zajíček committed
511 512 513 514
  p->as4_session = conn->as4_session;

  p->route_refresh = peer->route_refresh;
  p->enhanced_refresh = local->enhanced_refresh && peer->enhanced_refresh;
515

Ondřej Zajíček's avatar
Ondřej Zajíček committed
516 517
  /* Whether we may handle possible GR of peer (it has some AF GR-able) */
  p->gr_ready = 0;	/* Updated later */
518

Ondřej Zajíček's avatar
Ondřej Zajíček committed
519 520
  /* Whether peer is ready to handle our GR recovery */
  int peer_gr_ready = peer->gr_aware && !(peer->gr_flags & BGP_GRF_RESTART);
521

Ondřej Zajíček's avatar
Ondřej Zajíček committed
522
  if (p->gr_active_num)
523
    tm_stop(p->gr_timer);
524

Ondřej Zajíček's avatar
Ondřej Zajíček committed
525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551
  /* Number of active channels */
  int num = 0;

  WALK_LIST(c, p->p.channels)
  {
    const struct bgp_af_caps *loc = bgp_find_af_caps(local, c->afi);
    const struct bgp_af_caps *rem = bgp_find_af_caps(peer,  c->afi);

    /* Ignore AFIs that were not announced in multiprotocol capability */
    if (!loc || !loc->ready)
      loc = &dummy_af_caps;

    if (!rem || !rem->ready)
      rem = &dummy_af_caps;

    int active = loc->ready && rem->ready;
    c->c.disabled = !active;
    c->c.reloadable = p->route_refresh;

    c->index = active ? num++ : 0;

    c->feed_state = BFS_NONE;
    c->load_state = BFS_NONE;

    /* Channels where peer may do GR */
    c->gr_ready = active && local->gr_aware && rem->gr_able;
    p->gr_ready = p->gr_ready || c->gr_ready;
552

Ondřej Zajíček's avatar
Ondřej Zajíček committed
553 554 555
    /* Channels not able to recover gracefully */
    if (p->p.gr_recovery && (!active || !peer_gr_ready))
      channel_graceful_restart_unlock(&c->c);
556

Ondřej Zajíček's avatar
Ondřej Zajíček committed
557 558 559 560 561 562 563 564 565 566 567 568
    /* Channels waiting for local convergence */
    if (p->p.gr_recovery && loc->gr_able && peer_gr_ready)
      c->c.gr_wait = 1;

    /* Channels where peer is not able to recover gracefully */
    if (c->gr_active && ! (c->gr_ready && (rem->gr_af_flags & BGP_GRF_FORWARDING)))
      bgp_graceful_restart_done(c);

    /* GR capability implies that neighbor will send End-of-RIB */
    if (peer->gr_aware)
      c->load_state = BFS_LOADING;

569
    c->ext_next_hop = c->cf->ext_next_hop && (bgp_channel_is_ipv6(c) || rem->ext_next_hop);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
570 571 572
    c->add_path_rx = (loc->add_path & BGP_ADD_PATH_RX) && (rem->add_path & BGP_ADD_PATH_TX);
    c->add_path_tx = (loc->add_path & BGP_ADD_PATH_TX) && (rem->add_path & BGP_ADD_PATH_RX);

Ondřej Zajíček's avatar
Ondřej Zajíček committed
573
    /* Update RA mode */
Ondřej Zajíček's avatar
Ondřej Zajíček committed
574 575
    if (c->add_path_tx)
      c->c.ra_mode = RA_ANY;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
576 577 578 579
    else if (c->cf->secondary)
      c->c.ra_mode = RA_ACCEPTED;
    else
      c->c.ra_mode = RA_OPTIMAL;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595
  }

  p->afi_map = mb_alloc(p->p.pool, num * sizeof(u32));
  p->channel_map = mb_alloc(p->p.pool, num * sizeof(void *));
  p->channel_count = num;

  WALK_LIST(c, p->p.channels)
  {
    if (c->c.disabled)
      continue;

    p->afi_map[c->index] = c->afi;
    p->channel_map[c->index] = c;
  }

  /* proto_notify_state() will likely call bgp_feed_begin(), setting c->feed_state */
596

597
  bgp_conn_set_state(conn, BS_ESTABLISHED);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
598 599 600 601 602 603 604 605 606 607
  proto_notify_state(&p->p, PS_UP);
}

/*
 * Forget the established connection of @p. When the protocol is still in
 * PS_UP, it is stopped through bgp_stop() with zero subcode and no data.
 */
static void
bgp_conn_leave_established_state(struct bgp_proto *p)
{
  BGP_TRACE(D_EVENTS, "BGP session closed");
  p->conn = NULL;

  if (p->p.proto_state == PS_UP)
    bgp_stop(p, 0, NULL, 0);
}

void
bgp_conn_enter_close_state(struct bgp_conn *conn)
{
  struct bgp_proto *p = conn->bgp;
  int os = conn->state;

617
  bgp_conn_set_state(conn, BS_CLOSE);
618
  tm_stop(conn->keepalive_timer);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
619 620
  conn->sk->rx_hook = NULL;

621 622 623
  /* Timeout for CLOSE state, if we cannot send notification soon then we just hangup */
  bgp_start_timer(conn->hold_timer, 10);

Ondřej Zajíček's avatar
Ondřej Zajíček committed
624 625 626 627 628 629 630 631 632 633 634
  if (os == BS_ESTABLISHED)
    bgp_conn_leave_established_state(p);
}

void
bgp_conn_enter_idle_state(struct bgp_conn *conn)
{
  struct bgp_proto *p = conn->bgp;
  int os = conn->state;

  bgp_close_conn(conn);
635
  bgp_conn_set_state(conn, BS_IDLE);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
636 637 638 639 640 641
  ev_schedule(p->event);

  if (os == BS_ESTABLISHED)
    bgp_conn_leave_established_state(p);
}

642 643 644 645 646 647 648 649 650 651 652
/**
 * bgp_handle_graceful_restart - handle detected BGP graceful restart
 * @p: BGP instance
 *
 * This function is called when a BGP graceful restart of the neighbor is
 * detected (when the TCP connection fails or when a new TCP connection
 * appears). The function activates processing of the restart - starts routing
 * table refresh cycle and activates BGP restart timer. The protocol state goes
 * back to %PS_START, but changing BGP state back to %BS_IDLE is left for the
 * caller.
 */
653 654 655 656 657 658
void
bgp_handle_graceful_restart(struct bgp_proto *p)
{
  ASSERT(p->conn && (p->conn->state == BS_ESTABLISHED) && p->gr_ready);

  BGP_TRACE(D_EVENTS, "Neighbor graceful restart detected%s",
Ondřej Zajíček's avatar
Ondřej Zajíček committed
659 660 661
	    p->gr_active_num ? " - already pending" : "");

  p->gr_active_num = 0;
662

Ondřej Zajíček's avatar
Ondřej Zajíček committed
663 664 665
  struct bgp_channel *c;
  WALK_LIST(c, p->p.channels)
  {
Ondřej Zajíček's avatar
Ondřej Zajíček committed
666 667 668 669
    /* FIXME: perhaps check for channel state instead of disabled flag? */
    if (c->c.disabled)
      continue;

Ondřej Zajíček's avatar
Ondřej Zajíček committed
670 671 672 673
    if (c->gr_ready)
    {
      if (c->gr_active)
	rt_refresh_end(c->c.table, &c->c);
674

Ondřej Zajíček's avatar
Ondřej Zajíček committed
675 676 677 678 679 680 681 682 683 684
      c->gr_active = 1;
      p->gr_active_num++;
      rt_refresh_begin(c->c.table, &c->c);
    }
    else
    {
      /* Just flush the routes */
      rt_refresh_begin(c->c.table, &c->c);
      rt_refresh_end(c->c.table, &c->c);
    }
Ondřej Zajíček's avatar
Ondřej Zajíček committed
685 686 687 688 689 690 691

    /* Reset bucket and prefix tables */
    bgp_free_bucket_table(c);
    bgp_free_prefix_table(c);
    bgp_init_bucket_table(c);
    bgp_init_prefix_table(c);
    c->packets_to_send = 0;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
692 693
  }

694 695 696
  /* p->gr_ready -> at least one active channel is c->gr_ready */
  ASSERT(p->gr_active_num > 0);

Ondřej Zajíček's avatar
Ondřej Zajíček committed
697
  proto_notify_state(&p->p, PS_START);
698
  bgp_start_timer(p->gr_timer, p->conn->remote_caps->gr_time);
699 700
}

701 702
/**
 * bgp_graceful_restart_done - finish active BGP graceful restart
Ondřej Zajíček's avatar
Ondřej Zajíček committed
703
 * @c: BGP channel
704 705
 *
 * This function is called when the active BGP graceful restart of the neighbor
Ondřej Zajíček's avatar
Ondřej Zajíček committed
706 707 708 709
 * should be finished for channel @c - either successfully (the neighbor sends
 * all paths and reports end-of-RIB for given AFI/SAFI on the new session) or
 * unsuccessfully (the neighbor does not support BGP graceful restart on the new
 * session). The function ends the routing table refresh cycle.
710
 */
711
void
Ondřej Zajíček's avatar
Ondřej Zajíček committed
712
bgp_graceful_restart_done(struct bgp_channel *c)
713
{
Ondřej Zajíček's avatar
Ondřej Zajíček committed
714 715 716 717 718 719 720 721 722 723
  struct bgp_proto *p = (void *) c->c.proto;

  ASSERT(c->gr_active);
  c->gr_active = 0;
  p->gr_active_num--;

  if (!p->gr_active_num)
    BGP_TRACE(D_EVENTS, "Neighbor graceful restart done");

  rt_refresh_end(c->c.table, &c->c);
724 725
}

726 727 728 729 730 731 732 733 734
/**
 * bgp_graceful_restart_timeout - timeout of graceful restart 'restart timer'
 * @t: timer
 *
 * This function is a timeout hook for @gr_timer, implementing BGP restart time
 * limit for reestablisment of the BGP session after the graceful restart. When
 * fired, we just proceed with the usual protocol restart.
 */

735 736 737 738 739 740
static void
bgp_graceful_restart_timeout(timer *t)
{
  struct bgp_proto *p = t->data;

  BGP_TRACE(D_EVENTS, "Neighbor graceful restart timeout");
741
  bgp_stop(p, 0, NULL, 0);
742 743
}

744 745 746

/**
 * bgp_refresh_begin - start incoming enhanced route refresh sequence
Ondřej Zajíček's avatar
Ondřej Zajíček committed
747
 * @c: BGP channel
748 749 750 751 752 753 754 755
 *
 * This function is called when an incoming enhanced route refresh sequence is
 * started by the neighbor, demarcated by the BoRR packet. The function updates
 * the load state and starts the routing table refresh cycle. Note that graceful
 * restart also uses routing table refresh cycle, but RFC 7313 and load states
 * ensure that these two sequences do not overlap.
 */
void
Ondřej Zajíček's avatar
Ondřej Zajíček committed
756
bgp_refresh_begin(struct bgp_channel *c)
757
{
Ondřej Zajíček's avatar
Ondřej Zajíček committed
758 759 760 761
  struct bgp_proto *p = (void *) c->c.proto;

  if (c->load_state == BFS_LOADING)
  { log(L_WARN "%s: BEGIN-OF-RR received before END-OF-RIB, ignoring", p->p.name); return; }
762

Ondřej Zajíček's avatar
Ondřej Zajíček committed
763 764
  c->load_state = BFS_REFRESHING;
  rt_refresh_begin(c->c.table, &c->c);
765 766 767 768
}

/**
 * bgp_refresh_end - finish incoming enhanced route refresh sequence
Ondřej Zajíček's avatar
Ondřej Zajíček committed
769
 * @c: BGP channel
770 771 772 773 774 775 776
 *
 * This function is called when an incoming enhanced route refresh sequence is
 * finished by the neighbor, demarcated by the EoRR packet. The function updates
 * the load state and ends the routing table refresh cycle. Routes not received
 * during the sequence are removed by the nest.
 */
void
Ondřej Zajíček's avatar
Ondřej Zajíček committed
777
bgp_refresh_end(struct bgp_channel *c)
778
{
Ondřej Zajíček's avatar
Ondřej Zajíček committed
779
  struct bgp_proto *p = (void *) c->c.proto;
780

Ondřej Zajíček's avatar
Ondřej Zajíček committed
781 782 783 784 785
  if (c->load_state != BFS_REFRESHING)
  { log(L_WARN "%s: END-OF-RR received without prior BEGIN-OF-RR, ignoring", p->p.name); return; }

  c->load_state = BFS_NONE;
  rt_refresh_end(c->c.table, &c->c);
786 787 788
}


789 790 791 792 793
static void
bgp_send_open(struct bgp_conn *conn)
{
  DBG("BGP: Sending open\n");
  conn->sk->rx_hook = bgp_rx;
794
  conn->sk->tx_hook = bgp_tx;
795
  tm_stop(conn->connect_timer);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
796
  bgp_schedule_packet(conn, NULL, PKT_OPEN);
797
  bgp_conn_set_state(conn, BS_OPENSENT);
798
  bgp_start_timer(conn->hold_timer, conn->bgp->cf->initial_hold_time);
799 800
}

801 802
/*
 * Socket hook installed as tx_hook by bgp_connect(); it traces the event and
 * proceeds with sending the OPEN message on the connection stored in @sk->data.
 */
static void
bgp_connected(sock *sk)
{
  struct bgp_conn *conn = sk->data;

  /* Not used directly here; presumably referenced implicitly by BGP_TRACE() */
  struct bgp_proto *p = conn->bgp;

  BGP_TRACE(D_EVENTS, "Connected");

  bgp_send_open(conn);
}

/*
 * Connect timer hook. While the protocol is in the PS_START state, the
 * connection is closed and a new connection attempt is started; in any other
 * protocol state the connection just enters the idle state.
 */
static void
bgp_connect_timeout(timer *t)
{
  struct bgp_conn *conn = t->data;
  struct bgp_proto *p = conn->bgp;

  DBG("BGP: connect_timeout\n");

  if (p->p.proto_state != PS_START)
  {
    bgp_conn_enter_idle_state(conn);
    return;
  }

  bgp_close_conn(conn);
  bgp_connect(p);
}

static void
828
bgp_sock_err(sock *sk, int err)
829 830
{
  struct bgp_conn *conn = sk->data;
Martin Mareš's avatar
Martin Mareš committed
831
  struct bgp_proto *p = conn->bgp;
832

833 834 835 836 837 838 839 840 841
  /*
   * This error hook may be called either asynchronously from main
   * loop, or synchronously from sk_send().  But sk_send() is called
   * only from bgp_tx() and bgp_kick_tx(), which are both called
   * asynchronously from main loop. Moreover, they end if err hook is
   * called. Therefore, we could suppose that it is always called
   * asynchronously.
   */

Ondřej Zajíček's avatar
Ondřej Zajíček committed
842 843
  bgp_store_error(p, conn, BE_SOCKET, err);

844 845 846 847
  if (err)
    BGP_TRACE(D_EVENTS, "Connection lost (%M)", err);
  else
    BGP_TRACE(D_EVENTS, "Connection closed");
Ondřej Zajíček's avatar
Ondřej Zajíček committed
848

849 850 851
  if ((conn->state == BS_ESTABLISHED) && p->gr_ready)
    bgp_handle_graceful_restart(p);

Ondřej Zajíček's avatar
Ondřej Zajíček committed
852
  bgp_conn_enter_idle_state(conn);
853 854
}

855 856 857 858
/*
 * Hold timer hook. A connection that is already being closed is simply hung
 * up. Otherwise, if the socket still has pending input, the hold timer is
 * re-armed; if not, an error is raised on the connection.
 */
static void
bgp_hold_timeout(timer *t)
{
  struct bgp_conn *conn = t->data;
  struct bgp_proto *p = conn->bgp;

  DBG("BGP: Hold timeout\n");

  /* The connection is already being closed - just hang up */
  if (conn->state == BS_CLOSE)
  {
    BGP_TRACE(D_EVENTS, "Connection stalled");
    bgp_conn_enter_idle_state(conn);
    return;
  }

  /*
   * Nothing waiting in the input queue: report the timeout.
   * NOTE(review): code 4 is presumably the RFC 4271 "Hold Timer Expired"
   * notification - confirm against bgp_error().
   */
  if (sk_rx_ready(conn->sk) <= 0)
  {
    bgp_error(conn, 4, 0, NULL, 0);
    return;
  }

  /*
   * There is something in the input queue, so we are probably congested and
   * just have not processed the BGP packets in time. Give it another chance.
   * NOTE(review): the value 10 is presumably in seconds - confirm.
   */
  bgp_start_timer(conn->hold_timer, 10);
}

/*
 * Keepalive timer hook: schedules a KEEPALIVE packet on the connection and,
 * when the TX event is already active, runs it right away.
 */
static void
bgp_keepalive_timeout(timer *t)
{
  struct bgp_conn *conn = t->data;

  DBG("BGP: Keepalive timer\n");

  bgp_schedule_packet(conn, NULL, PKT_KEEPALIVE);

  /* Kick TX a bit faster */
  if (ev_active(conn->tx_ev))
  {
    ev_run(conn->tx_ev);
  }
}

893
static void
894
bgp_setup_conn(struct bgp_proto *p, struct bgp_conn *conn)
895
{
896
  conn->sk = NULL;
897
  conn->bgp = p;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
898

899
  conn->packets_to_send = 0;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
900 901 902 903
  conn->channels_to_send = 0;
  conn->last_channel = 0;
  conn->last_channel_count = 0;

904 905 906
  conn->connect_timer	= tm_new_init(p->p.pool, bgp_connect_timeout,	 conn, 0, 0);
  conn->hold_timer 	= tm_new_init(p->p.pool, bgp_hold_timeout,	 conn, 0, 0);
  conn->keepalive_timer	= tm_new_init(p->p.pool, bgp_keepalive_timeout, conn, 0, 0);
907

Ondřej Zajíček's avatar
Ondřej Zajíček committed
908 909 910
  conn->tx_ev = ev_new(p->p.pool);
  conn->tx_ev->hook = bgp_kick_tx;
  conn->tx_ev->data = conn;
911 912
}

913
static void
914
bgp_setup_sk(struct bgp_conn *conn, sock *s)
915 916 917
{
  s->data = conn;
  s->err_hook = bgp_sock_err;
918
  s->fast_rx = 1;
919 920 921
  conn->sk = s;
}

Ondřej Zajíček's avatar
Ondřej Zajíček committed
922
static void
923
bgp_active(struct bgp_proto *p)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
924
{
925
  int delay = MAX(1, p->cf->connect_delay_time);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
926 927 928 929
  struct bgp_conn *conn = &p->outgoing_conn;

  BGP_TRACE(D_EVENTS, "Connect delayed by %d seconds", delay);
  bgp_setup_conn(p, conn);
930
  bgp_conn_set_state(conn, BS_ACTIVE);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
931
  bgp_start_timer(conn->connect_timer, delay);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
932 933
}

Martin Mareš's avatar
Martin Mareš committed
934 935 936 937 938 939 940 941
/**
 * bgp_connect - initiate an outgoing connection
 * @p: BGP instance
 *
 * The bgp_connect() function creates a new &bgp_conn and initiates
 * a TCP connection to the peer. The rest of connection setup is governed
 * by the BGP state machine as described in the standard.
 */
942 943 944
static void
bgp_connect(struct bgp_proto *p)	/* Enter Connect state and start establishing connection */
{
945
  struct bgp_conn *conn = &p->outgoing_conn;
946
  int hops = p->cf->multihop ? : 1;
947 948

  DBG("BGP: Connecting\n");
Ondřej Zajíček's avatar
Ondřej Zajíček committed
949
  sock *s = sk_new(p->p.pool);
950
  s->type = SK_TCP_ACTIVE;
951
  s->saddr = p->source_addr;
952
  s->daddr = p->cf->remote_ip;
953
  s->dport = p->cf->remote_port;
954
  s->iface = p->neigh ? p->neigh->iface : NULL;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
955
  s->vrf = p->p.vrf;
956
  s->ttl = p->cf->ttl_security ? 255 : hops;
957 958
  s->rbsize = p->cf->enable_extended_messages ? BGP_RX_BUFFER_EXT_SIZE : BGP_RX_BUFFER_SIZE;
  s->tbsize = p->cf->enable_extended_messages ? BGP_TX_BUFFER_EXT_SIZE : BGP_TX_BUFFER_SIZE;
959 960 961
  s->tos = IP_PREC_INTERNET_CONTROL;
  s->password = p->cf->password;
  s->tx_hook = bgp_connected;
962
  BGP_TRACE(D_EVENTS, "Connecting to %I%J from local address %I%J", s->daddr, p->cf->iface,
963
	    s->saddr, ipa_is_link_local(s->saddr) ? s->iface : NULL);
964
  bgp_setup_conn(p, conn);
965
  bgp_setup_sk(conn, s);
966
  bgp_conn_set_state(conn, BS_CONNECT);
967 968

  if (sk_open(s) < 0)
969
    goto err;
970 971 972 973

  /* Set minimal receive TTL if needed */
  if (p->cf->ttl_security)
    if (sk_set_min_ttl(s, 256 - hops) < 0)
974
      goto err;
975

976
  DBG("BGP: Waiting for connect success\n");
Ondřej Zajíček's avatar
Ondřej Zajíček committed
977
  bgp_start_timer(conn->connect_timer, p->cf->connect_retry_time);
978 979
  return;

Ondřej Zajíček's avatar
Ondřej Zajíček committed
980
err:
981 982 983
  sk_log_error(s, p->p.name);
  bgp_sock_err(s, 0);
  return;
984 985
}

986 987 988 989 990 991 992 993
/**
 * bgp_find_proto - find existing proto for incoming connection
 * @sk: TCP socket
 *
 * Walks the global protocol list and returns the first BGP instance whose
 * configuration matches the socket: the remote address equals @sk->daddr, the
 * configured interface (if any) equals @sk->iface, the configured local
 * address (if non-zero) equals @sk->saddr, and the local port equals
 * @sk->sport. Returns %NULL when no instance matches.
 */
static struct bgp_proto *
bgp_find_proto(sock *sk)
{
  struct bgp_proto *p;

  WALK_LIST(p, proto_list)
  {
    /* Only BGP instances are of interest; checked before touching p->cf */
    if (p->p.proto != &proto_bgp)
      continue;

    if (!ipa_equal(p->cf->remote_ip, sk->daddr))
      continue;

    if (p->cf->iface && (p->cf->iface != sk->iface))
      continue;

    if (!ipa_zero(p->cf->local_ip) && !ipa_equal(p->cf->local_ip, sk->saddr))
      continue;

    if (p->cf->local_port != sk->sport)
      continue;

    return p;
  }

  return NULL;
}

Martin Mareš's avatar
Martin Mareš committed
1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018
/**
 * bgp_incoming_connection - handle an incoming connection
 * @sk: TCP socket
 * @dummy: unused
 *
 * This function serves as a socket hook for accepting of new BGP
 * connections. It searches a BGP instance corresponding to the peer
 * which has connected and if such an instance exists, it creates a
 * &bgp_conn structure, attaches it to the instance and either sends
 * an Open message or (if there already is an active connection) it
 * closes the new connection by sending a Notification message.
 */
1019
static int
1020
bgp_incoming_connection(sock *sk, uint dummy UNUSED)
1021
{
1022 1023
  struct bgp_proto *p;
  int acc, hops;
1024

1025
  DBG("BGP: Incoming connection from %I port %d\n", sk->daddr, sk->dport);
1026 1027
  p = bgp_find_proto(sk);
  if (!p)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1028 1029 1030 1031 1032 1033
  {
    log(L_WARN "BGP: Unexpected connect from unknown address %I%J (port %d)",
	sk->daddr, ipa_is_link_local(sk->daddr) ? sk->iface : NULL, sk->dport);
    rfree(sk);
    return 0;
  }
1034

1035 1036 1037 1038 1039 1040 1041
  /*
   * BIRD should keep multiple incoming connections in OpenSent state (for
   * details RFC 4271 8.2.1 par 3), but it keeps just one. Duplicate incoming
   * connections are rejected istead. The exception is the case where an
   * incoming connection triggers a graceful restart.
   */

1042 1043
  acc = (p->p.proto_state == PS_START || p->p.proto_state == PS_UP) &&
    (p->start_state >= BSS_CONNECT) && (!p->incoming_conn.sk);
1044

1045
  if (p->conn && (p->conn->state == BS_ESTABLISHED) && p->gr_ready)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1046 1047 1048 1049 1050 1051 1052 1053 1054 1055
  {
    bgp_store_error(p, NULL, BE_MISC, BEM_GRACEFUL_RESTART);
    bgp_handle_graceful_restart(p);
    bgp_conn_enter_idle_state(p->conn);
    acc = 1;

    /* There might be separate incoming connection in OpenSent state */
    if (p->incoming_conn.state > BS_ACTIVE)
      bgp_close_conn(&p->incoming_conn);
  }
1056 1057 1058 1059 1060 1061

  BGP_TRACE(D_EVENTS, "Incoming connection from %I%J (port %d) %s",
	    sk->daddr, ipa_is_link_local(sk->daddr) ? sk->iface : NULL,
	    sk->dport, acc ? "accepted" : "rejected");

  if (!acc)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1062 1063 1064 1065
  {
    rfree(sk);
    return 0;
  }
1066 1067 1068 1069 1070 1071 1072 1073 1074 1075

  hops = p->cf->multihop ? : 1;

  if (sk_set_ttl(sk, p->cf->ttl_security ? 255 : hops) < 0)
    goto err;

  if (p->cf->ttl_security)
    if (sk_set_min_ttl(sk, 256 - hops) < 0)
      goto err;

1076
  if (p->cf->enable_extended_messages)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1077 1078 1079 1080 1081
  {
    sk->rbsize = BGP_RX_BUFFER_EXT_SIZE;
    sk->tbsize = BGP_TX_BUFFER_EXT_SIZE;
    sk_reallocate(sk);
  }
1082

1083 1084 1085 1086 1087 1088 1089 1090
  bgp_setup_conn(p, &p->incoming_conn);
  bgp_setup_sk(&p->incoming_conn, sk);
  bgp_send_open(&p->incoming_conn);
  return 0;

err:
  sk_log_error(sk, p->p.name);
  log(L_ERR "%s: Incoming connection aborted", p->p.name);
1091 1092 1093 1094
  rfree(sk);
  return 0;
}

1095
static void
1096
bgp_listen_sock_err(sock *sk UNUSED, int err)
1097 1098 1099 1100
{
  if (err == ECONNABORTED)
    log(L_WARN "BGP: Incoming connection aborted");
  else
1101
    log(L_ERR "BGP: Error on listening socket: %M", err);
1102 1103
}

1104 1105 1106
static void
bgp_start_neighbor(struct bgp_proto *p)
{
1107 1108 1109
  /* Called only for single-hop BGP sessions */

  if (ipa_zero(p->source_addr))
1110
    p->source_addr = p->neigh->ifa->ip;
1111

Ondřej Zajíček's avatar
Ondřej Zajíček committed
1112 1113
  if (ipa_is_link_local(p->source_addr))
    p->link_addr = p->source_addr;
1114 1115
  else if (p->neigh->iface->llv6)
    p->link_addr = p->neigh->iface->llv6->ip;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1116

1117
  bgp_initiate(p);
1118 1119 1120 1121 1122 1123
}

/*
 * Neighbor notification hook. Ignores notifications for neighbors other than
 * the one tracked by the instance and for protocols that are down or
 * stopping. A lost neighbor or (with check_link enabled) a link that is down
 * stops a session that is past the prepare phase; a usable neighbor starts a
 * session that is still waiting in the prepare phase.
 */
static void
bgp_neigh_notify(neighbor *n)
{
  struct bgp_proto *p = (struct bgp_proto *) n->proto;
  int ps = p->p.proto_state;
  int prepare;

  if (n != p->neigh)
    return;

  if ((ps == PS_DOWN) || (ps == PS_STOP))
    return;

  /* Still waiting for the neighbor, nothing has been started yet */
  prepare = (ps == PS_START) && (p->start_state == BSS_PREPARE);

  /* The neighbor is gone */
  if (n->scope <= 0)
  {
    if (prepare)
      return;

    BGP_TRACE(D_EVENTS, "Neighbor lost");
    bgp_store_error(p, NULL, BE_MISC, BEM_NEIGHBOR_LOST);
    /* Perhaps also run bgp_update_startup_delay(p)? */
    bgp_stop(p, 0, NULL, 0);
    return;
  }

  /* The neighbor is there, but the link is down and we care about it */
  if (p->cf->check_link && !(n->iface->flags & IF_LINK_UP))
  {
    if (prepare)
      return;

    BGP_TRACE(D_EVENTS, "Link down");
    bgp_store_error(p, NULL, BE_MISC, BEM_LINK_DOWN);
    if (ps == PS_UP)
      bgp_update_startup_delay(p);
    bgp_stop(p, 0, NULL, 0);
    return;
  }

  /* The neighbor is usable */
  if (prepare)
  {
    BGP_TRACE(D_EVENTS, "Neighbor ready");
    bgp_start_neighbor(p);
  }
}

1165 1166 1167 1168 1169 1170 1171
static void
bgp_bfd_notify(struct bfd_request *req)
{
  struct bgp_proto *p = req->data;
  int ps = p->p.proto_state;

  if (req->down && ((ps == PS_START) || (ps == PS_UP)))
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1172 1173 1174 1175 1176
  {
    BGP_TRACE(D_EVENTS, "BFD session down");
    bgp_store_error(p, NULL, BE_MISC, BEM_BFD_DOWN);
    if (ps == PS_UP)
      bgp_update_startup_delay(p);
1177
    bgp_stop(p, 0, NULL, 0);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1178
  }
1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189
}

/*
 * Bring the BFD request of the instance in line with @use_bfd: request a
 * session if BFD is wanted and none exists, free the existing request if BFD
 * is not wanted.
 */
static void
bgp_update_bfd(struct bgp_proto *p, int use_bfd)
{
  if (use_bfd)
  {
    /* Multihop sessions pass no interface to the BFD request */
    if (!p->bfd_req)
      p->bfd_req = bfd_request_session(p->p.pool, p->cf->remote_ip, p->source_addr,
				       p->cf->multihop ? NULL : p->neigh->iface,
				       bgp_bfd_notify, p);
    return;
  }

  if (p->bfd_req)
  {
    rfree(p->bfd_req);
    p->bfd_req = NULL;
  }
}

Ondřej Zajíček's avatar
Ondřej Zajíček committed
1196 1197
static void
bgp_reload_routes(struct channel *C)
1198
{
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1199 1200
  struct bgp_proto *p = (void *) C->proto;
  struct bgp_channel *c = (void *) C;
1201

Ondřej Zajíček's avatar
Ondřej Zajíček committed
1202 1203 1204
  ASSERT(p->conn && p->route_refresh);

  bgp_schedule_packet(p->conn, c, PKT_ROUTE_REFRESH);
1205 1206
}

1207
static void
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1208
bgp_feed_begin(struct channel *C, int initial)
1209
{
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1210 1211
  struct bgp_proto *p = (void *) C->proto;
  struct bgp_channel *c = (void *) C;
1212 1213 1214

  /* This should not happen */
  if (!p->conn)
1215 1216
    return;

1217
  if (initial && p->cf->gr_mode)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1218
    c->feed_state = BFS_LOADING;
1219 1220

  /* It is refeed and both sides support enhanced route refresh */
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1221 1222 1223 1224 1225
  if (!initial && p->enhanced_refresh)
  {
    /* BoRR must not be sent before End-of-RIB */
    if (c->feed_state == BFS_LOADING || c->feed_state == BFS_LOADED)
      return;
1226

Ondřej Zajíček's avatar
Ondřej Zajíček committed
1227 1228 1229
    c->feed_state = BFS_REFRESHING;
    bgp_schedule_packet(p->conn, c, PKT_BEGIN_REFRESH);
  }
1230 1231 1232
}

static void
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1233
bgp_feed_end(struct channel *C)
1234
{
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1235 1236
  struct bgp_proto *p = (void *) C->proto;
  struct bgp_channel *c = (void *) C;
1237 1238 1239 1240 1241 1242

  /* This should not happen */
  if (!p->conn)
    return;

  /* Non-demarcated feed ended, nothing to do */
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1243
  if (c->feed_state == BFS_NONE)
1244 1245 1246
    return;

  /* Schedule End-of-RIB packet */
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1247 1248
  if (c->feed_state == BFS_LOADING)
    c->feed_state = BFS_LOADED;
1249 1250

  /* Schedule EoRR packet */
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1251 1252
  if (c->feed_state == BFS_REFRESHING)
    c->feed_state = BFS_REFRESHED;
1253 1254

  /* Kick TX hook */
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1255
  bgp_schedule_packet(p->conn, c, PKT_UPDATE);
1256 1257
}

1258

1259 1260 1261 1262 1263 1264
static void
bgp_start_locked(struct object_lock *lock)
{
  struct bgp_proto *p = lock->data;
  struct bgp_config *cf = p->cf;

Ondřej Zajíček's avatar
Ondřej Zajíček committed
1265
  if (p->p.proto_state != PS_START)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1266 1267 1268 1269
  {
    DBG("BGP: Got lock in different state %d\n", p->p.proto_state);
    return;
  }
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1270

1271
  DBG("BGP: Got lock\n");
1272

1273
  if (cf->multihop)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1274 1275 1276 1277 1278
  {
    /* Multi-hop sessions do not use neighbor entries */
    bgp_initiate(p);
    return;
  }
1279

1280
  neighbor *n = neigh_find(&p->p, cf->remote_ip, cf->iface, NEF_STICKY);
1281
  if (!n)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1282 1283 1284 1285 1286 1287 1288 1289
  {
    log(L_ERR "%s: Invalid remote address %I%J", p->p.name, cf->remote_ip, cf->iface);
    /* As we do not start yet, we can just disable protocol */
    p->p.disabled = 1;
    bgp_store_error(p, NULL, BE_MISC, BEM_INVALID_NEXT_HOP);
    proto_notify_state(&p->p, PS_DOWN);
    return;
  }
1290 1291 1292 1293

  p->neigh = n;

  if (n->scope <= 0)
1294
    BGP_TRACE(D_EVENTS, "Waiting for %I%J to become my neighbor", cf->remote_ip, cf->iface);
1295 1296 1297 1298
  else if (p->cf->check_link && !(n->iface->flags & IF_LINK_UP))
    BGP_TRACE(D_EVENTS, "Waiting for link on %s", n->iface->name);
  else
    bgp_start_neighbor(p);
1299 1300
}

1301 1302 1303
static int
bgp_start(struct proto *P)
{
1304 1305 1306
  struct bgp_proto *p = (struct bgp_proto *) P;
  struct object_lock *lock;

1307
  DBG("BGP: Startup.\n");
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1308
  p->start_state = BSS_PREPARE;
1309 1310
  p->outgoing_conn.state = BS_IDLE;
  p->incoming_conn.state = BS_IDLE;
1311
  p->neigh = NULL;
1312
  p->bfd_req = NULL;
1313
  p->gr_ready = 0;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1314
  p->gr_active_num = 0;
1315

Ondřej Zajíček's avatar
Ondřej Zajíček committed
1316 1317 1318
  p->event = ev_new(p->p.pool);
  p->event->hook = bgp_decision;
  p->event->data = p;
1319

1320 1321
  p->startup_timer = tm_new_init(p->p.pool, bgp_startup_timeout, p, 0, 0);
  p->gr_timer = tm_new_init(p->p.pool, bgp_graceful_restart_timeout, p, 0, 0);
1322

1323 1324 1325 1326
  p->local_id = proto_get_router_id(P->cf);
  if (p->rr_client)
    p->rr_cluster_id = p->cf->rr_cluster_id ? p->cf->rr_cluster_id : p->local_id;

1327
  p->remote_id = 0;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1328
  p->source_addr = p->cf->local_ip;
1329
  p->link_addr = IPA_NONE;
1330

Ondřej Zajíček's avatar
Ondřej Zajíček committed
1331
  /* Lock all channels when in GR recovery mode */
1332
  if (p->p.gr_recovery && p->cf->gr_mode)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1333 1334 1335 1336 1337
  {
    struct bgp_channel *c;
    WALK_LIST(c, p->p.channels)
      channel_graceful_restart_lock(&c->c);
  }
1338

1339
  /*
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1340 1341
   * Before attempting to create the connection, we need to lock the port,
   * so that we are the only instance attempting to talk with that neighbor.
1342 1343 1344 1345
   */

  lock = p->lock = olock_new(P->pool);
  lock->addr = p->cf->remote_ip;
1346
  lock->port = p->cf->remote_port;
1347
  lock->iface = p->cf->iface;
1348
  lock->vrf = p->cf->iface ? NULL : p->p.vrf;
1349 1350 1351 1352
  lock->type = OBJLOCK_TCP;
  lock->hook = bgp_start_locked;
  lock->data = p;
  olock_acquire(lock);
1353

1354
  return PS_START;
1355 1356
}

1357 1358
/* Flag defined outside this file; its use is not visible in this section */
extern int proto_restart;

1359 1360 1361
static int
bgp_shutdown(struct proto *P)
{
1362
  struct bgp_proto *p = (struct bgp_proto *) P;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1363
  uint subcode = 0;
1364

1365 1366 1367
  char *message = NULL;
  byte *data = NULL;
  uint len = 0;
1368

Martin Mareš's avatar
Martin Mareš committed
1369
  BGP_TRACE(D_EVENTS, "Shutdown requested");
1370

1371
  switch (P->down_code)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384
  {
  case PDC_CF_REMOVE:
  case PDC_CF_DISABLE:
    subcode = 3; // Errcode 6, 3 - peer de-configured
    break;

  case PDC_CF_RESTART:
    subcode = 6; // Errcode 6, 6 - other configuration change
    break;

  case PDC_CMD_DISABLE:
  case PDC_CMD_SHUTDOWN:
    subcode = 2; // Errcode 6, 2 - administrative shutdown
1385
    message = P->message;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1386 1387 1388 1389
    break;

  case PDC_CMD_RESTART:
    subcode = 4; // Errcode 6, 4 - administrative reset
1390
    message = P->message;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1391 1392 1393 1394 1395 1396 1397 1398