bgp.c 62 KB
Newer Older
1 2 3 4
/*
 *	BIRD -- The Border Gateway Protocol
 *
 *	(c) 2000 Martin Mares <mj@ucw.cz>
Ondřej Zajíček's avatar
Ondřej Zajíček committed
5 6
 *	(c) 2008--2016 Ondrej Zajicek <santiago@crfreenet.org>
 *	(c) 2008--2016 CZ.NIC z.s.p.o.
7 8 9 10
 *
 *	Can be freely distributed and used under the terms of the GNU GPL.
 */

Martin Mareš's avatar
Martin Mareš committed
11 12 13
/**
 * DOC: Border Gateway Protocol
 *
Ondřej Zajíček's avatar
Ondřej Zajíček committed
14 15
 * The BGP protocol is implemented in three parts: |bgp.c| which takes care of
 * the connection and most of the interface with BIRD core, |packets.c| handling
Martin Mareš's avatar
Martin Mareš committed
16 17 18
 * both incoming and outgoing BGP packets and |attrs.c| containing functions for
 * manipulation with BGP attribute lists.
 *
Ondřej Zajíček's avatar
Ondřej Zajíček committed
19 20 21 22 23 24
 * As opposed to the other existing routing daemons, BIRD has a sophisticated
 * core architecture which is able to keep all the information needed by BGP in
 * the primary routing table, therefore no complex data structures like a
 * central BGP table are needed. This increases memory footprint of a BGP router
 * with many connections, but not too much and, which is more important, it
 * makes BGP much easier to implement.
Martin Mareš's avatar
Martin Mareš committed
25
 *
Ondřej Zajíček's avatar
Ondřej Zajíček committed
26 27 28 29 30 31
 * Each instance of BGP (corresponding to a single BGP peer) is described by a
 * &bgp_proto structure to which are attached individual connections represented
 * by &bgp_connection (usually, there exists only one connection, but during BGP
 * session setup, there can be more of them). The connections are handled
 * according to the BGP state machine defined in the RFC with all the timers and
 * all the parameters configurable.
Martin Mareš's avatar
Martin Mareš committed
32
 *
Ondřej Zajíček's avatar
Ondřej Zajíček committed
33 34 35 36
 * In incoming direction, we listen on the connection's socket and each time we
 * receive some input, we pass it to bgp_rx(). It decodes packet headers and the
 * markers and passes complete packets to bgp_rx_packet() which distributes the
 * packet according to its type.
Martin Mareš's avatar
Martin Mareš committed
37
 *
Ondřej Zajíček's avatar
Ondřej Zajíček committed
38 39 40 41 42 43 44 45 46 47 48 49 50 51 52
 * In outgoing direction, we gather all the routing updates and sort them to
 * buckets (&bgp_bucket) according to their attributes (we keep a hash table for
 * fast comparison of &rta's and a &fib which helps us to find if we already
 * have another route for the same destination queued for sending, so that we
 * can replace it with the new one immediately instead of sending both
 * updates). There also exists a special bucket holding all the route
 * withdrawals which cannot be queued anywhere else as they don't have any
 * attributes. If we have any packet to send (due to either new routes or the
 * connection tracking code wanting to send an Open, Keepalive or Notification
 * message), we call bgp_schedule_packet() which sets the corresponding bit in a
 * @packets_to_send bit field in &bgp_conn and as soon as the transmit socket
 * buffer becomes empty, we call bgp_fire_tx(). It inspects state of all the
 * packet type bits and calls the corresponding bgp_create_xx() functions,
 * eventually rescheduling the same packet type if we have more data of the same
 * type to send.
Martin Mareš's avatar
Martin Mareš committed
53
 *
Ondřej Zajíček's avatar
Ondřej Zajíček committed
54 55 56 57 58 59
 * The processing of attributes consists of two functions: bgp_decode_attrs()
 * for checking of the attribute blocks and translating them to the language of
 * BIRD's extended attributes and bgp_encode_attrs() which does the
 * converse. Both functions are built around a @bgp_attr_table array describing
 * all important characteristics of all known attributes.  Unknown transitive
 * attributes are attached to the route as %EAF_TYPE_OPAQUE byte streams.
60 61 62 63 64 65 66 67 68 69
 *
 * BGP protocol implements graceful restart in both restarting (local restart)
 * and receiving (neighbor restart) roles. The first is handled mostly by the
 * graceful restart code in the nest, BGP protocol just handles capabilities,
 * sets @gr_wait and locks graceful restart until end-of-RIB mark is received.
 * The second is implemented by internal restart of the BGP state to %BS_IDLE
 * and protocol state to %PS_START, but keeping the protocol up from the core
 * point of view and therefore maintaining received routes. Routing table
 * refresh cycle (rt_refresh_begin(), rt_refresh_end()) is used for removing
 * stale routes after reestablishment of BGP session during graceful restart.
70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
 *
 * Supported standards:
 * <itemize>
 * <item> <rfc id="4271"> - Border Gateway Protocol 4 (BGP)
 * <item> <rfc id="1997"> - BGP Communities Attribute
 * <item> <rfc id="2385"> - Protection of BGP Sessions via TCP MD5 Signature
 * <item> <rfc id="2545"> - Use of BGP Multiprotocol Extensions for IPv6
 * <item> <rfc id="2918"> - Route Refresh Capability
 * <item> <rfc id="3107"> - Carrying Label Information in BGP
 * <item> <rfc id="4360"> - BGP Extended Communities Attribute
 * <item> <rfc id="4364"> - BGP/MPLS IPv4 Virtual Private Networks
 * <item> <rfc id="4456"> - BGP Route Reflection
 * <item> <rfc id="4486"> - Subcodes for BGP Cease Notification Message
 * <item> <rfc id="4659"> - BGP/MPLS IPv6 Virtual Private Networks
 * <item> <rfc id="4724"> - Graceful Restart Mechanism for BGP
 * <item> <rfc id="4760"> - Multiprotocol extensions for BGP
 * <item> <rfc id="4798"> - Connecting IPv6 Islands over IPv4 MPLS
 * <item> <rfc id="5065"> - AS confederations for BGP
 * <item> <rfc id="5082"> - Generalized TTL Security Mechanism
 * <item> <rfc id="5492"> - Capabilities Advertisement with BGP
 * <item> <rfc id="5549"> - Advertising IPv4 NLRI with an IPv6 Next Hop
 * <item> <rfc id="5575"> - Dissemination of Flow Specification Rules
 * <item> <rfc id="5668"> - 4-Octet AS Specific BGP Extended Community
 * <item> <rfc id="6286"> - AS-Wide Unique BGP Identifier
 * <item> <rfc id="6608"> - Subcodes for BGP Finite State Machine Error
 * <item> <rfc id="6793"> - BGP Support for 4-Octet AS Numbers
 * <item> <rfc id="7313"> - Enhanced Route Refresh Capability for BGP
 * <item> <rfc id="7606"> - Revised Error Handling for BGP UPDATE Messages
 * <item> <rfc id="7911"> - Advertisement of Multiple Paths in BGP
 * <item> <rfc id="7947"> - Internet Exchange BGP Route Server
 * <item> <rfc id="8092"> - BGP Large Communities Attribute
Ondřej Zajíček's avatar
Ondřej Zajíček committed
101
 * <item> <rfc id="8203"> - BGP Administrative Shutdown Communication
102
 * <item> <rfc id="8212"> - Default EBGP Route Propagation Behavior without Policies
103 104
 * </itemize>
*/
Martin Mareš's avatar
Martin Mareš committed
105

106
#undef LOCAL_DEBUG
107

108 109
#include <stdlib.h>

110 111 112 113
#include "nest/bird.h"
#include "nest/iface.h"
#include "nest/protocol.h"
#include "nest/route.h"
114
#include "nest/cli.h"
115
#include "nest/locks.h"
116
#include "conf/conf.h"
117
#include "filter/filter.h"
118
#include "lib/socket.h"
119
#include "lib/resource.h"
Martin Mareš's avatar
Martin Mareš committed
120
#include "lib/string.h"
121 122 123

#include "bgp.h"

Ondřej Zajíček's avatar
Ondřej Zajíček committed
124

125
struct linpool *bgp_linpool;		/* Global temporary pool */
126
struct linpool *bgp_linpool2;		/* Global temporary pool for bgp_rt_notify() */
Ondřej Zajíček's avatar
Ondřej Zajíček committed
127 128
static list bgp_sockets;		/* Global list of listening sockets */

129 130

static void bgp_connect(struct bgp_proto *p);
131
static void bgp_active(struct bgp_proto *p);
132
static void bgp_update_bfd(struct bgp_proto *p, int use_bfd);
133

Ondřej Zajíček's avatar
Ondřej Zajíček committed
134 135
static int bgp_incoming_connection(sock *sk, uint dummy UNUSED);
static void bgp_listen_sock_err(sock *sk UNUSED, int err);
136

Ondřej Zajíček's avatar
Ondřej Zajíček committed
137 138 139 140
/**
 * bgp_open - open a BGP instance
 * @p: BGP instance
 *
Ondřej Zajíček's avatar
Ondřej Zajíček committed
141 142 143 144
 * This function allocates and configures shared BGP resources, mainly listening
 * sockets. Should be called as the last step during initialization (when lock
 * is acquired and neighbor is ready). When error, caller should change state to
 * PS_DOWN and return immediately.
Ondřej Zajíček's avatar
Ondřej Zajíček committed
145 146 147 148
 */
static int
bgp_open(struct bgp_proto *p)
{
Ondřej Zajíček's avatar
Ondřej Zajíček committed
149 150 151 152 153 154 155 156 157
  struct bgp_socket *bs = NULL;
  struct iface *ifa = p->cf->strict_bind ? p->cf->iface : NULL;
  ip_addr addr = p->cf->strict_bind ? p->cf->local_ip :
    (ipa_is_ip4(p->cf->remote_ip) ? IPA_NONE4 : IPA_NONE6);
  uint port = p->cf->local_port;

  /* FIXME: Add some global init? */
  if (!bgp_linpool)
    init_list(&bgp_sockets);
158

Ondřej Zajíček's avatar
Ondřej Zajíček committed
159
  /* We assume that cf->iface is defined iff cf->local_ip is link-local */
Ondřej Zajíček's avatar
Ondřej Zajíček committed
160

Ondřej Zajíček's avatar
Ondřej Zajíček committed
161 162
  WALK_LIST(bs, bgp_sockets)
    if (ipa_equal(bs->sk->saddr, addr) && (bs->sk->iface == ifa) && (bs->sk->sport == port))
163
    {
Ondřej Zajíček's avatar
Ondřej Zajíček committed
164 165 166
      bs->uc++;
      p->sock = bs;
      return 0;
167 168
    }

Ondřej Zajíček's avatar
Ondřej Zajíček committed
169 170 171 172 173 174 175 176 177 178 179 180 181 182
  sock *sk = sk_new(proto_pool);
  sk->type = SK_TCP_PASSIVE;
  sk->ttl = 255;
  sk->saddr = addr;
  sk->sport = port;
  sk->flags = 0;
  sk->tos = IP_PREC_INTERNET_CONTROL;
  sk->rbsize = BGP_RX_BUFFER_SIZE;
  sk->tbsize = BGP_TX_BUFFER_SIZE;
  sk->rx_hook = bgp_incoming_connection;
  sk->err_hook = bgp_listen_sock_err;

  if (sk_open(sk) < 0)
    goto err;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
183

Ondřej Zajíček's avatar
Ondřej Zajíček committed
184 185 186 187
  bs = mb_allocz(proto_pool, sizeof(struct bgp_socket));
  bs->sk = sk;
  bs->uc = 1;
  p->sock = bs;
188

Ondřej Zajíček's avatar
Ondřej Zajíček committed
189 190 191
  add_tail(&bgp_sockets, &bs->n);

  if (!bgp_linpool)
192
  {
193 194
    bgp_linpool  = lp_new_default(proto_pool);
    bgp_linpool2 = lp_new_default(proto_pool);
195
  }
Ondřej Zajíček's avatar
Ondřej Zajíček committed
196 197

  return 0;
198 199

err:
Ondřej Zajíček's avatar
Ondřej Zajíček committed
200 201 202
  sk_log_error(sk, p->p.name);
  log(L_ERR "%s: Cannot open listening socket", p->p.name);
  rfree(sk);
203
  return -1;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
204 205
}

Ondřej Zajíček's avatar
Ondřej Zajíček committed
206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230
/**
 * bgp_close - close a BGP instance
 * @p: BGP instance
 *
 * This function releases the use of the shared listening socket held by @p.
 * When @p was the last user, the socket is freed and removed from the global
 * socket list; when that list becomes empty, the global linpools are freed as
 * well.
 */
static void
bgp_close(struct bgp_proto *p)
{
  struct bgp_socket *shared = p->sock;

  ASSERT(shared && shared->uc);

  /* Other instances may still use the socket */
  shared->uc--;
  if (shared->uc != 0)
    return;

  rfree(shared->sk);
  rem_node(&shared->n);
  mb_free(shared);

  /* The linpools live as long as there is any listening socket */
  if (EMPTY_LIST(bgp_sockets))
  {
    rfree(bgp_linpool);
    bgp_linpool = NULL;

    rfree(bgp_linpool2);
    bgp_linpool2 = NULL;
  }
}

static inline int
bgp_setup_auth(struct bgp_proto *p, int enable)
{
  if (p->cf->password)
  {
    int rv = sk_set_md5_auth(p->sock->sk,
			     p->cf->local_ip, p->cf->remote_ip, p->cf->iface,
			     enable ? p->cf->password : NULL, p->cf->setkey);

    if (rv < 0)
      sk_log_error(p->sock->sk, p->p.name);

    return rv;
  }
  else
    return 0;
}

/*
 * Return the first channel of @p whose AFI value equals @afi, or NULL when
 * there is no such channel.
 */
static inline struct bgp_channel *
bgp_find_channel(struct bgp_proto *p, u32 afi)
{
  struct bgp_channel *ch;

  WALK_LIST(ch, p->p.channels)
  {
    if (ch->afi != afi)
      continue;

    return ch;
  }

  return NULL;
}

265 266 267 268
static void
bgp_startup(struct bgp_proto *p)
{
  BGP_TRACE(D_EVENTS, "Started");
Ondřej Zajíček's avatar
Ondřej Zajíček committed
269
  p->start_state = BSS_CONNECT;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
270 271 272

  if (!p->cf->passive)
    bgp_active(p);
273 274 275 276 277 278 279 280 281 282 283 284
}

/*
 * Timer hook (presumably of the startup timer armed in bgp_initiate()): the
 * startup delay has passed, so proceed with the regular startup of the
 * instance stored in the timer data.
 */
static void
bgp_startup_timeout(timer *t)
{
  struct bgp_proto *p = t->data;

  bgp_startup(p);
}


static void
bgp_initiate(struct bgp_proto *p)
{
Ondřej Zajíček's avatar
Ondřej Zajíček committed
285 286 287 288 289 290 291
  int err_val;

  if (bgp_open(p) < 0)
  { err_val = BEM_NO_SOCKET; goto err1; }

  if (bgp_setup_auth(p, 1) < 0)
  { err_val = BEM_INVALID_MD5; goto err2; }
292

293 294 295
  if (p->cf->bfd)
    bgp_update_bfd(p, p->cf->bfd);

296
  if (p->startup_delay)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
297 298 299 300 301
  {
    p->start_state = BSS_DELAY;
    BGP_TRACE(D_EVENTS, "Startup delayed by %d seconds due to errors", p->startup_delay);
    bgp_start_timer(p->startup_timer, p->startup_delay);
  }
302 303 304
  else
    bgp_startup(p);

Ondřej Zajíček's avatar
Ondřej Zajíček committed
305
  return;
306

Ondřej Zajíček's avatar
Ondřej Zajíček committed
307 308 309 310 311 312
err2:
  bgp_close(p);
err1:
  p->p.disabled = 1;
  bgp_store_error(p, NULL, BE_MISC, err_val);
  proto_notify_state(&p->p, PS_DOWN);
313

Ondřej Zajíček's avatar
Ondřej Zajíček committed
314
  return;
315 316
}

Martin Mareš's avatar
Martin Mareš committed
317 318 319
/**
 * bgp_start_timer - start a BGP timer
 * @t: timer
320
 * @value: time (in seconds) to fire (0 to disable the timer)
Martin Mareš's avatar
Martin Mareš committed
321
 *
Ondřej Zajíček's avatar
Ondřej Zajíček committed
322 323 324
 * This functions calls tm_start() on @t with time @value and the amount of
 * randomization suggested by the BGP standard. Please use it for all BGP
 * timers.
Martin Mareš's avatar
Martin Mareš committed
325
 */
326
void
327
bgp_start_timer(timer *t, uint value)
328
{
329
  if (value)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
330
  {
331 332 333
    /* The randomization procedure is specified in RFC 4271 section 10 */
    btime time = value S;
    btime randomize = random() % ((time / 4) + 1);
334
    tm_start(t, time - randomize);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
335
  }
336
  else
337
    tm_stop(t);
338 339
}

Martin Mareš's avatar
Martin Mareš committed
340 341 342 343
/**
 * bgp_close_conn - close a BGP connection
 * @conn: connection to close
 *
Ondřej Zajíček's avatar
Ondřej Zajíček committed
344 345
 * This function takes a connection described by the &bgp_conn structure, closes
 * its socket and frees all resources associated with it.
Martin Mareš's avatar
Martin Mareš committed
346
 */
347 348 349
void
bgp_close_conn(struct bgp_conn *conn)
{
350
  // struct bgp_proto *p = conn->bgp;
351 352 353

  DBG("BGP: Closing connection\n");
  conn->packets_to_send = 0;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
354 355 356
  conn->channels_to_send = 0;
  rfree(conn->connect_timer);
  conn->connect_timer = NULL;
357 358 359 360
  rfree(conn->keepalive_timer);
  conn->keepalive_timer = NULL;
  rfree(conn->hold_timer);
  conn->hold_timer = NULL;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
361 362
  rfree(conn->tx_ev);
  conn->tx_ev = NULL;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
363 364 365 366 367 368 369
  rfree(conn->sk);
  conn->sk = NULL;

  mb_free(conn->local_caps);
  conn->local_caps = NULL;
  mb_free(conn->remote_caps);
  conn->remote_caps = NULL;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
370 371 372 373 374 375 376
}


/**
 * bgp_update_startup_delay - update a startup delay
 * @p: BGP instance
 *
Ondřej Zajíček's avatar
Ondřej Zajíček committed
377 378 379
 * This function updates a startup delay that is used to postpone next BGP
 * connect. It also handles disable_after_error and might stop BGP instance
 * when error happened and disable_after_error is on.
Ondřej Zajíček's avatar
Ondřej Zajíček committed
380 381 382 383
 *
 * It should be called when BGP protocol error happened.
 */
void
384
bgp_update_startup_delay(struct bgp_proto *p)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
385 386 387
{
  struct bgp_config *cf = p->cf;

388
  DBG("BGP: Updating startup delay\n");
Ondřej Zajíček's avatar
Ondřej Zajíček committed
389

390
  if (p->last_proto_error && ((current_time() - p->last_proto_error) >= cf->error_amnesia_time S))
391 392
    p->startup_delay = 0;

393
  p->last_proto_error = current_time();
Ondřej Zajíček's avatar
Ondřej Zajíček committed
394 395

  if (cf->disable_after_error)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
396 397 398 399 400
  {
    p->startup_delay = 0;
    p->p.disabled = 1;
    return;
  }
Ondřej Zajíček's avatar
Ondřej Zajíček committed
401 402 403 404

  if (!p->startup_delay)
    p->startup_delay = cf->error_delay_time_min;
  else
405
    p->startup_delay = MIN(2 * p->startup_delay, cf->error_delay_time_max);
406 407
}

Ondřej Zajíček's avatar
Ondřej Zajíček committed
408
static void
409
bgp_graceful_close_conn(struct bgp_conn *conn, uint subcode, byte *data, uint len)
410
{
Ondřej Zajíček's avatar
Ondřej Zajíček committed
411
  switch (conn->state)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
412 413 414 415 416 417 418 419 420 421 422 423 424
  {
  case BS_IDLE:
  case BS_CLOSE:
    return;

  case BS_CONNECT:
  case BS_ACTIVE:
    bgp_conn_enter_idle_state(conn);
    return;

  case BS_OPENSENT:
  case BS_OPENCONFIRM:
  case BS_ESTABLISHED:
425
    bgp_error(conn, 6, subcode, data, len);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
426 427 428 429 430
    return;

  default:
    bug("bgp_graceful_close_conn: Unknown state %d", conn->state);
  }
431 432
}

Ondřej Zajíček's avatar
Ondřej Zajíček committed
433 434 435 436
static void
bgp_down(struct bgp_proto *p)
{
  if (p->start_state > BSS_PREPARE)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
437 438 439 440
  {
    bgp_setup_auth(p, 0);
    bgp_close(p);
  }
Ondřej Zajíček's avatar
Ondřej Zajíček committed
441

442
  BGP_TRACE(D_EVENTS, "Down");
Ondřej Zajíček's avatar
Ondřej Zajíček committed
443 444 445 446 447 448 449 450 451
  proto_notify_state(&p->p, PS_DOWN);
}

static void
bgp_decision(void *vp)
{
  struct bgp_proto *p = vp;

  DBG("BGP: Decision start\n");
Ondřej Zajíček's avatar
Ondřej Zajíček committed
452 453 454 455
  if ((p->p.proto_state == PS_START) &&
      (p->outgoing_conn.state == BS_IDLE) &&
      (p->incoming_conn.state != BS_OPENCONFIRM) &&
      !p->cf->passive)
456
    bgp_active(p);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
457

Ondřej Zajíček's avatar
Ondřej Zajíček committed
458 459 460
  if ((p->p.proto_state == PS_STOP) &&
      (p->outgoing_conn.state == BS_IDLE) &&
      (p->incoming_conn.state == BS_IDLE))
Ondřej Zajíček's avatar
Ondřej Zajíček committed
461 462 463
    bgp_down(p);
}

464
void
465
bgp_stop(struct bgp_proto *p, uint subcode, byte *data, uint len)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
466 467
{
  proto_notify_state(&p->p, PS_STOP);
468 469
  bgp_graceful_close_conn(&p->outgoing_conn, subcode, data, len);
  bgp_graceful_close_conn(&p->incoming_conn, subcode, data, len);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
470 471 472
  ev_schedule(p->event);
}

473
static inline void
Ondřej Zajíček's avatar
Ondřej Zajíček committed
474
bgp_conn_set_state(struct bgp_conn *conn, uint new_state)
475 476 477 478 479 480 481 482 483 484 485 486 487 488
{
  if (conn->bgp->p.mrtdump & MD_STATES)
    mrt_dump_bgp_state_change(conn, conn->state, new_state);

  conn->state = new_state;
}

/* Move connection @conn to the OpenConfirm state */
void
bgp_conn_enter_openconfirm_state(struct bgp_conn *conn)
{
  /* Only the state changes here; the real work is done in bgp_rx_open() */
  bgp_conn_set_state(conn, BS_OPENCONFIRM);
}

Ondřej Zajíček's avatar
Ondřej Zajíček committed
489 490
static const struct bgp_af_caps dummy_af_caps = { };

Ondřej Zajíček's avatar
Ondřej Zajíček committed
491 492 493 494
void
bgp_conn_enter_established_state(struct bgp_conn *conn)
{
  struct bgp_proto *p = conn->bgp;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
495 496 497
  struct bgp_caps *local = conn->local_caps;
  struct bgp_caps *peer = conn->remote_caps;
  struct bgp_channel *c;
498

Ondřej Zajíček's avatar
Ondřej Zajíček committed
499 500
  BGP_TRACE(D_EVENTS, "BGP session established");

501 502
  /* For multi-hop BGP sessions */
  if (ipa_zero(p->source_addr))
503
    p->source_addr = conn->sk->saddr;
504

505 506
  conn->sk->fast_rx = 0;

Ondřej Zajíček's avatar
Ondřej Zajíček committed
507 508 509
  p->conn = conn;
  p->last_error_class = 0;
  p->last_error_code = 0;
510

Ondřej Zajíček's avatar
Ondřej Zajíček committed
511 512 513 514
  p->as4_session = conn->as4_session;

  p->route_refresh = peer->route_refresh;
  p->enhanced_refresh = local->enhanced_refresh && peer->enhanced_refresh;
515

516 517
  /* Whether we may handle possible GR/LLGR of peer (it has some AF GR-able) */
  p->gr_ready = p->llgr_ready = 0;	/* Updated later */
518

Ondřej Zajíček's avatar
Ondřej Zajíček committed
519 520
  /* Whether peer is ready to handle our GR recovery */
  int peer_gr_ready = peer->gr_aware && !(peer->gr_flags & BGP_GRF_RESTART);
521

Ondřej Zajíček's avatar
Ondřej Zajíček committed
522
  if (p->gr_active_num)
523
    tm_stop(p->gr_timer);
524

Ondřej Zajíček's avatar
Ondřej Zajíček committed
525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549
  /* Number of active channels */
  int num = 0;

  WALK_LIST(c, p->p.channels)
  {
    const struct bgp_af_caps *loc = bgp_find_af_caps(local, c->afi);
    const struct bgp_af_caps *rem = bgp_find_af_caps(peer,  c->afi);

    /* Ignore AFIs that were not announced in multiprotocol capability */
    if (!loc || !loc->ready)
      loc = &dummy_af_caps;

    if (!rem || !rem->ready)
      rem = &dummy_af_caps;

    int active = loc->ready && rem->ready;
    c->c.disabled = !active;
    c->c.reloadable = p->route_refresh;

    c->index = active ? num++ : 0;

    c->feed_state = BFS_NONE;
    c->load_state = BFS_NONE;

    /* Channels where peer may do GR */
550 551 552 553
    uint gr_ready = active && local->gr_aware && rem->gr_able;
    uint llgr_ready = active && local->llgr_aware && rem->llgr_able;

    c->gr_ready = gr_ready || llgr_ready;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
554
    p->gr_ready = p->gr_ready || c->gr_ready;
555 556 557 558
    p->llgr_ready = p->llgr_ready || llgr_ready;

    /* Remember last LLGR stale time */
    c->stale_time = local->llgr_aware ? rem->llgr_time : 0;
559

Ondřej Zajíček's avatar
Ondřej Zajíček committed
560 561 562
    /* Channels not able to recover gracefully */
    if (p->p.gr_recovery && (!active || !peer_gr_ready))
      channel_graceful_restart_unlock(&c->c);
563

Ondřej Zajíček's avatar
Ondřej Zajíček committed
564 565 566 567
    /* Channels waiting for local convergence */
    if (p->p.gr_recovery && loc->gr_able && peer_gr_ready)
      c->c.gr_wait = 1;

568 569 570 571 572 573 574 575
    /* Channels where regular graceful restart failed */
    if ((c->gr_active == BGP_GRS_ACTIVE) &&
	!(active && rem->gr_able && (rem->gr_af_flags & BGP_GRF_FORWARDING)))
      bgp_graceful_restart_done(c);

    /* Channels where regular long-lived restart failed */
    if ((c->gr_active == BGP_GRS_LLGR) &&
	!(active && rem->llgr_able && (rem->gr_af_flags & BGP_LLGRF_FORWARDING)))
Ondřej Zajíček's avatar
Ondřej Zajíček committed
576 577 578 579 580 581
      bgp_graceful_restart_done(c);

    /* GR capability implies that neighbor will send End-of-RIB */
    if (peer->gr_aware)
      c->load_state = BFS_LOADING;

582
    c->ext_next_hop = c->cf->ext_next_hop && (bgp_channel_is_ipv6(c) || rem->ext_next_hop);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
583 584 585
    c->add_path_rx = (loc->add_path & BGP_ADD_PATH_RX) && (rem->add_path & BGP_ADD_PATH_TX);
    c->add_path_tx = (loc->add_path & BGP_ADD_PATH_TX) && (rem->add_path & BGP_ADD_PATH_RX);

Ondřej Zajíček's avatar
Ondřej Zajíček committed
586
    /* Update RA mode */
Ondřej Zajíček's avatar
Ondřej Zajíček committed
587 588
    if (c->add_path_tx)
      c->c.ra_mode = RA_ANY;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
589 590 591 592
    else if (c->cf->secondary)
      c->c.ra_mode = RA_ACCEPTED;
    else
      c->c.ra_mode = RA_OPTIMAL;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608
  }

  p->afi_map = mb_alloc(p->p.pool, num * sizeof(u32));
  p->channel_map = mb_alloc(p->p.pool, num * sizeof(void *));
  p->channel_count = num;

  WALK_LIST(c, p->p.channels)
  {
    if (c->c.disabled)
      continue;

    p->afi_map[c->index] = c->afi;
    p->channel_map[c->index] = c;
  }

  /* proto_notify_state() will likely call bgp_feed_begin(), setting c->feed_state */
609

610
  bgp_conn_set_state(conn, BS_ESTABLISHED);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
611 612 613 614 615 616 617 618 619 620
  proto_notify_state(&p->p, PS_UP);
}

/*
 * The established connection of @p is gone: forget it and, when the protocol
 * is still up, start its shutdown with no Cease subcode and no data.
 */
static void
bgp_conn_leave_established_state(struct bgp_proto *p)
{
  BGP_TRACE(D_EVENTS, "BGP session closed");
  p->conn = NULL;

  if (p->p.proto_state != PS_UP)
    return;

  bgp_stop(p, 0, NULL, 0);
}

void
bgp_conn_enter_close_state(struct bgp_conn *conn)
{
  struct bgp_proto *p = conn->bgp;
  int os = conn->state;

630
  bgp_conn_set_state(conn, BS_CLOSE);
631
  tm_stop(conn->keepalive_timer);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
632 633
  conn->sk->rx_hook = NULL;

634 635 636
  /* Timeout for CLOSE state, if we cannot send notification soon then we just hangup */
  bgp_start_timer(conn->hold_timer, 10);

Ondřej Zajíček's avatar
Ondřej Zajíček committed
637 638 639 640 641 642 643 644 645 646 647
  if (os == BS_ESTABLISHED)
    bgp_conn_leave_established_state(p);
}

void
bgp_conn_enter_idle_state(struct bgp_conn *conn)
{
  struct bgp_proto *p = conn->bgp;
  int os = conn->state;

  bgp_close_conn(conn);
648
  bgp_conn_set_state(conn, BS_IDLE);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
649 650 651 652 653 654
  ev_schedule(p->event);

  if (os == BS_ESTABLISHED)
    bgp_conn_leave_established_state(p);
}

655 656 657 658 659 660 661 662 663 664 665
/**
 * bgp_handle_graceful_restart - handle detected BGP graceful restart
 * @p: BGP instance
 *
 * This function is called when a BGP graceful restart of the neighbor is
 * detected (when the TCP connection fails or when a new TCP connection
 * appears). The function activates processing of the restart - starts routing
 * table refresh cycle and activates BGP restart timer. The protocol state goes
 * back to %PS_START, but changing BGP state back to %BS_IDLE is left for the
 * caller.
 */
666 667 668 669 670 671
void
bgp_handle_graceful_restart(struct bgp_proto *p)
{
  ASSERT(p->conn && (p->conn->state == BS_ESTABLISHED) && p->gr_ready);

  BGP_TRACE(D_EVENTS, "Neighbor graceful restart detected%s",
Ondřej Zajíček's avatar
Ondřej Zajíček committed
672 673 674
	    p->gr_active_num ? " - already pending" : "");

  p->gr_active_num = 0;
675

Ondřej Zajíček's avatar
Ondřej Zajíček committed
676 677 678
  struct bgp_channel *c;
  WALK_LIST(c, p->p.channels)
  {
Ondřej Zajíček's avatar
Ondřej Zajíček committed
679 680 681 682
    /* FIXME: perhaps check for channel state instead of disabled flag? */
    if (c->c.disabled)
      continue;

Ondřej Zajíček's avatar
Ondřej Zajíček committed
683 684
    if (c->gr_ready)
    {
685 686 687 688 689 690 691 692 693 694
      p->gr_active_num++;

      switch (c->gr_active)
      {
      case BGP_GRS_NONE:
	c->gr_active = BGP_GRS_ACTIVE;
	rt_refresh_begin(c->c.table, &c->c);
	break;

      case BGP_GRS_ACTIVE:
Ondřej Zajíček's avatar
Ondřej Zajíček committed
695
	rt_refresh_end(c->c.table, &c->c);
696 697
	rt_refresh_begin(c->c.table, &c->c);
	break;
698

699 700 701 702 703
      case BGP_GRS_LLGR:
	rt_refresh_begin(c->c.table, &c->c);
	rt_modify_stale(c->c.table, &c->c);
	break;
      }
Ondřej Zajíček's avatar
Ondřej Zajíček committed
704 705 706 707 708 709 710
    }
    else
    {
      /* Just flush the routes */
      rt_refresh_begin(c->c.table, &c->c);
      rt_refresh_end(c->c.table, &c->c);
    }
Ondřej Zajíček's avatar
Ondřej Zajíček committed
711 712 713 714 715 716 717

    /* Reset bucket and prefix tables */
    bgp_free_bucket_table(c);
    bgp_free_prefix_table(c);
    bgp_init_bucket_table(c);
    bgp_init_prefix_table(c);
    c->packets_to_send = 0;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
718 719
  }

720 721 722
  /* p->gr_ready -> at least one active channel is c->gr_ready */
  ASSERT(p->gr_active_num > 0);

Ondřej Zajíček's avatar
Ondřej Zajíček committed
723
  proto_notify_state(&p->p, PS_START);
724
  tm_start(p->gr_timer, p->conn->remote_caps->gr_time S);
725 726
}

727 728
/**
 * bgp_graceful_restart_done - finish active BGP graceful restart
Ondřej Zajíček's avatar
Ondřej Zajíček committed
729
 * @c: BGP channel
730 731
 *
 * This function is called when the active BGP graceful restart of the neighbor
Ondřej Zajíček's avatar
Ondřej Zajíček committed
732 733 734 735
 * should be finished for channel @c - either successfully (the neighbor sends
 * all paths and reports end-of-RIB for given AFI/SAFI on the new session) or
 * unsuccessfully (the neighbor does not support BGP graceful restart on the new
 * session). The function ends the routing table refresh cycle.
736
 */
737
void
Ondřej Zajíček's avatar
Ondřej Zajíček committed
738
bgp_graceful_restart_done(struct bgp_channel *c)
739
{
Ondřej Zajíček's avatar
Ondřej Zajíček committed
740 741 742 743 744 745 746 747 748
  struct bgp_proto *p = (void *) c->c.proto;

  ASSERT(c->gr_active);
  c->gr_active = 0;
  p->gr_active_num--;

  if (!p->gr_active_num)
    BGP_TRACE(D_EVENTS, "Neighbor graceful restart done");

749
  tm_stop(c->stale_timer);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
750
  rt_refresh_end(c->c.table, &c->c);
751 752
}

753 754 755 756 757 758 759 760 761
/**
 * bgp_graceful_restart_timeout - timeout of graceful restart 'restart timer'
 * @t: timer
 *
 * This function is a timeout hook for @gr_timer, implementing BGP restart time
 * limit for reestablisment of the BGP session after the graceful restart. When
 * fired, we just proceed with the usual protocol restart.
 */

762 763 764 765 766 767
static void
bgp_graceful_restart_timeout(timer *t)
{
  struct bgp_proto *p = t->data;

  BGP_TRACE(D_EVENTS, "Neighbor graceful restart timeout");
768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807

  if (p->llgr_ready)
  {
    struct bgp_channel *c;
    WALK_LIST(c, p->p.channels)
    {
      /* Channel is not in GR and is already flushed */
      if (!c->gr_active)
	continue;

      /* Channel is already in LLGR from past restart */
      if (c->gr_active == BGP_GRS_LLGR)
	continue;

      /* Channel is in GR, but does not support LLGR -> stop GR */
      if (!c->stale_time)
      {
	bgp_graceful_restart_done(c);
	continue;
      }

      /* Channel is in GR, and supports LLGR -> start LLGR */
      c->gr_active = BGP_GRS_LLGR;
      tm_start(c->stale_timer, c->stale_time S);
      rt_modify_stale(c->c.table, &c->c);
    }
  }
  else
    bgp_stop(p, 0, NULL, 0);
}

/*
 * Timer hook; the channel is taken from the timer data pointer. It logs the
 * expiration and finishes the graceful restart of that channel.
 */
static void
bgp_long_lived_stale_timeout(timer *t)
{
  struct bgp_channel *ch = t->data;
  struct bgp_proto *p = (void *) ch->c.proto;

  BGP_TRACE(D_EVENTS, "Long-lived stale timeout");

  bgp_graceful_restart_done(ch);
}

810 811 812

/**
 * bgp_refresh_begin - start incoming enhanced route refresh sequence
Ondřej Zajíček's avatar
Ondřej Zajíček committed
813
 * @c: BGP channel
814 815 816 817 818 819 820 821
 *
 * This function is called when an incoming enhanced route refresh sequence is
 * started by the neighbor, demarcated by the BoRR packet. The function updates
 * the load state and starts the routing table refresh cycle. Note that graceful
 * restart also uses routing table refresh cycle, but RFC 7313 and load states
 * ensure that these two sequences do not overlap.
 */
void
Ondřej Zajíček's avatar
Ondřej Zajíček committed
822
bgp_refresh_begin(struct bgp_channel *c)
823
{
Ondřej Zajíček's avatar
Ondřej Zajíček committed
824 825 826 827
  struct bgp_proto *p = (void *) c->c.proto;

  if (c->load_state == BFS_LOADING)
  { log(L_WARN "%s: BEGIN-OF-RR received before END-OF-RIB, ignoring", p->p.name); return; }
828

Ondřej Zajíček's avatar
Ondřej Zajíček committed
829 830
  c->load_state = BFS_REFRESHING;
  rt_refresh_begin(c->c.table, &c->c);
831 832 833 834
}

/**
 * bgp_refresh_end - finish incoming enhanced route refresh sequence
Ondřej Zajíček's avatar
Ondřej Zajíček committed
835
 * @c: BGP channel
836 837 838 839 840 841 842
 *
 * This function is called when an incoming enhanced route refresh sequence is
 * finished by the neighbor, demarcated by the EoRR packet. The function updates
 * the load state and ends the routing table refresh cycle. Routes not received
 * during the sequence are removed by the nest.
 */
void
Ondřej Zajíček's avatar
Ondřej Zajíček committed
843
bgp_refresh_end(struct bgp_channel *c)
844
{
Ondřej Zajíček's avatar
Ondřej Zajíček committed
845
  struct bgp_proto *p = (void *) c->c.proto;
846

Ondřej Zajíček's avatar
Ondřej Zajíček committed
847 848 849 850 851
  if (c->load_state != BFS_REFRESHING)
  { log(L_WARN "%s: END-OF-RR received without prior BEGIN-OF-RR, ignoring", p->p.name); return; }

  c->load_state = BFS_NONE;
  rt_refresh_end(c->c.table, &c->c);
852 853 854
}


855 856 857 858 859
static void
bgp_send_open(struct bgp_conn *conn)
{
  DBG("BGP: Sending open\n");
  conn->sk->rx_hook = bgp_rx;
860
  conn->sk->tx_hook = bgp_tx;
861
  tm_stop(conn->connect_timer);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
862
  bgp_schedule_packet(conn, NULL, PKT_OPEN);
863
  bgp_conn_set_state(conn, BS_OPENSENT);
864
  bgp_start_timer(conn->hold_timer, conn->bgp->cf->initial_hold_time);
865 866
}

867 868
/*
 * Socket hook installed as tx_hook by bgp_connect(); it logs the event and
 * continues by sending the OPEN message on the associated connection.
 */
static void
bgp_connected(sock *sk)
{
  struct bgp_conn *conn = sk->data;
  /* Not used directly below; presumably BGP_TRACE() refers to it */
  struct bgp_proto *p = conn->bgp;

  BGP_TRACE(D_EVENTS, "Connected");

  bgp_send_open(conn);
}

/*
 * Hook of the connect timer. When the protocol is in the PS_START state, the
 * connection is closed and a new connect attempt is started; otherwise the
 * connection just enters the idle state.
 */
static void
bgp_connect_timeout(timer *t)
{
  struct bgp_conn *conn = t->data;
  struct bgp_proto *p = conn->bgp;

  DBG("BGP: connect_timeout\n");

  if (p->p.proto_state != PS_START)
  {
    bgp_conn_enter_idle_state(conn);
    return;
  }

  /* Still starting -> drop this attempt and try again */
  bgp_close_conn(conn);
  bgp_connect(p);
}

static void
894
bgp_sock_err(sock *sk, int err)
895 896
{
  struct bgp_conn *conn = sk->data;
Martin Mareš's avatar
Martin Mareš committed
897
  struct bgp_proto *p = conn->bgp;
898

899 900 901 902 903 904 905 906 907
  /*
   * This error hook may be called either asynchronously from main
   * loop, or synchronously from sk_send().  But sk_send() is called
   * only from bgp_tx() and bgp_kick_tx(), which are both called
   * asynchronously from main loop. Moreover, they end if err hook is
   * called. Therefore, we could suppose that it is always called
   * asynchronously.
   */

Ondřej Zajíček's avatar
Ondřej Zajíček committed
908 909
  bgp_store_error(p, conn, BE_SOCKET, err);

910 911 912 913
  if (err)
    BGP_TRACE(D_EVENTS, "Connection lost (%M)", err);
  else
    BGP_TRACE(D_EVENTS, "Connection closed");
Ondřej Zajíček's avatar
Ondřej Zajíček committed
914

915 916 917
  if ((conn->state == BS_ESTABLISHED) && p->gr_ready)
    bgp_handle_graceful_restart(p);

Ondřej Zajíček's avatar
Ondřej Zajíček committed
918
  bgp_conn_enter_idle_state(conn);
919 920
}

921 922 923 924
/*
 * Hook of the hold timer. Depending on the connection state it either just
 * hangs up, postpones the timeout, hands the session over to graceful restart
 * handling, or reports an error with code 4 to the neighbor.
 */
static void
bgp_hold_timeout(timer *t)
{
  struct bgp_conn *conn = t->data;
  struct bgp_proto *p = conn->bgp;

  DBG("BGP: Hold timeout\n");

  /* The connection is already being closed - just do the hangup */
  if (conn->state == BS_CLOSE)
  {
    BGP_TRACE(D_EVENTS, "Connection stalled");
    bgp_conn_enter_idle_state(conn);
    return;
  }

  /*
   * Pending data in the input queue suggest that we are congested and have
   * merely not processed received BGP packets in time, so give it another
   * 10 time units (presumably seconds) instead of failing.
   */
  if (sk_rx_ready(conn->sk) > 0)
  {
    bgp_start_timer(conn->hold_timer, 10);
    return;
  }

  /* Established session with LLGR ready -> handle it as graceful restart */
  if ((conn->state == BS_ESTABLISHED) && p->llgr_ready)
  {
    BGP_TRACE(D_EVENTS, "Hold timer expired");
    bgp_handle_graceful_restart(p);
    bgp_conn_enter_idle_state(conn);
    return;
  }

  /* Error code 4 (Hold Timer Expired in RFC 4271), no subcode, no data */
  bgp_error(conn, 4, 0, NULL, 0);
}

/*
 * Hook of the keepalive timer; it schedules a KEEPALIVE packet on the
 * connection and, if the TX event is already pending, runs it immediately.
 */
static void
bgp_keepalive_timeout(timer *t)
{
  struct bgp_conn *conn = t->data;

  DBG("BGP: Keepalive timer\n");

  bgp_schedule_packet(conn, NULL, PKT_KEEPALIVE);

  /* Do not wait for the event loop, kick TX a bit faster */
  if (ev_active(conn->tx_ev))
  {
    ev_run(conn->tx_ev);
  }
}

965
static void
966
bgp_setup_conn(struct bgp_proto *p, struct bgp_conn *conn)
967
{
968
  conn->sk = NULL;
969
  conn->bgp = p;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
970

971
  conn->packets_to_send = 0;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
972 973 974 975
  conn->channels_to_send = 0;
  conn->last_channel = 0;
  conn->last_channel_count = 0;

976 977 978
  conn->connect_timer	= tm_new_init(p->p.pool, bgp_connect_timeout,	 conn, 0, 0);
  conn->hold_timer 	= tm_new_init(p->p.pool, bgp_hold_timeout,	 conn, 0, 0);
  conn->keepalive_timer	= tm_new_init(p->p.pool, bgp_keepalive_timeout, conn, 0, 0);
979

Ondřej Zajíček's avatar
Ondřej Zajíček committed
980 981 982
  conn->tx_ev = ev_new(p->p.pool);
  conn->tx_ev->hook = bgp_kick_tx;
  conn->tx_ev->data = conn;
983 984
}

985
static void
986
bgp_setup_sk(struct bgp_conn *conn, sock *s)
987 988 989
{
  s->data = conn;
  s->err_hook = bgp_sock_err;
990
  s->fast_rx = 1;
991 992 993
  conn->sk = s;
}

Ondřej Zajíček's avatar
Ondřej Zajíček committed
994
static void
995
bgp_active(struct bgp_proto *p)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
996
{
997
  int delay = MAX(1, p->cf->connect_delay_time);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
998 999 1000 1001
  struct bgp_conn *conn = &p->outgoing_conn;

  BGP_TRACE(D_EVENTS, "Connect delayed by %d seconds", delay);
  bgp_setup_conn(p, conn);
1002
  bgp_conn_set_state(conn, BS_ACTIVE);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1003
  bgp_start_timer(conn->connect_timer, delay);
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1004 1005
}

Martin Mareš's avatar
Martin Mareš committed
1006 1007 1008 1009 1010 1011 1012 1013
/**
 * bgp_connect - initiate an outgoing connection
 * @p: BGP instance
 *
 * The bgp_connect() function creates a new &bgp_conn and initiates
 * a TCP connection to the peer. The rest of connection setup is governed
 * by the BGP state machine as described in the standard.
 */
1014 1015 1016
static void
bgp_connect(struct bgp_proto *p)	/* Enter Connect state and start establishing connection */
{
1017
  struct bgp_conn *conn = &p->outgoing_conn;
1018
  int hops = p->cf->multihop ? : 1;
1019 1020

  DBG("BGP: Connecting\n");
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1021
  sock *s = sk_new(p->p.pool);
1022
  s->type = SK_TCP_ACTIVE;
1023
  s->saddr = p->source_addr;
1024
  s->daddr = p->cf->remote_ip;
1025
  s->dport = p->cf->remote_port;
1026
  s->iface = p->neigh ? p->neigh->iface : NULL;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1027
  s->vrf = p->p.vrf;
1028
  s->ttl = p->cf->ttl_security ? 255 : hops;
1029 1030
  s->rbsize = p->cf->enable_extended_messages ? BGP_RX_BUFFER_EXT_SIZE : BGP_RX_BUFFER_SIZE;
  s->tbsize = p->cf->enable_extended_messages ? BGP_TX_BUFFER_EXT_SIZE : BGP_TX_BUFFER_SIZE;
1031 1032 1033
  s->tos = IP_PREC_INTERNET_CONTROL;
  s->password = p->cf->password;
  s->tx_hook = bgp_connected;
1034
  BGP_TRACE(D_EVENTS, "Connecting to %I%J from local address %I%J", s->daddr, p->cf->iface,
1035
	    s->saddr, ipa_is_link_local(s->saddr) ? s->iface : NULL);
1036
  bgp_setup_conn(p, conn);
1037
  bgp_setup_sk(conn, s);
1038
  bgp_conn_set_state(conn, BS_CONNECT);
1039 1040

  if (sk_open(s) < 0)
1041
    goto err;
1042 1043 1044 1045

  /* Set minimal receive TTL if needed */
  if (p->cf->ttl_security)
    if (sk_set_min_ttl(s, 256 - hops) < 0)
1046
      goto err;
1047

1048
  DBG("BGP: Waiting for connect success\n");
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1049
  bgp_start_timer(conn->connect_timer, p->cf->connect_retry_time);
1050 1051
  return;

Ondřej Zajíček's avatar
Ondřej Zajíček committed
1052
err:
1053 1054 1055
  sk_log_error(s, p->p.name);
  bgp_sock_err(s, 0);
  return;
1056 1057
}

1058 1059 1060 1061 1062 1063 1064 1065
/**
 * bgp_find_proto - find existing proto for incoming connection
 * @sk: TCP socket
 *
 * The function walks the list of protocols and returns the first BGP instance
 * whose configuration matches the socket: the remote address must be equal to
 * the socket destination address, the configured interface (if any) must be
 * the socket interface, the configured local address (if nonzero) must be
 * equal to the socket source address and the configured local port must be
 * the socket source port. Returns %NULL when no instance matches.
 */
static struct bgp_proto *
bgp_find_proto(sock *sk)
{
  struct bgp_proto *p;

  WALK_LIST(p, proto_list)
  {
    /* Only BGP instances are of interest; must be tested first */
    if (p->p.proto != &proto_bgp)
      continue;

    if (!ipa_equal(p->cf->remote_ip, sk->daddr))
      continue;

    /* The interface is compared only when it is configured */
    if (p->cf->iface && (p->cf->iface != sk->iface))
      continue;

    /* The local address is compared only when it is configured */
    if (!ipa_zero(p->cf->local_ip) && !ipa_equal(p->cf->local_ip, sk->saddr))
      continue;

    if (p->cf->local_port != sk->sport)
      continue;

    return p;
  }

  return NULL;
}

Martin Mareš's avatar
Martin Mareš committed
1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090
/**
 * bgp_incoming_connection - handle an incoming connection
 * @sk: TCP socket
 * @dummy: unused
 *
 * This function serves as a socket hook for accepting of new BGP
 * connections. It searches a BGP instance corresponding to the peer
 * which has connected and if such an instance exists, it creates a
 * &bgp_conn structure, attaches it to the instance and either sends
 * an Open message or (if there already is an active connection) it
 * closes the new connection by sending a Notification message.
 */
1091
static int
1092
bgp_incoming_connection(sock *sk, uint dummy UNUSED)
1093
{
1094 1095
  struct bgp_proto *p;
  int acc, hops;
1096

1097
  DBG("BGP: Incoming connection from %I port %d\n", sk->daddr, sk->dport);
1098 1099
  p = bgp_find_proto(sk);
  if (!p)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1100 1101 1102 1103 1104 1105
  {
    log(L_WARN "BGP: Unexpected connect from unknown address %I%J (port %d)",
	sk->daddr, ipa_is_link_local(sk->daddr) ? sk->iface : NULL, sk->dport);
    rfree(sk);
    return 0;
  }
1106

1107 1108 1109 1110 1111 1112 1113
  /*
   * BIRD should keep multiple incoming connections in OpenSent state (for
   * details RFC 4271 8.2.1 par 3), but it keeps just one. Duplicate incoming
   * connections are rejected istead. The exception is the case where an
   * incoming connection triggers a graceful restart.
   */

1114 1115
  acc = (p->p.proto_state == PS_START || p->p.proto_state == PS_UP) &&
    (p->start_state >= BSS_CONNECT) && (!p->incoming_conn.sk);
1116

1117
  if (p->conn && (p->conn->state == BS_ESTABLISHED) && p->gr_ready)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1118 1119 1120 1121 1122 1123 1124 1125 1126 1127
  {
    bgp_store_error(p, NULL, BE_MISC, BEM_GRACEFUL_RESTART);
    bgp_handle_graceful_restart(p);
    bgp_conn_enter_idle_state(p->conn);
    acc = 1;

    /* There might be separate incoming connection in OpenSent state */
    if (p->incoming_conn.state > BS_ACTIVE)
      bgp_close_conn(&p->incoming_conn);
  }
1128 1129 1130 1131 1132 1133

  BGP_TRACE(D_EVENTS, "Incoming connection from %I%J (port %d) %s",
	    sk->daddr, ipa_is_link_local(sk->daddr) ? sk->iface : NULL,
	    sk->dport, acc ? "accepted" : "rejected");

  if (!acc)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1134 1135 1136 1137
  {
    rfree(sk);
    return 0;
  }
1138 1139 1140 1141 1142 1143 1144 1145 1146 1147

  hops = p->cf->multihop ? : 1;

  if (sk_set_ttl(sk, p->cf->ttl_security ? 255 : hops) < 0)
    goto err;

  if (p->cf->ttl_security)
    if (sk_set_min_ttl(sk, 256 - hops) < 0)
      goto err;

1148
  if (p->cf->enable_extended_messages)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1149 1150 1151 1152 1153
  {
    sk->rbsize = BGP_RX_BUFFER_EXT_SIZE;
    sk->tbsize = BGP_TX_BUFFER_EXT_SIZE;
    sk_reallocate(sk);
  }
1154

1155 1156 1157 1158 1159 1160 1161 1162
  bgp_setup_conn(p, &p->incoming_conn);
  bgp_setup_sk(&p->incoming_conn, sk);
  bgp_send_open(&p->incoming_conn);
  return 0;

err:
  sk_log_error(sk, p->p.name);
  log(L_ERR "%s: Incoming connection aborted", p->p.name);
1163 1164 1165 1166
  rfree(sk);
  return 0;
}

1167
static void
1168
bgp_listen_sock_err(sock *sk UNUSED, int err)
1169 1170 1171 1172
{
  if (err == ECONNABORTED)
    log(L_WARN "BGP: Incoming connection aborted");
  else
1173
    log(L_ERR "BGP: Error on listening socket: %M", err);
1174 1175
}

1176 1177 1178
static void
bgp_start_neighbor(struct bgp_proto *p)
{
1179 1180 1181
  /* Called only for single-hop BGP sessions */

  if (ipa_zero(p->source_addr))
1182
    p->source_addr = p->neigh->ifa->ip;
1183

Ondřej Zajíček's avatar
Ondřej Zajíček committed
1184 1185
  if (ipa_is_link_local(p->source_addr))
    p->link_addr = p->source_addr;
1186 1187
  else if (p->neigh->iface->llv6)
    p->link_addr = p->neigh->iface->llv6->ip;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1188

1189
  bgp_initiate(p);
1190 1191 1192 1193 1194 1195
}

/*
 * Neighbor notification hook. While the protocol waits in the prepare start
 * state, it starts the session as soon as the neighbor is reachable (and its
 * link is up, if link checking is configured). Otherwise, a lost neighbor or
 * a link going down stops the protocol.
 */
static void
bgp_neigh_notify(neighbor *n)
{
  struct bgp_proto *p = (struct bgp_proto *) n->proto;
  int ps = p->p.proto_state;

  /* Ignore foreign neighbor entries and protocols that are down or stopping */
  if ((n != p->neigh) || (ps == PS_DOWN) || (ps == PS_STOP))
    return;

  int prepare = (ps == PS_START) && (p->start_state == BSS_PREPARE);
  int lost = (n->scope <= 0);
  int link_down = !lost && p->cf->check_link && !(n->iface->flags & IF_LINK_UP);

  if (prepare)
  {
    /* Still waiting for the neighbor; start only when everything is fine */
    if (!lost && !link_down)
    {
      BGP_TRACE(D_EVENTS, "Neighbor ready");
      bgp_start_neighbor(p);
    }

    return;
  }

  if (lost)
  {
    BGP_TRACE(D_EVENTS, "Neighbor lost");
    bgp_store_error(p, NULL, BE_MISC, BEM_NEIGHBOR_LOST);
    /* Perhaps also run bgp_update_startup_delay(p)? */
    bgp_stop(p, 0, NULL, 0);
  }
  else if (link_down)
  {
    BGP_TRACE(D_EVENTS, "Link down");
    bgp_store_error(p, NULL, BE_MISC, BEM_LINK_DOWN);
    if (ps == PS_UP)
      bgp_update_startup_delay(p);
    bgp_stop(p, 0, NULL, 0);
  }
}

1237 1238 1239 1240 1241 1242 1243
static void
bgp_bfd_notify(struct bfd_request *req)
{
  struct bgp_proto *p = req->data;
  int ps = p->p.proto_state;

  if (req->down && ((ps == PS_START) || (ps == PS_UP)))
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1244 1245 1246
  {
    BGP_TRACE(D_EVENTS, "BFD session down");
    bgp_store_error(p, NULL, BE_MISC, BEM_BFD_DOWN);
1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266

    if (p->cf->bfd == BGP_BFD_GRACEFUL)
    {
      /* Trigger graceful restart */
      if (p->conn && (p->conn->state == BS_ESTABLISHED) && p->gr_ready)
	bgp_handle_graceful_restart(p);

      if (p->incoming_conn.state > BS_IDLE)
	bgp_conn_enter_idle_state(&p->incoming_conn);

      if (p->outgoing_conn.state > BS_IDLE)
	bgp_conn_enter_idle_state(&p->outgoing_conn);
    }
    else
    {
      /* Trigger session down */
      if (ps == PS_UP)
	bgp_update_startup_delay(p);
      bgp_stop(p, 0, NULL, 0);
    }
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1267
  }
1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278
}

/*
 * Bring the BFD request of the protocol in line with @use_bfd: request a
 * BFD session when it is wanted and missing, free it when it is present
 * and no longer wanted.
 */
static void
bgp_update_bfd(struct bgp_proto *p, int use_bfd)
{
  if (use_bfd)
  {
    /* Multihop sessions are requested without an interface */
    if (!p->bfd_req)
      p->bfd_req = bfd_request_session(p->p.pool, p->cf->remote_ip, p->source_addr,
                                       p->cf->multihop ? NULL : p->neigh->iface,
                                       bgp_bfd_notify, p);
  }
  else if (p->bfd_req)
  {
    rfree(p->bfd_req);
    p->bfd_req = NULL;
  }
}

Ondřej Zajíček's avatar
Ondřej Zajíček committed
1285 1286
static void
bgp_reload_routes(struct channel *C)
1287
{
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1288 1289
  struct bgp_proto *p = (void *) C->proto;
  struct bgp_channel *c = (void *) C;
1290

Ondřej Zajíček's avatar
Ondřej Zajíček committed
1291 1292 1293
  ASSERT(p->conn && p->route_refresh);

  bgp_schedule_packet(p->conn, c, PKT_ROUTE_REFRESH);
1294 1295
}

1296
static void
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1297
bgp_feed_begin(struct channel *C, int initial)
1298
{
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1299 1300
  struct bgp_proto *p = (void *) C->proto;
  struct bgp_channel *c = (void *) C;
1301 1302 1303

  /* This should not happen */
  if (!p->conn)
1304 1305
    return;

1306
  if (initial && p->cf->gr_mode)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1307
    c->feed_state = BFS_LOADING;
1308 1309

  /* It is refeed and both sides support enhanced route refresh */
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1310 1311 1312 1313 1314
  if (!initial && p->enhanced_refresh)
  {
    /* BoRR must not be sent before End-of-RIB */
    if (c->feed_state == BFS_LOADING || c->feed_state == BFS_LOADED)
      return;
1315

Ondřej Zajíček's avatar
Ondřej Zajíček committed
1316 1317 1318
    c->feed_state = BFS_REFRESHING;
    bgp_schedule_packet(p->conn, c, PKT_BEGIN_REFRESH);
  }
1319 1320 1321
}

static void
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1322
bgp_feed_end(struct channel *C)
1323
{
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1324 1325
  struct bgp_proto *p = (void *) C->proto;
  struct bgp_channel *c = (void *) C;
1326 1327 1328 1329 1330 1331

  /* This should not happen */
  if (!p->conn)
    return;

  /* Non-demarcated feed ended, nothing to do */
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1332
  if (c->feed_state == BFS_NONE)
1333 1334 1335
    return;

  /* Schedule End-of-RIB packet */
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1336 1337
  if (c->feed_state == BFS_LOADING)
    c->feed_state = BFS_LOADED;
1338 1339

  /* Schedule EoRR packet */
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1340 1341
  if (c->feed_state == BFS_REFRESHING)
    c->feed_state = BFS_REFRESHED;
1342 1343

  /* Kick TX hook */
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1344
  bgp_schedule_packet(p->conn, c, PKT_UPDATE);
1345 1346
}

1347

1348 1349 1350 1351 1352 1353
static void
bgp_start_locked(struct object_lock *lock)
{
  struct bgp_proto *p = lock->data;
  struct bgp_config *cf = p->cf;

Ondřej Zajíček's avatar
Ondřej Zajíček committed
1354
  if (p->p.proto_state != PS_START)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1355 1356 1357 1358
  {
    DBG("BGP: Got lock in different state %d\n", p->p.proto_state);
    return;
  }
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1359

1360
  DBG("BGP: Got lock\n");
1361

1362
  if (cf->multihop)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1363 1364 1365 1366 1367
  {
    /* Multi-hop sessions do not use neighbor entries */
    bgp_initiate(p);
    return;
  }
1368

1369
  neighbor *n = neigh_find(&p->p, cf->remote_ip, cf->iface, NEF_STICKY);
1370
  if (!n)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1371 1372 1373 1374 1375 1376 1377 1378
  {
    log(L_ERR "%s: Invalid remote address %I%J", p->p.name, cf->remote_ip, cf->iface);
    /* As we do not start yet, we can just disable protocol */
    p->p.disabled = 1;
    bgp_store_error(p, NULL, BE_MISC, BEM_INVALID_NEXT_HOP);
    proto_notify_state(&p->p, PS_DOWN);
    return;
  }
1379 1380 1381 1382

  p->neigh = n;

  if (n->scope <= 0)
1383
    BGP_TRACE(D_EVENTS, "Waiting for %I%J to become my neighbor", cf->remote_ip, cf->iface);
1384 1385 1386 1387
  else if (p->cf->check_link && !(n->iface->flags & IF_LINK_UP))
    BGP_TRACE(D_EVENTS, "Waiting for link on %s", n->iface->name);
  else
    bgp_start_neighbor(p);
1388 1389
}

1390 1391 1392
static int
bgp_start(struct proto *P)
{
1393 1394 1395
  struct bgp_proto *p = (struct bgp_proto *) P;
  struct object_lock *lock;

1396
  DBG("BGP: Startup.\n");
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1397
  p->start_state = BSS_PREPARE;
1398 1399
  p->outgoing_conn.state = BS_IDLE;
  p->incoming_conn.state = BS_IDLE;
1400
  p->neigh = NULL;
1401
  p->bfd_req = NULL;
1402
  p->gr_ready = 0;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1403
  p->gr_active_num = 0;
1404

Ondřej Zajíček's avatar
Ondřej Zajíček committed
1405 1406 1407
  p->event = ev_new(p->p.pool);
  p->event->hook = bgp_decision;
  p->event->data = p;
1408

1409 1410
  p->startup_timer = tm_new_init(p->p.pool, bgp_startup_timeout, p, 0, 0);
  p->gr_timer = tm_new_init(p->p.pool, bgp_graceful_restart_timeout, p, 0, 0);
1411

1412 1413 1414 1415
  p->local_id = proto_get_router_id(P->cf);
  if (p->rr_client)
    p->rr_cluster_id = p->cf->rr_cluster_id ? p->cf->rr_cluster_id : p->local_id;

1416
  p->remote_id = 0;
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1417
  p->source_addr = p->cf->local_ip;
1418
  p->link_addr = IPA_NONE;
1419

Ondřej Zajíček's avatar
Ondřej Zajíček committed
1420
  /* Lock all channels when in GR recovery mode */
1421
  if (p->p.gr_recovery && p->cf->gr_mode)
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1422 1423 1424 1425 1426
  {
    struct bgp_channel *c;
    WALK_LIST(c, p->p.channels)
      channel_graceful_restart_lock(&c->c);
  }
1427

1428
  /*
Ondřej Zajíček's avatar
Ondřej Zajíček committed
1429 1430
   * Before attempting to create the connection, we need to lock the port,
   * so that we are the only instance attempting to talk with that neighbor.
1431 1432 1433 1434
   */

  lock = p->lock = olock_new(P->pool);
  lock->addr = p->cf->remote_ip;
1435
  lock->port = p->cf->remote_port;