Commit 5bd73431 authored by Ondřej Zajíček's avatar Ondřej Zajíček

BGP: Long-lived graceful restart

The patch implements long-lived graceful restart for BGP, namely
draft-uttaro-idr-bgp-persistence-03.
parent 318acb0f
Pipeline #38323 passed with stages
in 4 minutes and 10 seconds
......@@ -2220,13 +2220,16 @@ using the following configuration parameters:
immediately shut down. Note that this option cannot be used with
multihop BGP. Default: enabled for direct BGP, disabled otherwise.
<tag><label id="bgp-bfd">bfd <M>switch</M></tag>
<tag><label id="bgp-bfd">bfd <M>switch</M>|graceful</tag>
BGP could use BFD protocol as an advisory mechanism for neighbor
liveness and failure detection. If enabled, BIRD sets up a BFD session
for the BGP neighbor and tracks its liveness by it. This has an
advantage of an order of magnitude lower detection times in case of
failure. Note that BFD protocol also has to be configured, see
<ref id="bfd" name="BFD"> section for details. Default: disabled.
failure. When a neighbor failure is detected, the BGP session is
restarted. Optionally, it can be configured (by <cf/graceful/ argument)
to trigger graceful restart instead of regular restart. Note that BFD
protocol also has to be configured, see <ref id="bfd" name="BFD">
section for details. Default: disabled.
<tag><label id="bgp-ttl-security">ttl security <m/switch/</tag>
Use GTSM (<rfc id="5082"> - the generalized TTL security mechanism). GTSM
......@@ -2348,6 +2351,25 @@ using the following configuration parameters:
re-establish after a restart before deleting stale routes. Default:
120 seconds.
<tag><label id="bgp-long-lived-graceful-restart">long lived graceful restart <m/switch/|aware</tag>
The long-lived graceful restart is an extension of the traditional
<ref id="bgp-graceful-restart" name="BGP graceful restart">, where stale
routes are kept even after the <ref id="bgp-graceful-restart-time"
name="restart time"> expires for additional long-lived stale time, but
they are marked with the LLGR_STALE community, depreferenced, and
withdrawn from routers not supporting LLGR. Like traditional BGP
graceful restart, it has three states: disabled, aware (receiving-only),
and enabled. Note that long-lived graceful restart requires at least
aware level of traditional BGP graceful restart. Default: aware, unless
graceful restart is disabled.
<tag><label id="bgp-long-lived-stale-time">long lived stale time <m/number/</tag>
The long-lived stale time is announced in the BGP long-lived graceful
restart capability and specifies how long the neighbor would keep stale
routes depreferenced during long-lived graceful restart until either the
session is re-established and synchronized or the stale time expires and
routes are removed. Default: 3600 seconds.
<tag><label id="bgp-interpret-communities">interpret communities <m/switch/</tag>
<rfc id="1997"> demands that BGP speaker should process well-known
communities like no-export (65535, 65281) or no-advertise (65535,
......@@ -2607,6 +2629,19 @@ be used in explicit configuration.
configure restarting role per AFI/SAFI pair by this channel option.
The option is ignored if graceful restart is disabled by protocol-wide
option. Default: off in aware mode, on in full mode.
<tag><label id="bgp-long-lived-graceful-restart-c">long lived graceful restart <m/switch/</tag>
BGP long-lived graceful restart is configured mainly by protocol-wide
<ref id="bgp-long-lived-graceful-restart" name="options">, but the
restarting role can be set per AFI/SAFI pair by this channel option.
The option is ignored if long-lived graceful restart is disabled by
protocol-wide option. Default: off in aware mode, on in full mode.
<tag><label id="bgp-long-lived-stale-time-c">long lived stale time <m/number/</tag>
Like previous graceful restart channel options, this option allows setting
<ref id="bgp-long-lived-stale-time" name="long lived stale time">
per AFI/SAFI pair instead of per protocol. Default: set by protocol-wide
option.
</descrip>
<sect1>Attributes
......@@ -2761,7 +2796,6 @@ interfaces to be defined for them to work with.
so the default time is set to a large value.
<tag><label id="device-iface">interface <m/pattern/ [, <m/.../]</tag>
By default, the Device protocol handles all interfaces without any
configuration. Interface definitions allow to specify optional
parameters for specific interfaces. See <ref id="proto-iface"
......
......@@ -229,6 +229,7 @@ struct proto {
int (*rte_better)(struct rte *, struct rte *);
int (*rte_same)(struct rte *, struct rte *);
int (*rte_mergable)(struct rte *, struct rte *);
struct rte * (*rte_modify)(struct rte *, struct linpool *);
void (*rte_insert)(struct network *, struct rte *);
void (*rte_remove)(struct network *, struct rte *);
......
......@@ -231,6 +231,7 @@ typedef struct rte {
#ifdef CONFIG_BGP
struct {
u8 suppressed; /* Used for deterministic MED comparison */
s8 stale; /* Route is LLGR_STALE, -1 if unknown */
} bgp;
#endif
#ifdef CONFIG_BABEL
......@@ -254,6 +255,7 @@ typedef struct rte {
#define REF_FILTERED 2 /* Route is rejected by import filter */
#define REF_STALE 4 /* Route is stale in a refresh cycle */
#define REF_DISCARD 8 /* Route is scheduled for discard */
#define REF_MODIFY 16 /* Route is scheduled for modify */
/* Route is valid for propagation (may depend on other flags in the future), accepts NULL */
static inline int rte_is_valid(rte *r) { return r && !(r->flags & REF_FILTERED); }
......@@ -297,6 +299,7 @@ int rt_examine(rtable *t, net_addr *a, struct proto *p, struct filter *filter);
rte *rt_export_merged(struct channel *c, net *net, rte **rt_free, linpool *pool, int silent);
void rt_refresh_begin(rtable *t, struct channel *c);
void rt_refresh_end(rtable *t, struct channel *c);
void rt_modify_stale(rtable *t, struct channel *c);
void rt_schedule_prune(rtable *t);
void rte_dump(rte *);
void rte_free(rte *);
......
......@@ -1437,6 +1437,28 @@ rte_discard(rte *old) /* Non-filtered route deletion, used during garbage collec
rte_update_unlock();
}
/*
 * Modify an existing route by the protocol's rte_modify hook, used for
 * long-lived graceful restart.
 *
 * @old: route stored in a table and marked with REF_MODIFY
 *
 * The hook is called with @old and the temporary update pool. Its result
 * decides what happens next:
 *
 *   - @old itself: the route stays untouched, only REF_MODIFY is cleared.
 *   - another route: its attributes are cached if necessary, it inherits the
 *     flags of @old (without REF_MODIFY, with REF_COW) and is handed to
 *     rte_recalculate() in place of @old.
 *   - NULL: rte_recalculate() is called with no new route.
 *
 * The mark has to be gone in every case. The table prune loop rescans the
 * whole route list of a net after each call, so a route left with REF_MODIFY
 * would be picked up again and again without any progress.
 */
static inline void
rte_modify(rte *old)
{
  rte_update_lock();

  rte *new = old->sender->proto->rte_modify(old, rte_update_pool);
  if (new == old)
  {
    /* Hook decided to keep the route as is; just drop the request mark */
    old->flags &= ~REF_MODIFY;
  }
  else
  {
    if (new)
    {
      if (!rta_is_cached(new->attrs))
	new->attrs = rta_lookup(new->attrs);

      new->flags = (old->flags & ~REF_MODIFY) | REF_COW;
    }

    rte_recalculate(old->sender, old->net, new, old->attrs->src);
  }

  rte_update_unlock();
}
/* Check rtable for best route to given net whether it would be exported to p */
int
rt_examine(rtable *t, net_addr *a, struct proto *p, struct filter *filter)
......@@ -1521,6 +1543,26 @@ rt_refresh_end(rtable *t, struct channel *c)
rt_schedule_prune(t);
}
/*
 * Request modification of stale routes of one channel.
 *
 * @t: routing table to scan
 * @c: channel whose routes are examined
 *
 * Every route in @t that was sent by @c, carries REF_STALE and is not
 * REF_FILTERED gets the REF_MODIFY flag. If at least one route was flagged,
 * a table prune is scheduled.
 */
void
rt_modify_stale(rtable *t, struct channel *c)
{
  int marked = 0;

  FIB_WALK(&t->fib, net, n)
    {
      for (rte *rt = n->routes; rt; rt = rt->next)
	{
	  /* Routes of other channels are none of our business */
	  if (rt->sender != c)
	    continue;

	  /* Only stale routes that passed the import filter are affected */
	  if (!(rt->flags & REF_STALE) || (rt->flags & REF_FILTERED))
	    continue;

	  rt->flags |= REF_MODIFY;
	  marked = 1;
	}
    }
  FIB_WALK_END;

  if (marked)
    rt_schedule_prune(t);
}
/**
* rte_dump - dump a route
......@@ -1712,6 +1754,7 @@ again:
rescan:
for (e=n->routes; e; e=e->next)
{
if (e->sender->flush_active || (e->flags & REF_DISCARD))
{
if (limit <= 0)
......@@ -1727,6 +1770,22 @@ again:
goto rescan;
}
if (e->flags & REF_MODIFY)
{
if (limit <= 0)
{
FIB_ITERATE_PUT(fit);
ev_schedule(tab->rt_event);
return;
}
rte_modify(e);
limit--;
goto rescan;
}
}
if (!n->routes) /* Orphaned FIB entry */
{
FIB_ITERATE_PUT(fit);
......
......@@ -1413,6 +1413,10 @@ bgp_import_control(struct proto *P, rte **new, struct linpool *pool UNUSED)
/* Do not export outside of AS (or confederation) */
if (!p->is_interior && int_set_contains(d, BGP_COMM_NO_EXPORT))
return -1;
/* Do not export LLGR_STALE routes to LLGR-ignorant peers */
if (!p->conn->remote_caps->llgr_aware && int_set_contains(d, BGP_COMM_LLGR_STALE))
return -1;
}
return 0;
......@@ -1580,6 +1584,19 @@ rte_resolvable(rte *rt)
return rt->attrs->dest == RTD_UNICAST;
}
/*
 * Tell whether a BGP route carries the LLGR_STALE community.
 *
 * The answer is cached in r->u.bgp.stale; a negative value there means it
 * has not been computed yet. Returns the cached value (non-zero for a stale
 * route).
 */
static inline int
rte_stale(rte *r)
{
  /* Fast path: staleness already known */
  if (r->u.bgp.stale >= 0)
    return r->u.bgp.stale;

  /* Unknown so far: look for LLGR_STALE in the community list and remember it */
  eattr *comm = ea_find(r->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_COMMUNITY));
  int found = comm && int_set_contains(comm->u.ptr, BGP_COMM_LLGR_STALE);

  r->u.bgp.stale = found;
  return r->u.bgp.stale;
}
int
bgp_rte_better(rte *new, rte *old)
{
......@@ -1604,6 +1621,14 @@ bgp_rte_better(rte *new, rte *old)
if (n < o)
return 0;
/* LLGR draft - depreference stale routes */
n = rte_stale(new);
o = rte_stale(old);
if (n > o)
return 0;
if (n < o)
return 1;
/* Start with local preferences */
x = ea_find(new->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_LOCAL_PREF));
y = ea_find(old->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_LOCAL_PREF));
......@@ -1725,6 +1750,10 @@ bgp_rte_mergable(rte *pri, rte *sec)
if (!rte_resolvable(sec))
return 0;
/* LLGR draft - depreference stale routes */
if (rte_stale(pri) != rte_stale(sec))
return 0;
/* Start with local preferences */
x = ea_find(pri->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_LOCAL_PREF));
y = ea_find(sec->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_LOCAL_PREF));
......@@ -1926,6 +1955,27 @@ bgp_rte_recalculate(rtable *table, net *net, rte *new, rte *old, rte *old_best)
return old_is_group_best;
}
/*
 * BGP implementation of the rte_modify hook: tag a route as LLGR_STALE.
 *
 * @r: route to be processed
 * @pool: linpool used for the copied route and the new community list
 *
 * Returns NULL when the route carries the NO_LLGR community, @r itself when
 * it already carries LLGR_STALE, and otherwise a copy-on-write version of the
 * route with LLGR_STALE added to its community attribute and the cached
 * staleness set to 1. When the route has no community attribute, a new one
 * is created with the BAF_PARTIAL flag.
 */
struct rte *
bgp_rte_modify_stale(struct rte *r, struct linpool *pool)
{
  eattr *comm = ea_find(r->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_COMMUNITY));
  struct adata *set = NULL;
  uint attr_flags = BAF_PARTIAL;

  if (comm)
  {
    set = comm->u.ptr;
    attr_flags = comm->flags;
  }

  if (set)
  {
    /* NO_LLGR is checked first, so it wins over LLGR_STALE */
    if (int_set_contains(set, BGP_COMM_NO_LLGR))
      return NULL;

    if (int_set_contains(set, BGP_COMM_LLGR_STALE))
      return r;
  }

  r = rte_cow_rta(r, pool);

  struct adata *tagged = int_set_add(pool, set, BGP_COMM_LLGR_STALE);
  bgp_set_attr_ptr(&(r->attrs->eattrs), pool, BA_COMMUNITY, attr_flags, tagged);

  r->u.bgp.stale = 1;

  return r;
}
/*
* Reconstruct AS_PATH and AGGREGATOR according to RFC 6793 4.2.3
......@@ -2011,6 +2061,9 @@ bgp_get_route_info(rte *e, byte *buf)
if (e->u.bgp.suppressed)
buf += bsprintf(buf, "-");
if (rte_stale(e))
buf += bsprintf(buf, "s");
if (e->attrs->hostentry)
{
if (!rte_resolvable(e))
......
......@@ -513,8 +513,8 @@ bgp_conn_enter_established_state(struct bgp_conn *conn)
p->route_refresh = peer->route_refresh;
p->enhanced_refresh = local->enhanced_refresh && peer->enhanced_refresh;
/* Whether we may handle possible GR of peer (it has some AF GR-able) */
p->gr_ready = 0; /* Updated later */
/* Whether we may handle possible GR/LLGR of peer (it has some AF GR-able) */
p->gr_ready = p->llgr_ready = 0; /* Updated later */
/* Whether peer is ready to handle our GR recovery */
int peer_gr_ready = peer->gr_aware && !(peer->gr_flags & BGP_GRF_RESTART);
......@@ -547,8 +547,15 @@ bgp_conn_enter_established_state(struct bgp_conn *conn)
c->load_state = BFS_NONE;
/* Channels where peer may do GR */
c->gr_ready = active && local->gr_aware && rem->gr_able;
uint gr_ready = active && local->gr_aware && rem->gr_able;
uint llgr_ready = active && local->llgr_aware && rem->llgr_able;
c->gr_ready = gr_ready || llgr_ready;
p->gr_ready = p->gr_ready || c->gr_ready;
p->llgr_ready = p->llgr_ready || llgr_ready;
/* Remember last LLGR stale time */
c->stale_time = local->llgr_aware ? rem->llgr_time : 0;
/* Channels not able to recover gracefully */
if (p->p.gr_recovery && (!active || !peer_gr_ready))
......@@ -558,8 +565,14 @@ bgp_conn_enter_established_state(struct bgp_conn *conn)
if (p->p.gr_recovery && loc->gr_able && peer_gr_ready)
c->c.gr_wait = 1;
/* Channels where peer is not able to recover gracefully */
if (c->gr_active && ! (c->gr_ready && (rem->gr_af_flags & BGP_GRF_FORWARDING)))
/* Channels where regular graceful restart failed */
if ((c->gr_active == BGP_GRS_ACTIVE) &&
!(active && rem->gr_able && (rem->gr_af_flags & BGP_GRF_FORWARDING)))
bgp_graceful_restart_done(c);
/* Channels where long-lived graceful restart failed */
if ((c->gr_active == BGP_GRS_LLGR) &&
!(active && rem->llgr_able && (rem->gr_af_flags & BGP_LLGRF_FORWARDING)))
bgp_graceful_restart_done(c);
/* GR capability implies that neighbor will send End-of-RIB */
......@@ -669,12 +682,25 @@ bgp_handle_graceful_restart(struct bgp_proto *p)
if (c->gr_ready)
{
if (c->gr_active)
p->gr_active_num++;
switch (c->gr_active)
{
case BGP_GRS_NONE:
c->gr_active = BGP_GRS_ACTIVE;
rt_refresh_begin(c->c.table, &c->c);
break;
case BGP_GRS_ACTIVE:
rt_refresh_end(c->c.table, &c->c);
rt_refresh_begin(c->c.table, &c->c);
break;
c->gr_active = 1;
p->gr_active_num++;
case BGP_GRS_LLGR:
rt_refresh_begin(c->c.table, &c->c);
rt_modify_stale(c->c.table, &c->c);
break;
}
}
else
{
......@@ -695,7 +721,7 @@ bgp_handle_graceful_restart(struct bgp_proto *p)
ASSERT(p->gr_active_num > 0);
proto_notify_state(&p->p, PS_START);
bgp_start_timer(p->gr_timer, p->conn->remote_caps->gr_time);
tm_start(p->gr_timer, p->conn->remote_caps->gr_time S);
}
/**
......@@ -720,6 +746,7 @@ bgp_graceful_restart_done(struct bgp_channel *c)
if (!p->gr_active_num)
BGP_TRACE(D_EVENTS, "Neighbor graceful restart done");
tm_stop(c->stale_timer);
rt_refresh_end(c->c.table, &c->c);
}
......@@ -738,9 +765,48 @@ bgp_graceful_restart_timeout(timer *t)
struct bgp_proto *p = t->data;
BGP_TRACE(D_EVENTS, "Neighbor graceful restart timeout");
if (p->llgr_ready)
{
struct bgp_channel *c;
WALK_LIST(c, p->p.channels)
{
/* Channel is not in GR and is already flushed */
if (!c->gr_active)
continue;
/* Channel is already in LLGR from past restart */
if (c->gr_active == BGP_GRS_LLGR)
continue;
/* Channel is in GR, but does not support LLGR -> stop GR */
if (!c->stale_time)
{
bgp_graceful_restart_done(c);
continue;
}
/* Channel is in GR, and supports LLGR -> start LLGR */
c->gr_active = BGP_GRS_LLGR;
tm_start(c->stale_timer, c->stale_time S);
rt_modify_stale(c->c.table, &c->c);
}
}
else
bgp_stop(p, 0, NULL, 0);
}
/*
 * Timer hook of the per-channel long-lived stale timer (c->stale_timer).
 *
 * @t: expired timer; its data pointer is the BGP channel the timer belongs to
 *
 * Logs the event and finishes graceful restart on that channel by calling
 * bgp_graceful_restart_done().
 */
static void
bgp_long_lived_stale_timeout(timer *t)
{
struct bgp_channel *c = t->data;
/* NOTE(review): p is not referenced explicitly below; presumably the
   BGP_TRACE macro picks it up by name -- confirm before renaming/removing */
struct bgp_proto *p = (void *) c->c.proto;
BGP_TRACE(D_EVENTS, "Long-lived stale timeout");
bgp_graceful_restart_done(c);
}
/**
* bgp_refresh_begin - start incoming enhanced route refresh sequence
......@@ -873,6 +939,12 @@ bgp_hold_timeout(timer *t)
if (sk_rx_ready(conn->sk) > 0)
bgp_start_timer(conn->hold_timer, 10);
else if ((conn->state == BS_ESTABLISHED) && p->llgr_ready)
{
BGP_TRACE(D_EVENTS, "Hold timer expired");
bgp_handle_graceful_restart(p);
bgp_conn_enter_idle_state(conn);
}
else
bgp_error(conn, 4, 0, NULL, 0);
}
......@@ -1172,10 +1244,27 @@ bgp_bfd_notify(struct bfd_request *req)
{
BGP_TRACE(D_EVENTS, "BFD session down");
bgp_store_error(p, NULL, BE_MISC, BEM_BFD_DOWN);
if (p->cf->bfd == BGP_BFD_GRACEFUL)
{
/* Trigger graceful restart */
if (p->conn && (p->conn->state == BS_ESTABLISHED) && p->gr_ready)
bgp_handle_graceful_restart(p);
if (p->incoming_conn.state > BS_IDLE)
bgp_conn_enter_idle_state(&p->incoming_conn);
if (p->outgoing_conn.state > BS_IDLE)
bgp_conn_enter_idle_state(&p->outgoing_conn);
}
else
{
/* Trigger session down */
if (ps == PS_UP)
bgp_update_startup_delay(p);
bgp_stop(p, 0, NULL, 0);
}
}
}
static void
......@@ -1447,6 +1536,7 @@ bgp_init(struct proto_config *CF)
P->rte_better = bgp_rte_better;
P->rte_mergable = bgp_rte_mergable;
P->rte_recalculate = cf->deterministic_med ? bgp_rte_recalculate : NULL;
P->rte_modify = bgp_rte_modify_stale;
p->cf = cf;
p->local_as = cf->local_as;
......@@ -1503,6 +1593,8 @@ bgp_channel_start(struct channel *C)
bgp_init_bucket_table(c);
bgp_init_prefix_table(c);
c->stale_timer = tm_new_init(c->pool, bgp_long_lived_stale_timeout, c, 0, 0);
c->next_hop_addr = c->cf->next_hop_addr;
c->link_addr = IPA_NONE;
c->packets_to_send = 0;
......@@ -1634,6 +1726,10 @@ bgp_postconfig(struct proto_config *CF)
if (cf->multihop < 0)
cf->multihop = internal ? 64 : 0;
/* LLGR mode default based on GR mode */
if (cf->llgr_mode < 0)
cf->llgr_mode = cf->gr_mode ? BGP_LLGR_AWARE : 0;
/* Link check for single-hop BGP by default */
if (cf->check_link < 0)
cf->check_link = !cf->multihop;
......@@ -1676,6 +1772,9 @@ bgp_postconfig(struct proto_config *CF)
if (cf->multihop && cf->bfd && ipa_zero(cf->local_ip))
cf_error("Multihop BGP with BFD requires specified local address");
if (!cf->gr_mode && cf->llgr_mode)
cf_error("Long-lived graceful restart requires basic graceful restart");
struct bgp_channel_config *cc;
WALK_LIST(cc, CF->channels)
......@@ -1706,10 +1805,16 @@ bgp_postconfig(struct proto_config *CF)
if (!cc->gw_mode)
cc->gw_mode = cf->multihop ? GW_RECURSIVE : GW_DIRECT;
/* Default based on proto config */
/* Defaults based on proto config */
if (cc->gr_able == 0xff)
cc->gr_able = (cf->gr_mode == BGP_GR_ABLE);
if (cc->llgr_able == 0xff)
cc->llgr_able = (cf->llgr_mode == BGP_LLGR_ABLE);
if (cc->llgr_time == ~0U)
cc->llgr_time = cf->llgr_time;
/* Default values of IGP tables */
if ((cc->gw_mode == GW_RECURSIVE) && !cc->desc->no_igp)
{
......@@ -1885,6 +1990,7 @@ static char *bgp_state_names[] = { "Idle", "Connect", "Active", "OpenSent", "Ope
static char *bgp_err_classes[] = { "", "Error: ", "Socket: ", "Received: ", "BGP Error: ", "Automatic shutdown: ", ""};
static char *bgp_misc_errors[] = { "", "Neighbor lost", "Invalid next hop", "Kernel MD5 auth failed", "No listening socket", "Link down", "BFD session down", "Graceful restart"};
static char *bgp_auto_errors[] = { "", "Route limit exceeded"};
static char *bgp_gr_states[] = { "None", "Regular", "Long-lived"};
static const char *
bgp_last_errmsg(struct bgp_proto *p)
......@@ -1963,6 +2069,7 @@ bgp_show_capabilities(struct bgp_proto *p UNUSED, struct bgp_caps *caps)
uint any_gr_able = 0;
uint any_add_path = 0;
uint any_ext_next_hop = 0;
uint any_llgr_able = 0;
u32 *afl1 = alloca(caps->af_count * sizeof(u32));
u32 *afl2 = alloca(caps->af_count * sizeof(u32));
uint afn1, afn2;
......@@ -1973,6 +2080,7 @@ bgp_show_capabilities(struct bgp_proto *p UNUSED, struct bgp_caps *caps)
any_gr_able |= ac->gr_able;
any_add_path |= ac->add_path;
any_ext_next_hop |= ac->ext_next_hop;
any_llgr_able |= ac->llgr_able;
}
if (any_mp_bgp)
......@@ -2052,6 +2160,32 @@ bgp_show_capabilities(struct bgp_proto *p UNUSED, struct bgp_caps *caps)
if (caps->enhanced_refresh)
cli_msg(-1006, " Enhanced refresh");
if (caps->llgr_aware)
cli_msg(-1006, " Long-lived graceful restart");
if (any_llgr_able)
{
u32 stale_time = 0;
afn1 = afn2 = 0;
WALK_AF_CAPS(caps, ac)
{
stale_time = MAX(stale_time, ac->llgr_time);
if (ac->llgr_able && ac->llgr_time)
afl1[afn1++] = ac->afi;
if (ac->llgr_flags & BGP_GRF_FORWARDING)
afl2[afn2++] = ac->afi;
}
/* Continues from llgr_aware */
cli_msg(-1006, " LL stale time: %u", stale_time);
bgp_show_afis(-1006, " AF supported:", afl1, afn1);
bgp_show_afis(-1006, " AF preserved:", afl2, afn2);
}
}
static void
......@@ -2118,6 +2252,12 @@ bgp_show_proto_info(struct proto *P)
{
channel_show_info(&c->c);
if (p->gr_active_num)
cli_msg(-1006, " Neighbor GR: %s", bgp_gr_states[c->gr_active]);
if (tm_active(c->stale_timer))
cli_msg(-1006, " LL stale timer: %t/-", tm_remains(c->stale_timer));
if (c->c.channel_state == CS_UP)
{
if (ipa_zero(c->link_addr))
......
......@@ -107,9 +107,11 @@ struct bgp_config {
int allow_local_as; /* Allow that number of local ASNs in incoming AS_PATHs */
int allow_local_pref; /* Allow LOCAL_PREF in EBGP sessions */
int gr_mode; /* Graceful restart mode (BGP_GR_*) */
int llgr_mode; /* Long-lived graceful restart mode (BGP_LLGR_*) */
int setkey; /* Set MD5 password to system SA/SP database */
/* Times below are in seconds */
unsigned gr_time; /* Graceful restart timeout */
unsigned llgr_time; /* Long-lived graceful restart stale time */
unsigned connect_delay_time; /* Minimum delay between connect attempts */
unsigned connect_retry_time; /* Timeout for connect attempts */
unsigned hold_time, initial_hold_time;
......@@ -138,6 +140,8 @@ struct bgp_channel_config {
u8 gw_mode; /* How we compute route gateway from next_hop attr, see GW_* */
u8 secondary; /* Accept also non-best routes (i.e. RA_ACCEPTED) */
u8 gr_able; /* Allow full graceful restart for the channel */
u8 llgr_able; /* Allow full long-lived GR for the channel */
uint llgr_time; /* Long-lived graceful restart stale time */
u8 ext_next_hop; /* Allow both IPv4 and IPv6 next hops */
u8 add_path; /* Use ADD-PATH extension [RFC 7911] */
......@@ -166,12 +170,26 @@ struct bgp_channel_config {
/* For GR capability per-AF flags */
#define BGP_GRF_FORWARDING 0x80
/* Long-lived graceful restart modes (bgp_config.llgr_mode); 0 means disabled */
#define BGP_LLGR_ABLE 1
#define BGP_LLGR_AWARE 2
/* For LLGR capability per-AF flags */
#define BGP_LLGRF_FORWARDING 0x80
/* Neighbor graceful restart state of a channel (bgp_channel.gr_active) */
#define BGP_GRS_NONE 0 /* No GR */
#define BGP_GRS_ACTIVE 1 /* Graceful restart per RFC 4724 */
#define BGP_GRS_LLGR 2 /* Long-lived GR phase (stale timer active) */
/* Value of bgp_config.bfd set by the 'bfd graceful' option */
#define BGP_BFD_GRACEFUL 2 /* BFD down triggers graceful restart */
/*
 * Per-address-family part of BGP capabilities. Entries of this type form the
 * af_data[] array at the end of struct bgp_caps.
 */
struct bgp_af_caps {
u32 afi; /* Address family this entry describes */
u8 ready; /* Multiprotocol capability, RFC 4760 */
u8 gr_able; /* Graceful restart support, RFC 4724 */
u8 gr_af_flags; /* Graceful restart per-AF flags (BGP_GRF_*) */
u8 llgr_able; /* Long-lived graceful restart support, draft-uttaro-idr-bgp-persistence */
u32 llgr_time; /* Long-lived GR stale time; presumably in seconds like the config value -- confirm */
u8 llgr_flags; /* Long-lived GR per-AF flags; presumably BGP_LLGRF_* -- confirm */
u8 ext_next_hop; /* Extended IPv6 next hop, RFC 5549 */
u8 add_path; /* Multiple paths support, RFC 7911 */
};
......@@ -188,6 +206,8 @@ struct bgp_caps {
u8 gr_flags; /* Graceful restart flags */
u16 gr_time; /* Graceful restart time in seconds */
u8 llgr_aware; /* Long-lived GR capability, RFC draft */
u16 af_count; /* Number of af_data items */
struct bgp_af_caps af_data[0]; /* Per-AF capability data */
......@@ -243,6 +263,7 @@ struct bgp_proto {
u8 route_refresh; /* Route refresh allowed to send [RFC 2918] */
u8 enhanced_refresh; /* Enhanced refresh is negotiated [RFC 7313] */
u8 gr_ready; /* Neighbor could do graceful restart */
u8 llgr_ready; /* Neighbor could do Long-lived GR, implies gr_ready */
u8 gr_active_num; /* Neighbor is doing GR, number of active channels */
u8 channel_count; /* Number of active channels */
u32 *afi_map; /* Map channel index -> AFI */
......@@ -291,10 +312,13 @@ struct bgp_channel {
u32 packets_to_send; /* Bitmap of packet types to be sent */
u8 ext_next_hop; /* Session allows both IPv4 and IPv6 next hops */
u8 gr_ready; /* Neighbor could do GR on this AF */
u8 gr_active; /* Neighbor is doing GR and keeping fwd state */
u8 gr_active; /* Neighbor is doing GR (BGP_GRS_*) */
u8 ext_next_hop; /* Session allows both IPv4 and IPv6 next hops */
timer *stale_timer; /* Long-lived stale timer for LLGR */
u32 stale_time; /* Stored LLGR stale time from last session */
u8 add_path_rx; /* Session expects receive of ADD-PATH extended NLRI */
u8 add_path_tx; /* Session expects transmit of ADD-PATH extended NLRI */
......@@ -505,6 +529,7 @@ void bgp_free_prefix(struct bgp_channel *c, struct bgp_prefix *bp);
int bgp_rte_better(struct rte *, struct rte *);
int bgp_rte_mergable(rte *pri, rte *sec);
int bgp_rte_recalculate(rtable *table, net *net, rte *new, rte *old, rte *old_best);
struct rte *bgp_rte_modify_stale(struct rte *r, struct linpool *pool);
void bgp_rt_notify(struct proto *P, struct channel *C, net *n, rte *new, rte *old);
int bgp_import_control(struct proto *, struct rte **, struct linpool *);
int bgp_get_attr(struct eattr *e, byte *buf, int buflen);
......@@ -645,6 +670,9 @@ void bgp_update_next_hop(struct bgp_export_state *s, eattr *a, ea_list **to);
#define BGP_COMM_NO_ADVERTISE 0xffffff02 /* Don't export at all */
#define BGP_COMM_NO_EXPORT_SUBCONFED 0xffffff03 /* NO_EXPORT even in local confederation */
#define BGP_COMM_LLGR_STALE 0xffff0006 /* Route is stale according to LLGR */
#define BGP_COMM_NO_LLGR 0xffff0007 /* Do not treat the route according to LLGR */
/* Origins */
#define ORIGIN_IGP 0
......
......@@ -28,7 +28,8 @@ CF_KEYWORDS(BGP, LOCAL, NEIGHBOR, AS, HOLD, TIME, CONNECT, RETRY, KEEPALIVE,
BGP_CLUSTER_LIST, IGP, TABLE, GATEWAY, DIRECT, RECURSIVE, MED, TTL,
SECURITY, DETERMINISTIC, SECONDARY, ALLOW, BFD, ADD, PATHS, RX, TX,
GRACEFUL, RESTART, AWARE, CHECK, LINK, PORT, EXTENDED, MESSAGES, SETKEY,
STRICT, BIND, CONFEDERATION, MEMBER, MULTICAST, FLOW4, FLOW6)
STRICT, BIND, CONFEDERATION, MEMBER, MULTICAST, FLOW4, FLOW6, LONG,
LIVED, STALE)
%type <i32> bgp_afi
......@@ -63,6 +64,8 @@ bgp_proto_start: proto_start BGP {
BGP_CFG->default_local_pref = 100;
BGP_CFG->gr_mode = BGP_GR_AWARE;
BGP_CFG->gr_time = 120;
BGP_CFG->llgr_mode = -1;
BGP_CFG->llgr_time = 3600;
BGP_CFG->setkey = 1;
BGP_CFG->check_link = -1;
}
......@@ -161,9 +164,13 @@ bgp_proto:
| bgp_proto GRACEFUL RESTART bool ';' { BGP_CFG->gr_mode = $4; }
| bgp_proto GRACEFUL RESTART AWARE ';' { BGP_CFG->gr_mode = BGP_GR_AWARE; }
| bgp_proto GRACEFUL RESTART TIME expr ';' { BGP_CFG->gr_time = $5; }
| bgp_proto LONG LIVED GRACEFUL RESTART bool ';' { BGP_CFG->llgr_mode = $6; }
| bgp_proto LONG LIVED GRACEFUL RESTART AWARE ';' { BGP_CFG->llgr_mode = BGP_LLGR_AWARE; }
| bgp_proto LONG LIVED STALE TIME expr ';' { BGP_CFG->llgr_time = $6; }
| bgp_proto TTL SECURITY bool ';' { BGP_CFG->ttl_security = $4; }
| bgp_proto CHECK LINK bool ';' { BGP_CFG->check_link = $4; }
| bgp_proto BFD bool ';' { BGP_CFG->bfd = $3; cf_check_bfd($3); }
| bgp_proto BFD GRACEFUL ';' { BGP_CFG->bfd = BGP_BFD_GRACEFUL; cf_check_bfd(1); }
;
bgp_afi:
......@@ -199,6 +206,8 @@ bgp_channel_start: bgp_afi
BGP_CC->afi = $1;
BGP_CC->desc = desc;
BGP_CC->gr_able = 0xff; /* undefined */
BGP_CC->llgr_able = 0xff; /* undefined */
BGP_CC->llgr_time = ~0U; /* undefined */
}
};
......@@ -214,6 +223,8 @@ bgp_channel_item:
| GATEWAY RECURSIVE { BGP_CC->gw_mode = GW_RECURSIVE; }
| SECONDARY bool { BGP_CC->secondary = $2; }
| GRACEFUL RESTART bool { BGP_CC->gr_able = $3; }
| LONG LIVED GRACEFUL RESTART bool { BGP_CC->llgr_able = $5; }
| LONG LIVED STALE TIME expr { BGP_CC->llgr_time = $5; }
| EXTENDED NEXT HOP bool { BGP_CC->ext_next_hop = $4; }
| ADD PATHS RX { BGP_CC->add_path = BGP_ADD_PATH_RX; }
| ADD PATHS TX { BGP_CC->add_path = BGP_ADD_PATH_TX; }
......
......@@ -260,6 +260,9 @@ bgp_write_capabilities(struct bgp_conn *conn, byte *buf)
caps->gr_flags = p->p.gr_recovery ? BGP_GRF_RESTART : 0;
}
if (p->cf->llgr_mode)
caps->llgr_aware = 1;
/* Allocate and fill per-AF fields */
WALK_LIST(c, p->p.channels)
{
......@@ -280,6 +283,15 @@ bgp_write_capabilities(struct bgp_conn *conn, byte *buf)
if (p->p.gr_recovery)
ac->gr_af_flags |= BGP_GRF_FORWARDING;
}
if (c->cf->llgr_able)
{