From 681ab6c58a95d5a9599b0a4c5874b51345500421 Mon Sep 17 00:00:00 2001 From: Mubashir Adnan Qureshi Date: Wed, 26 Oct 2022 13:51:11 +0000 Subject: [PATCH 01/61] tcp: add sysctls for TCP PLB parameters PLB (Protective Load Balancing) is a host based mechanism for load balancing across switch links. It leverages congestion signals(e.g. ECN) from transport layer to randomly change the path of the connection experiencing congestion. PLB changes the path of the connection by changing the outgoing IPv6 flow label for IPv6 connections (implemented in Linux by calling sk_rethink_txhash()). Because of this implementation mechanism, PLB can currently only work for IPv6 traffic. For more information, see the SIGCOMM 2022 paper: https://doi.org/10.1145/3544216.3544226 This commit adds new sysctl knobs and sets their default values for TCP PLB. Signed-off-by: Mubashir Adnan Qureshi Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Juhyung Park --- Documentation/networking/ip-sysctl.rst | 75 ++++++++++++++++++++++++++ include/net/netns/ipv4.h | 5 ++ net/ipv4/sysctl_net_ipv4.c | 43 +++++++++++++++ net/ipv4/tcp_ipv4.c | 8 +++ 4 files changed, 131 insertions(+) diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index dffda1277d88b..11237c91434a8 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -922,6 +922,81 @@ tcp_rx_skb_cache - BOOLEAN Default: 0 (disabled) +tcp_plb_enabled - BOOLEAN + If set and the underlying congestion control (e.g. DCTCP) supports + and enables PLB feature, TCP PLB (Protective Load Balancing) is + enabled. PLB is described in the following paper: + https://doi.org/10.1145/3544216.3544226. Based on PLB parameters, + upon sensing sustained congestion, TCP triggers a change in + flow label field for outgoing IPv6 packets. A change in flow label + field potentially changes the path of outgoing packets for switches + that use ECMP/WCMP for routing. + + PLB changes socket txhash which results in a change in IPv6 Flow Label + field, and currently no-op for IPv4 headers. It is possible + to apply PLB for IPv4 with other network header fields (e.g. TCP + or IPv4 options) or using encapsulation where outer header is used + by switches to determine next hop. In either case, further host + and switch side changes will be needed. + + When set, PLB assumes that congestion signal (e.g. ECN) is made + available and used by congestion control module to estimate a + congestion measure (e.g. ce_ratio). PLB needs a congestion measure to + make repathing decisions. + + Default: FALSE + +tcp_plb_idle_rehash_rounds - INTEGER + Number of consecutive congested rounds (RTT) seen after which + a rehash can be performed, given there are no packets in flight. + This is referred to as M in PLB paper: + https://doi.org/10.1145/3544216.3544226. + + Possible Values: 0 - 31 + + Default: 3 + +tcp_plb_rehash_rounds - INTEGER + Number of consecutive congested rounds (RTT) seen after which + a forced rehash can be performed. Be careful when setting this + parameter, as a small value increases the risk of retransmissions. + This is referred to as N in PLB paper: + https://doi.org/10.1145/3544216.3544226. + + Possible Values: 0 - 31 + + Default: 12 + +tcp_plb_suspend_rto_sec - INTEGER + Time, in seconds, to suspend PLB in event of an RTO. 
In order to avoid + having PLB repath onto a connectivity "black hole", after an RTO a TCP + connection suspends PLB repathing for a random duration between 1x and + 2x of this parameter. Randomness is added to avoid concurrent rehashing + of multiple TCP connections. This should be set corresponding to the + amount of time it takes to repair a failed link. + + Possible Values: 0 - 255 + + Default: 60 + +tcp_plb_cong_thresh - INTEGER + Fraction of packets marked with congestion over a round (RTT) to + tag that round as congested. This is referred to as K in the PLB paper: + https://doi.org/10.1145/3544216.3544226. + + The 0-1 fraction range is mapped to 0-256 range to avoid floating + point operations. For example, 128 means that if at least 50% of + the packets in a round were marked as congested then the round + will be tagged as congested. + + Setting threshold to 0 means that PLB repaths every RTT regardless + of congestion. This is not intended behavior for PLB and should be + used only for experimentation purpose. + + Possible Values: 0 - 256 + + Default: 128 + UDP variables ============= diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index fb94551d03e6e..80c23ff9745b0 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -185,6 +185,11 @@ struct netns_ipv4 { atomic_t tfo_active_disable_times; unsigned long tfo_active_disable_stamp; int sysctl_tcp_reflect_tos; + u8 sysctl_tcp_plb_enabled; + u8 sysctl_tcp_plb_idle_rehash_rounds; + u8 sysctl_tcp_plb_rehash_rounds; + u8 sysctl_tcp_plb_suspend_rto_sec; + int sysctl_tcp_plb_cong_thresh; int sysctl_udp_wmem_min; int sysctl_udp_rmem_min; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 3e3542bb8be60..de268ebb49866 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -49,6 +49,8 @@ static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; static int comp_sack_nr_max = 255; static u32 u32_max_div_HZ = UINT_MAX / HZ; static int one_day_secs = 24 * 3600; +static int tcp_plb_max_rounds = 31; +static int tcp_plb_max_cong_thresh = 256; /* obsolete */ static int sysctl_tcp_low_latency __read_mostly; @@ -1304,6 +1306,47 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE }, + { + .procname = "tcp_plb_enabled", + .data = &init_net.ipv4.sysctl_tcp_plb_enabled, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "tcp_plb_idle_rehash_rounds", + .data = &init_net.ipv4.sysctl_tcp_plb_idle_rehash_rounds, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra2 = &tcp_plb_max_rounds, + }, + { + .procname = "tcp_plb_rehash_rounds", + .data = &init_net.ipv4.sysctl_tcp_plb_rehash_rounds, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra2 = &tcp_plb_max_rounds, + }, + { + .procname = "tcp_plb_suspend_rto_sec", + .data = &init_net.ipv4.sysctl_tcp_plb_suspend_rto_sec, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + }, + { + .procname = "tcp_plb_cong_thresh", + .data = &init_net.ipv4.sysctl_tcp_plb_cong_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &tcp_plb_max_cong_thresh, + }, { } }; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index e02ebee07d882..13b5cb4d7e56a 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2957,6 
+2957,14 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0; atomic_set(&net->ipv4.tfo_active_disable_times, 0); + /* Set default values for PLB */ + net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */ + net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3; + net->ipv4.sysctl_tcp_plb_rehash_rounds = 12; + net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60; + /* Default congestion threshold for PLB to mark a round is 50% */ + net->ipv4.sysctl_tcp_plb_cong_thresh = 128; + /* Reno is always built in */ if (!net_eq(net, &init_net) && bpf_try_module_get(init_net.ipv4.tcp_congestion_control, From 3a67d2ace9d11cd3a47f221ec931a3413d1eab04 Mon Sep 17 00:00:00 2001 From: Mubashir Adnan Qureshi Date: Wed, 26 Oct 2022 13:51:12 +0000 Subject: [PATCH 02/61] tcp: add PLB functionality for TCP Congestion control algorithms track PLB state and cause the connection to trigger a path change when either of the 2 conditions is satisfied: - No packets are in flight and (# consecutive congested rounds >= sysctl_tcp_plb_idle_rehash_rounds) - (# consecutive congested rounds >= sysctl_tcp_plb_rehash_rounds) A round (RTT) is marked as congested when congestion signal (ECN ce_ratio) over an RTT is greater than sysctl_tcp_plb_cong_thresh. In the event of RTO, PLB (via tcp_write_timeout()) triggers a path change and disables congestion-triggered path changes for random time between (sysctl_tcp_plb_suspend_rto_sec, 2*sysctl_tcp_plb_suspend_rto_sec) to avoid hopping onto the "connectivity blackhole". RTO-triggered path changes can still happen during this cool-off period. Signed-off-by: Mubashir Adnan Qureshi Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Juhyung Park --- include/net/tcp.h | 28 ++++++++++++ net/ipv4/Makefile | 2 +- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/tcp_plb.c | 107 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 137 insertions(+), 2 deletions(-) create mode 100644 net/ipv4/tcp_plb.c diff --git a/include/net/tcp.h b/include/net/tcp.h index 202790ae7cd42..34233952d5bb5 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2112,6 +2112,34 @@ extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, extern void tcp_rack_reo_timeout(struct sock *sk); extern void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs); +/* tcp_plb.c */ + +/* + * Scaling factor for fractions in PLB. For example, tcp_plb_update_state + * expects cong_ratio which represents fraction of traffic that experienced + * congestion over a single RTT. In order to avoid floating point operations, + * this fraction should be mapped to (1 << TCP_PLB_SCALE) and passed in. + */ +#define TCP_PLB_SCALE 8 + +/* State for PLB (Protective Load Balancing) for a single TCP connection. */ +struct tcp_plb_state { + u8 consec_cong_rounds:5, /* consecutive congested rounds */ + unused:3; + u32 pause_until; /* jiffies32 when PLB can resume rerouting */ +}; + +static inline void tcp_plb_init(const struct sock *sk, + struct tcp_plb_state *plb) +{ + plb->consec_cong_rounds = 0; + plb->pause_until = 0; +} +void tcp_plb_update_state(const struct sock *sk, struct tcp_plb_state *plb, + const int cong_ratio); +void tcp_plb_check_rehash(struct sock *sk, struct tcp_plb_state *plb); +void tcp_plb_update_state_upon_rto(struct sock *sk, struct tcp_plb_state *plb); + /* At how many usecs into the future should the RTO fire? 
*/ static inline s64 tcp_rto_delta_us(const struct sock *sk) { diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 5b77a46885b95..f71e03890b01e 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -10,7 +10,7 @@ obj-y := route.o inetpeer.o protocol.o \ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \ tcp_rate.o tcp_recovery.o tcp_ulp.o \ - tcp_offload.o datagram.o raw.o udp.o udplite.o \ + tcp_offload.o tcp_plb.o datagram.o raw.o udp.o udplite.o \ udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \ fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \ inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \ diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 13b5cb4d7e56a..6575afb6de889 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2963,7 +2963,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_plb_rehash_rounds = 12; net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60; /* Default congestion threshold for PLB to mark a round is 50% */ - net->ipv4.sysctl_tcp_plb_cong_thresh = 128; + net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2; /* Reno is always built in */ if (!net_eq(net, &init_net) && diff --git a/net/ipv4/tcp_plb.c b/net/ipv4/tcp_plb.c new file mode 100644 index 0000000000000..f4ced370acad3 --- /dev/null +++ b/net/ipv4/tcp_plb.c @@ -0,0 +1,107 @@ +/* Protective Load Balancing (PLB) + * + * PLB was designed to reduce link load imbalance across datacenter + * switches. PLB is a host-based optimization; it leverages congestion + * signals from the transport layer to randomly change the path of the + * connection experiencing sustained congestion. PLB prefers to repath + * after idle periods to minimize packet reordering. It repaths by + * changing the IPv6 Flow Label on the packets of a connection, which + * datacenter switches include as part of ECMP/WCMP hashing. + * + * PLB is described in detail in: + * + * Mubashir Adnan Qureshi, Yuchung Cheng, Qianwen Yin, Qiaobin Fu, + * Gautam Kumar, Masoud Moshref, Junhua Yan, Van Jacobson, + * David Wetherall,Abdul Kabbani: + * "PLB: Congestion Signals are Simple and Effective for + * Network Load Balancing" + * In ACM SIGCOMM 2022, Amsterdam Netherlands. + * + */ + +#include + +/* Called once per round-trip to update PLB state for a connection. */ +void tcp_plb_update_state(const struct sock *sk, struct tcp_plb_state *plb, + const int cong_ratio) +{ + struct net *net = sock_net(sk); + + if (!READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled)) + return; + + if (cong_ratio >= 0) { + if (cong_ratio < READ_ONCE(net->ipv4.sysctl_tcp_plb_cong_thresh)) + plb->consec_cong_rounds = 0; + else if (plb->consec_cong_rounds < + READ_ONCE(net->ipv4.sysctl_tcp_plb_rehash_rounds)) + plb->consec_cong_rounds++; + } +} +EXPORT_SYMBOL_GPL(tcp_plb_update_state); + +/* Check whether recent congestion has been persistent enough to warrant + * a load balancing decision that switches the connection to another path. + */ +void tcp_plb_check_rehash(struct sock *sk, struct tcp_plb_state *plb) +{ + struct net *net = sock_net(sk); + u32 max_suspend; + bool forced_rehash = false, idle_rehash = false; + + if (!READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled)) + return; + + forced_rehash = plb->consec_cong_rounds >= + READ_ONCE(net->ipv4.sysctl_tcp_plb_rehash_rounds); + /* If sender goes idle then we check whether to rehash. 
*/ + idle_rehash = READ_ONCE(net->ipv4.sysctl_tcp_plb_idle_rehash_rounds) && + !tcp_sk(sk)->packets_out && + plb->consec_cong_rounds >= + READ_ONCE(net->ipv4.sysctl_tcp_plb_idle_rehash_rounds); + + if (!forced_rehash && !idle_rehash) + return; + + /* Note that tcp_jiffies32 can wrap; we detect wraps by checking for + * cases where the max suspension end is before the actual suspension + * end. We clear pause_until to 0 to indicate there is no recent + * RTO event that constrains PLB rehashing. + */ + max_suspend = 2 * READ_ONCE(net->ipv4.sysctl_tcp_plb_suspend_rto_sec) * HZ; + if (plb->pause_until && + (!before(tcp_jiffies32, plb->pause_until) || + before(tcp_jiffies32 + max_suspend, plb->pause_until))) + plb->pause_until = 0; + + if (plb->pause_until) + return; + + sk_rethink_txhash(sk); + plb->consec_cong_rounds = 0; +} +EXPORT_SYMBOL_GPL(tcp_plb_check_rehash); + +/* Upon RTO, disallow load balancing for a while, to avoid having load + * balancing decisions switch traffic to a black-holed path that was + * previously avoided with a sk_rethink_txhash() call at RTO time. + */ +void tcp_plb_update_state_upon_rto(struct sock *sk, struct tcp_plb_state *plb) +{ + struct net *net = sock_net(sk); + u32 pause; + + if (!READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled)) + return; + + pause = READ_ONCE(net->ipv4.sysctl_tcp_plb_suspend_rto_sec) * HZ; + pause += prandom_u32_max(pause); + plb->pause_until = tcp_jiffies32 + pause; + + /* Reset PLB state upon RTO, since an RTO causes a sk_rethink_txhash() call + * that may switch this connection to a path with completely different + * congestion characteristics. + */ + plb->consec_cong_rounds = 0; +} +EXPORT_SYMBOL_GPL(tcp_plb_update_state_upon_rto); From 049b710222bbf9ae7b24a3e472ce848f82f3781e Mon Sep 17 00:00:00 2001 From: Mubashir Adnan Qureshi Date: Wed, 26 Oct 2022 13:51:13 +0000 Subject: [PATCH 03/61] tcp: add support for PLB in DCTCP PLB support is added to TCP DCTCP code. As DCTCP uses ECN as the congestion signal, PLB also uses ECN to make decisions whether to change the path or not upon sustained congestion. Signed-off-by: Mubashir Adnan Qureshi Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Juhyung Park --- net/ipv4/tcp_dctcp.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index be2c97e907ae2..3bde2a20be5b9 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -52,6 +52,7 @@ struct dctcp { u32 next_seq; u32 ce_state; u32 loss_cwnd; + struct tcp_plb_state plb; }; static unsigned int dctcp_shift_g __read_mostly = 4; /* g = 1/2^4 */ @@ -100,6 +101,8 @@ static void dctcp_init(struct sock *sk) ca->ce_state = 0; dctcp_reset(tp, ca); + tcp_plb_init(sk, &ca->plb); + return; } @@ -126,14 +129,28 @@ static void dctcp_update_alpha(struct sock *sk, u32 flags) /* Expired RTT */ if (!before(tp->snd_una, ca->next_seq)) { + u32 delivered = tp->delivered - ca->old_delivered; u32 delivered_ce = tp->delivered_ce - ca->old_delivered_ce; u32 alpha = ca->dctcp_alpha; + u32 ce_ratio = 0; + + if (delivered > 0) { + /* dctcp_alpha keeps EWMA of fraction of ECN marked + * packets. Because of EWMA smoothing, PLB reaction can + * be slow so we use ce_ratio which is an instantaneous + * measure of congestion. ce_ratio is the fraction of + * ECN marked packets in the previous RTT. 
+ */ + if (delivered_ce > 0) + ce_ratio = (delivered_ce << TCP_PLB_SCALE) / delivered; + tcp_plb_update_state(sk, &ca->plb, (int)ce_ratio); + tcp_plb_check_rehash(sk, &ca->plb); + } /* alpha = (1 - g) * alpha + g * F */ alpha -= min_not_zero(alpha, alpha >> dctcp_shift_g); if (delivered_ce) { - u32 delivered = tp->delivered - ca->old_delivered; /* If dctcp_shift_g == 1, a 32bit value would overflow * after 8 M packets. @@ -181,8 +198,12 @@ static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev) dctcp_ece_ack_update(sk, ev, &ca->prior_rcv_nxt, &ca->ce_state); break; case CA_EVENT_LOSS: + tcp_plb_update_state_upon_rto(sk, &ca->plb); dctcp_react_to_loss(sk); break; + case CA_EVENT_TX_START: + tcp_plb_check_rehash(sk, &ca->plb); /* Maybe rehash when inflight is 0 */ + break; default: /* Don't care for the rest. */ break; From 4da6df0686cbf0684abf5bd9211a304b703c1d00 Mon Sep 17 00:00:00 2001 From: Mubashir Adnan Qureshi Date: Wed, 26 Oct 2022 13:51:14 +0000 Subject: [PATCH 04/61] tcp: add u32 counter in tcp_sock and an SNMP counter for PLB A u32 counter is added to tcp_sock for counting the number of PLB triggered rehashes for a TCP connection. An SNMP counter is also added to count overall PLB triggered rehash events for a host. These counters are hooked up to PLB implementation for DCTCP. TCP_NLA_REHASH is added to SCM_TIMESTAMPING_OPT_STATS that reports the rehash attempts triggered due to PLB or timeouts. This gives a historical view of sustained congestion or timeouts experienced by the TCP connection. Signed-off-by: Mubashir Adnan Qureshi Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Juhyung Park --- include/linux/tcp.h | 1 + include/uapi/linux/snmp.h | 1 + include/uapi/linux/tcp.h | 1 + net/ipv4/proc.c | 1 + net/ipv4/tcp.c | 3 +++ net/ipv4/tcp_plb.c | 2 ++ 6 files changed, 9 insertions(+) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 6d00e1b29fc4b..9c67f3e7a28b2 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -395,6 +395,7 @@ struct tcp_sock { u32 probe_seq_start; u32 probe_seq_end; } mtu_probe; + u32 plb_rehash; /* PLB-triggered rehash attempts */ u32 mtu_info; /* We received an ICMP_FRAG_NEEDED / ICMPV6_PKT_TOOBIG * while socket was owned by user. 
*/ diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index f84e7bcad6deb..e78f5c5d32e1f 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -289,6 +289,7 @@ enum LINUX_MIB_TCPDUPLICATEDATAREHASH, /* TCPDuplicateDataRehash */ LINUX_MIB_TCPDSACKRECVSEGS, /* TCPDSACKRecvSegs */ LINUX_MIB_TCPDSACKIGNOREDDUBIOUS, /* TCPDSACKIgnoredDubious */ + LINUX_MIB_TCPPLBREHASH, /* TCPPLBRehash */ __LINUX_MIB_MAX }; diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 62db78b9c1a0a..8460cbe2971f6 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -314,6 +314,7 @@ enum { TCP_NLA_TIMEOUT_REHASH, /* Timeout-triggered rehash attempts */ TCP_NLA_BYTES_NOTSENT, /* Bytes in write queue not yet sent */ TCP_NLA_EDT, /* Earliest departure time (CLOCK_MONOTONIC) */ + TCP_NLA_REHASH, /* PLB and timeout triggered rehash attempts */ }; /* for TCP_MD5SIG socket option */ diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 8d5e1695b9aa8..7ef211eb7068d 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -294,6 +294,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TcpDuplicateDataRehash", LINUX_MIB_TCPDUPLICATEDATAREHASH), SNMP_MIB_ITEM("TCPDSACKRecvSegs", LINUX_MIB_TCPDSACKRECVSEGS), SNMP_MIB_ITEM("TCPDSACKIgnoredDubious", LINUX_MIB_TCPDSACKIGNOREDDUBIOUS), + SNMP_MIB_ITEM("TCPPLBRehash", LINUX_MIB_TCPPLBREHASH), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 23da0b2ad0719..8c47eae7b50b2 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2857,6 +2857,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->sacked_out = 0; tp->tlp_high_seq = 0; tp->last_oow_ack_time = 0; + tp->plb_rehash = 0; /* There's a bubble in the pipe until at least the first ACK. */ tp->app_limited = ~0U; tp->rate_app_limited = 1; @@ -3647,6 +3648,7 @@ static size_t tcp_opt_stats_get_size(void) nla_total_size(sizeof(u16)) + /* TCP_NLA_TIMEOUT_REHASH */ nla_total_size(sizeof(u32)) + /* TCP_NLA_BYTES_NOTSENT */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_EDT */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_REHASH */ 0; } @@ -3707,6 +3709,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk, max_t(int, 0, tp->write_seq - tp->snd_nxt)); nla_put_u64_64bit(stats, TCP_NLA_EDT, orig_skb->skb_mstamp_ns, TCP_NLA_PAD); + nla_put_u32(stats, TCP_NLA_REHASH, tp->plb_rehash + tp->timeout_rehash); return stats; } diff --git a/net/ipv4/tcp_plb.c b/net/ipv4/tcp_plb.c index f4ced370acad3..bb1a08fda113e 100644 --- a/net/ipv4/tcp_plb.c +++ b/net/ipv4/tcp_plb.c @@ -79,6 +79,8 @@ void tcp_plb_check_rehash(struct sock *sk, struct tcp_plb_state *plb) sk_rethink_txhash(sk); plb->consec_cong_rounds = 0; + tcp_sk(sk)->plb_rehash++; + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPLBREHASH); } EXPORT_SYMBOL_GPL(tcp_plb_check_rehash); From ea5a6f029f9d287bc25cf192e7d73d755cb06e7f Mon Sep 17 00:00:00 2001 From: Mubashir Adnan Qureshi Date: Wed, 26 Oct 2022 13:51:15 +0000 Subject: [PATCH 05/61] tcp: add rcv_wnd and plb_rehash to TCP_INFO rcv_wnd can be useful to diagnose TCP performance where receiver window becomes the bottleneck. rehash reports the PLB and timeout triggered rehash attempts by the TCP connection. Signed-off-by: Mubashir Adnan Qureshi Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Reviewed-by: Eric Dumazet Signed-off-by: David S. 
Miller Signed-off-by: Juhyung Park --- include/uapi/linux/tcp.h | 5 +++++ net/ipv4/tcp.c | 2 ++ 2 files changed, 7 insertions(+) diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 8460cbe2971f6..dfeaf70969ae8 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -284,6 +284,11 @@ struct tcp_info { __u32 tcpi_snd_wnd; /* peer's advertised receive window after * scaling (bytes) */ + __u32 tcpi_rcv_wnd; /* local advertised receive window after + * scaling (bytes) + */ + + __u32 tcpi_rehash; /* PLB or timeout triggered rehash attempts */ }; /* netlink attributes types for SCM_TIMESTAMPING_OPT_STATS */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 8c47eae7b50b2..747eb46b43436 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3615,6 +3615,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_reord_seen = tp->reord_seen; info->tcpi_rcv_ooopack = tp->rcv_ooopack; info->tcpi_snd_wnd = tp->snd_wnd; + info->tcpi_rcv_wnd = tp->rcv_wnd; + info->tcpi_rehash = tp->plb_rehash + tp->timeout_rehash; info->tcpi_fastopen_client_fail = tp->fastopen_client_fail; unlock_sock_fast(sk, slow); } From 04544fad12e32337e848f520edcba4e48196c6fc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 5 Apr 2022 16:35:38 -0700 Subject: [PATCH 06/61] tcp: add accessors to read/set tp->snd_cwnd We had various bugs over the years with code breaking the assumption that tp->snd_cwnd is greater than zero. Lately, syzbot reported the WARN_ON_ONCE(!tp->prior_cwnd) added in commit 8b8a321ff72c ("tcp: fix zero cwnd in tcp_cwnd_reduction") can trigger, and without a repro we would have to spend considerable time finding the bug. Instead of complaining too late, we want to catch where and when tp->snd_cwnd is set to an illegal value. 
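
A minimal userspace sketch of the accessor pattern described above (illustrative only: the stub struct and the fprintf() report are stand-ins for the kernel's struct tcp_sock and WARN_ON_ONCE(); only the helper names mirror the patch below):

/*
 * Illustrative stand-in for the kernel accessors this patch introduces.
 * Reads go through tcp_snd_cwnd(), writes go through tcp_snd_cwnd_set(),
 * so an illegal (non-positive) cwnd is flagged at the write site instead
 * of surfacing much later as a hard-to-debug warning elsewhere.
 */
#include <stdio.h>
#include <stdint.h>

struct tcp_sock_stub {              /* hypothetical stand-in for struct tcp_sock */
	uint32_t snd_cwnd;          /* congestion window, in packets */
};

static inline uint32_t tcp_snd_cwnd(const struct tcp_sock_stub *tp)
{
	return tp->snd_cwnd;
}

static inline void tcp_snd_cwnd_set(struct tcp_sock_stub *tp, uint32_t val)
{
	/* The kernel helper uses WARN_ON_ONCE((int)val <= 0) here. */
	if ((int32_t)val <= 0)
		fprintf(stderr, "illegal snd_cwnd write: %u\n", val);
	tp->snd_cwnd = val;
}

int main(void)
{
	struct tcp_sock_stub tp = { .snd_cwnd = 10 };

	tcp_snd_cwnd_set(&tp, tcp_snd_cwnd(&tp) + 1);  /* ok: cwnd becomes 11 */
	tcp_snd_cwnd_set(&tp, 0);                      /* flagged at the write site */
	printf("cwnd = %u\n", tcp_snd_cwnd(&tp));
	return 0;
}

With this pattern in place, the conversion in the diff below is mechanical: every direct read of tp->snd_cwnd becomes tcp_snd_cwnd(tp) and every direct assignment becomes tcp_snd_cwnd_set(tp, val).
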
Signed-off-by: Eric Dumazet Suggested-by: Yuchung Cheng Cc: Neal Cardwell Acked-by: Yuchung Cheng Link: https://lore.kernel.org/r/20220405233538.947344-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Juhyung Park --- include/net/tcp.h | 19 +++++++++++++++---- include/trace/events/tcp.h | 2 +- net/core/filter.c | 2 +- net/ipv4/tcp.c | 8 ++++---- net/ipv4/tcp_bbr.c | 20 ++++++++++---------- net/ipv4/tcp_bic.c | 14 +++++++------- net/ipv4/tcp_cdg.c | 30 +++++++++++++++--------------- net/ipv4/tcp_cong.c | 18 +++++++++--------- net/ipv4/tcp_cubic.c | 22 +++++++++++----------- net/ipv4/tcp_dctcp.c | 11 ++++++----- net/ipv4/tcp_highspeed.c | 18 +++++++++--------- net/ipv4/tcp_htcp.c | 10 +++++----- net/ipv4/tcp_hybla.c | 18 +++++++++--------- net/ipv4/tcp_illinois.c | 12 +++++++----- net/ipv4/tcp_input.c | 34 +++++++++++++++++----------------- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/tcp_lp.c | 6 +++--- net/ipv4/tcp_metrics.c | 12 ++++++------ net/ipv4/tcp_nv.c | 24 ++++++++++++------------ net/ipv4/tcp_output.c | 30 +++++++++++++++--------------- net/ipv4/tcp_rate.c | 2 +- net/ipv4/tcp_scalable.c | 4 ++-- net/ipv4/tcp_vegas.c | 21 +++++++++++---------- net/ipv4/tcp_veno.c | 24 ++++++++++++------------ net/ipv4/tcp_westwood.c | 3 ++- net/ipv4/tcp_yeah.c | 30 +++++++++++++++--------------- net/ipv6/tcp_ipv6.c | 2 +- 27 files changed, 207 insertions(+), 191 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 34233952d5bb5..59f85d9cb6909 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1212,9 +1212,20 @@ static inline unsigned int tcp_packets_in_flight(const struct tcp_sock *tp) #define TCP_INFINITE_SSTHRESH 0x7fffffff +static inline u32 tcp_snd_cwnd(const struct tcp_sock *tp) +{ + return tp->snd_cwnd; +} + +static inline void tcp_snd_cwnd_set(struct tcp_sock *tp, u32 val) +{ + WARN_ON_ONCE((int)val <= 0); + tp->snd_cwnd = val; +} + static inline bool tcp_in_slow_start(const struct tcp_sock *tp) { - return tp->snd_cwnd < tp->snd_ssthresh; + return tcp_snd_cwnd(tp) < tp->snd_ssthresh; } static inline bool tcp_in_initial_slowstart(const struct tcp_sock *tp) @@ -1240,8 +1251,8 @@ static inline __u32 tcp_current_ssthresh(const struct sock *sk) return tp->snd_ssthresh; else return max(tp->snd_ssthresh, - ((tp->snd_cwnd >> 1) + - (tp->snd_cwnd >> 2))); + ((tcp_snd_cwnd(tp) >> 1) + + (tcp_snd_cwnd(tp) >> 2))); } /* Use define here intentionally to get WARN_ON location shown at the caller */ @@ -1286,7 +1297,7 @@ static inline bool tcp_is_cwnd_limited(const struct sock *sk) /* If in slow start, ensure cwnd grows to twice what was ACKed. 
*/ if (tcp_in_slow_start(tp)) - return tp->snd_cwnd < 2 * tp->max_packets_out; + return tcp_snd_cwnd(tp) < 2 * tp->max_packets_out; return false; } diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index cf97f6339acba..56e8569e0cc2a 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -268,7 +268,7 @@ TRACE_EVENT(tcp_probe, __entry->data_len = skb->len - __tcp_hdrlen(th); __entry->snd_nxt = tp->snd_nxt; __entry->snd_una = tp->snd_una; - __entry->snd_cwnd = tp->snd_cwnd; + __entry->snd_cwnd = tcp_snd_cwnd(tp); __entry->snd_wnd = tp->snd_wnd; __entry->rcv_wnd = tp->rcv_wnd; __entry->ssthresh = tcp_current_ssthresh(sk); diff --git a/net/core/filter.c b/net/core/filter.c index 06eef8aeafb99..7e5856edcca4b 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4869,7 +4869,7 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, if (val <= 0 || tp->data_segs_out > tp->syn_data) ret = -EINVAL; else - tp->snd_cwnd = val; + tcp_snd_cwnd_set(tp, val); break; case TCP_BPF_SNDCWND_CLAMP: if (val <= 0) { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 747eb46b43436..fa0e18b9c1b12 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -430,7 +430,7 @@ void tcp_init_sock(struct sock *sk) * algorithms that we must have the following bandaid to talk * efficiently to them. -DaveM */ - tp->snd_cwnd = TCP_INIT_CWND; + tcp_snd_cwnd_set(tp, TCP_INIT_CWND); /* There's a bubble in the pipe until at least the first ACK. */ tp->app_limited = ~0U; @@ -2816,7 +2816,7 @@ int tcp_disconnect(struct sock *sk, int flags) icsk->icsk_rto_min = TCP_RTO_MIN; icsk->icsk_delack_max = TCP_DELACK_MAX; tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; - tp->snd_cwnd = TCP_INIT_CWND; + tcp_snd_cwnd_set(tp, TCP_INIT_CWND); tp->snd_cwnd_cnt = 0; tp->is_cwnd_limited = 0; tp->max_packets_out = 0; @@ -3527,7 +3527,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_max_pacing_rate = rate64; info->tcpi_reordering = tp->reordering; - info->tcpi_snd_cwnd = tp->snd_cwnd; + info->tcpi_snd_cwnd = tcp_snd_cwnd(tp); if (info->tcpi_state == TCP_LISTEN) { /* listeners aliased fields : @@ -3686,7 +3686,7 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk, rate64 = tcp_compute_delivery_rate(tp); nla_put_u64_64bit(stats, TCP_NLA_DELIVERY_RATE, rate64, TCP_NLA_PAD); - nla_put_u32(stats, TCP_NLA_SND_CWND, tp->snd_cwnd); + nla_put_u32(stats, TCP_NLA_SND_CWND, tcp_snd_cwnd(tp)); nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering); nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp)); diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 6274462b86b4b..c5ee83654db1c 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -274,7 +274,7 @@ static void bbr_init_pacing_rate_from_rtt(struct sock *sk) } else { /* no RTT sample yet */ rtt_us = USEC_PER_MSEC; /* use nominal default RTT */ } - bw = (u64)tp->snd_cwnd * BW_UNIT; + bw = (u64)tcp_snd_cwnd(tp) * BW_UNIT; do_div(bw, rtt_us); sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain); } @@ -321,9 +321,9 @@ static void bbr_save_cwnd(struct sock *sk) struct bbr *bbr = inet_csk_ca(sk); if (bbr->prev_ca_state < TCP_CA_Recovery && bbr->mode != BBR_PROBE_RTT) - bbr->prior_cwnd = tp->snd_cwnd; /* this cwnd is good enough */ + bbr->prior_cwnd = tcp_snd_cwnd(tp); /* this cwnd is good enough */ else /* loss recovery or BBR_PROBE_RTT have temporarily cut cwnd */ - bbr->prior_cwnd = max(bbr->prior_cwnd, tp->snd_cwnd); + bbr->prior_cwnd = max(bbr->prior_cwnd, tcp_snd_cwnd(tp)); } static void 
bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) @@ -480,7 +480,7 @@ static bool bbr_set_cwnd_to_recover_or_restore( struct tcp_sock *tp = tcp_sk(sk); struct bbr *bbr = inet_csk_ca(sk); u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state; - u32 cwnd = tp->snd_cwnd; + u32 cwnd = tcp_snd_cwnd(tp); /* An ACK for P pkts should release at most 2*P packets. We do this * in two steps. First, here we deduct the number of lost packets. @@ -518,7 +518,7 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, { struct tcp_sock *tp = tcp_sk(sk); struct bbr *bbr = inet_csk_ca(sk); - u32 cwnd = tp->snd_cwnd, target_cwnd = 0; + u32 cwnd = tcp_snd_cwnd(tp), target_cwnd = 0; if (!acked) goto done; /* no packet fully ACKed; just apply caps */ @@ -542,9 +542,9 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, cwnd = max(cwnd, bbr_cwnd_min_target); done: - tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); /* apply global cap */ + tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* apply global cap */ if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */ - tp->snd_cwnd = min(tp->snd_cwnd, bbr_cwnd_min_target); + tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), bbr_cwnd_min_target)); } /* End cycle phase if it's time and/or we hit the phase's in-flight target. */ @@ -854,7 +854,7 @@ static void bbr_update_ack_aggregation(struct sock *sk, bbr->ack_epoch_acked = min_t(u32, 0xFFFFF, bbr->ack_epoch_acked + rs->acked_sacked); extra_acked = bbr->ack_epoch_acked - expected_acked; - extra_acked = min(extra_acked, tp->snd_cwnd); + extra_acked = min(extra_acked, tcp_snd_cwnd(tp)); if (extra_acked > bbr->extra_acked[bbr->extra_acked_win_idx]) bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked; } @@ -912,7 +912,7 @@ static void bbr_check_probe_rtt_done(struct sock *sk) return; bbr->min_rtt_stamp = tcp_jiffies32; /* wait a while until PROBE_RTT */ - tp->snd_cwnd = max(tp->snd_cwnd, bbr->prior_cwnd); + tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd)); bbr_reset_mode(sk); } @@ -1091,7 +1091,7 @@ static u32 bbr_undo_cwnd(struct sock *sk) bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */ bbr->full_bw_cnt = 0; bbr_reset_lt_bw_sampling(sk); - return tcp_sk(sk)->snd_cwnd; + return tcp_snd_cwnd(tcp_sk(sk)); } /* Entering loss recovery, so save cwnd for when we exit or undo recovery. 
*/ diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index f5f588b1f6e9d..58358bf92e1b8 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -150,7 +150,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) if (!acked) return; } - bictcp_update(ca, tp->snd_cwnd); + bictcp_update(ca, tcp_snd_cwnd(tp)); tcp_cong_avoid_ai(tp, ca->cnt, acked); } @@ -166,16 +166,16 @@ static u32 bictcp_recalc_ssthresh(struct sock *sk) ca->epoch_start = 0; /* end of epoch */ /* Wmax and fast convergence */ - if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence) - ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta)) + if (tcp_snd_cwnd(tp) < ca->last_max_cwnd && fast_convergence) + ca->last_max_cwnd = (tcp_snd_cwnd(tp) * (BICTCP_BETA_SCALE + beta)) / (2 * BICTCP_BETA_SCALE); else - ca->last_max_cwnd = tp->snd_cwnd; + ca->last_max_cwnd = tcp_snd_cwnd(tp); - if (tp->snd_cwnd <= low_window) - return max(tp->snd_cwnd >> 1U, 2U); + if (tcp_snd_cwnd(tp) <= low_window) + return max(tcp_snd_cwnd(tp) >> 1U, 2U); else - return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U); + return max((tcp_snd_cwnd(tp) * beta) / BICTCP_BETA_SCALE, 2U); } static void bictcp_state(struct sock *sk, u8 new_state) diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c index 56dede4b59d95..112f28f936934 100644 --- a/net/ipv4/tcp_cdg.c +++ b/net/ipv4/tcp_cdg.c @@ -161,8 +161,8 @@ static void tcp_cdg_hystart_update(struct sock *sk) LINUX_MIB_TCPHYSTARTTRAINDETECT); NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPHYSTARTTRAINCWND, - tp->snd_cwnd); - tp->snd_ssthresh = tp->snd_cwnd; + tcp_snd_cwnd(tp)); + tp->snd_ssthresh = tcp_snd_cwnd(tp); return; } } @@ -180,8 +180,8 @@ static void tcp_cdg_hystart_update(struct sock *sk) LINUX_MIB_TCPHYSTARTDELAYDETECT); NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPHYSTARTDELAYCWND, - tp->snd_cwnd); - tp->snd_ssthresh = tp->snd_cwnd; + tcp_snd_cwnd(tp)); + tp->snd_ssthresh = tcp_snd_cwnd(tp); } } } @@ -252,7 +252,7 @@ static bool tcp_cdg_backoff(struct sock *sk, u32 grad) return false; } - ca->shadow_wnd = max(ca->shadow_wnd, tp->snd_cwnd); + ca->shadow_wnd = max(ca->shadow_wnd, tcp_snd_cwnd(tp)); ca->state = CDG_BACKOFF; tcp_enter_cwr(sk); return true; @@ -285,14 +285,14 @@ static void tcp_cdg_cong_avoid(struct sock *sk, u32 ack, u32 acked) } if (!tcp_is_cwnd_limited(sk)) { - ca->shadow_wnd = min(ca->shadow_wnd, tp->snd_cwnd); + ca->shadow_wnd = min(ca->shadow_wnd, tcp_snd_cwnd(tp)); return; } - prior_snd_cwnd = tp->snd_cwnd; + prior_snd_cwnd = tcp_snd_cwnd(tp); tcp_reno_cong_avoid(sk, ack, acked); - incr = tp->snd_cwnd - prior_snd_cwnd; + incr = tcp_snd_cwnd(tp) - prior_snd_cwnd; ca->shadow_wnd = max(ca->shadow_wnd, ca->shadow_wnd + incr); } @@ -331,15 +331,15 @@ static u32 tcp_cdg_ssthresh(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); if (ca->state == CDG_BACKOFF) - return max(2U, (tp->snd_cwnd * min(1024U, backoff_beta)) >> 10); + return max(2U, (tcp_snd_cwnd(tp) * min(1024U, backoff_beta)) >> 10); if (ca->state == CDG_NONFULL && use_tolerance) - return tp->snd_cwnd; + return tcp_snd_cwnd(tp); - ca->shadow_wnd = min(ca->shadow_wnd >> 1, tp->snd_cwnd); + ca->shadow_wnd = min(ca->shadow_wnd >> 1, tcp_snd_cwnd(tp)); if (use_shadow) - return max3(2U, ca->shadow_wnd, tp->snd_cwnd >> 1); - return max(2U, tp->snd_cwnd >> 1); + return max3(2U, ca->shadow_wnd, tcp_snd_cwnd(tp) >> 1); + return max(2U, tcp_snd_cwnd(tp) >> 1); } static void tcp_cdg_cwnd_event(struct sock *sk, const enum tcp_ca_event ev) @@ -357,7 +357,7 @@ static void tcp_cdg_cwnd_event(struct sock *sk, const enum 
tcp_ca_event ev) ca->gradients = gradients; ca->rtt_seq = tp->snd_nxt; - ca->shadow_wnd = tp->snd_cwnd; + ca->shadow_wnd = tcp_snd_cwnd(tp); break; case CA_EVENT_COMPLETE_CWR: ca->state = CDG_UNKNOWN; @@ -381,7 +381,7 @@ static void tcp_cdg_init(struct sock *sk) ca->gradients = kcalloc(window, sizeof(ca->gradients[0]), GFP_NOWAIT | __GFP_NOWARN); ca->rtt_seq = tp->snd_nxt; - ca->shadow_wnd = tp->snd_cwnd; + ca->shadow_wnd = tcp_snd_cwnd(tp); } static void tcp_cdg_release(struct sock *sk) diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index db5831e6c136a..f43db30a7195d 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -395,10 +395,10 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, */ u32 tcp_slow_start(struct tcp_sock *tp, u32 acked) { - u32 cwnd = min(tp->snd_cwnd + acked, tp->snd_ssthresh); + u32 cwnd = min(tcp_snd_cwnd(tp) + acked, tp->snd_ssthresh); - acked -= cwnd - tp->snd_cwnd; - tp->snd_cwnd = min(cwnd, tp->snd_cwnd_clamp); + acked -= cwnd - tcp_snd_cwnd(tp); + tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); return acked; } @@ -412,7 +412,7 @@ void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked) /* If credits accumulated at a higher w, apply them gently now. */ if (tp->snd_cwnd_cnt >= w) { tp->snd_cwnd_cnt = 0; - tp->snd_cwnd++; + tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1); } tp->snd_cwnd_cnt += acked; @@ -420,9 +420,9 @@ void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked) u32 delta = tp->snd_cwnd_cnt / w; tp->snd_cwnd_cnt -= delta * w; - tp->snd_cwnd += delta; + tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + delta); } - tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_cwnd_clamp); + tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), tp->snd_cwnd_clamp)); } EXPORT_SYMBOL_GPL(tcp_cong_avoid_ai); @@ -447,7 +447,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked) return; } /* In dangerous area, increase slowly. 
*/ - tcp_cong_avoid_ai(tp, tp->snd_cwnd, acked); + tcp_cong_avoid_ai(tp, tcp_snd_cwnd(tp), acked); } EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); @@ -456,7 +456,7 @@ u32 tcp_reno_ssthresh(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); - return max(tp->snd_cwnd >> 1U, 2U); + return max(tcp_snd_cwnd(tp) >> 1U, 2U); } EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); @@ -464,7 +464,7 @@ u32 tcp_reno_undo_cwnd(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); - return max(tp->snd_cwnd, tp->prior_cwnd); + return max(tcp_snd_cwnd(tp), tp->prior_cwnd); } EXPORT_SYMBOL_GPL(tcp_reno_undo_cwnd); diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index fffa011a007d4..64dfcaa412f26 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -341,7 +341,7 @@ static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) if (!acked) return; } - bictcp_update(ca, tp->snd_cwnd, acked); + bictcp_update(ca, tcp_snd_cwnd(tp), acked); tcp_cong_avoid_ai(tp, ca->cnt, acked); } @@ -353,13 +353,13 @@ static u32 bictcp_recalc_ssthresh(struct sock *sk) ca->epoch_start = 0; /* end of epoch */ /* Wmax and fast convergence */ - if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence) - ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta)) + if (tcp_snd_cwnd(tp) < ca->last_max_cwnd && fast_convergence) + ca->last_max_cwnd = (tcp_snd_cwnd(tp) * (BICTCP_BETA_SCALE + beta)) / (2 * BICTCP_BETA_SCALE); else - ca->last_max_cwnd = tp->snd_cwnd; + ca->last_max_cwnd = tcp_snd_cwnd(tp); - return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U); + return max((tcp_snd_cwnd(tp) * beta) / BICTCP_BETA_SCALE, 2U); } static void bictcp_state(struct sock *sk, u8 new_state) @@ -420,13 +420,13 @@ static void hystart_update(struct sock *sk, u32 delay) ca->found = 1; pr_debug("hystart_ack_train (%u > %u) delay_min %u (+ ack_delay %u) cwnd %u\n", now - ca->round_start, threshold, - ca->delay_min, hystart_ack_delay(sk), tp->snd_cwnd); + ca->delay_min, hystart_ack_delay(sk), tcp_snd_cwnd(tp)); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHYSTARTTRAINDETECT); NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPHYSTARTTRAINCWND, - tp->snd_cwnd); - tp->snd_ssthresh = tp->snd_cwnd; + tcp_snd_cwnd(tp)); + tp->snd_ssthresh = tcp_snd_cwnd(tp); } } } @@ -445,8 +445,8 @@ static void hystart_update(struct sock *sk, u32 delay) LINUX_MIB_TCPHYSTARTDELAYDETECT); NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPHYSTARTDELAYCWND, - tp->snd_cwnd); - tp->snd_ssthresh = tp->snd_cwnd; + tcp_snd_cwnd(tp)); + tp->snd_ssthresh = tcp_snd_cwnd(tp); } } } @@ -476,7 +476,7 @@ static void bictcp_acked(struct sock *sk, const struct ack_sample *sample) /* hystart triggers when cwnd is larger than some threshold */ if (!ca->found && tcp_in_slow_start(tp) && hystart && - tp->snd_cwnd >= hystart_low_window) + tcp_snd_cwnd(tp) >= hystart_low_window) hystart_update(sk, delay); } diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index 3bde2a20be5b9..86d88da1e36d8 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -118,8 +118,8 @@ static u32 dctcp_ssthresh(struct sock *sk) struct dctcp *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); - ca->loss_cwnd = tp->snd_cwnd; - return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->dctcp_alpha) >> 11U), 2U); + ca->loss_cwnd = tcp_snd_cwnd(tp); + return max(tcp_snd_cwnd(tp) - ((tcp_snd_cwnd(tp) * ca->dctcp_alpha) >> 11U), 2U); } static void dctcp_update_alpha(struct sock *sk, u32 flags) @@ -174,8 +174,8 @@ static void dctcp_react_to_loss(struct sock *sk) struct dctcp *ca = inet_csk_ca(sk); struct tcp_sock *tp = 
tcp_sk(sk); - ca->loss_cwnd = tp->snd_cwnd; - tp->snd_ssthresh = max(tp->snd_cwnd >> 1U, 2U); + ca->loss_cwnd = tcp_snd_cwnd(tp); + tp->snd_ssthresh = max(tcp_snd_cwnd(tp) >> 1U, 2U); } static void dctcp_state(struct sock *sk, u8 new_state) @@ -241,8 +241,9 @@ static size_t dctcp_get_info(struct sock *sk, u32 ext, int *attr, static u32 dctcp_cwnd_undo(struct sock *sk) { const struct dctcp *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); - return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd); + return max(tcp_snd_cwnd(tp), ca->loss_cwnd); } static struct tcp_congestion_ops dctcp __read_mostly = { diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index 349069d6cd0aa..c6de5ce79ad3c 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c @@ -127,22 +127,22 @@ static void hstcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) * snd_cwnd <= * hstcp_aimd_vals[ca->ai].cwnd */ - if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) { - while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd && + if (tcp_snd_cwnd(tp) > hstcp_aimd_vals[ca->ai].cwnd) { + while (tcp_snd_cwnd(tp) > hstcp_aimd_vals[ca->ai].cwnd && ca->ai < HSTCP_AIMD_MAX - 1) ca->ai++; - } else if (ca->ai && tp->snd_cwnd <= hstcp_aimd_vals[ca->ai-1].cwnd) { - while (ca->ai && tp->snd_cwnd <= hstcp_aimd_vals[ca->ai-1].cwnd) + } else if (ca->ai && tcp_snd_cwnd(tp) <= hstcp_aimd_vals[ca->ai-1].cwnd) { + while (ca->ai && tcp_snd_cwnd(tp) <= hstcp_aimd_vals[ca->ai-1].cwnd) ca->ai--; } /* Do additive increase */ - if (tp->snd_cwnd < tp->snd_cwnd_clamp) { + if (tcp_snd_cwnd(tp) < tp->snd_cwnd_clamp) { /* cwnd = cwnd + a(w) / cwnd */ tp->snd_cwnd_cnt += ca->ai + 1; - if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { - tp->snd_cwnd_cnt -= tp->snd_cwnd; - tp->snd_cwnd++; + if (tp->snd_cwnd_cnt >= tcp_snd_cwnd(tp)) { + tp->snd_cwnd_cnt -= tcp_snd_cwnd(tp); + tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1); } } } @@ -154,7 +154,7 @@ static u32 hstcp_ssthresh(struct sock *sk) struct hstcp *ca = inet_csk_ca(sk); /* Do multiplicative decrease */ - return max(tp->snd_cwnd - ((tp->snd_cwnd * hstcp_aimd_vals[ca->ai].md) >> 8), 2U); + return max(tcp_snd_cwnd(tp) - ((tcp_snd_cwnd(tp) * hstcp_aimd_vals[ca->ai].md) >> 8), 2U); } static struct tcp_congestion_ops tcp_highspeed __read_mostly = { diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 55adcfcf96fea..52b1f2665dfae 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -124,7 +124,7 @@ static void measure_achieved_throughput(struct sock *sk, ca->packetcount += sample->pkts_acked; - if (ca->packetcount >= tp->snd_cwnd - (ca->alpha >> 7 ? : 1) && + if (ca->packetcount >= tcp_snd_cwnd(tp) - (ca->alpha >> 7 ? : 1) && now - ca->lasttime >= ca->minRTT && ca->minRTT > 0) { __u32 cur_Bi = ca->packetcount * HZ / (now - ca->lasttime); @@ -225,7 +225,7 @@ static u32 htcp_recalc_ssthresh(struct sock *sk) const struct htcp *ca = inet_csk_ca(sk); htcp_param_update(sk); - return max((tp->snd_cwnd * ca->beta) >> 7, 2U); + return max((tcp_snd_cwnd(tp) * ca->beta) >> 7, 2U); } static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) @@ -242,9 +242,9 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 acked) /* In dangerous area, increase slowly. 
* In theory this is tp->snd_cwnd += alpha / tp->snd_cwnd */ - if ((tp->snd_cwnd_cnt * ca->alpha)>>7 >= tp->snd_cwnd) { - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; + if ((tp->snd_cwnd_cnt * ca->alpha)>>7 >= tcp_snd_cwnd(tp)) { + if (tcp_snd_cwnd(tp) < tp->snd_cwnd_clamp) + tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1); tp->snd_cwnd_cnt = 0; htcp_alpha_update(ca); } else diff --git a/net/ipv4/tcp_hybla.c b/net/ipv4/tcp_hybla.c index be39327e04e6c..abd7d91807e54 100644 --- a/net/ipv4/tcp_hybla.c +++ b/net/ipv4/tcp_hybla.c @@ -54,7 +54,7 @@ static void hybla_init(struct sock *sk) ca->rho2_7ls = 0; ca->snd_cwnd_cents = 0; ca->hybla_en = true; - tp->snd_cwnd = 2; + tcp_snd_cwnd_set(tp, 2); tp->snd_cwnd_clamp = 65535; /* 1st Rho measurement based on initial srtt */ @@ -62,7 +62,7 @@ static void hybla_init(struct sock *sk) /* set minimum rtt as this is the 1st ever seen */ ca->minrtt_us = tp->srtt_us; - tp->snd_cwnd = ca->rho; + tcp_snd_cwnd_set(tp, ca->rho); } static void hybla_state(struct sock *sk, u8 ca_state) @@ -137,31 +137,31 @@ static void hybla_cong_avoid(struct sock *sk, u32 ack, u32 acked) * as long as increment is estimated as (rho<<7)/window * it already is <<7 and we can easily count its fractions. */ - increment = ca->rho2_7ls / tp->snd_cwnd; + increment = ca->rho2_7ls / tcp_snd_cwnd(tp); if (increment < 128) tp->snd_cwnd_cnt++; } odd = increment % 128; - tp->snd_cwnd += increment >> 7; + tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + (increment >> 7)); ca->snd_cwnd_cents += odd; /* check when fractions goes >=128 and increase cwnd by 1. */ while (ca->snd_cwnd_cents >= 128) { - tp->snd_cwnd++; + tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1); ca->snd_cwnd_cents -= 128; tp->snd_cwnd_cnt = 0; } /* check when cwnd has not been incremented for a while */ - if (increment == 0 && odd == 0 && tp->snd_cwnd_cnt >= tp->snd_cwnd) { - tp->snd_cwnd++; + if (increment == 0 && odd == 0 && tp->snd_cwnd_cnt >= tcp_snd_cwnd(tp)) { + tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1); tp->snd_cwnd_cnt = 0; } /* clamp down slowstart cwnd to ssthresh value. 
*/ if (is_slowstart) - tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); + tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), tp->snd_ssthresh)); - tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); + tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), tp->snd_cwnd_clamp)); } static struct tcp_congestion_ops tcp_hybla __read_mostly = { diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c index 00e54873213e8..c0c81a2c77fae 100644 --- a/net/ipv4/tcp_illinois.c +++ b/net/ipv4/tcp_illinois.c @@ -224,7 +224,7 @@ static void update_params(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct illinois *ca = inet_csk_ca(sk); - if (tp->snd_cwnd < win_thresh) { + if (tcp_snd_cwnd(tp) < win_thresh) { ca->alpha = ALPHA_BASE; ca->beta = BETA_BASE; } else if (ca->cnt_rtt > 0) { @@ -284,9 +284,9 @@ static void tcp_illinois_cong_avoid(struct sock *sk, u32 ack, u32 acked) * tp->snd_cwnd += alpha/tp->snd_cwnd */ delta = (tp->snd_cwnd_cnt * ca->alpha) >> ALPHA_SHIFT; - if (delta >= tp->snd_cwnd) { - tp->snd_cwnd = min(tp->snd_cwnd + delta / tp->snd_cwnd, - (u32)tp->snd_cwnd_clamp); + if (delta >= tcp_snd_cwnd(tp)) { + tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp) + delta / tcp_snd_cwnd(tp), + (u32)tp->snd_cwnd_clamp)); tp->snd_cwnd_cnt = 0; } } @@ -296,9 +296,11 @@ static u32 tcp_illinois_ssthresh(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); struct illinois *ca = inet_csk_ca(sk); + u32 decr; /* Multiplicative decrease */ - return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->beta) >> BETA_SHIFT), 2U); + decr = (tcp_snd_cwnd(tp) * ca->beta) >> BETA_SHIFT; + return max(tcp_snd_cwnd(tp) - decr, 2U); } /* Extract info for Tcp socket info provided via netlink. */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index be39bdef5cee3..c2b4edb58ae33 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -426,7 +426,7 @@ static void tcp_sndbuf_expand(struct sock *sk) per_mss = roundup_pow_of_two(per_mss) + SKB_DATA_ALIGN(sizeof(struct sk_buff)); - nr_segs = max_t(u32, TCP_INIT_CWND, tp->snd_cwnd); + nr_segs = max_t(u32, TCP_INIT_CWND, tcp_snd_cwnd(tp)); nr_segs = max_t(u32, nr_segs, tp->reordering + 1); /* Fast Recovery (RFC 5681 3.2) : @@ -914,12 +914,12 @@ static void tcp_update_pacing_rate(struct sock *sk) * If snd_cwnd >= (tp->snd_ssthresh / 2), we are approaching * end of slow start and should slow down. 
*/ - if (tp->snd_cwnd < tp->snd_ssthresh / 2) + if (tcp_snd_cwnd(tp) < tp->snd_ssthresh / 2) rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ss_ratio; else rate *= sock_net(sk)->ipv4.sysctl_tcp_pacing_ca_ratio; - rate *= max(tp->snd_cwnd, tp->packets_out); + rate *= max(tcp_snd_cwnd(tp), tp->packets_out); if (likely(tp->srtt_us)) do_div(rate, tp->srtt_us); @@ -2153,12 +2153,12 @@ void tcp_enter_loss(struct sock *sk) !after(tp->high_seq, tp->snd_una) || (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) { tp->prior_ssthresh = tcp_current_ssthresh(sk); - tp->prior_cwnd = tp->snd_cwnd; + tp->prior_cwnd = tcp_snd_cwnd(tp); tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); tcp_ca_event(sk, CA_EVENT_LOSS); tcp_init_undo(tp); } - tp->snd_cwnd = tcp_packets_in_flight(tp) + 1; + tcp_snd_cwnd_set(tp, tcp_packets_in_flight(tp) + 1); tp->snd_cwnd_cnt = 0; tp->snd_cwnd_stamp = tcp_jiffies32; @@ -2493,7 +2493,7 @@ static void DBGUNDO(struct sock *sk, const char *msg) pr_debug("Undo %s %pI4/%u c%u l%u ss%u/%u p%u\n", msg, &inet->inet_daddr, ntohs(inet->inet_dport), - tp->snd_cwnd, tcp_left_out(tp), + tcp_snd_cwnd(tp), tcp_left_out(tp), tp->snd_ssthresh, tp->prior_ssthresh, tp->packets_out); } @@ -2502,7 +2502,7 @@ static void DBGUNDO(struct sock *sk, const char *msg) pr_debug("Undo %s %pI6/%u c%u l%u ss%u/%u p%u\n", msg, &sk->sk_v6_daddr, ntohs(inet->inet_dport), - tp->snd_cwnd, tcp_left_out(tp), + tcp_snd_cwnd(tp), tcp_left_out(tp), tp->snd_ssthresh, tp->prior_ssthresh, tp->packets_out); } @@ -2527,7 +2527,7 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss) if (tp->prior_ssthresh) { const struct inet_connection_sock *icsk = inet_csk(sk); - tp->snd_cwnd = icsk->icsk_ca_ops->undo_cwnd(sk); + tcp_snd_cwnd_set(tp, icsk->icsk_ca_ops->undo_cwnd(sk)); if (tp->prior_ssthresh > tp->snd_ssthresh) { tp->snd_ssthresh = tp->prior_ssthresh; @@ -2645,7 +2645,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk) tp->high_seq = tp->snd_nxt; tp->tlp_high_seq = 0; tp->snd_cwnd_cnt = 0; - tp->prior_cwnd = tp->snd_cwnd; + tp->prior_cwnd = tcp_snd_cwnd(tp); tp->prr_delivered = 0; tp->prr_out = 0; tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk); @@ -2676,7 +2676,7 @@ void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag) } /* Force a fast retransmit upon entering fast recovery */ sndcnt = max(sndcnt, (tp->prr_out ? 
0 : 1)); - tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt; + tcp_snd_cwnd_set(tp, tcp_packets_in_flight(tp) + sndcnt); } static inline void tcp_end_cwnd_reduction(struct sock *sk) @@ -2689,7 +2689,7 @@ static inline void tcp_end_cwnd_reduction(struct sock *sk) /* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */ if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH && (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR || tp->undo_marker)) { - tp->snd_cwnd = tp->snd_ssthresh; + tcp_snd_cwnd_set(tp, tp->snd_ssthresh); tp->snd_cwnd_stamp = tcp_jiffies32; } tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); @@ -2757,10 +2757,10 @@ static void tcp_mtup_probe_success(struct sock *sk) tp->prior_ssthresh = tcp_current_ssthresh(sk); - val = (u64)tp->snd_cwnd * tcp_mss_to_mtu(sk, tp->mss_cache); + val = (u64)tcp_snd_cwnd(tp) * tcp_mss_to_mtu(sk, tp->mss_cache); do_div(val, icsk->icsk_mtup.probe_size); WARN_ON_ONCE((u32)val != val); - tp->snd_cwnd = max_t(u32, 1U, val); + tcp_snd_cwnd_set(tp, max_t(u32, 1U, val)); tp->snd_cwnd_cnt = 0; tp->snd_cwnd_stamp = tcp_jiffies32; @@ -3072,7 +3072,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una, tp->snd_una == tp->mtu_probe.probe_seq_start) { tcp_mtup_probe_failed(sk); /* Restores the reduction we did in tcp_mtup_probe() */ - tp->snd_cwnd++; + tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1); tcp_simple_retransmit(sk); return; } @@ -5458,7 +5458,7 @@ static bool tcp_should_expand_sndbuf(const struct sock *sk) return false; /* If we filled the congestion window, do not expand. */ - if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) + if (tcp_packets_in_flight(tp) >= tcp_snd_cwnd(tp)) return false; return true; @@ -6025,9 +6025,9 @@ void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb) * retransmission has occurred. */ if (tp->total_retrans > 1 && tp->undo_marker) - tp->snd_cwnd = 1; + tcp_snd_cwnd_set(tp, 1); else - tp->snd_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk)); + tcp_snd_cwnd_set(tp, tcp_init_cwnd(tp, __sk_dst_get(sk))); tp->snd_cwnd_stamp = tcp_jiffies32; bpf_skops_established(sk, bpf_op, skb); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 6575afb6de889..37ef27244d496 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2641,7 +2641,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) jiffies_to_clock_t(icsk->icsk_rto), jiffies_to_clock_t(icsk->icsk_ack.ato), (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sk), - tp->snd_cwnd, + tcp_snd_cwnd(tp), state == TCP_LISTEN ? fastopenq->max_qlen : (tcp_in_initial_slowstart(tp) ? 
-1 : tp->snd_ssthresh)); diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c index 8c643a4ffad1f..e33e2ea261697 100644 --- a/net/ipv4/tcp_lp.c +++ b/net/ipv4/tcp_lp.c @@ -290,7 +290,7 @@ static void tcp_lp_pkts_acked(struct sock *sk, const struct ack_sample *sample) lp->flag &= ~LP_WITHIN_THR; pr_debug("TCP-LP: %05o|%5u|%5u|%15u|%15u|%15u\n", lp->flag, - tp->snd_cwnd, lp->remote_hz, lp->owd_min, lp->owd_max, + tcp_snd_cwnd(tp), lp->remote_hz, lp->owd_min, lp->owd_max, lp->sowd >> 3); if (lp->flag & LP_WITHIN_THR) @@ -306,12 +306,12 @@ static void tcp_lp_pkts_acked(struct sock *sk, const struct ack_sample *sample) /* happened within inference * drop snd_cwnd into 1 */ if (lp->flag & LP_WITHIN_INF) - tp->snd_cwnd = 1U; + tcp_snd_cwnd_set(tp, 1U); /* happened after inference * cut snd_cwnd into half */ else - tp->snd_cwnd = max(tp->snd_cwnd >> 1U, 1U); + tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp) >> 1U, 1U)); /* record this drop time */ lp->last_drop = now; diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 95b5ac082a2f4..6d9367142b4b2 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -408,15 +408,15 @@ void tcp_update_metrics(struct sock *sk) if (!READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) && !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) { val = tcp_metric_get(tm, TCP_METRIC_SSTHRESH); - if (val && (tp->snd_cwnd >> 1) > val) + if (val && (tcp_snd_cwnd(tp) >> 1) > val) tcp_metric_set(tm, TCP_METRIC_SSTHRESH, - tp->snd_cwnd >> 1); + tcp_snd_cwnd(tp) >> 1); } if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) { val = tcp_metric_get(tm, TCP_METRIC_CWND); - if (tp->snd_cwnd > val) + if (tcp_snd_cwnd(tp) > val) tcp_metric_set(tm, TCP_METRIC_CWND, - tp->snd_cwnd); + tcp_snd_cwnd(tp)); } } else if (!tcp_in_slow_start(tp) && icsk->icsk_ca_state == TCP_CA_Open) { @@ -424,10 +424,10 @@ void tcp_update_metrics(struct sock *sk) if (!READ_ONCE(net->ipv4.sysctl_tcp_no_ssthresh_metrics_save) && !tcp_metric_locked(tm, TCP_METRIC_SSTHRESH)) tcp_metric_set(tm, TCP_METRIC_SSTHRESH, - max(tp->snd_cwnd >> 1, tp->snd_ssthresh)); + max(tcp_snd_cwnd(tp) >> 1, tp->snd_ssthresh)); if (!tcp_metric_locked(tm, TCP_METRIC_CWND)) { val = tcp_metric_get(tm, TCP_METRIC_CWND); - tcp_metric_set(tm, TCP_METRIC_CWND, (val + tp->snd_cwnd) >> 1); + tcp_metric_set(tm, TCP_METRIC_CWND, (val + tcp_snd_cwnd(tp)) >> 1); } } else { /* Else slow start did not finish, cwnd is non-sense, diff --git a/net/ipv4/tcp_nv.c b/net/ipv4/tcp_nv.c index 95db7a11ba2ad..63024ec17b204 100644 --- a/net/ipv4/tcp_nv.c +++ b/net/ipv4/tcp_nv.c @@ -198,10 +198,10 @@ static void tcpnv_cong_avoid(struct sock *sk, u32 ack, u32 acked) } if (ca->cwnd_growth_factor < 0) { - cnt = tp->snd_cwnd << -ca->cwnd_growth_factor; + cnt = tcp_snd_cwnd(tp) << -ca->cwnd_growth_factor; tcp_cong_avoid_ai(tp, cnt, acked); } else { - cnt = max(4U, tp->snd_cwnd >> ca->cwnd_growth_factor); + cnt = max(4U, tcp_snd_cwnd(tp) >> ca->cwnd_growth_factor); tcp_cong_avoid_ai(tp, cnt, acked); } } @@ -210,7 +210,7 @@ static u32 tcpnv_recalc_ssthresh(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); - return max((tp->snd_cwnd * nv_loss_dec_factor) >> 10, 2U); + return max((tcp_snd_cwnd(tp) * nv_loss_dec_factor) >> 10, 2U); } static void tcpnv_state(struct sock *sk, u8 new_state) @@ -258,7 +258,7 @@ static void tcpnv_acked(struct sock *sk, const struct ack_sample *sample) return; /* Stop cwnd growth if we were in catch up mode */ - if (ca->nv_catchup && tp->snd_cwnd >= nv_min_cwnd) { + if (ca->nv_catchup && tcp_snd_cwnd(tp) >= nv_min_cwnd) { 
ca->nv_catchup = 0; ca->nv_allow_cwnd_growth = 0; } @@ -372,7 +372,7 @@ static void tcpnv_acked(struct sock *sk, const struct ack_sample *sample) * if cwnd < max_win, grow cwnd * else leave the same */ - if (tp->snd_cwnd > max_win) { + if (tcp_snd_cwnd(tp) > max_win) { /* there is congestion, check that it is ok * to make a CA decision * 1. We should have at least nv_dec_eval_min_calls @@ -399,20 +399,20 @@ static void tcpnv_acked(struct sock *sk, const struct ack_sample *sample) ca->nv_allow_cwnd_growth = 0; tp->snd_ssthresh = (nv_ssthresh_factor * max_win) >> 3; - if (tp->snd_cwnd - max_win > 2) { + if (tcp_snd_cwnd(tp) - max_win > 2) { /* gap > 2, we do exponential cwnd decrease */ int dec; - dec = max(2U, ((tp->snd_cwnd - max_win) * + dec = max(2U, ((tcp_snd_cwnd(tp) - max_win) * nv_cong_dec_mult) >> 7); - tp->snd_cwnd -= dec; + tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) - dec); } else if (nv_cong_dec_mult > 0) { - tp->snd_cwnd = max_win; + tcp_snd_cwnd_set(tp, max_win); } if (ca->cwnd_growth_factor > 0) ca->cwnd_growth_factor = 0; ca->nv_no_cong_cnt = 0; - } else if (tp->snd_cwnd <= max_win - nv_pad_buffer) { + } else if (tcp_snd_cwnd(tp) <= max_win - nv_pad_buffer) { /* There is no congestion, grow cwnd if allowed*/ if (ca->nv_eval_call_cnt < nv_inc_eval_min_calls) return; @@ -445,8 +445,8 @@ static void tcpnv_acked(struct sock *sk, const struct ack_sample *sample) * (it wasn't before, if it is now is because nv * decreased it). */ - if (tp->snd_cwnd < nv_min_cwnd) - tp->snd_cwnd = nv_min_cwnd; + if (tcp_snd_cwnd(tp) < nv_min_cwnd) + tcp_snd_cwnd_set(tp, nv_min_cwnd); } } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 88c7b79d22aea..89152ab3c5ad0 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -143,7 +143,7 @@ void tcp_cwnd_restart(struct sock *sk, s32 delta) { struct tcp_sock *tp = tcp_sk(sk); u32 restart_cwnd = tcp_init_cwnd(tp, __sk_dst_get(sk)); - u32 cwnd = tp->snd_cwnd; + u32 cwnd = tcp_snd_cwnd(tp); tcp_ca_event(sk, CA_EVENT_CWND_RESTART); @@ -152,7 +152,7 @@ void tcp_cwnd_restart(struct sock *sk, s32 delta) while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd) cwnd >>= 1; - tp->snd_cwnd = max(cwnd, restart_cwnd); + tcp_snd_cwnd_set(tp, max(cwnd, restart_cwnd)); tp->snd_cwnd_stamp = tcp_jiffies32; tp->snd_cwnd_used = 0; } @@ -1010,7 +1010,7 @@ static void tcp_tsq_write(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); if (tp->lost_out > tp->retrans_out && - tp->snd_cwnd > tcp_packets_in_flight(tp)) { + tcp_snd_cwnd(tp) > tcp_packets_in_flight(tp)) { tcp_mstamp_refresh(tp); tcp_xmit_retransmit_queue(sk); } @@ -1861,9 +1861,9 @@ static void tcp_cwnd_application_limited(struct sock *sk) /* Limited by application or receiver window. 
*/ u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk)); u32 win_used = max(tp->snd_cwnd_used, init_win); - if (win_used < tp->snd_cwnd) { + if (win_used < tcp_snd_cwnd(tp)) { tp->snd_ssthresh = tcp_current_ssthresh(sk); - tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1; + tcp_snd_cwnd_set(tp, (tcp_snd_cwnd(tp) + win_used) >> 1); } tp->snd_cwnd_used = 0; } @@ -2040,7 +2040,7 @@ static inline unsigned int tcp_cwnd_test(const struct tcp_sock *tp, return 1; in_flight = tcp_packets_in_flight(tp); - cwnd = tp->snd_cwnd; + cwnd = tcp_snd_cwnd(tp); if (in_flight >= cwnd) return 0; @@ -2196,12 +2196,12 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, in_flight = tcp_packets_in_flight(tp); BUG_ON(tcp_skb_pcount(skb) <= 1); - BUG_ON(tp->snd_cwnd <= in_flight); + BUG_ON(tcp_snd_cwnd(tp) <= in_flight); send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; /* From in_flight test above, we know that cwnd > in_flight. */ - cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache; + cong_win = (tcp_snd_cwnd(tp) - in_flight) * tp->mss_cache; limit = min(send_win, cong_win); @@ -2215,7 +2215,7 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, win_divisor = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_tso_win_divisor); if (win_divisor) { - u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache); + u32 chunk = min(tp->snd_wnd, tcp_snd_cwnd(tp) * tp->mss_cache); /* If at least some fraction of a window is available, * just use it. @@ -2343,7 +2343,7 @@ static int tcp_mtu_probe(struct sock *sk) if (likely(!icsk->icsk_mtup.enabled || icsk->icsk_mtup.probe_size || inet_csk(sk)->icsk_ca_state != TCP_CA_Open || - tp->snd_cwnd < 11 || + tcp_snd_cwnd(tp) < 11 || tp->rx_opt.num_sacks || tp->rx_opt.dsack)) return -1; @@ -2379,7 +2379,7 @@ static int tcp_mtu_probe(struct sock *sk) return 0; /* Do we need to wait to drain cwnd? With none in flight, don't stall */ - if (tcp_packets_in_flight(tp) + 2 > tp->snd_cwnd) { + if (tcp_packets_in_flight(tp) + 2 > tcp_snd_cwnd(tp)) { if (!tcp_packets_in_flight(tp)) return -1; else @@ -2450,7 +2450,7 @@ static int tcp_mtu_probe(struct sock *sk) if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) { /* Decrement cwnd here because we are sending * effectively two packets. 
*/ - tp->snd_cwnd--; + tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) - 1); tcp_event_new_data_sent(sk, nskb); icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len); @@ -2719,7 +2719,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, else tcp_chrono_stop(sk, TCP_CHRONO_RWND_LIMITED); - is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tp->snd_cwnd); + is_cwnd_limited |= (tcp_packets_in_flight(tp) >= tcp_snd_cwnd(tp)); if (likely(sent_pkts || is_cwnd_limited)) tcp_cwnd_validate(sk, is_cwnd_limited); @@ -2826,7 +2826,7 @@ void tcp_send_loss_probe(struct sock *sk) if (unlikely(!skb)) { WARN_ONCE(tp->packets_out, "invalid inflight: %u state %u cwnd %u mss %d\n", - tp->packets_out, sk->sk_state, tp->snd_cwnd, mss); + tp->packets_out, sk->sk_state, tcp_snd_cwnd(tp), mss); inet_csk(sk)->icsk_pending = 0; return; } @@ -3338,7 +3338,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk) if (!hole) tp->retransmit_skb_hint = skb; - segs = tp->snd_cwnd - tcp_packets_in_flight(tp); + segs = tcp_snd_cwnd(tp) - tcp_packets_in_flight(tp); if (segs <= 0) break; sacked = TCP_SKB_CB(skb)->sacked; diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c index 0de6935659635..192dbf25da3eb 100644 --- a/net/ipv4/tcp_rate.c +++ b/net/ipv4/tcp_rate.c @@ -189,7 +189,7 @@ void tcp_rate_check_app_limited(struct sock *sk) /* Nothing in sending host's qdisc queues or NIC tx queue. */ sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) && /* We are not limited by CWND. */ - tcp_packets_in_flight(tp) < tp->snd_cwnd && + tcp_packets_in_flight(tp) < tcp_snd_cwnd(tp) && /* All lost packets have been retransmitted. */ tp->lost_out <= tp->retrans_out) tp->app_limited = diff --git a/net/ipv4/tcp_scalable.c b/net/ipv4/tcp_scalable.c index 5842081bc8a25..862b96248a92d 100644 --- a/net/ipv4/tcp_scalable.c +++ b/net/ipv4/tcp_scalable.c @@ -27,7 +27,7 @@ static void tcp_scalable_cong_avoid(struct sock *sk, u32 ack, u32 acked) if (!acked) return; } - tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT), + tcp_cong_avoid_ai(tp, min(tcp_snd_cwnd(tp), TCP_SCALABLE_AI_CNT), acked); } @@ -35,7 +35,7 @@ static u32 tcp_scalable_ssthresh(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); - return max(tp->snd_cwnd - (tp->snd_cwnd>>TCP_SCALABLE_MD_SCALE), 2U); + return max(tcp_snd_cwnd(tp) - (tcp_snd_cwnd(tp)>>TCP_SCALABLE_MD_SCALE), 2U); } static struct tcp_congestion_ops tcp_scalable __read_mostly = { diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index c8003c8aad2c0..786848ad37ea8 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -159,7 +159,7 @@ EXPORT_SYMBOL_GPL(tcp_vegas_cwnd_event); static inline u32 tcp_vegas_ssthresh(struct tcp_sock *tp) { - return min(tp->snd_ssthresh, tp->snd_cwnd); + return min(tp->snd_ssthresh, tcp_snd_cwnd(tp)); } static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked) @@ -217,14 +217,14 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked) * This is: * (actual rate in segments) * baseRTT */ - target_cwnd = (u64)tp->snd_cwnd * vegas->baseRTT; + target_cwnd = (u64)tcp_snd_cwnd(tp) * vegas->baseRTT; do_div(target_cwnd, rtt); /* Calculate the difference between the window we had, * and the window we would like to have. This quantity * is the "Diff" from the Arizona Vegas papers. */ - diff = tp->snd_cwnd * (rtt-vegas->baseRTT) / vegas->baseRTT; + diff = tcp_snd_cwnd(tp) * (rtt-vegas->baseRTT) / vegas->baseRTT; if (diff > gamma && tcp_in_slow_start(tp)) { /* Going too fast. 
Time to slow down @@ -238,7 +238,8 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked) * truncation robs us of full link * utilization. */ - tp->snd_cwnd = min(tp->snd_cwnd, (u32)target_cwnd+1); + tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), + (u32)target_cwnd + 1)); tp->snd_ssthresh = tcp_vegas_ssthresh(tp); } else if (tcp_in_slow_start(tp)) { @@ -254,14 +255,14 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked) /* The old window was too fast, so * we slow down. */ - tp->snd_cwnd--; + tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) - 1); tp->snd_ssthresh = tcp_vegas_ssthresh(tp); } else if (diff < alpha) { /* We don't have enough extra packets * in the network, so speed up. */ - tp->snd_cwnd++; + tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1); } else { /* Sending just as fast as we * should be. @@ -269,10 +270,10 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, u32 acked) } } - if (tp->snd_cwnd < 2) - tp->snd_cwnd = 2; - else if (tp->snd_cwnd > tp->snd_cwnd_clamp) - tp->snd_cwnd = tp->snd_cwnd_clamp; + if (tcp_snd_cwnd(tp) < 2) + tcp_snd_cwnd_set(tp, 2); + else if (tcp_snd_cwnd(tp) > tp->snd_cwnd_clamp) + tcp_snd_cwnd_set(tp, tp->snd_cwnd_clamp); tp->snd_ssthresh = tcp_current_ssthresh(sk); } diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index cd50a61c9976d..366ff6f214b2e 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c @@ -146,11 +146,11 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked) rtt = veno->minrtt; - target_cwnd = (u64)tp->snd_cwnd * veno->basertt; + target_cwnd = (u64)tcp_snd_cwnd(tp) * veno->basertt; target_cwnd <<= V_PARAM_SHIFT; do_div(target_cwnd, rtt); - veno->diff = (tp->snd_cwnd << V_PARAM_SHIFT) - target_cwnd; + veno->diff = (tcp_snd_cwnd(tp) << V_PARAM_SHIFT) - target_cwnd; if (tcp_in_slow_start(tp)) { /* Slow start. */ @@ -164,15 +164,15 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked) /* In the "non-congestive state", increase cwnd * every rtt. */ - tcp_cong_avoid_ai(tp, tp->snd_cwnd, acked); + tcp_cong_avoid_ai(tp, tcp_snd_cwnd(tp), acked); } else { /* In the "congestive state", increase cwnd * every other rtt. */ - if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { + if (tp->snd_cwnd_cnt >= tcp_snd_cwnd(tp)) { if (veno->inc && - tp->snd_cwnd < tp->snd_cwnd_clamp) { - tp->snd_cwnd++; + tcp_snd_cwnd(tp) < tp->snd_cwnd_clamp) { + tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) + 1); veno->inc = 0; } else veno->inc = 1; @@ -181,10 +181,10 @@ static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, u32 acked) tp->snd_cwnd_cnt += acked; } done: - if (tp->snd_cwnd < 2) - tp->snd_cwnd = 2; - else if (tp->snd_cwnd > tp->snd_cwnd_clamp) - tp->snd_cwnd = tp->snd_cwnd_clamp; + if (tcp_snd_cwnd(tp) < 2) + tcp_snd_cwnd_set(tp, 2); + else if (tcp_snd_cwnd(tp) > tp->snd_cwnd_clamp) + tcp_snd_cwnd_set(tp, tp->snd_cwnd_clamp); } /* Wipe the slate clean for the next rtt. 
*/ /* veno->cntrtt = 0; */ @@ -199,10 +199,10 @@ static u32 tcp_veno_ssthresh(struct sock *sk) if (veno->diff < beta) /* in "non-congestive state", cut cwnd by 1/5 */ - return max(tp->snd_cwnd * 4 / 5, 2U); + return max(tcp_snd_cwnd(tp) * 4 / 5, 2U); else /* in "congestive state", cut cwnd by 1/2 */ - return max(tp->snd_cwnd >> 1U, 2U); + return max(tcp_snd_cwnd(tp) >> 1U, 2U); } static struct tcp_congestion_ops tcp_veno __read_mostly = { diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index b2e05c4cea00f..c6e97141eef25 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -244,7 +244,8 @@ static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) switch (event) { case CA_EVENT_COMPLETE_CWR: - tp->snd_cwnd = tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); + tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); + tcp_snd_cwnd_set(tp, tp->snd_ssthresh); break; case CA_EVENT_LOSS: tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c index 3bb448761ca38..460df348170a8 100644 --- a/net/ipv4/tcp_yeah.c +++ b/net/ipv4/tcp_yeah.c @@ -71,11 +71,11 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked) if (!yeah->doing_reno_now) { /* Scalable */ - tcp_cong_avoid_ai(tp, min(tp->snd_cwnd, TCP_SCALABLE_AI_CNT), + tcp_cong_avoid_ai(tp, min(tcp_snd_cwnd(tp), TCP_SCALABLE_AI_CNT), acked); } else { /* Reno */ - tcp_cong_avoid_ai(tp, tp->snd_cwnd, acked); + tcp_cong_avoid_ai(tp, tcp_snd_cwnd(tp), acked); } /* The key players are v_vegas.beg_snd_una and v_beg_snd_nxt. @@ -130,7 +130,7 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked) /* Compute excess number of packets above bandwidth * Avoid doing full 64 bit divide. */ - bw = tp->snd_cwnd; + bw = tcp_snd_cwnd(tp); bw *= rtt - yeah->vegas.baseRTT; do_div(bw, rtt); queue = bw; @@ -138,20 +138,20 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked) if (queue > TCP_YEAH_ALPHA || rtt - yeah->vegas.baseRTT > (yeah->vegas.baseRTT / TCP_YEAH_PHY)) { if (queue > TCP_YEAH_ALPHA && - tp->snd_cwnd > yeah->reno_count) { + tcp_snd_cwnd(tp) > yeah->reno_count) { u32 reduction = min(queue / TCP_YEAH_GAMMA , - tp->snd_cwnd >> TCP_YEAH_EPSILON); + tcp_snd_cwnd(tp) >> TCP_YEAH_EPSILON); - tp->snd_cwnd -= reduction; + tcp_snd_cwnd_set(tp, tcp_snd_cwnd(tp) - reduction); - tp->snd_cwnd = max(tp->snd_cwnd, - yeah->reno_count); + tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), + yeah->reno_count)); - tp->snd_ssthresh = tp->snd_cwnd; + tp->snd_ssthresh = tcp_snd_cwnd(tp); } if (yeah->reno_count <= 2) - yeah->reno_count = max(tp->snd_cwnd>>1, 2U); + yeah->reno_count = max(tcp_snd_cwnd(tp)>>1, 2U); else yeah->reno_count++; @@ -176,7 +176,7 @@ static void tcp_yeah_cong_avoid(struct sock *sk, u32 ack, u32 acked) */ yeah->vegas.beg_snd_una = yeah->vegas.beg_snd_nxt; yeah->vegas.beg_snd_nxt = tp->snd_nxt; - yeah->vegas.beg_snd_cwnd = tp->snd_cwnd; + yeah->vegas.beg_snd_cwnd = tcp_snd_cwnd(tp); /* Wipe the slate clean for the next RTT. 
*/ yeah->vegas.cntRTT = 0; @@ -193,16 +193,16 @@ static u32 tcp_yeah_ssthresh(struct sock *sk) if (yeah->doing_reno_now < TCP_YEAH_RHO) { reduction = yeah->lastQ; - reduction = min(reduction, max(tp->snd_cwnd>>1, 2U)); + reduction = min(reduction, max(tcp_snd_cwnd(tp)>>1, 2U)); - reduction = max(reduction, tp->snd_cwnd >> TCP_YEAH_DELTA); + reduction = max(reduction, tcp_snd_cwnd(tp) >> TCP_YEAH_DELTA); } else - reduction = max(tp->snd_cwnd>>1, 2U); + reduction = max(tcp_snd_cwnd(tp)>>1, 2U); yeah->fast_count = 0; yeah->reno_count = max(yeah->reno_count>>1, 2U); - return max_t(int, tp->snd_cwnd - reduction, 2); + return max_t(int, tcp_snd_cwnd(tp) - reduction, 2); } static struct tcp_congestion_ops tcp_yeah __read_mostly = { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 1d348b11108e3..8679f9d4c0f9c 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2038,7 +2038,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) jiffies_to_clock_t(icsk->icsk_rto), jiffies_to_clock_t(icsk->icsk_ack.ato), (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sp), - tp->snd_cwnd, + tcp_snd_cwnd(tp), state == TCP_LISTEN ? fastopenq->max_qlen : (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh) From 998b0e96515e590781308910f0724796396ec6e6 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Thu, 23 Sep 2021 21:17:07 +0000 Subject: [PATCH 07/61] tcp: tracking packets with CE marks in BW rate sample In order to track CE marks per rate sample (one round trip), TCP needs a per-skb header field to record the tp->delivered_ce count when the skb was sent. To make space, we replace the "last_in_flight" field which is used exclusively for NV congestion control. The stat needed by NV can be alternatively approximated by existing stats tcp_sock delivered and mss_cache. This patch counts the number of packets delivered which have CE marks in the rate sample, using similar approach of delivery accounting. Cc: Lawrence Brakmo Signed-off-by: Yuchung Cheng Acked-by: Neal Cardwell Signed-off-by: Eric Dumazet Signed-off-by: Luke Hsiao Signed-off-by: David S. Miller Signed-off-by: Juhyung Park --- include/net/tcp.h | 9 ++++++--- net/ipv4/tcp_input.c | 11 +++++------ net/ipv4/tcp_output.c | 2 -- net/ipv4/tcp_rate.c | 6 ++++++ 4 files changed, 17 insertions(+), 11 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 59f85d9cb6909..a6d87c6411fb3 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -871,10 +871,11 @@ struct tcp_skb_cb { __u32 ack_seq; /* Sequence number ACK'd */ union { struct { +#define TCPCB_DELIVERED_CE_MASK ((1U<<20) - 1) /* There is space for up to 24 bytes */ - __u32 in_flight:30,/* Bytes in flight at transmit */ - is_app_limited:1, /* cwnd not fully used? */ - unused:1; + __u32 is_app_limited:1, /* cwnd not fully used? 
*/ + delivered_ce:20, + unused:11; /* pkts S/ACKed so far upon tx of skb, incl retrans: */ __u32 delivered; /* start of send pipeline phase */ @@ -1051,7 +1052,9 @@ struct ack_sample { struct rate_sample { u64 prior_mstamp; /* starting timestamp for interval */ u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ + u32 prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */ s32 delivered; /* number of packets delivered over interval */ + s32 delivered_ce; /* number of packets delivered w/ CE marks*/ long interval_us; /* time for tp->delivered to incr "delivered" */ u32 snd_interval_us; /* snd interval for delivered packets */ u32 rcv_interval_us; /* rcv interval for delivered packets */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c2b4edb58ae33..3a70eee24cd76 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3269,7 +3269,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, long seq_rtt_us = -1L; long ca_rtt_us = -1L; u32 pkts_acked = 0; - u32 last_in_flight = 0; bool rtt_update; int flag = 0; @@ -3305,7 +3304,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, if (!first_ackt) first_ackt = last_ackt; - last_in_flight = TCP_SKB_CB(skb)->tx.in_flight; if (before(start_seq, reord)) reord = start_seq; if (!after(scb->end_seq, tp->high_seq)) @@ -3371,8 +3369,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, seq_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, first_ackt); ca_rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, last_ackt); - if (pkts_acked == 1 && last_in_flight < tp->mss_cache && - last_in_flight && !prior_sacked && fully_acked && + if (pkts_acked == 1 && fully_acked && !prior_sacked && + (tp->snd_una - prior_snd_una) < tp->mss_cache && sack->rate->prior_delivered + 1 == tp->delivered && !(flag & (FLAG_CA_ALERT | FLAG_SYN_ACKED))) { /* Conservatively mark a delayed ACK. It's typically @@ -3429,9 +3427,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack, if (icsk->icsk_ca_ops->pkts_acked) { struct ack_sample sample = { .pkts_acked = pkts_acked, - .rtt_us = sack->rate->rtt_us, - .in_flight = last_in_flight }; + .rtt_us = sack->rate->rtt_us }; + sample.in_flight = tp->mss_cache * + (tp->delivered - sack->rate->prior_delivered); icsk->icsk_ca_ops->pkts_acked(sk, &sample); } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 89152ab3c5ad0..38e56b1de8409 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1254,8 +1254,6 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache); skb->skb_mstamp_ns = tp->tcp_wstamp_ns; if (clone_it) { - TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq - - tp->snd_una; oskb = skb; tcp_skb_tsorted_save(oskb) { diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c index 192dbf25da3eb..617b8187c03d9 100644 --- a/net/ipv4/tcp_rate.c +++ b/net/ipv4/tcp_rate.c @@ -65,6 +65,7 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) TCP_SKB_CB(skb)->tx.first_tx_mstamp = tp->first_tx_mstamp; TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp; TCP_SKB_CB(skb)->tx.delivered = tp->delivered; + TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce; TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 
1 : 0; } @@ -86,6 +87,7 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, if (!rs->prior_delivered || after(scb->tx.delivered, rs->prior_delivered)) { + rs->prior_delivered_ce = scb->tx.delivered_ce; rs->prior_delivered = scb->tx.delivered; rs->prior_mstamp = scb->tx.delivered_mstamp; rs->is_app_limited = scb->tx.is_app_limited; @@ -138,6 +140,10 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, } rs->delivered = tp->delivered - rs->prior_delivered; + rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce; + /* delivered_ce occupies less than 32 bits in the skb control block */ + rs->delivered_ce &= TCPCB_DELIVERED_CE_MASK; + /* Model sending data and receiving ACKs as separate pipeline phases * for a window. Usually the ACK phase is longer, but with ACK * compression the send phase can be longer. To be safe we use the From 2ec70cb1bc1c27f16735460b1a683285804f1c05 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Tue, 11 Jun 2019 12:26:55 -0400 Subject: [PATCH 08/61] net-tcp_bbr: broaden app-limited rate sample detection This commit is a bug fix for the Linux TCP app-limited (application-limited) logic that is used for collecting rate (bandwidth) samples. Previously the app-limited logic only looked for "bubbles" of silence in between application writes, by checking at the start of each sendmsg. But "bubbles" of silence can also happen before retransmits: e.g. bubbles can happen between an application write and a retransmit, or between two retransmits. Retransmits are triggered by ACKs or timers. So this commit checks for bubbles of app-limited silence upon ACKs or timers. Why does this commit check for app-limited state at the start of ACKs and timer handling? Because at that point we know whether inflight was fully using the cwnd. During processing the ACK or timer event we often change the cwnd; after changing the cwnd we can't know whether inflight was fully using the old cwnd. Origin-9xx-SHA1: 3fe9b53291e018407780fb8c356adb5666722cbc Change-Id: I37221506f5166877c2b110753d39bb0757985e68 Signed-off-by: Juhyung Park --- net/ipv4/tcp_input.c | 1 + net/ipv4/tcp_timer.c | 1 + 2 files changed, 2 insertions(+) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3a70eee24cd76..342b9f8e8e6a2 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3840,6 +3840,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una; rs.prior_in_flight = tcp_packets_in_flight(tp); + tcp_rate_check_app_limited(sk); /* ts_recent update must be made after we are sure that the packet * is in window. diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 8bf258b94e242..984bc159593e0 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -640,6 +640,7 @@ void tcp_write_timer_handler(struct sock *sk) goto out; } + tcp_rate_check_app_limited(sk); tcp_mstamp_refresh(tcp_sk(sk)); event = icsk->icsk_pending; From befa1ca7786432976720b50b4a049d8edaac63e4 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Sun, 24 Jun 2018 21:55:59 -0400 Subject: [PATCH 09/61] net-tcp_bbr: v2: shrink delivered_mstamp, first_tx_mstamp to u32 to free up 8 bytes Free up some space for tracking inflight and losses for each bw sample, in upcoming commits. These timestamps are in microseconds, and are now stored in 32 bits. So they can only hold time intervals up to roughly 2^12 = 4096 seconds. 
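(For concreteness, a small userspace sketch, not part of the patch, of the arithmetic behind that bound: a u32 microsecond clock wraps after 2^32 us, about 4295 s, i.e. roughly 2^12 s, and the unsigned subtraction used by tcp_stamp32_us_delta() below still yields the correct interval across a wrap as long as the interval itself stays under that bound.)

#include <stdio.h>
#include <stdint.h>

/* Illustrative only: mirrors the idea behind tcp_stamp32_us_delta(). */
static uint32_t stamp32_us_delta(uint32_t t1, uint32_t t0)
{
	int32_t delta = (int32_t)(t1 - t0);	/* modulo-2^32 subtraction */

	return delta > 0 ? (uint32_t)delta : 0;	/* clamp clock skew to 0 */
}

int main(void)
{
	uint32_t t0 = 4294960000u;	/* shortly before the 2^32 us wrap */
	uint32_t t1 = t0 + 10000u;	/* 10 ms later; the raw value has wrapped */

	printf("max representable interval: ~%u s\n", UINT32_MAX / 1000000u);
	printf("delta across the wrap: %u us\n", stamp32_us_delta(t1, t0));
	return 0;
}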
But Linux TCP RTT and RTO tracking has the same 32-bit microsecond implementation approach and resulting deployment limitations. So this is not introducing a new limit. And these should not be a limitation for the foreseeable future. Effort: net-tcp_bbr Origin-9xx-SHA1: 238a7e6b5d51625fef1ce7769826a7b21b02ae55 Change-Id: I3b779603797263b52a61ad57c565eb91fe42680c Signed-off-by: Juhyung Park --- include/net/tcp.h | 9 +++++++-- net/ipv4/tcp_rate.c | 7 ++++--- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index a6d87c6411fb3..267ae1d2453d8 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -804,6 +804,11 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) return max_t(s64, t1 - t0, 0); } +static inline u32 tcp_stamp32_us_delta(u32 t1, u32 t0) +{ + return max_t(s32, t1 - t0, 0); +} + static inline u32 tcp_skb_timestamp(const struct sk_buff *skb) { return tcp_ns_to_ts(skb->skb_mstamp_ns); @@ -879,9 +884,9 @@ struct tcp_skb_cb { /* pkts S/ACKed so far upon tx of skb, incl retrans: */ __u32 delivered; /* start of send pipeline phase */ - u64 first_tx_mstamp; + u32 first_tx_mstamp; /* when we reached the "delivered" count */ - u64 delivered_mstamp; + u32 delivered_mstamp; } tx; /* only used for outgoing skbs */ union { struct inet_skb_parm h4; diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c index 617b8187c03d9..363eb8f1c9dcd 100644 --- a/net/ipv4/tcp_rate.c +++ b/net/ipv4/tcp_rate.c @@ -96,8 +96,9 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, /* Record send time of most recently ACKed packet: */ tp->first_tx_mstamp = tcp_skb_timestamp_us(skb); /* Find the duration of the "send phase" of this window: */ - rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp, - scb->tx.first_tx_mstamp); + rs->interval_us = tcp_stamp32_us_delta( + tp->first_tx_mstamp, + scb->tx.first_tx_mstamp); } /* Mark off the skb delivered once it's sacked to avoid being @@ -150,7 +151,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, * longer phase. */ snd_us = rs->interval_us; /* send phase */ - ack_us = tcp_stamp_us_delta(tp->tcp_mstamp, + ack_us = tcp_stamp32_us_delta(tp->tcp_mstamp, rs->prior_mstamp); /* ack phase */ rs->interval_us = max(snd_us, ack_us); From 332b31e75811a03f38ffa6b919f581d157f6fe5b Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Sat, 5 Aug 2017 11:49:50 -0400 Subject: [PATCH 10/61] net-tcp_bbr: v2: snapshot packets in flight at transmit time and pass in rate_sample CC algorithms may want to snapshot the number of packets in flight at transmit time and pass in rate_sample, to understand the relationship between inflight and losses or ECN signals, to try to find the highest inflight value that has acceptable levels of loss/ECN marking. We split out the code to set an skb's tx.in_flight field into its own function, so that this code can be used for the TCP_REPAIR "fake send" code path that inserts skbs into the rtx queue without sending them. 
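As a rough sketch of the consumer side (hypothetical code, not part of this series; the mymod_* names and the ce_floor_pkts field are invented), a congestion control module could combine the new tx_in_flight snapshot with the per-sample CE count to estimate the inflight level at which ECN marking begins:

#include <net/tcp.h>

/* Private per-connection state for the hypothetical module. */
struct mymod_ecn_state {
	u32 ce_floor_pkts;	/* lowest tx_in_flight whose ACKs carried CE marks */
};

/* Helper a module could call from its cong_control-style hook, once per
 * rate sample, to track where CE marking starts.
 */
static void mymod_note_ce_inflight(struct sock *sk, const struct rate_sample *rs)
{
	struct mymod_ecn_state *ca = inet_csk_ca(sk);

	if (rs->delivered <= 0 || !rs->tx_in_flight)
		return;		/* nothing usable in this sample */

	if (rs->delivered_ce > 0)
		ca->ce_floor_pkts = min_not_zero(ca->ce_floor_pkts,
						 rs->tx_in_flight);
}

A real module would also age this estimate and feed it into its cwnd or pacing decisions; the point here is only that the snapshot taken at transmit time, not the inflight at ACK time, is what relates marking to queue depth.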
Effort: net-tcp_bbr Origin-9xx-SHA1: b3eb4f2d20efab4ca001f32c9294739036c493ea Origin-9xx-SHA1: e880fc907d06ea7354333f60f712748ebce9497b Origin-9xx-SHA1: 330f825a08a6fe92cef74d799cc468864c479f63 Change-Id: I7314047d0ff14dd261a04b1969a46dc658c8836a Signed-off-by: Juhyung Park --- include/net/tcp.h | 6 ++++++ net/ipv4/tcp_output.c | 1 + net/ipv4/tcp_rate.c | 20 ++++++++++++++++++++ 3 files changed, 27 insertions(+) diff --git a/include/net/tcp.h b/include/net/tcp.h index 267ae1d2453d8..363d572386057 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -887,6 +887,10 @@ struct tcp_skb_cb { u32 first_tx_mstamp; /* when we reached the "delivered" count */ u32 delivered_mstamp; +#define TCPCB_IN_FLIGHT_BITS 20 +#define TCPCB_IN_FLIGHT_MAX ((1U << TCPCB_IN_FLIGHT_BITS) - 1) + u32 in_flight:20, /* packets in flight at transmit */ + unused2:12; } tx; /* only used for outgoing skbs */ union { struct inet_skb_parm h4; @@ -1058,6 +1062,7 @@ struct rate_sample { u64 prior_mstamp; /* starting timestamp for interval */ u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ u32 prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */ + u32 tx_in_flight; /* packets in flight at starting timestamp */ s32 delivered; /* number of packets delivered over interval */ s32 delivered_ce; /* number of packets delivered w/ CE marks*/ long interval_us; /* time for tp->delivered to incr "delivered" */ @@ -1170,6 +1175,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) } /* From tcp_rate.c */ +void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb); void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, struct rate_sample *rs); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 38e56b1de8409..a656a447411d5 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2638,6 +2638,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, skb->skb_mstamp_ns = tp->tcp_wstamp_ns = tp->tcp_clock_cache; list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); tcp_init_tso_segs(skb, mss_now); + tcp_set_tx_in_flight(sk, skb); goto repair; /* Skip network transmission */ } diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c index 363eb8f1c9dcd..1bc58e2902c2e 100644 --- a/net/ipv4/tcp_rate.c +++ b/net/ipv4/tcp_rate.c @@ -34,6 +34,24 @@ * ready to send in the write queue. */ +void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 in_flight; + + /* Check, sanitize, and record packets in flight after skb was sent. */ + in_flight = tcp_packets_in_flight(tp) + tcp_skb_pcount(skb); + if (WARN_ONCE(in_flight > TCPCB_IN_FLIGHT_MAX, + "insane in_flight %u cc %s mss %u " + "cwnd %u pif %u %u %u %u\n", + in_flight, inet_csk(sk)->icsk_ca_ops->name, + tp->mss_cache, tp->snd_cwnd, + tp->packets_out, tp->retrans_out, + tp->sacked_out, tp->lost_out)) + in_flight = TCPCB_IN_FLIGHT_MAX; + TCP_SKB_CB(skb)->tx.in_flight = in_flight; +} + /* Snapshot the current delivery information in the skb, to generate * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered(). */ @@ -67,6 +85,7 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) TCP_SKB_CB(skb)->tx.delivered = tp->delivered; TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce; TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 
1 : 0; + tcp_set_tx_in_flight(sk, skb); } /* When an skb is sacked or acked, we fill in the rate sample with the (prior) @@ -92,6 +111,7 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, rs->prior_mstamp = scb->tx.delivered_mstamp; rs->is_app_limited = scb->tx.is_app_limited; rs->is_retrans = scb->sacked & TCPCB_RETRANS; + rs->tx_in_flight = scb->tx.in_flight; /* Record send time of most recently ACKed packet: */ tp->first_tx_mstamp = tcp_skb_timestamp_us(skb); From 91695b6db87e599eec357711d3a13a30ed9ba040 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Thu, 12 Oct 2017 23:44:27 -0400 Subject: [PATCH 11/61] net-tcp_bbr: v2: count packets lost over TCP rate sampling interval For understanding the relationship between inflight and packet loss signals, to try to find the highest inflight value that has acceptable levels of packet losses. Effort: net-tcp_bbr Origin-9xx-SHA1: 4527e26b2bd7756a88b5b9ef1ada3da33dd609ab Change-Id: I594c2500868d9c530770e7ddd68ffc87c57f4fd5 Signed-off-by: Juhyung Park --- include/net/tcp.h | 5 ++++- net/ipv4/tcp_rate.c | 3 +++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 363d572386057..d2bdc504789bc 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -891,6 +891,7 @@ struct tcp_skb_cb { #define TCPCB_IN_FLIGHT_MAX ((1U << TCPCB_IN_FLIGHT_BITS) - 1) u32 in_flight:20, /* packets in flight at transmit */ unused2:12; + u32 lost; /* packets lost so far upon tx of skb */ } tx; /* only used for outgoing skbs */ union { struct inet_skb_parm h4; @@ -1060,11 +1061,13 @@ struct ack_sample { */ struct rate_sample { u64 prior_mstamp; /* starting timestamp for interval */ + u32 prior_lost; /* tp->lost at "prior_mstamp" */ u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ u32 prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */ u32 tx_in_flight; /* packets in flight at starting timestamp */ + s32 lost; /* number of packets lost over interval */ s32 delivered; /* number of packets delivered over interval */ - s32 delivered_ce; /* number of packets delivered w/ CE marks*/ + s32 delivered_ce; /* packets delivered w/ CE mark over interval */ long interval_us; /* time for tp->delivered to incr "delivered" */ u32 snd_interval_us; /* snd interval for delivered packets */ u32 rcv_interval_us; /* rcv interval for delivered packets */ diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c index 1bc58e2902c2e..018e53ca2dc8e 100644 --- a/net/ipv4/tcp_rate.c +++ b/net/ipv4/tcp_rate.c @@ -84,6 +84,7 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp; TCP_SKB_CB(skb)->tx.delivered = tp->delivered; TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce; + TCP_SKB_CB(skb)->tx.lost = tp->lost; TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 
1 : 0; tcp_set_tx_in_flight(sk, skb); } @@ -106,6 +107,7 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, if (!rs->prior_delivered || after(scb->tx.delivered, rs->prior_delivered)) { + rs->prior_lost = scb->tx.lost; rs->prior_delivered_ce = scb->tx.delivered_ce; rs->prior_delivered = scb->tx.delivered; rs->prior_mstamp = scb->tx.delivered_mstamp; @@ -160,6 +162,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, return; } rs->delivered = tp->delivered - rs->prior_delivered; + rs->lost = tp->lost - rs->prior_lost; rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce; /* delivered_ce occupies less than 32 bits in the skb control block */ From 579a358f0611e82dfe8c99f92554890adc340a55 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Mon, 19 Nov 2018 13:48:36 -0500 Subject: [PATCH 12/61] net-tcp_bbr: v2: export FLAG_ECE in rate_sample.is_ece For understanding the relationship between inflight and ECN signals, to try to find the highest inflight value that has acceptable levels ECN marking. Effort: net-tcp_bbr Origin-9xx-SHA1: 3eba998f2898541406c2666781182200934965a8 Change-Id: I3a964e04cee83e11649a54507043d2dfe769a3b3 Signed-off-by: Juhyung Park --- include/net/tcp.h | 1 + net/ipv4/tcp_input.c | 1 + 2 files changed, 2 insertions(+) diff --git a/include/net/tcp.h b/include/net/tcp.h index d2bdc504789bc..c8896768f155f 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1078,6 +1078,7 @@ struct rate_sample { bool is_app_limited; /* is sample from packet with bubble in pipe? */ bool is_retrans; /* is sample from retransmission? */ bool is_ack_delayed; /* is this (likely) a delayed ACK? */ + bool is_ece; /* did this ACK have ECN marked? */ }; struct tcp_congestion_ops { diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 342b9f8e8e6a2..f4e8cab92a638 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3939,6 +3939,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) delivered = tcp_newly_delivered(sk, delivered, flag); lost = tp->lost - lost; /* freshly marked lost */ rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); + rs.is_ece = !!(flag & FLAG_ECE); tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); tcp_xmit_recovery(sk, rexmit); From 28024779daae41671e205af2badd92c97d6b1448 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Tue, 7 Aug 2018 21:52:06 -0400 Subject: [PATCH 13/61] net-tcp_bbr: v2: introduce ca_ops->skb_marked_lost() CC module callback API For connections experiencing reordering, RACK can mark packets lost long after we receive the SACKs/ACKs hinting that the packets were actually lost. This means that CC modules cannot easily learn the volume of inflight data at which packet loss happens by looking at the current inflight or even the packets in flight when the most recently SACKed packet was sent. To learn this, CC modules need to know how many packets were in flight at the time lost packets were sent. This new callback, combined with TCP_SKB_CB(skb)->tx.in_flight, allows them to learn this. This also provides a consistent callback that is invoked whether packets are marked lost upon ACK processing, using the RACK reordering timer, or at RTO time. 
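A minimal sketch of wiring up the new callback (hypothetical module code, not from this series): the handler reads the per-skb snapshot recorded at transmit time, which is exactly what the current inflight can no longer tell you by the time RACK marks the loss.

#include <net/tcp.h>

struct mymod_loss_state {
	u32 loss_floor_pkts;	/* lowest tx.in_flight among skbs marked lost */
};

static void mymod_skb_marked_lost(struct sock *sk, const struct sk_buff *skb)
{
	struct mymod_loss_state *ca = inet_csk_ca(sk);
	u32 tx_in_flight = TCP_SKB_CB(skb)->tx.in_flight;

	if (!tx_in_flight)	/* skip skbs without a recorded snapshot */
		return;

	ca->loss_floor_pkts = min_not_zero(ca->loss_floor_pkts, tx_in_flight);
}

static struct tcp_congestion_ops mymod_ops __read_mostly = {
	/* mandatory hooks (name, ssthresh, undo_cwnd, ...) elided in this sketch */
	.skb_marked_lost	= mymod_skb_marked_lost,
};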
Effort: net-tcp_bbr Origin-9xx-SHA1: afcbebe3374e4632ac6714d39e4dc8a8455956f4 Change-Id: I54826ab53df636be537e5d3c618a46145d12d51a Signed-off-by: Juhyung Park --- include/net/tcp.h | 2 ++ net/ipv4/tcp_input.c | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/include/net/tcp.h b/include/net/tcp.h index c8896768f155f..4a2e17d03a950 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1107,6 +1107,8 @@ struct tcp_congestion_ops { void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); /* override sysctl_tcp_min_tso_segs */ u32 (*min_tso_segs)(struct sock *sk); + /* react to a specific lost skb (optional) */ + void (*skb_marked_lost)(struct sock *sk, const struct sk_buff *skb); /* returns the multiplier used in tcp_sndbuf_expand (optional) */ u32 (*sndbuf_expand)(struct sock *sk); /* call when packets are delivered to update cwnd and pacing rate, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index f4e8cab92a638..d621771118b86 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1075,7 +1075,12 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) */ static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb) { + struct sock *sk = (struct sock *)tp; + const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; + tp->lost += tcp_skb_pcount(skb); + if (ca_ops->skb_marked_lost) + ca_ops->skb_marked_lost(sk, skb); } void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) From 5009d425d1adbad7c8fb6ab4135eabf27e9d54e8 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Wed, 1 May 2019 20:16:33 -0400 Subject: [PATCH 14/61] net-tcp_bbr: v2: adjust skb tx.in_flight upon merge in tcp_shifted_skb() When tcp_shifted_skb() updates state as adjacent SACKed skbs are coalesced, previously the tx.in_flight was not adjusted, so we could get contradictory state where the skb's recorded pcount was bigger than the tx.in_flight (the number of segments that were in_flight after sending the skb). Normally have a SACKed skb with contradictory pcount/tx.in_flight would not matter. However, with SACK reneging, the SACKed bit is removed, and an skb once again becomes eligible for retransmitting, fragmenting, SACKing, etc. Packetdrill testing verified the following sequence is possible in a kernel that does not have this commit: - skb N is SACKed - skb N+1 is SACKed and combined with skb N using tcp_shifted_skb() - tcp_shifted_skb() will increase the pcount of prev, but leave tx.in_flight as-is - so prev skb can have pcount > tx.in_flight - RTO, tcp_timeout_mark_lost(), detect reneg, remove "SACKed" bit, mark skb N as lost - find pcount of skb N is greater than its tx.in_flight I suspect this issue iw what caused the bbr2_inflight_hi_from_lost_skb(): WARN_ON_ONCE(inflight_prev < 0) to fire in production machines using bbr2. Effort: net-tcp_bbr Origin-9xx-SHA1: 1a3e997e613d2dcf32b947992882854ebe873715 Change-Id: I1b0b75c27519953430c7db51c6f358f104c7af55 Signed-off-by: Juhyung Park --- net/ipv4/tcp_input.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d621771118b86..e7cc52d8c2510 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1461,6 +1461,17 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount); tcp_skb_pcount_add(skb, -pcount); + /* Adjust tx.in_flight as pcount is shifted from skb to prev. 
*/ + if (WARN_ONCE(TCP_SKB_CB(skb)->tx.in_flight < pcount, + "prev in_flight: %u skb in_flight: %u pcount: %u", + TCP_SKB_CB(prev)->tx.in_flight, + TCP_SKB_CB(skb)->tx.in_flight, + pcount)) + TCP_SKB_CB(skb)->tx.in_flight = 0; + else + TCP_SKB_CB(skb)->tx.in_flight -= pcount; + TCP_SKB_CB(prev)->tx.in_flight += pcount; + /* When we're adding to gso_segs == 1, gso_size will be zero, * in theory this shouldn't be necessary but as long as DSACK * code can come after this skb later on it's better to keep From 708bf2b30ed662cbddbfe90f40f1308459ed2f9e Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Wed, 1 May 2019 20:16:25 -0400 Subject: [PATCH 15/61] net-tcp_bbr: v2: adjust skb tx.in_flight upon split in tcp_fragment() When we fragment an skb that has already been sent, we need to update the tx.in_flight for the first skb in the resulting pair ("buff"). Because we were not updating the tx.in_flight, the tx.in_flight value was inconsistent with the pcount of the "buff" skb (tx.in_flight would be too high). That meant that if the "buff" skb was lost, then bbr2_inflight_hi_from_lost_skb() would calculate an inflight_hi value that is too high. This could result in longer queues and higher packet loss. Packetdrill testing verified that without this commit, when the second half of an skb is SACKed and then later the first half of that skb is marked lost, the calculated inflight_hi was incorrect. Effort: net-tcp_bbr Origin-9xx-SHA1: 385f1ddc610798fab2837f9f372857438b25f874 Origin-9xx-SHA1: a0eb099690af net-tcp_bbr: v2: fix tcp_fragment() tx.in_flight recomputation [prod feb 8 2021; use as a fixup] Origin-9xx-SHA1: 885503228153ff0c9114e net-tcp_bbr: v2: introduce tcp_skb_tx_in_flight_is_suspicious() helper for warnings Change-Id: I617f8cab4e9be7a0b8e8d30b047bf8645393354d Signed-off-by: Juhyung Park --- include/net/tcp.h | 15 +++++++++++++++ net/ipv4/tcp_output.c | 26 +++++++++++++++++++++++++- 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 4a2e17d03a950..4d24ac6c0b5cd 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1189,6 +1189,21 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, bool is_sack_reneg, struct rate_sample *rs); void tcp_rate_check_app_limited(struct sock *sk); +/* If a retransmit failed due to local qdisc congestion or other local issues, + * then we may have called tcp_set_skb_tso_segs() to increase the number of + * segments in the skb without increasing the tx.in_flight. In all other cases, + * the tx.in_flight should be at least as big as the pcount of the sk_buff. We + * do not have the state to know whether a retransmit failed due to local qdisc + * congestion or other local issues, so to avoid spurious warnings we consider + * that any skb marked lost may have suffered that fate. + */ +static inline bool tcp_skb_tx_in_flight_is_suspicious(u32 skb_pcount, + u32 skb_sacked_flags, + u32 tx_in_flight) +{ + return (skb_pcount > tx_in_flight) && !(skb_sacked_flags & TCPCB_LOST); +} + /* These functions determine how the current flow behaves in respect of SACK * handling. SACK is negotiated with the peer, and therefore it can vary * between different flows. 
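Before the tcp_output.c hunk that follows, a worked example of the recomputation it performs (illustrative userspace code, not part of the patch): an skb that was sent with tx.in_flight = 30 and pcount = 10 and is later split into 6- and 4-segment halves should end up recording 26 and 30 respectively, because the in-flight level just before the original transmit was 30 - 10 = 20.

#include <stdio.h>
#include <stdint.h>

/* Mirror the tx.in_flight bookkeeping for a sent skb that gets split. */
static void split_in_flight(uint32_t tx_in_flight, uint32_t old_pcount,
			    uint32_t first_pcount, uint32_t second_pcount)
{
	/* in-flight level just before the original skb was transmitted */
	int64_t inflight_prev = (int64_t)tx_in_flight - old_pcount;

	if (inflight_prev < 0)	/* inconsistent state; clamp like the patch does */
		inflight_prev = 0;

	printf("1st half: %lld, 2nd half: %lld\n",
	       (long long)(inflight_prev + first_pcount),
	       (long long)(inflight_prev + first_pcount + second_pcount));
}

int main(void)
{
	split_in_flight(30, 10, 6, 4);	/* prints "1st half: 26, 2nd half: 30" */
	return 0;
}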
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a656a447411d5..705e040694439 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1530,7 +1530,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *buff; - int nsize, old_factor; + int nsize, old_factor, inflight_prev; long limit; int nlen; u8 flags; @@ -1608,6 +1608,30 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, if (diff) tcp_adjust_pcount(sk, skb, diff); + + inflight_prev = TCP_SKB_CB(skb)->tx.in_flight - old_factor; + if (inflight_prev < 0) { + WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious( + old_factor, + TCP_SKB_CB(skb)->sacked, + TCP_SKB_CB(skb)->tx.in_flight), + "inconsistent: tx.in_flight: %u " + "old_factor: %d mss: %u sacked: %u " + "1st pcount: %d 2nd pcount: %d " + "1st len: %u 2nd len: %u ", + TCP_SKB_CB(skb)->tx.in_flight, old_factor, + mss_now, TCP_SKB_CB(skb)->sacked, + tcp_skb_pcount(skb), tcp_skb_pcount(buff), + skb->len, buff->len); + inflight_prev = 0; + } + /* Set 1st tx.in_flight as if 1st were sent by itself: */ + TCP_SKB_CB(skb)->tx.in_flight = inflight_prev + + tcp_skb_pcount(skb); + /* Set 2nd tx.in_flight with new 1st and 2nd pcounts: */ + TCP_SKB_CB(buff)->tx.in_flight = inflight_prev + + tcp_skb_pcount(skb) + + tcp_skb_pcount(buff); } /* Link BUFF into the send queue. */ From c15f7a37852e75c98778425afceb9ba821c80fd1 Mon Sep 17 00:00:00 2001 From: Yousuk Seung Date: Wed, 23 May 2018 17:55:54 -0700 Subject: [PATCH 16/61] net-tcp: add new ca opts flag TCP_CONG_WANTS_CE_EVENTS Add a a new ca opts flag TCP_CONG_WANTS_CE_EVENTS that allows a congestion control module to receive CE events. Currently congestion control modules have to set the TCP_CONG_NEEDS_ECN bit in opts flag to receive CE events but this may incur changes in ECN behavior elsewhere. This patch adds a new bit TCP_CONG_WANTS_CE_EVENTS that allows congestion control modules to receive CE events independently of TCP_CONG_NEEDS_ECN. Effort: net-tcp Origin-9xx-SHA1: 9f7e14716cde760bc6c67ef8ef7e1ee48501d95b Change-Id: I2255506985242f376d910c6fd37daabaf4744f24 Signed-off-by: Juhyung Park --- include/net/tcp.h | 14 +++++++++++++- net/ipv4/tcp_input.c | 4 ++-- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 4d24ac6c0b5cd..2b54685ca8291 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1041,7 +1041,11 @@ enum tcp_ca_ack_event_flags { #define TCP_CONG_NON_RESTRICTED 0x1 /* Requires ECN/ECT set on all packets */ #define TCP_CONG_NEEDS_ECN 0x2 -#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN) +/* Wants notification of CE events (CA_EVENT_ECN_IS_CE, CA_EVENT_ECN_NO_CE). 
*/ +#define TCP_CONG_WANTS_CE_EVENTS 0x4 +#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | \ + TCP_CONG_NEEDS_ECN | \ + TCP_CONG_WANTS_CE_EVENTS) union tcp_cc_info; @@ -1156,6 +1160,14 @@ static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer) } #endif +static inline bool tcp_ca_wants_ce_events(const struct sock *sk) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + + return icsk->icsk_ca_ops->flags & (TCP_CONG_NEEDS_ECN | + TCP_CONG_WANTS_CE_EVENTS); +} + static inline bool tcp_ca_needs_ecn(const struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e7cc52d8c2510..9552289359032 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -361,7 +361,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) tcp_enter_quickack_mode(sk, 2); break; case INET_ECN_CE: - if (tcp_ca_needs_ecn(sk)) + if (tcp_ca_wants_ce_events(sk)) tcp_ca_event(sk, CA_EVENT_ECN_IS_CE); if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { @@ -372,7 +372,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) tp->ecn_flags |= TCP_ECN_SEEN; break; default: - if (tcp_ca_needs_ecn(sk)) + if (tcp_ca_wants_ce_events(sk)) tcp_ca_event(sk, CA_EVENT_ECN_NO_CE); tp->ecn_flags |= TCP_ECN_SEEN; break; From 578d7d93b31e207bb48b0e6a524f80a10a2aaee0 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Fri, 27 Sep 2019 17:10:26 -0400 Subject: [PATCH 17/61] net-tcp: re-generalize TSO sizing in TCP CC module API Reorganize the API for CC modules so that the CC module once again gets complete control of the TSO sizing decision. This is how the API was set up around 2016 and the initial BBRv1 upstreaming. Later Eric Dumazet simplified it. But with wider testing it now seems that to avoid CPU regressions BBR needs to have a different TSO sizing function. This is necessary to handle cases where there are many flows bottlenecked on the sender host's NIC, in which case BBR's pacing rate is much lower than CUBIC/Reno/DCTCP's. Why does this happen? Because BBR's pacing rate adapts to the low bandwidth share each flow sees. By contrast, CUBIC/Reno/DCTCP see no loss or ECN, so they grow a very large cwnd, and thus large pacing rate and large TSO burst size. 
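To make the size gap concrete, a small userspace calculation (illustrative only; the 1/1024 burst budget approximates the default sk_pacing_shift of 10, and the real code further caps the result by gso_max_size and sk_gso_max_segs):

#include <stdio.h>

/* Budget roughly 1/1024 s of data at the pacing rate, in MSS-sized segments. */
static unsigned int tso_segs_estimate(unsigned long long pacing_rate_Bps,
				      unsigned int mss)
{
	unsigned long long bytes = pacing_rate_Bps >> 10;
	unsigned long long segs = bytes / mss;

	return segs > 2 ? (unsigned int)segs : 2;	/* keep at least 2 segments */
}

int main(void)
{
	unsigned int mss = 1448;

	/* BBR flow seeing a small share of a busy NIC: ~40 Mbit/s pacing rate */
	printf("40 Mbit/s -> %u segs per TSO burst\n",
	       tso_segs_estimate(5ULL * 1000 * 1000, mss));
	/* CUBIC flow with a huge cwnd: ~40 Gbit/s pacing rate
	 * (the kernel would cap this near 64 KB / mss, i.e. ~45 segs)
	 */
	printf("40 Gbit/s -> %u segs per TSO burst\n",
	       tso_segs_estimate(5ULL * 1000 * 1000 * 1000, mss));
	return 0;
}

With numbers like these, a flow whose pacing rate adapts to a small bandwidth share computes a far smaller burst than a loss-blind flow that has grown a large cwnd, which is why BBR wants control over the sizing decision.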
Change-Id: Ic8ccfdbe4010ee8d4bf6a6334c48a2fceb2171ea Signed-off-by: Juhyung Park --- include/net/tcp.h | 4 ++-- net/ipv4/bpf_tcp_ca.c | 2 +- net/ipv4/tcp_bbr.c | 37 ++++++++++++++++++++++++++----------- net/ipv4/tcp_output.c | 11 +++++------ 4 files changed, 34 insertions(+), 20 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 2b54685ca8291..38be218af7c3b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1109,8 +1109,8 @@ struct tcp_congestion_ops { u32 (*undo_cwnd)(struct sock *sk); /* hook for packet ack accounting (optional) */ void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); - /* override sysctl_tcp_min_tso_segs */ - u32 (*min_tso_segs)(struct sock *sk); + /* pick target number of segments per TSO/GSO skb (optional): */ + u32 (*tso_segs)(struct sock *sk, unsigned int mss_now); /* react to a specific lost skb (optional) */ void (*skb_marked_lost)(struct sock *sk, const struct sk_buff *skb); /* returns the multiplier used in tcp_sndbuf_expand (optional) */ diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 618954f82764d..8ad93e1fe9dd3 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -16,7 +16,7 @@ static u32 optional_ops[] = { offsetof(struct tcp_congestion_ops, cwnd_event), offsetof(struct tcp_congestion_ops, in_ack_event), offsetof(struct tcp_congestion_ops, pkts_acked), - offsetof(struct tcp_congestion_ops, min_tso_segs), + offsetof(struct tcp_congestion_ops, tso_segs), offsetof(struct tcp_congestion_ops, sndbuf_expand), offsetof(struct tcp_congestion_ops, cong_control), }; diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index c5ee83654db1c..935ef3ccd342f 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -298,20 +298,35 @@ static u32 bbr_min_tso_segs(struct sock *sk) return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2; } +/* Return the number of segments BBR would like in a TSO/GSO skb, given + * a particular max gso size as a constraint. + */ +static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now, + u32 gso_max_size) +{ + u32 segs; + u64 bytes; + + /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */ + bytes = sk->sk_pacing_rate >> sk->sk_pacing_shift; + + bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER); + segs = max_t(u32, bytes / mss_now, bbr_min_tso_segs(sk)); + return segs; +} + +/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */ +static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now) +{ + return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size); +} + +/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. */ static u32 bbr_tso_segs_goal(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - u32 segs, bytes; - - /* Sort of tcp_tso_autosize() but ignoring - * driver provided sk_gso_max_size. 
- */ - bytes = min_t(unsigned long, - sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift), - GSO_MAX_SIZE - 1 - MAX_TCP_HEADER); - segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk)); - return min(segs, 0x7FU); + return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_MAX_SIZE); } /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ @@ -1147,7 +1162,7 @@ static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { .undo_cwnd = bbr_undo_cwnd, .cwnd_event = bbr_cwnd_event, .ssthresh = bbr_ssthresh, - .min_tso_segs = bbr_min_tso_segs, + .tso_segs = bbr_tso_segs, .get_info = bbr_get_info, .set_state = bbr_set_state, }; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 705e040694439..b68a8cb148c44 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2006,13 +2006,12 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) { const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; - u32 min_tso, tso_segs; + u32 tso_segs; - min_tso = ca_ops->min_tso_segs ? - ca_ops->min_tso_segs(sk) : - READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); - - tso_segs = tcp_tso_autosize(sk, mss_now, min_tso); + tso_segs = ca_ops->tso_segs ? + ca_ops->tso_segs(sk, mss_now) : + tcp_tso_autosize(sk, mss_now, + sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); return min_t(u32, tso_segs, sk->sk_gso_max_segs); } From b0b91af6f9816f74c40e4000c3872dbdcf0de1aa Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Sat, 16 Nov 2019 13:16:25 -0500 Subject: [PATCH 18/61] net-tcp: add fast_ack_mode=1: skip rwin check in tcp_fast_ack_mode__tcp_ack_snd_check() Add logic for an experimental TCP connection behavior, enabled with tp->fast_ack_mode = 1, which disables checking the receive window before sending an ack in __tcp_ack_snd_check(). If this behavior is enabled, the data receiver sends an ACK if the amount of data is > RCV.MSS. Change-Id: Iaa0a0fd7108221f883137a79d5bfa724f1b096d4 Signed-off-by: Juhyung Park --- include/linux/tcp.h | 3 ++- net/ipv4/tcp.c | 1 + net/ipv4/tcp_cong.c | 1 + net/ipv4/tcp_input.c | 5 +++-- 4 files changed, 7 insertions(+), 3 deletions(-) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 9c67f3e7a28b2..46ed7dd23e7db 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -226,7 +226,8 @@ struct tcp_sock { u8 compressed_ack; u8 dup_ack_counter:2, tlp_retrans:1, /* TLP is a retransmission */ - unused:5; + fast_ack_mode:2, /* which fast ack mode ? 
*/ + unused:3; u32 chrono_start; /* Start time in jiffies of a TCP chrono */ u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */ u8 chrono_type:2, /* current chronograph type */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index fa0e18b9c1b12..58fcaef32108d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2872,6 +2872,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->rx_opt.dsack = 0; tp->rx_opt.num_sacks = 0; tp->rcv_ooopack = 0; + tp->fast_ack_mode = 0; /* Clean up fastopen related fields */ diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index f43db30a7195d..d2cd30d4effde 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -179,6 +179,7 @@ void tcp_init_congestion_control(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); tcp_sk(sk)->prior_ssthresh = 0; + tcp_sk(sk)->fast_ack_mode = 0; if (icsk->icsk_ca_ops->init) icsk->icsk_ca_ops->init(sk); if (tcp_ca_needs_ecn(sk)) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9552289359032..37e386db15ded 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5531,13 +5531,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) /* More than one full frame received... */ if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && + (tp->fast_ack_mode == 1 || /* ... and right edge of window advances far enough. * (tcp_recvmsg() will send ACK otherwise). * If application uses SO_RCVLOWAT, we want send ack now if * we have not received enough bytes to satisfy the condition. */ - (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || - __tcp_select_window(sk) >= tp->rcv_wnd)) || + (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || + __tcp_select_window(sk) >= tp->rcv_wnd))) || /* We ACK each frame or... */ tcp_in_quickack_mode(sk) || /* Protocol state mandates a one-time immediate ACK */ From f32ed640423f79b8d6dce572851f60359298c199 Mon Sep 17 00:00:00 2001 From: Jianfeng Wang Date: Fri, 19 Jun 2020 17:33:45 +0000 Subject: [PATCH 19/61] net-tcp_bbr: v2: record app-limited status of TLP-repaired flight When sending a TLP retransmit, record whether the outstanding flight of data is application limited. This is important for congestion control modules that want to respond to losses repaired by TLP retransmits. This is important because the following scenarios convey very different information: (1) a packet loss with a small number of packets in flight; (2) a packet loss with the maximum amount of data in flight allowed by the CC module; Effort: net-tcp_bbr Change-Id: Ic8ae567caa4e4bfd5fd82c3d4be12a5d9171655e Signed-off-by: Juhyung Park --- include/linux/tcp.h | 3 ++- net/ipv4/tcp_output.c | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 46ed7dd23e7db..edc53adbac3ed 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -227,7 +227,8 @@ struct tcp_sock { u8 dup_ack_counter:2, tlp_retrans:1, /* TLP is a retransmission */ fast_ack_mode:2, /* which fast ack mode ? */ - unused:3; + tlp_orig_data_app_limited:1, /* app-limited before TLP rtx? 
*/ + unused:2; u32 chrono_start; /* Start time in jiffies of a TCP chrono */ u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */ u8 chrono_type:2, /* current chronograph type */ diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index b68a8cb148c44..63c894031b23e 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2871,6 +2871,7 @@ void tcp_send_loss_probe(struct sock *sk) if (WARN_ON(!skb || !tcp_skb_pcount(skb))) goto rearm_timer; + tp->tlp_orig_data_app_limited = TCP_SKB_CB(skb)->tx.is_app_limited; if (__tcp_retransmit_skb(sk, skb, 1)) goto rearm_timer; From fa711b8d8181e1e3814083e622f267e42f21f373 Mon Sep 17 00:00:00 2001 From: Jianfeng Wang Date: Tue, 16 Jun 2020 17:41:19 +0000 Subject: [PATCH 20/61] net-tcp_bbr: v2: inform CC module of losses repaired by TLP probe Before this commit, when there is a packet loss that creates a sequence hole that is filled by a TLP loss probe, then tcp_process_tlp_ack() only informs the congestion control (CC) module via a back-to-back entry and exit of CWR. But some congestion control modules (e.g. BBR) do not respond to CWR events. This commit adds a new CA event with which the core TCP stack notifies the CC module when a loss is repaired by a TLP. This will allow CC modules that do not use the CWR mechanism to have a custom handler for such TLP recoveries. Effort: net-tcp_bbr Change-Id: Ieba72332b401b329bff5a641d2b2043a3fb8f632 Signed-off-by: Juhyung Park --- include/net/tcp.h | 1 + net/ipv4/tcp_input.c | 1 + 2 files changed, 2 insertions(+) diff --git a/include/net/tcp.h b/include/net/tcp.h index 38be218af7c3b..99646fd620dce 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1019,6 +1019,7 @@ enum tcp_ca_event { CA_EVENT_LOSS, /* loss timeout */ CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */ CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */ + CA_EVENT_TLP_RECOVERY, /* a lost segment was repaired by TLP probe */ }; /* Information about inbound ACK, passed to cong_ops->in_ack_event() */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 37e386db15ded..daf6d7e4471c6 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3738,6 +3738,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) /* ACK advances: there was a loss, so reduce cwnd. Reset * tlp_high_seq in tcp_init_cwnd_reduction() */ + tcp_ca_event(sk, CA_EVENT_TLP_RECOVERY); tcp_init_cwnd_reduction(sk); tcp_set_ca_state(sk, TCP_CA_CWR); tcp_end_cwnd_reduction(sk); From 64f3b6f1f164d1117d9d54d64870ad57ce53933d Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Mon, 21 Sep 2020 14:46:26 -0400 Subject: [PATCH 21/61] net-tcp_bbr: v2: introduce is_acking_tlp_retrans_seq into rate_sample Introduce is_acking_tlp_retrans_seq into rate_sample. This bool will export to the CC module the knowledge of whether the current ACK matched a TLP retransmit. Note that when this bool is true, we cannot yet tell (in general) whether this ACK is for the original or the TLP retransmit. Effort: net-tcp_bbr Change-Id: I2e6494332167e75efcbdc99bd5c119034e9c39b4 Signed-off-by: Juhyung Park --- include/net/tcp.h | 1 + net/ipv4/tcp_input.c | 12 +++++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 99646fd620dce..638ee7d93668e 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1082,6 +1082,7 @@ struct rate_sample { u32 prior_in_flight; /* in flight before this ACK */ bool is_app_limited; /* is sample from packet with bubble in pipe? 
*/ bool is_retrans; /* is sample from retransmission? */ + bool is_acking_tlp_retrans_seq; /* ACKed a TLP retransmit sequence? */ bool is_ack_delayed; /* is this (likely) a delayed ACK? */ bool is_ece; /* did this ACK have ECN marked? */ }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index daf6d7e4471c6..5f602e0fabf85 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3721,7 +3721,8 @@ static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) /* This routine deals with acks during a TLP episode and ends an episode by * resetting tlp_high_seq. Ref: TLP algorithm in draft-ietf-tcpm-rack */ -static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) +static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag, + struct rate_sample *rs) { struct tcp_sock *tp = tcp_sk(sk); @@ -3749,6 +3750,11 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) FLAG_NOT_DUP | FLAG_DATA_SACKED))) { /* Pure dupack: original and TLP probe arrived; no loss */ tp->tlp_high_seq = 0; + } else { + /* This ACK matches a TLP retransmit. We cannot yet tell if + * this ACK is for the original or the TLP retransmit. + */ + rs->is_acking_tlp_retrans_seq = 1; } } @@ -3932,7 +3938,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_rack_update_reo_wnd(sk, &rs); if (tp->tlp_high_seq) - tcp_process_tlp_ack(sk, ack, flag); + tcp_process_tlp_ack(sk, ack, flag, &rs); if (tcp_ack_is_dubious(sk, flag)) { if (!(flag & (FLAG_SND_UNA_ADVANCED | @@ -3976,7 +3982,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_ack_probe(sk); if (tp->tlp_high_seq) - tcp_process_tlp_ack(sk, ack, flag); + tcp_process_tlp_ack(sk, ack, flag, &rs); return 1; old_ack: From 06f92ed200065966ff30aaf6e06a24efec1d3cce Mon Sep 17 00:00:00 2001 From: David Morley Date: Fri, 14 Jul 2023 11:07:56 -0400 Subject: [PATCH 22/61] tcp: introduce per-route feature RTAX_FEATURE_ECN_LOW Define and implement a new per-route feature, RTAX_FEATURE_ECN_LOW. This feature indicates that the given destination network is a low-latency ECN environment, meaning both that ECN CE marks are applied by the network using a low-latency marking threshold and also that TCP endpoints provide precise per-data-segment ECN feedback in ACKs (where the ACK ECE flag echoes the received CE status of all newly-acknowledged data segments). This feature indication can be used by congestion control algorithms to decide how to interpret ECN signals over the given destination network. This feature is appropriate for datacenter-style ECN marking, such as the ECN marking approach expected by DCTCP or BBR congestion control modules. 
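As an illustration (an editor's sketch, not part of this patch; the function
name is hypothetical), a congestion control module could gate its DCTCP-style
interpretation of CE marks on this per-route signal by checking the
TCP_ECN_LOW flag that the helper added below records in tp->ecn_flags:

/*
 * Editor's sketch, not part of this patch: use low-threshold ECN logic
 * only if classic ECN was negotiated (TCP_ECN_OK) and the route was
 * marked RTAX_FEATURE_ECN_LOW (recorded as TCP_ECN_LOW in ecn_flags by
 * tcp_set_ecn_low_from_dst() below). The function name is illustrative.
 */
static bool example_can_use_low_ecn(const struct sock *sk)
{
	return (tcp_sk(sk)->ecn_flags & TCP_ECN_OK) &&
	       (tcp_sk(sk)->ecn_flags & TCP_ECN_LOW);
}

This mirrors the check that the BBR v3 patch later in this series performs
before enabling its ECN logic.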
Signed-off-by: David Morley Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Tested-by: David Morley Change-Id: I6bc06e9c6cb426fbae7243fc71c9a8c18175f5d3 Signed-off-by: Juhyung Park --- include/net/tcp.h | 10 ++++++++++ include/uapi/linux/rtnetlink.h | 4 +++- net/ipv4/tcp_minisocks.c | 2 ++ net/ipv4/tcp_output.c | 6 ++++-- 4 files changed, 19 insertions(+), 3 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 638ee7d93668e..be1dba13c8bf9 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -372,6 +372,7 @@ static inline void tcp_dec_quickack_mode(struct sock *sk) #define TCP_ECN_QUEUE_CWR 2 #define TCP_ECN_DEMAND_CWR 4 #define TCP_ECN_SEEN 8 +#define TCP_ECN_LOW 16 enum tcp_tw_status { TCP_TW_SUCCESS = 0, @@ -708,6 +709,15 @@ static inline void tcp_fast_path_check(struct sock *sk) tcp_fast_path_on(tp); } +static inline void tcp_set_ecn_low_from_dst(struct sock *sk, + const struct dst_entry *dst) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (dst_feature(dst, RTAX_FEATURE_ECN_LOW)) + tp->ecn_flags |= TCP_ECN_LOW; +} + /* Compute the actual rto_min value */ static inline u32 tcp_rto_min(struct sock *sk) { diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 9b814c92de123..0f7bff0c6e985 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -482,9 +482,11 @@ enum { #define RTAX_FEATURE_SACK (1 << 1) #define RTAX_FEATURE_TIMESTAMP (1 << 2) #define RTAX_FEATURE_ALLFRAG (1 << 3) +#define RTAX_FEATURE_ECN_LOW (1 << 4) #define RTAX_FEATURE_MASK (RTAX_FEATURE_ECN | RTAX_FEATURE_SACK | \ - RTAX_FEATURE_TIMESTAMP | RTAX_FEATURE_ALLFRAG) + RTAX_FEATURE_TIMESTAMP | RTAX_FEATURE_ALLFRAG \ + | RTAX_FEATURE_ECN_LOW) struct rta_session { __u8 proto; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 01e27620b7ee5..8cc4e3c774cdd 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -409,6 +409,8 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) u32 ca_key = dst_metric(dst, RTAX_CC_ALGO); bool ca_got_dst = false; + tcp_set_ecn_low_from_dst(sk, dst); + if (ca_key != TCP_CA_UNSPEC) { const struct tcp_congestion_ops *ca; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 63c894031b23e..5dc37e3f59c60 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -322,10 +322,9 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk); bool use_ecn = sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 || tcp_ca_needs_ecn(sk) || bpf_needs_ecn; + const struct dst_entry *dst = __sk_dst_get(sk); if (!use_ecn) { - const struct dst_entry *dst = __sk_dst_get(sk); - if (dst && dst_feature(dst, RTAX_FEATURE_ECN)) use_ecn = true; } @@ -337,6 +336,9 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) tp->ecn_flags = TCP_ECN_OK; if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn) INET_ECN_xmit(sk); + + if (dst) + tcp_set_ecn_low_from_dst(sk, dst); } } From b06822be250a1cd32c01a5b2f82d30f2f522a067 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Tue, 11 Jun 2019 12:54:22 -0400 Subject: [PATCH 23/61] net-tcp_bbr: v3: update TCP "bbr" congestion control module to BBRv3 BBR v3 is an enhacement to the BBR v1 algorithm. It's designed to aim for lower queues, lower loss, and better Reno/CUBIC coexistence than BBR v1. 
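For orientation, here is an editor's sketch (not part of the patch) of the
v1 core that v3 builds on: both the pacing rate and the cwnd are derived from
a model of max-filtered bandwidth and min-filtered RTT. The function below is
illustrative only; it follows the BW_SCALE/BBR_SCALE fixed-point conventions
used in tcp_bbr.c.

/*
 * Editor's sketch, not part of this patch: the BBR v1 core relations
 * that v3 retains. bw is in packets/usec scaled by 2^24 (BW_SCALE) and
 * cwnd_gain is scaled by 2^8 (BBR_SCALE), matching tcp_bbr.c; the
 * function itself is illustrative only.
 */
static u32 sketch_bbr_target_cwnd(u64 bw, u32 min_rtt_us, u32 cwnd_gain)
{
	u64 bdp = bw * min_rtt_us;		/* estimated BDP, pkts << 24 */
	u64 cwnd = (bdp * cwnd_gain) >> 8;	/* apply cwnd_gain (2^8)     */

	cwnd = (cwnd + (1 << 24) - 1) >> 24;	/* round up, drop bw scaling */
	return cwnd < 4 ? 4 : (u32)cwnd;	/* keep at least 4 packets   */
}

The pacing rate is derived analogously, as roughly pacing_gain * max_bw; the
diff below keeps these relations but additionally feeds the model with loss
and ECN signals.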
BBR v3 maintains the core of BBR v1: an explicit model of the network
path that is two-dimensional, adapting to estimate the (a) maximum
available bandwidth and (b) maximum safe volume of data a flow can keep
in-flight in the network. It maintains the estimated BDP as a core
guide for estimating an appropriate level of in-flight data.

BBR v3 makes several key enhancements:

o Its bandwidth-probing time scale is adapted, within bounds, to allow
  improved coexistence with Reno and CUBIC. The bandwidth-probing time
  scale is (a) extended dynamically based on estimated BDP to improve
  coexistence with Reno/CUBIC; (b) bounded by an interactive wall-clock
  time-scale to be more scalable and responsive than Reno and CUBIC.

o Rather than being largely agnostic to loss and ECN marks, it
  explicitly uses loss and (DCTCP-style) ECN signals to maintain its
  model.

o It aims for lower losses than v1 by adjusting its model to attempt to
  stay within loss rate and ECN mark rate bounds (loss_thresh and
  ecn_thresh, respectively).

o It adapts to loss/ECN signals even when the application is running
  out of data ("application-limited"), in case the "application-limited"
  flow is also "network-limited" (the bw and/or inflight available to
  this flow is lower than previously estimated when the flow ran out
  of data).

o It has a three-part model: the model explicitly tracks three
  operating points, where an operating point is a tuple:
  (bandwidth, inflight). The three operating points are:

  o latest:      the latest measurement from the current round trip
  o upper bound: robust, optimistic, long-term upper bound
  o lower bound: robust, conservative, short-term lower bound

  These are stored in the following state variables:

  o latest: bw_latest, inflight_latest
  o lo:     bw_lo,     inflight_lo
  o hi:     bw_hi[2],  inflight_hi

  To gain intuition about the meaning of the three operating points, it
  may help to consider the analogs in CUBIC, which has a somewhat
  analogous three-part model used by its probing state machine:

    BBR param   CUBIC param
    ---------   -----------
    latest    ~ cwnd
    lo        ~ ssthresh
    hi        ~ last_max_cwnd

  The analogy is only a loose one, though, since the BBR operating
  points are calculated differently, and are 2-dimensional (bw,
  inflight) rather than CUBIC's one-dimensional notion of operating
  point (inflight).

o It uses the three-part model to adapt the magnitude of its bandwidth
  probing to match the estimated space available in the buffer, rather
  than (as in BBR v1) assuming that it was always acceptable to place
  0.25*BDP in the bottleneck buffer when probing (commodity datacenter
  switches commonly do not have that much buffer for WAN flows). When
  BBR v3 estimates it hit a buffer limit during probing, its bandwidth
  probing then starts gently in case little space is still available in
  the buffer, and then accelerates, slowly at first and then rapidly if
  it can grow inflight without seeing congestion signals. In such
  cases, probing is bounded by inflight_hi + inflight_probe, where
  inflight_probe grows as: [0, 1, 2, 4, 8, 16, ...]. This allows BBR to
  keep losses low and bounded if a bottleneck remains congested, while
  rapidly/scalably utilizing free bandwidth when it becomes available.

o It has a slightly revised state machine, to achieve the goals above.
  BBR_BW_PROBE_UP:     pushes up inflight to probe for bw/vol
  BBR_BW_PROBE_DOWN:   drain excess inflight from the queue
  BBR_BW_PROBE_CRUISE: use pipe, w/ headroom in queue/pipe
  BBR_BW_PROBE_REFILL: try to refill the pipe again to 100%,
                       leaving queue empty

  (An illustrative sketch of this gain cycle appears after the notes
  below.)

o The estimated BDP: BBR v3 continues to maintain an estimate of the
  path's two-way propagation delay, by tracking a windowed min_rtt, and
  coordinating (on an as-needed basis) to try to expose the two-way
  propagation delay by draining the bottleneck queue. BBR v3 continues
  to use its min_rtt and (currently-applicable) bandwidth estimate to
  estimate the current bandwidth-delay product. The estimated BDP still
  provides one important guideline for bounding inflight data. However,
  because any min-filtered RTT and max-filtered bw inherently tend to
  both overestimate, the estimated BDP is often too high; in this case
  loss or ECN marks can ensue, in which case BBR v3 adjusts inflight_hi
  and inflight_lo to adapt its sending rate and inflight down to match
  the available capacity of the path.

o Space: Note that ICSK_CA_PRIV_SIZE increased. This is because BBR v3
  requires more space. Much of that space is due to support for
  per-socket parameterization and debug instrumentation in this
  release, for research and debugging. With that state removed, the
  full "struct bbr" is 140 bytes, or 144 with padding. This is an
  increase of 40 bytes over the existing ca_priv space.

o Code: BBR v3 reuses many pieces from BBR v1. But it omits the
  following significant pieces:

  o "packet conservation" (bbr_set_cwnd_to_recover_or_restore(),
    bbr_can_grow_inflight())
  o long-term bandwidth estimator ("policer mode")

  The code layout tries to keep BBR v3 code near the bottom of the
  file, so that v1-applicable code in the top does not accidentally
  refer to v3 code.
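To make the PROBE_BW cycle above concrete, here is an editor's sketch (not
part of the patch) of the per-phase pacing gains; the numbers mirror the
bbr_pacing_gain[] array added in the diff below, in BBR_UNIT (1 << 8) fixed
point, and the enum and function names are illustrative only.

/*
 * Editor's sketch (not part of the patch): pacing gain per PROBE_BW
 * phase, mirroring bbr_pacing_gain[] in the diff below. Gains are in
 * BBR_UNIT (1 << 8) fixed point; the names here are illustrative only.
 */
enum sketch_bw_probe_phase {
	SKETCH_PROBE_UP,	/* push inflight up to probe for bw      */
	SKETCH_PROBE_DOWN,	/* drain the queue built while probing   */
	SKETCH_PROBE_CRUISE,	/* use the pipe, leaving some headroom   */
	SKETCH_PROBE_REFILL,	/* refill the pipe to ~100% before UP    */
};

static u32 sketch_probe_bw_pacing_gain(enum sketch_bw_probe_phase phase)
{
	switch (phase) {
	case SKETCH_PROBE_UP:		return (1 << 8) * 5 / 4;	/* 1.25 */
	case SKETCH_PROBE_DOWN:		return (1 << 8) * 91 / 100;	/* 0.91 */
	case SKETCH_PROBE_CRUISE:	/* fall through */
	case SKETCH_PROBE_REFILL:	return 1 << 8;			/* 1.0  */
	}
	return 1 << 8;
}

A typical cycle runs REFILL -> UP -> DOWN -> CRUISE: REFILL restores inflight
to the estimated 100% of the pipe, UP probes above it, DOWN drains the
resulting queue, and CRUISE holds a little headroom until the next probe.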
o Docs: See the following docs for more details and diagrams decsribing the BBR v3 algorithm: https://datatracker.ietf.org/meeting/104/materials/slides-104-iccrg-an-update-on-bbr-00 https://datatracker.ietf.org/meeting/102/materials/slides-102-iccrg-an-update-on-bbr-work-at-google-00 o Internal notes: For this upstream rebase, Neal started from: git show fed518041ac6:net/ipv4/tcp_bbr.c > net/ipv4/tcp_bbr.c then removed dev instrumentation (dynamic get/set for parameters) and code that was only used by BBRv1 Effort: net-tcp_bbr Origin-9xx-SHA1: 2c84098e60bed6d67dde23cd7538c51dee273102 Change-Id: I125cf26ba2a7a686f2fa5e87f4c2afceb65f7a05 Signed-off-by: Juhyung Park --- include/net/inet_connection_sock.h | 4 +- include/net/tcp.h | 2 +- include/uapi/linux/inet_diag.h | 23 + net/ipv4/Kconfig | 21 +- net/ipv4/tcp_bbr.c | 2212 +++++++++++++++++++++------- 5 files changed, 1739 insertions(+), 523 deletions(-) diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index d8ff6aaa2e7d3..6747875ffd058 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -139,8 +139,8 @@ struct inet_connection_sock { ANDROID_KABI_RESERVE(1); - u64 icsk_ca_priv[104 / sizeof(u64)]; -#define ICSK_CA_PRIV_SIZE (13 * sizeof(u64)) +#define ICSK_CA_PRIV_SIZE (144) + u64 icsk_ca_priv[ICSK_CA_PRIV_SIZE / sizeof(u64)]; }; #define ICSK_TIME_RETRANS 1 /* Retransmit timer */ diff --git a/include/net/tcp.h b/include/net/tcp.h index be1dba13c8bf9..8915fd3556267 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2197,7 +2197,7 @@ struct tcp_plb_state { u8 consec_cong_rounds:5, /* consecutive congested rounds */ unused:3; u32 pause_until; /* jiffies32 when PLB can resume rerouting */ -}; +} __attribute__ ((__packed__)); static inline void tcp_plb_init(const struct sock *sk, struct tcp_plb_state *plb) diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index 20ee93f0f8761..06fd41ea1a8c5 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -229,6 +229,29 @@ struct tcp_bbr_info { __u32 bbr_min_rtt; /* min-filtered RTT in uSec */ __u32 bbr_pacing_gain; /* pacing gain shifted left 8 bits */ __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */ + __u32 bbr_bw_hi_lsb; /* lower 32 bits of bw_hi */ + __u32 bbr_bw_hi_msb; /* upper 32 bits of bw_hi */ + __u32 bbr_bw_lo_lsb; /* lower 32 bits of bw_lo */ + __u32 bbr_bw_lo_msb; /* upper 32 bits of bw_lo */ + __u8 bbr_mode; /* current bbr_mode in state machine */ + __u8 bbr_phase; /* current state machine phase */ + __u8 unused1; /* alignment padding; not used yet */ + __u8 bbr_version; /* BBR algorithm version */ + __u32 bbr_inflight_lo; /* lower short-term data volume bound */ + __u32 bbr_inflight_hi; /* higher long-term data volume bound */ + __u32 bbr_extra_acked; /* max excess packets ACKed in epoch */ +}; + +/* TCP BBR congestion control bbr_phase as reported in netlink/ss stats. */ +enum tcp_bbr_phase { + BBR_PHASE_INVALID = 0, + BBR_PHASE_STARTUP = 1, + BBR_PHASE_DRAIN = 2, + BBR_PHASE_PROBE_RTT = 3, + BBR_PHASE_PROBE_BW_UP = 4, + BBR_PHASE_PROBE_BW_DOWN = 5, + BBR_PHASE_PROBE_BW_CRUISE = 6, + BBR_PHASE_PROBE_BW_REFILL = 7, }; union tcp_cc_info { diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 23b06063e1a51..01f8c1c77eefe 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -669,15 +669,18 @@ config TCP_CONG_BBR default n help - BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to - maximize network utilization and minimize queues. 
It builds an explicit - model of the bottleneck delivery rate and path round-trip propagation - delay. It tolerates packet loss and delay unrelated to congestion. It - can operate over LAN, WAN, cellular, wifi, or cable modem links. It can - coexist with flows that use loss-based congestion control, and can - operate with shallow buffers, deep buffers, bufferbloat, policers, or - AQM schemes that do not provide a delay signal. It requires the fq - ("Fair Queue") pacing packet scheduler. + BBR (Bottleneck Bandwidth and RTT) TCP congestion control is a + model-based congestion control algorithm that aims to maximize + network utilization, keep queues and retransmit rates low, and to be + able to coexist with Reno/CUBIC in common scenarios. It builds an + explicit model of the network path. It tolerates a targeted degree + of random packet loss and delay. It can operate over LAN, WAN, + cellular, wifi, or cable modem links, and can use shallow-threshold + ECN signals. It can coexist to some degree with flows that use + loss-based congestion control, and can operate with shallow buffers, + deep buffers, bufferbloat, policers, or AQM schemes that do not + provide a delay signal. It requires pacing, using either TCP internal + pacing or the fq ("Fair Queue") pacing packet scheduler. choice prompt "Default TCP congestion control" diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 935ef3ccd342f..5feb473b1bd76 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -1,18 +1,19 @@ -/* Bottleneck Bandwidth and RTT (BBR) congestion control +/* BBR (Bottleneck Bandwidth and RTT) congestion control * - * BBR congestion control computes the sending rate based on the delivery - * rate (throughput) estimated from ACKs. In a nutshell: + * BBR is a model-based congestion control algorithm that aims for low queues, + * low loss, and (bounded) Reno/CUBIC coexistence. To maintain a model of the + * network path, it uses measurements of bandwidth and RTT, as well as (if they + * occur) packet loss and/or shallow-threshold ECN signals. Note that although + * it can use ECN or loss signals explicitly, it does not require either; it + * can bound its in-flight data based on its estimate of the BDP. * - * On each ACK, update our model of the network path: - * bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips) - * min_rtt = windowed_min(rtt, 10 seconds) - * pacing_rate = pacing_gain * bottleneck_bandwidth - * cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4) - * - * The core algorithm does not react directly to packet losses or delays, - * although BBR may adjust the size of next send per ACK when loss is - * observed, or adjust the sending rate if it estimates there is a - * traffic policer, in order to keep the drop rate reasonable. + * The model has both higher and lower bounds for the operating range: + * lo: bw_lo, inflight_lo: conservative short-term lower bound + * hi: bw_hi, inflight_hi: robust long-term upper bound + * The bandwidth-probing time scale is (a) extended dynamically based on + * estimated BDP to improve coexistence with Reno/CUBIC; (b) bounded by + * an interactive wall-clock time-scale to be more scalable and responsive + * than Reno and CUBIC. * * Here is a state transition diagram for BBR: * @@ -63,6 +64,13 @@ #include #include +#include +#include "tcp_dctcp.h" + +#define BBR_VERSION 3 + +#define bbr_param(sk,name) (bbr_ ## name) + /* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth * estimation. 
The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps. * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32. @@ -83,36 +91,41 @@ enum bbr_mode { BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */ }; +/* How does the incoming ACK stream relate to our bandwidth probing? */ +enum bbr_ack_phase { + BBR_ACKS_INIT, /* not probing; not getting probe feedback */ + BBR_ACKS_REFILLING, /* sending at est. bw to fill pipe */ + BBR_ACKS_PROBE_STARTING, /* inflight rising to probe bw */ + BBR_ACKS_PROBE_FEEDBACK, /* getting feedback from bw probing */ + BBR_ACKS_PROBE_STOPPING, /* stopped probing; still getting feedback */ +}; + /* BBR congestion control block */ struct bbr { u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */ u32 min_rtt_stamp; /* timestamp of min_rtt_us */ u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */ - struct minmax bw; /* Max recent delivery rate in pkts/uS << 24 */ - u32 rtt_cnt; /* count of packet-timed rounds elapsed */ + u32 probe_rtt_min_us; /* min RTT in probe_rtt_win_ms win */ + u32 probe_rtt_min_stamp; /* timestamp of probe_rtt_min_us*/ u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ u64 cycle_mstamp; /* time of this cycle phase start */ - u32 mode:3, /* current bbr_mode in state machine */ + u32 mode:2, /* current bbr_mode in state machine */ prev_ca_state:3, /* CA state on previous ACK */ - packet_conservation:1, /* use packet conservation? */ round_start:1, /* start of packet-timed tx->ack round? */ + ce_state:1, /* If most recent data has CE bit set */ + bw_probe_up_rounds:5, /* cwnd-limited rounds in PROBE_UP */ + try_fast_path:1, /* can we take fast path? */ idle_restart:1, /* restarting after idle? */ probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ - unused:13, - lt_is_sampling:1, /* taking long-term ("LT") samples now? */ - lt_rtt_cnt:7, /* round trips in long-term interval */ - lt_use_bw:1; /* use lt_bw as our bw estimate? */ - u32 lt_bw; /* LT est delivery rate in pkts/uS << 24 */ - u32 lt_last_delivered; /* LT intvl start: tp->delivered */ - u32 lt_last_stamp; /* LT intvl start: tp->delivered_mstamp */ - u32 lt_last_lost; /* LT intvl start: tp->lost */ + init_cwnd:7, /* initial cwnd */ + unused_1:10; u32 pacing_gain:10, /* current gain for setting pacing rate */ cwnd_gain:10, /* current gain for setting cwnd */ full_bw_reached:1, /* reached full bw in Startup? */ full_bw_cnt:2, /* number of rounds without large bw gains */ - cycle_idx:3, /* current index in pacing_gain cycle array */ + cycle_idx:2, /* current index in pacing_gain cycle array */ has_seen_rtt:1, /* have we seen an RTT sample yet? */ - unused_b:5; + unused_2:6; u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ u32 full_bw; /* recent bw, to estimate if pipe is full */ @@ -122,19 +135,67 @@ struct bbr { u32 ack_epoch_acked:20, /* packets (S)ACKed in sampling epoch */ extra_acked_win_rtts:5, /* age of extra_acked, in round trips */ extra_acked_win_idx:1, /* current index in extra_acked array */ - unused_c:6; + /* BBR v3 state: */ + full_bw_now:1, /* recently reached full bw plateau? */ + startup_ecn_rounds:2, /* consecutive hi ECN STARTUP rounds */ + loss_in_cycle:1, /* packet loss in this cycle? */ + ecn_in_cycle:1, /* ECN in this cycle? 
*/ + unused_3:1; + u32 loss_round_delivered; /* scb->tx.delivered ending loss round */ + u32 undo_bw_lo; /* bw_lo before latest losses */ + u32 undo_inflight_lo; /* inflight_lo before latest losses */ + u32 undo_inflight_hi; /* inflight_hi before latest losses */ + u32 bw_latest; /* max delivered bw in last round trip */ + u32 bw_lo; /* lower bound on sending bandwidth */ + u32 bw_hi[2]; /* max recent measured bw sample */ + u32 inflight_latest; /* max delivered data in last round trip */ + u32 inflight_lo; /* lower bound of inflight data range */ + u32 inflight_hi; /* upper bound of inflight data range */ + u32 bw_probe_up_cnt; /* packets delivered per inflight_hi incr */ + u32 bw_probe_up_acks; /* packets (S)ACKed since inflight_hi incr */ + u32 probe_wait_us; /* PROBE_DOWN until next clock-driven probe */ + u32 prior_rcv_nxt; /* tp->rcv_nxt when CE state last changed */ + u32 ecn_eligible:1, /* sender can use ECN (RTT, handshake)? */ + ecn_alpha:9, /* EWMA delivered_ce/delivered; 0..256 */ + bw_probe_samples:1, /* rate samples reflect bw probing? */ + prev_probe_too_high:1, /* did last PROBE_UP go too high? */ + stopped_risky_probe:1, /* last PROBE_UP stopped due to risk? */ + rounds_since_probe:8, /* packet-timed rounds since probed bw */ + loss_round_start:1, /* loss_round_delivered round trip? */ + loss_in_round:1, /* loss marked in this round trip? */ + ecn_in_round:1, /* ECN marked in this round trip? */ + ack_phase:3, /* bbr_ack_phase: meaning of ACKs */ + loss_events_in_round:4,/* losses in STARTUP round */ + initialized:1; /* has bbr_init() been called? */ + u32 alpha_last_delivered; /* tp->delivered at alpha update */ + u32 alpha_last_delivered_ce; /* tp->delivered_ce at alpha update */ + + u8 unused_4; /* to preserve alignment */ + struct tcp_plb_state plb; }; -#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ +struct bbr_context { + u32 sample_bw; +}; -/* Window length of bw filter (in rounds): */ -static const int bbr_bw_rtts = CYCLE_LEN + 2; /* Window length of min_rtt filter (in sec): */ static const u32 bbr_min_rtt_win_sec = 10; /* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */ static const u32 bbr_probe_rtt_mode_ms = 200; -/* Skip TSO below the following bandwidth (bits/sec): */ -static const int bbr_min_tso_rate = 1200000; +/* Window length of probe_rtt_min_us filter (in ms), and consequently the + * typical interval between PROBE_RTT mode entries. The default is 5000ms. + * Note that bbr_probe_rtt_win_ms must be <= bbr_min_rtt_win_sec * MSEC_PER_SEC + */ +static const u32 bbr_probe_rtt_win_ms = 5000; +/* Proportion of cwnd to estimated BDP in PROBE_RTT, in units of BBR_UNIT: */ +static const u32 bbr_probe_rtt_cwnd_gain = BBR_UNIT * 1 / 2; + +/* Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting + * in bigger TSO bursts. We cut the RTT-based allowance in half + * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance + * is below 1500 bytes after 6 * ~500 usec = 3ms. + */ +static const u32 bbr_tso_rtt_shift = 9; /* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. 
* In order to help drive the network toward lower queues and low latency while @@ -144,13 +205,15 @@ static const int bbr_min_tso_rate = 1200000; */ static const int bbr_pacing_margin_percent = 1; -/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain +/* We use a startup_pacing_gain of 4*ln(2) because it's the smallest value * that will allow a smoothly increasing pacing rate that will double each RTT * and send the same number of packets per RTT that an un-paced, slow-starting * Reno or CUBIC flow would: */ -static const int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; -/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain +static const int bbr_startup_pacing_gain = BBR_UNIT * 277 / 100 + 1; +/* The gain for deriving startup cwnd: */ +static const int bbr_startup_cwnd_gain = BBR_UNIT * 2; +/* The pacing gain in BBR_DRAIN is calculated to typically drain * the queue created in BBR_STARTUP in a single round: */ static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; @@ -158,13 +221,17 @@ static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; static const int bbr_cwnd_gain = BBR_UNIT * 2; /* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */ static const int bbr_pacing_gain[] = { - BBR_UNIT * 5 / 4, /* probe for more available bw */ - BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */ - BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */ - BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */ + BBR_UNIT * 5 / 4, /* UP: probe for more available bw */ + BBR_UNIT * 91 / 100, /* DOWN: drain queue and/or yield bw */ + BBR_UNIT, /* CRUISE: try to use pipe w/ some headroom */ + BBR_UNIT, /* REFILL: refill pipe to estimated 100% */ +}; +enum bbr_pacing_gain_phase { + BBR_BW_PROBE_UP = 0, /* push up inflight to probe for bw/vol */ + BBR_BW_PROBE_DOWN = 1, /* drain excess inflight from the queue */ + BBR_BW_PROBE_CRUISE = 2, /* use pipe, w/ headroom in queue/pipe */ + BBR_BW_PROBE_REFILL = 3, /* v2: refill the pipe again to 100% */ }; -/* Randomize the starting gain cycling phase over N phases: */ -static const u32 bbr_cycle_rand = 7; /* Try to keep at least this many packets in flight, if things go smoothly. For * smooth functioning, a sliding window protocol ACKing every other packet @@ -172,24 +239,12 @@ static const u32 bbr_cycle_rand = 7; */ static const u32 bbr_cwnd_min_target = 4; -/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */ +/* To estimate if BBR_STARTUP or BBR_BW_PROBE_UP has filled pipe... */ /* If bw has increased significantly (1.25x), there may be more bw available: */ static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4; /* But after 3 rounds w/o significant bw growth, estimate pipe is full: */ static const u32 bbr_full_bw_cnt = 3; -/* "long-term" ("LT") bandwidth estimator parameters... 
*/ -/* The minimum number of rounds in an LT bw sampling interval: */ -static const u32 bbr_lt_intvl_min_rtts = 4; -/* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */ -static const u32 bbr_lt_loss_thresh = 50; -/* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */ -static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8; -/* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */ -static const u32 bbr_lt_bw_diff = 4000 / 8; -/* If we estimate we're policed, use lt_bw for this many round trips: */ -static const u32 bbr_lt_bw_max_rtts = 48; - /* Gain factor for adding extra_acked to target cwnd: */ static const int bbr_extra_acked_gain = BBR_UNIT; /* Window length of extra_acked window. */ @@ -199,8 +254,121 @@ static const u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20; /* Time period for clamping cwnd increment due to ack aggregation */ static const u32 bbr_extra_acked_max_us = 100 * 1000; +/* Flags to control BBR ECN-related behavior... */ + +/* Ensure ACKs only ACK packets with consistent ECN CE status? */ +static const bool bbr_precise_ece_ack = true; + +/* Max RTT (in usec) at which to use sender-side ECN logic. + * Disabled when 0 (ECN allowed at any RTT). + */ +static const u32 bbr_ecn_max_rtt_us = 5000; + +/* On losses, scale down inflight and pacing rate by beta scaled by BBR_SCALE. + * No loss response when 0. + */ +static const u32 bbr_beta = BBR_UNIT * 30 / 100; + +/* Gain factor for ECN mark ratio samples, scaled by BBR_SCALE (1/16 = 6.25%) */ +static const u32 bbr_ecn_alpha_gain = BBR_UNIT * 1 / 16; + +/* The initial value for ecn_alpha; 1.0 allows a flow to respond quickly + * to congestion if the bottleneck is congested when the flow starts up. + */ +static const u32 bbr_ecn_alpha_init = BBR_UNIT; + +/* On ECN, cut inflight_lo to (1 - ecn_factor * ecn_alpha) scaled by BBR_SCALE. + * No ECN based bounding when 0. + */ +static const u32 bbr_ecn_factor = BBR_UNIT * 1 / 3; /* 1/3 = 33% */ + +/* Estimate bw probing has gone too far if CE ratio exceeds this threshold. + * Scaled by BBR_SCALE. Disabled when 0. + */ +static const u32 bbr_ecn_thresh = BBR_UNIT * 1 / 2; /* 1/2 = 50% */ + +/* If non-zero, if in a cycle with no losses but some ECN marks, after ECN + * clears then make the first round's increment to inflight_hi the following + * fraction of inflight_hi. + */ +static const u32 bbr_ecn_reprobe_gain = BBR_UNIT * 1 / 2; + +/* Estimate bw probing has gone too far if loss rate exceeds this level. */ +static const u32 bbr_loss_thresh = BBR_UNIT * 2 / 100; /* 2% loss */ + +/* Slow down for a packet loss recovered by TLP? */ +static const bool bbr_loss_probe_recovery = true; + +/* Exit STARTUP if number of loss marking events in a Recovery round is >= N, + * and loss rate is higher than bbr_loss_thresh. + * Disabled if 0. + */ +static const u32 bbr_full_loss_cnt = 6; + +/* Exit STARTUP if number of round trips with ECN mark rate above ecn_thresh + * meets this count. + */ +static const u32 bbr_full_ecn_cnt = 2; + +/* Fraction of unutilized headroom to try to leave in path upon high loss. */ +static const u32 bbr_inflight_headroom = BBR_UNIT * 15 / 100; + +/* How much do we increase cwnd_gain when probing for bandwidth in + * BBR_BW_PROBE_UP? This specifies the increment in units of + * BBR_UNIT/4. The default is 1, meaning 0.25. + * The min value is 0 (meaning 0.0); max is 3 (meaning 0.75). + */ +static const u32 bbr_bw_probe_cwnd_gain = 1; + +/* Max number of packet-timed rounds to wait before probing for bandwidth. 
If + * we want to tolerate 1% random loss per round, and not have this cut our + * inflight too much, we must probe for bw periodically on roughly this scale. + * If low, limits Reno/CUBIC coexistence; if high, limits loss tolerance. + * We aim to be fair with Reno/CUBIC up to a BDP of at least: + * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets + */ +static const u32 bbr_bw_probe_max_rounds = 63; + +/* Max amount of randomness to inject in round counting for Reno-coexistence. + */ +static const u32 bbr_bw_probe_rand_rounds = 2; + +/* Use BBR-native probe time scale starting at this many usec. + * We aim to be fair with Reno/CUBIC up to an inter-loss time epoch of at least: + * BDP*RTT = 25Mbps * .030sec /(1514bytes) * 0.030sec = 1.9 secs + */ +static const u32 bbr_bw_probe_base_us = 2 * USEC_PER_SEC; /* 2 secs */ + +/* Use BBR-native probes spread over this many usec: */ +static const u32 bbr_bw_probe_rand_us = 1 * USEC_PER_SEC; /* 1 secs */ + +/* Use fast path if app-limited, no loss/ECN, and target cwnd was reached? */ +static const bool bbr_fast_path = true; + +/* Use fast ack mode? */ +static const bool bbr_fast_ack_mode = true; + +static u32 bbr_max_bw(const struct sock *sk); +static u32 bbr_bw(const struct sock *sk); +static void bbr_exit_probe_rtt(struct sock *sk); +static void bbr_reset_congestion_signals(struct sock *sk); +static void bbr_run_loss_probe_recovery(struct sock *sk); + static void bbr_check_probe_rtt_done(struct sock *sk); +/* This connection can use ECN if both endpoints have signaled ECN support in + * the handshake and the per-route settings indicated this is a + * shallow-threshold ECN environment, meaning both: + * (a) ECN CE marks indicate low-latency/shallow-threshold congestion, and + * (b) TCP endpoints provide precise ACKs that only ACK data segments + * with consistent ECN CE status + */ +static bool bbr_can_use_ecn(const struct sock *sk) +{ + return (tcp_sk(sk)->ecn_flags & TCP_ECN_OK) && + (tcp_sk(sk)->ecn_flags & TCP_ECN_LOW); +} + /* Do we estimate that STARTUP filled the pipe? */ static bool bbr_full_bw_reached(const struct sock *sk) { @@ -212,17 +380,17 @@ static bool bbr_full_bw_reached(const struct sock *sk) /* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ static u32 bbr_max_bw(const struct sock *sk) { - struct bbr *bbr = inet_csk_ca(sk); + const struct bbr *bbr = inet_csk_ca(sk); - return minmax_get(&bbr->bw); + return max(bbr->bw_hi[0], bbr->bw_hi[1]); } /* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */ static u32 bbr_bw(const struct sock *sk) { - struct bbr *bbr = inet_csk_ca(sk); + const struct bbr *bbr = inet_csk_ca(sk); - return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk); + return min(bbr_max_bw(sk), bbr->bw_lo); } /* Return maximum extra acked in past k-2k round trips, @@ -239,15 +407,23 @@ static u16 bbr_extra_acked(const struct sock *sk) * The order here is chosen carefully to avoid overflow of u64. This should * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. 
*/ -static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) +static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain, + int margin) { unsigned int mss = tcp_sk(sk)->mss_cache; rate *= mss; rate *= gain; rate >>= BBR_SCALE; - rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent); - return rate >> BW_SCALE; + rate *= USEC_PER_SEC / 100 * (100 - margin); + rate >>= BW_SCALE; + rate = max(rate, 1ULL); + return rate; +} + +static u64 bbr_bw_bytes_per_sec(struct sock *sk, u64 rate) +{ + return bbr_rate_bytes_per_sec(sk, rate, BBR_UNIT, 0); } /* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */ @@ -255,12 +431,13 @@ static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) { u64 rate = bw; - rate = bbr_rate_bytes_per_sec(sk, rate, gain); + rate = bbr_rate_bytes_per_sec(sk, rate, gain, + bbr_pacing_margin_percent); rate = min_t(u64, rate, sk->sk_max_pacing_rate); return rate; } -/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */ +/* Initialize pacing rate to: startup_pacing_gain * init_cwnd / RTT. */ static void bbr_init_pacing_rate_from_rtt(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -276,7 +453,8 @@ static void bbr_init_pacing_rate_from_rtt(struct sock *sk) } bw = (u64)tcp_snd_cwnd(tp) * BW_UNIT; do_div(bw, rtt_us); - sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain); + sk->sk_pacing_rate = + bbr_bw_to_pacing_rate(sk, bw, bbr_param(sk, startup_pacing_gain)); } /* Pace using current bw estimate and a gain factor. */ @@ -292,31 +470,38 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) sk->sk_pacing_rate = rate; } -/* override sysctl_tcp_min_tso_segs */ -static u32 bbr_min_tso_segs(struct sock *sk) -{ - return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2; -} - -/* Return the number of segments BBR would like in a TSO/GSO skb, given - * a particular max gso size as a constraint. +/* Return the number of segments BBR would like in a TSO/GSO skb, given a + * particular max gso size as a constraint. TODO: make this simpler and more + * consistent by switching bbr to just call tcp_tso_autosize(). */ static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now, u32 gso_max_size) { - u32 segs; + struct bbr *bbr = inet_csk_ca(sk); + u32 segs, r; u64 bytes; /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */ bytes = sk->sk_pacing_rate >> sk->sk_pacing_shift; + /* Budget a TSO/GSO burst size allowance based on min_rtt. For every + * K = 2^tso_rtt_shift microseconds of min_rtt, halve the burst. + * The min_rtt-based burst allowance is: 64 KBytes / 2^(min_rtt/K) + */ + if (bbr_param(sk, tso_rtt_shift)) { + r = bbr->min_rtt_us >> bbr_param(sk, tso_rtt_shift); + if (r < BITS_PER_TYPE(u32)) /* prevent undefined behavior */ + bytes += GSO_MAX_SIZE >> r; + } + bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER); - segs = max_t(u32, bytes / mss_now, bbr_min_tso_segs(sk)); + segs = max_t(u32, bytes / mss_now, + sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); return segs; } /* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. 
*/ -static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now) +static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now) { return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size); } @@ -346,7 +531,9 @@ static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) struct tcp_sock *tp = tcp_sk(sk); struct bbr *bbr = inet_csk_ca(sk); - if (event == CA_EVENT_TX_START && tp->app_limited) { + if (event == CA_EVENT_TX_START) { + if (!tp->app_limited) + return; bbr->idle_restart = 1; bbr->ack_epoch_mstamp = tp->tcp_mstamp; bbr->ack_epoch_acked = 0; @@ -357,6 +544,16 @@ static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); else if (bbr->mode == BBR_PROBE_RTT) bbr_check_probe_rtt_done(sk); + } else if ((event == CA_EVENT_ECN_IS_CE || + event == CA_EVENT_ECN_NO_CE) && + bbr_can_use_ecn(sk) && + bbr_param(sk, precise_ece_ack)) { + u32 state = bbr->ce_state; + dctcp_ece_ack_update(sk, event, &bbr->prior_rcv_nxt, &state); + bbr->ce_state = state; + } else if (event == CA_EVENT_TLP_RECOVERY && + bbr_param(sk, loss_probe_recovery)) { + bbr_run_loss_probe_recovery(sk); } } @@ -379,10 +576,10 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) * default. This should only happen when the connection is not using TCP * timestamps and has retransmitted all of the SYN/SYNACK/data packets * ACKed so far. In this case, an RTO can cut cwnd to 1, in which - * case we need to slow-start up toward something safe: TCP_INIT_CWND. + * case we need to slow-start up toward something safe: initial cwnd. */ if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */ - return TCP_INIT_CWND; /* be safe: cap at default initial cwnd*/ + return bbr->init_cwnd; /* be safe: cap at initial cwnd */ w = (u64)bw * bbr->min_rtt_us; @@ -399,23 +596,23 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) * - one skb in sending host Qdisc, * - one skb in sending host TSO/GSO engine * - one skb being received by receiver host LRO/GRO/delayed-ACK engine - * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because - * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, + * Don't worry, at low rates this won't bloat cwnd because + * in such cases tso_segs_goal is small. The minimum cwnd is 4 packets, * which allows 2 outstanding 2-packet sequences, to try to keep pipe * full even with ACK-every-other-packet delayed ACKs. */ static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd) { struct bbr *bbr = inet_csk_ca(sk); + u32 tso_segs_goal; - /* Allow enough full-sized skbs in flight to utilize end systems. */ - cwnd += 3 * bbr_tso_segs_goal(sk); - - /* Reduce delayed ACKs by rounding up cwnd to the next even number. */ - cwnd = (cwnd + 1) & ~1U; + tso_segs_goal = 3 * bbr_tso_segs_goal(sk); + /* Allow enough full-sized skbs in flight to utilize end systems. */ + cwnd = max_t(u32, cwnd, tso_segs_goal); + cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target)); /* Ensure gain cycling gets inflight above BDP even for small BDPs. 
*/ - if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == 0) + if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) cwnd += 2; return cwnd; @@ -470,10 +667,10 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk) { u32 max_aggr_cwnd, aggr_cwnd = 0; - if (bbr_extra_acked_gain && bbr_full_bw_reached(sk)) { + if (bbr_param(sk, extra_acked_gain)) { max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us) / BW_UNIT; - aggr_cwnd = (bbr_extra_acked_gain * bbr_extra_acked(sk)) + aggr_cwnd = (bbr_param(sk, extra_acked_gain) * bbr_extra_acked(sk)) >> BBR_SCALE; aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd); } @@ -481,66 +678,27 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk) return aggr_cwnd; } -/* An optimization in BBR to reduce losses: On the first round of recovery, we - * follow the packet conservation principle: send P packets per P packets acked. - * After that, we slow-start and send at most 2*P packets per P packets acked. - * After recovery finishes, or upon undo, we restore the cwnd we had when - * recovery started (capped by the target cwnd based on estimated BDP). - * - * TODO(ycheng/ncardwell): implement a rate-based approach. - */ -static bool bbr_set_cwnd_to_recover_or_restore( - struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd) +/* Returns the cwnd for PROBE_RTT mode. */ +static u32 bbr_probe_rtt_cwnd(struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); - struct bbr *bbr = inet_csk_ca(sk); - u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state; - u32 cwnd = tcp_snd_cwnd(tp); - - /* An ACK for P pkts should release at most 2*P packets. We do this - * in two steps. First, here we deduct the number of lost packets. - * Then, in bbr_set_cwnd() we slow start up toward the target cwnd. - */ - if (rs->losses > 0) - cwnd = max_t(s32, cwnd - rs->losses, 1); - - if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) { - /* Starting 1st round of Recovery, so do packet conservation. */ - bbr->packet_conservation = 1; - bbr->next_rtt_delivered = tp->delivered; /* start round now */ - /* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */ - cwnd = tcp_packets_in_flight(tp) + acked; - } else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) { - /* Exiting loss recovery; restore cwnd saved before recovery. */ - cwnd = max(cwnd, bbr->prior_cwnd); - bbr->packet_conservation = 0; - } - bbr->prev_ca_state = state; - - if (bbr->packet_conservation) { - *new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked); - return true; /* yes, using packet conservation */ - } - *new_cwnd = cwnd; - return false; + return max_t(u32, bbr_param(sk, cwnd_min_target), + bbr_bdp(sk, bbr_bw(sk), bbr_param(sk, probe_rtt_cwnd_gain))); } /* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss * has drawn us down below target), or snap down to target if we're above it. 
*/ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, - u32 acked, u32 bw, int gain) + u32 acked, u32 bw, int gain, u32 cwnd, + struct bbr_context *ctx) { struct tcp_sock *tp = tcp_sk(sk); struct bbr *bbr = inet_csk_ca(sk); - u32 cwnd = tcp_snd_cwnd(tp), target_cwnd = 0; + u32 target_cwnd = 0; if (!acked) goto done; /* no packet fully ACKed; just apply caps */ - if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd)) - goto done; - target_cwnd = bbr_bdp(sk, bw, gain); /* Increment the cwnd to account for excess ACKed data that seems @@ -549,74 +707,26 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, target_cwnd += bbr_ack_aggregation_cwnd(sk); target_cwnd = bbr_quantization_budget(sk, target_cwnd); - /* If we're below target cwnd, slow start cwnd toward target cwnd. */ - if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */ - cwnd = min(cwnd + acked, target_cwnd); - else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND) - cwnd = cwnd + acked; - cwnd = max(cwnd, bbr_cwnd_min_target); + /* Update cwnd and enable fast path if cwnd reaches target_cwnd. */ + bbr->try_fast_path = 0; + if (bbr_full_bw_reached(sk)) { /* only cut cwnd if we filled the pipe */ + cwnd += acked; + if (cwnd >= target_cwnd) { + cwnd = target_cwnd; + bbr->try_fast_path = 1; + } + } else if (cwnd < target_cwnd || cwnd < 2 * bbr->init_cwnd) { + cwnd += acked; + } else { + bbr->try_fast_path = 1; + } + cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target)); done: - tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* apply global cap */ + tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* global cap */ if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */ - tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), bbr_cwnd_min_target)); -} - -/* End cycle phase if it's time and/or we hit the phase's in-flight target. */ -static bool bbr_is_next_cycle_phase(struct sock *sk, - const struct rate_sample *rs) -{ - struct tcp_sock *tp = tcp_sk(sk); - struct bbr *bbr = inet_csk_ca(sk); - bool is_full_length = - tcp_stamp_us_delta(tp->delivered_mstamp, bbr->cycle_mstamp) > - bbr->min_rtt_us; - u32 inflight, bw; - - /* The pacing_gain of 1.0 paces at the estimated bw to try to fully - * use the pipe without increasing the queue. - */ - if (bbr->pacing_gain == BBR_UNIT) - return is_full_length; /* just use wall clock time */ - - inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); - bw = bbr_max_bw(sk); - - /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at - * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is - * small (e.g. on a LAN). We do not persist if packets are lost, since - * a path with small buffers may not hold that much. - */ - if (bbr->pacing_gain > BBR_UNIT) - return is_full_length && - (rs->losses || /* perhaps pacing_gain*BDP won't fit */ - inflight >= bbr_inflight(sk, bw, bbr->pacing_gain)); - - /* A pacing_gain < 1.0 tries to drain extra queue we added if bw - * probing didn't find more bw. If inflight falls to match BDP then we - * estimate queue is drained; persisting would underutilize the pipe. 
- */ - return is_full_length || - inflight <= bbr_inflight(sk, bw, BBR_UNIT); -} - -static void bbr_advance_cycle_phase(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - struct bbr *bbr = inet_csk_ca(sk); - - bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1); - bbr->cycle_mstamp = tp->delivered_mstamp; -} - -/* Gain cycling: cycle pacing gain to converge to fair share of available bw. */ -static void bbr_update_cycle_phase(struct sock *sk, - const struct rate_sample *rs) -{ - struct bbr *bbr = inet_csk_ca(sk); - - if (bbr->mode == BBR_PROBE_BW && bbr_is_next_cycle_phase(sk, rs)) - bbr_advance_cycle_phase(sk); + tcp_snd_cwnd_set(tp, min_t(u32, tcp_snd_cwnd(tp), + bbr_probe_rtt_cwnd(sk))); } static void bbr_reset_startup_mode(struct sock *sk) @@ -626,191 +736,49 @@ static void bbr_reset_startup_mode(struct sock *sk) bbr->mode = BBR_STARTUP; } -static void bbr_reset_probe_bw_mode(struct sock *sk) -{ - struct bbr *bbr = inet_csk_ca(sk); - - bbr->mode = BBR_PROBE_BW; - bbr->cycle_idx = CYCLE_LEN - 1 - prandom_u32_max(bbr_cycle_rand); - bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */ -} - -static void bbr_reset_mode(struct sock *sk) -{ - if (!bbr_full_bw_reached(sk)) - bbr_reset_startup_mode(sk); - else - bbr_reset_probe_bw_mode(sk); -} - -/* Start a new long-term sampling interval. */ -static void bbr_reset_lt_bw_sampling_interval(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - struct bbr *bbr = inet_csk_ca(sk); - - bbr->lt_last_stamp = div_u64(tp->delivered_mstamp, USEC_PER_MSEC); - bbr->lt_last_delivered = tp->delivered; - bbr->lt_last_lost = tp->lost; - bbr->lt_rtt_cnt = 0; -} - -/* Completely reset long-term bandwidth sampling. */ -static void bbr_reset_lt_bw_sampling(struct sock *sk) -{ - struct bbr *bbr = inet_csk_ca(sk); - - bbr->lt_bw = 0; - bbr->lt_use_bw = 0; - bbr->lt_is_sampling = false; - bbr_reset_lt_bw_sampling_interval(sk); -} - -/* Long-term bw sampling interval is done. Estimate whether we're policed. */ -static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw) -{ - struct bbr *bbr = inet_csk_ca(sk); - u32 diff; - - if (bbr->lt_bw) { /* do we have bw from a previous interval? */ - /* Is new bw close to the lt_bw from the previous interval? */ - diff = abs(bw - bbr->lt_bw); - if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) || - (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <= - bbr_lt_bw_diff)) { - /* All criteria are met; estimate we're policed. */ - bbr->lt_bw = (bw + bbr->lt_bw) >> 1; /* avg 2 intvls */ - bbr->lt_use_bw = 1; - bbr->pacing_gain = BBR_UNIT; /* try to avoid drops */ - bbr->lt_rtt_cnt = 0; - return; - } - } - bbr->lt_bw = bw; - bbr_reset_lt_bw_sampling_interval(sk); -} - -/* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of - * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and - * explicitly models their policed rate, to reduce unnecessary losses. We - * estimate that we're policed if we see 2 consecutive sampling intervals with - * consistent throughput and high packet loss. If we think we're being policed, - * set lt_bw to the "long-term" average delivery rate from those 2 intervals. +/* See if we have reached next round trip. Upon start of the new round, + * returns packets delivered since previous round start plus this ACK. 
*/ -static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs) -{ - struct tcp_sock *tp = tcp_sk(sk); - struct bbr *bbr = inet_csk_ca(sk); - u32 lost, delivered; - u64 bw; - u32 t; - - if (bbr->lt_use_bw) { /* already using long-term rate, lt_bw? */ - if (bbr->mode == BBR_PROBE_BW && bbr->round_start && - ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) { - bbr_reset_lt_bw_sampling(sk); /* stop using lt_bw */ - bbr_reset_probe_bw_mode(sk); /* restart gain cycling */ - } - return; - } - - /* Wait for the first loss before sampling, to let the policer exhaust - * its tokens and estimate the steady-state rate allowed by the policer. - * Starting samples earlier includes bursts that over-estimate the bw. - */ - if (!bbr->lt_is_sampling) { - if (!rs->losses) - return; - bbr_reset_lt_bw_sampling_interval(sk); - bbr->lt_is_sampling = true; - } - - /* To avoid underestimates, reset sampling if we run out of data. */ - if (rs->is_app_limited) { - bbr_reset_lt_bw_sampling(sk); - return; - } - - if (bbr->round_start) - bbr->lt_rtt_cnt++; /* count round trips in this interval */ - if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts) - return; /* sampling interval needs to be longer */ - if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) { - bbr_reset_lt_bw_sampling(sk); /* interval is too long */ - return; - } - - /* End sampling interval when a packet is lost, so we estimate the - * policer tokens were exhausted. Stopping the sampling before the - * tokens are exhausted under-estimates the policed rate. - */ - if (!rs->losses) - return; - - /* Calculate packets lost and delivered in sampling interval. */ - lost = tp->lost - bbr->lt_last_lost; - delivered = tp->delivered - bbr->lt_last_delivered; - /* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */ - if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered) - return; - - /* Find average delivery rate in this sampling interval. */ - t = div_u64(tp->delivered_mstamp, USEC_PER_MSEC) - bbr->lt_last_stamp; - if ((s32)t < 1) - return; /* interval is less than one ms, so wait */ - /* Check if can multiply without overflow */ - if (t >= ~0U / USEC_PER_MSEC) { - bbr_reset_lt_bw_sampling(sk); /* interval too long; reset */ - return; - } - t *= USEC_PER_MSEC; - bw = (u64)delivered * BW_UNIT; - do_div(bw, t); - bbr_lt_bw_interval_done(sk, bw); -} - -/* Estimate the bandwidth based on how fast packets are delivered */ -static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) +static u32 bbr_update_round_start(struct sock *sk, + const struct rate_sample *rs, struct bbr_context *ctx) { struct tcp_sock *tp = tcp_sk(sk); struct bbr *bbr = inet_csk_ca(sk); - u64 bw; + u32 round_delivered = 0; bbr->round_start = 0; - if (rs->delivered < 0 || rs->interval_us <= 0) - return; /* Not a valid observation */ /* See if we've reached the next RTT */ - if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) { + if (rs->interval_us > 0 && + !before(rs->prior_delivered, bbr->next_rtt_delivered)) { + round_delivered = tp->delivered - bbr->next_rtt_delivered; bbr->next_rtt_delivered = tp->delivered; - bbr->rtt_cnt++; bbr->round_start = 1; - bbr->packet_conservation = 0; } + return round_delivered; +} - bbr_lt_bw_sampling(sk, rs); +/* Calculate the bandwidth based on how fast packets are delivered */ +static void bbr_calculate_bw_sample(struct sock *sk, + const struct rate_sample *rs, struct bbr_context *ctx) +{ + u64 bw = 0; /* Divide delivered by the interval to find a (lower bound) bottleneck * bandwidth sample. 
Delivered is in packets and interval_us in uS and * ratio will be <<1 for most connections. So delivered is first scaled. + * Round up to allow growth at low rates, even with integer division. */ - bw = div64_long((u64)rs->delivered * BW_UNIT, rs->interval_us); - - /* If this sample is application-limited, it is likely to have a very - * low delivered count that represents application behavior rather than - * the available network rate. Such a sample could drag down estimated - * bw, causing needless slow-down. Thus, to continue to send at the - * last measured network rate, we filter out app-limited samples unless - * they describe the path bw at least as well as our bw model. - * - * So the goal during app-limited phase is to proceed with the best - * network rate no matter how long. We automatically leave this - * phase when app writes faster than the network can deliver :) - */ - if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) { - /* Incorporate new sample into our max bw filter. */ - minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw); + if (rs->interval_us > 0) { + if (WARN_ONCE(rs->delivered < 0, + "negative delivered: %d interval_us: %ld\n", + rs->delivered, rs->interval_us)) + return; + + bw = DIV_ROUND_UP_ULL((u64)rs->delivered * BW_UNIT, rs->interval_us); } + + ctx->sample_bw = bw; } /* Estimates the windowed max degree of ack aggregation. @@ -824,7 +792,7 @@ static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) * * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms). * Max filter is an approximate sliding window of 5-10 (packet timed) round - * trips. + * trips for non-startup phase, and 1-2 round trips for startup. */ static void bbr_update_ack_aggregation(struct sock *sk, const struct rate_sample *rs) @@ -832,15 +800,19 @@ static void bbr_update_ack_aggregation(struct sock *sk, u32 epoch_us, expected_acked, extra_acked; struct bbr *bbr = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); + u32 extra_acked_win_rtts_thresh = bbr_param(sk, extra_acked_win_rtts); - if (!bbr_extra_acked_gain || rs->acked_sacked <= 0 || + if (!bbr_param(sk, extra_acked_gain) || rs->acked_sacked <= 0 || rs->delivered < 0 || rs->interval_us <= 0) return; if (bbr->round_start) { bbr->extra_acked_win_rtts = min(0x1F, bbr->extra_acked_win_rtts + 1); - if (bbr->extra_acked_win_rtts >= bbr_extra_acked_win_rtts) { + if (!bbr_full_bw_reached(sk)) + extra_acked_win_rtts_thresh = 1; + if (bbr->extra_acked_win_rtts >= + extra_acked_win_rtts_thresh) { bbr->extra_acked_win_rtts = 0; bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ? 0 : 1; @@ -874,49 +846,6 @@ static void bbr_update_ack_aggregation(struct sock *sk, bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked; } -/* Estimate when the pipe is full, using the change in delivery rate: BBR - * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by - * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited - * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the - * higher rwin, 3: we get higher delivery rate samples. Or transient - * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar - * design goal, but uses delay and inter-ACK spacing instead of bandwidth. 
- */ -static void bbr_check_full_bw_reached(struct sock *sk, - const struct rate_sample *rs) -{ - struct bbr *bbr = inet_csk_ca(sk); - u32 bw_thresh; - - if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited) - return; - - bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE; - if (bbr_max_bw(sk) >= bw_thresh) { - bbr->full_bw = bbr_max_bw(sk); - bbr->full_bw_cnt = 0; - return; - } - ++bbr->full_bw_cnt; - bbr->full_bw_reached = bbr->full_bw_cnt >= bbr_full_bw_cnt; -} - -/* If pipe is probably full, drain the queue and then enter steady-state. */ -static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs) -{ - struct bbr *bbr = inet_csk_ca(sk); - - if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { - bbr->mode = BBR_DRAIN; /* drain queue we created */ - tcp_sk(sk)->snd_ssthresh = - bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); - } /* fall through to check if in-flight is already small: */ - if (bbr->mode == BBR_DRAIN && - bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= - bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) - bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */ -} - static void bbr_check_probe_rtt_done(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -926,9 +855,9 @@ static void bbr_check_probe_rtt_done(struct sock *sk) after(tcp_jiffies32, bbr->probe_rtt_done_stamp))) return; - bbr->min_rtt_stamp = tcp_jiffies32; /* wait a while until PROBE_RTT */ + bbr->probe_rtt_min_stamp = tcp_jiffies32; /* schedule next PROBE_RTT */ tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd)); - bbr_reset_mode(sk); + bbr_exit_probe_rtt(sk); } /* The goal of PROBE_RTT mode is to have BBR flows cooperatively and @@ -954,23 +883,35 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) { struct tcp_sock *tp = tcp_sk(sk); struct bbr *bbr = inet_csk_ca(sk); - bool filter_expired; + bool probe_rtt_expired, min_rtt_expired; + u32 expire; - /* Track min RTT seen in the min_rtt_win_sec filter window: */ - filter_expired = after(tcp_jiffies32, - bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ); + /* Track min RTT in probe_rtt_win_ms to time next PROBE_RTT state. 
*/ + expire = bbr->probe_rtt_min_stamp + + msecs_to_jiffies(bbr_param(sk, probe_rtt_win_ms)); + probe_rtt_expired = after(tcp_jiffies32, expire); if (rs->rtt_us >= 0 && - (rs->rtt_us < bbr->min_rtt_us || - (filter_expired && !rs->is_ack_delayed))) { - bbr->min_rtt_us = rs->rtt_us; - bbr->min_rtt_stamp = tcp_jiffies32; + (rs->rtt_us < bbr->probe_rtt_min_us || + (probe_rtt_expired && !rs->is_ack_delayed))) { + bbr->probe_rtt_min_us = rs->rtt_us; + bbr->probe_rtt_min_stamp = tcp_jiffies32; + } + /* Track min RTT seen in the min_rtt_win_sec filter window: */ + expire = bbr->min_rtt_stamp + bbr_param(sk, min_rtt_win_sec) * HZ; + min_rtt_expired = after(tcp_jiffies32, expire); + if (bbr->probe_rtt_min_us <= bbr->min_rtt_us || + min_rtt_expired) { + bbr->min_rtt_us = bbr->probe_rtt_min_us; + bbr->min_rtt_stamp = bbr->probe_rtt_min_stamp; } - if (bbr_probe_rtt_mode_ms > 0 && filter_expired && + if (bbr_param(sk, probe_rtt_mode_ms) > 0 && probe_rtt_expired && !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ bbr_save_cwnd(sk); /* note cwnd so we can restore it */ bbr->probe_rtt_done_stamp = 0; + bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; + bbr->next_rtt_delivered = tp->delivered; } if (bbr->mode == BBR_PROBE_RTT) { @@ -979,9 +920,9 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; /* Maintain min packets in flight for max(200 ms, 1 round). */ if (!bbr->probe_rtt_done_stamp && - tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) { + tcp_packets_in_flight(tp) <= bbr_probe_rtt_cwnd(sk)) { bbr->probe_rtt_done_stamp = tcp_jiffies32 + - msecs_to_jiffies(bbr_probe_rtt_mode_ms); + msecs_to_jiffies(bbr_param(sk, probe_rtt_mode_ms)); bbr->probe_rtt_round_done = 0; bbr->next_rtt_delivered = tp->delivered; } else if (bbr->probe_rtt_done_stamp) { @@ -1002,18 +943,20 @@ static void bbr_update_gains(struct sock *sk) switch (bbr->mode) { case BBR_STARTUP: - bbr->pacing_gain = bbr_high_gain; - bbr->cwnd_gain = bbr_high_gain; + bbr->pacing_gain = bbr_param(sk, startup_pacing_gain); + bbr->cwnd_gain = bbr_param(sk, startup_cwnd_gain); break; case BBR_DRAIN: - bbr->pacing_gain = bbr_drain_gain; /* slow, to drain */ - bbr->cwnd_gain = bbr_high_gain; /* keep cwnd */ + bbr->pacing_gain = bbr_param(sk, drain_gain); /* slow, to drain */ + bbr->cwnd_gain = bbr_param(sk, startup_cwnd_gain); /* keep cwnd */ break; case BBR_PROBE_BW: - bbr->pacing_gain = (bbr->lt_use_bw ? - BBR_UNIT : - bbr_pacing_gain[bbr->cycle_idx]); - bbr->cwnd_gain = bbr_cwnd_gain; + bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx]; + bbr->cwnd_gain = bbr_param(sk, cwnd_gain); + if (bbr_param(sk, bw_probe_cwnd_gain) && + bbr->cycle_idx == BBR_BW_PROBE_UP) + bbr->cwnd_gain += + BBR_UNIT * bbr_param(sk, bw_probe_cwnd_gain) / 4; break; case BBR_PROBE_RTT: bbr->pacing_gain = BBR_UNIT; @@ -1025,140 +968,1380 @@ static void bbr_update_gains(struct sock *sk) } } -static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) +static u32 bbr_sndbuf_expand(struct sock *sk) { - bbr_update_bw(sk, rs); - bbr_update_ack_aggregation(sk, rs); - bbr_update_cycle_phase(sk, rs); - bbr_check_full_bw_reached(sk, rs); - bbr_check_drain(sk, rs); - bbr_update_min_rtt(sk, rs); - bbr_update_gains(sk); + /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ + return 3; } -static void bbr_main(struct sock *sk, const struct rate_sample *rs) +/* Incorporate a new bw sample into the current window of our max filter. 
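+ * bw_hi[1] holds the max bw sample seen in the current window;
+ * bbr_advance_max_bw_filter() below rotates it into bw_hi[0] once per
+ * PROBE_BW cycle, so the max-bw estimate covers the best samples from
+ * the last 1-2 probing cycles.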
*/ +static void bbr_take_max_bw_sample(struct sock *sk, u32 bw) { struct bbr *bbr = inet_csk_ca(sk); - u32 bw; - - bbr_update_model(sk, rs); - bw = bbr_bw(sk); - bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); - bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain); + bbr->bw_hi[1] = max(bw, bbr->bw_hi[1]); } -static void bbr_init(struct sock *sk) +/* Keep max of last 1-2 cycles. Each PROBE_BW cycle, flip filter window. */ +static void bbr_advance_max_bw_filter(struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); struct bbr *bbr = inet_csk_ca(sk); - bbr->prior_cwnd = 0; - tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; - bbr->rtt_cnt = 0; - bbr->next_rtt_delivered = tp->delivered; - bbr->prev_ca_state = TCP_CA_Open; - bbr->packet_conservation = 0; - - bbr->probe_rtt_done_stamp = 0; - bbr->probe_rtt_round_done = 0; - bbr->min_rtt_us = tcp_min_rtt(tp); - bbr->min_rtt_stamp = tcp_jiffies32; - - minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */ + if (!bbr->bw_hi[1]) + return; /* no samples in this window; remember old window */ + bbr->bw_hi[0] = bbr->bw_hi[1]; + bbr->bw_hi[1] = 0; +} - bbr->has_seen_rtt = 0; - bbr_init_pacing_rate_from_rtt(sk); +/* Reset the estimator for reaching full bandwidth based on bw plateau. */ +static void bbr_reset_full_bw(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); - bbr->round_start = 0; - bbr->idle_restart = 0; - bbr->full_bw_reached = 0; bbr->full_bw = 0; bbr->full_bw_cnt = 0; - bbr->cycle_mstamp = 0; - bbr->cycle_idx = 0; - bbr_reset_lt_bw_sampling(sk); - bbr_reset_startup_mode(sk); - - bbr->ack_epoch_mstamp = tp->tcp_mstamp; - bbr->ack_epoch_acked = 0; - bbr->extra_acked_win_rtts = 0; - bbr->extra_acked_win_idx = 0; - bbr->extra_acked[0] = 0; - bbr->extra_acked[1] = 0; - - cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); + bbr->full_bw_now = 0; } -static u32 bbr_sndbuf_expand(struct sock *sk) +/* How much do we want in flight? Our BDP, unless congestion cut cwnd. */ +static u32 bbr_target_inflight(struct sock *sk) { - /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ - return 3; + u32 bdp = bbr_inflight(sk, bbr_bw(sk), BBR_UNIT); + + return min(bdp, tcp_sk(sk)->snd_cwnd); } -/* In theory BBR does not need to undo the cwnd since it does not - * always reduce cwnd on losses (see bbr_main()). Keep it for now. - */ -static u32 bbr_undo_cwnd(struct sock *sk) +static bool bbr_is_probing_bandwidth(struct sock *sk) { struct bbr *bbr = inet_csk_ca(sk); - bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */ - bbr->full_bw_cnt = 0; - bbr_reset_lt_bw_sampling(sk); - return tcp_snd_cwnd(tcp_sk(sk)); + return (bbr->mode == BBR_STARTUP) || + (bbr->mode == BBR_PROBE_BW && + (bbr->cycle_idx == BBR_BW_PROBE_REFILL || + bbr->cycle_idx == BBR_BW_PROBE_UP)); } -/* Entering loss recovery, so save cwnd for when we exit or undo recovery. */ +/* Has the given amount of time elapsed since we marked the phase start? 
*/ +static bool bbr_has_elapsed_in_phase(const struct sock *sk, u32 interval_us) +{ + const struct tcp_sock *tp = tcp_sk(sk); + const struct bbr *bbr = inet_csk_ca(sk); + + return tcp_stamp_us_delta(tp->tcp_mstamp, + bbr->cycle_mstamp + interval_us) > 0; +} + +static void bbr_handle_queue_too_high_in_startup(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 bdp; /* estimated BDP in packets, with quantization budget */ + + bbr->full_bw_reached = 1; + + bdp = bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); + bbr->inflight_hi = max(bdp, bbr->inflight_latest); +} + +/* Exit STARTUP upon N consecutive rounds with ECN mark rate > ecn_thresh. */ +static void bbr_check_ecn_too_high_in_startup(struct sock *sk, u32 ce_ratio) +{ + struct bbr *bbr = inet_csk_ca(sk); + + if (bbr_full_bw_reached(sk) || !bbr->ecn_eligible || + !bbr_param(sk, full_ecn_cnt) || !bbr_param(sk, ecn_thresh)) + return; + + if (ce_ratio >= bbr_param(sk, ecn_thresh)) + bbr->startup_ecn_rounds++; + else + bbr->startup_ecn_rounds = 0; + + if (bbr->startup_ecn_rounds >= bbr_param(sk, full_ecn_cnt)) { + bbr_handle_queue_too_high_in_startup(sk); + return; + } +} + +/* Updates ecn_alpha and returns ce_ratio. -1 if not available. */ +static int bbr_update_ecn_alpha(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); + struct bbr *bbr = inet_csk_ca(sk); + s32 delivered, delivered_ce; + u64 alpha, ce_ratio; + u32 gain; + bool want_ecn_alpha; + + /* See if we should use ECN sender logic for this connection. */ + if (!bbr->ecn_eligible && bbr_can_use_ecn(sk) && + bbr_param(sk, ecn_factor) && + (bbr->min_rtt_us <= bbr_ecn_max_rtt_us || + !bbr_ecn_max_rtt_us)) + bbr->ecn_eligible = 1; + + /* Skip updating alpha only if not ECN-eligible and PLB is disabled. */ + want_ecn_alpha = (bbr->ecn_eligible || + (bbr_can_use_ecn(sk) && + READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled))); + if (!want_ecn_alpha) + return -1; + + delivered = tp->delivered - bbr->alpha_last_delivered; + delivered_ce = tp->delivered_ce - bbr->alpha_last_delivered_ce; + + if (delivered == 0 || /* avoid divide by zero */ + WARN_ON_ONCE(delivered < 0 || delivered_ce < 0)) /* backwards? */ + return -1; + + BUILD_BUG_ON(BBR_SCALE != TCP_PLB_SCALE); + ce_ratio = (u64)delivered_ce << BBR_SCALE; + do_div(ce_ratio, delivered); + + gain = bbr_param(sk, ecn_alpha_gain); + alpha = ((BBR_UNIT - gain) * bbr->ecn_alpha) >> BBR_SCALE; + alpha += (gain * ce_ratio) >> BBR_SCALE; + bbr->ecn_alpha = min_t(u32, alpha, BBR_UNIT); + + bbr->alpha_last_delivered = tp->delivered; + bbr->alpha_last_delivered_ce = tp->delivered_ce; + + bbr_check_ecn_too_high_in_startup(sk, ce_ratio); + return (int)ce_ratio; +} + +/* Protective Load Balancing (PLB). PLB rehashes outgoing data (to a new IPv6 + * flow label) if it encounters sustained congestion in the form of ECN marks. + */ +static void bbr_plb(struct sock *sk, const struct rate_sample *rs, int ce_ratio) +{ + struct bbr *bbr = inet_csk_ca(sk); + + if (bbr->round_start && ce_ratio >= 0) + tcp_plb_update_state(sk, &bbr->plb, ce_ratio); + + tcp_plb_check_rehash(sk, &bbr->plb); +} + +/* Each round trip of BBR_BW_PROBE_UP, double volume of probing data. */ +static void bbr_raise_inflight_hi_slope(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u32 growth_this_round, cnt; + + /* Calculate "slope": packets S/Acked per inflight_hi increment. 
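+ * E.g. in the first BBR_BW_PROBE_UP round growth_this_round is 1, so it
+ * takes roughly a full cwnd of ACKed packets to raise inflight_hi by one
+ * packet; each later round doubles growth_this_round (up to 2^30), which
+ * halves bw_probe_up_cnt and so roughly doubles the growth rate.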
*/ + growth_this_round = 1 << bbr->bw_probe_up_rounds; + bbr->bw_probe_up_rounds = min(bbr->bw_probe_up_rounds + 1, 30); + cnt = tcp_snd_cwnd(tp) / growth_this_round; + cnt = max(cnt, 1U); + bbr->bw_probe_up_cnt = cnt; +} + +/* In BBR_BW_PROBE_UP, not seeing high loss/ECN/queue, so raise inflight_hi. */ +static void bbr_probe_inflight_hi_upward(struct sock *sk, + const struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u32 delta; + + if (!tp->is_cwnd_limited || tcp_snd_cwnd(tp) < bbr->inflight_hi) + return; /* not fully using inflight_hi, so don't grow it */ + + /* For each bw_probe_up_cnt packets ACKed, increase inflight_hi by 1. */ + bbr->bw_probe_up_acks += rs->acked_sacked; + if (bbr->bw_probe_up_acks >= bbr->bw_probe_up_cnt) { + delta = bbr->bw_probe_up_acks / bbr->bw_probe_up_cnt; + bbr->bw_probe_up_acks -= delta * bbr->bw_probe_up_cnt; + bbr->inflight_hi += delta; + bbr->try_fast_path = 0; /* Need to update cwnd */ + } + + if (bbr->round_start) + bbr_raise_inflight_hi_slope(sk); +} + +/* Does loss/ECN rate for this sample say inflight is "too high"? + * This is used by both the bbr_check_loss_too_high_in_startup() function, + * which can be used in either v1 or v2, and the PROBE_UP phase of v2, which + * uses it to notice when loss/ECN rates suggest inflight is too high. + */ +static bool bbr_is_inflight_too_high(const struct sock *sk, + const struct rate_sample *rs) +{ + const struct bbr *bbr = inet_csk_ca(sk); + u32 loss_thresh, ecn_thresh; + + if (rs->lost > 0 && rs->tx_in_flight) { + loss_thresh = (u64)rs->tx_in_flight * bbr_param(sk, loss_thresh) >> + BBR_SCALE; + if (rs->lost > loss_thresh) { + return true; + } + } + + if (rs->delivered_ce > 0 && rs->delivered > 0 && + bbr->ecn_eligible && bbr_param(sk, ecn_thresh)) { + ecn_thresh = (u64)rs->delivered * bbr_param(sk, ecn_thresh) >> + BBR_SCALE; + if (rs->delivered_ce > ecn_thresh) { + return true; + } + } + + return false; +} + +/* Calculate the tx_in_flight level that corresponded to excessive loss. + * We find "lost_prefix" segs of the skb where loss rate went too high, + * by solving for "lost_prefix" in the following equation: + * lost / inflight >= loss_thresh + * (lost_prev + lost_prefix) / (inflight_prev + lost_prefix) >= loss_thresh + * Then we take that equation, convert it to fixed point, and + * round up to the nearest packet. + */ +static u32 bbr_inflight_hi_from_lost_skb(const struct sock *sk, + const struct rate_sample *rs, + const struct sk_buff *skb) +{ + const struct tcp_sock *tp = tcp_sk(sk); + u32 loss_thresh = bbr_param(sk, loss_thresh); + u32 pcount, divisor, inflight_hi; + s32 inflight_prev, lost_prev; + u64 loss_budget, lost_prefix; + + pcount = tcp_skb_pcount(skb); + + /* How much data was in flight before this skb? */ + inflight_prev = rs->tx_in_flight - pcount; + if (inflight_prev < 0) { + WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious( + pcount, + TCP_SKB_CB(skb)->sacked, + rs->tx_in_flight), + "tx_in_flight: %u pcount: %u reneg: %u", + rs->tx_in_flight, pcount, tcp_sk(sk)->is_sack_reneg); + return ~0U; + } + + /* How much inflight data was marked lost before this skb? 
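+ * (rs->lost is derived from tp->lost - scb->tx.lost and, per the equation
+ * above, already counts this just-marked-lost skb, so subtracting pcount
+ * yields the losses that occurred before this skb.)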
*/ + lost_prev = rs->lost - pcount; + if (WARN_ONCE(lost_prev < 0, + "cwnd: %u ca: %d out: %u lost: %u pif: %u " + "tx_in_flight: %u tx.lost: %u tp->lost: %u rs->lost: %d " + "lost_prev: %d pcount: %d seq: %u end_seq: %u reneg: %u", + tcp_snd_cwnd(tp), inet_csk(sk)->icsk_ca_state, + tp->packets_out, tp->lost_out, tcp_packets_in_flight(tp), + rs->tx_in_flight, TCP_SKB_CB(skb)->tx.lost, tp->lost, + rs->lost, lost_prev, pcount, + TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, + tp->is_sack_reneg)) + return ~0U; + + /* At what prefix of this lost skb did losss rate exceed loss_thresh? */ + loss_budget = (u64)inflight_prev * loss_thresh + BBR_UNIT - 1; + loss_budget >>= BBR_SCALE; + if (lost_prev >= loss_budget) { + lost_prefix = 0; /* previous losses crossed loss_thresh */ + } else { + lost_prefix = loss_budget - lost_prev; + lost_prefix <<= BBR_SCALE; + divisor = BBR_UNIT - loss_thresh; + if (WARN_ON_ONCE(!divisor)) /* loss_thresh is 8 bits */ + return ~0U; + do_div(lost_prefix, divisor); + } + + inflight_hi = inflight_prev + lost_prefix; + return inflight_hi; +} + +/* If loss/ECN rates during probing indicated we may have overfilled a + * buffer, return an operating point that tries to leave unutilized headroom in + * the path for other flows, for fairness convergence and lower RTTs and loss. + */ +static u32 bbr_inflight_with_headroom(const struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 headroom, headroom_fraction; + + if (bbr->inflight_hi == ~0U) + return ~0U; + + headroom_fraction = bbr_param(sk, inflight_headroom); + headroom = ((u64)bbr->inflight_hi * headroom_fraction) >> BBR_SCALE; + headroom = max(headroom, 1U); + return max_t(s32, bbr->inflight_hi - headroom, + bbr_param(sk, cwnd_min_target)); +} + +/* Bound cwnd to a sensible level, based on our current probing state + * machine phase and model of a good inflight level (inflight_lo, inflight_hi). + */ +static void bbr_bound_cwnd_for_inflight_model(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + u32 cap; + + /* tcp_rcv_synsent_state_process() currently calls tcp_ack() + * and thus cong_control() without first initializing us(!). + */ + if (!bbr->initialized) + return; + + cap = ~0U; + if (bbr->mode == BBR_PROBE_BW && + bbr->cycle_idx != BBR_BW_PROBE_CRUISE) { + /* Probe to see if more packets fit in the path. */ + cap = bbr->inflight_hi; + } else { + if (bbr->mode == BBR_PROBE_RTT || + (bbr->mode == BBR_PROBE_BW && + bbr->cycle_idx == BBR_BW_PROBE_CRUISE)) + cap = bbr_inflight_with_headroom(sk); + } + /* Adapt to any loss/ECN since our last bw probe. */ + cap = min(cap, bbr->inflight_lo); + + cap = max_t(u32, cap, bbr_param(sk, cwnd_min_target)); + tcp_snd_cwnd_set(tp, min(cap, tcp_snd_cwnd(tp))); +} + +/* How should we multiplicatively cut bw or inflight limits based on ECN? */ +u32 bbr_ecn_cut(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + return BBR_UNIT - + ((bbr->ecn_alpha * bbr_param(sk, ecn_factor)) >> BBR_SCALE); +} + +/* Init lower bounds if have not inited yet. */ +static void bbr_init_lower_bounds(struct sock *sk, bool init_bw) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + if (init_bw && bbr->bw_lo == ~0U) + bbr->bw_lo = bbr_max_bw(sk); + if (bbr->inflight_lo == ~0U) + bbr->inflight_lo = tcp_snd_cwnd(tp); +} + +/* Reduce bw and inflight to (1 - beta). 
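+ * The multiplicative cut is floored at bw_latest/inflight_latest, so a
+ * loss round never drags bw_lo/inflight_lo below the rate and volume the
+ * path actually delivered in the most recent round trip.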
*/ +static void bbr_loss_lower_bounds(struct sock *sk, u32 *bw, u32 *inflight) +{ + struct bbr* bbr = inet_csk_ca(sk); + u32 loss_cut = BBR_UNIT - bbr_param(sk, beta); + + *bw = max_t(u32, bbr->bw_latest, + (u64)bbr->bw_lo * loss_cut >> BBR_SCALE); + *inflight = max_t(u32, bbr->inflight_latest, + (u64)bbr->inflight_lo * loss_cut >> BBR_SCALE); +} + +/* Reduce inflight to (1 - alpha*ecn_factor). */ +static void bbr_ecn_lower_bounds(struct sock *sk, u32 *inflight) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 ecn_cut = bbr_ecn_cut(sk); + + *inflight = (u64)bbr->inflight_lo * ecn_cut >> BBR_SCALE; +} + +/* Estimate a short-term lower bound on the capacity available now, based + * on measurements of the current delivery process and recent history. When we + * are seeing loss/ECN at times when we are not probing bw, then conservatively + * move toward flow balance by multiplicatively cutting our short-term + * estimated safe rate and volume of data (bw_lo and inflight_lo). We use a + * multiplicative decrease in order to converge to a lower capacity in time + * logarithmic in the magnitude of the decrease. + * + * However, we do not cut our short-term estimates lower than the current rate + * and volume of delivered data from this round trip, since from the current + * delivery process we can estimate the measured capacity available now. + * + * Anything faster than that approach would knowingly risk high loss, which can + * cause low bw for Reno/CUBIC and high loss recovery latency for + * request/response flows using any congestion control. + */ +static void bbr_adapt_lower_bounds(struct sock *sk, + const struct rate_sample *rs) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 ecn_inflight_lo = ~0U; + + /* We only use lower-bound estimates when not probing bw. + * When probing we need to push inflight higher to probe bw. + */ + if (bbr_is_probing_bandwidth(sk)) + return; + + /* ECN response. */ + if (bbr->ecn_in_round && bbr_param(sk, ecn_factor)) { + bbr_init_lower_bounds(sk, false); + bbr_ecn_lower_bounds(sk, &ecn_inflight_lo); + } + + /* Loss response. */ + if (bbr->loss_in_round) { + bbr_init_lower_bounds(sk, true); + bbr_loss_lower_bounds(sk, &bbr->bw_lo, &bbr->inflight_lo); + } + + /* Adjust to the lower of the levels implied by loss/ECN. */ + bbr->inflight_lo = min(bbr->inflight_lo, ecn_inflight_lo); + bbr->bw_lo = max(1U, bbr->bw_lo); +} + +/* Reset any short-term lower-bound adaptation to congestion, so that we can + * push our inflight up. + */ +static void bbr_reset_lower_bounds(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + bbr->bw_lo = ~0U; + bbr->inflight_lo = ~0U; +} + +/* After bw probing (STARTUP/PROBE_UP), reset signals before entering a state + * machine phase where we adapt our lower bound based on congestion signals. + */ +static void bbr_reset_congestion_signals(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + bbr->loss_in_round = 0; + bbr->ecn_in_round = 0; + bbr->loss_in_cycle = 0; + bbr->ecn_in_cycle = 0; + bbr->bw_latest = 0; + bbr->inflight_latest = 0; +} + +static void bbr_exit_loss_recovery(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd)); + bbr->try_fast_path = 0; /* bound cwnd using latest model */ +} + +/* Update rate and volume of delivered data from latest round trip. 
*/ +static void bbr_update_latest_delivery_signals( + struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + bbr->loss_round_start = 0; + if (rs->interval_us <= 0 || !rs->acked_sacked) + return; /* Not a valid observation */ + + bbr->bw_latest = max_t(u32, bbr->bw_latest, ctx->sample_bw); + bbr->inflight_latest = max_t(u32, bbr->inflight_latest, rs->delivered); + + if (!before(rs->prior_delivered, bbr->loss_round_delivered)) { + bbr->loss_round_delivered = tp->delivered; + bbr->loss_round_start = 1; /* mark start of new round trip */ + } +} + +/* Once per round, reset filter for latest rate and volume of delivered data. */ +static void bbr_advance_latest_delivery_signals( + struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) +{ + struct bbr *bbr = inet_csk_ca(sk); + + /* If ACK matches a TLP retransmit, persist the filter. If we detect + * that a TLP retransmit plugged a tail loss, we'll want to remember + * how much data the path delivered before the tail loss. + */ + if (bbr->loss_round_start && !rs->is_acking_tlp_retrans_seq) { + bbr->bw_latest = ctx->sample_bw; + bbr->inflight_latest = rs->delivered; + } +} + +/* Update (most of) our congestion signals: track the recent rate and volume of + * delivered data, presence of loss, and EWMA degree of ECN marking. + */ +static void bbr_update_congestion_signals( + struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) +{ + struct bbr *bbr = inet_csk_ca(sk); + u64 bw; + + if (rs->interval_us <= 0 || !rs->acked_sacked) + return; /* Not a valid observation */ + bw = ctx->sample_bw; + + if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) + bbr_take_max_bw_sample(sk, bw); + + bbr->loss_in_round |= (rs->losses > 0); + + if (!bbr->loss_round_start) + return; /* skip the per-round-trip updates */ + /* Now do per-round-trip updates. */ + bbr_adapt_lower_bounds(sk, rs); + + bbr->loss_in_round = 0; + bbr->ecn_in_round = 0; +} + +/* Bandwidth probing can cause loss. To help coexistence with loss-based + * congestion control we spread out our probing in a Reno-conscious way. Due to + * the shape of the Reno sawtooth, the time required between loss epochs for an + * idealized Reno flow is a number of round trips that is the BDP of that + * flow. We count packet-timed round trips directly, since measured RTT can + * vary widely, and Reno is driven by packet-timed round trips. + */ +static bool bbr_is_reno_coexistence_probe_time(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 rounds; + + /* Random loss can shave some small percentage off of our inflight + * in each round. To survive this, flows need robust periodic probes. + */ + rounds = min_t(u32, bbr_param(sk, bw_probe_max_rounds), bbr_target_inflight(sk)); + return bbr->rounds_since_probe >= rounds; +} + +/* How long do we want to wait before probing for bandwidth (and risking + * loss)? We randomize the wait, for better mixing and fairness convergence. + * + * We bound the Reno-coexistence inter-bw-probe time to be 62-63 round trips. 
+ * This is calculated to allow fairness with a 25Mbps, 30ms Reno flow, + * (eg 4K video to a broadband user): + * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets + * + * We bound the BBR-native inter-bw-probe wall clock time to be: + * (a) higher than 2 sec: to try to avoid causing loss for a long enough time + * to allow Reno at 30ms to get 4K video bw, the inter-bw-probe time must + * be at least: 25Mbps * .030sec / (1514bytes) * 0.030sec = 1.9secs + * (b) lower than 3 sec: to ensure flows can start probing in a reasonable + * amount of time to discover unutilized bw on human-scale interactive + * time-scales (e.g. perhaps traffic from a web page download that we + * were competing with is now complete). + */ +static void bbr_pick_probe_wait(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + /* Decide the random round-trip bound for wait until probe: */ + bbr->rounds_since_probe = + prandom_u32_max(bbr_param(sk, bw_probe_rand_rounds)); + /* Decide the random wall clock bound for wait until probe: */ + bbr->probe_wait_us = bbr_param(sk, bw_probe_base_us) + + prandom_u32_max(bbr_param(sk, bw_probe_rand_us)); +} + +static void bbr_set_cycle_idx(struct sock *sk, int cycle_idx) +{ + struct bbr *bbr = inet_csk_ca(sk); + + bbr->cycle_idx = cycle_idx; + /* New phase, so need to update cwnd and pacing rate. */ + bbr->try_fast_path = 0; +} + +/* Send at estimated bw to fill the pipe, but not queue. We need this phase + * before PROBE_UP, because as soon as we send faster than the available bw + * we will start building a queue, and if the buffer is shallow we can cause + * loss. If we do not fill the pipe before we cause this loss, our bw_hi and + * inflight_hi estimates will underestimate. + */ +static void bbr_start_bw_probe_refill(struct sock *sk, u32 bw_probe_up_rounds) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + bbr_reset_lower_bounds(sk); + bbr->bw_probe_up_rounds = bw_probe_up_rounds; + bbr->bw_probe_up_acks = 0; + bbr->stopped_risky_probe = 0; + bbr->ack_phase = BBR_ACKS_REFILLING; + bbr->next_rtt_delivered = tp->delivered; + bbr_set_cycle_idx(sk, BBR_BW_PROBE_REFILL); +} + +/* Now probe max deliverable data rate and volume. */ +static void bbr_start_bw_probe_up(struct sock *sk, struct bbr_context *ctx) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + bbr->ack_phase = BBR_ACKS_PROBE_STARTING; + bbr->next_rtt_delivered = tp->delivered; + bbr->cycle_mstamp = tp->tcp_mstamp; + bbr_reset_full_bw(sk); + bbr->full_bw = ctx->sample_bw; + bbr_set_cycle_idx(sk, BBR_BW_PROBE_UP); + bbr_raise_inflight_hi_slope(sk); +} + +/* Start a new PROBE_BW probing cycle of some wall clock length. Pick a wall + * clock time at which to probe beyond an inflight that we think to be + * safe. This will knowingly risk packet loss, so we want to do this rarely, to + * keep packet loss rates low. Also start a round-trip counter, to probe faster + * if we estimate a Reno flow at our BDP would probe faster. 
+ */ +static void bbr_start_bw_probe_down(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + bbr_reset_congestion_signals(sk); + bbr->bw_probe_up_cnt = ~0U; /* not growing inflight_hi any more */ + bbr_pick_probe_wait(sk); + bbr->cycle_mstamp = tp->tcp_mstamp; /* start wall clock */ + bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; + bbr->next_rtt_delivered = tp->delivered; + bbr_set_cycle_idx(sk, BBR_BW_PROBE_DOWN); +} + +/* Cruise: maintain what we estimate to be a neutral, conservative + * operating point, without attempting to probe up for bandwidth or down for + * RTT, and only reducing inflight in response to loss/ECN signals. + */ +static void bbr_start_bw_probe_cruise(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + if (bbr->inflight_lo != ~0U) + bbr->inflight_lo = min(bbr->inflight_lo, bbr->inflight_hi); + + bbr_set_cycle_idx(sk, BBR_BW_PROBE_CRUISE); +} + +/* Loss and/or ECN rate is too high while probing. + * Adapt (once per bw probe) by cutting inflight_hi and then restarting cycle. + */ +static void bbr_handle_inflight_too_high(struct sock *sk, + const struct rate_sample *rs) +{ + struct bbr *bbr = inet_csk_ca(sk); + const u32 beta = bbr_param(sk, beta); + + bbr->prev_probe_too_high = 1; + bbr->bw_probe_samples = 0; /* only react once per probe */ + /* If we are app-limited then we are not robustly + * probing the max volume of inflight data we think + * might be safe (analogous to how app-limited bw + * samples are not known to be robustly probing bw). + */ + if (!rs->is_app_limited) { + bbr->inflight_hi = max_t(u32, rs->tx_in_flight, + (u64)bbr_target_inflight(sk) * + (BBR_UNIT - beta) >> BBR_SCALE); + } + if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) + bbr_start_bw_probe_down(sk); +} + +/* If we're seeing bw and loss samples reflecting our bw probing, adapt + * using the signals we see. If loss or ECN mark rate gets too high, then adapt + * inflight_hi downward. If we're able to push inflight higher without such + * signals, push higher: adapt inflight_hi upward. + */ +static bool bbr_adapt_upper_bounds(struct sock *sk, + const struct rate_sample *rs, + struct bbr_context *ctx) +{ + struct bbr *bbr = inet_csk_ca(sk); + + /* Track when we'll see bw/loss samples resulting from our bw probes. */ + if (bbr->ack_phase == BBR_ACKS_PROBE_STARTING && bbr->round_start) + bbr->ack_phase = BBR_ACKS_PROBE_FEEDBACK; + if (bbr->ack_phase == BBR_ACKS_PROBE_STOPPING && bbr->round_start) { + /* End of samples from bw probing phase. */ + bbr->bw_probe_samples = 0; + bbr->ack_phase = BBR_ACKS_INIT; + /* At this point in the cycle, our current bw sample is also + * our best recent chance at finding the highest available bw + * for this flow. So now is the best time to forget the bw + * samples from the previous cycle, by advancing the window. + */ + if (bbr->mode == BBR_PROBE_BW && !rs->is_app_limited) + bbr_advance_max_bw_filter(sk); + /* If we had an inflight_hi, then probed and pushed inflight all + * the way up to hit that inflight_hi without seeing any + * high loss/ECN in all the resulting ACKs from that probing, + * then probe up again, this time letting inflight persist at + * inflight_hi for a round trip, then accelerating beyond. 
+ */ + if (bbr->mode == BBR_PROBE_BW && + bbr->stopped_risky_probe && !bbr->prev_probe_too_high) { + bbr_start_bw_probe_refill(sk, 0); + return true; /* yes, decided state transition */ + } + } + if (bbr_is_inflight_too_high(sk, rs)) { + if (bbr->bw_probe_samples) /* sample is from bw probing? */ + bbr_handle_inflight_too_high(sk, rs); + } else { + /* Loss/ECN rate is declared safe. Adjust upper bound upward. */ + + if (bbr->inflight_hi == ~0U) + return false; /* no excess queue signals yet */ + + /* To be resilient to random loss, we must raise bw/inflight_hi + * if we observe in any phase that a higher level is safe. + */ + if (rs->tx_in_flight > bbr->inflight_hi) { + bbr->inflight_hi = rs->tx_in_flight; + } + + if (bbr->mode == BBR_PROBE_BW && + bbr->cycle_idx == BBR_BW_PROBE_UP) + bbr_probe_inflight_hi_upward(sk, rs); + } + + return false; +} + +/* Check if it's time to probe for bandwidth now, and if so, kick it off. */ +static bool bbr_check_time_to_probe_bw(struct sock *sk, + const struct rate_sample *rs) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 n; + + /* If we seem to be at an operating point where we are not seeing loss + * but we are seeing ECN marks, then when the ECN marks cease we reprobe + * quickly (in case cross-traffic has ceased and freed up bw). + */ + if (bbr_param(sk, ecn_reprobe_gain) && bbr->ecn_eligible && + bbr->ecn_in_cycle && !bbr->loss_in_cycle && + inet_csk(sk)->icsk_ca_state == TCP_CA_Open) { + /* Calculate n so that when bbr_raise_inflight_hi_slope() + * computes growth_this_round as 2^n it will be roughly the + * desired volume of data (inflight_hi*ecn_reprobe_gain). + */ + n = ilog2((((u64)bbr->inflight_hi * + bbr_param(sk, ecn_reprobe_gain)) >> BBR_SCALE)); + bbr_start_bw_probe_refill(sk, n); + return true; + } + + if (bbr_has_elapsed_in_phase(sk, bbr->probe_wait_us) || + bbr_is_reno_coexistence_probe_time(sk)) { + bbr_start_bw_probe_refill(sk, 0); + return true; + } + return false; +} + +/* Is it time to transition from PROBE_DOWN to PROBE_CRUISE? */ +static bool bbr_check_time_to_cruise(struct sock *sk, u32 inflight, u32 bw) +{ + /* Always need to pull inflight down to leave headroom in queue. */ + if (inflight > bbr_inflight_with_headroom(sk)) + return false; + + return inflight <= bbr_inflight(sk, bw, BBR_UNIT); +} + +/* PROBE_BW state machine: cruise, refill, probe for bw, or drain? */ +static void bbr_update_cycle_phase(struct sock *sk, + const struct rate_sample *rs, + struct bbr_context *ctx) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + bool is_bw_probe_done = false; + u32 inflight, bw; + + if (!bbr_full_bw_reached(sk)) + return; + + /* In DRAIN, PROBE_BW, or PROBE_RTT, adjust upper bounds. */ + if (bbr_adapt_upper_bounds(sk, rs, ctx)) + return; /* already decided state transition */ + + if (bbr->mode != BBR_PROBE_BW) + return; + + inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); + bw = bbr_max_bw(sk); + + switch (bbr->cycle_idx) { + /* First we spend most of our time cruising with a pacing_gain of 1.0, + * which paces at the estimated bw, to try to fully use the pipe + * without building queue. If we encounter loss/ECN marks, we adapt + * by slowing down. + */ + case BBR_BW_PROBE_CRUISE: + if (bbr_check_time_to_probe_bw(sk, rs)) + return; /* already decided state transition */ + break; + + /* After cruising, when it's time to probe, we first "refill": we send + * at the estimated bw to fill the pipe, before probing higher and + * knowingly risking overflowing the bottleneck buffer (causing loss). 
+ */ + case BBR_BW_PROBE_REFILL: + if (bbr->round_start) { + /* After one full round trip of sending in REFILL, we + * start to see bw samples reflecting our REFILL, which + * may be putting too much data in flight. + */ + bbr->bw_probe_samples = 1; + bbr_start_bw_probe_up(sk, ctx); + } + break; + + /* After we refill the pipe, we probe by using a pacing_gain > 1.0, to + * probe for bw. If we have not seen loss/ECN, we try to raise inflight + * to at least pacing_gain*BDP; note that this may take more than + * min_rtt if min_rtt is small (e.g. on a LAN). + * + * We terminate PROBE_UP bandwidth probing upon any of the following: + * + * (1) We've pushed inflight up to hit the inflight_hi target set in the + * most recent previous bw probe phase. Thus we want to start + * draining the queue immediately because it's very likely the most + * recently sent packets will fill the queue and cause drops. + * (2) If inflight_hi has not limited bandwidth growth recently, and + * yet delivered bandwidth has not increased much recently + * (bbr->full_bw_now). + * (3) Loss filter says loss rate is "too high". + * (4) ECN filter says ECN mark rate is "too high". + * + * (1) (2) checked here, (3) (4) checked in bbr_is_inflight_too_high() + */ + case BBR_BW_PROBE_UP: + if (bbr->prev_probe_too_high && + inflight >= bbr->inflight_hi) { + bbr->stopped_risky_probe = 1; + is_bw_probe_done = true; + } else { + if (tp->is_cwnd_limited && + tcp_snd_cwnd(tp) >= bbr->inflight_hi) { + /* inflight_hi is limiting bw growth */ + bbr_reset_full_bw(sk); + bbr->full_bw = ctx->sample_bw; + } else if (bbr->full_bw_now) { + /* Plateau in estimated bw. Pipe looks full. */ + is_bw_probe_done = true; + } + } + if (is_bw_probe_done) { + bbr->prev_probe_too_high = 0; /* no loss/ECN (yet) */ + bbr_start_bw_probe_down(sk); /* restart w/ down */ + } + break; + + /* After probing in PROBE_UP, we have usually accumulated some data in + * the bottleneck buffer (if bw probing didn't find more bw). We next + * enter PROBE_DOWN to try to drain any excess data from the queue. To + * do this, we use a pacing_gain < 1.0. We hold this pacing gain until + * our inflight is less then that target cruising point, which is the + * minimum of (a) the amount needed to leave headroom, and (b) the + * estimated BDP. Once inflight falls to match the target, we estimate + * the queue is drained; persisting would underutilize the pipe. + */ + case BBR_BW_PROBE_DOWN: + if (bbr_check_time_to_probe_bw(sk, rs)) + return; /* already decided state transition */ + if (bbr_check_time_to_cruise(sk, inflight, bw)) + bbr_start_bw_probe_cruise(sk); + break; + + default: + WARN_ONCE(1, "BBR invalid cycle index %u\n", bbr->cycle_idx); + } +} + +/* Exiting PROBE_RTT, so return to bandwidth probing in STARTUP or PROBE_BW. */ +static void bbr_exit_probe_rtt(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + bbr_reset_lower_bounds(sk); + if (bbr_full_bw_reached(sk)) { + bbr->mode = BBR_PROBE_BW; + /* Raising inflight after PROBE_RTT may cause loss, so reset + * the PROBE_BW clock and schedule the next bandwidth probe for + * a friendly and randomized future point in time. + */ + bbr_start_bw_probe_down(sk); + /* Since we are exiting PROBE_RTT, we know inflight is + * below our estimated BDP, so it is reasonable to cruise. + */ + bbr_start_bw_probe_cruise(sk); + } else { + bbr->mode = BBR_STARTUP; + } +} + +/* Exit STARTUP based on loss rate > 1% and loss gaps in round >= N. 
Wait until + * the end of the round in recovery to get a good estimate of how many packets + * have been lost, and how many we need to drain with a low pacing rate. + */ +static void bbr_check_loss_too_high_in_startup(struct sock *sk, + const struct rate_sample *rs) +{ + struct bbr *bbr = inet_csk_ca(sk); + + if (bbr_full_bw_reached(sk)) + return; + + /* For STARTUP exit, check the loss rate at the end of each round trip + * of Recovery episodes in STARTUP. We check the loss rate at the end + * of the round trip to filter out noisy/low loss and have a better + * sense of inflight (extent of loss), so we can drain more accurately. + */ + if (rs->losses && bbr->loss_events_in_round < 0xf) + bbr->loss_events_in_round++; /* update saturating counter */ + if (bbr_param(sk, full_loss_cnt) && bbr->loss_round_start && + inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery && + bbr->loss_events_in_round >= bbr_param(sk, full_loss_cnt) && + bbr_is_inflight_too_high(sk, rs)) { + bbr_handle_queue_too_high_in_startup(sk); + return; + } + if (bbr->loss_round_start) + bbr->loss_events_in_round = 0; +} + +/* Estimate when the pipe is full, using the change in delivery rate: BBR + * estimates bw probing filled the pipe if the estimated bw hasn't changed by + * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited + * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the + * higher rwin, 3: we get higher delivery rate samples. Or transient + * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar + * design goal, but uses delay and inter-ACK spacing instead of bandwidth. + */ +static void bbr_check_full_bw_reached(struct sock *sk, + const struct rate_sample *rs, + struct bbr_context *ctx) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 bw_thresh, full_cnt, thresh; + + if (bbr->full_bw_now || rs->is_app_limited) + return; + + thresh = bbr_param(sk, full_bw_thresh); + full_cnt = bbr_param(sk, full_bw_cnt); + bw_thresh = (u64)bbr->full_bw * thresh >> BBR_SCALE; + if (ctx->sample_bw >= bw_thresh) { + bbr_reset_full_bw(sk); + bbr->full_bw = ctx->sample_bw; + return; + } + if (!bbr->round_start) + return; + ++bbr->full_bw_cnt; + bbr->full_bw_now = bbr->full_bw_cnt >= full_cnt; + bbr->full_bw_reached |= bbr->full_bw_now; +} + +/* If pipe is probably full, drain the queue and then enter steady-state. */ +static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs, + struct bbr_context *ctx) +{ + struct bbr *bbr = inet_csk_ca(sk); + + if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { + bbr->mode = BBR_DRAIN; /* drain queue we created */ + /* Set ssthresh to export purely for monitoring, to signal + * completion of initial STARTUP by setting to a non- + * TCP_INFINITE_SSTHRESH value (ssthresh is not used by BBR). 
+ */ + tcp_sk(sk)->snd_ssthresh = + bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); + bbr_reset_congestion_signals(sk); + } /* fall through to check if in-flight is already small: */ + if (bbr->mode == BBR_DRAIN && + bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= + bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) { + bbr->mode = BBR_PROBE_BW; + bbr_start_bw_probe_down(sk); + } +} + +static void bbr_update_model(struct sock *sk, const struct rate_sample *rs, + struct bbr_context *ctx) +{ + bbr_update_congestion_signals(sk, rs, ctx); + bbr_update_ack_aggregation(sk, rs); + bbr_check_loss_too_high_in_startup(sk, rs); + bbr_check_full_bw_reached(sk, rs, ctx); + bbr_check_drain(sk, rs, ctx); + bbr_update_cycle_phase(sk, rs, ctx); + bbr_update_min_rtt(sk, rs); +} + +/* Fast path for app-limited case. + * + * On each ack, we execute bbr state machine, which primarily consists of: + * 1) update model based on new rate sample, and + * 2) update control based on updated model or state change. + * + * There are certain workload/scenarios, e.g. app-limited case, where + * either we can skip updating model or we can skip update of both model + * as well as control. This provides signifcant softirq cpu savings for + * processing incoming acks. + * + * In case of app-limited, if there is no congestion (loss/ecn) and + * if observed bw sample is less than current estimated bw, then we can + * skip some of the computation in bbr state processing: + * + * - if there is no rtt/mode/phase change: In this case, since all the + * parameters of the network model are constant, we can skip model + * as well control update. + * + * - else we can skip rest of the model update. But we still need to + * update the control to account for the new rtt/mode/phase. + * + * Returns whether we can take fast path or not. 
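+ * Concretely: on the fast path we skip bbr_update_congestion_signals(),
+ * bbr_update_ack_aggregation(), bbr_check_loss_too_high_in_startup() and
+ * bbr_check_full_bw_reached(); bbr_check_drain(), bbr_update_cycle_phase()
+ * and bbr_update_min_rtt() still run, and if mode, phase and min_rtt are
+ * all unchanged we skip the pacing/cwnd control update as well.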
+ */ +static bool bbr_run_fast_path(struct sock *sk, bool *update_model, + const struct rate_sample *rs, struct bbr_context *ctx) +{ + struct bbr *bbr = inet_csk_ca(sk); + u32 prev_min_rtt_us, prev_mode; + + if (bbr_param(sk, fast_path) && bbr->try_fast_path && + rs->is_app_limited && ctx->sample_bw < bbr_max_bw(sk) && + !bbr->loss_in_round && !bbr->ecn_in_round ) { + prev_mode = bbr->mode; + prev_min_rtt_us = bbr->min_rtt_us; + bbr_check_drain(sk, rs, ctx); + bbr_update_cycle_phase(sk, rs, ctx); + bbr_update_min_rtt(sk, rs); + + if (bbr->mode == prev_mode && + bbr->min_rtt_us == prev_min_rtt_us && + bbr->try_fast_path) { + return true; + } + + /* Skip model update, but control still needs to be updated */ + *update_model = false; + } + return false; +} + +void bbr_main(struct sock *sk, const struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + struct bbr_context ctx = { 0 }; + bool update_model = true; + u32 bw, round_delivered; + int ce_ratio = -1; + + round_delivered = bbr_update_round_start(sk, rs, &ctx); + if (bbr->round_start) { + bbr->rounds_since_probe = + min_t(s32, bbr->rounds_since_probe + 1, 0xFF); + ce_ratio = bbr_update_ecn_alpha(sk); + } + bbr_plb(sk, rs, ce_ratio); + + bbr->ecn_in_round |= (bbr->ecn_eligible && rs->is_ece); + bbr_calculate_bw_sample(sk, rs, &ctx); + bbr_update_latest_delivery_signals(sk, rs, &ctx); + + if (bbr_run_fast_path(sk, &update_model, rs, &ctx)) + goto out; + + if (update_model) + bbr_update_model(sk, rs, &ctx); + + bbr_update_gains(sk); + bw = bbr_bw(sk); + bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); + bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain, + tcp_snd_cwnd(tp), &ctx); + bbr_bound_cwnd_for_inflight_model(sk); + +out: + bbr_advance_latest_delivery_signals(sk, rs, &ctx); + bbr->prev_ca_state = inet_csk(sk)->icsk_ca_state; + bbr->loss_in_cycle |= rs->lost > 0; + bbr->ecn_in_cycle |= rs->delivered_ce > 0; +} + +static void bbr_init(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + bbr->initialized = 1; + + bbr->init_cwnd = min(0x7FU, tcp_snd_cwnd(tp)); + bbr->prior_cwnd = tp->prior_cwnd; + tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; + bbr->next_rtt_delivered = tp->delivered; + bbr->prev_ca_state = TCP_CA_Open; + + bbr->probe_rtt_done_stamp = 0; + bbr->probe_rtt_round_done = 0; + bbr->probe_rtt_min_us = tcp_min_rtt(tp); + bbr->probe_rtt_min_stamp = tcp_jiffies32; + bbr->min_rtt_us = tcp_min_rtt(tp); + bbr->min_rtt_stamp = tcp_jiffies32; + + bbr->has_seen_rtt = 0; + bbr_init_pacing_rate_from_rtt(sk); + + bbr->round_start = 0; + bbr->idle_restart = 0; + bbr->full_bw_reached = 0; + bbr->full_bw = 0; + bbr->full_bw_cnt = 0; + bbr->cycle_mstamp = 0; + bbr->cycle_idx = 0; + + bbr_reset_startup_mode(sk); + + bbr->ack_epoch_mstamp = tp->tcp_mstamp; + bbr->ack_epoch_acked = 0; + bbr->extra_acked_win_rtts = 0; + bbr->extra_acked_win_idx = 0; + bbr->extra_acked[0] = 0; + bbr->extra_acked[1] = 0; + + bbr->ce_state = 0; + bbr->prior_rcv_nxt = tp->rcv_nxt; + bbr->try_fast_path = 0; + + cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); + + /* Start sampling ECN mark rate after first full flight is ACKed: */ + bbr->loss_round_delivered = tp->delivered + 1; + bbr->loss_round_start = 0; + bbr->undo_bw_lo = 0; + bbr->undo_inflight_lo = 0; + bbr->undo_inflight_hi = 0; + bbr->loss_events_in_round = 0; + bbr->startup_ecn_rounds = 0; + bbr_reset_congestion_signals(sk); + bbr->bw_lo = ~0U; + bbr->bw_hi[0] = 0; + bbr->bw_hi[1] = 0; + 
bbr->inflight_lo = ~0U; + bbr->inflight_hi = ~0U; + bbr_reset_full_bw(sk); + bbr->bw_probe_up_cnt = ~0U; + bbr->bw_probe_up_acks = 0; + bbr->bw_probe_up_rounds = 0; + bbr->probe_wait_us = 0; + bbr->stopped_risky_probe = 0; + bbr->ack_phase = BBR_ACKS_INIT; + bbr->rounds_since_probe = 0; + bbr->bw_probe_samples = 0; + bbr->prev_probe_too_high = 0; + bbr->ecn_eligible = 0; + bbr->ecn_alpha = bbr_param(sk, ecn_alpha_init); + bbr->alpha_last_delivered = 0; + bbr->alpha_last_delivered_ce = 0; + bbr->plb.pause_until = 0; + + tp->fast_ack_mode = bbr_fast_ack_mode ? 1 : 0; +} + +/* BBR marks the current round trip as a loss round. */ +static void bbr_note_loss(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + /* Capture "current" data over the full round trip of loss, to + * have a better chance of observing the full capacity of the path. + */ + if (!bbr->loss_in_round) /* first loss in this round trip? */ + bbr->loss_round_delivered = tp->delivered; /* set round trip */ + bbr->loss_in_round = 1; + bbr->loss_in_cycle = 1; +} + +/* Core TCP stack informs us that the given skb was just marked lost. */ +static void bbr_skb_marked_lost(struct sock *sk, + const struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + struct tcp_skb_cb *scb = TCP_SKB_CB(skb); + struct rate_sample rs = {}; + + bbr_note_loss(sk); + + if (!bbr->bw_probe_samples) + return; /* not an skb sent while probing for bandwidth */ + if (unlikely(!scb->tx.delivered_mstamp)) + return; /* skb was SACKed, reneged, marked lost; ignore it */ + /* We are probing for bandwidth. Construct a rate sample that + * estimates what happened in the flight leading up to this lost skb, + * then see if the loss rate went too high, and if so at which packet. + */ + rs.tx_in_flight = scb->tx.in_flight; + rs.lost = tp->lost - scb->tx.lost; + rs.is_app_limited = scb->tx.is_app_limited; + if (bbr_is_inflight_too_high(sk, &rs)) { + rs.tx_in_flight = bbr_inflight_hi_from_lost_skb(sk, &rs, skb); + bbr_handle_inflight_too_high(sk, &rs); + } +} + +static void bbr_run_loss_probe_recovery(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + struct rate_sample rs = {0}; + + bbr_note_loss(sk); + + if (!bbr->bw_probe_samples) + return; /* not sent while probing for bandwidth */ + /* We are probing for bandwidth. Construct a rate sample that + * estimates what happened in the flight leading up to this + * loss, then see if the loss rate went too high. + */ + rs.lost = 1; /* TLP probe repaired loss of a single segment */ + rs.tx_in_flight = bbr->inflight_latest + rs.lost; + rs.is_app_limited = tp->tlp_orig_data_app_limited; + if (bbr_is_inflight_too_high(sk, &rs)) + bbr_handle_inflight_too_high(sk, &rs); +} + +/* Revert short-term model if current loss recovery event was spurious. */ +static u32 bbr_undo_cwnd(struct sock *sk) +{ + struct bbr *bbr = inet_csk_ca(sk); + + bbr_reset_full_bw(sk); /* spurious slow-down; reset full bw detector */ + bbr->loss_in_round = 0; + + /* Revert to cwnd and other state saved before loss episode. */ + bbr->bw_lo = max(bbr->bw_lo, bbr->undo_bw_lo); + bbr->inflight_lo = max(bbr->inflight_lo, bbr->undo_inflight_lo); + bbr->inflight_hi = max(bbr->inflight_hi, bbr->undo_inflight_hi); + bbr->try_fast_path = 0; /* take slow path to set proper cwnd, pacing */ + return bbr->prior_cwnd; +} + +/* Entering loss recovery, so save state for when we undo recovery. 
*/ static u32 bbr_ssthresh(struct sock *sk) { + struct bbr *bbr = inet_csk_ca(sk); + bbr_save_cwnd(sk); + /* For undo, save state that adapts based on loss signal. */ + bbr->undo_bw_lo = bbr->bw_lo; + bbr->undo_inflight_lo = bbr->inflight_lo; + bbr->undo_inflight_hi = bbr->inflight_hi; return tcp_sk(sk)->snd_ssthresh; } +static enum tcp_bbr_phase bbr_get_phase(struct bbr *bbr) +{ + switch (bbr->mode) { + case BBR_STARTUP: + return BBR_PHASE_STARTUP; + case BBR_DRAIN: + return BBR_PHASE_DRAIN; + case BBR_PROBE_BW: + break; + case BBR_PROBE_RTT: + return BBR_PHASE_PROBE_RTT; + default: + return BBR_PHASE_INVALID; + } + switch (bbr->cycle_idx) { + case BBR_BW_PROBE_UP: + return BBR_PHASE_PROBE_BW_UP; + case BBR_BW_PROBE_DOWN: + return BBR_PHASE_PROBE_BW_DOWN; + case BBR_BW_PROBE_CRUISE: + return BBR_PHASE_PROBE_BW_CRUISE; + case BBR_BW_PROBE_REFILL: + return BBR_PHASE_PROBE_BW_REFILL; + default: + return BBR_PHASE_INVALID; + } +} + static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr, - union tcp_cc_info *info) + union tcp_cc_info *info) { if (ext & (1 << (INET_DIAG_BBRINFO - 1)) || ext & (1 << (INET_DIAG_VEGASINFO - 1))) { - struct tcp_sock *tp = tcp_sk(sk); struct bbr *bbr = inet_csk_ca(sk); - u64 bw = bbr_bw(sk); - - bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE; - memset(&info->bbr, 0, sizeof(info->bbr)); - info->bbr.bbr_bw_lo = (u32)bw; - info->bbr.bbr_bw_hi = (u32)(bw >> 32); - info->bbr.bbr_min_rtt = bbr->min_rtt_us; - info->bbr.bbr_pacing_gain = bbr->pacing_gain; - info->bbr.bbr_cwnd_gain = bbr->cwnd_gain; + u64 bw = bbr_bw_bytes_per_sec(sk, bbr_bw(sk)); + u64 bw_hi = bbr_bw_bytes_per_sec(sk, bbr_max_bw(sk)); + u64 bw_lo = bbr->bw_lo == ~0U ? + ~0ULL : bbr_bw_bytes_per_sec(sk, bbr->bw_lo); + struct tcp_bbr_info *bbr_info = &info->bbr; + + memset(bbr_info, 0, sizeof(*bbr_info)); + bbr_info->bbr_bw_lo = (u32)bw; + bbr_info->bbr_bw_hi = (u32)(bw >> 32); + bbr_info->bbr_min_rtt = bbr->min_rtt_us; + bbr_info->bbr_pacing_gain = bbr->pacing_gain; + bbr_info->bbr_cwnd_gain = bbr->cwnd_gain; + bbr_info->bbr_bw_hi_lsb = (u32)bw_hi; + bbr_info->bbr_bw_hi_msb = (u32)(bw_hi >> 32); + bbr_info->bbr_bw_lo_lsb = (u32)bw_lo; + bbr_info->bbr_bw_lo_msb = (u32)(bw_lo >> 32); + bbr_info->bbr_mode = bbr->mode; + bbr_info->bbr_phase = (__u8)bbr_get_phase(bbr); + bbr_info->bbr_version = (__u8)BBR_VERSION; + bbr_info->bbr_inflight_lo = bbr->inflight_lo; + bbr_info->bbr_inflight_hi = bbr->inflight_hi; + bbr_info->bbr_extra_acked = bbr_extra_acked(sk); *attr = INET_DIAG_BBRINFO; - return sizeof(info->bbr); + return sizeof(*bbr_info); } return 0; } static void bbr_set_state(struct sock *sk, u8 new_state) { + struct tcp_sock *tp = tcp_sk(sk); struct bbr *bbr = inet_csk_ca(sk); if (new_state == TCP_CA_Loss) { - struct rate_sample rs = { .losses = 1 }; bbr->prev_ca_state = TCP_CA_Loss; - bbr->full_bw = 0; - bbr->round_start = 1; /* treat RTO like end of a round */ - bbr_lt_bw_sampling(sk, &rs); + tcp_plb_update_state_upon_rto(sk, &bbr->plb); + /* The tcp_write_timeout() call to sk_rethink_txhash() likely + * repathed this flow, so re-learn the min network RTT on the + * new path: + */ + bbr_reset_full_bw(sk); + if (!bbr_is_probing_bandwidth(sk) && bbr->inflight_lo == ~0U) { + /* bbr_adapt_lower_bounds() needs cwnd before + * we suffered an RTO, to update inflight_lo: + */ + bbr->inflight_lo = + max(tcp_snd_cwnd(tp), bbr->prior_cwnd); + } + } else if (bbr->prev_ca_state == TCP_CA_Loss && + new_state != TCP_CA_Loss) { + bbr_exit_loss_recovery(sk); } } + static struct tcp_congestion_ops tcp_bbr_cong_ops 
__read_mostly = { - .flags = TCP_CONG_NON_RESTRICTED, + .flags = TCP_CONG_NON_RESTRICTED | TCP_CONG_WANTS_CE_EVENTS, .name = "bbr", .owner = THIS_MODULE, .init = bbr_init, .cong_control = bbr_main, .sndbuf_expand = bbr_sndbuf_expand, + .skb_marked_lost = bbr_skb_marked_lost, .undo_cwnd = bbr_undo_cwnd, .cwnd_event = bbr_cwnd_event, .ssthresh = bbr_ssthresh, @@ -1185,5 +2368,12 @@ MODULE_AUTHOR("Van Jacobson "); MODULE_AUTHOR("Neal Cardwell "); MODULE_AUTHOR("Yuchung Cheng "); MODULE_AUTHOR("Soheil Hassas Yeganeh "); +MODULE_AUTHOR("Priyaranjan Jha "); +MODULE_AUTHOR("Yousuk Seung "); +MODULE_AUTHOR("Kevin Yang "); +MODULE_AUTHOR("Arjun Roy "); +MODULE_AUTHOR("David Morley "); + MODULE_LICENSE("Dual BSD/GPL"); MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); +MODULE_VERSION(__stringify(BBR_VERSION)); From b6f7f64ec624618318ef9b8d3ae58dce34f7f9a2 Mon Sep 17 00:00:00 2001 From: Juhyung Park Date: Fri, 22 Mar 2024 17:40:57 +0900 Subject: [PATCH 24/61] net-tcp_bbr: introduce bbr_param_enabled macro for boolean check The compiler might get confused that we should've been using bit-wise arithmetics without this. Signed-off-by: Juhyung Park --- net/ipv4/tcp_bbr.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 5feb473b1bd76..72744c96a54bc 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -70,6 +70,7 @@ #define BBR_VERSION 3 #define bbr_param(sk,name) (bbr_ ## name) +#define bbr_param_enabled(sk,name) (!!(bbr_ ## name)) /* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps. @@ -488,7 +489,7 @@ static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now, * K = 2^tso_rtt_shift microseconds of min_rtt, halve the burst. 
* The min_rtt-based burst allowance is: 64 KBytes / 2^(min_rtt/K) */ - if (bbr_param(sk, tso_rtt_shift)) { + if (bbr_param_enabled(sk, tso_rtt_shift)) { r = bbr->min_rtt_us >> bbr_param(sk, tso_rtt_shift); if (r < BITS_PER_TYPE(u32)) /* prevent undefined behavior */ bytes += GSO_MAX_SIZE >> r; @@ -547,12 +548,12 @@ static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) } else if ((event == CA_EVENT_ECN_IS_CE || event == CA_EVENT_ECN_NO_CE) && bbr_can_use_ecn(sk) && - bbr_param(sk, precise_ece_ack)) { + bbr_param_enabled(sk, precise_ece_ack)) { u32 state = bbr->ce_state; dctcp_ece_ack_update(sk, event, &bbr->prior_rcv_nxt, &state); bbr->ce_state = state; } else if (event == CA_EVENT_TLP_RECOVERY && - bbr_param(sk, loss_probe_recovery)) { + bbr_param_enabled(sk, loss_probe_recovery)) { bbr_run_loss_probe_recovery(sk); } } @@ -667,7 +668,7 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk) { u32 max_aggr_cwnd, aggr_cwnd = 0; - if (bbr_param(sk, extra_acked_gain)) { + if (bbr_param_enabled(sk, extra_acked_gain)) { max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us) / BW_UNIT; aggr_cwnd = (bbr_param(sk, extra_acked_gain) * bbr_extra_acked(sk)) @@ -802,7 +803,7 @@ static void bbr_update_ack_aggregation(struct sock *sk, struct tcp_sock *tp = tcp_sk(sk); u32 extra_acked_win_rtts_thresh = bbr_param(sk, extra_acked_win_rtts); - if (!bbr_param(sk, extra_acked_gain) || rs->acked_sacked <= 0 || + if (!bbr_param_enabled(sk, extra_acked_gain) || rs->acked_sacked <= 0 || rs->delivered < 0 || rs->interval_us <= 0) return; @@ -953,7 +954,7 @@ static void bbr_update_gains(struct sock *sk) case BBR_PROBE_BW: bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx]; bbr->cwnd_gain = bbr_param(sk, cwnd_gain); - if (bbr_param(sk, bw_probe_cwnd_gain) && + if (bbr_param_enabled(sk, bw_probe_cwnd_gain) && bbr->cycle_idx == BBR_BW_PROBE_UP) bbr->cwnd_gain += BBR_UNIT * bbr_param(sk, bw_probe_cwnd_gain) / 4; @@ -1048,15 +1049,15 @@ static void bbr_check_ecn_too_high_in_startup(struct sock *sk, u32 ce_ratio) struct bbr *bbr = inet_csk_ca(sk); if (bbr_full_bw_reached(sk) || !bbr->ecn_eligible || - !bbr_param(sk, full_ecn_cnt) || !bbr_param(sk, ecn_thresh)) + !bbr_param_enabled(sk, full_ecn_cnt) || !bbr_param_enabled(sk, ecn_thresh)) return; - if (ce_ratio >= bbr_param(sk, ecn_thresh)) + if (ce_ratio >= bbr_param_enabled(sk, ecn_thresh)) bbr->startup_ecn_rounds++; else bbr->startup_ecn_rounds = 0; - if (bbr->startup_ecn_rounds >= bbr_param(sk, full_ecn_cnt)) { + if (bbr->startup_ecn_rounds >= bbr_param_enabled(sk, full_ecn_cnt)) { bbr_handle_queue_too_high_in_startup(sk); return; } @@ -1075,7 +1076,7 @@ static int bbr_update_ecn_alpha(struct sock *sk) /* See if we should use ECN sender logic for this connection. */ if (!bbr->ecn_eligible && bbr_can_use_ecn(sk) && - bbr_param(sk, ecn_factor) && + bbr_param_enabled(sk, ecn_factor) && (bbr->min_rtt_us <= bbr_ecn_max_rtt_us || !bbr_ecn_max_rtt_us)) bbr->ecn_eligible = 1; @@ -1182,7 +1183,7 @@ static bool bbr_is_inflight_too_high(const struct sock *sk, } if (rs->delivered_ce > 0 && rs->delivered > 0 && - bbr->ecn_eligible && bbr_param(sk, ecn_thresh)) { + bbr->ecn_eligible && bbr_param_enabled(sk, ecn_thresh)) { ecn_thresh = (u64)rs->delivered * bbr_param(sk, ecn_thresh) >> BBR_SCALE; if (rs->delivered_ce > ecn_thresh) { @@ -1380,7 +1381,7 @@ static void bbr_adapt_lower_bounds(struct sock *sk, return; /* ECN response. 
*/ - if (bbr->ecn_in_round && bbr_param(sk, ecn_factor)) { + if (bbr->ecn_in_round && bbr_param_enabled(sk, ecn_factor)) { bbr_init_lower_bounds(sk, false); bbr_ecn_lower_bounds(sk, &ecn_inflight_lo); } @@ -1717,7 +1718,7 @@ static bool bbr_check_time_to_probe_bw(struct sock *sk, * but we are seeing ECN marks, then when the ECN marks cease we reprobe * quickly (in case cross-traffic has ceased and freed up bw). */ - if (bbr_param(sk, ecn_reprobe_gain) && bbr->ecn_eligible && + if (bbr_param_enabled(sk, ecn_reprobe_gain) && bbr->ecn_eligible && bbr->ecn_in_cycle && !bbr->loss_in_cycle && inet_csk(sk)->icsk_ca_state == TCP_CA_Open) { /* Calculate n so that when bbr_raise_inflight_hi_slope() @@ -1900,7 +1901,7 @@ static void bbr_check_loss_too_high_in_startup(struct sock *sk, */ if (rs->losses && bbr->loss_events_in_round < 0xf) bbr->loss_events_in_round++; /* update saturating counter */ - if (bbr_param(sk, full_loss_cnt) && bbr->loss_round_start && + if (bbr_param_enabled(sk, full_loss_cnt) && bbr->loss_round_start && inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery && bbr->loss_events_in_round >= bbr_param(sk, full_loss_cnt) && bbr_is_inflight_too_high(sk, rs)) { @@ -2010,7 +2011,7 @@ static bool bbr_run_fast_path(struct sock *sk, bool *update_model, struct bbr *bbr = inet_csk_ca(sk); u32 prev_min_rtt_us, prev_mode; - if (bbr_param(sk, fast_path) && bbr->try_fast_path && + if (bbr_param_enabled(sk, fast_path) && bbr->try_fast_path && rs->is_app_limited && ctx->sample_bw < bbr_max_bw(sk) && !bbr->loss_in_round && !bbr->ecn_in_round ) { prev_mode = bbr->mode; From b2ec79b9665a3e21be7b9638f7e0652aeef48322 Mon Sep 17 00:00:00 2001 From: Adithya Abraham Philip Date: Fri, 11 Jun 2021 21:56:10 +0000 Subject: [PATCH 25/61] net-tcp_bbr: v3: ensure ECN-enabled BBR flows set ECT on retransmits Adds a new flag TCP_ECN_ECT_PERMANENT that is used by CCAs to indicate that retransmitted packets and pure ACKs must have the ECT bit set. This is necessary for BBR, which when using ECN expects ECT to be set even on retransmitted packets and ACKs. Previous to this addition of TCP_ECN_ECT_PERMANENT, CCAs which can use ECN but don't "need" it did not have a way to indicate that ECT should be set on retransmissions/ACKs. Signed-off-by: Adithya Abraham Philip Signed-off-by: Neal Cardwell Change-Id: I8b048eaab35e136fe6501ef6cd89fd9faa15e6d2 Signed-off-by: Juhyung Park --- include/net/tcp.h | 1 + net/ipv4/tcp_bbr.c | 3 +++ net/ipv4/tcp_output.c | 3 ++- 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 8915fd3556267..f4bd23459b986 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -373,6 +373,7 @@ static inline void tcp_dec_quickack_mode(struct sock *sk) #define TCP_ECN_DEMAND_CWR 4 #define TCP_ECN_SEEN 8 #define TCP_ECN_LOW 16 +#define TCP_ECN_ECT_PERMANENT 32 enum tcp_tw_status { TCP_TW_SUCCESS = 0, diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 72744c96a54bc..2548767c2c703 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -2150,6 +2150,9 @@ static void bbr_init(struct sock *sk) bbr->plb.pause_until = 0; tp->fast_ack_mode = bbr_fast_ack_mode ? 1 : 0; + + if (bbr_can_use_ecn(sk)) + tp->ecn_flags |= TCP_ECN_ECT_PERMANENT; } /* BBR marks the current round trip as a loss round. 
*/ diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 5dc37e3f59c60..c0011333cb511 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -376,7 +376,8 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, th->cwr = 1; skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; } - } else if (!tcp_ca_needs_ecn(sk)) { + } else if (!(tp->ecn_flags & TCP_ECN_ECT_PERMANENT) && + !tcp_ca_needs_ecn(sk)) { /* ACK or retransmitted segment: clear ECT|CE */ INET_ECN_dontxmit(sk); } From fe9179e28666fa3aa262eda1e66ed68ca7dbde33 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Sun, 23 Jul 2023 23:25:34 -0400 Subject: [PATCH 26/61] tcp: export TCPI_OPT_ECN_LOW in tcp_info tcpi_options field Analogous to other important ECN information, export TCPI_OPT_ECN_LOW in tcp_info tcpi_options field. Signed-off-by: Neal Cardwell Change-Id: I08d8d8c7e8780e6e37df54038ee50301ac5a0320 Signed-off-by: Juhyung Park --- include/uapi/linux/tcp.h | 1 + net/ipv4/tcp.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index dfeaf70969ae8..45c4273abf207 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -170,6 +170,7 @@ enum tcp_fastopen_client_fail { #define TCPI_OPT_ECN 8 /* ECN was negociated at TCP session init */ #define TCPI_OPT_ECN_SEEN 16 /* we received at least one packet with ECT */ #define TCPI_OPT_SYN_DATA 32 /* SYN-ACK acked data in SYN sent or rcvd */ +#define TCPI_OPT_ECN_LOW 64 /* Low-latency ECN configured at init */ /* * Sender's congestion state indicating normal or abnormal situations diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 58fcaef32108d..686878b3b0570 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3561,6 +3561,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_options |= TCPI_OPT_ECN; if (tp->ecn_flags & TCP_ECN_SEEN) info->tcpi_options |= TCPI_OPT_ECN_SEEN; + if (tp->ecn_flags & TCP_ECN_LOW) + info->tcpi_options |= TCPI_OPT_ECN_LOW; if (tp->syn_data_acked) info->tcpi_options |= TCPI_OPT_SYN_DATA; From b4e9a526e58d1849c3e8f3c096209aaed3b9a9e7 Mon Sep 17 00:00:00 2001 From: Juhyung Park Date: Tue, 2 Jul 2024 19:21:22 +0900 Subject: [PATCH 27/61] net-tcp_bbr: v3: merge changes from bbr-v3-2024-02-22-01 Signed-off-by: Juhyung Park --- net/ipv4/tcp_bbr.c | 47 +++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 2548767c2c703..7edde2783fcbc 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -37,10 +37,10 @@ * In steady state a BBR flow only uses PROBE_BW and PROBE_RTT. * A long-lived BBR flow spends the vast majority of its time remaining * (repeatedly) in PROBE_BW, fully probing and utilizing the pipe's bandwidth - * in a fair manner, with a small, bounded queue. *If* a flow has been - * continuously sending for the entire min_rtt window, and hasn't seen an RTT - * sample that matches or decreases its min_rtt estimate for 10 seconds, then - * it briefly enters PROBE_RTT to cut inflight to a minimum value to re-probe + * in a fair manner, with a small, bounded queue. *If* a flow has not seen + * a measured RTT reduction for probe_rtt_win_ms (likely meaning it has been + * continuously sending for the entire probe_rtt_win_ms interval) then + * it briefly enters PROBE_RTT to cut inflight to a smaller value to re-probe * the path's two-way propagation delay (min_rtt). 
When exiting PROBE_RTT, if * we estimated that we reached the full bw of the pipe then we enter PROBE_BW; * otherwise we enter STARTUP to try to fill the pipe. @@ -231,7 +231,7 @@ enum bbr_pacing_gain_phase { BBR_BW_PROBE_UP = 0, /* push up inflight to probe for bw/vol */ BBR_BW_PROBE_DOWN = 1, /* drain excess inflight from the queue */ BBR_BW_PROBE_CRUISE = 2, /* use pipe, w/ headroom in queue/pipe */ - BBR_BW_PROBE_REFILL = 3, /* v2: refill the pipe again to 100% */ + BBR_BW_PROBE_REFILL = 3, /* try to refill the pipe again to 100% */ }; /* Try to keep at least this many packets in flight, if things go smoothly. For @@ -434,7 +434,7 @@ static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) rate = bbr_rate_bytes_per_sec(sk, rate, gain, bbr_pacing_margin_percent); - rate = min_t(u64, rate, sk->sk_max_pacing_rate); + rate = min_t(u64, rate, READ_ONCE(sk->sk_max_pacing_rate)); return rate; } @@ -454,8 +454,8 @@ static void bbr_init_pacing_rate_from_rtt(struct sock *sk) } bw = (u64)tcp_snd_cwnd(tp) * BW_UNIT; do_div(bw, rtt_us); - sk->sk_pacing_rate = - bbr_bw_to_pacing_rate(sk, bw, bbr_param(sk, startup_pacing_gain)); + WRITE_ONCE(sk->sk_pacing_rate, + bbr_bw_to_pacing_rate(sk, bw, bbr_param(sk, startup_pacing_gain))); } /* Pace using current bw estimate and a gain factor. */ @@ -467,8 +467,8 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) if (unlikely(!bbr->has_seen_rtt && tp->srtt_us)) bbr_init_pacing_rate_from_rtt(sk); - if (bbr_full_bw_reached(sk) || rate > sk->sk_pacing_rate) - sk->sk_pacing_rate = rate; + if (bbr_full_bw_reached(sk) || rate > READ_ONCE(sk->sk_pacing_rate)) + WRITE_ONCE(sk->sk_pacing_rate, rate); } /* Return the number of segments BBR would like in a TSO/GSO skb, given a @@ -483,7 +483,7 @@ static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now, u64 bytes; /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */ - bytes = sk->sk_pacing_rate >> sk->sk_pacing_shift; + bytes = READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift); /* Budget a TSO/GSO burst size allowance based on min_rtt. For every * K = 2^tso_rtt_shift microseconds of min_rtt, halve the burst. @@ -867,16 +867,18 @@ static void bbr_check_probe_rtt_done(struct sock *sk) * small (reducing queuing delay and packet loss) and achieve fairness among * BBR flows. * - * The min_rtt filter window is 10 seconds. When the min_rtt estimate expires, - * we enter PROBE_RTT mode and cap the cwnd at bbr_cwnd_min_target=4 packets. + * The PROBE_RTT window is probe_rtt_win_ms (5 sec). If this elapses without + * measuring a lower RTT sample, we enter PROBE_RTT mode and reduce cwnd + * using the bbr_probe_rtt_cwnd_gain factor of 0.5x, i.e. cwnd ~= 0.5 * est_BDP. * After at least bbr_probe_rtt_mode_ms=200ms and at least one packet-timed - * round trip elapsed with that flight size <= 4, we leave PROBE_RTT mode and + * round trip elapsed with the lower flight size, we leave PROBE_RTT mode and * re-enter the previous mode. BBR uses 200ms to approximately bound the - * performance penalty of PROBE_RTT's cwnd capping to roughly 2% (200ms/10s). + * performance penalty of PROBE_RTT's cwnd capping to roughly 2% (note that + * the expected bandwidth utilization is 0.5*.2/5 + 1.0*( 5 - .2)/5 = 0.98). * - * Note that flows need only pay 2% if they are busy sending over the last 10 + * Note that flows need only pay 2% if they are busy sending over the last 5 * seconds. 
Interactive applications (e.g., Web, RPCs, video chunks) often have - * natural silences or low-rate periods within 10 seconds where the rate is low + * natural silences or low-rate periods within 5 seconds where the rate is low * enough for long enough to drain its queue in the bottleneck. We pick up * these min RTT measurements opportunistically with our min_rtt filter. :-) */ @@ -1163,10 +1165,9 @@ static void bbr_probe_inflight_hi_upward(struct sock *sk, bbr_raise_inflight_hi_slope(sk); } -/* Does loss/ECN rate for this sample say inflight is "too high"? - * This is used by both the bbr_check_loss_too_high_in_startup() function, - * which can be used in either v1 or v2, and the PROBE_UP phase of v2, which - * uses it to notice when loss/ECN rates suggest inflight is too high. +/* Does loss/ECN rate for this sample suggest inflight is "too high"? This is + * used both in STARTUP and BBR_BW_PROBE_UP, to notice when loss/ECN rates + * suggest the volume of in-flight data is too high. */ static bool bbr_is_inflight_too_high(const struct sock *sk, const struct rate_sample *rs) @@ -1311,7 +1312,7 @@ static void bbr_bound_cwnd_for_inflight_model(struct sock *sk) } /* How should we multiplicatively cut bw or inflight limits based on ECN? */ -u32 bbr_ecn_cut(struct sock *sk) +static u32 bbr_ecn_cut(struct sock *sk) { struct bbr *bbr = inet_csk_ca(sk); @@ -2032,7 +2033,7 @@ static bool bbr_run_fast_path(struct sock *sk, bool *update_model, return false; } -void bbr_main(struct sock *sk, const struct rate_sample *rs) +static void bbr_main(struct sock *sk, const struct rate_sample *rs) { struct tcp_sock *tp = tcp_sk(sk); struct bbr *bbr = inet_csk_ca(sk); From dfb34bbde2c735a08f643514516aaa777effae78 Mon Sep 17 00:00:00 2001 From: Rasenkai Date: Mon, 13 Jan 2025 11:52:08 +0530 Subject: [PATCH 28/61] arm64: rockchip_linux: Use BBR as default TCP congestion control Signed-off-by: Rasenkai --- arch/arm64/configs/rockchip_linux_defconfig | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/arm64/configs/rockchip_linux_defconfig b/arch/arm64/configs/rockchip_linux_defconfig index 74cb4ab1179ac..6b517c223b5c3 100644 --- a/arch/arm64/configs/rockchip_linux_defconfig +++ b/arch/arm64/configs/rockchip_linux_defconfig @@ -891,3 +891,8 @@ CONFIG_RCU_CPU_STALL_TIMEOUT=60 CONFIG_FUNCTION_TRACER=y CONFIG_BLK_DEV_IO_TRACE=y CONFIG_LKDTM=y +CONFIG_TCP_CONG_ADVANCED=y +CONFIG_TCP_CONG_BBR=y +# CONFIG_DEFAULT_CUBIC is not set +CONFIG_DEFAULT_BBR=y +CONFIG_DEFAULT_TCP_CONG="bbr" From 13a73d7d9bca44989a73ec1aa565ffc71bddb17b Mon Sep 17 00:00:00 2001 From: gushengxian Date: Wed, 9 Jun 2021 20:09:35 -0700 Subject: [PATCH 29/61] af_unix: remove the repeated word "and" Remove the repeated word "and". Signed-off-by: gushengxian Signed-off-by: David S. 
Miller --- net/unix/af_unix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index e2ef754f4e9b4..5bfe11333af6a 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1399,7 +1399,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, unix_state_unlock(sk); - /* take ten and and send info to listening sock */ + /* take ten and send info to listening sock */ spin_lock(&other->sk_receive_queue.lock); __skb_queue_tail(&other->sk_receive_queue, skb); spin_unlock(&other->sk_receive_queue.lock); From 3de437d8355132b26a72c64b238fb4c060aa0eeb Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 19 Jun 2021 03:50:26 +0000 Subject: [PATCH 30/61] af_unix: take address assignment/hash insertion into a new helper Duplicated logics in all bind variants (autobind, bind-to-path, bind-to-abstract) gets taken into a common helper. Signed-off-by: Al Viro Signed-off-by: David S. Miller --- net/unix/af_unix.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 5bfe11333af6a..83c815afd2721 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -256,6 +256,14 @@ static void __unix_insert_socket(struct hlist_head *list, struct sock *sk) sk_add_node(sk, list); } +static void __unix_set_addr(struct sock *sk, struct unix_address *addr, + unsigned hash) +{ + __unix_remove_socket(sk); + smp_store_release(&unix_sk(sk)->addr, addr); + __unix_insert_socket(&unix_socket_table[hash], sk); +} + static inline void unix_remove_socket(struct sock *sk) { spin_lock(&unix_table_lock); @@ -922,9 +930,7 @@ static int unix_autobind(struct socket *sock) } addr->hash ^= sk->sk_type; - __unix_remove_socket(sk); - smp_store_release(&u->addr, addr); - __unix_insert_socket(&unix_socket_table[addr->hash], sk); + __unix_set_addr(sk, addr, addr->hash); spin_unlock(&unix_table_lock); err = 0; @@ -1026,7 +1032,6 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) int err; unsigned int hash; struct unix_address *addr; - struct hlist_head *list; struct path path = { }; err = -EINVAL; @@ -1078,25 +1083,20 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1); spin_lock(&unix_table_lock); u->path = path; - list = &unix_socket_table[hash]; } else { spin_lock(&unix_table_lock); err = -EADDRINUSE; if (__unix_find_socket_byname(net, sunaddr, addr_len, sk->sk_type, hash)) { + spin_unlock(&unix_table_lock); unix_release_addr(addr); - goto out_unlock; + goto out_up; } - - list = &unix_socket_table[addr->hash]; + hash = addr->hash; } err = 0; - __unix_remove_socket(sk); - smp_store_release(&u->addr, addr); - __unix_insert_socket(list, sk); - -out_unlock: + __unix_set_addr(sk, addr, hash); spin_unlock(&unix_table_lock); out_up: mutex_unlock(&u->bindlock); From c8baa6d33468cde97c42a626caf46e30948bc60a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 19 Jun 2021 03:50:27 +0000 Subject: [PATCH 31/61] unix_bind(): allocate addr earlier makes it easier to massage; we do pay for that by extra work (kmalloc+memcpy+kfree) in some error cases, but those are not on the hot paths anyway. Signed-off-by: Al Viro Signed-off-by: David S. 
Miller --- net/unix/af_unix.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 83c815afd2721..c72da7ee6e6f0 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1048,6 +1048,15 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (err < 0) goto out; addr_len = err; + err = -ENOMEM; + addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL); + if (!addr) + goto out; + + memcpy(addr->name, sunaddr, addr_len); + addr->len = addr_len; + addr->hash = hash ^ sk->sk_type; + refcount_set(&addr->refcnt, 1); if (sun_path[0]) { umode_t mode = S_IFSOCK | @@ -1056,7 +1065,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (err) { if (err == -EEXIST) err = -EADDRINUSE; - goto out; + goto out_addr; } } @@ -1068,16 +1077,6 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (u->addr) goto out_up; - err = -ENOMEM; - addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL); - if (!addr) - goto out_up; - - memcpy(addr->name, sunaddr, addr_len); - addr->len = addr_len; - addr->hash = hash ^ sk->sk_type; - refcount_set(&addr->refcnt, 1); - if (sun_path[0]) { addr->hash = UNIX_HASH_SIZE; hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1); @@ -1089,20 +1088,23 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (__unix_find_socket_byname(net, sunaddr, addr_len, sk->sk_type, hash)) { spin_unlock(&unix_table_lock); - unix_release_addr(addr); goto out_up; } hash = addr->hash; } - err = 0; __unix_set_addr(sk, addr, hash); spin_unlock(&unix_table_lock); + addr = NULL; + err = 0; out_up: mutex_unlock(&u->bindlock); out_put: if (err) path_put(&path); +out_addr: + if (addr) + unix_release_addr(addr); out: return err; } From 82295e928aa4c6a1039ea39e8d97c7393fc5fa9e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 19 Jun 2021 03:50:28 +0000 Subject: [PATCH 32/61] unix_bind(): separate BSD and abstract cases We do get some duplication that way, but it's minor compared to parts that are different. What we get is an ability to change locking in BSD case without making failure exits very hard to follow. Signed-off-by: Al Viro Signed-off-by: David S. 
Miller --- net/unix/af_unix.c | 55 ++++++++++++++++++++++++++++------------------ 1 file changed, 34 insertions(+), 21 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index c72da7ee6e6f0..de5d3d0cc55a8 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1032,7 +1032,6 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) int err; unsigned int hash; struct unix_address *addr; - struct path path = { }; err = -EINVAL; if (addr_len < offsetofend(struct sockaddr_un, sun_family) || @@ -1059,6 +1058,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) refcount_set(&addr->refcnt, 1); if (sun_path[0]) { + struct path path = { }; umode_t mode = S_IFSOCK | (SOCK_INODE(sock)->i_mode & ~current_umask()); err = unix_mknod(sun_path, mode, &path); @@ -1067,41 +1067,54 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) err = -EADDRINUSE; goto out_addr; } - } - err = mutex_lock_interruptible(&u->bindlock); - if (err) - goto out_put; + err = mutex_lock_interruptible(&u->bindlock); + if (err) { + path_put(&path); + goto out_addr; + } - err = -EINVAL; - if (u->addr) - goto out_up; + err = -EINVAL; + if (u->addr) { + mutex_unlock(&u->bindlock); + path_put(&path); + goto out_addr; + } - if (sun_path[0]) { addr->hash = UNIX_HASH_SIZE; hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1); spin_lock(&unix_table_lock); u->path = path; + __unix_set_addr(sk, addr, hash); + spin_unlock(&unix_table_lock); + mutex_unlock(&u->bindlock); + addr = NULL; + err = 0; } else { + err = mutex_lock_interruptible(&u->bindlock); + if (err) + goto out_addr; + + err = -EINVAL; + if (u->addr) { + mutex_unlock(&u->bindlock); + goto out_addr; + } + spin_lock(&unix_table_lock); err = -EADDRINUSE; if (__unix_find_socket_byname(net, sunaddr, addr_len, sk->sk_type, hash)) { spin_unlock(&unix_table_lock); - goto out_up; + mutex_unlock(&u->bindlock); + goto out_addr; } - hash = addr->hash; + __unix_set_addr(sk, addr, addr->hash); + spin_unlock(&unix_table_lock); + mutex_unlock(&u->bindlock); + addr = NULL; + err = 0; } - - __unix_set_addr(sk, addr, hash); - spin_unlock(&unix_table_lock); - addr = NULL; - err = 0; -out_up: - mutex_unlock(&u->bindlock); -out_put: - if (err) - path_put(&path); out_addr: if (addr) unix_release_addr(addr); From 5ed7d81243260b2e51cb058ef38fd59afb04371e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 19 Jun 2021 03:50:29 +0000 Subject: [PATCH 33/61] unix_bind(): take BSD and abstract address cases into new helpers unix_bind_bsd() and unix_bind_abstract() respectively. Signed-off-by: Al Viro Signed-off-by: David S. 
Miller --- net/unix/af_unix.c | 147 +++++++++++++++++++++++---------------------- 1 file changed, 74 insertions(+), 73 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index de5d3d0cc55a8..58de24442d006 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1022,104 +1022,105 @@ static int unix_mknod(const char *sun_path, umode_t mode, struct path *res) return err; } +static int unix_bind_bsd(struct sock *sk, struct unix_address *addr) +{ + struct unix_sock *u = unix_sk(sk); + struct path path = { }; + umode_t mode = S_IFSOCK | + (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask()); + unsigned int hash; + int err; + + err = unix_mknod(addr->name->sun_path, mode, &path); + if (err) + return err; + + err = mutex_lock_interruptible(&u->bindlock); + if (err) { + path_put(&path); + return err; + } + + if (u->addr) { + mutex_unlock(&u->bindlock); + path_put(&path); + return -EINVAL; + } + + addr->hash = UNIX_HASH_SIZE; + hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1); + spin_lock(&unix_table_lock); + u->path = path; + __unix_set_addr(sk, addr, hash); + spin_unlock(&unix_table_lock); + mutex_unlock(&u->bindlock); + return 0; +} + +static int unix_bind_abstract(struct sock *sk, unsigned hash, + struct unix_address *addr) +{ + struct unix_sock *u = unix_sk(sk); + int err; + + err = mutex_lock_interruptible(&u->bindlock); + if (err) + return err; + + if (u->addr) { + mutex_unlock(&u->bindlock); + return -EINVAL; + } + + spin_lock(&unix_table_lock); + if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len, + sk->sk_type, hash)) { + spin_unlock(&unix_table_lock); + mutex_unlock(&u->bindlock); + return -EADDRINUSE; + } + __unix_set_addr(sk, addr, addr->hash); + spin_unlock(&unix_table_lock); + mutex_unlock(&u->bindlock); + return 0; +} + static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; - struct net *net = sock_net(sk); - struct unix_sock *u = unix_sk(sk); struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; char *sun_path = sunaddr->sun_path; int err; unsigned int hash; struct unix_address *addr; - err = -EINVAL; if (addr_len < offsetofend(struct sockaddr_un, sun_family) || sunaddr->sun_family != AF_UNIX) - goto out; + return -EINVAL; - if (addr_len == sizeof(short)) { - err = unix_autobind(sock); - goto out; - } + if (addr_len == sizeof(short)) + return unix_autobind(sock); err = unix_mkname(sunaddr, addr_len, &hash); if (err < 0) - goto out; + return err; addr_len = err; - err = -ENOMEM; addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL); if (!addr) - goto out; + return -ENOMEM; memcpy(addr->name, sunaddr, addr_len); addr->len = addr_len; addr->hash = hash ^ sk->sk_type; refcount_set(&addr->refcnt, 1); - if (sun_path[0]) { - struct path path = { }; - umode_t mode = S_IFSOCK | - (SOCK_INODE(sock)->i_mode & ~current_umask()); - err = unix_mknod(sun_path, mode, &path); - if (err) { - if (err == -EEXIST) - err = -EADDRINUSE; - goto out_addr; - } - - err = mutex_lock_interruptible(&u->bindlock); - if (err) { - path_put(&path); - goto out_addr; - } - - err = -EINVAL; - if (u->addr) { - mutex_unlock(&u->bindlock); - path_put(&path); - goto out_addr; - } - - addr->hash = UNIX_HASH_SIZE; - hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1); - spin_lock(&unix_table_lock); - u->path = path; - __unix_set_addr(sk, addr, hash); - spin_unlock(&unix_table_lock); - mutex_unlock(&u->bindlock); - addr = NULL; - err = 0; - } else { - err = mutex_lock_interruptible(&u->bindlock); - if 
(err) - goto out_addr; - - err = -EINVAL; - if (u->addr) { - mutex_unlock(&u->bindlock); - goto out_addr; - } - - spin_lock(&unix_table_lock); - err = -EADDRINUSE; - if (__unix_find_socket_byname(net, sunaddr, addr_len, - sk->sk_type, hash)) { - spin_unlock(&unix_table_lock); - mutex_unlock(&u->bindlock); - goto out_addr; - } - __unix_set_addr(sk, addr, addr->hash); - spin_unlock(&unix_table_lock); - mutex_unlock(&u->bindlock); - addr = NULL; - err = 0; - } -out_addr: - if (addr) + if (sun_path[0]) + err = unix_bind_bsd(sk, addr); + else + err = unix_bind_abstract(sk, hash, addr); + if (err) unix_release_addr(addr); -out: - return err; + return err == -EEXIST ? -EADDRINUSE : err; } static void unix_state_double_lock(struct sock *sk1, struct sock *sk2) From 0dd82e3d5bce78433277ef23b115507288377d13 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 19 Jun 2021 03:50:30 +0000 Subject: [PATCH 34/61] fold unix_mknod() into unix_bind_bsd() Signed-off-by: Al Viro Signed-off-by: David S. Miller Signed-off-by: Juhyung Park --- net/unix/af_unix.c | 39 +++++++++++++++------------------------ 1 file changed, 15 insertions(+), 24 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 58de24442d006..8b177a7b7cce1 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -993,45 +993,36 @@ static struct sock *unix_find_other(struct net *net, return NULL; } -static int unix_mknod(const char *sun_path, umode_t mode, struct path *res) +static int unix_bind_bsd(struct sock *sk, struct unix_address *addr) { + struct unix_sock *u = unix_sk(sk); + umode_t mode = S_IFSOCK | + (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask()); + struct path parent, path; struct dentry *dentry; - struct path path; - int err = 0; + unsigned int hash; + int err; + /* * Get the parent directory, calculate the hash for last * component. */ - dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0); - err = PTR_ERR(dentry); + dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0); if (IS_ERR(dentry)) - return err; + return PTR_ERR(dentry); /* * All right, let's create it. */ - err = security_path_mknod(&path, dentry, mode, 0); + err = security_path_mknod(&parent, dentry, mode, 0); if (!err) { - err = vfs_mknod(d_inode(path.dentry), dentry, mode, 0); + err = vfs_mknod(d_inode(parent.dentry), dentry, mode, 0); if (!err) { - res->mnt = mntget(path.mnt); - res->dentry = dget(dentry); + path.mnt = mntget(parent.mnt); + path.dentry = dget(dentry); } } - done_path_create(&path, dentry); - return err; -} - -static int unix_bind_bsd(struct sock *sk, struct unix_address *addr) -{ - struct unix_sock *u = unix_sk(sk); - struct path path = { }; - umode_t mode = S_IFSOCK | - (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask()); - unsigned int hash; - int err; - - err = unix_mknod(addr->name->sun_path, mode, &path); + done_path_create(&parent, dentry); if (err) return err; From 76f42dcd88c9427597c60adb9fdd7b25ab23545d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 19 Jun 2021 03:50:31 +0000 Subject: [PATCH 35/61] unix_bind_bsd(): move done_path_create() call after dealing with ->bindlock Final preparations for doing unlink on failure past the successful mknod. We can't hold ->bindlock over ->mknod() or ->unlink(), since either might do sb_start_write() (e.g. on overlayfs). 
However, we can do it while holding filesystem and VFS locks - doing kern_path_create() vfs_mknod() grab ->bindlock if u->addr had been set drop ->bindlock done_path_create return -EINVAL else assign the address to socket drop ->bindlock done_path_create return 0 would be deadlock-free. Here we massage unix_bind_bsd() to that form. We are still doing equivalent transformations. Next commit will *not* be an equivalent transformation - it will add a call of vfs_unlink() before done_path_create() in "alread bound" case. Signed-off-by: Al Viro Signed-off-by: David S. Miller Signed-off-by: Juhyung Park --- net/unix/af_unix.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 8b177a7b7cce1..9d80de06a8840 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -998,7 +998,7 @@ static int unix_bind_bsd(struct sock *sk, struct unix_address *addr) struct unix_sock *u = unix_sk(sk); umode_t mode = S_IFSOCK | (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask()); - struct path parent, path; + struct path parent; struct dentry *dentry; unsigned int hash; int err; @@ -1015,36 +1015,32 @@ static int unix_bind_bsd(struct sock *sk, struct unix_address *addr) * All right, let's create it. */ err = security_path_mknod(&parent, dentry, mode, 0); - if (!err) { + if (!err) err = vfs_mknod(d_inode(parent.dentry), dentry, mode, 0); - if (!err) { - path.mnt = mntget(parent.mnt); - path.dentry = dget(dentry); - } - } - done_path_create(&parent, dentry); - if (err) + if (err) { + done_path_create(&parent, dentry); return err; - + } err = mutex_lock_interruptible(&u->bindlock); if (err) { - path_put(&path); + done_path_create(&parent, dentry); return err; } - if (u->addr) { mutex_unlock(&u->bindlock); - path_put(&path); + done_path_create(&parent, dentry); return -EINVAL; } addr->hash = UNIX_HASH_SIZE; - hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1); + hash = d_backing_inode(dentry)->i_ino & (UNIX_HASH_SIZE - 1); spin_lock(&unix_table_lock); - u->path = path; + u->path.mnt = mntget(parent.mnt); + u->path.dentry = dget(dentry); __unix_set_addr(sk, addr, hash); spin_unlock(&unix_table_lock); mutex_unlock(&u->bindlock); + done_path_create(&parent, dentry); return 0; } From 1f566c403e0878bbddf2f2bd2e89b7361d8be59c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 19 Jun 2021 03:50:32 +0000 Subject: [PATCH 36/61] unix_bind_bsd(): unlink if we fail after successful mknod We can do that more or less safely, since the parent is held locked all along. Yes, somebody might observe the object via dcache, only to have it disappear afterwards, but there's really no good way to prevent that. It won't race with other bind(2) or attempts to move the sucker elsewhere, or put something else in its place - locked parent prevents that. Signed-off-by: Al Viro Signed-off-by: David S. 
Miller Signed-off-by: Juhyung Park --- net/unix/af_unix.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 9d80de06a8840..e89de840651bc 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1017,20 +1017,13 @@ static int unix_bind_bsd(struct sock *sk, struct unix_address *addr) err = security_path_mknod(&parent, dentry, mode, 0); if (!err) err = vfs_mknod(d_inode(parent.dentry), dentry, mode, 0); - if (err) { - done_path_create(&parent, dentry); - return err; - } + if (err) + goto out; err = mutex_lock_interruptible(&u->bindlock); - if (err) { - done_path_create(&parent, dentry); - return err; - } - if (u->addr) { - mutex_unlock(&u->bindlock); - done_path_create(&parent, dentry); - return -EINVAL; - } + if (err) + goto out_unlink; + if (u->addr) + goto out_unlock; addr->hash = UNIX_HASH_SIZE; hash = d_backing_inode(dentry)->i_ino & (UNIX_HASH_SIZE - 1); @@ -1042,6 +1035,16 @@ static int unix_bind_bsd(struct sock *sk, struct unix_address *addr) mutex_unlock(&u->bindlock); done_path_create(&parent, dentry); return 0; + +out_unlock: + mutex_unlock(&u->bindlock); + err = -EINVAL; +out_unlink: + /* failed after successful mknod? unlink what we'd created... */ + vfs_unlink(d_inode(parent.dentry), dentry, NULL); +out: + done_path_create(&parent, dentry); + return err; } static int unix_bind_abstract(struct sock *sk, unsigned hash, From 21286499093ccff79fd49a59fbbe55b13a76f7f7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 19 Jun 2021 03:50:33 +0000 Subject: [PATCH 37/61] __unix_find_socket_byname(): don't pass hash and type separately We only care about exclusive or of those, so pass that directly. Makes life simpler for callers as well... Signed-off-by: Al Viro Signed-off-by: David S. 
Miller --- net/unix/af_unix.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index e89de840651bc..723e9e543746e 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -280,11 +280,11 @@ static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk) static struct sock *__unix_find_socket_byname(struct net *net, struct sockaddr_un *sunname, - int len, int type, unsigned int hash) + int len, unsigned int hash) { struct sock *s; - sk_for_each(s, &unix_socket_table[hash ^ type]) { + sk_for_each(s, &unix_socket_table[hash]) { struct unix_sock *u = unix_sk(s); if (!net_eq(sock_net(s), net)) @@ -299,13 +299,12 @@ static struct sock *__unix_find_socket_byname(struct net *net, static inline struct sock *unix_find_socket_byname(struct net *net, struct sockaddr_un *sunname, - int len, int type, - unsigned int hash) + int len, unsigned int hash) { struct sock *s; spin_lock(&unix_table_lock); - s = __unix_find_socket_byname(net, sunname, len, type, hash); + s = __unix_find_socket_byname(net, sunname, len, hash); if (s) sock_hold(s); spin_unlock(&unix_table_lock); @@ -908,12 +907,12 @@ static int unix_autobind(struct socket *sock) retry: addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short); addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0)); + addr->hash ^= sk->sk_type; spin_lock(&unix_table_lock); ordernum = (ordernum+1)&0xFFFFF; - if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type, - addr->hash)) { + if (__unix_find_socket_byname(net, addr->name, addr->len, addr->hash)) { spin_unlock(&unix_table_lock); /* * __unix_find_socket_byname() may take long time if many names @@ -928,7 +927,6 @@ static int unix_autobind(struct socket *sock) } goto retry; } - addr->hash ^= sk->sk_type; __unix_set_addr(sk, addr, addr->hash); spin_unlock(&unix_table_lock); @@ -975,7 +973,7 @@ static struct sock *unix_find_other(struct net *net, } } else { err = -ECONNREFUSED; - u = unix_find_socket_byname(net, sunname, len, type, hash); + u = unix_find_socket_byname(net, sunname, len, type ^ hash); if (u) { struct dentry *dentry; dentry = unix_sk(u)->path.dentry; @@ -1047,8 +1045,7 @@ static int unix_bind_bsd(struct sock *sk, struct unix_address *addr) return err; } -static int unix_bind_abstract(struct sock *sk, unsigned hash, - struct unix_address *addr) +static int unix_bind_abstract(struct sock *sk, struct unix_address *addr) { struct unix_sock *u = unix_sk(sk); int err; @@ -1064,7 +1061,7 @@ static int unix_bind_abstract(struct sock *sk, unsigned hash, spin_lock(&unix_table_lock); if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len, - sk->sk_type, hash)) { + addr->hash)) { spin_unlock(&unix_table_lock); mutex_unlock(&u->bindlock); return -EADDRINUSE; @@ -1107,7 +1104,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (sun_path[0]) err = unix_bind_bsd(sk, addr); else - err = unix_bind_abstract(sk, hash, addr); + err = unix_bind_abstract(sk, addr); if (err) unix_release_addr(addr); return err == -EEXIST ? -EADDRINUSE : err; From 474d0e2283aa4a7e41b0e264af8f1b613d810e67 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 24 Nov 2021 11:14:19 +0900 Subject: [PATCH 38/61] af_unix: Use offsetof() instead of sizeof(). The length of the AF_UNIX socket address contains an offset to the member sun_path of struct sockaddr_un. 
Currently, the preceding member is just sun_family, and its type is sa_family_t and resolved to short. Therefore, the offset is represented by sizeof(short). However, it is not clear and fragile to changes in struct sockaddr_storage or sockaddr_un. This commit makes it clear and robust by rewriting sizeof() with offsetof(). Signed-off-by: Kuniyuki Iwashima Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 19 ++++++++++++------- net/unix/diag.c | 3 ++- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 723e9e543746e..8783eba778920 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -224,7 +224,8 @@ static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp { *hashp = 0; - if (len <= sizeof(short) || len > sizeof(*sunaddr)) + if (len <= offsetof(struct sockaddr_un, sun_path) || + len > sizeof(*sunaddr)) return -EINVAL; if (!sunaddr || sunaddr->sun_family != AF_UNIX) return -EINVAL; @@ -237,7 +238,8 @@ static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp * kernel address buffer. */ ((char *)sunaddr)[len] = 0; - len = strlen(sunaddr->sun_path)+1+sizeof(short); + len = strlen(sunaddr->sun_path) + + offsetof(struct sockaddr_un, sun_path) + 1; return len; } @@ -897,7 +899,8 @@ static int unix_autobind(struct socket *sock) goto out; err = -ENOMEM; - addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL); + addr = kzalloc(sizeof(*addr) + + offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL); if (!addr) goto out; @@ -905,7 +908,8 @@ static int unix_autobind(struct socket *sock) refcount_set(&addr->refcnt, 1); retry: - addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short); + addr->len = sprintf(addr->name->sun_path + 1, "%05x", ordernum) + + offsetof(struct sockaddr_un, sun_path) + 1; addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0)); addr->hash ^= sk->sk_type; @@ -1085,7 +1089,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) sunaddr->sun_family != AF_UNIX) return -EINVAL; - if (addr_len == sizeof(short)) + if (addr_len == offsetof(struct sockaddr_un, sun_path)) return unix_autobind(sock); err = unix_mkname(sunaddr, addr_len, &hash); @@ -1521,7 +1525,7 @@ static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer) if (!addr) { sunaddr->sun_family = AF_UNIX; sunaddr->sun_path[0] = 0; - err = sizeof(short); + err = offsetof(struct sockaddr_un, sun_path); } else { err = addr->len; memcpy(sunaddr, addr->name, addr->len); @@ -2901,7 +2905,8 @@ static int unix_seq_show(struct seq_file *seq, void *v) seq_putc(seq, ' '); i = 0; - len = u->addr->len - sizeof(short); + len = u->addr->len - + offsetof(struct sockaddr_un, sun_path); if (!UNIX_ABSTRACT(s)) len--; else { diff --git a/net/unix/diag.c b/net/unix/diag.c index 7066a36234106..d146fd1a95fa7 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -19,7 +19,8 @@ static int sk_diag_dump_name(struct sock *sk, struct sk_buff *nlskb) if (!addr) return 0; - return nla_put(nlskb, UNIX_DIAG_NAME, addr->len - sizeof(short), + return nla_put(nlskb, UNIX_DIAG_NAME, + addr->len - offsetof(struct sockaddr_un, sun_path), addr->name->sun_path); } From b7da428d19d031e32168666166adfa7340a04d9b Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 24 Nov 2021 11:14:20 +0900 Subject: [PATCH 39/61] af_unix: Pass struct sock to unix_autobind(). 
We do not use struct socket in unix_autobind() and pass struct sock to unix_bind_bsd() and unix_bind_abstract(). Let's pass it to unix_autobind() as well. Also, this patch fixes these errors by checkpatch.pl. ERROR: do not use assignment in if condition #1795: FILE: net/unix/af_unix.c:1795: + if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr CHECK: Logical continuations should be on the previous line #1796: FILE: net/unix/af_unix.c:1796: + if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr + && (err = unix_autobind(sock)) != 0) Signed-off-by: Kuniyuki Iwashima Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 8783eba778920..5eca753f90d15 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -881,15 +881,13 @@ static int unix_release(struct socket *sock) return 0; } -static int unix_autobind(struct socket *sock) +static int unix_autobind(struct sock *sk) { - struct sock *sk = sock->sk; - struct net *net = sock_net(sk); struct unix_sock *u = unix_sk(sk); - static u32 ordernum = 1; struct unix_address *addr; - int err; unsigned int retries = 0; + static u32 ordernum = 1; + int err; err = mutex_lock_interruptible(&u->bindlock); if (err) @@ -916,7 +914,8 @@ static int unix_autobind(struct socket *sock) spin_lock(&unix_table_lock); ordernum = (ordernum+1)&0xFFFFF; - if (__unix_find_socket_byname(net, addr->name, addr->len, addr->hash)) { + if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len, + addr->hash)) { spin_unlock(&unix_table_lock); /* * __unix_find_socket_byname() may take long time if many names @@ -1090,7 +1089,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) return -EINVAL; if (addr_len == offsetof(struct sockaddr_un, sun_path)) - return unix_autobind(sock); + return unix_autobind(sk); err = unix_mkname(sunaddr, addr_len, &hash); if (err < 0) @@ -1158,8 +1157,11 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, alen = err; if (test_bit(SOCK_PASSCRED, &sock->flags) && - !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0) - goto out; + !unix_sk(sk)->addr) { + err = unix_autobind(sk); + if (err) + goto out; + } restart: other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err); @@ -1259,9 +1261,11 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, goto out; addr_len = err; - if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr && - (err = unix_autobind(sock)) != 0) - goto out; + if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr) { + err = unix_autobind(sk); + if (err) + goto out; + } timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); @@ -1712,9 +1716,11 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, goto out; } - if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr - && (err = unix_autobind(sock)) != 0) - goto out; + if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr) { + err = unix_autobind(sk); + if (err) + goto out; + } err = -EMSGSIZE; if (len > sk->sk_sndbuf - 32) From fd252e42bc3d2d0f5a4c0efbbd2280036912efed Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 24 Nov 2021 11:14:21 +0900 Subject: [PATCH 40/61] af_unix: Factorise unix_find_other() based on address types. 
As done in the commit fa42d910a38e ("unix_bind(): take BSD and abstract address cases into new helpers"), this patch moves BSD and abstract address cases from unix_find_other() into unix_find_bsd() and unix_find_abstract(). Signed-off-by: Kuniyuki Iwashima Signed-off-by: Jakub Kicinski Signed-off-by: Juhyung Park --- net/unix/af_unix.c | 136 +++++++++++++++++++++++++++------------------ 1 file changed, 81 insertions(+), 55 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 5eca753f90d15..546ecafdfe800 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -881,6 +881,87 @@ static int unix_release(struct socket *sock) return 0; } +static struct sock *unix_find_bsd(struct net *net, struct sockaddr_un *sunaddr, + int type, int *error) +{ + struct inode *inode; + struct path path; + struct sock *sk; + int err; + + err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path); + if (err) + goto fail; + + inode = d_backing_inode(path.dentry); + err = inode_permission(inode, MAY_WRITE); + if (err) + goto path_put; + + err = -ECONNREFUSED; + if (!S_ISSOCK(inode->i_mode)) + goto path_put; + + sk = unix_find_socket_byinode(inode); + if (!sk) + goto path_put; + + err = -EPROTOTYPE; + if (sk->sk_type == type) + touch_atime(&path); + else + goto sock_put; + + path_put(&path); + + return sk; + +sock_put: + sock_put(sk); +path_put: + path_put(&path); +fail: + *error = err; + return NULL; +} + +static struct sock *unix_find_abstract(struct net *net, + struct sockaddr_un *sunaddr, + int addr_len, int type, + unsigned int hash, int *error) +{ + struct dentry *dentry; + struct sock *sk; + + sk = unix_find_socket_byname(net, sunaddr, addr_len, type ^ hash); + if (!sk) { + *error = -ECONNREFUSED; + return NULL; + } + + dentry = unix_sk(sk)->path.dentry; + if (dentry) + touch_atime(&unix_sk(sk)->path); + + return sk; +} + +static struct sock *unix_find_other(struct net *net, + struct sockaddr_un *sunaddr, + int addr_len, int type, + unsigned int hash, int *error) +{ + struct sock *sk; + + if (sunaddr->sun_path[0]) + sk = unix_find_bsd(net, sunaddr, type, error); + else + sk = unix_find_abstract(net, sunaddr, addr_len, type, hash, + error); + + return sk; +} + static int unix_autobind(struct sock *sk) { struct unix_sock *u = unix_sk(sk); @@ -939,61 +1020,6 @@ out: mutex_unlock(&u->bindlock); return err; } -static struct sock *unix_find_other(struct net *net, - struct sockaddr_un *sunname, int len, - int type, unsigned int hash, int *error) -{ - struct sock *u; - struct path path; - int err = 0; - - if (sunname->sun_path[0]) { - struct inode *inode; - err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path); - if (err) - goto fail; - inode = d_backing_inode(path.dentry); - err = inode_permission(inode, MAY_WRITE); - if (err) - goto put_fail; - - err = -ECONNREFUSED; - if (!S_ISSOCK(inode->i_mode)) - goto put_fail; - u = unix_find_socket_byinode(inode); - if (!u) - goto put_fail; - - if (u->sk_type == type) - touch_atime(&path); - - path_put(&path); - - err = -EPROTOTYPE; - if (u->sk_type != type) { - sock_put(u); - goto fail; - } - } else { - err = -ECONNREFUSED; - u = unix_find_socket_byname(net, sunname, len, type ^ hash); - if (u) { - struct dentry *dentry; - dentry = unix_sk(u)->path.dentry; - if (dentry) - touch_atime(&unix_sk(u)->path); - } else - goto fail; - } - return u; - -put_fail: - path_put(&path); -fail: - *error = err; - return NULL; -} - static int unix_bind_bsd(struct sock *sk, struct unix_address *addr) { struct unix_sock *u = unix_sk(sk); From 
40e2cff56af2025f4f47cdba3035d163c331ac1a Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 24 Nov 2021 11:14:22 +0900 Subject: [PATCH 41/61] af_unix: Return an error as a pointer in unix_find_other(). We can return an error as a pointer and need not pass an additional argument to unix_find_other(). Signed-off-by: Kuniyuki Iwashima Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 546ecafdfe800..3864c29e6c672 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -882,7 +882,7 @@ static int unix_release(struct socket *sock) } static struct sock *unix_find_bsd(struct net *net, struct sockaddr_un *sunaddr, - int type, int *error) + int type) { struct inode *inode; struct path path; @@ -921,23 +921,20 @@ static struct sock *unix_find_bsd(struct net *net, struct sockaddr_un *sunaddr, path_put: path_put(&path); fail: - *error = err; - return NULL; + return ERR_PTR(err); } static struct sock *unix_find_abstract(struct net *net, struct sockaddr_un *sunaddr, int addr_len, int type, - unsigned int hash, int *error) + unsigned int hash) { struct dentry *dentry; struct sock *sk; sk = unix_find_socket_byname(net, sunaddr, addr_len, type ^ hash); - if (!sk) { - *error = -ECONNREFUSED; - return NULL; - } + if (!sk) + return ERR_PTR(-ECONNREFUSED); dentry = unix_sk(sk)->path.dentry; if (dentry) @@ -949,15 +946,14 @@ static struct sock *unix_find_abstract(struct net *net, static struct sock *unix_find_other(struct net *net, struct sockaddr_un *sunaddr, int addr_len, int type, - unsigned int hash, int *error) + unsigned int hash) { struct sock *sk; if (sunaddr->sun_path[0]) - sk = unix_find_bsd(net, sunaddr, type, error); + sk = unix_find_bsd(net, sunaddr, type); else - sk = unix_find_abstract(net, sunaddr, addr_len, type, hash, - error); + sk = unix_find_abstract(net, sunaddr, addr_len, type, hash); return sk; } @@ -1190,9 +1186,11 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, } restart: - other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err); - if (!other) + other = unix_find_other(net, sunaddr, alen, sock->type, hash); + if (IS_ERR(other)) { + err = PTR_ERR(other); goto out; + } unix_state_double_lock(sk, other); @@ -1314,9 +1312,12 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, restart: /* Find listening sock. */ - other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err); - if (!other) + other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash); + if (IS_ERR(other)) { + err = PTR_ERR(other); + other = NULL; goto out; + } /* Latch state of peer */ unix_state_lock(other); @@ -1787,9 +1788,12 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, goto out_free; other = unix_find_other(net, sunaddr, namelen, sk->sk_type, - hash, &err); - if (other == NULL) + hash); + if (IS_ERR(other)) { + err = PTR_ERR(other); + other = NULL; goto out_free; + } } if (sk_filter(other, skb) < 0) { From b3e0d362ea2078f684d2b00a73c4fcf5e23ac2ec Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 24 Nov 2021 11:14:23 +0900 Subject: [PATCH 42/61] af_unix: Cut unix_validate_addr() out of unix_mkname(). unix_mkname() tests socket address length and family and does some processing based on the address type. It is called in the early stage, and therefore some instructions are redundant and can end up in vain. 
The address length/family tests are done twice in unix_bind(). Also, the address type is rechecked later in unix_bind() and unix_find_other(), where we can do the same processing. Moreover, in the BSD address case, the hash is set to 0 but never used and confusing. This patch moves the address tests out of unix_mkname(), and the following patches move the other part into appropriate places and remove unix_mkname() finally. Signed-off-by: Kuniyuki Iwashima Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 40 ++++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 3864c29e6c672..410183678608b 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -220,15 +220,22 @@ static inline void unix_release_addr(struct unix_address *addr) * - if started by zero, it is abstract name. */ +static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len) +{ + if (addr_len <= offsetof(struct sockaddr_un, sun_path) || + addr_len > sizeof(*sunaddr)) + return -EINVAL; + + if (sunaddr->sun_family != AF_UNIX) + return -EINVAL; + + return 0; +} + static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp) { *hashp = 0; - if (len <= offsetof(struct sockaddr_un, sun_path) || - len > sizeof(*sunaddr)) - return -EINVAL; - if (!sunaddr || sunaddr->sun_family != AF_UNIX) - return -EINVAL; if (sunaddr->sun_path[0]) { /* * This may look like an off by one error but it is a bit more @@ -1106,13 +1113,14 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) unsigned int hash; struct unix_address *addr; - if (addr_len < offsetofend(struct sockaddr_un, sun_family) || - sunaddr->sun_family != AF_UNIX) - return -EINVAL; - - if (addr_len == offsetof(struct sockaddr_un, sun_path)) + if (addr_len == offsetof(struct sockaddr_un, sun_path) && + sunaddr->sun_family == AF_UNIX) return unix_autobind(sk); + err = unix_validate_addr(sunaddr, addr_len); + if (err) + return err; + err = unix_mkname(sunaddr, addr_len, &hash); if (err < 0) return err; @@ -1173,6 +1181,10 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, goto out; if (addr->sa_family != AF_UNSPEC) { + err = unix_validate_addr(sunaddr, alen); + if (err) + goto out; + err = unix_mkname(sunaddr, alen, &hash); if (err < 0) goto out; @@ -1280,6 +1292,10 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, int err; long timeo; + err = unix_validate_addr(sunaddr, addr_len); + if (err) + goto out; + err = unix_mkname(sunaddr, addr_len, &hash); if (err < 0) goto out; @@ -1731,6 +1747,10 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, goto out; if (msg->msg_namelen) { + err = unix_validate_addr(sunaddr, msg->msg_namelen); + if (err) + goto out; + err = unix_mkname(sunaddr, msg->msg_namelen, &hash); if (err < 0) goto out; From 61a87d5158665c6995061e2876055e4b98d9f33c Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 24 Nov 2021 11:14:24 +0900 Subject: [PATCH 43/61] af_unix: Copy unix_mkname() into unix_find_(bsd|abstract)(). We should not call unix_mkname() before unix_find_other() and instead do the same thing where necessary based on the address type: - terminating the address with '\0' in unix_find_bsd() - calculating the hash in unix_find_abstract(). 
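As an illustration of the two address forms this split handles (a minimal user-space sketch, not part of this patch; the socket names below are made up), the kernel tells them apart purely by the first byte of sun_path:

    #include <stddef.h>
    #include <string.h>
    #include <sys/socket.h>
    #include <sys/un.h>

    /* Filesystem ("BSD") address: a path, which the kernel NUL-terminates
     * itself before kern_path() -- the case unix_find_bsd() handles.
     */
    static int bind_pathname(int fd)
    {
            struct sockaddr_un sun = { .sun_family = AF_UNIX };

            strncpy(sun.sun_path, "/tmp/example.sock", sizeof(sun.sun_path) - 1);
            return bind(fd, (struct sockaddr *)&sun, sizeof(sun));
    }

    /* Abstract address: sun_path[0] == '\0', no filesystem object; looked
     * up by hash in unix_find_abstract().  The name is not NUL-terminated,
     * so the address length must be passed explicitly.
     */
    static int bind_abstract(int fd)
    {
            struct sockaddr_un sun = { .sun_family = AF_UNIX };
            socklen_t len = offsetof(struct sockaddr_un, sun_path) + 1 + 7;

            memcpy(sun.sun_path + 1, "example", 7);
            return bind(fd, (struct sockaddr *)&sun, len);
    }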
Signed-off-by: Kuniyuki Iwashima Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 63 ++++++++++++++++++---------------------------- 1 file changed, 25 insertions(+), 38 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 410183678608b..11836d7a38ef5 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -232,19 +232,25 @@ static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len) return 0; } +static void unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len) +{ + /* This may look like an off by one error but it is a bit more + * subtle. 108 is the longest valid AF_UNIX path for a binding. + * sun_path[108] doesn't as such exist. However in kernel space + * we are guaranteed that it is a valid memory location in our + * kernel address buffer because syscall functions always pass + * a pointer of struct sockaddr_storage which has a bigger buffer + * than 108. + */ + ((char *)sunaddr)[addr_len] = 0; +} + static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp) { *hashp = 0; if (sunaddr->sun_path[0]) { - /* - * This may look like an off by one error but it is a bit more - * subtle. 108 is the longest valid AF_UNIX path for a binding. - * sun_path[108] doesn't as such exist. However in kernel space - * we are guaranteed that it is a valid memory location in our - * kernel address buffer. - */ - ((char *)sunaddr)[len] = 0; + unix_mkname_bsd(sunaddr, len); len = strlen(sunaddr->sun_path) + offsetof(struct sockaddr_un, sun_path) + 1; return len; @@ -889,13 +895,14 @@ static int unix_release(struct socket *sock) } static struct sock *unix_find_bsd(struct net *net, struct sockaddr_un *sunaddr, - int type) + int addr_len, int type) { struct inode *inode; struct path path; struct sock *sk; int err; + unix_mkname_bsd(sunaddr, addr_len); err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path); if (err) goto fail; @@ -933,9 +940,9 @@ static struct sock *unix_find_bsd(struct net *net, struct sockaddr_un *sunaddr, static struct sock *unix_find_abstract(struct net *net, struct sockaddr_un *sunaddr, - int addr_len, int type, - unsigned int hash) + int addr_len, int type) { + unsigned int hash = unix_hash_fold(csum_partial(sunaddr, addr_len, 0)); struct dentry *dentry; struct sock *sk; @@ -952,15 +959,14 @@ static struct sock *unix_find_abstract(struct net *net, static struct sock *unix_find_other(struct net *net, struct sockaddr_un *sunaddr, - int addr_len, int type, - unsigned int hash) + int addr_len, int type) { struct sock *sk; if (sunaddr->sun_path[0]) - sk = unix_find_bsd(net, sunaddr, type); + sk = unix_find_bsd(net, sunaddr, addr_len, type); else - sk = unix_find_abstract(net, sunaddr, addr_len, type, hash); + sk = unix_find_abstract(net, sunaddr, addr_len, type); return sk; } @@ -1173,7 +1179,6 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, struct net *net = sock_net(sk); struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr; struct sock *other; - unsigned int hash; int err; err = -EINVAL; @@ -1185,11 +1190,6 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, if (err) goto out; - err = unix_mkname(sunaddr, alen, &hash); - if (err < 0) - goto out; - alen = err; - if (test_bit(SOCK_PASSCRED, &sock->flags) && !unix_sk(sk)->addr) { err = unix_autobind(sk); @@ -1198,7 +1198,7 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, } restart: - other = unix_find_other(net, sunaddr, alen, sock->type, hash); + other = unix_find_other(net, sunaddr, alen, 
sock->type); if (IS_ERR(other)) { err = PTR_ERR(other); goto out; @@ -1287,7 +1287,6 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, struct sock *newsk = NULL; struct sock *other = NULL; struct sk_buff *skb = NULL; - unsigned int hash; int st; int err; long timeo; @@ -1296,11 +1295,6 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, if (err) goto out; - err = unix_mkname(sunaddr, addr_len, &hash); - if (err < 0) - goto out; - addr_len = err; - if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr) { err = unix_autobind(sk); if (err) @@ -1328,7 +1322,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, restart: /* Find listening sock. */ - other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash); + other = unix_find_other(net, sunaddr, addr_len, sk->sk_type); if (IS_ERR(other)) { err = PTR_ERR(other); other = NULL; @@ -1728,9 +1722,7 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, struct unix_sock *u = unix_sk(sk); DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name); struct sock *other = NULL; - int namelen = 0; /* fake GCC */ int err; - unsigned int hash; struct sk_buff *skb; long timeo; struct scm_cookie scm; @@ -1750,11 +1742,6 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, err = unix_validate_addr(sunaddr, msg->msg_namelen); if (err) goto out; - - err = unix_mkname(sunaddr, msg->msg_namelen, &hash); - if (err < 0) - goto out; - namelen = err; } else { sunaddr = NULL; err = -ENOTCONN; @@ -1807,8 +1794,8 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, if (sunaddr == NULL) goto out_free; - other = unix_find_other(net, sunaddr, namelen, sk->sk_type, - hash); + other = unix_find_other(net, sunaddr, msg->msg_namelen, + sk->sk_type); if (IS_ERR(other)) { err = PTR_ERR(other); other = NULL; From 4b828a11b750d071db8e54e91a9e173dcc691da9 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 24 Nov 2021 11:14:25 +0900 Subject: [PATCH 44/61] af_unix: Remove unix_mkname(). This patch removes unix_mkname() and postpones calculating a hash to unix_bind_abstract(). Some BSD stuffs still remain in unix_bind() though, the next patch packs them into unix_bind_bsd(). 
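For intuition only, here is a simplified stand-alone model (not the kernel code; unix_hash_fold() and csum_partial() are the real helpers, and the table size used below is an assumption for the sketch) of what unix_bind_abstract() now computes from the whole address before taking unix_table_lock:

    #define SKETCH_HASH_SIZE 256    /* assumed table size for the sketch */

    /* Map an abstract address (family + name bytes) to a bucket index the
     * way the patch does: hash the whole address, fold it into the table
     * range, then XOR in the socket type, matching
     * addr->hash ^= sk->sk_type in the diff above.
     */
    static unsigned int sketch_abstract_bucket(const unsigned char *addr,
                                               int addr_len, int sk_type)
    {
            unsigned int hash = 0;
            int i;

            for (i = 0; i < addr_len; i++)      /* stand-in for csum_partial() */
                    hash = hash * 31 + addr[i];

            hash &= SKETCH_HASH_SIZE - 1;       /* stand-in for unix_hash_fold() */
            return hash ^ sk_type;
    }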
Signed-off-by: Kuniyuki Iwashima Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 34 +++++++++++----------------------- 1 file changed, 11 insertions(+), 23 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 11836d7a38ef5..b093a96015f3b 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -245,21 +245,6 @@ static void unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len) ((char *)sunaddr)[addr_len] = 0; } -static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp) -{ - *hashp = 0; - - if (sunaddr->sun_path[0]) { - unix_mkname_bsd(sunaddr, len); - len = strlen(sunaddr->sun_path) + - offsetof(struct sockaddr_un, sun_path) + 1; - return len; - } - - *hashp = unix_hash_fold(csum_partial(sunaddr, len, 0)); - return len; -} - static void __unix_remove_socket(struct sock *sk) { sk_del_node_init(sk); @@ -1097,6 +1082,9 @@ static int unix_bind_abstract(struct sock *sk, struct unix_address *addr) return -EINVAL; } + addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0)); + addr->hash ^= sk->sk_type; + spin_lock(&unix_table_lock); if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len, addr->hash)) { @@ -1112,12 +1100,11 @@ static int unix_bind_abstract(struct sock *sk, struct unix_address *addr) static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { - struct sock *sk = sock->sk; struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; char *sun_path = sunaddr->sun_path; - int err; - unsigned int hash; + struct sock *sk = sock->sk; struct unix_address *addr; + int err; if (addr_len == offsetof(struct sockaddr_un, sun_path) && sunaddr->sun_family == AF_UNIX) @@ -1127,17 +1114,18 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (err) return err; - err = unix_mkname(sunaddr, addr_len, &hash); - if (err < 0) - return err; - addr_len = err; + if (sun_path[0]) { + unix_mkname_bsd(sunaddr, addr_len); + addr_len = strlen(sunaddr->sun_path) + + offsetof(struct sockaddr_un, sun_path) + 1; + } + addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL); if (!addr) return -ENOMEM; memcpy(addr->name, sunaddr, addr_len); addr->len = addr_len; - addr->hash = hash ^ sk->sk_type; refcount_set(&addr->refcnt, 1); if (sun_path[0]) From 9eecb82ecdf2d673a0c28233764fc86113adebe4 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 24 Nov 2021 11:14:26 +0900 Subject: [PATCH 45/61] af_unix: Allocate unix_address in unix_bind_(bsd|abstract)(). To terminate address with '\0' in unix_bind_bsd(), we add unix_create_addr() and call it in unix_bind_bsd() and unix_bind_abstract(). Also, unix_bind_abstract() does not return -EEXIST. Only kern_path_create() and vfs_mknod() in unix_bind_bsd() can return it, so we move the last error check in unix_bind() to unix_bind_bsd(). 
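unix_create_addr() follows the usual kernel pattern of one allocation sized for a header plus a variable-length payload. A generic, hedged sketch of that pattern (struct and function names here are made up; only the shape matches the new helper):

    #include <linux/refcount.h>
    #include <linux/slab.h>
    #include <linux/string.h>

    struct name_blob {
            refcount_t refcnt;      /* the address can outlive bind() */
            int len;
            char name[];            /* flexible array member */
    };

    static struct name_blob *name_blob_create(const void *src, int len)
    {
            /* one kmalloc() covers the header and the copied name */
            struct name_blob *b = kmalloc(sizeof(*b) + len, GFP_KERNEL);

            if (!b)
                    return NULL;

            refcount_set(&b->refcnt, 1);
            b->len = len;
            memcpy(b->name, src, len);
            return b;
    }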
Signed-off-by: Kuniyuki Iwashima Signed-off-by: Jakub Kicinski Signed-off-by: Juhyung Park --- net/unix/af_unix.c | 105 ++++++++++++++++++++++++++++----------------- 1 file changed, 66 insertions(+), 39 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index b093a96015f3b..8ecf0da84eb75 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -207,6 +207,22 @@ struct sock *unix_peer_get(struct sock *s) } EXPORT_SYMBOL_GPL(unix_peer_get); +static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr, + int addr_len) +{ + struct unix_address *addr; + + addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL); + if (!addr) + return NULL; + + refcount_set(&addr->refcnt, 1); + addr->len = addr_len; + memcpy(addr->name, sunaddr, addr_len); + + return addr; +} + static inline void unix_release_addr(struct unix_address *addr) { if (refcount_dec_and_test(&addr->refcnt)) @@ -1014,23 +1030,35 @@ out: mutex_unlock(&u->bindlock); return err; } -static int unix_bind_bsd(struct sock *sk, struct unix_address *addr) +static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, + int addr_len) { - struct unix_sock *u = unix_sk(sk); umode_t mode = S_IFSOCK | (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask()); - struct path parent; + struct unix_sock *u = unix_sk(sk); + struct unix_address *addr; struct dentry *dentry; + struct path parent; unsigned int hash; int err; + unix_mkname_bsd(sunaddr, addr_len); + addr_len = strlen(sunaddr->sun_path) + + offsetof(struct sockaddr_un, sun_path) + 1; + + addr = unix_create_addr(sunaddr, addr_len); + if (!addr) + return -ENOMEM; + /* * Get the parent directory, calculate the hash for last * component. */ dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto out; + } /* * All right, let's create it. @@ -1039,7 +1067,7 @@ static int unix_bind_bsd(struct sock *sk, struct unix_address *addr) if (!err) err = vfs_mknod(d_inode(parent.dentry), dentry, mode, 0); if (err) - goto out; + goto out_path; err = mutex_lock_interruptible(&u->bindlock); if (err) goto out_unlink; @@ -1063,47 +1091,61 @@ static int unix_bind_bsd(struct sock *sk, struct unix_address *addr) out_unlink: /* failed after successful mknod? unlink what we'd created... */ vfs_unlink(d_inode(parent.dentry), dentry, NULL); -out: +out_path: done_path_create(&parent, dentry); - return err; +out: + unix_release_addr(addr); + return err == -EEXIST ? 
-EADDRINUSE : err; } -static int unix_bind_abstract(struct sock *sk, struct unix_address *addr) +static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr, + int addr_len) { struct unix_sock *u = unix_sk(sk); + struct unix_address *addr; int err; + addr = unix_create_addr(sunaddr, addr_len); + if (!addr) + return -ENOMEM; + err = mutex_lock_interruptible(&u->bindlock); if (err) - return err; + goto out; if (u->addr) { - mutex_unlock(&u->bindlock); - return -EINVAL; + err = -EINVAL; + goto out_mutex; } addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0)); addr->hash ^= sk->sk_type; spin_lock(&unix_table_lock); + if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len, - addr->hash)) { - spin_unlock(&unix_table_lock); - mutex_unlock(&u->bindlock); - return -EADDRINUSE; - } + addr->hash)) + goto out_spin; + __unix_set_addr(sk, addr, addr->hash); spin_unlock(&unix_table_lock); mutex_unlock(&u->bindlock); return 0; + +out_spin: + spin_unlock(&unix_table_lock); + err = -EADDRINUSE; +out_mutex: + mutex_unlock(&u->bindlock); +out: + unix_release_addr(addr); + return err; } static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; - char *sun_path = sunaddr->sun_path; struct sock *sk = sock->sk; - struct unix_address *addr; int err; if (addr_len == offsetof(struct sockaddr_un, sun_path) && @@ -1114,27 +1156,12 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (err) return err; - if (sun_path[0]) { - unix_mkname_bsd(sunaddr, addr_len); - addr_len = strlen(sunaddr->sun_path) + - offsetof(struct sockaddr_un, sun_path) + 1; - } - - addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL); - if (!addr) - return -ENOMEM; - - memcpy(addr->name, sunaddr, addr_len); - addr->len = addr_len; - refcount_set(&addr->refcnt, 1); - - if (sun_path[0]) - err = unix_bind_bsd(sk, addr); + if (sunaddr->sun_path[0]) + err = unix_bind_bsd(sk, sunaddr, addr_len); else - err = unix_bind_abstract(sk, addr); - if (err) - unix_release_addr(addr); - return err == -EEXIST ? -EADDRINUSE : err; + err = unix_bind_abstract(sk, sunaddr, addr_len); + + return err; } static void unix_state_double_lock(struct sock *sk1, struct sock *sk2) From 69bf151877899ec548f9379d03b80698606f7948 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 24 Nov 2021 11:14:27 +0900 Subject: [PATCH 46/61] af_unix: Remove UNIX_ABSTRACT() macro and test sun_path[0] instead. In BSD and abstract address cases, we store sockets in the hash table with keys between 0 and UNIX_HASH_SIZE - 1. However, the hash saved in a socket varies depending on its address type; sockets with BSD addresses always have UNIX_HASH_SIZE in their unix_sk(sk)->addr->hash. This is just for the UNIX_ABSTRACT() macro used to check the address type. The difference of the saved hashes comes from the first byte of the address in the first place. So, we can test it directly. Then we can keep a real hash in each socket and replace unix_table_lock with per-hash locks in the later patch. 
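The same sun_path[0] test drives the /proc/net/unix formatting change in the hunk below: pathnames lose their trailing NUL, abstract names gain a leading '@'. A small userspace sketch of that rendering convention (illustrative only; the in-kernel loop writes through seq_putc(), and the '@' substitution for NUL bytes here is an assumption about the display convention, not code from this patch):

#include <stdio.h>
#include <stddef.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/un.h>

static void print_unix_name(const struct sockaddr_un *sun, int addr_len)
{
	int len = addr_len - (int)offsetof(struct sockaddr_un, sun_path);
	int i = 0;

	if (sun->sun_path[0]) {
		len--;			/* pathname: do not print the trailing NUL */
	} else {
		putchar('@');		/* abstract: render the leading NUL as '@' */
		i++;
	}

	for (; i < len; i++)
		putchar(sun->sun_path[i] ? sun->sun_path[i] : '@');
	putchar('\n');
}

int main(void)
{
	struct sockaddr_un p = { .sun_family = AF_UNIX, .sun_path = "/run/demo.sock" };
	struct sockaddr_un a = { .sun_family = AF_UNIX };

	memcpy(a.sun_path + 1, "demo", 4);
	print_unix_name(&p, offsetof(struct sockaddr_un, sun_path) + strlen(p.sun_path) + 1);
	print_unix_name(&a, offsetof(struct sockaddr_un, sun_path) + 1 + 4);
	return 0;
}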
Signed-off-by: Kuniyuki Iwashima Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 8ecf0da84eb75..174391112547e 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -133,8 +133,6 @@ static struct hlist_head *unix_sockets_unbound(void *addr) return &unix_socket_table[UNIX_HASH_SIZE + hash]; } -#define UNIX_ABSTRACT(sk) (unix_sk(sk)->addr->hash < UNIX_HASH_SIZE) - #ifdef CONFIG_SECURITY_NETWORK static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb) { @@ -2965,9 +2963,9 @@ static int unix_seq_show(struct seq_file *seq, void *v) i = 0; len = u->addr->len - offsetof(struct sockaddr_un, sun_path); - if (!UNIX_ABSTRACT(s)) + if (u->addr->name->sun_path[0]) { len--; - else { + } else { seq_putc(seq, '@'); i++; } From 112a5ea0dd563ab89f178a3c72ce1f03776a4c58 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 24 Nov 2021 11:14:28 +0900 Subject: [PATCH 47/61] af_unix: Add helpers to calculate hashes. This patch adds three helper functions that calculate hashes for unbound sockets and bound sockets with BSD/abstract addresses. Signed-off-by: Kuniyuki Iwashima Signed-off-by: Jakub Kicinski Signed-off-by: Juhyung Park --- net/unix/af_unix.c | 64 +++++++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 29 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 174391112547e..1afdbf02bc64d 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -122,15 +122,38 @@ DEFINE_SPINLOCK(unix_table_lock); EXPORT_SYMBOL_GPL(unix_table_lock); static atomic_long_t unix_nr_socks; +/* SMP locking strategy: + * hash table is protected with spinlock unix_table_lock + * each socket state is protected by separate spin lock. + */ -static struct hlist_head *unix_sockets_unbound(void *addr) +static unsigned int unix_unbound_hash(struct sock *sk) { - unsigned long hash = (unsigned long)addr; + unsigned long hash = (unsigned long)sk; hash ^= hash >> 16; hash ^= hash >> 8; - hash %= UNIX_HASH_SIZE; - return &unix_socket_table[UNIX_HASH_SIZE + hash]; + hash ^= sk->sk_type; + + return UNIX_HASH_SIZE + (hash & (UNIX_HASH_SIZE - 1)); +} + +static unsigned int unix_bsd_hash(struct inode *i) +{ + return i->i_ino & (UNIX_HASH_SIZE - 1); +} + +static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr, + int addr_len, int type) +{ + __wsum csum = csum_partial(sunaddr, addr_len, 0); + unsigned int hash; + + hash = (__force unsigned int)csum_fold(csum); + hash ^= hash >> 8; + hash ^= type; + + return hash & (UNIX_HASH_SIZE - 1); } #ifdef CONFIG_SECURITY_NETWORK @@ -161,20 +184,6 @@ static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb) } #endif /* CONFIG_SECURITY_NETWORK */ -/* - * SMP locking strategy: - * hash table is protected with spinlock unix_table_lock - * each socket state is protected by separate spin lock. 
- */ - -static inline unsigned int unix_hash_fold(__wsum n) -{ - unsigned int hash = (__force unsigned int)csum_fold(n); - - hash ^= hash>>8; - return hash&(UNIX_HASH_SIZE-1); -} - #define unix_peer(sk) (unix_sk(sk)->peer) static inline int unix_our_peer(struct sock *sk, struct sock *osk) @@ -327,11 +336,11 @@ static inline struct sock *unix_find_socket_byname(struct net *net, static struct sock *unix_find_socket_byinode(struct inode *i) { + unsigned int hash = unix_bsd_hash(i); struct sock *s; spin_lock(&unix_table_lock); - sk_for_each(s, - &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) { + sk_for_each(s, &unix_socket_table[hash]) { struct dentry *dentry = unix_sk(s)->path.dentry; if (dentry && d_backing_inode(dentry) == i) { @@ -836,7 +845,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern) init_waitqueue_head(&u->peer_wait); init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay); memset(&u->scm_stat, 0, sizeof(struct scm_stat)); - unix_insert_socket(unix_sockets_unbound(sk), sk); + unix_insert_socket(&unix_socket_table[unix_unbound_hash(sk)], sk); out: if (sk == NULL) atomic_long_dec(&unix_nr_socks); @@ -941,11 +950,11 @@ static struct sock *unix_find_abstract(struct net *net, struct sockaddr_un *sunaddr, int addr_len, int type) { - unsigned int hash = unix_hash_fold(csum_partial(sunaddr, addr_len, 0)); + unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type); struct dentry *dentry; struct sock *sk; - sk = unix_find_socket_byname(net, sunaddr, addr_len, type ^ hash); + sk = unix_find_socket_byname(net, sunaddr, addr_len, hash); if (!sk) return ERR_PTR(-ECONNREFUSED); @@ -997,8 +1006,7 @@ static int unix_autobind(struct sock *sk) retry: addr->len = sprintf(addr->name->sun_path + 1, "%05x", ordernum) + offsetof(struct sockaddr_un, sun_path) + 1; - addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0)); - addr->hash ^= sk->sk_type; + addr->hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); spin_lock(&unix_table_lock); ordernum = (ordernum+1)&0xFFFFF; @@ -1073,7 +1081,7 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, goto out_unlock; addr->hash = UNIX_HASH_SIZE; - hash = d_backing_inode(dentry)->i_ino & (UNIX_HASH_SIZE - 1); + hash = unix_bsd_hash(d_backing_inode(dentry)); spin_lock(&unix_table_lock); u->path.mnt = mntget(parent.mnt); u->path.dentry = dget(dentry); @@ -1116,9 +1124,7 @@ static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr, goto out_mutex; } - addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0)); - addr->hash ^= sk->sk_type; - + addr->hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); spin_lock(&unix_table_lock); if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len, From cdc1b05ca2b491e2e83b2d3d6ee03b9af6f33deb Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 24 Nov 2021 11:14:29 +0900 Subject: [PATCH 48/61] af_unix: Save hash in sk_hash. To replace unix_table_lock with per-hash locks in the next patch, we need to save a hash in each socket because /proc/net/unix or BPF prog iterate sockets while holding a hash table lock and release it later in a different function. Currently, we store a real/pseudo hash in struct unix_address. However, we do not allocate it to unbound sockets, nor should we do just for that. For this purpose, we can use sk_hash. Then, we no longer use the hash field in struct unix_address and can remove it. 
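(As an aside, the pattern being set up here, where each object remembers its own bucket index so that a later unlock or unlink, possibly in a different function, never has to recompute the hash, can be sketched in plain C with pthread mutexes as below. Everything in the sketch is illustrative, not the kernel's API; the init loop simply mirrors the spin_lock_init() loop added to af_unix_init() in the next patch.)

#include <pthread.h>

#define DEMO_HASH_SIZE 256			/* power of two, so "& (SIZE - 1)" works as a mask */

struct demo_node {
	struct demo_node *next;
	unsigned int hash;			/* bucket index kept on the object, like sk_hash */
	unsigned long key;
};

static struct demo_node *demo_table[DEMO_HASH_SIZE];
static pthread_mutex_t demo_locks[DEMO_HASH_SIZE];

static void demo_init(void)
{
	/* one lock per bucket; compare the spin_lock_init() loop added later in the series */
	for (int i = 0; i < DEMO_HASH_SIZE; i++)
		pthread_mutex_init(&demo_locks[i], NULL);
}

static unsigned int demo_hash(unsigned long key)
{
	return (unsigned int)(key ^ (key >> 16)) & (DEMO_HASH_SIZE - 1);
}

static void demo_insert(struct demo_node *n, unsigned long key)
{
	n->key = key;
	n->hash = demo_hash(key);		/* remember which bucket we live in */

	pthread_mutex_lock(&demo_locks[n->hash]);
	n->next = demo_table[n->hash];
	demo_table[n->hash] = n;
	pthread_mutex_unlock(&demo_locks[n->hash]);
}

static void demo_remove(struct demo_node *n)
{
	struct demo_node **pp;

	/* no need to recompute the hash; n->hash picks the right lock and chain */
	pthread_mutex_lock(&demo_locks[n->hash]);
	for (pp = &demo_table[n->hash]; *pp; pp = &(*pp)->next) {
		if (*pp == n) {
			*pp = n->next;
			break;
		}
	}
	pthread_mutex_unlock(&demo_locks[n->hash]);
}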
Also, this patch does - rename unix_insert_socket() to unix_insert_unbound_socket() - remove the redundant list argument from __unix_insert_socket() and unix_insert_unbound_socket() - use 'unsigned int' instead of 'unsigned' in __unix_set_addr_hash() - remove 'inline' from unix_remove_socket() and unix_insert_unbound_socket(). Signed-off-by: Kuniyuki Iwashima Signed-off-by: Jakub Kicinski Signed-off-by: Juhyung Park --- include/net/af_unix.h | 1 - net/unix/af_unix.c | 42 +++++++++++++++++++++++------------------- 2 files changed, 23 insertions(+), 20 deletions(-) diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 349279c4d2672..30690d8ff9dc4 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -26,7 +26,6 @@ extern struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE]; struct unix_address { refcount_t refcnt; int len; - unsigned int hash; struct sockaddr_un name[]; }; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 1afdbf02bc64d..73c61c360ed53 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -273,31 +273,33 @@ static void __unix_remove_socket(struct sock *sk) sk_del_node_init(sk); } -static void __unix_insert_socket(struct hlist_head *list, struct sock *sk) +static void __unix_insert_socket(struct sock *sk) { WARN_ON(!sk_unhashed(sk)); - sk_add_node(sk, list); + sk_add_node(sk, &unix_socket_table[sk->sk_hash]); } -static void __unix_set_addr(struct sock *sk, struct unix_address *addr, - unsigned hash) +static void __unix_set_addr_hash(struct sock *sk, struct unix_address *addr, + unsigned int hash) { __unix_remove_socket(sk); smp_store_release(&unix_sk(sk)->addr, addr); - __unix_insert_socket(&unix_socket_table[hash], sk); + + sk->sk_hash = hash; + __unix_insert_socket(sk); } -static inline void unix_remove_socket(struct sock *sk) +static void unix_remove_socket(struct sock *sk) { spin_lock(&unix_table_lock); __unix_remove_socket(sk); spin_unlock(&unix_table_lock); } -static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk) +static void unix_insert_unbound_socket(struct sock *sk) { spin_lock(&unix_table_lock); - __unix_insert_socket(list, sk); + __unix_insert_socket(sk); spin_unlock(&unix_table_lock); } @@ -830,6 +832,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern) sock_init_data(sock, sk); + sk->sk_hash = unix_unbound_hash(sk); sk->sk_allocation = GFP_KERNEL_ACCOUNT; sk->sk_write_space = unix_write_space; sk->sk_max_ack_backlog = READ_ONCE(net->unx.sysctl_max_dgram_qlen); @@ -845,7 +848,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern) init_waitqueue_head(&u->peer_wait); init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay); memset(&u->scm_stat, 0, sizeof(struct scm_stat)); - unix_insert_socket(&unix_socket_table[unix_unbound_hash(sk)], sk); + unix_insert_unbound_socket(sk); out: if (sk == NULL) atomic_long_dec(&unix_nr_socks); @@ -985,6 +988,7 @@ static int unix_autobind(struct sock *sk) struct unix_address *addr; unsigned int retries = 0; static u32 ordernum = 1; + unsigned int new_hash; int err; err = mutex_lock_interruptible(&u->bindlock); @@ -1006,13 +1010,13 @@ static int unix_autobind(struct sock *sk) retry: addr->len = sprintf(addr->name->sun_path + 1, "%05x", ordernum) + offsetof(struct sockaddr_un, sun_path) + 1; - addr->hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); + new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); spin_lock(&unix_table_lock); ordernum = (ordernum+1)&0xFFFFF; if 
(__unix_find_socket_byname(sock_net(sk), addr->name, addr->len, - addr->hash)) { + new_hash)) { spin_unlock(&unix_table_lock); /* * __unix_find_socket_byname() may take long time if many names @@ -1028,7 +1032,7 @@ static int unix_autobind(struct sock *sk) goto retry; } - __unix_set_addr(sk, addr, addr->hash); + __unix_set_addr_hash(sk, addr, new_hash); spin_unlock(&unix_table_lock); err = 0; @@ -1043,9 +1047,9 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask()); struct unix_sock *u = unix_sk(sk); struct unix_address *addr; + unsigned int new_hash; struct dentry *dentry; struct path parent; - unsigned int hash; int err; unix_mkname_bsd(sunaddr, addr_len); @@ -1080,12 +1084,11 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, if (u->addr) goto out_unlock; - addr->hash = UNIX_HASH_SIZE; - hash = unix_bsd_hash(d_backing_inode(dentry)); + new_hash = unix_bsd_hash(d_backing_inode(dentry)); spin_lock(&unix_table_lock); u->path.mnt = mntget(parent.mnt); u->path.dentry = dget(dentry); - __unix_set_addr(sk, addr, hash); + __unix_set_addr_hash(sk, addr, new_hash); spin_unlock(&unix_table_lock); mutex_unlock(&u->bindlock); done_path_create(&parent, dentry); @@ -1109,6 +1112,7 @@ static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr, { struct unix_sock *u = unix_sk(sk); struct unix_address *addr; + unsigned int new_hash; int err; addr = unix_create_addr(sunaddr, addr_len); @@ -1124,14 +1128,14 @@ static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr, goto out_mutex; } - addr->hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); + new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); spin_lock(&unix_table_lock); if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len, - addr->hash)) + new_hash)) goto out_spin; - __unix_set_addr(sk, addr, addr->hash); + __unix_set_addr_hash(sk, addr, new_hash); spin_unlock(&unix_table_lock); mutex_unlock(&u->bindlock); return 0; From d248b50b44027c561a4c158dcc21354905e4b27a Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 24 Nov 2021 11:14:30 +0900 Subject: [PATCH 49/61] af_unix: Replace the big lock with small locks. The hash table of AF_UNIX sockets is protected by the single lock. This patch replaces it with per-hash locks. The effect is noticeable when we handle multiple sockets simultaneously. Here is a test result on an EC2 c5.24xlarge instance. It shows latency (under 10us only) in unix_insert_unbound_socket() while 64 CPUs creating 1024 sockets for each in parallel. 
Without this patch: nsec : count distribution 0 : 179 | | 500 : 3021 |********* | 1000 : 6271 |******************* | 1500 : 6318 |******************* | 2000 : 5828 |***************** | 2500 : 5124 |*************** | 3000 : 4426 |************* | 3500 : 3672 |*********** | 4000 : 3138 |********* | 4500 : 2811 |******** | 5000 : 2384 |******* | 5500 : 2023 |****** | 6000 : 1954 |***** | 6500 : 1737 |***** | 7000 : 1749 |***** | 7500 : 1520 |**** | 8000 : 1469 |**** | 8500 : 1394 |**** | 9000 : 1232 |*** | 9500 : 1138 |*** | 10000 : 994 |*** | With this patch: nsec : count distribution 0 : 1634 |**** | 500 : 13170 |****************************************| 1000 : 13156 |*************************************** | 1500 : 9010 |*************************** | 2000 : 6363 |******************* | 2500 : 4443 |************* | 3000 : 3240 |********* | 3500 : 2549 |******* | 4000 : 1872 |***** | 4500 : 1504 |**** | 5000 : 1247 |*** | 5500 : 1035 |*** | 6000 : 889 |** | 6500 : 744 |** | 7000 : 634 |* | 7500 : 498 |* | 8000 : 433 |* | 8500 : 355 |* | 9000 : 336 |* | 9500 : 284 | | 10000 : 243 | | Signed-off-by: Kuniyuki Iwashima Signed-off-by: Jakub Kicinski Signed-off-by: Juhyung Park --- include/net/af_unix.h | 2 +- net/unix/af_unix.c | 98 ++++++++++++++++++++++++++----------------- net/unix/diag.c | 20 ++++----- 3 files changed, 71 insertions(+), 49 deletions(-) diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 30690d8ff9dc4..297487f08a200 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -20,7 +20,7 @@ struct sock *unix_peer_get(struct sock *sk); #define UNIX_HASH_BITS 8 extern unsigned int unix_tot_inflight; -extern spinlock_t unix_table_lock; +extern spinlock_t unix_table_locks[2 * UNIX_HASH_SIZE]; extern struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE]; struct unix_address { diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 73c61c360ed53..ecd2019477029 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -116,14 +116,14 @@ #include "scm.h" +spinlock_t unix_table_locks[2 * UNIX_HASH_SIZE]; +EXPORT_SYMBOL_GPL(unix_table_locks); struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE]; EXPORT_SYMBOL_GPL(unix_socket_table); -DEFINE_SPINLOCK(unix_table_lock); -EXPORT_SYMBOL_GPL(unix_table_lock); static atomic_long_t unix_nr_socks; /* SMP locking strategy: - * hash table is protected with spinlock unix_table_lock + * hash table is protected with spinlock unix_table_locks * each socket state is protected by separate spin lock. */ @@ -156,6 +156,25 @@ static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr, return hash & (UNIX_HASH_SIZE - 1); } +static void unix_table_double_lock(unsigned int hash1, unsigned int hash2) +{ + /* hash1 and hash2 is never the same because + * one is between 0 and UNIX_HASH_SIZE - 1, and + * another is between UNIX_HASH_SIZE and UNIX_HASH_SIZE * 2. 
+ */ + if (hash1 > hash2) + swap(hash1, hash2); + + spin_lock(&unix_table_locks[hash1]); + spin_lock_nested(&unix_table_locks[hash2], SINGLE_DEPTH_NESTING); +} + +static void unix_table_double_unlock(unsigned int hash1, unsigned int hash2) +{ + spin_unlock(&unix_table_locks[hash1]); + spin_unlock(&unix_table_locks[hash2]); +} + #ifdef CONFIG_SECURITY_NETWORK static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb) { @@ -291,16 +310,16 @@ static void __unix_set_addr_hash(struct sock *sk, struct unix_address *addr, static void unix_remove_socket(struct sock *sk) { - spin_lock(&unix_table_lock); + spin_lock(&unix_table_locks[sk->sk_hash]); __unix_remove_socket(sk); - spin_unlock(&unix_table_lock); + spin_unlock(&unix_table_locks[sk->sk_hash]); } static void unix_insert_unbound_socket(struct sock *sk) { - spin_lock(&unix_table_lock); + spin_lock(&unix_table_locks[sk->sk_hash]); __unix_insert_socket(sk); - spin_unlock(&unix_table_lock); + spin_unlock(&unix_table_locks[sk->sk_hash]); } static struct sock *__unix_find_socket_byname(struct net *net, @@ -328,11 +347,11 @@ static inline struct sock *unix_find_socket_byname(struct net *net, { struct sock *s; - spin_lock(&unix_table_lock); + spin_lock(&unix_table_locks[hash]); s = __unix_find_socket_byname(net, sunname, len, hash); if (s) sock_hold(s); - spin_unlock(&unix_table_lock); + spin_unlock(&unix_table_locks[hash]); return s; } @@ -341,19 +360,18 @@ static struct sock *unix_find_socket_byinode(struct inode *i) unsigned int hash = unix_bsd_hash(i); struct sock *s; - spin_lock(&unix_table_lock); + spin_lock(&unix_table_locks[hash]); sk_for_each(s, &unix_socket_table[hash]) { struct dentry *dentry = unix_sk(s)->path.dentry; if (dentry && d_backing_inode(dentry) == i) { sock_hold(s); - goto found; + spin_unlock(&unix_table_locks[hash]); + return s; } } - s = NULL; -found: - spin_unlock(&unix_table_lock); - return s; + spin_unlock(&unix_table_locks[hash]); + return NULL; } /* Support code for asymmetrically connected dgram sockets @@ -984,11 +1002,11 @@ static struct sock *unix_find_other(struct net *net, static int unix_autobind(struct sock *sk) { + unsigned int new_hash, old_hash = sk->sk_hash; struct unix_sock *u = unix_sk(sk); struct unix_address *addr; unsigned int retries = 0; static u32 ordernum = 1; - unsigned int new_hash; int err; err = mutex_lock_interruptible(&u->bindlock); @@ -1012,12 +1030,13 @@ static int unix_autobind(struct sock *sk) offsetof(struct sockaddr_un, sun_path) + 1; new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); - spin_lock(&unix_table_lock); + unix_table_double_lock(old_hash, new_hash); ordernum = (ordernum+1)&0xFFFFF; if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len, new_hash)) { - spin_unlock(&unix_table_lock); + unix_table_double_unlock(old_hash, new_hash); + /* * __unix_find_socket_byname() may take long time if many names * are already in use. 
@@ -1033,7 +1052,7 @@ static int unix_autobind(struct sock *sk) } __unix_set_addr_hash(sk, addr, new_hash); - spin_unlock(&unix_table_lock); + unix_table_double_unlock(old_hash, new_hash); err = 0; out: mutex_unlock(&u->bindlock); @@ -1045,9 +1064,9 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, { umode_t mode = S_IFSOCK | (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask()); + unsigned int new_hash, old_hash = sk->sk_hash; struct unix_sock *u = unix_sk(sk); struct unix_address *addr; - unsigned int new_hash; struct dentry *dentry; struct path parent; int err; @@ -1085,11 +1104,11 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, goto out_unlock; new_hash = unix_bsd_hash(d_backing_inode(dentry)); - spin_lock(&unix_table_lock); + unix_table_double_lock(old_hash, new_hash); u->path.mnt = mntget(parent.mnt); u->path.dentry = dget(dentry); __unix_set_addr_hash(sk, addr, new_hash); - spin_unlock(&unix_table_lock); + unix_table_double_unlock(old_hash, new_hash); mutex_unlock(&u->bindlock); done_path_create(&parent, dentry); return 0; @@ -1110,9 +1129,9 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr, int addr_len) { + unsigned int new_hash, old_hash = sk->sk_hash; struct unix_sock *u = unix_sk(sk); struct unix_address *addr; - unsigned int new_hash; int err; addr = unix_create_addr(sunaddr, addr_len); @@ -1129,19 +1148,19 @@ static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr, } new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); - spin_lock(&unix_table_lock); + unix_table_double_lock(old_hash, new_hash); if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len, new_hash)) goto out_spin; __unix_set_addr_hash(sk, addr, new_hash); - spin_unlock(&unix_table_lock); + unix_table_double_unlock(old_hash, new_hash); mutex_unlock(&u->bindlock); return 0; out_spin: - spin_unlock(&unix_table_lock); + unix_table_double_unlock(old_hash, new_hash); err = -EADDRINUSE; out_mutex: mutex_unlock(&u->bindlock); @@ -1438,9 +1457,9 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, * * The contents of *(otheru->addr) and otheru->path * are seen fully set up here, since we have found - * otheru in hash under unix_table_lock. Insertion + * otheru in hash under unix_table_locks. Insertion * into the hash chain we'd found it in had been done - * in an earlier critical area protected by unix_table_lock, + * in an earlier critical area protected by unix_table_locks, * the same one where we'd set *(otheru->addr) contents, * as well as otheru->path and otheru->addr itself. 
* @@ -2871,7 +2890,7 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1) #define get_bucket(x) ((x) >> BUCKET_SPACE) -#define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1)) +#define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1)) #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o)) static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos) @@ -2895,7 +2914,7 @@ static struct sock *unix_next_socket(struct seq_file *seq, struct sock *sk, loff_t *pos) { - unsigned long bucket; + unsigned long bucket = get_bucket(*pos); while (sk > (struct sock *)SEQ_START_TOKEN) { sk = sk_next(sk); @@ -2906,12 +2925,13 @@ static struct sock *unix_next_socket(struct seq_file *seq, } do { + spin_lock(&unix_table_locks[bucket]); sk = unix_from_bucket(seq, pos); if (sk) return sk; next_bucket: - bucket = get_bucket(*pos) + 1; + spin_unlock(&unix_table_locks[bucket++]); *pos = set_bucket_offset(bucket, 1); } while (bucket < ARRAY_SIZE(unix_socket_table)); @@ -2919,10 +2939,7 @@ static struct sock *unix_next_socket(struct seq_file *seq, } static void *unix_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(unix_table_lock) { - spin_lock(&unix_table_lock); - if (!*pos) return SEQ_START_TOKEN; @@ -2939,9 +2956,11 @@ static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void unix_seq_stop(struct seq_file *seq, void *v) - __releases(unix_table_lock) { - spin_unlock(&unix_table_lock); + struct sock *sk = v; + + if (sk) + spin_unlock(&unix_table_locks[sk->sk_hash]); } static int unix_seq_show(struct seq_file *seq, void *v) @@ -2966,7 +2985,7 @@ static int unix_seq_show(struct seq_file *seq, void *v) (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING), sock_i_ino(s)); - if (u->addr) { // under unix_table_lock here + if (u->addr) { // under unix_table_locks here int i, len; seq_putc(seq, ' '); @@ -3038,10 +3057,13 @@ static struct pernet_operations unix_net_ops = { static int __init af_unix_init(void) { - int rc = -1; + int i, rc = -1; BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb)); + for (i = 0; i < 2 * UNIX_HASH_SIZE; i++) + spin_lock_init(&unix_table_locks[i]); + rc = proto_register(&unix_proto, 1); if (rc != 0) { pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); diff --git a/net/unix/diag.c b/net/unix/diag.c index d146fd1a95fa7..dce508042f90e 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -13,7 +13,7 @@ static int sk_diag_dump_name(struct sock *sk, struct sk_buff *nlskb) { - /* might or might not have unix_table_lock */ + /* might or might not have unix_table_locks */ struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr); if (!addr) @@ -207,13 +207,13 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) s_slot = cb->args[0]; num = s_num = cb->args[1]; - spin_lock(&unix_table_lock); for (slot = s_slot; slot < ARRAY_SIZE(unix_socket_table); s_num = 0, slot++) { struct sock *sk; num = 0; + spin_lock(&unix_table_locks[slot]); sk_for_each(sk, &unix_socket_table[slot]) { if (!net_eq(sock_net(sk), net)) continue; @@ -224,14 +224,16 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) if (sk_diag_dump(sk, skb, req, sk_user_ns(skb->sk), NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - NLM_F_MULTI) < 0) + NLM_F_MULTI) < 0) { + spin_unlock(&unix_table_locks[slot]); goto done; + } next: num++; } + spin_unlock(&unix_table_locks[slot]); } done: - 
spin_unlock(&unix_table_lock); cb->args[0] = slot; cb->args[1] = num; @@ -240,21 +242,19 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) static struct sock *unix_lookup_by_ino(unsigned int ino) { - int i; struct sock *sk; + int i; - spin_lock(&unix_table_lock); for (i = 0; i < ARRAY_SIZE(unix_socket_table); i++) { + spin_lock(&unix_table_locks[i]); sk_for_each(sk, &unix_socket_table[i]) if (ino == sock_i_ino(sk)) { sock_hold(sk); - spin_unlock(&unix_table_lock); - + spin_unlock(&unix_table_locks[i]); return sk; } + spin_unlock(&unix_table_locks[i]); } - - spin_unlock(&unix_table_lock); return NULL; } From ffc731a8893fc8463cc633470fa28a1eef0cad79 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 24 Nov 2021 11:14:31 +0900 Subject: [PATCH 50/61] af_unix: Relax race in unix_autobind(). When we bind an AF_UNIX socket without a name specified, the kernel selects an available one from 0x00000 to 0xFFFFF. unix_autobind() starts searching from a number in the 'static' variable and increments it after acquiring two locks. If multiple processes try autobind, they obtain the same lock and check if a socket in the hash list has the same name. If not, one process uses it, and all except one end up retrying the _next_ number (actually not, it may be incremented by the other processes). The more we autobind sockets in parallel, the longer the latency gets. We can avoid such a race by searching for a name from a random number. These show latency in unix_autobind() while 64 CPUs are simultaneously autobind-ing 1024 sockets for each. Without this patch: usec : count distribution 0 : 1176 |*** | 2 : 3655 |*********** | 4 : 4094 |************* | 6 : 3831 |************ | 8 : 3829 |************ | 10 : 3844 |************ | 12 : 3638 |*********** | 14 : 2992 |********* | 16 : 2485 |******* | 18 : 2230 |******* | 20 : 2095 |****** | 22 : 1853 |***** | 24 : 1827 |***** | 26 : 1677 |***** | 28 : 1473 |**** | 30 : 1573 |***** | 32 : 1417 |**** | 34 : 1385 |**** | 36 : 1345 |**** | 38 : 1344 |**** | 40 : 1200 |*** | With this patch: usec : count distribution 0 : 1855 |****** | 2 : 6464 |********************* | 4 : 9936 |******************************** | 6 : 12107 |****************************************| 8 : 10441 |********************************** | 10 : 7264 |*********************** | 12 : 4254 |************** | 14 : 2538 |******** | 16 : 1596 |***** | 18 : 1088 |*** | 20 : 800 |** | 22 : 670 |** | 24 : 601 |* | 26 : 562 |* | 28 : 525 |* | 30 : 446 |* | 32 : 378 |* | 34 : 337 |* | 36 : 317 |* | 38 : 314 |* | 40 : 298 | | Signed-off-by: Kuniyuki Iwashima Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index ecd2019477029..61ae3cf03b234 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1005,8 +1005,7 @@ static int unix_autobind(struct sock *sk) unsigned int new_hash, old_hash = sk->sk_hash; struct unix_sock *u = unix_sk(sk); struct unix_address *addr; - unsigned int retries = 0; - static u32 ordernum = 1; + u32 lastnum, ordernum; int err; err = mutex_lock_interruptible(&u->bindlock); @@ -1022,32 +1021,35 @@ static int unix_autobind(struct sock *sk) if (!addr) goto out; + addr->len = offsetof(struct sockaddr_un, sun_path) + 6; addr->name->sun_family = AF_UNIX; refcount_set(&addr->refcnt, 1); + ordernum = prandom_u32(); + lastnum = ordernum & 0xFFFFF; retry: - addr->len = sprintf(addr->name->sun_path + 1, "%05x", ordernum) + - 
offsetof(struct sockaddr_un, sun_path) + 1; + ordernum = (ordernum + 1) & 0xFFFFF; + sprintf(addr->name->sun_path + 1, "%05x", ordernum); new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); unix_table_double_lock(old_hash, new_hash); - ordernum = (ordernum+1)&0xFFFFF; if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len, new_hash)) { unix_table_double_unlock(old_hash, new_hash); - /* - * __unix_find_socket_byname() may take long time if many names + /* __unix_find_socket_byname() may take long time if many names * are already in use. */ cond_resched(); - /* Give up if all names seems to be in use. */ - if (retries++ == 0xFFFFF) { + + if (ordernum == lastnum) { + /* Give up if all names seems to be in use. */ err = -ENOSPC; - kfree(addr); + unix_release_addr(addr); goto out; } + goto retry; } From 7fe8c5e7e5f3eb7dc54d38153b4aad65ee4ca69f Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 6 Jan 2017 15:34:09 +0000 Subject: [PATCH 51/61] ipv4/tcp: allow the memory tuning for tcp to go a little bigger than default Signed-off-by: Diab Neiroukh --- net/ipv4/tcp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 686878b3b0570..718a35ea574f9 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -4344,8 +4344,8 @@ void __init tcp_init(void) tcp_init_mem(); /* Set per-socket limits to no more than 1/128 the pressure threshold */ limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7); - max_wshare = min(4UL*1024*1024, limit); - max_rshare = min(6UL*1024*1024, limit); + max_wshare = min(16UL*1024*1024, limit); + max_rshare = min(16UL*1024*1024, limit); init_net.ipv4.sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024; From 1c251e397b730aab04e8033f5af1e4af06b4cf2c Mon Sep 17 00:00:00 2001 From: Panchajanya1999 Date: Thu, 29 Sep 2022 12:47:41 +0530 Subject: [PATCH 52/61] ipv4/tcp: Force applications to use TCP_NODELAY to improve network latency `TCP_NODELAY` will disable Nagle's algorithm, which basically collects small outgoing packets to send all at once. Thus `TCP_NODELAY` will send the data whenever it is available without waiting for any period to collect packets. It will basically optimize network video games and applications using chatty protocols (source: ExtraHop[1], RedHat[2]) [1]: https://www.extrahop.com/company/blog/2016/tcp-nodelay-nagle-quickack-best-practices/#:~:text=The%20TCP_NODELAY%20socket%20option%20allows,buffer%2C%20whatever%20the%20packet%20size. [2]: https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux_for_real_time/7/html/tuning_guide/tcp_nodelay_and_small_buffer_writes Signed-off-by: Panchajanya1999 [Rasenkai: forward-port to 5.4] Signed-off-by: Rasenkai Change-Id: I2518d04969975c69c61d5e0acab7295702190b0d --- net/ipv4/tcp.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 718a35ea574f9..c6260df04f3e5 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3238,6 +3238,9 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname, lock_sock(sk); + /* Hack optname to use TCP_NODELAY for everything */ + optname=TCP_NODELAY; + switch (optname) { case TCP_MAXSEG: /* Values greater than interface MTU won't take effect. 
However From 789db66a32121c32717b6b92334a8f3d25814c22 Mon Sep 17 00:00:00 2001 From: Panchajanya1999 Date: Thu, 29 Sep 2022 12:51:27 +0530 Subject: [PATCH 53/61] net: ipv4: Reduce TCP performance spikes Turning off TCP timestamps reduces performance spikes related to timestamp generation. Making the node read-only prevents any unwanted writes to the node. This tuning is suggested by RedHat[1]. [1] https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux_for_real_time/7/html/tuning_guide/reduce_tcp_performance_spikes Signed-off-by: Panchajanya1999 [Rasenkai: forward-port to 5.4] Signed-off-by: Rasenkai Change-Id: I082f414017a58d58e89d9d57f2fdb73aa2a6c224 --- net/ipv4/sysctl_net_ipv4.c | 2 +- net/ipv4/tcp_ipv4.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index de268ebb49866..9855f602d439a 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1035,7 +1035,7 @@ static struct ctl_table ipv4_net_table[] = { .procname = "tcp_timestamps", .data = &init_net.ipv4.sysctl_tcp_timestamps, .maxlen = sizeof(int), - .mode = 0644, + .mode = 0444, .proc_handler = proc_dointvec }, { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 37ef27244d496..cb4fbad48a6fa 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2915,7 +2915,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128); net->ipv4.sysctl_tcp_sack = 1; net->ipv4.sysctl_tcp_window_scaling = 1; - net->ipv4.sysctl_tcp_timestamps = 1; + net->ipv4.sysctl_tcp_timestamps = 0; net->ipv4.sysctl_tcp_early_retrans = 3; net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ From 1afa5804bb86927cb1cec51b2996cc954bac99f9 Mon Sep 17 00:00:00 2001 From: Panchajanya1999 Date: Thu, 29 Sep 2022 12:54:39 +0530 Subject: [PATCH 54/61] netdev: Increase the size of the receive queue The received frames will be stored in this queue after taking them from the ring buffer on the network card. Increasing this value for high speed cards may help prevent losing packets.
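Before relying on a larger backlog, it is worth confirming that the input queue is actually overflowing. One way to do that is sketched below; it assumes the usual /proc/net/softnet_stat layout, where each line is one CPU and the first two hex columns are packets processed and packets dropped because the backlog was full:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/net/softnet_stat", "r");
	char line[512];
	int cpu = 0;

	if (!f) {
		perror("/proc/net/softnet_stat");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		unsigned int processed, dropped;

		if (sscanf(line, "%x %x", &processed, &dropped) == 2)
			printf("cpu%d: processed=%u dropped=%u\n", cpu, processed, dropped);
		cpu++;
	}
	fclose(f);
	return 0;
}

If the dropped column stays at zero under load, raising netdev_max_backlog further is unlikely to make a measurable difference.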
Signed-off-by: Panchajanya1999 [Rasenkai: forward-port to 5.4] Signed-off-by: Rasenkai Change-Id: I756dc3850fa9da288ceabd149a95bcb9239042f4 --- net/core/dev.c | 2 +- net/core/sysctl_net_core.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 6f9a66e57b252..e1f36298e99fa 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4274,7 +4274,7 @@ EXPORT_SYMBOL(__dev_direct_xmit); * Receiver routines *************************************************************************/ -int netdev_max_backlog __read_mostly = 1000; +int netdev_max_backlog __read_mostly = 16384; EXPORT_SYMBOL(netdev_max_backlog); int netdev_tstamp_prequeue __read_mostly = 1; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 0dfe9f255ab3a..790f116e65d74 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -370,7 +370,7 @@ static struct ctl_table net_core_table[] = { .procname = "netdev_max_backlog", .data = &netdev_max_backlog, .maxlen = sizeof(int), - .mode = 0644, + .mode = 0444, .proc_handler = proc_dointvec }, { From 72f09a2ab22b6dc6349e783ca40f38e1f781b523 Mon Sep 17 00:00:00 2001 From: Panchajanya1999 Date: Thu, 29 Sep 2022 12:56:41 +0530 Subject: [PATCH 55/61] tcp: Allow TCP Fast Open for both incoming & outgoing connections The proc(RO) "tcp_fastopen" reads the value of "sysctl_tcp_fastopen", which is set to "TFO_CLIENT_ENABLE" in tcp_fastopen.c Signed-off-by: Panchajanya1999 [Rasenkai: forward-port to 5.4] Signed-off-by: Rasenkai Change-Id: I33482a87c8f3748a6e0b2f64588effa679bcc922 --- include/net/tcp.h | 2 +- net/ipv4/sysctl_net_ipv4.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index f4bd23459b986..d399228b8bda7 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -232,7 +232,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); #define TCP_INIT_CWND 10 /* Bit Flags for sysctl_tcp_fastopen */ -#define TFO_CLIENT_ENABLE 1 +#define TFO_CLIENT_ENABLE 3 #define TFO_SERVER_ENABLE 2 #define TFO_CLIENT_NO_COOKIE 4 /* Data in SYN w/o cookie option */ diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 9855f602d439a..19d25a677a349 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -956,7 +956,7 @@ static struct ctl_table ipv4_net_table[] = { .procname = "tcp_fastopen", .data = &init_net.ipv4.sysctl_tcp_fastopen, .maxlen = sizeof(int), - .mode = 0644, + .mode = 0444, .proc_handler = proc_dointvec, }, { From 661888ad8851dbd0fcf5278c78d91b14053bfcf7 Mon Sep 17 00:00:00 2001 From: Tyler Nijmeh Date: Mon, 17 Oct 2022 16:11:16 +0530 Subject: [PATCH 56/61] tcp: Enable Explicit Congestion Control for everything Incoming and outgoing connections both request ECN Signed-off-by: Tyler Nijmeh Signed-off-by: Rasenkai Change-Id: I0c21c926e083fb9d98cc99d70ea01fd78f475db3 --- net/ipv4/tcp_ipv4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index cb4fbad48a6fa..d49741248b1ad 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2883,7 +2883,7 @@ static int __net_init tcp_sk_init(struct net *net) *per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk; } - net->ipv4.sysctl_tcp_ecn = 2; + net->ipv4.sysctl_tcp_ecn = 1; net->ipv4.sysctl_tcp_ecn_fallback = 1; net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; From e81f3b5c8d413d959ca4496821dc2043f79ea56b Mon Sep 17 00:00:00 2001 From: Panchajanya1999 Date: Mon, 17 Jan 2022 15:47:53 +0530 Subject: [PATCH 57/61] 
ipv4/tcp_output: Disable tcp_slow_start_after_idle tcp_slow_start_after_idle sets whether TCP should start at the default window size only for new connections or also for existing connections that have been idle for too long. Enabling this setting kills persistent single connection performance. Change-Id: Ied57fbba354fb71803636cd7fb9a98fe0304bfe5 Signed-off-by: Panchajanya1999 Signed-off-by: Panchajanya1999 (cherry picked from commit c9c256e3d343300bbd04498842972e47a2b41ffc) (cherry picked from commit 92100d0a925ec87b1e2f8d79f236f7075baf509a) --- net/ipv4/tcp_ipv4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index d49741248b1ad..f03826e231779 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2918,7 +2918,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_timestamps = 0; net->ipv4.sysctl_tcp_early_retrans = 3; net->ipv4.sysctl_tcp_recovery = TCP_RACK_LOSS_DETECTION; - net->ipv4.sysctl_tcp_slow_start_after_idle = 1; /* By default, RFC2861 behavior. */ + net->ipv4.sysctl_tcp_slow_start_after_idle = 0; /* By default, RFC2861 behavior. */ net->ipv4.sysctl_tcp_retrans_collapse = 1; net->ipv4.sysctl_tcp_max_reordering = 300; net->ipv4.sysctl_tcp_dsack = 1; From 4896113a5286c65c46bc341b776882e85165fb83 Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Fri, 2 Aug 2019 00:33:26 -0700 Subject: [PATCH 58/61] tcp: Enable support for TCP fast open when serving Signed-off-by: Danny Lin Signed-off-by: Adithya R --- net/ipv4/tcp_ipv4.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index f03826e231779..1acbf7a2c69ba 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2952,7 +2952,8 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC; net->ipv4.sysctl_tcp_comp_sack_nr = 44; - net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; + net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE | TFO_SERVER_ENABLE | + TFO_SERVER_WO_SOCKOPT1; spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock); net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0; atomic_set(&net->ipv4.tfo_active_disable_times, 0); From ef7217f8ec8f54c7ebdc8cb5eb03a1c92307120f Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Thu, 13 Dec 2018 01:00:49 +0000 Subject: [PATCH 59/61] kernel: do accept() in LIFO order for cache efficiency Signed-off-by: Diab Neiroukh Change-Id: Idbd799e7a81dff99b686d87f342f39983de9a6e1 --- include/linux/wait.h | 2 ++ kernel/sched/wait.c | 24 ++++++++++++++++++++++++ net/ipv4/inet_connection_sock.c | 2 +- 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/include/linux/wait.h b/include/linux/wait.h index e9966f3929f60..ea3ba5d9b4780 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -164,6 +164,7 @@ static inline bool wq_has_sleeper(struct wait_queue_head *wq_head) extern void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); extern void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); +extern void add_wait_queue_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); extern void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); static inline void __add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) @@ -1155,6 +1156,7 @@ do { \ */ void 
prepare_to_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); bool prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); +void prepare_to_wait_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout); diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index c4f324ad035c6..b179580660a6b 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -38,6 +38,17 @@ void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue } EXPORT_SYMBOL(add_wait_queue_exclusive); +void add_wait_queue_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) +{ + unsigned long flags; + + wq_entry->flags |= WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&wq_head->lock, flags); + __add_wait_queue(wq_head, wq_entry); + spin_unlock_irqrestore(&wq_head->lock, flags); +} +EXPORT_SYMBOL(add_wait_queue_exclusive_lifo); + void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { unsigned long flags; @@ -279,6 +290,19 @@ prepare_to_wait_exclusive(struct wait_queue_head *wq_head, struct wait_queue_ent } EXPORT_SYMBOL(prepare_to_wait_exclusive); +void prepare_to_wait_exclusive_lifo(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state) +{ + unsigned long flags; + + wq_entry->flags |= WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&wq_head->lock, flags); + if (list_empty(&wq_entry->entry)) + __add_wait_queue(wq_head, wq_entry); + set_current_state(state); + spin_unlock_irqrestore(&wq_head->lock, flags); +} +EXPORT_SYMBOL(prepare_to_wait_exclusive_lifo); + void init_wait_entry(struct wait_queue_entry *wq_entry, int flags) { wq_entry->flags = flags; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index ae91335d1f271..82d5c38397acd 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -437,7 +437,7 @@ static int inet_csk_wait_for_connect(struct sock *sk, long timeo) * having to remove and re-insert us on the wait queue. 
*/ for (;;) { - prepare_to_wait_exclusive(sk_sleep(sk), &wait, + prepare_to_wait_exclusive_lifo(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); release_sock(sk); if (reqsk_queue_empty(&icsk->icsk_accept_queue)) From 927d6e2a42727c32ea357de5368d18143c41ddd7 Mon Sep 17 00:00:00 2001 From: Vaisakh Murali Date: Tue, 9 Aug 2022 05:29:50 +0000 Subject: [PATCH 60/61] net/bluetooth: Queue delayed work on power efficient wq Signed-off-by: Vaisakh Murali --- net/bluetooth/6lowpan.c | 3 ++- net/bluetooth/hci_core.c | 4 ++-- net/bluetooth/l2cap_core.c | 6 ++++-- net/bluetooth/sco.c | 3 ++- net/bluetooth/smp.c | 3 ++- 5 files changed, 12 insertions(+), 7 deletions(-) diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 7601ce9143c18..bb1e7a2886d1a 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -684,7 +684,8 @@ static struct l2cap_chan *add_peer_chan(struct l2cap_chan *chan, /* Notifying peers about us needs to be done without locks held */ if (new_netdev) INIT_DELAYED_WORK(&dev->notify_peers, do_notify_peers); - schedule_delayed_work(&dev->notify_peers, msecs_to_jiffies(100)); + queue_delayed_work(system_power_efficient_wq, + &dev->notify_peers, msecs_to_jiffies(100)); return peer->chan; } diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index e40eae2752820..1a3a749dfb4a5 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -5001,8 +5001,8 @@ static void hci_cmd_work(struct work_struct *work) if (test_bit(HCI_RESET, &hdev->flags)) cancel_delayed_work(&hdev->cmd_timer); else - schedule_delayed_work(&hdev->cmd_timer, - HCI_CMD_TIMEOUT); + queue_delayed_work(system_power_efficient_wq, + &hdev->cmd_timer, HCI_CMD_TIMEOUT); } else { skb_queue_head(&hdev->cmd_q, skb); queue_work(hdev->workqueue, &hdev->cmd_work); diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index daffb13e43f9c..8f4c2142ac0b1 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -1532,7 +1532,8 @@ static void l2cap_request_info(struct l2cap_conn *conn) conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_SENT; conn->info_ident = l2cap_get_ident(conn); - schedule_delayed_work(&conn->info_timer, L2CAP_INFO_TIMEOUT); + queue_delayed_work(system_power_efficient_wq, + &conn->info_timer, L2CAP_INFO_TIMEOUT); l2cap_send_cmd(conn, conn->info_ident, L2CAP_INFO_REQ, sizeof(req), &req); @@ -4251,7 +4252,8 @@ static struct l2cap_chan *l2cap_connect(struct l2cap_conn *conn, conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_SENT; conn->info_ident = l2cap_get_ident(conn); - schedule_delayed_work(&conn->info_timer, L2CAP_INFO_TIMEOUT); + queue_delayed_work(system_power_efficient_wq, + &conn->info_timer, L2CAP_INFO_TIMEOUT); l2cap_send_cmd(conn, conn->info_ident, L2CAP_INFO_REQ, sizeof(info), &info); diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index ae788d3e0c53a..4dba7f8d33fa0 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -112,7 +112,8 @@ static void sco_sock_set_timer(struct sock *sk, long timeout) BT_DBG("sock %p state %d timeout %ld", sk, sk->sk_state, timeout); cancel_delayed_work(&sco_pi(sk)->conn->timeout_work); - schedule_delayed_work(&sco_pi(sk)->conn->timeout_work, timeout); + queue_delayed_work(system_power_efficient_wq, + &sco_pi(sk)->conn->timeout_work, timeout); } static void sco_sock_clear_timer(struct sock *sk) diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 8f9566f37498e..360f234b94180 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -616,7 +616,8 @@ static void smp_send_cmd(struct 
l2cap_conn *conn, u8 code, u16 len, void *data) smp = chan->data; cancel_delayed_work_sync(&smp->security_timer); - schedule_delayed_work(&smp->security_timer, SMP_TIMEOUT); + queue_delayed_work(system_power_efficient_wq, + &smp->security_timer, SMP_TIMEOUT); } static u8 authreq_to_seclevel(u8 authreq) From 9e96a402c9da85eb343f4b6d568b638c7b7875b3 Mon Sep 17 00:00:00 2001 From: Danny Lin Date: Tue, 2 Mar 2021 11:39:19 +0530 Subject: [PATCH 61/61] arm64: rockchip_linux: Switch to fq_codel queue discipline | pfifo_fast | fq_codel | cake + ack-filter | cake + ack-filter-aggressive ----------------+------------+----------+-------------------+----------------------------- Download (mbps) | 144.67 | 171.33 | 162.33 | 149.33 Upload (mbps) | 6.79 | 7.25 | 6.92 | 6.94 Speeds averaged across 3 runs of the official Speedtest.net app for Android. The same server was selected manually on each run. Wi-Fi setup: 802.11ac, 5 GHz, VHT80, channel 108 (DFS) Full result spreadsheet: https://docs.google.com/spreadsheets/d/1YzZJkxgij-KARvgwpKd98UVEdgbAynWLq-Y3lxRLkBg/edit Change-Id: I80a0a74c11d91906b30171b2b1451150fdc19824 Signed-off-by: Adithya R Signed-off-by: Divyanshu-Modi Signed-off-by: Rasenkai --- arch/arm64/configs/rockchip_linux_defconfig | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/arm64/configs/rockchip_linux_defconfig b/arch/arm64/configs/rockchip_linux_defconfig index 6b517c223b5c3..1e3312e3db0cf 100644 --- a/arch/arm64/configs/rockchip_linux_defconfig +++ b/arch/arm64/configs/rockchip_linux_defconfig @@ -896,3 +896,12 @@ CONFIG_TCP_CONG_BBR=y # CONFIG_DEFAULT_CUBIC is not set CONFIG_DEFAULT_BBR=y CONFIG_DEFAULT_TCP_CONG="bbr" +# CONFIG_NET_SCH_MULTIQ is not set +# CONFIG_NET_SCH_SFQ is not set +# CONFIG_NET_SCH_TBF is not set +# CONFIG_NET_SCH_NETEM is not set +# CONFIG_NET_SCH_CODEL is not set +CONFIG_NET_SCH_DEFAULT=y +CONFIG_DEFAULT_FQ_CODEL=y +CONFIG_DEFAULT_NET_SCH="fq_codel" +# CONFIG_NET_SCH_FQ is not set
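As a closing sanity check for the defconfig changes above, the chosen defaults can be verified on a booted system straight from procfs. A minimal sketch, assuming the standard sysctl paths; the expected values follow from CONFIG_DEFAULT_FQ_CODEL and CONFIG_DEFAULT_BBR in this defconfig:

#include <stdio.h>

static void show(const char *path)
{
	char buf[64] = "";
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("%-48s %s", path, buf);	/* value already ends with '\n' */
	fclose(f);
}

int main(void)
{
	show("/proc/sys/net/core/default_qdisc");		/* expect "fq_codel" */
	show("/proc/sys/net/ipv4/tcp_congestion_control");	/* expect "bbr" */
	return 0;
}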