diff --git a/.github/workflows/kernel.yml b/.github/workflows/kernel.yml index 6111227077182..1c6d3320a606a 100644 --- a/.github/workflows/kernel.yml +++ b/.github/workflows/kernel.yml @@ -59,11 +59,11 @@ jobs: path: debian_build release: - name: Release build artifacts for the testing branch + name: Release build artifacts for the branch runs-on: ubuntu-20.04 needs: build permissions: write-all - if: github.ref == 'refs/heads/testing' + if: ${{ github.ref == 'refs/heads/testing' || github.ref == 'refs/heads/ratebase' || github.ref == 'refs/heads/AccECN-2023'}} steps: - name: Get artifact uses: actions/download-artifact@v3 @@ -74,15 +74,19 @@ jobs: wget https://github.com/L4STeam/iproute2/releases/download/master-build/iproute2-l4s.zip unzip iproute2-l4s mv -t . iproute2-l4s/*.deb + - name: Extract branch name + shell: bash + run: echo "branch=${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}" >> $GITHUB_OUTPUT + id: extract_branch - name: Zip artifacts run: | mkdir debian_build mv *.deb debian_build - zip -r l4s-testing.zip debian_build + zip -r l4s-${{ steps.extract_branch.outputs.branch }}.zip debian_build - name: Release tip build uses: pyTooling/Actions/releaser@main with: token: ${{ secrets.GITHUB_TOKEN }} - tag: testing-build + tag: ${{ steps.extract_branch.outputs.branch }}-build files: | - l4s-testing.zip + l4s-${{ steps.extract_branch.outputs.branch }}.zip diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 822f350a181be..88c315841f3d6 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -387,27 +387,28 @@ tcp_early_retrans - INTEGER tcp_ecn - INTEGER Control use of Explicit Congestion Notification (ECN) by TCP. - ECN is used only when both ends of the TCP connection indicate - support for it. This feature is useful in avoiding losses due - to congestion by allowing supporting routers to signal - congestion before having to drop packets. 
The highest variant - of ECN feedback that both peers support is chosen by the ECN - negotiation (Accurate ECN, ECN, or no ECN). + ECN is used only when both ends of the TCP connection indicate support + for it. This feature is useful in avoiding losses due to congestion by + allowing supporting routers to signal congestion before having to drop + packets. A host that supports ECN both sends ECN at the IP layer and + feeds back ECN at the TCP layer. The highest variant of ECN feedback + that both peers support is chosen by the ECN negotiation (Accurate ECN, + ECN, or no ECN). The highest negotiated variant for incoming connection requests and the highest variant requested by outgoing connection attempts: - = ==================== ==================== - Incoming connections Outgoing connections - = ==================== ==================== - 0 No ECN No ECN - 1 ECN ECN - 2 ECN No ECN - 3 AccECN AccECN - 4 AccECN ECN - 5 AccECN No ECN - = ==================== ==================== + ===== ==================== ==================== + Value Incoming connections Outgoing connections + ===== ==================== ==================== + 0 No ECN No ECN + 1 ECN ECN + 2 ECN No ECN + 3 AccECN AccECN + 4 AccECN ECN + 5 AccECN No ECN + ===== ==================== ==================== Default: 2 @@ -430,6 +431,12 @@ tcp_ecn_option - INTEGER Default: 2 +tcp_ecn_option_beacon - INTEGER + Control Accurate ECN (AccECN) option sending frequency per RTT and it + takes effect only when tcp_ecn_option is set to 2. + + Default: 1 (AccECN will be send at least 1 time per RTT) + tcp_ecn_fallback - BOOLEAN If the kernel detects that ECN connection misbehaves, enable fall back to non-ECN. 
Currently, this knob implements the fallback diff --git a/include/linux/tcp.h b/include/linux/tcp.h index d20f31b53a984..02e36e8ec023b 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -210,6 +210,7 @@ struct tcp_sock { u32 snd_wnd; /* The window we expect to receive */ u32 max_window; /* Maximal window ever seen from peer */ u32 mss_cache; /* Cached effective mss, not including SACKS */ + bool mss_cache_set_by_ca; u32 window_clamp; /* Maximal window to advertise */ u32 rcv_ssthresh; /* Current window clamp */ @@ -233,6 +234,9 @@ struct tcp_sock { syn_ect_snt:2, /* AccECN ECT memory, only */ syn_ect_rcv:2, /* ... needed durign 3WHS + first seqno */ ecn_fail:1; /* ECN reflector detected path mangling */ + u8 accecn_no_respond:1, /* AccECN no response on feedback */ + accecn_no_options:1, /* AccECN no options send out */ + first_data_ack:1; /* Check for first data ack */ u8 saw_accecn_opt:2, /* An AccECN option was seen */ fast_ack_mode:2, /* which fast ack mode ? */ unused:4; diff --git a/include/net/request_sock.h b/include/net/request_sock.h index 29e41ff3ec933..d382e540f3298 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -62,7 +62,8 @@ struct request_sock { u16 mss; u8 num_retrans; /* number of retransmits */ u8 syncookie:1; /* syncookie: encode tcpopts in timestamp */ - u8 num_timeout:7; /* number of timeouts */ + u8 num_timeout:7, + is_rtx:1; /* number of timeouts */ u32 ts_recent; struct timer_list rsk_timer; const struct request_sock_ops *rsk_ops; @@ -105,6 +106,7 @@ reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener, sk_tx_queue_clear(req_to_sk(req)); req->saved_syn = NULL; req->num_timeout = 0; + req->is_rtx = 0; req->num_retrans = 0; req->sk = NULL; refcount_set(&req->rsk_refcnt, 0); diff --git a/include/net/tcp.h b/include/net/tcp.h index 754aa34bd3071..6745b64ca7051 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -228,7 +228,6 @@ void tcp_time_wait(struct sock *sk, int state, int 
timeo); #define TCP_ACCECN_MAXSIZE (TCPOLEN_ACCECN_BASE + \ TCPOLEN_ACCECN_PERFIELD * \ TCP_ACCECN_NUMFIELDS) -#define TCP_ACCECN_BEACON_FREQ_SHIFT 2 /* Send option at least 2^2 times per RTT */ #define TCP_ACCECN_SAFETY_SHIFT 1 /* SAFETY_FACTOR in accecn draft */ /* tp->saw_accecn_opt states */ diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index a51c4f735ad11..2726b6cc65875 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -236,11 +236,10 @@ struct tcp_bbr_info { struct tcp_prague_info { __u64 prague_alpha; - __u64 prague_ai_ack_increase; __u64 prague_frac_cwnd; + __u64 prague_rate_bytes; __u32 prague_max_burst; __u32 prague_round; - __u32 prague_rtt_indep; __u32 prague_rtt_target; bool prague_enabled; }; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index a53f9bf7886f0..9d20f2456cf2f 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -688,6 +688,7 @@ static void syn_ack_recalc(struct request_sock *req, int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req) { + req->is_rtx = 1; int err = req->rsk_ops->rtx_syn_ack(parent, req); if (!err) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9721d7f0db9b9..1000e9b17c450 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -447,6 +447,7 @@ void tcp_init_sock(struct sock *sk) tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tp->snd_cwnd_clamp = ~0; tp->mss_cache = TCP_MSS_DEFAULT; + tp->mss_cache_set_by_ca = false; tp->reordering = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering); tcp_assign_congestion_control(sk); @@ -3033,6 +3034,9 @@ int tcp_disconnect(struct sock *sk, int flags) tp->delivered_ce = 0; tp->saw_accecn_opt = 0; tp->ecn_fail = 0; + tp->accecn_no_respond = 0; + tp->accecn_no_options = 0; + tp->first_data_ack = 0; tcp_accecn_init_counters(tp); tp->prev_ecnfield = 0; tp->accecn_opt_tstamp = 0; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 
c561a28c8c9dc..c2c72a2688efd 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -444,11 +444,31 @@ static void tcp_ecn_rcv_synack(struct sock *sk, const struct sk_buff *skb, tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); break; case 0x1: - case 0x5: if (tcp_ca_no_fallback_rfc3168(sk)) - tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); - else if (tcp_ecn_mode_pending(tp)) - tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); + tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); + else + tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); + break; + /* [CY] 3.1.2. Backward Compatibility - If a TCP Client has sent a SYN requesting AccECN feedback with (AE,CWR,ECE) = + * (1,1,1) then receives a SYN/ACK with the currently reserved combination (AE,CWR,ECE) = (1,0,1) but it does not + * have logic specific to such a combination, the Client MUST enable AccECN mode as if the SYN/ACK confirmed that the + * Server supported AccECN and as if it fed back that the IP-ECN field on the SYN had arrived unchanged. + */ + case 0x5: + if (tcp_ecn_mode_pending(tp)) { + tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); + tp->syn_ect_rcv = ip_dsfield & INET_ECN_MASK; + if (tp->rx_opt.accecn && + tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { + tp->saw_accecn_opt = tcp_accecn_option_init(skb, + tp->rx_opt.accecn); + tp->accecn_opt_demand = 2; + } + if (INET_ECN_is_ce(ip_dsfield)) { + tp->received_ce++; + tp->received_ce_pending++; + } + } break; default: tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); @@ -575,7 +595,7 @@ static bool tcp_accecn_process_option(struct tcp_sock *tp, bool order1, res; unsigned int i; - if (tp->saw_accecn_opt == TCP_ACCECN_OPT_FAIL) + if (tp->saw_accecn_opt == TCP_ACCECN_OPT_FAIL || tp->accecn_no_respond) return false; if (!(flag & FLAG_SLOWPATH) || !tp->rx_opt.accecn) { @@ -683,6 +703,22 @@ static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, if (flag & FLAG_SYN_ACKED) return 0; + /* [CY] 3.2.2.4. 
Testing for Zeroing of the ACE Field - If AccECN has been successfully negotiated, the Data Sender + * MAY check the value of the ACE counter in the first feedback packet (with or without data) that arrives after the + * 3-way handshake. If the value of this ACE field is found to be zero (0b000), for the remainder of the half- + * connection the Data Sender ought to send non-ECN-capable packets and it is advised not to respond to any feedback + * of CE markings. + */ + if (!tp->first_data_ack) { + tp->first_data_ack = 1; + if (tcp_accecn_ace(tcp_hdr(skb)) == 0x0) { + tp->ecn_fail = 1; + INET_ECN_dontxmit(sk); + tp->accecn_no_respond = 1; + return 0; + } + } + if (tp->received_ce_pending >= TCP_ACCECN_ACE_MAX_DELTA) inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; @@ -4873,8 +4909,18 @@ static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb) * DSACK state and change the txhash to re-route speculatively. */ if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq && - sk_rethink_txhash(sk)) + sk_rethink_txhash(sk)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDUPLICATEDATAREHASH); + /* [CY] 3.2.3.2.2. Testing for Loss of Packets Carrying the AccECN Option - If a middlebox is dropping + * packets with options it does not recognize, a host that is sending little or no data but mostly pure + * ACKs will not inherently detect such losses. Such a host MAY detect loss of ACKs carrying the AccECN + * Option by detecting whether the acknowledged data always reappears as a retransmission. In such cases, + * the host SHOULD disable the sending of the AccECN Option for this half-connection. + */ + if (tcp_ecn_mode_accecn(tcp_sk(sk))) + tcp_sk(sk)->accecn_no_options = 1; + + } } static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb) @@ -6215,6 +6261,11 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, if (th->syn) { if (tcp_ecn_mode_accecn(tp)) { send_accecn_reflector = true; + /* [CY] 3.1.5. 
Implications of AccECN Mode - A host in AccECN mode that is feeding back the IP-ECN + * field on a SYN or SYN/ACK: MUST feed back the IP-ECN field on the latest valid SYN or acceptable + * SYN/ACK to arrive.” + */ + tp->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; if (tp->rx_opt.accecn && tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { tp->saw_accecn_opt = tcp_accecn_option_init(skb, diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index faab5a4869dbc..bb1e70a5e5ec2 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3142,7 +3142,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_ecn = 2; net->ipv4.sysctl_tcp_ecn_option = 2; - net->ipv4.sysctl_tcp_ecn_option_beacon = 1; + net->ipv4.sysctl_tcp_ecn_option_beacon = 3; net->ipv4.sysctl_tcp_ecn_fallback = 1; net->ipv4.sysctl_tcp_ecn_unsafe_cep = 0; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 0f54f94e30b5b..ba104fb82c7d5 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -405,7 +405,13 @@ void tcp_accecn_third_ack(struct sock *sk, const struct sk_buff *skb, switch (ace) { case 0x0: + /* [CY] 3.2.2.1. ACE Field on the ACK of the SYN/ACK - If the Server is in AccECN mode and in SYN-RCVD + * state, and if it receives a value of zero on a pure ACK with SYN=0 and no SACK blocks, for the rest + * of the connection the Server MUST NOT set ECT on outgoing packets and MUST NOT respond to AccECN + * feedback. Nonetheless, as a Data Receiver it MUST NOT disable AccECN feedback. + */ tp->ecn_fail = 1; + tp->accecn_no_respond = 1; break; case 0x7: case 0x5: @@ -432,6 +438,10 @@ static void tcp_ecn_openreq_child(struct sock *sk, const struct tcp_request_sock *treq = tcp_rsk(req); struct tcp_sock *tp = tcp_sk(sk); + /* [CY] 3.1.5. 
Implications of AccECN Mode - A TCP Server in AccECN mode: MUST NOT set ECT on + * any packet for the rest of the connection, if it has received or sent at least one valid + * SYN or Acceptable SYN/ACK with (AE,CWR,ECE) = (0,0,0) during the handshake. + */ if (treq->accecn_ok) { const struct tcphdr *th = (const struct tcphdr *)skb->data; tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); @@ -694,9 +704,24 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, */ if (!tcp_oow_rate_limited(sock_net(sk), skb, LINUX_MIB_TCPACKSKIPPEDSYNRECV, - &tcp_rsk(req)->last_oow_ack_time) && + &tcp_rsk(req)->last_oow_ack_time)) { + + if (tcp_rsk(req)->accecn_ok) { + /* [CY] 3.1.5 Implications of AccECN Mode - A host in AccECN mode that is feeding back the IP-ECN + * field on a SYN or SYN/ACK: MUST feed back the IP-ECN field on the latest valid SYN or acceptable + * SYN/ACK to arrive. + */ + tcp_rsk(req)->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; + if (tcp_accecn_ace(tcp_hdr(skb)) == 0x0) { + /* [CY] 3.1.5. 
Implications of AccECN Mode - A TCP Server in AccECN mode: MUST NOT set ECT on + * any packet for the rest of the connection, if it has received or sent at least one valid + * SYN or Acceptable SYN/ACK with (AE,CWR,ECE) = (0,0,0) during the handshake + */ + tcp_sk(sk)->ecn_fail = 1; + } + } - !inet_rtx_syn_ack(sk, req)) { + if (!inet_rtx_syn_ack(sk, req)) { unsigned long expires = jiffies; expires += min(TCP_TIMEOUT_INIT << req->num_timeout, @@ -705,6 +730,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, mod_timer_pending(&req->rsk_timer, expires); else req->rsk_timer.expires = expires; + } } return NULL; } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 32c347fe2ccfe..9f7c372e490c2 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -378,12 +378,27 @@ static void tcp_accecn_echo_syn_ect(struct tcphdr *th, u8 ect) } static void -tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th) +tcp_ecn_make_synack(struct sock *sk, const struct request_sock *req, struct tcphdr *th) { - if (tcp_rsk(req)->accecn_ok) - tcp_accecn_echo_syn_ect(th, tcp_rsk(req)->syn_ect_rcv); - else if (inet_rsk(req)->ecn_ok) - th->ece = 1; + if (!req->is_rtx || req->num_timeout < 1) { + if (tcp_rsk(req)->accecn_ok) + tcp_accecn_echo_syn_ect(th, tcp_rsk(req)->syn_ect_rcv); + else if (inet_rsk(req)->ecn_ok) + th->ece = 1; + } else if (tcp_rsk(req)->accecn_ok) { + /* [CY] 3.2.3.2.2. Testing for Loss of Packets Carrying the AccECN Option - If this retransmission times out, + * to expedite connection setup, the TCP Server SHOULD retransmit the SYN/ACK with (AE,CWR,ECE) = (0,0,0) and + * no AccECN Option, but it remains in AccECN feedback mode + */ + th->ae = 0; + th->cwr = 0; + th->ece = 0; + /* [CY] 3.1.5. 
Implications of AccECN Mode - A TCP Server in AccECN mode: MUST NOT set ECT on any packet for + * the rest of the connection, if it has received or sent at least one valid SYN or Acceptable SYN/ACK with + * (AE,CWR,ECE) = (0,0,0) during the handshake. + */ + tcp_sk(sk)->ecn_fail = 1; + } } static void tcp_accecn_set_ace(struct tcp_sock *tp, struct sk_buff *skb, @@ -922,8 +937,12 @@ static bool tcp_accecn_option_beacon_check(const struct sock *sk) if (!sock_net(sk)->ipv4.sysctl_tcp_ecn_option_beacon) return false; - return tcp_stamp_us_delta(tp->tcp_mstamp, tp->accecn_opt_tstamp) >= - (tp->srtt_us >> (3 + TCP_ACCECN_BEACON_FREQ_SHIFT)); + /* [CY] AccECN period shall be larger than srtt[us]/TCP_ECN_OPTION_BEACON + * Following texts are removed in AccECN “6. Summary: Protocol Properties - However, it has to send a full-sized + * AccECN Option at least three times per RTT, which the Data Sender can rely on as a regular beacon or checkpoint.” + */ + return tcp_stamp_us_delta(tp->tcp_mstamp, tp->accecn_opt_tstamp) * sock_net(sk)->ipv4.sysctl_tcp_ecn_option_beacon >= + (tp->srtt_us >> 3); } /* Compute TCP options for SYN packets. This is not the final @@ -1086,8 +1105,11 @@ static unsigned int tcp_synack_options(const struct sock *sk, smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining); + /* [CY] 3.2.3.2.2. 
Testing for Loss of Packets Carrying the AccECN Option - TCP Server SHOULD retransmit the + * SYN/ACK, but with no AccECN Option + */ if (treq->accecn_ok && sock_net(sk)->ipv4.sysctl_tcp_ecn_option && - req->num_timeout < 1 && (remaining >= TCPOLEN_ACCECN_BASE)) { + !req->is_rtx && (remaining >= TCPOLEN_ACCECN_BASE)) { opts->ecn_bytes = synack_ecn_bytes; remaining -= tcp_options_fit_accecn(opts, 0, remaining, tcp_synack_options_combine_saving(opts)); @@ -1167,7 +1189,7 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb if (tcp_ecn_mode_accecn(tp) && sock_net(sk)->ipv4.sysctl_tcp_ecn_option && - (tp->saw_accecn_opt && tp->saw_accecn_opt != TCP_ACCECN_OPT_FAIL)) { + (tp->saw_accecn_opt && tp->saw_accecn_opt != TCP_ACCECN_OPT_FAIL && !tp->accecn_no_options)) { if (sock_net(sk)->ipv4.sysctl_tcp_ecn_option >= 2 || tp->accecn_opt_demand || tcp_accecn_option_beacon_check(sk)) { @@ -2018,7 +2040,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) struct inet_connection_sock *icsk = inet_csk(sk); int mss_now; - if (icsk->icsk_mtup.search_high > pmtu) + if (icsk->icsk_mtup.search_high > pmtu && !tp->mss_cache_set_by_ca) icsk->icsk_mtup.search_high = pmtu; mss_now = tcp_mtu_to_mss(sk, pmtu); @@ -2048,7 +2070,7 @@ unsigned int tcp_current_mss(struct sock *sk) mss_now = tp->mss_cache; - if (dst) { + if (dst && !tp->mss_cache_set_by_ca) { u32 mtu = dst_mtu(dst); if (mtu != inet_csk(sk)->icsk_pmtu_cookie) mss_now = tcp_sync_mss(sk, mtu); @@ -3431,12 +3453,20 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) tcp_retrans_try_collapse(sk, skb, avail_wnd); } - /* RFC3168, section 6.1.1.1. ECN fallback - * As AccECN uses the same SYN flags (+ AE), this check covers both - * cases. + /* [CY] 3.1.4.1. 
Retransmitted SYNs - If the sender of an AccECN SYN (the TCP Client) times out before receiving the SYN/ACK, + * it SHOULD attempt to negotiate the use of AccECN at least one more time by continuing to set all three TCP ECN flags + * (AE,CWR,ECE) = (1,1,1) on the first retransmitted SYN (using the usual retransmission time-outs). If this first + * retransmission also fails to be acknowledged, in deployment scenarios where AccECN path traversal might be problematic, the + * TCP Client SHOULD send subsequent retransmissions of the SYN with the three TCP-ECN flags cleared (AE,CWR,ECE) = (0,0,0). */ - if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN) - tcp_ecn_clear_syn(sk, skb); + if (!tcp_ecn_mode_pending(tp) || icsk->icsk_retransmits > 1) { + /* RFC3168, section 6.1.1.1. ECN fallback + * As AccECN uses the same SYN flags (+ AE), this check covers both + * cases. + */ + if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN) + tcp_ecn_clear_syn(sk, skb); + } /* Update global and local TCP statistics. 
*/ segs = tcp_skb_pcount(skb); @@ -3822,7 +3852,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, memset(th, 0, sizeof(struct tcphdr)); th->syn = 1; th->ack = 1; - tcp_ecn_make_synack(req, th); + tcp_ecn_make_synack((struct sock *)sk, req, th); th->source = htons(ireq->ir_num); th->dest = ireq->ir_rmt_port; skb->mark = ireq->ir_mark; diff --git a/net/ipv4/tcp_prague.c b/net/ipv4/tcp_prague.c index ccb7fe3488cbf..5a8da0c05b2bd 100644 --- a/net/ipv4/tcp_prague.c +++ b/net/ipv4/tcp_prague.c @@ -89,17 +89,22 @@ #include #include -#define MIN_CWND 2U -#define PRAGUE_ALPHA_BITS 20U +#define MIN_CWND_RTT 2U +#define MIN_CWND_VIRT 2U +#define MIN_MSS 150U +#define MINIMUM_RATE 12500ULL /* Minimum rate in Bytes/second: 100kbps */ +#define PRAGUE_ALPHA_BITS 24U #define PRAGUE_MAX_ALPHA (1ULL << PRAGUE_ALPHA_BITS) #define CWND_UNIT 20U -#define ONE_CWND (1LL << CWND_UNIT) /* Must be signed */ +#define ONE_CWND (1ULL << CWND_UNIT) #define PRAGUE_SHIFT_G 4 /* EWMA gain g = 1/2^4 */ -#define DEFAULT_RTT_TRANSITION 500 +#define DEFAULT_RTT_TRANSITION 4 #define MAX_SCALED_RTT (100 * USEC_PER_MSEC) -#define RTT_UNIT 7 -#define RTT2US(x) ((x) << RTT_UNIT) -#define US2RTT(x) ((x) >> RTT_UNIT) +#define MTU_SYS 1500UL +#define RATE_OFFSET 4 +#define OFFSET_UNIT 7 +#define HSRTT_SHIFT 7 +#define DEFAULT_MODE 0 /* 0: win-base; 1: rate-base */ #define PRAGUE_MAX_SRTT_BITS 18U #define PRAGUE_MAX_MDEV_BITS (PRAGUE_MAX_SRTT_BITS+1) @@ -159,6 +164,16 @@ MODULE_PARM_DESC(prague_rtt_transition, "Amount of post-SS rounds to transition" " to be RTT independent."); module_param(prague_rtt_transition, uint, 0644); +static int prague_rate_offset __read_mostly = 4; /* 4/128 ~= 3% */ +MODULE_PARM_DESC(prague_rate_offset, + "Pacing rate offset in 1/128 units at each half of RTT_virt"); +module_param(prague_rate_offset, uint, 0644); + +static int prague_cwnd_transit __read_mostly = 4; +MODULE_PARM_DESC(prague_cwnd_transit, + "CWND mode switching point in term of # of 
MTU_SYS"); +module_param(prague_cwnd_transit, uint, 0644); + static int prague_ecn_fallback __read_mostly = 0; MODULE_PARM_DESC(prague_ecn_fallback, "0 = none, 1 = detection & fallback" " 2 = detection"); @@ -168,10 +183,13 @@ struct prague { u64 cwr_stamp; u64 alpha_stamp; /* EWMA update timestamp */ u64 upscaled_alpha; /* Congestion-estimate EWMA */ - u64 ai_ack_stamp; u64 ai_ack_increase; /* AI increase per non-CE ACKed MSS */ + u32 mtu_cache; + u64 hsrtt_us; + u32 rate_offset; u64 frac_cwnd; /* internal fractional cwnd */ - u64 loss_frac_cwnd; + u64 rate_bytes; /* internal pacing rate in bytes */ + u64 loss_rate_bytes; u32 loss_cwnd; u32 max_tso_burst; u32 rest_depth_us; @@ -184,6 +202,7 @@ struct prague { u32 rtt_target; /* RTT scaling target */ u8 saw_ce:1, /* Is there an AQM on the path? */ rtt_indep:3, /* RTT independence mode */ + cwnd_mode:1, /* CWND operating mode */ in_loss:1; /* In cwnd reduction caused by loss */ }; @@ -213,7 +232,7 @@ static void __prague_connection_id(struct sock *sk, char *str, size_t len) char __tmp[2 * (INET6_ADDRSTRLEN + 9) + 1] = {0}; \ __prague_connection_id(sk, __tmp, sizeof(__tmp)); \ /* pr_fmt expects the connection ID*/ \ - pr_info("(%s) : " fmt "\n", __tmp, ##__VA_ARGS__); \ + pr_info("(%s) : " fmt "\n", __tmp, ##__VA_ARGS__); \ } while (0) static struct prague *prague_ca(struct sock *sk) @@ -263,18 +282,77 @@ static u64 prague_unscaled_ai_ack_increase(struct sock *sk) return 1 << CWND_UNIT; } +static u64 mul_64_64_shift(u64 left, u64 right, u32 shift) +{ + u64 a0 = left & ((1ULL<<32)-1); + u64 a1 = left >> 32; + u64 b0 = right & ((1ULL<<32)-1); + u64 b1 = right >> 32; + u64 m0 = a0 * b0; + u64 m1 = a0 * b1; + u64 m2 = a1 * b0; + u64 m3 = a1 * b1; + u64 result_low; + u64 result_high; + + m2 += (m0 >> 32); + m2 += m1; + /* Overflow */ + if (m2 < m1) + m3 += (1ULL<<32); + + result_low = (m0 & ((1ULL<<32)-1)) | (m2 << 32); + result_high = m3 + (m2 >> 32); + if (shift && 64 >= shift) { + result_low = (result_low >> shift) | 
(result_high << (64-shift)); + result_high = (result_high >> shift); + } + return (result_high) ? 0xffffffffffffffffULL : result_low; +} + static u32 prague_frac_cwnd_to_snd_cwnd(struct sock *sk) { struct prague *ca = prague_ca(sk); - u64 rtt, target, frac_cwnd; + struct tcp_sock *tp = tcp_sk(sk); - rtt = US2RTT(tcp_sk(sk)->srtt_us >> 3); - target = prague_target_rtt(sk); - frac_cwnd = ca->frac_cwnd; - if (likely(target)) - frac_cwnd = div64_u64(frac_cwnd * rtt + (target>>1), target); + return min_t(u32, max_t(u32, (u32)((ca->frac_cwnd + (ONE_CWND - 1)) >> CWND_UNIT), + MIN_CWND_RTT), tp->snd_cwnd_clamp); +} - return max((u32)((frac_cwnd + ONE_CWND - 1) >> CWND_UNIT), 1); +static u64 prague_virtual_rtt(struct sock *sk) +{ + return max_t(u32, prague_target_rtt(sk), tcp_sk(sk)->srtt_us); +} + +static u64 prague_pacing_rate_to_max_mtu(struct sock *sk) +{ + return div_u64(mul_64_64_shift(prague_ca(sk)->rate_bytes, prague_virtual_rtt(sk), 23) + + (MIN_CWND_VIRT - 1), MIN_CWND_VIRT); +} + +static bool prague_half_virtual_rtt_elapsed(struct sock *sk) +{ + return (prague_virtual_rtt(sk) >> (3 + 1)) <= + tcp_stamp_us_delta(tcp_sk(sk)->tcp_mstamp, + prague_ca(sk)->alpha_stamp); +} + +static u64 prague_pacing_rate_to_frac_cwnd(struct sock *sk) +{ + struct prague *ca = prague_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + u64 rtt; + u64 mtu; + + mtu = tcp_mss_to_mtu(sk, tp->mss_cache); + rtt = (ca->hsrtt_us >> HSRTT_SHIFT) ? (ca->hsrtt_us >> HSRTT_SHIFT) : tp->srtt_us; + + return div_u64(mul_64_64_shift(ca->rate_bytes, rtt, 23 - CWND_UNIT) + (mtu - 1), mtu); +} + +static u32 prague_valid_mtu(struct sock *sk, u32 mtu) +{ + return max_t(u32, min_t(u32, prague_ca(sk)->mtu_cache, mtu), tcp_mss_to_mtu(sk, MIN_MSS)); } /* RTT independence will scale the classical 1/W per ACK increase. 
*/ @@ -289,9 +367,9 @@ static void prague_ai_ack_increase(struct sock *sk) goto exit; } - rtt = US2RTT(tcp_sk(sk)->srtt_us >> 3); + rtt = tcp_sk(sk)->srtt_us; if (ca->round < ca->rtt_transition_delay || - !rtt || rtt > MAX_SCALED_RTT) { + !rtt || rtt > (MAX_SCALED_RTT << 3)) { increase = prague_unscaled_ai_ack_increase(sk); goto exit; } @@ -302,38 +380,36 @@ static void prague_ai_ack_increase(struct sock *sk) WRITE_ONCE(ca->ai_ack_increase, increase); } -/* Ensure prague sends traffic as smoothly as possible: - * - Pacing is set to 100% during AI - * - The max GSO burst size is bounded in time at the pacing rate. - * - * We keep the 200% pacing rate during SS, as we need to send 2 MSS back to - * back for every received ACK. - */ static void prague_update_pacing_rate(struct sock *sk) { struct prague *ca = prague_ca(sk); const struct tcp_sock *tp = tcp_sk(sk); u64 max_inflight; - u64 rate, burst; - int mtu; - - mtu = tcp_mss_to_mtu(sk, tp->mss_cache); - // Must also set tcp_ecn_option=0 and tcp_ecn_unsafe_cep=1 - // to disable the option and safer heuristic... 
- max_inflight = ca->frac_cwnd; + u64 rate, burst, offset; + u64 mtu; + + if (prague_is_rtt_indep(sk) && (ca->cwnd_mode == 1 && likely(ca->saw_ce))) { + offset = mul_64_64_shift(ca->rate_offset, ca->rate_bytes, OFFSET_UNIT); + if (prague_half_virtual_rtt_elapsed(sk)) // second half + rate = ca->rate_bytes - offset; + else // first half + rate = ca->rate_bytes + offset; + } else { + mtu = tcp_mss_to_mtu(sk, tp->mss_cache); + max_inflight = max(tp->snd_cwnd, tcp_packets_in_flight(tp)); + rate = (u64)((u64)USEC_PER_SEC << 3) * mtu; + } - rate = (u64)((u64)USEC_PER_SEC << 3) * mtu; if (tp->snd_cwnd < tp->snd_ssthresh / 2) rate <<= 1; - //if (likely(tp->srtt_us)) - // rate = div64_u64(rate, tp->srtt_us); - if (likely(RTT2US(prague_target_rtt(sk)))) - rate = div64_u64(rate + RTT2US(prague_target_rtt(sk)) << 2, RTT2US(prague_target_rtt(sk)) << 3); - rate = (rate*max_inflight + (ONE_CWND >> 1)) >> CWND_UNIT; + if (!prague_is_rtt_indep(sk) || (ca->cwnd_mode == 0 || unlikely(!ca->saw_ce))) { + if (likely(tp->srtt_us)) + rate = div64_u64(rate, (u64)tp->srtt_us); + rate = max_t(u64, rate*max_inflight, MINIMUM_RATE); + ca->rate_bytes = rate; + } + rate = min_t(u64, rate, sk->sk_max_pacing_rate); - /* TODO(otilmans) rewrite the tso_segs hook to bytes to avoid this - * division. It will somehow need to be able to take hdr sizes into - * account */ burst = div_u64(rate, tcp_mss_to_mtu(sk, tp->mss_cache)); WRITE_ONCE(prague_ca(sk)->max_tso_burst, @@ -422,7 +498,7 @@ static void prague_update_alpha(struct sock *sk) { struct prague *ca = prague_ca(sk); struct tcp_sock *tp = tcp_sk(sk); - u64 ecn_segs, alpha; + u64 ecn_segs, alpha, mtu, mtu_used; /* Do not update alpha before we have proof that there's an AQM on * the path. 
@@ -457,7 +533,18 @@ static void prague_update_alpha(struct sock *sk) WRITE_ONCE(ca->upscaled_alpha, alpha); tp->alpha = alpha >> PRAGUE_SHIFT_G; + if (prague_is_rtt_indep(sk) && !ca->in_loss) { + mtu_used = tcp_mss_to_mtu(sk, tp->mss_cache); + mtu = prague_valid_mtu(sk, prague_pacing_rate_to_max_mtu(sk)); + if (mtu_used != mtu) { + ca->frac_cwnd = div_u64(ca->frac_cwnd * mtu_used, mtu); + tp->mss_cache_set_by_ca = true; + tcp_sync_mss(sk, mtu); + tp->snd_cwnd = prague_frac_cwnd_to_snd_cwnd(sk); + } + } skip: + ca->hsrtt_us = ca->hsrtt_us + (u64)tp->srtt_us - (ca->hsrtt_us >> HSRTT_SHIFT); prague_new_round(sk); } @@ -467,7 +554,9 @@ static void prague_update_cwnd(struct sock *sk, const struct rate_sample *rs) struct tcp_sock *tp = tcp_sk(sk); u64 increase; s64 acked; - u32 new_cwnd; + u64 new_cwnd; + u64 divisor; + u64 mtu_used; acked = rs->acked_sacked; if (rs->ece_delta) { @@ -478,33 +567,43 @@ static void prague_update_cwnd(struct sock *sk, const struct rate_sample *rs) acked -= rs->ece_delta; } - if (acked <= 0 || ca->in_loss || !tcp_is_cwnd_limited(sk)) + if (acked <= 0 || ca->in_loss || tp->app_limited) goto adjust; if (tcp_in_slow_start(tp)) { acked = tcp_slow_start(tp, acked); + ca->frac_cwnd = ((u64)tp->snd_cwnd << CWND_UNIT); if (!acked) { prague_cwnd_changed(sk); return; } } - if (RTT2US(prague_target_rtt(sk)) > tcp_stamp_us_delta(tp->tcp_mstamp, - ca->ai_ack_stamp)) - goto adjust; - ca->ai_ack_stamp = tp->tcp_mstamp; - increase = acked * ca->ai_ack_increase; - ca->frac_cwnd += max_t(u64, acked, increase); + if (prague_is_rtt_indep(sk) && (ca->cwnd_mode == 1 && likely(ca->saw_ce))) { + mtu_used = tcp_mss_to_mtu(sk, tp->mss_cache); + increase = div_u64(((u64)(acked * MTU_SYS)) << 23, prague_virtual_rtt(sk)); + divisor = mtu_used << 23; + new_cwnd = div64_u64(ca->rate_bytes * prague_virtual_rtt(sk) + divisor - 1, divisor); + if (likely(new_cwnd)) + ca->rate_bytes += div_u64(increase + (new_cwnd >> 1), new_cwnd); + ca->frac_cwnd = max_t(u64, ca->frac_cwnd + 
acked, prague_pacing_rate_to_frac_cwnd(sk)); + } else { + increase = acked * ca->ai_ack_increase; + new_cwnd = ca->frac_cwnd; + if (likely(new_cwnd)) + increase = div64_u64((increase << CWND_UNIT) + (new_cwnd >> 1), new_cwnd); + ca->frac_cwnd += max_t(u64, acked, increase); + } adjust: new_cwnd = prague_frac_cwnd_to_snd_cwnd(sk); - if (tp->snd_cwnd > new_cwnd && tp->snd_cwnd > MIN_CWND) { - /* Reuse the step-wise cwnd decrement */ + if (tp->snd_cwnd > new_cwnd) { + /* Step-wise cwnd decrement */ --tp->snd_cwnd; tp->snd_ssthresh = tp->snd_cwnd; prague_cwnd_changed(sk); - } else if (tp->snd_cwnd < new_cwnd && tp->snd_cwnd < tp->snd_cwnd_clamp) { - /* Reuse the step-wise cwnd increment */ + } else if (tp->snd_cwnd < new_cwnd) { + /* Step-wise cwnd increment */ ++tp->snd_cwnd; prague_cwnd_changed(sk); } @@ -522,10 +621,15 @@ static void prague_enter_loss(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); ca->loss_cwnd = tp->snd_cwnd; - ca->loss_frac_cwnd = ca->frac_cwnd; - ca->frac_cwnd -= (ca->frac_cwnd >> 1); + ca->loss_rate_bytes = ca->rate_bytes; + if (prague_is_rtt_indep(sk) && (ca->cwnd_mode == 1 && likely(ca->saw_ce))) { + ca->rate_bytes -= (ca->rate_bytes >> 1); + //ca->rate_bytes = mul_64_64_shift(717, ca->rate_bytes, 10); + ca->frac_cwnd = prague_pacing_rate_to_frac_cwnd(sk); + } else { + ca->frac_cwnd -= (ca->frac_cwnd >> 1); + } ca->in_loss = 1; - prague_cwnd_changed(sk); } static void prague_update_rtt_scaling(struct sock *sk, u32 ssthresh) @@ -578,8 +682,8 @@ static void prague_enter_cwr(struct sock *sk) u64 alpha; if (prague_is_rtt_indep(sk) && - RTT2US(prague_target_rtt(sk)) > tcp_stamp_us_delta(tp->tcp_mstamp, - ca->cwr_stamp)) + (prague_target_rtt(sk) >> 3) > tcp_stamp_us_delta(tp->tcp_mstamp, + ca->cwr_stamp)) return; ca->cwr_stamp = tp->tcp_mstamp; alpha = ca->upscaled_alpha >> PRAGUE_SHIFT_G; @@ -587,11 +691,17 @@ static void prague_enter_cwr(struct sock *sk) if (prague_ecn_fallback == 1 && tp->classic_ecn > L_STICKY) alpha = 
prague_classic_ecn_fallback(tp, alpha); - reduction = ((ca->frac_cwnd + 1) >> 1) + ONE_CWND; - reduction = (alpha * reduction + - (PRAGUE_MAX_ALPHA >> 1)) >> - (PRAGUE_ALPHA_BITS); - ca->frac_cwnd -= reduction; + if (prague_is_rtt_indep(sk) && (ca->cwnd_mode == 1 && likely(ca->saw_ce))) { + reduction = mul_64_64_shift(ca->rate_bytes, alpha, PRAGUE_ALPHA_BITS + 1); + ca->rate_bytes = max_t(u64, ca->rate_bytes - reduction, MINIMUM_RATE); + ca->frac_cwnd = prague_pacing_rate_to_frac_cwnd(sk); + } else { + reduction = (alpha * (ca->frac_cwnd) + + /* Unbias the rounding by adding 1/2 */ + PRAGUE_MAX_ALPHA) >> + (PRAGUE_ALPHA_BITS + 1U); + ca->frac_cwnd -= reduction; + } return; } @@ -659,7 +769,10 @@ static u32 prague_cwnd_undo(struct sock *sk) struct prague *ca = prague_ca(sk); /* We may have made some progress since then, account for it. */ - ca->frac_cwnd = max(ca->frac_cwnd, ca->loss_frac_cwnd); + ca->in_loss = 0; + ca->rate_bytes = max(ca->rate_bytes, ca->loss_rate_bytes); + //ca->rate_bytes += ca->rate_bytes - ca->loss_rate_bytes; + ca->frac_cwnd = prague_pacing_rate_to_frac_cwnd(sk); return max(ca->loss_cwnd, tcp_sk(sk)->snd_cwnd); } @@ -669,6 +782,11 @@ static void prague_cong_control(struct sock *sk, const struct rate_sample *rs) if (prague_should_update_ewma(sk)) prague_update_alpha(sk); prague_update_pacing_rate(sk); + if (prague_ca(sk)->cwnd_mode == 0 && tcp_sk(sk)->snd_cwnd*tcp_mss_to_mtu(sk, tcp_sk(sk)->mss_cache) <= prague_cwnd_transit*MTU_SYS) { + prague_ca(sk)->cwnd_mode = 1; + } else if (prague_ca(sk)->cwnd_mode == 1 && tcp_sk(sk)->snd_cwnd*tcp_mss_to_mtu(sk, tcp_sk(sk)->mss_cache) > prague_cwnd_transit*MTU_SYS) { + prague_ca(sk)->cwnd_mode = 0; + } } static u32 prague_ssthresh(struct sock *sk) @@ -681,8 +799,7 @@ static u32 prague_ssthresh(struct sock *sk) static u32 prague_tso_segs(struct sock *sk, unsigned int mss_now) { - u32 tso_segs = max_t(u32, prague_ca(sk)->max_tso_burst, - sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); + u32 tso_segs = 
prague_ca(sk)->max_tso_burst; if (prague_max_tso_segs) tso_segs = min(tso_segs, prague_max_tso_segs); @@ -702,13 +819,12 @@ static size_t prague_get_info(struct sock *sk, u32 ext, int *attr, info->prague.prague_alpha = ca->upscaled_alpha >> PRAGUE_SHIFT_G; info->prague.prague_max_burst = ca->max_tso_burst; - info->prague.prague_ai_ack_increase = - READ_ONCE(ca->ai_ack_increase); info->prague.prague_round = ca->round; - info->prague.prague_frac_cwnd = + info->prague.prague_rate_bytes = + READ_ONCE(ca->rate_bytes); + info->prague.prague_frac_cwnd = READ_ONCE(ca->frac_cwnd); info->prague.prague_enabled = 1; - info->prague.prague_rtt_indep = ca->rtt_indep; info->prague.prague_rtt_target = prague_target_rtt(sk); } @@ -751,25 +867,34 @@ static void prague_init(struct sock *sk) /* If we have an initial RTT estimate, ensure we have an initial pacing * rate to use if net.ipv4.tcp_pace_iw is set. */ - if (tp->srtt_us) - prague_update_pacing_rate(sk); - ca->alpha_stamp = tp->tcp_mstamp; ca->upscaled_alpha = PRAGUE_MAX_ALPHA << PRAGUE_SHIFT_G; ca->frac_cwnd = ((u64)tp->snd_cwnd << CWND_UNIT); - ca->loss_frac_cwnd = 0; ca->max_tso_burst = 1; + + /* rate initialization */ + if (tp->srtt_us) { + ca->rate_bytes = div_u64(((u64)USEC_PER_SEC << 3) * tcp_mss_to_mtu(sk, tp->mss_cache) , tp->srtt_us); + ca->rate_bytes = max_t(u64, ca->rate_bytes * tp->snd_cwnd, MINIMUM_RATE); + } else { + ca->rate_bytes = MINIMUM_RATE; + } + prague_update_pacing_rate(sk); + ca->loss_rate_bytes = 0; ca->round = 0; ca->rtt_transition_delay = prague_rtt_transition; - ca->rtt_target = US2RTT(prague_rtt_target); + ca->rtt_target = prague_rtt_target << 3; ca->rtt_indep = ca->rtt_target ? 
prague_rtt_scaling : RTT_CONTROL_NONE; if (ca->rtt_indep >= __RTT_CONTROL_MAX) ca->rtt_indep = RTT_CONTROL_NONE; LOG(sk, "RTT indep chosen: %d (after %u rounds), targetting %u usec", - ca->rtt_indep, ca->rtt_transition_delay, prague_target_rtt(sk)); + ca->rtt_indep, ca->rtt_transition_delay, prague_target_rtt(sk) >> 3); ca->saw_ce = !!tp->delivered_ce; - if (US2RTT(tp->srtt_us >> 3)) - ca->frac_cwnd = div64_u64(ca->frac_cwnd*prague_target_rtt(sk), US2RTT(tp->srtt_us >> 3)); + + ca->mtu_cache = tcp_mss_to_mtu(sk, tp->mss_cache); + // Default as 1us + ca->hsrtt_us = (tp->srtt_us) ? ((u64)tp->srtt_us) << HSRTT_SHIFT : (1 << (HSRTT_SHIFT + 3)); + ca->rate_offset = (prague_rate_offset && prague_rate_offset < ((1 << OFFSET_UNIT) -1)) ? prague_rate_offset : RATE_OFFSET ; /* reuse existing measurement of SRTT as an initial starting point */ tp->g_srtt_shift = PRAGUE_MAX_SRTT_BITS; @@ -782,13 +907,13 @@ static void prague_init(struct sock *sk) tp->classic_ecn = 0ULL; tp->alpha = PRAGUE_MAX_ALPHA; /* Used ONLY to log alpha */ - + ca->cwnd_mode = DEFAULT_MODE; prague_new_round(sk); } static bool prague_target_rtt_elapsed(struct sock *sk) { - return RTT2US(prague_target_rtt(sk)) <= + return (prague_target_rtt(sk) >> 3) <= tcp_stamp_us_delta(tcp_sk(sk)->tcp_mstamp, prague_ca(sk)->alpha_stamp); } @@ -809,8 +934,9 @@ static u64 prague_rate_scaled_ai_ack_increase(struct sock *sk, u32 rtt) * * Overflows if e2e RTT is > 100ms, hence the cap */ - increase = (u64)1 << CWND_UNIT; - divisor = 1; + increase = (u64)rtt << CWND_UNIT; + increase *= rtt; + divisor = target * target; increase = div64_u64(increase + (divisor >> 1), divisor); return increase; } @@ -818,7 +944,7 @@ static u64 prague_scalable_ai_ack_increase(struct sock *sk, u32 rtt) { /* R0 ~= 16ms, R1 ~= 1.5ms */ - const s64 R0 = US2RTT(1 << 14), R1 = US2RTT((1 << 10) + (1 << 9)); + const s64 R0 = ((1 << 14) << 3), R1 = (((1 << 10) + (1 << 9)) << 3); u64
increase; u64 divisor; @@ -836,7 +962,7 @@ static u64 prague_scalable_ai_ack_increase(struct sock *sk, u32 rtt) static u32 prague_dynamic_rtt_target(struct sock *sk) { - return prague_ca(sk)->rtt_target + US2RTT(tcp_sk(sk)->srtt_us >> 3); + return prague_ca(sk)->rtt_target + tcp_sk(sk)->srtt_us; } static struct rtt_scaling_ops @@ -903,10 +1029,11 @@ static void __exit prague_unregister(void) module_init(prague_register); module_exit(prague_unregister); +MODULE_AUTHOR("Chia-Yu Chang "); MODULE_AUTHOR("Olivier Tilmans "); MODULE_AUTHOR("Koen De Schepper "); MODULE_AUTHOR("Bob briscoe "); MODULE_LICENSE("GPL v2"); MODULE_DESCRIPTION("TCP Prague"); -MODULE_VERSION("0.6"); +MODULE_VERSION("0.7");