From ea42bd8ce72002f973cf5205029016b015569944 Mon Sep 17 00:00:00 2001 From: Chia-Yu Chang <125277758+minuscat@users.noreply.github.com> Date: Tue, 8 Aug 2023 19:18:00 +0200 Subject: [PATCH 01/47] Update kernel.yml --- .github/workflows/kernel.yml | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/.github/workflows/kernel.yml b/.github/workflows/kernel.yml index 6111227077182..906d044824549 100644 --- a/.github/workflows/kernel.yml +++ b/.github/workflows/kernel.yml @@ -63,7 +63,7 @@ jobs: runs-on: ubuntu-20.04 needs: build permissions: write-all - if: github.ref == 'refs/heads/testing' + if: ${{ github.ref == 'refs/heads/testing' || github.ref == 'refs/heads/ratebase'}} steps: - name: Get artifact uses: actions/download-artifact@v3 @@ -79,10 +79,19 @@ jobs: mkdir debian_build mv *.deb debian_build zip -r l4s-testing.zip debian_build - - name: Release tip build + - name: Release tip build for testing branch uses: pyTooling/Actions/releaser@main + if: ${{ github.ref == 'refs/heads/testing'}} with: token: ${{ secrets.GITHUB_TOKEN }} tag: testing-build files: | l4s-testing.zip + - name: Release tip build fpr ratebase branch + uses: pyTooling/Actions/releaser@main + if: ${{ github.ref == 'refs/heads/ratebase'}} + with: + token: ${{ secrets.GITHUB_TOKEN }} + tag: ratebase-build + files: | + l4s-ratebase.zip From 2ca19e976b6c5652fc1b72ef7c75adbb09e1afc3 Mon Sep 17 00:00:00 2001 From: Chia-Yu Date: Wed, 9 Aug 2023 03:32:39 +0200 Subject: [PATCH 02/47] RTT_ref window version --- net/ipv4/tcp_prague.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/net/ipv4/tcp_prague.c b/net/ipv4/tcp_prague.c index ccb7fe3488cbf..c5ef8f0155fa4 100644 --- a/net/ipv4/tcp_prague.c +++ b/net/ipv4/tcp_prague.c @@ -272,7 +272,7 @@ static u32 prague_frac_cwnd_to_snd_cwnd(struct sock *sk) target = prague_target_rtt(sk); frac_cwnd = ca->frac_cwnd; if (likely(target)) - frac_cwnd = div64_u64(frac_cwnd * rtt + (target>>1), target); + 
frac_cwnd = div64_u64(frac_cwnd * rtt + target - 1, target); return max((u32)((frac_cwnd + ONE_CWND - 1) >> CWND_UNIT), 1); } @@ -587,10 +587,10 @@ static void prague_enter_cwr(struct sock *sk) if (prague_ecn_fallback == 1 && tp->classic_ecn > L_STICKY) alpha = prague_classic_ecn_fallback(tp, alpha); - reduction = ((ca->frac_cwnd + 1) >> 1) + ONE_CWND; - reduction = (alpha * reduction + - (PRAGUE_MAX_ALPHA >> 1)) >> - (PRAGUE_ALPHA_BITS); + reduction = (alpha * (ca->frac_cwnd) + + /* Unbias the rounding by adding 1/2 */ + PRAGUE_MAX_ALPHA) >> + (PRAGUE_ALPHA_BITS + 1U); ca->frac_cwnd -= reduction; return; From 61b56d75a87b909eb3699ebcc24978d3b1b5b38f Mon Sep 17 00:00:00 2001 From: Chia-Yu Date: Wed, 9 Aug 2023 16:18:24 +0200 Subject: [PATCH 03/47] RTT_ref window version --- net/ipv4/tcp_prague.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/net/ipv4/tcp_prague.c b/net/ipv4/tcp_prague.c index c5ef8f0155fa4..17cedc6ef562d 100644 --- a/net/ipv4/tcp_prague.c +++ b/net/ipv4/tcp_prague.c @@ -170,6 +170,8 @@ struct prague { u64 upscaled_alpha; /* Congestion-estimate EWMA */ u64 ai_ack_stamp; u64 ai_ack_increase; /* AI increase per non-CE ACKed MSS */ + u32 acc_acked; /* accumulated acked */ + u32 acc_acked_ce; /* accumulated acked ce */ u64 frac_cwnd; /* internal fractional cwnd */ u64 loss_frac_cwnd; u32 loss_cwnd; @@ -272,7 +274,7 @@ static u32 prague_frac_cwnd_to_snd_cwnd(struct sock *sk) target = prague_target_rtt(sk); frac_cwnd = ca->frac_cwnd; if (likely(target)) - frac_cwnd = div64_u64(frac_cwnd * rtt + target - 1, target); + frac_cwnd = div64_u64(frac_cwnd * rtt, target); return max((u32)((frac_cwnd + ONE_CWND - 1) >> CWND_UNIT), 1); } @@ -328,7 +330,7 @@ static void prague_update_pacing_rate(struct sock *sk) //if (likely(tp->srtt_us)) // rate = div64_u64(rate, tp->srtt_us); if (likely(RTT2US(prague_target_rtt(sk)))) - rate = div64_u64(rate + RTT2US(prague_target_rtt(sk)) << 2, RTT2US(prague_target_rtt(sk)) << 3); + rate = 
div64_u64(rate, RTT2US(prague_target_rtt(sk)) << 3); rate = (rate*max_inflight + (ONE_CWND >> 1)) >> CWND_UNIT; rate = min_t(u64, rate, sk->sk_max_pacing_rate); /* TODO(otilmans) rewrite the tso_segs hook to bytes to avoid this @@ -478,6 +480,9 @@ static void prague_update_cwnd(struct sock *sk, const struct rate_sample *rs) acked -= rs->ece_delta; } + ca->acc_acked += rs->acked_sacked; + ca->acc_acked_ce += rs->ece_delta; + if (acked <= 0 || ca->in_loss || !tcp_is_cwnd_limited(sk)) goto adjust; @@ -493,9 +498,12 @@ static void prague_update_cwnd(struct sock *sk, const struct rate_sample *rs) ca->ai_ack_stamp)) goto adjust; ca->ai_ack_stamp = tp->tcp_mstamp; - increase = acked * ca->ai_ack_increase; - ca->frac_cwnd += max_t(u64, acked, increase); - + if (likely(ca->acc_acked)) { + increase = div_u64((ca->acc_acked - ca->acc_acked_ce)*ca->ai_ack_increase, ca->acc_acked); + ca->frac_cwnd += max_t(u64, increase, ca->ai_ack_increase); + } + ca->acc_acked = 0; + ca->acc_acked_ce = 0; adjust: new_cwnd = prague_frac_cwnd_to_snd_cwnd(sk); if (tp->snd_cwnd > new_cwnd && tp->snd_cwnd > MIN_CWND) { From 07c9c18b8ddf542faf8fc1009e8fbdc9a21f9aa8 Mon Sep 17 00:00:00 2001 From: Chia-Yu Date: Fri, 11 Aug 2023 02:00:22 +0200 Subject: [PATCH 04/47] Update kernel.yml --- .github/workflows/kernel.yml | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/.github/workflows/kernel.yml b/.github/workflows/kernel.yml index 906d044824549..0f0289c53a325 100644 --- a/.github/workflows/kernel.yml +++ b/.github/workflows/kernel.yml @@ -59,7 +59,7 @@ jobs: path: debian_build release: - name: Release build artifacts for the testing branch + name: Release build artifacts for the branch runs-on: ubuntu-20.04 needs: build permissions: write-all @@ -74,7 +74,8 @@ jobs: wget https://github.com/L4STeam/iproute2/releases/download/master-build/iproute2-l4s.zip unzip iproute2-l4s mv -t . 
iproute2-l4s/*.deb - - name: Zip artifacts + - name: Zip artifacts for testing branch + if: ${{ github.ref == 'refs/heads/testing'}} run: | mkdir debian_build mv *.deb debian_build @@ -87,7 +88,13 @@ jobs: tag: testing-build files: | l4s-testing.zip - - name: Release tip build fpr ratebase branch + - name: Zip artifacts for ratebase branch + if: ${{ github.ref == 'refs/heads/ratebase'}} + run: | + mkdir debian_build + mv *.deb debian_build + zip -r l4s-ratebase.zip debian_build + - name: Release tip build for ratebase branch uses: pyTooling/Actions/releaser@main if: ${{ github.ref == 'refs/heads/ratebase'}} with: From 5f9d6d029ab3d0993fc6dfda16b2211083e28c09 Mon Sep 17 00:00:00 2001 From: Chia-Yu Date: Fri, 11 Aug 2023 16:13:50 +0200 Subject: [PATCH 05/47] Update kernel.yml --- .github/workflows/kernel.yml | 30 +++++++++--------------------- 1 file changed, 9 insertions(+), 21 deletions(-) diff --git a/.github/workflows/kernel.yml b/.github/workflows/kernel.yml index 0f0289c53a325..168eb823cadc5 100644 --- a/.github/workflows/kernel.yml +++ b/.github/workflows/kernel.yml @@ -74,31 +74,19 @@ jobs: wget https://github.com/L4STeam/iproute2/releases/download/master-build/iproute2-l4s.zip unzip iproute2-l4s mv -t . 
iproute2-l4s/*.deb - - name: Zip artifacts for testing branch - if: ${{ github.ref == 'refs/heads/testing'}} + - name: Extract branch name + shell: bash + run: echo "branch=${GITHUB_HEAD_REF:-${GITHUB_REF#refs/heads/}}" >> $GITHUB_OUTPUT + id: extract_branch + - name: Zip artifacts run: | mkdir debian_build mv *.deb debian_build - zip -r l4s-testing.zip debian_build - - name: Release tip build for testing branch + zip -r l4s-${{ steps.extract_branch.outputs.branch }}.zip debian_build + - name: Release tip build uses: pyTooling/Actions/releaser@main - if: ${{ github.ref == 'refs/heads/testing'}} with: token: ${{ secrets.GITHUB_TOKEN }} - tag: testing-build + tag: ${{ steps.extract_branch.outputs.branch }}-build files: | - l4s-testing.zip - - name: Zip artifacts for ratebase branch - if: ${{ github.ref == 'refs/heads/ratebase'}} - run: | - mkdir debian_build - mv *.deb debian_build - zip -r l4s-ratebase.zip debian_build - - name: Release tip build for ratebase branch - uses: pyTooling/Actions/releaser@main - if: ${{ github.ref == 'refs/heads/ratebase'}} - with: - token: ${{ secrets.GITHUB_TOKEN }} - tag: ratebase-build - files: | - l4s-ratebase.zip + l4s-${{ steps.extract_branch.outputs.branch }}.zip From 13ad496aa847ff36c398813a6123345e05b4151e Mon Sep 17 00:00:00 2001 From: Chia-Yu Date: Thu, 14 Sep 2023 20:03:37 +0200 Subject: [PATCH 06/47] Revert experiment code and fix typo in workflow --- .github/workflows/kernel.yml | 4 +- include/uapi/linux/inet_diag.h | 2 +- net/ipv4/tcp_prague.c | 98 +++++++++++++--------------------- 3 files changed, 41 insertions(+), 63 deletions(-) diff --git a/.github/workflows/kernel.yml b/.github/workflows/kernel.yml index 168eb823cadc5..2ac6c4d375df0 100644 --- a/.github/workflows/kernel.yml +++ b/.github/workflows/kernel.yml @@ -59,11 +59,11 @@ jobs: path: debian_build release: - name: Release build artifacts for the branch + name: Release build artifacts for the testing branch runs-on: ubuntu-20.04 needs: build permissions: 
write-all - if: ${{ github.ref == 'refs/heads/testing' || github.ref == 'refs/heads/ratebase'}} + if: ${{ github.ref == 'refs/heads/testing' }} steps: - name: Get artifact uses: actions/download-artifact@v3 diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index a51c4f735ad11..c9b87628657a9 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -237,9 +237,9 @@ struct tcp_bbr_info { struct tcp_prague_info { __u64 prague_alpha; __u64 prague_ai_ack_increase; - __u64 prague_frac_cwnd; __u32 prague_max_burst; __u32 prague_round; + __u32 prague_rtt_transition; __u32 prague_rtt_indep; __u32 prague_rtt_target; bool prague_enabled; diff --git a/net/ipv4/tcp_prague.c b/net/ipv4/tcp_prague.c index 17cedc6ef562d..069c7004ed9cb 100644 --- a/net/ipv4/tcp_prague.c +++ b/net/ipv4/tcp_prague.c @@ -168,12 +168,9 @@ struct prague { u64 cwr_stamp; u64 alpha_stamp; /* EWMA update timestamp */ u64 upscaled_alpha; /* Congestion-estimate EWMA */ - u64 ai_ack_stamp; u64 ai_ack_increase; /* AI increase per non-CE ACKed MSS */ - u32 acc_acked; /* accumulated acked */ - u32 acc_acked_ce; /* accumulated acked ce */ - u64 frac_cwnd; /* internal fractional cwnd */ - u64 loss_frac_cwnd; + s64 cwnd_cnt; /* cwnd update carry */ + s64 loss_cwnd_cnt; u32 loss_cwnd; u32 max_tso_burst; u32 rest_depth_us; @@ -265,20 +262,6 @@ static u64 prague_unscaled_ai_ack_increase(struct sock *sk) return 1 << CWND_UNIT; } -static u32 prague_frac_cwnd_to_snd_cwnd(struct sock *sk) -{ - struct prague *ca = prague_ca(sk); - u64 rtt, target, frac_cwnd; - - rtt = US2RTT(tcp_sk(sk)->srtt_us >> 3); - target = prague_target_rtt(sk); - frac_cwnd = ca->frac_cwnd; - if (likely(target)) - frac_cwnd = div64_u64(frac_cwnd * rtt, target); - - return max((u32)((frac_cwnd + ONE_CWND - 1) >> CWND_UNIT), 1); -} - /* RTT independence will scale the classical 1/W per ACK increase. 
*/ static void prague_ai_ack_increase(struct sock *sk) { @@ -313,25 +296,22 @@ static void prague_ai_ack_increase(struct sock *sk) */ static void prague_update_pacing_rate(struct sock *sk) { - struct prague *ca = prague_ca(sk); const struct tcp_sock *tp = tcp_sk(sk); - u64 max_inflight; + u32 max_inflight; u64 rate, burst; int mtu; mtu = tcp_mss_to_mtu(sk, tp->mss_cache); // Must also set tcp_ecn_option=0 and tcp_ecn_unsafe_cep=1 // to disable the option and safer heuristic... - max_inflight = ca->frac_cwnd; + max_inflight = max(tp->snd_cwnd, tcp_packets_in_flight(tp)); rate = (u64)((u64)USEC_PER_SEC << 3) * mtu; if (tp->snd_cwnd < tp->snd_ssthresh / 2) rate <<= 1; - //if (likely(tp->srtt_us)) - // rate = div64_u64(rate, tp->srtt_us); - if (likely(RTT2US(prague_target_rtt(sk)))) - rate = div64_u64(rate, RTT2US(prague_target_rtt(sk)) << 3); - rate = (rate*max_inflight + (ONE_CWND >> 1)) >> CWND_UNIT; + if (likely(tp->srtt_us)) + rate = div64_u64(rate, tp->srtt_us); + rate *= max_inflight; rate = min_t(u64, rate, sk->sk_max_pacing_rate); /* TODO(otilmans) rewrite the tso_segs hook to bytes to avoid this * division. 
It will somehow need to be able to take hdr sizes into @@ -469,7 +449,6 @@ static void prague_update_cwnd(struct sock *sk, const struct rate_sample *rs) struct tcp_sock *tp = tcp_sk(sk); u64 increase; s64 acked; - u32 new_cwnd; acked = rs->acked_sacked; if (rs->ece_delta) { @@ -480,9 +459,6 @@ static void prague_update_cwnd(struct sock *sk, const struct rate_sample *rs) acked -= rs->ece_delta; } - ca->acc_acked += rs->acked_sacked; - ca->acc_acked_ce += rs->ece_delta; - if (acked <= 0 || ca->in_loss || !tcp_is_cwnd_limited(sk)) goto adjust; @@ -494,26 +470,27 @@ static void prague_update_cwnd(struct sock *sk, const struct rate_sample *rs) } } - if (RTT2US(prague_target_rtt(sk)) > tcp_stamp_us_delta(tp->tcp_mstamp, - ca->ai_ack_stamp)) - goto adjust; - ca->ai_ack_stamp = tp->tcp_mstamp; - if (likely(ca->acc_acked)) { - increase = div_u64((ca->acc_acked - ca->acc_acked_ce)*ca->ai_ack_increase, ca->acc_acked); - ca->frac_cwnd += max_t(u64, increase, ca->ai_ack_increase); - } - ca->acc_acked = 0; - ca->acc_acked_ce = 0; + increase = acked * ca->ai_ack_increase; + if (likely(tp->snd_cwnd)) + increase = div_u64(increase + (tp->snd_cwnd >> 1), + tp->snd_cwnd); + ca->cwnd_cnt += max_t(u64, acked, increase); + adjust: - new_cwnd = prague_frac_cwnd_to_snd_cwnd(sk); - if (tp->snd_cwnd > new_cwnd && tp->snd_cwnd > MIN_CWND) { - /* Reuse the step-wise cwnd decrement */ + if (ca->cwnd_cnt <= -ONE_CWND) { + ca->cwnd_cnt += ONE_CWND; --tp->snd_cwnd; + if (tp->snd_cwnd < MIN_CWND) { + tp->snd_cwnd = MIN_CWND; + /* No point in applying further reductions */ + ca->cwnd_cnt = 0; + } tp->snd_ssthresh = tp->snd_cwnd; prague_cwnd_changed(sk); - } else if (tp->snd_cwnd < new_cwnd && tp->snd_cwnd < tp->snd_cwnd_clamp) { - /* Reuse the step-wise cwnd increment */ + } else if (ca->cwnd_cnt >= ONE_CWND) { + ca->cwnd_cnt -= ONE_CWND; ++tp->snd_cwnd; + tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_cwnd_clamp); prague_cwnd_changed(sk); } return; @@ -530,8 +507,9 @@ static void 
prague_enter_loss(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); ca->loss_cwnd = tp->snd_cwnd; - ca->loss_frac_cwnd = ca->frac_cwnd; - ca->frac_cwnd -= (ca->frac_cwnd >> 1); + ca->loss_cwnd_cnt = ca->cwnd_cnt; + ca->cwnd_cnt -= + (((u64)tp->snd_cwnd) << (CWND_UNIT - 1)) + (ca->cwnd_cnt >> 1); ca->in_loss = 1; prague_cwnd_changed(sk); } @@ -595,11 +573,11 @@ static void prague_enter_cwr(struct sock *sk) if (prague_ecn_fallback == 1 && tp->classic_ecn > L_STICKY) alpha = prague_classic_ecn_fallback(tp, alpha); - reduction = (alpha * (ca->frac_cwnd) + + reduction = (alpha * ((u64)tp->snd_cwnd << CWND_UNIT) + /* Unbias the rounding by adding 1/2 */ PRAGUE_MAX_ALPHA) >> (PRAGUE_ALPHA_BITS + 1U); - ca->frac_cwnd -= reduction; + ca->cwnd_cnt -= reduction; return; } @@ -667,7 +645,7 @@ static u32 prague_cwnd_undo(struct sock *sk) struct prague *ca = prague_ca(sk); /* We may have made some progress since then, account for it. */ - ca->frac_cwnd = max(ca->frac_cwnd, ca->loss_frac_cwnd); + ca->cwnd_cnt += ca->cwnd_cnt - ca->loss_cwnd_cnt; return max(ca->loss_cwnd, tcp_sk(sk)->snd_cwnd); } @@ -713,8 +691,8 @@ static size_t prague_get_info(struct sock *sk, u32 ext, int *attr, info->prague.prague_ai_ack_increase = READ_ONCE(ca->ai_ack_increase); info->prague.prague_round = ca->round; - info->prague.prague_frac_cwnd = - READ_ONCE(ca->frac_cwnd); + info->prague.prague_rtt_transition = + ca->rtt_transition_delay; info->prague.prague_enabled = 1; info->prague.prague_rtt_indep = ca->rtt_indep; info->prague.prague_rtt_target = @@ -764,8 +742,9 @@ static void prague_init(struct sock *sk) ca->alpha_stamp = tp->tcp_mstamp; ca->upscaled_alpha = PRAGUE_MAX_ALPHA << PRAGUE_SHIFT_G; - ca->frac_cwnd = ((u64)tp->snd_cwnd << CWND_UNIT); - ca->loss_frac_cwnd = 0; + ca->cwnd_cnt = 0; + ca->loss_cwnd_cnt = 0; + ca->loss_cwnd = 0; ca->max_tso_burst = 1; ca->round = 0; ca->rtt_transition_delay = prague_rtt_transition; @@ -774,10 +753,8 @@ static void prague_init(struct sock *sk) if 
(ca->rtt_indep >= __RTT_CONTROL_MAX) ca->rtt_indep = RTT_CONTROL_NONE; LOG(sk, "RTT indep chosen: %d (after %u rounds), targetting %u usec", - ca->rtt_indep, ca->rtt_transition_delay, prague_target_rtt(sk)); + ca->rtt_indep, ca->rtt_transition_delay, prague_target_rtt(sk)); ca->saw_ce = !!tp->delivered_ce; - if (US2RTT(tp->srtt_us >> 3)) - ca->frac_cwnd = div64_u64(ca->frac_cwnd*prague_target_rtt(sk), US2RTT(tp->srtt_us >> 3)); /* reuse existing meaurement of SRTT as an intial starting point */ tp->g_srtt_shift = PRAGUE_MAX_SRTT_BITS; @@ -817,8 +794,9 @@ static u64 prague_rate_scaled_ai_ack_increase(struct sock *sk, u32 rtt) * * Overflows if e2e RTT is > 100ms, hence the cap */ - increase = (u64)1 << CWND_UNIT; - divisor = 1; + increase = (u64)rtt << CWND_UNIT; + increase *= rtt; + divisor = target * target; increase = div64_u64(increase + (divisor >> 1), divisor); return increase; } From a8d9adfdf2849722283f55e889b32326116c907d Mon Sep 17 00:00:00 2001 From: Bob Briscoe Date: Thu, 14 Sep 2023 19:04:37 +0100 Subject: [PATCH 07/47] Update tcp_ecn in ip-sysctl.rst Fixed table (table headings don't seem to render unless there's text in the first heading column). Clarified that tcp_ecn enables ECN at IP and TCP layers --- Documentation/networking/ip-sysctl.rst | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 822f350a181be..5a899d7402a89 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -387,7 +387,7 @@ tcp_early_retrans - INTEGER tcp_ecn - INTEGER Control use of Explicit Congestion Notification (ECN) by TCP. - ECN is used only when both ends of the TCP connection indicate + ECN is supported (at both the IP and TCP layers) only when both ends of the TCP connection indicate support for it. 
This feature is useful in avoiding losses due to congestion by allowing supporting routers to signal congestion before having to drop packets. The highest variant @@ -398,16 +398,16 @@ tcp_ecn - INTEGER and the highest variant requested by outgoing connection attempts: - = ==================== ==================== - Incoming connections Outgoing connections - = ==================== ==================== - 0 No ECN No ECN - 1 ECN ECN - 2 ECN No ECN - 3 AccECN AccECN - 4 AccECN ECN - 5 AccECN No ECN - = ==================== ==================== + ===== ==================== ==================== + Value Incoming connections Outgoing connections + ===== ==================== ==================== + 0 No ECN No ECN + 1 ECN ECN + 2 ECN No ECN + 3 AccECN AccECN + 4 AccECN ECN + 5 AccECN No ECN + ===== ==================== ==================== Default: 2 From dbab85e13554832176b05104f6b72b48e7249bd9 Mon Sep 17 00:00:00 2001 From: Bob Briscoe Date: Thu, 14 Sep 2023 19:20:49 +0100 Subject: [PATCH 08/47] Update tcp_ecn in ip-sysctl.rst Wrap And explained IP and TCP layers differently. --- Documentation/networking/ip-sysctl.rst | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 5a899d7402a89..62d298f89828c 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -387,12 +387,13 @@ tcp_early_retrans - INTEGER tcp_ecn - INTEGER Control use of Explicit Congestion Notification (ECN) by TCP. - ECN is supported (at both the IP and TCP layers) only when both ends of the TCP connection indicate - support for it. This feature is useful in avoiding losses due - to congestion by allowing supporting routers to signal - congestion before having to drop packets. The highest variant - of ECN feedback that both peers support is chosen by the ECN - negotiation (Accurate ECN, ECN, or no ECN). 
+ ECN is used only when both ends of the TCP connection indicate support + for it. This feature is useful in avoiding losses due to congestion by + allowing supporting routers to signal congestion before having to drop + packets. A host that supports ECN both sends ECN at the IP layer and + feeds back ECN at the TCP layer. The highest variant of ECN feedback + that both peers support is chosen by the ECN negotiation (Accurate ECN, + ECN, or no ECN). The highest negotiated variant for incoming connection requests and the highest variant requested by outgoing connection From 6a56c9c3ca3fa78dc49df0972f799344b37c6da0 Mon Sep 17 00:00:00 2001 From: Chia-Yu Chang Date: Mon, 18 Sep 2023 10:41:54 +0200 Subject: [PATCH 09/47] Update ratebased control code --- include/linux/tcp.h | 1 + include/uapi/linux/inet_diag.h | 5 +- net/ipv4/tcp.c | 1 + net/ipv4/tcp_output.c | 4 +- net/ipv4/tcp_prague.c | 619 +++++++++++++-------------------- 5 files changed, 241 insertions(+), 389 deletions(-) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index d20f31b53a984..41dce7a3424b1 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -210,6 +210,7 @@ struct tcp_sock { u32 snd_wnd; /* The window we expect to receive */ u32 max_window; /* Maximal window ever seen from peer */ u32 mss_cache; /* Cached effective mss, not including SACKS */ + bool mss_cache_set_by_ca; u32 window_clamp; /* Maximal window to advertise */ u32 rcv_ssthresh; /* Current window clamp */ diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index c9b87628657a9..2726b6cc65875 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -236,11 +236,10 @@ struct tcp_bbr_info { struct tcp_prague_info { __u64 prague_alpha; - __u64 prague_ai_ack_increase; + __u64 prague_frac_cwnd; + __u64 prague_rate_bytes; __u32 prague_max_burst; __u32 prague_round; - __u32 prague_rtt_transition; - __u32 prague_rtt_indep; __u32 prague_rtt_target; bool prague_enabled; }; diff 
--git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9721d7f0db9b9..aefd3a9362bd1 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -447,6 +447,7 @@ void tcp_init_sock(struct sock *sk) tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tp->snd_cwnd_clamp = ~0; tp->mss_cache = TCP_MSS_DEFAULT; + tp->mss_cache_set_by_ca = false; tp->reordering = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering); tcp_assign_congestion_control(sk); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 32c347fe2ccfe..eadb0eeb4bcc0 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2018,7 +2018,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) struct inet_connection_sock *icsk = inet_csk(sk); int mss_now; - if (icsk->icsk_mtup.search_high > pmtu) + if (icsk->icsk_mtup.search_high > pmtu && !tp->mss_cache_set_by_ca) icsk->icsk_mtup.search_high = pmtu; mss_now = tcp_mtu_to_mss(sk, pmtu); @@ -2048,7 +2048,7 @@ unsigned int tcp_current_mss(struct sock *sk) mss_now = tp->mss_cache; - if (dst) { + if (dst && !tp->mss_cache_set_by_ca) { u32 mtu = dst_mtu(dst); if (mtu != inet_csk(sk)->icsk_pmtu_cookie) mss_now = tcp_sync_mss(sk, mtu); diff --git a/net/ipv4/tcp_prague.c b/net/ipv4/tcp_prague.c index 069c7004ed9cb..5a2f1f160c301 100644 --- a/net/ipv4/tcp_prague.c +++ b/net/ipv4/tcp_prague.c @@ -89,52 +89,19 @@ #include #include -#define MIN_CWND 2U -#define PRAGUE_ALPHA_BITS 20U +#define MIN_CWND_RTT 2U +#define MIN_CWND_VIRT 2U +#define MIN_MSS 150U +#define MINIMUM_RATE 12500ULL /* Minimum rate in Bytes/second: 100kbps */ +#define PRAGUE_ALPHA_BITS 24U #define PRAGUE_MAX_ALPHA (1ULL << PRAGUE_ALPHA_BITS) #define CWND_UNIT 20U -#define ONE_CWND (1LL << CWND_UNIT) /* Must be signed */ +#define ONE_CWND (1ULL << CWND_UNIT) #define PRAGUE_SHIFT_G 4 /* EWMA gain g = 1/2^4 */ #define DEFAULT_RTT_TRANSITION 500 #define MAX_SCALED_RTT (100 * USEC_PER_MSEC) -#define RTT_UNIT 7 -#define RTT2US(x) ((x) << RTT_UNIT) -#define US2RTT(x) ((x) >> RTT_UNIT) - -#define PRAGUE_MAX_SRTT_BITS 
18U -#define PRAGUE_MAX_MDEV_BITS (PRAGUE_MAX_SRTT_BITS+1) -#define PRAGUE_INIT_MDEV_CARRY 741455 /* 1 << (PRAGUE_MAX_MDEV_BITS+0.5) */ -#define PRAGUE_INIT_ADJ_US 262144 /* 1 << (PRAGUE_MAX_MDEV_BITS-1) */ - -/* Weights, 1/2^x */ -#define V 1 /* 0.5 */ -#define D 1 /* 0.5 */ -#define S 2 /* 0.25 */ - -/* Store classic_ecn with same scaling as alpha */ -#define L_STICKY (16ULL << (PRAGUE_ALPHA_BITS-V)) /* Pure L4S behaviour */ -#define CLASSIC_ECN L_STICKY + \ - PRAGUE_MAX_ALPHA /* Transition between classic and L4S */ -#define C_STICKY CLASSIC_ECN + \ - L_STICKY /* Pure classic behaviour */ - -#define V0_LG (10014683ULL >> V) /* reference queue V of ~750us */ -#define D0_LG (11498458ULL >> D) /* reference queue D of ~2ms */ - -/* RTT cwnd scaling heuristics */ -enum { - /* No RTT independence */ - RTT_CONTROL_NONE = 0, - /* Flows with e2e RTT <= target RTT achieve the same throughput */ - RTT_CONTROL_RATE, - /* Trade some throughput balance at very low RTTs for a floor on the - * amount of marks/RTT */ - RTT_CONTROL_SCALABLE, - /* Behave as a flow operating with an extra target RTT */ - RTT_CONTROL_ADDITIVE, - - __RTT_CONTROL_MAX -}; +#define MTU_SYS 1500UL +#define OFFSET_UNIT 7 static u32 prague_burst_shift __read_mostly = 12; /* 1/2^12 sec ~=.25ms */ MODULE_PARM_DESC(prague_burst_shift, @@ -145,11 +112,6 @@ static u32 prague_max_tso_segs __read_mostly = 0; MODULE_PARM_DESC(prague_max_tso_segs, "Maximum TSO/GSO segments"); module_param(prague_max_tso_segs, uint, 0644); -static u32 prague_rtt_scaling __read_mostly = RTT_CONTROL_RATE; -MODULE_PARM_DESC(prague_rtt_scaling, "Enable RTT independence through the " - "chosen RTT scaling heuristic"); -module_param(prague_rtt_scaling, uint, 0644); - static u32 prague_rtt_target __read_mostly = 25 * USEC_PER_MSEC; MODULE_PARM_DESC(prague_rtt_target, "RTT scaling target"); module_param(prague_rtt_target, uint, 0644); @@ -159,22 +121,27 @@ MODULE_PARM_DESC(prague_rtt_transition, "Amount of post-SS rounds to transition" " to 
be RTT independent."); module_param(prague_rtt_transition, uint, 0644); -static int prague_ecn_fallback __read_mostly = 0; -MODULE_PARM_DESC(prague_ecn_fallback, "0 = none, 1 = detection & fallback" - " 2 = detection"); -module_param(prague_ecn_fallback, int, 0644); +static int prague_rate_offset __read_mostly = 4; /* 4/128 ~= 3% */ +MODULE_PARM_DESC(prague_rate_offset, + "Pacing rate offset in 1/128 units at each half of RTT_virt"); +module_param(prague_rate_offset, uint, 0644); + +static int prague_hsrtt_shift __read_mostly = 7; +MODULE_PARM_DESC(prague_hsrtt_shift, + "Pacing high-smoothed RTT facot as a base-2 exponent"); +module_param(prague_hsrtt_shift, uint, 0644); struct prague { u64 cwr_stamp; u64 alpha_stamp; /* EWMA update timestamp */ u64 upscaled_alpha; /* Congestion-estimate EWMA */ u64 ai_ack_increase; /* AI increase per non-CE ACKed MSS */ - s64 cwnd_cnt; /* cwnd update carry */ - s64 loss_cwnd_cnt; + u64 hsrtt_us; + u64 frac_cwnd; /* internal fractional cwnd */ + u64 rate_bytes; /* internal pacing rate in bytes */ + u64 loss_rate_bytes; u32 loss_cwnd; u32 max_tso_burst; - u32 rest_depth_us; - u32 rest_mdev_us; u32 old_delivered; /* tp->delivered at round start */ u32 old_delivered_ce; /* tp->delivered_ce at round start */ u32 next_seq; /* tp->snd_nxt at round start */ @@ -182,17 +149,9 @@ struct prague { u32 rtt_transition_delay; u32 rtt_target; /* RTT scaling target */ u8 saw_ce:1, /* Is there an AQM on the path? 
*/ - rtt_indep:3, /* RTT independence mode */ in_loss:1; /* In cwnd reduction caused by loss */ }; -struct rtt_scaling_ops { - bool (*should_update_ewma)(struct sock *sk); - u64 (*ai_ack_increase)(struct sock *sk, u32 rtt); - u32 (*target_rtt)(struct sock *sk); -}; -static struct rtt_scaling_ops rtt_scaling_heuristics[__RTT_CONTROL_MAX]; - /* Fallback struct ops if we fail to negotiate AccECN */ static struct tcp_congestion_ops prague_reno; @@ -212,7 +171,7 @@ static void __prague_connection_id(struct sock *sk, char *str, size_t len) char __tmp[2 * (INET6_ADDRSTRLEN + 9) + 1] = {0}; \ __prague_connection_id(sk, __tmp, sizeof(__tmp)); \ /* pr_fmt expects the connection ID*/ \ - pr_info("(%s) : " fmt "\n", __tmp, ##__VA_ARGS__); \ + pr_info("(%s) : " fmt "\n", __tmp, ##__VA_ARGS__); \ } while (0) static struct prague *prague_ca(struct sock *sk) @@ -224,19 +183,25 @@ static bool prague_is_rtt_indep(struct sock *sk) { struct prague *ca = prague_ca(sk); - return ca->rtt_indep != RTT_CONTROL_NONE && - !tcp_in_slow_start(tcp_sk(sk)) && + return !tcp_in_slow_start(tcp_sk(sk)) && ca->round >= ca->rtt_transition_delay; } -static struct rtt_scaling_ops* prague_rtt_scaling_ops(struct sock *sk) +static bool prague_e2e_rtt_elapsed(struct sock *sk) { - return &rtt_scaling_heuristics[prague_ca(sk)->rtt_indep]; + return !before(tcp_sk(sk)->snd_una, prague_ca(sk)->next_seq); } -static bool prague_e2e_rtt_elapsed(struct sock *sk) +static u32 prague_target_rtt(struct sock *sk) { - return !before(tcp_sk(sk)->snd_una, prague_ca(sk)->next_seq); + return prague_ca(sk)->rtt_target; +} + +static bool prague_target_rtt_elapsed(struct sock *sk) +{ + return (prague_target_rtt(sk) >> 3) <= + tcp_stamp_us_delta(tcp_sk(sk)->tcp_mstamp, + prague_ca(sk)->alpha_stamp); } /* RTT independence on a step AQM requires the competing flows to converge to @@ -245,21 +210,104 @@ static bool prague_e2e_rtt_elapsed(struct sock *sk) static bool prague_should_update_ewma(struct sock *sk) { return 
prague_e2e_rtt_elapsed(sk) && - (!prague_rtt_scaling_ops(sk)->should_update_ewma || - !prague_is_rtt_indep(sk) || - prague_rtt_scaling_ops(sk)->should_update_ewma(sk)); + (!prague_is_rtt_indep(sk) || + prague_target_rtt_elapsed(sk)); } -static u32 prague_target_rtt(struct sock *sk) +static u64 prague_unscaled_ai_ack_increase(struct sock *sk) { - return prague_rtt_scaling_ops(sk)->target_rtt ? - prague_rtt_scaling_ops(sk)->target_rtt(sk) : - prague_ca(sk)->rtt_target; + return 1 << CWND_UNIT; } -static u64 prague_unscaled_ai_ack_increase(struct sock *sk) +static u64 prague_rate_scaled_ai_ack_increase(struct sock *sk, u32 rtt) { - return 1 << CWND_UNIT; + u64 increase; + u64 divisor; + u64 target; + + target = prague_target_rtt(sk); + if (rtt >= target) + return prague_unscaled_ai_ack_increase(sk); + + increase = (u64)rtt << CWND_UNIT; + increase *= rtt; + divisor = target * target; + increase = div64_u64(increase + (divisor >> 1), divisor); + return increase; +} + +static u64 mul_64_64_shift(u64 left, u64 right, u32 shift) +{ + u64 a0 = left & ((1ULL<<32)-1); + u64 a1 = left >> 32; + u64 b0 = right & ((1ULL<<32)-1); + u64 b1 = right >> 32; + u64 m0 = a0 * b0; + u64 m1 = a0 * b1; + u64 m2 = a1 * b0; + u64 m3 = a1 * b1; + u64 result_low; + u64 result_high; + + m2 += (m0 >> 32); + m2 += m1; + /* Overflow */ + if (m2 < m1) + m3 += (1ULL<<32); + + result_low = (m0 & ((1ULL<<32)-1)) | (m2 << 32); + result_high = m3 + (m2 >> 32); + if (shift && 64 >= shift) { + result_low = (result_low >> shift) | (result_high << (64-shift)); + result_high = (result_high >> shift); + } + return (result_high) ? 
0xffffffffffffffffULL : result_low; +} + +static u32 prague_frac_cwnd_to_snd_cwnd(struct sock *sk) +{ + struct prague *ca = prague_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + + return min_t(u32, max_t(u32, (u32)((ca->frac_cwnd + (ONE_CWND - 1)) >> CWND_UNIT), + MIN_CWND_RTT), tp->snd_cwnd_clamp); +} + +static u64 prague_virtual_rtt(struct sock *sk) +{ + return max_t(u32, prague_target_rtt(sk), tcp_sk(sk)->srtt_us); +} + +static u64 prague_pacing_rate_to_max_mtu(struct sock *sk) +{ + return div_u64(mul_64_64_shift(prague_ca(sk)->rate_bytes, prague_virtual_rtt(sk), 23) + + (MIN_CWND_VIRT - 1), MIN_CWND_VIRT); +} + +static bool prague_half_target_rtt_elapsed(struct sock *sk) +{ + return (prague_target_rtt(sk) >> (3 + 1)) <= + tcp_stamp_us_delta(tcp_sk(sk)->tcp_mstamp, + prague_ca(sk)->alpha_stamp); +} + +static u64 prague_pacing_rate_to_frac_cwnd(struct sock *sk) +{ + struct prague *ca = prague_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + u64 rtt; + u32 mtu; + + mtu = tcp_mss_to_mtu(sk, tp->mss_cache); + rtt = (prague_is_rtt_indep(sk) && (ca->hsrtt_us >> prague_hsrtt_shift)) ? + (ca->hsrtt_us >> prague_hsrtt_shift) : tp->srtt_us; + + return div_u64(mul_64_64_shift(ca->rate_bytes, rtt, 23 - CWND_UNIT) + (mtu - 1), mtu); +} + +static u32 prague_valid_mtu(struct sock *sk, u32 mtu) +{ + return max_t(u32, min_t(u32, MTU_SYS, mtu), tcp_mss_to_mtu(sk, MIN_MSS)); } /* RTT independence will scale the classical 1/W per ACK increase. 
*/ @@ -269,53 +317,50 @@ static void prague_ai_ack_increase(struct sock *sk) u64 increase; u32 rtt; - if (!prague_rtt_scaling_ops(sk)->ai_ack_increase) { - increase = prague_unscaled_ai_ack_increase(sk); - goto exit; - } - - rtt = US2RTT(tcp_sk(sk)->srtt_us >> 3); + rtt = tcp_sk(sk)->srtt_us; if (ca->round < ca->rtt_transition_delay || - !rtt || rtt > MAX_SCALED_RTT) { + !rtt || rtt > (MAX_SCALED_RTT << 3)) { increase = prague_unscaled_ai_ack_increase(sk); goto exit; } - increase = prague_rtt_scaling_ops(sk)->ai_ack_increase(sk, rtt); + increase = prague_rate_scaled_ai_ack_increase(sk, rtt); exit: WRITE_ONCE(ca->ai_ack_increase, increase); } -/* Ensure prague sends traffic as smoothly as possible: - * - Pacing is set to 100% during AI - * - The max GSO burst size is bounded in time at the pacing rate. - * - * We keep the 200% pacing rate during SS, as we need to send 2 MSS back to - * back for every received ACK. - */ static void prague_update_pacing_rate(struct sock *sk) { + struct prague *ca = prague_ca(sk); const struct tcp_sock *tp = tcp_sk(sk); - u32 max_inflight; - u64 rate, burst; - int mtu; - - mtu = tcp_mss_to_mtu(sk, tp->mss_cache); - // Must also set tcp_ecn_option=0 and tcp_ecn_unsafe_cep=1 - // to disable the option and safer heuristic... 
- max_inflight = max(tp->snd_cwnd, tcp_packets_in_flight(tp)); + u64 max_inflight; + u64 rate, burst, offset; + u64 mtu; + + if (prague_is_rtt_indep(sk)) { + offset = mul_64_64_shift(prague_rate_offset, ca->rate_bytes, OFFSET_UNIT); + if (prague_half_target_rtt_elapsed(sk)) // second half + rate = ca->rate_bytes - offset; + else // first half + rate = ca->rate_bytes + offset; + } else { + mtu = tcp_mss_to_mtu(sk, tp->mss_cache); + max_inflight = ca->frac_cwnd; + rate = (u64)((u64)USEC_PER_SEC << 3) * mtu; + } - rate = (u64)((u64)USEC_PER_SEC << 3) * mtu; if (tp->snd_cwnd < tp->snd_ssthresh / 2) rate <<= 1; - if (likely(tp->srtt_us)) - rate = div64_u64(rate, tp->srtt_us); - rate *= max_inflight; + + if (!prague_is_rtt_indep(sk)) { + if (likely(tp->srtt_us)) + rate = div64_u64(rate, (u64)tp->srtt_us); + rate = (rate*max_inflight) >> CWND_UNIT; + ca->rate_bytes = max_t(u64, rate, MINIMUM_RATE); + } + rate = min_t(u64, rate, sk->sk_max_pacing_rate); - /* TODO(otilmans) rewrite the tso_segs hook to bytes to avoid this - * division. It will somehow need to be able to take hdr sizes into - * account */ burst = div_u64(rate, tcp_mss_to_mtu(sk, tp->mss_cache)); WRITE_ONCE(prague_ca(sk)->max_tso_burst, @@ -342,69 +387,16 @@ static void prague_new_round(struct sock *sk) static void prague_cwnd_changed(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - tp->snd_cwnd_stamp = tcp_jiffies32; prague_ai_ack_increase(sk); } -/* TODO(asadsa): move this detection out of prague to make it more generic. 
*/ -/* TODO(asadsa): check if self-limited works as given out in the design */ -static void prague_classic_ecn_detection(struct sock *sk) -{ - struct prague *ca = prague_ca(sk); - struct tcp_sock *tp = tcp_sk(sk); - u32 min_rtt_us = tcp_min_rtt(tp); - u32 g_srtt_shift = tp->g_srtt_shift; - u32 g_mdev_shift = tp->g_mdev_shift; - u64 srtt_us = tp->srtt_pace_us >> g_srtt_shift; - u64 mdev_us = tp->mdev_pace_us >> g_mdev_shift; - u64 depth_us; - u32 mdev_lg, depth_lg; - u32 adj_us = PRAGUE_INIT_ADJ_US >> (PRAGUE_MAX_MDEV_BITS - g_mdev_shift); - s64 new_classic_ecn = (s64)tp->classic_ecn; - - if (unlikely(!srtt_us) || unlikely(min_rtt_us == ~0U)) - return; - - /* Multiply upscaled mdev by upscaled geometric carry from the previous round - * adding upscaled adjustment to unbias the subsequent integer log - */ - mdev_us = (u64)mdev_us * ca->rest_mdev_us + adj_us; - mdev_lg = max_t(u32, ilog2(mdev_us), g_mdev_shift) - g_mdev_shift; - /* carry the new rest to the next round */ - ca->rest_mdev_us = mdev_us >> mdev_lg; - /* V*lg(mdev_us/VO) */ - mdev_lg <<= PRAGUE_ALPHA_BITS - V; - new_classic_ecn += (s64)mdev_lg - V0_LG; - - if (unlikely(srtt_us <= min_rtt_us)) - goto out; - - depth_us = (srtt_us - min_rtt_us) * ca->rest_depth_us + (adj_us >> 1); - depth_lg = max_t(u32, ilog2(depth_us), g_srtt_shift) - g_srtt_shift; - ca->rest_depth_us = depth_us >> depth_lg; - /* queue build-up can only bring classic_ecn toward more classic */ - /* + D*lg(max(d/D0, 1)) */ - depth_lg <<= PRAGUE_ALPHA_BITS - D; - if (depth_lg > D0_LG) { - new_classic_ecn += (u64)depth_lg - D0_LG; - } - - /* self-limited? 
*/ - //if (!tcp_is_cwnd_limited(sk)) - // /* - S*s */ - // new_classic_ecn -= PRAGUE_MAX_ALPHA - - // (tp->snd_cwnd_used << (PRAGUE_ALPHA_BITS-S)) / tp->snd_cwnd; - -out: - tp->classic_ecn = min_t(u64, max_t(s64, new_classic_ecn, 0), C_STICKY); -} - static void prague_update_alpha(struct sock *sk) { struct prague *ca = prague_ca(sk); struct tcp_sock *tp = tcp_sk(sk); - u64 ecn_segs, alpha; + u64 ecn_segs, alpha, mtu, mtu_used; + u64 hsrtt; /* Do not update alpha before we have proof that there's an AQM on * the path. @@ -412,9 +404,6 @@ static void prague_update_alpha(struct sock *sk) if (unlikely(!ca->saw_ce)) goto skip; - if (prague_ecn_fallback > 0) - prague_classic_ecn_detection(sk); - alpha = ca->upscaled_alpha; ecn_segs = tp->delivered_ce - ca->old_delivered_ce; /* We diverge from the original EWMA, i.e., @@ -439,6 +428,18 @@ static void prague_update_alpha(struct sock *sk) WRITE_ONCE(ca->upscaled_alpha, alpha); tp->alpha = alpha >> PRAGUE_SHIFT_G; + mtu_used = tcp_mss_to_mtu(sk, tp->mss_cache); + mtu = prague_valid_mtu(sk, prague_pacing_rate_to_max_mtu(sk)); + if (mtu_used != mtu) { + ca->frac_cwnd = div_u64(ca->frac_cwnd * mtu_used, mtu); + tp->mss_cache_set_by_ca = true; + tcp_sync_mss(sk, mtu); + tp->snd_cwnd = prague_frac_cwnd_to_snd_cwnd(sk); + } + + hsrtt = ca->hsrtt_us; + hsrtt = hsrtt - (hsrtt >> prague_hsrtt_shift) + tp->srtt_us; + WRITE_ONCE(ca->hsrtt_us, hsrtt); skip: prague_new_round(sk); } @@ -449,6 +450,9 @@ static void prague_update_cwnd(struct sock *sk, const struct rate_sample *rs) struct tcp_sock *tp = tcp_sk(sk); u64 increase; s64 acked; + u32 new_cwnd; + u64 divisor; + u64 mtu_used; acked = rs->acked_sacked; if (rs->ece_delta) { @@ -459,38 +463,44 @@ static void prague_update_cwnd(struct sock *sk, const struct rate_sample *rs) acked -= rs->ece_delta; } - if (acked <= 0 || ca->in_loss || !tcp_is_cwnd_limited(sk)) + if (acked <= 0 || ca->in_loss || tp->app_limited) goto adjust; if (tcp_in_slow_start(tp)) { acked = tcp_slow_start(tp, acked); 
+ ca->frac_cwnd = ((u64)tp->snd_cwnd << CWND_UNIT); if (!acked) { prague_cwnd_changed(sk); return; } } - increase = acked * ca->ai_ack_increase; - if (likely(tp->snd_cwnd)) - increase = div_u64(increase + (tp->snd_cwnd >> 1), - tp->snd_cwnd); - ca->cwnd_cnt += max_t(u64, acked, increase); + if (prague_is_rtt_indep(sk)) { + mtu_used = tcp_mss_to_mtu(sk, tp->mss_cache); + increase = div_u64(((u64)(acked * MTU_SYS)) << 23, prague_virtual_rtt(sk)); + divisor = mtu_used << 23; + new_cwnd = div64_u64(ca->rate_bytes * prague_virtual_rtt(sk) + divisor - 1, divisor); + if (likely(new_cwnd)) + ca->rate_bytes += div_u64(increase + (new_cwnd >> 1), new_cwnd); + ca->frac_cwnd = max_t(u64, ca->frac_cwnd + acked, prague_pacing_rate_to_frac_cwnd(sk)); + } else { + increase = acked * ca->ai_ack_increase; + new_cwnd = prague_frac_cwnd_to_snd_cwnd(sk); + if (likely(new_cwnd)) + increase = div_u64(increase + (new_cwnd >> 1), new_cwnd); + ca->frac_cwnd += max_t(u64, acked, increase); + } adjust: - if (ca->cwnd_cnt <= -ONE_CWND) { - ca->cwnd_cnt += ONE_CWND; + new_cwnd = prague_frac_cwnd_to_snd_cwnd(sk); + if (tp->snd_cwnd > new_cwnd) { + /* Step-wise cwnd decrement */ --tp->snd_cwnd; - if (tp->snd_cwnd < MIN_CWND) { - tp->snd_cwnd = MIN_CWND; - /* No point in applying further reductions */ - ca->cwnd_cnt = 0; - } tp->snd_ssthresh = tp->snd_cwnd; prague_cwnd_changed(sk); - } else if (ca->cwnd_cnt >= ONE_CWND) { - ca->cwnd_cnt -= ONE_CWND; + } else if (tp->snd_cwnd < new_cwnd) { + /* Step-wise cwnd increment */ ++tp->snd_cwnd; - tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_cwnd_clamp); prague_cwnd_changed(sk); } return; @@ -507,55 +517,13 @@ static void prague_enter_loss(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); ca->loss_cwnd = tp->snd_cwnd; - ca->loss_cwnd_cnt = ca->cwnd_cnt; - ca->cwnd_cnt -= - (((u64)tp->snd_cwnd) << (CWND_UNIT - 1)) + (ca->cwnd_cnt >> 1); + ca->loss_rate_bytes = ca->rate_bytes; + ca->rate_bytes -= (ca->rate_bytes >> 1); + ca->frac_cwnd = 
prague_pacing_rate_to_frac_cwnd(sk); ca->in_loss = 1; prague_cwnd_changed(sk); } -static void prague_update_rtt_scaling(struct sock *sk, u32 ssthresh) -{ - struct prague *ca = prague_ca(sk); - struct tcp_sock *tp = tcp_sk(sk); - int delta_shift; - u8 new_g_srtt_shift; - u8 old_g_srtt_shift = tp->g_srtt_shift; - - new_g_srtt_shift = ilog2(ssthresh); - new_g_srtt_shift += (new_g_srtt_shift >> 1) + 1; - tp->g_srtt_shift = min_t(u8, new_g_srtt_shift, PRAGUE_MAX_SRTT_BITS); - tp->g_mdev_shift = tp->g_srtt_shift + 1; - delta_shift = tp->g_srtt_shift - old_g_srtt_shift; - - if (!delta_shift) - return; - - if (delta_shift > 0) { - tp->srtt_pace_us <<= delta_shift; - tp->mdev_pace_us <<= delta_shift; - ca->rest_depth_us <<= delta_shift; - ca->rest_mdev_us <<= delta_shift; - } else { - delta_shift = -delta_shift; - tp->srtt_pace_us >>= delta_shift; - tp->mdev_pace_us >>= delta_shift; - ca->rest_depth_us >>= delta_shift; - ca->rest_mdev_us >>= delta_shift; - } -} - -static u64 prague_classic_ecn_fallback(struct tcp_sock *tp, u64 alpha) -{ - u64 c = min(tp->classic_ecn, CLASSIC_ECN) - L_STICKY; - /* 0 ... 
CLASSIC_ECN/PRAGUE_MAX_ALPHA */ - c = (c >> 1) + (c >> 3); /* c * ~0.6 */ - - - /* clamp alpha no lower than c to compete fair with classic AQMs */ - return max(alpha, c); -} - static void prague_enter_cwr(struct sock *sk) { struct prague *ca = prague_ca(sk); @@ -563,57 +531,28 @@ static void prague_enter_cwr(struct sock *sk) u64 reduction; u64 alpha; - if (prague_is_rtt_indep(sk) && - RTT2US(prague_target_rtt(sk)) > tcp_stamp_us_delta(tp->tcp_mstamp, - ca->cwr_stamp)) - return; - ca->cwr_stamp = tp->tcp_mstamp; - alpha = ca->upscaled_alpha >> PRAGUE_SHIFT_G; - - if (prague_ecn_fallback == 1 && tp->classic_ecn > L_STICKY) - alpha = prague_classic_ecn_fallback(tp, alpha); - - reduction = (alpha * ((u64)tp->snd_cwnd << CWND_UNIT) + - /* Unbias the rounding by adding 1/2 */ - PRAGUE_MAX_ALPHA) >> - (PRAGUE_ALPHA_BITS + 1U); - ca->cwnd_cnt -= reduction; - - return; -} + if (prague_is_rtt_indep(sk)) { + if ((prague_target_rtt(sk) >> 3) > tcp_stamp_us_delta(tp->tcp_mstamp, + ca->cwr_stamp)) + return; + ca->cwr_stamp = tp->tcp_mstamp; + alpha = ca->upscaled_alpha >> PRAGUE_SHIFT_G; -/* Calculate SRTT & SMDEV with lower gain to see past instantaneous variation. - * Also use accurate RTT measurement of last segment to do Classic ECN detection - * rather than using RFC6298 which includes delay accumulated between two - * successive segments at the receiver. Finally, we do not use this MDEV for RTO - * so initialize it to zero. We use a tweaked version of tcp_rtt_estimator(). 
- */ -static void prague_rtt_estimator(struct sock *sk, long mrtt_us) -{ - struct tcp_sock *tp = tcp_sk(sk); - long long m = mrtt_us; /* Accurate RTT */ - u64 srtt_pace = tp->srtt_pace_us; - tp->mrtt_pace_us = mrtt_us; - - if (srtt_pace != 0) { - m -= (srtt_pace >> tp->g_srtt_shift); /* m is now error in rtt est */ - srtt_pace += m; /* rtt += 1/2^g_srtt_shift new */ - if (m < 0) - m = -m; /* m is now abs(error) */ - m -= (tp->mdev_pace_us >> tp->g_mdev_shift); - tp->mdev_pace_us += m; /* mdev += 1/2^g_mev_shift new */ + reduction = mul_64_64_shift(ca->rate_bytes, alpha, PRAGUE_ALPHA_BITS + 1); + ca->rate_bytes = max_t(u64, ca->rate_bytes - reduction, MINIMUM_RATE); + ca->frac_cwnd = prague_pacing_rate_to_frac_cwnd(sk); } else { - /* no previous measure. */ - srtt_pace = m << tp->g_srtt_shift; /* take the measured time to be rtt */ - tp->mdev_pace_us = 1ULL << tp->g_mdev_shift; + ca->cwr_stamp = tp->tcp_mstamp; + alpha = ca->upscaled_alpha >> PRAGUE_SHIFT_G; + + reduction = (alpha * (ca->frac_cwnd) + + /* Unbias the rounding by adding 1/2 */ + PRAGUE_MAX_ALPHA) >> + (PRAGUE_ALPHA_BITS + 1U); + ca->frac_cwnd -= reduction; } - tp->srtt_pace_us = max(1ULL, srtt_pace); -} -static void prague_pkts_acked(struct sock *sk, const struct ack_sample *sample) -{ - if (sample->rtt_us != -1) - prague_rtt_estimator(sk, sample->rtt_us); + return; } static void prague_state(struct sock *sk, u8 new_state) @@ -645,7 +584,8 @@ static u32 prague_cwnd_undo(struct sock *sk) struct prague *ca = prague_ca(sk); /* We may have made some progress since then, account for it. 
*/ - ca->cwnd_cnt += ca->cwnd_cnt - ca->loss_cwnd_cnt; + ca->rate_bytes += ca->rate_bytes - ca->loss_rate_bytes; + ca->frac_cwnd = prague_pacing_rate_to_frac_cwnd(sk); return max(ca->loss_cwnd, tcp_sk(sk)->snd_cwnd); } @@ -660,15 +600,12 @@ static void prague_cong_control(struct sock *sk, const struct rate_sample *rs) static u32 prague_ssthresh(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - - prague_update_rtt_scaling(sk, tp->snd_ssthresh); return tp->snd_ssthresh; } static u32 prague_tso_segs(struct sock *sk, unsigned int mss_now) { - u32 tso_segs = max_t(u32, prague_ca(sk)->max_tso_burst, - sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); + u32 tso_segs = prague_ca(sk)->max_tso_burst; if (prague_max_tso_segs) tso_segs = min(tso_segs, prague_max_tso_segs); @@ -688,13 +625,12 @@ static size_t prague_get_info(struct sock *sk, u32 ext, int *attr, info->prague.prague_alpha = ca->upscaled_alpha >> PRAGUE_SHIFT_G; info->prague.prague_max_burst = ca->max_tso_burst; - info->prague.prague_ai_ack_increase = - READ_ONCE(ca->ai_ack_increase); info->prague.prague_round = ca->round; - info->prague.prague_rtt_transition = - ca->rtt_transition_delay; + info->prague.prague_rate_bytes = + READ_ONCE(ca->rate_bytes); + info->prague.prague_frac_cwnd = + READ_ONCE(ca->frac_cwnd); info->prague.prague_enabled = 1; - info->prague.prague_rtt_indep = ca->rtt_indep; info->prague.prague_rtt_target = prague_target_rtt(sk); } @@ -737,33 +673,26 @@ static void prague_init(struct sock *sk) /* If we have an initial RTT estimate, ensure we have an initial pacing * rate to use if net.ipv4.tcp_pace_iw is set. 
*/ - if (tp->srtt_us) - prague_update_pacing_rate(sk); - ca->alpha_stamp = tp->tcp_mstamp; ca->upscaled_alpha = PRAGUE_MAX_ALPHA << PRAGUE_SHIFT_G; - ca->cwnd_cnt = 0; - ca->loss_cwnd_cnt = 0; - ca->loss_cwnd = 0; + ca->frac_cwnd = ((u64)tp->snd_cwnd << CWND_UNIT); ca->max_tso_burst = 1; + + /* rate initialization */ + if (tp->srtt_us) { + ca->rate_bytes = div_u64(((u64)USEC_PER_SEC << 3) * tcp_mss_to_mtu(sk, tp->mss_cache) , tp->srtt_us); + ca->rate_bytes = max_t(u64, ca->rate_bytes * tp->snd_cwnd, MINIMUM_RATE); + } else { + ca->rate_bytes = MINIMUM_RATE; + } + prague_update_pacing_rate(sk); + ca->loss_rate_bytes = 0; ca->round = 0; ca->rtt_transition_delay = prague_rtt_transition; - ca->rtt_target = US2RTT(prague_rtt_target); - ca->rtt_indep = ca->rtt_target ? prague_rtt_scaling : RTT_CONTROL_NONE; - if (ca->rtt_indep >= __RTT_CONTROL_MAX) - ca->rtt_indep = RTT_CONTROL_NONE; - LOG(sk, "RTT indep chosen: %d (after %u rounds), targetting %u usec", - ca->rtt_indep, ca->rtt_transition_delay, prague_target_rtt(sk)); + ca->rtt_target = prague_rtt_target << 3; ca->saw_ce = !!tp->delivered_ce; - /* reuse existing meaurement of SRTT as an intial starting point */ - tp->g_srtt_shift = PRAGUE_MAX_SRTT_BITS; - tp->g_mdev_shift = PRAGUE_MAX_MDEV_BITS; - tp->mrtt_pace_us = tp->srtt_us >> 3; - tp->srtt_pace_us = (u64)tp->mrtt_pace_us << tp->g_srtt_shift; - tp->mdev_pace_us = 1ULL << tp->g_mdev_shift; - ca->rest_mdev_us = PRAGUE_INIT_MDEV_CARRY; - ca->rest_depth_us = PRAGUE_INIT_MDEV_CARRY >> 1; + ca->hsrtt_us = (tp->srtt_us) ? 
(tp->srtt_us << prague_hsrtt_shift) : (USEC_PER_MSEC << (prague_hsrtt_shift + 3)); tp->classic_ecn = 0ULL; tp->alpha = PRAGUE_MAX_ALPHA; /* Used ONLY to log alpha */ @@ -771,84 +700,6 @@ static void prague_init(struct sock *sk) prague_new_round(sk); } -static bool prague_target_rtt_elapsed(struct sock *sk) -{ - return RTT2US(prague_target_rtt(sk)) <= - tcp_stamp_us_delta(tcp_sk(sk)->tcp_mstamp, - prague_ca(sk)->alpha_stamp); -} - -static u64 prague_rate_scaled_ai_ack_increase(struct sock *sk, u32 rtt) -{ - u64 increase; - u64 divisor; - u64 target; - - - target = prague_target_rtt(sk); - if (rtt >= target) - return prague_unscaled_ai_ack_increase(sk); - /* Scale increase to: - * - Grow by 1MSS/target RTT - * - Take into account the rate ratio of doing cwnd += 1MSS - * - * Overflows if e2e RTT is > 100ms, hence the cap - */ - increase = (u64)rtt << CWND_UNIT; - increase *= rtt; - divisor = target * target; - increase = div64_u64(increase + (divisor >> 1), divisor); - return increase; -} - -static u64 prague_scalable_ai_ack_increase(struct sock *sk, u32 rtt) -{ - /* R0 ~= 16ms, R1 ~= 1.5ms */ - const s64 R0 = US2RTT(1 << 14), R1 = US2RTT((1 << 10) + (1 << 9)); - u64 increase; - u64 divisor; - - /* Scale increase to: - * - Ensure a growth of at least 1/8th, i.e., one mark every 8 RTT. 
- * - Take into account the rate ratio of doing cwnd += 1MSS - */ - increase = (ONE_CWND >> 3) * R0; - increase += ONE_CWND * min_t(s64, max_t(s64, rtt - R1, 0), R0); - increase *= rtt; - divisor = R0 * R0; - increase = div64_u64(increase + (divisor >> 1), divisor); - return increase; -} - -static u32 prague_dynamic_rtt_target(struct sock *sk) -{ - return prague_ca(sk)->rtt_target + US2RTT(tcp_sk(sk)->srtt_us >> 3); -} - -static struct rtt_scaling_ops -rtt_scaling_heuristics[__RTT_CONTROL_MAX] __read_mostly = { - [RTT_CONTROL_NONE] = { - .should_update_ewma = NULL, - .ai_ack_increase = NULL, - .target_rtt = NULL, - }, - [RTT_CONTROL_RATE] = { - .should_update_ewma = prague_target_rtt_elapsed, - .ai_ack_increase = prague_rate_scaled_ai_ack_increase, - .target_rtt = NULL, - }, - [RTT_CONTROL_SCALABLE] = { - .should_update_ewma = prague_target_rtt_elapsed, - .ai_ack_increase = prague_scalable_ai_ack_increase, - .target_rtt = NULL, - }, - [RTT_CONTROL_ADDITIVE] = { - .should_update_ewma = prague_target_rtt_elapsed, - .ai_ack_increase = prague_rate_scaled_ai_ack_increase, - .target_rtt = prague_dynamic_rtt_target - }, -}; - static struct tcp_congestion_ops prague __read_mostly = { .init = prague_init, .release = prague_release, @@ -856,7 +707,6 @@ static struct tcp_congestion_ops prague __read_mostly = { .cwnd_event = prague_cwnd_event, .ssthresh = prague_ssthresh, .undo_cwnd = prague_cwnd_undo, - .pkts_acked = prague_pkts_acked, .set_state = prague_state, .get_info = prague_get_info, .tso_segs = prague_tso_segs, @@ -890,9 +740,10 @@ module_init(prague_register); module_exit(prague_unregister); MODULE_AUTHOR("Olivier Tilmans "); +MODULE_AUTHOR("Chia-Yu Chang "); MODULE_AUTHOR("Koen De Schepper "); MODULE_AUTHOR("Bob briscoe "); MODULE_LICENSE("GPL v2"); MODULE_DESCRIPTION("TCP Prague"); -MODULE_VERSION("0.6"); +MODULE_VERSION("0.7"); From 0c6e07bb6567f9c399f13facf50aa0df64ee90e3 Mon Sep 17 00:00:00 2001 From: Chia-Yu Chang Date: Mon, 18 Sep 2023 11:52:46 +0200 Subject: 
[PATCH 10/47] Revert ratebased control code and update workflow --- .github/workflows/kernel.yml | 4 +- include/linux/tcp.h | 1 - include/uapi/linux/inet_diag.h | 5 +- net/ipv4/tcp.c | 1 - net/ipv4/tcp_output.c | 4 +- net/ipv4/tcp_prague.c | 619 ++++++++++++++++++++------------- 6 files changed, 391 insertions(+), 243 deletions(-) diff --git a/.github/workflows/kernel.yml b/.github/workflows/kernel.yml index 2ac6c4d375df0..168eb823cadc5 100644 --- a/.github/workflows/kernel.yml +++ b/.github/workflows/kernel.yml @@ -59,11 +59,11 @@ jobs: path: debian_build release: - name: Release build artifacts for the testing branch + name: Release build artifacts for the branch runs-on: ubuntu-20.04 needs: build permissions: write-all - if: ${{ github.ref == 'refs/heads/testing' }} + if: ${{ github.ref == 'refs/heads/testing' || github.ref == 'refs/heads/ratebase'}} steps: - name: Get artifact uses: actions/download-artifact@v3 diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 41dce7a3424b1..d20f31b53a984 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -210,7 +210,6 @@ struct tcp_sock { u32 snd_wnd; /* The window we expect to receive */ u32 max_window; /* Maximal window ever seen from peer */ u32 mss_cache; /* Cached effective mss, not including SACKS */ - bool mss_cache_set_by_ca; u32 window_clamp; /* Maximal window to advertise */ u32 rcv_ssthresh; /* Current window clamp */ diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index 2726b6cc65875..c9b87628657a9 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -236,10 +236,11 @@ struct tcp_bbr_info { struct tcp_prague_info { __u64 prague_alpha; - __u64 prague_frac_cwnd; - __u64 prague_rate_bytes; + __u64 prague_ai_ack_increase; __u32 prague_max_burst; __u32 prague_round; + __u32 prague_rtt_transition; + __u32 prague_rtt_indep; __u32 prague_rtt_target; bool prague_enabled; }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 
aefd3a9362bd1..9721d7f0db9b9 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -447,7 +447,6 @@ void tcp_init_sock(struct sock *sk) tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tp->snd_cwnd_clamp = ~0; tp->mss_cache = TCP_MSS_DEFAULT; - tp->mss_cache_set_by_ca = false; tp->reordering = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering); tcp_assign_congestion_control(sk); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index eadb0eeb4bcc0..32c347fe2ccfe 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2018,7 +2018,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) struct inet_connection_sock *icsk = inet_csk(sk); int mss_now; - if (icsk->icsk_mtup.search_high > pmtu && !tp->mss_cache_set_by_ca) + if (icsk->icsk_mtup.search_high > pmtu) icsk->icsk_mtup.search_high = pmtu; mss_now = tcp_mtu_to_mss(sk, pmtu); @@ -2048,7 +2048,7 @@ unsigned int tcp_current_mss(struct sock *sk) mss_now = tp->mss_cache; - if (dst && !tp->mss_cache_set_by_ca) { + if (dst) { u32 mtu = dst_mtu(dst); if (mtu != inet_csk(sk)->icsk_pmtu_cookie) mss_now = tcp_sync_mss(sk, mtu); diff --git a/net/ipv4/tcp_prague.c b/net/ipv4/tcp_prague.c index 5a2f1f160c301..069c7004ed9cb 100644 --- a/net/ipv4/tcp_prague.c +++ b/net/ipv4/tcp_prague.c @@ -89,19 +89,52 @@ #include #include -#define MIN_CWND_RTT 2U -#define MIN_CWND_VIRT 2U -#define MIN_MSS 150U -#define MINIMUM_RATE 12500ULL /* Minimum rate in Bytes/second: 100kbps */ -#define PRAGUE_ALPHA_BITS 24U +#define MIN_CWND 2U +#define PRAGUE_ALPHA_BITS 20U #define PRAGUE_MAX_ALPHA (1ULL << PRAGUE_ALPHA_BITS) #define CWND_UNIT 20U -#define ONE_CWND (1ULL << CWND_UNIT) +#define ONE_CWND (1LL << CWND_UNIT) /* Must be signed */ #define PRAGUE_SHIFT_G 4 /* EWMA gain g = 1/2^4 */ #define DEFAULT_RTT_TRANSITION 500 #define MAX_SCALED_RTT (100 * USEC_PER_MSEC) -#define MTU_SYS 1500UL -#define OFFSET_UNIT 7 +#define RTT_UNIT 7 +#define RTT2US(x) ((x) << RTT_UNIT) +#define US2RTT(x) ((x) >> RTT_UNIT) + +#define 
PRAGUE_MAX_SRTT_BITS 18U +#define PRAGUE_MAX_MDEV_BITS (PRAGUE_MAX_SRTT_BITS+1) +#define PRAGUE_INIT_MDEV_CARRY 741455 /* 1 << (PRAGUE_MAX_MDEV_BITS+0.5) */ +#define PRAGUE_INIT_ADJ_US 262144 /* 1 << (PRAGUE_MAX_MDEV_BITS-1) */ + +/* Weights, 1/2^x */ +#define V 1 /* 0.5 */ +#define D 1 /* 0.5 */ +#define S 2 /* 0.25 */ + +/* Store classic_ecn with same scaling as alpha */ +#define L_STICKY (16ULL << (PRAGUE_ALPHA_BITS-V)) /* Pure L4S behaviour */ +#define CLASSIC_ECN L_STICKY + \ + PRAGUE_MAX_ALPHA /* Transition between classic and L4S */ +#define C_STICKY CLASSIC_ECN + \ + L_STICKY /* Pure classic behaviour */ + +#define V0_LG (10014683ULL >> V) /* reference queue V of ~750us */ +#define D0_LG (11498458ULL >> D) /* reference queue D of ~2ms */ + +/* RTT cwnd scaling heuristics */ +enum { + /* No RTT independence */ + RTT_CONTROL_NONE = 0, + /* Flows with e2e RTT <= target RTT achieve the same throughput */ + RTT_CONTROL_RATE, + /* Trade some throughput balance at very low RTTs for a floor on the + * amount of marks/RTT */ + RTT_CONTROL_SCALABLE, + /* Behave as a flow operating with an extra target RTT */ + RTT_CONTROL_ADDITIVE, + + __RTT_CONTROL_MAX +}; static u32 prague_burst_shift __read_mostly = 12; /* 1/2^12 sec ~=.25ms */ MODULE_PARM_DESC(prague_burst_shift, @@ -112,6 +145,11 @@ static u32 prague_max_tso_segs __read_mostly = 0; MODULE_PARM_DESC(prague_max_tso_segs, "Maximum TSO/GSO segments"); module_param(prague_max_tso_segs, uint, 0644); +static u32 prague_rtt_scaling __read_mostly = RTT_CONTROL_RATE; +MODULE_PARM_DESC(prague_rtt_scaling, "Enable RTT independence through the " + "chosen RTT scaling heuristic"); +module_param(prague_rtt_scaling, uint, 0644); + static u32 prague_rtt_target __read_mostly = 25 * USEC_PER_MSEC; MODULE_PARM_DESC(prague_rtt_target, "RTT scaling target"); module_param(prague_rtt_target, uint, 0644); @@ -121,27 +159,22 @@ MODULE_PARM_DESC(prague_rtt_transition, "Amount of post-SS rounds to transition" " to be RTT independent."); 
module_param(prague_rtt_transition, uint, 0644); -static int prague_rate_offset __read_mostly = 4; /* 4/128 ~= 3% */ -MODULE_PARM_DESC(prague_rate_offset, - "Pacing rate offset in 1/128 units at each half of RTT_virt"); -module_param(prague_rate_offset, uint, 0644); - -static int prague_hsrtt_shift __read_mostly = 7; -MODULE_PARM_DESC(prague_hsrtt_shift, - "Pacing high-smoothed RTT facot as a base-2 exponent"); -module_param(prague_hsrtt_shift, uint, 0644); +static int prague_ecn_fallback __read_mostly = 0; +MODULE_PARM_DESC(prague_ecn_fallback, "0 = none, 1 = detection & fallback" + " 2 = detection"); +module_param(prague_ecn_fallback, int, 0644); struct prague { u64 cwr_stamp; u64 alpha_stamp; /* EWMA update timestamp */ u64 upscaled_alpha; /* Congestion-estimate EWMA */ u64 ai_ack_increase; /* AI increase per non-CE ACKed MSS */ - u64 hsrtt_us; - u64 frac_cwnd; /* internal fractional cwnd */ - u64 rate_bytes; /* internal pacing rate in bytes */ - u64 loss_rate_bytes; + s64 cwnd_cnt; /* cwnd update carry */ + s64 loss_cwnd_cnt; u32 loss_cwnd; u32 max_tso_burst; + u32 rest_depth_us; + u32 rest_mdev_us; u32 old_delivered; /* tp->delivered at round start */ u32 old_delivered_ce; /* tp->delivered_ce at round start */ u32 next_seq; /* tp->snd_nxt at round start */ @@ -149,9 +182,17 @@ struct prague { u32 rtt_transition_delay; u32 rtt_target; /* RTT scaling target */ u8 saw_ce:1, /* Is there an AQM on the path? 
*/ + rtt_indep:3, /* RTT independence mode */ in_loss:1; /* In cwnd reduction caused by loss */ }; +struct rtt_scaling_ops { + bool (*should_update_ewma)(struct sock *sk); + u64 (*ai_ack_increase)(struct sock *sk, u32 rtt); + u32 (*target_rtt)(struct sock *sk); +}; +static struct rtt_scaling_ops rtt_scaling_heuristics[__RTT_CONTROL_MAX]; + /* Fallback struct ops if we fail to negotiate AccECN */ static struct tcp_congestion_ops prague_reno; @@ -171,7 +212,7 @@ static void __prague_connection_id(struct sock *sk, char *str, size_t len) char __tmp[2 * (INET6_ADDRSTRLEN + 9) + 1] = {0}; \ __prague_connection_id(sk, __tmp, sizeof(__tmp)); \ /* pr_fmt expects the connection ID*/ \ - pr_info("(%s) : " fmt "\n", __tmp, ##__VA_ARGS__); \ + pr_info("(%s) : " fmt "\n", __tmp, ##__VA_ARGS__); \ } while (0) static struct prague *prague_ca(struct sock *sk) @@ -183,25 +224,19 @@ static bool prague_is_rtt_indep(struct sock *sk) { struct prague *ca = prague_ca(sk); - return !tcp_in_slow_start(tcp_sk(sk)) && + return ca->rtt_indep != RTT_CONTROL_NONE && + !tcp_in_slow_start(tcp_sk(sk)) && ca->round >= ca->rtt_transition_delay; } -static bool prague_e2e_rtt_elapsed(struct sock *sk) -{ - return !before(tcp_sk(sk)->snd_una, prague_ca(sk)->next_seq); -} - -static u32 prague_target_rtt(struct sock *sk) +static struct rtt_scaling_ops* prague_rtt_scaling_ops(struct sock *sk) { - return prague_ca(sk)->rtt_target; + return &rtt_scaling_heuristics[prague_ca(sk)->rtt_indep]; } -static bool prague_target_rtt_elapsed(struct sock *sk) +static bool prague_e2e_rtt_elapsed(struct sock *sk) { - return (prague_target_rtt(sk) >> 3) <= - tcp_stamp_us_delta(tcp_sk(sk)->tcp_mstamp, - prague_ca(sk)->alpha_stamp); + return !before(tcp_sk(sk)->snd_una, prague_ca(sk)->next_seq); } /* RTT independence on a step AQM requires the competing flows to converge to @@ -210,104 +245,21 @@ static bool prague_target_rtt_elapsed(struct sock *sk) static bool prague_should_update_ewma(struct sock *sk) { return 
prague_e2e_rtt_elapsed(sk) && - (!prague_is_rtt_indep(sk) || - prague_target_rtt_elapsed(sk)); -} - -static u64 prague_unscaled_ai_ack_increase(struct sock *sk) -{ - return 1 << CWND_UNIT; -} - -static u64 prague_rate_scaled_ai_ack_increase(struct sock *sk, u32 rtt) -{ - u64 increase; - u64 divisor; - u64 target; - - target = prague_target_rtt(sk); - if (rtt >= target) - return prague_unscaled_ai_ack_increase(sk); - - increase = (u64)rtt << CWND_UNIT; - increase *= rtt; - divisor = target * target; - increase = div64_u64(increase + (divisor >> 1), divisor); - return increase; -} - -static u64 mul_64_64_shift(u64 left, u64 right, u32 shift) -{ - u64 a0 = left & ((1ULL<<32)-1); - u64 a1 = left >> 32; - u64 b0 = right & ((1ULL<<32)-1); - u64 b1 = right >> 32; - u64 m0 = a0 * b0; - u64 m1 = a0 * b1; - u64 m2 = a1 * b0; - u64 m3 = a1 * b1; - u64 result_low; - u64 result_high; - - m2 += (m0 >> 32); - m2 += m1; - /* Overflow */ - if (m2 < m1) - m3 += (1ULL<<32); - - result_low = (m0 & ((1ULL<<32)-1)) | (m2 << 32); - result_high = m3 + (m2 >> 32); - if (shift && 64 >= shift) { - result_low = (result_low >> shift) | (result_high << (64-shift)); - result_high = (result_high >> shift); - } - return (result_high) ? 
0xffffffffffffffffULL : result_low; -} - -static u32 prague_frac_cwnd_to_snd_cwnd(struct sock *sk) -{ - struct prague *ca = prague_ca(sk); - struct tcp_sock *tp = tcp_sk(sk); - - return min_t(u32, max_t(u32, (u32)((ca->frac_cwnd + (ONE_CWND - 1)) >> CWND_UNIT), - MIN_CWND_RTT), tp->snd_cwnd_clamp); + (!prague_rtt_scaling_ops(sk)->should_update_ewma || + !prague_is_rtt_indep(sk) || + prague_rtt_scaling_ops(sk)->should_update_ewma(sk)); } -static u64 prague_virtual_rtt(struct sock *sk) -{ - return max_t(u32, prague_target_rtt(sk), tcp_sk(sk)->srtt_us); -} - -static u64 prague_pacing_rate_to_max_mtu(struct sock *sk) -{ - return div_u64(mul_64_64_shift(prague_ca(sk)->rate_bytes, prague_virtual_rtt(sk), 23) + - (MIN_CWND_VIRT - 1), MIN_CWND_VIRT); -} - -static bool prague_half_target_rtt_elapsed(struct sock *sk) -{ - return (prague_target_rtt(sk) >> (3 + 1)) <= - tcp_stamp_us_delta(tcp_sk(sk)->tcp_mstamp, - prague_ca(sk)->alpha_stamp); -} - -static u64 prague_pacing_rate_to_frac_cwnd(struct sock *sk) +static u32 prague_target_rtt(struct sock *sk) { - struct prague *ca = prague_ca(sk); - struct tcp_sock *tp = tcp_sk(sk); - u64 rtt; - u32 mtu; - - mtu = tcp_mss_to_mtu(sk, tp->mss_cache); - rtt = (prague_is_rtt_indep(sk) && (ca->hsrtt_us >> prague_hsrtt_shift)) ? - (ca->hsrtt_us >> prague_hsrtt_shift) : tp->srtt_us; - - return div_u64(mul_64_64_shift(ca->rate_bytes, rtt, 23 - CWND_UNIT) + (mtu - 1), mtu); + return prague_rtt_scaling_ops(sk)->target_rtt ? + prague_rtt_scaling_ops(sk)->target_rtt(sk) : + prague_ca(sk)->rtt_target; } -static u32 prague_valid_mtu(struct sock *sk, u32 mtu) +static u64 prague_unscaled_ai_ack_increase(struct sock *sk) { - return max_t(u32, min_t(u32, MTU_SYS, mtu), tcp_mss_to_mtu(sk, MIN_MSS)); + return 1 << CWND_UNIT; } /* RTT independence will scale the classical 1/W per ACK increase. 
*/ @@ -317,50 +269,53 @@ static void prague_ai_ack_increase(struct sock *sk) u64 increase; u32 rtt; - rtt = tcp_sk(sk)->srtt_us; + if (!prague_rtt_scaling_ops(sk)->ai_ack_increase) { + increase = prague_unscaled_ai_ack_increase(sk); + goto exit; + } + + rtt = US2RTT(tcp_sk(sk)->srtt_us >> 3); if (ca->round < ca->rtt_transition_delay || - !rtt || rtt > (MAX_SCALED_RTT << 3)) { + !rtt || rtt > MAX_SCALED_RTT) { increase = prague_unscaled_ai_ack_increase(sk); goto exit; } - increase = prague_rate_scaled_ai_ack_increase(sk, rtt); + increase = prague_rtt_scaling_ops(sk)->ai_ack_increase(sk, rtt); exit: WRITE_ONCE(ca->ai_ack_increase, increase); } +/* Ensure prague sends traffic as smoothly as possible: + * - Pacing is set to 100% during AI + * - The max GSO burst size is bounded in time at the pacing rate. + * + * We keep the 200% pacing rate during SS, as we need to send 2 MSS back to + * back for every received ACK. + */ static void prague_update_pacing_rate(struct sock *sk) { - struct prague *ca = prague_ca(sk); const struct tcp_sock *tp = tcp_sk(sk); - u64 max_inflight; - u64 rate, burst, offset; - u64 mtu; - - if (prague_is_rtt_indep(sk)) { - offset = mul_64_64_shift(prague_rate_offset, ca->rate_bytes, OFFSET_UNIT); - if (prague_half_target_rtt_elapsed(sk)) // second half - rate = ca->rate_bytes - offset; - else // first half - rate = ca->rate_bytes + offset; - } else { - mtu = tcp_mss_to_mtu(sk, tp->mss_cache); - max_inflight = ca->frac_cwnd; - rate = (u64)((u64)USEC_PER_SEC << 3) * mtu; - } + u32 max_inflight; + u64 rate, burst; + int mtu; + + mtu = tcp_mss_to_mtu(sk, tp->mss_cache); + // Must also set tcp_ecn_option=0 and tcp_ecn_unsafe_cep=1 + // to disable the option and safer heuristic... 
+ max_inflight = max(tp->snd_cwnd, tcp_packets_in_flight(tp)); + rate = (u64)((u64)USEC_PER_SEC << 3) * mtu; if (tp->snd_cwnd < tp->snd_ssthresh / 2) rate <<= 1; - - if (!prague_is_rtt_indep(sk)) { - if (likely(tp->srtt_us)) - rate = div64_u64(rate, (u64)tp->srtt_us); - rate = (rate*max_inflight) >> CWND_UNIT; - ca->rate_bytes = max_t(u64, rate, MINIMUM_RATE); - } - + if (likely(tp->srtt_us)) + rate = div64_u64(rate, tp->srtt_us); + rate *= max_inflight; rate = min_t(u64, rate, sk->sk_max_pacing_rate); + /* TODO(otilmans) rewrite the tso_segs hook to bytes to avoid this + * division. It will somehow need to be able to take hdr sizes into + * account */ burst = div_u64(rate, tcp_mss_to_mtu(sk, tp->mss_cache)); WRITE_ONCE(prague_ca(sk)->max_tso_burst, @@ -387,16 +342,69 @@ static void prague_new_round(struct sock *sk) static void prague_cwnd_changed(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + tp->snd_cwnd_stamp = tcp_jiffies32; prague_ai_ack_increase(sk); } +/* TODO(asadsa): move this detection out of prague to make it more generic. 
*/ +/* TODO(asadsa): check if self-limited works as given out in the design */ +static void prague_classic_ecn_detection(struct sock *sk) +{ + struct prague *ca = prague_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + u32 min_rtt_us = tcp_min_rtt(tp); + u32 g_srtt_shift = tp->g_srtt_shift; + u32 g_mdev_shift = tp->g_mdev_shift; + u64 srtt_us = tp->srtt_pace_us >> g_srtt_shift; + u64 mdev_us = tp->mdev_pace_us >> g_mdev_shift; + u64 depth_us; + u32 mdev_lg, depth_lg; + u32 adj_us = PRAGUE_INIT_ADJ_US >> (PRAGUE_MAX_MDEV_BITS - g_mdev_shift); + s64 new_classic_ecn = (s64)tp->classic_ecn; + + if (unlikely(!srtt_us) || unlikely(min_rtt_us == ~0U)) + return; + + /* Multiply upscaled mdev by upscaled geometric carry from the previous round + * adding upscaled adjustment to unbias the subsequent integer log + */ + mdev_us = (u64)mdev_us * ca->rest_mdev_us + adj_us; + mdev_lg = max_t(u32, ilog2(mdev_us), g_mdev_shift) - g_mdev_shift; + /* carry the new rest to the next round */ + ca->rest_mdev_us = mdev_us >> mdev_lg; + /* V*lg(mdev_us/VO) */ + mdev_lg <<= PRAGUE_ALPHA_BITS - V; + new_classic_ecn += (s64)mdev_lg - V0_LG; + + if (unlikely(srtt_us <= min_rtt_us)) + goto out; + + depth_us = (srtt_us - min_rtt_us) * ca->rest_depth_us + (adj_us >> 1); + depth_lg = max_t(u32, ilog2(depth_us), g_srtt_shift) - g_srtt_shift; + ca->rest_depth_us = depth_us >> depth_lg; + /* queue build-up can only bring classic_ecn toward more classic */ + /* + D*lg(max(d/D0, 1)) */ + depth_lg <<= PRAGUE_ALPHA_BITS - D; + if (depth_lg > D0_LG) { + new_classic_ecn += (u64)depth_lg - D0_LG; + } + + /* self-limited? 
*/ + //if (!tcp_is_cwnd_limited(sk)) + // /* - S*s */ + // new_classic_ecn -= PRAGUE_MAX_ALPHA - + // (tp->snd_cwnd_used << (PRAGUE_ALPHA_BITS-S)) / tp->snd_cwnd; + +out: + tp->classic_ecn = min_t(u64, max_t(s64, new_classic_ecn, 0), C_STICKY); +} + static void prague_update_alpha(struct sock *sk) { struct prague *ca = prague_ca(sk); struct tcp_sock *tp = tcp_sk(sk); - u64 ecn_segs, alpha, mtu, mtu_used; - u64 hsrtt; + u64 ecn_segs, alpha; /* Do not update alpha before we have proof that there's an AQM on * the path. @@ -404,6 +412,9 @@ static void prague_update_alpha(struct sock *sk) if (unlikely(!ca->saw_ce)) goto skip; + if (prague_ecn_fallback > 0) + prague_classic_ecn_detection(sk); + alpha = ca->upscaled_alpha; ecn_segs = tp->delivered_ce - ca->old_delivered_ce; /* We diverge from the original EWMA, i.e., @@ -428,18 +439,6 @@ static void prague_update_alpha(struct sock *sk) WRITE_ONCE(ca->upscaled_alpha, alpha); tp->alpha = alpha >> PRAGUE_SHIFT_G; - mtu_used = tcp_mss_to_mtu(sk, tp->mss_cache); - mtu = prague_valid_mtu(sk, prague_pacing_rate_to_max_mtu(sk)); - if (mtu_used != mtu) { - ca->frac_cwnd = div_u64(ca->frac_cwnd * mtu_used, mtu); - tp->mss_cache_set_by_ca = true; - tcp_sync_mss(sk, mtu); - tp->snd_cwnd = prague_frac_cwnd_to_snd_cwnd(sk); - } - - hsrtt = ca->hsrtt_us; - hsrtt = hsrtt - (hsrtt >> prague_hsrtt_shift) + tp->srtt_us; - WRITE_ONCE(ca->hsrtt_us, hsrtt); skip: prague_new_round(sk); } @@ -450,9 +449,6 @@ static void prague_update_cwnd(struct sock *sk, const struct rate_sample *rs) struct tcp_sock *tp = tcp_sk(sk); u64 increase; s64 acked; - u32 new_cwnd; - u64 divisor; - u64 mtu_used; acked = rs->acked_sacked; if (rs->ece_delta) { @@ -463,44 +459,38 @@ static void prague_update_cwnd(struct sock *sk, const struct rate_sample *rs) acked -= rs->ece_delta; } - if (acked <= 0 || ca->in_loss || tp->app_limited) + if (acked <= 0 || ca->in_loss || !tcp_is_cwnd_limited(sk)) goto adjust; if (tcp_in_slow_start(tp)) { acked = tcp_slow_start(tp, acked); 
- ca->frac_cwnd = ((u64)tp->snd_cwnd << CWND_UNIT); if (!acked) { prague_cwnd_changed(sk); return; } } - if (prague_is_rtt_indep(sk)) { - mtu_used = tcp_mss_to_mtu(sk, tp->mss_cache); - increase = div_u64(((u64)(acked * MTU_SYS)) << 23, prague_virtual_rtt(sk)); - divisor = mtu_used << 23; - new_cwnd = div64_u64(ca->rate_bytes * prague_virtual_rtt(sk) + divisor - 1, divisor); - if (likely(new_cwnd)) - ca->rate_bytes += div_u64(increase + (new_cwnd >> 1), new_cwnd); - ca->frac_cwnd = max_t(u64, ca->frac_cwnd + acked, prague_pacing_rate_to_frac_cwnd(sk)); - } else { - increase = acked * ca->ai_ack_increase; - new_cwnd = prague_frac_cwnd_to_snd_cwnd(sk); - if (likely(new_cwnd)) - increase = div_u64(increase + (new_cwnd >> 1), new_cwnd); - ca->frac_cwnd += max_t(u64, acked, increase); - } + increase = acked * ca->ai_ack_increase; + if (likely(tp->snd_cwnd)) + increase = div_u64(increase + (tp->snd_cwnd >> 1), + tp->snd_cwnd); + ca->cwnd_cnt += max_t(u64, acked, increase); adjust: - new_cwnd = prague_frac_cwnd_to_snd_cwnd(sk); - if (tp->snd_cwnd > new_cwnd) { - /* Step-wise cwnd decrement */ + if (ca->cwnd_cnt <= -ONE_CWND) { + ca->cwnd_cnt += ONE_CWND; --tp->snd_cwnd; + if (tp->snd_cwnd < MIN_CWND) { + tp->snd_cwnd = MIN_CWND; + /* No point in applying further reductions */ + ca->cwnd_cnt = 0; + } tp->snd_ssthresh = tp->snd_cwnd; prague_cwnd_changed(sk); - } else if (tp->snd_cwnd < new_cwnd) { - /* Step-wise cwnd increment */ + } else if (ca->cwnd_cnt >= ONE_CWND) { + ca->cwnd_cnt -= ONE_CWND; ++tp->snd_cwnd; + tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_cwnd_clamp); prague_cwnd_changed(sk); } return; @@ -517,13 +507,55 @@ static void prague_enter_loss(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); ca->loss_cwnd = tp->snd_cwnd; - ca->loss_rate_bytes = ca->rate_bytes; - ca->rate_bytes -= (ca->rate_bytes >> 1); - ca->frac_cwnd = prague_pacing_rate_to_frac_cwnd(sk); + ca->loss_cwnd_cnt = ca->cwnd_cnt; + ca->cwnd_cnt -= + (((u64)tp->snd_cwnd) << (CWND_UNIT - 1)) + 
(ca->cwnd_cnt >> 1); ca->in_loss = 1; prague_cwnd_changed(sk); } +static void prague_update_rtt_scaling(struct sock *sk, u32 ssthresh) +{ + struct prague *ca = prague_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + int delta_shift; + u8 new_g_srtt_shift; + u8 old_g_srtt_shift = tp->g_srtt_shift; + + new_g_srtt_shift = ilog2(ssthresh); + new_g_srtt_shift += (new_g_srtt_shift >> 1) + 1; + tp->g_srtt_shift = min_t(u8, new_g_srtt_shift, PRAGUE_MAX_SRTT_BITS); + tp->g_mdev_shift = tp->g_srtt_shift + 1; + delta_shift = tp->g_srtt_shift - old_g_srtt_shift; + + if (!delta_shift) + return; + + if (delta_shift > 0) { + tp->srtt_pace_us <<= delta_shift; + tp->mdev_pace_us <<= delta_shift; + ca->rest_depth_us <<= delta_shift; + ca->rest_mdev_us <<= delta_shift; + } else { + delta_shift = -delta_shift; + tp->srtt_pace_us >>= delta_shift; + tp->mdev_pace_us >>= delta_shift; + ca->rest_depth_us >>= delta_shift; + ca->rest_mdev_us >>= delta_shift; + } +} + +static u64 prague_classic_ecn_fallback(struct tcp_sock *tp, u64 alpha) +{ + u64 c = min(tp->classic_ecn, CLASSIC_ECN) - L_STICKY; + /* 0 ... 
CLASSIC_ECN/PRAGUE_MAX_ALPHA */ + c = (c >> 1) + (c >> 3); /* c * ~0.6 */ + + + /* clamp alpha no lower than c to compete fair with classic AQMs */ + return max(alpha, c); +} + static void prague_enter_cwr(struct sock *sk) { struct prague *ca = prague_ca(sk); @@ -531,28 +563,57 @@ static void prague_enter_cwr(struct sock *sk) u64 reduction; u64 alpha; - if (prague_is_rtt_indep(sk)) { - if ((prague_target_rtt(sk) >> 3) > tcp_stamp_us_delta(tp->tcp_mstamp, - ca->cwr_stamp)) - return; - ca->cwr_stamp = tp->tcp_mstamp; - alpha = ca->upscaled_alpha >> PRAGUE_SHIFT_G; + if (prague_is_rtt_indep(sk) && + RTT2US(prague_target_rtt(sk)) > tcp_stamp_us_delta(tp->tcp_mstamp, + ca->cwr_stamp)) + return; + ca->cwr_stamp = tp->tcp_mstamp; + alpha = ca->upscaled_alpha >> PRAGUE_SHIFT_G; + + if (prague_ecn_fallback == 1 && tp->classic_ecn > L_STICKY) + alpha = prague_classic_ecn_fallback(tp, alpha); + + reduction = (alpha * ((u64)tp->snd_cwnd << CWND_UNIT) + + /* Unbias the rounding by adding 1/2 */ + PRAGUE_MAX_ALPHA) >> + (PRAGUE_ALPHA_BITS + 1U); + ca->cwnd_cnt -= reduction; - reduction = mul_64_64_shift(ca->rate_bytes, alpha, PRAGUE_ALPHA_BITS + 1); - ca->rate_bytes = max_t(u64, ca->rate_bytes - reduction, MINIMUM_RATE); - ca->frac_cwnd = prague_pacing_rate_to_frac_cwnd(sk); + return; +} + +/* Calculate SRTT & SMDEV with lower gain to see past instantaneous variation. + * Also use accurate RTT measurement of last segment to do Classic ECN detection + * rather than using RFC6298 which includes delay accumulated between two + * successive segments at the receiver. Finally, we do not use this MDEV for RTO + * so initialize it to zero. We use a tweaked version of tcp_rtt_estimator(). 
+ */ +static void prague_rtt_estimator(struct sock *sk, long mrtt_us) +{ + struct tcp_sock *tp = tcp_sk(sk); + long long m = mrtt_us; /* Accurate RTT */ + u64 srtt_pace = tp->srtt_pace_us; + tp->mrtt_pace_us = mrtt_us; + + if (srtt_pace != 0) { + m -= (srtt_pace >> tp->g_srtt_shift); /* m is now error in rtt est */ + srtt_pace += m; /* rtt += 1/2^g_srtt_shift new */ + if (m < 0) + m = -m; /* m is now abs(error) */ + m -= (tp->mdev_pace_us >> tp->g_mdev_shift); + tp->mdev_pace_us += m; /* mdev += 1/2^g_mev_shift new */ } else { - ca->cwr_stamp = tp->tcp_mstamp; - alpha = ca->upscaled_alpha >> PRAGUE_SHIFT_G; - - reduction = (alpha * (ca->frac_cwnd) + - /* Unbias the rounding by adding 1/2 */ - PRAGUE_MAX_ALPHA) >> - (PRAGUE_ALPHA_BITS + 1U); - ca->frac_cwnd -= reduction; + /* no previous measure. */ + srtt_pace = m << tp->g_srtt_shift; /* take the measured time to be rtt */ + tp->mdev_pace_us = 1ULL << tp->g_mdev_shift; } + tp->srtt_pace_us = max(1ULL, srtt_pace); +} - return; +static void prague_pkts_acked(struct sock *sk, const struct ack_sample *sample) +{ + if (sample->rtt_us != -1) + prague_rtt_estimator(sk, sample->rtt_us); } static void prague_state(struct sock *sk, u8 new_state) @@ -584,8 +645,7 @@ static u32 prague_cwnd_undo(struct sock *sk) struct prague *ca = prague_ca(sk); /* We may have made some progress since then, account for it. 
*/ - ca->rate_bytes += ca->rate_bytes - ca->loss_rate_bytes; - ca->frac_cwnd = prague_pacing_rate_to_frac_cwnd(sk); + ca->cwnd_cnt += ca->cwnd_cnt - ca->loss_cwnd_cnt; return max(ca->loss_cwnd, tcp_sk(sk)->snd_cwnd); } @@ -600,12 +660,15 @@ static void prague_cong_control(struct sock *sk, const struct rate_sample *rs) static u32 prague_ssthresh(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); + + prague_update_rtt_scaling(sk, tp->snd_ssthresh); return tp->snd_ssthresh; } static u32 prague_tso_segs(struct sock *sk, unsigned int mss_now) { - u32 tso_segs = prague_ca(sk)->max_tso_burst; + u32 tso_segs = max_t(u32, prague_ca(sk)->max_tso_burst, + sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); if (prague_max_tso_segs) tso_segs = min(tso_segs, prague_max_tso_segs); @@ -625,12 +688,13 @@ static size_t prague_get_info(struct sock *sk, u32 ext, int *attr, info->prague.prague_alpha = ca->upscaled_alpha >> PRAGUE_SHIFT_G; info->prague.prague_max_burst = ca->max_tso_burst; + info->prague.prague_ai_ack_increase = + READ_ONCE(ca->ai_ack_increase); info->prague.prague_round = ca->round; - info->prague.prague_rate_bytes = - READ_ONCE(ca->rate_bytes); - info->prague.prague_frac_cwnd = - READ_ONCE(ca->frac_cwnd); + info->prague.prague_rtt_transition = + ca->rtt_transition_delay; info->prague.prague_enabled = 1; + info->prague.prague_rtt_indep = ca->rtt_indep; info->prague.prague_rtt_target = prague_target_rtt(sk); } @@ -673,26 +737,33 @@ static void prague_init(struct sock *sk) /* If we have an initial RTT estimate, ensure we have an initial pacing * rate to use if net.ipv4.tcp_pace_iw is set. 
*/ + if (tp->srtt_us) + prague_update_pacing_rate(sk); + ca->alpha_stamp = tp->tcp_mstamp; ca->upscaled_alpha = PRAGUE_MAX_ALPHA << PRAGUE_SHIFT_G; - ca->frac_cwnd = ((u64)tp->snd_cwnd << CWND_UNIT); + ca->cwnd_cnt = 0; + ca->loss_cwnd_cnt = 0; + ca->loss_cwnd = 0; ca->max_tso_burst = 1; - - /* rate initialization */ - if (tp->srtt_us) { - ca->rate_bytes = div_u64(((u64)USEC_PER_SEC << 3) * tcp_mss_to_mtu(sk, tp->mss_cache) , tp->srtt_us); - ca->rate_bytes = max_t(u64, ca->rate_bytes * tp->snd_cwnd, MINIMUM_RATE); - } else { - ca->rate_bytes = MINIMUM_RATE; - } - prague_update_pacing_rate(sk); - ca->loss_rate_bytes = 0; ca->round = 0; ca->rtt_transition_delay = prague_rtt_transition; - ca->rtt_target = prague_rtt_target << 3; + ca->rtt_target = US2RTT(prague_rtt_target); + ca->rtt_indep = ca->rtt_target ? prague_rtt_scaling : RTT_CONTROL_NONE; + if (ca->rtt_indep >= __RTT_CONTROL_MAX) + ca->rtt_indep = RTT_CONTROL_NONE; + LOG(sk, "RTT indep chosen: %d (after %u rounds), targetting %u usec", + ca->rtt_indep, ca->rtt_transition_delay, prague_target_rtt(sk)); ca->saw_ce = !!tp->delivered_ce; - ca->hsrtt_us = (tp->srtt_us) ? 
(tp->srtt_us << prague_hsrtt_shift) : (USEC_PER_MSEC << (prague_hsrtt_shift + 3)); + /* reuse existing meaurement of SRTT as an intial starting point */ + tp->g_srtt_shift = PRAGUE_MAX_SRTT_BITS; + tp->g_mdev_shift = PRAGUE_MAX_MDEV_BITS; + tp->mrtt_pace_us = tp->srtt_us >> 3; + tp->srtt_pace_us = (u64)tp->mrtt_pace_us << tp->g_srtt_shift; + tp->mdev_pace_us = 1ULL << tp->g_mdev_shift; + ca->rest_mdev_us = PRAGUE_INIT_MDEV_CARRY; + ca->rest_depth_us = PRAGUE_INIT_MDEV_CARRY >> 1; tp->classic_ecn = 0ULL; tp->alpha = PRAGUE_MAX_ALPHA; /* Used ONLY to log alpha */ @@ -700,6 +771,84 @@ static void prague_init(struct sock *sk) prague_new_round(sk); } +static bool prague_target_rtt_elapsed(struct sock *sk) +{ + return RTT2US(prague_target_rtt(sk)) <= + tcp_stamp_us_delta(tcp_sk(sk)->tcp_mstamp, + prague_ca(sk)->alpha_stamp); +} + +static u64 prague_rate_scaled_ai_ack_increase(struct sock *sk, u32 rtt) +{ + u64 increase; + u64 divisor; + u64 target; + + + target = prague_target_rtt(sk); + if (rtt >= target) + return prague_unscaled_ai_ack_increase(sk); + /* Scale increase to: + * - Grow by 1MSS/target RTT + * - Take into account the rate ratio of doing cwnd += 1MSS + * + * Overflows if e2e RTT is > 100ms, hence the cap + */ + increase = (u64)rtt << CWND_UNIT; + increase *= rtt; + divisor = target * target; + increase = div64_u64(increase + (divisor >> 1), divisor); + return increase; +} + +static u64 prague_scalable_ai_ack_increase(struct sock *sk, u32 rtt) +{ + /* R0 ~= 16ms, R1 ~= 1.5ms */ + const s64 R0 = US2RTT(1 << 14), R1 = US2RTT((1 << 10) + (1 << 9)); + u64 increase; + u64 divisor; + + /* Scale increase to: + * - Ensure a growth of at least 1/8th, i.e., one mark every 8 RTT. 
+ * - Take into account the rate ratio of doing cwnd += 1MSS + */ + increase = (ONE_CWND >> 3) * R0; + increase += ONE_CWND * min_t(s64, max_t(s64, rtt - R1, 0), R0); + increase *= rtt; + divisor = R0 * R0; + increase = div64_u64(increase + (divisor >> 1), divisor); + return increase; +} + +static u32 prague_dynamic_rtt_target(struct sock *sk) +{ + return prague_ca(sk)->rtt_target + US2RTT(tcp_sk(sk)->srtt_us >> 3); +} + +static struct rtt_scaling_ops +rtt_scaling_heuristics[__RTT_CONTROL_MAX] __read_mostly = { + [RTT_CONTROL_NONE] = { + .should_update_ewma = NULL, + .ai_ack_increase = NULL, + .target_rtt = NULL, + }, + [RTT_CONTROL_RATE] = { + .should_update_ewma = prague_target_rtt_elapsed, + .ai_ack_increase = prague_rate_scaled_ai_ack_increase, + .target_rtt = NULL, + }, + [RTT_CONTROL_SCALABLE] = { + .should_update_ewma = prague_target_rtt_elapsed, + .ai_ack_increase = prague_scalable_ai_ack_increase, + .target_rtt = NULL, + }, + [RTT_CONTROL_ADDITIVE] = { + .should_update_ewma = prague_target_rtt_elapsed, + .ai_ack_increase = prague_rate_scaled_ai_ack_increase, + .target_rtt = prague_dynamic_rtt_target + }, +}; + static struct tcp_congestion_ops prague __read_mostly = { .init = prague_init, .release = prague_release, @@ -707,6 +856,7 @@ static struct tcp_congestion_ops prague __read_mostly = { .cwnd_event = prague_cwnd_event, .ssthresh = prague_ssthresh, .undo_cwnd = prague_cwnd_undo, + .pkts_acked = prague_pkts_acked, .set_state = prague_state, .get_info = prague_get_info, .tso_segs = prague_tso_segs, @@ -740,10 +890,9 @@ module_init(prague_register); module_exit(prague_unregister); MODULE_AUTHOR("Olivier Tilmans "); -MODULE_AUTHOR("Chia-Yu Chang "); MODULE_AUTHOR("Koen De Schepper "); MODULE_AUTHOR("Bob briscoe "); MODULE_LICENSE("GPL v2"); MODULE_DESCRIPTION("TCP Prague"); -MODULE_VERSION("0.7"); +MODULE_VERSION("0.6"); From 8b0704346393c88d12cdb71741cba392a8a712ae Mon Sep 17 00:00:00 2001 From: Chia-Yu Chang Date: Mon, 18 Sep 2023 11:59:56 +0200 Subject: 
[PATCH 11/47] Commit ratebased control code --- .github/workflows/kernel.yml | 4 +- include/linux/tcp.h | 1 + include/uapi/linux/inet_diag.h | 5 +- net/ipv4/tcp.c | 1 + net/ipv4/tcp_output.c | 4 +- net/ipv4/tcp_prague.c | 619 +++++++++++++-------------------- 6 files changed, 243 insertions(+), 391 deletions(-) diff --git a/.github/workflows/kernel.yml b/.github/workflows/kernel.yml index 2ac6c4d375df0..168eb823cadc5 100644 --- a/.github/workflows/kernel.yml +++ b/.github/workflows/kernel.yml @@ -59,11 +59,11 @@ jobs: path: debian_build release: - name: Release build artifacts for the testing branch + name: Release build artifacts for the branch runs-on: ubuntu-20.04 needs: build permissions: write-all - if: ${{ github.ref == 'refs/heads/testing' }} + if: ${{ github.ref == 'refs/heads/testing' || github.ref == 'refs/heads/ratebase'}} steps: - name: Get artifact uses: actions/download-artifact@v3 diff --git a/include/linux/tcp.h b/include/linux/tcp.h index d20f31b53a984..41dce7a3424b1 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -210,6 +210,7 @@ struct tcp_sock { u32 snd_wnd; /* The window we expect to receive */ u32 max_window; /* Maximal window ever seen from peer */ u32 mss_cache; /* Cached effective mss, not including SACKS */ + bool mss_cache_set_by_ca; u32 window_clamp; /* Maximal window to advertise */ u32 rcv_ssthresh; /* Current window clamp */ diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index c9b87628657a9..2726b6cc65875 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -236,11 +236,10 @@ struct tcp_bbr_info { struct tcp_prague_info { __u64 prague_alpha; - __u64 prague_ai_ack_increase; + __u64 prague_frac_cwnd; + __u64 prague_rate_bytes; __u32 prague_max_burst; __u32 prague_round; - __u32 prague_rtt_transition; - __u32 prague_rtt_indep; __u32 prague_rtt_target; bool prague_enabled; }; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9721d7f0db9b9..aefd3a9362bd1 100644 --- 
a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -447,6 +447,7 @@ void tcp_init_sock(struct sock *sk) tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tp->snd_cwnd_clamp = ~0; tp->mss_cache = TCP_MSS_DEFAULT; + tp->mss_cache_set_by_ca = false; tp->reordering = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering); tcp_assign_congestion_control(sk); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 32c347fe2ccfe..eadb0eeb4bcc0 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2018,7 +2018,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) struct inet_connection_sock *icsk = inet_csk(sk); int mss_now; - if (icsk->icsk_mtup.search_high > pmtu) + if (icsk->icsk_mtup.search_high > pmtu && !tp->mss_cache_set_by_ca) icsk->icsk_mtup.search_high = pmtu; mss_now = tcp_mtu_to_mss(sk, pmtu); @@ -2048,7 +2048,7 @@ unsigned int tcp_current_mss(struct sock *sk) mss_now = tp->mss_cache; - if (dst) { + if (dst && !tp->mss_cache_set_by_ca) { u32 mtu = dst_mtu(dst); if (mtu != inet_csk(sk)->icsk_pmtu_cookie) mss_now = tcp_sync_mss(sk, mtu); diff --git a/net/ipv4/tcp_prague.c b/net/ipv4/tcp_prague.c index 069c7004ed9cb..5a2f1f160c301 100644 --- a/net/ipv4/tcp_prague.c +++ b/net/ipv4/tcp_prague.c @@ -89,52 +89,19 @@ #include #include -#define MIN_CWND 2U -#define PRAGUE_ALPHA_BITS 20U +#define MIN_CWND_RTT 2U +#define MIN_CWND_VIRT 2U +#define MIN_MSS 150U +#define MINIMUM_RATE 12500ULL /* Minimum rate in Bytes/second: 100kbps */ +#define PRAGUE_ALPHA_BITS 24U #define PRAGUE_MAX_ALPHA (1ULL << PRAGUE_ALPHA_BITS) #define CWND_UNIT 20U -#define ONE_CWND (1LL << CWND_UNIT) /* Must be signed */ +#define ONE_CWND (1ULL << CWND_UNIT) #define PRAGUE_SHIFT_G 4 /* EWMA gain g = 1/2^4 */ #define DEFAULT_RTT_TRANSITION 500 #define MAX_SCALED_RTT (100 * USEC_PER_MSEC) -#define RTT_UNIT 7 -#define RTT2US(x) ((x) << RTT_UNIT) -#define US2RTT(x) ((x) >> RTT_UNIT) - -#define PRAGUE_MAX_SRTT_BITS 18U -#define PRAGUE_MAX_MDEV_BITS (PRAGUE_MAX_SRTT_BITS+1) -#define 
PRAGUE_INIT_MDEV_CARRY 741455 /* 1 << (PRAGUE_MAX_MDEV_BITS+0.5) */ -#define PRAGUE_INIT_ADJ_US 262144 /* 1 << (PRAGUE_MAX_MDEV_BITS-1) */ - -/* Weights, 1/2^x */ -#define V 1 /* 0.5 */ -#define D 1 /* 0.5 */ -#define S 2 /* 0.25 */ - -/* Store classic_ecn with same scaling as alpha */ -#define L_STICKY (16ULL << (PRAGUE_ALPHA_BITS-V)) /* Pure L4S behaviour */ -#define CLASSIC_ECN L_STICKY + \ - PRAGUE_MAX_ALPHA /* Transition between classic and L4S */ -#define C_STICKY CLASSIC_ECN + \ - L_STICKY /* Pure classic behaviour */ - -#define V0_LG (10014683ULL >> V) /* reference queue V of ~750us */ -#define D0_LG (11498458ULL >> D) /* reference queue D of ~2ms */ - -/* RTT cwnd scaling heuristics */ -enum { - /* No RTT independence */ - RTT_CONTROL_NONE = 0, - /* Flows with e2e RTT <= target RTT achieve the same throughput */ - RTT_CONTROL_RATE, - /* Trade some throughput balance at very low RTTs for a floor on the - * amount of marks/RTT */ - RTT_CONTROL_SCALABLE, - /* Behave as a flow operating with an extra target RTT */ - RTT_CONTROL_ADDITIVE, - - __RTT_CONTROL_MAX -}; +#define MTU_SYS 1500UL +#define OFFSET_UNIT 7 static u32 prague_burst_shift __read_mostly = 12; /* 1/2^12 sec ~=.25ms */ MODULE_PARM_DESC(prague_burst_shift, @@ -145,11 +112,6 @@ static u32 prague_max_tso_segs __read_mostly = 0; MODULE_PARM_DESC(prague_max_tso_segs, "Maximum TSO/GSO segments"); module_param(prague_max_tso_segs, uint, 0644); -static u32 prague_rtt_scaling __read_mostly = RTT_CONTROL_RATE; -MODULE_PARM_DESC(prague_rtt_scaling, "Enable RTT independence through the " - "chosen RTT scaling heuristic"); -module_param(prague_rtt_scaling, uint, 0644); - static u32 prague_rtt_target __read_mostly = 25 * USEC_PER_MSEC; MODULE_PARM_DESC(prague_rtt_target, "RTT scaling target"); module_param(prague_rtt_target, uint, 0644); @@ -159,22 +121,27 @@ MODULE_PARM_DESC(prague_rtt_transition, "Amount of post-SS rounds to transition" " to be RTT independent."); module_param(prague_rtt_transition, uint, 
0644); -static int prague_ecn_fallback __read_mostly = 0; -MODULE_PARM_DESC(prague_ecn_fallback, "0 = none, 1 = detection & fallback" - " 2 = detection"); -module_param(prague_ecn_fallback, int, 0644); +static int prague_rate_offset __read_mostly = 4; /* 4/128 ~= 3% */ +MODULE_PARM_DESC(prague_rate_offset, + "Pacing rate offset in 1/128 units at each half of RTT_virt"); +module_param(prague_rate_offset, uint, 0644); + +static int prague_hsrtt_shift __read_mostly = 7; +MODULE_PARM_DESC(prague_hsrtt_shift, + "Pacing high-smoothed RTT facot as a base-2 exponent"); +module_param(prague_hsrtt_shift, uint, 0644); struct prague { u64 cwr_stamp; u64 alpha_stamp; /* EWMA update timestamp */ u64 upscaled_alpha; /* Congestion-estimate EWMA */ u64 ai_ack_increase; /* AI increase per non-CE ACKed MSS */ - s64 cwnd_cnt; /* cwnd update carry */ - s64 loss_cwnd_cnt; + u64 hsrtt_us; + u64 frac_cwnd; /* internal fractional cwnd */ + u64 rate_bytes; /* internal pacing rate in bytes */ + u64 loss_rate_bytes; u32 loss_cwnd; u32 max_tso_burst; - u32 rest_depth_us; - u32 rest_mdev_us; u32 old_delivered; /* tp->delivered at round start */ u32 old_delivered_ce; /* tp->delivered_ce at round start */ u32 next_seq; /* tp->snd_nxt at round start */ @@ -182,17 +149,9 @@ struct prague { u32 rtt_transition_delay; u32 rtt_target; /* RTT scaling target */ u8 saw_ce:1, /* Is there an AQM on the path? 
*/ - rtt_indep:3, /* RTT independence mode */ in_loss:1; /* In cwnd reduction caused by loss */ }; -struct rtt_scaling_ops { - bool (*should_update_ewma)(struct sock *sk); - u64 (*ai_ack_increase)(struct sock *sk, u32 rtt); - u32 (*target_rtt)(struct sock *sk); -}; -static struct rtt_scaling_ops rtt_scaling_heuristics[__RTT_CONTROL_MAX]; - /* Fallback struct ops if we fail to negotiate AccECN */ static struct tcp_congestion_ops prague_reno; @@ -212,7 +171,7 @@ static void __prague_connection_id(struct sock *sk, char *str, size_t len) char __tmp[2 * (INET6_ADDRSTRLEN + 9) + 1] = {0}; \ __prague_connection_id(sk, __tmp, sizeof(__tmp)); \ /* pr_fmt expects the connection ID*/ \ - pr_info("(%s) : " fmt "\n", __tmp, ##__VA_ARGS__); \ + pr_info("(%s) : " fmt "\n", __tmp, ##__VA_ARGS__); \ } while (0) static struct prague *prague_ca(struct sock *sk) @@ -224,19 +183,25 @@ static bool prague_is_rtt_indep(struct sock *sk) { struct prague *ca = prague_ca(sk); - return ca->rtt_indep != RTT_CONTROL_NONE && - !tcp_in_slow_start(tcp_sk(sk)) && + return !tcp_in_slow_start(tcp_sk(sk)) && ca->round >= ca->rtt_transition_delay; } -static struct rtt_scaling_ops* prague_rtt_scaling_ops(struct sock *sk) +static bool prague_e2e_rtt_elapsed(struct sock *sk) { - return &rtt_scaling_heuristics[prague_ca(sk)->rtt_indep]; + return !before(tcp_sk(sk)->snd_una, prague_ca(sk)->next_seq); } -static bool prague_e2e_rtt_elapsed(struct sock *sk) +static u32 prague_target_rtt(struct sock *sk) { - return !before(tcp_sk(sk)->snd_una, prague_ca(sk)->next_seq); + return prague_ca(sk)->rtt_target; +} + +static bool prague_target_rtt_elapsed(struct sock *sk) +{ + return (prague_target_rtt(sk) >> 3) <= + tcp_stamp_us_delta(tcp_sk(sk)->tcp_mstamp, + prague_ca(sk)->alpha_stamp); } /* RTT independence on a step AQM requires the competing flows to converge to @@ -245,21 +210,104 @@ static bool prague_e2e_rtt_elapsed(struct sock *sk) static bool prague_should_update_ewma(struct sock *sk) { return 
prague_e2e_rtt_elapsed(sk) && - (!prague_rtt_scaling_ops(sk)->should_update_ewma || - !prague_is_rtt_indep(sk) || - prague_rtt_scaling_ops(sk)->should_update_ewma(sk)); + (!prague_is_rtt_indep(sk) || + prague_target_rtt_elapsed(sk)); } -static u32 prague_target_rtt(struct sock *sk) +static u64 prague_unscaled_ai_ack_increase(struct sock *sk) { - return prague_rtt_scaling_ops(sk)->target_rtt ? - prague_rtt_scaling_ops(sk)->target_rtt(sk) : - prague_ca(sk)->rtt_target; + return 1 << CWND_UNIT; } -static u64 prague_unscaled_ai_ack_increase(struct sock *sk) +static u64 prague_rate_scaled_ai_ack_increase(struct sock *sk, u32 rtt) { - return 1 << CWND_UNIT; + u64 increase; + u64 divisor; + u64 target; + + target = prague_target_rtt(sk); + if (rtt >= target) + return prague_unscaled_ai_ack_increase(sk); + + increase = (u64)rtt << CWND_UNIT; + increase *= rtt; + divisor = target * target; + increase = div64_u64(increase + (divisor >> 1), divisor); + return increase; +} + +static u64 mul_64_64_shift(u64 left, u64 right, u32 shift) +{ + u64 a0 = left & ((1ULL<<32)-1); + u64 a1 = left >> 32; + u64 b0 = right & ((1ULL<<32)-1); + u64 b1 = right >> 32; + u64 m0 = a0 * b0; + u64 m1 = a0 * b1; + u64 m2 = a1 * b0; + u64 m3 = a1 * b1; + u64 result_low; + u64 result_high; + + m2 += (m0 >> 32); + m2 += m1; + /* Overflow */ + if (m2 < m1) + m3 += (1ULL<<32); + + result_low = (m0 & ((1ULL<<32)-1)) | (m2 << 32); + result_high = m3 + (m2 >> 32); + if (shift && 64 >= shift) { + result_low = (result_low >> shift) | (result_high << (64-shift)); + result_high = (result_high >> shift); + } + return (result_high) ? 
0xffffffffffffffffULL : result_low; +} + +static u32 prague_frac_cwnd_to_snd_cwnd(struct sock *sk) +{ + struct prague *ca = prague_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + + return min_t(u32, max_t(u32, (u32)((ca->frac_cwnd + (ONE_CWND - 1)) >> CWND_UNIT), + MIN_CWND_RTT), tp->snd_cwnd_clamp); +} + +static u64 prague_virtual_rtt(struct sock *sk) +{ + return max_t(u32, prague_target_rtt(sk), tcp_sk(sk)->srtt_us); +} + +static u64 prague_pacing_rate_to_max_mtu(struct sock *sk) +{ + return div_u64(mul_64_64_shift(prague_ca(sk)->rate_bytes, prague_virtual_rtt(sk), 23) + + (MIN_CWND_VIRT - 1), MIN_CWND_VIRT); +} + +static bool prague_half_target_rtt_elapsed(struct sock *sk) +{ + return (prague_target_rtt(sk) >> (3 + 1)) <= + tcp_stamp_us_delta(tcp_sk(sk)->tcp_mstamp, + prague_ca(sk)->alpha_stamp); +} + +static u64 prague_pacing_rate_to_frac_cwnd(struct sock *sk) +{ + struct prague *ca = prague_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + u64 rtt; + u32 mtu; + + mtu = tcp_mss_to_mtu(sk, tp->mss_cache); + rtt = (prague_is_rtt_indep(sk) && (ca->hsrtt_us >> prague_hsrtt_shift)) ? + (ca->hsrtt_us >> prague_hsrtt_shift) : tp->srtt_us; + + return div_u64(mul_64_64_shift(ca->rate_bytes, rtt, 23 - CWND_UNIT) + (mtu - 1), mtu); +} + +static u32 prague_valid_mtu(struct sock *sk, u32 mtu) +{ + return max_t(u32, min_t(u32, MTU_SYS, mtu), tcp_mss_to_mtu(sk, MIN_MSS)); } /* RTT independence will scale the classical 1/W per ACK increase. 
*/ @@ -269,53 +317,50 @@ static void prague_ai_ack_increase(struct sock *sk) u64 increase; u32 rtt; - if (!prague_rtt_scaling_ops(sk)->ai_ack_increase) { - increase = prague_unscaled_ai_ack_increase(sk); - goto exit; - } - - rtt = US2RTT(tcp_sk(sk)->srtt_us >> 3); + rtt = tcp_sk(sk)->srtt_us; if (ca->round < ca->rtt_transition_delay || - !rtt || rtt > MAX_SCALED_RTT) { + !rtt || rtt > (MAX_SCALED_RTT << 3)) { increase = prague_unscaled_ai_ack_increase(sk); goto exit; } - increase = prague_rtt_scaling_ops(sk)->ai_ack_increase(sk, rtt); + increase = prague_rate_scaled_ai_ack_increase(sk, rtt); exit: WRITE_ONCE(ca->ai_ack_increase, increase); } -/* Ensure prague sends traffic as smoothly as possible: - * - Pacing is set to 100% during AI - * - The max GSO burst size is bounded in time at the pacing rate. - * - * We keep the 200% pacing rate during SS, as we need to send 2 MSS back to - * back for every received ACK. - */ static void prague_update_pacing_rate(struct sock *sk) { + struct prague *ca = prague_ca(sk); const struct tcp_sock *tp = tcp_sk(sk); - u32 max_inflight; - u64 rate, burst; - int mtu; - - mtu = tcp_mss_to_mtu(sk, tp->mss_cache); - // Must also set tcp_ecn_option=0 and tcp_ecn_unsafe_cep=1 - // to disable the option and safer heuristic... 
- max_inflight = max(tp->snd_cwnd, tcp_packets_in_flight(tp)); + u64 max_inflight; + u64 rate, burst, offset; + u64 mtu; + + if (prague_is_rtt_indep(sk)) { + offset = mul_64_64_shift(prague_rate_offset, ca->rate_bytes, OFFSET_UNIT); + if (prague_half_target_rtt_elapsed(sk)) // second half + rate = ca->rate_bytes - offset; + else // first half + rate = ca->rate_bytes + offset; + } else { + mtu = tcp_mss_to_mtu(sk, tp->mss_cache); + max_inflight = ca->frac_cwnd; + rate = (u64)((u64)USEC_PER_SEC << 3) * mtu; + } - rate = (u64)((u64)USEC_PER_SEC << 3) * mtu; if (tp->snd_cwnd < tp->snd_ssthresh / 2) rate <<= 1; - if (likely(tp->srtt_us)) - rate = div64_u64(rate, tp->srtt_us); - rate *= max_inflight; + + if (!prague_is_rtt_indep(sk)) { + if (likely(tp->srtt_us)) + rate = div64_u64(rate, (u64)tp->srtt_us); + rate = (rate*max_inflight) >> CWND_UNIT; + ca->rate_bytes = max_t(u64, rate, MINIMUM_RATE); + } + rate = min_t(u64, rate, sk->sk_max_pacing_rate); - /* TODO(otilmans) rewrite the tso_segs hook to bytes to avoid this - * division. It will somehow need to be able to take hdr sizes into - * account */ burst = div_u64(rate, tcp_mss_to_mtu(sk, tp->mss_cache)); WRITE_ONCE(prague_ca(sk)->max_tso_burst, @@ -342,69 +387,16 @@ static void prague_new_round(struct sock *sk) static void prague_cwnd_changed(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - tp->snd_cwnd_stamp = tcp_jiffies32; prague_ai_ack_increase(sk); } -/* TODO(asadsa): move this detection out of prague to make it more generic. 
*/ -/* TODO(asadsa): check if self-limited works as given out in the design */ -static void prague_classic_ecn_detection(struct sock *sk) -{ - struct prague *ca = prague_ca(sk); - struct tcp_sock *tp = tcp_sk(sk); - u32 min_rtt_us = tcp_min_rtt(tp); - u32 g_srtt_shift = tp->g_srtt_shift; - u32 g_mdev_shift = tp->g_mdev_shift; - u64 srtt_us = tp->srtt_pace_us >> g_srtt_shift; - u64 mdev_us = tp->mdev_pace_us >> g_mdev_shift; - u64 depth_us; - u32 mdev_lg, depth_lg; - u32 adj_us = PRAGUE_INIT_ADJ_US >> (PRAGUE_MAX_MDEV_BITS - g_mdev_shift); - s64 new_classic_ecn = (s64)tp->classic_ecn; - - if (unlikely(!srtt_us) || unlikely(min_rtt_us == ~0U)) - return; - - /* Multiply upscaled mdev by upscaled geometric carry from the previous round - * adding upscaled adjustment to unbias the subsequent integer log - */ - mdev_us = (u64)mdev_us * ca->rest_mdev_us + adj_us; - mdev_lg = max_t(u32, ilog2(mdev_us), g_mdev_shift) - g_mdev_shift; - /* carry the new rest to the next round */ - ca->rest_mdev_us = mdev_us >> mdev_lg; - /* V*lg(mdev_us/VO) */ - mdev_lg <<= PRAGUE_ALPHA_BITS - V; - new_classic_ecn += (s64)mdev_lg - V0_LG; - - if (unlikely(srtt_us <= min_rtt_us)) - goto out; - - depth_us = (srtt_us - min_rtt_us) * ca->rest_depth_us + (adj_us >> 1); - depth_lg = max_t(u32, ilog2(depth_us), g_srtt_shift) - g_srtt_shift; - ca->rest_depth_us = depth_us >> depth_lg; - /* queue build-up can only bring classic_ecn toward more classic */ - /* + D*lg(max(d/D0, 1)) */ - depth_lg <<= PRAGUE_ALPHA_BITS - D; - if (depth_lg > D0_LG) { - new_classic_ecn += (u64)depth_lg - D0_LG; - } - - /* self-limited? 
*/ - //if (!tcp_is_cwnd_limited(sk)) - // /* - S*s */ - // new_classic_ecn -= PRAGUE_MAX_ALPHA - - // (tp->snd_cwnd_used << (PRAGUE_ALPHA_BITS-S)) / tp->snd_cwnd; - -out: - tp->classic_ecn = min_t(u64, max_t(s64, new_classic_ecn, 0), C_STICKY); -} - static void prague_update_alpha(struct sock *sk) { struct prague *ca = prague_ca(sk); struct tcp_sock *tp = tcp_sk(sk); - u64 ecn_segs, alpha; + u64 ecn_segs, alpha, mtu, mtu_used; + u64 hsrtt; /* Do not update alpha before we have proof that there's an AQM on * the path. @@ -412,9 +404,6 @@ static void prague_update_alpha(struct sock *sk) if (unlikely(!ca->saw_ce)) goto skip; - if (prague_ecn_fallback > 0) - prague_classic_ecn_detection(sk); - alpha = ca->upscaled_alpha; ecn_segs = tp->delivered_ce - ca->old_delivered_ce; /* We diverge from the original EWMA, i.e., @@ -439,6 +428,18 @@ static void prague_update_alpha(struct sock *sk) WRITE_ONCE(ca->upscaled_alpha, alpha); tp->alpha = alpha >> PRAGUE_SHIFT_G; + mtu_used = tcp_mss_to_mtu(sk, tp->mss_cache); + mtu = prague_valid_mtu(sk, prague_pacing_rate_to_max_mtu(sk)); + if (mtu_used != mtu) { + ca->frac_cwnd = div_u64(ca->frac_cwnd * mtu_used, mtu); + tp->mss_cache_set_by_ca = true; + tcp_sync_mss(sk, mtu); + tp->snd_cwnd = prague_frac_cwnd_to_snd_cwnd(sk); + } + + hsrtt = ca->hsrtt_us; + hsrtt = hsrtt - (hsrtt >> prague_hsrtt_shift) + tp->srtt_us; + WRITE_ONCE(ca->hsrtt_us, hsrtt); skip: prague_new_round(sk); } @@ -449,6 +450,9 @@ static void prague_update_cwnd(struct sock *sk, const struct rate_sample *rs) struct tcp_sock *tp = tcp_sk(sk); u64 increase; s64 acked; + u32 new_cwnd; + u64 divisor; + u64 mtu_used; acked = rs->acked_sacked; if (rs->ece_delta) { @@ -459,38 +463,44 @@ static void prague_update_cwnd(struct sock *sk, const struct rate_sample *rs) acked -= rs->ece_delta; } - if (acked <= 0 || ca->in_loss || !tcp_is_cwnd_limited(sk)) + if (acked <= 0 || ca->in_loss || tp->app_limited) goto adjust; if (tcp_in_slow_start(tp)) { acked = tcp_slow_start(tp, acked); 
+ ca->frac_cwnd = ((u64)tp->snd_cwnd << CWND_UNIT); if (!acked) { prague_cwnd_changed(sk); return; } } - increase = acked * ca->ai_ack_increase; - if (likely(tp->snd_cwnd)) - increase = div_u64(increase + (tp->snd_cwnd >> 1), - tp->snd_cwnd); - ca->cwnd_cnt += max_t(u64, acked, increase); + if (prague_is_rtt_indep(sk)) { + mtu_used = tcp_mss_to_mtu(sk, tp->mss_cache); + increase = div_u64(((u64)(acked * MTU_SYS)) << 23, prague_virtual_rtt(sk)); + divisor = mtu_used << 23; + new_cwnd = div64_u64(ca->rate_bytes * prague_virtual_rtt(sk) + divisor - 1, divisor); + if (likely(new_cwnd)) + ca->rate_bytes += div_u64(increase + (new_cwnd >> 1), new_cwnd); + ca->frac_cwnd = max_t(u64, ca->frac_cwnd + acked, prague_pacing_rate_to_frac_cwnd(sk)); + } else { + increase = acked * ca->ai_ack_increase; + new_cwnd = prague_frac_cwnd_to_snd_cwnd(sk); + if (likely(new_cwnd)) + increase = div_u64(increase + (new_cwnd >> 1), new_cwnd); + ca->frac_cwnd += max_t(u64, acked, increase); + } adjust: - if (ca->cwnd_cnt <= -ONE_CWND) { - ca->cwnd_cnt += ONE_CWND; + new_cwnd = prague_frac_cwnd_to_snd_cwnd(sk); + if (tp->snd_cwnd > new_cwnd) { + /* Step-wise cwnd decrement */ --tp->snd_cwnd; - if (tp->snd_cwnd < MIN_CWND) { - tp->snd_cwnd = MIN_CWND; - /* No point in applying further reductions */ - ca->cwnd_cnt = 0; - } tp->snd_ssthresh = tp->snd_cwnd; prague_cwnd_changed(sk); - } else if (ca->cwnd_cnt >= ONE_CWND) { - ca->cwnd_cnt -= ONE_CWND; + } else if (tp->snd_cwnd < new_cwnd) { + /* Step-wise cwnd increment */ ++tp->snd_cwnd; - tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_cwnd_clamp); prague_cwnd_changed(sk); } return; @@ -507,55 +517,13 @@ static void prague_enter_loss(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); ca->loss_cwnd = tp->snd_cwnd; - ca->loss_cwnd_cnt = ca->cwnd_cnt; - ca->cwnd_cnt -= - (((u64)tp->snd_cwnd) << (CWND_UNIT - 1)) + (ca->cwnd_cnt >> 1); + ca->loss_rate_bytes = ca->rate_bytes; + ca->rate_bytes -= (ca->rate_bytes >> 1); + ca->frac_cwnd = 
prague_pacing_rate_to_frac_cwnd(sk); ca->in_loss = 1; prague_cwnd_changed(sk); } -static void prague_update_rtt_scaling(struct sock *sk, u32 ssthresh) -{ - struct prague *ca = prague_ca(sk); - struct tcp_sock *tp = tcp_sk(sk); - int delta_shift; - u8 new_g_srtt_shift; - u8 old_g_srtt_shift = tp->g_srtt_shift; - - new_g_srtt_shift = ilog2(ssthresh); - new_g_srtt_shift += (new_g_srtt_shift >> 1) + 1; - tp->g_srtt_shift = min_t(u8, new_g_srtt_shift, PRAGUE_MAX_SRTT_BITS); - tp->g_mdev_shift = tp->g_srtt_shift + 1; - delta_shift = tp->g_srtt_shift - old_g_srtt_shift; - - if (!delta_shift) - return; - - if (delta_shift > 0) { - tp->srtt_pace_us <<= delta_shift; - tp->mdev_pace_us <<= delta_shift; - ca->rest_depth_us <<= delta_shift; - ca->rest_mdev_us <<= delta_shift; - } else { - delta_shift = -delta_shift; - tp->srtt_pace_us >>= delta_shift; - tp->mdev_pace_us >>= delta_shift; - ca->rest_depth_us >>= delta_shift; - ca->rest_mdev_us >>= delta_shift; - } -} - -static u64 prague_classic_ecn_fallback(struct tcp_sock *tp, u64 alpha) -{ - u64 c = min(tp->classic_ecn, CLASSIC_ECN) - L_STICKY; - /* 0 ... 
CLASSIC_ECN/PRAGUE_MAX_ALPHA */ - c = (c >> 1) + (c >> 3); /* c * ~0.6 */ - - - /* clamp alpha no lower than c to compete fair with classic AQMs */ - return max(alpha, c); -} - static void prague_enter_cwr(struct sock *sk) { struct prague *ca = prague_ca(sk); @@ -563,57 +531,28 @@ static void prague_enter_cwr(struct sock *sk) u64 reduction; u64 alpha; - if (prague_is_rtt_indep(sk) && - RTT2US(prague_target_rtt(sk)) > tcp_stamp_us_delta(tp->tcp_mstamp, - ca->cwr_stamp)) - return; - ca->cwr_stamp = tp->tcp_mstamp; - alpha = ca->upscaled_alpha >> PRAGUE_SHIFT_G; - - if (prague_ecn_fallback == 1 && tp->classic_ecn > L_STICKY) - alpha = prague_classic_ecn_fallback(tp, alpha); - - reduction = (alpha * ((u64)tp->snd_cwnd << CWND_UNIT) + - /* Unbias the rounding by adding 1/2 */ - PRAGUE_MAX_ALPHA) >> - (PRAGUE_ALPHA_BITS + 1U); - ca->cwnd_cnt -= reduction; - - return; -} + if (prague_is_rtt_indep(sk)) { + if ((prague_target_rtt(sk) >> 3) > tcp_stamp_us_delta(tp->tcp_mstamp, + ca->cwr_stamp)) + return; + ca->cwr_stamp = tp->tcp_mstamp; + alpha = ca->upscaled_alpha >> PRAGUE_SHIFT_G; -/* Calculate SRTT & SMDEV with lower gain to see past instantaneous variation. - * Also use accurate RTT measurement of last segment to do Classic ECN detection - * rather than using RFC6298 which includes delay accumulated between two - * successive segments at the receiver. Finally, we do not use this MDEV for RTO - * so initialize it to zero. We use a tweaked version of tcp_rtt_estimator(). 
- */ -static void prague_rtt_estimator(struct sock *sk, long mrtt_us) -{ - struct tcp_sock *tp = tcp_sk(sk); - long long m = mrtt_us; /* Accurate RTT */ - u64 srtt_pace = tp->srtt_pace_us; - tp->mrtt_pace_us = mrtt_us; - - if (srtt_pace != 0) { - m -= (srtt_pace >> tp->g_srtt_shift); /* m is now error in rtt est */ - srtt_pace += m; /* rtt += 1/2^g_srtt_shift new */ - if (m < 0) - m = -m; /* m is now abs(error) */ - m -= (tp->mdev_pace_us >> tp->g_mdev_shift); - tp->mdev_pace_us += m; /* mdev += 1/2^g_mev_shift new */ + reduction = mul_64_64_shift(ca->rate_bytes, alpha, PRAGUE_ALPHA_BITS + 1); + ca->rate_bytes = max_t(u64, ca->rate_bytes - reduction, MINIMUM_RATE); + ca->frac_cwnd = prague_pacing_rate_to_frac_cwnd(sk); } else { - /* no previous measure. */ - srtt_pace = m << tp->g_srtt_shift; /* take the measured time to be rtt */ - tp->mdev_pace_us = 1ULL << tp->g_mdev_shift; + ca->cwr_stamp = tp->tcp_mstamp; + alpha = ca->upscaled_alpha >> PRAGUE_SHIFT_G; + + reduction = (alpha * (ca->frac_cwnd) + + /* Unbias the rounding by adding 1/2 */ + PRAGUE_MAX_ALPHA) >> + (PRAGUE_ALPHA_BITS + 1U); + ca->frac_cwnd -= reduction; } - tp->srtt_pace_us = max(1ULL, srtt_pace); -} -static void prague_pkts_acked(struct sock *sk, const struct ack_sample *sample) -{ - if (sample->rtt_us != -1) - prague_rtt_estimator(sk, sample->rtt_us); + return; } static void prague_state(struct sock *sk, u8 new_state) @@ -645,7 +584,8 @@ static u32 prague_cwnd_undo(struct sock *sk) struct prague *ca = prague_ca(sk); /* We may have made some progress since then, account for it. 
*/ - ca->cwnd_cnt += ca->cwnd_cnt - ca->loss_cwnd_cnt; + ca->rate_bytes += ca->rate_bytes - ca->loss_rate_bytes; + ca->frac_cwnd = prague_pacing_rate_to_frac_cwnd(sk); return max(ca->loss_cwnd, tcp_sk(sk)->snd_cwnd); } @@ -660,15 +600,12 @@ static void prague_cong_control(struct sock *sk, const struct rate_sample *rs) static u32 prague_ssthresh(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - - prague_update_rtt_scaling(sk, tp->snd_ssthresh); return tp->snd_ssthresh; } static u32 prague_tso_segs(struct sock *sk, unsigned int mss_now) { - u32 tso_segs = max_t(u32, prague_ca(sk)->max_tso_burst, - sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); + u32 tso_segs = prague_ca(sk)->max_tso_burst; if (prague_max_tso_segs) tso_segs = min(tso_segs, prague_max_tso_segs); @@ -688,13 +625,12 @@ static size_t prague_get_info(struct sock *sk, u32 ext, int *attr, info->prague.prague_alpha = ca->upscaled_alpha >> PRAGUE_SHIFT_G; info->prague.prague_max_burst = ca->max_tso_burst; - info->prague.prague_ai_ack_increase = - READ_ONCE(ca->ai_ack_increase); info->prague.prague_round = ca->round; - info->prague.prague_rtt_transition = - ca->rtt_transition_delay; + info->prague.prague_rate_bytes = + READ_ONCE(ca->rate_bytes); + info->prague.prague_frac_cwnd = + READ_ONCE(ca->frac_cwnd); info->prague.prague_enabled = 1; - info->prague.prague_rtt_indep = ca->rtt_indep; info->prague.prague_rtt_target = prague_target_rtt(sk); } @@ -737,33 +673,26 @@ static void prague_init(struct sock *sk) /* If we have an initial RTT estimate, ensure we have an initial pacing * rate to use if net.ipv4.tcp_pace_iw is set. 
*/ - if (tp->srtt_us) - prague_update_pacing_rate(sk); - ca->alpha_stamp = tp->tcp_mstamp; ca->upscaled_alpha = PRAGUE_MAX_ALPHA << PRAGUE_SHIFT_G; - ca->cwnd_cnt = 0; - ca->loss_cwnd_cnt = 0; - ca->loss_cwnd = 0; + ca->frac_cwnd = ((u64)tp->snd_cwnd << CWND_UNIT); ca->max_tso_burst = 1; + + /* rate initialization */ + if (tp->srtt_us) { + ca->rate_bytes = div_u64(((u64)USEC_PER_SEC << 3) * tcp_mss_to_mtu(sk, tp->mss_cache) , tp->srtt_us); + ca->rate_bytes = max_t(u64, ca->rate_bytes * tp->snd_cwnd, MINIMUM_RATE); + } else { + ca->rate_bytes = MINIMUM_RATE; + } + prague_update_pacing_rate(sk); + ca->loss_rate_bytes = 0; ca->round = 0; ca->rtt_transition_delay = prague_rtt_transition; - ca->rtt_target = US2RTT(prague_rtt_target); - ca->rtt_indep = ca->rtt_target ? prague_rtt_scaling : RTT_CONTROL_NONE; - if (ca->rtt_indep >= __RTT_CONTROL_MAX) - ca->rtt_indep = RTT_CONTROL_NONE; - LOG(sk, "RTT indep chosen: %d (after %u rounds), targetting %u usec", - ca->rtt_indep, ca->rtt_transition_delay, prague_target_rtt(sk)); + ca->rtt_target = prague_rtt_target << 3; ca->saw_ce = !!tp->delivered_ce; - /* reuse existing meaurement of SRTT as an intial starting point */ - tp->g_srtt_shift = PRAGUE_MAX_SRTT_BITS; - tp->g_mdev_shift = PRAGUE_MAX_MDEV_BITS; - tp->mrtt_pace_us = tp->srtt_us >> 3; - tp->srtt_pace_us = (u64)tp->mrtt_pace_us << tp->g_srtt_shift; - tp->mdev_pace_us = 1ULL << tp->g_mdev_shift; - ca->rest_mdev_us = PRAGUE_INIT_MDEV_CARRY; - ca->rest_depth_us = PRAGUE_INIT_MDEV_CARRY >> 1; + ca->hsrtt_us = (tp->srtt_us) ? 
(tp->srtt_us << prague_hsrtt_shift) : (USEC_PER_MSEC << (prague_hsrtt_shift + 3)); tp->classic_ecn = 0ULL; tp->alpha = PRAGUE_MAX_ALPHA; /* Used ONLY to log alpha */ @@ -771,84 +700,6 @@ static void prague_init(struct sock *sk) prague_new_round(sk); } -static bool prague_target_rtt_elapsed(struct sock *sk) -{ - return RTT2US(prague_target_rtt(sk)) <= - tcp_stamp_us_delta(tcp_sk(sk)->tcp_mstamp, - prague_ca(sk)->alpha_stamp); -} - -static u64 prague_rate_scaled_ai_ack_increase(struct sock *sk, u32 rtt) -{ - u64 increase; - u64 divisor; - u64 target; - - - target = prague_target_rtt(sk); - if (rtt >= target) - return prague_unscaled_ai_ack_increase(sk); - /* Scale increase to: - * - Grow by 1MSS/target RTT - * - Take into account the rate ratio of doing cwnd += 1MSS - * - * Overflows if e2e RTT is > 100ms, hence the cap - */ - increase = (u64)rtt << CWND_UNIT; - increase *= rtt; - divisor = target * target; - increase = div64_u64(increase + (divisor >> 1), divisor); - return increase; -} - -static u64 prague_scalable_ai_ack_increase(struct sock *sk, u32 rtt) -{ - /* R0 ~= 16ms, R1 ~= 1.5ms */ - const s64 R0 = US2RTT(1 << 14), R1 = US2RTT((1 << 10) + (1 << 9)); - u64 increase; - u64 divisor; - - /* Scale increase to: - * - Ensure a growth of at least 1/8th, i.e., one mark every 8 RTT. 
- * - Take into account the rate ratio of doing cwnd += 1MSS - */ - increase = (ONE_CWND >> 3) * R0; - increase += ONE_CWND * min_t(s64, max_t(s64, rtt - R1, 0), R0); - increase *= rtt; - divisor = R0 * R0; - increase = div64_u64(increase + (divisor >> 1), divisor); - return increase; -} - -static u32 prague_dynamic_rtt_target(struct sock *sk) -{ - return prague_ca(sk)->rtt_target + US2RTT(tcp_sk(sk)->srtt_us >> 3); -} - -static struct rtt_scaling_ops -rtt_scaling_heuristics[__RTT_CONTROL_MAX] __read_mostly = { - [RTT_CONTROL_NONE] = { - .should_update_ewma = NULL, - .ai_ack_increase = NULL, - .target_rtt = NULL, - }, - [RTT_CONTROL_RATE] = { - .should_update_ewma = prague_target_rtt_elapsed, - .ai_ack_increase = prague_rate_scaled_ai_ack_increase, - .target_rtt = NULL, - }, - [RTT_CONTROL_SCALABLE] = { - .should_update_ewma = prague_target_rtt_elapsed, - .ai_ack_increase = prague_scalable_ai_ack_increase, - .target_rtt = NULL, - }, - [RTT_CONTROL_ADDITIVE] = { - .should_update_ewma = prague_target_rtt_elapsed, - .ai_ack_increase = prague_rate_scaled_ai_ack_increase, - .target_rtt = prague_dynamic_rtt_target - }, -}; - static struct tcp_congestion_ops prague __read_mostly = { .init = prague_init, .release = prague_release, @@ -856,7 +707,6 @@ static struct tcp_congestion_ops prague __read_mostly = { .cwnd_event = prague_cwnd_event, .ssthresh = prague_ssthresh, .undo_cwnd = prague_cwnd_undo, - .pkts_acked = prague_pkts_acked, .set_state = prague_state, .get_info = prague_get_info, .tso_segs = prague_tso_segs, @@ -890,9 +740,10 @@ module_init(prague_register); module_exit(prague_unregister); MODULE_AUTHOR("Olivier Tilmans "); +MODULE_AUTHOR("Chia-Yu Chang "); MODULE_AUTHOR("Koen De Schepper "); MODULE_AUTHOR("Bob briscoe "); MODULE_LICENSE("GPL v2"); MODULE_DESCRIPTION("TCP Prague"); -MODULE_VERSION("0.6"); +MODULE_VERSION("0.7"); From 60b07a24636a08448eb5d99a181d72bbf37b6800 Mon Sep 17 00:00:00 2001 From: Chia-Yu Chang Date: Fri, 22 Sep 2023 14:36:48 +0200 Subject: 
[PATCH 12/47] Update the control part of ratebase code --- net/ipv4/tcp_prague.c | 47 ++++++++++++++++++++----------------------- 1 file changed, 22 insertions(+), 25 deletions(-) diff --git a/net/ipv4/tcp_prague.c b/net/ipv4/tcp_prague.c index 5a2f1f160c301..ef078f6311a05 100644 --- a/net/ipv4/tcp_prague.c +++ b/net/ipv4/tcp_prague.c @@ -101,7 +101,9 @@ #define DEFAULT_RTT_TRANSITION 500 #define MAX_SCALED_RTT (100 * USEC_PER_MSEC) #define MTU_SYS 1500UL +#define RATE_OFFSET 4 #define OFFSET_UNIT 7 +#define HSRTT_SHIFT 7 static u32 prague_burst_shift __read_mostly = 12; /* 1/2^12 sec ~=.25ms */ MODULE_PARM_DESC(prague_burst_shift, @@ -126,17 +128,13 @@ MODULE_PARM_DESC(prague_rate_offset, "Pacing rate offset in 1/128 units at each half of RTT_virt"); module_param(prague_rate_offset, uint, 0644); -static int prague_hsrtt_shift __read_mostly = 7; -MODULE_PARM_DESC(prague_hsrtt_shift, - "Pacing high-smoothed RTT facot as a base-2 exponent"); -module_param(prague_hsrtt_shift, uint, 0644); - struct prague { u64 cwr_stamp; u64 alpha_stamp; /* EWMA update timestamp */ u64 upscaled_alpha; /* Congestion-estimate EWMA */ u64 ai_ack_increase; /* AI increase per non-CE ACKed MSS */ u64 hsrtt_us; + u32 rate_offset; u64 frac_cwnd; /* internal fractional cwnd */ u64 rate_bytes; /* internal pacing rate in bytes */ u64 loss_rate_bytes; @@ -284,9 +282,9 @@ static u64 prague_pacing_rate_to_max_mtu(struct sock *sk) (MIN_CWND_VIRT - 1), MIN_CWND_VIRT); } -static bool prague_half_target_rtt_elapsed(struct sock *sk) +static bool prague_half_virtual_rtt_elapsed(struct sock *sk) { - return (prague_target_rtt(sk) >> (3 + 1)) <= + return (prague_virtual_rtt(sk) >> (3 + 1)) <= tcp_stamp_us_delta(tcp_sk(sk)->tcp_mstamp, prague_ca(sk)->alpha_stamp); } @@ -296,18 +294,17 @@ static u64 prague_pacing_rate_to_frac_cwnd(struct sock *sk) struct prague *ca = prague_ca(sk); struct tcp_sock *tp = tcp_sk(sk); u64 rtt; - u32 mtu; + u64 mtu; mtu = tcp_mss_to_mtu(sk, tp->mss_cache); - rtt = 
(prague_is_rtt_indep(sk) && (ca->hsrtt_us >> prague_hsrtt_shift)) ? - (ca->hsrtt_us >> prague_hsrtt_shift) : tp->srtt_us; + rtt = (ca->hsrtt_us >> HSRTT_SHIFT) ? (ca->hsrtt_us >> HSRTT_SHIFT) : tp->srtt_us; return div_u64(mul_64_64_shift(ca->rate_bytes, rtt, 23 - CWND_UNIT) + (mtu - 1), mtu); } static u32 prague_valid_mtu(struct sock *sk, u32 mtu) { - return max_t(u32, min_t(u32, MTU_SYS, mtu), tcp_mss_to_mtu(sk, MIN_MSS)); + return max_t(u32, min_t(u32, inet_csk(sk)->icsk_pmtu_cookie, mtu), tcp_mss_to_mtu(sk, MIN_MSS)); } /* RTT independence will scale the classical 1/W per ACK increase. */ @@ -339,14 +336,14 @@ static void prague_update_pacing_rate(struct sock *sk) u64 mtu; if (prague_is_rtt_indep(sk)) { - offset = mul_64_64_shift(prague_rate_offset, ca->rate_bytes, OFFSET_UNIT); - if (prague_half_target_rtt_elapsed(sk)) // second half + offset = mul_64_64_shift(ca->rate_offset, ca->rate_bytes, OFFSET_UNIT); + if (prague_half_virtual_rtt_elapsed(sk)) // second half rate = ca->rate_bytes - offset; else // first half rate = ca->rate_bytes + offset; } else { mtu = tcp_mss_to_mtu(sk, tp->mss_cache); - max_inflight = ca->frac_cwnd; + max_inflight = max(tp->snd_cwnd, tcp_packets_in_flight(tp)); rate = (u64)((u64)USEC_PER_SEC << 3) * mtu; } @@ -356,8 +353,8 @@ static void prague_update_pacing_rate(struct sock *sk) if (!prague_is_rtt_indep(sk)) { if (likely(tp->srtt_us)) rate = div64_u64(rate, (u64)tp->srtt_us); - rate = (rate*max_inflight) >> CWND_UNIT; - ca->rate_bytes = max_t(u64, rate, MINIMUM_RATE); + rate = rate*max_inflight; + ca->rate_bytes = rate; } rate = min_t(u64, rate, sk->sk_max_pacing_rate); @@ -396,7 +393,6 @@ static void prague_update_alpha(struct sock *sk) struct prague *ca = prague_ca(sk); struct tcp_sock *tp = tcp_sk(sk); u64 ecn_segs, alpha, mtu, mtu_used; - u64 hsrtt; /* Do not update alpha before we have proof that there's an AQM on * the path. 
@@ -437,9 +433,7 @@ static void prague_update_alpha(struct sock *sk) tp->snd_cwnd = prague_frac_cwnd_to_snd_cwnd(sk); } - hsrtt = ca->hsrtt_us; - hsrtt = hsrtt - (hsrtt >> prague_hsrtt_shift) + tp->srtt_us; - WRITE_ONCE(ca->hsrtt_us, hsrtt); + ca->hsrtt_us += tp->srtt_us - (ca->hsrtt_us >> HSRTT_SHIFT); skip: prague_new_round(sk); } @@ -485,7 +479,7 @@ static void prague_update_cwnd(struct sock *sk, const struct rate_sample *rs) ca->frac_cwnd = max_t(u64, ca->frac_cwnd + acked, prague_pacing_rate_to_frac_cwnd(sk)); } else { increase = acked * ca->ai_ack_increase; - new_cwnd = prague_frac_cwnd_to_snd_cwnd(sk); + new_cwnd = tp->snd_cwnd; if (likely(new_cwnd)) increase = div_u64(increase + (new_cwnd >> 1), new_cwnd); ca->frac_cwnd += max_t(u64, acked, increase); @@ -519,9 +513,9 @@ static void prague_enter_loss(struct sock *sk) ca->loss_cwnd = tp->snd_cwnd; ca->loss_rate_bytes = ca->rate_bytes; ca->rate_bytes -= (ca->rate_bytes >> 1); + //ca->rate_bytes = mul_64_64_shift(717, ca->rate_bytes, 10); ca->frac_cwnd = prague_pacing_rate_to_frac_cwnd(sk); ca->in_loss = 1; - prague_cwnd_changed(sk); } static void prague_enter_cwr(struct sock *sk) @@ -584,7 +578,9 @@ static u32 prague_cwnd_undo(struct sock *sk) struct prague *ca = prague_ca(sk); /* We may have made some progress since then, account for it. */ - ca->rate_bytes += ca->rate_bytes - ca->loss_rate_bytes; + ca->in_loss = 0; + ca->rate_bytes = max(ca->rate_bytes, ca->loss_rate_bytes); + //ca->rate_bytes += ca->rate_bytes - ca->loss_rate_bytes; ca->frac_cwnd = prague_pacing_rate_to_frac_cwnd(sk); return max(ca->loss_cwnd, tcp_sk(sk)->snd_cwnd); } @@ -692,7 +688,8 @@ static void prague_init(struct sock *sk) ca->rtt_target = prague_rtt_target << 3; ca->saw_ce = !!tp->delivered_ce; - ca->hsrtt_us = (tp->srtt_us) ? (tp->srtt_us << prague_hsrtt_shift) : (USEC_PER_MSEC << (prague_hsrtt_shift + 3)); + ca->hsrtt_us = (tp->srtt_us) ? 
(tp->srtt_us << HSRTT_SHIFT) : (USEC_PER_MSEC << (HSRTT_SHIFT + 3)); + ca->rate_offset = (prague_rate_offset && prage_rate_offset < ((1 << OFFSET_UNIT) -1)) ? prague_rate_offset : RATE_OFFSET ; tp->classic_ecn = 0ULL; tp->alpha = PRAGUE_MAX_ALPHA; /* Used ONLY to log alpha */ @@ -739,8 +736,8 @@ static void __exit prague_unregister(void) module_init(prague_register); module_exit(prague_unregister); -MODULE_AUTHOR("Olivier Tilmans "); MODULE_AUTHOR("Chia-Yu Chang "); +MODULE_AUTHOR("Olivier Tilmans "); MODULE_AUTHOR("Koen De Schepper "); MODULE_AUTHOR("Bob briscoe "); From eab30d09ebc3ad777e4a0422cb3d5ebcf2b6db72 Mon Sep 17 00:00:00 2001 From: Chia-Yu Chang Date: Fri, 22 Sep 2023 16:12:43 +0200 Subject: [PATCH 13/47] Fix one typo --- net/ipv4/tcp_prague.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/tcp_prague.c b/net/ipv4/tcp_prague.c index ef078f6311a05..35ec859e04293 100644 --- a/net/ipv4/tcp_prague.c +++ b/net/ipv4/tcp_prague.c @@ -689,7 +689,7 @@ static void prague_init(struct sock *sk) ca->saw_ce = !!tp->delivered_ce; ca->hsrtt_us = (tp->srtt_us) ? (tp->srtt_us << HSRTT_SHIFT) : (USEC_PER_MSEC << (HSRTT_SHIFT + 3)); - ca->rate_offset = (prague_rate_offset && prage_rate_offset < ((1 << OFFSET_UNIT) -1)) ? prague_rate_offset : RATE_OFFSET ; + ca->rate_offset = (prague_rate_offset && prague_rate_offset < ((1 << OFFSET_UNIT) -1)) ? 
prague_rate_offset : RATE_OFFSET ; tp->classic_ecn = 0ULL; tp->alpha = PRAGUE_MAX_ALPHA; /* Used ONLY to log alpha */ From 8f15b59b3c0d48ee35741959f64d140c18ea4b2f Mon Sep 17 00:00:00 2001 From: Chia-Yu Chang Date: Sat, 23 Sep 2023 13:19:07 +0200 Subject: [PATCH 14/47] Use dst_mtu to replace mtu_cookie --- net/ipv4/tcp_prague.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp_prague.c b/net/ipv4/tcp_prague.c index 35ec859e04293..cf67690842660 100644 --- a/net/ipv4/tcp_prague.c +++ b/net/ipv4/tcp_prague.c @@ -304,7 +304,8 @@ static u64 prague_pacing_rate_to_frac_cwnd(struct sock *sk) static u32 prague_valid_mtu(struct sock *sk, u32 mtu) { - return max_t(u32, min_t(u32, inet_csk(sk)->icsk_pmtu_cookie, mtu), tcp_mss_to_mtu(sk, MIN_MSS)); + u32 sk_dst_mtu = dst_mtu(__sk_dst_get(sk)); + return max_t(u32, min_t(u32, sk_dst_mtu, mtu), tcp_mss_to_mtu(sk, MIN_MSS)); } /* RTT independence will scale the classical 1/W per ACK increase. */ From b748e186af1c5c9a3a866d1b7a919b0596bf250e Mon Sep 17 00:00:00 2001 From: Chia-Yu Chang Date: Sun, 24 Sep 2023 16:12:52 +0200 Subject: [PATCH 15/47] Update with mtu_cache --- net/ipv4/tcp_prague.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/net/ipv4/tcp_prague.c b/net/ipv4/tcp_prague.c index cf67690842660..ac641ec6874f2 100644 --- a/net/ipv4/tcp_prague.c +++ b/net/ipv4/tcp_prague.c @@ -133,6 +133,7 @@ struct prague { u64 alpha_stamp; /* EWMA update timestamp */ u64 upscaled_alpha; /* Congestion-estimate EWMA */ u64 ai_ack_increase; /* AI increase per non-CE ACKed MSS */ + u32 mtu_cache; u64 hsrtt_us; u32 rate_offset; u64 frac_cwnd; /* internal fractional cwnd */ @@ -304,8 +305,7 @@ static u64 prague_pacing_rate_to_frac_cwnd(struct sock *sk) static u32 prague_valid_mtu(struct sock *sk, u32 mtu) { - u32 sk_dst_mtu = dst_mtu(__sk_dst_get(sk)); - return max_t(u32, min_t(u32, sk_dst_mtu, mtu), tcp_mss_to_mtu(sk, MIN_MSS)); + return max_t(u32, min_t(u32, 
prague_ca(sk)->mtu_cache, mtu), tcp_mss_to_mtu(sk, MIN_MSS)); } /* RTT independence will scale the classical 1/W per ACK increase. */ @@ -425,13 +425,15 @@ static void prague_update_alpha(struct sock *sk) WRITE_ONCE(ca->upscaled_alpha, alpha); tp->alpha = alpha >> PRAGUE_SHIFT_G; - mtu_used = tcp_mss_to_mtu(sk, tp->mss_cache); - mtu = prague_valid_mtu(sk, prague_pacing_rate_to_max_mtu(sk)); - if (mtu_used != mtu) { - ca->frac_cwnd = div_u64(ca->frac_cwnd * mtu_used, mtu); - tp->mss_cache_set_by_ca = true; - tcp_sync_mss(sk, mtu); - tp->snd_cwnd = prague_frac_cwnd_to_snd_cwnd(sk); + if (prague_is_rtt_indep(sk) && !ca->in_loss) { + mtu_used = tcp_mss_to_mtu(sk, tp->mss_cache); + mtu = prague_valid_mtu(sk, prague_pacing_rate_to_max_mtu(sk)); + if (mtu_used != mtu) { + ca->frac_cwnd = div_u64(ca->frac_cwnd * mtu_used, mtu); + tp->mss_cache_set_by_ca = true; + tcp_sync_mss(sk, mtu); + tp->snd_cwnd = prague_frac_cwnd_to_snd_cwnd(sk); + } } ca->hsrtt_us += tp->srtt_us - (ca->hsrtt_us >> HSRTT_SHIFT); @@ -689,6 +691,7 @@ static void prague_init(struct sock *sk) ca->rtt_target = prague_rtt_target << 3; ca->saw_ce = !!tp->delivered_ce; + ca->mtu_cache = tcp_mss_to_mtu(sk, tp->mss_cache); ca->hsrtt_us = (tp->srtt_us) ? (tp->srtt_us << HSRTT_SHIFT) : (USEC_PER_MSEC << (HSRTT_SHIFT + 3)); ca->rate_offset = (prague_rate_offset && prague_rate_offset < ((1 << OFFSET_UNIT) -1)) ? 
prague_rate_offset : RATE_OFFSET ; From fbd66a9cbe61ac63e95a3c671de55a99d288a88d Mon Sep 17 00:00:00 2001 From: Chia-Yu Chang Date: Mon, 23 Oct 2023 05:19:39 +0200 Subject: [PATCH 16/47] Revert some changes --- net/ipv4/tcp_prague.c | 1198 +++++++++++++++++++++++++---------------- 1 file changed, 742 insertions(+), 456 deletions(-) diff --git a/net/ipv4/tcp_prague.c b/net/ipv4/tcp_prague.c index ac641ec6874f2..0fe2f6d1be762 100644 --- a/net/ipv4/tcp_prague.c +++ b/net/ipv4/tcp_prague.c @@ -89,27 +89,70 @@ #include #include -#define MIN_CWND_RTT 2U -#define MIN_CWND_VIRT 2U -#define MIN_MSS 150U -#define MINIMUM_RATE 12500ULL /* Minimum rate in Bytes/second: 100kbps */ -#define PRAGUE_ALPHA_BITS 24U -#define PRAGUE_MAX_ALPHA (1ULL << PRAGUE_ALPHA_BITS) -#define CWND_UNIT 20U -#define ONE_CWND (1ULL << CWND_UNIT) -#define PRAGUE_SHIFT_G 4 /* EWMA gain g = 1/2^4 */ -#define DEFAULT_RTT_TRANSITION 500 -#define MAX_SCALED_RTT (100 * USEC_PER_MSEC) -#define MTU_SYS 1500UL -#define RATE_OFFSET 4 -#define OFFSET_UNIT 7 -#define HSRTT_SHIFT 7 +#define MIN_CWND_RTT 2U +#define MIN_CWND_VIRT 2U +#define MIN_MSS 150U +#define MINIMUM_RATE 12500ULL /* Minimum rate in Bytes/second: 100kbps */ +#define PRAGUE_ALPHA_BITS 24U +#define PRAGUE_MAX_ALPHA (1ULL << PRAGUE_ALPHA_BITS) +#define CWND_UNIT 20U +#define ONE_CWND (1ULL << CWND_UNIT) +#define PRAGUE_SHIFT_G 4 /* EWMA gain g = 1/2^4 */ +#define DEFAULT_RTT_TRANSITION 500 +#define MAX_SCALED_RTT (100 * USEC_PER_MSEC) +#define MTU_SYS 1500UL +#define RATE_OFFSET 4 +#define OFFSET_UNIT 7 +#define HSRTT_SHIFT 7 +#define RTT_UNIT 7 +#define RTT2US(x) ((x) << RTT_UNIT) +#define US2RTT(x) ((x) >> RTT_UNIT) + +#define PRAGUE_MAX_SRTT_BITS 18U +#define PRAGUE_MAX_MDEV_BITS (PRAGUE_MAX_SRTT_BITS+1) +#define PRAGUE_INIT_MDEV_CARRY 741455 /* 1 << (PRAGUE_MAX_MDEV_BITS+0.5) */ +#define PRAGUE_INIT_ADJ_US 262144 /* 1 << (PRAGUE_MAX_MDEV_BITS-1) */ + +/* Weights, 1/2^x */ +#define V 1 /* 0.5 */ +#define D 1 /* 0.5 */ +#define S 2 /* 0.25 */ + 
+/* Store classic_ecn with same scaling as alpha */ +#define L_STICKY (16ULL << (PRAGUE_ALPHA_BITS-V)) /* Pure L4S behaviour */ +#define CLASSIC_ECN L_STICKY + \ + PRAGUE_MAX_ALPHA /* Transition between classic and L4S */ +#define C_STICKY CLASSIC_ECN + \ + L_STICKY /* Pure classic behaviour */ + +#define V0_LG (10014683ULL >> V) /* reference queue V of ~750us */ +#define D0_LG (11498458ULL >> D) /* reference queue D of ~2ms */ + +/* RTT cwnd scaling heuristics */ +enum { + /* No RTT independence */ + RTT_CONTROL_NONE = 0, + /* Flows with e2e RTT <= target RTT achieve the same throughput */ + RTT_CONTROL_RATE, + /* Trade some throughput balance at very low RTTs for a floor on the + * amount of marks/RTT */ + RTT_CONTROL_SCALABLE, + /* Behave as a flow operating with an extra target RTT */ + RTT_CONTROL_ADDITIVE, + + __RTT_CONTROL_MAX +}; static u32 prague_burst_shift __read_mostly = 12; /* 1/2^12 sec ~=.25ms */ MODULE_PARM_DESC(prague_burst_shift, - "maximal GSO burst duration as a base-2 negative exponent"); + "maximal GSO burst duration as a base-2 negative exponent"); module_param(prague_burst_shift, uint, 0644); +static u32 prague_rtt_scaling __read_mostly = RTT_CONTROL_RATE; +MODULE_PARM_DESC(prague_rtt_scaling, "Enable RTT independence through the " + "chosen RTT scaling heuristic"); +module_param(prague_rtt_scaling, uint, 0644); + static u32 prague_max_tso_segs __read_mostly = 0; MODULE_PARM_DESC(prague_max_tso_segs, "Maximum TSO/GSO segments"); module_param(prague_max_tso_segs, uint, 0644); @@ -120,87 +163,101 @@ module_param(prague_rtt_target, uint, 0644); static int prague_rtt_transition __read_mostly = DEFAULT_RTT_TRANSITION; MODULE_PARM_DESC(prague_rtt_transition, "Amount of post-SS rounds to transition" - " to be RTT independent."); + " to be RTT independent."); module_param(prague_rtt_transition, uint, 0644); static int prague_rate_offset __read_mostly = 4; /* 4/128 ~= 3% */ MODULE_PARM_DESC(prague_rate_offset, - "Pacing rate offset in 1/128 units at 
each half of RTT_virt"); + "Pacing rate offset in 1/128 units at each half of RTT_virt"); module_param(prague_rate_offset, uint, 0644); +static int prague_ecn_fallback __read_mostly = 0; +MODULE_PARM_DESC(prague_ecn_fallback, "0 = none, 1 = detection & fallback" + " 2 = detection"); +module_param(prague_ecn_fallback, int, 0644); + struct prague { - u64 cwr_stamp; - u64 alpha_stamp; /* EWMA update timestamp */ - u64 upscaled_alpha; /* Congestion-estimate EWMA */ - u64 ai_ack_increase; /* AI increase per non-CE ACKed MSS */ - u32 mtu_cache; - u64 hsrtt_us; - u32 rate_offset; - u64 frac_cwnd; /* internal fractional cwnd */ - u64 rate_bytes; /* internal pacing rate in bytes */ - u64 loss_rate_bytes; - u32 loss_cwnd; - u32 max_tso_burst; - u32 old_delivered; /* tp->delivered at round start */ - u32 old_delivered_ce; /* tp->delivered_ce at round start */ - u32 next_seq; /* tp->snd_nxt at round start */ - u32 round; /* Round count since last slow-start exit */ - u32 rtt_transition_delay; - u32 rtt_target; /* RTT scaling target */ - u8 saw_ce:1, /* Is there an AQM on the path? */ - in_loss:1; /* In cwnd reduction caused by loss */ + u64 cwr_stamp; + u64 alpha_stamp; /* EWMA update timestamp */ + u64 upscaled_alpha; /* Congestion-estimate EWMA */ + u64 ai_ack_increase; /* AI increase per non-CE ACKed MSS */ + u32 mtu_cache; + u64 hsrtt_us; + u32 rate_offset; + u64 frac_cwnd; /* internal fractional cwnd */ + u64 rate_bytes; /* internal pacing rate in bytes */ + u64 loss_rate_bytes; + u32 loss_cwnd; + u32 max_tso_burst; + u32 rest_depth_us; + u32 rest_mdev_us; + u32 old_delivered; /* tp->delivered at round start */ + u32 old_delivered_ce; /* tp->delivered_ce at round start */ + u32 next_seq; /* tp->snd_nxt at round start */ + u32 round; /* Round count since last slow-start exit */ + u32 rtt_transition_delay; + u32 rtt_target; /* RTT scaling target */ + u8 saw_ce:1, /* Is there an AQM on the path? 
*/ + rtt_indep:3, /* RTT independence mode */ + in_loss:1; /* In cwnd reduction caused by loss */ }; +struct rtt_scaling_ops { + bool (*should_update_ewma)(struct sock *sk); + u64 (*ai_ack_increase)(struct sock *sk, u32 rtt); + u32 (*target_rtt)(struct sock *sk); +}; +static struct rtt_scaling_ops rtt_scaling_heuristics[__RTT_CONTROL_MAX]; + /* Fallback struct ops if we fail to negotiate AccECN */ static struct tcp_congestion_ops prague_reno; static void __prague_connection_id(struct sock *sk, char *str, size_t len) { - u16 dport = ntohs(inet_sk(sk)->inet_dport); - u16 sport = ntohs(inet_sk(sk)->inet_sport); - - if (sk->sk_family == AF_INET) - snprintf(str, len, "%pI4:%u-%pI4:%u", &sk->sk_rcv_saddr, sport, - &sk->sk_daddr, dport); - else if (sk->sk_family == AF_INET6) - snprintf(str, len, "[%pI6c]:%u-[%pI6c]:%u", - &sk->sk_v6_rcv_saddr, sport, &sk->sk_v6_daddr, dport); -} -#define LOG(sk, fmt, ...) do { \ - char __tmp[2 * (INET6_ADDRSTRLEN + 9) + 1] = {0}; \ - __prague_connection_id(sk, __tmp, sizeof(__tmp)); \ - /* pr_fmt expects the connection ID*/ \ - pr_info("(%s) : " fmt "\n", __tmp, ##__VA_ARGS__); \ + u16 dport = ntohs(inet_sk(sk)->inet_dport); + u16 sport = ntohs(inet_sk(sk)->inet_sport); + + if (sk->sk_family == AF_INET) + snprintf(str, len, "%pI4:%u-%pI4:%u", &sk->sk_rcv_saddr, sport, + &sk->sk_daddr, dport); + else if (sk->sk_family == AF_INET6) + snprintf(str, len, "[%pI6c]:%u-[%pI6c]:%u", + &sk->sk_v6_rcv_saddr, sport, &sk->sk_v6_daddr, dport); +} +#define LOG(sk, fmt, ...) 
do { \ + char __tmp[2 * (INET6_ADDRSTRLEN + 9) + 1] = {0}; \ + __prague_connection_id(sk, __tmp, sizeof(__tmp)); \ + /* pr_fmt expects the connection ID*/ \ + pr_info("(%s) : " fmt "\n", __tmp, ##__VA_ARGS__); \ } while (0) static struct prague *prague_ca(struct sock *sk) { - return (struct prague*)inet_csk_ca(sk); + return (struct prague*)inet_csk_ca(sk); } static bool prague_is_rtt_indep(struct sock *sk) { - struct prague *ca = prague_ca(sk); + struct prague *ca = prague_ca(sk); - return !tcp_in_slow_start(tcp_sk(sk)) && - ca->round >= ca->rtt_transition_delay; + return ca->rtt_indep != RTT_CONTROL_NONE && + !tcp_in_slow_start(tcp_sk(sk)) && + ca->round >= ca->rtt_transition_delay; } -static bool prague_e2e_rtt_elapsed(struct sock *sk) +static struct rtt_scaling_ops* prague_rtt_scaling_ops(struct sock *sk) { - return !before(tcp_sk(sk)->snd_una, prague_ca(sk)->next_seq); + return &rtt_scaling_heuristics[prague_ca(sk)->rtt_indep]; } -static u32 prague_target_rtt(struct sock *sk) +static bool prague_e2e_rtt_elapsed(struct sock *sk) { - return prague_ca(sk)->rtt_target; + return !before(tcp_sk(sk)->snd_una, prague_ca(sk)->next_seq); } -static bool prague_target_rtt_elapsed(struct sock *sk) +static u32 prague_target_rtt(struct sock *sk) { - return (prague_target_rtt(sk) >> 3) <= - tcp_stamp_us_delta(tcp_sk(sk)->tcp_mstamp, - prague_ca(sk)->alpha_stamp); + return prague_ca(sk)->rtt_target; } /* RTT independence on a step AQM requires the competing flows to converge to @@ -208,533 +265,762 @@ static bool prague_target_rtt_elapsed(struct sock *sk) * every RTT" */ static bool prague_should_update_ewma(struct sock *sk) { - return prague_e2e_rtt_elapsed(sk) && - (!prague_is_rtt_indep(sk) || - prague_target_rtt_elapsed(sk)); + return prague_e2e_rtt_elapsed(sk) && + (!prague_rtt_scaling_ops(sk)->should_update_ewma || + !prague_is_rtt_indep(sk) || + prague_rtt_scaling_ops(sk)->should_update_ewma(sk)); } -static u64 prague_unscaled_ai_ack_increase(struct sock *sk) +static u32 
prague_target_rtt(struct sock *sk) { - return 1 << CWND_UNIT; + return prague_rtt_scaling_ops(sk)->target_rtt ? + prague_rtt_scaling_ops(sk)->target_rtt(sk) : + prague_ca(sk)->rtt_target; } -static u64 prague_rate_scaled_ai_ack_increase(struct sock *sk, u32 rtt) +static u64 prague_unscaled_ai_ack_increase(struct sock *sk) { - u64 increase; - u64 divisor; - u64 target; - - target = prague_target_rtt(sk); - if (rtt >= target) - return prague_unscaled_ai_ack_increase(sk); - - increase = (u64)rtt << CWND_UNIT; - increase *= rtt; - divisor = target * target; - increase = div64_u64(increase + (divisor >> 1), divisor); - return increase; + return 1 << CWND_UNIT; } static u64 mul_64_64_shift(u64 left, u64 right, u32 shift) { - u64 a0 = left & ((1ULL<<32)-1); - u64 a1 = left >> 32; - u64 b0 = right & ((1ULL<<32)-1); - u64 b1 = right >> 32; - u64 m0 = a0 * b0; - u64 m1 = a0 * b1; - u64 m2 = a1 * b0; - u64 m3 = a1 * b1; - u64 result_low; - u64 result_high; - - m2 += (m0 >> 32); - m2 += m1; - /* Overflow */ - if (m2 < m1) - m3 += (1ULL<<32); - - result_low = (m0 & ((1ULL<<32)-1)) | (m2 << 32); - result_high = m3 + (m2 >> 32); - if (shift && 64 >= shift) { - result_low = (result_low >> shift) | (result_high << (64-shift)); - result_high = (result_high >> shift); - } - return (result_high) ? 0xffffffffffffffffULL : result_low; + u64 a0 = left & ((1ULL<<32)-1); + u64 a1 = left >> 32; + u64 b0 = right & ((1ULL<<32)-1); + u64 b1 = right >> 32; + u64 m0 = a0 * b0; + u64 m1 = a0 * b1; + u64 m2 = a1 * b0; + u64 m3 = a1 * b1; + u64 result_low; + u64 result_high; + + m2 += (m0 >> 32); + m2 += m1; + /* Overflow */ + if (m2 < m1) + m3 += (1ULL<<32); + + result_low = (m0 & ((1ULL<<32)-1)) | (m2 << 32); + result_high = m3 + (m2 >> 32); + if (shift && 64 >= shift) { + result_low = (result_low >> shift) | (result_high << (64-shift)); + result_high = (result_high >> shift); + } + return (result_high) ? 
0xffffffffffffffffULL : result_low; } static u32 prague_frac_cwnd_to_snd_cwnd(struct sock *sk) { - struct prague *ca = prague_ca(sk); - struct tcp_sock *tp = tcp_sk(sk); + struct prague *ca = prague_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); - return min_t(u32, max_t(u32, (u32)((ca->frac_cwnd + (ONE_CWND - 1)) >> CWND_UNIT), - MIN_CWND_RTT), tp->snd_cwnd_clamp); + return min_t(u32, max_t(u32, (u32)((ca->frac_cwnd + (ONE_CWND - 1)) >> CWND_UNIT), + MIN_CWND_RTT), tp->snd_cwnd_clamp); } static u64 prague_virtual_rtt(struct sock *sk) { - return max_t(u32, prague_target_rtt(sk), tcp_sk(sk)->srtt_us); + return max_t(u32, prague_target_rtt(sk), tcp_sk(sk)->srtt_us); } static u64 prague_pacing_rate_to_max_mtu(struct sock *sk) { - return div_u64(mul_64_64_shift(prague_ca(sk)->rate_bytes, prague_virtual_rtt(sk), 23) + - (MIN_CWND_VIRT - 1), MIN_CWND_VIRT); + return div_u64(mul_64_64_shift(prague_ca(sk)->rate_bytes, prague_virtual_rtt(sk), 23) + + (MIN_CWND_VIRT - 1), MIN_CWND_VIRT); } static bool prague_half_virtual_rtt_elapsed(struct sock *sk) { - return (prague_virtual_rtt(sk) >> (3 + 1)) <= - tcp_stamp_us_delta(tcp_sk(sk)->tcp_mstamp, - prague_ca(sk)->alpha_stamp); + return (prague_virtual_rtt(sk) >> (3 + 1)) <= + tcp_stamp_us_delta(tcp_sk(sk)->tcp_mstamp, + prague_ca(sk)->alpha_stamp); } static u64 prague_pacing_rate_to_frac_cwnd(struct sock *sk) { - struct prague *ca = prague_ca(sk); - struct tcp_sock *tp = tcp_sk(sk); - u64 rtt; - u64 mtu; + struct prague *ca = prague_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + u64 rtt; + u64 mtu; - mtu = tcp_mss_to_mtu(sk, tp->mss_cache); - rtt = (ca->hsrtt_us >> HSRTT_SHIFT) ? (ca->hsrtt_us >> HSRTT_SHIFT) : tp->srtt_us; + mtu = tcp_mss_to_mtu(sk, tp->mss_cache); + rtt = (ca->hsrtt_us >> HSRTT_SHIFT) ? 
(ca->hsrtt_us >> HSRTT_SHIFT) : tp->srtt_us; - return div_u64(mul_64_64_shift(ca->rate_bytes, rtt, 23 - CWND_UNIT) + (mtu - 1), mtu); + return div_u64(mul_64_64_shift(ca->rate_bytes, rtt, 23 - CWND_UNIT) + (mtu - 1), mtu); } static u32 prague_valid_mtu(struct sock *sk, u32 mtu) { - return max_t(u32, min_t(u32, prague_ca(sk)->mtu_cache, mtu), tcp_mss_to_mtu(sk, MIN_MSS)); + return max_t(u32, min_t(u32, prague_ca(sk)->mtu_cache, mtu), tcp_mss_to_mtu(sk, MIN_MSS)); } /* RTT independence will scale the classical 1/W per ACK increase. */ static void prague_ai_ack_increase(struct sock *sk) { - struct prague *ca = prague_ca(sk); - u64 increase; - u32 rtt; - - rtt = tcp_sk(sk)->srtt_us; - if (ca->round < ca->rtt_transition_delay || - !rtt || rtt > (MAX_SCALED_RTT << 3)) { - increase = prague_unscaled_ai_ack_increase(sk); - goto exit; - } - - increase = prague_rate_scaled_ai_ack_increase(sk, rtt); + struct prague *ca = prague_ca(sk); + u64 increase; + u32 rtt; + + if (!prague_rtt_scaling_ops(sk)->ai_ack_increase) { + increase = prague_unscaled_ai_ack_increase(sk); + goto exit; + } + + rtt = tcp_sk(sk)->srtt_us; + if (ca->round < ca->rtt_transition_delay || + !rtt || rtt > (MAX_SCALED_RTT << 3)) { + increase = prague_unscaled_ai_ack_increase(sk); + goto exit; + } + + increase = prague_rtt_scaling_ops(sk)->ai_ack_increase(sk, rtt); exit: - WRITE_ONCE(ca->ai_ack_increase, increase); + WRITE_ONCE(ca->ai_ack_increase, increase); } static void prague_update_pacing_rate(struct sock *sk) { - struct prague *ca = prague_ca(sk); - const struct tcp_sock *tp = tcp_sk(sk); - u64 max_inflight; - u64 rate, burst, offset; - u64 mtu; - - if (prague_is_rtt_indep(sk)) { - offset = mul_64_64_shift(ca->rate_offset, ca->rate_bytes, OFFSET_UNIT); - if (prague_half_virtual_rtt_elapsed(sk)) // second half - rate = ca->rate_bytes - offset; - else // first half - rate = ca->rate_bytes + offset; - } else { - mtu = tcp_mss_to_mtu(sk, tp->mss_cache); - max_inflight = max(tp->snd_cwnd, 
tcp_packets_in_flight(tp)); - rate = (u64)((u64)USEC_PER_SEC << 3) * mtu; - } - - if (tp->snd_cwnd < tp->snd_ssthresh / 2) - rate <<= 1; - - if (!prague_is_rtt_indep(sk)) { - if (likely(tp->srtt_us)) - rate = div64_u64(rate, (u64)tp->srtt_us); - rate = rate*max_inflight; - ca->rate_bytes = rate; - } - - rate = min_t(u64, rate, sk->sk_max_pacing_rate); - burst = div_u64(rate, tcp_mss_to_mtu(sk, tp->mss_cache)); - - WRITE_ONCE(prague_ca(sk)->max_tso_burst, - max_t(u32, 1, burst >> prague_burst_shift)); - WRITE_ONCE(sk->sk_pacing_rate, rate); + struct prague *ca = prague_ca(sk); + const struct tcp_sock *tp = tcp_sk(sk); + u64 max_inflight; + u64 rate, burst, offset; + u64 mtu; + + if (prague_is_rtt_indep(sk)) { + offset = mul_64_64_shift(ca->rate_offset, ca->rate_bytes, OFFSET_UNIT); + if (prague_half_virtual_rtt_elapsed(sk)) // second half + rate = ca->rate_bytes - offset; + else // first half + rate = ca->rate_bytes + offset; + } else { + mtu = tcp_mss_to_mtu(sk, tp->mss_cache); + max_inflight = max(tp->snd_cwnd, tcp_packets_in_flight(tp)); + rate = (u64)((u64)USEC_PER_SEC << 3) * mtu; + } + + if (tp->snd_cwnd < tp->snd_ssthresh / 2) + rate <<= 1; + + if (!prague_is_rtt_indep(sk)) { + if (likely(tp->srtt_us)) + rate = div64_u64(rate, (u64)tp->srtt_us); + rate *= max_inflight; + ca->rate_bytes = rate; + } + + rate = min_t(u64, rate, sk->sk_max_pacing_rate); + burst = div_u64(rate, tcp_mss_to_mtu(sk, tp->mss_cache)); + + WRITE_ONCE(prague_ca(sk)->max_tso_burst, + max_t(u32, 1, burst >> prague_burst_shift)); + WRITE_ONCE(sk->sk_pacing_rate, rate); } static void prague_new_round(struct sock *sk) { - struct prague *ca = prague_ca(sk); - struct tcp_sock *tp = tcp_sk(sk); - - ca->next_seq = tp->snd_nxt; - ca->old_delivered_ce = tp->delivered_ce; - ca->old_delivered = tp->delivered; - if (!tcp_in_slow_start(tp)) { - ++ca->round; - if (!ca->round) - ca->round = ca->rtt_transition_delay; - } - prague_ai_ack_increase(sk); + struct prague *ca = prague_ca(sk); + struct tcp_sock 
*tp = tcp_sk(sk); + + ca->next_seq = tp->snd_nxt; + ca->old_delivered_ce = tp->delivered_ce; + ca->old_delivered = tp->delivered; + if (!tcp_in_slow_start(tp)) { + ++ca->round; + if (!ca->round) + ca->round = ca->rtt_transition_delay; + } + prague_ai_ack_increase(sk); } static void prague_cwnd_changed(struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); - tp->snd_cwnd_stamp = tcp_jiffies32; - prague_ai_ack_increase(sk); + struct tcp_sock *tp = tcp_sk(sk); + tp->snd_cwnd_stamp = tcp_jiffies32; + prague_ai_ack_increase(sk); +} + +/* TODO(asadsa): move this detection out of prague to make it more generic. */ +/* TODO(asadsa): check if self-limited works as given out in the design */ +static void prague_classic_ecn_detection(struct sock *sk) +{ + struct prague *ca = prague_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + u32 min_rtt_us = tcp_min_rtt(tp); + u32 g_srtt_shift = tp->g_srtt_shift; + u32 g_mdev_shift = tp->g_mdev_shift; + u64 srtt_us = tp->srtt_pace_us >> g_srtt_shift; + u64 mdev_us = tp->mdev_pace_us >> g_mdev_shift; + u64 depth_us; + u32 mdev_lg, depth_lg; + u32 adj_us = PRAGUE_INIT_ADJ_US >> (PRAGUE_MAX_MDEV_BITS - g_mdev_shift); + s64 new_classic_ecn = (s64)tp->classic_ecn; + + if (unlikely(!srtt_us) || unlikely(min_rtt_us == ~0U)) + return; + + /* Multiply upscaled mdev by upscaled geometric carry from the previous round + * adding upscaled adjustment to unbias the subsequent integer log + */ + mdev_us = (u64)mdev_us * ca->rest_mdev_us + adj_us; + mdev_lg = max_t(u32, ilog2(mdev_us), g_mdev_shift) - g_mdev_shift; + /* carry the new rest to the next round */ + ca->rest_mdev_us = mdev_us >> mdev_lg; + /* V*lg(mdev_us/VO) */ + mdev_lg <<= PRAGUE_ALPHA_BITS - V; + new_classic_ecn += (s64)mdev_lg - V0_LG; + + if (unlikely(srtt_us <= min_rtt_us)) + goto out; + + depth_us = (srtt_us - min_rtt_us) * ca->rest_depth_us + (adj_us >> 1); + depth_lg = max_t(u32, ilog2(depth_us), g_srtt_shift) - g_srtt_shift; + ca->rest_depth_us = depth_us >> depth_lg; + /* queue 
build-up can only bring classic_ecn toward more classic */ + /* + D*lg(max(d/D0, 1)) */ + depth_lg <<= PRAGUE_ALPHA_BITS - D; + if (depth_lg > D0_LG) { + new_classic_ecn += (u64)depth_lg - D0_LG; + } + + /* self-limited? */ + //if (!tcp_is_cwnd_limited(sk)) + // /* - S*s */ + // new_classic_ecn -= PRAGUE_MAX_ALPHA - + // (tp->snd_cwnd_used << (PRAGUE_ALPHA_BITS-S)) / tp->snd_cwnd; + +out: + tp->classic_ecn = min_t(u64, max_t(s64, new_classic_ecn, 0), C_STICKY); } static void prague_update_alpha(struct sock *sk) { - struct prague *ca = prague_ca(sk); - struct tcp_sock *tp = tcp_sk(sk); - u64 ecn_segs, alpha, mtu, mtu_used; - - /* Do not update alpha before we have proof that there's an AQM on - * the path. - */ - if (unlikely(!ca->saw_ce)) - goto skip; - - alpha = ca->upscaled_alpha; - ecn_segs = tp->delivered_ce - ca->old_delivered_ce; - /* We diverge from the original EWMA, i.e., - * alpha = (1 - g) * alpha + g * F - * by working with (and storing) - * upscaled_alpha = alpha * (1/g) [recall that 0delivered - ca->old_delivered; - - ecn_segs <<= PRAGUE_ALPHA_BITS; - ecn_segs = div_u64(ecn_segs, max(1U, acked_segs)); - } - alpha = alpha - (alpha >> PRAGUE_SHIFT_G) + ecn_segs; - ca->alpha_stamp = tp->tcp_mstamp; - alpha = min(PRAGUE_MAX_ALPHA << PRAGUE_SHIFT_G, alpha); - - WRITE_ONCE(ca->upscaled_alpha, alpha); - tp->alpha = alpha >> PRAGUE_SHIFT_G; - - if (prague_is_rtt_indep(sk) && !ca->in_loss) { - mtu_used = tcp_mss_to_mtu(sk, tp->mss_cache); - mtu = prague_valid_mtu(sk, prague_pacing_rate_to_max_mtu(sk)); - if (mtu_used != mtu) { - ca->frac_cwnd = div_u64(ca->frac_cwnd * mtu_used, mtu); - tp->mss_cache_set_by_ca = true; - tcp_sync_mss(sk, mtu); - tp->snd_cwnd = prague_frac_cwnd_to_snd_cwnd(sk); - } - } - - ca->hsrtt_us += tp->srtt_us - (ca->hsrtt_us >> HSRTT_SHIFT); + struct prague *ca = prague_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + u64 ecn_segs, alpha, mtu, mtu_used; + + /* Do not update alpha before we have proof that there's an AQM on + * the path. 
+ */ + if (unlikely(!ca->saw_ce)) + goto skip; + + if (prague_ecn_fallback > 0) + prague_classic_ecn_detection(sk); + + alpha = ca->upscaled_alpha; + ecn_segs = tp->delivered_ce - ca->old_delivered_ce; + /* We diverge from the original EWMA, i.e., + * alpha = (1 - g) * alpha + g * F + * by working with (and storing) + * upscaled_alpha = alpha * (1/g) [recall that 0delivered - ca->old_delivered; + + ecn_segs <<= PRAGUE_ALPHA_BITS; + ecn_segs = div_u64(ecn_segs, max(1U, acked_segs)); + } + alpha = alpha - (alpha >> PRAGUE_SHIFT_G) + ecn_segs; + ca->alpha_stamp = tp->tcp_mstamp; + alpha = min(PRAGUE_MAX_ALPHA << PRAGUE_SHIFT_G, alpha); + + WRITE_ONCE(ca->upscaled_alpha, alpha); + tp->alpha = alpha >> PRAGUE_SHIFT_G; + + if (prague_is_rtt_indep(sk) && !ca->in_loss) { + mtu_used = tcp_mss_to_mtu(sk, tp->mss_cache); + mtu = prague_valid_mtu(sk, prague_pacing_rate_to_max_mtu(sk)); + if (mtu_used != mtu) { + ca->frac_cwnd = div_u64(ca->frac_cwnd * mtu_used, mtu); + tp->mss_cache_set_by_ca = true; + tcp_sync_mss(sk, mtu); + tp->snd_cwnd = prague_frac_cwnd_to_snd_cwnd(sk); + } + } + + ca->hsrtt_us += tp->srtt_us - (ca->hsrtt_us >> HSRTT_SHIFT); skip: - prague_new_round(sk); + prague_new_round(sk); } static void prague_update_cwnd(struct sock *sk, const struct rate_sample *rs) { - struct prague *ca = prague_ca(sk); - struct tcp_sock *tp = tcp_sk(sk); - u64 increase; - s64 acked; - u32 new_cwnd; - u64 divisor; - u64 mtu_used; - - acked = rs->acked_sacked; - if (rs->ece_delta) { - if (rs->ece_delta > acked) - LOG(sk, "Received %u marks for %lld acks at %u", - rs->ece_delta, acked, tp->snd_una); - ca->saw_ce = 1; - acked -= rs->ece_delta; - } - - if (acked <= 0 || ca->in_loss || tp->app_limited) - goto adjust; - - if (tcp_in_slow_start(tp)) { - acked = tcp_slow_start(tp, acked); - ca->frac_cwnd = ((u64)tp->snd_cwnd << CWND_UNIT); - if (!acked) { - prague_cwnd_changed(sk); - return; - } - } - - if (prague_is_rtt_indep(sk)) { - mtu_used = tcp_mss_to_mtu(sk, tp->mss_cache); - 
increase = div_u64(((u64)(acked * MTU_SYS)) << 23, prague_virtual_rtt(sk)); - divisor = mtu_used << 23; - new_cwnd = div64_u64(ca->rate_bytes * prague_virtual_rtt(sk) + divisor - 1, divisor); - if (likely(new_cwnd)) - ca->rate_bytes += div_u64(increase + (new_cwnd >> 1), new_cwnd); - ca->frac_cwnd = max_t(u64, ca->frac_cwnd + acked, prague_pacing_rate_to_frac_cwnd(sk)); - } else { - increase = acked * ca->ai_ack_increase; - new_cwnd = tp->snd_cwnd; - if (likely(new_cwnd)) - increase = div_u64(increase + (new_cwnd >> 1), new_cwnd); - ca->frac_cwnd += max_t(u64, acked, increase); - } + struct prague *ca = prague_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + u64 increase; + s64 acked; + u32 new_cwnd; + u64 divisor; + u64 mtu_used; + + acked = rs->acked_sacked; + if (rs->ece_delta) { + if (rs->ece_delta > acked) + LOG(sk, "Received %u marks for %lld acks at %u", + rs->ece_delta, acked, tp->snd_una); + ca->saw_ce = 1; + acked -= rs->ece_delta; + } + + if (acked <= 0 || ca->in_loss || tp->app_limited) + goto adjust; + + if (tcp_in_slow_start(tp)) { + acked = tcp_slow_start(tp, acked); + ca->frac_cwnd = ((u64)tp->snd_cwnd << CWND_UNIT); + if (!acked) { + prague_cwnd_changed(sk); + return; + } + } + + if (prague_is_rtt_indep(sk)) { + mtu_used = tcp_mss_to_mtu(sk, tp->mss_cache); + increase = div_u64(((u64)(acked * MTU_SYS)) << 23, prague_virtual_rtt(sk)); + divisor = mtu_used << 23; + new_cwnd = div64_u64(ca->rate_bytes * prague_virtual_rtt(sk) + divisor - 1, divisor); + if (likely(new_cwnd)) + ca->rate_bytes += div_u64(increase + (new_cwnd >> 1), new_cwnd); + ca->frac_cwnd = max_t(u64, ca->frac_cwnd + acked, prague_pacing_rate_to_frac_cwnd(sk)); + } else { + increase = acked * ca->ai_ack_increase; + new_cwnd = tp->snd_cwnd; + if (likely(new_cwnd)) + increase = div_u64(increase + (new_cwnd >> 1), new_cwnd); + ca->frac_cwnd += max_t(u64, acked, increase); + } adjust: - new_cwnd = prague_frac_cwnd_to_snd_cwnd(sk); - if (tp->snd_cwnd > new_cwnd) { - /* Step-wise cwnd 
decrement */ - --tp->snd_cwnd; - tp->snd_ssthresh = tp->snd_cwnd; - prague_cwnd_changed(sk); - } else if (tp->snd_cwnd < new_cwnd) { - /* Step-wise cwnd increment */ - ++tp->snd_cwnd; - prague_cwnd_changed(sk); - } - return; + new_cwnd = prague_frac_cwnd_to_snd_cwnd(sk); + if (tp->snd_cwnd > new_cwnd) { + /* Step-wise cwnd decrement */ + --tp->snd_cwnd; + tp->snd_ssthresh = tp->snd_cwnd; + prague_cwnd_changed(sk); + } else if (tp->snd_cwnd < new_cwnd) { + /* Step-wise cwnd increment */ + ++tp->snd_cwnd; + prague_cwnd_changed(sk); + } +return; } static void prague_ca_open(struct sock *sk) { - prague_ca(sk)->in_loss = 0; + prague_ca(sk)->in_loss = 0; } static void prague_enter_loss(struct sock *sk) { - struct prague *ca = prague_ca(sk); - struct tcp_sock *tp = tcp_sk(sk); + struct prague *ca = prague_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + + ca->loss_cwnd = tp->snd_cwnd; + ca->loss_rate_bytes = ca->rate_bytes; + ca->rate_bytes -= (ca->rate_bytes >> 1); + //ca->rate_bytes = mul_64_64_shift(717, ca->rate_bytes, 10); + ca->frac_cwnd = prague_pacing_rate_to_frac_cwnd(sk); + ca->in_loss = 1; +} - ca->loss_cwnd = tp->snd_cwnd; - ca->loss_rate_bytes = ca->rate_bytes; - ca->rate_bytes -= (ca->rate_bytes >> 1); - //ca->rate_bytes = mul_64_64_shift(717, ca->rate_bytes, 10); - ca->frac_cwnd = prague_pacing_rate_to_frac_cwnd(sk); - ca->in_loss = 1; +static void prague_update_rtt_scaling(struct sock *sk, u32 ssthresh) +{ + struct prague *ca = prague_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + int delta_shift; + u8 new_g_srtt_shift; + u8 old_g_srtt_shift = tp->g_srtt_shift; + + new_g_srtt_shift = ilog2(ssthresh); + new_g_srtt_shift += (new_g_srtt_shift >> 1) + 1; + tp->g_srtt_shift = min_t(u8, new_g_srtt_shift, PRAGUE_MAX_SRTT_BITS); + tp->g_mdev_shift = tp->g_srtt_shift + 1; + delta_shift = tp->g_srtt_shift - old_g_srtt_shift; + + if (!delta_shift) + return; + + if (delta_shift > 0) { + tp->srtt_pace_us <<= delta_shift; + tp->mdev_pace_us <<= delta_shift; + 
ca->rest_depth_us <<= delta_shift; + ca->rest_mdev_us <<= delta_shift; + } else { + delta_shift = -delta_shift; + tp->srtt_pace_us >>= delta_shift; + tp->mdev_pace_us >>= delta_shift; + ca->rest_depth_us >>= delta_shift; + ca->rest_mdev_us >>= delta_shift; + } } -static void prague_enter_cwr(struct sock *sk) +static u64 prague_classic_ecn_fallback(struct tcp_sock *tp, u64 alpha) { - struct prague *ca = prague_ca(sk); - struct tcp_sock *tp = tcp_sk(sk); - u64 reduction; - u64 alpha; + u64 c = min(tp->classic_ecn, CLASSIC_ECN) - L_STICKY; + /* 0 ... CLASSIC_ECN/PRAGUE_MAX_ALPHA */ + c = (c >> 1) + (c >> 3); /* c * ~0.6 */ - if (prague_is_rtt_indep(sk)) { - if ((prague_target_rtt(sk) >> 3) > tcp_stamp_us_delta(tp->tcp_mstamp, - ca->cwr_stamp)) - return; - ca->cwr_stamp = tp->tcp_mstamp; - alpha = ca->upscaled_alpha >> PRAGUE_SHIFT_G; - reduction = mul_64_64_shift(ca->rate_bytes, alpha, PRAGUE_ALPHA_BITS + 1); - ca->rate_bytes = max_t(u64, ca->rate_bytes - reduction, MINIMUM_RATE); - ca->frac_cwnd = prague_pacing_rate_to_frac_cwnd(sk); - } else { - ca->cwr_stamp = tp->tcp_mstamp; - alpha = ca->upscaled_alpha >> PRAGUE_SHIFT_G; + /* clamp alpha no lower than c to compete fair with classic AQMs */ + return max(alpha, c); +} - reduction = (alpha * (ca->frac_cwnd) + - /* Unbias the rounding by adding 1/2 */ - PRAGUE_MAX_ALPHA) >> - (PRAGUE_ALPHA_BITS + 1U); - ca->frac_cwnd -= reduction; - } +static void prague_enter_cwr(struct sock *sk) +{ + struct prague *ca = prague_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + u64 reduction; + u64 alpha; + + if (prague_is_rtt_indep(sk)) { + if ((prague_target_rtt(sk) >> 3) > tcp_stamp_us_delta(tp->tcp_mstamp, + ca->cwr_stamp)) + return; + ca->cwr_stamp = tp->tcp_mstamp; + alpha = ca->upscaled_alpha >> PRAGUE_SHIFT_G; + + if (prague_ecn_fallback == 1 && tp->classic_ecn > L_STICKY) + alpha = prague_classic_ecn_fallback(tp, alpha); + + reduction = mul_64_64_shift(ca->rate_bytes, alpha, PRAGUE_ALPHA_BITS + 1); + ca->rate_bytes = max_t(u64, 
ca->rate_bytes - reduction, MINIMUM_RATE); + ca->frac_cwnd = prague_pacing_rate_to_frac_cwnd(sk); + } else { + ca->cwr_stamp = tp->tcp_mstamp; + alpha = ca->upscaled_alpha >> PRAGUE_SHIFT_G; + + if (prague_ecn_fallback == 1 && tp->classic_ecn > L_STICKY) + alpha = prague_classic_ecn_fallback(tp, alpha); + + reduction = (alpha * (ca->frac_cwnd) + + /* Unbias the rounding by adding 1/2 */ + PRAGUE_MAX_ALPHA) >> + (PRAGUE_ALPHA_BITS + 1U); + ca->frac_cwnd -= reduction; + } + + return; +} - return; +/* Calculate SRTT & SMDEV with lower gain to see past instantaneous variation. + * Also use accurate RTT measurement of last segment to do Classic ECN detection + * rather than using RFC6298 which includes delay accumulated between two + * successive segments at the receiver. Finally, we do not use this MDEV for RTO + * so initialize it to zero. We use a tweaked version of tcp_rtt_estimator(). + */ +static void prague_rtt_estimator(struct sock *sk, long mrtt_us) +{ + struct tcp_sock *tp = tcp_sk(sk); + long long m = mrtt_us; /* Accurate RTT */ + u64 srtt_pace = tp->srtt_pace_us; + tp->mrtt_pace_us = mrtt_us; + + if (srtt_pace != 0) { + m -= (srtt_pace >> tp->g_srtt_shift); /* m is now error in rtt est */ + srtt_pace += m; /* rtt += 1/2^g_srtt_shift new */ + if (m < 0) + m = -m; /* m is now abs(error) */ + m -= (tp->mdev_pace_us >> tp->g_mdev_shift); + tp->mdev_pace_us += m; /* mdev += 1/2^g_mev_shift new */ + } else { + /* no previous measure. 
*/ + srtt_pace = m << tp->g_srtt_shift; /* take the measured time to be rtt */ + tp->mdev_pace_us = 1ULL << tp->g_mdev_shift; + } + tp->srtt_pace_us = max(1ULL, srtt_pace); } -static void prague_state(struct sock *sk, u8 new_state) +static void prague_pkts_acked(struct sock *sk, const struct ack_sample *sample) { - if (new_state == inet_csk(sk)->icsk_ca_state) - return; + if (sample->rtt_us != -1) + prague_rtt_estimator(sk, sample->rtt_us); +} - switch (new_state) { - case TCP_CA_Recovery: - prague_enter_loss(sk); - break; - case TCP_CA_CWR: - prague_enter_cwr(sk); - break; - case TCP_CA_Open: - prague_ca_open(sk); - break; - } +static void prague_state(struct sock *sk, u8 new_state) +{ + if (new_state == inet_csk(sk)->icsk_ca_state) + return; + + switch (new_state) { + case TCP_CA_Recovery: + prague_enter_loss(sk); + break; + case TCP_CA_CWR: + prague_enter_cwr(sk); + break; + case TCP_CA_Open: + prague_ca_open(sk); + break; + } } static void prague_cwnd_event(struct sock *sk, enum tcp_ca_event ev) { - if (ev == CA_EVENT_LOSS) - prague_enter_loss(sk); + if (ev == CA_EVENT_LOSS) + prague_enter_loss(sk); } static u32 prague_cwnd_undo(struct sock *sk) { - struct prague *ca = prague_ca(sk); - - /* We may have made some progress since then, account for it. */ - ca->in_loss = 0; - ca->rate_bytes = max(ca->rate_bytes, ca->loss_rate_bytes); - //ca->rate_bytes += ca->rate_bytes - ca->loss_rate_bytes; - ca->frac_cwnd = prague_pacing_rate_to_frac_cwnd(sk); - return max(ca->loss_cwnd, tcp_sk(sk)->snd_cwnd); + struct prague *ca = prague_ca(sk); + + /* We may have made some progress since then, account for it. 
*/ + ca->in_loss = 0; + ca->rate_bytes = max(ca->rate_bytes, ca->loss_rate_bytes); + //ca->rate_bytes += ca->rate_bytes - ca->loss_rate_bytes; + ca->frac_cwnd = prague_pacing_rate_to_frac_cwnd(sk); + return max(ca->loss_cwnd, tcp_sk(sk)->snd_cwnd); } static void prague_cong_control(struct sock *sk, const struct rate_sample *rs) { - prague_update_cwnd(sk, rs); - if (prague_should_update_ewma(sk)) - prague_update_alpha(sk); - prague_update_pacing_rate(sk); + prague_update_cwnd(sk, rs); + if (prague_should_update_ewma(sk)) + prague_update_alpha(sk); + prague_update_pacing_rate(sk); } static u32 prague_ssthresh(struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); - return tp->snd_ssthresh; + struct tcp_sock *tp = tcp_sk(sk); + + prague_update_rtt_scaling(sk, tp->snd_ssthresh); + return tp->snd_ssthresh; } static u32 prague_tso_segs(struct sock *sk, unsigned int mss_now) { - u32 tso_segs = prague_ca(sk)->max_tso_burst; + u32 tso_segs = prague_ca(sk)->max_tso_burst; - if (prague_max_tso_segs) - tso_segs = min(tso_segs, prague_max_tso_segs); + if (prague_max_tso_segs) + tso_segs = min(tso_segs, prague_max_tso_segs); - return tso_segs; + return tso_segs; } static size_t prague_get_info(struct sock *sk, u32 ext, int *attr, - union tcp_cc_info *info) -{ - const struct prague *ca = prague_ca(sk); - - if (ext & (1 << (INET_DIAG_PRAGUEINFO - 1)) || - ext & (1 << (INET_DIAG_VEGASINFO - 1))) { - memset(&info->prague, 0, sizeof(info->prague)); - if (inet_csk(sk)->icsk_ca_ops != &prague_reno) { - info->prague.prague_alpha = - ca->upscaled_alpha >> PRAGUE_SHIFT_G; - info->prague.prague_max_burst = ca->max_tso_burst; - info->prague.prague_round = ca->round; - info->prague.prague_rate_bytes = - READ_ONCE(ca->rate_bytes); - info->prague.prague_frac_cwnd = - READ_ONCE(ca->frac_cwnd); - info->prague.prague_enabled = 1; - info->prague.prague_rtt_target = - prague_target_rtt(sk); - } - *attr = INET_DIAG_PRAGUEINFO; - return sizeof(info->prague); - } - return 0; + union tcp_cc_info *info) 
+{ + const struct prague *ca = prague_ca(sk); + + if (ext & (1 << (INET_DIAG_PRAGUEINFO - 1)) || + ext & (1 << (INET_DIAG_VEGASINFO - 1))) { + memset(&info->prague, 0, sizeof(info->prague)); + if (inet_csk(sk)->icsk_ca_ops != &prague_reno) { + info->prague.prague_alpha = + ca->upscaled_alpha >> PRAGUE_SHIFT_G; + info->prague.prague_max_burst = ca->max_tso_burst; + info->prague.prague_round = ca->round; + info->prague.prague_rate_bytes = + READ_ONCE(ca->rate_bytes); + info->prague.prague_frac_cwnd = + READ_ONCE(ca->frac_cwnd); + info->prague.prague_enabled = 1; + info->prague.prague_rtt_target = + prague_target_rtt(sk); + } + *attr = INET_DIAG_PRAGUEINFO; + return sizeof(info->prague); + } + return 0; } static void prague_release(struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); - cmpxchg(&sk->sk_pacing_status, SK_PACING_NEEDED, SK_PACING_NONE); - tp->ecn_flags &= ~TCP_ECN_ECT_1; - if (!tcp_ecn_mode_any(tp)) - /* We forced the use of ECN, but failed to negotiate it */ - INET_ECN_dontxmit(sk); + cmpxchg(&sk->sk_pacing_status, SK_PACING_NEEDED, SK_PACING_NONE); + tp->ecn_flags &= ~TCP_ECN_ECT_1; + if (!tcp_ecn_mode_any(tp)) + /* We forced the use of ECN, but failed to negotiate it */ + INET_ECN_dontxmit(sk); - LOG(sk, "Released [delivered_ce=%u,received_ce=%u]", - tp->delivered_ce, tp->received_ce); + LOG(sk, "Released [delivered_ce=%u,received_ce=%u]", + tp->delivered_ce, tp->received_ce); } static void prague_init(struct sock *sk) { - struct prague *ca = prague_ca(sk); - struct tcp_sock *tp = tcp_sk(sk); - - if (!tcp_ecn_mode_any(tp) && - sk->sk_state != TCP_LISTEN && sk->sk_state != TCP_CLOSE) { - prague_release(sk); - LOG(sk, "Switching to pure reno [ecn_status=%u,sk_state=%u]", - tcp_ecn_mode_any(tp), sk->sk_state); - inet_csk(sk)->icsk_ca_ops = &prague_reno; - return; - } - - tp->ecn_flags |= TCP_ECN_ECT_1; - cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); - /* If we have an initial RTT estimate, ensure 
we have an initial pacing - * rate to use if net.ipv4.tcp_pace_iw is set. - */ - ca->alpha_stamp = tp->tcp_mstamp; - ca->upscaled_alpha = PRAGUE_MAX_ALPHA << PRAGUE_SHIFT_G; - ca->frac_cwnd = ((u64)tp->snd_cwnd << CWND_UNIT); - ca->max_tso_burst = 1; - - /* rate initialization */ - if (tp->srtt_us) { - ca->rate_bytes = div_u64(((u64)USEC_PER_SEC << 3) * tcp_mss_to_mtu(sk, tp->mss_cache) , tp->srtt_us); - ca->rate_bytes = max_t(u64, ca->rate_bytes * tp->snd_cwnd, MINIMUM_RATE); - } else { - ca->rate_bytes = MINIMUM_RATE; - } - prague_update_pacing_rate(sk); - ca->loss_rate_bytes = 0; - ca->round = 0; - ca->rtt_transition_delay = prague_rtt_transition; - ca->rtt_target = prague_rtt_target << 3; - ca->saw_ce = !!tp->delivered_ce; - - ca->mtu_cache = tcp_mss_to_mtu(sk, tp->mss_cache); - ca->hsrtt_us = (tp->srtt_us) ? (tp->srtt_us << HSRTT_SHIFT) : (USEC_PER_MSEC << (HSRTT_SHIFT + 3)); - ca->rate_offset = (prague_rate_offset && prague_rate_offset < ((1 << OFFSET_UNIT) -1)) ? prague_rate_offset : RATE_OFFSET ; - - tp->classic_ecn = 0ULL; - tp->alpha = PRAGUE_MAX_ALPHA; /* Used ONLY to log alpha */ - - prague_new_round(sk); + struct prague *ca = prague_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + + if (!tcp_ecn_mode_any(tp) && + sk->sk_state != TCP_LISTEN && sk->sk_state != TCP_CLOSE) { + prague_release(sk); + LOG(sk, "Switching to pure reno [ecn_status=%u,sk_state=%u]", + tcp_ecn_mode_any(tp), sk->sk_state); + inet_csk(sk)->icsk_ca_ops = &prague_reno; + return; + } + + tp->ecn_flags |= TCP_ECN_ECT_1; + cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); + /* If we have an initial RTT estimate, ensure we have an initial pacing + * rate to use if net.ipv4.tcp_pace_iw is set. 
+ */ + ca->alpha_stamp = tp->tcp_mstamp; + ca->upscaled_alpha = PRAGUE_MAX_ALPHA << PRAGUE_SHIFT_G; + ca->frac_cwnd = ((u64)tp->snd_cwnd << CWND_UNIT); + ca->max_tso_burst = 1; + + /* rate initialization */ + if (tp->srtt_us) { + ca->rate_bytes = div_u64(((u64)USEC_PER_SEC << 3) * tcp_mss_to_mtu(sk, tp->mss_cache) , tp->srtt_us); + ca->rate_bytes = max_t(u64, ca->rate_bytes * tp->snd_cwnd, MINIMUM_RATE); + } else { + ca->rate_bytes = MINIMUM_RATE; + } + prague_update_pacing_rate(sk); + ca->loss_rate_bytes = 0; + ca->round = 0; + ca->rtt_transition_delay = prague_rtt_transition; + ca->rtt_target = prague_rtt_target << 3; + ca->rtt_indep = ca->rtt_target ? prague_rtt_scaling : RTT_CONTROL_NONE; + if (ca->rtt_indep >= __RTT_CONTROL_MAX) + ca->rtt_indep = RTT_CONTROL_NONE; + LOG(sk, "RTT indep chosen: %d (after %u rounds), targetting %u usec", + ca->rtt_indep, ca->rtt_transition_delay, prague_target_rtt(sk)); + ca->saw_ce = !!tp->delivered_ce; + + ca->mtu_cache = tcp_mss_to_mtu(sk, tp->mss_cache); + ca->hsrtt_us = (tp->srtt_us) ? (tp->srtt_us << HSRTT_SHIFT) : (USEC_PER_MSEC << (HSRTT_SHIFT + 3)); + ca->rate_offset = (prague_rate_offset && prague_rate_offset < ((1 << OFFSET_UNIT) -1)) ? 
prague_rate_offset : RATE_OFFSET ; + + /* reuse existing meaurement of SRTT as an intial starting point */ + tp->g_srtt_shift = PRAGUE_MAX_SRTT_BITS; + tp->g_mdev_shift = PRAGUE_MAX_MDEV_BITS; + tp->mrtt_pace_us = tp->srtt_us >> 3; + tp->srtt_pace_us = (u64)tp->mrtt_pace_us << tp->g_srtt_shift; + tp->mdev_pace_us = 1ULL << tp->g_mdev_shift; + ca->rest_mdev_us = PRAGUE_INIT_MDEV_CARRY; + ca->rest_depth_us = PRAGUE_INIT_MDEV_CARRY >> 1; + + tp->classic_ecn = 0ULL; + tp->alpha = PRAGUE_MAX_ALPHA; /* Used ONLY to log alpha */ + + prague_new_round(sk); } +static bool prague_target_rtt_elapsed(struct sock *sk) +{ + return (prague_target_rtt(sk) >> 3) <= + tcp_stamp_us_delta(tcp_sk(sk)->tcp_mstamp, + prague_ca(sk)->alpha_stamp); +} + +static u64 prague_rate_scaled_ai_ack_increase(struct sock *sk, u32 rtt) +{ + u64 increase; + u64 divisor; + u64 target; + + + target = prague_target_rtt(sk); + if (rtt >= target) + return prague_unscaled_ai_ack_increase(sk); + /* Scale increase to: + * - Grow by 1MSS/target RTT + * - Take into account the rate ratio of doing cwnd += 1MSS + * + * Overflows if e2e RTT is > 100ms, hence the cap + */ + increase = (u64)rtt << CWND_UNIT; + increase *= rtt; + divisor = target * target; + increase = div64_u64(increase + (divisor >> 1), divisor); + return increase; +} + +static u64 prague_scalable_ai_ack_increase(struct sock *sk, u32 rtt) +{ + /* R0 ~= 16ms, R1 ~= 1.5ms */ + const s64 R0 = ((1 << 14) << 3), R1 = (((1 << 10) + (1 << 9)) << 3); + u64 increase; + u64 divisor; + + /* Scale increase to: + * - Ensure a growth of at least 1/8th, i.e., one mark every 8 RTT. 
+ * - Take into account the rate ratio of doing cwnd += 1MSS + */ + increase = (ONE_CWND >> 3) * R0; + increase += ONE_CWND * min_t(s64, max_t(s64, rtt - R1, 0), R0); + increase *= rtt; + divisor = R0 * R0; + increase = div64_u64(increase + (divisor >> 1), divisor); + return increase; +} + +static u32 prague_dynamic_rtt_target(struct sock *sk) +{ + return prague_ca(sk)->rtt_target + tcp_sk(sk)->srtt_us; +} + +static struct rtt_scaling_ops +rtt_scaling_heuristics[__RTT_CONTROL_MAX] __read_mostly = { + [RTT_CONTROL_NONE] = { + .should_update_ewma = NULL, + .ai_ack_increase = NULL, + .target_rtt = NULL, + }, + [RTT_CONTROL_RATE] = { + .should_update_ewma = prague_target_rtt_elapsed, + .ai_ack_increase = prague_rate_scaled_ai_ack_increase, + .target_rtt = NULL, + }, + [RTT_CONTROL_SCALABLE] = { + .should_update_ewma = prague_target_rtt_elapsed, + .ai_ack_increase = prague_scalable_ai_ack_increase, + .target_rtt = NULL, + }, + [RTT_CONTROL_ADDITIVE] = { + .should_update_ewma = prague_target_rtt_elapsed, + .ai_ack_increase = prague_rate_scaled_ai_ack_increase, + .target_rtt = prague_dynamic_rtt_target + }, +}; + static struct tcp_congestion_ops prague __read_mostly = { - .init = prague_init, - .release = prague_release, - .cong_control = prague_cong_control, - .cwnd_event = prague_cwnd_event, - .ssthresh = prague_ssthresh, - .undo_cwnd = prague_cwnd_undo, - .set_state = prague_state, - .get_info = prague_get_info, - .tso_segs = prague_tso_segs, - .flags = TCP_CONG_NEEDS_ECN | TCP_CONG_NEEDS_ACCECN | - TCP_CONG_NO_FALLBACK_RFC3168 | TCP_CONG_NON_RESTRICTED, - .owner = THIS_MODULE, - .name = "prague", + .init = prague_init, + .release = prague_release, + .cong_control = prague_cong_control, + .cwnd_event = prague_cwnd_event, + .ssthresh = prague_ssthresh, + .undo_cwnd = prague_cwnd_undo, + .pkts_acked = prague_pkts_acked, + .set_state = prague_state, + .get_info = prague_get_info, + .tso_segs = prague_tso_segs, + .flags = TCP_CONG_NEEDS_ECN | TCP_CONG_NEEDS_ACCECN | + 
TCP_CONG_NO_FALLBACK_RFC3168 | TCP_CONG_NON_RESTRICTED, + .owner = THIS_MODULE, + .name = "prague", }; static struct tcp_congestion_ops prague_reno __read_mostly = { - .ssthresh = tcp_reno_ssthresh, - .cong_avoid = tcp_reno_cong_avoid, - .undo_cwnd = tcp_reno_undo_cwnd, - .get_info = prague_get_info, - .owner = THIS_MODULE, - .name = "prague-reno", + .ssthresh = tcp_reno_ssthresh, + .cong_avoid = tcp_reno_cong_avoid, + .undo_cwnd = tcp_reno_undo_cwnd, + .get_info = prague_get_info, + .owner = THIS_MODULE, + .name = "prague-reno", }; static int __init prague_register(void) { - BUILD_BUG_ON(sizeof(struct prague) > ICSK_CA_PRIV_SIZE); - return tcp_register_congestion_control(&prague); + BUILD_BUG_ON(sizeof(struct prague) > ICSK_CA_PRIV_SIZE); + return tcp_register_congestion_control(&prague); } static void __exit prague_unregister(void) { - tcp_unregister_congestion_control(&prague); + tcp_unregister_congestion_control(&prague); } module_init(prague_register); From 1452e0eda5f674676e56767510f0dcb4ac8df25e Mon Sep 17 00:00:00 2001 From: Chia-Yu Chang Date: Mon, 23 Oct 2023 11:54:06 +0200 Subject: [PATCH 17/47] Revert some changes --- net/ipv4/tcp_prague.c | 1334 ++++++++++++++++++++--------------------- 1 file changed, 663 insertions(+), 671 deletions(-) diff --git a/net/ipv4/tcp_prague.c b/net/ipv4/tcp_prague.c index 0fe2f6d1be762..f7b26e876cb6b 100644 --- a/net/ipv4/tcp_prague.c +++ b/net/ipv4/tcp_prague.c @@ -89,81 +89,78 @@ #include #include -#define MIN_CWND_RTT 2U -#define MIN_CWND_VIRT 2U -#define MIN_MSS 150U -#define MINIMUM_RATE 12500ULL /* Minimum rate in Bytes/second: 100kbps */ -#define PRAGUE_ALPHA_BITS 24U -#define PRAGUE_MAX_ALPHA (1ULL << PRAGUE_ALPHA_BITS) -#define CWND_UNIT 20U -#define ONE_CWND (1ULL << CWND_UNIT) -#define PRAGUE_SHIFT_G 4 /* EWMA gain g = 1/2^4 */ -#define DEFAULT_RTT_TRANSITION 500 -#define MAX_SCALED_RTT (100 * USEC_PER_MSEC) -#define MTU_SYS 1500UL -#define RATE_OFFSET 4 -#define OFFSET_UNIT 7 -#define HSRTT_SHIFT 7 -#define 
RTT_UNIT 7 -#define RTT2US(x) ((x) << RTT_UNIT) -#define US2RTT(x) ((x) >> RTT_UNIT) - -#define PRAGUE_MAX_SRTT_BITS 18U -#define PRAGUE_MAX_MDEV_BITS (PRAGUE_MAX_SRTT_BITS+1) -#define PRAGUE_INIT_MDEV_CARRY 741455 /* 1 << (PRAGUE_MAX_MDEV_BITS+0.5) */ -#define PRAGUE_INIT_ADJ_US 262144 /* 1 << (PRAGUE_MAX_MDEV_BITS-1) */ +#define MIN_CWND_RTT 2U +#define MIN_CWND_VIRT 2U +#define MIN_MSS 150U +#define MINIMUM_RATE 12500ULL /* Minimum rate in Bytes/second: 100kbps */ +#define PRAGUE_ALPHA_BITS 24U +#define PRAGUE_MAX_ALPHA (1ULL << PRAGUE_ALPHA_BITS) +#define CWND_UNIT 20U +#define ONE_CWND (1ULL << CWND_UNIT) +#define PRAGUE_SHIFT_G 4 /* EWMA gain g = 1/2^4 */ +#define DEFAULT_RTT_TRANSITION 500 +#define MAX_SCALED_RTT (100 * USEC_PER_MSEC) +#define MTU_SYS 1500UL +#define RATE_OFFSET 4 +#define OFFSET_UNIT 7 +#define HSRTT_SHIFT 7 + +#define PRAGUE_MAX_SRTT_BITS 18U +#define PRAGUE_MAX_MDEV_BITS (PRAGUE_MAX_SRTT_BITS+1) +#define PRAGUE_INIT_MDEV_CARRY 741455 /* 1 << (PRAGUE_MAX_MDEV_BITS+0.5) */ +#define PRAGUE_INIT_ADJ_US 262144 /* 1 << (PRAGUE_MAX_MDEV_BITS-1) */ /* Weights, 1/2^x */ -#define V 1 /* 0.5 */ -#define D 1 /* 0.5 */ -#define S 2 /* 0.25 */ +#define V 1 /* 0.5 */ +#define D 1 /* 0.5 */ +#define S 2 /* 0.25 */ /* Store classic_ecn with same scaling as alpha */ -#define L_STICKY (16ULL << (PRAGUE_ALPHA_BITS-V)) /* Pure L4S behaviour */ +#define L_STICKY (16ULL << (PRAGUE_ALPHA_BITS-V)) /* Pure L4S behaviour */ #define CLASSIC_ECN L_STICKY + \ - PRAGUE_MAX_ALPHA /* Transition between classic and L4S */ -#define C_STICKY CLASSIC_ECN + \ - L_STICKY /* Pure classic behaviour */ + PRAGUE_MAX_ALPHA /* Transition between classic and L4S */ +#define C_STICKY CLASSIC_ECN + \ + L_STICKY /* Pure classic behaviour */ -#define V0_LG (10014683ULL >> V) /* reference queue V of ~750us */ -#define D0_LG (11498458ULL >> D) /* reference queue D of ~2ms */ +#define V0_LG (10014683ULL >> V) /* reference queue V of ~750us */ +#define D0_LG (11498458ULL >> D) /* reference 
queue D of ~2ms */ /* RTT cwnd scaling heuristics */ enum { - /* No RTT independence */ - RTT_CONTROL_NONE = 0, - /* Flows with e2e RTT <= target RTT achieve the same throughput */ - RTT_CONTROL_RATE, - /* Trade some throughput balance at very low RTTs for a floor on the - * amount of marks/RTT */ - RTT_CONTROL_SCALABLE, - /* Behave as a flow operating with an extra target RTT */ - RTT_CONTROL_ADDITIVE, - - __RTT_CONTROL_MAX + /* No RTT independence */ + RTT_CONTROL_NONE = 0, + /* Flows with e2e RTT <= target RTT achieve the same throughput */ + RTT_CONTROL_RATE, + /* Trade some throughput balance at very low RTTs for a floor on the + * amount of marks/RTT */ + RTT_CONTROL_SCALABLE, + /* Behave as a flow operating with an extra target RTT */ + RTT_CONTROL_ADDITIVE, + + __RTT_CONTROL_MAX }; static u32 prague_burst_shift __read_mostly = 12; /* 1/2^12 sec ~=.25ms */ MODULE_PARM_DESC(prague_burst_shift, - "maximal GSO burst duration as a base-2 negative exponent"); + "maximal GSO burst duration as a base-2 negative exponent"); module_param(prague_burst_shift, uint, 0644); -static u32 prague_rtt_scaling __read_mostly = RTT_CONTROL_RATE; -MODULE_PARM_DESC(prague_rtt_scaling, "Enable RTT independence through the " - "chosen RTT scaling heuristic"); -module_param(prague_rtt_scaling, uint, 0644); - static u32 prague_max_tso_segs __read_mostly = 0; MODULE_PARM_DESC(prague_max_tso_segs, "Maximum TSO/GSO segments"); module_param(prague_max_tso_segs, uint, 0644); +static u32 prague_rtt_scaling __read_mostly = RTT_CONTROL_RATE; +MODULE_PARM_DESC(prague_rtt_scaling, "Enable RTT independence through the " + "chosen RTT scaling heuristic"); +module_param(prague_rtt_scaling, uint, 0644); + static u32 prague_rtt_target __read_mostly = 25 * USEC_PER_MSEC; MODULE_PARM_DESC(prague_rtt_target, "RTT scaling target"); module_param(prague_rtt_target, uint, 0644); static int prague_rtt_transition __read_mostly = DEFAULT_RTT_TRANSITION; MODULE_PARM_DESC(prague_rtt_transition, "Amount of 
post-SS rounds to transition" - " to be RTT independent."); + " to be RTT independent."); module_param(prague_rtt_transition, uint, 0644); static int prague_rate_offset __read_mostly = 4; /* 4/128 ~= 3% */ @@ -173,39 +170,39 @@ module_param(prague_rate_offset, uint, 0644); static int prague_ecn_fallback __read_mostly = 0; MODULE_PARM_DESC(prague_ecn_fallback, "0 = none, 1 = detection & fallback" - " 2 = detection"); + " 2 = detection"); module_param(prague_ecn_fallback, int, 0644); struct prague { - u64 cwr_stamp; - u64 alpha_stamp; /* EWMA update timestamp */ - u64 upscaled_alpha; /* Congestion-estimate EWMA */ - u64 ai_ack_increase; /* AI increase per non-CE ACKed MSS */ - u32 mtu_cache; - u64 hsrtt_us; - u32 rate_offset; - u64 frac_cwnd; /* internal fractional cwnd */ - u64 rate_bytes; /* internal pacing rate in bytes */ - u64 loss_rate_bytes; - u32 loss_cwnd; - u32 max_tso_burst; - u32 rest_depth_us; - u32 rest_mdev_us; - u32 old_delivered; /* tp->delivered at round start */ - u32 old_delivered_ce; /* tp->delivered_ce at round start */ - u32 next_seq; /* tp->snd_nxt at round start */ - u32 round; /* Round count since last slow-start exit */ - u32 rtt_transition_delay; - u32 rtt_target; /* RTT scaling target */ - u8 saw_ce:1, /* Is there an AQM on the path? 
*/ - rtt_indep:3, /* RTT independence mode */ - in_loss:1; /* In cwnd reduction caused by loss */ + u64 cwr_stamp; + u64 alpha_stamp; /* EWMA update timestamp */ + u64 upscaled_alpha; /* Congestion-estimate EWMA */ + u64 ai_ack_increase; /* AI increase per non-CE ACKed MSS */ + u32 mtu_cache; + u64 hsrtt_us; + u32 rate_offset; + u64 frac_cwnd; /* internal fractional cwnd */ + u64 rate_bytes; /* internal pacing rate in bytes */ + u64 loss_rate_bytes; + u32 loss_cwnd; + u32 max_tso_burst; + u32 rest_depth_us; + u32 rest_mdev_us; + u32 old_delivered; /* tp->delivered at round start */ + u32 old_delivered_ce; /* tp->delivered_ce at round start */ + u32 next_seq; /* tp->snd_nxt at round start */ + u32 round; /* Round count since last slow-start exit */ + u32 rtt_transition_delay; + u32 rtt_target; /* RTT scaling target */ + u8 saw_ce:1, /* Is there an AQM on the path? */ + rtt_indep:3, /* RTT independence mode */ + in_loss:1; /* In cwnd reduction caused by loss */ }; struct rtt_scaling_ops { - bool (*should_update_ewma)(struct sock *sk); - u64 (*ai_ack_increase)(struct sock *sk, u32 rtt); - u32 (*target_rtt)(struct sock *sk); + bool (*should_update_ewma)(struct sock *sk); + u64 (*ai_ack_increase)(struct sock *sk, u32 rtt); + u32 (*target_rtt)(struct sock *sk); }; static struct rtt_scaling_ops rtt_scaling_heuristics[__RTT_CONTROL_MAX]; @@ -214,50 +211,45 @@ static struct tcp_congestion_ops prague_reno; static void __prague_connection_id(struct sock *sk, char *str, size_t len) { - u16 dport = ntohs(inet_sk(sk)->inet_dport); - u16 sport = ntohs(inet_sk(sk)->inet_sport); - - if (sk->sk_family == AF_INET) - snprintf(str, len, "%pI4:%u-%pI4:%u", &sk->sk_rcv_saddr, sport, - &sk->sk_daddr, dport); - else if (sk->sk_family == AF_INET6) - snprintf(str, len, "[%pI6c]:%u-[%pI6c]:%u", - &sk->sk_v6_rcv_saddr, sport, &sk->sk_v6_daddr, dport); + u16 dport = ntohs(inet_sk(sk)->inet_dport); + u16 sport = ntohs(inet_sk(sk)->inet_sport); + + if (sk->sk_family == AF_INET) + snprintf(str, 
len, "%pI4:%u-%pI4:%u", &sk->sk_rcv_saddr, sport, + &sk->sk_daddr, dport); + else if (sk->sk_family == AF_INET6) + snprintf(str, len, "[%pI6c]:%u-[%pI6c]:%u", + &sk->sk_v6_rcv_saddr, sport, &sk->sk_v6_daddr, dport); } -#define LOG(sk, fmt, ...) do { \ - char __tmp[2 * (INET6_ADDRSTRLEN + 9) + 1] = {0}; \ - __prague_connection_id(sk, __tmp, sizeof(__tmp)); \ - /* pr_fmt expects the connection ID*/ \ - pr_info("(%s) : " fmt "\n", __tmp, ##__VA_ARGS__); \ +#define LOG(sk, fmt, ...) do { \ + char __tmp[2 * (INET6_ADDRSTRLEN + 9) + 1] = {0}; \ + __prague_connection_id(sk, __tmp, sizeof(__tmp)); \ + /* pr_fmt expects the connection ID*/ \ + pr_info("(%s) : " fmt "\n", __tmp, ##__VA_ARGS__); \ } while (0) static struct prague *prague_ca(struct sock *sk) { - return (struct prague*)inet_csk_ca(sk); + return (struct prague*)inet_csk_ca(sk); } static bool prague_is_rtt_indep(struct sock *sk) { - struct prague *ca = prague_ca(sk); + struct prague *ca = prague_ca(sk); - return ca->rtt_indep != RTT_CONTROL_NONE && - !tcp_in_slow_start(tcp_sk(sk)) && - ca->round >= ca->rtt_transition_delay; + return ca->rtt_indep != RTT_CONTROL_NONE && + !tcp_in_slow_start(tcp_sk(sk)) && + ca->round >= ca->rtt_transition_delay; } static struct rtt_scaling_ops* prague_rtt_scaling_ops(struct sock *sk) { - return &rtt_scaling_heuristics[prague_ca(sk)->rtt_indep]; + return &rtt_scaling_heuristics[prague_ca(sk)->rtt_indep]; } static bool prague_e2e_rtt_elapsed(struct sock *sk) { - return !before(tcp_sk(sk)->snd_una, prague_ca(sk)->next_seq); -} - -static u32 prague_target_rtt(struct sock *sk) -{ - return prague_ca(sk)->rtt_target; + return !before(tcp_sk(sk)->snd_una, prague_ca(sk)->next_seq); } /* RTT independence on a step AQM requires the competing flows to converge to @@ -265,448 +257,448 @@ static u32 prague_target_rtt(struct sock *sk) * every RTT" */ static bool prague_should_update_ewma(struct sock *sk) { - return prague_e2e_rtt_elapsed(sk) && - (!prague_rtt_scaling_ops(sk)->should_update_ewma 
|| - !prague_is_rtt_indep(sk) || - prague_rtt_scaling_ops(sk)->should_update_ewma(sk)); + return prague_e2e_rtt_elapsed(sk) && + (!prague_rtt_scaling_ops(sk)->should_update_ewma || + !prague_is_rtt_indep(sk) || + prague_rtt_scaling_ops(sk)->should_update_ewma(sk)); } static u32 prague_target_rtt(struct sock *sk) { - return prague_rtt_scaling_ops(sk)->target_rtt ? - prague_rtt_scaling_ops(sk)->target_rtt(sk) : - prague_ca(sk)->rtt_target; + return prague_rtt_scaling_ops(sk)->target_rtt ? + prague_rtt_scaling_ops(sk)->target_rtt(sk) : + prague_ca(sk)->rtt_target; } static u64 prague_unscaled_ai_ack_increase(struct sock *sk) { - return 1 << CWND_UNIT; + return 1 << CWND_UNIT; } static u64 mul_64_64_shift(u64 left, u64 right, u32 shift) { - u64 a0 = left & ((1ULL<<32)-1); - u64 a1 = left >> 32; - u64 b0 = right & ((1ULL<<32)-1); - u64 b1 = right >> 32; - u64 m0 = a0 * b0; - u64 m1 = a0 * b1; - u64 m2 = a1 * b0; - u64 m3 = a1 * b1; - u64 result_low; - u64 result_high; - - m2 += (m0 >> 32); - m2 += m1; - /* Overflow */ - if (m2 < m1) - m3 += (1ULL<<32); - - result_low = (m0 & ((1ULL<<32)-1)) | (m2 << 32); - result_high = m3 + (m2 >> 32); - if (shift && 64 >= shift) { - result_low = (result_low >> shift) | (result_high << (64-shift)); - result_high = (result_high >> shift); - } - return (result_high) ? 0xffffffffffffffffULL : result_low; + u64 a0 = left & ((1ULL<<32)-1); + u64 a1 = left >> 32; + u64 b0 = right & ((1ULL<<32)-1); + u64 b1 = right >> 32; + u64 m0 = a0 * b0; + u64 m1 = a0 * b1; + u64 m2 = a1 * b0; + u64 m3 = a1 * b1; + u64 result_low; + u64 result_high; + + m2 += (m0 >> 32); + m2 += m1; + /* Overflow */ + if (m2 < m1) + m3 += (1ULL<<32); + + result_low = (m0 & ((1ULL<<32)-1)) | (m2 << 32); + result_high = m3 + (m2 >> 32); + if (shift && 64 >= shift) { + result_low = (result_low >> shift) | (result_high << (64-shift)); + result_high = (result_high >> shift); + } + return (result_high) ? 
0xffffffffffffffffULL : result_low; } static u32 prague_frac_cwnd_to_snd_cwnd(struct sock *sk) { - struct prague *ca = prague_ca(sk); - struct tcp_sock *tp = tcp_sk(sk); + struct prague *ca = prague_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); - return min_t(u32, max_t(u32, (u32)((ca->frac_cwnd + (ONE_CWND - 1)) >> CWND_UNIT), - MIN_CWND_RTT), tp->snd_cwnd_clamp); + return min_t(u32, max_t(u32, (u32)((ca->frac_cwnd + (ONE_CWND - 1)) >> CWND_UNIT), + MIN_CWND_RTT), tp->snd_cwnd_clamp); } static u64 prague_virtual_rtt(struct sock *sk) { - return max_t(u32, prague_target_rtt(sk), tcp_sk(sk)->srtt_us); + return max_t(u32, prague_target_rtt(sk), tcp_sk(sk)->srtt_us); } static u64 prague_pacing_rate_to_max_mtu(struct sock *sk) { - return div_u64(mul_64_64_shift(prague_ca(sk)->rate_bytes, prague_virtual_rtt(sk), 23) + - (MIN_CWND_VIRT - 1), MIN_CWND_VIRT); + return div_u64(mul_64_64_shift(prague_ca(sk)->rate_bytes, prague_virtual_rtt(sk), 23) + + (MIN_CWND_VIRT - 1), MIN_CWND_VIRT); } static bool prague_half_virtual_rtt_elapsed(struct sock *sk) { - return (prague_virtual_rtt(sk) >> (3 + 1)) <= - tcp_stamp_us_delta(tcp_sk(sk)->tcp_mstamp, - prague_ca(sk)->alpha_stamp); + return (prague_virtual_rtt(sk) >> (3 + 1)) <= + tcp_stamp_us_delta(tcp_sk(sk)->tcp_mstamp, + prague_ca(sk)->alpha_stamp); } static u64 prague_pacing_rate_to_frac_cwnd(struct sock *sk) { - struct prague *ca = prague_ca(sk); - struct tcp_sock *tp = tcp_sk(sk); - u64 rtt; - u64 mtu; + struct prague *ca = prague_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + u64 rtt; + u64 mtu; - mtu = tcp_mss_to_mtu(sk, tp->mss_cache); - rtt = (ca->hsrtt_us >> HSRTT_SHIFT) ? (ca->hsrtt_us >> HSRTT_SHIFT) : tp->srtt_us; + mtu = tcp_mss_to_mtu(sk, tp->mss_cache); + rtt = (ca->hsrtt_us >> HSRTT_SHIFT) ? 
(ca->hsrtt_us >> HSRTT_SHIFT) : tp->srtt_us; - return div_u64(mul_64_64_shift(ca->rate_bytes, rtt, 23 - CWND_UNIT) + (mtu - 1), mtu); + return div_u64(mul_64_64_shift(ca->rate_bytes, rtt, 23 - CWND_UNIT) + (mtu - 1), mtu); } static u32 prague_valid_mtu(struct sock *sk, u32 mtu) { - return max_t(u32, min_t(u32, prague_ca(sk)->mtu_cache, mtu), tcp_mss_to_mtu(sk, MIN_MSS)); + return max_t(u32, min_t(u32, prague_ca(sk)->mtu_cache, mtu), tcp_mss_to_mtu(sk, MIN_MSS)); } /* RTT independence will scale the classical 1/W per ACK increase. */ static void prague_ai_ack_increase(struct sock *sk) { - struct prague *ca = prague_ca(sk); - u64 increase; - u32 rtt; - - if (!prague_rtt_scaling_ops(sk)->ai_ack_increase) { - increase = prague_unscaled_ai_ack_increase(sk); - goto exit; - } - - rtt = tcp_sk(sk)->srtt_us; - if (ca->round < ca->rtt_transition_delay || - !rtt || rtt > (MAX_SCALED_RTT << 3)) { - increase = prague_unscaled_ai_ack_increase(sk); - goto exit; - } - - increase = prague_rtt_scaling_ops(sk)->ai_ack_increase(sk, rtt); + struct prague *ca = prague_ca(sk); + u64 increase; + u32 rtt; + + if (!prague_rtt_scaling_ops(sk)->ai_ack_increase) { + increase = prague_unscaled_ai_ack_increase(sk); + goto exit; + } + + rtt = tcp_sk(sk)->srtt_us; + if (ca->round < ca->rtt_transition_delay || + !rtt || rtt > (MAX_SCALED_RTT << 3)) { + increase = prague_unscaled_ai_ack_increase(sk); + goto exit; + } + + increase = prague_rtt_scaling_ops(sk)->ai_ack_increase(sk, rtt); exit: - WRITE_ONCE(ca->ai_ack_increase, increase); + WRITE_ONCE(ca->ai_ack_increase, increase); } static void prague_update_pacing_rate(struct sock *sk) { - struct prague *ca = prague_ca(sk); - const struct tcp_sock *tp = tcp_sk(sk); - u64 max_inflight; - u64 rate, burst, offset; - u64 mtu; - - if (prague_is_rtt_indep(sk)) { - offset = mul_64_64_shift(ca->rate_offset, ca->rate_bytes, OFFSET_UNIT); - if (prague_half_virtual_rtt_elapsed(sk)) // second half - rate = ca->rate_bytes - offset; - else // first half - rate = 
ca->rate_bytes + offset; - } else { - mtu = tcp_mss_to_mtu(sk, tp->mss_cache); - max_inflight = max(tp->snd_cwnd, tcp_packets_in_flight(tp)); - rate = (u64)((u64)USEC_PER_SEC << 3) * mtu; - } - - if (tp->snd_cwnd < tp->snd_ssthresh / 2) - rate <<= 1; - - if (!prague_is_rtt_indep(sk)) { - if (likely(tp->srtt_us)) - rate = div64_u64(rate, (u64)tp->srtt_us); - rate *= max_inflight; - ca->rate_bytes = rate; - } - - rate = min_t(u64, rate, sk->sk_max_pacing_rate); - burst = div_u64(rate, tcp_mss_to_mtu(sk, tp->mss_cache)); - - WRITE_ONCE(prague_ca(sk)->max_tso_burst, - max_t(u32, 1, burst >> prague_burst_shift)); - WRITE_ONCE(sk->sk_pacing_rate, rate); + struct prague *ca = prague_ca(sk); + const struct tcp_sock *tp = tcp_sk(sk); + u64 max_inflight; + u64 rate, burst, offset; + u64 mtu; + + if (prague_is_rtt_indep(sk)) { + offset = mul_64_64_shift(ca->rate_offset, ca->rate_bytes, OFFSET_UNIT); + if (prague_half_virtual_rtt_elapsed(sk)) // second half + rate = ca->rate_bytes - offset; + else // first half + rate = ca->rate_bytes + offset; + } else { + mtu = tcp_mss_to_mtu(sk, tp->mss_cache); + max_inflight = max(tp->snd_cwnd, tcp_packets_in_flight(tp)); + rate = (u64)((u64)USEC_PER_SEC << 3) * mtu; + } + + if (tp->snd_cwnd < tp->snd_ssthresh / 2) + rate <<= 1; + if (!prague_is_rtt_indep(sk)) { + if (likely(tp->srtt_us)) + rate = div64_u64(rate, tp->srtt_us); + rate *= max_inflight; + ca->rate_bytes = rate; + } + + rate = min_t(u64, rate, sk->sk_max_pacing_rate); + burst = div_u64(rate, tcp_mss_to_mtu(sk, tp->mss_cache)); + + WRITE_ONCE(prague_ca(sk)->max_tso_burst, + max_t(u32, 1, burst >> prague_burst_shift)); + WRITE_ONCE(sk->sk_pacing_rate, rate); } static void prague_new_round(struct sock *sk) { - struct prague *ca = prague_ca(sk); - struct tcp_sock *tp = tcp_sk(sk); - - ca->next_seq = tp->snd_nxt; - ca->old_delivered_ce = tp->delivered_ce; - ca->old_delivered = tp->delivered; - if (!tcp_in_slow_start(tp)) { - ++ca->round; - if (!ca->round) - ca->round = 
ca->rtt_transition_delay; - } - prague_ai_ack_increase(sk); + struct prague *ca = prague_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + + ca->next_seq = tp->snd_nxt; + ca->old_delivered_ce = tp->delivered_ce; + ca->old_delivered = tp->delivered; + if (!tcp_in_slow_start(tp)) { + ++ca->round; + if (!ca->round) + ca->round = ca->rtt_transition_delay; + } + prague_ai_ack_increase(sk); } static void prague_cwnd_changed(struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); - tp->snd_cwnd_stamp = tcp_jiffies32; - prague_ai_ack_increase(sk); + struct tcp_sock *tp = tcp_sk(sk); + + tp->snd_cwnd_stamp = tcp_jiffies32; + prague_ai_ack_increase(sk); } /* TODO(asadsa): move this detection out of prague to make it more generic. */ /* TODO(asadsa): check if self-limited works as given out in the design */ static void prague_classic_ecn_detection(struct sock *sk) { - struct prague *ca = prague_ca(sk); - struct tcp_sock *tp = tcp_sk(sk); - u32 min_rtt_us = tcp_min_rtt(tp); - u32 g_srtt_shift = tp->g_srtt_shift; - u32 g_mdev_shift = tp->g_mdev_shift; - u64 srtt_us = tp->srtt_pace_us >> g_srtt_shift; - u64 mdev_us = tp->mdev_pace_us >> g_mdev_shift; - u64 depth_us; - u32 mdev_lg, depth_lg; - u32 adj_us = PRAGUE_INIT_ADJ_US >> (PRAGUE_MAX_MDEV_BITS - g_mdev_shift); - s64 new_classic_ecn = (s64)tp->classic_ecn; - - if (unlikely(!srtt_us) || unlikely(min_rtt_us == ~0U)) - return; - - /* Multiply upscaled mdev by upscaled geometric carry from the previous round - * adding upscaled adjustment to unbias the subsequent integer log - */ - mdev_us = (u64)mdev_us * ca->rest_mdev_us + adj_us; - mdev_lg = max_t(u32, ilog2(mdev_us), g_mdev_shift) - g_mdev_shift; - /* carry the new rest to the next round */ - ca->rest_mdev_us = mdev_us >> mdev_lg; - /* V*lg(mdev_us/VO) */ - mdev_lg <<= PRAGUE_ALPHA_BITS - V; - new_classic_ecn += (s64)mdev_lg - V0_LG; - - if (unlikely(srtt_us <= min_rtt_us)) - goto out; - - depth_us = (srtt_us - min_rtt_us) * ca->rest_depth_us + (adj_us >> 1); - depth_lg = 
max_t(u32, ilog2(depth_us), g_srtt_shift) - g_srtt_shift; - ca->rest_depth_us = depth_us >> depth_lg; - /* queue build-up can only bring classic_ecn toward more classic */ - /* + D*lg(max(d/D0, 1)) */ - depth_lg <<= PRAGUE_ALPHA_BITS - D; - if (depth_lg > D0_LG) { - new_classic_ecn += (u64)depth_lg - D0_LG; - } - - /* self-limited? */ - //if (!tcp_is_cwnd_limited(sk)) - // /* - S*s */ - // new_classic_ecn -= PRAGUE_MAX_ALPHA - - // (tp->snd_cwnd_used << (PRAGUE_ALPHA_BITS-S)) / tp->snd_cwnd; + struct prague *ca = prague_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + u32 min_rtt_us = tcp_min_rtt(tp); + u32 g_srtt_shift = tp->g_srtt_shift; + u32 g_mdev_shift = tp->g_mdev_shift; + u64 srtt_us = tp->srtt_pace_us >> g_srtt_shift; + u64 mdev_us = tp->mdev_pace_us >> g_mdev_shift; + u64 depth_us; + u32 mdev_lg, depth_lg; + u32 adj_us = PRAGUE_INIT_ADJ_US >> (PRAGUE_MAX_MDEV_BITS - g_mdev_shift); + s64 new_classic_ecn = (s64)tp->classic_ecn; + + if (unlikely(!srtt_us) || unlikely(min_rtt_us == ~0U)) + return; + + /* Multiply upscaled mdev by upscaled geometric carry from the previous round + * adding upscaled adjustment to unbias the subsequent integer log + */ + mdev_us = (u64)mdev_us * ca->rest_mdev_us + adj_us; + mdev_lg = max_t(u32, ilog2(mdev_us), g_mdev_shift) - g_mdev_shift; + /* carry the new rest to the next round */ + ca->rest_mdev_us = mdev_us >> mdev_lg; + /* V*lg(mdev_us/VO) */ + mdev_lg <<= PRAGUE_ALPHA_BITS - V; + new_classic_ecn += (s64)mdev_lg - V0_LG; + + if (unlikely(srtt_us <= min_rtt_us)) + goto out; + + depth_us = (srtt_us - min_rtt_us) * ca->rest_depth_us + (adj_us >> 1); + depth_lg = max_t(u32, ilog2(depth_us), g_srtt_shift) - g_srtt_shift; + ca->rest_depth_us = depth_us >> depth_lg; + /* queue build-up can only bring classic_ecn toward more classic */ + /* + D*lg(max(d/D0, 1)) */ + depth_lg <<= PRAGUE_ALPHA_BITS - D; + if (depth_lg > D0_LG) { + new_classic_ecn += (u64)depth_lg - D0_LG; + } + + /* self-limited? 
*/ + //if (!tcp_is_cwnd_limited(sk)) + // /* - S*s */ + // new_classic_ecn -= PRAGUE_MAX_ALPHA - + // (tp->snd_cwnd_used << (PRAGUE_ALPHA_BITS-S)) / tp->snd_cwnd; out: - tp->classic_ecn = min_t(u64, max_t(s64, new_classic_ecn, 0), C_STICKY); + tp->classic_ecn = min_t(u64, max_t(s64, new_classic_ecn, 0), C_STICKY); } static void prague_update_alpha(struct sock *sk) { - struct prague *ca = prague_ca(sk); - struct tcp_sock *tp = tcp_sk(sk); - u64 ecn_segs, alpha, mtu, mtu_used; - - /* Do not update alpha before we have proof that there's an AQM on - * the path. - */ - if (unlikely(!ca->saw_ce)) - goto skip; - - if (prague_ecn_fallback > 0) - prague_classic_ecn_detection(sk); - - alpha = ca->upscaled_alpha; - ecn_segs = tp->delivered_ce - ca->old_delivered_ce; - /* We diverge from the original EWMA, i.e., - * alpha = (1 - g) * alpha + g * F - * by working with (and storing) - * upscaled_alpha = alpha * (1/g) [recall that 0delivered - ca->old_delivered; - - ecn_segs <<= PRAGUE_ALPHA_BITS; - ecn_segs = div_u64(ecn_segs, max(1U, acked_segs)); - } - alpha = alpha - (alpha >> PRAGUE_SHIFT_G) + ecn_segs; - ca->alpha_stamp = tp->tcp_mstamp; - alpha = min(PRAGUE_MAX_ALPHA << PRAGUE_SHIFT_G, alpha); - - WRITE_ONCE(ca->upscaled_alpha, alpha); - tp->alpha = alpha >> PRAGUE_SHIFT_G; - - if (prague_is_rtt_indep(sk) && !ca->in_loss) { - mtu_used = tcp_mss_to_mtu(sk, tp->mss_cache); - mtu = prague_valid_mtu(sk, prague_pacing_rate_to_max_mtu(sk)); - if (mtu_used != mtu) { - ca->frac_cwnd = div_u64(ca->frac_cwnd * mtu_used, mtu); - tp->mss_cache_set_by_ca = true; - tcp_sync_mss(sk, mtu); - tp->snd_cwnd = prague_frac_cwnd_to_snd_cwnd(sk); - } - } - - ca->hsrtt_us += tp->srtt_us - (ca->hsrtt_us >> HSRTT_SHIFT); + struct prague *ca = prague_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + u64 ecn_segs, alpha, mtu, mtu_used; + + /* Do not update alpha before we have proof that there's an AQM on + * the path. 
+ */ + if (unlikely(!ca->saw_ce)) + goto skip; + + if (prague_ecn_fallback > 0) + prague_classic_ecn_detection(sk); + + alpha = ca->upscaled_alpha; + ecn_segs = tp->delivered_ce - ca->old_delivered_ce; + /* We diverge from the original EWMA, i.e., + * alpha = (1 - g) * alpha + g * F + * by working with (and storing) + * upscaled_alpha = alpha * (1/g) [recall that 0delivered - ca->old_delivered; + + ecn_segs <<= PRAGUE_ALPHA_BITS; + ecn_segs = div_u64(ecn_segs, max(1U, acked_segs)); + } + alpha = alpha - (alpha >> PRAGUE_SHIFT_G) + ecn_segs; + ca->alpha_stamp = tp->tcp_mstamp; + alpha = min(PRAGUE_MAX_ALPHA << PRAGUE_SHIFT_G, alpha); + + WRITE_ONCE(ca->upscaled_alpha, alpha); + tp->alpha = alpha >> PRAGUE_SHIFT_G; + + if (prague_is_rtt_indep(sk) && !ca->in_loss) { + mtu_used = tcp_mss_to_mtu(sk, tp->mss_cache); + mtu = prague_valid_mtu(sk, prague_pacing_rate_to_max_mtu(sk)); + if (mtu_used != mtu) { + ca->frac_cwnd = div_u64(ca->frac_cwnd * mtu_used, mtu); + tp->mss_cache_set_by_ca = true; + tcp_sync_mss(sk, mtu); + tp->snd_cwnd = prague_frac_cwnd_to_snd_cwnd(sk); + } + } + + ca->hsrtt_us += tp->srtt_us - (ca->hsrtt_us >> HSRTT_SHIFT); skip: - prague_new_round(sk); + prague_new_round(sk); } static void prague_update_cwnd(struct sock *sk, const struct rate_sample *rs) { - struct prague *ca = prague_ca(sk); - struct tcp_sock *tp = tcp_sk(sk); - u64 increase; - s64 acked; - u32 new_cwnd; - u64 divisor; - u64 mtu_used; - - acked = rs->acked_sacked; - if (rs->ece_delta) { - if (rs->ece_delta > acked) - LOG(sk, "Received %u marks for %lld acks at %u", - rs->ece_delta, acked, tp->snd_una); - ca->saw_ce = 1; - acked -= rs->ece_delta; - } - - if (acked <= 0 || ca->in_loss || tp->app_limited) - goto adjust; - - if (tcp_in_slow_start(tp)) { - acked = tcp_slow_start(tp, acked); - ca->frac_cwnd = ((u64)tp->snd_cwnd << CWND_UNIT); - if (!acked) { - prague_cwnd_changed(sk); - return; - } - } - - if (prague_is_rtt_indep(sk)) { - mtu_used = tcp_mss_to_mtu(sk, tp->mss_cache); - 
increase = div_u64(((u64)(acked * MTU_SYS)) << 23, prague_virtual_rtt(sk)); - divisor = mtu_used << 23; - new_cwnd = div64_u64(ca->rate_bytes * prague_virtual_rtt(sk) + divisor - 1, divisor); - if (likely(new_cwnd)) - ca->rate_bytes += div_u64(increase + (new_cwnd >> 1), new_cwnd); - ca->frac_cwnd = max_t(u64, ca->frac_cwnd + acked, prague_pacing_rate_to_frac_cwnd(sk)); - } else { - increase = acked * ca->ai_ack_increase; - new_cwnd = tp->snd_cwnd; - if (likely(new_cwnd)) - increase = div_u64(increase + (new_cwnd >> 1), new_cwnd); - ca->frac_cwnd += max_t(u64, acked, increase); - } + struct prague *ca = prague_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + u64 increase; + s64 acked; + u32 new_cwnd; + u64 divisor; + u64 mtu_used; + + acked = rs->acked_sacked; + if (rs->ece_delta) { + if (rs->ece_delta > acked) + LOG(sk, "Received %u marks for %lld acks at %u", + rs->ece_delta, acked, tp->snd_una); + ca->saw_ce = 1; + acked -= rs->ece_delta; + } + + if (acked <= 0 || ca->in_loss || tp->app_limited) + goto adjust; + + if (tcp_in_slow_start(tp)) { + acked = tcp_slow_start(tp, acked); + ca->frac_cwnd = ((u64)tp->snd_cwnd << CWND_UNIT); + if (!acked) { + prague_cwnd_changed(sk); + return; + } + } + + if (prague_is_rtt_indep(sk)) { + mtu_used = tcp_mss_to_mtu(sk, tp->mss_cache); + increase = div_u64(((u64)(acked * MTU_SYS)) << 23, prague_virtual_rtt(sk)); + divisor = mtu_used << 23; + new_cwnd = div64_u64(ca->rate_bytes * prague_virtual_rtt(sk) + divisor - 1, divisor); + if (likely(new_cwnd)) + ca->rate_bytes += div_u64(increase + (new_cwnd >> 1), new_cwnd); + ca->frac_cwnd = max_t(u64, ca->frac_cwnd + acked, prague_pacing_rate_to_frac_cwnd(sk)); + } else { + increase = acked * ca->ai_ack_increase; + if (likely(tp->snd_cwnd)) + increase = div_u64(increase + (tp->snd_cwnd >> 1), + tp->snd_cwnd); + ca->frac_cwnd += max_t(u64, acked, increase); + } adjust: - new_cwnd = prague_frac_cwnd_to_snd_cwnd(sk); - if (tp->snd_cwnd > new_cwnd) { - /* Step-wise cwnd decrement */ - 
--tp->snd_cwnd; - tp->snd_ssthresh = tp->snd_cwnd; - prague_cwnd_changed(sk); - } else if (tp->snd_cwnd < new_cwnd) { - /* Step-wise cwnd increment */ - ++tp->snd_cwnd; - prague_cwnd_changed(sk); - } -return; + new_cwnd = prague_frac_cwnd_to_snd_cwnd(sk); + if (tp->snd_cwnd > new_cwnd) { + /* Step-wise cwnd decrement */ + --tp->snd_cwnd; + tp->snd_ssthresh = tp->snd_cwnd; + prague_cwnd_changed(sk); + } else if (tp->snd_cwnd < new_cwnd) { + /* Step-wise cwnd increment */ + ++tp->snd_cwnd; + prague_cwnd_changed(sk); + } + return; } static void prague_ca_open(struct sock *sk) { - prague_ca(sk)->in_loss = 0; + prague_ca(sk)->in_loss = 0; } static void prague_enter_loss(struct sock *sk) { - struct prague *ca = prague_ca(sk); - struct tcp_sock *tp = tcp_sk(sk); - - ca->loss_cwnd = tp->snd_cwnd; - ca->loss_rate_bytes = ca->rate_bytes; - ca->rate_bytes -= (ca->rate_bytes >> 1); - //ca->rate_bytes = mul_64_64_shift(717, ca->rate_bytes, 10); - ca->frac_cwnd = prague_pacing_rate_to_frac_cwnd(sk); - ca->in_loss = 1; + struct prague *ca = prague_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + + ca->loss_cwnd = tp->snd_cwnd; + ca->loss_rate_bytes = ca->rate_bytes; + ca->rate_bytes -= (ca->rate_bytes >> 1); + //ca->rate_bytes = mul_64_64_shift(717, ca->rate_bytes, 10); + ca->frac_cwnd = prague_pacing_rate_to_frac_cwnd(sk); + ca->in_loss = 1; } static void prague_update_rtt_scaling(struct sock *sk, u32 ssthresh) { - struct prague *ca = prague_ca(sk); - struct tcp_sock *tp = tcp_sk(sk); - int delta_shift; - u8 new_g_srtt_shift; - u8 old_g_srtt_shift = tp->g_srtt_shift; - - new_g_srtt_shift = ilog2(ssthresh); - new_g_srtt_shift += (new_g_srtt_shift >> 1) + 1; - tp->g_srtt_shift = min_t(u8, new_g_srtt_shift, PRAGUE_MAX_SRTT_BITS); - tp->g_mdev_shift = tp->g_srtt_shift + 1; - delta_shift = tp->g_srtt_shift - old_g_srtt_shift; - - if (!delta_shift) - return; - - if (delta_shift > 0) { - tp->srtt_pace_us <<= delta_shift; - tp->mdev_pace_us <<= delta_shift; - ca->rest_depth_us <<= 
delta_shift; - ca->rest_mdev_us <<= delta_shift; - } else { - delta_shift = -delta_shift; - tp->srtt_pace_us >>= delta_shift; - tp->mdev_pace_us >>= delta_shift; - ca->rest_depth_us >>= delta_shift; - ca->rest_mdev_us >>= delta_shift; - } + struct prague *ca = prague_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + int delta_shift; + u8 new_g_srtt_shift; + u8 old_g_srtt_shift = tp->g_srtt_shift; + + new_g_srtt_shift = ilog2(ssthresh); + new_g_srtt_shift += (new_g_srtt_shift >> 1) + 1; + tp->g_srtt_shift = min_t(u8, new_g_srtt_shift, PRAGUE_MAX_SRTT_BITS); + tp->g_mdev_shift = tp->g_srtt_shift + 1; + delta_shift = tp->g_srtt_shift - old_g_srtt_shift; + + if (!delta_shift) + return; + + if (delta_shift > 0) { + tp->srtt_pace_us <<= delta_shift; + tp->mdev_pace_us <<= delta_shift; + ca->rest_depth_us <<= delta_shift; + ca->rest_mdev_us <<= delta_shift; + } else { + delta_shift = -delta_shift; + tp->srtt_pace_us >>= delta_shift; + tp->mdev_pace_us >>= delta_shift; + ca->rest_depth_us >>= delta_shift; + ca->rest_mdev_us >>= delta_shift; + } } static u64 prague_classic_ecn_fallback(struct tcp_sock *tp, u64 alpha) { - u64 c = min(tp->classic_ecn, CLASSIC_ECN) - L_STICKY; - /* 0 ... CLASSIC_ECN/PRAGUE_MAX_ALPHA */ - c = (c >> 1) + (c >> 3); /* c * ~0.6 */ + u64 c = min(tp->classic_ecn, CLASSIC_ECN) - L_STICKY; + /* 0 ... 
CLASSIC_ECN/PRAGUE_MAX_ALPHA */ + c = (c >> 1) + (c >> 3); /* c * ~0.6 */ - /* clamp alpha no lower than c to compete fair with classic AQMs */ - return max(alpha, c); + /* clamp alpha no lower than c to compete fair with classic AQMs */ + return max(alpha, c); } static void prague_enter_cwr(struct sock *sk) { - struct prague *ca = prague_ca(sk); - struct tcp_sock *tp = tcp_sk(sk); - u64 reduction; - u64 alpha; - - if (prague_is_rtt_indep(sk)) { - if ((prague_target_rtt(sk) >> 3) > tcp_stamp_us_delta(tp->tcp_mstamp, - ca->cwr_stamp)) - return; - ca->cwr_stamp = tp->tcp_mstamp; - alpha = ca->upscaled_alpha >> PRAGUE_SHIFT_G; - - if (prague_ecn_fallback == 1 && tp->classic_ecn > L_STICKY) - alpha = prague_classic_ecn_fallback(tp, alpha); - - reduction = mul_64_64_shift(ca->rate_bytes, alpha, PRAGUE_ALPHA_BITS + 1); - ca->rate_bytes = max_t(u64, ca->rate_bytes - reduction, MINIMUM_RATE); - ca->frac_cwnd = prague_pacing_rate_to_frac_cwnd(sk); - } else { - ca->cwr_stamp = tp->tcp_mstamp; - alpha = ca->upscaled_alpha >> PRAGUE_SHIFT_G; - - if (prague_ecn_fallback == 1 && tp->classic_ecn > L_STICKY) - alpha = prague_classic_ecn_fallback(tp, alpha); - - reduction = (alpha * (ca->frac_cwnd) + - /* Unbias the rounding by adding 1/2 */ - PRAGUE_MAX_ALPHA) >> - (PRAGUE_ALPHA_BITS + 1U); - ca->frac_cwnd -= reduction; - } - - return; + struct prague *ca = prague_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + u64 reduction; + u64 alpha; + + if (prague_is_rtt_indep(sk)) { + if ((prague_target_rtt(sk) >> 3) > tcp_stamp_us_delta(tp->tcp_mstamp, + ca->cwr_stamp)) + return; + ca->cwr_stamp = tp->tcp_mstamp; + alpha = ca->upscaled_alpha >> PRAGUE_SHIFT_G; + + if (prague_ecn_fallback == 1 && tp->classic_ecn > L_STICKY) + alpha = prague_classic_ecn_fallback(tp, alpha); + + reduction = mul_64_64_shift(ca->rate_bytes, alpha, PRAGUE_ALPHA_BITS + 1); + ca->rate_bytes = max_t(u64, ca->rate_bytes - reduction, MINIMUM_RATE); + ca->frac_cwnd = prague_pacing_rate_to_frac_cwnd(sk); + } else { + 
ca->cwr_stamp = tp->tcp_mstamp; + alpha = ca->upscaled_alpha >> PRAGUE_SHIFT_G; + + if (prague_ecn_fallback == 1 && tp->classic_ecn > L_STICKY) + alpha = prague_classic_ecn_fallback(tp, alpha); + + reduction = (alpha * (ca->frac_cwnd) + + /* Unbias the rounding by adding 1/2 */ + PRAGUE_MAX_ALPHA) >> + (PRAGUE_ALPHA_BITS + 1U); + ca->frac_cwnd -= reduction; + } + + return; } /* Calculate SRTT & SMDEV with lower gain to see past instantaneous variation. @@ -717,310 +709,310 @@ static void prague_enter_cwr(struct sock *sk) */ static void prague_rtt_estimator(struct sock *sk, long mrtt_us) { - struct tcp_sock *tp = tcp_sk(sk); - long long m = mrtt_us; /* Accurate RTT */ - u64 srtt_pace = tp->srtt_pace_us; - tp->mrtt_pace_us = mrtt_us; - - if (srtt_pace != 0) { - m -= (srtt_pace >> tp->g_srtt_shift); /* m is now error in rtt est */ - srtt_pace += m; /* rtt += 1/2^g_srtt_shift new */ - if (m < 0) - m = -m; /* m is now abs(error) */ - m -= (tp->mdev_pace_us >> tp->g_mdev_shift); - tp->mdev_pace_us += m; /* mdev += 1/2^g_mev_shift new */ - } else { - /* no previous measure. */ - srtt_pace = m << tp->g_srtt_shift; /* take the measured time to be rtt */ - tp->mdev_pace_us = 1ULL << tp->g_mdev_shift; - } - tp->srtt_pace_us = max(1ULL, srtt_pace); + struct tcp_sock *tp = tcp_sk(sk); + long long m = mrtt_us; /* Accurate RTT */ + u64 srtt_pace = tp->srtt_pace_us; + tp->mrtt_pace_us = mrtt_us; + + if (srtt_pace != 0) { + m -= (srtt_pace >> tp->g_srtt_shift); /* m is now error in rtt est */ + srtt_pace += m; /* rtt += 1/2^g_srtt_shift new */ + if (m < 0) + m = -m; /* m is now abs(error) */ + m -= (tp->mdev_pace_us >> tp->g_mdev_shift); + tp->mdev_pace_us += m; /* mdev += 1/2^g_mev_shift new */ + } else { + /* no previous measure. 
*/ + srtt_pace = m << tp->g_srtt_shift; /* take the measured time to be rtt */ + tp->mdev_pace_us = 1ULL << tp->g_mdev_shift; + } + tp->srtt_pace_us = max(1ULL, srtt_pace); } static void prague_pkts_acked(struct sock *sk, const struct ack_sample *sample) { - if (sample->rtt_us != -1) - prague_rtt_estimator(sk, sample->rtt_us); + if (sample->rtt_us != -1) + prague_rtt_estimator(sk, sample->rtt_us); } static void prague_state(struct sock *sk, u8 new_state) { - if (new_state == inet_csk(sk)->icsk_ca_state) - return; - - switch (new_state) { - case TCP_CA_Recovery: - prague_enter_loss(sk); - break; - case TCP_CA_CWR: - prague_enter_cwr(sk); - break; - case TCP_CA_Open: - prague_ca_open(sk); - break; - } + if (new_state == inet_csk(sk)->icsk_ca_state) + return; + + switch (new_state) { + case TCP_CA_Recovery: + prague_enter_loss(sk); + break; + case TCP_CA_CWR: + prague_enter_cwr(sk); + break; + case TCP_CA_Open: + prague_ca_open(sk); + break; + } } static void prague_cwnd_event(struct sock *sk, enum tcp_ca_event ev) { - if (ev == CA_EVENT_LOSS) - prague_enter_loss(sk); + if (ev == CA_EVENT_LOSS) + prague_enter_loss(sk); } static u32 prague_cwnd_undo(struct sock *sk) { - struct prague *ca = prague_ca(sk); - - /* We may have made some progress since then, account for it. */ - ca->in_loss = 0; - ca->rate_bytes = max(ca->rate_bytes, ca->loss_rate_bytes); - //ca->rate_bytes += ca->rate_bytes - ca->loss_rate_bytes; - ca->frac_cwnd = prague_pacing_rate_to_frac_cwnd(sk); - return max(ca->loss_cwnd, tcp_sk(sk)->snd_cwnd); + struct prague *ca = prague_ca(sk); + + /* We may have made some progress since then, account for it. 
*/ + ca->in_loss = 0; + ca->rate_bytes = max(ca->rate_bytes, ca->loss_rate_bytes); + //ca->rate_bytes += ca->rate_bytes - ca->loss_rate_bytes; + ca->frac_cwnd = prague_pacing_rate_to_frac_cwnd(sk); + return max(ca->loss_cwnd, tcp_sk(sk)->snd_cwnd); } static void prague_cong_control(struct sock *sk, const struct rate_sample *rs) { - prague_update_cwnd(sk, rs); - if (prague_should_update_ewma(sk)) - prague_update_alpha(sk); - prague_update_pacing_rate(sk); + prague_update_cwnd(sk, rs); + if (prague_should_update_ewma(sk)) + prague_update_alpha(sk); + prague_update_pacing_rate(sk); } static u32 prague_ssthresh(struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); - prague_update_rtt_scaling(sk, tp->snd_ssthresh); - return tp->snd_ssthresh; + prague_update_rtt_scaling(sk, tp->snd_ssthresh); + return tp->snd_ssthresh; } static u32 prague_tso_segs(struct sock *sk, unsigned int mss_now) { - u32 tso_segs = prague_ca(sk)->max_tso_burst; + u32 tso_segs = prague_ca(sk)->max_tso_burst; - if (prague_max_tso_segs) - tso_segs = min(tso_segs, prague_max_tso_segs); + if (prague_max_tso_segs) + tso_segs = min(tso_segs, prague_max_tso_segs); - return tso_segs; + return tso_segs; } static size_t prague_get_info(struct sock *sk, u32 ext, int *attr, - union tcp_cc_info *info) + union tcp_cc_info *info) { - const struct prague *ca = prague_ca(sk); - - if (ext & (1 << (INET_DIAG_PRAGUEINFO - 1)) || - ext & (1 << (INET_DIAG_VEGASINFO - 1))) { - memset(&info->prague, 0, sizeof(info->prague)); - if (inet_csk(sk)->icsk_ca_ops != &prague_reno) { - info->prague.prague_alpha = - ca->upscaled_alpha >> PRAGUE_SHIFT_G; - info->prague.prague_max_burst = ca->max_tso_burst; - info->prague.prague_round = ca->round; - info->prague.prague_rate_bytes = - READ_ONCE(ca->rate_bytes); - info->prague.prague_frac_cwnd = - READ_ONCE(ca->frac_cwnd); - info->prague.prague_enabled = 1; - info->prague.prague_rtt_target = - prague_target_rtt(sk); - } - *attr = 
INET_DIAG_PRAGUEINFO; - return sizeof(info->prague); - } - return 0; + const struct prague *ca = prague_ca(sk); + + if (ext & (1 << (INET_DIAG_PRAGUEINFO - 1)) || + ext & (1 << (INET_DIAG_VEGASINFO - 1))) { + memset(&info->prague, 0, sizeof(info->prague)); + if (inet_csk(sk)->icsk_ca_ops != &prague_reno) { + info->prague.prague_alpha = + ca->upscaled_alpha >> PRAGUE_SHIFT_G; + info->prague.prague_max_burst = ca->max_tso_burst; + info->prague.prague_round = ca->round; + info->prague.prague_rate_bytes = + READ_ONCE(ca->rate_bytes); + info->prague.prague_frac_cwnd = + READ_ONCE(ca->frac_cwnd); + info->prague.prague_enabled = 1; + info->prague.prague_rtt_target = + prague_target_rtt(sk); + } + *attr = INET_DIAG_PRAGUEINFO; + return sizeof(info->prague); + } + return 0; } static void prague_release(struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); - cmpxchg(&sk->sk_pacing_status, SK_PACING_NEEDED, SK_PACING_NONE); - tp->ecn_flags &= ~TCP_ECN_ECT_1; - if (!tcp_ecn_mode_any(tp)) - /* We forced the use of ECN, but failed to negotiate it */ - INET_ECN_dontxmit(sk); + cmpxchg(&sk->sk_pacing_status, SK_PACING_NEEDED, SK_PACING_NONE); + tp->ecn_flags &= ~TCP_ECN_ECT_1; + if (!tcp_ecn_mode_any(tp)) + /* We forced the use of ECN, but failed to negotiate it */ + INET_ECN_dontxmit(sk); - LOG(sk, "Released [delivered_ce=%u,received_ce=%u]", - tp->delivered_ce, tp->received_ce); + LOG(sk, "Released [delivered_ce=%u,received_ce=%u]", + tp->delivered_ce, tp->received_ce); } static void prague_init(struct sock *sk) { - struct prague *ca = prague_ca(sk); - struct tcp_sock *tp = tcp_sk(sk); - - if (!tcp_ecn_mode_any(tp) && - sk->sk_state != TCP_LISTEN && sk->sk_state != TCP_CLOSE) { - prague_release(sk); - LOG(sk, "Switching to pure reno [ecn_status=%u,sk_state=%u]", - tcp_ecn_mode_any(tp), sk->sk_state); - inet_csk(sk)->icsk_ca_ops = &prague_reno; - return; - } - - tp->ecn_flags |= TCP_ECN_ECT_1; - cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, 
SK_PACING_NEEDED); - /* If we have an initial RTT estimate, ensure we have an initial pacing - * rate to use if net.ipv4.tcp_pace_iw is set. - */ - ca->alpha_stamp = tp->tcp_mstamp; - ca->upscaled_alpha = PRAGUE_MAX_ALPHA << PRAGUE_SHIFT_G; - ca->frac_cwnd = ((u64)tp->snd_cwnd << CWND_UNIT); - ca->max_tso_burst = 1; - - /* rate initialization */ - if (tp->srtt_us) { - ca->rate_bytes = div_u64(((u64)USEC_PER_SEC << 3) * tcp_mss_to_mtu(sk, tp->mss_cache) , tp->srtt_us); - ca->rate_bytes = max_t(u64, ca->rate_bytes * tp->snd_cwnd, MINIMUM_RATE); - } else { - ca->rate_bytes = MINIMUM_RATE; - } - prague_update_pacing_rate(sk); - ca->loss_rate_bytes = 0; - ca->round = 0; - ca->rtt_transition_delay = prague_rtt_transition; - ca->rtt_target = prague_rtt_target << 3; - ca->rtt_indep = ca->rtt_target ? prague_rtt_scaling : RTT_CONTROL_NONE; - if (ca->rtt_indep >= __RTT_CONTROL_MAX) - ca->rtt_indep = RTT_CONTROL_NONE; - LOG(sk, "RTT indep chosen: %d (after %u rounds), targetting %u usec", - ca->rtt_indep, ca->rtt_transition_delay, prague_target_rtt(sk)); - ca->saw_ce = !!tp->delivered_ce; - - ca->mtu_cache = tcp_mss_to_mtu(sk, tp->mss_cache); - ca->hsrtt_us = (tp->srtt_us) ? (tp->srtt_us << HSRTT_SHIFT) : (USEC_PER_MSEC << (HSRTT_SHIFT + 3)); - ca->rate_offset = (prague_rate_offset && prague_rate_offset < ((1 << OFFSET_UNIT) -1)) ? 
prague_rate_offset : RATE_OFFSET ; - - /* reuse existing meaurement of SRTT as an intial starting point */ - tp->g_srtt_shift = PRAGUE_MAX_SRTT_BITS; - tp->g_mdev_shift = PRAGUE_MAX_MDEV_BITS; - tp->mrtt_pace_us = tp->srtt_us >> 3; - tp->srtt_pace_us = (u64)tp->mrtt_pace_us << tp->g_srtt_shift; - tp->mdev_pace_us = 1ULL << tp->g_mdev_shift; - ca->rest_mdev_us = PRAGUE_INIT_MDEV_CARRY; - ca->rest_depth_us = PRAGUE_INIT_MDEV_CARRY >> 1; - - tp->classic_ecn = 0ULL; - tp->alpha = PRAGUE_MAX_ALPHA; /* Used ONLY to log alpha */ - - prague_new_round(sk); + struct prague *ca = prague_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + + if (!tcp_ecn_mode_any(tp) && + sk->sk_state != TCP_LISTEN && sk->sk_state != TCP_CLOSE) { + prague_release(sk); + LOG(sk, "Switching to pure reno [ecn_status=%u,sk_state=%u]", + tcp_ecn_mode_any(tp), sk->sk_state); + inet_csk(sk)->icsk_ca_ops = &prague_reno; + return; + } + + tp->ecn_flags |= TCP_ECN_ECT_1; + cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); + /* If we have an initial RTT estimate, ensure we have an initial pacing + * rate to use if net.ipv4.tcp_pace_iw is set. + */ + ca->alpha_stamp = tp->tcp_mstamp; + ca->upscaled_alpha = PRAGUE_MAX_ALPHA << PRAGUE_SHIFT_G; + ca->frac_cwnd = ((u64)tp->snd_cwnd << CWND_UNIT); + ca->max_tso_burst = 1; + + /* rate initialization */ + if (tp->srtt_us) { + ca->rate_bytes = div_u64(((u64)USEC_PER_SEC << 3) * tcp_mss_to_mtu(sk, tp->mss_cache) , tp->srtt_us); + ca->rate_bytes = max_t(u64, ca->rate_bytes * tp->snd_cwnd, MINIMUM_RATE); + } else { + ca->rate_bytes = MINIMUM_RATE; + } + prague_update_pacing_rate(sk); + ca->loss_rate_bytes = 0; + ca->round = 0; + ca->rtt_transition_delay = prague_rtt_transition; + ca->rtt_target = prague_rtt_target << 3; + ca->rtt_indep = ca->rtt_target ? 
prague_rtt_scaling : RTT_CONTROL_NONE; + if (ca->rtt_indep >= __RTT_CONTROL_MAX) + ca->rtt_indep = RTT_CONTROL_NONE; + LOG(sk, "RTT indep chosen: %d (after %u rounds), targetting %u usec", + ca->rtt_indep, ca->rtt_transition_delay, prague_target_rtt(sk) >> 3); + ca->saw_ce = !!tp->delivered_ce; + + ca->mtu_cache = tcp_mss_to_mtu(sk, tp->mss_cache); + ca->hsrtt_us = (tp->srtt_us) ? (tp->srtt_us << HSRTT_SHIFT) : (USEC_PER_MSEC << (HSRTT_SHIFT + 3)); + ca->rate_offset = (prague_rate_offset && prague_rate_offset < ((1 << OFFSET_UNIT) -1)) ? prague_rate_offset : RATE_OFFSET ; + + /* reuse existing meaurement of SRTT as an intial starting point */ + tp->g_srtt_shift = PRAGUE_MAX_SRTT_BITS; + tp->g_mdev_shift = PRAGUE_MAX_MDEV_BITS; + tp->mrtt_pace_us = tp->srtt_us >> 3; + tp->srtt_pace_us = (u64)tp->mrtt_pace_us << tp->g_srtt_shift; + tp->mdev_pace_us = 1ULL << tp->g_mdev_shift; + ca->rest_mdev_us = PRAGUE_INIT_MDEV_CARRY; + ca->rest_depth_us = PRAGUE_INIT_MDEV_CARRY >> 1; + + tp->classic_ecn = 0ULL; + tp->alpha = PRAGUE_MAX_ALPHA; /* Used ONLY to log alpha */ + + prague_new_round(sk); } static bool prague_target_rtt_elapsed(struct sock *sk) { - return (prague_target_rtt(sk) >> 3) <= - tcp_stamp_us_delta(tcp_sk(sk)->tcp_mstamp, - prague_ca(sk)->alpha_stamp); + return (prague_target_rtt(sk) >> 3) <= + tcp_stamp_us_delta(tcp_sk(sk)->tcp_mstamp, + prague_ca(sk)->alpha_stamp); } static u64 prague_rate_scaled_ai_ack_increase(struct sock *sk, u32 rtt) { - u64 increase; - u64 divisor; - u64 target; - - - target = prague_target_rtt(sk); - if (rtt >= target) - return prague_unscaled_ai_ack_increase(sk); - /* Scale increase to: - * - Grow by 1MSS/target RTT - * - Take into account the rate ratio of doing cwnd += 1MSS - * - * Overflows if e2e RTT is > 100ms, hence the cap - */ - increase = (u64)rtt << CWND_UNIT; - increase *= rtt; - divisor = target * target; - increase = div64_u64(increase + (divisor >> 1), divisor); - return increase; + u64 increase; + u64 divisor; + u64 target; 
+ + + target = prague_target_rtt(sk); + if (rtt >= target) + return prague_unscaled_ai_ack_increase(sk); + /* Scale increase to: + * - Grow by 1MSS/target RTT + * - Take into account the rate ratio of doing cwnd += 1MSS + * + * Overflows if e2e RTT is > 100ms, hence the cap + */ + increase = (u64)rtt << CWND_UNIT; + increase *= rtt; + divisor = target * target; + increase = div64_u64(increase + (divisor >> 1), divisor); + return increase; } static u64 prague_scalable_ai_ack_increase(struct sock *sk, u32 rtt) { - /* R0 ~= 16ms, R1 ~= 1.5ms */ - const s64 R0 = ((1 << 14) << 3), R1 = (((1 << 10) + (1 << 9)) << 3); - u64 increase; - u64 divisor; - - /* Scale increase to: - * - Ensure a growth of at least 1/8th, i.e., one mark every 8 RTT. - * - Take into account the rate ratio of doing cwnd += 1MSS - */ - increase = (ONE_CWND >> 3) * R0; - increase += ONE_CWND * min_t(s64, max_t(s64, rtt - R1, 0), R0); - increase *= rtt; - divisor = R0 * R0; - increase = div64_u64(increase + (divisor >> 1), divisor); - return increase; + /* R0 ~= 16ms, R1 ~= 1.5ms */ + const s64 R0 = ((1 << 14) << 3), R1 = (((1 << 10) + (1 << 9)) << 3); + u64 increase; + u64 divisor; + + /* Scale increase to: + * - Ensure a growth of at least 1/8th, i.e., one mark every 8 RTT. 
+ * - Take into account the rate ratio of doing cwnd += 1MSS + */ + increase = (ONE_CWND >> 3) * R0; + increase += ONE_CWND * min_t(s64, max_t(s64, rtt - R1, 0), R0); + increase *= rtt; + divisor = R0 * R0; + increase = div64_u64(increase + (divisor >> 1), divisor); + return increase; } static u32 prague_dynamic_rtt_target(struct sock *sk) { - return prague_ca(sk)->rtt_target + tcp_sk(sk)->srtt_us; + return prague_ca(sk)->rtt_target + tcp_sk(sk)->srtt_us; } static struct rtt_scaling_ops rtt_scaling_heuristics[__RTT_CONTROL_MAX] __read_mostly = { - [RTT_CONTROL_NONE] = { - .should_update_ewma = NULL, - .ai_ack_increase = NULL, - .target_rtt = NULL, - }, - [RTT_CONTROL_RATE] = { - .should_update_ewma = prague_target_rtt_elapsed, - .ai_ack_increase = prague_rate_scaled_ai_ack_increase, - .target_rtt = NULL, - }, - [RTT_CONTROL_SCALABLE] = { - .should_update_ewma = prague_target_rtt_elapsed, - .ai_ack_increase = prague_scalable_ai_ack_increase, - .target_rtt = NULL, - }, - [RTT_CONTROL_ADDITIVE] = { - .should_update_ewma = prague_target_rtt_elapsed, - .ai_ack_increase = prague_rate_scaled_ai_ack_increase, - .target_rtt = prague_dynamic_rtt_target - }, + [RTT_CONTROL_NONE] = { + .should_update_ewma = NULL, + .ai_ack_increase = NULL, + .target_rtt = NULL, + }, + [RTT_CONTROL_RATE] = { + .should_update_ewma = prague_target_rtt_elapsed, + .ai_ack_increase = prague_rate_scaled_ai_ack_increase, + .target_rtt = NULL, + }, + [RTT_CONTROL_SCALABLE] = { + .should_update_ewma = prague_target_rtt_elapsed, + .ai_ack_increase = prague_scalable_ai_ack_increase, + .target_rtt = NULL, + }, + [RTT_CONTROL_ADDITIVE] = { + .should_update_ewma = prague_target_rtt_elapsed, + .ai_ack_increase = prague_rate_scaled_ai_ack_increase, + .target_rtt = prague_dynamic_rtt_target + }, }; static struct tcp_congestion_ops prague __read_mostly = { - .init = prague_init, - .release = prague_release, - .cong_control = prague_cong_control, - .cwnd_event = prague_cwnd_event, - .ssthresh = prague_ssthresh, - 
.undo_cwnd = prague_cwnd_undo, - .pkts_acked = prague_pkts_acked, - .set_state = prague_state, - .get_info = prague_get_info, - .tso_segs = prague_tso_segs, - .flags = TCP_CONG_NEEDS_ECN | TCP_CONG_NEEDS_ACCECN | - TCP_CONG_NO_FALLBACK_RFC3168 | TCP_CONG_NON_RESTRICTED, - .owner = THIS_MODULE, - .name = "prague", + .init = prague_init, + .release = prague_release, + .cong_control = prague_cong_control, + .cwnd_event = prague_cwnd_event, + .ssthresh = prague_ssthresh, + .undo_cwnd = prague_cwnd_undo, + .pkts_acked = prague_pkts_acked, + .set_state = prague_state, + .get_info = prague_get_info, + .tso_segs = prague_tso_segs, + .flags = TCP_CONG_NEEDS_ECN | TCP_CONG_NEEDS_ACCECN | + TCP_CONG_NO_FALLBACK_RFC3168 | TCP_CONG_NON_RESTRICTED, + .owner = THIS_MODULE, + .name = "prague", }; static struct tcp_congestion_ops prague_reno __read_mostly = { - .ssthresh = tcp_reno_ssthresh, - .cong_avoid = tcp_reno_cong_avoid, - .undo_cwnd = tcp_reno_undo_cwnd, - .get_info = prague_get_info, - .owner = THIS_MODULE, - .name = "prague-reno", + .ssthresh = tcp_reno_ssthresh, + .cong_avoid = tcp_reno_cong_avoid, + .undo_cwnd = tcp_reno_undo_cwnd, + .get_info = prague_get_info, + .owner = THIS_MODULE, + .name = "prague-reno", }; static int __init prague_register(void) { - BUILD_BUG_ON(sizeof(struct prague) > ICSK_CA_PRIV_SIZE); - return tcp_register_congestion_control(&prague); + BUILD_BUG_ON(sizeof(struct prague) > ICSK_CA_PRIV_SIZE); + return tcp_register_congestion_control(&prague); } static void __exit prague_unregister(void) { - tcp_unregister_congestion_control(&prague); + tcp_unregister_congestion_control(&prague); } module_init(prague_register); From 1a974835a81b251321753b7e128c80a81aa855c7 Mon Sep 17 00:00:00 2001 From: Chia-Yu Chang Date: Tue, 5 Dec 2023 13:40:07 +0100 Subject: [PATCH 18/47] Fix issues for rate-base and add mode switching criterion --- net/ipv4/tcp_prague.c | 55 +++++++++++++++++++++++++++---------------- 1 file changed, 35 insertions(+), 20 deletions(-) diff 
--git a/net/ipv4/tcp_prague.c b/net/ipv4/tcp_prague.c index f7b26e876cb6b..03021dbbb02da 100644 --- a/net/ipv4/tcp_prague.c +++ b/net/ipv4/tcp_prague.c @@ -98,7 +98,7 @@ #define CWND_UNIT 20U #define ONE_CWND (1ULL << CWND_UNIT) #define PRAGUE_SHIFT_G 4 /* EWMA gain g = 1/2^4 */ -#define DEFAULT_RTT_TRANSITION 500 +#define DEFAULT_RTT_TRANSITION 4 #define MAX_SCALED_RTT (100 * USEC_PER_MSEC) #define MTU_SYS 1500UL #define RATE_OFFSET 4 @@ -165,9 +165,14 @@ module_param(prague_rtt_transition, uint, 0644); static int prague_rate_offset __read_mostly = 4; /* 4/128 ~= 3% */ MODULE_PARM_DESC(prague_rate_offset, - "Pacing rate offset in 1/128 units at each half of RTT_virt"); + "Pacing rate offset in 1/128 units at each half of RTT_virt"); module_param(prague_rate_offset, uint, 0644); +static int prague_cwnd_transit __read_mostly = 4; +MODULE_PARM_DESC(prague_cwnd_transit, + "CWND mode switching point in term of # of MTU_SYS"); +module_param(prague_cwnd_transit, uint, 0644); + static int prague_ecn_fallback __read_mostly = 0; MODULE_PARM_DESC(prague_ecn_fallback, "0 = none, 1 = detection & fallback" " 2 = detection"); @@ -196,6 +201,7 @@ struct prague { u32 rtt_target; /* RTT scaling target */ u8 saw_ce:1, /* Is there an AQM on the path? 
*/ rtt_indep:3, /* RTT independence mode */ + cwnd_mode:1, /* CWND operating mode */ in_loss:1; /* In cwnd reduction caused by loss */ }; @@ -225,7 +231,7 @@ static void __prague_connection_id(struct sock *sk, char *str, size_t len) char __tmp[2 * (INET6_ADDRSTRLEN + 9) + 1] = {0}; \ __prague_connection_id(sk, __tmp, sizeof(__tmp)); \ /* pr_fmt expects the connection ID*/ \ - pr_info("(%s) : " fmt "\n", __tmp, ##__VA_ARGS__); \ + pr_info("(%s) : " fmt "\n", __tmp, ##__VA_ARGS__); \ } while (0) static struct prague *prague_ca(struct sock *sk) @@ -381,7 +387,7 @@ static void prague_update_pacing_rate(struct sock *sk) u64 rate, burst, offset; u64 mtu; - if (prague_is_rtt_indep(sk)) { + if (prague_is_rtt_indep(sk) && (ca->cwnd_mode == 1 && likely(ca->saw_ce))) { offset = mul_64_64_shift(ca->rate_offset, ca->rate_bytes, OFFSET_UNIT); if (prague_half_virtual_rtt_elapsed(sk)) // second half rate = ca->rate_bytes - offset; @@ -395,10 +401,10 @@ static void prague_update_pacing_rate(struct sock *sk) if (tp->snd_cwnd < tp->snd_ssthresh / 2) rate <<= 1; - if (!prague_is_rtt_indep(sk)) { + if (!prague_is_rtt_indep(sk) || (ca->cwnd_mode == 0 || unlikely(!ca->saw_ce))) { if (likely(tp->srtt_us)) - rate = div64_u64(rate, tp->srtt_us); - rate *= max_inflight; + rate = div64_u64(rate, (u64)tp->srtt_us); + rate = max_t(u64, rate*max_inflight, MINIMUM_RATE); ca->rate_bytes = rate; } @@ -536,9 +542,8 @@ static void prague_update_alpha(struct sock *sk) tp->snd_cwnd = prague_frac_cwnd_to_snd_cwnd(sk); } } - - ca->hsrtt_us += tp->srtt_us - (ca->hsrtt_us >> HSRTT_SHIFT); skip: + ca->hsrtt_us = ca->hsrtt_us - (ca->hsrtt_us >> HSRTT_SHIFT) + tp->srtt_us; prague_new_round(sk); } @@ -573,7 +578,7 @@ static void prague_update_cwnd(struct sock *sk, const struct rate_sample *rs) } } - if (prague_is_rtt_indep(sk)) { + if (prague_is_rtt_indep(sk) && (ca->cwnd_mode == 1 && likely(ca->saw_ce))) { mtu_used = tcp_mss_to_mtu(sk, tp->mss_cache); increase = div_u64(((u64)(acked * MTU_SYS)) << 23, 
prague_virtual_rtt(sk)); divisor = mtu_used << 23; @@ -583,9 +588,9 @@ static void prague_update_cwnd(struct sock *sk, const struct rate_sample *rs) ca->frac_cwnd = max_t(u64, ca->frac_cwnd + acked, prague_pacing_rate_to_frac_cwnd(sk)); } else { increase = acked * ca->ai_ack_increase; - if (likely(tp->snd_cwnd)) - increase = div_u64(increase + (tp->snd_cwnd >> 1), - tp->snd_cwnd); + new_cwnd = tp->snd_cwnd; + if (likely(new_cwnd)) + increase = div_u64(increase + (new_cwnd >> 1), new_cwnd); ca->frac_cwnd += max_t(u64, acked, increase); } @@ -616,9 +621,13 @@ static void prague_enter_loss(struct sock *sk) ca->loss_cwnd = tp->snd_cwnd; ca->loss_rate_bytes = ca->rate_bytes; - ca->rate_bytes -= (ca->rate_bytes >> 1); - //ca->rate_bytes = mul_64_64_shift(717, ca->rate_bytes, 10); - ca->frac_cwnd = prague_pacing_rate_to_frac_cwnd(sk); + if (prague_is_rtt_indep(sk) && (ca->cwnd_mode == 1 && likely(ca->saw_ce))) { + ca->rate_bytes -= (ca->rate_bytes >> 1); + //ca->rate_bytes = mul_64_64_shift(717, ca->rate_bytes, 10); + ca->frac_cwnd = prague_pacing_rate_to_frac_cwnd(sk); + } else { + ca->frac_cwnd -= (ca->frac_cwnd >> 1); + } ca->in_loss = 1; } @@ -671,7 +680,7 @@ static void prague_enter_cwr(struct sock *sk) u64 reduction; u64 alpha; - if (prague_is_rtt_indep(sk)) { + if (prague_is_rtt_indep(sk) && (ca->cwnd_mode == 1 && likely(ca->saw_ce))) { if ((prague_target_rtt(sk) >> 3) > tcp_stamp_us_delta(tp->tcp_mstamp, ca->cwr_stamp)) return; @@ -694,7 +703,7 @@ static void prague_enter_cwr(struct sock *sk) reduction = (alpha * (ca->frac_cwnd) + /* Unbias the rounding by adding 1/2 */ PRAGUE_MAX_ALPHA) >> - (PRAGUE_ALPHA_BITS + 1U); + (PRAGUE_ALPHA_BITS + 1U); ca->frac_cwnd -= reduction; } @@ -777,6 +786,11 @@ static void prague_cong_control(struct sock *sk, const struct rate_sample *rs) if (prague_should_update_ewma(sk)) prague_update_alpha(sk); prague_update_pacing_rate(sk); + if (prague_ca(sk)->cwnd_mode == 0 && tcp_sk(sk)->snd_cwnd*tcp_mss_to_mtu(sk, tcp_sk(sk)->mss_cache) 
<= prague_cwnd_transit*MTU_SYS) { + prague_ca(sk)->cwnd_mode = 1; + } else if (prague_ca(sk)->cwnd_mode == 1 && tcp_sk(sk)->snd_cwnd*tcp_mss_to_mtu(sk, tcp_sk(sk)->mss_cache) > prague_cwnd_transit*MTU_SYS) { + prague_ca(sk)->cwnd_mode = 0; + } } static u32 prague_ssthresh(struct sock *sk) @@ -882,7 +896,8 @@ static void prague_init(struct sock *sk) ca->saw_ce = !!tp->delivered_ce; ca->mtu_cache = tcp_mss_to_mtu(sk, tp->mss_cache); - ca->hsrtt_us = (tp->srtt_us) ? (tp->srtt_us << HSRTT_SHIFT) : (USEC_PER_MSEC << (HSRTT_SHIFT + 3)); + // Default as 1us + ca->hsrtt_us = (tp->srtt_us) ? ((u64)tp->srtt_us) << HSRTT_SHIFT : (1 << (HSRTT_SHIFT + 3)); ca->rate_offset = (prague_rate_offset && prague_rate_offset < ((1 << OFFSET_UNIT) -1)) ? prague_rate_offset : RATE_OFFSET ; /* reuse existing meaurement of SRTT as an intial starting point */ @@ -896,7 +911,7 @@ static void prague_init(struct sock *sk) tp->classic_ecn = 0ULL; tp->alpha = PRAGUE_MAX_ALPHA; /* Used ONLY to log alpha */ - + ca->cwnd_mode = 0; prague_new_round(sk); } From b55c9fafd20d0120eb79a3bb7c1dc03eb8a37f09 Mon Sep 17 00:00:00 2001 From: Chia-Yu Chang Date: Wed, 3 Jan 2024 03:28:44 +0100 Subject: [PATCH 19/47] Modify to align AccECN draft --- .github/workflows/kernel.yml | 2 +- include/linux/tcp.h | 3 +- include/net/tcp.h | 4 +-- net/ipv4/tcp_input.c | 31 +++++++++++++--- net/ipv4/tcp_minisocks.c | 68 +++++++++++++++++++++++++++--------- net/ipv4/tcp_output.c | 36 ++++++++++++++----- 6 files changed, 111 insertions(+), 33 deletions(-) diff --git a/.github/workflows/kernel.yml b/.github/workflows/kernel.yml index 168eb823cadc5..1c6d3320a606a 100644 --- a/.github/workflows/kernel.yml +++ b/.github/workflows/kernel.yml @@ -63,7 +63,7 @@ jobs: runs-on: ubuntu-20.04 needs: build permissions: write-all - if: ${{ github.ref == 'refs/heads/testing' || github.ref == 'refs/heads/ratebase'}} + if: ${{ github.ref == 'refs/heads/testing' || github.ref == 'refs/heads/ratebase' || github.ref == 'refs/heads/AccECN-2023'}} 
steps: - name: Get artifact uses: actions/download-artifact@v3 diff --git a/include/linux/tcp.h b/include/linux/tcp.h index d20f31b53a984..1557cf0241c8b 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -126,7 +126,8 @@ struct tcp_request_sock { #if IS_ENABLED(CONFIG_MPTCP) bool drop_req; #endif - u8 accecn_ok : 1, + u8 noect : 1, + accecn_ok : 1, saw_accecn_opt : 2, syn_ect_snt: 2, syn_ect_rcv: 2; diff --git a/include/net/tcp.h b/include/net/tcp.h index 754aa34bd3071..26be3bfe26de8 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -433,8 +433,8 @@ static inline int tcp_accecn_extract_syn_ect(u8 ace) } bool tcp_accecn_validate_syn_feedback(struct sock *sk, u8 ace, u8 sent_ect); -void tcp_accecn_third_ack(struct sock *sk, const struct sk_buff *skb, - u8 syn_ect_snt); +bool tcp_accecn_third_ack(struct sock *sk, const struct sk_buff *skb, + struct request_sock *req, u8 syn_ect_snt); u8 tcp_accecn_option_init(const struct sk_buff *skb, u8 opt_offset); void tcp_ecn_received_counters(struct sock *sk, const struct sk_buff *skb, u32 payload_len); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c561a28c8c9dc..fdd58702b7368 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -444,11 +444,31 @@ static void tcp_ecn_rcv_synack(struct sock *sk, const struct sk_buff *skb, tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); break; case 0x1: - case 0x5: if (tcp_ca_no_fallback_rfc3168(sk)) - tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); - else if (tcp_ecn_mode_pending(tp)) - tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); + tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); + else + tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); + break; + // [CY] 3.1.2. 
Backward Compatibility - If a TCP Client has sent a SYN requesting AccECN feedback with + // (AE,CWR,ECE) = (1,1,1) then receives a SYN/ACK with the currently reserved combination (AE,CWR,ECE) + // = (1,0,1) but it does not have logic specific to such a combination, the Client MUST enable AccECN + // mode as if the SYN/ACK confirmed that the Server supported AccECN and as if it fed back that the + // IP-ECN field on the SYN had arrived unchanged. + case 0x5: + if (tcp_ecn_mode_pending(tp)) { + tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); + tp->syn_ect_rcv = ip_dsfield & INET_ECN_MASK; + if (tp->rx_opt.accecn && + tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { + tp->saw_accecn_opt = tcp_accecn_option_init(skb, + tp->rx_opt.accecn); + tp->accecn_opt_demand = 2; + } + if (INET_ECN_is_ce(ip_dsfield)) { + tp->received_ce++; + tp->received_ce_pending++; + } + } break; default: tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); @@ -6997,7 +7017,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) tcp_initialize_rcv_mss(sk); if (tcp_ecn_mode_accecn(tp)) - tcp_accecn_third_ack(sk, skb, tp->syn_ect_snt); + tcp_accecn_third_ack(sk, skb, req, tp->syn_ect_snt); tcp_fast_path_on(tp); break; @@ -7198,6 +7218,7 @@ static void tcp_openreq_init(struct request_sock *req, tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; tcp_rsk(req)->snt_synack = 0; tcp_rsk(req)->last_oow_ack_time = 0; + tcp_rsk(req)->noect = 0; tcp_rsk(req)->accecn_ok = 0; tcp_rsk(req)->saw_accecn_opt = 0; tcp_rsk(req)->syn_ect_rcv = 0; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 0f54f94e30b5b..8450748d6873d 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -397,15 +397,26 @@ void tcp_openreq_init_rwin(struct request_sock *req, } EXPORT_SYMBOL(tcp_openreq_init_rwin); -void tcp_accecn_third_ack(struct sock *sk, const struct sk_buff *skb, - u8 syn_ect_snt) +bool tcp_accecn_third_ack(struct sock *sk, const struct sk_buff *skb, + struct request_sock *req, u8 
syn_ect_snt) { u8 ace = tcp_accecn_ace(tcp_hdr(skb)); struct tcp_sock *tp = tcp_sk(sk); + bool verify_ace = true; switch (ace) { case 0x0: tp->ecn_fail = 1; + // [CY] 3.2.2.1. ACE Field on the ACK of the SYN/ACK - If the Server is in AccECN mode and in SYN-RCVD + // state, and if it receives a value of zero on a pure ACK with SYN=0 and no SACK blocks, for the rest + // of the connection the Server MUST NOT set ECT on outgoing packets and MUST NOT respond to AccECN + // feedback. Nonetheless, as a Data Receiver it MUST NOT disable AccECN feedback. + if (!TCP_SKB_CB(skb)->sacked) { + inet_rsk(req)->ecn_ok = 0; + tcp_rsk(req)->accecn_ok = 0; + tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); + verify_ace = false; + } break; case 0x7: case 0x5: @@ -423,28 +434,37 @@ void tcp_accecn_third_ack(struct sock *sk, const struct sk_buff *skb, } break; } + return verify_ace; } static void tcp_ecn_openreq_child(struct sock *sk, - const struct request_sock *req, + struct request_sock *req, const struct sk_buff *skb) { - const struct tcp_request_sock *treq = tcp_rsk(req); + struct tcp_request_sock *treq = tcp_rsk(req); struct tcp_sock *tp = tcp_sk(sk); - if (treq->accecn_ok) { - const struct tcphdr *th = (const struct tcphdr *)skb->data; - tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); - tp->syn_ect_snt = treq->syn_ect_snt; - tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt); - tp->saw_accecn_opt = treq->saw_accecn_opt; - tp->prev_ecnfield = treq->syn_ect_rcv; - tp->accecn_opt_demand = 1; - tcp_ecn_received_counters(sk, skb, skb->len - th->doff * 4); + // [CY] 3.1.5. Implications of AccECN Mode - A TCP Server in AccECN mode: MUST NOT set ECT on + // any packet for the rest of the connection, if it has received or sent at least one valid + // SYN or Acceptable SYN/ACK with (AE,CWR,ECE) = (0,0,0) during the handshake. 
+ if (treq->noect) { + tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); } else { + if (treq->accecn_ok) { + const struct tcphdr *th = (const struct tcphdr *)skb->data; + if (tcp_accecn_third_ack(sk, skb, req, treq->syn_ect_snt)) { + tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); + tp->syn_ect_snt = treq->syn_ect_snt; + tp->saw_accecn_opt = treq->saw_accecn_opt; + tp->prev_ecnfield = treq->syn_ect_rcv; + tp->accecn_opt_demand = 1; + tcp_ecn_received_counters(sk, skb, skb->len - th->doff * 4); + } + } else { tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok && !tcp_ca_no_fallback_rfc3168(sk) ? TCP_ECN_MODE_RFC3168 : - TCP_ECN_DISABLED); + TCP_ECN_DISABLED); + } } } @@ -694,9 +714,24 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, */ if (!tcp_oow_rate_limited(sock_net(sk), skb, LINUX_MIB_TCPACKSKIPPEDSYNRECV, - &tcp_rsk(req)->last_oow_ack_time) && + &tcp_rsk(req)->last_oow_ack_time)) { + + if (tcp_rsk(req)->accecn_ok) { + if (tcp_accecn_ace(tcp_hdr(skb)) == 0x0) { + // [CY] 3.1.5. Implications of AccECN Mode - A TCP Server already in AccECN mode: SHOULD + // acknowledge a valid SYN arriving with (AE,CWR,ECE) =(0,0,0) by emitting an AccECN SYN/ACK (with + // the appropriate combination of TCP-ECN flags to feed back the IP-ECN field of this latest SYN) + tcp_sk(sk)->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; + + // [CY] 3.1.5. Implications of AccECN Mode - A TCP Server in AccECN mode: MUST NOT set ECT on + // any packet for the rest of the connection, if it has received or sent at least one valid + // SYN or Acceptable SYN/ACK with (AE,CWR,ECE) = (0,0,0) during the handshake. 
+ tcp_rsk(req)->noect = 1; + INET_ECN_dontxmit(sk); + } + } - !inet_rtx_syn_ack(sk, req)) { + if (!inet_rtx_syn_ack(sk, req)) { unsigned long expires = jiffies; expires += min(TCP_TIMEOUT_INIT << req->num_timeout, @@ -705,6 +740,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, mod_timer_pending(&req->rsk_timer, expires); else req->rsk_timer.expires = expires; + } } return NULL; } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 32c347fe2ccfe..8317b217bc68b 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -378,12 +378,27 @@ static void tcp_accecn_echo_syn_ect(struct tcphdr *th, u8 ect) } static void -tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th) +tcp_ecn_make_synack(struct sock *sk, struct request_sock *req, struct tcphdr *th) { - if (tcp_rsk(req)->accecn_ok) - tcp_accecn_echo_syn_ect(th, tcp_rsk(req)->syn_ect_rcv); - else if (inet_rsk(req)->ecn_ok) - th->ece = 1; + if (req->num_timeout < 2) { + if (tcp_rsk(req)->accecn_ok) + tcp_accecn_echo_syn_ect(th, tcp_rsk(req)->syn_ect_rcv); + else if (inet_rsk(req)->ecn_ok) + th->ece = 1; + } else if (tcp_rsk(req)->accecn_ok) { + // [CY] 3.2.3.2.2. Testing for Loss of Packets Carrying the AccECN Option - If this retransmission times out, + // to expedite connection setup, the TCP Server SHOULD retransmit the SYN/ACK with (AE,CWR,ECE) = (0,0,0) and + // no AccECN Option, but it remains in AccECN feedback mode + th->ae = 0; + th->cwr = 0; + th->ece = 0; + + // [CY] 3.1.5. Implications of AccECN Mode - A TCP Server in AccECN mode: MUST NOT set ECT on + // any packet for the rest of the connection, if it has received or sent at least one valid + // SYN or Acceptable SYN/ACK with (AE,CWR,ECE) = (0,0,0) during the handshake. 
+ tcp_rsk(req)->noect = 1; + INET_ECN_dontxmit(sk); + } } static void tcp_accecn_set_ace(struct tcp_sock *tp, struct sk_buff *skb, @@ -922,8 +937,11 @@ static bool tcp_accecn_option_beacon_check(const struct sock *sk) if (!sock_net(sk)->ipv4.sysctl_tcp_ecn_option_beacon) return false; - return tcp_stamp_us_delta(tp->tcp_mstamp, tp->accecn_opt_tstamp) >= - (tp->srtt_us >> (3 + TCP_ACCECN_BEACON_FREQ_SHIFT)); + // [CY] 6. Summary: Protocol Properties - REMOVE “However, it has to send a full-sized AccECN Option at least + // three times per RTT, which the Data Sender can rely on as a regular beacon or checkpoint.” + return false; + //return tcp_stamp_us_delta(tp->tcp_mstamp, tp->accecn_opt_tstamp) >= + // (tp->srtt_us >> (3 + TCP_ACCECN_BEACON_FREQ_SHIFT)); } /* Compute TCP options for SYN packets. This is not the final @@ -1086,6 +1104,8 @@ static unsigned int tcp_synack_options(const struct sock *sk, smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining); + // [CY] 3.2.3.2.2. Testing for Loss of Packets Carrying the AccECN Option - TCP Server SHOULD retransmit the + // SYN/ACK, but with no AccECN Option if (treq->accecn_ok && sock_net(sk)->ipv4.sysctl_tcp_ecn_option && req->num_timeout < 1 && (remaining >= TCPOLEN_ACCECN_BASE)) { opts->ecn_bytes = synack_ecn_bytes; @@ -3822,7 +3842,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, memset(th, 0, sizeof(struct tcphdr)); th->syn = 1; th->ack = 1; - tcp_ecn_make_synack(req, th); + tcp_ecn_make_synack((struct sock *)sk, req, th); th->source = htons(ireq->ir_num); th->dest = ireq->ir_rmt_port; skb->mark = ireq->ir_mark; From b06ab7c5c7938ce9d7d64e1e58828323bb5aaf8e Mon Sep 17 00:00:00 2001 From: Chia-Yu Chang Date: Fri, 19 Jan 2024 11:47:54 +0100 Subject: [PATCH 20/47] Modify how tcp_ecn_option_beacon works --- Documentation/networking/ip-sysctl.rst | 6 ++++++ include/net/tcp.h | 1 - net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/tcp_output.c | 11 ++++++----- 4 files changed, 13 insertions(+), 7 
deletions(-) diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 62d298f89828c..dbe87fe8cc568 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -431,6 +431,12 @@ tcp_ecn_option - INTEGER Default: 2 +tcp_ecn_option_beacon - INTEGER + Control Accurate ECN (AccECN) option sending frequency per RTT and it + takes effect only when tcp_ecn_option is set to 2. + + Default: 3 (AccECN will be sent at least 3 times per RTT) + tcp_ecn_fallback - BOOLEAN If the kernel detects that ECN connection misbehaves, enable fall back to non-ECN. Currently, this knob implements the fallback diff --git a/include/net/tcp.h b/include/net/tcp.h index 26be3bfe26de8..f26a931d2ae2a 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -228,7 +228,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); #define TCP_ACCECN_MAXSIZE (TCPOLEN_ACCECN_BASE + \ TCPOLEN_ACCECN_PERFIELD * \ TCP_ACCECN_NUMFIELDS) -#define TCP_ACCECN_BEACON_FREQ_SHIFT 2 /* Send option at least 2^2 times per RTT */ #define TCP_ACCECN_SAFETY_SHIFT 1 /* SAFETY_FACTOR in accecn draft */ /* tp->saw_accecn_opt states */ diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index faab5a4869dbc..bb1e70a5e5ec2 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3142,7 +3142,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_ecn = 2; net->ipv4.sysctl_tcp_ecn_option = 2; - net->ipv4.sysctl_tcp_ecn_option_beacon = 1; + net->ipv4.sysctl_tcp_ecn_option_beacon = 3; net->ipv4.sysctl_tcp_ecn_fallback = 1; net->ipv4.sysctl_tcp_ecn_unsafe_cep = 0; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 8317b217bc68b..99ae746753eba 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -937,11 +937,12 @@ static bool tcp_accecn_option_beacon_check(const struct sock *sk) if (!sock_net(sk)->ipv4.sysctl_tcp_ecn_option_beacon) return false; - // [CY] 6. 
Summary: Protocol Properties - REMOVE “However, it has to send a full-sized AccECN Option at least - // three times per RTT, which the Data Sender can rely on as a regular beacon or checkpoint.” - return false; - //return tcp_stamp_us_delta(tp->tcp_mstamp, tp->accecn_opt_tstamp) >= - // (tp->srtt_us >> (3 + TCP_ACCECN_BEACON_FREQ_SHIFT)); + /* [CY] AccECN period shall be larger than srtt[us]/TCP_ECN_OPTION_BEACON + * Following texts are removed in AccECN “6. Summary: Protocol Properties - However, it has to send a full-sized + * AccECN Option at least three times per RTT, which the Data Sender can rely on as a regular beacon or checkpoint.” + */ + return tcp_stamp_us_delta(tp->tcp_mstamp, tp->accecn_opt_tstamp) * sock_net(sk)->ipv4.sysctl_tcp_ecn_option_beacon >= + (tp->srtt_us >> 3); } /* Compute TCP options for SYN packets. This is not the final From ae3f31df5722995bbf717fd4ffe7bfa9c2e80826 Mon Sep 17 00:00:00 2001 From: Chia-Yu Date: Fri, 19 Jan 2024 15:11:58 +0100 Subject: [PATCH 21/47] Modification for ACCECN draft --- net/ipv4/tcp_minisocks.c | 39 +++++++++++++++++---------------------- net/ipv4/tcp_output.c | 6 ------ 2 files changed, 17 insertions(+), 28 deletions(-) diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 8450748d6873d..75b919d850c23 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -414,7 +414,11 @@ bool tcp_accecn_third_ack(struct sock *sk, const struct sk_buff *skb, if (!TCP_SKB_CB(skb)->sacked) { inet_rsk(req)->ecn_ok = 0; tcp_rsk(req)->accecn_ok = 0; + tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); + tp->prev_ecnfield = treq->syn_ect_rcv; + tp->accecn_opt_demand = 1; + tcp_ecn_received_counters(sk, skb, skb->len - th->doff * 4); verify_ace = false; } break; @@ -447,24 +451,20 @@ static void tcp_ecn_openreq_child(struct sock *sk, // [CY] 3.1.5. 
Implications of AccECN Mode - A TCP Server in AccECN mode: MUST NOT set ECT on // any packet for the rest of the connection, if it has received or sent at least one valid // SYN or Acceptable SYN/ACK with (AE,CWR,ECE) = (0,0,0) during the handshake. - if (treq->noect) { - tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); - } else { - if (treq->accecn_ok) { - const struct tcphdr *th = (const struct tcphdr *)skb->data; - if (tcp_accecn_third_ack(sk, skb, req, treq->syn_ect_snt)) { - tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); - tp->syn_ect_snt = treq->syn_ect_snt; - tp->saw_accecn_opt = treq->saw_accecn_opt; - tp->prev_ecnfield = treq->syn_ect_rcv; - tp->accecn_opt_demand = 1; - tcp_ecn_received_counters(sk, skb, skb->len - th->doff * 4); - } - } else { - tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok && !tcp_ca_no_fallback_rfc3168(sk) ? - TCP_ECN_MODE_RFC3168 : - TCP_ECN_DISABLED); + if (treq->accecn_ok) { + const struct tcphdr *th = (const struct tcphdr *)skb->data; + if (tcp_accecn_third_ack(sk, skb, req, treq->syn_ect_snt)) { + tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); + tp->syn_ect_snt = treq->syn_ect_snt; + tp->saw_accecn_opt = treq->saw_accecn_opt; + tp->prev_ecnfield = treq->syn_ect_rcv; + tp->accecn_opt_demand = 1; + tcp_ecn_received_counters(sk, skb, skb->len - th->doff * 4); } + } else { + tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok && !tcp_ca_no_fallback_rfc3168(sk) ? + TCP_ECN_MODE_RFC3168 : + TCP_ECN_DISABLED); } } @@ -723,11 +723,6 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, // the appropriate combination of TCP-ECN flags to feed back the IP-ECN field of this latest SYN) tcp_sk(sk)->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; - // [CY] 3.1.5. Implications of AccECN Mode - A TCP Server in AccECN mode: MUST NOT set ECT on - // any packet for the rest of the connection, if it has received or sent at least one valid - // SYN or Acceptable SYN/ACK with (AE,CWR,ECE) = (0,0,0) during the handshake. 
- tcp_rsk(req)->noect = 1; - INET_ECN_dontxmit(sk); } } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 99ae746753eba..c884210b21cc4 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -392,12 +392,6 @@ tcp_ecn_make_synack(struct sock *sk, struct request_sock *req, struct tcphdr *th th->ae = 0; th->cwr = 0; th->ece = 0; - - // [CY] 3.1.5. Implications of AccECN Mode - A TCP Server in AccECN mode: MUST NOT set ECT on - // any packet for the rest of the connection, if it has received or sent at least one valid - // SYN or Acceptable SYN/ACK with (AE,CWR,ECE) = (0,0,0) during the handshake. - tcp_rsk(req)->noect = 1; - INET_ECN_dontxmit(sk); } } From bb25bcf7350fae2b174666c7c1572ae9ea836460 Mon Sep 17 00:00:00 2001 From: Chia-Yu Chang <125277758+minuscat@users.noreply.github.com> Date: Fri, 19 Jan 2024 15:23:54 +0100 Subject: [PATCH 22/47] Update kernel.yml --- .github/workflows/kernel.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/kernel.yml b/.github/workflows/kernel.yml index 1c6d3320a606a..4c81b75c797e5 100644 --- a/.github/workflows/kernel.yml +++ b/.github/workflows/kernel.yml @@ -63,7 +63,7 @@ jobs: runs-on: ubuntu-20.04 needs: build permissions: write-all - if: ${{ github.ref == 'refs/heads/testing' || github.ref == 'refs/heads/ratebase' || github.ref == 'refs/heads/AccECN-2023'}} + if: ${{ github.ref == 'refs/heads/testing' || github.ref == 'refs/heads/ratebase' || github.ref == 'refs/heads/AccECN-2023' || github.ref == 'refs/heads/AccECN-2024'}} steps: - name: Get artifact uses: actions/download-artifact@v3 From 12a84bc0a1b5996a20e4e1a627530d0181ca5393 Mon Sep 17 00:00:00 2001 From: Chia-Yu Chang <125277758+minuscat@users.noreply.github.com> Date: Fri, 19 Jan 2024 15:25:24 +0100 Subject: [PATCH 23/47] Update tcp_minisocks.c --- net/ipv4/tcp_minisocks.c | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 
75b919d850c23..f78c540f13f2c 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -407,20 +407,6 @@ bool tcp_accecn_third_ack(struct sock *sk, const struct sk_buff *skb, switch (ace) { case 0x0: tp->ecn_fail = 1; - // [CY] 3.2.2.1. ACE Field on the ACK of the SYN/ACK - If the Server is in AccECN mode and in SYN-RCVD - // state, and if it receives a value of zero on a pure ACK with SYN=0 and no SACK blocks, for the rest - // of the connection the Server MUST NOT set ECT on outgoing packets and MUST NOT respond to AccECN - // feedback. Nonetheless, as a Data Receiver it MUST NOT disable AccECN feedback. - if (!TCP_SKB_CB(skb)->sacked) { - inet_rsk(req)->ecn_ok = 0; - tcp_rsk(req)->accecn_ok = 0; - - tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); - tp->prev_ecnfield = treq->syn_ect_rcv; - tp->accecn_opt_demand = 1; - tcp_ecn_received_counters(sk, skb, skb->len - th->doff * 4); - verify_ace = false; - } break; case 0x7: case 0x5: From 230e7ae791293db96b0696a9e4ac79b839097f4e Mon Sep 17 00:00:00 2001 From: Chia-Yu Date: Fri, 19 Jan 2024 18:09:50 +0100 Subject: [PATCH 24/47] Modification for ACCECN draft --- net/ipv4/tcp_input.c | 33 +++++++++++++++++---------------- net/ipv4/tcp_minisocks.c | 2 +- net/ipv4/tcp_output.c | 6 +++--- 3 files changed, 21 insertions(+), 20 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index fdd58702b7368..820330e81eba5 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -444,6 +444,7 @@ static void tcp_ecn_rcv_synack(struct sock *sk, const struct sk_buff *skb, tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); break; case 0x1: + case 0x5: if (tcp_ca_no_fallback_rfc3168(sk)) tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); else @@ -454,22 +455,22 @@ static void tcp_ecn_rcv_synack(struct sock *sk, const struct sk_buff *skb, // = (1,0,1) but it does not have logic specific to such a combination, the Client MUST enable AccECN // mode as if the SYN/ACK confirmed that the Server supported AccECN and as if it fed back 
that the // IP-ECN field on the SYN had arrived unchanged. - case 0x5: - if (tcp_ecn_mode_pending(tp)) { - tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); - tp->syn_ect_rcv = ip_dsfield & INET_ECN_MASK; - if (tp->rx_opt.accecn && - tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { - tp->saw_accecn_opt = tcp_accecn_option_init(skb, - tp->rx_opt.accecn); - tp->accecn_opt_demand = 2; - } - if (INET_ECN_is_ce(ip_dsfield)) { - tp->received_ce++; - tp->received_ce_pending++; - } - } - break; + //case 0x5: + // if (tcp_ecn_mode_pending(tp)) { + // tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); + // tp->syn_ect_rcv = ip_dsfield & INET_ECN_MASK; + // if (tp->rx_opt.accecn && + // tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { + // tp->saw_accecn_opt = tcp_accecn_option_init(skb, + // tp->rx_opt.accecn); + // tp->accecn_opt_demand = 2; + // } + // if (INET_ECN_is_ce(ip_dsfield)) { + // tp->received_ce++; + // tp->received_ce_pending++; + // } + // } + // break; default: tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); tp->syn_ect_rcv = ip_dsfield & INET_ECN_MASK; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index f78c540f13f2c..deacdb22aab32 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -707,7 +707,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, // [CY] 3.1.5. 
Implications of AccECN Mode - A TCP Server already in AccECN mode: SHOULD // acknowledge a valid SYN arriving with (AE,CWR,ECE) =(0,0,0) by emitting an AccECN SYN/ACK (with // the appropriate combination of TCP-ECN flags to feed back the IP-ECN field of this latest SYN) - tcp_sk(sk)->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; + //tcp_sk(sk)->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; } } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index c884210b21cc4..9237c9f7a4efd 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -389,9 +389,9 @@ tcp_ecn_make_synack(struct sock *sk, struct request_sock *req, struct tcphdr *th // [CY] 3.2.3.2.2. Testing for Loss of Packets Carrying the AccECN Option - If this retransmission times out, // to expedite connection setup, the TCP Server SHOULD retransmit the SYN/ACK with (AE,CWR,ECE) = (0,0,0) and // no AccECN Option, but it remains in AccECN feedback mode - th->ae = 0; - th->cwr = 0; - th->ece = 0; + //th->ae = 0; + //th->cwr = 0; + //th->ece = 0; } } From 472b228b6effa285a9064dfd4bc8df596f723f93 Mon Sep 17 00:00:00 2001 From: Chia-Yu Date: Sat, 20 Jan 2024 11:04:29 +0100 Subject: [PATCH 25/47] Revert "Modification for ACCECN draft" This reverts commit 230e7ae791293db96b0696a9e4ac79b839097f4e. 
--- net/ipv4/tcp_input.c | 33 ++++++++++++++++----------------- net/ipv4/tcp_minisocks.c | 2 +- net/ipv4/tcp_output.c | 6 +++--- 3 files changed, 20 insertions(+), 21 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 820330e81eba5..fdd58702b7368 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -444,7 +444,6 @@ static void tcp_ecn_rcv_synack(struct sock *sk, const struct sk_buff *skb, tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); break; case 0x1: - case 0x5: if (tcp_ca_no_fallback_rfc3168(sk)) tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); else @@ -455,22 +454,22 @@ static void tcp_ecn_rcv_synack(struct sock *sk, const struct sk_buff *skb, // = (1,0,1) but it does not have logic specific to such a combination, the Client MUST enable AccECN // mode as if the SYN/ACK confirmed that the Server supported AccECN and as if it fed back that the // IP-ECN field on the SYN had arrived unchanged. - //case 0x5: - // if (tcp_ecn_mode_pending(tp)) { - // tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); - // tp->syn_ect_rcv = ip_dsfield & INET_ECN_MASK; - // if (tp->rx_opt.accecn && - // tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { - // tp->saw_accecn_opt = tcp_accecn_option_init(skb, - // tp->rx_opt.accecn); - // tp->accecn_opt_demand = 2; - // } - // if (INET_ECN_is_ce(ip_dsfield)) { - // tp->received_ce++; - // tp->received_ce_pending++; - // } - // } - // break; + case 0x5: + if (tcp_ecn_mode_pending(tp)) { + tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); + tp->syn_ect_rcv = ip_dsfield & INET_ECN_MASK; + if (tp->rx_opt.accecn && + tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { + tp->saw_accecn_opt = tcp_accecn_option_init(skb, + tp->rx_opt.accecn); + tp->accecn_opt_demand = 2; + } + if (INET_ECN_is_ce(ip_dsfield)) { + tp->received_ce++; + tp->received_ce_pending++; + } + } + break; default: tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); tp->syn_ect_rcv = ip_dsfield & INET_ECN_MASK; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c 
index deacdb22aab32..f78c540f13f2c 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -707,7 +707,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, // [CY] 3.1.5. Implications of AccECN Mode - A TCP Server already in AccECN mode: SHOULD // acknowledge a valid SYN arriving with (AE,CWR,ECE) =(0,0,0) by emitting an AccECN SYN/ACK (with // the appropriate combination of TCP-ECN flags to feed back the IP-ECN field of this latest SYN) - //tcp_sk(sk)->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; + tcp_sk(sk)->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; } } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 9237c9f7a4efd..c884210b21cc4 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -389,9 +389,9 @@ tcp_ecn_make_synack(struct sock *sk, struct request_sock *req, struct tcphdr *th // [CY] 3.2.3.2.2. Testing for Loss of Packets Carrying the AccECN Option - If this retransmission times out, // to expedite connection setup, the TCP Server SHOULD retransmit the SYN/ACK with (AE,CWR,ECE) = (0,0,0) and // no AccECN Option, but it remains in AccECN feedback mode - //th->ae = 0; - //th->cwr = 0; - //th->ece = 0; + th->ae = 0; + th->cwr = 0; + th->ece = 0; } } From 2fc01fe6844a2f364600f919b25d09c1e4a6df99 Mon Sep 17 00:00:00 2001 From: Chia-Yu Date: Sat, 20 Jan 2024 11:33:05 +0100 Subject: [PATCH 26/47] Modification for ACCECN draft --- net/ipv4/tcp_minisocks.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index f78c540f13f2c..149e12d5c6f69 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -407,6 +407,16 @@ bool tcp_accecn_third_ack(struct sock *sk, const struct sk_buff *skb, switch (ace) { case 0x0: tp->ecn_fail = 1; + // [CY] 3.2.2.1. 
ACE Field on the ACK of the SYN/ACK - If the Server is in AccECN mode and in SYN-RCVD + // state, and if it receives a value of zero on a pure ACK with SYN=0 and no SACK blocks, for the rest + // of the connection the Server MUST NOT set ECT on outgoing packets and MUST NOT respond to AccECN + // feedback. Nonetheless, as a Data Receiver it MUST NOT disable AccECN feedback. + if (!TCP_SKB_CB(skb)->sacked) { + inet_rsk(req)->ecn_ok = 0; + tcp_rsk(req)->accecn_ok = 0; + tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); + verify_ace = false; + } break; case 0x7: case 0x5: @@ -443,10 +453,10 @@ static void tcp_ecn_openreq_child(struct sock *sk, tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); tp->syn_ect_snt = treq->syn_ect_snt; tp->saw_accecn_opt = treq->saw_accecn_opt; - tp->prev_ecnfield = treq->syn_ect_rcv; - tp->accecn_opt_demand = 1; - tcp_ecn_received_counters(sk, skb, skb->len - th->doff * 4); } + tp->prev_ecnfield = treq->syn_ect_rcv; + tp->accecn_opt_demand = 1; + tcp_ecn_received_counters(sk, skb, skb->len - th->doff * 4); } else { tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok && !tcp_ca_no_fallback_rfc3168(sk) ? TCP_ECN_MODE_RFC3168 : From 04b6c9851681024aaa26cc203d0c6a115a9f4285 Mon Sep 17 00:00:00 2001 From: Chia-Yu Date: Sat, 20 Jan 2024 16:58:40 +0100 Subject: [PATCH 27/47] Modification for ACCECN draft --- net/ipv4/tcp_minisocks.c | 11 ++++++++--- net/ipv4/tcp_output.c | 5 +++++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 149e12d5c6f69..47623c7bc3e8f 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -412,8 +412,6 @@ bool tcp_accecn_third_ack(struct sock *sk, const struct sk_buff *skb, // of the connection the Server MUST NOT set ECT on outgoing packets and MUST NOT respond to AccECN // feedback. Nonetheless, as a Data Receiver it MUST NOT disable AccECN feedback. 
if (!TCP_SKB_CB(skb)->sacked) { - inet_rsk(req)->ecn_ok = 0; - tcp_rsk(req)->accecn_ok = 0; tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); verify_ace = false; } @@ -451,9 +449,11 @@ static void tcp_ecn_openreq_child(struct sock *sk, const struct tcphdr *th = (const struct tcphdr *)skb->data; if (tcp_accecn_third_ack(sk, skb, req, treq->syn_ect_snt)) { tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); - tp->syn_ect_snt = treq->syn_ect_snt; tp->saw_accecn_opt = treq->saw_accecn_opt; + } else { + tp->saw_accecn_opt = TCP_ACCECN_OPT_FAIL; } + tp->syn_ect_snt = treq->syn_ect_snt; tp->prev_ecnfield = treq->syn_ect_rcv; tp->accecn_opt_demand = 1; tcp_ecn_received_counters(sk, skb, skb->len - th->doff * 4); @@ -719,6 +719,11 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, // the appropriate combination of TCP-ECN flags to feed back the IP-ECN field of this latest SYN) tcp_sk(sk)->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; + // [CY] 3.1.5. Implications of AccECN Mode - A TCP Server in AccECN mode: MUST NOT set ECT on + // any packet for the rest of the connection, if it has received or sent at least one valid + // SYN or Acceptable SYN/ACK with (AE,CWR,ECE) = (0,0,0) during the handshake + tcp_sk(sk)->ecn_fail = 1; + INET_ECN_dontxmit(sk); } } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index c884210b21cc4..5bf109464f6b5 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -392,6 +392,11 @@ tcp_ecn_make_synack(struct sock *sk, struct request_sock *req, struct tcphdr *th th->ae = 0; th->cwr = 0; th->ece = 0; + // [CY] 3.1.5. Implications of AccECN Mode - A TCP Server in AccECN mode: MUST NOT set ECT on + // any packet for the rest of the connection, if it has received or sent at least one valid + // SYN or Acceptable SYN/ACK with (AE,CWR,ECE) = (0,0,0) during the handshake. 
+ tcp_sk(sk)->ecn_fail = 1; + INET_ECN_dontxmit(sk); } } From a449debb5c81f55cb04b5fe07e903563294bc581 Mon Sep 17 00:00:00 2001 From: Chia-Yu Date: Sun, 21 Jan 2024 03:37:15 +0100 Subject: [PATCH 28/47] Modification for ACCECN draft --- include/linux/tcp.h | 4 ++-- net/ipv4/tcp.c | 1 + net/ipv4/tcp_input.c | 5 ++--- net/ipv4/tcp_minisocks.c | 33 +++++++++++++++------------------ net/ipv4/tcp_output.c | 2 +- 5 files changed, 21 insertions(+), 24 deletions(-) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 1557cf0241c8b..50f036db09bf8 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -126,8 +126,7 @@ struct tcp_request_sock { #if IS_ENABLED(CONFIG_MPTCP) bool drop_req; #endif - u8 noect : 1, - accecn_ok : 1, + u8 accecn_ok : 1, saw_accecn_opt : 2, syn_ect_snt: 2, syn_ect_rcv: 2; @@ -234,6 +233,7 @@ struct tcp_sock { syn_ect_snt:2, /* AccECN ECT memory, only */ syn_ect_rcv:2, /* ... needed durign 3WHS + first seqno */ ecn_fail:1; /* ECN reflector detected path mangling */ + u8 accecn_no_process:1; /* AccECN no response on feedback */ u8 saw_accecn_opt:2, /* An AccECN option was seen */ fast_ack_mode:2, /* which fast ack mode ? 
*/ unused:4; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9721d7f0db9b9..a9ccb83a85482 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3033,6 +3033,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->delivered_ce = 0; tp->saw_accecn_opt = 0; tp->ecn_fail = 0; + tp->accecn_no_process = 0; tcp_accecn_init_counters(tp); tp->prev_ecnfield = 0; tp->accecn_opt_tstamp = 0; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index fdd58702b7368..56d2a9a454fd1 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -595,7 +595,7 @@ static bool tcp_accecn_process_option(struct tcp_sock *tp, bool order1, res; unsigned int i; - if (tp->saw_accecn_opt == TCP_ACCECN_OPT_FAIL) + if (tp->saw_accecn_opt == TCP_ACCECN_OPT_FAIL || tp->accecn_no_process) return false; if (!(flag & FLAG_SLOWPATH) || !tp->rx_opt.accecn) { @@ -7017,7 +7017,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) tcp_initialize_rcv_mss(sk); if (tcp_ecn_mode_accecn(tp)) - tcp_accecn_third_ack(sk, skb, req, tp->syn_ect_snt); + tcp_accecn_third_ack(sk, skb, tp->syn_ect_snt); tcp_fast_path_on(tp); break; @@ -7218,7 +7218,6 @@ static void tcp_openreq_init(struct request_sock *req, tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; tcp_rsk(req)->snt_synack = 0; tcp_rsk(req)->last_oow_ack_time = 0; - tcp_rsk(req)->noect = 0; tcp_rsk(req)->accecn_ok = 0; tcp_rsk(req)->saw_accecn_opt = 0; tcp_rsk(req)->syn_ect_rcv = 0; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 47623c7bc3e8f..c0a448571ae70 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -397,24 +397,25 @@ void tcp_openreq_init_rwin(struct request_sock *req, } EXPORT_SYMBOL(tcp_openreq_init_rwin); -bool tcp_accecn_third_ack(struct sock *sk, const struct sk_buff *skb, - struct request_sock *req, u8 syn_ect_snt) +void tcp_accecn_third_ack(struct sock *sk, const struct sk_buff *skb, + u8 syn_ect_snt) { u8 ace = tcp_accecn_ace(tcp_hdr(skb)); struct tcp_sock *tp = tcp_sk(sk); - 
bool verify_ace = true; switch (ace) { case 0x0: - tp->ecn_fail = 1; // [CY] 3.2.2.1. ACE Field on the ACK of the SYN/ACK - If the Server is in AccECN mode and in SYN-RCVD // state, and if it receives a value of zero on a pure ACK with SYN=0 and no SACK blocks, for the rest // of the connection the Server MUST NOT set ECT on outgoing packets and MUST NOT respond to AccECN // feedback. Nonetheless, as a Data Receiver it MUST NOT disable AccECN feedback. - if (!TCP_SKB_CB(skb)->sacked) { - tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); - verify_ace = false; - } + tp->ecn_fail = 1; + tp->accecn_no_process = 1; + //INET_ECN_dontxmit(sk); + //if (!TCP_SKB_CB(skb)->sacked) { + // tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); + // verify_ace = false; + //} break; case 0x7: case 0x5: @@ -432,14 +433,13 @@ bool tcp_accecn_third_ack(struct sock *sk, const struct sk_buff *skb, } break; } - return verify_ace; } static void tcp_ecn_openreq_child(struct sock *sk, - struct request_sock *req, + const struct request_sock *req, const struct sk_buff *skb) { - struct tcp_request_sock *treq = tcp_rsk(req); + const struct tcp_request_sock *treq = tcp_rsk(req); struct tcp_sock *tp = tcp_sk(sk); // [CY] 3.1.5. Implications of AccECN Mode - A TCP Server in AccECN mode: MUST NOT set ECT on @@ -447,13 +447,10 @@ static void tcp_ecn_openreq_child(struct sock *sk, // SYN or Acceptable SYN/ACK with (AE,CWR,ECE) = (0,0,0) during the handshake. 
if (treq->accecn_ok) { const struct tcphdr *th = (const struct tcphdr *)skb->data; - if (tcp_accecn_third_ack(sk, skb, req, treq->syn_ect_snt)) { - tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); - tp->saw_accecn_opt = treq->saw_accecn_opt; - } else { - tp->saw_accecn_opt = TCP_ACCECN_OPT_FAIL; - } + tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); tp->syn_ect_snt = treq->syn_ect_snt; + tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt); + tp->saw_accecn_opt = treq->saw_accecn_opt; tp->prev_ecnfield = treq->syn_ect_rcv; tp->accecn_opt_demand = 1; tcp_ecn_received_counters(sk, skb, skb->len - th->doff * 4); @@ -723,7 +720,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, // any packet for the rest of the connection, if it has received or sent at least one valid // SYN or Acceptable SYN/ACK with (AE,CWR,ECE) = (0,0,0) during the handshake tcp_sk(sk)->ecn_fail = 1; - INET_ECN_dontxmit(sk); + //INET_ECN_dontxmit(sk); } } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 5bf109464f6b5..04313da0783dc 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -396,7 +396,7 @@ tcp_ecn_make_synack(struct sock *sk, struct request_sock *req, struct tcphdr *th // any packet for the rest of the connection, if it has received or sent at least one valid // SYN or Acceptable SYN/ACK with (AE,CWR,ECE) = (0,0,0) during the handshake. 
tcp_sk(sk)->ecn_fail = 1; - INET_ECN_dontxmit(sk); + //INET_ECN_dontxmit(sk); } } From 0e900e57ac8f015b44f226ef53f78c48586b09a4 Mon Sep 17 00:00:00 2001 From: Chia-Yu Date: Sun, 21 Jan 2024 08:44:50 +0100 Subject: [PATCH 29/47] Modification for ACCECN draft --- Documentation/networking/ip-sysctl.rst | 4 ++-- include/net/tcp.h | 4 ++-- net/ipv4/tcp_minisocks.c | 24 ++++++++++++------------ net/ipv4/tcp_output.c | 2 +- 4 files changed, 17 insertions(+), 17 deletions(-) diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index dbe87fe8cc568..88c315841f3d6 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -433,9 +433,9 @@ tcp_ecn_option - INTEGER tcp_ecn_option_beacon - INTEGER Control Accurate ECN (AccECN) option sending frequency per RTT and it - take effect only when tcp_ecn_option is set to 2. + takes effect only when tcp_ecn_option is set to 2. - Default: 3 (AccECN will be send at least 3 times per RTT) + Default: 1 (AccECN will be send at least 1 time per RTT) tcp_ecn_fallback - BOOLEAN If the kernel detects that ECN connection misbehaves, enable fall diff --git a/include/net/tcp.h b/include/net/tcp.h index f26a931d2ae2a..6745b64ca7051 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -432,8 +432,8 @@ static inline int tcp_accecn_extract_syn_ect(u8 ace) } bool tcp_accecn_validate_syn_feedback(struct sock *sk, u8 ace, u8 sent_ect); -bool tcp_accecn_third_ack(struct sock *sk, const struct sk_buff *skb, - struct request_sock *req, u8 syn_ect_snt); +void tcp_accecn_third_ack(struct sock *sk, const struct sk_buff *skb, + u8 syn_ect_snt); u8 tcp_accecn_option_init(const struct sk_buff *skb, u8 opt_offset); void tcp_ecn_received_counters(struct sock *sk, const struct sk_buff *skb, u32 payload_len); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index c0a448571ae70..95a4bfdb189e1 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ 
-398,7 +398,7 @@ void tcp_openreq_init_rwin(struct request_sock *req, EXPORT_SYMBOL(tcp_openreq_init_rwin); void tcp_accecn_third_ack(struct sock *sk, const struct sk_buff *skb, - u8 syn_ect_snt) + u8 syn_ect_snt) { u8 ace = tcp_accecn_ace(tcp_hdr(skb)); struct tcp_sock *tp = tcp_sk(sk); @@ -446,18 +446,18 @@ static void tcp_ecn_openreq_child(struct sock *sk, // any packet for the rest of the connection, if it has received or sent at least one valid // SYN or Acceptable SYN/ACK with (AE,CWR,ECE) = (0,0,0) during the handshake. if (treq->accecn_ok) { - const struct tcphdr *th = (const struct tcphdr *)skb->data; - tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); - tp->syn_ect_snt = treq->syn_ect_snt; - tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt); - tp->saw_accecn_opt = treq->saw_accecn_opt; - tp->prev_ecnfield = treq->syn_ect_rcv; - tp->accecn_opt_demand = 1; - tcp_ecn_received_counters(sk, skb, skb->len - th->doff * 4); + const struct tcphdr *th = (const struct tcphdr *)skb->data; + tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); + tp->syn_ect_snt = treq->syn_ect_snt; + tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt); + tp->saw_accecn_opt = treq->saw_accecn_opt; + tp->prev_ecnfield = treq->syn_ect_rcv; + tp->accecn_opt_demand = 1; + tcp_ecn_received_counters(sk, skb, skb->len - th->doff * 4); } else { - tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok && !tcp_ca_no_fallback_rfc3168(sk) ? - TCP_ECN_MODE_RFC3168 : - TCP_ECN_DISABLED); + tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok && !tcp_ca_no_fallback_rfc3168(sk) ? 
+ TCP_ECN_MODE_RFC3168 : + TCP_ECN_DISABLED); } } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 04313da0783dc..e0987c3e7ed4e 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -378,7 +378,7 @@ static void tcp_accecn_echo_syn_ect(struct tcphdr *th, u8 ect) } static void -tcp_ecn_make_synack(struct sock *sk, struct request_sock *req, struct tcphdr *th) +tcp_ecn_make_synack(struct sock *sk, const struct request_sock *req, struct tcphdr *th) { if (req->num_timeout < 2) { if (tcp_rsk(req)->accecn_ok) From 24da58c05a036e69f364fd84e7ed65c0b699f947 Mon Sep 17 00:00:00 2001 From: Chia-Yu Date: Sun, 21 Jan 2024 11:44:16 +0100 Subject: [PATCH 30/47] Fix typo in tcp.c --- net/ipv4/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index a9ccb83a85482..a2058babe9d84 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3033,7 +3033,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->delivered_ce = 0; tp->saw_accecn_opt = 0; tp->ecn_fail = 0; - tp->accecn_no_preocess = 0; + tp->accecn_no_process = 0; tcp_accecn_init_counters(tp); tp->prev_ecnfield = 0; tp->accecn_opt_tstamp = 0; From 7a6a4eeac615f43353f9bd58958d201d8cb4d15c Mon Sep 17 00:00:00 2001 From: Chia-Yu Date: Sun, 21 Jan 2024 17:40:49 +0100 Subject: [PATCH 31/47] Update retx counter for synack --- net/ipv4/tcp_timer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index e873a768f9e82..46d40b967a68e 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -428,8 +428,8 @@ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req) * regular retransmit because if the child socket has been accepted * it's not good to give up too easily. 
*/ - inet_rtx_syn_ack(sk, req); req->num_timeout++; + inet_rtx_syn_ack(sk, req); icsk->icsk_retransmits++; if (!tp->retrans_stamp) tp->retrans_stamp = tcp_time_stamp(tp); From 3941f6d878415bc54058720aa733bfbb5ae3c6ca Mon Sep 17 00:00:00 2001 From: Chia-Yu Date: Sun, 21 Jan 2024 23:47:45 +0100 Subject: [PATCH 32/47] Add is_rtx flag --- include/net/request_sock.h | 4 +++- net/ipv4/inet_connection_sock.c | 1 + net/ipv4/tcp_output.c | 4 ++-- net/ipv4/tcp_timer.c | 2 +- 4 files changed, 7 insertions(+), 4 deletions(-) diff --git a/include/net/request_sock.h b/include/net/request_sock.h index 29e41ff3ec933..d382e540f3298 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -62,7 +62,8 @@ struct request_sock { u16 mss; u8 num_retrans; /* number of retransmits */ u8 syncookie:1; /* syncookie: encode tcpopts in timestamp */ - u8 num_timeout:7; /* number of timeouts */ + u8 num_timeout:7, + is_rtx:1; /* number of timeouts */ u32 ts_recent; struct timer_list rsk_timer; const struct request_sock_ops *rsk_ops; @@ -105,6 +106,7 @@ reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener, sk_tx_queue_clear(req_to_sk(req)); req->saved_syn = NULL; req->num_timeout = 0; + req->is_rtx = 0; req->num_retrans = 0; req->sk = NULL; refcount_set(&req->rsk_refcnt, 0); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index a53f9bf7886f0..9d20f2456cf2f 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -688,6 +688,7 @@ static void syn_ack_recalc(struct request_sock *req, int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req) { + req->is_rtx = 1; int err = req->rsk_ops->rtx_syn_ack(parent, req); if (!err) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e0987c3e7ed4e..6f2a4ff81a584 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -380,7 +380,7 @@ static void tcp_accecn_echo_syn_ect(struct tcphdr *th, u8 ect) static void 
tcp_ecn_make_synack(struct sock *sk, const struct request_sock *req, struct tcphdr *th) { - if (req->num_timeout < 2) { + if (!req->is_rtx || req->num_timeout < 1) { if (tcp_rsk(req)->accecn_ok) tcp_accecn_echo_syn_ect(th, tcp_rsk(req)->syn_ect_rcv); else if (inet_rsk(req)->ecn_ok) @@ -1107,7 +1107,7 @@ static unsigned int tcp_synack_options(const struct sock *sk, // [CY] 3.2.3.2.2. Testing for Loss of Packets Carrying the AccECN Option - TCP Server SHOULD retransmit the // SYN/ACK, but with no AccECN Option if (treq->accecn_ok && sock_net(sk)->ipv4.sysctl_tcp_ecn_option && - req->num_timeout < 1 && (remaining >= TCPOLEN_ACCECN_BASE)) { + !req->is_rtx && (remaining >= TCPOLEN_ACCECN_BASE)) { opts->ecn_bytes = synack_ecn_bytes; remaining -= tcp_options_fit_accecn(opts, 0, remaining, tcp_synack_options_combine_saving(opts)); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 46d40b967a68e..e873a768f9e82 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -428,8 +428,8 @@ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req) * regular retransmit because if the child socket has been accepted * it's not good to give up too easily. */ - req->num_timeout++; inet_rtx_syn_ack(sk, req); + req->num_timeout++; icsk->icsk_retransmits++; if (!tp->retrans_stamp) tp->retrans_stamp = tcp_time_stamp(tp); From 3ff07b9511a5e49eb428f160455535f849e81309 Mon Sep 17 00:00:00 2001 From: Chia-Yu Date: Mon, 22 Jan 2024 10:40:59 +0100 Subject: [PATCH 33/47] Fix missing SYN retx in AccECN draft --- net/ipv4/tcp_output.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 6f2a4ff81a584..695b437ff06d8 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3451,12 +3451,20 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) tcp_retrans_try_collapse(sk, skb, avail_wnd); } - /* RFC3168, section 6.1.1.1. 
ECN fallback - * As AccECN uses the same SYN flags (+ AE), this check covers both - * cases. + /* [CY] 3.1.4.1. Retransmitted SYNs - If the sender of an AccECN SYN (the TCP Client) times out before receiving the SYN/ACK, + * it SHOULD attempt to negotiate the use of AccECN at least one more time by continuing to set all three TCP ECN flags + * (AE,CWR,ECE) = (1,1,1) on the first retransmitted SYN (using the usual retransmission time-outs). If this first + * retransmission also fails to be acknowledged, in deployment scenarios where AccECN path traversal might be problematic, the + * TCP Client SHOULD send subsequent retransmissions of the SYN with the three TCP-ECN flags cleared (AE,CWR,ECE) = (0,0,0). */ - if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN) - tcp_ecn_clear_syn(sk, skb); + if (!tcp_ecn_mode_pending(tp) || icsk->icsk_retransmits > 1) { + /* RFC3168, section 6.1.1.1. ECN fallback + * As AccECN uses the same SYN flags (+ AE), this check covers both + * cases. + */ + if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN) + tcp_ecn_clear_syn(sk, skb); + } /* Update global and local TCP statistics. */ segs = tcp_skb_pcount(skb); From 15a3b7ce1ab461d76c7d2bf58ae4bdd8932e18a7 Mon Sep 17 00:00:00 2001 From: Chia-Yu Date: Mon, 22 Jan 2024 16:44:19 +0100 Subject: [PATCH 34/47] Update syn_ack_rcv for challenge ack --- net/ipv4/tcp_input.c | 5 +++++ net/ipv4/tcp_minisocks.c | 6 ------ net/ipv4/tcp_output.c | 1 - 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 56d2a9a454fd1..bfa026e766380 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6235,6 +6235,11 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, if (th->syn) { if (tcp_ecn_mode_accecn(tp)) { send_accecn_reflector = true; + /* [CY] 3.1.5. 
Implications of AccECN Mode - A host in AccECN mode that is feeding back the IP-ECN + * field on a SYN or SYN/ACK: MUST feed back the IP-ECN field on the latest valid SYN or acceptable + * SYN/ACK to arrive.” + */ + tp->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; if (tp->rx_opt.accecn && tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { tp->saw_accecn_opt = tcp_accecn_option_init(skb, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 95a4bfdb189e1..acf58f2c224be 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -411,11 +411,6 @@ void tcp_accecn_third_ack(struct sock *sk, const struct sk_buff *skb, // feedback. Nonetheless, as a Data Receiver it MUST NOT disable AccECN feedback. tp->ecn_fail = 1; tp->accecn_no_process = 1; - //INET_ECN_dontxmit(sk); - //if (!TCP_SKB_CB(skb)->sacked) { - // tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); - // verify_ace = false; - //} break; case 0x7: case 0x5: @@ -720,7 +715,6 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, // any packet for the rest of the connection, if it has received or sent at least one valid // SYN or Acceptable SYN/ACK with (AE,CWR,ECE) = (0,0,0) during the handshake tcp_sk(sk)->ecn_fail = 1; - //INET_ECN_dontxmit(sk); } } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 695b437ff06d8..ddc5f1c4d06c3 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -396,7 +396,6 @@ tcp_ecn_make_synack(struct sock *sk, const struct request_sock *req, struct tcph // any packet for the rest of the connection, if it has received or sent at least one valid // SYN or Acceptable SYN/ACK with (AE,CWR,ECE) = (0,0,0) during the handshake. 
tcp_sk(sk)->ecn_fail = 1; - //INET_ECN_dontxmit(sk); } } From 08367ec3369f82259117c01b26eb17fe351f5b85 Mon Sep 17 00:00:00 2001 From: Chia-Yu Date: Tue, 23 Jan 2024 01:40:28 +0100 Subject: [PATCH 35/47] Update syn_ect_rcv for retx SYN --- net/ipv4/tcp_minisocks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index acf58f2c224be..4054fcbf12dac 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -709,7 +709,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, // [CY] 3.1.5. Implications of AccECN Mode - A TCP Server already in AccECN mode: SHOULD // acknowledge a valid SYN arriving with (AE,CWR,ECE) =(0,0,0) by emitting an AccECN SYN/ACK (with // the appropriate combination of TCP-ECN flags to feed back the IP-ECN field of this latest SYN) - tcp_sk(sk)->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; + tcp_rsk(req)->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; // [CY] 3.1.5. 
Implications of AccECN Mode - A TCP Server in AccECN mode: MUST NOT set ECT on // any packet for the rest of the connection, if it has received or sent at least one valid From b9fd5d0bfa39704b371927b79fc16ecac406020c Mon Sep 17 00:00:00 2001 From: Chia-Yu Date: Tue, 23 Jan 2024 11:18:27 +0100 Subject: [PATCH 36/47] Modification for ACCECN draft --- net/ipv4/tcp_minisocks.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 4054fcbf12dac..5d661a5c31802 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -705,15 +705,16 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, &tcp_rsk(req)->last_oow_ack_time)) { if (tcp_rsk(req)->accecn_ok) { + /* [CY] 3.1.5 Implications of AccECN Mode - A host in AccECN mode that is feeding back the IP-ECN + * field on a SYN or SYN/ACK: MUST feed back the IP-ECN field on the latest valid SYN or acceptable + * SYN/ACK to arrive. + */ + tcp_rsk(req)->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; if (tcp_accecn_ace(tcp_hdr(skb)) == 0x0) { - // [CY] 3.1.5. Implications of AccECN Mode - A TCP Server already in AccECN mode: SHOULD - // acknowledge a valid SYN arriving with (AE,CWR,ECE) =(0,0,0) by emitting an AccECN SYN/ACK (with - // the appropriate combination of TCP-ECN flags to feed back the IP-ECN field of this latest SYN) - tcp_rsk(req)->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; - - // [CY] 3.1.5. Implications of AccECN Mode - A TCP Server in AccECN mode: MUST NOT set ECT on - // any packet for the rest of the connection, if it has received or sent at least one valid - // SYN or Acceptable SYN/ACK with (AE,CWR,ECE) = (0,0,0) during the handshake + /* [CY] 3.1.5. 
Implications of AccECN Mode - A TCP Server in AccECN mode: MUST NOT set ECT on + * any packet for the rest of the connection, if it has received or sent at least one valid + * SYN or Acceptable SYN/ACK with (AE,CWR,ECE) = (0,0,0) during the handshake + */ tcp_sk(sk)->ecn_fail = 1; } } From a974c3b451539a64f21d884270528515823b43a5 Mon Sep 17 00:00:00 2001 From: Chia-Yu Chang Date: Tue, 23 Jan 2024 19:25:34 +0100 Subject: [PATCH 37/47] Modify update_cwr and clean duplicate code --- net/ipv4/tcp_prague.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/net/ipv4/tcp_prague.c b/net/ipv4/tcp_prague.c index 03021dbbb02da..0eaf1889c8303 100644 --- a/net/ipv4/tcp_prague.c +++ b/net/ipv4/tcp_prague.c @@ -680,26 +680,21 @@ static void prague_enter_cwr(struct sock *sk) u64 reduction; u64 alpha; - if (prague_is_rtt_indep(sk) && (ca->cwnd_mode == 1 && likely(ca->saw_ce))) { - if ((prague_target_rtt(sk) >> 3) > tcp_stamp_us_delta(tp->tcp_mstamp, - ca->cwr_stamp)) - return; - ca->cwr_stamp = tp->tcp_mstamp; - alpha = ca->upscaled_alpha >> PRAGUE_SHIFT_G; + if (prague_is_rtt_indep(sk) && + (prague_target_rtt(sk) >> 3) > tcp_stamp_us_delta(tp->tcp_mstamp, + ca->cwr_stamp)) + return; + ca->cwr_stamp = tp->tcp_mstamp; + alpha = ca->upscaled_alpha >> PRAGUE_SHIFT_G; - if (prague_ecn_fallback == 1 && tp->classic_ecn > L_STICKY) - alpha = prague_classic_ecn_fallback(tp, alpha); + if (prague_ecn_fallback == 1 && tp->classic_ecn > L_STICKY) + alpha = prague_classic_ecn_fallback(tp, alpha); + if (prague_is_rtt_indep(sk) && (ca->cwnd_mode == 1 && likely(ca->saw_ce))) { reduction = mul_64_64_shift(ca->rate_bytes, alpha, PRAGUE_ALPHA_BITS + 1); ca->rate_bytes = max_t(u64, ca->rate_bytes - reduction, MINIMUM_RATE); ca->frac_cwnd = prague_pacing_rate_to_frac_cwnd(sk); } else { - ca->cwr_stamp = tp->tcp_mstamp; - alpha = ca->upscaled_alpha >> PRAGUE_SHIFT_G; - - if (prague_ecn_fallback == 1 && tp->classic_ecn > L_STICKY) - alpha = 
prague_classic_ecn_fallback(tp, alpha); - reduction = (alpha * (ca->frac_cwnd) + /* Unbias the rounding by adding 1/2 */ PRAGUE_MAX_ALPHA) >> From 37343541ac36dd69690ce8d326ea75eac11034fa Mon Sep 17 00:00:00 2001 From: Chia-Yu Chang Date: Tue, 23 Jan 2024 20:14:25 +0100 Subject: [PATCH 38/47] Parametrize the mode --- net/ipv4/tcp_prague.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_prague.c b/net/ipv4/tcp_prague.c index 0eaf1889c8303..e93b2b4ad1061 100644 --- a/net/ipv4/tcp_prague.c +++ b/net/ipv4/tcp_prague.c @@ -104,6 +104,7 @@ #define RATE_OFFSET 4 #define OFFSET_UNIT 7 #define HSRTT_SHIFT 7 +#define DEFAULT_MODE 0 /* 0: win-base; 1: rate-base */ #define PRAGUE_MAX_SRTT_BITS 18U #define PRAGUE_MAX_MDEV_BITS (PRAGUE_MAX_SRTT_BITS+1) @@ -543,7 +544,7 @@ static void prague_update_alpha(struct sock *sk) } } skip: - ca->hsrtt_us = ca->hsrtt_us - (ca->hsrtt_us >> HSRTT_SHIFT) + tp->srtt_us; + ca->hsrtt_us = ca->hsrtt_us + (u64)tp->srtt_us - (ca->hsrtt_us >> HSRTT_SHIFT); prague_new_round(sk); } @@ -906,7 +907,7 @@ static void prague_init(struct sock *sk) tp->classic_ecn = 0ULL; tp->alpha = PRAGUE_MAX_ALPHA; /* Used ONLY to log alpha */ - ca->cwnd_mode = 0; + ca->cwnd_mode = DEFAULT_MODE; prague_new_round(sk); } From 8780b899edb517ce6dbea01514368f953603e5e1 Mon Sep 17 00:00:00 2001 From: Chia-Yu Date: Thu, 25 Jan 2024 16:00:58 +0100 Subject: [PATCH 39/47] Include the case of 3.2.3.2.2. for packet loss carrying AccECN option --- include/linux/tcp.h | 3 ++- net/ipv4/tcp.c | 1 + net/ipv4/tcp_input.c | 29 ++++++++++++++++++++--------- net/ipv4/tcp_minisocks.c | 16 ++++++++-------- net/ipv4/tcp_output.c | 21 ++++++++++++--------- 5 files changed, 43 insertions(+), 27 deletions(-) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 50f036db09bf8..cc1013374a96c 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -233,7 +233,8 @@ struct tcp_sock { syn_ect_snt:2, /* AccECN ECT memory, only */ syn_ect_rcv:2, /* ... 
needed durign 3WHS + first seqno */ ecn_fail:1; /* ECN reflector detected path mangling */ - u8 accecn_no_process:1; /* AccECN no response on feedback */ + u8 accecn_no_process:1, /* AccECN no response on feedback */ + accecn_no_options:1; u8 saw_accecn_opt:2, /* An AccECN option was seen */ fast_ack_mode:2, /* which fast ack mode ? */ unused:4; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index a2058babe9d84..8b4fce6fef4ce 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3034,6 +3034,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->saw_accecn_opt = 0; tp->ecn_fail = 0; tp->accecn_no_process = 0; + tp->accecn_no_options = 0; tcp_accecn_init_counters(tp); tp->prev_ecnfield = 0; tp->accecn_opt_tstamp = 0; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index bfa026e766380..94aed08fd16e9 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -449,11 +449,11 @@ static void tcp_ecn_rcv_synack(struct sock *sk, const struct sk_buff *skb, else tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); break; - // [CY] 3.1.2. Backward Compatibility - If a TCP Client has sent a SYN requesting AccECN feedback with - // (AE,CWR,ECE) = (1,1,1) then receives a SYN/ACK with the currently reserved combination (AE,CWR,ECE) - // = (1,0,1) but it does not have logic specific to such a combination, the Client MUST enable AccECN - // mode as if the SYN/ACK confirmed that the Server supported AccECN and as if it fed back that the - // IP-ECN field on the SYN had arrived unchanged. + /* [CY] 3.1.2. Backward Compatibility - If a TCP Client has sent a SYN requesting AccECN feedback with (AE,CWR,ECE) = + * (1,1,1) then receives a SYN/ACK with the currently reserved combination (AE,CWR,ECE) = (1,0,1) but it does not + * have logic specific to such a combination, the Client MUST enable AccECN mode as if the SYN/ACK confirmed that the + * Server supported AccECN and as if it fed back that the IP-ECN field on the SYN had arrived unchanged. 
+ */ case 0x5: if (tcp_ecn_mode_pending(tp)) { tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); @@ -595,7 +595,7 @@ static bool tcp_accecn_process_option(struct tcp_sock *tp, bool order1, res; unsigned int i; - if (tp->saw_accecn_opt == TCP_ACCECN_OPT_FAIL || tp->accecn_no_process) + if (tp->saw_accecn_opt == TCP_ACCECN_OPT_FAIL || tp->accecn_no_response) return false; if (!(flag & FLAG_SLOWPATH) || !tp->rx_opt.accecn) { @@ -4893,8 +4893,19 @@ static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb) * DSACK state and change the txhash to re-route speculatively. */ if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq && - sk_rethink_txhash(sk)) + sk_rethink_txhash(sk)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDUPLICATEDATAREHASH); + /* [CY] 3.2.3.2.2. Testing for Loss of Packets Carrying the AccECN Option - If a middlebox is dropping + * packets with options it does not recognize, a host that is sending little or no data but mostly pure + * ACKs will not inherently detect such losses. Such a host MAY detect loss of ACKs carrying the AccECN + * Option by detecting whether the acknowledged data always reappears as a retransmission. In such cases, + * the host SHOULD disable the sending of the AccECN Option for this half-connection. + */ + if (tcp_ecn_mode_accecn(tp)) + tcp_sock(sk)->accecn_no_options = 1; + } + + } } static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb) @@ -6235,8 +6246,8 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, if (th->syn) { if (tcp_ecn_mode_accecn(tp)) { send_accecn_reflector = true; - /* [CY] 3.1.5. Implications of AccECN Mode - A host in AccECN mode that is feeding back the IP-ECN - * field on a SYN or SYN/ACK: MUST feed back the IP-ECN field on the latest valid SYN or acceptable + /* [CY] 3.1.5. 
Implications of AccECN Mode - A host in AccECN mode that is feeding back the IP-ECN + * field on a SYN or SYN/ACK: MUST feed back the IP-ECN field on the latest valid SYN or acceptable * SYN/ACK to arrive.” */ tp->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 5d661a5c31802..2066097415f23 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -405,12 +405,12 @@ void tcp_accecn_third_ack(struct sock *sk, const struct sk_buff *skb, switch (ace) { case 0x0: - // [CY] 3.2.2.1. ACE Field on the ACK of the SYN/ACK - If the Server is in AccECN mode and in SYN-RCVD + // [CY] 3.2.2.1. ACE Field on the ACK of the SYN/ACK - If the Server is in AccECN mode and in SYN-RCVD // state, and if it receives a value of zero on a pure ACK with SYN=0 and no SACK blocks, for the rest - // of the connection the Server MUST NOT set ECT on outgoing packets and MUST NOT respond to AccECN + // of the connection the Server MUST NOT set ECT on outgoing packets and MUST NOT respond to AccECN // feedback. Nonetheless, as a Data Receiver it MUST NOT disable AccECN feedback. tp->ecn_fail = 1; - tp->accecn_no_process = 1; + tp->accecn_no_response = 1; break; case 0x7: case 0x5: @@ -437,8 +437,8 @@ static void tcp_ecn_openreq_child(struct sock *sk, const struct tcp_request_sock *treq = tcp_rsk(req); struct tcp_sock *tp = tcp_sk(sk); - // [CY] 3.1.5. Implications of AccECN Mode - A TCP Server in AccECN mode: MUST NOT set ECT on - // any packet for the rest of the connection, if it has received or sent at least one valid + // [CY] 3.1.5. Implications of AccECN Mode - A TCP Server in AccECN mode: MUST NOT set ECT on + // any packet for the rest of the connection, if it has received or sent at least one valid // SYN or Acceptable SYN/ACK with (AE,CWR,ECE) = (0,0,0) during the handshake. 
if (treq->accecn_ok) { const struct tcphdr *th = (const struct tcphdr *)skb->data; @@ -706,13 +706,13 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, if (tcp_rsk(req)->accecn_ok) { /* [CY] 3.1.5 Implications of AccECN Mode - A host in AccECN mode that is feeding back the IP-ECN - * field on a SYN or SYN/ACK: MUST feed back the IP-ECN field on the latest valid SYN or acceptable + * field on a SYN or SYN/ACK: MUST feed back the IP-ECN field on the latest valid SYN or acceptable * SYN/ACK to arrive. */ tcp_rsk(req)->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield & INET_ECN_MASK; if (tcp_accecn_ace(tcp_hdr(skb)) == 0x0) { - /* [CY] 3.1.5. Implications of AccECN Mode - A TCP Server in AccECN mode: MUST NOT set ECT on - * any packet for the rest of the connection, if it has received or sent at least one valid + /* [CY] 3.1.5. Implications of AccECN Mode - A TCP Server in AccECN mode: MUST NOT set ECT on + * any packet for the rest of the connection, if it has received or sent at least one valid * SYN or Acceptable SYN/ACK with (AE,CWR,ECE) = (0,0,0) during the handshake */ tcp_sk(sk)->ecn_fail = 1; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index ddc5f1c4d06c3..4806686bcb49f 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -386,15 +386,17 @@ tcp_ecn_make_synack(struct sock *sk, const struct request_sock *req, struct tcph else if (inet_rsk(req)->ecn_ok) th->ece = 1; } else if (tcp_rsk(req)->accecn_ok) { - // [CY] 3.2.3.2.2. Testing for Loss of Packets Carrying the AccECN Option - If this retransmission times out, - // to expedite connection setup, the TCP Server SHOULD retransmit the SYN/ACK with (AE,CWR,ECE) = (0,0,0) and - // no AccECN Option, but it remains in AccECN feedback mode + /* [CY] 3.2.3.2.2. 
Testing for Loss of Packets Carrying the AccECN Option - If this retransmission times out, + * to expedite connection setup, the TCP Server SHOULD retransmit the SYN/ACK with (AE,CWR,ECE) = (0,0,0) and + * no AccECN Option, but it remains in AccECN feedback mode + */ th->ae = 0; th->cwr = 0; th->ece = 0; - // [CY] 3.1.5. Implications of AccECN Mode - A TCP Server in AccECN mode: MUST NOT set ECT on - // any packet for the rest of the connection, if it has received or sent at least one valid - // SYN or Acceptable SYN/ACK with (AE,CWR,ECE) = (0,0,0) during the handshake. + /* [CY] 3.1.5. Implications of AccECN Mode - A TCP Server in AccECN mode: MUST NOT set ECT on any packet for + * the rest of the connection, if it has received or sent at least one valid SYN or Acceptable SYN/ACK with + * (AE,CWR,ECE) = (0,0,0) during the handshake. + */ tcp_sk(sk)->ecn_fail = 1; } } @@ -1103,8 +1105,9 @@ static unsigned int tcp_synack_options(const struct sock *sk, smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining); - // [CY] 3.2.3.2.2. Testing for Loss of Packets Carrying the AccECN Option - TCP Server SHOULD retransmit the - // SYN/ACK, but with no AccECN Option + /* [CY] 3.2.3.2.2. 
Testing for Loss of Packets Carrying the AccECN Option - TCP Server SHOULD retransmit the + * SYN/ACK, but with no AccECN Option + */ if (treq->accecn_ok && sock_net(sk)->ipv4.sysctl_tcp_ecn_option && !req->is_rtx && (remaining >= TCPOLEN_ACCECN_BASE)) { opts->ecn_bytes = synack_ecn_bytes; @@ -1186,7 +1189,7 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb if (tcp_ecn_mode_accecn(tp) && sock_net(sk)->ipv4.sysctl_tcp_ecn_option && - (tp->saw_accecn_opt && tp->saw_accecn_opt != TCP_ACCECN_OPT_FAIL)) { + (tp->saw_accecn_opt && tp->saw_accecn_opt != TCP_ACCECN_OPT_FAIL && !tp->accecn_no_options)) { if (sock_net(sk)->ipv4.sysctl_tcp_ecn_option >= 2 || tp->accecn_opt_demand || tcp_accecn_option_beacon_check(sk)) { From 821d92cc77deac49a46cd514676266a122ef799e Mon Sep 17 00:00:00 2001 From: Chia-Yu Date: Thu, 25 Jan 2024 17:28:55 +0100 Subject: [PATCH 40/47] Fix typo --- include/linux/tcp.h | 4 ++-- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_minisocks.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index cc1013374a96c..ee11ba9a1f1d7 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -233,8 +233,8 @@ struct tcp_sock { syn_ect_snt:2, /* AccECN ECT memory, only */ syn_ect_rcv:2, /* ... needed durign 3WHS + first seqno */ ecn_fail:1; /* ECN reflector detected path mangling */ - u8 accecn_no_process:1, /* AccECN no response on feedback */ - accecn_no_options:1; + u8 accecn_no_respond:1, /* AccECN no response on feedback */ + accecn_no_options:1; /* AccECN no options send out */ u8 saw_accecn_opt:2, /* An AccECN option was seen */ fast_ack_mode:2, /* which fast ack mode ? 
*/ unused:4; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 8b4fce6fef4ce..ad66e0cd8c01a 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3033,7 +3033,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->delivered_ce = 0; tp->saw_accecn_opt = 0; tp->ecn_fail = 0; - tp->accecn_no_process = 0; + tp->accecn_no_respond = 0; tp->accecn_no_options = 0; tcp_accecn_init_counters(tp); tp->prev_ecnfield = 0; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 94aed08fd16e9..3e019e324c461 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -595,7 +595,7 @@ static bool tcp_accecn_process_option(struct tcp_sock *tp, bool order1, res; unsigned int i; - if (tp->saw_accecn_opt == TCP_ACCECN_OPT_FAIL || tp->accecn_no_response) + if (tp->saw_accecn_opt == TCP_ACCECN_OPT_FAIL || tp->accecn_no_respond) return false; if (!(flag & FLAG_SLOWPATH) || !tp->rx_opt.accecn) { diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 2066097415f23..7318de84a6c80 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -410,7 +410,7 @@ void tcp_accecn_third_ack(struct sock *sk, const struct sk_buff *skb, // of the connection the Server MUST NOT set ECT on outgoing packets and MUST NOT respond to AccECN // feedback. Nonetheless, as a Data Receiver it MUST NOT disable AccECN feedback. 
tp->ecn_fail = 1; - tp->accecn_no_response = 1; + tp->accecn_no_respond = 1; break; case 0x7: case 0x5: From 08b1f990ee1f55e95fc2ad4ea6979bdcd2db4f0d Mon Sep 17 00:00:00 2001 From: Chia-Yu Date: Tue, 30 Jan 2024 17:20:30 +0100 Subject: [PATCH 41/47] Fix typo and comments --- net/ipv4/tcp_input.c | 1 - net/ipv4/tcp_minisocks.c | 16 +++++++++------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3e019e324c461..760e69e593c65 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4903,7 +4903,6 @@ static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb) */ if (tcp_ecn_mode_accecn(tp)) tcp_sock(sk)->accecn_no_options = 1; - } } } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 7318de84a6c80..8aeeca7423dc0 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -405,10 +405,11 @@ void tcp_accecn_third_ack(struct sock *sk, const struct sk_buff *skb, switch (ace) { case 0x0: - // [CY] 3.2.2.1. ACE Field on the ACK of the SYN/ACK - If the Server is in AccECN mode and in SYN-RCVD - // state, and if it receives a value of zero on a pure ACK with SYN=0 and no SACK blocks, for the rest - // of the connection the Server MUST NOT set ECT on outgoing packets and MUST NOT respond to AccECN - // feedback. Nonetheless, as a Data Receiver it MUST NOT disable AccECN feedback. + /* [CY] 3.2.2.1. ACE Field on the ACK of the SYN/ACK - If the Server is in AccECN mode and in SYN-RCVD + * state, and if it receives a value of zero on a pure ACK with SYN=0 and no SACK blocks, for the rest + * of the connection the Server MUST NOT set ECT on outgoing packets and MUST NOT respond to AccECN + * feedback. Nonetheless, as a Data Receiver it MUST NOT disable AccECN feedback. 
+ */ tp->ecn_fail = 1; tp->accecn_no_respond = 1; break; @@ -437,9 +438,10 @@ static void tcp_ecn_openreq_child(struct sock *sk, const struct tcp_request_sock *treq = tcp_rsk(req); struct tcp_sock *tp = tcp_sk(sk); - // [CY] 3.1.5. Implications of AccECN Mode - A TCP Server in AccECN mode: MUST NOT set ECT on - // any packet for the rest of the connection, if it has received or sent at least one valid - // SYN or Acceptable SYN/ACK with (AE,CWR,ECE) = (0,0,0) during the handshake. + /* [CY] 3.1.5. Implications of AccECN Mode - A TCP Server in AccECN mode: MUST NOT set ECT oni + * any packet for the rest of the connection, if it has received or sent at least one valid + * SYN or Acceptable SYN/ACK with (AE,CWR,ECE) = (0,0,0) during the handshake. + */ if (treq->accecn_ok) { const struct tcphdr *th = (const struct tcphdr *)skb->data; tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); From 4538174db72a276736f9c5e517b43926be79cc67 Mon Sep 17 00:00:00 2001 From: Chia-Yu Date: Tue, 30 Jan 2024 18:21:17 +0100 Subject: [PATCH 42/47] Fix typo --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 760e69e593c65..4eeba5e3bfe04 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4901,7 +4901,7 @@ static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb) * Option by detecting whether the acknowledged data always reappears as a retransmission. In such cases, * the host SHOULD disable the sending of the AccECN Option for this half-connection. 
*/ - if (tcp_ecn_mode_accecn(tp)) + if (tcp_ecn_mode_accecn(tcp_sock(sk))) tcp_sock(sk)->accecn_no_options = 1; } From bde09fefe1e049e25a745b6860d2b5e99c034ac7 Mon Sep 17 00:00:00 2001 From: Chia-Yu Date: Tue, 30 Jan 2024 19:23:54 +0100 Subject: [PATCH 43/47] Fix typo --- net/ipv4/tcp_input.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4eeba5e3bfe04..68835c3010a41 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4901,8 +4901,8 @@ static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb) * Option by detecting whether the acknowledged data always reappears as a retransmission. In such cases, * the host SHOULD disable the sending of the AccECN Option for this half-connection. */ - if (tcp_ecn_mode_accecn(tcp_sock(sk))) - tcp_sock(sk)->accecn_no_options = 1; + if (tcp_ecn_mode_accecn(tcp_sk(sk))) + tcp_sk(sk)->accecn_no_options = 1; } } From 5d809aaa59df0fc2f484a3d614f6c8ce798d3812 Mon Sep 17 00:00:00 2001 From: Chia-Yu Date: Sat, 3 Feb 2024 15:33:06 +0100 Subject: [PATCH 44/47] Add ACE check on first data ACK --- include/linux/tcp.h | 3 ++- net/ipv4/tcp.c | 1 + net/ipv4/tcp_input.c | 15 +++++++++++++++ net/ipv4/tcp_minisocks.c | 2 +- 4 files changed, 19 insertions(+), 2 deletions(-) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index ee11ba9a1f1d7..73ab3e9c79421 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -234,7 +234,8 @@ struct tcp_sock { syn_ect_rcv:2, /* ... needed durign 3WHS + first seqno */ ecn_fail:1; /* ECN reflector detected path mangling */ u8 accecn_no_respond:1, /* AccECN no response on feedback */ - accecn_no_options:1; /* AccECN no options send out */ + accecn_no_options:1, /* AccECN no options send out */ + first_data_ack:1; /* Check for first data ack */ u8 saw_accecn_opt:2, /* An AccECN option was seen */ fast_ack_mode:2, /* which fast ack mode ? 
*/ unused:4; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index ad66e0cd8c01a..3f7a83eb6faef 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3035,6 +3035,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->ecn_fail = 0; tp->accecn_no_respond = 0; tp->accecn_no_options = 0; + tp->first_data_ack = 0; tcp_accecn_init_counters(tp); tp->prev_ecnfield = 0; tp->accecn_opt_tstamp = 0; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 68835c3010a41..dee1e31d9b8a4 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -703,6 +703,21 @@ static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, if (flag & FLAG_SYN_ACKED) return 0; + /* [CY] 3.2.2.4. Testing for Zeroing of the ACE Field - If AccECN has been successfully negotiated, the Data Sender + * MAY check the value of the ACE counter in the first feedback packet (with or without data) that arrives after the + * 3-way handshake. If the value of this ACE field is found to be zero (0b000), for the remainder of the half- + * connection the Data Sender ought to send non-ECN-capable packets and it is advised not to respond to any feedback + * of CE markings. + */ + if (!tp->first_data_ack) { + tp->first_data_ack = 1; + if (!tcp_accecn_ace(tcp_hdr(skb))) { + tp->ecn_fail = 1; + tp->accecn_no_respond = 1; + return 0; + } + } + if (tp->received_ce_pending >= TCP_ACCECN_ACE_MAX_DELTA) inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 8aeeca7423dc0..ba104fb82c7d5 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -438,7 +438,7 @@ static void tcp_ecn_openreq_child(struct sock *sk, const struct tcp_request_sock *treq = tcp_rsk(req); struct tcp_sock *tp = tcp_sk(sk); - /* [CY] 3.1.5. Implications of AccECN Mode - A TCP Server in AccECN mode: MUST NOT set ECT oni + /* [CY] 3.1.5. 
Implications of AccECN Mode - A TCP Server in AccECN mode: MUST NOT set ECT on * any packet for the rest of the connection, if it has received or sent at least one valid * SYN or Acceptable SYN/ACK with (AE,CWR,ECE) = (0,0,0) during the handshake. */ From ebae93fe595af2c517daa156df765b7c4ba97c83 Mon Sep 17 00:00:00 2001 From: Chia-Yu Date: Mon, 5 Feb 2024 22:11:12 +0100 Subject: [PATCH 45/47] Add ACE check on first data ACK --- net/ipv4/tcp_input.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index dee1e31d9b8a4..c2c72a2688efd 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -711,8 +711,9 @@ static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, */ if (!tp->first_data_ack) { tp->first_data_ack = 1; - if (!tcp_accecn_ace(tcp_hdr(skb))) { + if (tcp_accecn_ace(tcp_hdr(skb)) == 0x0) { tp->ecn_fail = 1; + INET_ECN_dontxmit(sk); tp->accecn_no_respond = 1; return 0; } From 623cd470dd64fcf2522b7246fdf5a4bafbe45495 Mon Sep 17 00:00:00 2001 From: Chia-Yu Chang <125277758+minuscat@users.noreply.github.com> Date: Wed, 7 Feb 2024 13:07:17 +0100 Subject: [PATCH 46/47] Update kernel.yml --- .github/workflows/kernel.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/kernel.yml b/.github/workflows/kernel.yml index 4c81b75c797e5..1c6d3320a606a 100644 --- a/.github/workflows/kernel.yml +++ b/.github/workflows/kernel.yml @@ -63,7 +63,7 @@ jobs: runs-on: ubuntu-20.04 needs: build permissions: write-all - if: ${{ github.ref == 'refs/heads/testing' || github.ref == 'refs/heads/ratebase' || github.ref == 'refs/heads/AccECN-2023' || github.ref == 'refs/heads/AccECN-2024'}} + if: ${{ github.ref == 'refs/heads/testing' || github.ref == 'refs/heads/ratebase' || github.ref == 'refs/heads/AccECN-2023'}} steps: - name: Get artifact uses: actions/download-artifact@v3 From dc111c5216ca1acde6042849e827a96f2a7da644 Mon Sep 17 00:00:00 2001 From: Chia-Yu Chang 
Date: Thu, 16 May 2024 16:51:14 +0200 Subject: [PATCH 47/47] Update for fractional congestion window --- net/ipv4/tcp_prague.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/ipv4/tcp_prague.c b/net/ipv4/tcp_prague.c index e93b2b4ad1061..5a8da0c05b2bd 100644 --- a/net/ipv4/tcp_prague.c +++ b/net/ipv4/tcp_prague.c @@ -554,7 +554,7 @@ static void prague_update_cwnd(struct sock *sk, const struct rate_sample *rs) struct tcp_sock *tp = tcp_sk(sk); u64 increase; s64 acked; - u32 new_cwnd; + u64 new_cwnd; u64 divisor; u64 mtu_used; @@ -589,9 +589,9 @@ static void prague_update_cwnd(struct sock *sk, const struct rate_sample *rs) ca->frac_cwnd = max_t(u64, ca->frac_cwnd + acked, prague_pacing_rate_to_frac_cwnd(sk)); } else { increase = acked * ca->ai_ack_increase; - new_cwnd = tp->snd_cwnd; + new_cwnd = ca->frac_cwnd; if (likely(new_cwnd)) - increase = div_u64(increase + (new_cwnd >> 1), new_cwnd); + increase = div64_u64((increase << CWND_UNIT) + (new_cwnd >> 1), new_cwnd); ca->frac_cwnd += max_t(u64, acked, increase); }