From cc17c6805556044d1643cb401028759dcf3282d0 Mon Sep 17 00:00:00 2001
From: Simon Sundberg
Date: Thu, 16 Oct 2025 13:36:45 +0200
Subject: [PATCH 1/4] Minor refactor of network filtering

The filter_ifindex() and filter_network_ns() functions are often used
together. Therefore, add a filter_network() function that bundles the
two together.

Signed-off-by: Simon Sundberg
---
 netstacklat/netstacklat.bpf.c | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/netstacklat/netstacklat.bpf.c b/netstacklat/netstacklat.bpf.c
index 1a1b0afe..b0a53ccf 100644
--- a/netstacklat/netstacklat.bpf.c
+++ b/netstacklat/netstacklat.bpf.c
@@ -188,11 +188,17 @@ static bool filter_network_ns(struct sk_buff *skb, struct sock *sk)
 	return get_network_ns(skb, sk) == user_config.network_ns;
 }
 
+static bool filter_network(struct sk_buff *skb, struct sock *sk)
+{
+	if (!filter_ifindex(skb ? skb->skb_iif : sk ? sk->sk_rx_dst_ifindex : 0))
+		return false;
+
+	return filter_network_ns(skb, sk);
+}
 
 static void record_skb_latency(struct sk_buff *skb, struct sock *sk, enum netstacklat_hook hook)
 {
 	struct hist_key key = { .hook = hook };
-	u32 ifindex;
 
 	if (bpf_core_field_exists(skb->tstamp_type)) {
 		/*
@@ -217,15 +223,11 @@ static void record_skb_latency(struct sk_buff *skb, struct sock *sk, enum netsta
 			return;
 	}
 
-	ifindex = skb->skb_iif;
-	if (!filter_ifindex(ifindex))
-		return;
-
-	if (!filter_network_ns(skb, sk))
+	if (!filter_network(skb, sk))
 		return;
 
 	if (user_config.groupby_ifindex)
-		key.ifindex = ifindex;
+		key.ifindex = skb->skb_iif;
 
 	record_latency_since(skb->tstamp, &key);
 }
@@ -305,7 +307,6 @@ static void record_socket_latency(struct sock *sk, struct sk_buff *skb,
 {
 	struct hist_key key = { .hook = hook };
 	u64 cgroup = 0;
-	u32 ifindex;
 
 	if (!filter_min_sockqueue_len(sk))
 		return;
@@ -316,15 +317,11 @@ static void record_socket_latency(struct sock *sk, struct sk_buff *skb,
 	if (!filter_current_task(cgroup))
 		return;
 
-	ifindex = skb ? skb->skb_iif : sk->sk_rx_dst_ifindex;
-	if (!filter_ifindex(ifindex))
-		return;
-
-	if (!filter_network_ns(skb, sk))
+	if (!filter_network(skb, sk))
 		return;
 
 	if (user_config.groupby_ifindex)
-		key.ifindex = ifindex;
+		key.ifindex = skb ? skb->skb_iif : sk->sk_rx_dst_ifindex;
 
 	if (user_config.groupby_cgroup)
 		key.cgroup = cgroup;

From dff6bc54988ea492072a58d22ca61ffad00184c0 Mon Sep 17 00:00:00 2001
From: Simon Sundberg
Date: Thu, 16 Oct 2025 20:42:40 +0200
Subject: [PATCH 2/4] netstacklat: Exclude TCP reads for HOL blocked segments

The 'tcp-socket-read' hook currently reports the latency for the skb
containing the last TCP segment read from the socket. However, this
segment might have been head of line (HOL) blocked by a previous
segment missing. In this case, netstacklat's reported latency will
include HOL blocking periods that depend on external factors (such as
network packet loss and network latency, which affect the
retransmission time). As netstacklat is primarily intended to identify
issues within the local host (in the network stack or receiving
applications), by default filter out any socket reads where the last
read SKB might have experienced HOL blocking.

Add the new -y/--include-tcp-hol-delay option to retain the old
behavior of reporting the latency for all reads, including those that
are HOL-blocked. This may be useful in scenarios where you still want
to be aware of latency issues caused by HOL blocking, even though they
are caused by external components.
For example, in a data center context where you have full control over
the network, it may still be relevant to monitor HOL blocking caused by
the network.

To exclude HOL-blocked reads, detect whether any new ooo-segments have
arrived by checking for changes in the ooo-packet counter
tcp_sock->rcv_ooopack. If any new ooo-segments have arrived, exclude
the latency sample for the current read and set a limit for the next
safe sequence number to read, i.e. the point at which the current
ooo-packets must have been passed so that segments can no longer be
HOL-blocked. If there are skbs in the ooo-queue, set the limit to the
end of the ooo-queue. Otherwise, set the limit to the current rcv_nxt
(if the ooo-queue is empty, the detected ooo-segments must already have
been merged into the receive queue and rcv_nxt must have advanced past
them). If the read is past the safe sequence limit and no new
ooo-segments have arrived, it's safe to start including the latency
samples again.

For sockets where some ooo-segments have been observed, keep the
ooo-range state in socket storage (BPF_MAP_TYPE_SK_STORAGE). Skip
protecting this state with a spin-lock, as it should only be
concurrently accessed if there are concurrent reads on the same TCP
socket, which is assumed to be very rare, as applications attempting
that cannot know which part of the data each of their concurrent reads
will get.

There are some scenarios that may cause this ooo-filtering to fail.

- If multiple reads are done on the socket concurrently, we may not
  correctly track the last read byte. The kernel does not keep a lock
  on the TCP socket at the time our hooked function
  tcp_recv_timestamp() runs. If two reads are done in parallel, it's
  therefore possible that for both reads we will check the last read
  byte (tcp_sock.copied_seq) after the second read has updated it. We
  may then incorrectly conclude that the first read was ahead of the
  ooo-range when it was not, and record its latency when we should
  have excluded it. In practice I believe this issue should be quite
  rare, as most applications will probably not attempt to perform
  multiple concurrent reads on a single connected TCP socket in
  parallel (as then you cannot know which part of the payload the
  parallel reads will return).

- As tcp_recv_timestamp() runs outside of the socket lock, the various
  state members we access may concurrently be updated as we're
  attempting to read them. An especially problematic one is
  tcp_sock.ooo_last_skb, which keeps a pointer to an SKB that is only
  valid while the ooo-queue is non-empty. It is possible that between
  our check that the ooo-queue is non-empty and following the
  ooo_last_skb pointer, the ooo-queue is cleared and the ooo_last_skb
  pointer may end up pointing towards a freed SKB. If the socket
  members we access are updated before or while we read them, it can
  break the filtering in numerous ways, e.g. result in including
  samples that should have been excluded (due to e.g. copied_seq being
  updated before our read) or excluding a large amount of valid
  samples (due to e.g. setting a sequence limit based on garbage in a
  freed SKB).
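
For illustration, the per-read decision implemented by the diff below
roughly follows this simplified, userspace-style sketch. It is only a
sketch: the names ooo_range, seq_lt() and read_maybe_hol_blocked() are
made up for this example, plain variables stand in for the
bpf_core_read() calls and BPF socket storage used by the actual
program, and error handling is omitted.

#include <stdbool.h>
#include <stdint.h>

struct ooo_range {
	uint32_t prev_n_ooopkts; /* rcv_ooopack value at last update */
	uint32_t ooo_seq_end;    /* last safe-sequence limit that was set */
	bool active;             /* is ooo_seq_end still valid? */
};

/* a < b in u32 wrap-around space, like before() in include/net/tcp.h */
static bool seq_lt(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) < 0;
}

/* Return true if the latency sample for this read should be excluded.
 * max_ooo_end_seq is the end_seq of the ooo-queue tail, or rcv_nxt if
 * the ooo-queue is empty.
 */
static bool read_maybe_hol_blocked(struct ooo_range *st, uint32_t rcv_ooopack,
				   uint32_t copied_seq, uint32_t max_ooo_end_seq)
{
	if (rcv_ooopack == 0)
		return false; /* socket has never seen an ooo-segment */

	if (rcv_ooopack > st->prev_n_ooopkts) {
		/* New ooo-segments since the last read - remember the next
		 * sequence number that is safe to read past.
		 */
		st->prev_n_ooopkts = rcv_ooopack;
		st->ooo_seq_end = max_ooo_end_seq;
		st->active = true;
		return true;
	}

	if (st->active && !seq_lt(st->ooo_seq_end, copied_seq))
		return true; /* read may still be within the HOL-blocked range */

	st->active = false;
	return false;
}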
Signed-off-by: Simon Sundberg --- headers/vmlinux/vmlinux_common.h | 4 + headers/vmlinux/vmlinux_net.h | 208 +++++++++++++++++++++++++++++++ netstacklat/netstacklat.bpf.c | 128 +++++++++++++++++++ netstacklat/netstacklat.c | 29 +++-- netstacklat/netstacklat.h | 1 + 5 files changed, 358 insertions(+), 12 deletions(-) diff --git a/headers/vmlinux/vmlinux_common.h b/headers/vmlinux/vmlinux_common.h index ff0b0088..4281dc6e 100644 --- a/headers/vmlinux/vmlinux_common.h +++ b/headers/vmlinux/vmlinux_common.h @@ -13,6 +13,10 @@ struct list_head { struct list_head *prev; }; +struct rb_root { + struct rb_node *rb_node; +}; + struct rb_node { long unsigned int __rb_parent_color; struct rb_node *rb_right; diff --git a/headers/vmlinux/vmlinux_net.h b/headers/vmlinux/vmlinux_net.h index b0f6476e..f9e0a834 100644 --- a/headers/vmlinux/vmlinux_net.h +++ b/headers/vmlinux/vmlinux_net.h @@ -161,6 +161,35 @@ struct sk_buff { struct skb_ext *extensions; }; +struct tcp_skb_cb { + __u32 seq; + __u32 end_seq; + union { + struct { + u16 tcp_gso_segs; + u16 tcp_gso_size; + }; + }; + __u8 tcp_flags; + __u8 sacked; + __u8 ip_dsfield; + __u8 txstamp_ack : 1; + __u8 eor : 1; + __u8 has_rxtstamp : 1; + __u8 unused : 5; + __u32 ack_seq; + union { + struct { + __u32 is_app_limited : 1; + __u32 delivered_ce : 20; + __u32 unused : 11; + __u32 delivered; + u64 first_tx_mstamp; + u64 delivered_mstamp; + } tx; + }; +}; + struct nf_conn { unsigned long status; }; @@ -202,4 +231,183 @@ struct sock { u32 sk_rx_dst_cookie; }; +struct inet_sock { + struct sock sk; +}; + +struct inet_connection_sock { + struct inet_sock icsk_inet; +}; + +struct tcp_sock { + struct inet_connection_sock inet_conn; + __u8 __cacheline_group_begin__tcp_sock_read_tx[0]; + u32 max_window; + u32 rcv_ssthresh; + u32 reordering; + u32 notsent_lowat; + u16 gso_segs; + struct sk_buff *lost_skb_hint; + struct sk_buff *retransmit_skb_hint; + __u8 __cacheline_group_end__tcp_sock_read_tx[0]; + __u8 __cacheline_group_begin__tcp_sock_read_txrx[0]; + u32 tsoffset; + u32 snd_wnd; + u32 mss_cache; + u32 snd_cwnd; + u32 prr_out; + u32 lost_out; + u32 sacked_out; + u16 tcp_header_len; + u8 scaling_ratio; + u8 chrono_type: 2; + u8 repair: 1; + u8 tcp_usec_ts: 1; + u8 is_sack_reneg: 1; + u8 is_cwnd_limited: 1; + __u8 __cacheline_group_end__tcp_sock_read_txrx[0]; + __u8 __cacheline_group_begin__tcp_sock_read_rx[0]; + u32 copied_seq; + u32 rcv_tstamp; + u32 snd_wl1; + u32 tlp_high_seq; + u32 rttvar_us; + u32 retrans_out; + u16 advmss; + u16 urg_data; + u32 lost; + /* struct minmax rtt_min; */ + struct rb_root out_of_order_queue; + u32 snd_ssthresh; + u8 recvmsg_inq: 1; + __u8 __cacheline_group_end__tcp_sock_read_rx[0]; + long: 0; + __u8 __cacheline_group_begin__tcp_sock_write_tx[0]; + u32 segs_out; + u32 data_segs_out; + u64 bytes_sent; + u32 snd_sml; + u32 chrono_start; + u32 chrono_stat[3]; + u32 write_seq; + u32 pushed_seq; + u32 lsndtime; + u32 mdev_us; + u32 rtt_seq; + u64 tcp_wstamp_ns; + struct list_head tsorted_sent_queue; + struct sk_buff *highest_sack; + u8 ecn_flags; + __u8 __cacheline_group_end__tcp_sock_write_tx[0]; + __u8 __cacheline_group_begin__tcp_sock_write_txrx[0]; + __be32 pred_flags; + u64 tcp_clock_cache; + u64 tcp_mstamp; + u32 rcv_nxt; + u32 snd_nxt; + u32 snd_una; + u32 window_clamp; + u32 srtt_us; + u32 packets_out; + u32 snd_up; + u32 delivered; + u32 delivered_ce; + u32 app_limited; + u32 rcv_wnd; + /* struct tcp_options_received rx_opt; */ + u8 nonagle: 4; + u8 rate_app_limited: 1; + __u8 __cacheline_group_end__tcp_sock_write_txrx[0]; + long: 
0; + __u8 __cacheline_group_begin__tcp_sock_write_rx[0]; + u64 bytes_received; + u32 segs_in; + u32 data_segs_in; + u32 rcv_wup; + u32 max_packets_out; + u32 cwnd_usage_seq; + u32 rate_delivered; + u32 rate_interval_us; + u32 rcv_rtt_last_tsecr; + u64 first_tx_mstamp; + u64 delivered_mstamp; + u64 bytes_acked; + struct { + u32 rtt_us; + u32 seq; + u64 time; + } rcv_rtt_est; + struct { + u32 space; + u32 seq; + u64 time; + } rcvq_space; + __u8 __cacheline_group_end__tcp_sock_write_rx[0]; + u32 dsack_dups; + u32 compressed_ack_rcv_nxt; + struct list_head tsq_node; + /* struct tcp_rack rack; */ + u8 compressed_ack; + u8 dup_ack_counter: 2; + u8 tlp_retrans: 1; + u8 unused: 5; + u8 thin_lto: 1; + u8 fastopen_connect: 1; + u8 fastopen_no_cookie: 1; + u8 fastopen_client_fail: 2; + u8 frto: 1; + u8 repair_queue; + u8 save_syn: 2; + u8 syn_data: 1; + u8 syn_fastopen: 1; + u8 syn_fastopen_exp: 1; + u8 syn_fastopen_ch: 1; + u8 syn_data_acked: 1; + u8 keepalive_probes; + u32 tcp_tx_delay; + u32 mdev_max_us; + u32 reord_seen; + u32 snd_cwnd_cnt; + u32 snd_cwnd_clamp; + u32 snd_cwnd_used; + u32 snd_cwnd_stamp; + u32 prior_cwnd; + u32 prr_delivered; + u32 last_oow_ack_time; + /* struct hrtimer pacing_timer; */ + /* struct hrtimer compressed_ack_timer; */ + struct sk_buff *ooo_last_skb; + /* struct tcp_sack_block duplicate_sack[1]; */ + /* struct tcp_sack_block selective_acks[4]; */ + /* struct tcp_sack_block recv_sack_cache[4]; */ + int lost_cnt_hint; + u32 prior_ssthresh; + u32 high_seq; + u32 retrans_stamp; + u32 undo_marker; + int undo_retrans; + u64 bytes_retrans; + u32 total_retrans; + u32 rto_stamp; + u16 total_rto; + u16 total_rto_recoveries; + u32 total_rto_time; + u32 urg_seq; + unsigned int keepalive_time; + unsigned int keepalive_intvl; + int linger2; + u8 bpf_sock_ops_cb_flags; + u8 bpf_chg_cc_inprogress: 1; + u16 timeout_rehash; + u32 rcv_ooopack; + struct { + u32 probe_seq_start; + u32 probe_seq_end; + } mtu_probe; + u32 plb_rehash; + u32 mtu_info; + bool is_mptcp; +}; + + #endif /* __VMLINUX_NET_H__ */ diff --git a/netstacklat/netstacklat.bpf.c b/netstacklat/netstacklat.bpf.c index b0a53ccf..44882d10 100644 --- a/netstacklat/netstacklat.bpf.c +++ b/netstacklat/netstacklat.bpf.c @@ -11,6 +11,10 @@ #define READ_ONCE(x) (*(volatile typeof(x) *)&(x)) +// Mimic macros from /include/net/tcp.h +#define tcp_sk(ptr) container_of(ptr, struct tcp_sock, inet_conn.icsk_inet.sk) +#define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) + char LICENSE[] SEC("license") = "GPL"; @@ -23,6 +27,7 @@ volatile const struct netstacklat_bpf_config user_config = { .filter_cgroup = false, .groupby_ifindex = false, .groupby_cgroup = false, + .include_hol_blocked = false, }; /* @@ -38,6 +43,13 @@ struct sk_buff___old { __u8 mono_delivery_time: 1; } __attribute__((preserve_access_index)); +struct tcp_sock_ooo_range { + u32 prev_n_ooopkts; + u32 ooo_seq_end; + /* indicates if ooo_seq_end is still valid (as 0 can be valid seq) */ + bool active; +}; + struct { __uint(type, BPF_MAP_TYPE_PERCPU_HASH); __uint(max_entries, HIST_NBUCKETS * NETSTACKLAT_N_HOOKS * 64); @@ -66,6 +78,22 @@ struct { __type(value, u64); } netstack_cgroupfilter SEC(".maps"); +struct { + __uint(type, BPF_MAP_TYPE_SK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct tcp_sock_ooo_range); +} netstack_tcp_ooo_range SEC(".maps"); + +/* + * Is a < b considering u32 wrap around? 
+ * Based on the before() function in /include/net/tcp.h + */ +static bool u32_lt(u32 a, u32 b) +{ + return (s32)(a - b) < 0; +} + static u64 *lookup_or_zeroinit_histentry(void *map, const struct hist_key *key) { u64 zero = 0; @@ -302,6 +330,102 @@ static bool filter_min_sockqueue_len(struct sock *sk) return false; } +static int current_max_possible_ooo_seq(struct tcp_sock *tp, u32 *seq) +{ + struct tcp_skb_cb cb; + u32 max_seq = 0; + int err = 0; + + if (BPF_CORE_READ(tp, out_of_order_queue.rb_node) == NULL) { + /* No ooo-segments currently in ooo-queue + * Any ooo-segments must already have been merged to the + * receive queue. Current rcv_nxt must therefore be ahead + * of all ooo-segments that have arrived until now. + */ + err = bpf_core_read(&max_seq, sizeof(max_seq), &tp->rcv_nxt); + if (err) + bpf_printk("failed to read tcp_sock->rcv_nxt, err=%d", + err); + } else { + /* + * Some ooo-segments currently in ooo-queue + * Max out-of-order seq is given by the seq_end of the tail + * skb in the ooo-queue. + */ + err = BPF_CORE_READ_INTO(&cb, tp, ooo_last_skb, cb); + if (err) + bpf_printk( + "failed to read tcp_sock->ooo_last_skb->cb, err=%d", + err); + max_seq = cb.end_seq; + } + + *seq = max_seq; + return err; +} + +static bool tcp_read_in_ooo_range(struct tcp_sock *tp, + struct tcp_sock_ooo_range *ooo_range) +{ + u32 read_seq; + int err; + + if (!ooo_range->active) + return false; + + err = bpf_core_read(&read_seq, sizeof(read_seq), &tp->copied_seq); + if (err) { + bpf_printk("failed to read tcp_sock->copied_seq, err=%d", err); + return true; // Assume we may be in ooo-range + } + + if (u32_lt(ooo_range->ooo_seq_end, read_seq)) { + ooo_range->active = false; + return false; + } else { + return true; + } +} + +static bool tcp_read_maybe_holblocked(struct sock *sk) +{ + struct tcp_sock_ooo_range *ooo_range; + struct tcp_sock *tp = tcp_sk(sk); + u32 n_ooopkts, nxt_seq; + int err; + + err = bpf_core_read(&n_ooopkts, sizeof(n_ooopkts), &tp->rcv_ooopack); + if (err) { + bpf_printk("failed to read tcp_sock->rcv_ooopack, err=%d\n", + err); + return true; // Assume we may be in ooo-range + } + + if (n_ooopkts == 0) + return false; + + ooo_range = bpf_sk_storage_get(&netstack_tcp_ooo_range, sk, NULL, + BPF_SK_STORAGE_GET_F_CREATE); + if (!ooo_range) { + bpf_printk( + "failed getting ooo-range socket storage for tcp socket"); + return true; // Assume we may be in ooo-range + } + + // Increase in ooo-packets since last - figure out next safe seq + if (n_ooopkts > ooo_range->prev_n_ooopkts) { + ooo_range->prev_n_ooopkts = n_ooopkts; + err = current_max_possible_ooo_seq(tp, &nxt_seq); + if (!err) { + ooo_range->ooo_seq_end = nxt_seq; + ooo_range->active = true; + } + return true; + } + + return tcp_read_in_ooo_range(tp, ooo_range); +} + static void record_socket_latency(struct sock *sk, struct sk_buff *skb, ktime_t tstamp, enum netstacklat_hook hook) { @@ -393,6 +517,10 @@ int BPF_PROG(netstacklat_tcp_recv_timestamp, void *msg, struct sock *sk, struct scm_timestamping_internal *tss) { struct timespec64 *ts = &tss->ts[0]; + + if (!user_config.include_hol_blocked && tcp_read_maybe_holblocked(sk)) + return 0; + record_socket_latency(sk, NULL, (ktime_t)ts->tv_sec * NS_PER_S + ts->tv_nsec, NETSTACKLAT_HOOK_TCP_SOCK_READ); diff --git a/netstacklat/netstacklat.c b/netstacklat/netstacklat.c index 70dd4111..dfda239c 100644 --- a/netstacklat/netstacklat.c +++ b/netstacklat/netstacklat.c @@ -83,18 +83,19 @@ struct netstacklat_config { }; static const struct option long_options[] = { - { "help", no_argument, 
NULL, 'h' }, - { "report-interval", required_argument, NULL, 'r' }, - { "list-probes", no_argument, NULL, 'l' }, - { "enable-probes", required_argument, NULL, 'e' }, - { "disable-probes", required_argument, NULL, 'd' }, - { "pids", required_argument, NULL, 'p' }, - { "interfaces", required_argument, NULL, 'i' }, - { "network-namespace", required_argument, NULL, 'n' }, - { "cgroups", required_argument, NULL, 'c' }, - { "min-queuelength", required_argument, NULL, 'q' }, - { "groupby-interface", no_argument, NULL, 'I' }, - { "groupby-cgroup", no_argument, NULL, 'C' }, + { "help", no_argument, NULL, 'h' }, + { "report-interval", required_argument, NULL, 'r' }, + { "list-probes", no_argument, NULL, 'l' }, + { "enable-probes", required_argument, NULL, 'e' }, + { "disable-probes", required_argument, NULL, 'd' }, + { "pids", required_argument, NULL, 'p' }, + { "interfaces", required_argument, NULL, 'i' }, + { "network-namespace", required_argument, NULL, 'n' }, + { "cgroups", required_argument, NULL, 'c' }, + { "min-queuelength", required_argument, NULL, 'q' }, + { "groupby-interface", no_argument, NULL, 'I' }, + { "groupby-cgroup", no_argument, NULL, 'C' }, + { "include-tcp-hol-delay", no_argument, NULL, 'y' }, { 0, 0, 0, 0 } }; @@ -564,6 +565,7 @@ static int parse_arguments(int argc, char *argv[], conf->bpf_conf.filter_cgroup = false; conf->bpf_conf.groupby_ifindex = false; conf->bpf_conf.groupby_cgroup = false; + conf->bpf_conf.include_hol_blocked = false; for (i = 0; i < NETSTACKLAT_N_HOOKS; i++) // All probes enabled by default @@ -658,6 +660,9 @@ static int parse_arguments(int argc, char *argv[], case 'C': // groupby-cgroup conf->bpf_conf.groupby_cgroup = true; break; + case 'y': // include-tcp-hol-delay + conf->bpf_conf.include_hol_blocked = true; + break; case 'h': // help print_usage(stdout, argv[0]); exit(EXIT_SUCCESS); diff --git a/netstacklat/netstacklat.h b/netstacklat/netstacklat.h index d0da8553..d1708ce4 100644 --- a/netstacklat/netstacklat.h +++ b/netstacklat/netstacklat.h @@ -77,6 +77,7 @@ struct netstacklat_bpf_config { bool filter_cgroup; bool groupby_ifindex; bool groupby_cgroup; + bool include_hol_blocked; }; #endif From c8d50c221b54d54d0ef56e1dce0ce49e0ed03277 Mon Sep 17 00:00:00 2001 From: Simon Sundberg Date: Mon, 27 Oct 2025 15:23:23 +0100 Subject: [PATCH 3/4] netstacklat: Add sanity check for out-of-order sequence The logic for excluding samples from TCP reads that may have been delayed by HOL blocking relies on reading a number of fields from the TCP socket outside of the socket lock. This may be prone to errors due to the socket state being updated at another place in the kernel while our eBPF program is running. To reduce the risk that a data race causes the filter to fail, add a sanity check for the maximum out of order sequence used to exclude future TCP reads from monitoring. The most problematic of the read fields in the tcp_sock is ooo_last_skb, as that is a pointer to another SKB rather than a direct value. This pointer is only valid as long as the out_of_order_queue is non-empty. Due to a data race, we may check that the ooo-queue is non-empty while there are still SKBs in it, then have the kernel clear out the ooo-queue, and finally attempt to read the ooo_last_skb pointer later when it is no longer valid (and may now point to a freed/recycled SKB). This may result in incorrect values being used for the sequence limit used to exclude future reads of ooo-segments. 
The faulty sequence limit may either cause reads of HOL-blocked
segments to be included, or cause an unnecessarily large amount of
future reads (up to 2 GB) to be excluded.

To reduce the risk that garbage data from an invalid SKB is used,
introduce two sanity checks for end_seq in the ooo_last_skb. First,
check if the sequence number is zero, and if so assume it is invalid
(even though zero can be a valid sequence number). Even though we will
get an error code if reading the data from this SKB fails altogether,
we may still succeed in reading from a no-longer-valid SKB, in which
case there is a high risk that the data will have been zeroed. If it is
non-zero, also check that it is within the current receive window (and
if not, clamp it to the receive window).

Signed-off-by: Simon Sundberg
---
 netstacklat/netstacklat.bpf.c | 63 +++++++++++++++++++++++++++++++----
 1 file changed, 56 insertions(+), 7 deletions(-)

diff --git a/netstacklat/netstacklat.bpf.c b/netstacklat/netstacklat.bpf.c
index 44882d10..918dd507 100644
--- a/netstacklat/netstacklat.bpf.c
+++ b/netstacklat/netstacklat.bpf.c
@@ -330,22 +330,56 @@ static bool filter_min_sockqueue_len(struct sock *sk)
 	return false;
 }
 
+/* Get the current receive window end sequence for tp
+ * In the kernel, receive window checks are done against
+ * tp->rcv_nxt + tcp_receive_window(tp). This function should give a comparable
+ * result, i.e. rcv_wup + rcv_wnd or rcv_nxt, whichever is higher
+ */
+static int get_current_rcv_wnd_seq(struct tcp_sock *tp, u32 rcv_nxt, u32 *seq)
+{
+	u32 rcv_wup, rcv_wnd, window = 0;
+	int err;
+
+	err = bpf_core_read(&rcv_wup, sizeof(rcv_wup), &tp->rcv_wup);
+	if (err) {
+		bpf_printk("failed to read tcp_sock->rcv_wup, err=%d", err);
+		goto exit;
+	}
+
+	err = bpf_core_read(&rcv_wnd, sizeof(rcv_wnd), &tp->rcv_wnd);
+	if (err) {
+		bpf_printk("failed to read tcp_sock->rcv_wnd, err=%d", err);
+		goto exit;
+	}
+
+	window = rcv_wup + rcv_wnd;
+	if (u32_lt(window, rcv_nxt))
+		window = rcv_nxt;
+
+exit:
+	*seq = window;
+	return err;
+}
+
 static int current_max_possible_ooo_seq(struct tcp_sock *tp, u32 *seq)
 {
+	u32 rcv_nxt, cur_rcv_window, max_seq = 0;
 	struct tcp_skb_cb cb;
-	u32 max_seq = 0;
 	int err = 0;
 
+	err = bpf_core_read(&rcv_nxt, sizeof(rcv_nxt), &tp->rcv_nxt);
+	if (err) {
+		bpf_printk("failed reading tcp_sock->rcv_nxt, err=%d", err);
+		goto exit;
+	}
+
 	if (BPF_CORE_READ(tp, out_of_order_queue.rb_node) == NULL) {
 		/* No ooo-segments currently in ooo-queue
 		 * Any ooo-segments must already have been merged to the
 		 * receive queue. Current rcv_nxt must therefore be ahead
 		 * of all ooo-segments that have arrived until now.
 		 */
-		err = bpf_core_read(&max_seq, sizeof(max_seq), &tp->rcv_nxt);
-		if (err)
-			bpf_printk("failed to read tcp_sock->rcv_nxt, err=%d",
-				   err);
+		max_seq = rcv_nxt;
 	} else {
 		/*
 		 * Some ooo-segments currently in ooo-queue
@@ -353,13 +387,28 @@ static int current_max_possible_ooo_seq(struct tcp_sock *tp, u32 *seq)
 		 * skb in the ooo-queue.
 		 */
 		err = BPF_CORE_READ_INTO(&cb, tp, ooo_last_skb, cb);
-		if (err)
+		if (err) {
 			bpf_printk(
 				"failed to read tcp_sock->ooo_last_skb->cb, err=%d",
 				err);
-		max_seq = cb.end_seq;
+			goto exit;
+		}
+
+		// Sanity check - ooo_last_skb->cb.end_seq within the receive window?
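+		// If ooo_last_skb was freed between the emptiness check above
+		// and this read, cb.end_seq is likely either zeroed or far
+		// outside the receive window, so bound it by the current
+		// window computed below.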
+		err = get_current_rcv_wnd_seq(tp, rcv_nxt, &cur_rcv_window);
+		if (err)
+			goto exit;
+
+		/* While seq 0 can be a valid seq, consider it more likely to
+		 * be the result of reading from an invalid SKB pointer
+		 */
+		if (cb.end_seq == 0 || u32_lt(cur_rcv_window, cb.end_seq))
+			max_seq = cur_rcv_window;
+		else
+			max_seq = cb.end_seq;
 	}
 
+exit:
 	*seq = max_seq;
 	return err;
 }

From 24893487e890d0ee23042c03bc0957cf478b70db Mon Sep 17 00:00:00 2001
From: Simon Sundberg
Date: Mon, 27 Oct 2025 16:20:44 +0100
Subject: [PATCH 4/4] netstacklat: Add sanity checks for rcv_nxt and copied_seq

In addition to the out-of-order sequence number in the previous commit,
two more key members that are read from the tcp_sock (outside of the
socket lock) are rcv_nxt and copied_seq. Add sanity checks for these
two members to ensure that they are monotonically increasing (in u32
wrap-around space).

To enable this sanity check, track the last seen (sane) value for both
of them together with the other ooo-state in the socket storage map. At
each read, compare the values recorded at the last read with the
current values to determine whether the current values are ahead of the
previously seen ones or not.

Unlike the ooo sequence number, do not consider sequence 0 invalid for
these checks. As they are direct members of the tcp_sock, their values
should always be valid (although possibly concurrently updated
elsewhere) as long as the probe read succeeds (and failure is directly
detected from the return value of bpf_core_read()).

Skip adding a similar monotonic growth check for the rcv_wup field to
avoid also having to probe and update that value every time. For the
rcv_wnd field, I am not aware of any simple validity checks that can be
performed.

Signed-off-by: Simon Sundberg
---
 netstacklat/netstacklat.bpf.c | 89 ++++++++++++++++++++++++++---------
 1 file changed, 66 insertions(+), 23 deletions(-)

diff --git a/netstacklat/netstacklat.bpf.c b/netstacklat/netstacklat.bpf.c
index 918dd507..5ca122cd 100644
--- a/netstacklat/netstacklat.bpf.c
+++ b/netstacklat/netstacklat.bpf.c
@@ -1,6 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
 #include "vmlinux_local.h"
 #include
+#include
 #include
 #include
 
@@ -46,6 +47,8 @@ struct sk_buff___old {
 struct tcp_sock_ooo_range {
 	u32 prev_n_ooopkts;
 	u32 ooo_seq_end;
+	u32 last_rcv_nxt;
+	u32 last_copied_seq;
 	/* indicates if ooo_seq_end is still valid (as 0 can be valid seq) */
 	bool active;
 };
 
@@ -361,18 +364,13 @@ static int get_current_rcv_wnd_seq(struct tcp_sock *tp, u32 rcv_nxt, u32 *seq)
 	return err;
 }
 
-static int current_max_possible_ooo_seq(struct tcp_sock *tp, u32 *seq)
+static int current_max_possible_ooo_seq(struct tcp_sock *tp, u32 rcv_nxt,
+					u32 *seq)
 {
-	u32 rcv_nxt, cur_rcv_window, max_seq = 0;
+	u32 cur_rcv_window, max_seq = 0;
 	struct tcp_skb_cb cb;
 	int err = 0;
 
-	err = bpf_core_read(&rcv_nxt, sizeof(rcv_nxt), &tp->rcv_nxt);
-	if (err) {
-		bpf_printk("failed reading tcp_sock->rcv_nxt, err=%d", err);
-		goto exit;
-	}
-
 	if (BPF_CORE_READ(tp, out_of_order_queue.rb_node) == NULL) {
 		/* No ooo-segments currently in ooo-queue
 		 * Any ooo-segments must already have been merged to the
@@ -413,22 +411,13 @@ static int current_max_possible_ooo_seq(struct tcp_sock *tp, u32 *seq)
 	return err;
 }
 
-static bool tcp_read_in_ooo_range(struct tcp_sock *tp,
+static bool tcp_read_in_ooo_range(struct tcp_sock *tp, u32 copied_seq,
 				  struct tcp_sock_ooo_range *ooo_range)
 {
-	u32 read_seq;
-	int err;
-
 	if (!ooo_range->active)
 		return false;
 
-	err = bpf_core_read(&read_seq, sizeof(read_seq), &tp->copied_seq);
-	if (err) {
-		bpf_printk("failed to read tcp_sock->copied_seq, err=%d", err);
-		return true; // Assume we may be in ooo-range
-	}
-
-	if (u32_lt(ooo_range->ooo_seq_end, read_seq)) {
+	if (u32_lt(ooo_range->ooo_seq_end, copied_seq)) {
 		ooo_range->active = false;
 		return false;
 	} else {
@@ -436,12 +425,54 @@ static bool tcp_read_in_ooo_range(struct tcp_sock *tp,
 	}
 }
 
+static int get_and_validate_rcvnxt(struct tcp_sock *tp,
+				   struct tcp_sock_ooo_range *ooo_range,
+				   u32 *rcvnxt)
+{
+	u32 rcv_nxt = 0;
+	int err;
+
+	err = bpf_core_read(&rcv_nxt, sizeof(rcv_nxt), &tp->rcv_nxt);
+	if (err || (ooo_range->last_rcv_nxt &&
+		    u32_lt(rcv_nxt, ooo_range->last_rcv_nxt))) {
+		bpf_printk("failed to read valid tcp_sock->rcv_nxt, err=%d",
+			   err);
+		err = err ?: -ERANGE;
+	} else {
+		ooo_range->last_rcv_nxt = rcv_nxt;
+	}
+
+	*rcvnxt = rcv_nxt;
+	return err;
+}
+
+static int get_and_validate_copiedseq(struct tcp_sock *tp,
+				      struct tcp_sock_ooo_range *ooo_range,
+				      u32 *copiedseq)
+{
+	u32 copied_seq = 0;
+	int err;
+
+	err = bpf_core_read(&copied_seq, sizeof(copied_seq), &tp->copied_seq);
+	if (err || (ooo_range->last_copied_seq &&
+		    u32_lt(copied_seq, ooo_range->last_copied_seq))) {
+		bpf_printk("failed to read valid tcp_sock->copied_seq, err=%d",
+			   err);
+		err = err ?: -ERANGE;
+	} else {
+		ooo_range->last_copied_seq = copied_seq;
+	}
+
+	*copiedseq = copied_seq;
+	return err;
+}
+
 static bool tcp_read_maybe_holblocked(struct sock *sk)
 {
+	u32 n_ooopkts, rcv_nxt, copied_seq, nxt_seq;
 	struct tcp_sock_ooo_range *ooo_range;
+	int err, err_rcvnxt, err_copiedseq;
 	struct tcp_sock *tp = tcp_sk(sk);
-	u32 n_ooopkts, nxt_seq;
-	int err;
 
 	err = bpf_core_read(&n_ooopkts, sizeof(n_ooopkts), &tp->rcv_ooopack);
 	if (err) {
@@ -461,18 +492,30 @@ static bool tcp_read_maybe_holblocked(struct sock *sk)
 		return true; // Assume we may be in ooo-range
 	}
 
+	/* rcv_nxt and copied_seq may not be needed, but to ensure we always
+	 * update our tracked state for them, read, sanity check and update
+	 * both their values here. Errors are only checked for in the paths
+	 * where the values are actually needed.
+	 */
+	err_rcvnxt = get_and_validate_rcvnxt(tp, ooo_range, &rcv_nxt);
+	err_copiedseq = get_and_validate_copiedseq(tp, ooo_range, &copied_seq);
+
 	// Increase in ooo-packets since last - figure out next safe seq
 	if (n_ooopkts > ooo_range->prev_n_ooopkts) {
 		ooo_range->prev_n_ooopkts = n_ooopkts;
-		err = current_max_possible_ooo_seq(tp, &nxt_seq);
+		err = err_rcvnxt ?:
+			      current_max_possible_ooo_seq(tp, rcv_nxt,
+							   &nxt_seq);
 		if (!err) {
 			ooo_range->ooo_seq_end = nxt_seq;
 			ooo_range->active = true;
 		}
+
 		return true;
 	}
 
-	return tcp_read_in_ooo_range(tp, ooo_range);
+	return err_copiedseq ? true :
+			       tcp_read_in_ooo_range(tp, copied_seq, ooo_range);
 }
 
 static void record_socket_latency(struct sock *sk, struct sk_buff *skb,