From cc17c6805556044d1643cb401028759dcf3282d0 Mon Sep 17 00:00:00 2001
From: Simon Sundberg
Date: Thu, 16 Oct 2025 13:36:45 +0200
Subject: [PATCH 1/4] Minor refactor of network filtering

The filter_ifindex() and filter_network_ns() functions are often used
together. Therefore, add a filter_network() function that bundles the
two together.

Signed-off-by: Simon Sundberg
---
 netstacklat/netstacklat.bpf.c | 25 +++++++++++--------------
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/netstacklat/netstacklat.bpf.c b/netstacklat/netstacklat.bpf.c
index 1a1b0afe..b0a53ccf 100644
--- a/netstacklat/netstacklat.bpf.c
+++ b/netstacklat/netstacklat.bpf.c
@@ -188,11 +188,17 @@ static bool filter_network_ns(struct sk_buff *skb, struct sock *sk)
 	return get_network_ns(skb, sk) == user_config.network_ns;
 }
 
+static bool filter_network(struct sk_buff *skb, struct sock *sk)
+{
+	if (!filter_ifindex(skb ? skb->skb_iif : sk ? sk->sk_rx_dst_ifindex : 0))
+		return false;
+
+	return filter_network_ns(skb, sk);
+}
 
 static void record_skb_latency(struct sk_buff *skb, struct sock *sk, enum netstacklat_hook hook)
 {
 	struct hist_key key = { .hook = hook };
-	u32 ifindex;
 
 	if (bpf_core_field_exists(skb->tstamp_type)) {
 		/*
@@ -217,15 +223,11 @@ static void record_skb_latency(struct sk_buff *skb, struct sock *sk, enum netsta
 			return;
 	}
 
-	ifindex = skb->skb_iif;
-	if (!filter_ifindex(ifindex))
-		return;
-
-	if (!filter_network_ns(skb, sk))
+	if (!filter_network(skb, sk))
 		return;
 
 	if (user_config.groupby_ifindex)
-		key.ifindex = ifindex;
+		key.ifindex = skb->skb_iif;
 
 	record_latency_since(skb->tstamp, &key);
 }
@@ -305,7 +307,6 @@ static void record_socket_latency(struct sock *sk, struct sk_buff *skb,
 {
 	struct hist_key key = { .hook = hook };
 	u64 cgroup = 0;
-	u32 ifindex;
 
 	if (!filter_min_sockqueue_len(sk))
 		return;
@@ -316,15 +317,11 @@ static void record_socket_latency(struct sock *sk, struct sk_buff *skb,
 	if (!filter_current_task(cgroup))
 		return;
 
-	ifindex = skb ? skb->skb_iif : sk->sk_rx_dst_ifindex;
-	if (!filter_ifindex(ifindex))
-		return;
-
-	if (!filter_network_ns(skb, sk))
+	if (!filter_network(skb, sk))
 		return;
 
 	if (user_config.groupby_ifindex)
-		key.ifindex = ifindex;
+		key.ifindex = skb ? skb->skb_iif : sk->sk_rx_dst_ifindex;
 
 	if (user_config.groupby_cgroup)
 		key.cgroup = cgroup;

From dff6bc54988ea492072a58d22ca61ffad00184c0 Mon Sep 17 00:00:00 2001
From: Simon Sundberg
Date: Thu, 16 Oct 2025 20:42:40 +0200
Subject: [PATCH 2/4] netstacklat: Exclude TCP reads for HOL blocked segments

The 'tcp-socket-read' hook currently reports the latency for the skb
containing the last TCP segment read from the socket. However, this
segment might have been head of line (HOL) blocked by a previous
segment missing. In this case, netstacklat's reported latency will
include HOL blocking periods that depend on external factors (such as
network packet loss and network latency, which affect the
retransmission time). As netstacklat is primarily intended to identify
issues within the local host (in the network stack or receiving
applications), by default filter out any socket reads where the last
read SKB might have experienced HOL blocking.

Add the new -y/--include-tcp-hol-delay option to retain the old
behavior of reporting the latency for all reads, including those that
are HOL-blocked. This may be useful in scenarios where you still want
to be aware of latency issues caused by HOL blocking, even though they
are caused by external components.
For example, in a data center context where you have full control over
the network, it may still be relevant to monitor HOL blocking caused by
the network.

To exclude HOL-blocked reads, detect whether any new ooo-segments have
arrived by checking for changes in the ooo-packet counter
tcp_sock->rcv_ooopack. If any new ooo-segments have arrived, exclude
the latency sample for the current read and set a limit for the next
safe sequence number to read, i.e. the point at which the current
ooo-packets must have been passed so that segments can no longer be
HOL-blocked. If there are skbs in the ooo-queue, set the limit to the
end of the ooo-queue. Otherwise, set the limit to the current rcv_nxt
(if the ooo-queue is empty, the detected ooo-segments must already have
been merged into the receive queue and rcv_nxt must have advanced past
them). If the read is past the safe sequence limit and no new
ooo-segments have arrived, it's safe to start including the latency
samples again.

For sockets where some ooo-segments have been observed, keep the
ooo-range state in socket storage (BPF_MAP_TYPE_SK_STORAGE). Skip
protecting this state with a spin-lock, as it should only be
concurrently accessed if there are concurrent reads on the same TCP
socket, which is assumed to be very rare, as applications attempting
that cannot know which part of the data each of their concurrent reads
will get.

There are some scenarios that may cause this ooo-filtering to fail.

- If multiple reads are done on the socket concurrently, we may not
  correctly track the last read byte. The kernel does not keep a lock
  on the TCP socket at the time our hooked function
  tcp_recv_timestamp() runs. If two reads are done in parallel, it's
  therefore possible that for both reads we will check the last read
  byte (tcp_sock.copied_seq) after the second read has updated it. We
  may then incorrectly conclude that the first read was ahead of the
  ooo-range when it was not, and record its latency when we should
  have excluded it. In practice I believe this issue should be quite
  rare, as most applications will probably not attempt to perform
  multiple concurrent reads on a single connected TCP socket in
  parallel (as then you cannot know which part of the payload the
  parallel reads will return).

- As tcp_recv_timestamp() runs outside of the socket lock, the various
  state members we access may concurrently be updated as we're
  attempting to read them. An especially problematic one is
  tcp_sock.ooo_last_skb, which keeps a pointer to an SKB that is only
  valid while the ooo-queue is non-empty. It is possible that between
  our check that the ooo-queue is non-empty and following the
  ooo_last_skb pointer, the ooo-queue is cleared and the ooo_last_skb
  pointer may end up pointing towards a freed SKB. If the socket
  members we access are updated before or while we read them, it can
  break the filtering in numerous ways, e.g. result in including
  samples that should have been excluded (due to e.g. copied_seq being
  updated before our read) or excluding a large amount of valid
  samples (due to e.g. setting a sequence limit based on garbage in a
  freed SKB).
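
For illustration, the per-read decision implemented by the diff below
roughly follows this simplified, userspace-style sketch. It is only a
sketch: the names ooo_range, seq_lt() and read_maybe_hol_blocked() are
made up for this example, plain variables stand in for the
bpf_core_read() calls and BPF socket storage used by the actual
program, and error handling is omitted.

#include <stdbool.h>
#include <stdint.h>

struct ooo_range {
	uint32_t prev_n_ooopkts; /* rcv_ooopack value at last update */
	uint32_t ooo_seq_end;    /* last safe-sequence limit that was set */
	bool active;             /* is ooo_seq_end still valid? */
};

/* a < b in u32 wrap-around space, like before() in include/net/tcp.h */
static bool seq_lt(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) < 0;
}

/* Return true if the latency sample for this read should be excluded.
 * max_ooo_end_seq is the end_seq of the ooo-queue tail, or rcv_nxt if
 * the ooo-queue is empty.
 */
static bool read_maybe_hol_blocked(struct ooo_range *st, uint32_t rcv_ooopack,
				   uint32_t copied_seq, uint32_t max_ooo_end_seq)
{
	if (rcv_ooopack == 0)
		return false; /* socket has never seen an ooo-segment */

	if (rcv_ooopack > st->prev_n_ooopkts) {
		/* New ooo-segments since the last read - remember the next
		 * sequence number that is safe to read past.
		 */
		st->prev_n_ooopkts = rcv_ooopack;
		st->ooo_seq_end = max_ooo_end_seq;
		st->active = true;
		return true;
	}

	if (st->active && !seq_lt(st->ooo_seq_end, copied_seq))
		return true; /* read may still be within the HOL-blocked range */

	st->active = false;
	return false;
}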
Signed-off-by: Simon Sundberg --- headers/vmlinux/vmlinux_common.h | 4 + headers/vmlinux/vmlinux_net.h | 208 +++++++++++++++++++++++++++++++ netstacklat/netstacklat.bpf.c | 128 +++++++++++++++++++ netstacklat/netstacklat.c | 29 +++-- netstacklat/netstacklat.h | 1 + 5 files changed, 358 insertions(+), 12 deletions(-) diff --git a/headers/vmlinux/vmlinux_common.h b/headers/vmlinux/vmlinux_common.h index ff0b0088..4281dc6e 100644 --- a/headers/vmlinux/vmlinux_common.h +++ b/headers/vmlinux/vmlinux_common.h @@ -13,6 +13,10 @@ struct list_head { struct list_head *prev; }; +struct rb_root { + struct rb_node *rb_node; +}; + struct rb_node { long unsigned int __rb_parent_color; struct rb_node *rb_right; diff --git a/headers/vmlinux/vmlinux_net.h b/headers/vmlinux/vmlinux_net.h index b0f6476e..f9e0a834 100644 --- a/headers/vmlinux/vmlinux_net.h +++ b/headers/vmlinux/vmlinux_net.h @@ -161,6 +161,35 @@ struct sk_buff { struct skb_ext *extensions; }; +struct tcp_skb_cb { + __u32 seq; + __u32 end_seq; + union { + struct { + u16 tcp_gso_segs; + u16 tcp_gso_size; + }; + }; + __u8 tcp_flags; + __u8 sacked; + __u8 ip_dsfield; + __u8 txstamp_ack : 1; + __u8 eor : 1; + __u8 has_rxtstamp : 1; + __u8 unused : 5; + __u32 ack_seq; + union { + struct { + __u32 is_app_limited : 1; + __u32 delivered_ce : 20; + __u32 unused : 11; + __u32 delivered; + u64 first_tx_mstamp; + u64 delivered_mstamp; + } tx; + }; +}; + struct nf_conn { unsigned long status; }; @@ -202,4 +231,183 @@ struct sock { u32 sk_rx_dst_cookie; }; +struct inet_sock { + struct sock sk; +}; + +struct inet_connection_sock { + struct inet_sock icsk_inet; +}; + +struct tcp_sock { + struct inet_connection_sock inet_conn; + __u8 __cacheline_group_begin__tcp_sock_read_tx[0]; + u32 max_window; + u32 rcv_ssthresh; + u32 reordering; + u32 notsent_lowat; + u16 gso_segs; + struct sk_buff *lost_skb_hint; + struct sk_buff *retransmit_skb_hint; + __u8 __cacheline_group_end__tcp_sock_read_tx[0]; + __u8 __cacheline_group_begin__tcp_sock_read_txrx[0]; + u32 tsoffset; + u32 snd_wnd; + u32 mss_cache; + u32 snd_cwnd; + u32 prr_out; + u32 lost_out; + u32 sacked_out; + u16 tcp_header_len; + u8 scaling_ratio; + u8 chrono_type: 2; + u8 repair: 1; + u8 tcp_usec_ts: 1; + u8 is_sack_reneg: 1; + u8 is_cwnd_limited: 1; + __u8 __cacheline_group_end__tcp_sock_read_txrx[0]; + __u8 __cacheline_group_begin__tcp_sock_read_rx[0]; + u32 copied_seq; + u32 rcv_tstamp; + u32 snd_wl1; + u32 tlp_high_seq; + u32 rttvar_us; + u32 retrans_out; + u16 advmss; + u16 urg_data; + u32 lost; + /* struct minmax rtt_min; */ + struct rb_root out_of_order_queue; + u32 snd_ssthresh; + u8 recvmsg_inq: 1; + __u8 __cacheline_group_end__tcp_sock_read_rx[0]; + long: 0; + __u8 __cacheline_group_begin__tcp_sock_write_tx[0]; + u32 segs_out; + u32 data_segs_out; + u64 bytes_sent; + u32 snd_sml; + u32 chrono_start; + u32 chrono_stat[3]; + u32 write_seq; + u32 pushed_seq; + u32 lsndtime; + u32 mdev_us; + u32 rtt_seq; + u64 tcp_wstamp_ns; + struct list_head tsorted_sent_queue; + struct sk_buff *highest_sack; + u8 ecn_flags; + __u8 __cacheline_group_end__tcp_sock_write_tx[0]; + __u8 __cacheline_group_begin__tcp_sock_write_txrx[0]; + __be32 pred_flags; + u64 tcp_clock_cache; + u64 tcp_mstamp; + u32 rcv_nxt; + u32 snd_nxt; + u32 snd_una; + u32 window_clamp; + u32 srtt_us; + u32 packets_out; + u32 snd_up; + u32 delivered; + u32 delivered_ce; + u32 app_limited; + u32 rcv_wnd; + /* struct tcp_options_received rx_opt; */ + u8 nonagle: 4; + u8 rate_app_limited: 1; + __u8 __cacheline_group_end__tcp_sock_write_txrx[0]; + long: 
0; + __u8 __cacheline_group_begin__tcp_sock_write_rx[0]; + u64 bytes_received; + u32 segs_in; + u32 data_segs_in; + u32 rcv_wup; + u32 max_packets_out; + u32 cwnd_usage_seq; + u32 rate_delivered; + u32 rate_interval_us; + u32 rcv_rtt_last_tsecr; + u64 first_tx_mstamp; + u64 delivered_mstamp; + u64 bytes_acked; + struct { + u32 rtt_us; + u32 seq; + u64 time; + } rcv_rtt_est; + struct { + u32 space; + u32 seq; + u64 time; + } rcvq_space; + __u8 __cacheline_group_end__tcp_sock_write_rx[0]; + u32 dsack_dups; + u32 compressed_ack_rcv_nxt; + struct list_head tsq_node; + /* struct tcp_rack rack; */ + u8 compressed_ack; + u8 dup_ack_counter: 2; + u8 tlp_retrans: 1; + u8 unused: 5; + u8 thin_lto: 1; + u8 fastopen_connect: 1; + u8 fastopen_no_cookie: 1; + u8 fastopen_client_fail: 2; + u8 frto: 1; + u8 repair_queue; + u8 save_syn: 2; + u8 syn_data: 1; + u8 syn_fastopen: 1; + u8 syn_fastopen_exp: 1; + u8 syn_fastopen_ch: 1; + u8 syn_data_acked: 1; + u8 keepalive_probes; + u32 tcp_tx_delay; + u32 mdev_max_us; + u32 reord_seen; + u32 snd_cwnd_cnt; + u32 snd_cwnd_clamp; + u32 snd_cwnd_used; + u32 snd_cwnd_stamp; + u32 prior_cwnd; + u32 prr_delivered; + u32 last_oow_ack_time; + /* struct hrtimer pacing_timer; */ + /* struct hrtimer compressed_ack_timer; */ + struct sk_buff *ooo_last_skb; + /* struct tcp_sack_block duplicate_sack[1]; */ + /* struct tcp_sack_block selective_acks[4]; */ + /* struct tcp_sack_block recv_sack_cache[4]; */ + int lost_cnt_hint; + u32 prior_ssthresh; + u32 high_seq; + u32 retrans_stamp; + u32 undo_marker; + int undo_retrans; + u64 bytes_retrans; + u32 total_retrans; + u32 rto_stamp; + u16 total_rto; + u16 total_rto_recoveries; + u32 total_rto_time; + u32 urg_seq; + unsigned int keepalive_time; + unsigned int keepalive_intvl; + int linger2; + u8 bpf_sock_ops_cb_flags; + u8 bpf_chg_cc_inprogress: 1; + u16 timeout_rehash; + u32 rcv_ooopack; + struct { + u32 probe_seq_start; + u32 probe_seq_end; + } mtu_probe; + u32 plb_rehash; + u32 mtu_info; + bool is_mptcp; +}; + + #endif /* __VMLINUX_NET_H__ */ diff --git a/netstacklat/netstacklat.bpf.c b/netstacklat/netstacklat.bpf.c index b0a53ccf..44882d10 100644 --- a/netstacklat/netstacklat.bpf.c +++ b/netstacklat/netstacklat.bpf.c @@ -11,6 +11,10 @@ #define READ_ONCE(x) (*(volatile typeof(x) *)&(x)) +// Mimic macros from /include/net/tcp.h +#define tcp_sk(ptr) container_of(ptr, struct tcp_sock, inet_conn.icsk_inet.sk) +#define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) + char LICENSE[] SEC("license") = "GPL"; @@ -23,6 +27,7 @@ volatile const struct netstacklat_bpf_config user_config = { .filter_cgroup = false, .groupby_ifindex = false, .groupby_cgroup = false, + .include_hol_blocked = false, }; /* @@ -38,6 +43,13 @@ struct sk_buff___old { __u8 mono_delivery_time: 1; } __attribute__((preserve_access_index)); +struct tcp_sock_ooo_range { + u32 prev_n_ooopkts; + u32 ooo_seq_end; + /* indicates if ooo_seq_end is still valid (as 0 can be valid seq) */ + bool active; +}; + struct { __uint(type, BPF_MAP_TYPE_PERCPU_HASH); __uint(max_entries, HIST_NBUCKETS * NETSTACKLAT_N_HOOKS * 64); @@ -66,6 +78,22 @@ struct { __type(value, u64); } netstack_cgroupfilter SEC(".maps"); +struct { + __uint(type, BPF_MAP_TYPE_SK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, struct tcp_sock_ooo_range); +} netstack_tcp_ooo_range SEC(".maps"); + +/* + * Is a < b considering u32 wrap around? 
+ * Based on the before() function in /include/net/tcp.h + */ +static bool u32_lt(u32 a, u32 b) +{ + return (s32)(a - b) < 0; +} + static u64 *lookup_or_zeroinit_histentry(void *map, const struct hist_key *key) { u64 zero = 0; @@ -302,6 +330,102 @@ static bool filter_min_sockqueue_len(struct sock *sk) return false; } +static int current_max_possible_ooo_seq(struct tcp_sock *tp, u32 *seq) +{ + struct tcp_skb_cb cb; + u32 max_seq = 0; + int err = 0; + + if (BPF_CORE_READ(tp, out_of_order_queue.rb_node) == NULL) { + /* No ooo-segments currently in ooo-queue + * Any ooo-segments must already have been merged to the + * receive queue. Current rcv_nxt must therefore be ahead + * of all ooo-segments that have arrived until now. + */ + err = bpf_core_read(&max_seq, sizeof(max_seq), &tp->rcv_nxt); + if (err) + bpf_printk("failed to read tcp_sock->rcv_nxt, err=%d", + err); + } else { + /* + * Some ooo-segments currently in ooo-queue + * Max out-of-order seq is given by the seq_end of the tail + * skb in the ooo-queue. + */ + err = BPF_CORE_READ_INTO(&cb, tp, ooo_last_skb, cb); + if (err) + bpf_printk( + "failed to read tcp_sock->ooo_last_skb->cb, err=%d", + err); + max_seq = cb.end_seq; + } + + *seq = max_seq; + return err; +} + +static bool tcp_read_in_ooo_range(struct tcp_sock *tp, + struct tcp_sock_ooo_range *ooo_range) +{ + u32 read_seq; + int err; + + if (!ooo_range->active) + return false; + + err = bpf_core_read(&read_seq, sizeof(read_seq), &tp->copied_seq); + if (err) { + bpf_printk("failed to read tcp_sock->copied_seq, err=%d", err); + return true; // Assume we may be in ooo-range + } + + if (u32_lt(ooo_range->ooo_seq_end, read_seq)) { + ooo_range->active = false; + return false; + } else { + return true; + } +} + +static bool tcp_read_maybe_holblocked(struct sock *sk) +{ + struct tcp_sock_ooo_range *ooo_range; + struct tcp_sock *tp = tcp_sk(sk); + u32 n_ooopkts, nxt_seq; + int err; + + err = bpf_core_read(&n_ooopkts, sizeof(n_ooopkts), &tp->rcv_ooopack); + if (err) { + bpf_printk("failed to read tcp_sock->rcv_ooopack, err=%d\n", + err); + return true; // Assume we may be in ooo-range + } + + if (n_ooopkts == 0) + return false; + + ooo_range = bpf_sk_storage_get(&netstack_tcp_ooo_range, sk, NULL, + BPF_SK_STORAGE_GET_F_CREATE); + if (!ooo_range) { + bpf_printk( + "failed getting ooo-range socket storage for tcp socket"); + return true; // Assume we may be in ooo-range + } + + // Increase in ooo-packets since last - figure out next safe seq + if (n_ooopkts > ooo_range->prev_n_ooopkts) { + ooo_range->prev_n_ooopkts = n_ooopkts; + err = current_max_possible_ooo_seq(tp, &nxt_seq); + if (!err) { + ooo_range->ooo_seq_end = nxt_seq; + ooo_range->active = true; + } + return true; + } + + return tcp_read_in_ooo_range(tp, ooo_range); +} + static void record_socket_latency(struct sock *sk, struct sk_buff *skb, ktime_t tstamp, enum netstacklat_hook hook) { @@ -393,6 +517,10 @@ int BPF_PROG(netstacklat_tcp_recv_timestamp, void *msg, struct sock *sk, struct scm_timestamping_internal *tss) { struct timespec64 *ts = &tss->ts[0]; + + if (!user_config.include_hol_blocked && tcp_read_maybe_holblocked(sk)) + return 0; + record_socket_latency(sk, NULL, (ktime_t)ts->tv_sec * NS_PER_S + ts->tv_nsec, NETSTACKLAT_HOOK_TCP_SOCK_READ); diff --git a/netstacklat/netstacklat.c b/netstacklat/netstacklat.c index 70dd4111..dfda239c 100644 --- a/netstacklat/netstacklat.c +++ b/netstacklat/netstacklat.c @@ -83,18 +83,19 @@ struct netstacklat_config { }; static const struct option long_options[] = { - { "help", no_argument, 
NULL, 'h' }, - { "report-interval", required_argument, NULL, 'r' }, - { "list-probes", no_argument, NULL, 'l' }, - { "enable-probes", required_argument, NULL, 'e' }, - { "disable-probes", required_argument, NULL, 'd' }, - { "pids", required_argument, NULL, 'p' }, - { "interfaces", required_argument, NULL, 'i' }, - { "network-namespace", required_argument, NULL, 'n' }, - { "cgroups", required_argument, NULL, 'c' }, - { "min-queuelength", required_argument, NULL, 'q' }, - { "groupby-interface", no_argument, NULL, 'I' }, - { "groupby-cgroup", no_argument, NULL, 'C' }, + { "help", no_argument, NULL, 'h' }, + { "report-interval", required_argument, NULL, 'r' }, + { "list-probes", no_argument, NULL, 'l' }, + { "enable-probes", required_argument, NULL, 'e' }, + { "disable-probes", required_argument, NULL, 'd' }, + { "pids", required_argument, NULL, 'p' }, + { "interfaces", required_argument, NULL, 'i' }, + { "network-namespace", required_argument, NULL, 'n' }, + { "cgroups", required_argument, NULL, 'c' }, + { "min-queuelength", required_argument, NULL, 'q' }, + { "groupby-interface", no_argument, NULL, 'I' }, + { "groupby-cgroup", no_argument, NULL, 'C' }, + { "include-tcp-hol-delay", no_argument, NULL, 'y' }, { 0, 0, 0, 0 } }; @@ -564,6 +565,7 @@ static int parse_arguments(int argc, char *argv[], conf->bpf_conf.filter_cgroup = false; conf->bpf_conf.groupby_ifindex = false; conf->bpf_conf.groupby_cgroup = false; + conf->bpf_conf.include_hol_blocked = false; for (i = 0; i < NETSTACKLAT_N_HOOKS; i++) // All probes enabled by default @@ -658,6 +660,9 @@ static int parse_arguments(int argc, char *argv[], case 'C': // groupby-cgroup conf->bpf_conf.groupby_cgroup = true; break; + case 'y': // include-tcp-hol-delay + conf->bpf_conf.include_hol_blocked = true; + break; case 'h': // help print_usage(stdout, argv[0]); exit(EXIT_SUCCESS); diff --git a/netstacklat/netstacklat.h b/netstacklat/netstacklat.h index d0da8553..d1708ce4 100644 --- a/netstacklat/netstacklat.h +++ b/netstacklat/netstacklat.h @@ -77,6 +77,7 @@ struct netstacklat_bpf_config { bool filter_cgroup; bool groupby_ifindex; bool groupby_cgroup; + bool include_hol_blocked; }; #endif From c8d50c221b54d54d0ef56e1dce0ce49e0ed03277 Mon Sep 17 00:00:00 2001 From: Simon Sundberg Date: Mon, 27 Oct 2025 15:23:23 +0100 Subject: [PATCH 3/4] netstacklat: Add sanity check for out-of-order sequence The logic for excluding samples from TCP reads that may have been delayed by HOL blocking relies on reading a number of fields from the TCP socket outside of the socket lock. This may be prone to errors due to the socket state being updated at another place in the kernel while our eBPF program is running. To reduce the risk that a data race causes the filter to fail, add a sanity check for the maximum out of order sequence used to exclude future TCP reads from monitoring. The most problematic of the read fields in the tcp_sock is ooo_last_skb, as that is a pointer to another SKB rather than a direct value. This pointer is only valid as long as the out_of_order_queue is non-empty. Due to a data race, we may check that the ooo-queue is non-empty while there are still SKBs in it, then have the kernel clear out the ooo-queue, and finally attempt to read the ooo_last_skb pointer later when it is no longer valid (and may now point to a freed/recycled SKB). This may result in incorrect values being used for the sequence limit used to exclude future reads of ooo-segments. 
The faulty sequence limit may either cause reads of HOL-blocked
segments to be included, or cause an unnecessarily large amount of
future reads (up to 2 GB) to be excluded.

To reduce the risk that garbage data from an invalid SKB is used,
introduce two sanity checks for end_seq in the ooo_last_skb. First,
check if the sequence number is zero, and if so assume it is invalid
(even though zero can be a valid sequence number). Even though we will
get an error code if reading the data from this SKB fails altogether,
we may still succeed in reading from a no-longer-valid SKB, in which
case there is a high risk that the data will have been zeroed. If it is
non-zero, also check that it is within the current receive window (and
if not, clamp it to the receive window).

Signed-off-by: Simon Sundberg
---
 netstacklat/netstacklat.bpf.c | 63 +++++++++++++++++++++++++++++++----
 1 file changed, 56 insertions(+), 7 deletions(-)

diff --git a/netstacklat/netstacklat.bpf.c b/netstacklat/netstacklat.bpf.c
index 44882d10..918dd507 100644
--- a/netstacklat/netstacklat.bpf.c
+++ b/netstacklat/netstacklat.bpf.c
@@ -330,22 +330,56 @@ static bool filter_min_sockqueue_len(struct sock *sk)
 	return false;
 }
 
+/* Get the current receive window end sequence for tp
+ * In the kernel, receive window checks are done against
+ * tp->rcv_nxt + tcp_receive_window(tp). This function should give a comparable
+ * result, i.e. rcv_wup + rcv_wnd or rcv_nxt, whichever is higher
+ */
+static int get_current_rcv_wnd_seq(struct tcp_sock *tp, u32 rcv_nxt, u32 *seq)
+{
+	u32 rcv_wup, rcv_wnd, window = 0;
+	int err;
+
+	err = bpf_core_read(&rcv_wup, sizeof(rcv_wup), &tp->rcv_wup);
+	if (err) {
+		bpf_printk("failed to read tcp_sock->rcv_wup, err=%d", err);
+		goto exit;
+	}
+
+	err = bpf_core_read(&rcv_wnd, sizeof(rcv_wnd), &tp->rcv_wnd);
+	if (err) {
+		bpf_printk("failed to read tcp_sock->rcv_wnd, err=%d", err);
+		goto exit;
+	}
+
+	window = rcv_wup + rcv_wnd;
+	if (u32_lt(window, rcv_nxt))
+		window = rcv_nxt;
+
+exit:
+	*seq = window;
+	return err;
+}
+
 static int current_max_possible_ooo_seq(struct tcp_sock *tp, u32 *seq)
 {
+	u32 rcv_nxt, cur_rcv_window, max_seq = 0;
 	struct tcp_skb_cb cb;
-	u32 max_seq = 0;
 	int err = 0;
 
+	err = bpf_core_read(&rcv_nxt, sizeof(rcv_nxt), &tp->rcv_nxt);
+	if (err) {
+		bpf_printk("failed reading tcp_sock->rcv_nxt, err=%d", err);
+		goto exit;
+	}
+
 	if (BPF_CORE_READ(tp, out_of_order_queue.rb_node) == NULL) {
 		/* No ooo-segments currently in ooo-queue
 		 * Any ooo-segments must already have been merged to the
 		 * receive queue. Current rcv_nxt must therefore be ahead
 		 * of all ooo-segments that have arrived until now.
 		 */
-		err = bpf_core_read(&max_seq, sizeof(max_seq), &tp->rcv_nxt);
-		if (err)
-			bpf_printk("failed to read tcp_sock->rcv_nxt, err=%d",
-				   err);
+		max_seq = rcv_nxt;
 	} else {
 		/*
 		 * Some ooo-segments currently in ooo-queue
@@ -353,13 +387,28 @@ static int current_max_possible_ooo_seq(struct tcp_sock *tp, u32 *seq)
 		 * skb in the ooo-queue.
 		 */
 		err = BPF_CORE_READ_INTO(&cb, tp, ooo_last_skb, cb);
-		if (err)
+		if (err) {
 			bpf_printk(
 				"failed to read tcp_sock->ooo_last_skb->cb, err=%d",
 				err);
-		max_seq = cb.end_seq;
+			goto exit;
+		}
+
+		// Sanity check - ooo_last_skb->cb.end_seq within the receive window?
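+		// If ooo_last_skb was freed between the emptiness check above
+		// and this read, cb.end_seq is likely either zeroed or far
+		// outside the receive window, so bound it by the current
+		// window computed below.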
+		err = get_current_rcv_wnd_seq(tp, rcv_nxt, &cur_rcv_window);
+		if (err)
+			goto exit;
+
+		/* While seq 0 can be a valid seq, consider it more likely to
+		 * be the result of reading from an invalid SKB pointer
+		 */
+		if (cb.end_seq == 0 || u32_lt(cur_rcv_window, cb.end_seq))
+			max_seq = cur_rcv_window;
+		else
+			max_seq = cb.end_seq;
 	}
 
+exit:
 	*seq = max_seq;
 	return err;
 }

From 24893487e890d0ee23042c03bc0957cf478b70db Mon Sep 17 00:00:00 2001
From: Simon Sundberg
Date: Mon, 27 Oct 2025 16:20:44 +0100
Subject: [PATCH 4/4] netstacklat: Add sanity checks for rcv_nxt and copied_seq

In addition to the out-of-order sequence number in the previous commit,
two more key members that are read from the tcp_sock (outside of the
socket lock) are rcv_nxt and copied_seq. Add sanity checks for these
two members to ensure that they are monotonically increasing (in u32
wrap-around space).

To enable this sanity check, track the last seen (sane) value for both
of them together with the other ooo-state in the socket storage map. At
each read, compare the values recorded at the last read with the
current values to determine whether the current values are ahead of the
previously seen ones or not.

Unlike the ooo sequence number, do not consider sequence 0 invalid for
these checks. As they are direct members of the tcp_sock, their values
should always be valid (although possibly concurrently updated
elsewhere) as long as the probe read succeeds (and failure is directly
detected from the return value of bpf_core_read()).

Skip adding a similar monotonic growth check for the rcv_wup field to
avoid also having to probe and update that value every time. For the
rcv_wnd field, I am not aware of any simple validity checks that can be
performed.

Signed-off-by: Simon Sundberg
---
 netstacklat/netstacklat.bpf.c | 89 ++++++++++++++++++++++++++---------
 1 file changed, 66 insertions(+), 23 deletions(-)

diff --git a/netstacklat/netstacklat.bpf.c b/netstacklat/netstacklat.bpf.c
index 918dd507..5ca122cd 100644
--- a/netstacklat/netstacklat.bpf.c
+++ b/netstacklat/netstacklat.bpf.c
@@ -1,6 +1,7 @@
 /* SPDX-License-Identifier: GPL-2.0-or-later */
 #include "vmlinux_local.h"
 #include
+#include
 #include
 #include
 
@@ -46,6 +47,8 @@ struct sk_buff___old {
 struct tcp_sock_ooo_range {
 	u32 prev_n_ooopkts;
 	u32 ooo_seq_end;
+	u32 last_rcv_nxt;
+	u32 last_copied_seq;
 	/* indicates if ooo_seq_end is still valid (as 0 can be valid seq) */
 	bool active;
 };
 
@@ -361,18 +364,13 @@ static int get_current_rcv_wnd_seq(struct tcp_sock *tp, u32 rcv_nxt, u32 *seq)
 	return err;
 }
 
-static int current_max_possible_ooo_seq(struct tcp_sock *tp, u32 *seq)
+static int current_max_possible_ooo_seq(struct tcp_sock *tp, u32 rcv_nxt,
+					u32 *seq)
 {
-	u32 rcv_nxt, cur_rcv_window, max_seq = 0;
+	u32 cur_rcv_window, max_seq = 0;
 	struct tcp_skb_cb cb;
 	int err = 0;
 
-	err = bpf_core_read(&rcv_nxt, sizeof(rcv_nxt), &tp->rcv_nxt);
-	if (err) {
-		bpf_printk("failed reading tcp_sock->rcv_nxt, err=%d", err);
-		goto exit;
-	}
-
 	if (BPF_CORE_READ(tp, out_of_order_queue.rb_node) == NULL) {
 		/* No ooo-segments currently in ooo-queue
 		 * Any ooo-segments must already have been merged to the
@@ -413,22 +411,13 @@ static int current_max_possible_ooo_seq(struct tcp_sock *tp, u32 *seq)
 	return err;
 }
 
-static bool tcp_read_in_ooo_range(struct tcp_sock *tp,
+static bool tcp_read_in_ooo_range(struct tcp_sock *tp, u32 copied_seq,
 				  struct tcp_sock_ooo_range *ooo_range)
 {
-	u32 read_seq;
-	int err;
-
 	if (!ooo_range->active)
 		return false;
 
-	err = bpf_core_read(&read_seq, sizeof(read_seq), &tp->copied_seq);
-	if (err) {
-		bpf_printk("failed to read tcp_sock->copied_seq, err=%d", err);
-		return true; // Assume we may be in ooo-range
-	}
-
-	if (u32_lt(ooo_range->ooo_seq_end, read_seq)) {
+	if (u32_lt(ooo_range->ooo_seq_end, copied_seq)) {
 		ooo_range->active = false;
 		return false;
 	} else {
@@ -436,12 +425,54 @@ static bool tcp_read_in_ooo_range(struct tcp_sock *tp,
 	}
 }
 
+static int get_and_validate_rcvnxt(struct tcp_sock *tp,
+				   struct tcp_sock_ooo_range *ooo_range,
+				   u32 *rcvnxt)
+{
+	u32 rcv_nxt = 0;
+	int err;
+
+	err = bpf_core_read(&rcv_nxt, sizeof(rcv_nxt), &tp->rcv_nxt);
+	if (err || (ooo_range->last_rcv_nxt &&
+		    u32_lt(rcv_nxt, ooo_range->last_rcv_nxt))) {
+		bpf_printk("failed to read valid tcp_sock->rcv_nxt, err=%d",
+			   err);
+		err = err ?: -ERANGE;
+	} else {
+		ooo_range->last_rcv_nxt = rcv_nxt;
+	}
+
+	*rcvnxt = rcv_nxt;
+	return err;
+}
+
+static int get_and_validate_copiedseq(struct tcp_sock *tp,
+				      struct tcp_sock_ooo_range *ooo_range,
+				      u32 *copiedseq)
+{
+	u32 copied_seq = 0;
+	int err;
+
+	err = bpf_core_read(&copied_seq, sizeof(copied_seq), &tp->copied_seq);
+	if (err || (ooo_range->last_copied_seq &&
+		    u32_lt(copied_seq, ooo_range->last_copied_seq))) {
+		bpf_printk("failed to read valid tcp_sock->copied_seq, err=%d",
+			   err);
+		err = err ?: -ERANGE;
+	} else {
+		ooo_range->last_copied_seq = copied_seq;
+	}
+
+	*copiedseq = copied_seq;
+	return err;
+}
+
 static bool tcp_read_maybe_holblocked(struct sock *sk)
 {
+	u32 n_ooopkts, rcv_nxt, copied_seq, nxt_seq;
 	struct tcp_sock_ooo_range *ooo_range;
+	int err, err_rcvnxt, err_copiedseq;
 	struct tcp_sock *tp = tcp_sk(sk);
-	u32 n_ooopkts, nxt_seq;
-	int err;
 
 	err = bpf_core_read(&n_ooopkts, sizeof(n_ooopkts), &tp->rcv_ooopack);
 	if (err) {
@@ -461,18 +492,30 @@ static bool tcp_read_maybe_holblocked(struct sock *sk)
 		return true; // Assume we may be in ooo-range
 	}
 
+	/* rcv_nxt and copied_seq may not be needed, but to ensure we always
+	 * update our tracked state for them, read, sanity check and update
+	 * both their values here. Errors are only checked for in the paths
+	 * where the values are actually needed.
+	 */
+	err_rcvnxt = get_and_validate_rcvnxt(tp, ooo_range, &rcv_nxt);
+	err_copiedseq = get_and_validate_copiedseq(tp, ooo_range, &copied_seq);
+
 	// Increase in ooo-packets since last - figure out next safe seq
 	if (n_ooopkts > ooo_range->prev_n_ooopkts) {
 		ooo_range->prev_n_ooopkts = n_ooopkts;
-		err = current_max_possible_ooo_seq(tp, &nxt_seq);
+		err = err_rcvnxt ?:
+			      current_max_possible_ooo_seq(tp, rcv_nxt,
+							   &nxt_seq);
 		if (!err) {
 			ooo_range->ooo_seq_end = nxt_seq;
 			ooo_range->active = true;
 		}
+
 		return true;
 	}
 
-	return tcp_read_in_ooo_range(tp, ooo_range);
+	return err_copiedseq ? true :
+			       tcp_read_in_ooo_range(tp, copied_seq, ooo_range);
 }
 
 static void record_socket_latency(struct sock *sk, struct sk_buff *skb,