Blame SOURCES/0001-ss-add-support-for-BPF-socket-local-storage.patch

c8e1db
From 8740ca9dcd3ccf1c75c362900cb3218ae3204cf5 Mon Sep 17 00:00:00 2001
c8e1db
From: Quentin Deslandes <qde@naccy.de>
c8e1db
Date: Wed, 21 Feb 2024 16:16:19 +0100
c8e1db
Subject: [PATCH] ss: add support for BPF socket-local storage
c8e1db
c8e1db
While sock_diag is able to return BPF socket-local storage in response
c8e1db
to INET_DIAG_REQ_SK_BPF_STORAGES requests, ss doesn't request it.
c8e1db
c8e1db
This change introduces the --bpf-maps and --bpf-map-id= options to request
c8e1db
BPF socket-local storage for all SK_STORAGE maps, or only specific ones.
c8e1db
c8e1db
The bigger part of this change will check the requested map IDs and
c8e1db
ensure they are valid. The column COL_EXT is used to print the
c8e1db
socket-local data into.
c8e1db
c8e1db
When --bpf-maps is used, ss will send an empty
c8e1db
INET_DIAG_REQ_SK_BPF_STORAGES request, in return the kernel will send
c8e1db
all the BPF socket-local storage entries for a given socket. The BTF
c8e1db
data for each map is loaded on demand, as ss can't predict which map IDs
c8e1db
are used.
c8e1db
c8e1db
When --bpf-map-id=ID is used, a file descriptor to each requested map is
c8e1db
opened to 1) ensure the map doesn't disappear before the data is printed,
c8e1db
and 2) ensure the map type is BPF_MAP_TYPE_SK_STORAGE. The BTF data for
c8e1db
each requested map is loaded before the request is sent to the kernel.
c8e1db
c8e1db
Co-developed-by: Martin KaFai Lau <martin.lau@kernel.org>
c8e1db
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
c8e1db
Signed-off-by: Quentin Deslandes <qde@naccy.de>
c8e1db
Signed-off-by: David Ahern <dsahern@kernel.org>
c8e1db
---
c8e1db
 misc/ss.c | 272 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
c8e1db
 1 file changed, 269 insertions(+), 3 deletions(-)
c8e1db
c8e1db
diff --git a/misc/ss.c b/misc/ss.c
c8e1db
index 72a841be..2c7e7c58 100644
c8e1db
--- a/misc/ss.c
c8e1db
+++ b/misc/ss.c
c8e1db
@@ -51,6 +51,24 @@
c8e1db
 #include <linux/tls.h>
c8e1db
 #include <linux/mptcp.h>
c8e1db
 
c8e1db
+#ifdef HAVE_LIBBPF
c8e1db
+/* If libbpf is new enough (0.5+), support for pretty-printing BPF socket-local
c8e1db
+ * storage is enabled, otherwise we emit a warning and disable it.
c8e1db
+ * ENABLE_BPF_SKSTORAGE_SUPPORT is only used to gate the socket-local storage
c8e1db
+ * feature, so this wouldn't prevent any feature relying on HAVE_LIBBPF to be
c8e1db
+ * usable.
c8e1db
+ */
c8e1db
+#define ENABLE_BPF_SKSTORAGE_SUPPORT
c8e1db
+
c8e1db
+#include <bpf/bpf.h>
c8e1db
+#include <bpf/libbpf.h>
c8e1db
+
c8e1db
+#if (LIBBPF_MAJOR_VERSION == 0) && (LIBBPF_MINOR_VERSION < 5)
c8e1db
+#warning "libbpf version 0.5 or later is required, disabling BPF socket-local storage support"
c8e1db
+#undef ENABLE_BPF_SKSTORAGE_SUPPORT
c8e1db
+#endif
c8e1db
+#endif
c8e1db
+
c8e1db
 #if HAVE_RPC
c8e1db
 #include <rpc/rpc.h>
c8e1db
 #include <rpc/xdr.h>
c8e1db
@@ -3384,6 +3402,202 @@ static void parse_diag_msg(struct nlmsghdr *nlh, struct sockstat *s)
c8e1db
 	memcpy(s->remote.data, r->id.idiag_dst, s->local.bytelen);
c8e1db
 }
c8e1db
 
c8e1db
+#ifdef ENABLE_BPF_SKSTORAGE_SUPPORT
c8e1db
+
c8e1db
+#define MAX_NR_BPF_MAP_ID_OPTS 32
c8e1db
+
c8e1db
+struct btf;
c8e1db
+
c8e1db
+static struct bpf_map_opts {
c8e1db
+	unsigned int nr_maps;
c8e1db
+	struct bpf_sk_storage_map_info {
c8e1db
+		unsigned int id;
c8e1db
+		int fd;
c8e1db
+	} maps[MAX_NR_BPF_MAP_ID_OPTS];
c8e1db
+	bool show_all;
c8e1db
+} bpf_map_opts;
c8e1db
+
c8e1db
+static void bpf_map_opts_mixed_error(void)
c8e1db
+{
c8e1db
+	fprintf(stderr,
c8e1db
+		"ss: --bpf-maps and --bpf-map-id cannot be used together\n");
c8e1db
+}
c8e1db
+
c8e1db
+static int bpf_map_opts_load_info(unsigned int map_id)
c8e1db
+{
c8e1db
+	struct bpf_map_info info = {};
c8e1db
+	uint32_t len = sizeof(info);
c8e1db
+	int fd;
c8e1db
+	int r;
c8e1db
+
c8e1db
+	if (bpf_map_opts.nr_maps == MAX_NR_BPF_MAP_ID_OPTS) {
c8e1db
+		fprintf(stderr,
c8e1db
+			"ss: too many (> %u) BPF socket-local storage maps found, skipping map ID %u\n",
c8e1db
+			MAX_NR_BPF_MAP_ID_OPTS, map_id);
c8e1db
+		return 0;
c8e1db
+	}
c8e1db
+
c8e1db
+	fd = bpf_map_get_fd_by_id(map_id);
c8e1db
+	if (fd < 0) {
c8e1db
+		if (errno == -ENOENT)
c8e1db
+			return 0;
c8e1db
+
c8e1db
+		fprintf(stderr, "ss: cannot get fd for BPF map ID %u%s\n",
c8e1db
+			map_id, errno == EPERM ?
c8e1db
+			": missing root permissions, CAP_BPF, or CAP_SYS_ADMIN" : "");
c8e1db
+		return -1;
c8e1db
+	}
c8e1db
+
c8e1db
+	r = bpf_obj_get_info_by_fd(fd, &info, &len);
c8e1db
+	if (r) {
c8e1db
+		fprintf(stderr, "ss: failed to get info for BPF map ID %u\n",
c8e1db
+			map_id);
c8e1db
+		close(fd);
c8e1db
+		return -1;
c8e1db
+	}
c8e1db
+
c8e1db
+	if (info.type != BPF_MAP_TYPE_SK_STORAGE) {
c8e1db
+		fprintf(stderr,
c8e1db
+			"ss: BPF map with ID %s has type ID %d, expecting %d ('sk_storage')\n",
c8e1db
+			optarg, info.type, BPF_MAP_TYPE_SK_STORAGE);
c8e1db
+		close(fd);
c8e1db
+		return -1;
c8e1db
+	}
c8e1db
+
c8e1db
+	bpf_map_opts.maps[bpf_map_opts.nr_maps].id = map_id;
c8e1db
+	bpf_map_opts.maps[bpf_map_opts.nr_maps++].fd = fd;
c8e1db
+
c8e1db
+	return 0;
c8e1db
+}
c8e1db
+
c8e1db
+static struct bpf_sk_storage_map_info *bpf_map_opts_get_info(
c8e1db
+	unsigned int map_id)
c8e1db
+{
c8e1db
+	unsigned int i;
c8e1db
+	int r;
c8e1db
+
c8e1db
+	for (i = 0; i < bpf_map_opts.nr_maps; ++i) {
c8e1db
+		if (bpf_map_opts.maps[i].id == map_id)
c8e1db
+			return &bpf_map_opts.maps[i];
c8e1db
+	}
c8e1db
+
c8e1db
+	r = bpf_map_opts_load_info(map_id);
c8e1db
+	if (r)
c8e1db
+		return NULL;
c8e1db
+
c8e1db
+	return &bpf_map_opts.maps[bpf_map_opts.nr_maps - 1];
c8e1db
+}
c8e1db
+
c8e1db
+static int bpf_map_opts_add_id(const char *optarg)
c8e1db
+{
c8e1db
+	size_t optarg_len;
c8e1db
+	unsigned long id;
c8e1db
+	char *end;
c8e1db
+
c8e1db
+	if (bpf_map_opts.show_all) {
c8e1db
+		bpf_map_opts_mixed_error();
c8e1db
+		return -1;
c8e1db
+	}
c8e1db
+
c8e1db
+	optarg_len = strlen(optarg);
c8e1db
+	id = strtoul(optarg, &end, 0);
c8e1db
+	if (end != optarg + optarg_len || id == 0 || id >= UINT32_MAX) {
c8e1db
+		fprintf(stderr, "ss: invalid BPF map ID %s\n", optarg);
c8e1db
+		return -1;
c8e1db
+	}
c8e1db
+
c8e1db
+	/* Force lazy loading of the map's data. */
c8e1db
+	if (!bpf_map_opts_get_info(id))
c8e1db
+		return -1;
c8e1db
+
c8e1db
+	return 0;
c8e1db
+}
c8e1db
+
c8e1db
+static void bpf_map_opts_destroy(void)
c8e1db
+{
c8e1db
+	int i;
c8e1db
+
c8e1db
+	for (i = 0; i < bpf_map_opts.nr_maps; ++i)
c8e1db
+		close(bpf_map_opts.maps[i].fd);
c8e1db
+}
c8e1db
+
c8e1db
+static struct rtattr *bpf_map_opts_alloc_rta(void)
c8e1db
+{
c8e1db
+	struct rtattr *stgs_rta, *fd_rta;
c8e1db
+	size_t total_size;
c8e1db
+	unsigned int i;
c8e1db
+	void *buf;
c8e1db
+
c8e1db
+	/* If bpf_map_opts.show_all == true, we will send an empty message to
c8e1db
+	 * the kernel, which will return all the socket-local data attached to
c8e1db
+	 * a socket, no matter their map ID
c8e1db
+	 */
c8e1db
+	if (bpf_map_opts.show_all) {
c8e1db
+		total_size = RTA_LENGTH(0);
c8e1db
+	} else {
c8e1db
+		total_size = RTA_LENGTH(RTA_LENGTH(sizeof(int)) *
c8e1db
+					bpf_map_opts.nr_maps);
c8e1db
+	}
c8e1db
+
c8e1db
+	buf = malloc(total_size);
c8e1db
+	if (!buf)
c8e1db
+		return NULL;
c8e1db
+
c8e1db
+	stgs_rta = buf;
c8e1db
+	stgs_rta->rta_type = INET_DIAG_REQ_SK_BPF_STORAGES | NLA_F_NESTED;
c8e1db
+	stgs_rta->rta_len = total_size;
c8e1db
+
c8e1db
+	/* If inet_show_netlink() retries fetching socket data, nr_maps might
c8e1db
+	 * be different from 0, even with show_all == true, so we return early
c8e1db
+	 * to avoid inserting specific map IDs into the request.
c8e1db
+	 */
c8e1db
+	if (bpf_map_opts.show_all)
c8e1db
+		return stgs_rta;
c8e1db
+
c8e1db
+	buf = RTA_DATA(stgs_rta);
c8e1db
+	for (i = 0; i < bpf_map_opts.nr_maps; i++) {
c8e1db
+		int *fd;
c8e1db
+
c8e1db
+		fd_rta = buf;
c8e1db
+		fd_rta->rta_type = SK_DIAG_BPF_STORAGE_REQ_MAP_FD;
c8e1db
+		fd_rta->rta_len = RTA_LENGTH(sizeof(int));
c8e1db
+
c8e1db
+		fd = RTA_DATA(fd_rta);
c8e1db
+		*fd = bpf_map_opts.maps[i].fd;
c8e1db
+
c8e1db
+		buf += fd_rta->rta_len;
c8e1db
+	}
c8e1db
+
c8e1db
+	return stgs_rta;
c8e1db
+}
c8e1db
+
c8e1db
+static void show_sk_bpf_storages(struct rtattr *bpf_stgs)
c8e1db
+{
c8e1db
+	struct rtattr *tb[SK_DIAG_BPF_STORAGE_MAX + 1], *bpf_stg;
c8e1db
+	unsigned int rem;
c8e1db
+
c8e1db
+	for (bpf_stg = RTA_DATA(bpf_stgs), rem = RTA_PAYLOAD(bpf_stgs);
c8e1db
+		RTA_OK(bpf_stg, rem); bpf_stg = RTA_NEXT(bpf_stg, rem)) {
c8e1db
+
c8e1db
+		if ((bpf_stg->rta_type & NLA_TYPE_MASK) != SK_DIAG_BPF_STORAGE)
c8e1db
+			continue;
c8e1db
+
c8e1db
+		parse_rtattr_nested(tb, SK_DIAG_BPF_STORAGE_MAX,
c8e1db
+				    (struct rtattr *)bpf_stg);
c8e1db
+
c8e1db
+		if (tb[SK_DIAG_BPF_STORAGE_MAP_ID]) {
c8e1db
+			out(" map_id:%u",
c8e1db
+			    rta_getattr_u32(tb[SK_DIAG_BPF_STORAGE_MAP_ID]));
c8e1db
+		}
c8e1db
+	}
c8e1db
+}
c8e1db
+
c8e1db
+static bool bpf_map_opts_is_enabled(void)
c8e1db
+{
c8e1db
+	return bpf_map_opts.nr_maps || bpf_map_opts.show_all;
c8e1db
+}
c8e1db
+#endif
c8e1db
+
c8e1db
 static int inet_show_sock(struct nlmsghdr *nlh,
c8e1db
 			  struct sockstat *s)
c8e1db
 {
c8e1db
@@ -3391,8 +3605,9 @@ static int inet_show_sock(struct nlmsghdr *nlh,
c8e1db
 	struct inet_diag_msg *r = NLMSG_DATA(nlh);
c8e1db
 	unsigned char v6only = 0;
c8e1db
 
c8e1db
-	parse_rtattr(tb, INET_DIAG_MAX, (struct rtattr *)(r+1),
c8e1db
-		     nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*r)));
c8e1db
+	parse_rtattr_flags(tb, INET_DIAG_MAX, (struct rtattr *)(r+1),
c8e1db
+			   nlh->nlmsg_len - NLMSG_LENGTH(sizeof(*r)),
c8e1db
+			   NLA_F_NESTED);
c8e1db
 
c8e1db
 	if (tb[INET_DIAG_PROTOCOL])
c8e1db
 		s->type = rta_getattr_u8(tb[INET_DIAG_PROTOCOL]);
c8e1db
@@ -3489,6 +3704,11 @@ static int inet_show_sock(struct nlmsghdr *nlh,
c8e1db
 	}
c8e1db
 	sctp_ino = s->ino;
c8e1db
 
c8e1db
+#ifdef ENABLE_BPF_SKSTORAGE_SUPPORT
c8e1db
+	if (tb[INET_DIAG_SK_BPF_STORAGES])
c8e1db
+		show_sk_bpf_storages(tb[INET_DIAG_SK_BPF_STORAGES]);
c8e1db
+#endif
c8e1db
+
c8e1db
 	return 0;
c8e1db
 }
c8e1db
 
c8e1db
@@ -3570,13 +3790,14 @@ static int sockdiag_send(int family, int fd, int protocol, struct filter *f)
c8e1db
 {
c8e1db
 	struct sockaddr_nl nladdr = { .nl_family = AF_NETLINK };
c8e1db
 	DIAG_REQUEST(req, struct inet_diag_req_v2 r);
c8e1db
+	struct rtattr *bpf_rta = NULL;
c8e1db
 	char    *bc = NULL;
c8e1db
 	int	bclen;
c8e1db
 	__u32	proto;
c8e1db
 	struct msghdr msg;
c8e1db
 	struct rtattr rta_bc;
c8e1db
 	struct rtattr rta_proto;
c8e1db
-	struct iovec iov[5];
c8e1db
+	struct iovec iov[6];
c8e1db
 	int iovlen = 1;
c8e1db
 
c8e1db
 	if (family == PF_UNSPEC)
c8e1db
@@ -3629,6 +3850,20 @@ static int sockdiag_send(int family, int fd, int protocol, struct filter *f)
c8e1db
 		iovlen += 2;
c8e1db
 	}
c8e1db
 
c8e1db
+#ifdef ENABLE_BPF_SKSTORAGE_SUPPORT
c8e1db
+	if (bpf_map_opts_is_enabled()) {
c8e1db
+		bpf_rta = bpf_map_opts_alloc_rta();
c8e1db
+		if (!bpf_rta) {
c8e1db
+			fprintf(stderr,
c8e1db
+				"ss: cannot alloc request for --bpf-map\n");
c8e1db
+			return -1;
c8e1db
+		}
c8e1db
+
c8e1db
+		iov[iovlen++] = (struct iovec){ bpf_rta, bpf_rta->rta_len };
c8e1db
+		req.nlh.nlmsg_len += bpf_rta->rta_len;
c8e1db
+	}
c8e1db
+#endif
c8e1db
+
c8e1db
 	msg = (struct msghdr) {
c8e1db
 		.msg_name = (void *)&nladdr,
c8e1db
 		.msg_namelen = sizeof(nladdr),
c8e1db
@@ -3637,10 +3872,13 @@ static int sockdiag_send(int family, int fd, int protocol, struct filter *f)
c8e1db
 	};
c8e1db
 
c8e1db
 	if (sendmsg(fd, &msg, 0) < 0) {
c8e1db
+		free(bpf_rta);
c8e1db
 		close(fd);
c8e1db
 		return -1;
c8e1db
 	}
c8e1db
 
c8e1db
+	free(bpf_rta);
c8e1db
+
c8e1db
 	return 0;
c8e1db
 }
c8e1db
 
c8e1db
@@ -5361,6 +5599,10 @@ static void _usage(FILE *dest)
c8e1db
 "       --tos           show tos and priority information\n"
c8e1db
 "       --cgroup        show cgroup information\n"
c8e1db
 "   -b, --bpf           show bpf filter socket information\n"
c8e1db
+#ifdef ENABLE_BPF_SKSTORAGE_SUPPORT
c8e1db
+"       --bpf-maps      show all BPF socket-local storage maps\n"
c8e1db
+"       --bpf-map-id=MAP-ID    show a BPF socket-local storage map\n"
c8e1db
+#endif
c8e1db
 "   -E, --events        continually display sockets as they are destroyed\n"
c8e1db
 "   -Z, --context       display task SELinux security contexts\n"
c8e1db
 "   -z, --contexts      display task and socket SELinux security contexts\n"
c8e1db
@@ -5487,6 +5729,9 @@ wrong_state:
c8e1db
 
c8e1db
 #define OPT_INET_SOCKOPT 262
c8e1db
 
c8e1db
+#define OPT_BPF_MAPS 263
c8e1db
+#define OPT_BPF_MAP_ID 264
c8e1db
+
c8e1db
 static const struct option long_opts[] = {
c8e1db
 	{ "numeric", 0, 0, 'n' },
c8e1db
 	{ "resolve", 0, 0, 'r' },
c8e1db
@@ -5533,6 +5778,10 @@ static const struct option long_opts[] = {
c8e1db
 	{ "mptcp", 0, 0, 'M' },
c8e1db
 	{ "oneline", 0, 0, 'O' },
c8e1db
 	{ "inet-sockopt", 0, 0, OPT_INET_SOCKOPT },
c8e1db
+#ifdef ENABLE_BPF_SKSTORAGE_SUPPORT
c8e1db
+	{ "bpf-maps", 0, 0, OPT_BPF_MAPS},
c8e1db
+	{ "bpf-map-id", 1, 0, OPT_BPF_MAP_ID},
c8e1db
+#endif
c8e1db
 	{ 0 }
c8e1db
 
c8e1db
 };
c8e1db
@@ -5741,6 +5990,19 @@ int main(int argc, char *argv[])
c8e1db
 		case OPT_INET_SOCKOPT:
c8e1db
 			show_inet_sockopt = 1;
c8e1db
 			break;
c8e1db
+#ifdef ENABLE_BPF_SKSTORAGE_SUPPORT
c8e1db
+		case OPT_BPF_MAPS:
c8e1db
+			if (bpf_map_opts.nr_maps) {
c8e1db
+				bpf_map_opts_mixed_error();
c8e1db
+				return -1;
c8e1db
+			}
c8e1db
+			bpf_map_opts.show_all = true;
c8e1db
+			break;
c8e1db
+		case OPT_BPF_MAP_ID:
c8e1db
+			if (bpf_map_opts_add_id(optarg))
c8e1db
+				exit(1);
c8e1db
+			break;
c8e1db
+#endif
c8e1db
 		case 'h':
c8e1db
 			help();
c8e1db
 		case '?':
c8e1db
@@ -5880,6 +6142,10 @@ int main(int argc, char *argv[])
c8e1db
 	if (show_processes || show_threads || show_proc_ctx || show_sock_ctx)
c8e1db
 		user_ent_destroy();
c8e1db
 
c8e1db
+#ifdef ENABLE_BPF_SKSTORAGE_SUPPORT
c8e1db
+	bpf_map_opts_destroy();
c8e1db
+#endif
c8e1db
+
c8e1db
 	render();
c8e1db
 
c8e1db
 	return 0;
c8e1db
-- 
c8e1db
2.41.0
c8e1db