Blame SOURCES/net-packet-make-tp_drops-atomic.patch

02818e
From 21d5e92b26cc9b90b522ef7dd03e5cf09167f1cc Mon Sep 17 00:00:00 2001
02818e
From: Artem Savkov <asavkov@redhat.com>
02818e
Date: Tue, 22 Sep 2020 15:48:56 +0200
02818e
Subject: [RHEL8.2 KPATCH v2] [net] packet: make tp_drops atomic
02818e
02818e
Kernels:
02818e
4.18.0-193.el8
02818e
4.18.0-193.1.2.el8_2
02818e
4.18.0-193.6.3.el8_2
02818e
4.18.0-193.13.2.el8_2
02818e
4.18.0-193.14.3.el8_2
02818e
4.18.0-193.19.1.el8_2
02818e
02818e
Changes since last build:
02818e
[x86_64]:
02818e
af_packet.o: changed function: packet_create
02818e
af_packet.o: changed function: packet_getsockopt
02818e
af_packet.o: changed function: packet_rcv
02818e
af_packet.o: changed function: packet_sock_destruct
02818e
af_packet.o: changed function: prb_retire_current_block
02818e
af_packet.o: changed function: tpacket_rcv
02818e
02818e
[ppc64le]:
02818e
af_packet.o: changed function: packet_create
02818e
af_packet.o: changed function: packet_getsockopt
02818e
af_packet.o: changed function: packet_rcv
02818e
af_packet.o: changed function: packet_sock_destruct
02818e
af_packet.o: changed function: prb_retire_current_block
02818e
af_packet.o: changed function: run_filter
02818e
af_packet.o: changed function: tpacket_rcv
02818e
02818e
---------------------------
02818e
02818e
Modifications:
02818e
 - bpf calls altered to avoid issues with jump labels
02818e
 - tp_drops as shadow variable
02818e
02818e
Testing: reproducer from bz
02818e
02818e
commit 1513be1efa2a836cb0f4309fcf1956df3faad34c
02818e
Author: Hangbin Liu <haliu@redhat.com>
02818e
Date:   Fri Sep 11 04:19:13 2020 -0400
02818e
02818e
    [net] packet: fix overflow in tpacket_rcv
02818e
02818e
    Message-id: <20200911041913.2808606-3-haliu@redhat.com>
02818e
    Patchwork-id: 326146
02818e
    Patchwork-instance: patchwork
02818e
    O-Subject: [CVE-2020-14386 RHEL8.3 net PATCH 2/2] net/packet: fix overflow in tpacket_rcv
02818e
    Bugzilla: 1876224
02818e
    Z-Bugzilla: 1876223
02818e
    CVE: CVE-2020-14386
02818e
    RH-Acked-by: Davide Caratti <dcaratti@redhat.com>
02818e
    RH-Acked-by: Marcelo Leitner <mleitner@redhat.com>
02818e
    RH-Acked-by: Jarod Wilson <jarod@redhat.com>
02818e
    RH-Acked-by: Paolo Abeni <pabeni@redhat.com>
02818e
    RH-Acked-by: Ivan Vecera <ivecera@redhat.com>
02818e
02818e
    Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1876224
02818e
    Brew: https://brewweb.devel.redhat.com/taskinfo?taskID=31276277
02818e
    Upstream Status: net.git commit acf69c946233
02818e
    CVE: CVE-2020-14386
02818e
02818e
    commit acf69c946233259ab4d64f8869d4037a198c7f06
02818e
    Author: Or Cohen <orcohen@paloaltonetworks.com>
02818e
    Date:   Thu Sep 3 21:05:28 2020 -0700
02818e
02818e
        net/packet: fix overflow in tpacket_rcv
02818e
02818e
        Using tp_reserve to calculate netoff can overflow as
02818e
        tp_reserve is unsigned int and netoff is unsigned short.
02818e
02818e
        This may lead to macoff receiving a smaller value than
02818e
        sizeof(struct virtio_net_hdr), and if po->has_vnet_hdr
02818e
        is set, an out-of-bounds write will occur when
02818e
        calling virtio_net_hdr_from_skb.
02818e
02818e
        The bug is fixed by converting netoff to unsigned int
02818e
        and checking if it exceeds USHRT_MAX.
02818e
02818e
        This addresses CVE-2020-14386
02818e
02818e
        Fixes: 8913336a7e8d ("packet: add PACKET_RESERVE sockopt")
02818e
        Signed-off-by: Or Cohen <orcohen@paloaltonetworks.com>
02818e
        Signed-off-by: Eric Dumazet <edumazet@google.com>
02818e
        Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
02818e
02818e
    Signed-off-by: Hangbin Liu <haliu@redhat.com>
02818e
    Signed-off-by: Timothy Redaelli <tredaelli@redhat.com>
02818e
    Signed-off-by: Bruno Meneguele <bmeneg@redhat.com>
02818e
02818e
commit 5d07c2093eec0b75b60f6087a6c1b1f79c46e20c
02818e
Author: Hangbin Liu <haliu@redhat.com>
02818e
Date:   Fri Sep 11 04:19:12 2020 -0400
02818e
02818e
    [net] packet: make tp_drops atomic
02818e
02818e
    Message-id: <20200911041913.2808606-2-haliu@redhat.com>
02818e
    Patchwork-id: 326145
02818e
    Patchwork-instance: patchwork
02818e
    O-Subject: [CVE-2020-14386 RHEL8.3 net PATCH 1/2] net/packet: make tp_drops atomic
02818e
    Bugzilla: 1876224
02818e
    Z-Bugzilla: 1876223
02818e
    CVE: CVE-2020-14386
02818e
    RH-Acked-by: Davide Caratti <dcaratti@redhat.com>
02818e
    RH-Acked-by: Marcelo Leitner <mleitner@redhat.com>
02818e
    RH-Acked-by: Jarod Wilson <jarod@redhat.com>
02818e
    RH-Acked-by: Paolo Abeni <pabeni@redhat.com>
02818e
    RH-Acked-by: Ivan Vecera <ivecera@redhat.com>
02818e
02818e
    Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1876224
02818e
    Brew: https://brewweb.devel.redhat.com/taskinfo?taskID=31276277
02818e
    Upstream Status: net.git commit 8e8e2951e309
02818e
02818e
    commit 8e8e2951e3095732d7e780c241f61ea130955a57
02818e
    Author: Eric Dumazet <edumazet@google.com>
02818e
    Date:   Wed Jun 12 09:52:30 2019 -0700
02818e
02818e
        net/packet: make tp_drops atomic
02818e
02818e
        Under DDOS, we want to be able to increment tp_drops without
02818e
        touching the spinlock. This will help readers to drain
02818e
        the receive queue slightly faster :/
02818e
02818e
        Signed-off-by: Eric Dumazet <edumazet@google.com>
02818e
        Signed-off-by: David S. Miller <davem@davemloft.net>
02818e
02818e
    Signed-off-by: Hangbin Liu <haliu@redhat.com>
02818e
    Signed-off-by: Timothy Redaelli <tredaelli@redhat.com>
02818e
    Signed-off-by: Bruno Meneguele <bmeneg@redhat.com>
02818e
02818e
Acked-by: Joe Lawrence <joe.lawrence@redhat.com>
02818e
Acked-by: Yannick Cote <ycote@redhat.com>
02818e
Signed-off-by: Artem Savkov <asavkov@redhat.com>
02818e
02818e
---
02818e
 net/packet/af_packet.c | 118 ++++++++++++++++++++++++++++++++++++-----
02818e
 1 file changed, 106 insertions(+), 12 deletions(-)
02818e
02818e
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
02818e
index d69fb2077196..4c67f7156a17 100644
02818e
--- a/net/packet/af_packet.c
02818e
+++ b/net/packet/af_packet.c
02818e
@@ -185,6 +185,8 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
02818e
 #define BLOCK_O2PRIV(x)	((x)->offset_to_priv)
02818e
 #define BLOCK_PRIV(x)		((void *)((char *)(x) + BLOCK_O2PRIV(x)))
02818e
 
02818e
+#define KLP_SHADOW_TP_DROPS 0x2020143860000000
02818e
+
02818e
 struct packet_sock;
02818e
 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
02818e
 		       struct packet_type *pt, struct net_device *orig_dev);
02818e
@@ -747,6 +749,8 @@ static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
02818e
 #endif
02818e
 }
02818e
 
02818e
+#include "kpatch-macros.h"
02818e
+
02818e
 /*
02818e
  * Side effect:
02818e
  *
02818e
@@ -765,8 +769,9 @@ static void prb_close_block(struct tpacket_kbdq_core *pkc1,
02818e
 	struct tpacket3_hdr *last_pkt;
02818e
 	struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
02818e
 	struct sock *sk = &po->sk;
02818e
+	atomic_t *tp_drops = klp_shadow_get(po, KLP_SHADOW_TP_DROPS);
02818e
 
02818e
-	if (po->stats.stats3.tp_drops)
02818e
+	if (tp_drops && atomic_read(tp_drops))
02818e
 		status |= TP_STATUS_LOSING;
02818e
 
02818e
 	last_pkt = (struct tpacket3_hdr *)pkc1->prev;
02818e
@@ -1281,6 +1286,8 @@ static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
02818e
 
02818e
 static void packet_sock_destruct(struct sock *sk)
02818e
 {
02818e
+	struct packet_sock *po = pkt_sk(sk);
02818e
+
02818e
 	skb_queue_purge(&sk->sk_error_queue);
02818e
 
02818e
 	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
02818e
@@ -1291,6 +1298,8 @@ static void packet_sock_destruct(struct sock *sk)
02818e
 		return;
02818e
 	}
02818e
 
02818e
+	klp_shadow_free(po, KLP_SHADOW_TP_DROPS, NULL);
02818e
+
02818e
 	sk_refcnt_debug_dec(sk);
02818e
 }
02818e
 
02818e
@@ -1994,6 +2003,38 @@ static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
02818e
 	return err;
02818e
 }
02818e
 
02818e
+#define BPF_PROG_RUN_KPATCH(prog, ctx)	({				\
02818e
+	u32 ret;						\
02818e
+	cant_sleep();						\
02818e
+	if (static_key_enabled(&bpf_stats_enabled_key)) {	\
02818e
+		struct bpf_prog_stats *stats;			\
02818e
+		u64 start = sched_clock();			\
02818e
+		ret = (*(prog)->bpf_func)(ctx, (prog)->insnsi);	\
02818e
+		stats = this_cpu_ptr(prog->aux->stats);		\
02818e
+		u64_stats_update_begin(&stats->syncp);		\
02818e
+		stats->cnt++;					\
02818e
+		stats->nsecs += sched_clock() - start;		\
02818e
+		u64_stats_update_end(&stats->syncp);		\
02818e
+	} else {						\
02818e
+		ret = (*(prog)->bpf_func)(ctx, (prog)->insnsi);	\
02818e
+	}							\
02818e
+	ret; })
02818e
+
02818e
+static inline u32 bpf_prog_run_clear_cb_kpatch(const struct bpf_prog *prog,
02818e
+					struct sk_buff *skb)
02818e
+{
02818e
+	u8 *cb_data = bpf_skb_cb(skb);
02818e
+	u32 res;
02818e
+
02818e
+	if (unlikely(prog->cb_access))
02818e
+		memset(cb_data, 0, BPF_SKB_CB_LEN);
02818e
+
02818e
+	preempt_disable();
02818e
+	res = BPF_PROG_RUN_KPATCH(prog, skb);
02818e
+	preempt_enable();
02818e
+	return res;
02818e
+}
02818e
+
02818e
 static unsigned int run_filter(struct sk_buff *skb,
02818e
 			       const struct sock *sk,
02818e
 			       unsigned int res)
02818e
@@ -2003,7 +2044,7 @@ static unsigned int run_filter(struct sk_buff *skb,
02818e
 	rcu_read_lock();
02818e
 	filter = rcu_dereference(sk->sk_filter);
02818e
 	if (filter != NULL)
02818e
-		res = bpf_prog_run_clear_cb(filter->prog, skb);
02818e
+		res = bpf_prog_run_clear_cb_kpatch(filter->prog, skb);
02818e
 	rcu_read_unlock();
02818e
 
02818e
 	return res;
02818e
@@ -2046,6 +2087,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
02818e
 	int skb_len = skb->len;
02818e
 	unsigned int snaplen, res;
02818e
 	bool is_drop_n_account = false;
02818e
+	atomic_t *tp_drops;
02818e
 
02818e
 	if (skb->pkt_type == PACKET_LOOPBACK)
02818e
 		goto drop;
02818e
@@ -2053,6 +2095,17 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
02818e
 	sk = pt->af_packet_priv;
02818e
 	po = pkt_sk(sk);
02818e
 
02818e
+	tp_drops = klp_shadow_get(po, KLP_SHADOW_TP_DROPS);
02818e
+	if (!tp_drops) {
02818e
+		tp_drops = klp_shadow_alloc(po, KLP_SHADOW_TP_DROPS,
02818e
+					    sizeof(atomic_t*), GFP_ATOMIC,
02818e
+					    NULL, NULL);
02818e
+		if (!tp_drops)
02818e
+			goto drop;
02818e
+
02818e
+		atomic_set(tp_drops, po->stats.stats1.tp_drops);
02818e
+	}
02818e
+
02818e
 	if (!net_eq(dev_net(dev), sock_net(sk)))
02818e
 		goto drop;
02818e
 
02818e
@@ -2135,10 +2188,8 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
02818e
 
02818e
 drop_n_acct:
02818e
 	is_drop_n_account = true;
02818e
-	spin_lock(&sk->sk_receive_queue.lock);
02818e
-	po->stats.stats1.tp_drops++;
02818e
+	atomic_inc(tp_drops);
02818e
 	atomic_inc(&sk->sk_drops);
02818e
-	spin_unlock(&sk->sk_receive_queue.lock);
02818e
 
02818e
 drop_n_restore:
02818e
 	if (skb_head != skb->data && skb_shared(skb)) {
02818e
@@ -2164,12 +2215,14 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
02818e
 	int skb_len = skb->len;
02818e
 	unsigned int snaplen, res;
02818e
 	unsigned long status = TP_STATUS_USER;
02818e
-	unsigned short macoff, netoff, hdrlen;
02818e
+	unsigned short macoff, hdrlen;
02818e
+	unsigned int netoff;
02818e
 	struct sk_buff *copy_skb = NULL;
02818e
 	struct timespec ts;
02818e
 	__u32 ts_status;
02818e
 	bool is_drop_n_account = false;
02818e
 	bool do_vnet = false;
02818e
+	atomic_t *tp_drops;
02818e
 
02818e
 	/* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
02818e
 	 * We may add members to them until current aligned size without forcing
02818e
@@ -2184,6 +2237,17 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
02818e
 	sk = pt->af_packet_priv;
02818e
 	po = pkt_sk(sk);
02818e
 
02818e
+	tp_drops = klp_shadow_get(po, KLP_SHADOW_TP_DROPS);
02818e
+	if (!tp_drops) {
02818e
+		tp_drops = klp_shadow_alloc(po, KLP_SHADOW_TP_DROPS,
02818e
+					    sizeof(atomic_t*), GFP_ATOMIC,
02818e
+					    NULL, NULL);
02818e
+		if (!tp_drops)
02818e
+			goto drop;
02818e
+
02818e
+		atomic_set(tp_drops, po->stats.stats1.tp_drops);
02818e
+	}
02818e
+
02818e
 	if (!net_eq(dev_net(dev), sock_net(sk)))
02818e
 		goto drop;
02818e
 
02818e
@@ -2226,6 +2290,10 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
02818e
 		}
02818e
 		macoff = netoff - maclen;
02818e
 	}
02818e
+	if (netoff > USHRT_MAX) {
02818e
+		atomic_inc(tp_drops);
02818e
+		goto drop_n_restore;
02818e
+	}
02818e
 	if (po->tp_version <= TPACKET_V2) {
02818e
 		if (macoff + snaplen > po->rx_ring.frame_size) {
02818e
 			if (po->copy_thresh &&
02818e
@@ -2272,7 +2340,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
02818e
 	 * Anyways, moving it for V1/V2 only as V3 doesn't need this
02818e
 	 * at packet level.
02818e
 	 */
02818e
-		if (po->stats.stats1.tp_drops)
02818e
+		if (atomic_read(tp_drops))
02818e
 			status |= TP_STATUS_LOSING;
02818e
 	}
02818e
 
02818e
@@ -2388,9 +2456,9 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
02818e
 	return 0;
02818e
 
02818e
 drop_n_account:
02818e
-	is_drop_n_account = true;
02818e
-	po->stats.stats1.tp_drops++;
02818e
 	spin_unlock(&sk->sk_receive_queue.lock);
02818e
+	atomic_inc(tp_drops);
02818e
+	is_drop_n_account = true;
02818e
 
02818e
 	sk->sk_data_ready(sk);
02818e
 	kfree_skb(copy_skb);
02818e
@@ -3195,6 +3263,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
02818e
 	struct sock *sk;
02818e
 	struct packet_sock *po;
02818e
 	__be16 proto = (__force __be16)protocol; /* weird, but documented */
02818e
+	atomic_t *tp_drops;
02818e
 	int err;
02818e
 
02818e
 	if (!ns_capable(net->user_ns, CAP_NET_RAW))
02818e
@@ -3221,9 +3290,16 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
02818e
 	po->num = proto;
02818e
 	po->xmit = dev_queue_xmit;
02818e
 
02818e
+	tp_drops = klp_shadow_get_or_alloc(po, KLP_SHADOW_TP_DROPS,
02818e
+					   sizeof(atomic_t*), GFP_KERNEL,
02818e
+					   NULL, NULL);
02818e
+
02818e
+	if (!tp_drops)
02818e
+		goto out2;
02818e
+
02818e
 	err = packet_alloc_pending(po);
02818e
 	if (err)
02818e
-		goto out2;
02818e
+		goto out3;
02818e
 
02818e
 	packet_cached_dev_reset(po);
02818e
 
02818e
@@ -3258,6 +3334,8 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
02818e
 	preempt_enable();
02818e
 
02818e
 	return 0;
02818e
+out3:
02818e
+	klp_shadow_free(po, KLP_SHADOW_TP_DROPS, NULL);
02818e
 out2:
02818e
 	sk_free(sk);
02818e
 out:
02818e
@@ -3873,6 +3951,8 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
02818e
 	void *data = &val;
02818e
 	union tpacket_stats_u st;
02818e
 	struct tpacket_rollover_stats rstats;
02818e
+	int drops;
02818e
+	atomic_t *tp_drops;
02818e
 
02818e
 	if (level != SOL_PACKET)
02818e
 		return -ENOPROTOOPT;
02818e
@@ -3883,20 +3963,34 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
02818e
 	if (len < 0)
02818e
 		return -EINVAL;
02818e
 
02818e
+	tp_drops = klp_shadow_get(po, KLP_SHADOW_TP_DROPS);
02818e
+	if (!tp_drops) {
02818e
+		tp_drops = klp_shadow_alloc(po, KLP_SHADOW_TP_DROPS,
02818e
+					    sizeof(atomic_t*), GFP_ATOMIC,
02818e
+					    NULL, NULL);
02818e
+		if (!tp_drops)
02818e
+			return -ENOMEM;
02818e
+
02818e
+		atomic_set(tp_drops, po->stats.stats1.tp_drops);
02818e
+	}
02818e
+
02818e
 	switch (optname) {
02818e
 	case PACKET_STATISTICS:
02818e
 		spin_lock_bh(&sk->sk_receive_queue.lock);
02818e
 		memcpy(&st, &po->stats, sizeof(st));
02818e
 		memset(&po->stats, 0, sizeof(po->stats));
02818e
 		spin_unlock_bh(&sk->sk_receive_queue.lock);
02818e
+		drops = atomic_xchg(tp_drops, 0);
02818e
 
02818e
 		if (po->tp_version == TPACKET_V3) {
02818e
 			lv = sizeof(struct tpacket_stats_v3);
02818e
-			st.stats3.tp_packets += st.stats3.tp_drops;
02818e
+			st.stats3.tp_drops = drops;
02818e
+			st.stats3.tp_packets += drops;
02818e
 			data = &st.stats3;
02818e
 		} else {
02818e
 			lv = sizeof(struct tpacket_stats);
02818e
-			st.stats1.tp_packets += st.stats1.tp_drops;
02818e
+			st.stats1.tp_drops = drops;
02818e
+			st.stats1.tp_packets += drops;
02818e
 			data = &st.stats1;
02818e
 		}
02818e
 
02818e
-- 
02818e
2.26.2
02818e