From ab83b9905ec3531b3d8b975f8035ca32d9254091 Mon Sep 17 00:00:00 2001 From: Dumitru Ceara Date: Tue, 3 Nov 2020 16:51:21 +0100 Subject: [PATCH 2/2] ovn-northd: Limit self originated ARP/ND broadcast domain. Considering the following large scale deployment: external-network -- public-logical-switch -- router-1 -- sw1 -- VIF-1 -- router-2 -- sw2 -- VIF-2 ... -- router-n -- swn -- VIF-n To avoid hitting the max number of OVS resubmits (4K currently) OVN already restricted the broadcast domain for ARP/ND requests received from the external network and targeting an OVN-owned IP (e.g., owned by router-x). Such packets are not flooded in the broadcast domain of the public logical switch and instead are forwarded only to the port connecting router-x. However, the max number of OVS resubmits can also be hit in the following scenarios: - router-x tries to resolve an IP owned by router-y (e.g., VIF-x trying to reach VIF-y via floating IP). - router-x tries to resolve an IP owned by the external network. Because ARP/ND requests in the above cases are originated by OVN router ports they were being flooded in the complete broadcast domain of the public logical switch. Instead, we now create a new Multicast_Group for each logical switch and add all non-router ports to it. ARP/ND requests are now forwarded to the router port that owns the IP (if any) and then flooded in this restricted MC_FLOOD_L2 broadcast domain. Fixes: 32f5ebb06226 ("ovn-northd: Limit ARP/ND broadcast domain whenever possible.") Acked-by: Mark Michelson Signed-off-by: Dumitru Ceara Signed-off-by: Numan Siddique (cherry picked from upstream commit 8c6a5bc21847dab8ccbe18ab1e4b563ddca13379) Change-Id: Ic35a6e7531d09374c0b3ba9c8be3bd986adc7941 --- lib/mcast-group-index.h | 1 + northd/ovn-northd.8.xml | 19 +++++------ northd/ovn-northd.c | 84 ++++++++++++++++++++++++++++++------------------- tests/ovn.at | 50 ++++++++++++++--------------- 4 files changed, 86 insertions(+), 68 deletions(-) diff --git a/lib/mcast-group-index.h b/lib/mcast-group-index.h index ba995ba..72af117 100644 --- a/lib/mcast-group-index.h +++ b/lib/mcast-group-index.h @@ -30,6 +30,7 @@ enum ovn_mcast_tunnel_keys { OVN_MCAST_MROUTER_FLOOD_TUNNEL_KEY, OVN_MCAST_MROUTER_STATIC_TUNNEL_KEY, OVN_MCAST_STATIC_TUNNEL_KEY, + OVN_MCAST_FLOOD_L2_TUNNEL_KEY, OVN_MIN_IP_MULTICAST, OVN_MAX_IP_MULTICAST = OVN_MAX_MULTICAST, }; diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml index f1c7c9b..8206982 100644 --- a/northd/ovn-northd.8.xml +++ b/northd/ovn-northd.8.xml @@ -1367,18 +1367,19 @@ output;
  • - Priority-80 flows for each port connected to a logical router - matching self originated GARP/ARP request/ND packets. These packets - are flooded to the MC_FLOOD which contains all logical - ports. + Priority-80 flows for each IP address/VIP/NAT address owned by a + router port connected to the switch. These flows match ARP requests + and ND packets for the specific IP addresses. Matched packets are + forwarded only to the router that owns the IP address and to the + MC_FLOOD_L2 multicast group which contains all non-router + logical ports.
  • - Priority-75 flows for each IP address/VIP/NAT address owned by a - router port connected to the switch. These flows match ARP requests - and ND packets for the specific IP addresses. Matched packets are - forwarded only to the router that owns the IP address and, if - present, to the localnet port of the logical switch. + Priority-75 flows for each port connected to a logical router + matching self originated ARP request/ND packets. These packets + are flooded to the MC_FLOOD_L2 which contains all + non-router logical ports.
  • diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c index ecbe98e..ce291ec 100644 --- a/northd/ovn-northd.c +++ b/northd/ovn-northd.c @@ -1493,6 +1493,12 @@ lsp_is_external(const struct nbrec_logical_switch_port *nbsp) } static bool +lsp_is_router(const struct nbrec_logical_switch_port *nbsp) +{ + return !strcmp(nbsp->type, "router"); +} + +static bool lrport_is_enabled(const struct nbrec_logical_router_port *lrport) { return !lrport->enabled || *lrport->enabled; @@ -2424,7 +2430,7 @@ join_logical_ports(struct northd_context *ctx, * to their peers. */ struct ovn_port *op; HMAP_FOR_EACH (op, key_node, ports) { - if (op->nbsp && !strcmp(op->nbsp->type, "router") && !op->derived) { + if (op->nbsp && lsp_is_router(op->nbsp) && !op->derived) { const char *peer_name = smap_get(&op->nbsp->options, "router-port"); if (!peer_name) { continue; @@ -3105,7 +3111,7 @@ ovn_port_update_sbrec(struct northd_context *ctx, sbrec_port_binding_set_nat_addresses(op->sb, NULL, 0); } else { - if (strcmp(op->nbsp->type, "router")) { + if (!lsp_is_router(op->nbsp)) { uint32_t queue_id = smap_get_int( &op->sb->options, "qdisc_queue_id", 0); bool has_qos = port_has_qos_params(&op->nbsp->options); @@ -3808,6 +3814,10 @@ static const struct multicast_group mc_static = static const struct multicast_group mc_unknown = { MC_UNKNOWN, OVN_MCAST_UNKNOWN_TUNNEL_KEY }; +#define MC_FLOOD_L2 "_MC_flood_l2" +static const struct multicast_group mc_flood_l2 = + { MC_FLOOD_L2, OVN_MCAST_FLOOD_L2_TUNNEL_KEY }; + static bool multicast_group_equal(const struct multicast_group *a, const struct multicast_group *b) @@ -6372,12 +6382,11 @@ build_lswitch_rport_arp_req_self_orig_flow(struct ovn_port *op, sset_add(&all_eth_addrs, nat->external_mac); } - - /* Self originated (G)ARP requests/ND need to be flooded as usual. - * Determine that packets are self originated by also matching on - * source MAC. Matching on ingress port is not reliable in case this - * is a VLAN-backed network. - * Priority: 80. + /* Self originated ARP requests/ND need to be flooded to the L2 domain + * (except on router ports). Determine that packets are self originated + * by also matching on source MAC. Matching on ingress port is not + * reliable in case this is a VLAN-backed network. + * Priority: 75. */ const char *eth_addr; @@ -6393,7 +6402,7 @@ build_lswitch_rport_arp_req_self_orig_flow(struct ovn_port *op, ds_cstr(ð_src)); ovn_lflow_add(lflows, od, S_SWITCH_IN_L2_LKUP, priority, ds_cstr(&match), - "outport = \""MC_FLOOD"\"; output;"); + "outport = \""MC_FLOOD_L2"\"; output;"); sset_destroy(&all_eth_addrs); ds_destroy(ð_src); @@ -6439,14 +6448,16 @@ build_lswitch_rport_arp_req_flow_for_ip(struct sset *ips, ds_chomp(&match, ','); ds_put_cstr(&match, "}"); - /* Send a the packet only to the router pipeline and skip flooding it - * in the broadcast domain (except for the localnet port). + /* Send a the packet to the router pipeline. If the switch has non-router + * ports then flood it there as well. */ - for (size_t i = 0; i < od->n_localnet_ports; i++) { - ds_put_format(&actions, "clone { outport = %s; output; }; ", - od->localnet_ports[i]->json_key); + if (od->n_router_ports != od->nbs->n_ports) { + ds_put_format(&actions, "clone {outport = %s; output; }; " + "outport = \""MC_FLOOD_L2"\"; output;", + patch_op->json_key); + } else { + ds_put_format(&actions, "outport = %s; output;", patch_op->json_key); } - ds_put_format(&actions, "outport = %s; output;", patch_op->json_key); ovn_lflow_add_with_hint(lflows, od, S_SWITCH_IN_L2_LKUP, priority, ds_cstr(&match), ds_cstr(&actions), stage_hint); @@ -6476,14 +6487,9 @@ build_lswitch_rport_arp_req_flows(struct ovn_port *op, return; } - /* Self originated (G)ARP requests/ND need to be flooded as usual. - * Priority: 80. - */ - build_lswitch_rport_arp_req_self_orig_flow(op, 80, sw_od, lflows); - /* Forward ARP requests for owned IP addresses (L3, VIP, NAT) only to this * router port. - * Priority: 75. + * Priority: 80. */ struct sset all_ips_v4 = SSET_INITIALIZER(&all_ips_v4); struct sset all_ips_v6 = SSET_INITIALIZER(&all_ips_v6); @@ -6558,17 +6564,28 @@ build_lswitch_rport_arp_req_flows(struct ovn_port *op, if (!sset_is_empty(&all_ips_v4)) { build_lswitch_rport_arp_req_flow_for_ip(&all_ips_v4, AF_INET, sw_op, - sw_od, 75, lflows, + sw_od, 80, lflows, stage_hint); } if (!sset_is_empty(&all_ips_v6)) { build_lswitch_rport_arp_req_flow_for_ip(&all_ips_v6, AF_INET6, sw_op, - sw_od, 75, lflows, + sw_od, 80, lflows, stage_hint); } sset_destroy(&all_ips_v4); sset_destroy(&all_ips_v6); + + /* Self originated ARP requests/ND need to be flooded as usual. + * + * However, if the switch doesn't have any non-router ports we shouldn't + * even try to flood. + * + * Priority: 75. + */ + if (sw_od->n_router_ports != sw_od->nbs->n_ports) { + build_lswitch_rport_arp_req_self_orig_flow(op, 75, sw_od, lflows); + } } static void @@ -6908,7 +6925,7 @@ build_lswitch_flows(struct hmap *datapaths, struct hmap *ports, * - port type is localport */ if (check_lsp_is_up && - !lsp_is_up(op->nbsp) && strcmp(op->nbsp->type, "router") && + !lsp_is_up(op->nbsp) && !lsp_is_router(op->nbsp) && strcmp(op->nbsp->type, "localport")) { continue; } @@ -6983,8 +7000,7 @@ build_lswitch_flows(struct hmap *datapaths, struct hmap *ports, "flags.loopback = 1; " "output; " "};", - !strcmp(op->nbsp->type, "router") ? - "nd_na_router" : "nd_na", + lsp_is_router(op->nbsp) ? "nd_na_router" : "nd_na", op->lsp_addrs[i].ea_s, op->lsp_addrs[i].ipv6_addrs[j].addr_s, op->lsp_addrs[i].ipv6_addrs[j].addr_s, @@ -7066,7 +7082,7 @@ build_lswitch_flows(struct hmap *datapaths, struct hmap *ports, continue; } - if (!lsp_is_enabled(op->nbsp) || !strcmp(op->nbsp->type, "router")) { + if (!lsp_is_enabled(op->nbsp) || lsp_is_router(op->nbsp)) { /* Don't add the DHCP flows if the port is not enabled or if the * port is a router port. */ continue; @@ -7326,7 +7342,7 @@ build_lswitch_flows(struct hmap *datapaths, struct hmap *ports, * broadcast flooding of ARP/ND requests in table 19. We direct the * requests only to the router port that owns the IP address. */ - if (!strcmp(op->nbsp->type, "router")) { + if (lsp_is_router(op->nbsp)) { build_lswitch_rport_arp_req_flows(op->peer, op->od, op, lflows, &op->nbsp->header_); } @@ -10786,7 +10802,7 @@ build_arp_resolve_flows_for_lrouter_port( &op->nbrp->header_); } } - } else if (op->od->n_router_ports && strcmp(op->nbsp->type, "router") + } else if (op->od->n_router_ports && !lsp_is_router(op->nbsp) && strcmp(op->nbsp->type, "virtual")) { /* This is a logical switch port that backs a VM or a container. * Extract its addresses. For each of the address, go through all @@ -10870,7 +10886,7 @@ build_arp_resolve_flows_for_lrouter_port( } } } - } else if (op->od->n_router_ports && strcmp(op->nbsp->type, "router") + } else if (op->od->n_router_ports && !lsp_is_router(op->nbsp) && !strcmp(op->nbsp->type, "virtual")) { /* This is a virtual port. Add ARP replies for the virtual ip with * the mac of the present active virtual parent. @@ -10974,7 +10990,7 @@ build_arp_resolve_flows_for_lrouter_port( } } } - } else if (!strcmp(op->nbsp->type, "router")) { + } else if (lsp_is_router(op->nbsp)) { /* This is a logical switch port that connects to a router. */ /* The peer of this switch port is the router port for which @@ -11929,6 +11945,10 @@ build_mcast_groups(struct northd_context *ctx, } else if (op->nbsp && lsp_is_enabled(op->nbsp)) { ovn_multicast_add(mcast_groups, &mc_flood, op); + if (!lsp_is_router(op->nbsp)) { + ovn_multicast_add(mcast_groups, &mc_flood_l2, op); + } + /* If this port is connected to a multicast router then add it * to the MC_MROUTER_FLOOD group. */ @@ -12372,7 +12392,7 @@ handle_port_binding_changes(struct northd_context *ctx, struct hmap *ports, continue; } - bool up = (sb->chassis || !strcmp(op->nbsp->type, "router")); + bool up = (sb->chassis || lsp_is_router(op->nbsp)); if (!op->nbsp->up || *op->nbsp->up != up) { nbrec_logical_switch_port_set_up(op->nbsp, &up, 1); } diff --git a/tests/ovn.at b/tests/ovn.at index ea4a6da..180fb91 100644 --- a/tests/ovn.at +++ b/tests/ovn.at @@ -3626,7 +3626,7 @@ test_ip() { done } -# test_arp INPORT SHA SPA TPA FLOOD [REPLY_HA] +# test_arp INPORT SHA SPA TPA [REPLY_HA] # # Causes a packet to be received on INPORT. The packet is an ARP # request with SHA, SPA, and TPA as specified. If REPLY_HA is provided, then @@ -3637,25 +3637,21 @@ test_ip() { # SHA and REPLY_HA are each 12 hex digits. # SPA and TPA are each 8 hex digits. test_arp() { - local inport=$1 sha=$2 spa=$3 tpa=$4 flood=$5 reply_ha=$6 + local inport=$1 sha=$2 spa=$3 tpa=$4 reply_ha=$5 local request=ffffffffffff${sha}08060001080006040001${sha}${spa}ffffffffffff${tpa} hv=hv`vif_to_hv $inport` as $hv ovs-appctl netdev-dummy/receive vif$inport $request as $hv ovs-appctl ofproto/trace br-int in_port=$inport $request # Expect to receive the broadcast ARP on the other logical switch ports if - # IP address is not configured on the switch patch port or on the router - # port (i.e, $flood == 1). + # IP address is not configured to the switch patch port. local i=`vif_to_ls $inport` local j k for j in 1 2 3; do for k in 1 2 3; do - # Skip ingress port. - if test $i$j$k == $inport; then - continue - fi - - if test X$flood == X1; then + # 192.168.33.254 is configured to the switch patch port for lrp33, + # so no ARP flooding expected for it. + if test $i$j$k != $inport && test $tpa != `ip_to_hex 192 168 33 254`; then echo $request >> $i$j$k.expected fi done @@ -3792,9 +3788,9 @@ for i in 1 2 3; do otherip=`ip_to_hex 192 168 $i$j 55` # Some other IP in subnet externalip=`ip_to_hex 1 2 3 4` # Some other IP not in subnet - test_arp $i$j$k $smac $sip $rip 0 $rmac #4 - test_arp $i$j$k $smac $otherip $rip 0 $rmac #5 - test_arp $i$j$k $smac $sip $otherip 1 #6 + test_arp $i$j$k $smac $sip $rip $rmac #4 + test_arp $i$j$k $smac $otherip $rip $rmac #5 + test_arp $i$j$k $smac $sip $otherip #6 # When rip is 192.168.33.254, ARP request from externalip won't be # filtered, because 192.168.33.254 is configured to switch peer port @@ -3803,7 +3799,7 @@ for i in 1 2 3; do if test $i = 3 && test $j = 3; then lrp33_rsp=$rmac fi - test_arp $i$j$k $smac $externalip $rip 0 $lrp33_rsp #7 + test_arp $i$j$k $smac $externalip $rip $lrp33_rsp #7 # MAC binding should be learned from ARP request. host_mac_pretty=f0:00:00:00:0$i:$j$k @@ -19895,7 +19891,7 @@ match_r1_metadata="metadata=0x${r1_dp_key}" send_arp_request 1 0 ${src_mac} $(ip_to_hex 10 0 0 254) $(ip_to_hex 10 0 0 1) # Verify that the ARP request is sent only to rtr1. -match_arp_req="priority=75.*${match_sw_metadata}.*arp_tpa=10.0.0.1,arp_op=1" +match_arp_req="priority=80.*${match_sw_metadata}.*arp_tpa=10.0.0.1,arp_op=1" match_send_rtr1="load:0x${r1_tnl_key}->NXM_NX_REG15" match_send_rtr2="load:0x${r2_tnl_key}->NXM_NX_REG15" @@ -19919,7 +19915,7 @@ dst_ipv6=00100000000000000000000000000001 send_nd_ns 1 0 ${src_mac} ${src_ipv6} ${dst_ipv6} 751d # Verify that the ND_NS is sent only to rtr1. -match_nd_ns="priority=75.*${match_sw_metadata}.*icmp_type=135.*nd_target=10::1" +match_nd_ns="priority=80.*${match_sw_metadata}.*icmp_type=135.*nd_target=10::1" as hv1 OVS_WAIT_UNTIL([ @@ -19951,7 +19947,7 @@ ovn-nbctl --wait=hv sync send_arp_request 1 0 ${src_mac} $(ip_to_hex 10 0 0 254) $(ip_to_hex 10 0 0 11) # Verify that the ARP request is sent only to rtr1. -match_arp_req="priority=75.*${match_sw_metadata}.*arp_tpa=10.0.0.11,arp_op=1" +match_arp_req="priority=80.*${match_sw_metadata}.*arp_tpa=10.0.0.11,arp_op=1" match_send_rtr1="load:0x${r1_tnl_key}->NXM_NX_REG15" match_send_rtr2="load:0x${r2_tnl_key}->NXM_NX_REG15" @@ -19975,7 +19971,7 @@ dst_ipv6=00100000000000000000000000000011 send_nd_ns 1 0 ${src_mac} ${src_ipv6} ${dst_ipv6} 751d # Verify that the ND_NS is sent only to rtr1. -match_nd_ns="priority=75.*${match_sw_metadata}.*icmp_type=135.*nd_target=10::11" +match_nd_ns="priority=80.*${match_sw_metadata}.*icmp_type=135.*nd_target=10::11" as hv1 OVS_WAIT_UNTIL([ @@ -20015,7 +20011,7 @@ ovn-nbctl --wait=hv sync # - 10.0.0.22, 10::22 - LB VIPs. # - 10.0.0.222, 10::222 - DNAT IPs. as hv1 -AT_CHECK([ovs-ofctl dump-flows br-int | grep -E "priority=75,.*${match_sw_metadata}" | grep -oE "arp_tpa=[[0-9.]]+" | sort], [0], [dnl +AT_CHECK([ovs-ofctl dump-flows br-int | grep -E "priority=80,.*${match_sw_metadata}" | grep -oE "arp_tpa=[[0-9.]]+" | sort], [0], [dnl arp_tpa=10.0.0.1 arp_tpa=10.0.0.11 arp_tpa=10.0.0.111 @@ -20025,7 +20021,7 @@ arp_tpa=10.0.0.2 arp_tpa=10.0.0.22 arp_tpa=10.0.0.222 ]) -AT_CHECK([ovs-ofctl dump-flows br-int | grep -E "priority=75,.*${match_sw_metadata}" | grep -oE "nd_target=[[0-9a-f:]]+" | sort], [0], [dnl +AT_CHECK([ovs-ofctl dump-flows br-int | grep -E "priority=80,.*${match_sw_metadata}" | grep -oE "nd_target=[[0-9a-f:]]+" | sort], [0], [dnl nd_target=10::1 nd_target=10::11 nd_target=10::111 @@ -20041,10 +20037,10 @@ nd_target=fe80::200:ff:fe00:200 # For sw1-rtr1: # - 20.0.0.1, 20::1, fe80::200:1ff:fe00:0 - interface IPs. as hv1 -AT_CHECK([ovs-ofctl dump-flows br-int | grep -E "priority=75,.*${match_sw1_metadata}" | grep -oE "arp_tpa=[[0-9.]]+" | sort], [0], [dnl +AT_CHECK([ovs-ofctl dump-flows br-int | grep -E "priority=80,.*${match_sw1_metadata}" | grep -oE "arp_tpa=[[0-9.]]+" | sort], [0], [dnl arp_tpa=20.0.0.1 ]) -AT_CHECK([ovs-ofctl dump-flows br-int | grep -E "priority=75,.*${match_sw1_metadata}" | grep -oE "nd_target=[[0-9a-f:]]+" | sort], [0], [dnl +AT_CHECK([ovs-ofctl dump-flows br-int | grep -E "priority=80,.*${match_sw1_metadata}" | grep -oE "nd_target=[[0-9a-f:]]+" | sort], [0], [dnl nd_target=20::1 nd_target=fe80::200:1ff:fe00:0 ]) @@ -20056,13 +20052,13 @@ nd_target=fe80::200:1ff:fe00:0 # - 00:00:00:01:00:00 - dnat_and_snat external MAC. # - 00:00:00:02:00:00 - dnat_and_snat external MAC. as hv1 -AT_CHECK([ovs-ofctl dump-flows br-int | grep -E "priority=80,.*${match_sw_metadata}.*arp_op=1" | grep -oE "dl_src=[[0-9a-f:]]+" | sort], [0], [dnl +AT_CHECK([ovs-ofctl dump-flows br-int | grep -E "priority=75,.*${match_sw_metadata}.*arp_op=1" | grep -oE "dl_src=[[0-9a-f:]]+" | sort], [0], [dnl dl_src=00:00:00:00:01:00 dl_src=00:00:00:00:02:00 dl_src=00:00:00:01:00:00 dl_src=00:00:00:02:00:00 ]) -AT_CHECK([ovs-ofctl dump-flows br-int | grep -E "priority=80,.*${match_sw_metadata}.*icmp_type=135" | grep -oE "dl_src=[[0-9a-f:]]+" | sort], [0], [dnl +AT_CHECK([ovs-ofctl dump-flows br-int | grep -E "priority=75,.*${match_sw_metadata}.*icmp_type=135" | grep -oE "dl_src=[[0-9a-f:]]+" | sort], [0], [dnl dl_src=00:00:00:00:01:00 dl_src=00:00:00:00:02:00 dl_src=00:00:00:01:00:00 @@ -20072,7 +20068,7 @@ dl_src=00:00:00:02:00:00 # Self originated ARP/NS with SMACs owned by rtr1-sw1 should be flooded: # - 00:00:01:00:00:00 - interface MAC. as hv1 -AT_CHECK([ovs-ofctl dump-flows br-int | grep -E "priority=80,.*${match_sw1_metadata}.*arp_op=1" | grep -oE "dl_src=[[0-9a-f:]]+" | sort], [0], [dnl +AT_CHECK([ovs-ofctl dump-flows br-int | grep -E "priority=75,.*${match_sw1_metadata}.*arp_op=1" | grep -oE "dl_src=[[0-9a-f:]]+" | sort], [0], [dnl dl_src=00:00:01:00:00:00 ]) @@ -20080,7 +20076,7 @@ dl_src=00:00:01:00:00:00 send_arp_request 1 0 ${src_mac} $(ip_to_hex 10 0 0 254) $(ip_to_hex 10 0 0 111) # Verify that the ARP request is sent only to rtr1. -match_arp_req="priority=75.*${match_sw_metadata}.*arp_tpa=10.0.0.111,arp_op=1" +match_arp_req="priority=80.*${match_sw_metadata}.*arp_tpa=10.0.0.111,arp_op=1" match_send_rtr1="load:0x${r1_tnl_key}->NXM_NX_REG15" match_send_rtr2="load:0x${r2_tnl_key}->NXM_NX_REG15" @@ -20144,7 +20140,7 @@ dst_ipv6=00100000000000000000000000000111 send_nd_ns 1 0 ${src_mac} ${src_ipv6} ${dst_ipv6} 751d # Verify that the ND_NS is sent only to rtr1. -match_nd_ns="priority=75.*${match_sw_metadata}.*icmp_type=135.*nd_target=10::111" +match_nd_ns="priority=80.*${match_sw_metadata}.*icmp_type=135.*nd_target=10::111" as hv1 OVS_WAIT_UNTIL([ -- 1.8.3.1