From c41499498db66fbe54155206a513e418d75f8d49 Mon Sep 17 00:00:00 2001
From: Numan Siddique <numans@ovn.org>
Date: Wed, 2 Dec 2020 23:57:56 +0530
Subject: [PATCH 5/7] northd: Add ECMP support to router policies.

A user can now add a policy like:

ovn-nbctl lr-policy-add <LR> 100 "ip4.src == 10.0.0.4" reroute 172.0.0.5,172.0.0.6

We already have ECMP support for logical router static routes, but
since policies are applied after the routing stage, ovn-kubernetes
also needs ECMP support for router policies.
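
With the policy above, ovn-northd now splits the work between the
policy stage and a new "lr_in_policy_ecmp" stage, roughly as follows
(illustrative; abbreviated from the tests added below):

  table=12(lr_in_policy     ), match=(ip4.src == 10.0.0.4),
      action=(reg8[0..15] = 1; reg8[16..31] = select(1, 2);)
  table=13(lr_in_policy_ecmp), match=(reg8[0..15] == 1 && reg8[16..31] == 1),
      action=(reg0 = 172.0.0.5; ...; flags.loopback = 1; next;)
  table=13(lr_in_policy_ecmp), match=(reg8[0..15] == 1 && reg8[16..31] == 2),
      action=(reg0 = 172.0.0.6; ...; flags.loopback = 1; next;)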

A new column 'nexthops' is added to the Logical_Router_Policy table,
instead of modifying the existing column 'nexthop', to preserve
backward compatibility and avoid any upgrade issues.
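
When 'reroute' is used with multiple nexthops, ovn-nbctl stores them
in the new column, roughly like this (illustrative output; only the
relevant columns shown):

  $ ovn-nbctl list logical_router_policy
  action   : reroute
  match    : "ip4.src == 10.0.0.4"
  nexthop  : []
  nexthops : ["172.0.0.5", "172.0.0.6"]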

Change-Id: Ib5723d1de30f0ad86ee740bb4e3b593f1cca98eb
Requested-by: Alexander Constantinescu <aconstan@redhat.com>
Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=1881826
Acked-by: Mark Michelson <mmichels@redhat.com>
Signed-off-by: Numan Siddique <numans@ovn.org>
---
 northd/ovn-northd.8.xml   |  80 +++++++++++++++++++--
 northd/ovn-northd.c       | 148 ++++++++++++++++++++++++++++++++++----
 ovn-nb.ovsschema          |   6 +-
 ovn-nb.xml                |  18 ++++-
 tests/ovn-northd.at       | 124 ++++++++++++++++++++++++++++++++
 tests/ovn.at              |  16 ++---
 utilities/ovn-nbctl.8.xml |  12 ++--
 utilities/ovn-nbctl.c     |  73 +++++++++++++++----
 8 files changed, 429 insertions(+), 48 deletions(-)

diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml
index d86f36ea6..1f0f71f34 100644
--- a/northd/ovn-northd.8.xml
+++ b/northd/ovn-northd.8.xml
@@ -3041,14 +3041,36 @@ outport = <var>P</var>;
 
       <li>
         <p>
-          If the policy action is <code>reroute</code>, then the logical
-          flow is added with the following actions:
+          If the policy action is <code>reroute</code> with 2 or more nexthops
+          defined, then the logical flow is added with the following actions:
+        </p>
+
+         <pre>
+reg8[0..15] = <var>GID</var>;
+reg8[16..31] = select(1,..n);
+        </pre>
+
+        <p>
+          where <var>GID</var> is the ECMP group id generated by
+          <code>ovn-northd</code> for this policy and <var>n</var>
+          is the number of nexthops. The <code>select</code> action
+          selects one of the nexthop member ids, stores it in the
+          register <code>reg8[16..31]</code> and advances the packet
+          to the next stage.
+        </p>
+      </li>
+
+      <li>
+        <p>
+          If the policy action is <code>reroute</code> with just one nexthop,
+          then the logical flow is added with the following actions:
         </p>
 
          <pre>
 [xx]reg0 = <var>H</var>;
 eth.src = <var>E</var>;
 outport = <var>P</var>;
+reg8[0..15] = 0;
 flags.loopback = 1;
 next;
         </pre>
@@ -3072,7 +3094,51 @@ next;
       </li>
     </ul>
 
-    <h3>Ingress Table 13: ARP/ND Resolution</h3>
+    <h3>Ingress Table 13: ECMP handling for router policies</h3>
+    <p>
+      This table handles ECMP for the router policies configured
+      with multiple nexthops.
+    </p>
+
+    <ul>
+      <li>
+        <p>
+          A priority-150 flow is added to advance the packet to the next stage
+          if the ECMP group id register <code>reg8[0..15]</code> is 0.
+        </p>
+      </li>
+
+      <li>
+        <p>
+          For each ECMP reroute router policy with multiple nexthops,
+          a priority-100 flow is added for each nexthop <var>H</var>
+          with the match <code>reg8[0..15] == <var>GID</var> &amp;&amp;
+          reg8[16..31] == <var>M</var></code>, where <var>GID</var>
+          is the ECMP group id generated by <code>ovn-northd</code>
+          for this policy and <var>M</var> is the member id assigned
+          to the nexthop <var>H</var>. The following actions are added
+          to the flow:
+        </p>
+
+        <pre>
+[xx]reg0 = <var>H</var>;
+eth.src = <var>E</var>;
+outport = <var>P</var>;
+flags.loopback = 1;
+next;
+        </pre>
+
+        <p>
+          where <var>H</var> is the <code>nexthop</code> defined in the
+          router policy, <var>E</var> is the ethernet address of the
+          logical router port from which the <code>nexthop</code> is
+          reachable, and <var>P</var> is that logical router port.
+        </p>
+      </li>
+    </ul>
+
+    <h3>Ingress Table 14: ARP/ND Resolution</h3>
 
     <p>
       Any packet that reaches this table is an IP packet whose next-hop
@@ -3258,7 +3324,7 @@ next;
 
     </ul>
 
-    <h3>Ingress Table 14: Check packet length</h3>
+    <h3>Ingress Table 15: Check packet length</h3>
 
     <p>
       For distributed logical routers with distributed gateway port configured
@@ -3288,7 +3354,7 @@ REGBIT_PKT_LARGER = check_pkt_larger(<var>L</var>); next;
       and advances to the next table.
     </p>
 
-    <h3>Ingress Table 15: Handle larger packets</h3>
+    <h3>Ingress Table 16: Handle larger packets</h3>
 
     <p>
       For distributed logical routers with distributed gateway port configured
@@ -3349,7 +3415,7 @@ icmp6 {
       and advances to the next table.
     </p>
 
-    <h3>Ingress Table 16: Gateway Redirect</h3>
+    <h3>Ingress Table 17: Gateway Redirect</h3>
 
     <p>
       For distributed logical routers where one of the logical router
@@ -3389,7 +3455,7 @@ icmp6 {
       </li>
     </ul>
 
-    <h3>Ingress Table 17: ARP Request</h3>
+    <h3>Ingress Table 18: ARP Request</h3>
 
     <p>
       In the common case where the Ethernet destination has been resolved, this
diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
index 478f1a339..dfd7d69d0 100644
--- a/northd/ovn-northd.c
+++ b/northd/ovn-northd.c
@@ -188,11 +188,12 @@ enum ovn_stage {
     PIPELINE_STAGE(ROUTER, IN,  IP_ROUTING,      10, "lr_in_ip_routing")   \
     PIPELINE_STAGE(ROUTER, IN,  IP_ROUTING_ECMP, 11, "lr_in_ip_routing_ecmp") \
     PIPELINE_STAGE(ROUTER, IN,  POLICY,          12, "lr_in_policy")       \
-    PIPELINE_STAGE(ROUTER, IN,  ARP_RESOLVE,     13, "lr_in_arp_resolve")  \
-    PIPELINE_STAGE(ROUTER, IN,  CHK_PKT_LEN   ,  14, "lr_in_chk_pkt_len")   \
-    PIPELINE_STAGE(ROUTER, IN,  LARGER_PKTS,     15,"lr_in_larger_pkts")   \
-    PIPELINE_STAGE(ROUTER, IN,  GW_REDIRECT,     16, "lr_in_gw_redirect")  \
-    PIPELINE_STAGE(ROUTER, IN,  ARP_REQUEST,     17, "lr_in_arp_request")  \
+    PIPELINE_STAGE(ROUTER, IN,  POLICY_ECMP,     13, "lr_in_policy_ecmp")  \
+    PIPELINE_STAGE(ROUTER, IN,  ARP_RESOLVE,     14, "lr_in_arp_resolve")  \
+    PIPELINE_STAGE(ROUTER, IN,  CHK_PKT_LEN   ,  15, "lr_in_chk_pkt_len")  \
+    PIPELINE_STAGE(ROUTER, IN,  LARGER_PKTS,     16, "lr_in_larger_pkts")  \
+    PIPELINE_STAGE(ROUTER, IN,  GW_REDIRECT,     17, "lr_in_gw_redirect")  \
+    PIPELINE_STAGE(ROUTER, IN,  ARP_REQUEST,     18, "lr_in_arp_request")  \
                                                                       \
     /* Logical router egress stages. */                               \
     PIPELINE_STAGE(ROUTER, OUT, UNDNAT,    0, "lr_out_undnat")        \
@@ -7562,33 +7563,39 @@ build_routing_policy_flow(struct hmap *lflows, struct ovn_datapath *od,
     struct ds actions = DS_EMPTY_INITIALIZER;
 
     if (!strcmp(rule->action, "reroute")) {
+        ovs_assert(rule->n_nexthops <= 1);
+
+        char *nexthop =
+            (rule->n_nexthops == 1 ? rule->nexthops[0] : rule->nexthop);
         struct ovn_port *out_port = get_outport_for_routing_policy_nexthop(
-             od, ports, rule->priority, rule->nexthop);
+             od, ports, rule->priority, nexthop);
         if (!out_port) {
             return;
         }
 
-        const char *lrp_addr_s = find_lrp_member_ip(out_port, rule->nexthop);
+        const char *lrp_addr_s = find_lrp_member_ip(out_port, nexthop);
         if (!lrp_addr_s) {
             static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
             VLOG_WARN_RL(&rl, "lrp_addr not found for routing policy "
                          " priority %"PRId64" nexthop %s",
-                         rule->priority, rule->nexthop);
+                         rule->priority, nexthop);
             return;
         }
         uint32_t pkt_mark = ovn_smap_get_uint(&rule->options, "pkt_mark", 0);
         if (pkt_mark) {
             ds_put_format(&actions, "pkt.mark = %u; ", pkt_mark);
         }
-        bool is_ipv4 = strchr(rule->nexthop, '.') ? true : false;
+
+        bool is_ipv4 = strchr(nexthop, '.') ? true : false;
         ds_put_format(&actions, "%s = %s; "
                       "%s = %s; "
                       "eth.src = %s; "
                       "outport = %s; "
                       "flags.loopback = 1; "
+                      REG_ECMP_GROUP_ID" = 0; "
                       "next;",
                       is_ipv4 ? REG_NEXT_HOP_IPV4 : REG_NEXT_HOP_IPV6,
-                      rule->nexthop,
+                      nexthop,
                       is_ipv4 ? REG_SRC_IPV4 : REG_SRC_IPV6,
                       lrp_addr_s,
                       out_port->lrp_networks.ea_s,
@@ -7601,7 +7608,7 @@ build_routing_policy_flow(struct hmap *lflows, struct ovn_datapath *od,
         if (pkt_mark) {
             ds_put_format(&actions, "pkt.mark = %u; ", pkt_mark);
         }
-        ds_put_cstr(&actions, "next;");
+        ds_put_cstr(&actions, REG_ECMP_GROUP_ID" = 0; next;");
     }
     ds_put_format(&match, "%s", rule->match);
 
@@ -7611,6 +7618,107 @@ build_routing_policy_flow(struct hmap *lflows, struct ovn_datapath *od,
     ds_destroy(&actions);
 }
 
+static void
+build_ecmp_routing_policy_flows(struct hmap *lflows, struct ovn_datapath *od,
+                                struct hmap *ports,
+                                const struct nbrec_logical_router_policy *rule,
+                                uint16_t ecmp_group_id)
+{
+    ovs_assert(rule->n_nexthops > 1);
+
+    bool nexthops_is_ipv4 = true;
+
+    /* Check that all the nexthops belong to the same addr family before
+     * adding logical flows. */
+    for (uint16_t i = 0; i < rule->n_nexthops; i++) {
+        bool is_ipv4 = strchr(rule->nexthops[i], '.') ? true : false;
+
+        if (i == 0) {
+            nexthops_is_ipv4 = is_ipv4;
+        }
+
+        if (is_ipv4 != nexthops_is_ipv4) {
+            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
+            VLOG_WARN_RL(&rl, "nexthop [%s] of the router policy with "
+                         "the match [%s] does not belong to the same "
+                         "address family as other next hops",
+                         rule->nexthops[i], rule->match);
+            return;
+        }
+    }
+
+    struct ds match = DS_EMPTY_INITIALIZER;
+    struct ds actions = DS_EMPTY_INITIALIZER;
+
+    for (uint16_t i = 0; i < rule->n_nexthops; i++) {
+        struct ovn_port *out_port = get_outport_for_routing_policy_nexthop(
+             od, ports, rule->priority, rule->nexthops[i]);
+        if (!out_port) {
+            goto cleanup;
+        }
+
+        const char *lrp_addr_s =
+            find_lrp_member_ip(out_port, rule->nexthops[i]);
+        if (!lrp_addr_s) {
+            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
+            VLOG_WARN_RL(&rl, "lrp_addr not found for routing policy "
+                            " priority %"PRId64" nexthop %s",
+                            rule->priority, rule->nexthops[i]);
+            goto cleanup;
+        }
+
+        ds_clear(&actions);
+        uint32_t pkt_mark = ovn_smap_get_uint(&rule->options, "pkt_mark", 0);
+        if (pkt_mark) {
+            ds_put_format(&actions, "pkt.mark = %u; ", pkt_mark);
+        }
+
+        bool is_ipv4 = strchr(rule->nexthops[i], '.') ? true : false;
+
+        ds_put_format(&actions, "%s = %s; "
+                      "%s = %s; "
+                      "eth.src = %s; "
+                      "outport = %s; "
+                      "flags.loopback = 1; "
+                      "next;",
+                      is_ipv4 ? REG_NEXT_HOP_IPV4 : REG_NEXT_HOP_IPV6,
+                      rule->nexthops[i],
+                      is_ipv4 ? REG_SRC_IPV4 : REG_SRC_IPV6,
+                      lrp_addr_s,
+                      out_port->lrp_networks.ea_s,
+                      out_port->json_key);
+
+        ds_clear(&match);
+        ds_put_format(&match, REG_ECMP_GROUP_ID" == %"PRIu16" && "
+                      REG_ECMP_MEMBER_ID" == %"PRIu16,
+                      ecmp_group_id, i + 1);
+        ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_POLICY_ECMP,
+                                100, ds_cstr(&match),
+                                ds_cstr(&actions), &rule->header_);
+    }
+
+    ds_clear(&actions);
+    ds_put_format(&actions, "%s = %"PRIu16
+                  "; %s = select(", REG_ECMP_GROUP_ID, ecmp_group_id,
+                  REG_ECMP_MEMBER_ID);
+
+    for (uint16_t i = 0; i < rule->n_nexthops; i++) {
+        if (i > 0) {
+            ds_put_cstr(&actions, ", ");
+        }
+
+        ds_put_format(&actions, "%"PRIu16, i + 1);
+    }
+    ds_put_cstr(&actions, ");");
+    ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_POLICY,
+                            rule->priority, rule->match,
+                            ds_cstr(&actions), &rule->header_);
+
+cleanup:
+    ds_destroy(&match);
+    ds_destroy(&actions);
+}
+
 struct parsed_route {
     struct ovs_list list_node;
     struct in6_addr prefix;
@@ -10300,13 +10408,27 @@ build_ingress_policy_flows_for_lrouter(
     if (od->nbr) {
         /* This is a catch-all rule. It has the lowest priority (0)
          * does a match-all("1") and pass-through (next) */
-        ovn_lflow_add(lflows, od, S_ROUTER_IN_POLICY, 0, "1", "next;");
+        ovn_lflow_add(lflows, od, S_ROUTER_IN_POLICY, 0, "1",
+                      REG_ECMP_GROUP_ID" = 0; next;");
+        ovn_lflow_add(lflows, od, S_ROUTER_IN_POLICY_ECMP, 150,
+                      REG_ECMP_GROUP_ID" == 0", "next;");
 
         /* Convert routing policies to flows. */
+        uint16_t ecmp_group_id = 1;
         for (int i = 0; i < od->nbr->n_policies; i++) {
             const struct nbrec_logical_router_policy *rule
                 = od->nbr->policies[i];
-            build_routing_policy_flow(lflows, od, ports, rule, &rule->header_);
+            bool is_ecmp_reroute =
+                (!strcmp(rule->action, "reroute") && rule->n_nexthops > 1);
+
+            if (is_ecmp_reroute) {
+                build_ecmp_routing_policy_flows(lflows, od, ports, rule,
+                                                ecmp_group_id);
+                ecmp_group_id++;
+            } else {
+                build_routing_policy_flow(lflows, od, ports, rule,
+                                          &rule->header_);
+            }
         }
     }
 }
diff --git a/ovn-nb.ovsschema b/ovn-nb.ovsschema
index af77dd138..b77a2308c 100644
--- a/ovn-nb.ovsschema
+++ b/ovn-nb.ovsschema
@@ -1,7 +1,7 @@
 {
     "name": "OVN_Northbound",
-    "version": "5.29.0",
-    "cksum": "328602112 27064",
+    "version": "5.30.0",
+    "cksum": "3273824429 27172",
     "tables": {
         "NB_Global": {
             "columns": {
@@ -391,6 +391,8 @@
                     "key": {"type": "string",
                             "enum": ["set", ["allow", "drop", "reroute"]]}}},
                 "nexthop": {"type": {"key": "string", "min": 0, "max": 1}},
+                "nexthops": {"type": {
+                    "key": "string", "min": 0, "max": "unlimited"}},
                 "options": {
                     "type": {"key": "string", "value": "string",
                              "min": 0, "max": "unlimited"}},
diff --git a/ovn-nb.xml b/ovn-nb.xml
index e7a8d6833..0cf043790 100644
--- a/ovn-nb.xml
+++ b/ovn-nb.xml
@@ -2723,18 +2723,34 @@
         </li>
 
         <li>
-          <code>reroute</code>: Reroute packet to <ref column="nexthop"/>.
+          <code>reroute</code>: Reroute packet to <ref column="nexthop"/> or
+          <ref column="nexthops"/>.
         </li>
       </ul>
     </column>
 
     <column name="nexthop">
+      <p>
+        Note: This column is deprecated in favor of <ref column="nexthops"/>.
+      </p>
       <p>
         Next-hop IP address for this route, which should be the IP
         address of a connected router port or the IP address of a logical port.
       </p>
     </column>
 
+    <column name="nexthops">
+      <p>
+        Next-hop ECMP IP addresses for this route. Each IP in the list should
+        be the IP address of a connected router port or the IP address of a
+        logical port.
+      </p>
+
+      <p>
+        One IP from the list is selected as the next hop.
+      </p>
+    </column>
+
     <column name="options" key="pkt_mark">
       <p>
         Marks the packet with the value specified when the router policy
diff --git a/tests/ovn-northd.at b/tests/ovn-northd.at
index 50a4cae76..ce6c44db4 100644
--- a/tests/ovn-northd.at
+++ b/tests/ovn-northd.at
@@ -2198,3 +2198,127 @@ dnl Number of common flows should be the same.
 check_row_count Logical_Flow ${n_flows_common} logical_dp_group=${dp_group_uuid}
 
 AT_CLEANUP
+
+AT_SETUP([ovn -- Router policies - ECMP reroute])
+AT_KEYWORDS([router policies ecmp reroute])
+ovn_start
+
+check ovn-nbctl ls-add sw0
+check ovn-nbctl lsp-add sw0 sw0-port1
+check ovn-nbctl lsp-set-addresses sw0-port1 "50:54:00:00:00:03 10.0.0.3"
+
+check ovn-nbctl ls-add sw1
+check ovn-nbctl lsp-add sw1 sw1-port1
+check ovn-nbctl lsp-set-addresses sw1-port1 "40:54:00:00:00:03 20.0.0.3"
+
+# Create a logical router and attach both logical switches
+check ovn-nbctl lr-add lr0
+check ovn-nbctl lrp-add lr0 lr0-sw0 00:00:00:00:ff:01 10.0.0.1/24 1000::a/64
+check ovn-nbctl lsp-add sw0 sw0-lr0
+check ovn-nbctl lsp-set-type sw0-lr0 router
+check ovn-nbctl lsp-set-addresses sw0-lr0 00:00:00:00:ff:01
+check ovn-nbctl lsp-set-options sw0-lr0 router-port=lr0-sw0
+
+check ovn-nbctl lrp-add lr0 lr0-sw1 00:00:00:00:ff:02 20.0.0.1/24 2000::a/64
+check ovn-nbctl lsp-add sw1 sw1-lr0
+check ovn-nbctl lsp-set-type sw1-lr0 router
+check ovn-nbctl lsp-set-addresses sw1-lr0 00:00:00:00:ff:02
+check ovn-nbctl lsp-set-options sw1-lr0 router-port=lr0-sw1
+
+check ovn-nbctl ls-add public
+check ovn-nbctl lrp-add lr0 lr0-public 00:00:20:20:12:13 172.168.0.100/24
+check ovn-nbctl lsp-add public public-lr0
+check ovn-nbctl lsp-set-type public-lr0 router
+check ovn-nbctl lsp-set-addresses public-lr0 router
+check ovn-nbctl lsp-set-options public-lr0 router-port=lr0-public
+
+check ovn-nbctl --wait=sb lr-policy-add lr0  10 "ip4.src == 10.0.0.3" reroute 172.168.0.101,172.168.0.102
+
+ovn-sbctl dump-flows lr0 > lr0flows3
+AT_CAPTURE_FILE([lr0flows3])
+
+AT_CHECK([grep "lr_in_policy" lr0flows3 | sort], [0], [dnl
+  table=12(lr_in_policy       ), priority=0    , match=(1), action=(reg8[[0..15]] = 0; next;)
+  table=12(lr_in_policy       ), priority=10   , match=(ip4.src == 10.0.0.3), action=(reg8[[0..15]] = 1; reg8[[16..31]] = select(1, 2);)
+  table=13(lr_in_policy_ecmp  ), priority=100  , match=(reg8[[0..15]] == 1 && reg8[[16..31]] == 1), action=(reg0 = 172.168.0.101; reg1 = 172.168.0.100; eth.src = 00:00:20:20:12:13; outport = "lr0-public"; flags.loopback = 1; next;)
+  table=13(lr_in_policy_ecmp  ), priority=100  , match=(reg8[[0..15]] == 1 && reg8[[16..31]] == 2), action=(reg0 = 172.168.0.102; reg1 = 172.168.0.100; eth.src = 00:00:20:20:12:13; outport = "lr0-public"; flags.loopback = 1; next;)
+  table=13(lr_in_policy_ecmp  ), priority=150  , match=(reg8[[0..15]] == 0), action=(next;)
+])
+
+check ovn-nbctl --wait=sb lr-policy-add lr0  10 "ip4.src == 10.0.0.4" reroute 172.168.0.101,172.168.0.102,172.168.0.103
+ovn-sbctl dump-flows lr0 > lr0flows3
+AT_CAPTURE_FILE([lr0flows3])
+
+AT_CHECK([grep "lr_in_policy" lr0flows3 |  \
+sed 's/reg8\[[0..15\]] = [[0-9]]*/reg8\[[0..15\]] = <cleared>/' | \
+sed 's/reg8\[[0..15\]] == [[0-9]]*/reg8\[[0..15\]] == <cleared>/' | sort], [0], [dnl
+  table=12(lr_in_policy       ), priority=0    , match=(1), action=(reg8[[0..15]] = <cleared>; next;)
+  table=12(lr_in_policy       ), priority=10   , match=(ip4.src == 10.0.0.3), action=(reg8[[0..15]] = <cleared>; reg8[[16..31]] = select(1, 2);)
+  table=12(lr_in_policy       ), priority=10   , match=(ip4.src == 10.0.0.4), action=(reg8[[0..15]] = <cleared>; reg8[[16..31]] = select(1, 2, 3);)
+  table=13(lr_in_policy_ecmp  ), priority=100  , match=(reg8[[0..15]] == <cleared> && reg8[[16..31]] == 1), action=(reg0 = 172.168.0.101; reg1 = 172.168.0.100; eth.src = 00:00:20:20:12:13; outport = "lr0-public"; flags.loopback = 1; next;)
+  table=13(lr_in_policy_ecmp  ), priority=100  , match=(reg8[[0..15]] == <cleared> && reg8[[16..31]] == 1), action=(reg0 = 172.168.0.101; reg1 = 172.168.0.100; eth.src = 00:00:20:20:12:13; outport = "lr0-public"; flags.loopback = 1; next;)
+  table=13(lr_in_policy_ecmp  ), priority=100  , match=(reg8[[0..15]] == <cleared> && reg8[[16..31]] == 2), action=(reg0 = 172.168.0.102; reg1 = 172.168.0.100; eth.src = 00:00:20:20:12:13; outport = "lr0-public"; flags.loopback = 1; next;)
+  table=13(lr_in_policy_ecmp  ), priority=100  , match=(reg8[[0..15]] == <cleared> && reg8[[16..31]] == 2), action=(reg0 = 172.168.0.102; reg1 = 172.168.0.100; eth.src = 00:00:20:20:12:13; outport = "lr0-public"; flags.loopback = 1; next;)
+  table=13(lr_in_policy_ecmp  ), priority=100  , match=(reg8[[0..15]] == <cleared> && reg8[[16..31]] == 3), action=(reg0 = 172.168.0.103; reg1 = 172.168.0.100; eth.src = 00:00:20:20:12:13; outport = "lr0-public"; flags.loopback = 1; next;)
+  table=13(lr_in_policy_ecmp  ), priority=150  , match=(reg8[[0..15]] == <cleared>), action=(next;)
+])
+
+check ovn-nbctl --wait=sb lr-policy-add lr0  10 "ip4.src == 10.0.0.5" reroute 172.168.0.110
+ovn-sbctl dump-flows lr0 > lr0flows3
+AT_CAPTURE_FILE([lr0flows3])
+
+AT_CHECK([grep "lr_in_policy" lr0flows3 |  \
+sed 's/reg8\[[0..15\]] = [[0-9]]*/reg8\[[0..15\]] = <cleared>/' | \
+sed 's/reg8\[[0..15\]] == [[0-9]]*/reg8\[[0..15\]] == <cleared>/' | sort], [0], [dnl
+  table=12(lr_in_policy       ), priority=0    , match=(1), action=(reg8[[0..15]] = <cleared>; next;)
+  table=12(lr_in_policy       ), priority=10   , match=(ip4.src == 10.0.0.3), action=(reg8[[0..15]] = <cleared>; reg8[[16..31]] = select(1, 2);)
+  table=12(lr_in_policy       ), priority=10   , match=(ip4.src == 10.0.0.4), action=(reg8[[0..15]] = <cleared>; reg8[[16..31]] = select(1, 2, 3);)
+  table=12(lr_in_policy       ), priority=10   , match=(ip4.src == 10.0.0.5), action=(reg0 = 172.168.0.110; reg1 = 172.168.0.100; eth.src = 00:00:20:20:12:13; outport = "lr0-public"; flags.loopback = 1; reg8[[0..15]] = <cleared>; next;)
+  table=13(lr_in_policy_ecmp  ), priority=100  , match=(reg8[[0..15]] == <cleared> && reg8[[16..31]] == 1), action=(reg0 = 172.168.0.101; reg1 = 172.168.0.100; eth.src = 00:00:20:20:12:13; outport = "lr0-public"; flags.loopback = 1; next;)
+  table=13(lr_in_policy_ecmp  ), priority=100  , match=(reg8[[0..15]] == <cleared> && reg8[[16..31]] == 1), action=(reg0 = 172.168.0.101; reg1 = 172.168.0.100; eth.src = 00:00:20:20:12:13; outport = "lr0-public"; flags.loopback = 1; next;)
+  table=13(lr_in_policy_ecmp  ), priority=100  , match=(reg8[[0..15]] == <cleared> && reg8[[16..31]] == 2), action=(reg0 = 172.168.0.102; reg1 = 172.168.0.100; eth.src = 00:00:20:20:12:13; outport = "lr0-public"; flags.loopback = 1; next;)
+  table=13(lr_in_policy_ecmp  ), priority=100  , match=(reg8[[0..15]] == <cleared> && reg8[[16..31]] == 2), action=(reg0 = 172.168.0.102; reg1 = 172.168.0.100; eth.src = 00:00:20:20:12:13; outport = "lr0-public"; flags.loopback = 1; next;)
+  table=13(lr_in_policy_ecmp  ), priority=100  , match=(reg8[[0..15]] == <cleared> && reg8[[16..31]] == 3), action=(reg0 = 172.168.0.103; reg1 = 172.168.0.100; eth.src = 00:00:20:20:12:13; outport = "lr0-public"; flags.loopback = 1; next;)
+  table=13(lr_in_policy_ecmp  ), priority=150  , match=(reg8[[0..15]] == <cleared>), action=(next;)
+])
+
+check ovn-nbctl --wait=sb lr-policy-del lr0  10 "ip4.src == 10.0.0.3"
+ovn-sbctl dump-flows lr0 > lr0flows3
+AT_CAPTURE_FILE([lr0flows3])
+
+AT_CHECK([grep "lr_in_policy" lr0flows3 |  \
+sed 's/reg8\[[0..15\]] = [[0-9]]*/reg8\[[0..15\]] = <cleared>/' | \
+sed 's/reg8\[[0..15\]] == [[0-9]]*/reg8\[[0..15\]] == <cleared>/' | sort], [0], [dnl
+  table=12(lr_in_policy       ), priority=0    , match=(1), action=(reg8[[0..15]] = <cleared>; next;)
+  table=12(lr_in_policy       ), priority=10   , match=(ip4.src == 10.0.0.4), action=(reg8[[0..15]] = <cleared>; reg8[[16..31]] = select(1, 2, 3);)
+  table=12(lr_in_policy       ), priority=10   , match=(ip4.src == 10.0.0.5), action=(reg0 = 172.168.0.110; reg1 = 172.168.0.100; eth.src = 00:00:20:20:12:13; outport = "lr0-public"; flags.loopback = 1; reg8[[0..15]] = <cleared>; next;)
+  table=13(lr_in_policy_ecmp  ), priority=100  , match=(reg8[[0..15]] == <cleared> && reg8[[16..31]] == 1), action=(reg0 = 172.168.0.101; reg1 = 172.168.0.100; eth.src = 00:00:20:20:12:13; outport = "lr0-public"; flags.loopback = 1; next;)
+  table=13(lr_in_policy_ecmp  ), priority=100  , match=(reg8[[0..15]] == <cleared> && reg8[[16..31]] == 2), action=(reg0 = 172.168.0.102; reg1 = 172.168.0.100; eth.src = 00:00:20:20:12:13; outport = "lr0-public"; flags.loopback = 1; next;)
+  table=13(lr_in_policy_ecmp  ), priority=100  , match=(reg8[[0..15]] == <cleared> && reg8[[16..31]] == 3), action=(reg0 = 172.168.0.103; reg1 = 172.168.0.100; eth.src = 00:00:20:20:12:13; outport = "lr0-public"; flags.loopback = 1; next;)
+  table=13(lr_in_policy_ecmp  ), priority=150  , match=(reg8[[0..15]] == <cleared>), action=(next;)
+])
+
+check ovn-nbctl --wait=sb lr-policy-del lr0  10 "ip4.src == 10.0.0.4"
+ovn-sbctl dump-flows lr0 > lr0flows3
+AT_CAPTURE_FILE([lr0flows3])
+
+AT_CHECK([grep "lr_in_policy" lr0flows3 |  \
+sed 's/reg8\[[0..15\]] = [[0-9]]*/reg8\[[0..15\]] = <cleared>/' | \
+sed 's/reg8\[[0..15\]] == [[0-9]]*/reg8\[[0..15\]] == <cleared>/' | sort], [0], [dnl
+  table=12(lr_in_policy       ), priority=0    , match=(1), action=(reg8[[0..15]] = <cleared>; next;)
+  table=12(lr_in_policy       ), priority=10   , match=(ip4.src == 10.0.0.5), action=(reg0 = 172.168.0.110; reg1 = 172.168.0.100; eth.src = 00:00:20:20:12:13; outport = "lr0-public"; flags.loopback = 1; reg8[[0..15]] = <cleared>; next;)
+  table=13(lr_in_policy_ecmp  ), priority=150  , match=(reg8[[0..15]] == <cleared>), action=(next;)
+])
+
+check ovn-nbctl --wait=sb add logical_router_policy . nexthops "2000\:\:b"
+ovn-sbctl dump-flows lr0 > lr0flows3
+AT_CAPTURE_FILE([lr0flows3])
+
+AT_CHECK([grep "lr_in_policy" lr0flows3 |  \
+sed 's/reg8\[[0..15\]] = [[0-9]]*/reg8\[[0..15\]] = <cleared>/' | \
+sed 's/reg8\[[0..15\]] == [[0-9]]*/reg8\[[0..15\]] == <cleared>/' | sort], [0], [dnl
+  table=12(lr_in_policy       ), priority=0    , match=(1), action=(reg8[[0..15]] = <cleared>; next;)
+  table=13(lr_in_policy_ecmp  ), priority=150  , match=(reg8[[0..15]] == <cleared>), action=(next;)
+])
+
+AT_CLEANUP
diff --git a/tests/ovn.at b/tests/ovn.at
index 2e0bc9c53..66088a7f5 100644
--- a/tests/ovn.at
+++ b/tests/ovn.at
@@ -21156,31 +21156,31 @@ AT_CHECK([
 
 AT_CHECK([ovn-sbctl lflow-list | grep -E "lr_in_policy.*priority=1001" | sort], [0], [dnl
   table=12(lr_in_policy       ), priority=1001 , dnl
-match=(ip6), action=(pkt.mark = 4294967295; next;)
+match=(ip6), action=(pkt.mark = 4294967295; reg8[[0..15]] = 0; next;)
 ])
 
 ovn-nbctl --wait=hv set logical_router_policy $pol5 options:pkt_mark=-1
 AT_CHECK([ovn-sbctl lflow-list | grep -E "lr_in_policy.*priority=1001" | sort], [0], [dnl
   table=12(lr_in_policy       ), priority=1001 , dnl
-match=(ip6), action=(next;)
+match=(ip6), action=(reg8[[0..15]] = 0; next;)
 ])
 
 ovn-nbctl --wait=hv set logical_router_policy $pol5 options:pkt_mark=2147483648
 AT_CHECK([ovn-sbctl lflow-list | grep -E "lr_in_policy.*priority=1001" | sort], [0], [dnl
   table=12(lr_in_policy       ), priority=1001 , dnl
-match=(ip6), action=(pkt.mark = 2147483648; next;)
+match=(ip6), action=(pkt.mark = 2147483648; reg8[[0..15]] = 0; next;)
 ])
 
 ovn-nbctl --wait=hv set logical_router_policy $pol5 options:pkt_mark=foo
 AT_CHECK([ovn-sbctl lflow-list | grep -E "lr_in_policy.*priority=1001" | sort], [0], [dnl
   table=12(lr_in_policy       ), priority=1001 , dnl
-match=(ip6), action=(next;)
+match=(ip6), action=(reg8[[0..15]] = 0; next;)
 ])
 
 ovn-nbctl --wait=hv set logical_router_policy $pol5 options:pkt_mark=4294967296
 AT_CHECK([ovn-sbctl lflow-list | grep -E "lr_in_policy.*priority=1001" | sort], [0], [dnl
   table=12(lr_in_policy       ), priority=1001 , dnl
-match=(ip6), action=(next;)
+match=(ip6), action=(reg8[[0..15]] = 0; next;)
 ])
 
 OVN_CLEANUP([hv1])
@@ -21759,7 +21759,7 @@ AT_CHECK([
     grep "ct(commit,zone=NXM_NX_REG11\\[[0..15\\]],exec(move:NXM_OF_ETH_SRC\\[[\\]]->NXM_NX_CT_LABEL\\[[32..79\\]],load:0x[[0-9]]->NXM_NX_CT_LABEL\\[[80..95\\]]))" -c)
 ])
 AT_CHECK([
-    test 1 -eq $(as hv1 ovs-ofctl dump-flows br-int table=21 | \
+    test 1 -eq $(as hv1 ovs-ofctl dump-flows br-int table=22 | \
     grep "priority=200" | \
     grep "actions=move:NXM_NX_CT_LABEL\\[[32..79\\]]->NXM_OF_ETH_DST\\[[\\]]" -c)
 ])
@@ -21770,7 +21770,7 @@ AT_CHECK([
     grep "ct(commit,zone=NXM_NX_REG11\\[[0..15\\]],exec(move:NXM_OF_ETH_SRC\\[[\\]]->NXM_NX_CT_LABEL\\[[32..79\\]],load:0x[[0-9]]->NXM_NX_CT_LABEL\\[[80..95\\]]))" -c)
 ])
 AT_CHECK([
-    test 0 -eq $(as hv2 ovs-ofctl dump-flows br-int table=21 | \
+    test 0 -eq $(as hv2 ovs-ofctl dump-flows br-int table=22 | \
     grep "priority=200" | \
     grep "actions=move:NXM_NX_CT_LABEL\\[[32..79\\]]->NXM_OF_ETH_DST\\[[\\]]" -c)
 ])
@@ -22208,7 +22208,7 @@ AT_CHECK([as hv1 ovs-ofctl dump-flows br-int | grep "actions=controller" | grep
 ])
 
 # The packet should've been dropped in the lr_in_arp_resolve stage.
-AT_CHECK([as hv1 ovs-ofctl dump-flows br-int | grep -E "table=21, n_packets=1,.* priority=1,ip,metadata=0x${sw_key},nw_dst=10.0.1.1 actions=drop" -c], [0], [dnl
+AT_CHECK([as hv1 ovs-ofctl dump-flows br-int | grep -E "table=22, n_packets=1,.* priority=1,ip,metadata=0x${sw_key},nw_dst=10.0.1.1 actions=drop" -c], [0], [dnl
 1
 ])
 
diff --git a/utilities/ovn-nbctl.8.xml b/utilities/ovn-nbctl.8.xml
index e5a35f307..e6fec9980 100644
--- a/utilities/ovn-nbctl.8.xml
+++ b/utilities/ovn-nbctl.8.xml
@@ -739,7 +739,7 @@
     <dl>
       <dt>[<code>--may-exist</code>]<code>lr-policy-add</code>
           <var>router</var> <var>priority</var> <var>match</var>
-          <var>action</var> [<var>nexthop</var>]
+          <var>action</var> [<var>nexthop</var>[,<var>nexthop</var>,...]]
           [<var>options key=value]</var>] </dt>
       <dd>
         <p>
@@ -748,10 +748,12 @@
           are similar to OVN ACLs, but exist on the logical-router. Reroute
           policies are needed for service-insertion and service-chaining.
           <var>nexthop</var> is an optional parameter. It needs to be provided
-          only when <var>action</var> is <var>reroute</var>. A policy is
-          uniquely identified by <var>priority</var> and <var>match</var>.
-          Multiple policies can have the same <var>priority</var>.
-          <var>options</var> sets the router policy options as key-value pair.
+          only when <var>action</var> is <var>reroute</var>. Multiple
+          <code>nexthops</code> can be specified for ECMP routing.
+          A policy is uniquely identified by <var>priority</var> and
+          <var>match</var>. Multiple policies can have the same
+          <var>priority</var>. <var>options</var> sets the router policy
+          options as key-value pairs.
           The supported option is : <code>pkt_mark</code>.
         </p>
 
diff --git a/utilities/ovn-nbctl.c b/utilities/ovn-nbctl.c
index 835161f25..94e7eedeb 100644
--- a/utilities/ovn-nbctl.c
+++ b/utilities/ovn-nbctl.c
@@ -766,7 +766,7 @@ Route commands:\n\
   lr-route-list ROUTER      print routes for ROUTER\n\
 \n\
 Policy commands:\n\
-  lr-policy-add ROUTER PRIORITY MATCH ACTION [NEXTHOP] \
+  lr-policy-add ROUTER PRIORITY MATCH ACTION [NEXTHOP[,NEXTHOP,...]] \
 [OPTIONS KEY=VALUE ...] \n\
                             add a policy to router\n\
   lr-policy-del ROUTER [{PRIORITY | UUID} [MATCH]]\n\
@@ -3634,7 +3634,8 @@ nbctl_lr_policy_add(struct ctl_context *ctx)
         return;
     }
     const char *action = ctx->argv[4];
-    char *next_hop = NULL;
+    size_t n_nexthops = 0;
+    char **nexthops = NULL;
 
     bool reroute = false;
     /* Validate action. */
@@ -3654,7 +3655,8 @@ nbctl_lr_policy_add(struct ctl_context *ctx)
     /* Check if same routing policy already exists.
      * A policy is uniquely identified by priority and match */
     bool may_exist = !!shash_find(&ctx->options, "--may-exist");
-    for (int i = 0; i < lr->n_policies; i++) {
+    size_t i;
+    for (i = 0; i < lr->n_policies; i++) {
         const struct nbrec_logical_router_policy *policy = lr->policies[i];
         if (policy->priority == priority &&
             !strcmp(policy->match, ctx->argv[3])) {
@@ -3665,12 +3667,53 @@ nbctl_lr_policy_add(struct ctl_context *ctx)
             return;
         }
     }
+
     if (reroute) {
-        next_hop = normalize_prefix_str(ctx->argv[5]);
-        if (!next_hop) {
-            ctl_error(ctx, "bad next hop argument: %s", ctx->argv[5]);
-            return;
+        char *nexthops_arg = xstrdup(ctx->argv[5]);
+        char *save_ptr, *next_hop, *token;
+
+        n_nexthops = 0;
+        size_t n_allocs = 0;
+
+        bool nexthops_is_ipv4 = true;
+        for (token = strtok_r(nexthops_arg, ",", &save_ptr);
+            token != NULL; token = strtok_r(NULL, ",", &save_ptr)) {
+            next_hop = normalize_addr_str(token);
+
+            if (!next_hop) {
+                ctl_error(ctx, "bad next hop argument: %s", ctx->argv[5]);
+                free(nexthops_arg);
+                for (i = 0; i < n_nexthops; i++) {
+                    free(nexthops[i]);
+                }
+                free(nexthops);
+                return;
+            }
+            if (n_nexthops == n_allocs) {
+                nexthops = x2nrealloc(nexthops, &n_allocs, sizeof *nexthops);
+            }
+
+            bool is_ipv4 = strchr(next_hop, '.') ? true : false;
+            if (n_nexthops == 0) {
+                nexthops_is_ipv4 = is_ipv4;
+            }
+
+            if (is_ipv4 != nexthops_is_ipv4) {
+                ctl_error(ctx, "bad next hops argument, not in the same "
+                          "addr family : %s", ctx->argv[5]);
+                free(nexthops_arg);
+                free(next_hop);
+                for (i = 0; i < n_nexthops; i++) {
+                    free(nexthops[i]);
+                }
+                free(nexthops);
+                return;
+            }
+            nexthops[n_nexthops] = next_hop;
+            n_nexthops++;
         }
+
+        free(nexthops_arg);
     }
 
     struct nbrec_logical_router_policy *policy;
@@ -3679,12 +3722,13 @@ nbctl_lr_policy_add(struct ctl_context *ctx)
     nbrec_logical_router_policy_set_match(policy, ctx->argv[3]);
     nbrec_logical_router_policy_set_action(policy, action);
     if (reroute) {
-        nbrec_logical_router_policy_set_nexthop(policy, next_hop);
+        nbrec_logical_router_policy_set_nexthops(
+            policy, (const char **)nexthops, n_nexthops);
     }
 
     /* Parse the options. */
     struct smap options = SMAP_INITIALIZER(&options);
-    for (size_t i = reroute ? 6 : 5; i < ctx->argc; i++) {
+    for (i = reroute ? 6 : 5; i < ctx->argc; i++) {
         char *key, *value;
         value = xstrdup(ctx->argv[i]);
         key = strsep(&value, "=");
@@ -3694,7 +3738,10 @@ nbctl_lr_policy_add(struct ctl_context *ctx)
             ctl_error(ctx, "No value specified for the option : %s", key);
             smap_destroy(&options);
             free(key);
-            free(next_hop);
+            for (i = 0; i < n_nexthops; i++) {
+                free(nexthops[i]);
+            }
+            free(nexthops);
             return;
         }
         free(key);
@@ -3703,9 +3750,11 @@ nbctl_lr_policy_add(struct ctl_context *ctx)
     smap_destroy(&options);
 
     nbrec_logical_router_update_policies_addvalue(lr, policy);
-    if (next_hop != NULL) {
-        free(next_hop);
+
+    for (i = 0; i < n_nexthops; i++) {
+        free(nexthops[i]);
     }
+    free(nexthops);
 }
 
 static void
-- 
2.28.0