diff --git a/.ovn.metadata b/.ovn.metadata index 6f36ca7..2dec392 100644 --- a/.ovn.metadata +++ b/.ovn.metadata @@ -1,6 +1,5 @@ 002450621b33c5690060345b0aac25bc2426d675 SOURCES/docutils-0.12.tar.gz -9bd78cda74977132b211af95ac0f63bf05fafb91 SOURCES/openvswitch-2.14.90.tar.gz -057adab900c382cd8bd12966e8dfd7d7d1cb9b29 SOURCES/openvswitch-ac09cbf.tar.gz -f56373e54eec629b9d6e88e8b1c0c880bd498809 SOURCES/ovn-20.12.0.tar.gz +b7cb5bddcefce929e60e4533da84d13dc8ce4fd0 SOURCES/openvswitch-ac85cdb.tar.gz +35a22f67bf3675fce0ca8a39ee4aed7e0b716560 SOURCES/ovn-21.03.0.tar.gz d34f96421a86004aa5d26ecf975edefd09f948b1 SOURCES/Pygments-1.4.tar.gz 6beb30f18ffac3de7689b7fd63e9a8a7d9c8df3a SOURCES/Sphinx-1.1.3.tar.gz diff --git a/SOURCES/gen_config_group.sh b/SOURCES/gen_config_group.sh index 651a0c5..d1c06fe 100755 --- a/SOURCES/gen_config_group.sh +++ b/SOURCES/gen_config_group.sh @@ -207,10 +207,10 @@ do done popd >/dev/null -echo -n "For each arch ( " +printf "For each arch ( " for ((i=0; i < ${#OVS_DPDK_CONF_MACH_ARCH[@]}; i++)); do - echo -n "${OVS_DPDK_CONF_MACH_ARCH[i]} " + printf "${OVS_DPDK_CONF_MACH_ARCH[i]} " done echo "):" echo "1. ensure you enable the requisite hw" diff --git a/SOURCES/ovn-21.03.0.patch b/SOURCES/ovn-21.03.0.patch new file mode 100644 index 0000000..99c4e61 --- /dev/null +++ b/SOURCES/ovn-21.03.0.patch @@ -0,0 +1,4948 @@ +diff --git a/.ci/linux-prepare.sh b/.ci/linux-prepare.sh +index 0bb0ff096..83ad3958b 100755 +--- a/.ci/linux-prepare.sh ++++ b/.ci/linux-prepare.sh +@@ -12,5 +12,5 @@ set -ev + git clone git://git.kernel.org/pub/scm/devel/sparse/sparse.git + cd sparse && make -j4 HAVE_LLVM= HAVE_SQLITE= install && cd .. 
+ +-pip install --disable-pip-version-check --user six flake8 hacking +-pip install --user --upgrade docutils ++pip3 install --disable-pip-version-check --user flake8 hacking sphinx pyOpenSSL ++pip3 install --upgrade --user docutils +diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml +index f3a53a8b6..91bd1e538 100644 +--- a/.github/workflows/test.yml ++++ b/.github/workflows/test.yml +@@ -13,7 +13,6 @@ jobs: + dependencies: | + automake libtool gcc bc libjemalloc1 libjemalloc-dev \ + libssl-dev llvm-dev libelf-dev libnuma-dev libpcap-dev \ +- python3-openssl python3-pip python3-sphinx \ + selinux-policy-dev + m32_dependecies: gcc-multilib + CC: ${{ matrix.compiler }} +@@ -88,11 +87,21 @@ jobs: + if: matrix.m32 != '' + run: sudo apt install -y ${{ env.m32_dependecies }} + ++ - name: update PATH ++ run: | ++ echo "$HOME/bin" >> $GITHUB_PATH ++ echo "$HOME/.local/bin" >> $GITHUB_PATH ++ ++ - name: set up python ++ uses: actions/setup-python@v2 ++ with: ++ python-version: '3.x' ++ + - name: prepare + run: ./.ci/linux-prepare.sh + + - name: build +- run: PATH="$PATH:$HOME/bin" ./.ci/linux-build.sh ++ run: ./.ci/linux-build.sh + + - name: copy logs on failure + if: failure() || cancelled() +@@ -145,10 +154,18 @@ jobs: + ref: 'master' + - name: install dependencies + run: brew install automake libtool ++ - name: update PATH ++ run: | ++ echo "$HOME/bin" >> $GITHUB_PATH ++ echo "$HOME/.local/bin" >> $GITHUB_PATH ++ - name: set up python ++ uses: actions/setup-python@v2 ++ with: ++ python-version: '3.x' + - name: prepare + run: ./.ci/osx-prepare.sh + - name: build +- run: PATH="$PATH:$HOME/bin" ./.ci/osx-build.sh ++ run: ./.ci/osx-build.sh + - name: upload logs on failure + if: failure() + uses: actions/upload-artifact@v2 +diff --git a/Makefile.am b/Makefile.am +index 80247b62d..1fe730dc4 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -221,6 +221,7 @@ dist-hook-git: distfiles + grep -v '\.gitattributes$$' | \ + grep -v '\.gitmodules$$' | \ + grep -v
"$(submodules)" | \ ++ grep -v 'redhat' | \ + LC_ALL=C sort -u > all-gitfiles; \ + LC_ALL=C comm -1 -3 distfiles all-gitfiles > missing-distfiles; \ + if test -s missing-distfiles; then \ +@@ -332,7 +333,7 @@ check-tabs: + @cd $(srcdir); \ + if test -e .git && (git --version) >/dev/null 2>&1 && \ + grep -ln "^ " \ +- `git ls-files | grep -v $(submodules) \ ++ `git ls-files | grep -v $(submodules) | grep -v redhat \ + | grep -v -f build-aux/initial-tab-whitelist` /dev/null \ + | $(EGREP) -v ':[ ]*/?\*'; \ + then \ +diff --git a/NEWS b/NEWS +index 5372668bf..530c5d42f 100644 +--- a/NEWS ++++ b/NEWS +@@ -1,3 +1,13 @@ ++Post-v21.03.0 ++------------------------- ++ - ovn-northd-ddlog: New implementation of northd, based on DDlog. This ++ implementation is incremental, meaning that it only recalculates what is ++ needed for the southbound database when northbound changes occur. It is ++ expected to scale better than the C implementation, for large deployments. ++ (This may take testing and tuning to be effective.) This version of OVN ++ requires DDlog 0.36. ++ - Introduce ovn-controller incremental processing engine statistics ++ + OVN v21.03.0 - 12 Mar 2021 + ------------------------- + - Support ECMP multiple nexthops for reroute router policies. +diff --git a/configure.ac b/configure.ac +index 37b476d53..f3de6fef2 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -13,7 +13,7 @@ + # limitations under the License. + + AC_PREREQ(2.63) +-AC_INIT(ovn, 21.03.0, bugs@openvswitch.org) ++AC_INIT(ovn, 21.03.1, bugs@openvswitch.org) + AC_CONFIG_MACRO_DIR([m4]) + AC_CONFIG_AUX_DIR([build-aux]) + AC_CONFIG_HEADERS([config.h]) +diff --git a/controller/binding.c b/controller/binding.c +index 4e6c75696..514f5f33f 100644 +--- a/controller/binding.c ++++ b/controller/binding.c +@@ -597,6 +597,23 @@ remove_local_lport_ids(const struct sbrec_port_binding *pb, + } + } + ++/* Corresponds to each Port_Binding.type.
*/ ++enum en_lport_type { ++ LP_UNKNOWN, ++ LP_VIF, ++ LP_CONTAINER, ++ LP_PATCH, ++ LP_L3GATEWAY, ++ LP_LOCALNET, ++ LP_LOCALPORT, ++ LP_L2GATEWAY, ++ LP_VTEP, ++ LP_CHASSISREDIRECT, ++ LP_VIRTUAL, ++ LP_EXTERNAL, ++ LP_REMOTE ++}; ++ + /* Local bindings. binding.c module binds the logical port (represented by + * Port_Binding rows) and sets the 'chassis' column when it sees the + * OVS interface row (of type "" or "internal") with the +@@ -608,134 +625,180 @@ remove_local_lport_ids(const struct sbrec_port_binding *pb, + * 'struct local_binding' is used. A shash of these local bindings is + * maintained with the 'external_ids:iface-id' as the key to the shash. + * +- * struct local_binding (defined in binding.h) has 3 main fields: +- * - type +- * - OVS interface row object +- * - Port_Binding row object +- * +- * An instance of 'struct local_binding' can be one of 3 types. +- * +- * BT_VIF: Represent a local binding for an OVS interface of +- * type "" or "internal" with the external_ids:iface-id +- * set. +- * +- * This can be a +- * * probable local binding - external_ids:iface-id is +- * set, but the corresponding Port_Binding row is not +- * created or is not visible to the local ovn-controller +- * instance. +- * +- * * a local binding - external_ids:iface-id is set and +- * which is already bound to the corresponding Port_Binding +- * row. +- * +- * It maintains a list of children +- * (of type BT_CONTAINER/BT_VIRTUAL) if any. +- * +- * BT_CONTAINER: Represents a local binding which has a parent of type +- * BT_VIF. Its Port_Binding row's 'parent' column is set to +- * its parent's Port_Binding. It shares the OVS interface row +- * with the parent. +- * Each ovn-controller when it sees a container Port_Binding, +- * it creates 'struct local_binding' for the parent +- * Port_Binding and for its even if the OVS interface row for +- * the parent is not present. +- * +- * BT_VIRTUAL: Represents a local binding which has a parent of type BT_VIF. 
+- * Its Port_Binding type is "virtual" and it shares the OVS +- * interface row with the parent. +- * Port_Binding of type "virtual" is claimed by pinctrl module +- * when it sees the ARP packet from the parent's VIF. +- * ++ * struct local_binding has 3 main fields: ++ * - name : 'external_ids:iface-id' of the OVS interface (key). ++ * - OVS interface row object. ++ * - List of 'binding_lport' objects with the primary lport ++ * in the front of the list (if present). + * + * An object of 'struct local_binding' is created: +- * - For each interface that has iface-id configured with the type - BT_VIF. +- * +- * - For each container Port Binding (of type BT_CONTAINER) and its +- * parent Port_Binding (of type BT_VIF), no matter if +- * they are bound to this chassis i.e even if OVS interface row for the +- * parent is not present. ++ * - For each interface that has external_ids:iface-id configured. + * +- * - For each 'virtual' Port Binding (of type BT_VIRTUAL) provided its parent +- * is bound to this chassis. ++ * - For each port binding (also referred to as lport) of type 'LP_VIF' ++ * if it is a parent lport of container lports even if there is no ++ * corresponding OVS interface.
+ */ ++struct local_binding { ++ char *name; ++ const struct ovsrec_interface *iface; ++ struct ovs_list binding_lports; ++}; + +-static struct local_binding * +-local_binding_create(const char *name, const struct ovsrec_interface *iface, +- const struct sbrec_port_binding *pb, +- enum local_binding_type type) +-{ +- struct local_binding *lbinding = xzalloc(sizeof *lbinding); +- lbinding->name = xstrdup(name); +- lbinding->type = type; +- lbinding->pb = pb; +- lbinding->iface = iface; +- shash_init(&lbinding->children); +- return lbinding; +-} +- +-static void +-local_binding_add(struct shash *local_bindings, struct local_binding *lbinding) +-{ +- shash_add(local_bindings, lbinding->name, lbinding); +-} ++/* This structure represents a logical port (or port binding) ++ * which is associated with 'struct local_binding'. ++ * ++ * An instance of 'struct binding_lport' is created for a logical port ++ * - If the OVS interface's iface-id corresponds to the logical port. ++ * - If it is a container or virtual logical port and its parent ++ * has a 'local binding'. ++ * ++ */ ++struct binding_lport { ++ struct ovs_list list_node; /* Node in local_binding.binding_lports. 
*/ + +-static void +-local_binding_destroy(struct local_binding *lbinding) +-{ +- local_bindings_destroy(&lbinding->children); ++ char *name; ++ const struct sbrec_port_binding *pb; ++ struct local_binding *lbinding; ++ enum en_lport_type type; ++}; + +- free(lbinding->name); +- free(lbinding); +-} ++static struct local_binding *local_binding_create( ++ const char *name, const struct ovsrec_interface *); ++static void local_binding_add(struct shash *local_bindings, ++ struct local_binding *); ++static struct local_binding *local_binding_find( ++ struct shash *local_bindings, const char *name); ++static void local_binding_destroy(struct local_binding *, ++ struct shash *binding_lports); ++static void local_binding_delete(struct local_binding *, ++ struct shash *local_bindings, ++ struct shash *binding_lports); ++static struct binding_lport *local_binding_add_lport( ++ struct shash *binding_lports, ++ struct local_binding *, ++ const struct sbrec_port_binding *, ++ enum en_lport_type); ++static struct binding_lport *local_binding_get_primary_lport( ++ struct local_binding *); ++static bool local_binding_handle_stale_binding_lports( ++ struct local_binding *lbinding, struct binding_ctx_in *b_ctx_in, ++ struct binding_ctx_out *b_ctx_out, struct hmap *qos_map); ++ ++static struct binding_lport *binding_lport_create( ++ const struct sbrec_port_binding *, ++ struct local_binding *, enum en_lport_type); ++static void binding_lport_destroy(struct binding_lport *); ++static void binding_lport_delete(struct shash *binding_lports, ++ struct binding_lport *); ++static void binding_lport_add(struct shash *binding_lports, ++ struct binding_lport *); ++static struct binding_lport *binding_lport_find( ++ struct shash *binding_lports, const char *lport_name); ++static const struct sbrec_port_binding *binding_lport_get_parent_pb( ++ struct binding_lport *b_lprt); ++static struct binding_lport *binding_lport_check_and_cleanup( ++ struct binding_lport *, struct shash *b_lports); ++ 
++static char *get_lport_type_str(enum en_lport_type lport_type); + + void +-local_bindings_init(struct shash *local_bindings) ++local_binding_data_init(struct local_binding_data *lbinding_data) + { +- shash_init(local_bindings); ++ shash_init(&lbinding_data->bindings); ++ shash_init(&lbinding_data->lports); + } + + void +-local_bindings_destroy(struct shash *local_bindings) ++local_binding_data_destroy(struct local_binding_data *lbinding_data) + { + struct shash_node *node, *next; +- SHASH_FOR_EACH_SAFE (node, next, local_bindings) { ++ ++ SHASH_FOR_EACH_SAFE (node, next, &lbinding_data->lports) { ++ struct binding_lport *b_lport = node->data; ++ binding_lport_destroy(b_lport); ++ shash_delete(&lbinding_data->lports, node); ++ } ++ ++ SHASH_FOR_EACH_SAFE (node, next, &lbinding_data->bindings) { + struct local_binding *lbinding = node->data; +- local_binding_destroy(lbinding); +- shash_delete(local_bindings, node); ++ local_binding_destroy(lbinding, &lbinding_data->lports); ++ shash_delete(&lbinding_data->bindings, node); + } + +- shash_destroy(local_bindings); ++ shash_destroy(&lbinding_data->lports); ++ shash_destroy(&lbinding_data->bindings); + } + +-static +-void local_binding_delete(struct shash *local_bindings, +- struct local_binding *lbinding) ++const struct sbrec_port_binding * ++local_binding_get_primary_pb(struct shash *local_bindings, const char *pb_name) + { +- shash_find_and_delete(local_bindings, lbinding->name); +- local_binding_destroy(lbinding); +-} ++ struct local_binding *lbinding = ++ local_binding_find(local_bindings, pb_name); ++ struct binding_lport *b_lport = local_binding_get_primary_lport(lbinding); + +-static void +-local_binding_add_child(struct local_binding *lbinding, +- struct local_binding *child) +-{ +- local_binding_add(&lbinding->children, child); +- child->parent = lbinding; ++ return b_lport ? 
b_lport->pb : NULL; + } + +-static struct local_binding * +-local_binding_find_child(struct local_binding *lbinding, +- const char *child_name) ++void ++binding_dump_local_bindings(struct local_binding_data *lbinding_data, ++ struct ds *out_data) + { +- return local_binding_find(&lbinding->children, child_name); +-} ++ const struct shash_node **nodes; + +-static void +-local_binding_delete_child(struct local_binding *lbinding, +- struct local_binding *child) +-{ +- shash_find_and_delete(&lbinding->children, child->name); ++ nodes = shash_sort(&lbinding_data->bindings); ++ size_t n = shash_count(&lbinding_data->bindings); ++ ++ ds_put_cstr(out_data, "Local bindings:\n"); ++ for (size_t i = 0; i < n; i++) { ++ const struct shash_node *node = nodes[i]; ++ struct local_binding *lbinding = node->data; ++ size_t num_lports = ovs_list_size(&lbinding->binding_lports); ++ ds_put_format(out_data, "name: [%s], OVS interface name : [%s], " ++ "num binding lports : [%"PRIuSIZE"]\n", ++ lbinding->name, ++ lbinding->iface ? 
lbinding->iface->name : "NULL", ++ num_lports); ++ ++ if (num_lports) { ++ struct shash child_lports = SHASH_INITIALIZER(&child_lports); ++ struct binding_lport *primary_lport = NULL; ++ struct binding_lport *b_lport; ++ bool first_elem = true; ++ ++ LIST_FOR_EACH (b_lport, list_node, &lbinding->binding_lports) { ++ if (first_elem && b_lport->type == LP_VIF) { ++ primary_lport = b_lport; ++ } else { ++ shash_add(&child_lports, b_lport->name, b_lport); ++ } ++ first_elem = false; ++ } ++ ++ if (primary_lport) { ++ ds_put_format(out_data, "primary lport : [%s]\n", ++ primary_lport->name); ++ } else { ++ ds_put_format(out_data, "no primary lport\n"); ++ } ++ ++ if (!shash_is_empty(&child_lports)) { ++ const struct shash_node **c_nodes = ++ shash_sort(&child_lports); ++ for (size_t j = 0; j < shash_count(&child_lports); j++) { ++ b_lport = c_nodes[j]->data; ++ ds_put_format(out_data, "child lport[%"PRIuSIZE"] : [%s], " ++ "type : [%s]\n", j + 1, b_lport->name, ++ get_lport_type_str(b_lport->type)); ++ } ++ free(c_nodes); ++ } ++ shash_destroy(&child_lports); ++ } ++ ++ ds_put_cstr(out_data, "----------------------------------------\n"); ++ } ++ ++ free(nodes); + } + + static bool +@@ -744,12 +807,6 @@ is_lport_vif(const struct sbrec_port_binding *pb) + return !pb->type[0]; + } + +-static bool +-is_lport_container(const struct sbrec_port_binding *pb) +-{ +- return is_lport_vif(pb) && pb->parent_port && pb->parent_port[0]; +-} +- + static struct tracked_binding_datapath * + tracked_binding_datapath_create(const struct sbrec_datapath_binding *dp, + bool is_new, +@@ -818,26 +875,13 @@ binding_tracked_dp_destroy(struct hmap *tracked_datapaths) + hmap_destroy(tracked_datapaths); + } + +-/* Corresponds to each Port_Binding.type. 
*/ +-enum en_lport_type { +- LP_UNKNOWN, +- LP_VIF, +- LP_PATCH, +- LP_L3GATEWAY, +- LP_LOCALNET, +- LP_LOCALPORT, +- LP_L2GATEWAY, +- LP_VTEP, +- LP_CHASSISREDIRECT, +- LP_VIRTUAL, +- LP_EXTERNAL, +- LP_REMOTE +-}; +- + static enum en_lport_type + get_lport_type(const struct sbrec_port_binding *pb) + { + if (is_lport_vif(pb)) { ++ if (pb->parent_port && pb->parent_port[0]) { ++ return LP_CONTAINER; ++ } + return LP_VIF; + } else if (!strcmp(pb->type, "patch")) { + return LP_PATCH; +@@ -864,6 +908,41 @@ get_lport_type(const struct sbrec_port_binding *pb) + return LP_UNKNOWN; + } + ++static char * ++get_lport_type_str(enum en_lport_type lport_type) ++{ ++ switch (lport_type) { ++ case LP_VIF: ++ return "VIF"; ++ case LP_CONTAINER: ++ return "CONTAINER"; ++ case LP_VIRTUAL: ++ return "VIRTUAL"; ++ case LP_PATCH: ++ return "PATCH"; ++ case LP_CHASSISREDIRECT: ++ return "CHASSISREDIRECT"; ++ case LP_L3GATEWAY: ++ return "L3GATEWAY"; ++ case LP_LOCALNET: ++ return "LOCALNET"; ++ case LP_LOCALPORT: ++ return "LOCALPORT"; ++ case LP_L2GATEWAY: ++ return "L2GATEWAY"; ++ case LP_EXTERNAL: ++ return "EXTERNAL"; ++ case LP_REMOTE: ++ return "REMOTE"; ++ case LP_VTEP: ++ return "VTEP"; ++ case LP_UNKNOWN: ++ return "UNKNOWN"; ++ } ++ ++ OVS_NOT_REACHED(); ++} ++ + /* For newly claimed ports, if 'notify_up' is 'false': + * - set the 'pb.up' field to true if 'pb' has no 'parent_pb'.
+ * - set the 'pb.up' field to true if 'parent_pb.up' is 'true' (e.g., for +@@ -991,14 +1070,15 @@ release_lport(const struct sbrec_port_binding *pb, bool sb_readonly, + static bool + is_lbinding_set(struct local_binding *lbinding) + { +- return lbinding && lbinding->pb && lbinding->iface; ++ return lbinding && lbinding->iface; + } + + static bool +-is_lbinding_this_chassis(struct local_binding *lbinding, +- const struct sbrec_chassis *chassis) ++is_binding_lport_this_chassis(struct binding_lport *b_lport, ++ const struct sbrec_chassis *chassis) + { +- return lbinding && lbinding->pb && lbinding->pb->chassis == chassis; ++ return (b_lport && b_lport->pb && chassis && ++ b_lport->pb->chassis == chassis); + } + + static bool +@@ -1010,15 +1090,14 @@ can_bind_on_this_chassis(const struct sbrec_chassis *chassis_rec, + || !strcmp(requested_chassis, chassis_rec->hostname); + } + +-/* Returns 'true' if the 'lbinding' has children of type BT_CONTAINER, ++/* Returns 'true' if the 'lbinding' has binding lports of type LP_CONTAINER, + * 'false' otherwise. 
*/ + static bool + is_lbinding_container_parent(struct local_binding *lbinding) + { +- struct shash_node *node; +- SHASH_FOR_EACH (node, &lbinding->children) { +- struct local_binding *l = node->data; +- if (l->type == BT_CONTAINER) { ++ struct binding_lport *b_lport; ++ LIST_FOR_EACH (b_lport, list_node, &lbinding->binding_lports) { ++ if (b_lport->type == LP_CONTAINER) { + return true; + } + } +@@ -1027,66 +1106,41 @@ is_lbinding_container_parent(struct local_binding *lbinding) + } + + static bool +-release_local_binding_children(const struct sbrec_chassis *chassis_rec, +- struct local_binding *lbinding, +- bool sb_readonly, +- struct hmap *tracked_dp_bindings) +-{ +- struct shash_node *node; +- SHASH_FOR_EACH (node, &lbinding->children) { +- struct local_binding *l = node->data; +- if (is_lbinding_this_chassis(l, chassis_rec)) { +- if (!release_lport(l->pb, sb_readonly, tracked_dp_bindings)) { +- return false; +- } ++release_binding_lport(const struct sbrec_chassis *chassis_rec, ++ struct binding_lport *b_lport, bool sb_readonly, ++ struct binding_ctx_out *b_ctx_out) ++{ ++ if (is_binding_lport_this_chassis(b_lport, chassis_rec)) { ++ remove_local_lport_ids(b_lport->pb, b_ctx_out); ++ if (!release_lport(b_lport->pb, sb_readonly, ++ b_ctx_out->tracked_dp_bindings)) { ++ return false; + } +- +- /* Clear the local bindings' 'iface'. 
*/ +- l->iface = NULL; + } + + return true; + } + +-static bool +-release_local_binding(const struct sbrec_chassis *chassis_rec, +- struct local_binding *lbinding, bool sb_readonly, +- struct hmap *tracked_dp_bindings) +-{ +- if (!release_local_binding_children(chassis_rec, lbinding, +- sb_readonly, tracked_dp_bindings)) { +- return false; +- } +- +- bool retval = true; +- if (is_lbinding_this_chassis(lbinding, chassis_rec)) { +- retval = release_lport(lbinding->pb, sb_readonly, tracked_dp_bindings); +- } +- +- lbinding->pb = NULL; +- lbinding->iface = NULL; +- return retval; +-} +- + static bool + consider_vif_lport_(const struct sbrec_port_binding *pb, + bool can_bind, const char *vif_chassis, + struct binding_ctx_in *b_ctx_in, + struct binding_ctx_out *b_ctx_out, +- struct local_binding *lbinding, ++ struct binding_lport *b_lport, + struct hmap *qos_map) + { +- bool lbinding_set = is_lbinding_set(lbinding); ++ bool lbinding_set = b_lport && is_lbinding_set(b_lport->lbinding); ++ + if (lbinding_set) { + if (can_bind) { + /* We can claim the lport. */ + const struct sbrec_port_binding *parent_pb = +- lbinding->parent ? 
lbinding->parent->pb : NULL; ++ binding_lport_get_parent_pb(b_lport); + + if (!claim_lport(pb, parent_pb, b_ctx_in->chassis_rec, +- lbinding->iface, !b_ctx_in->ovnsb_idl_txn, +- !lbinding->parent, +- b_ctx_out->tracked_dp_bindings)){ ++ b_lport->lbinding->iface, ++ !b_ctx_in->ovnsb_idl_txn, ++ !parent_pb, b_ctx_out->tracked_dp_bindings)){ + return false; + } + +@@ -1098,7 +1152,7 @@ consider_vif_lport_(const struct sbrec_port_binding *pb, + b_ctx_out->tracked_dp_bindings); + update_local_lport_ids(pb, b_ctx_out); + update_local_lports(pb->logical_port, b_ctx_out); +- if (lbinding->iface && qos_map && b_ctx_in->ovs_idl_txn) { ++ if (b_lport->lbinding->iface && qos_map && b_ctx_in->ovs_idl_txn) { + get_qos_params(pb, qos_map); + } + } else { +@@ -1136,16 +1190,19 @@ consider_vif_lport(const struct sbrec_port_binding *pb, + vif_chassis); + + if (!lbinding) { +- lbinding = local_binding_find(b_ctx_out->local_bindings, ++ lbinding = local_binding_find(&b_ctx_out->lbinding_data->bindings, + pb->logical_port); + } + ++ struct binding_lport *b_lport = NULL; + if (lbinding) { +- lbinding->pb = pb; ++ struct shash *binding_lports = ++ &b_ctx_out->lbinding_data->lports; ++ b_lport = local_binding_add_lport(binding_lports, lbinding, pb, LP_VIF); + } + + return consider_vif_lport_(pb, can_bind, vif_chassis, b_ctx_in, +- b_ctx_out, lbinding, qos_map); ++ b_ctx_out, b_lport, qos_map); + } + + static bool +@@ -1154,9 +1211,9 @@ consider_container_lport(const struct sbrec_port_binding *pb, + struct binding_ctx_out *b_ctx_out, + struct hmap *qos_map) + { ++ struct shash *local_bindings = &b_ctx_out->lbinding_data->bindings; + struct local_binding *parent_lbinding; +- parent_lbinding = local_binding_find(b_ctx_out->local_bindings, +- pb->parent_port); ++ parent_lbinding = local_binding_find(local_bindings, pb->parent_port); + + if (!parent_lbinding) { + /* There is no local_binding for parent port. 
Create it +@@ -1171,54 +1228,61 @@ consider_container_lport(const struct sbrec_port_binding *pb, + * we want the these container ports also be claimed by the + * chassis. + * */ +- parent_lbinding = local_binding_create(pb->parent_port, NULL, NULL, +- BT_VIF); +- local_binding_add(b_ctx_out->local_bindings, parent_lbinding); ++ parent_lbinding = local_binding_create(pb->parent_port, NULL); ++ local_binding_add(local_bindings, parent_lbinding); + } + +- struct local_binding *container_lbinding = +- local_binding_find_child(parent_lbinding, pb->logical_port); ++ struct shash *binding_lports = &b_ctx_out->lbinding_data->lports; ++ struct binding_lport *container_b_lport = ++ local_binding_add_lport(binding_lports, parent_lbinding, pb, ++ LP_CONTAINER); + +- if (!container_lbinding) { +- container_lbinding = local_binding_create(pb->logical_port, +- parent_lbinding->iface, +- pb, BT_CONTAINER); +- local_binding_add_child(parent_lbinding, container_lbinding); +- } else { +- ovs_assert(container_lbinding->type == BT_CONTAINER); +- container_lbinding->pb = pb; +- container_lbinding->iface = parent_lbinding->iface; +- } ++ struct binding_lport *parent_b_lport = ++ binding_lport_find(binding_lports, pb->parent_port); + +- if (!parent_lbinding->pb) { +- parent_lbinding->pb = lport_lookup_by_name( ++ bool can_consider_c_lport = true; ++ if (!parent_b_lport || !parent_b_lport->pb) { ++ const struct sbrec_port_binding *parent_pb = lport_lookup_by_name( + b_ctx_in->sbrec_port_binding_by_name, pb->parent_port); + +- if (parent_lbinding->pb) { ++ if (parent_pb && get_lport_type(parent_pb) == LP_VIF) { + /* Its possible that the parent lport is not considered yet. + * So call consider_vif_lport() to process it first. 
*/ +- consider_vif_lport(parent_lbinding->pb, b_ctx_in, b_ctx_out, ++ consider_vif_lport(parent_pb, b_ctx_in, b_ctx_out, + parent_lbinding, qos_map); ++ parent_b_lport = binding_lport_find(binding_lports, ++ pb->parent_port); + } else { +- /* The parent lport doesn't exist. Call release_lport() to +- * release the container lport, if it was bound earlier. */ +- if (is_lbinding_this_chassis(container_lbinding, +- b_ctx_in->chassis_rec)) { +- return release_lport(pb, !b_ctx_in->ovnsb_idl_txn, +- b_ctx_out->tracked_dp_bindings); +- } ++ /* The parent lport doesn't exist. Cannot consider the container ++ * lport for binding. */ ++ can_consider_c_lport = false; ++ } ++ } + +- return true; ++ if (parent_b_lport && parent_b_lport->type != LP_VIF) { ++ can_consider_c_lport = false; ++ } ++ ++ if (!can_consider_c_lport) { ++ /* Call release_lport() to release the container lport, ++ * if it was bound earlier. */ ++ if (is_binding_lport_this_chassis(container_b_lport, ++ b_ctx_in->chassis_rec)) { ++ return release_lport(pb, !b_ctx_in->ovnsb_idl_txn, ++ b_ctx_out->tracked_dp_bindings); + } ++ ++ return true; + } + +- const char *vif_chassis = smap_get(&parent_lbinding->pb->options, ++ ovs_assert(parent_b_lport && parent_b_lport->pb); ++ const char *vif_chassis = smap_get(&parent_b_lport->pb->options, + "requested-chassis"); + bool can_bind = can_bind_on_this_chassis(b_ctx_in->chassis_rec, + vif_chassis); + + return consider_vif_lport_(pb, can_bind, vif_chassis, b_ctx_in, b_ctx_out, +- container_lbinding, qos_map); ++ container_b_lport, qos_map); + } + + static bool +@@ -1227,46 +1291,58 @@ consider_virtual_lport(const struct sbrec_port_binding *pb, + struct binding_ctx_out *b_ctx_out, + struct hmap *qos_map) + { +- struct local_binding * parent_lbinding = +- pb->virtual_parent ? local_binding_find(b_ctx_out->local_bindings, ++ struct shash *local_bindings = &b_ctx_out->lbinding_data->bindings; ++ struct local_binding *parent_lbinding = ++ pb->virtual_parent ? 
local_binding_find(local_bindings, + pb->virtual_parent) + : NULL; + +- if (parent_lbinding && !parent_lbinding->pb) { +- parent_lbinding->pb = lport_lookup_by_name( +- b_ctx_in->sbrec_port_binding_by_name, pb->virtual_parent); +- +- if (parent_lbinding->pb) { +- /* Its possible that the parent lport is not considered yet. +- * So call consider_vif_lport() to process it first. */ +- consider_vif_lport(parent_lbinding->pb, b_ctx_in, b_ctx_out, +- parent_lbinding, qos_map); +- } +- } +- ++ struct binding_lport *virtual_b_lport = NULL; + /* Unlike container lports, we don't have to create parent_lbinding if + * it is NULL. This is because, if parent_lbinding is not present, it + * means the virtual port can't bind in this chassis. + * Note: pinctrl module binds the virtual lport when it sees ARP + * packet from the parent lport. */ +- struct local_binding *virtual_lbinding = NULL; +- if (is_lbinding_this_chassis(parent_lbinding, b_ctx_in->chassis_rec)) { +- virtual_lbinding = +- local_binding_find_child(parent_lbinding, pb->logical_port); +- if (!virtual_lbinding) { +- virtual_lbinding = local_binding_create(pb->logical_port, +- parent_lbinding->iface, +- pb, BT_VIRTUAL); +- local_binding_add_child(parent_lbinding, virtual_lbinding); +- } else { +- ovs_assert(virtual_lbinding->type == BT_VIRTUAL); +- virtual_lbinding->pb = pb; +- virtual_lbinding->iface = parent_lbinding->iface; ++ if (parent_lbinding) { ++ struct shash *binding_lports = &b_ctx_out->lbinding_data->lports; ++ ++ struct binding_lport *parent_b_lport = ++ binding_lport_find(binding_lports, pb->virtual_parent); ++ ++ if (!parent_b_lport || !parent_b_lport->pb) { ++ const struct sbrec_port_binding *parent_pb = lport_lookup_by_name( ++ b_ctx_in->sbrec_port_binding_by_name, pb->virtual_parent); ++ ++ if (parent_pb && get_lport_type(parent_pb) == LP_VIF) { ++ /* Its possible that the parent lport is not considered yet. ++ * So call consider_vif_lport() to process it first. 
*/ ++ consider_vif_lport(parent_pb, b_ctx_in, b_ctx_out, ++ parent_lbinding, qos_map); ++ } ++ } ++ ++ parent_b_lport = local_binding_get_primary_lport(parent_lbinding); ++ if (is_binding_lport_this_chassis(parent_b_lport, ++ b_ctx_in->chassis_rec)) { ++ virtual_b_lport = ++ local_binding_add_lport(binding_lports, parent_lbinding, pb, ++ LP_VIRTUAL); + } + } + +- return consider_vif_lport_(pb, true, NULL, b_ctx_in, b_ctx_out, +- virtual_lbinding, qos_map); ++ if (!consider_vif_lport_(pb, true, NULL, b_ctx_in, b_ctx_out, ++ virtual_b_lport, qos_map)) { ++ return false; ++ } ++ ++ /* If the virtual lport is not bound to this chassis, then remove ++ * its entry from the local_lport_ids if present. This is required ++ * when a virtual port moves from one chassis to other.*/ ++ if (!virtual_b_lport) { ++ remove_local_lport_ids(pb, b_ctx_out); ++ } ++ ++ return true; + } + + /* Considers either claiming the lport or releasing the lport +@@ -1407,6 +1483,8 @@ build_local_bindings(struct binding_ctx_in *b_ctx_in, + continue; + } + ++ struct shash *local_bindings = ++ &b_ctx_out->lbinding_data->bindings; + for (j = 0; j < port_rec->n_interfaces; j++) { + const struct ovsrec_interface *iface_rec; + +@@ -1416,11 +1494,10 @@ build_local_bindings(struct binding_ctx_in *b_ctx_in, + + if (iface_id && ofport > 0) { + struct local_binding *lbinding = +- local_binding_find(b_ctx_out->local_bindings, iface_id); ++ local_binding_find(local_bindings, iface_id); + if (!lbinding) { +- lbinding = local_binding_create(iface_id, iface_rec, NULL, +- BT_VIF); +- local_binding_add(b_ctx_out->local_bindings, lbinding); ++ lbinding = local_binding_create(iface_id, iface_rec); ++ local_binding_add(local_bindings, lbinding); + } else { + static struct vlog_rate_limit rl = + VLOG_RATE_LIMIT_INIT(1, 5); +@@ -1431,7 +1508,6 @@ build_local_bindings(struct binding_ctx_in *b_ctx_in, + "configuration on interface [%s]", + lbinding->iface->name, iface_rec->name, + iface_rec->name); +- 
ovs_assert(lbinding->type == BT_VIF); + } + + update_local_lports(iface_id, b_ctx_out); +@@ -1494,11 +1570,11 @@ binding_run(struct binding_ctx_in *b_ctx_in, struct binding_ctx_out *b_ctx_out) + break; + + case LP_VIF: +- if (is_lport_container(pb)) { +- consider_container_lport(pb, b_ctx_in, b_ctx_out, qos_map_ptr); +- } else { +- consider_vif_lport(pb, b_ctx_in, b_ctx_out, NULL, qos_map_ptr); +- } ++ consider_vif_lport(pb, b_ctx_in, b_ctx_out, NULL, qos_map_ptr); ++ break; ++ ++ case LP_CONTAINER: ++ consider_container_lport(pb, b_ctx_in, b_ctx_out, qos_map_ptr); + break; + + case LP_VIRTUAL: +@@ -1799,39 +1875,44 @@ consider_iface_claim(const struct ovsrec_interface *iface_rec, + update_local_lports(iface_id, b_ctx_out); + smap_replace(b_ctx_out->local_iface_ids, iface_rec->name, iface_id); + +- struct local_binding *lbinding = +- local_binding_find(b_ctx_out->local_bindings, iface_id); ++ struct shash *local_bindings = &b_ctx_out->lbinding_data->bindings; ++ struct shash *binding_lports = &b_ctx_out->lbinding_data->lports; ++ struct local_binding *lbinding = local_binding_find(local_bindings, ++ iface_id); + + if (!lbinding) { +- lbinding = local_binding_create(iface_id, iface_rec, NULL, BT_VIF); +- local_binding_add(b_ctx_out->local_bindings, lbinding); ++ lbinding = local_binding_create(iface_id, iface_rec); ++ local_binding_add(local_bindings, lbinding); + } else { + lbinding->iface = iface_rec; + } + +- if (!lbinding->pb || strcmp(lbinding->name, lbinding->pb->logical_port)) { +- lbinding->pb = lport_lookup_by_name( +- b_ctx_in->sbrec_port_binding_by_name, lbinding->name); +- if (lbinding->pb && !strcmp(lbinding->pb->type, "virtual")) { +- lbinding->pb = NULL; ++ struct binding_lport *b_lport = local_binding_get_primary_lport(lbinding); ++ const struct sbrec_port_binding *pb = NULL; ++ if (!b_lport) { ++ pb = lport_lookup_by_name(b_ctx_in->sbrec_port_binding_by_name, ++ lbinding->name); ++ if (pb && get_lport_type(pb) == LP_VIF) { ++ b_lport = 
local_binding_add_lport(binding_lports, lbinding, pb, ++ LP_VIF); + } + } + +- if (lbinding->pb) { +- if (!consider_vif_lport(lbinding->pb, b_ctx_in, b_ctx_out, +- lbinding, qos_map)) { +- return false; +- } ++ if (!b_lport) { ++ /* There is no binding lport for this local binding. */ ++ return true; ++ } ++ ++ if (!consider_vif_lport(b_lport->pb, b_ctx_in, b_ctx_out, ++ lbinding, qos_map)) { ++ return false; + } + + /* Update the child local_binding's iface (if any children) and try to + * claim the container lbindings. */ +- struct shash_node *node; +- SHASH_FOR_EACH (node, &lbinding->children) { +- struct local_binding *child = node->data; +- child->iface = iface_rec; +- if (child->type == BT_CONTAINER) { +- if (!consider_container_lport(child->pb, b_ctx_in, b_ctx_out, ++ LIST_FOR_EACH (b_lport, list_node, &lbinding->binding_lports) { ++ if (b_lport->type == LP_CONTAINER) { ++ if (!consider_container_lport(b_lport->pb, b_ctx_in, b_ctx_out, + qos_map)) { + return false; + } +@@ -1862,32 +1943,42 @@ consider_iface_release(const struct ovsrec_interface *iface_rec, + struct binding_ctx_out *b_ctx_out) + { + struct local_binding *lbinding; +- lbinding = local_binding_find(b_ctx_out->local_bindings, +- iface_id); +- if (is_lbinding_this_chassis(lbinding, b_ctx_in->chassis_rec)) { ++ struct shash *local_bindings = &b_ctx_out->lbinding_data->bindings; ++ struct shash *binding_lports = &b_ctx_out->lbinding_data->lports; ++ ++ lbinding = local_binding_find(local_bindings, iface_id); ++ struct binding_lport *b_lport = local_binding_get_primary_lport(lbinding); ++ if (is_binding_lport_this_chassis(b_lport, b_ctx_in->chassis_rec)) { + struct local_datapath *ld = + get_local_datapath(b_ctx_out->local_datapaths, +- lbinding->pb->datapath->tunnel_key); ++ b_lport->pb->datapath->tunnel_key); + if (ld) { +- remove_pb_from_local_datapath(lbinding->pb, +- b_ctx_in->chassis_rec, +- b_ctx_out, ld); ++ remove_pb_from_local_datapath(b_lport->pb, ++ b_ctx_in->chassis_rec, ++ b_ctx_out, 
ld); + } + +- /* Note: release_local_binding() resets lbinding->pb and +- * lbinding->iface. +- * Cannot access these members of lbinding after this call. */ +- if (!release_local_binding(b_ctx_in->chassis_rec, lbinding, +- !b_ctx_in->ovnsb_idl_txn, +- b_ctx_out->tracked_dp_bindings)) { +- return false; ++ /* Release the primary binding lport and other children lports if ++ * any. */ ++ LIST_FOR_EACH (b_lport, list_node, &lbinding->binding_lports) { ++ if (!release_binding_lport(b_ctx_in->chassis_rec, b_lport, ++ !b_ctx_in->ovnsb_idl_txn, ++ b_ctx_out)) { ++ return false; ++ } + } ++ ++ } ++ ++ if (lbinding) { ++ /* Clear the iface of the local binding. */ ++ lbinding->iface = NULL; + } + + /* Check if the lbinding has children of type PB_CONTAINER. + * If so, don't delete the local_binding. */ + if (lbinding && !is_lbinding_container_parent(lbinding)) { +- local_binding_delete(b_ctx_out->local_bindings, lbinding); ++ local_binding_delete(lbinding, local_bindings, binding_lports); + } + + remove_local_lports(iface_id, b_ctx_out); +@@ -2088,56 +2179,35 @@ handle_deleted_lport(const struct sbrec_port_binding *pb, + } + } + +-static struct local_binding * +-get_lbinding_for_lport(const struct sbrec_port_binding *pb, +- enum en_lport_type lport_type, +- struct binding_ctx_out *b_ctx_out) +-{ +- ovs_assert(lport_type == LP_VIF || lport_type == LP_VIRTUAL); +- +- if (lport_type == LP_VIF && !is_lport_container(pb)) { +- return local_binding_find(b_ctx_out->local_bindings, pb->logical_port); +- } +- +- struct local_binding *parent_lbinding = NULL; +- +- if (lport_type == LP_VIRTUAL) { +- if (pb->virtual_parent) { +- parent_lbinding = local_binding_find(b_ctx_out->local_bindings, +- pb->virtual_parent); +- } +- } else { +- if (pb->parent_port) { +- parent_lbinding = local_binding_find(b_ctx_out->local_bindings, +- pb->parent_port); +- } +- } +- +- return parent_lbinding +- ? 
local_binding_find(&parent_lbinding->children, pb->logical_port) +- : NULL; +-} +- + static bool + handle_deleted_vif_lport(const struct sbrec_port_binding *pb, + enum en_lport_type lport_type, + struct binding_ctx_in *b_ctx_in, + struct binding_ctx_out *b_ctx_out) + { +- struct local_binding *lbinding = +- get_lbinding_for_lport(pb, lport_type, b_ctx_out); ++ struct local_binding *lbinding = NULL; ++ bool bound = false; + +- if (lbinding) { +- lbinding->pb = NULL; +- /* The port_binding 'pb' is deleted. So there is no need to +- * clear the 'chassis' column of 'pb'. But we need to do +- * for the local_binding's children. */ +- if (lbinding->type == BT_VIF && +- !release_local_binding_children( +- b_ctx_in->chassis_rec, lbinding, +- !b_ctx_in->ovnsb_idl_txn, +- b_ctx_out->tracked_dp_bindings)) { +- return false; ++ struct shash *binding_lports = &b_ctx_out->lbinding_data->lports; ++ struct binding_lport *b_lport = binding_lport_find(binding_lports, pb->logical_port); ++ if (b_lport) { ++ lbinding = b_lport->lbinding; ++ bound = is_binding_lport_this_chassis(b_lport, b_ctx_in->chassis_rec); ++ ++ /* Remove b_lport from local_binding. */ ++ binding_lport_delete(binding_lports, b_lport); ++ } ++ ++ if (bound && lbinding && lport_type == LP_VIF) { ++ /* We need to release the container/virtual binding lports (if any) if ++ * deleted 'pb' type is LP_VIF. */ ++ struct binding_lport *c_lport; ++ LIST_FOR_EACH (c_lport, list_node, &lbinding->binding_lports) { ++ if (!release_binding_lport(b_ctx_in->chassis_rec, c_lport, ++ !b_ctx_in->ovnsb_idl_txn, ++ b_ctx_out)) { ++ return false; ++ } + } + } + +@@ -2147,18 +2217,8 @@ handle_deleted_vif_lport(const struct sbrec_port_binding *pb, + * it from local_lports if there is a VIF entry. + * consider_iface_release() takes care of removing from the local_lports + * when the interface change happens. 
*/ +- if (is_lport_container(pb)) { ++ if (lport_type == LP_CONTAINER) { + remove_local_lports(pb->logical_port, b_ctx_out); +- +- /* If the container port is removed we should also remove it from +- * its parent's children set. +- */ +- if (lbinding) { +- if (lbinding->parent) { +- local_binding_delete_child(lbinding->parent, lbinding); +- } +- local_binding_destroy(lbinding); +- } + } + + handle_deleted_lport(pb, b_ctx_in, b_ctx_out); +@@ -2177,7 +2237,7 @@ handle_updated_vif_lport(const struct sbrec_port_binding *pb, + + if (lport_type == LP_VIRTUAL) { + handled = consider_virtual_lport(pb, b_ctx_in, b_ctx_out, qos_map); +- } else if (lport_type == LP_VIF && is_lport_container(pb)) { ++ } else if (lport_type == LP_CONTAINER) { + handled = consider_container_lport(pb, b_ctx_in, b_ctx_out, qos_map); + } else { + handled = consider_vif_lport(pb, b_ctx_in, b_ctx_out, NULL, qos_map); +@@ -2189,14 +2249,14 @@ handle_updated_vif_lport(const struct sbrec_port_binding *pb, + + bool now_claimed = (pb->chassis == b_ctx_in->chassis_rec); + +- if (lport_type == LP_VIRTUAL || +- (lport_type == LP_VIF && is_lport_container(pb)) || ++ if (lport_type == LP_VIRTUAL || lport_type == LP_CONTAINER || + claimed == now_claimed) { + return true; + } + +- struct local_binding *lbinding = +- local_binding_find(b_ctx_out->local_bindings, pb->logical_port); ++ struct shash *local_bindings = &b_ctx_out->lbinding_data->bindings; ++ struct local_binding *lbinding = local_binding_find(local_bindings, ++ pb->logical_port); + + /* If the ovs port backing this binding previously was removed in the + * meantime, we won't have a local_binding for it. 
+@@ -2206,12 +2266,11 @@ handle_updated_vif_lport(const struct sbrec_port_binding *pb, + return true; + } + +- struct shash_node *node; +- SHASH_FOR_EACH (node, &lbinding->children) { +- struct local_binding *child = node->data; +- if (child->type == BT_CONTAINER) { +- handled = consider_container_lport(child->pb, b_ctx_in, b_ctx_out, +- qos_map); ++ struct binding_lport *b_lport; ++ LIST_FOR_EACH (b_lport, list_node, &lbinding->binding_lports) { ++ if (b_lport->type == LP_CONTAINER) { ++ handled = consider_container_lport(b_lport->pb, b_ctx_in, ++ b_ctx_out, qos_map); + if (!handled) { + return false; + } +@@ -2256,12 +2315,25 @@ binding_handle_port_binding_changes(struct binding_ctx_in *b_ctx_in, + + enum en_lport_type lport_type = get_lport_type(pb); + +- if (lport_type == LP_VIF) { +- if (is_lport_container(pb)) { +- shash_add(&deleted_container_pbs, pb->logical_port, pb); +- } else { +- shash_add(&deleted_vif_pbs, pb->logical_port, pb); ++ struct binding_lport *b_lport = ++ binding_lport_find(&b_ctx_out->lbinding_data->lports, ++ pb->logical_port); ++ if (b_lport) { ++ /* If the 'b_lport->type' and 'lport_type' don't match, then update ++ * the b_lport->type to the updated 'lport_type'. The function ++ * binding_lport_check_and_cleanup() will cleanup the 'b_lport' ++ * if required. 
*/ ++ if (b_lport->type != lport_type) { ++ b_lport->type = lport_type; + } ++ b_lport = binding_lport_check_and_cleanup( ++ b_lport, &b_ctx_out->lbinding_data->lports); ++ } ++ ++ if (lport_type == LP_VIF) { ++ shash_add(&deleted_vif_pbs, pb->logical_port, pb); ++ } else if (lport_type == LP_CONTAINER) { ++ shash_add(&deleted_container_pbs, pb->logical_port, pb); + } else if (lport_type == LP_VIRTUAL) { + shash_add(&deleted_virtual_pbs, pb->logical_port, pb); + } else { +@@ -2272,7 +2344,7 @@ binding_handle_port_binding_changes(struct binding_ctx_in *b_ctx_in, + struct shash_node *node; + struct shash_node *node_next; + SHASH_FOR_EACH_SAFE (node, node_next, &deleted_container_pbs) { +- handled = handle_deleted_vif_lport(node->data, LP_VIF, b_ctx_in, ++ handled = handle_deleted_vif_lport(node->data, LP_CONTAINER, b_ctx_in, + b_ctx_out); + shash_delete(&deleted_container_pbs, node); + if (!handled) { +@@ -2326,12 +2398,33 @@ delete_done: + + enum en_lport_type lport_type = get_lport_type(pb); + ++ struct binding_lport *b_lport = ++ binding_lport_find(&b_ctx_out->lbinding_data->lports, ++ pb->logical_port); ++ if (b_lport) { ++ ovs_assert(b_lport->pb == pb); ++ ++ if (b_lport->type != lport_type) { ++ b_lport->type = lport_type; ++ } ++ ++ if (b_lport->lbinding) { ++ handled = local_binding_handle_stale_binding_lports( ++ b_lport->lbinding, b_ctx_in, b_ctx_out, qos_map_ptr); ++ if (!handled) { ++ /* Backout from the handling. */ ++ break; ++ } ++ } ++ } ++ + struct local_datapath *ld = + get_local_datapath(b_ctx_out->local_datapaths, + pb->datapath->tunnel_key); + + switch (lport_type) { + case LP_VIF: ++ case LP_CONTAINER: + case LP_VIRTUAL: + handled = handle_updated_vif_lport(pb, lport_type, b_ctx_in, + b_ctx_out, qos_map_ptr); +@@ -2468,11 +2561,11 @@ binding_init(void) + * available. 
+ */ + void +-binding_seqno_run(struct shash *local_bindings) ++binding_seqno_run(struct local_binding_data *lbinding_data) + { + const char *iface_id; + const char *iface_id_next; +- ++ struct shash *local_bindings = &lbinding_data->bindings; + SSET_FOR_EACH_SAFE (iface_id, iface_id_next, &binding_iface_released_set) { + struct shash_node *lb_node = shash_find(local_bindings, iface_id); + +@@ -2508,16 +2601,17 @@ binding_seqno_run(struct shash *local_bindings) + * If so, then this is a newly bound interface, make sure we reset the + * Port_Binding 'up' field and the OVS Interface 'external-id'. + */ +- if (lb && lb->pb && lb->iface) { ++ struct binding_lport *b_lport = local_binding_get_primary_lport(lb); ++ if (lb && b_lport && lb->iface) { + new_ifaces = true; + + if (smap_get(&lb->iface->external_ids, OVN_INSTALLED_EXT_ID)) { + ovsrec_interface_update_external_ids_delkey( + lb->iface, OVN_INSTALLED_EXT_ID); + } +- if (lb->pb->n_up) { ++ if (b_lport->pb->n_up) { + bool up = false; +- sbrec_port_binding_set_up(lb->pb, &up, 1); ++ sbrec_port_binding_set_up(b_lport->pb, &up, 1); + } + simap_put(&binding_iface_seqno_map, lb->name, new_seqno); + } +@@ -2542,12 +2636,13 @@ binding_seqno_run(struct shash *local_bindings) + * available. 
+ */ + void +-binding_seqno_install(struct shash *local_bindings) ++binding_seqno_install(struct local_binding_data *lbinding_data) + { + struct ofctrl_acked_seqnos *acked_seqnos = + ofctrl_acked_seqnos_get(binding_seq_type_pb_cfg); + struct simap_node *node; + struct simap_node *node_next; ++ struct shash *local_bindings = &lbinding_data->bindings; + + SIMAP_FOR_EACH_SAFE (node, node_next, &binding_iface_seqno_map) { + struct shash_node *lb_node = shash_find(local_bindings, node->name); +@@ -2557,7 +2652,8 @@ binding_seqno_install(struct shash *local_bindings) + } + + struct local_binding *lb = lb_node->data; +- if (!lb->pb || !lb->iface) { ++ struct binding_lport *b_lport = local_binding_get_primary_lport(lb); ++ if (!b_lport || !lb->iface) { + goto del_seqno; + } + +@@ -2568,14 +2664,12 @@ binding_seqno_install(struct shash *local_bindings) + ovsrec_interface_update_external_ids_setkey(lb->iface, + OVN_INSTALLED_EXT_ID, + "true"); +- if (lb->pb->n_up) { ++ if (b_lport->pb->n_up) { + bool up = true; + +- sbrec_port_binding_set_up(lb->pb, &up, 1); +- struct shash_node *child_node; +- SHASH_FOR_EACH (child_node, &lb->children) { +- struct local_binding *lb_child = child_node->data; +- sbrec_port_binding_set_up(lb_child->pb, &up, 1); ++ sbrec_port_binding_set_up(b_lport->pb, &up, 1); ++ LIST_FOR_EACH (b_lport, list_node, &lb->binding_lports) { ++ sbrec_port_binding_set_up(b_lport->pb, &up, 1); + } + } + +@@ -2591,3 +2685,305 @@ binding_seqno_flush(void) + { + simap_clear(&binding_iface_seqno_map); + } ++ ++/* Static functions for local_lbindind and binding_lport. 
*/ ++static struct local_binding * ++local_binding_create(const char *name, const struct ovsrec_interface *iface) ++{ ++ struct local_binding *lbinding = xzalloc(sizeof *lbinding); ++ lbinding->name = xstrdup(name); ++ lbinding->iface = iface; ++ ovs_list_init(&lbinding->binding_lports); ++ ++ return lbinding; ++} ++ ++static struct local_binding * ++local_binding_find(struct shash *local_bindings, const char *name) ++{ ++ return shash_find_data(local_bindings, name); ++} ++ ++static void ++local_binding_add(struct shash *local_bindings, struct local_binding *lbinding) ++{ ++ shash_add(local_bindings, lbinding->name, lbinding); ++} ++ ++static void ++local_binding_destroy(struct local_binding *lbinding, ++ struct shash *binding_lports) ++{ ++ struct binding_lport *b_lport; ++ LIST_FOR_EACH_POP (b_lport, list_node, &lbinding->binding_lports) { ++ b_lport->lbinding = NULL; ++ binding_lport_delete(binding_lports, b_lport); ++ } ++ ++ free(lbinding->name); ++ free(lbinding); ++} ++ ++static void ++local_binding_delete(struct local_binding *lbinding, ++ struct shash *local_bindings, ++ struct shash *binding_lports) ++{ ++ shash_find_and_delete(local_bindings, lbinding->name); ++ local_binding_destroy(lbinding, binding_lports); ++} ++ ++/* Returns the primary binding lport if present in lbinding's ++ * binding lports list. A binding lport is considered primary ++ * if binding lport's type is LP_VIF and the name matches ++ * with the 'lbinding'. 
++ */ ++static struct binding_lport * ++local_binding_get_primary_lport(struct local_binding *lbinding) ++{ ++ if (!lbinding) { ++ return NULL; ++ } ++ ++ if (!ovs_list_is_empty(&lbinding->binding_lports)) { ++ struct binding_lport *b_lport = NULL; ++ b_lport = CONTAINER_OF(ovs_list_front(&lbinding->binding_lports), ++ struct binding_lport, list_node); ++ ++ if (b_lport->type == LP_VIF && ++ !strcmp(lbinding->name, b_lport->name)) { ++ return b_lport; ++ } ++ } ++ ++ return NULL; ++} ++ ++static struct binding_lport * ++local_binding_add_lport(struct shash *binding_lports, ++ struct local_binding *lbinding, ++ const struct sbrec_port_binding *pb, ++ enum en_lport_type b_type) ++{ ++ struct binding_lport *b_lport = ++ binding_lport_find(binding_lports, pb->logical_port); ++ bool add_to_lport_list = false; ++ if (!b_lport) { ++ b_lport = binding_lport_create(pb, lbinding, b_type); ++ binding_lport_add(binding_lports, b_lport); ++ add_to_lport_list = true; ++ } else if (b_lport->lbinding != lbinding) { ++ add_to_lport_list = true; ++ if (!ovs_list_is_empty(&b_lport->list_node)) { ++ ovs_list_remove(&b_lport->list_node); ++ } ++ b_lport->lbinding = lbinding; ++ b_lport->type = b_type; ++ } ++ ++ if (add_to_lport_list) { ++ if (b_type == LP_VIF) { ++ ovs_list_push_front(&lbinding->binding_lports, &b_lport->list_node); ++ } else { ++ ovs_list_push_back(&lbinding->binding_lports, &b_lport->list_node); ++ } ++ } ++ ++ return b_lport; ++} ++ ++/* This function handles the stale binding lports of 'lbinding' if 'lbinding' ++ * doesn't have a primary binding lport. ++ */ ++static bool ++local_binding_handle_stale_binding_lports(struct local_binding *lbinding, ++ struct binding_ctx_in *b_ctx_in, ++ struct binding_ctx_out *b_ctx_out, ++ struct hmap *qos_map) ++{ ++ /* Check if this lbinding has a primary binding_lport or not. */ ++ struct binding_lport *p_lport = local_binding_get_primary_lport(lbinding); ++ if (p_lport) { ++ /* Nothing to be done. 
*/ ++ return true; ++ } ++ ++ bool handled = true; ++ struct binding_lport *b_lport, *next; ++ const struct sbrec_port_binding *pb; ++ LIST_FOR_EACH_SAFE (b_lport, next, list_node, &lbinding->binding_lports) { ++ /* Get the lport type again from the pb. Its possible that the ++ * pb type has changed. */ ++ enum en_lport_type pb_lport_type = get_lport_type(b_lport->pb); ++ if (b_lport->type == LP_VIRTUAL && pb_lport_type == LP_VIRTUAL) { ++ pb = b_lport->pb; ++ binding_lport_delete(&b_ctx_out->lbinding_data->lports, ++ b_lport); ++ handled = consider_virtual_lport(pb, b_ctx_in, b_ctx_out, qos_map); ++ } else if (b_lport->type == LP_CONTAINER && ++ pb_lport_type == LP_CONTAINER) { ++ /* For container lport, binding_lport is preserved so that when ++ * the parent port is created, it can be considered. ++ * consider_container_lport() creates the binding_lport for the parent ++ * port (with iface set to NULL). */ ++ handled = consider_container_lport(b_lport->pb, b_ctx_in, b_ctx_out, qos_map); ++ } else { ++ /* This can happen when the lport type changes from one type ++ * to another. Eg. from normal lport to external. Release the ++ * lport if it was claimed earlier and delete the b_lport. 
*/ ++ handled = release_binding_lport(b_ctx_in->chassis_rec, b_lport, ++ !b_ctx_in->ovnsb_idl_txn, ++ b_ctx_out); ++ binding_lport_delete(&b_ctx_out->lbinding_data->lports, ++ b_lport); ++ } ++ ++ if (!handled) { ++ return false; ++ } ++ } ++ ++ return handled; ++} ++ ++static struct binding_lport * ++binding_lport_create(const struct sbrec_port_binding *pb, ++ struct local_binding *lbinding, ++ enum en_lport_type type) ++{ ++ struct binding_lport *b_lport = xzalloc(sizeof *b_lport); ++ b_lport->name = xstrdup(pb->logical_port); ++ b_lport->pb = pb; ++ b_lport->type = type; ++ b_lport->lbinding = lbinding; ++ ovs_list_init(&b_lport->list_node); ++ ++ return b_lport; ++} ++ ++static void ++binding_lport_add(struct shash *binding_lports, struct binding_lport *b_lport) ++{ ++ shash_add(binding_lports, b_lport->pb->logical_port, b_lport); ++} ++ ++static struct binding_lport * ++binding_lport_find(struct shash *binding_lports, const char *lport_name) ++{ ++ if (!lport_name) { ++ return NULL; ++ } ++ ++ return shash_find_data(binding_lports, lport_name); ++} ++ ++static void ++binding_lport_destroy(struct binding_lport *b_lport) ++{ ++ if (!ovs_list_is_empty(&b_lport->list_node)) { ++ ovs_list_remove(&b_lport->list_node); ++ } ++ ++ free(b_lport->name); ++ free(b_lport); ++} ++ ++static void ++binding_lport_delete(struct shash *binding_lports, ++ struct binding_lport *b_lport) ++{ ++ shash_find_and_delete(binding_lports, b_lport->name); ++ binding_lport_destroy(b_lport); ++} ++ ++ ++static const struct sbrec_port_binding * ++binding_lport_get_parent_pb(struct binding_lport *b_lport) ++{ ++ if (!b_lport) { ++ return NULL; ++ } ++ ++ if (b_lport->type == LP_VIF) { ++ return NULL; ++ } ++ ++ struct local_binding *lbinding = b_lport->lbinding; ++ ovs_assert(lbinding); ++ ++ struct binding_lport *parent_b_lport = ++ local_binding_get_primary_lport(lbinding); ++ ++ return parent_b_lport ? 
parent_b_lport->pb : NULL; ++} ++ ++/* This function checks and cleans up the 'b_lport' if it is ++ * not in the correct state. ++ * ++ * If the 'b_lport' type is LP_VIF, then its name and its lbinding->name ++ * should match. Otherwise this should be cleaned up. ++ * ++ * If the 'b_lport' type is LP_CONTAINER, then its parent_port name should ++ * be the same as its lbinding's name. Otherwise this should be ++ * cleaned up. ++ * ++ * If the 'b_lport' type is LP_VIRTUAL, then its virtual parent name ++ * should be the same as its lbinding's name. Otherwise this ++ * should be cleaned up. ++ * ++ * If the 'b_lport' type is not LP_VIF, LP_CONTAINER or LP_VIRTUAL, it ++ * should be cleaned up. This can happen if the CMS changes ++ * the port binding type. ++ */ ++static struct binding_lport * ++binding_lport_check_and_cleanup(struct binding_lport *b_lport, ++ struct shash *binding_lports) ++{ ++ bool cleanup_blport = false; ++ ++ if (!b_lport->lbinding) { ++ cleanup_blport = true; ++ goto cleanup; ++ } ++ ++ switch (b_lport->type) { ++ case LP_VIF: ++ if (strcmp(b_lport->name, b_lport->lbinding->name)) { ++ cleanup_blport = true; ++ } ++ break; ++ ++ case LP_CONTAINER: ++ if (strcmp(b_lport->pb->parent_port, b_lport->lbinding->name)) { ++ cleanup_blport = true; ++ } ++ break; ++ ++ case LP_VIRTUAL: ++ if (!b_lport->pb->virtual_parent || ++ strcmp(b_lport->pb->virtual_parent, b_lport->lbinding->name)) { ++ cleanup_blport = true; ++ } ++ break; ++ ++ case LP_PATCH: ++ case LP_LOCALPORT: ++ case LP_VTEP: ++ case LP_L2GATEWAY: ++ case LP_L3GATEWAY: ++ case LP_CHASSISREDIRECT: ++ case LP_EXTERNAL: ++ case LP_LOCALNET: ++ case LP_REMOTE: ++ case LP_UNKNOWN: ++ cleanup_blport = true; ++ } ++ ++cleanup: ++ if (cleanup_blport) { ++ binding_lport_delete(binding_lports, b_lport); ++ return NULL; ++ } ++ ++ return b_lport; ++} +diff --git a/controller/binding.h b/controller/binding.h +index c9ebef4b1..4fc9ef207 100644 +--- a/controller/binding.h ++++ b/controller/binding.h +@@ 
-36,6 +36,7 @@ struct sbrec_chassis; + struct sbrec_port_binding_table; + struct sset; + struct sbrec_port_binding; ++struct ds; + + struct binding_ctx_in { + struct ovsdb_idl_txn *ovnsb_idl_txn; +@@ -56,7 +57,7 @@ struct binding_ctx_in { + + struct binding_ctx_out { + struct hmap *local_datapaths; +- struct shash *local_bindings; ++ struct local_binding_data *lbinding_data; + + /* sset of (potential) local lports. */ + struct sset *local_lports; +@@ -86,28 +87,16 @@ struct binding_ctx_out { + struct hmap *tracked_dp_bindings; + }; + +-enum local_binding_type { +- BT_VIF, +- BT_CONTAINER, +- BT_VIRTUAL ++struct local_binding_data { ++ struct shash bindings; ++ struct shash lports; + }; + +-struct local_binding { +- char *name; +- enum local_binding_type type; +- const struct ovsrec_interface *iface; +- const struct sbrec_port_binding *pb; +- +- /* shash of 'struct local_binding' representing children. */ +- struct shash children; +- struct local_binding *parent; +-}; ++void local_binding_data_init(struct local_binding_data *); ++void local_binding_data_destroy(struct local_binding_data *); + +-static inline struct local_binding * +-local_binding_find(struct shash *local_bindings, const char *name) +-{ +- return shash_find_data(local_bindings, name); +-} ++const struct sbrec_port_binding *local_binding_get_primary_pb( ++ struct shash *local_bindings, const char *pb_name); + + /* Represents a tracked binding logical port. 
*/ + struct tracked_binding_lport { +@@ -128,8 +117,6 @@ bool binding_cleanup(struct ovsdb_idl_txn *ovnsb_idl_txn, + const struct sbrec_port_binding_table *, + const struct sbrec_chassis *); + +-void local_bindings_init(struct shash *local_bindings); +-void local_bindings_destroy(struct shash *local_bindings); + bool binding_handle_ovs_interface_changes(struct binding_ctx_in *, + struct binding_ctx_out *); + bool binding_handle_port_binding_changes(struct binding_ctx_in *, +@@ -137,7 +124,8 @@ bool binding_handle_port_binding_changes(struct binding_ctx_in *, + void binding_tracked_dp_destroy(struct hmap *tracked_datapaths); + + void binding_init(void); +-void binding_seqno_run(struct shash *local_bindings); +-void binding_seqno_install(struct shash *local_bindings); ++void binding_seqno_run(struct local_binding_data *lbinding_data); ++void binding_seqno_install(struct local_binding_data *lbinding_data); + void binding_seqno_flush(void); ++void binding_dump_local_bindings(struct local_binding_data *, struct ds *); + #endif /* controller/binding.h */ +diff --git a/controller/ovn-controller.8.xml b/controller/ovn-controller.8.xml +index 51c0c372c..8886df568 100644 +--- a/controller/ovn-controller.8.xml ++++ b/controller/ovn-controller.8.xml +@@ -578,6 +578,28 @@ + Displays logical flow cache statistics: enabled/disabled, per cache + type entry counts. + ++ ++
inc-engine/show-stats
ovn-controller
engine counters. For each engine
++ node the following counters have been added:
++ recompute
++ compute
++ abort
++ inc-engine/clear-stats
ovn-controller
engine counters.
++ flags.force_snat_for_lb = 1;
+- ct_lb(args);
. If health check is enabled, then
++ ct_lb(args);.
++        If the load balancing rule is configured with <code>skip_snat</code>
++        set to true, the above action will be replaced by
++        <code>flags.skip_snat_for_lb = 1; ct_lb(<var>args</var>);</code>.
++ If health check is enabled, then
+ args will only contain those endpoints whose service
+ monitor status entry in OVN_Southbound
db is
+ either online
or empty.
+@@ -2737,6 +2741,9 @@ icmp6 {
+ with an action of ct_dnat;
. If the router is
+ configured to force SNAT any load-balanced packets, the above action
+ will be replaced by flags.force_snat_for_lb = 1; ct_dnat;
.
++        If the load balancing rule is configured with <code>skip_snat</code>
++        set to true, the above action will be replaced by
++        <code>flags.skip_snat_for_lb = 1; ct_dnat;</code>.
+
+
+ flags.force_snat_for_lb = 1;
+ ct_lb(args);
.
++        If the load balancing rule is configured with <code>skip_snat</code>
++        set to true, the above action will be replaced by
++        <code>flags.skip_snat_for_lb = 1; ct_lb(<var>args</var>);</code>.
+ flags.force_snat_for_lb = 1; ct_dnat;
.
++        If the load balancing rule is configured with <code>skip_snat</code>
++        set to true, the above action will be replaced by
++        <code>flags.skip_snat_for_lb = 1; ct_dnat;</code>.
+
++        If a load balancer configured to skip snat has been applied to
++        the Gateway router pipeline, a priority-120 flow matches
++        <code>flags.skip_snat_for_lb == 1 &amp;&amp; ip</code> with an
++        action <code>next;</code>.
++
+ If the Gateway router in the OVN Northbound database has been
+diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
+index 5a2018c2e..4e406c594 100644
+--- a/northd/ovn-northd.c
++++ b/northd/ovn-northd.c
+@@ -8573,10 +8573,16 @@ get_force_snat_ip(struct ovn_datapath *od, const char *key_type,
+ return true;
+ }
+
++enum lb_snat_type {   /* SNAT flag that add_router_lb_flow() prepends. */
++    NO_FORCE_SNAT,    /* None: the LB actions are used unchanged. */
++    FORCE_SNAT,       /* "flags.force_snat_for_lb = 1; " is prepended. */
++    SKIP_SNAT,        /* "flags.skip_snat_for_lb = 1; " is prepended. */
++};
++
+ static void
+ add_router_lb_flow(struct hmap *lflows, struct ovn_datapath *od,
+ struct ds *match, struct ds *actions, int priority,
+- bool force_snat_for_lb, struct ovn_lb_vip *lb_vip,
++ enum lb_snat_type snat_type, struct ovn_lb_vip *lb_vip,
+ const char *proto, struct nbrec_load_balancer *lb,
+ struct shash *meter_groups, struct sset *nat_entries)
+ {
+@@ -8585,9 +8591,10 @@ add_router_lb_flow(struct hmap *lflows, struct ovn_datapath *od,
+
+ /* A match and actions for new connections. */
+ char *new_match = xasprintf("ct.new && %s", ds_cstr(match));
+- if (force_snat_for_lb) {
+- char *new_actions = xasprintf("flags.force_snat_for_lb = 1; %s",
+- ds_cstr(actions));
++ if (snat_type == FORCE_SNAT || snat_type == SKIP_SNAT) {
++ char *new_actions = xasprintf("flags.%s_snat_for_lb = 1; %s",
++ snat_type == SKIP_SNAT ? "skip" : "force",
++ ds_cstr(actions));
+ ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_DNAT, priority,
+ new_match, new_actions, &lb->header_);
+ free(new_actions);
+@@ -8598,11 +8605,12 @@ add_router_lb_flow(struct hmap *lflows, struct ovn_datapath *od,
+
+ /* A match and actions for established connections. */
+ char *est_match = xasprintf("ct.est && %s", ds_cstr(match));
+- if (force_snat_for_lb) {
++ if (snat_type == FORCE_SNAT || snat_type == SKIP_SNAT) {
++ char *est_actions = xasprintf("flags.%s_snat_for_lb = 1; ct_dnat;",
++ snat_type == SKIP_SNAT ? "skip" : "force");
+ ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_DNAT, priority,
+- est_match,
+- "flags.force_snat_for_lb = 1; ct_dnat;",
+- &lb->header_);
++ est_match, est_actions, &lb->header_);
++ free(est_actions);
+ } else {
+ ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_DNAT, priority,
+ est_match, "ct_dnat;", &lb->header_);
+@@ -8675,11 +8683,13 @@ add_router_lb_flow(struct hmap *lflows, struct ovn_datapath *od,
+ ds_put_format(&undnat_match, ") && outport == %s && "
+ "is_chassis_resident(%s)", od->l3dgw_port->json_key,
+ od->l3redirect_port->json_key);
+- if (force_snat_for_lb) {
++ if (snat_type == FORCE_SNAT || snat_type == SKIP_SNAT) {
++ char *action = xasprintf("flags.%s_snat_for_lb = 1; ct_dnat;",
++ snat_type == SKIP_SNAT ? "skip" : "force");
+ ovn_lflow_add_with_hint(lflows, od, S_ROUTER_OUT_UNDNAT, 120,
+- ds_cstr(&undnat_match),
+- "flags.force_snat_for_lb = 1; ct_dnat;",
++ ds_cstr(&undnat_match), action,
+ &lb->header_);
++ free(action);
+ } else {
+ ovn_lflow_add_with_hint(lflows, od, S_ROUTER_OUT_UNDNAT, 120,
+ ds_cstr(&undnat_match), "ct_dnat;",
+@@ -8689,6 +8699,105 @@ add_router_lb_flow(struct hmap *lflows, struct ovn_datapath *od,
+ ds_destroy(&undnat_match);
+ }
+
++/* Adds the load balancer flows of router 'od' to 'lflows': one priority-100
++ * S_ROUTER_IN_DEFRAG flow per distinct VIP, the flows add_router_lb_flow()
++ * builds for each VIP and, per LB with "skip_snat" set, a priority-120
++ * S_ROUTER_OUT_SNAT flow.  'match' and 'actions' are reused as scratch. */
++static void
++build_lrouter_lb_flows(struct hmap *lflows, struct ovn_datapath *od,
++                       struct hmap *lbs, struct shash *meter_groups,
++                       struct sset *nat_entries, struct ds *match,
++                       struct ds *actions)
++{
++    /* VIPs already given an S_ROUTER_IN_DEFRAG flow, to add it only once. */
++    struct sset all_ips = SSET_INITIALIZER(&all_ips);
++    bool lb_force_snat_ip =
++        !lport_addresses_is_empty(&od->lb_force_snat_addrs);
++
++    for (int i = 0; i < od->nbr->n_load_balancer; i++) {
++        struct nbrec_load_balancer *nb_lb = od->nbr->load_balancer[i];
++        struct ovn_northd_lb *lb =
++            ovn_northd_lb_find(lbs, &nb_lb->header_.uuid);
++        ovs_assert(lb);    /* Each LB of the router must be in 'lbs'. */
++
++        bool lb_skip_snat = smap_get_bool(&nb_lb->options, "skip_snat", false);
++        if (lb_skip_snat) {
++            ovn_lflow_add(lflows, od, S_ROUTER_OUT_SNAT, 120,
++                          "flags.skip_snat_for_lb == 1 && ip", "next;");
++        }
++
++        for (size_t j = 0; j < lb->n_vips; j++) {
++            struct ovn_lb_vip *lb_vip = &lb->vips[j];
++            struct ovn_northd_lb_vip *lb_vip_nb = &lb->vips_nb[j];
++            ds_clear(actions);
++            build_lb_vip_actions(lb_vip, lb_vip_nb, actions,
++                                 lb->selection_fields, false);
++
++            if (!sset_contains(&all_ips, lb_vip->vip_str)) {
++                sset_add(&all_ips, lb_vip->vip_str);
++                /* VIP not seen before: send its packets to conntrack
++                 * for defragmentation and tracking, so that (1) only new
++                 * connections pick a DNAT ip from the group and (2) rules
++                 * with L4 ports can match on them after defragmentation. */
++                ds_clear(match);
++                if (IN6_IS_ADDR_V4MAPPED(&lb_vip->vip)) {
++                    ds_put_format(match, "ip && ip4.dst == %s",
++                                  lb_vip->vip_str);
++                } else {
++                    ds_put_format(match, "ip && ip6.dst == %s",
++                                  lb_vip->vip_str);
++                }
++                ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_DEFRAG,
++                                        100, ds_cstr(match), "ct_next;",
++                                        &nb_lb->header_);
++            }
++
++            /* Higher priority rules are added for load-balancing in DNAT
++             * table.  For every match (on a VIP[:port]), we add two flows
++             * via add_router_lb_flow().  One flow is for specific matching
++             * on ct.new with an action of "ct_lb($targets);".  The other
++             * flow is for ct.est with an action of "ct_dnat;". */
++            ds_clear(match);
++            if (IN6_IS_ADDR_V4MAPPED(&lb_vip->vip)) {
++                ds_put_format(match, "ip && ip4.dst == %s",
++                              lb_vip->vip_str);
++            } else {
++                ds_put_format(match, "ip && ip6.dst == %s",
++                              lb_vip->vip_str);
++            }
++
++            int prio = 110;    /* VIP-only match; 120 once a port is matched. */
++            bool is_udp = nullable_string_is_equal(nb_lb->protocol, "udp");
++            bool is_sctp = nullable_string_is_equal(nb_lb->protocol,
++                                                    "sctp");
++            const char *proto = is_udp ? "udp" : is_sctp ? "sctp" : "tcp";
++
++            if (lb_vip->vip_port) {
++                ds_put_format(match, " && %s && %s.dst == %d", proto,
++                              proto, lb_vip->vip_port);
++                prio = 120;
++            }
++
++            if (od->l3redirect_port &&
++                (lb_vip->n_backends || !lb_vip->empty_backend_rej)) {
++                ds_put_format(match, " && is_chassis_resident(%s)",
++                              od->l3redirect_port->json_key);
++            }
++
++            enum lb_snat_type snat_type = NO_FORCE_SNAT;
++            if (lb_skip_snat) {    /* Takes precedence over force SNAT. */
++                snat_type = SKIP_SNAT;
++            } else if (lb_force_snat_ip || od->lb_force_snat_router_ip) {
++                snat_type = FORCE_SNAT;
++            }
++            add_router_lb_flow(lflows, od, match, actions, prio,
++                               snat_type, lb_vip, proto, nb_lb,
++                               meter_groups, nat_entries);
++        }
++    }
++    sset_destroy(&all_ips);
++}
++
+ #define ND_RA_MAX_INTERVAL_MAX 1800
+ #define ND_RA_MAX_INTERVAL_MIN 4
+
+@@ -11002,668 +11111,643 @@ build_lrouter_ipv4_ip_input(struct ovn_port *op,
+ }
+ }
+
+-/* NAT, Defrag and load balancing. */
+ static void
+-build_lrouter_nat_defrag_and_lb(struct ovn_datapath *od,
+- struct hmap *lflows,
+- struct shash *meter_groups,
+- struct hmap *lbs,
+- struct ds *match, struct ds *actions)
++build_lrouter_in_unsnat_flow(struct hmap *lflows, struct ovn_datapath *od,
++ const struct nbrec_nat *nat, struct ds *match,
++ struct ds *actions, bool distributed, bool is_v6)
+ {
+- if (od->nbr) {
++ /* Ingress UNSNAT table: It is for already established connections'
++ * reverse traffic. i.e., SNAT has already been done in egress
++ * pipeline and now the packet has entered the ingress pipeline as
++ * part of a reply. We undo the SNAT here.
++ *
++ * Undoing SNAT has to happen before DNAT processing. This is
++ * because when the packet was DNATed in ingress pipeline, it did
++ * not know about the possibility of eventual additional SNAT in
++ * egress pipeline. */
++ if (strcmp(nat->type, "snat") && strcmp(nat->type, "dnat_and_snat")) {
++ return;
++ }
+
+- /* Packets are allowed by default. */
+- ovn_lflow_add(lflows, od, S_ROUTER_IN_DEFRAG, 0, "1", "next;");
+- ovn_lflow_add(lflows, od, S_ROUTER_IN_UNSNAT, 0, "1", "next;");
+- ovn_lflow_add(lflows, od, S_ROUTER_OUT_SNAT, 0, "1", "next;");
+- ovn_lflow_add(lflows, od, S_ROUTER_IN_DNAT, 0, "1", "next;");
+- ovn_lflow_add(lflows, od, S_ROUTER_OUT_UNDNAT, 0, "1", "next;");
+- ovn_lflow_add(lflows, od, S_ROUTER_OUT_EGR_LOOP, 0, "1", "next;");
+- ovn_lflow_add(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 0, "1", "next;");
+-
+- /* Send the IPv6 NS packets to next table. When ovn-controller
+- * generates IPv6 NS (for the action - nd_ns{}), the injected
+- * packet would go through conntrack - which is not required. */
+- ovn_lflow_add(lflows, od, S_ROUTER_OUT_SNAT, 120, "nd_ns", "next;");
+-
+- /* NAT rules are only valid on Gateway routers and routers with
+- * l3dgw_port (router has a port with gateway chassis
+- * specified). */
+- if (!smap_get(&od->nbr->options, "chassis") && !od->l3dgw_port) {
+- return;
++ bool stateless = lrouter_nat_is_stateless(nat);
++ if (!od->l3dgw_port) {
++ /* Gateway router. */
++ ds_clear(match);
++ ds_clear(actions);
++ ds_put_format(match, "ip && ip%s.dst == %s",
++ is_v6 ? "6" : "4", nat->external_ip);
++ if (!strcmp(nat->type, "dnat_and_snat") && stateless) {
++ ds_put_format(actions, "ip%s.dst=%s; next;",
++ is_v6 ? "6" : "4", nat->logical_ip);
++ } else {
++ ds_put_cstr(actions, "ct_snat;");
+ }
+
+- struct sset nat_entries = SSET_INITIALIZER(&nat_entries);
++ ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_UNSNAT,
++ 90, ds_cstr(match), ds_cstr(actions),
++ &nat->header_);
++ } else {
++ /* Distributed router. */
+
+- bool dnat_force_snat_ip =
+- !lport_addresses_is_empty(&od->dnat_force_snat_addrs);
+- bool lb_force_snat_ip =
+- !lport_addresses_is_empty(&od->lb_force_snat_addrs);
++ /* Traffic received on l3dgw_port is subject to NAT. */
++ ds_clear(match);
++ ds_clear(actions);
++ ds_put_format(match, "ip && ip%s.dst == %s && inport == %s",
++ is_v6 ? "6" : "4", nat->external_ip,
++ od->l3dgw_port->json_key);
++ if (!distributed && od->l3redirect_port) {
++ /* Flows for NAT rules that are centralized are only
++ * programmed on the gateway chassis. */
++ ds_put_format(match, " && is_chassis_resident(%s)",
++ od->l3redirect_port->json_key);
++ }
+
+- for (int i = 0; i < od->nbr->n_nat; i++) {
+- const struct nbrec_nat *nat;
++ if (!strcmp(nat->type, "dnat_and_snat") && stateless) {
++ ds_put_format(actions, "ip%s.dst=%s; next;",
++ is_v6 ? "6" : "4", nat->logical_ip);
++ } else {
++ ds_put_cstr(actions, "ct_snat;");
++ }
+
+- nat = od->nbr->nat[i];
++ ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_UNSNAT,
++ 100, ds_cstr(match), ds_cstr(actions),
++ &nat->header_);
++ }
++}
+
+- ovs_be32 ip, mask;
+- struct in6_addr ipv6, mask_v6, v6_exact = IN6ADDR_EXACT_INIT;
+- bool is_v6 = false;
+- bool stateless = lrouter_nat_is_stateless(nat);
+- struct nbrec_address_set *allowed_ext_ips =
+- nat->allowed_ext_ips;
+- struct nbrec_address_set *exempted_ext_ips =
+- nat->exempted_ext_ips;
++static void
++build_lrouter_in_dnat_flow(struct hmap *lflows, struct ovn_datapath *od,
++ const struct nbrec_nat *nat, struct ds *match,
++ struct ds *actions, bool distributed,
++ ovs_be32 mask, bool is_v6)
++{
++ /* Ingress DNAT table: Packets enter the pipeline with destination
++ * IP address that needs to be DNATted from an external IP address
++ * to a logical IP address. */
++ if (!strcmp(nat->type, "dnat") || !strcmp(nat->type, "dnat_and_snat")) {
++ bool stateless = lrouter_nat_is_stateless(nat);
+
+- if (allowed_ext_ips && exempted_ext_ips) {
+- static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
+- VLOG_WARN_RL(&rl, "NAT rule: "UUID_FMT" not applied, since "
+- "both allowed and exempt external ips set",
+- UUID_ARGS(&(nat->header_.uuid)));
+- continue;
++ if (!od->l3dgw_port) {
++ /* Gateway router. */
++ /* Packet when it goes from the initiator to destination.
++ * We need to set flags.loopback because the router can
++ * send the packet back through the same interface. */
++ ds_clear(match);
++ ds_put_format(match, "ip && ip%s.dst == %s",
++ is_v6 ? "6" : "4", nat->external_ip);
++ ds_clear(actions);
++ if (nat->allowed_ext_ips || nat->exempted_ext_ips) {
++ lrouter_nat_add_ext_ip_match(od, lflows, match, nat,
++ is_v6, true, mask);
+ }
+
+- char *error = ip_parse_masked(nat->external_ip, &ip, &mask);
+- if (error || mask != OVS_BE32_MAX) {
+- free(error);
+- error = ipv6_parse_masked(nat->external_ip, &ipv6, &mask_v6);
+- if (error || memcmp(&mask_v6, &v6_exact, sizeof(mask_v6))) {
+- /* Invalid for both IPv4 and IPv6 */
+- static struct vlog_rate_limit rl =
+- VLOG_RATE_LIMIT_INIT(5, 1);
+- VLOG_WARN_RL(&rl, "bad external ip %s for nat",
+- nat->external_ip);
+- free(error);
+- continue;
+- }
+- /* It was an invalid IPv4 address, but valid IPv6.
+- * Treat the rest of the handling of this NAT rule
+- * as IPv6. */
+- is_v6 = true;
+- }
+-
+- /* Check the validity of nat->logical_ip. 'logical_ip' can
+- * be a subnet when the type is "snat". */
+- int cidr_bits;
+- if (is_v6) {
+- error = ipv6_parse_masked(nat->logical_ip, &ipv6, &mask_v6);
+- cidr_bits = ipv6_count_cidr_bits(&mask_v6);
+- } else {
+- error = ip_parse_masked(nat->logical_ip, &ip, &mask);
+- cidr_bits = ip_count_cidr_bits(mask);
++ if (!lport_addresses_is_empty(&od->dnat_force_snat_addrs)) {
++ /* Indicate to the future tables that a DNAT has taken
++ * place and a force SNAT needs to be done in the
++ * Egress SNAT table. */
++ ds_put_format(actions, "flags.force_snat_for_dnat = 1; ");
+ }
+- if (!strcmp(nat->type, "snat")) {
+- if (error) {
+- /* Invalid for both IPv4 and IPv6 */
+- static struct vlog_rate_limit rl =
+- VLOG_RATE_LIMIT_INIT(5, 1);
+- VLOG_WARN_RL(&rl, "bad ip network or ip %s for snat "
+- "in router "UUID_FMT"",
+- nat->logical_ip, UUID_ARGS(&od->key));
+- free(error);
+- continue;
+- }
++
++ if (!strcmp(nat->type, "dnat_and_snat") && stateless) {
++ ds_put_format(actions, "flags.loopback = 1; "
++ "ip%s.dst=%s; next;",
++ is_v6 ? "6" : "4", nat->logical_ip);
+ } else {
+- if (error || (!is_v6 && mask != OVS_BE32_MAX)
+- || (is_v6 && memcmp(&mask_v6, &v6_exact,
+- sizeof mask_v6))) {
+- /* Invalid for both IPv4 and IPv6 */
+- static struct vlog_rate_limit rl =
+- VLOG_RATE_LIMIT_INIT(5, 1);
+- VLOG_WARN_RL(&rl, "bad ip %s for dnat in router "
+- ""UUID_FMT"", nat->logical_ip, UUID_ARGS(&od->key));
+- free(error);
+- continue;
++ ds_put_format(actions, "flags.loopback = 1; ct_dnat(%s",
++ nat->logical_ip);
++
++ if (nat->external_port_range[0]) {
++ ds_put_format(actions, ",%s", nat->external_port_range);
+ }
++ ds_put_format(actions, ");");
+ }
+
+- /* For distributed router NAT, determine whether this NAT rule
+- * satisfies the conditions for distributed NAT processing. */
+- bool distributed = false;
+- struct eth_addr mac;
+- if (od->l3dgw_port && !strcmp(nat->type, "dnat_and_snat") &&
+- nat->logical_port && nat->external_mac) {
+- if (eth_addr_from_string(nat->external_mac, &mac)) {
+- distributed = true;
+- } else {
+- static struct vlog_rate_limit rl =
+- VLOG_RATE_LIMIT_INIT(5, 1);
+- VLOG_WARN_RL(&rl, "bad mac %s for dnat in router "
+- ""UUID_FMT"", nat->external_mac, UUID_ARGS(&od->key));
+- continue;
++ ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_DNAT, 100,
++ ds_cstr(match), ds_cstr(actions),
++ &nat->header_);
++ } else {
++ /* Distributed router. */
++
++ /* Traffic received on l3dgw_port is subject to NAT. */
++ ds_clear(match);
++ ds_put_format(match, "ip && ip%s.dst == %s && inport == %s",
++ is_v6 ? "6" : "4", nat->external_ip,
++ od->l3dgw_port->json_key);
++ if (!distributed && od->l3redirect_port) {
++ /* Flows for NAT rules that are centralized are only
++ * programmed on the gateway chassis. */
++ ds_put_format(match, " && is_chassis_resident(%s)",
++ od->l3redirect_port->json_key);
++ }
++ ds_clear(actions);
++ if (nat->allowed_ext_ips || nat->exempted_ext_ips) {
++ lrouter_nat_add_ext_ip_match(od, lflows, match, nat,
++ is_v6, true, mask);
++ }
++
++ if (!strcmp(nat->type, "dnat_and_snat") && stateless) {
++ ds_put_format(actions, "ip%s.dst=%s; next;",
++ is_v6 ? "6" : "4", nat->logical_ip);
++ } else {
++ ds_put_format(actions, "ct_dnat(%s", nat->logical_ip);
++ if (nat->external_port_range[0]) {
++ ds_put_format(actions, ",%s", nat->external_port_range);
+ }
++ ds_put_format(actions, ");");
+ }
+
+- /* Ingress UNSNAT table: It is for already established connections'
+- * reverse traffic. i.e., SNAT has already been done in egress
+- * pipeline and now the packet has entered the ingress pipeline as
+- * part of a reply. We undo the SNAT here.
+- *
+- * Undoing SNAT has to happen before DNAT processing. This is
+- * because when the packet was DNATed in ingress pipeline, it did
+- * not know about the possibility of eventual additional SNAT in
+- * egress pipeline. */
+- if (!strcmp(nat->type, "snat")
+- || !strcmp(nat->type, "dnat_and_snat")) {
+- if (!od->l3dgw_port) {
+- /* Gateway router. */
+- ds_clear(match);
+- ds_clear(actions);
+- ds_put_format(match, "ip && ip%s.dst == %s",
+- is_v6 ? "6" : "4",
+- nat->external_ip);
+- if (!strcmp(nat->type, "dnat_and_snat") && stateless) {
+- ds_put_format(actions, "ip%s.dst=%s; next;",
+- is_v6 ? "6" : "4", nat->logical_ip);
+- } else {
+- ds_put_cstr(actions, "ct_snat;");
+- }
++ ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_DNAT, 100,
++ ds_cstr(match), ds_cstr(actions),
++ &nat->header_);
++ }
++ }
++}
+
+- ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_UNSNAT,
+- 90, ds_cstr(match),
+- ds_cstr(actions),
+- &nat->header_);
+- } else {
+- /* Distributed router. */
++static void
++build_lrouter_out_undnat_flow(struct hmap *lflows, struct ovn_datapath *od,
++ const struct nbrec_nat *nat, struct ds *match,
++ struct ds *actions, bool distributed,
++ struct eth_addr mac, bool is_v6)
++{
++ /* Egress UNDNAT table: It is for already established connections'
++ * reverse traffic. i.e., DNAT has already been done in ingress
++ * pipeline and now the packet has entered the egress pipeline as
++ * part of a reply. We undo the DNAT here.
++ *
++ * Note that this only applies for NAT on a distributed router.
++ * Undoing DNAT on a gateway router is done in the ingress DNAT
++ * pipeline stage. */
++ if (!od->l3dgw_port ||
++ (strcmp(nat->type, "dnat") && strcmp(nat->type, "dnat_and_snat"))) {
++ return;
++ }
+
+- /* Traffic received on l3dgw_port is subject to NAT. */
+- ds_clear(match);
+- ds_clear(actions);
+- ds_put_format(match, "ip && ip%s.dst == %s"
+- " && inport == %s",
+- is_v6 ? "6" : "4",
+- nat->external_ip,
+- od->l3dgw_port->json_key);
+- if (!distributed && od->l3redirect_port) {
+- /* Flows for NAT rules that are centralized are only
+- * programmed on the gateway chassis. */
+- ds_put_format(match, " && is_chassis_resident(%s)",
+- od->l3redirect_port->json_key);
+- }
++ ds_clear(match);
++ ds_put_format(match, "ip && ip%s.src == %s && outport == %s",
++ is_v6 ? "6" : "4", nat->logical_ip,
++ od->l3dgw_port->json_key);
++ if (!distributed && od->l3redirect_port) {
++ /* Flows for NAT rules that are centralized are only
++ * programmed on the gateway chassis. */
++ ds_put_format(match, " && is_chassis_resident(%s)",
++ od->l3redirect_port->json_key);
++ }
++ ds_clear(actions);
++ if (distributed) {
++ ds_put_format(actions, "eth.src = "ETH_ADDR_FMT"; ",
++ ETH_ADDR_ARGS(mac));
++ }
+
+- if (!strcmp(nat->type, "dnat_and_snat") && stateless) {
+- ds_put_format(actions, "ip%s.dst=%s; next;",
+- is_v6 ? "6" : "4", nat->logical_ip);
+- } else {
+- ds_put_cstr(actions, "ct_snat;");
+- }
++ if (!strcmp(nat->type, "dnat_and_snat") &&
++ lrouter_nat_is_stateless(nat)) {
++ ds_put_format(actions, "ip%s.src=%s; next;",
++ is_v6 ? "6" : "4", nat->external_ip);
++ } else {
++ ds_put_format(actions, "ct_dnat;");
++ }
+
+- ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_UNSNAT,
+- 100,
+- ds_cstr(match), ds_cstr(actions),
+- &nat->header_);
+- }
+- }
++ ovn_lflow_add_with_hint(lflows, od, S_ROUTER_OUT_UNDNAT, 100,
++ ds_cstr(match), ds_cstr(actions),
++ &nat->header_);
++}
+
+- /* Ingress DNAT table: Packets enter the pipeline with destination
+- * IP address that needs to be DNATted from a external IP address
+- * to a logical IP address. */
+- if (!strcmp(nat->type, "dnat")
+- || !strcmp(nat->type, "dnat_and_snat")) {
+- if (!od->l3dgw_port) {
+- /* Gateway router. */
+- /* Packet when it goes from the initiator to destination.
+- * We need to set flags.loopback because the router can
+- * send the packet back through the same interface. */
+- ds_clear(match);
+- ds_put_format(match, "ip && ip%s.dst == %s",
+- is_v6 ? "6" : "4",
+- nat->external_ip);
+- ds_clear(actions);
+- if (allowed_ext_ips || exempted_ext_ips) {
+- lrouter_nat_add_ext_ip_match(od, lflows, match, nat,
+- is_v6, true, mask);
+- }
++static void
++build_lrouter_out_snat_flow(struct hmap *lflows, struct ovn_datapath *od,
++ const struct nbrec_nat *nat, struct ds *match,
++ struct ds *actions, bool distributed,
++ struct eth_addr mac, ovs_be32 mask,
++ int cidr_bits, bool is_v6)
++{
++ /* Egress SNAT table: Packets enter the egress pipeline with
++ * source ip address that needs to be SNATted to an external ip
++ * address. */
++ if (strcmp(nat->type, "snat") && strcmp(nat->type, "dnat_and_snat")) {
++ return;
++ }
+
+- if (dnat_force_snat_ip) {
+- /* Indicate to the future tables that a DNAT has taken
+- * place and a force SNAT needs to be done in the
+- * Egress SNAT table. */
+- ds_put_format(actions,
+- "flags.force_snat_for_dnat = 1; ");
+- }
++ bool stateless = lrouter_nat_is_stateless(nat);
++ if (!od->l3dgw_port) {
++ /* Gateway router. */
++ ds_clear(match);
++ ds_put_format(match, "ip && ip%s.src == %s",
++ is_v6 ? "6" : "4", nat->logical_ip);
++ ds_clear(actions);
+
+- if (!strcmp(nat->type, "dnat_and_snat") && stateless) {
+- ds_put_format(actions, "flags.loopback = 1; "
+- "ip%s.dst=%s; next;",
+- is_v6 ? "6" : "4", nat->logical_ip);
+- } else {
+- ds_put_format(actions, "flags.loopback = 1; "
+- "ct_dnat(%s", nat->logical_ip);
++ if (nat->allowed_ext_ips || nat->exempted_ext_ips) {
++ lrouter_nat_add_ext_ip_match(od, lflows, match, nat,
++ is_v6, false, mask);
++ }
+
+- if (nat->external_port_range[0]) {
+- ds_put_format(actions, ",%s",
+- nat->external_port_range);
+- }
+- ds_put_format(actions, ");");
+- }
++ if (!strcmp(nat->type, "dnat_and_snat") && stateless) {
++ ds_put_format(actions, "ip%s.src=%s; next;",
++ is_v6 ? "6" : "4", nat->external_ip);
++ } else {
++ ds_put_format(actions, "ct_snat(%s", nat->external_ip);
+
+- ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_DNAT, 100,
+- ds_cstr(match), ds_cstr(actions),
+- &nat->header_);
+- } else {
+- /* Distributed router. */
++ if (nat->external_port_range[0]) {
++ ds_put_format(actions, ",%s",
++ nat->external_port_range);
++ }
++ ds_put_format(actions, ");");
++ }
+
+- /* Traffic received on l3dgw_port is subject to NAT. */
+- ds_clear(match);
+- ds_put_format(match, "ip && ip%s.dst == %s"
+- " && inport == %s",
+- is_v6 ? "6" : "4",
+- nat->external_ip,
+- od->l3dgw_port->json_key);
+- if (!distributed && od->l3redirect_port) {
+- /* Flows for NAT rules that are centralized are only
+- * programmed on the gateway chassis. */
+- ds_put_format(match, " && is_chassis_resident(%s)",
+- od->l3redirect_port->json_key);
+- }
+- ds_clear(actions);
+- if (allowed_ext_ips || exempted_ext_ips) {
+- lrouter_nat_add_ext_ip_match(od, lflows, match, nat,
+- is_v6, true, mask);
+- }
++ /* The priority here is calculated such that the
++ * nat->logical_ip with the longest mask gets a higher
++ * priority. */
++ ovn_lflow_add_with_hint(lflows, od, S_ROUTER_OUT_SNAT,
++ cidr_bits + 1, ds_cstr(match),
++ ds_cstr(actions), &nat->header_);
++ } else {
++ uint16_t priority = cidr_bits + 1;
+
+- if (!strcmp(nat->type, "dnat_and_snat") && stateless) {
+- ds_put_format(actions, "ip%s.dst=%s; next;",
+- is_v6 ? "6" : "4", nat->logical_ip);
+- } else {
+- ds_put_format(actions, "ct_dnat(%s", nat->logical_ip);
+- if (nat->external_port_range[0]) {
+- ds_put_format(actions, ",%s",
+- nat->external_port_range);
+- }
+- ds_put_format(actions, ");");
+- }
++ /* Distributed router. */
++ ds_clear(match);
++ ds_put_format(match, "ip && ip%s.src == %s && outport == %s",
++ is_v6 ? "6" : "4", nat->logical_ip,
++ od->l3dgw_port->json_key);
++ if (!distributed && od->l3redirect_port) {
++ /* Flows for NAT rules that are centralized are only
++ * programmed on the gateway chassis. */
++ priority += 128;
++ ds_put_format(match, " && is_chassis_resident(%s)",
++ od->l3redirect_port->json_key);
++ }
++ ds_clear(actions);
+
+- ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_DNAT, 100,
+- ds_cstr(match), ds_cstr(actions),
+- &nat->header_);
+- }
+- }
++ if (nat->allowed_ext_ips || nat->exempted_ext_ips) {
++ lrouter_nat_add_ext_ip_match(od, lflows, match, nat,
++ is_v6, false, mask);
++ }
+
+- /* ARP resolve for NAT IPs. */
+- if (od->l3dgw_port) {
+- if (!strcmp(nat->type, "snat")) {
+- ds_clear(match);
+- ds_put_format(
+- match, "inport == %s && %s == %s",
+- od->l3dgw_port->json_key,
+- is_v6 ? "ip6.src" : "ip4.src", nat->external_ip);
+- ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_IP_INPUT,
+- 120, ds_cstr(match), "next;",
+- &nat->header_);
+- }
++ if (distributed) {
++ ds_put_format(actions, "eth.src = "ETH_ADDR_FMT"; ",
++ ETH_ADDR_ARGS(mac));
++ }
+
+- if (!sset_contains(&nat_entries, nat->external_ip)) {
+- ds_clear(match);
+- ds_put_format(
+- match, "outport == %s && %s == %s",
+- od->l3dgw_port->json_key,
+- is_v6 ? REG_NEXT_HOP_IPV6 : REG_NEXT_HOP_IPV4,
++ if (!strcmp(nat->type, "dnat_and_snat") && stateless) {
++ ds_put_format(actions, "ip%s.src=%s; next;",
++ is_v6 ? "6" : "4", nat->external_ip);
++ } else {
++ ds_put_format(actions, "ct_snat(%s",
+ nat->external_ip);
+- ds_clear(actions);
+- ds_put_format(
+- actions, "eth.dst = %s; next;",
+- distributed ? nat->external_mac :
+- od->l3dgw_port->lrp_networks.ea_s);
+- ovn_lflow_add_with_hint(lflows, od,
+- S_ROUTER_IN_ARP_RESOLVE,
+- 100, ds_cstr(match),
+- ds_cstr(actions),
+- &nat->header_);
+- sset_add(&nat_entries, nat->external_ip);
+- }
+- } else {
+- /* Add the NAT external_ip to the nat_entries even for
+- * gateway routers. This is required for adding load balancer
+- * flows.*/
+- sset_add(&nat_entries, nat->external_ip);
++ if (nat->external_port_range[0]) {
++ ds_put_format(actions, ",%s", nat->external_port_range);
+ }
++ ds_put_format(actions, ");");
++ }
+
+- /* Egress UNDNAT table: It is for already established connections'
+- * reverse traffic. i.e., DNAT has already been done in ingress
+- * pipeline and now the packet has entered the egress pipeline as
+- * part of a reply. We undo the DNAT here.
+- *
+- * Note that this only applies for NAT on a distributed router.
+- * Undo DNAT on a gateway router is done in the ingress DNAT
+- * pipeline stage. */
+- if (od->l3dgw_port && (!strcmp(nat->type, "dnat")
+- || !strcmp(nat->type, "dnat_and_snat"))) {
+- ds_clear(match);
+- ds_put_format(match, "ip && ip%s.src == %s"
+- " && outport == %s",
+- is_v6 ? "6" : "4",
+- nat->logical_ip,
+- od->l3dgw_port->json_key);
+- if (!distributed && od->l3redirect_port) {
+- /* Flows for NAT rules that are centralized are only
+- * programmed on the gateway chassis. */
+- ds_put_format(match, " && is_chassis_resident(%s)",
+- od->l3redirect_port->json_key);
+- }
+- ds_clear(actions);
+- if (distributed) {
+- ds_put_format(actions, "eth.src = "ETH_ADDR_FMT"; ",
+- ETH_ADDR_ARGS(mac));
+- }
++ /* The priority here is calculated such that the
++ * nat->logical_ip with the longest mask gets a higher
++ * priority. */
++ ovn_lflow_add_with_hint(lflows, od, S_ROUTER_OUT_SNAT,
++ priority, ds_cstr(match),
++ ds_cstr(actions), &nat->header_);
++ }
++}
+
+- if (!strcmp(nat->type, "dnat_and_snat") && stateless) {
+- ds_put_format(actions, "ip%s.src=%s; next;",
+- is_v6 ? "6" : "4", nat->external_ip);
+- } else {
+- ds_put_format(actions, "ct_dnat;");
+- }
++static void
++build_lrouter_ingress_flow(struct hmap *lflows, struct ovn_datapath *od,
++ const struct nbrec_nat *nat, struct ds *match,
++ struct ds *actions, struct eth_addr mac,
++ bool distributed, bool is_v6)
++{
++ if (od->l3dgw_port && !strcmp(nat->type, "snat")) {
++ ds_clear(match);
++ ds_put_format(
++ match, "inport == %s && %s == %s",
++ od->l3dgw_port->json_key,
++ is_v6 ? "ip6.src" : "ip4.src", nat->external_ip);
++ ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_IP_INPUT,
++ 120, ds_cstr(match), "next;",
++ &nat->header_);
++ }
++ /* Logical router ingress table 0:
++ * For NAT on a distributed router, add rules allowing
++ * ingress traffic with eth.dst matching nat->external_mac
++ * on the l3dgw_port instance where nat->logical_port is
++ * resident. */
++ if (distributed) {
++ /* Store the ethernet address of the port receiving the packet.
++ * This will save us from having to match on inport further
++ * down in the pipeline.
++ */
++ ds_clear(actions);
++ ds_put_format(actions, REG_INPORT_ETH_ADDR " = %s; next;",
++ od->l3dgw_port->lrp_networks.ea_s);
+
+- ovn_lflow_add_with_hint(lflows, od, S_ROUTER_OUT_UNDNAT, 100,
+- ds_cstr(match), ds_cstr(actions),
+- &nat->header_);
+- }
++ ds_clear(match);
++ ds_put_format(match,
++ "eth.dst == "ETH_ADDR_FMT" && inport == %s"
++ " && is_chassis_resident(\"%s\")",
++ ETH_ADDR_ARGS(mac),
++ od->l3dgw_port->json_key,
++ nat->logical_port);
++ ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ADMISSION, 50,
++ ds_cstr(match), ds_cstr(actions),
++ &nat->header_);
++ }
++}
+
+- /* Egress SNAT table: Packets enter the egress pipeline with
+- * source ip address that needs to be SNATted to a external ip
+- * address. */
+- if (!strcmp(nat->type, "snat")
+- || !strcmp(nat->type, "dnat_and_snat")) {
+- if (!od->l3dgw_port) {
+- /* Gateway router. */
+- ds_clear(match);
+- ds_put_format(match, "ip && ip%s.src == %s",
+- is_v6 ? "6" : "4",
+- nat->logical_ip);
+- ds_clear(actions);
++static int
++lrouter_check_nat_entry(struct ovn_datapath *od, const struct nbrec_nat *nat,
++ ovs_be32 *mask, bool *is_v6, int *cidr_bits,
++ struct eth_addr *mac, bool *distributed)
++{
++ struct in6_addr ipv6, mask_v6, v6_exact = IN6ADDR_EXACT_INIT;
++ ovs_be32 ip;
+
+- if (allowed_ext_ips || exempted_ext_ips) {
+- lrouter_nat_add_ext_ip_match(od, lflows, match, nat,
+- is_v6, false, mask);
+- }
++ if (nat->allowed_ext_ips && nat->exempted_ext_ips) {
++ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
++ VLOG_WARN_RL(&rl, "NAT rule: "UUID_FMT" not applied, since "
++ "both allowed and exempt external ips set",
++ UUID_ARGS(&(nat->header_.uuid)));
++ return -EINVAL;
++ }
+
+- if (!strcmp(nat->type, "dnat_and_snat") && stateless) {
+- ds_put_format(actions, "ip%s.src=%s; next;",
+- is_v6 ? "6" : "4", nat->external_ip);
+- } else {
+- ds_put_format(actions, "ct_snat(%s",
+- nat->external_ip);
++ char *error = ip_parse_masked(nat->external_ip, &ip, mask);
++ *is_v6 = false;
+
+- if (nat->external_port_range[0]) {
+- ds_put_format(actions, ",%s",
+- nat->external_port_range);
+- }
+- ds_put_format(actions, ");");
+- }
++ if (error || *mask != OVS_BE32_MAX) {
++ free(error);
++ error = ipv6_parse_masked(nat->external_ip, &ipv6, &mask_v6);
++ if (error || memcmp(&mask_v6, &v6_exact, sizeof(mask_v6))) {
++ /* Invalid for both IPv4 and IPv6 */
++ static struct vlog_rate_limit rl =
++ VLOG_RATE_LIMIT_INIT(5, 1);
++ VLOG_WARN_RL(&rl, "bad external ip %s for nat",
++ nat->external_ip);
++ free(error);
++ return -EINVAL;
++ }
++ /* It was an invalid IPv4 address, but valid IPv6.
++ * Treat the rest of the handling of this NAT rule
++ * as IPv6. */
++ *is_v6 = true;
++ }
+
+- /* The priority here is calculated such that the
+- * nat->logical_ip with the longest mask gets a higher
+- * priority. */
+- ovn_lflow_add_with_hint(lflows, od, S_ROUTER_OUT_SNAT,
+- cidr_bits + 1,
+- ds_cstr(match), ds_cstr(actions),
+- &nat->header_);
+- } else {
+- uint16_t priority = cidr_bits + 1;
++ /* Check the validity of nat->logical_ip. 'logical_ip' can
++ * be a subnet when the type is "snat". */
++ if (*is_v6) {
++ error = ipv6_parse_masked(nat->logical_ip, &ipv6, &mask_v6);
++ *cidr_bits = ipv6_count_cidr_bits(&mask_v6);
++ } else {
++ error = ip_parse_masked(nat->logical_ip, &ip, mask);
++ *cidr_bits = ip_count_cidr_bits(*mask);
++ }
++ if (!strcmp(nat->type, "snat")) {
++ if (error) {
++ /* Invalid for both IPv4 and IPv6 */
++ static struct vlog_rate_limit rl =
++ VLOG_RATE_LIMIT_INIT(5, 1);
++ VLOG_WARN_RL(&rl, "bad ip network or ip %s for snat "
++ "in router "UUID_FMT"",
++ nat->logical_ip, UUID_ARGS(&od->key));
++ free(error);
++ return -EINVAL;
++ }
++ } else {
++ if (error || (*is_v6 == false && *mask != OVS_BE32_MAX)
++ || (*is_v6 && memcmp(&mask_v6, &v6_exact,
++ sizeof mask_v6))) {
++ /* Invalid for both IPv4 and IPv6 */
++ static struct vlog_rate_limit rl =
++ VLOG_RATE_LIMIT_INIT(5, 1);
++ VLOG_WARN_RL(&rl, "bad ip %s for dnat in router "
++ ""UUID_FMT"", nat->logical_ip, UUID_ARGS(&od->key));
++ free(error);
++ return -EINVAL;
++ }
++ }
+
+- /* Distributed router. */
+- ds_clear(match);
+- ds_put_format(match, "ip && ip%s.src == %s"
+- " && outport == %s",
+- is_v6 ? "6" : "4",
+- nat->logical_ip,
+- od->l3dgw_port->json_key);
+- if (!distributed && od->l3redirect_port) {
+- /* Flows for NAT rules that are centralized are only
+- * programmed on the gateway chassis. */
+- priority += 128;
+- ds_put_format(match, " && is_chassis_resident(%s)",
+- od->l3redirect_port->json_key);
+- }
+- ds_clear(actions);
++ /* For distributed router NAT, determine whether this NAT rule
++ * satisfies the conditions for distributed NAT processing. */
++ *distributed = false;
++ if (od->l3dgw_port && !strcmp(nat->type, "dnat_and_snat") &&
++ nat->logical_port && nat->external_mac) {
++ if (eth_addr_from_string(nat->external_mac, mac)) {
++ *distributed = true;
++ } else {
++ static struct vlog_rate_limit rl =
++ VLOG_RATE_LIMIT_INIT(5, 1);
++ VLOG_WARN_RL(&rl, "bad mac %s for dnat in router "
++ ""UUID_FMT"", nat->external_mac, UUID_ARGS(&od->key));
++ return -EINVAL;
++ }
++ }
+
+- if (allowed_ext_ips || exempted_ext_ips) {
+- lrouter_nat_add_ext_ip_match(od, lflows, match, nat,
+- is_v6, false, mask);
+- }
++ return 0;
++}
+
+- if (distributed) {
+- ds_put_format(actions, "eth.src = "ETH_ADDR_FMT"; ",
+- ETH_ADDR_ARGS(mac));
+- }
++/* NAT, Defrag and load balancing. */
++static void
++build_lrouter_nat_defrag_and_lb(struct ovn_datapath *od,
++ struct hmap *lflows,
++ struct shash *meter_groups,
++ struct hmap *lbs,
++ struct ds *match, struct ds *actions)
++{
++ if (!od->nbr) {
++ return;
++ }
+
+- if (!strcmp(nat->type, "dnat_and_snat") && stateless) {
+- ds_put_format(actions, "ip%s.src=%s; next;",
+- is_v6 ? "6" : "4", nat->external_ip);
+- } else {
+- ds_put_format(actions, "ct_snat(%s",
+- nat->external_ip);
+- if (nat->external_port_range[0]) {
+- ds_put_format(actions, ",%s",
+- nat->external_port_range);
+- }
+- ds_put_format(actions, ");");
+- }
++ /* Packets are allowed by default. */
++ ovn_lflow_add(lflows, od, S_ROUTER_IN_DEFRAG, 0, "1", "next;");
++ ovn_lflow_add(lflows, od, S_ROUTER_IN_UNSNAT, 0, "1", "next;");
++ ovn_lflow_add(lflows, od, S_ROUTER_OUT_SNAT, 0, "1", "next;");
++ ovn_lflow_add(lflows, od, S_ROUTER_IN_DNAT, 0, "1", "next;");
++ ovn_lflow_add(lflows, od, S_ROUTER_OUT_UNDNAT, 0, "1", "next;");
++ ovn_lflow_add(lflows, od, S_ROUTER_OUT_EGR_LOOP, 0, "1", "next;");
++ ovn_lflow_add(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 0, "1", "next;");
++
++ /* Send the IPv6 NS packets to next table. When ovn-controller
++ * generates IPv6 NS (for the action - nd_ns{}), the injected
++ * packet would go through conntrack - which is not required. */
++ ovn_lflow_add(lflows, od, S_ROUTER_OUT_SNAT, 120, "nd_ns", "next;");
++
++ /* NAT rules are only valid on Gateway routers and routers with
++ * l3dgw_port (router has a port with gateway chassis
++ * specified). */
++ if (!smap_get(&od->nbr->options, "chassis") && !od->l3dgw_port) {
++ return;
++ }
+
+- /* The priority here is calculated such that the
+- * nat->logical_ip with the longest mask gets a higher
+- * priority. */
+- ovn_lflow_add_with_hint(lflows, od, S_ROUTER_OUT_SNAT,
+- priority, ds_cstr(match),
+- ds_cstr(actions),
+- &nat->header_);
+- }
+- }
++ struct sset nat_entries = SSET_INITIALIZER(&nat_entries);
+
+- /* Logical router ingress table 0:
+- * For NAT on a distributed router, add rules allowing
+- * ingress traffic with eth.dst matching nat->external_mac
+- * on the l3dgw_port instance where nat->logical_port is
+- * resident. */
+- if (distributed) {
+- /* Store the ethernet address of the port receiving the packet.
+- * This will save us from having to match on inport further
+- * down in the pipeline.
+- */
+- ds_clear(actions);
+- ds_put_format(actions, REG_INPORT_ETH_ADDR " = %s; next;",
+- od->l3dgw_port->lrp_networks.ea_s);
++ bool dnat_force_snat_ip =
++ !lport_addresses_is_empty(&od->dnat_force_snat_addrs);
++ bool lb_force_snat_ip =
++ !lport_addresses_is_empty(&od->lb_force_snat_addrs);
+
+- ds_clear(match);
+- ds_put_format(match,
+- "eth.dst == "ETH_ADDR_FMT" && inport == %s"
+- " && is_chassis_resident(\"%s\")",
+- ETH_ADDR_ARGS(mac),
+- od->l3dgw_port->json_key,
+- nat->logical_port);
+- ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ADMISSION, 50,
+- ds_cstr(match), ds_cstr(actions),
+- &nat->header_);
+- }
++ for (int i = 0; i < od->nbr->n_nat; i++) {
++ const struct nbrec_nat *nat = nat = od->nbr->nat[i];
++ struct eth_addr mac = eth_addr_broadcast;
++ bool is_v6, distributed;
++ ovs_be32 mask;
++ int cidr_bits;
+
+- /* Ingress Gateway Redirect Table: For NAT on a distributed
+- * router, add flows that are specific to a NAT rule. These
+- * flows indicate the presence of an applicable NAT rule that
+- * can be applied in a distributed manner.
+- * In particulr REG_SRC_IPV4/REG_SRC_IPV6 and eth.src are set to
+- * NAT external IP and NAT external mac so the ARP request
+- * generated in the following stage is sent out with proper IP/MAC
+- * src addresses.
+- */
+- if (distributed) {
+- ds_clear(match);
+- ds_clear(actions);
+- ds_put_format(match,
+- "ip%s.src == %s && outport == %s && "
+- "is_chassis_resident(\"%s\")",
+- is_v6 ? "6" : "4", nat->logical_ip,
+- od->l3dgw_port->json_key, nat->logical_port);
+- ds_put_format(actions, "eth.src = %s; %s = %s; next;",
+- nat->external_mac,
+- is_v6 ? REG_SRC_IPV6 : REG_SRC_IPV4,
+- nat->external_ip);
+- ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_GW_REDIRECT,
+- 100, ds_cstr(match),
+- ds_cstr(actions), &nat->header_);
+- }
++ if (lrouter_check_nat_entry(od, nat, &mask, &is_v6, &cidr_bits,
++ &mac, &distributed) < 0) {
++ continue;
++ }
+
+- /* Egress Loopback table: For NAT on a distributed router.
+- * If packets in the egress pipeline on the distributed
+- * gateway port have ip.dst matching a NAT external IP, then
+- * loop a clone of the packet back to the beginning of the
+- * ingress pipeline with inport = outport. */
+- if (od->l3dgw_port) {
+- /* Distributed router. */
+- ds_clear(match);
+- ds_put_format(match, "ip%s.dst == %s && outport == %s",
+- is_v6 ? "6" : "4",
+- nat->external_ip,
+- od->l3dgw_port->json_key);
+- if (!distributed) {
+- ds_put_format(match, " && is_chassis_resident(%s)",
+- od->l3redirect_port->json_key);
+- } else {
+- ds_put_format(match, " && is_chassis_resident(\"%s\")",
+- nat->logical_port);
+- }
++ /* S_ROUTER_IN_UNSNAT */
++ build_lrouter_in_unsnat_flow(lflows, od, nat, match, actions, distributed,
++ is_v6);
++ /* S_ROUTER_IN_DNAT */
++ build_lrouter_in_dnat_flow(lflows, od, nat, match, actions, distributed,
++ mask, is_v6);
+
++ /* ARP resolve for NAT IPs. */
++ if (od->l3dgw_port) {
++ if (!sset_contains(&nat_entries, nat->external_ip)) {
++ ds_clear(match);
++ ds_put_format(
++ match, "outport == %s && %s == %s",
++ od->l3dgw_port->json_key,
++ is_v6 ? REG_NEXT_HOP_IPV6 : REG_NEXT_HOP_IPV4,
++ nat->external_ip);
+ ds_clear(actions);
+- ds_put_format(actions,
+- "clone { ct_clear; "
+- "inport = outport; outport = \"\"; "
+- "flags = 0; flags.loopback = 1; ");
+- for (int j = 0; j < MFF_N_LOG_REGS; j++) {
+- ds_put_format(actions, "reg%d = 0; ", j);
+- }
+- ds_put_format(actions, REGBIT_EGRESS_LOOPBACK" = 1; "
+- "next(pipeline=ingress, table=%d); };",
+- ovn_stage_get_table(S_ROUTER_IN_ADMISSION));
+- ovn_lflow_add_with_hint(lflows, od, S_ROUTER_OUT_EGR_LOOP, 100,
+- ds_cstr(match), ds_cstr(actions),
++ ds_put_format(
++ actions, "eth.dst = %s; next;",
++ distributed ? nat->external_mac :
++ od->l3dgw_port->lrp_networks.ea_s);
++ ovn_lflow_add_with_hint(lflows, od,
++ S_ROUTER_IN_ARP_RESOLVE,
++ 100, ds_cstr(match),
++ ds_cstr(actions),
+ &nat->header_);
++ sset_add(&nat_entries, nat->external_ip);
+ }
+- }
+-
+- /* Handle force SNAT options set in the gateway router. */
+- if (!od->l3dgw_port) {
+- if (dnat_force_snat_ip) {
+- if (od->dnat_force_snat_addrs.n_ipv4_addrs) {
+- build_lrouter_force_snat_flows(lflows, od, "4",
+- od->dnat_force_snat_addrs.ipv4_addrs[0].addr_s,
+- "dnat");
+- }
+- if (od->dnat_force_snat_addrs.n_ipv6_addrs) {
+- build_lrouter_force_snat_flows(lflows, od, "6",
+- od->dnat_force_snat_addrs.ipv6_addrs[0].addr_s,
+- "dnat");
+- }
+- }
+- if (lb_force_snat_ip) {
+- if (od->lb_force_snat_addrs.n_ipv4_addrs) {
+- build_lrouter_force_snat_flows(lflows, od, "4",
+- od->lb_force_snat_addrs.ipv4_addrs[0].addr_s, "lb");
+- }
+- if (od->lb_force_snat_addrs.n_ipv6_addrs) {
+- build_lrouter_force_snat_flows(lflows, od, "6",
+- od->lb_force_snat_addrs.ipv6_addrs[0].addr_s, "lb");
+- }
++ } else {
++ /* Add the NAT external_ip to the nat_entries even for
++ * gateway routers. This is required for adding load balancer
++ * flows. */
++ sset_add(&nat_entries, nat->external_ip);
++ }
++
++ /* S_ROUTER_OUT_UNDNAT */
++ build_lrouter_out_undnat_flow(lflows, od, nat, match, actions, distributed,
++ mac, is_v6);
++ /* S_ROUTER_OUT_SNAT */
++ build_lrouter_out_snat_flow(lflows, od, nat, match, actions, distributed,
++ mac, mask, cidr_bits, is_v6);
++
++ /* S_ROUTER_IN_ADMISSION - S_ROUTER_IN_IP_INPUT */
++ build_lrouter_ingress_flow(lflows, od, nat, match, actions,
++ mac, distributed, is_v6);
++
++ /* Ingress Gateway Redirect Table: For NAT on a distributed
++ * router, add flows that are specific to a NAT rule. These
++ * flows indicate the presence of an applicable NAT rule that
++ * can be applied in a distributed manner.
++ * In particular REG_SRC_IPV4/REG_SRC_IPV6 and eth.src are set to
++ * NAT external IP and NAT external mac so the ARP request
++ * generated in the following stage is sent out with proper IP/MAC
++ * src addresses.
++ */
++ if (distributed) {
++ ds_clear(match);
++ ds_clear(actions);
++ ds_put_format(match,
++ "ip%s.src == %s && outport == %s && "
++ "is_chassis_resident(\"%s\")",
++ is_v6 ? "6" : "4", nat->logical_ip,
++ od->l3dgw_port->json_key, nat->logical_port);
++ ds_put_format(actions, "eth.src = %s; %s = %s; next;",
++ nat->external_mac,
++ is_v6 ? REG_SRC_IPV6 : REG_SRC_IPV4,
++ nat->external_ip);
++ ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_GW_REDIRECT,
++ 100, ds_cstr(match),
++ ds_cstr(actions), &nat->header_);
++ }
++
++ /* Egress Loopback table: For NAT on a distributed router.
++ * If packets in the egress pipeline on the distributed
++ * gateway port have ip.dst matching a NAT external IP, then
++ * loop a clone of the packet back to the beginning of the
++ * ingress pipeline with inport = outport. */
++ if (od->l3dgw_port) {
++ /* Distributed router. */
++ ds_clear(match);
++ ds_put_format(match, "ip%s.dst == %s && outport == %s",
++ is_v6 ? "6" : "4",
++ nat->external_ip,
++ od->l3dgw_port->json_key);
++ if (!distributed) {
++ ds_put_format(match, " && is_chassis_resident(%s)",
++ od->l3redirect_port->json_key);
++ } else {
++ ds_put_format(match, " && is_chassis_resident(\"%s\")",
++ nat->logical_port);
+ }
+
+- /* For gateway router, re-circulate every packet through
+- * the DNAT zone. This helps with the following.
+- *
+- * Any packet that needs to be unDNATed in the reverse
+- * direction gets unDNATed. Ideally this could be done in
+- * the egress pipeline. But since the gateway router
+- * does not have any feature that depends on the source
+- * ip address being external IP address for IP routing,
+- * we can do it here, saving a future re-circulation. */
+- ovn_lflow_add(lflows, od, S_ROUTER_IN_DNAT, 50,
+- "ip", "flags.loopback = 1; ct_dnat;");
++ ds_clear(actions);
++ ds_put_format(actions,
++ "clone { ct_clear; "
++ "inport = outport; outport = \"\"; "
++ "flags = 0; flags.loopback = 1; ");
++ for (int j = 0; j < MFF_N_LOG_REGS; j++) {
++ ds_put_format(actions, "reg%d = 0; ", j);
++ }
++ ds_put_format(actions, REGBIT_EGRESS_LOOPBACK" = 1; "
++ "next(pipeline=ingress, table=%d); };",
++ ovn_stage_get_table(S_ROUTER_IN_ADMISSION));
++ ovn_lflow_add_with_hint(lflows, od, S_ROUTER_OUT_EGR_LOOP, 100,
++ ds_cstr(match), ds_cstr(actions),
++ &nat->header_);
+ }
++ }
+
+- /* Load balancing and packet defrag are only valid on
+- * Gateway routers or router with gateway port. */
+- if (!smap_get(&od->nbr->options, "chassis") && !od->l3dgw_port) {
+- sset_destroy(&nat_entries);
+- return;
++ /* Handle force SNAT options set in the gateway router. */
++ if (!od->l3dgw_port) {
++ if (dnat_force_snat_ip) {
++ if (od->dnat_force_snat_addrs.n_ipv4_addrs) {
++ build_lrouter_force_snat_flows(lflows, od, "4",
++ od->dnat_force_snat_addrs.ipv4_addrs[0].addr_s,
++ "dnat");
++ }
++ if (od->dnat_force_snat_addrs.n_ipv6_addrs) {
++ build_lrouter_force_snat_flows(lflows, od, "6",
++ od->dnat_force_snat_addrs.ipv6_addrs[0].addr_s,
++ "dnat");
++ }
+ }
+-
+- /* A set to hold all ips that need defragmentation and tracking. */
+- struct sset all_ips = SSET_INITIALIZER(&all_ips);
+-
+- for (int i = 0; i < od->nbr->n_load_balancer; i++) {
+- struct nbrec_load_balancer *nb_lb = od->nbr->load_balancer[i];
+- struct ovn_northd_lb *lb =
+- ovn_northd_lb_find(lbs, &nb_lb->header_.uuid);
+- ovs_assert(lb);
+-
+- for (size_t j = 0; j < lb->n_vips; j++) {
+- struct ovn_lb_vip *lb_vip = &lb->vips[j];
+- struct ovn_northd_lb_vip *lb_vip_nb = &lb->vips_nb[j];
+- ds_clear(actions);
+- build_lb_vip_actions(lb_vip, lb_vip_nb, actions,
+- lb->selection_fields, false);
+-
+- if (!sset_contains(&all_ips, lb_vip->vip_str)) {
+- sset_add(&all_ips, lb_vip->vip_str);
+- /* If there are any load balancing rules, we should send
+- * the packet to conntrack for defragmentation and
+- * tracking. This helps with two things.
+- *
+- * 1. With tracking, we can send only new connections to
+- * pick a DNAT ip address from a group.
+- * 2. If there are L4 ports in load balancing rules, we
+- * need the defragmentation to match on L4 ports. */
+- ds_clear(match);
+- if (IN6_IS_ADDR_V4MAPPED(&lb_vip->vip)) {
+- ds_put_format(match, "ip && ip4.dst == %s",
+- lb_vip->vip_str);
+- } else {
+- ds_put_format(match, "ip && ip6.dst == %s",
+- lb_vip->vip_str);
+- }
+- ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_DEFRAG,
+- 100, ds_cstr(match), "ct_next;",
+- &nb_lb->header_);
+- }
+-
+- /* Higher priority rules are added for load-balancing in DNAT
+- * table. For every match (on a VIP[:port]), we add two flows
+- * via add_router_lb_flow(). One flow is for specific matching
+- * on ct.new with an action of "ct_lb($targets);". The other
+- * flow is for ct.est with an action of "ct_dnat;". */
+- ds_clear(match);
+- if (IN6_IS_ADDR_V4MAPPED(&lb_vip->vip)) {
+- ds_put_format(match, "ip && ip4.dst == %s",
+- lb_vip->vip_str);
+- } else {
+- ds_put_format(match, "ip && ip6.dst == %s",
+- lb_vip->vip_str);
+- }
+-
+- int prio = 110;
+- bool is_udp = nullable_string_is_equal(nb_lb->protocol, "udp");
+- bool is_sctp = nullable_string_is_equal(nb_lb->protocol,
+- "sctp");
+- const char *proto = is_udp ? "udp" : is_sctp ? "sctp" : "tcp";
+-
+- if (lb_vip->vip_port) {
+- ds_put_format(match, " && %s && %s.dst == %d", proto,
+- proto, lb_vip->vip_port);
+- prio = 120;
+- }
+-
+- if (od->l3redirect_port &&
+- (lb_vip->n_backends || !lb_vip->empty_backend_rej)) {
+- ds_put_format(match, " && is_chassis_resident(%s)",
+- od->l3redirect_port->json_key);
+- }
+- bool force_snat_for_lb =
+- lb_force_snat_ip || od->lb_force_snat_router_ip;
+- add_router_lb_flow(lflows, od, match, actions, prio,
+- force_snat_for_lb, lb_vip, proto,
+- nb_lb, meter_groups, &nat_entries);
++ if (lb_force_snat_ip) {
++ if (od->lb_force_snat_addrs.n_ipv4_addrs) {
++ build_lrouter_force_snat_flows(lflows, od, "4",
++ od->lb_force_snat_addrs.ipv4_addrs[0].addr_s, "lb");
++ }
++ if (od->lb_force_snat_addrs.n_ipv6_addrs) {
++ build_lrouter_force_snat_flows(lflows, od, "6",
++ od->lb_force_snat_addrs.ipv6_addrs[0].addr_s, "lb");
+ }
+ }
+- sset_destroy(&all_ips);
++
++ /* For gateway router, re-circulate every packet through
++ * the DNAT zone. This helps with the following.
++ *
++ * Any packet that needs to be unDNATed in the reverse
++ * direction gets unDNATed. Ideally this could be done in
++ * the egress pipeline. But since the gateway router
++ * does not have any feature that depends on the source
++ * ip address being external IP address for IP routing,
++ * we can do it here, saving a future re-circulation. */
++ ovn_lflow_add(lflows, od, S_ROUTER_IN_DNAT, 50,
++ "ip", "flags.loopback = 1; ct_dnat;");
++ }
++
++ /* Load balancing and packet defrag are only valid on
++ * Gateway routers or router with gateway port. */
++ if (!smap_get(&od->nbr->options, "chassis") && !od->l3dgw_port) {
+ sset_destroy(&nat_entries);
++ return;
+ }
++
++ build_lrouter_lb_flows(lflows, od, lbs, meter_groups, &nat_entries,
++ match, actions);
++
++ sset_destroy(&nat_entries);
+ }
+
+
+diff --git a/ovn-nb.xml b/ovn-nb.xml
+index b0a4adffe..408c98090 100644
+--- a/ovn-nb.xml
++++ b/ovn-nb.xml
+@@ -1653,6 +1653,12 @@
+ exactly one IPv4 and/or one IPv6 address on it, separated by a space
+ character.
+
++
++ skip_snat
++ option, the force_snat_for_lb option configured for the router
++ pipeline will not be applied for this load balancer.
++