From 4b3b86ca22efdbde590c60afae0f8bcf8ad480ae Mon Sep 17 00:00:00 2001
From: Alfredo Moralejo
Date: Sep 29 2020 15:56:09 +0000
Subject: Import openvswitch2.13-2.13.0-57 from Fast DataPath

---

diff --git a/.openvswitch.metadata b/.openvswitch.metadata
index af1aabc..0db19ca 100644
--- a/.openvswitch.metadata
+++ b/.openvswitch.metadata
@@ -1,5 +1,5 @@
 002450621b33c5690060345b0aac25bc2426d675 SOURCES/docutils-0.12.tar.gz
-e704a36f712c1c81f253f77d1bd7c60d85b8a7ff SOURCES/dpdk-19.11.1.tar.xz
+435b0b3a5da6d7417d318050e5b50ac400354c60 SOURCES/dpdk-19.11.tar.xz
 0c5f78212173d2cac286f8f78aa95ebdea9e2444 SOURCES/openvswitch-2.13.0.tar.gz
 d34f96421a86004aa5d26ecf975edefd09f948b1 SOURCES/Pygments-1.4.tar.gz
 6beb30f18ffac3de7689b7fd63e9a8a7d9c8df3a SOURCES/Sphinx-1.1.3.tar.gz
diff --git a/SOURCES/arm64-armv8a-linuxapp-gcc-config b/SOURCES/arm64-armv8a-linuxapp-gcc-config
index c219def..06a3d70 100644
--- a/SOURCES/arm64-armv8a-linuxapp-gcc-config
+++ b/SOURCES/arm64-armv8a-linuxapp-gcc-config
@@ -1,4 +1,4 @@
-# -*- cfg-sha: c5b6330ff61c71cf3196f55aad5cc3766b44dd62560396f67c2fee4f7ab46780
+# -*- cfg-sha: bfd08c718502ce9a9d75d102e9b680c4ecf9fb2b14b112aa45899a016d3bc7bb
 # SPDX-License-Identifier: BSD-3-Clause
 # Copyright(c) 2015 Cavium, Inc
 # SPDX-License-Identifier: BSD-3-Clause
@@ -12,7 +12,7 @@ CONFIG_RTE_VER_PREFIX="DPDK"
 # Version information completed when this file is processed for a build
 CONFIG_RTE_VER_YEAR=19
 CONFIG_RTE_VER_MONTH=11
-CONFIG_RTE_VER_MINOR=1
+CONFIG_RTE_VER_MINOR=3
 CONFIG_RTE_VER_SUFFIX=""
 CONFIG_RTE_VER_RELEASE=99
 # RTE_EXEC_ENV values are the directories in mk/exec-env/
@@ -604,4 +604,3 @@ CONFIG_RTE_ARCH_ARM64_MEMCPY=n
 #CONFIG_RTE_ARM64_MEMCPY_STRICT_ALIGN=n
 # NXP PFE PMD Driver
 CONFIG_RTE_TOOLCHAIN_GCC=y
-CONFIG_RTE_LIBRTE_PMD_XENVIRT=n
diff --git a/SOURCES/openvswitch-2.13.0.patch b/SOURCES/openvswitch-2.13.0.patch
index 930e9e4..2776047 100644
--- a/SOURCES/openvswitch-2.13.0.patch
+++ b/SOURCES/openvswitch-2.13.0.patch
@@ -1,1615 +1,41744 @@
-From 4ee0f6af9e601cbb5f69a486526d1011314bbfed Mon Sep 17 00:00:00 2001
-From: Ben Pfaff
-Date: Thu, 19 Mar 2020 17:53:10 -0700
-Subject: [PATCH 01/15] ofproto-dpif-xlate: Fix recirculation when in_port is
- OFPP_CONTROLLER.
-
-[ upstream commit c5a910dd92ecbad24f86b4c59b4ff8105b5149fd ]
-
-Recirculation usually requires finding the pre-recirculation input port.
-Packets sent by the controller, with in_port of OFPP_CONTROLLER or
-OFPP_NONE, do not have a real input port data structure, only a port
-number. The code in xlate_lookup_ofproto_() mishandled this case,
-failing to return the ofproto data structure. This commit fixes the
-problem and adds a test to guard against regression.
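In outline, the fix distinguishes packets that entered on a real port from
packets injected by the controller: only the former have an xport that can be
restored from the frozen state, while the latter resolve just the bridge's
ofproto by UUID. A condensed, non-compilable sketch of the corrected
xlate_lookup_ofproto_() logic (the full change is in the diff that follows):

    /* Sketch; names as in ofproto/ofproto-dpif-xlate.c. */
    ofp_port_t in_port = recirc_id_node->state.metadata.in_port;
    if (in_port != OFPP_NONE && in_port != OFPP_CONTROLLER) {
        /* Real input port: restore the xport recorded in the frozen state. */
        xport = xport_lookup_by_uuid(xcfg, &recirc_id_node->state.xport_uuid);
    } else {
        /* OFPP_NONE and OFPP_CONTROLLER are not real ports (a packet-out
         * from the controller, or a bond-triggered recirculation), so no
         * xport exists; look up only the ofproto via the bridge UUID. */
        const struct xbridge *bridge
            = xbridge_lookup_by_uuid(xcfg, &recirc_id_node->state.ofproto_uuid);
        if (bridge && bridge->ofproto) {
            return bridge->ofproto;
        }
    }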
- -Reported-by: Numan Siddique -Reported-at: https://mail.openvswitch.org/pipermail/ovs-dev/2020-March/368642.html -Tested-by: Numan Siddique -Acked-by: Numan Siddique -Signed-off-by: Ben Pfaff - -Resolves: #1775160 -Signed-off-by: Numan Siddique ---- - ofproto/ofproto-dpif-xlate.c | 25 +++++++++++++++++++++---- - tests/ofproto-dpif.at | 30 ++++++++++++++++++++++++++++++ - 2 files changed, 51 insertions(+), 4 deletions(-) - -diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c -index 4407f9c97a..54cfbfbdff 100644 ---- a/ofproto/ofproto-dpif-xlate.c -+++ b/ofproto/ofproto-dpif-xlate.c -@@ -1516,15 +1516,32 @@ xlate_lookup_ofproto_(const struct dpif_backer *backer, - return NULL; - } +diff --git a/.cirrus.yml b/.cirrus.yml +index 1b32f55d65..9428164eee 100644 +--- a/.cirrus.yml ++++ b/.cirrus.yml +@@ -16,6 +16,7 @@ freebsd_build_task: -- /* If recirculation was initiated due to bond (in_port = OFPP_NONE) -- * then frozen state is static and xport_uuid is not defined, so xport -- * cannot be restored from frozen state. */ -- if (recirc_id_node->state.metadata.in_port != OFPP_NONE) { -+ ofp_port_t in_port = recirc_id_node->state.metadata.in_port; -+ if (in_port != OFPP_NONE && in_port != OFPP_CONTROLLER) { - struct uuid xport_uuid = recirc_id_node->state.xport_uuid; - xport = xport_lookup_by_uuid(xcfg, &xport_uuid); - if (xport && xport->xbridge && xport->xbridge->ofproto) { - goto out; - } -+ } else { -+ /* OFPP_NONE and OFPP_CONTROLLER are not real ports. They indicate -+ * that the packet originated from the controller via an OpenFlow -+ * "packet-out". The right thing to do is to find just the -+ * ofproto. There is no xport, which is OK. -+ * -+ * OFPP_NONE can also indicate that a bond caused recirculation. */ -+ struct uuid uuid = recirc_id_node->state.ofproto_uuid; -+ const struct xbridge *bridge = xbridge_lookup_by_uuid(xcfg, &uuid); -+ if (bridge && bridge->ofproto) { -+ if (errorp) { -+ *errorp = NULL; -+ } -+ *xportp = NULL; -+ if (ofp_in_port) { -+ *ofp_in_port = in_port; -+ } -+ return bridge->ofproto; -+ } - } - } + prepare_script: + - sysctl -w kern.coredump=0 ++ - pkg update -f + - pkg install -y ${DEPENDENCIES} -diff --git a/tests/ofproto-dpif.at b/tests/ofproto-dpif.at -index ff1cc93707..d444cf0844 100644 ---- a/tests/ofproto-dpif.at -+++ b/tests/ofproto-dpif.at -@@ -5171,6 +5171,36 @@ AT_CHECK_UNQUOTED([tail -1 stdout], [0], [Datapath actions: 2 - OVS_VSWITCHD_STOP - AT_CLEANUP + configure_script: +diff --git a/.travis.yml b/.travis.yml +index abd2a9117a..a59371c496 100644 +--- a/.travis.yml ++++ b/.travis.yml +@@ -52,6 +52,18 @@ matrix: + compiler: clang + env: OPTS="--disable-ssl" -+# Checks for regression against a bug in which OVS dropped packets -+# with in_port=CONTROLLER when they were recirculated (because -+# CONTROLLER isn't a real port and could not be looked up). 
-+AT_SETUP([ofproto-dpif - packet-out recirculation]) -+OVS_VSWITCHD_START -+add_of_ports br0 1 2 -+ -+AT_DATA([flows.txt], [dnl -+table=0 ip actions=mod_dl_dst:83:83:83:83:83:83,ct(table=1) -+table=1 ip actions=ct(commit),output:2 -+]) -+AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) ++matrix: ++ include: ++ - env: DEB_PACKAGE=1 ++ addons: ++ apt: ++ packages: ++ - linux-headers-$(uname -r) ++ - build-essential ++ - fakeroot ++ - devscripts ++ - equivs + -+packet=ffffffffffff00102030405008004500001c00000000401100000a000002ffffffff0035111100080000 -+AT_CHECK([ovs-ofctl packet-out br0 "in_port=controller packet=$packet actions=table"]) -+ -+# Dumps out the flow table, extracts the number of packets that have gone -+# through the (single) flow in table 1, and returns success if it's exactly 1. -+# -+# If this remains 0, then the recirculation isn't working properly since the -+# packet never goes through flow in table 1. -+check_flows () { -+ n=$(ovs-ofctl dump-flows br0 table=1 | sed -n 's/.*n_packets=\([[0-9]]\{1,\}\).*/\1/p') -+ echo "n_packets=$n" -+ test "$n" = 1 -+} -+OVS_WAIT_UNTIL([check_flows], [ovs dump-flows br0]) + script: ./.travis/${TRAVIS_OS_NAME}-build.sh $OPTS + + notifications: +diff --git a/.travis/linux-build.sh b/.travis/linux-build.sh +index bb47b3ee19..dd89eab5f8 100755 +--- a/.travis/linux-build.sh ++++ b/.travis/linux-build.sh +@@ -159,13 +159,24 @@ function build_ovs() + fi + } + ++if [ "$DEB_PACKAGE" ]; then ++ mk-build-deps --install --root-cmd sudo --remove debian/control ++ dpkg-checkbuilddeps ++ DEB_BUILD_OPTIONS='parallel=4 nocheck' fakeroot debian/rules binary ++ # Not trying to install ipsec package as there are issues with system-wide ++ # installed python3-openvswitch package and the pyenv used by Travis. ++ packages=$(ls $(pwd)/../*.deb | grep -v ipsec) ++ sudo apt install ${packages} ++ exit 0 ++fi + -+OVS_VSWITCHD_STOP -+AT_CLEANUP + if [ "$KERNEL" ]; then + install_kernel $KERNEL + fi - AT_SETUP([ofproto-dpif - debug_slow action]) - OVS_VSWITCHD_START --- -2.25.1 - - -From 71f25b7920093daa59827a0a4be4095309aec6ff Mon Sep 17 00:00:00 2001 -From: Timothy Redaelli -Date: Thu, 19 Mar 2020 20:05:39 +0100 -Subject: [PATCH 02/15] bugtool: Fix for Python3. - -Currently ovs-bugtool tool doesn't start on Python 3. -This commit fixes ovs-bugtool to make it works on Python 3. - -Replaced StringIO.StringIO with io.BytesIO since the script is -processing binary data. - -Reported-at: https://bugzilla.redhat.com/1809241 -Reported-by: Flavio Leitner -Signed-off-by: Timothy Redaelli -Co-authored-by: William Tu -Signed-off-by: William Tu -(cherry picked from commit 9e6c00bca9af29031d0e160d33174b7ae99b9244) ---- - utilities/bugtool/ovs-bugtool.in | 48 +++++++++++++++++--------------- - 1 file changed, 25 insertions(+), 23 deletions(-) - -diff --git a/utilities/bugtool/ovs-bugtool.in b/utilities/bugtool/ovs-bugtool.in -index e55bfc2ed5..47f3c4629f 100755 ---- a/utilities/bugtool/ovs-bugtool.in -+++ b/utilities/bugtool/ovs-bugtool.in -@@ -33,8 +33,7 @@ - # or func_output(). - # + if [ "$DPDK" ] || [ "$DPDK_SHARED" ]; then + if [ -z "$DPDK_VER" ]; then +- DPDK_VER="19.11" ++ DPDK_VER="19.11.2" + fi + install_dpdk $DPDK_VER + # Enable pdump support in OVS. 
+diff --git a/.travis/linux-prepare.sh b/.travis/linux-prepare.sh +index fda13e7d21..71eb347e89 100755 +--- a/.travis/linux-prepare.sh ++++ b/.travis/linux-prepare.sh +@@ -2,14 +2,22 @@ --import StringIO --import commands -+from io import BytesIO - import fcntl - import getopt - import hashlib -@@ -48,7 +47,7 @@ import warnings - import zipfile - from select import select - from signal import SIGTERM --from subprocess import PIPE, Popen -+from subprocess import PIPE, Popen, check_output + set -ev - from xml.dom.minidom import getDOMImplementation, parse ++if [ "$DEB_PACKAGE" ]; then ++ # We're not using sparse for debian packages, tests are skipped and ++ # all extra dependencies tracked by mk-build-deps. ++ exit 0 ++fi ++ + # Build and install sparse. + # + # Explicitly disable sparse support for llvm because some travis + # environments claim to have LLVM (llvm-config exists and works) but + # linking against it fails. ++# Disabling sqlite support because sindex build fails and we don't ++# really need this utility being installed. + git clone git://git.kernel.org/pub/scm/devel/sparse/sparse.git + cd sparse +-make -j4 HAVE_LLVM= install ++make -j4 HAVE_LLVM= HAVE_SQLITE= install + cd .. -@@ -348,7 +347,7 @@ def collect_data(): - cap = v['cap'] - if 'cmd_args' in v: - if 'output' not in v.keys(): -- v['output'] = StringIOmtime() -+ v['output'] = BytesIOmtime() - if v['repeat_count'] > 0: - if cap not in process_lists: - process_lists[cap] = [] -@@ -373,20 +372,23 @@ def collect_data(): - if 'filename' in v and v['filename'].startswith('/proc/'): - # proc files must be read into memory - try: -- f = open(v['filename'], 'r') -+ f = open(v['filename'], 'rb') - s = f.read() - f.close() - if check_space(cap, v['filename'], len(s)): -- v['output'] = StringIOmtime(s) -+ v['output'] = BytesIOmtime(s) - except: - pass - elif 'func' in v: - try: - s = v['func'](cap) - except Exception as e: -- s = str(e) -+ s = str(e).encode() - if check_space(cap, k, len(s)): -- v['output'] = StringIOmtime(s) -+ if isinstance(s, str): -+ v['output'] = BytesIOmtime(s.encode()) -+ else: -+ v['output'] = BytesIOmtime(s) + pip3 install --disable-pip-version-check --user flake8 hacking +diff --git a/AUTHORS.rst b/AUTHORS.rst +index fe3935fca2..4c8772f63a 100644 +--- a/AUTHORS.rst ++++ b/AUTHORS.rst +@@ -419,6 +419,7 @@ Zhenyu Gao sysugaozhenyu@gmail.com + ZhiPeng Lu luzhipeng@uniudc.com + Zhou Yangchao 1028519445@qq.com + aginwala amginwal@gmail.com ++lzhecheng lzhecheng@vmware.com + parameswaran krishnamurthy parkrish@gmail.com + solomon liwei.solomon@gmail.com + wenxu wenxu@ucloud.cn +@@ -496,6 +497,7 @@ Edwin Chiu echiu@vmware.com + Eivind Bulie Haanaes + Enas Ahmad enas.ahmad@kaust.edu.sa + Eric Lopez ++Frank Wang (王培辉) wangpeihui@inspur.com + Frido Roose fr.roose@gmail.com + Gaetano Catalli gaetano.catalli@gmail.com + Gavin Remaley gavin_remaley@selinc.com +@@ -558,6 +560,7 @@ Krishna Miriyala miriyalak@vmware.com + Krishna Mohan Elluru elluru.kri.mohan@hpe.com + László Sürü laszlo.suru@ericsson.com + Len Gao leng@vmware.com ++Linhaifeng haifeng.lin@huawei.com + Logan Rosen logatronico@gmail.com + Luca Falavigna dktrkranz@debian.org + Luiz Henrique Ozaki luiz.ozaki@gmail.com +@@ -655,6 +658,7 @@ Ying Chen yingchen@vmware.com + Yongqiang Liu liuyq7809@gmail.com + ZHANG Zhiming zhangzhiming@yunshan.net.cn + Zhangguanghui zhang.guanghui@h3c.com ++Zheng Jingzhou glovejmm@163.com + Ziyou Wang ziyouw@vmware.com + ankur dwivedi ankurengg2003@gmail.com + chen zhang 3zhangchen9211@gmail.com +diff --git 
a/Documentation/faq/releases.rst b/Documentation/faq/releases.rst +index 6702c58a2b..41e1315a4c 100644 +--- a/Documentation/faq/releases.rst ++++ b/Documentation/faq/releases.rst +@@ -67,9 +67,10 @@ Q: What Linux kernel versions does each Open vSwitch release work with? + 2.7.x 3.10 to 4.9 + 2.8.x 3.10 to 4.12 + 2.9.x 3.10 to 4.13 +- 2.10.x 3.10 to 4.17 +- 2.11.x 3.10 to 4.18 +- 2.12.x 3.10 to 5.0 ++ 2.10.x 3.16 to 4.17 ++ 2.11.x 3.16 to 4.18 ++ 2.12.x 3.16 to 5.0 ++ 2.13.x 3.16 to 5.0 + ============ ============== + Open vSwitch userspace should also work with the Linux kernel module built +@@ -78,6 +79,10 @@ Q: What Linux kernel versions does each Open vSwitch release work with? + Open vSwitch userspace is not sensitive to the Linux kernel version. It + should build against almost any kernel, certainly against 2.6.32 and later. - def main(argv=None): -@@ -704,7 +706,7 @@ exclude those logs from the archive. ++ Open vSwitch branches 2.10 through 2.13 will still compile against the ++ RHEL and CentOS 7 3.10 based kernels since they have diverged from the ++ Linux kernel.org 3.10 kernels. ++ + Q: Are all features available with all datapaths? - # permit the user to filter out data - # We cannot use iteritems, since we modify 'data' as we pass through -- for (k, v) in sorted(data.items()): -+ for (k, v) in data.items(): - cap = v['cap'] - if 'filename' in v: - key = k[0] -@@ -721,7 +723,7 @@ exclude those logs from the archive. + A: Open vSwitch supports different datapaths on different platforms. Each +@@ -173,9 +178,9 @@ Q: What DPDK version does each Open vSwitch release work with? + A: The following table lists the DPDK version against which the given + versions of Open vSwitch will successfully build. - # include inventory - data['inventory.xml'] = {'cap': None, -- 'output': StringIOmtime(make_inventory(data, subdir))} -+ 'output': BytesIOmtime(make_inventory(data, subdir))} +- ============ ======= ++ ============ ======== + Open vSwitch DPDK +- ============ ======= ++ ============ ======== + 2.2.x 1.6 + 2.3.x 1.6 + 2.4.x 2.0 +@@ -183,11 +188,12 @@ Q: What DPDK version does each Open vSwitch release work with? + 2.6.x 16.07.2 + 2.7.x 16.11.9 + 2.8.x 17.05.2 +- 2.9.x 17.11.4 +- 2.10.x 17.11.4 +- 2.11.x 18.11.5 +- 2.12.x 18.11.5 +- ============ ======= ++ 2.9.x 17.11.10 ++ 2.10.x 17.11.10 ++ 2.11.x 18.11.9 ++ 2.12.x 18.11.9 ++ 2.13.x 19.11.2 ++ ============ ======== - # create archive - if output_fd == -1: -@@ -782,7 +784,7 @@ def dump_scsi_hosts(cap): + Q: Are all the DPDK releases that OVS versions work with maintained? +diff --git a/Documentation/intro/install/dpdk.rst b/Documentation/intro/install/dpdk.rst +index dbf88ec43f..90eaa8aa2c 100644 +--- a/Documentation/intro/install/dpdk.rst ++++ b/Documentation/intro/install/dpdk.rst +@@ -42,7 +42,7 @@ Build requirements + In addition to the requirements described in :doc:`general`, building Open + vSwitch with DPDK will require the following: - def module_info(cap): -- output = StringIO.StringIO() -+ output = BytesIO() - modules = open(PROC_MODULES, 'r') - procs = [] +-- DPDK 19.11 ++- DPDK 19.11.2 -@@ -806,7 +808,7 @@ def multipathd_topology(cap): + - A `DPDK supported NIC`_ +@@ -71,9 +71,9 @@ Install DPDK + #. 
Download the `DPDK sources`_, extract the file and set ``DPDK_DIR``:: - def dp_list(): -- output = StringIO.StringIO() -+ output = BytesIO() - procs = [ProcOutput([OVS_DPCTL, 'dump-dps'], - caps[CAP_NETWORK_STATUS][MAX_TIME], output)] + $ cd /usr/src/ +- $ wget https://fast.dpdk.org/rel/dpdk-19.11.tar.xz +- $ tar xf dpdk-19.11.tar.xz +- $ export DPDK_DIR=/usr/src/dpdk-19.11 ++ $ wget https://fast.dpdk.org/rel/dpdk-19.11.2.tar.xz ++ $ tar xf dpdk-19.11.2.tar.xz ++ $ export DPDK_DIR=/usr/src/dpdk-stable-19.11.2 + $ cd $DPDK_DIR -@@ -828,7 +830,7 @@ def collect_ovsdb(): - if os.path.isfile(OPENVSWITCH_COMPACT_DB): - os.unlink(OPENVSWITCH_COMPACT_DB) + #. (Optional) Configure DPDK as a shared library +diff --git a/Documentation/topics/dpdk/vhost-user.rst b/Documentation/topics/dpdk/vhost-user.rst +index c6c6fd8bde..4bc5aef59d 100644 +--- a/Documentation/topics/dpdk/vhost-user.rst ++++ b/Documentation/topics/dpdk/vhost-user.rst +@@ -392,9 +392,9 @@ To begin, instantiate a guest as described in :ref:`dpdk-vhost-user` or + DPDK sources to VM and build DPDK:: -- output = StringIO.StringIO() -+ output = BytesIO() - max_time = 5 - procs = [ProcOutput(['ovsdb-tool', 'compact', - OPENVSWITCH_CONF_DB, OPENVSWITCH_COMPACT_DB], -@@ -871,7 +873,7 @@ def fd_usage(cap): + $ cd /root/dpdk/ +- $ wget https://fast.dpdk.org/rel/dpdk-19.11.tar.xz +- $ tar xf dpdk-19.11.tar.xz +- $ export DPDK_DIR=/root/dpdk/dpdk-19.11 ++ $ wget https://fast.dpdk.org/rel/dpdk-19.11.2.tar.xz ++ $ tar xf dpdk-19.11.2.tar.xz ++ $ export DPDK_DIR=/root/dpdk/dpdk-stable-19.11.2 + $ export DPDK_TARGET=x86_64-native-linuxapp-gcc + $ export DPDK_BUILD=$DPDK_DIR/$DPDK_TARGET + $ cd $DPDK_DIR +diff --git a/Documentation/topics/userspace-tso.rst b/Documentation/topics/userspace-tso.rst +index 94eddc0b2f..369d70691d 100644 +--- a/Documentation/topics/userspace-tso.rst ++++ b/Documentation/topics/userspace-tso.rst +@@ -91,21 +91,19 @@ The current OvS userspace `TSO` implementation supports flat and VLAN networks + only (i.e. no support for `TSO` over tunneled connection [VxLAN, GRE, IPinIP, + etc.]). ++The NIC driver must support and advertise checksum offload for TCP and UDP. ++However, SCTP is not mandatory because very few drivers advertised support ++and it wasn't a widely used protocol at the moment this feature was introduced ++in Open vSwitch. Currently, if the NIC supports that, then the feature is ++enabled, otherwise TSO can still be enabled but SCTP packets sent to the NIC ++will be dropped. ++ + There is no software implementation of TSO, so all ports attached to the + datapath must support TSO or packets using that feature will be dropped + on ports without TSO support. That also means guests using vhost-user + in client mode will receive TSO packet regardless of TSO being enabled + or disabled within the guest. - def dump_rdac_groups(cap): -- output = StringIO.StringIO() -+ output = BytesIO() - procs = [ProcOutput([MPPUTIL, '-a'], caps[cap][MAX_TIME], output)] +-When the NIC performing the segmentation is using the i40e DPDK PMD, a fix +-must be included in the DPDK build, otherwise TSO will not work. The fix can +-be found on `DPDK patchwork`__. +- +-__ https://patches.dpdk.org/patch/64136/ +- +-This fix is expected to be included in the 19.11.1 release. When OVS migrates +-to this DPDK release, this limitation can be removed. 
+- + ~~~~~~~~~~~~~~~~~~ + Performance Tuning + ~~~~~~~~~~~~~~~~~~ +diff --git a/Makefile.am b/Makefile.am +index b279303d18..27ef9e4b48 100644 +--- a/Makefile.am ++++ b/Makefile.am +@@ -46,7 +46,7 @@ AM_CPPFLAGS += -DNDEBUG + AM_CFLAGS += -fomit-frame-pointer + endif - run_procs([procs]) -@@ -896,7 +898,7 @@ def load_plugins(just_capabilities=False, filter=None): - for node in nodelist: - if node.nodeType == node.TEXT_NODE: - rc += node.data -- return rc.encode() -+ return rc +-AM_CTAGSFLAGS = $(OVS_CTAGS_IDENTIFIERS_LIST) ++AM_CTAGSFLAGS = -I "$(OVS_CTAGS_IDENTIFIERS_LIST)" - def getBoolAttr(el, attr, default=False): - ret = default -@@ -1037,7 +1039,7 @@ def make_tar(subdir, suffix, output_fd, output_file): - s = os.stat(v['filename']) - ti.mtime = s.st_mtime - ti.size = s.st_size -- tf.addfile(ti, open(v['filename'])) -+ tf.addfile(ti, open(v['filename'], 'rb')) - except: - pass - finally: -@@ -1095,12 +1097,12 @@ def make_inventory(inventory, subdir): - s.setAttribute('date', time.strftime('%c')) - s.setAttribute('hostname', platform.node()) - s.setAttribute('uname', ' '.join(platform.uname())) -- s.setAttribute('uptime', commands.getoutput(UPTIME)) -+ s.setAttribute('uptime', check_output(UPTIME).decode()) - document.getElementsByTagName(INVENTORY_XML_ROOT)[0].appendChild(s) + if WIN32 + psep=";" +diff --git a/NEWS b/NEWS +index dab94e924d..128db0f619 100644 +--- a/NEWS ++++ b/NEWS +@@ -1,3 +1,14 @@ ++v2.13.2 - xx xxx xxxx ++--------------------- ++ ++v2.13.1 - 30 Jul 2020 ++--------------------- ++ - Bug fixes ++ - DPDK: ++ * OVS validated with DPDK 19.11.2, due to the inclusion of fixes for ++ CVE-2020-10722, CVE-2020-10723, CVE-2020-10724, CVE-2020-10725 and ++ CVE-2020-10726, this DPDK version is strongly recommended to be used. ++ + v2.13.0 - 14 Feb 2020 + --------------------- + - OVN: +diff --git a/acinclude.m4 b/acinclude.m4 +index c1470ccc6b..7f028836f5 100644 +--- a/acinclude.m4 ++++ b/acinclude.m4 +@@ -250,6 +250,18 @@ AC_DEFUN([OVS_CHECK_LINUX_SCTP_CT], [ + [Define to 1 if SCTP_CONNTRACK_HEARTBEAT_SENT is available.])]) + ]) - map(lambda k_v: inventory_entry(document, subdir, k_v[0], k_v[1]), - inventory.items()) -- return document.toprettyxml() -+ return document.toprettyxml().encode() ++dnl OVS_CHECK_LINUX_VIRTIO_TYPES ++dnl ++dnl Checks for kernels that need virtio_types definition. ++AC_DEFUN([OVS_CHECK_LINUX_VIRTIO_TYPES], [ ++ AC_COMPILE_IFELSE([ ++ AC_LANG_PROGRAM([#include ], [ ++ __virtio16 x = 0; ++ ])], ++ [AC_DEFINE([HAVE_VIRTIO_TYPES], [1], ++ [Define to 1 if __virtio16 is available.])]) ++]) ++ + dnl OVS_FIND_DEPENDENCY(FUNCTION, SEARCH_LIBS, NAME_TO_PRINT) + dnl + dnl Check for a function in a library list. 
+@@ -379,7 +391,6 @@ AC_DEFUN([OVS_CHECK_DPDK], [ + [AC_MSG_RESULT([no])]) + AC_CHECK_DECL([RTE_LIBRTE_MLX5_PMD], [dnl found +- OVS_FIND_DEPENDENCY([mnl_attr_put], [mnl], [libmnl]) + AC_CHECK_DECL([RTE_IBVERBS_LINK_DLOPEN], [], [dnl not found + OVS_FIND_DEPENDENCY([mlx5dv_create_wq], [mlx5], [libmlx5]) + OVS_FIND_DEPENDENCY([verbs_init_cq], [ibverbs], [libibverbs]) +@@ -567,9 +578,14 @@ AC_DEFUN([OVS_CHECK_LINUX_COMPAT], [ + OVS_GREP_IFELSE([$KSRC/include/net/ip6_fib.h], [rt6_get_cookie], + [OVS_DEFINE([HAVE_RT6_GET_COOKIE])]) - def inventory_entry(document, subdir, k, v): -@@ -1301,7 +1303,7 @@ class ProcOutput(object): - line = self.proc.stdout.readline() - else: - line = self.proc.stdout.read(self.bufsize) -- if line == '': -+ if line == b'': - # process exited - self.proc.stdout.close() - self.status = self.proc.wait() -@@ -1391,13 +1393,13 @@ def get_free_disk_space(path): - return s.f_frsize * s.f_bfree ++ OVS_FIND_FIELD_IFELSE([$KSRC/include/net/addrconf.h], [ipv6_stub], ++ [dst_entry]) + OVS_GREP_IFELSE([$KSRC/include/net/addrconf.h], [ipv6_dst_lookup.*net], + [OVS_DEFINE([HAVE_IPV6_DST_LOOKUP_NET])]) ++ OVS_GREP_IFELSE([$KSRC/include/net/addrconf.h], [ipv6_dst_lookup_flow.*net], ++ [OVS_DEFINE([HAVE_IPV6_DST_LOOKUP_FLOW_NET])]) + OVS_GREP_IFELSE([$KSRC/include/net/addrconf.h], [ipv6_stub]) ++ OVS_GREP_IFELSE([$KSRC/include/net/addrconf.h], [ipv6_dst_lookup_flow]) + OVS_GREP_IFELSE([$KSRC/include/linux/err.h], [ERR_CAST]) + OVS_GREP_IFELSE([$KSRC/include/linux/err.h], [IS_ERR_OR_NULL]) +@@ -765,6 +781,10 @@ AC_DEFUN([OVS_CHECK_LINUX_COMPAT], [ + [prandom_u32[[\(]]], + [OVS_DEFINE([HAVE_PRANDOM_U32])]) + OVS_GREP_IFELSE([$KSRC/include/linux/random.h], [prandom_u32_max]) ++ OVS_GREP_IFELSE([$KSRC/include/linux/prandom.h], ++ [prandom_u32[[\(]]], ++ [OVS_DEFINE([HAVE_PRANDOM_U32])]) ++ OVS_GREP_IFELSE([$KSRC/include/linux/prandom.h], [prandom_u32_max]) --class StringIOmtime(StringIO.StringIO): -- def __init__(self, buf=''): -- StringIO.StringIO.__init__(self, buf) -+class BytesIOmtime(BytesIO): -+ def __init__(self, buf=b''): -+ BytesIO.__init__(self, buf) - self.mtime = time.time() + OVS_GREP_IFELSE([$KSRC/include/net/rtnetlink.h], [get_link_net]) + OVS_GREP_IFELSE([$KSRC/include/net/rtnetlink.h], [name_assign_type]) +@@ -1294,11 +1314,11 @@ AC_DEFUN([OVS_ENABLE_SPARSE], - def write(self, s): -- StringIO.StringIO.write(self, s) -+ BytesIO.write(self, s) - self.mtime = time.time() + dnl OVS_CTAGS_IDENTIFIERS + dnl +-dnl ctags ignores symbols with extras identifiers. This builds a list of +-dnl specially handled identifiers to be ignored. ++dnl ctags ignores symbols with extras identifiers. This is a list of ++dnl specially handled identifiers to be ignored. [ctags(1) -I ]. + AC_DEFUN([OVS_CTAGS_IDENTIFIERS], + AC_SUBST([OVS_CTAGS_IDENTIFIERS_LIST], +- [`printf %s '-I "'; sed -n 's/^#define \(OVS_[A-Z_]\+\)(\.\.\.)$/\1+/p' ${srcdir}/include/openvswitch/compiler.h | tr \\\n ' ' ; printf '"'`] )) ++ ["OVS_LOCKABLE OVS_NO_THREAD_SAFETY_ANALYSIS OVS_REQ_RDLOCK+ OVS_ACQ_RDLOCK+ OVS_REQ_WRLOCK+ OVS_ACQ_WRLOCK+ OVS_REQUIRES+ OVS_ACQUIRES+ OVS_TRY_WRLOCK+ OVS_TRY_RDLOCK+ OVS_TRY_LOCK+ OVS_GUARDED_BY+ OVS_EXCLUDED+ OVS_RELEASES+ OVS_ACQ_BEFORE+ OVS_ACQ_AFTER+"])) + dnl OVS_PTHREAD_SET_NAME + dnl +diff --git a/configure.ac b/configure.ac +index 92b52f6712..67942bbfb7 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -13,7 +13,7 @@ + # limitations under the License. 
--- -2.25.1 - - -From 914d885061c9f7e7e6e5f921065301e08837e122 Mon Sep 17 00:00:00 2001 -From: Han Zhou -Date: Fri, 28 Feb 2020 18:07:04 -0800 -Subject: [PATCH 03/15] raft-rpc: Fix message format. - -[ upstream commit 78c8011f58daec41ec97440f2e42795699322742 ] - -Signed-off-by: Han Zhou -Signed-off-by: Ben Pfaff - -Resolves: #1836305 -Signed-off-by: Dumitru Ceara ---- - ovsdb/raft-rpc.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/ovsdb/raft-rpc.c b/ovsdb/raft-rpc.c -index 18c83fe9c2..dd14d81091 100644 ---- a/ovsdb/raft-rpc.c -+++ b/ovsdb/raft-rpc.c -@@ -544,8 +544,8 @@ raft_format_install_snapshot_request( - ds_put_format(s, " last_index=%"PRIu64, rq->last_index); - ds_put_format(s, " last_term=%"PRIu64, rq->last_term); - ds_put_format(s, " last_eid="UUID_FMT, UUID_ARGS(&rq->last_eid)); -- ds_put_cstr(s, " last_servers="); - ds_put_format(s, " election_timer=%"PRIu64, rq->election_timer); -+ ds_put_cstr(s, " last_servers="); + AC_PREREQ(2.63) +-AC_INIT(openvswitch, 2.13.0, bugs@openvswitch.org) ++AC_INIT(openvswitch, 2.13.2, bugs@openvswitch.org) + AC_CONFIG_SRCDIR([datapath/datapath.c]) + AC_CONFIG_MACRO_DIR([m4]) + AC_CONFIG_AUX_DIR([build-aux]) +@@ -188,6 +188,7 @@ OVS_CHECK_LINUX + OVS_CHECK_LINUX_NETLINK + OVS_CHECK_LINUX_TC + OVS_CHECK_LINUX_SCTP_CT ++OVS_CHECK_LINUX_VIRTIO_TYPES + OVS_CHECK_DPDK + OVS_CHECK_PRAGMA_MESSAGE + AC_SUBST([OVS_CFLAGS]) +diff --git a/datapath-windows/ovsext/Actions.c b/datapath-windows/ovsext/Actions.c +index 5c9b5c3a0c..ced1d2957d 100644 +--- a/datapath-windows/ovsext/Actions.c ++++ b/datapath-windows/ovsext/Actions.c +@@ -1259,6 +1259,7 @@ OvsActionMplsPush(OvsForwardingContext *ovsFwdCtx, + */ + static __inline NDIS_STATUS + OvsUpdateEthHeader(OvsForwardingContext *ovsFwdCtx, ++ OvsFlowKey *key, + const struct ovs_key_ethernet *ethAttr) + { + PNET_BUFFER curNb; +@@ -1285,9 +1286,11 @@ OvsUpdateEthHeader(OvsForwardingContext *ovsFwdCtx, + } + ethHdr = (EthHdr *)(bufferStart + NET_BUFFER_CURRENT_MDL_OFFSET(curNb)); - struct hmap servers; - struct ovsdb_error *error = --- -2.25.1 - - -From 8ff30dfee6cb075e36ed38b77695ff03321ce12b Mon Sep 17 00:00:00 2001 -From: Han Zhou -Date: Fri, 28 Feb 2020 18:07:05 -0800 -Subject: [PATCH 04/15] ovsdb-server: Don't disconnect clients after raft - install_snapshot. - -[ upstream commit f0c8b44c5832c36989fad78927407fc14e64ce46 ] - -When "schema" field is found in read_db(), there can be two cases: -1. There is a schema change in clustered DB and the "schema" is the new one. -2. There is a install_snapshot RPC happened, which caused log compaction on the -server and the next log is just the snapshot, which always constains "schema" -field, even though the schema hasn't been changed. - -The current implementation doesn't handle case 2), and always assume the schema -is changed hence disconnect all clients of the server. It can cause stability -problem when there are big number of clients connected when this happens in -a large scale environment. 
- -Signed-off-by: Han Zhou -Signed-off-by: Ben Pfaff - -Resolves: #1836305 -Signed-off-by: Dumitru Ceara ---- - ovsdb/ovsdb-server.c | 3 ++- - tests/ovsdb-cluster.at | 56 ++++++++++++++++++++++++++++++++++++++++++ - 2 files changed, 58 insertions(+), 1 deletion(-) - -diff --git a/ovsdb/ovsdb-server.c b/ovsdb/ovsdb-server.c -index b6957d7300..d416f1b606 100644 ---- a/ovsdb/ovsdb-server.c -+++ b/ovsdb/ovsdb-server.c -@@ -543,7 +543,8 @@ parse_txn(struct server_config *config, struct db *db, - struct ovsdb_schema *schema, const struct json *txn_json, - const struct uuid *txnid) +- RtlCopyMemory(ethHdr->Destination, ethAttr->eth_dst, +- sizeof ethHdr->Destination); +- RtlCopyMemory(ethHdr->Source, ethAttr->eth_src, sizeof ethHdr->Source); ++ RtlCopyMemory(ethHdr->Destination, ethAttr->eth_dst, ETH_ADDR_LENGTH); ++ RtlCopyMemory(ethHdr->Source, ethAttr->eth_src, ETH_ADDR_LENGTH); ++ /* Update l2 flow key */ ++ RtlCopyMemory(key->l2.dlDst, ethAttr->eth_dst, ETH_ADDR_LENGTH); ++ RtlCopyMemory(key->l2.dlSrc, ethAttr->eth_src, ETH_ADDR_LENGTH); + + return NDIS_STATUS_SUCCESS; + } +@@ -1376,6 +1379,7 @@ PUINT8 OvsGetHeaderBySize(OvsForwardingContext *ovsFwdCtx, + */ + NDIS_STATUS + OvsUpdateUdpPorts(OvsForwardingContext *ovsFwdCtx, ++ OvsFlowKey *key, + const struct ovs_key_udp *udpAttr) { -- if (schema) { -+ if (schema && (!db->db->schema || strcmp(schema->version, -+ db->db->schema->version))) { - /* We're replacing the schema (and the data). Destroy the database - * (first grabbing its storage), then replace it with the new schema. - * The transaction must also include the replacement data. -diff --git a/tests/ovsdb-cluster.at b/tests/ovsdb-cluster.at -index 3a0bd4579e..5b6188b96f 100644 ---- a/tests/ovsdb-cluster.at -+++ b/tests/ovsdb-cluster.at -@@ -273,6 +273,62 @@ OVS_WAIT_UNTIL([ovs-appctl -t "`pwd`"/s4 cluster/status $schema_name | grep "Ele + PUINT8 bufferStart; +@@ -1400,15 +1404,19 @@ OvsUpdateUdpPorts(OvsForwardingContext *ovsFwdCtx, + udpHdr->check = ChecksumUpdate16(udpHdr->check, udpHdr->source, + udpAttr->udp_src); + udpHdr->source = udpAttr->udp_src; ++ key->ipKey.l4.tpSrc = udpAttr->udp_src; + } + if (udpHdr->dest != udpAttr->udp_dst) { + udpHdr->check = ChecksumUpdate16(udpHdr->check, udpHdr->dest, + udpAttr->udp_dst); + udpHdr->dest = udpAttr->udp_dst; ++ key->ipKey.l4.tpDst = udpAttr->udp_dst; + } + } else { + udpHdr->source = udpAttr->udp_src; ++ key->ipKey.l4.tpSrc = udpAttr->udp_src; + udpHdr->dest = udpAttr->udp_dst; ++ key->ipKey.l4.tpDst = udpAttr->udp_dst; + } - AT_CLEANUP + return NDIS_STATUS_SUCCESS; +@@ -1423,6 +1431,7 @@ OvsUpdateUdpPorts(OvsForwardingContext *ovsFwdCtx, + */ + NDIS_STATUS + OvsUpdateTcpPorts(OvsForwardingContext *ovsFwdCtx, ++ OvsFlowKey *key, + const struct ovs_key_tcp *tcpAttr) + { + PUINT8 bufferStart; +@@ -1447,11 +1456,13 @@ OvsUpdateTcpPorts(OvsForwardingContext *ovsFwdCtx, + tcpHdr->check = ChecksumUpdate16(tcpHdr->check, tcpHdr->source, + tcpAttr->tcp_src); + tcpHdr->source = tcpAttr->tcp_src; ++ key->ipKey.l4.tpSrc = tcpAttr->tcp_src; + } + if (tcpHdr->dest != tcpAttr->tcp_dst) { + tcpHdr->check = ChecksumUpdate16(tcpHdr->check, tcpHdr->dest, + tcpAttr->tcp_dst); + tcpHdr->dest = tcpAttr->tcp_dst; ++ key->ipKey.l4.tpDst = tcpAttr->tcp_dst; + } -+ -+AT_BANNER([OVSDB cluster install snapshot RPC]) -+ -+AT_SETUP([OVSDB cluster - install snapshot RPC]) -+AT_KEYWORDS([ovsdb server positive unix cluster snapshot]) -+ -+n=3 -+schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema` -+ordinal_schema > schema -+AT_CHECK([ovsdb-tool 
'-vPATTERN:console:%c|%p|%m' create-cluster s1.db $abs_srcdir/idltest.ovsschema unix:s1.raft], [0], [], [stderr]) -+cid=`ovsdb-tool db-cid s1.db` -+schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema` -+for i in `seq 2 $n`; do -+ AT_CHECK([ovsdb-tool join-cluster s$i.db $schema_name unix:s$i.raft unix:s1.raft]) -+done -+ -+on_exit 'kill `cat *.pid`' -+for i in `seq $n`; do -+ AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off --detach --no-chdir --log-file=s$i.log --pidfile=s$i.pid --unixctl=s$i --remote=punix:s$i.ovsdb s$i.db]) -+done -+for i in `seq $n`; do -+ AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected]) -+done -+ -+# Kill one follower (s2) and write some data to cluster, so that the follower is falling behind -+printf "\ns2: stopping\n" -+OVS_APP_EXIT_AND_WAIT_BY_TARGET([`pwd`/s2], [s2.pid]) -+ -+AT_CHECK([ovsdb-client transact unix:s1.ovsdb '[["idltest", -+ {"op": "insert", -+ "table": "simple", -+ "row": {"i": 1}}]]'], [0], [ignore], [ignore]) -+ -+# Compact leader online to generate snapshot -+AT_CHECK([ovs-appctl -t "`pwd`"/s1 ovsdb-server/compact]) -+ -+# Start the follower s2 again. -+AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off --detach --no-chdir --log-file=s2.log --pidfile=s2.pid --unixctl=s2 --remote=punix:s2.ovsdb s2.db]) -+AT_CHECK([ovsdb_client_wait unix:s2.ovsdb $schema_name connected]) -+ -+# A client transaction through s2. During this transaction, there will be a -+# install_snapshot RPC because s2 detects it is behind and s1 doesn't have the -+# pre_log_index requested by s2 because it is already compacted. -+# After the install_snapshot RPC process, the transaction through s2 should -+# succeed. -+AT_CHECK([ovsdb-client transact unix:s2.ovsdb '[["idltest", -+ {"op": "insert", -+ "table": "simple", -+ "row": {"i": 1}}]]'], [0], [ignore], [ignore]) -+ -+for i in `seq $n`; do -+ OVS_APP_EXIT_AND_WAIT_BY_TARGET([`pwd`/s$i], [s$i.pid]) -+done -+ -+AT_CLEANUP -+ - + return NDIS_STATUS_SUCCESS; +@@ -1579,6 +1590,7 @@ OvsUpdateAddressAndPort(OvsForwardingContext *ovsFwdCtx, + */ + NDIS_STATUS + OvsUpdateIPv4Header(OvsForwardingContext *ovsFwdCtx, ++ OvsFlowKey *key, + const struct ovs_key_ipv4 *ipAttr) + { + PUINT8 bufferStart; +@@ -1632,6 +1644,7 @@ OvsUpdateIPv4Header(OvsForwardingContext *ovsFwdCtx, + ipAttr->ipv4_src); + } + ipHdr->saddr = ipAttr->ipv4_src; ++ key->ipKey.nwSrc = ipAttr->ipv4_src; + } + if (ipHdr->daddr != ipAttr->ipv4_dst) { + if (tcpHdr) { +@@ -1647,6 +1660,7 @@ OvsUpdateIPv4Header(OvsForwardingContext *ovsFwdCtx, + ipAttr->ipv4_dst); + } + ipHdr->daddr = ipAttr->ipv4_dst; ++ key->ipKey.nwDst = ipAttr->ipv4_dst; + } + if (ipHdr->protocol != ipAttr->ipv4_proto) { + UINT16 oldProto = (ipHdr->protocol << 16) & 0xff00; +@@ -1661,6 +1675,7 @@ OvsUpdateIPv4Header(OvsForwardingContext *ovsFwdCtx, + ipHdr->check = ChecksumUpdate16(ipHdr->check, oldProto, newProto); + } + ipHdr->protocol = ipAttr->ipv4_proto; ++ key->ipKey.nwProto = ipAttr->ipv4_proto; + } + if (ipHdr->ttl != ipAttr->ipv4_ttl) { + UINT16 oldTtl = (ipHdr->ttl) & 0xff; +@@ -1669,6 +1684,7 @@ OvsUpdateIPv4Header(OvsForwardingContext *ovsFwdCtx, + ipHdr->check = ChecksumUpdate16(ipHdr->check, oldTtl, newTtl); + } + ipHdr->ttl = ipAttr->ipv4_ttl; ++ key->ipKey.nwTtl = ipAttr->ipv4_ttl; + } - OVS_START_SHELL_HELPERS --- -2.25.1 - - -From e732012d7be335650398ff03c2431c64b2c4aaba Mon Sep 17 00:00:00 2001 -From: Han Zhou -Date: Fri, 28 Feb 2020 18:07:06 -0800 -Subject: [PATCH 05/15] raft: Fix raft_is_connected() when there is no leader - yet. 
- -[ upstream commit adc64ab057345f7004c44bf92363b9adda862134 ] - -If there is never a leader known by the current server, it's status -should be "disconnected" to the cluster. Without this patch, when -a server in cluster is restarted, before it successfully connecting -back to the cluster it will appear as connected, which is wrong. - -Signed-off-by: Han Zhou -Signed-off-by: Ben Pfaff - -Resolves: #1836305 -Signed-off-by: Dumitru Ceara ---- - ovsdb/raft.c | 10 ++++++++-- - tests/ovsdb-cluster.at | 35 +++++++++++++++++++++++++++++++++++ - 2 files changed, 43 insertions(+), 2 deletions(-) - -diff --git a/ovsdb/raft.c b/ovsdb/raft.c -index 4789bc4f22..6cd7b0041a 100644 ---- a/ovsdb/raft.c -+++ b/ovsdb/raft.c -@@ -298,6 +298,11 @@ struct raft { - bool had_leader; /* There has been leader elected since last - election initiated. This is to help setting - candidate_retrying. */ -+ -+ /* For all. */ -+ bool ever_had_leader; /* There has been leader elected since the raft -+ is initialized, meaning it is ever -+ connected. */ - }; + return NDIS_STATUS_SUCCESS; +@@ -1691,12 +1707,12 @@ OvsExecuteSetAction(OvsForwardingContext *ovsFwdCtx, - /* All Raft structures. */ -@@ -1024,7 +1029,8 @@ raft_is_connected(const struct raft *raft) - && !raft->joining - && !raft->leaving - && !raft->left -- && !raft->failed); -+ && !raft->failed -+ && raft->ever_had_leader); - VLOG_DBG("raft_is_connected: %s\n", ret? "true": "false"); - return ret; - } -@@ -2519,7 +2525,7 @@ static void - raft_set_leader(struct raft *raft, const struct uuid *sid) + switch (type) { + case OVS_KEY_ATTR_ETHERNET: +- status = OvsUpdateEthHeader(ovsFwdCtx, ++ status = OvsUpdateEthHeader(ovsFwdCtx, key, + NlAttrGetUnspec(a, sizeof(struct ovs_key_ethernet))); + break; + + case OVS_KEY_ATTR_IPV4: +- status = OvsUpdateIPv4Header(ovsFwdCtx, ++ status = OvsUpdateIPv4Header(ovsFwdCtx, key, + NlAttrGetUnspec(a, sizeof(struct ovs_key_ipv4))); + break; + +@@ -1709,16 +1725,17 @@ OvsExecuteSetAction(OvsForwardingContext *ovsFwdCtx, + status = SUCCEEDED(convertStatus) ? 
NDIS_STATUS_SUCCESS : NDIS_STATUS_FAILURE; + ASSERT(status == NDIS_STATUS_SUCCESS); + RtlCopyMemory(&ovsFwdCtx->tunKey, &tunKey, sizeof ovsFwdCtx->tunKey); ++ RtlCopyMemory(&key->tunKey, &tunKey, sizeof key->tunKey); + break; + } + + case OVS_KEY_ATTR_UDP: +- status = OvsUpdateUdpPorts(ovsFwdCtx, ++ status = OvsUpdateUdpPorts(ovsFwdCtx, key, + NlAttrGetUnspec(a, sizeof(struct ovs_key_udp))); + break; + + case OVS_KEY_ATTR_TCP: +- status = OvsUpdateTcpPorts(ovsFwdCtx, ++ status = OvsUpdateTcpPorts(ovsFwdCtx, key, + NlAttrGetUnspec(a, sizeof(struct ovs_key_tcp))); + break; + +diff --git a/datapath-windows/ovsext/Actions.h b/datapath-windows/ovsext/Actions.h +index fd050d5dd8..bc12e1166d 100644 +--- a/datapath-windows/ovsext/Actions.h ++++ b/datapath-windows/ovsext/Actions.h +@@ -115,14 +115,17 @@ PUINT8 OvsGetHeaderBySize(OvsForwardingContext *ovsFwdCtx, + + NDIS_STATUS + OvsUpdateUdpPorts(OvsForwardingContext *ovsFwdCtx, ++ OvsFlowKey *key, + const struct ovs_key_udp *udpAttr); + + NDIS_STATUS + OvsUpdateTcpPorts(OvsForwardingContext *ovsFwdCtx, ++ OvsFlowKey *key, + const struct ovs_key_tcp *tcpAttr); + + NDIS_STATUS + OvsUpdateIPv4Header(OvsForwardingContext *ovsFwdCtx, ++ OvsFlowKey *key, + const struct ovs_key_ipv4 *ipAttr); + + NDIS_STATUS +diff --git a/datapath-windows/ovsext/Conntrack-other.c b/datapath-windows/ovsext/Conntrack-other.c +index 962cc8ac65..8580415a6b 100644 +--- a/datapath-windows/ovsext/Conntrack-other.c ++++ b/datapath-windows/ovsext/Conntrack-other.c +@@ -49,17 +49,19 @@ OvsConntrackUpdateOtherEntry(OVS_CT_ENTRY *conn_, { - raft->leader_sid = *sid; -- raft->had_leader = true; -+ raft->ever_had_leader = raft->had_leader = true; - raft->candidate_retrying = false; + ASSERT(conn_); + struct conn_other *conn = OvsCastConntrackEntryToOtherEntry(conn_); ++ enum CT_UPDATE_RES ret = CT_UPDATE_VALID; + + if (reply && conn->state != OTHERS_BIDIR) { + conn->state = OTHERS_BIDIR; + } else if (conn->state == OTHERS_FIRST) { + conn->state = OTHERS_MULTIPLE; ++ ret = CT_UPDATE_VALID_NEW; + } + + OvsConntrackUpdateExpiration(&conn->up, now, + other_timeouts[conn->state]); + +- return CT_UPDATE_VALID; ++ return ret; } -diff --git a/tests/ovsdb-cluster.at b/tests/ovsdb-cluster.at -index 5b6188b96f..0aa4564480 100644 ---- a/tests/ovsdb-cluster.at -+++ b/tests/ovsdb-cluster.at -@@ -179,6 +179,41 @@ AT_KEYWORDS([ovsdb server negative unix cluster disconnect]) - ovsdb_test_cluster_disconnect 5 leader yes - AT_CLEANUP + OVS_CT_ENTRY * +diff --git a/datapath-windows/ovsext/Conntrack-tcp.c b/datapath-windows/ovsext/Conntrack-tcp.c +index eda42ac823..a468c3e6bc 100644 +--- a/datapath-windows/ovsext/Conntrack-tcp.c ++++ b/datapath-windows/ovsext/Conntrack-tcp.c +@@ -213,11 +213,17 @@ OvsConntrackUpdateTcpEntry(OVS_CT_ENTRY* conn_, + return CT_UPDATE_INVALID; + } -+AT_SETUP([OVSDB cluster - initial status should be disconnected]) -+AT_KEYWORDS([ovsdb server negative unix cluster disconnect]) -+ -+n=3 -+schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema` -+ordinal_schema > schema -+AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster s1.db $abs_srcdir/idltest.ovsschema unix:s1.raft], [0], [], [stderr]) -+cid=`ovsdb-tool db-cid s1.db` -+schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema` -+for i in `seq 2 $n`; do -+ AT_CHECK([ovsdb-tool join-cluster s$i.db $schema_name unix:s$i.raft unix:s1.raft]) -+done -+ -+on_exit 'kill `cat *.pid`' -+for i in `seq $n`; do -+ AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off --detach --no-chdir --log-file=s$i.log 
--pidfile=s$i.pid --unixctl=s$i --remote=punix:s$i.ovsdb s$i.db]) -+done -+for i in `seq $n`; do -+ AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected]) -+done +- if (((tcp_flags & (TCP_SYN|TCP_ACK)) == TCP_SYN) +- && dst->state >= CT_DPIF_TCPS_FIN_WAIT_2 ++ if ((tcp_flags & (TCP_SYN|TCP_ACK)) == TCP_SYN) { ++ if (dst->state >= CT_DPIF_TCPS_FIN_WAIT_2 + && src->state >= CT_DPIF_TCPS_FIN_WAIT_2) { +- src->state = dst->state = CT_DPIF_TCPS_CLOSED; +- return CT_UPDATE_NEW; ++ src->state = dst->state = CT_DPIF_TCPS_CLOSED; ++ return CT_UPDATE_NEW; ++ } else if (src->state <= CT_DPIF_TCPS_SYN_SENT) { ++ src->state = CT_DPIF_TCPS_SYN_SENT; ++ OvsConntrackUpdateExpiration(&conn->up, now, ++ 30 * CT_INTERVAL_SEC); ++ return CT_UPDATE_VALID_NEW; ++ } + } + + if (src->wscale & CT_WSCALE_FLAG +diff --git a/datapath-windows/ovsext/Conntrack.c b/datapath-windows/ovsext/Conntrack.c +index ba5611697a..55917c43ff 100644 +--- a/datapath-windows/ovsext/Conntrack.c ++++ b/datapath-windows/ovsext/Conntrack.c +@@ -753,6 +753,9 @@ OvsProcessConntrackEntry(OvsForwardingContext *fwdCtx, + return NULL; + } + break; ++ case CT_UPDATE_VALID_NEW: ++ state |= OVS_CS_F_NEW; ++ break; + } + } + if (entry) { +diff --git a/datapath-windows/ovsext/Conntrack.h b/datapath-windows/ovsext/Conntrack.h +index bc6580d708..b0932186af 100644 +--- a/datapath-windows/ovsext/Conntrack.h ++++ b/datapath-windows/ovsext/Conntrack.h +@@ -56,6 +56,7 @@ typedef enum CT_UPDATE_RES { + CT_UPDATE_INVALID, + CT_UPDATE_VALID, + CT_UPDATE_NEW, ++ CT_UPDATE_VALID_NEW, + } CT_UPDATE_RES; + + /* Metadata mark for masked write to conntrack mark */ +diff --git a/datapath/linux/compat/geneve.c b/datapath/linux/compat/geneve.c +index c044b14896..bf995aa83a 100644 +--- a/datapath/linux/compat/geneve.c ++++ b/datapath/linux/compat/geneve.c +@@ -962,14 +962,26 @@ static struct dst_entry *geneve_get_v6_dst(struct sk_buff *skb, + return dst; + } + +-#ifdef HAVE_IPV6_DST_LOOKUP_NET +- if (ipv6_stub->ipv6_dst_lookup(geneve->net, gs6->sock->sk, &dst, fl6)) { ++#if defined(HAVE_IPV6_STUB_WITH_DST_ENTRY) && defined(HAVE_IPV6_DST_LOOKUP_FLOW) ++#ifdef HAVE_IPV6_DST_LOOKUP_FLOW_NET ++ dst = ipv6_stub->ipv6_dst_lookup_flow(geneve->net, gs6->sock->sk, fl6, ++ NULL); + #else +-#ifdef HAVE_IPV6_STUB ++ dst = ipv6_stub->ipv6_dst_lookup_flow(gs6->sock->sk, fl6, ++ NULL); ++#endif ++ if (IS_ERR(dst)) { ++#elif defined(HAVE_IPV6_DST_LOOKUP_FLOW_NET) ++ if (ipv6_stub->ipv6_dst_lookup_flow(geneve->net, gs6->sock->sk, &dst, ++ fl6)) { ++#elif defined(HAVE_IPV6_DST_LOOKUP_FLOW) ++ if (ipv6_stub->ipv6_dst_lookup_flow(gs6->sock->sk, &dst, fl6)) { ++#elif defined(HAVE_IPV6_DST_LOOKUP_NET) ++ if (ipv6_stub->ipv6_dst_lookup(geneve->net, gs6->sock->sk, &dst, fl6)) { ++#elif defined(HAVE_IPV6_STUB) + if (ipv6_stub->ipv6_dst_lookup(gs6->sock->sk, &dst, fl6)) { + #else + if (ip6_dst_lookup(gs6->sock->sk, &dst, fl6)) { +-#endif + #endif + netdev_dbg(dev, "no route to %pI6\n", &fl6->daddr); + return ERR_PTR(-ENETUNREACH); +diff --git a/datapath/linux/compat/vxlan.c b/datapath/linux/compat/vxlan.c +index 23118e8b63..05ccfb9288 100644 +--- a/datapath/linux/compat/vxlan.c ++++ b/datapath/linux/compat/vxlan.c +@@ -967,7 +967,10 @@ static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan, + bool use_cache = (dst_cache && ip_tunnel_dst_cache_usable(skb, info)); + struct dst_entry *ndst; + struct flowi6 fl6; ++#if !defined(HAVE_IPV6_STUB_WITH_DST_ENTRY) || \ ++ !defined(HAVE_IPV6_DST_LOOKUP_FLOW) + int err; ++#endif + + if (!sock6) + return ERR_PTR(-EIO); +@@ -990,20 
+993,35 @@ static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan, + fl6.fl6_dport = dport; + fl6.fl6_sport = sport; + +-#ifdef HAVE_IPV6_DST_LOOKUP_NET +- err = ipv6_stub->ipv6_dst_lookup(vxlan->net, +- sock6->sock->sk, +- &ndst, &fl6); ++#if defined(HAVE_IPV6_STUB_WITH_DST_ENTRY) && defined(HAVE_IPV6_DST_LOOKUP_FLOW) ++#ifdef HAVE_IPV6_DST_LOOKUP_FLOW_NET ++ ndst = ipv6_stub->ipv6_dst_lookup_flow(vxlan->net, sock6->sock->sk, ++ &fl6, NULL); + #else +-#ifdef HAVE_IPV6_STUB ++ ndst = ipv6_stub->ipv6_dst_lookup_flow(sock6->sock->sk, &fl6, NULL); ++#endif ++ if (unlikely(IS_ERR(ndst))) { ++#elif defined(HAVE_IPV6_DST_LOOKUP_FLOW_NET) ++ err = ipv6_stub->ipv6_dst_lookup_flow(vxlan->net, sock6->sock->sk, ++ &ndst, &fl6); ++#elif defined(HAVE_IPV6_DST_LOOKUP_FLOW) ++ err = ipv6_stub->ipv6_dst_lookup_flow(sock6->sock->sk, &ndst, &fl6); ++#elif defined(HAVE_IPV6_DST_LOOKUP_NET) ++ err = ipv6_stub->ipv6_dst_lookup(vxlan->net, sock6->sock->sk, ++ &ndst, &fl6); ++#elif defined(HAVE_IPV6_STUB) + err = ipv6_stub->ipv6_dst_lookup(vxlan->vn6_sock->sock->sk, + &ndst, &fl6); + #else + err = ip6_dst_lookup(vxlan->vn6_sock->sock->sk, &ndst, &fl6); + #endif +-#endif ++#if defined(HAVE_IPV6_STUB_WITH_DST_ENTRY) && defined(HAVE_IPV6_DST_LOOKUP_FLOW) ++ return ERR_PTR(-ENETUNREACH); ++ } ++#else + if (err < 0) + return ERR_PTR(err); ++#endif + + *saddr = fl6.saddr; + if (use_cache) +diff --git a/debian/changelog b/debian/changelog +index 8e075bc98b..d803cf10d1 100644 +--- a/debian/changelog ++++ b/debian/changelog +@@ -1,3 +1,15 @@ ++openvswitch (2.13.2-1) unstable; urgency=low ++ [ Open vSwitch team ] ++ * New upstream version + -+# Stop all servers, and start the s1 only, to test initial connection status -+# when there is no leader yet. -+for i in `seq 1 $n`; do -+ OVS_APP_EXIT_AND_WAIT_BY_TARGET([`pwd`/s$i], [s$i.pid]) -+done -+i=1 -+AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off --detach --no-chdir --log-file=s$i.log --pidfile=s$i.pid --unixctl=s$i --remote=punix:s$i.ovsdb s$i.db]) ++ -- Open vSwitch team Thu, 30 Jul 2020 00:25:23 +0200 + -+# The initial status should be disconnected. So wait should fail. 
-+AT_CHECK([ovsdb_client_wait --timeout=1 unix:s$i.ovsdb $schema_name connected], [142], [ignore], [ignore]) -+OVS_APP_EXIT_AND_WAIT_BY_TARGET([`pwd`/s$i], [s$i.pid]) ++openvswitch (2.13.1-1) unstable; urgency=low ++ [ Open vSwitch team] ++ * New upstream version + -+AT_CLEANUP ++ -- Open vSwitch team Thu, 30 Jul 2020 00:25:23 +0200 + - + openvswitch (2.13.0-1) unstable; urgency=low + [ Open vSwitch team] + * New upstream version +diff --git a/debian/control b/debian/control +index a50e97249f..6420b9d3e2 100644 +--- a/debian/control ++++ b/debian/control +@@ -14,8 +14,9 @@ Build-Depends: graphviz, + openssl, + procps, + python3-all, +- python3-twisted-conch, +- python3-zopeinterface, ++ python3-sphinx, ++ python3-twisted, ++ python3-zope.interface, + libunbound-dev, + libunwind-dev + Standards-Version: 3.9.3 +@@ -187,7 +188,7 @@ Description: Python bindings for Open vSwitch + Package: openvswitch-test + Architecture: all + Depends: python3, +- python3-twisted-web, ++ python3-twisted, + ${misc:Depends}, + ${python3:Depends} + Description: Open vSwitch test package +diff --git a/debian/openvswitch-common.manpages b/debian/openvswitch-common.manpages +index 9ac6a1dd6d..95004122cc 100644 +--- a/debian/openvswitch-common.manpages ++++ b/debian/openvswitch-common.manpages +@@ -1,7 +1,7 @@ + ovsdb/ovsdb-client.1 + ovsdb/ovsdb-tool.1 + utilities/bugtool/ovs-bugtool.8 +-utilities/ovs-appctl.8 ++debian/tmp/usr/share/man/man8/ovs-appctl.8 + utilities/ovs-ofctl.8 +-utilities/ovs-parse-backtrace.8 +-utilities/ovs-pki.8 ++debian/tmp/usr/share/man/man8/ovs-parse-backtrace.8 ++debian/tmp/usr/share/man/man8/ovs-pki.8 +diff --git a/debian/openvswitch-switch.manpages b/debian/openvswitch-switch.manpages +index 1161cfda77..7fd7bc55da 100644 +--- a/debian/openvswitch-switch.manpages ++++ b/debian/openvswitch-switch.manpages +@@ -1,12 +1,12 @@ + ovsdb/ovsdb-server.1 + ovsdb/ovsdb-server.5 +-utilities/ovs-ctl.8 ++debian/tmp/usr/share/man/man8/ovs-ctl.8 + utilities/ovs-dpctl-top.8 + utilities/ovs-dpctl.8 + utilities/ovs-kmod-ctl.8 + utilities/ovs-pcap.1 +-utilities/ovs-tcpdump.8 +-utilities/ovs-tcpundump.1 ++debian/tmp/usr/share/man/man8/ovs-tcpdump.8 ++debian/tmp/usr/share/man/man1/ovs-tcpundump.1 + utilities/ovs-vsctl.8 + vswitchd/ovs-vswitchd.8 + vswitchd/ovs-vswitchd.conf.db.5 +diff --git a/debian/openvswitch-test.manpages b/debian/openvswitch-test.manpages +index 3f71858691..eb3a561d01 100644 +--- a/debian/openvswitch-test.manpages ++++ b/debian/openvswitch-test.manpages +@@ -1 +1 @@ +-utilities/ovs-l3ping.8 ++debian/tmp/usr/share/man/man8/ovs-l3ping.8 +diff --git a/dpdk/.ci/linux-setup.sh b/dpdk/.ci/linux-setup.sh +index dfb9d4a206..38bb88e15c 100755 +--- a/dpdk/.ci/linux-setup.sh ++++ b/dpdk/.ci/linux-setup.sh +@@ -1,7 +1,7 @@ + #!/bin/sh -xe - AT_BANNER([OVSDB cluster election timer change]) --- -2.25.1 - - -From 053b78c8d60ffb4d212fd7894f91be52027f291f Mon Sep 17 00:00:00 2001 -From: Han Zhou -Date: Fri, 28 Feb 2020 18:07:07 -0800 -Subject: [PATCH 06/15] raft: Avoid busy loop during leader election. - -[ upstream commit 3ae90e1899c5a05148ea1870d9bb4ac3c05e3a19 ] - -When a server doesn't see a leader yet, e.g. during leader re-election, -if a transaction comes from a client, it will cause 100% CPU busy loop. 
-With debug log enabled it is like: - -2020-02-28T04:04:35.631Z|00059|poll_loop|DBG|wakeup due to 0-ms timeout at ../ovsdb/trigger.c:164 -2020-02-28T04:04:35.631Z|00062|poll_loop|DBG|wakeup due to 0-ms timeout at ../ovsdb/trigger.c:164 -2020-02-28T04:04:35.631Z|00065|poll_loop|DBG|wakeup due to 0-ms timeout at ../ovsdb/trigger.c:164 -2020-02-28T04:04:35.631Z|00068|poll_loop|DBG|wakeup due to 0-ms timeout at ../ovsdb/trigger.c:164 -2020-02-28T04:04:35.631Z|00071|poll_loop|DBG|wakeup due to 0-ms timeout at ../ovsdb/trigger.c:164 -2020-02-28T04:04:35.631Z|00074|poll_loop|DBG|wakeup due to 0-ms timeout at ../ovsdb/trigger.c:164 -2020-02-28T04:04:35.631Z|00077|poll_loop|DBG|wakeup due to 0-ms timeout at ../ovsdb/trigger.c:164 -... - -The problem is that in ovsdb_trigger_try(), all cluster errors are treated -as temporary error and retry immediately. This patch fixes it by introducing -'run_triggers_now', which tells if a retry is needed immediately. When the -cluster error is with detail 'not leader', we don't immediately retry, but -will wait for the next poll event to trigger the retry. When 'not leader' -status changes, there must be a event, i.e. raft RPC that changes the -status, so the trigger is guaranteed to be triggered, without busy loop. - -Signed-off-by: Han Zhou -Signed-off-by: Ben Pfaff - -Resolves: #1836305 -Signed-off-by: Dumitru Ceara ---- - ovsdb/ovsdb.c | 2 +- - ovsdb/ovsdb.h | 1 + - ovsdb/transaction.c | 2 +- - ovsdb/trigger.c | 11 +++++++++-- - 4 files changed, 12 insertions(+), 4 deletions(-) - -diff --git a/ovsdb/ovsdb.c b/ovsdb/ovsdb.c -index cfc96b32f8..7e683e6815 100644 ---- a/ovsdb/ovsdb.c -+++ b/ovsdb/ovsdb.c -@@ -414,7 +414,7 @@ ovsdb_create(struct ovsdb_schema *schema, struct ovsdb_storage *storage) - db->storage = storage; - ovs_list_init(&db->monitors); - ovs_list_init(&db->triggers); -- db->run_triggers = false; -+ db->run_triggers_now = db->run_triggers = false; + # need to install as 'root' since some of the unit tests won't run without it +-sudo python3 -m pip install --upgrade meson ++sudo python3 -m pip install --upgrade 'meson==0.47.1' - shash_init(&db->tables); - if (schema) { -diff --git a/ovsdb/ovsdb.h b/ovsdb/ovsdb.h -index 32e5333163..5c30a83d92 100644 ---- a/ovsdb/ovsdb.h -+++ b/ovsdb/ovsdb.h -@@ -83,6 +83,7 @@ struct ovsdb { - /* Triggers. */ - struct ovs_list triggers; /* Contains "struct ovsdb_trigger"s. 
*/ - bool run_triggers; -+ bool run_triggers_now; + # setup hugepages + cat /proc/meminfo +diff --git a/dpdk/.travis.yml b/dpdk/.travis.yml +index 8f90d06f28..77ac26dd85 100644 +--- a/dpdk/.travis.yml ++++ b/dpdk/.travis.yml +@@ -15,19 +15,19 @@ addons: + packages: &required_packages + - [libnuma-dev, linux-headers-$(uname -r), python3-setuptools, python3-wheel, python3-pip, ninja-build] - struct ovsdb_table *rbac_role; +-aarch64_packages: &aarch64_packages ++_aarch64_packages: &aarch64_packages + - *required_packages + - [gcc-aarch64-linux-gnu, libc6-dev-arm64-cross, pkg-config-aarch64-linux-gnu] -diff --git a/ovsdb/transaction.c b/ovsdb/transaction.c -index 369436bffb..8ffefcf7c9 100644 ---- a/ovsdb/transaction.c -+++ b/ovsdb/transaction.c -@@ -967,7 +967,7 @@ ovsdb_txn_complete(struct ovsdb_txn *txn) +-extra_packages: &extra_packages ++_extra_packages: &extra_packages + - *required_packages +- - [libbsd-dev, libpcap-dev, libcrypto++-dev, libjansson4] ++ - [libbsd-dev, libpcap-dev, libcrypto++-dev, libjansson-dev] + +-build_32b_packages: &build_32b_packages ++_build_32b_packages: &build_32b_packages + - *required_packages + - [gcc-multilib] + +-doc_packages: &doc_packages ++_doc_packages: &doc_packages + - [doxygen, graphviz, python3-sphinx] + + before_install: ./.ci/${TRAVIS_OS_NAME}-setup.sh +@@ -39,7 +39,7 @@ env: + - DEF_LIB="shared" OPTS="-Denable_kmods=false" + - DEF_LIB="shared" RUN_TESTS=1 + +-matrix: ++jobs: + include: + - env: DEF_LIB="static" BUILD_32BIT=1 + compiler: gcc +diff --git a/dpdk/MAINTAINERS b/dpdk/MAINTAINERS +index 4395d8df14..10c4e1a613 100644 +--- a/dpdk/MAINTAINERS ++++ b/dpdk/MAINTAINERS +@@ -370,7 +370,7 @@ F: devtools/test-null.sh + F: doc/guides/prog_guide/switch_representation.rst + + Flow API +-M: Adrien Mazarguil ++M: Ori Kam + T: git://dpdk.org/next/dpdk-next-net + F: app/test-pmd/cmdline_flow.c + F: doc/guides/prog_guide/rte_flow.rst +@@ -910,7 +910,7 @@ F: drivers/net/null/ + F: doc/guides/nics/features/null.ini + + Fail-safe PMD +-M: Gaetan Rivet ++M: Gaetan Rivet + F: drivers/net/failsafe/ + F: doc/guides/nics/fail_safe.rst + F: doc/guides/nics/features/failsafe.ini +@@ -1373,7 +1373,7 @@ F: app/test/test_rcu* + F: doc/guides/prog_guide/rcu_lib.rst + + PCI +-M: Gaetan Rivet ++M: Gaetan Rivet + F: lib/librte_pci/ + + Power management +@@ -1434,6 +1434,7 @@ Unit tests framework + F: app/test/Makefile + F: app/test/autotest* + F: app/test/commands.c ++F: app/test/get-coremask.sh + F: app/test/packet_burst_generator.c + F: app/test/packet_burst_generator.h + F: app/test/process.h +diff --git a/dpdk/VERSION b/dpdk/VERSION +index 22131b00aa..a43c349903 100644 +--- a/dpdk/VERSION ++++ b/dpdk/VERSION +@@ -1 +1 @@ +-19.11.0 ++19.11.3 +diff --git a/dpdk/app/pdump/main.c b/dpdk/app/pdump/main.c +index 903d02f482..c38c53719e 100644 +--- a/dpdk/app/pdump/main.c ++++ b/dpdk/app/pdump/main.c +@@ -151,7 +151,7 @@ static uint8_t multiple_core_capture; + static void + pdump_usage(const char *prgname) { - if (!ovsdb_txn_is_empty(txn)) { +- printf("usage: %s [EAL options]" ++ printf("usage: %s [EAL options] --" + " --["CMD_LINE_OPT_MULTI"]\n" + " --"CMD_LINE_OPT_PDUMP" " + "'(port= | device_id=)," +@@ -595,7 +595,7 @@ configure_vdev(uint16_t port_id) + if (ret != 0) + rte_exit(EXIT_FAILURE, "dev config failed\n"); -- txn->db->run_triggers = true; -+ txn->db->run_triggers_now = txn->db->run_triggers = true; - ovsdb_monitors_commit(txn->db, txn); - ovsdb_error_assert(for_each_txn_row(txn, ovsdb_txn_update_weak_refs)); - ovsdb_error_assert(for_each_txn_row(txn, 
ovsdb_txn_row_commit)); -diff --git a/ovsdb/trigger.c b/ovsdb/trigger.c -index 7e62e90ae3..0372302af4 100644 ---- a/ovsdb/trigger.c -+++ b/ovsdb/trigger.c -@@ -141,7 +141,7 @@ ovsdb_trigger_run(struct ovsdb *db, long long int now) - struct ovsdb_trigger *t, *next; +- for (q = 0; q < txRings; q++) { ++ for (q = 0; q < txRings; q++) { + ret = rte_eth_tx_queue_setup(port_id, q, TX_DESC_PER_QUEUE, + rte_eth_dev_socket_id(port_id), NULL); + if (ret < 0) +diff --git a/dpdk/app/test-acl/main.c b/dpdk/app/test-acl/main.c +index 57f23942eb..08f06c1fa3 100644 +--- a/dpdk/app/test-acl/main.c ++++ b/dpdk/app/test-acl/main.c +@@ -12,7 +12,7 @@ + #include + #include - bool run_triggers = db->run_triggers; -- db->run_triggers = false; -+ db->run_triggers_now = db->run_triggers = false; +-#define PRINT_USAGE_START "%s [EAL options]\n" ++#define PRINT_USAGE_START "%s [EAL options] --\n" - bool disconnect_all = false; + #define RTE_LOGTYPE_TESTACL RTE_LOGTYPE_USER1 + +diff --git a/dpdk/app/test-crypto-perf/main.c b/dpdk/app/test-crypto-perf/main.c +index 52a1860fbf..7bb286ccbe 100644 +--- a/dpdk/app/test-crypto-perf/main.c ++++ b/dpdk/app/test-crypto-perf/main.c +@@ -582,7 +582,8 @@ main(int argc, char **argv) + goto err; + } + +- if (!opts.silent) ++ if (!opts.silent && opts.test != CPERF_TEST_TYPE_THROUGHPUT && ++ opts.test != CPERF_TEST_TYPE_LATENCY) + show_test_vector(t_vec); + + total_nb_qps = nb_cryptodevs * opts.nb_qps; +diff --git a/dpdk/app/test-eventdev/meson.build b/dpdk/app/test-eventdev/meson.build +index 7ff2b786cf..9e588d9ec7 100644 +--- a/dpdk/app/test-eventdev/meson.build ++++ b/dpdk/app/test-eventdev/meson.build +@@ -10,5 +10,8 @@ sources = files('evt_main.c', + 'test_order_queue.c', + 'test_perf_common.c', + 'test_perf_atq.c', +- 'test_perf_queue.c') ++ 'test_perf_queue.c', ++ 'test_pipeline_common.c', ++ 'test_pipeline_atq.c', ++ 'test_pipeline_queue.c') + deps += 'eventdev' +diff --git a/dpdk/app/test-eventdev/test_pipeline_common.c b/dpdk/app/test-eventdev/test_pipeline_common.c +index fa91bf2290..126e2165a3 100644 +--- a/dpdk/app/test-eventdev/test_pipeline_common.c ++++ b/dpdk/app/test-eventdev/test_pipeline_common.c +@@ -385,12 +385,16 @@ pipeline_event_tx_adapter_setup(struct evt_options *opt, + if (!(cap & RTE_EVENT_ETH_TX_ADAPTER_CAP_INTERNAL_PORT)) { + uint32_t service_id = -1U; + +- rte_event_eth_tx_adapter_service_id_get(consm, +- &service_id); ++ ret = rte_event_eth_tx_adapter_service_id_get(consm, ++ &service_id); ++ if (ret != -ESRCH && ret != 0) { ++ evt_err("Failed to get Tx adptr service ID"); ++ return ret; ++ } + ret = evt_service_setup(service_id); + if (ret) { + evt_err("Failed to setup service core" +- " for Tx adapter\n"); ++ " for Tx adapter"); + return ret; + } + } +diff --git a/dpdk/app/test-pipeline/config.c b/dpdk/app/test-pipeline/config.c +index 28ac9fcc0e..33f3f1c827 100644 +--- a/dpdk/app/test-pipeline/config.c ++++ b/dpdk/app/test-pipeline/config.c +@@ -42,8 +42,6 @@ + + #include "main.h" + +-struct app_params app; +- + static const char usage[] = "\n"; -@@ -160,7 +160,7 @@ ovsdb_trigger_run(struct ovsdb *db, long long int now) void - ovsdb_trigger_wait(struct ovsdb *db, long long int now) +diff --git a/dpdk/app/test-pmd/cmdline.c b/dpdk/app/test-pmd/cmdline.c +index 9f3e0b251b..d508d1e26d 100644 +--- a/dpdk/app/test-pmd/cmdline.c ++++ b/dpdk/app/test-pmd/cmdline.c +@@ -94,7 +94,7 @@ static void cmd_help_brief_parsed(__attribute__((unused)) void *parsed_result, + " help ports : Configuring ports.\n" + " help registers : Reading and setting port 
registers.\n" + " help filters : Filters configuration help.\n" +- " help traffic_management : Traffic Management commmands.\n" ++ " help traffic_management : Traffic Management commands.\n" + " help devices : Device related cmds.\n" + " help all : All of the above sections.\n\n" + ); +@@ -1437,7 +1437,7 @@ cmdline_parse_inst_t cmd_set_port_setup_on = { + struct cmd_operate_attach_port_result { + cmdline_fixed_string_t port; + cmdline_fixed_string_t keyword; +- cmdline_fixed_string_t identifier; ++ cmdline_multi_string_t identifier; + }; + + static void cmd_operate_attach_port_parsed(void *parsed_result, +@@ -1460,7 +1460,7 @@ cmdline_parse_token_string_t cmd_operate_attach_port_keyword = + keyword, "attach"); + cmdline_parse_token_string_t cmd_operate_attach_port_identifier = + TOKEN_STRING_INITIALIZER(struct cmd_operate_attach_port_result, +- identifier, NULL); ++ identifier, TOKEN_STRING_MULTI); + + cmdline_parse_inst_t cmd_operate_attach_port = { + .f = cmd_operate_attach_port_parsed, +@@ -1488,10 +1488,12 @@ static void cmd_operate_detach_port_parsed(void *parsed_result, { -- if (db->run_triggers) { -+ if (db->run_triggers_now) { - poll_immediate_wake(); - } else { - long long int deadline = LLONG_MAX; -@@ -319,9 +319,16 @@ ovsdb_trigger_try(struct ovsdb_trigger *t, long long int now) - if (!strcmp(ovsdb_error_get_tag(error), "cluster error")) { - /* Temporary error. Transition back to "initialized" state to - * try again. */ -+ char *err_s = ovsdb_error_to_string(error); -+ VLOG_DBG("cluster error %s", err_s); -+ - jsonrpc_msg_destroy(t->reply); - t->reply = NULL; - t->db->run_triggers = true; -+ if (!strstr(err_s, "not leader")) { -+ t->db->run_triggers_now = true; -+ } -+ free(err_s); - ovsdb_error_destroy(error); - } else { - /* Permanent error. Transition to "completed" state to report --- -2.25.1 - - -From cc3d02699203e2fe9d9fd384d09e268ba614828d Mon Sep 17 00:00:00 2001 -From: Han Zhou -Date: Fri, 28 Feb 2020 18:07:10 -0800 -Subject: [PATCH 07/15] raft: Fix next_index in install_snapshot reply - handling. - -[ upstream commit 877618fc833273d1e29e012b5e925d51cba80ff5 ] - -When a leader handles install_snapshot reply, the next_index for -the follower should be log_start instead of log_end, because there -can be new entries added in leader's log after initiating the -install_snapshot procedure. Also, it should send all the accumulated -entries to follower in the following append-request message, instead -of sending 0 entries, to speed up the converge. - -Without this fix, there is no functional problem, but it takes -uncessary extra rounds of append-requests responsed with "inconsistency" -by follower, although finally will be converged. 
- -Signed-off-by: Han Zhou -Signed-off-by: Ben Pfaff - -Resolves: #1836305 -Signed-off-by: Dumitru Ceara ---- - ovsdb/raft.c | 5 +++-- - 1 file changed, 3 insertions(+), 2 deletions(-) - -diff --git a/ovsdb/raft.c b/ovsdb/raft.c -index 6cd7b0041a..fa04d8c80b 100644 ---- a/ovsdb/raft.c -+++ b/ovsdb/raft.c -@@ -3998,8 +3998,9 @@ raft_handle_install_snapshot_reply( - VLOG_INFO_RL(&rl, "cluster "CID_FMT": installed snapshot on server %s " - " up to %"PRIu64":%"PRIu64, CID_ARGS(&raft->cid), - s->nickname, rpy->last_term, rpy->last_index); -- s->next_index = raft->log_end; -- raft_send_append_request(raft, s, 0, "snapshot installed"); -+ s->next_index = raft->log_start; -+ raft_send_append_request(raft, s, raft->log_end - s->next_index, -+ "snapshot installed"); + struct cmd_operate_detach_port_result *res = parsed_result; + +- if (!strcmp(res->keyword, "detach")) ++ if (!strcmp(res->keyword, "detach")) { ++ RTE_ETH_VALID_PORTID_OR_RET(res->port_id); + detach_port_device(res->port_id); +- else ++ } else { + printf("Unknown parameter\n"); ++ } } - /* Returns true if 'raft' has grown enough since the last snapshot that --- -2.25.1 - - -From 9c76350e271546eedfeb18720975e35b4e36e1f1 Mon Sep 17 00:00:00 2001 -From: Han Zhou -Date: Thu, 5 Mar 2020 23:48:45 -0800 -Subject: [PATCH 08/15] raft: Fix the problem of stuck in candidate role - forever. - -[ upstream commit 25a7e5547f1e107db0f032ad269f447c57401531 ] - -Sometimes a server can stay in candidate role forever, even if the server -already see the new leader and handles append-requests normally. However, -because of the wrong role, it appears as disconnected from cluster and -so the clients are disconnected. - -This problem happens when 2 servers become candidates in the same -term, and one of them is elected as leader in that term. It can be -reproduced by the test cases added in this patch. - -The root cause is that the current implementation only changes role to -follower when a bigger term is observed (in raft_receive_term__()). -According to the RAFT paper, if another candidate becomes leader with -the same term, the candidate should change to follower. - -This patch fixes it by changing the role to follower when leader -is being updated in raft_update_leader(). 
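A minimal, self-contained C model of the role transition this patch adds; it is a sketch of the rule from the RAFT paper, not the OVS implementation, and the enum and helper below are hypothetical.

    #include <assert.h>
    #include <stdbool.h>

    enum raft_role { RAFT_FOLLOWER, RAFT_CANDIDATE, RAFT_LEADER };

    struct server {
        enum raft_role role;
        unsigned int term;
    };

    /* Handle an append request from a server claiming to be leader.
     * Returns true if the claim is accepted. */
    static bool
    accept_leader(struct server *s, unsigned int leader_term)
    {
        if (leader_term < s->term) {
            return false;                /* Stale leader: reject. */
        }
        if (s->role == RAFT_CANDIDATE) {
            /* Section 3.4: a leader whose term is at least as large as
             * ours is legitimate, so stop being a candidate.  Without
             * this, the server keeps the candidate role forever and
             * appears disconnected from the cluster. */
            s->role = RAFT_FOLLOWER;
        }
        s->term = leader_term;
        return true;
    }

    int
    main(void)
    {
        struct server s = { .role = RAFT_CANDIDATE, .term = 2 };
        assert(accept_leader(&s, 2));     /* Same term: accept... */
        assert(s.role == RAFT_FOLLOWER);  /* ...and step down. */
        return 0;
    }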
- -Signed-off-by: Han Zhou -Signed-off-by: Ben Pfaff - -Resolves: #1836305 -Signed-off-by: Dumitru Ceara ---- - ovsdb/raft.c | 19 +++++++++++++-- - tests/ovsdb-cluster.at | 55 ++++++++++++++++++++++++++++++++++++++++++ - 2 files changed, 72 insertions(+), 2 deletions(-) - -diff --git a/ovsdb/raft.c b/ovsdb/raft.c -index fa04d8c80b..6452182ba6 100644 ---- a/ovsdb/raft.c -+++ b/ovsdb/raft.c -@@ -73,7 +73,8 @@ enum raft_failure_test { - FT_CRASH_BEFORE_SEND_EXEC_REQ, - FT_CRASH_AFTER_SEND_EXEC_REQ, - FT_CRASH_AFTER_RECV_APPEND_REQ_UPDATE, -- FT_DELAY_ELECTION -+ FT_DELAY_ELECTION, -+ FT_DONT_SEND_VOTE_REQUEST - }; - static enum raft_failure_test failure_test; + cmdline_parse_token_string_t cmd_operate_detach_port_port = +@@ -1530,7 +1532,7 @@ static void cmd_operate_detach_device_parsed(void *parsed_result, + struct cmd_operate_detach_device_result *res = parsed_result; -@@ -1647,6 +1648,7 @@ raft_start_election(struct raft *raft, bool leadership_transfer) - } + if (!strcmp(res->keyword, "detach")) +- detach_device(res->identifier); ++ detach_devargs(res->identifier); + else + printf("Unknown parameter\n"); + } +@@ -5120,7 +5122,7 @@ cmd_gso_size_parsed(void *parsed_result, - ovs_assert(raft->role != RAFT_LEADER); -+ - raft->role = RAFT_CANDIDATE; - /* If there was no leader elected since last election, we know we are - * retrying now. */ -@@ -1690,7 +1692,9 @@ raft_start_election(struct raft *raft, bool leadership_transfer) - .leadership_transfer = leadership_transfer, - }, - }; -- raft_send(raft, &rq); -+ if (failure_test != FT_DONT_SEND_VOTE_REQUEST) { -+ raft_send(raft, &rq); -+ } - } + if (test_done == 0) { + printf("Before setting GSO segsz, please first" +- " stop fowarding\n"); ++ " stop forwarding\n"); + return; + } - /* Vote for ourselves. */ -@@ -2966,6 +2970,15 @@ raft_update_leader(struct raft *raft, const struct uuid *sid) - }; - ignore(ovsdb_log_write_and_free(raft->log, raft_record_to_json(&r))); - } -+ if (raft->role == RAFT_CANDIDATE) { -+ /* Section 3.4: While waiting for votes, a candidate may -+ * receive an AppendEntries RPC from another server claiming to -+ * be leader. If the leader’s term (included in its RPC) is at -+ * least as large as the candidate’s current term, then the -+ * candidate recognizes the leader as legitimate and returns to -+ * follower state. */ -+ raft->role = RAFT_FOLLOWER; -+ } - return true; - } +@@ -7078,9 +7080,10 @@ cmd_priority_flow_ctrl_set_parsed(void *parsed_result, + * the RTE_FC_RX_PAUSE, Respond to the pause frame at the Tx side. + */ + static enum rte_eth_fc_mode rx_tx_onoff_2_pfc_mode[2][2] = { +- {RTE_FC_NONE, RTE_FC_RX_PAUSE}, {RTE_FC_TX_PAUSE, RTE_FC_FULL} ++ {RTE_FC_NONE, RTE_FC_TX_PAUSE}, {RTE_FC_RX_PAUSE, RTE_FC_FULL} + }; -@@ -4674,6 +4687,8 @@ raft_unixctl_failure_test(struct unixctl_conn *conn OVS_UNUSED, - raft_reset_election_timer(raft); - } - } -+ } else if (!strcmp(test, "dont-send-vote-request")) { -+ failure_test = FT_DONT_SEND_VOTE_REQUEST; - } else if (!strcmp(test, "clear")) { - failure_test = FT_NO_TEST; - unixctl_command_reply(conn, "test dismissed"); -diff --git a/tests/ovsdb-cluster.at b/tests/ovsdb-cluster.at -index 0aa4564480..9714545151 100644 ---- a/tests/ovsdb-cluster.at -+++ b/tests/ovsdb-cluster.at -@@ -527,6 +527,61 @@ AT_KEYWORDS([ovsdb server negative unix cluster pending-txn]) - ovsdb_cluster_failure_test 2 2 3 crash-after-receiving-append-request-update - AT_CLEANUP ++ memset(&pfc_conf, 0, sizeof(struct rte_eth_pfc_conf)); + rx_fc_enable = (!strncmp(res->rx_pfc_mode, "on",2)) ? 
1 : 0; + tx_fc_enable = (!strncmp(res->tx_pfc_mode, "on",2)) ? 1 : 0; + pfc_conf.fc.mode = rx_tx_onoff_2_pfc_mode[rx_fc_enable][tx_fc_enable]; +@@ -16802,8 +16805,10 @@ cmd_ddp_get_list_parsed( + #ifdef RTE_LIBRTE_I40E_PMD + size = PROFILE_INFO_SIZE * MAX_PROFILE_NUM + 4; + p_list = (struct rte_pmd_i40e_profile_list *)malloc(size); +- if (!p_list) ++ if (!p_list) { + printf("%s: Failed to malloc buffer\n", __func__); ++ return; ++ } -+ -+AT_SETUP([OVSDB cluster - competing candidates]) -+AT_KEYWORDS([ovsdb server negative unix cluster competing-candidates]) -+ -+n=3 -+schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema` -+ordinal_schema > schema -+AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster s1.db $abs_srcdir/idltest.ovsschema unix:s1.raft], [0], [], [stderr]) -+cid=`ovsdb-tool db-cid s1.db` -+schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema` -+for i in `seq 2 $n`; do -+ AT_CHECK([ovsdb-tool join-cluster s$i.db $schema_name unix:s$i.raft unix:s1.raft]) -+done -+ -+on_exit 'kill `cat *.pid`' -+for i in `seq $n`; do -+ AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off --detach --no-chdir --log-file=s$i.log --pidfile=s$i.pid --unixctl=s$i --remote=punix:s$i.ovsdb s$i.db]) -+done -+for i in `seq $n`; do -+ AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected]) -+done + if (ret == -ENOTSUP) + ret = rte_pmd_i40e_get_ddp_list(res->port_id, +diff --git a/dpdk/app/test-pmd/cmdline_flow.c b/dpdk/app/test-pmd/cmdline_flow.c +index 99dade7d8c..deced65016 100644 +--- a/dpdk/app/test-pmd/cmdline_flow.c ++++ b/dpdk/app/test-pmd/cmdline_flow.c +@@ -1005,7 +1005,6 @@ static const enum index item_pppoes[] = { + }; + + static const enum index item_pppoe_proto_id[] = { +- ITEM_PPPOE_PROTO_ID, + ITEM_NEXT, + ZERO, + }; +@@ -2544,11 +2543,14 @@ static const struct token token_list[] = { + session_id)), + }, + [ITEM_PPPOE_PROTO_ID] = { +- .name = "proto_id", ++ .name = "pppoe_proto_id", + .help = "match PPPoE session protocol identifier", + .priv = PRIV_ITEM(PPPOE_PROTO_ID, + sizeof(struct rte_flow_item_pppoe_proto_id)), +- .next = NEXT(item_pppoe_proto_id), ++ .next = NEXT(item_pppoe_proto_id, NEXT_ENTRY(UNSIGNED), ++ item_param), ++ .args = ARGS(ARGS_ENTRY_HTON ++ (struct rte_flow_item_pppoe_proto_id, proto_id)), + .call = parse_vc, + }, + [ITEM_HIGIG2] = { +@@ -4534,7 +4536,9 @@ parse_vc_action_mplsogre_decap(struct context *ctx, const struct token *token, + struct rte_flow_item_gre gre = { + .protocol = rte_cpu_to_be_16(ETHER_TYPE_MPLS_UNICAST), + }; +- struct rte_flow_item_mpls mpls; ++ struct rte_flow_item_mpls mpls = { ++ .ttl = 0, ++ }; + uint8_t *header; + int ret; + +@@ -6236,6 +6240,9 @@ flow_item_default_mask(const struct rte_flow_item *item) + case RTE_FLOW_ITEM_TYPE_GTP_PSC: + mask = &rte_flow_item_gtp_psc_mask; + break; ++ case RTE_FLOW_ITEM_TYPE_GENEVE: ++ mask = &rte_flow_item_geneve_mask; ++ break; + case RTE_FLOW_ITEM_TYPE_PPPOE_PROTO_ID: + mask = &rte_flow_item_pppoe_proto_id_mask; + default: +diff --git a/dpdk/app/test-pmd/config.c b/dpdk/app/test-pmd/config.c +index d599682788..42eba68b35 100644 +--- a/dpdk/app/test-pmd/config.c ++++ b/dpdk/app/test-pmd/config.c +@@ -223,11 +223,26 @@ nic_stats_display(portid_t port_id) + void + nic_stats_clear(portid_t port_id) + { ++ int ret; + -+# We need to simulate the situation when 2 candidates starts election with same -+# term. 
-+# -+# Before triggering leader election, tell follower s2 don't send vote request (simulating -+# vote-request lost or not handled in time), and tell follower s3 to delay -+# election timer to make sure s3 doesn't send vote-request before s2 enters -+# term 2. -+AT_CHECK([ovs-appctl -t "`pwd`"/s2 cluster/failure-test dont-send-vote-request], [0], [ignore]) -+AT_CHECK([ovs-appctl -t "`pwd`"/s3 cluster/failure-test delay-election], [0], [ignore]) + if (port_id_is_invalid(port_id, ENABLED_WARN)) { + print_valid_ports(); + return; + } +- rte_eth_stats_reset(port_id); + -+# Restart leader, which will become follower, and both old followers will start -+# election as candidate. The new follower (old leader) will vote one of them, -+# and the other candidate should step back as follower as again. -+kill -9 `cat s1.pid` -+AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off --detach --no-chdir --log-file=s1.log --pidfile=s1.pid --unixctl=s1 --remote=punix:s1.ovsdb s1.db]) ++ ret = rte_eth_stats_reset(port_id); ++ if (ret != 0) { ++ printf("%s: Error: failed to reset stats (port %u): %s", ++ __func__, port_id, strerror(ret)); ++ return; ++ } + -+# Tell s1 to delay election timer so that it won't start election before s3 -+# becomes candidate. -+AT_CHECK([ovs-appctl -t "`pwd`"/s1 cluster/failure-test delay-election], [0], [ignore]) ++ ret = rte_eth_stats_get(port_id, &ports[port_id].stats); ++ if (ret != 0) { ++ printf("%s: Error: failed to get stats (port %u): %s", ++ __func__, port_id, strerror(ret)); ++ return; ++ } + printf("\n NIC statistics for port %d cleared\n", port_id); + } + +@@ -303,10 +318,19 @@ nic_xstats_clear(portid_t port_id) + print_valid_ports(); + return; + } + -+OVS_WAIT_UNTIL([ovs-appctl -t "`pwd`"/s1 cluster/status $schema_name | grep "Term: 2"]) + ret = rte_eth_xstats_reset(port_id); + if (ret != 0) { + printf("%s: Error: failed to reset xstats (port %u): %s", + __func__, port_id, strerror(ret)); ++ return; ++ } + -+for i in `seq $n`; do -+ OVS_WAIT_WHILE([ovs-appctl -t "`pwd`"/s$i cluster/status $schema_name | grep "candidate"]) -+ AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected]) -+done ++ ret = rte_eth_stats_get(port_id, &ports[port_id].stats); ++ if (ret != 0) { ++ printf("%s: Error: failed to get stats (port %u): %s", ++ __func__, port_id, strerror(ret)); ++ return; + } + } + +@@ -1216,7 +1240,9 @@ void + port_mtu_set(portid_t port_id, uint16_t mtu) + { + int diag; ++ struct rte_port *rte_port = &ports[port_id]; + struct rte_eth_dev_info dev_info; ++ uint16_t eth_overhead; + int ret; + + if (port_id_is_invalid(port_id, ENABLED_WARN)) +@@ -1232,8 +1258,25 @@ port_mtu_set(portid_t port_id, uint16_t mtu) + return; + } + diag = rte_eth_dev_set_mtu(port_id, mtu); +- if (diag == 0) ++ if (diag == 0 && ++ dev_info.rx_offload_capa & DEV_RX_OFFLOAD_JUMBO_FRAME) { ++ /* ++ * Ether overhead in driver is equal to the difference of ++ * max_rx_pktlen and max_mtu in rte_eth_dev_info when the ++ * device supports jumbo frame. ++ */ ++ eth_overhead = dev_info.max_rx_pktlen - dev_info.max_mtu; ++ if (mtu > RTE_ETHER_MAX_LEN - eth_overhead) { ++ rte_port->dev_conf.rxmode.offloads |= ++ DEV_RX_OFFLOAD_JUMBO_FRAME; ++ rte_port->dev_conf.rxmode.max_rx_pkt_len = ++ mtu + eth_overhead; ++ } else ++ rte_port->dev_conf.rxmode.offloads &= ++ ~DEV_RX_OFFLOAD_JUMBO_FRAME; + -+for i in `seq $n`; do -+ OVS_APP_EXIT_AND_WAIT_BY_TARGET([`pwd`/s$i], [s$i.pid]) -+done + return; ++ } + printf("Set MTU failed. 
diag=%d\n", diag); + } + +@@ -3707,6 +3750,14 @@ mcast_addr_pool_extend(struct rte_port *port) + + } + ++static void ++mcast_addr_pool_append(struct rte_port *port, struct rte_ether_addr *mc_addr) ++{ ++ if (mcast_addr_pool_extend(port) != 0) ++ return; ++ rte_ether_addr_copy(mc_addr, &port->mc_addr_pool[port->mc_addr_nb - 1]); ++} + -+AT_CLEANUP + static void + mcast_addr_pool_remove(struct rte_port *port, uint32_t addr_idx) + { +@@ -3725,7 +3776,7 @@ mcast_addr_pool_remove(struct rte_port *port, uint32_t addr_idx) + sizeof(struct rte_ether_addr) * (port->mc_addr_nb - addr_idx)); + } + +-static void ++static int + eth_port_multicast_addr_list_set(portid_t port_id) + { + struct rte_port *port; +@@ -3734,10 +3785,11 @@ eth_port_multicast_addr_list_set(portid_t port_id) + port = &ports[port_id]; + diag = rte_eth_dev_set_mc_addr_list(port_id, port->mc_addr_pool, + port->mc_addr_nb); +- if (diag == 0) +- return; +- printf("rte_eth_dev_set_mc_addr_list(port=%d, nb=%u) failed. diag=%d\n", +- port->mc_addr_nb, port_id, -diag); ++ if (diag < 0) ++ printf("rte_eth_dev_set_mc_addr_list(port=%d, nb=%u) failed. diag=%d\n", ++ port_id, port->mc_addr_nb, diag); + - - AT_BANNER([OVSDB - cluster tests]) ++ return diag; + } --- -2.25.1 - - -From 5c38ccd52fb3925e82eda20f1897ec02abb390d9 Mon Sep 17 00:00:00 2001 -From: Ilya Maximets -Date: Mon, 4 May 2020 21:55:41 +0200 -Subject: [PATCH 09/15] raft: Fix leak of the incomplete command. - -[ upstream commit 168beb87ca63056e8896b09a60031565b7b60728 ] - -Function raft_command_initiate() returns correctly referenced command -instance. 'n_ref' equals 1 for complete commands and 2 for incomplete -commands because one more reference is in raft->commands list. -raft_handle_execute_command_request__() leaks the reference by not -returning pointer anywhere and not unreferencing incomplete commands. 
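The ownership contract at issue, as a small self-contained C sketch (hypothetical names, not the OVS API): the caller always holds exactly one reference and must always drop it, whether or not the command is still on the pending list.

    #include <stdbool.h>
    #include <stdlib.h>

    struct command {
        int n_refs;          /* 1 if complete, 2 if also on a pending list. */
    };

    static struct command *
    command_initiate(bool complete)
    {
        struct command *cmd = calloc(1, sizeof *cmd);
        cmd->n_refs = complete ? 1 : 2;  /* Second ref: the pending list. */
        return cmd;
    }

    static void
    command_unref(struct command *cmd)
    {
        if (cmd && !--cmd->n_refs) {
            free(cmd);
        }
    }

    int
    main(void)
    {
        struct command *cmd = command_initiate(false);
        command_unref(cmd);  /* Caller's ref: dropped unconditionally. */
        command_unref(cmd);  /* Pending list's ref, dropped when complete. */
        return 0;
    }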
- - 792 bytes in 11 blocks are definitely lost in loss record 258 of 262 - at 0x483BB1A: calloc (vg_replace_malloc.c:762) - by 0x44BA32: xcalloc (util.c:121) - by 0x422E5F: raft_command_create_incomplete (raft.c:2038) - by 0x422E5F: raft_command_initiate (raft.c:2061) - by 0x428651: raft_handle_execute_command_request__ (raft.c:4161) - by 0x428651: raft_handle_execute_command_request (raft.c:4177) - by 0x428651: raft_handle_rpc (raft.c:4230) - by 0x428651: raft_conn_run (raft.c:1445) - by 0x428DEA: raft_run (raft.c:1803) - by 0x407392: main_loop (ovsdb-server.c:226) - by 0x407392: main (ovsdb-server.c:469) - -Fixes: 1b1d2e6daa56 ("ovsdb: Introduce experimental support for clustered databases.") -Signed-off-by: Ilya Maximets -Acked-by: Han Zhou -Signed-off-by: William Tu - -Resolves: #1836307 -Signed-off-by: Dumitru Ceara ---- - ovsdb/raft.c | 4 +--- - 1 file changed, 1 insertion(+), 3 deletions(-) - -diff --git a/ovsdb/raft.c b/ovsdb/raft.c -index 6452182ba6..1505814138 100644 ---- a/ovsdb/raft.c -+++ b/ovsdb/raft.c -@@ -4163,9 +4163,7 @@ raft_handle_execute_command_request__( - cmd->sid = rq->common.sid; + void +@@ -3762,10 +3814,10 @@ mcast_addr_add(portid_t port_id, struct rte_ether_addr *mc_addr) + } + } - enum raft_command_status status = cmd->status; -- if (status != RAFT_CMD_INCOMPLETE) { -- raft_command_unref(cmd); -- } -+ raft_command_unref(cmd); - return status; +- if (mcast_addr_pool_extend(port) != 0) +- return; +- rte_ether_addr_copy(mc_addr, &port->mc_addr_pool[i]); +- eth_port_multicast_addr_list_set(port_id); ++ mcast_addr_pool_append(port, mc_addr); ++ if (eth_port_multicast_addr_list_set(port_id) < 0) ++ /* Rollback on failure, remove the address from the pool */ ++ mcast_addr_pool_remove(port, i); } --- -2.25.1 - - -From 3d9b529afb098531190d57d6f35d1622bb4093cd Mon Sep 17 00:00:00 2001 -From: Zhen Wang -Date: Mon, 30 Mar 2020 17:21:04 -0700 -Subject: [PATCH 10/15] raft: Disable RAFT jsonrpc inactivity probe. - -[ upstream commit 1600e0040caded7eaa9b1f41926f9619d8e0ec8d ] - -With the scale test of 640 nodes k8s cluster, raft DB nodes' jsonrpc -session got closed due to the timeout of default 5 seconds probe. -It will cause disturbance of the raft cluster. Since we already have -the heartbeat for RAFT, just disable the probe between the servers -to avoid the unnecessary jsonrpc inactivity probe. - -Acked-by: Han Zhou -Signed-off-by: Zhen Wang -Signed-off-by: Ilya Maximets - -Resolves: #1836308 -Signed-off-by: Dumitru Ceara ---- - ovsdb/raft.c | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/ovsdb/raft.c b/ovsdb/raft.c -index 1505814138..395cc56113 100644 ---- a/ovsdb/raft.c -+++ b/ovsdb/raft.c -@@ -938,6 +938,7 @@ raft_add_conn(struct raft *raft, struct jsonrpc_session *js, - &conn->sid); - conn->incoming = incoming; - conn->js_seqno = jsonrpc_session_get_seqno(conn->js); -+ jsonrpc_session_set_probe_interval(js, 0); + void +@@ -3792,7 +3844,9 @@ mcast_addr_remove(portid_t port_id, struct rte_ether_addr *mc_addr) + } + + mcast_addr_pool_remove(port, i); +- eth_port_multicast_addr_list_set(port_id); ++ if (eth_port_multicast_addr_list_set(port_id) < 0) ++ /* Rollback on failure, add the address back into the pool */ ++ mcast_addr_pool_append(port, mc_addr); } - /* Starts the local server in an existing Raft cluster, using the local copy of --- -2.25.1 - - -From 8b155475749cdb7a1817810d447e4cf6598cb6fa Mon Sep 17 00:00:00 2001 -From: Aaron Conole -Date: Fri, 15 May 2020 16:36:18 -0400 -Subject: [PATCH 11/15] netdev-linux: Update LAG in all cases. 
- -In some cases, when processing a netlink change event, it's possible for -an alternate part of OvS (like the IPv6 endpoint processing) to hold an -active netdev interface. This creates a race-condition, where sometimes -the OvS change processing will take the normal path. This doesn't work -because the netdev device object won't actually be enslaved to the -ovs-system (for instance, a linux bond) and ingress qdisc entries will -be missing. - -To address this, we update the LAG information in ALL cases where -LAG information could come in. - -Fixes: d22f8927c3c9 ("netdev-linux: monitor and offload LAG slaves to TC") -Cc: Marcelo Leitner -Cc: John Hurley -Acked-by: Roi Dayan -Signed-off-by: Aaron Conole -Signed-off-by: Ilya Maximets ---- - lib/netdev-linux.c | 11 +++++------ - 1 file changed, 5 insertions(+), 6 deletions(-) - -diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c -index c6f3d27409..2bf8d4c477 100644 ---- a/lib/netdev-linux.c -+++ b/lib/netdev-linux.c -@@ -659,10 +659,6 @@ netdev_linux_update_lag(struct rtnetlink_change *change) + void +diff --git a/dpdk/app/test-pmd/csumonly.c b/dpdk/app/test-pmd/csumonly.c +index 25091de881..7b92ab1195 100644 +--- a/dpdk/app/test-pmd/csumonly.c ++++ b/dpdk/app/test-pmd/csumonly.c +@@ -139,22 +139,23 @@ parse_ipv6(struct rte_ipv6_hdr *ipv6_hdr, struct testpmd_offload_info *info) + + /* + * Parse an ethernet header to fill the ethertype, l2_len, l3_len and +- * ipproto. This function is able to recognize IPv4/IPv6 with one optional vlan +- * header. The l4_len argument is only set in case of TCP (useful for TSO). ++ * ipproto. This function is able to recognize IPv4/IPv6 with optional VLAN ++ * headers. The l4_len argument is only set in case of TCP (useful for TSO). + */ + static void + parse_ethernet(struct rte_ether_hdr *eth_hdr, struct testpmd_offload_info *info) { - struct linux_lag_slave *lag; + struct rte_ipv4_hdr *ipv4_hdr; + struct rte_ipv6_hdr *ipv6_hdr; ++ struct rte_vlan_hdr *vlan_hdr; -- if (!rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)) { -- return; -- } + info->l2_len = sizeof(struct rte_ether_hdr); + info->ethertype = eth_hdr->ether_type; + +- if (info->ethertype == _htons(RTE_ETHER_TYPE_VLAN)) { +- struct rte_vlan_hdr *vlan_hdr = ( +- struct rte_vlan_hdr *)(eth_hdr + 1); - - if (change->slave && netdev_linux_kind_is_lag(change->slave)) { - lag = shash_find_data(&lag_shash, change->ifname); ++ while (info->ethertype == _htons(RTE_ETHER_TYPE_VLAN) || ++ info->ethertype == _htons(RTE_ETHER_TYPE_QINQ)) { ++ vlan_hdr = (struct rte_vlan_hdr *) ++ ((char *)eth_hdr + info->l2_len); + info->l2_len += sizeof(struct rte_vlan_hdr); + info->ethertype = vlan_hdr->eth_proto; + } +diff --git a/dpdk/app/test-pmd/flowgen.c b/dpdk/app/test-pmd/flowgen.c +index 03b72aaa56..68931fdea6 100644 +--- a/dpdk/app/test-pmd/flowgen.c ++++ b/dpdk/app/test-pmd/flowgen.c +@@ -1,35 +1,5 @@ +-/*- +- * BSD LICENSE +- * +- * Copyright(c) 2010-2013 Tilera Corporation. All rights reserved. +- * All rights reserved. +- * +- * Redistribution and use in source and binary forms, with or without +- * modification, are permitted provided that the following conditions +- * are met: +- * +- * * Redistributions of source code must retain the above copyright +- * notice, this list of conditions and the following disclaimer. +- * * Redistributions in binary form must reproduce the above copyright +- * notice, this list of conditions and the following disclaimer in +- * the documentation and/or other materials provided with the +- * distribution. 
+- * * Neither the name of Tilera Corporation nor the names of its +- * contributors may be used to endorse or promote products derived +- * from this software without specific prior written permission. +- * +- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +- * ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright 2014-2020 Mellanox Technologies, Ltd + */ -@@ -760,8 +756,11 @@ netdev_linux_run(const struct netdev_class *netdev_class OVS_UNUSED) - netdev_linux_update(netdev, nsid, &change); - ovs_mutex_unlock(&netdev->mutex); - } -- else if (!netdev_ && change.ifname) { -- /* Netdev is not present in OvS but its master could be. */ -+ -+ if (change.ifname && -+ rtnetlink_type_is_rtnlgrp_link(change.nlmsg_type)) { -+ -+ /* Need to try updating the LAG information. */ - ovs_mutex_lock(&lag_mutex); - netdev_linux_update_lag(&change); - ovs_mutex_unlock(&lag_mutex); --- -2.25.1 - - -From d14e39f81bec29064a58df0177ce457765305f8b Mon Sep 17 00:00:00 2001 -From: Aaron Conole -Date: Fri, 15 May 2020 16:36:19 -0400 -Subject: [PATCH 12/15] netdev-offload-tc: Re-fetch block ID after probing. - -It's possible that block_id could changes after the probe for block -support. Therefore, fetch the block_id again after the probe. - -Fixes: edc2055a2bf7 ("netdev-offload-tc: Flush rules on ingress block when init tc flow api") -Cc: Dmytro Linkin -Acked-by: Roi Dayan -Co-authored-by: Marcelo Leitner -Signed-off-by: Marcelo Leitner -Signed-off-by: Aaron Conole -Signed-off-by: Ilya Maximets ---- - lib/netdev-offload-tc.c | 2 ++ - 1 file changed, 2 insertions(+) - -diff --git a/lib/netdev-offload-tc.c b/lib/netdev-offload-tc.c -index 550e440b3a..f577311aec 100644 ---- a/lib/netdev-offload-tc.c -+++ b/lib/netdev-offload-tc.c -@@ -1922,6 +1922,8 @@ netdev_tc_init_flow_api(struct netdev *netdev) + #include +diff --git a/dpdk/app/test-pmd/macswap.c b/dpdk/app/test-pmd/macswap.c +index 71af916fc3..8428c26d85 100644 +--- a/dpdk/app/test-pmd/macswap.c ++++ b/dpdk/app/test-pmd/macswap.c +@@ -1,34 +1,5 @@ +-/*- +- * BSD LICENSE +- * +- * Copyright(c) 2014 Tilera Corporation. All rights reserved. +- * +- * Redistribution and use in source and binary forms, with or without +- * modification, are permitted provided that the following conditions +- * are met: +- * +- * * Redistributions of source code must retain the above copyright +- * notice, this list of conditions and the following disclaimer. +- * * Redistributions in binary form must reproduce the above copyright +- * notice, this list of conditions and the following disclaimer in +- * the documentation and/or other materials provided with the +- * distribution. 
+- * * Neither the name of Tilera Corporation nor the names of its +- * contributors may be used to endorse or promote products derived +- * from this software without specific prior written permission. +- * +- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +- * ++/* SPDX-License-Identifier: BSD-3-Clause ++ * Copyright 2014-2020 Mellanox Technologies, Ltd + */ - if (ovsthread_once_start(&block_once)) { - probe_tc_block_support(ifindex); -+ /* Need to re-fetch block id as it depends on feature availability. */ -+ block_id = get_block_id_from_netdev(netdev); - ovsthread_once_done(&block_once); - } + #include +diff --git a/dpdk/app/test-pmd/parameters.c b/dpdk/app/test-pmd/parameters.c +index 2e7a504415..0eb7844783 100644 +--- a/dpdk/app/test-pmd/parameters.c ++++ b/dpdk/app/test-pmd/parameters.c +@@ -49,7 +49,7 @@ + static void + usage(char* progname) + { +- printf("usage: %s " ++ printf("usage: %s [EAL options] -- " + #ifdef RTE_LIBRTE_CMDLINE + "[--interactive|-i] " + "[--cmdline-file=FILENAME] " +diff --git a/dpdk/app/test-pmd/testpmd.c b/dpdk/app/test-pmd/testpmd.c +index b374682236..0b126594b7 100644 +--- a/dpdk/app/test-pmd/testpmd.c ++++ b/dpdk/app/test-pmd/testpmd.c +@@ -2549,32 +2549,17 @@ setup_attached_port(portid_t pi) + printf("Done\n"); + } --- -2.25.1 - - -From fb32a78921e50b1ffa0c52f873167f68622e8723 Mon Sep 17 00:00:00 2001 -From: Ilya Maximets -Date: Fri, 22 May 2020 18:31:19 +0200 -Subject: [PATCH 13/15] ovsdb: Add raft memory usage to memory report. - -[ upstream commit 3423cd97f88fe6a8de8b649d79fe6ac83bce94d1 ] - -Memory reports could be found in logs or by calling 'memory/show' -appctl command. For ovsdb-server it includes information about db -cells, monitor connections with their backlog size, etc. But it -doesn't contain any information about memory consumed by raft. -Backlogs of raft connections could be insanely large because of -snapshot installation requests that simply contains the whole database. -In not that healthy clusters where one of ovsdb servers is not able to -timely handle all the incoming raft traffic, backlog on a sender's side -could cause significant memory consumption issues. - -Adding new 'raft-connections' and 'raft-backlog' counters to the -memory report to better track such conditions. 
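What the two new counters aggregate, in a standalone C sketch; the conn struct and names here are stand-ins, not the OVS simap or jsonrpc types.

    #include <stddef.h>
    #include <stdio.h>

    struct conn {
        size_t backlog;      /* Bytes queued on the jsonrpc session. */
    };

    int
    main(void)
    {
        struct conn conns[] = { { 1024 }, { 8u * 1024 * 1024 }, { 0 } };
        size_t n = sizeof conns / sizeof *conns;
        size_t raft_backlog = 0, raft_connections = 0;

        for (size_t i = 0; i < n; i++) {
            raft_backlog += conns[i].backlog;
            raft_connections++;
        }

        /* These totals surface in "memory/show", so a snapshot-bloated
         * backlog becomes visible before it exhausts memory. */
        printf("raft-backlog:%zu raft-connections:%zu\n",
               raft_backlog, raft_connections);
        return 0;
    }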
- -Acked-by: Han Zhou -Signed-off-by: Ilya Maximets - -Related: #1834838 -Signed-off-by: Ilya Maximets ---- - ovsdb/ovsdb.c | 4 ++++ - ovsdb/raft.c | 16 ++++++++++++++++ - ovsdb/raft.h | 2 ++ - ovsdb/storage.c | 10 ++++++++++ - ovsdb/storage.h | 3 +++ - 5 files changed, 35 insertions(+) - -diff --git a/ovsdb/ovsdb.c b/ovsdb/ovsdb.c -index 7e683e6815..2da117cb36 100644 ---- a/ovsdb/ovsdb.c -+++ b/ovsdb/ovsdb.c -@@ -502,6 +502,10 @@ ovsdb_get_memory_usage(const struct ovsdb *db, struct simap *usage) - } +-void +-detach_port_device(portid_t port_id) ++static void ++detach_device(struct rte_device *dev) + { +- struct rte_device *dev; + portid_t sibling; - simap_increase(usage, "cells", cells); -+ -+ if (db->storage) { -+ ovsdb_storage_get_memory_usage(db->storage, usage); -+ } - } +- printf("Removing a device...\n"); +- +- if (port_id_is_invalid(port_id, ENABLED_WARN)) +- return; +- +- dev = rte_eth_devices[port_id].device; + if (dev == NULL) { + printf("Device already removed\n"); + return; + } - struct ovsdb_table * -diff --git a/ovsdb/raft.c b/ovsdb/raft.c -index 395cc56113..6ca63b4352 100644 ---- a/ovsdb/raft.c -+++ b/ovsdb/raft.c -@@ -36,6 +36,7 @@ - #include "ovsdb/log.h" - #include "raft-rpc.h" - #include "random.h" -+#include "simap.h" - #include "socket-util.h" - #include "stream.h" - #include "timeval.h" -@@ -1014,6 +1015,21 @@ raft_get_sid(const struct raft *raft) - return &raft->sid; +- if (ports[port_id].port_status != RTE_PORT_CLOSED) { +- if (ports[port_id].port_status != RTE_PORT_STOPPED) { +- printf("Port not stopped\n"); +- return; +- } +- printf("Port was not closed\n"); +- if (ports[port_id].flow_list) +- port_flow_flush(port_id); +- } ++ printf("Removing a device...\n"); + + if (rte_dev_remove(dev) < 0) { + TESTPMD_LOG(ERR, "Failed to detach device %s\n", dev->name); +@@ -2592,14 +2577,33 @@ detach_port_device(portid_t port_id) + + remove_invalid_ports(); + +- printf("Device of port %u is detached\n", port_id); ++ printf("Device is detached\n"); + printf("Now total ports is %d\n", nb_ports); + printf("Done\n"); + return; } -+/* Adds memory consumption info to 'usage' for later use by memory_report(). 
*/ -+void -+raft_get_memory_usage(const struct raft *raft, struct simap *usage) + void +-detach_device(char *identifier) ++detach_port_device(portid_t port_id) +{ -+ struct raft_conn *conn; -+ int cnt = 0; ++ if (port_id_is_invalid(port_id, ENABLED_WARN)) ++ return; + -+ LIST_FOR_EACH (conn, list_node, &raft->conns) { -+ simap_increase(usage, "raft-backlog", -+ jsonrpc_session_get_backlog(conn->js)); -+ cnt++; -+ } -+ simap_increase(usage, "raft-connections", cnt); ++ if (ports[port_id].port_status != RTE_PORT_CLOSED) { ++ if (ports[port_id].port_status != RTE_PORT_STOPPED) { ++ printf("Port not stopped\n"); ++ return; ++ } ++ printf("Port was not closed\n"); ++ if (ports[port_id].flow_list) ++ port_flow_flush(port_id); ++ } ++ ++ detach_device(rte_eth_devices[port_id].device); +} + - /* Returns true if 'raft' has completed joining its cluster, has not left or - * initiated leaving the cluster, does not have failed disk storage, and is - * apparently connected to the leader in a healthy way (or is itself the -diff --git a/ovsdb/raft.h b/ovsdb/raft.h -index 3d448995af..99d5307e54 100644 ---- a/ovsdb/raft.h -+++ b/ovsdb/raft.h -@@ -67,6 +67,7 @@ - struct json; - struct ovsdb_log; - struct raft; -+struct simap; - struct sset; - - #define RAFT_MAGIC "CLUSTER" -@@ -113,6 +114,7 @@ const struct uuid *raft_get_cid(const struct raft *); - const struct uuid *raft_get_sid(const struct raft *); - bool raft_is_connected(const struct raft *); - bool raft_is_leader(const struct raft *); -+void raft_get_memory_usage(const struct raft *, struct simap *usage); ++void ++detach_devargs(char *identifier) + { + struct rte_dev_iterator iterator; + struct rte_devargs da; +@@ -2748,7 +2752,7 @@ check_all_ports_link_status(uint32_t port_mask) + "Port%d Link Up. speed %u Mbps- %s\n", + portid, link.link_speed, + (link.link_duplex == ETH_LINK_FULL_DUPLEX) ? +- ("full-duplex") : ("half-duplex\n")); ++ ("full-duplex") : ("half-duplex")); + else + printf("Port %d Link Down\n", portid); + continue; +@@ -2790,6 +2794,7 @@ rmv_port_callback(void *arg) + int need_to_start = 0; + int org_no_link_check = no_link_check; + portid_t port_id = (intptr_t)arg; ++ struct rte_device *dev; - /* Joining a cluster. */ - bool raft_is_joining(const struct raft *); -diff --git a/ovsdb/storage.c b/ovsdb/storage.c -index e26252b066..7b4ad16f60 100644 ---- a/ovsdb/storage.c -+++ b/ovsdb/storage.c -@@ -26,6 +26,7 @@ - #include "ovsdb.h" - #include "raft.h" - #include "random.h" -+#include "simap.h" - #include "timeval.h" - #include "util.h" + RTE_ETH_VALID_PORTID_OR_RET(port_id); -@@ -188,6 +189,15 @@ ovsdb_storage_get_applied_index(const struct ovsdb_storage *storage) - return storage->raft ? 
raft_get_applied_index(storage->raft) : 0; +@@ -2800,8 +2805,12 @@ rmv_port_callback(void *arg) + no_link_check = 1; + stop_port(port_id); + no_link_check = org_no_link_check; ++ ++ /* Save rte_device pointer before closing ethdev port */ ++ dev = rte_eth_devices[port_id].device; + close_port(port_id); +- detach_port_device(port_id); ++ detach_device(dev); /* might be already removed or have more ports */ ++ + if (need_to_start) + start_packet_forwarding(0); } +@@ -3184,6 +3193,8 @@ get_eth_dcb_conf(portid_t pid, struct rte_eth_conf *eth_conf, + struct rte_eth_dcb_tx_conf *tx_conf = + ð_conf->tx_adv_conf.dcb_tx_conf; -+void -+ovsdb_storage_get_memory_usage(const struct ovsdb_storage *storage, -+ struct simap *usage) -+{ -+ if (storage->raft) { -+ raft_get_memory_usage(storage->raft, usage); -+ } -+} ++ memset(&rss_conf, 0, sizeof(struct rte_eth_rss_conf)); + - void - ovsdb_storage_run(struct ovsdb_storage *storage) - { -diff --git a/ovsdb/storage.h b/ovsdb/storage.h -index 8a9bbab709..a223968912 100644 ---- a/ovsdb/storage.h -+++ b/ovsdb/storage.h -@@ -23,6 +23,7 @@ - struct json; - struct ovsdb_schema; - struct ovsdb_storage; -+struct simap; - struct uuid; + rc = rte_eth_dev_rss_hash_conf_get(pid, &rss_conf); + if (rc != 0) + return rc; +@@ -3570,5 +3581,10 @@ main(int argc, char** argv) + return 1; + } - struct ovsdb_error *ovsdb_storage_open(const char *filename, bool rw, -@@ -39,6 +40,8 @@ bool ovsdb_storage_is_leader(const struct ovsdb_storage *); - const struct uuid *ovsdb_storage_get_cid(const struct ovsdb_storage *); - const struct uuid *ovsdb_storage_get_sid(const struct ovsdb_storage *); - uint64_t ovsdb_storage_get_applied_index(const struct ovsdb_storage *); -+void ovsdb_storage_get_memory_usage(const struct ovsdb_storage *, -+ struct simap *usage); +- return 0; ++ ret = rte_eal_cleanup(); ++ if (ret != 0) ++ rte_exit(EXIT_FAILURE, ++ "EAL cleanup failed: %s\n", strerror(-ret)); ++ ++ return EXIT_SUCCESS; + } +diff --git a/dpdk/app/test-pmd/testpmd.h b/dpdk/app/test-pmd/testpmd.h +index 217d577018..0694e1ef8b 100644 +--- a/dpdk/app/test-pmd/testpmd.h ++++ b/dpdk/app/test-pmd/testpmd.h +@@ -797,7 +797,7 @@ void stop_port(portid_t pid); + void close_port(portid_t pid); + void reset_port(portid_t pid); + void attach_port(char *identifier); +-void detach_device(char *identifier); ++void detach_devargs(char *identifier); + void detach_port_device(portid_t port_id); + int all_ports_stopped(void); + int port_is_stopped(portid_t port_id); +diff --git a/dpdk/app/test-pmd/txonly.c b/dpdk/app/test-pmd/txonly.c +index 3caf281cb8..8ed436def5 100644 +--- a/dpdk/app/test-pmd/txonly.c ++++ b/dpdk/app/test-pmd/txonly.c +@@ -45,8 +45,8 @@ uint16_t tx_udp_src_port = 9; + uint16_t tx_udp_dst_port = 9; - void ovsdb_storage_run(struct ovsdb_storage *); - void ovsdb_storage_wait(struct ovsdb_storage *); --- -2.25.1 - - -From 92a1e56c8a37927441fb1742e6054a9118654ef0 Mon Sep 17 00:00:00 2001 -From: Ilya Maximets -Date: Thu, 14 May 2020 22:10:45 +0200 -Subject: [PATCH 14/15] ovsdb-server: Fix schema leak while reading db. - -[ upstream commit 16e3a80cf646f6c53d22ef98599d5aecb8310414 ] - -parse_txn() function doesn't always take ownership of the 'schema' -passed. 
So, if the schema of the clustered db has same version as the -one that already in use, parse_txn() will not use it, resulting with a -memory leak: - - 7,827 (56 direct, 7,771 indirect) bytes in 1 blocks are definitely lost - at 0x483BB1A: calloc (vg_replace_malloc.c:762) - by 0x44AD02: xcalloc (util.c:121) - by 0x40E70E: ovsdb_schema_create (ovsdb.c:41) - by 0x40EA6D: ovsdb_schema_from_json (ovsdb.c:217) - by 0x415EDD: ovsdb_storage_read (storage.c:280) - by 0x408968: read_db (ovsdb-server.c:607) - by 0x40733D: main_loop (ovsdb-server.c:227) - by 0x40733D: main (ovsdb-server.c:469) - -While we could put ovsdb_schema_destroy() in a few places inside -'parse_txn()', from the users' point of view it seems better to have a -constant argument and just clone the 'schema' if needed. The caller -will be responsible for destroying the 'schema' it owns. - -Fixes: 1b1d2e6daa56 ("ovsdb: Introduce experimental support for clustered databases.") -Acked-by: Han Zhou -Signed-off-by: Ilya Maximets - -Related: #1834838 -Signed-off-by: Ilya Maximets ---- - ovsdb/ovsdb-server.c | 5 +++-- - 1 file changed, 3 insertions(+), 2 deletions(-) - -diff --git a/ovsdb/ovsdb-server.c b/ovsdb/ovsdb-server.c -index d416f1b606..ef4e996df2 100644 ---- a/ovsdb/ovsdb-server.c -+++ b/ovsdb/ovsdb-server.c -@@ -540,7 +540,7 @@ close_db(struct server_config *config, struct db *db, char *comment) + /* use RFC5735 / RFC2544 reserved network test addresses */ +-uint32_t tx_ip_src_addr = (192U << 24) | (18 << 16) | (0 << 8) | 1; +-uint32_t tx_ip_dst_addr = (192U << 24) | (18 << 16) | (0 << 8) | 2; ++uint32_t tx_ip_src_addr = (198U << 24) | (18 << 16) | (0 << 8) | 1; ++uint32_t tx_ip_dst_addr = (198U << 24) | (18 << 16) | (0 << 8) | 2; - static struct ovsdb_error * OVS_WARN_UNUSED_RESULT - parse_txn(struct server_config *config, struct db *db, -- struct ovsdb_schema *schema, const struct json *txn_json, -+ const struct ovsdb_schema *schema, const struct json *txn_json, - const struct uuid *txnid) + #define IP_DEFTTL 64 /* from RFC 1340. */ + +@@ -153,7 +153,6 @@ pkt_burst_prepare(struct rte_mbuf *pkt, struct rte_mempool *mbp, + const uint16_t vlan_tci_outer, const uint64_t ol_flags) { - if (schema && (!db->db->schema || strcmp(schema->version, -@@ -565,7 +565,7 @@ parse_txn(struct server_config *config, struct db *db, - ? xasprintf("database %s schema changed", db->db->name) - : xasprintf("database %s connected to storage", db->db->name))); + struct rte_mbuf *pkt_segs[RTE_MAX_SEGS_PER_PKT]; +- uint8_t ip_var = RTE_PER_LCORE(_ip_var); + struct rte_mbuf *pkt_seg; + uint32_t nb_segs, pkt_len; + uint8_t i; +@@ -192,6 +191,7 @@ pkt_burst_prepare(struct rte_mbuf *pkt, struct rte_mempool *mbp, + copy_buf_to_pkt(&pkt_ip_hdr, sizeof(pkt_ip_hdr), pkt, + sizeof(struct rte_ether_hdr)); + if (txonly_multi_flow) { ++ uint8_t ip_var = RTE_PER_LCORE(_ip_var); + struct rte_ipv4_hdr *ip_hdr; + uint32_t addr; -- ovsdb_replace(db->db, ovsdb_create(schema, NULL)); -+ ovsdb_replace(db->db, ovsdb_create(ovsdb_schema_clone(schema), NULL)); +@@ -207,6 +207,7 @@ pkt_burst_prepare(struct rte_mbuf *pkt, struct rte_mempool *mbp, + */ + addr = (tx_ip_dst_addr | (ip_var++ << 8)) + rte_lcore_id(); + ip_hdr->src_addr = rte_cpu_to_be_32(addr); ++ RTE_PER_LCORE(_ip_var) = ip_var; + } + copy_buf_to_pkt(&pkt_udp_hdr, sizeof(pkt_udp_hdr), pkt, + sizeof(struct rte_ether_hdr) + +@@ -314,7 +315,7 @@ pkt_burst_transmit(struct fwd_stream *fs) + fs->tx_packets += nb_tx; - /* Force update to schema in _Server database. 
*/ - db->row_uuid = UUID_ZERO; -@@ -614,6 +614,7 @@ read_db(struct server_config *config, struct db *db) - } else { - error = parse_txn(config, db, schema, txn_json, &txnid); - json_destroy(txn_json); -+ ovsdb_schema_destroy(schema); - if (error) { - break; - } --- -2.25.1 - - -From 3168eba559cbce28937be4e785c3337030694455 Mon Sep 17 00:00:00 2001 -From: Ilya Maximets -Date: Fri, 22 May 2020 22:36:27 +0200 -Subject: [PATCH 15/15] raft: Avoid sending equal snapshots. - -[ upstream commit 8c2c503bdb0da1ce6044a53d462f905fd4f8acf5 ] - -Snapshots are huge. In some cases we could receive several outdated -append replies from the remote server. This could happen in high -scale cases if the remote server is overloaded and not able to process -all the raft requests in time. As an action to each outdated append -reply we're sending full database snapshot. While remote server is -already overloaded those snapshots will stuck in jsonrpc backlog for -a long time making it grow up to few GB. Since remote server wasn't -able to timely process incoming messages it will likely not able to -process snapshots leading to the same situation with low chances to -recover. Remote server will likely stuck in 'candidate' state, other -servers will grow their memory consumption due to growing jsonrpc -backlogs: - -jsonrpc|INFO|excessive sending backlog, jsonrpc: ssl:192.16.0.3:6644, - num of msgs: 3795, backlog: 8838994624. - -This patch is trying to avoid that situation by avoiding sending of -equal snapshot install requests. This helps maintain reasonable memory -consumption and allows the cluster to recover on a larger scale. - -Acked-by: Han Zhou -Signed-off-by: Ilya Maximets - -Related: #1834838 -Signed-off-by: Ilya Maximets ---- - ovsdb/raft-private.c | 1 + - ovsdb/raft-private.h | 4 ++++ - ovsdb/raft.c | 39 ++++++++++++++++++++++++++++++++++++++- - 3 files changed, 43 insertions(+), 1 deletion(-) - -diff --git a/ovsdb/raft-private.c b/ovsdb/raft-private.c -index 26d39a087f..9468fdaf4a 100644 ---- a/ovsdb/raft-private.c -+++ b/ovsdb/raft-private.c -@@ -137,6 +137,7 @@ raft_server_destroy(struct raft_server *s) - if (s) { - free(s->address); - free(s->nickname); -+ free(s->last_install_snapshot_request); - free(s); - } - } -diff --git a/ovsdb/raft-private.h b/ovsdb/raft-private.h -index ac8656d42f..1f366b4ab3 100644 ---- a/ovsdb/raft-private.h -+++ b/ovsdb/raft-private.h -@@ -27,6 +27,7 @@ + if (txonly_multi_flow) +- RTE_PER_LCORE(_ip_var) += nb_tx; ++ RTE_PER_LCORE(_ip_var) -= nb_pkt - nb_tx; - struct ds; - struct ovsdb_parser; -+struct raft_install_snapshot_request; - - /* Formatting server IDs and cluster IDs for use in human-readable logs. Do - * not use these in cases where the whole server or cluster ID is needed; use -@@ -83,6 +84,9 @@ struct raft_server { - bool replied; /* Reply to append_request was received from this - node during current election_timeout interval. - */ -+ /* Copy of the last install_snapshot_request sent to this server. */ -+ struct raft_install_snapshot_request *last_install_snapshot_request; -+ - /* For use in adding and removing servers: */ - struct uuid requester_sid; /* Nonzero if requested via RPC. */ - struct unixctl_conn *requester_conn; /* Only if requested via unixctl. 
*/ -diff --git a/ovsdb/raft.c b/ovsdb/raft.c -index 6ca63b4352..8df386fa19 100644 ---- a/ovsdb/raft.c -+++ b/ovsdb/raft.c -@@ -1421,8 +1421,20 @@ raft_conn_run(struct raft *raft, struct raft_conn *conn) - jsonrpc_session_run(conn->js); + #ifdef RTE_TEST_PMD_RECORD_BURST_STATS + fs->tx_burst_stats.pkt_burst_spread[nb_tx]++; +diff --git a/dpdk/app/test-pmd/util.c b/dpdk/app/test-pmd/util.c +index b514be5e16..4e4ead3075 100644 +--- a/dpdk/app/test-pmd/util.c ++++ b/dpdk/app/test-pmd/util.c +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: BSD-3-Clause + * Copyright(c) 2010-2014 Intel Corporation +- * Copyright(c) 2018 Mellanox Technology ++ * Copyright 2018 Mellanox Technologies, Ltd + */ - unsigned int new_seqno = jsonrpc_session_get_seqno(conn->js); -- bool just_connected = (new_seqno != conn->js_seqno -+ bool reconnected = new_seqno != conn->js_seqno; -+ bool just_connected = (reconnected - && jsonrpc_session_is_connected(conn->js)); -+ -+ if (reconnected) { -+ /* Clear 'last_install_snapshot_request' since it might not reach the -+ * destination or server was restarted. */ -+ struct raft_server *server = raft_find_server(raft, &conn->sid); -+ if (server) { -+ free(server->last_install_snapshot_request); -+ server->last_install_snapshot_request = NULL; -+ } -+ } -+ - conn->js_seqno = new_seqno; - if (just_connected) { - if (raft->joining) { -@@ -3296,6 +3308,31 @@ raft_send_install_snapshot_request(struct raft *raft, - .election_timer = raft->election_timer, /* use latest value */ - } - }; -+ -+ if (s->last_install_snapshot_request) { -+ struct raft_install_snapshot_request *old, *new; + #include +diff --git a/dpdk/app/test/Makefile b/dpdk/app/test/Makefile +index 57930c00b1..1ee1550094 100644 +--- a/dpdk/app/test/Makefile ++++ b/dpdk/app/test/Makefile +@@ -151,8 +151,12 @@ SRCS-y += test_func_reentrancy.c + + SRCS-y += test_service_cores.c + ++ifeq ($(CONFIG_RTE_LIBRTE_PMD_RING),y) ++SRCS-y += sample_packet_forward.c + SRCS-$(CONFIG_RTE_LIBRTE_BITRATE) += test_bitratestats.c + SRCS-$(CONFIG_RTE_LIBRTE_LATENCY_STATS) += test_latencystats.c ++SRCS-$(CONFIG_RTE_LIBRTE_PDUMP) += test_pdump.c ++endif + + SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += test_cmdline.c + SRCS-$(CONFIG_RTE_LIBRTE_CMDLINE) += test_cmdline_num.c +@@ -181,11 +185,8 @@ SRCS-$(CONFIG_RTE_LIBRTE_DISTRIBUTOR) += test_distributor_perf.c + + SRCS-$(CONFIG_RTE_LIBRTE_REORDER) += test_reorder.c + +-SRCS-$(CONFIG_RTE_LIBRTE_PDUMP) += test_pdump.c +- + SRCS-y += virtual_pmd.c + SRCS-y += packet_burst_generator.c +-SRCS-y += sample_packet_forward.c + SRCS-$(CONFIG_RTE_LIBRTE_ACL) += test_acl.c + + ifeq ($(CONFIG_RTE_LIBRTE_PMD_RING),y) +@@ -215,7 +216,7 @@ ifeq ($(CONFIG_RTE_LIBRTE_EVENTDEV),y) + SRCS-y += test_eventdev.c + SRCS-y += test_event_ring.c + SRCS-y += test_event_eth_rx_adapter.c +-SRCS-y += test_event_eth_tx_adapter.c ++SRCS-$(CONFIG_RTE_LIBRTE_PMD_RING) += test_event_eth_tx_adapter.c + SRCS-y += test_event_timer_adapter.c + SRCS-y += test_event_crypto_adapter.c + endif +@@ -268,13 +269,6 @@ endif + endif + endif + +-# Link against shared libraries when needed +-ifeq ($(CONFIG_RTE_LIBRTE_PMD_BOND),y) +-ifneq ($(CONFIG_RTE_LIBRTE_PMD_RING),y) +-$(error Link bonding tests require CONFIG_RTE_LIBRTE_PMD_RING=y) +-endif +-endif +- + ifeq ($(CONFIG_RTE_BUILD_SHARED_LIB),y) + + ifeq ($(CONFIG_RTE_LIBRTE_PMD_BOND),y) +diff --git a/dpdk/app/test/get-coremask.sh b/dpdk/app/test/get-coremask.sh +new file mode 100755 +index 0000000000..bb8cf404d2 +--- /dev/null ++++ b/dpdk/app/test/get-coremask.sh +@@ -0,0 +1,13 @@ ++#! 
/bin/sh -e ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright(c) 2019 Intel Corporation + -+ old = s->last_install_snapshot_request; -+ new = &rpc.install_snapshot_request; -+ if ( old->term == new->term -+ && old->last_index == new->last_index -+ && old->last_term == new->last_term -+ && old->last_servers == new->last_servers -+ && old->data == new->data -+ && old->election_timer == new->election_timer -+ && uuid_equals(&old->last_eid, &new->last_eid)) { -+ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5); ++if [ "$(uname)" = "Linux" ] ; then ++ cat /sys/devices/system/cpu/present ++elif [ "$(uname)" = "FreeBSD" ] ; then ++ ncpus=$(/sbin/sysctl -n hw.ncpu) ++ echo 0-$(expr $ncpus - 1) ++else ++# fallback ++ echo 0-3 ++fi +diff --git a/dpdk/app/test/meson.build b/dpdk/app/test/meson.build +index fb49d804ba..8524a986a1 100644 +--- a/dpdk/app/test/meson.build ++++ b/dpdk/app/test/meson.build +@@ -7,13 +7,11 @@ endif + + test_sources = files('commands.c', + 'packet_burst_generator.c', +- 'sample_packet_forward.c', + 'test.c', + 'test_acl.c', + 'test_alarm.c', + 'test_atomic.c', + 'test_barrier.c', +- 'test_bitratestats.c', + 'test_bpf.c', + 'test_byteorder.c', + 'test_cmdline.c', +@@ -43,7 +41,6 @@ test_sources = files('commands.c', + 'test_event_crypto_adapter.c', + 'test_event_eth_rx_adapter.c', + 'test_event_ring.c', +- 'test_event_eth_tx_adapter.c', + 'test_event_timer_adapter.c', + 'test_eventdev.c', + 'test_external_mem.c', +@@ -65,9 +62,7 @@ test_sources = files('commands.c', + 'test_ipsec_sad.c', + 'test_kni.c', + 'test_kvargs.c', +- 'test_latencystats.c', + 'test_link_bonding.c', +- 'test_link_bonding_mode4.c', + 'test_link_bonding_rssconf.c', + 'test_logs.c', + 'test_lpm.c', +@@ -88,11 +83,8 @@ test_sources = files('commands.c', + 'test_metrics.c', + 'test_mcslock.c', + 'test_mp_secondary.c', +- 'test_pdump.c', + 'test_per_lcore.c', + 'test_pmd_perf.c', +- 'test_pmd_ring.c', +- 'test_pmd_ring_perf.c', + 'test_power.c', + 'test_power_cpufreq.c', + 'test_power_kvm_vm.c', +@@ -212,7 +204,6 @@ fast_test_names = [ + 'rib_autotest', + 'rib6_autotest', + 'ring_autotest', +- 'ring_pmd_autotest', + 'rwlock_test1_autotest', + 'rwlock_rda_autotest', + 'rwlock_rds_wrm_autotest', +@@ -227,7 +218,6 @@ fast_test_names = [ + 'timer_autotest', + 'user_delay_us', + 'version_autotest', +- 'bitratestats_autotest', + 'crc_autotest', + 'delay_us_sleep_autotest', + 'distributor_autotest', +@@ -238,10 +228,8 @@ fast_test_names = [ + 'ipsec_autotest', + 'kni_autotest', + 'kvargs_autotest', +- 'latencystats_autotest', + 'member_autotest', + 'metrics_autotest', +- 'pdump_autotest', + 'power_cpufreq_autotest', + 'power_autotest', + 'power_kvm_vm_autotest', +@@ -277,7 +265,6 @@ perf_test_names = [ + 'rcu_qsbr_perf_autotest', + 'red_perf', + 'distributor_perf_autotest', +- 'ring_pmd_perf_autotest', + 'pmd_perf_autotest', + 'stack_perf_autotest', + 'stack_lf_perf_autotest', +@@ -302,7 +289,6 @@ driver_test_names = [ + 'eventdev_selftest_octeontx', + 'eventdev_selftest_sw', + 'link_bonding_autotest', +- 'link_bonding_mode4_autotest', + 'link_bonding_rssconf_autotest', + 'rawdev_autotest', + ] +@@ -339,6 +325,21 @@ if dpdk_conf.has('RTE_LIBRTE_BOND_PMD') + endif + if dpdk_conf.has('RTE_LIBRTE_RING_PMD') + test_deps += 'pmd_ring' ++ test_sources += 'test_pmd_ring_perf.c' ++ test_sources += 'test_pmd_ring.c' ++ test_sources += 'test_event_eth_tx_adapter.c' ++ test_sources += 'test_bitratestats.c' ++ test_sources += 'test_latencystats.c' ++ test_sources += 'test_link_bonding_mode4.c' ++ test_sources 
+= 'sample_packet_forward.c' ++ test_sources += 'test_pdump.c' ++ fast_test_names += 'ring_pmd_autotest' ++ perf_test_names += 'ring_pmd_perf_autotest' ++ fast_test_names += 'event_eth_tx_adapter_autotest' ++ fast_test_names += 'bitratestats_autotest' ++ fast_test_names += 'latencystats_autotest' ++ driver_test_names += 'link_bonding_mode4_autotest' ++ fast_test_names += 'pdump_autotest' + endif + + if dpdk_conf.has('RTE_LIBRTE_POWER') +@@ -398,45 +399,36 @@ dpdk_test = executable('dpdk-test', + timeout_seconds = 600 + timeout_seconds_fast = 10 + +-# Retrieve the number of CPU cores, defaulting to 4. +-num_cores = '0-3' +-if host_machine.system() == 'linux' +- num_cores = run_command('cat', +- '/sys/devices/system/cpu/present' +- ).stdout().strip() +-elif host_machine.system() == 'freebsd' +- snum_cores = run_command('/sbin/sysctl', '-n', +- 'hw.ncpu').stdout().strip() +- inum_cores = snum_cores.to_int() - 1 +- num_cores = '0-@0@'.format(inum_cores) +-endif ++get_coremask = find_program('get-coremask.sh') ++num_cores_arg = '-l ' + run_command(get_coremask).stdout().strip() + +-num_cores_arg = '-l ' + num_cores ++default_test_args = [num_cores_arg] + +-test_args = [num_cores_arg] + foreach arg : fast_test_names +- if host_machine.system() == 'linux' +- test(arg, dpdk_test, +- env : ['DPDK_TEST=' + arg], +- args : test_args + +- ['--file-prefix=@0@'.format(arg)], +- timeout : timeout_seconds_fast, +- is_parallel : false, +- suite : 'fast-tests') +- else +- test(arg, dpdk_test, +- env : ['DPDK_TEST=' + arg], +- args : test_args, ++ test_args = default_test_args + -+ VLOG_WARN_RL(&rl, "not sending exact same install_snapshot_request" -+ " to server %s again", s->nickname); -+ return; -+ } -+ } -+ free(s->last_install_snapshot_request); -+ CONST_CAST(struct raft_server *, s)->last_install_snapshot_request -+ = xmemdup(&rpc.install_snapshot_request, -+ sizeof rpc.install_snapshot_request); ++ if (get_option('default_library') == 'shared' and ++ arg == 'event_eth_tx_adapter_autotest') ++ foreach drv:dpdk_drivers ++ test_args += ['-d', drv.full_path().split('.a')[0] + '.so'] ++ endforeach ++ endif ++ if is_linux ++ test_args += ['--file-prefix=@0@'.format(arg)] ++ endif + - raft_send(raft, &rpc); - } ++ test(arg, dpdk_test, ++ env : ['DPDK_TEST=' + arg], ++ args : test_args, + timeout : timeout_seconds_fast, + is_parallel : false, + suite : 'fast-tests') +- endif + endforeach --- -2.25.1 - -diff --git a/dpdk/drivers/bus/pci/linux/pci_vfio.c b/dpdk/drivers/bus/pci/linux/pci_vfio.c -index 64cd84a689..ba60e7ce99 100644 ---- a/dpdk/drivers/bus/pci/linux/pci_vfio.c -+++ b/dpdk/drivers/bus/pci/linux/pci_vfio.c -@@ -149,6 +149,38 @@ pci_vfio_get_msix_bar(int fd, struct pci_msix_table *msix_table) - return 0; + foreach arg : perf_test_names + test(arg, dpdk_test, + env : ['DPDK_TEST=' + arg], +- args : test_args, ++ args : default_test_args, + timeout : timeout_seconds, + is_parallel : false, + suite : 'perf-tests') +@@ -445,7 +437,7 @@ endforeach + foreach arg : driver_test_names + test(arg, dpdk_test, + env : ['DPDK_TEST=' + arg], +- args : test_args, ++ args : default_test_args, + timeout : timeout_seconds, + is_parallel : false, + suite : 'driver-tests') +@@ -454,7 +446,7 @@ endforeach + foreach arg : dump_test_names + test(arg, dpdk_test, + env : ['DPDK_TEST=' + arg], +- args : test_args, ++ args : default_test_args, + timeout : timeout_seconds, + is_parallel : false, + suite : 'debug-tests') +diff --git a/dpdk/app/test/process.h b/dpdk/app/test/process.h +index 191d2796a9..c3b3780337 100644 +--- 
a/dpdk/app/test/process.h ++++ b/dpdk/app/test/process.h +@@ -25,10 +25,12 @@ + #endif + + #ifdef RTE_LIBRTE_PDUMP ++#ifdef RTE_LIBRTE_RING_PMD + #include + extern void *send_pkts(void *empty); + extern uint16_t flag_for_send_pkts; + #endif ++#endif + + /* + * launches a second copy of the test process using the given argv parameters, +@@ -44,7 +46,9 @@ process_dup(const char *const argv[], int numargs, const char *env_value) + int i, status; + char path[32]; + #ifdef RTE_LIBRTE_PDUMP ++#ifdef RTE_LIBRTE_RING_PMD + pthread_t thread; ++#endif + #endif + + pid_t pid = fork(); +@@ -121,17 +125,21 @@ process_dup(const char *const argv[], int numargs, const char *env_value) + } + /* parent process does a wait */ + #ifdef RTE_LIBRTE_PDUMP ++#ifdef RTE_LIBRTE_RING_PMD + if ((strcmp(env_value, "run_pdump_server_tests") == 0)) + pthread_create(&thread, NULL, &send_pkts, NULL); ++#endif + #endif + + while (wait(&status) != pid) + ; + #ifdef RTE_LIBRTE_PDUMP ++#ifdef RTE_LIBRTE_RING_PMD + if ((strcmp(env_value, "run_pdump_server_tests") == 0)) { + flag_for_send_pkts = 0; + pthread_join(thread, NULL); + } ++#endif + #endif + return status; } +diff --git a/dpdk/app/test/test.c b/dpdk/app/test/test.c +index cd7aaf645f..d0826ca69e 100644 +--- a/dpdk/app/test/test.c ++++ b/dpdk/app/test/test.c +@@ -53,7 +53,9 @@ do_recursive_call(void) + } actions[] = { + { "run_secondary_instances", test_mp_secondary }, + #ifdef RTE_LIBRTE_PDUMP ++#ifdef RTE_LIBRTE_RING_PMD + { "run_pdump_server_tests", test_pdump }, ++#endif + #endif + { "test_missing_c_flag", no_action }, + { "test_master_lcore_flag", no_action }, +diff --git a/dpdk/app/test/test.h b/dpdk/app/test/test.h +index ac0c50616c..b07f6c1ef0 100644 +--- a/dpdk/app/test/test.h ++++ b/dpdk/app/test/test.h +@@ -22,8 +22,6 @@ + # define TEST_TRACE_FAILURE(_file, _line, _func) + #endif -+/* enable PCI bus memory space */ -+static int -+pci_vfio_enable_bus_memory(int dev_fd) -+{ -+ uint16_t cmd; -+ int ret; -+ -+ ret = pread64(dev_fd, &cmd, sizeof(cmd), -+ VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + -+ PCI_COMMAND); +-#define RTE_TEST_TRACE_FAILURE TEST_TRACE_FAILURE +- + #include + + #define TEST_ASSERT RTE_TEST_ASSERT +diff --git a/dpdk/app/test/test_acl.c b/dpdk/app/test/test_acl.c +index 9cd9e37dbe..b78b67193a 100644 +--- a/dpdk/app/test/test_acl.c ++++ b/dpdk/app/test/test_acl.c +@@ -1394,16 +1394,18 @@ test_invalid_parameters(void) + } else + rte_acl_free(acx); + +- /* invalid NUMA node */ +- memcpy(¶m, &acl_param, sizeof(param)); +- param.socket_id = RTE_MAX_NUMA_NODES + 1; +- +- acx = rte_acl_create(¶m); +- if (acx != NULL) { +- printf("Line %i: ACL context creation with invalid NUMA " +- "should have failed!\n", __LINE__); +- rte_acl_free(acx); +- return -1; ++ if (rte_eal_has_hugepages()) { ++ /* invalid NUMA node */ ++ memcpy(¶m, &acl_param, sizeof(param)); ++ param.socket_id = RTE_MAX_NUMA_NODES + 1; + -+ if (ret != sizeof(cmd)) { -+ RTE_LOG(ERR, EAL, "Cannot read command from PCI config space!\n"); -+ return -1; ++ acx = rte_acl_create(¶m); ++ if (acx != NULL) { ++ printf("Line %i: ACL context creation with invalid " ++ "NUMA should have failed!\n", __LINE__); ++ rte_acl_free(acx); ++ return -1; ++ } + } + + /* NULL name */ +diff --git a/dpdk/app/test/test_common.c b/dpdk/app/test/test_common.c +index 2b856f8ba5..12bd1cad90 100644 +--- a/dpdk/app/test/test_common.c ++++ b/dpdk/app/test/test_common.c +@@ -216,7 +216,19 @@ test_log2(void) + const uint32_t max = 0x10000; + const uint32_t step = 1; + +- for (i = 0; i < max; i = i + step) { ++ 
compare = rte_log2_u32(0); ++ if (compare != 0) { ++ printf("Wrong rte_log2_u32(0) val %x, expected 0\n", compare); ++ return TEST_FAILED; + } + -+ if (cmd & PCI_COMMAND_MEMORY) -+ return 0; -+ -+ cmd |= PCI_COMMAND_MEMORY; -+ ret = pwrite64(dev_fd, &cmd, sizeof(cmd), -+ VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) + -+ PCI_COMMAND); -+ -+ if (ret != sizeof(cmd)) { -+ RTE_LOG(ERR, EAL, "Cannot write command to PCI config space!\n"); -+ return -1; ++ compare = rte_log2_u64(0); ++ if (compare != 0) { ++ printf("Wrong rte_log2_u64(0) val %x, expected 0\n", compare); ++ return TEST_FAILED; + } + -+ return 0; -+} -+ - /* set PCI bus mastering */ - static int - pci_vfio_set_bus_master(int dev_fd, bool op) -@@ -427,6 +459,11 @@ pci_rte_vfio_setup_device(struct rte_pci_device *dev, int vfio_dev_fd) - return -1; - } ++ for (i = 1; i < max; i = i + step) { + uint64_t i64; -+ if (pci_vfio_enable_bus_memory(vfio_dev_fd)) { -+ RTE_LOG(ERR, EAL, "Cannot enable bus memory!\n"); -+ return -1; -+ } + /* extend range for 64-bit */ +diff --git a/dpdk/app/test/test_compressdev_test_buffer.h b/dpdk/app/test/test_compressdev_test_buffer.h +index c0492f89a2..d241602445 100644 +--- a/dpdk/app/test/test_compressdev_test_buffer.h ++++ b/dpdk/app/test/test_compressdev_test_buffer.h +@@ -1,3 +1,7 @@ ++/* SPDX-License-Identifier: (BSD-3-Clause) ++ * Copyright(c) 2018-2020 Intel Corporation ++ */ + - /* set bus mastering for the device */ - if (pci_vfio_set_bus_master(vfio_dev_fd, true)) { - RTE_LOG(ERR, EAL, "Cannot set up bus mastering!\n"); -diff --git a/dpdk/lib/librte_vhost/vhost_user.c b/dpdk/lib/librte_vhost/vhost_user.c -index 40c4520c08..8954f7930e 100644 ---- a/dpdk/lib/librte_vhost/vhost_user.c -+++ b/dpdk/lib/librte_vhost/vhost_user.c -@@ -206,7 +206,7 @@ vhost_backend_cleanup(struct virtio_net *dev) - dev->inflight_info->addr = NULL; - } + #ifndef TEST_COMPRESSDEV_TEST_BUFFERS_H_ + #define TEST_COMPRESSDEV_TEST_BUFFERS_H_ -- if (dev->inflight_info->fd > 0) { -+ if (dev->inflight_info->fd >= 0) { - close(dev->inflight_info->fd); - dev->inflight_info->fd = -1; - } -@@ -1408,6 +1408,7 @@ vhost_user_get_inflight_fd(struct virtio_net **pdev, - "failed to alloc dev inflight area\n"); - return RTE_VHOST_MSG_RESULT_ERR; - } -+ dev->inflight_info->fd = -1; - } +@@ -190,106 +194,104 @@ static const char test_buf_shakespeare[] = + "\n" + "ORLANDO Go apart, Adam, and thou shalt hear how he will\n"; - num_queues = msg->payload.inflight.num_queues; -@@ -1433,6 +1434,16 @@ vhost_user_get_inflight_fd(struct virtio_net **pdev, +-/* Snippet of source code in Pascal */ +-static const char test_buf_pascal[] = +- " Ptr = 1..DMem;\n" +- " Loc = 1..IMem;\n" +- " Loc0 = 0..IMem;\n" +- " EdgeT = (hout,lin,hin,lout); {Warning this order is important in}\n" +- " {predicates such as gtS,geS}\n" +- " CardT = (finite,infinite);\n" +- " ExpT = Minexp..Maxexp;\n" +- " ManT = Mininf..Maxinf; \n" +- " Pflag = (PNull,PSoln,PTrace,PPrint);\n" +- " Sreal = record\n" +- " edge:EdgeT;\n" +- " cardinality:CardT;\n" +- " exp:ExpT; {exponent}\n" +- " mantissa:ManT;\n" +- " end;\n" +- " Int = record\n" +- " hi:Sreal;\n" +- " lo:Sreal;\n" +- " end;\n" +- " Instr = record\n" +- " Code:OpType;\n" +- " Pars: array[0..Par] of 0..DMem;\n" +- " end;\n" +- " DataMem= record\n" +- " D :array [Ptr] of Int;\n" +- " S :array [Loc] of State;\n" +- " LastHalve:Loc;\n" +- " RHalve :array [Loc] of real;\n" +- " end;\n" +- " DataFlags=record\n" +- " PF :array [Ptr] of Pflag;\n" +- " end;\n" +- "var\n" +- " Debug : (none,activity,post,trace,dump);\n" +- " Cut 
: (once,all);\n" +- " GlobalEnd,Verifiable:boolean;\n" +- " HalveThreshold:real;\n" +- " I : array [Loc] of Instr; {Memory holding instructions}\n" +- " End : Loc; {last instruction in I}\n" +- " ParN : array [OpType] of -1..Par; {number of parameters for each \n" +- " opcode. -1 means no result}\n" +- " ParIntersect : array [OpType] of boolean ;\n" +- " DInit : DataMem; {initial memory which is cleared and \n" +- " used in first call}\n" +- " DF : DataFlags; {hold flags for variables, e.g. print/trace}\n" +- " MaxDMem:0..DMem;\n" +- " Shift : array[0..Digits] of 1..maxint;{array of constant multipliers}\n" +- " {used for alignment etc.}\n" +- " Dummy :Positive;\n" +- " {constant intervals and Sreals}\n" +- " PlusInfS,MinusInfS,PlusSmallS,MinusSmallS,ZeroS,\n" +- " PlusFiniteS,MinusFiniteS:Sreal;\n" +- " Zero,All,AllFinite:Int;\n" +- "\n" +- "procedure deblank;\n" +- "var Ch:char;\n" +- "begin\n" +- " while (not eof) and (input^ in [' ',' ']) do read(Ch);\n" +- "end;\n" +- "\n" +- "procedure InitialOptions;\n" +- "\n" +- "#include '/user/profs/cleary/bin/options.i';\n" +- "\n" +- " procedure Option;\n" +- " begin\n" +- " case Opt of\n" +- " 'a','A':Debug:=activity;\n" +- " 'd','D':Debug:=dump;\n" +- " 'h','H':HalveThreshold:=StringNum/100;\n" +- " 'n','N':Debug:=none;\n" +- " 'p','P':Debug:=post;\n" +- " 't','T':Debug:=trace;\n" +- " 'v','V':Verifiable:=true;\n" +- " end;\n" +- " end;\n" +- "\n" +- "begin\n" +- " Debug:=trace;\n" +- " Verifiable:=false;\n" +- " HalveThreshold:=67/100;\n" +- " Options;\n" +- " writeln(Debug);\n" +- " writeln('Verifiable:',Verifiable);\n" +- " writeln('Halve threshold',HalveThreshold);\n" +- "end;{InitialOptions}\n" +- "\n" +- "procedure NormalizeUp(E,M:integer;var S:Sreal;var Closed:boolean);\n" +- "begin\n" +- "with S do\n" +- "begin\n" +- " if M=0 then S:=ZeroS else\n" +- " if M>0 then\n"; ++/* Snippet of Alice's Adventures in Wonderland */ ++static const char test_buf_alice2[] = ++ "`Curiouser and curiouser!' cried Alice (she was so much\n" ++ "surprised, that for the moment she quite forgot how to speak good\n" ++ "English); `now I'm opening out like the largest telescope that\n" ++ "ever was! Good-bye, feet!' (for when she looked down at her\n" ++ "feet, they seemed to be almost out of sight, they were getting so\n" ++ "far off). `Oh, my poor little feet, I wonder who will put on\n" ++ "your shoes and stockings for you now, dears? I'm sure _I_ shan't\n" ++ "be able! I shall be a great deal too far off to trouble myself\n" ++ "about you: you must manage the best way you can; --but I must be\n" ++ "kind to them,' thought Alice, `or perhaps they won't walk the\n" ++ "way I want to go! Let me see: I'll give them a new pair of\n" ++ "boots every Christmas.'\n" ++ "\n" ++ " And she went on planning to herself how she would manage it.\n" ++ "`They must go by the carrier,' she thought; `and how funny it'll\n" ++ "seem, sending presents to one's own feet! And how odd the\n" ++ "directions will look!\n" ++ "\n" ++ " ALICE'S RIGHT FOOT, ESQ.\n" ++ " HEARTHRUG,\n" ++ " NEAR THE FENDER,\n" ++ " (WITH ALICE'S LOVE).\n" ++ "\n" ++ "Oh dear, what nonsense I'm talking!'\n" ++ "\n" ++ " Just then her head struck against the roof of the hall: in\n" ++ "fact she was now more than nine feet high, and she at once took\n" ++ "up the little golden key and hurried off to the garden door.\n" ++ "\n" ++ " Poor Alice! 
It was as much as she could do, lying down on one\n" ++ "side, to look through into the garden with one eye; but to get\n" ++ "through was more hopeless than ever: she sat down and began to\n" ++ "cry again.\n" ++ "\n" ++ " `You ought to be ashamed of yourself,' said Alice, `a great\n" ++ "girl like you,' (she might well say this), `to go on crying in\n" ++ "this way! Stop this moment, I tell you!' But she went on all\n" ++ "the same, shedding gallons of tears, until there was a large pool\n" ++ "all round her, about four inches deep and reaching half down the\n" ++ "hall.\n" ++ "\n" ++ " After a time she heard a little pattering of feet in the\n" ++ "distance, and she hastily dried her eyes to see what was coming.\n" ++ "It was the White Rabbit returning, splendidly dressed, with a\n" ++ "pair of white kid gloves in one hand and a large fan in the\n" ++ "other: he came trotting along in a great hurry, muttering to\n" ++ "himself as he came, `Oh! the Duchess, the Duchess! Oh! won't she\n" ++ "be savage if I've kept her waiting!' Alice felt so desperate\n" ++ "that she was ready to ask help of any one; so, when the Rabbit\n" ++ "came near her, she began, in a low, timid voice, `If you please,\n" ++ "sir--' The Rabbit started violently, dropped the white kid\n" ++ "gloves and the fan, and skurried away into the darkness as hard\n" ++ "as he could go.\n" ++ "\n" ++ " Alice took up the fan and gloves, and, as the hall was very\n" ++ "hot, she kept fanning herself all the time she went on talking:\n" ++ "`Dear, dear! How queer everything is to-day! And yesterday\n" ++ "things went on just as usual. I wonder if I've been changed in\n" ++ "the night? Let me think: was I the same when I got up this\n" ++ "morning? I almost think I can remember feeling a little\n" ++ "different. But if I'm not the same, the next question is, Who in\n" ++ "the world am I? Ah, THAT'S the great puzzle!' And she began\n" ++ "thinking over all the children she knew that were of the same age\n" ++ "as herself, to see if she could have been changed for any of\n" ++ "them.\n" ++ "\n" ++ " `I'm sure I'm not Ada,' she said, `for her hair goes in such\n" ++ "long ringlets, and mine doesn't go in ringlets at all; and I'm\n" ++ "sure I can't be Mabel, for I know all sorts of things, and she,\n" ++ "oh! she knows such a very little! Besides, SHE'S she, and I'm I,\n" ++ "and--oh dear, how puzzling it all is! I'll try if I know all the\n" ++ "things I used to know. Let me see: four times five is twelve,\n" ++ "and four times six is thirteen, and four times seven is--oh dear!\n" ++ "I shall never get to twenty at that rate! However, the\n" ++ "Multiplication Table doesn't signify: let's try Geography.\n" ++ "London is the capital of Paris, and Paris is the capital of Rome,\n" ++ "and Rome--no, THAT'S all wrong, I'm certain! I must have been\n" ++ "changed for Mabel! 
I'll try and say ''How doth the little--''\n" ++ "and she crossed her hands on her lap as if she were saying lessons,\n" ++ "and began to repeat it, but her voice sounded hoarse and\n" ++ "strange, and the words did not come the same as they used to do:--\n" ++ "\n" ++ " `How doth the little crocodile\n" ++ " Improve his shining tail,\n" ++ " And pour the waters of the Nile\n" ++ " On every golden scale!\n" ++ "\n" ++ " `How cheerfully he seems to grin,\n" ++ " How neatly spread his claws,\n" ++ " And welcome little fishes in\n" ++ " With gently smiling jaws!'\n"; + + static const char * const compress_test_bufs[] = { + test_buf_alice, + test_buf_shakespeare, +- test_buf_pascal ++ test_buf_alice2 + }; + + #endif /* TEST_COMPRESSDEV_TEST_BUFFERS_H_ */ +diff --git a/dpdk/app/test/test_cryptodev.c b/dpdk/app/test/test_cryptodev.c +index 1b561456d7..db9dd3aecb 100644 +--- a/dpdk/app/test/test_cryptodev.c ++++ b/dpdk/app/test/test_cryptodev.c +@@ -143,7 +143,7 @@ static struct rte_crypto_op * + process_crypto_request(uint8_t dev_id, struct rte_crypto_op *op) + { + if (rte_cryptodev_enqueue_burst(dev_id, 0, &op, 1) != 1) { +- printf("Error sending packet for encryption"); ++ RTE_LOG(ERR, USER1, "Error sending packet for encryption\n"); + return NULL; } - memset(addr, 0, mmap_size); -+ if (dev->inflight_info->addr) { -+ munmap(dev->inflight_info->addr, dev->inflight_info->size); -+ dev->inflight_info->addr = NULL; +@@ -152,6 +152,11 @@ process_crypto_request(uint8_t dev_id, struct rte_crypto_op *op) + while (rte_cryptodev_dequeue_burst(dev_id, 0, &op, 1) == 0) + rte_pause(); + ++ if (op->status != RTE_CRYPTO_OP_STATUS_SUCCESS) { ++ RTE_LOG(DEBUG, USER1, "Operation status %d\n", op->status); ++ return NULL; + } + -+ if (dev->inflight_info->fd >= 0) { -+ close(dev->inflight_info->fd); -+ dev->inflight_info->fd = -1; -+ } + return op; + } + +@@ -2823,9 +2828,18 @@ create_wireless_algo_auth_cipher_session(uint8_t dev_id, + ut_params->sess = rte_cryptodev_sym_session_create( + ts_params->session_mpool); + +- status = rte_cryptodev_sym_session_init(dev_id, ut_params->sess, +- &ut_params->auth_xform, +- ts_params->session_priv_mpool); ++ if (cipher_op == RTE_CRYPTO_CIPHER_OP_DECRYPT) { ++ ut_params->auth_xform.next = NULL; ++ ut_params->cipher_xform.next = &ut_params->auth_xform; ++ status = rte_cryptodev_sym_session_init(dev_id, ut_params->sess, ++ &ut_params->cipher_xform, ++ ts_params->session_priv_mpool); + - dev->inflight_info->addr = addr; - dev->inflight_info->size = msg->payload.inflight.mmap_size = mmap_size; - dev->inflight_info->fd = msg->fds[0] = fd; -@@ -1515,10 +1526,13 @@ vhost_user_set_inflight_fd(struct virtio_net **pdev, VhostUserMsg *msg, - "failed to alloc dev inflight area\n"); - return RTE_VHOST_MSG_RESULT_ERR; ++ } else ++ status = rte_cryptodev_sym_session_init(dev_id, ut_params->sess, ++ &ut_params->auth_xform, ++ ts_params->session_priv_mpool); ++ + TEST_ASSERT_EQUAL(status, 0, "session init failed"); + TEST_ASSERT_NOT_NULL(ut_params->sess, "Session creation failed"); + +@@ -3018,13 +3032,14 @@ create_wireless_algo_cipher_hash_operation(const uint8_t *auth_tag, + } + + static int +-create_wireless_algo_auth_cipher_operation(unsigned int auth_tag_len, ++create_wireless_algo_auth_cipher_operation( ++ const uint8_t *auth_tag, unsigned int auth_tag_len, + const uint8_t *cipher_iv, uint8_t cipher_iv_len, + const uint8_t *auth_iv, uint8_t auth_iv_len, + unsigned int data_pad_len, + unsigned int cipher_len, unsigned int cipher_offset, + unsigned int auth_len, unsigned int auth_offset, 
+- uint8_t op_mode, uint8_t do_sgl) ++ uint8_t op_mode, uint8_t do_sgl, uint8_t verify) + { + struct crypto_testsuite_params *ts_params = &testsuite_params; + struct crypto_unittest_params *ut_params = &unittest_params; +@@ -3081,6 +3096,10 @@ create_wireless_algo_auth_cipher_operation(unsigned int auth_tag_len, } -+ dev->inflight_info->fd = -1; } -- if (dev->inflight_info->addr) -+ if (dev->inflight_info->addr) { - munmap(dev->inflight_info->addr, dev->inflight_info->size); -+ dev->inflight_info->addr = NULL; -+ } ++ /* Copy digest for the verification */ ++ if (verify) ++ memcpy(sym_op->auth.digest.data, auth_tag, auth_tag_len); ++ + /* Copy cipher and auth IVs at the end of the crypto operation */ + uint8_t *iv_ptr = rte_crypto_op_ctod_offset( + ut_params->op, uint8_t *, IV_OFFSET); +@@ -4643,7 +4662,7 @@ test_snow3g_auth_cipher(const struct snow3g_test_data *tdata, - addr = mmap(0, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, - fd, mmap_offset); -@@ -1527,8 +1541,10 @@ vhost_user_set_inflight_fd(struct virtio_net **pdev, VhostUserMsg *msg, - return RTE_VHOST_MSG_RESULT_ERR; + /* Create SNOW 3G operation */ + retval = create_wireless_algo_auth_cipher_operation( +- tdata->digest.len, ++ tdata->digest.data, tdata->digest.len, + tdata->cipher_iv.data, tdata->cipher_iv.len, + tdata->auth_iv.data, tdata->auth_iv.len, + (tdata->digest.offset_bytes == 0 ? +@@ -4653,7 +4672,7 @@ test_snow3g_auth_cipher(const struct snow3g_test_data *tdata, + tdata->cipher.offset_bits, + tdata->validAuthLenInBits.len, + tdata->auth.offset_bits, +- op_mode, 0); ++ op_mode, 0, verify); + + if (retval < 0) + return retval; +@@ -4819,7 +4838,7 @@ test_snow3g_auth_cipher_sgl(const struct snow3g_test_data *tdata, + + /* Create SNOW 3G operation */ + retval = create_wireless_algo_auth_cipher_operation( +- tdata->digest.len, ++ tdata->digest.data, tdata->digest.len, + tdata->cipher_iv.data, tdata->cipher_iv.len, + tdata->auth_iv.data, tdata->auth_iv.len, + (tdata->digest.offset_bytes == 0 ? +@@ -4829,7 +4848,7 @@ test_snow3g_auth_cipher_sgl(const struct snow3g_test_data *tdata, + tdata->cipher.offset_bits, + tdata->validAuthLenInBits.len, + tdata->auth.offset_bits, +- op_mode, 1); ++ op_mode, 1, verify); + + if (retval < 0) + return retval; +@@ -4988,7 +5007,7 @@ test_kasumi_auth_cipher(const struct kasumi_test_data *tdata, + + /* Create KASUMI operation */ + retval = create_wireless_algo_auth_cipher_operation( +- tdata->digest.len, ++ tdata->digest.data, tdata->digest.len, + tdata->cipher_iv.data, tdata->cipher_iv.len, + NULL, 0, + (tdata->digest.offset_bytes == 0 ? +@@ -4998,7 +5017,7 @@ test_kasumi_auth_cipher(const struct kasumi_test_data *tdata, + tdata->validCipherOffsetInBits.len, + tdata->validAuthLenInBits.len, + 0, +- op_mode, 0); ++ op_mode, 0, verify); + + if (retval < 0) + return retval; +@@ -5165,7 +5184,7 @@ test_kasumi_auth_cipher_sgl(const struct kasumi_test_data *tdata, + + /* Create KASUMI operation */ + retval = create_wireless_algo_auth_cipher_operation( +- tdata->digest.len, ++ tdata->digest.data, tdata->digest.len, + tdata->cipher_iv.data, tdata->cipher_iv.len, + NULL, 0, + (tdata->digest.offset_bytes == 0 ? 
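
The call-site changes above all follow one pattern: the expected digest is now passed down so that, in verify mode, it can be staged in the op's digest buffer before enqueue, while generate mode leaves the buffer for the PMD to fill. A minimal standalone sketch of that staging step, assuming an already-built symmetric op; "stage_digest_for_verify", "expected_tag", and "tag_len" are illustrative names, not identifiers from the patch:

    #include <stdint.h>
    #include <string.h>
    #include <rte_crypto.h>

    /* Sketch only: for a decrypt-and-verify op, copy the expected tag
     * into the digest buffer so the PMD can compare against it; for a
     * generate op, do nothing and let the PMD write the tag. */
    static void
    stage_digest_for_verify(struct rte_crypto_op *op,
                            const uint8_t *expected_tag, uint16_t tag_len,
                            int verify)
    {
            struct rte_crypto_sym_op *sym_op = op->sym;

            if (verify)
                    memcpy(sym_op->auth.digest.data, expected_tag, tag_len);
    }
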
+@@ -5175,7 +5194,7 @@ test_kasumi_auth_cipher_sgl(const struct kasumi_test_data *tdata, + tdata->validCipherOffsetInBits.len, + tdata->validAuthLenInBits.len, + 0, +- op_mode, 1); ++ op_mode, 1, verify); + + if (retval < 0) + return retval; +@@ -5666,7 +5685,7 @@ test_zuc_auth_cipher(const struct wireless_test_data *tdata, + + /* Create ZUC operation */ + retval = create_wireless_algo_auth_cipher_operation( +- tdata->digest.len, ++ tdata->digest.data, tdata->digest.len, + tdata->cipher_iv.data, tdata->cipher_iv.len, + tdata->auth_iv.data, tdata->auth_iv.len, + (tdata->digest.offset_bytes == 0 ? +@@ -5676,7 +5695,7 @@ test_zuc_auth_cipher(const struct wireless_test_data *tdata, + tdata->validCipherOffsetInBits.len, + tdata->validAuthLenInBits.len, + 0, +- op_mode, 0); ++ op_mode, 0, verify); + + if (retval < 0) + return retval; +@@ -5852,7 +5871,7 @@ test_zuc_auth_cipher_sgl(const struct wireless_test_data *tdata, + + /* Create ZUC operation */ + retval = create_wireless_algo_auth_cipher_operation( +- tdata->digest.len, ++ tdata->digest.data, tdata->digest.len, + tdata->cipher_iv.data, tdata->cipher_iv.len, + NULL, 0, + (tdata->digest.offset_bytes == 0 ? +@@ -5862,7 +5881,7 @@ test_zuc_auth_cipher_sgl(const struct wireless_test_data *tdata, + tdata->validCipherOffsetInBits.len, + tdata->validAuthLenInBits.len, + 0, +- op_mode, 1); ++ op_mode, 1, verify); + + if (retval < 0) + return retval; +@@ -6643,7 +6662,7 @@ test_mixed_auth_cipher(const struct mixed_cipher_auth_test_data *tdata, + + /* Create the operation */ + retval = create_wireless_algo_auth_cipher_operation( +- tdata->digest_enc.len, ++ tdata->digest_enc.data, tdata->digest_enc.len, + tdata->cipher_iv.data, tdata->cipher_iv.len, + tdata->auth_iv.data, tdata->auth_iv.len, + (tdata->digest_enc.offset == 0 ? +@@ -6653,7 +6672,7 @@ test_mixed_auth_cipher(const struct mixed_cipher_auth_test_data *tdata, + tdata->cipher.offset_bits, + tdata->validAuthLen.len_bits, + tdata->auth.offset_bits, +- op_mode, 0); ++ op_mode, 0, verify); + + if (retval < 0) + return retval; +@@ -6827,7 +6846,7 @@ test_mixed_auth_cipher_sgl(const struct mixed_cipher_auth_test_data *tdata, + + /* Create the operation */ + retval = create_wireless_algo_auth_cipher_operation( +- tdata->digest_enc.len, ++ tdata->digest_enc.data, tdata->digest_enc.len, + tdata->cipher_iv.data, tdata->cipher_iv.len, + tdata->auth_iv.data, tdata->auth_iv.len, + (tdata->digest_enc.offset == 0 ? 
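
These hunks pair with the process_crypto_request() change earlier in this file: once the helper returns NULL for any op whose status is not RTE_CRYPTO_OP_STATUS_SUCCESS, the data-corruption tests further down can assert on a NULL return instead of inspecting op->status themselves. The helper's control flow, condensed into a sketch (the function name here is illustrative):

    #include <stddef.h>
    #include <stdint.h>
    #include <rte_cryptodev.h>
    #include <rte_pause.h>

    /* Condensed from the patched helper: submit one op on queue pair 0,
     * spin until the PMD hands it back, and fold any failure status
     * (e.g. an authentication mismatch) into a NULL result. */
    static struct rte_crypto_op *
    process_one_op(uint8_t dev_id, struct rte_crypto_op *op)
    {
            if (rte_cryptodev_enqueue_burst(dev_id, 0, &op, 1) != 1)
                    return NULL;

            while (rte_cryptodev_dequeue_burst(dev_id, 0, &op, 1) == 0)
                    rte_pause();

            if (op->status != RTE_CRYPTO_OP_STATUS_SUCCESS)
                    return NULL;

            return op;
    }
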
+@@ -6837,7 +6856,7 @@ test_mixed_auth_cipher_sgl(const struct mixed_cipher_auth_test_data *tdata, + tdata->cipher.offset_bits, + tdata->validAuthLen.len_bits, + tdata->auth.offset_bits, +- op_mode, 1); ++ op_mode, 1, verify); + + if (retval < 0) + return retval; +@@ -9139,8 +9158,10 @@ test_stats(void) + { + struct crypto_testsuite_params *ts_params = &testsuite_params; + struct rte_cryptodev_stats stats; +- struct rte_cryptodev *dev; +- cryptodev_stats_get_t temp_pfn; ++ ++ if (rte_cryptodev_stats_get(ts_params->valid_devs[0], &stats) ++ == -ENOTSUP) ++ return -ENOTSUP; + + rte_cryptodev_stats_reset(ts_params->valid_devs[0]); + TEST_ASSERT((rte_cryptodev_stats_get(ts_params->valid_devs[0] + 600, +@@ -9148,13 +9169,6 @@ test_stats(void) + "rte_cryptodev_stats_get invalid dev failed"); + TEST_ASSERT((rte_cryptodev_stats_get(ts_params->valid_devs[0], 0) != 0), + "rte_cryptodev_stats_get invalid Param failed"); +- dev = &rte_cryptodevs[ts_params->valid_devs[0]]; +- temp_pfn = dev->dev_ops->stats_get; +- dev->dev_ops->stats_get = (cryptodev_stats_get_t)0; +- TEST_ASSERT((rte_cryptodev_stats_get(ts_params->valid_devs[0], &stats) +- == -ENOTSUP), +- "rte_cryptodev_stats_get invalid Param failed"); +- dev->dev_ops->stats_get = temp_pfn; + + /* Test expected values */ + ut_setup(); +@@ -10818,13 +10832,8 @@ test_authentication_verify_fail_when_data_corruption( + + ut_params->op = process_crypto_request(ts_params->valid_devs[0], + ut_params->op); +- TEST_ASSERT_NOT_NULL(ut_params->op, "failed crypto process"); +- TEST_ASSERT_NOT_EQUAL(ut_params->op->status, +- RTE_CRYPTO_OP_STATUS_SUCCESS, +- "authentication not failed"); + +- ut_params->obuf = ut_params->op->sym->m_src; +- TEST_ASSERT_NOT_NULL(ut_params->obuf, "failed to retrieve obuf"); ++ TEST_ASSERT_NULL(ut_params->op, "authentication not failed"); + + return 0; + } +@@ -10879,13 +10888,8 @@ test_authentication_verify_GMAC_fail_when_corruption( + + ut_params->op = process_crypto_request(ts_params->valid_devs[0], + ut_params->op); +- TEST_ASSERT_NOT_NULL(ut_params->op, "failed crypto process"); +- TEST_ASSERT_NOT_EQUAL(ut_params->op->status, +- RTE_CRYPTO_OP_STATUS_SUCCESS, +- "authentication not failed"); + +- ut_params->obuf = ut_params->op->sym->m_src; +- TEST_ASSERT_NOT_NULL(ut_params->obuf, "failed to retrieve obuf"); ++ TEST_ASSERT_NULL(ut_params->op, "authentication not failed"); + + return 0; + } +@@ -10940,13 +10944,7 @@ test_authenticated_decryption_fail_when_corruption( + ut_params->op = process_crypto_request(ts_params->valid_devs[0], + ut_params->op); + +- TEST_ASSERT_NOT_NULL(ut_params->op, "failed crypto process"); +- TEST_ASSERT_NOT_EQUAL(ut_params->op->status, +- RTE_CRYPTO_OP_STATUS_SUCCESS, +- "authentication not failed"); +- +- ut_params->obuf = ut_params->op->sym->m_src; +- TEST_ASSERT_NOT_NULL(ut_params->obuf, "failed to retrieve obuf"); ++ TEST_ASSERT_NULL(ut_params->op, "authentication not failed"); + + return 0; + } +@@ -11149,6 +11147,7 @@ create_aead_operation_SGL(enum rte_crypto_aead_operation op, + const unsigned int auth_tag_len = tdata->auth_tag.len; + const unsigned int iv_len = tdata->iv.len; + unsigned int aad_len = tdata->aad.len; ++ unsigned int aad_len_pad = 0; + + /* Generate Crypto op data structure */ + ut_params->op = rte_crypto_op_alloc(ts_params->op_mpool, +@@ -11203,8 +11202,10 @@ create_aead_operation_SGL(enum rte_crypto_aead_operation op, + + rte_memcpy(iv_ptr, tdata->iv.data, iv_len); + ++ aad_len_pad = RTE_ALIGN_CEIL(aad_len, 16); ++ + sym_op->aead.aad.data = (uint8_t *)rte_pktmbuf_prepend( +- 
ut_params->ibuf, aad_len); ++ ut_params->ibuf, aad_len_pad); + TEST_ASSERT_NOT_NULL(sym_op->aead.aad.data, + "no room to prepend aad"); + sym_op->aead.aad.phys_addr = rte_pktmbuf_iova( +@@ -11219,7 +11220,7 @@ create_aead_operation_SGL(enum rte_crypto_aead_operation op, } -- if (dev->inflight_info->fd) -+ if (dev->inflight_info->fd >= 0) { - close(dev->inflight_info->fd); -+ dev->inflight_info->fd = -1; + sym_op->aead.data.length = tdata->plaintext.len; +- sym_op->aead.data.offset = aad_len; ++ sym_op->aead.data.offset = aad_len_pad; + + return 0; + } +@@ -11252,7 +11253,7 @@ test_authenticated_encryption_SGL(const struct aead_test_data *tdata, + int ecx = 0; + void *digest_mem = NULL; + +- uint32_t prepend_len = tdata->aad.len; ++ uint32_t prepend_len = RTE_ALIGN_CEIL(tdata->aad.len, 16); + + if (tdata->plaintext.len % fragsz != 0) { + if (tdata->plaintext.len / fragsz + 1 > SGL_MAX_NO) +@@ -11915,6 +11916,8 @@ static struct unit_test_suite cryptodev_qat_testsuite = { + test_AES_GCM_auth_encrypt_SGL_out_of_place_400B_400B), + TEST_CASE_ST(ut_setup, ut_teardown, + test_AES_GCM_auth_encrypt_SGL_out_of_place_1500B_2000B), ++ TEST_CASE_ST(ut_setup, ut_teardown, ++ test_AES_GCM_auth_encrypt_SGL_out_of_place_400B_1seg), + TEST_CASE_ST(ut_setup, ut_teardown, + test_AES_GCM_authenticated_encryption_test_case_1), + TEST_CASE_ST(ut_setup, ut_teardown, +diff --git a/dpdk/app/test/test_cryptodev_blockcipher.c b/dpdk/app/test/test_cryptodev_blockcipher.c +index 5bfe2d009f..2f91d000a2 100644 +--- a/dpdk/app/test/test_cryptodev_blockcipher.c ++++ b/dpdk/app/test/test_cryptodev_blockcipher.c +@@ -93,7 +93,7 @@ test_blockcipher_one_case(const struct blockcipher_test_case *t, + uint64_t feat_flags = dev_info.feature_flags; + uint64_t oop_flag = RTE_CRYPTODEV_FF_OOP_SGL_IN_LB_OUT; + +- if (t->feature_mask && BLOCKCIPHER_TEST_FEATURE_OOP) { ++ if (t->feature_mask & BLOCKCIPHER_TEST_FEATURE_OOP) { + if (!(feat_flags & oop_flag)) { + printf("Device doesn't support out-of-place " + "scatter-gather in input mbuf. 
" +diff --git a/dpdk/app/test/test_cryptodev_hash_test_vectors.h b/dpdk/app/test/test_cryptodev_hash_test_vectors.h +index cff2831185..394bb6b60b 100644 +--- a/dpdk/app/test/test_cryptodev_hash_test_vectors.h ++++ b/dpdk/app/test/test_cryptodev_hash_test_vectors.h +@@ -460,6 +460,7 @@ static const struct blockcipher_test_case hash_test_cases[] = { + .test_data = &sha1_test_vector, + .op_mask = BLOCKCIPHER_TEST_OP_AUTH_GEN, + .pmd_mask = BLOCKCIPHER_TEST_TARGET_PMD_OPENSSL | ++ BLOCKCIPHER_TEST_TARGET_PMD_QAT | + BLOCKCIPHER_TEST_TARGET_PMD_CCP | + BLOCKCIPHER_TEST_TARGET_PMD_MVSAM | + #if IMB_VERSION_NUM >= IMB_VERSION(0, 52, 0) +@@ -473,6 +474,7 @@ static const struct blockcipher_test_case hash_test_cases[] = { + .test_data = &sha1_test_vector, + .op_mask = BLOCKCIPHER_TEST_OP_AUTH_VERIFY, + .pmd_mask = BLOCKCIPHER_TEST_TARGET_PMD_OPENSSL | ++ BLOCKCIPHER_TEST_TARGET_PMD_QAT | + BLOCKCIPHER_TEST_TARGET_PMD_CCP | + BLOCKCIPHER_TEST_TARGET_PMD_MVSAM | + #if IMB_VERSION_NUM >= IMB_VERSION(0, 52, 0) +@@ -540,6 +542,7 @@ static const struct blockcipher_test_case hash_test_cases[] = { + .test_data = &sha224_test_vector, + .op_mask = BLOCKCIPHER_TEST_OP_AUTH_GEN, + .pmd_mask = BLOCKCIPHER_TEST_TARGET_PMD_OPENSSL | ++ BLOCKCIPHER_TEST_TARGET_PMD_QAT | + BLOCKCIPHER_TEST_TARGET_PMD_CCP | + BLOCKCIPHER_TEST_TARGET_PMD_MVSAM | + #if IMB_VERSION_NUM >= IMB_VERSION(0, 52, 0) +@@ -553,6 +556,7 @@ static const struct blockcipher_test_case hash_test_cases[] = { + .test_data = &sha224_test_vector, + .op_mask = BLOCKCIPHER_TEST_OP_AUTH_VERIFY, + .pmd_mask = BLOCKCIPHER_TEST_TARGET_PMD_OPENSSL | ++ BLOCKCIPHER_TEST_TARGET_PMD_QAT | + BLOCKCIPHER_TEST_TARGET_PMD_CCP | + BLOCKCIPHER_TEST_TARGET_PMD_MVSAM | + #if IMB_VERSION_NUM >= IMB_VERSION(0, 52, 0) +@@ -596,6 +600,7 @@ static const struct blockcipher_test_case hash_test_cases[] = { + .test_data = &sha256_test_vector, + .op_mask = BLOCKCIPHER_TEST_OP_AUTH_GEN, + .pmd_mask = BLOCKCIPHER_TEST_TARGET_PMD_OPENSSL | ++ BLOCKCIPHER_TEST_TARGET_PMD_QAT | + BLOCKCIPHER_TEST_TARGET_PMD_CCP | + BLOCKCIPHER_TEST_TARGET_PMD_MVSAM | + #if IMB_VERSION_NUM >= IMB_VERSION(0, 52, 0) +@@ -609,6 +614,7 @@ static const struct blockcipher_test_case hash_test_cases[] = { + .test_data = &sha256_test_vector, + .op_mask = BLOCKCIPHER_TEST_OP_AUTH_VERIFY, + .pmd_mask = BLOCKCIPHER_TEST_TARGET_PMD_OPENSSL | ++ BLOCKCIPHER_TEST_TARGET_PMD_QAT | + BLOCKCIPHER_TEST_TARGET_PMD_CCP | + BLOCKCIPHER_TEST_TARGET_PMD_MVSAM | + #if IMB_VERSION_NUM >= IMB_VERSION(0, 52, 0) +@@ -654,6 +660,7 @@ static const struct blockcipher_test_case hash_test_cases[] = { + .test_data = &sha384_test_vector, + .op_mask = BLOCKCIPHER_TEST_OP_AUTH_GEN, + .pmd_mask = BLOCKCIPHER_TEST_TARGET_PMD_OPENSSL | ++ BLOCKCIPHER_TEST_TARGET_PMD_QAT | + BLOCKCIPHER_TEST_TARGET_PMD_CCP | + BLOCKCIPHER_TEST_TARGET_PMD_MVSAM | + #if IMB_VERSION_NUM >= IMB_VERSION(0, 52, 0) +@@ -667,6 +674,7 @@ static const struct blockcipher_test_case hash_test_cases[] = { + .test_data = &sha384_test_vector, + .op_mask = BLOCKCIPHER_TEST_OP_AUTH_VERIFY, + .pmd_mask = BLOCKCIPHER_TEST_TARGET_PMD_OPENSSL | ++ BLOCKCIPHER_TEST_TARGET_PMD_QAT | + BLOCKCIPHER_TEST_TARGET_PMD_CCP | + BLOCKCIPHER_TEST_TARGET_PMD_MVSAM | + #if IMB_VERSION_NUM >= IMB_VERSION(0, 52, 0) +@@ -712,6 +720,7 @@ static const struct blockcipher_test_case hash_test_cases[] = { + .test_data = &sha512_test_vector, + .op_mask = BLOCKCIPHER_TEST_OP_AUTH_GEN, + .pmd_mask = BLOCKCIPHER_TEST_TARGET_PMD_OPENSSL | ++ BLOCKCIPHER_TEST_TARGET_PMD_QAT | + BLOCKCIPHER_TEST_TARGET_PMD_CCP | 
+ BLOCKCIPHER_TEST_TARGET_PMD_MVSAM | + #if IMB_VERSION_NUM >= IMB_VERSION(0, 52, 0) +@@ -724,6 +733,7 @@ static const struct blockcipher_test_case hash_test_cases[] = { + .test_data = &sha512_test_vector, + .op_mask = BLOCKCIPHER_TEST_OP_AUTH_VERIFY, + .pmd_mask = BLOCKCIPHER_TEST_TARGET_PMD_OPENSSL | ++ BLOCKCIPHER_TEST_TARGET_PMD_QAT | + BLOCKCIPHER_TEST_TARGET_PMD_CCP | + BLOCKCIPHER_TEST_TARGET_PMD_MVSAM | + #if IMB_VERSION_NUM >= IMB_VERSION(0, 52, 0) +diff --git a/dpdk/app/test/test_eventdev.c b/dpdk/app/test/test_eventdev.c +index 427dbbf77f..43ccb1ce97 100644 +--- a/dpdk/app/test/test_eventdev.c ++++ b/dpdk/app/test/test_eventdev.c +@@ -996,9 +996,13 @@ test_eventdev_common(void) + static int + test_eventdev_selftest_impl(const char *pmd, const char *opts) + { +- rte_vdev_init(pmd, opts); ++ int ret = 0; ++ + if (rte_event_dev_get_dev_id(pmd) == -ENODEV) ++ ret = rte_vdev_init(pmd, opts); ++ if (ret) + return TEST_SKIPPED; ++ + return rte_event_dev_selftest(rte_event_dev_get_dev_id(pmd)); + } + +@@ -1017,7 +1021,7 @@ test_eventdev_selftest_octeontx(void) + static int + test_eventdev_selftest_octeontx2(void) + { +- return test_eventdev_selftest_impl("otx2_eventdev", ""); ++ return test_eventdev_selftest_impl("event_octeontx2", ""); + } + + static int +diff --git a/dpdk/app/test/test_fib_perf.c b/dpdk/app/test/test_fib_perf.c +index 573087c3c0..dd2e54db8b 100644 +--- a/dpdk/app/test/test_fib_perf.c ++++ b/dpdk/app/test/test_fib_perf.c +@@ -35,7 +35,7 @@ struct route_rule { + uint8_t depth; + }; + +-struct route_rule large_route_table[MAX_RULE_NUM]; ++static struct route_rule large_route_table[MAX_RULE_NUM]; + + static uint32_t num_route_entries; + #define NUM_ROUTE_ENTRIES num_route_entries +diff --git a/dpdk/app/test/test_flow_classify.c b/dpdk/app/test/test_flow_classify.c +index ff5265c6af..ef0b6fdd5c 100644 +--- a/dpdk/app/test/test_flow_classify.c ++++ b/dpdk/app/test/test_flow_classify.c +@@ -23,7 +23,7 @@ + + #define FLOW_CLASSIFY_MAX_RULE_NUM 100 + #define MAX_PKT_BURST 32 +-#define NB_SOCKETS 1 ++#define NB_SOCKETS 4 + #define MEMPOOL_CACHE_SIZE 256 + #define MBUF_SIZE 512 + #define NB_MBUF 512 +diff --git a/dpdk/app/test/test_hash.c b/dpdk/app/test/test_hash.c +index 0052dce2de..2ac298e21e 100644 +--- a/dpdk/app/test/test_hash.c ++++ b/dpdk/app/test/test_hash.c +@@ -1142,8 +1142,11 @@ fbk_hash_unit_test(void) + handle = rte_fbk_hash_create(&invalid_params_7); + RETURN_IF_ERROR_FBK(handle != NULL, "fbk hash creation should have failed"); + +- handle = rte_fbk_hash_create(&invalid_params_8); +- RETURN_IF_ERROR_FBK(handle != NULL, "fbk hash creation should have failed"); ++ if (rte_eal_has_hugepages()) { ++ handle = rte_fbk_hash_create(&invalid_params_8); ++ RETURN_IF_ERROR_FBK(handle != NULL, ++ "fbk hash creation should have failed"); + } - dev->inflight_info->fd = fd; - dev->inflight_info->addr = addr; -@@ -2059,10 +2075,10 @@ vhost_user_set_log_base(struct virtio_net **pdev, struct VhostUserMsg *msg, - size = msg->payload.log.mmap_size; - off = msg->payload.log.mmap_offset; + handle = rte_fbk_hash_create(&invalid_params_same_name_1); + RETURN_IF_ERROR_FBK(handle == NULL, "fbk hash creation should have succeeded"); +diff --git a/dpdk/app/test/test_ipsec.c b/dpdk/app/test/test_ipsec.c +index 7dc83fee7e..79d00d7e02 100644 +--- a/dpdk/app/test/test_ipsec.c ++++ b/dpdk/app/test/test_ipsec.c +@@ -237,7 +237,7 @@ fill_crypto_xform(struct ipsec_unitest_params *ut_params, + } -- /* Don't allow mmap_offset to point outside the mmap region */ -- if (off > size) { -+ /* Check for 
mmap size and offset overflow. */ -+ if (off >= -size) { - RTE_LOG(ERR, VHOST_CONFIG, -- "log offset %#"PRIx64" exceeds log size %#"PRIx64"\n", -+ "log offset %#"PRIx64" and log size %#"PRIx64" overflow\n", - off, size); - return RTE_VHOST_MSG_RESULT_ERR; + static int +-check_cryptodev_capablity(const struct ipsec_unitest_params *ut, ++check_cryptodev_capability(const struct ipsec_unitest_params *ut, + uint8_t dev_id) + { + struct rte_cryptodev_sym_capability_idx cap_idx; +@@ -302,7 +302,7 @@ testsuite_setup(void) + + /* Find first valid crypto device */ + for (i = 0; i < nb_devs; i++) { +- rc = check_cryptodev_capablity(ut_params, i); ++ rc = check_cryptodev_capability(ut_params, i); + if (rc == 0) { + ts_params->valid_dev = i; + ts_params->valid_dev_found = 1; +@@ -1167,6 +1167,34 @@ test_ipsec_dump_buffers(struct ipsec_unitest_params *ut_params, int i) } -@@ -2526,7 +2542,7 @@ static int - vhost_user_check_and_alloc_queue_pair(struct virtio_net *dev, - struct VhostUserMsg *msg) + } + ++static void ++destroy_dummy_sec_session(struct ipsec_unitest_params *ut, ++ uint32_t j) ++{ ++ rte_security_session_destroy(&dummy_sec_ctx, ++ ut->ss[j].security.ses); ++ ut->ss[j].security.ctx = NULL; ++} ++ ++static void ++destroy_crypto_session(struct ipsec_unitest_params *ut, ++ uint8_t crypto_dev, uint32_t j) ++{ ++ rte_cryptodev_sym_session_clear(crypto_dev, ut->ss[j].crypto.ses); ++ rte_cryptodev_sym_session_free(ut->ss[j].crypto.ses); ++ memset(&ut->ss[j], 0, sizeof(ut->ss[j])); ++} ++ ++static void ++destroy_session(struct ipsec_unitest_params *ut, ++ uint8_t crypto_dev, uint32_t j) ++{ ++ if (ut->ss[j].type == RTE_SECURITY_ACTION_TYPE_NONE) ++ return destroy_crypto_session(ut, crypto_dev, j); ++ else ++ return destroy_dummy_sec_session(ut, j); ++} ++ + static void + destroy_sa(uint32_t j) { -- uint16_t vring_idx; -+ uint32_t vring_idx; +@@ -1175,9 +1203,8 @@ destroy_sa(uint32_t j) - switch (msg->request.master) { - case VHOST_USER_SET_VRING_KICK: -diff --git a/dpdk/lib/librte_vhost/virtio_net.c b/dpdk/lib/librte_vhost/virtio_net.c -index ac2842b2d2..33f10258cf 100644 ---- a/dpdk/lib/librte_vhost/virtio_net.c -+++ b/dpdk/lib/librte_vhost/virtio_net.c -@@ -1086,6 +1086,8 @@ virtio_dev_rx_batch_packed(struct virtio_net *dev, - VHOST_ACCESS_RW); + rte_ipsec_sa_fini(ut->ss[j].sa); + rte_free(ut->ss[j].sa); +- rte_cryptodev_sym_session_clear(ts->valid_dev, ut->ss[j].crypto.ses); +- rte_cryptodev_sym_session_free(ut->ss[j].crypto.ses); +- memset(&ut->ss[j], 0, sizeof(ut->ss[j])); ++ ++ destroy_session(ut, ts->valid_dev, j); + } - vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { -+ if (unlikely(!desc_addrs[i])) -+ return -1; - if (unlikely(lens[i] != descs[avail_idx + i].len)) - return -1; + static int +diff --git a/dpdk/app/test/test_kvargs.c b/dpdk/app/test/test_kvargs.c +index a42056f361..2a2dae43a0 100644 +--- a/dpdk/app/test/test_kvargs.c ++++ b/dpdk/app/test/test_kvargs.c +@@ -142,7 +142,7 @@ static int test_valid_kvargs(void) + valid_keys = valid_keys_list; + kvlist = rte_kvargs_parse(args, valid_keys); + if (kvlist == NULL) { +- printf("rte_kvargs_parse() error"); ++ printf("rte_kvargs_parse() error\n"); + goto fail; } -@@ -1841,6 +1843,8 @@ vhost_reserve_avail_batch_packed(struct virtio_net *dev, + if (strcmp(kvlist->pairs[0].value, "[0,1]") != 0) { +@@ -157,6 +157,40 @@ static int test_valid_kvargs(void) } + rte_kvargs_free(kvlist); - vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { -+ if (unlikely(!desc_addrs[i])) -+ return -1; - if (unlikely((lens[i] != descs[avail_idx + i].len))) - 
return -1; ++ /* test using empty string (it is valid) */ ++ args = ""; ++ kvlist = rte_kvargs_parse(args, NULL); ++ if (kvlist == NULL) { ++ printf("rte_kvargs_parse() error\n"); ++ goto fail; ++ } ++ if (rte_kvargs_count(kvlist, NULL) != 0) { ++ printf("invalid count value\n"); ++ goto fail; ++ } ++ rte_kvargs_free(kvlist); ++ ++ /* test using empty elements (it is valid) */ ++ args = "foo=1,,check=value2,,"; ++ kvlist = rte_kvargs_parse(args, NULL); ++ if (kvlist == NULL) { ++ printf("rte_kvargs_parse() error\n"); ++ goto fail; ++ } ++ if (rte_kvargs_count(kvlist, NULL) != 2) { ++ printf("invalid count value\n"); ++ goto fail; ++ } ++ if (rte_kvargs_count(kvlist, "foo") != 1) { ++ printf("invalid count value for 'foo'\n"); ++ goto fail; ++ } ++ if (rte_kvargs_count(kvlist, "check") != 1) { ++ printf("invalid count value for 'check'\n"); ++ goto fail; ++ } ++ rte_kvargs_free(kvlist); ++ + return 0; + + fail: +@@ -179,11 +213,11 @@ static int test_invalid_kvargs(void) + const char *args_list[] = { + "wrong-key=x", /* key not in valid_keys_list */ + "foo=1,foo=", /* empty value */ +- "foo=1,,foo=2", /* empty key/value */ + "foo=1,foo", /* no value */ + "foo=1,=2", /* no key */ + "foo=[1,2", /* no closing bracket in value */ + ",=", /* also test with a smiley */ ++ "foo=[", /* no value in list and no closing bracket */ + NULL }; + const char **args; + const char *valid_keys_list[] = { "foo", "check", NULL }; +@@ -197,8 +231,8 @@ static int test_invalid_kvargs(void) + rte_kvargs_free(kvlist); + goto fail; + } +- return 0; + } ++ return 0; + + fail: + printf("while processing <%s>", *args); +diff --git a/dpdk/app/test/test_lpm_perf.c b/dpdk/app/test/test_lpm_perf.c +index a2578fe90e..489719c40b 100644 +--- a/dpdk/app/test/test_lpm_perf.c ++++ b/dpdk/app/test/test_lpm_perf.c +@@ -34,7 +34,7 @@ struct route_rule { + uint8_t depth; + }; + +-struct route_rule large_route_table[MAX_RULE_NUM]; ++static struct route_rule large_route_table[MAX_RULE_NUM]; + + static uint32_t num_route_entries; + #define NUM_ROUTE_ENTRIES num_route_entries +diff --git a/dpdk/app/test/test_malloc.c b/dpdk/app/test/test_malloc.c +index a16e28cc32..57f796f9e5 100644 +--- a/dpdk/app/test/test_malloc.c ++++ b/dpdk/app/test/test_malloc.c +@@ -746,6 +746,18 @@ test_malloc_bad_params(void) + if (bad_ptr != NULL) + goto err_return; + ++ /* rte_malloc expected to return null with size will cause overflow */ ++ align = RTE_CACHE_LINE_SIZE; ++ size = (size_t)-8; ++ ++ bad_ptr = rte_malloc(type, size, align); ++ if (bad_ptr != NULL) ++ goto err_return; ++ ++ bad_ptr = rte_realloc(NULL, size, align); ++ if (bad_ptr != NULL) ++ goto err_return; ++ + return 0; + + err_return: +diff --git a/dpdk/app/test/test_mbuf.c b/dpdk/app/test/test_mbuf.c +index 61ecffc184..f2922e05e0 100644 +--- a/dpdk/app/test/test_mbuf.c ++++ b/dpdk/app/test/test_mbuf.c +@@ -1144,7 +1144,7 @@ test_refcnt_mbuf(void) + tref += refcnt_lcore[slave]; + + if (tref != refcnt_lcore[master]) +- rte_panic("refernced mbufs: %u, freed mbufs: %u\n", ++ rte_panic("referenced mbufs: %u, freed mbufs: %u\n", + tref, refcnt_lcore[master]); + + rte_mempool_dump(stdout, refcnt_pool); +diff --git a/dpdk/app/test/test_pmd_perf.c b/dpdk/app/test/test_pmd_perf.c +index d61be58bb3..352cd47156 100644 +--- a/dpdk/app/test/test_pmd_perf.c ++++ b/dpdk/app/test/test_pmd_perf.c +@@ -151,7 +151,7 @@ check_all_ports_link_status(uint16_t port_num, uint32_t port_mask) + "Port%d Link Up. Speed %u Mbps - %s\n", + portid, link.link_speed, + (link.link_duplex == ETH_LINK_FULL_DUPLEX) ? 
+- ("full-duplex") : ("half-duplex\n")); ++ ("full-duplex") : ("half-duplex")); + if (link_mbps == 0) + link_mbps = link.link_speed; + } else +diff --git a/dpdk/app/test/test_table_pipeline.c b/dpdk/app/test/test_table_pipeline.c +index 441338ac01..bc412c3081 100644 +--- a/dpdk/app/test/test_table_pipeline.c ++++ b/dpdk/app/test/test_table_pipeline.c +@@ -190,11 +190,13 @@ check_pipeline_invalid_params(void) + goto fail; + } + +- p = rte_pipeline_create(&pipeline_params_3); +- if (p != NULL) { +- RTE_LOG(INFO, PIPELINE, "%s: Configure pipeline with invalid " +- "socket\n", __func__); +- goto fail; ++ if (rte_eal_has_hugepages()) { ++ p = rte_pipeline_create(&pipeline_params_3); ++ if (p != NULL) { ++ RTE_LOG(INFO, PIPELINE, "%s: Configure pipeline with " ++ "invalid socket\n", __func__); ++ goto fail; ++ } } + + /* Check pipeline consistency */ +diff --git a/dpdk/buildtools/meson.build b/dpdk/buildtools/meson.build +index 6ef2c5721c..cd6f4c1af0 100644 +--- a/dpdk/buildtools/meson.build ++++ b/dpdk/buildtools/meson.build +@@ -3,9 +3,11 @@ + + subdir('pmdinfogen') + ++pkgconf = find_program('pkg-config', 'pkgconf', required: false) + pmdinfo = find_program('gen-pmdinfo-cfile.sh') + + check_experimental_syms = find_program('check-experimental-syms.sh') ++ldflags_ibverbs_static = find_program('options-ibverbs-static.sh') + + # set up map-to-def script using python, either built-in or external + python3 = import('python').find_installation(required: false) +diff --git a/dpdk/buildtools/options-ibverbs-static.sh b/dpdk/buildtools/options-ibverbs-static.sh +index 0f285a343b..0740a711ff 100755 +--- a/dpdk/buildtools/options-ibverbs-static.sh ++++ b/dpdk/buildtools/options-ibverbs-static.sh +@@ -9,6 +9,13 @@ + # + # PKG_CONFIG_PATH may be required to be set if libibverbs.pc is not installed. + +-pkg-config --libs-only-l --static libibverbs | ++lib='libibverbs' ++deps='pthread|nl' ++ ++pkg-config --libs --static $lib | + tr '[:space:]' '\n' | +- sed -r '/^-l(pthread|nl)/! s,(^-l)(.*),\1:lib\2.a,' ++ sed -r "/^-l($deps)/! s,(^-l)(.*),\1:lib\2.a," | # explicit .a ++ sed -n '/^-[Ll]/p' | # extra link options may break with make ++ tac | ++ awk "/^-l:$lib.a/&&c++ {next} 1" | # drop first duplicates of main lib ++ tac +diff --git a/dpdk/config/common_base b/dpdk/config/common_base +index 7dec7ed457..861f7d1a0b 100644 +--- a/dpdk/config/common_base ++++ b/dpdk/config/common_base +@@ -328,7 +328,6 @@ CONFIG_RTE_LIBRTE_ICE_PMD=y + CONFIG_RTE_LIBRTE_ICE_DEBUG_RX=n + CONFIG_RTE_LIBRTE_ICE_DEBUG_TX=n + CONFIG_RTE_LIBRTE_ICE_DEBUG_TX_FREE=n +-CONFIG_RTE_LIBRTE_ICE_RX_ALLOW_BULK_ALLOC=y + CONFIG_RTE_LIBRTE_ICE_16BYTE_RX_DESC=n + + # Compile burst-oriented IAVF PMD driver +@@ -352,7 +351,7 @@ CONFIG_RTE_LIBRTE_MLX4_DEBUG=n + + # + # Compile burst-oriented Mellanox ConnectX-4, ConnectX-5, +-# ConnectX-6 & Bluefield (MLX5) PMD ++# ConnectX-6 & BlueField (MLX5) PMD + # + CONFIG_RTE_LIBRTE_MLX5_PMD=n + CONFIG_RTE_LIBRTE_MLX5_DEBUG=n +diff --git a/dpdk/config/meson.build b/dpdk/config/meson.build +index 364a8d7394..78bfdf3094 100644 +--- a/dpdk/config/meson.build ++++ b/dpdk/config/meson.build +@@ -14,6 +14,10 @@ foreach env:supported_exec_envs + set_variable('is_' + env, exec_env == env) + endforeach + ++# MS linker requires special treatment. 
++# TODO: use cc.get_linker_id() with Meson >= 0.54 ++is_ms_linker = is_windows and (cc.get_id() == 'clang') ++ + # set the major version, which might be used by drivers and libraries + # depending on the configuration options + pver = meson.project_version().split('.') +@@ -98,14 +102,18 @@ dpdk_conf.set('RTE_TOOLCHAIN_' + toolchain.to_upper(), 1) + + dpdk_conf.set('RTE_ARCH_64', cc.sizeof('void *') == 8) + +-add_project_link_arguments('-Wl,--no-as-needed', language: 'c') ++if not is_windows ++ add_project_link_arguments('-Wl,--no-as-needed', language: 'c') ++endif + +-# use pthreads +-add_project_link_arguments('-pthread', language: 'c') +-dpdk_extra_ldflags += '-pthread' ++# use pthreads if available for the platform ++if not is_ms_linker ++ add_project_link_arguments('-pthread', language: 'c') ++ dpdk_extra_ldflags += '-pthread' ++endif + + # on some OS, maths functions are in a separate library +-if cc.find_library('libm', required : false).found() ++if cc.find_library('m', required : false).found() + # some libs depend on maths lib + add_project_link_arguments('-lm', language: 'c') + dpdk_extra_ldflags += '-lm' +@@ -183,6 +191,10 @@ warning_flags = [ + '-Wno-packed-not-aligned', + '-Wno-missing-field-initializers' + ] ++if cc.get_id() == 'gcc' and cc.version().version_compare('>=10.0') ++# FIXME: Bugzilla 396 ++ warning_flags += '-Wno-zero-length-bounds' ++endif + if not dpdk_conf.get('RTE_ARCH_64') + # for 32-bit, don't warn about casting a 32-bit pointer to 64-bit int - it's fine!! + warning_flags += '-Wno-pointer-to-int-cast' +@@ -231,6 +243,16 @@ if is_freebsd + add_project_arguments('-D__BSD_VISIBLE', language: 'c') + endif + ++if is_windows ++ # Minimum supported API is Windows 7. ++ add_project_arguments('-D_WIN32_WINNT=0x0601', language: 'c') ++ ++ # Use MinGW-w64 stdio, because DPDK assumes ANSI-compliant formatting. ++ if cc.get_id() == 'gcc' ++ add_project_arguments('-D__USE_MINGW_ANSI_STDIO', language: 'c') ++ endif ++endif ++ + if get_option('b_lto') + if cc.has_argument('-ffat-lto-objects') + add_project_arguments('-ffat-lto-objects', language: 'c') +diff --git a/dpdk/config/x86/meson.build b/dpdk/config/x86/meson.build +index 8b0fa3e6f1..adc857ba28 100644 +--- a/dpdk/config/x86/meson.build ++++ b/dpdk/config/x86/meson.build +@@ -15,11 +15,9 @@ if not is_windows + endif + + # we require SSE4.2 for DPDK +-sse_errormsg = '''SSE4.2 instruction set is required for DPDK. +-Please set the machine type to "nehalem" or "corei7" or higher value''' +- + if cc.get_define('__SSE4_2__', args: machine_args) == '' +- error(sse_errormsg) ++ message('SSE 4.2 not enabled by default, explicitly enabling') ++ machine_args += '-msse4' + endif + + base_flags = ['SSE', 'SSE2', 'SSE3','SSSE3', 'SSE4_1', 'SSE4_2'] +diff --git a/dpdk/devtools/check-symbol-change.sh b/dpdk/devtools/check-symbol-change.sh +index c5434f3bb0..ed2178e36e 100755 +--- a/dpdk/devtools/check-symbol-change.sh ++++ b/dpdk/devtools/check-symbol-change.sh +@@ -17,13 +17,11 @@ build_map_changes() + # map files are altered, and all section/symbol names + # appearing between a triggering of this rule and the + # next trigger of this rule are associated with this file +- /[-+] a\/.*\.map/ {map=$2; in_map=1} ++ /[-+] [ab]\/.*\.map/ {map=$2; in_map=1; next} + +- # Same pattern as above, only it matches on anything that +- # does not end in 'map', indicating we have left the map chunk. 
+- # When we hit this, turn off the in_map variable, which +- # supresses the subordonate rules below +- /[-+] a\/.*\.[^map]/ {in_map=0} ++ # The previous rule catches all .map files, anything else ++ # indicates we left the map chunk. ++ /[-+] [ab]\// {in_map=0} + + # Triggering this rule, which starts a line and ends it + # with a { identifies a versioned section. The section name is +diff --git a/dpdk/devtools/checkpatches.sh b/dpdk/devtools/checkpatches.sh +index b16bace927..9902e2a9bc 100755 +--- a/dpdk/devtools/checkpatches.sh ++++ b/dpdk/devtools/checkpatches.sh +@@ -70,6 +70,14 @@ check_forbidden_additions() { # + -f $(dirname $(readlink -f $0))/check-forbidden-tokens.awk \ + "$1" || res=1 + ++ # links must prefer https over http ++ awk -v FOLDERS='doc' \ ++ -v EXPRESSIONS='http://.*dpdk.org' \ ++ -v RET_ON_FAIL=1 \ ++ -v MESSAGE='Using non https link to dpdk.org' \ ++ -f $(dirname $(readlink -f $0))/check-forbidden-tokens.awk \ ++ "$1" || res=1 ++ + return $res + } + +diff --git a/dpdk/devtools/cocci.sh b/dpdk/devtools/cocci.sh +index 8b17a8ceba..ab9a6efe9a 100755 +--- a/dpdk/devtools/cocci.sh ++++ b/dpdk/devtools/cocci.sh +@@ -1,34 +1,6 @@ + #! /bin/sh +- +-# BSD LICENSE +-# +-# Copyright 2015 EZchip Semiconductor Ltd. +-# +-# Redistribution and use in source and binary forms, with or without +-# modification, are permitted provided that the following conditions +-# are met: +-# +-# * Redistributions of source code must retain the above copyright +-# notice, this list of conditions and the following disclaimer. +-# * Redistributions in binary form must reproduce the above copyright +-# notice, this list of conditions and the following disclaimer in +-# the documentation and/or other materials provided with the +-# distribution. +-# * Neither the name of EZchip Semiconductor nor the names of its +-# contributors may be used to endorse or promote products derived +-# from this software without specific prior written permission. +-# +-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +-# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +-# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +-# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +-# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +-# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +-# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +-# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +-# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +-# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++# SPDX-License-Identifier: BSD-3-Clause ++# Copyright 2015-2020 Mellanox Technologies, Ltd + + # Apply coccinelle transforms. 
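
The cocci.sh hunk above is one instance of a conversion applied throughout this patch (the ixgbe base files below get the same treatment): full BSD license boilerplate collapses to an SPDX tag plus a copyright line. For a C source file the resulting header is just:

    /* SPDX-License-Identifier: BSD-3-Clause
     * Copyright(c) 2001-2020 Intel Corporation
     */
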
+ +diff --git a/dpdk/devtools/git-log-fixes.sh b/dpdk/devtools/git-log-fixes.sh +index e37ee22600..6d468d6731 100755 +--- a/dpdk/devtools/git-log-fixes.sh ++++ b/dpdk/devtools/git-log-fixes.sh +@@ -94,11 +94,23 @@ stable_tag () # + fi + } + ++# print a marker for fixes tag presence ++fixes_tag () # ++{ ++ if git log --format='%b' -1 $1 | grep -qi '^Fixes: *' ; then ++ echo 'F' ++ else ++ echo '-' ++ fi ++} ++ + git log --oneline --reverse $range | + while read id headline ; do + origins=$(origin_filter $id) + stable=$(stable_tag $id) +- [ "$stable" = "S" ] || [ -n "$origins" ] || echo "$headline" | grep -q fix || continue ++ fixes=$(fixes_tag $id) ++ [ "$stable" = "S" ] || [ "$fixes" = "F" ] || [ -n "$origins" ] || \ ++ echo "$headline" | grep -q fix || continue + version=$(commit_version $id) + if [ -n "$origins" ] ; then + origver="$(origin_version $origins)" +@@ -108,5 +120,5 @@ while read id headline ; do + else + origver='N/A' + fi +- printf '%s %7s %s %s (%s)\n' $version $id $stable "$headline" "$origver" ++ printf '%s %7s %s %s %s (%s)\n' $version $id $stable $fixes "$headline" "$origver" + done +diff --git a/dpdk/devtools/test-build.sh b/dpdk/devtools/test-build.sh +index be565a1bea..52305fbb8c 100755 +--- a/dpdk/devtools/test-build.sh ++++ b/dpdk/devtools/test-build.sh +@@ -149,7 +149,7 @@ config () # + ! echo $3 | grep -q '+debug' || ( \ + sed -ri="" 's,(RTE_LOG_DP_LEVEL=).*,\1RTE_LOG_DEBUG,' $1/.config + sed -ri="" 's,(_DEBUG.*=)n,\1y,' $1/.config +- sed -ri="" 's,(_STAT.*=)n,\1y,' $1/.config ++ sed -ri="" 's,(_STAT)([S_].*=|=)n,\1\2y,' $1/.config + sed -ri="" 's,(TEST_PMD_RECORD_.*=)n,\1y,' $1/.config ) + + # Automatic configuration +diff --git a/dpdk/doc/api/doxy-api-index.md b/dpdk/doc/api/doxy-api-index.md +index dff496be09..d7c8bd24db 100644 +--- a/dpdk/doc/api/doxy-api-index.md ++++ b/dpdk/doc/api/doxy-api-index.md +@@ -1,4 +1,4 @@ +-API {#index} ++API + === + + IPv4 */ +diff --git a/dpdk/drivers/net/ice/ice_rxtx_vec_common.h b/dpdk/drivers/net/ice/ice_rxtx_vec_common.h +index 5e6f89642a..46e3be98a6 100644 +--- a/dpdk/drivers/net/ice/ice_rxtx_vec_common.h ++++ b/dpdk/drivers/net/ice/ice_rxtx_vec_common.h +@@ -29,6 +29,7 @@ ice_rx_reassemble_packets(struct ice_rx_queue *rxq, struct rte_mbuf **rx_bufs, + if (!split_flags[buf_idx]) { + /* it's the last packet of the set */ + start->hash = end->hash; ++ start->vlan_tci = end->vlan_tci; + start->ol_flags = end->ol_flags; + /* we need to strip crc for the whole packet */ + start->pkt_len -= rxq->crc_len; +@@ -245,6 +246,7 @@ ice_rx_vec_queue_default(struct ice_rx_queue *rxq) + DEV_TX_OFFLOAD_VLAN_INSERT | \ + DEV_TX_OFFLOAD_SCTP_CKSUM | \ + DEV_TX_OFFLOAD_UDP_CKSUM | \ ++ DEV_TX_OFFLOAD_TCP_TSO | \ + DEV_TX_OFFLOAD_TCP_CKSUM) + + static inline int +diff --git a/dpdk/drivers/net/ice/ice_switch_filter.c b/dpdk/drivers/net/ice/ice_switch_filter.c +index 4a9356b317..6c24731638 100644 +--- a/dpdk/drivers/net/ice/ice_switch_filter.c ++++ b/dpdk/drivers/net/ice/ice_switch_filter.c +@@ -871,7 +871,7 @@ ice_switch_inset_get(const struct rte_flow_item pattern[], + vlan_spec->inner_type; + list[t].m_u.vlan_hdr.type = + UINT16_MAX; +- input_set |= ICE_INSET_VLAN_OUTER; ++ input_set |= ICE_INSET_ETHERTYPE; + } + t++; + } else if (!vlan_spec && !vlan_mask) { +@@ -937,6 +937,8 @@ ice_switch_parse_action(struct ice_pf *pf, + switch (action_type) { + case RTE_FLOW_ACTION_TYPE_RSS: + act_qgrop = action->conf; ++ if (act_qgrop->queue_num <= 1) ++ goto error; + rule_info->sw_act.fltr_act = + ICE_FWD_TO_QGRP; + rule_info->sw_act.fwd_id.q_id = +@@ 
-997,6 +999,46 @@ ice_switch_parse_action(struct ice_pf *pf, + return -rte_errno; + } + ++static int ++ice_switch_check_action(const struct rte_flow_action *actions, ++ struct rte_flow_error *error) ++{ ++ const struct rte_flow_action *action; ++ enum rte_flow_action_type action_type; ++ uint16_t actions_num = 0; ++ ++ for (action = actions; action->type != ++ RTE_FLOW_ACTION_TYPE_END; action++) { ++ action_type = action->type; ++ switch (action_type) { ++ case RTE_FLOW_ACTION_TYPE_VF: ++ case RTE_FLOW_ACTION_TYPE_RSS: ++ case RTE_FLOW_ACTION_TYPE_QUEUE: ++ case RTE_FLOW_ACTION_TYPE_DROP: ++ actions_num++; ++ break; ++ case RTE_FLOW_ACTION_TYPE_VOID: ++ continue; ++ default: ++ rte_flow_error_set(error, ++ EINVAL, RTE_FLOW_ERROR_TYPE_ACTION, ++ actions, ++ "Invalid action type"); ++ return -rte_errno; ++ } ++ } ++ ++ if (actions_num > 1) { ++ rte_flow_error_set(error, ++ EINVAL, RTE_FLOW_ERROR_TYPE_ACTION, ++ actions, ++ "Invalid action number"); ++ return -rte_errno; ++ } ++ ++ return 0; ++} ++ + static int + ice_switch_parse_pattern_action(struct ice_adapter *ad, + struct ice_pattern_match_item *array, +@@ -1015,7 +1057,8 @@ ice_switch_parse_pattern_action(struct ice_adapter *ad, + uint16_t lkups_num = 0; + const struct rte_flow_item *item = pattern; + uint16_t item_num = 0; +- enum ice_sw_tunnel_type tun_type = ICE_NON_TUN; ++ enum ice_sw_tunnel_type tun_type = ++ ICE_SW_TUN_AND_NON_TUN; + struct ice_pattern_match_item *pattern_match_item = NULL; + + for (; item->type != RTE_FLOW_ITEM_TYPE_END; item++) { +@@ -1051,6 +1094,7 @@ ice_switch_parse_pattern_action(struct ice_adapter *ad, + return -rte_errno; + } + ++ memset(&rule_info, 0, sizeof(rule_info)); + rule_info.tun_type = tun_type; + + sw_meta_ptr = +@@ -1081,6 +1125,14 @@ ice_switch_parse_pattern_action(struct ice_adapter *ad, + goto error; + } + ++ ret = ice_switch_check_action(actions, error); ++ if (ret) { ++ rte_flow_error_set(error, EINVAL, ++ RTE_FLOW_ERROR_TYPE_HANDLE, NULL, ++ "Invalid input action number"); ++ goto error; ++ } ++ + ret = ice_switch_parse_action(pf, actions, error, &rule_info); + if (ret) { + rte_flow_error_set(error, EINVAL, +@@ -1088,10 +1140,17 @@ ice_switch_parse_pattern_action(struct ice_adapter *ad, + "Invalid input action"); + goto error; + } +- *meta = sw_meta_ptr; +- ((struct sw_meta *)*meta)->list = list; +- ((struct sw_meta *)*meta)->lkups_num = lkups_num; +- ((struct sw_meta *)*meta)->rule_info = rule_info; ++ ++ if (meta) { ++ *meta = sw_meta_ptr; ++ ((struct sw_meta *)*meta)->list = list; ++ ((struct sw_meta *)*meta)->lkups_num = lkups_num; ++ ((struct sw_meta *)*meta)->rule_info = rule_info; ++ } else { ++ rte_free(list); ++ rte_free(sw_meta_ptr); ++ } ++ + rte_free(pattern_match_item); + + return 0; +diff --git a/dpdk/drivers/net/ipn3ke/ipn3ke_representor.c b/dpdk/drivers/net/ipn3ke/ipn3ke_representor.c +index 8d9ebef915..b673c49149 100644 +--- a/dpdk/drivers/net/ipn3ke/ipn3ke_representor.c ++++ b/dpdk/drivers/net/ipn3ke/ipn3ke_representor.c +@@ -701,7 +701,7 @@ struct ipn3ke_rpst_hw_port_stats *hw_stats) + &tmp, + IPN3KE_25G_TX_STATISTICS_STATUS, + port_id, +- 1); ++ 0); + if (tmp & IPN3KE_25G_TX_STATISTICS_STATUS_SHADOW_REQUEST_MASK) { + tmp = 0x00000000; + (*hw->f_mac_read)(hw, +@@ -2598,7 +2598,8 @@ ipn3ke_rpst_scan_check(void) + int ret; + + if (ipn3ke_rpst_scan_num == 1) { +- ret = pthread_create(&ipn3ke_rpst_scan_thread, ++ ret = rte_ctrl_thread_create(&ipn3ke_rpst_scan_thread, ++ "ipn3ke scanner", + NULL, + ipn3ke_rpst_scan_handle_request, NULL); + if (ret) { +diff --git 
a/dpdk/drivers/net/ipn3ke/meson.build b/dpdk/drivers/net/ipn3ke/meson.build +index e3c8a6768c..bfec592aba 100644 +--- a/dpdk/drivers/net/ipn3ke/meson.build ++++ b/dpdk/drivers/net/ipn3ke/meson.build +@@ -21,9 +21,11 @@ endif + if build + allow_experimental_apis = true + ++ includes += include_directories('../../raw/ifpga') ++ + sources += files('ipn3ke_ethdev.c', + 'ipn3ke_representor.c', + 'ipn3ke_tm.c', + 'ipn3ke_flow.c') +- deps += ['bus_ifpga', 'sched', 'pmd_i40e', 'rawdev', 'rawdev_ifpga'] ++ deps += ['bus_ifpga', 'ethdev', 'sched'] + endif +diff --git a/dpdk/drivers/net/ixgbe/Makefile b/dpdk/drivers/net/ixgbe/Makefile +index 85762e2f2a..31523025b3 100644 +--- a/dpdk/drivers/net/ixgbe/Makefile ++++ b/dpdk/drivers/net/ixgbe/Makefile +@@ -57,6 +57,7 @@ endif + LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring + LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs -lrte_hash + LDLIBS += -lrte_bus_pci ++LDLIBS += -lpthread + + # + # Add extra flags for base driver files (also known as shared code) +diff --git a/dpdk/drivers/net/ixgbe/base/README b/dpdk/drivers/net/ixgbe/base/README +index b6b420e2f3..a48b14ed27 100644 +--- a/dpdk/drivers/net/ixgbe/base/README ++++ b/dpdk/drivers/net/ixgbe/base/README +@@ -1,5 +1,5 @@ + /* SPDX-License-Identifier: BSD-3-Clause +- * Copyright(c) 2010-2018 Intel Corporation ++ * Copyright(c) 2010-2020 Intel Corporation + */ + + Intel® IXGBE driver +diff --git a/dpdk/drivers/net/ixgbe/base/ixgbe_82598.c b/dpdk/drivers/net/ixgbe/base/ixgbe_82598.c +index 245ff75d55..c83e1c6b30 100644 +--- a/dpdk/drivers/net/ixgbe/base/ixgbe_82598.c ++++ b/dpdk/drivers/net/ixgbe/base/ixgbe_82598.c +@@ -1,5 +1,5 @@ + /* SPDX-License-Identifier: BSD-3-Clause +- * Copyright(c) 2001-2018 ++ * Copyright(c) 2001-2020 Intel Corporation + */ + + #include "ixgbe_type.h" +diff --git a/dpdk/drivers/net/ixgbe/base/ixgbe_82598.h b/dpdk/drivers/net/ixgbe/base/ixgbe_82598.h +index 8013f495ec..7bad5e12d3 100644 +--- a/dpdk/drivers/net/ixgbe/base/ixgbe_82598.h ++++ b/dpdk/drivers/net/ixgbe/base/ixgbe_82598.h +@@ -1,5 +1,5 @@ + /* SPDX-License-Identifier: BSD-3-Clause +- * Copyright(c) 2001-2018 ++ * Copyright(c) 2001-2020 Intel Corporation + */ + + #ifndef _IXGBE_82598_H_ +diff --git a/dpdk/drivers/net/ixgbe/base/ixgbe_82599.c b/dpdk/drivers/net/ixgbe/base/ixgbe_82599.c +index 96bdde62c8..9cd0b1428c 100644 +--- a/dpdk/drivers/net/ixgbe/base/ixgbe_82599.c ++++ b/dpdk/drivers/net/ixgbe/base/ixgbe_82599.c +@@ -1,5 +1,5 @@ + /* SPDX-License-Identifier: BSD-3-Clause +- * Copyright(c) 2001-2018 ++ * Copyright(c) 2001-2020 Intel Corporation + */ + + #include "ixgbe_type.h" +diff --git a/dpdk/drivers/net/ixgbe/base/ixgbe_82599.h b/dpdk/drivers/net/ixgbe/base/ixgbe_82599.h +index a32eb1f517..238481983f 100644 +--- a/dpdk/drivers/net/ixgbe/base/ixgbe_82599.h ++++ b/dpdk/drivers/net/ixgbe/base/ixgbe_82599.h +@@ -1,5 +1,5 @@ + /* SPDX-License-Identifier: BSD-3-Clause +- * Copyright(c) 2001-2018 ++ * Copyright(c) 2001-2020 Intel Corporation + */ + + #ifndef _IXGBE_82599_H_ +diff --git a/dpdk/drivers/net/ixgbe/base/ixgbe_api.c b/dpdk/drivers/net/ixgbe/base/ixgbe_api.c +index 873c07999c..0a22df3d06 100644 +--- a/dpdk/drivers/net/ixgbe/base/ixgbe_api.c ++++ b/dpdk/drivers/net/ixgbe/base/ixgbe_api.c +@@ -1,5 +1,5 @@ + /* SPDX-License-Identifier: BSD-3-Clause +- * Copyright(c) 2001-2018 ++ * Copyright(c) 2001-2020 Intel Corporation + */ + + #include "ixgbe_api.h" +diff --git a/dpdk/drivers/net/ixgbe/base/ixgbe_api.h b/dpdk/drivers/net/ixgbe/base/ixgbe_api.h +index ff8f7b2611..33e7c3c215 100644 +--- 
a/dpdk/drivers/net/ixgbe/base/ixgbe_api.h ++++ b/dpdk/drivers/net/ixgbe/base/ixgbe_api.h +@@ -1,5 +1,5 @@ + /* SPDX-License-Identifier: BSD-3-Clause +- * Copyright(c) 2001-2018 ++ * Copyright(c) 2001-2020 Intel Corporation + */ + + #ifndef _IXGBE_API_H_ +diff --git a/dpdk/drivers/net/ixgbe/base/ixgbe_common.c b/dpdk/drivers/net/ixgbe/base/ixgbe_common.c +index 62ff767230..4eb98dc198 100644 +--- a/dpdk/drivers/net/ixgbe/base/ixgbe_common.c ++++ b/dpdk/drivers/net/ixgbe/base/ixgbe_common.c +@@ -1,5 +1,5 @@ + /* SPDX-License-Identifier: BSD-3-Clause +- * Copyright(c) 2001-2018 ++ * Copyright(c) 2001-2020 Intel Corporation + */ + + #include "ixgbe_common.h" +diff --git a/dpdk/drivers/net/ixgbe/base/ixgbe_common.h b/dpdk/drivers/net/ixgbe/base/ixgbe_common.h +index 3bb2475119..7a31f088c4 100644 +--- a/dpdk/drivers/net/ixgbe/base/ixgbe_common.h ++++ b/dpdk/drivers/net/ixgbe/base/ixgbe_common.h +@@ -1,5 +1,5 @@ + /* SPDX-License-Identifier: BSD-3-Clause +- * Copyright(c) 2001-2018 ++ * Copyright(c) 2001-2020 Intel Corporation + */ + + #ifndef _IXGBE_COMMON_H_ +diff --git a/dpdk/drivers/net/ixgbe/base/ixgbe_dcb.c b/dpdk/drivers/net/ixgbe/base/ixgbe_dcb.c +index a590e0e07c..53def2146e 100644 +--- a/dpdk/drivers/net/ixgbe/base/ixgbe_dcb.c ++++ b/dpdk/drivers/net/ixgbe/base/ixgbe_dcb.c +@@ -1,5 +1,5 @@ + /* SPDX-License-Identifier: BSD-3-Clause +- * Copyright(c) 2001-2018 ++ * Copyright(c) 2001-2020 Intel Corporation + */ + + +diff --git a/dpdk/drivers/net/ixgbe/base/ixgbe_dcb.h b/dpdk/drivers/net/ixgbe/base/ixgbe_dcb.h +index 503d06018f..c2a1013ac0 100644 +--- a/dpdk/drivers/net/ixgbe/base/ixgbe_dcb.h ++++ b/dpdk/drivers/net/ixgbe/base/ixgbe_dcb.h +@@ -1,5 +1,5 @@ + /* SPDX-License-Identifier: BSD-3-Clause +- * Copyright(c) 2001-2018 ++ * Copyright(c) 2001-2020 Intel Corporation + */ + + #ifndef _IXGBE_DCB_H_ +diff --git a/dpdk/drivers/net/ixgbe/base/ixgbe_dcb_82598.c b/dpdk/drivers/net/ixgbe/base/ixgbe_dcb_82598.c +index d87cb58857..bb309e28fd 100644 +--- a/dpdk/drivers/net/ixgbe/base/ixgbe_dcb_82598.c ++++ b/dpdk/drivers/net/ixgbe/base/ixgbe_dcb_82598.c +@@ -1,5 +1,5 @@ + /* SPDX-License-Identifier: BSD-3-Clause +- * Copyright(c) 2001-2018 ++ * Copyright(c) 2001-2020 Intel Corporation + */ + + +diff --git a/dpdk/drivers/net/ixgbe/base/ixgbe_dcb_82598.h b/dpdk/drivers/net/ixgbe/base/ixgbe_dcb_82598.h +index 1a14744482..8f36881378 100644 +--- a/dpdk/drivers/net/ixgbe/base/ixgbe_dcb_82598.h ++++ b/dpdk/drivers/net/ixgbe/base/ixgbe_dcb_82598.h +@@ -1,5 +1,5 @@ + /* SPDX-License-Identifier: BSD-3-Clause +- * Copyright(c) 2001-2018 ++ * Copyright(c) 2001-2020 Intel Corporation + */ + + #ifndef _IXGBE_DCB_82598_H_ +diff --git a/dpdk/drivers/net/ixgbe/base/ixgbe_dcb_82599.c b/dpdk/drivers/net/ixgbe/base/ixgbe_dcb_82599.c +index f4f0ff0190..04e0d1fb7d 100644 +--- a/dpdk/drivers/net/ixgbe/base/ixgbe_dcb_82599.c ++++ b/dpdk/drivers/net/ixgbe/base/ixgbe_dcb_82599.c +@@ -1,5 +1,5 @@ + /* SPDX-License-Identifier: BSD-3-Clause +- * Copyright(c) 2001-2018 ++ * Copyright(c) 2001-2020 Intel Corporation + */ + + +diff --git a/dpdk/drivers/net/ixgbe/base/ixgbe_dcb_82599.h b/dpdk/drivers/net/ixgbe/base/ixgbe_dcb_82599.h +index 085ada27f7..7bd1d6a325 100644 +--- a/dpdk/drivers/net/ixgbe/base/ixgbe_dcb_82599.h ++++ b/dpdk/drivers/net/ixgbe/base/ixgbe_dcb_82599.h +@@ -1,5 +1,5 @@ + /* SPDX-License-Identifier: BSD-3-Clause +- * Copyright(c) 2001-2018 ++ * Copyright(c) 2001-2020 Intel Corporation + */ + + #ifndef _IXGBE_DCB_82599_H_ +diff --git a/dpdk/drivers/net/ixgbe/base/ixgbe_hv_vf.c 
b/dpdk/drivers/net/ixgbe/base/ixgbe_hv_vf.c +index 67a124d8d1..6005c4ac93 100644 +--- a/dpdk/drivers/net/ixgbe/base/ixgbe_hv_vf.c ++++ b/dpdk/drivers/net/ixgbe/base/ixgbe_hv_vf.c +@@ -1,5 +1,5 @@ + /* SPDX-License-Identifier: BSD-3-Clause +- * Copyright(c) 2001-2018 ++ * Copyright(c) 2001-2020 Intel Corporation + */ + + #include "ixgbe_vf.h" +diff --git a/dpdk/drivers/net/ixgbe/base/ixgbe_hv_vf.h b/dpdk/drivers/net/ixgbe/base/ixgbe_hv_vf.h +index 9664f3bdbf..dd2e1eee4e 100644 +--- a/dpdk/drivers/net/ixgbe/base/ixgbe_hv_vf.h ++++ b/dpdk/drivers/net/ixgbe/base/ixgbe_hv_vf.h +@@ -1,5 +1,5 @@ + /* SPDX-License-Identifier: BSD-3-Clause +- * Copyright(c) 2001-2018 ++ * Copyright(c) 2001-2020 Intel Corporation + */ + + #ifndef _IXGBE_HV_VF_H_ +diff --git a/dpdk/drivers/net/ixgbe/base/ixgbe_mbx.c b/dpdk/drivers/net/ixgbe/base/ixgbe_mbx.c +index cb82942dfa..13bdb5f68f 100644 +--- a/dpdk/drivers/net/ixgbe/base/ixgbe_mbx.c ++++ b/dpdk/drivers/net/ixgbe/base/ixgbe_mbx.c +@@ -1,5 +1,5 @@ + /* SPDX-License-Identifier: BSD-3-Clause +- * Copyright(c) 2001-2018 ++ * Copyright(c) 2001-2020 Intel Corporation + */ + + #include "ixgbe_type.h" +diff --git a/dpdk/drivers/net/ixgbe/base/ixgbe_mbx.h b/dpdk/drivers/net/ixgbe/base/ixgbe_mbx.h +index 5d32cbc074..1a45e49c2f 100644 +--- a/dpdk/drivers/net/ixgbe/base/ixgbe_mbx.h ++++ b/dpdk/drivers/net/ixgbe/base/ixgbe_mbx.h +@@ -1,5 +1,5 @@ + /* SPDX-License-Identifier: BSD-3-Clause +- * Copyright(c) 2001-2018 ++ * Copyright(c) 2001-2020 Intel Corporation + */ + + #ifndef _IXGBE_MBX_H_ +diff --git a/dpdk/drivers/net/ixgbe/base/ixgbe_osdep.h b/dpdk/drivers/net/ixgbe/base/ixgbe_osdep.h +index ea8dc1cbe5..a4eb71777c 100644 +--- a/dpdk/drivers/net/ixgbe/base/ixgbe_osdep.h ++++ b/dpdk/drivers/net/ixgbe/base/ixgbe_osdep.h +@@ -1,5 +1,5 @@ + /* SPDX-License-Identifier: BSD-3-Clause +- * Copyright(c) 2001-2018 ++ * Copyright(c) 2001-2020 Intel Corporation + */ + + #ifndef _IXGBE_OS_H_ +diff --git a/dpdk/drivers/net/ixgbe/base/ixgbe_phy.c b/dpdk/drivers/net/ixgbe/base/ixgbe_phy.c +index dd118f9170..a8243fa974 100644 +--- a/dpdk/drivers/net/ixgbe/base/ixgbe_phy.c ++++ b/dpdk/drivers/net/ixgbe/base/ixgbe_phy.c +@@ -1,5 +1,5 @@ + /* SPDX-License-Identifier: BSD-3-Clause +- * Copyright(c) 2001-2018 ++ * Copyright(c) 2001-2020 Intel Corporation + */ + + #include "ixgbe_api.h" +diff --git a/dpdk/drivers/net/ixgbe/base/ixgbe_phy.h b/dpdk/drivers/net/ixgbe/base/ixgbe_phy.h +index f1605f2cc9..a06c3be170 100644 +--- a/dpdk/drivers/net/ixgbe/base/ixgbe_phy.h ++++ b/dpdk/drivers/net/ixgbe/base/ixgbe_phy.h +@@ -1,5 +1,5 @@ + /* SPDX-License-Identifier: BSD-3-Clause +- * Copyright(c) 2001-2018 ++ * Copyright(c) 2001-2020 Intel Corporation + */ + + #ifndef _IXGBE_PHY_H_ +diff --git a/dpdk/drivers/net/ixgbe/base/ixgbe_type.h b/dpdk/drivers/net/ixgbe/base/ixgbe_type.h +index 077b8f01c7..15e9370105 100644 +--- a/dpdk/drivers/net/ixgbe/base/ixgbe_type.h ++++ b/dpdk/drivers/net/ixgbe/base/ixgbe_type.h +@@ -1,5 +1,5 @@ + /* SPDX-License-Identifier: BSD-3-Clause +- * Copyright(c) 2001-2018 ++ * Copyright(c) 2001-2020 Intel Corporation + */ + + #ifndef _IXGBE_TYPE_H_ +diff --git a/dpdk/drivers/net/ixgbe/base/ixgbe_vf.c b/dpdk/drivers/net/ixgbe/base/ixgbe_vf.c +index aac37822e4..7f69ece107 100644 +--- a/dpdk/drivers/net/ixgbe/base/ixgbe_vf.c ++++ b/dpdk/drivers/net/ixgbe/base/ixgbe_vf.c +@@ -1,5 +1,5 @@ + /* SPDX-License-Identifier: BSD-3-Clause +- * Copyright(c) 2001-2018 ++ * Copyright(c) 2001-2020 Intel Corporation + */ + + +diff --git a/dpdk/drivers/net/ixgbe/base/ixgbe_vf.h 
b/dpdk/drivers/net/ixgbe/base/ixgbe_vf.h +index dba643fced..be58b4f76e 100644 +--- a/dpdk/drivers/net/ixgbe/base/ixgbe_vf.h ++++ b/dpdk/drivers/net/ixgbe/base/ixgbe_vf.h +@@ -1,5 +1,5 @@ + /* SPDX-License-Identifier: BSD-3-Clause +- * Copyright(c) 2001-2018 ++ * Copyright(c) 2001-2020 Intel Corporation + */ + + #ifndef _IXGBE_VF_H_ +diff --git a/dpdk/drivers/net/ixgbe/base/ixgbe_x540.c b/dpdk/drivers/net/ixgbe/base/ixgbe_x540.c +index f00f0eae7e..d65f47c181 100644 +--- a/dpdk/drivers/net/ixgbe/base/ixgbe_x540.c ++++ b/dpdk/drivers/net/ixgbe/base/ixgbe_x540.c +@@ -1,5 +1,5 @@ + /* SPDX-License-Identifier: BSD-3-Clause +- * Copyright(c) 2001-2018 ++ * Copyright(c) 2001-2020 Intel Corporation + */ + + #include "ixgbe_x540.h" +diff --git a/dpdk/drivers/net/ixgbe/base/ixgbe_x540.h b/dpdk/drivers/net/ixgbe/base/ixgbe_x540.h +index 231dfe56e5..ba79847d11 100644 +--- a/dpdk/drivers/net/ixgbe/base/ixgbe_x540.h ++++ b/dpdk/drivers/net/ixgbe/base/ixgbe_x540.h +@@ -1,5 +1,5 @@ + /* SPDX-License-Identifier: BSD-3-Clause +- * Copyright(c) 2001-2018 ++ * Copyright(c) 2001-2020 Intel Corporation + */ + + #ifndef _IXGBE_X540_H_ +diff --git a/dpdk/drivers/net/ixgbe/base/ixgbe_x550.c b/dpdk/drivers/net/ixgbe/base/ixgbe_x550.c +index 930a61a20b..3de406fd35 100644 +--- a/dpdk/drivers/net/ixgbe/base/ixgbe_x550.c ++++ b/dpdk/drivers/net/ixgbe/base/ixgbe_x550.c +@@ -1,5 +1,5 @@ + /* SPDX-License-Identifier: BSD-3-Clause +- * Copyright(c) 2001-2018 ++ * Copyright(c) 2001-2020 Intel Corporation + */ + + #include "ixgbe_x550.h" +diff --git a/dpdk/drivers/net/ixgbe/base/ixgbe_x550.h b/dpdk/drivers/net/ixgbe/base/ixgbe_x550.h +index 3bd98f243d..10086ab423 100644 +--- a/dpdk/drivers/net/ixgbe/base/ixgbe_x550.h ++++ b/dpdk/drivers/net/ixgbe/base/ixgbe_x550.h +@@ -1,5 +1,5 @@ + /* SPDX-License-Identifier: BSD-3-Clause +- * Copyright(c) 2001-2018 ++ * Copyright(c) 2001-2020 Intel Corporation + */ + + #ifndef _IXGBE_X550_H_ +diff --git a/dpdk/drivers/net/ixgbe/base/meson.build b/dpdk/drivers/net/ixgbe/base/meson.build +index bbd0f51ea5..20677ab034 100644 +--- a/dpdk/drivers/net/ixgbe/base/meson.build ++++ b/dpdk/drivers/net/ixgbe/base/meson.build +@@ -1,5 +1,5 @@ + # SPDX-License-Identifier: BSD-3-Clause +-# Copyright(c) 2017 Intel Corporation ++# Copyright(c) 2017-2020 Intel Corporation + + sources = [ + 'ixgbe_82598.c', +diff --git a/dpdk/drivers/net/ixgbe/ixgbe_ethdev.c b/dpdk/drivers/net/ixgbe/ixgbe_ethdev.c +index 2c6fd0f131..50edb69cb2 100644 +--- a/dpdk/drivers/net/ixgbe/ixgbe_ethdev.c ++++ b/dpdk/drivers/net/ixgbe/ixgbe_ethdev.c +@@ -229,7 +229,9 @@ static int ixgbe_dev_interrupt_get_status(struct rte_eth_dev *dev); + static int ixgbe_dev_interrupt_action(struct rte_eth_dev *dev); + static void ixgbe_dev_interrupt_handler(void *param); + static void ixgbe_dev_interrupt_delayed_handler(void *param); +-static void ixgbe_dev_setup_link_alarm_handler(void *param); ++static void *ixgbe_dev_setup_link_thread_handler(void *param); ++static int ixgbe_dev_wait_setup_link_complete(struct rte_eth_dev *dev, ++ uint32_t timeout_ms); + + static int ixgbe_add_rar(struct rte_eth_dev *dev, + struct rte_ether_addr *mac_addr, +@@ -378,6 +380,7 @@ static int ixgbe_dev_udp_tunnel_port_del(struct rte_eth_dev *dev, + struct rte_eth_udp_tunnel *udp_tunnel); + static int ixgbe_filter_restore(struct rte_eth_dev *dev); + static void ixgbe_l2_tunnel_conf(struct rte_eth_dev *dev); ++static int ixgbe_wait_for_link_up(struct ixgbe_hw *hw); + + /* + * Define VF Stats MACRO for Non "cleared on read" register +@@ -1075,6 +1078,7 @@ 
ixgbe_swfw_lock_reset(struct ixgbe_hw *hw) + static int + eth_ixgbe_dev_init(struct rte_eth_dev *eth_dev, void *init_params __rte_unused) + { ++ struct ixgbe_adapter *ad = eth_dev->data->dev_private; + struct rte_pci_device *pci_dev = RTE_ETH_DEV_TO_PCI(eth_dev); + struct rte_intr_handle *intr_handle = &pci_dev->intr_handle; + struct ixgbe_hw *hw = +@@ -1126,6 +1130,7 @@ eth_ixgbe_dev_init(struct rte_eth_dev *eth_dev, void *init_params __rte_unused) + return 0; + } + ++ rte_atomic32_clear(&ad->link_thread_running); + rte_eth_copy_pci_info(eth_dev, pci_dev); + + /* Vendor and Device ID need to be set before init of shared code */ +@@ -1170,8 +1175,8 @@ eth_ixgbe_dev_init(struct rte_eth_dev *eth_dev, void *init_params __rte_unused) + memset(dcb_config, 0, sizeof(struct ixgbe_dcb_config)); + ixgbe_dcb_init(hw, dcb_config); + /* Get Hardware Flow Control setting */ +- hw->fc.requested_mode = ixgbe_fc_full; +- hw->fc.current_mode = ixgbe_fc_full; ++ hw->fc.requested_mode = ixgbe_fc_none; ++ hw->fc.current_mode = ixgbe_fc_none; + hw->fc.pause_time = IXGBE_FC_PAUSE; + for (i = 0; i < IXGBE_DCB_MAX_TRAFFIC_CLASS; i++) { + hw->fc.low_water[i] = IXGBE_FC_LO; +@@ -1190,7 +1195,6 @@ eth_ixgbe_dev_init(struct rte_eth_dev *eth_dev, void *init_params __rte_unused) + diag = ixgbe_bypass_init_hw(hw); + #else + diag = ixgbe_init_hw(hw); +- hw->mac.autotry_restart = false; + #endif /* RTE_LIBRTE_IXGBE_BYPASS */ + + /* +@@ -1301,8 +1305,6 @@ eth_ixgbe_dev_init(struct rte_eth_dev *eth_dev, void *init_params __rte_unused) + /* enable support intr */ + ixgbe_enable_intr(eth_dev); + +- ixgbe_dev_set_link_down(eth_dev); +- + /* initialize filter info */ + memset(filter_info, 0, + sizeof(struct ixgbe_filter_info)); +@@ -1564,6 +1566,7 @@ eth_ixgbevf_dev_init(struct rte_eth_dev *eth_dev) + { + int diag; + uint32_t tc, tcs; ++ struct ixgbe_adapter *ad = eth_dev->data->dev_private; + struct rte_pci_device *pci_dev = RTE_ETH_DEV_TO_PCI(eth_dev); + struct rte_intr_handle *intr_handle = &pci_dev->intr_handle; + struct ixgbe_hw *hw = +@@ -1604,6 +1607,7 @@ eth_ixgbevf_dev_init(struct rte_eth_dev *eth_dev) + return 0; + } + ++ rte_atomic32_clear(&ad->link_thread_running); + ixgbevf_parse_devargs(eth_dev->data->dev_private, + pci_dev->device.devargs); + +@@ -2530,6 +2534,41 @@ ixgbe_set_vf_rate_limit(struct rte_eth_dev *dev, uint16_t vf, + return 0; + } + ++static int ++ixgbe_flow_ctrl_enable(struct rte_eth_dev *dev, struct ixgbe_hw *hw) ++{ ++ struct ixgbe_adapter *adapter = dev->data->dev_private; ++ int err; ++ uint32_t mflcn; ++ ++ ixgbe_setup_fc(hw); ++ ++ err = ixgbe_fc_enable(hw); ++ ++ /* Not negotiated is not an error case */ ++ if (err == IXGBE_SUCCESS || err == IXGBE_ERR_FC_NOT_NEGOTIATED) { ++ /* ++ *check if we want to forward MAC frames - driver doesn't ++ *have native capability to do that, ++ *so we'll write the registers ourselves ++ */ ++ ++ mflcn = IXGBE_READ_REG(hw, IXGBE_MFLCN); ++ ++ /* set or clear MFLCN.PMCF bit depending on configuration */ ++ if (adapter->mac_ctrl_frame_fwd != 0) ++ mflcn |= IXGBE_MFLCN_PMCF; ++ else ++ mflcn &= ~IXGBE_MFLCN_PMCF; ++ ++ IXGBE_WRITE_REG(hw, IXGBE_MFLCN, mflcn); ++ IXGBE_WRITE_FLUSH(hw); ++ ++ return 0; ++ } ++ return err; ++} ++ + /* + * Configure device link speed and setup link. + * It returns 0 on success. 
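The ixgbe hunks above and below replace the EAL alarm callback with a single detached link-setup thread, gated by ad->link_thread_running and drained by a bounded polling wait. What follows is a minimal standalone sketch of that hand-off, using plain C11 atomics and pthreads in place of rte_atomic32_* and rte_ctrl_thread_create; every name in it is invented for illustration and none of it is DPDK API or part of this patch.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static atomic_bool worker_running; /* models ad->link_thread_running */

static void *setup_worker(void *arg)
{
	(void)arg;
	pthread_detach(pthread_self()); /* detached, as in the patch */
	usleep(50 * 1000);              /* stand-in for the real link setup */
	atomic_store(&worker_running, false); /* models rte_atomic32_clear() */
	return NULL;
}

/* Models the bounded wait: timeout_ms == 0 waits indefinitely, warning
 * every ~9 s; returns false only when a nonzero timeout expires. */
static bool wait_worker_done(unsigned int timeout_ms)
{
	unsigned int left = timeout_ms ? timeout_ms : 9000;

	while (atomic_load(&worker_running)) {
		usleep(1000); /* ~1 ms per iteration, like msec_delay(1) */
		if (--left == 0) {
			if (timeout_ms)
				return false;
			left = 9000;
			fprintf(stderr, "setup worker still running\n");
		}
	}
	return true;
}

int main(void)
{
	pthread_t tid;

	/* Test-and-set gate: only one worker may be in flight at a time. */
	if (!atomic_exchange(&worker_running, true)) {
		if (pthread_create(&tid, NULL, setup_worker, NULL) != 0)
			atomic_store(&worker_running, false); /* roll back */
	}
	return wait_worker_done(0) ? 0 : 1;
}

The design point mirrored here is that the flag doubles as both a mutual-exclusion gate for spawning and a completion signal for waiters, so stop/start paths can simply wait for the flag to clear instead of cancelling an alarm.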
+@@ -2558,19 +2597,8 @@ ixgbe_dev_start(struct rte_eth_dev *dev) + + PMD_INIT_FUNC_TRACE(); + +- /* IXGBE devices don't support: +- * - half duplex (checked afterwards for valid speeds) +- * - fixed speed: TODO implement +- */ +- if (dev->data->dev_conf.link_speeds & ETH_LINK_SPEED_FIXED) { +- PMD_INIT_LOG(ERR, +- "Invalid link_speeds for port %u, fix speed not supported", +- dev->data->port_id); +- return -EINVAL; +- } +- + /* Stop the link setup handler before resetting the HW. */ +- rte_eal_alarm_cancel(ixgbe_dev_setup_link_alarm_handler, dev); ++ ixgbe_dev_wait_setup_link_complete(dev, 0); + + /* disable uio/vfio intr/eventfd mapping */ + rte_intr_disable(intr_handle); +@@ -2666,6 +2694,12 @@ ixgbe_dev_start(struct rte_eth_dev *dev) + + ixgbe_restore_statistics_mapping(dev); + ++ err = ixgbe_flow_ctrl_enable(dev, hw); ++ if (err < 0) { ++ PMD_INIT_LOG(ERR, "enable flow ctrl err"); ++ goto error; ++ } ++ + err = ixgbe_dev_rxtx_start(dev); + if (err < 0) { + PMD_INIT_LOG(ERR, "Unable to start rxtx queues"); +@@ -2724,7 +2758,11 @@ ixgbe_dev_start(struct rte_eth_dev *dev) + } + + link_speeds = &dev->data->dev_conf.link_speeds; +- if (*link_speeds & ~allowed_speeds) { ++ ++ /* Ignore autoneg flag bit and check the validity of  ++ * link_speed  ++ */ ++ if (((*link_speeds) >> 1) & ~(allowed_speeds >> 1)) { + PMD_INIT_LOG(ERR, "Invalid link setting"); + goto error; + } +@@ -2801,6 +2839,11 @@ ixgbe_dev_start(struct rte_eth_dev *dev) + "please call hierarchy_commit() " + "before starting the port"); + ++ /* wait for the controller to acquire link */ ++ err = ixgbe_wait_for_link_up(hw); ++ if (err) ++ goto error; ++ + /* + * Update link status right before return, because it may + * start link configuration process in a separate thread. +@@ -2842,7 +2885,7 @@ ixgbe_dev_stop(struct rte_eth_dev *dev) + + PMD_INIT_FUNC_TRACE(); + +- rte_eal_alarm_cancel(ixgbe_dev_setup_link_alarm_handler, dev); ++ ixgbe_dev_wait_setup_link_complete(dev, 0); + + /* disable interrupts */ + ixgbe_disable_intr(hw); +@@ -2893,6 +2936,8 @@ ixgbe_dev_stop(struct rte_eth_dev *dev) + + adapter->rss_reta_updated = 0; + ++ adapter->mac_ctrl_frame_fwd = 0; ++ + hw->adapter_stopped = true; + } + +@@ -4095,16 +4140,46 @@ ixgbevf_check_link(struct ixgbe_hw *hw, ixgbe_link_speed *speed, + return ret_val; + } + +-static void +-ixgbe_dev_setup_link_alarm_handler(void *param) ++/* ++ * If @timeout_ms was 0, it means that it will not return until link complete. ++ * It returns 1 on complete, return 0 on timeout. ++ */ ++static int ++ixgbe_dev_wait_setup_link_complete(struct rte_eth_dev *dev, uint32_t timeout_ms) ++{ ++#define WARNING_TIMEOUT 9000 /* 9s in total */ ++ struct ixgbe_adapter *ad = dev->data->dev_private; ++ uint32_t timeout = timeout_ms ? 
timeout_ms : WARNING_TIMEOUT;
++
++ while (rte_atomic32_read(&ad->link_thread_running)) {
++ msec_delay(1);
++ timeout--;
++
++ if (timeout_ms) {
++ if (!timeout)
++ return 0;
++ } else if (!timeout) {
++ /* Keep waiting until the link setup completes. */
++ timeout = WARNING_TIMEOUT;
++ PMD_DRV_LOG(ERR, "IXGBE link thread is taking too long to complete!");
++ }
++ }
++
++ return 1;
++}
++
++static void *
++ixgbe_dev_setup_link_thread_handler(void *param)
+ {
+ struct rte_eth_dev *dev = (struct rte_eth_dev *)param;
++ struct ixgbe_adapter *ad = dev->data->dev_private;
+ struct ixgbe_hw *hw = IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
+ struct ixgbe_interrupt *intr =
+ IXGBE_DEV_PRIVATE_TO_INTR(dev->data->dev_private);
+ u32 speed;
+ bool autoneg = false;
+
++ pthread_detach(pthread_self());
+ speed = hw->phy.autoneg_advertised;
+ if (!speed)
+ ixgbe_get_link_capabilities(hw, &speed, &autoneg);
+@@ -4112,6 +4187,40 @@ ixgbe_dev_setup_link_alarm_handler(void *param)
+ ixgbe_setup_link(hw, speed, true);
+
+ intr->flags &= ~IXGBE_FLAG_NEED_LINK_CONFIG;
++ rte_atomic32_clear(&ad->link_thread_running);
++ return NULL;
++}
++
++/*
++ * In the FreeBSD environment, nic_uio drivers do not support interrupts,
++ * so rte_intr_callback_register() fails to register interrupts.
++ * We cannot rely on an interrupt callback to bring the link status from
++ * down to up, so we need to wait for the controller to acquire link
++ * when ports start.
++ * It returns 0 on link up.
++ */
++static int
++ixgbe_wait_for_link_up(struct ixgbe_hw *hw)
++{
++#ifdef RTE_EXEC_ENV_FREEBSD
++ int err, i, link_up = 0;
++ uint32_t speed = 0;
++ const int nb_iter = 25;
++
++ for (i = 0; i < nb_iter; i++) {
++ err = ixgbe_check_link(hw, &speed, &link_up, 0);
++ if (err)
++ return err;
++ if (link_up)
++ return 0;
++ msec_delay(200);
++ }
++
++ return 0;
++#else
++ RTE_SET_USED(hw);
++ return 0;
++#endif
+ }
+
+ /* return 0 means link status changed, -1 means not changed */
+@@ -4120,6 +4229,7 @@ ixgbe_dev_link_update_share(struct rte_eth_dev *dev,
+ int wait_to_complete, int vf)
+ {
+ struct ixgbe_hw *hw = IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
++ struct ixgbe_adapter *ad = dev->data->dev_private;
+ struct rte_eth_link link;
+ ixgbe_link_speed link_speed = IXGBE_LINK_SPEED_UNKNOWN;
+ struct ixgbe_interrupt *intr =
+@@ -4133,7 +4243,8 @@ ixgbe_dev_link_update_share(struct rte_eth_dev *dev,
+ link.link_status = ETH_LINK_DOWN;
+ link.link_speed = ETH_SPEED_NUM_NONE;
+ link.link_duplex = ETH_LINK_HALF_DUPLEX;
+- link.link_autoneg = ETH_LINK_AUTONEG;
++ link.link_autoneg = !(dev->data->dev_conf.link_speeds &
++ ETH_LINK_SPEED_FIXED);
+
+ hw->mac.get_link_status = true;
+
+@@ -4144,6 +4255,11 @@ ixgbe_dev_link_update_share(struct rte_eth_dev *dev,
+ if (wait_to_complete == 0 || dev->data->dev_conf.intr_conf.lsc != 0)
+ wait = 0;
+
++/* BSD has no interrupt mechanism, so force NIC status synchronization. 
*/ ++#ifdef RTE_EXEC_ENV_FREEBSD ++ wait = 1; ++#endif ++ + if (vf) + diag = ixgbevf_check_link(hw, &link_speed, &link_up, wait); + else +@@ -4155,15 +4271,34 @@ ixgbe_dev_link_update_share(struct rte_eth_dev *dev, + return rte_eth_linkstatus_set(dev, &link); + } + +- esdp_reg = IXGBE_READ_REG(hw, IXGBE_ESDP); +- if ((esdp_reg & IXGBE_ESDP_SDP3)) +- link_up = 0; ++ if (ixgbe_get_media_type(hw) == ixgbe_media_type_fiber) { ++ esdp_reg = IXGBE_READ_REG(hw, IXGBE_ESDP); ++ if ((esdp_reg & IXGBE_ESDP_SDP3)) ++ link_up = 0; ++ } + + if (link_up == 0) { + if (ixgbe_get_media_type(hw) == ixgbe_media_type_fiber) { +- intr->flags |= IXGBE_FLAG_NEED_LINK_CONFIG; +- rte_eal_alarm_set(10, +- ixgbe_dev_setup_link_alarm_handler, dev); ++ ixgbe_dev_wait_setup_link_complete(dev, 0); ++ if (rte_atomic32_test_and_set(&ad->link_thread_running)) { ++ /* To avoid race condition between threads, set ++ * the IXGBE_FLAG_NEED_LINK_CONFIG flag only ++ * when there is no link thread running. ++ */ ++ intr->flags |= IXGBE_FLAG_NEED_LINK_CONFIG; ++ if (rte_ctrl_thread_create(&ad->link_thread_tid, ++ "ixgbe-link-handler", ++ NULL, ++ ixgbe_dev_setup_link_thread_handler, ++ dev) < 0) { ++ PMD_DRV_LOG(ERR, ++ "Create link thread failed!"); ++ rte_atomic32_clear(&ad->link_thread_running); ++ } ++ } else { ++ PMD_DRV_LOG(ERR, ++ "Other link thread is running now!"); ++ } + } + return rte_eth_linkstatus_set(dev, &link); + } +@@ -4646,10 +4781,10 @@ static int + ixgbe_flow_ctrl_set(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf) + { + struct ixgbe_hw *hw; ++ struct ixgbe_adapter *adapter = dev->data->dev_private; + int err; + uint32_t rx_buf_size; + uint32_t max_high_water; +- uint32_t mflcn; + enum ixgbe_fc_mode rte_fcmode_2_ixgbe_fcmode[] = { + ixgbe_fc_none, + ixgbe_fc_rx_pause, +@@ -4682,31 +4817,14 @@ ixgbe_flow_ctrl_set(struct rte_eth_dev *dev, struct rte_eth_fc_conf *fc_conf) + hw->fc.low_water[0] = fc_conf->low_water; + hw->fc.send_xon = fc_conf->send_xon; + hw->fc.disable_fc_autoneg = !fc_conf->autoneg; ++ adapter->mac_ctrl_frame_fwd = fc_conf->mac_ctrl_frame_fwd; + +- err = ixgbe_fc_enable(hw); +- +- /* Not negotiated is not an error case */ +- if ((err == IXGBE_SUCCESS) || (err == IXGBE_ERR_FC_NOT_NEGOTIATED)) { +- +- /* check if we want to forward MAC frames - driver doesn't have native +- * capability to do that, so we'll write the registers ourselves */ +- +- mflcn = IXGBE_READ_REG(hw, IXGBE_MFLCN); +- +- /* set or clear MFLCN.PMCF bit depending on configuration */ +- if (fc_conf->mac_ctrl_frame_fwd != 0) +- mflcn |= IXGBE_MFLCN_PMCF; +- else +- mflcn &= ~IXGBE_MFLCN_PMCF; +- +- IXGBE_WRITE_REG(hw, IXGBE_MFLCN, mflcn); +- IXGBE_WRITE_FLUSH(hw); +- +- return 0; ++ err = ixgbe_flow_ctrl_enable(dev, hw); ++ if (err < 0) { ++ PMD_INIT_LOG(ERR, "ixgbe_flow_ctrl_enable = 0x%x", err); ++ return -EIO; + } +- +- PMD_INIT_LOG(ERR, "ixgbe_fc_enable = 0x%x", err); +- return -EIO; ++ return err; + } + + /** +@@ -5207,7 +5325,7 @@ ixgbevf_dev_start(struct rte_eth_dev *dev) + PMD_INIT_FUNC_TRACE(); + + /* Stop the link setup handler before resetting the HW. 
*/ +- rte_eal_alarm_cancel(ixgbe_dev_setup_link_alarm_handler, dev); ++ ixgbe_dev_wait_setup_link_complete(dev, 0); + + err = hw->mac.ops.reset_hw(hw); + if (err) { +@@ -5305,7 +5423,7 @@ ixgbevf_dev_stop(struct rte_eth_dev *dev) + + PMD_INIT_FUNC_TRACE(); + +- rte_eal_alarm_cancel(ixgbe_dev_setup_link_alarm_handler, dev); ++ ixgbe_dev_wait_setup_link_complete(dev, 0); + + ixgbevf_intr_disable(dev); + +diff --git a/dpdk/drivers/net/ixgbe/ixgbe_ethdev.h b/dpdk/drivers/net/ixgbe/ixgbe_ethdev.h +index 76a1b9d184..0334c226be 100644 +--- a/dpdk/drivers/net/ixgbe/ixgbe_ethdev.h ++++ b/dpdk/drivers/net/ixgbe/ixgbe_ethdev.h +@@ -510,6 +510,9 @@ struct ixgbe_adapter { + * mailbox status) link status. + */ + uint8_t pflink_fullchk; ++ uint8_t mac_ctrl_frame_fwd; ++ rte_atomic32_t link_thread_running; ++ pthread_t link_thread_tid; + }; + + struct ixgbe_vf_representor { +diff --git a/dpdk/drivers/net/ixgbe/ixgbe_rxtx.c b/dpdk/drivers/net/ixgbe/ixgbe_rxtx.c +index fa572d184d..a7b24cd053 100644 +--- a/dpdk/drivers/net/ixgbe/ixgbe_rxtx.c ++++ b/dpdk/drivers/net/ixgbe/ixgbe_rxtx.c +@@ -87,11 +87,6 @@ + #define rte_ixgbe_prefetch(p) do {} while (0) + #endif + +-#ifdef RTE_IXGBE_INC_VECTOR +-uint16_t ixgbe_xmit_fixed_burst_vec(void *tx_queue, struct rte_mbuf **tx_pkts, +- uint16_t nb_pkts); +-#endif +- + /********************************************************************* + * + * TX functions +@@ -993,6 +988,12 @@ ixgbe_prep_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts) + return i; + } + ++ /* check the size of packet */ ++ if (m->pkt_len < IXGBE_TX_MIN_PKT_LEN) { ++ rte_errno = EINVAL; ++ return i; ++ } ++ + #ifdef RTE_LIBRTE_ETHDEV_DEBUG + ret = rte_validate_tx_offload(m); + if (ret != 0) { +diff --git a/dpdk/drivers/net/ixgbe/ixgbe_rxtx.h b/dpdk/drivers/net/ixgbe/ixgbe_rxtx.h +index 505d344b9c..5e1ac44942 100644 +--- a/dpdk/drivers/net/ixgbe/ixgbe_rxtx.h ++++ b/dpdk/drivers/net/ixgbe/ixgbe_rxtx.h +@@ -53,6 +53,8 @@ + + #define IXGBE_TX_MAX_SEG 40 + ++#define IXGBE_TX_MIN_PKT_LEN 14 ++ + #define IXGBE_PACKET_TYPE_MASK_82599 0X7F + #define IXGBE_PACKET_TYPE_MASK_X550 0X10FF + #define IXGBE_PACKET_TYPE_MASK_TUNNEL 0XFF +diff --git a/dpdk/drivers/net/ixgbe/rte_pmd_ixgbe.c b/dpdk/drivers/net/ixgbe/rte_pmd_ixgbe.c +index 8bcaded6e5..9bff557f97 100644 +--- a/dpdk/drivers/net/ixgbe/rte_pmd_ixgbe.c ++++ b/dpdk/drivers/net/ixgbe/rte_pmd_ixgbe.c +@@ -522,6 +522,9 @@ rte_pmd_ixgbe_macsec_enable(uint16_t port, uint8_t en, uint8_t rp) + + dev = &rte_eth_devices[port]; + ++ if (!is_ixgbe_supported(dev)) ++ return -ENOTSUP; ++ + macsec_setting.offload_en = 1; + macsec_setting.encrypt_en = en; + macsec_setting.replayprotect_en = rp; +@@ -542,6 +545,9 @@ rte_pmd_ixgbe_macsec_disable(uint16_t port) + + dev = &rte_eth_devices[port]; + ++ if (!is_ixgbe_supported(dev)) ++ return -ENOTSUP; ++ + ixgbe_dev_macsec_setting_reset(dev); + + ixgbe_dev_macsec_register_disable(dev); +diff --git a/dpdk/drivers/net/memif/memif_socket.c b/dpdk/drivers/net/memif/memif_socket.c +index ad5e30b96e..c1967c67bf 100644 +--- a/dpdk/drivers/net/memif/memif_socket.c ++++ b/dpdk/drivers/net/memif/memif_socket.c +@@ -204,6 +204,13 @@ memif_msg_receive_init(struct memif_control_channel *cc, memif_msg_t *msg) + pmd = dev->data->dev_private; + if (((pmd->flags & ETH_MEMIF_FLAG_DISABLED) == 0) && + pmd->id == i->id) { ++ if (pmd->flags & (ETH_MEMIF_FLAG_CONNECTING | ++ ETH_MEMIF_FLAG_CONNECTED)) { ++ memif_msg_enq_disconnect(cc, ++ "Already connected", 0); ++ return -1; ++ } ++ + /* assign control channel to device */ + cc->dev = 
dev; + pmd->cc = cc; +@@ -215,12 +222,6 @@ memif_msg_receive_init(struct memif_control_channel *cc, memif_msg_t *msg) + return -1; + } + +- if (pmd->flags & (ETH_MEMIF_FLAG_CONNECTING | +- ETH_MEMIF_FLAG_CONNECTED)) { +- memif_msg_enq_disconnect(pmd->cc, +- "Already connected", 0); +- return -1; +- } + strlcpy(pmd->remote_name, (char *)i->name, + sizeof(pmd->remote_name)); + +@@ -765,6 +766,7 @@ memif_intr_handler(void *arg) + ret = memif_msg_receive(cc); + /* if driver failed to assign device */ + if (cc->dev == NULL) { ++ memif_msg_send_from_queue(cc); + ret = rte_intr_callback_unregister_pending(&cc->intr_handle, + memif_intr_handler, + cc, +diff --git a/dpdk/drivers/net/memif/rte_eth_memif.c b/dpdk/drivers/net/memif/rte_eth_memif.c +index 8dd1d0d63d..aa83df3652 100644 +--- a/dpdk/drivers/net/memif/rte_eth_memif.c ++++ b/dpdk/drivers/net/memif/rte_eth_memif.c +@@ -1501,7 +1501,7 @@ memif_create(struct rte_vdev_device *vdev, enum memif_role_t role, + } + + +- eth_dev->data->dev_flags &= RTE_ETH_DEV_CLOSE_REMOVE; ++ eth_dev->data->dev_flags |= RTE_ETH_DEV_CLOSE_REMOVE; + + rte_eth_dev_probing_finish(eth_dev); + +diff --git a/dpdk/drivers/net/mlx4/meson.build b/dpdk/drivers/net/mlx4/meson.build +index 9eb4988420..ff7386aee2 100644 +--- a/dpdk/drivers/net/mlx4/meson.build ++++ b/dpdk/drivers/net/mlx4/meson.build +@@ -9,11 +9,12 @@ if not is_linux + endif + build = true + +-pmd_dlopen = (get_option('ibverbs_link') == 'dlopen') ++static_ibverbs = (get_option('ibverbs_link') == 'static') ++dlopen_ibverbs = (get_option('ibverbs_link') == 'dlopen') + LIB_GLUE_BASE = 'librte_pmd_mlx4_glue.so' + LIB_GLUE_VERSION = '18.02.0' + LIB_GLUE = LIB_GLUE_BASE + '.' + LIB_GLUE_VERSION +-if pmd_dlopen ++if dlopen_ibverbs + dpdk_conf.set('RTE_IBVERBS_LINK_DLOPEN', 1) + cflags += [ + '-DMLX4_GLUE="@0@"'.format(LIB_GLUE), +@@ -24,12 +25,15 @@ endif + libnames = [ 'mlx4', 'ibverbs' ] + libs = [] + foreach libname:libnames +- lib = dependency('lib' + libname, required:false) +- if not lib.found() ++ lib = dependency('lib' + libname, static:static_ibverbs, required:false) ++ if not lib.found() and not static_ibverbs + lib = cc.find_library(libname, required:false) + endif + if lib.found() +- libs += [ lib ] ++ libs += lib ++ if not static_ibverbs and not dlopen_ibverbs ++ ext_deps += lib ++ endif + else + build = false + reason = 'missing dependency, "' + libname + '"' +@@ -37,8 +41,17 @@ foreach libname:libnames + endforeach + + if build ++ if static_ibverbs or dlopen_ibverbs ++ # Build without adding shared libs to Requires.private ++ ibv_cflags = run_command(pkgconf, '--cflags', 'libibverbs').stdout() ++ ext_deps += declare_dependency(compile_args: ibv_cflags.split()) ++ endif ++ if static_ibverbs ++ # Add static deps ldflags to internal apps and Libs.private ++ ibv_ldflags = run_command(ldflags_ibverbs_static, check:true).stdout() ++ ext_deps += declare_dependency(link_args:ibv_ldflags.split()) ++ endif + allow_experimental_apis = true +- ext_deps += libs + sources = files( + 'mlx4.c', + 'mlx4_ethdev.c', +@@ -51,7 +64,7 @@ if build + 'mlx4_txq.c', + 'mlx4_utils.c', + ) +- if not pmd_dlopen ++ if not dlopen_ibverbs + sources += files('mlx4_glue.c') + endif + cflags_options = [ +@@ -103,7 +116,7 @@ if build + configure_file(output : 'mlx4_autoconf.h', configuration : config) + endif + # Build Glue Library +-if pmd_dlopen and build ++if dlopen_ibverbs and build + dlopen_name = 'mlx4_glue' + dlopen_lib_name = driver_name_fmt.format(dlopen_name) + dlopen_so_version = LIB_GLUE_VERSION +diff --git 
a/dpdk/drivers/net/mlx4/mlx4.c b/dpdk/drivers/net/mlx4/mlx4.c +index ab5e6c66cb..7771417ebe 100644 +--- a/dpdk/drivers/net/mlx4/mlx4.c ++++ b/dpdk/drivers/net/mlx4/mlx4.c +@@ -49,6 +49,10 @@ + #include "mlx4_rxtx.h" + #include "mlx4_utils.h" + ++#ifdef MLX4_GLUE ++const struct mlx4_glue *mlx4_glue; ++#endif ++ + static const char *MZ_MLX4_PMD_SHARED_DATA = "mlx4_pmd_shared_data"; + + /* Shared memory between primary and secondary processes. */ +diff --git a/dpdk/drivers/net/mlx4/mlx4_flow.c b/dpdk/drivers/net/mlx4/mlx4_flow.c +index 96479b83dd..2da4f6d965 100644 +--- a/dpdk/drivers/net/mlx4/mlx4_flow.c ++++ b/dpdk/drivers/net/mlx4/mlx4_flow.c +@@ -981,12 +981,13 @@ mlx4_drop_get(struct mlx4_priv *priv) + priv->drop = drop; + return drop; + error: +- if (drop->qp) +- claim_zero(mlx4_glue->destroy_qp(drop->qp)); +- if (drop->cq) +- claim_zero(mlx4_glue->destroy_cq(drop->cq)); +- if (drop) ++ if (drop) { ++ if (drop->qp) ++ claim_zero(mlx4_glue->destroy_qp(drop->qp)); ++ if (drop->cq) ++ claim_zero(mlx4_glue->destroy_cq(drop->cq)); + rte_free(drop); ++ } + rte_errno = ENOMEM; + return NULL; + } +diff --git a/dpdk/drivers/net/mlx4/mlx4_glue.h b/dpdk/drivers/net/mlx4/mlx4_glue.h +index 668ca86700..5d9e985495 100644 +--- a/dpdk/drivers/net/mlx4/mlx4_glue.h ++++ b/dpdk/drivers/net/mlx4/mlx4_glue.h +@@ -84,6 +84,6 @@ struct mlx4_glue { + void *attr); + }; + +-const struct mlx4_glue *mlx4_glue; ++extern const struct mlx4_glue *mlx4_glue; + + #endif /* MLX4_GLUE_H_ */ +diff --git a/dpdk/drivers/net/mlx4/mlx4_rxtx.h b/dpdk/drivers/net/mlx4/mlx4_rxtx.h +index 8baf33fa94..9de6c59411 100644 +--- a/dpdk/drivers/net/mlx4/mlx4_rxtx.h ++++ b/dpdk/drivers/net/mlx4/mlx4_rxtx.h +@@ -124,7 +124,7 @@ struct txq { + + /* mlx4_rxq.c */ + +-uint8_t mlx4_rss_hash_key_default[MLX4_RSS_HASH_KEY_SIZE]; ++extern uint8_t mlx4_rss_hash_key_default[MLX4_RSS_HASH_KEY_SIZE]; + int mlx4_rss_init(struct mlx4_priv *priv); + void mlx4_rss_deinit(struct mlx4_priv *priv); + struct mlx4_rss *mlx4_rss_get(struct mlx4_priv *priv, uint64_t fields, +diff --git a/dpdk/drivers/net/mlx4/mlx4_utils.h b/dpdk/drivers/net/mlx4/mlx4_utils.h +index 74b9d2ecdc..5718b9c742 100644 +--- a/dpdk/drivers/net/mlx4/mlx4_utils.h ++++ b/dpdk/drivers/net/mlx4/mlx4_utils.h +@@ -79,9 +79,10 @@ pmd_drv_log_basename(const char *s) + + /** Allocate a buffer on the stack and fill it with a printf format string. */ + #define MKSTR(name, ...) \ +- char name[snprintf(NULL, 0, __VA_ARGS__) + 1]; \ ++ int mkstr_size_##name = snprintf(NULL, 0, "" __VA_ARGS__); \ ++ char name[mkstr_size_##name + 1]; \ + \ +- snprintf(name, sizeof(name), __VA_ARGS__) ++ snprintf(name, sizeof(name), "" __VA_ARGS__) + + /** Generate a string out of the provided arguments. */ + #define MLX4_STR(...) 
# __VA_ARGS__ +diff --git a/dpdk/drivers/net/mlx5/Makefile b/dpdk/drivers/net/mlx5/Makefile +index c5cf4397ac..605975c245 100644 +--- a/dpdk/drivers/net/mlx5/Makefile ++++ b/dpdk/drivers/net/mlx5/Makefile +@@ -193,6 +193,11 @@ mlx5_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh + infiniband/mlx5dv.h \ + func mlx5dv_devx_obj_query_async \ + $(AUTOCONF_OUTPUT) ++ $Q sh -- '$<' '$@' \ ++ HAVE_IBV_DEVX_QP \ ++ infiniband/mlx5dv.h \ ++ func mlx5dv_devx_qp_query \ ++ $(AUTOCONF_OUTPUT) + $Q sh -- '$<' '$@' \ + HAVE_MLX5DV_DR_ACTION_DEST_DEVX_TIR \ + infiniband/mlx5dv.h \ +diff --git a/dpdk/drivers/net/mlx5/meson.build b/dpdk/drivers/net/mlx5/meson.build +index d6b32db794..a5775d18e3 100644 +--- a/dpdk/drivers/net/mlx5/meson.build ++++ b/dpdk/drivers/net/mlx5/meson.build +@@ -9,11 +9,12 @@ if not is_linux + endif + build = true + +-pmd_dlopen = (get_option('ibverbs_link') == 'dlopen') ++static_ibverbs = (get_option('ibverbs_link') == 'static') ++dlopen_ibverbs = (get_option('ibverbs_link') == 'dlopen') + LIB_GLUE_BASE = 'librte_pmd_mlx5_glue.so' + LIB_GLUE_VERSION = '19.08.0' + LIB_GLUE = LIB_GLUE_BASE + '.' + LIB_GLUE_VERSION +-if pmd_dlopen ++if dlopen_ibverbs + dpdk_conf.set('RTE_IBVERBS_LINK_DLOPEN', 1) + cflags += [ + '-DMLX5_GLUE="@0@"'.format(LIB_GLUE), +@@ -24,12 +25,15 @@ endif + libnames = [ 'mlx5', 'ibverbs' ] + libs = [] + foreach libname:libnames +- lib = dependency('lib' + libname, required:false) +- if not lib.found() ++ lib = dependency('lib' + libname, static:static_ibverbs, required:false) ++ if not lib.found() and not static_ibverbs + lib = cc.find_library(libname, required:false) + endif + if lib.found() + libs += [ lib ] ++ if not static_ibverbs ++ ext_deps += lib ++ endif + else + build = false + reason = 'missing dependency, "' + libname + '"' +@@ -37,9 +41,18 @@ foreach libname:libnames + endforeach + + if build ++ if static_ibverbs or dlopen_ibverbs ++ # Build without adding shared libs to Requires.private ++ ibv_cflags = run_command(pkgconf, '--cflags', 'libibverbs').stdout() ++ ext_deps += declare_dependency(compile_args: ibv_cflags.split()) ++ endif ++ if static_ibverbs ++ # Add static deps ldflags to internal apps and Libs.private ++ ibv_ldflags = run_command(ldflags_ibverbs_static, check:true).stdout() ++ ext_deps += declare_dependency(link_args:ibv_ldflags.split()) ++ endif + allow_experimental_apis = true + deps += ['hash'] +- ext_deps += libs + sources = files( + 'mlx5.c', + 'mlx5_ethdev.c', +@@ -67,7 +80,7 @@ if build + or dpdk_conf.has('RTE_ARCH_PPC_64')) + sources += files('mlx5_rxtx_vec.c') + endif +- if not pmd_dlopen ++ if not dlopen_ibverbs + sources += files('mlx5_glue.c') + endif + cflags_options = [ +@@ -130,6 +143,8 @@ if build + 'MLX5DV_FLOW_ACTION_COUNTERS_DEVX' ], + [ 'HAVE_IBV_DEVX_ASYNC', 'infiniband/mlx5dv.h', + 'mlx5dv_devx_obj_query_async' ], ++ [ 'HAVE_IBV_DEVX_QP', 'infiniband/mlx5dv.h', ++ 'mlx5dv_devx_qp_query' ], + [ 'HAVE_MLX5DV_DR_ACTION_DEST_DEVX_TIR', 'infiniband/mlx5dv.h', + 'mlx5dv_dr_action_create_dest_devx_tir' ], + [ 'HAVE_MLX5_DR_CREATE_ACTION_FLOW_METER', 'infiniband/mlx5dv.h', +@@ -200,7 +215,7 @@ if build + configure_file(output : 'mlx5_autoconf.h', configuration : config) + endif + # Build Glue Library +-if pmd_dlopen and build ++if dlopen_ibverbs and build + dlopen_name = 'mlx5_glue' + dlopen_lib_name = driver_name_fmt.format(dlopen_name) + dlopen_so_version = LIB_GLUE_VERSION +diff --git a/dpdk/drivers/net/mlx5/mlx5.c b/dpdk/drivers/net/mlx5/mlx5.c +index d84a6f91b4..8879df317d 100644 +--- 
a/dpdk/drivers/net/mlx5/mlx5.c ++++ b/dpdk/drivers/net/mlx5/mlx5.c +@@ -62,6 +62,9 @@ + /* Device parameter to configure log 2 of the number of strides for MPRQ. */ + #define MLX5_RX_MPRQ_LOG_STRIDE_NUM "mprq_log_stride_num" + ++/* Device parameter to configure log 2 of the stride size for MPRQ. */ ++#define MLX5_RX_MPRQ_LOG_STRIDE_SIZE "mprq_log_stride_size" ++ + /* Device parameter to limit the size of memcpy'd packet for MPRQ. */ + #define MLX5_RX_MPRQ_MAX_MEMCPY_LEN "mprq_max_memcpy_len" + +@@ -184,6 +187,10 @@ struct mlx5_dev_spawn_data { + struct rte_pci_device *pci_dev; /**< Backend PCI device. */ + }; + ++#ifdef MLX5_GLUE ++const struct mlx5_glue *mlx5_glue; ++#endif ++ + static LIST_HEAD(, mlx5_ibv_shared) mlx5_ibv_list = LIST_HEAD_INITIALIZER(); + static pthread_mutex_t mlx5_ibv_list_mutex = PTHREAD_MUTEX_INITIALIZER; + +@@ -196,11 +203,14 @@ static pthread_mutex_t mlx5_ibv_list_mutex = PTHREAD_MUTEX_INITIALIZER; + /** + * Allocate ID pool structure. + * ++ * @param[in] max_id ++ * The maximum id can be allocated from the pool. ++ * + * @return + * Pointer to pool object, NULL value otherwise. + */ + struct mlx5_flow_id_pool * +-mlx5_flow_id_pool_alloc(void) ++mlx5_flow_id_pool_alloc(uint32_t max_id) + { + struct mlx5_flow_id_pool *pool; + void *mem; +@@ -223,6 +233,7 @@ mlx5_flow_id_pool_alloc(void) + pool->curr = pool->free_arr; + pool->last = pool->free_arr + MLX5_FLOW_MIN_ID_POOL_SIZE; + pool->base_index = 0; ++ pool->max_id = max_id; + return pool; + error: + rte_free(pool); +@@ -257,7 +268,7 @@ uint32_t + mlx5_flow_id_get(struct mlx5_flow_id_pool *pool, uint32_t *id) + { + if (pool->curr == pool->free_arr) { +- if (pool->base_index == UINT32_MAX) { ++ if (pool->base_index == pool->max_id) { + rte_errno = ENOMEM; + DRV_LOG(ERR, "no free id"); + return -rte_errno; +@@ -590,7 +601,7 @@ mlx5_alloc_shared_ibctx(const struct mlx5_dev_spawn_data *spawn, + goto error; + } + } +- sh->flow_id_pool = mlx5_flow_id_pool_alloc(); ++ sh->flow_id_pool = mlx5_flow_id_pool_alloc(UINT32_MAX); + if (!sh->flow_id_pool) { + DRV_LOG(ERR, "can't create flow id pool"); + err = ENOMEM; +@@ -673,12 +684,12 @@ mlx5_free_shared_ibctx(struct mlx5_ibv_shared *sh) + assert(rte_eal_process_type() == RTE_PROC_PRIMARY); + if (--sh->refcnt) + goto exit; +- /* Release created Memory Regions. */ +- mlx5_mr_release(sh); + /* Remove from memory callback device list. */ + rte_rwlock_write_lock(&mlx5_shared_data->mem_event_rwlock); + LIST_REMOVE(sh, mem_event_cb); + rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock); ++ /* Release created Memory Regions. */ ++ mlx5_mr_release(sh); + /* Remove context from the global device list. */ + LIST_REMOVE(sh, next); + /* +@@ -868,8 +879,13 @@ mlx5_alloc_shared_dr(struct mlx5_priv *priv) + { + struct mlx5_ibv_shared *sh = priv->sh; + char s[MLX5_HLIST_NAMESIZE]; +- int err = mlx5_alloc_table_hash_list(priv); ++ int err = 0; + ++ if (!sh->flow_tbls) ++ err = mlx5_alloc_table_hash_list(priv); ++ else ++ DRV_LOG(DEBUG, "sh->flow_tbls[%p] already created, reuse\n", ++ (void *)sh->flow_tbls); + if (err) + return err; + /* Create tags hash list table. 
*/ +@@ -1490,6 +1506,8 @@ mlx5_args_check(const char *key, const char *val, void *opaque) + config->mprq.enabled = !!tmp; + } else if (strcmp(MLX5_RX_MPRQ_LOG_STRIDE_NUM, key) == 0) { + config->mprq.stride_num_n = tmp; ++ } else if (strcmp(MLX5_RX_MPRQ_LOG_STRIDE_SIZE, key) == 0) { ++ config->mprq.stride_size_n = tmp; + } else if (strcmp(MLX5_RX_MPRQ_MAX_MEMCPY_LEN, key) == 0) { + config->mprq.max_memcpy_len = tmp; + } else if (strcmp(MLX5_RXQS_MIN_MPRQ, key) == 0) { +@@ -1582,6 +1600,7 @@ mlx5_args(struct mlx5_dev_config *config, struct rte_devargs *devargs) + MLX5_RXQ_PKT_PAD_EN, + MLX5_RX_MPRQ_EN, + MLX5_RX_MPRQ_LOG_STRIDE_NUM, ++ MLX5_RX_MPRQ_LOG_STRIDE_SIZE, + MLX5_RX_MPRQ_MAX_MEMCPY_LEN, + MLX5_RXQS_MIN_MPRQ, + MLX5_TXQ_INLINE, +@@ -1697,7 +1716,7 @@ mlx5_init_once(void) + * key is specified in devargs + * - if DevX is enabled the inline mode is queried from the + * device (HCA attributes and NIC vport context if needed). +- * - otherwise L2 mode (18 bytes) is assumed for ConnectX-4/4LX ++ * - otherwise L2 mode (18 bytes) is assumed for ConnectX-4/4 Lx + * and none (0 bytes) for other NICs + * + * @param spawn +@@ -1931,9 +1950,9 @@ mlx5_get_dbr(struct rte_eth_dev *dev, struct mlx5_devx_dbr_page **dbr_page) + i++) + ; /* Empty. */ + /* Find the first clear bit. */ ++ assert(i < MLX5_DBR_BITMAP_SIZE); + j = rte_bsf64(~page->dbr_bitmap[i]); +- assert(i < (MLX5_DBR_PER_PAGE / 64)); +- page->dbr_bitmap[i] |= (1 << j); ++ page->dbr_bitmap[i] |= (UINT64_C(1) << j); + page->dbr_count++; + *dbr_page = page; + return (((i * 64) + j) * sizeof(uint64_t)); +@@ -1978,7 +1997,7 @@ mlx5_release_dbr(struct rte_eth_dev *dev, uint32_t umem_id, uint64_t offset) + int i = offset / 64; + int j = offset % 64; + +- page->dbr_bitmap[i] &= ~(1 << j); ++ page->dbr_bitmap[i] &= ~(UINT64_C(1) << j); + } + return ret; + } +@@ -2236,8 +2255,6 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev, + mprq_caps.min_single_wqe_log_num_of_strides; + mprq_max_stride_num_n = + mprq_caps.max_single_wqe_log_num_of_strides; +- config.mprq.stride_num_n = RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N, +- mprq_min_stride_num_n); + } + #endif + if (RTE_CACHE_LINE_SIZE == 128 && +@@ -2543,6 +2560,8 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev, + priv->mtr_color_reg = ffs(reg_c_mask) - 1 + + REG_C_0; + priv->mtr_en = 1; ++ priv->mtr_reg_share = ++ config.hca_attr.qos.flow_meter_reg_share; + DRV_LOG(DEBUG, "The REG_C meter uses is %d", + priv->mtr_color_reg); + } +@@ -2550,17 +2569,32 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev, + #endif + } + if (config.mprq.enabled && mprq) { +- if (config.mprq.stride_num_n > mprq_max_stride_num_n || +- config.mprq.stride_num_n < mprq_min_stride_num_n) { ++ if (config.mprq.stride_num_n && ++ (config.mprq.stride_num_n > mprq_max_stride_num_n || ++ config.mprq.stride_num_n < mprq_min_stride_num_n)) { + config.mprq.stride_num_n = +- RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N, +- mprq_min_stride_num_n); ++ RTE_MIN(RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N, ++ mprq_min_stride_num_n), ++ mprq_max_stride_num_n); + DRV_LOG(WARNING, + "the number of strides" + " for Multi-Packet RQ is out of range," + " setting default value (%u)", + 1 << config.mprq.stride_num_n); + } ++ if (config.mprq.stride_size_n && ++ (config.mprq.stride_size_n > mprq_max_stride_size_n || ++ config.mprq.stride_size_n < mprq_min_stride_size_n)) { ++ config.mprq.stride_size_n = ++ RTE_MIN(RTE_MAX(MLX5_MPRQ_STRIDE_SIZE_N, ++ mprq_min_stride_size_n), ++ mprq_max_stride_size_n); ++ DRV_LOG(WARNING, ++ "the size of a stride" ++ " for Multi-Packet RQ is out of range," ++ " 
setting default value (%u)", ++ 1 << config.mprq.stride_size_n); ++ } + config.mprq.min_stride_size_n = mprq_min_stride_size_n; + config.mprq.max_stride_size_n = mprq_max_stride_size_n; + } else if (config.mprq.enabled && !mprq) { +@@ -2675,7 +2709,12 @@ mlx5_dev_spawn(struct rte_device *dpdk_dev, + err = mlx5_alloc_shared_dr(priv); + if (err) + goto error; +- priv->qrss_id_pool = mlx5_flow_id_pool_alloc(); ++ /* ++ * RSS id is shared with meter flow id. Meter flow id can only ++ * use the 24 MSB of the register. ++ */ ++ priv->qrss_id_pool = mlx5_flow_id_pool_alloc(UINT32_MAX >> ++ MLX5_MTR_COLOR_BITS); + if (!priv->qrss_id_pool) { + DRV_LOG(ERR, "can't create flow id pool"); + err = ENOMEM; +@@ -3074,7 +3113,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused, + /* + * Single IB device with multiple ports found, + * it may be E-Switch master device and representors. +- * We have to perform identification trough the ports. ++ * We have to perform identification through the ports. + */ + assert(nl_rdma >= 0); + assert(ns == 0); +@@ -3274,7 +3313,8 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused, + .mr_ext_memseg_en = 1, + .mprq = { + .enabled = 0, /* Disabled by default. */ +- .stride_num_n = MLX5_MPRQ_STRIDE_NUM_N, ++ .stride_num_n = 0, ++ .stride_size_n = 0, + .max_memcpy_len = MLX5_MPRQ_MEMCPY_DEFAULT_LEN, + .min_rxqs_num = MLX5_MPRQ_MIN_RXQS, + }, +diff --git a/dpdk/drivers/net/mlx5/mlx5.h b/dpdk/drivers/net/mlx5/mlx5.h +index 0c3a90e1bf..e4af5d40db 100644 +--- a/dpdk/drivers/net/mlx5/mlx5.h ++++ b/dpdk/drivers/net/mlx5/mlx5.h +@@ -148,12 +148,15 @@ struct mlx5_xstats_ctrl { + /* Index in the device counters table. */ + uint16_t dev_table_idx[MLX5_MAX_XSTATS]; + uint64_t base[MLX5_MAX_XSTATS]; ++ uint64_t xstats[MLX5_MAX_XSTATS]; ++ uint64_t hw_stats[MLX5_MAX_XSTATS]; + struct mlx5_counter_ctrl info[MLX5_MAX_XSTATS]; + }; + + struct mlx5_stats_ctrl { + /* Base for imissed counter. */ + uint64_t imissed_base; ++ uint64_t imissed; + }; + + /* devX creation object */ +@@ -173,6 +176,8 @@ struct mlx5_devx_mkey_attr { + struct mlx5_hca_qos_attr { + uint32_t sup:1; /* Whether QOS is supported. */ + uint32_t srtcm_sup:1; /* Whether srTCM mode is supported. */ ++ uint32_t flow_meter_reg_share:1; ++ /* Whether reg_c share is supported. */ + uint8_t log_max_flow_meter; + /* Power of the maximum supported meters. */ + uint8_t flow_meter_reg_c_ids; +@@ -262,6 +267,7 @@ struct mlx5_dev_config { + struct { + unsigned int enabled:1; /* Whether MPRQ is enabled. */ + unsigned int stride_num_n; /* Number of strides. */ ++ unsigned int stride_size_n; /* Size of a stride. */ + unsigned int min_stride_size_n; /* Min size of a stride. */ + unsigned int max_stride_size_n; /* Max size of a stride. */ + unsigned int max_memcpy_len; +@@ -364,7 +370,7 @@ struct mlx5_devx_tir_attr { + uint32_t rx_hash_fn:4; + uint32_t self_lb_block:2; + uint32_t transport_domain:24; +- uint32_t rx_hash_toeplitz_key[10]; ++ uint8_t rx_hash_toeplitz_key[MLX5_RSS_HASH_KEY_LEN]; + struct mlx5_rx_hash_field_select rx_hash_field_selector_outer; + struct mlx5_rx_hash_field_select rx_hash_field_selector_inner; + }; +@@ -626,6 +632,7 @@ struct mlx5_flow_id_pool { + /**< The next index that can be used without any free elements. */ + uint32_t *curr; /**< Pointer to the index to pop. */ + uint32_t *last; /**< Pointer to the last element in the empty arrray. */ ++ uint32_t max_id; /**< Maximum id can be allocated from the pool. 
*/ + }; + + /* +@@ -660,14 +667,8 @@ struct mlx5_ibv_shared { + uint32_t dv_regc0_mask; /* available bits of metatada reg_c[0]. */ + uint32_t dv_refcnt; /* DV/DR data reference counter. */ + void *fdb_domain; /* FDB Direct Rules name space handle. */ +- struct mlx5_flow_tbl_resource *fdb_mtr_sfx_tbl; +- /* FDB meter suffix rules table. */ + void *rx_domain; /* RX Direct Rules name space handle. */ +- struct mlx5_flow_tbl_resource *rx_mtr_sfx_tbl; +- /* RX meter suffix rules table. */ + void *tx_domain; /* TX Direct Rules name space handle. */ +- struct mlx5_flow_tbl_resource *tx_mtr_sfx_tbl; +- /* TX meter suffix rules table. */ + struct mlx5_hlist *flow_tbls; + /* Direct Rules tables for FDB, NIC TX+RX */ + void *esw_drop_action; /* Pointer to DR E-Switch drop action. */ +@@ -727,6 +728,7 @@ struct mlx5_priv { + unsigned int dr_shared:1; /* DV/DR data is shared. */ + unsigned int counter_fallback:1; /* Use counter fallback management. */ + unsigned int mtr_en:1; /* Whether support meter. */ ++ unsigned int mtr_reg_share:1; /* Whether support meter REG_C share. */ + uint16_t domain_id; /* Switch domain identifier. */ + uint16_t vport_id; /* Associated VF vport index (if any). */ + uint32_t vport_meta_tag; /* Used for vport index match ove VF LAG. */ +@@ -784,6 +786,7 @@ struct mlx5_priv { + /* UAR same-page access control required in 32bit implementations. */ + #endif + uint8_t skip_default_rss_reta; /* Skip configuration of default reta. */ ++ uint8_t fdb_def_rule; /* Whether fdb jump to table 1 is configured. */ + }; + + #define PORT_ID(priv) ((priv)->dev_data->port_id) +@@ -972,6 +975,7 @@ struct mlx5_flow_counter *mlx5_counter_alloc(struct rte_eth_dev *dev); + void mlx5_counter_free(struct rte_eth_dev *dev, struct mlx5_flow_counter *cnt); + int mlx5_counter_query(struct rte_eth_dev *dev, struct mlx5_flow_counter *cnt, + bool clear, uint64_t *pkts, uint64_t *bytes); ++void mlx5_flow_rxq_dynf_metadata_set(struct rte_eth_dev *dev); + + /* mlx5_mp.c */ + void mlx5_mp_req_start_rxtx(struct rte_eth_dev *dev); +diff --git a/dpdk/drivers/net/mlx5/mlx5_defs.h b/dpdk/drivers/net/mlx5/mlx5_defs.h +index 042e1f31ee..418e744d65 100644 +--- a/dpdk/drivers/net/mlx5/mlx5_defs.h ++++ b/dpdk/drivers/net/mlx5/mlx5_defs.h +@@ -146,6 +146,9 @@ + /* Log 2 of the default number of strides per WQE for Multi-Packet RQ. */ + #define MLX5_MPRQ_STRIDE_NUM_N 6U + ++/* Log 2 of the default size of a stride per WQE for Multi-Packet RQ. */ ++#define MLX5_MPRQ_STRIDE_SIZE_N 11U ++ + /* Two-byte shift is disabled for Multi-Packet RQ. */ + #define MLX5_MPRQ_TWO_BYTE_SHIFT 0 + +@@ -176,6 +179,10 @@ + #define MLX5_FLOW_MREG_HNAME "MARK_COPY_TABLE" + #define MLX5_DEFAULT_COPY_ID UINT32_MAX + ++/* Hairpin TX/RX queue configuration parameters. 
*/ ++#define MLX5_HAIRPIN_QUEUE_STRIDE 6 ++#define MLX5_HAIRPIN_JUMBO_LOG_SIZE (15 + 2) ++ + /* Definition of static_assert found in /usr/include/assert.h */ + #ifndef HAVE_STATIC_ASSERT + #define static_assert _Static_assert +diff --git a/dpdk/drivers/net/mlx5/mlx5_devx_cmds.c b/dpdk/drivers/net/mlx5/mlx5_devx_cmds.c +index 9893287ba8..e223ee9b18 100644 +--- a/dpdk/drivers/net/mlx5/mlx5_devx_cmds.c ++++ b/dpdk/drivers/net/mlx5/mlx5_devx_cmds.c +@@ -362,6 +362,8 @@ mlx5_devx_cmd_query_hca_attr(struct ibv_context *ctx, + MLX5_GET(qos_cap, hcattr, log_max_flow_meter); + attr->qos.flow_meter_reg_c_ids = + MLX5_GET(qos_cap, hcattr, flow_meter_reg_id); ++ attr->qos.flow_meter_reg_share = ++ MLX5_GET(qos_cap, hcattr, flow_meter_reg_share); + } + if (!attr->eth_net_offloads) + return 0; +@@ -633,9 +635,8 @@ mlx5_devx_cmd_create_tir(struct ibv_context *ctx, + { + uint32_t in[MLX5_ST_SZ_DW(create_tir_in)] = {0}; + uint32_t out[MLX5_ST_SZ_DW(create_tir_out)] = {0}; +- void *tir_ctx, *outer, *inner; ++ void *tir_ctx, *outer, *inner, *rss_key; + struct mlx5_devx_obj *tir = NULL; +- int i; + + tir = rte_calloc(__func__, 1, sizeof(*tir), 0); + if (!tir) { +@@ -658,10 +659,8 @@ mlx5_devx_cmd_create_tir(struct ibv_context *ctx, + MLX5_SET(tirc, tir_ctx, rx_hash_fn, tir_attr->rx_hash_fn); + MLX5_SET(tirc, tir_ctx, self_lb_block, tir_attr->self_lb_block); + MLX5_SET(tirc, tir_ctx, transport_domain, tir_attr->transport_domain); +- for (i = 0; i < 10; i++) { +- MLX5_SET(tirc, tir_ctx, rx_hash_toeplitz_key[i], +- tir_attr->rx_hash_toeplitz_key[i]); +- } ++ rss_key = MLX5_ADDR_OF(tirc, tir_ctx, rx_hash_toeplitz_key); ++ memcpy(rss_key, tir_attr->rx_hash_toeplitz_key, MLX5_RSS_HASH_KEY_LEN); + outer = MLX5_ADDR_OF(tirc, tir_ctx, rx_hash_field_selector_outer); + MLX5_SET(rx_hash_field_select, outer, l3_prot_type, + tir_attr->rx_hash_field_selector_outer.l3_prot_type); +diff --git a/dpdk/drivers/net/mlx5/mlx5_ethdev.c b/dpdk/drivers/net/mlx5/mlx5_ethdev.c +index d80ae458bc..3b4c5dbe7a 100644 +--- a/dpdk/drivers/net/mlx5/mlx5_ethdev.c ++++ b/dpdk/drivers/net/mlx5/mlx5_ethdev.c +@@ -476,7 +476,7 @@ mlx5_dev_configure_rss_reta(struct rte_eth_dev *dev) + + rxq_data = (*priv->rxqs)[i]; + rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq); +- if (rxq_ctrl->type == MLX5_RXQ_TYPE_STANDARD) ++ if (rxq_ctrl && rxq_ctrl->type == MLX5_RXQ_TYPE_STANDARD) + rss_queue_arr[j++] = i; + } + rss_queue_n = j; +diff --git a/dpdk/drivers/net/mlx5/mlx5_flow.c b/dpdk/drivers/net/mlx5/mlx5_flow.c +index 008716367c..e05c35a417 100644 +--- a/dpdk/drivers/net/mlx5/mlx5_flow.c ++++ b/dpdk/drivers/net/mlx5/mlx5_flow.c +@@ -165,7 +165,9 @@ static const struct rte_flow_expand_node mlx5_support_expansion[] = { + .rss_types = ETH_RSS_NONFRAG_IPV6_TCP, + }, + [MLX5_EXPANSION_VXLAN] = { +- .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_ETH), ++ .next = RTE_FLOW_EXPAND_RSS_NEXT(MLX5_EXPANSION_ETH, ++ MLX5_EXPANSION_IPV4, ++ MLX5_EXPANSION_IPV6), + .type = RTE_FLOW_ITEM_TYPE_VXLAN, + }, + [MLX5_EXPANSION_VXLAN_GPE] = { +@@ -336,7 +338,7 @@ static struct mlx5_flow_tunnel_info tunnels_info[] = { + * The request register on success, a negative errno + * value otherwise and rte_errno is set. 
+ */ +-enum modify_reg ++int + mlx5_flow_get_reg_id(struct rte_eth_dev *dev, + enum mlx5_feature_name feature, + uint32_t id, +@@ -345,6 +347,7 @@ mlx5_flow_get_reg_id(struct rte_eth_dev *dev, + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_dev_config *config = &priv->config; + enum modify_reg start_reg; ++ bool skip_mtr_reg = false; + + switch (feature) { + case MLX5_HAIRPIN_RX: +@@ -383,29 +386,36 @@ mlx5_flow_get_reg_id(struct rte_eth_dev *dev, + return REG_C_0; + } + break; +- case MLX5_COPY_MARK: + case MLX5_MTR_SFX: + /* +- * Metadata COPY_MARK register using is in meter suffix sub +- * flow while with meter. It's safe to share the same register. ++ * If meter color and flow match share one register, flow match ++ * should use the meter color register for match. + */ +- return priv->mtr_color_reg != REG_C_2 ? REG_C_2 : REG_C_3; ++ if (priv->mtr_reg_share) ++ return priv->mtr_color_reg; ++ else ++ return priv->mtr_color_reg != REG_C_2 ? REG_C_2 : ++ REG_C_3; + case MLX5_MTR_COLOR: + RTE_ASSERT(priv->mtr_color_reg != REG_NONE); + return priv->mtr_color_reg; ++ case MLX5_COPY_MARK: ++ /* ++ * Metadata COPY_MARK register using is in meter suffix sub ++ * flow while with meter. It's safe to share the same register. ++ */ ++ return priv->mtr_color_reg != REG_C_2 ? REG_C_2 : REG_C_3; + case MLX5_APP_TAG: + /* +- * If meter is enable, it will engage two registers for color ++ * If meter is enable, it will engage the register for color + * match and flow match. If meter color match is not using the + * REG_C_2, need to skip the REG_C_x be used by meter color + * match. + * If meter is disable, free to use all available registers. + */ +- if (priv->mtr_color_reg != REG_NONE) +- start_reg = priv->mtr_color_reg != REG_C_2 ? REG_C_3 : +- REG_C_4; +- else +- start_reg = REG_C_2; ++ start_reg = priv->mtr_color_reg != REG_C_2 ? REG_C_2 : ++ (priv->mtr_reg_share ? REG_C_3 : REG_C_4); ++ skip_mtr_reg = !!(priv->mtr_en && start_reg == REG_C_2); + if (id > (REG_C_7 - start_reg)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ITEM, +@@ -420,12 +430,16 @@ mlx5_flow_get_reg_id(struct rte_eth_dev *dev, + * If the available index REG_C_y >= REG_C_x, skip the + * color register. + */ +- if (start_reg == REG_C_3 && config->flow_mreg_c +- [id + REG_C_3 - REG_C_0] >= priv->mtr_color_reg) { +- if (config->flow_mreg_c[id + 1 + REG_C_3 - REG_C_0] != +- REG_NONE) ++ if (skip_mtr_reg && config->flow_mreg_c ++ [id + start_reg - REG_C_0] >= priv->mtr_color_reg) { ++ if (id >= (REG_C_7 - start_reg)) ++ return rte_flow_error_set(error, EINVAL, ++ RTE_FLOW_ERROR_TYPE_ITEM, ++ NULL, "invalid tag id"); ++ if (config->flow_mreg_c ++ [id + 1 + start_reg - REG_C_0] != REG_NONE) + return config->flow_mreg_c +- [id + 1 + REG_C_3 - REG_C_0]; ++ [id + 1 + start_reg - REG_C_0]; + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, + NULL, "unsupported tag id"); +@@ -859,6 +873,35 @@ flow_rxq_flags_clear(struct rte_eth_dev *dev) + } + } + ++/** ++ * Set the Rx queue dynamic metadata (mask and offset) for a flow ++ * ++ * @param[in] dev ++ * Pointer to the Ethernet device structure. 
++ */ ++void ++mlx5_flow_rxq_dynf_metadata_set(struct rte_eth_dev *dev) ++{ ++ struct mlx5_priv *priv = dev->data->dev_private; ++ struct mlx5_rxq_data *data; ++ unsigned int i; ++ ++ for (i = 0; i != priv->rxqs_n; ++i) { ++ if (!(*priv->rxqs)[i]) ++ continue; ++ data = (*priv->rxqs)[i]; ++ if (!rte_flow_dynf_metadata_avail()) { ++ data->dynf_meta = 0; ++ data->flow_meta_mask = 0; ++ data->flow_meta_offset = -1; ++ } else { ++ data->dynf_meta = 1; ++ data->flow_meta_mask = rte_flow_dynf_metadata_mask; ++ data->flow_meta_offset = rte_flow_dynf_metadata_offs; ++ } ++ } ++} ++ + /* + * return a pointer to the desired action in the list of actions. + * +@@ -900,11 +943,6 @@ mlx5_flow_validate_action_flag(uint64_t action_flags, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) + { +- +- if (action_flags & MLX5_FLOW_ACTION_DROP) +- return rte_flow_error_set(error, EINVAL, +- RTE_FLOW_ERROR_TYPE_ACTION, NULL, +- "can't drop and flag in same flow"); + if (action_flags & MLX5_FLOW_ACTION_MARK) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, +@@ -956,10 +994,6 @@ mlx5_flow_validate_action_mark(const struct rte_flow_action *action, + &mark->id, + "mark id must in 0 <= id < " + RTE_STR(MLX5_FLOW_MARK_MAX)); +- if (action_flags & MLX5_FLOW_ACTION_DROP) +- return rte_flow_error_set(error, EINVAL, +- RTE_FLOW_ERROR_TYPE_ACTION, NULL, +- "can't drop and mark in same flow"); + if (action_flags & MLX5_FLOW_ACTION_FLAG) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, +@@ -991,24 +1025,10 @@ mlx5_flow_validate_action_mark(const struct rte_flow_action *action, + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ + int +-mlx5_flow_validate_action_drop(uint64_t action_flags, ++mlx5_flow_validate_action_drop(uint64_t action_flags __rte_unused, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) + { +- if (action_flags & MLX5_FLOW_ACTION_FLAG) +- return rte_flow_error_set(error, EINVAL, +- RTE_FLOW_ERROR_TYPE_ACTION, NULL, +- "can't drop and flag in same flow"); +- if (action_flags & MLX5_FLOW_ACTION_MARK) +- return rte_flow_error_set(error, EINVAL, +- RTE_FLOW_ERROR_TYPE_ACTION, NULL, +- "can't drop and mark in same flow"); +- if (action_flags & (MLX5_FLOW_FATE_ACTIONS | +- MLX5_FLOW_FATE_ESWITCH_ACTIONS)) +- return rte_flow_error_set(error, EINVAL, +- RTE_FLOW_ERROR_TYPE_ACTION, NULL, +- "can't have 2 fate actions in" +- " same flow"); + if (attr->egress) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, NULL, +@@ -1634,7 +1654,6 @@ mlx5_flow_validate_item_ipv6(const struct rte_flow_item *item, + "\xff\xff\xff\xff\xff\xff\xff\xff", + .vtc_flow = RTE_BE32(0xffffffff), + .proto = 0xff, +- .hop_limits = 0xff, + }, + }; + const int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL); +@@ -1831,7 +1850,6 @@ mlx5_flow_validate_item_vxlan(const struct rte_flow_item *item, + uint32_t vlan_id; + uint8_t vni[4]; + } id = { .vlan_id = 0, }; +- uint32_t vlan_id = 0; + + + if (item_flags & MLX5_FLOW_LAYER_TUNNEL) +@@ -1858,23 +1876,8 @@ mlx5_flow_validate_item_vxlan(const struct rte_flow_item *item, + return ret; + if (spec) { + memcpy(&id.vni[1], spec->vni, 3); +- vlan_id = id.vlan_id; + memcpy(&id.vni[1], mask->vni, 3); +- vlan_id &= id.vlan_id; + } +- /* +- * Tunnel id 0 is equivalent as not adding a VXLAN layer, if +- * only this layer is defined in the Verbs specification it is +- * interpreted as wildcard and all packets will match this +- * rule, if it follows a full stack 
layer (ex: eth / ipv4 / +- * udp), all packets matching the layers before will also +- * match this rule. To avoid such situation, VNI 0 is +- * currently refused. +- */ +- if (!vlan_id) +- return rte_flow_error_set(error, ENOTSUP, +- RTE_FLOW_ERROR_TYPE_ITEM, item, +- "VXLAN vni cannot be 0"); + if (!(item_flags & MLX5_FLOW_LAYER_OUTER)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, +@@ -1913,7 +1916,6 @@ mlx5_flow_validate_item_vxlan_gpe(const struct rte_flow_item *item, + uint32_t vlan_id; + uint8_t vni[4]; + } id = { .vlan_id = 0, }; +- uint32_t vlan_id = 0; + + if (!priv->config.l3_vxlan_en) + return rte_flow_error_set(error, ENOTSUP, +@@ -1951,22 +1953,8 @@ mlx5_flow_validate_item_vxlan_gpe(const struct rte_flow_item *item, + "VxLAN-GPE protocol" + " not supported"); + memcpy(&id.vni[1], spec->vni, 3); +- vlan_id = id.vlan_id; + memcpy(&id.vni[1], mask->vni, 3); +- vlan_id &= id.vlan_id; + } +- /* +- * Tunnel id 0 is equivalent as not adding a VXLAN layer, if only this +- * layer is defined in the Verbs specification it is interpreted as +- * wildcard and all packets will match this rule, if it follows a full +- * stack layer (ex: eth / ipv4 / udp), all packets matching the layers +- * before will also match this rule. To avoid such situation, VNI 0 +- * is currently refused. +- */ +- if (!vlan_id) +- return rte_flow_error_set(error, ENOTSUP, +- RTE_FLOW_ERROR_TYPE_ITEM, item, +- "VXLAN-GPE vni cannot be 0"); + if (!(item_flags & MLX5_FLOW_LAYER_OUTER)) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, +@@ -2131,9 +2119,7 @@ mlx5_flow_validate_item_geneve(const struct rte_flow_item *item, + .protocol = RTE_BE16(UINT16_MAX), + }; + +- if (!(priv->config.hca_attr.flex_parser_protocols & +- MLX5_HCA_FLEX_GENEVE_ENABLED) || +- !priv->config.hca_attr.tunnel_stateless_geneve_rx) ++ if (!priv->config.hca_attr.tunnel_stateless_geneve_rx) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ITEM, item, + "L3 Geneve is not enabled by device" +@@ -2349,6 +2335,7 @@ flow_null_validate(struct rte_eth_dev *dev __rte_unused, + const struct rte_flow_item items[] __rte_unused, + const struct rte_flow_action actions[] __rte_unused, + bool external __rte_unused, ++ int hairpin __rte_unused, + struct rte_flow_error *error) + { + return rte_flow_error_set(error, ENOTSUP, +@@ -2463,6 +2450,8 @@ flow_get_drv_type(struct rte_eth_dev *dev, const struct rte_flow_attr *attr) + * Pointer to the list of actions. + * @param[in] external + * This flow rule is created by request external to PMD. ++ * @param[in] hairpin ++ * Number of hairpin TX actions, 0 means classic flow. + * @param[out] error + * Pointer to the error structure. + * +@@ -2474,13 +2463,14 @@ flow_drv_validate(struct rte_eth_dev *dev, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], +- bool external, struct rte_flow_error *error) ++ bool external, int hairpin, struct rte_flow_error *error) + { + const struct mlx5_flow_driver_ops *fops; + enum mlx5_flow_drv_type type = flow_get_drv_type(dev, attr); + + fops = flow_get_drv_ops(type); +- return fops->validate(dev, attr, items, actions, external, error); ++ return fops->validate(dev, attr, items, actions, external, ++ hairpin, error); + } + + /** +@@ -2638,47 +2628,6 @@ flow_drv_destroy(struct rte_eth_dev *dev, struct rte_flow *flow) + fops->destroy(dev, flow); + } + +-/** +- * Validate a flow supported by the NIC. 
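/*
 * The two hunks above drop the "vni cannot be 0" checks, so a VXLAN or
 * VXLAN-GPE pattern may now match VNI 0 explicitly.  A minimal sketch
 * of such a pattern, assuming only standard DPDK 19.11 rte_flow types
 * (the demo_ name is illustrative, not driver API):
 */
#include <string.h>
#include <rte_flow.h>

static void
demo_build_vxlan_vni0_pattern(struct rte_flow_item items[5])
{
	static const struct rte_flow_item_vxlan spec;	/* vni = {0,0,0} */
	static const struct rte_flow_item_vxlan mask = {
		.vni = "\xff\xff\xff",	/* exact match on all 24 VNI bits */
	};

	memset(items, 0, 5 * sizeof(items[0]));
	items[0].type = RTE_FLOW_ITEM_TYPE_ETH;
	items[1].type = RTE_FLOW_ITEM_TYPE_IPV4;
	items[2].type = RTE_FLOW_ITEM_TYPE_UDP;
	items[3].type = RTE_FLOW_ITEM_TYPE_VXLAN;
	items[3].spec = &spec;
	items[3].mask = &mask;
	items[4].type = RTE_FLOW_ITEM_TYPE_END;
}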
+- * +- * @see rte_flow_validate() +- * @see rte_flow_ops +- */ +-int +-mlx5_flow_validate(struct rte_eth_dev *dev, +- const struct rte_flow_attr *attr, +- const struct rte_flow_item items[], +- const struct rte_flow_action actions[], +- struct rte_flow_error *error) +-{ +- int ret; +- +- ret = flow_drv_validate(dev, attr, items, actions, true, error); +- if (ret < 0) +- return ret; +- return 0; +-} +- +-/** +- * Get port id item from the item list. +- * +- * @param[in] item +- * Pointer to the list of items. +- * +- * @return +- * Pointer to the port id item if exist, else return NULL. +- */ +-static const struct rte_flow_item * +-find_port_id_item(const struct rte_flow_item *item) +-{ +- assert(item); +- for (; item->type != RTE_FLOW_ITEM_TYPE_END; item++) { +- if (item->type == RTE_FLOW_ITEM_TYPE_PORT_ID) +- return item; +- } +- return NULL; +-} +- + /** + * Get RSS action from the action list. + * +@@ -2723,7 +2672,44 @@ find_graph_root(const struct rte_flow_item pattern[], uint32_t rss_level) + } + + /** +- * Get QUEUE/RSS action from the action list. ++ * Get layer flags from the prefix flow. ++ * ++ * Some flows may be split to several subflows, the prefix subflow gets the ++ * match items and the suffix sub flow gets the actions. ++ * Some actions need the user defined match item flags to get the detail for ++ * the action. ++ * This function helps the suffix flow to get the item layer flags from prefix ++ * subflow. ++ * ++ * @param[in] dev_flow ++ * Pointer the created preifx subflow. ++ * ++ * @return ++ * The layers get from prefix subflow. ++ */ ++static inline uint64_t ++flow_get_prefix_layer_flags(struct mlx5_flow *dev_flow) ++{ ++ uint64_t layers = 0; ++ ++ /* If no decap actions, use the layers directly. */ ++ if (!(dev_flow->actions & MLX5_FLOW_ACTION_DECAP)) ++ return dev_flow->layers; ++ /* Convert L3 layers with decap action. */ ++ if (dev_flow->layers & MLX5_FLOW_LAYER_INNER_L3_IPV4) ++ layers |= MLX5_FLOW_LAYER_OUTER_L3_IPV4; ++ else if (dev_flow->layers & MLX5_FLOW_LAYER_INNER_L3_IPV6) ++ layers |= MLX5_FLOW_LAYER_OUTER_L3_IPV6; ++ /* Convert L4 layers with decap action. */ ++ if (dev_flow->layers & MLX5_FLOW_LAYER_INNER_L4_TCP) ++ layers |= MLX5_FLOW_LAYER_OUTER_L4_TCP; ++ else if (dev_flow->layers & MLX5_FLOW_LAYER_INNER_L4_UDP) ++ layers |= MLX5_FLOW_LAYER_OUTER_L4_UDP; ++ return layers; ++} ++ ++/** ++ * Get metadata split action information. + * + * @param[in] actions + * Pointer to the list of actions. +@@ -2732,18 +2718,38 @@ find_graph_root(const struct rte_flow_item pattern[], uint32_t rss_level) + * @param[out] qrss_type + * Pointer to the action type to return. RTE_FLOW_ACTION_TYPE_END is returned + * if no QUEUE/RSS is found. ++ * @param[out] encap_idx ++ * Pointer to the index of the encap action if exists, otherwise the last ++ * action index. + * + * @return + * Total number of actions. 
+ */ + static int +-flow_parse_qrss_action(const struct rte_flow_action actions[], +- const struct rte_flow_action **qrss) ++flow_parse_metadata_split_actions_info(const struct rte_flow_action actions[], ++ const struct rte_flow_action **qrss, ++ int *encap_idx) + { ++ const struct rte_flow_action_raw_encap *raw_encap; + int actions_n = 0; ++ int raw_decap_idx = -1; + ++ *encap_idx = -1; + for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) { + switch (actions->type) { ++ case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP: ++ case RTE_FLOW_ACTION_TYPE_NVGRE_ENCAP: ++ *encap_idx = actions_n; ++ break; ++ case RTE_FLOW_ACTION_TYPE_RAW_DECAP: ++ raw_decap_idx = actions_n; ++ break; ++ case RTE_FLOW_ACTION_TYPE_RAW_ENCAP: ++ raw_encap = actions->conf; ++ if (raw_encap->size > MLX5_ENCAPSULATION_DECISION_SIZE) ++ *encap_idx = raw_decap_idx != -1 ? ++ raw_decap_idx : actions_n; ++ break; + case RTE_FLOW_ACTION_TYPE_QUEUE: + case RTE_FLOW_ACTION_TYPE_RSS: + *qrss = actions; +@@ -2753,6 +2759,8 @@ flow_parse_qrss_action(const struct rte_flow_action actions[], + } + actions_n++; + } ++ if (*encap_idx == -1) ++ *encap_idx = actions_n; + /* Count RTE_FLOW_ACTION_TYPE_END. */ + return actions_n + 1; + } +@@ -2958,18 +2966,21 @@ flow_mreg_add_copy_action(struct rte_eth_dev *dev, uint32_t mark_id, + /* Build a new flow. */ + if (mark_id != MLX5_DEFAULT_COPY_ID) { + items[0] = (struct rte_flow_item){ +- .type = MLX5_RTE_FLOW_ITEM_TYPE_TAG, ++ .type = (enum rte_flow_item_type) ++ MLX5_RTE_FLOW_ITEM_TYPE_TAG, + .spec = &tag_spec, + }; + items[1] = (struct rte_flow_item){ + .type = RTE_FLOW_ITEM_TYPE_END, + }; + actions[0] = (struct rte_flow_action){ +- .type = MLX5_RTE_FLOW_ACTION_TYPE_MARK, ++ .type = (enum rte_flow_action_type) ++ MLX5_RTE_FLOW_ACTION_TYPE_MARK, + .conf = &ftag, + }; + actions[1] = (struct rte_flow_action){ +- .type = MLX5_RTE_FLOW_ACTION_TYPE_COPY_MREG, ++ .type = (enum rte_flow_action_type) ++ MLX5_RTE_FLOW_ACTION_TYPE_COPY_MREG, + .conf = &cp_mreg, + }; + actions[2] = (struct rte_flow_action){ +@@ -2986,7 +2997,8 @@ flow_mreg_add_copy_action(struct rte_eth_dev *dev, uint32_t mark_id, + .type = RTE_FLOW_ITEM_TYPE_END, + }; + actions[0] = (struct rte_flow_action){ +- .type = MLX5_RTE_FLOW_ACTION_TYPE_COPY_MREG, ++ .type = (enum rte_flow_action_type) ++ MLX5_RTE_FLOW_ACTION_TYPE_COPY_MREG, + .conf = &cp_mreg, + }; + actions[1] = (struct rte_flow_action){ +@@ -3360,7 +3372,8 @@ flow_hairpin_split(struct rte_eth_dev *dev, + } + /* Add set meta action and end action for the Rx flow. */ + tag_action = actions_rx; +- tag_action->type = MLX5_RTE_FLOW_ACTION_TYPE_TAG; ++ tag_action->type = (enum rte_flow_action_type) ++ MLX5_RTE_FLOW_ACTION_TYPE_TAG; + actions_rx++; + rte_memcpy(actions_rx, actions, sizeof(struct rte_flow_action)); + actions_rx++; +@@ -3373,7 +3386,8 @@ flow_hairpin_split(struct rte_eth_dev *dev, + rte_memcpy(actions_tx, actions, sizeof(struct rte_flow_action)); + addr = (void *)&pattern_tx[2]; + item = pattern_tx; +- item->type = MLX5_RTE_FLOW_ITEM_TYPE_TAG; ++ item->type = (enum rte_flow_item_type) ++ MLX5_RTE_FLOW_ITEM_TYPE_TAG; + tag_item = (void *)addr; + tag_item->data = *flow_id; + tag_item->id = mlx5_flow_get_reg_id(dev, MLX5_HAIRPIN_TX, 0, NULL); +@@ -3401,6 +3415,8 @@ flow_hairpin_split(struct rte_eth_dev *dev, + * Parent flow structure pointer. + * @param[in, out] sub_flow + * Pointer to return the created subflow, may be NULL. ++ * @param[in] prefix_layers ++ * Prefix subflow layers, may be 0. + * @param[in] attr + * Flow rule attributes. 
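/*
 * flow_parse_metadata_split_actions_info() above reports both the
 * action count and the index where the metadata copy action has to be
 * placed; a raw encap larger than an eth+ipv4 header inherits the
 * index of a preceding raw decap, so the decap/encap pair is handled
 * as one re-encapsulation.  A simplified restatement of that scan,
 * for illustration only (demo_ names are not part of the driver):
 */
#include <stddef.h>
#include <rte_flow.h>

static int
demo_find_encap_idx(const struct rte_flow_action *actions, int *encap_idx)
{
	const struct rte_flow_action_raw_encap *raw_encap;
	const size_t decision = sizeof(struct rte_flow_item_eth) +
				sizeof(struct rte_flow_item_ipv4);
	int n = 0;
	int raw_decap_idx = -1;

	*encap_idx = -1;
	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++, n++) {
		switch (actions->type) {
		case RTE_FLOW_ACTION_TYPE_RAW_DECAP:
			raw_decap_idx = n;
			break;
		case RTE_FLOW_ACTION_TYPE_RAW_ENCAP:
			raw_encap = actions->conf;
			if (raw_encap->size > decision)
				*encap_idx = raw_decap_idx != -1 ?
					     raw_decap_idx : n;
			break;
		default:
			break;
		}
	}
	if (*encap_idx == -1)
		*encap_idx = n;	/* no encap: place the copy action last */
	return n + 1;		/* count the END action, as above */
}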
+ * @param[in] items +@@ -3418,6 +3434,7 @@ static int + flow_create_split_inner(struct rte_eth_dev *dev, + struct rte_flow *flow, + struct mlx5_flow **sub_flow, ++ uint64_t prefix_layers, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], +@@ -3432,6 +3449,12 @@ flow_create_split_inner(struct rte_eth_dev *dev, + dev_flow->external = external; + /* Subflow object was created, we must include one in the list. */ + LIST_INSERT_HEAD(&flow->dev_flows, dev_flow, next); ++ /* ++ * If dev_flow is as one of the suffix flow, some actions in suffix ++ * flow may need some user defined item layer flags. ++ */ ++ if (prefix_layers) ++ dev_flow->layers = prefix_layers; + if (sub_flow) + *sub_flow = dev_flow; + return flow_drv_translate(dev, dev_flow, attr, items, actions, error); +@@ -3451,6 +3474,10 @@ flow_create_split_inner(struct rte_eth_dev *dev, + * + * @param dev + * Pointer to Ethernet device. ++ * @param[in] items ++ * Pattern specification (list terminated by the END pattern item). ++ * @param[out] sfx_items ++ * Suffix flow match items (list terminated by the END pattern item). + * @param[in] actions + * Associated actions (list terminated by the END action). + * @param[out] actions_sfx +@@ -3467,66 +3494,61 @@ flow_create_split_inner(struct rte_eth_dev *dev, + */ + static int + flow_meter_split_prep(struct rte_eth_dev *dev, ++ const struct rte_flow_item items[], ++ struct rte_flow_item sfx_items[], + const struct rte_flow_action actions[], + struct rte_flow_action actions_sfx[], + struct rte_flow_action actions_pre[]) + { +- struct rte_flow_action *tag_action; ++ struct rte_flow_action *tag_action = NULL; ++ struct rte_flow_item *tag_item; + struct mlx5_rte_flow_action_set_tag *set_tag; + struct rte_flow_error error; + const struct rte_flow_action_raw_encap *raw_encap; + const struct rte_flow_action_raw_decap *raw_decap; ++ struct mlx5_rte_flow_item_tag *tag_spec; ++ struct mlx5_rte_flow_item_tag *tag_mask; + uint32_t tag_id; ++ bool copy_vlan = false; + +- /* Add the extra tag action first. */ +- tag_action = actions_pre; +- tag_action->type = MLX5_RTE_FLOW_ACTION_TYPE_TAG; +- actions_pre++; + /* Prepare the actions for prefix and suffix flow. */ + for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) { ++ struct rte_flow_action **action_cur = NULL; ++ + switch (actions->type) { + case RTE_FLOW_ACTION_TYPE_METER: ++ /* Add the extra tag action first. */ ++ tag_action = actions_pre; ++ tag_action->type = (enum rte_flow_action_type) ++ MLX5_RTE_FLOW_ACTION_TYPE_TAG; ++ actions_pre++; ++ action_cur = &actions_pre; ++ break; + case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP: + case RTE_FLOW_ACTION_TYPE_NVGRE_DECAP: +- memcpy(actions_pre, actions, +- sizeof(struct rte_flow_action)); +- actions_pre++; ++ action_cur = &actions_pre; + break; + case RTE_FLOW_ACTION_TYPE_RAW_ENCAP: + raw_encap = actions->conf; +- if (raw_encap->size > +- (sizeof(struct rte_flow_item_eth) + +- sizeof(struct rte_flow_item_ipv4))) { +- memcpy(actions_sfx, actions, +- sizeof(struct rte_flow_action)); +- actions_sfx++; +- } else { +- rte_memcpy(actions_pre, actions, +- sizeof(struct rte_flow_action)); +- actions_pre++; +- } ++ if (raw_encap->size < MLX5_ENCAPSULATION_DECISION_SIZE) ++ action_cur = &actions_pre; + break; + case RTE_FLOW_ACTION_TYPE_RAW_DECAP: + raw_decap = actions->conf; +- /* Size 0 decap means 50 bytes as vxlan decap. 
*/ +- if (raw_decap->size && (raw_decap->size < +- (sizeof(struct rte_flow_item_eth) + +- sizeof(struct rte_flow_item_ipv4)))) { +- memcpy(actions_sfx, actions, +- sizeof(struct rte_flow_action)); +- actions_sfx++; +- } else { +- rte_memcpy(actions_pre, actions, +- sizeof(struct rte_flow_action)); +- actions_pre++; +- } ++ if (raw_decap->size > MLX5_ENCAPSULATION_DECISION_SIZE) ++ action_cur = &actions_pre; ++ break; ++ case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN: ++ case RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID: ++ copy_vlan = true; + break; + default: +- memcpy(actions_sfx, actions, +- sizeof(struct rte_flow_action)); +- actions_sfx++; + break; + } ++ if (!action_cur) ++ action_cur = &actions_sfx; ++ memcpy(*action_cur, actions, sizeof(struct rte_flow_action)); ++ (*action_cur)++; + } + /* Add end action to the actions. */ + actions_sfx->type = RTE_FLOW_ACTION_TYPE_END; +@@ -3539,8 +3561,47 @@ flow_meter_split_prep(struct rte_eth_dev *dev, + * Get the id from the qrss_pool to make qrss share the id with meter. + */ + tag_id = flow_qrss_get_id(dev); +- set_tag->data = rte_cpu_to_be_32(tag_id); ++ set_tag->data = tag_id << MLX5_MTR_COLOR_BITS; ++ assert(tag_action); + tag_action->conf = set_tag; ++ /* Prepare the suffix subflow items. */ ++ tag_item = sfx_items++; ++ for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) { ++ int item_type = items->type; ++ ++ switch (item_type) { ++ case RTE_FLOW_ITEM_TYPE_PORT_ID: ++ memcpy(sfx_items, items, sizeof(*sfx_items)); ++ sfx_items++; ++ break; ++ case RTE_FLOW_ITEM_TYPE_VLAN: ++ if (copy_vlan) { ++ memcpy(sfx_items, items, sizeof(*sfx_items)); ++ /* ++ * Convert to internal match item, it is used ++ * for vlan push and set vid. ++ */ ++ sfx_items->type = (enum rte_flow_item_type) ++ MLX5_RTE_FLOW_ITEM_TYPE_VLAN; ++ sfx_items++; ++ } ++ break; ++ default: ++ break; ++ } ++ } ++ sfx_items->type = RTE_FLOW_ITEM_TYPE_END; ++ sfx_items++; ++ tag_spec = (struct mlx5_rte_flow_item_tag *)sfx_items; ++ tag_spec->data = tag_id << MLX5_MTR_COLOR_BITS; ++ tag_spec->id = mlx5_flow_get_reg_id(dev, MLX5_MTR_SFX, 0, &error); ++ tag_mask = tag_spec + 1; ++ tag_mask->data = 0xffffff00; ++ tag_item->type = (enum rte_flow_item_type) ++ MLX5_RTE_FLOW_ITEM_TYPE_TAG; ++ tag_item->spec = tag_spec; ++ tag_item->last = NULL; ++ tag_item->mask = tag_mask; + return tag_id; + } + +@@ -3640,7 +3701,8 @@ flow_mreg_split_qrss_prep(struct rte_eth_dev *dev, + /* Construct new actions array. */ + /* Replace QUEUE/RSS action. */ + split_actions[qrss_idx] = (struct rte_flow_action){ +- .type = MLX5_RTE_FLOW_ACTION_TYPE_TAG, ++ .type = (enum rte_flow_action_type) ++ MLX5_RTE_FLOW_ACTION_TYPE_TAG, + .conf = set_tag, + }; + } +@@ -3673,6 +3735,8 @@ flow_mreg_split_qrss_prep(struct rte_eth_dev *dev, + * Number of actions in the list. + * @param[out] error + * Perform verbose error reporting if not NULL. ++ * @param[in] encap_idx ++ * The encap action inndex. 
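/*
 * Both split paths above now compare raw encap/decap buffer sizes
 * against MLX5_ENCAPSULATION_DECISION_SIZE, defined in the mlx5_flow.h
 * hunk of this patch as sizeof(rte_flow_item_eth) +
 * sizeof(rte_flow_item_ipv4): a buffer larger than that is taken to
 * carry a full outer L2 header, while a smaller one is treated as an
 * L3 encapsulation.  A hedged helper expressing that classification:
 */
#include <stdbool.h>
#include <stddef.h>
#include <rte_flow.h>

static bool
demo_raw_encap_carries_l2(const struct rte_flow_action_raw_encap *conf)
{
	const size_t decision = sizeof(struct rte_flow_item_eth) +
				sizeof(struct rte_flow_item_ipv4);

	/* Same threshold the meter/metadata split code uses above. */
	return conf->size > decision;
}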
+ * + * @return + * 0 on success, negative value otherwise +@@ -3681,7 +3745,8 @@ static int + flow_mreg_tx_copy_prep(struct rte_eth_dev *dev, + struct rte_flow_action *ext_actions, + const struct rte_flow_action *actions, +- int actions_n, struct rte_flow_error *error) ++ int actions_n, struct rte_flow_error *error, ++ int encap_idx) + { + struct mlx5_flow_action_copy_mreg *cp_mreg = + (struct mlx5_flow_action_copy_mreg *) +@@ -3696,15 +3761,26 @@ flow_mreg_tx_copy_prep(struct rte_eth_dev *dev, + if (ret < 0) + return ret; + cp_mreg->src = ret; +- memcpy(ext_actions, actions, +- sizeof(*ext_actions) * actions_n); +- ext_actions[actions_n - 1] = (struct rte_flow_action){ +- .type = MLX5_RTE_FLOW_ACTION_TYPE_COPY_MREG, +- .conf = cp_mreg, +- }; +- ext_actions[actions_n] = (struct rte_flow_action){ +- .type = RTE_FLOW_ACTION_TYPE_END, +- }; ++ if (encap_idx != 0) ++ memcpy(ext_actions, actions, sizeof(*ext_actions) * encap_idx); ++ if (encap_idx == actions_n - 1) { ++ ext_actions[actions_n - 1] = (struct rte_flow_action){ ++ .type = (enum rte_flow_action_type) ++ MLX5_RTE_FLOW_ACTION_TYPE_COPY_MREG, ++ .conf = cp_mreg, ++ }; ++ ext_actions[actions_n] = (struct rte_flow_action){ ++ .type = RTE_FLOW_ACTION_TYPE_END, ++ }; ++ } else { ++ ext_actions[encap_idx] = (struct rte_flow_action){ ++ .type = (enum rte_flow_action_type) ++ MLX5_RTE_FLOW_ACTION_TYPE_COPY_MREG, ++ .conf = cp_mreg, ++ }; ++ memcpy(ext_actions + encap_idx + 1, actions + encap_idx, ++ sizeof(*ext_actions) * (actions_n - encap_idx)); ++ } + return 0; + } + +@@ -3722,6 +3798,8 @@ flow_mreg_tx_copy_prep(struct rte_eth_dev *dev, + * Pointer to Ethernet device. + * @param[in] flow + * Parent flow structure pointer. ++ * @param[in] prefix_layers ++ * Prefix flow layer flags. + * @param[in] attr + * Flow rule attributes. + * @param[in] items +@@ -3738,6 +3816,7 @@ flow_mreg_tx_copy_prep(struct rte_eth_dev *dev, + static int + flow_create_split_metadata(struct rte_eth_dev *dev, + struct rte_flow *flow, ++ uint64_t prefix_layers, + const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], +@@ -3752,15 +3831,18 @@ flow_create_split_metadata(struct rte_eth_dev *dev, + int mtr_sfx = 0; + size_t act_size; + int actions_n; ++ int encap_idx; + int ret; + + /* Check whether extensive metadata feature is engaged. */ + if (!config->dv_flow_en || + config->dv_xmeta_en == MLX5_XMETA_MODE_LEGACY || + !mlx5_flow_ext_mreg_supported(dev)) +- return flow_create_split_inner(dev, flow, NULL, attr, items, +- actions, external, error); +- actions_n = flow_parse_qrss_action(actions, &qrss); ++ return flow_create_split_inner(dev, flow, NULL, prefix_layers, ++ attr, items, actions, external, ++ error); ++ actions_n = flow_parse_metadata_split_actions_info(actions, &qrss, ++ &encap_idx); + if (qrss) { + /* Exclude hairpin flows from splitting. */ + if (qrss->type == RTE_FLOW_ACTION_TYPE_QUEUE) { +@@ -3807,6 +3889,7 @@ flow_create_split_metadata(struct rte_eth_dev *dev, + RTE_FLOW_ACTION_TYPE_VOID; + else + ext_actions[qrss - actions].type = ++ (enum rte_flow_action_type) + MLX5_RTE_FLOW_ACTION_TYPE_TAG; + /* + * Create the new actions list with removed Q/RSS action +@@ -3835,14 +3918,14 @@ flow_create_split_metadata(struct rte_eth_dev *dev, + "metadata flow"); + /* Create the action list appended with copy register. 
*/ + ret = flow_mreg_tx_copy_prep(dev, ext_actions, actions, +- actions_n, error); ++ actions_n, error, encap_idx); + if (ret < 0) + goto exit; + } + /* Add the unmodified original or prefix subflow. */ +- ret = flow_create_split_inner(dev, flow, &dev_flow, attr, items, +- ext_actions ? ext_actions : actions, +- external, error); ++ ret = flow_create_split_inner(dev, flow, &dev_flow, prefix_layers, attr, ++ items, ext_actions ? ext_actions : ++ actions, external, error); + if (ret < 0) + goto exit; + assert(dev_flow); +@@ -3858,7 +3941,8 @@ flow_create_split_metadata(struct rte_eth_dev *dev, + }; + struct rte_flow_item q_items[] = { + { +- .type = MLX5_RTE_FLOW_ITEM_TYPE_TAG, ++ .type = (enum rte_flow_item_type) ++ MLX5_RTE_FLOW_ITEM_TYPE_TAG, + .spec = &q_tag_spec, + .last = NULL, + .mask = NULL, +@@ -3876,7 +3960,7 @@ flow_create_split_metadata(struct rte_eth_dev *dev, + .type = RTE_FLOW_ACTION_TYPE_END, + }, + }; +- uint64_t hash_fields = dev_flow->hash_fields; ++ uint64_t layers = flow_get_prefix_layer_flags(dev_flow); + + /* + * Configure the tag item only if there is no meter subflow. +@@ -3903,14 +3987,13 @@ flow_create_split_metadata(struct rte_eth_dev *dev, + } + dev_flow = NULL; + /* Add suffix subflow to execute Q/RSS. */ +- ret = flow_create_split_inner(dev, flow, &dev_flow, ++ ret = flow_create_split_inner(dev, flow, &dev_flow, layers, + &q_attr, mtr_sfx ? items : + q_items, q_actions, + external, error); + if (ret < 0) + goto exit; + assert(dev_flow); +- dev_flow->hash_fields = hash_fields; + } + + exit: +@@ -3963,7 +4046,6 @@ flow_create_split_meter(struct rte_eth_dev *dev, + struct rte_flow_action *sfx_actions = NULL; + struct rte_flow_action *pre_actions = NULL; + struct rte_flow_item *sfx_items = NULL; +- const struct rte_flow_item *sfx_port_id_item; + struct mlx5_flow *dev_flow = NULL; + struct rte_flow_attr sfx_attr = *attr; + uint32_t mtr = 0; +@@ -3976,63 +4058,47 @@ flow_create_split_meter(struct rte_eth_dev *dev, + if (priv->mtr_en) + actions_n = flow_check_meter_action(actions, &mtr); + if (mtr) { +- struct mlx5_rte_flow_item_tag *tag_spec; + /* The five prefix actions: meter, decap, encap, tag, end. */ + act_size = sizeof(struct rte_flow_action) * (actions_n + 5) + +- sizeof(struct rte_flow_action_set_tag); +- /* tag, end. */ +-#define METER_SUFFIX_ITEM 3 ++ sizeof(struct mlx5_rte_flow_action_set_tag); ++ /* tag, vlan, port id, end. */ ++#define METER_SUFFIX_ITEM 4 + item_size = sizeof(struct rte_flow_item) * METER_SUFFIX_ITEM + +- sizeof(struct mlx5_rte_flow_item_tag); ++ sizeof(struct mlx5_rte_flow_item_tag) * 2; + sfx_actions = rte_zmalloc(__func__, (act_size + item_size), 0); + if (!sfx_actions) + return rte_flow_error_set(error, ENOMEM, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, "no memory to split " + "meter flow"); ++ sfx_items = (struct rte_flow_item *)((char *)sfx_actions + ++ act_size); + pre_actions = sfx_actions + actions_n; +- mtr_tag_id = flow_meter_split_prep(dev, actions, sfx_actions, +- pre_actions); ++ mtr_tag_id = flow_meter_split_prep(dev, items, sfx_items, ++ actions, sfx_actions, ++ pre_actions); + if (!mtr_tag_id) { + ret = -rte_errno; + goto exit; + } + /* Add the prefix subflow. */ +- ret = flow_create_split_inner(dev, flow, &dev_flow, attr, items, +- pre_actions, external, error); ++ ret = flow_create_split_inner(dev, flow, &dev_flow, 0, attr, ++ items, pre_actions, external, ++ error); + if (ret) { + ret = -rte_errno; + goto exit; + } + dev_flow->mtr_flow_id = mtr_tag_id; +- /* Prepare the suffix flow match pattern. 
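/*
 * The suffix-flow tag prepared above keeps the meter flow id in the
 * upper register bits and reserves the low bits for the meter color;
 * the 0xffffff00 mask makes the suffix match ignore the color.  The
 * sketch below assumes MLX5_MTR_COLOR_BITS is 8, which is what that
 * mask implies (demo_ names are illustrative):
 */
#include <stdint.h>

#define DEMO_MTR_COLOR_BITS 8	/* assumed, consistent with 0xffffff00 */

static inline uint32_t
demo_mtr_tag_encode(uint32_t flow_id, uint32_t color)
{
	return (flow_id << DEMO_MTR_COLOR_BITS) |
	       (color & ((1u << DEMO_MTR_COLOR_BITS) - 1));
}

static inline uint32_t
demo_mtr_tag_flow_id(uint32_t reg_value)
{
	return reg_value >> DEMO_MTR_COLOR_BITS;	/* drop color bits */
}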
*/ +- sfx_items = (struct rte_flow_item *)((char *)sfx_actions + +- act_size); +- tag_spec = (struct mlx5_rte_flow_item_tag *)(sfx_items + +- METER_SUFFIX_ITEM); +- tag_spec->data = rte_cpu_to_be_32(dev_flow->mtr_flow_id); +- tag_spec->id = mlx5_flow_get_reg_id(dev, MLX5_MTR_SFX, 0, +- error); +- sfx_items->type = MLX5_RTE_FLOW_ITEM_TYPE_TAG; +- sfx_items->spec = tag_spec; +- sfx_items->last = NULL; +- sfx_items->mask = NULL; +- sfx_items++; +- sfx_port_id_item = find_port_id_item(items); +- if (sfx_port_id_item) { +- memcpy(sfx_items, sfx_port_id_item, +- sizeof(*sfx_items)); +- sfx_items++; +- } +- sfx_items->type = RTE_FLOW_ITEM_TYPE_END; +- sfx_items -= METER_SUFFIX_ITEM; + /* Setting the sfx group atrr. */ + sfx_attr.group = sfx_attr.transfer ? + (MLX5_FLOW_TABLE_LEVEL_SUFFIX - 1) : + MLX5_FLOW_TABLE_LEVEL_SUFFIX; + } + /* Add the prefix subflow. */ +- ret = flow_create_split_metadata(dev, flow, &sfx_attr, ++ ret = flow_create_split_metadata(dev, flow, dev_flow ? ++ flow_get_prefix_layer_flags(dev_flow) : ++ 0, &sfx_attr, + sfx_items ? sfx_items : items, + sfx_actions ? sfx_actions : actions, + external, error); +@@ -4146,14 +4212,18 @@ flow_list_create(struct rte_eth_dev *dev, struct mlx5_flows *list, + } items_tx; + struct rte_flow_expand_rss *buf = &expand_buffer.buf; + const struct rte_flow_action *p_actions_rx = actions; +- int ret; + uint32_t i; + uint32_t flow_size; +- int hairpin_flow = 0; ++ int hairpin_flow; + uint32_t hairpin_id = 0; + struct rte_flow_attr attr_tx = { .priority = 0 }; ++ int ret; + + hairpin_flow = flow_check_hairpin_split(dev, attr, actions); ++ ret = flow_drv_validate(dev, attr, items, p_actions_rx, ++ external, hairpin_flow, error); ++ if (ret < 0) ++ return NULL; + if (hairpin_flow > 0) { + if (hairpin_flow > MLX5_MAX_SPLIT_ACTIONS) { + rte_errno = EINVAL; +@@ -4164,10 +4234,6 @@ flow_list_create(struct rte_eth_dev *dev, struct mlx5_flows *list, + &hairpin_id); + p_actions_rx = actions_rx.actions; + } +- ret = flow_drv_validate(dev, attr, items, p_actions_rx, external, +- error); +- if (ret < 0) +- goto error_before_flow; + flow_size = sizeof(struct rte_flow); + rss = flow_get_rss_action(p_actions_rx); + if (rss) +@@ -4334,6 +4400,26 @@ mlx5_flow_create_esw_table_zero_flow(struct rte_eth_dev *dev) + actions, false, &error); + } + ++/** ++ * Validate a flow supported by the NIC. ++ * ++ * @see rte_flow_validate() ++ * @see rte_flow_ops ++ */ ++int ++mlx5_flow_validate(struct rte_eth_dev *dev, ++ const struct rte_flow_attr *attr, ++ const struct rte_flow_item items[], ++ const struct rte_flow_action actions[], ++ struct rte_flow_error *error) ++{ ++ int hairpin_flow; ++ ++ hairpin_flow = flow_check_hairpin_split(dev, attr, actions); ++ return flow_drv_validate(dev, attr, items, actions, ++ true, hairpin_flow, error); ++} ++ + /** + * Create a flow. 
+ * +@@ -4518,7 +4604,8 @@ mlx5_ctrl_flow_source_queue(struct rte_eth_dev *dev, + }; + struct rte_flow_item items[] = { + { +- .type = MLX5_RTE_FLOW_ITEM_TYPE_TX_QUEUE, ++ .type = (enum rte_flow_item_type) ++ MLX5_RTE_FLOW_ITEM_TYPE_TX_QUEUE, + .spec = &queue_spec, + .last = NULL, + .mask = &queue_mask, +@@ -4623,6 +4710,8 @@ mlx5_ctrl_flow_vlan(struct rte_eth_dev *dev, + if (!priv->reta_idx_n || !priv->rxqs_n) { + return 0; + } ++ if (!(dev->data->dev_conf.rxmode.mq_mode & ETH_MQ_RX_RSS_FLAG)) ++ action_rss.types = 0; + for (i = 0; i != priv->reta_idx_n; ++i) + queue[i] = (*priv->reta_idx)[i]; + flow = flow_list_create(dev, &priv->ctrl_flows, +@@ -5570,6 +5659,8 @@ mlx5_flow_async_pool_query_handle(struct mlx5_ibv_shared *sh, + * Value is part of flow rule created by request external to PMD. + * @param[in] group + * rte_flow group index value. ++ * @param[out] fdb_def_rule ++ * Whether fdb jump to table 1 is configured. + * @param[out] table + * HW table value. + * @param[out] error +@@ -5580,10 +5671,10 @@ mlx5_flow_async_pool_query_handle(struct mlx5_ibv_shared *sh, + */ + int + mlx5_flow_group_to_table(const struct rte_flow_attr *attributes, bool external, +- uint32_t group, uint32_t *table, ++ uint32_t group, bool fdb_def_rule, uint32_t *table, + struct rte_flow_error *error) + { +- if (attributes->transfer && external) { ++ if (attributes->transfer && external && fdb_def_rule) { + if (group == UINT32_MAX) + return rte_flow_error_set + (error, EINVAL, +@@ -5633,7 +5724,8 @@ mlx5_flow_discover_mreg_c(struct rte_eth_dev *dev) + }; + struct rte_flow_action actions[] = { + [0] = { +- .type = MLX5_RTE_FLOW_ACTION_TYPE_COPY_MREG, ++ .type = (enum rte_flow_action_type) ++ MLX5_RTE_FLOW_ACTION_TYPE_COPY_MREG, + .conf = &(struct mlx5_flow_action_copy_mreg){ + .src = REG_C_1, + .dst = idx, +diff --git a/dpdk/drivers/net/mlx5/mlx5_flow.h b/dpdk/drivers/net/mlx5/mlx5_flow.h +index 3fff5dd7da..f8046119ec 100644 +--- a/dpdk/drivers/net/mlx5/mlx5_flow.h ++++ b/dpdk/drivers/net/mlx5/mlx5_flow.h +@@ -33,6 +33,7 @@ enum mlx5_rte_flow_item_type { + MLX5_RTE_FLOW_ITEM_TYPE_END = INT_MIN, + MLX5_RTE_FLOW_ITEM_TYPE_TAG, + MLX5_RTE_FLOW_ITEM_TYPE_TX_QUEUE, ++ MLX5_RTE_FLOW_ITEM_TYPE_VLAN, + }; + + /* Private (internal) rte flow actions. 
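/*
 * The (enum rte_flow_item_type) and (enum rte_flow_action_type) casts
 * added throughout this patch exist because the PMD's private item and
 * action types live in separate enums that start at INT_MIN, safely
 * below every public RTE_FLOW_* value, as the enum above shows.  A
 * minimal sketch of the same pattern (demo_ names are illustrative):
 */
#include <limits.h>
#include <rte_flow.h>

enum demo_priv_item_type {
	DEMO_PRIV_ITEM_TYPE_TAG = INT_MIN,	/* below any public value */
	DEMO_PRIV_ITEM_TYPE_TX_QUEUE,
};

static void
demo_set_priv_item(struct rte_flow_item *item)
{
	/* The explicit cast silences enum-conversion warnings. */
	item->type = (enum rte_flow_item_type)DEMO_PRIV_ITEM_TYPE_TAG;
}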
*/ +@@ -188,20 +189,16 @@ enum mlx5_feature_name { + #define MLX5_FLOW_ACTION_DEC_TTL (1u << 19) + #define MLX5_FLOW_ACTION_SET_MAC_SRC (1u << 20) + #define MLX5_FLOW_ACTION_SET_MAC_DST (1u << 21) +-#define MLX5_FLOW_ACTION_VXLAN_ENCAP (1u << 22) +-#define MLX5_FLOW_ACTION_VXLAN_DECAP (1u << 23) +-#define MLX5_FLOW_ACTION_NVGRE_ENCAP (1u << 24) +-#define MLX5_FLOW_ACTION_NVGRE_DECAP (1u << 25) +-#define MLX5_FLOW_ACTION_RAW_ENCAP (1u << 26) +-#define MLX5_FLOW_ACTION_RAW_DECAP (1u << 27) +-#define MLX5_FLOW_ACTION_INC_TCP_SEQ (1u << 28) +-#define MLX5_FLOW_ACTION_DEC_TCP_SEQ (1u << 29) +-#define MLX5_FLOW_ACTION_INC_TCP_ACK (1u << 30) +-#define MLX5_FLOW_ACTION_DEC_TCP_ACK (1u << 31) +-#define MLX5_FLOW_ACTION_SET_TAG (1ull << 32) +-#define MLX5_FLOW_ACTION_MARK_EXT (1ull << 33) +-#define MLX5_FLOW_ACTION_SET_META (1ull << 34) +-#define MLX5_FLOW_ACTION_METER (1ull << 35) ++#define MLX5_FLOW_ACTION_ENCAP (1u << 22) ++#define MLX5_FLOW_ACTION_DECAP (1u << 23) ++#define MLX5_FLOW_ACTION_INC_TCP_SEQ (1u << 24) ++#define MLX5_FLOW_ACTION_DEC_TCP_SEQ (1u << 25) ++#define MLX5_FLOW_ACTION_INC_TCP_ACK (1u << 26) ++#define MLX5_FLOW_ACTION_DEC_TCP_ACK (1u << 27) ++#define MLX5_FLOW_ACTION_SET_TAG (1ull << 28) ++#define MLX5_FLOW_ACTION_MARK_EXT (1ull << 29) ++#define MLX5_FLOW_ACTION_SET_META (1ull << 30) ++#define MLX5_FLOW_ACTION_METER (1ull << 31) + + #define MLX5_FLOW_FATE_ACTIONS \ + (MLX5_FLOW_ACTION_DROP | MLX5_FLOW_ACTION_QUEUE | \ +@@ -211,15 +208,6 @@ enum mlx5_feature_name { + (MLX5_FLOW_ACTION_DROP | MLX5_FLOW_ACTION_PORT_ID | \ + MLX5_FLOW_ACTION_JUMP) + +-#define MLX5_FLOW_ENCAP_ACTIONS (MLX5_FLOW_ACTION_VXLAN_ENCAP | \ +- MLX5_FLOW_ACTION_NVGRE_ENCAP | \ +- MLX5_FLOW_ACTION_RAW_ENCAP | \ +- MLX5_FLOW_ACTION_OF_PUSH_VLAN) +- +-#define MLX5_FLOW_DECAP_ACTIONS (MLX5_FLOW_ACTION_VXLAN_DECAP | \ +- MLX5_FLOW_ACTION_NVGRE_DECAP | \ +- MLX5_FLOW_ACTION_RAW_DECAP | \ +- MLX5_FLOW_ACTION_OF_POP_VLAN) + + #define MLX5_FLOW_MODIFY_HDR_ACTIONS (MLX5_FLOW_ACTION_SET_IPV4_SRC | \ + MLX5_FLOW_ACTION_SET_IPV4_DST | \ +@@ -242,6 +230,9 @@ enum mlx5_feature_name { + + #define MLX5_FLOW_VLAN_ACTIONS (MLX5_FLOW_ACTION_OF_POP_VLAN | \ + MLX5_FLOW_ACTION_OF_PUSH_VLAN) ++ ++#define MLX5_FLOW_XCAP_ACTIONS (MLX5_FLOW_ACTION_ENCAP | MLX5_FLOW_ACTION_DECAP) ++ + #ifndef IPPROTO_MPLS + #define IPPROTO_MPLS 137 + #endif +@@ -288,6 +279,27 @@ enum mlx5_feature_name { + /* IBV hash source bits for IPV6. */ + #define MLX5_IPV6_IBV_RX_HASH (IBV_RX_HASH_SRC_IPV6 | IBV_RX_HASH_DST_IPV6) + ++/* IBV hash bits for L3 SRC. */ ++#define MLX5_L3_SRC_IBV_RX_HASH (IBV_RX_HASH_SRC_IPV4 | IBV_RX_HASH_SRC_IPV6) ++ ++/* IBV hash bits for L3 DST. */ ++#define MLX5_L3_DST_IBV_RX_HASH (IBV_RX_HASH_DST_IPV4 | IBV_RX_HASH_DST_IPV6) ++ ++/* IBV hash bits for TCP. */ ++#define MLX5_TCP_IBV_RX_HASH (IBV_RX_HASH_SRC_PORT_TCP | \ ++ IBV_RX_HASH_DST_PORT_TCP) ++ ++/* IBV hash bits for UDP. */ ++#define MLX5_UDP_IBV_RX_HASH (IBV_RX_HASH_SRC_PORT_UDP | \ ++ IBV_RX_HASH_DST_PORT_UDP) ++ ++/* IBV hash bits for L4 SRC. */ ++#define MLX5_L4_SRC_IBV_RX_HASH (IBV_RX_HASH_SRC_PORT_TCP | \ ++ IBV_RX_HASH_SRC_PORT_UDP) ++ ++/* IBV hash bits for L4 DST. 
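/*
 * The flag renumbering above collapses the four tunnel-specific encap
 * bits into MLX5_FLOW_ACTION_ENCAP (1u << 22) and the decap bits into
 * MLX5_FLOW_ACTION_DECAP (1u << 23), combined by MLX5_FLOW_XCAP_ACTIONS.
 * A sketch of the kind of test such a combined mask enables, reusing
 * the bit values from the hunk above:
 */
#include <stdint.h>

#define DEMO_FLOW_ACTION_ENCAP (1u << 22)
#define DEMO_FLOW_ACTION_DECAP (1u << 23)
#define DEMO_FLOW_XCAP_ACTIONS \
	(DEMO_FLOW_ACTION_ENCAP | DEMO_FLOW_ACTION_DECAP)

static inline int
demo_has_both_encap_and_decap(uint64_t action_flags)
{
	return (action_flags & DEMO_FLOW_XCAP_ACTIONS) ==
	       DEMO_FLOW_XCAP_ACTIONS;
}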
*/ ++#define MLX5_L4_DST_IBV_RX_HASH (IBV_RX_HASH_DST_PORT_TCP | \ ++ IBV_RX_HASH_DST_PORT_UDP) + + /* Geneve header first 16Bit */ + #define MLX5_GENEVE_VER_MASK 0x3 +@@ -315,6 +327,26 @@ enum mlx5_feature_name { + #define MLX5_GENEVE_OPT_LEN_0 14 + #define MLX5_GENEVE_OPT_LEN_1 63 + ++#define MLX5_ENCAPSULATION_DECISION_SIZE (sizeof(struct rte_flow_item_eth) + \ ++ sizeof(struct rte_flow_item_ipv4)) ++ ++/* Software header modify action numbers of a flow. */ ++#define MLX5_ACT_NUM_MDF_IPV4 1 ++#define MLX5_ACT_NUM_MDF_IPV6 4 ++#define MLX5_ACT_NUM_MDF_MAC 2 ++#define MLX5_ACT_NUM_MDF_VID 1 ++#define MLX5_ACT_NUM_MDF_PORT 2 ++#define MLX5_ACT_NUM_MDF_TTL 1 ++#define MLX5_ACT_NUM_DEC_TTL MLX5_ACT_NUM_MDF_TTL ++#define MLX5_ACT_NUM_MDF_TCPSEQ 1 ++#define MLX5_ACT_NUM_MDF_TCPACK 1 ++#define MLX5_ACT_NUM_SET_REG 1 ++#define MLX5_ACT_NUM_SET_TAG 1 ++#define MLX5_ACT_NUM_CPY_MREG MLX5_ACT_NUM_SET_TAG ++#define MLX5_ACT_NUM_SET_MARK MLX5_ACT_NUM_SET_TAG ++#define MLX5_ACT_NUM_SET_META MLX5_ACT_NUM_SET_TAG ++#define MLX5_ACT_NUM_SET_DSCP 1 ++ + enum mlx5_flow_drv_type { + MLX5_FLOW_TYPE_MIN, + MLX5_FLOW_TYPE_DV, +@@ -370,11 +402,16 @@ struct mlx5_flow_dv_tag_resource { + + /* + * Number of modification commands. +- * If extensive metadata registers are supported +- * the maximal actions amount is 16 and 8 otherwise. ++ * The maximal actions amount in FW is some constant, and it is 16 in the ++ * latest releases. In some old releases, it will be limited to 8. ++ * Since there is no interface to query the capacity, the maximal value should ++ * be used to allow PMD to create the flow. The validation will be done in the ++ * lower driver layer or FW. A failure will be returned if exceeds the maximal ++ * supported actions number on the root table. ++ * On non-root tables, there is no limitation, but 32 is enough right now. + */ +-#define MLX5_MODIFY_NUM 16 +-#define MLX5_MODIFY_NUM_NO_MREG 8 ++#define MLX5_MAX_MODIFY_NUM 32 ++#define MLX5_ROOT_TBL_MODIFY_NUM 16 + + /* Modify resource structure */ + struct mlx5_flow_dv_modify_hdr_resource { +@@ -385,9 +422,9 @@ struct mlx5_flow_dv_modify_hdr_resource { + /**< Verbs modify header action object. */ + uint8_t ft_type; /**< Flow table type, Rx or Tx. */ + uint32_t actions_num; /**< Number of modification actions. */ +- struct mlx5_modification_cmd actions[MLX5_MODIFY_NUM]; +- /**< Modification actions. */ + uint64_t flags; /**< Flags for RDMA API. */ ++ struct mlx5_modification_cmd actions[]; ++ /**< Modification actions. */ + }; + + /* Jump action resource structure. */ +@@ -554,6 +591,8 @@ struct mlx5_flow_policer_stats { + struct mlx5_meter_domain_info { + struct mlx5_flow_tbl_resource *tbl; + /**< Meter table. */ ++ struct mlx5_flow_tbl_resource *sfx_tbl; ++ /**< Meter suffix table. */ + void *any_matcher; + /**< Meter color not match default criteria. 
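/*
 * mlx5_flow_dv_modify_hdr_resource above now ends in a flexible array
 * member (actions[]) instead of a fixed actions[MLX5_MODIFY_NUM], so a
 * resource can be allocated exactly as large as its actions_num needs,
 * up to the new MLX5_MAX_MODIFY_NUM of 32.  A generic-C sketch of the
 * allocation pattern this enables (demo_ types are illustrative):
 */
#include <stdint.h>
#include <stdlib.h>

struct demo_modify_hdr {
	uint32_t actions_num;
	uint64_t flags;
	uint64_t actions[];	/* flexible array member, sized at alloc */
};

static struct demo_modify_hdr *
demo_modify_hdr_alloc(uint32_t n)
{
	struct demo_modify_hdr *res =
		calloc(1, sizeof(*res) + n * sizeof(res->actions[0]));

	if (res != NULL)
		res->actions_num = n;
	return res;
}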
*/ + void *color_matcher; +@@ -657,6 +696,7 @@ typedef int (*mlx5_flow_validate_t)(struct rte_eth_dev *dev, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + bool external, ++ int hairpin, + struct rte_flow_error *error); + typedef struct mlx5_flow *(*mlx5_flow_prepare_t) + (const struct rte_flow_attr *attr, const struct rte_flow_item items[], +@@ -724,20 +764,20 @@ struct mlx5_flow_driver_ops { + + /* mlx5_flow.c */ + +-struct mlx5_flow_id_pool *mlx5_flow_id_pool_alloc(void); ++struct mlx5_flow_id_pool *mlx5_flow_id_pool_alloc(uint32_t max_id); + void mlx5_flow_id_pool_release(struct mlx5_flow_id_pool *pool); + uint32_t mlx5_flow_id_get(struct mlx5_flow_id_pool *pool, uint32_t *id); + uint32_t mlx5_flow_id_release(struct mlx5_flow_id_pool *pool, + uint32_t id); + int mlx5_flow_group_to_table(const struct rte_flow_attr *attributes, +- bool external, uint32_t group, uint32_t *table, +- struct rte_flow_error *error); ++ bool external, uint32_t group, bool fdb_def_rule, ++ uint32_t *table, struct rte_flow_error *error); + uint64_t mlx5_flow_hashfields_adjust(struct mlx5_flow *dev_flow, int tunnel, + uint64_t layer_types, + uint64_t hash_fields); + uint32_t mlx5_flow_adjust_priority(struct rte_eth_dev *dev, int32_t priority, + uint32_t subpriority); +-enum modify_reg mlx5_flow_get_reg_id(struct rte_eth_dev *dev, ++int mlx5_flow_get_reg_id(struct rte_eth_dev *dev, + enum mlx5_feature_name feature, + uint32_t id, + struct rte_flow_error *error); +diff --git a/dpdk/drivers/net/mlx5/mlx5_flow_dv.c b/dpdk/drivers/net/mlx5/mlx5_flow_dv.c +index 73aaea4536..d83e49f954 100644 +--- a/dpdk/drivers/net/mlx5/mlx5_flow_dv.c ++++ b/dpdk/drivers/net/mlx5/mlx5_flow_dv.c +@@ -51,8 +51,6 @@ + #define MLX5DV_DR_ACTION_FLAGS_ROOT_LEVEL 1 + #endif + +-#define MLX5_ENCAPSULATION_DECISION_SIZE (sizeof(struct rte_flow_item_eth) + \ +- sizeof(struct rte_flow_item_ipv4)) + /* VLAN header definitions */ + #define MLX5DV_FLOW_VLAN_PCP_SHIFT 13 + #define MLX5DV_FLOW_VLAN_PCP_MASK (0x7 << MLX5DV_FLOW_VLAN_PCP_SHIFT) +@@ -72,6 +70,10 @@ union flow_dv_attr { + uint32_t attr; + }; + ++static int ++flow_dv_tbl_resource_release(struct rte_eth_dev *dev, ++ struct mlx5_flow_tbl_resource *tbl); ++ + /** + * Initialize flow attributes structure according to flow items' types. + * +@@ -82,19 +84,74 @@ union flow_dv_attr { + * Pointer to item specification. + * @param[out] attr + * Pointer to flow attributes structure. ++ * @param[in] dev_flow ++ * Pointer to the sub flow. ++ * @param[in] tunnel_decap ++ * Whether action is after tunnel decapsulation. + */ + static void +-flow_dv_attr_init(const struct rte_flow_item *item, union flow_dv_attr *attr) ++flow_dv_attr_init(const struct rte_flow_item *item, union flow_dv_attr *attr, ++ struct mlx5_flow *dev_flow, bool tunnel_decap) + { ++ /* ++ * If layers is already initialized, it means this dev_flow is the ++ * suffix flow, the layers flags is set by the prefix flow. Need to ++ * use the layer flags from prefix flow as the suffix flow may not ++ * have the user defined items as the flow is split. 
++ */ ++ if (dev_flow->layers) { ++ if (dev_flow->layers & MLX5_FLOW_LAYER_OUTER_L3_IPV4) ++ attr->ipv4 = 1; ++ else if (dev_flow->layers & MLX5_FLOW_LAYER_OUTER_L3_IPV6) ++ attr->ipv6 = 1; ++ if (dev_flow->layers & MLX5_FLOW_LAYER_OUTER_L4_TCP) ++ attr->tcp = 1; ++ else if (dev_flow->layers & MLX5_FLOW_LAYER_OUTER_L4_UDP) ++ attr->udp = 1; ++ attr->valid = 1; ++ return; ++ } + for (; item->type != RTE_FLOW_ITEM_TYPE_END; item++) { ++ uint8_t next_protocol = 0xff; + switch (item->type) { ++ case RTE_FLOW_ITEM_TYPE_GRE: ++ case RTE_FLOW_ITEM_TYPE_NVGRE: ++ case RTE_FLOW_ITEM_TYPE_VXLAN: ++ case RTE_FLOW_ITEM_TYPE_VXLAN_GPE: ++ case RTE_FLOW_ITEM_TYPE_GENEVE: ++ case RTE_FLOW_ITEM_TYPE_MPLS: ++ if (tunnel_decap) ++ attr->attr = 0; ++ break; + case RTE_FLOW_ITEM_TYPE_IPV4: + if (!attr->ipv6) + attr->ipv4 = 1; ++ if (item->mask != NULL && ++ ((const struct rte_flow_item_ipv4 *) ++ item->mask)->hdr.next_proto_id) ++ next_protocol = ++ ((const struct rte_flow_item_ipv4 *) ++ (item->spec))->hdr.next_proto_id & ++ ((const struct rte_flow_item_ipv4 *) ++ (item->mask))->hdr.next_proto_id; ++ if ((next_protocol == IPPROTO_IPIP || ++ next_protocol == IPPROTO_IPV6) && tunnel_decap) ++ attr->attr = 0; + break; + case RTE_FLOW_ITEM_TYPE_IPV6: + if (!attr->ipv4) + attr->ipv6 = 1; ++ if (item->mask != NULL && ++ ((const struct rte_flow_item_ipv6 *) ++ item->mask)->hdr.proto) ++ next_protocol = ++ ((const struct rte_flow_item_ipv6 *) ++ (item->spec))->hdr.proto & ++ ((const struct rte_flow_item_ipv6 *) ++ (item->mask))->hdr.proto; ++ if ((next_protocol == IPPROTO_IPIP || ++ next_protocol == IPPROTO_IPV6) && tunnel_decap) ++ attr->attr = 0; + break; + case RTE_FLOW_ITEM_TYPE_UDP: + if (!attr->tcp) +@@ -363,7 +420,7 @@ flow_dv_convert_modify_action(struct rte_flow_item *item, + uint32_t mask; + uint32_t data; + +- if (i >= MLX5_MODIFY_NUM) ++ if (i >= MLX5_MAX_MODIFY_NUM) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "too many items to modify"); +@@ -380,10 +437,12 @@ flow_dv_convert_modify_action(struct rte_flow_item *item, + off_b - __builtin_clz(mask); + assert(size_b); + size_b = size_b == sizeof(uint32_t) * CHAR_BIT ? 0 : size_b; +- actions[i].action_type = type; +- actions[i].field = field->id; +- actions[i].offset = off_b; +- actions[i].length = size_b; ++ actions[i] = (struct mlx5_modification_cmd) { ++ .action_type = type, ++ .field = field->id, ++ .offset = off_b, ++ .length = size_b, ++ }; + /* Convert entire record to expected big-endian format. 
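/*
 * The hunks above replace per-field stores with one compound-literal
 * assignment.  Besides brevity, assigning
 * (struct mlx5_modification_cmd){ ... } zero-initializes every member
 * that is not named, so no stale data survives when a command slot is
 * reused.  Generic-C illustration (demo_cmd is not the driver struct):
 */
#include <stdint.h>

struct demo_cmd {
	uint32_t action_type;
	uint32_t field;
	uint32_t offset;
	uint32_t length;
	uint32_t data0;
	uint32_t data1;
};

static void
demo_fill(struct demo_cmd *cmd, uint32_t type, uint32_t field)
{
	*cmd = (struct demo_cmd){
		.action_type = type,
		.field = field,
		/* offset, length, data0 and data1 all become 0 here */
	};
}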
*/ + actions[i].data0 = rte_cpu_to_be_32(actions[i].data0); + if (type == MLX5_MODIFICATION_TYPE_COPY) { +@@ -404,11 +463,11 @@ flow_dv_convert_modify_action(struct rte_flow_item *item, + ++i; + ++field; + } while (field->size); +- resource->actions_num = i; +- if (!resource->actions_num) ++ if (resource->actions_num == i) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "invalid modification flow item"); ++ resource->actions_num = i; + return 0; + } + +@@ -566,17 +625,19 @@ flow_dv_convert_action_modify_vlan_vid + const struct rte_flow_action_of_set_vlan_vid *conf = + (const struct rte_flow_action_of_set_vlan_vid *)(action->conf); + int i = resource->actions_num; +- struct mlx5_modification_cmd *actions = &resource->actions[i]; ++ struct mlx5_modification_cmd *actions = resource->actions; + struct field_modify_info *field = modify_vlan_out_first_vid; + +- if (i >= MLX5_MODIFY_NUM) ++ if (i >= MLX5_MAX_MODIFY_NUM) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "too many items to modify"); +- actions[i].action_type = MLX5_MODIFICATION_TYPE_SET; +- actions[i].field = field->id; +- actions[i].length = field->size; +- actions[i].offset = field->offset; ++ actions[i] = (struct mlx5_modification_cmd) { ++ .action_type = MLX5_MODIFICATION_TYPE_SET, ++ .field = field->id, ++ .length = field->size, ++ .offset = field->offset, ++ }; + actions[i].data0 = rte_cpu_to_be_32(actions[i].data0); + actions[i].data1 = conf->vlan_vid; + actions[i].data1 = actions[i].data1 << 16; +@@ -595,6 +656,10 @@ flow_dv_convert_action_modify_vlan_vid + * Pointer to rte_flow_item objects list. + * @param[in] attr + * Pointer to flow attributes structure. ++ * @param[in] dev_flow ++ * Pointer to the sub flow. ++ * @param[in] tunnel_decap ++ * Whether action is after tunnel decapsulation. + * @param[out] error + * Pointer to the error structure. + * +@@ -606,8 +671,8 @@ flow_dv_convert_action_modify_tp + (struct mlx5_flow_dv_modify_hdr_resource *resource, + const struct rte_flow_action *action, + const struct rte_flow_item *items, +- union flow_dv_attr *attr, +- struct rte_flow_error *error) ++ union flow_dv_attr *attr, struct mlx5_flow *dev_flow, ++ bool tunnel_decap, struct rte_flow_error *error) + { + const struct rte_flow_action_set_tp *conf = + (const struct rte_flow_action_set_tp *)(action->conf); +@@ -619,7 +684,7 @@ flow_dv_convert_action_modify_tp + struct field_modify_info *field; + + if (!attr->valid) +- flow_dv_attr_init(items, attr); ++ flow_dv_attr_init(items, attr, dev_flow, tunnel_decap); + if (attr->udp) { + memset(&udp, 0, sizeof(udp)); + memset(&udp_mask, 0, sizeof(udp_mask)); +@@ -636,8 +701,8 @@ flow_dv_convert_action_modify_tp + item.spec = &udp; + item.mask = &udp_mask; + field = modify_udp; +- } +- if (attr->tcp) { ++ } else { ++ assert(attr->tcp); + memset(&tcp, 0, sizeof(tcp)); + memset(&tcp_mask, 0, sizeof(tcp_mask)); + if (action->type == RTE_FLOW_ACTION_TYPE_SET_TP_SRC) { +@@ -669,6 +734,10 @@ flow_dv_convert_action_modify_tp + * Pointer to rte_flow_item objects list. + * @param[in] attr + * Pointer to flow attributes structure. ++ * @param[in] dev_flow ++ * Pointer to the sub flow. ++ * @param[in] tunnel_decap ++ * Whether action is after tunnel decapsulation. + * @param[out] error + * Pointer to the error structure. 
+ * +@@ -680,8 +749,8 @@ flow_dv_convert_action_modify_ttl + (struct mlx5_flow_dv_modify_hdr_resource *resource, + const struct rte_flow_action *action, + const struct rte_flow_item *items, +- union flow_dv_attr *attr, +- struct rte_flow_error *error) ++ union flow_dv_attr *attr, struct mlx5_flow *dev_flow, ++ bool tunnel_decap, struct rte_flow_error *error) + { + const struct rte_flow_action_set_ttl *conf = + (const struct rte_flow_action_set_ttl *)(action->conf); +@@ -693,7 +762,7 @@ flow_dv_convert_action_modify_ttl + struct field_modify_info *field; + + if (!attr->valid) +- flow_dv_attr_init(items, attr); ++ flow_dv_attr_init(items, attr, dev_flow, tunnel_decap); + if (attr->ipv4) { + memset(&ipv4, 0, sizeof(ipv4)); + memset(&ipv4_mask, 0, sizeof(ipv4_mask)); +@@ -703,8 +772,8 @@ flow_dv_convert_action_modify_ttl + item.spec = &ipv4; + item.mask = &ipv4_mask; + field = modify_ipv4; +- } +- if (attr->ipv6) { ++ } else { ++ assert(attr->ipv6); + memset(&ipv6, 0, sizeof(ipv6)); + memset(&ipv6_mask, 0, sizeof(ipv6_mask)); + ipv6.hdr.hop_limits = conf->ttl_value; +@@ -729,6 +798,10 @@ flow_dv_convert_action_modify_ttl + * Pointer to rte_flow_item objects list. + * @param[in] attr + * Pointer to flow attributes structure. ++ * @param[in] dev_flow ++ * Pointer to the sub flow. ++ * @param[in] tunnel_decap ++ * Whether action is after tunnel decapsulation. + * @param[out] error + * Pointer to the error structure. + * +@@ -739,8 +812,8 @@ static int + flow_dv_convert_action_modify_dec_ttl + (struct mlx5_flow_dv_modify_hdr_resource *resource, + const struct rte_flow_item *items, +- union flow_dv_attr *attr, +- struct rte_flow_error *error) ++ union flow_dv_attr *attr, struct mlx5_flow *dev_flow, ++ bool tunnel_decap, struct rte_flow_error *error) + { + struct rte_flow_item item; + struct rte_flow_item_ipv4 ipv4; +@@ -750,7 +823,7 @@ flow_dv_convert_action_modify_dec_ttl + struct field_modify_info *field; + + if (!attr->valid) +- flow_dv_attr_init(items, attr); ++ flow_dv_attr_init(items, attr, dev_flow, tunnel_decap); + if (attr->ipv4) { + memset(&ipv4, 0, sizeof(ipv4)); + memset(&ipv4_mask, 0, sizeof(ipv4_mask)); +@@ -760,8 +833,8 @@ flow_dv_convert_action_modify_dec_ttl + item.spec = &ipv4; + item.mask = &ipv4_mask; + field = modify_ipv4; +- } +- if (attr->ipv6) { ++ } else { ++ assert(attr->ipv6); + memset(&ipv6, 0, sizeof(ipv6)); + memset(&ipv6_mask, 0, sizeof(ipv6_mask)); + ipv6.hdr.hop_limits = 0xFF; +@@ -902,22 +975,20 @@ flow_dv_convert_action_set_reg + struct mlx5_modification_cmd *actions = resource->actions; + uint32_t i = resource->actions_num; + +- if (i >= MLX5_MODIFY_NUM) ++ if (i >= MLX5_MAX_MODIFY_NUM) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "too many items to modify"); + assert(conf->id != REG_NONE); + assert(conf->id < RTE_DIM(reg_to_field)); +- actions[i].action_type = MLX5_MODIFICATION_TYPE_SET; +- actions[i].field = reg_to_field[conf->id]; ++ actions[i] = (struct mlx5_modification_cmd) { ++ .action_type = MLX5_MODIFICATION_TYPE_SET, ++ .field = reg_to_field[conf->id], ++ }; + actions[i].data0 = rte_cpu_to_be_32(actions[i].data0); + actions[i].data1 = rte_cpu_to_be_32(conf->data); + ++i; + resource->actions_num = i; +- if (!resource->actions_num) +- return rte_flow_error_set(error, EINVAL, +- RTE_FLOW_ERROR_TYPE_ACTION, NULL, +- "invalid modification flow item"); + return 0; + } + +@@ -1078,7 +1149,7 @@ flow_dv_convert_action_mark(struct rte_eth_dev *dev, + {4, 0, 0}, /* dynamic instead of MLX5_MODI_META_REG_C_1. 
*/ + {0, 0, 0}, + }; +- enum modify_reg reg; ++ int reg; + + if (!mask) + return rte_flow_error_set(error, EINVAL, +@@ -1088,6 +1159,14 @@ flow_dv_convert_action_mark(struct rte_eth_dev *dev, + if (reg < 0) + return reg; + assert(reg > 0); ++ if (reg == REG_C_0) { ++ uint32_t msk_c0 = priv->sh->dv_regc0_mask; ++ uint32_t shl_c0 = rte_bsf32(msk_c0); ++ ++ data = rte_cpu_to_be_32(rte_cpu_to_be_32(data) << shl_c0); ++ mask = rte_cpu_to_be_32(mask) & msk_c0; ++ mask = rte_cpu_to_be_32(mask << shl_c0); ++ } + reg_c_x[0].id = reg_to_field[reg]; + return flow_dv_convert_modify_action(&item, reg_c_x, NULL, resource, + MLX5_MODIFICATION_TYPE_SET, error); +@@ -1112,7 +1191,7 @@ flow_dv_get_metadata_reg(struct rte_eth_dev *dev, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) + { +- enum modify_reg reg = ++ int reg = + mlx5_flow_get_reg_id(dev, attr->transfer ? + MLX5_METADATA_FDB : + attr->egress ? +@@ -1160,7 +1239,7 @@ flow_dv_convert_action_set_meta + struct field_modify_info reg_c_x[] = { + [1] = {0, 0, 0}, + }; +- enum modify_reg reg = flow_dv_get_metadata_reg(dev, attr, error); ++ int reg = flow_dv_get_metadata_reg(dev, attr, error); + + if (reg < 0) + return reg; +@@ -1250,6 +1329,11 @@ flow_dv_validate_item_mark(struct rte_eth_dev *dev, + "mark id exceeds the limit"); + if (!mask) + mask = &nic_mask; ++ if (!mask->id) ++ return rte_flow_error_set(error, EINVAL, ++ RTE_FLOW_ERROR_TYPE_ITEM_SPEC, NULL, ++ "mask cannot be zero"); ++ + ret = mlx5_flow_item_acceptable(item, (const uint8_t *)mask, + (const uint8_t *)&nic_mask, + sizeof(struct rte_flow_item_mark), +@@ -1287,7 +1371,7 @@ flow_dv_validate_item_meta(struct rte_eth_dev *dev __rte_unused, + struct rte_flow_item_meta nic_mask = { + .data = UINT32_MAX + }; +- enum modify_reg reg; ++ int reg; + int ret; + + if (!spec) +@@ -1295,10 +1379,6 @@ flow_dv_validate_item_meta(struct rte_eth_dev *dev __rte_unused, + RTE_FLOW_ERROR_TYPE_ITEM_SPEC, + item->spec, + "data cannot be empty"); +- if (!spec->data) +- return rte_flow_error_set(error, EINVAL, +- RTE_FLOW_ERROR_TYPE_ITEM_SPEC, NULL, +- "data cannot be zero"); + if (config->dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) { + if (!mlx5_flow_ext_mreg_supported(dev)) + return rte_flow_error_set(error, ENOTSUP, +@@ -1318,6 +1398,11 @@ flow_dv_validate_item_meta(struct rte_eth_dev *dev __rte_unused, + } + if (!mask) + mask = &rte_flow_item_meta_mask; ++ if (!mask->data) ++ return rte_flow_error_set(error, EINVAL, ++ RTE_FLOW_ERROR_TYPE_ITEM_SPEC, NULL, ++ "mask cannot be zero"); ++ + ret = mlx5_flow_item_acceptable(item, (const uint8_t *)mask, + (const uint8_t *)&nic_mask, + sizeof(struct rte_flow_item_meta), +@@ -1366,6 +1451,11 @@ flow_dv_validate_item_tag(struct rte_eth_dev *dev, + "data cannot be empty"); + if (!mask) + mask = &rte_flow_item_tag_mask; ++ if (!mask->data) ++ return rte_flow_error_set(error, EINVAL, ++ RTE_FLOW_ERROR_TYPE_ITEM_SPEC, NULL, ++ "mask cannot be zero"); ++ + ret = mlx5_flow_item_acceptable(item, (const uint8_t *)mask, + (const uint8_t *)&nic_mask, + sizeof(struct rte_flow_item_tag), +@@ -1465,6 +1555,79 @@ flow_dv_validate_item_port_id(struct rte_eth_dev *dev, + return 0; + } + ++/** ++ * Validate VLAN item. ++ * ++ * @param[in] item ++ * Item specification. ++ * @param[in] item_flags ++ * Bit-fields that holds the items detected until now. ++ * @param[in] dev ++ * Ethernet device flow is being created on. ++ * @param[out] error ++ * Pointer to error structure. ++ * ++ * @return ++ * 0 on success, a negative errno value otherwise and rte_errno is set. 
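/*
 * The validation hunks above add "mask cannot be zero" checks for the
 * MARK, META and TAG items: an all-zero mask matches every packet, so
 * the accompanying spec value would be meaningless.  A sketch of a
 * pattern item that now fails validation (values are hypothetical):
 */
#include <rte_flow.h>

static const struct rte_flow_item_mark demo_mark_spec = { .id = 42 };
static const struct rte_flow_item_mark demo_mark_zero_mask = { .id = 0 };

static const struct rte_flow_item demo_rejected_item = {
	.type = RTE_FLOW_ITEM_TYPE_MARK,
	.spec = &demo_mark_spec,
	.mask = &demo_mark_zero_mask,	/* rejected: "mask cannot be zero" */
};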
++ */ ++static int ++flow_dv_validate_item_vlan(const struct rte_flow_item *item, ++ uint64_t item_flags, ++ struct rte_eth_dev *dev, ++ struct rte_flow_error *error) ++{ ++ const struct rte_flow_item_vlan *mask = item->mask; ++ const struct rte_flow_item_vlan nic_mask = { ++ .tci = RTE_BE16(UINT16_MAX), ++ .inner_type = RTE_BE16(UINT16_MAX), ++ }; ++ const int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL); ++ int ret; ++ const uint64_t l34m = tunnel ? (MLX5_FLOW_LAYER_INNER_L3 | ++ MLX5_FLOW_LAYER_INNER_L4) : ++ (MLX5_FLOW_LAYER_OUTER_L3 | ++ MLX5_FLOW_LAYER_OUTER_L4); ++ const uint64_t vlanm = tunnel ? MLX5_FLOW_LAYER_INNER_VLAN : ++ MLX5_FLOW_LAYER_OUTER_VLAN; ++ ++ if (item_flags & vlanm) ++ return rte_flow_error_set(error, EINVAL, ++ RTE_FLOW_ERROR_TYPE_ITEM, item, ++ "multiple VLAN layers not supported"); ++ else if ((item_flags & l34m) != 0) ++ return rte_flow_error_set(error, EINVAL, ++ RTE_FLOW_ERROR_TYPE_ITEM, item, ++ "VLAN cannot follow L3/L4 layer"); ++ if (!mask) ++ mask = &rte_flow_item_vlan_mask; ++ ret = mlx5_flow_item_acceptable(item, (const uint8_t *)mask, ++ (const uint8_t *)&nic_mask, ++ sizeof(struct rte_flow_item_vlan), ++ error); ++ if (ret) ++ return ret; ++ if (!tunnel && mask->tci != RTE_BE16(0x0fff)) { ++ struct mlx5_priv *priv = dev->data->dev_private; ++ ++ if (priv->vmwa_context) { ++ /* ++ * Non-NULL context means we have a virtual machine ++ * and SR-IOV enabled, we have to create VLAN interface ++ * to make hypervisor to setup E-Switch vport ++ * context correctly. We avoid creating the multiple ++ * VLAN interfaces, so we cannot support VLAN tag mask. ++ */ ++ return rte_flow_error_set(error, EINVAL, ++ RTE_FLOW_ERROR_TYPE_ITEM, ++ item, ++ "VLAN tag mask is not" ++ " supported in virtual" ++ " environment"); ++ } ++ } ++ return 0; ++} ++ + /** + * Validate the pop VLAN action. + * +@@ -1492,7 +1655,7 @@ flow_dv_validate_action_pop_vlan(struct rte_eth_dev *dev, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) + { +- struct mlx5_priv *priv = dev->data->dev_private; ++ const struct mlx5_priv *priv = dev->data->dev_private; + + (void)action; + (void)attr; +@@ -1501,17 +1664,16 @@ flow_dv_validate_action_pop_vlan(struct rte_eth_dev *dev, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "pop vlan action is not supported"); +- /* +- * Check for inconsistencies: +- * fail strip_vlan in a flow that matches packets without VLAN tags. +- * fail strip_vlan in a flow that matches packets without explicitly a +- * matching on VLAN tag ? 
+- */ +- if (action_flags & MLX5_FLOW_ACTION_OF_POP_VLAN) ++ if (attr->egress) + return rte_flow_error_set(error, ENOTSUP, +- RTE_FLOW_ERROR_TYPE_UNSPECIFIED, ++ RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, + NULL, +- "no support for multiple vlan pop " ++ "pop vlan action not supported for " ++ "egress"); ++ if (action_flags & MLX5_FLOW_VLAN_ACTIONS) ++ return rte_flow_error_set(error, ENOTSUP, ++ RTE_FLOW_ERROR_TYPE_ACTION, action, ++ "no support for multiple VLAN " + "actions"); + if (!(item_flags & MLX5_FLOW_LAYER_OUTER_VLAN)) + return rte_flow_error_set(error, ENOTSUP, +@@ -1524,20 +1686,21 @@ flow_dv_validate_action_pop_vlan(struct rte_eth_dev *dev, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "wrong action order, port_id should " + "be after pop VLAN action"); ++ if (!attr->transfer && priv->representor) ++ return rte_flow_error_set(error, ENOTSUP, ++ RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, ++ "pop vlan action for VF representor " ++ "not supported on NIC table"); + return 0; + } + + /** + * Get VLAN default info from vlan match info. + * +- * @param[in] dev +- * Pointer to the rte_eth_dev structure. +- * @param[in] item ++ * @param[in] items + * the list of item specifications. + * @param[out] vlan + * pointer VLAN info to fill to. +- * @param[out] error +- * Pointer to error structure. + * + * @return + * 0 on success, a negative errno value otherwise and rte_errno is set. +@@ -1554,19 +1717,26 @@ flow_dev_get_vlan_info_from_items(const struct rte_flow_item *items, + + if (items == NULL) + return; +- for (; items->type != RTE_FLOW_ITEM_TYPE_END && +- items->type != RTE_FLOW_ITEM_TYPE_VLAN; items++) +- ; +- if (items->type == RTE_FLOW_ITEM_TYPE_VLAN) { ++ for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) { ++ int type = items->type; ++ ++ if (type == RTE_FLOW_ITEM_TYPE_VLAN || ++ type == MLX5_RTE_FLOW_ITEM_TYPE_VLAN) ++ break; ++ } ++ if (items->type != RTE_FLOW_ITEM_TYPE_END) { + const struct rte_flow_item_vlan *vlan_m = items->mask; + const struct rte_flow_item_vlan *vlan_v = items->spec; + ++ /* If VLAN item in pattern doesn't contain data, return here. */ ++ if (!vlan_v) ++ return; + if (!vlan_m) + vlan_m = &nic_mask; + /* Only full match values are accepted */ + if ((vlan_m->tci & MLX5DV_FLOW_VLAN_PCP_MASK_BE) == + MLX5DV_FLOW_VLAN_PCP_MASK_BE) { +- vlan->vlan_tci &= MLX5DV_FLOW_VLAN_PCP_MASK; ++ vlan->vlan_tci &= ~MLX5DV_FLOW_VLAN_PCP_MASK; + vlan->vlan_tci |= + rte_be_to_cpu_16(vlan_v->tci & + MLX5DV_FLOW_VLAN_PCP_MASK_BE); +@@ -1587,10 +1757,14 @@ flow_dev_get_vlan_info_from_items(const struct rte_flow_item *items, + /** + * Validate the push VLAN action. + * ++ * @param[in] dev ++ * Pointer to the rte_eth_dev structure. + * @param[in] action_flags + * Holds the actions detected until now. ++ * @param[in] item_flags ++ * The items found in this flow rule. + * @param[in] action +- * Pointer to the encap action. ++ * Pointer to the action structure. + * @param[in] attr + * Pointer to flow attributes + * @param[out] error +@@ -1600,38 +1774,68 @@ flow_dev_get_vlan_info_from_items(const struct rte_flow_item *items, + * 0 on success, a negative errno value otherwise and rte_errno is set. 
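/*
 * The one-character fix above (vlan_tci &= ~MLX5DV_FLOW_VLAN_PCP_MASK
 * instead of &= MLX5DV_FLOW_VLAN_PCP_MASK) matters because the intent
 * is to clear the old PCP bits before OR-ing in the matched priority.
 * A sketch using the same 13-bit shift this file defines:
 */
#include <stdint.h>

#define DEMO_VLAN_PCP_SHIFT 13		/* per MLX5DV_FLOW_VLAN_PCP_SHIFT */
#define DEMO_VLAN_PCP_MASK (0x7 << DEMO_VLAN_PCP_SHIFT)

static inline uint16_t
demo_vlan_set_pcp(uint16_t tci, uint16_t pcp)
{
	tci &= ~DEMO_VLAN_PCP_MASK;	/* clear old priority bits first */
	return tci | (uint16_t)(pcp << DEMO_VLAN_PCP_SHIFT);
}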
+ */ + static int +-flow_dv_validate_action_push_vlan(uint64_t action_flags, +- uint64_t item_flags, ++flow_dv_validate_action_push_vlan(struct rte_eth_dev *dev, ++ uint64_t action_flags, ++ const struct rte_flow_item_vlan *vlan_m, + const struct rte_flow_action *action, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) + { + const struct rte_flow_action_of_push_vlan *push_vlan = action->conf; ++ const struct mlx5_priv *priv = dev->data->dev_private; + ++ if (!attr->transfer && attr->ingress) ++ return rte_flow_error_set(error, ENOTSUP, ++ RTE_FLOW_ERROR_TYPE_ATTR_INGRESS, ++ NULL, ++ "push VLAN action not supported for " ++ "ingress"); + if (push_vlan->ethertype != RTE_BE16(RTE_ETHER_TYPE_VLAN) && + push_vlan->ethertype != RTE_BE16(RTE_ETHER_TYPE_QINQ)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "invalid vlan ethertype"); +- if (action_flags & +- (MLX5_FLOW_ACTION_OF_POP_VLAN | MLX5_FLOW_ACTION_OF_PUSH_VLAN)) ++ if (action_flags & MLX5_FLOW_VLAN_ACTIONS) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "no support for multiple VLAN " + "actions"); +- if (!mlx5_flow_find_action +- (action + 1, RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID) && +- !(item_flags & MLX5_FLOW_LAYER_OUTER_VLAN)) +- return rte_flow_error_set(error, ENOTSUP, +- RTE_FLOW_ERROR_TYPE_ACTION, action, +- "push VLAN needs to match on VLAN in order to " +- "get VLAN VID information because there is " +- "no followed set VLAN VID action"); + if (action_flags & MLX5_FLOW_ACTION_PORT_ID) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "wrong action order, port_id should " + "be after push VLAN"); ++ if (!attr->transfer && priv->representor) ++ return rte_flow_error_set(error, ENOTSUP, ++ RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, ++ "push vlan action for VF representor " ++ "not supported on NIC table"); ++ if (vlan_m && ++ (vlan_m->tci & MLX5DV_FLOW_VLAN_PCP_MASK_BE) && ++ (vlan_m->tci & MLX5DV_FLOW_VLAN_PCP_MASK_BE) != ++ MLX5DV_FLOW_VLAN_PCP_MASK_BE && ++ !(action_flags & MLX5_FLOW_ACTION_OF_SET_VLAN_PCP) && ++ !(mlx5_flow_find_action ++ (action + 1, RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_PCP))) ++ return rte_flow_error_set(error, EINVAL, ++ RTE_FLOW_ERROR_TYPE_ACTION, action, ++ "not full match mask on VLAN PCP and " ++ "there is no of_set_vlan_pcp action, " ++ "push VLAN action cannot figure out " ++ "PCP value"); ++ if (vlan_m && ++ (vlan_m->tci & MLX5DV_FLOW_VLAN_VID_MASK_BE) && ++ (vlan_m->tci & MLX5DV_FLOW_VLAN_VID_MASK_BE) != ++ MLX5DV_FLOW_VLAN_VID_MASK_BE && ++ !(action_flags & MLX5_FLOW_ACTION_OF_SET_VLAN_VID) && ++ !(mlx5_flow_find_action ++ (action + 1, RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID))) ++ return rte_flow_error_set(error, EINVAL, ++ RTE_FLOW_ERROR_TYPE_ACTION, action, ++ "not full match mask on VLAN VID and " ++ "there is no of_set_vlan_vid action, " ++ "push VLAN action cannot figure out " ++ "VID value"); + (void)attr; + return 0; + } +@@ -1643,8 +1847,6 @@ flow_dv_validate_action_push_vlan(uint64_t action_flags, + * Holds the actions detected until now. + * @param[in] actions + * Pointer to the list of actions remaining in the flow rule. +- * @param[in] attr +- * Pointer to flow attributes + * @param[out] error + * Pointer to error structure. + * +@@ -1686,10 +1888,10 @@ flow_dv_validate_action_set_vlan_pcp(uint64_t action_flags, + * + * @param[in] item_flags + * Holds the items detected in this rule. ++ * @param[in] action_flags ++ * Holds the actions detected until now. 
+ * @param[in] actions + * Pointer to the list of actions remaining in the flow rule. +- * @param[in] attr +- * Pointer to flow attributes + * @param[out] error + * Pointer to error structure. + * +@@ -1705,37 +1907,21 @@ flow_dv_validate_action_set_vlan_vid(uint64_t item_flags, + const struct rte_flow_action *action = actions; + const struct rte_flow_action_of_set_vlan_vid *conf = action->conf; + +- if (conf->vlan_vid > RTE_BE16(0xFFE)) ++ if (rte_be_to_cpu_16(conf->vlan_vid) > 0xFFE) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "VLAN VID value is too big"); +- /* there is an of_push_vlan action before us */ +- if (action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN) { +- if (mlx5_flow_find_action(actions + 1, +- RTE_FLOW_ACTION_TYPE_OF_SET_VLAN_VID)) +- return rte_flow_error_set(error, ENOTSUP, +- RTE_FLOW_ERROR_TYPE_ACTION, action, +- "Multiple VLAN VID modifications are " +- "not supported"); +- else +- return 0; +- } +- +- /* +- * Action is on an existing VLAN header: +- * Need to verify this is a single modify CID action. +- * Rule mast include a match on outer VLAN. +- */ ++ if (!(action_flags & MLX5_FLOW_ACTION_OF_PUSH_VLAN) && ++ !(item_flags & MLX5_FLOW_LAYER_OUTER_VLAN)) ++ return rte_flow_error_set(error, ENOTSUP, ++ RTE_FLOW_ERROR_TYPE_ACTION, action, ++ "set VLAN VID action must follow push" ++ " VLAN action or match on VLAN item"); + if (action_flags & MLX5_FLOW_ACTION_OF_SET_VLAN_VID) + return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "Multiple VLAN VID modifications are " + "not supported"); +- if (!(item_flags & MLX5_FLOW_LAYER_OUTER_VLAN)) +- return rte_flow_error_set(error, EINVAL, +- RTE_FLOW_ERROR_TYPE_ACTION, action, +- "match on VLAN is required in order " +- "to set VLAN VID"); + if (action_flags & MLX5_FLOW_ACTION_PORT_ID) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, action, +@@ -1788,10 +1974,6 @@ flow_dv_validate_action_flag(struct rte_eth_dev *dev, + if (ret < 0) + return ret; + assert(ret > 0); +- if (action_flags & MLX5_FLOW_ACTION_DROP) +- return rte_flow_error_set(error, EINVAL, +- RTE_FLOW_ERROR_TYPE_ACTION, NULL, +- "can't drop and flag in same flow"); + if (action_flags & MLX5_FLOW_ACTION_MARK) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, +@@ -1861,10 +2043,6 @@ flow_dv_validate_action_mark(struct rte_eth_dev *dev, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + &mark->id, + "mark id exceeds the limit"); +- if (action_flags & MLX5_FLOW_ACTION_DROP) +- return rte_flow_error_set(error, EINVAL, +- RTE_FLOW_ERROR_TYPE_ACTION, NULL, +- "can't drop and mark in same flow"); + if (action_flags & MLX5_FLOW_ACTION_FLAG) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, +@@ -1883,7 +2061,7 @@ flow_dv_validate_action_mark(struct rte_eth_dev *dev, + * @param[in] dev + * Pointer to the rte_eth_dev structure. + * @param[in] action +- * Pointer to the encap action. ++ * Pointer to the action structure. + * @param[in] action_flags + * Holds the actions detected until now. 
+ * @param[in] attr +@@ -1903,7 +2081,7 @@ flow_dv_validate_action_set_meta(struct rte_eth_dev *dev, + { + const struct rte_flow_action_set_meta *conf; + uint32_t nic_mask = UINT32_MAX; +- enum modify_reg reg; ++ int reg; + + if (!mlx5_flow_ext_mreg_supported(dev)) + return rte_flow_error_set(error, ENOTSUP, +@@ -1931,10 +2109,6 @@ flow_dv_validate_action_set_meta(struct rte_eth_dev *dev, + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "meta data must be within reg C0"); +- if (!(conf->data & conf->mask)) +- return rte_flow_error_set(error, EINVAL, +- RTE_FLOW_ERROR_TYPE_ACTION, action, +- "zero value has no effect"); + return 0; + } + +@@ -1944,7 +2118,7 @@ flow_dv_validate_action_set_meta(struct rte_eth_dev *dev, + * @param[in] dev + * Pointer to the rte_eth_dev structure. + * @param[in] action +- * Pointer to the encap action. ++ * Pointer to the action structure. + * @param[in] action_flags + * Holds the actions detected until now. + * @param[in] attr +@@ -1998,7 +2172,7 @@ flow_dv_validate_action_set_tag(struct rte_eth_dev *dev, + * Validate count action. + * + * @param[in] dev +- * device otr. ++ * Pointer to rte_eth_dev structure. + * @param[out] error + * Pointer to error structure. + * +@@ -2027,12 +2201,14 @@ flow_dv_validate_action_count(struct rte_eth_dev *dev, + /** + * Validate the L2 encap action. + * ++ * @param[in] dev ++ * Pointer to the rte_eth_dev structure. + * @param[in] action_flags + * Holds the actions detected until now. + * @param[in] action +- * Pointer to the encap action. ++ * Pointer to the action structure. + * @param[in] attr +- * Pointer to flow attributes ++ * Pointer to flow attributes. + * @param[out] error + * Pointer to error structure. + * +@@ -2040,36 +2216,36 @@ flow_dv_validate_action_count(struct rte_eth_dev *dev, + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ + static int +-flow_dv_validate_action_l2_encap(uint64_t action_flags, ++flow_dv_validate_action_l2_encap(struct rte_eth_dev *dev, ++ uint64_t action_flags, + const struct rte_flow_action *action, + const struct rte_flow_attr *attr, + struct rte_flow_error *error) + { ++ const struct mlx5_priv *priv = dev->data->dev_private; ++ + if (!(action->conf)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, action, + "configuration cannot be null"); +- if (action_flags & MLX5_FLOW_ACTION_DROP) +- return rte_flow_error_set(error, EINVAL, +- RTE_FLOW_ERROR_TYPE_ACTION, NULL, +- "can't drop and encap in same flow"); +- if (action_flags & (MLX5_FLOW_ENCAP_ACTIONS | MLX5_FLOW_DECAP_ACTIONS)) ++ if (action_flags & MLX5_FLOW_ACTION_ENCAP) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, +- "can only have a single encap or" +- " decap action in a flow"); +- if (!attr->transfer && attr->ingress) ++ "can only have a single encap action " ++ "in a flow"); ++ if (!attr->transfer && priv->representor) + return rte_flow_error_set(error, ENOTSUP, +- RTE_FLOW_ERROR_TYPE_ATTR_INGRESS, +- NULL, +- "encap action not supported for " +- "ingress"); ++ RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, ++ "encap action for VF representor " ++ "not supported on NIC table"); + return 0; + } + + /** +- * Validate the L2 decap action. ++ * Validate a decap action. + * ++ * @param[in] dev ++ * Pointer to the rte_eth_dev structure. + * @param[in] action_flags + * Holds the actions detected until now. 
+ * @param[in] attr +@@ -2081,19 +2257,20 @@ flow_dv_validate_action_l2_encap(uint64_t action_flags, + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ + static int +-flow_dv_validate_action_l2_decap(uint64_t action_flags, +- const struct rte_flow_attr *attr, +- struct rte_flow_error *error) ++flow_dv_validate_action_decap(struct rte_eth_dev *dev, ++ uint64_t action_flags, ++ const struct rte_flow_attr *attr, ++ struct rte_flow_error *error) + { +- if (action_flags & MLX5_FLOW_ACTION_DROP) +- return rte_flow_error_set(error, EINVAL, +- RTE_FLOW_ERROR_TYPE_ACTION, NULL, +- "can't drop and decap in same flow"); +- if (action_flags & (MLX5_FLOW_ENCAP_ACTIONS | MLX5_FLOW_DECAP_ACTIONS)) +- return rte_flow_error_set(error, EINVAL, ++ const struct mlx5_priv *priv = dev->data->dev_private; ++ ++ if (action_flags & MLX5_FLOW_XCAP_ACTIONS) ++ return rte_flow_error_set(error, ENOTSUP, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, +- "can only have a single encap or" +- " decap action in a flow"); ++ action_flags & ++ MLX5_FLOW_ACTION_DECAP ? "can only " ++ "have a single decap action" : "decap " ++ "after encap is not supported"); + if (action_flags & MLX5_FLOW_MODIFY_HDR_ACTIONS) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, +@@ -2105,69 +2282,31 @@ flow_dv_validate_action_l2_decap(uint64_t action_flags, + NULL, + "decap action not supported for " + "egress"); +- return 0; +-} +- +-/** +- * Validate the raw encap action. +- * +- * @param[in] action_flags +- * Holds the actions detected until now. +- * @param[in] action +- * Pointer to the encap action. +- * @param[in] attr +- * Pointer to flow attributes +- * @param[out] error +- * Pointer to error structure. +- * +- * @return +- * 0 on success, a negative errno value otherwise and rte_errno is set. +- */ +-static int +-flow_dv_validate_action_raw_encap(uint64_t action_flags, +- const struct rte_flow_action *action, +- const struct rte_flow_attr *attr, +- struct rte_flow_error *error) +-{ +- const struct rte_flow_action_raw_encap *raw_encap = +- (const struct rte_flow_action_raw_encap *)action->conf; +- if (!(action->conf)) +- return rte_flow_error_set(error, EINVAL, +- RTE_FLOW_ERROR_TYPE_ACTION, action, +- "configuration cannot be null"); +- if (action_flags & MLX5_FLOW_ACTION_DROP) +- return rte_flow_error_set(error, EINVAL, +- RTE_FLOW_ERROR_TYPE_ACTION, NULL, +- "can't drop and encap in same flow"); +- if (action_flags & MLX5_FLOW_ENCAP_ACTIONS) +- return rte_flow_error_set(error, EINVAL, +- RTE_FLOW_ERROR_TYPE_ACTION, NULL, +- "can only have a single encap" +- " action in a flow"); +- /* encap without preceding decap is not supported for ingress */ +- if (!attr->transfer && attr->ingress && +- !(action_flags & MLX5_FLOW_ACTION_RAW_DECAP)) ++ if (!attr->transfer && priv->representor) + return rte_flow_error_set(error, ENOTSUP, +- RTE_FLOW_ERROR_TYPE_ATTR_INGRESS, +- NULL, +- "encap action not supported for " +- "ingress"); +- if (!raw_encap->size || !raw_encap->data) +- return rte_flow_error_set(error, EINVAL, +- RTE_FLOW_ERROR_TYPE_ACTION, action, +- "raw encap data cannot be empty"); ++ RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, ++ "decap action for VF representor " ++ "not supported on NIC table"); + return 0; + } + ++const struct rte_flow_action_raw_decap empty_decap = {.data = NULL, .size = 0,}; ++ + /** +- * Validate the raw decap action. ++ * Validate the raw encap and decap actions. + * +- * @param[in] action_flags +- * Holds the actions detected until now. 
+- * @param[in] action ++ * @param[in] dev ++ * Pointer to the rte_eth_dev structure. ++ * @param[in] decap ++ * Pointer to the decap action. ++ * @param[in] encap + * Pointer to the encap action. + * @param[in] attr + * Pointer to flow attributes ++ * @param[in/out] action_flags ++ * Holds the actions detected until now. ++ * @param[out] actions_n ++ * pointer to the number of actions counter. + * @param[out] error + * Pointer to error structure. + * +@@ -2175,41 +2314,72 @@ flow_dv_validate_action_raw_encap(uint64_t action_flags, + * 0 on success, a negative errno value otherwise and rte_errno is set. + */ + static int +-flow_dv_validate_action_raw_decap(uint64_t action_flags, +- const struct rte_flow_action *action, +- const struct rte_flow_attr *attr, +- struct rte_flow_error *error) +-{ +- const struct rte_flow_action_raw_decap *decap = action->conf; ++flow_dv_validate_action_raw_encap_decap ++ (struct rte_eth_dev *dev, ++ const struct rte_flow_action_raw_decap *decap, ++ const struct rte_flow_action_raw_encap *encap, ++ const struct rte_flow_attr *attr, uint64_t *action_flags, ++ int *actions_n, struct rte_flow_error *error) ++{ ++ const struct mlx5_priv *priv = dev->data->dev_private; ++ int ret; + +- if (action_flags & MLX5_FLOW_ACTION_DROP) ++ if (encap && (!encap->size || !encap->data)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, +- "can't drop and decap in same flow"); +- if (action_flags & MLX5_FLOW_ENCAP_ACTIONS) +- return rte_flow_error_set(error, EINVAL, +- RTE_FLOW_ERROR_TYPE_ACTION, NULL, +- "can't have encap action before" +- " decap action"); +- if (action_flags & MLX5_FLOW_DECAP_ACTIONS) +- return rte_flow_error_set(error, EINVAL, +- RTE_FLOW_ERROR_TYPE_ACTION, NULL, +- "can only have a single decap" +- " action in a flow"); +- /* decap action is valid on egress only if it is followed by encap */ +- if (attr->egress && decap && +- decap->size > MLX5_ENCAPSULATION_DECISION_SIZE) { +- return rte_flow_error_set(error, ENOTSUP, +- RTE_FLOW_ERROR_TYPE_ATTR_EGRESS, +- NULL, "decap action not supported" +- " for egress"); +- } else if (decap && decap->size > MLX5_ENCAPSULATION_DECISION_SIZE && +- (action_flags & MLX5_FLOW_MODIFY_HDR_ACTIONS)) { +- return rte_flow_error_set(error, EINVAL, +- RTE_FLOW_ERROR_TYPE_ACTION, +- NULL, +- "can't have decap action " +- "after modify action"); ++ "raw encap data cannot be empty"); ++ if (decap && encap) { ++ if (decap->size <= MLX5_ENCAPSULATION_DECISION_SIZE && ++ encap->size > MLX5_ENCAPSULATION_DECISION_SIZE) ++ /* L3 encap. */ ++ decap = NULL; ++ else if (encap->size <= ++ MLX5_ENCAPSULATION_DECISION_SIZE && ++ decap->size > ++ MLX5_ENCAPSULATION_DECISION_SIZE) ++ /* L3 decap. */ ++ encap = NULL; ++ else if (encap->size > ++ MLX5_ENCAPSULATION_DECISION_SIZE && ++ decap->size > ++ MLX5_ENCAPSULATION_DECISION_SIZE) ++ /* 2 L2 actions: encap and decap. 
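++ * Both buffers are larger than MLX5_ENCAPSULATION_DECISION_SIZE, i.e. each carries a full L2 header, so neither action can be folded into an L3 rewrite.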
*/ ++ ; ++ else ++ return rte_flow_error_set(error, ++ ENOTSUP, ++ RTE_FLOW_ERROR_TYPE_ACTION, ++ NULL, "unsupported too small " ++ "raw decap and too small raw " ++ "encap combination"); ++ } ++ if (decap) { ++ ret = flow_dv_validate_action_decap(dev, *action_flags, attr, ++ error); ++ if (ret < 0) ++ return ret; ++ *action_flags |= MLX5_FLOW_ACTION_DECAP; ++ ++(*actions_n); ++ } ++ if (encap) { ++ if (encap->size <= MLX5_ENCAPSULATION_DECISION_SIZE) ++ return rte_flow_error_set(error, ENOTSUP, ++ RTE_FLOW_ERROR_TYPE_ACTION, ++ NULL, ++ "small raw encap size"); ++ if (*action_flags & MLX5_FLOW_ACTION_ENCAP) ++ return rte_flow_error_set(error, EINVAL, ++ RTE_FLOW_ERROR_TYPE_ACTION, ++ NULL, ++ "more than one encap action"); ++ if (!attr->transfer && priv->representor) ++ return rte_flow_error_set ++ (error, ENOTSUP, ++ RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, ++ "encap action for VF representor " ++ "not supported on NIC table"); ++ *action_flags |= MLX5_FLOW_ACTION_ENCAP; ++ ++(*actions_n); + } + return 0; + } +@@ -2248,7 +2418,6 @@ flow_dv_encap_decap_resource_register + domain = sh->rx_domain; + else + domain = sh->tx_domain; +- + /* Lookup a matching resource from cache. */ + LIST_FOREACH(cache_resource, &sh->encaps_decaps, next) { + if (resource->reformat_type == cache_resource->reformat_type && +@@ -2334,6 +2503,8 @@ flow_dv_jump_tbl_resource_register + DRV_LOG(DEBUG, "new jump table resource %p: refcnt %d++", + (void *)&tbl_data->jump, cnt); + } else { ++ /* old jump should not make the table ref++. */ ++ flow_dv_tbl_resource_release(dev, &tbl_data->tbl); + assert(tbl_data->jump.action); + DRV_LOG(DEBUG, "existed jump table resource %p: refcnt %d++", + (void *)&tbl_data->jump, cnt); +@@ -2799,8 +2970,6 @@ flow_dv_create_action_l2_encap(struct rte_eth_dev *dev, + (const struct rte_flow_action_raw_encap *)action->conf; + res.size = raw_encap_data->size; + memcpy(res.buf, raw_encap_data->data, res.size); +- if (flow_dv_zero_encap_udp_csum(res.buf, error)) +- return -rte_errno; + } else { + if (action->type == RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP) + encap_data = +@@ -2814,6 +2983,8 @@ flow_dv_create_action_l2_encap(struct rte_eth_dev *dev, + &res.size, error)) + return -rte_errno; + } ++ if (flow_dv_zero_encap_udp_csum(res.buf, error)) ++ return -rte_errno; + if (flow_dv_encap_decap_resource_register(dev, &res, dev_flow, error)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, +@@ -2907,12 +3078,12 @@ flow_dv_create_action_raw_encap(struct rte_eth_dev *dev, + * + * @param[in] dev + * Pointer to rte_eth_dev structure. +- * @param[in] vlan_tag +- * the vlan tag to push to the Ethernet header. +- * @param[in, out] dev_flow +- * Pointer to the mlx5_flow. + * @param[in] attr + * Pointer to the flow attributes. ++ * @param[in] vlan ++ * Pointer to the vlan to push to the Ethernet header. ++ * @param[in, out] dev_flow ++ * Pointer to the mlx5_flow. + * @param[out] error + * Pointer to the error structure. 
+ * +@@ -2962,7 +3133,7 @@ flow_dv_validate_action_modify_hdr(const uint64_t action_flags, + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION_CONF, + NULL, "action configuration not set"); +- if (action_flags & MLX5_FLOW_ENCAP_ACTIONS) ++ if (action_flags & MLX5_FLOW_ACTION_ENCAP) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "can't have encap action before" +@@ -3026,10 +3197,14 @@ flow_dv_validate_action_modify_ipv4(const uint64_t action_flags, + struct rte_flow_error *error) + { + int ret = 0; ++ uint64_t layer; + + ret = flow_dv_validate_action_modify_hdr(action_flags, action, error); + if (!ret) { +- if (!(item_flags & MLX5_FLOW_LAYER_L3_IPV4)) ++ layer = (action_flags & MLX5_FLOW_ACTION_DECAP) ? ++ MLX5_FLOW_LAYER_INNER_L3_IPV4 : ++ MLX5_FLOW_LAYER_OUTER_L3_IPV4; ++ if (!(item_flags & layer)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, +@@ -3060,10 +3235,14 @@ flow_dv_validate_action_modify_ipv6(const uint64_t action_flags, + struct rte_flow_error *error) + { + int ret = 0; ++ uint64_t layer; + + ret = flow_dv_validate_action_modify_hdr(action_flags, action, error); + if (!ret) { +- if (!(item_flags & MLX5_FLOW_LAYER_L3_IPV6)) ++ layer = (action_flags & MLX5_FLOW_ACTION_DECAP) ? ++ MLX5_FLOW_LAYER_INNER_L3_IPV6 : ++ MLX5_FLOW_LAYER_OUTER_L3_IPV6; ++ if (!(item_flags & layer)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, +@@ -3094,10 +3273,14 @@ flow_dv_validate_action_modify_tp(const uint64_t action_flags, + struct rte_flow_error *error) + { + int ret = 0; ++ uint64_t layer; + + ret = flow_dv_validate_action_modify_hdr(action_flags, action, error); + if (!ret) { +- if (!(item_flags & MLX5_FLOW_LAYER_L4)) ++ layer = (action_flags & MLX5_FLOW_ACTION_DECAP) ? ++ MLX5_FLOW_LAYER_INNER_L4 : ++ MLX5_FLOW_LAYER_OUTER_L4; ++ if (!(item_flags & layer)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, "no transport layer " +@@ -3129,10 +3312,14 @@ flow_dv_validate_action_modify_tcp_seq(const uint64_t action_flags, + struct rte_flow_error *error) + { + int ret = 0; ++ uint64_t layer; + + ret = flow_dv_validate_action_modify_hdr(action_flags, action, error); + if (!ret) { +- if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_TCP)) ++ layer = (action_flags & MLX5_FLOW_ACTION_DECAP) ? ++ MLX5_FLOW_LAYER_INNER_L4_TCP : ++ MLX5_FLOW_LAYER_OUTER_L4_TCP; ++ if (!(item_flags & layer)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, "no TCP item in" +@@ -3174,10 +3361,14 @@ flow_dv_validate_action_modify_tcp_ack(const uint64_t action_flags, + struct rte_flow_error *error) + { + int ret = 0; ++ uint64_t layer; + + ret = flow_dv_validate_action_modify_hdr(action_flags, action, error); + if (!ret) { +- if (!(item_flags & MLX5_FLOW_LAYER_OUTER_L4_TCP)) ++ layer = (action_flags & MLX5_FLOW_ACTION_DECAP) ? ++ MLX5_FLOW_LAYER_INNER_L4_TCP : ++ MLX5_FLOW_LAYER_OUTER_L4_TCP; ++ if (!(item_flags & layer)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, "no TCP item in" +@@ -3218,10 +3409,14 @@ flow_dv_validate_action_modify_ttl(const uint64_t action_flags, + struct rte_flow_error *error) + { + int ret = 0; ++ uint64_t layer; + + ret = flow_dv_validate_action_modify_hdr(action_flags, action, error); + if (!ret) { +- if (!(item_flags & MLX5_FLOW_LAYER_L3)) ++ layer = (action_flags & MLX5_FLOW_ACTION_DECAP) ? 
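++ /* A preceding decap exposes the inner header, so validate against the inner L3 layer. */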
++ MLX5_FLOW_LAYER_INNER_L3 : ++ MLX5_FLOW_LAYER_OUTER_L3; ++ if (!(item_flags & layer)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, + NULL, +@@ -3273,7 +3468,7 @@ flow_dv_validate_action_jump(const struct rte_flow_action *action, + target_group = + ((const struct rte_flow_action_jump *)action->conf)->group; + ret = mlx5_flow_group_to_table(attributes, external, target_group, +- &table, error); ++ true, &table, error); + if (ret) + return ret; + if (attributes->group == target_group) +@@ -3359,21 +3554,24 @@ flow_dv_validate_action_port_id(struct rte_eth_dev *dev, + * + * @param dev + * Pointer to rte_eth_dev structure. ++ * @param flags ++ * Flags bits to check if root level. + * + * @return + * Max number of modify header actions device can support. + */ +-static unsigned int +-flow_dv_modify_hdr_action_max(struct rte_eth_dev *dev) ++static inline unsigned int ++flow_dv_modify_hdr_action_max(struct rte_eth_dev *dev __rte_unused, ++ uint64_t flags) + { + /* +- * There's no way to directly query the max cap. Although it has to be +- * acquried by iterative trial, it is a safe assumption that more +- * actions are supported by FW if extensive metadata register is +- * supported. ++ * There's no way to directly query the max capacity from FW. ++ * The maximal value on root table should be assumed to be supported. + */ +- return mlx5_flow_ext_mreg_supported(dev) ? MLX5_MODIFY_NUM : +- MLX5_MODIFY_NUM_NO_MREG; ++ if (!(flags & MLX5DV_DR_ACTION_FLAGS_ROOT_LEVEL)) ++ return MLX5_MAX_MODIFY_NUM; ++ else ++ return MLX5_ROOT_TBL_MODIFY_NUM; + } + + /** +@@ -3402,7 +3600,12 @@ mlx5_flow_validate_action_meter(struct rte_eth_dev *dev, + { + struct mlx5_priv *priv = dev->data->dev_private; + const struct rte_flow_action_meter *am = action->conf; +- struct mlx5_flow_meter *fm = mlx5_flow_meter_find(priv, am->mtr_id); ++ struct mlx5_flow_meter *fm; ++ ++ if (!am) ++ return rte_flow_error_set(error, EINVAL, ++ RTE_FLOW_ERROR_TYPE_ACTION, NULL, ++ "meter action conf is NULL"); + + if (action_flags & MLX5_FLOW_ACTION_METER) + return rte_flow_error_set(error, ENOTSUP, +@@ -3417,6 +3620,7 @@ mlx5_flow_validate_action_meter(struct rte_eth_dev *dev, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, + NULL, + "meter action not supported"); ++ fm = mlx5_flow_meter_find(priv, am->mtr_id); + if (!fm) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, +@@ -3458,8 +3662,12 @@ flow_dv_modify_hdr_resource_register + struct mlx5_ibv_shared *sh = priv->sh; + struct mlx5_flow_dv_modify_hdr_resource *cache_resource; + struct mlx5dv_dr_domain *ns; ++ uint32_t actions_len; + +- if (resource->actions_num > flow_dv_modify_hdr_action_max(dev)) ++ resource->flags = ++ dev_flow->group ? 0 : MLX5DV_DR_ACTION_FLAGS_ROOT_LEVEL; ++ if (resource->actions_num > flow_dv_modify_hdr_action_max(dev, ++ resource->flags)) + return rte_flow_error_set(error, EOVERFLOW, + RTE_FLOW_ERROR_TYPE_ACTION, NULL, + "too many modify header items"); +@@ -3469,17 +3677,15 @@ flow_dv_modify_hdr_resource_register + ns = sh->tx_domain; + else + ns = sh->rx_domain; +- resource->flags = +- dev_flow->group ? 0 : MLX5DV_DR_ACTION_FLAGS_ROOT_LEVEL; + /* Lookup a matching resource from cache. 
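+ * A hit requires the same ft_type, actions_num and flags, plus byte-identical modification commands (see the memcmp below).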
*/ ++ actions_len = resource->actions_num * sizeof(resource->actions[0]); + LIST_FOREACH(cache_resource, &sh->modify_cmds, next) { + if (resource->ft_type == cache_resource->ft_type && + resource->actions_num == cache_resource->actions_num && + resource->flags == cache_resource->flags && + !memcmp((const void *)resource->actions, + (const void *)cache_resource->actions, +- (resource->actions_num * +- sizeof(resource->actions[0])))) { ++ actions_len)) { + DRV_LOG(DEBUG, "modify-header resource %p: refcnt %d++", + (void *)cache_resource, + rte_atomic32_read(&cache_resource->refcnt)); +@@ -3489,18 +3695,18 @@ flow_dv_modify_hdr_resource_register + } + } + /* Register new modify-header resource. */ +- cache_resource = rte_calloc(__func__, 1, sizeof(*cache_resource), 0); ++ cache_resource = rte_calloc(__func__, 1, ++ sizeof(*cache_resource) + actions_len, 0); + if (!cache_resource) + return rte_flow_error_set(error, ENOMEM, + RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL, + "cannot allocate resource memory"); + *cache_resource = *resource; ++ rte_memcpy(cache_resource->actions, resource->actions, actions_len); + cache_resource->verbs_action = + mlx5_glue->dv_create_flow_action_modify_header +- (sh->ctx, cache_resource->ft_type, +- ns, cache_resource->flags, +- cache_resource->actions_num * +- sizeof(cache_resource->actions[0]), ++ (sh->ctx, cache_resource->ft_type, ns, ++ cache_resource->flags, actions_len, + (uint64_t *)cache_resource->actions); + if (!cache_resource->verbs_action) { + rte_free(cache_resource); +@@ -3846,11 +4052,13 @@ _flow_dv_query_count(struct rte_eth_dev *dev, + * The devX counter handle. + * @param[in] batch + * Whether the pool is for counter that was allocated by batch command. ++ * @param[in/out] cont_cur ++ * Pointer to the container pointer, it will be update in pool resize. + * + * @return +- * A new pool pointer on success, NULL otherwise and rte_errno is set. ++ * The pool container pointer on success, NULL otherwise and rte_errno is set. + */ +-static struct mlx5_flow_counter_pool * ++static struct mlx5_pools_container * + flow_dv_pool_create(struct rte_eth_dev *dev, struct mlx5_devx_obj *dcs, + uint32_t batch) + { +@@ -3884,12 +4092,12 @@ flow_dv_pool_create(struct rte_eth_dev *dev, struct mlx5_devx_obj *dcs, + */ + rte_atomic64_set(&pool->query_gen, 0x2); + TAILQ_INIT(&pool->counters); +- TAILQ_INSERT_TAIL(&cont->pool_list, pool, next); ++ TAILQ_INSERT_HEAD(&cont->pool_list, pool, next); + cont->pools[n_valid] = pool; + /* Pool initialization must be updated before host thread access. */ + rte_cio_wmb(); + rte_atomic16_add(&cont->n_valid, 1); +- return pool; ++ return cont; + } + + /** +@@ -3903,33 +4111,35 @@ flow_dv_pool_create(struct rte_eth_dev *dev, struct mlx5_devx_obj *dcs, + * Whether the pool is for counter that was allocated by batch command. + * + * @return +- * The free counter pool pointer and @p cnt_free is set on success, ++ * The counter container pointer and @p cnt_free is set on success, + * NULL otherwise and rte_errno is set. 
+ */ +-static struct mlx5_flow_counter_pool * ++static struct mlx5_pools_container * + flow_dv_counter_pool_prepare(struct rte_eth_dev *dev, + struct mlx5_flow_counter **cnt_free, + uint32_t batch) + { + struct mlx5_priv *priv = dev->data->dev_private; ++ struct mlx5_pools_container *cont; + struct mlx5_flow_counter_pool *pool; + struct mlx5_devx_obj *dcs = NULL; + struct mlx5_flow_counter *cnt; + uint32_t i; + ++ cont = MLX5_CNT_CONTAINER(priv->sh, batch, 0); + if (!batch) { + /* bulk_bitmap must be 0 for single counter allocation. */ + dcs = mlx5_devx_cmd_flow_counter_alloc(priv->sh->ctx, 0); + if (!dcs) + return NULL; +- pool = flow_dv_find_pool_by_id +- (MLX5_CNT_CONTAINER(priv->sh, batch, 0), dcs->id); ++ pool = flow_dv_find_pool_by_id(cont, dcs->id); + if (!pool) { +- pool = flow_dv_pool_create(dev, dcs, batch); +- if (!pool) { ++ cont = flow_dv_pool_create(dev, dcs, batch); ++ if (!cont) { + mlx5_devx_cmd_destroy(dcs); + return NULL; + } ++ pool = TAILQ_FIRST(&cont->pool_list); + } else if (dcs->id < pool->min_dcs->id) { + rte_atomic64_set(&pool->a64_dcs, + (int64_t)(uintptr_t)dcs); +@@ -3938,7 +4148,7 @@ flow_dv_counter_pool_prepare(struct rte_eth_dev *dev, + TAILQ_INSERT_HEAD(&pool->counters, cnt, next); + cnt->dcs = dcs; + *cnt_free = cnt; +- return pool; ++ return cont; + } + /* bulk_bitmap is in 128 counters units. */ + if (priv->config.hca_attr.flow_counter_bulk_alloc_bitmap & 0x4) +@@ -3947,18 +4157,19 @@ flow_dv_counter_pool_prepare(struct rte_eth_dev *dev, + rte_errno = ENODATA; + return NULL; + } +- pool = flow_dv_pool_create(dev, dcs, batch); +- if (!pool) { ++ cont = flow_dv_pool_create(dev, dcs, batch); ++ if (!cont) { + mlx5_devx_cmd_destroy(dcs); + return NULL; + } ++ pool = TAILQ_FIRST(&cont->pool_list); + for (i = 0; i < MLX5_COUNTERS_PER_POOL; ++i) { + cnt = &pool->counters_raw[i]; + cnt->pool = pool; + TAILQ_INSERT_HEAD(&pool->counters, cnt, next); + } + *cnt_free = &pool->counters_raw[0]; +- return pool; ++ return cont; + } + + /** +@@ -4059,9 +4270,10 @@ flow_dv_counter_alloc(struct rte_eth_dev *dev, uint32_t shared, uint32_t id, + cnt_free = NULL; + } + if (!cnt_free) { +- pool = flow_dv_counter_pool_prepare(dev, &cnt_free, batch); +- if (!pool) ++ cont = flow_dv_counter_pool_prepare(dev, &cnt_free, batch); ++ if (!cont) + return NULL; ++ pool = TAILQ_FIRST(&cont->pool_list); + } + cnt_free->batch = batch; + /* Create a DV counter action only in the first time usage. */ +@@ -4146,7 +4358,9 @@ flow_dv_counter_release(struct rte_eth_dev *dev, + * Pointer to error structure. + * + * @return +- * 0 on success, a negative errno value otherwise and rte_errno is set. ++ * - 0 on success and non root table. ++ * - 1 on success and root table. ++ * - a negative errno value otherwise and rte_errno is set. 
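++ * (The root-table success value is MLX5DV_DR_ACTION_FLAGS_ROOT_LEVEL, which the caller stores in is_root and passes to flow_dv_modify_hdr_action_max().)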
+ */ + static int + flow_dv_validate_attributes(struct rte_eth_dev *dev, +@@ -4156,6 +4370,7 @@ flow_dv_validate_attributes(struct rte_eth_dev *dev, + { + struct mlx5_priv *priv = dev->data->dev_private; + uint32_t priority_max = priv->config.flow_prio - 1; ++ int ret = 0; + + #ifndef HAVE_MLX5DV_DR + if (attributes->group) +@@ -4164,14 +4379,15 @@ flow_dv_validate_attributes(struct rte_eth_dev *dev, + NULL, + "groups are not supported"); + #else +- uint32_t table; +- int ret; ++ uint32_t table = 0; + + ret = mlx5_flow_group_to_table(attributes, external, +- attributes->group, ++ attributes->group, !!priv->fdb_def_rule, + &table, error); + if (ret) + return ret; ++ if (!table) ++ ret = MLX5DV_DR_ACTION_FLAGS_ROOT_LEVEL; + #endif + if (attributes->priority != MLX5_FLOW_PRIO_RSVD && + attributes->priority >= priority_max) +@@ -4201,7 +4417,7 @@ flow_dv_validate_attributes(struct rte_eth_dev *dev, + RTE_FLOW_ERROR_TYPE_ATTR, NULL, + "must specify exactly one of " + "ingress or egress"); +- return 0; ++ return ret; + } + + /** +@@ -4217,6 +4433,8 @@ flow_dv_validate_attributes(struct rte_eth_dev *dev, + * Pointer to the list of actions. + * @param[in] external + * This flow rule is created by request external to PMD. ++ * @param[in] hairpin ++ * Number of hairpin TX actions, 0 means classic flow. + * @param[out] error + * Pointer to the error structure. + * +@@ -4227,7 +4445,7 @@ static int + flow_dv_validate(struct rte_eth_dev *dev, const struct rte_flow_attr *attr, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], +- bool external, struct rte_flow_error *error) ++ bool external, int hairpin, struct rte_flow_error *error) + { + int ret; + uint64_t action_flags = 0; +@@ -4236,7 +4454,11 @@ flow_dv_validate(struct rte_eth_dev *dev, const struct rte_flow_attr *attr, + uint8_t next_protocol = 0xff; + uint16_t ether_type = 0; + int actions_n = 0; ++ uint8_t item_ipv6_proto = 0; + const struct rte_flow_item *gre_item = NULL; ++ const struct rte_flow_action_raw_decap *decap; ++ const struct rte_flow_action_raw_encap *encap; ++ const struct rte_flow_action_rss *rss; + struct rte_flow_item_tcp nic_tcp_mask = { + .hdr = { + .tcp_flags = 0xFF, +@@ -4246,12 +4468,17 @@ flow_dv_validate(struct rte_eth_dev *dev, const struct rte_flow_attr *attr, + }; + struct mlx5_priv *priv = dev->data->dev_private; + struct mlx5_dev_config *dev_conf = &priv->config; ++ uint16_t queue_index = 0xFFFF; ++ const struct rte_flow_item_vlan *vlan_m = NULL; ++ int16_t rw_act_num = 0; ++ uint64_t is_root; + + if (items == NULL) + return -1; + ret = flow_dv_validate_attributes(dev, attr, external, error); + if (ret < 0) + return ret; ++ is_root = (uint64_t)ret; + for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) { + int tunnel = !!(item_flags & MLX5_FLOW_LAYER_TUNNEL); + int type = items->type; +@@ -4286,8 +4513,8 @@ flow_dv_validate(struct rte_eth_dev *dev, const struct rte_flow_attr *attr, + } + break; + case RTE_FLOW_ITEM_TYPE_VLAN: +- ret = mlx5_flow_validate_item_vlan(items, item_flags, +- dev, error); ++ ret = flow_dv_validate_item_vlan(items, item_flags, ++ dev, error); + if (ret < 0) + return ret; + last_item = tunnel ? MLX5_FLOW_LAYER_INNER_VLAN : +@@ -4303,6 +4530,9 @@ flow_dv_validate(struct rte_eth_dev *dev, const struct rte_flow_attr *attr, + } else { + ether_type = 0; + } ++ /* Store outer VLAN mask for of_push_vlan action. 
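++ * The saved mask is handed to flow_dv_validate_action_push_vlan() so it can check that the PCP/VID bits are fully masked when no explicit set_vlan_pcp/set_vlan_vid action follows.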
*/ ++ if (!tunnel) ++ vlan_m = items->mask; + break; + case RTE_FLOW_ITEM_TYPE_IPV4: + mlx5_flow_tunnel_ip_check(items, next_protocol, +@@ -4343,6 +4573,9 @@ flow_dv_validate(struct rte_eth_dev *dev, const struct rte_flow_attr *attr, + if (items->mask != NULL && + ((const struct rte_flow_item_ipv6 *) + items->mask)->hdr.proto) { ++ item_ipv6_proto = ++ ((const struct rte_flow_item_ipv6 *) ++ items->spec)->hdr.proto; + next_protocol = + ((const struct rte_flow_item_ipv6 *) + items->spec)->hdr.proto; +@@ -4418,7 +4651,7 @@ flow_dv_validate(struct rte_eth_dev *dev, const struct rte_flow_attr *attr, + error); + if (ret < 0) + return ret; +- last_item = MLX5_FLOW_LAYER_VXLAN_GPE; ++ last_item = MLX5_FLOW_LAYER_GENEVE; + break; + case RTE_FLOW_ITEM_TYPE_MPLS: + ret = mlx5_flow_validate_item_mpls(dev, items, +@@ -4457,6 +4690,7 @@ flow_dv_validate(struct rte_eth_dev *dev, const struct rte_flow_attr *attr, + error); + if (ret < 0) + return ret; ++ item_ipv6_proto = IPPROTO_ICMPV6; + last_item = MLX5_FLOW_LAYER_ICMP6; + break; + case RTE_FLOW_ITEM_TYPE_TAG: +@@ -4512,6 +4746,7 @@ flow_dv_validate(struct rte_eth_dev *dev, const struct rte_flow_attr *attr, + action_flags |= MLX5_FLOW_ACTION_FLAG; + ++actions_n; + } ++ rw_act_num += MLX5_ACT_NUM_SET_MARK; + break; + case RTE_FLOW_ACTION_TYPE_MARK: + ret = flow_dv_validate_action_mark(dev, actions, +@@ -4530,6 +4765,7 @@ flow_dv_validate(struct rte_eth_dev *dev, const struct rte_flow_attr *attr, + action_flags |= MLX5_FLOW_ACTION_MARK; + ++actions_n; + } ++ rw_act_num += MLX5_ACT_NUM_SET_MARK; + break; + case RTE_FLOW_ACTION_TYPE_SET_META: + ret = flow_dv_validate_action_set_meta(dev, actions, +@@ -4541,6 +4777,7 @@ flow_dv_validate(struct rte_eth_dev *dev, const struct rte_flow_attr *attr, + if (!(action_flags & MLX5_FLOW_MODIFY_HDR_ACTIONS)) + ++actions_n; + action_flags |= MLX5_FLOW_ACTION_SET_META; ++ rw_act_num += MLX5_ACT_NUM_SET_META; + break; + case RTE_FLOW_ACTION_TYPE_SET_TAG: + ret = flow_dv_validate_action_set_tag(dev, actions, +@@ -4552,6 +4789,7 @@ flow_dv_validate(struct rte_eth_dev *dev, const struct rte_flow_attr *attr, + if (!(action_flags & MLX5_FLOW_MODIFY_HDR_ACTIONS)) + ++actions_n; + action_flags |= MLX5_FLOW_ACTION_SET_TAG; ++ rw_act_num += MLX5_ACT_NUM_SET_TAG; + break; + case RTE_FLOW_ACTION_TYPE_DROP: + ret = mlx5_flow_validate_action_drop(action_flags, +@@ -4567,16 +4805,21 @@ flow_dv_validate(struct rte_eth_dev *dev, const struct rte_flow_attr *attr, + attr, error); + if (ret < 0) + return ret; ++ queue_index = ((const struct rte_flow_action_queue *) ++ (actions->conf))->index; + action_flags |= MLX5_FLOW_ACTION_QUEUE; + ++actions_n; + break; + case RTE_FLOW_ACTION_TYPE_RSS: ++ rss = actions->conf; + ret = mlx5_flow_validate_action_rss(actions, + action_flags, dev, + attr, item_flags, + error); + if (ret < 0) + return ret; ++ if (rss != NULL && rss->queue_num) ++ queue_index = rss->queue[0]; + action_flags |= MLX5_FLOW_ACTION_RSS; + ++actions_n; + break; +@@ -4598,8 +4841,9 @@ flow_dv_validate(struct rte_eth_dev *dev, const struct rte_flow_attr *attr, + ++actions_n; + break; + case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN: +- ret = flow_dv_validate_action_push_vlan(action_flags, +- item_flags, ++ ret = flow_dv_validate_action_push_vlan(dev, ++ action_flags, ++ vlan_m, + actions, attr, + error); + if (ret < 0) +@@ -4623,49 +4867,52 @@ flow_dv_validate(struct rte_eth_dev *dev, const struct rte_flow_attr *attr, + return ret; + /* Count VID with push_vlan command. 
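+ * No extra entry is added to actions_n for it; only the modify-header budget below grows by MLX5_ACT_NUM_MDF_VID.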
*/ + action_flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_VID; ++ rw_act_num += MLX5_ACT_NUM_MDF_VID; + break; + case RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP: + case RTE_FLOW_ACTION_TYPE_NVGRE_ENCAP: +- ret = flow_dv_validate_action_l2_encap(action_flags, ++ ret = flow_dv_validate_action_l2_encap(dev, ++ action_flags, + actions, attr, + error); + if (ret < 0) + return ret; +- action_flags |= actions->type == +- RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP ? +- MLX5_FLOW_ACTION_VXLAN_ENCAP : +- MLX5_FLOW_ACTION_NVGRE_ENCAP; ++ action_flags |= MLX5_FLOW_ACTION_ENCAP; + ++actions_n; + break; + case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP: + case RTE_FLOW_ACTION_TYPE_NVGRE_DECAP: +- ret = flow_dv_validate_action_l2_decap(action_flags, +- attr, error); ++ ret = flow_dv_validate_action_decap(dev, action_flags, ++ attr, error); + if (ret < 0) + return ret; +- action_flags |= actions->type == +- RTE_FLOW_ACTION_TYPE_VXLAN_DECAP ? +- MLX5_FLOW_ACTION_VXLAN_DECAP : +- MLX5_FLOW_ACTION_NVGRE_DECAP; ++ action_flags |= MLX5_FLOW_ACTION_DECAP; + ++actions_n; + break; + case RTE_FLOW_ACTION_TYPE_RAW_ENCAP: +- ret = flow_dv_validate_action_raw_encap(action_flags, +- actions, attr, +- error); ++ ret = flow_dv_validate_action_raw_encap_decap ++ (dev, NULL, actions->conf, attr, &action_flags, ++ &actions_n, error); + if (ret < 0) + return ret; +- action_flags |= MLX5_FLOW_ACTION_RAW_ENCAP; +- ++actions_n; + break; + case RTE_FLOW_ACTION_TYPE_RAW_DECAP: +- ret = flow_dv_validate_action_raw_decap(action_flags, +- actions, attr, +- error); ++ decap = actions->conf; ++ while ((++actions)->type == RTE_FLOW_ACTION_TYPE_VOID) ++ ; ++ if (actions->type != RTE_FLOW_ACTION_TYPE_RAW_ENCAP) { ++ encap = NULL; ++ actions--; ++ } else { ++ encap = actions->conf; ++ } ++ ret = flow_dv_validate_action_raw_encap_decap ++ (dev, ++ decap ? decap : &empty_decap, encap, ++ attr, &action_flags, &actions_n, ++ error); + if (ret < 0) + return ret; +- action_flags |= MLX5_FLOW_ACTION_RAW_DECAP; +- ++actions_n; + break; + case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC: + case RTE_FLOW_ACTION_TYPE_SET_MAC_DST: +@@ -4682,8 +4929,15 @@ flow_dv_validate(struct rte_eth_dev *dev, const struct rte_flow_attr *attr, + RTE_FLOW_ACTION_TYPE_SET_MAC_SRC ? + MLX5_FLOW_ACTION_SET_MAC_SRC : + MLX5_FLOW_ACTION_SET_MAC_DST; ++ /* ++ * Even if the source and destination MAC addresses have ++ * overlap in the header with 4B alignment, the convert ++ * function will handle them separately and 4 SW actions ++ * will be created. And 2 actions will be added each ++ * time no matter how many bytes of address will be set. ++ */ ++ rw_act_num += MLX5_ACT_NUM_MDF_MAC; + break; +- + case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC: + case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST: + ret = flow_dv_validate_action_modify_ipv4(action_flags, +@@ -4699,6 +4953,7 @@ flow_dv_validate(struct rte_eth_dev *dev, const struct rte_flow_attr *attr, + RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC ? + MLX5_FLOW_ACTION_SET_IPV4_SRC : + MLX5_FLOW_ACTION_SET_IPV4_DST; ++ rw_act_num += MLX5_ACT_NUM_MDF_IPV4; + break; + case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC: + case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST: +@@ -4708,6 +4963,12 @@ flow_dv_validate(struct rte_eth_dev *dev, const struct rte_flow_attr *attr, + error); + if (ret < 0) + return ret; ++ if (item_ipv6_proto == IPPROTO_ICMPV6) ++ return rte_flow_error_set(error, ENOTSUP, ++ RTE_FLOW_ERROR_TYPE_ACTION, ++ actions, ++ "Can't change header " ++ "with ICMPv6 proto"); + /* Count all modify-header actions as one action. 
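+ * They are merged into one modify-header resource, hence one device action, so actions_n is bumped only for the first of them.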
*/ + if (!(action_flags & MLX5_FLOW_MODIFY_HDR_ACTIONS)) + ++actions_n; +@@ -4715,6 +4976,7 @@ flow_dv_validate(struct rte_eth_dev *dev, const struct rte_flow_attr *attr, + RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC ? + MLX5_FLOW_ACTION_SET_IPV6_SRC : + MLX5_FLOW_ACTION_SET_IPV6_DST; ++ rw_act_num += MLX5_ACT_NUM_MDF_IPV6; + break; + case RTE_FLOW_ACTION_TYPE_SET_TP_SRC: + case RTE_FLOW_ACTION_TYPE_SET_TP_DST: +@@ -4731,6 +4993,7 @@ flow_dv_validate(struct rte_eth_dev *dev, const struct rte_flow_attr *attr, + RTE_FLOW_ACTION_TYPE_SET_TP_SRC ? + MLX5_FLOW_ACTION_SET_TP_SRC : + MLX5_FLOW_ACTION_SET_TP_DST; ++ rw_act_num += MLX5_ACT_NUM_MDF_PORT; + break; + case RTE_FLOW_ACTION_TYPE_DEC_TTL: + case RTE_FLOW_ACTION_TYPE_SET_TTL: +@@ -4747,6 +5010,7 @@ flow_dv_validate(struct rte_eth_dev *dev, const struct rte_flow_attr *attr, + RTE_FLOW_ACTION_TYPE_SET_TTL ? + MLX5_FLOW_ACTION_SET_TTL : + MLX5_FLOW_ACTION_DEC_TTL; ++ rw_act_num += MLX5_ACT_NUM_MDF_TTL; + break; + case RTE_FLOW_ACTION_TYPE_JUMP: + ret = flow_dv_validate_action_jump(actions, +@@ -4774,6 +5038,7 @@ flow_dv_validate(struct rte_eth_dev *dev, const struct rte_flow_attr *attr, + RTE_FLOW_ACTION_TYPE_INC_TCP_SEQ ? + MLX5_FLOW_ACTION_INC_TCP_SEQ : + MLX5_FLOW_ACTION_DEC_TCP_SEQ; ++ rw_act_num += MLX5_ACT_NUM_MDF_TCPSEQ; + break; + case RTE_FLOW_ACTION_TYPE_INC_TCP_ACK: + case RTE_FLOW_ACTION_TYPE_DEC_TCP_ACK: +@@ -4791,10 +5056,13 @@ flow_dv_validate(struct rte_eth_dev *dev, const struct rte_flow_attr *attr, + RTE_FLOW_ACTION_TYPE_INC_TCP_ACK ? + MLX5_FLOW_ACTION_INC_TCP_ACK : + MLX5_FLOW_ACTION_DEC_TCP_ACK; ++ rw_act_num += MLX5_ACT_NUM_MDF_TCPACK; + break; +- case MLX5_RTE_FLOW_ACTION_TYPE_TAG: + case MLX5_RTE_FLOW_ACTION_TYPE_MARK: ++ break; ++ case MLX5_RTE_FLOW_ACTION_TYPE_TAG: + case MLX5_RTE_FLOW_ACTION_TYPE_COPY_MREG: ++ rw_act_num += MLX5_ACT_NUM_SET_TAG; + break; + case RTE_FLOW_ACTION_TYPE_METER: + ret = mlx5_flow_validate_action_meter(dev, +@@ -4805,6 +5073,8 @@ flow_dv_validate(struct rte_eth_dev *dev, const struct rte_flow_attr *attr, + return ret; + action_flags |= MLX5_FLOW_ACTION_METER; + ++actions_n; ++ /* Meter action will add one more TAG action. */ ++ rw_act_num += MLX5_ACT_NUM_SET_TAG; + break; + default: + return rte_flow_error_set(error, ENOTSUP, +@@ -4813,13 +5083,18 @@ flow_dv_validate(struct rte_eth_dev *dev, const struct rte_flow_attr *attr, + "action not supported"); + } + } +- if ((action_flags & MLX5_FLOW_LAYER_TUNNEL) && +- (action_flags & MLX5_FLOW_VLAN_ACTIONS)) +- return rte_flow_error_set(error, ENOTSUP, +- RTE_FLOW_ERROR_TYPE_ACTION, +- actions, +- "can't have vxlan and vlan" +- " actions in the same rule"); ++ /* ++ * Validate the drop action mutual exclusion with other actions. ++ * Drop action is mutually-exclusive with any other action, except for ++ * Count action. 
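++ * The check below therefore rejects any action_flags that combine DROP with bits outside DROP|COUNT.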
++ */ ++ if ((action_flags & MLX5_FLOW_ACTION_DROP) && ++ (action_flags & ~(MLX5_FLOW_ACTION_DROP | MLX5_FLOW_ACTION_COUNT))) ++ return rte_flow_error_set(error, EINVAL, ++ RTE_FLOW_ERROR_TYPE_ACTION, NULL, ++ "Drop action is mutually-exclusive " ++ "with any other action, except for " ++ "Count action"); + /* Eswitch has few restrictions on using items and actions */ + if (attr->transfer) { + if (!mlx5_flow_ext_mreg_supported(dev) && +@@ -4856,6 +5131,37 @@ flow_dv_validate(struct rte_eth_dev *dev, const struct rte_flow_attr *attr, + actions, + "no fate action is found"); + } ++ /* Continue validation for Xcap actions. */ ++ if ((action_flags & MLX5_FLOW_XCAP_ACTIONS) && (queue_index == 0xFFFF || ++ mlx5_rxq_get_type(dev, queue_index) != MLX5_RXQ_TYPE_HAIRPIN)) { ++ if ((action_flags & MLX5_FLOW_XCAP_ACTIONS) == ++ MLX5_FLOW_XCAP_ACTIONS) ++ return rte_flow_error_set(error, ENOTSUP, ++ RTE_FLOW_ERROR_TYPE_ACTION, ++ NULL, "encap and decap " ++ "combination aren't supported"); ++ if (!attr->transfer && attr->ingress && (action_flags & ++ MLX5_FLOW_ACTION_ENCAP)) ++ return rte_flow_error_set(error, ENOTSUP, ++ RTE_FLOW_ERROR_TYPE_ACTION, ++ NULL, "encap is not supported" ++ " for ingress traffic"); ++ } ++ /* Hairpin flow will add one more TAG action. */ ++ if (hairpin > 0) ++ rw_act_num += MLX5_ACT_NUM_SET_TAG; ++ /* extra metadata enabled: one more TAG action will be added. */ ++ if (dev_conf->dv_flow_en && ++ dev_conf->dv_xmeta_en != MLX5_XMETA_MODE_LEGACY && ++ mlx5_flow_ext_mreg_supported(dev)) ++ rw_act_num += MLX5_ACT_NUM_SET_TAG; ++ if ((uint32_t)rw_act_num > ++ flow_dv_modify_hdr_action_max(dev, is_root)) { ++ return rte_flow_error_set(error, ENOTSUP, ++ RTE_FLOW_ERROR_TYPE_ACTION, ++ NULL, "too many header modify" ++ " actions to support"); ++ } + return 0; + } + +@@ -4984,6 +5290,23 @@ flow_dv_translate_item_eth(void *matcher, void *key, + /* The value must be in the range of the mask. */ + for (i = 0; i < sizeof(eth_m->dst); ++i) + l24_v[i] = eth_m->src.addr_bytes[i] & eth_v->src.addr_bytes[i]; ++ if (eth_v->type) { ++ /* When ethertype is present set mask for tagged VLAN. */ ++ MLX5_SET(fte_match_set_lyr_2_4, headers_m, cvlan_tag, 1); ++ /* Set value for tagged VLAN if ethertype is 802.1Q. */ ++ if (eth_v->type == RTE_BE16(RTE_ETHER_TYPE_VLAN) || ++ eth_v->type == RTE_BE16(RTE_ETHER_TYPE_QINQ)) { ++ MLX5_SET(fte_match_set_lyr_2_4, headers_v, cvlan_tag, ++ 1); ++ /* Return here to avoid setting match on ethertype. */ ++ return; ++ } ++ } ++ /* ++ * HW supports match on one Ethertype, the Ethertype following the last ++ * VLAN tag of the packet (see PRM). ++ * Set match on ethertype only if ETH header is not followed by VLAN. ++ */ + MLX5_SET(fte_match_set_lyr_2_4, headers_m, ethertype, + rte_be_to_cpu_16(eth_m->type)); + l24_v = MLX5_ADDR_OF(fte_match_set_lyr_2_4, headers_v, ethertype); +@@ -5017,10 +5340,6 @@ flow_dv_translate_item_vlan(struct mlx5_flow *dev_flow, + uint16_t tci_m; + uint16_t tci_v; + +- if (!vlan_v) +- return; +- if (!vlan_m) +- vlan_m = &rte_flow_item_vlan_mask; + if (inner) { + headers_m = MLX5_ADDR_OF(fte_match_param, matcher, + inner_headers); +@@ -5033,13 +5352,22 @@ + * This is workaround, masks are not supported, + * and pre-validated.
+ */ +- dev_flow->dv.vf_vlan.tag = +- rte_be_to_cpu_16(vlan_v->tci) & 0x0fff; ++ if (vlan_v) ++ dev_flow->dv.vf_vlan.tag = ++ rte_be_to_cpu_16(vlan_v->tci) & 0x0fff; + } +- tci_m = rte_be_to_cpu_16(vlan_m->tci); +- tci_v = rte_be_to_cpu_16(vlan_m->tci & vlan_v->tci); ++ /* ++ * When VLAN item exists in flow, mark packet as tagged, ++ * even if TCI is not specified. ++ */ + MLX5_SET(fte_match_set_lyr_2_4, headers_m, cvlan_tag, 1); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, cvlan_tag, 1); ++ if (!vlan_v) ++ return; ++ if (!vlan_m) ++ vlan_m = &rte_flow_item_vlan_mask; ++ tci_m = rte_be_to_cpu_16(vlan_m->tci); ++ tci_v = rte_be_to_cpu_16(vlan_m->tci & vlan_v->tci); + MLX5_SET(fte_match_set_lyr_2_4, headers_m, first_vid, tci_m); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, first_vid, tci_v); + MLX5_SET(fte_match_set_lyr_2_4, headers_m, first_cfi, tci_m >> 12); +@@ -5061,6 +5389,8 @@ flow_dv_translate_item_vlan(struct mlx5_flow *dev_flow, + * Flow matcher value. + * @param[in] item + * Flow pattern to translate. ++ * @param[in] item_flags ++ * Bit-fields that hold the items detected until now. + * @param[in] inner + * Item is inner pattern. + * @param[in] group +@@ -5069,6 +5399,7 @@ flow_dv_translate_item_vlan(struct mlx5_flow *dev_flow, + static void + flow_dv_translate_item_ipv4(void *matcher, void *key, + const struct rte_flow_item *item, ++ const uint64_t item_flags, + int inner, uint32_t group) + { + const struct rte_flow_item_ipv4 *ipv4_m = item->mask; +@@ -5101,6 +5432,13 @@ flow_dv_translate_item_ipv4(void *matcher, void *key, + else + MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_version, 0x4); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_version, 4); ++ /* ++ * On outer header (which must contain L2), or inner header with L2, ++ * set cvlan_tag mask bit to mark this packet as untagged. ++ * This should be done even if item->spec is empty. ++ */ ++ if (!inner || item_flags & MLX5_FLOW_LAYER_INNER_L2) ++ MLX5_SET(fte_match_set_lyr_2_4, headers_m, cvlan_tag, 1); + if (!ipv4_v) + return; + if (!ipv4_m) +@@ -5139,6 +5477,8 @@ flow_dv_translate_item_ipv4(void *matcher, void *key, + * Flow matcher value. + * @param[in] item + * Flow pattern to translate. ++ * @param[in] item_flags ++ * Bit-fields that hold the items detected until now. + * @param[in] inner + * Item is inner pattern. + * @param[in] group +@@ -5147,6 +5487,7 @@ flow_dv_translate_item_ipv4(void *matcher, void *key, + static void + flow_dv_translate_item_ipv6(void *matcher, void *key, + const struct rte_flow_item *item, ++ const uint64_t item_flags, + int inner, uint32_t group) + { + const struct rte_flow_item_ipv6 *ipv6_m = item->mask; +@@ -5189,6 +5530,13 @@ flow_dv_translate_item_ipv6(void *matcher, void *key, + else + MLX5_SET(fte_match_set_lyr_2_4, headers_m, ip_version, 0x6); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_version, 6); ++ /* ++ * On outer header (which must contain L2), or inner header with L2, ++ * set cvlan_tag mask bit to mark this packet as untagged. ++ * This should be done even if item->spec is empty.
++ */ ++ if (!inner || item_flags & MLX5_FLOW_LAYER_INNER_L2) ++ MLX5_SET(fte_match_set_lyr_2_4, headers_m, cvlan_tag, 1); + if (!ipv6_v) + return; + if (!ipv6_m) +@@ -5354,13 +5702,13 @@ flow_dv_translate_item_gre_key(void *matcher, void *key, + void *misc_v = MLX5_ADDR_OF(fte_match_param, key, misc_parameters); + rte_be32_t gre_key_default_mask = RTE_BE32(UINT32_MAX); + ++ /* GRE K bit must be on and should already be validated */ ++ MLX5_SET(fte_match_set_misc, misc_m, gre_k_present, 1); ++ MLX5_SET(fte_match_set_misc, misc_v, gre_k_present, 1); + if (!key_v) + return; + if (!key_m) + key_m = &gre_key_default_mask; +- /* GRE K bit must be on and should already be validated */ +- MLX5_SET(fte_match_set_misc, misc_m, gre_k_present, 1); +- MLX5_SET(fte_match_set_misc, misc_v, gre_k_present, 1); + MLX5_SET(fte_match_set_misc, misc_m, gre_key_h, + rte_be_to_cpu_32(*key_m) >> 8); + MLX5_SET(fte_match_set_misc, misc_v, gre_key_h, +@@ -5558,6 +5906,76 @@ flow_dv_translate_item_vxlan(void *matcher, void *key, + vni_v[i] = vni_m[i] & vxlan_v->vni[i]; + } + ++/** ++ * Add VXLAN-GPE item to matcher and to the value. ++ * ++ * @param[in, out] matcher ++ * Flow matcher. ++ * @param[in, out] key ++ * Flow matcher value. ++ * @param[in] item ++ * Flow pattern to translate. ++ * @param[in] inner ++ * Item is inner pattern. ++ */ ++ ++static void ++flow_dv_translate_item_vxlan_gpe(void *matcher, void *key, ++ const struct rte_flow_item *item, int inner) ++{ ++ const struct rte_flow_item_vxlan_gpe *vxlan_m = item->mask; ++ const struct rte_flow_item_vxlan_gpe *vxlan_v = item->spec; ++ void *headers_m; ++ void *headers_v; ++ void *misc_m = ++ MLX5_ADDR_OF(fte_match_param, matcher, misc_parameters_3); ++ void *misc_v = ++ MLX5_ADDR_OF(fte_match_param, key, misc_parameters_3); ++ char *vni_m; ++ char *vni_v; ++ uint16_t dport; ++ int size; ++ int i; ++ uint8_t flags_m = 0xff; ++ uint8_t flags_v = 0xc; ++ ++ if (inner) { ++ headers_m = MLX5_ADDR_OF(fte_match_param, matcher, ++ inner_headers); ++ headers_v = MLX5_ADDR_OF(fte_match_param, key, inner_headers); ++ } else { ++ headers_m = MLX5_ADDR_OF(fte_match_param, matcher, ++ outer_headers); ++ headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers); ++ } ++ dport = item->type == RTE_FLOW_ITEM_TYPE_VXLAN ? ++ MLX5_UDP_PORT_VXLAN : MLX5_UDP_PORT_VXLAN_GPE; ++ if (!MLX5_GET16(fte_match_set_lyr_2_4, headers_v, udp_dport)) { ++ MLX5_SET(fte_match_set_lyr_2_4, headers_m, udp_dport, 0xFFFF); ++ MLX5_SET(fte_match_set_lyr_2_4, headers_v, udp_dport, dport); ++ } ++ if (!vxlan_v) ++ return; ++ if (!vxlan_m) ++ vxlan_m = &rte_flow_item_vxlan_gpe_mask; ++ size = sizeof(vxlan_m->vni); ++ vni_m = MLX5_ADDR_OF(fte_match_set_misc3, misc_m, outer_vxlan_gpe_vni); ++ vni_v = MLX5_ADDR_OF(fte_match_set_misc3, misc_v, outer_vxlan_gpe_vni); ++ memcpy(vni_m, vxlan_m->vni, size); ++ for (i = 0; i < size; ++i) ++ vni_v[i] = vni_m[i] & vxlan_v->vni[i]; ++ if (vxlan_m->flags) { ++ flags_m = vxlan_m->flags; ++ flags_v = vxlan_v->flags; ++ } ++ MLX5_SET(fte_match_set_misc3, misc_m, outer_vxlan_gpe_flags, flags_m); ++ MLX5_SET(fte_match_set_misc3, misc_v, outer_vxlan_gpe_flags, flags_v); ++ MLX5_SET(fte_match_set_misc3, misc_m, outer_vxlan_gpe_next_protocol, ++ vxlan_m->protocol); ++ MLX5_SET(fte_match_set_misc3, misc_v, outer_vxlan_gpe_next_protocol, ++ vxlan_v->protocol); ++} ++ + /** + * Add Geneve item to matcher and to the value. 
+ * +@@ -5742,6 +6160,7 @@ flow_dv_match_meta_reg(void *matcher, void *key, + MLX5_ADDR_OF(fte_match_param, matcher, misc_parameters_2); + void *misc2_v = + MLX5_ADDR_OF(fte_match_param, key, misc_parameters_2); ++ uint32_t temp; + + data &= mask; + switch (reg_type) { +@@ -5754,8 +6173,18 @@ + MLX5_SET(fte_match_set_misc2, misc2_v, metadata_reg_b, data); + break; + case REG_C_0: +- MLX5_SET(fte_match_set_misc2, misc2_m, metadata_reg_c_0, mask); +- MLX5_SET(fte_match_set_misc2, misc2_v, metadata_reg_c_0, data); ++ /* ++ * The metadata register C0 field might be divided into ++ * source vport index and META item value, we should set ++ * this field according to specified mask, not as whole one. ++ */ ++ temp = MLX5_GET(fte_match_set_misc2, misc2_m, metadata_reg_c_0); ++ temp |= mask; ++ MLX5_SET(fte_match_set_misc2, misc2_m, metadata_reg_c_0, temp); ++ temp = MLX5_GET(fte_match_set_misc2, misc2_v, metadata_reg_c_0); ++ temp &= ~mask; ++ temp |= data; ++ MLX5_SET(fte_match_set_misc2, misc2_v, metadata_reg_c_0, temp); + break; + case REG_C_1: + MLX5_SET(fte_match_set_misc2, misc2_m, metadata_reg_c_1, mask); +@@ -5825,6 +6254,15 @@ flow_dv_translate_item_mark(struct rte_eth_dev *dev, + /* Get the metadata register index for the mark. */ + reg = mlx5_flow_get_reg_id(dev, MLX5_FLOW_MARK, 0, NULL); + assert(reg > 0); ++ if (reg == REG_C_0) { ++ struct mlx5_priv *priv = dev->data->dev_private; ++ uint32_t msk_c0 = priv->sh->dv_regc0_mask; ++ uint32_t shl_c0 = rte_bsf32(msk_c0); ++ ++ mask &= msk_c0; ++ mask <<= shl_c0; ++ value <<= shl_c0; ++ } + flow_dv_match_meta_reg(matcher, key, reg, value, mask); + } + } +@@ -5857,7 +6295,7 @@ flow_dv_translate_item_meta(struct rte_eth_dev *dev, + meta_m = &rte_flow_item_meta_mask; + meta_v = (const void *)item->spec; + if (meta_v) { +- enum modify_reg reg; ++ int reg; + uint32_t value = meta_v->data; + uint32_t mask = meta_m->data; + +@@ -5875,8 +6313,12 @@ flow_dv_translate_item_meta(struct rte_eth_dev *dev, + struct mlx5_priv *priv = dev->data->dev_private; + uint32_t msk_c0 = priv->sh->dv_regc0_mask; + uint32_t shl_c0 = rte_bsf32(msk_c0); ++#if RTE_BYTE_ORDER == RTE_LITTLE_ENDIAN ++ uint32_t shr_c0 = __builtin_clz(priv->sh->dv_meta_mask); + +- msk_c0 = rte_cpu_to_be_32(msk_c0); ++ value >>= shr_c0; ++ mask >>= shr_c0; ++#endif + value <<= shl_c0; + mask <<= shl_c0; + assert(msk_c0); +@@ -5906,6 +6348,8 @@ flow_dv_translate_item_meta_vport(void *matcher, void *key, + /** + * Add tag item to matcher + * ++ * @param[in] dev ++ * The device to configure through. + * @param[in, out] matcher + * Flow matcher. + * @param[in, out] key +@@ -5914,15 +6358,27 @@ flow_dv_translate_item_meta_vport(void *matcher, void *key, + * Flow pattern to translate. + */ + static void +-flow_dv_translate_mlx5_item_tag(void *matcher, void *key, ++flow_dv_translate_mlx5_item_tag(struct rte_eth_dev *dev, ++ void *matcher, void *key, + const struct rte_flow_item *item) + { + const struct mlx5_rte_flow_item_tag *tag_v = item->spec; + const struct mlx5_rte_flow_item_tag *tag_m = item->mask; ++ uint32_t mask, value; + + assert(tag_v); +- flow_dv_match_meta_reg(matcher, key, tag_v->id, tag_v->data, +- tag_m ? tag_m->data : UINT32_MAX); ++ value = tag_v->data; ++ mask = tag_m ?
tag_m->data : UINT32_MAX; ++ if (tag_v->id == REG_C_0) { ++ struct mlx5_priv *priv = dev->data->dev_private; ++ uint32_t msk_c0 = priv->sh->dv_regc0_mask; ++ uint32_t shl_c0 = rte_bsf32(msk_c0); ++ ++ mask &= msk_c0; ++ mask <<= shl_c0; ++ value <<= shl_c0; ++ } ++ flow_dv_match_meta_reg(matcher, key, tag_v->id, value, mask); + } + + /** +@@ -6056,6 +6512,12 @@ flow_dv_translate_item_icmp6(void *matcher, void *key, + return; + if (!icmp6_m) + icmp6_m = &rte_flow_item_icmp6_mask; ++ /* ++ * Force flow only to match the non-fragmented IPv6 ICMPv6 packets. ++ * If only the protocol is specified, no need to match the frag. ++ */ ++ MLX5_SET(fte_match_set_lyr_2_4, headers_m, frag, 1); ++ MLX5_SET(fte_match_set_lyr_2_4, headers_v, frag, 0); + MLX5_SET(fte_match_set_misc3, misc3_m, icmpv6_type, icmp6_m->type); + MLX5_SET(fte_match_set_misc3, misc3_v, icmpv6_type, + icmp6_v->type & icmp6_m->type); +@@ -6103,6 +6565,12 @@ flow_dv_translate_item_icmp(void *matcher, void *key, + return; + if (!icmp_m) + icmp_m = &rte_flow_item_icmp_mask; ++ /* ++ * Force flow only to match the non-fragmented IPv4 ICMP packets. ++ * If only the protocol is specified, no need to match the frag. ++ */ ++ MLX5_SET(fte_match_set_lyr_2_4, headers_m, frag, 1); ++ MLX5_SET(fte_match_set_lyr_2_4, headers_v, frag, 0); + MLX5_SET(fte_match_set_misc3, misc3_m, icmp_type, + icmp_m->hdr.icmp_type); + MLX5_SET(fte_match_set_misc3, misc3_v, icmp_type, +@@ -6618,10 +7086,13 @@ __flow_dv_translate(struct rte_eth_dev *dev, + }; + int actions_n = 0; + bool actions_end = false; +- struct mlx5_flow_dv_modify_hdr_resource mhdr_res = { +- .ft_type = attr->egress ? MLX5DV_FLOW_TABLE_TYPE_NIC_TX : +- MLX5DV_FLOW_TABLE_TYPE_NIC_RX +- }; ++ union { ++ struct mlx5_flow_dv_modify_hdr_resource res; ++ uint8_t len[sizeof(struct mlx5_flow_dv_modify_hdr_resource) + ++ sizeof(struct mlx5_modification_cmd) * ++ (MLX5_MAX_MODIFY_NUM + 1)]; ++ } mhdr_dummy; ++ struct mlx5_flow_dv_modify_hdr_resource *mhdr_res = &mhdr_dummy.res; + union flow_dv_attr flow_attr = { .attr = 0 }; + uint32_t tag_be; + union mlx5_flow_tbl_key tbl_key; +@@ -6633,15 +7104,19 @@ __flow_dv_translate(struct rte_eth_dev *dev, + uint32_t table; + int ret = 0; + ++ mhdr_res->ft_type = attr->egress ? MLX5DV_FLOW_TABLE_TYPE_NIC_TX : ++ MLX5DV_FLOW_TABLE_TYPE_NIC_RX; + ret = mlx5_flow_group_to_table(attr, dev_flow->external, attr->group, +- &table, error); ++ !!priv->fdb_def_rule, &table, error); + if (ret) + return ret; + dev_flow->group = table; + if (attr->transfer) +- mhdr_res.ft_type = MLX5DV_FLOW_TABLE_TYPE_FDB; ++ mhdr_res->ft_type = MLX5DV_FLOW_TABLE_TYPE_FDB; + if (priority == MLX5_FLOW_PRIO_RSVD) + priority = dev_conf->flow_prio - 1; ++ /* number of actions must be set to 0 in case of dirty stack. 
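++ * mhdr_res aliases the on-stack mhdr_dummy union above, which is not zero-initialized.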
*/ ++ mhdr_res->actions_num = 0; + for (; !actions_end ; actions++) { + const struct rte_flow_action_queue *queue; + const struct rte_flow_action_rss *rss; +@@ -6679,7 +7154,7 @@ __flow_dv_translate(struct rte_eth_dev *dev, + }; + + if (flow_dv_convert_action_mark(dev, &mark, +- &mhdr_res, ++ mhdr_res, + error)) + return -rte_errno; + action_flags |= MLX5_FLOW_ACTION_MARK_EXT; +@@ -6701,7 +7176,7 @@ __flow_dv_translate(struct rte_eth_dev *dev, + actions->conf; + + if (flow_dv_convert_action_mark(dev, mark, +- &mhdr_res, ++ mhdr_res, + error)) + return -rte_errno; + action_flags |= MLX5_FLOW_ACTION_MARK_EXT; +@@ -6722,7 +7197,7 @@ __flow_dv_translate(struct rte_eth_dev *dev, + break; + case RTE_FLOW_ACTION_TYPE_SET_META: + if (flow_dv_convert_action_set_meta +- (dev, &mhdr_res, attr, ++ (dev, mhdr_res, attr, + (const struct rte_flow_action_set_meta *) + actions->conf, error)) + return -rte_errno; +@@ -6730,7 +7205,7 @@ __flow_dv_translate(struct rte_eth_dev *dev, + break; + case RTE_FLOW_ACTION_TYPE_SET_TAG: + if (flow_dv_convert_action_set_tag +- (dev, &mhdr_res, ++ (dev, mhdr_res, + (const struct rte_flow_action_set_tag *) + actions->conf, error)) + return -rte_errno; +@@ -6798,7 +7273,9 @@ __flow_dv_translate(struct rte_eth_dev *dev, + action_flags |= MLX5_FLOW_ACTION_OF_POP_VLAN; + break; + case RTE_FLOW_ACTION_TYPE_OF_PUSH_VLAN: +- flow_dev_get_vlan_info_from_items(items, &vlan); ++ if (!(action_flags & ++ MLX5_FLOW_ACTION_OF_SET_VLAN_VID)) ++ flow_dev_get_vlan_info_from_items(items, &vlan); + vlan.eth_proto = rte_be_to_cpu_16 + ((((const struct rte_flow_action_of_push_vlan *) + actions->conf)->ethertype)); +@@ -6830,7 +7307,7 @@ __flow_dv_translate(struct rte_eth_dev *dev, + mlx5_update_vlan_vid_pcp(actions, &vlan); + /* If no VLAN push - this is a modify header action */ + if (flow_dv_convert_action_modify_vlan_vid +- (&mhdr_res, actions, error)) ++ (mhdr_res, actions, error)) + return -rte_errno; + action_flags |= MLX5_FLOW_ACTION_OF_SET_VLAN_VID; + break; +@@ -6843,10 +7320,7 @@ __flow_dv_translate(struct rte_eth_dev *dev, + return -rte_errno; + dev_flow->dv.actions[actions_n++] = + dev_flow->dv.encap_decap->verbs_action; +- action_flags |= actions->type == +- RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP ? +- MLX5_FLOW_ACTION_VXLAN_ENCAP : +- MLX5_FLOW_ACTION_NVGRE_ENCAP; ++ action_flags |= MLX5_FLOW_ACTION_ENCAP; + break; + case RTE_FLOW_ACTION_TYPE_VXLAN_DECAP: + case RTE_FLOW_ACTION_TYPE_NVGRE_DECAP: +@@ -6856,14 +7330,11 @@ __flow_dv_translate(struct rte_eth_dev *dev, + return -rte_errno; + dev_flow->dv.actions[actions_n++] = + dev_flow->dv.encap_decap->verbs_action; +- action_flags |= actions->type == +- RTE_FLOW_ACTION_TYPE_VXLAN_DECAP ? +- MLX5_FLOW_ACTION_VXLAN_DECAP : +- MLX5_FLOW_ACTION_NVGRE_DECAP; ++ action_flags |= MLX5_FLOW_ACTION_DECAP; + break; + case RTE_FLOW_ACTION_TYPE_RAW_ENCAP: + /* Handle encap with preceding decap. */ +- if (action_flags & MLX5_FLOW_ACTION_RAW_DECAP) { ++ if (action_flags & MLX5_FLOW_ACTION_DECAP) { + if (flow_dv_create_action_raw_encap + (dev, actions, dev_flow, attr, error)) + return -rte_errno; +@@ -6878,15 +7349,11 @@ __flow_dv_translate(struct rte_eth_dev *dev, + dev_flow->dv.actions[actions_n++] = + dev_flow->dv.encap_decap->verbs_action; + } +- action_flags |= MLX5_FLOW_ACTION_RAW_ENCAP; ++ action_flags |= MLX5_FLOW_ACTION_ENCAP; + break; + case RTE_FLOW_ACTION_TYPE_RAW_DECAP: +- /* Check if this decap is followed by encap. 
*/ +- for (; action->type != RTE_FLOW_ACTION_TYPE_END && +- action->type != RTE_FLOW_ACTION_TYPE_RAW_ENCAP; +- action++) { +- } +- /* Handle decap only if it isn't followed by encap. */ ++ while ((++action)->type == RTE_FLOW_ACTION_TYPE_VOID) ++ ; + if (action->type != RTE_FLOW_ACTION_TYPE_RAW_ENCAP) { + if (flow_dv_create_action_l2_decap + (dev, dev_flow, attr->transfer, error)) +@@ -6895,13 +7362,14 @@ __flow_dv_translate(struct rte_eth_dev *dev, + dev_flow->dv.encap_decap->verbs_action; + } + /* If decap is followed by encap, handle it at encap. */ +- action_flags |= MLX5_FLOW_ACTION_RAW_DECAP; ++ action_flags |= MLX5_FLOW_ACTION_DECAP; + break; + case RTE_FLOW_ACTION_TYPE_JUMP: + jump_data = action->conf; + ret = mlx5_flow_group_to_table(attr, dev_flow->external, +- jump_data->group, &table, +- error); ++ jump_data->group, ++ !!priv->fdb_def_rule, ++ &table, error); + if (ret) + return ret; + tbl = flow_dv_tbl_resource_get(dev, table, +@@ -6929,7 +7397,7 @@ __flow_dv_translate(struct rte_eth_dev *dev, + case RTE_FLOW_ACTION_TYPE_SET_MAC_SRC: + case RTE_FLOW_ACTION_TYPE_SET_MAC_DST: + if (flow_dv_convert_action_modify_mac +- (&mhdr_res, actions, error)) ++ (mhdr_res, actions, error)) + return -rte_errno; + action_flags |= actions->type == + RTE_FLOW_ACTION_TYPE_SET_MAC_SRC ? +@@ -6939,7 +7407,7 @@ __flow_dv_translate(struct rte_eth_dev *dev, + case RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC: + case RTE_FLOW_ACTION_TYPE_SET_IPV4_DST: + if (flow_dv_convert_action_modify_ipv4 +- (&mhdr_res, actions, error)) ++ (mhdr_res, actions, error)) + return -rte_errno; + action_flags |= actions->type == + RTE_FLOW_ACTION_TYPE_SET_IPV4_SRC ? +@@ -6949,7 +7417,7 @@ __flow_dv_translate(struct rte_eth_dev *dev, + case RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC: + case RTE_FLOW_ACTION_TYPE_SET_IPV6_DST: + if (flow_dv_convert_action_modify_ipv6 +- (&mhdr_res, actions, error)) ++ (mhdr_res, actions, error)) + return -rte_errno; + action_flags |= actions->type == + RTE_FLOW_ACTION_TYPE_SET_IPV6_SRC ? +@@ -6959,8 +7427,9 @@ __flow_dv_translate(struct rte_eth_dev *dev, + case RTE_FLOW_ACTION_TYPE_SET_TP_SRC: + case RTE_FLOW_ACTION_TYPE_SET_TP_DST: + if (flow_dv_convert_action_modify_tp +- (&mhdr_res, actions, items, +- &flow_attr, error)) ++ (mhdr_res, actions, items, ++ &flow_attr, dev_flow, !!(action_flags & ++ MLX5_FLOW_ACTION_DECAP), error)) + return -rte_errno; + action_flags |= actions->type == + RTE_FLOW_ACTION_TYPE_SET_TP_SRC ? +@@ -6969,21 +7438,24 @@ __flow_dv_translate(struct rte_eth_dev *dev, + break; + case RTE_FLOW_ACTION_TYPE_DEC_TTL: + if (flow_dv_convert_action_modify_dec_ttl +- (&mhdr_res, items, &flow_attr, error)) ++ (mhdr_res, items, &flow_attr, dev_flow, ++ !!(action_flags & ++ MLX5_FLOW_ACTION_DECAP), error)) + return -rte_errno; + action_flags |= MLX5_FLOW_ACTION_DEC_TTL; + break; + case RTE_FLOW_ACTION_TYPE_SET_TTL: + if (flow_dv_convert_action_modify_ttl +- (&mhdr_res, actions, items, +- &flow_attr, error)) ++ (mhdr_res, actions, items, &flow_attr, ++ dev_flow, !!(action_flags & ++ MLX5_FLOW_ACTION_DECAP), error)) + return -rte_errno; + action_flags |= MLX5_FLOW_ACTION_SET_TTL; + break; + case RTE_FLOW_ACTION_TYPE_INC_TCP_SEQ: + case RTE_FLOW_ACTION_TYPE_DEC_TCP_SEQ: + if (flow_dv_convert_action_modify_tcp_seq +- (&mhdr_res, actions, error)) ++ (mhdr_res, actions, error)) + return -rte_errno; + action_flags |= actions->type == + RTE_FLOW_ACTION_TYPE_INC_TCP_SEQ ? 
+@@ -6994,7 +7466,7 @@ __flow_dv_translate(struct rte_eth_dev *dev, + case RTE_FLOW_ACTION_TYPE_INC_TCP_ACK: + case RTE_FLOW_ACTION_TYPE_DEC_TCP_ACK: + if (flow_dv_convert_action_modify_tcp_ack +- (&mhdr_res, actions, error)) ++ (mhdr_res, actions, error)) + return -rte_errno; + action_flags |= actions->type == + RTE_FLOW_ACTION_TYPE_INC_TCP_ACK ? +@@ -7003,13 +7475,13 @@ __flow_dv_translate(struct rte_eth_dev *dev, + break; + case MLX5_RTE_FLOW_ACTION_TYPE_TAG: + if (flow_dv_convert_action_set_reg +- (&mhdr_res, actions, error)) ++ (mhdr_res, actions, error)) + return -rte_errno; + action_flags |= MLX5_FLOW_ACTION_SET_TAG; + break; + case MLX5_RTE_FLOW_ACTION_TYPE_COPY_MREG: + if (flow_dv_convert_action_copy_mreg +- (dev, &mhdr_res, actions, error)) ++ (dev, mhdr_res, actions, error)) + return -rte_errno; + action_flags |= MLX5_FLOW_ACTION_SET_TAG; + break; +@@ -7034,10 +7506,10 @@ __flow_dv_translate(struct rte_eth_dev *dev, + break; + case RTE_FLOW_ACTION_TYPE_END: + actions_end = true; +- if (mhdr_res.actions_num) { ++ if (mhdr_res->actions_num) { + /* create modify action if needed. */ + if (flow_dv_modify_hdr_resource_register +- (dev, &mhdr_res, dev_flow, error)) ++ (dev, mhdr_res, dev_flow, error)) + return -rte_errno; + dev_flow->dv.actions[modify_action_position] = + dev_flow->dv.modify_hdr->verbs_action; +@@ -7046,7 +7518,7 @@ __flow_dv_translate(struct rte_eth_dev *dev, + default: + break; + } +- if (mhdr_res.actions_num && ++ if (mhdr_res->actions_num && + modify_action_position == UINT32_MAX) + modify_action_position = actions_n++; + } +@@ -7083,7 +7555,7 @@ __flow_dv_translate(struct rte_eth_dev *dev, + mlx5_flow_tunnel_ip_check(items, next_protocol, + &item_flags, &tunnel); + flow_dv_translate_item_ipv4(match_mask, match_value, +- items, tunnel, ++ items, item_flags, tunnel, + dev_flow->group); + matcher.priority = MLX5_PRIORITY_MAP_L3; + dev_flow->hash_fields |= +@@ -7111,7 +7583,7 @@ __flow_dv_translate(struct rte_eth_dev *dev, + mlx5_flow_tunnel_ip_check(items, next_protocol, + &item_flags, &tunnel); + flow_dv_translate_item_ipv6(match_mask, match_value, +- items, tunnel, ++ items, item_flags, tunnel, + dev_flow->group); + matcher.priority = MLX5_PRIORITY_MAP_L3; + dev_flow->hash_fields |= +@@ -7162,6 +7634,8 @@ __flow_dv_translate(struct rte_eth_dev *dev, + case RTE_FLOW_ITEM_TYPE_GRE: + flow_dv_translate_item_gre(match_mask, match_value, + items, tunnel); ++ matcher.priority = flow->rss.level >= 2 ? ++ MLX5_PRIORITY_MAP_L2 : MLX5_PRIORITY_MAP_L4; + last_item = MLX5_FLOW_LAYER_GRE; + break; + case RTE_FLOW_ITEM_TYPE_GRE_KEY: +@@ -7172,26 +7646,37 @@ __flow_dv_translate(struct rte_eth_dev *dev, + case RTE_FLOW_ITEM_TYPE_NVGRE: + flow_dv_translate_item_nvgre(match_mask, match_value, + items, tunnel); ++ matcher.priority = flow->rss.level >= 2 ? ++ MLX5_PRIORITY_MAP_L2 : MLX5_PRIORITY_MAP_L4; + last_item = MLX5_FLOW_LAYER_GRE; + break; + case RTE_FLOW_ITEM_TYPE_VXLAN: + flow_dv_translate_item_vxlan(match_mask, match_value, + items, tunnel); ++ matcher.priority = flow->rss.level >= 2 ? ++ MLX5_PRIORITY_MAP_L2 : MLX5_PRIORITY_MAP_L4; + last_item = MLX5_FLOW_LAYER_VXLAN; + break; + case RTE_FLOW_ITEM_TYPE_VXLAN_GPE: +- flow_dv_translate_item_vxlan(match_mask, match_value, +- items, tunnel); ++ flow_dv_translate_item_vxlan_gpe(match_mask, ++ match_value, items, ++ tunnel); ++ matcher.priority = flow->rss.level >= 2 ? 
++ MLX5_PRIORITY_MAP_L2 : MLX5_PRIORITY_MAP_L4; + last_item = MLX5_FLOW_LAYER_VXLAN_GPE; + break; + case RTE_FLOW_ITEM_TYPE_GENEVE: + flow_dv_translate_item_geneve(match_mask, match_value, + items, tunnel); ++ matcher.priority = flow->rss.level >= 2 ? ++ MLX5_PRIORITY_MAP_L2 : MLX5_PRIORITY_MAP_L4; + last_item = MLX5_FLOW_LAYER_GENEVE; + break; + case RTE_FLOW_ITEM_TYPE_MPLS: + flow_dv_translate_item_mpls(match_mask, match_value, + items, last_item, tunnel); ++ matcher.priority = flow->rss.level >= 2 ? ++ MLX5_PRIORITY_MAP_L2 : MLX5_PRIORITY_MAP_L4; + last_item = MLX5_FLOW_LAYER_MPLS; + break; + case RTE_FLOW_ITEM_TYPE_MARK: +@@ -7220,7 +7705,7 @@ __flow_dv_translate(struct rte_eth_dev *dev, + last_item = MLX5_FLOW_ITEM_TAG; + break; + case MLX5_RTE_FLOW_ITEM_TYPE_TAG: +- flow_dv_translate_mlx5_item_tag(match_mask, ++ flow_dv_translate_mlx5_item_tag(dev, match_mask, + match_value, items); + last_item = MLX5_FLOW_ITEM_TAG; + break; +@@ -7236,13 +7721,13 @@ __flow_dv_translate(struct rte_eth_dev *dev, + item_flags |= last_item; + } + /* +- * In case of ingress traffic when E-Switch mode is enabled, +- * we have two cases where we need to set the source port manually. ++ * When E-Switch mode is enabled, we have two cases where we need to ++ * set the source port manually. + * The first one, is in case of Nic steering rule, and the second is + * E-Switch rule where no port_id item was found. In both cases + * the source port is set according the current port in use. + */ +- if ((attr->ingress && !(item_flags & MLX5_FLOW_ITEM_PORT_ID)) && ++ if (!(item_flags & MLX5_FLOW_ITEM_PORT_ID) && + (priv->representor || priv->master)) { + if (flow_dv_translate_item_port_id(dev, match_mask, + match_value, NULL)) +@@ -7250,7 +7735,11 @@ __flow_dv_translate(struct rte_eth_dev *dev, + } + assert(!flow_dv_check_valid_spec(matcher.mask.buf, + dev_flow->dv.value.buf)); +- dev_flow->layers = item_flags; ++ /* ++ * Layers may be already initialized from prefix flow if this dev_flow ++ * is the suffix flow. ++ */ ++ dev_flow->layers |= item_flags; + /* Register matcher. 
*/ + matcher.crc = rte_raw_cksum((const void *)matcher.mask.buf, + matcher.mask.size); +@@ -7779,8 +8268,9 @@ flow_dv_destroy_mtr_tbl(struct rte_eth_dev *dev, + claim_zero(mlx5_glue->dv_destroy_flow_matcher + (mtd->egress.any_matcher)); + if (mtd->egress.tbl) +- claim_zero(flow_dv_tbl_resource_release(dev, +- mtd->egress.tbl)); ++ flow_dv_tbl_resource_release(dev, mtd->egress.tbl); ++ if (mtd->egress.sfx_tbl) ++ flow_dv_tbl_resource_release(dev, mtd->egress.sfx_tbl); + if (mtd->ingress.color_matcher) + claim_zero(mlx5_glue->dv_destroy_flow_matcher + (mtd->ingress.color_matcher)); +@@ -7788,8 +8278,9 @@ flow_dv_destroy_mtr_tbl(struct rte_eth_dev *dev, + claim_zero(mlx5_glue->dv_destroy_flow_matcher + (mtd->ingress.any_matcher)); + if (mtd->ingress.tbl) +- claim_zero(flow_dv_tbl_resource_release(dev, +- mtd->ingress.tbl)); ++ flow_dv_tbl_resource_release(dev, mtd->ingress.tbl); ++ if (mtd->ingress.sfx_tbl) ++ flow_dv_tbl_resource_release(dev, mtd->ingress.sfx_tbl); + if (mtd->transfer.color_matcher) + claim_zero(mlx5_glue->dv_destroy_flow_matcher + (mtd->transfer.color_matcher)); +@@ -7797,8 +8288,9 @@ flow_dv_destroy_mtr_tbl(struct rte_eth_dev *dev, + claim_zero(mlx5_glue->dv_destroy_flow_matcher + (mtd->transfer.any_matcher)); + if (mtd->transfer.tbl) +- claim_zero(flow_dv_tbl_resource_release(dev, +- mtd->transfer.tbl)); ++ flow_dv_tbl_resource_release(dev, mtd->transfer.tbl); ++ if (mtd->transfer.sfx_tbl) ++ flow_dv_tbl_resource_release(dev, mtd->transfer.sfx_tbl); + if (mtd->drop_actn) + claim_zero(mlx5_glue->destroy_flow_action(mtd->drop_actn)); + rte_free(mtd); +@@ -7846,31 +8338,16 @@ flow_dv_prepare_mtr_tables(struct rte_eth_dev *dev, + .match_mask = (void *)&mask, + }; + void *actions[METER_ACTIONS]; +- struct mlx5_flow_tbl_resource **sfx_tbl; + struct mlx5_meter_domain_info *dtb; + struct rte_flow_error error; + int i = 0; + +- if (transfer) { +- sfx_tbl = &sh->fdb_mtr_sfx_tbl; ++ if (transfer) + dtb = &mtb->transfer; +- } else if (egress) { +- sfx_tbl = &sh->tx_mtr_sfx_tbl; ++ else if (egress) + dtb = &mtb->egress; +- } else { +- sfx_tbl = &sh->rx_mtr_sfx_tbl; ++ else + dtb = &mtb->ingress; +- } +- /* If the suffix table in missing, create it. */ +- if (!(*sfx_tbl)) { +- *sfx_tbl = flow_dv_tbl_resource_get(dev, +- MLX5_FLOW_TABLE_LEVEL_SUFFIX, +- egress, transfer, &error); +- if (!(*sfx_tbl)) { +- DRV_LOG(ERR, "Failed to create meter suffix table."); +- return -1; +- } +- } + /* Create the meter table with METER level. */ + dtb->tbl = flow_dv_tbl_resource_get(dev, MLX5_FLOW_TABLE_LEVEL_METER, + egress, transfer, &error); +@@ -7878,6 +8355,14 @@ flow_dv_prepare_mtr_tables(struct rte_eth_dev *dev, + DRV_LOG(ERR, "Failed to create meter policer table."); + return -1; + } ++ /* Create the meter suffix table with SUFFIX level. */ ++ dtb->sfx_tbl = flow_dv_tbl_resource_get(dev, ++ MLX5_FLOW_TABLE_LEVEL_SUFFIX, ++ egress, transfer, &error); ++ if (!dtb->sfx_tbl) { ++ DRV_LOG(ERR, "Failed to create meter suffix table."); ++ return -1; ++ } + /* Create matchers, Any and Color. 
*/ + dv_attr.priority = 3; + dv_attr.match_criteria_enable = 0; +@@ -7893,7 +8378,7 @@ flow_dv_prepare_mtr_tables(struct rte_eth_dev *dev, + dv_attr.match_criteria_enable = + 1 << MLX5_MATCH_CRITERIA_ENABLE_MISC2_BIT; + flow_dv_match_meta_reg(mask.buf, value.buf, color_reg_c_idx, +- rte_col_2_mlx5_col(RTE_COLORS), UINT32_MAX); ++ rte_col_2_mlx5_col(RTE_COLORS), UINT8_MAX); + dtb->color_matcher = mlx5_glue->dv_create_flow_matcher(sh->ctx, + &dv_attr, + dtb->tbl->obj); +@@ -8048,8 +8533,6 @@ flow_dv_destroy_policer_rules(struct rte_eth_dev *dev __rte_unused, + * Pointer to flow meter structure. + * @param[in] mtb + * Pointer to DV meter table set. +- * @param[in] sfx_tb +- * Pointer to suffix table. + * @param[in] mtr_reg_c + * Color match REG_C. + * +@@ -8059,7 +8542,6 @@ flow_dv_destroy_policer_rules(struct rte_eth_dev *dev __rte_unused, + static int + flow_dv_create_policer_forward_rule(struct mlx5_flow_meter *fm, + struct mlx5_meter_domain_info *dtb, +- struct mlx5_flow_tbl_resource *sfx_tb, + uint8_t mtr_reg_c) + { + struct mlx5_flow_dv_match_params matcher = { +@@ -8073,12 +8555,10 @@ flow_dv_create_policer_forward_rule(struct mlx5_flow_meter *fm, + int i; + + /* Create jump action. */ +- if (!sfx_tb) +- return -1; + if (!dtb->jump_actn) + dtb->jump_actn = + mlx5_glue->dr_create_flow_action_dest_flow_tbl +- (sfx_tb->obj); ++ (dtb->sfx_tbl->obj); + if (!dtb->jump_actn) { + DRV_LOG(ERR, "Failed to create policer jump action."); + goto error; +@@ -8087,7 +8567,7 @@ flow_dv_create_policer_forward_rule(struct mlx5_flow_meter *fm, + int j = 0; + + flow_dv_match_meta_reg(matcher.buf, value.buf, mtr_reg_c, +- rte_col_2_mlx5_col(i), UINT32_MAX); ++ rte_col_2_mlx5_col(i), UINT8_MAX); + if (mtb->count_actns[i]) + actions[j++] = mtb->count_actns[i]; + if (fm->params.action[i] == MTR_POLICER_ACTION_DROP) +@@ -8133,7 +8613,6 @@ flow_dv_create_policer_rules(struct rte_eth_dev *dev, + + if (attr->egress) { + ret = flow_dv_create_policer_forward_rule(fm, &mtb->egress, +- priv->sh->tx_mtr_sfx_tbl, + priv->mtr_color_reg); + if (ret) { + DRV_LOG(ERR, "Failed to create egress policer."); +@@ -8142,7 +8621,6 @@ flow_dv_create_policer_rules(struct rte_eth_dev *dev, + } + if (attr->ingress) { + ret = flow_dv_create_policer_forward_rule(fm, &mtb->ingress, +- priv->sh->rx_mtr_sfx_tbl, + priv->mtr_color_reg); + if (ret) { + DRV_LOG(ERR, "Failed to create ingress policer."); +@@ -8151,7 +8629,6 @@ flow_dv_create_policer_rules(struct rte_eth_dev *dev, + } + if (attr->transfer) { + ret = flow_dv_create_policer_forward_rule(fm, &mtb->transfer, +- priv->sh->fdb_mtr_sfx_tbl, + priv->mtr_color_reg); + if (ret) { + DRV_LOG(ERR, "Failed to create transfer policer."); +diff --git a/dpdk/drivers/net/mlx5/mlx5_flow_verbs.c b/dpdk/drivers/net/mlx5/mlx5_flow_verbs.c +index c787c9838d..7ac6a25e43 100644 +--- a/dpdk/drivers/net/mlx5/mlx5_flow_verbs.c ++++ b/dpdk/drivers/net/mlx5/mlx5_flow_verbs.c +@@ -493,14 +493,12 @@ flow_verbs_translate_item_ipv6(struct mlx5_flow *dev_flow, + ipv6.val.traffic_class = (vtc_flow_val & RTE_IPV6_HDR_TC_MASK) >> + RTE_IPV6_HDR_TC_SHIFT; + ipv6.val.next_hdr = spec->hdr.proto; +- ipv6.val.hop_limit = spec->hdr.hop_limits; + ipv6.mask.flow_label = + rte_cpu_to_be_32((vtc_flow_mask & RTE_IPV6_HDR_FL_MASK) >> + RTE_IPV6_HDR_FL_SHIFT); + ipv6.mask.traffic_class = (vtc_flow_mask & RTE_IPV6_HDR_TC_MASK) >> + RTE_IPV6_HDR_TC_SHIFT; + ipv6.mask.next_hdr = mask->hdr.proto; +- ipv6.mask.hop_limit = mask->hdr.hop_limits; + /* Remove unwanted bits from values. 
*/ + for (i = 0; i < RTE_DIM(ipv6.val.src_ip); ++i) { + ipv6.val.src_ip[i] &= ipv6.mask.src_ip[i]; +@@ -509,7 +507,6 @@ flow_verbs_translate_item_ipv6(struct mlx5_flow *dev_flow, + ipv6.val.flow_label &= ipv6.mask.flow_label; + ipv6.val.traffic_class &= ipv6.mask.traffic_class; + ipv6.val.next_hdr &= ipv6.mask.next_hdr; +- ipv6.val.hop_limit &= ipv6.mask.hop_limit; + } + flow_verbs_spec_add(&dev_flow->verbs, &ipv6, size); + } +@@ -589,6 +586,28 @@ flow_verbs_translate_item_udp(struct mlx5_flow *dev_flow, + udp.val.src_port &= udp.mask.src_port; + udp.val.dst_port &= udp.mask.dst_port; + } ++ item++; ++ while (item->type == RTE_FLOW_ITEM_TYPE_VOID) ++ item++; ++ if (!(udp.val.dst_port & udp.mask.dst_port)) { ++ switch ((item)->type) { ++ case RTE_FLOW_ITEM_TYPE_VXLAN: ++ udp.val.dst_port = htons(MLX5_UDP_PORT_VXLAN); ++ udp.mask.dst_port = 0xffff; ++ break; ++ case RTE_FLOW_ITEM_TYPE_VXLAN_GPE: ++ udp.val.dst_port = htons(MLX5_UDP_PORT_VXLAN_GPE); ++ udp.mask.dst_port = 0xffff; ++ break; ++ case RTE_FLOW_ITEM_TYPE_MPLS: ++ udp.val.dst_port = htons(MLX5_UDP_PORT_MPLS); ++ udp.mask.dst_port = 0xffff; ++ break; ++ default: ++ break; ++ } ++ } ++ + flow_verbs_spec_add(&dev_flow->verbs, &udp, size); + } + +@@ -1019,6 +1038,8 @@ flow_verbs_translate_action_count(struct mlx5_flow *dev_flow, + * Pointer to the list of actions. + * @param[in] external + * This flow rule is created by request external to PMD. ++ * @param[in] hairpin ++ * Number of hairpin TX actions, 0 means classic flow. + * @param[out] error + * Pointer to the error structure. + * +@@ -1031,6 +1052,7 @@ flow_verbs_validate(struct rte_eth_dev *dev, + const struct rte_flow_item items[], + const struct rte_flow_action actions[], + bool external __rte_unused, ++ int hairpin __rte_unused, + struct rte_flow_error *error) + { + int ret; +@@ -1255,6 +1277,18 @@ flow_verbs_validate(struct rte_eth_dev *dev, + "action not supported"); + } + } ++ /* ++ * Validate the drop action mutual exclusion with other actions. ++ * Drop action is mutually-exclusive with any other action, except for ++ * Count action. 
++ */ ++ if ((action_flags & MLX5_FLOW_ACTION_DROP) && ++ (action_flags & ~(MLX5_FLOW_ACTION_DROP | MLX5_FLOW_ACTION_COUNT))) ++ return rte_flow_error_set(error, EINVAL, ++ RTE_FLOW_ERROR_TYPE_ACTION, NULL, ++ "Drop action is mutually-exclusive " ++ "with any other action, except for " ++ "Count action"); + if (!(action_flags & MLX5_FLOW_FATE_ACTIONS)) + return rte_flow_error_set(error, EINVAL, + RTE_FLOW_ERROR_TYPE_ACTION, actions, +diff --git a/dpdk/drivers/net/mlx5/mlx5_glue.c b/dpdk/drivers/net/mlx5/mlx5_glue.c +index 0917bf28d6..44f63116a8 100644 +--- a/dpdk/drivers/net/mlx5/mlx5_glue.c ++++ b/dpdk/drivers/net/mlx5/mlx5_glue.c +@@ -1008,7 +1008,7 @@ mlx5_glue_devx_qp_query(struct ibv_qp *qp, + const void *in, size_t inlen, + void *out, size_t outlen) + { +-#ifdef HAVE_IBV_DEVX_OBJ ++#ifdef HAVE_IBV_DEVX_QP + return mlx5dv_devx_qp_query(qp, in, inlen, out, outlen); + #else + (void)qp; +diff --git a/dpdk/drivers/net/mlx5/mlx5_glue.h b/dpdk/drivers/net/mlx5/mlx5_glue.h +index 6442f1eba8..4e6465523a 100644 +--- a/dpdk/drivers/net/mlx5/mlx5_glue.h ++++ b/dpdk/drivers/net/mlx5/mlx5_glue.h +@@ -258,6 +258,6 @@ struct mlx5_glue { + struct mlx5dv_devx_port *mlx5_devx_port); + }; + +-const struct mlx5_glue *mlx5_glue; ++extern const struct mlx5_glue *mlx5_glue; + + #endif /* MLX5_GLUE_H_ */ +diff --git a/dpdk/drivers/net/mlx5/mlx5_nl.c b/dpdk/drivers/net/mlx5/mlx5_nl.c +index e7ba03471d..64580b9e6a 100644 +--- a/dpdk/drivers/net/mlx5/mlx5_nl.c ++++ b/dpdk/drivers/net/mlx5/mlx5_nl.c +@@ -269,10 +269,10 @@ mlx5_nl_recv(int nlsk_fd, uint32_t sn, int (*cb)(struct nlmsghdr *, void *arg), + void *arg) + { + struct sockaddr_nl sa; +- char buf[MLX5_RECV_BUF_SIZE]; ++ void *buf = malloc(MLX5_RECV_BUF_SIZE); + struct iovec iov = { + .iov_base = buf, +- .iov_len = sizeof(buf), ++ .iov_len = MLX5_RECV_BUF_SIZE, + }; + struct msghdr msg = { + .msg_name = &sa, +@@ -284,6 +284,10 @@ mlx5_nl_recv(int nlsk_fd, uint32_t sn, int (*cb)(struct nlmsghdr *, void *arg), + int multipart = 0; + int ret = 0; + ++ if (!buf) { ++ rte_errno = ENOMEM; ++ return -rte_errno; ++ } + do { + struct nlmsghdr *nh; + int recv_bytes = 0; +@@ -292,7 +296,8 @@ mlx5_nl_recv(int nlsk_fd, uint32_t sn, int (*cb)(struct nlmsghdr *, void *arg), + recv_bytes = recvmsg(nlsk_fd, &msg, 0); + if (recv_bytes == -1) { + rte_errno = errno; +- return -rte_errno; ++ ret = -rte_errno; ++ goto exit; + } + nh = (struct nlmsghdr *)buf; + } while (nh->nlmsg_seq != sn); +@@ -304,24 +309,30 @@ mlx5_nl_recv(int nlsk_fd, uint32_t sn, int (*cb)(struct nlmsghdr *, void *arg), + + if (err_data->error < 0) { + rte_errno = -err_data->error; +- return -rte_errno; ++ ret = -rte_errno; ++ goto exit; + } + /* Ack message. */ +- return 0; ++ ret = 0; ++ goto exit; + } + /* Multi-part msgs and their trailing DONE message. 
*/ + if (nh->nlmsg_flags & NLM_F_MULTI) { +- if (nh->nlmsg_type == NLMSG_DONE) +- return 0; ++ if (nh->nlmsg_type == NLMSG_DONE) { ++ ret = 0; ++ goto exit; ++ } + multipart = 1; + } + if (cb) { + ret = cb(nh, arg); + if (ret < 0) +- return ret; ++ goto exit; + } + } + } while (multipart); ++exit: ++ free(buf); + return ret; + } + +diff --git a/dpdk/drivers/net/mlx5/mlx5_prm.h b/dpdk/drivers/net/mlx5/mlx5_prm.h +index a805363757..4c86719769 100644 +--- a/dpdk/drivers/net/mlx5/mlx5_prm.h ++++ b/dpdk/drivers/net/mlx5/mlx5_prm.h +@@ -18,6 +18,8 @@ + #pragma GCC diagnostic error "-Wpedantic" + #endif + ++#include ++ + #include + #include "mlx5_autoconf.h" + +@@ -100,7 +102,7 @@ + */ + #define MLX5_EMPW_MAX_PACKETS MLX5_TX_COMP_THRESH + #define MLX5_MPW_MAX_PACKETS 6 +-#define MLX5_MPW_INLINE_MAX_PACKETS 2 ++#define MLX5_MPW_INLINE_MAX_PACKETS 6 + + /* + * Default packet length threshold to be inlined with +@@ -251,7 +253,7 @@ + #define MLX5_MAX_LOG_RQ_SEGS 5u + + /* The alignment needed for WQ buffer. */ +-#define MLX5_WQE_BUF_ALIGNMENT 512 ++#define MLX5_WQE_BUF_ALIGNMENT sysconf(_SC_PAGESIZE) + + /* Completion mode. */ + enum mlx5_completion_mode { +@@ -1196,7 +1198,9 @@ struct mlx5_ifc_qos_cap_bits { + u8 reserved_at_8[0x8]; + u8 log_max_flow_meter[0x8]; + u8 flow_meter_reg_id[0x8]; +- u8 reserved_at_25[0x20]; ++ u8 reserved_at_25[0x8]; ++ u8 flow_meter_reg_share[0x1]; ++ u8 reserved_at_2e[0x17]; + u8 packet_pacing_max_rate[0x20]; + u8 packet_pacing_min_rate[0x20]; + u8 reserved_at_80[0x10]; +@@ -1816,6 +1820,9 @@ enum { + #define MLX5_SRTCM_CIR_MAX (8 * (1ULL << 30) * 0xFF) + #define MLX5_SRTCM_EBS_MAX 0 + ++/* The bits meter color use. */ ++#define MLX5_MTR_COLOR_BITS 8 ++ + /** + * Convert a user mark to flow mark. + * +diff --git a/dpdk/drivers/net/mlx5/mlx5_rxq.c b/dpdk/drivers/net/mlx5/mlx5_rxq.c +index 986ec016df..2b6ab21b90 100644 +--- a/dpdk/drivers/net/mlx5/mlx5_rxq.c ++++ b/dpdk/drivers/net/mlx5/mlx5_rxq.c +@@ -36,6 +36,7 @@ + #include "mlx5_autoconf.h" + #include "mlx5_defs.h" + #include "mlx5_glue.h" ++#include "mlx5_flow.h" + + /* Default RSS hash key also used for ConnectX-3. */ + uint8_t rss_hash_default_key[] = { +@@ -1260,6 +1261,7 @@ mlx5_rxq_obj_hairpin_new(struct rte_eth_dev *dev, uint16_t idx) + struct mlx5_devx_create_rq_attr attr = { 0 }; + struct mlx5_rxq_obj *tmpl = NULL; + int ret = 0; ++ uint32_t max_wq_data; + + assert(rxq_data); + assert(!rxq_ctrl->obj); +@@ -1275,11 +1277,15 @@ mlx5_rxq_obj_hairpin_new(struct rte_eth_dev *dev, uint16_t idx) + tmpl->type = MLX5_RXQ_OBJ_TYPE_DEVX_HAIRPIN; + tmpl->rxq_ctrl = rxq_ctrl; + attr.hairpin = 1; +- /* Workaround for hairpin startup */ +- attr.wq_attr.log_hairpin_num_packets = log2above(32); +- /* Workaround for packets larger than 1KB */ ++ max_wq_data = priv->config.hca_attr.log_max_hairpin_wq_data_sz; ++ /* Jumbo frames > 9KB should be supported, and more packets. */ + attr.wq_attr.log_hairpin_data_sz = +- priv->config.hca_attr.log_max_hairpin_wq_data_sz; ++ (max_wq_data < MLX5_HAIRPIN_JUMBO_LOG_SIZE) ? ++ max_wq_data : MLX5_HAIRPIN_JUMBO_LOG_SIZE; ++ /* Set the packets number to the maximum value for performance. 
*/
++ attr.wq_attr.log_hairpin_num_packets =
++ attr.wq_attr.log_hairpin_data_sz -
++ MLX5_HAIRPIN_QUEUE_STRIDE;
+ tmpl->rq = mlx5_devx_cmd_create_rq(priv->sh->ctx, &attr,
+ rxq_ctrl->socket);
+ if (!tmpl->rq) {
+@@ -1762,9 +1768,10 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
+ struct mlx5_priv *priv = dev->data->dev_private;
+ struct mlx5_rxq_ctrl *tmpl;
+ unsigned int mb_len = rte_pktmbuf_data_room_size(mp);
++ unsigned int mprq_stride_nums;
+ unsigned int mprq_stride_size;
++ unsigned int mprq_stride_cap;
+ struct mlx5_dev_config *config = &priv->config;
+- unsigned int strd_headroom_en;
+ /*
+ * Always allocate extra slots, even if eventually
+ * the vector Rx will not be used.
+@@ -1810,42 +1817,42 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
+ tmpl->socket = socket;
+ if (dev->data->dev_conf.intr_conf.rxq)
+ tmpl->irq = 1;
+- /*
+- * LRO packet may consume all the stride memory, hence we cannot
+- * guaranty head-room near the packet memory in the stride.
+- * In this case scatter is, for sure, enabled and an empty mbuf may be
+- * added in the start for the head-room.
+- */
+- if (lro_on_queue && RTE_PKTMBUF_HEADROOM > 0 &&
+- non_scatter_min_mbuf_size > mb_len) {
+- strd_headroom_en = 0;
+- mprq_stride_size = RTE_MIN(max_rx_pkt_len,
+- 1u << config->mprq.max_stride_size_n);
+- } else {
+- strd_headroom_en = 1;
+- mprq_stride_size = non_scatter_min_mbuf_size;
+- }
++ mprq_stride_nums = config->mprq.stride_num_n ?
++ config->mprq.stride_num_n : MLX5_MPRQ_STRIDE_NUM_N;
++ mprq_stride_size = non_scatter_min_mbuf_size <=
++ (1U << config->mprq.max_stride_size_n) ?
++ log2above(non_scatter_min_mbuf_size) : MLX5_MPRQ_STRIDE_SIZE_N;
++ mprq_stride_cap = (config->mprq.stride_num_n ?
++ (1U << config->mprq.stride_num_n) : (1U << mprq_stride_nums)) *
++ (config->mprq.stride_size_n ?
++ (1U << config->mprq.stride_size_n) : (1U << mprq_stride_size));
+ /*
+ * This Rx queue can be configured as a Multi-Packet RQ if all of the
+ * following conditions are met:
+ * - MPRQ is enabled.
+ * - The number of descs is more than the number of strides.
+- * - max_rx_pkt_len plus overhead is less than the max size of a
+- * stride.
++ * - max_rx_pkt_len plus overhead is less than the max size
++ * of a stride or mprq_stride_size is specified by a user.
++ * Need to make sure that there are enough strides to encap
++ * the maximum packet size in case mprq_stride_size is set.
+ * Otherwise, enable Rx scatter if necessary.
+ */
+- if (mprq_en &&
+- desc > (1U << config->mprq.stride_num_n) &&
+- mprq_stride_size <= (1U << config->mprq.max_stride_size_n)) {
++ if (mprq_en && desc > (1U << mprq_stride_nums) &&
++ (non_scatter_min_mbuf_size <=
++ (1U << config->mprq.max_stride_size_n) ||
++ (config->mprq.stride_size_n &&
++ non_scatter_min_mbuf_size <= mprq_stride_cap))) {
+ /* TODO: Rx scatter isn't supported yet. */
+ tmpl->rxq.sges_n = 0;
+ /* Trim the number of descs needed. */
+- desc >>= config->mprq.stride_num_n;
+- tmpl->rxq.strd_num_n = config->mprq.stride_num_n;
+- tmpl->rxq.strd_sz_n = RTE_MAX(log2above(mprq_stride_size),
+- config->mprq.min_stride_size_n);
++ desc >>= mprq_stride_nums;
++ tmpl->rxq.strd_num_n = config->mprq.stride_num_n ?
++ config->mprq.stride_num_n : mprq_stride_nums;
++ tmpl->rxq.strd_sz_n = config->mprq.stride_size_n ? 
++ config->mprq.stride_size_n : mprq_stride_size; + tmpl->rxq.strd_shift_en = MLX5_MPRQ_TWO_BYTE_SHIFT; +- tmpl->rxq.strd_headroom_en = strd_headroom_en; ++ tmpl->rxq.strd_scatter_en = ++ !!(offloads & DEV_RX_OFFLOAD_SCATTER); + tmpl->rxq.mprq_max_memcpy_len = RTE_MIN(first_mb_free_size, + config->mprq.max_memcpy_len); + max_lro_size = RTE_MIN(max_rx_pkt_len, +@@ -1889,14 +1896,24 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc, + tmpl->rxq.sges_n = sges_n; + max_lro_size = max_rx_pkt_len; + } +- if (mprq_en && !mlx5_rxq_mprq_enabled(&tmpl->rxq)) ++ if (config->mprq.enabled && !mlx5_rxq_mprq_enabled(&tmpl->rxq)) + DRV_LOG(WARNING, +- "port %u MPRQ is requested but cannot be enabled" +- " (requested: desc = %u, stride_sz = %u," +- " supported: min_stride_num = %u, max_stride_sz = %u).", +- dev->data->port_id, desc, mprq_stride_size, +- (1 << config->mprq.stride_num_n), +- (1 << config->mprq.max_stride_size_n)); ++ "port %u MPRQ is requested but cannot be enabled\n" ++ " (requested: pkt_sz = %u, desc_num = %u," ++ " rxq_num = %u, stride_sz = %u, stride_num = %u\n" ++ " supported: min_rxqs_num = %u," ++ " min_stride_sz = %u, max_stride_sz = %u).", ++ dev->data->port_id, non_scatter_min_mbuf_size, ++ desc, priv->rxqs_n, ++ config->mprq.stride_size_n ? ++ (1U << config->mprq.stride_size_n) : ++ (1U << mprq_stride_size), ++ config->mprq.stride_num_n ? ++ (1U << config->mprq.stride_num_n) : ++ (1U << mprq_stride_nums), ++ config->mprq.min_rxqs_num, ++ (1U << config->mprq.min_stride_size_n), ++ (1U << config->mprq.max_stride_size_n)); + DRV_LOG(DEBUG, "port %u maximum number of segments per packet: %u", + dev->data->port_id, 1 << tmpl->rxq.sges_n); + if (desc % (1 << tmpl->rxq.sges_n)) { +@@ -2465,13 +2482,42 @@ mlx5_hrxq_new(struct rte_eth_dev *dev, + memset(&tir_attr, 0, sizeof(tir_attr)); + tir_attr.disp_type = MLX5_TIRC_DISP_TYPE_INDIRECT; + tir_attr.rx_hash_fn = MLX5_RX_HASH_FN_TOEPLITZ; +- memcpy(&tir_attr.rx_hash_field_selector_outer, &hash_fields, +- sizeof(uint64_t)); ++ tir_attr.tunneled_offload_en = !!tunnel; ++ /* If needed, translate hash_fields bitmap to PRM format. */ ++ if (hash_fields) { ++#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT ++ struct mlx5_rx_hash_field_select *rx_hash_field_select = ++ hash_fields & IBV_RX_HASH_INNER ? ++ &tir_attr.rx_hash_field_selector_inner : ++ &tir_attr.rx_hash_field_selector_outer; ++#else ++ struct mlx5_rx_hash_field_select *rx_hash_field_select = ++ &tir_attr.rx_hash_field_selector_outer; ++#endif ++ ++ /* 1 bit: 0: IPv4, 1: IPv6. */ ++ rx_hash_field_select->l3_prot_type = ++ !!(hash_fields & MLX5_IPV6_IBV_RX_HASH); ++ /* 1 bit: 0: TCP, 1: UDP. */ ++ rx_hash_field_select->l4_prot_type = ++ !!(hash_fields & MLX5_UDP_IBV_RX_HASH); ++ /* Bitmask which sets which fields to use in RX Hash. 
*/ ++ rx_hash_field_select->selected_fields = ++ ((!!(hash_fields & MLX5_L3_SRC_IBV_RX_HASH)) << ++ MLX5_RX_HASH_FIELD_SELECT_SELECTED_FIELDS_SRC_IP) | ++ (!!(hash_fields & MLX5_L3_DST_IBV_RX_HASH)) << ++ MLX5_RX_HASH_FIELD_SELECT_SELECTED_FIELDS_DST_IP | ++ (!!(hash_fields & MLX5_L4_SRC_IBV_RX_HASH)) << ++ MLX5_RX_HASH_FIELD_SELECT_SELECTED_FIELDS_L4_SPORT | ++ (!!(hash_fields & MLX5_L4_DST_IBV_RX_HASH)) << ++ MLX5_RX_HASH_FIELD_SELECT_SELECTED_FIELDS_L4_DPORT; ++ } + if (rxq_ctrl->obj->type == MLX5_RXQ_OBJ_TYPE_DEVX_HAIRPIN) + tir_attr.transport_domain = priv->sh->td->id; + else + tir_attr.transport_domain = priv->sh->tdn; +- memcpy(tir_attr.rx_hash_toeplitz_key, rss_key, rss_key_len); ++ memcpy(tir_attr.rx_hash_toeplitz_key, rss_key, ++ MLX5_RSS_HASH_KEY_LEN); + tir_attr.indirect_table = ind_tbl->rqt->id; + if (dev->data->dev_conf.lpbk_mode) + tir_attr.self_lb_block = +diff --git a/dpdk/drivers/net/mlx5/mlx5_rxtx.c b/dpdk/drivers/net/mlx5/mlx5_rxtx.c +index acf0fd794b..488a87f593 100644 +--- a/dpdk/drivers/net/mlx5/mlx5_rxtx.c ++++ b/dpdk/drivers/net/mlx5/mlx5_rxtx.c +@@ -654,10 +654,10 @@ check_err_cqe_seen(volatile struct mlx5_err_cqe *err_cqe) + * Pointer to the error CQE. + * + * @return +- * Negative value if queue recovery failed, +- * the last Tx buffer element to free otherwise. ++ * Negative value if queue recovery failed, otherwise ++ * the error completion entry is handled successfully. + */ +-int ++static int + mlx5_tx_error_cqe_handle(struct mlx5_txq_data *restrict txq, + volatile struct mlx5_err_cqe *err_cqe) + { +@@ -701,18 +701,14 @@ mlx5_tx_error_cqe_handle(struct mlx5_txq_data *restrict txq, + */ + txq->stats.oerrors += ((txq->wqe_ci & wqe_m) - + new_wqe_pi) & wqe_m; +- if (tx_recover_qp(txq_ctrl) == 0) { +- txq->cq_ci++; +- /* Release all the remaining buffers. */ +- return txq->elts_head; ++ if (tx_recover_qp(txq_ctrl)) { ++ /* Recovering failed - retry later on the same WQE. */ ++ return -1; + } +- /* Recovering failed - try again later on the same WQE. */ +- return -1; +- } else { +- txq->cq_ci++; ++ /* Release all the remaining buffers. */ ++ txq_free_elts(txq_ctrl); + } +- /* Do not release buffers. */ +- return txq->elts_tail; ++ return 0; + } + + /** +@@ -1253,9 +1249,10 @@ rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt, + pkt->hash.fdir.hi = mlx5_flow_mark_get(mark); + } + } +- if (rte_flow_dynf_metadata_avail() && cqe->flow_table_metadata) { +- pkt->ol_flags |= PKT_RX_DYNF_METADATA; +- *RTE_FLOW_DYNF_METADATA(pkt) = cqe->flow_table_metadata; ++ if (rxq->dynf_meta && cqe->flow_table_metadata) { ++ pkt->ol_flags |= rxq->flow_meta_mask; ++ *RTE_MBUF_DYNFIELD(pkt, rxq->flow_meta_offset, uint32_t *) = ++ cqe->flow_table_metadata; + } + if (rxq->csum) + pkt->ol_flags |= rxq_cq_to_ol_flags(cqe); +@@ -1574,21 +1571,20 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) + unsigned int i = 0; + uint32_t rq_ci = rxq->rq_ci; + uint16_t consumed_strd = rxq->consumed_strd; +- uint16_t headroom_sz = rxq->strd_headroom_en * RTE_PKTMBUF_HEADROOM; + struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[rq_ci & wq_mask]; + + while (i < pkts_n) { + struct rte_mbuf *pkt; + void *addr; + int ret; +- unsigned int len; ++ uint32_t len; + uint16_t strd_cnt; + uint16_t strd_idx; + uint32_t offset; + uint32_t byte_cnt; ++ int32_t hdrm_overlap; + volatile struct mlx5_mini_cqe8 *mcqe = NULL; + uint32_t rss_hash_res = 0; +- uint8_t lro_num_seg; + + if (consumed_strd == strd_n) { + /* Replace WQE only if the buffer is still in use. 
*/ +@@ -1634,18 +1630,6 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) + } + assert(strd_idx < strd_n); + assert(!((rte_be_to_cpu_16(cqe->wqe_id) ^ rq_ci) & wq_mask)); +- lro_num_seg = cqe->lro_num_seg; +- /* +- * Currently configured to receive a packet per a stride. But if +- * MTU is adjusted through kernel interface, device could +- * consume multiple strides without raising an error. In this +- * case, the packet should be dropped because it is bigger than +- * the max_rx_pkt_len. +- */ +- if (unlikely(!lro_num_seg && strd_cnt > 1)) { +- ++rxq->stats.idropped; +- continue; +- } + pkt = rte_pktmbuf_alloc(rxq->mp); + if (unlikely(pkt == NULL)) { + ++rxq->stats.rx_nombuf; +@@ -1657,23 +1641,57 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) + len -= RTE_ETHER_CRC_LEN; + offset = strd_idx * strd_sz + strd_shift; + addr = RTE_PTR_ADD(mlx5_mprq_buf_addr(buf, strd_n), offset); ++ hdrm_overlap = len + RTE_PKTMBUF_HEADROOM - strd_cnt * strd_sz; + /* + * Memcpy packets to the target mbuf if: + * - The size of packet is smaller than mprq_max_memcpy_len. + * - Out of buffer in the Mempool for Multi-Packet RQ. ++ * - The packet's stride overlaps a headroom and scatter is off. + */ +- if (len <= rxq->mprq_max_memcpy_len || rxq->mprq_repl == NULL) { +- /* +- * When memcpy'ing packet due to out-of-buffer, the +- * packet must be smaller than the target mbuf. +- */ +- if (unlikely(rte_pktmbuf_tailroom(pkt) < len)) { ++ if (len <= rxq->mprq_max_memcpy_len || ++ rxq->mprq_repl == NULL || ++ (hdrm_overlap > 0 && !rxq->strd_scatter_en)) { ++ if (likely(rte_pktmbuf_tailroom(pkt) >= len)) { ++ rte_memcpy(rte_pktmbuf_mtod(pkt, void *), ++ addr, len); ++ DATA_LEN(pkt) = len; ++ } else if (rxq->strd_scatter_en) { ++ struct rte_mbuf *prev = pkt; ++ uint32_t seg_len = ++ RTE_MIN(rte_pktmbuf_tailroom(pkt), len); ++ uint32_t rem_len = len - seg_len; ++ ++ rte_memcpy(rte_pktmbuf_mtod(pkt, void *), ++ addr, seg_len); ++ DATA_LEN(pkt) = seg_len; ++ while (rem_len) { ++ struct rte_mbuf *next = ++ rte_pktmbuf_alloc(rxq->mp); ++ ++ if (unlikely(next == NULL)) { ++ rte_pktmbuf_free(pkt); ++ ++rxq->stats.rx_nombuf; ++ goto out; ++ } ++ NEXT(prev) = next; ++ SET_DATA_OFF(next, 0); ++ addr = RTE_PTR_ADD(addr, seg_len); ++ seg_len = RTE_MIN ++ (rte_pktmbuf_tailroom(next), ++ rem_len); ++ rte_memcpy ++ (rte_pktmbuf_mtod(next, void *), ++ addr, seg_len); ++ DATA_LEN(next) = seg_len; ++ rem_len -= seg_len; ++ prev = next; ++ ++NB_SEGS(pkt); ++ } ++ } else { + rte_pktmbuf_free_seg(pkt); + ++rxq->stats.idropped; + continue; + } +- rte_memcpy(rte_pktmbuf_mtod(pkt, void *), addr, len); +- DATA_LEN(pkt) = len; + } else { + rte_iova_t buf_iova; + struct rte_mbuf_ext_shared_info *shinfo; +@@ -1684,7 +1702,7 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) + rte_atomic16_add_return(&buf->refcnt, 1); + assert((uint16_t)rte_atomic16_read(&buf->refcnt) <= + strd_n + 1); +- buf_addr = RTE_PTR_SUB(addr, headroom_sz); ++ buf_addr = RTE_PTR_SUB(addr, RTE_PKTMBUF_HEADROOM); + /* + * MLX5 device doesn't use iova but it is necessary in a + * case where the Rx packet is transmitted via a +@@ -1703,43 +1721,42 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) + rte_pktmbuf_attach_extbuf(pkt, buf_addr, buf_iova, + buf_len, shinfo); + /* Set mbuf head-room. 
*/ +- pkt->data_off = headroom_sz; ++ SET_DATA_OFF(pkt, RTE_PKTMBUF_HEADROOM); + assert(pkt->ol_flags == EXT_ATTACHED_MBUF); +- /* +- * Prevent potential overflow due to MTU change through +- * kernel interface. +- */ +- if (unlikely(rte_pktmbuf_tailroom(pkt) < len)) { +- rte_pktmbuf_free_seg(pkt); +- ++rxq->stats.idropped; +- continue; +- } ++ assert(rte_pktmbuf_tailroom(pkt) >= ++ len - (hdrm_overlap > 0 ? hdrm_overlap : 0)); + DATA_LEN(pkt) = len; + /* +- * LRO packet may consume all the stride memory, in this +- * case packet head-room space is not guaranteed so must +- * to add an empty mbuf for the head-room. ++ * Copy the last fragment of a packet (up to headroom ++ * size bytes) in case there is a stride overlap with ++ * a next packet's headroom. Allocate a separate mbuf ++ * to store this fragment and link it. Scatter is on. + */ +- if (!rxq->strd_headroom_en) { +- struct rte_mbuf *headroom_mbuf = +- rte_pktmbuf_alloc(rxq->mp); ++ if (hdrm_overlap > 0) { ++ assert(rxq->strd_scatter_en); ++ struct rte_mbuf *seg = ++ rte_pktmbuf_alloc(rxq->mp); + +- if (unlikely(headroom_mbuf == NULL)) { ++ if (unlikely(seg == NULL)) { + rte_pktmbuf_free_seg(pkt); + ++rxq->stats.rx_nombuf; + break; + } +- PORT(pkt) = rxq->port_id; +- NEXT(headroom_mbuf) = pkt; +- pkt = headroom_mbuf; ++ SET_DATA_OFF(seg, 0); ++ rte_memcpy(rte_pktmbuf_mtod(seg, void *), ++ RTE_PTR_ADD(addr, len - hdrm_overlap), ++ hdrm_overlap); ++ DATA_LEN(seg) = hdrm_overlap; ++ DATA_LEN(pkt) = len - hdrm_overlap; ++ NEXT(pkt) = seg; + NB_SEGS(pkt) = 2; + } + } + rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res); +- if (lro_num_seg > 1) { ++ if (cqe->lro_num_seg > 1) { + mlx5_lro_update_hdr(addr, cqe, len); + pkt->ol_flags |= PKT_RX_LRO; +- pkt->tso_segsz = strd_sz; ++ pkt->tso_segsz = len / cqe->lro_num_seg; + } + PKT_LEN(pkt) = len; + PORT(pkt) = rxq->port_id; +@@ -1751,6 +1768,7 @@ mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n) + *(pkts++) = pkt; + ++i; + } ++out: + /* Update the consumer indexes. */ + rxq->consumed_strd = consumed_strd; + rte_cio_wmb(); +@@ -2034,8 +2052,6 @@ mlx5_tx_copy_elts(struct mlx5_txq_data *restrict txq, + * Pointer to TX queue structure. + * @param valid CQE pointer + * if not NULL update txq->wqe_pi and flush the buffers +- * @param itail +- * if not negative - flush the buffers till this index. + * @param olx + * Configured Tx offloads mask. It is fully defined at + * compile time and may be used for optimization. 
+@@ -2043,25 +2059,17 @@ mlx5_tx_copy_elts(struct mlx5_txq_data *restrict txq, + static __rte_always_inline void + mlx5_tx_comp_flush(struct mlx5_txq_data *restrict txq, + volatile struct mlx5_cqe *last_cqe, +- int itail, + unsigned int olx __rte_unused) + { +- uint16_t tail; +- + if (likely(last_cqe != NULL)) { ++ uint16_t tail; ++ + txq->wqe_pi = rte_be_to_cpu_16(last_cqe->wqe_counter); +- tail = ((volatile struct mlx5_wqe_cseg *) +- (txq->wqes + (txq->wqe_pi & txq->wqe_m)))->misc; +- } else if (itail >= 0) { +- tail = (uint16_t)itail; +- } else { +- return; +- } +- rte_compiler_barrier(); +- *txq->cq_db = rte_cpu_to_be_32(txq->cq_ci); +- if (likely(tail != txq->elts_tail)) { +- mlx5_tx_free_elts(txq, tail, olx); +- assert(tail == txq->elts_tail); ++ tail = txq->fcqs[(txq->cq_ci - 1) & txq->cqe_m]; ++ if (likely(tail != txq->elts_tail)) { ++ mlx5_tx_free_elts(txq, tail, olx); ++ assert(tail == txq->elts_tail); ++ } + } + } + +@@ -2085,6 +2093,7 @@ mlx5_tx_handle_completion(struct mlx5_txq_data *restrict txq, + { + unsigned int count = MLX5_TX_COMP_MAX_CQE; + volatile struct mlx5_cqe *last_cqe = NULL; ++ bool ring_doorbell = false; + int ret; + + static_assert(MLX5_CQE_STATUS_HW_OWN < 0, "Must be negative value"); +@@ -2109,31 +2118,49 @@ mlx5_tx_handle_completion(struct mlx5_txq_data *restrict txq, + rte_wmb(); + ret = mlx5_tx_error_cqe_handle + (txq, (volatile struct mlx5_err_cqe *)cqe); ++ if (unlikely(ret < 0)) { ++ /* ++ * Some error occurred on queue error ++ * handling, we do not advance the index ++ * here, allowing to retry on next call. ++ */ ++ return; ++ } + /* +- * Flush buffers, update consuming index +- * if recovery succeeded. Otherwise +- * just try to recover later. ++ * We are going to fetch all entries with ++ * MLX5_CQE_SYNDROME_WR_FLUSH_ERR status. ++ * The send queue is supposed to be empty. + */ ++ ring_doorbell = true; ++ ++txq->cq_ci; ++ txq->cq_pi = txq->cq_ci; + last_cqe = NULL; +- break; ++ continue; + } + /* Normal transmit completion. */ ++ assert(txq->cq_ci != txq->cq_pi); ++ assert((txq->fcqs[txq->cq_ci & txq->cqe_m] >> 16) == ++ cqe->wqe_counter); ++ ring_doorbell = true; + ++txq->cq_ci; + last_cqe = cqe; +-#ifndef NDEBUG +- if (txq->cq_pi) +- --txq->cq_pi; +-#endif +- /* +- * We have to restrict the amount of processed CQEs +- * in one tx_burst routine call. The CQ may be large +- * and many CQEs may be updated by the NIC in one +- * transaction. Buffers freeing is time consuming, +- * multiple iterations may introduce significant +- * latency. +- */ +- } while (--count); +- mlx5_tx_comp_flush(txq, last_cqe, ret, olx); ++ /* ++ * We have to restrict the amount of processed CQEs ++ * in one tx_burst routine call. The CQ may be large ++ * and many CQEs may be updated by the NIC in one ++ * transaction. Buffers freeing is time consuming, ++ * multiple iterations may introduce significant ++ * latency. ++ */ ++ if (likely(--count == 0)) ++ break; ++ } while (true); ++ if (likely(ring_doorbell)) { ++ /* Ring doorbell to notify hardware. */ ++ rte_compiler_barrier(); ++ *txq->cq_db = rte_cpu_to_be_32(txq->cq_ci); ++ mlx5_tx_comp_flush(txq, last_cqe, olx); ++ } + } + + /** +@@ -2145,9 +2172,6 @@ mlx5_tx_handle_completion(struct mlx5_txq_data *restrict txq, + * Pointer to TX queue structure. + * @param loc + * Pointer to burst routine local context. +- * @param multi, +- * Routine is called from multi-segment sending loop, +- * do not correct the elts_head according to the pkts_copy. + * @param olx + * Configured Tx offloads mask. 
It is fully defined at + * compile time and may be used for optimization. +@@ -2155,13 +2179,12 @@ mlx5_tx_handle_completion(struct mlx5_txq_data *restrict txq, + static __rte_always_inline void + mlx5_tx_request_completion(struct mlx5_txq_data *restrict txq, + struct mlx5_txq_local *restrict loc, +- bool multi, + unsigned int olx) + { + uint16_t head = txq->elts_head; + unsigned int part; + +- part = (MLX5_TXOFF_CONFIG(INLINE) || multi) ? ++ part = MLX5_TXOFF_CONFIG(INLINE) ? + 0 : loc->pkts_sent - loc->pkts_copy; + head += part; + if ((uint16_t)(head - txq->elts_comp) >= MLX5_TX_COMP_THRESH || +@@ -2175,15 +2198,15 @@ mlx5_tx_request_completion(struct mlx5_txq_data *restrict txq, + /* Request unconditional completion on last WQE. */ + last->cseg.flags = RTE_BE32(MLX5_COMP_ALWAYS << + MLX5_COMP_MODE_OFFSET); +- /* Save elts_head in unused "immediate" field of WQE. */ +- last->cseg.misc = head; +- /* +- * A CQE slot must always be available. Count the +- * issued CEQ "always" request instead of production +- * index due to here can be CQE with errors and +- * difference with ci may become inconsistent. +- */ +- assert(txq->cqe_s > ++txq->cq_pi); ++ /* Save elts_head in dedicated free on completion queue. */ ++#ifdef NDEBUG ++ txq->fcqs[txq->cq_pi++ & txq->cqe_m] = head; ++#else ++ txq->fcqs[txq->cq_pi++ & txq->cqe_m] = head | ++ (last->cseg.opcode >> 8) << 16; ++#endif ++ /* A CQE slot must always be available. */ ++ assert((txq->cq_pi - txq->cq_ci) <= txq->cqe_s); + } + } + +@@ -2818,8 +2841,14 @@ mlx5_tx_dseg_empw(struct mlx5_txq_data *restrict txq, + unsigned int part; + uint8_t *pdst; + +- dseg->bcount = rte_cpu_to_be_32(len | MLX5_ETH_WQE_DATA_INLINE); +- pdst = &dseg->inline_data[0]; ++ if (!MLX5_TXOFF_CONFIG(MPW)) { ++ /* Store the descriptor byte counter for eMPW sessions. */ ++ dseg->bcount = rte_cpu_to_be_32(len | MLX5_ETH_WQE_DATA_INLINE); ++ pdst = &dseg->inline_data[0]; ++ } else { ++ /* The entire legacy MPW session counter is stored on close. */ ++ pdst = (uint8_t *)dseg; ++ } + /* + * The WQEBB space availability is checked by caller. + * Here we should be aware of WQE ring buffer wraparound only. +@@ -2831,7 +2860,8 @@ mlx5_tx_dseg_empw(struct mlx5_txq_data *restrict txq, + len -= part; + if (likely(!len)) { + pdst += part; +- pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE); ++ if (!MLX5_TXOFF_CONFIG(MPW)) ++ pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE); + /* Note: no final wraparound check here. */ + return (struct mlx5_wqe_dseg *)pdst; + } +@@ -2879,9 +2909,16 @@ mlx5_tx_dseg_vlan(struct mlx5_txq_data *restrict txq, + static_assert(MLX5_DSEG_MIN_INLINE_SIZE == + (2 * RTE_ETHER_ADDR_LEN), + "invalid Data Segment data size"); +- dseg->bcount = rte_cpu_to_be_32((len + sizeof(struct rte_vlan_hdr)) | +- MLX5_ETH_WQE_DATA_INLINE); +- pdst = &dseg->inline_data[0]; ++ if (!MLX5_TXOFF_CONFIG(MPW)) { ++ /* Store the descriptor byte counter for eMPW sessions. */ ++ dseg->bcount = rte_cpu_to_be_32 ++ ((len + sizeof(struct rte_vlan_hdr)) | ++ MLX5_ETH_WQE_DATA_INLINE); ++ pdst = &dseg->inline_data[0]; ++ } else { ++ /* The entire legacy MPW session counter is stored on close. 
*/ ++ pdst = (uint8_t *)dseg; ++ } + memcpy(pdst, buf, MLX5_DSEG_MIN_INLINE_SIZE); + buf += MLX5_DSEG_MIN_INLINE_SIZE; + pdst += MLX5_DSEG_MIN_INLINE_SIZE; +@@ -2904,7 +2941,8 @@ mlx5_tx_dseg_vlan(struct mlx5_txq_data *restrict txq, + len -= part; + if (likely(!len)) { + pdst += part; +- pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE); ++ if (!MLX5_TXOFF_CONFIG(MPW)) ++ pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE); + /* Note: no final wraparound check here. */ + return (struct mlx5_wqe_dseg *)pdst; + } +@@ -3120,8 +3158,6 @@ mlx5_tx_packet_multi_tso(struct mlx5_txq_data *restrict txq, + wqe->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds); + txq->wqe_ci += (ds + 3) / 4; + loc->wqe_free -= (ds + 3) / 4; +- /* Request CQE generation if limits are reached. */ +- mlx5_tx_request_completion(txq, loc, true, olx); + return MLX5_TXCMP_CODE_MULTI; + } + +@@ -3230,8 +3266,6 @@ mlx5_tx_packet_multi_send(struct mlx5_txq_data *restrict txq, + } while (true); + txq->wqe_ci += (ds + 3) / 4; + loc->wqe_free -= (ds + 3) / 4; +- /* Request CQE generation if limits are reached. */ +- mlx5_tx_request_completion(txq, loc, true, olx); + return MLX5_TXCMP_CODE_MULTI; + } + +@@ -3388,8 +3422,6 @@ mlx5_tx_packet_multi_inline(struct mlx5_txq_data *restrict txq, + wqe->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds); + txq->wqe_ci += (ds + 3) / 4; + loc->wqe_free -= (ds + 3) / 4; +- /* Request CQE generation if limits are reached. */ +- mlx5_tx_request_completion(txq, loc, true, olx); + return MLX5_TXCMP_CODE_MULTI; + } + +@@ -3599,8 +3631,6 @@ mlx5_tx_burst_tso(struct mlx5_txq_data *restrict txq, + --loc->elts_free; + ++loc->pkts_sent; + --pkts_n; +- /* Request CQE generation if limits are reached. */ +- mlx5_tx_request_completion(txq, loc, false, olx); + if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free)) + return MLX5_TXCMP_CODE_EXIT; + loc->mbuf = *pkts++; +@@ -3750,7 +3780,7 @@ mlx5_tx_sdone_empw(struct mlx5_txq_data *restrict txq, + struct mlx5_txq_local *restrict loc, + unsigned int ds, + unsigned int slen, +- unsigned int olx) ++ unsigned int olx __rte_unused) + { + assert(!MLX5_TXOFF_CONFIG(INLINE)); + #ifdef MLX5_PMD_SOFT_COUNTERS +@@ -3765,8 +3795,6 @@ mlx5_tx_sdone_empw(struct mlx5_txq_data *restrict txq, + loc->wqe_last->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds); + txq->wqe_ci += (ds + 3) / 4; + loc->wqe_free -= (ds + 3) / 4; +- /* Request CQE generation if limits are reached. */ +- mlx5_tx_request_completion(txq, loc, false, olx); + } + + /* +@@ -3797,20 +3825,36 @@ mlx5_tx_idone_empw(struct mlx5_txq_data *restrict txq, + unsigned int slen, + unsigned int olx __rte_unused) + { ++ struct mlx5_wqe_dseg *dseg = &loc->wqe_last->dseg[0]; ++ + assert(MLX5_TXOFF_CONFIG(INLINE)); +- assert((len % MLX5_WSEG_SIZE) == 0); + #ifdef MLX5_PMD_SOFT_COUNTERS + /* Update sent data bytes counter. */ + txq->stats.obytes += slen; + #else + (void)slen; + #endif +- len = len / MLX5_WSEG_SIZE + 2; ++ if (MLX5_TXOFF_CONFIG(MPW) && dseg->bcount == RTE_BE32(0)) { ++ /* ++ * If the legacy MPW session contains the inline packets ++ * we should set the only inline data segment length ++ * and align the total length to the segment size. ++ */ ++ assert(len > sizeof(dseg->bcount)); ++ dseg->bcount = rte_cpu_to_be_32((len - sizeof(dseg->bcount)) | ++ MLX5_ETH_WQE_DATA_INLINE); ++ len = (len + MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE + 2; ++ } else { ++ /* ++ * The session is not legacy MPW or contains the ++ * data buffer pointer segments. 
++ */ ++ assert((len % MLX5_WSEG_SIZE) == 0); ++ len = len / MLX5_WSEG_SIZE + 2; ++ } + loc->wqe_last->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | len); + txq->wqe_ci += (len + 3) / 4; + loc->wqe_free -= (len + 3) / 4; +- /* Request CQE generation if limits are reached. */ +- mlx5_tx_request_completion(txq, loc, false, olx); + } + + /** +@@ -4011,8 +4055,6 @@ mlx5_tx_burst_empw_simple(struct mlx5_txq_data *restrict txq, + txq->wqe_ci += (2 + part + 3) / 4; + loc->wqe_free -= (2 + part + 3) / 4; + pkts_n -= part; +- /* Request CQE generation if limits are reached. */ +- mlx5_tx_request_completion(txq, loc, false, olx); + if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free)) + return MLX5_TXCMP_CODE_EXIT; + loc->mbuf = *pkts++; +@@ -4088,6 +4130,15 @@ mlx5_tx_burst_empw_inline(struct mlx5_txq_data *restrict txq, + loc->wqe_free) * MLX5_WQE_SIZE - + MLX5_WQE_CSEG_SIZE - + MLX5_WQE_ESEG_SIZE; ++ /* Limit the room for legacy MPW sessions for performance. */ ++ if (MLX5_TXOFF_CONFIG(MPW)) ++ room = RTE_MIN(room, ++ RTE_MAX(txq->inlen_empw + ++ sizeof(dseg->bcount) + ++ (MLX5_TXOFF_CONFIG(VLAN) ? ++ sizeof(struct rte_vlan_hdr) : 0), ++ MLX5_MPW_INLINE_MAX_PACKETS * ++ MLX5_WQE_DSEG_SIZE)); + /* Build WQE till we have space, packets and resources. */ + part = room; + for (;;) { +@@ -4117,8 +4168,28 @@ mlx5_tx_burst_empw_inline(struct mlx5_txq_data *restrict txq, + /* Inline or not inline - that's the Question. */ + if (dlen > txq->inlen_empw) + goto pointer_empw; ++ if (MLX5_TXOFF_CONFIG(MPW)) { ++ if (dlen > txq->inlen_send) ++ goto pointer_empw; ++ tlen = dlen; ++ if (part == room) { ++ /* Open new inline MPW session. */ ++ tlen += sizeof(dseg->bcount); ++ dseg->bcount = RTE_BE32(0); ++ dseg = RTE_PTR_ADD ++ (dseg, sizeof(dseg->bcount)); ++ } else { ++ /* ++ * No pointer and inline descriptor ++ * intermix for legacy MPW sessions. ++ */ ++ if (loc->wqe_last->dseg[0].bcount) ++ break; ++ } ++ } else { ++ tlen = sizeof(dseg->bcount) + dlen; ++ } + /* Inline entire packet, optional VLAN insertion. */ +- tlen = sizeof(dseg->bcount) + dlen; + if (MLX5_TXOFF_CONFIG(VLAN) && + loc->mbuf->ol_flags & PKT_TX_VLAN_PKT) { + /* +@@ -4143,7 +4214,8 @@ mlx5_tx_burst_empw_inline(struct mlx5_txq_data *restrict txq, + dseg = mlx5_tx_dseg_empw(txq, loc, dseg, + dptr, dlen, olx); + } +- tlen = RTE_ALIGN(tlen, MLX5_WSEG_SIZE); ++ if (!MLX5_TXOFF_CONFIG(MPW)) ++ tlen = RTE_ALIGN(tlen, MLX5_WSEG_SIZE); + assert(room >= tlen); + room -= tlen; + /* +@@ -4153,6 +4225,14 @@ mlx5_tx_burst_empw_inline(struct mlx5_txq_data *restrict txq, + rte_pktmbuf_free_seg(loc->mbuf); + goto next_mbuf; + pointer_empw: ++ /* ++ * No pointer and inline descriptor ++ * intermix for legacy MPW sessions. ++ */ ++ if (MLX5_TXOFF_CONFIG(MPW) && ++ part != room && ++ loc->wqe_last->dseg[0].bcount == RTE_BE32(0)) ++ break; + /* + * Not inlinable VLAN packets are + * proceeded outside of this routine. +@@ -4496,8 +4576,6 @@ mlx5_tx_burst_single_send(struct mlx5_txq_data *restrict txq, + } + ++loc->pkts_sent; + --pkts_n; +- /* Request CQE generation if limits are reached. */ +- mlx5_tx_request_completion(txq, loc, false, olx); + if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free)) + return MLX5_TXCMP_CODE_EXIT; + loc->mbuf = *pkts++; +@@ -4596,7 +4674,7 @@ mlx5_tx_burst_tmpl(struct mlx5_txq_data *restrict txq, + /* + * Calculate the number of available resources - elts and WQEs. 
+ * There are two possible different scenarios: +- * - no data inlining into WQEs, one WQEBB may contains upto ++ * - no data inlining into WQEs, one WQEBB may contains up to + * four packets, in this case elts become scarce resource + * - data inlining into WQEs, one packet may require multiple + * WQEBBs, the WQEs become the limiting factor. +@@ -4776,6 +4854,8 @@ mlx5_tx_burst_tmpl(struct mlx5_txq_data *restrict txq, + /* Take a shortcut if nothing is sent. */ + if (unlikely(loc.pkts_sent == loc.pkts_loop)) + goto burst_exit; ++ /* Request CQE generation if limits are reached. */ ++ mlx5_tx_request_completion(txq, &loc, olx); + /* + * Ring QP doorbell immediately after WQE building completion + * to improve latencies. The pure software related data treatment +@@ -4977,7 +5057,7 @@ MLX5_TXOFF_DECL(iv, + + /* + * Generate routines with Legacy Multi-Packet Write support. +- * This mode is supported by ConnectX-4LX only and imposes ++ * This mode is supported by ConnectX-4 Lx only and imposes + * offload limitations, not supported: + * - ACL/Flows (metadata are becoming meaningless) + * - WQE Inline headers +@@ -4995,6 +5075,10 @@ MLX5_TXOFF_DECL(mci_mpw, + MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_EMPW | + MLX5_TXOFF_CONFIG_MPW) + ++MLX5_TXOFF_DECL(mc_mpw, ++ MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_CSUM | ++ MLX5_TXOFF_CONFIG_EMPW | MLX5_TXOFF_CONFIG_MPW) ++ + MLX5_TXOFF_DECL(i_mpw, + MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_EMPW | + MLX5_TXOFF_CONFIG_MPW) +@@ -5151,6 +5235,10 @@ MLX5_TXOFF_INFO(mci_mpw, + MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_EMPW | + MLX5_TXOFF_CONFIG_MPW) + ++MLX5_TXOFF_INFO(mc_mpw, ++ MLX5_TXOFF_CONFIG_MULTI | MLX5_TXOFF_CONFIG_CSUM | ++ MLX5_TXOFF_CONFIG_EMPW | MLX5_TXOFF_CONFIG_MPW) ++ + MLX5_TXOFF_INFO(i_mpw, + MLX5_TXOFF_CONFIG_INLINE | MLX5_TXOFF_CONFIG_EMPW | + MLX5_TXOFF_CONFIG_MPW) +diff --git a/dpdk/drivers/net/mlx5/mlx5_rxtx.h b/dpdk/drivers/net/mlx5/mlx5_rxtx.h +index e927343f7d..a50f057c1e 100644 +--- a/dpdk/drivers/net/mlx5/mlx5_rxtx.h ++++ b/dpdk/drivers/net/mlx5/mlx5_rxtx.h +@@ -114,9 +114,9 @@ struct mlx5_rxq_data { + unsigned int strd_sz_n:4; /* Log 2 of stride size. */ + unsigned int strd_shift_en:1; /* Enable 2bytes shift on a stride. */ + unsigned int err_state:2; /* enum mlx5_rxq_err_state. */ +- unsigned int strd_headroom_en:1; /* Enable mbuf headroom in MPRQ. */ ++ unsigned int strd_scatter_en:1; /* Scattered packets from a stride. */ + unsigned int lro:1; /* Enable LRO. */ +- unsigned int :1; /* Remaining bits. */ ++ unsigned int dynf_meta:1; /* Dynamic metadata is configured. */ + volatile uint32_t *rq_db; + volatile uint32_t *cq_db; + uint16_t port_id; +@@ -154,6 +154,8 @@ struct mlx5_rxq_data { + /* CQ (UAR) access lock required for 32bit implementations */ + #endif + uint32_t tunnel; /* Tunnel information. */ ++ uint64_t flow_meta_mask; ++ int32_t flow_meta_offset; + } __rte_cache_aligned; + + enum mlx5_rxq_obj_type { +@@ -273,9 +275,7 @@ struct mlx5_txq_data { + uint16_t wqe_thres; /* WQE threshold to request completion in CQ. */ + /* WQ related fields. */ + uint16_t cq_ci; /* Consumer index for completion queue. */ +-#ifndef NDEBUG +- uint16_t cq_pi; /* Counter of issued CQE "always" requests. */ +-#endif ++ uint16_t cq_pi; /* Production index for completion queue. */ + uint16_t cqe_s; /* Number of CQ elements. */ + uint16_t cqe_m; /* Mask for CQ indices. */ + /* CQ related fields. */ +@@ -297,6 +297,11 @@ struct mlx5_txq_data { + struct mlx5_mr_ctrl mr_ctrl; /* MR control descriptor. 
*/ + struct mlx5_wqe *wqes; /* Work queue. */ + struct mlx5_wqe *wqes_end; /* Work queue array limit. */ ++#ifdef NDEBUG ++ uint16_t *fcqs; /* Free completion queue. */ ++#else ++ uint32_t *fcqs; /* Free completion queue (debug extended). */ ++#endif + volatile struct mlx5_cqe *cqes; /* Completion queue. */ + volatile uint32_t *qp_db; /* Work queue doorbell. */ + volatile uint32_t *cq_db; /* Completion queue doorbell. */ +@@ -440,6 +445,7 @@ int mlx5_txq_release(struct rte_eth_dev *dev, uint16_t idx); + int mlx5_txq_releasable(struct rte_eth_dev *dev, uint16_t idx); + int mlx5_txq_verify(struct rte_eth_dev *dev); + void txq_alloc_elts(struct mlx5_txq_ctrl *txq_ctrl); ++void txq_free_elts(struct mlx5_txq_ctrl *txq_ctrl); + uint64_t mlx5_get_tx_port_offloads(struct rte_eth_dev *dev); + + /* mlx5_rxtx.c */ +@@ -451,9 +457,6 @@ extern uint8_t mlx5_swp_types_table[]; + void mlx5_set_ptype_table(void); + void mlx5_set_cksum_table(void); + void mlx5_set_swp_types_table(void); +-__rte_noinline int mlx5_tx_error_cqe_handle +- (struct mlx5_txq_data *restrict txq, +- volatile struct mlx5_err_cqe *err_cqe); + uint16_t mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n); + void mlx5_rxq_initialize(struct mlx5_rxq_data *rxq); + __rte_noinline int mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t vec); +diff --git a/dpdk/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h b/dpdk/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h +index 8e79883dfe..feb17fe1ce 100644 +--- a/dpdk/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h ++++ b/dpdk/drivers/net/mlx5/mlx5_rxtx_vec_altivec.h +@@ -11,7 +11,7 @@ + #include + #include + +-#include ++#include + + #include + #include +@@ -263,6 +263,25 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq, + elts[pos + 2]->hash.fdir.hi = flow_tag; + elts[pos + 3]->hash.fdir.hi = flow_tag; + } ++ if (rxq->dynf_meta) { ++ int32_t offs = rxq->flow_meta_offset; ++ const uint32_t meta = ++ *RTE_MBUF_DYNFIELD(t_pkt, offs, uint32_t *); ++ ++ /* Check if title packet has valid metadata. */ ++ if (meta) { ++ assert(t_pkt->ol_flags & ++ rxq->flow_meta_mask); ++ *RTE_MBUF_DYNFIELD(elts[pos], offs, ++ uint32_t *) = meta; ++ *RTE_MBUF_DYNFIELD(elts[pos + 1], offs, ++ uint32_t *) = meta; ++ *RTE_MBUF_DYNFIELD(elts[pos + 2], offs, ++ uint32_t *) = meta; ++ *RTE_MBUF_DYNFIELD(elts[pos + 3], offs, ++ uint32_t *) = meta; ++ } ++ } + + pos += MLX5_VPMD_DESCS_PER_LOOP; + /* Move to next CQE and invalidate consumed CQEs. */ +@@ -1010,9 +1029,9 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n, + pkts[pos + 3]->timestamp = + rte_be_to_cpu_64(cq[pos + p3].timestamp); + } +- if (rte_flow_dynf_metadata_avail()) { +- uint64_t flag = rte_flow_dynf_metadata_mask; +- int offs = rte_flow_dynf_metadata_offs; ++ if (rxq->dynf_meta) { ++ uint64_t flag = rxq->flow_meta_mask; ++ int32_t offs = rxq->flow_meta_offset; + uint32_t metadata; + + /* This code is subject for futher optimization. 
*/ +diff --git a/dpdk/drivers/net/mlx5/mlx5_rxtx_vec_neon.h b/dpdk/drivers/net/mlx5/mlx5_rxtx_vec_neon.h +index 86785c7496..f92ece4299 100644 +--- a/dpdk/drivers/net/mlx5/mlx5_rxtx_vec_neon.h ++++ b/dpdk/drivers/net/mlx5/mlx5_rxtx_vec_neon.h +@@ -205,6 +205,25 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq, + elts[pos + 2]->hash.fdir.hi = flow_tag; + elts[pos + 3]->hash.fdir.hi = flow_tag; + } ++ if (rxq->dynf_meta) { ++ int32_t offs = rxq->flow_meta_offset; ++ const uint32_t meta = ++ *RTE_MBUF_DYNFIELD(t_pkt, offs, uint32_t *); ++ ++ /* Check if title packet has valid metadata. */ ++ if (meta) { ++ assert(t_pkt->ol_flags & ++ rxq->flow_meta_mask); ++ *RTE_MBUF_DYNFIELD(elts[pos], offs, ++ uint32_t *) = meta; ++ *RTE_MBUF_DYNFIELD(elts[pos + 1], offs, ++ uint32_t *) = meta; ++ *RTE_MBUF_DYNFIELD(elts[pos + 2], offs, ++ uint32_t *) = meta; ++ *RTE_MBUF_DYNFIELD(elts[pos + 3], offs, ++ uint32_t *) = meta; ++ } ++ } + pos += MLX5_VPMD_DESCS_PER_LOOP; + /* Move to next CQE and invalidate consumed CQEs. */ + if (!(pos & 0x7) && pos < mcqe_n) { +@@ -687,28 +706,30 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n, + container_of(p3, struct mlx5_cqe, + pkt_info)->timestamp); + } +- if (rte_flow_dynf_metadata_avail()) { ++ if (!!rxq->flow_meta_mask) { + /* This code is subject for futher optimization. */ +- *RTE_FLOW_DYNF_METADATA(elts[pos]) = ++ int32_t offs = rxq->flow_meta_offset; ++ ++ *RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *) = + container_of(p0, struct mlx5_cqe, + pkt_info)->flow_table_metadata; +- *RTE_FLOW_DYNF_METADATA(elts[pos + 1]) = ++ *RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *) = + container_of(p1, struct mlx5_cqe, + pkt_info)->flow_table_metadata; +- *RTE_FLOW_DYNF_METADATA(elts[pos + 2]) = ++ *RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *) = + container_of(p2, struct mlx5_cqe, + pkt_info)->flow_table_metadata; +- *RTE_FLOW_DYNF_METADATA(elts[pos + 3]) = ++ *RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *) = + container_of(p3, struct mlx5_cqe, + pkt_info)->flow_table_metadata; +- if (*RTE_FLOW_DYNF_METADATA(elts[pos])) +- elts[pos]->ol_flags |= PKT_RX_DYNF_METADATA; +- if (*RTE_FLOW_DYNF_METADATA(elts[pos + 1])) +- elts[pos + 1]->ol_flags |= PKT_RX_DYNF_METADATA; +- if (*RTE_FLOW_DYNF_METADATA(elts[pos + 2])) +- elts[pos + 2]->ol_flags |= PKT_RX_DYNF_METADATA; +- if (*RTE_FLOW_DYNF_METADATA(elts[pos + 3])) +- elts[pos + 3]->ol_flags |= PKT_RX_DYNF_METADATA; ++ if (*RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *)) ++ elts[pos]->ol_flags |= rxq->flow_meta_mask; ++ if (*RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *)) ++ elts[pos + 1]->ol_flags |= rxq->flow_meta_mask; ++ if (*RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *)) ++ elts[pos + 2]->ol_flags |= rxq->flow_meta_mask; ++ if (*RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *)) ++ elts[pos + 3]->ol_flags |= rxq->flow_meta_mask; + } + #ifdef MLX5_PMD_SOFT_COUNTERS + /* Add up received bytes count. */ +diff --git a/dpdk/drivers/net/mlx5/mlx5_rxtx_vec_sse.h b/dpdk/drivers/net/mlx5/mlx5_rxtx_vec_sse.h +index 35b7761007..bb59163a26 100644 +--- a/dpdk/drivers/net/mlx5/mlx5_rxtx_vec_sse.h ++++ b/dpdk/drivers/net/mlx5/mlx5_rxtx_vec_sse.h +@@ -118,7 +118,6 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq, + 14, 15, 6, 7, + 10, 11, 2, 3); + #endif +- + /* + * A. load mCQEs into a 128bit register. + * B. store rearm data to mbuf. 
+@@ -191,6 +190,25 @@ rxq_cq_decompress_v(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cq, + elts[pos + 2]->hash.fdir.hi = flow_tag; + elts[pos + 3]->hash.fdir.hi = flow_tag; + } ++ if (rxq->dynf_meta) { ++ int32_t offs = rxq->flow_meta_offset; ++ const uint32_t meta = ++ *RTE_MBUF_DYNFIELD(t_pkt, offs, uint32_t *); ++ ++ /* Check if title packet has valid metadata. */ ++ if (meta) { ++ assert(t_pkt->ol_flags & ++ rxq->flow_meta_mask); ++ *RTE_MBUF_DYNFIELD(elts[pos], offs, ++ uint32_t *) = meta; ++ *RTE_MBUF_DYNFIELD(elts[pos + 1], offs, ++ uint32_t *) = meta; ++ *RTE_MBUF_DYNFIELD(elts[pos + 2], offs, ++ uint32_t *) = meta; ++ *RTE_MBUF_DYNFIELD(elts[pos + 3], offs, ++ uint32_t *) = meta; ++ } ++ } + pos += MLX5_VPMD_DESCS_PER_LOOP; + /* Move to next CQE and invalidate consumed CQEs. */ + if (!(pos & 0x7) && pos < mcqe_n) { +@@ -640,24 +658,26 @@ rxq_burst_v(struct mlx5_rxq_data *rxq, struct rte_mbuf **pkts, uint16_t pkts_n, + pkts[pos + 3]->timestamp = + rte_be_to_cpu_64(cq[pos + p3].timestamp); + } +- if (rte_flow_dynf_metadata_avail()) { ++ if (rxq->dynf_meta) { + /* This code is subject for futher optimization. */ +- *RTE_FLOW_DYNF_METADATA(pkts[pos]) = ++ int32_t offs = rxq->flow_meta_offset; ++ ++ *RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *) = + cq[pos].flow_table_metadata; +- *RTE_FLOW_DYNF_METADATA(pkts[pos + 1]) = ++ *RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *) = + cq[pos + p1].flow_table_metadata; +- *RTE_FLOW_DYNF_METADATA(pkts[pos + 2]) = ++ *RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *) = + cq[pos + p2].flow_table_metadata; +- *RTE_FLOW_DYNF_METADATA(pkts[pos + 3]) = ++ *RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *) = + cq[pos + p3].flow_table_metadata; +- if (*RTE_FLOW_DYNF_METADATA(pkts[pos])) +- pkts[pos]->ol_flags |= PKT_RX_DYNF_METADATA; +- if (*RTE_FLOW_DYNF_METADATA(pkts[pos + 1])) +- pkts[pos + 1]->ol_flags |= PKT_RX_DYNF_METADATA; +- if (*RTE_FLOW_DYNF_METADATA(pkts[pos + 2])) +- pkts[pos + 2]->ol_flags |= PKT_RX_DYNF_METADATA; +- if (*RTE_FLOW_DYNF_METADATA(pkts[pos + 3])) +- pkts[pos + 3]->ol_flags |= PKT_RX_DYNF_METADATA; ++ if (*RTE_MBUF_DYNFIELD(pkts[pos], offs, uint32_t *)) ++ pkts[pos]->ol_flags |= rxq->flow_meta_mask; ++ if (*RTE_MBUF_DYNFIELD(pkts[pos + 1], offs, uint32_t *)) ++ pkts[pos + 1]->ol_flags |= rxq->flow_meta_mask; ++ if (*RTE_MBUF_DYNFIELD(pkts[pos + 2], offs, uint32_t *)) ++ pkts[pos + 2]->ol_flags |= rxq->flow_meta_mask; ++ if (*RTE_MBUF_DYNFIELD(pkts[pos + 3], offs, uint32_t *)) ++ pkts[pos + 3]->ol_flags |= rxq->flow_meta_mask; + } + #ifdef MLX5_PMD_SOFT_COUNTERS + /* Add up received bytes count. 
*/ +diff --git a/dpdk/drivers/net/mlx5/mlx5_stats.c b/dpdk/drivers/net/mlx5/mlx5_stats.c +index 205e4fec78..636fc80c7c 100644 +--- a/dpdk/drivers/net/mlx5/mlx5_stats.c ++++ b/dpdk/drivers/net/mlx5/mlx5_stats.c +@@ -3,11 +3,13 @@ + * Copyright 2015 Mellanox Technologies, Ltd + */ + ++#include + #include + #include + #include + #include + #include ++#include + + #include + #include +@@ -136,26 +138,30 @@ static const struct mlx5_counter_ctrl mlx5_counters_init[] = { + + static const unsigned int xstats_n = RTE_DIM(mlx5_counters_init); + +-static inline void ++static inline int + mlx5_read_ib_stat(struct mlx5_priv *priv, const char *ctr_name, uint64_t *stat) + { +- FILE *file; ++ int fd; ++ + if (priv->sh) { + MKSTR(path, "%s/ports/%d/hw_counters/%s", + priv->sh->ibdev_path, + priv->ibv_port, + ctr_name); +- +- file = fopen(path, "rb"); +- if (file) { +- int n = fscanf(file, "%" SCNu64, stat); +- +- fclose(file); +- if (n == 1) +- return; ++ fd = open(path, O_RDONLY); ++ if (fd != -1) { ++ char buf[21] = {'\0'}; ++ ssize_t n = read(fd, buf, sizeof(buf)); ++ ++ close(fd); ++ if (n != -1) { ++ *stat = strtoull(buf, NULL, 10); ++ return 0; ++ } + } + } + *stat = 0; ++ return 1; + } + + /** +@@ -194,8 +200,14 @@ mlx5_read_dev_counters(struct rte_eth_dev *dev, uint64_t *stats) + } + for (i = 0; i != xstats_ctrl->mlx5_stats_n; ++i) { + if (xstats_ctrl->info[i].ib) { +- mlx5_read_ib_stat(priv, xstats_ctrl->info[i].ctr_name, +- &stats[i]); ++ ret = mlx5_read_ib_stat(priv, ++ xstats_ctrl->info[i].ctr_name, ++ &stats[i]); ++ /* return last xstats counter if fail to read. */ ++ if (ret == 0) ++ xstats_ctrl->xstats[i] = stats[i]; ++ else ++ stats[i] = xstats_ctrl->xstats[i]; + } else { + stats[i] = (uint64_t) + et_stats->data[xstats_ctrl->dev_table_idx[i]]; +@@ -301,6 +313,7 @@ mlx5_stats_init(struct rte_eth_dev *dev) + unsigned int idx = xstats_ctrl->mlx5_stats_n++; + + xstats_ctrl->info[idx] = mlx5_counters_init[i]; ++ xstats_ctrl->hw_stats[idx] = 0; + } + } + assert(xstats_ctrl->mlx5_stats_n <= MLX5_MAX_XSTATS); +@@ -311,6 +324,7 @@ mlx5_stats_init(struct rte_eth_dev *dev) + DRV_LOG(ERR, "port %u cannot read device counters: %s", + dev->data->port_id, strerror(rte_errno)); + mlx5_read_ib_stat(priv, "out_of_buffer", &stats_ctrl->imissed_base); ++ stats_ctrl->imissed = 0; + free: + rte_free(strings); + } +@@ -353,7 +367,23 @@ mlx5_xstats_get(struct rte_eth_dev *dev, struct rte_eth_xstat *stats, + return ret; + for (i = 0; i != mlx5_stats_n; ++i) { + stats[i].id = i; +- stats[i].value = (counters[i] - xstats_ctrl->base[i]); ++ if (xstats_ctrl->info[i].ib) { ++ uint64_t wrap_n; ++ uint64_t hw_stat = xstats_ctrl->hw_stats[i]; ++ ++ stats[i].value = (counters[i] - ++ xstats_ctrl->base[i]) & ++ (uint64_t)UINT32_MAX; ++ wrap_n = hw_stat >> 32; ++ if (stats[i].value < ++ (hw_stat & (uint64_t)UINT32_MAX)) ++ wrap_n++; ++ stats[i].value |= (wrap_n) << 32; ++ xstats_ctrl->hw_stats[i] = stats[i].value; ++ } else { ++ stats[i].value = ++ (counters[i] - xstats_ctrl->base[i]); ++ } + } + } + return mlx5_stats_n; +@@ -375,9 +405,12 @@ int + mlx5_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats) + { + struct mlx5_priv *priv = dev->data->dev_private; ++ struct mlx5_stats_ctrl *stats_ctrl = &priv->stats_ctrl; + struct rte_eth_stats tmp; + unsigned int i; + unsigned int idx; ++ uint64_t wrap_n; ++ int ret; + + memset(&tmp, 0, sizeof(tmp)); + /* Add software counters. 
*/ +@@ -420,8 +453,18 @@ mlx5_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats) + #endif + tmp.oerrors += txq->stats.oerrors; + } +- mlx5_read_ib_stat(priv, "out_of_buffer", &tmp.imissed); +- tmp.imissed -= priv->stats_ctrl.imissed_base; ++ ret = mlx5_read_ib_stat(priv, "out_of_buffer", &tmp.imissed); ++ if (ret == 0) { ++ tmp.imissed = (tmp.imissed - stats_ctrl->imissed_base) & ++ (uint64_t)UINT32_MAX; ++ wrap_n = stats_ctrl->imissed >> 32; ++ if (tmp.imissed < (stats_ctrl->imissed & (uint64_t)UINT32_MAX)) ++ wrap_n++; ++ tmp.imissed |= (wrap_n) << 32; ++ stats_ctrl->imissed = tmp.imissed; ++ } else { ++ tmp.imissed = stats_ctrl->imissed; ++ } + #ifndef MLX5_PMD_SOFT_COUNTERS + /* FIXME: retrieve and add hardware counters. */ + #endif +@@ -458,6 +501,7 @@ mlx5_stats_reset(struct rte_eth_dev *dev) + sizeof(struct mlx5_txq_stats)); + } + mlx5_read_ib_stat(priv, "out_of_buffer", &stats_ctrl->imissed_base); ++ stats_ctrl->imissed = 0; + #ifndef MLX5_PMD_SOFT_COUNTERS + /* FIXME: reset hardware counters. */ + #endif +@@ -500,8 +544,10 @@ mlx5_xstats_reset(struct rte_eth_dev *dev) + dev->data->port_id, strerror(rte_errno)); + return ret; + } +- for (i = 0; i != n; ++i) ++ for (i = 0; i != n; ++i) { + xstats_ctrl->base[i] = counters[i]; ++ xstats_ctrl->hw_stats[i] = 0; ++ } + + return 0; + } +diff --git a/dpdk/drivers/net/mlx5/mlx5_trigger.c b/dpdk/drivers/net/mlx5/mlx5_trigger.c +index cafab25c67..6fc4190f4e 100644 +--- a/dpdk/drivers/net/mlx5/mlx5_trigger.c ++++ b/dpdk/drivers/net/mlx5/mlx5_trigger.c +@@ -106,9 +106,12 @@ mlx5_rxq_start(struct rte_eth_dev *dev) + unsigned int i; + int ret = 0; + enum mlx5_rxq_obj_type obj_type = MLX5_RXQ_OBJ_TYPE_IBV; ++ struct mlx5_rxq_data *rxq = NULL; + + for (i = 0; i < priv->rxqs_n; ++i) { +- if ((*priv->rxqs)[i]->lro) { ++ rxq = (*priv->rxqs)[i]; ++ ++ if (rxq && rxq->lro) { + obj_type = MLX5_RXQ_OBJ_TYPE_DEVX_RQ; + break; + } +@@ -269,11 +272,13 @@ mlx5_dev_start(struct rte_eth_dev *dev) + int ret; + + DRV_LOG(DEBUG, "port %u starting device", dev->data->port_id); +- ret = mlx5_dev_configure_rss_reta(dev); +- if (ret) { +- DRV_LOG(ERR, "port %u reta config failed: %s", +- dev->data->port_id, strerror(rte_errno)); +- return -rte_errno; ++ if (dev->data->nb_rx_queues > 0) { ++ ret = mlx5_dev_configure_rss_reta(dev); ++ if (ret) { ++ DRV_LOG(ERR, "port %u reta config failed: %s", ++ dev->data->port_id, strerror(rte_errno)); ++ return -rte_errno; ++ } + } + ret = mlx5_txq_start(dev); + if (ret) { +@@ -309,6 +314,8 @@ mlx5_dev_start(struct rte_eth_dev *dev) + dev->data->port_id); + goto error; + } ++ /* Set a mask and offset of dynamic metadata flows into Rx queues*/ ++ mlx5_flow_rxq_dynf_metadata_set(dev); + ret = mlx5_flow_start(dev, &priv->flows); + if (ret) { + DRV_LOG(DEBUG, "port %u failed to set flows", +@@ -420,9 +427,14 @@ mlx5_traffic_enable(struct rte_eth_dev *dev) + } + mlx5_txq_release(dev, i); + } +- if (priv->config.dv_esw_en && !priv->config.vf) +- if (!mlx5_flow_create_esw_table_zero_flow(dev)) +- goto error; ++ if (priv->config.dv_esw_en && !priv->config.vf) { ++ if (mlx5_flow_create_esw_table_zero_flow(dev)) ++ priv->fdb_def_rule = 1; ++ else ++ DRV_LOG(INFO, "port %u FDB default rule cannot be" ++ " configured - only Eswitch group 0 flows are" ++ " supported.", dev->data->port_id); ++ } + if (priv->isolated) + return 0; + if (dev->data->promiscuous) { +diff --git a/dpdk/drivers/net/mlx5/mlx5_txq.c b/dpdk/drivers/net/mlx5/mlx5_txq.c +index bac4f71c24..c7751e83c0 100644 +--- a/dpdk/drivers/net/mlx5/mlx5_txq.c ++++ 
b/dpdk/drivers/net/mlx5/mlx5_txq.c +@@ -62,7 +62,7 @@ txq_alloc_elts(struct mlx5_txq_ctrl *txq_ctrl) + * @param txq_ctrl + * Pointer to TX queue structure. + */ +-static void ++void + txq_free_elts(struct mlx5_txq_ctrl *txq_ctrl) + { + const uint16_t elts_n = 1 << txq_ctrl->txq.elts_n; +@@ -272,7 +272,6 @@ mlx5_tx_hairpin_queue_setup(struct rte_eth_dev *dev, uint16_t idx, + DRV_LOG(DEBUG, "port %u adding Tx queue %u to list", + dev->data->port_id, idx); + (*priv->txqs)[idx] = &txq_ctrl->txq; +- txq_ctrl->type = MLX5_TXQ_TYPE_HAIRPIN; + return 0; + } + +@@ -296,9 +295,9 @@ mlx5_tx_queue_release(void *dpdk_txq) + priv = txq_ctrl->priv; + for (i = 0; (i != priv->txqs_n); ++i) + if ((*priv->txqs)[i] == txq) { +- mlx5_txq_release(ETH_DEV(priv), i); + DRV_LOG(DEBUG, "port %u removing Tx queue %u from list", + PORT_ID(priv), txq->idx); ++ mlx5_txq_release(ETH_DEV(priv), i); + break; + } + } +@@ -315,7 +314,7 @@ static void + txq_uar_ncattr_init(struct mlx5_txq_ctrl *txq_ctrl, size_t page_size) + { + struct mlx5_priv *priv = txq_ctrl->priv; +- unsigned int cmd; ++ off_t cmd; + + txq_ctrl->txq.db_heu = priv->config.dbnc == MLX5_TXDB_HEURISTIC; + txq_ctrl->txq.db_nc = 0; +@@ -492,6 +491,7 @@ mlx5_txq_obj_hairpin_new(struct rte_eth_dev *dev, uint16_t idx) + struct mlx5_devx_create_sq_attr attr = { 0 }; + struct mlx5_txq_obj *tmpl = NULL; + int ret = 0; ++ uint32_t max_wq_data; + + assert(txq_data); + assert(!txq_ctrl->obj); +@@ -508,11 +508,15 @@ mlx5_txq_obj_hairpin_new(struct rte_eth_dev *dev, uint16_t idx) + tmpl->txq_ctrl = txq_ctrl; + attr.hairpin = 1; + attr.tis_lst_sz = 1; +- /* Workaround for hairpin startup */ +- attr.wq_attr.log_hairpin_num_packets = log2above(32); +- /* Workaround for packets larger than 1KB */ ++ max_wq_data = priv->config.hca_attr.log_max_hairpin_wq_data_sz; ++ /* Jumbo frames > 9KB should be supported, and more packets. */ + attr.wq_attr.log_hairpin_data_sz = +- priv->config.hca_attr.log_max_hairpin_wq_data_sz; ++ (max_wq_data < MLX5_HAIRPIN_JUMBO_LOG_SIZE) ? ++ max_wq_data : MLX5_HAIRPIN_JUMBO_LOG_SIZE; ++ /* Set the packets number to the maximum value for performance. */ ++ attr.wq_attr.log_hairpin_num_packets = ++ attr.wq_attr.log_hairpin_data_sz - ++ MLX5_HAIRPIN_QUEUE_STRIDE; + attr.tis_num = priv->sh->tis->id; + tmpl->sq = mlx5_devx_cmd_create_sq(priv->sh->ctx, &attr); + if (!tmpl->sq) { +@@ -718,13 +722,22 @@ mlx5_txq_obj_new(struct rte_eth_dev *dev, uint16_t idx, + txq_data->cq_db = cq_info.dbrec; + txq_data->cqes = (volatile struct mlx5_cqe *)cq_info.buf; + txq_data->cq_ci = 0; +-#ifndef NDEBUG + txq_data->cq_pi = 0; +-#endif + txq_data->wqe_ci = 0; + txq_data->wqe_pi = 0; + txq_data->wqe_comp = 0; + txq_data->wqe_thres = txq_data->wqe_s / MLX5_TX_COMP_THRESH_INLINE_DIV; ++ txq_data->fcqs = rte_calloc_socket(__func__, ++ txq_data->cqe_s, ++ sizeof(*txq_data->fcqs), ++ RTE_CACHE_LINE_SIZE, ++ txq_ctrl->socket); ++ if (!txq_data->fcqs) { ++ DRV_LOG(ERR, "port %u Tx queue %u cannot allocate memory (FCQ)", ++ dev->data->port_id, idx); ++ rte_errno = ENOMEM; ++ goto error; ++ } + #ifdef HAVE_IBV_FLOW_DV_SUPPORT + /* + * If using DevX need to query and store TIS transport domain value. 
+@@ -773,6 +786,8 @@ mlx5_txq_obj_new(struct rte_eth_dev *dev, uint16_t idx, + claim_zero(mlx5_glue->destroy_cq(tmpl.cq)); + if (tmpl.qp) + claim_zero(mlx5_glue->destroy_qp(tmpl.qp)); ++ if (txq_data && txq_data->fcqs) ++ rte_free(txq_data->fcqs); + if (txq_obj) + rte_free(txq_obj); + priv->verbs_alloc_ctx.type = MLX5_VERBS_ALLOC_TYPE_NONE; +@@ -827,6 +842,8 @@ mlx5_txq_obj_release(struct mlx5_txq_obj *txq_obj) + } else { + claim_zero(mlx5_glue->destroy_qp(txq_obj->qp)); + claim_zero(mlx5_glue->destroy_cq(txq_obj->cq)); ++ if (txq_obj->txq_ctrl->txq.fcqs) ++ rte_free(txq_obj->txq_ctrl->txq.fcqs); + } + LIST_REMOVE(txq_obj, next); + rte_free(txq_obj); +@@ -964,7 +981,7 @@ txq_set_params(struct mlx5_txq_ctrl *txq_ctrl) + * If there is requested minimal amount of data to inline + * we MUST enable inlining. This is a case for ConnectX-4 + * which usually requires L2 inlined for correct operating +- * and ConnectX-4LX which requires L2-L4 inlined to ++ * and ConnectX-4 Lx which requires L2-L4 inlined to + * support E-Switch Flows. + */ + if (inlen_mode) { +diff --git a/dpdk/drivers/net/mlx5/mlx5_utils.h b/dpdk/drivers/net/mlx5/mlx5_utils.h +index b4ed8c6dad..fdf1379866 100644 +--- a/dpdk/drivers/net/mlx5/mlx5_utils.h ++++ b/dpdk/drivers/net/mlx5/mlx5_utils.h +@@ -15,16 +15,6 @@ + + #include "mlx5_defs.h" + +-/* +- * Compilation workaround for PPC64 when AltiVec is fully enabled, e.g. std=c11. +- * Otherwise there would be a type conflict between stdbool and altivec. +- */ +-#if defined(__PPC64__) && !defined(__APPLE_ALTIVEC__) +-#undef bool +-/* redefine as in stdbool.h */ +-#define bool _Bool +-#endif +- + /* Bit-field manipulation. */ + #define BITFIELD_DECLARE(bf, type, size) \ + type bf[(((size_t)(size) / (sizeof(type) * CHAR_BIT)) + \ +@@ -146,9 +136,10 @@ extern int mlx5_logtype; + + /* Allocate a buffer on the stack and fill it with a printf format string. */ + #define MKSTR(name, ...) \ +- char name[snprintf(NULL, 0, __VA_ARGS__) + 1]; \ ++ int mkstr_size_##name = snprintf(NULL, 0, "" __VA_ARGS__); \ ++ char name[mkstr_size_##name + 1]; \ + \ +- snprintf(name, sizeof(name), __VA_ARGS__) ++ snprintf(name, sizeof(name), "" __VA_ARGS__) + + /** + * Return logarithm of the nearest power of two above input value. 
+diff --git a/dpdk/drivers/net/mvneta/mvneta_ethdev.c b/dpdk/drivers/net/mvneta/mvneta_ethdev.c +index 865ad61aed..4aea876488 100644 +--- a/dpdk/drivers/net/mvneta/mvneta_ethdev.c ++++ b/dpdk/drivers/net/mvneta/mvneta_ethdev.c +@@ -751,7 +751,7 @@ mvneta_stats_reset(struct rte_eth_dev *dev) + + ret = mvneta_stats_get(dev, &priv->prev_stats); + if (unlikely(ret)) +- RTE_LOG(ERR, PMD, "Failed to reset port statistics"); ++ MVNETA_LOG(ERR, "Failed to reset port statistics"); + + return ret; + } +diff --git a/dpdk/drivers/net/mvpp2/mrvl_flow.c b/dpdk/drivers/net/mvpp2/mrvl_flow.c +index 381b54e291..ea43255284 100644 +--- a/dpdk/drivers/net/mvpp2/mrvl_flow.c ++++ b/dpdk/drivers/net/mvpp2/mrvl_flow.c +@@ -2511,14 +2511,14 @@ mrvl_create_cls_table(struct rte_eth_dev *dev, struct rte_flow *first_flow) + + if (first_flow->pattern & F_UDP_SPORT) { + key->proto_field[key->num_fields].proto = MV_NET_PROTO_UDP; +- key->proto_field[key->num_fields].field.tcp = MV_NET_TCP_F_SP; ++ key->proto_field[key->num_fields].field.udp = MV_NET_UDP_F_SP; + key->key_size += 2; + key->num_fields += 1; + } + + if (first_flow->pattern & F_UDP_DPORT) { + key->proto_field[key->num_fields].proto = MV_NET_PROTO_UDP; +- key->proto_field[key->num_fields].field.udp = MV_NET_TCP_F_DP; ++ key->proto_field[key->num_fields].field.udp = MV_NET_UDP_F_DP; + key->key_size += 2; + key->num_fields += 1; + } +diff --git a/dpdk/drivers/net/netvsc/hn_ethdev.c b/dpdk/drivers/net/netvsc/hn_ethdev.c +index 164e9ad174..6950682a94 100644 +--- a/dpdk/drivers/net/netvsc/hn_ethdev.c ++++ b/dpdk/drivers/net/netvsc/hn_ethdev.c +@@ -42,7 +42,8 @@ + DEV_TX_OFFLOAD_VLAN_INSERT) + + #define HN_RX_OFFLOAD_CAPS (DEV_RX_OFFLOAD_CHECKSUM | \ +- DEV_RX_OFFLOAD_VLAN_STRIP) ++ DEV_RX_OFFLOAD_VLAN_STRIP | \ ++ DEV_RX_OFFLOAD_RSS_HASH) + + int hn_logtype_init; + int hn_logtype_driver; +@@ -71,7 +72,7 @@ static const struct hn_xstats_name_off hn_stat_strings[] = { + + /* The default RSS key. + * This value is the same as MLX5 so that flows will be +- * received on same path for both VF ans synthetic NIC. ++ * received on same path for both VF and synthetic NIC. 
+ */ + static const uint8_t rss_default_key[NDIS_HASH_KEYSIZE_TOEPLITZ] = { + 0x2c, 0xc6, 0x81, 0xd1, 0x5b, 0xdb, 0xf4, 0xf7, +@@ -133,8 +134,6 @@ eth_dev_vmbus_allocate(struct rte_vmbus_device *dev, size_t private_data_size) + static void + eth_dev_vmbus_release(struct rte_eth_dev *eth_dev) + { +- /* mac_addrs must not be freed alone because part of dev_private */ +- eth_dev->data->mac_addrs = NULL; + /* free ether device */ + rte_eth_dev_release_port(eth_dev); + +@@ -256,15 +255,19 @@ static int hn_dev_info_get(struct rte_eth_dev *dev, + dev_info->max_rx_queues = hv->max_queues; + dev_info->max_tx_queues = hv->max_queues; + +- rc = hn_rndis_get_offload(hv, dev_info); +- if (rc != 0) +- return rc; ++ dev_info->tx_desc_lim.nb_min = 1; ++ dev_info->tx_desc_lim.nb_max = 4096; ++ ++ if (rte_eal_process_type() != RTE_PROC_PRIMARY) ++ return 0; + +- rc = hn_vf_info_get(hv, dev_info); ++ /* fills in rx and tx offload capability */ ++ rc = hn_rndis_get_offload(hv, dev_info); + if (rc != 0) + return rc; + +- return 0; ++ /* merges the offload and queues of vf */ ++ return hn_vf_info_get(hv, dev_info); + } + + static int hn_rss_reta_update(struct rte_eth_dev *dev, +@@ -291,6 +294,13 @@ static int hn_rss_reta_update(struct rte_eth_dev *dev, + hv->rss_ind[i] = reta_conf[idx].reta[shift]; + } + ++ err = hn_rndis_conf_rss(hv, NDIS_RSS_FLAG_DISABLE); ++ if (err) { ++ PMD_DRV_LOG(NOTICE, ++ "rss disable failed"); ++ return err; ++ } ++ + err = hn_rndis_conf_rss(hv, 0); + if (err) { + PMD_DRV_LOG(NOTICE, +@@ -366,14 +376,15 @@ static int hn_rss_hash_update(struct rte_eth_dev *dev, + + hn_rss_hash_init(hv, rss_conf); + +- err = hn_rndis_conf_rss(hv, 0); +- if (err) { +- PMD_DRV_LOG(NOTICE, +- "rss reconfig failed (RSS disabled)"); +- return err; ++ if (rss_conf->rss_hf != 0) { ++ err = hn_rndis_conf_rss(hv, 0); ++ if (err) { ++ PMD_DRV_LOG(NOTICE, ++ "rss reconfig failed (RSS disabled)"); ++ return err; ++ } + } + +- + return hn_vf_rss_hash_update(dev, rss_conf); + } + +@@ -565,7 +576,7 @@ static int hn_dev_configure(struct rte_eth_dev *dev) + dev->data->nb_tx_queues); + + for (i = 0; i < NDIS_HASH_INDCNT; i++) +- hv->rss_ind[i] = i % hv->num_queues; ++ hv->rss_ind[i] = i % dev->data->nb_rx_queues; + + hn_rss_hash_init(hv, rss_conf); + +@@ -578,12 +589,21 @@ static int hn_dev_configure(struct rte_eth_dev *dev) + return err; + } + +- err = hn_rndis_conf_rss(hv, 0); ++ err = hn_rndis_conf_rss(hv, NDIS_RSS_FLAG_DISABLE); + if (err) { + PMD_DRV_LOG(NOTICE, +- "initial RSS config failed"); ++ "rss disable failed"); + return err; + } ++ ++ if (rss_conf->rss_hf != 0) { ++ err = hn_rndis_conf_rss(hv, 0); ++ if (err) { ++ PMD_DRV_LOG(NOTICE, ++ "initial RSS config failed"); ++ return err; ++ } ++ } + } + + return hn_vf_configure(dev, dev_conf); +@@ -807,6 +827,10 @@ hn_dev_start(struct rte_eth_dev *dev) + if (error) + hn_rndis_set_rxfilter(hv, 0); + ++ /* Initialize Link state */ ++ if (error == 0) ++ hn_dev_link_update(dev, 0); ++ + return error; + } + +@@ -921,8 +945,14 @@ eth_hn_dev_init(struct rte_eth_dev *eth_dev) + if (rte_eal_process_type() != RTE_PROC_PRIMARY) + return 0; + +- /* Since Hyper-V only supports one MAC address, just use local data */ +- eth_dev->data->mac_addrs = &hv->mac_addr; ++ /* Since Hyper-V only supports one MAC address */ ++ eth_dev->data->mac_addrs = rte_calloc("hv_mac", HN_MAX_MAC_ADDRS, ++ sizeof(struct rte_ether_addr), 0); ++ if (eth_dev->data->mac_addrs == NULL) { ++ PMD_INIT_LOG(ERR, ++ "Failed to allocate memory store MAC addresses"); ++ return -ENOMEM; ++ } + + hv->vmbus = 
vmbus; + hv->rxbuf_res = &vmbus->resource[HV_RECV_BUF_MAP]; +@@ -962,11 +992,11 @@ eth_hn_dev_init(struct rte_eth_dev *eth_dev) + if (err) + goto failed; + +- err = hn_tx_pool_init(eth_dev); ++ err = hn_chim_init(eth_dev); + if (err) + goto failed; + +- err = hn_rndis_get_eaddr(hv, hv->mac_addr.addr_bytes); ++ err = hn_rndis_get_eaddr(hv, eth_dev->data->mac_addrs->addr_bytes); + if (err) + goto failed; + +@@ -998,7 +1028,7 @@ eth_hn_dev_init(struct rte_eth_dev *eth_dev) + failed: + PMD_INIT_LOG(NOTICE, "device init failed"); + +- hn_tx_pool_uninit(eth_dev); ++ hn_chim_uninit(eth_dev); + hn_detach(hv); + return err; + } +@@ -1022,7 +1052,7 @@ eth_hn_dev_uninit(struct rte_eth_dev *eth_dev) + eth_dev->rx_pkt_burst = NULL; + + hn_detach(hv); +- hn_tx_pool_uninit(eth_dev); ++ hn_chim_uninit(eth_dev); + rte_vmbus_chan_close(hv->primary->chan); + rte_free(hv->primary); + ret = rte_eth_dev_owner_delete(hv->owner.id); +diff --git a/dpdk/drivers/net/netvsc/hn_nvs.c b/dpdk/drivers/net/netvsc/hn_nvs.c +index 6b518685ab..477202b2a0 100644 +--- a/dpdk/drivers/net/netvsc/hn_nvs.c ++++ b/dpdk/drivers/net/netvsc/hn_nvs.c +@@ -54,7 +54,7 @@ static int hn_nvs_req_send(struct hn_data *hv, + } + + static int +-hn_nvs_execute(struct hn_data *hv, ++__hn_nvs_execute(struct hn_data *hv, + void *req, uint32_t reqlen, + void *resp, uint32_t resplen, + uint32_t type) +@@ -62,6 +62,7 @@ hn_nvs_execute(struct hn_data *hv, + struct vmbus_channel *chan = hn_primary_chan(hv); + char buffer[NVS_RESPSIZE_MAX]; + const struct hn_nvs_hdr *hdr; ++ uint64_t xactid; + uint32_t len; + int ret; + +@@ -77,7 +78,7 @@ hn_nvs_execute(struct hn_data *hv, + + retry: + len = sizeof(buffer); +- ret = rte_vmbus_chan_recv(chan, buffer, &len, NULL); ++ ret = rte_vmbus_chan_recv(chan, buffer, &len, &xactid); + if (ret == -EAGAIN) { + rte_delay_us(HN_CHAN_INTERVAL_US); + goto retry; +@@ -88,7 +89,20 @@ hn_nvs_execute(struct hn_data *hv, + return ret; + } + ++ if (len < sizeof(*hdr)) { ++ PMD_DRV_LOG(ERR, "response missing NVS header"); ++ return -EINVAL; ++ } ++ + hdr = (struct hn_nvs_hdr *)buffer; ++ ++ /* Silently drop received packets while waiting for response */ ++ if (hdr->type == NVS_TYPE_RNDIS) { ++ hn_nvs_ack_rxbuf(chan, xactid); ++ --hv->rxbuf_outstanding; ++ goto retry; ++ } ++ + if (hdr->type != type) { + PMD_DRV_LOG(ERR, "unexpected NVS resp %#x, expect %#x", + hdr->type, type); +@@ -108,6 +122,29 @@ hn_nvs_execute(struct hn_data *hv, + return 0; + } + ++ ++/* ++ * Execute one control command and get the response. ++ * Only one command can be active on a channel at once ++ * Unlike BSD, DPDK does not have an interrupt context ++ * so the polling is required to wait for response. ++ */ ++static int ++hn_nvs_execute(struct hn_data *hv, ++ void *req, uint32_t reqlen, ++ void *resp, uint32_t resplen, ++ uint32_t type) ++{ ++ struct hn_rx_queue *rxq = hv->primary; ++ int ret; ++ ++ rte_spinlock_lock(&rxq->ring_lock); ++ ret = __hn_nvs_execute(hv, req, reqlen, resp, resplen, type); ++ rte_spinlock_unlock(&rxq->ring_lock); ++ ++ return ret; ++} ++ + static int + hn_nvs_doinit(struct hn_data *hv, uint32_t nvs_ver) + { +diff --git a/dpdk/drivers/net/netvsc/hn_nvs.h b/dpdk/drivers/net/netvsc/hn_nvs.h +index 2563fd8d86..015839e364 100644 +--- a/dpdk/drivers/net/netvsc/hn_nvs.h ++++ b/dpdk/drivers/net/netvsc/hn_nvs.h +@@ -37,7 +37,7 @@ + #define NVS_RNDIS_MTYPE_CTRL 1 + + /* +- * NVS message transacion status codes. ++ * NVS message transaction status codes. 
+ */ + #define NVS_STATUS_OK 1 + #define NVS_STATUS_FAILED 2 +diff --git a/dpdk/drivers/net/netvsc/hn_rxtx.c b/dpdk/drivers/net/netvsc/hn_rxtx.c +index 7212780c15..19f00a0528 100644 +--- a/dpdk/drivers/net/netvsc/hn_rxtx.c ++++ b/dpdk/drivers/net/netvsc/hn_rxtx.c +@@ -18,6 +18,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -83,7 +84,7 @@ struct hn_txdesc { + struct rte_mbuf *m; + + uint16_t queue_id; +- uint16_t chim_index; ++ uint32_t chim_index; + uint32_t chim_size; + uint32_t data_size; + uint32_t packets; +@@ -98,11 +99,13 @@ struct hn_txdesc { + RNDIS_PKTINFO_SIZE(NDIS_LSO2_INFO_SIZE) + \ + RNDIS_PKTINFO_SIZE(NDIS_TXCSUM_INFO_SIZE)) + ++#define HN_RNDIS_PKT_ALIGNED RTE_ALIGN(HN_RNDIS_PKT_LEN, RTE_CACHE_LINE_SIZE) ++ + /* Minimum space required for a packet */ + #define HN_PKTSIZE_MIN(align) \ + RTE_ALIGN(RTE_ETHER_MIN_LEN + HN_RNDIS_PKT_LEN, align) + +-#define DEFAULT_TX_FREE_THRESH 32U ++#define DEFAULT_TX_FREE_THRESH 32 + + static void + hn_update_packet_stats(struct hn_stats *stats, const struct rte_mbuf *m) +@@ -150,63 +153,77 @@ hn_rndis_pktmsg_offset(uint32_t ofs) + static void hn_txd_init(struct rte_mempool *mp __rte_unused, + void *opaque, void *obj, unsigned int idx) + { ++ struct hn_tx_queue *txq = opaque; + struct hn_txdesc *txd = obj; +- struct rte_eth_dev *dev = opaque; +- struct rndis_packet_msg *pkt; + + memset(txd, 0, sizeof(*txd)); +- txd->chim_index = idx; + +- pkt = rte_malloc_socket("RNDIS_TX", HN_RNDIS_PKT_LEN, +- rte_align32pow2(HN_RNDIS_PKT_LEN), +- dev->device->numa_node); +- if (!pkt) +- rte_exit(EXIT_FAILURE, "can not allocate RNDIS header"); +- +- txd->rndis_pkt = pkt; ++ txd->queue_id = txq->queue_id; ++ txd->chim_index = NVS_CHIM_IDX_INVALID; ++ txd->rndis_pkt = (struct rndis_packet_msg *)(char *)txq->tx_rndis ++ + idx * HN_RNDIS_PKT_ALIGNED; + } + +-/* +- * Unlike Linux and FreeBSD, this driver uses a mempool +- * to limit outstanding transmits and reserve buffers +- */ + int +-hn_tx_pool_init(struct rte_eth_dev *dev) ++hn_chim_init(struct rte_eth_dev *dev) + { + struct hn_data *hv = dev->data->dev_private; +- char name[RTE_MEMPOOL_NAMESIZE]; +- struct rte_mempool *mp; ++ uint32_t i, chim_bmp_size; ++ ++ rte_spinlock_init(&hv->chim_lock); ++ chim_bmp_size = rte_bitmap_get_memory_footprint(hv->chim_cnt); ++ hv->chim_bmem = rte_zmalloc("hn_chim_bitmap", chim_bmp_size, ++ RTE_CACHE_LINE_SIZE); ++ if (hv->chim_bmem == NULL) { ++ PMD_INIT_LOG(ERR, "failed to allocate bitmap size %u", ++ chim_bmp_size); ++ return -1; ++ } + +- snprintf(name, sizeof(name), +- "hn_txd_%u", dev->data->port_id); +- +- PMD_INIT_LOG(DEBUG, "create a TX send pool %s n=%u size=%zu socket=%d", +- name, hv->chim_cnt, sizeof(struct hn_txdesc), +- dev->device->numa_node); +- +- mp = rte_mempool_create(name, hv->chim_cnt, sizeof(struct hn_txdesc), +- HN_TXD_CACHE_SIZE, 0, +- NULL, NULL, +- hn_txd_init, dev, +- dev->device->numa_node, 0); +- if (!mp) { +- PMD_DRV_LOG(ERR, +- "mempool %s create failed: %d", name, rte_errno); +- return -rte_errno; ++ hv->chim_bmap = rte_bitmap_init(hv->chim_cnt, ++ hv->chim_bmem, chim_bmp_size); ++ if (hv->chim_bmap == NULL) { ++ PMD_INIT_LOG(ERR, "failed to init chim bitmap"); ++ return -1; + } + +- hv->tx_pool = mp; ++ for (i = 0; i < hv->chim_cnt; i++) ++ rte_bitmap_set(hv->chim_bmap, i); ++ + return 0; + } + + void +-hn_tx_pool_uninit(struct rte_eth_dev *dev) ++hn_chim_uninit(struct rte_eth_dev *dev) + { + struct hn_data *hv = dev->data->dev_private; + +- if (hv->tx_pool) { +- rte_mempool_free(hv->tx_pool); +- 
hv->tx_pool = NULL; ++ rte_bitmap_free(hv->chim_bmap); ++ rte_free(hv->chim_bmem); ++ hv->chim_bmem = NULL; ++} ++ ++static uint32_t hn_chim_alloc(struct hn_data *hv) ++{ ++ uint32_t index = NVS_CHIM_IDX_INVALID; ++ uint64_t slab; ++ ++ rte_spinlock_lock(&hv->chim_lock); ++ if (rte_bitmap_scan(hv->chim_bmap, &index, &slab)) ++ rte_bitmap_clear(hv->chim_bmap, index); ++ rte_spinlock_unlock(&hv->chim_lock); ++ ++ return index; ++} ++ ++static void hn_chim_free(struct hn_data *hv, uint32_t chim_idx) ++{ ++ if (chim_idx >= hv->chim_cnt) { ++ PMD_DRV_LOG(ERR, "Invalid chimney index %u", chim_idx); ++ } else { ++ rte_spinlock_lock(&hv->chim_lock); ++ rte_bitmap_set(hv->chim_bmap, chim_idx); ++ rte_spinlock_unlock(&hv->chim_lock); + } + } + +@@ -220,15 +237,16 @@ static void hn_reset_txagg(struct hn_tx_queue *txq) + + int + hn_dev_tx_queue_setup(struct rte_eth_dev *dev, +- uint16_t queue_idx, uint16_t nb_desc __rte_unused, ++ uint16_t queue_idx, uint16_t nb_desc, + unsigned int socket_id, + const struct rte_eth_txconf *tx_conf) + + { + struct hn_data *hv = dev->data->dev_private; + struct hn_tx_queue *txq; ++ char name[RTE_MEMPOOL_NAMESIZE]; + uint32_t tx_free_thresh; +- int err; ++ int err = -ENOMEM; + + PMD_INIT_FUNC_TRACE(); + +@@ -244,14 +262,42 @@ hn_dev_tx_queue_setup(struct rte_eth_dev *dev, + + tx_free_thresh = tx_conf->tx_free_thresh; + if (tx_free_thresh == 0) +- tx_free_thresh = RTE_MIN(hv->chim_cnt / 4, ++ tx_free_thresh = RTE_MIN(nb_desc / 4, + DEFAULT_TX_FREE_THRESH); + +- if (tx_free_thresh >= hv->chim_cnt - 3) +- tx_free_thresh = hv->chim_cnt - 3; ++ if (tx_free_thresh + 3 >= nb_desc) { ++ PMD_INIT_LOG(ERR, ++ "tx_free_thresh must be less than the number of TX entries minus 3(%u)." ++ " (tx_free_thresh=%u port=%u queue=%u)\n", ++ nb_desc - 3, ++ tx_free_thresh, dev->data->port_id, queue_idx); ++ return -EINVAL; ++ } + + txq->free_thresh = tx_free_thresh; + ++ snprintf(name, sizeof(name), ++ "hn_txd_%u_%u", dev->data->port_id, queue_idx); ++ ++ PMD_INIT_LOG(DEBUG, "TX descriptor pool %s n=%u size=%zu", ++ name, nb_desc, sizeof(struct hn_txdesc)); ++ ++ txq->tx_rndis = rte_calloc("hn_txq_rndis", nb_desc, ++ HN_RNDIS_PKT_ALIGNED, RTE_CACHE_LINE_SIZE); ++ if (txq->tx_rndis == NULL) ++ goto error; ++ ++ txq->txdesc_pool = rte_mempool_create(name, nb_desc, ++ sizeof(struct hn_txdesc), ++ 0, 0, NULL, NULL, ++ hn_txd_init, txq, ++ dev->device->numa_node, 0); ++ if (txq->txdesc_pool == NULL) { ++ PMD_DRV_LOG(ERR, ++ "mempool %s create failed: %d", name, rte_errno); ++ goto error; ++ } ++ + txq->agg_szmax = RTE_MIN(hv->chim_szmax, hv->rndis_agg_size); + txq->agg_pktmax = hv->rndis_agg_pkts; + txq->agg_align = hv->rndis_agg_align; +@@ -260,31 +306,57 @@ hn_dev_tx_queue_setup(struct rte_eth_dev *dev, + + err = hn_vf_tx_queue_setup(dev, queue_idx, nb_desc, + socket_id, tx_conf); +- if (err) { +- rte_free(txq); +- return err; ++ if (err == 0) { ++ dev->data->tx_queues[queue_idx] = txq; ++ return 0; + } + +- dev->data->tx_queues[queue_idx] = txq; +- return 0; ++error: ++ if (txq->txdesc_pool) ++ rte_mempool_free(txq->txdesc_pool); ++ rte_free(txq->tx_rndis); ++ rte_free(txq); ++ return err; ++} ++ ++ ++static struct hn_txdesc *hn_txd_get(struct hn_tx_queue *txq) ++{ ++ struct hn_txdesc *txd; ++ ++ if (rte_mempool_get(txq->txdesc_pool, (void **)&txd)) { ++ ++txq->stats.ring_full; ++ PMD_TX_LOG(DEBUG, "tx pool exhausted!"); ++ return NULL; ++ } ++ ++ txd->m = NULL; ++ txd->packets = 0; ++ txd->data_size = 0; ++ txd->chim_size = 0; ++ ++ return txd; ++} ++ ++static void hn_txd_put(struct 
hn_tx_queue *txq, struct hn_txdesc *txd) ++{ ++ rte_mempool_put(txq->txdesc_pool, txd); + } + + void + hn_dev_tx_queue_release(void *arg) + { + struct hn_tx_queue *txq = arg; +- struct hn_txdesc *txd; + + PMD_INIT_FUNC_TRACE(); + + if (!txq) + return; + +- /* If any pending data is still present just drop it */ +- txd = txq->agg_txd; +- if (txd) +- rte_mempool_put(txq->hv->tx_pool, txd); ++ if (txq->txdesc_pool) ++ rte_mempool_free(txq->txdesc_pool); + ++ rte_free(txq->tx_rndis); + rte_free(txq); + } + +@@ -292,6 +364,7 @@ static void + hn_nvs_send_completed(struct rte_eth_dev *dev, uint16_t queue_id, + unsigned long xactid, const struct hn_nvs_rndis_ack *ack) + { ++ struct hn_data *hv = dev->data->dev_private; + struct hn_txdesc *txd = (struct hn_txdesc *)xactid; + struct hn_tx_queue *txq; + +@@ -312,9 +385,11 @@ hn_nvs_send_completed(struct rte_eth_dev *dev, uint16_t queue_id, + ++txq->stats.errors; + } + +- rte_pktmbuf_free(txd->m); ++ if (txd->chim_index != NVS_CHIM_IDX_INVALID) ++ hn_chim_free(hv, txd->chim_index); + +- rte_mempool_put(txq->hv->tx_pool, txd); ++ rte_pktmbuf_free(txd->m); ++ hn_txd_put(txq, txd); + } + + /* Handle transmit completion events */ +@@ -894,10 +969,6 @@ uint32_t hn_process_events(struct hn_data *hv, uint16_t queue_id, + + rxq = queue_id == 0 ? hv->primary : dev->data->rx_queues[queue_id]; + +- /* If no pending data then nothing to do */ +- if (rte_vmbus_chan_rx_empty(rxq->chan)) +- return 0; +- + /* + * Since channel is shared between Rx and TX queue need to have a lock + * since DPDK does not force same CPU to be used for Rx/Tx. +@@ -961,9 +1032,6 @@ uint32_t hn_process_events(struct hn_data *hv, uint16_t queue_id, + + if (tx_limit && tx_done >= tx_limit) + break; +- +- if (rxq->rx_ring && rte_ring_full(rxq->rx_ring)) +- break; + } + + if (bytes_read > 0) +@@ -1036,28 +1104,15 @@ static int hn_flush_txagg(struct hn_tx_queue *txq, bool *need_sig) + return ret; + } + +-static struct hn_txdesc *hn_new_txd(struct hn_data *hv, +- struct hn_tx_queue *txq) +-{ +- struct hn_txdesc *txd; +- +- if (rte_mempool_get(hv->tx_pool, (void **)&txd)) { +- ++txq->stats.ring_full; +- PMD_TX_LOG(DEBUG, "tx pool exhausted!"); +- return NULL; +- } +- +- txd->m = NULL; +- txd->queue_id = txq->queue_id; +- txd->packets = 0; +- txd->data_size = 0; +- txd->chim_size = 0; +- +- return txd; +-} +- ++/* ++ * Try and find a place in a send chimney buffer to put ++ * the small packet. If space is available, this routine ++ * returns a pointer of where to place the data. ++ * If no space, caller should try direct transmit. 
++ */ + static void * +-hn_try_txagg(struct hn_data *hv, struct hn_tx_queue *txq, uint32_t pktsize) ++hn_try_txagg(struct hn_data *hv, struct hn_tx_queue *txq, ++ struct hn_txdesc *txd, uint32_t pktsize) + { + struct hn_txdesc *agg_txd = txq->agg_txd; + struct rndis_packet_msg *pkt; +@@ -1085,7 +1140,7 @@ hn_try_txagg(struct hn_data *hv, struct hn_tx_queue *txq, uint32_t pktsize) + } + + chim = (uint8_t *)pkt + pkt->len; +- ++ txq->agg_prevpkt = chim; + txq->agg_pktleft--; + txq->agg_szleft -= pktsize; + if (txq->agg_szleft < HN_PKTSIZE_MIN(txq->agg_align)) { +@@ -1095,18 +1150,21 @@ hn_try_txagg(struct hn_data *hv, struct hn_tx_queue *txq, uint32_t pktsize) + */ + txq->agg_pktleft = 0; + } +- } else { +- agg_txd = hn_new_txd(hv, txq); +- if (!agg_txd) +- return NULL; +- +- chim = (uint8_t *)hv->chim_res->addr +- + agg_txd->chim_index * hv->chim_szmax; + +- txq->agg_txd = agg_txd; +- txq->agg_pktleft = txq->agg_pktmax - 1; +- txq->agg_szleft = txq->agg_szmax - pktsize; ++ hn_txd_put(txq, txd); ++ return chim; + } ++ ++ txd->chim_index = hn_chim_alloc(hv); ++ if (txd->chim_index == NVS_CHIM_IDX_INVALID) ++ return NULL; ++ ++ chim = (uint8_t *)hv->chim_res->addr ++ + txd->chim_index * hv->chim_szmax; ++ ++ txq->agg_txd = txd; ++ txq->agg_pktleft = txq->agg_pktmax - 1; ++ txq->agg_szleft = txq->agg_szmax - pktsize; + txq->agg_prevpkt = chim; + + return chim; +@@ -1314,7 +1372,7 @@ hn_xmit_pkts(void *ptxq, struct rte_mbuf **tx_pkts, uint16_t nb_pkts) + struct hn_data *hv = txq->hv; + struct rte_eth_dev *vf_dev; + bool need_sig = false; +- uint16_t nb_tx; ++ uint16_t nb_tx, avail; + int ret; + + if (unlikely(hv->closed)) +@@ -1329,13 +1387,19 @@ hn_xmit_pkts(void *ptxq, struct rte_mbuf **tx_pkts, uint16_t nb_pkts) + return (*vf_dev->tx_pkt_burst)(sub_q, tx_pkts, nb_pkts); + } + +- if (rte_mempool_avail_count(hv->tx_pool) <= txq->free_thresh) ++ avail = rte_mempool_avail_count(txq->txdesc_pool); ++ if (nb_pkts > avail || avail <= txq->free_thresh) + hn_process_events(hv, txq->queue_id, 0); + + for (nb_tx = 0; nb_tx < nb_pkts; nb_tx++) { + struct rte_mbuf *m = tx_pkts[nb_tx]; + uint32_t pkt_size = m->pkt_len + HN_RNDIS_PKT_LEN; + struct rndis_packet_msg *pkt; ++ struct hn_txdesc *txd; ++ ++ txd = hn_txd_get(txq); ++ if (txd == NULL) ++ break; + + /* For small packets aggregate them in chimney buffer */ + if (m->pkt_len < HN_TXCOPY_THRESHOLD && pkt_size <= txq->agg_szmax) { +@@ -1346,7 +1410,8 @@ hn_xmit_pkts(void *ptxq, struct rte_mbuf **tx_pkts, uint16_t nb_pkts) + goto fail; + } + +- pkt = hn_try_txagg(hv, txq, pkt_size); ++ ++ pkt = hn_try_txagg(hv, txq, txd, pkt_size); + if (unlikely(!pkt)) + break; + +@@ -1360,21 +1425,13 @@ hn_xmit_pkts(void *ptxq, struct rte_mbuf **tx_pkts, uint16_t nb_pkts) + hn_flush_txagg(txq, &need_sig)) + goto fail; + } else { +- struct hn_txdesc *txd; +- +- /* can send chimney data and large packet at once */ +- txd = txq->agg_txd; +- if (txd) { +- hn_reset_txagg(txq); +- } else { +- txd = hn_new_txd(hv, txq); +- if (unlikely(!txd)) +- break; +- } ++ /* Send any outstanding packets in buffer */ ++ if (txq->agg_txd && hn_flush_txagg(txq, &need_sig)) ++ goto fail; + + pkt = txd->rndis_pkt; + txd->m = m; +- txd->data_size += m->pkt_len; ++ txd->data_size = m->pkt_len; + ++txd->packets; + + hn_encap(pkt, queue_id, m); +@@ -1383,7 +1440,7 @@ hn_xmit_pkts(void *ptxq, struct rte_mbuf **tx_pkts, uint16_t nb_pkts) + if (unlikely(ret != 0)) { + PMD_TX_LOG(NOTICE, "sg send failed: %d", ret); + ++txq->stats.errors; +- rte_mempool_put(hv->tx_pool, txd); ++ hn_txd_put(txq, txd); + 
goto fail; + } + } +diff --git a/dpdk/drivers/net/netvsc/hn_var.h b/dpdk/drivers/net/netvsc/hn_var.h +index 05bc492511..b4c6171737 100644 +--- a/dpdk/drivers/net/netvsc/hn_var.h ++++ b/dpdk/drivers/net/netvsc/hn_var.h +@@ -52,6 +52,8 @@ struct hn_tx_queue { + uint16_t port_id; + uint16_t queue_id; + uint32_t free_thresh; ++ struct rte_mempool *txdesc_pool; ++ void *tx_rndis; + + /* Applied packet transmission aggregation limits. */ + uint32_t agg_szmax; +@@ -115,8 +117,10 @@ struct hn_data { + uint16_t num_queues; + uint64_t rss_offloads; + ++ rte_spinlock_t chim_lock; + struct rte_mem_resource *chim_res; /* UIO resource for Tx */ +- struct rte_mempool *tx_pool; /* Tx descriptors */ ++ struct rte_bitmap *chim_bmap; /* Send buffer map */ ++ void *chim_bmem; + uint32_t chim_szmax; /* Max size per buffer */ + uint32_t chim_cnt; /* Max packets per buffer */ + +@@ -135,8 +139,6 @@ struct hn_data { + uint8_t rss_key[40]; + uint16_t rss_ind[128]; + +- struct rte_ether_addr mac_addr; +- + struct rte_eth_dev_owner owner; + struct rte_intr_handle vf_intr; + +@@ -157,8 +159,8 @@ uint16_t hn_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, + uint16_t hn_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, + uint16_t nb_pkts); + +-int hn_tx_pool_init(struct rte_eth_dev *dev); +-void hn_tx_pool_uninit(struct rte_eth_dev *dev); ++int hn_chim_init(struct rte_eth_dev *dev); ++void hn_chim_uninit(struct rte_eth_dev *dev); + int hn_dev_link_update(struct rte_eth_dev *dev, int wait); + int hn_dev_tx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_idx, + uint16_t nb_desc, unsigned int socket_id, +diff --git a/dpdk/drivers/net/netvsc/hn_vf.c b/dpdk/drivers/net/netvsc/hn_vf.c +index 7a3734cadf..1261b2e2ef 100644 +--- a/dpdk/drivers/net/netvsc/hn_vf.c ++++ b/dpdk/drivers/net/netvsc/hn_vf.c +@@ -167,6 +167,17 @@ hn_nvs_handle_vfassoc(struct rte_eth_dev *dev, + hn_vf_remove(hv); + } + ++static void ++hn_vf_merge_desc_lim(struct rte_eth_desc_lim *lim, ++ const struct rte_eth_desc_lim *vf_lim) ++{ ++ lim->nb_max = RTE_MIN(vf_lim->nb_max, lim->nb_max); ++ lim->nb_min = RTE_MAX(vf_lim->nb_min, lim->nb_min); ++ lim->nb_align = RTE_MAX(vf_lim->nb_align, lim->nb_align); ++ lim->nb_seg_max = RTE_MIN(vf_lim->nb_seg_max, lim->nb_seg_max); ++ lim->nb_mtu_seg_max = RTE_MIN(vf_lim->nb_seg_max, lim->nb_seg_max); ++} ++ + /* + * Merge the info from the VF and synthetic path. 
+ * use the default config of the VF +@@ -196,11 +207,13 @@ static int hn_vf_info_merge(struct rte_eth_dev *vf_dev, + info->max_tx_queues); + info->tx_offload_capa &= vf_info.tx_offload_capa; + info->tx_queue_offload_capa &= vf_info.tx_queue_offload_capa; ++ hn_vf_merge_desc_lim(&info->tx_desc_lim, &vf_info.tx_desc_lim); + + info->min_rx_bufsize = RTE_MAX(vf_info.min_rx_bufsize, + info->min_rx_bufsize); + info->max_rx_pktlen = RTE_MAX(vf_info.max_rx_pktlen, + info->max_rx_pktlen); ++ hn_vf_merge_desc_lim(&info->rx_desc_lim, &vf_info.rx_desc_lim); + + return 0; + } +diff --git a/dpdk/drivers/net/nfp/nfp_net.c b/dpdk/drivers/net/nfp/nfp_net.c +index 3aafa7f80f..b6ff5ecd7d 100644 +--- a/dpdk/drivers/net/nfp/nfp_net.c ++++ b/dpdk/drivers/net/nfp/nfp_net.c +@@ -3014,7 +3014,7 @@ nfp_cpp_bridge_serve_write(int sockfd, struct nfp_cpp *cpp) + size_t count, curlen, totlen = 0; + int err = 0; + +- PMD_CPP_LOG(DEBUG, "%s: offset size %lu, count_size: %lu\n", __func__, ++ PMD_CPP_LOG(DEBUG, "%s: offset size %zu, count_size: %zu\n", __func__, + sizeof(off_t), sizeof(size_t)); + + /* Reading the count param */ +@@ -3033,9 +3033,9 @@ nfp_cpp_bridge_serve_write(int sockfd, struct nfp_cpp *cpp) + cpp_id = (offset >> 40) << 8; + nfp_offset = offset & ((1ull << 40) - 1); + +- PMD_CPP_LOG(DEBUG, "%s: count %lu and offset %ld\n", __func__, count, ++ PMD_CPP_LOG(DEBUG, "%s: count %zu and offset %jd\n", __func__, count, + offset); +- PMD_CPP_LOG(DEBUG, "%s: cpp_id %08x and nfp_offset %ld\n", __func__, ++ PMD_CPP_LOG(DEBUG, "%s: cpp_id %08x and nfp_offset %jd\n", __func__, + cpp_id, nfp_offset); + + /* Adjust length if not aligned */ +@@ -3067,12 +3067,12 @@ nfp_cpp_bridge_serve_write(int sockfd, struct nfp_cpp *cpp) + if (len > sizeof(tmpbuf)) + len = sizeof(tmpbuf); + +- PMD_CPP_LOG(DEBUG, "%s: Receive %u of %lu\n", __func__, ++ PMD_CPP_LOG(DEBUG, "%s: Receive %u of %zu\n", __func__, + len, count); + err = recv(sockfd, tmpbuf, len, MSG_WAITALL); + if (err != (int)len) { + RTE_LOG(ERR, PMD, +- "%s: error when receiving, %d of %lu\n", ++ "%s: error when receiving, %d of %zu\n", + __func__, err, count); + nfp_cpp_area_release(area); + nfp_cpp_area_free(area); +@@ -3116,7 +3116,7 @@ nfp_cpp_bridge_serve_read(int sockfd, struct nfp_cpp *cpp) + size_t count, curlen, totlen = 0; + int err = 0; + +- PMD_CPP_LOG(DEBUG, "%s: offset size %lu, count_size: %lu\n", __func__, ++ PMD_CPP_LOG(DEBUG, "%s: offset size %zu, count_size: %zu\n", __func__, + sizeof(off_t), sizeof(size_t)); + + /* Reading the count param */ +@@ -3135,9 +3135,9 @@ nfp_cpp_bridge_serve_read(int sockfd, struct nfp_cpp *cpp) + cpp_id = (offset >> 40) << 8; + nfp_offset = offset & ((1ull << 40) - 1); + +- PMD_CPP_LOG(DEBUG, "%s: count %lu and offset %ld\n", __func__, count, ++ PMD_CPP_LOG(DEBUG, "%s: count %zu and offset %jd\n", __func__, count, + offset); +- PMD_CPP_LOG(DEBUG, "%s: cpp_id %08x and nfp_offset %ld\n", __func__, ++ PMD_CPP_LOG(DEBUG, "%s: cpp_id %08x and nfp_offset %jd\n", __func__, + cpp_id, nfp_offset); + + /* Adjust length if not aligned */ +@@ -3174,13 +3174,13 @@ nfp_cpp_bridge_serve_read(int sockfd, struct nfp_cpp *cpp) + nfp_cpp_area_free(area); + return -EIO; + } +- PMD_CPP_LOG(DEBUG, "%s: sending %u of %lu\n", __func__, ++ PMD_CPP_LOG(DEBUG, "%s: sending %u of %zu\n", __func__, + len, count); + + err = send(sockfd, tmpbuf, len, 0); + if (err != (int)len) { + RTE_LOG(ERR, PMD, +- "%s: error when sending: %d of %lu\n", ++ "%s: error when sending: %d of %zu\n", + __func__, err, count); + nfp_cpp_area_release(area); + 
nfp_cpp_area_free(area); +@@ -3451,9 +3451,10 @@ nfp_pf_create_dev(struct rte_pci_device *dev, int port, int ports, + probe_failed: + rte_free(port_name); + /* free ports private data if primary process */ +- if (rte_eal_process_type() == RTE_PROC_PRIMARY) ++ if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + rte_free(eth_dev->data->dev_private); +- ++ eth_dev->data->dev_private = NULL; ++ } + rte_eth_dev_release_port(eth_dev); + + return retval; +diff --git a/dpdk/drivers/net/null/rte_eth_null.c b/dpdk/drivers/net/null/rte_eth_null.c +index 025b73acb3..beedd5f4b2 100644 +--- a/dpdk/drivers/net/null/rte_eth_null.c ++++ b/dpdk/drivers/net/null/rte_eth_null.c +@@ -584,6 +584,7 @@ rte_pmd_null_probe(struct rte_vdev_device *dev) + PMD_LOG(INFO, "Initializing pmd_null for %s", name); + + if (rte_eal_process_type() == RTE_PROC_SECONDARY) { ++ struct pmd_internals *internals; + eth_dev = rte_eth_dev_attach_secondary(name); + if (!eth_dev) { + PMD_LOG(ERR, "Failed to probe %s", name); +@@ -592,7 +593,8 @@ rte_pmd_null_probe(struct rte_vdev_device *dev) + /* TODO: request info from primary to set up Rx and Tx */ + eth_dev->dev_ops = &ops; + eth_dev->device = &dev->device; +- if (packet_copy) { ++ internals = eth_dev->data->dev_private; ++ if (internals->packet_copy) { + eth_dev->rx_pkt_burst = eth_null_copy_rx; + eth_dev->tx_pkt_burst = eth_null_copy_tx; + } else { +@@ -608,23 +610,18 @@ rte_pmd_null_probe(struct rte_vdev_device *dev) + if (kvlist == NULL) + return -1; + +- if (rte_kvargs_count(kvlist, ETH_NULL_PACKET_SIZE_ARG) == 1) { ++ ret = rte_kvargs_process(kvlist, ++ ETH_NULL_PACKET_SIZE_ARG, ++ &get_packet_size_arg, &packet_size); ++ if (ret < 0) ++ goto free_kvlist; + +- ret = rte_kvargs_process(kvlist, +- ETH_NULL_PACKET_SIZE_ARG, +- &get_packet_size_arg, &packet_size); +- if (ret < 0) +- goto free_kvlist; +- } +- +- if (rte_kvargs_count(kvlist, ETH_NULL_PACKET_COPY_ARG) == 1) { + +- ret = rte_kvargs_process(kvlist, +- ETH_NULL_PACKET_COPY_ARG, +- &get_packet_copy_arg, &packet_copy); +- if (ret < 0) +- goto free_kvlist; +- } ++ ret = rte_kvargs_process(kvlist, ++ ETH_NULL_PACKET_COPY_ARG, ++ &get_packet_copy_arg, &packet_copy); ++ if (ret < 0) ++ goto free_kvlist; + } + + PMD_LOG(INFO, "Configure pmd_null: packet size is %d, " +diff --git a/dpdk/drivers/net/octeontx/base/meson.build b/dpdk/drivers/net/octeontx/base/meson.build +index a06a2c89c9..e1060fc4ec 100644 +--- a/dpdk/drivers/net/octeontx/base/meson.build ++++ b/dpdk/drivers/net/octeontx/base/meson.build +@@ -10,7 +10,10 @@ sources = [ + depends = ['ethdev', 'mempool_octeontx'] + static_objs = [] + foreach d: depends +- static_objs += [get_variable('static_rte_' + d)] ++ if not is_variable('shared_rte_' + d) ++ subdir_done() ++ endif ++ static_objs += get_variable('static_rte_' + d) + endforeach + + c_args = cflags +diff --git a/dpdk/drivers/net/octeontx/octeontx_ethdev.c b/dpdk/drivers/net/octeontx/octeontx_ethdev.c +index 679803dd4c..e85acdde0a 100644 +--- a/dpdk/drivers/net/octeontx/octeontx_ethdev.c ++++ b/dpdk/drivers/net/octeontx/octeontx_ethdev.c +@@ -351,6 +351,10 @@ octeontx_dev_close(struct rte_eth_dev *dev) + rte_free(txq); + } + ++ /* Free MAC address table */ ++ rte_free(dev->data->mac_addrs); ++ dev->data->mac_addrs = NULL; ++ + dev->tx_pkt_burst = NULL; + dev->rx_pkt_burst = NULL; + } +@@ -1099,7 +1103,7 @@ octeontx_create(struct rte_vdev_device *dev, int port, uint8_t evdev, + octeontx_log_err("eth_dev->port_id (%d) is diff to orig (%d)", + data->port_id, nic->port_id); + res = -EINVAL; +- goto err; ++ goto 
free_mac_addrs; + } + + /* Update port_id mac to eth_dev */ +@@ -1118,6 +1122,9 @@ octeontx_create(struct rte_vdev_device *dev, int port, uint8_t evdev, + rte_eth_dev_probing_finish(eth_dev); + return data->port_id; + ++free_mac_addrs: ++ rte_free(data->mac_addrs); ++ data->mac_addrs = NULL; + err: + if (nic) + octeontx_port_close(nic); +diff --git a/dpdk/drivers/net/octeontx2/otx2_ethdev.c b/dpdk/drivers/net/octeontx2/otx2_ethdev.c +index ed329273dc..102d06b39b 100644 +--- a/dpdk/drivers/net/octeontx2/otx2_ethdev.c ++++ b/dpdk/drivers/net/octeontx2/otx2_ethdev.c +@@ -18,7 +18,8 @@ nix_get_rx_offload_capa(struct otx2_eth_dev *dev) + { + uint64_t capa = NIX_RX_OFFLOAD_CAPA; + +- if (otx2_dev_is_vf(dev)) ++ if (otx2_dev_is_vf(dev) || ++ dev->npc_flow.switch_header_type == OTX2_PRIV_FLAGS_HIGIG) + capa &= ~DEV_RX_OFFLOAD_TIMESTAMP; + + return capa; +@@ -204,7 +205,7 @@ cgx_intlbk_enable(struct otx2_eth_dev *dev, bool en) + { + struct otx2_mbox *mbox = dev->mbox; + +- if (otx2_dev_is_vf_or_sdp(dev)) ++ if (en && otx2_dev_is_vf_or_sdp(dev)) + return -ENOTSUP; + + if (en) +@@ -349,10 +350,7 @@ nix_cq_rq_init(struct rte_eth_dev *eth_dev, struct otx2_eth_dev *dev, + aq->rq.first_skip = first_skip; + aq->rq.later_skip = (sizeof(struct rte_mbuf) / 8); + aq->rq.flow_tagw = 32; /* 32-bits */ +- aq->rq.lpb_sizem1 = rte_pktmbuf_data_room_size(mp); +- aq->rq.lpb_sizem1 += rte_pktmbuf_priv_size(mp); +- aq->rq.lpb_sizem1 += sizeof(struct rte_mbuf); +- aq->rq.lpb_sizem1 /= 8; ++ aq->rq.lpb_sizem1 = mp->elt_size / 8; + aq->rq.lpb_sizem1 -= 1; /* Expressed in size minus one */ + aq->rq.ena = 1; + aq->rq.pb_caching = 0x2; /* First cache aligned block to LLC */ +@@ -1114,10 +1112,12 @@ nix_store_queue_cfg_and_then_release(struct rte_eth_dev *eth_dev) + txq = (struct otx2_eth_txq **)eth_dev->data->tx_queues; + for (i = 0; i < nb_txq; i++) { + if (txq[i] == NULL) { +- otx2_err("txq[%d] is already released", i); +- goto fail; ++ tx_qconf[i].valid = false; ++ otx2_info("txq[%d] is already released", i); ++ continue; + } + memcpy(&tx_qconf[i], &txq[i]->qconf, sizeof(*tx_qconf)); ++ tx_qconf[i].valid = true; + otx2_nix_tx_queue_release(txq[i]); + eth_dev->data->tx_queues[i] = NULL; + } +@@ -1125,10 +1125,12 @@ nix_store_queue_cfg_and_then_release(struct rte_eth_dev *eth_dev) + rxq = (struct otx2_eth_rxq **)eth_dev->data->rx_queues; + for (i = 0; i < nb_rxq; i++) { + if (rxq[i] == NULL) { +- otx2_err("rxq[%d] is already released", i); +- goto fail; ++ rx_qconf[i].valid = false; ++ otx2_info("rxq[%d] is already released", i); ++ continue; + } + memcpy(&rx_qconf[i], &rxq[i]->qconf, sizeof(*rx_qconf)); ++ rx_qconf[i].valid = true; + otx2_nix_rx_queue_release(rxq[i]); + eth_dev->data->rx_queues[i] = NULL; + } +@@ -1183,6 +1185,8 @@ nix_restore_queue_cfg(struct rte_eth_dev *eth_dev) + * queues are already setup in port_configure(). 
+ */ + for (i = 0; i < nb_txq; i++) { ++ if (!tx_qconf[i].valid) ++ continue; + rc = otx2_nix_tx_queue_setup(eth_dev, i, tx_qconf[i].nb_desc, + tx_qconf[i].socket_id, + &tx_qconf[i].conf.tx); +@@ -1198,6 +1202,8 @@ nix_restore_queue_cfg(struct rte_eth_dev *eth_dev) + free(tx_qconf); tx_qconf = NULL; + + for (i = 0; i < nb_rxq; i++) { ++ if (!rx_qconf[i].valid) ++ continue; + rc = otx2_nix_rx_queue_setup(eth_dev, i, rx_qconf[i].nb_desc, + rx_qconf[i].socket_id, + &rx_qconf[i].conf.rx, +@@ -1641,6 +1647,15 @@ otx2_nix_configure(struct rte_eth_dev *eth_dev) + goto fail_offloads; + } + ++ otx2_nix_err_intr_enb_dis(eth_dev, true); ++ otx2_nix_ras_intr_enb_dis(eth_dev, true); ++ ++ if (dev->ptp_en && ++ dev->npc_flow.switch_header_type == OTX2_PRIV_FLAGS_HIGIG) { ++ otx2_err("Both PTP and switch header enabled"); ++ goto free_nix_lf; ++ } ++ + rc = nix_lf_switch_header_type_enable(dev); + if (rc) { + otx2_err("Failed to enable switch type nix_lf rc=%d", rc); +@@ -1714,6 +1729,12 @@ otx2_nix_configure(struct rte_eth_dev *eth_dev) + goto cq_fini; + } + ++ rc = otx2_nix_flow_ctrl_init(eth_dev); ++ if (rc) { ++ otx2_err("Failed to init flow ctrl mode %d", rc); ++ goto cq_fini; ++ } ++ + rc = otx2_nix_mc_addr_list_install(eth_dev); + if (rc < 0) { + otx2_err("Failed to install mc address list rc=%d", rc); +diff --git a/dpdk/drivers/net/octeontx2/otx2_ethdev.h b/dpdk/drivers/net/octeontx2/otx2_ethdev.h +index 987e7607c4..864356e36c 100644 +--- a/dpdk/drivers/net/octeontx2/otx2_ethdev.h ++++ b/dpdk/drivers/net/octeontx2/otx2_ethdev.h +@@ -192,6 +192,7 @@ struct otx2_eth_qconf { + void *mempool; + uint32_t socket_id; + uint16_t nb_desc; ++ uint8_t valid; + }; + + struct otx2_fc_info { +@@ -438,6 +439,8 @@ int oxt2_nix_register_cq_irqs(struct rte_eth_dev *eth_dev); + void otx2_nix_unregister_irqs(struct rte_eth_dev *eth_dev); + void oxt2_nix_unregister_queue_irqs(struct rte_eth_dev *eth_dev); + void oxt2_nix_unregister_cq_irqs(struct rte_eth_dev *eth_dev); ++void otx2_nix_err_intr_enb_dis(struct rte_eth_dev *eth_dev, bool enb); ++void otx2_nix_ras_intr_enb_dis(struct rte_eth_dev *eth_dev, bool enb); + + int otx2_nix_rx_queue_intr_enable(struct rte_eth_dev *eth_dev, + uint16_t rx_queue_id); +@@ -504,6 +507,8 @@ int otx2_cgx_mac_addr_set(struct rte_eth_dev *eth_dev, + struct rte_ether_addr *addr); + + /* Flow Control */ ++int otx2_nix_flow_ctrl_init(struct rte_eth_dev *eth_dev); ++ + int otx2_nix_flow_ctrl_get(struct rte_eth_dev *eth_dev, + struct rte_eth_fc_conf *fc_conf); + +diff --git a/dpdk/drivers/net/octeontx2/otx2_ethdev_irq.c b/dpdk/drivers/net/octeontx2/otx2_ethdev_irq.c +index 2256e40b6f..b121488faf 100644 +--- a/dpdk/drivers/net/octeontx2/otx2_ethdev_irq.c ++++ b/dpdk/drivers/net/octeontx2/otx2_ethdev_irq.c +@@ -41,11 +41,11 @@ nix_lf_register_err_irq(struct rte_eth_dev *eth_dev) + vec = dev->nix_msixoff + NIX_LF_INT_VEC_ERR_INT; + + /* Clear err interrupt */ +- otx2_write64(~0ull, dev->base + NIX_LF_ERR_INT_ENA_W1C); ++ otx2_nix_err_intr_enb_dis(eth_dev, false); + /* Set used interrupt vectors */ + rc = otx2_register_irq(handle, nix_lf_err_irq, eth_dev, vec); + /* Enable all dev interrupt except for RQ_DISABLED */ +- otx2_write64(~BIT_ULL(11), dev->base + NIX_LF_ERR_INT_ENA_W1S); ++ otx2_nix_err_intr_enb_dis(eth_dev, true); + + return rc; + } +@@ -61,7 +61,7 @@ nix_lf_unregister_err_irq(struct rte_eth_dev *eth_dev) + vec = dev->nix_msixoff + NIX_LF_INT_VEC_ERR_INT; + + /* Clear err interrupt */ +- otx2_write64(~0ull, dev->base + NIX_LF_ERR_INT_ENA_W1C); ++ otx2_nix_err_intr_enb_dis(eth_dev, 
false); + otx2_unregister_irq(handle, nix_lf_err_irq, eth_dev, vec); + } + +@@ -97,11 +97,11 @@ nix_lf_register_ras_irq(struct rte_eth_dev *eth_dev) + vec = dev->nix_msixoff + NIX_LF_INT_VEC_POISON; + + /* Clear err interrupt */ +- otx2_write64(~0ull, dev->base + NIX_LF_RAS_ENA_W1C); ++ otx2_nix_ras_intr_enb_dis(eth_dev, false); + /* Set used interrupt vectors */ + rc = otx2_register_irq(handle, nix_lf_ras_irq, eth_dev, vec); + /* Enable dev interrupt */ +- otx2_write64(~0ull, dev->base + NIX_LF_RAS_ENA_W1S); ++ otx2_nix_ras_intr_enb_dis(eth_dev, true); + + return rc; + } +@@ -117,7 +117,7 @@ nix_lf_unregister_ras_irq(struct rte_eth_dev *eth_dev) + vec = dev->nix_msixoff + NIX_LF_INT_VEC_POISON; + + /* Clear err interrupt */ +- otx2_write64(~0ull, dev->base + NIX_LF_RAS_ENA_W1C); ++ otx2_nix_ras_intr_enb_dis(eth_dev, false); + otx2_unregister_irq(handle, nix_lf_ras_irq, eth_dev, vec); + } + +@@ -466,3 +466,29 @@ otx2_nix_rx_queue_intr_disable(struct rte_eth_dev *eth_dev, + + return 0; + } ++ ++void ++otx2_nix_err_intr_enb_dis(struct rte_eth_dev *eth_dev, bool enb) ++{ ++ struct otx2_eth_dev *dev = otx2_eth_pmd_priv(eth_dev); ++ ++ /* Enable all nix lf error interrupts except ++ * RQ_DISABLED and CQ_DISABLED. ++ */ ++ if (enb) ++ otx2_write64(~(BIT_ULL(11) | BIT_ULL(24)), ++ dev->base + NIX_LF_ERR_INT_ENA_W1S); ++ else ++ otx2_write64(~0ull, dev->base + NIX_LF_ERR_INT_ENA_W1C); ++} ++ ++void ++otx2_nix_ras_intr_enb_dis(struct rte_eth_dev *eth_dev, bool enb) ++{ ++ struct otx2_eth_dev *dev = otx2_eth_pmd_priv(eth_dev); ++ ++ if (enb) ++ otx2_write64(~0ull, dev->base + NIX_LF_RAS_ENA_W1S); ++ else ++ otx2_write64(~0ull, dev->base + NIX_LF_RAS_ENA_W1C); ++} +diff --git a/dpdk/drivers/net/octeontx2/otx2_flow_ctrl.c b/dpdk/drivers/net/octeontx2/otx2_flow_ctrl.c +index c6d7b1971a..76bf481001 100644 +--- a/dpdk/drivers/net/octeontx2/otx2_flow_ctrl.c ++++ b/dpdk/drivers/net/octeontx2/otx2_flow_ctrl.c +@@ -200,19 +200,18 @@ int + otx2_nix_update_flow_ctrl_mode(struct rte_eth_dev *eth_dev) + { + struct otx2_eth_dev *dev = otx2_eth_pmd_priv(eth_dev); ++ struct otx2_fc_info *fc = &dev->fc_info; + struct rte_eth_fc_conf fc_conf; + + if (otx2_dev_is_lbk(dev) || otx2_dev_is_sdp(dev)) + return 0; + + memset(&fc_conf, 0, sizeof(struct rte_eth_fc_conf)); +- /* Both Rx & Tx flow ctrl get enabled(RTE_FC_FULL) in HW +- * by AF driver, update those info in PMD structure. +- */ +- otx2_nix_flow_ctrl_get(eth_dev, &fc_conf); ++ fc_conf.mode = fc->mode; + + /* To avoid Link credit deadlock on Ax, disable Tx FC if it's enabled */ + if (otx2_dev_is_Ax(dev) && ++ (dev->npc_flow.switch_header_type != OTX2_PRIV_FLAGS_HIGIG) && + (fc_conf.mode == RTE_FC_FULL || fc_conf.mode == RTE_FC_RX_PAUSE)) { + fc_conf.mode = + (fc_conf.mode == RTE_FC_FULL || +@@ -222,3 +221,32 @@ otx2_nix_update_flow_ctrl_mode(struct rte_eth_dev *eth_dev) + + return otx2_nix_flow_ctrl_set(eth_dev, &fc_conf); + } ++ ++int ++otx2_nix_flow_ctrl_init(struct rte_eth_dev *eth_dev) ++{ ++ struct otx2_eth_dev *dev = otx2_eth_pmd_priv(eth_dev); ++ struct otx2_fc_info *fc = &dev->fc_info; ++ struct rte_eth_fc_conf fc_conf; ++ int rc; ++ ++ if (otx2_dev_is_lbk(dev) || otx2_dev_is_sdp(dev)) ++ return 0; ++ ++ memset(&fc_conf, 0, sizeof(struct rte_eth_fc_conf)); ++ /* Both Rx & Tx flow ctrl get enabled(RTE_FC_FULL) in HW ++ * by AF driver, update those info in PMD structure. 
++ */ ++ rc = otx2_nix_flow_ctrl_get(eth_dev, &fc_conf); ++ if (rc) ++ goto exit; ++ ++ fc->mode = fc_conf.mode; ++ fc->rx_pause = (fc_conf.mode == RTE_FC_FULL) || ++ (fc_conf.mode == RTE_FC_RX_PAUSE); ++ fc->tx_pause = (fc_conf.mode == RTE_FC_FULL) || ++ (fc_conf.mode == RTE_FC_TX_PAUSE); ++ ++exit: ++ return rc; ++} +diff --git a/dpdk/drivers/net/octeontx2/otx2_link.c b/dpdk/drivers/net/octeontx2/otx2_link.c +index f5679b06e7..4128f56d90 100644 +--- a/dpdk/drivers/net/octeontx2/otx2_link.c ++++ b/dpdk/drivers/net/octeontx2/otx2_link.c +@@ -82,32 +82,57 @@ otx2_eth_dev_link_status_update(struct otx2_dev *dev, + _rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_INTR_LSC, NULL); + } + ++static int ++lbk_link_update(struct rte_eth_link *link) ++{ ++ link->link_status = ETH_LINK_UP; ++ link->link_speed = ETH_SPEED_NUM_100G; ++ link->link_autoneg = ETH_LINK_FIXED; ++ link->link_duplex = ETH_LINK_FULL_DUPLEX; ++ return 0; ++} ++ ++static int ++cgx_link_update(struct otx2_eth_dev *dev, struct rte_eth_link *link) ++{ ++ struct otx2_mbox *mbox = dev->mbox; ++ struct cgx_link_info_msg *rsp; ++ int rc; ++ otx2_mbox_alloc_msg_cgx_get_linkinfo(mbox); ++ rc = otx2_mbox_process_msg(mbox, (void *)&rsp); ++ if (rc) ++ return rc; ++ ++ link->link_status = rsp->link_info.link_up; ++ link->link_speed = rsp->link_info.speed; ++ link->link_autoneg = ETH_LINK_AUTONEG; ++ ++ if (rsp->link_info.full_duplex) ++ link->link_duplex = rsp->link_info.full_duplex; ++ return 0; ++} ++ + int + otx2_nix_link_update(struct rte_eth_dev *eth_dev, int wait_to_complete) + { + struct otx2_eth_dev *dev = otx2_eth_pmd_priv(eth_dev); +- struct otx2_mbox *mbox = dev->mbox; +- struct cgx_link_info_msg *rsp; + struct rte_eth_link link; + int rc; + + RTE_SET_USED(wait_to_complete); ++ memset(&link, 0, sizeof(struct rte_eth_link)); + +- if (otx2_dev_is_lbk(dev) || otx2_dev_is_sdp(dev)) ++ if (otx2_dev_is_sdp(dev)) + return 0; + +- otx2_mbox_alloc_msg_cgx_get_linkinfo(mbox); +- rc = otx2_mbox_process_msg(mbox, (void *)&rsp); ++ if (otx2_dev_is_lbk(dev)) ++ rc = lbk_link_update(&link); ++ else ++ rc = cgx_link_update(dev, &link); ++ + if (rc) + return rc; + +- link.link_status = rsp->link_info.link_up; +- link.link_speed = rsp->link_info.speed; +- link.link_autoneg = ETH_LINK_AUTONEG; +- +- if (rsp->link_info.full_duplex) +- link.link_duplex = rsp->link_info.full_duplex; +- + return rte_eth_linkstatus_set(eth_dev, &link); + } + +diff --git a/dpdk/drivers/net/octeontx2/otx2_lookup.c b/dpdk/drivers/net/octeontx2/otx2_lookup.c +index bcf2ff4e8f..5685571166 100644 +--- a/dpdk/drivers/net/octeontx2/otx2_lookup.c ++++ b/dpdk/drivers/net/octeontx2/otx2_lookup.c +@@ -17,7 +17,7 @@ + const uint32_t * + otx2_nix_supported_ptypes_get(struct rte_eth_dev *eth_dev) + { +- struct otx2_eth_dev *dev = otx2_eth_pmd_priv(eth_dev); ++ RTE_SET_USED(eth_dev); + + static const uint32_t ptypes[] = { + RTE_PTYPE_L2_ETHER_QINQ, /* LB */ +@@ -56,10 +56,7 @@ otx2_nix_supported_ptypes_get(struct rte_eth_dev *eth_dev) + RTE_PTYPE_UNKNOWN, + }; + +- if (dev->rx_offload_flags & NIX_RX_OFFLOAD_PTYPE_F) +- return ptypes; +- else +- return NULL; ++ return ptypes; + } + + int +diff --git a/dpdk/drivers/net/octeontx2/otx2_ptp.c b/dpdk/drivers/net/octeontx2/otx2_ptp.c +index f34b9339c4..ae5a2b7cd1 100644 +--- a/dpdk/drivers/net/octeontx2/otx2_ptp.c ++++ b/dpdk/drivers/net/octeontx2/otx2_ptp.c +@@ -221,6 +221,11 @@ otx2_nix_timesync_enable(struct rte_eth_dev *eth_dev) + return -EINVAL; + } + ++ if (dev->npc_flow.switch_header_type == OTX2_PRIV_FLAGS_HIGIG) { ++ 
otx2_err("Both PTP and switch header enabled"); ++ return -EINVAL; ++ } ++ + /* Allocating a iova address for tx tstamp */ + const struct rte_memzone *ts; + ts = rte_eth_dma_zone_reserve(eth_dev, "otx2_ts", +diff --git a/dpdk/drivers/net/octeontx2/otx2_rss.c b/dpdk/drivers/net/octeontx2/otx2_rss.c +index bc7b64387a..d80579725a 100644 +--- a/dpdk/drivers/net/octeontx2/otx2_rss.c ++++ b/dpdk/drivers/net/octeontx2/otx2_rss.c +@@ -341,7 +341,7 @@ otx2_nix_rss_config(struct rte_eth_dev *eth_dev) + int rc; + + /* Skip further configuration if selected mode is not RSS */ +- if (eth_dev->data->dev_conf.rxmode.mq_mode != ETH_MQ_RX_RSS) ++ if (eth_dev->data->dev_conf.rxmode.mq_mode != ETH_MQ_RX_RSS || !qcnt) + return 0; + + /* Update default RSS key and cfg */ +diff --git a/dpdk/drivers/net/pfe/pfe_ethdev.c b/dpdk/drivers/net/pfe/pfe_ethdev.c +index 9403478198..b1de866d34 100644 +--- a/dpdk/drivers/net/pfe/pfe_ethdev.c ++++ b/dpdk/drivers/net/pfe/pfe_ethdev.c +@@ -13,7 +13,7 @@ + #include "pfe_logs.h" + #include "pfe_mod.h" + +-#define PFE_MAX_MACS 1 /*we can support upto 4 MACs per IF*/ ++#define PFE_MAX_MACS 1 /* we can support up to 4 MACs per IF */ + #define PFE_VDEV_GEM_ID_ARG "intf" + + struct pfe_vdev_init_params { +@@ -396,7 +396,6 @@ pfe_eth_exit(struct rte_eth_dev *dev, struct pfe *pfe) + /* Close the device file for link status */ + pfe_eth_close_cdev(dev->data->dev_private); + +- rte_free(dev->data->mac_addrs); + rte_eth_dev_release_port(dev); + pfe->nb_devs--; + } +@@ -990,7 +989,7 @@ pmd_pfe_probe(struct rte_vdev_device *vdev) + if (rc < 0) + return -EINVAL; + +- RTE_LOG(INFO, PMD, "Initializing pmd_pfe for %s Given gem-id %d\n", ++ PFE_PMD_LOG(INFO, "Initializing pmd_pfe for %s Given gem-id %d", + name, init_params.gem_id); + + if (g_pfe) { +@@ -1118,7 +1117,7 @@ pmd_pfe_probe(struct rte_vdev_device *vdev) + else + gem_id = init_params.gem_id; + +- RTE_LOG(INFO, PMD, "Init pmd_pfe for %s gem-id %d(given =%d)\n", ++ PFE_PMD_LOG(INFO, "Init pmd_pfe for %s gem-id %d(given =%d)", + name, gem_id, init_params.gem_id); + + rc = pfe_eth_init(vdev, g_pfe, gem_id); +diff --git a/dpdk/drivers/net/qede/base/ecore_dev.c b/dpdk/drivers/net/qede/base/ecore_dev.c +index 9d1db14590..86ecfb2690 100644 +--- a/dpdk/drivers/net/qede/base/ecore_dev.c ++++ b/dpdk/drivers/net/qede/base/ecore_dev.c +@@ -5253,7 +5253,6 @@ static void ecore_emul_hw_info_port_num(struct ecore_hwfn *p_hwfn, + + /* MISCS_REG_ECO_RESERVED[15:12]: num of ports in an engine */ + eco_reserved = ecore_rd(p_hwfn, p_ptt, MISCS_REG_ECO_RESERVED); +- + switch ((eco_reserved & 0xf000) >> 12) { + case 1: + p_dev->num_ports_in_engine = 1; +@@ -5268,7 +5267,7 @@ static void ecore_emul_hw_info_port_num(struct ecore_hwfn *p_hwfn, + DP_NOTICE(p_hwfn, false, + "Emulation: Unknown port mode [ECO_RESERVED 0x%08x]\n", + eco_reserved); +- p_dev->num_ports_in_engine = 2; /* Default to something */ ++ p_dev->num_ports_in_engine = 1; /* Default to something */ + break; + } + +@@ -5281,8 +5280,8 @@ static void ecore_emul_hw_info_port_num(struct ecore_hwfn *p_hwfn, + static void ecore_hw_info_port_num(struct ecore_hwfn *p_hwfn, + struct ecore_ptt *p_ptt) + { ++ u32 addr, global_offsize, global_addr, port_mode; + struct ecore_dev *p_dev = p_hwfn->p_dev; +- u32 addr, global_offsize, global_addr; + + #ifndef ASIC_ONLY + if (CHIP_REV_IS_TEDIBEAR(p_dev)) { +@@ -5304,15 +5303,32 @@ static void ecore_hw_info_port_num(struct ecore_hwfn *p_hwfn, + return; + } + +- addr = SECTION_OFFSIZE_ADDR(p_hwfn->mcp_info->public_base, +- PUBLIC_GLOBAL); +- global_offsize = 
ecore_rd(p_hwfn, p_ptt, addr); +- global_addr = SECTION_ADDR(global_offsize, 0); +- addr = global_addr + OFFSETOF(struct public_global, max_ports); +- p_dev->num_ports = (u8)ecore_rd(p_hwfn, p_ptt, addr); ++ /* Determine the number of ports per engine */ ++ port_mode = ecore_rd(p_hwfn, p_ptt, MISC_REG_PORT_MODE); ++ switch (port_mode) { ++ case 0x0: ++ p_dev->num_ports_in_engine = 1; ++ break; ++ case 0x1: ++ p_dev->num_ports_in_engine = 2; ++ break; ++ case 0x2: ++ p_dev->num_ports_in_engine = 4; ++ break; ++ default: ++ DP_NOTICE(p_hwfn, false, "Unknown port mode 0x%08x\n", ++ port_mode); ++ p_dev->num_ports_in_engine = 1; /* Default to something */ ++ break; ++ } + +- p_dev->num_ports_in_engine = p_dev->num_ports >> +- (ecore_device_num_engines(p_dev) - 1); ++ /* Get the total number of ports of the device */ ++ addr = SECTION_OFFSIZE_ADDR(p_hwfn->mcp_info->public_base, ++ PUBLIC_GLOBAL); ++ global_offsize = ecore_rd(p_hwfn, p_ptt, addr); ++ global_addr = SECTION_ADDR(global_offsize, 0); ++ addr = global_addr + OFFSETOF(struct public_global, max_ports); ++ p_dev->num_ports = (u8)ecore_rd(p_hwfn, p_ptt, addr); + } + + static void ecore_mcp_get_eee_caps(struct ecore_hwfn *p_hwfn, +@@ -5601,7 +5617,7 @@ ecore_hw_prepare_single(struct ecore_hwfn *p_hwfn, void OSAL_IOMEM *p_regview, + p_hwfn->db_phys_addr = db_phys_addr; + + if (IS_VF(p_dev)) +- return ecore_vf_hw_prepare(p_hwfn); ++ return ecore_vf_hw_prepare(p_hwfn, p_params); + + /* Validate that chip access is feasible */ + if (REG_RD(p_hwfn, PXP_PF_ME_OPAQUE_ADDR) == 0xffffffff) { +diff --git a/dpdk/drivers/net/qede/base/ecore_dev_api.h b/dpdk/drivers/net/qede/base/ecore_dev_api.h +index 4d5cc1a0fa..5ea8427a07 100644 +--- a/dpdk/drivers/net/qede/base/ecore_dev_api.h ++++ b/dpdk/drivers/net/qede/base/ecore_dev_api.h +@@ -277,6 +277,9 @@ struct ecore_hw_prepare_params { + + /* Indicates whether this PF serves a storage target */ + bool b_is_target; ++ ++ /* retry count for VF acquire on channel timeout */ ++ u8 acquire_retry_cnt; + }; + + /** +diff --git a/dpdk/drivers/net/qede/base/ecore_iov_api.h b/dpdk/drivers/net/qede/base/ecore_iov_api.h +index c998dbf8d5..5450018121 100644 +--- a/dpdk/drivers/net/qede/base/ecore_iov_api.h ++++ b/dpdk/drivers/net/qede/base/ecore_iov_api.h +@@ -51,6 +51,7 @@ enum ecore_iov_pf_to_vf_status { + PFVF_STATUS_NO_RESOURCE, + PFVF_STATUS_FORCED, + PFVF_STATUS_MALICIOUS, ++ PFVF_STATUS_ACQUIRED, + }; + + struct ecore_mcp_link_params; +diff --git a/dpdk/drivers/net/qede/base/ecore_sriov.c b/dpdk/drivers/net/qede/base/ecore_sriov.c +index deee04ac4b..e60257e190 100644 +--- a/dpdk/drivers/net/qede/base/ecore_sriov.c ++++ b/dpdk/drivers/net/qede/base/ecore_sriov.c +@@ -61,6 +61,39 @@ const char *qede_ecore_channel_tlvs_string[] = { + "CHANNEL_TLV_COALESCE_READ", + "CHANNEL_TLV_BULLETIN_UPDATE_MAC", + "CHANNEL_TLV_UPDATE_MTU", ++ "CHANNEL_TLV_RDMA_ACQUIRE", ++ "CHANNEL_TLV_RDMA_START", ++ "CHANNEL_TLV_RDMA_STOP", ++ "CHANNEL_TLV_RDMA_ADD_USER", ++ "CHANNEL_TLV_RDMA_REMOVE_USER", ++ "CHANNEL_TLV_RDMA_QUERY_COUNTERS", ++ "CHANNEL_TLV_RDMA_ALLOC_TID", ++ "CHANNEL_TLV_RDMA_REGISTER_TID", ++ "CHANNEL_TLV_RDMA_DEREGISTER_TID", ++ "CHANNEL_TLV_RDMA_FREE_TID", ++ "CHANNEL_TLV_RDMA_CREATE_CQ", ++ "CHANNEL_TLV_RDMA_RESIZE_CQ", ++ "CHANNEL_TLV_RDMA_DESTROY_CQ", ++ "CHANNEL_TLV_RDMA_CREATE_QP", ++ "CHANNEL_TLV_RDMA_MODIFY_QP", ++ "CHANNEL_TLV_RDMA_QUERY_QP", ++ "CHANNEL_TLV_RDMA_DESTROY_QP", ++ "CHANNEL_TLV_RDMA_CREATE_SRQ", ++ "CHANNEL_TLV_RDMA_MODIFY_SRQ", ++ "CHANNEL_TLV_RDMA_DESTROY_SRQ", ++ "CHANNEL_TLV_RDMA_QUERY_PORT", 
++ "CHANNEL_TLV_RDMA_QUERY_DEVICE", ++ "CHANNEL_TLV_RDMA_IWARP_CONNECT", ++ "CHANNEL_TLV_RDMA_IWARP_ACCEPT", ++ "CHANNEL_TLV_RDMA_IWARP_CREATE_LISTEN", ++ "CHANNEL_TLV_RDMA_IWARP_DESTROY_LISTEN", ++ "CHANNEL_TLV_RDMA_IWARP_PAUSE_LISTEN", ++ "CHANNEL_TLV_RDMA_IWARP_REJECT", ++ "CHANNEL_TLV_RDMA_IWARP_SEND_RTR", ++ "CHANNEL_TLV_ESTABLISH_LL2_CONN", ++ "CHANNEL_TLV_TERMINATE_LL2_CONN", ++ "CHANNEL_TLV_ASYNC_EVENT", ++ "CHANNEL_TLV_SOFT_FLR", + "CHANNEL_TLV_MAX" + }; + +diff --git a/dpdk/drivers/net/qede/base/ecore_vf.c b/dpdk/drivers/net/qede/base/ecore_vf.c +index 24846cfb51..0e5b7d5eb3 100644 +--- a/dpdk/drivers/net/qede/base/ecore_vf.c ++++ b/dpdk/drivers/net/qede/base/ecore_vf.c +@@ -226,7 +226,6 @@ enum _ecore_status_t ecore_vf_pf_release(struct ecore_hwfn *p_hwfn) + return _ecore_vf_pf_release(p_hwfn, true); + } + +-#define VF_ACQUIRE_THRESH 3 + static void ecore_vf_pf_acquire_reduce_resc(struct ecore_hwfn *p_hwfn, + struct vf_pf_resc_request *p_req, + struct pf_vf_resc *p_resp) +@@ -251,13 +250,47 @@ static void ecore_vf_pf_acquire_reduce_resc(struct ecore_hwfn *p_hwfn, + p_req->num_cids = p_resp->num_cids; + } + +-static enum _ecore_status_t ecore_vf_pf_acquire(struct ecore_hwfn *p_hwfn) ++static enum _ecore_status_t ++ecore_vf_pf_soft_flr_acquire(struct ecore_hwfn *p_hwfn) ++{ ++ struct ecore_vf_iov *p_iov = p_hwfn->vf_iov_info; ++ struct pfvf_def_resp_tlv *resp; ++ struct vfpf_soft_flr_tlv *req; ++ enum _ecore_status_t rc; ++ ++ req = ecore_vf_pf_prep(p_hwfn, CHANNEL_TLV_SOFT_FLR, sizeof(*req)); ++ ++ /* add list termination tlv */ ++ ecore_add_tlv(&p_iov->offset, ++ CHANNEL_TLV_LIST_END, ++ sizeof(struct channel_list_end_tlv)); ++ ++ resp = &p_iov->pf2vf_reply->default_resp; ++ rc = ecore_send_msg2pf(p_hwfn, &resp->hdr.status, sizeof(*resp)); ++ ++ DP_VERBOSE(p_hwfn, ECORE_MSG_IOV, "rc=0x%x\n", rc); ++ ++ /* to release the mutex as ecore_vf_pf_acquire() take the mutex */ ++ ecore_vf_pf_req_end(p_hwfn, ECORE_AGAIN); ++ ++ /* As of today, there is no mechanism in place for VF to know the FLR ++ * status, so sufficiently (worst case time) wait for FLR to complete, ++ * as mailbox request to MFW by the PF for initiating VF flr and PF ++ * processing VF FLR could take time. 
++ */ ++ OSAL_MSLEEP(3000); ++ ++ return ecore_vf_pf_acquire(p_hwfn); ++} ++ ++enum _ecore_status_t ecore_vf_pf_acquire(struct ecore_hwfn *p_hwfn) + { + struct ecore_vf_iov *p_iov = p_hwfn->vf_iov_info; + struct pfvf_acquire_resp_tlv *resp = &p_iov->pf2vf_reply->acquire_resp; + struct pf_vf_pfdev_info *pfdev_info = &resp->pfdev_info; + struct ecore_vf_acquire_sw_info vf_sw_info; + struct ecore_dev *p_dev = p_hwfn->p_dev; ++ u8 retry_cnt = p_iov->acquire_retry_cnt; + struct vf_pf_resc_request *p_resc; + bool resources_acquired = false; + struct vfpf_acquire_tlv *req; +@@ -318,6 +351,14 @@ static enum _ecore_status_t ecore_vf_pf_acquire(struct ecore_hwfn *p_hwfn) + /* send acquire request */ + rc = ecore_send_msg2pf(p_hwfn, + &resp->hdr.status, sizeof(*resp)); ++ ++ if (retry_cnt && rc == ECORE_TIMEOUT) { ++ DP_VERBOSE(p_hwfn, ECORE_MSG_IOV, ++ "VF retrying to acquire due to VPC timeout\n"); ++ retry_cnt--; ++ continue; ++ } ++ + if (rc != ECORE_SUCCESS) + goto exit; + +@@ -343,7 +384,7 @@ static enum _ecore_status_t ecore_vf_pf_acquire(struct ecore_hwfn *p_hwfn) + resources_acquired = true; + } /* PF refuses to allocate our resources */ + else if (resp->hdr.status == PFVF_STATUS_NO_RESOURCE && +- attempts < VF_ACQUIRE_THRESH) { ++ attempts < ECORE_VF_ACQUIRE_THRESH) { + ecore_vf_pf_acquire_reduce_resc(p_hwfn, p_resc, + &resp->resc); + +@@ -391,6 +432,9 @@ static enum _ecore_status_t ecore_vf_pf_acquire(struct ecore_hwfn *p_hwfn) + "PF rejected acquisition by VF\n"); + rc = ECORE_INVAL; + goto exit; ++ } else if (resp->hdr.status == PFVF_STATUS_ACQUIRED) { ++ ecore_vf_pf_req_end(p_hwfn, ECORE_AGAIN); ++ return ecore_vf_pf_soft_flr_acquire(p_hwfn); + } else { + DP_ERR(p_hwfn, + "PF returned err %d to VF acquisition request\n", +@@ -477,7 +521,9 @@ u32 ecore_vf_hw_bar_size(struct ecore_hwfn *p_hwfn, + return 0; + } + +-enum _ecore_status_t ecore_vf_hw_prepare(struct ecore_hwfn *p_hwfn) ++enum _ecore_status_t ++ecore_vf_hw_prepare(struct ecore_hwfn *p_hwfn, ++ struct ecore_hw_prepare_params *p_params) + { + struct ecore_hwfn *p_lead = ECORE_LEADING_HWFN(p_hwfn->p_dev); + struct ecore_vf_iov *p_iov; +@@ -583,6 +629,7 @@ enum _ecore_status_t ecore_vf_hw_prepare(struct ecore_hwfn *p_hwfn) + #endif + OSAL_MUTEX_INIT(&p_iov->mutex); + ++ p_iov->acquire_retry_cnt = p_params->acquire_retry_cnt; + p_hwfn->vf_iov_info = p_iov; + + p_hwfn->hw_info.personality = ECORE_PCI_ETH; +diff --git a/dpdk/drivers/net/qede/base/ecore_vf.h b/dpdk/drivers/net/qede/base/ecore_vf.h +index a07f82ebd9..f027eba3ea 100644 +--- a/dpdk/drivers/net/qede/base/ecore_vf.h ++++ b/dpdk/drivers/net/qede/base/ecore_vf.h +@@ -11,6 +11,7 @@ + #include "ecore_vf_api.h" + #include "ecore_l2_api.h" + #include "ecore_vfpf_if.h" ++#include "ecore_dev_api.h" + + /* Default number of CIDs [total of both Rx and Tx] to be requested + * by default. +@@ -59,6 +60,9 @@ struct ecore_vf_iov { + * bar or via the doorbell bar. + */ + bool b_doorbell_bar; ++ ++ /* retry count for VF acquire on channel timeout */ ++ u8 acquire_retry_cnt; + }; + + /** +@@ -72,6 +76,8 @@ struct ecore_vf_iov { + enum _ecore_status_t ecore_vf_pf_get_coalesce(struct ecore_hwfn *p_hwfn, + u16 *p_coal, + struct ecore_queue_cid *p_cid); ++ ++enum _ecore_status_t ecore_vf_pf_acquire(struct ecore_hwfn *p_hwfn); + /** + * @brief VF - Set Rx/Tx coalesce per VF's relative queue. + * Coalesce value '0' will omit the configuration. 
+@@ -92,10 +98,13 @@ enum _ecore_status_t ecore_vf_pf_set_coalesce(struct ecore_hwfn *p_hwfn, + * sends ACQUIRE message + * + * @param p_hwfn ++ * @param p_params + * + * @return enum _ecore_status_t + */ +-enum _ecore_status_t ecore_vf_hw_prepare(struct ecore_hwfn *p_hwfn); ++enum _ecore_status_t ++ecore_vf_hw_prepare(struct ecore_hwfn *p_hwfn, ++ struct ecore_hw_prepare_params *p_params); + + /** + * @brief VF - start the RX Queue by sending a message to the PF +diff --git a/dpdk/drivers/net/qede/base/ecore_vf_api.h b/dpdk/drivers/net/qede/base/ecore_vf_api.h +index 1a9fb3b1f2..43951a9a34 100644 +--- a/dpdk/drivers/net/qede/base/ecore_vf_api.h ++++ b/dpdk/drivers/net/qede/base/ecore_vf_api.h +@@ -11,6 +11,9 @@ + #include "ecore_mcp_api.h" + + #ifdef CONFIG_ECORE_SRIOV ++ ++#define ECORE_VF_ACQUIRE_THRESH 3 ++ + /** + * @brief Read the VF bulletin and act on it if needed + * +diff --git a/dpdk/drivers/net/qede/base/ecore_vfpf_if.h b/dpdk/drivers/net/qede/base/ecore_vfpf_if.h +index c7ecb01c28..f92dc428af 100644 +--- a/dpdk/drivers/net/qede/base/ecore_vfpf_if.h ++++ b/dpdk/drivers/net/qede/base/ecore_vfpf_if.h +@@ -251,6 +251,13 @@ struct vfpf_qid_tlv { + u8 padding[3]; + }; + ++/* Soft FLR req */ ++struct vfpf_soft_flr_tlv { ++ struct vfpf_first_tlv first_tlv; ++ u32 reserved1; ++ u32 reserved2; ++}; ++ + /* Setup Queue */ + struct vfpf_start_rxq_tlv { + struct vfpf_first_tlv first_tlv; +@@ -557,6 +564,7 @@ union vfpf_tlvs { + struct vfpf_read_coal_req_tlv read_coal_req; + struct vfpf_bulletin_update_mac_tlv bulletin_update_mac; + struct vfpf_update_mtu_tlv update_mtu; ++ struct vfpf_soft_flr_tlv soft_flr; + struct tlv_buffer_size tlv_buf_size; + }; + +@@ -689,6 +697,39 @@ enum { + CHANNEL_TLV_COALESCE_READ, + CHANNEL_TLV_BULLETIN_UPDATE_MAC, + CHANNEL_TLV_UPDATE_MTU, ++ CHANNEL_TLV_RDMA_ACQUIRE, ++ CHANNEL_TLV_RDMA_START, ++ CHANNEL_TLV_RDMA_STOP, ++ CHANNEL_TLV_RDMA_ADD_USER, ++ CHANNEL_TLV_RDMA_REMOVE_USER, ++ CHANNEL_TLV_RDMA_QUERY_COUNTERS, ++ CHANNEL_TLV_RDMA_ALLOC_TID, ++ CHANNEL_TLV_RDMA_REGISTER_TID, ++ CHANNEL_TLV_RDMA_DEREGISTER_TID, ++ CHANNEL_TLV_RDMA_FREE_TID, ++ CHANNEL_TLV_RDMA_CREATE_CQ, ++ CHANNEL_TLV_RDMA_RESIZE_CQ, ++ CHANNEL_TLV_RDMA_DESTROY_CQ, ++ CHANNEL_TLV_RDMA_CREATE_QP, ++ CHANNEL_TLV_RDMA_MODIFY_QP, ++ CHANNEL_TLV_RDMA_QUERY_QP, ++ CHANNEL_TLV_RDMA_DESTROY_QP, ++ CHANNEL_TLV_RDMA_QUERY_PORT, ++ CHANNEL_TLV_RDMA_QUERY_DEVICE, ++ CHANNEL_TLV_RDMA_IWARP_CONNECT, ++ CHANNEL_TLV_RDMA_IWARP_ACCEPT, ++ CHANNEL_TLV_RDMA_IWARP_CREATE_LISTEN, ++ CHANNEL_TLV_RDMA_IWARP_DESTROY_LISTEN, ++ CHANNEL_TLV_RDMA_IWARP_PAUSE_LISTEN, ++ CHANNEL_TLV_RDMA_IWARP_REJECT, ++ CHANNEL_TLV_RDMA_IWARP_SEND_RTR, ++ CHANNEL_TLV_ESTABLISH_LL2_CONN, ++ CHANNEL_TLV_TERMINATE_LL2_CONN, ++ CHANNEL_TLV_ASYNC_EVENT, ++ CHANNEL_TLV_RDMA_CREATE_SRQ, ++ CHANNEL_TLV_RDMA_MODIFY_SRQ, ++ CHANNEL_TLV_RDMA_DESTROY_SRQ, ++ CHANNEL_TLV_SOFT_FLR, + CHANNEL_TLV_MAX, + + /* Required for iterating over vport-update tlvs. 
+diff --git a/dpdk/drivers/net/qede/base/mcp_public.h b/dpdk/drivers/net/qede/base/mcp_public.h +index 98b9723dd4..6667c2d7ab 100644 +--- a/dpdk/drivers/net/qede/base/mcp_public.h ++++ b/dpdk/drivers/net/qede/base/mcp_public.h +@@ -1290,6 +1290,7 @@ struct public_drv_mb { + /*deprecated don't use*/ + #define DRV_MSG_CODE_INITIATE_FLR_DEPRECATED 0x02000000 + #define DRV_MSG_CODE_INITIATE_PF_FLR 0x02010000 ++#define DRV_MSG_CODE_INITIATE_VF_FLR 0x02020000 + #define DRV_MSG_CODE_VF_DISABLED_DONE 0xc0000000 + #define DRV_MSG_CODE_CFG_VF_MSIX 0xc0010000 + #define DRV_MSG_CODE_CFG_PF_VFS_MSIX 0xc0020000 +@@ -1749,6 +1750,7 @@ struct public_drv_mb { + #define FW_MSG_CODE_NIG_DRAIN_DONE 0x30000000 + #define FW_MSG_CODE_VF_DISABLED_DONE 0xb0000000 + #define FW_MSG_CODE_DRV_CFG_VF_MSIX_DONE 0xb0010000 ++#define FW_MSG_CODE_INITIATE_VF_FLR_OK 0xb0030000 + #define FW_MSG_CODE_ERR_RESOURCE_TEMPORARY_UNAVAILABLE 0x008b0000 + #define FW_MSG_CODE_ERR_RESOURCE_ALREADY_ALLOCATED 0x008c0000 + #define FW_MSG_CODE_ERR_RESOURCE_NOT_ALLOCATED 0x008d0000 +diff --git a/dpdk/drivers/net/qede/qede_ethdev.c b/dpdk/drivers/net/qede/qede_ethdev.c +index 19d2e96191..2a1c82ac9a 100644 +--- a/dpdk/drivers/net/qede/qede_ethdev.c ++++ b/dpdk/drivers/net/qede/qede_ethdev.c +@@ -1064,7 +1064,7 @@ static int qede_dev_start(struct rte_eth_dev *eth_dev) + qede_reset_queue_stats(qdev, true); + + /* Newer SR-IOV PF driver expects RX/TX queues to be started before +- * enabling RSS. Hence RSS configuration is deferred upto this point. ++ * enabling RSS. Hence RSS configuration is deferred up to this point. + * Also, we would like to retain similar behavior in PF case, so we + * don't do PF/VF specific check here. + */ +@@ -1076,6 +1076,9 @@ static int qede_dev_start(struct rte_eth_dev *eth_dev) + if (qede_activate_vport(eth_dev, true)) + goto err; + ++ /* Bring-up the link */ ++ qede_dev_set_link_state(eth_dev, true); ++ + /* Update link status */ + qede_link_update(eth_dev, 0); + +@@ -1097,6 +1100,12 @@ static void qede_dev_stop(struct rte_eth_dev *eth_dev) + + PMD_INIT_FUNC_TRACE(edev); + ++ /* Bring the link down */ ++ qede_dev_set_link_state(eth_dev, false); ++ ++ /* Update link status */ ++ qede_link_update(eth_dev, 0); ++ + /* Disable vport */ + if (qede_activate_vport(eth_dev, false)) + return; +@@ -1182,6 +1191,8 @@ static int qede_dev_configure(struct rte_eth_dev *eth_dev) + struct qede_dev *qdev = QEDE_INIT_QDEV(eth_dev); + struct ecore_dev *edev = QEDE_INIT_EDEV(qdev); + struct rte_eth_rxmode *rxmode = ð_dev->data->dev_conf.rxmode; ++ uint8_t num_rxqs; ++ uint8_t num_txqs; + int ret; + + PMD_INIT_FUNC_TRACE(edev); +@@ -1214,12 +1225,17 @@ static int qede_dev_configure(struct rte_eth_dev *eth_dev) + if (qede_check_fdir_support(eth_dev)) + return -ENOTSUP; + +- qede_dealloc_fp_resc(eth_dev); +- qdev->num_tx_queues = eth_dev->data->nb_tx_queues * edev->num_hwfns; +- qdev->num_rx_queues = eth_dev->data->nb_rx_queues * edev->num_hwfns; +- +- if (qede_alloc_fp_resc(qdev)) +- return -ENOMEM; ++ /* Allocate/reallocate fastpath resources only for new queue config */ ++ num_txqs = eth_dev->data->nb_tx_queues * edev->num_hwfns; ++ num_rxqs = eth_dev->data->nb_rx_queues * edev->num_hwfns; ++ if (qdev->num_tx_queues != num_txqs || ++ qdev->num_rx_queues != num_rxqs) { ++ qede_dealloc_fp_resc(eth_dev); ++ qdev->num_tx_queues = num_txqs; ++ qdev->num_rx_queues = num_rxqs; ++ if (qede_alloc_fp_resc(qdev)) ++ return -ENOMEM; ++ } + + /* If jumbo enabled adjust MTU */ + if (rxmode->offloads & DEV_RX_OFFLOAD_JUMBO_FRAME) +@@ -1472,7 
+1488,8 @@ static void qede_dev_close(struct rte_eth_dev *eth_dev) + if (eth_dev->data->dev_started) + qede_dev_stop(eth_dev); + +- qede_stop_vport(edev); ++ if (qdev->vport_started) ++ qede_stop_vport(edev); + qdev->vport_started = false; + qede_fdir_dealloc_resc(eth_dev); + qede_dealloc_fp_resc(eth_dev); +@@ -1480,8 +1497,6 @@ static void qede_dev_close(struct rte_eth_dev *eth_dev) + eth_dev->data->nb_rx_queues = 0; + eth_dev->data->nb_tx_queues = 0; + +- /* Bring the link down */ +- qede_dev_set_link_state(eth_dev, false); + qdev->ops->common->slowpath_stop(edev); + qdev->ops->common->remove(edev); + rte_intr_disable(&pci_dev->intr_handle); +@@ -2604,9 +2619,6 @@ static int qede_common_dev_init(struct rte_eth_dev *eth_dev, bool is_vf) + + eth_dev->dev_ops = (is_vf) ? &qede_eth_vf_dev_ops : &qede_eth_dev_ops; + +- /* Bring-up the link */ +- qede_dev_set_link_state(eth_dev, true); +- + adapter->num_tx_queues = 0; + adapter->num_rx_queues = 0; + SLIST_INIT(&adapter->arfs_info.arfs_list_head); +diff --git a/dpdk/drivers/net/qede/qede_main.c b/dpdk/drivers/net/qede/qede_main.c +index 4eb79d0fbb..8580cbcd7f 100644 +--- a/dpdk/drivers/net/qede/qede_main.c ++++ b/dpdk/drivers/net/qede/qede_main.c +@@ -56,6 +56,10 @@ qed_probe(struct ecore_dev *edev, struct rte_pci_device *pci_dev, + qed_init_pci(edev, pci_dev); + + memset(&hw_prepare_params, 0, sizeof(hw_prepare_params)); ++ ++ if (is_vf) ++ hw_prepare_params.acquire_retry_cnt = ECORE_VF_ACQUIRE_THRESH; ++ + hw_prepare_params.personality = ECORE_PCI_ETH; + hw_prepare_params.drv_resc_alloc = false; + hw_prepare_params.chk_reg_fifo = false; +diff --git a/dpdk/drivers/net/qede/qede_rxtx.c b/dpdk/drivers/net/qede/qede_rxtx.c +index a28dd0a07f..3c55c0efdf 100644 +--- a/dpdk/drivers/net/qede/qede_rxtx.c ++++ b/dpdk/drivers/net/qede/qede_rxtx.c +@@ -593,12 +593,14 @@ qede_alloc_mem_sb(struct qede_dev *qdev, struct ecore_sb_info *sb_info, + + int qede_alloc_fp_resc(struct qede_dev *qdev) + { +- struct ecore_dev *edev = &qdev->edev; ++ struct ecore_dev *edev = QEDE_INIT_EDEV(qdev); + struct qede_fastpath *fp; + uint32_t num_sbs; + uint16_t sb_idx; + int i; + ++ PMD_INIT_FUNC_TRACE(edev); ++ + if (IS_VF(edev)) + ecore_vf_get_num_sbs(ECORE_LEADING_HWFN(edev), &num_sbs); + else +diff --git a/dpdk/drivers/net/ring/rte_eth_ring.c b/dpdk/drivers/net/ring/rte_eth_ring.c +index 41acbc513d..f0fafa0c0d 100644 +--- a/dpdk/drivers/net/ring/rte_eth_ring.c ++++ b/dpdk/drivers/net/ring/rte_eth_ring.c +@@ -246,6 +246,7 @@ static const struct eth_dev_ops ops = { + + static int + do_eth_dev_ring_create(const char *name, ++ struct rte_vdev_device *vdev, + struct rte_ring * const rx_queues[], + const unsigned int nb_rx_queues, + struct rte_ring *const tx_queues[], +@@ -291,12 +292,15 @@ do_eth_dev_ring_create(const char *name, + } + + /* now put it all together ++ * - store EAL device in eth_dev, + * - store queue data in internals, + * - store numa_node info in eth_dev_data + * - point eth_dev_data to internals + * - and point eth_dev structure to new eth_dev_data structure + */ + ++ eth_dev->device = &vdev->device; ++ + data = eth_dev->data; + data->rx_queues = rx_queues_local; + data->tx_queues = tx_queues_local; +@@ -408,7 +412,9 @@ rte_eth_from_ring(struct rte_ring *r) + } + + static int +-eth_dev_ring_create(const char *name, const unsigned int numa_node, ++eth_dev_ring_create(const char *name, ++ struct rte_vdev_device *vdev, ++ const unsigned int numa_node, + enum dev_action action, struct rte_eth_dev **eth_dev) + { + /* rx and tx are so-called from point of view 
of first port. +@@ -438,7 +444,7 @@ eth_dev_ring_create(const char *name, const unsigned int numa_node, + return -1; + } + +- if (do_eth_dev_ring_create(name, rxtx, num_rings, rxtx, num_rings, ++ if (do_eth_dev_ring_create(name, vdev, rxtx, num_rings, rxtx, num_rings, + numa_node, action, eth_dev) < 0) + return -1; + +@@ -560,12 +566,12 @@ rte_pmd_ring_probe(struct rte_vdev_device *dev) + PMD_LOG(INFO, "Initializing pmd_ring for %s", name); + + if (params == NULL || params[0] == '\0') { +- ret = eth_dev_ring_create(name, rte_socket_id(), DEV_CREATE, ++ ret = eth_dev_ring_create(name, dev, rte_socket_id(), DEV_CREATE, + ð_dev); + if (ret == -1) { + PMD_LOG(INFO, + "Attach to pmd_ring for %s", name); +- ret = eth_dev_ring_create(name, rte_socket_id(), ++ ret = eth_dev_ring_create(name, dev, rte_socket_id(), + DEV_ATTACH, ð_dev); + } + } else { +@@ -574,19 +580,16 @@ rte_pmd_ring_probe(struct rte_vdev_device *dev) + if (!kvlist) { + PMD_LOG(INFO, + "Ignoring unsupported parameters when creatingrings-backed ethernet device"); +- ret = eth_dev_ring_create(name, rte_socket_id(), ++ ret = eth_dev_ring_create(name, dev, rte_socket_id(), + DEV_CREATE, ð_dev); + if (ret == -1) { + PMD_LOG(INFO, + "Attach to pmd_ring for %s", + name); +- ret = eth_dev_ring_create(name, rte_socket_id(), ++ ret = eth_dev_ring_create(name, dev, rte_socket_id(), + DEV_ATTACH, ð_dev); + } + +- if (eth_dev) +- eth_dev->device = &dev->device; +- + return ret; + } + +@@ -597,7 +600,7 @@ rte_pmd_ring_probe(struct rte_vdev_device *dev) + if (ret < 0) + goto out_free; + +- ret = do_eth_dev_ring_create(name, ++ ret = do_eth_dev_ring_create(name, dev, + internal_args->rx_queues, + internal_args->nb_rx_queues, + internal_args->tx_queues, +@@ -627,6 +630,7 @@ rte_pmd_ring_probe(struct rte_vdev_device *dev) + + for (info->count = 0; info->count < info->total; info->count++) { + ret = eth_dev_ring_create(info->list[info->count].name, ++ dev, + info->list[info->count].node, + info->list[info->count].action, + ð_dev); +@@ -635,7 +639,7 @@ rte_pmd_ring_probe(struct rte_vdev_device *dev) + PMD_LOG(INFO, + "Attach to pmd_ring for %s", + name); +- ret = eth_dev_ring_create(name, ++ ret = eth_dev_ring_create(name, dev, + info->list[info->count].node, + DEV_ATTACH, + ð_dev); +@@ -644,9 +648,6 @@ rte_pmd_ring_probe(struct rte_vdev_device *dev) + } + } + +- if (eth_dev) +- eth_dev->device = &dev->device; +- + out_free: + rte_kvargs_free(kvlist); + rte_free(info); +diff --git a/dpdk/drivers/net/sfc/base/ef10_evb.c b/dpdk/drivers/net/sfc/base/ef10_evb.c +index 1788a2c96a..9b33e89fc1 100644 +--- a/dpdk/drivers/net/sfc/base/ef10_evb.c ++++ b/dpdk/drivers/net/sfc/base/ef10_evb.c +@@ -9,15 +9,13 @@ + + #if EFSYS_OPT_EVB + +-#if EFSYS_OPT_HUNTINGTON || EFSYS_OPT_MEDFORD || EFSYS_OPT_MEDFORD2 ++#if EFX_OPTS_EF10() + + __checkReturn efx_rc_t + ef10_evb_init( + __in efx_nic_t *enp) + { +- EFSYS_ASSERT(enp->en_family == EFX_FAMILY_HUNTINGTON || +- enp->en_family == EFX_FAMILY_MEDFORD || +- enp->en_family == EFX_FAMILY_MEDFORD2); ++ EFSYS_ASSERT(EFX_FAMILY_IS_EF10(enp)); + + return (0); + } +@@ -26,12 +24,10 @@ ef10_evb_init( + ef10_evb_fini( + __in efx_nic_t *enp) + { +- EFSYS_ASSERT(enp->en_family == EFX_FAMILY_HUNTINGTON || +- enp->en_family == EFX_FAMILY_MEDFORD || +- enp->en_family == EFX_FAMILY_MEDFORD2); ++ EFSYS_ASSERT(EFX_FAMILY_IS_EF10(enp)); + } + +- __checkReturn efx_rc_t ++static __checkReturn efx_rc_t + efx_mcdi_vswitch_alloc( + __in efx_nic_t *enp, + __in efx_vport_id_t vport_id, +@@ -98,7 +94,7 @@ efx_mcdi_vswitch_alloc( + return (rc); + } 
+ +- __checkReturn efx_rc_t ++static __checkReturn efx_rc_t + efx_mcdi_vswitch_free( + __in efx_nic_t *enp) + { +@@ -129,7 +125,7 @@ efx_mcdi_vswitch_free( + return (rc); + } + +- __checkReturn efx_rc_t ++static __checkReturn efx_rc_t + efx_mcdi_vport_alloc( + __in efx_nic_t *enp, + __in efx_vport_type_t vport_type, +@@ -192,7 +188,7 @@ efx_mcdi_vport_alloc( + return (rc); + } + +- __checkReturn efx_rc_t ++static __checkReturn efx_rc_t + efx_mcdi_vport_free( + __in efx_nic_t *enp, + __in efx_vport_id_t vport_id) +@@ -223,7 +219,7 @@ efx_mcdi_vport_free( + return (rc); + } + +- __checkReturn efx_rc_t ++static __checkReturn efx_rc_t + efx_mcdi_vport_mac_addr_add( + __in efx_nic_t *enp, + __in efx_vport_id_t vport_id, +@@ -258,7 +254,7 @@ efx_mcdi_vport_mac_addr_add( + return (rc); + } + +- __checkReturn efx_rc_t ++static __checkReturn efx_rc_t + efx_mcdi_vport_mac_addr_del( + __in efx_nic_t *enp, + __in efx_vport_id_t vport_id, +@@ -293,7 +289,7 @@ efx_mcdi_vport_mac_addr_del( + return (rc); + } + +- __checkReturn efx_rc_t ++static __checkReturn efx_rc_t + efx_mcdi_port_assign( + __in efx_nic_t *enp, + __in efx_vport_id_t vport_id, +@@ -330,7 +326,7 @@ efx_mcdi_port_assign( + return (rc); + } + +- __checkReturn efx_rc_t ++static __checkReturn efx_rc_t + efx_mcdi_vport_reconfigure( + __in efx_nic_t *enp, + __in efx_vport_id_t vport_id, +@@ -549,5 +545,5 @@ ef10_evb_vport_stats( + EFX_STATS_UPLOAD, 0)); + } + +-#endif /* EFSYS_OPT_HUNTINGTON || EFSYS_OPT_MEDFORD || EFSYS_OPT_MEDFORD2 */ ++#endif /* EFX_OPTS_EF10() */ + #endif /* EFSYS_OPT_EVB */ +diff --git a/dpdk/drivers/net/sfc/base/ef10_filter.c b/dpdk/drivers/net/sfc/base/ef10_filter.c +index e4f8de51c0..158e77e3bb 100644 +--- a/dpdk/drivers/net/sfc/base/ef10_filter.c ++++ b/dpdk/drivers/net/sfc/base/ef10_filter.c +@@ -590,6 +590,231 @@ ef10_filter_restore( + return (rc); + } + ++enum ef10_filter_add_action_e { ++ /* Insert a new filter */ ++ EF10_FILTER_ADD_NEW, ++ /* ++ * Replace old filter with a new, overriding the old one ++ * if it has lower priority. ++ */ ++ EF10_FILTER_ADD_REPLACE, ++ /* Store new, lower priority filter as overridden by old filter */ ++ EF10_FILTER_ADD_STORE, ++ /* Special case for AUTO filters, remove AUTO_OLD flag */ ++ EF10_FILTER_ADD_REFRESH, ++}; ++ ++static __checkReturn efx_rc_t ++ef10_filter_add_lookup_equal_spec( ++ __in efx_filter_spec_t *spec, ++ __in efx_filter_spec_t *probe_spec, ++ __in efx_filter_replacement_policy_t policy, ++ __out boolean_t *found) ++{ ++ efx_rc_t rc; ++ ++ /* Refreshing AUTO filter */ ++ if (spec->efs_priority == EFX_FILTER_PRI_AUTO && ++ probe_spec->efs_priority == EFX_FILTER_PRI_AUTO) { ++ *found = B_TRUE; ++ return (0); ++ } ++ ++ /* ++ * With exclusive filters, higher priority ones ++ * override lower priority ones, and lower priority ++ * ones are stored in case the higher priority one ++ * is removed. ++ */ ++ if (ef10_filter_is_exclusive(spec)) { ++ switch (policy) { ++ case EFX_FILTER_REPLACEMENT_HIGHER_OR_EQUAL_PRIORITY: ++ if (spec->efs_priority == probe_spec->efs_priority) { ++ *found = B_TRUE; ++ break; ++ } ++ /* Fall-through */ ++ case EFX_FILTER_REPLACEMENT_HIGHER_PRIORITY: ++ if (spec->efs_priority > probe_spec->efs_priority) { ++ *found = B_TRUE; ++ break; ++ } ++ /* Fall-through */ ++ case EFX_FILTER_REPLACEMENT_NEVER: ++ /* ++ * Lower priority filter needs to be ++ * stored. It does *not* replace the ++ * old one. That is why EEXIST is not ++ * returned in that case. 
++ */ ++ if (spec->efs_priority < probe_spec->efs_priority) { ++ *found = B_TRUE; ++ break; ++ } else { ++ rc = EEXIST; ++ goto fail1; ++ } ++ default: ++ EFSYS_ASSERT(0); ++ rc = EEXIST; ++ goto fail2; ++ } ++ } else { ++ *found = B_FALSE; ++ } ++ ++ return (0); ++ ++fail2: ++ EFSYS_PROBE(fail2); ++ ++fail1: ++ EFSYS_PROBE1(fail1, efx_rc_t, rc); ++ ++ return (rc); ++} ++ ++ ++static void ++ef10_filter_add_select_action( ++ __in efx_filter_spec_t *saved_spec, ++ __in efx_filter_spec_t *spec, ++ __out enum ef10_filter_add_action_e *action, ++ __out efx_filter_spec_t **overridden_spec) ++{ ++ efx_filter_spec_t *overridden = NULL; ++ ++ if (saved_spec == NULL) { ++ *action = EF10_FILTER_ADD_NEW; ++ } else if (ef10_filter_is_exclusive(spec) == B_FALSE) { ++ /* ++ * Non-exclusive filters are always stored in separate entries ++ * in the table. The only case involving a saved spec is ++ * refreshing an AUTO filter. ++ */ ++ EFSYS_ASSERT(saved_spec->efs_overridden_spec == NULL); ++ EFSYS_ASSERT(spec->efs_priority == EFX_FILTER_PRI_AUTO); ++ EFSYS_ASSERT(saved_spec->efs_priority == EFX_FILTER_PRI_AUTO); ++ *action = EF10_FILTER_ADD_REFRESH; ++ } else { ++ /* Exclusive filters stored in the same entry */ ++ if (spec->efs_priority > saved_spec->efs_priority) { ++ /* ++ * Insert a high priority filter over a lower priority ++ * one. Only two priority levels are implemented, so ++ * there must not already be an overridden filter. ++ */ ++ EFX_STATIC_ASSERT(EFX_FILTER_NPRI == 2); ++ EFSYS_ASSERT(saved_spec->efs_overridden_spec == NULL); ++ overridden = saved_spec; ++ *action = EF10_FILTER_ADD_REPLACE; ++ } else if (spec->efs_priority == saved_spec->efs_priority) { ++ /* Replace in-place or refresh an existing filter */ ++ if (spec->efs_priority == EFX_FILTER_PRI_AUTO) ++ *action = EF10_FILTER_ADD_REFRESH; ++ else ++ *action = EF10_FILTER_ADD_REPLACE; ++ } else { ++ /* ++ * Insert a lower priority filter, storing it in case ++ * the higher priority filter is removed. ++ * ++ * Currently there are only two priority levels, so this ++ * must be an AUTO filter. ++ */ ++ EFX_STATIC_ASSERT(EFX_FILTER_NPRI == 2); ++ EFSYS_ASSERT(spec->efs_priority == EFX_FILTER_PRI_AUTO); ++ if (saved_spec->efs_overridden_spec != NULL) { ++ *action = EF10_FILTER_ADD_REFRESH; ++ } else { ++ overridden = spec; ++ *action = EF10_FILTER_ADD_STORE; ++ } ++ } ++ } ++ ++ *overridden_spec = overridden; ++} ++ ++static __checkReturn efx_rc_t ++ef10_filter_add_execute_action( ++ __in efx_nic_t *enp, ++ __in efx_filter_spec_t *saved_spec, ++ __in efx_filter_spec_t *spec, ++ __in efx_filter_spec_t *overridden_spec, ++ __in enum ef10_filter_add_action_e action, ++ __in int ins_index) ++{ ++ ef10_filter_table_t *eftp = enp->en_filter.ef_ef10_filter_table; ++ efsys_lock_state_t state; ++ efx_rc_t rc; ++ ++ EFSYS_LOCK(enp->en_eslp, state); ++ ++ if (action == EF10_FILTER_ADD_REFRESH) { ++ ef10_filter_set_entry_not_auto_old(eftp, ins_index); ++ goto out_unlock; ++ } else if (action == EF10_FILTER_ADD_STORE) { ++ EFSYS_ASSERT(overridden_spec != NULL); ++ saved_spec->efs_overridden_spec = overridden_spec; ++ goto out_unlock; ++ } ++ ++ EFSYS_UNLOCK(enp->en_eslp, state); ++ ++ switch (action) { ++ case EF10_FILTER_ADD_REPLACE: ++ /* ++ * On replacing the filter handle may change after a ++ * successful replace operation. 
++ */ ++ rc = efx_mcdi_filter_op_add(enp, spec, ++ MC_CMD_FILTER_OP_IN_OP_REPLACE, ++ &eftp->eft_entry[ins_index].efe_handle); ++ break; ++ case EF10_FILTER_ADD_NEW: ++ if (ef10_filter_is_exclusive(spec)) { ++ rc = efx_mcdi_filter_op_add(enp, spec, ++ MC_CMD_FILTER_OP_IN_OP_INSERT, ++ &eftp->eft_entry[ins_index].efe_handle); ++ } else { ++ rc = efx_mcdi_filter_op_add(enp, spec, ++ MC_CMD_FILTER_OP_IN_OP_SUBSCRIBE, ++ &eftp->eft_entry[ins_index].efe_handle); ++ } ++ break; ++ default: ++ rc = EINVAL; ++ EFSYS_ASSERT(0); ++ break; ++ } ++ if (rc != 0) ++ goto fail1; ++ ++ EFSYS_LOCK(enp->en_eslp, state); ++ ++ if (action == EF10_FILTER_ADD_REPLACE) { ++ /* Update the fields that may differ */ ++ saved_spec->efs_priority = spec->efs_priority; ++ saved_spec->efs_flags = spec->efs_flags; ++ saved_spec->efs_rss_context = spec->efs_rss_context; ++ saved_spec->efs_dmaq_id = spec->efs_dmaq_id; ++ ++ if (overridden_spec != NULL) ++ saved_spec->efs_overridden_spec = overridden_spec; ++ } ++ ++out_unlock: ++ EFSYS_UNLOCK(enp->en_eslp, state); ++ ++ return (0); ++ ++fail1: ++ EFSYS_PROBE1(fail1, efx_rc_t, rc); ++ ++ return (rc); ++} ++ + /* + * An arbitrary search limit for the software hash table. As per the linux net + * driver. +@@ -600,22 +825,24 @@ static __checkReturn efx_rc_t + ef10_filter_add_internal( + __in efx_nic_t *enp, + __inout efx_filter_spec_t *spec, +- __in boolean_t may_replace, ++ __in efx_filter_replacement_policy_t policy, + __out_opt uint32_t *filter_id) + { + efx_rc_t rc; + ef10_filter_table_t *eftp = enp->en_filter.ef_ef10_filter_table; ++ enum ef10_filter_add_action_e action; ++ efx_filter_spec_t *overridden_spec = NULL; + efx_filter_spec_t *saved_spec; + uint32_t hash; + unsigned int depth; + int ins_index; +- boolean_t replacing = B_FALSE; +- unsigned int i; + efsys_lock_state_t state; + boolean_t locked = B_FALSE; + + EFSYS_ASSERT(EFX_FAMILY_IS_EF10(enp)); + ++ EFSYS_ASSERT(spec->efs_overridden_spec == NULL); ++ + hash = ef10_filter_hash(spec); + + /* +@@ -628,145 +855,136 @@ ef10_filter_add_internal( + * else a free slot to insert at. If any of them are busy, + * we have to wait and retry. 
+ */ +- for (;;) { +- ins_index = -1; +- depth = 1; +- EFSYS_LOCK(enp->en_eslp, state); +- locked = B_TRUE; ++retry: ++ EFSYS_LOCK(enp->en_eslp, state); ++ locked = B_TRUE; ++ ++ ins_index = -1; ++ ++ for (depth = 1; depth <= EF10_FILTER_SEARCH_LIMIT; depth++) { ++ unsigned int probe_index; ++ efx_filter_spec_t *probe_spec; + +- for (;;) { +- i = (hash + depth) & (EFX_EF10_FILTER_TBL_ROWS - 1); +- saved_spec = ef10_filter_entry_spec(eftp, i); +- +- if (!saved_spec) { +- if (ins_index < 0) { +- ins_index = i; +- } +- } else if (ef10_filter_equal(spec, saved_spec)) { +- if (ef10_filter_entry_is_busy(eftp, i)) +- break; +- if (saved_spec->efs_priority +- == EFX_FILTER_PRI_AUTO) { +- ins_index = i; +- goto found; +- } else if (ef10_filter_is_exclusive(spec)) { +- if (may_replace) { +- ins_index = i; +- goto found; +- } else { +- rc = EEXIST; +- goto fail1; +- } +- } +- +- /* Leave existing */ ++ probe_index = (hash + depth) & (EFX_EF10_FILTER_TBL_ROWS - 1); ++ probe_spec = ef10_filter_entry_spec(eftp, probe_index); ++ ++ if (probe_spec == NULL) { ++ if (ins_index < 0) ++ ins_index = probe_index; ++ } else if (ef10_filter_equal(spec, probe_spec)) { ++ boolean_t found; ++ ++ if (ef10_filter_entry_is_busy(eftp, probe_index)) { ++ EFSYS_UNLOCK(enp->en_eslp, state); ++ locked = B_FALSE; ++ goto retry; + } + +- /* +- * Once we reach the maximum search depth, use +- * the first suitable slot or return EBUSY if +- * there was none. +- */ +- if (depth == EF10_FILTER_SEARCH_LIMIT) { +- if (ins_index < 0) { +- rc = EBUSY; +- goto fail2; +- } +- goto found; ++ rc = ef10_filter_add_lookup_equal_spec(spec, ++ probe_spec, policy, &found); ++ if (rc != 0) ++ goto fail1; ++ ++ if (found != B_FALSE) { ++ ins_index = probe_index; ++ break; + } +- depth++; + } +- EFSYS_UNLOCK(enp->en_eslp, state); +- locked = B_FALSE; + } + +-found: + /* +- * Create a software table entry if necessary, and mark it +- * busy. We might yet fail to insert, but any attempt to +- * insert a conflicting filter while we're waiting for the +- * firmware must find the busy entry. ++ * Once we reach the maximum search depth, use the first suitable slot ++ * or return EBUSY if there was none. + */ +- saved_spec = ef10_filter_entry_spec(eftp, ins_index); +- if (saved_spec) { +- if (saved_spec->efs_priority == EFX_FILTER_PRI_AUTO) { +- /* This is a filter we are refreshing */ +- ef10_filter_set_entry_not_auto_old(eftp, ins_index); +- goto out_unlock; +- +- } +- replacing = B_TRUE; +- } else { +- EFSYS_KMEM_ALLOC(enp->en_esip, sizeof (*spec), saved_spec); +- if (!saved_spec) { +- rc = ENOMEM; +- goto fail3; +- } +- *saved_spec = *spec; +- ef10_filter_set_entry(eftp, ins_index, saved_spec); ++ if (ins_index < 0) { ++ rc = EBUSY; ++ goto fail2; + } ++ ++ /* ++ * Mark software table entry busy. We might yet fail to insert, ++ * but any attempt to insert a conflicting filter while we're ++ * waiting for the firmware must find the busy entry. ++ */ + ef10_filter_set_entry_busy(eftp, ins_index); + +- EFSYS_UNLOCK(enp->en_eslp, state); +- locked = B_FALSE; ++ saved_spec = ef10_filter_entry_spec(eftp, ins_index); ++ ef10_filter_add_select_action(saved_spec, spec, &action, ++ &overridden_spec); + + /* +- * On replacing the filter handle may change after after a successful +- * replace operation. ++ * Allocate a new filter if found entry is empty or ++ * a filter should be overridden. 
+ */ +- if (replacing) { +- rc = efx_mcdi_filter_op_add(enp, spec, +- MC_CMD_FILTER_OP_IN_OP_REPLACE, +- &eftp->eft_entry[ins_index].efe_handle); +- } else if (ef10_filter_is_exclusive(spec)) { +- rc = efx_mcdi_filter_op_add(enp, spec, +- MC_CMD_FILTER_OP_IN_OP_INSERT, +- &eftp->eft_entry[ins_index].efe_handle); +- } else { +- rc = efx_mcdi_filter_op_add(enp, spec, +- MC_CMD_FILTER_OP_IN_OP_SUBSCRIBE, +- &eftp->eft_entry[ins_index].efe_handle); +- } +- +- if (rc != 0) +- goto fail4; ++ if (overridden_spec != NULL || saved_spec == NULL) { ++ efx_filter_spec_t *new_spec; + +- EFSYS_LOCK(enp->en_eslp, state); +- locked = B_TRUE; ++ EFSYS_UNLOCK(enp->en_eslp, state); ++ locked = B_FALSE; + +- if (replacing) { +- /* Update the fields that may differ */ +- saved_spec->efs_priority = spec->efs_priority; +- saved_spec->efs_flags = spec->efs_flags; +- saved_spec->efs_rss_context = spec->efs_rss_context; +- saved_spec->efs_dmaq_id = spec->efs_dmaq_id; +- } ++ EFSYS_KMEM_ALLOC(enp->en_esip, sizeof (*new_spec), new_spec); ++ if (new_spec == NULL) { ++ rc = ENOMEM; ++ overridden_spec = NULL; ++ goto fail3; ++ } + +- ef10_filter_set_entry_not_busy(eftp, ins_index); ++ EFSYS_LOCK(enp->en_eslp, state); ++ locked = B_TRUE; + +-out_unlock: ++ if (saved_spec == NULL) { ++ *new_spec = *spec; ++ ef10_filter_set_entry(eftp, ins_index, new_spec); ++ } else { ++ *new_spec = *overridden_spec; ++ overridden_spec = new_spec; ++ } ++ } + + EFSYS_UNLOCK(enp->en_eslp, state); + locked = B_FALSE; + ++ rc = ef10_filter_add_execute_action(enp, saved_spec, spec, ++ overridden_spec, action, ins_index); ++ if (rc != 0) ++ goto fail4; ++ + if (filter_id) + *filter_id = ins_index; + ++ EFSYS_LOCK(enp->en_eslp, state); ++ ef10_filter_set_entry_not_busy(eftp, ins_index); ++ EFSYS_UNLOCK(enp->en_eslp, state); ++ + return (0); + + fail4: + EFSYS_PROBE(fail4); + +- if (!replacing) { +- EFSYS_KMEM_FREE(enp->en_esip, sizeof (*spec), saved_spec); +- saved_spec = NULL; ++ EFSYS_ASSERT(locked == B_FALSE); ++ EFSYS_LOCK(enp->en_eslp, state); ++ ++ if (action == EF10_FILTER_ADD_NEW) { ++ EFSYS_KMEM_FREE(enp->en_esip, sizeof (*spec), ++ ef10_filter_entry_spec(eftp, ins_index)); ++ ef10_filter_set_entry(eftp, ins_index, NULL); + } +- ef10_filter_set_entry_not_busy(eftp, ins_index); +- ef10_filter_set_entry(eftp, ins_index, NULL); ++ ++ EFSYS_UNLOCK(enp->en_eslp, state); ++ ++ if (overridden_spec != NULL) ++ EFSYS_KMEM_FREE(enp->en_esip, sizeof (*spec), overridden_spec); + + fail3: + EFSYS_PROBE(fail3); + ++ EFSYS_ASSERT(locked == B_FALSE); ++ EFSYS_LOCK(enp->en_eslp, state); ++ ++ ef10_filter_set_entry_not_busy(eftp, ins_index); ++ ++ EFSYS_UNLOCK(enp->en_eslp, state); ++ + fail2: + EFSYS_PROBE(fail2); + +@@ -783,11 +1001,11 @@ ef10_filter_add_internal( + ef10_filter_add( + __in efx_nic_t *enp, + __inout efx_filter_spec_t *spec, +- __in boolean_t may_replace) ++ __in enum efx_filter_replacement_policy_e policy) + { + efx_rc_t rc; + +- rc = ef10_filter_add_internal(enp, spec, may_replace, NULL); ++ rc = ef10_filter_add_internal(enp, spec, policy, NULL); + if (rc != 0) + goto fail1; + +@@ -799,11 +1017,15 @@ ef10_filter_add( + return (rc); + } + +- ++/* ++ * Delete a filter by index from the filter table with priority ++ * that is not higher than specified. 
++ */ + static __checkReturn efx_rc_t + ef10_filter_delete_internal( + __in efx_nic_t *enp, +- __in uint32_t filter_id) ++ __in uint32_t filter_id, ++ __in efx_filter_priority_t priority) + { + efx_rc_t rc; + ef10_filter_table_t *table = enp->en_filter.ef_ef10_filter_table; +@@ -825,7 +1047,8 @@ ef10_filter_delete_internal( + EFSYS_LOCK(enp->en_eslp, state); + } + if ((spec = ef10_filter_entry_spec(table, filter_idx)) != NULL) { +- ef10_filter_set_entry_busy(table, filter_idx); ++ if (spec->efs_priority <= priority) ++ ef10_filter_set_entry_busy(table, filter_idx); + } + EFSYS_UNLOCK(enp->en_eslp, state); + +@@ -834,31 +1057,53 @@ ef10_filter_delete_internal( + goto fail1; + } + +- /* +- * Try to remove the hardware filter. This may fail if the MC has +- * rebooted (which frees all hardware filter resources). +- */ +- if (ef10_filter_is_exclusive(spec)) { +- rc = efx_mcdi_filter_op_delete(enp, +- MC_CMD_FILTER_OP_IN_OP_REMOVE, +- &table->eft_entry[filter_idx].efe_handle); ++ if (spec->efs_priority > priority) { ++ /* ++ * Applied filter stays, but overridden filter is removed since ++ * next user request to delete the applied filter should not ++ * restore outdated filter. ++ */ ++ if (spec->efs_overridden_spec != NULL) { ++ EFSYS_ASSERT(spec->efs_overridden_spec->efs_overridden_spec == ++ NULL); ++ EFSYS_KMEM_FREE(enp->en_esip, sizeof (*spec), ++ spec->efs_overridden_spec); ++ spec->efs_overridden_spec = NULL; ++ } + } else { +- rc = efx_mcdi_filter_op_delete(enp, +- MC_CMD_FILTER_OP_IN_OP_UNSUBSCRIBE, +- &table->eft_entry[filter_idx].efe_handle); +- } ++ /* ++ * Try to remove the hardware filter or replace it with the ++ * saved automatic filter. This may fail if the MC has ++ * rebooted (which frees all hardware filter resources). ++ */ ++ if (spec->efs_overridden_spec != NULL) { ++ rc = efx_mcdi_filter_op_add(enp, ++ spec->efs_overridden_spec, ++ MC_CMD_FILTER_OP_IN_OP_REPLACE, ++ &table->eft_entry[filter_idx].efe_handle); ++ } else if (ef10_filter_is_exclusive(spec)) { ++ rc = efx_mcdi_filter_op_delete(enp, ++ MC_CMD_FILTER_OP_IN_OP_REMOVE, ++ &table->eft_entry[filter_idx].efe_handle); ++ } else { ++ rc = efx_mcdi_filter_op_delete(enp, ++ MC_CMD_FILTER_OP_IN_OP_UNSUBSCRIBE, ++ &table->eft_entry[filter_idx].efe_handle); ++ } + +- /* Free the software table entry */ +- EFSYS_LOCK(enp->en_eslp, state); +- ef10_filter_set_entry_not_busy(table, filter_idx); +- ef10_filter_set_entry(table, filter_idx, NULL); +- EFSYS_UNLOCK(enp->en_eslp, state); ++ /* Free the software table entry */ ++ EFSYS_LOCK(enp->en_eslp, state); ++ ef10_filter_set_entry_not_busy(table, filter_idx); ++ ef10_filter_set_entry(table, filter_idx, ++ spec->efs_overridden_spec); ++ EFSYS_UNLOCK(enp->en_eslp, state); + +- EFSYS_KMEM_FREE(enp->en_esip, sizeof (*spec), spec); ++ EFSYS_KMEM_FREE(enp->en_esip, sizeof (*spec), spec); + +- /* Check result of hardware filter removal */ +- if (rc != 0) +- goto fail2; ++ /* Check result of hardware filter removal */ ++ if (rc != 0) ++ goto fail2; ++ } + + return (0); + +@@ -871,6 +1116,25 @@ ef10_filter_delete_internal( + return (rc); + } + ++static void ++ef10_filter_delete_auto( ++ __in efx_nic_t *enp, ++ __in uint32_t filter_id) ++{ ++ ef10_filter_table_t *table = enp->en_filter.ef_ef10_filter_table; ++ uint32_t filter_idx = filter_id % EFX_EF10_FILTER_TBL_ROWS; ++ ++ /* ++ * AUTO_OLD flag is cleared since the auto filter that is to be removed ++ * may not be the filter at the specified index itself, but the filter ++ * that is overridden by it. 
++ */ ++ ef10_filter_set_entry_not_auto_old(table, filter_idx); ++ ++ (void) ef10_filter_delete_internal(enp, filter_idx, ++ EFX_FILTER_PRI_AUTO); ++} ++ + __checkReturn efx_rc_t + ef10_filter_delete( + __in efx_nic_t *enp, +@@ -897,7 +1161,8 @@ ef10_filter_delete( + i = (hash + depth) & (EFX_EF10_FILTER_TBL_ROWS - 1); + saved_spec = ef10_filter_entry_spec(table, i); + if (saved_spec && ef10_filter_equal(spec, saved_spec) && +- ef10_filter_same_dest(spec, saved_spec)) { ++ ef10_filter_same_dest(spec, saved_spec) && ++ saved_spec->efs_priority == EFX_FILTER_PRI_MANUAL) { + break; + } + if (depth == EF10_FILTER_SEARCH_LIMIT) { +@@ -910,7 +1175,7 @@ ef10_filter_delete( + EFSYS_UNLOCK(enp->en_eslp, state); + locked = B_FALSE; + +- rc = ef10_filter_delete_internal(enp, i); ++ rc = ef10_filter_delete_internal(enp, i, EFX_FILTER_PRI_MANUAL); + if (rc != 0) + goto fail2; + +@@ -1135,7 +1400,7 @@ ef10_filter_insert_unicast( + if (rc != 0) + goto fail1; + +- rc = ef10_filter_add_internal(enp, &spec, B_TRUE, ++ rc = ef10_filter_add_internal(enp, &spec, EFX_FILTER_REPLACEMENT_NEVER, + &eftp->eft_unicst_filter_indexes[eftp->eft_unicst_filter_count]); + if (rc != 0) + goto fail2; +@@ -1169,7 +1434,7 @@ ef10_filter_insert_all_unicast( + rc = efx_filter_spec_set_uc_def(&spec); + if (rc != 0) + goto fail1; +- rc = ef10_filter_add_internal(enp, &spec, B_TRUE, ++ rc = ef10_filter_add_internal(enp, &spec, EFX_FILTER_REPLACEMENT_NEVER, + &eftp->eft_unicst_filter_indexes[eftp->eft_unicst_filter_count]); + if (rc != 0) + goto fail2; +@@ -1239,8 +1504,8 @@ ef10_filter_insert_multicast_list( + } + } + +- rc = ef10_filter_add_internal(enp, &spec, B_TRUE, +- &filter_index); ++ rc = ef10_filter_add_internal(enp, &spec, ++ EFX_FILTER_REPLACEMENT_NEVER, &filter_index); + + if (rc == 0) { + eftp->eft_mulcst_filter_indexes[filter_count] = +@@ -1267,8 +1532,8 @@ ef10_filter_insert_multicast_list( + goto rollback; + } + +- rc = ef10_filter_add_internal(enp, &spec, B_TRUE, +- &filter_index); ++ rc = ef10_filter_add_internal(enp, &spec, ++ EFX_FILTER_REPLACEMENT_NEVER, &filter_index); + + if (rc == 0) { + eftp->eft_mulcst_filter_indexes[filter_count] = +@@ -1289,7 +1554,7 @@ ef10_filter_insert_multicast_list( + /* Remove any filters we have inserted */ + i = filter_count; + while (i--) { +- (void) ef10_filter_delete_internal(enp, ++ ef10_filter_delete_auto(enp, + eftp->eft_mulcst_filter_indexes[i]); + } + eftp->eft_mulcst_filter_count = 0; +@@ -1317,7 +1582,7 @@ ef10_filter_insert_all_multicast( + if (rc != 0) + goto fail1; + +- rc = ef10_filter_add_internal(enp, &spec, B_TRUE, ++ rc = ef10_filter_add_internal(enp, &spec, EFX_FILTER_REPLACEMENT_NEVER, + &eftp->eft_mulcst_filter_indexes[0]); + if (rc != 0) + goto fail2; +@@ -1420,8 +1685,9 @@ ef10_filter_insert_encap_filters( + if (rc != 0) + goto fail1; + +- rc = ef10_filter_add_internal(enp, &spec, B_TRUE, +- &table->eft_encap_filter_indexes[ ++ rc = ef10_filter_add_internal(enp, &spec, ++ EFX_FILTER_REPLACEMENT_NEVER, ++ &table->eft_encap_filter_indexes[ + table->eft_encap_filter_count]); + if (rc != 0) { + if (rc != EACCES) +@@ -1450,7 +1716,7 @@ ef10_filter_remove_old( + + for (i = 0; i < EFX_ARRAY_SIZE(table->eft_entry); i++) { + if (ef10_filter_entry_is_auto_old(table, i)) { +- (void) ef10_filter_delete_internal(enp, i); ++ ef10_filter_delete_auto(enp, i); + } + } + } +@@ -1525,19 +1791,19 @@ ef10_filter_reconfigure( + * has rebooted, which removes hardware filters). 
+ */ + for (i = 0; i < table->eft_unicst_filter_count; i++) { +- (void) ef10_filter_delete_internal(enp, ++ ef10_filter_delete_auto(enp, + table->eft_unicst_filter_indexes[i]); + } + table->eft_unicst_filter_count = 0; + + for (i = 0; i < table->eft_mulcst_filter_count; i++) { +- (void) ef10_filter_delete_internal(enp, ++ ef10_filter_delete_auto(enp, + table->eft_mulcst_filter_indexes[i]); + } + table->eft_mulcst_filter_count = 0; + + for (i = 0; i < table->eft_encap_filter_count; i++) { +- (void) ef10_filter_delete_internal(enp, ++ ef10_filter_delete_auto(enp, + table->eft_encap_filter_indexes[i]); + } + table->eft_encap_filter_count = 0; +diff --git a/dpdk/drivers/net/sfc/base/ef10_impl.h b/dpdk/drivers/net/sfc/base/ef10_impl.h +index 7a00047829..67abf3b853 100644 +--- a/dpdk/drivers/net/sfc/base/ef10_impl.h ++++ b/dpdk/drivers/net/sfc/base/ef10_impl.h +@@ -1079,6 +1079,8 @@ ef10_rx_fini( + + #if EFSYS_OPT_FILTER + ++enum efx_filter_replacement_policy_e; ++ + typedef struct ef10_filter_handle_s { + uint32_t efh_lo; + uint32_t efh_hi; +@@ -1148,7 +1150,7 @@ ef10_filter_restore( + ef10_filter_add( + __in efx_nic_t *enp, + __inout efx_filter_spec_t *spec, +- __in boolean_t may_replace); ++ __in enum efx_filter_replacement_policy_e policy); + + __checkReturn efx_rc_t + ef10_filter_delete( +diff --git a/dpdk/drivers/net/sfc/base/ef10_nic.c b/dpdk/drivers/net/sfc/base/ef10_nic.c +index b25ce1908e..3eb4674c5e 100644 +--- a/dpdk/drivers/net/sfc/base/ef10_nic.c ++++ b/dpdk/drivers/net/sfc/base/ef10_nic.c +@@ -2288,9 +2288,7 @@ ef10_nic_init( + efx_rc_t rc; + boolean_t alloc_vadaptor = B_TRUE; + +- EFSYS_ASSERT(enp->en_family == EFX_FAMILY_HUNTINGTON || +- enp->en_family == EFX_FAMILY_MEDFORD || +- enp->en_family == EFX_FAMILY_MEDFORD2); ++ EFSYS_ASSERT(EFX_FAMILY_IS_EF10(enp)); + + /* Enable reporting of some events (e.g. link change) */ + if ((rc = efx_mcdi_log_ctrl(enp)) != 0) +diff --git a/dpdk/drivers/net/sfc/base/ef10_proxy.c b/dpdk/drivers/net/sfc/base/ef10_proxy.c +index 059b2f5f4d..619d98e472 100644 +--- a/dpdk/drivers/net/sfc/base/ef10_proxy.c ++++ b/dpdk/drivers/net/sfc/base/ef10_proxy.c +@@ -13,9 +13,7 @@ + ef10_proxy_auth_init( + __in efx_nic_t *enp) + { +- EFSYS_ASSERT(enp->en_family == EFX_FAMILY_HUNTINGTON || +- enp->en_family == EFX_FAMILY_MEDFORD || +- enp->en_family == EFX_FAMILY_MEDFORD2); ++ EFSYS_ASSERT(EFX_FAMILY_IS_EF10(enp)); + + return (0); + } +@@ -24,9 +22,7 @@ ef10_proxy_auth_init( + ef10_proxy_auth_fini( + __in efx_nic_t *enp) + { +- EFSYS_ASSERT(enp->en_family == EFX_FAMILY_HUNTINGTON || +- enp->en_family == EFX_FAMILY_MEDFORD || +- enp->en_family == EFX_FAMILY_MEDFORD2); ++ EFSYS_ASSERT(EFX_FAMILY_IS_EF10(enp)); + } + + static __checkReturn efx_rc_t +diff --git a/dpdk/drivers/net/sfc/base/efx.h b/dpdk/drivers/net/sfc/base/efx.h +index 53ddaa987c..d94d3c02f7 100644 +--- a/dpdk/drivers/net/sfc/base/efx.h ++++ b/dpdk/drivers/net/sfc/base/efx.h +@@ -2949,17 +2949,15 @@ typedef uint8_t efx_filter_flags_t; + + typedef uint32_t efx_filter_match_flags_t; + ++/* Filter priority from lowest to highest */ + typedef enum efx_filter_priority_s { +- EFX_FILTER_PRI_HINT = 0, /* Performance hint */ +- EFX_FILTER_PRI_AUTO, /* Automatic filter based on device ++ EFX_FILTER_PRI_AUTO = 0, /* Automatic filter based on device + * address list or hardware + * requirements. This may only be used + * by the filter implementation for + * each NIC type. 
*/
+ EFX_FILTER_PRI_MANUAL, /* Manually configured filter */
+- EFX_FILTER_PRI_REQUIRED, /* Required for correct behaviour of the
+- * client (e.g. SR-IOV, HyperV VMQ etc.)
+- */
++ EFX_FILTER_NPRI,
+ } efx_filter_priority_t;
+
+ /*
+@@ -2974,6 +2972,11 @@ typedef struct efx_filter_spec_s {
+ uint16_t efs_dmaq_id;
+ uint32_t efs_rss_context;
+ uint32_t efs_mark;
++ /*
++ * Saved lower-priority filter. If it is set, it is restored on
++ * a filter delete operation.
++ */
++ struct efx_filter_spec_s *efs_overridden_spec;
+ /* Fields below here are hashed for software filter lookup */
+ uint16_t efs_outer_vid;
+ uint16_t efs_inner_vid;
+diff --git a/dpdk/drivers/net/sfc/base/efx_evb.c b/dpdk/drivers/net/sfc/base/efx_evb.c
+index dd64bc7854..5fa0d99809 100644
+--- a/dpdk/drivers/net/sfc/base/efx_evb.c
++++ b/dpdk/drivers/net/sfc/base/efx_evb.c
+@@ -28,7 +28,7 @@ static const efx_evb_ops_t __efx_evb_dummy_ops = {
+ };
+ #endif /* EFSYS_OPT_SIENA */
+
+-#if EFSYS_OPT_HUNTINGTON || EFSYS_OPT_MEDFORD || EFSYS_OPT_MEDFORD2
++#if EFX_OPTS_EF10()
+ static const efx_evb_ops_t __efx_evb_ef10_ops = {
+ ef10_evb_init, /* eeo_init */
+ ef10_evb_fini, /* eeo_fini */
+@@ -44,7 +44,7 @@ static const efx_evb_ops_t __efx_evb_ef10_ops = {
+ ef10_evb_vport_reconfigure, /* eeo_vport_reconfigure */
+ ef10_evb_vport_stats, /* eeo_vport_stats */
+ };
+-#endif /* EFSYS_OPT_HUNTINGTON || EFSYS_OPT_MEDFORD || EFSYS_OPT_MEDFORD2 */
++#endif /* EFX_OPTS_EF10() */
+
+ __checkReturn efx_rc_t
+ efx_evb_init(
+diff --git a/dpdk/drivers/net/sfc/base/efx_filter.c b/dpdk/drivers/net/sfc/base/efx_filter.c
+index 7efb380641..9949d05bb3 100644
+--- a/dpdk/drivers/net/sfc/base/efx_filter.c
++++ b/dpdk/drivers/net/sfc/base/efx_filter.c
+@@ -28,7 +28,7 @@ static __checkReturn efx_rc_t
+ siena_filter_add(
+ __in efx_nic_t *enp,
+ __inout efx_filter_spec_t *spec,
+- __in boolean_t may_replace);
++ __in efx_filter_replacement_policy_t policy);
+
+ static __checkReturn efx_rc_t
+ siena_filter_delete(
+@@ -93,8 +93,16 @@ efx_filter_insert(
+ goto fail2;
+ }
+
+- return (efop->efo_add(enp, spec, B_FALSE));
++ if (spec->efs_priority == EFX_FILTER_PRI_AUTO) {
++ rc = EINVAL;
++ goto fail3;
++ }
+
++ return (efop->efo_add(enp, spec,
++ EFX_FILTER_REPLACEMENT_HIGHER_PRIORITY));
++
++fail3:
++ EFSYS_PROBE(fail3);
+ fail2:
+ EFSYS_PROBE(fail2);
+ fail1:
+@@ -314,7 +322,7 @@ efx_filter_spec_init_tx(
+ EFSYS_ASSERT3P(etp, !=, NULL);
+
+ memset(spec, 0, sizeof (*spec));
+- spec->efs_priority = EFX_FILTER_PRI_REQUIRED;
++ spec->efs_priority = EFX_FILTER_PRI_MANUAL;
+ spec->efs_flags = EFX_FILTER_FLAG_TX;
+ spec->efs_dmaq_id = (uint16_t)etp->et_index;
+ }
+@@ -1437,7 +1445,7 @@ static __checkReturn efx_rc_t
+ siena_filter_add(
+ __in efx_nic_t *enp,
+ __inout efx_filter_spec_t *spec,
+- __in boolean_t may_replace)
++ __in efx_filter_replacement_policy_t policy)
+ {
+ efx_rc_t rc;
+ siena_filter_spec_t sf_spec;
+@@ -1478,9 +1486,17 @@ siena_filter_add(
+ saved_sf_spec = &sftp->sft_spec[filter_idx];
+
+ if (siena_filter_test_used(sftp, filter_idx)) {
+- if (may_replace == B_FALSE) {
++ /* All Siena filters are considered the same priority */
++ switch (policy) {
++ case EFX_FILTER_REPLACEMENT_NEVER:
++ case EFX_FILTER_REPLACEMENT_HIGHER_PRIORITY:
+ rc = EEXIST;
+ goto fail4;
++ case EFX_FILTER_REPLACEMENT_HIGHER_OR_EQUAL_PRIORITY:
++ break;
++ default:
++ EFSYS_ASSERT(0);
++ break;
+ }
+ }
+ siena_filter_set_used(sftp, filter_idx);
+diff --git a/dpdk/drivers/net/sfc/base/efx_impl.h b/dpdk/drivers/net/sfc/base/efx_impl.h
+index 85d984f651..9755f4dfd2 100644
+--- a/dpdk/drivers/net/sfc/base/efx_impl.h
++++ b/dpdk/drivers/net/sfc/base/efx_impl.h
+@@ -246,12 +246,31 @@ typedef struct efx_phy_ops_s {
+ } efx_phy_ops_t;
+
+ #if EFSYS_OPT_FILTER
++
++/*
++ * Policy for replacing existing filter when inserting a new one.
++ * Note that all policies allow for storing the new lower priority
++ * filters as overridden by existing higher priority ones. It is needed
++ * to restore the lower priority filters when higher priority ones are removed.
++ */
++typedef enum efx_filter_replacement_policy_e {
++ /* Cannot replace existing filter */
++ EFX_FILTER_REPLACEMENT_NEVER,
++ /* Higher priority filters can replace lower priority ones */
++ EFX_FILTER_REPLACEMENT_HIGHER_PRIORITY,
++ /*
++ * Higher priority filters can replace lower priority ones and
++ * equal priority filters can replace each other.
++ */
++ EFX_FILTER_REPLACEMENT_HIGHER_OR_EQUAL_PRIORITY,
++} efx_filter_replacement_policy_t;
++
+ typedef struct efx_filter_ops_s {
+ efx_rc_t (*efo_init)(efx_nic_t *);
+ void (*efo_fini)(efx_nic_t *);
+ efx_rc_t (*efo_restore)(efx_nic_t *);
+ efx_rc_t (*efo_add)(efx_nic_t *, efx_filter_spec_t *,
+- boolean_t may_replace);
++ efx_filter_replacement_policy_t policy);
+ efx_rc_t (*efo_delete)(efx_nic_t *, efx_filter_spec_t *);
+ efx_rc_t (*efo_supported_filters)(efx_nic_t *, uint32_t *,
+ size_t, size_t *);
+diff --git a/dpdk/drivers/net/sfc/base/efx_proxy.c b/dpdk/drivers/net/sfc/base/efx_proxy.c
+index 791105a5a0..ecf703b03d 100644
+--- a/dpdk/drivers/net/sfc/base/efx_proxy.c
++++ b/dpdk/drivers/net/sfc/base/efx_proxy.c
+@@ -23,7 +23,7 @@ static const efx_proxy_ops_t __efx_proxy_dummy_ops = {
+ };
+ #endif /* EFSYS_OPT_SIENA */
+
+-#if EFSYS_OPT_HUNTINGTON || EFSYS_OPT_MEDFORD || EFSYS_OPT_MEDFORD2
++#if EFX_OPTS_EF10()
+ static const efx_proxy_ops_t __efx_proxy_ef10_ops = {
+ ef10_proxy_auth_init, /* epo_init */
+ ef10_proxy_auth_fini, /* epo_fini */
+@@ -35,7 +35,7 @@ static const efx_proxy_ops_t __efx_proxy_ef10_ops = {
+ ef10_proxy_auth_exec_cmd, /* epo_exec_cmd */
+ ef10_proxy_auth_get_privilege_mask, /* epo_get_privilege_mask */
+ };
+-#endif /* EFSYS_OPT_HUNTINGTON || EFSYS_OPT_MEDFORD || EFSYS_OPT_MEDFORD2 */
++#endif /* EFX_OPTS_EF10() */
+
+ __checkReturn efx_rc_t
+ efx_proxy_auth_init(
+diff --git a/dpdk/drivers/net/sfc/sfc.c b/dpdk/drivers/net/sfc/sfc.c
+index 141c767f09..3f5cd7758b 100644
+--- a/dpdk/drivers/net/sfc/sfc.c
++++ b/dpdk/drivers/net/sfc/sfc.c
+@@ -30,7 +30,7 @@ sfc_dma_alloc(const struct sfc_adapter *sa, const char *name, uint16_t id,
+ {
+ const struct rte_memzone *mz;
+
+- sfc_log_init(sa, "name=%s id=%u len=%lu socket_id=%d",
++ sfc_log_init(sa, "name=%s id=%u len=%zu socket_id=%d",
+ name, id, len, socket_id);
+
+ mz = rte_eth_dma_zone_reserve(sa->eth_dev, name, id, len,
+@@ -241,8 +241,8 @@ sfc_estimate_resource_limits(struct sfc_adapter *sa)
+ return 0;
+
+ fail_get_vi_pool:
+-fail_nic_init:
+ efx_nic_fini(sa->nic);
++fail_nic_init:
+ return rc;
+ }
+
+diff --git a/dpdk/drivers/net/sfc/sfc_ethdev.c b/dpdk/drivers/net/sfc/sfc_ethdev.c
+index 454b8956a2..f8867b0ec0 100644
+--- a/dpdk/drivers/net/sfc/sfc_ethdev.c
++++ b/dpdk/drivers/net/sfc/sfc_ethdev.c
+@@ -405,25 +405,37 @@ sfc_dev_filter_set(struct rte_eth_dev *dev, enum sfc_dev_filter_mode mode,
+ static int
+ sfc_dev_promisc_enable(struct rte_eth_dev *dev)
+ {
+- return sfc_dev_filter_set(dev, SFC_DEV_FILTER_MODE_PROMISC, B_TRUE);
++ int rc = sfc_dev_filter_set(dev, SFC_DEV_FILTER_MODE_PROMISC, B_TRUE);
++
++ SFC_ASSERT(rc >= 0);
++ return -rc;
+ }
+
+ static int
+
sfc_dev_promisc_disable(struct rte_eth_dev *dev) + { +- return sfc_dev_filter_set(dev, SFC_DEV_FILTER_MODE_PROMISC, B_FALSE); ++ int rc = sfc_dev_filter_set(dev, SFC_DEV_FILTER_MODE_PROMISC, B_FALSE); ++ ++ SFC_ASSERT(rc >= 0); ++ return -rc; + } + + static int + sfc_dev_allmulti_enable(struct rte_eth_dev *dev) + { +- return sfc_dev_filter_set(dev, SFC_DEV_FILTER_MODE_ALLMULTI, B_TRUE); ++ int rc = sfc_dev_filter_set(dev, SFC_DEV_FILTER_MODE_ALLMULTI, B_TRUE); ++ ++ SFC_ASSERT(rc >= 0); ++ return -rc; + } + + static int + sfc_dev_allmulti_disable(struct rte_eth_dev *dev) + { +- return sfc_dev_filter_set(dev, SFC_DEV_FILTER_MODE_ALLMULTI, B_FALSE); ++ int rc = sfc_dev_filter_set(dev, SFC_DEV_FILTER_MODE_ALLMULTI, B_FALSE); ++ ++ SFC_ASSERT(rc >= 0); ++ return -rc; + } + + static int +@@ -1520,7 +1532,7 @@ sfc_dev_rss_hash_update(struct rte_eth_dev *dev, + + if ((rss_conf->rss_key != NULL) && + (rss_conf->rss_key_len != sizeof(rss->key))) { +- sfc_err(sa, "RSS key size is wrong (should be %lu)", ++ sfc_err(sa, "RSS key size is wrong (should be %zu)", + sizeof(rss->key)); + return -EINVAL; + } +diff --git a/dpdk/drivers/net/sfc/sfc_flow.c b/dpdk/drivers/net/sfc/sfc_flow.c +index 8d636f6923..023e55d951 100644 +--- a/dpdk/drivers/net/sfc/sfc_flow.c ++++ b/dpdk/drivers/net/sfc/sfc_flow.c +@@ -1132,6 +1132,7 @@ sfc_flow_parse_attr(const struct rte_flow_attr *attr, + + flow->spec.template.efs_flags |= EFX_FILTER_FLAG_RX; + flow->spec.template.efs_rss_context = EFX_RSS_CONTEXT_DEFAULT; ++ flow->spec.template.efs_priority = EFX_FILTER_PRI_MANUAL; + + return 0; + } +diff --git a/dpdk/drivers/net/sfc/sfc_rx.c b/dpdk/drivers/net/sfc/sfc_rx.c +index 74218296cd..891709fd04 100644 +--- a/dpdk/drivers/net/sfc/sfc_rx.c ++++ b/dpdk/drivers/net/sfc/sfc_rx.c +@@ -719,6 +719,7 @@ sfc_rx_default_rxq_set_filter(struct sfc_adapter *sa, struct sfc_rxq *rxq) + sfc_warn(sa, "promiscuous mode will be disabled"); + + port->promisc = B_FALSE; ++ sa->eth_dev->data->promiscuous = 0; + rc = sfc_set_rx_mode(sa); + if (rc != 0) + return rc; +@@ -732,6 +733,7 @@ sfc_rx_default_rxq_set_filter(struct sfc_adapter *sa, struct sfc_rxq *rxq) + sfc_warn(sa, "all-multicast mode will be disabled"); + + port->allmulti = B_FALSE; ++ sa->eth_dev->data->all_multicast = 0; + rc = sfc_set_rx_mode(sa); + if (rc != 0) + return rc; +@@ -820,10 +822,12 @@ sfc_rx_qstart(struct sfc_adapter *sa, unsigned int sw_index) + return 0; + + fail_mac_filter_default_rxq_set: ++ sfc_rx_qflush(sa, sw_index); + sa->priv.dp_rx->qstop(rxq_info->dp, &rxq->evq->read_ptr); ++ rxq_info->state = SFC_RXQ_INITIALIZED; + + fail_dp_qstart: +- sfc_rx_qflush(sa, sw_index); ++ efx_rx_qdestroy(rxq->common); + + fail_rx_qcreate: + fail_bad_contig_block_size: +@@ -1403,7 +1407,7 @@ sfc_rx_process_adv_conf_rss(struct sfc_adapter *sa, + + if (conf->rss_key != NULL) { + if (conf->rss_key_len != sizeof(rss->key)) { +- sfc_err(sa, "RSS key size is wrong (should be %lu)", ++ sfc_err(sa, "RSS key size is wrong (should be %zu)", + sizeof(rss->key)); + return EINVAL; + } +diff --git a/dpdk/drivers/net/softnic/rte_eth_softnic_thread.c b/dpdk/drivers/net/softnic/rte_eth_softnic_thread.c +index d610b1617e..dcfb5eb82c 100644 +--- a/dpdk/drivers/net/softnic/rte_eth_softnic_thread.c ++++ b/dpdk/drivers/net/softnic/rte_eth_softnic_thread.c +@@ -359,8 +359,6 @@ softnic_thread_pipeline_enable(struct pmd_internals *softnic, + + /* Send request and wait for response */ + rsp = thread_msg_send_recv(softnic, thread_id, req); +- if (rsp == NULL) +- return -1; + + /* Read response */ + status = 
rsp->status; +@@ -444,8 +442,6 @@ softnic_thread_pipeline_disable(struct pmd_internals *softnic, + + /* Send request and wait for response */ + rsp = thread_msg_send_recv(softnic, thread_id, req); +- if (rsp == NULL) +- return -1; + + /* Read response */ + status = rsp->status; +@@ -839,8 +835,6 @@ softnic_pipeline_port_in_stats_read(struct pmd_internals *softnic, + + /* Send request and wait for response */ + rsp = pipeline_msg_send_recv(p, req); +- if (rsp == NULL) +- return -1; + + /* Read response */ + status = rsp->status; +@@ -888,8 +882,6 @@ softnic_pipeline_port_in_enable(struct pmd_internals *softnic, + + /* Send request and wait for response */ + rsp = pipeline_msg_send_recv(p, req); +- if (rsp == NULL) +- return -1; + + /* Read response */ + status = rsp->status; +@@ -935,8 +927,6 @@ softnic_pipeline_port_in_disable(struct pmd_internals *softnic, + + /* Send request and wait for response */ + rsp = pipeline_msg_send_recv(p, req); +- if (rsp == NULL) +- return -1; + + /* Read response */ + status = rsp->status; +@@ -990,8 +980,6 @@ softnic_pipeline_port_out_stats_read(struct pmd_internals *softnic, + + /* Send request and wait for response */ + rsp = pipeline_msg_send_recv(p, req); +- if (rsp == NULL) +- return -1; + + /* Read response */ + status = rsp->status; +@@ -1047,8 +1035,6 @@ softnic_pipeline_table_stats_read(struct pmd_internals *softnic, + + /* Send request and wait for response */ + rsp = pipeline_msg_send_recv(p, req); +- if (rsp == NULL) +- return -1; + + /* Read response */ + status = rsp->status; +@@ -1327,8 +1313,6 @@ softnic_pipeline_table_rule_add(struct pmd_internals *softnic, + + /* Send request and wait for response */ + rsp = pipeline_msg_send_recv(p, req); +- if (rsp == NULL) +- return -1; + + /* Read response */ + status = rsp->status; +@@ -1411,8 +1395,6 @@ softnic_pipeline_table_rule_add_default(struct pmd_internals *softnic, + + /* Send request and wait for response */ + rsp = pipeline_msg_send_recv(p, req); +- if (rsp == NULL) +- return -1; + + /* Read response */ + status = rsp->status; +@@ -1569,8 +1551,6 @@ softnic_pipeline_table_rule_add_bulk(struct pmd_internals *softnic, + + /* Send request and wait for response */ + rsp = pipeline_msg_send_recv(p, req); +- if (rsp == NULL) +- return -1; + + /* Read response */ + status = rsp->status; +@@ -1634,8 +1614,6 @@ softnic_pipeline_table_rule_delete(struct pmd_internals *softnic, + + /* Send request and wait for response */ + rsp = pipeline_msg_send_recv(p, req); +- if (rsp == NULL) +- return -1; + + /* Read response */ + status = rsp->status; +@@ -1684,8 +1662,6 @@ softnic_pipeline_table_rule_delete_default(struct pmd_internals *softnic, + + /* Send request and wait for response */ + rsp = pipeline_msg_send_recv(p, req); +- if (rsp == NULL) +- return -1; + + /* Read response */ + status = rsp->status; +@@ -1744,8 +1720,6 @@ softnic_pipeline_table_rule_stats_read(struct pmd_internals *softnic, + + /* Send request and wait for response */ + rsp = pipeline_msg_send_recv(p, req); +- if (rsp == NULL) +- return -1; + + /* Read response */ + status = rsp->status; +@@ -1825,10 +1799,6 @@ softnic_pipeline_table_mtr_profile_add(struct pmd_internals *softnic, + + /* Send request and wait for response */ + rsp = pipeline_msg_send_recv(p, req); +- if (rsp == NULL) { +- free(mp); +- return -1; +- } + + /* Read response */ + status = rsp->status; +@@ -1884,8 +1854,6 @@ softnic_pipeline_table_mtr_profile_delete(struct pmd_internals *softnic, + + /* Send request and wait for response */ + rsp = 
pipeline_msg_send_recv(p, req); +- if (rsp == NULL) +- return -1; + + /* Read response */ + status = rsp->status; +@@ -1947,8 +1915,6 @@ softnic_pipeline_table_rule_mtr_read(struct pmd_internals *softnic, + + /* Send request and wait for response */ + rsp = pipeline_msg_send_recv(p, req); +- if (rsp == NULL) +- return -1; + + /* Read response */ + status = rsp->status; +@@ -2012,8 +1978,6 @@ softnic_pipeline_table_dscp_table_update(struct pmd_internals *softnic, + + /* Send request and wait for response */ + rsp = pipeline_msg_send_recv(p, req); +- if (rsp == NULL) +- return -1; + + /* Read response */ + status = rsp->status; +@@ -2077,8 +2041,6 @@ softnic_pipeline_table_rule_ttl_read(struct pmd_internals *softnic, + + /* Send request and wait for response */ + rsp = pipeline_msg_send_recv(p, req); +- if (rsp == NULL) +- return -1; + + /* Read response */ + status = rsp->status; +diff --git a/dpdk/drivers/net/tap/rte_eth_tap.c b/dpdk/drivers/net/tap/rte_eth_tap.c +index a13d8d50d7..7081ae23e9 100644 +--- a/dpdk/drivers/net/tap/rte_eth_tap.c ++++ b/dpdk/drivers/net/tap/rte_eth_tap.c +@@ -18,8 +18,8 @@ + #include + #include + #include ++#include + +-#include + #include + #include + #include +@@ -339,6 +339,23 @@ tap_rx_offload_get_queue_capa(void) + DEV_RX_OFFLOAD_TCP_CKSUM; + } + ++static void ++tap_rxq_pool_free(struct rte_mbuf *pool) ++{ ++ struct rte_mbuf *mbuf = pool; ++ uint16_t nb_segs = 1; ++ ++ if (mbuf == NULL) ++ return; ++ ++ while (mbuf->next) { ++ mbuf = mbuf->next; ++ nb_segs++; ++ } ++ pool->nb_segs = nb_segs; ++ rte_pktmbuf_free(pool); ++} ++ + /* Callback to handle the rx burst of packets to the correct interface and + * file descriptor(s) in a multi-queue setup. + */ +@@ -389,7 +406,7 @@ pmd_rx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts) + goto end; + + seg->next = NULL; +- rte_pktmbuf_free(mbuf); ++ tap_rxq_pool_free(mbuf); + + goto end; + } +@@ -521,7 +538,7 @@ tap_tx_l3_cksum(char *packet, uint64_t ol_flags, unsigned int l2_len, + } + } + +-static inline void ++static inline int + tap_write_mbufs(struct tx_queue *txq, uint16_t num_mbufs, + struct rte_mbuf **pmbufs, + uint16_t *num_packets, unsigned long *num_tx_bytes) +@@ -588,7 +605,7 @@ tap_write_mbufs(struct tx_queue *txq, uint16_t num_mbufs, + seg_len = rte_pktmbuf_data_len(mbuf); + l234_hlen = mbuf->l2_len + mbuf->l3_len + mbuf->l4_len; + if (seg_len < l234_hlen) +- break; ++ return -1; + + /* To change checksums, work on a * copy of l2, l3 + * headers + l4 pseudo header +@@ -634,10 +651,12 @@ tap_write_mbufs(struct tx_queue *txq, uint16_t num_mbufs, + /* copy the tx frame data */ + n = writev(process_private->txq_fds[txq->queue_id], iovecs, j); + if (n <= 0) +- break; ++ return -1; ++ + (*num_packets)++; + (*num_tx_bytes) += rte_pktmbuf_pkt_len(mbuf); + } ++ return 0; + } + + /* Callback to handle sending packets from the tap interface +@@ -663,16 +682,14 @@ pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts) + uint16_t num_mbufs = 0; + uint16_t tso_segsz = 0; + int ret; ++ int num_tso_mbufs; + uint16_t hdrs_len; +- int j; + uint64_t tso; + + tso = mbuf_in->ol_flags & PKT_TX_TCP_SEG; + if (tso) { + struct rte_gso_ctx *gso_ctx = &txq->gso_ctx; + +- assert(gso_ctx != NULL); +- + /* TCP segmentation implies TCP checksum offload */ + mbuf_in->ol_flags |= PKT_TX_TCP_CKSUM; + +@@ -686,43 +703,51 @@ pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts) + break; + } + gso_ctx->gso_size = tso_segsz; +- ret = rte_gso_segment(mbuf_in, /* packet to segment */ ++ /* 'mbuf_in' 
packet to segment */ ++ num_tso_mbufs = rte_gso_segment(mbuf_in, + gso_ctx, /* gso control block */ + (struct rte_mbuf **)&gso_mbufs, /* out mbufs */ + RTE_DIM(gso_mbufs)); /* max tso mbufs */ + + /* ret contains the number of new created mbufs */ +- if (ret < 0) ++ if (num_tso_mbufs < 0) + break; + + mbuf = gso_mbufs; +- num_mbufs = ret; ++ num_mbufs = num_tso_mbufs; + } else { + /* stats.errs will be incremented */ + if (rte_pktmbuf_pkt_len(mbuf_in) > max_size) + break; + + /* ret 0 indicates no new mbufs were created */ +- ret = 0; ++ num_tso_mbufs = 0; + mbuf = &mbuf_in; + num_mbufs = 1; + } + +- tap_write_mbufs(txq, num_mbufs, mbuf, ++ ret = tap_write_mbufs(txq, num_mbufs, mbuf, + &num_packets, &num_tx_bytes); ++ if (ret == -1) { ++ txq->stats.errs++; ++ /* free tso mbufs */ ++ if (num_tso_mbufs > 0) ++ rte_pktmbuf_free_bulk(mbuf, num_tso_mbufs); ++ break; ++ } + num_tx++; + /* free original mbuf */ + rte_pktmbuf_free(mbuf_in); + /* free tso mbufs */ +- for (j = 0; j < ret; j++) +- rte_pktmbuf_free(mbuf[j]); ++ if (num_tso_mbufs > 0) ++ rte_pktmbuf_free_bulk(mbuf, num_tso_mbufs); + } + + txq->stats.opackets += num_packets; + txq->stats.errs += nb_pkts - num_tx; + txq->stats.obytes += num_tx_bytes; + +- return num_packets; ++ return num_tx; + } + + static const char * +@@ -778,7 +803,7 @@ tap_ioctl(struct pmd_internals *pmd, unsigned long request, + case SIOCSIFMTU: + break; + default: +- RTE_LOG(WARNING, PMD, "%s: ioctl() called with wrong arg\n", ++ TAP_LOG(WARNING, "%s: ioctl() called with wrong arg", + pmd->name); + return -EINVAL; + } +@@ -1013,15 +1038,25 @@ tap_dev_close(struct rte_eth_dev *dev) + int i; + struct pmd_internals *internals = dev->data->dev_private; + struct pmd_process_private *process_private = dev->process_private; ++ struct rx_queue *rxq; + + tap_link_set_down(dev); +- tap_flow_flush(dev, NULL); +- tap_flow_implicit_flush(internals, NULL); ++ if (internals->nlsk_fd != -1) { ++ tap_flow_flush(dev, NULL); ++ tap_flow_implicit_flush(internals, NULL); ++ tap_nl_final(internals->nlsk_fd); ++ internals->nlsk_fd = -1; ++ } + + for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) { + if (process_private->rxq_fds[i] != -1) { ++ rxq = &internals->rxq[i]; + close(process_private->rxq_fds[i]); + process_private->rxq_fds[i] = -1; ++ tap_rxq_pool_free(rxq->pool); ++ rte_free(rxq->iovecs); ++ rxq->pool = NULL; ++ rxq->iovecs = NULL; + } + if (process_private->txq_fds[i] != -1) { + close(process_private->txq_fds[i]); +@@ -1054,10 +1089,10 @@ tap_rx_queue_release(void *queue) + if (!rxq) + return; + process_private = rte_eth_devices[rxq->in_port].process_private; +- if (process_private->rxq_fds[rxq->queue_id] > 0) { ++ if (process_private->rxq_fds[rxq->queue_id] != -1) { + close(process_private->rxq_fds[rxq->queue_id]); + process_private->rxq_fds[rxq->queue_id] = -1; +- rte_pktmbuf_free(rxq->pool); ++ tap_rxq_pool_free(rxq->pool); + rte_free(rxq->iovecs); + rxq->pool = NULL; + rxq->iovecs = NULL; +@@ -1074,7 +1109,7 @@ tap_tx_queue_release(void *queue) + return; + process_private = rte_eth_devices[txq->out_port].process_private; + +- if (process_private->txq_fds[txq->queue_id] > 0) { ++ if (process_private->txq_fds[txq->queue_id] != -1) { + close(process_private->txq_fds[txq->queue_id]); + process_private->txq_fds[txq->queue_id] = -1; + } +@@ -1301,7 +1336,9 @@ tap_gso_ctx_setup(struct rte_gso_ctx *gso_ctx, struct rte_eth_dev *dev) + SOCKET_ID_ANY); + if (!mp) { + struct pmd_internals *pmd = dev->data->dev_private; +- RTE_LOG(DEBUG, PMD, "%s: failed to create mbuf pool for device 
%s\n", ++ ++ TAP_LOG(ERR, ++ "%s: failed to create mbuf pool for device %s\n", + pmd->name, dev->device->name); + return -1; + } +@@ -1465,7 +1502,7 @@ tap_rx_queue_setup(struct rte_eth_dev *dev, + return 0; + + error: +- rte_pktmbuf_free(rxq->pool); ++ tap_rxq_pool_free(rxq->pool); + rxq->pool = NULL; + rte_free(rxq->iovecs); + rxq->iovecs = NULL; +@@ -1563,13 +1600,12 @@ static int + tap_lsc_intr_handle_set(struct rte_eth_dev *dev, int set) + { + struct pmd_internals *pmd = dev->data->dev_private; ++ int ret; + + /* In any case, disable interrupt if the conf is no longer there. */ + if (!dev->data->dev_conf.intr_conf.lsc) { + if (pmd->intr_handle.fd != -1) { +- tap_nl_final(pmd->intr_handle.fd); +- rte_intr_callback_unregister(&pmd->intr_handle, +- tap_dev_intr_handler, dev); ++ goto clean; + } + return 0; + } +@@ -1580,9 +1616,26 @@ tap_lsc_intr_handle_set(struct rte_eth_dev *dev, int set) + return rte_intr_callback_register( + &pmd->intr_handle, tap_dev_intr_handler, dev); + } ++ ++clean: ++ do { ++ ret = rte_intr_callback_unregister(&pmd->intr_handle, ++ tap_dev_intr_handler, dev); ++ if (ret >= 0) { ++ break; ++ } else if (ret == -EAGAIN) { ++ rte_delay_ms(100); ++ } else { ++ TAP_LOG(ERR, "intr callback unregister failed: %d", ++ ret); ++ break; ++ } ++ } while (true); ++ + tap_nl_final(pmd->intr_handle.fd); +- return rte_intr_callback_unregister(&pmd->intr_handle, +- tap_dev_intr_handler, dev); ++ pmd->intr_handle.fd = -1; ++ ++ return 0; + } + + static int +@@ -1591,8 +1644,11 @@ tap_intr_handle_set(struct rte_eth_dev *dev, int set) + int err; + + err = tap_lsc_intr_handle_set(dev, set); +- if (err) ++ if (err < 0) { ++ if (!set) ++ tap_rx_intr_vec_set(dev, 0); + return err; ++ } + err = tap_rx_intr_vec_set(dev, set); + if (err && set) + tap_lsc_intr_handle_set(dev, 0); +@@ -1784,6 +1840,8 @@ eth_dev_tap_create(struct rte_vdev_device *vdev, const char *tap_name, + pmd->dev = dev; + strlcpy(pmd->name, tap_name, sizeof(pmd->name)); + pmd->type = type; ++ pmd->ka_fd = -1; ++ pmd->nlsk_fd = -1; + + pmd->ioctl_sock = socket(AF_INET, SOCK_DGRAM, 0); + if (pmd->ioctl_sock == -1) { +@@ -1814,7 +1872,6 @@ eth_dev_tap_create(struct rte_vdev_device *vdev, const char *tap_name, + dev->intr_handle = &pmd->intr_handle; + + /* Presetup the fds to -1 as being not valid */ +- pmd->ka_fd = -1; + for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) { + process_private->rxq_fds[i] = -1; + process_private->txq_fds[i] = -1; +@@ -1954,7 +2011,11 @@ eth_dev_tap_create(struct rte_vdev_device *vdev, const char *tap_name, + tap_flow_implicit_flush(pmd, NULL); + + error_exit: +- if (pmd->ioctl_sock > 0) ++ if (pmd->nlsk_fd != -1) ++ close(pmd->nlsk_fd); ++ if (pmd->ka_fd != -1) ++ close(pmd->ka_fd); ++ if (pmd->ioctl_sock != -1) + close(pmd->ioctl_sock); + /* mac_addrs must not be freed alone because part of dev_private */ + dev->data->mac_addrs = NULL; +@@ -2386,8 +2447,6 @@ rte_pmd_tap_remove(struct rte_vdev_device *dev) + { + struct rte_eth_dev *eth_dev = NULL; + struct pmd_internals *internals; +- struct pmd_process_private *process_private; +- int i; + + /* find the ethdev entry */ + eth_dev = rte_eth_dev_allocated(rte_vdev_device_name(dev)); +@@ -2400,28 +2459,12 @@ rte_pmd_tap_remove(struct rte_vdev_device *dev) + if (rte_eal_process_type() != RTE_PROC_PRIMARY) + return rte_eth_dev_release_port(eth_dev); + +- internals = eth_dev->data->dev_private; +- process_private = eth_dev->process_private; ++ tap_dev_close(eth_dev); + ++ internals = eth_dev->data->dev_private; + TAP_LOG(DEBUG, "Closing %s Ethernet device 
on numa %u", + tuntap_types[internals->type], rte_socket_id()); + +- if (internals->nlsk_fd) { +- tap_flow_flush(eth_dev, NULL); +- tap_flow_implicit_flush(internals, NULL); +- tap_nl_final(internals->nlsk_fd); +- } +- for (i = 0; i < RTE_PMD_TAP_MAX_QUEUES; i++) { +- if (process_private->rxq_fds[i] != -1) { +- close(process_private->rxq_fds[i]); +- process_private->rxq_fds[i] = -1; +- } +- if (process_private->txq_fds[i] != -1) { +- close(process_private->txq_fds[i]); +- process_private->txq_fds[i] = -1; +- } +- } +- + close(internals->ioctl_sock); + rte_free(eth_dev->process_private); + if (tap_devices_count == 1) +@@ -2429,10 +2472,6 @@ rte_pmd_tap_remove(struct rte_vdev_device *dev) + tap_devices_count--; + rte_eth_dev_release_port(eth_dev); + +- if (internals->ka_fd != -1) { +- close(internals->ka_fd); +- internals->ka_fd = -1; +- } + return 0; + } + +diff --git a/dpdk/drivers/net/tap/tap_flow.c b/dpdk/drivers/net/tap/tap_flow.c +index 9d90361d99..1538349e9c 100644 +--- a/dpdk/drivers/net/tap/tap_flow.c ++++ b/dpdk/drivers/net/tap/tap_flow.c +@@ -1380,7 +1380,7 @@ tap_flow_create(struct rte_eth_dev *dev, + NULL, "priority value too big"); + goto fail; + } +- flow = rte_malloc(__func__, sizeof(struct rte_flow), 0); ++ flow = rte_zmalloc(__func__, sizeof(struct rte_flow), 0); + if (!flow) { + rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE, + NULL, "cannot allocate memory for rte_flow"); +@@ -1416,7 +1416,7 @@ tap_flow_create(struct rte_eth_dev *dev, + * to the local pmd->if_index. + */ + if (pmd->remote_if_index) { +- remote_flow = rte_malloc(__func__, sizeof(struct rte_flow), 0); ++ remote_flow = rte_zmalloc(__func__, sizeof(struct rte_flow), 0); + if (!remote_flow) { + rte_flow_error_set( + error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE, NULL, +@@ -1693,7 +1693,7 @@ int tap_flow_implicit_create(struct pmd_internals *pmd, + } + }; + +- remote_flow = rte_malloc(__func__, sizeof(struct rte_flow), 0); ++ remote_flow = rte_zmalloc(__func__, sizeof(struct rte_flow), 0); + if (!remote_flow) { + TAP_LOG(ERR, "Cannot allocate memory for rte_flow"); + goto fail; +@@ -1896,7 +1896,7 @@ static int rss_enable(struct pmd_internals *pmd, + return -ENOTSUP; + } + +- rss_flow = rte_malloc(__func__, sizeof(struct rte_flow), 0); ++ rss_flow = rte_zmalloc(__func__, sizeof(struct rte_flow), 0); + if (!rss_flow) { + TAP_LOG(ERR, + "Cannot allocate memory for rte_flow"); +diff --git a/dpdk/drivers/net/tap/tap_intr.c b/dpdk/drivers/net/tap/tap_intr.c +index 7af0010e37..5cf4f173a0 100644 +--- a/dpdk/drivers/net/tap/tap_intr.c ++++ b/dpdk/drivers/net/tap/tap_intr.c +@@ -7,7 +7,6 @@ + * Interrupts handling for tap driver. + */ + +-#include + #include + #include + #include +@@ -72,7 +71,7 @@ tap_rx_intr_vec_install(struct rte_eth_dev *dev) + struct rx_queue *rxq = pmd->dev->data->rx_queues[i]; + + /* Skip queues that cannot request interrupts. */ +- if (!rxq || process_private->rxq_fds[i] <= 0) { ++ if (!rxq || process_private->rxq_fds[i] == -1) { + /* Use invalid intr_vec[] index to disable entry. 
*/
+ intr_handle->intr_vec[i] =
+ RTE_INTR_VEC_RXTX_OFFSET +
+diff --git a/dpdk/drivers/net/thunderx/nicvf_ethdev.c b/dpdk/drivers/net/thunderx/nicvf_ethdev.c
+index 2cf0ffe13b..26191586f7 100644
+--- a/dpdk/drivers/net/thunderx/nicvf_ethdev.c
++++ b/dpdk/drivers/net/thunderx/nicvf_ethdev.c
+@@ -496,9 +496,10 @@ nicvf_dev_reta_query(struct rte_eth_dev *dev,
+ int ret, i, j;
+
+ if (reta_size != NIC_MAX_RSS_IDR_TBL_SIZE) {
+- RTE_LOG(ERR, PMD, "The size of hash lookup table configured "
+- "(%d) doesn't match the number hardware can supported "
+- "(%d)", reta_size, NIC_MAX_RSS_IDR_TBL_SIZE);
++ PMD_DRV_LOG(ERR,
++ "The size of hash lookup table configured "
++ "(%u) doesn't match the number hardware can support "
++ "(%u)", reta_size, NIC_MAX_RSS_IDR_TBL_SIZE);
+ return -EINVAL;
+ }
+
+@@ -526,9 +527,9 @@ nicvf_dev_reta_update(struct rte_eth_dev *dev,
+ int ret, i, j;
+
+ if (reta_size != NIC_MAX_RSS_IDR_TBL_SIZE) {
+- RTE_LOG(ERR, PMD, "The size of hash lookup table configured "
+- "(%d) doesn't match the number hardware can supported "
+- "(%d)", reta_size, NIC_MAX_RSS_IDR_TBL_SIZE);
++ PMD_DRV_LOG(ERR, "The size of hash lookup table configured "
++ "(%u) doesn't match the number hardware can support "
++ "(%u)", reta_size, NIC_MAX_RSS_IDR_TBL_SIZE);
+ return -EINVAL;
+ }
+
+@@ -569,8 +570,8 @@ nicvf_dev_rss_hash_update(struct rte_eth_dev *dev,
+
+ if (rss_conf->rss_key &&
+ rss_conf->rss_key_len != RSS_HASH_KEY_BYTE_SIZE) {
+- RTE_LOG(ERR, PMD, "Hash key size mismatch %d",
+- rss_conf->rss_key_len);
++ PMD_DRV_LOG(ERR, "Hash key size mismatch %u",
++ rss_conf->rss_key_len);
+ return -EINVAL;
+ }
+
+diff --git a/dpdk/drivers/net/vhost/rte_eth_vhost.c b/dpdk/drivers/net/vhost/rte_eth_vhost.c
+index 46f01a7f46..85f91f0b9d 100644
+--- a/dpdk/drivers/net/vhost/rte_eth_vhost.c
++++ b/dpdk/drivers/net/vhost/rte_eth_vhost.c
+@@ -97,6 +97,8 @@ struct pmd_internal {
+ rte_atomic32_t dev_attached;
+ char *dev_name;
+ char *iface_name;
++ uint64_t flags;
++ uint64_t disable_flags;
+ uint16_t max_queues;
+ int vid;
+ rte_atomic32_t started;
+@@ -491,17 +493,6 @@ eth_vhost_tx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
+ return nb_tx;
+ }
+
+-static int
+-eth_dev_configure(struct rte_eth_dev *dev __rte_unused)
+-{
+- struct pmd_internal *internal = dev->data->dev_private;
+- const struct rte_eth_rxmode *rxmode = &dev->data->dev_conf.rxmode;
+-
+- internal->vlan_strip = !!(rxmode->offloads & DEV_RX_OFFLOAD_VLAN_STRIP);
+-
+- return 0;
+-}
+-
+ static inline struct internal_list *
+ find_internal_resource(char *ifname)
+ {
+@@ -877,6 +868,74 @@ static struct vhost_device_ops vhost_ops = {
+ .vring_state_changed = vring_state_changed,
+ };
+
++static int
++vhost_driver_setup(struct rte_eth_dev *eth_dev)
++{
++ struct pmd_internal *internal = eth_dev->data->dev_private;
++ struct internal_list *list = NULL;
++ struct rte_vhost_vring_state *vring_state = NULL;
++ unsigned int numa_node = eth_dev->device->numa_node;
++ const char *name = eth_dev->device->name;
++
++ /* Don't try to set up again if it has already been done. */
++ list = find_internal_resource(internal->iface_name);
++ if (list)
++ return 0;
++
++ list = rte_zmalloc_socket(name, sizeof(*list), 0, numa_node);
++ if (list == NULL)
++ return -1;
++
++ vring_state = rte_zmalloc_socket(name, sizeof(*vring_state),
++ 0, numa_node);
++ if (vring_state == NULL)
++ goto free_list;
++
++ list->eth_dev = eth_dev;
++ pthread_mutex_lock(&internal_list_lock);
++ TAILQ_INSERT_TAIL(&internal_list, list, next);
++ pthread_mutex_unlock(&internal_list_lock);
++
++ rte_spinlock_init(&vring_state->lock);
++ vring_states[eth_dev->data->port_id] = vring_state;
++
++ if (rte_vhost_driver_register(internal->iface_name, internal->flags))
++ goto list_remove;
++
++ if (internal->disable_flags) {
++ if (rte_vhost_driver_disable_features(internal->iface_name,
++ internal->disable_flags))
++ goto drv_unreg;
++ }
++
++ if (rte_vhost_driver_callback_register(internal->iface_name,
++ &vhost_ops) < 0) {
++ VHOST_LOG(ERR, "Can't register callbacks\n");
++ goto drv_unreg;
++ }
++
++ if (rte_vhost_driver_start(internal->iface_name) < 0) {
++ VHOST_LOG(ERR, "Failed to start driver for %s\n",
++ internal->iface_name);
++ goto drv_unreg;
++ }
++
++ return 0;
++
++drv_unreg:
++ rte_vhost_driver_unregister(internal->iface_name);
++list_remove:
++ vring_states[eth_dev->data->port_id] = NULL;
++ pthread_mutex_lock(&internal_list_lock);
++ TAILQ_REMOVE(&internal_list, list, next);
++ pthread_mutex_unlock(&internal_list_lock);
++ rte_free(vring_state);
++free_list:
++ rte_free(list);
++
++ return -1;
++}
++
+ int
+ rte_eth_vhost_get_queue_event(uint16_t port_id,
+ struct rte_eth_vhost_queue_event *event)
+@@ -943,6 +1002,24 @@ rte_eth_vhost_get_vid_from_port_id(uint16_t port_id)
+ return vid;
+ }
+
++static int
++eth_dev_configure(struct rte_eth_dev *dev)
++{
++ struct pmd_internal *internal = dev->data->dev_private;
++ const struct rte_eth_rxmode *rxmode = &dev->data->dev_conf.rxmode;
++
++ /* NOTE: the same process has to operate a vhost interface
++ * from beginning to end (from eth_dev configure to eth_dev close).
++ * It is the user's responsibility at the moment.
++ */ ++ if (vhost_driver_setup(dev) < 0) ++ return -1; ++ ++ internal->vlan_strip = !!(rxmode->offloads & DEV_RX_OFFLOAD_VLAN_STRIP); ++ ++ return 0; ++} ++ + static int + eth_dev_start(struct rte_eth_dev *eth_dev) + { +@@ -989,16 +1066,14 @@ eth_dev_close(struct rte_eth_dev *dev) + + eth_dev_stop(dev); + +- rte_vhost_driver_unregister(internal->iface_name); +- + list = find_internal_resource(internal->iface_name); +- if (!list) +- return; +- +- pthread_mutex_lock(&internal_list_lock); +- TAILQ_REMOVE(&internal_list, list, next); +- pthread_mutex_unlock(&internal_list_lock); +- rte_free(list); ++ if (list) { ++ rte_vhost_driver_unregister(internal->iface_name); ++ pthread_mutex_lock(&internal_list_lock); ++ TAILQ_REMOVE(&internal_list, list, next); ++ pthread_mutex_unlock(&internal_list_lock); ++ rte_free(list); ++ } + + if (dev->data->rx_queues) + for (i = 0; i < dev->data->nb_rx_queues; i++) +@@ -1009,7 +1084,7 @@ eth_dev_close(struct rte_eth_dev *dev) + rte_free(dev->data->tx_queues[i]); + + free(internal->dev_name); +- free(internal->iface_name); ++ rte_free(internal->iface_name); + rte_free(internal); + + dev->data->dev_private = NULL; +@@ -1219,16 +1294,10 @@ eth_dev_vhost_create(struct rte_vdev_device *dev, char *iface_name, + struct pmd_internal *internal = NULL; + struct rte_eth_dev *eth_dev = NULL; + struct rte_ether_addr *eth_addr = NULL; +- struct rte_vhost_vring_state *vring_state = NULL; +- struct internal_list *list = NULL; + + VHOST_LOG(INFO, "Creating VHOST-USER backend on numa socket %u\n", + numa_node); + +- list = rte_zmalloc_socket(name, sizeof(*list), 0, numa_node); +- if (list == NULL) +- goto error; +- + /* reserve an ethdev entry */ + eth_dev = rte_eth_vdev_allocate(dev, sizeof(*internal)); + if (eth_dev == NULL) +@@ -1242,11 +1311,6 @@ eth_dev_vhost_create(struct rte_vdev_device *dev, char *iface_name, + *eth_addr = base_eth_addr; + eth_addr->addr_bytes[5] = eth_dev->data->port_id; + +- vring_state = rte_zmalloc_socket(name, +- sizeof(*vring_state), 0, numa_node); +- if (vring_state == NULL) +- goto error; +- + /* now put it all together + * - store queue data in internal, + * - point eth_dev_data to internals +@@ -1256,22 +1320,18 @@ eth_dev_vhost_create(struct rte_vdev_device *dev, char *iface_name, + internal->dev_name = strdup(name); + if (internal->dev_name == NULL) + goto error; +- internal->iface_name = strdup(iface_name); ++ internal->iface_name = rte_malloc_socket(name, strlen(iface_name) + 1, ++ 0, numa_node); + if (internal->iface_name == NULL) + goto error; +- +- list->eth_dev = eth_dev; +- pthread_mutex_lock(&internal_list_lock); +- TAILQ_INSERT_TAIL(&internal_list, list, next); +- pthread_mutex_unlock(&internal_list_lock); +- +- rte_spinlock_init(&vring_state->lock); +- vring_states[eth_dev->data->port_id] = vring_state; ++ strcpy(internal->iface_name, iface_name); + + data->nb_rx_queues = queues; + data->nb_tx_queues = queues; + internal->max_queues = queues; + internal->vid = -1; ++ internal->flags = flags; ++ internal->disable_flags = disable_flags; + data->dev_link = pmd_link; + data->dev_flags = RTE_ETH_DEV_INTR_LSC | RTE_ETH_DEV_CLOSE_REMOVE; + +@@ -1281,37 +1341,15 @@ eth_dev_vhost_create(struct rte_vdev_device *dev, char *iface_name, + eth_dev->rx_pkt_burst = eth_vhost_rx; + eth_dev->tx_pkt_burst = eth_vhost_tx; + +- if (rte_vhost_driver_register(iface_name, flags)) +- goto error; +- +- if (disable_flags) { +- if (rte_vhost_driver_disable_features(iface_name, +- disable_flags)) +- goto error; +- } +- +- if 
(rte_vhost_driver_callback_register(iface_name, &vhost_ops) < 0) {
+- VHOST_LOG(ERR, "Can't register callbacks\n");
+- goto error;
+- }
+-
+- if (rte_vhost_driver_start(iface_name) < 0) {
+- VHOST_LOG(ERR, "Failed to start driver for %s\n",
+- iface_name);
+- goto error;
+- }
+-
+ rte_eth_dev_probing_finish(eth_dev);
+- return data->port_id;
++ return 0;
+
+ error:
+ if (internal) {
+- free(internal->iface_name);
++ rte_free(internal->iface_name);
+ free(internal->dev_name);
+ }
+- rte_free(vring_state);
+ rte_eth_dev_release_port(eth_dev);
+- rte_free(list);
+
+ return -1;
+ }
+@@ -1369,8 +1407,11 @@ rte_pmd_vhost_probe(struct rte_vdev_device *dev)
+ VHOST_LOG(ERR, "Failed to probe %s\n", name);
+ return -1;
+ }
+- /* TODO: request info from primary to set up Rx and Tx */
++ eth_dev->rx_pkt_burst = eth_vhost_rx;
++ eth_dev->tx_pkt_burst = eth_vhost_tx;
+ eth_dev->dev_ops = &ops;
++ if (dev->device.numa_node == SOCKET_ID_ANY)
++ dev->device.numa_node = rte_socket_id();
+ eth_dev->device = &dev->device;
+ rte_eth_dev_probing_finish(eth_dev);
+ return 0;
+@@ -1455,8 +1496,10 @@ rte_pmd_vhost_probe(struct rte_vdev_device *dev)
+ if (dev->device.numa_node == SOCKET_ID_ANY)
+ dev->device.numa_node = rte_socket_id();
+
+- eth_dev_vhost_create(dev, iface_name, queues, dev->device.numa_node,
+- flags, disable_flags);
++ ret = eth_dev_vhost_create(dev, iface_name, queues,
++ dev->device.numa_node, flags, disable_flags);
++ if (ret == -1)
++ VHOST_LOG(ERR, "Failed to create %s\n", name);
+
+ out_free:
+ rte_kvargs_free(kvlist);
+diff --git a/dpdk/drivers/net/virtio/virtio_ethdev.c b/dpdk/drivers/net/virtio/virtio_ethdev.c
+index 044eb10a70..35203940a7 100644
+--- a/dpdk/drivers/net/virtio/virtio_ethdev.c
++++ b/dpdk/drivers/net/virtio/virtio_ethdev.c
+@@ -466,7 +466,7 @@ virtio_init_queue(struct rte_eth_dev *dev, uint16_t vtpci_queue_idx)
+ }
+
+ if (!vtpci_packed_queue(hw) && !rte_is_power_of_2(vq_size)) {
+- PMD_INIT_LOG(ERR, "split virtqueue size is not powerof 2");
++ PMD_INIT_LOG(ERR, "split virtqueue size is not power of 2");
+ return -EINVAL;
+ }
+
+@@ -588,8 +588,8 @@ virtio_init_queue(struct rte_eth_dev *dev, uint16_t vtpci_queue_idx)
+ hw->cvq = cvq;
+ }
+
+- /* For virtio_user case (that is when hw->dev is NULL), we use
+- * virtual address. And we need properly set _offset_, please see
++ /* For virtio_user case (that is when hw->virtio_user_dev is not NULL),
++ * we use virtual address. And we need to properly set _offset_, please see
+ * VIRTIO_MBUF_DATA_DMA_ADDR in virtqueue.h for more information.
+ */ + if (!hw->virtio_user_dev) +@@ -1913,6 +1913,8 @@ eth_virtio_dev_init(struct rte_eth_dev *eth_dev) + goto err_vtpci_init; + } + ++ rte_spinlock_init(&hw->state_lock); ++ + /* reset device and negotiate default features */ + ret = virtio_init_device(eth_dev, VIRTIO_PMD_DEFAULT_GUEST_FEATURES); + if (ret < 0) +@@ -2155,8 +2157,6 @@ virtio_dev_configure(struct rte_eth_dev *dev) + return -EBUSY; + } + +- rte_spinlock_init(&hw->state_lock); +- + hw->use_simple_rx = 1; + + if (vtpci_with_feature(hw, VIRTIO_F_IN_ORDER)) { +diff --git a/dpdk/drivers/net/virtio/virtio_rxtx.c b/dpdk/drivers/net/virtio/virtio_rxtx.c +index 752faa0f6e..060410577a 100644 +--- a/dpdk/drivers/net/virtio/virtio_rxtx.c ++++ b/dpdk/drivers/net/virtio/virtio_rxtx.c +@@ -1085,7 +1085,7 @@ virtio_dev_tx_queue_setup(struct rte_eth_dev *dev, + RTE_MIN(vq->vq_nentries / 4, DEFAULT_TX_FREE_THRESH); + + if (tx_free_thresh >= (vq->vq_nentries - 3)) { +- RTE_LOG(ERR, PMD, "tx_free_thresh must be less than the " ++ PMD_DRV_LOG(ERR, "tx_free_thresh must be less than the " + "number of TX entries minus 3 (%u)." + " (tx_free_thresh=%u port=%u queue=%u)\n", + vq->vq_nentries - 3, +@@ -1133,7 +1133,7 @@ virtio_discard_rxbuf(struct virtqueue *vq, struct rte_mbuf *m) + error = virtqueue_enqueue_recv_refill(vq, &m, 1); + + if (unlikely(error)) { +- RTE_LOG(ERR, PMD, "cannot requeue discarded mbuf"); ++ PMD_DRV_LOG(ERR, "cannot requeue discarded mbuf"); + rte_pktmbuf_free(m); + } + } +@@ -1145,7 +1145,7 @@ virtio_discard_rxbuf_inorder(struct virtqueue *vq, struct rte_mbuf *m) + + error = virtqueue_enqueue_refill_inorder(vq, &m, 1); + if (unlikely(error)) { +- RTE_LOG(ERR, PMD, "cannot requeue discarded mbuf"); ++ PMD_DRV_LOG(ERR, "cannot requeue discarded mbuf"); + rte_pktmbuf_free(m); + } + } +diff --git a/dpdk/drivers/net/virtio/virtio_rxtx_simple_altivec.c b/dpdk/drivers/net/virtio/virtio_rxtx_simple_altivec.c +index 47225f4121..003b6ec3f6 100644 +--- a/dpdk/drivers/net/virtio/virtio_rxtx_simple_altivec.c ++++ b/dpdk/drivers/net/virtio/virtio_rxtx_simple_altivec.c +@@ -9,8 +9,7 @@ + #include + #include + +-#include +- ++#include + #include + #include + #include +diff --git a/dpdk/drivers/net/virtio/virtio_user/vhost_kernel.c b/dpdk/drivers/net/virtio/virtio_user/vhost_kernel.c +index 5c81e8dd9f..2c805077af 100644 +--- a/dpdk/drivers/net/virtio/virtio_user/vhost_kernel.c ++++ b/dpdk/drivers/net/virtio/virtio_user/vhost_kernel.c +@@ -330,16 +330,34 @@ vhost_kernel_enable_queue_pair(struct virtio_user_dev *dev, + + vhostfd = dev->vhostfds[pair_idx]; + ++ if (dev->qp_enabled[pair_idx] == enable) ++ return 0; ++ + if (!enable) { +- if (dev->tapfds[pair_idx] >= 0) { +- close(dev->tapfds[pair_idx]); +- dev->tapfds[pair_idx] = -1; ++ tapfd = dev->tapfds[pair_idx]; ++ if (vhost_kernel_set_backend(vhostfd, -1) < 0) { ++ PMD_DRV_LOG(ERR, "fail to set backend for vhost kernel"); ++ return -1; + } +- return vhost_kernel_set_backend(vhostfd, -1); +- } else if (dev->tapfds[pair_idx] >= 0) { ++ if (req_mq && vhost_kernel_tap_set_queue(tapfd, false) < 0) { ++ PMD_DRV_LOG(ERR, "fail to disable tap for vhost kernel"); ++ return -1; ++ } ++ dev->qp_enabled[pair_idx] = false; + return 0; + } + ++ if (dev->tapfds[pair_idx] >= 0) { ++ tapfd = dev->tapfds[pair_idx]; ++ if (vhost_kernel_tap_set_offload(tapfd, dev->features) == -1) ++ return -1; ++ if (req_mq && vhost_kernel_tap_set_queue(tapfd, true) < 0) { ++ PMD_DRV_LOG(ERR, "fail to enable tap for vhost kernel"); ++ return -1; ++ } ++ goto set_backend; ++ } ++ + if ((dev->features & (1ULL << 
VIRTIO_NET_F_MRG_RXBUF)) ||
+ (dev->features & (1ULL << VIRTIO_F_VERSION_1))
+ hdr_size = sizeof(struct virtio_net_hdr_mrg_rxbuf);
+@@ -353,13 +371,15 @@ vhost_kernel_enable_queue_pair(struct virtio_user_dev *dev,
+ return -1;
+ }
+
++ dev->tapfds[pair_idx] = tapfd;
++
++set_backend:
+ if (vhost_kernel_set_backend(vhostfd, tapfd) < 0) {
+ PMD_DRV_LOG(ERR, "fail to set backend for vhost kernel");
+- close(tapfd);
+ return -1;
+ }
+
+- dev->tapfds[pair_idx] = tapfd;
++ dev->qp_enabled[pair_idx] = true;
+ return 0;
+ }
+
+diff --git a/dpdk/drivers/net/virtio/virtio_user/vhost_kernel_tap.c b/dpdk/drivers/net/virtio/virtio_user/vhost_kernel_tap.c
+index 76bf75423e..2fa4f0d661 100644
+--- a/dpdk/drivers/net/virtio/virtio_user/vhost_kernel_tap.c
++++ b/dpdk/drivers/net/virtio/virtio_user/vhost_kernel_tap.c
+@@ -18,7 +18,7 @@
+ #include "../virtio_logs.h"
+ #include "../virtio_pci.h"
+
+-static int
++int
+ vhost_kernel_tap_set_offload(int fd, uint64_t features)
+ {
+ unsigned int offload = 0;
+@@ -37,26 +37,34 @@ vhost_kernel_tap_set_offload(int fd, uint64_t features)
+ offload |= TUN_F_UFO;
+ }
+
+- if (offload != 0) {
+- /* Check if our kernel supports TUNSETOFFLOAD */
+- if (ioctl(fd, TUNSETOFFLOAD, 0) != 0 && errno == EINVAL) {
+- PMD_DRV_LOG(ERR, "Kernel does't support TUNSETOFFLOAD\n");
+- return -ENOTSUP;
+- }
++ /* Check if our kernel supports TUNSETOFFLOAD */
++ if (ioctl(fd, TUNSETOFFLOAD, 0) != 0 && errno == EINVAL) {
++ PMD_DRV_LOG(ERR, "Kernel doesn't support TUNSETOFFLOAD\n");
++ return -ENOTSUP;
++ }
+
++ if (ioctl(fd, TUNSETOFFLOAD, offload) != 0) {
++ offload &= ~TUN_F_UFO;
+ if (ioctl(fd, TUNSETOFFLOAD, offload) != 0) {
+- offload &= ~TUN_F_UFO;
+- if (ioctl(fd, TUNSETOFFLOAD, offload) != 0) {
+- PMD_DRV_LOG(ERR, "TUNSETOFFLOAD ioctl() failed: %s\n",
+- strerror(errno));
+- return -1;
+- }
++ PMD_DRV_LOG(ERR, "TUNSETOFFLOAD ioctl() failed: %s\n",
++ strerror(errno));
++ return -1;
+ }
+ }
+
+ return 0;
+ }
+
++int
++vhost_kernel_tap_set_queue(int fd, bool attach)
++{
++ struct ifreq ifr = {
++ .ifr_flags = attach ? IFF_ATTACH_QUEUE : IFF_DETACH_QUEUE,
++ };
++
++ return ioctl(fd, TUNSETQUEUE, &ifr);
++}
++
+ int
+ vhost_kernel_open_tap(char **p_ifname, int hdr_size, int req_mq,
+ const char *mac, uint64_t features)
+@@ -66,6 +74,7 @@ vhost_kernel_open_tap(char **p_ifname, int hdr_size, int req_mq,
+ int sndbuf = INT_MAX;
+ struct ifreq ifr;
+ int tapfd;
++ int ret;
+
+ /* TODO:
+ * 1.
verify we can get/set vnet_hdr_len, tap_probe_vnet_hdr_len +@@ -131,7 +140,9 @@ vhost_kernel_open_tap(char **p_ifname, int hdr_size, int req_mq, + goto error; + } + +- vhost_kernel_tap_set_offload(tapfd, features); ++ ret = vhost_kernel_tap_set_offload(tapfd, features); ++ if (ret < 0 && ret != -ENOTSUP) ++ goto error; + + memset(&ifr, 0, sizeof(ifr)); + ifr.ifr_hwaddr.sa_family = ARPHRD_ETHER; +diff --git a/dpdk/drivers/net/virtio/virtio_user/vhost_kernel_tap.h b/dpdk/drivers/net/virtio/virtio_user/vhost_kernel_tap.h +index e0e95b4f59..5c4447b296 100644 +--- a/dpdk/drivers/net/virtio/virtio_user/vhost_kernel_tap.h ++++ b/dpdk/drivers/net/virtio/virtio_user/vhost_kernel_tap.h +@@ -2,6 +2,10 @@ + * Copyright(c) 2016 Intel Corporation + */ + ++#ifndef _VHOST_KERNEL_TAP_H ++#define _VHOST_KERNEL_TAP_H ++ ++#include + #include + + /* TUN ioctls */ +@@ -37,3 +41,7 @@ + + int vhost_kernel_open_tap(char **p_ifname, int hdr_size, int req_mq, + const char *mac, uint64_t features); ++int vhost_kernel_tap_set_offload(int fd, uint64_t features); ++int vhost_kernel_tap_set_queue(int fd, bool attach); ++ ++#endif +diff --git a/dpdk/drivers/net/virtio/virtio_user/vhost_user.c b/dpdk/drivers/net/virtio/virtio_user/vhost_user.c +index a4b5c25cd3..d8e083ba8b 100644 +--- a/dpdk/drivers/net/virtio/virtio_user/vhost_user.c ++++ b/dpdk/drivers/net/virtio/virtio_user/vhost_user.c +@@ -456,6 +456,9 @@ vhost_user_enable_queue_pair(struct virtio_user_dev *dev, + { + int i; + ++ if (dev->qp_enabled[pair_idx] == enable) ++ return 0; ++ + for (i = 0; i < 2; ++i) { + struct vhost_vring_state state = { + .index = pair_idx * 2 + i, +@@ -466,6 +469,7 @@ vhost_user_enable_queue_pair(struct virtio_user_dev *dev, + return -1; + } + ++ dev->qp_enabled[pair_idx] = enable; + return 0; + } + +diff --git a/dpdk/drivers/net/virtio/virtio_user/virtio_user_dev.c b/dpdk/drivers/net/virtio/virtio_user/virtio_user_dev.c +index ea016e85d8..1c6b26f8d3 100644 +--- a/dpdk/drivers/net/virtio/virtio_user/virtio_user_dev.c ++++ b/dpdk/drivers/net/virtio/virtio_user/virtio_user_dev.c +@@ -537,7 +537,8 @@ virtio_user_dev_uninit(struct virtio_user_dev *dev) + close(dev->kickfds[i]); + } + +- close(dev->vhostfd); ++ if (dev->vhostfd >= 0) ++ close(dev->vhostfd); + + if (dev->is_server && dev->listenfd >= 0) { + close(dev->listenfd); +@@ -545,8 +546,11 @@ virtio_user_dev_uninit(struct virtio_user_dev *dev) + } + + if (dev->vhostfds) { +- for (i = 0; i < dev->max_queue_pairs; ++i) ++ for (i = 0; i < dev->max_queue_pairs; ++i) { + close(dev->vhostfds[i]); ++ if (dev->tapfds[i] >= 0) ++ close(dev->tapfds[i]); ++ } + free(dev->vhostfds); + free(dev->tapfds); + } +diff --git a/dpdk/drivers/net/virtio/virtio_user/virtio_user_dev.h b/dpdk/drivers/net/virtio/virtio_user/virtio_user_dev.h +index ad86837717..3b6b6065a5 100644 +--- a/dpdk/drivers/net/virtio/virtio_user/virtio_user_dev.h ++++ b/dpdk/drivers/net/virtio/virtio_user/virtio_user_dev.h +@@ -49,6 +49,7 @@ struct virtio_user_dev { + struct vring_packed packed_vrings[VIRTIO_MAX_VIRTQUEUES]; + }; + struct virtio_user_queue packed_queues[VIRTIO_MAX_VIRTQUEUES]; ++ bool qp_enabled[VIRTIO_MAX_VIRTQUEUE_PAIRS]; + + struct virtio_user_backend_ops *ops; + pthread_mutex_t mutex; +diff --git a/dpdk/drivers/net/virtio/virtio_user_ethdev.c b/dpdk/drivers/net/virtio/virtio_user_ethdev.c +index 3fc1725736..e2cbd2478d 100644 +--- a/dpdk/drivers/net/virtio/virtio_user_ethdev.c ++++ b/dpdk/drivers/net/virtio/virtio_user_ethdev.c +@@ -13,6 +13,7 @@ + #include + #include + #include ++#include + + #include 
"virtio_ethdev.h" + #include "virtio_logs.h" +@@ -25,12 +26,48 @@ + #define virtio_user_get_dev(hw) \ + ((struct virtio_user_dev *)(hw)->virtio_user_dev) + ++static void ++virtio_user_reset_queues_packed(struct rte_eth_dev *dev) ++{ ++ struct virtio_hw *hw = dev->data->dev_private; ++ struct virtnet_rx *rxvq; ++ struct virtnet_tx *txvq; ++ uint16_t i; ++ ++ /* Add lock to avoid queue contention. */ ++ rte_spinlock_lock(&hw->state_lock); ++ hw->started = 0; ++ ++ /* ++ * Waitting for datapath to complete before resetting queues. ++ * 1 ms should be enough for the ongoing Tx/Rx function to finish. ++ */ ++ rte_delay_ms(1); ++ ++ /* Vring reset for each Tx queue and Rx queue. */ ++ for (i = 0; i < dev->data->nb_rx_queues; i++) { ++ rxvq = dev->data->rx_queues[i]; ++ virtqueue_rxvq_reset_packed(rxvq->vq); ++ virtio_dev_rx_queue_setup_finish(dev, i); ++ } ++ ++ for (i = 0; i < dev->data->nb_tx_queues; i++) { ++ txvq = dev->data->tx_queues[i]; ++ virtqueue_txvq_reset_packed(txvq->vq); ++ } ++ ++ hw->started = 1; ++ rte_spinlock_unlock(&hw->state_lock); ++} ++ ++ + static int + virtio_user_server_reconnect(struct virtio_user_dev *dev) + { + int ret; + int connectfd; + struct rte_eth_dev *eth_dev = &rte_eth_devices[dev->port_id]; ++ struct virtio_hw *hw = eth_dev->data->dev_private; + + connectfd = accept(dev->listenfd, NULL, NULL); + if (connectfd < 0) +@@ -51,6 +88,14 @@ virtio_user_server_reconnect(struct virtio_user_dev *dev) + + dev->features &= dev->device_features; + ++ /* For packed ring, resetting queues is required in reconnection. */ ++ if (vtpci_packed_queue(hw) && ++ (vtpci_get_status(hw) & VIRTIO_CONFIG_STATUS_DRIVER_OK)) { ++ PMD_INIT_LOG(NOTICE, "Packets on the fly will be dropped" ++ " when packed ring reconnecting."); ++ virtio_user_reset_queues_packed(eth_dev); ++ } ++ + ret = virtio_user_start_device(dev); + if (ret < 0) + return -1; +@@ -433,12 +478,17 @@ static int + get_integer_arg(const char *key __rte_unused, + const char *value, void *extra_args) + { ++ uint64_t integer = 0; + if (!value || !extra_args) + return -EINVAL; +- +- *(uint64_t *)extra_args = strtoull(value, NULL, 0); +- +- return 0; ++ errno = 0; ++ integer = strtoull(value, NULL, 0); ++ /* extra_args keeps default value, it should be replaced ++ * only in case of successful parsing of the 'value' arg ++ */ ++ if (errno == 0) ++ *(uint64_t *)extra_args = integer; ++ return -errno; + } + + static struct rte_eth_dev * +@@ -517,7 +567,7 @@ virtio_user_pmd_probe(struct rte_vdev_device *dev) + const char *name = rte_vdev_device_name(dev); + eth_dev = rte_eth_dev_attach_secondary(name); + if (!eth_dev) { +- RTE_LOG(ERR, PMD, "Failed to probe %s\n", name); ++ PMD_INIT_LOG(ERR, "Failed to probe %s", name); + return -1; + } + +@@ -669,7 +719,7 @@ virtio_user_pmd_probe(struct rte_vdev_device *dev) + goto end; + } + +- /* previously called by rte_pci_probe() for physical dev */ ++ /* previously called by pci probing for physical dev */ + if (eth_virtio_dev_init(eth_dev) < 0) { + PMD_INIT_LOG(ERR, "eth_virtio_dev_init fails"); + virtio_user_eth_dev_free(eth_dev); +diff --git a/dpdk/drivers/net/virtio/virtqueue.c b/dpdk/drivers/net/virtio/virtqueue.c +index 5ff1e3587e..02c8b9fc54 100644 +--- a/dpdk/drivers/net/virtio/virtqueue.c ++++ b/dpdk/drivers/net/virtio/virtqueue.c +@@ -141,3 +141,76 @@ virtqueue_rxvq_flush(struct virtqueue *vq) + else + virtqueue_rxvq_flush_split(vq); + } ++ ++int ++virtqueue_rxvq_reset_packed(struct virtqueue *vq) ++{ ++ int size = vq->vq_nentries; ++ struct vq_desc_extra *dxp; ++ struct virtnet_rx 
*rxvq; ++ uint16_t desc_idx; ++ ++ vq->vq_used_cons_idx = 0; ++ vq->vq_desc_head_idx = 0; ++ vq->vq_avail_idx = 0; ++ vq->vq_desc_tail_idx = (uint16_t)(vq->vq_nentries - 1); ++ vq->vq_free_cnt = vq->vq_nentries; ++ ++ vq->vq_packed.used_wrap_counter = 1; ++ vq->vq_packed.cached_flags = VRING_PACKED_DESC_F_AVAIL; ++ vq->vq_packed.event_flags_shadow = 0; ++ vq->vq_packed.cached_flags |= VRING_DESC_F_WRITE; ++ ++ rxvq = &vq->rxq; ++ memset(rxvq->mz->addr, 0, rxvq->mz->len); ++ ++ for (desc_idx = 0; desc_idx < vq->vq_nentries; desc_idx++) { ++ dxp = &vq->vq_descx[desc_idx]; ++ if (dxp->cookie != NULL) { ++ rte_pktmbuf_free(dxp->cookie); ++ dxp->cookie = NULL; ++ } ++ } ++ ++ vring_desc_init_packed(vq, size); ++ ++ virtqueue_disable_intr(vq); ++ return 0; ++} ++ ++int ++virtqueue_txvq_reset_packed(struct virtqueue *vq) ++{ ++ int size = vq->vq_nentries; ++ struct vq_desc_extra *dxp; ++ struct virtnet_tx *txvq; ++ uint16_t desc_idx; ++ ++ vq->vq_used_cons_idx = 0; ++ vq->vq_desc_head_idx = 0; ++ vq->vq_avail_idx = 0; ++ vq->vq_desc_tail_idx = (uint16_t)(vq->vq_nentries - 1); ++ vq->vq_free_cnt = vq->vq_nentries; ++ ++ vq->vq_packed.used_wrap_counter = 1; ++ vq->vq_packed.cached_flags = VRING_PACKED_DESC_F_AVAIL; ++ vq->vq_packed.event_flags_shadow = 0; ++ ++ txvq = &vq->txq; ++ memset(txvq->mz->addr, 0, txvq->mz->len); ++ memset(txvq->virtio_net_hdr_mz->addr, 0, ++ txvq->virtio_net_hdr_mz->len); ++ ++ for (desc_idx = 0; desc_idx < vq->vq_nentries; desc_idx++) { ++ dxp = &vq->vq_descx[desc_idx]; ++ if (dxp->cookie != NULL) { ++ rte_pktmbuf_free(dxp->cookie); ++ dxp->cookie = NULL; ++ } ++ } ++ ++ vring_desc_init_packed(vq, size); ++ ++ virtqueue_disable_intr(vq); ++ return 0; ++} +diff --git a/dpdk/drivers/net/virtio/virtqueue.h b/dpdk/drivers/net/virtio/virtqueue.h +index 8d7f197b13..58ad7309ae 100644 +--- a/dpdk/drivers/net/virtio/virtqueue.h ++++ b/dpdk/drivers/net/virtio/virtqueue.h +@@ -443,6 +443,10 @@ struct rte_mbuf *virtqueue_detach_unused(struct virtqueue *vq); + /* Flush the elements in the used ring. 
*/
+ void virtqueue_rxvq_flush(struct virtqueue *vq);
+
++int virtqueue_rxvq_reset_packed(struct virtqueue *vq);
++
++int virtqueue_txvq_reset_packed(struct virtqueue *vq);
++
+ static inline int
+ virtqueue_full(const struct virtqueue *vq)
+ {
+diff --git a/dpdk/drivers/net/vmxnet3/vmxnet3_ethdev.c b/dpdk/drivers/net/vmxnet3/vmxnet3_ethdev.c
+index 6e6efa9603..705e9760f4 100644
+--- a/dpdk/drivers/net/vmxnet3/vmxnet3_ethdev.c
++++ b/dpdk/drivers/net/vmxnet3/vmxnet3_ethdev.c
+@@ -771,7 +771,8 @@ vmxnet3_dev_start(struct rte_eth_dev *dev)
+ PMD_INIT_LOG(DEBUG, "Failed to setup memory region\n");
+ }
+
+- if (VMXNET3_VERSION_GE_4(hw)) {
++ if (VMXNET3_VERSION_GE_4(hw) &&
++ dev->data->dev_conf.rxmode.mq_mode == ETH_MQ_RX_RSS) {
+ /* Check for additional RSS */
+ ret = vmxnet3_v4_rss_configure(dev);
+ if (ret != VMXNET3_SUCCESS) {
+diff --git a/dpdk/drivers/net/vmxnet3/vmxnet3_ethdev.h b/dpdk/drivers/net/vmxnet3/vmxnet3_ethdev.h
+index 8c2b6f8771..dd685b02b7 100644
+--- a/dpdk/drivers/net/vmxnet3/vmxnet3_ethdev.h
++++ b/dpdk/drivers/net/vmxnet3/vmxnet3_ethdev.h
+@@ -38,6 +38,10 @@
+ ETH_RSS_NONFRAG_IPV4_UDP | \
+ ETH_RSS_NONFRAG_IPV6_UDP)
+
++#define VMXNET3_MANDATORY_V4_RSS ( \
++ ETH_RSS_NONFRAG_IPV4_TCP | \
++ ETH_RSS_NONFRAG_IPV6_TCP)
++
+ /* RSS configuration structure - shared with device through GPA */
+ typedef struct VMXNET3_RSSConf {
+ uint16_t hashType;
+diff --git a/dpdk/drivers/net/vmxnet3/vmxnet3_rxtx.c b/dpdk/drivers/net/vmxnet3/vmxnet3_rxtx.c
+index 7794d74214..73e270f30f 100644
+--- a/dpdk/drivers/net/vmxnet3/vmxnet3_rxtx.c
++++ b/dpdk/drivers/net/vmxnet3/vmxnet3_rxtx.c
+@@ -950,13 +950,17 @@ vmxnet3_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts)
+
+ RTE_ASSERT(rxd->btype == VMXNET3_RXD_BTYPE_BODY);
+
+- if (rxm->data_len) {
++ if (likely(start && rxm->data_len > 0)) {
+ start->pkt_len += rxm->data_len;
+ start->nb_segs++;
+
+ rxq->last_seg->next = rxm;
+ rxq->last_seg = rxm;
+ } else {
++ PMD_RX_LOG(ERR, "Error: received an empty or out-of-order frame.");
++ rxq->stats.drop_total++;
++ rxq->stats.drop_err++;
++
+ rte_pktmbuf_free_seg(rxm);
+ }
+ }
+@@ -1311,6 +1315,14 @@ vmxnet3_v4_rss_configure(struct rte_eth_dev *dev)
+
+ cmdInfo->setRSSFields = 0;
+ port_rss_conf = &dev->data->dev_conf.rx_adv_conf.rss_conf;
++
++ if ((port_rss_conf->rss_hf & VMXNET3_MANDATORY_V4_RSS) !=
++ VMXNET3_MANDATORY_V4_RSS) {
++ PMD_INIT_LOG(WARNING, "RSS: IPv4/6 TCP is required for vmxnet3 v4 RSS, "
++ "automatically setting it");
++ port_rss_conf->rss_hf |= VMXNET3_MANDATORY_V4_RSS;
++ }
++
+ rss_hf = port_rss_conf->rss_hf &
+ (VMXNET3_V4_RSS_MASK | VMXNET3_RSS_OFFLOAD_ALL);
+
+diff --git a/dpdk/drivers/raw/ifpga/meson.build b/dpdk/drivers/raw/ifpga/meson.build
+index 206136ff48..d4027068d6 100644
+--- a/dpdk/drivers/raw/ifpga/meson.build
++++ b/dpdk/drivers/raw/ifpga/meson.build
+@@ -15,7 +15,7 @@ if build
+ objs = [base_objs]
+
+ deps += ['ethdev', 'rawdev', 'pci', 'bus_pci', 'kvargs',
+- 'bus_vdev', 'bus_ifpga', 'net', 'i40e', 'ipn3ke']
++ 'bus_vdev', 'bus_ifpga', 'net', 'pmd_i40e', 'pmd_ipn3ke']
+ ext_deps += dep
+
+ sources = files('ifpga_rawdev.c')
+diff --git a/dpdk/drivers/raw/ntb/ntb.c b/dpdk/drivers/raw/ntb/ntb.c
+index ad7f6abfd3..dd0b72f8c5 100644
+--- a/dpdk/drivers/raw/ntb/ntb.c
++++ b/dpdk/drivers/raw/ntb/ntb.c
+@@ -683,8 +683,8 @@ ntb_enqueue_bufs(struct rte_rawdev *dev,
+ sizeof(struct ntb_used) * nb1);
+ rte_memcpy(txq->tx_used_ring, tx_used + nb1,
+ sizeof(struct ntb_used) * nb2);
+- *txq->used_cnt = txq->last_used;
+ rte_wmb();
++ *txq->used_cnt =
txq->last_used; + + /* update queue stats */ + hw->ntb_xstats[NTB_TX_BYTES_ID + off] += bytes; +@@ -789,8 +789,8 @@ ntb_dequeue_bufs(struct rte_rawdev *dev, + sizeof(struct ntb_desc) * nb1); + rte_memcpy(rxq->rx_desc_ring, rx_desc + nb1, + sizeof(struct ntb_desc) * nb2); +- *rxq->avail_cnt = rxq->last_avail; + rte_wmb(); ++ *rxq->avail_cnt = rxq->last_avail; + + /* update queue stats */ + off = NTB_XSTATS_NUM * ((size_t)context + 1); +diff --git a/dpdk/examples/ethtool/lib/rte_ethtool.c b/dpdk/examples/ethtool/lib/rte_ethtool.c +index 667d7eaf27..db8150efd5 100644 +--- a/dpdk/examples/ethtool/lib/rte_ethtool.c ++++ b/dpdk/examples/ethtool/lib/rte_ethtool.c +@@ -402,7 +402,9 @@ rte_ethtool_net_set_rx_mode(uint16_t port_id) + } + + /* Enable Rx vlan filter, VF unspport status is discard */ +- rte_eth_dev_set_vlan_offload(port_id, ETH_VLAN_FILTER_MASK); ++ ret = rte_eth_dev_set_vlan_offload(port_id, ETH_VLAN_FILTER_MASK); ++ if (ret != 0) ++ return ret; + + return 0; + } +diff --git a/dpdk/examples/eventdev_pipeline/main.c b/dpdk/examples/eventdev_pipeline/main.c +index d3ff1bbe4f..21958269f7 100644 +--- a/dpdk/examples/eventdev_pipeline/main.c ++++ b/dpdk/examples/eventdev_pipeline/main.c +@@ -10,6 +10,8 @@ + + #include "pipeline_common.h" + ++struct fastpath_data *fdata; ++ + struct config_data cdata = { + .num_packets = (1L << 25), /* do ~32M packets */ + .num_fids = 512, +@@ -299,12 +301,6 @@ signal_handler(int signum) + + rte_eal_mp_wait_lcore(); + +- RTE_ETH_FOREACH_DEV(portid) { +- rte_eth_dev_close(portid); +- } +- +- rte_event_dev_stop(0); +- rte_event_dev_close(0); + } + if (signum == SIGTSTP) + rte_event_dev_dump(0, stdout); +@@ -467,5 +463,14 @@ main(int argc, char **argv) + + } + ++ RTE_ETH_FOREACH_DEV(portid) { ++ rte_eth_dev_close(portid); ++ } ++ ++ rte_event_dev_stop(0); ++ rte_event_dev_close(0); ++ ++ rte_eal_cleanup(); ++ + return 0; + } +diff --git a/dpdk/examples/eventdev_pipeline/pipeline_common.h b/dpdk/examples/eventdev_pipeline/pipeline_common.h +index 8e30393d09..c7245f7f0f 100644 +--- a/dpdk/examples/eventdev_pipeline/pipeline_common.h ++++ b/dpdk/examples/eventdev_pipeline/pipeline_common.h +@@ -93,8 +93,8 @@ struct port_link { + uint8_t priority; + }; + +-struct fastpath_data *fdata; +-struct config_data cdata; ++extern struct fastpath_data *fdata; ++extern struct config_data cdata; + + static __rte_always_inline void + exchange_mac(struct rte_mbuf *m) +diff --git a/dpdk/examples/fips_validation/fips_validation.c b/dpdk/examples/fips_validation/fips_validation.c +index 07ffa62e9e..b79a095aca 100644 +--- a/dpdk/examples/fips_validation/fips_validation.c ++++ b/dpdk/examples/fips_validation/fips_validation.c +@@ -144,6 +144,24 @@ fips_test_parse_header(void) + ret = parse_test_tdes_init(); + if (ret < 0) + return 0; ++ } else if (strstr(info.vec[i], "PERMUTATION")) { ++ algo_parsed = 1; ++ info.algo = FIPS_TEST_ALGO_TDES; ++ ret = parse_test_tdes_init(); ++ if (ret < 0) ++ return 0; ++ } else if (strstr(info.vec[i], "VARIABLE")) { ++ algo_parsed = 1; ++ info.algo = FIPS_TEST_ALGO_TDES; ++ ret = parse_test_tdes_init(); ++ if (ret < 0) ++ return 0; ++ } else if (strstr(info.vec[i], "SUBSTITUTION")) { ++ algo_parsed = 1; ++ info.algo = FIPS_TEST_ALGO_TDES; ++ ret = parse_test_tdes_init(); ++ if (ret < 0) ++ return 0; + } else if (strstr(info.vec[i], "SHA-")) { + algo_parsed = 1; + info.algo = FIPS_TEST_ALGO_SHA; +diff --git a/dpdk/examples/fips_validation/fips_validation_gcm.c b/dpdk/examples/fips_validation/fips_validation_gcm.c +index ea48ddf707..47576e9a38 100644 
+--- a/dpdk/examples/fips_validation/fips_validation_gcm.c ++++ b/dpdk/examples/fips_validation/fips_validation_gcm.c +@@ -46,6 +46,10 @@ struct fips_test_callback gcm_interim_vectors[] = { + {KEYLEN_STR, parser_read_uint32_bit_val, &vec.aead.key}, + {IVLEN_STR, parser_read_uint32_bit_val, &vec.iv}, + {PTLEN_STR, parser_read_uint32_bit_val, &vec.pt}, ++ {PTLEN_STR, parser_read_uint32_bit_val, &vec.ct}, ++ /**< The NIST test vectors use 'PTlen' to denote input text ++ * length in case of decrypt & encrypt operations. ++ */ + {AADLEN_STR, parser_read_uint32_bit_val, &vec.aead.aad}, + {TAGLEN_STR, parser_read_uint32_bit_val, + &vec.aead.digest}, +diff --git a/dpdk/examples/ioat/ioatfwd.c b/dpdk/examples/ioat/ioatfwd.c +index e9117718fe..53de231795 100644 +--- a/dpdk/examples/ioat/ioatfwd.c ++++ b/dpdk/examples/ioat/ioatfwd.c +@@ -460,7 +460,7 @@ ioat_tx_port(struct rxtx_port_config *tx_config) + MAX_PKT_BURST, NULL); + } + +- if (nb_dq <= 0) ++ if ((int32_t) nb_dq <= 0) + return; + + if (copy_mode == COPY_MODE_IOAT_NUM) +@@ -697,7 +697,7 @@ check_link_status(uint32_t port_mask) + { + uint16_t portid; + struct rte_eth_link link; +- int retval = 0; ++ int ret, link_status = 0; + + printf("\nChecking link status\n"); + RTE_ETH_FOREACH_DEV(portid) { +@@ -705,7 +705,12 @@ check_link_status(uint32_t port_mask) + continue; + + memset(&link, 0, sizeof(link)); +- rte_eth_link_get(portid, &link); ++ ret = rte_eth_link_get(portid, &link); ++ if (ret < 0) { ++ printf("Port %u link get failed: err=%d\n", ++ portid, ret); ++ continue; ++ } + + /* Print link status */ + if (link.link_status) { +@@ -713,12 +718,12 @@ check_link_status(uint32_t port_mask) + "Port %d Link Up. Speed %u Mbps - %s\n", + portid, link.link_speed, + (link.link_duplex == ETH_LINK_FULL_DUPLEX) ? +- ("full-duplex") : ("half-duplex\n")); +- retval = 1; ++ ("full-duplex") : ("half-duplex")); ++ link_status = 1; + } else + printf("Port %d Link Down\n", portid); + } +- return retval; ++ return link_status; + } + + static void +@@ -824,7 +829,11 @@ port_init(uint16_t portid, struct rte_mempool *mbuf_pool, uint16_t nb_queues) + /* Init port */ + printf("Initializing port %u... ", portid); + fflush(stdout); +- rte_eth_dev_info_get(portid, &dev_info); ++ ret = rte_eth_dev_info_get(portid, &dev_info); ++ if (ret < 0) ++ rte_exit(EXIT_FAILURE, "Cannot get device info: %s, port=%u\n", ++ rte_strerror(-ret), portid); ++ + local_port_conf.rx_adv_conf.rss_conf.rss_hf &= + dev_info.flow_type_rss_offloads; + if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE) +diff --git a/dpdk/examples/ip_fragmentation/main.c b/dpdk/examples/ip_fragmentation/main.c +index 104612339c..90e4d1ea4a 100644 +--- a/dpdk/examples/ip_fragmentation/main.c ++++ b/dpdk/examples/ip_fragmentation/main.c +@@ -617,7 +617,7 @@ check_all_ports_link_status(uint32_t port_mask) + "Port%d Link Up .Speed %u Mbps - %s\n", + portid, link.link_speed, + (link.link_duplex == ETH_LINK_FULL_DUPLEX) ? 
+- ("full-duplex") : ("half-duplex\n")); ++ ("full-duplex") : ("half-duplex")); + else + printf("Port %d Link Down\n", portid); + continue; +@@ -646,7 +646,7 @@ check_all_ports_link_status(uint32_t port_mask) + } + } + +-/* Check L3 packet type detection capablity of the NIC port */ ++/* Check L3 packet type detection capability of the NIC port */ + static int + check_ptype(int portid) + { +diff --git a/dpdk/examples/ip_pipeline/thread.c b/dpdk/examples/ip_pipeline/thread.c +index 272fbbeed1..adb83167cd 100644 +--- a/dpdk/examples/ip_pipeline/thread.c ++++ b/dpdk/examples/ip_pipeline/thread.c +@@ -325,8 +325,6 @@ thread_pipeline_enable(uint32_t thread_id, + + /* Send request and wait for response */ + rsp = thread_msg_send_recv(thread_id, req); +- if (rsp == NULL) +- return -1; + + /* Read response */ + status = rsp->status; +@@ -412,8 +410,6 @@ thread_pipeline_disable(uint32_t thread_id, + + /* Send request and wait for response */ + rsp = thread_msg_send_recv(thread_id, req); +- if (rsp == NULL) +- return -1; + + /* Read response */ + status = rsp->status; +@@ -815,8 +811,6 @@ pipeline_port_in_stats_read(const char *pipeline_name, + + /* Send request and wait for response */ + rsp = pipeline_msg_send_recv(p, req); +- if (rsp == NULL) +- return -1; + + /* Read response */ + status = rsp->status; +@@ -863,8 +857,6 @@ pipeline_port_in_enable(const char *pipeline_name, + + /* Send request and wait for response */ + rsp = pipeline_msg_send_recv(p, req); +- if (rsp == NULL) +- return -1; + + /* Read response */ + status = rsp->status; +@@ -909,8 +901,6 @@ pipeline_port_in_disable(const char *pipeline_name, + + /* Send request and wait for response */ + rsp = pipeline_msg_send_recv(p, req); +- if (rsp == NULL) +- return -1; + + /* Read response */ + status = rsp->status; +@@ -963,8 +953,6 @@ pipeline_port_out_stats_read(const char *pipeline_name, + + /* Send request and wait for response */ + rsp = pipeline_msg_send_recv(p, req); +- if (rsp == NULL) +- return -1; + + /* Read response */ + status = rsp->status; +@@ -1019,8 +1007,6 @@ pipeline_table_stats_read(const char *pipeline_name, + + /* Send request and wait for response */ + rsp = pipeline_msg_send_recv(p, req); +- if (rsp == NULL) +- return -1; + + /* Read response */ + status = rsp->status; +@@ -1436,10 +1422,6 @@ pipeline_table_rule_add(const char *pipeline_name, + + /* Send request and wait for response */ + rsp = pipeline_msg_send_recv(p, req); +- if (rsp == NULL) { +- free(rule); +- return -1; +- } + + /* Read response */ + status = rsp->status; +@@ -1538,10 +1520,6 @@ pipeline_table_rule_add_default(const char *pipeline_name, + + /* Send request and wait for response */ + rsp = pipeline_msg_send_recv(p, req); +- if (rsp == NULL) { +- free(rule); +- return -1; +- } + + /* Read response */ + status = rsp->status; +@@ -1655,10 +1633,6 @@ pipeline_table_rule_add_bulk(const char *pipeline_name, + + /* Send request and wait for response */ + rsp = pipeline_msg_send_recv(p, req); +- if (rsp == NULL) { +- table_rule_list_free(list); +- return -ENOMEM; +- } + + /* Read response */ + status = rsp->status; +@@ -1733,8 +1707,6 @@ pipeline_table_rule_delete(const char *pipeline_name, + + /* Send request and wait for response */ + rsp = pipeline_msg_send_recv(p, req); +- if (rsp == NULL) +- return -1; + + /* Read response */ + status = rsp->status; +@@ -1790,8 +1762,6 @@ pipeline_table_rule_delete_default(const char *pipeline_name, + + /* Send request and wait for response */ + rsp = pipeline_msg_send_recv(p, req); +- if (rsp == NULL) +- return 
-1; + + /* Read response */ + status = rsp->status; +@@ -1857,8 +1827,6 @@ pipeline_table_rule_stats_read(const char *pipeline_name, + + /* Send request and wait for response */ + rsp = pipeline_msg_send_recv(p, req); +- if (rsp == NULL) +- return -1; + + /* Read response */ + status = rsp->status; +@@ -1915,8 +1883,6 @@ pipeline_table_mtr_profile_add(const char *pipeline_name, + + /* Send request and wait for response */ + rsp = pipeline_msg_send_recv(p, req); +- if (rsp == NULL) +- return -1; + + /* Read response */ + status = rsp->status; +@@ -1967,8 +1933,6 @@ pipeline_table_mtr_profile_delete(const char *pipeline_name, + + /* Send request and wait for response */ + rsp = pipeline_msg_send_recv(p, req); +- if (rsp == NULL) +- return -1; + + /* Read response */ + status = rsp->status; +@@ -2037,8 +2001,6 @@ pipeline_table_rule_mtr_read(const char *pipeline_name, + + /* Send request and wait for response */ + rsp = pipeline_msg_send_recv(p, req); +- if (rsp == NULL) +- return -1; + + /* Read response */ + status = rsp->status; +@@ -2096,8 +2058,6 @@ pipeline_table_dscp_table_update(const char *pipeline_name, + + /* Send request and wait for response */ + rsp = pipeline_msg_send_recv(p, req); +- if (rsp == NULL) +- return -1; + + /* Read response */ + status = rsp->status; +@@ -2164,8 +2124,6 @@ pipeline_table_rule_ttl_read(const char *pipeline_name, + + /* Send request and wait for response */ + rsp = pipeline_msg_send_recv(p, req); +- if (rsp == NULL) +- return -1; + + /* Read response */ + status = rsp->status; +@@ -2229,8 +2187,6 @@ pipeline_table_rule_time_read(const char *pipeline_name, + + /* Send request and wait for response */ + rsp = pipeline_msg_send_recv(p, req); +- if (rsp == NULL) +- return -1; + + /* Read response */ + status = rsp->status; +diff --git a/dpdk/examples/ip_reassembly/main.c b/dpdk/examples/ip_reassembly/main.c +index d59e6d02ff..29b34d0710 100644 +--- a/dpdk/examples/ip_reassembly/main.c ++++ b/dpdk/examples/ip_reassembly/main.c +@@ -736,7 +736,7 @@ check_all_ports_link_status(uint32_t port_mask) + "Port%d Link Up. Speed %u Mbps - %s\n", + portid, link.link_speed, + (link.link_duplex == ETH_LINK_FULL_DUPLEX) ? +- ("full-duplex") : ("half-duplex\n")); ++ ("full-duplex") : ("half-duplex")); + else + printf("Port %d Link Down\n", portid); + continue; +diff --git a/dpdk/examples/ipsec-secgw/ipsec-secgw.c b/dpdk/examples/ipsec-secgw/ipsec-secgw.c +index 3b5aaf6832..1493be9025 100644 +--- a/dpdk/examples/ipsec-secgw/ipsec-secgw.c ++++ b/dpdk/examples/ipsec-secgw/ipsec-secgw.c +@@ -1668,7 +1668,7 @@ check_all_ports_link_status(uint32_t port_mask) + "Port%d Link Up - speed %u Mbps -%s\n", + portid, link.link_speed, + (link.link_duplex == ETH_LINK_FULL_DUPLEX) ? 
+- ("full-duplex") : ("half-duplex\n")); ++ ("full-duplex") : ("half-duplex")); + else + printf("Port %d Link Down\n", portid); + continue; +diff --git a/dpdk/examples/ipsec-secgw/ipsec_process.c b/dpdk/examples/ipsec-secgw/ipsec_process.c +index 2eb5c8b345..37f406d46c 100644 +--- a/dpdk/examples/ipsec-secgw/ipsec_process.c ++++ b/dpdk/examples/ipsec-secgw/ipsec_process.c +@@ -125,6 +125,7 @@ sa_group(void *sa_ptr[], struct rte_mbuf *pkts[], + void * const nosa = &spi; + + sa = nosa; ++ grp[0].m = pkts; + for (i = 0, n = 0; i != num; i++) { + + if (sa != sa_ptr[i]) { +diff --git a/dpdk/examples/ipsec-secgw/sa.c b/dpdk/examples/ipsec-secgw/sa.c +index 7f046e3ed7..fcc6695388 100644 +--- a/dpdk/examples/ipsec-secgw/sa.c ++++ b/dpdk/examples/ipsec-secgw/sa.c +@@ -314,6 +314,9 @@ parse_sa_tokens(char **tokens, uint32_t n_tokens, + APP_CHECK(algo != NULL, status, "unrecognized " + "input \"%s\"", tokens[ti]); + ++ if (status->status < 0) ++ return; ++ + rule->cipher_algo = algo->algo; + rule->block_size = algo->block_size; + rule->iv_len = algo->iv_len; +@@ -378,6 +381,9 @@ parse_sa_tokens(char **tokens, uint32_t n_tokens, + APP_CHECK(algo != NULL, status, "unrecognized " + "input \"%s\"", tokens[ti]); + ++ if (status->status < 0) ++ return; ++ + rule->auth_algo = algo->algo; + rule->auth_key_len = algo->key_len; + rule->digest_len = algo->digest_len; +@@ -433,6 +439,9 @@ parse_sa_tokens(char **tokens, uint32_t n_tokens, + APP_CHECK(algo != NULL, status, "unrecognized " + "input \"%s\"", tokens[ti]); + ++ if (status->status < 0) ++ return; ++ + rule->aead_algo = algo->algo; + rule->cipher_key_len = algo->key_len; + rule->digest_len = algo->digest_len; +@@ -984,7 +993,6 @@ sa_add_rules(struct sa_ctx *sa_ctx, const struct ipsec_sa entries[], + } + + if (sa->aead_algo == RTE_CRYPTO_AEAD_AES_GCM) { +- struct rte_ipsec_session *ips; + iv_length = 12; + + sa_ctx->xf[idx].a.type = RTE_CRYPTO_SYM_XFORM_AEAD; +@@ -1004,20 +1012,6 @@ sa_add_rules(struct sa_ctx *sa_ctx, const struct ipsec_sa entries[], + sa->digest_len; + + sa->xforms = &sa_ctx->xf[idx].a; +- +- ips = ipsec_get_primary_session(sa); +- if (ips->type == +- RTE_SECURITY_ACTION_TYPE_INLINE_PROTOCOL || +- ips->type == +- RTE_SECURITY_ACTION_TYPE_INLINE_CRYPTO) { +- rc = create_inline_session(skt_ctx, sa, ips); +- if (rc != 0) { +- RTE_LOG(ERR, IPSEC_ESP, +- "create_inline_session() failed\n"); +- return -EINVAL; +- } +- } +- print_one_sa_rule(sa, inbound); + } else { + switch (sa->cipher_algo) { + case RTE_CRYPTO_CIPHER_NULL: +@@ -1082,9 +1076,21 @@ sa_add_rules(struct sa_ctx *sa_ctx, const struct ipsec_sa entries[], + sa_ctx->xf[idx].a.next = &sa_ctx->xf[idx].b; + sa_ctx->xf[idx].b.next = NULL; + sa->xforms = &sa_ctx->xf[idx].a; ++ } + +- print_one_sa_rule(sa, inbound); ++ if (ips->type == ++ RTE_SECURITY_ACTION_TYPE_INLINE_PROTOCOL || ++ ips->type == ++ RTE_SECURITY_ACTION_TYPE_INLINE_CRYPTO) { ++ rc = create_inline_session(skt_ctx, sa, ips); ++ if (rc != 0) { ++ RTE_LOG(ERR, IPSEC_ESP, ++ "create_inline_session() failed\n"); ++ return -EINVAL; ++ } + } ++ ++ print_one_sa_rule(sa, inbound); + } + + return 0; +diff --git a/dpdk/examples/ipv4_multicast/main.c b/dpdk/examples/ipv4_multicast/main.c +index 63333b5b69..09d9270aff 100644 +--- a/dpdk/examples/ipv4_multicast/main.c ++++ b/dpdk/examples/ipv4_multicast/main.c +@@ -600,7 +600,7 @@ check_all_ports_link_status(uint32_t port_mask) + "Port%d Link Up. Speed %u Mbps - %s\n", + portid, link.link_speed, + (link.link_duplex == ETH_LINK_FULL_DUPLEX) ? 
+- ("full-duplex") : ("half-duplex\n")); ++ ("full-duplex") : ("half-duplex")); + else + printf("Port %d Link Down\n", portid); + continue; +diff --git a/dpdk/examples/kni/main.c b/dpdk/examples/kni/main.c +index 5f713e6b22..d48a59fcb1 100644 +--- a/dpdk/examples/kni/main.c ++++ b/dpdk/examples/kni/main.c +@@ -679,7 +679,7 @@ check_all_ports_link_status(uint32_t port_mask) + "Port%d Link Up - speed %uMbps - %s\n", + portid, link.link_speed, + (link.link_duplex == ETH_LINK_FULL_DUPLEX) ? +- ("full-duplex") : ("half-duplex\n")); ++ ("full-duplex") : ("half-duplex")); + else + printf("Port %d Link Down\n", portid); + continue; +@@ -764,15 +764,16 @@ monitor_all_ports_link_status(void *arg) + return NULL; + } + +-/* Callback for request of changing MTU */ + static int +-kni_change_mtu(uint16_t port_id, unsigned int new_mtu) ++kni_change_mtu_(uint16_t port_id, unsigned int new_mtu) + { + int ret; + uint16_t nb_rxd = NB_RXD; ++ uint16_t nb_txd = NB_TXD; + struct rte_eth_conf conf; + struct rte_eth_dev_info dev_info; + struct rte_eth_rxconf rxq_conf; ++ struct rte_eth_txconf txq_conf; + + if (!rte_eth_dev_is_valid_port(port_id)) { + RTE_LOG(ERR, APP, "Invalid port id %d\n", port_id); +@@ -800,7 +801,7 @@ kni_change_mtu(uint16_t port_id, unsigned int new_mtu) + return ret; + } + +- ret = rte_eth_dev_adjust_nb_rx_tx_desc(port_id, &nb_rxd, NULL); ++ ret = rte_eth_dev_adjust_nb_rx_tx_desc(port_id, &nb_rxd, &nb_txd); + if (ret < 0) + rte_exit(EXIT_FAILURE, "Could not adjust number of descriptors " + "for port%u (%d)\n", (unsigned int)port_id, +@@ -825,6 +826,16 @@ kni_change_mtu(uint16_t port_id, unsigned int new_mtu) + return ret; + } + ++ txq_conf = dev_info.default_txconf; ++ txq_conf.offloads = conf.txmode.offloads; ++ ret = rte_eth_tx_queue_setup(port_id, 0, nb_txd, ++ rte_eth_dev_socket_id(port_id), &txq_conf); ++ if (ret < 0) { ++ RTE_LOG(ERR, APP, "Fail to setup Tx queue of port %d\n", ++ port_id); ++ return ret; ++ } ++ + /* Restart specific port */ + ret = rte_eth_dev_start(port_id); + if (ret < 0) { +@@ -835,6 +846,19 @@ kni_change_mtu(uint16_t port_id, unsigned int new_mtu) + return 0; + } + ++/* Callback for request of changing MTU */ ++static int ++kni_change_mtu(uint16_t port_id, unsigned int new_mtu) ++{ ++ int ret; ++ ++ rte_atomic32_inc(&kni_pause); ++ ret = kni_change_mtu_(port_id, new_mtu); ++ rte_atomic32_dec(&kni_pause); ++ ++ return ret; ++} ++ + /* Callback for request of configuring network interface up/down */ + static int + kni_config_network_interface(uint16_t port_id, uint8_t if_up) +diff --git a/dpdk/examples/l2fwd-crypto/main.c b/dpdk/examples/l2fwd-crypto/main.c +index 61d78295d4..fcb55c370a 100644 +--- a/dpdk/examples/l2fwd-crypto/main.c ++++ b/dpdk/examples/l2fwd-crypto/main.c +@@ -1756,7 +1756,7 @@ check_all_ports_link_status(uint32_t port_mask) + "Port%d Link Up. Speed %u Mbps - %s\n", + portid, link.link_speed, + (link.link_duplex == ETH_LINK_FULL_DUPLEX) ? 
+- ("full-duplex") : ("half-duplex\n")); ++ ("full-duplex") : ("half-duplex")); + else + printf("Port %d Link Down\n", portid); + continue; +diff --git a/dpdk/examples/l2fwd-event/l2fwd_common.c b/dpdk/examples/l2fwd-event/l2fwd_common.c +index 181301fe6b..ab341e55b2 100644 +--- a/dpdk/examples/l2fwd-event/l2fwd_common.c ++++ b/dpdk/examples/l2fwd-event/l2fwd_common.c +@@ -50,6 +50,17 @@ l2fwd_event_init_ports(struct l2fwd_resources *rsrc) + if (ret != 0) + rte_panic("Error during getting device (port %u) info: %s\n", + port_id, strerror(-ret)); ++ local_port_conf.rx_adv_conf.rss_conf.rss_hf &= ++ dev_info.flow_type_rss_offloads; ++ if (local_port_conf.rx_adv_conf.rss_conf.rss_hf != ++ port_conf.rx_adv_conf.rss_conf.rss_hf) { ++ printf("Port %u modified RSS hash function based on hardware support," ++ "requested:%#"PRIx64" configured:%#"PRIx64"", ++ port_id, ++ port_conf.rx_adv_conf.rss_conf.rss_hf, ++ local_port_conf.rx_adv_conf.rss_conf.rss_hf); ++ } ++ + if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE) + local_port_conf.txmode.offloads |= + DEV_TX_OFFLOAD_MBUF_FAST_FREE; +diff --git a/dpdk/examples/l2fwd-event/l2fwd_event.c b/dpdk/examples/l2fwd-event/l2fwd_event.c +index 0379c580d6..38d590c14c 100644 +--- a/dpdk/examples/l2fwd-event/l2fwd_event.c ++++ b/dpdk/examples/l2fwd-event/l2fwd_event.c +@@ -67,7 +67,7 @@ l2fwd_event_service_setup(struct l2fwd_resources *rsrc) + int ret, i; + + rte_event_dev_info_get(evt_rsrc->event_d_id, &evdev_info); +- if (evdev_info.event_dev_cap & RTE_EVENT_DEV_CAP_DISTRIBUTED_SCHED) { ++ if (!(evdev_info.event_dev_cap & RTE_EVENT_DEV_CAP_DISTRIBUTED_SCHED)) { + ret = rte_event_dev_service_id_get(evt_rsrc->event_d_id, + &service_id); + if (ret != -ESRCH && ret != 0) +diff --git a/dpdk/examples/l2fwd-event/l2fwd_event_generic.c b/dpdk/examples/l2fwd-event/l2fwd_event_generic.c +index b7e467c1e1..2dc95e5f7d 100644 +--- a/dpdk/examples/l2fwd-event/l2fwd_event_generic.c ++++ b/dpdk/examples/l2fwd-event/l2fwd_event_generic.c +@@ -42,8 +42,10 @@ l2fwd_event_device_setup_generic(struct l2fwd_resources *rsrc) + + /* Event device configurtion */ + rte_event_dev_info_get(event_d_id, &dev_info); +- evt_rsrc->disable_implicit_release = !!(dev_info.event_dev_cap & +- RTE_EVENT_DEV_CAP_IMPLICIT_RELEASE_DISABLE); ++ ++ /* Enable implicit release */ ++ if (dev_info.event_dev_cap & RTE_EVENT_DEV_CAP_IMPLICIT_RELEASE_DISABLE) ++ evt_rsrc->disable_implicit_release = 0; + + if (dev_info.event_dev_cap & RTE_EVENT_DEV_CAP_QUEUE_ALL_TYPES) + event_queue_cfg |= RTE_EVENT_QUEUE_CFG_ALL_TYPES; +@@ -70,7 +72,8 @@ l2fwd_event_device_setup_generic(struct l2fwd_resources *rsrc) + event_d_conf.nb_event_port_enqueue_depth = + dev_info.max_event_port_enqueue_depth; + +- num_workers = rte_lcore_count() - rte_service_lcore_count(); ++ /* Ignore Master core and service cores. 
*/ ++ num_workers = rte_lcore_count() - 1 - rte_service_lcore_count(); + if (dev_info.max_event_ports < num_workers) + num_workers = dev_info.max_event_ports; + +@@ -109,7 +112,9 @@ l2fwd_event_port_setup_generic(struct l2fwd_resources *rsrc) + rte_panic("No space is available\n"); + + memset(&def_p_conf, 0, sizeof(struct rte_event_port_conf)); +- rte_event_port_default_conf_get(event_d_id, 0, &def_p_conf); ++ ret = rte_event_port_default_conf_get(event_d_id, 0, &def_p_conf); ++ if (ret < 0) ++ rte_panic("Error to get default configuration of event port\n"); + + if (def_p_conf.new_event_threshold < event_p_conf.new_event_threshold) + event_p_conf.new_event_threshold = +@@ -170,7 +175,10 @@ l2fwd_event_queue_setup_generic(struct l2fwd_resources *rsrc, + if (!evt_rsrc->evq.event_q_id) + rte_panic("Memory allocation failure\n"); + +- rte_event_queue_default_conf_get(event_d_id, 0, &def_q_conf); ++ ret = rte_event_queue_default_conf_get(event_d_id, 0, &def_q_conf); ++ if (ret < 0) ++ rte_panic("Error to get default config of event queue\n"); ++ + if (def_q_conf.nb_atomic_flows < event_q_conf.nb_atomic_flows) + event_q_conf.nb_atomic_flows = def_q_conf.nb_atomic_flows; + +diff --git a/dpdk/examples/l2fwd-event/l2fwd_event_internal_port.c b/dpdk/examples/l2fwd-event/l2fwd_event_internal_port.c +index b382763dd9..63d57b46c2 100644 +--- a/dpdk/examples/l2fwd-event/l2fwd_event_internal_port.c ++++ b/dpdk/examples/l2fwd-event/l2fwd_event_internal_port.c +@@ -27,7 +27,6 @@ l2fwd_event_device_setup_internal_port(struct l2fwd_resources *rsrc) + .nb_event_port_enqueue_depth = 128 + }; + struct rte_event_dev_info dev_info; +- uint8_t disable_implicit_release; + const uint8_t event_d_id = 0; /* Always use first event device only */ + uint32_t event_queue_cfg = 0; + uint16_t ethdev_count = 0; +@@ -44,10 +43,9 @@ l2fwd_event_device_setup_internal_port(struct l2fwd_resources *rsrc) + /* Event device configurtion */ + rte_event_dev_info_get(event_d_id, &dev_info); + +- disable_implicit_release = !!(dev_info.event_dev_cap & +- RTE_EVENT_DEV_CAP_IMPLICIT_RELEASE_DISABLE); +- evt_rsrc->disable_implicit_release = +- disable_implicit_release; ++ /* Enable implicit release */ ++ if (dev_info.event_dev_cap & RTE_EVENT_DEV_CAP_IMPLICIT_RELEASE_DISABLE) ++ evt_rsrc->disable_implicit_release = 0; + + if (dev_info.event_dev_cap & RTE_EVENT_DEV_CAP_QUEUE_ALL_TYPES) + event_queue_cfg |= RTE_EVENT_QUEUE_CFG_ALL_TYPES; +@@ -73,7 +71,8 @@ l2fwd_event_device_setup_internal_port(struct l2fwd_resources *rsrc) + event_d_conf.nb_event_port_enqueue_depth = + dev_info.max_event_port_enqueue_depth; + +- num_workers = rte_lcore_count(); ++ /* Ignore Master core. 
*/ ++ num_workers = rte_lcore_count() - 1; + if (dev_info.max_event_ports < num_workers) + num_workers = dev_info.max_event_ports; + +@@ -110,7 +109,10 @@ l2fwd_event_port_setup_internal_port(struct l2fwd_resources *rsrc) + if (!evt_rsrc->evp.event_p_id) + rte_panic("Failed to allocate memory for Event Ports\n"); + +- rte_event_port_default_conf_get(event_d_id, 0, &def_p_conf); ++ ret = rte_event_port_default_conf_get(event_d_id, 0, &def_p_conf); ++ if (ret < 0) ++ rte_panic("Error to get default configuration of event port\n"); ++ + if (def_p_conf.new_event_threshold < event_p_conf.new_event_threshold) + event_p_conf.new_event_threshold = + def_p_conf.new_event_threshold; +@@ -162,7 +164,10 @@ l2fwd_event_queue_setup_internal_port(struct l2fwd_resources *rsrc, + uint8_t event_q_id = 0; + int32_t ret; + +- rte_event_queue_default_conf_get(event_d_id, event_q_id, &def_q_conf); ++ ret = rte_event_queue_default_conf_get(event_d_id, event_q_id, ++ &def_q_conf); ++ if (ret < 0) ++ rte_panic("Error to get default config of event queue\n"); + + if (def_q_conf.nb_atomic_flows < event_q_conf.nb_atomic_flows) + event_q_conf.nb_atomic_flows = def_q_conf.nb_atomic_flows; +diff --git a/dpdk/examples/l2fwd-event/l2fwd_poll.c b/dpdk/examples/l2fwd-event/l2fwd_poll.c +index a3a3835582..2033c65e54 100644 +--- a/dpdk/examples/l2fwd-event/l2fwd_poll.c ++++ b/dpdk/examples/l2fwd-event/l2fwd_poll.c +@@ -116,6 +116,7 @@ l2fwd_poll_lcore_config(struct l2fwd_resources *rsrc) + + /* get the lcore_id for this port */ + while (rte_lcore_is_enabled(rx_lcore_id) == 0 || ++ rx_lcore_id == rte_get_master_lcore() || + poll_rsrc->lcore_queue_conf[rx_lcore_id].n_rx_port == + rsrc->rx_queue_per_lcore) { + rx_lcore_id++; +diff --git a/dpdk/examples/l2fwd-event/main.c b/dpdk/examples/l2fwd-event/main.c +index 89a6bb9a44..384b71238f 100644 +--- a/dpdk/examples/l2fwd-event/main.c ++++ b/dpdk/examples/l2fwd-event/main.c +@@ -263,7 +263,7 @@ check_all_ports_link_status(struct l2fwd_resources *rsrc, + "Port%d Link Up. Speed %u Mbps - %s\n", + port_id, link.link_speed, + (link.link_duplex == ETH_LINK_FULL_DUPLEX) ? +- ("full-duplex") : ("half-duplex\n")); ++ ("full-duplex") : ("half-duplex")); + else + printf("Port %d Link Down\n", port_id); + continue; +diff --git a/dpdk/examples/l2fwd-jobstats/main.c b/dpdk/examples/l2fwd-jobstats/main.c +index f975aa12d0..e0255080e2 100644 +--- a/dpdk/examples/l2fwd-jobstats/main.c ++++ b/dpdk/examples/l2fwd-jobstats/main.c +@@ -710,7 +710,7 @@ check_all_ports_link_status(uint32_t port_mask) + "Port%d Link Up. Speed %u Mbps - %s\n", + portid, link.link_speed, + (link.link_duplex == ETH_LINK_FULL_DUPLEX) ? +- ("full-duplex") : ("half-duplex\n")); ++ ("full-duplex") : ("half-duplex")); + else + printf("Port %d Link Down\n", portid); + continue; +diff --git a/dpdk/examples/l2fwd-keepalive/main.c b/dpdk/examples/l2fwd-keepalive/main.c +index b36834974e..3d59e2ca90 100644 +--- a/dpdk/examples/l2fwd-keepalive/main.c ++++ b/dpdk/examples/l2fwd-keepalive/main.c +@@ -44,7 +44,7 @@ + + #define RTE_LOGTYPE_L2FWD RTE_LOGTYPE_USER1 + +-#define NB_MBUF 8192 ++#define NB_MBUF_PER_PORT 3000 + + #define MAX_PKT_BURST 32 + #define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */ +@@ -475,7 +475,7 @@ check_all_ports_link_status(uint32_t port_mask) + "Port%d Link Up. Speed %u Mbps - %s\n", + portid, link.link_speed, + (link.link_duplex == ETH_LINK_FULL_DUPLEX) ? 
+- ("full-duplex") : ("half-duplex\n")); ++ ("full-duplex") : ("half-duplex")); + else + printf("Port %d Link Down\n", portid); + continue; +@@ -536,6 +536,7 @@ main(int argc, char **argv) + uint16_t portid, last_port; + unsigned lcore_id, rx_lcore_id; + unsigned nb_ports_in_mask = 0; ++ unsigned int total_nb_mbufs; + struct sigaction signal_handler; + struct rte_keepalive_shm *ka_shm; + +@@ -561,16 +562,19 @@ main(int argc, char **argv) + if (ret < 0) + rte_exit(EXIT_FAILURE, "Invalid L2FWD arguments\n"); + +- /* create the mbuf pool */ +- l2fwd_pktmbuf_pool = rte_pktmbuf_pool_create("mbuf_pool", NB_MBUF, 32, +- 0, RTE_MBUF_DEFAULT_BUF_SIZE, rte_socket_id()); +- if (l2fwd_pktmbuf_pool == NULL) +- rte_exit(EXIT_FAILURE, "Cannot init mbuf pool\n"); +- + nb_ports = rte_eth_dev_count_avail(); + if (nb_ports == 0) + rte_exit(EXIT_FAILURE, "No Ethernet ports - bye\n"); + ++ /* create the mbuf pool */ ++ total_nb_mbufs = NB_MBUF_PER_PORT * nb_ports; ++ ++ l2fwd_pktmbuf_pool = rte_pktmbuf_pool_create("mbuf_pool", ++ total_nb_mbufs, 32, 0, RTE_MBUF_DEFAULT_BUF_SIZE, ++ rte_socket_id()); ++ if (l2fwd_pktmbuf_pool == NULL) ++ rte_exit(EXIT_FAILURE, "Cannot init mbuf pool\n"); ++ + /* reset l2fwd_dst_ports */ + for (portid = 0; portid < RTE_MAX_ETHPORTS; portid++) + l2fwd_dst_ports[portid] = 0; +diff --git a/dpdk/examples/l2fwd/main.c b/dpdk/examples/l2fwd/main.c +index 09257aab1c..fcef232731 100644 +--- a/dpdk/examples/l2fwd/main.c ++++ b/dpdk/examples/l2fwd/main.c +@@ -478,7 +478,7 @@ check_all_ports_link_status(uint32_t port_mask) + "Port%d Link Up. Speed %u Mbps - %s\n", + portid, link.link_speed, + (link.link_duplex == ETH_LINK_FULL_DUPLEX) ? +- ("full-duplex") : ("half-duplex\n")); ++ ("full-duplex") : ("half-duplex")); + else + printf("Port %d Link Down\n", portid); + continue; +diff --git a/dpdk/examples/l3fwd-acl/main.c b/dpdk/examples/l3fwd-acl/main.c +index fa92a28297..cfbeee962b 100644 +--- a/dpdk/examples/l3fwd-acl/main.c ++++ b/dpdk/examples/l3fwd-acl/main.c +@@ -1839,7 +1839,7 @@ check_all_ports_link_status(uint32_t port_mask) + "Port%d Link Up. Speed %u Mbps %s\n", + portid, link.link_speed, + (link.link_duplex == ETH_LINK_FULL_DUPLEX) ? 
+- ("full-duplex") : ("half-duplex\n")); ++ ("full-duplex") : ("half-duplex")); + else + printf("Port %d Link Down\n", portid); + continue; +diff --git a/dpdk/examples/l3fwd-power/main.c b/dpdk/examples/l3fwd-power/main.c +index d049d8a5dc..aa6ff2627b 100644 +--- a/dpdk/examples/l3fwd-power/main.c ++++ b/dpdk/examples/l3fwd-power/main.c +@@ -880,9 +880,6 @@ sleep_until_rx_interrupt(int num) + port_id = ((uintptr_t)data) >> CHAR_BIT; + queue_id = ((uintptr_t)data) & + RTE_LEN2MASK(CHAR_BIT, uint8_t); +- rte_spinlock_lock(&(locks[port_id])); +- rte_eth_dev_rx_intr_disable(port_id, queue_id); +- rte_spinlock_unlock(&(locks[port_id])); + RTE_LOG(INFO, L3FWD_POWER, + "lcore %u is waked up from rx interrupt on" + " port %d queue %d\n", +@@ -892,7 +889,7 @@ sleep_until_rx_interrupt(int num) + return 0; + } + +-static void turn_on_intr(struct lcore_conf *qconf) ++static void turn_on_off_intr(struct lcore_conf *qconf, bool on) + { + int i; + struct lcore_rx_queue *rx_queue; +@@ -905,7 +902,10 @@ static void turn_on_intr(struct lcore_conf *qconf) + queue_id = rx_queue->queue_id; + + rte_spinlock_lock(&(locks[port_id])); +- rte_eth_dev_rx_intr_enable(port_id, queue_id); ++ if (on) ++ rte_eth_dev_rx_intr_enable(port_id, queue_id); ++ else ++ rte_eth_dev_rx_intr_disable(port_id, queue_id); + rte_spinlock_unlock(&(locks[port_id])); + } + } +@@ -1338,11 +1338,12 @@ main_loop(__attribute__((unused)) void *dummy) + */ + rte_delay_us(lcore_idle_hint); + else { +- /* suspend until rx interrupt trigges */ ++ /* suspend until rx interrupt triggers */ + if (intr_en) { +- turn_on_intr(qconf); ++ turn_on_off_intr(qconf, 1); + sleep_until_rx_interrupt( + qconf->n_rx_queue); ++ turn_on_off_intr(qconf, 0); + /** + * start receiving packets immediately + */ +@@ -1997,7 +1998,7 @@ check_all_ports_link_status(uint32_t port_mask) + "Mbps - %s\n", (uint8_t)portid, + (unsigned)link.link_speed, + (link.link_duplex == ETH_LINK_FULL_DUPLEX) ? +- ("full-duplex") : ("half-duplex\n")); ++ ("full-duplex") : ("half-duplex")); + else + printf("Port %d Link Down\n", + (uint8_t)portid); +diff --git a/dpdk/examples/l3fwd/main.c b/dpdk/examples/l3fwd/main.c +index 4dea12a653..3a8ec5a7f2 100644 +--- a/dpdk/examples/l3fwd/main.c ++++ b/dpdk/examples/l3fwd/main.c +@@ -747,7 +747,7 @@ check_all_ports_link_status(uint32_t port_mask) + "Port%d Link Up. Speed %u Mbps -%s\n", + portid, link.link_speed, + (link.link_duplex == ETH_LINK_FULL_DUPLEX) ? +- ("full-duplex") : ("half-duplex\n")); ++ ("full-duplex") : ("half-duplex")); + else + printf("Port %d Link Down\n", portid); + continue; +diff --git a/dpdk/examples/link_status_interrupt/main.c b/dpdk/examples/link_status_interrupt/main.c +index a924aa2313..72f86e502f 100644 +--- a/dpdk/examples/link_status_interrupt/main.c ++++ b/dpdk/examples/link_status_interrupt/main.c +@@ -500,7 +500,7 @@ check_all_ports_link_status(uint16_t port_num, uint32_t port_mask) + "Port%d Link Up. Speed %u Mbps - %s\n", + portid, link.link_speed, + (link.link_duplex == ETH_LINK_FULL_DUPLEX) ? 
+- ("full-duplex") : ("half-duplex\n")); ++ ("full-duplex") : ("half-duplex")); + else + printf("Port %d Link Down\n", portid); + continue; +diff --git a/dpdk/examples/multi_process/client_server_mp/mp_server/init.c b/dpdk/examples/multi_process/client_server_mp/mp_server/init.c +index ad9f46f0aa..c2ec07ac65 100644 +--- a/dpdk/examples/multi_process/client_server_mp/mp_server/init.c ++++ b/dpdk/examples/multi_process/client_server_mp/mp_server/init.c +@@ -209,7 +209,7 @@ check_all_ports_link_status(uint16_t port_num, uint32_t port_mask) + "Mbps - %s\n", ports->id[portid], + (unsigned)link.link_speed, + (link.link_duplex == ETH_LINK_FULL_DUPLEX) ? +- ("full-duplex") : ("half-duplex\n")); ++ ("full-duplex") : ("half-duplex")); + else + printf("Port %d Link Down\n", + (uint8_t)ports->id[portid]); +diff --git a/dpdk/examples/multi_process/symmetric_mp/main.c b/dpdk/examples/multi_process/symmetric_mp/main.c +index 7f491452a7..c5cd8825e5 100644 +--- a/dpdk/examples/multi_process/symmetric_mp/main.c ++++ b/dpdk/examples/multi_process/symmetric_mp/main.c +@@ -389,7 +389,7 @@ check_all_ports_link_status(uint16_t port_num, uint32_t port_mask) + "Port%d Link Up. Speed %u Mbps - %s\n", + portid, link.link_speed, + (link.link_duplex == ETH_LINK_FULL_DUPLEX) ? +- ("full-duplex") : ("half-duplex\n")); ++ ("full-duplex") : ("half-duplex")); + else + printf("Port %d Link Down\n", portid); + continue; +diff --git a/dpdk/examples/ntb/ntb_fwd.c b/dpdk/examples/ntb/ntb_fwd.c +index c914256dd4..17eedcf0b8 100644 +--- a/dpdk/examples/ntb/ntb_fwd.c ++++ b/dpdk/examples/ntb/ntb_fwd.c +@@ -19,6 +19,7 @@ + #include + #include + #include ++#include + + /* Per-port statistics struct */ + struct ntb_port_statistics { +@@ -1256,6 +1257,11 @@ ntb_mbuf_pool_create(uint16_t mbuf_seg_size, uint32_t nb_mbuf, + if (mp == NULL) + return NULL; + ++ if (rte_mempool_set_ops_byname(mp, rte_mbuf_best_mempool_ops(), NULL)) { ++ printf("error setting mempool handler\n"); ++ goto fail; ++ } ++ + memset(&mbp_priv, 0, sizeof(mbp_priv)); + mbp_priv.mbuf_data_room_size = mbuf_seg_size; + mbp_priv.mbuf_priv_size = 0; +diff --git a/dpdk/examples/performance-thread/l3fwd-thread/main.c b/dpdk/examples/performance-thread/l3fwd-thread/main.c +index ad540fd842..f58a70b77f 100644 +--- a/dpdk/examples/performance-thread/l3fwd-thread/main.c ++++ b/dpdk/examples/performance-thread/l3fwd-thread/main.c +@@ -3457,7 +3457,7 @@ check_all_ports_link_status(uint32_t port_mask) + "Port%d Link Up. Speed %u Mbps - %s\n", + portid, link.link_speed, + (link.link_duplex == ETH_LINK_FULL_DUPLEX) ? 
+- ("full-duplex") : ("half-duplex\n")); ++ ("full-duplex") : ("half-duplex")); + else + printf("Port %d Link Down\n", portid); + continue; +diff --git a/dpdk/examples/qos_sched/cfg_file.c b/dpdk/examples/qos_sched/cfg_file.c +index 5714c3f36d..f078e4f7de 100644 +--- a/dpdk/examples/qos_sched/cfg_file.c ++++ b/dpdk/examples/qos_sched/cfg_file.c +@@ -20,6 +20,9 @@ + * for new entries do we add in */ + #define CFG_ALLOC_ENTRY_BATCH 16 + ++uint32_t active_queues[RTE_SCHED_QUEUES_PER_PIPE]; ++uint32_t n_active_queues; ++ + int + cfg_load_port(struct rte_cfgfile *cfg, struct rte_sched_port_params *port_params) + { +diff --git a/dpdk/examples/qos_sched/init.c b/dpdk/examples/qos_sched/init.c +index 0a17e0d4d5..9626c15b81 100644 +--- a/dpdk/examples/qos_sched/init.c ++++ b/dpdk/examples/qos_sched/init.c +@@ -164,7 +164,7 @@ app_init_port(uint16_t portid, struct rte_mempool *mp) + printf(" Link Up - speed %u Mbps - %s\n", + (uint32_t) link.link_speed, + (link.link_duplex == ETH_LINK_FULL_DUPLEX) ? +- ("full-duplex") : ("half-duplex\n")); ++ ("full-duplex") : ("half-duplex")); + } else { + printf(" Link Down\n"); + } +diff --git a/dpdk/examples/qos_sched/main.h b/dpdk/examples/qos_sched/main.h +index baa2b3eadc..23bc418d97 100644 +--- a/dpdk/examples/qos_sched/main.h ++++ b/dpdk/examples/qos_sched/main.h +@@ -148,8 +148,8 @@ extern struct burst_conf burst_conf; + extern struct ring_thresh rx_thresh; + extern struct ring_thresh tx_thresh; + +-uint32_t active_queues[RTE_SCHED_QUEUES_PER_PIPE]; +-uint32_t n_active_queues; ++extern uint32_t active_queues[RTE_SCHED_QUEUES_PER_PIPE]; ++extern uint32_t n_active_queues; + + extern struct rte_sched_port_params port_params; + extern struct rte_sched_subport_params subport_params[MAX_SCHED_SUBPORTS]; +diff --git a/dpdk/examples/server_node_efd/server/init.c b/dpdk/examples/server_node_efd/server/init.c +index 00e2e40599..378a74fa5c 100644 +--- a/dpdk/examples/server_node_efd/server/init.c ++++ b/dpdk/examples/server_node_efd/server/init.c +@@ -272,7 +272,7 @@ check_all_ports_link_status(uint16_t port_num, uint32_t port_mask) + info->id[portid], + link.link_speed, + (link.link_duplex == ETH_LINK_FULL_DUPLEX) ? +- ("full-duplex") : ("half-duplex\n")); ++ ("full-duplex") : ("half-duplex")); + else + printf("Port %d Link Down\n", + info->id[portid]); +diff --git a/dpdk/examples/tep_termination/vxlan_setup.c b/dpdk/examples/tep_termination/vxlan_setup.c +index eca119a728..4b44ccc143 100644 +--- a/dpdk/examples/tep_termination/vxlan_setup.c ++++ b/dpdk/examples/tep_termination/vxlan_setup.c +@@ -194,8 +194,6 @@ vxlan_port_init(uint16_t port, struct rte_mempool *mbuf_pool) + ports_eth_addr[port].addr_bytes[5]); + + if (tso_segsz != 0) { +- struct rte_eth_dev_info dev_info; +- rte_eth_dev_info_get(port, &dev_info); + if ((dev_info.tx_offload_capa & DEV_TX_OFFLOAD_TCP_TSO) == 0) + RTE_LOG(WARNING, PORT, + "hardware TSO offload is not supported\n"); +diff --git a/dpdk/examples/vhost_blk/vhost_blk.c b/dpdk/examples/vhost_blk/vhost_blk.c +index 3182a488bb..b757c9228b 100644 +--- a/dpdk/examples/vhost_blk/vhost_blk.c ++++ b/dpdk/examples/vhost_blk/vhost_blk.c +@@ -31,6 +31,8 @@ + (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) | \ + (1ULL << VHOST_USER_F_PROTOCOL_FEATURES)) + ++struct vhost_blk_ctrlr *g_vhost_ctrlr; ++ + /* Path to folder where character device will be created. Can be set by user. 
*/ + static char dev_pathname[PATH_MAX] = ""; + static sem_t exit_sem; +@@ -856,6 +858,7 @@ new_device(int vid) + ctrlr->bdev->vid, i, + &blk_vq->last_avail_idx, + &blk_vq->last_used_idx); ++ assert(ret == 0); + + blk_vq->avail_wrap_counter = blk_vq->last_avail_idx & + (1 << 15); +@@ -993,11 +996,7 @@ vhost_blk_ctrlr_construct(const char *ctrlr_name) + } + snprintf(dev_pathname, sizeof(dev_pathname), "%s/%s", path, ctrlr_name); + +- if (access(dev_pathname, F_OK) != -1) { +- if (unlink(dev_pathname) != 0) +- rte_exit(EXIT_FAILURE, "Cannot remove %s.\n", +- dev_pathname); +- } ++ unlink(dev_pathname); + + if (rte_vhost_driver_register(dev_pathname, 0) != 0) { + fprintf(stderr, "socket %s already exists\n", dev_pathname); +@@ -1040,8 +1039,7 @@ signal_handler(__rte_unused int signum) + { + struct vhost_blk_ctrlr *ctrlr; + +- if (access(dev_pathname, F_OK) == 0) +- unlink(dev_pathname); ++ unlink(dev_pathname); + + if (g_should_stop != -1) { + g_should_stop = 1; +diff --git a/dpdk/examples/vhost_blk/vhost_blk.h b/dpdk/examples/vhost_blk/vhost_blk.h +index 933e2b7c57..17258d284b 100644 +--- a/dpdk/examples/vhost_blk/vhost_blk.h ++++ b/dpdk/examples/vhost_blk/vhost_blk.h +@@ -112,8 +112,8 @@ struct inflight_blk_task { + struct rte_vhost_inflight_info_packed *inflight_packed; + }; + +-struct vhost_blk_ctrlr *g_vhost_ctrlr; +-struct vhost_device_ops vhost_blk_device_ops; ++extern struct vhost_blk_ctrlr *g_vhost_ctrlr; ++extern struct vhost_device_ops vhost_blk_device_ops; + + int vhost_bdev_process_blk_commands(struct vhost_block_dev *bdev, + struct vhost_blk_task *task); +diff --git a/dpdk/examples/vm_power_manager/channel_manager.c b/dpdk/examples/vm_power_manager/channel_manager.c +index 4ac21f02c1..74a2a677e8 100644 +--- a/dpdk/examples/vm_power_manager/channel_manager.c ++++ b/dpdk/examples/vm_power_manager/channel_manager.c +@@ -4,7 +4,6 @@ + + #include + #include +-#include + #include + #include + #include +@@ -35,6 +34,8 @@ + + #define RTE_LOGTYPE_CHANNEL_MANAGER RTE_LOGTYPE_USER1 + ++struct libvirt_vm_info lvm_info[MAX_CLIENTS]; ++ + /* Global pointer to libvirt connection */ + static virConnectPtr global_vir_conn_ptr; + +diff --git a/dpdk/examples/vm_power_manager/channel_manager.h b/dpdk/examples/vm_power_manager/channel_manager.h +index 8284be0a18..e55376fcdb 100644 +--- a/dpdk/examples/vm_power_manager/channel_manager.h ++++ b/dpdk/examples/vm_power_manager/channel_manager.h +@@ -10,7 +10,7 @@ extern "C" { + #endif + + #include +-#include ++#include + #include + #include + +@@ -26,11 +26,6 @@ extern "C" { + /* FIFO file name template */ + #define CHANNEL_MGR_FIFO_PATTERN_NAME "fifo" + +-#ifndef UNIX_PATH_MAX +-struct sockaddr_un _sockaddr_un; +-#define UNIX_PATH_MAX sizeof(_sockaddr_un.sun_path) +-#endif +- + #define MAX_CLIENTS 64 + #define MAX_VCPUS 20 + +@@ -41,7 +36,7 @@ struct libvirt_vm_info { + uint8_t num_cpus; + }; + +-struct libvirt_vm_info lvm_info[MAX_CLIENTS]; ++extern struct libvirt_vm_info lvm_info[MAX_CLIENTS]; + /* Communication Channel Status */ + enum channel_status { CHANNEL_MGR_CHANNEL_DISCONNECTED = 0, + CHANNEL_MGR_CHANNEL_CONNECTED, +diff --git a/dpdk/examples/vm_power_manager/channel_monitor.c b/dpdk/examples/vm_power_manager/channel_monitor.c +index 090c2a98b0..1d00a6cf6c 100644 +--- a/dpdk/examples/vm_power_manager/channel_monitor.c ++++ b/dpdk/examples/vm_power_manager/channel_monitor.c +@@ -868,7 +868,7 @@ process_request(struct channel_packet *pkt, struct channel_info *chan_info) + if (valid_unit) { + ret = send_ack_for_received_cmd(pkt, + chan_info, +- 
scale_res > 0 ? ++ scale_res >= 0 ? + CPU_POWER_CMD_ACK : + CPU_POWER_CMD_NACK); + if (ret < 0) +diff --git a/dpdk/examples/vm_power_manager/main.c b/dpdk/examples/vm_power_manager/main.c +index d39f044c1e..0409a832b5 100644 +--- a/dpdk/examples/vm_power_manager/main.c ++++ b/dpdk/examples/vm_power_manager/main.c +@@ -272,7 +272,7 @@ check_all_ports_link_status(uint32_t port_mask) + "Mbps - %s\n", (uint16_t)portid, + (unsigned int)link.link_speed, + (link.link_duplex == ETH_LINK_FULL_DUPLEX) ? +- ("full-duplex") : ("half-duplex\n")); ++ ("full-duplex") : ("half-duplex")); + else + printf("Port %d Link Down\n", + (uint16_t)portid); +diff --git a/dpdk/examples/vm_power_manager/power_manager.c b/dpdk/examples/vm_power_manager/power_manager.c +index 7b4f4b3c4d..cd51d4741f 100644 +--- a/dpdk/examples/vm_power_manager/power_manager.c ++++ b/dpdk/examples/vm_power_manager/power_manager.c +@@ -6,7 +6,6 @@ + #include + #include + #include +-#include + #include + #include + #include +diff --git a/dpdk/examples/vmdq/main.c b/dpdk/examples/vmdq/main.c +index 6e6fc91ec0..b082bc8c1c 100644 +--- a/dpdk/examples/vmdq/main.c ++++ b/dpdk/examples/vmdq/main.c +@@ -59,6 +59,7 @@ static uint32_t enabled_port_mask; + /* number of pools (if user does not specify any, 8 by default */ + static uint32_t num_queues = 8; + static uint32_t num_pools = 8; ++static uint8_t rss_enable; + + /* empty vmdq configuration structure. Filled in programatically */ + static const struct rte_eth_conf vmdq_conf_default = { +@@ -143,6 +144,13 @@ get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_pools) + (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf))); + (void)(rte_memcpy(ð_conf->rx_adv_conf.vmdq_rx_conf, &conf, + sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf))); ++ if (rss_enable) { ++ eth_conf->rxmode.mq_mode = ETH_MQ_RX_VMDQ_RSS; ++ eth_conf->rx_adv_conf.rss_conf.rss_hf = ETH_RSS_IP | ++ ETH_RSS_UDP | ++ ETH_RSS_TCP | ++ ETH_RSS_SCTP; ++ } + return 0; + } + +@@ -164,6 +172,7 @@ port_init(uint16_t port, struct rte_mempool *mbuf_pool) + uint16_t q; + uint16_t queues_per_pool; + uint32_t max_nb_pools; ++ uint64_t rss_hf_tmp; + + /* + * The max pool number from dev_info will be used to validate the pool +@@ -209,6 +218,17 @@ port_init(uint16_t port, struct rte_mempool *mbuf_pool) + if (!rte_eth_dev_is_valid_port(port)) + return -1; + ++ rss_hf_tmp = port_conf.rx_adv_conf.rss_conf.rss_hf; ++ port_conf.rx_adv_conf.rss_conf.rss_hf &= ++ dev_info.flow_type_rss_offloads; ++ if (port_conf.rx_adv_conf.rss_conf.rss_hf != rss_hf_tmp) { ++ printf("Port %u modified RSS hash function based on hardware support," ++ "requested:%#"PRIx64" configured:%#"PRIx64"\n", ++ port, ++ rss_hf_tmp, ++ port_conf.rx_adv_conf.rss_conf.rss_hf); ++ } ++ + /* + * Though in this example, we only receive packets from the first queue + * of each pool and send packets through first rte_lcore_count() tx +@@ -363,7 +383,8 @@ static void + vmdq_usage(const char *prgname) + { + printf("%s [EAL options] -- -p PORTMASK]\n" +- " --nb-pools NP: number of pools\n", ++ " --nb-pools NP: number of pools\n" ++ " --enable-rss: enable RSS (disabled by default)\n", + prgname); + } + +@@ -377,6 +398,7 @@ vmdq_parse_args(int argc, char **argv) + const char *prgname = argv[0]; + static struct option long_option[] = { + {"nb-pools", required_argument, NULL, 0}, ++ {"enable-rss", 0, NULL, 0}, + {NULL, 0, 0, 0} + }; + +@@ -394,11 +416,18 @@ vmdq_parse_args(int argc, char **argv) + } + break; + case 0: +- if (vmdq_parse_num_pools(optarg) == -1) { +- printf("invalid 
number of pools\n"); +- vmdq_usage(prgname); +- return -1; ++ if (!strcmp(long_option[option_index].name, ++ "nb-pools")) { ++ if (vmdq_parse_num_pools(optarg) == -1) { ++ printf("invalid number of pools\n"); ++ vmdq_usage(prgname); ++ return -1; ++ } + } ++ ++ if (!strcmp(long_option[option_index].name, ++ "enable-rss")) ++ rss_enable = 1; + break; + + default: +@@ -441,10 +470,11 @@ update_mac_address(struct rte_mbuf *m, unsigned dst_port) + static void + sighup_handler(int signum) + { +- unsigned q; +- for (q = 0; q < num_queues; q++) { +- if (q % (num_queues/num_pools) == 0) +- printf("\nPool %u: ", q/(num_queues/num_pools)); ++ unsigned int q = vmdq_queue_base; ++ for (; q < num_queues; q++) { ++ if ((q - vmdq_queue_base) % (num_vmdq_queues / num_pools) == 0) ++ printf("\nPool %u: ", (q - vmdq_queue_base) / ++ (num_vmdq_queues / num_pools)); + printf("%lu ", rxPackets[q]); + } + printf("\nFinished handling signal %d\n", signum); +diff --git a/dpdk/kernel/freebsd/contigmem/contigmem.c b/dpdk/kernel/freebsd/contigmem/contigmem.c +index 64e0a7fecd..abb76f241e 100644 +--- a/dpdk/kernel/freebsd/contigmem/contigmem.c ++++ b/dpdk/kernel/freebsd/contigmem/contigmem.c +@@ -165,9 +165,11 @@ contigmem_load() + + error: + for (i = 0; i < contigmem_num_buffers; i++) { +- if (contigmem_buffers[i].addr != NULL) ++ if (contigmem_buffers[i].addr != NULL) { + contigfree(contigmem_buffers[i].addr, + contigmem_buffer_size, M_CONTIGMEM); ++ contigmem_buffers[i].addr = NULL; ++ } + if (mtx_initialized(&contigmem_buffers[i].mtx)) + mtx_destroy(&contigmem_buffers[i].mtx); + } +diff --git a/dpdk/kernel/linux/kni/compat.h b/dpdk/kernel/linux/kni/compat.h +index 7109474ec5..9ee45dbf6f 100644 +--- a/dpdk/kernel/linux/kni/compat.h ++++ b/dpdk/kernel/linux/kni/compat.h +@@ -130,3 +130,7 @@ + #if KERNEL_VERSION(4, 10, 0) <= LINUX_VERSION_CODE + #define HAVE_IOVA_TO_KVA_MAPPING_SUPPORT + #endif ++ ++#if KERNEL_VERSION(5, 6, 0) <= LINUX_VERSION_CODE ++#define HAVE_TX_TIMEOUT_TXQUEUE ++#endif +diff --git a/dpdk/kernel/linux/kni/kni_dev.h b/dpdk/kernel/linux/kni/kni_dev.h +index 5e75c6371f..ca5f92a47b 100644 +--- a/dpdk/kernel/linux/kni/kni_dev.h ++++ b/dpdk/kernel/linux/kni/kni_dev.h +@@ -32,7 +32,7 @@ + #define MBUF_BURST_SZ 32 + + /* Default carrier state for created KNI network interfaces */ +-extern uint32_t dflt_carrier; ++extern uint32_t kni_dflt_carrier; + + /** + * A structure describing the private information for a kni device. 
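A number of hunks in this patch apply one recurring fix: global variables that were defined at file scope in a shared header (fdata/cdata in eventdev_pipeline, active_queues/n_active_queues in qos_sched, g_vhost_ctrlr in vhost_blk, lvm_info in vm_power_manager, and dflt_carrier here in kni_dev.h, which is also renamed to kni_dflt_carrier) become extern declarations in the header, with exactly one definition moved into a single .c file. The usual motivation for this class of change is GCC 10 switching its default from -fcommon to -fno-common, which turns the duplicate tentative definitions emitted by every includer into multiple-definition link errors. A minimal sketch of the pattern, using illustrative file and variable names rather than anything from the patch:

    /* shared.h -- after the fix: declare the global, do not define it */
    #ifndef SHARED_H
    #define SHARED_H
    extern unsigned int shared_counter;  /* the single definition lives in shared.c */
    #endif

    /* shared.c -- the one and only definition */
    #include "shared.h"
    unsigned int shared_counter;

    /* user.c -- every other translation unit just includes the header */
    #include "shared.h"
    void bump(void) { shared_counter++; }

Under -fno-common, writing "unsigned int shared_counter;" directly in shared.h would instead emit a strong definition in both shared.o and user.o and fail at link time, which is exactly the breakage these hunks avoid.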
+diff --git a/dpdk/kernel/linux/kni/kni_misc.c b/dpdk/kernel/linux/kni/kni_misc.c +index cda71bde08..2b464c4381 100644 +--- a/dpdk/kernel/linux/kni/kni_misc.c ++++ b/dpdk/kernel/linux/kni/kni_misc.c +@@ -39,7 +39,7 @@ static uint32_t multiple_kthread_on; + + /* Default carrier state for created KNI network interfaces */ + static char *carrier; +-uint32_t dflt_carrier; ++uint32_t kni_dflt_carrier; + + #define KNI_DEV_IN_USE_BIT_NUM 0 /* Bit number for device in use */ + +@@ -554,14 +554,14 @@ static int __init + kni_parse_carrier_state(void) + { + if (!carrier) { +- dflt_carrier = 0; ++ kni_dflt_carrier = 0; + return 0; + } + + if (strcmp(carrier, "off") == 0) +- dflt_carrier = 0; ++ kni_dflt_carrier = 0; + else if (strcmp(carrier, "on") == 0) +- dflt_carrier = 1; ++ kni_dflt_carrier = 1; + else + return -1; + +@@ -588,7 +588,7 @@ kni_init(void) + return -EINVAL; + } + +- if (dflt_carrier == 0) ++ if (kni_dflt_carrier == 0) + pr_debug("Default carrier state set to off.\n"); + else + pr_debug("Default carrier state set to on.\n"); +diff --git a/dpdk/kernel/linux/kni/kni_net.c b/dpdk/kernel/linux/kni/kni_net.c +index 1ba9b1b99f..c82c881a2c 100644 +--- a/dpdk/kernel/linux/kni/kni_net.c ++++ b/dpdk/kernel/linux/kni/kni_net.c +@@ -158,7 +158,7 @@ kni_net_open(struct net_device *dev) + struct kni_dev *kni = netdev_priv(dev); + + netif_start_queue(dev); +- if (dflt_carrier == 1) ++ if (kni_dflt_carrier == 1) + netif_carrier_on(dev); + else + netif_carrier_off(dev); +@@ -623,8 +623,13 @@ kni_net_rx(struct kni_dev *kni) + /* + * Deal with a transmit timeout. + */ ++#ifdef HAVE_TX_TIMEOUT_TXQUEUE ++static void ++kni_net_tx_timeout(struct net_device *dev, unsigned int txqueue) ++#else + static void + kni_net_tx_timeout(struct net_device *dev) ++#endif + { + pr_debug("Transmit timeout at %ld, latency %ld\n", jiffies, + jiffies - dev_trans_start(dev)); +diff --git a/dpdk/kernel/linux/kni/meson.build b/dpdk/kernel/linux/kni/meson.build +index 955eec9496..f93e97fa09 100644 +--- a/dpdk/kernel/linux/kni/meson.build ++++ b/dpdk/kernel/linux/kni/meson.build +@@ -23,7 +23,6 @@ custom_target('rte_kni', + ' -I' + meson.current_source_dir(), + 'modules'], + depends: kni_mkfile, +- console: true, + install: true, + install_dir: kernel_dir + '/extra/dpdk', + build_by_default: get_option('enable_kmods')) +diff --git a/dpdk/lib/Makefile b/dpdk/lib/Makefile +index 46b91ae1a4..2cbb096f12 100644 +--- a/dpdk/lib/Makefile ++++ b/dpdk/lib/Makefile +@@ -113,7 +113,7 @@ DIRS-$(CONFIG_RTE_LIBRTE_BPF) += librte_bpf + DEPDIRS-librte_bpf := librte_eal librte_mempool librte_mbuf librte_ethdev + DIRS-$(CONFIG_RTE_LIBRTE_IPSEC) += librte_ipsec + DEPDIRS-librte_ipsec := librte_eal librte_mbuf librte_cryptodev librte_security \ +- librte_net ++ librte_net librte_hash + DIRS-$(CONFIG_RTE_LIBRTE_TELEMETRY) += librte_telemetry + DEPDIRS-librte_telemetry := librte_eal librte_metrics librte_ethdev + DIRS-$(CONFIG_RTE_LIBRTE_RCU) += librte_rcu +diff --git a/dpdk/lib/librte_acl/acl_bld.c b/dpdk/lib/librte_acl/acl_bld.c +index b06bbe9207..d1f920b09c 100644 +--- a/dpdk/lib/librte_acl/acl_bld.c ++++ b/dpdk/lib/librte_acl/acl_bld.c +@@ -778,9 +778,8 @@ acl_build_reset(struct rte_acl_ctx *ctx) + } + + static void +-acl_gen_range(struct acl_build_context *context, +- const uint8_t *hi, const uint8_t *lo, int size, int level, +- struct rte_acl_node *root, struct rte_acl_node *end) ++acl_gen_full_range(struct acl_build_context *context, struct rte_acl_node *root, ++ struct rte_acl_node *end, int size, int level) + { + struct rte_acl_node *node, 
*prev; + uint32_t n; +@@ -788,10 +787,71 @@ acl_gen_range(struct acl_build_context *context, + prev = root; + for (n = size - 1; n > 0; n--) { + node = acl_alloc_node(context, level++); +- acl_add_ptr_range(context, prev, node, lo[n], hi[n]); ++ acl_add_ptr_range(context, prev, node, 0, UINT8_MAX); + prev = node; + } +- acl_add_ptr_range(context, prev, end, lo[0], hi[0]); ++ acl_add_ptr_range(context, prev, end, 0, UINT8_MAX); ++} ++ ++static void ++acl_gen_range_mdl(struct acl_build_context *context, struct rte_acl_node *root, ++ struct rte_acl_node *end, uint8_t lo, uint8_t hi, int size, int level) ++{ ++ struct rte_acl_node *node; ++ ++ node = acl_alloc_node(context, level++); ++ acl_add_ptr_range(context, root, node, lo, hi); ++ acl_gen_full_range(context, node, end, size - 1, level); ++} ++ ++static void ++acl_gen_range_low(struct acl_build_context *context, struct rte_acl_node *root, ++ struct rte_acl_node *end, const uint8_t *lo, int size, int level) ++{ ++ struct rte_acl_node *node; ++ uint32_t n; ++ ++ n = size - 1; ++ if (n == 0) { ++ acl_add_ptr_range(context, root, end, lo[0], UINT8_MAX); ++ return; ++ } ++ ++ node = acl_alloc_node(context, level++); ++ acl_add_ptr_range(context, root, node, lo[n], lo[n]); ++ ++ /* generate lower-bound sub-trie */ ++ acl_gen_range_low(context, node, end, lo, n, level); ++ ++ /* generate middle sub-trie */ ++ if (n > 1 && lo[n - 1] != UINT8_MAX) ++ acl_gen_range_mdl(context, node, end, lo[n - 1] + 1, UINT8_MAX, ++ n, level); ++} ++ ++static void ++acl_gen_range_high(struct acl_build_context *context, struct rte_acl_node *root, ++ struct rte_acl_node *end, const uint8_t *hi, int size, int level) ++{ ++ struct rte_acl_node *node; ++ uint32_t n; ++ ++ n = size - 1; ++ if (n == 0) { ++ acl_add_ptr_range(context, root, end, 0, hi[0]); ++ return; ++ } ++ ++ node = acl_alloc_node(context, level++); ++ acl_add_ptr_range(context, root, node, hi[n], hi[n]); ++ ++ /* generate upper-bound sub-trie */ ++ acl_gen_range_high(context, node, end, hi, n, level); ++ ++ /* generate middle sub-trie */ ++ if (n > 1 && hi[n - 1] != 0) ++ acl_gen_range_mdl(context, node, end, 0, hi[n - 1] - 1, ++ n, level); + } + + static struct rte_acl_node * +@@ -799,52 +859,56 @@ acl_gen_range_trie(struct acl_build_context *context, + const void *min, const void *max, + int size, int level, struct rte_acl_node **pend) + { +- int32_t n; +- struct rte_acl_node *root; +- const uint8_t *lo = min; +- const uint8_t *hi = max; ++ int32_t k, n; ++ uint8_t hi_ff, lo_00; ++ struct rte_acl_node *node, *prev, *root; ++ const uint8_t *lo; ++ const uint8_t *hi; ++ ++ lo = min; ++ hi = max; + +- *pend = acl_alloc_node(context, level+size); ++ *pend = acl_alloc_node(context, level + size); + root = acl_alloc_node(context, level++); ++ prev = root; + +- if (lo[size - 1] == hi[size - 1]) { +- acl_gen_range(context, hi, lo, size, level, root, *pend); +- } else { +- uint8_t limit_lo[64]; +- uint8_t limit_hi[64]; +- uint8_t hi_ff = UINT8_MAX; +- uint8_t lo_00 = 0; ++ /* build common sub-trie till possible */ ++ for (n = size - 1; n > 0 && lo[n] == hi[n]; n--) { ++ node = acl_alloc_node(context, level++); ++ acl_add_ptr_range(context, prev, node, lo[n], hi[n]); ++ prev = node; ++ } + +- memset(limit_lo, 0, RTE_DIM(limit_lo)); +- memset(limit_hi, UINT8_MAX, RTE_DIM(limit_hi)); ++ /* no branch needed, just one sub-trie */ ++ if (n == 0) { ++ acl_add_ptr_range(context, prev, *pend, lo[0], hi[0]); ++ return root; ++ } + +- for (n = size - 2; n >= 0; n--) { +- hi_ff = (uint8_t)(hi_ff & hi[n]); +- lo_00 = 
(uint8_t)(lo_00 | lo[n]); +- } ++ /* gather information about divirgent paths */ ++ lo_00 = 0; ++ hi_ff = UINT8_MAX; ++ for (k = n - 1; k >= 0; k--) { ++ hi_ff &= hi[k]; ++ lo_00 |= lo[k]; ++ } + +- if (hi_ff != UINT8_MAX) { +- limit_lo[size - 1] = hi[size - 1]; +- acl_gen_range(context, hi, limit_lo, size, level, +- root, *pend); +- } ++ /* generate left (lower-bound) sub-trie */ ++ if (lo_00 != 0) ++ acl_gen_range_low(context, prev, *pend, lo, n + 1, level); + +- if (lo_00 != 0) { +- limit_hi[size - 1] = lo[size - 1]; +- acl_gen_range(context, limit_hi, lo, size, level, +- root, *pend); +- } ++ /* generate right (upper-bound) sub-trie */ ++ if (hi_ff != UINT8_MAX) ++ acl_gen_range_high(context, prev, *pend, hi, n + 1, level); + +- if (hi[size - 1] - lo[size - 1] > 1 || +- lo_00 == 0 || +- hi_ff == UINT8_MAX) { +- limit_lo[size-1] = (uint8_t)(lo[size-1] + (lo_00 != 0)); +- limit_hi[size-1] = (uint8_t)(hi[size-1] - +- (hi_ff != UINT8_MAX)); +- acl_gen_range(context, limit_hi, limit_lo, size, +- level, root, *pend); +- } ++ /* generate sub-trie in the middle */ ++ if (lo[n] + 1 != hi[n] || lo_00 == 0 || hi_ff == UINT8_MAX) { ++ lo_00 = lo[n] + (lo_00 != 0); ++ hi_ff = hi[n] - (hi_ff != UINT8_MAX); ++ acl_gen_range_mdl(context, prev, *pend, lo_00, hi_ff, ++ n + 1, level); + } ++ + return root; + } + +diff --git a/dpdk/lib/librte_bbdev/rte_bbdev.h b/dpdk/lib/librte_bbdev/rte_bbdev.h +index 591fb7914a..1f58a0762f 100644 +--- a/dpdk/lib/librte_bbdev/rte_bbdev.h ++++ b/dpdk/lib/librte_bbdev/rte_bbdev.h +@@ -440,21 +440,21 @@ TAILQ_HEAD(rte_bbdev_cb_list, rte_bbdev_callback); + * these fields, but should only write to the *_ops fields. + */ + struct __rte_cache_aligned rte_bbdev { +- /**< Enqueue encode function */ ++ /** Enqueue encode function */ + rte_bbdev_enqueue_enc_ops_t enqueue_enc_ops; +- /**< Enqueue decode function */ ++ /** Enqueue decode function */ + rte_bbdev_enqueue_dec_ops_t enqueue_dec_ops; +- /**< Dequeue encode function */ ++ /** Dequeue encode function */ + rte_bbdev_dequeue_enc_ops_t dequeue_enc_ops; +- /**< Dequeue decode function */ ++ /** Dequeue decode function */ + rte_bbdev_dequeue_dec_ops_t dequeue_dec_ops; +- /**< Enqueue encode function */ ++ /** Enqueue encode function */ + rte_bbdev_enqueue_enc_ops_t enqueue_ldpc_enc_ops; +- /**< Enqueue decode function */ ++ /** Enqueue decode function */ + rte_bbdev_enqueue_dec_ops_t enqueue_ldpc_dec_ops; +- /**< Dequeue encode function */ ++ /** Dequeue encode function */ + rte_bbdev_dequeue_enc_ops_t dequeue_ldpc_enc_ops; +- /**< Dequeue decode function */ ++ /** Dequeue decode function */ + rte_bbdev_dequeue_dec_ops_t dequeue_ldpc_dec_ops; + const struct rte_bbdev_ops *dev_ops; /**< Functions exported by PMD */ + struct rte_bbdev_data *data; /**< Pointer to device data */ +diff --git a/dpdk/lib/librte_bbdev/rte_bbdev_op.h b/dpdk/lib/librte_bbdev/rte_bbdev_op.h +index 1e119a757b..6e43495fb4 100644 +--- a/dpdk/lib/librte_bbdev/rte_bbdev_op.h ++++ b/dpdk/lib/librte_bbdev/rte_bbdev_op.h +@@ -389,12 +389,12 @@ struct rte_bbdev_op_turbo_dec { + */ + uint8_t num_maps; + +- /**< [0 - TB : 1 - CB] */ ++ /** [0 - TB : 1 - CB] */ + uint8_t code_block_mode; + union { +- /**< Struct which stores Code Block specific parameters */ ++ /** Struct which stores Code Block specific parameters */ + struct rte_bbdev_op_dec_turbo_cb_params cb_params; +- /**< Struct which stores Transport Block specific parameters */ ++ /** Struct which stores Transport Block specific parameters */ + struct rte_bbdev_op_dec_turbo_tb_params tb_params; + }; + }; +@@ 
-545,7 +545,7 @@ struct rte_bbdev_op_enc_turbo_tb_params { + * the Turbo operation when r >= C-, [K:3*Kpi] + */ + uint16_t ncb_pos; +- /**< The index of the first CB in the inbound mbuf data, default is 0 */ ++ /** The index of the first CB in the inbound mbuf data, default is 0 */ + uint8_t r; + }; + +@@ -744,11 +744,11 @@ enum { + + /** Structure specifying a single encode operation */ + struct rte_bbdev_enc_op { +- /**< Status of operation that was performed */ ++ /** Status of operation that was performed */ + int status; +- /**< Mempool which op instance is in */ ++ /** Mempool which op instance is in */ + struct rte_mempool *mempool; +- /**< Opaque pointer for user data */ ++ /** Opaque pointer for user data */ + void *opaque_data; + union { + /** Contains turbo decoder specific parameters */ +@@ -785,7 +785,7 @@ struct rte_bbdev_op_cap { + } cap; /**< Operation-type specific capabilities */ + }; + +-/**< @internal Private data structure stored with operation pool. */ ++/** @internal Private data structure stored with operation pool. */ + struct rte_bbdev_op_pool_private { + enum rte_bbdev_op_type type; /**< Type of operations in a pool */ + }; +diff --git a/dpdk/lib/librte_bbdev/rte_bbdev_pmd.h b/dpdk/lib/librte_bbdev/rte_bbdev_pmd.h +index 24ddcee7af..237e3361d7 100644 +--- a/dpdk/lib/librte_bbdev/rte_bbdev_pmd.h ++++ b/dpdk/lib/librte_bbdev/rte_bbdev_pmd.h +@@ -146,18 +146,18 @@ typedef int (*rte_bbdev_queue_intr_disable_t)(struct rte_bbdev *dev, + * fields are for non-vital operations + */ + struct rte_bbdev_ops { +- /**< Allocate and configure device memory. Optional. */ ++ /** Allocate and configure device memory. Optional. */ + rte_bbdev_setup_queues_t setup_queues; +- /**< Configure interrupts. Optional. */ ++ /** Configure interrupts. Optional. */ + rte_bbdev_intr_enable_t intr_enable; +- /**< Start device. Optional. */ ++ /** Start device. Optional. */ + rte_bbdev_start_t start; +- /**< Stop device. Optional. */ ++ /** Stop device. Optional. */ + rte_bbdev_stop_t stop; +- /**< Close device. Optional. */ ++ /** Close device. Optional. */ + rte_bbdev_close_t close; + +- /**< Get device info. Required. */ ++ /** Get device info. Required. */ + rte_bbdev_info_get_t info_get; + /** Get device statistics. Optional. */ + rte_bbdev_stats_get_t stats_get; +@@ -170,7 +170,7 @@ struct rte_bbdev_ops { + rte_bbdev_queue_release_t queue_release; + /** Start a queue. Optional. */ + rte_bbdev_queue_start_t queue_start; +- /**< Stop a queue pair. Optional. */ ++ /** Stop a queue pair. Optional. */ + rte_bbdev_queue_stop_t queue_stop; + + /** Enable queue interrupt. 
Optional */ +diff --git a/dpdk/lib/librte_bpf/meson.build b/dpdk/lib/librte_bpf/meson.build +index 13fc02db38..52cfaf9ac2 100644 +--- a/dpdk/lib/librte_bpf/meson.build ++++ b/dpdk/lib/librte_bpf/meson.build +@@ -14,7 +14,7 @@ elif dpdk_conf.has('RTE_ARCH_ARM64') + sources += files('bpf_jit_arm64.c') + endif + +-install_headers = files('bpf_def.h', ++install_headers('bpf_def.h', + 'rte_bpf.h', + 'rte_bpf_ethdev.h') + +diff --git a/dpdk/lib/librte_cfgfile/rte_cfgfile_version.map b/dpdk/lib/librte_cfgfile/rte_cfgfile_version.map +index 906eee96bf..22c999fe16 100644 +--- a/dpdk/lib/librte_cfgfile/rte_cfgfile_version.map ++++ b/dpdk/lib/librte_cfgfile/rte_cfgfile_version.map +@@ -15,6 +15,7 @@ DPDK_20.0 { + rte_cfgfile_section_entries; + rte_cfgfile_section_entries_by_index; + rte_cfgfile_section_num_entries; ++ rte_cfgfile_section_num_entries_by_index; + rte_cfgfile_sections; + rte_cfgfile_set_entry; + +diff --git a/dpdk/lib/librte_cryptodev/rte_crypto_sym.h b/dpdk/lib/librte_cryptodev/rte_crypto_sym.h +index ffa038dc40..4e05c7c6ac 100644 +--- a/dpdk/lib/librte_cryptodev/rte_crypto_sym.h ++++ b/dpdk/lib/librte_cryptodev/rte_crypto_sym.h +@@ -208,9 +208,12 @@ enum rte_crypto_auth_algorithm { + /**< HMAC using MD5 algorithm */ + + RTE_CRYPTO_AUTH_SHA1, +- /**< 128 bit SHA algorithm. */ ++ /**< 160 bit SHA algorithm. */ + RTE_CRYPTO_AUTH_SHA1_HMAC, +- /**< HMAC using 128 bit SHA algorithm. */ ++ /**< HMAC using 160 bit SHA algorithm. ++ * HMAC-SHA-1-96 can be generated by setting ++ * digest_length to 12 bytes in auth/aead xforms. ++ */ + RTE_CRYPTO_AUTH_SHA224, + /**< 224 bit SHA algorithm. */ + RTE_CRYPTO_AUTH_SHA224_HMAC, +diff --git a/dpdk/lib/librte_cryptodev/rte_cryptodev.c b/dpdk/lib/librte_cryptodev/rte_cryptodev.c +index 89aa2ed3e2..ed9de3eb92 100644 +--- a/dpdk/lib/librte_cryptodev/rte_cryptodev.c ++++ b/dpdk/lib/librte_cryptodev/rte_cryptodev.c +@@ -491,6 +491,8 @@ rte_cryptodev_get_feature_name(uint64_t flag) + return "RSA_PRIV_OP_KEY_QT"; + case RTE_CRYPTODEV_FF_DIGEST_ENCRYPTED: + return "DIGEST_ENCRYPTED"; ++ case RTE_CRYPTODEV_FF_ASYM_SESSIONLESS: ++ return "ASYM_SESSIONLESS"; + default: + return NULL; + } +@@ -525,7 +527,8 @@ rte_cryptodev_pmd_get_named_dev(const char *name) + static inline uint8_t + rte_cryptodev_is_valid_device_data(uint8_t dev_id) + { +- if (rte_crypto_devices[dev_id].data == NULL) ++ if (dev_id >= RTE_CRYPTO_MAX_DEVS || ++ rte_crypto_devices[dev_id].data == NULL) + return 0; + + return 1; +@@ -617,8 +620,9 @@ rte_cryptodev_devices_get(const char *driver_name, uint8_t *devices, + void * + rte_cryptodev_get_sec_ctx(uint8_t dev_id) + { +- if (rte_crypto_devices[dev_id].feature_flags & +- RTE_CRYPTODEV_FF_SECURITY) ++ if (dev_id < RTE_CRYPTO_MAX_DEVS && ++ (rte_crypto_devices[dev_id].feature_flags & ++ RTE_CRYPTODEV_FF_SECURITY)) + return rte_crypto_devices[dev_id].security_ctx; + + return NULL; +@@ -789,6 +793,11 @@ rte_cryptodev_queue_pair_count(uint8_t dev_id) + { + struct rte_cryptodev *dev; + ++ if (!rte_cryptodev_is_valid_device_data(dev_id)) { ++ CDEV_LOG_ERR("Invalid dev_id=%" PRIu8, dev_id); ++ return 0; ++ } ++ + dev = &rte_crypto_devices[dev_id]; + return dev->data->nb_queue_pairs; + } +@@ -1254,6 +1263,11 @@ rte_cryptodev_sym_session_init(uint8_t dev_id, + uint8_t index; + int ret; + ++ if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) { ++ CDEV_LOG_ERR("Invalid dev_id=%" PRIu8, dev_id); ++ return -EINVAL; ++ } ++ + dev = rte_cryptodev_pmd_get_dev(dev_id); + + if (sess == NULL || xforms == NULL || dev == NULL) +@@ -1293,6 +1307,11 @@ 
rte_cryptodev_asym_session_init(uint8_t dev_id, + uint8_t index; + int ret; + ++ if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) { ++ CDEV_LOG_ERR("Invalid dev_id=%" PRIu8, dev_id); ++ return -EINVAL; ++ } ++ + dev = rte_cryptodev_pmd_get_dev(dev_id); + + if (sess == NULL || xforms == NULL || dev == NULL) +@@ -1428,6 +1447,11 @@ rte_cryptodev_sym_session_clear(uint8_t dev_id, + struct rte_cryptodev *dev; + uint8_t driver_id; + ++ if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) { ++ CDEV_LOG_ERR("Invalid dev_id=%" PRIu8, dev_id); ++ return -EINVAL; ++ } ++ + dev = rte_cryptodev_pmd_get_dev(dev_id); + + if (dev == NULL || sess == NULL) +@@ -1452,6 +1476,11 @@ rte_cryptodev_asym_session_clear(uint8_t dev_id, + { + struct rte_cryptodev *dev; + ++ if (!rte_cryptodev_pmd_is_valid_dev(dev_id)) { ++ CDEV_LOG_ERR("Invalid dev_id=%" PRIu8, dev_id); ++ return -EINVAL; ++ } ++ + dev = rte_cryptodev_pmd_get_dev(dev_id); + + if (dev == NULL || sess == NULL) +@@ -1754,8 +1783,14 @@ rte_cryptodev_driver_id_get(const char *name) + const char * + rte_cryptodev_name_get(uint8_t dev_id) + { +- struct rte_cryptodev *dev = rte_cryptodev_pmd_get_dev(dev_id); ++ struct rte_cryptodev *dev; + ++ if (!rte_cryptodev_is_valid_device_data(dev_id)) { ++ CDEV_LOG_ERR("Invalid dev_id=%" PRIu8, dev_id); ++ return NULL; ++ } ++ ++ dev = rte_cryptodev_pmd_get_dev(dev_id); + if (dev == NULL) + return NULL; + +diff --git a/dpdk/lib/librte_distributor/meson.build b/dpdk/lib/librte_distributor/meson.build +index 50b91887b5..266af64348 100644 +--- a/dpdk/lib/librte_distributor/meson.build ++++ b/dpdk/lib/librte_distributor/meson.build +@@ -9,7 +9,6 @@ else + endif + headers = files('rte_distributor.h') + deps += ['mbuf'] +-use_function_versioning = true + + # for clang 32-bit compiles we need libatomic for 64-bit atomic ops + if cc.get_id() == 'clang' and dpdk_conf.get('RTE_ARCH_64') == false +diff --git a/dpdk/lib/librte_distributor/rte_distributor.c b/dpdk/lib/librte_distributor/rte_distributor.c +index 6c5b0c86e8..1c047f065a 100644 +--- a/dpdk/lib/librte_distributor/rte_distributor.c ++++ b/dpdk/lib/librte_distributor/rte_distributor.c +@@ -8,7 +8,6 @@ + #include + #include + #include +-#include + #include + #include + #include +diff --git a/dpdk/lib/librte_distributor/rte_distributor_single.c b/dpdk/lib/librte_distributor/rte_distributor_single.c +index 91d8824c64..abaf7730c3 100644 +--- a/dpdk/lib/librte_distributor/rte_distributor_single.c ++++ b/dpdk/lib/librte_distributor/rte_distributor_single.c +@@ -9,7 +9,6 @@ + #include + #include + #include +-#include + #include + #include + #include +diff --git a/dpdk/lib/librte_eal/common/eal_common_fbarray.c b/dpdk/lib/librte_eal/common/eal_common_fbarray.c +index 1312f936b8..4f8f1af73c 100644 +--- a/dpdk/lib/librte_eal/common/eal_common_fbarray.c ++++ b/dpdk/lib/librte_eal/common/eal_common_fbarray.c +@@ -1337,7 +1337,7 @@ fbarray_find_biggest(struct rte_fbarray *arr, unsigned int start, bool used, + */ + + /* the API's called are thread-safe, but something may still happen +- * inbetween the API calls, so lock the fbarray. all other API's are ++ * between the API calls, so lock the fbarray. all other API's are + * read-locking the fbarray, so read lock here is OK. 
+ */ + rte_rwlock_read_lock(&arr->rwlock); +diff --git a/dpdk/lib/librte_eal/common/eal_common_log.c b/dpdk/lib/librte_eal/common/eal_common_log.c +index c0efd5214f..975aea90db 100644 +--- a/dpdk/lib/librte_eal/common/eal_common_log.c ++++ b/dpdk/lib/librte_eal/common/eal_common_log.c +@@ -302,7 +302,7 @@ rte_log_register_type_and_pick_level(const char *name, uint32_t level_def) + continue; + + if (opt_ll->pattern) { +- if (fnmatch(opt_ll->pattern, name, 0)) ++ if (fnmatch(opt_ll->pattern, name, 0) == 0) + level = opt_ll->level; + } else { + if (regexec(&opt_ll->re_match, name, 0, NULL, 0) == 0) +diff --git a/dpdk/lib/librte_eal/common/eal_common_memory.c b/dpdk/lib/librte_eal/common/eal_common_memory.c +index 4a9cc1f19a..cc7d54e0c7 100644 +--- a/dpdk/lib/librte_eal/common/eal_common_memory.c ++++ b/dpdk/lib/librte_eal/common/eal_common_memory.c +@@ -97,7 +97,7 @@ eal_get_virtual_area(void *requested_addr, size_t *size, + return NULL; + } + +- mapped_addr = mmap(requested_addr, (size_t)map_sz, PROT_READ, ++ mapped_addr = mmap(requested_addr, (size_t)map_sz, PROT_NONE, + mmap_flags, -1, 0); + if (mapped_addr == MAP_FAILED && allow_shrink) + *size -= page_sz; +diff --git a/dpdk/lib/librte_eal/common/eal_common_options.c b/dpdk/lib/librte_eal/common/eal_common_options.c +index a7f9c5f9bd..f791e9671d 100644 +--- a/dpdk/lib/librte_eal/common/eal_common_options.c ++++ b/dpdk/lib/librte_eal/common/eal_common_options.c +@@ -1039,7 +1039,7 @@ eal_parse_log_level(const char *arg) + if (regex) { + if (rte_log_set_level_regexp(regex, priority) < 0) { + fprintf(stderr, "cannot set log level %s,%d\n", +- pattern, priority); ++ regex, priority); + goto fail; + } + if (rte_log_save_regexp(regex, priority) < 0) +diff --git a/dpdk/lib/librte_eal/common/include/arch/arm/rte_cycles_32.h b/dpdk/lib/librte_eal/common/include/arch/arm/rte_cycles_32.h +index 859b09748c..f79718ce8c 100644 +--- a/dpdk/lib/librte_eal/common/include/arch/arm/rte_cycles_32.h ++++ b/dpdk/lib/librte_eal/common/include/arch/arm/rte_cycles_32.h +@@ -57,7 +57,7 @@ __rte_rdtsc_syscall(void) + * asm volatile("mcr p15, 0, %0, c9, c12, 0" : : "r"(29)); + * asm volatile("mcr p15, 0, %0, c9, c12, 1" : : "r"(0x8000000f)); + * +- * which is possible only from the priviledged mode (kernel space). ++ * which is possible only from the privileged mode (kernel space). 
+ */ + static inline uint64_t + __rte_rdtsc_pmccntr(void) +diff --git a/dpdk/lib/librte_eal/common/include/arch/arm/rte_cycles_64.h b/dpdk/lib/librte_eal/common/include/arch/arm/rte_cycles_64.h +index 68e7c73384..da557b6a10 100644 +--- a/dpdk/lib/librte_eal/common/include/arch/arm/rte_cycles_64.h ++++ b/dpdk/lib/librte_eal/common/include/arch/arm/rte_cycles_64.h +@@ -62,7 +62,7 @@ rte_rdtsc(void) + static inline uint64_t + rte_rdtsc_precise(void) + { +- rte_mb(); ++ asm volatile("isb" : : : "memory"); + return rte_rdtsc(); + } + +diff --git a/dpdk/lib/librte_eal/common/include/arch/ppc_64/meson.build b/dpdk/lib/librte_eal/common/include/arch/ppc_64/meson.build +index 00f9611768..7949c86258 100644 +--- a/dpdk/lib/librte_eal/common/include/arch/ppc_64/meson.build ++++ b/dpdk/lib/librte_eal/common/include/arch/ppc_64/meson.build +@@ -2,6 +2,7 @@ + # Copyright(c) 2018 Luca Boccassi + + install_headers( ++ 'rte_altivec.h', + 'rte_atomic.h', + 'rte_byteorder.h', + 'rte_cpuflags.h', +diff --git a/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_altivec.h b/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_altivec.h +new file mode 100644 +index 0000000000..1551a94544 +--- /dev/null ++++ b/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_altivec.h +@@ -0,0 +1,22 @@ ++/* ++ * SPDX-License-Identifier: BSD-3-Clause ++ * Copyright (C) Mellanox 2020. ++ */ ++ ++#ifndef _RTE_ALTIVEC_H_ ++#define _RTE_ALTIVEC_H_ ++ ++/* To include altivec.h, GCC version must be >= 4.8 */ ++#include ++ ++/* ++ * Compilation workaround for PPC64 when AltiVec is fully enabled, e.g. std=c11. ++ * Otherwise there would be a type conflict between stdbool and altivec. ++ */ ++#if defined(__PPC64__) && !defined(__APPLE_ALTIVEC__) ++#undef bool ++/* redefine as in stdbool.h */ ++#define bool _Bool ++#endif ++ ++#endif /* _RTE_ALTIVEC_H_ */ +diff --git a/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_memcpy.h b/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_memcpy.h +index 25311ba1d7..e63a1211a8 100644 +--- a/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_memcpy.h ++++ b/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_memcpy.h +@@ -8,8 +8,10 @@ + + #include + #include +-/*To include altivec.h, GCC version must >= 4.8 */ +-#include ++ ++#include "rte_altivec.h" ++ ++#include "rte_common.h" + + #ifdef __cplusplus + extern "C" { +@@ -17,6 +19,11 @@ extern "C" { + + #include "generic/rte_memcpy.h" + ++#if (GCC_VERSION >= 90000 && GCC_VERSION < 90400) ++#pragma GCC diagnostic push ++#pragma GCC diagnostic ignored "-Warray-bounds" ++#endif ++ + static inline void + rte_mov16(uint8_t *dst, const uint8_t *src) + { +@@ -192,6 +199,10 @@ rte_memcpy_func(void *dst, const void *src, size_t n) + return ret; + } + ++#if (GCC_VERSION >= 90000 && GCC_VERSION < 90400) ++#pragma GCC diagnostic pop ++#endif ++ + #ifdef __cplusplus + } + #endif +diff --git a/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_vect.h b/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_vect.h +index 068c805b22..4caafd9d2b 100644 +--- a/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_vect.h ++++ b/dpdk/lib/librte_eal/common/include/arch/ppc_64/rte_vect.h +@@ -6,7 +6,8 @@ + #ifndef _RTE_VECT_PPC_64_H_ + #define _RTE_VECT_PPC_64_H_ + +-#include ++#include "rte_altivec.h" ++ + #include "generic/rte_vect.h" + + #ifdef __cplusplus +diff --git a/dpdk/lib/librte_eal/common/include/arch/x86/rte_atomic.h b/dpdk/lib/librte_eal/common/include/arch/x86/rte_atomic.h +index 148398f50a..b9dcd30aba 100644 +--- 
a/dpdk/lib/librte_eal/common/include/arch/x86/rte_atomic.h ++++ b/dpdk/lib/librte_eal/common/include/arch/x86/rte_atomic.h +@@ -55,7 +55,7 @@ extern "C" { + * + * As pointed by Java guys, that makes possible to use lock-prefixed + * instructions to get the same effect as mfence and on most modern HW +- * that gives a better perfomance then using mfence: ++ * that gives a better performance then using mfence: + * https://shipilev.net/blog/2014/on-the-fence-with-dependencies/ + * Basic idea is to use lock prefixed add with some dummy memory location + * as the destination. From their experiments 128B(2 cache lines) below +diff --git a/dpdk/lib/librte_eal/common/include/arch/x86/rte_memcpy.h b/dpdk/lib/librte_eal/common/include/arch/x86/rte_memcpy.h +index ba44c4a328..9c67232df9 100644 +--- a/dpdk/lib/librte_eal/common/include/arch/x86/rte_memcpy.h ++++ b/dpdk/lib/librte_eal/common/include/arch/x86/rte_memcpy.h +@@ -22,6 +22,11 @@ + extern "C" { + #endif + ++#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000) ++#pragma GCC diagnostic push ++#pragma GCC diagnostic ignored "-Wstringop-overflow" ++#endif ++ + /** + * Copy bytes from one location to another. The locations must not overlap. + * +@@ -869,6 +874,10 @@ rte_memcpy(void *dst, const void *src, size_t n) + return rte_memcpy_generic(dst, src, n); + } + ++#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION >= 100000) ++#pragma GCC diagnostic pop ++#endif ++ + #ifdef __cplusplus + } + #endif +diff --git a/dpdk/lib/librte_eal/common/include/generic/rte_byteorder.h b/dpdk/lib/librte_eal/common/include/generic/rte_byteorder.h +index 38e8cfd32b..9ca960932f 100644 +--- a/dpdk/lib/librte_eal/common/include/generic/rte_byteorder.h ++++ b/dpdk/lib/librte_eal/common/include/generic/rte_byteorder.h +@@ -93,9 +93,9 @@ + #define RTE_BE16(v) (rte_be16_t)(RTE_STATIC_BSWAP16(v)) + #define RTE_BE32(v) (rte_be32_t)(RTE_STATIC_BSWAP32(v)) + #define RTE_BE64(v) (rte_be64_t)(RTE_STATIC_BSWAP64(v)) +-#define RTE_LE16(v) (rte_be16_t)(v) +-#define RTE_LE32(v) (rte_be32_t)(v) +-#define RTE_LE64(v) (rte_be64_t)(v) ++#define RTE_LE16(v) (rte_le16_t)(v) ++#define RTE_LE32(v) (rte_le32_t)(v) ++#define RTE_LE64(v) (rte_le64_t)(v) + #else + #error Unsupported endianness. + #endif +diff --git a/dpdk/lib/librte_eal/common/include/rte_common.h b/dpdk/lib/librte_eal/common/include/rte_common.h +index 459d082d14..41e2778ec1 100644 +--- a/dpdk/lib/librte_eal/common/include/rte_common.h ++++ b/dpdk/lib/librte_eal/common/include/rte_common.h +@@ -347,7 +347,7 @@ typedef uint64_t rte_iova_t; + * The combined value. + */ + static inline uint32_t +-rte_combine32ms1b(register uint32_t x) ++rte_combine32ms1b(uint32_t x) + { + x |= x >> 1; + x |= x >> 2; +@@ -369,7 +369,7 @@ rte_combine32ms1b(register uint32_t x) + * The combined value. + */ + static inline uint64_t +-rte_combine64ms1b(register uint64_t v) ++rte_combine64ms1b(uint64_t v) + { + v |= v >> 1; + v |= v >> 2; +@@ -538,6 +538,9 @@ rte_bsf32_safe(uint64_t v, uint32_t *pos) + /** + * Return the rounded-up log2 of a integer. + * ++ * @note Contrary to the logarithm mathematical operation, ++ * rte_log2_u32(0) == 0 and not -inf. ++ * + * @param v + * The input parameter. + * @return +@@ -632,6 +635,9 @@ rte_fls_u64(uint64_t x) + /** + * Return the rounded-up log2 of a 64-bit integer. + * ++ * @note Contrary to the logarithm mathematical operation, ++ * rte_log2_u64(0) == 0 and not -inf. ++ * + * @param v + * The input parameter. 
+ * @return +diff --git a/dpdk/lib/librte_eal/common/include/rte_service.h b/dpdk/lib/librte_eal/common/include/rte_service.h +index d8701dd4cf..3a1c735c58 100644 +--- a/dpdk/lib/librte_eal/common/include/rte_service.h ++++ b/dpdk/lib/librte_eal/common/include/rte_service.h +@@ -104,12 +104,16 @@ int32_t rte_service_probe_capability(uint32_t id, uint32_t capability); + * Each core can be added or removed from running a specific service. This + * function enables or disables *lcore* to run *service_id*. + * +- * If multiple cores are enabled on a service, an atomic is used to ensure that +- * only one cores runs the service at a time. The exception to this is when ++ * If multiple cores are enabled on a service, a lock is used to ensure that ++ * only one core runs the service at a time. The exception to this is when + * a service indicates that it is multi-thread safe by setting the capability + * called RTE_SERVICE_CAP_MT_SAFE. With the multi-thread safe capability set, + * the service function can be run on multiple threads at the same time. + * ++ * If the service is known to be mapped to a single lcore, setting the ++ * capability of the service to RTE_SERVICE_CAP_MT_SAFE can achieve ++ * better performance by avoiding the use of lock. ++ * + * @param service_id the service to apply the lcore to + * @param lcore The lcore that will be mapped to service + * @param enable Zero to unmap or disable the core, non-zero to enable +diff --git a/dpdk/lib/librte_eal/common/include/rte_service_component.h b/dpdk/lib/librte_eal/common/include/rte_service_component.h +index 16eab79eea..b75aba11b9 100644 +--- a/dpdk/lib/librte_eal/common/include/rte_service_component.h ++++ b/dpdk/lib/librte_eal/common/include/rte_service_component.h +@@ -43,7 +43,7 @@ struct rte_service_spec { + /** + * Register a new service. + * +- * A service represents a component that the requires CPU time periodically to ++ * A service represents a component that requires CPU time periodically to + * achieve its purpose. + * + * For example the eventdev SW PMD requires CPU cycles to perform its +@@ -56,6 +56,10 @@ struct rte_service_spec { + * *rte_service_component_runstate_set*, which indicates that the service + * component is ready to be executed. + * ++ * If the service is known to be mapped to a single lcore, setting the ++ * capability of the service to RTE_SERVICE_CAP_MT_SAFE can achieve ++ * better performance. ++ * + * @param spec The specification of the service to register + * @param[out] service_id A pointer to a uint32_t, which will be filled in + * during registration of the service. 
It is set to the integers +diff --git a/dpdk/lib/librte_eal/common/malloc_elem.c b/dpdk/lib/librte_eal/common/malloc_elem.c +index 885d00424b..51cdfc5d59 100644 +--- a/dpdk/lib/librte_eal/common/malloc_elem.c ++++ b/dpdk/lib/librte_eal/common/malloc_elem.c +@@ -171,7 +171,7 @@ malloc_elem_insert(struct malloc_elem *elem) + next_elem = NULL; + heap->last = elem; + } else { +- /* the new memory is somewhere inbetween start and end */ ++ /* the new memory is somewhere between start and end */ + uint64_t dist_from_start, dist_from_end; + + dist_from_end = RTE_PTR_DIFF(heap->last, elem); +diff --git a/dpdk/lib/librte_eal/common/malloc_heap.c b/dpdk/lib/librte_eal/common/malloc_heap.c +index 842eb9de75..bd5065698d 100644 +--- a/dpdk/lib/librte_eal/common/malloc_heap.c ++++ b/dpdk/lib/librte_eal/common/malloc_heap.c +@@ -241,6 +241,9 @@ heap_alloc(struct malloc_heap *heap, const char *type __rte_unused, size_t size, + size = RTE_CACHE_LINE_ROUNDUP(size); + align = RTE_CACHE_LINE_ROUNDUP(align); + ++ /* roundup might cause an overflow */ ++ if (size == 0) ++ return NULL; + elem = find_suitable_element(heap, size, flags, align, bound, contig); + if (elem != NULL) { + elem = malloc_elem_alloc(elem, size, align, bound, contig); +diff --git a/dpdk/lib/librte_eal/common/rte_random.c b/dpdk/lib/librte_eal/common/rte_random.c +index 57ec8fb2b3..b7a089ac4f 100644 +--- a/dpdk/lib/librte_eal/common/rte_random.c ++++ b/dpdk/lib/librte_eal/common/rte_random.c +@@ -198,7 +198,7 @@ __rte_random_initial_seed(void) + return (uint64_t)rdseed_low | ((uint64_t)rdseed_high << 32); + #endif + /* second fallback: seed using rdtsc */ +- return rte_get_timer_cycles(); ++ return rte_get_tsc_cycles(); + } + + RTE_INIT(rte_rand_init) +diff --git a/dpdk/lib/librte_eal/common/rte_service.c b/dpdk/lib/librte_eal/common/rte_service.c +index 79235c03f8..d5dd32d8d9 100644 +--- a/dpdk/lib/librte_eal/common/rte_service.c ++++ b/dpdk/lib/librte_eal/common/rte_service.c +@@ -50,6 +50,10 @@ struct rte_service_spec_impl { + uint8_t internal_flags; + + /* per service statistics */ ++ /* Indicates how many cores the service is mapped to run on. ++ * It does not indicate the number of cores the service is running ++ * on currently. ++ */ + rte_atomic32_t num_mapped_cores; + uint64_t calls; + uint64_t cycles_spent; +@@ -122,6 +126,9 @@ rte_service_finalize(void) + if (!rte_service_library_initialized) + return; + ++ rte_service_lcore_reset_all(); ++ rte_eal_mp_wait_lcore(); ++ + rte_free(rte_services); + rte_free(lcore_states); + +@@ -137,6 +144,12 @@ service_valid(uint32_t id) + return !!(rte_services[id].internal_flags & SERVICE_F_REGISTERED); + } + ++static struct rte_service_spec_impl * ++service_get(uint32_t id) ++{ ++ return &rte_services[id]; ++} ++ + /* validate ID and retrieve service pointer, or return error value */ + #define SERVICE_VALID_GET_OR_ERR_RET(id, service, retval) do { \ + if (id >= RTE_SERVICE_NUM_MAX || !service_valid(id)) \ +@@ -327,8 +340,8 @@ rte_service_runstate_get(uint32_t id) + } + + static inline void +-rte_service_runner_do_callback(struct rte_service_spec_impl *s, +- struct core_state *cs, uint32_t service_idx) ++service_runner_do_callback(struct rte_service_spec_impl *s, ++ struct core_state *cs, uint32_t service_idx) + { + void *userdata = s->spec.callback_userdata; + +@@ -344,12 +357,14 @@ rte_service_runner_do_callback(struct rte_service_spec_impl *s, + } + + +-static inline int32_t +-service_run(uint32_t i, struct core_state *cs, uint64_t service_mask) ++/* Expects the service 's' is valid. 
*/ ++static int32_t ++service_run(uint32_t i, struct core_state *cs, uint64_t service_mask, ++ struct rte_service_spec_impl *s, uint32_t serialize_mt_unsafe) + { +- if (!service_valid(i)) ++ if (!s) + return -EINVAL; +- struct rte_service_spec_impl *s = &rte_services[i]; ++ + if (s->comp_runstate != RUNSTATE_RUNNING || + s->app_runstate != RUNSTATE_RUNNING || + !(service_mask & (UINT64_C(1) << i))) { +@@ -359,19 +374,14 @@ service_run(uint32_t i, struct core_state *cs, uint64_t service_mask) + + cs->service_active_on_lcore[i] = 1; + +- /* check do we need cmpset, if MT safe or <= 1 core +- * mapped, atomic ops are not required. +- */ +- const int use_atomics = (service_mt_safe(s) == 0) && +- (rte_atomic32_read(&s->num_mapped_cores) > 1); +- if (use_atomics) { ++ if ((service_mt_safe(s) == 0) && (serialize_mt_unsafe == 1)) { + if (!rte_atomic32_cmpset((uint32_t *)&s->execute_lock, 0, 1)) + return -EBUSY; + +- rte_service_runner_do_callback(s, cs, i); ++ service_runner_do_callback(s, cs, i); + rte_atomic32_clear(&s->execute_lock); + } else +- rte_service_runner_do_callback(s, cs, i); ++ service_runner_do_callback(s, cs, i); + + return 0; + } +@@ -383,7 +393,7 @@ rte_service_may_be_active(uint32_t id) + int32_t lcore_count = rte_service_lcore_list(ids, RTE_MAX_LCORE); + int i; + +- if (!service_valid(id)) ++ if (id >= RTE_SERVICE_NUM_MAX || !service_valid(id)) + return -EINVAL; + + for (i = 0; i < lcore_count; i++) { +@@ -397,49 +407,39 @@ rte_service_may_be_active(uint32_t id) + int32_t + rte_service_run_iter_on_app_lcore(uint32_t id, uint32_t serialize_mt_unsafe) + { +- /* run service on calling core, using all-ones as the service mask */ +- if (!service_valid(id)) +- return -EINVAL; +- + struct core_state *cs = &lcore_states[rte_lcore_id()]; +- struct rte_service_spec_impl *s = &rte_services[id]; ++ struct rte_service_spec_impl *s; + +- /* Atomically add this core to the mapped cores first, then examine if +- * we can run the service. This avoids a race condition between +- * checking the value, and atomically adding to the mapped count. +- */ +- if (serialize_mt_unsafe) +- rte_atomic32_inc(&s->num_mapped_cores); ++ SERVICE_VALID_GET_OR_ERR_RET(id, s, -EINVAL); + +- if (service_mt_safe(s) == 0 && +- rte_atomic32_read(&s->num_mapped_cores) > 1) { +- if (serialize_mt_unsafe) +- rte_atomic32_dec(&s->num_mapped_cores); +- return -EBUSY; +- } ++ /* Increment num_mapped_cores to reflect that this core is ++ * now mapped capable of running the service. 
++ */ ++ rte_atomic32_inc(&s->num_mapped_cores); + +- int ret = service_run(id, cs, UINT64_MAX); ++ int ret = service_run(id, cs, UINT64_MAX, s, serialize_mt_unsafe); + +- if (serialize_mt_unsafe) +- rte_atomic32_dec(&s->num_mapped_cores); ++ rte_atomic32_dec(&s->num_mapped_cores); + + return ret; + } + + static int32_t +-rte_service_runner_func(void *arg) ++service_runner_func(void *arg) + { + RTE_SET_USED(arg); + uint32_t i; + const int lcore = rte_lcore_id(); + struct core_state *cs = &lcore_states[lcore]; + +- while (lcore_states[lcore].runstate == RUNSTATE_RUNNING) { ++ while (cs->runstate == RUNSTATE_RUNNING) { + const uint64_t service_mask = cs->service_mask; + + for (i = 0; i < RTE_SERVICE_NUM_MAX; i++) { ++ if (!service_valid(i)) ++ continue; + /* return value ignored as no change to code flow */ +- service_run(i, cs, service_mask); ++ service_run(i, cs, service_mask, service_get(i), 1); + } + + cs->loops++; +@@ -693,9 +693,9 @@ rte_service_lcore_start(uint32_t lcore) + /* set core to run state first, and then launch otherwise it will + * return immediately as runstate keeps it in the service poll loop + */ +- lcore_states[lcore].runstate = RUNSTATE_RUNNING; ++ cs->runstate = RUNSTATE_RUNNING; + +- int ret = rte_eal_remote_launch(rte_service_runner_func, 0, lcore); ++ int ret = rte_eal_remote_launch(service_runner_func, 0, lcore); + /* returns -EBUSY if the core is already launched, 0 on success */ + return ret; + } +@@ -774,13 +774,9 @@ rte_service_lcore_attr_get(uint32_t lcore, uint32_t attr_id, + } + + static void +-rte_service_dump_one(FILE *f, struct rte_service_spec_impl *s, +- uint64_t all_cycles, uint32_t reset) ++service_dump_one(FILE *f, struct rte_service_spec_impl *s, uint32_t reset) + { + /* avoid divide by zero */ +- if (all_cycles == 0) +- all_cycles = 1; +- + int calls = 1; + if (s->calls != 0) + calls = s->calls; +@@ -807,7 +803,7 @@ rte_service_attr_reset_all(uint32_t id) + SERVICE_VALID_GET_OR_ERR_RET(id, s, -EINVAL); + + int reset = 1; +- rte_service_dump_one(NULL, s, 0, reset); ++ service_dump_one(NULL, s, reset); + return 0; + } + +@@ -851,21 +847,13 @@ rte_service_dump(FILE *f, uint32_t id) + uint32_t i; + int print_one = (id != UINT32_MAX); + +- uint64_t total_cycles = 0; +- +- for (i = 0; i < RTE_SERVICE_NUM_MAX; i++) { +- if (!service_valid(i)) +- continue; +- total_cycles += rte_services[i].cycles_spent; +- } +- + /* print only the specified service */ + if (print_one) { + struct rte_service_spec_impl *s; + SERVICE_VALID_GET_OR_ERR_RET(id, s, -EINVAL); + fprintf(f, "Service %s Summary\n", s->spec.name); + uint32_t reset = 0; +- rte_service_dump_one(f, s, total_cycles, reset); ++ service_dump_one(f, s, reset); + return 0; + } + +@@ -875,7 +863,7 @@ rte_service_dump(FILE *f, uint32_t id) + if (!service_valid(i)) + continue; + uint32_t reset = 0; +- rte_service_dump_one(f, &rte_services[i], total_cycles, reset); ++ service_dump_one(f, &rte_services[i], reset); + } + + fprintf(f, "Service Cores Summary\n"); +diff --git a/dpdk/lib/librte_eal/freebsd/eal/eal_interrupts.c b/dpdk/lib/librte_eal/freebsd/eal/eal_interrupts.c +index f6831b7902..3fee762be9 100644 +--- a/dpdk/lib/librte_eal/freebsd/eal/eal_interrupts.c ++++ b/dpdk/lib/librte_eal/freebsd/eal/eal_interrupts.c +@@ -83,9 +83,9 @@ int + rte_intr_callback_register(const struct rte_intr_handle *intr_handle, + rte_intr_callback_fn cb, void *cb_arg) + { +- struct rte_intr_callback *callback = NULL; +- struct rte_intr_source *src = NULL; +- int ret, add_event; ++ struct rte_intr_callback *callback; ++ struct 
rte_intr_source *src; ++ int ret, add_event = 0; + + /* first do parameter checking */ + if (intr_handle == NULL || intr_handle->fd < 0 || cb == NULL) { +@@ -98,47 +98,53 @@ rte_intr_callback_register(const struct rte_intr_handle *intr_handle, + return -ENODEV; + } + +- /* allocate a new interrupt callback entity */ +- callback = calloc(1, sizeof(*callback)); +- if (callback == NULL) { +- RTE_LOG(ERR, EAL, "Can not allocate memory\n"); +- return -ENOMEM; +- } +- callback->cb_fn = cb; +- callback->cb_arg = cb_arg; +- callback->pending_delete = 0; +- callback->ucb_fn = NULL; +- + rte_spinlock_lock(&intr_lock); + +- /* check if there is at least one callback registered for the fd */ ++ /* find the source for this intr_handle */ + TAILQ_FOREACH(src, &intr_sources, next) { +- if (src->intr_handle.fd == intr_handle->fd) { +- /* we had no interrupts for this */ +- if (TAILQ_EMPTY(&src->callbacks)) +- add_event = 1; +- +- TAILQ_INSERT_TAIL(&(src->callbacks), callback, next); +- ret = 0; ++ if (src->intr_handle.fd == intr_handle->fd) + break; +- } + } + +- /* no existing callbacks for this - add new source */ +- if (src == NULL) { +- src = calloc(1, sizeof(*src)); +- if (src == NULL) { ++ /* if this is an alarm interrupt and it already has a callback, ++ * then we don't want to create a new callback because the only ++ * thing on the list should be eal_alarm_callback() and we may ++ * be called just to reset the timer. ++ */ ++ if (src != NULL && src->intr_handle.type == RTE_INTR_HANDLE_ALARM && ++ !TAILQ_EMPTY(&src->callbacks)) { ++ callback = NULL; ++ } else { ++ /* allocate a new interrupt callback entity */ ++ callback = calloc(1, sizeof(*callback)); ++ if (callback == NULL) { + RTE_LOG(ERR, EAL, "Can not allocate memory\n"); + ret = -ENOMEM; + goto fail; +- } else { +- src->intr_handle = *intr_handle; +- TAILQ_INIT(&src->callbacks); +- TAILQ_INSERT_TAIL(&(src->callbacks), callback, next); +- TAILQ_INSERT_TAIL(&intr_sources, src, next); +- add_event = 1; +- ret = 0; + } ++ callback->cb_fn = cb; ++ callback->cb_arg = cb_arg; ++ callback->pending_delete = 0; ++ callback->ucb_fn = NULL; ++ ++ if (src == NULL) { ++ src = calloc(1, sizeof(*src)); ++ if (src == NULL) { ++ RTE_LOG(ERR, EAL, "Can not allocate memory\n"); ++ ret = -ENOMEM; ++ goto fail; ++ } else { ++ src->intr_handle = *intr_handle; ++ TAILQ_INIT(&src->callbacks); ++ TAILQ_INSERT_TAIL(&intr_sources, src, next); ++ } ++ } ++ ++ /* we had no interrupts for this */ ++ if (TAILQ_EMPTY(&src->callbacks)) ++ add_event = 1; ++ ++ TAILQ_INSERT_TAIL(&(src->callbacks), callback, next); + } + + /* add events to the queue. 
timer events are special as we need to +@@ -178,11 +184,12 @@ rte_intr_callback_register(const struct rte_intr_handle *intr_handle, + } + rte_spinlock_unlock(&intr_lock); + +- return ret; ++ return 0; + fail: + /* clean up */ + if (src != NULL) { +- TAILQ_REMOVE(&(src->callbacks), callback, next); ++ if (callback != NULL) ++ TAILQ_REMOVE(&(src->callbacks), callback, next); + if (TAILQ_EMPTY(&(src->callbacks))) { + TAILQ_REMOVE(&intr_sources, src, next); + free(src); +diff --git a/dpdk/lib/librte_eal/freebsd/eal/eal_memory.c b/dpdk/lib/librte_eal/freebsd/eal/eal_memory.c +index a97d8f0f0c..5bc2da160c 100644 +--- a/dpdk/lib/librte_eal/freebsd/eal/eal_memory.c ++++ b/dpdk/lib/librte_eal/freebsd/eal/eal_memory.c +@@ -449,7 +449,7 @@ memseg_primary_init(void) + * + * we need (N*2)-1 segments because we cannot guarantee that + * each segment will be IOVA-contiguous with the previous one, +- * so we will allocate more and put spaces inbetween segments ++ * so we will allocate more and put spaces between segments + * that are non-contiguous. + */ + avail_segs = (hpi->num_pages[0] * 2) - 1; +diff --git a/dpdk/lib/librte_eal/linux/eal/eal.c b/dpdk/lib/librte_eal/linux/eal/eal.c +index c4233ec3c8..e6d4cc7178 100644 +--- a/dpdk/lib/librte_eal/linux/eal/eal.c ++++ b/dpdk/lib/librte_eal/linux/eal/eal.c +@@ -25,6 +25,7 @@ + #if defined(RTE_ARCH_X86) + #include + #endif ++#include + + #include + #include +@@ -1076,7 +1077,7 @@ rte_eal_init(int argc, char **argv) + #if defined(RTE_LIBRTE_KNI) && LINUX_VERSION_CODE >= KERNEL_VERSION(4, 10, 0) + } else if (rte_eal_check_module("rte_kni") == 1) { + iova_mode = RTE_IOVA_PA; +- RTE_LOG(DEBUG, EAL, "KNI is loaded, selecting IOVA as PA mode for better KNI perfomance.\n"); ++ RTE_LOG(DEBUG, EAL, "KNI is loaded, selecting IOVA as PA mode for better KNI performance.\n"); + #endif + } else if (is_iommu_enabled()) { + /* we have an IOMMU, pick IOVA as VA mode */ +diff --git a/dpdk/lib/librte_eal/linux/eal/eal_interrupts.c b/dpdk/lib/librte_eal/linux/eal/eal_interrupts.c +index 1955324d30..14ebb108ce 100644 +--- a/dpdk/lib/librte_eal/linux/eal/eal_interrupts.c ++++ b/dpdk/lib/librte_eal/linux/eal/eal_interrupts.c +@@ -1045,8 +1045,6 @@ eal_intr_handle_interrupts(int pfd, unsigned totalfds) + static __attribute__((noreturn)) void * + eal_intr_thread_main(__rte_unused void *arg) + { +- struct epoll_event ev; +- + /* host thread, never break out */ + for (;;) { + /* build up the epoll fd with all descriptors we are to +@@ -1078,8 +1076,11 @@ eal_intr_thread_main(__rte_unused void *arg) + rte_spinlock_lock(&intr_lock); + + TAILQ_FOREACH(src, &intr_sources, next) { ++ struct epoll_event ev; ++ + if (src->callbacks.tqh_first == NULL) + continue; /* skip those with no callbacks */ ++ memset(&ev, 0, sizeof(ev)); + ev.events = EPOLLIN | EPOLLPRI | EPOLLRDHUP | EPOLLHUP; + ev.data.fd = src->intr_handle.fd; + +diff --git a/dpdk/lib/librte_eal/linux/eal/eal_memalloc.c b/dpdk/lib/librte_eal/linux/eal/eal_memalloc.c +index af6d0d023a..678094acf9 100644 +--- a/dpdk/lib/librte_eal/linux/eal/eal_memalloc.c ++++ b/dpdk/lib/librte_eal/linux/eal/eal_memalloc.c +@@ -680,7 +680,7 @@ free_seg(struct rte_memseg *ms, struct hugepage_info *hi, + /* erase page data */ + memset(ms->addr, 0, ms->len); + +- if (mmap(ms->addr, ms->len, PROT_READ, ++ if (mmap(ms->addr, ms->len, PROT_NONE, + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0) == + MAP_FAILED) { + RTE_LOG(DEBUG, EAL, "couldn't unmap page\n"); +diff --git a/dpdk/lib/librte_eal/linux/eal/eal_memory.c 
b/dpdk/lib/librte_eal/linux/eal/eal_memory.c +index 43e4ffc757..7a9c97ff88 100644 +--- a/dpdk/lib/librte_eal/linux/eal/eal_memory.c ++++ b/dpdk/lib/librte_eal/linux/eal/eal_memory.c +@@ -1340,6 +1340,8 @@ eal_legacy_hugepage_init(void) + + /* hugetlbfs can be disabled */ + if (internal_config.no_hugetlbfs) { ++ void *prealloc_addr; ++ size_t mem_sz; + struct rte_memseg_list *msl; + int n_segs, cur_seg, fd, flags; + #ifdef MEMFD_SUPPORTED +@@ -1395,17 +1397,31 @@ eal_legacy_hugepage_init(void) + } + } + #endif +- addr = mmap(NULL, internal_config.memory, PROT_READ | PROT_WRITE, +- flags, fd, 0); +- if (addr == MAP_FAILED) { ++ /* preallocate address space for the memory, so that it can be ++ * fit into the DMA mask. ++ */ ++ mem_sz = internal_config.memory; ++ prealloc_addr = eal_get_virtual_area( ++ NULL, &mem_sz, page_sz, 0, 0); ++ if (prealloc_addr == NULL) { ++ RTE_LOG(ERR, EAL, ++ "%s: reserving memory area failed: " ++ "%s\n", ++ __func__, strerror(errno)); ++ return -1; ++ } ++ addr = mmap(prealloc_addr, mem_sz, PROT_READ | PROT_WRITE, ++ flags | MAP_FIXED, fd, 0); ++ if (addr == MAP_FAILED || addr != prealloc_addr) { + RTE_LOG(ERR, EAL, "%s: mmap() failed: %s\n", __func__, + strerror(errno)); ++ munmap(prealloc_addr, mem_sz); + return -1; + } + msl->base_va = addr; + msl->page_sz = page_sz; + msl->socket_id = 0; +- msl->len = internal_config.memory; ++ msl->len = mem_sz; + msl->heap = 1; + + /* we're in single-file segments mode, so only the segment list +@@ -1928,7 +1944,7 @@ eal_legacy_hugepage_attach(void) + if (flock(fd, LOCK_SH) < 0) { + RTE_LOG(DEBUG, EAL, "%s(): Locking file failed: %s\n", + __func__, strerror(errno)); +- goto fd_error; ++ goto mmap_error; + } + + /* find segment data */ +@@ -1936,13 +1952,13 @@ eal_legacy_hugepage_attach(void) + if (msl == NULL) { + RTE_LOG(DEBUG, EAL, "%s(): Cannot find memseg list\n", + __func__); +- goto fd_error; ++ goto mmap_error; + } + ms = rte_mem_virt2memseg(map_addr, msl); + if (ms == NULL) { + RTE_LOG(DEBUG, EAL, "%s(): Cannot find memseg\n", + __func__); +- goto fd_error; ++ goto mmap_error; + } + + msl_idx = msl - mcfg->memsegs; +@@ -1950,7 +1966,7 @@ eal_legacy_hugepage_attach(void) + if (ms_idx < 0) { + RTE_LOG(DEBUG, EAL, "%s(): Cannot find memseg idx\n", + __func__); +- goto fd_error; ++ goto mmap_error; + } + + /* store segment fd internally */ +@@ -1963,18 +1979,15 @@ eal_legacy_hugepage_attach(void) + close(fd_hugepage); + return 0; + ++mmap_error: ++ munmap(hp[i].final_va, hp[i].size); + fd_error: + close(fd); + error: +- /* map all segments into memory to make sure we get the addrs */ +- cur_seg = 0; +- for (cur_seg = 0; cur_seg < i; cur_seg++) { +- struct hugepage_file *hf = &hp[i]; +- size_t map_sz = hf->size; +- void *map_addr = hf->final_va; ++ /* unwind mmap's done so far */ ++ for (cur_seg = 0; cur_seg < i; cur_seg++) ++ munmap(hp[cur_seg].final_va, hp[cur_seg].size); + +- munmap(map_addr, map_sz); +- } + if (hp != NULL && hp != MAP_FAILED) + munmap(hp, size); + if (fd_hugepage >= 0) +diff --git a/dpdk/lib/librte_eal/linux/eal/eal_vfio.c b/dpdk/lib/librte_eal/linux/eal/eal_vfio.c +index 95f615c2e3..62ffe13e0e 100644 +--- a/dpdk/lib/librte_eal/linux/eal/eal_vfio.c ++++ b/dpdk/lib/librte_eal/linux/eal/eal_vfio.c +@@ -379,7 +379,7 @@ vfio_get_group_fd(struct vfio_config *vfio_cfg, + } + + vfio_group_fd = vfio_open_group_fd(iommu_group_num); +- if (vfio_group_fd < 0) { ++ if (vfio_group_fd <= 0) { + RTE_LOG(ERR, EAL, "Failed to open group %d\n", iommu_group_num); + return -1; + } +@@ -532,6 +532,17 @@ 
vfio_mem_event_callback(enum rte_mem_event type, const void *addr, size_t len, + return; + } + ++#ifdef RTE_ARCH_PPC_64 ++ ms = rte_mem_virt2memseg(addr, msl); ++ while (cur_len < len) { ++ int idx = rte_fbarray_find_idx(&msl->memseg_arr, ms); ++ ++ rte_fbarray_set_free(&msl->memseg_arr, idx); ++ cur_len += ms->len; ++ ++ms; ++ } ++ cur_len = 0; ++#endif + /* memsegs are contiguous in memory */ + ms = rte_mem_virt2memseg(addr, msl); + while (cur_len < len) { +@@ -551,6 +562,17 @@ vfio_mem_event_callback(enum rte_mem_event type, const void *addr, size_t len, + cur_len += ms->len; + ++ms; + } ++#ifdef RTE_ARCH_PPC_64 ++ cur_len = 0; ++ ms = rte_mem_virt2memseg(addr, msl); ++ while (cur_len < len) { ++ int idx = rte_fbarray_find_idx(&msl->memseg_arr, ms); ++ ++ rte_fbarray_set_used(&msl->memseg_arr, idx); ++ cur_len += ms->len; ++ ++ms; ++ } ++#endif + } + + static int +@@ -1027,6 +1049,7 @@ vfio_get_default_container_fd(void) + struct rte_mp_reply mp_reply = {0}; + struct timespec ts = {.tv_sec = 5, .tv_nsec = 0}; + struct vfio_mp_param *p = (struct vfio_mp_param *)mp_req.param; ++ int container_fd; + + if (default_vfio_cfg->vfio_enabled) + return default_vfio_cfg->vfio_container_fd; +@@ -1049,8 +1072,9 @@ vfio_get_default_container_fd(void) + mp_rep = &mp_reply.msgs[0]; + p = (struct vfio_mp_param *)mp_rep->param; + if (p->result == SOCKET_OK && mp_rep->num_fds == 1) { ++ container_fd = mp_rep->fds[0]; + free(mp_reply.msgs); +- return mp_rep->fds[0]; ++ return container_fd; + } + } + +@@ -1416,16 +1440,11 @@ vfio_spapr_dma_do_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova, + return 0; + } + +-struct spapr_remap_walk_param { +- int vfio_container_fd; +- uint64_t addr_64; +-}; +- + static int + vfio_spapr_map_walk(const struct rte_memseg_list *msl, + const struct rte_memseg *ms, void *arg) + { +- struct spapr_remap_walk_param *param = arg; ++ int *vfio_container_fd = arg; + + /* skip external memory that isn't a heap */ + if (msl->external && !msl->heap) +@@ -1435,10 +1454,7 @@ vfio_spapr_map_walk(const struct rte_memseg_list *msl, + if (ms->iova == RTE_BAD_IOVA) + return 0; + +- if (ms->addr_64 == param->addr_64) +- return 0; +- +- return vfio_spapr_dma_do_map(param->vfio_container_fd, ms->addr_64, ms->iova, ++ return vfio_spapr_dma_do_map(*vfio_container_fd, ms->addr_64, ms->iova, + ms->len, 1); + } + +@@ -1446,7 +1462,7 @@ static int + vfio_spapr_unmap_walk(const struct rte_memseg_list *msl, + const struct rte_memseg *ms, void *arg) + { +- struct spapr_remap_walk_param *param = arg; ++ int *vfio_container_fd = arg; + + /* skip external memory that isn't a heap */ + if (msl->external && !msl->heap) +@@ -1456,17 +1472,13 @@ vfio_spapr_unmap_walk(const struct rte_memseg_list *msl, + if (ms->iova == RTE_BAD_IOVA) + return 0; + +- if (ms->addr_64 == param->addr_64) +- return 0; +- +- return vfio_spapr_dma_do_map(param->vfio_container_fd, ms->addr_64, ms->iova, ++ return vfio_spapr_dma_do_map(*vfio_container_fd, ms->addr_64, ms->iova, + ms->len, 0); + } + + struct spapr_walk_param { + uint64_t window_size; + uint64_t hugepage_sz; +- uint64_t addr_64; + }; + + static int +@@ -1484,10 +1496,6 @@ vfio_spapr_window_size_walk(const struct rte_memseg_list *msl, + if (ms->iova == RTE_BAD_IOVA) + return 0; + +- /* do not iterate ms we haven't mapped yet */ +- if (param->addr_64 && ms->addr_64 == param->addr_64) +- return 0; +- + if (max > param->window_size) { + param->hugepage_sz = ms->hugepage_sz; + param->window_size = max; +@@ -1531,20 +1539,11 @@ vfio_spapr_create_new_dma_window(int 
vfio_container_fd, + /* try possible page_shift and levels for workaround */ + uint32_t levels; + +- for (levels = 1; levels <= info.ddw.levels; levels++) { +- uint32_t pgsizes = info.ddw.pgsizes; +- +- while (pgsizes != 0) { +- create->page_shift = 31 - __builtin_clz(pgsizes); +- create->levels = levels; +- ret = ioctl(vfio_container_fd, +- VFIO_IOMMU_SPAPR_TCE_CREATE, create); +- if (!ret) +- break; +- pgsizes &= ~(1 << create->page_shift); +- } +- if (!ret) +- break; ++ for (levels = create->levels + 1; ++ ret && levels <= info.ddw.levels; levels++) { ++ create->levels = levels; ++ ret = ioctl(vfio_container_fd, ++ VFIO_IOMMU_SPAPR_TCE_CREATE, create); + } + #endif + if (ret) { +@@ -1585,7 +1584,6 @@ vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova, + + /* check if window size needs to be adjusted */ + memset(¶m, 0, sizeof(param)); +- param.addr_64 = vaddr; + + /* we're inside a callback so use thread-unsafe version */ + if (rte_memseg_walk_thread_unsafe(vfio_spapr_window_size_walk, +@@ -1610,14 +1608,9 @@ vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova, + if (do_map) { + /* re-create window and remap the entire memory */ + if (iova + len > create.window_size) { +- struct spapr_remap_walk_param remap_param = { +- .vfio_container_fd = vfio_container_fd, +- .addr_64 = vaddr, +- }; +- + /* release all maps before recreating the window */ + if (rte_memseg_walk_thread_unsafe(vfio_spapr_unmap_walk, +- &remap_param) < 0) { ++ &vfio_container_fd) < 0) { + RTE_LOG(ERR, EAL, "Could not release DMA maps\n"); + ret = -1; + goto out; +@@ -1644,7 +1637,7 @@ vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova, + /* we're inside a callback, so use thread-unsafe version + */ + if (rte_memseg_walk_thread_unsafe(vfio_spapr_map_walk, +- &remap_param) < 0) { ++ &vfio_container_fd) < 0) { + RTE_LOG(ERR, EAL, "Could not recreate DMA maps\n"); + ret = -1; + goto out; +@@ -1691,7 +1684,6 @@ vfio_spapr_dma_map(int vfio_container_fd) + struct spapr_walk_param param; + + memset(¶m, 0, sizeof(param)); +- param.addr_64 = 0UL; + + /* create DMA window from 0 to max(phys_addr + len) */ + rte_memseg_walk(vfio_spapr_window_size_walk, ¶m); +diff --git a/dpdk/lib/librte_eal/windows/eal/include/sched.h b/dpdk/lib/librte_eal/windows/eal/include/sched.h +index 257060594c..29868c93d1 100644 +--- a/dpdk/lib/librte_eal/windows/eal/include/sched.h ++++ b/dpdk/lib/librte_eal/windows/eal/include/sched.h +@@ -14,8 +14,8 @@ + extern "C" { + #endif + +-#ifndef CPU_SET_SIZE +-#define CPU_SET_SIZE RTE_MAX_LCORE ++#ifndef CPU_SETSIZE ++#define CPU_SETSIZE RTE_MAX_LCORE + #endif + + #define _BITS_PER_SET (sizeof(long long) * 8) +@@ -26,7 +26,7 @@ extern "C" { + #define _WHICH_BIT(b) ((b) & (_BITS_PER_SET - 1)) + + typedef struct _rte_cpuset_s { +- long long _bits[_NUM_SETS(CPU_SET_SIZE)]; ++ long long _bits[_NUM_SETS(CPU_SETSIZE)]; + } rte_cpuset_t; + + #define CPU_SET(b, s) ((s)->_bits[_WHICH_SET(b)] |= (1LL << _WHICH_BIT(b))) +@@ -35,7 +35,7 @@ typedef struct _rte_cpuset_s { + do { \ + unsigned int _i; \ + \ +- for (_i = 0; _i < _NUM_SETS(CPU_SET_SIZE); _i++) \ ++ for (_i = 0; _i < _NUM_SETS(CPU_SETSIZE); _i++) \ + (s)->_bits[_i] = 0LL; \ + } while (0) + +diff --git a/dpdk/lib/librte_ethdev/ethdev_profile.h b/dpdk/lib/librte_ethdev/ethdev_profile.h +index 65031e6f3f..e5ee4df824 100644 +--- a/dpdk/lib/librte_ethdev/ethdev_profile.h ++++ b/dpdk/lib/librte_ethdev/ethdev_profile.h +@@ -24,4 +24,13 @@ + int + __rte_eth_dev_profile_init(uint16_t port_id, struct 
rte_eth_dev *dev); + ++#ifdef RTE_ETHDEV_PROFILE_WITH_VTUNE ++ ++uint16_t ++profile_hook_rx_burst_cb(uint16_t port_id, uint16_t queue_id, ++ struct rte_mbuf *pkts[], uint16_t nb_pkts, ++ uint16_t max_pkts, void *user_param); ++ ++#endif /* RTE_ETHDEV_PROFILE_WITH_VTUNE */ ++ + #endif +diff --git a/dpdk/lib/librte_ethdev/rte_ethdev.c b/dpdk/lib/librte_ethdev/rte_ethdev.c +index 6e9cb243ea..c3657509c5 100644 +--- a/dpdk/lib/librte_ethdev/rte_ethdev.c ++++ b/dpdk/lib/librte_ethdev/rte_ethdev.c +@@ -1166,14 +1166,14 @@ check_lro_pkt_size(uint16_t port_id, uint32_t config_size, + + /* + * Validate offloads that are requested through rte_eth_dev_configure against +- * the offloads successfuly set by the ethernet device. ++ * the offloads successfully set by the ethernet device. + * + * @param port_id + * The port identifier of the Ethernet device. + * @param req_offloads + * The offloads that have been requested through `rte_eth_dev_configure`. + * @param set_offloads +- * The offloads successfuly set by the ethernet device. ++ * The offloads successfully set by the ethernet device. + * @param offload_type + * The offload type i.e. Rx/Tx string. + * @param offload_name +@@ -1202,7 +1202,7 @@ validate_offloads(uint16_t port_id, uint64_t req_offloads, + ret = -EINVAL; + } + +- /* Chech if offload couldn't be disabled. */ ++ /* Check if offload couldn't be disabled. */ + if (offload & set_offloads) { + RTE_ETHDEV_LOG(DEBUG, + "Port %u %s offload %s is not requested but enabled\n", +@@ -2968,6 +2968,7 @@ rte_eth_dev_info_get(uint16_t port_id, struct rte_eth_dev_info *dev_info) + * return status and does not know if get is successful or not. + */ + memset(dev_info, 0, sizeof(struct rte_eth_dev_info)); ++ dev_info->switch_info.domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + dev = &rte_eth_devices[port_id]; +@@ -3253,53 +3254,53 @@ rte_eth_dev_set_vlan_offload(uint16_t port_id, int offload_mask) + int mask = 0; + int cur, org = 0; + uint64_t orig_offloads; +- uint64_t *dev_offloads; ++ uint64_t dev_offloads; + + RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV); + dev = &rte_eth_devices[port_id]; + + /* save original values in case of failure */ + orig_offloads = dev->data->dev_conf.rxmode.offloads; +- dev_offloads = &dev->data->dev_conf.rxmode.offloads; ++ dev_offloads = orig_offloads; + +- /*check which option changed by application*/ ++ /* check which option changed by application */ + cur = !!(offload_mask & ETH_VLAN_STRIP_OFFLOAD); +- org = !!(*dev_offloads & DEV_RX_OFFLOAD_VLAN_STRIP); ++ org = !!(dev_offloads & DEV_RX_OFFLOAD_VLAN_STRIP); + if (cur != org) { + if (cur) +- *dev_offloads |= DEV_RX_OFFLOAD_VLAN_STRIP; ++ dev_offloads |= DEV_RX_OFFLOAD_VLAN_STRIP; + else +- *dev_offloads &= ~DEV_RX_OFFLOAD_VLAN_STRIP; ++ dev_offloads &= ~DEV_RX_OFFLOAD_VLAN_STRIP; + mask |= ETH_VLAN_STRIP_MASK; + } + + cur = !!(offload_mask & ETH_VLAN_FILTER_OFFLOAD); +- org = !!(*dev_offloads & DEV_RX_OFFLOAD_VLAN_FILTER); ++ org = !!(dev_offloads & DEV_RX_OFFLOAD_VLAN_FILTER); + if (cur != org) { + if (cur) +- *dev_offloads |= DEV_RX_OFFLOAD_VLAN_FILTER; ++ dev_offloads |= DEV_RX_OFFLOAD_VLAN_FILTER; + else +- *dev_offloads &= ~DEV_RX_OFFLOAD_VLAN_FILTER; ++ dev_offloads &= ~DEV_RX_OFFLOAD_VLAN_FILTER; + mask |= ETH_VLAN_FILTER_MASK; + } + + cur = !!(offload_mask & ETH_VLAN_EXTEND_OFFLOAD); +- org = !!(*dev_offloads & DEV_RX_OFFLOAD_VLAN_EXTEND); ++ org = !!(dev_offloads & DEV_RX_OFFLOAD_VLAN_EXTEND); + if (cur != org) { + if (cur) +- *dev_offloads |= 
DEV_RX_OFFLOAD_VLAN_EXTEND; ++ dev_offloads |= DEV_RX_OFFLOAD_VLAN_EXTEND; + else +- *dev_offloads &= ~DEV_RX_OFFLOAD_VLAN_EXTEND; ++ dev_offloads &= ~DEV_RX_OFFLOAD_VLAN_EXTEND; + mask |= ETH_VLAN_EXTEND_MASK; + } + + cur = !!(offload_mask & ETH_QINQ_STRIP_OFFLOAD); +- org = !!(*dev_offloads & DEV_RX_OFFLOAD_QINQ_STRIP); ++ org = !!(dev_offloads & DEV_RX_OFFLOAD_QINQ_STRIP); + if (cur != org) { + if (cur) +- *dev_offloads |= DEV_RX_OFFLOAD_QINQ_STRIP; ++ dev_offloads |= DEV_RX_OFFLOAD_QINQ_STRIP; + else +- *dev_offloads &= ~DEV_RX_OFFLOAD_QINQ_STRIP; ++ dev_offloads &= ~DEV_RX_OFFLOAD_QINQ_STRIP; + mask |= ETH_QINQ_STRIP_MASK; + } + +@@ -3308,10 +3309,11 @@ rte_eth_dev_set_vlan_offload(uint16_t port_id, int offload_mask) + return ret; + + RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->vlan_offload_set, -ENOTSUP); ++ dev->data->dev_conf.rxmode.offloads = dev_offloads; + ret = (*dev->dev_ops->vlan_offload_set)(dev, mask); + if (ret) { + /* hit an error restore original values */ +- *dev_offloads = orig_offloads; ++ dev->data->dev_conf.rxmode.offloads = orig_offloads; + } + + return eth_err(port_id, ret); +@@ -4039,7 +4041,7 @@ rte_eth_dev_callback_unregister(uint16_t port_id, + next = TAILQ_NEXT(cb, next); + + if (cb->cb_fn != cb_fn || cb->event != event || +- (cb->cb_arg != (void *)-1 && cb->cb_arg != cb_arg)) ++ (cb_arg != (void *)-1 && cb->cb_arg != cb_arg)) + continue; + + /* +@@ -4452,7 +4454,7 @@ rte_eth_add_first_rx_callback(uint16_t port_id, uint16_t queue_id, + cb->param = user_param; + + rte_spinlock_lock(&rte_eth_rx_cb_lock); +- /* Add the callbacks at fisrt position*/ ++ /* Add the callbacks at first position */ + cb->next = rte_eth_devices[port_id].post_rx_burst_cbs[queue_id]; + rte_smp_wmb(); + rte_eth_devices[port_id].post_rx_burst_cbs[queue_id] = cb; +@@ -5064,8 +5066,7 @@ rte_eth_switch_domain_alloc(uint16_t *domain_id) + + *domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID; + +- for (i = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID + 1; +- i < RTE_MAX_ETHPORTS; i++) { ++ for (i = 0; i < RTE_MAX_ETHPORTS; i++) { + if (rte_eth_switch_domains[i].state == + RTE_ETH_SWITCH_DOMAIN_UNUSED) { + rte_eth_switch_domains[i].state = +diff --git a/dpdk/lib/librte_ethdev/rte_ethdev.h b/dpdk/lib/librte_ethdev/rte_ethdev.h +index 18a9defc24..d1a593ad11 100644 +--- a/dpdk/lib/librte_ethdev/rte_ethdev.h ++++ b/dpdk/lib/librte_ethdev/rte_ethdev.h +@@ -1196,7 +1196,7 @@ struct rte_eth_dev_portconf { + * Default values for switch domain id when ethdev does not support switch + * domain definitions. + */ +-#define RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID (0) ++#define RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID (UINT16_MAX) + + /** + * Ethernet device associated switch information +diff --git a/dpdk/lib/librte_ethdev/rte_ethdev_pci.h b/dpdk/lib/librte_ethdev/rte_ethdev_pci.h +index ccdbb46ec0..cca94ec864 100644 +--- a/dpdk/lib/librte_ethdev/rte_ethdev_pci.h ++++ b/dpdk/lib/librte_ethdev/rte_ethdev_pci.h +@@ -42,6 +42,8 @@ + + /** + * Copy pci device info to the Ethernet device data. ++ * Shared memory (eth_dev->data) only updated by primary process, so it is safe ++ * to call this function from both primary and secondary processes. + * + * @param eth_dev + * The *eth_dev* pointer is the address of the *rte_eth_dev* structure. 
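
The rte_ethdev_pci.h hunk that follows restricts writes to the shared eth_dev->data fields to the primary process, so that a secondary process attaching to an already-configured port does not clobber dev_flags, kdrv, or numa_node that the primary has set. A minimal sketch of that guard pattern, assuming DPDK's rte_eal_process_type() API; the helper name example_init_shared is illustrative only, not part of the patch:

#include <rte_eal.h>
#include <rte_ethdev.h>

/* Illustrative helper: populate shared port data from the primary
 * process only.  Secondary processes map the same rte_eth_dev_data
 * region and must not re-initialize it. */
static void
example_init_shared(struct rte_eth_dev *eth_dev, int numa_node)
{
	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
		return; /* secondary: data was already set up by the primary */

	eth_dev->data->numa_node = numa_node;
}

Running the previously unguarded assignments from a secondary process would zero dev_flags already configured by the primary, which is exactly the shared-memory overwrite the guard in the next hunk prevents.
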
+@@ -60,14 +62,16 @@ rte_eth_copy_pci_info(struct rte_eth_dev *eth_dev, + + eth_dev->intr_handle = &pci_dev->intr_handle; + +- eth_dev->data->dev_flags = 0; +- if (pci_dev->driver->drv_flags & RTE_PCI_DRV_INTR_LSC) +- eth_dev->data->dev_flags |= RTE_ETH_DEV_INTR_LSC; +- if (pci_dev->driver->drv_flags & RTE_PCI_DRV_INTR_RMV) +- eth_dev->data->dev_flags |= RTE_ETH_DEV_INTR_RMV; +- +- eth_dev->data->kdrv = pci_dev->kdrv; +- eth_dev->data->numa_node = pci_dev->device.numa_node; ++ if (rte_eal_process_type() == RTE_PROC_PRIMARY) { ++ eth_dev->data->dev_flags = 0; ++ if (pci_dev->driver->drv_flags & RTE_PCI_DRV_INTR_LSC) ++ eth_dev->data->dev_flags |= RTE_ETH_DEV_INTR_LSC; ++ if (pci_dev->driver->drv_flags & RTE_PCI_DRV_INTR_RMV) ++ eth_dev->data->dev_flags |= RTE_ETH_DEV_INTR_RMV; ++ ++ eth_dev->data->kdrv = pci_dev->kdrv; ++ eth_dev->data->numa_node = pci_dev->device.numa_node; ++ } + } + + static inline int +diff --git a/dpdk/lib/librte_ethdev/rte_flow.c b/dpdk/lib/librte_ethdev/rte_flow.c +index 87a3e8c4c6..391165646a 100644 +--- a/dpdk/lib/librte_ethdev/rte_flow.c ++++ b/dpdk/lib/librte_ethdev/rte_flow.c +@@ -19,7 +19,7 @@ + #include "rte_flow.h" + + /* Mbuf dynamic field name for metadata. */ +-int rte_flow_dynf_metadata_offs = -1; ++int32_t rte_flow_dynf_metadata_offs = -1; + + /* Mbuf dynamic field flag bit number for metadata. */ + uint64_t rte_flow_dynf_metadata_mask; +diff --git a/dpdk/lib/librte_ethdev/rte_flow.h b/dpdk/lib/librte_ethdev/rte_flow.h +index 452d359a16..693824da8a 100644 +--- a/dpdk/lib/librte_ethdev/rte_flow.h ++++ b/dpdk/lib/librte_ethdev/rte_flow.h +@@ -502,7 +502,7 @@ enum rte_flow_item_type { + */ + RTE_FLOW_ITEM_TYPE_HIGIG2, + +- /* ++ /** + * [META] + * + * Matches a tag value. +@@ -2531,7 +2531,7 @@ struct rte_flow_action_set_meta { + }; + + /* Mbuf dynamic field offset for metadata. */ +-extern int rte_flow_dynf_metadata_offs; ++extern int32_t rte_flow_dynf_metadata_offs; + + /* Mbuf dynamic field flag mask for metadata. 
*/ + extern uint64_t rte_flow_dynf_metadata_mask; +diff --git a/dpdk/lib/librte_eventdev/rte_eventdev.c b/dpdk/lib/librte_eventdev/rte_eventdev.c +index b987e07454..9aca7fbd52 100644 +--- a/dpdk/lib/librte_eventdev/rte_eventdev.c ++++ b/dpdk/lib/librte_eventdev/rte_eventdev.c +@@ -1364,14 +1364,17 @@ rte_event_pmd_allocate(const char *name, int socket_id) + + eventdev->data = eventdev_data; + +- strlcpy(eventdev->data->name, name, RTE_EVENTDEV_NAME_MAX_LEN); ++ if (rte_eal_process_type() == RTE_PROC_PRIMARY) { + +- eventdev->data->dev_id = dev_id; +- eventdev->data->socket_id = socket_id; +- eventdev->data->dev_started = 0; ++ strlcpy(eventdev->data->name, name, ++ RTE_EVENTDEV_NAME_MAX_LEN); + +- eventdev->attached = RTE_EVENTDEV_ATTACHED; ++ eventdev->data->dev_id = dev_id; ++ eventdev->data->socket_id = socket_id; ++ eventdev->data->dev_started = 0; ++ } + ++ eventdev->attached = RTE_EVENTDEV_ATTACHED; + eventdev_globals.nb_devs++; + } + +diff --git a/dpdk/lib/librte_eventdev/rte_eventdev_pmd_pci.h b/dpdk/lib/librte_eventdev/rte_eventdev_pmd_pci.h +index 8fb61386fd..443cd38c23 100644 +--- a/dpdk/lib/librte_eventdev/rte_eventdev_pmd_pci.h ++++ b/dpdk/lib/librte_eventdev/rte_eventdev_pmd_pci.h +@@ -112,9 +112,11 @@ rte_event_pmd_pci_remove(struct rte_pci_device *pci_dev, + if (eventdev == NULL) + return -ENODEV; + +- ret = rte_event_dev_close(eventdev->data->dev_id); +- if (ret < 0) +- return ret; ++ if (rte_eal_process_type() == RTE_PROC_PRIMARY) { ++ ret = rte_event_dev_close(eventdev->data->dev_id); ++ if (ret < 0) ++ return ret; ++ } + + /* Invoke PMD device un-init function */ + if (devuninit) +diff --git a/dpdk/lib/librte_fib/rte_fib.h b/dpdk/lib/librte_fib/rte_fib.h +index d06c5ef55a..af3bbf07ee 100644 +--- a/dpdk/lib/librte_fib/rte_fib.h ++++ b/dpdk/lib/librte_fib/rte_fib.h +@@ -14,6 +14,10 @@ + + #include + ++#ifdef __cplusplus ++extern "C" { ++#endif ++ + struct rte_fib; + struct rte_rib; + +@@ -185,4 +189,8 @@ __rte_experimental + struct rte_rib * + rte_fib_get_rib(struct rte_fib *fib); + ++#ifdef __cplusplus ++} ++#endif ++ + #endif /* _RTE_FIB_H_ */ +diff --git a/dpdk/lib/librte_fib/rte_fib6.h b/dpdk/lib/librte_fib/rte_fib6.h +index 4268704038..66c71c84c9 100644 +--- a/dpdk/lib/librte_fib/rte_fib6.h ++++ b/dpdk/lib/librte_fib/rte_fib6.h +@@ -14,6 +14,10 @@ + + #include + ++#ifdef __cplusplus ++extern "C" { ++#endif ++ + #define RTE_FIB6_IPV6_ADDR_SIZE 16 + /** Maximum depth value possible for IPv6 FIB. 
*/ + #define RTE_FIB6_MAXDEPTH 128 +@@ -190,4 +194,8 @@ __rte_experimental + struct rte_rib6 * + rte_fib6_get_rib(struct rte_fib6 *fib); + ++#ifdef __cplusplus ++} ++#endif ++ + #endif /* _RTE_FIB6_H_ */ +diff --git a/dpdk/lib/librte_fib/trie.c b/dpdk/lib/librte_fib/trie.c +index 124aa8b98b..2ae2add4f3 100644 +--- a/dpdk/lib/librte_fib/trie.c ++++ b/dpdk/lib/librte_fib/trie.c +@@ -240,9 +240,8 @@ tbl8_alloc(struct rte_trie_tbl *dp, uint64_t nh) + tbl8_idx = tbl8_get(dp); + if (tbl8_idx < 0) + return tbl8_idx; +- tbl8_ptr = (uint8_t *)dp->tbl8 + +- ((tbl8_idx * TRIE_TBL8_GRP_NUM_ENT) << +- dp->nh_sz); ++ tbl8_ptr = get_tbl_p_by_idx(dp->tbl8, ++ tbl8_idx * TRIE_TBL8_GRP_NUM_ENT, dp->nh_sz); + /*Init tbl8 entries with nexthop from tbl24*/ + write_to_dp((void *)tbl8_ptr, nh, dp->nh_sz, + TRIE_TBL8_GRP_NUM_ENT); +@@ -317,7 +316,7 @@ get_idx(const uint8_t *ip, uint32_t prev_idx, int bytes, int first_byte) + bitshift = (int8_t)(((first_byte + bytes - 1) - i)*BYTE_SIZE); + idx |= ip[i] << bitshift; + } +- return (prev_idx * 256) + idx; ++ return (prev_idx * TRIE_TBL8_GRP_NUM_ENT) + idx; + } + + static inline uint64_t +@@ -354,8 +353,8 @@ recycle_root_path(struct rte_trie_tbl *dp, const uint8_t *ip_part, + return; + + if (common_tbl8 != 0) { +- p = get_tbl_p_by_idx(dp->tbl8, (val >> 1) * 256 + *ip_part, +- dp->nh_sz); ++ p = get_tbl_p_by_idx(dp->tbl8, (val >> 1) * ++ TRIE_TBL8_GRP_NUM_ENT + *ip_part, dp->nh_sz); + recycle_root_path(dp, ip_part + 1, common_tbl8 - 1, p); + } + tbl8_recycle(dp, prev, val >> 1); +@@ -388,7 +387,8 @@ build_common_root(struct rte_trie_tbl *dp, const uint8_t *ip, + j = i; + cur_tbl = dp->tbl8; + } +- *tbl = get_tbl_p_by_idx(cur_tbl, prev_idx * 256, dp->nh_sz); ++ *tbl = get_tbl_p_by_idx(cur_tbl, prev_idx * TRIE_TBL8_GRP_NUM_ENT, ++ dp->nh_sz); + return 0; + } + +@@ -411,8 +411,8 @@ write_edge(struct rte_trie_tbl *dp, const uint8_t *ip_part, uint64_t next_hop, + return tbl8_idx; + val = (tbl8_idx << 1)|TRIE_EXT_ENT; + } +- p = get_tbl_p_by_idx(dp->tbl8, (tbl8_idx * 256) + *ip_part, +- dp->nh_sz); ++ p = get_tbl_p_by_idx(dp->tbl8, (tbl8_idx * ++ TRIE_TBL8_GRP_NUM_ENT) + *ip_part, dp->nh_sz); + ret = write_edge(dp, ip_part + 1, next_hop, len - 1, edge, p); + if (ret < 0) + return ret; +@@ -420,8 +420,8 @@ write_edge(struct rte_trie_tbl *dp, const uint8_t *ip_part, uint64_t next_hop, + write_to_dp((uint8_t *)p + (1 << dp->nh_sz), + next_hop << 1, dp->nh_sz, UINT8_MAX - *ip_part); + } else { +- write_to_dp(get_tbl_p_by_idx(dp->tbl8, tbl8_idx * 256, +- dp->nh_sz), ++ write_to_dp(get_tbl_p_by_idx(dp->tbl8, tbl8_idx * ++ TRIE_TBL8_GRP_NUM_ENT, dp->nh_sz), + next_hop << 1, dp->nh_sz, *ip_part); + } + tbl8_recycle(dp, &val, tbl8_idx); +diff --git a/dpdk/lib/librte_hash/meson.build b/dpdk/lib/librte_hash/meson.build +index 5d02b3084f..bce11ad9e0 100644 +--- a/dpdk/lib/librte_hash/meson.build ++++ b/dpdk/lib/librte_hash/meson.build +@@ -1,10 +1,7 @@ + # SPDX-License-Identifier: BSD-3-Clause + # Copyright(c) 2017 Intel Corporation + +-headers = files('rte_cmp_arm64.h', +- 'rte_cmp_x86.h', +- 'rte_crc_arm64.h', +- 'rte_cuckoo_hash.h', ++headers = files('rte_crc_arm64.h', + 'rte_fbk_hash.h', + 'rte_hash_crc.h', + 'rte_hash.h', +diff --git a/dpdk/lib/librte_hash/rte_hash.h b/dpdk/lib/librte_hash/rte_hash.h +index 0d73370dc4..ab7be1d528 100644 +--- a/dpdk/lib/librte_hash/rte_hash.h ++++ b/dpdk/lib/librte_hash/rte_hash.h +@@ -51,8 +51,6 @@ extern "C" { + + /** Flag to support lock free reader writer concurrency. Both single writer + * and multi writer use cases are supported. 
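/*
 * The trie.c hunks above replace the bare literal 256 with
 * TRIE_TBL8_GRP_NUM_ENT so the tbl8 group size has a single
 * authoritative definition. A sketch with a stand-in constant name:
 */
#define TBL8_GRP_NUM_ENT 256u	/* stand-in for TRIE_TBL8_GRP_NUM_ENT */

static inline unsigned int
tbl8_entry_index(unsigned int group, unsigned int byte)
{
	/* every index computation goes through the named constant */
	return group * TBL8_GRP_NUM_ENT + byte;
}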
+- * Currently, extendable bucket table feature is not supported with +- * this feature. + */ + #define RTE_HASH_EXTRA_FLAGS_RW_CONCURRENCY_LF 0x20 + +diff --git a/dpdk/lib/librte_ipsec/ipsec_sad.c b/dpdk/lib/librte_ipsec/ipsec_sad.c +index db2c44c804..31b5956d89 100644 +--- a/dpdk/lib/librte_ipsec/ipsec_sad.c ++++ b/dpdk/lib/librte_ipsec/ipsec_sad.c +@@ -94,6 +94,8 @@ add_specific(struct rte_ipsec_sad *sad, const void *key, + + /* Update a counter for a given SPI */ + ret = rte_hash_lookup(sad->hash[RTE_IPSEC_SAD_SPI_ONLY], key); ++ if (ret < 0) ++ return ret; + if (key_type == RTE_IPSEC_SAD_SPI_DIP) + sad->cnt_arr[ret].cnt_dip += notexist; + else +diff --git a/dpdk/lib/librte_ipsec/sa.h b/dpdk/lib/librte_ipsec/sa.h +index 51e69ad05a..0cfe82f634 100644 +--- a/dpdk/lib/librte_ipsec/sa.h ++++ b/dpdk/lib/librte_ipsec/sa.h +@@ -113,7 +113,7 @@ struct rte_ipsec_sa { + * sqn and replay window + * In case of SA handled by multiple threads *sqn* cacheline + * could be shared by multiple cores. +- * To minimise perfomance impact, we try to locate in a separate ++ * To minimise performance impact, we try to locate in a separate + * place from other frequently accesed data. + */ + union { +diff --git a/dpdk/lib/librte_kni/rte_kni.c b/dpdk/lib/librte_kni/rte_kni.c +index e388751e33..bcf82cc2d5 100644 +--- a/dpdk/lib/librte_kni/rte_kni.c ++++ b/dpdk/lib/librte_kni/rte_kni.c +@@ -145,31 +145,38 @@ kni_reserve_mz(struct rte_kni *kni) + char mz_name[RTE_MEMZONE_NAMESIZE]; + + snprintf(mz_name, RTE_MEMZONE_NAMESIZE, KNI_TX_Q_MZ_NAME_FMT, kni->name); +- kni->m_tx_q = rte_memzone_reserve(mz_name, KNI_FIFO_SIZE, SOCKET_ID_ANY, 0); ++ kni->m_tx_q = rte_memzone_reserve(mz_name, KNI_FIFO_SIZE, SOCKET_ID_ANY, ++ RTE_MEMZONE_IOVA_CONTIG); + KNI_MEM_CHECK(kni->m_tx_q == NULL, tx_q_fail); + + snprintf(mz_name, RTE_MEMZONE_NAMESIZE, KNI_RX_Q_MZ_NAME_FMT, kni->name); +- kni->m_rx_q = rte_memzone_reserve(mz_name, KNI_FIFO_SIZE, SOCKET_ID_ANY, 0); ++ kni->m_rx_q = rte_memzone_reserve(mz_name, KNI_FIFO_SIZE, SOCKET_ID_ANY, ++ RTE_MEMZONE_IOVA_CONTIG); + KNI_MEM_CHECK(kni->m_rx_q == NULL, rx_q_fail); + + snprintf(mz_name, RTE_MEMZONE_NAMESIZE, KNI_ALLOC_Q_MZ_NAME_FMT, kni->name); +- kni->m_alloc_q = rte_memzone_reserve(mz_name, KNI_FIFO_SIZE, SOCKET_ID_ANY, 0); ++ kni->m_alloc_q = rte_memzone_reserve(mz_name, KNI_FIFO_SIZE, SOCKET_ID_ANY, ++ RTE_MEMZONE_IOVA_CONTIG); + KNI_MEM_CHECK(kni->m_alloc_q == NULL, alloc_q_fail); + + snprintf(mz_name, RTE_MEMZONE_NAMESIZE, KNI_FREE_Q_MZ_NAME_FMT, kni->name); +- kni->m_free_q = rte_memzone_reserve(mz_name, KNI_FIFO_SIZE, SOCKET_ID_ANY, 0); ++ kni->m_free_q = rte_memzone_reserve(mz_name, KNI_FIFO_SIZE, SOCKET_ID_ANY, ++ RTE_MEMZONE_IOVA_CONTIG); + KNI_MEM_CHECK(kni->m_free_q == NULL, free_q_fail); + + snprintf(mz_name, RTE_MEMZONE_NAMESIZE, KNI_REQ_Q_MZ_NAME_FMT, kni->name); +- kni->m_req_q = rte_memzone_reserve(mz_name, KNI_FIFO_SIZE, SOCKET_ID_ANY, 0); ++ kni->m_req_q = rte_memzone_reserve(mz_name, KNI_FIFO_SIZE, SOCKET_ID_ANY, ++ RTE_MEMZONE_IOVA_CONTIG); + KNI_MEM_CHECK(kni->m_req_q == NULL, req_q_fail); + + snprintf(mz_name, RTE_MEMZONE_NAMESIZE, KNI_RESP_Q_MZ_NAME_FMT, kni->name); +- kni->m_resp_q = rte_memzone_reserve(mz_name, KNI_FIFO_SIZE, SOCKET_ID_ANY, 0); ++ kni->m_resp_q = rte_memzone_reserve(mz_name, KNI_FIFO_SIZE, SOCKET_ID_ANY, ++ RTE_MEMZONE_IOVA_CONTIG); + KNI_MEM_CHECK(kni->m_resp_q == NULL, resp_q_fail); + + snprintf(mz_name, RTE_MEMZONE_NAMESIZE, KNI_SYNC_ADDR_MZ_NAME_FMT, kni->name); +- kni->m_sync_addr = rte_memzone_reserve(mz_name, KNI_FIFO_SIZE, 
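/*
 * The ipsec_sad hunk above stops using a negative rte_hash_lookup()
 * result as an array index. The same shape, assuming a hypothetical
 * lookup callback that returns a negative errno on failure:
 */
struct spi_cnt {
	unsigned long cnt;
};

static int
bump_spi_counter(struct spi_cnt *cnt_arr,
		int (*lookup)(const void *key), const void *key)
{
	int ret = lookup(key);

	if (ret < 0)
		return ret;	/* propagate; never index with an error code */
	cnt_arr[ret].cnt++;
	return 0;
}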
SOCKET_ID_ANY, 0); ++ kni->m_sync_addr = rte_memzone_reserve(mz_name, KNI_FIFO_SIZE, SOCKET_ID_ANY, ++ RTE_MEMZONE_IOVA_CONTIG); + KNI_MEM_CHECK(kni->m_sync_addr == NULL, sync_addr_fail); + + return 0; +diff --git a/dpdk/lib/librte_kvargs/rte_kvargs.c b/dpdk/lib/librte_kvargs/rte_kvargs.c +index d39332999e..285081c86c 100644 +--- a/dpdk/lib/librte_kvargs/rte_kvargs.c ++++ b/dpdk/lib/librte_kvargs/rte_kvargs.c +@@ -50,6 +50,8 @@ rte_kvargs_tokenize(struct rte_kvargs *kvlist, const char *params) + /* Find the end of the list. */ + while (str[strlen(str) - 1] != ']') { + /* Restore the comma erased by strtok_r(). */ ++ if (ctx1 == NULL || ctx1[0] == '\0') ++ return -1; /* no closing bracket */ + str[strlen(str)] = ','; + /* Parse until next comma. */ + str = strtok_r(NULL, RTE_KVARGS_PAIRS_DELIM, &ctx1); +diff --git a/dpdk/lib/librte_kvargs/rte_kvargs.h b/dpdk/lib/librte_kvargs/rte_kvargs.h +index 1946195de4..eff598e08b 100644 +--- a/dpdk/lib/librte_kvargs/rte_kvargs.h ++++ b/dpdk/lib/librte_kvargs/rte_kvargs.h +@@ -171,7 +171,7 @@ unsigned rte_kvargs_count(const struct rte_kvargs *kvlist, + * 0 if the strings match. + * !0 otherwise or on error. + * +- * Unless strcmp, comparison ordering is not kept. ++ * Unlike strcmp, comparison ordering is not kept. + * In order for rte_kvargs_process to stop processing on match error, + * a negative value is returned even if strcmp had returned a positive one. + */ +diff --git a/dpdk/lib/librte_latencystats/rte_latencystats.c b/dpdk/lib/librte_latencystats/rte_latencystats.c +index 98e018939e..ba2fff3bcb 100644 +--- a/dpdk/lib/librte_latencystats/rte_latencystats.c ++++ b/dpdk/lib/librte_latencystats/rte_latencystats.c +@@ -42,6 +42,7 @@ struct rte_latency_stats { + float avg_latency; /**< Average latency in nano seconds */ + float max_latency; /**< Maximum latency in nano seconds */ + float jitter; /** Latency variation */ ++ rte_spinlock_t lock; /** Latency calculation lock */ + }; + + static struct rte_latency_stats *glob_stats; +@@ -164,6 +165,7 @@ calc_latency(uint16_t pid __rte_unused, + latency[cnt++] = now - pkts[i]->timestamp; + } + ++ rte_spinlock_lock(&glob_stats->lock); + for (i = 0; i < cnt; i++) { + /* + * The jitter is calculated as statistical mean of interpacket +@@ -193,6 +195,7 @@ calc_latency(uint16_t pid __rte_unused, + alpha * (latency[i] - glob_stats->avg_latency); + prev_latency = latency[i]; + } ++ rte_spinlock_unlock(&glob_stats->lock); + + return nb_pkts; + } +@@ -223,6 +226,7 @@ rte_latencystats_init(uint64_t app_samp_intvl, + } + + glob_stats = mz->addr; ++ rte_spinlock_init(&glob_stats->lock); + samp_intvl = app_samp_intvl * latencystat_cycles_per_ns(); + + /** Register latency stats with stats library */ +diff --git a/dpdk/lib/librte_lpm/meson.build b/dpdk/lib/librte_lpm/meson.build +index 27ce45b531..021ac6d8d4 100644 +--- a/dpdk/lib/librte_lpm/meson.build ++++ b/dpdk/lib/librte_lpm/meson.build +@@ -7,4 +7,3 @@ headers = files('rte_lpm.h', 'rte_lpm6.h') + # without worrying about which architecture we actually need + headers += files('rte_lpm_altivec.h', 'rte_lpm_neon.h', 'rte_lpm_sse.h') + deps += ['hash'] +-use_function_versioning = true +diff --git a/dpdk/lib/librte_lpm/rte_lpm.c b/dpdk/lib/librte_lpm/rte_lpm.c +index b78c487447..2687564194 100644 +--- a/dpdk/lib/librte_lpm/rte_lpm.c ++++ b/dpdk/lib/librte_lpm/rte_lpm.c +@@ -22,7 +22,6 @@ + #include + #include + #include +-#include + + #include "rte_lpm.h" + +diff --git a/dpdk/lib/librte_lpm/rte_lpm6.c b/dpdk/lib/librte_lpm/rte_lpm6.c +index c46e557e23..6e1b18d6fd 
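/*
 * The latencystats hunks above add a spinlock so concurrent RX/TX
 * callbacks cannot interleave updates to the shared statistics block.
 * A minimal sketch with the real rte_spinlock API:
 */
#include <rte_spinlock.h>

struct lat_stats {
	float avg_latency;
	rte_spinlock_t lock;
};

static void
lat_stats_update(struct lat_stats *s, float sample)
{
	rte_spinlock_lock(&s->lock);
	/* exponentially weighted average, only ever mutated under the lock */
	s->avg_latency = 0.98f * s->avg_latency + 0.02f * sample;
	rte_spinlock_unlock(&s->lock);
}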
100644 +--- a/dpdk/lib/librte_lpm/rte_lpm6.c ++++ b/dpdk/lib/librte_lpm/rte_lpm6.c +@@ -25,7 +25,6 @@ + #include + #include + #include +-#include + + #include "rte_lpm6.h" + +@@ -727,7 +726,8 @@ add_step(struct rte_lpm6 *lpm, struct rte_lpm6_tbl_entry *tbl, + tbl8_group_start = tbl8_gindex * + RTE_LPM6_TBL8_GROUP_NUM_ENTRIES; + memset(&lpm->tbl8[tbl8_group_start], 0, +- RTE_LPM6_TBL8_GROUP_NUM_ENTRIES); ++ RTE_LPM6_TBL8_GROUP_NUM_ENTRIES * ++ sizeof(struct rte_lpm6_tbl_entry)); + + /* init the new table's header: + * save the reference to the owner table +@@ -814,7 +814,7 @@ add_step(struct rte_lpm6 *lpm, struct rte_lpm6_tbl_entry *tbl, + * + * Returns: + * 0 on success +- * -ENOSPC not enought tbl8 left ++ * -ENOSPC not enough tbl8 left + */ + static int + simulate_add(struct rte_lpm6 *lpm, const uint8_t *masked_ip, uint8_t depth) +@@ -844,7 +844,7 @@ simulate_add(struct rte_lpm6 *lpm, const uint8_t *masked_ip, uint8_t depth) + } + + if (tbl8_available(lpm) < total_need_tbl_nb) +- /* not enought tbl8 to add a rule */ ++ /* not enough tbl8 to add a rule */ + return -ENOSPC; + + return 0; +@@ -1212,7 +1212,7 @@ rule_find_range(struct rte_lpm6 *lpm, const uint8_t *ip, uint8_t depth, + /* minus top level */ + depth -= 24; + +- /* interate through levels (tbl8s) ++ /* iterate through levels (tbl8s) + * until we reach the last one + */ + while (depth > 8) { +diff --git a/dpdk/lib/librte_mbuf/rte_mbuf.h b/dpdk/lib/librte_mbuf/rte_mbuf.h +index 219b110b76..6d080527f6 100644 +--- a/dpdk/lib/librte_mbuf/rte_mbuf.h ++++ b/dpdk/lib/librte_mbuf/rte_mbuf.h +@@ -1535,7 +1535,7 @@ static inline int rte_pktmbuf_trim(struct rte_mbuf *m, uint16_t len) + static inline int rte_pktmbuf_is_contiguous(const struct rte_mbuf *m) + { + __rte_mbuf_sanity_check(m, 1); +- return !!(m->nb_segs == 1); ++ return m->nb_segs == 1; + } + + /** +diff --git a/dpdk/lib/librte_mempool/rte_mempool.c b/dpdk/lib/librte_mempool/rte_mempool.c +index 78d8eb941e..08906df9ee 100644 +--- a/dpdk/lib/librte_mempool/rte_mempool.c ++++ b/dpdk/lib/librte_mempool/rte_mempool.c +@@ -297,8 +297,8 @@ mempool_ops_alloc_once(struct rte_mempool *mp) + * zone. Return the number of objects added, or a negative value + * on error. 
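/*
 * The rte_lpm6 hunk above fixes a classic sizing bug: clearing N table
 * entries passed N as the byte count. Generic shape of the fix, with a
 * hypothetical entry type:
 */
#include <string.h>
#include <stddef.h>

struct tbl_entry {
	unsigned int next_hop;
	unsigned int flags;
};

static void
clear_tbl8_group(struct tbl_entry *tbl, size_t group_start, size_t nb_entries)
{
	/* byte count = entry count * entry size, never the count alone */
	memset(&tbl[group_start], 0, nb_entries * sizeof(struct tbl_entry));
}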
+ */ +-int +-rte_mempool_populate_iova(struct rte_mempool *mp, char *vaddr, ++static int ++__rte_mempool_populate_iova(struct rte_mempool *mp, char *vaddr, + rte_iova_t iova, size_t len, rte_mempool_memchunk_free_cb_t *free_cb, + void *opaque) + { +@@ -332,7 +332,7 @@ rte_mempool_populate_iova(struct rte_mempool *mp, char *vaddr, + off = RTE_PTR_ALIGN_CEIL(vaddr, RTE_MEMPOOL_ALIGN) - vaddr; + + if (off > len) { +- ret = -EINVAL; ++ ret = 0; + goto fail; + } + +@@ -343,7 +343,7 @@ rte_mempool_populate_iova(struct rte_mempool *mp, char *vaddr, + + /* not enough room to store one object */ + if (i == 0) { +- ret = -EINVAL; ++ ret = 0; + goto fail; + } + +@@ -356,6 +356,21 @@ rte_mempool_populate_iova(struct rte_mempool *mp, char *vaddr, + return ret; + } + ++int ++rte_mempool_populate_iova(struct rte_mempool *mp, char *vaddr, ++ rte_iova_t iova, size_t len, rte_mempool_memchunk_free_cb_t *free_cb, ++ void *opaque) ++{ ++ int ret; ++ ++ ret = __rte_mempool_populate_iova(mp, vaddr, iova, len, free_cb, ++ opaque); ++ if (ret == 0) ++ ret = -EINVAL; ++ ++ return ret; ++} ++ + static rte_iova_t + get_iova(void *addr) + { +@@ -406,8 +421,10 @@ rte_mempool_populate_virt(struct rte_mempool *mp, char *addr, + break; + } + +- ret = rte_mempool_populate_iova(mp, addr + off, iova, ++ ret = __rte_mempool_populate_iova(mp, addr + off, iova, + phys_len, free_cb, opaque); ++ if (ret == 0) ++ continue; + if (ret < 0) + goto fail; + /* no need to call the free callback for next chunks */ +@@ -415,6 +432,9 @@ rte_mempool_populate_virt(struct rte_mempool *mp, char *addr, + cnt += ret; + } + ++ if (cnt == 0) ++ return -EINVAL; ++ + return cnt; + + fail: +@@ -463,6 +483,7 @@ rte_mempool_populate_default(struct rte_mempool *mp) + unsigned mz_id, n; + int ret; + bool need_iova_contig_obj; ++ size_t max_alloc_size = SIZE_MAX; + + ret = mempool_ops_alloc_once(mp); + if (ret != 0) +@@ -542,30 +563,24 @@ rte_mempool_populate_default(struct rte_mempool *mp) + if (min_chunk_size == (size_t)mem_size) + mz_flags |= RTE_MEMZONE_IOVA_CONTIG; + +- mz = rte_memzone_reserve_aligned(mz_name, mem_size, ++ /* Allocate a memzone, retrying with a smaller area on ENOMEM */ ++ do { ++ mz = rte_memzone_reserve_aligned(mz_name, ++ RTE_MIN((size_t)mem_size, max_alloc_size), + mp->socket_id, mz_flags, align); + +- /* don't try reserving with 0 size if we were asked to reserve +- * IOVA-contiguous memory. 
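/*
 * The mempool hunks above split populate_iova() into an internal helper
 * that may return 0 ("nothing fit in this chunk, keep iterating") and a
 * public wrapper that turns a bare 0 back into -EINVAL, preserving the
 * documented contract. A self-contained sketch with hypothetical sizing:
 */
#include <errno.h>
#include <stddef.h>

static int
populate_chunk(char *vaddr, size_t len)
{
	(void)vaddr;
	if (len < 64)
		return 0;		/* no object fits; caller may continue */
	return (int)(len / 64);		/* number of objects placed */
}

int
populate(char *vaddr, size_t len)
{
	int ret = populate_chunk(vaddr, len);

	if (ret == 0)
		ret = -EINVAL;	/* the public API still reports failure */
	return ret;
}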
+- */ +- if (min_chunk_size < (size_t)mem_size && mz == NULL) { +- /* not enough memory, retry with the biggest zone we +- * have +- */ +- mz = rte_memzone_reserve_aligned(mz_name, 0, +- mp->socket_id, mz_flags, align); +- } ++ if (mz == NULL && rte_errno != ENOMEM) ++ break; ++ ++ max_alloc_size = RTE_MIN(max_alloc_size, ++ (size_t)mem_size) / 2; ++ } while (mz == NULL && max_alloc_size >= min_chunk_size); ++ + if (mz == NULL) { + ret = -rte_errno; + goto fail; + } + +- if (mz->len < min_chunk_size) { +- rte_memzone_free(mz); +- ret = -ENOMEM; +- goto fail; +- } +- + if (need_iova_contig_obj) + iova = mz->iova; + else +@@ -645,8 +660,10 @@ rte_mempool_populate_anon(struct rte_mempool *mp) + } + + ret = mempool_ops_alloc_once(mp); +- if (ret != 0) +- return ret; ++ if (ret < 0) { ++ rte_errno = -ret; ++ return 0; ++ } + + size = get_anon_size(mp); + if (size < 0) { +@@ -670,8 +687,10 @@ rte_mempool_populate_anon(struct rte_mempool *mp) + + ret = rte_mempool_populate_virt(mp, addr, size, getpagesize(), + rte_mempool_memchunk_anon_free, addr); +- if (ret == 0) ++ if (ret < 0) { ++ rte_errno = -ret; + goto fail; ++ } + + return mp->populated_size; + +diff --git a/dpdk/lib/librte_mempool/rte_mempool.h b/dpdk/lib/librte_mempool/rte_mempool.h +index f81152af96..4907c0808e 100644 +--- a/dpdk/lib/librte_mempool/rte_mempool.h ++++ b/dpdk/lib/librte_mempool/rte_mempool.h +@@ -1167,8 +1167,8 @@ int rte_mempool_populate_default(struct rte_mempool *mp); + * A pointer to the mempool structure. + * @return + * The number of objects added on success. +- * On error, the chunk is not added in the memory list of the +- * mempool and a negative errno is returned. ++ * On error, 0 is returned, rte_errno is set, and the chunk is not added in ++ * the memory list of the mempool. 
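/*
 * populate_default() above now retries the memzone reservation, halving
 * the request on ENOMEM until it would fall below the minimum chunk
 * size. A standalone sketch of the loop with a hypothetical allocator
 * callback:
 */
#include <errno.h>
#include <stdint.h>
#include <stddef.h>

void *
alloc_retry_halving(void *(*try_alloc)(size_t len, int *err),
		size_t mem_size, size_t min_chunk_size)
{
	size_t max_alloc_size = SIZE_MAX;
	void *p;
	int err = 0;

	do {
		size_t req = mem_size < max_alloc_size ?
				mem_size : max_alloc_size;

		p = try_alloc(req, &err);
		if (p == NULL && err != ENOMEM)
			break;			/* hard failure: stop retrying */
		max_alloc_size = req / 2;	/* shrink the next attempt */
	} while (p == NULL && max_alloc_size >= min_chunk_size);

	return p;	/* NULL if even min_chunk_size was not served */
}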
+ */ + int rte_mempool_populate_anon(struct rte_mempool *mp); + +@@ -1653,7 +1653,7 @@ rte_mempool_in_use_count(const struct rte_mempool *mp); + static inline int + rte_mempool_full(const struct rte_mempool *mp) + { +- return !!(rte_mempool_avail_count(mp) == mp->size); ++ return rte_mempool_avail_count(mp) == mp->size; + } + + /** +@@ -1672,7 +1672,7 @@ rte_mempool_full(const struct rte_mempool *mp) + static inline int + rte_mempool_empty(const struct rte_mempool *mp) + { +- return !!(rte_mempool_avail_count(mp) == 0); ++ return rte_mempool_avail_count(mp) == 0; + } + + /** +diff --git a/dpdk/lib/librte_mempool/rte_mempool_version.map b/dpdk/lib/librte_mempool/rte_mempool_version.map +index d002dfc46f..d67ed2e2b9 100644 +--- a/dpdk/lib/librte_mempool/rte_mempool_version.map ++++ b/dpdk/lib/librte_mempool/rte_mempool_version.map +@@ -4,18 +4,14 @@ DPDK_20.0 { + rte_mempool_audit; + rte_mempool_avail_count; + rte_mempool_cache_create; +- rte_mempool_cache_flush; + rte_mempool_cache_free; + rte_mempool_calc_obj_size; + rte_mempool_check_cookies; + rte_mempool_contig_blocks_check_cookies; + rte_mempool_create; + rte_mempool_create_empty; +- rte_mempool_default_cache; + rte_mempool_dump; + rte_mempool_free; +- rte_mempool_generic_get; +- rte_mempool_generic_put; + rte_mempool_in_use_count; + rte_mempool_list_dump; + rte_mempool_lookup; +diff --git a/dpdk/lib/librte_pci/rte_pci.c b/dpdk/lib/librte_pci/rte_pci.c +index a753cf3eca..5f7726fa89 100644 +--- a/dpdk/lib/librte_pci/rte_pci.c ++++ b/dpdk/lib/librte_pci/rte_pci.c +@@ -20,6 +20,7 @@ + #include + #include + #include ++#include + + #include "rte_pci.h" + +@@ -34,6 +35,12 @@ get_u8_pciaddr_field(const char *in, void *_u8, char dlm) + if (*in == '\0') + return NULL; + ++ /* PCI field starting with spaces is forbidden. ++ * Negative wrap-around is not reported as an error by strtoul. ++ */ ++ if (*in == ' ' || *in == '-') ++ return NULL; ++ + errno = 0; + val = strtoul(in, &end, 16); + if (errno != 0 || end[0] != dlm || val > UINT8_MAX) { +@@ -69,11 +76,17 @@ pci_dbdf_parse(const char *input, struct rte_pci_addr *dev_addr) + unsigned long val; + char *end; + ++ /* PCI id starting with spaces is forbidden. ++ * Negative wrap-around is not reported as an error by strtoul. ++ */ ++ if (*in == ' ' || *in == '-') ++ return -EINVAL; ++ + errno = 0; + val = strtoul(in, &end, 16); +- if (errno != 0 || end[0] != ':' || val > UINT16_MAX) ++ if (errno != 0 || end[0] != ':' || val > UINT32_MAX) + return -EINVAL; +- dev_addr->domain = (uint16_t)val; ++ dev_addr->domain = (uint32_t)val; + in = end + 1; + in = get_u8_pciaddr_field(in, &dev_addr->bus, ':'); + if (in == NULL) +diff --git a/dpdk/lib/librte_pci/rte_pci.h b/dpdk/lib/librte_pci/rte_pci.h +index c87891405c..4087771c1e 100644 +--- a/dpdk/lib/librte_pci/rte_pci.h ++++ b/dpdk/lib/librte_pci/rte_pci.h +@@ -17,16 +17,10 @@ extern "C" { + #endif + + #include +-#include + #include +-#include + #include +-#include + #include + +-#include +-#include +- + /** Formatting string for PCI device identifier: Ex: 0000:00:01.0 */ + #define PCI_PRI_FMT "%.4" PRIx16 ":%.2" PRIx8 ":%.2" PRIx8 ".%" PRIx8 + #define PCI_PRI_STR_SIZE sizeof("XXXXXXXX:XX:XX.X") +diff --git a/dpdk/lib/librte_security/rte_security.c b/dpdk/lib/librte_security/rte_security.c +index bc81ce15d1..dc9a3e89cd 100644 +--- a/dpdk/lib/librte_security/rte_security.c ++++ b/dpdk/lib/librte_security/rte_security.c +@@ -1,6 +1,7 @@ + /* SPDX-License-Identifier: BSD-3-Clause + * Copyright 2017 NXP. + * Copyright(c) 2017 Intel Corporation. 
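/*
 * The rte_pci hunks above reject a leading space or '-' before calling
 * strtoul(), since strtoul() silently skips whitespace and wraps
 * negative input around. One field parser in the same shape:
 */
#include <errno.h>
#include <stdlib.h>

static int
parse_hex_u8(const char *in, unsigned char *out, char delim)
{
	unsigned long val;
	char *end;

	if (*in == ' ' || *in == '-')	/* strtoul() would accept both */
		return -EINVAL;
	errno = 0;
	val = strtoul(in, &end, 16);
	if (errno != 0 || *end != delim || val > 0xff)
		return -EINVAL;
	*out = (unsigned char)val;
	return 0;
}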
++ * Copyright (c) 2020 Samsung Electronics Co., Ltd All Rights Reserved + */ + + #include +@@ -9,6 +10,19 @@ + #include "rte_security.h" + #include "rte_security_driver.h" + ++/* Macro to check for invalid pointers */ ++#define RTE_PTR_OR_ERR_RET(ptr, retval) do { \ ++ if ((ptr) == NULL) \ ++ return retval; \ ++} while (0) ++ ++/* Macro to check for invalid pointers chains */ ++#define RTE_PTR_CHAIN3_OR_ERR_RET(p1, p2, p3, retval, last_retval) do { \ ++ RTE_PTR_OR_ERR_RET(p1, retval); \ ++ RTE_PTR_OR_ERR_RET(p1->p2, retval); \ ++ RTE_PTR_OR_ERR_RET(p1->p2->p3, last_retval); \ ++} while (0) ++ + struct rte_security_session * + rte_security_session_create(struct rte_security_ctx *instance, + struct rte_security_session_conf *conf, +@@ -16,10 +30,9 @@ rte_security_session_create(struct rte_security_ctx *instance, + { + struct rte_security_session *sess = NULL; + +- if (conf == NULL) +- return NULL; +- +- RTE_FUNC_PTR_OR_ERR_RET(*instance->ops->session_create, NULL); ++ RTE_PTR_CHAIN3_OR_ERR_RET(instance, ops, session_create, NULL, NULL); ++ RTE_PTR_OR_ERR_RET(conf, NULL); ++ RTE_PTR_OR_ERR_RET(mp, NULL); + + if (rte_mempool_get(mp, (void **)&sess)) + return NULL; +@@ -38,14 +51,19 @@ rte_security_session_update(struct rte_security_ctx *instance, + struct rte_security_session *sess, + struct rte_security_session_conf *conf) + { +- RTE_FUNC_PTR_OR_ERR_RET(*instance->ops->session_update, -ENOTSUP); ++ RTE_PTR_CHAIN3_OR_ERR_RET(instance, ops, session_update, -EINVAL, ++ -ENOTSUP); ++ RTE_PTR_OR_ERR_RET(sess, -EINVAL); ++ RTE_PTR_OR_ERR_RET(conf, -EINVAL); ++ + return instance->ops->session_update(instance->device, sess, conf); + } + + unsigned int + rte_security_session_get_size(struct rte_security_ctx *instance) + { +- RTE_FUNC_PTR_OR_ERR_RET(*instance->ops->session_get_size, 0); ++ RTE_PTR_CHAIN3_OR_ERR_RET(instance, ops, session_get_size, 0, 0); ++ + return instance->ops->session_get_size(instance->device); + } + +@@ -54,7 +72,11 @@ rte_security_session_stats_get(struct rte_security_ctx *instance, + struct rte_security_session *sess, + struct rte_security_stats *stats) + { +- RTE_FUNC_PTR_OR_ERR_RET(*instance->ops->session_stats_get, -ENOTSUP); ++ RTE_PTR_CHAIN3_OR_ERR_RET(instance, ops, session_stats_get, -EINVAL, ++ -ENOTSUP); ++ /* Parameter sess can be NULL in case of getting global statistics. 
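/*
 * The rte_security hunks above validate the instance -> ops -> callback
 * pointer chain before every dispatch instead of dereferencing blindly.
 * A self-contained version of the same macro pattern:
 */
#include <errno.h>

#define PTR_OR_ERR_RET(ptr, retval) do { \
	if ((ptr) == NULL)               \
		return retval;           \
} while (0)

struct sec_ops {
	int (*session_update)(void *dev);
};

struct sec_ctx {
	void *device;
	struct sec_ops *ops;
};

static int
sec_session_update(struct sec_ctx *instance)
{
	PTR_OR_ERR_RET(instance, -EINVAL);
	PTR_OR_ERR_RET(instance->ops, -EINVAL);
	PTR_OR_ERR_RET(instance->ops->session_update, -ENOTSUP);
	return instance->ops->session_update(instance->device);
}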
*/ ++ RTE_PTR_OR_ERR_RET(stats, -EINVAL); ++ + return instance->ops->session_stats_get(instance->device, sess, stats); + } + +@@ -64,16 +86,20 @@ rte_security_session_destroy(struct rte_security_ctx *instance, + { + int ret; + +- RTE_FUNC_PTR_OR_ERR_RET(*instance->ops->session_destroy, -ENOTSUP); ++ RTE_PTR_CHAIN3_OR_ERR_RET(instance, ops, session_destroy, -EINVAL, ++ -ENOTSUP); ++ RTE_PTR_OR_ERR_RET(sess, -EINVAL); ++ ++ ret = instance->ops->session_destroy(instance->device, sess); ++ if (ret != 0) ++ return ret; ++ ++ rte_mempool_put(rte_mempool_from_obj(sess), (void *)sess); + + if (instance->sess_cnt) + instance->sess_cnt--; + +- ret = instance->ops->session_destroy(instance->device, sess); +- if (!ret) +- rte_mempool_put(rte_mempool_from_obj(sess), (void *)sess); +- +- return ret; ++ return 0; + } + + int +@@ -81,6 +107,11 @@ rte_security_set_pkt_metadata(struct rte_security_ctx *instance, + struct rte_security_session *sess, + struct rte_mbuf *m, void *params) + { ++#ifdef RTE_DEBUG ++ RTE_PTR_OR_ERR_RET(sess, -EINVAL); ++ RTE_PTR_OR_ERR_RET(instance, -EINVAL); ++ RTE_PTR_OR_ERR_RET(instance->ops, -EINVAL); ++#endif + RTE_FUNC_PTR_OR_ERR_RET(*instance->ops->set_pkt_metadata, -ENOTSUP); + return instance->ops->set_pkt_metadata(instance->device, + sess, m, params); +@@ -91,6 +122,10 @@ rte_security_get_userdata(struct rte_security_ctx *instance, uint64_t md) + { + void *userdata = NULL; + ++#ifdef RTE_DEBUG ++ RTE_PTR_OR_ERR_RET(instance, NULL); ++ RTE_PTR_OR_ERR_RET(instance->ops, NULL); ++#endif + RTE_FUNC_PTR_OR_ERR_RET(*instance->ops->get_userdata, NULL); + if (instance->ops->get_userdata(instance->device, md, &userdata)) + return NULL; +@@ -101,7 +136,8 @@ rte_security_get_userdata(struct rte_security_ctx *instance, uint64_t md) + const struct rte_security_capability * + rte_security_capabilities_get(struct rte_security_ctx *instance) + { +- RTE_FUNC_PTR_OR_ERR_RET(*instance->ops->capabilities_get, NULL); ++ RTE_PTR_CHAIN3_OR_ERR_RET(instance, ops, capabilities_get, NULL, NULL); ++ + return instance->ops->capabilities_get(instance->device); + } + +@@ -113,7 +149,9 @@ rte_security_capability_get(struct rte_security_ctx *instance, + const struct rte_security_capability *capability; + uint16_t i = 0; + +- RTE_FUNC_PTR_OR_ERR_RET(*instance->ops->capabilities_get, NULL); ++ RTE_PTR_CHAIN3_OR_ERR_RET(instance, ops, capabilities_get, NULL, NULL); ++ RTE_PTR_OR_ERR_RET(idx, NULL); ++ + capabilities = instance->ops->capabilities_get(instance->device); + + if (capabilities == NULL) +@@ -121,7 +159,7 @@ rte_security_capability_get(struct rte_security_ctx *instance, + + while ((capability = &capabilities[i++])->action + != RTE_SECURITY_ACTION_TYPE_NONE) { +- if (capability->action == idx->action && ++ if (capability->action == idx->action && + capability->protocol == idx->protocol) { + if (idx->protocol == RTE_SECURITY_PROTOCOL_IPSEC) { + if (capability->ipsec.proto == +diff --git a/dpdk/lib/librte_security/rte_security.h b/dpdk/lib/librte_security/rte_security.h +index 546779df2b..b4b4eb2d85 100644 +--- a/dpdk/lib/librte_security/rte_security.h ++++ b/dpdk/lib/librte_security/rte_security.h +@@ -374,7 +374,7 @@ rte_security_session_create(struct rte_security_ctx *instance, + * @param conf update configuration parameters + * @return + * - On success returns 0 +- * - On failure return errno ++ * - On failure returns a negative errno value. 
+ */ + __rte_experimental + int +@@ -399,12 +399,14 @@ rte_security_session_get_size(struct rte_security_ctx *instance); + * return it to its original mempool. + * + * @param instance security instance +- * @param sess security session to freed ++ * @param sess security session to be freed + * + * @return + * - 0 if successful. +- * - -EINVAL if session is NULL. ++ * - -EINVAL if session or context instance is NULL. + * - -EBUSY if not all device private data has been freed. ++ * - -ENOTSUP if destroying private data is not supported. ++ * - other negative values in case of freeing private data errors. + */ + int + rte_security_session_destroy(struct rte_security_ctx *instance, +diff --git a/dpdk/lib/librte_telemetry/rte_telemetry_parser.c b/dpdk/lib/librte_telemetry/rte_telemetry_parser.c +index 9601323970..e8c269e85e 100644 +--- a/dpdk/lib/librte_telemetry/rte_telemetry_parser.c ++++ b/dpdk/lib/librte_telemetry/rte_telemetry_parser.c +@@ -456,9 +456,9 @@ rte_telemetry_command_ports_stats_values_by_name(struct telemetry_impl + size_t index; + json_t *value; + ++ memset(&ep, 0, sizeof(ep)); + ep.pp.num_port_ids = json_array_size(port_ids_json); + ep.pp.num_metric_ids = num_stat_names; +- memset(&ep, 0, sizeof(ep)); + if (telemetry == NULL) { + TELEMETRY_LOG_ERR("Invalid telemetry argument"); + return -1; +diff --git a/dpdk/lib/librte_timer/meson.build b/dpdk/lib/librte_timer/meson.build +index b7edfe2e7d..d3b828ce9d 100644 +--- a/dpdk/lib/librte_timer/meson.build ++++ b/dpdk/lib/librte_timer/meson.build +@@ -4,4 +4,3 @@ + sources = files('rte_timer.c') + headers = files('rte_timer.h') + allow_experimental_apis = true +-use_function_versioning = true +diff --git a/dpdk/lib/librte_timer/rte_timer.c b/dpdk/lib/librte_timer/rte_timer.c +index ca88454ff6..99862a3ba1 100644 +--- a/dpdk/lib/librte_timer/rte_timer.c ++++ b/dpdk/lib/librte_timer/rte_timer.c +@@ -26,7 +26,6 @@ + #include + #include + #include +-#include + + #include "rte_timer.h" + +@@ -146,11 +145,13 @@ rte_timer_subsystem_init(void) + const size_t mem_size = data_arr_size + sizeof(*rte_timer_mz_refcnt); + bool do_full_init = true; + +- if (rte_timer_subsystem_initialized) +- return -EALREADY; +- + rte_mcfg_timer_lock(); + ++ if (rte_timer_subsystem_initialized) { ++ rte_mcfg_timer_unlock(); ++ return -EALREADY; ++ } ++ + mz = rte_memzone_lookup(mz_name); + if (mz == NULL) { + mz = rte_memzone_reserve_aligned(mz_name, mem_size, +@@ -184,27 +185,29 @@ rte_timer_subsystem_init(void) + rte_timer_data_arr[default_data_id].internal_flags |= FL_ALLOCATED; + (*rte_timer_mz_refcnt)++; + +- rte_mcfg_timer_unlock(); +- + rte_timer_subsystem_initialized = 1; + ++ rte_mcfg_timer_unlock(); ++ + return 0; + } + + void + rte_timer_subsystem_finalize(void) + { +- if (!rte_timer_subsystem_initialized) +- return; +- + rte_mcfg_timer_lock(); + ++ if (!rte_timer_subsystem_initialized) { ++ rte_mcfg_timer_unlock(); ++ return; ++ } ++ + if (--(*rte_timer_mz_refcnt) == 0) + rte_memzone_free(rte_timer_data_mz); + +- rte_mcfg_timer_unlock(); +- + rte_timer_subsystem_initialized = 0; ++ ++ rte_mcfg_timer_unlock(); + } + + /* Initialize the timer handle tim for use */ +diff --git a/dpdk/lib/librte_vhost/iotlb.c b/dpdk/lib/librte_vhost/iotlb.c +index 4a1d8c1253..07443a94bc 100644 +--- a/dpdk/lib/librte_vhost/iotlb.c ++++ b/dpdk/lib/librte_vhost/iotlb.c +@@ -308,8 +308,9 @@ vhost_user_iotlb_init(struct virtio_net *dev, int vq_index) + TAILQ_INIT(&vq->iotlb_list); + TAILQ_INIT(&vq->iotlb_pending_list); + +- snprintf(pool_name, sizeof(pool_name), 
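/*
 * The rte_timer hunks above move the "already initialized?" test inside
 * the lock, so two processes cannot both pass the check before either
 * sets the flag. The same pattern with hypothetical lock callbacks
 * standing in for rte_mcfg_timer_lock()/unlock():
 */
#include <errno.h>

static int subsystem_initialized;

int
subsystem_init(void (*lock)(void), void (*unlock)(void))
{
	lock();
	if (subsystem_initialized) {	/* checked under the lock */
		unlock();
		return -EALREADY;
	}
	/* ... one-time shared setup would happen here ... */
	subsystem_initialized = 1;	/* published before unlocking */
	unlock();
	return 0;
}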
"iotlb_cache_%d_%d", +- dev->vid, vq_index); ++ snprintf(pool_name, sizeof(pool_name), "iotlb_%u_%d_%d", ++ getpid(), dev->vid, vq_index); ++ RTE_LOG(DEBUG, VHOST_CONFIG, "IOTLB cache name: %s\n", pool_name); + + /* If already created, free it and recreate */ + vq->iotlb_pool = rte_mempool_lookup(pool_name); +diff --git a/dpdk/lib/librte_vhost/rte_vhost.h b/dpdk/lib/librte_vhost/rte_vhost.h +index 7b5dc87c2e..532ee0dec7 100644 +--- a/dpdk/lib/librte_vhost/rte_vhost.h ++++ b/dpdk/lib/librte_vhost/rte_vhost.h +@@ -68,6 +68,10 @@ extern "C" { + #define VHOST_USER_PROTOCOL_F_PAGEFAULT 8 + #endif + ++#ifndef VHOST_USER_PROTOCOL_F_CONFIG ++#define VHOST_USER_PROTOCOL_F_CONFIG 9 ++#endif ++ + #ifndef VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD + #define VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD 10 + #endif +@@ -85,6 +89,7 @@ extern "C" { + #define VHOST_USER_F_PROTOCOL_FEATURES 30 + #endif + ++ + /** + * Information relating to memory regions including offsets to + * addresses in QEMUs memory file. +@@ -253,7 +258,7 @@ struct vhost_device_ops { + + /** + * This callback gets called each time a guest gets notified +- * about waiting packets. This is the interrupt handling trough ++ * about waiting packets. This is the interrupt handling through + * the eventfd_write(callfd), which can be used for counting these + * "slow" syscalls. + */ +diff --git a/dpdk/lib/librte_vhost/socket.c b/dpdk/lib/librte_vhost/socket.c +index ebb2ff6c28..2461549fea 100644 +--- a/dpdk/lib/librte_vhost/socket.c ++++ b/dpdk/lib/librte_vhost/socket.c +@@ -127,7 +127,8 @@ read_fd_message(int sockfd, char *buf, int buflen, int *fds, int max_fds, + + ret = recvmsg(sockfd, &msgh, 0); + if (ret <= 0) { +- RTE_LOG(ERR, VHOST_CONFIG, "recvmsg failed\n"); ++ if (ret) ++ RTE_LOG(ERR, VHOST_CONFIG, "recvmsg failed\n"); + return ret; + } + +@@ -318,16 +319,16 @@ vhost_user_read_cb(int connfd, void *dat, int *remove) + + vhost_destroy_device(conn->vid); + ++ if (vsocket->reconnect) { ++ create_unix_socket(vsocket); ++ vhost_user_start_client(vsocket); ++ } ++ + pthread_mutex_lock(&vsocket->conn_mutex); + TAILQ_REMOVE(&vsocket->conn_list, conn, next); + pthread_mutex_unlock(&vsocket->conn_mutex); + + free(conn); +- +- if (vsocket->reconnect) { +- create_unix_socket(vsocket); +- vhost_user_start_client(vsocket); +- } + } + } + +@@ -877,6 +878,7 @@ rte_vhost_driver_register(const char *path, uint64_t flags) + "error: failed to init connection mutex\n"); + goto out_free; + } ++ vsocket->vdpa_dev_id = -1; + vsocket->dequeue_zero_copy = flags & RTE_VHOST_USER_DEQUEUE_ZERO_COPY; + vsocket->extbuf = flags & RTE_VHOST_USER_EXTBUF_SUPPORT; + vsocket->linearbuf = flags & RTE_VHOST_USER_LINEARBUF_SUPPORT; +@@ -924,6 +926,12 @@ rte_vhost_driver_register(const char *path, uint64_t flags) + ret = -1; + goto out_mutex; + } ++ if ((flags & RTE_VHOST_USER_CLIENT) != 0) { ++ RTE_LOG(ERR, VHOST_CONFIG, ++ "error: zero copy is incompatible with vhost client mode\n"); ++ ret = -1; ++ goto out_mutex; ++ } + vsocket->supported_features &= ~(1ULL << VIRTIO_F_IN_ORDER); + vsocket->features &= ~(1ULL << VIRTIO_F_IN_ORDER); + +@@ -1051,9 +1059,10 @@ rte_vhost_driver_unregister(const char *path) + next = TAILQ_NEXT(conn, next); + + /* +- * If r/wcb is executing, release the +- * conn_mutex lock, and try again since +- * the r/wcb may use the conn_mutex lock. ++ * If r/wcb is executing, release vsocket's ++ * conn_mutex and vhost_user's mutex locks, and ++ * try again since the r/wcb may use the ++ * conn_mutex and mutex locks. 
+ */ + if (fdset_try_del(&vhost_user.fdset, + conn->connfd) == -1) { +@@ -1074,8 +1083,17 @@ rte_vhost_driver_unregister(const char *path) + pthread_mutex_unlock(&vsocket->conn_mutex); + + if (vsocket->is_server) { +- fdset_del(&vhost_user.fdset, +- vsocket->socket_fd); ++ /* ++ * If r/wcb is executing, release vhost_user's ++ * mutex lock, and try again since the r/wcb ++ * may use the mutex lock. ++ */ ++ if (fdset_try_del(&vhost_user.fdset, ++ vsocket->socket_fd) == -1) { ++ pthread_mutex_unlock(&vhost_user.mutex); ++ goto again; ++ } ++ + close(vsocket->socket_fd); + unlink(path); + } else if (vsocket->reconnect) { +diff --git a/dpdk/lib/librte_vhost/vhost.c b/dpdk/lib/librte_vhost/vhost.c +index 1cbe948f74..20fda61518 100644 +--- a/dpdk/lib/librte_vhost/vhost.c ++++ b/dpdk/lib/librte_vhost/vhost.c +@@ -350,6 +350,57 @@ free_device(struct virtio_net *dev) + rte_free(dev); + } + ++static __rte_always_inline int ++log_translate(struct virtio_net *dev, struct vhost_virtqueue *vq) ++{ ++ if (likely(!(vq->ring_addrs.flags & (1 << VHOST_VRING_F_LOG)))) ++ return 0; ++ ++ vq->log_guest_addr = translate_log_addr(dev, vq, ++ vq->ring_addrs.log_guest_addr); ++ if (vq->log_guest_addr == 0) ++ return -1; ++ ++ return 0; ++} ++ ++/* ++ * Converts vring log address to GPA ++ * If IOMMU is enabled, the log address is IOVA ++ * If IOMMU not enabled, the log address is already GPA ++ * ++ * Caller should have iotlb_lock read-locked ++ */ ++uint64_t ++translate_log_addr(struct virtio_net *dev, struct vhost_virtqueue *vq, ++ uint64_t log_addr) ++{ ++ if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) { ++ const uint64_t exp_size = sizeof(uint64_t); ++ uint64_t hva, gpa; ++ uint64_t size = exp_size; ++ ++ hva = vhost_iova_to_vva(dev, vq, log_addr, ++ &size, VHOST_ACCESS_RW); ++ ++ if (size != exp_size) ++ return 0; ++ ++ gpa = hva_to_gpa(dev, hva, exp_size); ++ if (!gpa) { ++ RTE_LOG(ERR, VHOST_CONFIG, ++ "VQ: Failed to find GPA for log_addr: 0x%" ++ PRIx64 " hva: 0x%" PRIx64 "\n", ++ log_addr, hva); ++ return 0; ++ } ++ return gpa; ++ ++ } else ++ return log_addr; ++} ++ ++/* Caller should have iotlb_lock read-locked */ + static int + vring_translate_split(struct virtio_net *dev, struct vhost_virtqueue *vq) + { +@@ -388,6 +439,7 @@ vring_translate_split(struct virtio_net *dev, struct vhost_virtqueue *vq) + return 0; + } + ++/* Caller should have iotlb_lock read-locked */ + static int + vring_translate_packed(struct virtio_net *dev, struct vhost_virtqueue *vq) + { +@@ -434,6 +486,10 @@ vring_translate(struct virtio_net *dev, struct vhost_virtqueue *vq) + if (vring_translate_split(dev, vq) < 0) + return -1; + } ++ ++ if (log_translate(dev, vq) < 0) ++ return -1; ++ + vq->access_ok = 1; + + return 0; +diff --git a/dpdk/lib/librte_vhost/vhost.h b/dpdk/lib/librte_vhost/vhost.h +index 9f11b28a31..844904ca3b 100644 +--- a/dpdk/lib/librte_vhost/vhost.h ++++ b/dpdk/lib/librte_vhost/vhost.h +@@ -462,14 +462,23 @@ static __rte_always_inline void + vhost_log_cache_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint64_t offset, uint64_t len) + { +- vhost_log_cache_write(dev, vq, vq->log_guest_addr + offset, len); ++ if (unlikely(dev->features & (1ULL << VHOST_F_LOG_ALL))) { ++ if (unlikely(vq->log_guest_addr == 0)) ++ return; ++ __vhost_log_cache_write(dev, vq, vq->log_guest_addr + offset, ++ len); ++ } + } + + static __rte_always_inline void + vhost_log_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq, + uint64_t offset, uint64_t len) + { +- vhost_log_write(dev, vq->log_guest_addr 
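/*
 * The vhost.h hunks here make dirty-page logging a no-op unless
 * VHOST_F_LOG_ALL was negotiated and the log base actually translated.
 * A sketch of the two guards (the bit value is illustrative):
 */
#include <stdint.h>

#define F_LOG_ALL (1ULL << 26)	/* mirrors VHOST_F_LOG_ALL */

static void
log_used_vring(uint64_t features, uint64_t log_guest_addr,
		uint64_t offset, uint64_t len,
		void (*log_write)(uint64_t addr, uint64_t len))
{
	if (!(features & F_LOG_ALL))
		return;		/* migration logging not negotiated */
	if (log_guest_addr == 0)
		return;		/* log base never resolved: nothing to mark */
	log_write(log_guest_addr + offset, len);
}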
+ offset, len); ++ if (unlikely(dev->features & (1ULL << VHOST_F_LOG_ALL))) { ++ if (unlikely(vq->log_guest_addr == 0)) ++ return; ++ __vhost_log_write(dev, vq->log_guest_addr + offset, len); ++ } + } + + static __rte_always_inline void +@@ -528,7 +537,6 @@ vhost_log_write_iova(struct virtio_net *dev, struct vhost_virtqueue *vq, + #define PRINT_PACKET(device, addr, size, header) do {} while (0) + #endif + +-extern uint64_t VHOST_FEATURES; + #define MAX_VHOST_DEVICE 1024 + extern struct virtio_net *vhost_devices[MAX_VHOST_DEVICE]; + +@@ -620,6 +628,8 @@ void *vhost_alloc_copy_ind_table(struct virtio_net *dev, + struct vhost_virtqueue *vq, + uint64_t desc_addr, uint64_t desc_len); + int vring_translate(struct virtio_net *dev, struct vhost_virtqueue *vq); ++uint64_t translate_log_addr(struct virtio_net *dev, struct vhost_virtqueue *vq, ++ uint64_t log_addr); + void vring_invalidate(struct virtio_net *dev, struct vhost_virtqueue *vq); + + static __rte_always_inline uint64_t +diff --git a/dpdk/lib/librte_vhost/vhost_crypto.c b/dpdk/lib/librte_vhost/vhost_crypto.c +index 684fddc30b..0f9df4059d 100644 +--- a/dpdk/lib/librte_vhost/vhost_crypto.c ++++ b/dpdk/lib/librte_vhost/vhost_crypto.c +@@ -40,7 +40,8 @@ + (1 << VIRTIO_RING_F_EVENT_IDX) | \ + (1 << VIRTIO_CRYPTO_SERVICE_CIPHER) | \ + (1 << VIRTIO_CRYPTO_SERVICE_MAC) | \ +- (1 << VIRTIO_NET_F_CTRL_VQ)) ++ (1 << VIRTIO_NET_F_CTRL_VQ) | \ ++ (1 << VHOST_USER_PROTOCOL_F_CONFIG)) + + #define IOVA_TO_VVA(t, r, a, l, p) \ + ((t)(uintptr_t)vhost_iova_to_vva(r->dev, r->vq, a, l, p)) +@@ -237,6 +238,11 @@ transform_cipher_param(struct rte_crypto_sym_xform *xform, + if (unlikely(ret < 0)) + return ret; + ++ if (param->cipher_key_len > VHOST_USER_CRYPTO_MAX_CIPHER_KEY_LENGTH) { ++ VC_LOG_DBG("Invalid cipher key length\n"); ++ return -VIRTIO_CRYPTO_BADMSG; ++ } ++ + xform->type = RTE_CRYPTO_SYM_XFORM_CIPHER; + xform->cipher.key.length = param->cipher_key_len; + if (xform->cipher.key.length > 0) +@@ -287,6 +293,12 @@ transform_chain_param(struct rte_crypto_sym_xform *xforms, + &xform_cipher->cipher.algo); + if (unlikely(ret < 0)) + return ret; ++ ++ if (param->cipher_key_len > VHOST_USER_CRYPTO_MAX_CIPHER_KEY_LENGTH) { ++ VC_LOG_DBG("Invalid cipher key length\n"); ++ return -VIRTIO_CRYPTO_BADMSG; ++ } ++ + xform_cipher->type = RTE_CRYPTO_SYM_XFORM_CIPHER; + xform_cipher->cipher.key.length = param->cipher_key_len; + xform_cipher->cipher.key.data = param->cipher_key_buf; +@@ -301,6 +313,12 @@ transform_chain_param(struct rte_crypto_sym_xform *xforms, + ret = auth_algo_transform(param->hash_algo, &xform_auth->auth.algo); + if (unlikely(ret < 0)) + return ret; ++ ++ if (param->auth_key_len > VHOST_USER_CRYPTO_MAX_HMAC_KEY_LENGTH) { ++ VC_LOG_DBG("Invalid auth key length\n"); ++ return -VIRTIO_CRYPTO_BADMSG; ++ } ++ + xform_auth->auth.digest_length = param->digest_len; + xform_auth->auth.key.length = param->auth_key_len; + xform_auth->auth.key.data = param->auth_key_buf; +@@ -1539,18 +1557,18 @@ rte_vhost_crypto_fetch_requests(int vid, uint32_t qid, + + if (unlikely(dev == NULL)) { + VC_LOG_ERR("Invalid vid %i", vid); +- return -EINVAL; ++ return 0; + } + + if (unlikely(qid >= VHOST_MAX_QUEUE_PAIRS)) { + VC_LOG_ERR("Invalid qid %u", qid); +- return -EINVAL; ++ return 0; + } + + vcrypto = (struct vhost_crypto *)dev->extern_data; + if (unlikely(vcrypto == NULL)) { + VC_LOG_ERR("Cannot find required data, is it initialized?"); +- return -ENOENT; ++ return 0; + } + + vq = dev->virtqueue[qid]; +@@ -1572,7 +1590,7 @@ rte_vhost_crypto_fetch_requests(int vid, uint32_t 
qid, + if (unlikely(rte_mempool_get_bulk(vcrypto->mbuf_pool, + (void **)mbufs, count * 2) < 0)) { + VC_LOG_ERR("Insufficient memory"); +- return -ENOMEM; ++ return 0; + } + + for (i = 0; i < count; i++) { +@@ -1602,7 +1620,7 @@ rte_vhost_crypto_fetch_requests(int vid, uint32_t qid, + if (unlikely(rte_mempool_get_bulk(vcrypto->mbuf_pool, + (void **)mbufs, count) < 0)) { + VC_LOG_ERR("Insufficient memory"); +- return -ENOMEM; ++ return 0; + } + + for (i = 0; i < count; i++) { +diff --git a/dpdk/lib/librte_vhost/vhost_user.c b/dpdk/lib/librte_vhost/vhost_user.c +index 0cfb8b792b..31080be2bd 100644 +--- a/dpdk/lib/librte_vhost/vhost_user.c ++++ b/dpdk/lib/librte_vhost/vhost_user.c +@@ -206,7 +206,7 @@ vhost_backend_cleanup(struct virtio_net *dev) + dev->inflight_info->addr = NULL; + } + +- if (dev->inflight_info->fd > 0) { ++ if (dev->inflight_info->fd >= 0) { + close(dev->inflight_info->fd); + dev->inflight_info->fd = -1; + } +@@ -656,13 +656,11 @@ ring_addr_to_vva(struct virtio_net *dev, struct vhost_virtqueue *vq, + { + if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) { + uint64_t vva; +- uint64_t req_size = *size; + +- vva = vhost_user_iotlb_cache_find(vq, ra, ++ vhost_user_iotlb_rd_lock(vq); ++ vva = vhost_iova_to_vva(dev, vq, ra, + size, VHOST_ACCESS_RW); +- if (req_size != *size) +- vhost_user_iotlb_miss(dev, (ra + *size), +- VHOST_ACCESS_RW); ++ vhost_user_iotlb_rd_unlock(vq); + + return vva; + } +@@ -670,37 +668,16 @@ ring_addr_to_vva(struct virtio_net *dev, struct vhost_virtqueue *vq, + return qva_to_vva(dev, ra, size); + } + +-/* +- * Converts vring log address to GPA +- * If IOMMU is enabled, the log address is IOVA +- * If IOMMU not enabled, the log address is already GPA +- */ + static uint64_t +-translate_log_addr(struct virtio_net *dev, struct vhost_virtqueue *vq, +- uint64_t log_addr) ++log_addr_to_gpa(struct virtio_net *dev, struct vhost_virtqueue *vq) + { +- if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) { +- const uint64_t exp_size = sizeof(struct vring_used) + +- sizeof(struct vring_used_elem) * vq->size; +- uint64_t hva, gpa; +- uint64_t size = exp_size; +- +- hva = vhost_iova_to_vva(dev, vq, log_addr, +- &size, VHOST_ACCESS_RW); +- if (size != exp_size) +- return 0; ++ uint64_t log_gpa; + +- gpa = hva_to_gpa(dev, hva, exp_size); +- if (!gpa) { +- RTE_LOG(ERR, VHOST_CONFIG, +- "VQ: Failed to find GPA for log_addr: 0x%" PRIx64 " hva: 0x%" PRIx64 "\n", +- log_addr, hva); +- return 0; +- } +- return gpa; ++ vhost_user_iotlb_rd_lock(vq); ++ log_gpa = translate_log_addr(dev, vq, vq->ring_addrs.log_guest_addr); ++ vhost_user_iotlb_rd_unlock(vq); + +- } else +- return log_addr; ++ return log_gpa; + } + + static struct virtio_net * +@@ -712,7 +689,7 @@ translate_ring_addresses(struct virtio_net *dev, int vq_index) + + if (addr->flags & (1 << VHOST_VRING_F_LOG)) { + vq->log_guest_addr = +- translate_log_addr(dev, vq, addr->log_guest_addr); ++ log_addr_to_gpa(dev, vq); + if (vq->log_guest_addr == 0) { + RTE_LOG(DEBUG, VHOST_CONFIG, + "(%d) failed to map log_guest_addr.\n", +@@ -1145,6 +1122,21 @@ vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *msg, + goto err_mmap; + } + mmap_size = RTE_ALIGN_CEIL(mmap_size, alignment); ++ if (mmap_size == 0) { ++ /* ++ * It could happen if initial mmap_size + alignment ++ * overflows the sizeof uint64, which could happen if ++ * either mmap_size or alignment value is wrong. ++ * ++ * mmap() kernel implementation would return an error, ++ * but better catch it before and provide useful info ++ * in the logs. 
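/*
 * ring_addr_to_vva() above now takes the IOTLB read lock around the
 * whole IOVA-to-VVA translation instead of probing the cache unlocked
 * and issuing the miss by hand. The locking shape, with hypothetical
 * callbacks:
 */
#include <stdint.h>

static uint64_t
translate_locked(void (*rd_lock)(void), void (*rd_unlock)(void),
		uint64_t (*translate)(uint64_t iova), uint64_t iova)
{
	uint64_t vva;

	rd_lock();
	vva = translate(iova);	/* lookup and miss handling under the lock */
	rd_unlock();
	return vva;
}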
++ */ ++ RTE_LOG(ERR, VHOST_CONFIG, "mmap size (0x%" PRIx64 ") " ++ "or alignment (0x%" PRIx64 ") is invalid\n", ++ reg->size + mmap_offset, alignment); ++ goto err_mmap; ++ } + + populate = (dev->dequeue_zero_copy) ? MAP_POPULATE : 0; + mmap_addr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, +@@ -1298,7 +1290,8 @@ vq_is_ready(struct virtio_net *dev, struct vhost_virtqueue *vq) + return false; + + if (vq_is_packed(dev)) +- rings_ok = !!vq->desc_packed; ++ rings_ok = vq->desc_packed && vq->driver_event && ++ vq->device_event; + else + rings_ok = vq->desc && vq->avail && vq->used; + +@@ -1415,6 +1408,7 @@ vhost_user_get_inflight_fd(struct virtio_net **pdev, + "failed to alloc dev inflight area\n"); + return RTE_VHOST_MSG_RESULT_ERR; + } ++ dev->inflight_info->fd = -1; + } + + num_queues = msg->payload.inflight.num_queues; +@@ -1440,6 +1434,16 @@ vhost_user_get_inflight_fd(struct virtio_net **pdev, + } + memset(addr, 0, mmap_size); + ++ if (dev->inflight_info->addr) { ++ munmap(dev->inflight_info->addr, dev->inflight_info->size); ++ dev->inflight_info->addr = NULL; ++ } ++ ++ if (dev->inflight_info->fd >= 0) { ++ close(dev->inflight_info->fd); ++ dev->inflight_info->fd = -1; ++ } ++ + dev->inflight_info->addr = addr; + dev->inflight_info->size = msg->payload.inflight.mmap_size = mmap_size; + dev->inflight_info->fd = msg->fds[0] = fd; +@@ -1522,10 +1526,13 @@ vhost_user_set_inflight_fd(struct virtio_net **pdev, VhostUserMsg *msg, + "failed to alloc dev inflight area\n"); + return RTE_VHOST_MSG_RESULT_ERR; + } ++ dev->inflight_info->fd = -1; + } + +- if (dev->inflight_info->addr) ++ if (dev->inflight_info->addr) { + munmap(dev->inflight_info->addr, dev->inflight_info->size); ++ dev->inflight_info->addr = NULL; ++ } + + addr = mmap(0, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, + fd, mmap_offset); +@@ -1534,8 +1541,10 @@ vhost_user_set_inflight_fd(struct virtio_net **pdev, VhostUserMsg *msg, + return RTE_VHOST_MSG_RESULT_ERR; + } + +- if (dev->inflight_info->fd) ++ if (dev->inflight_info->fd >= 0) { + close(dev->inflight_info->fd); ++ dev->inflight_info->fd = -1; ++ } + + dev->inflight_info->fd = fd; + dev->inflight_info->addr = addr; +@@ -1629,8 +1638,11 @@ vhost_check_queue_inflights_split(struct virtio_net *dev, + (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD))) + return RTE_VHOST_MSG_RESULT_OK; + ++ /* The frontend may still not support the inflight feature ++ * although we negotiate the protocol feature. ++ */ + if ((!vq->inflight_split)) +- return RTE_VHOST_MSG_RESULT_ERR; ++ return RTE_VHOST_MSG_RESULT_OK; + + if (!vq->inflight_split->version) { + vq->inflight_split->version = INFLIGHT_VERSION; +@@ -1710,8 +1722,11 @@ vhost_check_queue_inflights_packed(struct virtio_net *dev, + (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD))) + return RTE_VHOST_MSG_RESULT_OK; + ++ /* The frontend may still not support the inflight feature ++ * although we negotiate the protocol feature. ++ */ + if ((!vq->inflight_packed)) +- return RTE_VHOST_MSG_RESULT_ERR; ++ return RTE_VHOST_MSG_RESULT_OK; + + if (!vq->inflight_packed->version) { + vq->inflight_packed->version = INFLIGHT_VERSION; +@@ -2060,10 +2075,10 @@ vhost_user_set_log_base(struct virtio_net **pdev, struct VhostUserMsg *msg, + size = msg->payload.log.mmap_size; + off = msg->payload.log.mmap_offset; + +- /* Don't allow mmap_offset to point outside the mmap region */ +- if (off > size) { ++ /* Check for mmap size and offset overflow. 
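/*
 * The vhost_user_set_mem_table() hunk above treats an aligned size of 0
 * as proof that size + alignment wrapped the uint64_t, failing early
 * with a useful log instead of letting mmap() reject it. The generic
 * check, assuming a power-of-two alignment:
 */
#include <stdint.h>

static inline uint64_t
align_ceil_checked(uint64_t size, uint64_t align)
{
	uint64_t aligned = (size + align - 1) & ~(align - 1);

	/* a nonzero size that aligns to 0 means the addition wrapped */
	if (aligned == 0 && size != 0)
		return 0;	/* caller must treat 0 as invalid */
	return aligned;
}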
*/ ++ if (off >= -size) { + RTE_LOG(ERR, VHOST_CONFIG, +- "log offset %#"PRIx64" exceeds log size %#"PRIx64"\n", ++ "log offset %#"PRIx64" and log size %#"PRIx64" overflow\n", + off, size); + return RTE_VHOST_MSG_RESULT_ERR; + } +@@ -2229,6 +2244,13 @@ is_vring_iotlb_split(struct vhost_virtqueue *vq, struct vhost_iotlb_msg *imsg) + if (ra->used_user_addr < end && (ra->used_user_addr + len) > start) + return 1; + ++ if (ra->flags & (1 << VHOST_VRING_F_LOG)) { ++ len = sizeof(uint64_t); ++ if (ra->log_guest_addr < end && ++ (ra->log_guest_addr + len) > start) ++ return 1; ++ } ++ + return 0; + } + +@@ -2254,6 +2276,13 @@ is_vring_iotlb_packed(struct vhost_virtqueue *vq, struct vhost_iotlb_msg *imsg) + if (ra->used_user_addr < end && (ra->used_user_addr + len) > start) + return 1; + ++ if (ra->flags & (1 << VHOST_VRING_F_LOG)) { ++ len = sizeof(uint64_t); ++ if (ra->log_guest_addr < end && ++ (ra->log_guest_addr + len) > start) ++ return 1; ++ } ++ + return 0; + } + +@@ -2440,8 +2469,13 @@ read_vhost_message(int sockfd, struct VhostUserMsg *msg) + + ret = read_fd_message(sockfd, (char *)msg, VHOST_USER_HDR_SIZE, + msg->fds, VHOST_MEMORY_MAX_NREGIONS, &msg->fd_num); +- if (ret <= 0) ++ if (ret <= 0) { + return ret; ++ } else if (ret != VHOST_USER_HDR_SIZE) { ++ RTE_LOG(ERR, VHOST_CONFIG, "Unexpected header size read\n"); ++ close_msg_fds(msg); ++ return -1; ++ } + + if (msg->size) { + if (msg->size > sizeof(msg->payload)) { +@@ -2508,7 +2542,7 @@ static int + vhost_user_check_and_alloc_queue_pair(struct virtio_net *dev, + struct VhostUserMsg *msg) + { +- uint16_t vring_idx; ++ uint32_t vring_idx; + + switch (msg->request.master) { + case VHOST_USER_SET_VRING_KICK: +@@ -2794,11 +2828,19 @@ static int process_slave_message_reply(struct virtio_net *dev, + if ((msg->flags & VHOST_USER_NEED_REPLY) == 0) + return 0; + +- if (read_vhost_message(dev->slave_req_fd, &msg_reply) < 0) { ++ ret = read_vhost_message(dev->slave_req_fd, &msg_reply); ++ if (ret <= 0) { ++ if (ret < 0) ++ RTE_LOG(ERR, VHOST_CONFIG, ++ "vhost read slave message reply failed\n"); ++ else ++ RTE_LOG(INFO, VHOST_CONFIG, ++ "vhost peer closed\n"); + ret = -1; + goto out; + } + ++ ret = 0; + if (msg_reply.request.slave != msg->request.slave) { + RTE_LOG(ERR, VHOST_CONFIG, + "Received unexpected msg type (%u), expected %u\n", +diff --git a/dpdk/lib/librte_vhost/virtio_net.c b/dpdk/lib/librte_vhost/virtio_net.c +index 21c311732a..a6c106c13c 100644 +--- a/dpdk/lib/librte_vhost/virtio_net.c ++++ b/dpdk/lib/librte_vhost/virtio_net.c +@@ -43,6 +43,36 @@ is_valid_virt_queue_idx(uint32_t idx, int is_tx, uint32_t nr_vring) + return (is_tx ^ (idx & 1)) == 0 && idx < nr_vring; + } + ++static inline void ++do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq) ++{ ++ struct batch_copy_elem *elem = vq->batch_copy_elems; ++ uint16_t count = vq->batch_copy_nb_elems; ++ int i; ++ ++ for (i = 0; i < count; i++) { ++ rte_memcpy(elem[i].dst, elem[i].src, elem[i].len); ++ vhost_log_cache_write_iova(dev, vq, elem[i].log_addr, ++ elem[i].len); ++ PRINT_PACKET(dev, (uintptr_t)elem[i].dst, elem[i].len, 0); ++ } ++ ++ vq->batch_copy_nb_elems = 0; ++} ++ ++static inline void ++do_data_copy_dequeue(struct vhost_virtqueue *vq) ++{ ++ struct batch_copy_elem *elem = vq->batch_copy_elems; ++ uint16_t count = vq->batch_copy_nb_elems; ++ int i; ++ ++ for (i = 0; i < count; i++) ++ rte_memcpy(elem[i].dst, elem[i].src, elem[i].len); ++ ++ vq->batch_copy_nb_elems = 0; ++} ++ + static __rte_always_inline void + do_flush_shadow_used_ring_split(struct 
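/*
 * The set_log_base check above, "off >= -size", is the compact unsigned
 * test for off + size wrapping 64 bits: in uint64_t arithmetic, -size is
 * 2^64 - size. An equivalent, more explicit spelling:
 */
#include <stdint.h>

static inline int
offset_plus_size_wraps(uint64_t off, uint64_t size)
{
	/* off + size >= 2^64  <=>  off > UINT64_MAX - size (for size > 0) */
	return size != 0 && off > UINT64_MAX - size;
}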
virtio_net *dev, + struct vhost_virtqueue *vq, +@@ -186,6 +216,11 @@ vhost_flush_enqueue_batch_packed(struct virtio_net *dev, + uint16_t i; + uint16_t flags; + ++ if (vq->shadow_used_idx) { ++ do_data_copy_enqueue(dev, vq); ++ vhost_flush_enqueue_shadow_packed(dev, vq); ++ } ++ + flags = PACKED_DESC_ENQUEUE_USED_FLAG(vq->used_wrap_counter); + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { +@@ -325,36 +360,6 @@ vhost_shadow_dequeue_single_packed_inorder(struct vhost_virtqueue *vq, + vq_inc_last_used_packed(vq, count); + } + +-static inline void +-do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq) +-{ +- struct batch_copy_elem *elem = vq->batch_copy_elems; +- uint16_t count = vq->batch_copy_nb_elems; +- int i; +- +- for (i = 0; i < count; i++) { +- rte_memcpy(elem[i].dst, elem[i].src, elem[i].len); +- vhost_log_cache_write_iova(dev, vq, elem[i].log_addr, +- elem[i].len); +- PRINT_PACKET(dev, (uintptr_t)elem[i].dst, elem[i].len, 0); +- } +- +- vq->batch_copy_nb_elems = 0; +-} +- +-static inline void +-do_data_copy_dequeue(struct vhost_virtqueue *vq) +-{ +- struct batch_copy_elem *elem = vq->batch_copy_elems; +- uint16_t count = vq->batch_copy_nb_elems; +- int i; +- +- for (i = 0; i < count; i++) +- rte_memcpy(elem[i].dst, elem[i].src, elem[i].len); +- +- vq->batch_copy_nb_elems = 0; +-} +- + static __rte_always_inline void + vhost_shadow_enqueue_single_packed(struct virtio_net *dev, + struct vhost_virtqueue *vq, +@@ -382,25 +387,6 @@ vhost_shadow_enqueue_single_packed(struct virtio_net *dev, + } + } + +-static __rte_always_inline void +-vhost_flush_dequeue_packed(struct virtio_net *dev, +- struct vhost_virtqueue *vq) +-{ +- int shadow_count; +- if (!vq->shadow_used_idx) +- return; +- +- shadow_count = vq->last_used_idx - vq->shadow_last_used_idx; +- if (shadow_count <= 0) +- shadow_count += vq->size; +- +- if ((uint32_t)shadow_count >= (vq->size - MAX_PKT_BURST)) { +- do_data_copy_dequeue(vq); +- vhost_flush_dequeue_shadow_packed(dev, vq); +- vhost_vring_call_packed(dev, vq); +- } +-} +- + /* avoid write operation when necessary, to lessen cache issues */ + #define ASSIGN_UNLESS_EQUAL(var, val) do { \ + if ((var) != (val)) \ +@@ -1086,6 +1072,8 @@ virtio_dev_rx_batch_packed(struct virtio_net *dev, + VHOST_ACCESS_RW); + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { ++ if (unlikely(!desc_addrs[i])) ++ return -1; + if (unlikely(lens[i] != descs[avail_idx + i].len)) + return -1; + } +@@ -1688,6 +1676,8 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, + { + uint16_t i; + uint16_t free_entries; ++ uint16_t dropped = 0; ++ static bool allocerr_warned; + + if (unlikely(dev->dequeue_zero_copy)) { + struct zcopy_mbuf *zmbuf, *next; +@@ -1751,13 +1741,35 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, + update_shadow_used_ring_split(vq, head_idx, 0); + + pkts[i] = virtio_dev_pktmbuf_alloc(dev, mbuf_pool, buf_len); +- if (unlikely(pkts[i] == NULL)) ++ if (unlikely(pkts[i] == NULL)) { ++ /* ++ * mbuf allocation fails for jumbo packets when external ++ * buffer allocation is not allowed and linear buffer ++ * is required. Drop this packet. 
++ */ ++ if (!allocerr_warned) { ++ RTE_LOG(ERR, VHOST_DATA, ++ "Failed mbuf alloc of size %d from %s on %s.\n", ++ buf_len, mbuf_pool->name, dev->ifname); ++ allocerr_warned = true; ++ } ++ dropped += 1; ++ i++; + break; ++ } + + err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, pkts[i], + mbuf_pool); + if (unlikely(err)) { + rte_pktmbuf_free(pkts[i]); ++ if (!allocerr_warned) { ++ RTE_LOG(ERR, VHOST_DATA, ++ "Failed to copy desc to mbuf on %s.\n", ++ dev->ifname); ++ allocerr_warned = true; ++ } ++ dropped += 1; ++ i++; + break; + } + +@@ -1767,6 +1779,8 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, + zmbuf = get_zmbuf(vq); + if (!zmbuf) { + rte_pktmbuf_free(pkts[i]); ++ dropped += 1; ++ i++; + break; + } + zmbuf->mbuf = pkts[i]; +@@ -1796,7 +1810,7 @@ virtio_dev_tx_split(struct virtio_net *dev, struct vhost_virtqueue *vq, + } + } + +- return i; ++ return (i - dropped); + } + + static __rte_always_inline int +@@ -1841,6 +1855,8 @@ vhost_reserve_avail_batch_packed(struct virtio_net *dev, + } + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { ++ if (unlikely(!desc_addrs[i])) ++ return -1; + if (unlikely((lens[i] != descs[avail_idx + i].len))) + return -1; + } +@@ -1928,6 +1944,7 @@ vhost_dequeue_single_packed(struct virtio_net *dev, + uint32_t buf_len; + uint16_t nr_vec = 0; + int err; ++ static bool allocerr_warned; + + if (unlikely(fill_vec_buf_packed(dev, vq, + vq->last_avail_idx, desc_count, +@@ -1938,14 +1955,24 @@ vhost_dequeue_single_packed(struct virtio_net *dev, + + *pkts = virtio_dev_pktmbuf_alloc(dev, mbuf_pool, buf_len); + if (unlikely(*pkts == NULL)) { +- RTE_LOG(ERR, VHOST_DATA, +- "Failed to allocate memory for mbuf.\n"); ++ if (!allocerr_warned) { ++ RTE_LOG(ERR, VHOST_DATA, ++ "Failed mbuf alloc of size %d from %s on %s.\n", ++ buf_len, mbuf_pool->name, dev->ifname); ++ allocerr_warned = true; ++ } + return -1; + } + + err = copy_desc_to_mbuf(dev, vq, buf_vec, nr_vec, *pkts, + mbuf_pool); + if (unlikely(err)) { ++ if (!allocerr_warned) { ++ RTE_LOG(ERR, VHOST_DATA, ++ "Failed to copy desc to mbuf on %s.\n", ++ dev->ifname); ++ allocerr_warned = true; ++ } + rte_pktmbuf_free(*pkts); + return -1; + } +@@ -1960,21 +1987,24 @@ virtio_dev_tx_single_packed(struct virtio_net *dev, + struct rte_mbuf **pkts) + { + +- uint16_t buf_id, desc_count; ++ uint16_t buf_id, desc_count = 0; ++ int ret; + +- if (vhost_dequeue_single_packed(dev, vq, mbuf_pool, pkts, &buf_id, +- &desc_count)) +- return -1; ++ ret = vhost_dequeue_single_packed(dev, vq, mbuf_pool, pkts, &buf_id, ++ &desc_count); + +- if (virtio_net_is_inorder(dev)) +- vhost_shadow_dequeue_single_packed_inorder(vq, buf_id, +- desc_count); +- else +- vhost_shadow_dequeue_single_packed(vq, buf_id, desc_count); ++ if (likely(desc_count > 0)) { ++ if (virtio_net_is_inorder(dev)) ++ vhost_shadow_dequeue_single_packed_inorder(vq, buf_id, ++ desc_count); ++ else ++ vhost_shadow_dequeue_single_packed(vq, buf_id, ++ desc_count); + +- vq_inc_last_avail_packed(vq, desc_count); ++ vq_inc_last_avail_packed(vq, desc_count); ++ } + +- return 0; ++ return ret; + } + + static __rte_always_inline int +@@ -2004,7 +2034,7 @@ virtio_dev_tx_batch_packed_zmbuf(struct virtio_net *dev, + + vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) { + zmbufs[i]->mbuf = pkts[i]; +- zmbufs[i]->desc_idx = avail_idx + i; ++ zmbufs[i]->desc_idx = ids[i]; + zmbufs[i]->desc_count = 1; + } + +@@ -2045,7 +2075,7 @@ virtio_dev_tx_single_packed_zmbuf(struct virtio_net *dev, + return -1; + } + zmbuf->mbuf = *pkts; +- zmbuf->desc_idx = 
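/*
 * The virtio_net hunks above log only the first mbuf allocation or copy
 * failure, via a static flag, so a drained mempool cannot flood the log
 * from the datapath. The same warn-once shape:
 */
#include <stdbool.h>
#include <stdio.h>

static bool alloc_warned;

static void
warn_alloc_failure_once(const char *ifname, unsigned int buf_len)
{
	if (!alloc_warned) {
		fprintf(stderr, "mbuf alloc of %u bytes failed on %s\n",
			buf_len, ifname);
		alloc_warned = true;	/* later failures stay silent */
	}
}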
vq->last_avail_idx; ++ zmbuf->desc_idx = buf_id; + zmbuf->desc_count = desc_count; + + rte_mbuf_refcnt_update(*pkts, 1); +@@ -2149,7 +2179,6 @@ virtio_dev_tx_packed(struct virtio_net *dev, + if (remained >= PACKED_BATCH_SIZE) { + if (!virtio_dev_tx_batch_packed(dev, vq, mbuf_pool, + &pkts[pkt_idx])) { +- vhost_flush_dequeue_packed(dev, vq); + pkt_idx += PACKED_BATCH_SIZE; + remained -= PACKED_BATCH_SIZE; + continue; +@@ -2159,15 +2188,18 @@ virtio_dev_tx_packed(struct virtio_net *dev, + if (virtio_dev_tx_single_packed(dev, vq, mbuf_pool, + &pkts[pkt_idx])) + break; +- vhost_flush_dequeue_packed(dev, vq); + pkt_idx++; + remained--; + + } while (remained); + +- if (vq->shadow_used_idx) ++ if (vq->shadow_used_idx) { + do_data_copy_dequeue(vq); + ++ vhost_flush_dequeue_shadow_packed(dev, vq); ++ vhost_vring_call_packed(dev, vq); ++ } ++ + return pkt_idx; + } + +diff --git a/dpdk/lib/meson.build b/dpdk/lib/meson.build +index 6ceb5e756e..d5a507fb43 100644 +--- a/dpdk/lib/meson.build ++++ b/dpdk/lib/meson.build +@@ -148,12 +148,16 @@ foreach l:libraries + command: [map_to_def_cmd, '@INPUT@', '@OUTPUT@'], + input: version_map, + output: 'rte_@0@_exports.def'.format(name)) +- lk_deps = [version_map, def_file] +- if is_windows ++ ++ if is_ms_linker + lk_args = ['-Wl,/def:' + def_file.full_path(), + '-Wl,/implib:lib\\' + implib] + else + lk_args = ['-Wl,--version-script=' + version_map] ++ endif ++ ++ lk_deps = [version_map, def_file] ++ if not is_windows + # on unix systems check the output of the + # experimental syms script, using it as a + # dependency of the .so build +diff --git a/dpdk/meson_options.txt b/dpdk/meson_options.txt +index bc369d06c9..0de16b4fdb 100644 +--- a/dpdk/meson_options.txt ++++ b/dpdk/meson_options.txt +@@ -12,8 +12,8 @@ option('examples', type: 'string', value: '', + description: 'Comma-separated list of examples to build by default') + option('flexran_sdk', type: 'string', value: '', + description: 'Path to FlexRAN SDK optional Libraries for BBDEV device') +-option('ibverbs_link', type: 'combo', choices : ['shared', 'dlopen'], value: 'shared', +- description: 'Linkage method (shared/dlopen) for Mellanox PMDs with ibverbs dependencies.') ++option('ibverbs_link', type: 'combo', choices : ['static', 'shared', 'dlopen'], value: 'shared', ++ description: 'Linkage method (static/shared/dlopen) for Mellanox PMDs with ibverbs dependencies.') + option('include_subdir_arch', type: 'string', value: '', + description: 'subdirectory where to install arch-dependent headers') + option('kernel_dir', type: 'string', value: '', +diff --git a/dpdk/mk/internal/rte.compile-pre.mk b/dpdk/mk/internal/rte.compile-pre.mk +index 0cf3791b4d..82fe098f7c 100644 +--- a/dpdk/mk/internal/rte.compile-pre.mk ++++ b/dpdk/mk/internal/rte.compile-pre.mk +@@ -61,7 +61,7 @@ CHECK_EXPERIMENTAL = $(EXPERIMENTAL_CHECK) $(SRCDIR)/$(EXPORT_MAP) $@ + + PMDINFO_GEN = $(RTE_SDK_BIN)/app/dpdk-pmdinfogen $@ $@.pmd.c + PMDINFO_CC = $(CC) $(CPPFLAGS) $(CFLAGS) $(EXTRA_CFLAGS) -c -o $@.pmd.o $@.pmd.c +-PMDINFO_LD = $(CROSS)ld $(LDFLAGS) -r -o $@.o $@.pmd.o $@ ++PMDINFO_LD = $(CROSS)ld -r $(filter-out -export-dynamic,$(LDFLAGS)) -o $@.o $@.pmd.o $@ + PMDINFO_TO_O = if grep -q 'RTE_PMD_REGISTER_.*(.*)' $<; then \ + echo "$(if $V,$(PMDINFO_GEN), PMDINFO $@.pmd.c)" && \ + $(PMDINFO_GEN) && \ +diff --git a/dpdk/mk/rte.app.mk b/dpdk/mk/rte.app.mk +index 05ea034b99..44dd684cb1 100644 +--- a/dpdk/mk/rte.app.mk ++++ b/dpdk/mk/rte.app.mk +@@ -196,8 +196,12 @@ _LDLIBS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += -ldl + 
_LDLIBS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += -ldl + else ifeq ($(CONFIG_RTE_IBVERBS_LINK_STATIC),y) + LIBS_IBVERBS_STATIC = $(shell $(RTE_SDK)/buildtools/options-ibverbs-static.sh) ++_LDLIBS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += --no-whole-archive + _LDLIBS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += $(LIBS_IBVERBS_STATIC) ++_LDLIBS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += --whole-archive ++_LDLIBS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += --no-whole-archive + _LDLIBS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += $(LIBS_IBVERBS_STATIC) ++_LDLIBS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += --whole-archive + else + _LDLIBS-$(CONFIG_RTE_LIBRTE_MLX4_PMD) += -libverbs -lmlx4 + _LDLIBS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += -libverbs -lmlx5 +diff --git a/dpdk/mk/toolchain/gcc/rte.vars.mk b/dpdk/mk/toolchain/gcc/rte.vars.mk +index 9fc704193b..b3473c06fd 100644 +--- a/dpdk/mk/toolchain/gcc/rte.vars.mk ++++ b/dpdk/mk/toolchain/gcc/rte.vars.mk +@@ -83,6 +83,11 @@ ifeq ($(shell test $(GCC_VERSION) -lt 47 && echo 1), 1) + WERROR_FLAGS += -Wno-uninitialized + endif + ++ifeq ($(shell test $(GCC_VERSION) -ge 100 && echo 1), 1) ++# FIXME: Bugzilla 396 ++WERROR_FLAGS += -Wno-zero-length-bounds ++endif ++ + HOST_WERROR_FLAGS := $(WERROR_FLAGS) + + ifeq ($(shell test $(HOST_GCC_VERSION) -gt 70 && echo 1), 1) +diff --git a/dpdk/usertools/dpdk-pmdinfo.py b/dpdk/usertools/dpdk-pmdinfo.py +index 069a3bf124..12f20735e0 100755 +--- a/dpdk/usertools/dpdk-pmdinfo.py ++++ b/dpdk/usertools/dpdk-pmdinfo.py +@@ -539,7 +539,7 @@ def scan_for_autoload_pmds(dpdk_path): + return + + (autoload_path, scannedfile) = readelf.search_for_autoload_path() +- if (autoload_path is None or autoload_path is ""): ++ if not autoload_path: + if (raw_output is False): + print("No autoload path configured in %s" % dpdk_path) + return +@@ -561,7 +561,10 @@ def main(stream=None): + + pcifile_default = "./pci.ids" # For unknown OS's assume local file + if platform.system() == 'Linux': +- pcifile_default = "/usr/share/hwdata/pci.ids" ++ # hwdata is the legacy location, misc is supported going forward ++ pcifile_default = "/usr/share/misc/pci.ids" ++ if not os.path.exists(pcifile_default): ++ pcifile_default = "/usr/share/hwdata/pci.ids" + elif platform.system() == 'FreeBSD': + pcifile_default = "/usr/local/share/pciids/pci.ids" + if not os.path.exists(pcifile_default): +diff --git a/dpdk/usertools/dpdk-telemetry-client.py b/dpdk/usertools/dpdk-telemetry-client.py +index 290345dcc4..35edb7cd26 100755 +--- a/dpdk/usertools/dpdk-telemetry-client.py ++++ b/dpdk/usertools/dpdk-telemetry-client.py +@@ -3,6 +3,7 @@ + # Copyright(c) 2018 Intel Corporation + + from __future__ import print_function ++from __future__ import unicode_literals + + import socket + import os +@@ -65,18 +66,19 @@ def register(self): # Connects a client to DPDK-instance + self.socket.recv_fd.settimeout(2) + self.socket.send_fd.connect("/var/run/dpdk/rte/telemetry") + JSON = (API_REG + self.file_path + "\"}}") +- self.socket.send_fd.sendall(JSON) ++ self.socket.send_fd.sendall(JSON.encode()) ++ + self.socket.recv_fd.listen(1) + self.socket.client_fd = self.socket.recv_fd.accept()[0] + + def unregister(self): # Unregister a given client +- self.socket.client_fd.send(API_UNREG + self.file_path + "\"}}") ++ self.socket.client_fd.send((API_UNREG + self.file_path + "\"}}").encode()) + self.socket.client_fd.close() + + def requestMetrics(self): # Requests metrics for given client +- self.socket.client_fd.send(METRICS_REQ) +- data = self.socket.client_fd.recv(BUFFER_SIZE) +- print("\nResponse: \n", str(data)) ++ 
self.socket.client_fd.send(METRICS_REQ.encode()) ++ data = self.socket.client_fd.recv(BUFFER_SIZE).decode() ++ print("\nResponse: \n", data) + + def repeatedlyRequestMetrics(self, sleep_time): # Recursively requests metrics for given client + print("\nPlease enter the number of times you'd like to continuously request Metrics:") +@@ -88,9 +90,9 @@ def repeatedlyRequestMetrics(self, sleep_time): # Recursively requests metrics f + time.sleep(sleep_time) + + def requestGlobalMetrics(self): #Requests global metrics for given client +- self.socket.client_fd.send(GLOBAL_METRICS_REQ) +- data = self.socket.client_fd.recv(BUFFER_SIZE) +- print("\nResponse: \n", str(data)) ++ self.socket.client_fd.send(GLOBAL_METRICS_REQ.encode()) ++ data = self.socket.client_fd.recv(BUFFER_SIZE).decode() ++ print("\nResponse: \n", data) + + def interactiveMenu(self, sleep_time): # Creates Interactive menu within the script + while self.choice != 4: +diff --git a/include/openvswitch/compiler.h b/include/openvswitch/compiler.h +index 5289a70f6e..cf009f8264 100644 +--- a/include/openvswitch/compiler.h ++++ b/include/openvswitch/compiler.h +@@ -113,6 +113,8 @@ + * OVS_REQUIRES OVS_REQ_RDLOCK OVS_REQ_WRLOCK + * OVS_EXCLUDED OVS_EXCLUDED OVS_EXCLUDED + */ ++ ++/* Please keep OVS_CTAGS_IDENTIFIERS up-to-date in acinclude.m4. */ + #define OVS_LOCKABLE __attribute__((lockable)) + #define OVS_REQ_RDLOCK(...) __attribute__((shared_locks_required(__VA_ARGS__))) + #define OVS_ACQ_RDLOCK(...) __attribute__((shared_lock_function(__VA_ARGS__))) +diff --git a/ipsec/ovs-monitor-ipsec.in b/ipsec/ovs-monitor-ipsec.in +index 37e3703245..1c185bbd85 100755 +--- a/ipsec/ovs-monitor-ipsec.in ++++ b/ipsec/ovs-monitor-ipsec.in +@@ -101,7 +101,7 @@ class XFRM(object): + proc = subprocess.Popen([self.IP, 'xfrm', 'policy'], + stdout=subprocess.PIPE) + while True: +- line = proc.stdout.readline().strip() ++ line = proc.stdout.readline().strip().decode() + if line == '': + break + a = line.split(" ") +@@ -124,7 +124,7 @@ class XFRM(object): + proc = subprocess.Popen([self.IP, 'xfrm', 'state'], + stdout=subprocess.PIPE) + while True: +- line = proc.stdout.readline().strip() ++ line = proc.stdout.readline().strip().decode() + if line == '': + break + a = line.split(" ") +@@ -246,7 +246,7 @@ conn prevent_unencrypted_vxlan + proc = subprocess.Popen([self.IPSEC, 'status'], stdout=subprocess.PIPE) + + while True: +- line = proc.stdout.readline().strip() ++ line = proc.stdout.readline().strip().decode() + if line == '': + break + tunnel_name = line.split(":") +@@ -340,7 +340,7 @@ conn prevent_unencrypted_vxlan + # about possibility of ovs-monitor-ipsec to block for each tunnel + # while strongSwan sends IKE messages over Internet. 
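# --- Editor's sketch, not part of the patch: the surrounding hunks adapt
# --- ovs-monitor-ipsec (and dpdk-telemetry-client.py above) to Python 3,
# --- where subprocess pipes and sockets yield bytes, so each readline()
# --- result must be decoded before any string comparison or split.
# --- 'active_conn_names' is a hypothetical helper condensing that pattern.
import subprocess

def active_conn_names(ipsec_cmd=("ipsec", "status")):
    proc = subprocess.Popen(ipsec_cmd, stdout=subprocess.PIPE)
    names = []
    while True:
        # Bytes in, str out: strip first, then decode, as the patch does.
        line = proc.stdout.readline().strip().decode()
        if line == '':
            break
        names.append(line.split(":")[0])
    return names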
+ conns_dict = self.get_active_conns() +- for ifname, conns in conns_dict.iteritems(): ++ for ifname, conns in conns_dict.items(): + tunnel = monitor.tunnels.get(ifname) + for conn in conns: + # IPsec "connection" names that we choose in strongswan +@@ -536,7 +536,7 @@ conn prevent_unencrypted_vxlan + + # Delete old connections + conns_dict = self.get_active_conns() +- for ifname, conns in conns_dict.iteritems(): ++ for ifname, conns in conns_dict.items(): + tunnel = monitor.tunnels.get(ifname) + + for conn in conns: +@@ -608,7 +608,7 @@ conn prevent_unencrypted_vxlan + proc = subprocess.Popen([self.IPSEC, 'status'], stdout=subprocess.PIPE) + + while True: +- line = proc.stdout.readline().strip() ++ line = proc.stdout.readline().strip().decode() + if line == '': + break + +@@ -989,7 +989,7 @@ class IPsecMonitor(object): + skb_mark = None + is_valid = False + +- for row in data["Open_vSwitch"].rows.itervalues(): ++ for row in data["Open_vSwitch"].rows.values(): + pki[0] = row.other_config.get("certificate") + pki[1] = row.other_config.get("private_key") + pki[2] = row.other_config.get("ca_cert") +@@ -1016,7 +1016,7 @@ class IPsecMonitor(object): + table.""" + ifaces = set() + +- for row in data["Interface"].rows.itervalues(): ++ for row in data["Interface"].rows.values(): + if not self.is_tunneling_type_supported(row.type): + continue + if not self.is_ipsec_required(row.options): +@@ -1047,7 +1047,7 @@ class IPsecMonitor(object): + return + s = "" + conns = self.ike_helper.get_active_conns() +- for name, tunnel in self.tunnels.iteritems(): ++ for name, tunnel in self.tunnels.items(): + s += tunnel.show(policies, securities, conns) + unix_conn.reply(s) + +@@ -1064,7 +1064,7 @@ class IPsecMonitor(object): + if self.ike_helper.config_global(self): + needs_refresh = True + +- for name, tunnel in self.tunnels.iteritems(): ++ for name, tunnel in self.tunnels.items(): + if tunnel.last_refreshed_version != tunnel.version: + tunnel.last_refreshed_version = tunnel.version + needs_refresh = True +@@ -1094,7 +1094,7 @@ class IPsecMonitor(object): + proc.wait() + if proc.returncode: + raise Exception(proc.stderr.read()) +- m = re.search(r"CN=(.+?),", proc.stdout.readline()) ++ m = re.search(r"CN=(.+?),", proc.stdout.readline().decode()) + if not m: + raise Exception("No CN in the certificate subject.") + except Exception as e: +diff --git a/lib/classifier.c b/lib/classifier.c +index 0fad953213..f2c3497c2d 100644 +--- a/lib/classifier.c ++++ b/lib/classifier.c +@@ -393,7 +393,9 @@ classifier_set_prefix_fields(struct classifier *cls, + bitmap_set1(fields.bm, trie_fields[i]); + + new_fields[n_tries] = NULL; +- if (n_tries >= cls->n_tries || field != cls->tries[n_tries].field) { ++ const struct mf_field *cls_field ++ = ovsrcu_get(struct mf_field *, &cls->tries[n_tries].field); ++ if (n_tries >= cls->n_tries || field != cls_field) { + new_fields[n_tries] = field; + changed = true; + } +@@ -454,7 +456,7 @@ trie_init(struct classifier *cls, int trie_idx, const struct mf_field *field) + } else { + ovsrcu_set_hidden(&trie->root, NULL); + } +- trie->field = field; ++ ovsrcu_set_hidden(&trie->field, CONST_CAST(struct mf_field *, field)); + + /* Add existing rules to the new trie. */ + CMAP_FOR_EACH (subtable, cmap_node, &cls->subtables_map) { +@@ -839,7 +841,6 @@ classifier_remove_assert(struct classifier *cls, + struct trie_ctx { + const struct cls_trie *trie; + bool lookup_done; /* Status of the lookup. */ +- uint8_t be32ofs; /* U32 offset of the field in question. 
*/ + unsigned int maskbits; /* Prefix length needed to avoid false matches. */ + union trie_prefix match_plens; /* Bitmask of prefix lengths with possible + * matches. */ +@@ -849,7 +850,6 @@ static void + trie_ctx_init(struct trie_ctx *ctx, const struct cls_trie *trie) + { + ctx->trie = trie; +- ctx->be32ofs = trie->field->flow_be32ofs; + ctx->lookup_done = false; + } + +@@ -1531,8 +1531,10 @@ insert_subtable(struct classifier *cls, const struct minimask *mask) + *CONST_CAST(uint8_t *, &subtable->n_indices) = index; + + for (i = 0; i < cls->n_tries; i++) { +- subtable->trie_plen[i] = minimask_get_prefix_len(mask, +- cls->tries[i].field); ++ const struct mf_field *field ++ = ovsrcu_get(struct mf_field *, &cls->tries[i].field); ++ subtable->trie_plen[i] ++ = field ? minimask_get_prefix_len(mask, field) : 0; + } + + /* Ports trie. */ +@@ -1575,11 +1577,17 @@ check_tries(struct trie_ctx trie_ctx[CLS_MAX_TRIES], unsigned int n_tries, + * fields using the prefix tries. The trie checks are done only as + * needed to avoid folding in additional bits to the wildcards mask. */ + for (j = 0; j < n_tries; j++) { +- /* Is the trie field relevant for this subtable, and +- is the trie field within the current range of fields? */ +- if (field_plen[j] && +- flowmap_is_set(&range_map, trie_ctx[j].be32ofs / 2)) { ++ /* Is the trie field relevant for this subtable? */ ++ if (field_plen[j]) { + struct trie_ctx *ctx = &trie_ctx[j]; ++ const struct mf_field *ctx_field ++ = ovsrcu_get(struct mf_field *, &ctx->trie->field); ++ ++ /* Is the trie field within the current range of fields? */ ++ if (!ctx_field ++ || !flowmap_is_set(&range_map, ctx_field->flow_be32ofs / 2)) { ++ continue; ++ } + + /* On-demand trie lookup. */ + if (!ctx->lookup_done) { +@@ -1601,14 +1609,16 @@ check_tries(struct trie_ctx trie_ctx[CLS_MAX_TRIES], unsigned int n_tries, + * than this subtable would otherwise. */ + if (ctx->maskbits <= field_plen[j]) { + /* Unwildcard the bits and skip the rest. */ +- mask_set_prefix_bits(wc, ctx->be32ofs, ctx->maskbits); ++ mask_set_prefix_bits(wc, ctx_field->flow_be32ofs, ++ ctx->maskbits); + /* Note: Prerequisite already unwildcarded, as the only + * prerequisite of the supported trie lookup fields is + * the ethertype, which is always unwildcarded. */ + return true; + } + /* Can skip if the field is already unwildcarded. */ +- if (mask_prefix_bits_set(wc, ctx->be32ofs, ctx->maskbits)) { ++ if (mask_prefix_bits_set(wc, ctx_field->flow_be32ofs, ++ ctx->maskbits)) { + return true; + } + } +@@ -2001,12 +2011,12 @@ static unsigned int + trie_lookup(const struct cls_trie *trie, const struct flow *flow, + union trie_prefix *plens) + { +- const struct mf_field *mf = trie->field; ++ const struct mf_field *mf = ovsrcu_get(struct mf_field *, &trie->field); + + /* Check that current flow matches the prerequisites for the trie + * field. Some match fields are used for multiple purposes, so we + * must check that the trie is relevant for this flow. */ +- if (mf_are_prereqs_ok(mf, flow, NULL)) { ++ if (mf && mf_are_prereqs_ok(mf, flow, NULL)) { + return trie_lookup_value(&trie->root, + &((ovs_be32 *)flow)[mf->flow_be32ofs], + &plens->be32, mf->n_bits); +@@ -2053,8 +2063,9 @@ minimask_get_prefix_len(const struct minimask *minimask, + * happened to be zeros. 
+ */ + static const ovs_be32 * +-minimatch_get_prefix(const struct minimatch *match, const struct mf_field *mf) ++minimatch_get_prefix(const struct minimatch *match, rcu_field_ptr *field) + { ++ struct mf_field *mf = ovsrcu_get_protected(struct mf_field *, field); + size_t u64_ofs = mf->flow_be32ofs / 2; + + return (OVS_FORCE const ovs_be32 *)miniflow_get__(match->flow, u64_ofs) +@@ -2068,7 +2079,7 @@ static void + trie_insert(struct cls_trie *trie, const struct cls_rule *rule, int mlen) + { + trie_insert_prefix(&trie->root, +- minimatch_get_prefix(&rule->match, trie->field), mlen); ++ minimatch_get_prefix(&rule->match, &trie->field), mlen); + } + + static void +@@ -2123,7 +2134,7 @@ static void + trie_remove(struct cls_trie *trie, const struct cls_rule *rule, int mlen) + { + trie_remove_prefix(&trie->root, +- minimatch_get_prefix(&rule->match, trie->field), mlen); ++ minimatch_get_prefix(&rule->match, &trie->field), mlen); + } + + /* 'mlen' must be the (non-zero) CIDR prefix length of the 'trie->field' mask +diff --git a/lib/classifier.h b/lib/classifier.h +index d1bd4aa12a..f646a8f742 100644 +--- a/lib/classifier.h ++++ b/lib/classifier.h +@@ -314,13 +314,15 @@ extern "C" { + struct cls_subtable; + struct cls_match; + ++struct mf_field; ++typedef OVSRCU_TYPE(struct mf_field *) rcu_field_ptr; + struct trie_node; + typedef OVSRCU_TYPE(struct trie_node *) rcu_trie_ptr; + + /* Prefix trie for a 'field' */ + struct cls_trie { +- const struct mf_field *field; /* Trie field, or NULL. */ +- rcu_trie_ptr root; /* NULL if none. */ ++ rcu_field_ptr field; /* Trie field, or NULL. */ ++ rcu_trie_ptr root; /* NULL if none. */ + }; + + enum { +diff --git a/lib/conntrack-tcp.c b/lib/conntrack-tcp.c +index 416cb769d2..47261c7551 100644 +--- a/lib/conntrack-tcp.c ++++ b/lib/conntrack-tcp.c +@@ -189,7 +189,7 @@ tcp_conn_update(struct conntrack *ct, struct conn *conn_, + } else if (src->state <= CT_DPIF_TCPS_SYN_SENT) { + src->state = CT_DPIF_TCPS_SYN_SENT; + conn_update_expiration(ct, &conn->up, CT_TM_TCP_FIRST_PACKET, now); +- return CT_UPDATE_NEW; ++ return CT_UPDATE_VALID_NEW; + } + } + +diff --git a/lib/conntrack.c b/lib/conntrack.c +index ff5a89457c..0cbc8f6d2b 100644 +--- a/lib/conntrack.c ++++ b/lib/conntrack.c +@@ -1277,6 +1277,11 @@ process_one(struct conntrack *ct, struct dp_packet *pkt, + const struct nat_action_info_t *nat_action_info, + ovs_be16 tp_src, ovs_be16 tp_dst, const char *helper) + { ++ /* Reset ct_state whenever entering a new zone. 
*/ ++ if (pkt->md.ct_state && pkt->md.ct_zone != zone) { ++ pkt->md.ct_state = 0; ++ } ++ + bool create_new_conn = false; + conn_key_lookup(ct, &ctx->key, ctx->hash, now, &ctx->conn, &ctx->reply); + struct conn *conn = ctx->conn; +@@ -1300,9 +1305,10 @@ process_one(struct conntrack *ct, struct dp_packet *pkt, + conn_key_lookup(ct, &ctx->key, hash, now, &conn, &ctx->reply); + + if (!conn) { +- pkt->md.ct_state |= CS_TRACKED | CS_INVALID; ++ pkt->md.ct_state |= CS_INVALID; ++ write_ct_md(pkt, zone, NULL, NULL, NULL); + char *log_msg = xasprintf("Missing master conn %p", rev_conn); +- ct_print_conn_info(conn, log_msg, VLL_INFO, true, true); ++ ct_print_conn_info(rev_conn, log_msg, VLL_INFO, true, true); + free(log_msg); + return; + } +diff --git a/lib/dpctl.c b/lib/dpctl.c +index db2b1f8961..09ae97f25c 100644 +--- a/lib/dpctl.c ++++ b/lib/dpctl.c +@@ -1031,7 +1031,7 @@ dpctl_dump_flows(int argc, const char *argv[], struct dpctl_params *dpctl_p) + memset(&dump_types, 0, sizeof dump_types); + error = populate_dump_types(types_list, &dump_types, dpctl_p); + if (error) { +- goto out_free; ++ goto out_dpifclose; + } + determine_dpif_flow_dump_types(&dump_types, &dpif_dump_types); + +diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c +index d393aab5e3..42e1c44ae8 100644 +--- a/lib/dpif-netdev.c ++++ b/lib/dpif-netdev.c +@@ -481,6 +481,12 @@ struct dp_netdev_flow_stats { + atomic_uint16_t tcp_flags; /* Bitwise-OR of seen tcp_flags values. */ + }; + ++/* Contained by struct dp_netdev_flow's 'last_attrs' member. */ ++struct dp_netdev_flow_attrs { ++ atomic_bool offloaded; /* True if flow is offloaded to HW. */ ++ ATOMIC(const char *) dp_layer; /* DP layer the flow is handled in. */ ++}; ++ + /* A flow in 'dp_netdev_pmd_thread's 'flow_table'. + * + * +@@ -541,6 +547,11 @@ struct dp_netdev_flow { + /* Statistics. */ + struct dp_netdev_flow_stats stats; + ++ /* Statistics and attributes received from the netdev offload provider. */ ++ atomic_int netdev_flow_get_result; ++ struct dp_netdev_flow_stats last_stats; ++ struct dp_netdev_flow_attrs last_attrs; ++ + /* Actions. */ + OVSRCU_TYPE(struct dp_netdev_actions *) actions; + +@@ -2149,7 +2160,11 @@ dp_netdev_pmd_find_dpcls(struct dp_netdev_pmd_thread *pmd, + } + + #define MAX_FLOW_MARK (UINT32_MAX - 1) +-#define INVALID_FLOW_MARK (UINT32_MAX) ++#define INVALID_FLOW_MARK 0 ++/* Zero flow mark is used to indicate the HW to remove the mark. A packet ++ * marked with zero mark is received in SW without a mark at all, so it ++ * cannot be used as a valid mark. ++ */ + + struct megaflow_to_mark_data { + const struct cmap_node node; +@@ -2175,7 +2190,7 @@ flow_mark_alloc(void) + + if (!flow_mark.pool) { + /* Haven't initiated yet, do it here */ +- flow_mark.pool = id_pool_create(0, MAX_FLOW_MARK); ++ flow_mark.pool = id_pool_create(1, MAX_FLOW_MARK); + } + + if (id_pool_alloc_id(flow_mark.pool, &mark)) { +@@ -2280,6 +2295,12 @@ mark_to_flow_disassociate(struct dp_netdev_pmd_thread *pmd, + struct cmap_node *mark_node = CONST_CAST(struct cmap_node *, + &flow->mark_node); + ++ /* INVALID_FLOW_MARK may mean that the flow has been disassociated or ++ * never associated. 
*/ ++ if (OVS_UNLIKELY(mark == INVALID_FLOW_MARK)) { ++ return EINVAL; ++ } ++ + cmap_remove(&flow_mark.mark_to_flow, mark_node, hash_int(mark, 0)); + flow->mark = INVALID_FLOW_MARK; + +@@ -2433,6 +2454,7 @@ dp_netdev_flow_offload_put(struct dp_flow_offload_item *offload) + mark = flow_mark_alloc(); + if (mark == INVALID_FLOW_MARK) { + VLOG_ERR("Failed to allocate flow mark!\n"); ++ return -1; + } + } + info.flow_mark = mark; +@@ -2512,6 +2534,7 @@ dp_netdev_flow_offload_main(void *data OVS_UNUSED) + VLOG_DBG("%s to %s netdev flow\n", + ret == 0 ? "succeed" : "failed", op); + dp_netdev_free_flow_offload(offload); ++ ovsrcu_quiesce(); + } + + return NULL; +@@ -3032,9 +3055,56 @@ dp_netdev_pmd_find_flow(const struct dp_netdev_pmd_thread *pmd, + return NULL; + } + ++static void ++dp_netdev_flow_set_last_stats_attrs(struct dp_netdev_flow *netdev_flow, ++ const struct dpif_flow_stats *stats, ++ const struct dpif_flow_attrs *attrs, ++ int result) ++{ ++ struct dp_netdev_flow_stats *last_stats = &netdev_flow->last_stats; ++ struct dp_netdev_flow_attrs *last_attrs = &netdev_flow->last_attrs; ++ ++ atomic_store_relaxed(&netdev_flow->netdev_flow_get_result, result); ++ if (result) { ++ return; ++ } ++ ++ atomic_store_relaxed(&last_stats->used, stats->used); ++ atomic_store_relaxed(&last_stats->packet_count, stats->n_packets); ++ atomic_store_relaxed(&last_stats->byte_count, stats->n_bytes); ++ atomic_store_relaxed(&last_stats->tcp_flags, stats->tcp_flags); ++ ++ atomic_store_relaxed(&last_attrs->offloaded, attrs->offloaded); ++ atomic_store_relaxed(&last_attrs->dp_layer, attrs->dp_layer); ++ ++} ++ ++static void ++dp_netdev_flow_get_last_stats_attrs(struct dp_netdev_flow *netdev_flow, ++ struct dpif_flow_stats *stats, ++ struct dpif_flow_attrs *attrs, ++ int *result) ++{ ++ struct dp_netdev_flow_stats *last_stats = &netdev_flow->last_stats; ++ struct dp_netdev_flow_attrs *last_attrs = &netdev_flow->last_attrs; ++ ++ atomic_read_relaxed(&netdev_flow->netdev_flow_get_result, result); ++ if (*result) { ++ return; ++ } ++ ++ atomic_read_relaxed(&last_stats->used, &stats->used); ++ atomic_read_relaxed(&last_stats->packet_count, &stats->n_packets); ++ atomic_read_relaxed(&last_stats->byte_count, &stats->n_bytes); ++ atomic_read_relaxed(&last_stats->tcp_flags, &stats->tcp_flags); ++ ++ atomic_read_relaxed(&last_attrs->offloaded, &attrs->offloaded); ++ atomic_read_relaxed(&last_attrs->dp_layer, &attrs->dp_layer); ++} ++ + static bool + dpif_netdev_get_flow_offload_status(const struct dp_netdev *dp, +- const struct dp_netdev_flow *netdev_flow, ++ struct dp_netdev_flow *netdev_flow, + struct dpif_flow_stats *stats, + struct dpif_flow_attrs *attrs) + { +@@ -3056,11 +3126,31 @@ dpif_netdev_get_flow_offload_status(const struct dp_netdev *dp, + } + ofpbuf_use_stack(&buf, &act_buf, sizeof act_buf); + /* Taking a global 'port_mutex' to fulfill thread safety +- * restrictions for the netdev-offload-dpdk module. */ +- ovs_mutex_lock(&dp->port_mutex); +- ret = netdev_flow_get(netdev, &match, &actions, &netdev_flow->mega_ufid, +- stats, attrs, &buf); +- ovs_mutex_unlock(&dp->port_mutex); ++ * restrictions for the netdev-offload-dpdk module. ++ * ++ * XXX: Main thread will try to pause/stop all revalidators during datapath ++ * reconfiguration via datapath purge callback (dp_purge_cb) while ++ * holding 'dp->port_mutex'. So we're not waiting for mutex here. 
++ * Otherwise, deadlock is possible, bcause revalidators might sleep ++ * waiting for the main thread to release the lock and main thread ++ * will wait for them to stop processing. ++ * This workaround might make statistics less accurate. Especially ++ * for flow deletion case, since there will be no other attempt. */ ++ if (!ovs_mutex_trylock(&dp->port_mutex)) { ++ ret = netdev_flow_get(netdev, &match, &actions, ++ &netdev_flow->mega_ufid, stats, attrs, &buf); ++ /* Storing statistics and attributes from the last request for ++ * later use on mutex contention. */ ++ dp_netdev_flow_set_last_stats_attrs(netdev_flow, stats, attrs, ret); ++ ovs_mutex_unlock(&dp->port_mutex); ++ } else { ++ dp_netdev_flow_get_last_stats_attrs(netdev_flow, stats, attrs, &ret); ++ if (!ret && !attrs->dp_layer) { ++ /* Flow was never reported as 'offloaded' so it's harmless ++ * to continue to think so. */ ++ ret = EAGAIN; ++ } ++ } + netdev_close(netdev); + if (ret) { + return false; +@@ -3329,6 +3419,9 @@ dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd, + /* Do not allocate extra space. */ + flow = xmalloc(sizeof *flow - sizeof flow->cr.flow.mf + mask.len); + memset(&flow->stats, 0, sizeof flow->stats); ++ atomic_init(&flow->netdev_flow_get_result, 0); ++ memset(&flow->last_stats, 0, sizeof flow->last_stats); ++ memset(&flow->last_attrs, 0, sizeof flow->last_attrs); + flow->dead = false; + flow->batch = NULL; + flow->mark = INVALID_FLOW_MARK; +@@ -4940,9 +5033,17 @@ reconfigure_datapath(struct dp_netdev *dp) + + /* Check for all the ports that need reconfiguration. We cache this in + * 'port->need_reconfigure', because netdev_is_reconf_required() can +- * change at any time. */ ++ * change at any time. ++ * Also mark for reconfiguration all ports which will likely change their ++ * 'dynamic_txqs' parameter. It's required to stop using them before ++ * changing this setting and it's simpler to mark ports here and allow ++ * 'pmd_remove_stale_ports' to remove them from threads. There will be ++ * no actual reconfiguration in 'port_reconfigure' because it's ++ * unnecessary. 
*/ + HMAP_FOR_EACH (port, node, &dp->ports) { +- if (netdev_is_reconf_required(port->netdev)) { ++ if (netdev_is_reconf_required(port->netdev) ++ || (port->dynamic_txqs ++ != (netdev_n_txq(port->netdev) < wanted_txqs))) { + port->need_reconfigure = true; + } + } +diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c +index 5b5c96d727..f9c732886f 100644 +--- a/lib/dpif-netlink.c ++++ b/lib/dpif-netlink.c +@@ -691,6 +691,7 @@ dpif_netlink_set_features(struct dpif *dpif_, uint32_t new_features) + + dpif_netlink_dp_init(&request); + request.cmd = OVS_DP_CMD_SET; ++ request.name = dpif_->base_name; + request.dp_ifindex = dpif->dp_ifindex; + request.user_features = dpif->user_features | new_features; + +@@ -2091,6 +2092,7 @@ parse_flow_put(struct dpif_netlink *dpif, struct dpif_flow_put *put) + info.tunnel_csum_on = csum_on; + info.recirc_id_shared_with_tc = (dpif->user_features + & OVS_DP_F_TC_RECIRC_SHARING); ++ info.tc_modify_flow_deleted = false; + err = netdev_flow_put(dev, &match, + CONST_CAST(struct nlattr *, put->actions), + put->actions_len, +@@ -2141,7 +2143,11 @@ parse_flow_put(struct dpif_netlink *dpif, struct dpif_flow_put *put) + out: + if (err && err != EEXIST && (put->flags & DPIF_FP_MODIFY)) { + /* Modified rule can't be offloaded, try and delete from HW */ +- int del_err = netdev_flow_del(dev, put->ufid, put->stats); ++ int del_err = 0; ++ ++ if (!info.tc_modify_flow_deleted) { ++ del_err = netdev_flow_del(dev, put->ufid, put->stats); ++ } + + if (!del_err) { + /* Delete from hw success, so old flow was offloaded. +diff --git a/lib/meta-flow.c b/lib/meta-flow.c +index 8b62e6d968..80063b933d 100644 +--- a/lib/meta-flow.c ++++ b/lib/meta-flow.c +@@ -2296,12 +2296,6 @@ mf_set(const struct mf_field *mf, + switch (mf->id) { + case MFF_CT_ZONE: + case MFF_CT_NW_PROTO: +- case MFF_CT_NW_SRC: +- case MFF_CT_NW_DST: +- case MFF_CT_IPV6_SRC: +- case MFF_CT_IPV6_DST: +- case MFF_CT_TP_SRC: +- case MFF_CT_TP_DST: + case MFF_RECIRC_ID: + case MFF_PACKET_TYPE: + case MFF_CONJ_ID: +@@ -2419,6 +2413,30 @@ mf_set(const struct mf_field *mf, + ntoh128(mask->be128)); + break; + ++ case MFF_CT_NW_SRC: ++ match_set_ct_nw_src_masked(match, value->be32, mask->be32); ++ break; ++ ++ case MFF_CT_NW_DST: ++ match_set_ct_nw_dst_masked(match, value->be32, mask->be32); ++ break; ++ ++ case MFF_CT_IPV6_SRC: ++ match_set_ct_ipv6_src_masked(match, &value->ipv6, &mask->ipv6); ++ break; ++ ++ case MFF_CT_IPV6_DST: ++ match_set_ct_ipv6_dst_masked(match, &value->ipv6, &mask->ipv6); ++ break; ++ ++ case MFF_CT_TP_SRC: ++ match_set_ct_tp_src_masked(match, value->be16, mask->be16); ++ break; ++ ++ case MFF_CT_TP_DST: ++ match_set_ct_tp_dst_masked(match, value->be16, mask->be16); ++ break; ++ + case MFF_ETH_DST: + match_set_dl_dst_masked(match, value->mac, mask->mac); + break; +diff --git a/lib/meta-flow.xml b/lib/meta-flow.xml +index 90b405c737..2f9c5ee163 100644 +--- a/lib/meta-flow.xml ++++ b/lib/meta-flow.xml +@@ -2566,8 +2566,8 @@ actions=clone(load:0->NXM_OF_IN_PORT[],output:123) + +
+         <dt><code>est</code> (0x02)</dt>
+         <dd>
+-          Part of an existing connection.  Set to 1 if this is a committed
+-          connection.
++          Part of an existing connection.  Set to 1 if packets of a committed
++          connection have been seen by conntrack from both directions.
+         </dd>
+
+         <dt><code>rel</code> (0x04)</dt>
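The sharpened "est" description above matters when writing flow rules: conntrack sets the flag only once it has seen traffic in both directions of a committed connection, not merely on commit. A minimal, hypothetical helper showing how a rule keyed on that state might be installed (bridge name, table, and priority are illustrative and not taken from this patch):

    import subprocess

    def allow_established(bridge="br0", priority=100):
        # Accept only packets that went through ct (+trk) and belong to a
        # connection conntrack has confirmed in both directions (+est).
        flow = ("table=1,priority=%d,ip,ct_state=+trk+est,actions=normal"
                % priority)
        subprocess.check_call(["ovs-ofctl", "add-flow", bridge, flow])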
+diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c +index 6187129c00..7ab81864db 100644 +--- a/lib/netdev-dpdk.c ++++ b/lib/netdev-dpdk.c +@@ -152,6 +152,16 @@ typedef uint16_t dpdk_port_t; + + #define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ) + ++/* List of required flags advertised by the hardware that will be used ++ * if TSO is enabled. Ideally this should include DEV_TX_OFFLOAD_SCTP_CKSUM. ++ * However, very few drivers supports that the moment and SCTP is not a ++ * widely used protocol as TCP and UDP, so it's optional. */ ++#define DPDK_TX_TSO_OFFLOAD_FLAGS (DEV_TX_OFFLOAD_TCP_TSO \ ++ | DEV_TX_OFFLOAD_TCP_CKSUM \ ++ | DEV_TX_OFFLOAD_UDP_CKSUM \ ++ | DEV_TX_OFFLOAD_IPV4_CKSUM) ++ ++ + static const struct rte_eth_conf port_conf = { + .rxmode = { + .mq_mode = ETH_MQ_RX_RSS, +@@ -415,6 +425,7 @@ enum dpdk_hw_ol_features { + NETDEV_RX_HW_CRC_STRIP = 1 << 1, + NETDEV_RX_HW_SCATTER = 1 << 2, + NETDEV_TX_TSO_OFFLOAD = 1 << 3, ++ NETDEV_TX_SCTP_CHECKSUM_OFFLOAD = 1 << 4, + }; + + /* +@@ -997,9 +1008,10 @@ dpdk_eth_dev_port_config(struct netdev_dpdk *dev, int n_rxq, int n_txq) + } + + if (dev->hw_ol_features & NETDEV_TX_TSO_OFFLOAD) { +- conf.txmode.offloads |= DEV_TX_OFFLOAD_TCP_TSO; +- conf.txmode.offloads |= DEV_TX_OFFLOAD_TCP_CKSUM; +- conf.txmode.offloads |= DEV_TX_OFFLOAD_IPV4_CKSUM; ++ conf.txmode.offloads |= DPDK_TX_TSO_OFFLOAD_FLAGS; ++ if (dev->hw_ol_features & NETDEV_TX_SCTP_CHECKSUM_OFFLOAD) { ++ conf.txmode.offloads |= DEV_TX_OFFLOAD_SCTP_CKSUM; ++ } + } + + /* Limit configured rss hash functions to only those supported +@@ -1100,12 +1112,10 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev) + struct rte_ether_addr eth_addr; + int diag; + int n_rxq, n_txq; ++ uint32_t tx_tso_offload_capa = DPDK_TX_TSO_OFFLOAD_FLAGS; + uint32_t rx_chksm_offload_capa = DEV_RX_OFFLOAD_UDP_CKSUM | + DEV_RX_OFFLOAD_TCP_CKSUM | + DEV_RX_OFFLOAD_IPV4_CKSUM; +- uint32_t tx_tso_offload_capa = DEV_TX_OFFLOAD_TCP_TSO | +- DEV_TX_OFFLOAD_TCP_CKSUM | +- DEV_TX_OFFLOAD_IPV4_CKSUM; + + rte_eth_dev_info_get(dev->port_id, &info); + +@@ -1137,6 +1147,13 @@ dpdk_eth_dev_init(struct netdev_dpdk *dev) + if ((info.tx_offload_capa & tx_tso_offload_capa) + == tx_tso_offload_capa) { + dev->hw_ol_features |= NETDEV_TX_TSO_OFFLOAD; ++ if (info.tx_offload_capa & DEV_TX_OFFLOAD_SCTP_CKSUM) { ++ dev->hw_ol_features |= NETDEV_TX_SCTP_CHECKSUM_OFFLOAD; ++ } else { ++ VLOG_WARN("%s: Tx SCTP checksum offload is not supported, " ++ "SCTP packets sent to this device will be dropped", ++ netdev_get_name(&dev->up)); ++ } + } else { + VLOG_WARN("%s: Tx TSO offload is not supported.", + netdev_get_name(&dev->up)); +@@ -5110,7 +5127,11 @@ netdev_dpdk_reconfigure(struct netdev *netdev) + if (dev->hw_ol_features & NETDEV_TX_TSO_OFFLOAD) { + netdev->ol_flags |= NETDEV_TX_OFFLOAD_TCP_TSO; + netdev->ol_flags |= NETDEV_TX_OFFLOAD_TCP_CKSUM; ++ netdev->ol_flags |= NETDEV_TX_OFFLOAD_UDP_CKSUM; + netdev->ol_flags |= NETDEV_TX_OFFLOAD_IPV4_CKSUM; ++ if (dev->hw_ol_features & NETDEV_TX_SCTP_CHECKSUM_OFFLOAD) { ++ netdev->ol_flags |= NETDEV_TX_OFFLOAD_SCTP_CKSUM; ++ } + } + + dev->tx_q = netdev_dpdk_alloc_txq(netdev->n_txq); +@@ -5186,6 +5207,7 @@ netdev_dpdk_vhost_client_reconfigure(struct netdev *netdev) + struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); + int err; + uint64_t vhost_flags = 0; ++ uint64_t vhost_unsup_flags; + bool zc_enabled; + + ovs_mutex_lock(&dev->mutex); +@@ -5251,17 +5273,24 @@ netdev_dpdk_vhost_client_reconfigure(struct netdev *netdev) + if (userspace_tso_enabled()) { + netdev->ol_flags |= 
NETDEV_TX_OFFLOAD_TCP_TSO; + netdev->ol_flags |= NETDEV_TX_OFFLOAD_TCP_CKSUM; ++ netdev->ol_flags |= NETDEV_TX_OFFLOAD_UDP_CKSUM; ++ netdev->ol_flags |= NETDEV_TX_OFFLOAD_SCTP_CKSUM; + netdev->ol_flags |= NETDEV_TX_OFFLOAD_IPV4_CKSUM; ++ vhost_unsup_flags = 1ULL << VIRTIO_NET_F_HOST_ECN ++ | 1ULL << VIRTIO_NET_F_HOST_UFO; + } else { +- err = rte_vhost_driver_disable_features(dev->vhost_id, +- 1ULL << VIRTIO_NET_F_HOST_TSO4 +- | 1ULL << VIRTIO_NET_F_HOST_TSO6 +- | 1ULL << VIRTIO_NET_F_CSUM); +- if (err) { +- VLOG_ERR("rte_vhost_driver_disable_features failed for " +- "vhost user client port: %s\n", dev->up.name); +- goto unlock; +- } ++ /* This disables checksum offloading and all the features ++ * that depends on it (TSO, UFO, ECN) according to virtio ++ * specification. */ ++ vhost_unsup_flags = 1ULL << VIRTIO_NET_F_CSUM; ++ } ++ ++ err = rte_vhost_driver_disable_features(dev->vhost_id, ++ vhost_unsup_flags); ++ if (err) { ++ VLOG_ERR("rte_vhost_driver_disable_features failed for " ++ "vhost user client port: %s\n", dev->up.name); ++ goto unlock; + } + + err = rte_vhost_driver_start(dev->vhost_id); +diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c +index c6f3d27409..8d779945a1 100644 +--- a/lib/netdev-linux.c ++++ b/lib/netdev-linux.c +@@ -231,6 +231,14 @@ struct rtnl_link_stats64 { + uint64_t tx_compressed; + }; + ++/* Linux 3.19 introduced virtio_types.h. It might be missing ++ * if we are using old kernel. */ ++#ifndef HAVE_VIRTIO_TYPES ++typedef __u16 __bitwise__ __virtio16; ++typedef __u32 __bitwise__ __virtio32; ++typedef __u64 __bitwise__ __virtio64; ++#endif ++ + enum { + VALID_IFINDEX = 1 << 0, + VALID_ETHERADDR = 1 << 1, +@@ -659,10 +667,6 @@ netdev_linux_update_lag(struct rtnetlink_change *change) + { + struct linux_lag_slave *lag; + +- if (!rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)) { +- return; +- } +- + if (change->slave && netdev_linux_kind_is_lag(change->slave)) { + lag = shash_find_data(&lag_shash, change->ifname); + +@@ -760,8 +764,11 @@ netdev_linux_run(const struct netdev_class *netdev_class OVS_UNUSED) + netdev_linux_update(netdev, nsid, &change); + ovs_mutex_unlock(&netdev->mutex); + } +- else if (!netdev_ && change.ifname) { +- /* Netdev is not present in OvS but its master could be. */ ++ ++ if (change.ifname && ++ rtnetlink_type_is_rtnlgrp_link(change.nlmsg_type)) { ++ ++ /* Need to try updating the LAG information. */ + ovs_mutex_lock(&lag_mutex); + netdev_linux_update_lag(&change); + ovs_mutex_unlock(&lag_mutex); +@@ -923,6 +930,8 @@ netdev_linux_common_construct(struct netdev *netdev_) + if (userspace_tso_enabled()) { + netdev_->ol_flags |= NETDEV_TX_OFFLOAD_TCP_TSO; + netdev_->ol_flags |= NETDEV_TX_OFFLOAD_TCP_CKSUM; ++ netdev_->ol_flags |= NETDEV_TX_OFFLOAD_UDP_CKSUM; ++ netdev_->ol_flags |= NETDEV_TX_OFFLOAD_SCTP_CKSUM; + netdev_->ol_flags |= NETDEV_TX_OFFLOAD_IPV4_CKSUM; + } + +diff --git a/lib/netdev-offload-dpdk.c b/lib/netdev-offload-dpdk.c +index f8c46bbaad..4538baf5e6 100644 +--- a/lib/netdev-offload-dpdk.c ++++ b/lib/netdev-offload-dpdk.c +@@ -565,8 +565,18 @@ parse_flow_match(struct flow_patterns *patterns, + uint8_t proto = 0; + + /* Eth */ +- if (!eth_addr_is_zero(match->wc.masks.dl_src) || +- !eth_addr_is_zero(match->wc.masks.dl_dst)) { ++ if (match->wc.masks.dl_type == OVS_BE16_MAX && is_ip_any(&match->flow) ++ && eth_addr_is_zero(match->wc.masks.dl_dst) ++ && eth_addr_is_zero(match->wc.masks.dl_src)) { ++ /* ++ * This is a temporary work around to fix ethernet pattern for partial ++ * hardware offload for X710 devices. 
This fix will be reverted once ++ * the issue is fixed within the i40e PMD driver. ++ */ ++ add_flow_pattern(patterns, RTE_FLOW_ITEM_TYPE_ETH, NULL, NULL); ++ } else if (match->wc.masks.dl_type || ++ !eth_addr_is_zero(match->wc.masks.dl_src) || ++ !eth_addr_is_zero(match->wc.masks.dl_dst)) { + struct rte_flow_item_eth *spec, *mask; + + spec = xzalloc(sizeof *spec); +@@ -581,15 +591,6 @@ parse_flow_match(struct flow_patterns *patterns, + mask->type = match->wc.masks.dl_type; + + add_flow_pattern(patterns, RTE_FLOW_ITEM_TYPE_ETH, spec, mask); +- } else { +- /* +- * If user specifies a flow (like UDP flow) without L2 patterns, +- * OVS will at least set the dl_type. Normally, it's enough to +- * create an eth pattern just with it. Unluckily, some Intel's +- * NIC (such as XL710) doesn't support that. Below is a workaround, +- * which simply matches any L2 pkts. +- */ +- add_flow_pattern(patterns, RTE_FLOW_ITEM_TYPE_ETH, NULL, NULL); + } + + /* VLAN */ +diff --git a/lib/netdev-offload-tc.c b/lib/netdev-offload-tc.c +index 550e440b3a..e188e63e56 100644 +--- a/lib/netdev-offload-tc.c ++++ b/lib/netdev-offload-tc.c +@@ -1727,7 +1727,7 @@ netdev_tc_flow_put(struct netdev *netdev, struct match *match, + if (get_ufid_tc_mapping(ufid, &id) == 0) { + VLOG_DBG_RL(&rl, "updating old handle: %d prio: %d", + id.handle, id.prio); +- del_filter_and_ufid_mapping(&id, ufid); ++ info->tc_modify_flow_deleted = !del_filter_and_ufid_mapping(&id, ufid); + } + + prio = get_prio_for_tc_flower(&flower); +@@ -1907,6 +1907,7 @@ netdev_tc_init_flow_api(struct netdev *netdev) + static struct ovsthread_once block_once = OVSTHREAD_ONCE_INITIALIZER; + enum tc_qdisc_hook hook = get_tc_qdisc_hook(netdev); + uint32_t block_id = 0; ++ struct tcf_id id; + int ifindex; + int error; + +@@ -1917,11 +1918,21 @@ netdev_tc_init_flow_api(struct netdev *netdev) + return -ifindex; + } + ++ block_id = get_block_id_from_netdev(netdev); ++ ++ /* Flush rules explicitly needed when we work with ingress_block, ++ * so we will not fail with reattaching block to bond iface, for ex. ++ */ ++ id = tc_make_tcf_id(ifindex, block_id, 0, hook); ++ tc_del_filter(&id); ++ + /* make sure there is no ingress/egress qdisc */ + tc_add_del_qdisc(ifindex, false, 0, hook); + + if (ovsthread_once_start(&block_once)) { + probe_tc_block_support(ifindex); ++ /* Need to re-fetch block id as it depends on feature availability. */ ++ block_id = get_block_id_from_netdev(netdev); + ovsthread_once_done(&block_once); + } + +@@ -1930,7 +1941,6 @@ netdev_tc_init_flow_api(struct netdev *netdev) + ovsthread_once_done(&multi_mask_once); + } + +- block_id = get_block_id_from_netdev(netdev); + error = tc_add_del_qdisc(ifindex, true, block_id, hook); + + if (error && error != EEXIST) { +diff --git a/lib/netdev-offload.h b/lib/netdev-offload.h +index cd6dfdfff4..b4b882a56a 100644 +--- a/lib/netdev-offload.h ++++ b/lib/netdev-offload.h +@@ -74,6 +74,9 @@ struct offload_info { + * it will be in the pkt meta data. + */ + uint32_t flow_mark; ++ ++ bool tc_modify_flow_deleted; /* Indicate the tc modify flow put success ++ * to delete the original flow. 
*/ + }; + + int netdev_flow_flush(struct netdev *); +diff --git a/lib/netdev-provider.h b/lib/netdev-provider.h +index 22f4cde333..6f509424bc 100644 +--- a/lib/netdev-provider.h ++++ b/lib/netdev-provider.h +@@ -40,7 +40,9 @@ struct netdev_tnl_build_header_params; + enum netdev_ol_flags { + NETDEV_TX_OFFLOAD_IPV4_CKSUM = 1 << 0, + NETDEV_TX_OFFLOAD_TCP_CKSUM = 1 << 1, +- NETDEV_TX_OFFLOAD_TCP_TSO = 1 << 2, ++ NETDEV_TX_OFFLOAD_UDP_CKSUM = 1 << 2, ++ NETDEV_TX_OFFLOAD_SCTP_CKSUM = 1 << 3, ++ NETDEV_TX_OFFLOAD_TCP_TSO = 1 << 4, + }; + + /* A network device (e.g. an Ethernet device). +diff --git a/lib/netdev.c b/lib/netdev.c +index f95b19af4d..8c44eee8e9 100644 +--- a/lib/netdev.c ++++ b/lib/netdev.c +@@ -791,6 +791,8 @@ static bool + netdev_send_prepare_packet(const uint64_t netdev_flags, + struct dp_packet *packet, char **errormsg) + { ++ uint64_t l4_mask; ++ + if (dp_packet_hwol_is_tso(packet) + && !(netdev_flags & NETDEV_TX_OFFLOAD_TCP_TSO)) { + /* Fall back to GSO in software. */ +@@ -798,11 +800,31 @@ netdev_send_prepare_packet(const uint64_t netdev_flags, + return false; + } + +- if (dp_packet_hwol_l4_mask(packet) +- && !(netdev_flags & NETDEV_TX_OFFLOAD_TCP_CKSUM)) { +- /* Fall back to L4 csum in software. */ +- VLOG_ERR_BUF(errormsg, "No L4 checksum support"); ++ l4_mask = dp_packet_hwol_l4_mask(packet); ++ if (l4_mask) { ++ if (dp_packet_hwol_l4_is_tcp(packet)) { ++ if (!(netdev_flags & NETDEV_TX_OFFLOAD_TCP_CKSUM)) { ++ /* Fall back to TCP csum in software. */ ++ VLOG_ERR_BUF(errormsg, "No TCP checksum support"); ++ return false; ++ } ++ } else if (dp_packet_hwol_l4_is_udp(packet)) { ++ if (!(netdev_flags & NETDEV_TX_OFFLOAD_UDP_CKSUM)) { ++ /* Fall back to UDP csum in software. */ ++ VLOG_ERR_BUF(errormsg, "No UDP checksum support"); ++ return false; ++ } ++ } else if (dp_packet_hwol_l4_is_sctp(packet)) { ++ if (!(netdev_flags & NETDEV_TX_OFFLOAD_SCTP_CKSUM)) { ++ /* Fall back to SCTP csum in software. */ ++ VLOG_ERR_BUF(errormsg, "No SCTP checksum support"); ++ return false; ++ } ++ } else { ++ VLOG_ERR_BUF(errormsg, "No L4 checksum support: mask: %"PRIu64, ++ l4_mask); + return false; ++ } + } + + return true; +diff --git a/lib/odp-execute.c b/lib/odp-execute.c +index 42d3335f0f..97320a4dba 100644 +--- a/lib/odp-execute.c ++++ b/lib/odp-execute.c +@@ -761,10 +761,11 @@ odp_execute_check_pkt_len(void *dp, struct dp_packet *packet, bool steal, + + const struct nlattr *a; + struct dp_packet_batch pb; ++ uint32_t size = dp_packet_get_send_len(packet) ++ - dp_packet_l2_pad_size(packet); + + a = attrs[OVS_CHECK_PKT_LEN_ATTR_PKT_LEN]; +- bool is_greater = dp_packet_size(packet) > nl_attr_get_u16(a); +- if (is_greater) { ++ if (size > nl_attr_get_u16(a)) { + a = attrs[OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER]; + } else { + a = attrs[OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL]; +diff --git a/lib/odp-util.c b/lib/odp-util.c +index 746d1e97d4..6baa2a8a70 100644 +--- a/lib/odp-util.c ++++ b/lib/odp-util.c +@@ -6225,7 +6225,9 @@ odp_flow_key_from_flow__(const struct odp_flow_key_parms *parms, + struct ovs_key_nd_extensions *nd_ext_key; + + if (data->igmp_group_ip4 != 0 || data->tcp_flags != 0) { +- nd_ext_key = nl_msg_put_unspec_uninit(buf, ++ /* 'struct ovs_key_nd_extensions' has padding, ++ * clear it. 
*/ ++ nd_ext_key = nl_msg_put_unspec_zero(buf, + OVS_KEY_ATTR_ND_EXTENSIONS, + sizeof *nd_ext_key); + nd_ext_key->nd_reserved = data->igmp_group_ip4; +@@ -6275,6 +6277,10 @@ odp_key_from_dp_packet(struct ofpbuf *buf, const struct dp_packet *packet) + + nl_msg_put_u32(buf, OVS_KEY_ATTR_PRIORITY, md->skb_priority); + ++ if (md->dp_hash) { ++ nl_msg_put_u32(buf, OVS_KEY_ATTR_DP_HASH, md->dp_hash); ++ } ++ + if (flow_tnl_dst_is_set(&md->tunnel)) { + tun_key_to_attr(buf, &md->tunnel, &md->tunnel, NULL, NULL); + } +@@ -7565,6 +7571,28 @@ struct offsetof_sizeof { + int size; + }; + ++ ++/* Performs bitwise OR over the fields in 'dst_' and 'src_' specified in ++ * 'offsetof_sizeof_arr' array. Result is stored in 'dst_'. */ ++static void ++or_masks(void *dst_, const void *src_, ++ struct offsetof_sizeof *offsetof_sizeof_arr) ++{ ++ int field, size, offset; ++ const uint8_t *src = src_; ++ uint8_t *dst = dst_; ++ ++ for (field = 0; ; field++) { ++ size = offsetof_sizeof_arr[field].size; ++ offset = offsetof_sizeof_arr[field].offset; ++ ++ if (!size) { ++ return; ++ } ++ or_bytes(dst + offset, src + offset, size); ++ } ++} ++ + /* Compares each of the fields in 'key0' and 'key1'. The fields are specified + * in 'offsetof_sizeof_arr', which is an array terminated by a 0-size field. + * Returns true if all of the fields are equal, false if at least one differs. +@@ -7643,9 +7671,10 @@ commit_set_ether_action(const struct flow *flow, struct flow *base_flow, + struct flow_wildcards *wc, + bool use_masked) + { +- struct ovs_key_ethernet key, base, mask; ++ struct ovs_key_ethernet key, base, mask, orig_mask; + struct offsetof_sizeof ovs_key_ethernet_offsetof_sizeof_arr[] = + OVS_KEY_ETHERNET_OFFSETOF_SIZEOF_ARR; ++ + if (flow->packet_type != htonl(PT_ETH)) { + return; + } +@@ -7653,11 +7682,13 @@ commit_set_ether_action(const struct flow *flow, struct flow *base_flow, + get_ethernet_key(flow, &key); + get_ethernet_key(base_flow, &base); + get_ethernet_key(&wc->masks, &mask); ++ memcpy(&orig_mask, &mask, sizeof mask); + + if (commit(OVS_KEY_ATTR_ETHERNET, use_masked, + &key, &base, &mask, sizeof key, + ovs_key_ethernet_offsetof_sizeof_arr, odp_actions)) { + put_ethernet_key(&base, base_flow); ++ or_masks(&mask, &orig_mask, ovs_key_ethernet_offsetof_sizeof_arr); + put_ethernet_key(&mask, &wc->masks); + } + } +@@ -7781,7 +7812,7 @@ commit_set_ipv4_action(const struct flow *flow, struct flow *base_flow, + struct ofpbuf *odp_actions, struct flow_wildcards *wc, + bool use_masked) + { +- struct ovs_key_ipv4 key, mask, base; ++ struct ovs_key_ipv4 key, mask, orig_mask, base; + struct offsetof_sizeof ovs_key_ipv4_offsetof_sizeof_arr[] = + OVS_KEY_IPV4_OFFSETOF_SIZEOF_ARR; + +@@ -7792,6 +7823,7 @@ commit_set_ipv4_action(const struct flow *flow, struct flow *base_flow, + get_ipv4_key(flow, &key, false); + get_ipv4_key(base_flow, &base, false); + get_ipv4_key(&wc->masks, &mask, true); ++ memcpy(&orig_mask, &mask, sizeof mask); + mask.ipv4_proto = 0; /* Not writeable. */ + mask.ipv4_frag = 0; /* Not writable. */ + +@@ -7803,9 +7835,8 @@ commit_set_ipv4_action(const struct flow *flow, struct flow *base_flow, + if (commit(OVS_KEY_ATTR_IPV4, use_masked, &key, &base, &mask, sizeof key, + ovs_key_ipv4_offsetof_sizeof_arr, odp_actions)) { + put_ipv4_key(&base, base_flow, false); +- if (mask.ipv4_proto != 0) { /* Mask was changed by commit(). 
*/ +- put_ipv4_key(&mask, &wc->masks, true); +- } ++ or_masks(&mask, &orig_mask, ovs_key_ipv4_offsetof_sizeof_arr); ++ put_ipv4_key(&mask, &wc->masks, true); + } + } + +@@ -7838,7 +7869,7 @@ commit_set_ipv6_action(const struct flow *flow, struct flow *base_flow, + struct ofpbuf *odp_actions, struct flow_wildcards *wc, + bool use_masked) + { +- struct ovs_key_ipv6 key, mask, base; ++ struct ovs_key_ipv6 key, mask, orig_mask, base; + struct offsetof_sizeof ovs_key_ipv6_offsetof_sizeof_arr[] = + OVS_KEY_IPV6_OFFSETOF_SIZEOF_ARR; + +@@ -7849,6 +7880,7 @@ commit_set_ipv6_action(const struct flow *flow, struct flow *base_flow, + get_ipv6_key(flow, &key, false); + get_ipv6_key(base_flow, &base, false); + get_ipv6_key(&wc->masks, &mask, true); ++ memcpy(&orig_mask, &mask, sizeof mask); + mask.ipv6_proto = 0; /* Not writeable. */ + mask.ipv6_frag = 0; /* Not writable. */ + mask.ipv6_label &= htonl(IPV6_LABEL_MASK); /* Not writable. */ +@@ -7861,9 +7893,8 @@ commit_set_ipv6_action(const struct flow *flow, struct flow *base_flow, + if (commit(OVS_KEY_ATTR_IPV6, use_masked, &key, &base, &mask, sizeof key, + ovs_key_ipv6_offsetof_sizeof_arr, odp_actions)) { + put_ipv6_key(&base, base_flow, false); +- if (mask.ipv6_proto != 0) { /* Mask was changed by commit(). */ +- put_ipv6_key(&mask, &wc->masks, true); +- } ++ or_masks(&mask, &orig_mask, ovs_key_ipv6_offsetof_sizeof_arr); ++ put_ipv6_key(&mask, &wc->masks, true); + } + } + +@@ -7894,17 +7925,19 @@ static enum slow_path_reason + commit_set_arp_action(const struct flow *flow, struct flow *base_flow, + struct ofpbuf *odp_actions, struct flow_wildcards *wc) + { +- struct ovs_key_arp key, mask, base; ++ struct ovs_key_arp key, mask, orig_mask, base; + struct offsetof_sizeof ovs_key_arp_offsetof_sizeof_arr[] = + OVS_KEY_ARP_OFFSETOF_SIZEOF_ARR; + + get_arp_key(flow, &key); + get_arp_key(base_flow, &base); + get_arp_key(&wc->masks, &mask); ++ memcpy(&orig_mask, &mask, sizeof mask); + + if (commit(OVS_KEY_ATTR_ARP, true, &key, &base, &mask, sizeof key, + ovs_key_arp_offsetof_sizeof_arr, odp_actions)) { + put_arp_key(&base, base_flow); ++ or_masks(&mask, &orig_mask, ovs_key_arp_offsetof_sizeof_arr); + put_arp_key(&mask, &wc->masks); + return SLOW_ACTION; + } +@@ -7931,7 +7964,7 @@ static enum slow_path_reason + commit_set_icmp_action(const struct flow *flow, struct flow *base_flow, + struct ofpbuf *odp_actions, struct flow_wildcards *wc) + { +- struct ovs_key_icmp key, mask, base; ++ struct ovs_key_icmp key, mask, orig_mask, base; + struct offsetof_sizeof ovs_key_icmp_offsetof_sizeof_arr[] = + OVS_KEY_ICMP_OFFSETOF_SIZEOF_ARR; + enum ovs_key_attr attr; +@@ -7947,10 +7980,12 @@ commit_set_icmp_action(const struct flow *flow, struct flow *base_flow, + get_icmp_key(flow, &key); + get_icmp_key(base_flow, &base); + get_icmp_key(&wc->masks, &mask); ++ memcpy(&orig_mask, &mask, sizeof mask); + + if (commit(attr, false, &key, &base, &mask, sizeof key, + ovs_key_icmp_offsetof_sizeof_arr, odp_actions)) { + put_icmp_key(&base, base_flow); ++ or_masks(&mask, &orig_mask, ovs_key_icmp_offsetof_sizeof_arr); + put_icmp_key(&mask, &wc->masks); + return SLOW_ACTION; + } +@@ -7998,17 +8033,19 @@ commit_set_nd_action(const struct flow *flow, struct flow *base_flow, + struct ofpbuf *odp_actions, + struct flow_wildcards *wc, bool use_masked) + { +- struct ovs_key_nd key, mask, base; ++ struct ovs_key_nd key, mask, orig_mask, base; + struct offsetof_sizeof ovs_key_nd_offsetof_sizeof_arr[] = + OVS_KEY_ND_OFFSETOF_SIZEOF_ARR; + + get_nd_key(flow, &key); + get_nd_key(base_flow, 
&base); + get_nd_key(&wc->masks, &mask); ++ memcpy(&orig_mask, &mask, sizeof mask); + + if (commit(OVS_KEY_ATTR_ND, use_masked, &key, &base, &mask, sizeof key, + ovs_key_nd_offsetof_sizeof_arr, odp_actions)) { + put_nd_key(&base, base_flow); ++ or_masks(&mask, &orig_mask, ovs_key_nd_offsetof_sizeof_arr); + put_nd_key(&mask, &wc->masks); + return SLOW_ACTION; + } +@@ -8022,18 +8059,20 @@ commit_set_nd_extensions_action(const struct flow *flow, + struct ofpbuf *odp_actions, + struct flow_wildcards *wc, bool use_masked) + { +- struct ovs_key_nd_extensions key, mask, base; ++ struct ovs_key_nd_extensions key, mask, orig_mask, base; + struct offsetof_sizeof ovs_key_nd_extensions_offsetof_sizeof_arr[] = + OVS_KEY_ND_EXTENSIONS_OFFSETOF_SIZEOF_ARR; + + get_nd_extensions_key(flow, &key); + get_nd_extensions_key(base_flow, &base); + get_nd_extensions_key(&wc->masks, &mask); ++ memcpy(&orig_mask, &mask, sizeof mask); + + if (commit(OVS_KEY_ATTR_ND_EXTENSIONS, use_masked, &key, &base, &mask, + sizeof key, ovs_key_nd_extensions_offsetof_sizeof_arr, + odp_actions)) { + put_nd_extensions_key(&base, base_flow); ++ or_masks(&mask, &orig_mask, ovs_key_nd_extensions_offsetof_sizeof_arr); + put_nd_extensions_key(&mask, &wc->masks); + return SLOW_ACTION; + } +@@ -8248,7 +8287,7 @@ commit_set_port_action(const struct flow *flow, struct flow *base_flow, + bool use_masked) + { + enum ovs_key_attr key_type; +- union ovs_key_tp key, mask, base; ++ union ovs_key_tp key, mask, orig_mask, base; + struct offsetof_sizeof ovs_key_tp_offsetof_sizeof_arr[] = + OVS_KEY_TCP_OFFSETOF_SIZEOF_ARR; + +@@ -8274,10 +8313,12 @@ commit_set_port_action(const struct flow *flow, struct flow *base_flow, + get_tp_key(flow, &key); + get_tp_key(base_flow, &base); + get_tp_key(&wc->masks, &mask); ++ memcpy(&orig_mask, &mask, sizeof mask); + + if (commit(key_type, use_masked, &key, &base, &mask, sizeof key, + ovs_key_tp_offsetof_sizeof_arr, odp_actions)) { + put_tp_key(&base, base_flow); ++ or_masks(&mask, &orig_mask, ovs_key_tp_offsetof_sizeof_arr); + put_tp_key(&mask, &wc->masks); + } + } +@@ -8301,7 +8342,7 @@ commit_set_priority_action(const struct flow *flow, struct flow *base_flow, + if (commit(OVS_KEY_ATTR_PRIORITY, use_masked, &key, &base, &mask, + sizeof key, ovs_key_prio_offsetof_sizeof_arr, odp_actions)) { + base_flow->skb_priority = base; +- wc->masks.skb_priority = mask; ++ wc->masks.skb_priority |= mask; + } + } + +@@ -8325,7 +8366,7 @@ commit_set_pkt_mark_action(const struct flow *flow, struct flow *base_flow, + sizeof key, ovs_key_pkt_mark_offsetof_sizeof_arr, + odp_actions)) { + base_flow->pkt_mark = base; +- wc->masks.pkt_mark = mask; ++ wc->masks.pkt_mark |= mask; + } + } + +diff --git a/lib/ofp-actions.c b/lib/ofp-actions.c +index ddef3b0c87..ef8b2b4527 100644 +--- a/lib/ofp-actions.c ++++ b/lib/ofp-actions.c +@@ -6657,6 +6657,7 @@ parse_CT(char *arg, const struct ofpact_parse_params *pp) + } + + if (ofpbuf_oversized(pp->ofpacts)) { ++ free(error); + return xasprintf("input too big"); + } + +diff --git a/lib/ovs-rcu.c b/lib/ovs-rcu.c +index ebc8120f0f..cde1e925ba 100644 +--- a/lib/ovs-rcu.c ++++ b/lib/ovs-rcu.c +@@ -30,6 +30,8 @@ + + VLOG_DEFINE_THIS_MODULE(ovs_rcu); + ++#define MIN_CBS 16 ++ + struct ovsrcu_cb { + void (*function)(void *aux); + void *aux; +@@ -37,7 +39,8 @@ struct ovsrcu_cb { + + struct ovsrcu_cbset { + struct ovs_list list_node; +- struct ovsrcu_cb cbs[16]; ++ struct ovsrcu_cb *cbs; ++ size_t n_allocated; + int n_cbs; + }; + +@@ -310,16 +313,19 @@ ovsrcu_postpone__(void (*function)(void *aux), void 
*aux) + cbset = perthread->cbset; + if (!cbset) { + cbset = perthread->cbset = xmalloc(sizeof *perthread->cbset); ++ cbset->cbs = xmalloc(MIN_CBS * sizeof *cbset->cbs); ++ cbset->n_allocated = MIN_CBS; + cbset->n_cbs = 0; + } + ++ if (cbset->n_cbs == cbset->n_allocated) { ++ cbset->cbs = x2nrealloc(cbset->cbs, &cbset->n_allocated, ++ sizeof *cbset->cbs); ++ } ++ + cb = &cbset->cbs[cbset->n_cbs++]; + cb->function = function; + cb->aux = aux; +- +- if (cbset->n_cbs >= ARRAY_SIZE(cbset->cbs)) { +- ovsrcu_flush_cbset(perthread); +- } + } + + static bool +@@ -341,6 +347,7 @@ ovsrcu_call_postponed(void) + for (cb = cbset->cbs; cb < &cbset->cbs[cbset->n_cbs]; cb++) { + cb->function(cb->aux); + } ++ free(cbset->cbs); + free(cbset); + } + +diff --git a/lib/ovs-router.c b/lib/ovs-router.c +index bfb2b7071b..09b81c6e5a 100644 +--- a/lib/ovs-router.c ++++ b/lib/ovs-router.c +@@ -505,7 +505,7 @@ ovs_router_flush(void) + ovs_mutex_lock(&mutex); + classifier_defer(&cls); + CLS_FOR_EACH(rt, cr, &cls) { +- if (rt->priority == rt->plen) { ++ if (rt->priority == rt->plen || rt->local) { + rt_entry_delete__(&rt->cr); + } + } +diff --git a/lib/ovsdb-idl-provider.h b/lib/ovsdb-idl-provider.h +index 30d1d08eba..00497d940c 100644 +--- a/lib/ovsdb-idl-provider.h ++++ b/lib/ovsdb-idl-provider.h +@@ -122,8 +122,12 @@ struct ovsdb_idl_table { + unsigned int change_seqno[OVSDB_IDL_CHANGE_MAX]; + struct ovs_list indexes; /* Contains "struct ovsdb_idl_index"s */ + struct ovs_list track_list; /* Tracked rows (ovsdb_idl_row.track_node). */ +- struct ovsdb_idl_condition condition; +- bool cond_changed; ++ struct ovsdb_idl_condition *ack_cond; /* Last condition acked by the ++ * server. */ ++ struct ovsdb_idl_condition *req_cond; /* Last condition requested to the ++ * server. */ ++ struct ovsdb_idl_condition *new_cond; /* Latest condition set by the IDL ++ * client. 
*/ + }; + + struct ovsdb_idl_class { +diff --git a/lib/ovsdb-idl.c b/lib/ovsdb-idl.c +index 190143f363..5abe40f6d8 100644 +--- a/lib/ovsdb-idl.c ++++ b/lib/ovsdb-idl.c +@@ -240,6 +240,10 @@ static void ovsdb_idl_send_monitor_request(struct ovsdb_idl *, + struct ovsdb_idl_db *, + enum ovsdb_idl_monitor_method); + static void ovsdb_idl_db_clear(struct ovsdb_idl_db *db); ++static void ovsdb_idl_db_ack_condition(struct ovsdb_idl_db *db); ++static void ovsdb_idl_db_sync_condition(struct ovsdb_idl_db *db); ++static void ovsdb_idl_condition_move(struct ovsdb_idl_condition **dst, ++ struct ovsdb_idl_condition **src); + + struct ovsdb_idl { + struct ovsdb_idl_db server; +@@ -422,9 +426,11 @@ ovsdb_idl_db_init(struct ovsdb_idl_db *db, const struct ovsdb_idl_class *class, + = table->change_seqno[OVSDB_IDL_CHANGE_MODIFY] + = table->change_seqno[OVSDB_IDL_CHANGE_DELETE] = 0; + table->db = db; +- ovsdb_idl_condition_init(&table->condition); +- ovsdb_idl_condition_add_clause_true(&table->condition); +- table->cond_changed = false; ++ table->ack_cond = NULL; ++ table->req_cond = NULL; ++ table->new_cond = xmalloc(sizeof *table->new_cond); ++ ovsdb_idl_condition_init(table->new_cond); ++ ovsdb_idl_condition_add_clause_true(table->new_cond); + } + db->monitor_id = json_array_create_2(json_string_create("monid"), + json_string_create(class->database)); +@@ -556,12 +562,15 @@ ovsdb_idl_set_shuffle_remotes(struct ovsdb_idl *idl, bool shuffle) + static void + ovsdb_idl_db_destroy(struct ovsdb_idl_db *db) + { ++ struct ovsdb_idl_condition *null_cond = NULL; + ovs_assert(!db->txn); + ovsdb_idl_db_txn_abort_all(db); + ovsdb_idl_db_clear(db); + for (size_t i = 0; i < db->class_->n_tables; i++) { + struct ovsdb_idl_table *table = &db->tables[i]; +- ovsdb_idl_condition_destroy(&table->condition); ++ ovsdb_idl_condition_move(&table->ack_cond, &null_cond); ++ ovsdb_idl_condition_move(&table->req_cond, &null_cond); ++ ovsdb_idl_condition_move(&table->new_cond, &null_cond); + ovsdb_idl_destroy_indexes(table); + shash_destroy(&table->columns); + hmap_destroy(&table->rows); +@@ -610,7 +619,6 @@ ovsdb_idl_db_clear(struct ovsdb_idl_db *db) + struct ovsdb_idl_table *table = &db->tables[i]; + struct ovsdb_idl_row *row, *next_row; + +- table->cond_changed = false; + if (hmap_is_empty(&table->rows)) { + continue; + } +@@ -634,7 +642,6 @@ ovsdb_idl_db_clear(struct ovsdb_idl_db *db) + } + ovsdb_idl_row_destroy_postprocess(db); + +- db->cond_changed = false; + db->cond_seqno = 0; + ovsdb_idl_db_track_clear(db); + +@@ -692,6 +699,12 @@ ovsdb_idl_send_request(struct ovsdb_idl *idl, struct jsonrpc_msg *request) + static void + ovsdb_idl_restart_fsm(struct ovsdb_idl *idl) + { ++ /* Resync data DB table conditions to avoid missing updates due to ++ * conditions that were in flight or changed locally while the connection ++ * was down. ++ */ ++ ovsdb_idl_db_sync_condition(&idl->data); ++ + ovsdb_idl_send_schema_request(idl, &idl->server); + ovsdb_idl_transition(idl, IDL_S_SERVER_SCHEMA_REQUESTED); + idl->data.monitoring = OVSDB_IDL_NOT_MONITORING; +@@ -799,7 +812,9 @@ ovsdb_idl_process_response(struct ovsdb_idl *idl, struct jsonrpc_msg *msg) + * do, it's a "monitor_cond_change", which means that the conditional + * monitor clauses were updated. + * +- * If further condition changes were pending, send them now. */ ++ * Mark the last requested conditions as acked and if further ++ * condition changes were pending, send them now. 
*/ ++ ovsdb_idl_db_ack_condition(&idl->data); + ovsdb_idl_send_cond_change(idl); + idl->data.cond_seqno++; + break; +@@ -1495,30 +1510,60 @@ ovsdb_idl_condition_equals(const struct ovsdb_idl_condition *a, + } + + static void +-ovsdb_idl_condition_clone(struct ovsdb_idl_condition *dst, ++ovsdb_idl_condition_clone(struct ovsdb_idl_condition **dst, + const struct ovsdb_idl_condition *src) + { +- ovsdb_idl_condition_init(dst); ++ if (*dst) { ++ ovsdb_idl_condition_destroy(*dst); ++ } else { ++ *dst = xmalloc(sizeof **dst); ++ } ++ ovsdb_idl_condition_init(*dst); + +- dst->is_true = src->is_true; ++ (*dst)->is_true = src->is_true; + + const struct ovsdb_idl_clause *clause; + HMAP_FOR_EACH (clause, hmap_node, &src->clauses) { +- ovsdb_idl_condition_add_clause__(dst, clause, clause->hmap_node.hash); ++ ovsdb_idl_condition_add_clause__(*dst, clause, clause->hmap_node.hash); + } + } + ++static void ++ovsdb_idl_condition_move(struct ovsdb_idl_condition **dst, ++ struct ovsdb_idl_condition **src) ++{ ++ if (*dst) { ++ ovsdb_idl_condition_destroy(*dst); ++ free(*dst); ++ } ++ *dst = *src; ++ *src = NULL; ++} ++ + static unsigned int + ovsdb_idl_db_set_condition(struct ovsdb_idl_db *db, + const struct ovsdb_idl_table_class *tc, + const struct ovsdb_idl_condition *condition) + { ++ struct ovsdb_idl_condition *table_cond; + struct ovsdb_idl_table *table = ovsdb_idl_db_table_from_class(db, tc); + unsigned int seqno = db->cond_seqno; +- if (!ovsdb_idl_condition_equals(condition, &table->condition)) { +- ovsdb_idl_condition_destroy(&table->condition); +- ovsdb_idl_condition_clone(&table->condition, condition); +- db->cond_changed = table->cond_changed = true; ++ ++ /* Compare the new condition to the last known condition which can be ++ * either "new" (not sent yet), "requested" or "acked", in this order. ++ */ ++ if (table->new_cond) { ++ table_cond = table->new_cond; ++ } else if (table->req_cond) { ++ table_cond = table->req_cond; ++ } else { ++ table_cond = table->ack_cond; ++ } ++ ovs_assert(table_cond); ++ ++ if (!ovsdb_idl_condition_equals(condition, table_cond)) { ++ ovsdb_idl_condition_clone(&table->new_cond, condition); ++ db->cond_changed = true; + poll_immediate_wake(); + return seqno + 1; + } +@@ -1563,9 +1608,8 @@ ovsdb_idl_condition_to_json(const struct ovsdb_idl_condition *cnd) + } + + static struct json * +-ovsdb_idl_create_cond_change_req(struct ovsdb_idl_table *table) ++ovsdb_idl_create_cond_change_req(const struct ovsdb_idl_condition *cond) + { +- const struct ovsdb_idl_condition *cond = &table->condition; + struct json *monitor_cond_change_request = json_object_create(); + struct json *cond_json = ovsdb_idl_condition_to_json(cond); + +@@ -1585,8 +1629,12 @@ ovsdb_idl_db_compose_cond_change(struct ovsdb_idl_db *db) + for (size_t i = 0; i < db->class_->n_tables; i++) { + struct ovsdb_idl_table *table = &db->tables[i]; + +- if (table->cond_changed) { +- struct json *req = ovsdb_idl_create_cond_change_req(table); ++ /* Always use the most recent conditions set by the IDL client when ++ * requesting monitor_cond_change, i.e., table->new_cond. ++ */ ++ if (table->new_cond) { ++ struct json *req = ++ ovsdb_idl_create_cond_change_req(table->new_cond); + if (req) { + if (!monitor_cond_change_requests) { + monitor_cond_change_requests = json_object_create(); +@@ -1595,7 +1643,11 @@ ovsdb_idl_db_compose_cond_change(struct ovsdb_idl_db *db) + table->class_->name, + json_array_create_1(req)); + } +- table->cond_changed = false; ++ /* Mark the new condition as requested by moving it to req_cond. 
++ * If there's already a requested condition, that's a bug.
++ */
++ ovs_assert(table->req_cond == NULL);
++ ovsdb_idl_condition_move(&table->req_cond, &table->new_cond);
+ }
+ }
+
+@@ -1610,6 +1662,73 @@ ovsdb_idl_db_compose_cond_change(struct ovsdb_idl_db *db)
+ return jsonrpc_create_request("monitor_cond_change", params, NULL);
+ }
+
++/* Marks all requested table conditions in 'db' as acked by the server.
++ * It should be called when the server replies to monitor_cond_change
++ * requests.
++ */
++static void
++ovsdb_idl_db_ack_condition(struct ovsdb_idl_db *db)
++{
++ for (size_t i = 0; i < db->class_->n_tables; i++) {
++ struct ovsdb_idl_table *table = &db->tables[i];
++
++ if (table->req_cond) {
++ ovsdb_idl_condition_move(&table->ack_cond, &table->req_cond);
++ }
++ }
++}
++
++/* Should be called when the IDL fsm is restarted and resyncs table conditions
++ * based on the state the DB is in:
++ * - if a non-zero last_id is available for the DB then upon reconnect
++ * the IDL should first request acked conditions to avoid missing updates
++ * about records that were added before the transaction with
++ * txn-id == last_id. If there were requested condition changes in flight
++ * (i.e., req_cond not NULL) and the IDL client didn't set new conditions
++ * (i.e., new_cond is NULL) then move req_cond to new_cond to trigger a
++ * follow-up monitor_cond_change request.
++ * - if there's no last_id available for the DB then it's safe to use the
++ * latest conditions set by the IDL client even if they weren't acked yet.
++ */
++static void
++ovsdb_idl_db_sync_condition(struct ovsdb_idl_db *db)
++{
++ bool ack_all = uuid_is_zero(&db->last_id);
++
++ db->cond_changed = false;
++ for (size_t i = 0; i < db->class_->n_tables; i++) {
++ struct ovsdb_idl_table *table = &db->tables[i];
++
++ /* When monitor_cond_since requests are issued, the
++ * table->ack_cond condition will be added to the "where" clause.
++ * Follow-up monitor_cond_change requests will use table->new_cond.
++ */
++ if (ack_all) {
++ if (table->new_cond) {
++ ovsdb_idl_condition_move(&table->req_cond, &table->new_cond);
++ }
++
++ if (table->req_cond) {
++ ovsdb_idl_condition_move(&table->ack_cond, &table->req_cond);
++ }
++ } else {
++ /* If there was no "unsent" condition but instead a
++ * monitor_cond_change request was in flight, move table->req_cond
++ * to table->new_cond and set db->cond_changed to trigger a new
++ * monitor_cond_change request.
++ *
++ * However, if a new condition has been set by the IDL client,
++ * monitor_cond_change will be sent anyway and will use the most
++ * recent table->new_cond so there's no need to update it here.
++ */
++ if (table->req_cond && !table->new_cond) {
++ ovsdb_idl_condition_move(&table->new_cond, &table->req_cond);
++ db->cond_changed = true;
++ }
++ }
++ }
++}
++
+ static void
+ ovsdb_idl_send_cond_change(struct ovsdb_idl *idl)
+ {
+@@ -2064,13 +2183,15 @@ ovsdb_idl_send_monitor_request(struct ovsdb_idl *idl, struct ovsdb_idl_db *db,
+ monitor_request = json_object_create();
+ json_object_put(monitor_request, "columns", columns);
+
+- const struct ovsdb_idl_condition *cond = &table->condition;
++ /* Always use acked conditions when requesting
++ * monitor_cond/monitor_cond_since.
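None of this bookkeeping changes the client-facing API: a client still builds a condition, hands it to the IDL, and waits for the returned sequence number. A short usage sketch, assuming the existing ovsdb_idl_set_condition() and ovsdb_idl_get_condition_seqno() entry points ('idl' and 'table_class' stand for IDL-generated objects):

    #include <stdbool.h>
    #include "ovsdb-idl.h"

    /* Requests a new monitoring condition; returns the seqno at which
     * the server will have acked the change. */
    static unsigned int
    request_condition(struct ovsdb_idl *idl,
                      const struct ovsdb_idl_table_class *table_class)
    {
        struct ovsdb_idl_condition cond;
        unsigned int target;

        ovsdb_idl_condition_init(&cond);
        ovsdb_idl_condition_add_clause_true(&cond);   /* "Match all". */
        target = ovsdb_idl_set_condition(idl, table_class, &cond);
        ovsdb_idl_condition_destroy(&cond);
        return target;
    }

    /* In the main loop: once the condition seqno catches up, the local
     * replica reflects the new condition, even if reconnects happened
     * in the meantime. */
    static bool
    condition_acked(const struct ovsdb_idl *idl, unsigned int target)
    {
        return ovsdb_idl_get_condition_seqno(idl) >= target;
    }

The point of the surrounding changes is exactly to keep that seqno contract honest across reconnects.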
++ */ ++ const struct ovsdb_idl_condition *cond = table->ack_cond; + if ((monitor_method == OVSDB_IDL_MM_MONITOR_COND || + monitor_method == OVSDB_IDL_MM_MONITOR_COND_SINCE) && +- !ovsdb_idl_condition_is_true(cond)) { ++ cond && !ovsdb_idl_condition_is_true(cond)) { + json_object_put(monitor_request, "where", + ovsdb_idl_condition_to_json(cond)); +- table->cond_changed = false; + } + json_object_put(monitor_requests, tc->name, + json_array_create_1(monitor_request)); +@@ -2078,8 +2199,6 @@ ovsdb_idl_send_monitor_request(struct ovsdb_idl *idl, struct ovsdb_idl_db *db, + } + free_schema(schema); + +- db->cond_changed = false; +- + struct json *params = json_array_create_3( + json_string_create(db->class_->database), + json_clone(db->monitor_id), +diff --git a/lib/pvector.c b/lib/pvector.c +index aaeee92147..cc527fdc41 100644 +--- a/lib/pvector.c ++++ b/lib/pvector.c +@@ -33,7 +33,7 @@ pvector_impl_alloc(size_t size) + struct pvector_impl *impl; + + impl = xmalloc(sizeof *impl + size * sizeof impl->vector[0]); +- impl->size = 0; ++ atomic_init(&impl->size, 0); + impl->allocated = size; + + return impl; +@@ -117,18 +117,22 @@ pvector_insert(struct pvector *pvec, void *ptr, int priority) + { + struct pvector_impl *temp = pvec->temp; + struct pvector_impl *old = pvector_impl_get(pvec); ++ size_t size; + + ovs_assert(ptr != NULL); + ++ /* There is no possible concurrent writer. Insertions must be protected ++ * by mutex or be always excuted from the same thread. */ ++ atomic_read_relaxed(&old->size, &size); ++ + /* Check if can add to the end without reallocation. */ +- if (!temp && old->allocated > old->size && +- (!old->size || priority <= old->vector[old->size - 1].priority)) { +- old->vector[old->size].ptr = ptr; +- old->vector[old->size].priority = priority; ++ if (!temp && old->allocated > size && ++ (!size || priority <= old->vector[size - 1].priority)) { ++ old->vector[size].ptr = ptr; ++ old->vector[size].priority = priority; + /* Size increment must not be visible to the readers before the new + * entry is stored. */ +- atomic_thread_fence(memory_order_release); +- ++old->size; ++ atomic_store_explicit(&old->size, size + 1, memory_order_release); + } else { + if (!temp) { + temp = pvector_impl_dup(old); +diff --git a/lib/pvector.h b/lib/pvector.h +index b990ed9d59..0d3290dc37 100644 +--- a/lib/pvector.h ++++ b/lib/pvector.h +@@ -69,8 +69,8 @@ struct pvector_entry { + }; + + struct pvector_impl { +- size_t size; /* Number of entries in the vector. */ +- size_t allocated; /* Number of allocated entries. */ ++ atomic_size_t size; /* Number of entries in the vector. */ ++ size_t allocated; /* Number of allocated entries. */ + struct pvector_entry vector[]; + }; + +@@ -181,12 +181,17 @@ pvector_cursor_init(const struct pvector *pvec, + { + const struct pvector_impl *impl; + struct pvector_cursor cursor; ++ size_t size; + + impl = ovsrcu_get(struct pvector_impl *, &pvec->impl); + +- ovs_prefetch_range(impl->vector, impl->size * sizeof impl->vector[0]); ++ /* Use memory_order_acquire to ensure entry access can not be ++ * reordered to happen before size read. 
*/ ++ atomic_read_explicit(&CONST_CAST(struct pvector_impl *, impl)->size, ++ &size, memory_order_acquire); ++ ovs_prefetch_range(impl->vector, size * sizeof impl->vector[0]); + +- cursor.size = impl->size; ++ cursor.size = size; + cursor.vector = impl->vector; + cursor.entry_idx = -1; + +diff --git a/lib/tc.c b/lib/tc.c +index 12af0192b6..cc8c2d849e 100644 +--- a/lib/tc.c ++++ b/lib/tc.c +@@ -1647,8 +1647,10 @@ nl_parse_single_action(struct nlattr *action, struct tc_flower *flower) + } + + bs = nl_attr_get_unspec(stats_attrs[TCA_STATS_BASIC], sizeof *bs); +- put_32aligned_u64(&stats->n_packets, bs->packets); +- put_32aligned_u64(&stats->n_bytes, bs->bytes); ++ if (bs->packets) { ++ put_32aligned_u64(&stats->n_packets, bs->packets); ++ put_32aligned_u64(&stats->n_bytes, bs->bytes); ++ } + + return 0; + } +diff --git a/lib/tc.h b/lib/tc.h +index d31c0953ed..24a4994fd1 100644 +--- a/lib/tc.h ++++ b/lib/tc.h +@@ -235,7 +235,7 @@ struct tc_action { + } ipv6; + }; + +- union { ++ struct { + ovs_be16 min; + ovs_be16 max; + } port; +diff --git a/lib/util.c b/lib/util.c +index 830e14516f..25635b27ff 100644 +--- a/lib/util.c ++++ b/lib/util.c +@@ -1395,6 +1395,19 @@ is_all_ones(const void *p, size_t n) + return is_all_byte(p, n, 0xff); + } + ++/* *dst |= *src for 'n' bytes. */ ++void ++or_bytes(void *dst_, const void *src_, size_t n) ++{ ++ const uint8_t *src = src_; ++ uint8_t *dst = dst_; ++ size_t i; ++ ++ for (i = 0; i < n; i++) { ++ *dst++ |= *src++; ++ } ++} ++ + /* Copies 'n_bits' bits starting from bit 'src_ofs' in 'src' to the 'n_bits' + * starting from bit 'dst_ofs' in 'dst'. 'src' is 'src_len' bytes long and + * 'dst' is 'dst_len' bytes long. +diff --git a/lib/util.h b/lib/util.h +index 7ad8758fe6..067dcad157 100644 +--- a/lib/util.h ++++ b/lib/util.h +@@ -484,6 +484,7 @@ be64_is_superset(ovs_be64 super, ovs_be64 sub) + bool is_all_zeros(const void *, size_t); + bool is_all_ones(const void *, size_t); + bool is_all_byte(const void *, size_t, uint8_t byte); ++void or_bytes(void *dst, const void *src, size_t n); + void bitwise_copy(const void *src, unsigned int src_len, unsigned int src_ofs, + void *dst, unsigned int dst_len, unsigned int dst_ofs, + unsigned int n_bits); +diff --git a/ofproto/connmgr.c b/ofproto/connmgr.c +index 51d656cba9..aee676d93e 100644 +--- a/ofproto/connmgr.c ++++ b/ofproto/connmgr.c +@@ -190,8 +190,8 @@ struct ofservice { + + static void ofservice_run(struct ofservice *); + static void ofservice_wait(struct ofservice *); +-static void ofservice_reconfigure(struct ofservice *, +- const struct ofproto_controller *) ++static int ofservice_reconfigure(struct ofservice *, ++ const struct ofproto_controller *) + OVS_REQUIRES(ofproto_mutex); + static void ofservice_create(struct connmgr *mgr, const char *target, + const struct ofproto_controller *) +@@ -602,7 +602,15 @@ connmgr_set_controllers(struct connmgr *mgr, struct shash *controllers) + target); + ofservice_destroy(ofservice); + } else { +- ofservice_reconfigure(ofservice, c); ++ if (ofservice_reconfigure(ofservice, c)) { ++ char *target_to_restore = xstrdup(target); ++ VLOG_INFO("%s: Changes to controller \"%s\" " ++ "expects re-initialization: Re-initializing now.", ++ mgr->name, target); ++ ofservice_destroy(ofservice); ++ ofservice_create(mgr, target_to_restore, c); ++ free(target_to_restore); ++ } + } + } + +@@ -2011,16 +2019,15 @@ ofservice_wait(struct ofservice *ofservice) + } + } + +-static void ++static int + ofservice_reconfigure(struct ofservice *ofservice, + const struct ofproto_controller *settings) + 
OVS_REQUIRES(ofproto_mutex) + { +- /* If the allowed OpenFlow versions change, close all of the existing +- * connections to allow them to reconnect and possibly negotiate a new +- * version. */ ++ /* If the allowed OpenFlow versions change, a full cleanup is needed ++ * for the ofservice and connections. */ + if (ofservice->s.allowed_versions != settings->allowed_versions) { +- ofservice_close_all(ofservice); ++ return -EINVAL; + } + + ofservice->s = *settings; +@@ -2029,6 +2036,8 @@ ofservice_reconfigure(struct ofservice *ofservice, + LIST_FOR_EACH (ofconn, ofservice_node, &ofservice->conns) { + ofconn_reconfigure(ofconn, settings); + } ++ ++ return 0; + } + + /* Finds and returns the ofservice within 'mgr' that has the given +diff --git a/ofproto/ofproto-dpif-rid.h b/ofproto/ofproto-dpif-rid.h +index 147ef9c333..97699cb905 100644 +--- a/ofproto/ofproto-dpif-rid.h ++++ b/ofproto/ofproto-dpif-rid.h +@@ -22,6 +22,7 @@ + + #include "cmap.h" + #include "ofproto-dpif-mirror.h" ++#include "ofproto/ofproto-provider.h" + #include "openvswitch/list.h" + #include "openvswitch/ofp-actions.h" + #include "ovs-thread.h" +@@ -115,16 +116,25 @@ frozen_metadata_from_flow(struct frozen_metadata *md, + { + memset(md, 0, sizeof *md); + md->tunnel = flow->tunnel; ++ /* It is unsafe for frozen_state to reference tun_table because ++ * tun_table is protected by RCU while the lifecycle of frozen_state ++ * can span several RCU quiesce states. ++ * ++ * The latest valid tun_table can be found by ofproto_get_tun_tab() ++ * efficiently. */ ++ md->tunnel.metadata.tab = NULL; + md->metadata = flow->metadata; + memcpy(md->regs, flow->regs, sizeof md->regs); + md->in_port = flow->in_port.ofp_port; + } + + static inline void +-frozen_metadata_to_flow(const struct frozen_metadata *md, ++frozen_metadata_to_flow(struct ofproto *ofproto, ++ const struct frozen_metadata *md, + struct flow *flow) + { + flow->tunnel = md->tunnel; ++ flow->tunnel.metadata.tab = ofproto_get_tun_tab(ofproto); + flow->metadata = md->metadata; + memcpy(flow->regs, md->regs, sizeof flow->regs); + flow->in_port.ofp_port = md->in_port; +diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c +index 409286ab15..3a290e4918 100644 +--- a/ofproto/ofproto-dpif-upcall.c ++++ b/ofproto/ofproto-dpif-upcall.c +@@ -1545,7 +1545,8 @@ process_upcall(struct udpif *udpif, struct upcall *upcall, + flow_clear_conntrack(&frozen_flow); + } + +- frozen_metadata_to_flow(&state->metadata, &frozen_flow); ++ frozen_metadata_to_flow(&upcall->ofproto->up, &state->metadata, ++ &frozen_flow); + flow_get_metadata(&frozen_flow, &am->pin.up.base.flow_metadata); + + ofproto_dpif_send_async_msg(upcall->ofproto, am); +diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c +index 4407f9c97a..dd89cb47c0 100644 +--- a/ofproto/ofproto-dpif-xlate.c ++++ b/ofproto/ofproto-dpif-xlate.c +@@ -1516,15 +1516,32 @@ xlate_lookup_ofproto_(const struct dpif_backer *backer, + return NULL; + } + +- /* If recirculation was initiated due to bond (in_port = OFPP_NONE) +- * then frozen state is static and xport_uuid is not defined, so xport +- * cannot be restored from frozen state. 
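The frozen-metadata handling above follows the usual RCU rule: a pointer obtained under RCU protection must not be stashed in state that outlives the read-side quiescent period, so the freeze path stores NULL and the thaw path re-resolves the current table via ofproto_get_tun_tab(). A minimal sketch of the pattern (hypothetical names; get_current_tab() stands in for ofproto_get_tun_tab()):

    struct tun_tab { int placeholder; };

    static struct tun_tab current_tab;

    /* Stand-in for ofproto_get_tun_tab(): always the latest table. */
    static struct tun_tab *
    get_current_tab(void)
    {
        return &current_tab;
    }

    struct frozen_md {
        struct tun_tab *tab;
    };

    static void
    freeze(struct frozen_md *md, struct tun_tab *rcu_protected)
    {
        (void) rcu_protected;   /* May be freed once we quiesce... */
        md->tab = NULL;         /* ...so never store it in frozen state. */
    }

    static void
    thaw(struct frozen_md *md)
    {
        md->tab = get_current_tab();   /* Re-resolve at use time. */
    }

This also explains the xlate_actions() change further down: with the table always re-resolved on thaw, the special frozen-state workaround for stale TLV maps can be dropped.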
*/ +- if (recirc_id_node->state.metadata.in_port != OFPP_NONE) { ++ ofp_port_t in_port = recirc_id_node->state.metadata.in_port; ++ if (in_port != OFPP_NONE && in_port != OFPP_CONTROLLER) { + struct uuid xport_uuid = recirc_id_node->state.xport_uuid; + xport = xport_lookup_by_uuid(xcfg, &xport_uuid); + if (xport && xport->xbridge && xport->xbridge->ofproto) { + goto out; + } ++ } else { ++ /* OFPP_NONE and OFPP_CONTROLLER are not real ports. They indicate ++ * that the packet originated from the controller via an OpenFlow ++ * "packet-out". The right thing to do is to find just the ++ * ofproto. There is no xport, which is OK. ++ * ++ * OFPP_NONE can also indicate that a bond caused recirculation. */ ++ struct uuid uuid = recirc_id_node->state.ofproto_uuid; ++ const struct xbridge *bridge = xbridge_lookup_by_uuid(xcfg, &uuid); ++ if (bridge && bridge->ofproto) { ++ if (errorp) { ++ *errorp = NULL; ++ } ++ *xportp = NULL; ++ if (ofp_in_port) { ++ *ofp_in_port = in_port; ++ } ++ return bridge->ofproto; ++ } + } + } + +@@ -7519,7 +7536,8 @@ xlate_actions(struct xlate_in *xin, struct xlate_out *xout) + + /* Restore pipeline metadata. May change flow's in_port and other + * metadata to the values that existed when freezing was triggered. */ +- frozen_metadata_to_flow(&state->metadata, flow); ++ frozen_metadata_to_flow(&ctx.xbridge->ofproto->up, ++ &state->metadata, flow); + + /* Restore stack, if any. */ + if (state->stack) { +@@ -7571,14 +7589,10 @@ xlate_actions(struct xlate_in *xin, struct xlate_out *xout) + ctx.error = XLATE_INVALID_TUNNEL_METADATA; + goto exit; + } +- } else if (!flow->tunnel.metadata.tab || xin->frozen_state) { ++ } else if (!flow->tunnel.metadata.tab) { + /* If the original flow did not come in on a tunnel, then it won't have + * FLOW_TNL_F_UDPIF set. However, we still need to have a metadata + * table in case we generate tunnel actions. */ +- /* If the translation is from a frozen state, we use the latest +- * TLV map to avoid segmentation fault in case the old TLV map is +- * replaced by a new one. +- * XXX: It is better to abort translation if the table is changed. 
*/ + flow->tunnel.metadata.tab = ofproto_get_tun_tab( + &ctx.xbridge->ofproto->up); + } +diff --git a/ofproto/ofproto.c b/ofproto/ofproto.c +index 08830d8371..8594afad4a 100644 +--- a/ofproto/ofproto.c ++++ b/ofproto/ofproto.c +@@ -6077,8 +6077,8 @@ ofproto_rule_send_removed(struct rule *rule) + fr.hard_timeout = rule->hard_timeout; + ovs_mutex_unlock(&rule->mutex); + rule->ofproto->ofproto_class->rule_get_stats(rule, &stats, &used); +- fr.packet_count += stats.n_packets; +- fr.byte_count += stats.n_bytes; ++ fr.packet_count = stats.n_packets; ++ fr.byte_count = stats.n_bytes; + connmgr_send_flow_removed(connmgr, &fr); + ovs_mutex_unlock(&ofproto_mutex); + } +diff --git a/ovsdb/execution.c b/ovsdb/execution.c +index e45f3d6796..3a0dad5d0a 100644 +--- a/ovsdb/execution.c ++++ b/ovsdb/execution.c +@@ -712,7 +712,7 @@ ovsdb_execute_wait(struct ovsdb_execution *x, struct ovsdb_parser *parser, + long long int timeout_msec = 0; + size_t i; + +- timeout = ovsdb_parser_member(parser, "timeout", OP_NUMBER | OP_OPTIONAL); ++ timeout = ovsdb_parser_member(parser, "timeout", OP_INTEGER | OP_OPTIONAL); + where = ovsdb_parser_member(parser, "where", OP_ARRAY); + columns_json = ovsdb_parser_member(parser, "columns", + OP_ARRAY | OP_OPTIONAL); +@@ -730,7 +730,7 @@ ovsdb_execute_wait(struct ovsdb_execution *x, struct ovsdb_parser *parser, + } + if (!error) { + if (timeout) { +- timeout_msec = MIN(LLONG_MAX, json_real(timeout)); ++ timeout_msec = json_integer(timeout); + if (timeout_msec < 0) { + error = ovsdb_syntax_error(timeout, NULL, + "timeout must be nonnegative"); +diff --git a/ovsdb/ovsdb-server.c b/ovsdb/ovsdb-server.c +index b6957d7300..fd7891a729 100644 +--- a/ovsdb/ovsdb-server.c ++++ b/ovsdb/ovsdb-server.c +@@ -540,7 +540,7 @@ close_db(struct server_config *config, struct db *db, char *comment) + + static struct ovsdb_error * OVS_WARN_UNUSED_RESULT + parse_txn(struct server_config *config, struct db *db, +- struct ovsdb_schema *schema, const struct json *txn_json, ++ const struct ovsdb_schema *schema, const struct json *txn_json, + const struct uuid *txnid) + { + if (schema) { +@@ -548,7 +548,9 @@ parse_txn(struct server_config *config, struct db *db, + * (first grabbing its storage), then replace it with the new schema. + * The transaction must also include the replacement data. + * +- * Only clustered database schema changes go through this path. */ ++ * Only clustered database schema changes and snapshot installs ++ * go through this path. ++ */ + ovs_assert(txn_json); + ovs_assert(ovsdb_storage_is_clustered(db->db->storage)); + +@@ -558,13 +560,17 @@ parse_txn(struct server_config *config, struct db *db, + return error; + } + +- ovsdb_jsonrpc_server_reconnect( +- config->jsonrpc, false, +- (db->db->schema +- ? xasprintf("database %s schema changed", db->db->name) +- : xasprintf("database %s connected to storage", db->db->name))); ++ if (!db->db->schema || ++ strcmp(schema->version, db->db->schema->version)) { ++ ovsdb_jsonrpc_server_reconnect( ++ config->jsonrpc, false, ++ (db->db->schema ++ ? xasprintf("database %s schema changed", db->db->name) ++ : xasprintf("database %s connected to storage", ++ db->db->name))); ++ } + +- ovsdb_replace(db->db, ovsdb_create(schema, NULL)); ++ ovsdb_replace(db->db, ovsdb_create(ovsdb_schema_clone(schema), NULL)); + + /* Force update to schema in _Server database. 
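The parse_txn() change does two separable things: it stops taking ownership of 'schema' (note the const qualifier and the ovsdb_schema_clone() call; the caller now frees the schema, per the read_db() hunk below), and it only forces JSON-RPC clients to reconnect when the schema version actually differs, so a Raft snapshot install that carries an unchanged schema no longer disconnects every client. The reconnect decision boils down to this sketch (hypothetical helper):

    #include <stdbool.h>
    #include <string.h>

    /* True if clients must be kicked so that they reload the schema. */
    static bool
    schema_requires_reconnect(const char *cur_version, const char *new_version)
    {
        /* No current schema: the database just connected to storage. */
        return !cur_version || strcmp(cur_version, new_version) != 0;
    }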
*/ + db->row_uuid = UUID_ZERO; +@@ -613,6 +619,7 @@ read_db(struct server_config *config, struct db *db) + } else { + error = parse_txn(config, db, schema, txn_json, &txnid); + json_destroy(txn_json); ++ ovsdb_schema_destroy(schema); + if (error) { + break; + } +diff --git a/ovsdb/ovsdb.c b/ovsdb/ovsdb.c +index cfc96b32f8..2da117cb36 100644 +--- a/ovsdb/ovsdb.c ++++ b/ovsdb/ovsdb.c +@@ -414,7 +414,7 @@ ovsdb_create(struct ovsdb_schema *schema, struct ovsdb_storage *storage) + db->storage = storage; + ovs_list_init(&db->monitors); + ovs_list_init(&db->triggers); +- db->run_triggers = false; ++ db->run_triggers_now = db->run_triggers = false; + + shash_init(&db->tables); + if (schema) { +@@ -502,6 +502,10 @@ ovsdb_get_memory_usage(const struct ovsdb *db, struct simap *usage) + } + + simap_increase(usage, "cells", cells); ++ ++ if (db->storage) { ++ ovsdb_storage_get_memory_usage(db->storage, usage); ++ } + } + + struct ovsdb_table * +diff --git a/ovsdb/ovsdb.h b/ovsdb/ovsdb.h +index 32e5333163..5c30a83d92 100644 +--- a/ovsdb/ovsdb.h ++++ b/ovsdb/ovsdb.h +@@ -83,6 +83,7 @@ struct ovsdb { + /* Triggers. */ + struct ovs_list triggers; /* Contains "struct ovsdb_trigger"s. */ + bool run_triggers; ++ bool run_triggers_now; + + struct ovsdb_table *rbac_role; + +diff --git a/ovsdb/raft-private.c b/ovsdb/raft-private.c +index 26d39a087f..9468fdaf4a 100644 +--- a/ovsdb/raft-private.c ++++ b/ovsdb/raft-private.c +@@ -137,6 +137,7 @@ raft_server_destroy(struct raft_server *s) + if (s) { + free(s->address); + free(s->nickname); ++ free(s->last_install_snapshot_request); + free(s); + } + } +diff --git a/ovsdb/raft-private.h b/ovsdb/raft-private.h +index ac8656d42f..1f366b4ab3 100644 +--- a/ovsdb/raft-private.h ++++ b/ovsdb/raft-private.h +@@ -27,6 +27,7 @@ + + struct ds; + struct ovsdb_parser; ++struct raft_install_snapshot_request; + + /* Formatting server IDs and cluster IDs for use in human-readable logs. Do + * not use these in cases where the whole server or cluster ID is needed; use +@@ -83,6 +84,9 @@ struct raft_server { + bool replied; /* Reply to append_request was received from this + node during current election_timeout interval. + */ ++ /* Copy of the last install_snapshot_request sent to this server. */ ++ struct raft_install_snapshot_request *last_install_snapshot_request; ++ + /* For use in adding and removing servers: */ + struct uuid requester_sid; /* Nonzero if requested via RPC. */ + struct unixctl_conn *requester_conn; /* Only if requested via unixctl. 
*/
+diff --git a/ovsdb/raft-rpc.c b/ovsdb/raft-rpc.c
+index 18c83fe9c2..dd14d81091 100644
+--- a/ovsdb/raft-rpc.c
++++ b/ovsdb/raft-rpc.c
+@@ -544,8 +544,8 @@ raft_format_install_snapshot_request(
+ ds_put_format(s, " last_index=%"PRIu64, rq->last_index);
+ ds_put_format(s, " last_term=%"PRIu64, rq->last_term);
+ ds_put_format(s, " last_eid="UUID_FMT, UUID_ARGS(&rq->last_eid));
+- ds_put_cstr(s, " last_servers=");
+ ds_put_format(s, " election_timer=%"PRIu64, rq->election_timer);
++ ds_put_cstr(s, " last_servers=");
+
+ struct hmap servers;
+ struct ovsdb_error *error =
+diff --git a/ovsdb/raft.c b/ovsdb/raft.c
+index 4789bc4f22..8df386fa19 100644
+--- a/ovsdb/raft.c
++++ b/ovsdb/raft.c
+@@ -36,6 +36,7 @@
+ #include "ovsdb/log.h"
+ #include "raft-rpc.h"
+ #include "random.h"
++#include "simap.h"
+ #include "socket-util.h"
+ #include "stream.h"
+ #include "timeval.h"
+@@ -73,7 +74,8 @@ enum raft_failure_test {
+ FT_CRASH_BEFORE_SEND_EXEC_REQ,
+ FT_CRASH_AFTER_SEND_EXEC_REQ,
+ FT_CRASH_AFTER_RECV_APPEND_REQ_UPDATE,
+- FT_DELAY_ELECTION
++ FT_DELAY_ELECTION,
++ FT_DONT_SEND_VOTE_REQUEST
+ };
+ static enum raft_failure_test failure_test;
+
+@@ -298,6 +300,11 @@ struct raft {
+ bool had_leader; /* There has been leader elected since last
+ election initiated. This is to help setting
+ candidate_retrying. */
++
++ /* For all. */
++ bool ever_had_leader; /* A leader has been elected since the raft
++ was initialized, meaning it has been
++ connected at some point. */
+ };
+
+ /* All Raft structures. */
+@@ -932,6 +939,7 @@ raft_add_conn(struct raft *raft, struct jsonrpc_session *js,
+ &conn->sid);
+ conn->incoming = incoming;
+ conn->js_seqno = jsonrpc_session_get_seqno(conn->js);
++ jsonrpc_session_set_probe_interval(js, 0);
+ }
+
+ /* Starts the local server in an existing Raft cluster, using the local copy of
+@@ -1007,6 +1015,21 @@ raft_get_sid(const struct raft *raft)
+ return &raft->sid;
+ }
+
++/* Adds memory consumption info to 'usage' for later use by memory_report(). */
++void
++raft_get_memory_usage(const struct raft *raft, struct simap *usage)
++{
++ struct raft_conn *conn;
++ int cnt = 0;
++
++ LIST_FOR_EACH (conn, list_node, &raft->conns) {
++ simap_increase(usage, "raft-backlog",
++ jsonrpc_session_get_backlog(conn->js));
++ cnt++;
++ }
++ simap_increase(usage, "raft-connections", cnt);
++}
++
+ /* Returns true if 'raft' has completed joining its cluster, has not left or
+ * initiated leaving the cluster, does not have failed disk storage, and is
+ * apparently connected to the leader in a healthy way (or is itself the
+@@ -1024,7 +1047,8 @@ raft_is_connected(const struct raft *raft)
+ && !raft->joining
+ && !raft->leaving
+ && !raft->left
+- && !raft->failed);
++ && !raft->failed
++ && raft->ever_had_leader);
+ VLOG_DBG("raft_is_connected: %s\n", ret? "true": "false");
+ return ret;
+ }
+@@ -1397,8 +1421,20 @@ raft_conn_run(struct raft *raft, struct raft_conn *conn)
+ jsonrpc_session_run(conn->js);
+
+ unsigned int new_seqno = jsonrpc_session_get_seqno(conn->js);
+- bool just_connected = (new_seqno != conn->js_seqno
++ bool reconnected = new_seqno != conn->js_seqno;
++ bool just_connected = (reconnected
+ && jsonrpc_session_is_connected(conn->js));
++
++ if (reconnected) {
++ /* Clear 'last_install_snapshot_request' since it might not reach the
++ * destination or the server was restarted.
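Together with the duplicate check added in raft_send_install_snapshot_request() further down, this forms a small memoization protocol: remember the last request sent to each peer, skip byte-identical resends, and forget the memo whenever the connection bounces (the peer may never have received the request, or may have restarted and lost it). A standalone sketch with hypothetical names:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>

    struct request {
        uint64_t term;
        uint64_t last_index;    /* The real struct has more fields. */
    };

    struct peer {
        struct request *last_sent;   /* NULL if nothing is memoized. */
    };

    static bool
    should_send(struct peer *p, const struct request *rq)
    {
        if (p->last_sent && !memcmp(p->last_sent, rq, sizeof *rq)) {
            return false;            /* Identical to the last request. */
        }
        free(p->last_sent);
        p->last_sent = malloc(sizeof *rq);
        if (p->last_sent) {          /* On failure, just skip the memo. */
            memcpy(p->last_sent, rq, sizeof *rq);
        }
        return true;
    }

    static void
    on_reconnect(struct peer *p)
    {
        free(p->last_sent);          /* The peer may have lost it. */
        p->last_sent = NULL;
    }

Without the reconnect invalidation, a snapshot lost in transit would be suppressed forever; without the duplicate check, a slow follower could be flooded with identical, potentially large snapshots.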
*/ ++ struct raft_server *server = raft_find_server(raft, &conn->sid); ++ if (server) { ++ free(server->last_install_snapshot_request); ++ server->last_install_snapshot_request = NULL; ++ } ++ } ++ + conn->js_seqno = new_seqno; + if (just_connected) { + if (raft->joining) { +@@ -1641,6 +1677,7 @@ raft_start_election(struct raft *raft, bool leadership_transfer) + } + + ovs_assert(raft->role != RAFT_LEADER); ++ + raft->role = RAFT_CANDIDATE; + /* If there was no leader elected since last election, we know we are + * retrying now. */ +@@ -1684,7 +1721,9 @@ raft_start_election(struct raft *raft, bool leadership_transfer) + .leadership_transfer = leadership_transfer, + }, + }; +- raft_send(raft, &rq); ++ if (failure_test != FT_DONT_SEND_VOTE_REQUEST) { ++ raft_send(raft, &rq); ++ } + } + + /* Vote for ourselves. */ +@@ -2519,7 +2558,7 @@ static void + raft_set_leader(struct raft *raft, const struct uuid *sid) + { + raft->leader_sid = *sid; +- raft->had_leader = true; ++ raft->ever_had_leader = raft->had_leader = true; + raft->candidate_retrying = false; + } + +@@ -2960,6 +2999,15 @@ raft_update_leader(struct raft *raft, const struct uuid *sid) + }; + ignore(ovsdb_log_write_and_free(raft->log, raft_record_to_json(&r))); + } ++ if (raft->role == RAFT_CANDIDATE) { ++ /* Section 3.4: While waiting for votes, a candidate may ++ * receive an AppendEntries RPC from another server claiming to ++ * be leader. If the leader’s term (included in its RPC) is at ++ * least as large as the candidate’s current term, then the ++ * candidate recognizes the leader as legitimate and returns to ++ * follower state. */ ++ raft->role = RAFT_FOLLOWER; ++ } + return true; + } + +@@ -3260,6 +3308,31 @@ raft_send_install_snapshot_request(struct raft *raft, + .election_timer = raft->election_timer, /* use latest value */ + } + }; ++ ++ if (s->last_install_snapshot_request) { ++ struct raft_install_snapshot_request *old, *new; ++ ++ old = s->last_install_snapshot_request; ++ new = &rpc.install_snapshot_request; ++ if ( old->term == new->term ++ && old->last_index == new->last_index ++ && old->last_term == new->last_term ++ && old->last_servers == new->last_servers ++ && old->data == new->data ++ && old->election_timer == new->election_timer ++ && uuid_equals(&old->last_eid, &new->last_eid)) { ++ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5); ++ ++ VLOG_WARN_RL(&rl, "not sending exact same install_snapshot_request" ++ " to server %s again", s->nickname); ++ return; ++ } ++ } ++ free(s->last_install_snapshot_request); ++ CONST_CAST(struct raft_server *, s)->last_install_snapshot_request ++ = xmemdup(&rpc.install_snapshot_request, ++ sizeof rpc.install_snapshot_request); ++ + raft_send(raft, &rpc); + } + +@@ -3992,8 +4065,9 @@ raft_handle_install_snapshot_reply( + VLOG_INFO_RL(&rl, "cluster "CID_FMT": installed snapshot on server %s " + " up to %"PRIu64":%"PRIu64, CID_ARGS(&raft->cid), + s->nickname, rpy->last_term, rpy->last_index); +- s->next_index = raft->log_end; +- raft_send_append_request(raft, s, 0, "snapshot installed"); ++ s->next_index = raft->log_start; ++ raft_send_append_request(raft, s, raft->log_end - s->next_index, ++ "snapshot installed"); + } + + /* Returns true if 'raft' has grown enough since the last snapshot that +@@ -4143,9 +4217,7 @@ raft_handle_execute_command_request__( + cmd->sid = rq->common.sid; + + enum raft_command_status status = cmd->status; +- if (status != RAFT_CMD_INCOMPLETE) { +- raft_command_unref(cmd); +- } ++ raft_command_unref(cmd); + return status; + } + +@@ -4667,6 
+4739,8 @@ raft_unixctl_failure_test(struct unixctl_conn *conn OVS_UNUSED, + raft_reset_election_timer(raft); + } + } ++ } else if (!strcmp(test, "dont-send-vote-request")) { ++ failure_test = FT_DONT_SEND_VOTE_REQUEST; + } else if (!strcmp(test, "clear")) { + failure_test = FT_NO_TEST; + unixctl_command_reply(conn, "test dismissed"); +diff --git a/ovsdb/raft.h b/ovsdb/raft.h +index 3d448995af..99d5307e54 100644 +--- a/ovsdb/raft.h ++++ b/ovsdb/raft.h +@@ -67,6 +67,7 @@ + struct json; + struct ovsdb_log; + struct raft; ++struct simap; + struct sset; + + #define RAFT_MAGIC "CLUSTER" +@@ -113,6 +114,7 @@ const struct uuid *raft_get_cid(const struct raft *); + const struct uuid *raft_get_sid(const struct raft *); + bool raft_is_connected(const struct raft *); + bool raft_is_leader(const struct raft *); ++void raft_get_memory_usage(const struct raft *, struct simap *usage); + + /* Joining a cluster. */ + bool raft_is_joining(const struct raft *); +diff --git a/ovsdb/storage.c b/ovsdb/storage.c +index e26252b066..7b4ad16f60 100644 +--- a/ovsdb/storage.c ++++ b/ovsdb/storage.c +@@ -26,6 +26,7 @@ + #include "ovsdb.h" + #include "raft.h" + #include "random.h" ++#include "simap.h" + #include "timeval.h" + #include "util.h" + +@@ -188,6 +189,15 @@ ovsdb_storage_get_applied_index(const struct ovsdb_storage *storage) + return storage->raft ? raft_get_applied_index(storage->raft) : 0; + } + ++void ++ovsdb_storage_get_memory_usage(const struct ovsdb_storage *storage, ++ struct simap *usage) ++{ ++ if (storage->raft) { ++ raft_get_memory_usage(storage->raft, usage); ++ } ++} ++ + void + ovsdb_storage_run(struct ovsdb_storage *storage) + { +diff --git a/ovsdb/storage.h b/ovsdb/storage.h +index 8a9bbab709..a223968912 100644 +--- a/ovsdb/storage.h ++++ b/ovsdb/storage.h +@@ -23,6 +23,7 @@ + struct json; + struct ovsdb_schema; + struct ovsdb_storage; ++struct simap; + struct uuid; + + struct ovsdb_error *ovsdb_storage_open(const char *filename, bool rw, +@@ -39,6 +40,8 @@ bool ovsdb_storage_is_leader(const struct ovsdb_storage *); + const struct uuid *ovsdb_storage_get_cid(const struct ovsdb_storage *); + const struct uuid *ovsdb_storage_get_sid(const struct ovsdb_storage *); + uint64_t ovsdb_storage_get_applied_index(const struct ovsdb_storage *); ++void ovsdb_storage_get_memory_usage(const struct ovsdb_storage *, ++ struct simap *usage); + + void ovsdb_storage_run(struct ovsdb_storage *); + void ovsdb_storage_wait(struct ovsdb_storage *); +diff --git a/ovsdb/transaction.c b/ovsdb/transaction.c +index 369436bffb..8ffefcf7c9 100644 +--- a/ovsdb/transaction.c ++++ b/ovsdb/transaction.c +@@ -967,7 +967,7 @@ ovsdb_txn_complete(struct ovsdb_txn *txn) + { + if (!ovsdb_txn_is_empty(txn)) { + +- txn->db->run_triggers = true; ++ txn->db->run_triggers_now = txn->db->run_triggers = true; + ovsdb_monitors_commit(txn->db, txn); + ovsdb_error_assert(for_each_txn_row(txn, ovsdb_txn_update_weak_refs)); + ovsdb_error_assert(for_each_txn_row(txn, ovsdb_txn_row_commit)); +diff --git a/ovsdb/trigger.c b/ovsdb/trigger.c +index 7e62e90ae3..0372302af4 100644 +--- a/ovsdb/trigger.c ++++ b/ovsdb/trigger.c +@@ -141,7 +141,7 @@ ovsdb_trigger_run(struct ovsdb *db, long long int now) + struct ovsdb_trigger *t, *next; + + bool run_triggers = db->run_triggers; +- db->run_triggers = false; ++ db->run_triggers_now = db->run_triggers = false; + + bool disconnect_all = false; + +@@ -160,7 +160,7 @@ ovsdb_trigger_run(struct ovsdb *db, long long int now) + void + ovsdb_trigger_wait(struct ovsdb *db, long long int now) + { +- if 
(db->run_triggers) { ++ if (db->run_triggers_now) { + poll_immediate_wake(); + } else { + long long int deadline = LLONG_MAX; +@@ -319,9 +319,16 @@ ovsdb_trigger_try(struct ovsdb_trigger *t, long long int now) + if (!strcmp(ovsdb_error_get_tag(error), "cluster error")) { + /* Temporary error. Transition back to "initialized" state to + * try again. */ ++ char *err_s = ovsdb_error_to_string(error); ++ VLOG_DBG("cluster error %s", err_s); ++ + jsonrpc_msg_destroy(t->reply); + t->reply = NULL; + t->db->run_triggers = true; ++ if (!strstr(err_s, "not leader")) { ++ t->db->run_triggers_now = true; ++ } ++ free(err_s); + ovsdb_error_destroy(error); + } else { + /* Permanent error. Transition to "completed" state to report +diff --git a/rhel/openvswitch-kmod-fedora.spec.in b/rhel/openvswitch-kmod-fedora.spec.in +index c94f2f5358..15eec6d4c0 100644 +--- a/rhel/openvswitch-kmod-fedora.spec.in ++++ b/rhel/openvswitch-kmod-fedora.spec.in +@@ -17,7 +17,8 @@ + # - 3.10.0 major revision 693 (RHEL 7.4) + # - 3.10.0 major revision 957 (RHEL 7.6) + # - 3.10.0 major revision 1062 (RHEL 7.7) +-# - 3.10.0 major revision 1101 (RHEL 7.8) ++# - 3.10.0 major revision 1101 (RHEL 7.8 Beta) ++# - 3.10.0 major revision 1127 (RHEL 7.8 GA) + # By default, build against the current running kernel version + #%define kernel 3.1.5-1.fc16.x86_64 + #define kernel %{kernel_source} +@@ -97,7 +98,7 @@ if grep -qs "suse" /etc/os-release; then + elif [ "$mainline_major" = "3" ] && [ "$mainline_minor" = "10" ] && + { [ "$major_rev" = "327" ] || [ "$major_rev" = "693" ] || \ + [ "$major_rev" = "957" ] || [ "$major_rev" == "1062" ] || \ +- [ "$major_rev" = "1101" ]; }; then ++ [ "$major_rev" = "1101" ] || [ "$major_rev" = "1127" ] ; }; then + # For RHEL 7.2, 7.4, 7.6, 7.7, and 7.8 + if [ -x "%{_datadir}/openvswitch/scripts/ovs-kmod-manage.sh" ]; then + %{_datadir}/openvswitch/scripts/ovs-kmod-manage.sh +diff --git a/rhel/usr_share_openvswitch_scripts_ovs-kmod-manage.sh b/rhel/usr_share_openvswitch_scripts_ovs-kmod-manage.sh +index a9b5cdd817..c70e135cd5 100644 +--- a/rhel/usr_share_openvswitch_scripts_ovs-kmod-manage.sh ++++ b/rhel/usr_share_openvswitch_scripts_ovs-kmod-manage.sh +@@ -19,7 +19,8 @@ + # - 3.10.0 major revision 693 (RHEL 7.4) + # - 3.10.0 major revision 957 (RHEL 7.6) + # - 3.10.0 major revision 1062 (RHEL 7.7) +-# - 3.10.0 major revision 1101 (RHEL 7.8) ++# - 3.10.0 major revision 1101 (RHEL 7.8 Beta) ++# - 3.10.0 major revision 1127 (RHEL 7.8 GA) + # - 4.4.x, x >= 73 (SLES 12 SP3) + # - 4.12.x, x >= 14 (SLES 12 SP4). 
+ # It is packaged in the openvswitch kmod RPM and run in the post-install +@@ -108,6 +109,11 @@ if [ "$mainline_major" = "3" ] && [ "$mainline_minor" = "10" ]; then + ver_offset=4 + installed_ver="$minor_rev" + elif [ "$major_rev" = "1101" ]; then ++# echo "rhel78" ++ comp_ver=10 ++ ver_offset=4 ++ installed_ver="$minor_rev" ++ elif [ "$major_rev" = "1127" ]; then + # echo "rhel78" + comp_ver=10 + ver_offset=4 +diff --git a/tests/automake.mk b/tests/automake.mk +index 9c7ebdce9b..3d90f97687 100644 +--- a/tests/automake.mk ++++ b/tests/automake.mk +@@ -152,7 +152,8 @@ SYSTEM_KMOD_TESTSUITE_AT = \ + SYSTEM_USERSPACE_TESTSUITE_AT = \ + tests/system-userspace-testsuite.at \ + tests/system-userspace-macros.at \ +- tests/system-userspace-packet-type-aware.at ++ tests/system-userspace-packet-type-aware.at \ ++ tests/system-route.at + + SYSTEM_AFXDP_TESTSUITE_AT = \ + tests/system-userspace-macros.at \ +diff --git a/tests/bridge.at b/tests/bridge.at +index d48463e263..904f1381c7 100644 +--- a/tests/bridge.at ++++ b/tests/bridge.at +@@ -103,3 +103,20 @@ AT_CHECK([ovs-appctl -t ovs-vswitchd version], [0], [ignore]) + OVS_APP_EXIT_AND_WAIT([ovs-vswitchd]) + OVS_APP_EXIT_AND_WAIT([ovsdb-server]) + AT_CLEANUP ++ ++AT_SETUP([bridge - change ofproto versions]) ++dnl Start vswitch and add a version test bridge ++OVS_VSWITCHD_START( ++ [add-br vr_test0 -- \ ++ set bridge vr_test0 datapath-type=dummy \ ++ protocols=OpenFlow10]) ++ ++dnl set the version to include, say, OpenFlow14 ++AT_CHECK([ovs-vsctl set bridge vr_test0 protocols=OpenFlow10,OpenFlow14]) ++ ++dnl now try to use bundle action on a flow ++AT_CHECK([ovs-ofctl add-flow vr_test0 --bundle actions=normal]) ++ ++OVS_APP_EXIT_AND_WAIT([ovs-vswitchd]) ++OVS_APP_EXIT_AND_WAIT([ovsdb-server]) ++AT_CLEANUP +diff --git a/tests/dpif-netdev.at b/tests/dpif-netdev.at +index 0aeb4e788f..1651e02d29 100644 +--- a/tests/dpif-netdev.at ++++ b/tests/dpif-netdev.at +@@ -371,7 +371,7 @@ m4_define([DPIF_NETDEV_FLOW_HW_OFFLOAD], + [AT_SETUP([dpif-netdev - partial hw offload - $1]) + OVS_VSWITCHD_START( + [add-port br0 p1 -- \ +- set interface p1 type=$1 ofport_request=1 options:pstream=punix:$OVS_RUNDIR/p1.sock options:ifindex=1 -- \ ++ set interface p1 type=$1 ofport_request=1 options:pstream=punix:$OVS_RUNDIR/p1.sock options:ifindex=1100 -- \ + set bridge br0 datapath-type=dummy \ + other-config:datapath-id=1234 fail-mode=secure], [], [], + [m4_if([$1], [dummy-pmd], [--dummy-numa="0,0,0,0,1,1,1,1"], [])]) +@@ -393,7 +393,7 @@ skb_priority(0),skb_mark(0),ct_state(0),ct_zone(0),ct_mark(0),ct_label(0),recirc + # Check that flow successfully offloaded. + OVS_WAIT_UNTIL([grep "succeed to add netdev flow" ovs-vswitchd.log]) + AT_CHECK([filter_hw_flow_install < ovs-vswitchd.log | strip_xout], [0], [dnl +-p1: flow put[[create]]: flow match: recirc_id=0,eth,ip,in_port=1,vlan_tci=0x0000,nw_frag=no, mark: 0 ++p1: flow put[[create]]: flow match: recirc_id=0,eth,ip,in_port=1,vlan_tci=0x0000,nw_frag=no, mark: 1 + ]) + # Check that datapath flow installed successfully. + AT_CHECK([filter_flow_install < ovs-vswitchd.log | strip_xout], [0], [dnl +@@ -404,7 +404,7 @@ recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), a + + # Check for succesfull packet matching with installed offloaded flow. 
+ AT_CHECK([filter_hw_packet_netdev_dummy < ovs-vswitchd.log | strip_xout], [0], [dnl +-p1: packet: ip,vlan_tci=0x0000,dl_src=00:06:07:08:09:0a,dl_dst=00:01:02:03:04:05,nw_src=127.0.0.1,nw_dst=127.0.0.1,nw_proto=0,nw_tos=0,nw_ecn=0,nw_ttl=64 matches with flow: recirc_id=0,eth,ip,vlan_tci=0x0000,nw_frag=no with mark: 0 ++p1: packet: ip,vlan_tci=0x0000,dl_src=00:06:07:08:09:0a,dl_dst=00:01:02:03:04:05,nw_src=127.0.0.1,nw_dst=127.0.0.1,nw_proto=0,nw_tos=0,nw_ecn=0,nw_ttl=64 matches with flow: recirc_id=0,eth,ip,vlan_tci=0x0000,nw_frag=no with mark: 1 + ]) + + ovs-appctl revalidator/wait +@@ -421,7 +421,7 @@ recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x0800),ipv4(frag=no), p + # Check that flow successfully deleted from HW. + OVS_WAIT_UNTIL([grep "succeed to delete netdev flow" ovs-vswitchd.log]) + AT_CHECK([filter_hw_flow_del < ovs-vswitchd.log | strip_xout], [0], [dnl +-p1: flow del: mark: 0 ++p1: flow del: mark: 1 + ]) + OVS_VSWITCHD_STOP + AT_CLEANUP]) +@@ -434,7 +434,7 @@ m4_define([DPIF_NETDEV_FLOW_HW_OFFLOAD_OFFSETS], + [AT_SETUP([dpif-netdev - partial hw offload with packet modifications - $1]) + OVS_VSWITCHD_START( + [add-port br0 p1 -- \ +- set interface p1 type=$1 ofport_request=1 options:pcap=p1.pcap options:ifindex=1 -- \ ++ set interface p1 type=$1 ofport_request=1 options:pcap=p1.pcap options:ifindex=1101 -- \ + set bridge br0 datapath-type=dummy \ + other-config:datapath-id=1234 fail-mode=secure], [], [], + [m4_if([$1], [dummy-pmd], [--dummy-numa="0,0,0,0,1,1,1,1"], [])]) +@@ -460,7 +460,7 @@ packet_type(ns=0,id=0),eth(src=00:06:07:08:09:0a,dst=00:01:02:03:04:05),eth_type + # Check that flow successfully offloaded. + OVS_WAIT_UNTIL([grep "succeed to add netdev flow" ovs-vswitchd.log]) + AT_CHECK([filter_hw_flow_install < ovs-vswitchd.log | strip_xout], [0], [dnl +-p1: flow put[[create]]: flow match: recirc_id=0,eth,udp,in_port=1,dl_vlan=99,dl_vlan_pcp=7,nw_src=127.0.0.1,nw_frag=no,tp_dst=82, mark: 0 ++p1: flow put[[create]]: flow match: recirc_id=0,eth,udp,in_port=1,dl_vlan=99,dl_vlan_pcp=7,nw_src=127.0.0.1,nw_frag=no,tp_dst=82, mark: 1 + ]) + # Check that datapath flow installed successfully. + AT_CHECK([filter_flow_install < ovs-vswitchd.log | strip_xout], [0], [dnl +@@ -472,7 +472,7 @@ recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth_type(0x8100),vlan(vid=99,pcp= + # Check for succesfull packet matching with installed offloaded flow. + AT_CHECK([filter_hw_packet_netdev_dummy < ovs-vswitchd.log | strip_xout], [0], [dnl + p1: packet: udp,dl_vlan=99,dl_vlan_pcp=7,vlan_tci1=0x0000,dl_src=00:06:07:08:09:0a,dl_dst=00:01:02:03:04:05,nw_src=127.0.0.1,nw_dst=127.0.0.1,nw_tos=0,nw_ecn=0,nw_ttl=64,tp_src=81,tp_dst=82 dnl +-matches with flow: recirc_id=0,eth,udp,dl_vlan=99,dl_vlan_pcp=7,nw_src=127.0.0.1,nw_frag=no,tp_dst=82 with mark: 0 ++matches with flow: recirc_id=0,eth,udp,dl_vlan=99,dl_vlan_pcp=7,nw_src=127.0.0.1,nw_frag=no,tp_dst=82 with mark: 1 + ]) + + ovs-appctl revalidator/wait +@@ -490,7 +490,7 @@ packets:1, bytes:64, used:0.0s, actions:set(ipv4(src=192.168.0.7)),set(udp(dst=3 + # Check that flow successfully deleted from HW. + OVS_WAIT_UNTIL([grep "succeed to delete netdev flow" ovs-vswitchd.log]) + AT_CHECK([filter_hw_flow_del < ovs-vswitchd.log | strip_xout], [0], [dnl +-p1: flow del: mark: 0 ++p1: flow del: mark: 1 + ]) + + # Check that ip address and udp port were correctly modified in output packets. 
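Note that the expected flow marks in these tests change from 0 to 1 (and the dummy ifindex values move from 1 to 1100/1101). This is consistent with mark 0 no longer being handed out as a valid flow mark: a zero mark is easy to confuse with "no mark" wherever zero serves as a sentinel. A deliberately trivial sketch of an allocator that reserves 0 (hypothetical; the real allocator draws from a pool and recycles freed marks):

    #include <stdint.h>

    /* Hand out flow marks starting at 1 so that 0 can keep meaning
     * "no mark associated". */
    static uint32_t next_mark = 1;

    static uint32_t
    mark_alloc(void)
    {
        return next_mark++;
    }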
+diff --git a/tests/idltest.ovsschema b/tests/idltest.ovsschema +index bee79fc50f..d08f7e7ead 100644 +--- a/tests/idltest.ovsschema ++++ b/tests/idltest.ovsschema +@@ -54,6 +54,15 @@ + }, + "isRoot" : true + }, ++ "indexed": { ++ "columns": { ++ "i": { ++ "type": "integer" ++ } ++ }, ++ "indexes": [["i"]], ++ "isRoot" : true ++ }, + "simple": { + "columns": { + "b": { +diff --git a/tests/ofproto-dpif.at b/tests/ofproto-dpif.at +index ff1cc93707..6415a8a04d 100644 +--- a/tests/ofproto-dpif.at ++++ b/tests/ofproto-dpif.at +@@ -5171,6 +5171,36 @@ AT_CHECK_UNQUOTED([tail -1 stdout], [0], [Datapath actions: 2 + OVS_VSWITCHD_STOP + AT_CLEANUP + ++# Checks for regression against a bug in which OVS dropped packets ++# with in_port=CONTROLLER when they were recirculated (because ++# CONTROLLER isn't a real port and could not be looked up). ++AT_SETUP([ofproto-dpif - packet-out recirculation]) ++OVS_VSWITCHD_START ++add_of_ports br0 1 2 ++ ++AT_DATA([flows.txt], [dnl ++table=0 ip actions=mod_dl_dst:83:83:83:83:83:83,ct(table=1) ++table=1 ip actions=ct(commit),output:2 ++]) ++AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) ++ ++packet=ffffffffffff00102030405008004500001c00000000401100000a000002ffffffff0035111100080000 ++AT_CHECK([ovs-ofctl packet-out br0 "in_port=controller packet=$packet actions=table"]) ++ ++# Dumps out the flow table, extracts the number of packets that have gone ++# through the (single) flow in table 1, and returns success if it's exactly 1. ++# ++# If this remains 0, then the recirculation isn't working properly since the ++# packet never goes through flow in table 1. ++check_flows () { ++ n=$(ovs-ofctl dump-flows br0 table=1 | sed -n 's/.*n_packets=\([[0-9]]\{1,\}\).*/\1/p') ++ echo "n_packets=$n" ++ test "$n" = 1 ++} ++OVS_WAIT_UNTIL([check_flows], [ovs dump-flows br0]) ++ ++OVS_VSWITCHD_STOP ++AT_CLEANUP + + AT_SETUP([ofproto-dpif - debug_slow action]) + OVS_VSWITCHD_START +@@ -8632,6 +8662,29 @@ recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth(dst=50:54:00:00:00:0c),eth_ty + OVS_VSWITCHD_STOP + AT_CLEANUP + ++AT_SETUP([ofproto-dpif megaflow - set dl_dst with match on dl_src]) ++OVS_VSWITCHD_START ++AT_CHECK([ovs-appctl vlog/set dpif:dbg dpif_netdev:dbg]) ++add_of_ports br0 1 2 ++AT_DATA([flows.txt], [dnl ++table=0 in_port=1,dl_src=50:54:00:00:00:09 actions=mod_dl_dst(50:54:00:00:00:0a),output(2) ++]) ++AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) ++AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)']) ++AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0c),eth_type(0x0800),ipv4(src=10.0.0.4,dst=10.0.0.3,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)']) ++AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:0b,dst=50:54:00:00:00:0c),eth_type(0x0800),ipv4(src=10.0.0.6,dst=10.0.0.5,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)']) ++sleep 1 ++dnl The first packet is essentially a no-op, as the new destination MAC is the ++dnl same as the original. The second entry actually updates the destination ++dnl MAC. The last one must be dropped as it doesn't match with dl_src. 
++AT_CHECK([strip_ufid < ovs-vswitchd.log | filter_flow_install | strip_used], [0], [dnl ++recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(frag=no), actions:2 ++recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0c),eth_type(0x0800),ipv4(frag=no), actions:set(eth(dst=50:54:00:00:00:0a)),2 ++recirc_id(0),in_port(1),packet_type(ns=0,id=0),eth(src=50:54:00:00:00:0b),eth_type(0x0800),ipv4(frag=no), actions:drop ++]) ++OVS_VSWITCHD_STOP ++AT_CLEANUP ++ + m4_define([OFPROTO_DPIF_MEGAFLOW_DISABLED], + [AT_SETUP([ofproto-dpif megaflow - disabled$1]) + OVS_VSWITCHD_START([], [], [], [m4_if([$1], [], [], [--dummy-numa="0,0,0,0,1,1,1,1"])]) +@@ -10540,6 +10593,62 @@ udp,vlan_tci=0x0000,dl_src=50:54:00:00:00:0a,dl_dst=50:54:00:00:00:09,nw_src=10. + OVS_VSWITCHD_STOP + AT_CLEANUP + ++AT_SETUP([ofproto-dpif - conntrack - match masked ct fields]) ++OVS_VSWITCHD_START ++ ++add_of_ports br0 1 2 ++ ++AT_CHECK([ovs-appctl vlog/set dpif_netdev:dbg vconn:info ofproto_dpif:info]) ++ ++dnl Allow new connections on p1->p2. Allow only established connections p2->p1 ++AT_DATA([flows.txt], [dnl ++table=0,arp,action=normal ++table=0,ip,in_port=1,udp,nw_src=10.1.2.1/24,action=ct(commit) ++table=0,ip,in_port=1,udp6,ipv6_dst=2001:db8::1/64,action=ct(commit) ++table=0,ip,in_port=1,udp,tp_src=3/0x1,action=ct(commit) ++table=0,ip,in_port=2,actions=ct(table=1) ++table=0,ip6,in_port=2,actions=ct(table=1) ++table=1,priority=10,udp,ct_state=+trk+rpl,ct_nw_src=10.1.2.1/24,actions=controller ++table=1,priority=10,udp6,ct_state=+trk+rpl,ct_ipv6_dst=2001:db8::1/64,actions=controller ++table=1,priority=10,udp,ct_state=+trk+rpl,ct_tp_src=3/0x1,actions=controller ++table=1,priority=1,action=drop ++]) ++ ++AT_CHECK([ovs-ofctl add-flows br0 flows.txt]) ++ ++AT_CAPTURE_FILE([ofctl_monitor.log]) ++AT_CHECK([ovs-ofctl monitor br0 65534 invalid_ttl -P nxt_packet_in --detach --no-chdir --pidfile 2> ofctl_monitor.log]) ++ ++dnl Match ct_nw_src=10.1.2.1/24 ++AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.1.2.100,dst=10.1.2.200,proto=17,tos=0,ttl=64,frag=no),udp(src=6,dst=6)']) ++AT_CHECK([ovs-appctl netdev-dummy/receive p2 'in_port(2),eth(src=50:54:00:00:00:0a,dst=50:54:00:00:00:09),eth_type(0x0800),ipv4(src=10.1.2.200,dst=10.1.2.100,proto=17,tos=0,ttl=64,frag=no),udp(src=6,dst=6)']) ++ ++dnl Match ct_ipv6_dst=2001:db8::1/64 ++AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x86dd),ipv6(src=2001:db8::1,dst=2001:db8::2,label=0,proto=17,tclass=0x70,hlimit=128,frag=no),udp(src=1,dst=2)']) ++AT_CHECK([ovs-appctl netdev-dummy/receive p2 'in_port(2),eth(src=50:54:00:00:00:0a,dst=50:54:00:00:00:09),eth_type(0x86dd),ipv6(src=2001:db8::2,dst=2001:db8::1,label=0,proto=17,tclass=0x70,hlimit=128,frag=no),udp(src=2,dst=1)']) ++ ++dnl Match ct_tp_src=3/0x1 ++AT_CHECK([ovs-appctl netdev-dummy/receive p1 'in_port(1),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.1.1.1,dst=10.1.1.2,proto=17,tos=0,ttl=64,frag=no),udp(src=1,dst=2)']) ++AT_CHECK([ovs-appctl netdev-dummy/receive p2 'in_port(2),eth(src=50:54:00:00:00:0a,dst=50:54:00:00:00:09),eth_type(0x0800),ipv4(src=10.1.1.2,dst=10.1.1.1,proto=17,tos=0,ttl=64,frag=no),udp(src=2,dst=1)']) ++ ++OVS_WAIT_UNTIL([test `wc -l < ofctl_monitor.log` -ge 6]) ++OVS_WAIT_UNTIL([ovs-appctl -t ovs-ofctl exit]) ++ ++dnl Check this 
output. ++AT_CHECK([cat ofctl_monitor.log], [0], [dnl ++NXT_PACKET_IN (xid=0x0): table_id=1 cookie=0x0 total_len=106 ct_state=est|rpl|trk,ct_nw_src=10.1.2.100,ct_nw_dst=10.1.2.200,ct_nw_proto=17,ct_tp_src=6,ct_tp_dst=6,ip,in_port=2 (via action) data_len=106 (unbuffered) ++udp,vlan_tci=0x0000,dl_src=50:54:00:00:00:0a,dl_dst=50:54:00:00:00:09,nw_src=10.1.2.200,nw_dst=10.1.2.100,nw_tos=0,nw_ecn=0,nw_ttl=64,tp_src=6,tp_dst=6 udp_csum:221 ++dnl ++NXT_PACKET_IN (xid=0x0): table_id=1 cookie=0x0 total_len=126 ct_state=est|rpl|trk,ct_ipv6_src=2001:db8::1,ct_ipv6_dst=2001:db8::2,ct_nw_proto=17,ct_tp_src=1,ct_tp_dst=2,ipv6,in_port=2 (via action) data_len=126 (unbuffered) ++udp6,vlan_tci=0x0000,dl_src=50:54:00:00:00:0a,dl_dst=50:54:00:00:00:09,ipv6_src=2001:db8::2,ipv6_dst=2001:db8::1,ipv6_label=0x00000,nw_tos=112,nw_ecn=0,nw_ttl=128,tp_src=2,tp_dst=1 udp_csum:bfe2 ++dnl ++NXT_PACKET_IN (xid=0x0): table_id=1 cookie=0x0 total_len=106 ct_state=est|rpl|trk,ct_nw_src=10.1.1.1,ct_nw_dst=10.1.1.2,ct_nw_proto=17,ct_tp_src=1,ct_tp_dst=2,ip,in_port=2 (via action) data_len=106 (unbuffered) ++udp,vlan_tci=0x0000,dl_src=50:54:00:00:00:0a,dl_dst=50:54:00:00:00:09,nw_src=10.1.1.2,nw_dst=10.1.1.1,nw_tos=0,nw_ecn=0,nw_ttl=64,tp_src=2,tp_dst=1 udp_csum:553 ++]) ++ ++OVS_VSWITCHD_STOP ++AT_CLEANUP ++ + AT_SETUP([ofproto-dpif - conntrack - ofproto/trace]) + OVS_VSWITCHD_START + +diff --git a/tests/ovs-vsctl.at b/tests/ovs-vsctl.at +index 55c7a6e179..c8babe3612 100644 +--- a/tests/ovs-vsctl.at ++++ b/tests/ovs-vsctl.at +@@ -966,6 +966,14 @@ AT_CHECK([RUN_OVS_VSCTL([--if-exists del-zone-tp netdev zone=1])]) + AT_CHECK([RUN_OVS_VSCTL([list-zone-tp netdev])], [0], [Zone:2, Timeout Policies: icmp_first=2 icmp_reply=3 + ]) + ++AT_CHECK( ++ [RUN_OVS_VSCTL_TOGETHER([--id=@n create CT_Zone external_ids:"test"="123"], ++ [--id=@m create Datapath datapath_version=0 ct_zones:"10"=@n], ++ [set Open_vSwitch . datapaths:"netdev"=@m])], ++ [0], [stdout]) ++AT_CHECK([RUN_OVS_VSCTL([list-zone-tp netdev])], [0], [Zone:10, Timeout Policies: system default ++]) ++ + AT_CHECK([RUN_OVS_VSCTL([-- --id=@m create Datapath datapath_version=0 'capabilities={recirc=true}' -- set Open_vSwitch . 
datapaths:"system"=@m])], [0], [stdout])
+ AT_CHECK([RUN_OVS_VSCTL([list-dp-cap system])], [0], [recirc=true
+ ])
+diff --git a/tests/ovsdb-cluster.at b/tests/ovsdb-cluster.at
+index 3a0bd4579e..e0758e954c 100644
+--- a/tests/ovsdb-cluster.at
++++ b/tests/ovsdb-cluster.at
+@@ -179,6 +179,41 @@ AT_KEYWORDS([ovsdb server negative unix cluster disconnect])
+ ovsdb_test_cluster_disconnect 5 leader yes
+ AT_CLEANUP
+
++AT_SETUP([OVSDB cluster - initial status should be disconnected])
++AT_KEYWORDS([ovsdb server negative unix cluster disconnect])
++
++n=3
++schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema`
++ordinal_schema > schema
++AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster s1.db $abs_srcdir/idltest.ovsschema unix:s1.raft], [0], [], [stderr])
++cid=`ovsdb-tool db-cid s1.db`
++schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema`
++for i in `seq 2 $n`; do
++ AT_CHECK([ovsdb-tool join-cluster s$i.db $schema_name unix:s$i.raft unix:s1.raft])
++done
++
++on_exit 'kill `cat *.pid`'
++for i in `seq $n`; do
++ AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off --detach --no-chdir --log-file=s$i.log --pidfile=s$i.pid --unixctl=s$i --remote=punix:s$i.ovsdb s$i.db])
++done
++for i in `seq $n`; do
++ AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected])
++done
++
++# Stop all servers, and start only s1, to test the initial connection status
++# when there is no leader yet.
++for i in `seq 1 $n`; do
++ OVS_APP_EXIT_AND_WAIT_BY_TARGET([`pwd`/s$i], [s$i.pid])
++done
++i=1
++AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off --detach --no-chdir --log-file=s$i.log --pidfile=s$i.pid --unixctl=s$i --remote=punix:s$i.ovsdb s$i.db])
++
++# The initial status should be disconnected, so the wait should fail.
++AT_CHECK([ovsdb_client_wait --timeout=1 unix:s$i.ovsdb $schema_name connected], [142], [ignore], [ignore])
++OVS_APP_EXIT_AND_WAIT_BY_TARGET([`pwd`/s$i], [s$i.pid])
++
++AT_CLEANUP
++
+
+
+ AT_BANNER([OVSDB cluster election timer change])
+@@ -273,6 +308,88 @@ OVS_WAIT_UNTIL([ovs-appctl -t "`pwd`"/s4 cluster/status $schema_name | grep "Ele
+
+ AT_CLEANUP
+
++
++AT_BANNER([OVSDB cluster install snapshot RPC])
++
++AT_SETUP([OVSDB cluster - install snapshot RPC])
++AT_KEYWORDS([ovsdb server positive unix cluster snapshot])
++
++n=3
++schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema`
++ordinal_schema > schema
++AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster s1.db $abs_srcdir/idltest.ovsschema unix:s1.raft], [0], [], [stderr])
++cid=`ovsdb-tool db-cid s1.db`
++schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema`
++for i in `seq 2 $n`; do
++ AT_CHECK([ovsdb-tool join-cluster s$i.db $schema_name unix:s$i.raft unix:s1.raft])
++done
++
++on_exit 'kill `cat *.pid`'
++for i in `seq $n`; do
++ AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off --detach --no-chdir --log-file=s$i.log --pidfile=s$i.pid --unixctl=s$i --remote=punix:s$i.ovsdb s$i.db])
++done
++for i in `seq $n`; do
++ AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected])
++done
++
++AT_CHECK([ovsdb-client transact unix:s1.ovsdb '[["idltest",
++ {"op": "insert",
++ "table": "indexed",
++ "row": {"i": 0}}]]'], [0], [ignore], [ignore])
++
++# Kill one follower (s2) and write some data to the cluster, so that the follower falls behind
++printf "\ns2: stopping\n"
++OVS_APP_EXIT_AND_WAIT_BY_TARGET([`pwd`/s2], [s2.pid])
++
++# Delete "i":0 and re-add it to get a different UUID for it.
++AT_CHECK([ovsdb-client transact unix:s1.ovsdb '[["idltest",
++ {"op": "delete",
++ "table": "indexed",
++ "where": [["i", "==", 0]]}]]'], [0], [ignore], [ignore])
++
++AT_CHECK([ovsdb-client transact unix:s1.ovsdb '[["idltest",
++ {"op": "insert",
++ "table": "indexed",
++ "row": {"i": 0}}]]'], [0], [ignore], [ignore])
++
++AT_CHECK([ovsdb-client transact unix:s1.ovsdb '[["idltest",
++ {"op": "insert",
++ "table": "indexed",
++ "row": {"i": 1}}]]'], [0], [ignore], [ignore])
++
++# Compact the leader online to generate a snapshot.
++AT_CHECK([ovs-appctl -t "`pwd`"/s1 ovsdb-server/compact])
++
++# Start the follower s2 again.
++AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off --detach --no-chdir --log-file=s2.log --pidfile=s2.pid --unixctl=s2 --remote=punix:s2.ovsdb s2.db])
++AT_CHECK([ovsdb_client_wait unix:s2.ovsdb $schema_name connected])
++
++# A client transaction through s2. During this transaction, there will be an
++# install_snapshot RPC because s2 detects it is behind and s1 doesn't have the
++# pre_log_index requested by s2 because it is already compacted.
++# After the install_snapshot RPC process, the transaction through s2 should
++# succeed.
++AT_CHECK([ovsdb-client transact unix:s2.ovsdb '[["idltest",
++ {"op": "insert",
++ "table": "indexed",
++ "row": {"i": 2}}]]'], [0], [ignore], [ignore])
++
++# The snapshot should overwrite the in-memory contents of the DB on s2
++# without generating any constraint violations. All three records (0, 1, 2)
++# should be in the DB at this point.
++AT_CHECK([ovsdb-client --no-headings dump unix:s2.ovsdb idltest indexed | uuidfilt | sort -k 2], [0], [dnl
++<0> 0
++<1> 1
++<2> 2
++indexed table
++])
++
++for i in `seq $n`; do
++ OVS_APP_EXIT_AND_WAIT_BY_TARGET([`pwd`/s$i], [s$i.pid])
++done
++
++AT_CLEANUP
++
+
+
+ OVS_START_SHELL_HELPERS
+@@ -436,6 +553,61 @@ AT_KEYWORDS([ovsdb server negative unix cluster pending-txn])
+ ovsdb_cluster_failure_test 2 2 3 crash-after-receiving-append-request-update
+ AT_CLEANUP
+
++
++AT_SETUP([OVSDB cluster - competing candidates])
++AT_KEYWORDS([ovsdb server negative unix cluster competing-candidates])
++
++n=3
++schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema`
++ordinal_schema > schema
++AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster s1.db $abs_srcdir/idltest.ovsschema unix:s1.raft], [0], [], [stderr])
++cid=`ovsdb-tool db-cid s1.db`
++schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema`
++for i in `seq 2 $n`; do
++ AT_CHECK([ovsdb-tool join-cluster s$i.db $schema_name unix:s$i.raft unix:s1.raft])
++done
++
++on_exit 'kill `cat *.pid`'
++for i in `seq $n`; do
++ AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off --detach --no-chdir --log-file=s$i.log --pidfile=s$i.pid --unixctl=s$i --remote=punix:s$i.ovsdb s$i.db])
++done
++for i in `seq $n`; do
++ AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected])
++done
++
++# We need to simulate the situation when 2 candidates start an election with
++# the same term.
++#
++# Before triggering leader election, tell follower s2 not to send vote requests
++# (simulating a vote request that is lost or not handled in time), and tell
++# follower s3 to delay its election timer to make sure s3 doesn't send a vote
++# request before s2 enters term 2.
++AT_CHECK([ovs-appctl -t "`pwd`"/s2 cluster/failure-test dont-send-vote-request], [0], [ignore])
++AT_CHECK([ovs-appctl -t "`pwd`"/s3 cluster/failure-test delay-election], [0], [ignore])
++
++# Restart the leader, which will become a follower, and both old followers
++# will start an election as candidates. The new follower (the old leader)
++# will vote for one of them, and the other candidate should step back to
++# follower again.
++kill -9 `cat s1.pid`
++AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off --detach --no-chdir --log-file=s1.log --pidfile=s1.pid --unixctl=s1 --remote=punix:s1.ovsdb s1.db])
++
++# Tell s1 to delay its election timer so that it won't start an election
++# before s3 becomes a candidate.
++AT_CHECK([ovs-appctl -t "`pwd`"/s1 cluster/failure-test delay-election], [0], [ignore])
++
++OVS_WAIT_UNTIL([ovs-appctl -t "`pwd`"/s1 cluster/status $schema_name | grep "Term: 2"])
++
++for i in `seq $n`; do
++ OVS_WAIT_WHILE([ovs-appctl -t "`pwd`"/s$i cluster/status $schema_name | grep "candidate"])
++ AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected])
++done
++
++for i in `seq $n`; do
++ OVS_APP_EXIT_AND_WAIT_BY_TARGET([`pwd`/s$i], [s$i.pid])
++done
++
++AT_CLEANUP
++
+
+ AT_BANNER([OVSDB - cluster tests])
+
+diff --git a/tests/ovsdb-idl.at b/tests/ovsdb-idl.at
+index cc38d69c10..cc53da923b 100644
+--- a/tests/ovsdb-idl.at
++++ b/tests/ovsdb-idl.at
+@@ -954,6 +954,7 @@ AT_CHECK([sort stdout | uuidfilt], [0],
+
+ # Check that ovsdb-idl figured out that table link2 and column l2 are missing.
+ AT_CHECK([grep ovsdb_idl stderr | sort], [0], [dnl
++test-ovsdb|ovsdb_idl|idltest database lacks indexed table (database needs upgrade?)
+ test-ovsdb|ovsdb_idl|idltest database lacks link2 table (database needs upgrade?)
+ test-ovsdb|ovsdb_idl|idltest database lacks singleton table (database needs upgrade?)
+ test-ovsdb|ovsdb_idl|link1 table in idltest database lacks l2 column (database needs upgrade?)
+@@ -1814,3 +1815,59 @@ m4_define([OVSDB_CHECK_IDL_LEADER_ONLY_PY],
+
+ OVSDB_CHECK_IDL_LEADER_ONLY_PY([Check Python IDL connects to leader], 3, ['remote'])
+ OVSDB_CHECK_IDL_LEADER_ONLY_PY([Check Python IDL reconnects to leader], 3, ['remote' '+remotestop' 'remote'])
++
++# Same as OVSDB_CHECK_IDL, but uses the C IDL implementation over TCP
++# with multiple remotes.
++m4_define([OVSDB_CHECK_CLUSTER_IDL_C],
++ [AT_SETUP([$1 - C - tcp])
++ AT_KEYWORDS([ovsdb server idl positive tcp socket $5])
++ m4_define([LPBK],[127.0.0.1])
++ AT_CHECK([ovsdb_cluster_start_idltest $2 "ptcp:0:"LPBK])
++ PARSE_LISTENING_PORT([s1.log], [TCP_PORT_1])
++ PARSE_LISTENING_PORT([s2.log], [TCP_PORT_2])
++ PARSE_LISTENING_PORT([s3.log], [TCP_PORT_3])
++ remotes=tcp:LPBK:$TCP_PORT_1,tcp:LPBK:$TCP_PORT_2,tcp:LPBK:$TCP_PORT_3
++
++ m4_if([$3], [], [],
++ [AT_CHECK([ovsdb-client transact $remotes $3], [0], [ignore], [ignore])])
++ AT_CHECK([test-ovsdb '-vPATTERN:console:test-ovsdb|%c|%m' -vjsonrpc -t10 idl tcp:LPBK:$TCP_PORT_1 $4],
++ [0], [stdout], [ignore])
++ AT_CHECK([sort stdout | uuidfilt]m4_if([$7],,, [[| $7]]),
++ [0], [$5])
++ AT_CLEANUP])
++
++# Checks that monitor_cond_since works correctly when disconnects happen
++# with cond_change requests in flight (i.e., the IDL is properly updated).
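For context on the invocation below: monitor_cond_since lets a reconnecting client resume from the last transaction id it has seen, evaluated under its current conditions, instead of re-downloading the whole table. A rough Python sketch of the idea (function and variable names are invented for this illustration and do not match the OVSDB IDL):

    # Rough sketch of the monitor_cond_since idea, for illustration only.
    def changes_since(history, last_txn, condition):
        """Replay only rows newer than last_txn that match the condition."""
        return [row for txn, row in history if txn > last_txn and condition(row)]

    history = [(1, {"i": 1, "r": 1.0}),
               (2, {"i": 2, "r": 1.0}),
               (3, {"i": 1, "r": 2.0})]
    # The client saw up to txn 2; its in-flight cond_change to i == 1 is
    # applied before it resumes, so only the update to the i=1 row is replayed:
    print(changes_since(history, 2, lambda row: row["i"] == 1))
    # -> [{'i': 1, 'r': 2.0}]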
++OVSDB_CHECK_CLUSTER_IDL_C([simple idl, monitor_cond_since, cluster disconnect],
++ 3,
++ [['["idltest",
++ {"op": "insert",
++ "table": "simple",
++ "row": {"i": 1,
++ "r": 1.0,
++ "b": true}},
++ {"op": "insert",
++ "table": "simple",
++ "row": {"i": 2,
++ "r": 1.0,
++ "b": true}}]']],
++ [['condition simple []' \
++ 'condition simple [["i","==",2]]' \
++ 'condition simple [["i","==",1]]' \
++ '+reconnect' \
++ '["idltest",
++ {"op": "update",
++ "table": "simple",
++ "where": [["i", "==", 1]],
++ "row": {"r": 2.0 }}]']],
++ [[000: change conditions
++001: empty
++002: change conditions
++003: i=2 r=1 b=true s= u=<0> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<1>
++004: change conditions
++005: reconnect
++006: i=2 r=1 b=true s= u=<0> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<1>
++007: {"error":null,"result":[{"count":1}]}
++008: i=1 r=2 b=true s= u=<0> ia=[] ra=[] ba=[] sa=[] ua=[] uuid=<2>
++009: done
++]])
+diff --git a/tests/system-route.at b/tests/system-route.at
+new file mode 100644
+index 0000000000..1714273e35
+--- /dev/null
++++ b/tests/system-route.at
+@@ -0,0 +1,28 @@
++AT_BANNER([system-route])
++
++dnl Add an interface, add/delete an IP address, and check that OVS catches route updates.
++AT_SETUP([ovs-route - add/remove system route])
++AT_KEYWORDS([route])
++OVS_TRAFFIC_VSWITCHD_START()
++
++dnl Create a tap port.
++AT_CHECK([ip tuntap add name p1-route mode tap])
++AT_CHECK([ip link set p1-route up])
++on_exit 'ip link del p1-route'
++
++dnl Add an IP address.
++AT_CHECK([ip addr add 10.0.0.17/24 dev p1-route], [0], [stdout])
++
++dnl Check that OVS catches the route updates.
++OVS_WAIT_UNTIL([ovs-appctl ovs/route/show | grep 'p1-route' | sort], [0], [dnl
++Cached: 10.0.0.17/24 dev p1-route SRC 10.0.0.17
++Cached: 10.0.0.17/32 dev p1-route SRC 10.0.0.17 local
++])
++
++dnl Delete the IP address.
++AT_CHECK([ip addr del 10.0.0.17/24 dev p1-route], [0], [stdout])
++dnl Check that the routes were removed from OVS.
++OVS_WAIT_UNTIL([test `ovs-appctl ovs/route/show | grep -c 'p1-route'` -eq 0 ])
++
++OVS_TRAFFIC_VSWITCHD_STOP
++AT_CLEANUP
+diff --git a/tests/system-traffic.at b/tests/system-traffic.at
+index 4a39c929c2..3ed03d92b5 100644
+--- a/tests/system-traffic.at
++++ b/tests/system-traffic.at
+@@ -611,6 +611,16 @@ NS_CHECK_EXEC([at_ns0], [ping -q -c 3 10.1.1.100 | FORMAT_PING], [0], [dnl
+ 3 packets transmitted, 3 received, 0% packet loss, time 0ms
+ ])
+
++dnl Test that OVS handles TLV map modifications properly when it restores frozen state.
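A hedged sketch of the behavior exercised below (assumed semantics, written for this walkthrough; not OVS internals): each ovs-ofctl add-tlv-map effectively produces a new option-mapping table, and a packet whose pipeline state was frozen for recirculation has to be thawed against the mapping table it was frozen with, not against the newest one:

    # Hedged illustration: versioned TLV mapping tables, with frozen state
    # pinned to the version it was created under. Not OVS code.
    class TlvTable:
        def __init__(self, version, entries):
            self.version = version
            self.entries = dict(entries)

    tables = [TlvTable(1, {(0xffff, 0x88): "tun_metadata1"})]

    def add_tlv_map(opt_class, opt_type, field):
        new = dict(tables[-1].entries)
        new[(opt_class, opt_type)] = field
        tables.append(TlvTable(tables[-1].version + 1, new))

    frozen_version = tables[-1].version        # packet frozen under table v1
    add_tlv_map(0xffff, 0x99, "tun_metadata2") # concurrent change -> table v2
    thaw_table = next(t for t in tables if t.version == frozen_version)
    print(thaw_table.entries)                  # thaw resolves against v1

The background ping started below presumably keeps traffic, and therefore frozen recirculation state, in flight while the mappings change, so that there is state to restore.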
++NS_CHECK_EXEC([at_ns0], [ping 10.1.1.100 > /dev/null &]) ++ ++AT_CHECK([ovs-ofctl add-tlv-map br0 "{class=0xffff,type=0x88,len=4}->tun_metadata1"]) ++sleep 1 ++AT_CHECK([ovs-ofctl add-tlv-map br0 "{class=0xffff,type=0x99,len=4}->tun_metadata2"]) ++sleep 1 ++AT_CHECK([ovs-ofctl add-tlv-map br0 "{class=0xffff,type=0xaa,len=4}->tun_metadata3"]) ++sleep 1 ++ + OVS_APP_EXIT_AND_WAIT([ovs-ofctl]) + OVS_TRAFFIC_VSWITCHD_STOP + AT_CLEANUP +diff --git a/tests/system-userspace-testsuite.at b/tests/system-userspace-testsuite.at +index b40da9579e..2e9659a675 100644 +--- a/tests/system-userspace-testsuite.at ++++ b/tests/system-userspace-testsuite.at +@@ -26,3 +26,4 @@ m4_include([tests/system-traffic.at]) + m4_include([tests/system-layer3-tunnels.at]) + m4_include([tests/system-interface.at]) + m4_include([tests/system-userspace-packet-type-aware.at]) ++m4_include([tests/system-route.at]) +diff --git a/tests/test-classifier.c b/tests/test-classifier.c +index 6d53d016de..2d98fad485 100644 +--- a/tests/test-classifier.c ++++ b/tests/test-classifier.c +@@ -512,8 +512,9 @@ verify_tries(struct classifier *cls) + int i; + + for (i = 0; i < cls->n_tries; i++) { +- n_rules += trie_verify(&cls->tries[i].root, 0, +- cls->tries[i].field->n_bits); ++ const struct mf_field * cls_field ++ = ovsrcu_get(struct mf_field *, &cls->tries[i].field); ++ n_rules += trie_verify(&cls->tries[i].root, 0, cls_field->n_bits); + } + assert(n_rules <= cls->n_rules); + } +diff --git a/utilities/bugtool/ovs-bugtool.in b/utilities/bugtool/ovs-bugtool.in +index e55bfc2ed5..47f3c4629f 100755 +--- a/utilities/bugtool/ovs-bugtool.in ++++ b/utilities/bugtool/ovs-bugtool.in +@@ -33,8 +33,7 @@ + # or func_output(). + # + +-import StringIO +-import commands ++from io import BytesIO + import fcntl + import getopt + import hashlib +@@ -48,7 +47,7 @@ import warnings + import zipfile + from select import select + from signal import SIGTERM +-from subprocess import PIPE, Popen ++from subprocess import PIPE, Popen, check_output + + from xml.dom.minidom import getDOMImplementation, parse + +@@ -348,7 +347,7 @@ def collect_data(): + cap = v['cap'] + if 'cmd_args' in v: + if 'output' not in v.keys(): +- v['output'] = StringIOmtime() ++ v['output'] = BytesIOmtime() + if v['repeat_count'] > 0: + if cap not in process_lists: + process_lists[cap] = [] +@@ -373,20 +372,23 @@ def collect_data(): + if 'filename' in v and v['filename'].startswith('/proc/'): + # proc files must be read into memory + try: +- f = open(v['filename'], 'r') ++ f = open(v['filename'], 'rb') + s = f.read() + f.close() + if check_space(cap, v['filename'], len(s)): +- v['output'] = StringIOmtime(s) ++ v['output'] = BytesIOmtime(s) + except: + pass + elif 'func' in v: + try: + s = v['func'](cap) + except Exception as e: +- s = str(e) ++ s = str(e).encode() + if check_space(cap, k, len(s)): +- v['output'] = StringIOmtime(s) ++ if isinstance(s, str): ++ v['output'] = BytesIOmtime(s.encode()) ++ else: ++ v['output'] = BytesIOmtime(s) + + + def main(argv=None): +@@ -704,7 +706,7 @@ exclude those logs from the archive. + + # permit the user to filter out data + # We cannot use iteritems, since we modify 'data' as we pass through +- for (k, v) in sorted(data.items()): ++ for (k, v) in data.items(): + cap = v['cap'] + if 'filename' in v: + key = k[0] +@@ -721,7 +723,7 @@ exclude those logs from the archive. 
+ + # include inventory + data['inventory.xml'] = {'cap': None, +- 'output': StringIOmtime(make_inventory(data, subdir))} ++ 'output': BytesIOmtime(make_inventory(data, subdir))} + + # create archive + if output_fd == -1: +@@ -782,7 +784,7 @@ def dump_scsi_hosts(cap): + + + def module_info(cap): +- output = StringIO.StringIO() ++ output = BytesIO() + modules = open(PROC_MODULES, 'r') + procs = [] + +@@ -806,7 +808,7 @@ def multipathd_topology(cap): + + + def dp_list(): +- output = StringIO.StringIO() ++ output = BytesIO() + procs = [ProcOutput([OVS_DPCTL, 'dump-dps'], + caps[CAP_NETWORK_STATUS][MAX_TIME], output)] + +@@ -828,7 +830,7 @@ def collect_ovsdb(): + if os.path.isfile(OPENVSWITCH_COMPACT_DB): + os.unlink(OPENVSWITCH_COMPACT_DB) + +- output = StringIO.StringIO() ++ output = BytesIO() + max_time = 5 + procs = [ProcOutput(['ovsdb-tool', 'compact', + OPENVSWITCH_CONF_DB, OPENVSWITCH_COMPACT_DB], +@@ -871,7 +873,7 @@ def fd_usage(cap): + + + def dump_rdac_groups(cap): +- output = StringIO.StringIO() ++ output = BytesIO() + procs = [ProcOutput([MPPUTIL, '-a'], caps[cap][MAX_TIME], output)] + + run_procs([procs]) +@@ -896,7 +898,7 @@ def load_plugins(just_capabilities=False, filter=None): + for node in nodelist: + if node.nodeType == node.TEXT_NODE: + rc += node.data +- return rc.encode() ++ return rc + + def getBoolAttr(el, attr, default=False): + ret = default +@@ -1037,7 +1039,7 @@ def make_tar(subdir, suffix, output_fd, output_file): + s = os.stat(v['filename']) + ti.mtime = s.st_mtime + ti.size = s.st_size +- tf.addfile(ti, open(v['filename'])) ++ tf.addfile(ti, open(v['filename'], 'rb')) + except: + pass + finally: +@@ -1095,12 +1097,12 @@ def make_inventory(inventory, subdir): + s.setAttribute('date', time.strftime('%c')) + s.setAttribute('hostname', platform.node()) + s.setAttribute('uname', ' '.join(platform.uname())) +- s.setAttribute('uptime', commands.getoutput(UPTIME)) ++ s.setAttribute('uptime', check_output(UPTIME).decode()) + document.getElementsByTagName(INVENTORY_XML_ROOT)[0].appendChild(s) + + map(lambda k_v: inventory_entry(document, subdir, k_v[0], k_v[1]), + inventory.items()) +- return document.toprettyxml() ++ return document.toprettyxml().encode() + + + def inventory_entry(document, subdir, k, v): +@@ -1301,7 +1303,7 @@ class ProcOutput(object): + line = self.proc.stdout.readline() + else: + line = self.proc.stdout.read(self.bufsize) +- if line == '': ++ if line == b'': + # process exited + self.proc.stdout.close() + self.status = self.proc.wait() +@@ -1391,13 +1393,13 @@ def get_free_disk_space(path): + return s.f_frsize * s.f_bfree + + +-class StringIOmtime(StringIO.StringIO): +- def __init__(self, buf=''): +- StringIO.StringIO.__init__(self, buf) ++class BytesIOmtime(BytesIO): ++ def __init__(self, buf=b''): ++ BytesIO.__init__(self, buf) + self.mtime = time.time() + + def write(self, s): +- StringIO.StringIO.write(self, s) ++ BytesIO.write(self, s) + self.mtime = time.time() + + +diff --git a/utilities/ovs-dpctl-top.in b/utilities/ovs-dpctl-top.in +index f2cc3f7f2a..011cc64b74 100755 +--- a/utilities/ovs-dpctl-top.in ++++ b/utilities/ovs-dpctl-top.in +@@ -592,7 +592,7 @@ def flows_read(ihdl, flow_db): + + try: + flow_db.flow_line_add(line) +- except ValueError, arg: ++ except ValueError as arg: + logging.error(arg) + + return flow_db +@@ -958,6 +958,9 @@ class FlowDB: + change order of fields of the same flow. 
+ """ + ++ if not isinstance(line, str): ++ line = str(line) ++ + line = line.rstrip("\n") + (fields, stats, _) = flow_line_split(line) + +@@ -988,7 +991,7 @@ class FlowDB: + + self.flow_event(fields_dict, stats_old_dict, stats_dict) + +- except ValueError, arg: ++ except ValueError as arg: + logging.error(arg) + self._error_count += 1 + raise +@@ -1192,7 +1195,7 @@ def flows_top(args): + flows_read(ihdl, flow_db) + finally: + ihdl.close() +- except OSError, arg: ++ except OSError as arg: + logging.critical(arg) + break + +@@ -1220,7 +1223,7 @@ def flows_top(args): + + # repeat output + for (count, line) in lines: +- print line ++ print(line) + + + def flows_script(args): +@@ -1249,7 +1252,7 @@ def flows_script(args): + render = Render(console_width, Render.FIELD_SELECT_SCRIPT) + + for line in render.format(flow_db): +- print line ++ print(line) + + + def main(): +diff --git a/utilities/ovs-vsctl.c b/utilities/ovs-vsctl.c +index bd3972636e..37cc72d401 100644 +--- a/utilities/ovs-vsctl.c ++++ b/utilities/ovs-vsctl.c +@@ -1344,9 +1344,13 @@ cmd_list_zone_tp(struct ctl_context *ctx) + + struct ovsrec_ct_timeout_policy *tp = zone->timeout_policy; + +- for (int j = 0; j < tp->n_timeouts; j++) { +- ds_put_format(&ctx->output, "%s=%"PRIu64" ", +- tp->key_timeouts[j], tp->value_timeouts[j]); ++ if (tp) { ++ for (int j = 0; j < tp->n_timeouts; j++) { ++ ds_put_format(&ctx->output, "%s=%"PRIu64" ", ++ tp->key_timeouts[j], tp->value_timeouts[j]); ++ } ++ } else { ++ ds_put_cstr(&ctx->output, "system default"); + } + ds_chomp(&ctx->output, ' '); + ds_put_char(&ctx->output, '\n'); +diff --git a/vswitchd/bridge.c b/vswitchd/bridge.c +index e591c26a6c..ce348b9d16 100644 +--- a/vswitchd/bridge.c ++++ b/vswitchd/bridge.c +@@ -634,8 +634,10 @@ static void + get_timeout_policy_from_ovsrec(struct simap *tp, + const struct ovsrec_ct_timeout_policy *tp_cfg) + { +- for (size_t i = 0; i < tp_cfg->n_timeouts; i++) { +- simap_put(tp, tp_cfg->key_timeouts[i], tp_cfg->value_timeouts[i]); ++ if (tp_cfg) { ++ for (size_t i = 0; i < tp_cfg->n_timeouts; i++) { ++ simap_put(tp, tp_cfg->key_timeouts[i], tp_cfg->value_timeouts[i]); ++ } + } + } + diff --git a/SOURCES/ppc_64-power8-linuxapp-gcc-config b/SOURCES/ppc_64-power8-linuxapp-gcc-config index 7b21579..394713d 100644 --- a/SOURCES/ppc_64-power8-linuxapp-gcc-config +++ b/SOURCES/ppc_64-power8-linuxapp-gcc-config @@ -1,4 +1,4 @@ -# -*- cfg-sha: f7b9a8671f1828542f6b8389a63bc60574d9c9ab21d06d5e8adefbaf7c929bc3 +# -*- cfg-sha: ed6bcdfa02f885357548558116ba4f4693048c72eb35043c2de856708c6f7257 # SPDX-License-Identifier: BSD-3-Clause # Copyright (C) IBM Corporation 2014. # SPDX-License-Identifier: BSD-3-Clause @@ -10,7 +10,7 @@ CONFIG_RTE_VER_PREFIX="DPDK" # Version information completed when this file is processed for a build CONFIG_RTE_VER_YEAR=19 CONFIG_RTE_VER_MONTH=11 -CONFIG_RTE_VER_MINOR=1 +CONFIG_RTE_VER_MINOR=3 CONFIG_RTE_VER_SUFFIX="" CONFIG_RTE_VER_RELEASE=99 # RTE_EXEC_ENV values are the directories in mk/exec-env/ @@ -590,4 +590,3 @@ CONFIG_RTE_TOOLCHAIN_GCC=y # Note: Power doesn't have this support # Note: Initially, all of architecture we compile for. PMD drivers compilation are turned off on Power # Will turn on them only after architecture we compile for. 
successful testing on Power -CONFIG_RTE_LIBRTE_PMD_XENVIRT=n diff --git a/SOURCES/x86_64-native-linuxapp-gcc-config b/SOURCES/x86_64-native-linuxapp-gcc-config index 81175d1..30d033b 100644 --- a/SOURCES/x86_64-native-linuxapp-gcc-config +++ b/SOURCES/x86_64-native-linuxapp-gcc-config @@ -1,4 +1,4 @@ -# -*- cfg-sha: 59724fb7100d28a9ee24efa79c4206bcde839bc29bb98eea771474514e57e022 +# -*- cfg-sha: f4cf137e2d4d96b2fa1ea8a0f1029d8d6553993747fda3f9f37fd01138fae055 # SPDX-License-Identifier: BSD-3-Clause # Copyright(c) 2010-2014 Intel Corporation # SPDX-License-Identifier: BSD-3-Clause @@ -10,7 +10,7 @@ CONFIG_RTE_VER_PREFIX="DPDK" # Version information completed when this file is processed for a build CONFIG_RTE_VER_YEAR=19 CONFIG_RTE_VER_MONTH=11 -CONFIG_RTE_VER_MINOR=1 +CONFIG_RTE_VER_MINOR=3 CONFIG_RTE_VER_SUFFIX="" CONFIG_RTE_VER_RELEASE=99 # RTE_EXEC_ENV values are the directories in mk/exec-env/ @@ -588,4 +588,3 @@ CONFIG_RTE_ARCH_X86_64=y CONFIG_RTE_ARCH_X86=y CONFIG_RTE_ARCH_64=y CONFIG_RTE_TOOLCHAIN_GCC=y -CONFIG_RTE_LIBRTE_PMD_XENVIRT=n diff --git a/SPECS/openvswitch2.13.spec b/SPECS/openvswitch2.13.spec index 8258d7e..21b0e82 100644 --- a/SPECS/openvswitch2.13.spec +++ b/SPECS/openvswitch2.13.spec @@ -62,14 +62,14 @@ Summary: Open vSwitch Group: System Environment/Daemons daemon/database/utilities URL: http://www.openvswitch.org/ Version: 2.13.0 -Release: 39%{?commit0:.%{date}git%{shortcommit0}}%{?commit1:dpdk%{shortcommit1}}%{?dist} +Release: 57%{?commit0:.%{date}git%{shortcommit0}}%{?commit1:dpdk%{shortcommit1}}%{?dist} # Nearly all of openvswitch is ASL 2.0. The bugtool is LGPLv2+, and the # lib/sflow*.[ch] files are SISSL # datapath/ is GPLv2 (although not built into any of the binary packages) License: ASL 2.0 and LGPLv2+ and SISSL -%define dpdkver %{?commit1}%{!?commit1:19.11.1} +%define dpdkver %{?commit1}%{!?commit1:19.11} %define dpdkdir dpdk %define dpdksver %(echo %{dpdkver} | cut -d. -f-2) # NOTE: DPDK does not currently build for s390x @@ -700,6 +700,78 @@ exit 0 %endif %changelog +* Wed Aug 26 2020 Open vSwitch CI - 2.13.0-57 +- Merging upstream branch-2.13 + [2fe3a06bffcd907f8f6561ec0e56963de9766c97] + +* Tue Aug 18 2020 Flavio Leitner - 2.13.0-56 +- dpdk: Updated configs to 19.11.3 + [4e4acaf40ab114e958b299cdff55c11240bfd4da] + +* Tue Aug 18 2020 Flavio Leitner - 2.13.0-55 +- Merging 798524b5e3 version: 19.11.3 (#1868709) + [64c883ec66425ad67a70599c549008442e3217cd] + +* Thu Aug 13 2020 Open vSwitch CI - 2.13.0-54 +- Merging upstream branch-2.13 + [5dddb2d4f863203ec3560fcfaf8f20844b053073] + +* Mon Aug 10 2020 Open vSwitch CI - 2.13.0-53 +- Merging upstream branch-2.13 + [bb436c2999218e59e06f089b42e19d3778869c63] + +* Mon Aug 10 2020 Dumitru Ceara - 2.13.0-52 +- ovsdb-server: Replace in-memory DB contents at raft install_snapshot. (#1867185) + [9f646ec051fa2a2bf980843b7c1859479e87c228] + +* Sat Aug 08 2020 Flavio Leitner - 2.13.0-51 +- redhat: Add support to custom RPM releases. + [7eb5b56344c07f237b2883f655eeee9c1ea0535e] + +* Sat Aug 08 2020 Flavio Leitner - 2.13.0-50 +- pkgtool: Use OVS static version in package NVR. + [a0b572aaa173f2a4b4f57b8b396706777bf83395] + +* Thu Jul 30 2020 Timothy Redaelli - 2.13.0-49 +- odp-util: Fix clearing match mask if set action is partially unnecessary. (#1862153) + [6d85fea8b4c7db954c051d0bad7bc9505c1fdf7c] + +* Thu Jul 16 2020 Flavio Leitner - 2.13.0-48 +- redhat: Clean old changelog entries. + [6cf8d909e81a715a302a2c401ef60abcc726fc78] + +* Thu Jul 16 2020 Flavio Leitner - 2.13.0-47 +- redhat: Update the documentation. 
+ [c9571d2dad6b1e47ba1d398350d8cd101a93e6a7] + +* Thu Jul 16 2020 Flavio Leitner - 2.13.0-46 +- redhat: Add merge script. + [752c59ba745c3c82bc7ca1e31caefbc4b6514b07] + +* Thu Jul 16 2020 Flavio Leitner - 2.13.0-45 +- redhat: Use static references. + [f1025c1515c00e9ec8e1fbc3a5337c412a3ce0c8] + +* Wed Jul 15 2020 Flavio Leitner - 2.13.0-44 +- Update DPDK configs to v19.11.2. + [98e6e9823b54d5f7f52aa531a5479289a4fc40d7] + +* Wed Jul 15 2020 Flavio Leitner - 2.13.0-43 +- Merge DPDK tag 'v19.11.2' into fast-datapath-rhel-8 + [755e86c61ae905a1485850f9e44a3502a63f52fb] + +* Wed Jul 15 2020 Flavio Leitner - 2.13.0-42 +- Merging upstream branch-2.13 to fast-datapath-rhel-8 + [735b3f94c2655e930b0ee86556eb01191518f7e8] + +* Sun Jul 12 2020 Flavio Leitner - 2.13.0-41 +- redhat: Rename OVSCI job name. + [a61f1d1095e58fb7c2ad38d37b86f3012f5aecfe] + +* Wed Jul 08 2020 Timothy Redaelli - 2.13.0-40 +- redhat: pkgtool: use diff instead of format-patch + [da2129ac827efe85db1e0ceeff8996e5045a862b] + * Thu Jun 25 2020 Timothy Redaelli - 2.13.0-39 - bus/pci: fix VF memory access (#1851169) [2b22bcd9ad02d0180ad5c46a2cccf34a3afba600] @@ -844,720 +916,3 @@ exit 0 - vhost: protect log address translation in IOTLB update (#1806599) [0d4370404fa971cb07ca2bf9cb0cdf98ecc54d4b] -* Tue Feb 25 2020 Timothy Redaelli - 2.13.0-3 -- Remove Docutils, Pygments and Sphinx directories - [0857b41c11694061bc94122c3c026ff552745703] - -* Tue Feb 25 2020 Timothy Redaelli - 2.13.0-2 -- Update Red Hat build files to use upstream tarballs and one patch - [b14f867126d5d9cfbe24d54c89aa917384c8c133] - -* Thu Feb 20 2020 Flavio Leitner - 2.13.0-1 -- Open vSwitch version 2.13.0 - [44ed4ed8d98d8c21e715a7014d89a2f14f56b96b] - -* Wed Jan 22 2020 Open vSwitch Bot - 2.13.0-0.20200121git2a4f006 -- Snapshot of branch-2.13 2a4f006c79c0 - -* Fri Jan 17 2020 Open vSwitch Bot - 2.13.0-0.20200117git8ae6a5f -- Snapshot of master 8ae6a5f98c3a - -* Tue Jan 14 2020 Open vSwitch Bot - 2.13.0-0.20200114gitb9b7b98 -- Snapshot of master b9b7b989d105 - -* Tue Jan 14 2020 Open vSwitch Bot - 2.13.0-0.20200114gitb9b7b98 -- Snapshot of master b9b7b989d105 - -* Tue Jan 14 2020 Open vSwitch Bot - 2.13.0-0.20200114gitb9b7b98 -- Snapshot of master b9b7b989d105 -- Remove MLX{4,5} glue libraries, since Python3 is included in RHEL 7.6 that - ships the correct libibverbs library. 
- -* Tue Jan 14 2020 Open vSwitch Bot - 2.13.0-0.20200113git67eb811 -- Snapshot of master 67eb8110171f - -* Mon Jan 13 2020 Timothy Redaelli - 2.13.0-0.20200109git2109841.1 -- Add a not-upstream-yet patch to remove dependency for python3-netifaces, - since it's not available on RHEL7 - -* Mon Jan 13 2020 Open vSwitch Bot - 2.13.0-0.20200109git2109841 -- Snapshot of master 2109841b7984 - -* Thu Jan 09 2020 Open vSwitch Bot - 2.13.0-0.20200109gitb926f57 -- Snapshot of master b926f577aaf1 - -* Tue Jan 07 2020 David Marchand - 2.11.0-16 -- Backport DPDK interrupt fixes for qede (#1788515) - -* Mon Dec 23 2019 Eelco Chaudron - 2.12.0-15 - -- Backport "vhost: add device op when notification to guest is sent" (#1726579) -- Backport "netdev-dpdk: Add coverage counter to count vhost IRQs" (#1726579) - -* Mon Dec 23 2019 Eelco Chaudron - 2.12.0-14 -- Backport "net/i40e: downgrade error log" (#1719644) -- Backport "net/i40e: re-program promiscuous mode on VF interface" (#1733402) -- Backport "bridge: Allow manual notifications about interfaces' updates" (#1719644) -- Backport "netdev-dpdk: add support for the RTE_ETH_EVENT_INTR_RESET" (#1719644) - -* Thu Dec 19 2019 Timothy Redaelli - 2.12.0-13 -- Add --with ipsec flag to build OVS with IPSEC support - -* Tue Dec 10 2019 Timothy Redaelli - 2.12.0-12 -- Fix librte_pmd_mlx{4,5}_glue.so error in Execshield part of RPMDiff - by backporting the DPDK flags from dpdk spec file. - -* Fri Dec 06 2019 Timothy Redaelli - 2.12.0-11 -- Backport "ovs-tcpundump: allow multiple packet lengths" (#1780553) -- Backport "ovs-tcpundump: exit when getting version" (#1780555) -- Backport "ovs-check-dead-ifs: python3 print format" (#1780563) -- Backport "ovs-check-dead-ifs: unshadow pid variable" (#1780563) -- Backport "flake8: also check the ovs-check-dead-ifs script" (#1780563) - -* Wed Dec 04 2019 Timothy Redaelli - 2.12.0-10 -- Rebase internal DPDK to 18.11.5 (#1773780) (CVE-2019-14818) - -* Tue Nov 26 2019 Lorenzo Bianconi - 2.12.0-9 -- Backport "jsonrpc: increase input buffer size from 512 to 4096" (#1720653) - -* Fri Nov 22 2019 Flavio Leitner - 2.12.0-8 -- updated spec to conflict with previous versions. - -* Fri Nov 22 2019 Flavio Leitner - 2.12.0-7 -- Backport "ofproto-dpif: Allow IPv6 ND Extensions only if supported" (#1773598) - [df5db2a7a0fe9a4b6f5eafaada20a9b834aebbac] - -* Wed Nov 13 2019 Numan Siddique - 2.12.0-6 -- Backport "ovsdb-server: Allow replication from older schema version servers" (#1771854) - -* Tue Nov 12 2019 David Marchand - 2.12.0-5 -- Backport "netdev-dpdk: Track vhost tx contention." (#1771390) - -* Tue Nov 05 2019 David Marchand - 2.12.0-4 -- Renumbered dpdk patches -- Backport IOVA fixes (#1769027) - -* Mon Oct 14 2019 Numan Siddique - 2.12.0-3 -- Backport "ovsdb-server: Don't drop all connections on read/write status change" (#1761573) - -* Tue Oct 08 2019 Flavio Leitner - 2.12.0-2 -- updated to 2.12.0 plus patches till 093fd99a4c12d (#1758820) - -* Mon Oct 07 2019 Aaron Conole - 2.12.0-1.20190723gitcbff264 -- Backport "vswitch: ratelimit the device add log" (#1737146) - -* Wed Jul 24 2019 Open vSwitch Bot - 2.12.0-0.20190723gitcbff264 -- Snapshot of branch-2.12 cbff264a084a - -* Tue Jul 16 2019 Timothy Redaelli - 2.11.0-18 -- Increase CONFIG_RTE_MAX_ETHPORTS to 128 (#1730421) - -* Tue Jul 16 2019 Timothy Redaelli - 2.11.0-17 -- Backport "tunnel: Add layer 2 IPv6 GRE encapsulation support." 
and - "netdev-vport: Make ip6gre netdev type to use TC rules" (#1725623) - -* Fri Jul 12 2019 Timothy Redaelli - 2.11.0-16 -- Rebase internal DPDK to 18.11.2 (#1713698) - -* Tue Jul 09 2019 David Marchand - 2.11.0-15 -- Backport "net/i40e: fix dropped packets statistics name" (#1728610) - -* Tue Jul 02 2019 Timothy Redaelli - 2.11.0-14 -- Backport "netdev-tc-offloads: Use correct hook qdisc at init tc flow" (#1721219) - -* Fri Jun 21 2019 Timothy Redaelli - 2.11.0-13 -- Backport "netdev-tc-offloads: Support match on priority tags" (#1722249) - -* Thu Jun 13 2019 Maxime Coquelin - 2.11.0-12 -- Backport Vhost performance regression fixes (#1672538) - -* Thu Jun 13 2019 Flavio Leitner - 2.11.0-11 -- Backport "rhel: limit stack size to 2M." (#1720315) - -* Thu May 16 2019 Pablo Cascón - 2.11.0-10 -- Backport "ovs-tc: support OvS internal port offload" and deps (#1702334) - -* Wed Apr 24 2019 Numan Siddique - 2.11.0-9 -- Backport "[OVN] Fragmentation support - check_pkt_larger action" (#1702564) - -* Thu Apr 11 2019 Kevin Traynor - 2.11.0-8 -- Backport "net/qede: support IOVA VA mode" (#1684605) - -* Wed Apr 10 2019 David Marchand - 2.11.0-7 -- Backport cpu affinity fixes (#1687320) - -* Tue Apr 09 2019 Timothy Redaelli - 2.11.0-6 -- Add missing dependencies for ovs-tcpdump (#1697978) - -* Tue Mar 26 2019 Flavio Leitner - 2.11.0-5 -- fixed netlink msg corruption when updating netdev. (#1692812) - -* Tue Mar 12 2019 Davide Caratti - 2.11.0-4 -- Backport "net/bnxt: support IOVA VA mode" (#1645523) - -* Tue Mar 12 2019 Timothy Redaelli - 2.11.0-3 -- Backport "ovs-ctl: Permit to specify additional options" (#1687775) -- Remove useless -fPIC from DPDK - -* Fri Mar 01 2019 Timothy Redaelli - 2.11.0-2 -- Backport "rhel: Use PIDFile on forking systemd service files" (#1684477) - -* Thu Feb 28 2019 Timothy Redaelli - 2.11.0-1 -- Update to official 2.11 release - -* Thu Jan 31 2019 Open vSwitch Bot - 2.11.0-0.20190129gitd3a10db -- Snapshot of branch-2.11 d3a10db4fd38 - -* Sun Jan 27 2019 Open vSwitch Bot - 2.11.0-0.20190126gitd4ff5b2 -- Snapshot of branch-2.11 d4ff5b2be7fc - -* Mon Jan 14 2019 Timothy Redaelli - 2.11.0-0.20190114gitadb3f0b -- Update to a snapshot of OVS 2.11 from master - -* Mon Jan 7 2019 Lorenzo Bianconi - 2.10.0-42 -- Backport "OVN: add static IP support to IPAM" (#1664028) - -* Thu Jan 03 2019 Timothy Redaelli - 2.10.0-41 -- Backport some patches to improve offload indications (#1655990) - -* Wed Jan 02 2019 Timothy Redaelli - 2.10.0-40 -- Add "Requires: openvswitch = %%{version}-%%{release}" to python-openvswitch2.10 (#1662944) - -* Wed Jan 2 2019 Lorenzo Bianconi - 2.10.0-39 -- Backport "OVN: add mac address only support to IPAM/MACAM" (#1662905) - -* Thu Dec 20 2018 Numan Siddique - 2.10.0-38 -- Backport "ovn-controller: Inject GARPs to logical switch pipeline to update neighbors" (#1643902) - -* Tue Dec 18 2018 David Marchand - 2.10.0-37 -- Backport 'ovs-ctl: fix system-id.conf owner' (#1659391) -- Do not check /var/log/openvswitch owner/group (#1659391) - -* Tue Dec 18 2018 Numan Siddique - 2.10.0-36 -- Backport "ovn: Fix the invalid eth.dst and ip6.dst set by nd_ns action for certain cases." 
(#1656018) - -* Mon Dec 10 2018 Timothy Redaelli - 2.10.0-35 -- Backport "dpif-netdev: Add vlan to mask for flow_put operation" (#1649516) - -* Tue Nov 27 2018 Numan Siddique - 2.10.0-34 -- Backport "ovn: Avoid tunneling for VLAN packets redirected to a gateway chassis" (#1561880) - -* Fri Nov 23 2018 Eelco Chaudron - 2.10.0-33 -- Backport "mem: fix memory initialization time" (#1647498) - -* Thu Nov 22 2018 Timothy Redaelli - 2.10.0-32 -- Backport "tests: Use the default key length when generating RSA keys" - -* Wed Nov 14 2018 Timothy Redaelli - 2.10.0-31 -- Backport "net/qede: fix crash when configure fails" (#1648183) - -* Tue Nov 13 2018 Lorenzo Bianconi - 2.10.0-30 -- Backport 'pinctrl: Fix dp_packet structure leak' and 'pinctrl: Fix crash on - buffered packets hmap double remove'. Moreover align 'ovn -- 3 HVs, 3 LS, 3 - lports/LS, 1 LR' test to upstream one (#1649008) - -* Tue Nov 13 2018 Eelco Chaudron - 2.10.0-29 -- Backup "netdev-dpdk: Bring link down when NETDEV_UP is not set" (#1645288) - -* Fri Nov 09 2018 Lorenzo Bianconi - 2.10.0-28 -- OVN: configure L2 address according to the used IP address (#1648272) - -* Thu Nov 08 2018 Timothy Redaelli - 2.10.0-27 -- Backport "bond: Honor updelay and downdelay when LACP is in use" (#1646923) - -* Thu Nov 08 2018 Lorenzo Bianconi - 2.10.0-26 -- OVN: introduce mac_prefix support to IPAM (#1647750) - -* Tue Nov 06 2018 Timothy Redaelli - 2.10.0-25 -- Backport "ofproto-dpif-xlate: Avoid deadlock on multicast snooping recursion" (#1643065) - -* Tue Nov 06 2018 Timothy Redaelli - 2.10.0-24 -- Re-enable "make check" - -* Fri Nov 02 2018 Kevin Traynor - 2.10.0-23 -- Update to DPDK 17.11.4 (#1566069) - -* Thu Oct 25 2018 Timothy Redaelli - 2.10.0-22 -- Ship statically linked OVS binaries (#1643478) - -* Tue Oct 23 2018 Numan Siddique - 2.10.0-21 -- Backport connmgr: Fix vswitchd abort when a port is added and the controller is down (#1637926) - -* Mon Oct 22 2018 Timothy Redaelli - 2.10.0-20 -- Backport "ovn: Add DHCP support for option 252" (#1641740) - -* Wed Oct 17 2018 Timothy Redaelli - 2.10.0-19 -- Backport "net/i40e: fix VLAN offload setting issue" (#1637893) - -* Wed Oct 17 2018 Timothy Redaelli - 2.10.0-18 -- Backport "Python: Make Row's __getattr__ less error prone" (#1639963) - -* Fri Oct 12 2018 Numan Siddique - 2.10.0-17 -- OVN: ovn-ctl: Fix the wrong pidfile argument passed to ovsdb-servers (#1636714) - -* Fri Oct 12 2018 Numan Siddique - 2.10.0-16 -- OVN: Support processing DHCPv6 information request message type (#1636874) - -* Fri Oct 12 2018 Numan Siddique - 2.10.0-15 -- OVN: Fix IPv6 DAD failure for container ports (#1616129) - -* Thu Oct 11 2018 Numan Siddique - 2.10.0-14 -- OVN: Fix the issue in IPv6 Neigh Solicitation responder for router IPs (#1567735) - -* Tue Oct 09 2018 Lorenzo Bianconi - 2.10.0-13 -- OVN: add buffering support for ip packets (#1637466) - -* Mon Oct 08 2018 Matteo Croce - 2.10.0-12 -- Fix null pointer (#1634015) -* Tue Oct 02 2018 Lorenzo Bianconi - 2.10.0-11 -- OVN: add CT_LB action to ovn-trace (#1635344) - -* Mon Oct 01 2018 Timothy Redaelli - 2.10.0-10 -- Backport NFP PMD's non-root related commits for > 1TB of RAM (#1634820): - - net/nfp: support IOVA VA mode - - bus/pci: forbid IOVA mode if IOMMU address width too small - - net/nfp: check hugepages IOVAs based on DMA mask - - mem: use address hint for mapping hugepages - - bus/pci: use IOVAs check when setting IOVA mode - - mem: add function for checking memsegs IOVAs addresses - - mem: fix max DMA maskbit size - -* Thu Sep 27 2018 Matteo Croce - 
2.10.0-9 -- Backport "Remove support for multiple queues per port" (#1634015) - -* Wed Sep 26 2018 Matteo Croce - 2.10.0-8 -- Backport EMC reorder fix (#1565205) - -* Wed Sep 26 2018 Matteo Croce - 2.10.0-7 -- Backport per-port socket netlink creation with EPOLLEXCLUSIVE (#1634015) - -* Fri Sep 21 2018 Kevin Traynor - 2.10.0-6 -- Backport roundrobin rxq to pmd assignment (#1631797) - -* Fri Sep 14 2018 Timothy Redaelli - 2.10.0-5 -- Backport "ovs-save: Don't always include the default flow during restore" (#1628905) - -* Thu Sep 13 2018 Flavio Leitner - 2.10.0-4 -- applied Fix translation of groups with no buckets (#1626488) - -* Thu Sep 13 2018 Flavio Leitner - 2.10.0-3 -- Removed provides and obsoletes for openvswitch-dpdk (#1628603) - -* Tue Sep 11 2018 Timothy Redaelli - 2.10.0-2 -- Backported "net/mlx{4,5}: avoid stripping the glue library" (#1627700) - -* Tue Aug 21 2018 Flavio Leitner - 2.10-1 -- Updated with 2.10.0 official tarball (#1618551) - -* Fri Aug 17 2018 Flavio Leitner - 2.10-0 -- Sync'ed with fd-next (4452afaa58) -- vhost: flush IOTLB cache on new mem table handling (#1609643) -- OVN: introduce ovs-appctl command to monitor HVs sb (#1593804) - -* Thu Aug 16 2018 Open vSwitch Bot - 2.10-0 -- Snapshot of branch-2.10 6bced903bb50 - -* Fri Aug 10 2018 Open vSwitch Bot - 2.10-0 -- Snapshot of branch-2.10 58a7ce60b9f7 - -* Wed Aug 08 2018 Open vSwitch Bot - 2.10-0 -- Snapshot of branch-2.10 faf64fb8861f - -* Tue Aug 07 2018 Flavio Leitner - 2.10-0 -- Snapshot of branch master 7a78d1c1ad73 - -* Tue Jul 31 2018 Flavio Leitner - 2.10-0 -- Sync'ed spec file with fd-next-57 (shared linking). - (DPDK patches not included) -- Fixed package dependencies (#1610603) - -* Fri Jul 27 2018 Open vSwitch Bot - 2.10-0 -- Snapshot of branch master b1ca64f020f7 - -* Fri Jul 27 2018 Flavio Leitner - 2.10-0 -- Replace macro %%{name} with 'openvswitch'. - -* Tue Jul 24 2018 Open vSwitch Bot - 2.10-0 -- Snapshot of branch master 1ac690899592 - -* Tue Jul 24 2018 Flavio Leitner - 2.10-0 -- Versioned conflict to be less than 2.10. - -* Thu Jul 19 2018 Open vSwitch Bot - 2.10-0 -- Snapshot of branch master 3c921cc2b6b7 - -* Wed Jul 18 2018 Flavio Leitner - 2.10-0 -- Fixed unbound requires and buildrequires. 
- -* Tue Jul 10 2018 Open vSwitch Bot - 2.10-0 -- Snapshot of branch master 93c0ef12039c - -* Tue Jul 03 2018 Open vSwitch Bot - 2.10-0 -- Snapshot of branch master 79d0dfa4e99a - -* Wed Jun 27 2018 Open vSwitch Bot - 2.10-0 -- Snapshot of branch master e46148133067 - -* Wed Jun 27 2018 Open vSwitch Bot - 2.10-0 -- Snapshot of branch master 61677bf976e9 - -* Tue Jun 26 2018 Flavio Leitner - 2.10-0 -- snapshot of branch master - -* Mon Jun 11 2018 Aaron Conole - 2.9.0-47 -- Backport "net/mlx5: fix memory region cache lookup" (#1581230) -- Backport "net/mlx5: fix memory region boundary checks" (#1581230) - -* Mon Jun 11 2018 Timothy Redaelli - 2.9.0-46 -- Backport "net/qede: fix memory alloc for multiple port reconfig" (#1589866) - -* Thu Jun 07 2018 Timothy Redaelli - 2.9.0-45 -- Backport "net/qede: fix unicast filter routine return code" (#1578590) - -* Thu Jun 07 2018 Timothy Redaelli - 2.9.0-44 -- Backport "net/qede: fix L2-handles used for RSS hash update" (#1578981) - -* Tue May 29 2018 Timothy Redaelli - 2.9.0-43 -- Backport "net/nfp: fix lock file usage" (#1583670) - -* Mon May 28 2018 Timothy Redaelli - 2.9.0-42 -- Backport "net/nfp: configure default RSS reta table" (#1583161) - -* Mon May 28 2018 Timothy Redaelli - 2.9.0-41 -- Backport "netdev-dpdk: don't enable scatter for jumbo RX support for nfp" (#1578324) - -* Mon May 28 2018 Timothy Redaelli - 2.9.0-40 -- Backport "ovn pacemaker: Fix promotion issue when the master node is reset" (#1579025) - -* Thu May 24 2018 Timothy Redaelli - 2.9.0-39 -- Backport spec file modfications from "rhel: Use openvswitch user/group for - the log directory" - -* Wed May 23 2018 Maxime Coquelin - 2.9.0-38 -- Backport "vhost: improve dirty pages logging performance" (#1552465) - -* Wed May 16 2018 Timothy Redaelli - 2.9.0-37 -- Backport "ovn: Set proper Neighbour Adv flag when replying for NS request for - router IP" (#1567735) - -* Mon May 14 2018 Timothy Redaelli - 2.9.0-36 -- Enable QEDE PMDs (only on x86_64) (#1578003) - -* Thu May 10 2018 Lorenzo Bianconi - 2.9.0-35 -- ovn-nbctl: Show gw chassis in decreasing prio order (#1576725) - -* Wed May 09 2018 Timothy Redaelli - 2.9.0-34 -- Fix hugetlbfs group when DPDK is enabled - -* Wed May 09 2018 Timothy Redaelli - 2.9.0-33 -- Backport "eal: abstract away the auxiliary vector" (#1560728) -- Re-enable DPDK on ppc64le - -* Wed May 09 2018 Aaron Conole - 2.9.0-32 -- Require the selinux policy module (#1555440) - -* Tue May 08 2018 Timothy Redaelli - 2.9.0-31 -- Backport fix QEDE PMD (#1494616) - -* Tue May 08 2018 Timothy Redaelli - 2.9.0-30 -- Backport "net/nfp: fix mbufs releasing when stop or close" (#1575067) - -* Sun May 06 2018 Timothy Redaelli - 2.9.0-29 -- Backport net/mlx4: fix broadcast Rx (#1568908) - -* Fri May 04 2018 Kevin Traynor - 2.9.0-28 -- Backport mempool use after free fix and debug (#1575016) - -* Fri May 04 2018 Aaron Conole - 2.9.0-27 -- Fix the email address in the changelog. 
- -* Wed May 02 2018 Aaron Conole - 2.9.0-26 -- Backport fix for missing user during install/upgrade (#1559374) - -* Mon Apr 30 2018 Jakub Sitnicki - 2.9.0-25 -- Backport fix for Unicode encoding in Python IDL (#1547065) - -* Thu Apr 26 2018 Aaron Conole - 2.9.0-24 -- Backport the cisco enic patches - -* Thu Apr 26 2018 Timothy Redaelli - 2.9.0-23 -- Backport a fix for "Offload of Fragment Matching in OvS Userspace" (#1559111) - -* Thu Apr 26 2018 Timothy Redaelli - 2.9.0-22 -- Backport "ovn-controller: Handle Port_Binding's "requested-chassis" option" (#1559222) - -* Thu Apr 26 2018 Timothy Redaelli - 2.9.0-21 -- Backport "python: avoid useless JSON conversion to enhance performance" (#1551016) - -* Thu Apr 26 2018 Timothy Redaelli - 2.9.0-20 -- Backport "ovn: Set router lifetime value for IPv6 periodic RA" (#1567735) -- Remove useless libpcap-devel dependency - -* Mon Apr 23 2018 Kevin Traynor - 2.9.0-19 -- Backport DPDK CVE-2018-1059 (#1544298) - -* Fri Apr 20 2018 Davide Caratti - 2.9.0-18 -- Backport fix for PMD segfault when BNXT receives tunneled traffic (#1567634) - -* Mon Apr 16 2018 Timothy Redaelli - 2.9.0-17 -- Backport patches to make NFP detect the correct firmware (#1566712) -- Backport "rhel: Fix literal dollar sign usage in systemd service files" - -* Fri Mar 30 2018 Timothy Redaelli - 2.9.0-16 -- Backport "rhel: don't drop capabilities when running as root" -- Change owner of /etc/openvswitch during upgrade - -* Tue Mar 27 2018 Timothy Redaelli - 2.9.0-14 -- Disable DPDK on ppc64le - -* Sun Mar 25 2018 Timothy Redaelli - 2.9.0-13 -- Disable DPDK on aarch64 - -* Thu Mar 22 2018 Flavio Leitner - 2.9.0-12 -- fixes i40e link status timeout trough direct register access (#1559612) - -* Thu Mar 22 2018 Timothy Redaelli - 2.9.0-11 -- Enable BNXT, MLX4, MLX5 and NFP (aligned from FDB) - -* Thu Mar 22 2018 Timothy Redaelli - 2.9.0-10 -- Backport "Offload of Fragment Matching in OvS Userspace" (#1559111) - -* Thu Mar 15 2018 Timothy Redaelli - 2.9.0-9 -- Avoid to unpack openvswitch 2 times and to overwrite all the patched files - Fixes 2.9.0-4 - -* Thu Mar 08 2018 Eric Garver - 2.9.0-8 -- Backport "ofproto-dpif-xlate: translate action_set in clone action" (#1544892) - -* Thu Mar 08 2018 Timothy Redaelli - 2.9.0-7 -- Backport "ovn: Calculate UDP checksum for DNS over IPv6" (#1553023) - -* Tue Mar 06 2018 Aaron Conole - 2.9.0-6 -- Require the latest rhel selinux policy (#1549673) - -* Fri Mar 02 2018 Matteo Croce - 2.9.0-5 -- Backport vhost patches (#1541881) - -* Fri Mar 02 2018 Timothy Redaelli - 2.9.0-4 -- Don't require python-sphinx directly, but built it since python-sphinx is in - the optional repository that is not available on RHEV and TPS test fails. - -* Tue Feb 20 2018 Timothy Redaelli - 2.9.0-3 -- Don't verify the user and group of /etc/openvswitch and /etc/sysconfig/openvswitch - This is needed since we cannot change the user and group if you upgrade from - an old version that still uses root:root. 
- -* Tue Feb 20 2018 Timothy Redaelli - 2.9.0-1 -- Update to OVS 2.9.0 + DPDK 17.11 (#1475436) -- Backport of ofproto-dpif: Delete system tunnel interface when remove ovs bridge (#1505776) -- Backport DPDK patches from FDB (vhost user async fix and enic fixes) -- Backport 94cd8383e297 and 951d79e638ec to fix permissions (#1489465) -- Use a static configuration file for DPDK - -* Fri Jan 12 2018 Timothy Redaelli - 2.7.3-3.git20180112 -- Rebase to latest OVS branch-2.7 fixes + DPDK 16.11.4 (#1533872) - -* Wed Oct 18 2017 Timothy Redaelli - 2.7.3-2.git20171010 -- Remove ovs-test and ovs-vlan-test from openvswitch-test package -- Add an option to enable openvswitch-ovn-docker package (disabled by default) - -* Tue Oct 10 2017 Timothy Redaelli - 2.7.3-1.git20171010 -- Update to OVS 2.7.3 + branch-2.7 bugfixes (#1502742) - -* Mon Sep 18 2017 Kevin Traynor - 2.7.2-10.git20170914 -- Backport of fix for i40e flow control get (#1491791) - -* Thu Sep 14 2017 Timothy Redaelli - 2.7.2-9.git20170914 -- Rebase to latest OVS branch fixes + DPDK 16.11.3 - -* Wed Sep 06 2017 Timothy Redaelli - 2.7.2-8.git20170719 -- Backport of enic driver crash fix to dpdk-16.11 (#1489010) - -* Tue Aug 22 2017 Aaron Conole - 2.7.2-7.git20170719 -- Re-enable Cisco enic PMD (#1482675) - -* Tue Aug 22 2017 Aaron Conole - 2.7.2-6.git20170719 -- Update based on multi-arch - -* Tue Aug 22 2017 Aaron Conole - 2.7.2-5.git20170719 -- Disable unsupported PMDs (#1482675) -- software and hardware PMDs audited by the team - -* Thu Aug 03 2017 John W. Linville - 2.7.2-4.git20170719 -- Backport mmap fix for memory initialization on ppc64le to dpdk-16.11 - -* Thu Aug 03 2017 John W. Linville - 2.7.2-3.git20170719 -- Backport support for vfio-pci based PMD in ppc64le to dpdk-16.11 - -* Thu Aug 03 2017 John W. Linville - 2.7.2-2.git20170719 -- Backport support for Intel XL710 (i40e) pmd in ppc64le to dpdk-16.11 - -* Wed Jul 19 2017 Timothy Redaelli - 2.7.2-1.git20170719 -- Update to OVS 2.7.2 + branch-2.7 bugfixes (#1472854) -- Add a symlink of the OCF script in the OCF resources folder (#1472729) - -* Mon Jul 10 2017 Timothy Redaelli - 2.7.1-1.git20170710 -- Align to FDB openvswitch-2.7.1-1.git20170710.el7fdb (#1459286) - -* Wed Jun 07 2017 Timothy Redaelli - 2.6.1-20.git20161206 -- backport "mcast-snooping: Avoid segfault for vswitchd" (#1456356) -- backport "mcast-snooping: Flush ports mdb when VLAN cfg changed." (#1456358) - -* Sun May 21 2017 Lance Richardson - 2.6.1-19.git20161206 -- backport patch to not automatically restard ovn svcs after upgrade (#1438901) - -* Tue May 09 2017 Timothy Redaelli - 2.6.1-18.git20161206 -- rconn: Avoid abort for ill-behaved remote (#1449109) - -* Fri May 05 2017 Timothy Redaelli - 2.6.1-17.git20161206 -- Fix race in "PMD - change numa node" test (#1447714) -- Report only un-deleted groups in group stats replies. 
(#1447724) -- Workaround some races in "ofproto - asynchronous message control" tests (#1448536) - -* Mon Apr 10 2017 Eric Garver - 2.6.1-16.git20161206 -- Fix an issue using set_field action on nw_ecn (#1410715) - -* Fri Mar 31 2017 Kevin Traynor - 2.6.1-15.git20161206 -- backport patch to fix uni-dir vhost perf drop (#1414919) - -* Wed Mar 29 2017 Lance Richardson - 2.6.1-14.git20161206 -- backport patch to correct port number in firewalld service file (#1390938) - -* Fri Mar 10 2017 Timothy Redaelli - 2.6.1-13.git20161206 -- backport patch to enable/disable libcap-ng support (--with libcapng) - -* Thu Mar 09 2017 Aaron Conole - 2.6.1-12.git20161206 -- Fix an MTU issue with ovs mirror ports (#1426342) - -* Wed Mar 08 2017 Lance Richardson - 2.6.1-11.git20161206 -- update spec file to install firewalld service files (#1390938) - -* Thu Feb 16 2017 Aaron Conole - 2.6.1-10.git20161206 -- vhostuser client mode support for ifup/ifdown (#1418957) - -* Thu Feb 16 2017 Lance Richardson - 2.6.1-9.git20161206 -- OVN-DHCP is not sending DHCP responses after a MAC change in north db (#1418261) - -* Thu Feb 16 2017 Timothy Redaelli - 2.6.1-8.git20161206 -- systemd service starts too fast (#1422227) - -* Fri Feb 10 2017 Lance Richardson - 2.6.1-7.git20161206 -- iptables should be easily configurable for OVN hosts and OVN central server (#1390938) - -* Thu Feb 09 2017 Aaron Conole - 2.6.1-6.git20161206 -- ovn: IPAM has no reply to DHCP request for renewal (#1415449) - -* Tue Feb 07 2017 Timothy Redaelli - 2.6.1-5.git20161206 -- ovn-controller: Provide the option to set Encap.options:csum (#1418742) - -* Mon Feb 06 2017 Flavio Leitner 2.5.0-23.git20160727 -- fixed broken service after a package upgrade (#1403958) - -* Wed Dec 21 2016 Lance Richardson 2.6.1-3.git20161206 -- ovsdb-idlc: Initialize nonnull string columns for inserted rows. 
(#1405094) - -* Fri Dec 09 2016 Lance Richardson 2.6.1-2.git20161206 -- OVN: Support IPAM with externally specified MAC (#1368043) - -* Tue Dec 06 2016 Kevin Traynor 2.6.1-1.git20161206 -- Update to OVS 2.6.1 + branch-2.6 bugfixes (#1335865) -- Update to use DPDK 16.11 (#1335865) -- Enable OVN - -* Tue Nov 22 2016 Flavio Leitner 2.5.0-22.git20160727 -- ifnotifier: do not wake up when there is no db connection (#1397504) - -* Tue Nov 22 2016 Flavio Leitner 2.5.0-21.git20160727 -- Use instant sending instead of queue (#1397481) - -* Mon Nov 21 2016 Flavio Leitner 2.5.0-20.git20160727 -- dpdk vhost: workaround stale vring base (#1376217) - -* Thu Oct 20 2016 Aaron Conole - 2.5.0-19.git20160727 -- Applied tnl fix (#1346232) - -* Tue Oct 18 2016 Aaron Conole - 2.5.0-18.git20160727 -- Applied the systemd backports - -* Tue Oct 18 2016 Flavio Leitner - 2.5.0-17.git20160727 -- Fixed OVS to not require SSSE3 if DPDK is not used (#1378501) - -* Tue Oct 18 2016 Flavio Leitner - 2.5.0-16.git20160727 -- Fixed a typo (#1385096) - -* Tue Oct 18 2016 Flavio Leitner - 2.5.0-15.git20160727 -- Do not restart the service after a package upgrade (#1385096) - -* Mon Sep 26 2016 Panu Matilainen - 2.5.0-14.git20160727 -- Permit running just the kernel datapath tests (#1375660) - -* Wed Sep 14 2016 Panu Matilainen - 2.5.0-13.git20160727 -- Obsolete openvswitch-dpdk < 2.6.0 to provide migration path -- Add spec option to run kernel datapath tests (#1375660) - -* Fri Sep 09 2016 Panu Matilainen - 2.5.0-12.git20160727 -- Backport ovs-tcpdump support (#1335560) -- Add ovs-pcap, ovs-tcpdump and ovs-tcpundump to -test package - -* Thu Sep 08 2016 Panu Matilainen - 2.5.0-11.git20160727 -- Add openvswitch-dpdk provide for testing and depending on dpdk-enablement -- Disable bnx2x driver, it's not stable -- Build dpdk with -Wno-error to permit for newer compilers -- Drop subpkgs conditional from spec, its not useful anymore - -* Fri Aug 26 2016 Panu Matilainen - 2.5.0-10.git20160727 -- Fix adding ukeys for same flow by different pmds (#1364898) - -* Thu Jul 28 2016 Flavio Leitner - 2.5.0-9.git20160727 -- Fixed ifup-ovs to support DPDK Bond (#1360426) - -* Thu Jul 28 2016 Flavio Leitner - 2.5.0-8.git20160727 -- Fixed ifup-ovs to delete the ports first (#1359890) - -* Wed Jul 27 2016 Flavio Leitner - 2.5.0-7.git20160727 -- pull bugfixes from upstream 2.5 branch (#1360431) - -* Tue Jul 26 2016 Flavio Leitner - 2.5.0-6.git20160628 -- Removed redundant provides for openvswitch -- Added epoch to the provides for -static package - -* Thu Jul 21 2016 Flavio Leitner - 2.5.0-5.git20160628 -- Renamed to openvswitch (dpdk enabled) -- Enabled sub-packages -- Removed conflicts to openvswitch -- Increased epoch to give this package preference over stable - -* Tue Jun 28 2016 Panu Matilainen - 2.5.0-4.git20160628 -- pull bugfixes from upstream 2.5 branch (#1346313) - -* Wed Apr 27 2016 Panu Matilainen - 2.5.0-4 -- Enable DPDK bnx2x driver (#1330589) -- Add README.DPDK-PMDS document listing drivers included in this package - -* Thu Mar 17 2016 Flavio Leitner - 2.5.0-3 -- Run testsuite by default on x86 arches (#1318786) - (this sync the spec with non-dpdk version though the testsuite - was already enabled here) - -* Thu Mar 17 2016 Panu Matilainen - 2.5.0-2 -- eliminate debuginfo-artifacts (#1281913) - -* Thu Mar 17 2016 Panu Matilainen - 2.5.0-1 -- Update to OVS to 2.5.0 and bundled DPDK to 2.2.0 (#1317889) - -* Mon Nov 23 2015 Panu Matilainen -- Provide openvswitch ver-rel (#1281894) - -* Thu Aug 13 2015 Flavio Leitner -- ExclusiveArch to 
x86_64 (dpdk) -- Provides bundled(dpdk) -- Re-enable testsuite - -* Fri Aug 07 2015 Panu Matilainen -- Enable building from pre-release snapshots, update to pre 2.4 version -- Bundle a minimal, private build of DPDK 2.0 and link statically -- Rename package to openvswitch-dpdk, conflict with regular openvswitch -- Disable all sub-packages - -* Wed Jan 12 2011 Ralf Spenneberg -- First build on F14