Blob Blame History Raw
From 4ee0f6af9e601cbb5f69a486526d1011314bbfed Mon Sep 17 00:00:00 2001
From: Ben Pfaff <blp@ovn.org>
Date: Thu, 19 Mar 2020 17:53:10 -0700
Subject: [PATCH 01/15] ofproto-dpif-xlate: Fix recirculation when in_port is
 OFPP_CONTROLLER.

[ upstream commit c5a910dd92ecbad24f86b4c59b4ff8105b5149fd ]

Recirculation usually requires finding the pre-recirculation input port.
Packets sent by the controller, with in_port of OFPP_CONTROLLER or
OFPP_NONE, do not have a real input port data structure, only a port
number.  The code in xlate_lookup_ofproto_() mishandled this case,
failing to return the ofproto data structure.  This commit fixes the
problem and adds a test to guard against regression.

Reported-by: Numan Siddique <numans@ovn.org>
Reported-at: https://mail.openvswitch.org/pipermail/ovs-dev/2020-March/368642.html
Tested-by: Numan Siddique <numans@ovn.org>
Acked-by: Numan Siddique <numans@ovn.org>
Signed-off-by: Ben Pfaff <blp@ovn.org>

Resolves: #1775160
Signed-off-by: Numan Siddique <nusiddiq@redhat.com>
---
 ofproto/ofproto-dpif-xlate.c | 25 +++++++++++++++++++++----
 tests/ofproto-dpif.at        | 30 ++++++++++++++++++++++++++++++
 2 files changed, 51 insertions(+), 4 deletions(-)

diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c
index 4407f9c97a..54cfbfbdff 100644
--- a/ofproto/ofproto-dpif-xlate.c
+++ b/ofproto/ofproto-dpif-xlate.c
@@ -1516,15 +1516,32 @@ xlate_lookup_ofproto_(const struct dpif_backer *backer,
             return NULL;
         }
 
-        /* If recirculation was initiated due to bond (in_port = OFPP_NONE)
-         * then frozen state is static and xport_uuid is not defined, so xport
-         * cannot be restored from frozen state. */
-        if (recirc_id_node->state.metadata.in_port != OFPP_NONE) {
+        ofp_port_t in_port = recirc_id_node->state.metadata.in_port;
+        if (in_port != OFPP_NONE && in_port != OFPP_CONTROLLER) {
             struct uuid xport_uuid = recirc_id_node->state.xport_uuid;
             xport = xport_lookup_by_uuid(xcfg, &xport_uuid);
             if (xport && xport->xbridge && xport->xbridge->ofproto) {
                 goto out;
             }
+        } else {
+            /* OFPP_NONE and OFPP_CONTROLLER are not real ports.  They indicate
+             * that the packet originated from the controller via an OpenFlow
+             * "packet-out".  The right thing to do is to find just the
+             * ofproto.  There is no xport, which is OK.
+             *
+             * OFPP_NONE can also indicate that a bond caused recirculation. */
+            struct uuid uuid = recirc_id_node->state.ofproto_uuid;
+            const struct xbridge *bridge = xbridge_lookup_by_uuid(xcfg, &uuid);
+            if (bridge && bridge->ofproto) {
+                if (errorp) {
+                    *errorp = NULL;
+                }
+                *xportp = NULL;
+                if (ofp_in_port) {
+                    *ofp_in_port = in_port;
+                }
+                return bridge->ofproto;
+            }
         }
     }
 
diff --git a/tests/ofproto-dpif.at b/tests/ofproto-dpif.at
index ff1cc93707..d444cf0844 100644
--- a/tests/ofproto-dpif.at
+++ b/tests/ofproto-dpif.at
@@ -5171,6 +5171,36 @@ AT_CHECK_UNQUOTED([tail -1 stdout], [0], [Datapath actions: 2
 OVS_VSWITCHD_STOP
 AT_CLEANUP
 
+# Checks for regression against a bug in which OVS dropped packets
+# with in_port=CONTROLLER when they were recirculated (because
+# CONTROLLER isn't a real port and could not be looked up).
+AT_SETUP([ofproto-dpif - packet-out recirculation])
+OVS_VSWITCHD_START
+add_of_ports br0 1 2
+
+AT_DATA([flows.txt], [dnl
+table=0 ip actions=mod_dl_dst:83:83:83:83:83:83,ct(table=1)
+table=1 ip actions=ct(commit),output:2
+])
+AT_CHECK([ovs-ofctl add-flows br0 flows.txt])
+
+packet=ffffffffffff00102030405008004500001c00000000401100000a000002ffffffff0035111100080000
+AT_CHECK([ovs-ofctl packet-out br0 "in_port=controller packet=$packet actions=table"])
+
+# Dumps out the flow table, extracts the number of packets that have gone
+# through the (single) flow in table 1, and returns success if it's exactly 1.
+#
+# If this remains 0, then the recirculation isn't working properly since the
+# packet never goes through flow in table 1.
+check_flows () {
+    n=$(ovs-ofctl dump-flows br0 table=1 | sed -n 's/.*n_packets=\([[0-9]]\{1,\}\).*/\1/p')
+    echo "n_packets=$n"
+    test "$n" = 1
+}
+OVS_WAIT_UNTIL([check_flows], [ovs dump-flows br0])
+
+OVS_VSWITCHD_STOP
+AT_CLEANUP
 
 AT_SETUP([ofproto-dpif - debug_slow action])
 OVS_VSWITCHD_START
-- 
2.25.1


From 71f25b7920093daa59827a0a4be4095309aec6ff Mon Sep 17 00:00:00 2001
From: Timothy Redaelli <tredaelli@redhat.com>
Date: Thu, 19 Mar 2020 20:05:39 +0100
Subject: [PATCH 02/15] bugtool: Fix for Python3.

Currently ovs-bugtool tool doesn't start on Python 3.
This commit fixes ovs-bugtool to make it works on Python 3.

Replaced StringIO.StringIO with io.BytesIO since the script is
processing binary data.

Reported-at: https://bugzilla.redhat.com/1809241
Reported-by: Flavio Leitner <fbl@sysclose.org>
Signed-off-by: Timothy Redaelli <tredaelli@redhat.com>
Co-authored-by: William Tu <u9012063@gmail.com>
Signed-off-by: William Tu <u9012063@gmail.com>
(cherry picked from commit 9e6c00bca9af29031d0e160d33174b7ae99b9244)
---
 utilities/bugtool/ovs-bugtool.in | 48 +++++++++++++++++---------------
 1 file changed, 25 insertions(+), 23 deletions(-)

diff --git a/utilities/bugtool/ovs-bugtool.in b/utilities/bugtool/ovs-bugtool.in
index e55bfc2ed5..47f3c4629f 100755
--- a/utilities/bugtool/ovs-bugtool.in
+++ b/utilities/bugtool/ovs-bugtool.in
@@ -33,8 +33,7 @@
 # or func_output().
 #
 
-import StringIO
-import commands
+from io import BytesIO
 import fcntl
 import getopt
 import hashlib
@@ -48,7 +47,7 @@ import warnings
 import zipfile
 from select import select
 from signal import SIGTERM
-from subprocess import PIPE, Popen
+from subprocess import PIPE, Popen, check_output
 
 from xml.dom.minidom import getDOMImplementation, parse
 
@@ -348,7 +347,7 @@ def collect_data():
             cap = v['cap']
             if 'cmd_args' in v:
                 if 'output' not in v.keys():
-                    v['output'] = StringIOmtime()
+                    v['output'] = BytesIOmtime()
                 if v['repeat_count'] > 0:
                     if cap not in process_lists:
                         process_lists[cap] = []
@@ -373,20 +372,23 @@ def collect_data():
         if 'filename' in v and v['filename'].startswith('/proc/'):
             # proc files must be read into memory
             try:
-                f = open(v['filename'], 'r')
+                f = open(v['filename'], 'rb')
                 s = f.read()
                 f.close()
                 if check_space(cap, v['filename'], len(s)):
-                    v['output'] = StringIOmtime(s)
+                    v['output'] = BytesIOmtime(s)
             except:
                 pass
         elif 'func' in v:
             try:
                 s = v['func'](cap)
             except Exception as e:
-                s = str(e)
+                s = str(e).encode()
             if check_space(cap, k, len(s)):
-                v['output'] = StringIOmtime(s)
+                if isinstance(s, str):
+                    v['output'] = BytesIOmtime(s.encode())
+                else:
+                    v['output'] = BytesIOmtime(s)
 
 
 def main(argv=None):
@@ -704,7 +706,7 @@ exclude those logs from the archive.
 
     # permit the user to filter out data
     # We cannot use iteritems, since we modify 'data' as we pass through
-    for (k, v) in sorted(data.items()):
+    for (k, v) in data.items():
         cap = v['cap']
         if 'filename' in v:
             key = k[0]
@@ -721,7 +723,7 @@ exclude those logs from the archive.
 
     # include inventory
     data['inventory.xml'] = {'cap': None,
-                        'output': StringIOmtime(make_inventory(data, subdir))}
+                        'output': BytesIOmtime(make_inventory(data, subdir))}
 
     # create archive
     if output_fd == -1:
@@ -782,7 +784,7 @@ def dump_scsi_hosts(cap):
 
 
 def module_info(cap):
-    output = StringIO.StringIO()
+    output = BytesIO()
     modules = open(PROC_MODULES, 'r')
     procs = []
 
@@ -806,7 +808,7 @@ def multipathd_topology(cap):
 
 
 def dp_list():
-    output = StringIO.StringIO()
+    output = BytesIO()
     procs = [ProcOutput([OVS_DPCTL, 'dump-dps'],
              caps[CAP_NETWORK_STATUS][MAX_TIME], output)]
 
@@ -828,7 +830,7 @@ def collect_ovsdb():
             if os.path.isfile(OPENVSWITCH_COMPACT_DB):
                 os.unlink(OPENVSWITCH_COMPACT_DB)
 
-            output = StringIO.StringIO()
+            output = BytesIO()
             max_time = 5
             procs = [ProcOutput(['ovsdb-tool', 'compact',
                                 OPENVSWITCH_CONF_DB, OPENVSWITCH_COMPACT_DB],
@@ -871,7 +873,7 @@ def fd_usage(cap):
 
 
 def dump_rdac_groups(cap):
-    output = StringIO.StringIO()
+    output = BytesIO()
     procs = [ProcOutput([MPPUTIL, '-a'], caps[cap][MAX_TIME], output)]
 
     run_procs([procs])
@@ -896,7 +898,7 @@ def load_plugins(just_capabilities=False, filter=None):
         for node in nodelist:
             if node.nodeType == node.TEXT_NODE:
                 rc += node.data
-        return rc.encode()
+        return rc
 
     def getBoolAttr(el, attr, default=False):
         ret = default
@@ -1037,7 +1039,7 @@ def make_tar(subdir, suffix, output_fd, output_file):
                     s = os.stat(v['filename'])
                     ti.mtime = s.st_mtime
                     ti.size = s.st_size
-                    tf.addfile(ti, open(v['filename']))
+                    tf.addfile(ti, open(v['filename'], 'rb'))
             except:
                 pass
     finally:
@@ -1095,12 +1097,12 @@ def make_inventory(inventory, subdir):
     s.setAttribute('date', time.strftime('%c'))
     s.setAttribute('hostname', platform.node())
     s.setAttribute('uname', ' '.join(platform.uname()))
-    s.setAttribute('uptime', commands.getoutput(UPTIME))
+    s.setAttribute('uptime', check_output(UPTIME).decode())
     document.getElementsByTagName(INVENTORY_XML_ROOT)[0].appendChild(s)
 
     map(lambda k_v: inventory_entry(document, subdir, k_v[0], k_v[1]),
         inventory.items())
-    return document.toprettyxml()
+    return document.toprettyxml().encode()
 
 
 def inventory_entry(document, subdir, k, v):
@@ -1301,7 +1303,7 @@ class ProcOutput(object):
             line = self.proc.stdout.readline()
         else:
             line = self.proc.stdout.read(self.bufsize)
-        if line == '':
+        if line == b'':
             # process exited
             self.proc.stdout.close()
             self.status = self.proc.wait()
@@ -1391,13 +1393,13 @@ def get_free_disk_space(path):
     return s.f_frsize * s.f_bfree
 
 
-class StringIOmtime(StringIO.StringIO):
-    def __init__(self, buf=''):
-        StringIO.StringIO.__init__(self, buf)
+class BytesIOmtime(BytesIO):
+    def __init__(self, buf=b''):
+        BytesIO.__init__(self, buf)
         self.mtime = time.time()
 
     def write(self, s):
-        StringIO.StringIO.write(self, s)
+        BytesIO.write(self, s)
         self.mtime = time.time()
 
 
-- 
2.25.1


From 914d885061c9f7e7e6e5f921065301e08837e122 Mon Sep 17 00:00:00 2001
From: Han Zhou <hzhou@ovn.org>
Date: Fri, 28 Feb 2020 18:07:04 -0800
Subject: [PATCH 03/15] raft-rpc: Fix message format.

[ upstream commit 78c8011f58daec41ec97440f2e42795699322742 ]

Signed-off-by: Han Zhou <hzhou@ovn.org>
Signed-off-by: Ben Pfaff <blp@ovn.org>

Resolves: #1836305
Signed-off-by: Dumitru Ceara <dceara@redhat.com>
---
 ovsdb/raft-rpc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ovsdb/raft-rpc.c b/ovsdb/raft-rpc.c
index 18c83fe9c2..dd14d81091 100644
--- a/ovsdb/raft-rpc.c
+++ b/ovsdb/raft-rpc.c
@@ -544,8 +544,8 @@ raft_format_install_snapshot_request(
     ds_put_format(s, " last_index=%"PRIu64, rq->last_index);
     ds_put_format(s, " last_term=%"PRIu64, rq->last_term);
     ds_put_format(s, " last_eid="UUID_FMT, UUID_ARGS(&rq->last_eid));
-    ds_put_cstr(s, " last_servers=");
     ds_put_format(s, " election_timer=%"PRIu64, rq->election_timer);
+    ds_put_cstr(s, " last_servers=");
 
     struct hmap servers;
     struct ovsdb_error *error =
-- 
2.25.1


From 8ff30dfee6cb075e36ed38b77695ff03321ce12b Mon Sep 17 00:00:00 2001
From: Han Zhou <hzhou@ovn.org>
Date: Fri, 28 Feb 2020 18:07:05 -0800
Subject: [PATCH 04/15] ovsdb-server: Don't disconnect clients after raft
 install_snapshot.

[ upstream commit f0c8b44c5832c36989fad78927407fc14e64ce46 ]

When "schema" field is found in read_db(), there can be two cases:
1. There is a schema change in clustered DB and the "schema" is the new one.
2. There is a install_snapshot RPC happened, which caused log compaction on the
server and the next log is just the snapshot, which always constains "schema"
field, even though the schema hasn't been changed.

The current implementation doesn't handle case 2), and always assume the schema
is changed hence disconnect all clients of the server. It can cause stability
problem when there are big number of clients connected when this happens in
a large scale environment.

Signed-off-by: Han Zhou <hzhou@ovn.org>
Signed-off-by: Ben Pfaff <blp@ovn.org>

Resolves: #1836305
Signed-off-by: Dumitru Ceara <dceara@redhat.com>
---
 ovsdb/ovsdb-server.c   |  3 ++-
 tests/ovsdb-cluster.at | 56 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 58 insertions(+), 1 deletion(-)

diff --git a/ovsdb/ovsdb-server.c b/ovsdb/ovsdb-server.c
index b6957d7300..d416f1b606 100644
--- a/ovsdb/ovsdb-server.c
+++ b/ovsdb/ovsdb-server.c
@@ -543,7 +543,8 @@ parse_txn(struct server_config *config, struct db *db,
           struct ovsdb_schema *schema, const struct json *txn_json,
           const struct uuid *txnid)
 {
-    if (schema) {
+    if (schema && (!db->db->schema || strcmp(schema->version,
+                                             db->db->schema->version))) {
         /* We're replacing the schema (and the data).  Destroy the database
          * (first grabbing its storage), then replace it with the new schema.
          * The transaction must also include the replacement data.
diff --git a/tests/ovsdb-cluster.at b/tests/ovsdb-cluster.at
index 3a0bd4579e..5b6188b96f 100644
--- a/tests/ovsdb-cluster.at
+++ b/tests/ovsdb-cluster.at
@@ -273,6 +273,62 @@ OVS_WAIT_UNTIL([ovs-appctl -t "`pwd`"/s4 cluster/status $schema_name | grep "Ele
 
 AT_CLEANUP
 
+
+AT_BANNER([OVSDB cluster install snapshot RPC])
+
+AT_SETUP([OVSDB cluster - install snapshot RPC])
+AT_KEYWORDS([ovsdb server positive unix cluster snapshot])
+
+n=3
+schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema`
+ordinal_schema > schema
+AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster s1.db $abs_srcdir/idltest.ovsschema unix:s1.raft], [0], [], [stderr])
+cid=`ovsdb-tool db-cid s1.db`
+schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema`
+for i in `seq 2 $n`; do
+    AT_CHECK([ovsdb-tool join-cluster s$i.db $schema_name unix:s$i.raft unix:s1.raft])
+done
+
+on_exit 'kill `cat *.pid`'
+for i in `seq $n`; do
+    AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off --detach --no-chdir --log-file=s$i.log --pidfile=s$i.pid --unixctl=s$i --remote=punix:s$i.ovsdb s$i.db])
+done
+for i in `seq $n`; do
+    AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected])
+done
+
+# Kill one follower (s2) and write some data to cluster, so that the follower is falling behind
+printf "\ns2: stopping\n"
+OVS_APP_EXIT_AND_WAIT_BY_TARGET([`pwd`/s2], [s2.pid])
+
+AT_CHECK([ovsdb-client transact unix:s1.ovsdb '[["idltest",
+      {"op": "insert",
+       "table": "simple",
+       "row": {"i": 1}}]]'], [0], [ignore], [ignore])
+
+# Compact leader online to generate snapshot
+AT_CHECK([ovs-appctl -t "`pwd`"/s1 ovsdb-server/compact])
+
+# Start the follower s2 again.
+AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off --detach --no-chdir --log-file=s2.log --pidfile=s2.pid --unixctl=s2 --remote=punix:s2.ovsdb s2.db])
+AT_CHECK([ovsdb_client_wait unix:s2.ovsdb $schema_name connected])
+
+# A client transaction through s2. During this transaction, there will be a
+# install_snapshot RPC because s2 detects it is behind and s1 doesn't have the
+# pre_log_index requested by s2 because it is already compacted.
+# After the install_snapshot RPC process, the transaction through s2 should
+# succeed.
+AT_CHECK([ovsdb-client transact unix:s2.ovsdb '[["idltest",
+      {"op": "insert",
+       "table": "simple",
+       "row": {"i": 1}}]]'], [0], [ignore], [ignore])
+
+for i in `seq $n`; do
+    OVS_APP_EXIT_AND_WAIT_BY_TARGET([`pwd`/s$i], [s$i.pid])
+done
+
+AT_CLEANUP
+
 
 
 OVS_START_SHELL_HELPERS
-- 
2.25.1


From e732012d7be335650398ff03c2431c64b2c4aaba Mon Sep 17 00:00:00 2001
From: Han Zhou <hzhou@ovn.org>
Date: Fri, 28 Feb 2020 18:07:06 -0800
Subject: [PATCH 05/15] raft: Fix raft_is_connected() when there is no leader
 yet.

[ upstream commit adc64ab057345f7004c44bf92363b9adda862134 ]

If there is never a leader known by the current server, it's status
should be "disconnected" to the cluster. Without this patch, when
a server in cluster is restarted, before it successfully connecting
back to the cluster it will appear as connected, which is wrong.

Signed-off-by: Han Zhou <hzhou@ovn.org>
Signed-off-by: Ben Pfaff <blp@ovn.org>

Resolves: #1836305
Signed-off-by: Dumitru Ceara <dceara@redhat.com>
---
 ovsdb/raft.c           | 10 ++++++++--
 tests/ovsdb-cluster.at | 35 +++++++++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+), 2 deletions(-)

diff --git a/ovsdb/raft.c b/ovsdb/raft.c
index 4789bc4f22..6cd7b0041a 100644
--- a/ovsdb/raft.c
+++ b/ovsdb/raft.c
@@ -298,6 +298,11 @@ struct raft {
     bool had_leader;            /* There has been leader elected since last
                                    election initiated. This is to help setting
                                    candidate_retrying. */
+
+    /* For all. */
+    bool ever_had_leader;       /* There has been leader elected since the raft
+                                   is initialized, meaning it is ever
+                                   connected. */
 };
 
 /* All Raft structures. */
@@ -1024,7 +1029,8 @@ raft_is_connected(const struct raft *raft)
             && !raft->joining
             && !raft->leaving
             && !raft->left
-            && !raft->failed);
+            && !raft->failed
+            && raft->ever_had_leader);
     VLOG_DBG("raft_is_connected: %s\n", ret? "true": "false");
     return ret;
 }
@@ -2519,7 +2525,7 @@ static void
 raft_set_leader(struct raft *raft, const struct uuid *sid)
 {
     raft->leader_sid = *sid;
-    raft->had_leader = true;
+    raft->ever_had_leader = raft->had_leader = true;
     raft->candidate_retrying = false;
 }
 
diff --git a/tests/ovsdb-cluster.at b/tests/ovsdb-cluster.at
index 5b6188b96f..0aa4564480 100644
--- a/tests/ovsdb-cluster.at
+++ b/tests/ovsdb-cluster.at
@@ -179,6 +179,41 @@ AT_KEYWORDS([ovsdb server negative unix cluster disconnect])
 ovsdb_test_cluster_disconnect 5 leader yes
 AT_CLEANUP
 
+AT_SETUP([OVSDB cluster - initial status should be disconnected])
+AT_KEYWORDS([ovsdb server negative unix cluster disconnect])
+
+n=3
+schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema`
+ordinal_schema > schema
+AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster s1.db $abs_srcdir/idltest.ovsschema unix:s1.raft], [0], [], [stderr])
+cid=`ovsdb-tool db-cid s1.db`
+schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema`
+for i in `seq 2 $n`; do
+    AT_CHECK([ovsdb-tool join-cluster s$i.db $schema_name unix:s$i.raft unix:s1.raft])
+done
+
+on_exit 'kill `cat *.pid`'
+for i in `seq $n`; do
+    AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off --detach --no-chdir --log-file=s$i.log --pidfile=s$i.pid --unixctl=s$i --remote=punix:s$i.ovsdb s$i.db])
+done
+for i in `seq $n`; do
+    AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected])
+done
+
+# Stop all servers, and start the s1 only, to test initial connection status
+# when there is no leader yet.
+for i in `seq 1 $n`; do
+    OVS_APP_EXIT_AND_WAIT_BY_TARGET([`pwd`/s$i], [s$i.pid])
+done
+i=1
+AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off --detach --no-chdir --log-file=s$i.log --pidfile=s$i.pid --unixctl=s$i --remote=punix:s$i.ovsdb s$i.db])
+
+# The initial status should be disconnected. So wait should fail.
+AT_CHECK([ovsdb_client_wait --timeout=1 unix:s$i.ovsdb $schema_name connected], [142], [ignore], [ignore])
+OVS_APP_EXIT_AND_WAIT_BY_TARGET([`pwd`/s$i], [s$i.pid])
+
+AT_CLEANUP
+
 
 
 AT_BANNER([OVSDB cluster election timer change])
-- 
2.25.1


From 053b78c8d60ffb4d212fd7894f91be52027f291f Mon Sep 17 00:00:00 2001
From: Han Zhou <hzhou@ovn.org>
Date: Fri, 28 Feb 2020 18:07:07 -0800
Subject: [PATCH 06/15] raft: Avoid busy loop during leader election.

[ upstream commit 3ae90e1899c5a05148ea1870d9bb4ac3c05e3a19 ]

When a server doesn't see a leader yet, e.g. during leader re-election,
if a transaction comes from a client, it will cause 100% CPU busy loop.
With debug log enabled it is like:

2020-02-28T04:04:35.631Z|00059|poll_loop|DBG|wakeup due to 0-ms timeout at ../ovsdb/trigger.c:164
2020-02-28T04:04:35.631Z|00062|poll_loop|DBG|wakeup due to 0-ms timeout at ../ovsdb/trigger.c:164
2020-02-28T04:04:35.631Z|00065|poll_loop|DBG|wakeup due to 0-ms timeout at ../ovsdb/trigger.c:164
2020-02-28T04:04:35.631Z|00068|poll_loop|DBG|wakeup due to 0-ms timeout at ../ovsdb/trigger.c:164
2020-02-28T04:04:35.631Z|00071|poll_loop|DBG|wakeup due to 0-ms timeout at ../ovsdb/trigger.c:164
2020-02-28T04:04:35.631Z|00074|poll_loop|DBG|wakeup due to 0-ms timeout at ../ovsdb/trigger.c:164
2020-02-28T04:04:35.631Z|00077|poll_loop|DBG|wakeup due to 0-ms timeout at ../ovsdb/trigger.c:164
...

The problem is that in ovsdb_trigger_try(), all cluster errors are treated
as temporary error and retry immediately. This patch fixes it by introducing
'run_triggers_now', which tells if a retry is needed immediately. When the
cluster error is with detail 'not leader', we don't immediately retry, but
will wait for the next poll event to trigger the retry. When 'not leader'
status changes, there must be a event, i.e. raft RPC that changes the
status, so the trigger is guaranteed to be triggered, without busy loop.

Signed-off-by: Han Zhou <hzhou@ovn.org>
Signed-off-by: Ben Pfaff <blp@ovn.org>

Resolves: #1836305
Signed-off-by: Dumitru Ceara <dceara@redhat.com>
---
 ovsdb/ovsdb.c       |  2 +-
 ovsdb/ovsdb.h       |  1 +
 ovsdb/transaction.c |  2 +-
 ovsdb/trigger.c     | 11 +++++++++--
 4 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/ovsdb/ovsdb.c b/ovsdb/ovsdb.c
index cfc96b32f8..7e683e6815 100644
--- a/ovsdb/ovsdb.c
+++ b/ovsdb/ovsdb.c
@@ -414,7 +414,7 @@ ovsdb_create(struct ovsdb_schema *schema, struct ovsdb_storage *storage)
     db->storage = storage;
     ovs_list_init(&db->monitors);
     ovs_list_init(&db->triggers);
-    db->run_triggers = false;
+    db->run_triggers_now = db->run_triggers = false;
 
     shash_init(&db->tables);
     if (schema) {
diff --git a/ovsdb/ovsdb.h b/ovsdb/ovsdb.h
index 32e5333163..5c30a83d92 100644
--- a/ovsdb/ovsdb.h
+++ b/ovsdb/ovsdb.h
@@ -83,6 +83,7 @@ struct ovsdb {
     /* Triggers. */
     struct ovs_list triggers;   /* Contains "struct ovsdb_trigger"s. */
     bool run_triggers;
+    bool run_triggers_now;
 
     struct ovsdb_table *rbac_role;
 
diff --git a/ovsdb/transaction.c b/ovsdb/transaction.c
index 369436bffb..8ffefcf7c9 100644
--- a/ovsdb/transaction.c
+++ b/ovsdb/transaction.c
@@ -967,7 +967,7 @@ ovsdb_txn_complete(struct ovsdb_txn *txn)
 {
     if (!ovsdb_txn_is_empty(txn)) {
 
-        txn->db->run_triggers = true;
+        txn->db->run_triggers_now = txn->db->run_triggers = true;
         ovsdb_monitors_commit(txn->db, txn);
         ovsdb_error_assert(for_each_txn_row(txn, ovsdb_txn_update_weak_refs));
         ovsdb_error_assert(for_each_txn_row(txn, ovsdb_txn_row_commit));
diff --git a/ovsdb/trigger.c b/ovsdb/trigger.c
index 7e62e90ae3..0372302af4 100644
--- a/ovsdb/trigger.c
+++ b/ovsdb/trigger.c
@@ -141,7 +141,7 @@ ovsdb_trigger_run(struct ovsdb *db, long long int now)
     struct ovsdb_trigger *t, *next;
 
     bool run_triggers = db->run_triggers;
-    db->run_triggers = false;
+    db->run_triggers_now = db->run_triggers = false;
 
     bool disconnect_all = false;
 
@@ -160,7 +160,7 @@ ovsdb_trigger_run(struct ovsdb *db, long long int now)
 void
 ovsdb_trigger_wait(struct ovsdb *db, long long int now)
 {
-    if (db->run_triggers) {
+    if (db->run_triggers_now) {
         poll_immediate_wake();
     } else {
         long long int deadline = LLONG_MAX;
@@ -319,9 +319,16 @@ ovsdb_trigger_try(struct ovsdb_trigger *t, long long int now)
             if (!strcmp(ovsdb_error_get_tag(error), "cluster error")) {
                 /* Temporary error.  Transition back to "initialized" state to
                  * try again. */
+                char *err_s = ovsdb_error_to_string(error);
+                VLOG_DBG("cluster error %s", err_s);
+
                 jsonrpc_msg_destroy(t->reply);
                 t->reply = NULL;
                 t->db->run_triggers = true;
+                if (!strstr(err_s, "not leader")) {
+                    t->db->run_triggers_now = true;
+                }
+                free(err_s);
                 ovsdb_error_destroy(error);
             } else {
                 /* Permanent error.  Transition to "completed" state to report
-- 
2.25.1


From cc3d02699203e2fe9d9fd384d09e268ba614828d Mon Sep 17 00:00:00 2001
From: Han Zhou <hzhou@ovn.org>
Date: Fri, 28 Feb 2020 18:07:10 -0800
Subject: [PATCH 07/15] raft: Fix next_index in install_snapshot reply
 handling.

[ upstream commit 877618fc833273d1e29e012b5e925d51cba80ff5 ]

When a leader handles install_snapshot reply, the next_index for
the follower should be log_start instead of log_end, because there
can be new entries added in leader's log after initiating the
install_snapshot procedure.  Also, it should send all the accumulated
entries to follower in the following append-request message, instead
of sending 0 entries, to speed up the converge.

Without this fix, there is no functional problem, but it takes
uncessary extra rounds of append-requests responsed with "inconsistency"
by follower, although finally will be converged.

Signed-off-by: Han Zhou <hzhou@ovn.org>
Signed-off-by: Ben Pfaff <blp@ovn.org>

Resolves: #1836305
Signed-off-by: Dumitru Ceara <dceara@redhat.com>
---
 ovsdb/raft.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/ovsdb/raft.c b/ovsdb/raft.c
index 6cd7b0041a..fa04d8c80b 100644
--- a/ovsdb/raft.c
+++ b/ovsdb/raft.c
@@ -3998,8 +3998,9 @@ raft_handle_install_snapshot_reply(
     VLOG_INFO_RL(&rl, "cluster "CID_FMT": installed snapshot on server %s "
                  " up to %"PRIu64":%"PRIu64, CID_ARGS(&raft->cid),
                  s->nickname, rpy->last_term, rpy->last_index);
-    s->next_index = raft->log_end;
-    raft_send_append_request(raft, s, 0, "snapshot installed");
+    s->next_index = raft->log_start;
+    raft_send_append_request(raft, s, raft->log_end - s->next_index,
+                             "snapshot installed");
 }
 
 /* Returns true if 'raft' has grown enough since the last snapshot that
-- 
2.25.1


From 9c76350e271546eedfeb18720975e35b4e36e1f1 Mon Sep 17 00:00:00 2001
From: Han Zhou <hzhou@ovn.org>
Date: Thu, 5 Mar 2020 23:48:45 -0800
Subject: [PATCH 08/15] raft: Fix the problem of stuck in candidate role
 forever.

[ upstream commit 25a7e5547f1e107db0f032ad269f447c57401531 ]

Sometimes a server can stay in candidate role forever, even if the server
already see the new leader and handles append-requests normally. However,
because of the wrong role, it appears as disconnected from cluster and
so the clients are disconnected.

This problem happens when 2 servers become candidates in the same
term, and one of them is elected as leader in that term. It can be
reproduced by the test cases added in this patch.

The root cause is that the current implementation only changes role to
follower when a bigger term is observed (in raft_receive_term__()).
According to the RAFT paper, if another candidate becomes leader with
the same term, the candidate should change to follower.

This patch fixes it by changing the role to follower when leader
is being updated in raft_update_leader().

Signed-off-by: Han Zhou <hzhou@ovn.org>
Signed-off-by: Ben Pfaff <blp@ovn.org>

Resolves: #1836305
Signed-off-by: Dumitru Ceara <dceara@redhat.com>
---
 ovsdb/raft.c           | 19 +++++++++++++--
 tests/ovsdb-cluster.at | 55 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 72 insertions(+), 2 deletions(-)

diff --git a/ovsdb/raft.c b/ovsdb/raft.c
index fa04d8c80b..6452182ba6 100644
--- a/ovsdb/raft.c
+++ b/ovsdb/raft.c
@@ -73,7 +73,8 @@ enum raft_failure_test {
     FT_CRASH_BEFORE_SEND_EXEC_REQ,
     FT_CRASH_AFTER_SEND_EXEC_REQ,
     FT_CRASH_AFTER_RECV_APPEND_REQ_UPDATE,
-    FT_DELAY_ELECTION
+    FT_DELAY_ELECTION,
+    FT_DONT_SEND_VOTE_REQUEST
 };
 static enum raft_failure_test failure_test;
 
@@ -1647,6 +1648,7 @@ raft_start_election(struct raft *raft, bool leadership_transfer)
     }
 
     ovs_assert(raft->role != RAFT_LEADER);
+
     raft->role = RAFT_CANDIDATE;
     /* If there was no leader elected since last election, we know we are
      * retrying now. */
@@ -1690,7 +1692,9 @@ raft_start_election(struct raft *raft, bool leadership_transfer)
                 .leadership_transfer = leadership_transfer,
             },
         };
-        raft_send(raft, &rq);
+        if (failure_test != FT_DONT_SEND_VOTE_REQUEST) {
+            raft_send(raft, &rq);
+        }
     }
 
     /* Vote for ourselves. */
@@ -2966,6 +2970,15 @@ raft_update_leader(struct raft *raft, const struct uuid *sid)
         };
         ignore(ovsdb_log_write_and_free(raft->log, raft_record_to_json(&r)));
     }
+    if (raft->role == RAFT_CANDIDATE) {
+        /* Section 3.4: While waiting for votes, a candidate may
+         * receive an AppendEntries RPC from another server claiming to
+         * be leader. If the leader’s term (included in its RPC) is at
+         * least as large as the candidate’s current term, then the
+         * candidate recognizes the leader as legitimate and returns to
+         * follower state. */
+        raft->role = RAFT_FOLLOWER;
+    }
     return true;
 }
 
@@ -4674,6 +4687,8 @@ raft_unixctl_failure_test(struct unixctl_conn *conn OVS_UNUSED,
                 raft_reset_election_timer(raft);
             }
         }
+    } else if (!strcmp(test, "dont-send-vote-request")) {
+        failure_test = FT_DONT_SEND_VOTE_REQUEST;
     } else if (!strcmp(test, "clear")) {
         failure_test = FT_NO_TEST;
         unixctl_command_reply(conn, "test dismissed");
diff --git a/tests/ovsdb-cluster.at b/tests/ovsdb-cluster.at
index 0aa4564480..9714545151 100644
--- a/tests/ovsdb-cluster.at
+++ b/tests/ovsdb-cluster.at
@@ -527,6 +527,61 @@ AT_KEYWORDS([ovsdb server negative unix cluster pending-txn])
 ovsdb_cluster_failure_test 2 2 3 crash-after-receiving-append-request-update
 AT_CLEANUP
 
+
+AT_SETUP([OVSDB cluster - competing candidates])
+AT_KEYWORDS([ovsdb server negative unix cluster competing-candidates])
+
+n=3
+schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema`
+ordinal_schema > schema
+AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster s1.db $abs_srcdir/idltest.ovsschema unix:s1.raft], [0], [], [stderr])
+cid=`ovsdb-tool db-cid s1.db`
+schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema`
+for i in `seq 2 $n`; do
+    AT_CHECK([ovsdb-tool join-cluster s$i.db $schema_name unix:s$i.raft unix:s1.raft])
+done
+
+on_exit 'kill `cat *.pid`'
+for i in `seq $n`; do
+    AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off --detach --no-chdir --log-file=s$i.log --pidfile=s$i.pid --unixctl=s$i --remote=punix:s$i.ovsdb s$i.db])
+done
+for i in `seq $n`; do
+    AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected])
+done
+
+# We need to simulate the situation when 2 candidates starts election with same
+# term.
+#
+# Before triggering leader election, tell follower s2 don't send vote request (simulating
+# vote-request lost or not handled in time), and tell follower s3 to delay
+# election timer to make sure s3 doesn't send vote-request before s2 enters
+# term 2.
+AT_CHECK([ovs-appctl -t "`pwd`"/s2 cluster/failure-test dont-send-vote-request], [0], [ignore])
+AT_CHECK([ovs-appctl -t "`pwd`"/s3 cluster/failure-test delay-election], [0], [ignore])
+
+# Restart leader, which will become follower, and both old followers will start
+# election as candidate. The new follower (old leader) will vote one of them,
+# and the other candidate should step back as follower as again.
+kill -9 `cat s1.pid`
+AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off --detach --no-chdir --log-file=s1.log --pidfile=s1.pid --unixctl=s1 --remote=punix:s1.ovsdb s1.db])
+
+# Tell s1 to delay election timer so that it won't start election before s3
+# becomes candidate.
+AT_CHECK([ovs-appctl -t "`pwd`"/s1 cluster/failure-test delay-election], [0], [ignore])
+
+OVS_WAIT_UNTIL([ovs-appctl -t "`pwd`"/s1 cluster/status $schema_name | grep "Term: 2"])
+
+for i in `seq $n`; do
+    OVS_WAIT_WHILE([ovs-appctl -t "`pwd`"/s$i cluster/status $schema_name | grep "candidate"])
+    AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected])
+done
+
+for i in `seq $n`; do
+    OVS_APP_EXIT_AND_WAIT_BY_TARGET([`pwd`/s$i], [s$i.pid])
+done
+
+AT_CLEANUP
+
 
 AT_BANNER([OVSDB - cluster tests])
 
-- 
2.25.1


From 5c38ccd52fb3925e82eda20f1897ec02abb390d9 Mon Sep 17 00:00:00 2001
From: Ilya Maximets <i.maximets@ovn.org>
Date: Mon, 4 May 2020 21:55:41 +0200
Subject: [PATCH 09/15] raft: Fix leak of the incomplete command.

[ upstream commit 168beb87ca63056e8896b09a60031565b7b60728 ]

Function raft_command_initiate() returns correctly referenced command
instance.  'n_ref' equals 1 for complete commands and 2 for incomplete
commands because one more reference is in raft->commands list.
raft_handle_execute_command_request__() leaks the reference by not
returning pointer anywhere and not unreferencing incomplete commands.

 792 bytes in 11 blocks are definitely lost in loss record 258 of 262
    at 0x483BB1A: calloc (vg_replace_malloc.c:762)
    by 0x44BA32: xcalloc (util.c:121)
    by 0x422E5F: raft_command_create_incomplete (raft.c:2038)
    by 0x422E5F: raft_command_initiate (raft.c:2061)
    by 0x428651: raft_handle_execute_command_request__ (raft.c:4161)
    by 0x428651: raft_handle_execute_command_request (raft.c:4177)
    by 0x428651: raft_handle_rpc (raft.c:4230)
    by 0x428651: raft_conn_run (raft.c:1445)
    by 0x428DEA: raft_run (raft.c:1803)
    by 0x407392: main_loop (ovsdb-server.c:226)
    by 0x407392: main (ovsdb-server.c:469)

Fixes: 1b1d2e6daa56 ("ovsdb: Introduce experimental support for clustered databases.")
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
Acked-by: Han Zhou <hzhou@ovn.org>
Signed-off-by: William Tu <u9012063@gmail.com>

Resolves: #1836307
Signed-off-by: Dumitru Ceara <dceara@redhat.com>
---
 ovsdb/raft.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/ovsdb/raft.c b/ovsdb/raft.c
index 6452182ba6..1505814138 100644
--- a/ovsdb/raft.c
+++ b/ovsdb/raft.c
@@ -4163,9 +4163,7 @@ raft_handle_execute_command_request__(
     cmd->sid = rq->common.sid;
 
     enum raft_command_status status = cmd->status;
-    if (status != RAFT_CMD_INCOMPLETE) {
-        raft_command_unref(cmd);
-    }
+    raft_command_unref(cmd);
     return status;
 }
 
-- 
2.25.1


From 3d9b529afb098531190d57d6f35d1622bb4093cd Mon Sep 17 00:00:00 2001
From: Zhen Wang <zhewang@nvidia.com>
Date: Mon, 30 Mar 2020 17:21:04 -0700
Subject: [PATCH 10/15] raft: Disable RAFT jsonrpc inactivity probe.

[ upstream commit 1600e0040caded7eaa9b1f41926f9619d8e0ec8d ]

With the scale test of 640 nodes k8s cluster, raft DB nodes' jsonrpc
session got closed due to the timeout of default 5 seconds probe.
It will cause disturbance of the raft cluster. Since we already have
the heartbeat for RAFT, just disable the probe between the servers
to avoid the unnecessary jsonrpc inactivity probe.

Acked-by: Han Zhou <hzhou@ovn.org>
Signed-off-by: Zhen Wang <zhewang@nvidia.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>

Resolves: #1836308
Signed-off-by: Dumitru Ceara <dceara@redhat.com>
---
 ovsdb/raft.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ovsdb/raft.c b/ovsdb/raft.c
index 1505814138..395cc56113 100644
--- a/ovsdb/raft.c
+++ b/ovsdb/raft.c
@@ -938,6 +938,7 @@ raft_add_conn(struct raft *raft, struct jsonrpc_session *js,
                                               &conn->sid);
     conn->incoming = incoming;
     conn->js_seqno = jsonrpc_session_get_seqno(conn->js);
+    jsonrpc_session_set_probe_interval(js, 0);
 }
 
 /* Starts the local server in an existing Raft cluster, using the local copy of
-- 
2.25.1


From 8b155475749cdb7a1817810d447e4cf6598cb6fa Mon Sep 17 00:00:00 2001
From: Aaron Conole <aconole@redhat.com>
Date: Fri, 15 May 2020 16:36:18 -0400
Subject: [PATCH 11/15] netdev-linux: Update LAG in all cases.

In some cases, when processing a netlink change event, it's possible for
an alternate part of OvS (like the IPv6 endpoint processing) to hold an
active netdev interface.  This creates a race-condition, where sometimes
the OvS change processing will take the normal path.  This doesn't work
because the netdev device object won't actually be enslaved to the
ovs-system (for instance, a linux bond) and ingress qdisc entries will
be missing.

To address this, we update the LAG information in ALL cases where
LAG information could come in.

Fixes: d22f8927c3c9 ("netdev-linux: monitor and offload LAG slaves to TC")
Cc: Marcelo Leitner <mleitner@redhat.com>
Cc: John Hurley <john.hurley@netronome.com>
Acked-by: Roi Dayan <roid@mellanox.com>
Signed-off-by: Aaron Conole <aconole@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
---
 lib/netdev-linux.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
index c6f3d27409..2bf8d4c477 100644
--- a/lib/netdev-linux.c
+++ b/lib/netdev-linux.c
@@ -659,10 +659,6 @@ netdev_linux_update_lag(struct rtnetlink_change *change)
 {
     struct linux_lag_slave *lag;
 
-    if (!rtnetlink_type_is_rtnlgrp_link(change->nlmsg_type)) {
-        return;
-    }
-
     if (change->slave && netdev_linux_kind_is_lag(change->slave)) {
         lag = shash_find_data(&lag_shash, change->ifname);
 
@@ -760,8 +756,11 @@ netdev_linux_run(const struct netdev_class *netdev_class OVS_UNUSED)
                     netdev_linux_update(netdev, nsid, &change);
                     ovs_mutex_unlock(&netdev->mutex);
                 }
-                else if (!netdev_ && change.ifname) {
-                    /* Netdev is not present in OvS but its master could be. */
+
+                if (change.ifname &&
+                    rtnetlink_type_is_rtnlgrp_link(change.nlmsg_type)) {
+
+                    /* Need to try updating the LAG information. */
                     ovs_mutex_lock(&lag_mutex);
                     netdev_linux_update_lag(&change);
                     ovs_mutex_unlock(&lag_mutex);
-- 
2.25.1


From d14e39f81bec29064a58df0177ce457765305f8b Mon Sep 17 00:00:00 2001
From: Aaron Conole <aconole@redhat.com>
Date: Fri, 15 May 2020 16:36:19 -0400
Subject: [PATCH 12/15] netdev-offload-tc: Re-fetch block ID after probing.

It's possible that block_id could changes after the probe for block
support.  Therefore, fetch the block_id again after the probe.

Fixes: edc2055a2bf7 ("netdev-offload-tc: Flush rules on ingress block when init tc flow api")
Cc: Dmytro Linkin <dmitrolin@mellanox.com>
Acked-by: Roi Dayan <roid@mellanox.com>
Co-authored-by: Marcelo Leitner <mleitner@redhat.com>
Signed-off-by: Marcelo Leitner <mleitner@redhat.com>
Signed-off-by: Aaron Conole <aconole@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
---
 lib/netdev-offload-tc.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lib/netdev-offload-tc.c b/lib/netdev-offload-tc.c
index 550e440b3a..f577311aec 100644
--- a/lib/netdev-offload-tc.c
+++ b/lib/netdev-offload-tc.c
@@ -1922,6 +1922,8 @@ netdev_tc_init_flow_api(struct netdev *netdev)
 
     if (ovsthread_once_start(&block_once)) {
         probe_tc_block_support(ifindex);
+        /* Need to re-fetch block id as it depends on feature availability. */
+        block_id = get_block_id_from_netdev(netdev);
         ovsthread_once_done(&block_once);
     }
 
-- 
2.25.1


From fb32a78921e50b1ffa0c52f873167f68622e8723 Mon Sep 17 00:00:00 2001
From: Ilya Maximets <i.maximets@ovn.org>
Date: Fri, 22 May 2020 18:31:19 +0200
Subject: [PATCH 13/15] ovsdb: Add raft memory usage to memory report.

[ upstream commit 3423cd97f88fe6a8de8b649d79fe6ac83bce94d1 ]

Memory reports could be found in logs or by calling 'memory/show'
appctl command.  For ovsdb-server it includes information about db
cells, monitor connections with their backlog size, etc.  But it
doesn't contain any information about memory consumed by raft.
Backlogs of raft connections could be insanely large because of
snapshot installation requests that simply contains the whole database.
In not that healthy clusters where one of ovsdb servers is not able to
timely handle all the incoming raft traffic, backlog on a sender's side
could cause significant memory consumption issues.

Adding new 'raft-connections' and 'raft-backlog' counters to the
memory report to better track such conditions.

Acked-by: Han Zhou <hzhou@ovn.org>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>

Related: #1834838
Signed-off-by: Ilya Maximets <i.maximets@redhat.com>
---
 ovsdb/ovsdb.c   |  4 ++++
 ovsdb/raft.c    | 16 ++++++++++++++++
 ovsdb/raft.h    |  2 ++
 ovsdb/storage.c | 10 ++++++++++
 ovsdb/storage.h |  3 +++
 5 files changed, 35 insertions(+)

diff --git a/ovsdb/ovsdb.c b/ovsdb/ovsdb.c
index 7e683e6815..2da117cb36 100644
--- a/ovsdb/ovsdb.c
+++ b/ovsdb/ovsdb.c
@@ -502,6 +502,10 @@ ovsdb_get_memory_usage(const struct ovsdb *db, struct simap *usage)
     }
 
     simap_increase(usage, "cells", cells);
+
+    if (db->storage) {
+        ovsdb_storage_get_memory_usage(db->storage, usage);
+    }
 }
 
 struct ovsdb_table *
diff --git a/ovsdb/raft.c b/ovsdb/raft.c
index 395cc56113..6ca63b4352 100644
--- a/ovsdb/raft.c
+++ b/ovsdb/raft.c
@@ -36,6 +36,7 @@
 #include "ovsdb/log.h"
 #include "raft-rpc.h"
 #include "random.h"
+#include "simap.h"
 #include "socket-util.h"
 #include "stream.h"
 #include "timeval.h"
@@ -1014,6 +1015,21 @@ raft_get_sid(const struct raft *raft)
     return &raft->sid;
 }
 
+/* Adds memory consumption info to 'usage' for later use by memory_report(). */
+void
+raft_get_memory_usage(const struct raft *raft, struct simap *usage)
+{
+    struct raft_conn *conn;
+    int cnt = 0;
+
+    LIST_FOR_EACH (conn, list_node, &raft->conns) {
+        simap_increase(usage, "raft-backlog",
+                       jsonrpc_session_get_backlog(conn->js));
+        cnt++;
+    }
+    simap_increase(usage, "raft-connections", cnt);
+}
+
 /* Returns true if 'raft' has completed joining its cluster, has not left or
  * initiated leaving the cluster, does not have failed disk storage, and is
  * apparently connected to the leader in a healthy way (or is itself the
diff --git a/ovsdb/raft.h b/ovsdb/raft.h
index 3d448995af..99d5307e54 100644
--- a/ovsdb/raft.h
+++ b/ovsdb/raft.h
@@ -67,6 +67,7 @@
 struct json;
 struct ovsdb_log;
 struct raft;
+struct simap;
 struct sset;
 
 #define RAFT_MAGIC "CLUSTER"
@@ -113,6 +114,7 @@ const struct uuid *raft_get_cid(const struct raft *);
 const struct uuid *raft_get_sid(const struct raft *);
 bool raft_is_connected(const struct raft *);
 bool raft_is_leader(const struct raft *);
+void raft_get_memory_usage(const struct raft *, struct simap *usage);
 
 /* Joining a cluster. */
 bool raft_is_joining(const struct raft *);
diff --git a/ovsdb/storage.c b/ovsdb/storage.c
index e26252b066..7b4ad16f60 100644
--- a/ovsdb/storage.c
+++ b/ovsdb/storage.c
@@ -26,6 +26,7 @@
 #include "ovsdb.h"
 #include "raft.h"
 #include "random.h"
+#include "simap.h"
 #include "timeval.h"
 #include "util.h"
 
@@ -188,6 +189,15 @@ ovsdb_storage_get_applied_index(const struct ovsdb_storage *storage)
     return storage->raft ? raft_get_applied_index(storage->raft) : 0;
 }
 
+void
+ovsdb_storage_get_memory_usage(const struct ovsdb_storage *storage,
+                               struct simap *usage)
+{
+    if (storage->raft) {
+        raft_get_memory_usage(storage->raft, usage);
+    }
+}
+
 void
 ovsdb_storage_run(struct ovsdb_storage *storage)
 {
diff --git a/ovsdb/storage.h b/ovsdb/storage.h
index 8a9bbab709..a223968912 100644
--- a/ovsdb/storage.h
+++ b/ovsdb/storage.h
@@ -23,6 +23,7 @@
 struct json;
 struct ovsdb_schema;
 struct ovsdb_storage;
+struct simap;
 struct uuid;
 
 struct ovsdb_error *ovsdb_storage_open(const char *filename, bool rw,
@@ -39,6 +40,8 @@ bool ovsdb_storage_is_leader(const struct ovsdb_storage *);
 const struct uuid *ovsdb_storage_get_cid(const struct ovsdb_storage *);
 const struct uuid *ovsdb_storage_get_sid(const struct ovsdb_storage *);
 uint64_t ovsdb_storage_get_applied_index(const struct ovsdb_storage *);
+void ovsdb_storage_get_memory_usage(const struct ovsdb_storage *,
+                                    struct simap *usage);
 
 void ovsdb_storage_run(struct ovsdb_storage *);
 void ovsdb_storage_wait(struct ovsdb_storage *);
-- 
2.25.1


From 92a1e56c8a37927441fb1742e6054a9118654ef0 Mon Sep 17 00:00:00 2001
From: Ilya Maximets <i.maximets@ovn.org>
Date: Thu, 14 May 2020 22:10:45 +0200
Subject: [PATCH 14/15] ovsdb-server: Fix schema leak while reading db.

[ upstream commit 16e3a80cf646f6c53d22ef98599d5aecb8310414 ]

parse_txn() function doesn't always take ownership of the 'schema'
passed.  So, if the schema of the clustered db has same version as the
one that already in use, parse_txn() will not use it, resulting with a
memory leak:

 7,827 (56 direct, 7,771 indirect) bytes in 1 blocks are definitely lost
    at 0x483BB1A: calloc (vg_replace_malloc.c:762)
    by 0x44AD02: xcalloc (util.c:121)
    by 0x40E70E: ovsdb_schema_create (ovsdb.c:41)
    by 0x40EA6D: ovsdb_schema_from_json (ovsdb.c:217)
    by 0x415EDD: ovsdb_storage_read (storage.c:280)
    by 0x408968: read_db (ovsdb-server.c:607)
    by 0x40733D: main_loop (ovsdb-server.c:227)
    by 0x40733D: main (ovsdb-server.c:469)

While we could put ovsdb_schema_destroy() in a few places inside
'parse_txn()', from the users' point of view it seems better to have a
constant argument and just clone the 'schema' if needed.  The caller
will be responsible for destroying the 'schema' it owns.

Fixes: 1b1d2e6daa56 ("ovsdb: Introduce experimental support for clustered databases.")
Acked-by: Han Zhou <hzhou@ovn.org>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>

Related: #1834838
Signed-off-by: Ilya Maximets <i.maximets@redhat.com>
---
 ovsdb/ovsdb-server.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/ovsdb/ovsdb-server.c b/ovsdb/ovsdb-server.c
index d416f1b606..ef4e996df2 100644
--- a/ovsdb/ovsdb-server.c
+++ b/ovsdb/ovsdb-server.c
@@ -540,7 +540,7 @@ close_db(struct server_config *config, struct db *db, char *comment)
 
 static struct ovsdb_error * OVS_WARN_UNUSED_RESULT
 parse_txn(struct server_config *config, struct db *db,
-          struct ovsdb_schema *schema, const struct json *txn_json,
+          const struct ovsdb_schema *schema, const struct json *txn_json,
           const struct uuid *txnid)
 {
     if (schema && (!db->db->schema || strcmp(schema->version,
@@ -565,7 +565,7 @@ parse_txn(struct server_config *config, struct db *db,
              ? xasprintf("database %s schema changed", db->db->name)
              : xasprintf("database %s connected to storage", db->db->name)));
 
-        ovsdb_replace(db->db, ovsdb_create(schema, NULL));
+        ovsdb_replace(db->db, ovsdb_create(ovsdb_schema_clone(schema), NULL));
 
         /* Force update to schema in _Server database. */
         db->row_uuid = UUID_ZERO;
@@ -614,6 +614,7 @@ read_db(struct server_config *config, struct db *db)
         } else {
             error = parse_txn(config, db, schema, txn_json, &txnid);
             json_destroy(txn_json);
+            ovsdb_schema_destroy(schema);
             if (error) {
                 break;
             }
-- 
2.25.1


From 3168eba559cbce28937be4e785c3337030694455 Mon Sep 17 00:00:00 2001
From: Ilya Maximets <i.maximets@ovn.org>
Date: Fri, 22 May 2020 22:36:27 +0200
Subject: [PATCH 15/15] raft: Avoid sending equal snapshots.

[ upstream commit 8c2c503bdb0da1ce6044a53d462f905fd4f8acf5 ]

Snapshots are huge.  In some cases we could receive several outdated
append replies from the remote server.  This could happen in high
scale cases if the remote server is overloaded and not able to process
all the raft requests in time.  As an action to each outdated append
reply we're sending full database snapshot.  While remote server is
already overloaded those snapshots will stuck in jsonrpc backlog for
a long time making it grow up to few GB.  Since remote server wasn't
able to timely process incoming messages it will likely not able to
process snapshots leading to the same situation with low chances to
recover.  Remote server will likely stuck in 'candidate' state, other
servers will grow their memory consumption due to growing jsonrpc
backlogs:

jsonrpc|INFO|excessive sending backlog, jsonrpc: ssl:192.16.0.3:6644,
             num of msgs: 3795, backlog: 8838994624.

This patch is trying to avoid that situation by avoiding sending of
equal snapshot install requests.  This helps maintain reasonable memory
consumption and allows the cluster to recover on a larger scale.

Acked-by: Han Zhou <hzhou@ovn.org>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>

Related: #1834838
Signed-off-by: Ilya Maximets <i.maximets@redhat.com>
---
 ovsdb/raft-private.c |  1 +
 ovsdb/raft-private.h |  4 ++++
 ovsdb/raft.c         | 39 ++++++++++++++++++++++++++++++++++++++-
 3 files changed, 43 insertions(+), 1 deletion(-)

diff --git a/ovsdb/raft-private.c b/ovsdb/raft-private.c
index 26d39a087f..9468fdaf4a 100644
--- a/ovsdb/raft-private.c
+++ b/ovsdb/raft-private.c
@@ -137,6 +137,7 @@ raft_server_destroy(struct raft_server *s)
     if (s) {
         free(s->address);
         free(s->nickname);
+        free(s->last_install_snapshot_request);
         free(s);
     }
 }
diff --git a/ovsdb/raft-private.h b/ovsdb/raft-private.h
index ac8656d42f..1f366b4ab3 100644
--- a/ovsdb/raft-private.h
+++ b/ovsdb/raft-private.h
@@ -27,6 +27,7 @@
 
 struct ds;
 struct ovsdb_parser;
+struct raft_install_snapshot_request;
 
 /* Formatting server IDs and cluster IDs for use in human-readable logs.  Do
  * not use these in cases where the whole server or cluster ID is needed; use
@@ -83,6 +84,9 @@ struct raft_server {
     bool replied;            /* Reply to append_request was received from this
                                 node during current election_timeout interval.
                                 */
+    /* Copy of the last install_snapshot_request sent to this server. */
+    struct raft_install_snapshot_request *last_install_snapshot_request;
+
     /* For use in adding and removing servers: */
     struct uuid requester_sid;  /* Nonzero if requested via RPC. */
     struct unixctl_conn *requester_conn; /* Only if requested via unixctl. */
diff --git a/ovsdb/raft.c b/ovsdb/raft.c
index 6ca63b4352..8df386fa19 100644
--- a/ovsdb/raft.c
+++ b/ovsdb/raft.c
@@ -1421,8 +1421,20 @@ raft_conn_run(struct raft *raft, struct raft_conn *conn)
     jsonrpc_session_run(conn->js);
 
     unsigned int new_seqno = jsonrpc_session_get_seqno(conn->js);
-    bool just_connected = (new_seqno != conn->js_seqno
+    bool reconnected = new_seqno != conn->js_seqno;
+    bool just_connected = (reconnected
                            && jsonrpc_session_is_connected(conn->js));
+
+    if (reconnected) {
+        /* Clear 'last_install_snapshot_request' since it might not reach the
+         * destination or server was restarted. */
+        struct raft_server *server = raft_find_server(raft, &conn->sid);
+        if (server) {
+            free(server->last_install_snapshot_request);
+            server->last_install_snapshot_request = NULL;
+        }
+    }
+
     conn->js_seqno = new_seqno;
     if (just_connected) {
         if (raft->joining) {
@@ -3296,6 +3308,31 @@ raft_send_install_snapshot_request(struct raft *raft,
             .election_timer = raft->election_timer, /* use latest value */
         }
     };
+
+    if (s->last_install_snapshot_request) {
+        struct raft_install_snapshot_request *old, *new;
+
+        old = s->last_install_snapshot_request;
+        new = &rpc.install_snapshot_request;
+        if (   old->term           == new->term
+            && old->last_index     == new->last_index
+            && old->last_term      == new->last_term
+            && old->last_servers   == new->last_servers
+            && old->data           == new->data
+            && old->election_timer == new->election_timer
+            && uuid_equals(&old->last_eid, &new->last_eid)) {
+            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
+
+            VLOG_WARN_RL(&rl, "not sending exact same install_snapshot_request"
+                              " to server %s again", s->nickname);
+            return;
+        }
+    }
+    free(s->last_install_snapshot_request);
+    CONST_CAST(struct raft_server *, s)->last_install_snapshot_request
+        = xmemdup(&rpc.install_snapshot_request,
+                  sizeof rpc.install_snapshot_request);
+
     raft_send(raft, &rpc);
 }
 
-- 
2.25.1

diff --git a/dpdk/drivers/bus/pci/linux/pci_vfio.c b/dpdk/drivers/bus/pci/linux/pci_vfio.c
index 64cd84a689..ba60e7ce99 100644
--- a/dpdk/drivers/bus/pci/linux/pci_vfio.c
+++ b/dpdk/drivers/bus/pci/linux/pci_vfio.c
@@ -149,6 +149,38 @@ pci_vfio_get_msix_bar(int fd, struct pci_msix_table *msix_table)
 	return 0;
 }
 
+/* enable PCI bus memory space */
+static int
+pci_vfio_enable_bus_memory(int dev_fd)
+{
+	uint16_t cmd;
+	int ret;
+
+	ret = pread64(dev_fd, &cmd, sizeof(cmd),
+		      VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
+		      PCI_COMMAND);
+
+	if (ret != sizeof(cmd)) {
+		RTE_LOG(ERR, EAL, "Cannot read command from PCI config space!\n");
+		return -1;
+	}
+
+	if (cmd & PCI_COMMAND_MEMORY)
+		return 0;
+
+	cmd |= PCI_COMMAND_MEMORY;
+	ret = pwrite64(dev_fd, &cmd, sizeof(cmd),
+		       VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
+		       PCI_COMMAND);
+
+	if (ret != sizeof(cmd)) {
+		RTE_LOG(ERR, EAL, "Cannot write command to PCI config space!\n");
+		return -1;
+	}
+
+	return 0;
+}
+
 /* set PCI bus mastering */
 static int
 pci_vfio_set_bus_master(int dev_fd, bool op)
@@ -427,6 +459,11 @@ pci_rte_vfio_setup_device(struct rte_pci_device *dev, int vfio_dev_fd)
 		return -1;
 	}
 
+	if (pci_vfio_enable_bus_memory(vfio_dev_fd)) {
+		RTE_LOG(ERR, EAL, "Cannot enable bus memory!\n");
+		return -1;
+	}
+
 	/* set bus mastering for the device */
 	if (pci_vfio_set_bus_master(vfio_dev_fd, true)) {
 		RTE_LOG(ERR, EAL, "Cannot set up bus mastering!\n");
diff --git a/dpdk/lib/librte_vhost/vhost_user.c b/dpdk/lib/librte_vhost/vhost_user.c
index 40c4520c08..8954f7930e 100644
--- a/dpdk/lib/librte_vhost/vhost_user.c
+++ b/dpdk/lib/librte_vhost/vhost_user.c
@@ -206,7 +206,7 @@ vhost_backend_cleanup(struct virtio_net *dev)
 			dev->inflight_info->addr = NULL;
 		}
 
-		if (dev->inflight_info->fd > 0) {
+		if (dev->inflight_info->fd >= 0) {
 			close(dev->inflight_info->fd);
 			dev->inflight_info->fd = -1;
 		}
@@ -1408,6 +1408,7 @@ vhost_user_get_inflight_fd(struct virtio_net **pdev,
 				"failed to alloc dev inflight area\n");
 			return RTE_VHOST_MSG_RESULT_ERR;
 		}
+		dev->inflight_info->fd = -1;
 	}
 
 	num_queues = msg->payload.inflight.num_queues;
@@ -1433,6 +1434,16 @@ vhost_user_get_inflight_fd(struct virtio_net **pdev,
 	}
 	memset(addr, 0, mmap_size);
 
+	if (dev->inflight_info->addr) {
+		munmap(dev->inflight_info->addr, dev->inflight_info->size);
+		dev->inflight_info->addr = NULL;
+	}
+
+	if (dev->inflight_info->fd >= 0) {
+		close(dev->inflight_info->fd);
+		dev->inflight_info->fd = -1;
+	}
+
 	dev->inflight_info->addr = addr;
 	dev->inflight_info->size = msg->payload.inflight.mmap_size = mmap_size;
 	dev->inflight_info->fd = msg->fds[0] = fd;
@@ -1515,10 +1526,13 @@ vhost_user_set_inflight_fd(struct virtio_net **pdev, VhostUserMsg *msg,
 				"failed to alloc dev inflight area\n");
 			return RTE_VHOST_MSG_RESULT_ERR;
 		}
+		dev->inflight_info->fd = -1;
 	}
 
-	if (dev->inflight_info->addr)
+	if (dev->inflight_info->addr) {
 		munmap(dev->inflight_info->addr, dev->inflight_info->size);
+		dev->inflight_info->addr = NULL;
+	}
 
 	addr = mmap(0, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
 		    fd, mmap_offset);
@@ -1527,8 +1541,10 @@ vhost_user_set_inflight_fd(struct virtio_net **pdev, VhostUserMsg *msg,
 		return RTE_VHOST_MSG_RESULT_ERR;
 	}
 
-	if (dev->inflight_info->fd)
+	if (dev->inflight_info->fd >= 0) {
 		close(dev->inflight_info->fd);
+		dev->inflight_info->fd = -1;
+	}
 
 	dev->inflight_info->fd = fd;
 	dev->inflight_info->addr = addr;
@@ -2059,10 +2075,10 @@ vhost_user_set_log_base(struct virtio_net **pdev, struct VhostUserMsg *msg,
 	size = msg->payload.log.mmap_size;
 	off  = msg->payload.log.mmap_offset;
 
-	/* Don't allow mmap_offset to point outside the mmap region */
-	if (off > size) {
+	/* Check for mmap size and offset overflow. */
+	if (off >= -size) {
 		RTE_LOG(ERR, VHOST_CONFIG,
-			"log offset %#"PRIx64" exceeds log size %#"PRIx64"\n",
+			"log offset %#"PRIx64" and log size %#"PRIx64" overflow\n",
 			off, size);
 		return RTE_VHOST_MSG_RESULT_ERR;
 	}
@@ -2526,7 +2542,7 @@ static int
 vhost_user_check_and_alloc_queue_pair(struct virtio_net *dev,
 			struct VhostUserMsg *msg)
 {
-	uint16_t vring_idx;
+	uint32_t vring_idx;
 
 	switch (msg->request.master) {
 	case VHOST_USER_SET_VRING_KICK:
diff --git a/dpdk/lib/librte_vhost/virtio_net.c b/dpdk/lib/librte_vhost/virtio_net.c
index ac2842b2d2..33f10258cf 100644
--- a/dpdk/lib/librte_vhost/virtio_net.c
+++ b/dpdk/lib/librte_vhost/virtio_net.c
@@ -1086,6 +1086,8 @@ virtio_dev_rx_batch_packed(struct virtio_net *dev,
 						  VHOST_ACCESS_RW);
 
 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		if (unlikely(!desc_addrs[i]))
+			return -1;
 		if (unlikely(lens[i] != descs[avail_idx + i].len))
 			return -1;
 	}
@@ -1841,6 +1843,8 @@ vhost_reserve_avail_batch_packed(struct virtio_net *dev,
 	}
 
 	vhost_for_each_try_unroll(i, 0, PACKED_BATCH_SIZE) {
+		if (unlikely(!desc_addrs[i]))
+			return -1;
 		if (unlikely((lens[i] != descs[avail_idx + i].len)))
 			return -1;
 	}