diff --git a/SOURCES/openvswitch-2.16.0.patch b/SOURCES/openvswitch-2.16.0.patch
index a39d807..eb54fbe 100644
--- a/SOURCES/openvswitch-2.16.0.patch
+++ b/SOURCES/openvswitch-2.16.0.patch
@@ -3591,7 +3591,7 @@ index a69e37e5c2..48c6df511f 100644
  
  /* On disk data serialization and deserialization. */
 diff --git a/ovsdb/raft.c b/ovsdb/raft.c
-index 2fb5156519..ce40c5bc07 100644
+index 2fb5156519..1a3447a8dd 100644
 --- a/ovsdb/raft.c
 +++ b/ovsdb/raft.c
 @@ -494,11 +494,11 @@ raft_create_cluster(const char *file_name, const char *name,
@@ -3832,6 +3832,15 @@ index 2fb5156519..ce40c5bc07 100644
      raft_get_servers_from_log(raft, VLL_INFO);
      raft_get_election_timer_from_log(raft);
  
+@@ -4216,7 +4226,7 @@ raft_may_snapshot(const struct raft *raft)
+             && !raft->leaving
+             && !raft->left
+             && !raft->failed
+-            && raft->role != RAFT_LEADER
++            && (raft->role == RAFT_FOLLOWER || hmap_count(&raft->servers) == 1)
+             && raft->last_applied >= raft->log_start);
+ }
+ 
 @@ -4265,11 +4275,12 @@ raft_store_snapshot(struct raft *raft, const struct json *new_snapshot_data)
      uint64_t new_log_start = raft->last_applied + 1;
      struct raft_entry new_snapshot = {
@@ -4097,7 +4106,7 @@ index 394ac8eb49..fe04555d0c 100644
  struct ovsdb_row *ovsdb_row_clone(const struct ovsdb_row *);
  void ovsdb_row_destroy(struct ovsdb_row *);
 diff --git a/ovsdb/storage.c b/ovsdb/storage.c
-index d727b1eacd..9e32efe582 100644
+index d727b1eacd..d4984be250 100644
 --- a/ovsdb/storage.c
 +++ b/ovsdb/storage.c
 @@ -268,9 +268,7 @@ ovsdb_storage_read(struct ovsdb_storage *storage,
@@ -4111,6 +4120,57 @@ index d727b1eacd..9e32efe582 100644
      if (!json) {
          return NULL;
      } else if (json->type != JSON_ARRAY || json->array.n != 2) {
+@@ -509,7 +507,11 @@ schedule_next_snapshot(struct ovsdb_storage *storage, bool quick)
+ 
+         long long int now = time_msec();
+         storage->next_snapshot_min = now + base + random_range(range);
+-        storage->next_snapshot_max = now + 60LL * 60 * 24 * 1000; /* 1 day */
++        if (!quick) {
++            long long int one_day = 60LL * 60 * 24 * 1000;
++
++            storage->next_snapshot_max = now + one_day;
++        }
+     } else {
+         storage->next_snapshot_min = LLONG_MAX;
+         storage->next_snapshot_max = LLONG_MAX;
+@@ -517,7 +519,7 @@ schedule_next_snapshot(struct ovsdb_storage *storage, bool quick)
+ }
+ 
+ bool
+-ovsdb_storage_should_snapshot(const struct ovsdb_storage *storage)
++ovsdb_storage_should_snapshot(struct ovsdb_storage *storage)
+ {
+     if (storage->raft || storage->log) {
+         /* If we haven't reached the minimum snapshot time, don't snapshot. */
+@@ -546,6 +548,15 @@ ovsdb_storage_should_snapshot(const struct ovsdb_storage *storage)
+     }
+ 
+     if (!snapshot_recommended) {
++        if (storage->raft) {
++            /* Re-scheduling with a quick retry in order to avoid condition
++             * where all the raft servers passed the minimal time already,
++             * but the log didn't grow a lot, so they are all checking on
++             * every iteration. This will randomize the time of the next
++             * attempt, so all the servers will not start snapshotting at
++             * the same time when the log reaches a critical size. */
++            schedule_next_snapshot(storage, true);
++        }
+         return false;
+     }
+ 
+diff --git a/ovsdb/storage.h b/ovsdb/storage.h
+index e120094d7a..ff026b77fa 100644
+--- a/ovsdb/storage.h
++++ b/ovsdb/storage.h
+@@ -76,7 +76,7 @@ uint64_t ovsdb_write_get_commit_index(const struct ovsdb_write *);
+ void ovsdb_write_wait(const struct ovsdb_write *);
+ void ovsdb_write_destroy(struct ovsdb_write *);
+ 
+-bool ovsdb_storage_should_snapshot(const struct ovsdb_storage *);
++bool ovsdb_storage_should_snapshot(struct ovsdb_storage *);
+ struct ovsdb_error *ovsdb_storage_store_snapshot(struct ovsdb_storage *storage,
+                                                  const struct json *schema,
+                                                  const struct json *snapshot)
 diff --git a/ovsdb/transaction.c b/ovsdb/transaction.c
 index 8ffefcf7c9..88e0528002 100644
 --- a/ovsdb/transaction.c
diff --git a/SPECS/openvswitch2.16.spec b/SPECS/openvswitch2.16.spec
index ead87aa..7010282 100644
--- a/SPECS/openvswitch2.16.spec
+++ b/SPECS/openvswitch2.16.spec
@@ -57,7 +57,7 @@ Summary: Open vSwitch
 Group: System Environment/Daemons daemon/database/utilities
 URL: http://www.openvswitch.org/
 Version: 2.16.0
-Release: 41%{?dist}
+Release: 43%{?dist}
 
 # Nearly all of openvswitch is ASL 2.0. The bugtool is LGPLv2+, and the
 # lib/sflow*.[ch] files are SISSL
@@ -699,6 +699,104 @@ exit 0
 %endif
 
 %changelog
+* Fri Jan 28 2022 Ilya Maximets <i.maximets@redhat.com> - 2.16.0-43
+- ovsdb: storage: Randomize should_snapshot checks when the minimum time passed. [RH git: abe61535ca] (#2044614)
+    commit 339f97044e3c2312fbb65b932fa14a181acf40d5
+    Author: Ilya Maximets <i.maximets@ovn.org>
+    Date:   Mon Dec 13 16:43:33 2021 +0100
+
+        ovsdb: storage: Randomize should_snapshot checks when the minimum time passed.
+
+        Snapshots are scheduled for every 10-20 minutes. It's a random value
+        in this interval for each server. Once the time is up, but the maximum
+        time (24 hours) not reached yet, ovsdb will start checking if the log
+        grew a lot on every iteration. Once the growth is detected, compaction
+        is triggered.
+
+        OTOH, it's very common for an OVSDB cluster to not have the log growing
+        very fast. If the log didn't grow 2x in 20 minutes, the randomness of
+        the initial scheduled time is gone and all the servers are checking if
+        they need to create snapshot on every iteration. And since all of them
+        are part of the same cluster, their logs are growing with the same
+        speed. Once the critical mass is reached, all the servers will start
+        creating snapshots at the same time. If the database is big enough,
+        that might leave the cluster unresponsive for an extended period of
+        time (e.g. 10-15 seconds for OVN_Southbound database in a larger scale
+        OVN deployment) until the compaction completed.
+
+        Fix that by re-scheduling a quick retry if the minimal time already
+        passed. Effectively, this will work as a randomized 1-2 min delay
+        between checks, so the servers will not synchronize.
+
+        Scheduling function updated to not change the upper limit on quick
+        reschedules to avoid delaying the snapshot creation indefinitely.
+        Currently quick re-schedules are only used for the error cases, and
+        there is always a 'slow' re-schedule after the successful compaction.
+        So, the change of a scheduling function doesn't change the current
+        behavior much.
+
+        Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
+        Acked-by: Han Zhou <hzhou@ovn.org>
+        Acked-by: Dumitru Ceara <dceara@redhat.com>
+
+    Reported-at: https://bugzilla.redhat.com/2044614
+    Signed-off-by: Ilya Maximets <i.maximets@redhat.com>
+
+
+* Fri Jan 28 2022 Ilya Maximets <i.maximets@redhat.com> - 2.16.0-42
+- raft: Only allow followers to snapshot. [RH git: 915efc8c00] (#2044614)
+    commit bf07cc9cdb2f37fede8c0363937f1eb9f4cfd730
+    Author: Dumitru Ceara <dceara@redhat.com>
+    Date:   Mon Dec 13 20:46:03 2021 +0100
+
+        raft: Only allow followers to snapshot.
+
+        Commit 3c2d6274bcee ("raft: Transfer leadership before creating
+        snapshots.") made it such that raft leaders transfer leadership before
+        snapshotting. However, there's still the case when the next leader to
+        be is in the process of snapshotting. To avoid delays in that case
+        too, we now explicitly allow snapshots only on followers. Cluster
+        members will have to wait until the current election is settled before
+        snapshotting.
+
+        Given the following logs taken from an OVN_Southbound 3-server cluster
+        during a scale test:
+
+        S1 (old leader):
+          19:07:51.226Z|raft|INFO|Transferring leadership to write a snapshot.
+          19:08:03.830Z|ovsdb|INFO|OVN_Southbound: Database compaction took 12601ms
+          19:08:03.940Z|raft|INFO|server 8b8d is leader for term 43
+
+        S2 (follower):
+          19:08:00.870Z|raft|INFO|server 8b8d is leader for term 43
+
+        S3 (new leader):
+          19:07:51.242Z|raft|INFO|received leadership transfer from f5c9 in term 42
+          19:07:51.244Z|raft|INFO|term 43: starting election
+          19:08:00.805Z|ovsdb|INFO|OVN_Southbound: Database compaction took 9559ms
+          19:08:00.869Z|raft|INFO|term 43: elected leader by 2+ of 3 servers
+
+        We see that the leader to be (S3) receives the leadership transfer,
+        initiates the election and immediately after starts a snapshot that
+        takes ~9.5 seconds. During this time, S2 votes for S3 electing it
+        as cluster leader but S3 doesn't effectively become leader until it
+        finishes snapshotting, essentially keeping the cluster without a
+        leader for up to ~9.5 seconds.
+
+        With the current change, S3 will delay compaction and snapshotting
+        until the election is finished.
+
+        The only exception is the case of single-node clusters for which we
+        allow the node to snapshot regardless of role.
+
+        Acked-by: Han Zhou <hzhou@ovn.org>
+        Signed-off-by: Dumitru Ceara <dceara@redhat.com>
+        Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
+
+    Reported-at: https://bugzilla.redhat.com/2044614
+    Signed-off-by: Ilya Maximets <i.maximets@redhat.com>
+
+
 * Wed Jan 26 2022 Open vSwitch CI <ovs-ci@redhat.com> - 2.16.0-41
 - Merging upstream branch-2.16 [RH git: f1ca7b8ac3]
   Commit list:
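
Note: below is a minimal, self-contained C sketch (not OVS code and not part of the
patch) of the snapshot-scheduling policy in the ovsdb/storage.c hunks above. The
next_snapshot_min/next_snapshot_max fields and the base/range arithmetic follow the
patch; struct sketch_storage, msec_now() and this local random_range() are simplified
stand-ins for OVS's struct ovsdb_storage, time_msec() and random_range().

/* Sketch of the randomized snapshot scheduling added by commit 339f97044e3c. */
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

struct sketch_storage {
    long long next_snapshot_min;    /* Earliest time a snapshot may start. */
    long long next_snapshot_max;    /* Time after which a snapshot is forced. */
};

static long long
msec_now(void)
{
    return (long long) time(NULL) * 1000;   /* Stand-in for time_msec(). */
}

static unsigned int
random_range(unsigned int range)
{
    return (unsigned int) rand() % range;   /* Stand-in for OVS random_range(). */
}

/* Slow (normal) scheduling picks a random point 10-20 minutes away and resets
 * the 24-hour hard deadline.  A quick reschedule divides base and range by 10,
 * i.e. a random point 1-2 minutes away, and leaves the deadline untouched so
 * repeated quick retries can never defer snapshotting indefinitely. */
static void
schedule_next_snapshot(struct sketch_storage *st, bool quick)
{
    unsigned int base = 10 * 60 * 1000;  /* 10 minutes */
    unsigned int range = 10 * 60 * 1000; /* 10 minutes */

    if (quick) {
        base /= 10;     /* 1 minute */
        range /= 10;    /* 1 minute */
    }

    long long now = msec_now();

    st->next_snapshot_min = now + base + random_range(range);
    if (!quick) {
        st->next_snapshot_max = now + 60LL * 60 * 24 * 1000; /* 1 day */
    }
}

int
main(void)
{
    struct sketch_storage st = { LLONG_MAX, LLONG_MAX };

    srand((unsigned int) time(NULL));

    schedule_next_snapshot(&st, false);
    printf("slow:  min=+%lldms max=+%lldms\n",
           st.next_snapshot_min - msec_now(),
           st.next_snapshot_max - msec_now());

    /* Minimum time passed but the log has not grown enough: retry soon at a
     * randomized moment, so the servers of a cluster drift apart again. */
    schedule_next_snapshot(&st, true);
    printf("quick: min=+%lldms (max unchanged)\n",
           st.next_snapshot_min - msec_now());
    return 0;
}

This is the behavior the commit message describes as "a randomized 1-2 min delay
between checks": once servers stop being separated by the initial 10-20 minute
randomization, each failed should-snapshot check re-randomizes the next attempt
instead of letting every member poll (and eventually compact) in lockstep.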
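Note: likewise, a standalone sketch (not OVS code) of the raft_may_snapshot()
predicate after the raft.c hunk above. The enum and fields mirror ovsdb/raft.c,
but the struct is reduced to what the check reads, and server_count stands in
for hmap_count(&raft->servers).

/* Sketch of the follower-only snapshot gate from commit bf07cc9cdb2f. */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

enum raft_role { RAFT_FOLLOWER, RAFT_CANDIDATE, RAFT_LEADER };

struct sketch_raft {
    bool joining, leaving, left, failed;
    enum raft_role role;
    size_t server_count;                    /* hmap_count(&raft->servers). */
    unsigned long long last_applied, log_start;
};

/* Before the patch the role check was 'role != RAFT_LEADER', which also let
 * candidates snapshot; mid-election that can keep the cluster leaderless
 * while the leader-to-be compacts.  Now only settled followers may snapshot,
 * except in a single-node cluster whose sole member is always the leader. */
static bool
raft_may_snapshot(const struct sketch_raft *raft)
{
    return (!raft->joining
            && !raft->leaving
            && !raft->left
            && !raft->failed
            && (raft->role == RAFT_FOLLOWER || raft->server_count == 1)
            && raft->last_applied >= raft->log_start);
}

int
main(void)
{
    struct sketch_raft candidate = {
        .role = RAFT_CANDIDATE, .server_count = 3,
        .last_applied = 100, .log_start = 1,
    };
    struct sketch_raft single = {
        .role = RAFT_LEADER, .server_count = 1,
        .last_applied = 100, .log_start = 1,
    };

    /* Prints 0: a candidate (e.g. S3 in the logs above) must wait for the
     * election to settle before compacting. */
    printf("candidate in 3-node cluster: %d\n", raft_may_snapshot(&candidate));
    /* Prints 1: the single-node exception from the commit message. */
    printf("leader of single-node cluster: %d\n", raft_may_snapshot(&single));
    return 0;
}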