|
|
70e049 |
From 88a6289a4f72b11253c01a5a5d834b74d5abb6c3 Mon Sep 17 00:00:00 2001
|
|
|
70e049 |
From: Laszlo Ersek <lersek@redhat.com>
|
|
|
70e049 |
Date: Sun, 24 Apr 2022 09:08:28 +0200
|
|
|
70e049 |
Subject: [PATCH] Base.reset: plug (temporary) leak of libsolv's page file
|
|
|
70e049 |
descriptors
|
|
|
70e049 |
|
|
|
70e049 |
Consider the following call paths (mixed Python and C), extending from
|
|
|
70e049 |
livecd-creator down to libsolv:
|
|
|
70e049 |
|
|
|
70e049 |
main [livecd-tools/tools/livecd-creator]
|
|
|
70e049 |
install() [livecd-tools/imgcreate/creator.py]
|
|
|
70e049 |
fill_sack() [dnf/dnf/base.py]
|
|
|
70e049 |
_add_repo_to_sack() [dnf/dnf/base.py]
|
|
|
70e049 |
load_repo() [libdnf/python/hawkey/sack-py.cpp]
|
|
|
70e049 |
dnf_sack_load_repo() [libdnf/libdnf/dnf-sack.cpp]
|
|
|
70e049 |
write_main() [libdnf/libdnf/dnf-sack.cpp]
|
|
|
70e049 |
repo_add_solv() [libsolv/src/repo_solv.c]
|
|
|
70e049 |
repopagestore_read_or_setup_pages() [libsolv/src/repopage.c]
|
|
|
70e049 |
dup()
|
|
|
70e049 |
write_ext() [libdnf/libdnf/dnf-sack.cpp]
|
|
|
70e049 |
repo_add_solv() [libsolv/src/repo_solv.c]
|
|
|
70e049 |
repopagestore_read_or_setup_pages() [libsolv/src/repopage.c]
|
|
|
70e049 |
dup()
|
|
|
70e049 |
|
|
|
70e049 |
The dup() calls create the following file descriptors (output from
|
|
|
70e049 |
"lsof"):
|
|
|
70e049 |
|
|
|
70e049 |
> COMMAND PID USER FD TYPE DEVICE SIZE/OFF NODE NAME
|
|
|
70e049 |
> python3 6500 root 7r REG 8,1 25320727 395438 /var/tmp/imgcreate-mytcghah/install_root/var/cache/dnf/fedora.solv (deleted)
|
|
|
70e049 |
> python3 6500 root 8r REG 8,1 52531426 395450 /var/tmp/imgcreate-mytcghah/install_root/var/cache/dnf/fedora-filenames.solvx
|
|
|
70e049 |
|
|
|
70e049 |
These file descriptors are *owned* by the DnfSack object (which is derived
|
|
|
70e049 |
from GObject), as follows:
|
|
|
70e049 |
|
|
|
70e049 |
sack->priv->pool->repos[1]->repodata[1]->store.pagefd = 7
|
|
|
70e049 |
sack->priv->pool->repos[1]->repodata[2]->store.pagefd = 8
|
|
|
70e049 |
^ ^ ^ ^ ^ ^ ^
|
|
|
70e049 |
| | | | | | |
|
|
|
70e049 |
| | | | | | int
|
|
|
70e049 |
| | | | | Repopagestore [libsolv/src/repopage.h]
|
|
|
70e049 |
| | | | Repodata [libsolv/src/repodata.h]
|
|
|
70e049 |
| | | struct s_Repo [libsolv/src/repo.h]
|
|
|
70e049 |
| | struct s_Pool (aka Pool) [libsolv/src/pool.h]
|
|
|
70e049 |
| DnfSackPrivate [libdnf/libdnf/dnf-sack.cpp]
|
|
|
70e049 |
DnfSack [libdnf/libdnf/dnf-sack.h]
|
|
|
70e049 |
|
|
|
70e049 |
The file descriptors are *supposed* to be closed on the following call
|
|
|
70e049 |
path:
|
|
|
70e049 |
|
|
|
70e049 |
main [livecd-tools/tools/livecd-creator]
|
|
|
70e049 |
install() [livecd-tools/imgcreate/creator.py]
|
|
|
70e049 |
close() [livecd-tools/imgcreate/dnfinst.py]
|
|
|
70e049 |
close() [dnf/dnf/base.py]
|
|
|
70e049 |
reset() [dnf/dnf/base.py]
|
|
|
70e049 |
_sack = None
|
|
|
70e049 |
_goal = None
|
|
|
70e049 |
_transaction = None
|
|
|
70e049 |
...
|
|
|
70e049 |
dnf_sack_finalize() [libdnf/libdnf/dnf-sack.cpp]
|
|
|
70e049 |
pool_free() [libsolv/src/pool.c]
|
|
|
70e049 |
pool_freeallrepos() [libsolv/src/pool.c]
|
|
|
70e049 |
repo_freedata() [libsolv/src/repo.c]
|
|
|
70e049 |
repodata_freedata() [libsolv/src/repodata.c]
|
|
|
70e049 |
repopagestore_free() [libsolv/src/repopage.c]
|
|
|
70e049 |
close()
|
|
|
70e049 |
|
|
|
70e049 |
Namely, when dnf.Base.reset() [dnf/dnf/base.py] is called with (sack=True,
|
|
|
70e049 |
goal=True), the reference counts of the objects pointed to by the "_sack",
|
|
|
70e049 |
"_goal" and "_transaction" fields are supposed to reach zero, and then, as
|
|
|
70e049 |
part of the DnfSack object's finalization, the libsolv file descriptors
|
|
|
70e049 |
are supposed to be closed.
|
|
|
70e049 |
|
|
|
70e049 |
Now, while this *may* happen immediately in dnf.Base.reset(), it may as
|
|
|
70e049 |
well not. The reason is that there is a multitude of *circular references*
|
|
|
70e049 |
between DnfSack and the packages that it contains. When dnf.Base.reset()
|
|
|
70e049 |
is entered, we have the following picture:
|
|
|
70e049 |
|
|
|
70e049 |
_sack _goal
|
|
|
70e049 |
| |
|
|
|
70e049 |
v v
|
|
|
70e049 |
+----------------+ +-------------+
|
|
|
70e049 |
| DnfSack object | <--- | Goal object |
|
|
|
70e049 |
+----------------+ +-------------+
|
|
|
70e049 |
|^ |^ |^
|
|
|
70e049 |
|| || ||
|
|
|
70e049 |
|| || ||
|
|
|
70e049 |
+--||----||----||---+
|
|
|
70e049 |
| v| v| v| | <-- _transaction
|
|
|
70e049 |
| Pkg1 Pkg2 PkgN |
|
|
|
70e049 |
| |
|
|
|
70e049 |
| Transaction oject |
|
|
|
70e049 |
+-------------------+
|
|
|
70e049 |
|
|
|
70e049 |
That is, the reference count of the DnfSack object is (1 + 1 + N), where N
|
|
|
70e049 |
is the number of packages in the transaction. Details:
|
|
|
70e049 |
|
|
|
70e049 |
(a) The first reference comes from the "_sack" field, established like
|
|
|
70e049 |
this:
|
|
|
70e049 |
|
|
|
70e049 |
main [livecd-tools/tools/livecd-creator]
|
|
|
70e049 |
install() [livecd-tools/imgcreate/creator.py]
|
|
|
70e049 |
fill_sack() [dnf/dnf/base.py]
|
|
|
70e049 |
_build_sack() [dnf/dnf/sack.py]
|
|
|
70e049 |
Sack()
|
|
|
70e049 |
sack_init() [libdnf/python/hawkey/sack-py.cpp]
|
|
|
70e049 |
dnf_sack_new() [libdnf/libdnf/dnf-sack.cpp]
|
|
|
70e049 |
|
|
|
70e049 |
(b) The second reference on the DnfSack object comes from "_goal":
|
|
|
70e049 |
|
|
|
70e049 |
main [livecd-tools/tools/livecd-creator]
|
|
|
70e049 |
install() [livecd-tools/imgcreate/creator.py]
|
|
|
70e049 |
fill_sack() [dnf/dnf/base.py]
|
|
|
70e049 |
_goal = Goal(_sack)
|
|
|
70e049 |
goal_init() [libdnf/python/hawkey/goal-py.cpp]
|
|
|
70e049 |
Py_INCREF(_sack)
|
|
|
70e049 |
|
|
|
70e049 |
(c) Then there is one reference to "_sack" *per package* in the
|
|
|
70e049 |
transaction:
|
|
|
70e049 |
|
|
|
70e049 |
main [livecd-tools/tools/livecd-creator]
|
|
|
70e049 |
install() [livecd-tools/imgcreate/creator.py]
|
|
|
70e049 |
runInstall() [livecd-tools/imgcreate/dnfinst.py]
|
|
|
70e049 |
resolve() [dnf/dnf/base.py]
|
|
|
70e049 |
_goal2transaction() [dnf/dnf/base.py]
|
|
|
70e049 |
list_installs() [libdnf/python/hawkey/goal-py.cpp]
|
|
|
70e049 |
list_generic() [libdnf/python/hawkey/goal-py.cpp]
|
|
|
70e049 |
packagelist_to_pylist() [libdnf/python/hawkey/iutil-py.cpp]
|
|
|
70e049 |
new_package() [libdnf/python/hawkey/sack-py.cpp]
|
|
|
70e049 |
Py_BuildValue()
|
|
|
70e049 |
ts.add_install()
|
|
|
70e049 |
|
|
|
70e049 |
list_installs() creates a list of packages that need to be installed
|
|
|
70e049 |
by DNF. Inside the loop in packagelist_to_pylist(), which constructs
|
|
|
70e049 |
the elements of that list, Py_BuildValue() is called with the "O"
|
|
|
70e049 |
format specifier, and that increases the reference count on "_sack".
|
|
|
70e049 |
|
|
|
70e049 |
Subsequently, in the _goal2transaction() method, we iterate over the
|
|
|
70e049 |
package list created by list_installs(), and add each package to the
|
|
|
70e049 |
transaction (ts.add_install()). After _goal2transaction() returns,
|
|
|
70e049 |
this transaction is assigned to "self._transaction" in resolve(). This
|
|
|
70e049 |
is where the last N (back-)references on the DnfSack object come from.
|
|
|
70e049 |
|
|
|
70e049 |
(d) Now, to quote the defintion of the DnfSack object
|
|
|
70e049 |
("libdnf/docs/hawkey/tutorial-py.rst"):
|
|
|
70e049 |
|
|
|
70e049 |
> *Sack* is an abstraction for a collection of packages.
|
|
|
70e049 |
|
|
|
70e049 |
That's why the DnfSack object references all the Pkg1 through PkgN
|
|
|
70e049 |
packages.
|
|
|
70e049 |
|
|
|
70e049 |
So, when the dnf.Base.reset() method completes, the picture changes like
|
|
|
70e049 |
this:
|
|
|
70e049 |
|
|
|
70e049 |
_sack _goal
|
|
|
70e049 |
| |
|
|
|
70e049 |
-- [CUT] -- -- [CUT] --
|
|
|
70e049 |
| |
|
|
|
70e049 |
v | v
|
|
|
70e049 |
+----------------+ [C] +-------------+
|
|
|
70e049 |
| DnfSack object | <-[U]- | Goal object |
|
|
|
70e049 |
+----------------+ [T] +-------------+
|
|
|
70e049 |
|^ |^ |^ |
|
|
|
70e049 |
|| || ||
|
|
|
70e049 |
|| || || |
|
|
|
70e049 |
+--||----||----||---+ [C]
|
|
|
70e049 |
| v| v| v| | <--[U]-- _transaction
|
|
|
70e049 |
| Pkg1 Pkg2 PkgN | [T]
|
|
|
70e049 |
| | |
|
|
|
70e049 |
| Transaction oject |
|
|
|
70e049 |
+-------------------+
|
|
|
70e049 |
|
|
|
70e049 |
and we are left with N reference cycles (one between each package and the
|
|
|
70e049 |
same DnfSack object).
|
|
|
70e049 |
|
|
|
70e049 |
This set of cycles can only be cleaned up by Python's generational garbage
|
|
|
70e049 |
collector <https://stackify.com/python-garbage-collection/>. The GC will
|
|
|
70e049 |
collect the DnfSack object, and consequently close the libsolv page file
|
|
|
70e049 |
descriptors via dnf_sack_finalize() -- but garbage collection will happen
|
|
|
70e049 |
*only eventually*, unpredictably.
|
|
|
70e049 |
|
|
|
70e049 |
This means that the dnf.Base.reset() method breaks its interface contract:
|
|
|
70e049 |
|
|
|
70e049 |
> Make the Base object forget about various things.
|
|
|
70e049 |
|
|
|
70e049 |
because the libsolv file descriptors can (and frequently do, in practice)
|
|
|
70e049 |
survive dnf.Base.reset().
|
|
|
70e049 |
|
|
|
70e049 |
In general, as long as the garbage collector only tracks process-private
|
|
|
70e049 |
memory blocks, there's nothing wrong; however, file descriptors are
|
|
|
70e049 |
visible to the kernel. When dnf.Base.reset() *temporarily* leaks file
|
|
|
70e049 |
descriptors as explained above, then immediately subsequent operations
|
|
|
70e049 |
that depend on those file descriptors having been closed, can fail.
|
|
|
70e049 |
|
|
|
70e049 |
An example is livecd-creator's unmounting of:
|
|
|
70e049 |
|
|
|
70e049 |
/var/tmp/imgcreate-mytcghah/install_root/var/cache/dnf
|
|
|
70e049 |
|
|
|
70e049 |
which the kernel refuses, due to libsolv's still open file descriptors
|
|
|
70e049 |
pointing into that filesystem:
|
|
|
70e049 |
|
|
|
70e049 |
> umount: /var/tmp/imgcreate-mytcghah/install_root/var/cache/dnf: target
|
|
|
70e049 |
> is busy.
|
|
|
70e049 |
> Unable to unmount /var/tmp/imgcreate-mytcghah/install_root/var/cache/dnf
|
|
|
70e049 |
> normally, using lazy unmount
|
|
|
70e049 |
|
|
|
70e049 |
(Unfortunately, the whole lazy umount idea is misguided in livecd-tools;
|
|
|
70e049 |
it's a misfeature that should be removed, as it permits the corruption of
|
|
|
70e049 |
the loop-backed filesystem. Now that the real bug is being fixed in DNF,
|
|
|
70e049 |
lazy umount is not needed as a (broken) workaround in livecd-tools. But
|
|
|
70e049 |
that's a separate patch for livecd-tools:
|
|
|
70e049 |
<https://github.com/livecd-tools/livecd-tools/pull/227>.)
|
|
|
70e049 |
|
|
|
70e049 |
Plug the fd leak by forcing a garbage collection in dnf.Base.reset()
|
|
|
70e049 |
whenever we cut the "_sack", "_goal" and "_transaction" links -- that is,
|
|
|
70e049 |
when the "sack" and "goal" parameters are True.
|
|
|
70e049 |
|
|
|
70e049 |
Note that precisely due to the unpredictable behavior of the garbage
|
|
|
70e049 |
collector, reproducing the bug may prove elusive. In order to reproduce it
|
|
|
70e049 |
deterministically, through usage with livecd-creator, disabling automatic
|
|
|
70e049 |
garbage collection with the following patch (for livecd-tools) is
|
|
|
70e049 |
sufficient:
|
|
|
70e049 |
|
|
|
70e049 |
> diff --git a/tools/livecd-creator b/tools/livecd-creator
|
|
|
70e049 |
> index 291de10cbbf9..8d2c740c238b 100755
|
|
|
70e049 |
> --- a/tools/livecd-creator
|
|
|
70e049 |
> +++ b/tools/livecd-creator
|
|
|
70e049 |
> @@ -31,6 +31,8 @@ from dnf.exceptions import Error as DnfBaseError
|
|
|
70e049 |
> import imgcreate
|
|
|
70e049 |
> from imgcreate.errors import KickstartError
|
|
|
70e049 |
>
|
|
|
70e049 |
> +import gc
|
|
|
70e049 |
> +
|
|
|
70e049 |
> class Usage(Exception):
|
|
|
70e049 |
> def __init__(self, msg = None, no_error = False):
|
|
|
70e049 |
> Exception.__init__(self, msg, no_error)
|
|
|
70e049 |
> @@ -261,5 +263,6 @@ def do_nss_libs_hack():
|
|
|
70e049 |
> return hack
|
|
|
70e049 |
>
|
|
|
70e049 |
> if __name__ == "__main__":
|
|
|
70e049 |
> + gc.disable()
|
|
|
70e049 |
> hack = do_nss_libs_hack()
|
|
|
70e049 |
> sys.exit(main())
|
|
|
70e049 |
|
|
|
70e049 |
Also note that you need to use livecd-tools at git commit 4afde9352e82 or
|
|
|
70e049 |
later, for this fix to make any difference: said commit fixes a different
|
|
|
70e049 |
(independent) bug in livecd-tools that produces identical symptoms, but
|
|
|
70e049 |
from a different origin. In other words, if you don't have commit
|
|
|
70e049 |
4afde9352e82 in your livecd-tools install, then said bug in livecd-tools
|
|
|
70e049 |
will mask this DNF fix.
|
|
|
70e049 |
|
|
|
70e049 |
Signed-off-by: Laszlo Ersek <lersek@redhat.com>
|
|
|
70e049 |
(cherry picked from commit 5ce5ed1ea08ad6e198c1c1642c4d9ea2db6eab86)
|
|
|
70e049 |
---
|
|
|
70e049 |
dnf/base.py | 41 +++++++++++++++++++++++++++++++++++++++++
|
|
|
70e049 |
1 file changed, 41 insertions(+)
|
|
|
70e049 |
|
|
|
70e049 |
diff --git a/dnf/base.py b/dnf/base.py
|
|
|
70e049 |
index babca31d..852fcdd8 100644
|
|
|
70e049 |
--- a/dnf/base.py
|
|
|
70e049 |
+++ b/dnf/base.py
|
|
|
70e049 |
@@ -72,6 +72,7 @@ import dnf.transaction
|
|
|
70e049 |
import dnf.util
|
|
|
70e049 |
import dnf.yum.rpmtrans
|
|
|
70e049 |
import functools
|
|
|
70e049 |
+import gc
|
|
|
70e049 |
import hawkey
|
|
|
70e049 |
import itertools
|
|
|
70e049 |
import logging
|
|
|
70e049 |
@@ -568,6 +569,46 @@ class Base(object):
|
|
|
70e049 |
self._comps_trans = dnf.comps.TransactionBunch()
|
|
|
70e049 |
self._transaction = None
|
|
|
70e049 |
self._update_security_filters = []
|
|
|
70e049 |
+ if sack and goal:
|
|
|
70e049 |
+ # We've just done this, above:
|
|
|
70e049 |
+ #
|
|
|
70e049 |
+ # _sack _goal
|
|
|
70e049 |
+ # | |
|
|
|
70e049 |
+ # -- [CUT] -- -- [CUT] --
|
|
|
70e049 |
+ # | |
|
|
|
70e049 |
+ # v | v
|
|
|
70e049 |
+ # +----------------+ [C] +-------------+
|
|
|
70e049 |
+ # | DnfSack object | <-[U]- | Goal object |
|
|
|
70e049 |
+ # +----------------+ [T] +-------------+
|
|
|
70e049 |
+ # |^ |^ |^ |
|
|
|
70e049 |
+ # || || ||
|
|
|
70e049 |
+ # || || || |
|
|
|
70e049 |
+ # +--||----||----||---+ [C]
|
|
|
70e049 |
+ # | v| v| v| | <--[U]-- _transaction
|
|
|
70e049 |
+ # | Pkg1 Pkg2 PkgN | [T]
|
|
|
70e049 |
+ # | | |
|
|
|
70e049 |
+ # | Transaction oject |
|
|
|
70e049 |
+ # +-------------------+
|
|
|
70e049 |
+ #
|
|
|
70e049 |
+ # At this point, the DnfSack object would be released only
|
|
|
70e049 |
+ # eventually, by Python's generational garbage collector, due to the
|
|
|
70e049 |
+ # cyclic references DnfSack<->Pkg1 ... DnfSack<->PkgN.
|
|
|
70e049 |
+ #
|
|
|
70e049 |
+ # The delayed release is a problem: the DnfSack object may
|
|
|
70e049 |
+ # (indirectly) own "page file" file descriptors in libsolv, via
|
|
|
70e049 |
+ # libdnf. For example,
|
|
|
70e049 |
+ #
|
|
|
70e049 |
+ # sack->priv->pool->repos[1]->repodata[1]->store.pagefd = 7
|
|
|
70e049 |
+ # sack->priv->pool->repos[1]->repodata[2]->store.pagefd = 8
|
|
|
70e049 |
+ #
|
|
|
70e049 |
+ # These file descriptors are closed when the DnfSack object is
|
|
|
70e049 |
+ # eventually released, that is, when dnf_sack_finalize() (in libdnf)
|
|
|
70e049 |
+ # calls pool_free() (in libsolv).
|
|
|
70e049 |
+ #
|
|
|
70e049 |
+ # We need that to happen right now, as callers may want to unmount
|
|
|
70e049 |
+ # the filesystems which those file descriptors refer to immediately
|
|
|
70e049 |
+ # after reset() returns. Therefore, force a garbage collection here.
|
|
|
70e049 |
+ gc.collect()
|
|
|
70e049 |
|
|
|
70e049 |
def _closeRpmDB(self):
|
|
|
70e049 |
"""Closes down the instances of rpmdb that could be open."""
|
|
|
70e049 |
--
|
|
|
70e049 |
2.35.1
|
|
|
70e049 |
|