From aefff389c4d11bd69180db7177135c4645a9b1bd Mon Sep 17 00:00:00 2001 From: Sergio Lopez Pascual Date: Fri, 7 Feb 2020 11:27:46 +0000 Subject: [PATCH 13/18] block/backup-top: Don't acquire context while dropping top RH-Author: Sergio Lopez Pascual Message-id: <20200207112749.25073-7-slp@redhat.com> Patchwork-id: 93759 O-Subject: [RHEL-AV-8.2.0 qemu-kvm PATCH v2 6/9] block/backup-top: Don't acquire context while dropping top Bugzilla: 1745606 1746217 1773517 1779036 1782111 1782175 1783965 RH-Acked-by: Stefano Garzarella RH-Acked-by: Paolo Bonzini RH-Acked-by: Max Reitz RH-Acked-by: Stefan Hajnoczi All paths that lead to bdrv_backup_top_drop(), except for the call from backup_clean(), imply that the BDS AioContext has already been acquired, so doing it there too can potentially lead to QEMU hanging on AIO_WAIT_WHILE(). An easy way to trigger this situation is by issuing a two actions transaction, with a proper and a bogus blockdev-backup, so the second one will trigger a rollback. This will trigger a hang with an stack trace like this one: #0 0x00007fb680c75016 in __GI_ppoll (fds=0x55e74580f7c0, nfds=1, timeout=, timeout@entry=0x0, sigmask=sigmask@entry=0x0) at ../sysdeps/unix/sysv/linux/ppoll.c:39 #1 0x000055e743386e09 in ppoll (__ss=0x0, __timeout=0x0, __nfds=, __fds=) at /usr/include/bits/poll2.h:77 #2 0x000055e743386e09 in qemu_poll_ns (fds=, nfds=, timeout=) at util/qemu-timer.c:336 #3 0x000055e743388dc4 in aio_poll (ctx=0x55e7458925d0, blocking=blocking@entry=true) at util/aio-posix.c:669 #4 0x000055e743305dea in bdrv_flush (bs=bs@entry=0x55e74593c0d0) at block/io.c:2878 #5 0x000055e7432be58e in bdrv_close (bs=0x55e74593c0d0) at block.c:4017 #6 0x000055e7432be58e in bdrv_delete (bs=) at block.c:4262 #7 0x000055e7432be58e in bdrv_unref (bs=bs@entry=0x55e74593c0d0) at block.c:5644 #8 0x000055e743316b9b in bdrv_backup_top_drop (bs=bs@entry=0x55e74593c0d0) at block/backup-top.c:273 #9 0x000055e74331461f in backup_job_create (job_id=0x0, bs=bs@entry=0x55e7458d5820, target=target@entry=0x55e74589f640, speed=0, sync_mode=MIRROR_SYNC_MODE_FULL, sync_bitmap=sync_bitmap@entry=0x0, bitmap_mode=BITMAP_SYNC_MODE_ON_SUCCESS, compress=false, filter_node_name=0x0, on_source_error=BLOCKDEV_ON_ERROR_REPORT, on_target_error=BLOCKDEV_ON_ERROR_REPORT, creation_flags=0, cb=0x0, opaque=0x0, txn=0x0, errp=0x7ffddfd1efb0) at block/backup.c:478 #10 0x000055e74315bc52 in do_backup_common (backup=backup@entry=0x55e746c066d0, bs=bs@entry=0x55e7458d5820, target_bs=target_bs@entry=0x55e74589f640, aio_context=aio_context@entry=0x55e7458a91e0, txn=txn@entry=0x0, errp=errp@entry=0x7ffddfd1efb0) at blockdev.c:3580 #11 0x000055e74315c37c in do_blockdev_backup (backup=backup@entry=0x55e746c066d0, txn=0x0, errp=errp@entry=0x7ffddfd1efb0) at /usr/src/debug/qemu-kvm-4.2.0-2.module+el8.2.0+5135+ed3b2489.x86_64/./qapi/qapi-types-block-core.h:1492 #12 0x000055e74315c449 in blockdev_backup_prepare (common=0x55e746a8de90, errp=0x7ffddfd1f018) at blockdev.c:1885 #13 0x000055e743160152 in qmp_transaction (dev_list=, has_props=, props=0x55e7467fe2c0, errp=errp@entry=0x7ffddfd1f088) at blockdev.c:2340 #14 0x000055e743287ff5 in qmp_marshal_transaction (args=, ret=, errp=0x7ffddfd1f0f8) at qapi/qapi-commands-transaction.c:44 #15 0x000055e74333de6c in do_qmp_dispatch (errp=0x7ffddfd1f0f0, allow_oob=, request=, cmds=0x55e743c28d60 ) at qapi/qmp-dispatch.c:132 #16 0x000055e74333de6c in qmp_dispatch (cmds=0x55e743c28d60 , request=, allow_oob=) at qapi/qmp-dispatch.c:175 #17 0x000055e74325c061 in monitor_qmp_dispatch (mon=0x55e745908030, req=) at monitor/qmp.c:145 #18 0x000055e74325c6fa in monitor_qmp_bh_dispatcher (data=) at monitor/qmp.c:234 #19 0x000055e743385866 in aio_bh_call (bh=0x55e745807ae0) at util/async.c:117 #20 0x000055e743385866 in aio_bh_poll (ctx=ctx@entry=0x55e7458067a0) at util/async.c:117 #21 0x000055e743388c54 in aio_dispatch (ctx=0x55e7458067a0) at util/aio-posix.c:459 #22 0x000055e743385742 in aio_ctx_dispatch (source=, callback=, user_data=) at util/async.c:260 #23 0x00007fb68543e67d in g_main_dispatch (context=0x55e745893a40) at gmain.c:3176 #24 0x00007fb68543e67d in g_main_context_dispatch (context=context@entry=0x55e745893a40) at gmain.c:3829 #25 0x000055e743387d08 in glib_pollfds_poll () at util/main-loop.c:219 #26 0x000055e743387d08 in os_host_main_loop_wait (timeout=) at util/main-loop.c:242 #27 0x000055e743387d08 in main_loop_wait (nonblocking=) at util/main-loop.c:518 #28 0x000055e74316a3c1 in main_loop () at vl.c:1828 #29 0x000055e743016a72 in main (argc=, argv=, envp=) at vl.c:4504 Fix this by not acquiring the AioContext there, and ensuring all paths leading to it have it already acquired (backup_clean()). RHBZ: https://bugzilla.redhat.com/show_bug.cgi?id=1782111 Signed-off-by: Sergio Lopez Signed-off-by: Kevin Wolf (cherry picked from commit 0abf2581717a19d9749d5c2ff8acd0ac203452c2) Signed-off-by: Sergio Lopez Signed-off-by: Danilo C. L. de Paula --- block/backup-top.c | 5 ----- block/backup.c | 3 +++ 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/block/backup-top.c b/block/backup-top.c index 818d3f2..b8d863f 100644 --- a/block/backup-top.c +++ b/block/backup-top.c @@ -255,9 +255,6 @@ append_failed: void bdrv_backup_top_drop(BlockDriverState *bs) { BDRVBackupTopState *s = bs->opaque; - AioContext *aio_context = bdrv_get_aio_context(bs); - - aio_context_acquire(aio_context); bdrv_drained_begin(bs); @@ -271,6 +268,4 @@ void bdrv_backup_top_drop(BlockDriverState *bs) bdrv_drained_end(bs); bdrv_unref(bs); - - aio_context_release(aio_context); } diff --git a/block/backup.c b/block/backup.c index cf62b1a..1383e21 100644 --- a/block/backup.c +++ b/block/backup.c @@ -135,8 +135,11 @@ static void backup_abort(Job *job) static void backup_clean(Job *job) { BackupBlockJob *s = container_of(job, BackupBlockJob, common.job); + AioContext *aio_context = bdrv_get_aio_context(s->backup_top); + aio_context_acquire(aio_context); bdrv_backup_top_drop(s->backup_top); + aio_context_release(aio_context); } void backup_do_checkpoint(BlockJob *job, Error **errp) -- 1.8.3.1