| From c5386144fbf09f628148101bc674e2421cdd16e3 Mon Sep 17 00:00:00 2001 |
| Message-Id: <c5386144fbf09f628148101bc674e2421cdd16e3.1387382496.git.minovotn@redhat.com> |
| From: Nigel Croxon <ncroxon@redhat.com> |
| Date: Thu, 14 Nov 2013 22:52:37 +0100 |
| Subject: [PATCH 01/46] add a header file for atomic operations |
| |
| RH-Author: Nigel Croxon <ncroxon@redhat.com> |
| Message-id: <1384469598-13137-2-git-send-email-ncroxon@redhat.com> |
| Patchwork-id: 55686 |
| O-Subject: [RHEL7.0 PATCH 01/42] add a header file for atomic operations |
| Bugzilla: 1011720 |
| RH-Acked-by: Orit Wasserman <owasserm@redhat.com> |
| RH-Acked-by: Amit Shah <amit.shah@redhat.com> |
| RH-Acked-by: Paolo Bonzini <pbonzini@redhat.com> |
| |
| Bugzilla: 1011720 |
| https://bugzilla.redhat.com/show_bug.cgi?id=1011720 |
| |
| >From commit ID: |
| commit 5444e768ee1abe6e021bece19a9a932351f88c88 |
| Author: Paolo Bonzini <pbonzini@redhat.com> |
| Date: Mon May 13 13:29:47 2013 +0200 |
| |
| add a header file for atomic operations |
| |
| We're already using them in several places, but __sync builtins are just |
| too ugly to type, and do not provide seqcst load/store operations. |
| |
| Reviewed-by: Richard Henderson <rth@twiddle.net> |
| Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> |
| |
| docs/atomics.txt | 352 ++++++++++++++++++++++++++++++++++++++++++++++ |
| hw/display/qxl.c | 3 +- |
| hw/virtio/vhost.c | 9 +- |
| include/qemu/atomic.h | 198 ++++++++++++++++++++++---- |
| migration.c | 3 +- |
| tests/test-thread-pool.c | 8 +- |
| 6 files changed, 529 insertions(+), 44 deletions(-) |
| create mode 100644 docs/atomics.txt |
| |
| Signed-off-by: Michal Novotny <minovotn@redhat.com> |
| |
| docs/atomics.txt | 352 +++++++++++++++++++++++++++++++++++++++++++++++ |
| hw/display/qxl.c | 3 +- |
| hw/virtio/vhost.c | 9 +- |
| include/qemu/atomic.h | 198 +++++++++++++++++++++----- |
| migration.c | 3 +- |
| tests/test-thread-pool.c | 8 +- |
| 6 files changed, 529 insertions(+), 44 deletions(-) |
| create mode 100644 docs/atomics.txt |
| |
| diff --git a/docs/atomics.txt b/docs/atomics.txt |
| new file mode 100644 |
| index 0000000..6f2997b |
| |
| |
| @@ -0,0 +1,352 @@ |
| +CPUs perform independent memory operations effectively in random order. |
| +but this can be a problem for CPU-CPU interaction (including interactions |
| +between QEMU and the guest). Multi-threaded programs use various tools |
| +to instruct the compiler and the CPU to restrict the order to something |
| +that is consistent with the expectations of the programmer. |
| + |
| +The most basic tool is locking. Mutexes, condition variables and |
| +semaphores are used in QEMU, and should be the default approach to |
| +synchronization. Anything else is considerably harder, but it's |
| +also justified more often than one would like. The two tools that |
| +are provided by qemu/atomic.h are memory barriers and atomic operations. |
| + |
| +Macros defined by qemu/atomic.h fall in three camps: |
| + |
| +- compiler barriers: barrier(); |
| + |
| +- weak atomic access and manual memory barriers: atomic_read(), |
| + atomic_set(), smp_rmb(), smp_wmb(), smp_mb(), smp_read_barrier_depends(); |
| + |
| +- sequentially consistent atomic access: everything else. |
| + |
| + |
| +COMPILER MEMORY BARRIER |
| +======================= |
| + |
| +barrier() prevents the compiler from moving the memory accesses either |
| +side of it to the other side. The compiler barrier has no direct effect |
| +on the CPU, which may then reorder things however it wishes. |
| + |
| +barrier() is mostly used within qemu/atomic.h itself. On some |
| +architectures, CPU guarantees are strong enough that blocking compiler |
| +optimizations already ensures the correct order of execution. In this |
| +case, qemu/atomic.h will reduce stronger memory barriers to simple |
| +compiler barriers. |
| + |
| +Still, barrier() can be useful when writing code that can be interrupted |
| +by signal handlers. |
| + |
| + |
| +SEQUENTIALLY CONSISTENT ATOMIC ACCESS |
| +===================================== |
| + |
| +Most of the operations in the qemu/atomic.h header ensure *sequential |
| +consistency*, where "the result of any execution is the same as if the |
| +operations of all the processors were executed in some sequential order, |
| +and the operations of each individual processor appear in this sequence |
| +in the order specified by its program". |
| + |
| +qemu/atomic.h provides the following set of atomic read-modify-write |
| +operations: |
| + |
| + void atomic_inc(ptr) |
| + void atomic_dec(ptr) |
| + void atomic_add(ptr, val) |
| + void atomic_sub(ptr, val) |
| + void atomic_and(ptr, val) |
| + void atomic_or(ptr, val) |
| + |
| + typeof(*ptr) atomic_fetch_inc(ptr) |
| + typeof(*ptr) atomic_fetch_dec(ptr) |
| + typeof(*ptr) atomic_fetch_add(ptr, val) |
| + typeof(*ptr) atomic_fetch_sub(ptr, val) |
| + typeof(*ptr) atomic_fetch_and(ptr, val) |
| + typeof(*ptr) atomic_fetch_or(ptr, val) |
| + typeof(*ptr) atomic_xchg(ptr, val |
| + typeof(*ptr) atomic_cmpxchg(ptr, old, new) |
| + |
| +all of which return the old value of *ptr. These operations are |
| +polymorphic; they operate on any type that is as wide as an int. |
| + |
| +Sequentially consistent loads and stores can be done using: |
| + |
| + atomic_fetch_add(ptr, 0) for loads |
| + atomic_xchg(ptr, val) for stores |
| + |
| +However, they are quite expensive on some platforms, notably POWER and |
| +ARM. Therefore, qemu/atomic.h provides two primitives with slightly |
| +weaker constraints: |
| + |
| + typeof(*ptr) atomic_mb_read(ptr) |
| + void atomic_mb_set(ptr, val) |
| + |
| +The semantics of these primitives map to Java volatile variables, |
| +and are strongly related to memory barriers as used in the Linux |
| +kernel (see below). |
| + |
| +As long as you use atomic_mb_read and atomic_mb_set, accesses cannot |
| +be reordered with each other, and it is also not possible to reorder |
| +"normal" accesses around them. |
| + |
| +However, and this is the important difference between |
| +atomic_mb_read/atomic_mb_set and sequential consistency, it is important |
| +for both threads to access the same volatile variable. It is not the |
| +case that everything visible to thread A when it writes volatile field f |
| +becomes visible to thread B after it reads volatile field g. The store |
| +and load have to "match" (i.e., be performed on the same volatile |
| +field) to achieve the right semantics. |
| + |
| + |
| +These operations operate on any type that is as wide as an int or smaller. |
| + |
| + |
| +WEAK ATOMIC ACCESS AND MANUAL MEMORY BARRIERS |
| +============================================= |
| + |
| +Compared to sequentially consistent atomic access, programming with |
| +weaker consistency models can be considerably more complicated. |
| +In general, if the algorithm you are writing includes both writes |
| +and reads on the same side, it is generally simpler to use sequentially |
| +consistent primitives. |
| + |
| +When using this model, variables are accessed with atomic_read() and |
| +atomic_set(), and restrictions to the ordering of accesses is enforced |
| +using the smp_rmb(), smp_wmb(), smp_mb() and smp_read_barrier_depends() |
| +memory barriers. |
| + |
| +atomic_read() and atomic_set() prevents the compiler from using |
| +optimizations that might otherwise optimize accesses out of existence |
| +on the one hand, or that might create unsolicited accesses on the other. |
| +In general this should not have any effect, because the same compiler |
| +barriers are already implied by memory barriers. However, it is useful |
| +to do so, because it tells readers which variables are shared with |
| +other threads, and which are local to the current thread or protected |
| +by other, more mundane means. |
| + |
| +Memory barriers control the order of references to shared memory. |
| +They come in four kinds: |
| + |
| +- smp_rmb() guarantees that all the LOAD operations specified before |
| + the barrier will appear to happen before all the LOAD operations |
| + specified after the barrier with respect to the other components of |
| + the system. |
| + |
| + In other words, smp_rmb() puts a partial ordering on loads, but is not |
| + required to have any effect on stores. |
| + |
| +- smp_wmb() guarantees that all the STORE operations specified before |
| + the barrier will appear to happen before all the STORE operations |
| + specified after the barrier with respect to the other components of |
| + the system. |
| + |
| + In other words, smp_wmb() puts a partial ordering on stores, but is not |
| + required to have any effect on loads. |
| + |
| +- smp_mb() guarantees that all the LOAD and STORE operations specified |
| + before the barrier will appear to happen before all the LOAD and |
| + STORE operations specified after the barrier with respect to the other |
| + components of the system. |
| + |
| + smp_mb() puts a partial ordering on both loads and stores. It is |
| + stronger than both a read and a write memory barrier; it implies both |
| + smp_rmb() and smp_wmb(), but it also prevents STOREs coming before the |
| + barrier from overtaking LOADs coming after the barrier and vice versa. |
| + |
| +- smp_read_barrier_depends() is a weaker kind of read barrier. On |
| + most processors, whenever two loads are performed such that the |
| + second depends on the result of the first (e.g., the first load |
| + retrieves the address to which the second load will be directed), |
| + the processor will guarantee that the first LOAD will appear to happen |
| + before the second with respect to the other components of the system. |
| + However, this is not always true---for example, it was not true on |
| + Alpha processors. Whenever this kind of access happens to shared |
| + memory (that is not protected by a lock), a read barrier is needed, |
| + and smp_read_barrier_depends() can be used instead of smp_rmb(). |
| + |
| + Note that the first load really has to have a _data_ dependency and not |
| + a control dependency. If the address for the second load is dependent |
| + on the first load, but the dependency is through a conditional rather |
| + than actually loading the address itself, then it's a _control_ |
| + dependency and a full read barrier or better is required. |
| + |
| + |
| +This is the set of barriers that is required *between* two atomic_read() |
| +and atomic_set() operations to achieve sequential consistency: |
| + |
| + | 2nd operation | |
| + |-----------------------------------------| |
| + 1st operation | (after last) | atomic_read | atomic_set | |
| + ---------------+--------------+-------------+------------| |
| + (before first) | | none | smp_wmb() | |
| + ---------------+--------------+-------------+------------| |
| + atomic_read | smp_rmb() | smp_rmb()* | ** | |
| + ---------------+--------------+-------------+------------| |
| + atomic_set | none | smp_mb()*** | smp_wmb() | |
| + ---------------+--------------+-------------+------------| |
| + |
| + * Or smp_read_barrier_depends(). |
| + |
| + ** This requires a load-store barrier. How to achieve this varies |
| + depending on the machine, but in practice smp_rmb()+smp_wmb() |
| + should have the desired effect. For example, on PowerPC the |
| + lwsync instruction is a combined load-load, load-store and |
| + store-store barrier. |
| + |
| + *** This requires a store-load barrier. On most machines, the only |
| + way to achieve this is a full barrier. |
| + |
| + |
| +You can see that the two possible definitions of atomic_mb_read() |
| +and atomic_mb_set() are the following: |
| + |
| + 1) atomic_mb_read(p) = atomic_read(p); smp_rmb() |
| + atomic_mb_set(p, v) = smp_wmb(); atomic_set(p, v); smp_mb() |
| + |
| + 2) atomic_mb_read(p) = smp_mb() atomic_read(p); smp_rmb() |
| + atomic_mb_set(p, v) = smp_wmb(); atomic_set(p, v); |
| + |
| +Usually the former is used, because smp_mb() is expensive and a program |
| +normally has more reads than writes. Therefore it makes more sense to |
| +make atomic_mb_set() the more expensive operation. |
| + |
| +There are two common cases in which atomic_mb_read and atomic_mb_set |
| +generate too many memory barriers, and thus it can be useful to manually |
| +place barriers instead: |
| + |
| +- when a data structure has one thread that is always a writer |
| + and one thread that is always a reader, manual placement of |
| + memory barriers makes the write side faster. Furthermore, |
| + correctness is easy to check for in this case using the "pairing" |
| + trick that is explained below: |
| + |
| + thread 1 thread 1 |
| + ------------------------- ------------------------ |
| + (other writes) |
| + smp_wmb() |
| + atomic_mb_set(&a, x) atomic_set(&a, x) |
| + smp_wmb() |
| + atomic_mb_set(&b, y) atomic_set(&b, y) |
| + |
| + => |
| + thread 2 thread 2 |
| + ------------------------- ------------------------ |
| + y = atomic_mb_read(&b) y = atomic_read(&b) |
| + smp_rmb() |
| + x = atomic_mb_read(&a) x = atomic_read(&a) |
| + smp_rmb() |
| + |
| +- sometimes, a thread is accessing many variables that are otherwise |
| + unrelated to each other (for example because, apart from the current |
| + thread, exactly one other thread will read or write each of these |
| + variables). In this case, it is possible to "hoist" the implicit |
| + barriers provided by atomic_mb_read() and atomic_mb_set() outside |
| + a loop. For example, the above definition atomic_mb_read() gives |
| + the following transformation: |
| + |
| + n = 0; n = 0; |
| + for (i = 0; i < 10; i++) => for (i = 0; i < 10; i++) |
| + n += atomic_mb_read(&a[i]); n += atomic_read(&a[i]); |
| + smp_rmb(); |
| + |
| + Similarly, atomic_mb_set() can be transformed as follows: |
| + smp_mb(): |
| + |
| + smp_wmb(); |
| + for (i = 0; i < 10; i++) => for (i = 0; i < 10; i++) |
| + atomic_mb_set(&a[i], false); atomic_set(&a[i], false); |
| + smp_mb(); |
| + |
| + |
| +The two tricks can be combined. In this case, splitting a loop in |
| +two lets you hoist the barriers out of the loops _and_ eliminate the |
| +expensive smp_mb(): |
| + |
| + smp_wmb(); |
| + for (i = 0; i < 10; i++) { => for (i = 0; i < 10; i++) |
| + atomic_mb_set(&a[i], false); atomic_set(&a[i], false); |
| + atomic_mb_set(&b[i], false); smb_wmb(); |
| + } for (i = 0; i < 10; i++) |
| + atomic_set(&a[i], false); |
| + smp_mb(); |
| + |
| + The other thread can still use atomic_mb_read()/atomic_mb_set() |
| + |
| + |
| +Memory barrier pairing |
| +---------------------- |
| + |
| +A useful rule of thumb is that memory barriers should always, or almost |
| +always, be paired with another barrier. In the case of QEMU, however, |
| +note that the other barrier may actually be in a driver that runs in |
| +the guest! |
| + |
| +For the purposes of pairing, smp_read_barrier_depends() and smp_rmb() |
| +both count as read barriers. A read barriers shall pair with a write |
| +barrier or a full barrier; a write barrier shall pair with a read |
| +barrier or a full barrier. A full barrier can pair with anything. |
| +For example: |
| + |
| + thread 1 thread 2 |
| + =============== =============== |
| + a = 1; |
| + smp_wmb(); |
| + b = 2; x = b; |
| + smp_rmb(); |
| + y = a; |
| + |
| +Note that the "writing" thread are accessing the variables in the |
| +opposite order as the "reading" thread. This is expected: stores |
| +before the write barrier will normally match the loads after the |
| +read barrier, and vice versa. The same is true for more than 2 |
| +access and for data dependency barriers: |
| + |
| + thread 1 thread 2 |
| + =============== =============== |
| + b[2] = 1; |
| + smp_wmb(); |
| + x->i = 2; |
| + smp_wmb(); |
| + a = x; x = a; |
| + smp_read_barrier_depends(); |
| + y = x->i; |
| + smp_read_barrier_depends(); |
| + z = b[y]; |
| + |
| +smp_wmb() also pairs with atomic_mb_read(), and smp_rmb() also pairs |
| +with atomic_mb_set(). |
| + |
| + |
| +COMPARISON WITH LINUX KERNEL MEMORY BARRIERS |
| +============================================ |
| + |
| +Here is a list of differences between Linux kernel atomic operations |
| +and memory barriers, and the equivalents in QEMU: |
| + |
| +- atomic operations in Linux are always on a 32-bit int type and |
| + use a boxed atomic_t type; atomic operations in QEMU are polymorphic |
| + and use normal C types. |
| + |
| +- atomic_read and atomic_set in Linux give no guarantee at all; |
| + atomic_read and atomic_set in QEMU include a compiler barrier |
| + (similar to the ACCESS_ONCE macro in Linux). |
| + |
| +- most atomic read-modify-write operations in Linux return void; |
| + in QEMU, all of them return the old value of the variable. |
| + |
| +- different atomic read-modify-write operations in Linux imply |
| + a different set of memory barriers; in QEMU, all of them enforce |
| + sequential consistency, which means they imply full memory barriers |
| + before and after the operation. |
| + |
| +- Linux does not have an equivalent of atomic_mb_read() and |
| + atomic_mb_set(). In particular, note that set_mb() is a little |
| + weaker than atomic_mb_set(). |
| + |
| + |
| +SOURCES |
| +======= |
| + |
| +* Documentation/memory-barriers.txt from the Linux kernel |
| + |
| +* "The JSR-133 Cookbook for Compiler Writers", available at |
| + http://g.oswego.edu/dl/jmm/cookbook.html |
| diff --git a/hw/display/qxl.c b/hw/display/qxl.c |
| index ea985d2..830b3c5 100644 |
| |
| |
| @@ -23,6 +23,7 @@ |
| #include "qemu-common.h" |
| #include "qemu/timer.h" |
| #include "qemu/queue.h" |
| +#include "qemu/atomic.h" |
| #include "monitor/monitor.h" |
| #include "sysemu/sysemu.h" |
| #include "trace.h" |
| @@ -1726,7 +1727,7 @@ static void qxl_send_events(PCIQXLDevice *d, uint32_t events) |
| trace_qxl_send_events_vm_stopped(d->id, events); |
| return; |
| } |
| - old_pending = __sync_fetch_and_or(&d->ram->int_pending, le_events); |
| + old_pending = atomic_fetch_or(&d->ram->int_pending, le_events); |
| if ((old_pending & le_events) == le_events) { |
| return; |
| } |
| diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c |
| index 0dabf26..54aa569 100644 |
| |
| |
| @@ -16,6 +16,7 @@ |
| #include <sys/ioctl.h> |
| #include "hw/virtio/vhost.h" |
| #include "hw/hw.h" |
| +#include "qemu/atomic.h" |
| #include "qemu/range.h" |
| #include <linux/vhost.h> |
| #include "exec/address-spaces.h" |
| @@ -47,11 +48,9 @@ static void vhost_dev_sync_region(struct vhost_dev *dev, |
| addr += VHOST_LOG_CHUNK; |
| continue; |
| } |
| - /* Data must be read atomically. We don't really |
| - * need the barrier semantics of __sync |
| - * builtins, but it's easier to use them than |
| - * roll our own. */ |
| - log = __sync_fetch_and_and(from, 0); |
| + /* Data must be read atomically. We don't really need barrier semantics |
| + * but it's easier to use atomic_* than roll our own. */ |
| + log = atomic_xchg(from, 0); |
| while ((bit = sizeof(log) > sizeof(int) ? |
| ffsll(log) : ffs(log))) { |
| hwaddr page_addr; |
| diff --git a/include/qemu/atomic.h b/include/qemu/atomic.h |
| index 10becb6..0aa8913 100644 |
| |
| |
| @@ -1,68 +1,202 @@ |
| -#ifndef __QEMU_BARRIER_H |
| -#define __QEMU_BARRIER_H 1 |
| +/* |
| + * Simple interface for atomic operations. |
| + * |
| + * Copyright (C) 2013 Red Hat, Inc. |
| + * |
| + * Author: Paolo Bonzini <pbonzini@redhat.com> |
| + * |
| + * This work is licensed under the terms of the GNU GPL, version 2 or later. |
| + * See the COPYING file in the top-level directory. |
| + * |
| + */ |
| |
| -/* Compiler barrier */ |
| -#define barrier() asm volatile("" ::: "memory") |
| +#ifndef __QEMU_ATOMIC_H |
| +#define __QEMU_ATOMIC_H 1 |
| |
| -#if defined(__i386__) |
| +#include "qemu/compiler.h" |
| |
| -#include "qemu/compiler.h" /* QEMU_GNUC_PREREQ */ |
| +/* For C11 atomic ops */ |
| |
| -/* |
| - * Because of the strongly ordered x86 storage model, wmb() and rmb() are nops |
| - * on x86(well, a compiler barrier only). Well, at least as long as |
| - * qemu doesn't do accesses to write-combining memory or non-temporal |
| - * load/stores from C code. |
| - */ |
| -#define smp_wmb() barrier() |
| -#define smp_rmb() barrier() |
| +/* Compiler barrier */ |
| +#define barrier() ({ asm volatile("" ::: "memory"); (void)0; }) |
| + |
| +#ifndef __ATOMIC_RELAXED |
| |
| /* |
| - * We use GCC builtin if it's available, as that can use |
| - * mfence on 32 bit as well, e.g. if built with -march=pentium-m. |
| - * However, on i386, there seem to be known bugs as recently as 4.3. |
| - * */ |
| -#if QEMU_GNUC_PREREQ(4, 4) |
| -#define smp_mb() __sync_synchronize() |
| + * We use GCC builtin if it's available, as that can use mfence on |
| + * 32-bit as well, e.g. if built with -march=pentium-m. However, on |
| + * i386 the spec is buggy, and the implementation followed it until |
| + * 4.3 (http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36793). |
| + */ |
| +#if defined(__i386__) || defined(__x86_64__) |
| +#if !QEMU_GNUC_PREREQ(4, 4) |
| +#if defined __x86_64__ |
| +#define smp_mb() ({ asm volatile("mfence" ::: "memory"); (void)0; }) |
| #else |
| -#define smp_mb() asm volatile("lock; addl $0,0(%%esp) " ::: "memory") |
| +#define smp_mb() ({ asm volatile("lock; addl $0,0(%%esp) " ::: "memory"); (void)0; }) |
| +#endif |
| +#endif |
| +#endif |
| + |
| + |
| +#ifdef __alpha__ |
| +#define smp_read_barrier_depends() asm volatile("mb":::"memory") |
| #endif |
| |
| -#elif defined(__x86_64__) |
| +#if defined(__i386__) || defined(__x86_64__) || defined(__s390x__) |
| |
| +/* |
| + * Because of the strongly ordered storage model, wmb() and rmb() are nops |
| + * here (a compiler barrier only). QEMU doesn't do accesses to write-combining |
| + * qemu memory or non-temporal load/stores from C code. |
| + */ |
| #define smp_wmb() barrier() |
| #define smp_rmb() barrier() |
| -#define smp_mb() asm volatile("mfence" ::: "memory") |
| + |
| +/* |
| + * __sync_lock_test_and_set() is documented to be an acquire barrier only, |
| + * but it is a full barrier at the hardware level. Add a compiler barrier |
| + * to make it a full barrier also at the compiler level. |
| + */ |
| +#define atomic_xchg(ptr, i) (barrier(), __sync_lock_test_and_set(ptr, i)) |
| + |
| +/* |
| + * Load/store with Java volatile semantics. |
| + */ |
| +#define atomic_mb_set(ptr, i) ((void)atomic_xchg(ptr, i)) |
| |
| #elif defined(_ARCH_PPC) |
| |
| /* |
| * We use an eieio() for wmb() on powerpc. This assumes we don't |
| * need to order cacheable and non-cacheable stores with respect to |
| - * each other |
| + * each other. |
| + * |
| + * smp_mb has the same problem as on x86 for not-very-new GCC |
| + * (http://patchwork.ozlabs.org/patch/126184/, Nov 2011). |
| */ |
| -#define smp_wmb() asm volatile("eieio" ::: "memory") |
| - |
| +#define smp_wmb() ({ asm volatile("eieio" ::: "memory"); (void)0; }) |
| #if defined(__powerpc64__) |
| -#define smp_rmb() asm volatile("lwsync" ::: "memory") |
| +#define smp_rmb() ({ asm volatile("lwsync" ::: "memory"); (void)0; }) |
| #else |
| -#define smp_rmb() asm volatile("sync" ::: "memory") |
| +#define smp_rmb() ({ asm volatile("sync" ::: "memory"); (void)0; }) |
| #endif |
| +#define smp_mb() ({ asm volatile("sync" ::: "memory"); (void)0; }) |
| |
| -#define smp_mb() asm volatile("sync" ::: "memory") |
| +#endif /* _ARCH_PPC */ |
| |
| -#else |
| +#endif /* C11 atomics */ |
| |
| /* |
| * For (host) platforms we don't have explicit barrier definitions |
| * for, we use the gcc __sync_synchronize() primitive to generate a |
| * full barrier. This should be safe on all platforms, though it may |
| - * be overkill for wmb() and rmb(). |
| + * be overkill for smp_wmb() and smp_rmb(). |
| */ |
| +#ifndef smp_mb |
| +#define smp_mb() __sync_synchronize() |
| +#endif |
| + |
| +#ifndef smp_wmb |
| +#ifdef __ATOMIC_RELEASE |
| +#define smp_wmb() __atomic_thread_fence(__ATOMIC_RELEASE) |
| +#else |
| #define smp_wmb() __sync_synchronize() |
| -#define smp_mb() __sync_synchronize() |
| +#endif |
| +#endif |
| + |
| +#ifndef smp_rmb |
| +#ifdef __ATOMIC_ACQUIRE |
| +#define smp_rmb() __atomic_thread_fence(__ATOMIC_ACQUIRE) |
| +#else |
| #define smp_rmb() __sync_synchronize() |
| +#endif |
| +#endif |
| + |
| +#ifndef smp_read_barrier_depends |
| +#ifdef __ATOMIC_CONSUME |
| +#define smp_read_barrier_depends() __atomic_thread_fence(__ATOMIC_CONSUME) |
| +#else |
| +#define smp_read_barrier_depends() barrier() |
| +#endif |
| +#endif |
| |
| +#ifndef atomic_read |
| +#define atomic_read(ptr) (*(__typeof__(*ptr) *volatile) (ptr)) |
| #endif |
| |
| +#ifndef atomic_set |
| +#define atomic_set(ptr, i) ((*(__typeof__(*ptr) *volatile) (ptr)) = (i)) |
| +#endif |
| + |
| +/* These have the same semantics as Java volatile variables. |
| + * See http://gee.cs.oswego.edu/dl/jmm/cookbook.html: |
| + * "1. Issue a StoreStore barrier (wmb) before each volatile store." |
| + * 2. Issue a StoreLoad barrier after each volatile store. |
| + * Note that you could instead issue one before each volatile load, but |
| + * this would be slower for typical programs using volatiles in which |
| + * reads greatly outnumber writes. Alternatively, if available, you |
| + * can implement volatile store as an atomic instruction (for example |
| + * XCHG on x86) and omit the barrier. This may be more efficient if |
| + * atomic instructions are cheaper than StoreLoad barriers. |
| + * 3. Issue LoadLoad and LoadStore barriers after each volatile load." |
| + * |
| + * If you prefer to think in terms of "pairing" of memory barriers, |
| + * an atomic_mb_read pairs with an atomic_mb_set. |
| + * |
| + * And for the few ia64 lovers that exist, an atomic_mb_read is a ld.acq, |
| + * while an atomic_mb_set is a st.rel followed by a memory barrier. |
| + * |
| + * These are a bit weaker than __atomic_load/store with __ATOMIC_SEQ_CST |
| + * (see docs/atomics.txt), and I'm not sure that __ATOMIC_ACQ_REL is enough. |
| + * Just always use the barriers manually by the rules above. |
| + */ |
| +#ifndef atomic_mb_read |
| +#define atomic_mb_read(ptr) ({ \ |
| + typeof(*ptr) _val = atomic_read(ptr); \ |
| + smp_rmb(); \ |
| + _val; \ |
| +}) |
| +#endif |
| + |
| +#ifndef atomic_mb_set |
| +#define atomic_mb_set(ptr, i) do { \ |
| + smp_wmb(); \ |
| + atomic_set(ptr, i); \ |
| + smp_mb(); \ |
| +} while (0) |
| +#endif |
| + |
| +#ifndef atomic_xchg |
| +#ifdef __ATOMIC_SEQ_CST |
| +#define atomic_xchg(ptr, i) ({ \ |
| + typeof(*ptr) _new = (i), _old; \ |
| + __atomic_exchange(ptr, &_new, &_old, __ATOMIC_SEQ_CST); \ |
| + _old; \ |
| +}) |
| +#elif defined __clang__ |
| +#define atomic_xchg(ptr, i) __sync_exchange(ptr, i) |
| +#else |
| +/* __sync_lock_test_and_set() is documented to be an acquire barrier only. */ |
| +#define atomic_xchg(ptr, i) (smp_mb(), __sync_lock_test_and_set(ptr, i)) |
| +#endif |
| +#endif |
| + |
| +/* Provide shorter names for GCC atomic builtins. */ |
| +#define atomic_fetch_inc(ptr) __sync_fetch_and_add(ptr, 1) |
| +#define atomic_fetch_dec(ptr) __sync_fetch_and_add(ptr, -1) |
| +#define atomic_fetch_add __sync_fetch_and_add |
| +#define atomic_fetch_sub __sync_fetch_and_sub |
| +#define atomic_fetch_and __sync_fetch_and_and |
| +#define atomic_fetch_or __sync_fetch_and_or |
| +#define atomic_cmpxchg __sync_val_compare_and_swap |
| + |
| +/* And even shorter names that return void. */ |
| +#define atomic_inc(ptr) ((void) __sync_fetch_and_add(ptr, 1)) |
| +#define atomic_dec(ptr) ((void) __sync_fetch_and_add(ptr, -1)) |
| +#define atomic_add(ptr, n) ((void) __sync_fetch_and_add(ptr, n)) |
| +#define atomic_sub(ptr, n) ((void) __sync_fetch_and_sub(ptr, n)) |
| +#define atomic_and(ptr, n) ((void) __sync_fetch_and_and(ptr, n)) |
| +#define atomic_or(ptr, n) ((void) __sync_fetch_and_or(ptr, n)) |
| + |
| #endif |
| diff --git a/migration.c b/migration.c |
| index 46c633a..d91e702 100644 |
| |
| |
| @@ -291,8 +291,7 @@ static void migrate_fd_cleanup(void *opaque) |
| |
| static void migrate_finish_set_state(MigrationState *s, int new_state) |
| { |
| - if (__sync_val_compare_and_swap(&s->state, MIG_STATE_ACTIVE, |
| - new_state) == new_state) { |
| + if (atomic_cmpxchg(&s->state, MIG_STATE_ACTIVE, new_state) == new_state) { |
| trace_migrate_set_state(new_state); |
| } |
| } |
| diff --git a/tests/test-thread-pool.c b/tests/test-thread-pool.c |
| index 22915aa..b62338f 100644 |
| |
| |
| @@ -17,15 +17,15 @@ typedef struct { |
| static int worker_cb(void *opaque) |
| { |
| WorkerTestData *data = opaque; |
| - return __sync_fetch_and_add(&data->n, 1); |
| + return atomic_fetch_inc(&data->n); |
| } |
| |
| static int long_cb(void *opaque) |
| { |
| WorkerTestData *data = opaque; |
| - __sync_fetch_and_add(&data->n, 1); |
| + atomic_inc(&data->n); |
| g_usleep(2000000); |
| - __sync_fetch_and_add(&data->n, 1); |
| + atomic_inc(&data->n); |
| return 0; |
| } |
| |
| @@ -169,7 +169,7 @@ static void test_cancel(void) |
| /* Cancel the jobs that haven't been started yet. */ |
| num_canceled = 0; |
| for (i = 0; i < 100; i++) { |
| - if (__sync_val_compare_and_swap(&data[i].n, 0, 3) == 0) { |
| + if (atomic_cmpxchg(&data[i].n, 0, 3) == 0) { |
| data[i].ret = -ECANCELED; |
| bdrv_aio_cancel(data[i].aiocb); |
| active--; |
| -- |
| 1.7.11.7 |
| |