9ae3a8
From c5386144fbf09f628148101bc674e2421cdd16e3 Mon Sep 17 00:00:00 2001
9ae3a8
Message-Id: <c5386144fbf09f628148101bc674e2421cdd16e3.1387382496.git.minovotn@redhat.com>
9ae3a8
From: Nigel Croxon <ncroxon@redhat.com>
9ae3a8
Date: Thu, 14 Nov 2013 22:52:37 +0100
9ae3a8
Subject: [PATCH 01/46] add a header file for atomic operations
9ae3a8
9ae3a8
RH-Author: Nigel Croxon <ncroxon@redhat.com>
9ae3a8
Message-id: <1384469598-13137-2-git-send-email-ncroxon@redhat.com>
9ae3a8
Patchwork-id: 55686
9ae3a8
O-Subject: [RHEL7.0 PATCH 01/42] add a header file for atomic operations
9ae3a8
Bugzilla: 1011720
9ae3a8
RH-Acked-by: Orit Wasserman <owasserm@redhat.com>
9ae3a8
RH-Acked-by: Amit Shah <amit.shah@redhat.com>
9ae3a8
RH-Acked-by: Paolo Bonzini <pbonzini@redhat.com>
9ae3a8
9ae3a8
Bugzilla: 1011720
9ae3a8
https://bugzilla.redhat.com/show_bug.cgi?id=1011720
9ae3a8
9ae3a8
>From commit ID:
9ae3a8
commit 5444e768ee1abe6e021bece19a9a932351f88c88
9ae3a8
Author: Paolo Bonzini <pbonzini@redhat.com>
9ae3a8
Date:   Mon May 13 13:29:47 2013 +0200
9ae3a8
9ae3a8
    add a header file for atomic operations
9ae3a8
9ae3a8
    We're already using them in several places, but __sync builtins are just
9ae3a8
    too ugly to type, and do not provide seqcst load/store operations.
9ae3a8
9ae3a8
    Reviewed-by: Richard Henderson <rth@twiddle.net>
9ae3a8
    Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
9ae3a8
---
9ae3a8
 docs/atomics.txt         |  352 ++++++++++++++++++++++++++++++++++++++++++++++
9ae3a8
 hw/display/qxl.c         |    3 +-
9ae3a8
 hw/virtio/vhost.c        |    9 +-
9ae3a8
 include/qemu/atomic.h    |  198 ++++++++++++++++++++++----
9ae3a8
 migration.c              |    3 +-
9ae3a8
 tests/test-thread-pool.c |    8 +-
9ae3a8
 6 files changed, 529 insertions(+), 44 deletions(-)
9ae3a8
 create mode 100644 docs/atomics.txt
9ae3a8
9ae3a8
Signed-off-by: Michal Novotny <minovotn@redhat.com>
9ae3a8
---
9ae3a8
 docs/atomics.txt         | 352 +++++++++++++++++++++++++++++++++++++++++++++++
9ae3a8
 hw/display/qxl.c         |   3 +-
9ae3a8
 hw/virtio/vhost.c        |   9 +-
9ae3a8
 include/qemu/atomic.h    | 198 +++++++++++++++++++++-----
9ae3a8
 migration.c              |   3 +-
9ae3a8
 tests/test-thread-pool.c |   8 +-
9ae3a8
 6 files changed, 529 insertions(+), 44 deletions(-)
9ae3a8
 create mode 100644 docs/atomics.txt
9ae3a8
9ae3a8
diff --git a/docs/atomics.txt b/docs/atomics.txt
9ae3a8
new file mode 100644
9ae3a8
index 0000000..6f2997b
9ae3a8
--- /dev/null
9ae3a8
+++ b/docs/atomics.txt
9ae3a8
@@ -0,0 +1,352 @@
9ae3a8
+CPUs perform independent memory operations effectively in random order.
9ae3a8
+but this can be a problem for CPU-CPU interaction (including interactions
9ae3a8
+between QEMU and the guest).  Multi-threaded programs use various tools
9ae3a8
+to instruct the compiler and the CPU to restrict the order to something
9ae3a8
+that is consistent with the expectations of the programmer.
9ae3a8
+
9ae3a8
+The most basic tool is locking.  Mutexes, condition variables and
9ae3a8
+semaphores are used in QEMU, and should be the default approach to
9ae3a8
+synchronization.  Anything else is considerably harder, but it's
9ae3a8
+also justified more often than one would like.  The two tools that
9ae3a8
+are provided by qemu/atomic.h are memory barriers and atomic operations.
9ae3a8
+
9ae3a8
+Macros defined by qemu/atomic.h fall in three camps:
9ae3a8
+
9ae3a8
+- compiler barriers: barrier();
9ae3a8
+
9ae3a8
+- weak atomic access and manual memory barriers: atomic_read(),
9ae3a8
+  atomic_set(), smp_rmb(), smp_wmb(), smp_mb(), smp_read_barrier_depends();
9ae3a8
+
9ae3a8
+- sequentially consistent atomic access: everything else.
9ae3a8
+
9ae3a8
+
9ae3a8
+COMPILER MEMORY BARRIER
9ae3a8
+=======================
9ae3a8
+
9ae3a8
+barrier() prevents the compiler from moving the memory accesses either
9ae3a8
+side of it to the other side.  The compiler barrier has no direct effect
9ae3a8
+on the CPU, which may then reorder things however it wishes.
9ae3a8
+
9ae3a8
+barrier() is mostly used within qemu/atomic.h itself.  On some
9ae3a8
+architectures, CPU guarantees are strong enough that blocking compiler
9ae3a8
+optimizations already ensures the correct order of execution.  In this
9ae3a8
+case, qemu/atomic.h will reduce stronger memory barriers to simple
9ae3a8
+compiler barriers.
9ae3a8
+
9ae3a8
+Still, barrier() can be useful when writing code that can be interrupted
9ae3a8
+by signal handlers.
9ae3a8
+
9ae3a8
+
9ae3a8
+SEQUENTIALLY CONSISTENT ATOMIC ACCESS
9ae3a8
+=====================================
9ae3a8
+
9ae3a8
+Most of the operations in the qemu/atomic.h header ensure *sequential
9ae3a8
+consistency*, where "the result of any execution is the same as if the
9ae3a8
+operations of all the processors were executed in some sequential order,
9ae3a8
+and the operations of each individual processor appear in this sequence
9ae3a8
+in the order specified by its program".
9ae3a8
+
9ae3a8
+qemu/atomic.h provides the following set of atomic read-modify-write
9ae3a8
+operations:
9ae3a8
+
9ae3a8
+    void atomic_inc(ptr)
9ae3a8
+    void atomic_dec(ptr)
9ae3a8
+    void atomic_add(ptr, val)
9ae3a8
+    void atomic_sub(ptr, val)
9ae3a8
+    void atomic_and(ptr, val)
9ae3a8
+    void atomic_or(ptr, val)
9ae3a8
+
9ae3a8
+    typeof(*ptr) atomic_fetch_inc(ptr)
9ae3a8
+    typeof(*ptr) atomic_fetch_dec(ptr)
9ae3a8
+    typeof(*ptr) atomic_fetch_add(ptr, val)
9ae3a8
+    typeof(*ptr) atomic_fetch_sub(ptr, val)
9ae3a8
+    typeof(*ptr) atomic_fetch_and(ptr, val)
9ae3a8
+    typeof(*ptr) atomic_fetch_or(ptr, val)
9ae3a8
+    typeof(*ptr) atomic_xchg(ptr, val
9ae3a8
+    typeof(*ptr) atomic_cmpxchg(ptr, old, new)
9ae3a8
+
9ae3a8
+all of which return the old value of *ptr.  These operations are
9ae3a8
+polymorphic; they operate on any type that is as wide as an int.
9ae3a8
+
9ae3a8
+Sequentially consistent loads and stores can be done using:
9ae3a8
+
9ae3a8
+    atomic_fetch_add(ptr, 0) for loads
9ae3a8
+    atomic_xchg(ptr, val) for stores
9ae3a8
+
9ae3a8
+However, they are quite expensive on some platforms, notably POWER and
9ae3a8
+ARM.  Therefore, qemu/atomic.h provides two primitives with slightly
9ae3a8
+weaker constraints:
9ae3a8
+
9ae3a8
+    typeof(*ptr) atomic_mb_read(ptr)
9ae3a8
+    void         atomic_mb_set(ptr, val)
9ae3a8
+
9ae3a8
+The semantics of these primitives map to Java volatile variables,
9ae3a8
+and are strongly related to memory barriers as used in the Linux
9ae3a8
+kernel (see below).
9ae3a8
+
9ae3a8
+As long as you use atomic_mb_read and atomic_mb_set, accesses cannot
9ae3a8
+be reordered with each other, and it is also not possible to reorder
9ae3a8
+"normal" accesses around them.
9ae3a8
+
9ae3a8
+However, and this is the important difference between
9ae3a8
+atomic_mb_read/atomic_mb_set and sequential consistency, it is important
9ae3a8
+for both threads to access the same volatile variable.  It is not the
9ae3a8
+case that everything visible to thread A when it writes volatile field f
9ae3a8
+becomes visible to thread B after it reads volatile field g. The store
9ae3a8
+and load have to "match" (i.e., be performed on the same volatile
9ae3a8
+field) to achieve the right semantics.
9ae3a8
+
9ae3a8
+
9ae3a8
+These operations operate on any type that is as wide as an int or smaller.
9ae3a8
+
9ae3a8
+
9ae3a8
+WEAK ATOMIC ACCESS AND MANUAL MEMORY BARRIERS
9ae3a8
+=============================================
9ae3a8
+
9ae3a8
+Compared to sequentially consistent atomic access, programming with
9ae3a8
+weaker consistency models can be considerably more complicated.
9ae3a8
+In general, if the algorithm you are writing includes both writes
9ae3a8
+and reads on the same side, it is generally simpler to use sequentially
9ae3a8
+consistent primitives.
9ae3a8
+
9ae3a8
+When using this model, variables are accessed with atomic_read() and
9ae3a8
+atomic_set(), and restrictions to the ordering of accesses is enforced
9ae3a8
+using the smp_rmb(), smp_wmb(), smp_mb() and smp_read_barrier_depends()
9ae3a8
+memory barriers.
9ae3a8
+
9ae3a8
+atomic_read() and atomic_set() prevents the compiler from using
9ae3a8
+optimizations that might otherwise optimize accesses out of existence
9ae3a8
+on the one hand, or that might create unsolicited accesses on the other.
9ae3a8
+In general this should not have any effect, because the same compiler
9ae3a8
+barriers are already implied by memory barriers.  However, it is useful
9ae3a8
+to do so, because it tells readers which variables are shared with
9ae3a8
+other threads, and which are local to the current thread or protected
9ae3a8
+by other, more mundane means.
9ae3a8
+
9ae3a8
+Memory barriers control the order of references to shared memory.
9ae3a8
+They come in four kinds:
9ae3a8
+
9ae3a8
+- smp_rmb() guarantees that all the LOAD operations specified before
9ae3a8
+  the barrier will appear to happen before all the LOAD operations
9ae3a8
+  specified after the barrier with respect to the other components of
9ae3a8
+  the system.
9ae3a8
+
9ae3a8
+  In other words, smp_rmb() puts a partial ordering on loads, but is not
9ae3a8
+  required to have any effect on stores.
9ae3a8
+
9ae3a8
+- smp_wmb() guarantees that all the STORE operations specified before
9ae3a8
+  the barrier will appear to happen before all the STORE operations
9ae3a8
+  specified after the barrier with respect to the other components of
9ae3a8
+  the system.
9ae3a8
+
9ae3a8
+  In other words, smp_wmb() puts a partial ordering on stores, but is not
9ae3a8
+  required to have any effect on loads.
9ae3a8
+
9ae3a8
+- smp_mb() guarantees that all the LOAD and STORE operations specified
9ae3a8
+  before the barrier will appear to happen before all the LOAD and
9ae3a8
+  STORE operations specified after the barrier with respect to the other
9ae3a8
+  components of the system.
9ae3a8
+
9ae3a8
+  smp_mb() puts a partial ordering on both loads and stores.  It is
9ae3a8
+  stronger than both a read and a write memory barrier; it implies both
9ae3a8
+  smp_rmb() and smp_wmb(), but it also prevents STOREs coming before the
9ae3a8
+  barrier from overtaking LOADs coming after the barrier and vice versa.
9ae3a8
+
9ae3a8
+- smp_read_barrier_depends() is a weaker kind of read barrier.  On
9ae3a8
+  most processors, whenever two loads are performed such that the
9ae3a8
+  second depends on the result of the first (e.g., the first load
9ae3a8
+  retrieves the address to which the second load will be directed),
9ae3a8
+  the processor will guarantee that the first LOAD will appear to happen
9ae3a8
+  before the second with respect to the other components of the system.
9ae3a8
+  However, this is not always true---for example, it was not true on
9ae3a8
+  Alpha processors.  Whenever this kind of access happens to shared
9ae3a8
+  memory (that is not protected by a lock), a read barrier is needed,
9ae3a8
+  and smp_read_barrier_depends() can be used instead of smp_rmb().
9ae3a8
+
9ae3a8
+  Note that the first load really has to have a _data_ dependency and not
9ae3a8
+  a control dependency.  If the address for the second load is dependent
9ae3a8
+  on the first load, but the dependency is through a conditional rather
9ae3a8
+  than actually loading the address itself, then it's a _control_
9ae3a8
+  dependency and a full read barrier or better is required.
9ae3a8
+
9ae3a8
+
9ae3a8
+This is the set of barriers that is required *between* two atomic_read()
9ae3a8
+and atomic_set() operations to achieve sequential consistency:
9ae3a8
+
9ae3a8
+                    |               2nd operation             |
9ae3a8
+                    |-----------------------------------------|
9ae3a8
+     1st operation  | (after last) | atomic_read | atomic_set |
9ae3a8
+     ---------------+--------------+-------------+------------|
9ae3a8
+     (before first) |              | none        | smp_wmb()  |
9ae3a8
+     ---------------+--------------+-------------+------------|
9ae3a8
+     atomic_read    | smp_rmb()    | smp_rmb()*  | **         |
9ae3a8
+     ---------------+--------------+-------------+------------|
9ae3a8
+     atomic_set     | none         | smp_mb()*** | smp_wmb()  |
9ae3a8
+     ---------------+--------------+-------------+------------|
9ae3a8
+
9ae3a8
+       * Or smp_read_barrier_depends().
9ae3a8
+
9ae3a8
+      ** This requires a load-store barrier.  How to achieve this varies
9ae3a8
+         depending on the machine, but in practice smp_rmb()+smp_wmb()
9ae3a8
+         should have the desired effect.  For example, on PowerPC the
9ae3a8
+         lwsync instruction is a combined load-load, load-store and
9ae3a8
+         store-store barrier.
9ae3a8
+
9ae3a8
+     *** This requires a store-load barrier.  On most machines, the only
9ae3a8
+         way to achieve this is a full barrier.
9ae3a8
+
9ae3a8
+
9ae3a8
+You can see that the two possible definitions of atomic_mb_read()
9ae3a8
+and atomic_mb_set() are the following:
9ae3a8
+
9ae3a8
+    1) atomic_mb_read(p)   = atomic_read(p); smp_rmb()
9ae3a8
+       atomic_mb_set(p, v) = smp_wmb(); atomic_set(p, v); smp_mb()
9ae3a8
+
9ae3a8
+    2) atomic_mb_read(p)   = smp_mb() atomic_read(p); smp_rmb()
9ae3a8
+       atomic_mb_set(p, v) = smp_wmb(); atomic_set(p, v);
9ae3a8
+
9ae3a8
+Usually the former is used, because smp_mb() is expensive and a program
9ae3a8
+normally has more reads than writes.  Therefore it makes more sense to
9ae3a8
+make atomic_mb_set() the more expensive operation.
9ae3a8
+
9ae3a8
+There are two common cases in which atomic_mb_read and atomic_mb_set
9ae3a8
+generate too many memory barriers, and thus it can be useful to manually
9ae3a8
+place barriers instead:
9ae3a8
+
9ae3a8
+- when a data structure has one thread that is always a writer
9ae3a8
+  and one thread that is always a reader, manual placement of
9ae3a8
+  memory barriers makes the write side faster.  Furthermore,
9ae3a8
+  correctness is easy to check for in this case using the "pairing"
9ae3a8
+  trick that is explained below:
9ae3a8
+
9ae3a8
+     thread 1                                thread 1
9ae3a8
+     -------------------------               ------------------------
9ae3a8
+     (other writes)
9ae3a8
+                                             smp_wmb()
9ae3a8
+     atomic_mb_set(&a, x)                    atomic_set(&a, x)
9ae3a8
+                                             smp_wmb()
9ae3a8
+     atomic_mb_set(&b, y)                    atomic_set(&b, y)
9ae3a8
+
9ae3a8
+                                       =>
9ae3a8
+     thread 2                                thread 2
9ae3a8
+     -------------------------               ------------------------
9ae3a8
+     y = atomic_mb_read(&b)                  y = atomic_read(&b)
9ae3a8
+                                             smp_rmb()
9ae3a8
+     x = atomic_mb_read(&a)                  x = atomic_read(&a)
9ae3a8
+                                             smp_rmb()
9ae3a8
+
9ae3a8
+- sometimes, a thread is accessing many variables that are otherwise
9ae3a8
+  unrelated to each other (for example because, apart from the current
9ae3a8
+  thread, exactly one other thread will read or write each of these
9ae3a8
+  variables).  In this case, it is possible to "hoist" the implicit
9ae3a8
+  barriers provided by atomic_mb_read() and atomic_mb_set() outside
9ae3a8
+  a loop.  For example, the above definition atomic_mb_read() gives
9ae3a8
+  the following transformation:
9ae3a8
+
9ae3a8
+     n = 0;                                  n = 0;
9ae3a8
+     for (i = 0; i < 10; i++)          =>    for (i = 0; i < 10; i++)
9ae3a8
+       n += atomic_mb_read(&a[i]);             n += atomic_read(&a[i]);
9ae3a8
+                                             smp_rmb();
9ae3a8
+
9ae3a8
+  Similarly, atomic_mb_set() can be transformed as follows:
9ae3a8
+  smp_mb():
9ae3a8
+
9ae3a8
+                                             smp_wmb();
9ae3a8
+     for (i = 0; i < 10; i++)          =>    for (i = 0; i < 10; i++)
9ae3a8
+       atomic_mb_set(&a[i], false);            atomic_set(&a[i], false);
9ae3a8
+                                             smp_mb();
9ae3a8
+
9ae3a8
+
9ae3a8
+The two tricks can be combined.  In this case, splitting a loop in
9ae3a8
+two lets you hoist the barriers out of the loops _and_ eliminate the
9ae3a8
+expensive smp_mb():
9ae3a8
+
9ae3a8
+                                             smp_wmb();
9ae3a8
+     for (i = 0; i < 10; i++) {        =>    for (i = 0; i < 10; i++)
9ae3a8
+       atomic_mb_set(&a[i], false);            atomic_set(&a[i], false);
9ae3a8
+       atomic_mb_set(&b[i], false);          smb_wmb();
9ae3a8
+     }                                       for (i = 0; i < 10; i++)
9ae3a8
+                                               atomic_set(&a[i], false);
9ae3a8
+                                             smp_mb();
9ae3a8
+
9ae3a8
+  The other thread can still use atomic_mb_read()/atomic_mb_set()
9ae3a8
+
9ae3a8
+
9ae3a8
+Memory barrier pairing
9ae3a8
+----------------------
9ae3a8
+
9ae3a8
+A useful rule of thumb is that memory barriers should always, or almost
9ae3a8
+always, be paired with another barrier.  In the case of QEMU, however,
9ae3a8
+note that the other barrier may actually be in a driver that runs in
9ae3a8
+the guest!
9ae3a8
+
9ae3a8
+For the purposes of pairing, smp_read_barrier_depends() and smp_rmb()
9ae3a8
+both count as read barriers.  A read barriers shall pair with a write
9ae3a8
+barrier or a full barrier; a write barrier shall pair with a read
9ae3a8
+barrier or a full barrier.  A full barrier can pair with anything.
9ae3a8
+For example:
9ae3a8
+
9ae3a8
+        thread 1             thread 2
9ae3a8
+        ===============      ===============
9ae3a8
+        a = 1;
9ae3a8
+        smp_wmb();
9ae3a8
+        b = 2;               x = b;
9ae3a8
+                             smp_rmb();
9ae3a8
+                             y = a;
9ae3a8
+
9ae3a8
+Note that the "writing" thread are accessing the variables in the
9ae3a8
+opposite order as the "reading" thread.  This is expected: stores
9ae3a8
+before the write barrier will normally match the loads after the
9ae3a8
+read barrier, and vice versa.  The same is true for more than 2
9ae3a8
+access and for data dependency barriers:
9ae3a8
+
9ae3a8
+        thread 1             thread 2
9ae3a8
+        ===============      ===============
9ae3a8
+        b[2] = 1;
9ae3a8
+        smp_wmb();
9ae3a8
+        x->i = 2;
9ae3a8
+        smp_wmb();
9ae3a8
+        a = x;               x = a;
9ae3a8
+                             smp_read_barrier_depends();
9ae3a8
+                             y = x->i;
9ae3a8
+                             smp_read_barrier_depends();
9ae3a8
+                             z = b[y];
9ae3a8
+
9ae3a8
+smp_wmb() also pairs with atomic_mb_read(), and smp_rmb() also pairs
9ae3a8
+with atomic_mb_set().
9ae3a8
+
9ae3a8
+
9ae3a8
+COMPARISON WITH LINUX KERNEL MEMORY BARRIERS
9ae3a8
+============================================
9ae3a8
+
9ae3a8
+Here is a list of differences between Linux kernel atomic operations
9ae3a8
+and memory barriers, and the equivalents in QEMU:
9ae3a8
+
9ae3a8
+- atomic operations in Linux are always on a 32-bit int type and
9ae3a8
+  use a boxed atomic_t type; atomic operations in QEMU are polymorphic
9ae3a8
+  and use normal C types.
9ae3a8
+
9ae3a8
+- atomic_read and atomic_set in Linux give no guarantee at all;
9ae3a8
+  atomic_read and atomic_set in QEMU include a compiler barrier
9ae3a8
+  (similar to the ACCESS_ONCE macro in Linux).
9ae3a8
+
9ae3a8
+- most atomic read-modify-write operations in Linux return void;
9ae3a8
+  in QEMU, all of them return the old value of the variable.
9ae3a8
+
9ae3a8
+- different atomic read-modify-write operations in Linux imply
9ae3a8
+  a different set of memory barriers; in QEMU, all of them enforce
9ae3a8
+  sequential consistency, which means they imply full memory barriers
9ae3a8
+  before and after the operation.
9ae3a8
+
9ae3a8
+- Linux does not have an equivalent of atomic_mb_read() and
9ae3a8
+  atomic_mb_set().  In particular, note that set_mb() is a little
9ae3a8
+  weaker than atomic_mb_set().
9ae3a8
+
9ae3a8
+
9ae3a8
+SOURCES
9ae3a8
+=======
9ae3a8
+
9ae3a8
+* Documentation/memory-barriers.txt from the Linux kernel
9ae3a8
+
9ae3a8
+* "The JSR-133 Cookbook for Compiler Writers", available at
9ae3a8
+  http://g.oswego.edu/dl/jmm/cookbook.html
9ae3a8
diff --git a/hw/display/qxl.c b/hw/display/qxl.c
9ae3a8
index ea985d2..830b3c5 100644
9ae3a8
--- a/hw/display/qxl.c
9ae3a8
+++ b/hw/display/qxl.c
9ae3a8
@@ -23,6 +23,7 @@
9ae3a8
 #include "qemu-common.h"
9ae3a8
 #include "qemu/timer.h"
9ae3a8
 #include "qemu/queue.h"
9ae3a8
+#include "qemu/atomic.h"
9ae3a8
 #include "monitor/monitor.h"
9ae3a8
 #include "sysemu/sysemu.h"
9ae3a8
 #include "trace.h"
9ae3a8
@@ -1726,7 +1727,7 @@ static void qxl_send_events(PCIQXLDevice *d, uint32_t events)
9ae3a8
         trace_qxl_send_events_vm_stopped(d->id, events);
9ae3a8
         return;
9ae3a8
     }
9ae3a8
-    old_pending = __sync_fetch_and_or(&d->ram->int_pending, le_events);
9ae3a8
+    old_pending = atomic_fetch_or(&d->ram->int_pending, le_events);
9ae3a8
     if ((old_pending & le_events) == le_events) {
9ae3a8
         return;
9ae3a8
     }
9ae3a8
diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
9ae3a8
index 0dabf26..54aa569 100644
9ae3a8
--- a/hw/virtio/vhost.c
9ae3a8
+++ b/hw/virtio/vhost.c
9ae3a8
@@ -16,6 +16,7 @@
9ae3a8
 #include <sys/ioctl.h>
9ae3a8
 #include "hw/virtio/vhost.h"
9ae3a8
 #include "hw/hw.h"
9ae3a8
+#include "qemu/atomic.h"
9ae3a8
 #include "qemu/range.h"
9ae3a8
 #include <linux/vhost.h>
9ae3a8
 #include "exec/address-spaces.h"
9ae3a8
@@ -47,11 +48,9 @@ static void vhost_dev_sync_region(struct vhost_dev *dev,
9ae3a8
             addr += VHOST_LOG_CHUNK;
9ae3a8
             continue;
9ae3a8
         }
9ae3a8
-        /* Data must be read atomically. We don't really
9ae3a8
-         * need the barrier semantics of __sync
9ae3a8
-         * builtins, but it's easier to use them than
9ae3a8
-         * roll our own. */
9ae3a8
-        log = __sync_fetch_and_and(from, 0);
9ae3a8
+        /* Data must be read atomically. We don't really need barrier semantics
9ae3a8
+         * but it's easier to use atomic_* than roll our own. */
9ae3a8
+        log = atomic_xchg(from, 0);
9ae3a8
         while ((bit = sizeof(log) > sizeof(int) ?
9ae3a8
                 ffsll(log) : ffs(log))) {
9ae3a8
             hwaddr page_addr;
9ae3a8
diff --git a/include/qemu/atomic.h b/include/qemu/atomic.h
9ae3a8
index 10becb6..0aa8913 100644
9ae3a8
--- a/include/qemu/atomic.h
9ae3a8
+++ b/include/qemu/atomic.h
9ae3a8
@@ -1,68 +1,202 @@
9ae3a8
-#ifndef __QEMU_BARRIER_H
9ae3a8
-#define __QEMU_BARRIER_H 1
9ae3a8
+/*
9ae3a8
+ * Simple interface for atomic operations.
9ae3a8
+ *
9ae3a8
+ * Copyright (C) 2013 Red Hat, Inc.
9ae3a8
+ *
9ae3a8
+ * Author: Paolo Bonzini <pbonzini@redhat.com>
9ae3a8
+ *
9ae3a8
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
9ae3a8
+ * See the COPYING file in the top-level directory.
9ae3a8
+ *
9ae3a8
+ */
9ae3a8
 
9ae3a8
-/* Compiler barrier */
9ae3a8
-#define barrier()   asm volatile("" ::: "memory")
9ae3a8
+#ifndef __QEMU_ATOMIC_H
9ae3a8
+#define __QEMU_ATOMIC_H 1
9ae3a8
 
9ae3a8
-#if defined(__i386__)
9ae3a8
+#include "qemu/compiler.h"
9ae3a8
 
9ae3a8
-#include "qemu/compiler.h"        /* QEMU_GNUC_PREREQ */
9ae3a8
+/* For C11 atomic ops */
9ae3a8
 
9ae3a8
-/*
9ae3a8
- * Because of the strongly ordered x86 storage model, wmb() and rmb() are nops
9ae3a8
- * on x86(well, a compiler barrier only).  Well, at least as long as
9ae3a8
- * qemu doesn't do accesses to write-combining memory or non-temporal
9ae3a8
- * load/stores from C code.
9ae3a8
- */
9ae3a8
-#define smp_wmb()   barrier()
9ae3a8
-#define smp_rmb()   barrier()
9ae3a8
+/* Compiler barrier */
9ae3a8
+#define barrier()   ({ asm volatile("" ::: "memory"); (void)0; })
9ae3a8
+
9ae3a8
+#ifndef __ATOMIC_RELAXED
9ae3a8
 
9ae3a8
 /*
9ae3a8
- * We use GCC builtin if it's available, as that can use
9ae3a8
- * mfence on 32 bit as well, e.g. if built with -march=pentium-m.
9ae3a8
- * However, on i386, there seem to be known bugs as recently as 4.3.
9ae3a8
- * */
9ae3a8
-#if QEMU_GNUC_PREREQ(4, 4)
9ae3a8
-#define smp_mb() __sync_synchronize()
9ae3a8
+ * We use GCC builtin if it's available, as that can use mfence on
9ae3a8
+ * 32-bit as well, e.g. if built with -march=pentium-m. However, on
9ae3a8
+ * i386 the spec is buggy, and the implementation followed it until
9ae3a8
+ * 4.3 (http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36793).
9ae3a8
+ */
9ae3a8
+#if defined(__i386__) || defined(__x86_64__)
9ae3a8
+#if !QEMU_GNUC_PREREQ(4, 4)
9ae3a8
+#if defined __x86_64__
9ae3a8
+#define smp_mb()    ({ asm volatile("mfence" ::: "memory"); (void)0; })
9ae3a8
 #else
9ae3a8
-#define smp_mb() asm volatile("lock; addl $0,0(%%esp) " ::: "memory")
9ae3a8
+#define smp_mb()    ({ asm volatile("lock; addl $0,0(%%esp) " ::: "memory"); (void)0; })
9ae3a8
+#endif
9ae3a8
+#endif
9ae3a8
+#endif
9ae3a8
+
9ae3a8
+
9ae3a8
+#ifdef __alpha__
9ae3a8
+#define smp_read_barrier_depends()   asm volatile("mb":::"memory")
9ae3a8
 #endif
9ae3a8
 
9ae3a8
-#elif defined(__x86_64__)
9ae3a8
+#if defined(__i386__) || defined(__x86_64__) || defined(__s390x__)
9ae3a8
 
9ae3a8
+/*
9ae3a8
+ * Because of the strongly ordered storage model, wmb() and rmb() are nops
9ae3a8
+ * here (a compiler barrier only).  QEMU doesn't do accesses to write-combining
9ae3a8
+ * qemu memory or non-temporal load/stores from C code.
9ae3a8
+ */
9ae3a8
 #define smp_wmb()   barrier()
9ae3a8
 #define smp_rmb()   barrier()
9ae3a8
-#define smp_mb() asm volatile("mfence" ::: "memory")
9ae3a8
+
9ae3a8
+/*
9ae3a8
+ * __sync_lock_test_and_set() is documented to be an acquire barrier only,
9ae3a8
+ * but it is a full barrier at the hardware level.  Add a compiler barrier
9ae3a8
+ * to make it a full barrier also at the compiler level.
9ae3a8
+ */
9ae3a8
+#define atomic_xchg(ptr, i)    (barrier(), __sync_lock_test_and_set(ptr, i))
9ae3a8
+
9ae3a8
+/*
9ae3a8
+ * Load/store with Java volatile semantics.
9ae3a8
+ */
9ae3a8
+#define atomic_mb_set(ptr, i)  ((void)atomic_xchg(ptr, i))
9ae3a8
 
9ae3a8
 #elif defined(_ARCH_PPC)
9ae3a8
 
9ae3a8
 /*
9ae3a8
  * We use an eieio() for wmb() on powerpc.  This assumes we don't
9ae3a8
  * need to order cacheable and non-cacheable stores with respect to
9ae3a8
- * each other
9ae3a8
+ * each other.
9ae3a8
+ *
9ae3a8
+ * smp_mb has the same problem as on x86 for not-very-new GCC
9ae3a8
+ * (http://patchwork.ozlabs.org/patch/126184/, Nov 2011).
9ae3a8
  */
9ae3a8
-#define smp_wmb()   asm volatile("eieio" ::: "memory")
9ae3a8
-
9ae3a8
+#define smp_wmb()   ({ asm volatile("eieio" ::: "memory"); (void)0; })
9ae3a8
 #if defined(__powerpc64__)
9ae3a8
-#define smp_rmb()   asm volatile("lwsync" ::: "memory")
9ae3a8
+#define smp_rmb()   ({ asm volatile("lwsync" ::: "memory"); (void)0; })
9ae3a8
 #else
9ae3a8
-#define smp_rmb()   asm volatile("sync" ::: "memory")
9ae3a8
+#define smp_rmb()   ({ asm volatile("sync" ::: "memory"); (void)0; })
9ae3a8
 #endif
9ae3a8
+#define smp_mb()    ({ asm volatile("sync" ::: "memory"); (void)0; })
9ae3a8
 
9ae3a8
-#define smp_mb()   asm volatile("sync" ::: "memory")
9ae3a8
+#endif /* _ARCH_PPC */
9ae3a8
 
9ae3a8
-#else
9ae3a8
+#endif /* C11 atomics */
9ae3a8
 
9ae3a8
 /*
9ae3a8
  * For (host) platforms we don't have explicit barrier definitions
9ae3a8
  * for, we use the gcc __sync_synchronize() primitive to generate a
9ae3a8
  * full barrier.  This should be safe on all platforms, though it may
9ae3a8
- * be overkill for wmb() and rmb().
9ae3a8
+ * be overkill for smp_wmb() and smp_rmb().
9ae3a8
  */
9ae3a8
+#ifndef smp_mb
9ae3a8
+#define smp_mb()    __sync_synchronize()
9ae3a8
+#endif
9ae3a8
+
9ae3a8
+#ifndef smp_wmb
9ae3a8
+#ifdef __ATOMIC_RELEASE
9ae3a8
+#define smp_wmb()   __atomic_thread_fence(__ATOMIC_RELEASE)
9ae3a8
+#else
9ae3a8
 #define smp_wmb()   __sync_synchronize()
9ae3a8
-#define smp_mb()   __sync_synchronize()
9ae3a8
+#endif
9ae3a8
+#endif
9ae3a8
+
9ae3a8
+#ifndef smp_rmb
9ae3a8
+#ifdef __ATOMIC_ACQUIRE
9ae3a8
+#define smp_rmb()   __atomic_thread_fence(__ATOMIC_ACQUIRE)
9ae3a8
+#else
9ae3a8
 #define smp_rmb()   __sync_synchronize()
9ae3a8
+#endif
9ae3a8
+#endif
9ae3a8
+
9ae3a8
+#ifndef smp_read_barrier_depends
9ae3a8
+#ifdef __ATOMIC_CONSUME
9ae3a8
+#define smp_read_barrier_depends()   __atomic_thread_fence(__ATOMIC_CONSUME)
9ae3a8
+#else
9ae3a8
+#define smp_read_barrier_depends()   barrier()
9ae3a8
+#endif
9ae3a8
+#endif
9ae3a8
 
9ae3a8
+#ifndef atomic_read
9ae3a8
+#define atomic_read(ptr)       (*(__typeof__(*ptr) *volatile) (ptr))
9ae3a8
 #endif
9ae3a8
 
9ae3a8
+#ifndef atomic_set
9ae3a8
+#define atomic_set(ptr, i)     ((*(__typeof__(*ptr) *volatile) (ptr)) = (i))
9ae3a8
+#endif
9ae3a8
+
9ae3a8
+/* These have the same semantics as Java volatile variables.
9ae3a8
+ * See http://gee.cs.oswego.edu/dl/jmm/cookbook.html:
9ae3a8
+ * "1. Issue a StoreStore barrier (wmb) before each volatile store."
9ae3a8
+ *  2. Issue a StoreLoad barrier after each volatile store.
9ae3a8
+ *     Note that you could instead issue one before each volatile load, but
9ae3a8
+ *     this would be slower for typical programs using volatiles in which
9ae3a8
+ *     reads greatly outnumber writes. Alternatively, if available, you
9ae3a8
+ *     can implement volatile store as an atomic instruction (for example
9ae3a8
+ *     XCHG on x86) and omit the barrier. This may be more efficient if
9ae3a8
+ *     atomic instructions are cheaper than StoreLoad barriers.
9ae3a8
+ *  3. Issue LoadLoad and LoadStore barriers after each volatile load."
9ae3a8
+ *
9ae3a8
+ * If you prefer to think in terms of "pairing" of memory barriers,
9ae3a8
+ * an atomic_mb_read pairs with an atomic_mb_set.
9ae3a8
+ *
9ae3a8
+ * And for the few ia64 lovers that exist, an atomic_mb_read is a ld.acq,
9ae3a8
+ * while an atomic_mb_set is a st.rel followed by a memory barrier.
9ae3a8
+ *
9ae3a8
+ * These are a bit weaker than __atomic_load/store with __ATOMIC_SEQ_CST
9ae3a8
+ * (see docs/atomics.txt), and I'm not sure that __ATOMIC_ACQ_REL is enough.
9ae3a8
+ * Just always use the barriers manually by the rules above.
9ae3a8
+ */
9ae3a8
+#ifndef atomic_mb_read
9ae3a8
+#define atomic_mb_read(ptr)    ({           \
9ae3a8
+    typeof(*ptr) _val = atomic_read(ptr);   \
9ae3a8
+    smp_rmb();                              \
9ae3a8
+    _val;                                   \
9ae3a8
+})
9ae3a8
+#endif
9ae3a8
+
9ae3a8
+#ifndef atomic_mb_set
9ae3a8
+#define atomic_mb_set(ptr, i)  do {         \
9ae3a8
+    smp_wmb();                              \
9ae3a8
+    atomic_set(ptr, i);                     \
9ae3a8
+    smp_mb();                               \
9ae3a8
+} while (0)
9ae3a8
+#endif
9ae3a8
+
9ae3a8
+#ifndef atomic_xchg
9ae3a8
+#ifdef __ATOMIC_SEQ_CST
9ae3a8
+#define atomic_xchg(ptr, i)    ({                           \
9ae3a8
+    typeof(*ptr) _new = (i), _old;                          \
9ae3a8
+    __atomic_exchange(ptr, &_new, &_old, __ATOMIC_SEQ_CST); \
9ae3a8
+    _old;                                                   \
9ae3a8
+})
9ae3a8
+#elif defined __clang__
9ae3a8
+#define atomic_xchg(ptr, i)    __sync_exchange(ptr, i)
9ae3a8
+#else
9ae3a8
+/* __sync_lock_test_and_set() is documented to be an acquire barrier only.  */
9ae3a8
+#define atomic_xchg(ptr, i)    (smp_mb(), __sync_lock_test_and_set(ptr, i))
9ae3a8
+#endif
9ae3a8
+#endif
9ae3a8
+
9ae3a8
+/* Provide shorter names for GCC atomic builtins.  */
9ae3a8
+#define atomic_fetch_inc(ptr)  __sync_fetch_and_add(ptr, 1)
9ae3a8
+#define atomic_fetch_dec(ptr)  __sync_fetch_and_add(ptr, -1)
9ae3a8
+#define atomic_fetch_add       __sync_fetch_and_add
9ae3a8
+#define atomic_fetch_sub       __sync_fetch_and_sub
9ae3a8
+#define atomic_fetch_and       __sync_fetch_and_and
9ae3a8
+#define atomic_fetch_or        __sync_fetch_and_or
9ae3a8
+#define atomic_cmpxchg         __sync_val_compare_and_swap
9ae3a8
+
9ae3a8
+/* And even shorter names that return void.  */
9ae3a8
+#define atomic_inc(ptr)        ((void) __sync_fetch_and_add(ptr, 1))
9ae3a8
+#define atomic_dec(ptr)        ((void) __sync_fetch_and_add(ptr, -1))
9ae3a8
+#define atomic_add(ptr, n)     ((void) __sync_fetch_and_add(ptr, n))
9ae3a8
+#define atomic_sub(ptr, n)     ((void) __sync_fetch_and_sub(ptr, n))
9ae3a8
+#define atomic_and(ptr, n)     ((void) __sync_fetch_and_and(ptr, n))
9ae3a8
+#define atomic_or(ptr, n)      ((void) __sync_fetch_and_or(ptr, n))
9ae3a8
+
9ae3a8
 #endif
9ae3a8
diff --git a/migration.c b/migration.c
9ae3a8
index 46c633a..d91e702 100644
9ae3a8
--- a/migration.c
9ae3a8
+++ b/migration.c
9ae3a8
@@ -291,8 +291,7 @@ static void migrate_fd_cleanup(void *opaque)
9ae3a8
 
9ae3a8
 static void migrate_finish_set_state(MigrationState *s, int new_state)
9ae3a8
 {
9ae3a8
-    if (__sync_val_compare_and_swap(&s->state, MIG_STATE_ACTIVE,
9ae3a8
-                                    new_state) == new_state) {
9ae3a8
+    if (atomic_cmpxchg(&s->state, MIG_STATE_ACTIVE, new_state) == new_state) {
9ae3a8
         trace_migrate_set_state(new_state);
9ae3a8
     }
9ae3a8
 }
9ae3a8
diff --git a/tests/test-thread-pool.c b/tests/test-thread-pool.c
9ae3a8
index 22915aa..b62338f 100644
9ae3a8
--- a/tests/test-thread-pool.c
9ae3a8
+++ b/tests/test-thread-pool.c
9ae3a8
@@ -17,15 +17,15 @@ typedef struct {
9ae3a8
 static int worker_cb(void *opaque)
9ae3a8
 {
9ae3a8
     WorkerTestData *data = opaque;
9ae3a8
-    return __sync_fetch_and_add(&data->n, 1);
9ae3a8
+    return atomic_fetch_inc(&data->n);
9ae3a8
 }
9ae3a8
 
9ae3a8
 static int long_cb(void *opaque)
9ae3a8
 {
9ae3a8
     WorkerTestData *data = opaque;
9ae3a8
-    __sync_fetch_and_add(&data->n, 1);
9ae3a8
+    atomic_inc(&data->n);
9ae3a8
     g_usleep(2000000);
9ae3a8
-    __sync_fetch_and_add(&data->n, 1);
9ae3a8
+    atomic_inc(&data->n);
9ae3a8
     return 0;
9ae3a8
 }
9ae3a8
 
9ae3a8
@@ -169,7 +169,7 @@ static void test_cancel(void)
9ae3a8
     /* Cancel the jobs that haven't been started yet.  */
9ae3a8
     num_canceled = 0;
9ae3a8
     for (i = 0; i < 100; i++) {
9ae3a8
-        if (__sync_val_compare_and_swap(&data[i].n, 0, 3) == 0) {
9ae3a8
+        if (atomic_cmpxchg(&data[i].n, 0, 3) == 0) {
9ae3a8
             data[i].ret = -ECANCELED;
9ae3a8
             bdrv_aio_cancel(data[i].aiocb);
9ae3a8
             active--;
9ae3a8
-- 
9ae3a8
1.7.11.7
9ae3a8