-rw-r--r--  cpu-exec.c                        9
-rw-r--r--  cpus.c                           17
-rw-r--r--  disas/s390.c                     35
-rw-r--r--  docs/rcu.txt                    387
-rw-r--r--  fpu/softfloat-macros.h           86
-rw-r--r--  fpu/softfloat-specialize.h      135
-rw-r--r--  fpu/softfloat.c                 186
-rw-r--r--  hw/9pfs/virtio-9p-synth.c         1
-rw-r--r--  hw/s390x/ipl.c                   47
-rw-r--r--  hw/s390x/s390-pci-bus.c           5
-rw-r--r--  hw/s390x/s390-pci-inst.c         28
-rw-r--r--  hw/scsi/scsi-bus.c                2
-rw-r--r--  hw/vfio/common.c                  2
-rw-r--r--  hw/vfio/pci.c                     1
-rw-r--r--  include/exec/memory.h             5
-rw-r--r--  include/fpu/softfloat.h          84
-rw-r--r--  include/qemu/atomic.h            61
-rw-r--r--  include/qemu/queue.h             13
-rw-r--r--  include/qemu/rcu.h              147
-rw-r--r--  include/qemu/thread.h             3
-rw-r--r--  include/qemu/timer.h              1
-rw-r--r--  memory.c                         65
-rw-r--r--  pc-bios/s390-ccw.img            bin 17752 -> 17752 bytes
-rw-r--r--  pc-bios/s390-ccw/bootmap.c        4
-rw-r--r--  pc-bios/s390-ccw/bootmap.h        2
-rw-r--r--  pc-bios/s390-ccw/main.c           2
-rw-r--r--  pc-bios/s390-ccw/s390-ccw.h       2
-rw-r--r--  pc-bios/s390-ccw/virtio.c         2
-rw-r--r--  target-s390x/cc_helper.c         18
-rw-r--r--  target-s390x/cpu.h                4
-rw-r--r--  target-s390x/helper.h             3
-rw-r--r--  target-s390x/insn-data.def       16
-rw-r--r--  target-s390x/kvm.c                4
-rw-r--r--  target-s390x/mem_helper.c        32
-rw-r--r--  target-s390x/translate.c         92
-rw-r--r--  tests/Makefile                    7
-rw-r--r--  tests/rcutorture.c              451
-rw-r--r--  util/Makefile.objs                1
-rw-r--r--  util/qemu-thread-posix.c          2
-rw-r--r--  util/rcu.c                      291
40 files changed, 1941 insertions, 312 deletions
diff --git a/cpu-exec.c b/cpu-exec.c
index a4f0effaf4..fa506e628a 100644
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -61,8 +61,7 @@ static void align_clocks(SyncClocks *sc, const CPUState *cpu)
         sleep_delay.tv_sec = sc->diff_clk / 1000000000LL;
         sleep_delay.tv_nsec = sc->diff_clk % 1000000000LL;
         if (nanosleep(&sleep_delay, &rem_delay) < 0) {
-            sc->diff_clk -= (sleep_delay.tv_sec - rem_delay.tv_sec) * 1000000000LL;
-            sc->diff_clk -= sleep_delay.tv_nsec - rem_delay.tv_nsec;
+            sc->diff_clk = rem_delay.tv_sec * 1000000000LL + rem_delay.tv_nsec;
         } else {
             sc->diff_clk = 0;
         }
@@ -101,10 +100,8 @@ static void init_delay_params(SyncClocks *sc,
     if (!icount_align_option) {
         return;
     }
-    sc->realtime_clock = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
-    sc->diff_clk = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) -
-                   sc->realtime_clock +
-                   cpu_get_clock_offset();
+    sc->realtime_clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL_RT);
+    sc->diff_clk = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) - sc->realtime_clock;
     sc->last_cpu_icount = cpu->icount_extra + cpu->icount_decr.u16.low;
     if (sc->diff_clk < max_delay) {
         max_delay = sc->diff_clk;
diff --git a/cpus.c b/cpus.c
index 3a5323b1fc..0cdd1d7156 100644
--- a/cpus.c
+++ b/cpus.c
@@ -229,23 +229,6 @@ int64_t cpu_get_clock(void)
     return ti;
 }
 
-/* return the offset between the host clock and virtual CPU clock */
-int64_t cpu_get_clock_offset(void)
-{
-    int64_t ti;
-    unsigned start;
-
-    do {
-        start = seqlock_read_begin(&timers_state.vm_clock_seqlock);
-        ti = timers_state.cpu_clock_offset;
-        if (!timers_state.cpu_ticks_enabled) {
-            ti -= get_clock();
-        }
-    } while (seqlock_read_retry(&timers_state.vm_clock_seqlock, start));
-
-    return -ti;
-}
-
 /* enable cpu_get_ticks()
  * Caller must hold BQL which server as mutex for vm_clock_seqlock.
  */
diff --git a/disas/s390.c b/disas/s390.c
index 25499ba419..974460c814 100644
--- a/disas/s390.c
+++ b/disas/s390.c
@@ -106,10 +106,6 @@ struct s390_opcode
 static const struct s390_opcode s390_opcodes[];
 static const int                s390_num_opcodes;
 
-/* A opcode format table for the .insn pseudo mnemonic.  */
-static const struct s390_opcode s390_opformats[];
-static const int                s390_num_opformats;
-
 /* Values defined for the flags field of a struct powerpc_opcode.  */
 
 /* The operands table is an array of struct s390_operand.  */
@@ -844,37 +840,6 @@ static const struct s390_operand s390_operands[] =
 #define MASK_SIY_DRI     { 0xff, 0x00, 0x00, 0x00, 0x00, 0xff }
 /* QEMU-END */
 
-/* The opcode formats table (blueprints for .insn pseudo mnemonic).  */
-
-static const struct s390_opcode s390_opformats[] =
-  {
-  { "e",	OP8(0x00LL),	MASK_E,		INSTR_E,	3, 0 },
-  { "ri",	OP8(0x00LL),	MASK_RI_RI,	INSTR_RI_RI,	3, 0 },
-  { "rie",	OP8(0x00LL),	MASK_RIE_RRP,	INSTR_RIE_RRP,	3, 0 },
-  { "ril",	OP8(0x00LL),	MASK_RIL_RP,	INSTR_RIL_RP,	3, 0 },
-  { "rilu",	OP8(0x00LL),	MASK_RIL_RU,	INSTR_RIL_RU,	3, 0 },
-  { "rr",	OP8(0x00LL),	MASK_RR_RR,	INSTR_RR_RR,	3, 0 },
-  { "rre",	OP8(0x00LL),	MASK_RRE_RR,	INSTR_RRE_RR,	3, 0 },
-  { "rrf",	OP8(0x00LL),	MASK_RRF_RURR,	INSTR_RRF_RURR,	3, 0 },
-  { "rs",	OP8(0x00LL),	MASK_RS_RRRD,	INSTR_RS_RRRD,	3, 0 },
-  { "rse",	OP8(0x00LL),	MASK_RSE_RRRD,	INSTR_RSE_RRRD,	3, 0 },
-  { "rsi",	OP8(0x00LL),	MASK_RSI_RRP,	INSTR_RSI_RRP,	3, 0 },
-  { "rsy",	OP8(0x00LL),	MASK_RSY_RRRD,	INSTR_RSY_RRRD,	3, 3 },
-  { "rx",	OP8(0x00LL),	MASK_RX_RRRD,	INSTR_RX_RRRD,	3, 0 },
-  { "rxe",	OP8(0x00LL),	MASK_RXE_RRRD,	INSTR_RXE_RRRD,	3, 0 },
-  { "rxf",	OP8(0x00LL),	MASK_RXF_RRRDR,	INSTR_RXF_RRRDR,3, 0 },
-  { "rxy",	OP8(0x00LL),	MASK_RXY_RRRD,	INSTR_RXY_RRRD,	3, 3 },
-  { "s",	OP8(0x00LL),	MASK_S_RD,	INSTR_S_RD,	3, 0 },
-  { "si",	OP8(0x00LL),	MASK_SI_URD,	INSTR_SI_URD,	3, 0 },
-  { "siy",	OP8(0x00LL),	MASK_SIY_URD,	INSTR_SIY_URD,	3, 3 },
-  { "ss",	OP8(0x00LL),	MASK_SS_RRRDRD,	INSTR_SS_RRRDRD,3, 0 },
-  { "sse",	OP8(0x00LL),	MASK_SSE_RDRD,	INSTR_SSE_RDRD,	3, 0 },
-  { "ssf",	OP8(0x00LL),	MASK_SSF_RRDRD,	INSTR_SSF_RRDRD,3, 0 },
-};
-
-static const int s390_num_opformats =
-  sizeof (s390_opformats) / sizeof (s390_opformats[0]);
-
 /* include "s390-opc.tab" generated from opcodes/s390-opc.txt rev 1.17 */
 /* The opcode table. This file was generated by s390-mkopc.
 
diff --git a/docs/rcu.txt b/docs/rcu.txt
new file mode 100644
index 0000000000..61752b93ab
--- /dev/null
+++ b/docs/rcu.txt
@@ -0,0 +1,387 @@
+Using RCU (Read-Copy-Update) for synchronization
+================================================
+
+Read-copy update (RCU) is a synchronization mechanism that is used to
+protect read-mostly data structures.  RCU is very efficient and scalable
+on the read side (it is wait-free), and thus can make the read paths
+extremely fast.
+
+RCU supports concurrency between a single writer and multiple readers,
+thus it is not used alone.  Typically, the write-side will use a lock to
+serialize multiple updates, but other approaches are possible (e.g.,
+restricting updates to a single task).  In QEMU, when a lock is used,
+this will often be the "iothread mutex", also known as the "big QEMU
+lock" (BQL).  Also, restricting updates to a single task is done in
+QEMU using the "bottom half" API.
+
+RCU is fundamentally a "wait-to-finish" mechanism.  The read side marks
+sections of code with "critical sections", and the update side will wait
+for the execution of all *currently running* critical sections before
+proceeding, or before asynchronously executing a callback.
+
+The key point here is that only the currently running critical sections
+are waited for; critical sections that are started _after_ the beginning
+of the wait do not extend the wait, despite running concurrently with
+the updater.  This is the reason why RCU is more scalable than,
+for example, reader-writer locks.  It is so much more scalable that
+the system will have a single instance of the RCU mechanism; a single
+mechanism can be used for an arbitrary number of "things", without
+having to worry about things such as contention or deadlocks.
+
+How is this possible?  The basic idea is to split updates into two phases,
+"removal" and "reclamation".  During removal, we ensure that subsequent
+readers will not be able to get a reference to the old data.  After
+removal has completed, a critical section will not be able to access
+the old data.  Therefore, critical sections that begin after removal
+do not matter; as soon as all previous critical sections have finished,
+there cannot be any readers who hold references to the old version of the
+data structure, and it can now be safely reclaimed (e.g., freed or unref'ed).
+
+Here is a picture:
+
+        thread 1                  thread 2                  thread 3
+    -------------------    ------------------------    -------------------
+    enter RCU crit.sec.
+           |                finish removal phase
+           |                begin wait
+           |                      |                    enter RCU crit.sec.
+    exit RCU crit.sec             |                           |
+                            complete wait                     |
+                            begin reclamation phase           |
+                                                       exit RCU crit.sec.
+
+
+Note how thread 3 is still executing its critical section when thread 2
+starts reclaiming data.  This is possible, because the old version of the
+data structure was not accessible at the time thread 3 began executing
+that critical section.
+
+
+RCU API
+=======
+
+The core RCU API is small:
+
+     void rcu_read_lock(void);
+
+        Used by a reader to inform the reclaimer that the reader is
+        entering an RCU read-side critical section.
+
+     void rcu_read_unlock(void);
+
+        Used by a reader to inform the reclaimer that the reader is
+        exiting an RCU read-side critical section.  Note that RCU
+        read-side critical sections may be nested and/or overlapping.
+
+     void synchronize_rcu(void);
+
+        Blocks until all pre-existing RCU read-side critical sections
+        on all threads have completed.  This marks the end of the removal
+        phase and the beginning of reclamation phase.
+
+        Note that it would be valid for another update to come while
+        synchronize_rcu is running.  Because of this, it is better that
+        the updater releases any locks it may hold before calling
+        synchronize_rcu.  If this is not possible (for example, because
+        the updater is protected by the BQL), you can use call_rcu.
+
+     void call_rcu1(struct rcu_head * head,
+                    void (*func)(struct rcu_head *head));
+
+        This function invokes func(head) after all pre-existing RCU
+        read-side critical sections on all threads have completed.  This
+        marks the end of the removal phase, with func taking care of the
+        reclamation phase asynchronously.
+
+        The foo struct needs to have an rcu_head structure added,
+        perhaps as follows:
+
+            struct foo {
+                struct rcu_head rcu;
+                int a;
+                char b;
+                long c;
+            };
+
+        so that the reclaimer function can fetch the struct foo address
+        and free it:
+
+            call_rcu1(&foo.rcu, foo_reclaim);
+
+            void foo_reclaim(struct rcu_head *rp)
+            {
+                struct foo *fp = container_of(rp, struct foo, rcu);
+                g_free(fp);
+            }
+
+        For the common case where the rcu_head member is the first field
+        of the struct, you can use the following macro.
+
+     void call_rcu(T *p,
+                   void (*func)(T *p),
+                   field-name);
+
+        call_rcu1 is typically used through this macro, in the common case
+        where the "struct rcu_head" is the first field in the struct.  In
+        the above case, one could have written simply:
+
+            call_rcu(&foo, g_free, rcu);
+
+     typeof(*p) atomic_rcu_read(p);
+
+        atomic_rcu_read() is similar to atomic_mb_read(), but it makes
+        some assumptions on the code that calls it.  This allows a more
+        optimized implementation.
+
+        atomic_rcu_read assumes that whenever a single RCU critical
+        section reads multiple shared data, these reads are either
+        data-dependent or need no ordering.  This is almost always the
+        case when using RCU, because read-side critical sections typically
+        navigate one or more pointers (the pointers that are changed on
+        every update) until reaching a data structure of interest,
+        and then read from there.
+
+        RCU read-side critical sections must use atomic_rcu_read() to
+        read data, unless concurrent writes are prevented by another
+        synchronization mechanism.
+
+        Furthermore, RCU read-side critical sections should traverse the
+        data structure in a single direction, opposite to the direction
+        in which the updater initializes it.
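+
+        For example, a hedged sketch of a read-side traversal (the list
+        type and the process() helper are illustrative):
+
+            rcu_read_lock();
+            for (p = atomic_rcu_read(&head); p != NULL;
+                 p = atomic_rcu_read(&p->next)) {
+                /* Reads of p->data are data-dependent on the pointer
+                 * loads above, so they need no extra ordering.  */
+                process(p->data);
+            }
+            rcu_read_unlock();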
+
+     void atomic_rcu_set(p, typeof(*p) v);
+
+        atomic_rcu_set() is also similar to atomic_mb_set(), and it also
+        makes assumptions on the code that calls it in order to allow a more
+        optimized implementation.
+
+        In particular, atomic_rcu_set() suffices for synchronization
+        with readers, if the updater never mutates a field within a
+        data item that is already accessible to readers.  This is the
+        case when initializing a new copy of the RCU-protected data
+        structure; just ensure that initialization of *p is carried out
+        before atomic_rcu_set() makes the data item visible to readers.
+        If this rule is observed, writes will happen in the opposite
+        order as reads in the RCU read-side critical sections (or if
+        there is just one update), and there will be no need for other
+        synchronization mechanisms to coordinate the accesses.
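+
+        A hedged sketch of this publication pattern, assuming a global
+        "struct foo *foo" as in the examples below:
+
+            struct foo *new = g_new0(struct foo, 1);
+            new->a = 1;                 /* initialize the new copy first */
+            atomic_rcu_set(&foo, new);  /* then make it visible to readers */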
+
+The following APIs must be used before RCU is used in a thread:
+
+     void rcu_register_thread(void);
+
+        Mark a thread as taking part in the RCU mechanism.  Such a thread
+        will have to report quiescent points regularly, either manually
+        or through the QemuCond/QemuSemaphore/QemuEvent APIs.
+
+     void rcu_unregister_thread(void);
+
+        Mark a thread as not taking part anymore in the RCU mechanism.
+        It is not a problem if such a thread reports quiescent points,
+        either manually or by using the QemuCond/QemuSemaphore/QemuEvent
+        APIs.
+
+Note that these APIs are relatively heavyweight, and should _not_ be
+nested.
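+
+A minimal sketch of how a thread might bracket its lifetime with these
+calls (the loop and exit condition are hypothetical):
+
+    static void *reader_thread_fn(void *opaque)
+    {
+        rcu_register_thread();
+        while (!reader_should_exit) {
+            rcu_read_lock();
+            /* access RCU-protected data */
+            rcu_read_unlock();
+        }
+        rcu_unregister_thread();
+        return NULL;
+    }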
+
+
+DIFFERENCES WITH LINUX
+======================
+
+- Waiting on a mutex is possible, though discouraged, within an RCU critical
+  section.  This is because spinlocks are rarely (if ever) used in userspace
+  programming; not allowing this would prevent upgrading an RCU read-side
+  critical section to become an updater.
+
+- atomic_rcu_read and atomic_rcu_set replace rcu_dereference and
+  rcu_assign_pointer.  They take a _pointer_ to the variable being accessed.
+
+- call_rcu is a macro that has an extra argument (the name of the first
+  field in the struct, which must be a struct rcu_head), and expects the
+  type of the callback's argument to be the type of the first argument.
+  call_rcu1 is the same as Linux's call_rcu.
+
+
+RCU PATTERNS
+============
+
+Many patterns using reader-writer locks translate directly to RCU, with
+the advantages of higher scalability and deadlock immunity.
+
+In general, RCU can be used whenever it is possible to create a new
+"version" of a data structure every time the updater runs.  This may
+sound like a very strict restriction, however:
+
+- the updater does not mean "everything that writes to a data structure",
+  but rather "everything that involves a reclamation step".  See the
+  array example below.
+
+- in some cases, creating a new version of a data structure may actually
+  be very cheap.  For example, modifying the "next" pointer of a singly
+  linked list is effectively creating a new version of the list, as
+  the sketch below shows.
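+
+A hedged sketch of that last point (node names "a", "b", "c" are
+illustrative):
+
+    /* Unlink "b" from the list a -> b -> c.  This single store is the
+     * whole new version; readers that already hold a pointer to "b"
+     * can keep using it until the grace period ends.  */
+    atomic_rcu_set(&a->next, b->next);   /* removal */
+    synchronize_rcu();
+    g_free(b);                           /* reclamation */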
+
+Here are some frequently-used RCU idioms that are worth noting.
+
+
+RCU list processing
+-------------------
+
+TBD (not yet used in QEMU)
+
+
+RCU reference counting
+----------------------
+
+Because grace periods are not allowed to complete while there is an RCU
+read-side critical section in progress, the RCU read-side primitives
+may be used as a restricted reference-counting mechanism.  For example,
+consider the following code fragment:
+
+    rcu_read_lock();
+    p = atomic_rcu_read(&foo);
+    /* do something with p. */
+    rcu_read_unlock();
+
+The RCU read-side critical section ensures that the value of "p" remains
+valid until after the rcu_read_unlock().  In some sense, it is acquiring
+a reference to p that is later released when the critical section ends.
+The write side looks simply like this (with appropriate locking):
+
+    qemu_mutex_lock(&foo_mutex);
+    old = foo;
+    atomic_rcu_set(&foo, new);
+    qemu_mutex_unlock(&foo_mutex);
+    synchronize_rcu();
+    free(old);
+
+If the processing cannot be done purely within the critical section, it
+is possible to combine this idiom with a "real" reference count:
+
+    rcu_read_lock();
+    p = atomic_rcu_read(&foo);
+    foo_ref(p);
+    rcu_read_unlock();
+    /* do something with p. */
+    foo_unref(p);
+
+The write side can be like this:
+
+    qemu_mutex_lock(&foo_mutex);
+    old = foo;
+    atomic_rcu_set(&foo, new);
+    qemu_mutex_unlock(&foo_mutex);
+    synchronize_rcu();
+    foo_unref(old);
+
+or with call_rcu:
+
+    qemu_mutex_lock(&foo_mutex);
+    old = foo;
+    atomic_rcu_set(&foo, new);
+    qemu_mutex_unlock(&foo_mutex);
+    call_rcu(old, foo_unref, rcu);
+
+In both cases, the write side only performs removal.  Reclamation
+happens when the last reference to a "foo" object is dropped.
+Using synchronize_rcu() is undesirably expensive here, because the
+last reference may be dropped on the read side, where blocking until
+a grace period completes is not an option.  Hence you can use
+call_rcu() instead:
+
+    void foo_unref(struct foo *p)
+    {
+        if (atomic_fetch_dec(&p->refcount) == 1) {
+            call_rcu(p, foo_destroy, rcu);
+        }
+    }
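+
+The matching destructor might simply be (a hypothetical sketch,
+mirroring the foo_reclaim example above):
+
+    void foo_destroy(struct foo *p)
+    {
+        g_free(p);
+    }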
+
+
+Note that the same idioms would be possible with reader/writer
+locks:
+
+    read_lock(&foo_rwlock);         write_mutex_lock(&foo_rwlock);
+    p = foo;                        p = foo;
+    /* do something with p. */      foo = new;
+    read_unlock(&foo_rwlock);       free(p);
+                                    write_mutex_unlock(&foo_rwlock);
+
+    ------------------------------------------------------------------
+
+    read_lock(&foo_rwlock);         write_mutex_lock(&foo_rwlock);
+    p = foo;                        old = foo;
+    foo_ref(p);                     foo = new;
+    read_unlock(&foo_rwlock);       foo_unref(old);
+    /* do something with p. */      write_mutex_unlock(&foo_rwlock);
+    read_lock(&foo_rwlock);
+    foo_unref(p);
+    read_unlock(&foo_rwlock);
+
+foo_unref could use a mechanism such as bottom halves to move deallocation
+out of the write-side critical section.
+
+
+RCU resizable arrays
+--------------------
+
+Resizable arrays can be used with RCU.  The expensive RCU synchronization
+(or call_rcu) only needs to take place when the array is resized.
+The two items to take care of are:
+
+- ensuring that the old version of the array is available between removal
+  and reclamation;
+
+- avoiding mismatches in the read side between the array data and the
+  array size.
+
+The first problem is avoided simply by not using realloc.  Instead,
+each resize will allocate a new array and copy the old data into it.
+The second problem would arise if the size and the data pointers were
+two members of a larger struct:
+
+    struct mystuff {
+        ...
+        int data_size;
+        int data_alloc;
+        T   *data;
+        ...
+    };
+
+Instead, we store the size of the array with the array itself:
+
+    struct arr {
+        int size;
+        int alloc;
+        T   data[];
+    };
+    struct arr *global_array;
+
+    read side:
+        rcu_read_lock();
+        struct arr *array = atomic_rcu_read(&global_array);
+        x = i < array->size ? array->data[i] : -1;
+        rcu_read_unlock();
+        return x;
+
+    write side (running under a lock):
+        if (global_array->size == global_array->alloc) {
+            /* Creating a new version.  */
+            new_array = g_malloc(sizeof(struct arr) +
+                                 global_array->alloc * 2 * sizeof(T));
+            new_array->size = global_array->size;
+            new_array->alloc = global_array->alloc * 2;
+            memcpy(new_array->data, global_array->data,
+                   global_array->alloc * sizeof(T));
+
+            /* Removal phase.  */
+            old_array = global_array;
+            atomic_rcu_set(&global_array, new_array);
+            synchronize_rcu();
+
+            /* Reclamation phase.  */
+            g_free(old_array);
+        }
+
+
+SOURCES
+=======
+
+* Documentation/RCU/ from the Linux kernel
diff --git a/fpu/softfloat-macros.h b/fpu/softfloat-macros.h
index 0dcda93f72..5e030cd8e5 100644
--- a/fpu/softfloat-macros.h
+++ b/fpu/softfloat-macros.h
@@ -1,13 +1,24 @@
 /*
  * QEMU float support macros
  *
- * Derived from SoftFloat.
+ * The code in this source file is derived from release 2a of the SoftFloat
+ * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
+ * some later contributions) are provided under that license, as detailed below.
+ * It has subsequently been modified by contributors to the QEMU Project,
+ * so some portions are provided under:
+ *  the SoftFloat-2a license
+ *  the BSD license
+ *  GPL-v2-or-later
+ *
+ * Any future contributions to this file after December 1st 2014 will be
+ * taken to be licensed under the Softfloat-2a license unless specifically
+ * indicated otherwise.
  */
 
-/*============================================================================
-
+/*
+===============================================================================
 This C source fragment is part of the SoftFloat IEC/IEEE Floating-point
-Arithmetic Package, Release 2b.
+Arithmetic Package, Release 2a.
 
 Written by John R. Hauser.  This work was made possible in part by the
 International Computer Science Institute, located at Suite 600, 1947 Center
@@ -16,24 +27,57 @@ National Science Foundation under grant MIP-9311980.  The original version
 of this code was written as part of a project to build a fixed-point vector
 processor in collaboration with the University of California at Berkeley,
 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
-is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
+is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
 arithmetic/SoftFloat.html'.
 
-THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
-been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
-RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
-AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
-COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
-EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
-INSTITUTE (possibly via similar legal notice) AGAINST ALL LOSSES, COSTS, OR
-OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
+THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
+has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
+TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
+PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
+AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
 
 Derivative works are acceptable, even for commercial purposes, so long as
-(1) the source code for the derivative work includes prominent notice that
-the work is derivative, and (2) the source code includes prominent notice with
-these four paragraphs for those parts of this code that are retained.
+(1) they include prominent notice that the work is derivative, and (2) they
+include prominent notice akin to these four paragraphs for those parts of
+this code that are retained.
+
+===============================================================================
+*/
+
+/* BSD licensing:
+ * Copyright (c) 2006, Fabrice Bellard
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
 
-=============================================================================*/
+/* Portions of this work are licensed under the terms of the GNU GPL,
+ * version 2 or later. See the COPYING file in the top-level directory.
+ */
 
 /*----------------------------------------------------------------------------
 | This macro tests for minimum version of the GNU C compiler.
@@ -107,10 +151,10 @@ static inline void shift64RightJamming(uint64_t a, int_fast16_t count, uint64_t
 | 63 bits of the extra result are all zero if and only if _all_but_the_last_
 | bits shifted off were all zero.  This extra result is stored in the location
 | pointed to by `z1Ptr'.  The value of `count' can be arbitrarily large.
-|     (This routine makes more sense if `a0' and `a1' are considered to form
-| a fixed-point value with binary point between `a0' and `a1'.  This fixed-
-| point value is shifted right by the number of bits given in `count', and
-| the integer part of the result is returned at the location pointed to by
+|     (This routine makes more sense if `a0' and `a1' are considered to form a
+| fixed-point value with binary point between `a0' and `a1'.  This fixed-point
+| value is shifted right by the number of bits given in `count', and the
+| integer part of the result is returned at the location pointed to by
 | `z0Ptr'.  The fractional part of the result may be slightly corrupted as
 | described above, and is returned at the location pointed to by `z1Ptr'.)
 *----------------------------------------------------------------------------*/
diff --git a/fpu/softfloat-specialize.h b/fpu/softfloat-specialize.h
index 518f694a68..23d73788ed 100644
--- a/fpu/softfloat-specialize.h
+++ b/fpu/softfloat-specialize.h
@@ -1,13 +1,24 @@
 /*
  * QEMU float support
  *
- * Derived from SoftFloat.
+ * The code in this source file is derived from release 2a of the SoftFloat
+ * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
+ * some later contributions) are provided under that license, as detailed below.
+ * It has subsequently been modified by contributors to the QEMU Project,
+ * so some portions are provided under:
+ *  the SoftFloat-2a license
+ *  the BSD license
+ *  GPL-v2-or-later
+ *
+ * Any future contributions to this file after December 1st 2014 will be
+ * taken to be licensed under the Softfloat-2a license unless specifically
+ * indicated otherwise.
  */
 
-/*============================================================================
-
+/*
+===============================================================================
 This C source fragment is part of the SoftFloat IEC/IEEE Floating-point
-Arithmetic Package, Release 2b.
+Arithmetic Package, Release 2a.
 
 Written by John R. Hauser.  This work was made possible in part by the
 International Computer Science Institute, located at Suite 600, 1947 Center
@@ -16,29 +27,66 @@ National Science Foundation under grant MIP-9311980.  The original version
 of this code was written as part of a project to build a fixed-point vector
 processor in collaboration with the University of California at Berkeley,
 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
-is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
+is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
 arithmetic/SoftFloat.html'.
 
-THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
-been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
-RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
-AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
-COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
-EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
-INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
-OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
+THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
+has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
+TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
+PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
+AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
 
 Derivative works are acceptable, even for commercial purposes, so long as
-(1) the source code for the derivative work includes prominent notice that
-the work is derivative, and (2) the source code includes prominent notice with
-these four paragraphs for those parts of this code that are retained.
+(1) they include prominent notice that the work is derivative, and (2) they
+include prominent notice akin to these four paragraphs for those parts of
+this code that are retained.
+
+===============================================================================
+*/
+
+/* BSD licensing:
+ * Copyright (c) 2006, Fabrice Bellard
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
 
-=============================================================================*/
+/* Portions of this work are licensed under the terms of the GNU GPL,
+ * version 2 or later. See the COPYING file in the top-level directory.
+ */
 
+/* Does the target distinguish signaling NaNs from non-signaling NaNs
+ * by setting the most significant bit of the mantissa for a signaling NaN?
+ * (The more common choice is to have it be zero for SNaN and one for QNaN.)
+ */
 #if defined(TARGET_MIPS) || defined(TARGET_SH4) || defined(TARGET_UNICORE32)
-#define SNAN_BIT_IS_ONE		1
+#define SNAN_BIT_IS_ONE 1
 #else
-#define SNAN_BIT_IS_ONE		0
+#define SNAN_BIT_IS_ONE 0
 #endif
 
 #if defined(TARGET_XTENSA)
@@ -81,7 +129,7 @@ const float64 float64_default_nan = const_float64(LIT64( 0x7FFFFFFFFFFFFFFF ));
 #elif defined(TARGET_PPC) || defined(TARGET_ARM) || defined(TARGET_ALPHA)
 const float64 float64_default_nan = const_float64(LIT64( 0x7FF8000000000000 ));
 #elif SNAN_BIT_IS_ONE
-const float64 float64_default_nan = const_float64(LIT64( 0x7FF7FFFFFFFFFFFF ));
+const float64 float64_default_nan = const_float64(LIT64(0x7FF7FFFFFFFFFFFF));
 #else
 const float64 float64_default_nan = const_float64(LIT64( 0xFFF8000000000000 ));
 #endif
@@ -91,7 +139,7 @@ const float64 float64_default_nan = const_float64(LIT64( 0xFFF8000000000000 ));
 *----------------------------------------------------------------------------*/
 #if SNAN_BIT_IS_ONE
 #define floatx80_default_nan_high 0x7FFF
-#define floatx80_default_nan_low  LIT64( 0xBFFFFFFFFFFFFFFF )
+#define floatx80_default_nan_low  LIT64(0xBFFFFFFFFFFFFFFF)
 #else
 #define floatx80_default_nan_high 0xFFFF
 #define floatx80_default_nan_low  LIT64( 0xC000000000000000 )
@@ -105,8 +153,8 @@ const floatx80 floatx80_default_nan
 | `low' values hold the most- and least-significant bits, respectively.
 *----------------------------------------------------------------------------*/
 #if SNAN_BIT_IS_ONE
-#define float128_default_nan_high LIT64( 0x7FFF7FFFFFFFFFFF )
-#define float128_default_nan_low  LIT64( 0xFFFFFFFFFFFFFFFF )
+#define float128_default_nan_high LIT64(0x7FFF7FFFFFFFFFFF)
+#define float128_default_nan_low  LIT64(0xFFFFFFFFFFFFFFFF)
 #else
 #define float128_default_nan_high LIT64( 0xFFFF800000000000 )
 #define float128_default_nan_low  LIT64( 0x0000000000000000 )
@@ -257,9 +305,9 @@ int float32_is_quiet_nan( float32 a_ )
 {
     uint32_t a = float32_val(a_);
 #if SNAN_BIT_IS_ONE
-    return ( ( ( a>>22 ) & 0x1FF ) == 0x1FE ) && ( a & 0x003FFFFF );
+    return (((a >> 22) & 0x1ff) == 0x1fe) && (a & 0x003fffff);
 #else
-    return ( 0xFF800000 <= (uint32_t) ( a<<1 ) );
+    return ((uint32_t)(a << 1) >= 0xff800000);
 #endif
 }
 
@@ -272,7 +320,7 @@ int float32_is_signaling_nan( float32 a_ )
 {
     uint32_t a = float32_val(a_);
 #if SNAN_BIT_IS_ONE
-    return ( 0xFF800000 <= (uint32_t) ( a<<1 ) );
+    return ((uint32_t)(a << 1) >= 0xff800000);
 #else
     return ( ( ( a>>22 ) & 0x1FF ) == 0x1FE ) && ( a & 0x003FFFFF );
 #endif
@@ -665,11 +713,10 @@ int float64_is_quiet_nan( float64 a_ )
 {
     uint64_t a = float64_val(a_);
 #if SNAN_BIT_IS_ONE
-    return
-           ( ( ( a>>51 ) & 0xFFF ) == 0xFFE )
-        && ( a & LIT64( 0x0007FFFFFFFFFFFF ) );
+    return (((a >> 51) & 0xfff) == 0xffe)
+           && (a & 0x0007ffffffffffffULL);
 #else
-    return ( LIT64( 0xFFF0000000000000 ) <= (uint64_t) ( a<<1 ) );
+    return ((a << 1) >= 0xfff0000000000000ULL);
 #endif
 }
 
@@ -682,7 +729,7 @@ int float64_is_signaling_nan( float64 a_ )
 {
     uint64_t a = float64_val(a_);
 #if SNAN_BIT_IS_ONE
-    return ( LIT64( 0xFFF0000000000000 ) <= (uint64_t) ( a<<1 ) );
+    return ((a << 1) >= 0xfff0000000000000ULL);
 #else
     return
            ( ( ( a>>51 ) & 0xFFF ) == 0xFFE )
@@ -866,11 +913,10 @@ int floatx80_is_quiet_nan( floatx80 a )
 #if SNAN_BIT_IS_ONE
     uint64_t aLow;
 
-    aLow = a.low & ~ LIT64( 0x4000000000000000 );
-    return
-           ( ( a.high & 0x7FFF ) == 0x7FFF )
-        && (uint64_t) ( aLow<<1 )
-        && ( a.low == aLow );
+    aLow = a.low & ~0x4000000000000000ULL;
+    return ((a.high & 0x7fff) == 0x7fff)
+        && (aLow << 1)
+        && (a.low == aLow);
 #else
     return ( ( a.high & 0x7FFF ) == 0x7FFF )
         && (LIT64( 0x8000000000000000 ) <= ((uint64_t) ( a.low<<1 )));
@@ -886,8 +932,8 @@ int floatx80_is_quiet_nan( floatx80 a )
 int floatx80_is_signaling_nan( floatx80 a )
 {
 #if SNAN_BIT_IS_ONE
-    return ( ( a.high & 0x7FFF ) == 0x7FFF )
-        && (LIT64( 0x8000000000000000 ) <= ((uint64_t) ( a.low<<1 )));
+    return ((a.high & 0x7fff) == 0x7fff)
+        && ((a.low << 1) >= 0x8000000000000000ULL);
 #else
     uint64_t aLow;
 
@@ -1031,13 +1077,12 @@ int float128_is_signaling_nan(float128 a_)
 int float128_is_quiet_nan( float128 a )
 {
 #if SNAN_BIT_IS_ONE
-    return
-           ( ( ( a.high>>47 ) & 0xFFFF ) == 0xFFFE )
-        && ( a.low || ( a.high & LIT64( 0x00007FFFFFFFFFFF ) ) );
+    return (((a.high >> 47) & 0xffff) == 0xfffe)
+        && (a.low || (a.high & 0x00007fffffffffffULL));
 #else
     return
-           ( LIT64( 0xFFFE000000000000 ) <= (uint64_t) ( a.high<<1 ) )
-        && ( a.low || ( a.high & LIT64( 0x0000FFFFFFFFFFFF ) ) );
+        ((a.high << 1) >= 0xffff000000000000ULL)
+        && (a.low || (a.high & 0x0000ffffffffffffULL));
 #endif
 }
 
@@ -1050,8 +1095,8 @@ int float128_is_signaling_nan( float128 a )
 {
 #if SNAN_BIT_IS_ONE
     return
-           ( LIT64( 0xFFFE000000000000 ) <= (uint64_t) ( a.high<<1 ) )
-        && ( a.low || ( a.high & LIT64( 0x0000FFFFFFFFFFFF ) ) );
+        ((a.high << 1) >= 0xffff000000000000ULL)
+        && (a.low || (a.high & 0x0000ffffffffffffULL));
 #else
     return
            ( ( ( a.high>>47 ) & 0xFFFF ) == 0xFFFE )
diff --git a/fpu/softfloat.c b/fpu/softfloat.c
index 16b21ebe61..a1f1cb33b4 100644
--- a/fpu/softfloat.c
+++ b/fpu/softfloat.c
@@ -1,13 +1,24 @@
 /*
  * QEMU float support
  *
- * Derived from SoftFloat.
+ * The code in this source file is derived from release 2a of the SoftFloat
+ * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
+ * some later contributions) are provided under that license, as detailed below.
+ * It has subsequently been modified by contributors to the QEMU Project,
+ * so some portions are provided under:
+ *  the SoftFloat-2a license
+ *  the BSD license
+ *  GPL-v2-or-later
+ *
+ * Any future contributions to this file after December 1st 2014 will be
+ * taken to be licensed under the Softfloat-2a license unless specifically
+ * indicated otherwise.
  */
 
-/*============================================================================
-
-This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
-Package, Release 2b.
+/*
+===============================================================================
+This C source file is part of the SoftFloat IEC/IEEE Floating-point
+Arithmetic Package, Release 2a.
 
 Written by John R. Hauser.  This work was made possible in part by the
 International Computer Science Institute, located at Suite 600, 1947 Center
@@ -16,24 +27,57 @@ National Science Foundation under grant MIP-9311980.  The original version
 of this code was written as part of a project to build a fixed-point vector
 processor in collaboration with the University of California at Berkeley,
 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
-is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
+is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
 arithmetic/SoftFloat.html'.
 
-THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
-been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
-RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
-AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
-COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
-EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
-INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
-OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
+THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
+has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
+TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
+PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
+AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
 
 Derivative works are acceptable, even for commercial purposes, so long as
-(1) the source code for the derivative work includes prominent notice that
-the work is derivative, and (2) the source code includes prominent notice with
-these four paragraphs for those parts of this code that are retained.
+(1) they include prominent notice that the work is derivative, and (2) they
+include prominent notice akin to these four paragraphs for those parts of
+this code that are retained.
+
+===============================================================================
+*/
+
+/* BSD licensing:
+ * Copyright (c) 2006, Fabrice Bellard
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
 
-=============================================================================*/
+/* Portions of this work are licensed under the terms of the GNU GPL,
+ * version 2 or later. See the COPYING file in the top-level directory.
+ */
 
 /* softfloat (and in particular the code in softfloat-specialize.h) is
  * target-dependent and needs the TARGET_* macros.
@@ -529,9 +573,9 @@ static inline float64 packFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig)
 | the inexact exception raised if the abstract input cannot be represented
 | exactly.  However, if the abstract value is too large, the overflow and
 | inexact exceptions are raised and an infinity or maximal finite value is
-| returned.  If the abstract value is too small, the input value is rounded
-| to a subnormal number, and the underflow and inexact exceptions are raised
-| if the abstract input cannot be represented exactly as a subnormal double-
+| returned.  If the abstract value is too small, the input value is rounded to
+| a subnormal number, and the underflow and inexact exceptions are raised if
+| the abstract input cannot be represented exactly as a subnormal double-
 | precision floating-point number.
 |     The input significand `zSig' has its binary point between bits 62
 | and 61, which is 10 bits to the left of the usual location.  This shifted
@@ -1304,27 +1348,6 @@ float32 int64_to_float32(int64_t a STATUS_PARAM)
 
 }
 
-float32 uint64_to_float32(uint64_t a STATUS_PARAM)
-{
-    int8 shiftCount;
-
-    if ( a == 0 ) return float32_zero;
-    shiftCount = countLeadingZeros64( a ) - 40;
-    if ( 0 <= shiftCount ) {
-        return packFloat32(0, 0x95 - shiftCount, a<<shiftCount);
-    }
-    else {
-        shiftCount += 7;
-        if ( shiftCount < 0 ) {
-            shift64RightJamming( a, - shiftCount, &a );
-        }
-        else {
-            a <<= shiftCount;
-        }
-        return roundAndPackFloat32(0, 0x9C - shiftCount, a STATUS_VAR);
-    }
-}
-
 /*----------------------------------------------------------------------------
 | Returns the result of converting the 64-bit two's complement integer `a'
 | to the double-precision floating-point format.  The conversion is performed
@@ -1344,20 +1367,6 @@ float64 int64_to_float64(int64_t a STATUS_PARAM)
 
 }
 
-float64 uint64_to_float64(uint64_t a STATUS_PARAM)
-{
-    int exp =  0x43C;
-
-    if (a == 0) {
-        return float64_zero;
-    }
-    if ((int64_t)a < 0) {
-        shift64RightJamming(a, 1, &a);
-        exp += 1;
-    }
-    return normalizeRoundAndPackFloat64(0, exp, a STATUS_VAR);
-}
-
 /*----------------------------------------------------------------------------
 | Returns the result of converting the 64-bit two's complement integer `a'
 | to the extended double-precision floating-point format.  The conversion
@@ -1412,6 +1421,71 @@ float128 int64_to_float128(int64_t a STATUS_PARAM)
 
 }
 
+/*----------------------------------------------------------------------------
+| Returns the result of converting the 64-bit unsigned integer `a'
+| to the single-precision floating-point format.  The conversion is performed
+| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
+*----------------------------------------------------------------------------*/
+
+float32 uint64_to_float32(uint64_t a STATUS_PARAM)
+{
+    int shiftcount;
+
+    if (a == 0) {
+        return float32_zero;
+    }
+
+    /* Determine (left) shift needed to put first set bit into bit posn 23
+     * (since packFloat32() expects the binary point between bits 23 and 22);
+     * this is the fast case for smallish numbers.
+     */
+    shiftcount = countLeadingZeros64(a) - 40;
+    if (shiftcount >= 0) {
+        return packFloat32(0, 0x95 - shiftcount, a << shiftcount);
+    }
+    /* Otherwise we need to do a round-and-pack. roundAndPackFloat32()
+     * expects the binary point between bits 30 and 29, hence the + 7.
+     */
+    shiftcount += 7;
+    if (shiftcount < 0) {
+        shift64RightJamming(a, -shiftcount, &a);
+    } else {
+        a <<= shiftcount;
+    }
+
+    return roundAndPackFloat32(0, 0x9c - shiftcount, a STATUS_VAR);
+}
+
+/*----------------------------------------------------------------------------
+| Returns the result of converting the 64-bit unsigned integer `a'
+| to the double-precision floating-point format.  The conversion is performed
+| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
+*----------------------------------------------------------------------------*/
+
+float64 uint64_to_float64(uint64_t a STATUS_PARAM)
+{
+    int exp = 0x43C;
+    int shiftcount;
+
+    if (a == 0) {
+        return float64_zero;
+    }
+
+    shiftcount = countLeadingZeros64(a) - 1;
+    if (shiftcount < 0) {
+        shift64RightJamming(a, -shiftcount, &a);
+    } else {
+        a <<= shiftcount;
+    }
+    return roundAndPackFloat64(0, exp - shiftcount, a STATUS_VAR);
+}
+
+/*----------------------------------------------------------------------------
+| Returns the result of converting the 64-bit unsigned integer `a'
+| to the quadruple-precision floating-point format.  The conversion is performed
+| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
+*----------------------------------------------------------------------------*/
+
 float128 uint64_to_float128(uint64_t a STATUS_PARAM)
 {
     if (a == 0) {
diff --git a/hw/9pfs/virtio-9p-synth.c b/hw/9pfs/virtio-9p-synth.c
index 71262bccd2..e75aa8772e 100644
--- a/hw/9pfs/virtio-9p-synth.c
+++ b/hw/9pfs/virtio-9p-synth.c
@@ -17,6 +17,7 @@
 #include "virtio-9p-xattr.h"
 #include "fsdev/qemu-fsdev.h"
 #include "virtio-9p-synth.h"
+#include "qemu/rcu.h"
 
 #include <sys/stat.h>
 
diff --git a/hw/s390x/ipl.c b/hw/s390x/ipl.c
index 3b77c9a227..4ba8409668 100644
--- a/hw/s390x/ipl.c
+++ b/hw/s390x/ipl.c
@@ -62,6 +62,7 @@ typedef struct S390IPLState {
 static int s390_ipl_init(SysBusDevice *dev)
 {
     S390IPLState *ipl = S390_IPL(dev);
+    uint64_t pentry = KERN_IMAGE_START;
     int kernel_size;
 
     if (!ipl->kernel) {
@@ -94,31 +95,31 @@ static int s390_ipl_init(SysBusDevice *dev)
             hw_error("could not load bootloader '%s'\n", bios_name);
         }
         return 0;
+    }
+
+    kernel_size = load_elf(ipl->kernel, NULL, NULL, &pentry, NULL,
+                           NULL, 1, ELF_MACHINE, 0);
+    if (kernel_size < 0) {
+        kernel_size = load_image_targphys(ipl->kernel, 0, ram_size);
+    }
+    if (kernel_size < 0) {
+        fprintf(stderr, "could not load kernel '%s'\n", ipl->kernel);
+        return -1;
+    }
+    /*
+     * Is it a Linux kernel (starting at 0x10000)? If yes, we fill in the
+     * kernel parameters here as well. Note: For old kernels (up to 3.2)
+     * we can not rely on the ELF entry point - it was 0x800 (the SALIPL
+     * loader) and it won't work. For this case we force it to 0x10000, too.
+     */
+    if (pentry == KERN_IMAGE_START || pentry == 0x800) {
+        ipl->start_addr = KERN_IMAGE_START;
+        /* Overwrite parameters in the kernel image, which are "rom" */
+        strcpy(rom_ptr(KERN_PARM_AREA), ipl->cmdline);
     } else {
-        uint64_t pentry = KERN_IMAGE_START;
-        kernel_size = load_elf(ipl->kernel, NULL, NULL, &pentry, NULL,
-                               NULL, 1, ELF_MACHINE, 0);
-        if (kernel_size < 0) {
-            kernel_size = load_image_targphys(ipl->kernel, 0, ram_size);
-        }
-        if (kernel_size < 0) {
-            fprintf(stderr, "could not load kernel '%s'\n", ipl->kernel);
-            return -1;
-        }
-        /*
-         * Is it a Linux kernel (starting at 0x10000)? If yes, we fill in the
-         * kernel parameters here as well. Note: For old kernels (up to 3.2)
-         * we can not rely on the ELF entry point - it was 0x800 (the SALIPL
-         * loader) and it won't work. For this case we force it to 0x10000, too.
-         */
-        if (pentry == KERN_IMAGE_START || pentry == 0x800) {
-            ipl->start_addr = KERN_IMAGE_START;
-            /* Overwrite parameters in the kernel image, which are "rom" */
-            strcpy(rom_ptr(KERN_PARM_AREA), ipl->cmdline);
-        } else {
-            ipl->start_addr = pentry;
-        }
+        ipl->start_addr = pentry;
     }
+
     if (ipl->initrd) {
         ram_addr_t initrd_offset;
         int initrd_size;
diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c
index 1201b8d57c..dc455a2bb7 100644
--- a/hw/s390x/s390-pci-bus.c
+++ b/hw/s390x/s390-pci-bus.c
@@ -170,7 +170,7 @@ S390PCIBusDevice *s390_pci_find_dev_by_fh(uint32_t fh)
     S390pciState *s = S390_PCI_HOST_BRIDGE(
         object_resolve_path(TYPE_S390_PCI_HOST_BRIDGE, NULL));
 
-    if (!s) {
+    if (!s || !fh) {
         return NULL;
     }
 
@@ -187,7 +187,7 @@ S390PCIBusDevice *s390_pci_find_dev_by_fh(uint32_t fh)
 static void s390_pci_generate_event(uint8_t cc, uint16_t pec, uint32_t fh,
                                     uint32_t fid, uint64_t faddr, uint32_t e)
 {
-    SeiContainer *sei_cont = g_malloc0(sizeof(SeiContainer));
+    SeiContainer *sei_cont;
     S390pciState *s = S390_PCI_HOST_BRIDGE(
         object_resolve_path(TYPE_S390_PCI_HOST_BRIDGE, NULL));
 
@@ -195,6 +195,7 @@ static void s390_pci_generate_event(uint8_t cc, uint16_t pec, uint32_t fh,
         return;
     }
 
+    sei_cont = g_malloc0(sizeof(SeiContainer));
     sei_cont->fh = fh;
     sei_cont->fid = fid;
     sei_cont->cc = cc;
diff --git a/hw/s390x/s390-pci-inst.c b/hw/s390x/s390-pci-inst.c
index 5ea13e5d79..9e5bc5b899 100644
--- a/hw/s390x/s390-pci-inst.c
+++ b/hw/s390x/s390-pci-inst.c
@@ -487,7 +487,7 @@ int rpcit_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2)
     CPUS390XState *env = &cpu->env;
     uint32_t fh;
     S390PCIBusDevice *pbdev;
-    ram_addr_t size;
+    hwaddr start, end;
     IOMMUTLBEntry entry;
     MemoryRegion *mr;
 
@@ -504,7 +504,8 @@ int rpcit_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2)
     }
 
     fh = env->regs[r1] >> 32;
-    size = env->regs[r2 + 1];
+    start = env->regs[r2];
+    end = start + env->regs[r2 + 1];
 
     pbdev = s390_pci_find_dev_by_fh(fh);
 
@@ -515,15 +516,18 @@ int rpcit_service_call(S390CPU *cpu, uint8_t r1, uint8_t r2)
     }
 
     mr = pci_device_iommu_address_space(pbdev->pdev)->root;
-    entry = mr->iommu_ops->translate(mr, env->regs[r2], 0);
+    while (start < end) {
+        entry = mr->iommu_ops->translate(mr, start, 0);
 
-    if (!entry.translated_addr) {
-        setcc(cpu, ZPCI_PCI_LS_ERR);
-        goto out;
+        if (!entry.translated_addr) {
+            setcc(cpu, ZPCI_PCI_LS_ERR);
+            goto out;
+        }
+
+        memory_region_notify_iommu(mr, entry);
+        start += entry.addr_mask + 1;
     }
 
-    entry.addr_mask = size - 1;
-    memory_region_notify_iommu(mr, entry);
     setcc(cpu, ZPCI_PCI_LS_OK);
 out:
     return 0;
@@ -784,10 +788,10 @@ int stpcifc_service_call(S390CPU *cpu, uint8_t r1, uint64_t fiba)
     stq_p(&fib.aisb, pbdev->routes.adapter.summary_addr);
     stq_p(&fib.fmb_addr, pbdev->fmb_addr);
 
-    data = (pbdev->isc << 28) | (pbdev->noi << 16) |
-           (pbdev->routes.adapter.ind_offset << 8) | (pbdev->sum << 7) |
-           pbdev->routes.adapter.summary_offset;
-    stw_p(&fib.data, data);
+    data = ((uint32_t)pbdev->isc << 28) | ((uint32_t)pbdev->noi << 16) |
+           ((uint32_t)pbdev->routes.adapter.ind_offset << 8) |
+           ((uint32_t)pbdev->sum << 7) | pbdev->routes.adapter.summary_offset;
+    stl_p(&fib.data, data);
 
     if (pbdev->fh >> ENABLE_BIT_OFFSET) {
         fib.fc |= 0x80;
diff --git a/hw/scsi/scsi-bus.c b/hw/scsi/scsi-bus.c
index 9b740a3cfa..db39ae0e23 100644
--- a/hw/scsi/scsi-bus.c
+++ b/hw/scsi/scsi-bus.c
@@ -1756,6 +1756,8 @@ void scsi_req_cancel_async(SCSIRequest *req, Notifier *notifier)
     req->io_canceled = true;
     if (req->aiocb) {
         blk_aio_cancel_async(req->aiocb);
+    } else {
+        scsi_req_cancel_complete(req);
     }
 }
 
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index cf483fffa9..e71385e4fe 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -32,7 +32,7 @@
 #include "trace.h"
 
 struct vfio_group_head vfio_group_list =
-    QLIST_HEAD_INITIALIZER(vfio_address_spaces);
+    QLIST_HEAD_INITIALIZER(vfio_group_list);
 struct vfio_as_head vfio_address_spaces =
     QLIST_HEAD_INITIALIZER(vfio_address_spaces);
 
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 014a92ce5f..29caabc149 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -3065,6 +3065,7 @@ static void vfio_put_device(VFIOPCIDevice *vdev)
 {
     g_free(vdev->vbasedev.name);
     if (vdev->msix) {
+        object_unparent(OBJECT(&vdev->msix->mmap_mem));
         g_free(vdev->msix);
         vdev->msix = NULL;
     }
diff --git a/include/exec/memory.h b/include/exec/memory.h
index 0cd96b152e..06ffa1d185 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -33,6 +33,7 @@
 #include "qemu/notify.h"
 #include "qapi/error.h"
 #include "qom/object.h"
+#include "qemu/rcu.h"
 
 #define MAX_PHYS_ADDR_SPACE_BITS 62
 #define MAX_PHYS_ADDR            (((hwaddr)1 << MAX_PHYS_ADDR_SPACE_BITS) - 1)
@@ -207,9 +208,13 @@ struct MemoryListener {
  */
 struct AddressSpace {
     /* All fields are private. */
+    struct rcu_head rcu;
     char *name;
     MemoryRegion *root;
+
+    /* Accessed via RCU.  */
     struct FlatView *current_map;
+
     int ioeventfd_nb;
     struct MemoryRegionIoeventfd *ioeventfds;
     struct AddressSpaceDispatch *dispatch;
diff --git a/include/fpu/softfloat.h b/include/fpu/softfloat.h
index e32e25d547..35019c9bf3 100644
--- a/include/fpu/softfloat.h
+++ b/include/fpu/softfloat.h
@@ -1,13 +1,24 @@
 /*
  * QEMU float support
  *
- * Derived from SoftFloat.
+ * The code in this source file is derived from release 2a of the SoftFloat
+ * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
+ * some later contributions) are provided under that license, as detailed below.
+ * It has subsequently been modified by contributors to the QEMU Project,
+ * so some portions are provided under:
+ *  the SoftFloat-2a license
+ *  the BSD license
+ *  GPL-v2-or-later
+ *
+ * Any future contributions to this file after December 1st 2014 will be
+ * taken to be licensed under the Softfloat-2a license unless specifically
+ * indicated otherwise.
  */
 
-/*============================================================================
-
-This C header file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
-Package, Release 2b.
+/*
+===============================================================================
+This C header file is part of the SoftFloat IEC/IEEE Floating-point
+Arithmetic Package, Release 2a.
 
 Written by John R. Hauser.  This work was made possible in part by the
 International Computer Science Institute, located at Suite 600, 1947 Center
@@ -16,24 +27,57 @@ National Science Foundation under grant MIP-9311980.  The original version
 of this code was written as part of a project to build a fixed-point vector
 processor in collaboration with the University of California at Berkeley,
 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
-is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
+is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
 arithmetic/SoftFloat.html'.
 
-THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
-been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
-RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
-AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
-COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
-EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
-INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
-OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
+THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort
+has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
+TIMES RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO
+PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
+AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
 
 Derivative works are acceptable, even for commercial purposes, so long as
-(1) the source code for the derivative work includes prominent notice that
-the work is derivative, and (2) the source code includes prominent notice with
-these four paragraphs for those parts of this code that are retained.
+(1) they include prominent notice that the work is derivative, and (2) they
+include prominent notice akin to these four paragraphs for those parts of
+this code that are retained.
+
+===============================================================================
+*/
+
+/* BSD licensing:
+ * Copyright (c) 2006, Fabrice Bellard
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its contributors
+ * may be used to endorse or promote products derived from this software without
+ * specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
 
-=============================================================================*/
+/* Portions of this work are licensed under the terms of the GNU GPL,
+ * version 2 or later. See the COPYING file in the top-level directory.
+ */
 
 #ifndef SOFTFLOAT_H
 #define SOFTFLOAT_H
@@ -275,11 +319,11 @@ float64 uint32_to_float64(uint32_t STATUS_PARAM);
 floatx80 int32_to_floatx80(int32_t STATUS_PARAM);
 float128 int32_to_float128(int32_t STATUS_PARAM);
 float32 int64_to_float32(int64_t STATUS_PARAM);
-float32 uint64_to_float32(uint64_t STATUS_PARAM);
 float64 int64_to_float64(int64_t STATUS_PARAM);
-float64 uint64_to_float64(uint64_t STATUS_PARAM);
 floatx80 int64_to_floatx80(int64_t STATUS_PARAM);
 float128 int64_to_float128(int64_t STATUS_PARAM);
+float32 uint64_to_float32(uint64_t STATUS_PARAM);
+float64 uint64_to_float64(uint64_t STATUS_PARAM);
 float128 uint64_to_float128(uint64_t STATUS_PARAM);
 
 /* We provide the int16 versions for symmetry of API with float-to-int */
diff --git a/include/qemu/atomic.h b/include/qemu/atomic.h
index 93c2ae2f37..98e05ca875 100644
--- a/include/qemu/atomic.h
+++ b/include/qemu/atomic.h
@@ -129,6 +129,67 @@
 #define atomic_set(ptr, i)     ((*(__typeof__(*ptr) volatile*) (ptr)) = (i))
 #endif
 
+/**
+ * atomic_rcu_read - reads an RCU-protected pointer into a local variable,
+ * inside an RCU read-side critical section. The pointer can later be safely
+ * dereferenced within the critical section.
+ *
+ * This ensures that the pointer copy is invariant throughout the whole critical
+ * section.
+ *
+ * Inserts memory barriers on architectures that require them (currently only
+ * Alpha) and documents which pointers are protected by RCU.
+ *
+ * Unless the __ATOMIC_CONSUME memory order is available, atomic_rcu_read also
+ * includes a compiler barrier to ensure that value-speculative optimizations
+ * (e.g. VSS: Value Speculation Scheduling) do not perform the data read
+ * before the pointer read by speculating the value of the pointer.  On new
+ * enough compilers, atomic_load takes care of such concerns about
+ * dependency-breaking optimizations.
+ *
+ * Should match atomic_rcu_set(), atomic_xchg(), atomic_cmpxchg().
+ */
+#ifndef atomic_rcu_read
+#ifdef __ATOMIC_CONSUME
+#define atomic_rcu_read(ptr)    ({                \
+    typeof(*ptr) _val;                            \
+     __atomic_load(ptr, &_val, __ATOMIC_CONSUME); \
+    _val;                                         \
+})
+#else
+#define atomic_rcu_read(ptr)    ({                \
+    typeof(*ptr) _val = atomic_read(ptr);         \
+    smp_read_barrier_depends();                   \
+    _val;                                         \
+})
+#endif
+#endif
+
+/**
+ * atomic_rcu_set - assigns (publishes) a pointer to a new data structure
+ * meant to be read by RCU read-side critical sections.
+ *
+ * Documents which pointers will be dereferenced by RCU read-side critical
+ * sections and adds the required memory barriers on architectures requiring
+ * them. It also makes sure the compiler does not reorder code initializing the
+ * data structure before its publication.
+ *
+ * Should match atomic_rcu_read().
+ */
+#ifndef atomic_rcu_set
+#ifdef __ATOMIC_RELEASE
+#define atomic_rcu_set(ptr, i)  do {              \
+    typeof(*ptr) _val = (i);                      \
+    __atomic_store(ptr, &_val, __ATOMIC_RELEASE); \
+} while (0)
+#else
+#define atomic_rcu_set(ptr, i)  do {              \
+    smp_wmb();                                    \
+    atomic_set(ptr, i);                           \
+} while (0)
+#endif
+#endif
+
 /* These have the same semantics as Java volatile variables.
  * See http://gee.cs.oswego.edu/dl/jmm/cookbook.html:
  * "1. Issue a StoreStore barrier (wmb) before each volatile store."
diff --git a/include/qemu/queue.h b/include/qemu/queue.h
index a98eb3ad79..c602797652 100644
--- a/include/qemu/queue.h
+++ b/include/qemu/queue.h
@@ -104,6 +104,19 @@ struct {                                                                \
         (head)->lh_first = NULL;                                        \
 } while (/*CONSTCOND*/0)
 
+#define QLIST_SWAP(dstlist, srclist, field) do {                        \
+        void *tmplist;                                                  \
+        tmplist = (srclist)->lh_first;                                  \
+        (srclist)->lh_first = (dstlist)->lh_first;                      \
+        if ((srclist)->lh_first != NULL) {                              \
+            (srclist)->lh_first->field.le_prev = &(srclist)->lh_first;  \
+        }                                                               \
+        (dstlist)->lh_first = tmplist;                                  \
+        if ((dstlist)->lh_first != NULL) {                              \
+            (dstlist)->lh_first->field.le_prev = &(dstlist)->lh_first;  \
+        }                                                               \
+} while (/*CONSTCOND*/0)
+
 #define QLIST_INSERT_AFTER(listelm, elm, field) do {                    \
         if (((elm)->field.le_next = (listelm)->field.le_next) != NULL)  \
                 (listelm)->field.le_next->field.le_prev =               \
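
QLIST_SWAP exchanges the heads of two lists in O(1), fixing up the le_prev
back-pointer of each new first element.  util/rcu.c below uses it to move the
drained reader list back into the registry in a single step.  A small
illustration with a hypothetical Item type:

    typedef struct Item {
        int value;
        QLIST_ENTRY(Item) next;
    } Item;

    static QLIST_HEAD(, Item) active = QLIST_HEAD_INITIALIZER(active);
    static QLIST_HEAD(, Item) done   = QLIST_HEAD_INITIALIZER(done);

    /* ... elements migrate from 'active' to 'done' one at a time ... */

    /* Reinstate everything in one constant-time step. */
    QLIST_SWAP(&active, &done, next);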
diff --git a/include/qemu/rcu.h b/include/qemu/rcu.h
new file mode 100644
index 0000000000..068a279a79
--- /dev/null
+++ b/include/qemu/rcu.h
@@ -0,0 +1,147 @@
+#ifndef QEMU_RCU_H
+#define QEMU_RCU_H
+
+/*
+ * urcu-mb.h
+ *
+ * Userspace RCU header with explicit memory barrier.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * IBM's contributions to this file may be relicensed under LGPLv2 or later.
+ */
+
+#include <stdlib.h>
+#include <assert.h>
+#include <limits.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <glib.h>
+
+#include "qemu/compiler.h"
+#include "qemu/thread.h"
+#include "qemu/queue.h"
+#include "qemu/atomic.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Important!
+ *
+ * Each thread containing read-side critical sections must be registered
+ * with rcu_register_thread() before calling rcu_read_lock().
+ * rcu_unregister_thread() should be called before the thread exits.
+ */
+
+#ifdef DEBUG_RCU
+#define rcu_assert(args...)    assert(args)
+#else
+#define rcu_assert(args...)
+#endif
+
+/*
+ * Global quiescent period counter with low-order bits unused.
+ * Using an int rather than a char to eliminate false register dependencies
+ * causing stalls on some architectures.
+ */
+extern unsigned long rcu_gp_ctr;
+
+extern QemuEvent rcu_gp_event;
+
+struct rcu_reader_data {
+    /* Data used by both reader and synchronize_rcu() */
+    unsigned long ctr;
+    bool waiting;
+
+    /* Data used by reader only */
+    unsigned depth;
+
+    /* Data used for registry, protected by rcu_gp_lock */
+    QLIST_ENTRY(rcu_reader_data) node;
+};
+
+extern __thread struct rcu_reader_data rcu_reader;
+
+static inline void rcu_read_lock(void)
+{
+    struct rcu_reader_data *p_rcu_reader = &rcu_reader;
+    unsigned ctr;
+
+    if (p_rcu_reader->depth++ > 0) {
+        return;
+    }
+
+    ctr = atomic_read(&rcu_gp_ctr);
+    atomic_xchg(&p_rcu_reader->ctr, ctr);
+    if (atomic_read(&p_rcu_reader->waiting)) {
+        atomic_set(&p_rcu_reader->waiting, false);
+        qemu_event_set(&rcu_gp_event);
+    }
+}
+
+static inline void rcu_read_unlock(void)
+{
+    struct rcu_reader_data *p_rcu_reader = &rcu_reader;
+
+    assert(p_rcu_reader->depth != 0);
+    if (--p_rcu_reader->depth > 0) {
+        return;
+    }
+
+    atomic_xchg(&p_rcu_reader->ctr, 0);
+    if (atomic_read(&p_rcu_reader->waiting)) {
+        atomic_set(&p_rcu_reader->waiting, false);
+        qemu_event_set(&rcu_gp_event);
+    }
+}
+
+extern void synchronize_rcu(void);
+
+/*
+ * Reader thread registration.
+ */
+extern void rcu_register_thread(void);
+extern void rcu_unregister_thread(void);
+
+struct rcu_head;
+typedef void RCUCBFunc(struct rcu_head *head);
+
+struct rcu_head {
+    struct rcu_head *next;
+    RCUCBFunc *func;
+};
+
+extern void call_rcu1(struct rcu_head *head, RCUCBFunc *func);
+
+/* The operands of the minus operator must have the same type,
+ * which must be the one that we specify in the cast.
+ */
+#define call_rcu(head, func, field)                                      \
+    call_rcu1(({                                                         \
+         char __attribute__((unused))                                    \
+            offset_must_be_zero[-offsetof(typeof(*(head)), field)],      \
+            func_type_invalid = (func) - (void (*)(typeof(head)))(func); \
+         &(head)->field;                                                 \
+      }),                                                                \
+      (RCUCBFunc *)(func))
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* QEMU_RCU_H */
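
Putting the new header together, the expected lifecycle is: register each
reader thread, read under rcu_read_lock()/rcu_read_unlock() with
atomic_rcu_read(), publish with atomic_rcu_set(), and reclaim with call_rcu().
A minimal sketch with a hypothetical frobnicator type; note that the rcu_head
must be the 'field' passed to call_rcu() and, because of the offset check in
the macro, the first member of the structure:

    struct frobnicator {
        struct rcu_head rcu;    /* must be at offset zero, see call_rcu() */
        int state;
    };

    static struct frobnicator *frob;

    static void free_frob(struct frobnicator *f)
    {
        g_free(f);
    }

    static void replace_frob(int state)
    {
        struct frobnicator *old = frob;
        struct frobnicator *new_f = g_new0(struct frobnicator, 1);

        new_f->state = state;
        atomic_rcu_set(&frob, new_f);
        if (old) {
            /* Freed only after all pre-existing readers finish. */
            call_rcu(old, free_frob, rcu);
        }
    }

    static void *reader_thread(void *opaque)
    {
        rcu_register_thread();        /* once per reader thread */

        rcu_read_lock();
        if (atomic_rcu_read(&frob)) {
            /* Safe to dereference until rcu_read_unlock(). */
        }
        rcu_read_unlock();

        rcu_unregister_thread();      /* before the thread exits */
        return NULL;
    }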
diff --git a/include/qemu/thread.h b/include/qemu/thread.h
index e89fdc9785..5114ec8e79 100644
--- a/include/qemu/thread.h
+++ b/include/qemu/thread.h
@@ -25,9 +25,6 @@ void qemu_mutex_lock(QemuMutex *mutex);
 int qemu_mutex_trylock(QemuMutex *mutex);
 void qemu_mutex_unlock(QemuMutex *mutex);
 
-#define rcu_read_lock() do { } while (0)
-#define rcu_read_unlock() do { } while (0)
-
 void qemu_cond_init(QemuCond *cond);
 void qemu_cond_destroy(QemuCond *cond);
 
diff --git a/include/qemu/timer.h b/include/qemu/timer.h
index ca5befba0e..eba8b2109c 100644
--- a/include/qemu/timer.h
+++ b/include/qemu/timer.h
@@ -838,7 +838,6 @@ static inline int64_t get_clock(void)
 int64_t cpu_get_icount_raw(void);
 int64_t cpu_get_icount(void);
 int64_t cpu_get_clock(void);
-int64_t cpu_get_clock_offset(void);
 int64_t cpu_icount_to_ns(int64_t icount);
 
 /*******************************************/
diff --git a/memory.c b/memory.c
index c343bf37df..9b91243978 100644
--- a/memory.c
+++ b/memory.c
@@ -33,26 +33,12 @@ static bool memory_region_update_pending;
 static bool ioeventfd_update_pending;
 static bool global_dirty_log = false;
 
-/* flat_view_mutex is taken around reading as->current_map; the critical
- * section is extremely short, so I'm using a single mutex for every AS.
- * We could also RCU for the read-side.
- *
- * The BQL is taken around transaction commits, hence both locks are taken
- * while writing to as->current_map (with the BQL taken outside).
- */
-static QemuMutex flat_view_mutex;
-
 static QTAILQ_HEAD(memory_listeners, MemoryListener) memory_listeners
     = QTAILQ_HEAD_INITIALIZER(memory_listeners);
 
 static QTAILQ_HEAD(, AddressSpace) address_spaces
     = QTAILQ_HEAD_INITIALIZER(address_spaces);
 
-static void memory_init(void)
-{
-    qemu_mutex_init(&flat_view_mutex);
-}
-
 typedef struct AddrRange AddrRange;
 
 /*
@@ -242,6 +228,7 @@ struct FlatRange {
  * order.
  */
 struct FlatView {
+    struct rcu_head rcu;
     unsigned ref;
     FlatRange *ranges;
     unsigned nr;
@@ -654,10 +641,10 @@ static FlatView *address_space_get_flatview(AddressSpace *as)
 {
     FlatView *view;
 
-    qemu_mutex_lock(&flat_view_mutex);
-    view = as->current_map;
+    rcu_read_lock();
+    view = atomic_rcu_read(&as->current_map);
     flatview_ref(view);
-    qemu_mutex_unlock(&flat_view_mutex);
+    rcu_read_unlock();
     return view;
 }
 
@@ -766,10 +753,9 @@ static void address_space_update_topology(AddressSpace *as)
     address_space_update_topology_pass(as, old_view, new_view, false);
     address_space_update_topology_pass(as, old_view, new_view, true);
 
-    qemu_mutex_lock(&flat_view_mutex);
-    flatview_unref(as->current_map);
-    as->current_map = new_view;
-    qemu_mutex_unlock(&flat_view_mutex);
+    /* Writes are protected by the BQL.  */
+    atomic_rcu_set(&as->current_map, new_view);
+    call_rcu(old_view, flatview_unref, rcu);
 
     /* Note that all the old MemoryRegions are still alive up to this
      * point.  This relieves most MemoryListeners from the need to
@@ -1263,7 +1249,6 @@ static void memory_region_finalize(Object *obj)
     MemoryRegion *mr = MEMORY_REGION(obj);
 
     assert(QTAILQ_EMPTY(&mr->subregions));
-    assert(memory_region_transaction_depth == 0);
     mr->destructor(mr);
     memory_region_clear_coalescing(mr);
     g_free((char *)mr->name);
@@ -1843,11 +1828,11 @@ MemoryRegionSection memory_region_find(MemoryRegion *mr,
     }
     range = addrrange_make(int128_make64(addr), int128_make64(size));
 
-    view = address_space_get_flatview(as);
+    rcu_read_lock();
+    view = atomic_rcu_read(&as->current_map);
     fr = flatview_lookup(view, range);
     if (!fr) {
-        flatview_unref(view);
-        return ret;
+        goto out;
     }
 
     while (fr > view->ranges && addrrange_intersects(fr[-1].addr, range)) {
@@ -1864,8 +1849,8 @@ MemoryRegionSection memory_region_find(MemoryRegion *mr,
     ret.offset_within_address_space = int128_get64(range.start);
     ret.readonly = fr->readonly;
     memory_region_ref(ret.mr);
-
-    flatview_unref(view);
+out:
+    rcu_read_unlock();
     return ret;
 }
 
@@ -1958,10 +1943,6 @@ void memory_listener_unregister(MemoryListener *listener)
 
 void address_space_init(AddressSpace *as, MemoryRegion *root, const char *name)
 {
-    if (QTAILQ_EMPTY(&address_spaces)) {
-        memory_init();
-    }
-
     memory_region_transaction_begin();
     as->root = root;
     as->current_map = g_new(FlatView, 1);
@@ -1975,15 +1956,10 @@ void address_space_init(AddressSpace *as, MemoryRegion *root, const char *name)
     memory_region_transaction_commit();
 }
 
-void address_space_destroy(AddressSpace *as)
+static void do_address_space_destroy(AddressSpace *as)
 {
     MemoryListener *listener;
 
-    /* Flush out anything from MemoryListeners listening in on this */
-    memory_region_transaction_begin();
-    as->root = NULL;
-    memory_region_transaction_commit();
-    QTAILQ_REMOVE(&address_spaces, as, address_spaces_link);
     address_space_destroy_dispatch(as);
 
     QTAILQ_FOREACH(listener, &memory_listeners, link) {
@@ -1995,6 +1971,21 @@ void address_space_destroy(AddressSpace *as)
     g_free(as->ioeventfds);
 }
 
+void address_space_destroy(AddressSpace *as)
+{
+    /* Flush out anything from MemoryListeners listening in on this */
+    memory_region_transaction_begin();
+    as->root = NULL;
+    memory_region_transaction_commit();
+    QTAILQ_REMOVE(&address_spaces, as, address_spaces_link);
+
+    /* At this point, as->dispatch and as->current_map are dummy
+     * entries that the guest should never use.  Wait for the old
+     * values to expire before freeing the data.
+     */
+    call_rcu(as, do_address_space_destroy, rcu);
+}
+
 bool io_mem_read(MemoryRegion *mr, hwaddr addr, uint64_t *pval, unsigned size)
 {
     return memory_region_dispatch_read(mr, addr, pval, size);
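
Two read-side patterns coexist after this conversion.  A short lookup can
stay entirely inside the critical section, as memory_region_find now does,
while a longer use takes a reference inside the critical section so the
FlatView stays valid past rcu_read_unlock(), as address_space_get_flatview
does.  Sketched:

    /* Pattern A: short critical section, no refcount needed. */
    rcu_read_lock();
    view = atomic_rcu_read(&as->current_map);
    /* ... brief use of 'view' ... */
    rcu_read_unlock();

    /* Pattern B: convert the RCU guarantee into a reference. */
    rcu_read_lock();
    view = atomic_rcu_read(&as->current_map);
    flatview_ref(view);               /* keeps 'view' valid after unlock */
    rcu_read_unlock();
    /* ... arbitrarily long use of 'view' ... */
    flatview_unref(view);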
diff --git a/pc-bios/s390-ccw.img b/pc-bios/s390-ccw.img
index 44873ad181..dbe5a38262 100644
--- a/pc-bios/s390-ccw.img
+++ b/pc-bios/s390-ccw.img
Binary files differ
diff --git a/pc-bios/s390-ccw/bootmap.c b/pc-bios/s390-ccw/bootmap.c
index 115d8bbac6..b678d5ebb8 100644
--- a/pc-bios/s390-ccw/bootmap.c
+++ b/pc-bios/s390-ccw/bootmap.c
@@ -33,7 +33,7 @@ typedef struct ResetInfo {
     uint32_t ipl_continue;
 } ResetInfo;
 
-ResetInfo save;
+static ResetInfo save;
 
 static void jump_to_IPL_2(void)
 {
@@ -80,7 +80,7 @@ static void jump_to_IPL_code(uint64_t address)
  */
 
 static unsigned char _bprs[8*1024]; /* guessed "max" ECKD sector size */
-const int max_bprs_entries = sizeof(_bprs) / sizeof(ExtEckdBlockPtr);
+static const int max_bprs_entries = sizeof(_bprs) / sizeof(ExtEckdBlockPtr);
 
 static inline void verify_boot_info(BootInfo *bip)
 {
diff --git a/pc-bios/s390-ccw/bootmap.h b/pc-bios/s390-ccw/bootmap.h
index 6a4823d544..ab132e3579 100644
--- a/pc-bios/s390-ccw/bootmap.h
+++ b/pc-bios/s390-ccw/bootmap.h
@@ -15,7 +15,7 @@
 #include "virtio.h"
 
 typedef uint64_t block_number_t;
-#define NULL_BLOCK_NR 0xffffffffffffffff
+#define NULL_BLOCK_NR 0xffffffffffffffffULL
 
 #define FREE_SPACE_FILLER '\xAA'
 
diff --git a/pc-bios/s390-ccw/main.c b/pc-bios/s390-ccw/main.c
index f9ec2157ad..6f707bbcd4 100644
--- a/pc-bios/s390-ccw/main.c
+++ b/pc-bios/s390-ccw/main.c
@@ -13,7 +13,7 @@
 
 char stack[PAGE_SIZE * 8] __attribute__((__aligned__(PAGE_SIZE)));
 uint64_t boot_value;
-struct subchannel_id blk_schid = { .one = 1 };
+static struct subchannel_id blk_schid = { .one = 1 };
 
 /*
  * Principles of Operation (SA22-7832-09) chapter 17 requires that
diff --git a/pc-bios/s390-ccw/s390-ccw.h b/pc-bios/s390-ccw/s390-ccw.h
index 2b773deafa..ceb7418a50 100644
--- a/pc-bios/s390-ccw/s390-ccw.h
+++ b/pc-bios/s390-ccw/s390-ccw.h
@@ -51,6 +51,8 @@ void disabled_wait(void);
 /* main.c */
 void virtio_panic(const char *string);
 void write_subsystem_identification(void);
+extern char stack[PAGE_SIZE * 8] __attribute__((__aligned__(PAGE_SIZE)));
+extern uint64_t boot_value;
 
 /* sclp-ascii.c */
 void sclp_print(const char *string);
diff --git a/pc-bios/s390-ccw/virtio.c b/pc-bios/s390-ccw/virtio.c
index c0540d1cd4..4dc91a7c43 100644
--- a/pc-bios/s390-ccw/virtio.c
+++ b/pc-bios/s390-ccw/virtio.c
@@ -11,7 +11,7 @@
 #include "s390-ccw.h"
 #include "virtio.h"
 
-struct vring block;
+static struct vring block;
 
 static char chsc_page[PAGE_SIZE] __attribute__((__aligned__(PAGE_SIZE)));
 
diff --git a/target-s390x/cc_helper.c b/target-s390x/cc_helper.c
index 373eb176a1..00bc883a8a 100644
--- a/target-s390x/cc_helper.c
+++ b/target-s390x/cc_helper.c
@@ -179,16 +179,11 @@ static uint32_t cc_calc_subu_64(uint64_t a1, uint64_t a2, uint64_t ar)
 
 static uint32_t cc_calc_subb_64(uint64_t a1, uint64_t a2, uint64_t ar)
 {
-    /* We had borrow-in if normal subtraction isn't equal.  */
-    int borrow_in = ar - (a1 - a2);
     int borrow_out;
 
-    /* If a2 was ULONG_MAX, and borrow_in, then a2 is logically 65 bits,
-       and we must have had borrow out.  */
-    if (borrow_in && a2 == (uint64_t)-1) {
-        borrow_out = 1;
+    if (ar != a1 - a2) {	/* difference means borrow-in */
+        borrow_out = (a2 >= a1);
     } else {
-        a2 += borrow_in;
         borrow_out = (a2 > a1);
     }
 
@@ -285,16 +280,11 @@ static uint32_t cc_calc_subu_32(uint32_t a1, uint32_t a2, uint32_t ar)
 
 static uint32_t cc_calc_subb_32(uint32_t a1, uint32_t a2, uint32_t ar)
 {
-    /* We had borrow-in if normal subtraction isn't equal.  */
-    int borrow_in = ar - (a1 - a2);
     int borrow_out;
 
-    /* If a2 was UINT_MAX, and borrow_in, then a2 is logically 65 bits,
-       and we must have had borrow out.  */
-    if (borrow_in && a2 == (uint32_t)-1) {
-        borrow_out = 1;
+    if (ar != a1 - a2) {	/* difference means borrow-in */
+        borrow_out = (a2 >= a1);
     } else {
-        a2 += borrow_in;
         borrow_out = (a2 > a1);
     }
 
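
The rewritten borrow rule is easy to validate exhaustively at a smaller
width: a1 - a2 - borrow_in underflows exactly when a2 + borrow_in > a1, i.e.
a2 > a1 without borrow-in and a2 >= a1 with it.  A standalone check over all
8-bit operand combinations (same algebra as the 32- and 64-bit helpers above):

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        for (unsigned a1 = 0; a1 < 256; a1++) {
            for (unsigned a2 = 0; a2 < 256; a2++) {
                for (unsigned bin = 0; bin <= 1; bin++) {
                    uint8_t ar = a1 - a2 - bin;
                    /* Reference: the true borrow-out condition. */
                    int ref = a2 + bin > a1;
                    /* Rule from cc_calc_subb_*: a result that differs
                     * from the plain difference implies borrow-in.
                     */
                    int out = (ar != (uint8_t)(a1 - a2)) ? (a2 >= a1)
                                                         : (a2 > a1);
                    assert(out == ref);
                }
            }
        }
        return 0;
    }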
diff --git a/target-s390x/cpu.h b/target-s390x/cpu.h
index c123b6f023..2e2554c4b3 100644
--- a/target-s390x/cpu.h
+++ b/target-s390x/cpu.h
@@ -133,7 +133,9 @@ typedef struct CPUS390XState {
 
     /* reset does memset(0) up to here */
 
-    int cpu_num;
+    uint32_t cpu_num;
+    uint32_t machine_type;
+
     uint8_t *storage_keys;
 
     uint64_t tod_offset;
diff --git a/target-s390x/helper.h b/target-s390x/helper.h
index faebfd96aa..8d2c8596bb 100644
--- a/target-s390x/helper.h
+++ b/target-s390x/helper.h
@@ -111,5 +111,8 @@ DEF_HELPER_FLAGS_2(sacf, TCG_CALL_NO_WG, void, env, i64)
 DEF_HELPER_FLAGS_3(ipte, TCG_CALL_NO_RWG, void, env, i64, i64)
 DEF_HELPER_FLAGS_1(ptlb, TCG_CALL_NO_RWG, void, env)
 DEF_HELPER_2(lra, i64, env, i64)
+DEF_HELPER_FLAGS_2(lura, TCG_CALL_NO_WG, i64, env, i64)
+DEF_HELPER_FLAGS_2(lurag, TCG_CALL_NO_WG, i64, env, i64)
 DEF_HELPER_FLAGS_3(stura, TCG_CALL_NO_WG, void, env, i64, i64)
+DEF_HELPER_FLAGS_3(sturg, TCG_CALL_NO_WG, void, env, i64, i64)
 #endif
diff --git a/target-s390x/insn-data.def b/target-s390x/insn-data.def
index 4d2feb6977..8d8e47e0bf 100644
--- a/target-s390x/insn-data.def
+++ b/target-s390x/insn-data.def
@@ -285,8 +285,12 @@
 
 /* EXTRACT ACCESS */
     C(0xb24f, EAR,     RRE,   Z,   0, 0, new, r1_32, ear, 0)
+/* EXTRACT CPU ATTRIBUTE */
+    C(0xeb4c, ECAG,    RSY_a, GIE, 0, a2, r1, 0, ecag, 0)
 /* EXTRACT FPC */
     C(0xb38c, EFPC,    RRE,   Z,   0, 0, new, r1_32, efpc, 0)
+/* EXTRACT PSW */
+    C(0xb98d, EPSW,    RRE,   Z,   0, 0, 0, 0, epsw, 0)
 
 /* FIND LEFTMOST ONE */
     C(0xb983, FLOGR,   RRE,   EI,  0, r2_o, r1_P, 0, flogr, 0)
@@ -566,6 +570,10 @@
 
 /* SET ACCESS */
     C(0xb24e, SAR,     RRE,   Z,   0, r2_o, 0, 0, sar, 0)
+/* SET ADDRESSING MODE */
+    D(0x010c, SAM24,   E,     Z,   0, 0, 0, 0, sam, 0, 0)
+    D(0x010d, SAM31,   E,     Z,   0, 0, 0, 0, sam, 0, 1)
+    D(0x010e, SAM64,   E,     Z,   0, 0, 0, 0, sam, 0, 3)
 /* SET FPC */
     C(0xb384, SFPC,    RRE,   Z,   0, r1_o, 0, 0, sfpc, 0)
 /* SET FPC AND SIGNAL */
@@ -733,6 +741,9 @@
     C(0xb100, LRA,     RX_a,  Z,   0, a2, r1, 0, lra, 0)
     C(0xe313, LRAY,    RXY_a, LD,  0, a2, r1, 0, lra, 0)
     C(0xe303, LRAG,    RXY_a, Z,   0, a2, r1, 0, lra, 0)
+/* LOAD USING REAL ADDRESS */
+    C(0xb24b, LURA,    RRE,   Z,   0, r2, new, r1_32, lura, 0)
+    C(0xb905, LURAG,   RRE,   Z,   0, r2, r1, 0, lurag, 0)
 /* MOVE TO PRIMARY */
     C(0xda00, MVCP,    SS_d,  Z,   la1, a2, 0, 0, mvcp, 0)
 /* MOVE TO SECONDARY */
@@ -743,10 +754,6 @@
     C(0xb22a, RRBE,    RRE,   Z,   0, r2_o, 0, 0, rrbe, 0)
 /* SERVICE CALL LOGICAL PROCESSOR (PV hypercall) */
     C(0xb220, SERVC,   RRE,   Z,   r1_o, r2_o, 0, 0, servc, 0)
-/* SET ADDRESSING MODE */
-    D(0x010c, SAM24,   E,     Z,   0, 0, 0, 0, sam, 0, 0)
-    D(0x010d, SAM31,   E,     Z,   0, 0, 0, 0, sam, 0, 1)
-    D(0x010e, SAM64,   E,     Z,   0, 0, 0, 0, sam, 0, 3)
 /* SET ADDRESS SPACE CONTROL FAST */
     C(0xb279, SACF,    S,     Z,   0, a2, 0, 0, sacf, 0)
 /* SET CLOCK */
@@ -794,6 +801,7 @@
     C(0xad00, STOSM,   SI,    Z,   la1, 0, 0, 0, stnosm, 0)
 /* STORE USING REAL ADDRESS */
     C(0xb246, STURA,   RRE,   Z,   r1_o, r2_o, 0, 0, stura, 0)
+    C(0xb925, STURG,   RRE,   Z,   r1_o, r2_o, 0, 0, sturg, 0)
 /* TEST PROTECTION */
     C(0xe501, TPROT,   SSE,   Z,   la1, a2, 0, 0, tprot, 0)
 
diff --git a/target-s390x/kvm.c b/target-s390x/kvm.c
index dcd75055c1..6f2d5b4924 100644
--- a/target-s390x/kvm.c
+++ b/target-s390x/kvm.c
@@ -1046,7 +1046,7 @@ static void kvm_handle_diag_308(S390CPU *cpu, struct kvm_run *run)
     uint64_t r1, r3;
 
     cpu_synchronize_state(CPU(cpu));
-    r1 = (run->s390_sieic.ipa & 0x00f0) >> 8;
+    r1 = (run->s390_sieic.ipa & 0x00f0) >> 4;
     r3 = run->s390_sieic.ipa & 0x000f;
     handle_diag_308(&cpu->env, r1, r3);
 }
@@ -1091,7 +1091,7 @@ static int handle_diag(S390CPU *cpu, struct kvm_run *run, uint32_t ipb)
         break;
     default:
         DPRINTF("KVM: unknown DIAG: 0x%x\n", func_code);
-        r = -1;
+        enter_pgmcheck(cpu, PGM_SPECIFICATION);
         break;
     }
 
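
The r1 fix above matters because s390_sieic.ipa holds the first two bytes of
the intercepted instruction: for DIAGNOSE (opcode 0x83) the second byte packs
r1 and r3 as nibbles, so r1 sits in bits 4-7 of the low byte.  With the old
">> 8", the 0x00f0 mask had already cleared every bit above bit 7, so r1
always decoded as 0.  An illustration with hypothetical register numbers:

    uint16_t ipa = 0x8324;               /* diag %r2,%r4,... */
    uint64_t r1 = (ipa & 0x00f0) >> 4;   /* == 2 */
    uint64_t r3 = ipa & 0x000f;          /* == 4 */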
diff --git a/target-s390x/mem_helper.c b/target-s390x/mem_helper.c
index 5a55de86a1..d67b345ad1 100644
--- a/target-s390x/mem_helper.c
+++ b/target-s390x/mem_helper.c
@@ -490,10 +490,18 @@ uint32_t HELPER(ex)(CPUS390XState *env, uint32_t cc, uint64_t v1,
             helper_mvc(env, l, get_address(env, 0, b1, d1),
                        get_address(env, 0, b2, d2));
             break;
+        case 0x400:
+            cc = helper_nc(env, l, get_address(env, 0, b1, d1),
+                            get_address(env, 0, b2, d2));
+            break;
         case 0x500:
             cc = helper_clc(env, l, get_address(env, 0, b1, d1),
                             get_address(env, 0, b2, d2));
             break;
+        case 0x600:
+            cc = helper_oc(env, l, get_address(env, 0, b1, d1),
+                            get_address(env, 0, b2, d2));
+            break;
         case 0x700:
             cc = helper_xc(env, l, get_address(env, 0, b1, d1),
                            get_address(env, 0, b2, d2));
@@ -1034,12 +1042,34 @@ void HELPER(ptlb)(CPUS390XState *env)
     tlb_flush(CPU(cpu), 1);
 }
 
+/* load using real address */
+uint64_t HELPER(lura)(CPUS390XState *env, uint64_t addr)
+{
+    CPUState *cs = CPU(s390_env_get_cpu(env));
+
+    return (uint32_t)ldl_phys(cs->as, get_address(env, 0, 0, addr));
+}
+
+uint64_t HELPER(lurag)(CPUS390XState *env, uint64_t addr)
+{
+    CPUState *cs = CPU(s390_env_get_cpu(env));
+
+    return ldq_phys(cs->as, get_address(env, 0, 0, addr));
+}
+
 /* store using real address */
 void HELPER(stura)(CPUS390XState *env, uint64_t addr, uint64_t v1)
 {
     CPUState *cs = CPU(s390_env_get_cpu(env));
 
-    stw_phys(cs->as, get_address(env, 0, 0, addr), (uint32_t)v1);
+    stl_phys(cs->as, get_address(env, 0, 0, addr), (uint32_t)v1);
+}
+
+void HELPER(sturg)(CPUS390XState *env, uint64_t addr, uint64_t v1)
+{
+    CPUState *cs = CPU(s390_env_get_cpu(env));
+
+    stq_phys(cs->as, get_address(env, 0, 0, addr), v1);
 }
 
 /* load real address */
diff --git a/target-s390x/translate.c b/target-s390x/translate.c
index ab01bc004e..8b36eca718 100644
--- a/target-s390x/translate.c
+++ b/target-s390x/translate.c
@@ -317,12 +317,14 @@ static inline void gen_illegal_opcode(DisasContext *s)
     gen_program_exception(s, PGM_SPECIFICATION);
 }
 
-static inline void check_privileged(DisasContext *s)
+#ifndef CONFIG_USER_ONLY
+static void check_privileged(DisasContext *s)
 {
     if (s->tb->flags & (PSW_MASK_PSTATE >> 32)) {
         gen_program_exception(s, PGM_PRIVILEGED);
     }
 }
+#endif
 
 static TCGv_i64 get_address(DisasContext *s, int x2, int b2, int d2)
 {
@@ -2045,12 +2047,37 @@ static ExitStatus op_ear(DisasContext *s, DisasOps *o)
     return NO_EXIT;
 }
 
+static ExitStatus op_ecag(DisasContext *s, DisasOps *o)
+{
+    /* No cache information provided.  */
+    tcg_gen_movi_i64(o->out, -1);
+    return NO_EXIT;
+}
+
 static ExitStatus op_efpc(DisasContext *s, DisasOps *o)
 {
     tcg_gen_ld32u_i64(o->out, cpu_env, offsetof(CPUS390XState, fpc));
     return NO_EXIT;
 }
 
+static ExitStatus op_epsw(DisasContext *s, DisasOps *o)
+{
+    int r1 = get_field(s->fields, r1);
+    int r2 = get_field(s->fields, r2);
+    TCGv_i64 t = tcg_temp_new_i64();
+
+    /* Note the "subsequently" in the PoO, which implies a defined result
+       if r1 == r2.  Thus we cannot defer these writes to an output hook.  */
+    tcg_gen_shri_i64(t, psw_mask, 32);
+    store_reg32_i64(r1, t);
+    if (r2 != 0) {
+        store_reg32_i64(r2, psw_mask);
+    }
+
+    tcg_temp_free_i64(t);
+    return NO_EXIT;
+}
+
 static ExitStatus op_ex(DisasContext *s, DisasOps *o)
 {
     /* ??? Perhaps a better way to implement EXECUTE is to set a bit in
@@ -2460,6 +2487,24 @@ static ExitStatus op_lm64(DisasContext *s, DisasOps *o)
     return NO_EXIT;
 }
 
+#ifndef CONFIG_USER_ONLY
+static ExitStatus op_lura(DisasContext *s, DisasOps *o)
+{
+    check_privileged(s);
+    potential_page_fault(s);
+    gen_helper_lura(o->out, cpu_env, o->in2);
+    return NO_EXIT;
+}
+
+static ExitStatus op_lurag(DisasContext *s, DisasOps *o)
+{
+    check_privileged(s);
+    potential_page_fault(s);
+    gen_helper_lurag(o->out, cpu_env, o->in2);
+    return NO_EXIT;
+}
+#endif
+
 static ExitStatus op_mov2(DisasContext *s, DisasOps *o)
 {
     o->out = o->in2;
@@ -2925,19 +2970,42 @@ static ExitStatus op_sacf(DisasContext *s, DisasOps *o)
     /* Addressing mode has changed, so end the block.  */
     return EXIT_PC_STALE;
 }
+#endif
 
 static ExitStatus op_sam(DisasContext *s, DisasOps *o)
 {
     int sam = s->insn->data;
-    TCGv_i64 tsam = tcg_const_i64(sam);
+    TCGv_i64 tsam;
+    uint64_t mask;
 
-    /* Overwrite PSW_MASK_64 and PSW_MASK_32 */
-    tcg_gen_deposit_i64(psw_mask, psw_mask, tsam, 31, 2);
+    switch (sam) {
+    case 0:
+        mask = 0xffffff;
+        break;
+    case 1:
+        mask = 0x7fffffff;
+        break;
+    default:
+        mask = -1;
+        break;
+    }
 
+    /* Bizarre but true: we check the address of the current insn for the
+       specification exception, not the next to be executed.  Thus the PoO
+       documents that Bad Things Happen two bytes before the end.  */
+    if (s->pc & ~mask) {
+        gen_program_exception(s, PGM_SPECIFICATION);
+        return EXIT_NORETURN;
+    }
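+    /* Illustrative values: SAM24 (mask 0xffffff) executed at
+     * s->pc == 0x1000002 has address bits above bit 23 set and trips
+     * the check above, while at 0x4000 it passes and s->next_pc is
+     * truncated to 24 bits below.
+     */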
+    s->next_pc &= mask;
+
+    tsam = tcg_const_i64(sam);
+    tcg_gen_deposit_i64(psw_mask, psw_mask, tsam, 31, 2);
     tcg_temp_free_i64(tsam);
+
+    /* Always exit the TB, since we (may have) changed execution mode.  */
     return EXIT_PC_STALE;
 }
-#endif
 
 static ExitStatus op_sar(DisasContext *s, DisasOps *o)
 {
@@ -3221,8 +3289,14 @@ static ExitStatus op_stctl(DisasContext *s, DisasOps *o)
 
 static ExitStatus op_stidp(DisasContext *s, DisasOps *o)
 {
+    TCGv_i64 t1 = tcg_temp_new_i64();
+
     check_privileged(s);
     tcg_gen_ld32u_i64(o->out, cpu_env, offsetof(CPUS390XState, cpu_num));
+    tcg_gen_ld32u_i64(t1, cpu_env, offsetof(CPUS390XState, machine_type));
+    tcg_gen_deposit_i64(o->out, o->out, t1, 32, 32);
+    tcg_temp_free_i64(t1);
+
     return NO_EXIT;
 }
 
@@ -3317,6 +3391,14 @@ static ExitStatus op_stura(DisasContext *s, DisasOps *o)
     gen_helper_stura(cpu_env, o->in2, o->in1);
     return NO_EXIT;
 }
+
+static ExitStatus op_sturg(DisasContext *s, DisasOps *o)
+{
+    check_privileged(s);
+    potential_page_fault(s);
+    gen_helper_sturg(cpu_env, o->in2, o->in1);
+    return NO_EXIT;
+}
 #endif
 
 static ExitStatus op_st8(DisasContext *s, DisasOps *o)
diff --git a/tests/Makefile b/tests/Makefile
index c2e2e52f22..db5b3c3df1 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -60,6 +60,8 @@ gcov-files-test-mul64-y = util/host-utils.c
 check-unit-y += tests/test-int128$(EXESUF)
 # all code tested by test-int128 is inside int128.h
 gcov-files-test-int128-y =
+check-unit-y += tests/rcutorture$(EXESUF)
+gcov-files-rcutorture-y = util/rcu.c
 check-unit-y += tests/test-bitops$(EXESUF)
 check-unit-$(CONFIG_HAS_GLIB_SUBPROCESS_TESTS) += tests/test-qdev-global-props$(EXESUF)
 check-unit-y += tests/check-qom-interface$(EXESUF)
@@ -223,7 +225,8 @@ test-obj-y = tests/check-qint.o tests/check-qstring.o tests/check-qdict.o \
 	tests/test-qmp-input-visitor.o tests/test-qmp-input-strict.o \
 	tests/test-qmp-commands.o tests/test-visitor-serialization.o \
 	tests/test-x86-cpuid.o tests/test-mul64.o tests/test-int128.o \
-	tests/test-opts-visitor.o tests/test-qmp-event.o
+	tests/test-opts-visitor.o tests/test-qmp-event.o \
+	tests/rcutorture.o
 
 test-qapi-obj-y = tests/test-qapi-visit.o tests/test-qapi-types.o \
 		  tests/test-qapi-event.o
@@ -252,6 +255,8 @@ tests/test-x86-cpuid$(EXESUF): tests/test-x86-cpuid.o
 tests/test-xbzrle$(EXESUF): tests/test-xbzrle.o migration/xbzrle.o page_cache.o libqemuutil.a
 tests/test-cutils$(EXESUF): tests/test-cutils.o util/cutils.o
 tests/test-int128$(EXESUF): tests/test-int128.o
+tests/rcutorture$(EXESUF): tests/rcutorture.o libqemuutil.a
+
 tests/test-qdev-global-props$(EXESUF): tests/test-qdev-global-props.o \
 	hw/core/qdev.o hw/core/qdev-properties.o hw/core/hotplug.o\
 	hw/core/irq.o \
diff --git a/tests/rcutorture.c b/tests/rcutorture.c
new file mode 100644
index 0000000000..60a2ccfe2e
--- /dev/null
+++ b/tests/rcutorture.c
@@ -0,0 +1,451 @@
+/*
+ * rcutorture.c: simple user-level performance/stress test of RCU.
+ *
+ * Usage:
+ *     ./rcu <nreaders> rperf [ <seconds> ]
+ *         Run a read-side performance test with the specified
+ *         number of readers for <seconds> seconds.
+ *     ./rcu <nupdaters> uperf [ <seconds> ]
+ *         Run an update-side performance test with the specified
+ *         number of updaters and specified duration.
+ *     ./rcu <nreaders> perf [ <seconds> ]
+ *         Run a combined read/update performance test with the specified
+ *         number of readers and one updater and specified duration.
+ *
+ * The above tests produce output as follows:
+ *
+ * n_reads: 46008000  n_updates: 146026  nreaders: 2  nupdaters: 1 duration: 1
+ * ns/read: 43.4707  ns/update: 6848.1
+ *
+ * The first line lists the total number of RCU reads and updates executed
+ * during the test, the number of reader threads, the number of updater
+ * threads, and the duration of the test in seconds.  The second line
+ * lists the average duration of each type of operation in nanoseconds,
+ * or "nan" if the corresponding type of operation was not performed.
+ *
+ *     ./rcu <nreaders> stress [ <seconds> ]
+ *         Run a stress test with the specified number of readers and
+ *         one updater.
+ *
+ * This test produces output as follows:
+ *
+ * n_reads: 114633217  n_updates: 3903415  n_mberror: 0
+ * rcu_stress_count: 114618391 14826 0 0 0 0 0 0 0 0 0
+ *
+ * The first line lists the number of RCU read and update operations
+ * executed, followed by the number of memory-ordering violations
+ * (which will be zero in a correct RCU implementation).  The second
+ * line lists the number of readers observing progressively more stale
+ * data.  A correct RCU implementation will have all but the first two
+ * numbers zero.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (c) 2008 Paul E. McKenney, IBM Corporation.
+ */
+
+/*
+ * Test variables.
+ */
+
+#include <glib.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include "qemu/atomic.h"
+#include "qemu/rcu.h"
+#include "qemu/compiler.h"
+#include "qemu/thread.h"
+
+long long n_reads = 0LL;
+long n_updates = 0L;
+int nthreadsrunning;
+
+#define GOFLAG_INIT 0
+#define GOFLAG_RUN  1
+#define GOFLAG_STOP 2
+
+static volatile int goflag = GOFLAG_INIT;
+
+#define RCU_READ_RUN 1000
+
+#define NR_THREADS 100
+static QemuThread threads[NR_THREADS];
+static struct rcu_reader_data *data[NR_THREADS];
+static int n_threads;
+
+static void create_thread(void *(*func)(void *))
+{
+    if (n_threads >= NR_THREADS) {
+        fprintf(stderr, "Thread limit of %d exceeded!\n", NR_THREADS);
+        exit(-1);
+    }
+    qemu_thread_create(&threads[n_threads], "test", func, &data[n_threads],
+                       QEMU_THREAD_JOINABLE);
+    n_threads++;
+}
+
+static void wait_all_threads(void)
+{
+    int i;
+
+    for (i = 0; i < n_threads; i++) {
+        qemu_thread_join(&threads[i]);
+    }
+    n_threads = 0;
+}
+
+/*
+ * Performance test.
+ */
+
+static void *rcu_read_perf_test(void *arg)
+{
+    int i;
+    long long n_reads_local = 0;
+
+    rcu_register_thread();
+
+    *(struct rcu_reader_data **)arg = &rcu_reader;
+    atomic_inc(&nthreadsrunning);
+    while (goflag == GOFLAG_INIT) {
+        g_usleep(1000);
+    }
+    while (goflag == GOFLAG_RUN) {
+        for (i = 0; i < RCU_READ_RUN; i++) {
+            rcu_read_lock();
+            rcu_read_unlock();
+        }
+        n_reads_local += RCU_READ_RUN;
+    }
+    atomic_add(&n_reads, n_reads_local);
+
+    rcu_unregister_thread();
+    return NULL;
+}
+
+static void *rcu_update_perf_test(void *arg)
+{
+    long long n_updates_local = 0;
+
+    rcu_register_thread();
+
+    *(struct rcu_reader_data **)arg = &rcu_reader;
+    atomic_inc(&nthreadsrunning);
+    while (goflag == GOFLAG_INIT) {
+        g_usleep(1000);
+    }
+    while (goflag == GOFLAG_RUN) {
+        synchronize_rcu();
+        n_updates_local++;
+    }
+    atomic_add(&n_updates, n_updates_local);
+
+    rcu_unregister_thread();
+    return NULL;
+}
+
+static void perftestinit(void)
+{
+    nthreadsrunning = 0;
+}
+
+static void perftestrun(int nthreads, int duration, int nreaders, int nupdaters)
+{
+    while (atomic_read(&nthreadsrunning) < nthreads) {
+        g_usleep(1000);
+    }
+    goflag = GOFLAG_RUN;
+    g_usleep(duration * G_USEC_PER_SEC);
+    goflag = GOFLAG_STOP;
+    wait_all_threads();
+    printf("n_reads: %lld  n_updates: %ld  nreaders: %d  nupdaters: %d duration: %d\n",
+           n_reads, n_updates, nreaders, nupdaters, duration);
+    printf("ns/read: %g  ns/update: %g\n",
+           ((duration * 1000 * 1000 * 1000. * (double)nreaders) /
+            (double)n_reads),
+           ((duration * 1000 * 1000 * 1000. * (double)nupdaters) /
+            (double)n_updates));
+    exit(0);
+}
+
+static void perftest(int nreaders, int duration)
+{
+    int i;
+
+    perftestinit();
+    for (i = 0; i < nreaders; i++) {
+        create_thread(rcu_read_perf_test);
+    }
+    create_thread(rcu_update_perf_test);
+    perftestrun(i + 1, duration, nreaders, 1);
+}
+
+static void rperftest(int nreaders, int duration)
+{
+    int i;
+
+    perftestinit();
+    for (i = 0; i < nreaders; i++) {
+        create_thread(rcu_read_perf_test);
+    }
+    perftestrun(i, duration, nreaders, 0);
+}
+
+static void uperftest(int nupdaters, int duration)
+{
+    int i;
+
+    perftestinit();
+    for (i = 0; i < nupdaters; i++) {
+        create_thread(rcu_update_perf_test);
+    }
+    perftestrun(i, duration, 0, nupdaters);
+}
+
+/*
+ * Stress test.
+ */
+
+#define RCU_STRESS_PIPE_LEN 10
+
+struct rcu_stress {
+    int pipe_count;
+    int mbtest;
+};
+
+struct rcu_stress rcu_stress_array[RCU_STRESS_PIPE_LEN] = { { 0 } };
+struct rcu_stress *rcu_stress_current;
+int rcu_stress_idx;
+
+int n_mberror;
+long long rcu_stress_count[RCU_STRESS_PIPE_LEN + 1];
+
+
+static void *rcu_read_stress_test(void *arg)
+{
+    int i;
+    int itercnt = 0;
+    struct rcu_stress *p;
+    int pc;
+    long long n_reads_local = 0;
+    volatile int garbage = 0;
+
+    rcu_register_thread();
+
+    *(struct rcu_reader_data **)arg = &rcu_reader;
+    while (goflag == GOFLAG_INIT) {
+        g_usleep(1000);
+    }
+    while (goflag == GOFLAG_RUN) {
+        rcu_read_lock();
+        p = atomic_rcu_read(&rcu_stress_current);
+        if (p->mbtest == 0) {
+            n_mberror++;
+        }
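+        /* Deliberately nested critical section: exercises the depth
+         * counting in rcu_read_lock()/rcu_read_unlock().
+         */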
+        rcu_read_lock();
+        for (i = 0; i < 100; i++) {
+            garbage++;
+        }
+        rcu_read_unlock();
+        pc = p->pipe_count;
+        rcu_read_unlock();
+        if ((pc > RCU_STRESS_PIPE_LEN) || (pc < 0)) {
+            pc = RCU_STRESS_PIPE_LEN;
+        }
+        atomic_inc(&rcu_stress_count[pc]);
+        n_reads_local++;
+        if ((++itercnt % 0x1000) == 0) {
+            synchronize_rcu();
+        }
+    }
+    atomic_add(&n_reads, n_reads_local);
+
+    rcu_unregister_thread();
+    return NULL;
+}
+
+static void *rcu_update_stress_test(void *arg)
+{
+    int i;
+    struct rcu_stress *p;
+
+    rcu_register_thread();
+
+    *(struct rcu_reader_data **)arg = &rcu_reader;
+    while (goflag == GOFLAG_INIT) {
+        g_usleep(1000);
+    }
+    while (goflag == GOFLAG_RUN) {
+        i = rcu_stress_idx + 1;
+        if (i >= RCU_STRESS_PIPE_LEN) {
+            i = 0;
+        }
+        p = &rcu_stress_array[i];
+        p->mbtest = 0;
+        smp_mb();
+        p->pipe_count = 0;
+        p->mbtest = 1;
+        atomic_rcu_set(&rcu_stress_current, p);
+        rcu_stress_idx = i;
+        for (i = 0; i < RCU_STRESS_PIPE_LEN; i++) {
+            if (i != rcu_stress_idx) {
+                rcu_stress_array[i].pipe_count++;
+            }
+        }
+        synchronize_rcu();
+        n_updates++;
+    }
+
+    rcu_unregister_thread();
+    return NULL;
+}
+
+static void *rcu_fake_update_stress_test(void *arg)
+{
+    rcu_register_thread();
+
+    *(struct rcu_reader_data **)arg = &rcu_reader;
+    while (goflag == GOFLAG_INIT) {
+        g_usleep(1000);
+    }
+    while (goflag == GOFLAG_RUN) {
+        synchronize_rcu();
+        g_usleep(1000);
+    }
+
+    rcu_unregister_thread();
+    return NULL;
+}
+
+static void stresstest(int nreaders, int duration)
+{
+    int i;
+
+    rcu_stress_current = &rcu_stress_array[0];
+    rcu_stress_current->pipe_count = 0;
+    rcu_stress_current->mbtest = 1;
+    for (i = 0; i < nreaders; i++) {
+        create_thread(rcu_read_stress_test);
+    }
+    create_thread(rcu_update_stress_test);
+    for (i = 0; i < 5; i++) {
+        create_thread(rcu_fake_update_stress_test);
+    }
+    goflag = GOFLAG_RUN;
+    g_usleep(duration * G_USEC_PER_SEC);
+    goflag = GOFLAG_STOP;
+    wait_all_threads();
+    printf("n_reads: %lld  n_updates: %ld  n_mberror: %d\n",
+           n_reads, n_updates, n_mberror);
+    printf("rcu_stress_count:");
+    for (i = 0; i <= RCU_STRESS_PIPE_LEN; i++) {
+        printf(" %lld", rcu_stress_count[i]);
+    }
+    printf("\n");
+    exit(0);
+}
+
+/* GTest interface */
+
+static void gtest_stress(int nreaders, int duration)
+{
+    int i;
+
+    rcu_stress_current = &rcu_stress_array[0];
+    rcu_stress_current->pipe_count = 0;
+    rcu_stress_current->mbtest = 1;
+    for (i = 0; i < nreaders; i++) {
+        create_thread(rcu_read_stress_test);
+    }
+    create_thread(rcu_update_stress_test);
+    for (i = 0; i < 5; i++) {
+        create_thread(rcu_fake_update_stress_test);
+    }
+    goflag = GOFLAG_RUN;
+    g_usleep(duration * G_USEC_PER_SEC);
+    goflag = GOFLAG_STOP;
+    wait_all_threads();
+    g_assert_cmpint(n_mberror, ==, 0);
+    for (i = 2; i <= RCU_STRESS_PIPE_LEN; i++) {
+        g_assert_cmpint(rcu_stress_count[i], ==, 0);
+    }
+}
+
+static void gtest_stress_1_1(void)
+{
+    gtest_stress(1, 1);
+}
+
+static void gtest_stress_10_1(void)
+{
+    gtest_stress(10, 1);
+}
+
+static void gtest_stress_1_5(void)
+{
+    gtest_stress(1, 5);
+}
+
+static void gtest_stress_10_5(void)
+{
+    gtest_stress(10, 5);
+}
+
+/*
+ * Main program.
+ */
+
+static void usage(int argc, char *argv[])
+{
+    fprintf(stderr, "Usage: %s [nreaders [ perf | stress ] ]\n", argv[0]);
+    exit(-1);
+}
+
+int main(int argc, char *argv[])
+{
+    int nreaders = 1;
+    int duration = 1;
+
+    if (argc >= 2 && argv[1][0] == '-') {
+        g_test_init(&argc, &argv, NULL);
+        if (g_test_quick()) {
+            g_test_add_func("/rcu/torture/1reader", gtest_stress_1_1);
+            g_test_add_func("/rcu/torture/10readers", gtest_stress_10_1);
+        } else {
+            g_test_add_func("/rcu/torture/1reader", gtest_stress_1_5);
+            g_test_add_func("/rcu/torture/10readers", gtest_stress_10_5);
+        }
+        return g_test_run();
+    }
+
+    if (argc >= 2) {
+        nreaders = strtoul(argv[1], NULL, 0);
+    }
+    if (argc > 3) {
+        duration = strtoul(argv[3], NULL, 0);
+    }
+    if (argc < 3 || strcmp(argv[2], "stress") == 0) {
+        stresstest(nreaders, duration);
+    } else if (strcmp(argv[2], "rperf") == 0) {
+        rperftest(nreaders, duration);
+    } else if (strcmp(argv[2], "uperf") == 0) {
+        uperftest(nreaders, duration);
+    } else if (strcmp(argv[2], "perf") == 0) {
+        perftest(nreaders, duration);
+    }
+    usage(argc, argv);
+    return 0;
+}
diff --git a/util/Makefile.objs b/util/Makefile.objs
index 93007e2f56..ceaba30939 100644
--- a/util/Makefile.objs
+++ b/util/Makefile.objs
@@ -17,3 +17,4 @@ util-obj-y += throttle.o
 util-obj-y += getauxval.o
 util-obj-y += readline.o
 util-obj-y += rfifolock.o
+util-obj-y += rcu.o
diff --git a/util/qemu-thread-posix.c b/util/qemu-thread-posix.c
index 41cb23df0c..50a29d8f7a 100644
--- a/util/qemu-thread-posix.c
+++ b/util/qemu-thread-posix.c
@@ -307,11 +307,13 @@ static inline void futex_wait(QemuEvent *ev, unsigned val)
 #else
 static inline void futex_wake(QemuEvent *ev, int n)
 {
+    pthread_mutex_lock(&ev->lock);
     if (n == 1) {
         pthread_cond_signal(&ev->cond);
     } else {
         pthread_cond_broadcast(&ev->cond);
     }
+    pthread_mutex_unlock(&ev->lock);
 }
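
The locking added above closes a classic lost-wakeup window in the pthreads
fallback.  A sketch of the bad interleaving, assuming futex_wait() checks
ev->value and then blocks on ev->cond while holding ev->lock:

    /*
     *   waiter                           waker (no lock held)
     *   ------                           --------------------
     *   pthread_mutex_lock(&ev->lock)
     *   sees ev->value: must wait
     *                                    pthread_cond_signal(&ev->cond)
     *                                      ... nobody is waiting yet,
     *                                      so the signal is lost
     *   pthread_cond_wait(&ev->cond,
     *                     &ev->lock)     then sleeps forever
     *
     * With the waker also taking ev->lock, the signal can no longer fall
     * between the waiter's check and its pthread_cond_wait().
     */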
 
 static inline void futex_wait(QemuEvent *ev, unsigned val)
diff --git a/util/rcu.c b/util/rcu.c
new file mode 100644
index 0000000000..c9c3e6e4ab
--- /dev/null
+++ b/util/rcu.c
@@ -0,0 +1,291 @@
+/*
+ * urcu-mb.c
+ *
+ * Userspace RCU library with explicit memory barriers
+ *
+ * Copyright (c) 2009 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ * Copyright (c) 2009 Paul E. McKenney, IBM Corporation.
+ * Copyright 2015 Red Hat, Inc.
+ *
+ * Ported to QEMU by Paolo Bonzini  <pbonzini@redhat.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * IBM's contributions to this file may be relicensed under LGPLv2 or later.
+ */
+
+#include "qemu-common.h"
+#include <stdio.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <errno.h>
+#include "qemu/rcu.h"
+#include "qemu/atomic.h"
+#include "qemu/thread.h"
+
+/*
+ * Global grace period counter.  Bit 0 is always one in rcu_gp_ctr.
+ * Bits 1 and above are defined in synchronize_rcu.
+ */
+#define RCU_GP_LOCKED           (1UL << 0)
+#define RCU_GP_CTR              (1UL << 1)
+
+unsigned long rcu_gp_ctr = RCU_GP_LOCKED;
+
+QemuEvent rcu_gp_event;
+static QemuMutex rcu_gp_lock;
+
+/*
+ * Check whether a quiescent state was crossed between the beginning of
+ * update_counter_and_wait and now.
+ */
+static inline int rcu_gp_ongoing(unsigned long *ctr)
+{
+    unsigned long v;
+
+    v = atomic_read(ctr);
+    return v && (v != rcu_gp_ctr);
+}
+
+/* Written to only by each individual reader. Read by both the reader and the
+ * writers.
+ */
+__thread struct rcu_reader_data rcu_reader;
+
+/* Protected by rcu_gp_lock.  */
+typedef QLIST_HEAD(, rcu_reader_data) ThreadList;
+static ThreadList registry = QLIST_HEAD_INITIALIZER(registry);
+
+/* Wait for previous parity/grace period to be empty of readers.  */
+static void wait_for_readers(void)
+{
+    ThreadList qsreaders = QLIST_HEAD_INITIALIZER(qsreaders);
+    struct rcu_reader_data *index, *tmp;
+
+    for (;;) {
+        /* We want to be notified of changes made to rcu_gp_ongoing
+         * while we walk the list.
+         */
+        qemu_event_reset(&rcu_gp_event);
+
+        /* Instead of using atomic_mb_set for index->waiting, and
+         * atomic_mb_read for index->ctr, memory barriers are placed
+         * manually since writes to different threads are independent.
+         * atomic_mb_set has a smp_wmb before...
+         */
+        smp_wmb();
+        QLIST_FOREACH(index, &registry, node) {
+            atomic_set(&index->waiting, true);
+        }
+
+        /* ... and a smp_mb after.  */
+        smp_mb();
+
+        QLIST_FOREACH_SAFE(index, &registry, node, tmp) {
+            if (!rcu_gp_ongoing(&index->ctr)) {
+                QLIST_REMOVE(index, node);
+                QLIST_INSERT_HEAD(&qsreaders, index, node);
+
+                /* No need for mb_set here, worst of all we
+                 * get some extra futex wakeups.
+                 */
+                atomic_set(&index->waiting, false);
+            }
+        }
+
+        /* atomic_mb_read has smp_rmb after.  */
+        smp_rmb();
+
+        if (QLIST_EMPTY(&registry)) {
+            break;
+        }
+
+        /* Wait for one thread to report a quiescent state and
+         * try again.
+         */
+        qemu_event_wait(&rcu_gp_event);
+    }
+
+    /* put back the reader list in the registry */
+    QLIST_SWAP(&registry, &qsreaders, node);
+}
+
+void synchronize_rcu(void)
+{
+    qemu_mutex_lock(&rcu_gp_lock);
+
+    if (!QLIST_EMPTY(&registry)) {
+        /* In either case, the atomic_mb_set below blocks stores that free
+         * old RCU-protected pointers.
+         */
+        if (sizeof(rcu_gp_ctr) < 8) {
+            /* For architectures with 32-bit longs, a two-subphases algorithm
+             * ensures we do not encounter overflow bugs.
+             *
+             * Switch parity: 0 -> 1, 1 -> 0.
+             */
+            atomic_mb_set(&rcu_gp_ctr, rcu_gp_ctr ^ RCU_GP_CTR);
+            wait_for_readers();
+            atomic_mb_set(&rcu_gp_ctr, rcu_gp_ctr ^ RCU_GP_CTR);
+        } else {
+            /* Increment current grace period.  */
+            atomic_mb_set(&rcu_gp_ctr, rcu_gp_ctr + RCU_GP_CTR);
+        }
+
+        wait_for_readers();
+    }
+
+    qemu_mutex_unlock(&rcu_gp_lock);
+}
+
+
+#define RCU_CALL_MIN_SIZE        30
+
+/* Multi-producer, single-consumer queue based on urcu/static/wfqueue.h
+ * from liburcu.  Note that head is only used by the consumer.
+ */
+static struct rcu_head dummy;
+static struct rcu_head *head = &dummy, **tail = &dummy.next;
+static int rcu_call_count;
+static QemuEvent rcu_call_ready_event;
+
+static void enqueue(struct rcu_head *node)
+{
+    struct rcu_head **old_tail;
+
+    node->next = NULL;
+    old_tail = atomic_xchg(&tail, &node->next);
+    atomic_mb_set(old_tail, node);
+}
+
+static struct rcu_head *try_dequeue(void)
+{
+    struct rcu_head *node, *next;
+
+retry:
+    /* Test for an empty list, which we do not expect.  Note that for
+     * the consumer, head and tail are always consistent.  The head
+     * is consistent because only the consumer reads/writes it.
+     * The tail is consistent because updating it is the first step
+     * of enqueuing.  Only the next pointers might be inconsistent.
+     */
+    if (head == &dummy && atomic_mb_read(&tail) == &dummy.next) {
+        abort();
+    }
+
+    /* If the head node has NULL in its next pointer, the value is
+     * wrong and we need to wait until its enqueuer finishes the update.
+     */
+    node = head;
+    next = atomic_mb_read(&head->next);
+    if (!next) {
+        return NULL;
+    }
+
+    /* Since we are the sole consumer, and we excluded the empty case
+     * above, the queue will always have at least two nodes: the
+     * dummy node, and the one being removed.  So we do not need to update
+     * the tail pointer.
+     */
+    head = next;
+
+    /* If we dequeued the dummy node, add it back at the end and retry.  */
+    if (node == &dummy) {
+        enqueue(node);
+        goto retry;
+    }
+
+    return node;
+}
+
+static void *call_rcu_thread(void *opaque)
+{
+    struct rcu_head *node;
+
+    for (;;) {
+        int tries = 0;
+        int n = atomic_read(&rcu_call_count);
+
+        /* Heuristically wait for a decent number of callbacks to pile up.
+         * Fetch rcu_call_count now; we must only process elements that were
+         * added before synchronize_rcu() starts.
+         */
+        while (n < RCU_CALL_MIN_SIZE && ++tries <= 5) {
+            g_usleep(100000);
+            qemu_event_reset(&rcu_call_ready_event);
+            n = atomic_read(&rcu_call_count);
+            if (n < RCU_CALL_MIN_SIZE) {
+                qemu_event_wait(&rcu_call_ready_event);
+                n = atomic_read(&rcu_call_count);
+            }
+        }
+
+        atomic_sub(&rcu_call_count, n);
+        synchronize_rcu();
+        while (n > 0) {
+            node = try_dequeue();
+            while (!node) {
+                qemu_event_reset(&rcu_call_ready_event);
+                node = try_dequeue();
+                if (!node) {
+                    qemu_event_wait(&rcu_call_ready_event);
+                    node = try_dequeue();
+                }
+            }
+
+            n--;
+            node->func(node);
+        }
+    }
+    abort();
+}
+
+void call_rcu1(struct rcu_head *node, void (*func)(struct rcu_head *node))
+{
+    node->func = func;
+    enqueue(node);
+    atomic_inc(&rcu_call_count);
+    qemu_event_set(&rcu_call_ready_event);
+}
+
+void rcu_register_thread(void)
+{
+    assert(rcu_reader.ctr == 0);
+    qemu_mutex_lock(&rcu_gp_lock);
+    QLIST_INSERT_HEAD(&registry, &rcu_reader, node);
+    qemu_mutex_unlock(&rcu_gp_lock);
+}
+
+void rcu_unregister_thread(void)
+{
+    qemu_mutex_lock(&rcu_gp_lock);
+    QLIST_REMOVE(&rcu_reader, node);
+    qemu_mutex_unlock(&rcu_gp_lock);
+}
+
+static void __attribute__((__constructor__)) rcu_init(void)
+{
+    QemuThread thread;
+
+    qemu_mutex_init(&rcu_gp_lock);
+    qemu_event_init(&rcu_gp_event, true);
+
+    qemu_event_init(&rcu_call_ready_event, false);
+    qemu_thread_create(&thread, "call_rcu", call_rcu_thread,
+                       NULL, QEMU_THREAD_DETACHED);
+
+    rcu_register_thread();
+}
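
For updaters that prefer to block rather than defer a callback,
synchronize_rcu() supports the classic unpublish-then-free idiom; a minimal
sketch, reusing the hypothetical frobnicator example shown for qemu/rcu.h
above:

    static void delete_frob(void)
    {
        struct frobnicator *old = frob;

        atomic_rcu_set(&frob, NULL);   /* unpublish */
        synchronize_rcu();             /* wait out pre-existing readers */
        g_free(old);                   /* no reader can still see 'old' */
    }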