commit 0ac20318ce16f4de288969b2007ef5a654176058
author:    Emilio G. Cota <cota@braap.org>  2017-08-04 23:46:31 -0400
committer: Richard Henderson <richard.henderson@linaro.org>  2018-06-15 08:18:48 -1000
tree:      ba285e1540b4d875b476acd72f364d8aaa7165d5
parent:    705ad1ff0ce264475cb4c9a3aa31ba94a04869fe
tcg: remove tb_lock

Use mmap_lock in user-mode to protect TCG state and the page descriptors.
In !user-mode, each vCPU has its own TCG state, so no locks are needed
there; per-page locks are used to protect the page descriptors.

Per-TB locks are used in both modes to protect TB jumps.
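
A minimal standalone model of the per-TB jump lock may help picture
this. It is an illustrative sketch only, not QEMU code: tb_model,
jmp_edge and the function names below are made up, and real TBs carry
far more state. The point is that chaining into a destination block
takes only that block's spinlock, so patching unrelated blocks never
contends on a global lock.

    #include <pthread.h>
    #include <stdlib.h>

    struct tb_model;

    struct jmp_edge {                  /* one patched jump into a block */
        struct tb_model *from;
        struct jmp_edge *next;
    };

    struct tb_model {
        pthread_spinlock_t jmp_lock;   /* protects jmp_incoming */
        struct jmp_edge *jmp_incoming; /* blocks that jump directly here */
    };

    static void tb_model_init(struct tb_model *tb)
    {
        pthread_spin_init(&tb->jmp_lock, PTHREAD_PROCESS_PRIVATE);
        tb->jmp_incoming = NULL;
    }

    /* Record that 'from' now jumps directly into 'to'. */
    static void tb_model_add_jump(struct tb_model *from, struct tb_model *to)
    {
        struct jmp_edge *e = malloc(sizeof(*e));

        if (!e) {
            return;                    /* chaining is only an optimization */
        }
        e->from = from;
        pthread_spin_lock(&to->jmp_lock);
        e->next = to->jmp_incoming;
        to->jmp_incoming = e;
        pthread_spin_unlock(&to->jmp_lock);
    }

    /* Invalidating a block unlinks every jump that targets it, again
     * under that single block's lock. */
    static void tb_model_invalidate(struct tb_model *tb)
    {
        pthread_spin_lock(&tb->jmp_lock);
        while (tb->jmp_incoming) {
            struct jmp_edge *e = tb->jmp_incoming;

            tb->jmp_incoming = e->next;
            free(e);                   /* real code would unpatch 'from' */
        }
        pthread_spin_unlock(&tb->jmp_lock);
    }

    int main(void)
    {
        struct tb_model a, b;

        tb_model_init(&a);
        tb_model_init(&b);
        tb_model_add_jump(&a, &b);     /* a's exit now jumps into b */
        tb_model_invalidate(&b);       /* b goes away; unlink the jump */
        return 0;
    }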

Some notes:

- tb_lock is removed from notdirty_mem_write by passing a
  locked page_collection to tb_invalidate_phys_page_fast.

- tcg_tb_lookup/remove/insert/etc have their own internal lock(s),
  so there is no need to further serialize access to them.

- do_tb_flush is run in a safe async context, meaning no other
  vCPU threads are running. Therefore acquiring mmap_lock there
  is just to please tools such as thread sanitizer.

- Not visible in the diff, but tb_invalidate_phys_page already
  has an assert_memory_lock.

- cpu_io_recompile is !user-only, so no mmap_lock there.

- Added mmap_unlock() calls before every siglongjmp that could
  be reached in user-mode while mmap_lock is held (see the sketch
  after these notes).
  + Added an assert for !have_mmap_lock() after returning from
    the longjmp in cpu_exec, just like we do in cpu_exec_step_atomic.
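
The siglongjmp note deserves a picture. Below is a self-contained
sketch of the rule, with a plain mutex and a thread-local flag standing
in for QEMU's mmap_lock()/have_mmap_lock() (exec_loop and fault_path
are made-up names): any path that can siglongjmp back to the exec loop
drops the lock first, and the loop asserts the lock is not held when it
regains control.

    #include <assert.h>
    #include <pthread.h>
    #include <setjmp.h>
    #include <stdbool.h>

    static pthread_mutex_t mmap_mutex = PTHREAD_MUTEX_INITIALIZER;
    static __thread bool mmap_locked;
    static sigjmp_buf exec_env;

    static void mmap_lock(void)
    {
        pthread_mutex_lock(&mmap_mutex);
        mmap_locked = true;
    }

    static void mmap_unlock(void)
    {
        mmap_locked = false;
        pthread_mutex_unlock(&mmap_mutex);
    }

    static bool have_mmap_lock(void)
    {
        return mmap_locked;
    }

    /* A fault path that unwinds to the exec loop: it must drop the
     * lock before the siglongjmp, or the lock would leak. */
    static void fault_path(void)
    {
        if (have_mmap_lock()) {
            mmap_unlock();
        }
        siglongjmp(exec_env, 1);
    }

    static void exec_loop(void)
    {
        if (sigsetjmp(exec_env, 0) != 0) {
            /* Re-entered via siglongjmp: the lock must not be held. */
            assert(!have_mmap_lock());
            return;
        }
        mmap_lock();
        fault_path();                  /* simulate a mid-translation fault */
    }

    int main(void)
    {
        exec_loop();
        return 0;
    }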

Performance numbers before/after:

Host: AMD Opteron(tm) Processor 6376

                 ubuntu 17.04 ppc64 bootup+shutdown time

  [ASCII plot: bootup+shutdown time (y axis, 100-700) vs. guest CPUs
   (x axis, 1-64), comparing "before" (B) against "tb lock removal" (D)]
  png: https://imgur.com/HwmBHXe

              debian jessie aarch64 bootup+shutdown time

  [ASCII plot: bootup+shutdown time (y axis, 10-90) vs. guest CPUs
   (x axis, 1-64), comparing "before" (B) against "tb lock removal" (D)]
  png: https://imgur.com/iGpGFtv

The gains are high for 4-8 CPUs. Beyond that point, however, unrelated
lock contention significantly hurts scalability.

Reviewed-by: Richard Henderson <richard.henderson@linaro.org>
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Diffstat (limited to 'accel/tcg/cpu-exec.c')
 accel/tcg/cpu-exec.c | 34 ++++++++--------------------------
 1 file changed, 8 insertions(+), 26 deletions(-)
diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
index c482008bc7..c738b7f7d6 100644
--- a/accel/tcg/cpu-exec.c
+++ b/accel/tcg/cpu-exec.c
@@ -212,20 +212,20 @@ static void cpu_exec_nocache(CPUState *cpu, int max_cycles,
        We only end up here when an existing TB is too long.  */
     cflags |= MIN(max_cycles, CF_COUNT_MASK);
 
-    tb_lock();
+    mmap_lock();
     tb = tb_gen_code(cpu, orig_tb->pc, orig_tb->cs_base,
                      orig_tb->flags, cflags);
     tb->orig_tb = orig_tb;
-    tb_unlock();
+    mmap_unlock();
 
     /* execute the generated code */
     trace_exec_tb_nocache(tb, tb->pc);
     cpu_tb_exec(cpu, tb);
 
-    tb_lock();
+    mmap_lock();
     tb_phys_invalidate(tb, -1);
+    mmap_unlock();
     tcg_tb_remove(tb);
-    tb_unlock();
 }
 #endif
 
@@ -244,9 +244,7 @@ void cpu_exec_step_atomic(CPUState *cpu)
         tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags, cf_mask);
         if (tb == NULL) {
             mmap_lock();
-            tb_lock();
             tb = tb_gen_code(cpu, pc, cs_base, flags, cflags);
-            tb_unlock();
             mmap_unlock();
         }
 
@@ -261,15 +259,13 @@ void cpu_exec_step_atomic(CPUState *cpu)
         cpu_tb_exec(cpu, tb);
         cc->cpu_exec_exit(cpu);
     } else {
-        /* We may have exited due to another problem here, so we need
-         * to reset any tb_locks we may have taken but didn't release.
+        /*
          * The mmap_lock is dropped by tb_gen_code if it runs out of
          * memory.
          */
 #ifndef CONFIG_SOFTMMU
         tcg_debug_assert(!have_mmap_lock());
 #endif
-        tb_lock_reset();
         assert_no_pages_locked();
     }
 
@@ -398,20 +394,11 @@ static inline TranslationBlock *tb_find(CPUState *cpu,
     TranslationBlock *tb;
     target_ulong cs_base, pc;
     uint32_t flags;
-    bool acquired_tb_lock = false;
 
     tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags, cf_mask);
     if (tb == NULL) {
-        /* mmap_lock is needed by tb_gen_code, and mmap_lock must be
-         * taken outside tb_lock. As system emulation is currently
-         * single threaded the locks are NOPs.
-         */
         mmap_lock();
-        tb_lock();
-        acquired_tb_lock = true;
-
         tb = tb_gen_code(cpu, pc, cs_base, flags, cf_mask);
-
         mmap_unlock();
         /* We add the TB in the virtual pc hash table for the fast lookup */
         atomic_set(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)], tb);
@@ -427,15 +414,8 @@ static inline TranslationBlock *tb_find(CPUState *cpu,
 #endif
     /* See if we can patch the calling TB. */
     if (last_tb && !qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN)) {
-        if (!acquired_tb_lock) {
-            tb_lock();
-            acquired_tb_lock = true;
-        }
         tb_add_jump(last_tb, tb_exit, tb);
     }
-    if (acquired_tb_lock) {
-        tb_unlock();
-    }
     return tb;
 }
 
@@ -710,7 +690,9 @@ int cpu_exec(CPUState *cpu)
         g_assert(cpu == current_cpu);
         g_assert(cc == CPU_GET_CLASS(cpu));
 #endif /* buggy compiler */
-        tb_lock_reset();
+#ifndef CONFIG_SOFTMMU
+        tcg_debug_assert(!have_mmap_lock());
+#endif
         if (qemu_mutex_iothread_locked()) {
             qemu_mutex_unlock_iothread();
         }