summary refs log tree commit diff stats
path: root/include
diff options
context:
space:
mode:
authorEmilio G. Cota <cota@braap.org>2017-07-11 17:33:33 -0400
committerRichard Henderson <richard.henderson@linaro.org>2017-10-10 07:37:10 -0700
commitf6bb84d53110398f4899c19dab4e0fe9908ec060 (patch)
tree0ff97e935b20d07c0d427c99f3acf41f70bd71c1 /include
parent7f11636dbee89b0e4d03e9e2b96e14649a7db778 (diff)
downloadfocaccia-qemu-f6bb84d53110398f4899c19dab4e0fe9908ec060.tar.gz
focaccia-qemu-f6bb84d53110398f4899c19dab4e0fe9908ec060.zip
tcg: consolidate TB lookups in tb_lookup__cpu_state
This avoids duplicating code. cpu_exec_step will also use the
new common function once we integrate parallel_cpus into tb->cflags.

Note that in this commit we also fix a race, described by Richard Henderson
during review. Think of this scenario with threads A and B:

   (A) Lookup succeeds for TB in hash without tb_lock
        (B) Sets the TB's tb->invalid flag
        (B) Removes the TB from tb_htable
        (B) Clears all CPU's tb_jmp_cache
   (A) Store TB into local tb_jmp_cache

Given that order of events, (A) will keep executing that invalid TB until
another flush of its tb_jmp_cache happens, which in theory might never happen.
We can fix this by checking the tb->invalid flag every time we look up a TB
from tb_jmp_cache, so that in the above scenario, next time we try to find
that TB in tb_jmp_cache, we won't, and will therefore be forced to look it
up in tb_htable.

Performance-wise, I measured a small improvement when booting debian-arm.
Note that inlining pays off:

 Performance counter stats for 'taskset -c 0 qemu-system-arm \
	-machine type=virt -nographic -smp 1 -m 4096 \
	-netdev user,id=unet,hostfwd=tcp::2222-:22 \
	-device virtio-net-device,netdev=unet \
	-drive file=jessie.qcow2,id=myblock,index=0,if=none \
	-device virtio-blk-device,drive=myblock \
	-kernel kernel.img -append console=ttyAMA0 root=/dev/vda1 \
	-name arm,debug-threads=on -smp 1' (10 runs):

Before:
      18714.917392 task-clock                #    0.952 CPUs utilized            ( +-  0.95% )
            23,142 context-switches          #    0.001 M/sec                    ( +-  0.50% )
                 1 CPU-migrations            #    0.000 M/sec
            10,558 page-faults               #    0.001 M/sec                    ( +-  0.95% )
    53,957,727,252 cycles                    #    2.883 GHz                      ( +-  0.91% ) [83.33%]
    24,440,599,852 stalled-cycles-frontend   #   45.30% frontend cycles idle     ( +-  1.20% ) [83.33%]
    16,495,714,424 stalled-cycles-backend    #   30.57% backend  cycles idle     ( +-  0.95% ) [66.66%]
    76,267,572,582 instructions              #    1.41  insns per cycle
                                             #    0.32  stalled cycles per insn  ( +-  0.87% ) [83.34%]
    12,692,186,323 branches                  #  678.186 M/sec                    ( +-  0.92% ) [83.35%]
       263,486,879 branch-misses             #    2.08% of all branches          ( +-  0.73% ) [83.34%]

      19.648474449 seconds time elapsed                                          ( +-  0.82% )

After, w/ inline (this patch):
      18471.376627 task-clock                #    0.955 CPUs utilized            ( +-  0.96% )
            23,048 context-switches          #    0.001 M/sec                    ( +-  0.48% )
                 1 CPU-migrations            #    0.000 M/sec
            10,708 page-faults               #    0.001 M/sec                    ( +-  0.81% )
    53,208,990,796 cycles                    #    2.881 GHz                      ( +-  0.98% ) [83.34%]
    23,941,071,673 stalled-cycles-frontend   #   44.99% frontend cycles idle     ( +-  0.95% ) [83.34%]
    16,161,773,848 stalled-cycles-backend    #   30.37% backend  cycles idle     ( +-  0.76% ) [66.67%]
    75,786,269,766 instructions              #    1.42  insns per cycle
                                             #    0.32  stalled cycles per insn  ( +-  1.24% ) [83.34%]
    12,573,617,143 branches                  #  680.708 M/sec                    ( +-  1.34% ) [83.33%]
       260,235,550 branch-misses             #    2.07% of all branches          ( +-  0.66% ) [83.33%]

      19.340502161 seconds time elapsed                                          ( +-  0.56% )

After, w/o inline:
      18791.253967 task-clock                #    0.954 CPUs utilized            ( +-  0.78% )
            23,230 context-switches          #    0.001 M/sec                    ( +-  0.42% )
                 1 CPU-migrations            #    0.000 M/sec
            10,563 page-faults               #    0.001 M/sec                    ( +-  1.27% )
    54,168,674,622 cycles                    #    2.883 GHz                      ( +-  0.80% ) [83.34%]
    24,244,712,629 stalled-cycles-frontend   #   44.76% frontend cycles idle     ( +-  1.37% ) [83.33%]
    16,288,648,572 stalled-cycles-backend    #   30.07% backend  cycles idle     ( +-  0.95% ) [66.66%]
    77,659,755,503 instructions              #    1.43  insns per cycle
                                             #    0.31  stalled cycles per insn  ( +-  0.97% ) [83.34%]
    12,922,780,045 branches                  #  687.702 M/sec                    ( +-  1.06% ) [83.34%]
       261,962,386 branch-misses             #    2.03% of all branches          ( +-  0.71% ) [83.35%]

      19.700174670 seconds time elapsed                                          ( +-  0.56% )

Reviewed-by: Richard Henderson <rth@twiddle.net>
Signed-off-by: Emilio G. Cota <cota@braap.org>
Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
Diffstat (limited to 'include')
-rw-r--r--include/exec/tb-lookup.h49
1 files changed, 49 insertions, 0 deletions
diff --git a/include/exec/tb-lookup.h b/include/exec/tb-lookup.h
new file mode 100644
index 0000000000..9d32cb0c6e
--- /dev/null
+++ b/include/exec/tb-lookup.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (C) 2017, Emilio G. Cota <cota@braap.org>
+ *
+ * License: GNU GPL, version 2 or later.
+ *   See the COPYING file in the top-level directory.
+ */
+#ifndef EXEC_TB_LOOKUP_H
+#define EXEC_TB_LOOKUP_H
+
+#include "qemu/osdep.h"
+
+#ifdef NEED_CPU_H
+#include "cpu.h"
+#else
+#include "exec/poison.h"
+#endif
+
+#include "exec/exec-all.h"
+#include "exec/tb-hash.h"
+
+/* Might cause an exception, so have a longjmp destination ready */
+static inline TranslationBlock *
+tb_lookup__cpu_state(CPUState *cpu, target_ulong *pc, target_ulong *cs_base,
+                     uint32_t *flags)
+{
+    CPUArchState *env = (CPUArchState *)cpu->env_ptr;
+    TranslationBlock *tb;
+    uint32_t hash;
+
+    cpu_get_tb_cpu_state(env, pc, cs_base, flags);
+    hash = tb_jmp_cache_hash_func(*pc);
+    tb = atomic_rcu_read(&cpu->tb_jmp_cache[hash]);
+    if (likely(tb &&
+               tb->pc == *pc &&
+               tb->cs_base == *cs_base &&
+               tb->flags == *flags &&
+               tb->trace_vcpu_dstate == *cpu->trace_dstate &&
+               !atomic_read(&tb->invalid))) {
+        return tb;
+    }
+    tb = tb_htable_lookup(cpu, *pc, *cs_base, *flags);
+    if (tb == NULL) {
+        return NULL;
+    }
+    atomic_set(&cpu->tb_jmp_cache[hash], tb);
+    return tb;
+}
+
+#endif /* EXEC_TB_LOOKUP_H */