From e3c7bff633fc1210c6b19dd3ebcafb9f6716d586 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <[email protected]>
Date: Mon, 24 Jul 2017 21:41:38 -0700
Subject: [PATCH 042/242] x86/mm: Implement PCID based optimization: try to
 preserve old TLB entries using PCID
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CVE-2017-5754

PCID is a "process context ID" -- it's what other architectures call
an address space ID. Every non-global TLB entry is tagged with a
PCID, only TLB entries that match the currently selected PCID are
used, and we can switch PGDs without flushing the TLB. x86's
PCID is 12 bits.

This is an unorthodox approach to using PCID. x86's PCID is far too
short to uniquely identify a process, and we can't even really
uniquely identify a running process because there are monster
systems with over 4096 CPUs. To make matters worse, past attempts
to use all 12 PCID bits have resulted in slowdowns instead of
speedups.

This patch uses PCID differently. We use a PCID to identify a
recently-used mm on a per-cpu basis. An mm has no fixed PCID
binding at all; instead, we give it a fresh PCID each time it's
loaded except in cases where we want to preserve the TLB, in which
case we reuse a recent value.
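
For illustration only (this is not part of the patch): the fragment
below is a rough, simplified model of the scheme just described. The
names NR_ASIDS, pick_asid() and make_cr3() are invented for this
sketch, and it leaves out the tlb_gen staleness check that the real
choose_new_asid() in the diff below also performs; it only shows the
per-cpu slot reuse and how CR3 is built from the page-table root, the
ASID and the no-flush bit.

#include <stdbool.h>
#include <stdint.h>

#define NR_ASIDS    6            /* small per-cpu pool, like TLB_NR_DYN_ASIDS */
#define CR3_NOFLUSH (1ULL << 63) /* "keep this PCID's TLB entries" bit in CR3 */

static uint64_t asid_ctx_id[NR_ASIDS]; /* which mm ctx_id each slot last held;
					* ctx_ids start at 1, so 0 = free slot */
static uint16_t next_slot;             /* round-robin eviction cursor */

/* Pick an ASID for @ctx_id; *need_flush says whether the slot is stale. */
static uint16_t pick_asid(uint64_t ctx_id, bool *need_flush)
{
	for (uint16_t asid = 0; asid < NR_ASIDS; asid++) {
		if (asid_ctx_id[asid] == ctx_id) {
			*need_flush = false;   /* reuse: old TLB entries survive */
			return asid;
		}
	}

	uint16_t asid = next_slot++ % NR_ASIDS; /* evict some recent slot */
	asid_ctx_id[asid] = ctx_id;
	*need_flush = true;                     /* fresh binding: must flush it */
	return asid;
}

/* CR3 = 4k-aligned page-table root | ASID, plus bit 63 to skip the flush. */
static uint64_t make_cr3(uint64_t pgd_pa, uint16_t asid, bool need_flush)
{
	return pgd_pa | asid | (need_flush ? 0 : CR3_NOFLUSH);
}
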
Here are some benchmark results, done on a Skylake laptop at 2.3 GHz
(turbo off, intel_pstate requesting max performance) under KVM with
the guest using idle=poll (to avoid artifacts when bouncing between
CPUs). I haven't done any real statistics here -- I just ran them
in a loop and picked the fastest results that didn't look like
outliers. Unpatched means commit a4eb8b993554, so all the
bookkeeping overhead is gone.

ping-pong between two mms on the same CPU using eventfd:

  patched:         1.22µs
  patched, nopcid: 1.33µs
  unpatched:       1.34µs

Same ping-pong, but now touch 512 pages (all zero-page to minimize
cache misses) each iteration. dTLB misses are measured by
dtlb_load_misses.miss_causes_a_walk:

  patched:         1.8µs  11M  dTLB misses
  patched, nopcid: 6.2µs, 207M dTLB misses
  unpatched:       6.1µs, 190M dTLB misses
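
For reference only -- not the actual harness behind the numbers above --
a minimal eventfd ping-pong between two processes (and hence two mms)
looks roughly like this. Pinning both tasks to one CPU (e.g. with
taskset) and the 512-page touch variant are omitted.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/eventfd.h>
#include <sys/wait.h>
#include <time.h>
#include <unistd.h>

#define ITERS 1000000

int main(void)
{
	int ping = eventfd(0, 0);     /* parent -> child */
	int pong = eventfd(0, 0);     /* child -> parent */
	uint64_t one = 1, val;
	struct timespec t0, t1;

	if (ping < 0 || pong < 0) {
		perror("eventfd");
		return 1;
	}

	if (fork() == 0) {            /* child: echo every ping back */
		for (int i = 0; i < ITERS; i++) {
			read(ping, &val, sizeof(val));
			write(pong, &one, sizeof(one));
		}
		_exit(0);
	}

	clock_gettime(CLOCK_MONOTONIC, &t0);
	for (int i = 0; i < ITERS; i++) {   /* parent: ping, wait for pong */
		write(ping, &one, sizeof(one));
		read(pong, &val, sizeof(val));
	}
	clock_gettime(CLOCK_MONOTONIC, &t1);
	wait(NULL);

	double ns = (t1.tv_sec - t0.tv_sec) * 1e9 + (t1.tv_nsec - t0.tv_nsec);
	printf("%.2f us per round trip\n", ns / ITERS / 1000.0);
	return 0;
}
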
Signed-off-by: Andy Lutomirski <[email protected]>
Reviewed-by: Nadav Amit <[email protected]>
Cc: Andrew Morton <[email protected]>
Cc: Arjan van de Ven <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Mel Gorman <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Rik van Riel <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: [email protected]
Link: http://lkml.kernel.org/r/9ee75f17a81770feed616358e6860d98a2a5b1e7.1500957502.git.luto@kernel.org
Signed-off-by: Ingo Molnar <[email protected]>
(backported from commit 10af6235e0d327d42e1bad974385197817923dc1)
Signed-off-by: Andy Whitcroft <[email protected]>
Signed-off-by: Kleber Sacilotto de Souza <[email protected]>
(cherry picked from commit d833a976288cdcf7fb1dabb48ebf614ebf6a311c)
Signed-off-by: Fabian Grünbichler <[email protected]>
---
 arch/x86/include/asm/mmu_context.h     |  3 ++
 arch/x86/include/asm/processor-flags.h |  2 +
 arch/x86/include/asm/tlbflush.h        | 18 +++++++-
 arch/x86/mm/init.c                     |  1 +
 arch/x86/mm/tlb.c                      | 84 +++++++++++++++++++++++++---------
 5 files changed, 85 insertions(+), 23 deletions(-)

diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index d6b055b328f2..7ae318c340d9 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -298,6 +298,9 @@ static inline unsigned long __get_current_cr3_fast(void)
 {
 	unsigned long cr3 = __pa(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd);
 
+	if (static_cpu_has(X86_FEATURE_PCID))
+		cr3 |= this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+
 	/* For now, be very restrictive about when this can be called. */
 	VM_WARN_ON(in_nmi() || preemptible());
 
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index 79aa2f98398d..791b60199aa4 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -35,6 +35,7 @@
 /* Mask off the address space ID bits. */
 #define CR3_ADDR_MASK 0x7FFFFFFFFFFFF000ull
 #define CR3_PCID_MASK 0xFFFull
+#define CR3_NOFLUSH (1UL << 63)
 #else
 /*
  * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save
@@ -42,6 +43,7 @@
  */
 #define CR3_ADDR_MASK 0xFFFFFFFFull
 #define CR3_PCID_MASK 0ull
+#define CR3_NOFLUSH 0
 #endif
 
 #endif /* _ASM_X86_PROCESSOR_FLAGS_H */
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 6397275008db..d23e61dc0640 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -82,6 +82,12 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
 #define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
 #endif
 
+/*
+ * 6 because 6 should be plenty and struct tlb_state will fit in
+ * two cache lines.
+ */
+#define TLB_NR_DYN_ASIDS 6
+
 struct tlb_context {
 	u64 ctx_id;
 	u64 tlb_gen;
@@ -95,6 +101,8 @@ struct tlb_state {
 	 * mode even if we've already switched back to swapper_pg_dir.
 	 */
 	struct mm_struct *loaded_mm;
+	u16 loaded_mm_asid;
+	u16 next_asid;
 
 	/*
 	 * Access to this CR4 shadow and to H/W CR4 is protected by
@@ -104,7 +112,8 @@ struct tlb_state {
 
 	/*
 	 * This is a list of all contexts that might exist in the TLB.
-	 * Since we don't yet use PCID, there is only one context.
+	 * There is one per ASID that we use, and the ASID (what the
+	 * CPU calls PCID) is the index into ctxts.
 	 *
 	 * For each context, ctx_id indicates which mm the TLB's user
 	 * entries came from. As an invariant, the TLB will never
@@ -114,8 +123,13 @@ struct tlb_state {
 	 * To be clear, this means that it's legal for the TLB code to
 	 * flush the TLB without updating tlb_gen. This can happen
 	 * (for now, at least) due to paravirt remote flushes.
+	 *
+	 * NB: context 0 is a bit special, since it's also used by
+	 * various bits of init code. This is fine -- code that
+	 * isn't aware of PCID will end up harmlessly flushing
+	 * context 0.
 	 */
-	struct tlb_context ctxs[1];
+	struct tlb_context ctxs[TLB_NR_DYN_ASIDS];
 };
 DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
 
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index c86dc071bb10..af5c1ed21d43 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -849,6 +849,7 @@ void __init zone_sizes_init(void)
 
 DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
 	.loaded_mm = &init_mm,
+	.next_asid = 1,
 	.cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */
 };
 EXPORT_SYMBOL_GPL(cpu_tlbstate);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 0982c997d36f..57943b4d8f2e 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -30,6 +30,40 @@
 
 atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
 
+static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen,
+			    u16 *new_asid, bool *need_flush)
+{
+	u16 asid;
+
+	if (!static_cpu_has(X86_FEATURE_PCID)) {
+		*new_asid = 0;
+		*need_flush = true;
+		return;
+	}
+
+	for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
+		if (this_cpu_read(cpu_tlbstate.ctxs[asid].ctx_id) !=
+		    next->context.ctx_id)
+			continue;
+
+		*new_asid = asid;
+		*need_flush = (this_cpu_read(cpu_tlbstate.ctxs[asid].tlb_gen) <
+			       next_tlb_gen);
+		return;
+	}
+
+	/*
+	 * We don't currently own an ASID slot on this CPU.
+	 * Allocate a slot.
+	 */
+	*new_asid = this_cpu_add_return(cpu_tlbstate.next_asid, 1) - 1;
+	if (*new_asid >= TLB_NR_DYN_ASIDS) {
+		*new_asid = 0;
+		this_cpu_write(cpu_tlbstate.next_asid, 1);
+	}
+	*need_flush = true;
+}
+
 void leave_mm(int cpu)
 {
 	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
@@ -66,6 +100,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 			struct task_struct *tsk)
 {
 	struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
+	u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
 	unsigned cpu = smp_processor_id();
 	u64 next_tlb_gen;
 
@@ -85,12 +120,13 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 	/*
 	 * Verify that CR3 is what we think it is. This will catch
 	 * hypothetical buggy code that directly switches to swapper_pg_dir
-	 * without going through leave_mm() / switch_mm_irqs_off().
+	 * without going through leave_mm() / switch_mm_irqs_off() or that
+	 * does something like write_cr3(read_cr3_pa()).
 	 */
-	VM_BUG_ON(read_cr3_pa() != __pa(real_prev->pgd));
+	VM_BUG_ON(__read_cr3() != (__sme_pa(real_prev->pgd) | prev_asid));
 
 	if (real_prev == next) {
-		VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) !=
+		VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
 			  next->context.ctx_id);
 
 		if (cpumask_test_cpu(cpu, mm_cpumask(next))) {
@@ -107,16 +143,17 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		cpumask_set_cpu(cpu, mm_cpumask(next));
 		next_tlb_gen = atomic64_read(&next->context.tlb_gen);
 
-		if (this_cpu_read(cpu_tlbstate.ctxs[0].tlb_gen) < next_tlb_gen) {
+		if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) <
+		    next_tlb_gen) {
 			/*
 			 * Ideally, we'd have a flush_tlb() variant that
 			 * takes the known CR3 value as input. This would
 			 * be faster on Xen PV and on hypothetical CPUs
 			 * on which INVPCID is fast.
 			 */
-			this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen,
+			this_cpu_write(cpu_tlbstate.ctxs[prev_asid].tlb_gen,
 				       next_tlb_gen);
-			write_cr3(__pa(next->pgd));
+			write_cr3(__pa(next->pgd) | prev_asid);
 
 			/*
 			 * This gets called via leave_mm() in the idle path
@@ -134,8 +171,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		 * are not reflected in tlb_gen.)
 		 */
 	} else {
-		VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) ==
-			  next->context.ctx_id);
+		u16 new_asid;
+		bool need_flush;
 
 		if (IS_ENABLED(CONFIG_VMAP_STACK)) {
 			/*
@@ -162,18 +199,22 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		cpumask_set_cpu(cpu, mm_cpumask(next));
 		next_tlb_gen = atomic64_read(&next->context.tlb_gen);
 
-		this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, next->context.ctx_id);
-		this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, next_tlb_gen);
-		this_cpu_write(cpu_tlbstate.loaded_mm, next);
-		write_cr3(__pa(next->pgd));
+		choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
 
-		/*
-		 * This gets called via leave_mm() in the idle path where RCU
-		 * functions differently. Tracing normally uses RCU, so we
-		 * have to call the tracepoint specially here.
-		 */
-		trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH,
+		if (need_flush) {
+			this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
+			this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
+			write_cr3(__pa(next->pgd) | new_asid);
+			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH,
 					TLB_FLUSH_ALL);
+		} else {
+			/* The new ASID is already up to date. */
+			write_cr3(__sme_pa(next->pgd) | new_asid | CR3_NOFLUSH);
+			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 0);
+		}
+
+		this_cpu_write(cpu_tlbstate.loaded_mm, next);
+		this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
 	}
 
 	load_mm_cr4(next);
@@ -200,13 +241,14 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
 	 * wants us to catch up to.
 	 */
 	struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
+	u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
 	u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen);
-	u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[0].tlb_gen);
+	u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
 
 	/* This code cannot presently handle being reentered. */
 	VM_WARN_ON(!irqs_disabled());
 
-	VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) !=
+	VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
 		   loaded_mm->context.ctx_id);
 
 	if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))) {
@@ -294,7 +336,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
 	}
 
 	/* Both paths above update our state to mm_tlb_gen. */
-	this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, mm_tlb_gen);
+	this_cpu_write(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen, mm_tlb_gen);
 }
 
 static void flush_tlb_func_local(void *info, enum tlb_flush_reason reason)
-- 
2.14.2