- From caa3549fe709971498eaf080c1710ef627a0df5a Mon Sep 17 00:00:00 2001
- From: Andy Lutomirski <[email protected]>
- Date: Thu, 29 Jun 2017 08:53:17 -0700
- Subject: [PATCH 041/242] x86/mm: Rework lazy TLB mode and TLB freshness
- tracking
- MIME-Version: 1.0
- Content-Type: text/plain; charset=UTF-8
- Content-Transfer-Encoding: 8bit
- CVE-2017-5754
- x86's lazy TLB mode used to be fairly weak -- it would switch to
- init_mm the first time it tried to flush a lazy TLB. This meant an
- unnecessary CR3 write and, if the flush was remote, an unnecessary
- IPI.
- Rewrite it entirely. When we enter lazy mode, we simply remove the
- CPU from mm_cpumask. This means that we need a way to figure out
- whether we've missed a flush when we switch back out of lazy mode.
- I use the tlb_gen machinery to track whether a context is up to
- date.
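
As a concrete illustration of the freshness check described above, here is a
minimal standalone C sketch of the (ctx_id, tlb_gen) bookkeeping. It is only a
toy model under simplified assumptions -- the helper names and structures here
are stand-ins, not the kernel's actual code, which lives in
switch_mm_irqs_off() and flush_tlb_func_common() in the diff below.

    /*
     * Toy model of the generation-counter scheme: an mm bumps tlb_gen
     * whenever a flush is needed; each CPU records the generation it has
     * flushed to, so a CPU waking from lazy mode can tell whether it
     * missed anything. Illustrative only.
     */
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    struct mm_ctx {
            uint64_t ctx_id;                /* unique per mm, never reused */
            atomic_uint_least64_t tlb_gen;  /* bumped when a flush is requested */
    };

    struct cpu_state {
            uint64_t ctx_id;   /* mm whose translations this CPU may cache */
            uint64_t tlb_gen;  /* generation this CPU has flushed up to */
    };

    /* An unmap path requests a flush by bumping the mm's generation. */
    static uint64_t bump_tlb_gen(struct mm_ctx *mm)
    {
            return atomic_fetch_add(&mm->tlb_gen, 1) + 1;
    }

    /* On leaving lazy mode: flush only if this CPU missed a generation. */
    static bool needs_flush(const struct cpu_state *cpu, struct mm_ctx *mm)
    {
            if (cpu->ctx_id != mm->ctx_id)
                    return true;    /* different context entirely */
            return cpu->tlb_gen < atomic_load(&mm->tlb_gen);
    }

    int main(void)
    {
            struct mm_ctx mm = { .ctx_id = 1 };
            struct cpu_state cpu = { .ctx_id = 1, .tlb_gen = 1 };

            atomic_init(&mm.tlb_gen, 1);
            printf("fresh: needs_flush=%d\n", needs_flush(&cpu, &mm));
            bump_tlb_gen(&mm);      /* someone unmapped while we were lazy */
            printf("stale: needs_flush=%d\n", needs_flush(&cpu, &mm));
            return 0;
    }
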
- Note to reviewers: this patch, by itself, looks a bit odd. I'm
- using an array of length 1 containing (ctx_id, tlb_gen) rather than
- just storing tlb_gen, and making it an array isn't necessary yet.
- I'm doing this because the next few patches add PCID support, and,
- with PCID, we need ctx_id, and the array will end up with a length
- greater than 1. Making it an array now means that there will be
- less churn and therefore less stress on your eyeballs.
- NB: This is dubious but, AFAICT, still correct on Xen and UV.
- xen_exit_mmap() uses mm_cpumask() for nefarious purposes and this
- patch changes the way that mm_cpumask() works. This should be okay,
- since Xen *also* iterates all online CPUs to find all the CPUs it
- needs to twiddle.
- The UV tlbflush code is rather dated and should be changed.
- Here are some benchmark results, done on a Skylake laptop at 2.3 GHz
- (turbo off, intel_pstate requesting max performance) under KVM with
- the guest using idle=poll (to avoid artifacts when bouncing between
- CPUs). I haven't done any real statistics here -- I just ran them
- in a loop and picked the fastest results that didn't look like
- outliers. Unpatched means commit a4eb8b993554, so all the
- bookkeeping overhead is gone.
- MADV_DONTNEED; touch the page; switch CPUs using sched_setaffinity. In
- an unpatched kernel, MADV_DONTNEED will send an IPI to the previous CPU.
- This is intended to be a nearly worst-case test.
- patched: 13.4µs
- unpatched: 21.6µs
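
For reference, the following is a rough userspace reconstruction of the
MADV_DONTNEED test above. It assumes CPUs 0 and 1 are online and makes no
attempt to reproduce the original timing methodology, so treat it as a sketch
of the workload rather than the harness behind the numbers above.

    /* Zap a page, fault it back in, bounce between two CPUs -- roughly the
     * near-worst-case pattern described above. Linux-specific. */
    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>
    #include <sys/mman.h>
    #include <time.h>
    #include <unistd.h>

    static void pin_to_cpu(int cpu)
    {
            cpu_set_t set;

            CPU_ZERO(&set);
            CPU_SET(cpu, &set);
            if (sched_setaffinity(0, sizeof(set), &set))
                    perror("sched_setaffinity");
    }

    int main(void)
    {
            const int iters = 100000;
            long page = sysconf(_SC_PAGESIZE);
            char *p = mmap(NULL, page, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
            struct timespec t0, t1;

            if (p == MAP_FAILED) {
                    perror("mmap");
                    return 1;
            }

            clock_gettime(CLOCK_MONOTONIC, &t0);
            for (int i = 0; i < iters; i++) {
                    madvise(p, page, MADV_DONTNEED); /* zap the page */
                    p[0] = 1;                        /* touch: fault it back in */
                    pin_to_cpu(i & 1);               /* switch CPUs */
            }
            clock_gettime(CLOCK_MONOTONIC, &t1);

            double ns = (t1.tv_sec - t0.tv_sec) * 1e9 +
                        (t1.tv_nsec - t0.tv_nsec);
            printf("%.1f us per iteration\n", ns / iters / 1e3);
            return 0;
    }
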
- Vitaly's pthread_mmap microbenchmark with 8 threads (on four cores),
- nrounds = 100, 256M data
- patched: 1.1 seconds or so
- unpatched: 1.9 seconds or so
- The speedup on Vitaly's test appears to be because it spends a lot
- of time blocked on mmap_sem, and this patch avoids sending IPIs to
- blocked CPUs.
- Signed-off-by: Andy Lutomirski <[email protected]>
- Reviewed-by: Nadav Amit <[email protected]>
- Reviewed-by: Thomas Gleixner <[email protected]>
- Cc: Andrew Banman <[email protected]>
- Cc: Andrew Morton <[email protected]>
- Cc: Arjan van de Ven <[email protected]>
- Cc: Boris Ostrovsky <[email protected]>
- Cc: Borislav Petkov <[email protected]>
- Cc: Dave Hansen <[email protected]>
- Cc: Dimitri Sivanich <[email protected]>
- Cc: Juergen Gross <[email protected]>
- Cc: Linus Torvalds <[email protected]>
- Cc: Mel Gorman <[email protected]>
- Cc: Mike Travis <[email protected]>
- Cc: Peter Zijlstra <[email protected]>
- Cc: Rik van Riel <[email protected]>
- Cc: [email protected]
- Link: http://lkml.kernel.org/r/ddf2c92962339f4ba39d8fc41b853936ec0b44f1.1498751203.git.luto@kernel.org
- Signed-off-by: Ingo Molnar <[email protected]>
- (cherry picked from commit 94b1b03b519b81c494900cb112aa00ed205cc2d9)
- Signed-off-by: Andy Whitcroft <[email protected]>
- Signed-off-by: Kleber Sacilotto de Souza <[email protected]>
- (cherry picked from commit b381b7ae452f2bc6384507a897247be7c93a71cc)
- Signed-off-by: Fabian Grünbichler <[email protected]>
- ---
- arch/x86/include/asm/mmu_context.h | 6 +-
- arch/x86/include/asm/tlbflush.h | 4 -
- arch/x86/mm/init.c | 1 -
- arch/x86/mm/tlb.c | 197 ++++++++++++++++++++++---------------
- arch/x86/xen/mmu_pv.c | 5 +-
- 5 files changed, 124 insertions(+), 89 deletions(-)
- diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
- index 6c05679c715b..d6b055b328f2 100644
- --- a/arch/x86/include/asm/mmu_context.h
- +++ b/arch/x86/include/asm/mmu_context.h
- @@ -128,8 +128,10 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
-
- static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
- {
- - if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
- - this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
- + int cpu = smp_processor_id();
- +
- + if (cpumask_test_cpu(cpu, mm_cpumask(mm)))
- + cpumask_clear_cpu(cpu, mm_cpumask(mm));
- }
-
- static inline int init_new_context(struct task_struct *tsk,
- diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
- index 3a167c214560..6397275008db 100644
- --- a/arch/x86/include/asm/tlbflush.h
- +++ b/arch/x86/include/asm/tlbflush.h
- @@ -95,7 +95,6 @@ struct tlb_state {
- * mode even if we've already switched back to swapper_pg_dir.
- */
- struct mm_struct *loaded_mm;
- - int state;
-
- /*
- * Access to this CR4 shadow and to H/W CR4 is protected by
- @@ -318,9 +317,6 @@ static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a)
- void native_flush_tlb_others(const struct cpumask *cpumask,
- const struct flush_tlb_info *info);
-
- -#define TLBSTATE_OK 1
- -#define TLBSTATE_LAZY 2
- -
- static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
- struct mm_struct *mm)
- {
- diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
- index df2624b091a7..c86dc071bb10 100644
- --- a/arch/x86/mm/init.c
- +++ b/arch/x86/mm/init.c
- @@ -849,7 +849,6 @@ void __init zone_sizes_init(void)
-
- DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
- .loaded_mm = &init_mm,
- - .state = 0,
- .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */
- };
- EXPORT_SYMBOL_GPL(cpu_tlbstate);
- diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
- index 4e5a5ddb9e4d..0982c997d36f 100644
- --- a/arch/x86/mm/tlb.c
- +++ b/arch/x86/mm/tlb.c
- @@ -45,8 +45,8 @@ void leave_mm(int cpu)
- if (loaded_mm == &init_mm)
- return;
-
- - if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
- - BUG();
- + /* Warn if we're not lazy. */
- + WARN_ON(cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm)));
-
- switch_mm(NULL, &init_mm, NULL);
- }
- @@ -65,94 +65,117 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next,
- void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
- struct task_struct *tsk)
- {
- - unsigned cpu = smp_processor_id();
- struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
- + unsigned cpu = smp_processor_id();
- + u64 next_tlb_gen;
-
- /*
- - * NB: The scheduler will call us with prev == next when
- - * switching from lazy TLB mode to normal mode if active_mm
- - * isn't changing. When this happens, there is no guarantee
- - * that CR3 (and hence cpu_tlbstate.loaded_mm) matches next.
- + * NB: The scheduler will call us with prev == next when switching
- + * from lazy TLB mode to normal mode if active_mm isn't changing.
- + * When this happens, we don't assume that CR3 (and hence
- + * cpu_tlbstate.loaded_mm) matches next.
- *
- * NB: leave_mm() calls us with prev == NULL and tsk == NULL.
- */
-
- - this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
- + /* We don't want flush_tlb_func_* to run concurrently with us. */
- + if (IS_ENABLED(CONFIG_PROVE_LOCKING))
- + WARN_ON_ONCE(!irqs_disabled());
- +
- + /*
- + * Verify that CR3 is what we think it is. This will catch
- + * hypothetical buggy code that directly switches to swapper_pg_dir
- + * without going through leave_mm() / switch_mm_irqs_off().
- + */
- + VM_BUG_ON(read_cr3_pa() != __pa(real_prev->pgd));
-
- if (real_prev == next) {
- - /*
- - * There's nothing to do: we always keep the per-mm control
- - * regs in sync with cpu_tlbstate.loaded_mm. Just
- - * sanity-check mm_cpumask.
- - */
- - if (WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(next))))
- - cpumask_set_cpu(cpu, mm_cpumask(next));
- - return;
- - }
- + VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) !=
- + next->context.ctx_id);
- +
- + if (cpumask_test_cpu(cpu, mm_cpumask(next))) {
- + /*
- + * There's nothing to do: we weren't lazy, and we
- + * aren't changing our mm. We don't need to flush
- + * anything, nor do we need to update CR3, CR4, or
- + * LDTR.
- + */
- + return;
- + }
- +
- + /* Resume remote flushes and then read tlb_gen. */
- + cpumask_set_cpu(cpu, mm_cpumask(next));
- + next_tlb_gen = atomic64_read(&next->context.tlb_gen);
- +
- + if (this_cpu_read(cpu_tlbstate.ctxs[0].tlb_gen) < next_tlb_gen) {
- + /*
- + * Ideally, we'd have a flush_tlb() variant that
- + * takes the known CR3 value as input. This would
- + * be faster on Xen PV and on hypothetical CPUs
- + * on which INVPCID is fast.
- + */
- + this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen,
- + next_tlb_gen);
- + write_cr3(__pa(next->pgd));
- +
- + /*
- + * This gets called via leave_mm() in the idle path
- + * where RCU functions differently. Tracing normally
- + * uses RCU, so we have to call the tracepoint
- + * specially here.
- + */
- + trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH,
- + TLB_FLUSH_ALL);
- + }
-
- - if (IS_ENABLED(CONFIG_VMAP_STACK)) {
- /*
- - * If our current stack is in vmalloc space and isn't
- - * mapped in the new pgd, we'll double-fault. Forcibly
- - * map it.
- + * We just exited lazy mode, which means that CR4 and/or LDTR
- + * may be stale. (Changes to the required CR4 and LDTR states
- + * are not reflected in tlb_gen.)
- */
- - unsigned int stack_pgd_index = pgd_index(current_stack_pointer());
- -
- - pgd_t *pgd = next->pgd + stack_pgd_index;
- -
- - if (unlikely(pgd_none(*pgd)))
- - set_pgd(pgd, init_mm.pgd[stack_pgd_index]);
- - }
- + } else {
- + VM_BUG_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) ==
- + next->context.ctx_id);
- +
- + if (IS_ENABLED(CONFIG_VMAP_STACK)) {
- + /*
- + * If our current stack is in vmalloc space and isn't
- + * mapped in the new pgd, we'll double-fault. Forcibly
- + * map it.
- + */
- + unsigned int index = pgd_index(current_stack_pointer());
- + pgd_t *pgd = next->pgd + index;
- +
- + if (unlikely(pgd_none(*pgd)))
- + set_pgd(pgd, init_mm.pgd[index]);
- + }
-
- - this_cpu_write(cpu_tlbstate.loaded_mm, next);
- - this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, next->context.ctx_id);
- - this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, atomic64_read(&next->context.tlb_gen));
- + /* Stop remote flushes for the previous mm */
- + if (cpumask_test_cpu(cpu, mm_cpumask(real_prev)))
- + cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
-
- - WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));
- - cpumask_set_cpu(cpu, mm_cpumask(next));
- + VM_WARN_ON_ONCE(cpumask_test_cpu(cpu, mm_cpumask(next)));
-
- - /*
- - * Re-load page tables.
- - *
- - * This logic has an ordering constraint:
- - *
- - * CPU 0: Write to a PTE for 'next'
- - * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI.
- - * CPU 1: set bit 1 in next's mm_cpumask
- - * CPU 1: load from the PTE that CPU 0 writes (implicit)
- - *
- - * We need to prevent an outcome in which CPU 1 observes
- - * the new PTE value and CPU 0 observes bit 1 clear in
- - * mm_cpumask. (If that occurs, then the IPI will never
- - * be sent, and CPU 0's TLB will contain a stale entry.)
- - *
- - * The bad outcome can occur if either CPU's load is
- - * reordered before that CPU's store, so both CPUs must
- - * execute full barriers to prevent this from happening.
- - *
- - * Thus, switch_mm needs a full barrier between the
- - * store to mm_cpumask and any operation that could load
- - * from next->pgd. TLB fills are special and can happen
- - * due to instruction fetches or for no reason at all,
- - * and neither LOCK nor MFENCE orders them.
- - * Fortunately, load_cr3() is serializing and gives the
- - * ordering guarantee we need.
- - */
- - load_cr3(next->pgd);
- + /*
- + * Start remote flushes and then read tlb_gen.
- + */
- + cpumask_set_cpu(cpu, mm_cpumask(next));
- + next_tlb_gen = atomic64_read(&next->context.tlb_gen);
-
- - /*
- - * This gets called via leave_mm() in the idle path where RCU
- - * functions differently. Tracing normally uses RCU, so we have to
- - * call the tracepoint specially here.
- - */
- - trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
- + this_cpu_write(cpu_tlbstate.ctxs[0].ctx_id, next->context.ctx_id);
- + this_cpu_write(cpu_tlbstate.ctxs[0].tlb_gen, next_tlb_gen);
- + this_cpu_write(cpu_tlbstate.loaded_mm, next);
- + write_cr3(__pa(next->pgd));
-
- - /* Stop flush ipis for the previous mm */
- - WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) &&
- - real_prev != &init_mm);
- - cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
- + /*
- + * This gets called via leave_mm() in the idle path where RCU
- + * functions differently. Tracing normally uses RCU, so we
- + * have to call the tracepoint specially here.
- + */
- + trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH,
- + TLB_FLUSH_ALL);
- + }
-
- - /* Load per-mm CR4 and LDTR state */
- load_mm_cr4(next);
- switch_ldt(real_prev, next);
- }
- @@ -186,13 +209,13 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
- VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[0].ctx_id) !=
- loaded_mm->context.ctx_id);
-
- - if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) {
- + if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(loaded_mm))) {
- /*
- - * leave_mm() is adequate to handle any type of flush, and
- - * we would prefer not to receive further IPIs. leave_mm()
- - * clears this CPU's bit in mm_cpumask().
- + * We're in lazy mode -- don't flush. We can get here on
- + * remote flushes due to races and on local flushes if a
- + * kernel thread coincidentally flushes the mm it's lazily
- + * still using.
- */
- - leave_mm(smp_processor_id());
- return;
- }
-
- @@ -203,6 +226,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
- * be handled can catch us all the way up, leaving no work for
- * the second flush.
- */
- + trace_tlb_flush(reason, 0);
- return;
- }
-
- @@ -304,6 +328,21 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
- (info->end - info->start) >> PAGE_SHIFT);
-
- if (is_uv_system()) {
- + /*
- + * This whole special case is confused. UV has a "Broadcast
- + * Assist Unit", which seems to be a fancy way to send IPIs.
- + * Back when x86 used an explicit TLB flush IPI, UV was
- + * optimized to use its own mechanism. These days, x86 uses
- + * smp_call_function_many(), but UV still uses a manual IPI,
- + * and that IPI's action is out of date -- it does a manual
- + * flush instead of calling flush_tlb_func_remote(). This
- + * means that the percpu tlb_gen variables won't be updated
- + * and we'll do pointless flushes on future context switches.
- + *
- + * Rather than hooking native_flush_tlb_others() here, I think
- + * that UV should be updated so that smp_call_function_many(),
- + * etc, are optimal on UV.
- + */
- unsigned int cpu;
-
- cpu = smp_processor_id();
- @@ -363,6 +402,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
-
- if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids)
- flush_tlb_others(mm_cpumask(mm), &info);
- +
- put_cpu();
- }
-
- @@ -371,8 +411,6 @@ static void do_flush_tlb_all(void *info)
- {
- count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
- __flush_tlb_all();
- - if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
- - leave_mm(smp_processor_id());
- }
-
- void flush_tlb_all(void)
- @@ -425,6 +463,7 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
-
- if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids)
- flush_tlb_others(&batch->cpumask, &info);
- +
- cpumask_clear(&batch->cpumask);
-
- put_cpu();
- diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
- index 5f61b7e2e6b2..ba76f3ce997f 100644
- --- a/arch/x86/xen/mmu_pv.c
- +++ b/arch/x86/xen/mmu_pv.c
- @@ -1005,14 +1005,12 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
- /* Get the "official" set of cpus referring to our pagetable. */
- if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) {
- for_each_online_cpu(cpu) {
- - if (!cpumask_test_cpu(cpu, mm_cpumask(mm))
- - && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
- + if (per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd))
- continue;
- smp_call_function_single(cpu, drop_mm_ref_this_cpu, mm, 1);
- }
- return;
- }
- - cpumask_copy(mask, mm_cpumask(mm));
-
- /*
- * It's possible that a vcpu may have a stale reference to our
- @@ -1021,6 +1019,7 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
- * look at its actual current cr3 value, and force it to flush
- * if needed.
- */
- + cpumask_clear(mask);
- for_each_online_cpu(cpu) {
- if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd))
- cpumask_set_cpu(cpu, mask);
- --
- 2.14.2