- From e0e5d2785d4b282a1f82f36199f52f9196868d6b Mon Sep 17 00:00:00 2001
- From: Andy Lutomirski <[email protected]>
- Date: Tue, 12 Dec 2017 07:56:45 -0800
- Subject: [PATCH 205/242] x86/pti: Put the LDT in its own PGD if PTI is on
- MIME-Version: 1.0
- Content-Type: text/plain; charset=UTF-8
- Content-Transfer-Encoding: 8bit
- CVE-2017-5754
- With PTI enabled, the LDT must be mapped in the usermode tables somewhere.
- The LDT is per process, i.e. per mm.
- An earlier approach mapped the LDT on context switch into a fixmap area,
- but that's a big overhead and exhausted the fixmap space when NR_CPUS got
- big.
- Take advantage of the fact that there is an address space hole which
- provides a completely unused pgd. Use this pgd to manage per-mm LDT
- mappings.
- This has a down side: the LDT isn't (currently) randomized, and an attack
- that can write the LDT is instant root due to call gates (thanks, AMD, for
- leaving call gates in AMD64 but designing them wrong so they're only useful
- for exploits). This can be mitigated by making the LDT read-only or
- randomizing the mapping, either of which is straightforward on top of this
- patch.
- This will significantly slow down LDT users, but that shouldn't matter for
- important workloads -- the LDT is only used by DOSEMU(2), Wine, and very
- old libc implementations.
- [ tglx: Cleaned it up. ]
- Signed-off-by: Andy Lutomirski <[email protected]>
- Signed-off-by: Thomas Gleixner <[email protected]>
- Cc: Borislav Petkov <[email protected]>
- Cc: Brian Gerst <[email protected]>
- Cc: Dave Hansen <[email protected]>
- Cc: Dave Hansen <[email protected]>
- Cc: David Laight <[email protected]>
- Cc: H. Peter Anvin <[email protected]>
- Cc: Josh Poimboeuf <[email protected]>
- Cc: Juergen Gross <[email protected]>
- Cc: Kees Cook <[email protected]>
- Cc: Kirill A. Shutemov <[email protected]>
- Cc: Linus Torvalds <[email protected]>
- Cc: Peter Zijlstra <[email protected]>
- Signed-off-by: Ingo Molnar <[email protected]>
- (cherry picked from commit f55f0501cbf65ec41cca5058513031b711730b1d)
- Signed-off-by: Andy Whitcroft <[email protected]>
- Signed-off-by: Kleber Sacilotto de Souza <[email protected]>
- (cherry picked from commit c250643846b45ea6782fb0cfcc15e8cd34744bc7)
- Signed-off-by: Fabian Grünbichler <[email protected]>
- ---
- Documentation/x86/x86_64/mm.txt | 3 +-
- arch/x86/include/asm/mmu_context.h | 59 ++++++++++++--
- arch/x86/include/asm/pgtable_64_types.h | 4 +
- arch/x86/include/asm/processor.h | 23 ++++--
- arch/x86/kernel/ldt.c | 139 +++++++++++++++++++++++++++++++-
- arch/x86/mm/dump_pagetables.c | 9 +++
- 6 files changed, 220 insertions(+), 17 deletions(-)
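As a quick sanity check on the sizing described above: each LDT slot spans LDT_ENTRIES * LDT_ENTRY_SIZE bytes, and even two slots are a rounding error next to the 512 GiB that a single 4-level PGD entry covers. A minimal userspace sketch of that arithmetic (the constants are assumed to mirror the kernel's definitions; this is an illustration, not kernel code):

```c
#include <stdio.h>

/* Assumed to mirror the kernel's values (asm/ldt.h, asm/desc_defs.h). */
#define LDT_ENTRIES     8192UL
#define LDT_ENTRY_SIZE  8UL
#define PAGE_SIZE       4096UL

int main(void)
{
	unsigned long slot_stride = LDT_ENTRIES * LDT_ENTRY_SIZE; /* bytes per slot */
	unsigned long pages       = slot_stride / PAGE_SIZE;      /* pages per slot */

	printf("LDT slot stride: %lu KiB (%lu pages)\n", slot_stride / 1024, pages);
	printf("Two slots:       %lu KiB\n", 2 * slot_stride / 1024);
	/* One 4-level PGD entry covers 1UL << 39 bytes = 512 GiB, so the hole is ample. */
	return 0;
}
```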
- diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
- index 496a1dbf139d..ad41b3813f0a 100644
- --- a/Documentation/x86/x86_64/mm.txt
- +++ b/Documentation/x86/x86_64/mm.txt
- @@ -12,6 +12,7 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
- ... unused hole ...
- ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
- ... unused hole ...
- +fffffe0000000000 - fffffe7fffffffff (=39 bits) LDT remap for PTI
- fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping
- ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
- ... unused hole ...
- @@ -29,7 +30,7 @@ Virtual memory map with 5 level page tables:
- hole caused by [56:63] sign extension
- ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor
- ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory
- -ff90000000000000 - ff9fffffffffffff (=52 bits) hole
- +ff90000000000000 - ff9fffffffffffff (=52 bits) LDT remap for PTI
- ffa0000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space (12800 TB)
- ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole
- ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)
- diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
- index 89a01ad7e370..9e3546e1c0f4 100644
- --- a/arch/x86/include/asm/mmu_context.h
- +++ b/arch/x86/include/asm/mmu_context.h
- @@ -49,10 +49,33 @@ struct ldt_struct {
- * call gates. On native, we could merge the ldt_struct and LDT
- * allocations, but it's not worth trying to optimize.
- */
- - struct desc_struct *entries;
- - unsigned int nr_entries;
- + struct desc_struct *entries;
- + unsigned int nr_entries;
- +
- + /*
- + * If PTI is in use, then the entries array is not mapped while we're
- + * in user mode. The whole array will be aliased at the address
- + * given by ldt_slot_va(slot). We use two slots so that we can allocate
- + * and map, and enable a new LDT without invalidating the mapping
- + * of an older, still-in-use LDT.
- + *
- + * slot will be -1 if this LDT doesn't have an alias mapping.
- + */
- + int slot;
- };
-
- +/* This is a multiple of PAGE_SIZE. */
- +#define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE)
- +
- +static inline void *ldt_slot_va(int slot)
- +{
- +#ifdef CONFIG_X86_64
- + return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot);
- +#else
- + BUG();
- +#endif
- +}
- +
- /*
- * Used for LDT copy/destruction.
- */
- @@ -63,6 +86,7 @@ static inline void init_new_context_ldt(struct mm_struct *mm)
- }
- int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm);
- void destroy_context_ldt(struct mm_struct *mm);
- +void ldt_arch_exit_mmap(struct mm_struct *mm);
- #else /* CONFIG_MODIFY_LDT_SYSCALL */
- static inline void init_new_context_ldt(struct mm_struct *mm) { }
- static inline int ldt_dup_context(struct mm_struct *oldmm,
- @@ -70,7 +94,8 @@ static inline int ldt_dup_context(struct mm_struct *oldmm,
- {
- return 0;
- }
- -static inline void destroy_context_ldt(struct mm_struct *mm) {}
- +static inline void destroy_context_ldt(struct mm_struct *mm) { }
- +static inline void ldt_arch_exit_mmap(struct mm_struct *mm) { }
- #endif
-
- static inline void load_mm_ldt(struct mm_struct *mm)
- @@ -95,10 +120,31 @@ static inline void load_mm_ldt(struct mm_struct *mm)
- * that we can see.
- */
-
- - if (unlikely(ldt))
- - set_ldt(ldt->entries, ldt->nr_entries);
- - else
- + if (unlikely(ldt)) {
- + if (static_cpu_has(X86_FEATURE_PTI)) {
- + if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) {
- + /*
- + * Whoops -- either the new LDT isn't mapped
- + * (if slot == -1) or is mapped into a bogus
- + * slot (if slot > 1).
- + */
- + clear_LDT();
- + return;
- + }
- +
- + /*
- + * If page table isolation is enabled, ldt->entries
- + * will not be mapped in the userspace pagetables.
- + * Tell the CPU to access the LDT through the alias
- + * at ldt_slot_va(ldt->slot).
- + */
- + set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries);
- + } else {
- + set_ldt(ldt->entries, ldt->nr_entries);
- + }
- + } else {
- clear_LDT();
- + }
- #else
- clear_LDT();
- #endif
- @@ -193,6 +239,7 @@ static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
- static inline void arch_exit_mmap(struct mm_struct *mm)
- {
- paravirt_arch_exit_mmap(mm);
- + ldt_arch_exit_mmap(mm);
- }
-
- #ifdef CONFIG_X86_64
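The two-slot scheme above is easier to see in isolation. The sketch below is a hedged userspace model of the slot choice (an existing LDT means the replacement goes into the other slot, otherwise slot 0) and of the ldt_slot_va() address computation; the constants assume the 4-level layout from this patch, and none of this is the kernel code itself:

```c
#include <stdio.h>

#define LDT_BASE_ADDR    0xfffffe0000000000UL   /* 4-level layout, per this patch */
#define LDT_SLOT_STRIDE  (8192UL * 8UL)         /* LDT_ENTRIES * LDT_ENTRY_SIZE */

struct ldt_model { int slot; };

/* Mirrors ldt_slot_va(): the usermode-visible alias for a given slot. */
static unsigned long slot_va(int slot)
{
	return LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot;
}

/* Mirrors the slot choice in write_ldt(): take the slot the old LDT isn't using. */
static int next_slot(const struct ldt_model *old)
{
	return old ? !old->slot : 0;
}

int main(void)
{
	struct ldt_model a = { .slot = next_slot(NULL) }; /* first LDT   -> slot 0 */
	struct ldt_model b = { .slot = next_slot(&a) };   /* replacement -> slot 1 */
	struct ldt_model c = { .slot = next_slot(&b) };   /* next one    -> slot 0 */

	printf("slots: %d %d %d\n", a.slot, b.slot, c.slot);
	printf("alias of slot 1: %#lx\n", slot_va(1));
	return 0;
}
```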
- diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
- index 5932dead34ee..e8a809ee0bb6 100644
- --- a/arch/x86/include/asm/pgtable_64_types.h
- +++ b/arch/x86/include/asm/pgtable_64_types.h
- @@ -81,10 +81,14 @@ typedef struct { pteval_t pte; } pte_t;
- # define VMALLOC_SIZE_TB _AC(12800, UL)
- # define __VMALLOC_BASE _AC(0xffa0000000000000, UL)
- # define __VMEMMAP_BASE _AC(0xffd4000000000000, UL)
- +# define LDT_PGD_ENTRY _AC(-112, UL)
- +# define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
- #else
- # define VMALLOC_SIZE_TB _AC(32, UL)
- # define __VMALLOC_BASE _AC(0xffffc90000000000, UL)
- # define __VMEMMAP_BASE _AC(0xffffea0000000000, UL)
- +# define LDT_PGD_ENTRY _AC(-4, UL)
- +# define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT)
- #endif
-
- #ifdef CONFIG_RANDOMIZE_MEMORY
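The new constants follow directly from the PGD-entry arithmetic: LDT_BASE_ADDR is LDT_PGD_ENTRY shifted left by PGDIR_SHIFT, which sign-extends into the kernel half of the address space. A quick check (PGDIR_SHIFT values of 39 for 4-level and 48 for 5-level paging are assumed here) reproduces the addresses added to mm.txt earlier in this patch:

```c
#include <stdio.h>

int main(void)
{
	/* 4-level paging: PGDIR_SHIFT = 39, LDT_PGD_ENTRY = -4 (as an unsigned wrap) */
	unsigned long base4 = -4UL << 39;
	/* 5-level paging: PGDIR_SHIFT = 48, LDT_PGD_ENTRY = -112 */
	unsigned long base5 = -112UL << 48;

	printf("4-level LDT base: %#lx\n", base4); /* expect 0xfffffe0000000000 */
	printf("5-level LDT base: %#lx\n", base5); /* expect 0xff90000000000000 */
	return 0;
}
```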
- diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
- index 935d68609922..24503521c947 100644
- --- a/arch/x86/include/asm/processor.h
- +++ b/arch/x86/include/asm/processor.h
- @@ -843,13 +843,22 @@ static inline void spin_lock_prefetch(const void *x)
-
- #else
- /*
- - * User space process size. 47bits minus one guard page. The guard
- - * page is necessary on Intel CPUs: if a SYSCALL instruction is at
- - * the highest possible canonical userspace address, then that
- - * syscall will enter the kernel with a non-canonical return
- - * address, and SYSRET will explode dangerously. We avoid this
- - * particular problem by preventing anything from being mapped
- - * at the maximum canonical address.
- + * User space process size. This is the first address outside the user range.
- + * There are a few constraints that determine this:
- + *
- + * On Intel CPUs, if a SYSCALL instruction is at the highest canonical
- + * address, then that syscall will enter the kernel with a
- + * non-canonical return address, and SYSRET will explode dangerously.
- + * We avoid this particular problem by preventing anything executable
- + * from being mapped at the maximum canonical address.
- + *
- + * On AMD CPUs in the Ryzen family, there's a nasty bug in which the
- + * CPUs malfunction if they execute code from the highest canonical page.
- + * They'll speculate right off the end of the canonical space, and
- + * bad things happen. This is worked around in the same way as the
- + * Intel problem.
- + *
- + * With page table isolation enabled, we map the LDT in ... [stay tuned]
- */
- #define TASK_SIZE_MAX ((1UL << 47) - PAGE_SIZE)
-
- diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
- index 74a5aaf13f3c..eceaada581ff 100644
- --- a/arch/x86/kernel/ldt.c
- +++ b/arch/x86/kernel/ldt.c
- @@ -23,6 +23,7 @@
- #include <linux/uaccess.h>
-
- #include <asm/ldt.h>
- +#include <asm/tlb.h>
- #include <asm/desc.h>
- #include <asm/mmu_context.h>
- #include <asm/syscalls.h>
- @@ -50,13 +51,11 @@ static void refresh_ldt_segments(void)
- static void flush_ldt(void *__mm)
- {
- struct mm_struct *mm = __mm;
- - mm_context_t *pc;
-
- if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm)
- return;
-
- - pc = &mm->context;
- - set_ldt(pc->ldt->entries, pc->ldt->nr_entries);
- + load_mm_ldt(mm);
-
- refresh_ldt_segments();
- }
- @@ -93,10 +92,121 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
- return NULL;
- }
-
- + /* The new LDT isn't aliased for PTI yet. */
- + new_ldt->slot = -1;
- +
- new_ldt->nr_entries = num_entries;
- return new_ldt;
- }
-
- +/*
- + * If PTI is enabled, this maps the LDT into the kernelmode and
- + * usermode tables for the given mm.
- + *
- + * There is no corresponding unmap function. Even if the LDT is freed, we
- + * leave the PTEs around until the slot is reused or the mm is destroyed.
- + * This is harmless: the LDT is always in ordinary memory, and no one will
- + * access the freed slot.
- + *
- + * If we wanted to unmap freed LDTs, we'd also need to do a flush to make
- + * it useful, and the flush would slow down modify_ldt().
- + */
- +static int
- +map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
- +{
- +#ifdef CONFIG_PAGE_TABLE_ISOLATION
- + bool is_vmalloc, had_top_level_entry;
- + unsigned long va;
- + spinlock_t *ptl;
- + pgd_t *pgd;
- + int i;
- +
- + if (!static_cpu_has(X86_FEATURE_PTI))
- + return 0;
- +
- + /*
- + * Any given ldt_struct should have map_ldt_struct() called at most
- + * once.
- + */
- + WARN_ON(ldt->slot != -1);
- +
- + /*
- + * Did we already have the top level entry allocated? We can't
- + * use pgd_none() for this because it doesn't do anything on
- + * 4-level page table kernels.
- + */
- + pgd = pgd_offset(mm, LDT_BASE_ADDR);
- + had_top_level_entry = (pgd->pgd != 0);
- +
- + is_vmalloc = is_vmalloc_addr(ldt->entries);
- +
- + for (i = 0; i * PAGE_SIZE < ldt->nr_entries * LDT_ENTRY_SIZE; i++) {
- + unsigned long offset = i << PAGE_SHIFT;
- + const void *src = (char *)ldt->entries + offset;
- + unsigned long pfn;
- + pte_t pte, *ptep;
- +
- + va = (unsigned long)ldt_slot_va(slot) + offset;
- + pfn = is_vmalloc ? vmalloc_to_pfn(src) :
- + page_to_pfn(virt_to_page(src));
- + /*
- + * Treat the PTI LDT range as a *userspace* range.
- + * get_locked_pte() will allocate all needed pagetables
- + * and account for them in this mm.
- + */
- + ptep = get_locked_pte(mm, va, &ptl);
- + if (!ptep)
- + return -ENOMEM;
- + pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL & ~_PAGE_GLOBAL));
- + set_pte_at(mm, va, ptep, pte);
- + pte_unmap_unlock(ptep, ptl);
- + }
- +
- + if (mm->context.ldt) {
- + /*
- + * We already had an LDT. The top-level entry should already
- + * have been allocated and synchronized with the usermode
- + * tables.
- + */
- + WARN_ON(!had_top_level_entry);
- + if (static_cpu_has(X86_FEATURE_PTI))
- + WARN_ON(!kernel_to_user_pgdp(pgd)->pgd);
- + } else {
- + /*
- + * This is the first time we're mapping an LDT for this process.
- + * Sync the pgd to the usermode tables.
- + */
- + WARN_ON(had_top_level_entry);
- + if (static_cpu_has(X86_FEATURE_PTI)) {
- + WARN_ON(kernel_to_user_pgdp(pgd)->pgd);
- + set_pgd(kernel_to_user_pgdp(pgd), *pgd);
- + }
- + }
- +
- + va = (unsigned long)ldt_slot_va(slot);
- + flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0);
- +
- + ldt->slot = slot;
- +#endif
- + return 0;
- +}
- +
- +static void free_ldt_pgtables(struct mm_struct *mm)
- +{
- +#ifdef CONFIG_PAGE_TABLE_ISOLATION
- + struct mmu_gather tlb;
- + unsigned long start = LDT_BASE_ADDR;
- + unsigned long end = start + (1UL << PGDIR_SHIFT);
- +
- + if (!static_cpu_has(X86_FEATURE_PTI))
- + return;
- +
- + tlb_gather_mmu(&tlb, mm, start, end);
- + free_pgd_range(&tlb, start, end, start, end);
- + tlb_finish_mmu(&tlb, start, end);
- +#endif
- +}
- +
- /* After calling this, the LDT is immutable. */
- static void finalize_ldt_struct(struct ldt_struct *ldt)
- {
- @@ -155,6 +265,12 @@ int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm)
- new_ldt->nr_entries * LDT_ENTRY_SIZE);
- finalize_ldt_struct(new_ldt);
-
- + retval = map_ldt_struct(mm, new_ldt, 0);
- + if (retval) {
- + free_ldt_pgtables(mm);
- + free_ldt_struct(new_ldt);
- + goto out_unlock;
- + }
- mm->context.ldt = new_ldt;
-
- out_unlock:
- @@ -173,6 +289,11 @@ void destroy_context_ldt(struct mm_struct *mm)
- mm->context.ldt = NULL;
- }
-
- +void ldt_arch_exit_mmap(struct mm_struct *mm)
- +{
- + free_ldt_pgtables(mm);
- +}
- +
- static int read_ldt(void __user *ptr, unsigned long bytecount)
- {
- struct mm_struct *mm = current->mm;
- @@ -286,6 +407,18 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
- new_ldt->entries[ldt_info.entry_number] = ldt;
- finalize_ldt_struct(new_ldt);
-
- + /*
- + * If we are using PTI, map the new LDT into the userspace pagetables.
- + * If there is already an LDT, use the other slot so that other CPUs
- + * will continue to use the old LDT until install_ldt() switches
- + * them over to the new LDT.
- + */
- + error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0);
- + if (error) {
- + free_ldt_struct(new_ldt);
- + goto out_unlock;
- + }
- +
- install_ldt(mm, new_ldt);
- free_ldt_struct(old_ldt);
- error = 0;
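To make the cost of map_ldt_struct() above concrete: it installs one PTE per page of the entries array, using the loop bound `i * PAGE_SIZE < nr_entries * LDT_ENTRY_SIZE`. A hedged userspace model of just that counting (not the kernel function itself):

```c
#include <stdio.h>

#define PAGE_SIZE       4096UL
#define LDT_ENTRY_SIZE  8UL

/* Counts loop iterations with the same bound map_ldt_struct() uses. */
static unsigned long ldt_alias_pages(unsigned long nr_entries)
{
	unsigned long i;

	for (i = 0; i * PAGE_SIZE < nr_entries * LDT_ENTRY_SIZE; i++)
		;
	return i;
}

int main(void)
{
	printf("1 entry      -> %lu page(s)\n", ldt_alias_pages(1));    /* 1  */
	printf("512 entries  -> %lu page(s)\n", ldt_alias_pages(512));  /* 1  */
	printf("513 entries  -> %lu page(s)\n", ldt_alias_pages(513));  /* 2  */
	printf("8192 entries -> %lu pages\n",   ldt_alias_pages(8192)); /* 16 */
	return 0;
}
```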
- diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
- index 3b7720404a9f..eed93dd4cb4a 100644
- --- a/arch/x86/mm/dump_pagetables.c
- +++ b/arch/x86/mm/dump_pagetables.c
- @@ -52,11 +52,17 @@ enum address_markers_idx {
- USER_SPACE_NR = 0,
- KERNEL_SPACE_NR,
- LOW_KERNEL_NR,
- +#if defined(CONFIG_MODIFY_LDT_SYSCALL) && defined(CONFIG_X86_5LEVEL)
- + LDT_NR,
- +#endif
- VMALLOC_START_NR,
- VMEMMAP_START_NR,
- #ifdef CONFIG_KASAN
- KASAN_SHADOW_START_NR,
- KASAN_SHADOW_END_NR,
- +#endif
- +#if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL)
- + LDT_NR,
- #endif
- CPU_ENTRY_AREA_NR,
- #ifdef CONFIG_X86_ESPFIX64
- @@ -81,6 +87,9 @@ static struct addr_marker address_markers[] = {
- #ifdef CONFIG_KASAN
- [KASAN_SHADOW_START_NR] = { KASAN_SHADOW_START, "KASAN shadow" },
- [KASAN_SHADOW_END_NR] = { KASAN_SHADOW_END, "KASAN shadow end" },
- +#endif
- +#ifdef CONFIG_MODIFY_LDT_SYSCALL
- + [LDT_NR] = { LDT_BASE_ADDR, "LDT remap" },
- #endif
- [CPU_ENTRY_AREA_NR] = { CPU_ENTRY_AREA_BASE,"CPU entry Area" },
- #ifdef CONFIG_X86_ESPFIX64
- --
- 2.14.2