0205-x86-pti-Put-the-LDT-in-its-own-PGD-if-PTI-is-on.patch

From e0e5d2785d4b282a1f82f36199f52f9196868d6b Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <[email protected]>
Date: Tue, 12 Dec 2017 07:56:45 -0800
Subject: [PATCH 205/242] x86/pti: Put the LDT in its own PGD if PTI is on
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CVE-2017-5754
With PTI enabled, the LDT must be mapped in the usermode tables somewhere.
The LDT is per process, i.e. per mm.

An earlier approach mapped the LDT on context switch into a fixmap area,
but that's a big overhead and exhausted the fixmap space when NR_CPUS got
big.

Take advantage of the fact that there is an address space hole which
provides a completely unused pgd. Use this pgd to manage per-mm LDT
mappings.

This has a down side: the LDT isn't (currently) randomized, and an attack
that can write the LDT is instant root due to call gates (thanks, AMD, for
leaving call gates in AMD64 but designing them wrong so they're only useful
for exploits). This can be mitigated by making the LDT read-only or
randomizing the mapping, either of which is straightforward on top of this
patch.

This will significantly slow down LDT users, but that shouldn't matter for
important workloads -- the LDT is only used by DOSEMU(2), Wine, and very
old libc implementations.

[ tglx: Cleaned it up. ]
Signed-off-by: Andy Lutomirski <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Brian Gerst <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: Dave Hansen <[email protected]>
Cc: David Laight <[email protected]>
Cc: H. Peter Anvin <[email protected]>
Cc: Josh Poimboeuf <[email protected]>
Cc: Juergen Gross <[email protected]>
Cc: Kees Cook <[email protected]>
Cc: Kirill A. Shutemov <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
(cherry picked from commit f55f0501cbf65ec41cca5058513031b711730b1d)
Signed-off-by: Andy Whitcroft <[email protected]>
Signed-off-by: Kleber Sacilotto de Souza <[email protected]>
(cherry picked from commit c250643846b45ea6782fb0cfcc15e8cd34744bc7)
Signed-off-by: Fabian Grünbichler <[email protected]>
---
 Documentation/x86/x86_64/mm.txt         |   3 +-
 arch/x86/include/asm/mmu_context.h      |  59 ++++++++++++--
 arch/x86/include/asm/pgtable_64_types.h |   4 +
 arch/x86/include/asm/processor.h        |  23 ++++--
 arch/x86/kernel/ldt.c                   | 139 +++++++++++++++++++++++++++++++-
 arch/x86/mm/dump_pagetables.c           |   9 +++
 6 files changed, 220 insertions(+), 17 deletions(-)
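
The LDT remap area introduced below is nothing more than one otherwise-unused
pgd slot: LDT_BASE_ADDR is defined as LDT_PGD_ENTRY << PGDIR_SHIFT, with a
negative entry index so the result lands high in the kernel half of the
address space. A standalone sketch (not part of the patch; it assumes the
usual x86-64 PGDIR_SHIFT values of 39 for 4-level and 48 for 5-level paging)
reproduces the addresses documented in mm.txt:

	/* Not part of the patch: check LDT_BASE_ADDR = LDT_PGD_ENTRY << PGDIR_SHIFT. */
	#include <stdio.h>

	int main(void)
	{
		unsigned long base4 = -4UL   << 39;	/* 4-level paging, LDT_PGD_ENTRY = -4   */
		unsigned long base5 = -112UL << 48;	/* 5-level paging, LDT_PGD_ENTRY = -112 */

		printf("4-level LDT_BASE_ADDR = 0x%016lx\n", base4);	/* 0xfffffe0000000000 */
		printf("5-level LDT_BASE_ADDR = 0x%016lx\n", base5);	/* 0xff90000000000000 */
		return 0;
	}

One pgd entry covers 512 GiB with 4-level paging, while each LDT slot is only
LDT_ENTRIES * LDT_ENTRY_SIZE = 64 KiB, so the two per-mm slots fit with room
to spare.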
diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index 496a1dbf139d..ad41b3813f0a 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -12,6 +12,7 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
 ... unused hole ...
 ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
 ... unused hole ...
+fffffe0000000000 - fffffe7fffffffff (=39 bits) LDT remap for PTI
 fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping
 ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
 ... unused hole ...
@@ -29,7 +30,7 @@ Virtual memory map with 5 level page tables:
 hole caused by [56:63] sign extension
 ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor
 ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory
-ff90000000000000 - ff9fffffffffffff (=52 bits) hole
+ff90000000000000 - ff9fffffffffffff (=52 bits) LDT remap for PTI
 ffa0000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space (12800 TB)
 ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole
 ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 89a01ad7e370..9e3546e1c0f4 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -49,10 +49,33 @@ struct ldt_struct {
	 * call gates. On native, we could merge the ldt_struct and LDT
	 * allocations, but it's not worth trying to optimize.
	 */
-	struct desc_struct *entries;
-	unsigned int nr_entries;
+	struct desc_struct	*entries;
+	unsigned int		nr_entries;
+
+	/*
+	 * If PTI is in use, then the entries array is not mapped while we're
+	 * in user mode. The whole array will be aliased at the address
+	 * given by ldt_slot_va(slot). We use two slots so that we can allocate
+	 * and map, and enable a new LDT without invalidating the mapping
+	 * of an older, still-in-use LDT.
+	 *
+	 * slot will be -1 if this LDT doesn't have an alias mapping.
+	 */
+	int			slot;
 };
+/* This is a multiple of PAGE_SIZE. */
+#define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE)
+
+static inline void *ldt_slot_va(int slot)
+{
+#ifdef CONFIG_X86_64
+	return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot);
+#else
+	BUG();
+#endif
+}
+
 /*
  * Used for LDT copy/destruction.
  */
@@ -63,6 +86,7 @@ static inline void init_new_context_ldt(struct mm_struct *mm)
 }
 int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm);
 void destroy_context_ldt(struct mm_struct *mm);
+void ldt_arch_exit_mmap(struct mm_struct *mm);
 #else	/* CONFIG_MODIFY_LDT_SYSCALL */
 static inline void init_new_context_ldt(struct mm_struct *mm) { }
 static inline int ldt_dup_context(struct mm_struct *oldmm,
@@ -70,7 +94,8 @@ static inline int ldt_dup_context(struct mm_struct *oldmm,
 {
	return 0;
 }
-static inline void destroy_context_ldt(struct mm_struct *mm) {}
+static inline void destroy_context_ldt(struct mm_struct *mm) { }
+static inline void ldt_arch_exit_mmap(struct mm_struct *mm) { }
 #endif
 
 static inline void load_mm_ldt(struct mm_struct *mm)
@@ -95,10 +120,31 @@ static inline void load_mm_ldt(struct mm_struct *mm)
	 * that we can see.
	 */
-	if (unlikely(ldt))
-		set_ldt(ldt->entries, ldt->nr_entries);
-	else
+	if (unlikely(ldt)) {
+		if (static_cpu_has(X86_FEATURE_PTI)) {
+			if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) {
+				/*
+				 * Whoops -- either the new LDT isn't mapped
+				 * (if slot == -1) or is mapped into a bogus
+				 * slot (if slot > 1).
+				 */
+				clear_LDT();
+				return;
+			}
+
+			/*
+			 * If page table isolation is enabled, ldt->entries
+			 * will not be mapped in the userspace pagetables.
+			 * Tell the CPU to access the LDT through the alias
+			 * at ldt_slot_va(ldt->slot).
+			 */
+			set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries);
+		} else {
+			set_ldt(ldt->entries, ldt->nr_entries);
+		}
+	} else {
		clear_LDT();
+	}
 #else
	clear_LDT();
 #endif
@@ -193,6 +239,7 @@ static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
 static inline void arch_exit_mmap(struct mm_struct *mm)
 {
	paravirt_arch_exit_mmap(mm);
+	ldt_arch_exit_mmap(mm);
 }
 
 #ifdef CONFIG_X86_64
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 5932dead34ee..e8a809ee0bb6 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -81,10 +81,14 @@ typedef struct { pteval_t pte; } pte_t;
 # define VMALLOC_SIZE_TB	_AC(12800, UL)
 # define __VMALLOC_BASE		_AC(0xffa0000000000000, UL)
 # define __VMEMMAP_BASE		_AC(0xffd4000000000000, UL)
+# define LDT_PGD_ENTRY		_AC(-112, UL)
+# define LDT_BASE_ADDR		(LDT_PGD_ENTRY << PGDIR_SHIFT)
 #else
 # define VMALLOC_SIZE_TB	_AC(32, UL)
 # define __VMALLOC_BASE		_AC(0xffffc90000000000, UL)
 # define __VMEMMAP_BASE		_AC(0xffffea0000000000, UL)
+# define LDT_PGD_ENTRY		_AC(-4, UL)
+# define LDT_BASE_ADDR		(LDT_PGD_ENTRY << PGDIR_SHIFT)
 #endif
 
 #ifdef CONFIG_RANDOMIZE_MEMORY
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 935d68609922..24503521c947 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -843,13 +843,22 @@ static inline void spin_lock_prefetch(const void *x)
 #else
 /*
- * User space process size. 47bits minus one guard page. The guard
- * page is necessary on Intel CPUs: if a SYSCALL instruction is at
- * the highest possible canonical userspace address, then that
- * syscall will enter the kernel with a non-canonical return
- * address, and SYSRET will explode dangerously. We avoid this
- * particular problem by preventing anything from being mapped
- * at the maximum canonical address.
+ * User space process size. This is the first address outside the user range.
+ * There are a few constraints that determine this:
+ *
+ * On Intel CPUs, if a SYSCALL instruction is at the highest canonical
+ * address, then that syscall will enter the kernel with a
+ * non-canonical return address, and SYSRET will explode dangerously.
+ * We avoid this particular problem by preventing anything executable
+ * from being mapped at the maximum canonical address.
+ *
+ * On AMD CPUs in the Ryzen family, there's a nasty bug in which the
+ * CPUs malfunction if they execute code from the highest canonical page.
+ * They'll speculate right off the end of the canonical space, and
+ * bad things happen. This is worked around in the same way as the
+ * Intel problem.
+ *
+ * With page table isolation enabled, we map the LDT in ... [stay tuned]
  */
 #define TASK_SIZE_MAX	((1UL << 47) - PAGE_SIZE)
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 74a5aaf13f3c..eceaada581ff 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -23,6 +23,7 @@
 #include <linux/uaccess.h>
 
 #include <asm/ldt.h>
+#include <asm/tlb.h>
 #include <asm/desc.h>
 #include <asm/mmu_context.h>
 #include <asm/syscalls.h>
@@ -50,13 +51,11 @@ static void refresh_ldt_segments(void)
 static void flush_ldt(void *__mm)
 {
	struct mm_struct *mm = __mm;
-	mm_context_t *pc;
 
	if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm)
		return;
 
-	pc = &mm->context;
-	set_ldt(pc->ldt->entries, pc->ldt->nr_entries);
+	load_mm_ldt(mm);
 
	refresh_ldt_segments();
 }
@@ -93,10 +92,121 @@ static struct ldt_struct *alloc_ldt_struct(unsigned int num_entries)
		return NULL;
	}
 
+	/* The new LDT isn't aliased for PTI yet. */
+	new_ldt->slot = -1;
+
	new_ldt->nr_entries = num_entries;
	return new_ldt;
 }
 
+/*
+ * If PTI is enabled, this maps the LDT into the kernelmode and
+ * usermode tables for the given mm.
+ *
+ * There is no corresponding unmap function. Even if the LDT is freed, we
+ * leave the PTEs around until the slot is reused or the mm is destroyed.
+ * This is harmless: the LDT is always in ordinary memory, and no one will
+ * access the freed slot.
+ *
+ * If we wanted to unmap freed LDTs, we'd also need to do a flush to make
+ * it useful, and the flush would slow down modify_ldt().
+ */
+static int
+map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot)
+{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	bool is_vmalloc, had_top_level_entry;
+	unsigned long va;
+	spinlock_t *ptl;
+	pgd_t *pgd;
+	int i;
+
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return 0;
+
+	/*
+	 * Any given ldt_struct should have map_ldt_struct() called at most
+	 * once.
+	 */
+	WARN_ON(ldt->slot != -1);
+
+	/*
+	 * Did we already have the top level entry allocated? We can't
+	 * use pgd_none() for this because it doesn't do anything on
+	 * 4-level page table kernels.
+	 */
+	pgd = pgd_offset(mm, LDT_BASE_ADDR);
+	had_top_level_entry = (pgd->pgd != 0);
+
+	is_vmalloc = is_vmalloc_addr(ldt->entries);
+
+	for (i = 0; i * PAGE_SIZE < ldt->nr_entries * LDT_ENTRY_SIZE; i++) {
+		unsigned long offset = i << PAGE_SHIFT;
+		const void *src = (char *)ldt->entries + offset;
+		unsigned long pfn;
+		pte_t pte, *ptep;
+
+		va = (unsigned long)ldt_slot_va(slot) + offset;
+		pfn = is_vmalloc ? vmalloc_to_pfn(src) :
+			page_to_pfn(virt_to_page(src));
+		/*
+		 * Treat the PTI LDT range as a *userspace* range.
+		 * get_locked_pte() will allocate all needed pagetables
+		 * and account for them in this mm.
+		 */
+		ptep = get_locked_pte(mm, va, &ptl);
+		if (!ptep)
+			return -ENOMEM;
+		pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL & ~_PAGE_GLOBAL));
+		set_pte_at(mm, va, ptep, pte);
+		pte_unmap_unlock(ptep, ptl);
+	}
+
+	if (mm->context.ldt) {
+		/*
+		 * We already had an LDT. The top-level entry should already
+		 * have been allocated and synchronized with the usermode
+		 * tables.
+		 */
+		WARN_ON(!had_top_level_entry);
+		if (static_cpu_has(X86_FEATURE_PTI))
+			WARN_ON(!kernel_to_user_pgdp(pgd)->pgd);
+	} else {
+		/*
+		 * This is the first time we're mapping an LDT for this process.
+		 * Sync the pgd to the usermode tables.
+		 */
+		WARN_ON(had_top_level_entry);
+		if (static_cpu_has(X86_FEATURE_PTI)) {
+			WARN_ON(kernel_to_user_pgdp(pgd)->pgd);
+			set_pgd(kernel_to_user_pgdp(pgd), *pgd);
+		}
+	}
+
+	va = (unsigned long)ldt_slot_va(slot);
+	flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0);
+
+	ldt->slot = slot;
+#endif
+	return 0;
+}
+
+static void free_ldt_pgtables(struct mm_struct *mm)
+{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	struct mmu_gather tlb;
+	unsigned long start = LDT_BASE_ADDR;
+	unsigned long end = start + (1UL << PGDIR_SHIFT);
+
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return;
+
+	tlb_gather_mmu(&tlb, mm, start, end);
+	free_pgd_range(&tlb, start, end, start, end);
+	tlb_finish_mmu(&tlb, start, end);
+#endif
+}
+
 /* After calling this, the LDT is immutable. */
 static void finalize_ldt_struct(struct ldt_struct *ldt)
 {
@@ -155,6 +265,12 @@ int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm)
	       new_ldt->nr_entries * LDT_ENTRY_SIZE);
	finalize_ldt_struct(new_ldt);
 
+	retval = map_ldt_struct(mm, new_ldt, 0);
+	if (retval) {
+		free_ldt_pgtables(mm);
+		free_ldt_struct(new_ldt);
+		goto out_unlock;
+	}
	mm->context.ldt = new_ldt;
 
 out_unlock:
@@ -173,6 +289,11 @@ void destroy_context_ldt(struct mm_struct *mm)
	mm->context.ldt = NULL;
 }
 
+void ldt_arch_exit_mmap(struct mm_struct *mm)
+{
+	free_ldt_pgtables(mm);
+}
+
 static int read_ldt(void __user *ptr, unsigned long bytecount)
 {
	struct mm_struct *mm = current->mm;
@@ -286,6 +407,18 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode)
	new_ldt->entries[ldt_info.entry_number] = ldt;
	finalize_ldt_struct(new_ldt);
 
+	/*
+	 * If we are using PTI, map the new LDT into the userspace pagetables.
+	 * If there is already an LDT, use the other slot so that other CPUs
+	 * will continue to use the old LDT until install_ldt() switches
+	 * them over to the new LDT.
+	 */
+	error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0);
+	if (error) {
+		free_ldt_struct(old_ldt);
+		goto out_unlock;
+	}
+
	install_ldt(mm, new_ldt);
	free_ldt_struct(old_ldt);
	error = 0;
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 3b7720404a9f..eed93dd4cb4a 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -52,11 +52,17 @@ enum address_markers_idx {
	USER_SPACE_NR = 0,
	KERNEL_SPACE_NR,
	LOW_KERNEL_NR,
+#if defined(CONFIG_MODIFY_LDT_SYSCALL) && defined(CONFIG_X86_5LEVEL)
+	LDT_NR,
+#endif
	VMALLOC_START_NR,
	VMEMMAP_START_NR,
 #ifdef CONFIG_KASAN
	KASAN_SHADOW_START_NR,
	KASAN_SHADOW_END_NR,
+#endif
+#if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL)
+	LDT_NR,
 #endif
	CPU_ENTRY_AREA_NR,
 #ifdef CONFIG_X86_ESPFIX64
@@ -81,6 +87,9 @@ static struct addr_marker address_markers[] = {
 #ifdef CONFIG_KASAN
	[KASAN_SHADOW_START_NR]	= { KASAN_SHADOW_START,	"KASAN shadow" },
	[KASAN_SHADOW_END_NR]	= { KASAN_SHADOW_END,	"KASAN shadow end" },
+#endif
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+	[LDT_NR]		= { LDT_BASE_ADDR,	"LDT remap" },
 #endif
	[CPU_ENTRY_AREA_NR]	= { CPU_ENTRY_AREA_BASE,"CPU entry Area" },
 #ifdef CONFIG_X86_ESPFIX64
-- 
2.14.2
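
For context on the "LDT users" the commit message expects to slow down:
a process only gets an LDT through the modify_ldt(2) syscall, and with PTI
every write_ldt() call now also runs map_ldt_struct() with its per-page
mapping and TLB flush. A minimal userspace sketch (not part of the patch)
that installs a single LDT entry; the 0x11 function code is modify_ldt's
"write an entry" mode and struct user_desc comes from <asm/ldt.h>:

	/* Not part of the patch: exercise the LDT path that PTI now remaps. */
	#include <asm/ldt.h>		/* struct user_desc */
	#include <stdio.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	int main(void)
	{
		struct user_desc desc;

		memset(&desc, 0, sizeof(desc));
		desc.entry_number   = 0;	/* first LDT slot */
		desc.base_addr      = 0;
		desc.limit          = 0xfffff;	/* flat 4 GiB data segment */
		desc.seg_32bit      = 1;
		desc.limit_in_pages = 1;
		desc.useable        = 1;

		/* The kernel allocates (and, with PTI, aliases) the LDT here. */
		if (syscall(SYS_modify_ldt, 0x11, &desc, sizeof(desc)) != 0) {
			perror("modify_ldt");
			return 1;
		}
		printf("LDT entry 0 installed\n");
		return 0;
	}

The extra mapping and flush_tlb_mm_range() work done by map_ldt_struct() on
each such call is the modify_ldt() slowdown the commit message refers to.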