author    Rick Edgecombe <rick.p.edgecombe@intel.com>    2019-04-25 20:11:36 -0400
committer Ingo Molnar <mingo@kernel.org>                 2019-04-30 06:37:58 -0400
commit    868b104d7379e28013e9d48bdd2db25e0bdcf751
tree      39fb070adb18f4f4aacde7f2541f334e15c34290
parent    d63326928611600ad65baff54a70f53b02b3cdfe
mm/vmalloc: Add flag for freeing of special permissions
Add a new flag VM_FLUSH_RESET_PERMS, for enabling vfree operations to
immediately clear executable TLB entries before freeing pages, and handle
resetting permissions on the directmap. This flag is useful for any kind of
memory with elevated permissions, or where there can be related permissions
changes on the directmap. Today this is RO+X and RO memory.

Although this enables directly vfreeing non-writable memory now, non-writable
memory cannot be freed in an interrupt because the allocation itself is used
as a node on the deferred free list. So when RO memory needs to be freed in an
interrupt the code doing the vfree needs to have its own work queue, as was
the case before the deferred vfree list was added to vmalloc.

For architectures with set_direct_map_ implementations this whole operation
can be done with one TLB flush when centralized like this. For others with
directmap permissions, currently only arm64, a backup method using set_memory
functions is used to reset the directmap. When arm64 adds set_direct_map_
functions, this backup can be removed.

When the TLB is flushed to both remove TLB entries for the vmalloc range
mapping and the direct map permissions, the lazy purge operation could be done
to try to save a TLB flush later. However today vm_unmap_aliases could flush a
TLB range that does not include the directmap. So a helper is added with extra
parameters that can allow both the vmalloc address and the direct mapping to
be flushed during this operation. The behavior of the normal vm_unmap_aliases
function is unchanged.

Suggested-by: Dave Hansen <dave.hansen@intel.com>
Suggested-by: Andy Lutomirski <luto@kernel.org>
Suggested-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: <akpm@linux-foundation.org>
Cc: <ard.biesheuvel@linaro.org>
Cc: <deneen.t.dock@intel.com>
Cc: <kernel-hardening@lists.openwall.com>
Cc: <kristen@linux.intel.com>
Cc: <linux_dti@icloud.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: https://lkml.kernel.org/r/20190426001143.4983-17-namit@vmware.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
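[Editor's note: for illustration, a minimal sketch of how a caller might use the new flag on an RO+X allocation so that a plain vfree() later tears down the executable alias and resets the direct map. The alloc_rox_page()/free_rox_page() names are hypothetical; set_vm_flush_reset_perms() is the helper added by this patch, while vmalloc(), set_memory_ro(), set_memory_x() and vfree() are existing kernel interfaces.]

#include <linux/mm.h>
#include <linux/set_memory.h>
#include <linux/vmalloc.h>

/* Hypothetical helper: allocate one page that will become RO+X. */
static void *alloc_rox_page(void)
{
	void *page = vmalloc(PAGE_SIZE);

	if (!page)
		return NULL;

	/*
	 * Tag the vm_struct before elevating permissions so the free path
	 * knows to flush the executable TLB entries and reset the direct map.
	 */
	set_vm_flush_reset_perms(page);
	set_memory_ro((unsigned long)page, 1);
	set_memory_x((unsigned long)page, 1);

	return page;
}

/*
 * Hypothetical helper: must run in process context, since
 * VM_FLUSH_RESET_PERMS memory cannot be freed from an interrupt.
 */
static void free_rox_page(void *page)
{
	vfree(page);
}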
-rw-r--r--  include/linux/vmalloc.h   15
-rw-r--r--  mm/vmalloc.c             113
2 files changed, 109 insertions(+), 19 deletions(-)
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 398e9c95cd61..c6eebb839552 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -21,6 +21,11 @@ struct notifier_block; /* in notifier.h */
 #define VM_UNINITIALIZED	0x00000020	/* vm_struct is not fully initialized */
 #define VM_NO_GUARD		0x00000040	/* don't add guard page */
 #define VM_KASAN		0x00000080	/* has allocated kasan shadow memory */
+/*
+ * Memory with VM_FLUSH_RESET_PERMS cannot be freed in an interrupt or with
+ * vfree_atomic().
+ */
+#define VM_FLUSH_RESET_PERMS	0x00000100	/* Reset direct map and flush TLB on unmap */
 /* bits [20..32] reserved for arch specific ioremap internals */
 
 /*
@@ -142,6 +147,13 @@ extern int map_kernel_range_noflush(unsigned long start, unsigned long size,
 				    pgprot_t prot, struct page **pages);
 extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size);
 extern void unmap_kernel_range(unsigned long addr, unsigned long size);
+static inline void set_vm_flush_reset_perms(void *addr)
+{
+	struct vm_struct *vm = find_vm_area(addr);
+
+	if (vm)
+		vm->flags |= VM_FLUSH_RESET_PERMS;
+}
 #else
 static inline int
 map_kernel_range_noflush(unsigned long start, unsigned long size,
@@ -157,6 +169,9 @@ static inline void
 unmap_kernel_range(unsigned long addr, unsigned long size)
 {
 }
+static inline void set_vm_flush_reset_perms(void *addr)
+{
+}
 #endif
 
 /* Allocate/destroy a 'vmalloc' VM area. */
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index e86ba6e74b50..e5e9e1fcac01 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -18,6 +18,7 @@
 #include <linux/interrupt.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
+#include <linux/set_memory.h>
 #include <linux/debugobjects.h>
 #include <linux/kallsyms.h>
 #include <linux/list.h>
@@ -1059,24 +1060,9 @@ static void vb_free(const void *addr, unsigned long size)
 	spin_unlock(&vb->lock);
 }
 
-/**
- * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
- *
- * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
- * to amortize TLB flushing overheads. What this means is that any page you
- * have now, may, in a former life, have been mapped into kernel virtual
- * address by the vmap layer and so there might be some CPUs with TLB entries
- * still referencing that page (additional to the regular 1:1 kernel mapping).
- *
- * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
- * be sure that none of the pages we have control over will have any aliases
- * from the vmap layer.
- */
-void vm_unmap_aliases(void)
+static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
 {
-	unsigned long start = ULONG_MAX, end = 0;
 	int cpu;
-	int flush = 0;
 
 	if (unlikely(!vmap_initialized))
 		return;
@@ -1113,6 +1099,27 @@ void vm_unmap_aliases(void)
 	flush_tlb_kernel_range(start, end);
 	mutex_unlock(&vmap_purge_lock);
 }
+
+/**
+ * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
+ *
+ * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
+ * to amortize TLB flushing overheads. What this means is that any page you
+ * have now, may, in a former life, have been mapped into kernel virtual
+ * address by the vmap layer and so there might be some CPUs with TLB entries
+ * still referencing that page (additional to the regular 1:1 kernel mapping).
+ *
+ * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
+ * be sure that none of the pages we have control over will have any aliases
+ * from the vmap layer.
+ */
+void vm_unmap_aliases(void)
+{
+	unsigned long start = ULONG_MAX, end = 0;
+	int flush = 0;
+
+	_vm_unmap_aliases(start, end, flush);
+}
 EXPORT_SYMBOL_GPL(vm_unmap_aliases);
 
 /**
@@ -1505,6 +1512,72 @@ struct vm_struct *remove_vm_area(const void *addr)
 	return NULL;
 }
 
+static inline void set_area_direct_map(const struct vm_struct *area,
+				       int (*set_direct_map)(struct page *page))
+{
+	int i;
+
+	for (i = 0; i < area->nr_pages; i++)
+		if (page_address(area->pages[i]))
+			set_direct_map(area->pages[i]);
+}
+
+/* Handle removing and resetting vm mappings related to the vm_struct. */
+static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
+{
+	unsigned long addr = (unsigned long)area->addr;
+	unsigned long start = ULONG_MAX, end = 0;
+	int flush_reset = area->flags & VM_FLUSH_RESET_PERMS;
+	int i;
+
+	/*
+	 * The below block can be removed when all architectures that have
+	 * direct map permissions also have set_direct_map_() implementations.
+	 * This is concerned with resetting the direct map for any vm alias
+	 * with execute permissions, without leaving a RW+X window.
+	 */
+	if (flush_reset && !IS_ENABLED(CONFIG_ARCH_HAS_SET_DIRECT_MAP)) {
+		set_memory_nx(addr, area->nr_pages);
+		set_memory_rw(addr, area->nr_pages);
+	}
+
+	remove_vm_area(area->addr);
+
+	/* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. */
+	if (!flush_reset)
+		return;
+
+	/*
+	 * If not deallocating pages, just do the flush of the VM area and
+	 * return.
+	 */
+	if (!deallocate_pages) {
+		vm_unmap_aliases();
+		return;
+	}
+
+	/*
+	 * If execution gets here, flush the vm mapping and reset the direct
+	 * map. Find the start and end range of the direct mappings to make
+	 * sure the vm_unmap_aliases() flush includes the direct map.
+	 */
+	for (i = 0; i < area->nr_pages; i++) {
+		if (page_address(area->pages[i])) {
+			start = min(addr, start);
+			end = max(addr, end);
+		}
+	}
+
+	/*
+	 * Set direct map to something invalid so that it won't be cached if
+	 * there are any accesses after the TLB flush, then flush the TLB and
+	 * reset the direct map permissions to the default.
+	 */
+	set_area_direct_map(area, set_direct_map_invalid_noflush);
+	_vm_unmap_aliases(start, end, 1);
+	set_area_direct_map(area, set_direct_map_default_noflush);
+}
+
 static void __vunmap(const void *addr, int deallocate_pages)
 {
 	struct vm_struct *area;
@@ -1526,7 +1599,8 @@ static void __vunmap(const void *addr, int deallocate_pages)
 	debug_check_no_locks_freed(area->addr, get_vm_area_size(area));
 	debug_check_no_obj_freed(area->addr, get_vm_area_size(area));
 
-	remove_vm_area(addr);
+	vm_remove_mappings(area, deallocate_pages);
+
 	if (deallocate_pages) {
 		int i;
 
@@ -1961,8 +2035,9 @@ EXPORT_SYMBOL(vzalloc_node);
  */
 void *vmalloc_exec(unsigned long size)
 {
-	return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL_EXEC,
-			NUMA_NO_NODE, __builtin_return_address(0));
+	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
+			GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
+			NUMA_NO_NODE, __builtin_return_address(0));
 }
 
 #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
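[Editor's note: as the changelog states, VM_FLUSH_RESET_PERMS memory cannot be freed in an interrupt or with vfree_atomic(), so a user that may need to release it from interrupt context has to defer the vfree() to its own work item. A simplified sketch of that pattern follows; the rox_* names are illustrative, there is no locking around the single pending pointer, and only existing interfaces (DECLARE_WORK(), schedule_work(), vfree()) are assumed.]

#include <linux/vmalloc.h>
#include <linux/workqueue.h>

static void *rox_pending;	/* hypothetical: at most one deferred free */

static void rox_free_workfn(struct work_struct *work)
{
	/* Runs in process context, where vfree() of this memory is allowed. */
	vfree(rox_pending);
	rox_pending = NULL;
}

static DECLARE_WORK(rox_free_work, rox_free_workfn);

/* May be called from interrupt context. */
static void rox_free_deferred(void *addr)
{
	rox_pending = addr;
	schedule_work(&rox_free_work);
}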