aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
author: Linus Torvalds <torvalds@linux-foundation.org> 2019-05-06 19:13:31 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2019-05-06 19:13:31 -0400
commit: 0bc40e549aeea2de20fc571749de9bbfc099fb34 (patch)
tree: d18f3339bd383a17431fca23b6c5f3e54c93cf2f /kernel
parent: e913c4a4c21cd83317fafe63bfdc9d34d2910114 (diff)
parent: caa841360134f863987f2d4f77b8dc2fbb7596f8 (diff)
Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 mm updates from Ingo Molnar:
 "The changes in here are:

   - text_poke() fixes and an extensive set of executability lockdowns,
     to (hopefully) eliminate the last residual circumstances under
     which we are using W|X mappings even temporarily on x86 kernels.
     This required a broad range of surgery in text patching
     facilities, module loading, trampoline handling and other bits.

   - tweak page fault messages to be more informative and more
     structured.

   - remove DISCONTIGMEM support on x86-32 and make SPARSEMEM the
     default.

   - reduce KASLR granularity on 5-level paging kernels from 512 GB to
     1 GB.

   - misc other changes and updates"

* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (36 commits)
  x86/mm: Initialize PGD cache during mm initialization
  x86/alternatives: Add comment about module removal races
  x86/kprobes: Use vmalloc special flag
  x86/ftrace: Use vmalloc special flag
  bpf: Use vmalloc special flag
  modules: Use vmalloc special flag
  mm/vmalloc: Add flag for freeing of special permsissions
  mm/hibernation: Make hibernation handle unmapped pages
  x86/mm/cpa: Add set_direct_map_*() functions
  x86/alternatives: Remove the return value of text_poke_*()
  x86/jump-label: Remove support for custom text poker
  x86/modules: Avoid breaking W^X while loading modules
  x86/kprobes: Set instruction page as executable
  x86/ftrace: Set trampoline pages as executable
  x86/kgdb: Avoid redundant comparison of patched code
  x86/alternatives: Use temporary mm for text poking
  x86/alternatives: Initialize temporary mm for patching
  fork: Provide a function for copying init_mm
  uprobes: Initialize uprobes earlier
  x86/mm: Save debug registers when loading a temporary mm
  ...
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/bpf/core.c 1
-rw-r--r-- kernel/events/uprobes.c 8
-rw-r--r-- kernel/fork.c 25
-rw-r--r-- kernel/module.c 82
-rw-r--r-- kernel/power/snapshot.c 5
-rw-r--r-- kernel/trace/bpf_trace.c 8
6 files changed, 77 insertions, 52 deletions
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index ff09d32a8a1b..c605397c79f0 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -848,7 +848,6 @@ void __weak bpf_jit_free(struct bpf_prog *fp)
848 if (fp->jited) { 848 if (fp->jited) {
849 struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp); 849 struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp);
850 850
851 bpf_jit_binary_unlock_ro(hdr);
852 bpf_jit_binary_free(hdr); 851 bpf_jit_binary_free(hdr);
853 852
854 WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp)); 853 WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp));
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index c5cde87329c7..e6a0d6be87e3 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -2294,16 +2294,14 @@ static struct notifier_block uprobe_exception_nb = {
2294 .priority = INT_MAX-1, /* notified after kprobes, kgdb */ 2294 .priority = INT_MAX-1, /* notified after kprobes, kgdb */
2295}; 2295};
2296 2296
2297static int __init init_uprobes(void) 2297void __init uprobes_init(void)
2298{ 2298{
2299 int i; 2299 int i;
2300 2300
2301 for (i = 0; i < UPROBES_HASH_SZ; i++) 2301 for (i = 0; i < UPROBES_HASH_SZ; i++)
2302 mutex_init(&uprobes_mmap_mutex[i]); 2302 mutex_init(&uprobes_mmap_mutex[i]);
2303 2303
2304 if (percpu_init_rwsem(&dup_mmap_sem)) 2304 BUG_ON(percpu_init_rwsem(&dup_mmap_sem));
2305 return -ENOMEM;
2306 2305
2307 return register_die_notifier(&uprobe_exception_nb); 2306 BUG_ON(register_die_notifier(&uprobe_exception_nb));
2308} 2307}
2309__initcall(init_uprobes);
diff --git a/kernel/fork.c b/kernel/fork.c
index 9dcd18aa210b..fbe9dfcd8680 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -815,6 +815,7 @@ void __init fork_init(void)
815#endif 815#endif
816 816
817 lockdep_init_task(&init_task); 817 lockdep_init_task(&init_task);
818 uprobes_init();
818} 819}
819 820
820int __weak arch_dup_task_struct(struct task_struct *dst, 821int __weak arch_dup_task_struct(struct task_struct *dst,
@@ -1298,13 +1299,20 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
1298 complete_vfork_done(tsk); 1299 complete_vfork_done(tsk);
1299} 1300}
1300 1301
1301/* 1302/**
1302 * Allocate a new mm structure and copy contents from the 1303 * dup_mm() - duplicates an existing mm structure
1303 * mm structure of the passed in task structure. 1304 * @tsk: the task_struct with which the new mm will be associated.
1305 * @oldmm: the mm to duplicate.
1306 *
1307 * Allocates a new mm structure and duplicates the provided @oldmm structure
1308 * content into it.
1309 *
1310 * Return: the duplicated mm or NULL on failure.
1304 */ 1311 */
1305static struct mm_struct *dup_mm(struct task_struct *tsk) 1312static struct mm_struct *dup_mm(struct task_struct *tsk,
1313 struct mm_struct *oldmm)
1306{ 1314{
1307 struct mm_struct *mm, *oldmm = current->mm; 1315 struct mm_struct *mm;
1308 int err; 1316 int err;
1309 1317
1310 mm = allocate_mm(); 1318 mm = allocate_mm();
@@ -1371,7 +1379,7 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
1371 } 1379 }
1372 1380
1373 retval = -ENOMEM; 1381 retval = -ENOMEM;
1374 mm = dup_mm(tsk); 1382 mm = dup_mm(tsk, current->mm);
1375 if (!mm) 1383 if (!mm)
1376 goto fail_nomem; 1384 goto fail_nomem;
1377 1385
@@ -2186,6 +2194,11 @@ struct task_struct *fork_idle(int cpu)
2186 return task; 2194 return task;
2187} 2195}
2188 2196
2197struct mm_struct *copy_init_mm(void)
2198{
2199 return dup_mm(NULL, &init_mm);
2200}
2201
2189/* 2202/*
2190 * Ok, this is the main fork-routine. 2203 * Ok, this is the main fork-routine.
2191 * 2204 *
diff --git a/kernel/module.c b/kernel/module.c
index 0b9aa8ab89f0..a9020bdd4cf6 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -98,6 +98,10 @@ DEFINE_MUTEX(module_mutex);
98EXPORT_SYMBOL_GPL(module_mutex); 98EXPORT_SYMBOL_GPL(module_mutex);
99static LIST_HEAD(modules); 99static LIST_HEAD(modules);
100 100
101/* Work queue for freeing init sections in success case */
102static struct work_struct init_free_wq;
103static struct llist_head init_free_list;
104
101#ifdef CONFIG_MODULES_TREE_LOOKUP 105#ifdef CONFIG_MODULES_TREE_LOOKUP
102 106
103/* 107/*
@@ -1949,9 +1953,16 @@ void module_enable_ro(const struct module *mod, bool after_init)
1949 if (!rodata_enabled) 1953 if (!rodata_enabled)
1950 return; 1954 return;
1951 1955
1956 set_vm_flush_reset_perms(mod->core_layout.base);
1957 set_vm_flush_reset_perms(mod->init_layout.base);
1952 frob_text(&mod->core_layout, set_memory_ro); 1958 frob_text(&mod->core_layout, set_memory_ro);
1959 frob_text(&mod->core_layout, set_memory_x);
1960
1953 frob_rodata(&mod->core_layout, set_memory_ro); 1961 frob_rodata(&mod->core_layout, set_memory_ro);
1962
1954 frob_text(&mod->init_layout, set_memory_ro); 1963 frob_text(&mod->init_layout, set_memory_ro);
1964 frob_text(&mod->init_layout, set_memory_x);
1965
1955 frob_rodata(&mod->init_layout, set_memory_ro); 1966 frob_rodata(&mod->init_layout, set_memory_ro);
1956 1967
1957 if (after_init) 1968 if (after_init)
@@ -1967,15 +1978,6 @@ static void module_enable_nx(const struct module *mod)
1967 frob_writable_data(&mod->init_layout, set_memory_nx); 1978 frob_writable_data(&mod->init_layout, set_memory_nx);
1968} 1979}
1969 1980
1970static void module_disable_nx(const struct module *mod)
1971{
1972 frob_rodata(&mod->core_layout, set_memory_x);
1973 frob_ro_after_init(&mod->core_layout, set_memory_x);
1974 frob_writable_data(&mod->core_layout, set_memory_x);
1975 frob_rodata(&mod->init_layout, set_memory_x);
1976 frob_writable_data(&mod->init_layout, set_memory_x);
1977}
1978
1979/* Iterate through all modules and set each module's text as RW */ 1981/* Iterate through all modules and set each module's text as RW */
1980void set_all_modules_text_rw(void) 1982void set_all_modules_text_rw(void)
1981{ 1983{
@@ -2019,23 +2021,8 @@ void set_all_modules_text_ro(void)
2019 } 2021 }
2020 mutex_unlock(&module_mutex); 2022 mutex_unlock(&module_mutex);
2021} 2023}
2022
2023static void disable_ro_nx(const struct module_layout *layout)
2024{
2025 if (rodata_enabled) {
2026 frob_text(layout, set_memory_rw);
2027 frob_rodata(layout, set_memory_rw);
2028 frob_ro_after_init(layout, set_memory_rw);
2029 }
2030 frob_rodata(layout, set_memory_x);
2031 frob_ro_after_init(layout, set_memory_x);
2032 frob_writable_data(layout, set_memory_x);
2033}
2034
2035#else 2024#else
2036static void disable_ro_nx(const struct module_layout *layout) { }
2037static void module_enable_nx(const struct module *mod) { } 2025static void module_enable_nx(const struct module *mod) { }
2038static void module_disable_nx(const struct module *mod) { }
2039#endif 2026#endif
2040 2027
2041#ifdef CONFIG_LIVEPATCH 2028#ifdef CONFIG_LIVEPATCH
@@ -2115,6 +2102,11 @@ static void free_module_elf(struct module *mod)
2115 2102
2116void __weak module_memfree(void *module_region) 2103void __weak module_memfree(void *module_region)
2117{ 2104{
2105 /*
2106 * This memory may be RO, and freeing RO memory in an interrupt is not
2107 * supported by vmalloc.
2108 */
2109 WARN_ON(in_interrupt());
2118 vfree(module_region); 2110 vfree(module_region);
2119} 2111}
2120 2112
@@ -2166,7 +2158,6 @@ static void free_module(struct module *mod)
2166 mutex_unlock(&module_mutex); 2158 mutex_unlock(&module_mutex);
2167 2159
2168 /* This may be empty, but that's OK */ 2160 /* This may be empty, but that's OK */
2169 disable_ro_nx(&mod->init_layout);
2170 module_arch_freeing_init(mod); 2161 module_arch_freeing_init(mod);
2171 module_memfree(mod->init_layout.base); 2162 module_memfree(mod->init_layout.base);
2172 kfree(mod->args); 2163 kfree(mod->args);
@@ -2176,7 +2167,6 @@ static void free_module(struct module *mod)
2176 lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size); 2167 lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size);
2177 2168
2178 /* Finally, free the core (containing the module structure) */ 2169 /* Finally, free the core (containing the module structure) */
2179 disable_ro_nx(&mod->core_layout);
2180 module_memfree(mod->core_layout.base); 2170 module_memfree(mod->core_layout.base);
2181} 2171}
2182 2172
@@ -3415,17 +3405,34 @@ static void do_mod_ctors(struct module *mod)
3415 3405
3416/* For freeing module_init on success, in case kallsyms traversing */ 3406/* For freeing module_init on success, in case kallsyms traversing */
3417struct mod_initfree { 3407struct mod_initfree {
3418 struct rcu_head rcu; 3408 struct llist_node node;
3419 void *module_init; 3409 void *module_init;
3420}; 3410};
3421 3411
3422static void do_free_init(struct rcu_head *head) 3412static void do_free_init(struct work_struct *w)
3423{ 3413{
3424 struct mod_initfree *m = container_of(head, struct mod_initfree, rcu); 3414 struct llist_node *pos, *n, *list;
3425 module_memfree(m->module_init); 3415 struct mod_initfree *initfree;
3426 kfree(m); 3416
3417 list = llist_del_all(&init_free_list);
3418
3419 synchronize_rcu();
3420
3421 llist_for_each_safe(pos, n, list) {
3422 initfree = container_of(pos, struct mod_initfree, node);
3423 module_memfree(initfree->module_init);
3424 kfree(initfree);
3425 }
3427} 3426}
3428 3427
3428static int __init modules_wq_init(void)
3429{
3430 INIT_WORK(&init_free_wq, do_free_init);
3431 init_llist_head(&init_free_list);
3432 return 0;
3433}
3434module_init(modules_wq_init);
3435
3429/* 3436/*
3430 * This is where the real work happens. 3437 * This is where the real work happens.
3431 * 3438 *
@@ -3502,7 +3509,6 @@ static noinline int do_init_module(struct module *mod)
3502#endif 3509#endif
3503 module_enable_ro(mod, true); 3510 module_enable_ro(mod, true);
3504 mod_tree_remove_init(mod); 3511 mod_tree_remove_init(mod);
3505 disable_ro_nx(&mod->init_layout);
3506 module_arch_freeing_init(mod); 3512 module_arch_freeing_init(mod);
3507 mod->init_layout.base = NULL; 3513 mod->init_layout.base = NULL;
3508 mod->init_layout.size = 0; 3514 mod->init_layout.size = 0;
@@ -3513,14 +3519,18 @@ static noinline int do_init_module(struct module *mod)
3513 * We want to free module_init, but be aware that kallsyms may be 3519 * We want to free module_init, but be aware that kallsyms may be
3514 * walking this with preempt disabled. In all the failure paths, we 3520 * walking this with preempt disabled. In all the failure paths, we
3515 * call synchronize_rcu(), but we don't want to slow down the success 3521 * call synchronize_rcu(), but we don't want to slow down the success
3516 * path, so use actual RCU here. 3522 * path. module_memfree() cannot be called in an interrupt, so do the
3523 * work and call synchronize_rcu() in a work queue.
3524 *
3517 * Note that module_alloc() on most architectures creates W+X page 3525 * Note that module_alloc() on most architectures creates W+X page
3518 * mappings which won't be cleaned up until do_free_init() runs. Any 3526 * mappings which won't be cleaned up until do_free_init() runs. Any
3519 * code such as mark_rodata_ro() which depends on those mappings to 3527 * code such as mark_rodata_ro() which depends on those mappings to
3520 * be cleaned up needs to sync with the queued work - ie 3528 * be cleaned up needs to sync with the queued work - ie
3521 * rcu_barrier() 3529 * rcu_barrier()
3522 */ 3530 */
3523 call_rcu(&freeinit->rcu, do_free_init); 3531 if (llist_add(&freeinit->node, &init_free_list))
3532 schedule_work(&init_free_wq);
3533
3524 mutex_unlock(&module_mutex); 3534 mutex_unlock(&module_mutex);
3525 wake_up_all(&module_wq); 3535 wake_up_all(&module_wq);
3526 3536
@@ -3817,10 +3827,6 @@ static int load_module(struct load_info *info, const char __user *uargs,
3817 module_bug_cleanup(mod); 3827 module_bug_cleanup(mod);
3818 mutex_unlock(&module_mutex); 3828 mutex_unlock(&module_mutex);
3819 3829
3820 /* we can't deallocate the module until we clear memory protection */
3821 module_disable_ro(mod);
3822 module_disable_nx(mod);
3823
3824 ddebug_cleanup: 3830 ddebug_cleanup:
3825 ftrace_release_mod(mod); 3831 ftrace_release_mod(mod);
3826 dynamic_debug_remove(mod, info->debug); 3832 dynamic_debug_remove(mod, info->debug);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index f08a1e4ee1d4..bc9558ab1e5b 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1342,8 +1342,9 @@ static inline void do_copy_page(long *dst, long *src)
1342 * safe_copy_page - Copy a page in a safe way. 1342 * safe_copy_page - Copy a page in a safe way.
1343 * 1343 *
1344 * Check if the page we are going to copy is marked as present in the kernel 1344 * Check if the page we are going to copy is marked as present in the kernel
1345 * page tables (this always is the case if CONFIG_DEBUG_PAGEALLOC is not set 1345 * page tables. This always is the case if CONFIG_DEBUG_PAGEALLOC or
1346 * and in that case kernel_page_present() always returns 'true'). 1346 * CONFIG_ARCH_HAS_SET_DIRECT_MAP is not set. In that case kernel_page_present()
1347 * always returns 'true'.
1347 */ 1348 */
1348static void safe_copy_page(void *dst, struct page *s_page) 1349static void safe_copy_page(void *dst, struct page *s_page)
1349{ 1350{
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index d64c00afceb5..94b0e37d90ef 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -14,6 +14,8 @@
14#include <linux/syscalls.h> 14#include <linux/syscalls.h>
15#include <linux/error-injection.h> 15#include <linux/error-injection.h>
16 16
17#include <asm/tlb.h>
18
17#include "trace_probe.h" 19#include "trace_probe.h"
18#include "trace.h" 20#include "trace.h"
19 21
@@ -163,6 +165,10 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src,
163 * access_ok() should prevent writing to non-user memory, but in 165 * access_ok() should prevent writing to non-user memory, but in
164 * some situations (nommu, temporary switch, etc) access_ok() does 166 * some situations (nommu, temporary switch, etc) access_ok() does
165 * not provide enough validation, hence the check on KERNEL_DS. 167 * not provide enough validation, hence the check on KERNEL_DS.
168 *
169 * nmi_uaccess_okay() ensures the probe is not run in an interim
170 * state, when the task or mm are switched. This is specifically
171 * required to prevent the use of temporary mm.
166 */ 172 */
167 173
168 if (unlikely(in_interrupt() || 174 if (unlikely(in_interrupt() ||
@@ -170,6 +176,8 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src,
170 return -EPERM; 176 return -EPERM;
171 if (unlikely(uaccess_kernel())) 177 if (unlikely(uaccess_kernel()))
172 return -EPERM; 178 return -EPERM;
179 if (unlikely(!nmi_uaccess_okay()))
180 return -EPERM;
173 if (!access_ok(unsafe_ptr, size)) 181 if (!access_ok(unsafe_ptr, size))
174 return -EPERM; 182 return -EPERM;
175 183