author     David S. Miller <davem@davemloft.net>    2013-04-22 20:32:51 -0400
committer  David S. Miller <davem@davemloft.net>    2013-04-22 20:32:51 -0400
commit     6e0895c2ea326cc4bb11e8fa2f654628d5754c31 (patch)
tree       7089303ac11a12edc43a8c4fa1b23974e10937ea /kernel
parent     55fbbe46e9eb3cbe6c335503f5550855a1128dce (diff)
parent     60d509fa6a9c4653a86ad830e4c4b30360b23f0e (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net
Conflicts:
	drivers/net/ethernet/emulex/benet/be_main.c
	drivers/net/ethernet/intel/igb/igb_main.c
	drivers/net/wireless/brcm80211/brcmsmac/mac80211_if.c
	include/net/scm.h
	net/batman-adv/routing.c
	net/ipv4/tcp_input.c

The e{uid,gid} --> {uid,gid} credentials fix conflicted with the cleanup
in net-next to now pass cred structs around.

The be2net driver had a bug fix in 'net' that overlapped with the VLAN
interface changes by Patrick McHardy in net-next.

An IGB conflict existed because in 'net' the build_skb() support was
reverted, and in 'net-next' there was a comment style fix within that
code.

Several batman-adv conflicts were resolved by making sure that all calls
to batadv_is_my_mac() are changed to have a new bat_priv first argument.

Eric Dumazet's TS ECR fix in TCP in 'net' conflicted with the F-RTO
rewrite in 'net-next', mostly overlapping changes.

Thanks to Stephen Rothwell and Antonio Quartulli for help with several
of these merge resolutions.

Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'kernel')
-rw-r--r--   kernel/capability.c           24
-rw-r--r--   kernel/events/core.c           6
-rw-r--r--   kernel/events/internal.h       2
-rw-r--r--   kernel/events/ring_buffer.c   22
-rw-r--r--   kernel/hrtimer.c               3
-rw-r--r--   kernel/kexec.c               118
-rw-r--r--   kernel/kprobes.c              19
-rw-r--r--   kernel/kthread.c              52
-rw-r--r--   kernel/sched/clock.c          26
-rw-r--r--   kernel/sched/core.c            8
-rw-r--r--   kernel/sched/cputime.c         2
-rw-r--r--   kernel/signal.c                2
-rw-r--r--   kernel/smpboot.c              14
-rw-r--r--   kernel/sys.c                   3
-rw-r--r--   kernel/trace/blktrace.c       26
-rw-r--r--   kernel/trace/ftrace.c         54
-rw-r--r--   kernel/trace/trace.c           9
-rw-r--r--   kernel/trace/trace_stack.c     2
-rw-r--r--   kernel/user_namespace.c       22
19 files changed, 289 insertions, 125 deletions
diff --git a/kernel/capability.c b/kernel/capability.c
index 493d97259484..f6c2ce5701e1 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -393,6 +393,30 @@ bool ns_capable(struct user_namespace *ns, int cap)
 EXPORT_SYMBOL(ns_capable);
 
 /**
+ * file_ns_capable - Determine if the file's opener had a capability in effect
+ * @file: The file we want to check
+ * @ns: The usernamespace we want the capability in
+ * @cap: The capability to be tested for
+ *
+ * Return true if task that opened the file had a capability in effect
+ * when the file was opened.
+ *
+ * This does not set PF_SUPERPRIV because the caller may not
+ * actually be privileged.
+ */
+bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap)
+{
+	if (WARN_ON_ONCE(!cap_valid(cap)))
+		return false;
+
+	if (security_capable(file->f_cred, ns, cap) == 0)
+		return true;
+
+	return false;
+}
+EXPORT_SYMBOL(file_ns_capable);
+
+/**
  * capable - Determine if the current task has a superior capability in effect
  * @cap: The capability to be tested for
  *
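
The helper added above consults the credentials captured at open time
(file->f_cred) instead of the credentials of the task currently acting on
the file. A minimal sketch of how a caller might use it; the ioctl handler
and the CAP_NET_ADMIN/init_user_ns choice are illustrative assumptions,
not part of this commit:

	/* Hypothetical driver check: require that whoever *opened* the
	 * file had CAP_NET_ADMIN, regardless of who is calling now.
	 */
	static long example_ioctl(struct file *file, unsigned int cmd)
	{
		if (!file_ns_capable(file, &init_user_ns, CAP_NET_ADMIN))
			return -EPERM;
		/* ... privileged work ... */
		return 0;
	}

The real users introduced by this series are the uid_map/gid_map write
paths in kernel/user_namespace.c further down.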
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 59412d037eed..4d3124b39277 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4737,7 +4737,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
 	} else {
 		if (arch_vma_name(mmap_event->vma)) {
 			name = strncpy(tmp, arch_vma_name(mmap_event->vma),
-				       sizeof(tmp));
+				       sizeof(tmp) - 1);
+			tmp[sizeof(tmp) - 1] = '\0';
 			goto got_name;
 		}
 
@@ -5330,7 +5331,7 @@ static void sw_perf_event_destroy(struct perf_event *event)
 
 static int perf_swevent_init(struct perf_event *event)
 {
-	int event_id = event->attr.config;
+	u64 event_id = event->attr.config;
 
 	if (event->attr.type != PERF_TYPE_SOFTWARE)
 		return -ENOENT;
@@ -5986,6 +5987,7 @@ skip_type:
 	if (pmu->pmu_cpu_context)
 		goto got_cpu_context;
 
+	ret = -ENOMEM;
 	pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
 	if (!pmu->pmu_cpu_context)
 		goto free_dev;
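
The strncpy() change above matters because strncpy() does not
NUL-terminate the destination when the source is at least as long as the
given count. A standalone userspace demonstration (not kernel code) of
the bug and the fix:

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		char tmp[8];

		/* Before: count == sizeof(tmp); if the name is longer,
		 * no '\0' is written and later reads run off the end. */
		strncpy(tmp, "0123456789", sizeof(tmp));

		/* After: leave room and terminate explicitly. */
		strncpy(tmp, "0123456789", sizeof(tmp) - 1);
		tmp[sizeof(tmp) - 1] = '\0';
		printf("%s\n", tmp);	/* prints "0123456" */
		return 0;
	}

The strlcpy() conversions in kernel/trace/ftrace.c and
kernel/trace/trace.c later in this diff fix the same class of problem.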
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index d56a64c99a8b..eb675c4d59df 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -16,7 +16,7 @@ struct ring_buffer {
 	int			page_order;	/* allocation order */
 #endif
 	int			nr_pages;	/* nr of data pages */
-	int			writable;	/* are we writable */
+	int			overwrite;	/* can overwrite itself */
 
 	atomic_t		poll;		/* POLL_ for wakeups */
 
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 23cb34ff3973..97fddb09762b 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -18,12 +18,24 @@
 static bool perf_output_space(struct ring_buffer *rb, unsigned long tail,
 			      unsigned long offset, unsigned long head)
 {
-	unsigned long mask;
+	unsigned long sz = perf_data_size(rb);
+	unsigned long mask = sz - 1;
 
-	if (!rb->writable)
+	/*
+	 * check if user-writable
+	 * overwrite : over-write its own tail
+	 * !overwrite: buffer possibly drops events.
+	 */
+	if (rb->overwrite)
 		return true;
 
-	mask = perf_data_size(rb) - 1;
+	/*
+	 * verify that payload is not bigger than buffer
+	 * otherwise masking logic may fail to detect
+	 * the "not enough space" condition
+	 */
+	if ((head - offset) > sz)
+		return false;
 
 	offset = (offset - tail) & mask;
 	head   = (head - tail) & mask;
@@ -212,7 +224,9 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
 	rb->watermark = max_size / 2;
 
 	if (flags & RING_BUFFER_WRITABLE)
-		rb->writable = 1;
+		rb->overwrite = 0;
+	else
+		rb->overwrite = 1;
 
 	atomic_set(&rb->refcount, 1);
 
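
The new "(head - offset) > sz" test above guards the masking logic: head
and offset are free-running counters, so their difference is the payload
size even across wrap-around, but once both are reduced modulo the buffer
size that information is gone. A standalone sketch (not kernel code) of
the check, assuming sz is a power of two:

	#include <stdbool.h>

	/* Reject a payload larger than the buffer *before* masking;
	 * afterwards (x & (sz - 1)) can no longer distinguish "fits
	 * exactly" from "wrapped past the whole buffer".
	 */
	static bool payload_fits(unsigned long sz, unsigned long offset,
				 unsigned long head)
	{
		return (head - offset) <= sz;
	}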
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index cc47812d3feb..14be27feda49 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -63,6 +63,7 @@
 DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
 {
 
+	.lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
 	.clock_base =
 	{
 		{
@@ -1642,8 +1643,6 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
 	struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
 	int i;
 
-	raw_spin_lock_init(&cpu_base->lock);
-
 	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
 		cpu_base->clock_base[i].cpu_base = cpu_base;
 		timerqueue_init_head(&cpu_base->clock_base[i].active);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index bddd3d7a74b6..ffd4e111fd67 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -55,7 +55,7 @@ struct resource crashk_res = {
 	.flags = IORESOURCE_BUSY | IORESOURCE_MEM
 };
 struct resource crashk_low_res = {
-	.name  = "Crash kernel low",
+	.name  = "Crash kernel",
 	.start = 0,
 	.end   = 0,
 	.flags = IORESOURCE_BUSY | IORESOURCE_MEM
@@ -1368,35 +1368,114 @@ static int __init parse_crashkernel_simple(char *cmdline,
 	return 0;
 }
 
+#define SUFFIX_HIGH 0
+#define SUFFIX_LOW  1
+#define SUFFIX_NULL 2
+static __initdata char *suffix_tbl[] = {
+	[SUFFIX_HIGH] = ",high",
+	[SUFFIX_LOW]  = ",low",
+	[SUFFIX_NULL] = NULL,
+};
+
 /*
- * That function is the entry point for command line parsing and should be
- * called from the arch-specific code.
+ * That function parses "suffix" crashkernel command lines like
+ *
+ *	crashkernel=size,[high|low]
+ *
+ * It returns 0 on success and -EINVAL on failure.
  */
+static int __init parse_crashkernel_suffix(char *cmdline,
+					   unsigned long long *crash_size,
+					   unsigned long long *crash_base,
+					   const char *suffix)
+{
+	char *cur = cmdline;
+
+	*crash_size = memparse(cmdline, &cur);
+	if (cmdline == cur) {
+		pr_warn("crashkernel: memory value expected\n");
+		return -EINVAL;
+	}
+
+	/* check with suffix */
+	if (strncmp(cur, suffix, strlen(suffix))) {
+		pr_warn("crashkernel: unrecognized char\n");
+		return -EINVAL;
+	}
+	cur += strlen(suffix);
+	if (*cur != ' ' && *cur != '\0') {
+		pr_warn("crashkernel: unrecognized char\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static __init char *get_last_crashkernel(char *cmdline,
+					 const char *name,
+					 const char *suffix)
+{
+	char *p = cmdline, *ck_cmdline = NULL;
+
+	/* find crashkernel and use the last one if there are more */
+	p = strstr(p, name);
+	while (p) {
+		char *end_p = strchr(p, ' ');
+		char *q;
+
+		if (!end_p)
+			end_p = p + strlen(p);
+
+		if (!suffix) {
+			int i;
+
+			/* skip the one with any known suffix */
+			for (i = 0; suffix_tbl[i]; i++) {
+				q = end_p - strlen(suffix_tbl[i]);
+				if (!strncmp(q, suffix_tbl[i],
+					     strlen(suffix_tbl[i])))
+					goto next;
+			}
+			ck_cmdline = p;
+		} else {
+			q = end_p - strlen(suffix);
+			if (!strncmp(q, suffix, strlen(suffix)))
+				ck_cmdline = p;
+		}
+next:
+		p = strstr(p+1, name);
+	}
+
+	if (!ck_cmdline)
+		return NULL;
+
+	return ck_cmdline;
+}
+
 static int __init __parse_crashkernel(char *cmdline,
 			     unsigned long long system_ram,
 			     unsigned long long *crash_size,
 			     unsigned long long *crash_base,
-			     const char *name)
+			     const char *name,
+			     const char *suffix)
 {
-	char	*p = cmdline, *ck_cmdline = NULL;
 	char	*first_colon, *first_space;
+	char	*ck_cmdline;
 
 	BUG_ON(!crash_size || !crash_base);
 	*crash_size = 0;
 	*crash_base = 0;
 
-	/* find crashkernel and use the last one if there are more */
-	p = strstr(p, name);
-	while (p) {
-		ck_cmdline = p;
-		p = strstr(p+1, name);
-	}
+	ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
 
 	if (!ck_cmdline)
 		return -EINVAL;
 
 	ck_cmdline += strlen(name);
 
+	if (suffix)
+		return parse_crashkernel_suffix(ck_cmdline, crash_size,
+				crash_base, suffix);
 	/*
 	 * if the commandline contains a ':', then that's the extended
 	 * syntax -- if not, it must be the classic syntax
@@ -1413,13 +1492,26 @@ static int __init __parse_crashkernel(char *cmdline,
 	return 0;
 }
 
+/*
+ * That function is the entry point for command line parsing and should be
+ * called from the arch-specific code.
+ */
 int __init parse_crashkernel(char *cmdline,
 			     unsigned long long system_ram,
 			     unsigned long long *crash_size,
 			     unsigned long long *crash_base)
 {
 	return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
-					"crashkernel=");
+					"crashkernel=", NULL);
+}
+
+int __init parse_crashkernel_high(char *cmdline,
+			     unsigned long long system_ram,
+			     unsigned long long *crash_size,
+			     unsigned long long *crash_base)
+{
+	return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
+				"crashkernel=", suffix_tbl[SUFFIX_HIGH]);
 }
 
 int __init parse_crashkernel_low(char *cmdline,
@@ -1428,7 +1520,7 @@ int __init parse_crashkernel_low(char *cmdline,
 				unsigned long long *crash_base)
 {
 	return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
-					"crashkernel_low=");
+				"crashkernel=", suffix_tbl[SUFFIX_LOW]);
 }
 
 static void update_vmcoreinfo_note(void)
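
With the suffix table above, both reservations are spelled with the same
parameter name, e.g. "crashkernel=512M,high" and "crashkernel=256M,low",
and the old "crashkernel_low=" form goes away (see the hunk just above).
A standalone sketch (not kernel code) of the suffix validation performed
by parse_crashkernel_suffix() after memparse() has consumed the size:

	#include <stdbool.h>
	#include <string.h>

	/* The remainder must be exactly the suffix, followed by a
	 * space or the end of the string.
	 */
	static bool suffix_ok(const char *cur, const char *suffix)
	{
		if (strncmp(cur, suffix, strlen(suffix)))
			return false;
		cur += strlen(suffix);
		return *cur == ' ' || *cur == '\0';
	}

	/* suffix_ok(",high", ",high")       -> true
	 * suffix_ok(",highmem", ",high")    -> false
	 * suffix_ok(",high quiet", ",high") -> true
	 */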
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index e35be53f6613..3fed7f0cbcdf 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -794,16 +794,16 @@ out:
 }
 
 #ifdef CONFIG_SYSCTL
-/* This should be called with kprobe_mutex locked */
 static void __kprobes optimize_all_kprobes(void)
 {
 	struct hlist_head *head;
 	struct kprobe *p;
 	unsigned int i;
 
+	mutex_lock(&kprobe_mutex);
 	/* If optimization is already allowed, just return */
 	if (kprobes_allow_optimization)
-		return;
+		goto out;
 
 	kprobes_allow_optimization = true;
 	for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
@@ -813,18 +813,22 @@ static void __kprobes optimize_all_kprobes(void)
 			optimize_kprobe(p);
 	}
 	printk(KERN_INFO "Kprobes globally optimized\n");
+out:
+	mutex_unlock(&kprobe_mutex);
 }
 
-/* This should be called with kprobe_mutex locked */
 static void __kprobes unoptimize_all_kprobes(void)
 {
 	struct hlist_head *head;
 	struct kprobe *p;
 	unsigned int i;
 
+	mutex_lock(&kprobe_mutex);
 	/* If optimization is already prohibited, just return */
-	if (!kprobes_allow_optimization)
+	if (!kprobes_allow_optimization) {
+		mutex_unlock(&kprobe_mutex);
 		return;
+	}
 
 	kprobes_allow_optimization = false;
 	for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
@@ -834,11 +838,14 @@ static void __kprobes unoptimize_all_kprobes(void)
 			unoptimize_kprobe(p, false);
 		}
 	}
+	mutex_unlock(&kprobe_mutex);
+
 	/* Wait for unoptimizing completion */
 	wait_for_kprobe_optimizer();
 	printk(KERN_INFO "Kprobes globally unoptimized\n");
 }
 
+static DEFINE_MUTEX(kprobe_sysctl_mutex);
 int sysctl_kprobes_optimization;
 int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
 				      void __user *buffer, size_t *length,
@@ -846,7 +853,7 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
 {
 	int ret;
 
-	mutex_lock(&kprobe_mutex);
+	mutex_lock(&kprobe_sysctl_mutex);
 	sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0;
 	ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
 
@@ -854,7 +861,7 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
 		optimize_all_kprobes();
 	else
 		unoptimize_all_kprobes();
-	mutex_unlock(&kprobe_mutex);
+	mutex_unlock(&kprobe_sysctl_mutex);
 
 	return ret;
 }
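
The reshuffle above takes kprobe_mutex inside the two helpers and
serializes the sysctl handler with its own kprobe_sysctl_mutex, so that
unoptimize_all_kprobes() can drop kprobe_mutex before the potentially
slow wait_for_kprobe_optimizer() step. A standalone pthread sketch (not
kernel code) of that locking shape; the names and the rationale for the
outer mutex are assumptions drawn from the hunks above:

	#include <pthread.h>

	static pthread_mutex_t state_lock  = PTHREAD_MUTEX_INITIALIZER;
	static pthread_mutex_t sysctl_lock = PTHREAD_MUTEX_INITIALIZER;
	static int allow_optimization = 1;

	/* Stub: the real optimizer wait may itself need state_lock,
	 * which is why the caller must not hold it here. */
	static void wait_for_optimizer(void) { }

	static void unoptimize_all(void)
	{
		pthread_mutex_lock(&state_lock);
		if (!allow_optimization) {
			pthread_mutex_unlock(&state_lock);
			return;
		}
		allow_optimization = 0;
		/* ... walk the tables under state_lock ... */
		pthread_mutex_unlock(&state_lock);

		wait_for_optimizer();	/* safe: state_lock released */
	}

	static void sysctl_handler(int enable)
	{
		pthread_mutex_lock(&sysctl_lock);	/* serialize writers */
		if (!enable)
			unoptimize_all();
		pthread_mutex_unlock(&sysctl_lock);
	}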
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 691dc2ef9baf..9eb7fed0bbaa 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -124,12 +124,12 @@ void *kthread_data(struct task_struct *task)
 
 static void __kthread_parkme(struct kthread *self)
 {
-	__set_current_state(TASK_INTERRUPTIBLE);
+	__set_current_state(TASK_PARKED);
 	while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) {
 		if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags))
 			complete(&self->parked);
 		schedule();
-		__set_current_state(TASK_INTERRUPTIBLE);
+		__set_current_state(TASK_PARKED);
 	}
 	clear_bit(KTHREAD_IS_PARKED, &self->flags);
 	__set_current_state(TASK_RUNNING);
@@ -256,8 +256,13 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
 }
 EXPORT_SYMBOL(kthread_create_on_node);
 
-static void __kthread_bind(struct task_struct *p, unsigned int cpu)
+static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)
 {
+	/* Must have done schedule() in kthread() before we set_task_cpu */
+	if (!wait_task_inactive(p, state)) {
+		WARN_ON(1);
+		return;
+	}
 	/* It's safe because the task is inactive. */
 	do_set_cpus_allowed(p, cpumask_of(cpu));
 	p->flags |= PF_THREAD_BOUND;
@@ -274,12 +279,7 @@ static void __kthread_bind(struct task_struct *p, unsigned int cpu)
  */
 void kthread_bind(struct task_struct *p, unsigned int cpu)
 {
-	/* Must have done schedule() in kthread() before we set_task_cpu */
-	if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) {
-		WARN_ON(1);
-		return;
-	}
-	__kthread_bind(p, cpu);
+	__kthread_bind(p, cpu, TASK_UNINTERRUPTIBLE);
 }
 EXPORT_SYMBOL(kthread_bind);
 
@@ -324,6 +324,22 @@ static struct kthread *task_get_live_kthread(struct task_struct *k)
 	return NULL;
 }
 
+static void __kthread_unpark(struct task_struct *k, struct kthread *kthread)
+{
+	clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
+	/*
+	 * We clear the IS_PARKED bit here as we don't wait
+	 * until the task has left the park code. So if we'd
+	 * park before that happens we'd see the IS_PARKED bit
+	 * which might be about to be cleared.
+	 */
+	if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
+		if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
+			__kthread_bind(k, kthread->cpu, TASK_PARKED);
+		wake_up_state(k, TASK_PARKED);
+	}
+}
+
 /**
  * kthread_unpark - unpark a thread created by kthread_create().
  * @k:		thread created by kthread_create().
@@ -336,20 +352,8 @@ void kthread_unpark(struct task_struct *k)
 {
 	struct kthread *kthread = task_get_live_kthread(k);
 
-	if (kthread) {
-		clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
-		/*
-		 * We clear the IS_PARKED bit here as we don't wait
-		 * until the task has left the park code. So if we'd
-		 * park before that happens we'd see the IS_PARKED bit
-		 * which might be about to be cleared.
-		 */
-		if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
-			if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
-				__kthread_bind(k, kthread->cpu);
-			wake_up_process(k);
-		}
-	}
+	if (kthread)
+		__kthread_unpark(k, kthread);
 	put_task_struct(k);
 }
 
@@ -407,7 +411,7 @@ int kthread_stop(struct task_struct *k)
 	trace_sched_kthread_stop(k);
 	if (kthread) {
 		set_bit(KTHREAD_SHOULD_STOP, &kthread->flags);
-		clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
+		__kthread_unpark(k, kthread);
 		wake_up_process(k);
 		wait_for_completion(&kthread->exited);
 	}
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index c685e31492df..c3ae1446461c 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -176,10 +176,36 @@ static u64 sched_clock_remote(struct sched_clock_data *scd)
 	u64 this_clock, remote_clock;
 	u64 *ptr, old_val, val;
 
+#if BITS_PER_LONG != 64
+again:
+	/*
+	 * Careful here: The local and the remote clock values need to
+	 * be read out atomic as we need to compare the values and
+	 * then update either the local or the remote side. So the
+	 * cmpxchg64 below only protects one readout.
+	 *
+	 * We must reread via sched_clock_local() in the retry case on
+	 * 32bit as an NMI could use sched_clock_local() via the
+	 * tracer and hit between the readout of
+	 * the low32bit and the high 32bit portion.
+	 */
+	this_clock = sched_clock_local(my_scd);
+	/*
+	 * We must enforce atomic readout on 32bit, otherwise the
+	 * update on the remote cpu can hit inbetween the readout of
+	 * the low32bit and the high 32bit portion.
+	 */
+	remote_clock = cmpxchg64(&scd->clock, 0, 0);
+#else
+	/*
+	 * On 64bit the read of [my]scd->clock is atomic versus the
+	 * update, so we can avoid the above 32bit dance.
+	 */
 	sched_clock_local(my_scd);
 again:
 	this_clock = my_scd->clock;
 	remote_clock = scd->clock;
+#endif
 
 	/*
 	 * Use the opportunity that we have both locks
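
The trick above is that cmpxchg64(&scd->clock, 0, 0) never changes the
stored value (it writes 0 only when the value already is 0) but always
returns an atomically-read 64-bit snapshot, which a plain load cannot
guarantee on 32-bit. A standalone userspace analog (not kernel code)
using the GCC/Clang __sync builtin:

	#include <stdint.h>

	/* Atomic 64-bit read via compare-and-swap with old == new:
	 * the swap is a no-op either way, but the returned value was
	 * read in one atomic operation (cmpxchg8b on 32-bit x86).
	 */
	static uint64_t atomic_read64(uint64_t *p)
	{
		return __sync_val_compare_and_swap(p, 0, 0);
	}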
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7f12624a393c..67d04651f44b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1498,8 +1498,10 @@ static void try_to_wake_up_local(struct task_struct *p)
 {
 	struct rq *rq = task_rq(p);
 
-	BUG_ON(rq != this_rq());
-	BUG_ON(p == current);
+	if (WARN_ON_ONCE(rq != this_rq()) ||
+	    WARN_ON_ONCE(p == current))
+		return;
+
 	lockdep_assert_held(&rq->lock);
 
 	if (!raw_spin_trylock(&p->pi_lock)) {
@@ -4999,7 +5001,7 @@ static void sd_free_ctl_entry(struct ctl_table **tablep)
 }
 
 static int min_load_idx = 0;
-static int max_load_idx = CPU_LOAD_IDX_MAX;
+static int max_load_idx = CPU_LOAD_IDX_MAX-1;
 
 static void
 set_table_entry(struct ctl_table *entry,
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index ed12cbb135f4..e93cca92f38b 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -310,7 +310,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
 
 	t = tsk;
 	do {
-		task_cputime(tsk, &utime, &stime);
+		task_cputime(t, &utime, &stime);
 		times->utime += utime;
 		times->stime += stime;
 		times->sum_exec_runtime += task_sched_runtime(t);
diff --git a/kernel/signal.c b/kernel/signal.c
index 497330ec2ae9..06ff7764ab7c 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2950,7 +2950,7 @@ do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
 
 static int do_tkill(pid_t tgid, pid_t pid, int sig)
 {
-	struct siginfo info;
+	struct siginfo info = {};
 
 	info.si_signo = sig;
 	info.si_errno = 0;
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 8eaed9aa9cf0..02fc5c933673 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -185,8 +185,18 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
 	}
 	get_task_struct(tsk);
 	*per_cpu_ptr(ht->store, cpu) = tsk;
-	if (ht->create)
-		ht->create(cpu);
+	if (ht->create) {
+		/*
+		 * Make sure that the task has actually scheduled out
+		 * into park position, before calling the create
+		 * callback. At least the migration thread callback
+		 * requires that the task is off the runqueue.
+		 */
+		if (!wait_task_inactive(tsk, TASK_PARKED))
+			WARN_ON(1);
+		else
+			ht->create(cpu);
+	}
 	return 0;
 }
 
diff --git a/kernel/sys.c b/kernel/sys.c
index 39c9c4a2949f..0da73cf73e60 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -324,7 +324,6 @@ void kernel_restart_prepare(char *cmd)
 	system_state = SYSTEM_RESTART;
 	usermodehelper_disable();
 	device_shutdown();
-	syscore_shutdown();
 }
 
 /**
@@ -370,6 +369,7 @@ void kernel_restart(char *cmd)
 {
 	kernel_restart_prepare(cmd);
 	disable_nonboot_cpus();
+	syscore_shutdown();
 	if (!cmd)
 		printk(KERN_EMERG "Restarting system.\n");
 	else
@@ -395,6 +395,7 @@ static void kernel_shutdown_prepare(enum system_states state)
 void kernel_halt(void)
 {
 	kernel_shutdown_prepare(SYSTEM_HALT);
+	disable_nonboot_cpus();
 	syscore_shutdown();
 	printk(KERN_EMERG "System halted.\n");
 	kmsg_dump(KMSG_DUMP_HALT);
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 9e5b8c272eec..5a0f781cd729 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -739,12 +739,6 @@ static void blk_add_trace_rq_complete(void *ignore,
 					     struct request_queue *q,
 					     struct request *rq)
 {
-	struct blk_trace *bt = q->blk_trace;
-
-	/* if control ever passes through here, it's a request based driver */
-	if (unlikely(bt && !bt->rq_based))
-		bt->rq_based = true;
-
 	blk_add_trace_rq(q, rq, BLK_TA_COMPLETE);
 }
 
@@ -780,24 +774,10 @@ static void blk_add_trace_bio_bounce(void *ignore,
 	blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);
 }
 
-static void blk_add_trace_bio_complete(void *ignore, struct bio *bio, int error)
+static void blk_add_trace_bio_complete(void *ignore,
+				       struct request_queue *q, struct bio *bio,
+				       int error)
 {
-	struct request_queue *q;
-	struct blk_trace *bt;
-
-	if (!bio->bi_bdev)
-		return;
-
-	q = bdev_get_queue(bio->bi_bdev);
-	bt = q->blk_trace;
-
-	/*
-	 * Request based drivers will generate both rq and bio completions.
-	 * Ignore bio ones.
-	 */
-	if (likely(!bt) || bt->rq_based)
-		return;
-
 	blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error);
 }
 
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 6893d5a2bf08..b3fde6d7b7fc 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -66,7 +66,7 @@
 
 static struct ftrace_ops ftrace_list_end __read_mostly = {
 	.func		= ftrace_stub,
-	.flags		= FTRACE_OPS_FL_RECURSION_SAFE,
+	.flags		= FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB,
 };
 
 /* ftrace_enabled is a method to turn ftrace on or off */
@@ -694,7 +694,6 @@ int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)
 		free_page(tmp);
 	}
 
-	free_page((unsigned long)stat->pages);
 	stat->pages = NULL;
 	stat->start = NULL;
 
@@ -1053,6 +1052,19 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
 
 static struct pid * const ftrace_swapper_pid = &init_struct_pid;
 
+loff_t
+ftrace_filter_lseek(struct file *file, loff_t offset, int whence)
+{
+	loff_t ret;
+
+	if (file->f_mode & FMODE_READ)
+		ret = seq_lseek(file, offset, whence);
+	else
+		file->f_pos = ret = 1;
+
+	return ret;
+}
+
 #ifdef CONFIG_DYNAMIC_FTRACE
 
 #ifndef CONFIG_FTRACE_MCOUNT_RECORD
@@ -2613,7 +2625,7 @@ static void ftrace_filter_reset(struct ftrace_hash *hash)
  * routine, you can use ftrace_filter_write() for the write
  * routine if @flag has FTRACE_ITER_FILTER set, or
  * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set.
- * ftrace_regex_lseek() should be used as the lseek routine, and
+ * ftrace_filter_lseek() should be used as the lseek routine, and
  * release must call ftrace_regex_release().
  */
 int
@@ -2697,19 +2709,6 @@ ftrace_notrace_open(struct inode *inode, struct file *file)
 				 inode, file);
 }
 
-loff_t
-ftrace_regex_lseek(struct file *file, loff_t offset, int whence)
-{
-	loff_t ret;
-
-	if (file->f_mode & FMODE_READ)
-		ret = seq_lseek(file, offset, whence);
-	else
-		file->f_pos = ret = 1;
-
-	return ret;
-}
-
 static int ftrace_match(char *str, char *regex, int len, int type)
 {
 	int matched = 0;
@@ -3441,14 +3440,14 @@ static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata;
 
 static int __init set_ftrace_notrace(char *str)
 {
-	strncpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);
+	strlcpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);
 	return 1;
 }
 __setup("ftrace_notrace=", set_ftrace_notrace);
 
 static int __init set_ftrace_filter(char *str)
 {
-	strncpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);
+	strlcpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);
 	return 1;
 }
 __setup("ftrace_filter=", set_ftrace_filter);
@@ -3571,7 +3570,7 @@ static const struct file_operations ftrace_filter_fops = {
 	.open = ftrace_filter_open,
 	.read = seq_read,
 	.write = ftrace_filter_write,
-	.llseek = ftrace_regex_lseek,
+	.llseek = ftrace_filter_lseek,
 	.release = ftrace_regex_release,
 };
 
@@ -3579,7 +3578,7 @@ static const struct file_operations ftrace_notrace_fops = {
 	.open = ftrace_notrace_open,
 	.read = seq_read,
 	.write = ftrace_notrace_write,
-	.llseek = ftrace_regex_lseek,
+	.llseek = ftrace_filter_lseek,
 	.release = ftrace_regex_release,
 };
 
@@ -3784,8 +3783,8 @@ static const struct file_operations ftrace_graph_fops = {
 	.open = ftrace_graph_open,
 	.read = seq_read,
 	.write = ftrace_graph_write,
+	.llseek = ftrace_filter_lseek,
 	.release = ftrace_graph_release,
-	.llseek = seq_lseek,
 };
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
 
@@ -4131,7 +4130,8 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
 	preempt_disable_notrace();
 	trace_recursion_set(TRACE_CONTROL_BIT);
 	do_for_each_ftrace_op(op, ftrace_control_list) {
-		if (!ftrace_function_local_disabled(op) &&
+		if (!(op->flags & FTRACE_OPS_FL_STUB) &&
+		    !ftrace_function_local_disabled(op) &&
 		    ftrace_ops_test(op, ip))
 			op->func(ip, parent_ip, op, regs);
 	} while_for_each_ftrace_op(op);
@@ -4439,7 +4439,7 @@ static const struct file_operations ftrace_pid_fops = {
 	.open = ftrace_pid_open,
 	.write = ftrace_pid_write,
 	.read = seq_read,
-	.llseek = seq_lseek,
+	.llseek = ftrace_filter_lseek,
 	.release = ftrace_pid_release,
 };
 
@@ -4555,12 +4555,8 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
 		ftrace_startup_sysctl();
 
 		/* we are starting ftrace again */
-		if (ftrace_ops_list != &ftrace_list_end) {
-			if (ftrace_ops_list->next == &ftrace_list_end)
-				ftrace_trace_function = ftrace_ops_list->func;
-			else
-				ftrace_trace_function = ftrace_ops_list_func;
-		}
+		if (ftrace_ops_list != &ftrace_list_end)
+			update_ftrace_function();
 
 	} else {
 		/* stopping ftrace calls (just send to ftrace_stub) */
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4f1dade56981..66338c4f7f4b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -132,7 +132,7 @@ static char *default_bootup_tracer;
 
 static int __init set_cmdline_ftrace(char *str)
 {
-	strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
+	strlcpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
 	default_bootup_tracer = bootup_tracer_buf;
 	/* We are using ftrace early, expand it */
 	ring_buffer_expanded = 1;
@@ -162,7 +162,7 @@ static char *trace_boot_options __initdata;
 
 static int __init set_trace_boot_options(char *str)
 {
-	strncpy(trace_boot_options_buf, str, MAX_TRACER_SIZE);
+	strlcpy(trace_boot_options_buf, str, MAX_TRACER_SIZE);
 	trace_boot_options = trace_boot_options_buf;
 	return 0;
 }
@@ -744,8 +744,11 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
 		return;
 
 	WARN_ON_ONCE(!irqs_disabled());
-	if (WARN_ON_ONCE(!current_trace->allocated_snapshot))
+	if (!current_trace->allocated_snapshot) {
+		/* Only the nop tracer should hit this when disabling */
+		WARN_ON_ONCE(current_trace != &nop_trace);
 		return;
+	}
 
 	arch_spin_lock(&ftrace_max_lock);
 
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 42ca822fc701..83a8b5b7bd35 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -322,7 +322,7 @@ static const struct file_operations stack_trace_filter_fops = {
 	.open = stack_trace_filter_open,
 	.read = seq_read,
 	.write = ftrace_filter_write,
-	.llseek = ftrace_regex_lseek,
+	.llseek = ftrace_filter_lseek,
 	.release = ftrace_regex_release,
 };
 
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index a54f26f82eb2..e134d8f365dd 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -25,7 +25,8 @@
 
 static struct kmem_cache *user_ns_cachep __read_mostly;
 
-static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
+static bool new_idmap_permitted(const struct file *file,
+				struct user_namespace *ns, int cap_setid,
 				struct uid_gid_map *map);
 
 static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
@@ -612,10 +613,10 @@ static ssize_t map_write(struct file *file, const char __user *buf,
 	if (map->nr_extents != 0)
 		goto out;
 
-	/* Require the appropriate privilege CAP_SETUID or CAP_SETGID
-	 * over the user namespace in order to set the id mapping.
+	/*
+	 * Adjusting namespace settings requires capabilities on the target.
 	 */
-	if (cap_valid(cap_setid) && !ns_capable(ns, cap_setid))
+	if (cap_valid(cap_setid) && !file_ns_capable(file, ns, CAP_SYS_ADMIN))
 		goto out;
 
 	/* Get a buffer */
@@ -700,7 +701,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
 
 	ret = -EPERM;
 	/* Validate the user is allowed to use user id's mapped to. */
-	if (!new_idmap_permitted(ns, cap_setid, &new_map))
+	if (!new_idmap_permitted(file, ns, cap_setid, &new_map))
 		goto out;
 
 	/* Map the lower ids from the parent user namespace to the
@@ -787,7 +788,8 @@ ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t
 			 &ns->projid_map, &ns->parent->projid_map);
 }
 
-static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
+static bool new_idmap_permitted(const struct file *file,
+				struct user_namespace *ns, int cap_setid,
 				struct uid_gid_map *new_map)
 {
 	/* Allow mapping to your own filesystem ids */
@@ -795,12 +797,12 @@ static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
 		u32 id = new_map->extent[0].lower_first;
 		if (cap_setid == CAP_SETUID) {
 			kuid_t uid = make_kuid(ns->parent, id);
-			if (uid_eq(uid, current_fsuid()))
+			if (uid_eq(uid, file->f_cred->fsuid))
 				return true;
 		}
 		else if (cap_setid == CAP_SETGID) {
 			kgid_t gid = make_kgid(ns->parent, id);
-			if (gid_eq(gid, current_fsgid()))
+			if (gid_eq(gid, file->f_cred->fsgid))
 				return true;
 		}
 	}
@@ -811,8 +813,10 @@ static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
 
 	/* Allow the specified ids if we have the appropriate capability
 	 * (CAP_SETUID or CAP_SETGID) over the parent user namespace.
+	 * And the opener of the id file also had the appropriate capability.
 	 */
-	if (ns_capable(ns->parent, cap_setid))
+	if (ns_capable(ns->parent, cap_setid) &&
+	    file_ns_capable(file, ns->parent, cap_setid))
 		return true;
 
 	return false;
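
After this change the uid_map/gid_map checks are applied to the
credentials of whoever opened the map file, closing the hole where a more
privileged process could be tricked into writing a mapping through an
attacker-supplied file descriptor. A standalone userspace example
(Linux-specific, not from this commit) where opener and writer are the
same process, which remains permitted:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <sched.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		char map[64];
		uid_t outer = getuid();
		int fd, n;

		if (unshare(CLONE_NEWUSER) < 0) {
			perror("unshare");
			return 1;
		}

		/* Format: "<inside> <outside> <count>"; mapping our own
		 * uid satisfies new_idmap_permitted() without any
		 * capability over the parent namespace. */
		n = snprintf(map, sizeof(map), "0 %u 1", (unsigned)outer);

		fd = open("/proc/self/uid_map", O_WRONLY);
		if (fd < 0 || write(fd, map, n) != n) {
			perror("uid_map");
			return 1;
		}
		close(fd);

		printf("uid inside: %u\n", (unsigned)getuid());	/* 0 */
		return 0;
	}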